diff options
Diffstat (limited to 'kernel')
115 files changed, 3199 insertions, 1716 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 76768ee812b2..08561f1acd13 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks | |||
| @@ -231,6 +231,10 @@ config RWSEM_SPIN_ON_OWNER | |||
| 231 | def_bool y | 231 | def_bool y |
| 232 | depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW | 232 | depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW |
| 233 | 233 | ||
| 234 | config LOCK_SPIN_ON_OWNER | ||
| 235 | def_bool y | ||
| 236 | depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER | ||
| 237 | |||
| 234 | config ARCH_USE_QUEUE_RWLOCK | 238 | config ARCH_USE_QUEUE_RWLOCK |
| 235 | bool | 239 | bool |
| 236 | 240 | ||
diff --git a/kernel/Makefile b/kernel/Makefile index a59481a3fa6c..1408b3353a3c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
| @@ -13,8 +13,8 @@ obj-y = fork.o exec_domain.o panic.o \ | |||
| 13 | 13 | ||
| 14 | ifdef CONFIG_FUNCTION_TRACER | 14 | ifdef CONFIG_FUNCTION_TRACER |
| 15 | # Do not trace debug files and internal ftrace files | 15 | # Do not trace debug files and internal ftrace files |
| 16 | CFLAGS_REMOVE_cgroup-debug.o = -pg | 16 | CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE) |
| 17 | CFLAGS_REMOVE_irq_work.o = -pg | 17 | CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE) |
| 18 | endif | 18 | endif |
| 19 | 19 | ||
| 20 | # cond_syscall is currently not LTO compatible | 20 | # cond_syscall is currently not LTO compatible |
| @@ -26,6 +26,7 @@ obj-y += power/ | |||
| 26 | obj-y += printk/ | 26 | obj-y += printk/ |
| 27 | obj-y += irq/ | 27 | obj-y += irq/ |
| 28 | obj-y += rcu/ | 28 | obj-y += rcu/ |
| 29 | obj-y += livepatch/ | ||
| 29 | 30 | ||
| 30 | obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o | 31 | obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o |
| 31 | obj-$(CONFIG_FREEZER) += freezer.o | 32 | obj-$(CONFIG_FREEZER) += freezer.o |
| @@ -142,7 +143,7 @@ endif | |||
| 142 | kernel/system_certificates.o: $(obj)/x509_certificate_list | 143 | kernel/system_certificates.o: $(obj)/x509_certificate_list |
| 143 | 144 | ||
| 144 | quiet_cmd_x509certs = CERTS $@ | 145 | quiet_cmd_x509certs = CERTS $@ |
| 145 | cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)") | 146 | cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)") |
| 146 | 147 | ||
| 147 | targets += $(obj)/x509_certificate_list | 148 | targets += $(obj)/x509_certificate_list |
| 148 | $(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list | 149 | $(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list |
diff --git a/kernel/acct.c b/kernel/acct.c index 33738ef972f3..e6c10d1a4058 100644 --- a/kernel/acct.c +++ b/kernel/acct.c | |||
| @@ -76,10 +76,11 @@ int acct_parm[3] = {4, 2, 30}; | |||
| 76 | /* | 76 | /* |
| 77 | * External references and all of the globals. | 77 | * External references and all of the globals. |
| 78 | */ | 78 | */ |
| 79 | static void do_acct_process(struct bsd_acct_struct *acct); | ||
| 80 | 79 | ||
| 81 | struct bsd_acct_struct { | 80 | struct bsd_acct_struct { |
| 82 | struct fs_pin pin; | 81 | struct fs_pin pin; |
| 82 | atomic_long_t count; | ||
| 83 | struct rcu_head rcu; | ||
| 83 | struct mutex lock; | 84 | struct mutex lock; |
| 84 | int active; | 85 | int active; |
| 85 | unsigned long needcheck; | 86 | unsigned long needcheck; |
| @@ -89,6 +90,8 @@ struct bsd_acct_struct { | |||
| 89 | struct completion done; | 90 | struct completion done; |
| 90 | }; | 91 | }; |
| 91 | 92 | ||
| 93 | static void do_acct_process(struct bsd_acct_struct *acct); | ||
| 94 | |||
| 92 | /* | 95 | /* |
| 93 | * Check the amount of free space and suspend/resume accordingly. | 96 | * Check the amount of free space and suspend/resume accordingly. |
| 94 | */ | 97 | */ |
| @@ -124,32 +127,56 @@ out: | |||
| 124 | return acct->active; | 127 | return acct->active; |
| 125 | } | 128 | } |
| 126 | 129 | ||
| 130 | static void acct_put(struct bsd_acct_struct *p) | ||
| 131 | { | ||
| 132 | if (atomic_long_dec_and_test(&p->count)) | ||
| 133 | kfree_rcu(p, rcu); | ||
| 134 | } | ||
| 135 | |||
| 136 | static inline struct bsd_acct_struct *to_acct(struct fs_pin *p) | ||
| 137 | { | ||
| 138 | return p ? container_of(p, struct bsd_acct_struct, pin) : NULL; | ||
| 139 | } | ||
| 140 | |||
| 127 | static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) | 141 | static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) |
| 128 | { | 142 | { |
| 129 | struct bsd_acct_struct *res; | 143 | struct bsd_acct_struct *res; |
| 130 | again: | 144 | again: |
| 131 | smp_rmb(); | 145 | smp_rmb(); |
| 132 | rcu_read_lock(); | 146 | rcu_read_lock(); |
| 133 | res = ACCESS_ONCE(ns->bacct); | 147 | res = to_acct(ACCESS_ONCE(ns->bacct)); |
| 134 | if (!res) { | 148 | if (!res) { |
| 135 | rcu_read_unlock(); | 149 | rcu_read_unlock(); |
| 136 | return NULL; | 150 | return NULL; |
| 137 | } | 151 | } |
| 138 | if (!atomic_long_inc_not_zero(&res->pin.count)) { | 152 | if (!atomic_long_inc_not_zero(&res->count)) { |
| 139 | rcu_read_unlock(); | 153 | rcu_read_unlock(); |
| 140 | cpu_relax(); | 154 | cpu_relax(); |
| 141 | goto again; | 155 | goto again; |
| 142 | } | 156 | } |
| 143 | rcu_read_unlock(); | 157 | rcu_read_unlock(); |
| 144 | mutex_lock(&res->lock); | 158 | mutex_lock(&res->lock); |
| 145 | if (!res->ns) { | 159 | if (res != to_acct(ACCESS_ONCE(ns->bacct))) { |
| 146 | mutex_unlock(&res->lock); | 160 | mutex_unlock(&res->lock); |
| 147 | pin_put(&res->pin); | 161 | acct_put(res); |
| 148 | goto again; | 162 | goto again; |
| 149 | } | 163 | } |
| 150 | return res; | 164 | return res; |
| 151 | } | 165 | } |
| 152 | 166 | ||
| 167 | static void acct_pin_kill(struct fs_pin *pin) | ||
| 168 | { | ||
| 169 | struct bsd_acct_struct *acct = to_acct(pin); | ||
| 170 | mutex_lock(&acct->lock); | ||
| 171 | do_acct_process(acct); | ||
| 172 | schedule_work(&acct->work); | ||
| 173 | wait_for_completion(&acct->done); | ||
| 174 | cmpxchg(&acct->ns->bacct, pin, NULL); | ||
| 175 | mutex_unlock(&acct->lock); | ||
| 176 | pin_remove(pin); | ||
| 177 | acct_put(acct); | ||
| 178 | } | ||
| 179 | |||
| 153 | static void close_work(struct work_struct *work) | 180 | static void close_work(struct work_struct *work) |
| 154 | { | 181 | { |
| 155 | struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); | 182 | struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); |
| @@ -160,44 +187,13 @@ static void close_work(struct work_struct *work) | |||
| 160 | complete(&acct->done); | 187 | complete(&acct->done); |
| 161 | } | 188 | } |
| 162 | 189 | ||
| 163 | static void acct_kill(struct bsd_acct_struct *acct, | ||
| 164 | struct bsd_acct_struct *new) | ||
| 165 | { | ||
| 166 | if (acct) { | ||
| 167 | struct pid_namespace *ns = acct->ns; | ||
| 168 | do_acct_process(acct); | ||
| 169 | INIT_WORK(&acct->work, close_work); | ||
| 170 | init_completion(&acct->done); | ||
| 171 | schedule_work(&acct->work); | ||
| 172 | wait_for_completion(&acct->done); | ||
| 173 | pin_remove(&acct->pin); | ||
| 174 | ns->bacct = new; | ||
| 175 | acct->ns = NULL; | ||
| 176 | atomic_long_dec(&acct->pin.count); | ||
| 177 | mutex_unlock(&acct->lock); | ||
| 178 | pin_put(&acct->pin); | ||
| 179 | } | ||
| 180 | } | ||
| 181 | |||
| 182 | static void acct_pin_kill(struct fs_pin *pin) | ||
| 183 | { | ||
| 184 | struct bsd_acct_struct *acct; | ||
| 185 | acct = container_of(pin, struct bsd_acct_struct, pin); | ||
| 186 | mutex_lock(&acct->lock); | ||
| 187 | if (!acct->ns) { | ||
| 188 | mutex_unlock(&acct->lock); | ||
| 189 | pin_put(pin); | ||
| 190 | acct = NULL; | ||
| 191 | } | ||
| 192 | acct_kill(acct, NULL); | ||
| 193 | } | ||
| 194 | |||
| 195 | static int acct_on(struct filename *pathname) | 190 | static int acct_on(struct filename *pathname) |
| 196 | { | 191 | { |
| 197 | struct file *file; | 192 | struct file *file; |
| 198 | struct vfsmount *mnt, *internal; | 193 | struct vfsmount *mnt, *internal; |
| 199 | struct pid_namespace *ns = task_active_pid_ns(current); | 194 | struct pid_namespace *ns = task_active_pid_ns(current); |
| 200 | struct bsd_acct_struct *acct, *old; | 195 | struct bsd_acct_struct *acct; |
| 196 | struct fs_pin *old; | ||
| 201 | int err; | 197 | int err; |
| 202 | 198 | ||
| 203 | acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); | 199 | acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); |
| @@ -238,21 +234,21 @@ static int acct_on(struct filename *pathname) | |||
| 238 | mnt = file->f_path.mnt; | 234 | mnt = file->f_path.mnt; |
| 239 | file->f_path.mnt = internal; | 235 | file->f_path.mnt = internal; |
| 240 | 236 | ||
| 241 | atomic_long_set(&acct->pin.count, 1); | 237 | atomic_long_set(&acct->count, 1); |
| 242 | acct->pin.kill = acct_pin_kill; | 238 | init_fs_pin(&acct->pin, acct_pin_kill); |
| 243 | acct->file = file; | 239 | acct->file = file; |
| 244 | acct->needcheck = jiffies; | 240 | acct->needcheck = jiffies; |
| 245 | acct->ns = ns; | 241 | acct->ns = ns; |
| 246 | mutex_init(&acct->lock); | 242 | mutex_init(&acct->lock); |
| 243 | INIT_WORK(&acct->work, close_work); | ||
| 244 | init_completion(&acct->done); | ||
| 247 | mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ | 245 | mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ |
| 248 | pin_insert(&acct->pin, mnt); | 246 | pin_insert(&acct->pin, mnt); |
| 249 | 247 | ||
| 250 | old = acct_get(ns); | 248 | rcu_read_lock(); |
| 251 | if (old) | 249 | old = xchg(&ns->bacct, &acct->pin); |
| 252 | acct_kill(old, acct); | ||
| 253 | else | ||
| 254 | ns->bacct = acct; | ||
| 255 | mutex_unlock(&acct->lock); | 250 | mutex_unlock(&acct->lock); |
| 251 | pin_kill(old); | ||
| 256 | mnt_drop_write(mnt); | 252 | mnt_drop_write(mnt); |
| 257 | mntput(mnt); | 253 | mntput(mnt); |
| 258 | return 0; | 254 | return 0; |
| @@ -288,7 +284,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name) | |||
| 288 | mutex_unlock(&acct_on_mutex); | 284 | mutex_unlock(&acct_on_mutex); |
| 289 | putname(tmp); | 285 | putname(tmp); |
| 290 | } else { | 286 | } else { |
| 291 | acct_kill(acct_get(task_active_pid_ns(current)), NULL); | 287 | rcu_read_lock(); |
| 288 | pin_kill(task_active_pid_ns(current)->bacct); | ||
| 292 | } | 289 | } |
| 293 | 290 | ||
| 294 | return error; | 291 | return error; |
| @@ -296,7 +293,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name) | |||
| 296 | 293 | ||
| 297 | void acct_exit_ns(struct pid_namespace *ns) | 294 | void acct_exit_ns(struct pid_namespace *ns) |
| 298 | { | 295 | { |
| 299 | acct_kill(acct_get(ns), NULL); | 296 | rcu_read_lock(); |
| 297 | pin_kill(ns->bacct); | ||
| 300 | } | 298 | } |
| 301 | 299 | ||
| 302 | /* | 300 | /* |
| @@ -576,7 +574,7 @@ static void slow_acct_process(struct pid_namespace *ns) | |||
| 576 | if (acct) { | 574 | if (acct) { |
| 577 | do_acct_process(acct); | 575 | do_acct_process(acct); |
| 578 | mutex_unlock(&acct->lock); | 576 | mutex_unlock(&acct->lock); |
| 579 | pin_put(&acct->pin); | 577 | acct_put(acct); |
| 580 | } | 578 | } |
| 581 | } | 579 | } |
| 582 | } | 580 | } |
diff --git a/kernel/audit.h b/kernel/audit.h index 3cdffad5a1d9..1caa0d345d90 100644 --- a/kernel/audit.h +++ b/kernel/audit.h | |||
| @@ -24,12 +24,6 @@ | |||
| 24 | #include <linux/skbuff.h> | 24 | #include <linux/skbuff.h> |
| 25 | #include <uapi/linux/mqueue.h> | 25 | #include <uapi/linux/mqueue.h> |
| 26 | 26 | ||
| 27 | /* 0 = no checking | ||
| 28 | 1 = put_count checking | ||
| 29 | 2 = verbose put_count checking | ||
| 30 | */ | ||
| 31 | #define AUDIT_DEBUG 0 | ||
| 32 | |||
| 33 | /* AUDIT_NAMES is the number of slots we reserve in the audit_context | 27 | /* AUDIT_NAMES is the number of slots we reserve in the audit_context |
| 34 | * for saving names from getname(). If we get more names we will allocate | 28 | * for saving names from getname(). If we get more names we will allocate |
| 35 | * a name dynamically and also add those to the list anchored by names_list. */ | 29 | * a name dynamically and also add those to the list anchored by names_list. */ |
| @@ -74,9 +68,8 @@ struct audit_cap_data { | |||
| 74 | }; | 68 | }; |
| 75 | }; | 69 | }; |
| 76 | 70 | ||
| 77 | /* When fs/namei.c:getname() is called, we store the pointer in name and | 71 | /* When fs/namei.c:getname() is called, we store the pointer in name and bump |
| 78 | * we don't let putname() free it (instead we free all of the saved | 72 | * the refcnt in the associated filename struct. |
| 79 | * pointers at syscall exit time). | ||
| 80 | * | 73 | * |
| 81 | * Further, in fs/namei.c:path_lookup() we store the inode and device. | 74 | * Further, in fs/namei.c:path_lookup() we store the inode and device. |
| 82 | */ | 75 | */ |
| @@ -86,7 +79,6 @@ struct audit_names { | |||
| 86 | struct filename *name; | 79 | struct filename *name; |
| 87 | int name_len; /* number of chars to log */ | 80 | int name_len; /* number of chars to log */ |
| 88 | bool hidden; /* don't log this record */ | 81 | bool hidden; /* don't log this record */ |
| 89 | bool name_put; /* call __putname()? */ | ||
| 90 | 82 | ||
| 91 | unsigned long ino; | 83 | unsigned long ino; |
| 92 | dev_t dev; | 84 | dev_t dev; |
| @@ -208,11 +200,6 @@ struct audit_context { | |||
| 208 | }; | 200 | }; |
| 209 | int fds[2]; | 201 | int fds[2]; |
| 210 | struct audit_proctitle proctitle; | 202 | struct audit_proctitle proctitle; |
| 211 | |||
| 212 | #if AUDIT_DEBUG | ||
| 213 | int put_count; | ||
| 214 | int ino_count; | ||
| 215 | #endif | ||
| 216 | }; | 203 | }; |
| 217 | 204 | ||
| 218 | extern u32 audit_ever_enabled; | 205 | extern u32 audit_ever_enabled; |
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 4f68a326d92e..72e1660a79a3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c | |||
| @@ -425,7 +425,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
| 425 | goto exit_nofree; | 425 | goto exit_nofree; |
| 426 | 426 | ||
| 427 | bufp = data->buf; | 427 | bufp = data->buf; |
| 428 | entry->rule.vers_ops = 2; | ||
| 429 | for (i = 0; i < data->field_count; i++) { | 428 | for (i = 0; i < data->field_count; i++) { |
| 430 | struct audit_field *f = &entry->rule.fields[i]; | 429 | struct audit_field *f = &entry->rule.fields[i]; |
| 431 | 430 | ||
| @@ -758,7 +757,6 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old) | |||
| 758 | return ERR_PTR(-ENOMEM); | 757 | return ERR_PTR(-ENOMEM); |
| 759 | 758 | ||
| 760 | new = &entry->rule; | 759 | new = &entry->rule; |
| 761 | new->vers_ops = old->vers_ops; | ||
| 762 | new->flags = old->flags; | 760 | new->flags = old->flags; |
| 763 | new->pflags = old->pflags; | 761 | new->pflags = old->pflags; |
| 764 | new->listnr = old->listnr; | 762 | new->listnr = old->listnr; |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 072566dd0caf..dc4ae70a7413 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
| @@ -866,33 +866,10 @@ static inline void audit_free_names(struct audit_context *context) | |||
| 866 | { | 866 | { |
| 867 | struct audit_names *n, *next; | 867 | struct audit_names *n, *next; |
| 868 | 868 | ||
| 869 | #if AUDIT_DEBUG == 2 | ||
| 870 | if (context->put_count + context->ino_count != context->name_count) { | ||
| 871 | int i = 0; | ||
| 872 | |||
| 873 | pr_err("%s:%d(:%d): major=%d in_syscall=%d" | ||
| 874 | " name_count=%d put_count=%d ino_count=%d" | ||
| 875 | " [NOT freeing]\n", __FILE__, __LINE__, | ||
| 876 | context->serial, context->major, context->in_syscall, | ||
| 877 | context->name_count, context->put_count, | ||
| 878 | context->ino_count); | ||
| 879 | list_for_each_entry(n, &context->names_list, list) { | ||
| 880 | pr_err("names[%d] = %p = %s\n", i++, n->name, | ||
| 881 | n->name->name ?: "(null)"); | ||
| 882 | } | ||
| 883 | dump_stack(); | ||
| 884 | return; | ||
| 885 | } | ||
| 886 | #endif | ||
| 887 | #if AUDIT_DEBUG | ||
| 888 | context->put_count = 0; | ||
| 889 | context->ino_count = 0; | ||
| 890 | #endif | ||
| 891 | |||
| 892 | list_for_each_entry_safe(n, next, &context->names_list, list) { | 869 | list_for_each_entry_safe(n, next, &context->names_list, list) { |
| 893 | list_del(&n->list); | 870 | list_del(&n->list); |
| 894 | if (n->name && n->name_put) | 871 | if (n->name) |
| 895 | final_putname(n->name); | 872 | putname(n->name); |
| 896 | if (n->should_free) | 873 | if (n->should_free) |
| 897 | kfree(n); | 874 | kfree(n); |
| 898 | } | 875 | } |
| @@ -1711,9 +1688,6 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, | |||
| 1711 | list_add_tail(&aname->list, &context->names_list); | 1688 | list_add_tail(&aname->list, &context->names_list); |
| 1712 | 1689 | ||
| 1713 | context->name_count++; | 1690 | context->name_count++; |
| 1714 | #if AUDIT_DEBUG | ||
| 1715 | context->ino_count++; | ||
| 1716 | #endif | ||
| 1717 | return aname; | 1691 | return aname; |
| 1718 | } | 1692 | } |
| 1719 | 1693 | ||
| @@ -1734,8 +1708,10 @@ __audit_reusename(const __user char *uptr) | |||
| 1734 | list_for_each_entry(n, &context->names_list, list) { | 1708 | list_for_each_entry(n, &context->names_list, list) { |
| 1735 | if (!n->name) | 1709 | if (!n->name) |
| 1736 | continue; | 1710 | continue; |
| 1737 | if (n->name->uptr == uptr) | 1711 | if (n->name->uptr == uptr) { |
| 1712 | n->name->refcnt++; | ||
| 1738 | return n->name; | 1713 | return n->name; |
| 1714 | } | ||
| 1739 | } | 1715 | } |
| 1740 | return NULL; | 1716 | return NULL; |
| 1741 | } | 1717 | } |
| @@ -1752,19 +1728,8 @@ void __audit_getname(struct filename *name) | |||
| 1752 | struct audit_context *context = current->audit_context; | 1728 | struct audit_context *context = current->audit_context; |
| 1753 | struct audit_names *n; | 1729 | struct audit_names *n; |
| 1754 | 1730 | ||
| 1755 | if (!context->in_syscall) { | 1731 | if (!context->in_syscall) |
| 1756 | #if AUDIT_DEBUG == 2 | ||
| 1757 | pr_err("%s:%d(:%d): ignoring getname(%p)\n", | ||
| 1758 | __FILE__, __LINE__, context->serial, name); | ||
| 1759 | dump_stack(); | ||
| 1760 | #endif | ||
| 1761 | return; | 1732 | return; |
| 1762 | } | ||
| 1763 | |||
| 1764 | #if AUDIT_DEBUG | ||
| 1765 | /* The filename _must_ have a populated ->name */ | ||
| 1766 | BUG_ON(!name->name); | ||
| 1767 | #endif | ||
| 1768 | 1733 | ||
| 1769 | n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); | 1734 | n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); |
| 1770 | if (!n) | 1735 | if (!n) |
| @@ -1772,56 +1737,13 @@ void __audit_getname(struct filename *name) | |||
| 1772 | 1737 | ||
| 1773 | n->name = name; | 1738 | n->name = name; |
| 1774 | n->name_len = AUDIT_NAME_FULL; | 1739 | n->name_len = AUDIT_NAME_FULL; |
| 1775 | n->name_put = true; | ||
| 1776 | name->aname = n; | 1740 | name->aname = n; |
| 1741 | name->refcnt++; | ||
| 1777 | 1742 | ||
| 1778 | if (!context->pwd.dentry) | 1743 | if (!context->pwd.dentry) |
| 1779 | get_fs_pwd(current->fs, &context->pwd); | 1744 | get_fs_pwd(current->fs, &context->pwd); |
| 1780 | } | 1745 | } |
| 1781 | 1746 | ||
| 1782 | /* audit_putname - intercept a putname request | ||
| 1783 | * @name: name to intercept and delay for putname | ||
| 1784 | * | ||
| 1785 | * If we have stored the name from getname in the audit context, | ||
| 1786 | * then we delay the putname until syscall exit. | ||
| 1787 | * Called from include/linux/fs.h:putname(). | ||
| 1788 | */ | ||
| 1789 | void audit_putname(struct filename *name) | ||
| 1790 | { | ||
| 1791 | struct audit_context *context = current->audit_context; | ||
| 1792 | |||
| 1793 | BUG_ON(!context); | ||
| 1794 | if (!name->aname || !context->in_syscall) { | ||
| 1795 | #if AUDIT_DEBUG == 2 | ||
| 1796 | pr_err("%s:%d(:%d): final_putname(%p)\n", | ||
| 1797 | __FILE__, __LINE__, context->serial, name); | ||
| 1798 | if (context->name_count) { | ||
| 1799 | struct audit_names *n; | ||
| 1800 | int i = 0; | ||
| 1801 | |||
| 1802 | list_for_each_entry(n, &context->names_list, list) | ||
| 1803 | pr_err("name[%d] = %p = %s\n", i++, n->name, | ||
| 1804 | n->name->name ?: "(null)"); | ||
| 1805 | } | ||
| 1806 | #endif | ||
| 1807 | final_putname(name); | ||
| 1808 | } | ||
| 1809 | #if AUDIT_DEBUG | ||
| 1810 | else { | ||
| 1811 | ++context->put_count; | ||
| 1812 | if (context->put_count > context->name_count) { | ||
| 1813 | pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)" | ||
| 1814 | " name_count=%d put_count=%d\n", | ||
| 1815 | __FILE__, __LINE__, | ||
| 1816 | context->serial, context->major, | ||
| 1817 | context->in_syscall, name->name, | ||
| 1818 | context->name_count, context->put_count); | ||
| 1819 | dump_stack(); | ||
| 1820 | } | ||
| 1821 | } | ||
| 1822 | #endif | ||
| 1823 | } | ||
| 1824 | |||
| 1825 | /** | 1747 | /** |
| 1826 | * __audit_inode - store the inode and device from a lookup | 1748 | * __audit_inode - store the inode and device from a lookup |
| 1827 | * @name: name being audited | 1749 | * @name: name being audited |
| @@ -1842,10 +1764,6 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, | |||
| 1842 | if (!name) | 1764 | if (!name) |
| 1843 | goto out_alloc; | 1765 | goto out_alloc; |
| 1844 | 1766 | ||
| 1845 | #if AUDIT_DEBUG | ||
| 1846 | /* The struct filename _must_ have a populated ->name */ | ||
| 1847 | BUG_ON(!name->name); | ||
| 1848 | #endif | ||
| 1849 | /* | 1767 | /* |
| 1850 | * If we have a pointer to an audit_names entry already, then we can | 1768 | * If we have a pointer to an audit_names entry already, then we can |
| 1851 | * just use it directly if the type is correct. | 1769 | * just use it directly if the type is correct. |
| @@ -1863,7 +1781,17 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, | |||
| 1863 | } | 1781 | } |
| 1864 | 1782 | ||
| 1865 | list_for_each_entry_reverse(n, &context->names_list, list) { | 1783 | list_for_each_entry_reverse(n, &context->names_list, list) { |
| 1866 | if (!n->name || strcmp(n->name->name, name->name)) | 1784 | if (n->ino) { |
| 1785 | /* valid inode number, use that for the comparison */ | ||
| 1786 | if (n->ino != inode->i_ino || | ||
| 1787 | n->dev != inode->i_sb->s_dev) | ||
| 1788 | continue; | ||
| 1789 | } else if (n->name) { | ||
| 1790 | /* inode number has not been set, check the name */ | ||
| 1791 | if (strcmp(n->name->name, name->name)) | ||
| 1792 | continue; | ||
| 1793 | } else | ||
| 1794 | /* no inode and no name (?!) ... this is odd ... */ | ||
| 1867 | continue; | 1795 | continue; |
| 1868 | 1796 | ||
| 1869 | /* match the correct record type */ | 1797 | /* match the correct record type */ |
| @@ -1882,44 +1810,11 @@ out_alloc: | |||
| 1882 | n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); | 1810 | n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); |
| 1883 | if (!n) | 1811 | if (!n) |
| 1884 | return; | 1812 | return; |
| 1885 | /* unfortunately, while we may have a path name to record with the | ||
| 1886 | * inode, we can't always rely on the string lasting until the end of | ||
| 1887 | * the syscall so we need to create our own copy, it may fail due to | ||
| 1888 | * memory allocation issues, but we do our best */ | ||
| 1889 | if (name) { | 1813 | if (name) { |
| 1890 | /* we can't use getname_kernel() due to size limits */ | 1814 | n->name = name; |
| 1891 | size_t len = strlen(name->name) + 1; | 1815 | name->refcnt++; |
| 1892 | struct filename *new = __getname(); | ||
| 1893 | |||
| 1894 | if (unlikely(!new)) | ||
| 1895 | goto out; | ||
| 1896 | |||
| 1897 | if (len <= (PATH_MAX - sizeof(*new))) { | ||
| 1898 | new->name = (char *)(new) + sizeof(*new); | ||
| 1899 | new->separate = false; | ||
| 1900 | } else if (len <= PATH_MAX) { | ||
| 1901 | /* this looks odd, but is due to final_putname() */ | ||
| 1902 | struct filename *new2; | ||
| 1903 | |||
| 1904 | new2 = kmalloc(sizeof(*new2), GFP_KERNEL); | ||
| 1905 | if (unlikely(!new2)) { | ||
| 1906 | __putname(new); | ||
| 1907 | goto out; | ||
| 1908 | } | ||
| 1909 | new2->name = (char *)new; | ||
| 1910 | new2->separate = true; | ||
| 1911 | new = new2; | ||
| 1912 | } else { | ||
| 1913 | /* we should never get here, but let's be safe */ | ||
| 1914 | __putname(new); | ||
| 1915 | goto out; | ||
| 1916 | } | ||
| 1917 | strlcpy((char *)new->name, name->name, len); | ||
| 1918 | new->uptr = NULL; | ||
| 1919 | new->aname = n; | ||
| 1920 | n->name = new; | ||
| 1921 | n->name_put = true; | ||
| 1922 | } | 1816 | } |
| 1817 | |||
| 1923 | out: | 1818 | out: |
| 1924 | if (parent) { | 1819 | if (parent) { |
| 1925 | n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; | 1820 | n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; |
| @@ -1970,11 +1865,16 @@ void __audit_inode_child(const struct inode *parent, | |||
| 1970 | 1865 | ||
| 1971 | /* look for a parent entry first */ | 1866 | /* look for a parent entry first */ |
| 1972 | list_for_each_entry(n, &context->names_list, list) { | 1867 | list_for_each_entry(n, &context->names_list, list) { |
| 1973 | if (!n->name || n->type != AUDIT_TYPE_PARENT) | 1868 | if (!n->name || |
| 1869 | (n->type != AUDIT_TYPE_PARENT && | ||
| 1870 | n->type != AUDIT_TYPE_UNKNOWN)) | ||
| 1974 | continue; | 1871 | continue; |
| 1975 | 1872 | ||
| 1976 | if (n->ino == parent->i_ino && | 1873 | if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev && |
| 1977 | !audit_compare_dname_path(dname, n->name->name, n->name_len)) { | 1874 | !audit_compare_dname_path(dname, |
| 1875 | n->name->name, n->name_len)) { | ||
| 1876 | if (n->type == AUDIT_TYPE_UNKNOWN) | ||
| 1877 | n->type = AUDIT_TYPE_PARENT; | ||
| 1978 | found_parent = n; | 1878 | found_parent = n; |
| 1979 | break; | 1879 | break; |
| 1980 | } | 1880 | } |
| @@ -1983,11 +1883,8 @@ void __audit_inode_child(const struct inode *parent, | |||
| 1983 | /* is there a matching child entry? */ | 1883 | /* is there a matching child entry? */ |
| 1984 | list_for_each_entry(n, &context->names_list, list) { | 1884 | list_for_each_entry(n, &context->names_list, list) { |
| 1985 | /* can only match entries that have a name */ | 1885 | /* can only match entries that have a name */ |
| 1986 | if (!n->name || n->type != type) | 1886 | if (!n->name || |
| 1987 | continue; | 1887 | (n->type != type && n->type != AUDIT_TYPE_UNKNOWN)) |
| 1988 | |||
| 1989 | /* if we found a parent, make sure this one is a child of it */ | ||
| 1990 | if (found_parent && (n->name != found_parent->name)) | ||
| 1991 | continue; | 1888 | continue; |
| 1992 | 1889 | ||
| 1993 | if (!strcmp(dname, n->name->name) || | 1890 | if (!strcmp(dname, n->name->name) || |
| @@ -1995,6 +1892,8 @@ void __audit_inode_child(const struct inode *parent, | |||
| 1995 | found_parent ? | 1892 | found_parent ? |
| 1996 | found_parent->name_len : | 1893 | found_parent->name_len : |
| 1997 | AUDIT_NAME_FULL)) { | 1894 | AUDIT_NAME_FULL)) { |
| 1895 | if (n->type == AUDIT_TYPE_UNKNOWN) | ||
| 1896 | n->type = type; | ||
| 1998 | found_child = n; | 1897 | found_child = n; |
| 1999 | break; | 1898 | break; |
| 2000 | } | 1899 | } |
| @@ -2019,10 +1918,10 @@ void __audit_inode_child(const struct inode *parent, | |||
| 2019 | if (found_parent) { | 1918 | if (found_parent) { |
| 2020 | found_child->name = found_parent->name; | 1919 | found_child->name = found_parent->name; |
| 2021 | found_child->name_len = AUDIT_NAME_FULL; | 1920 | found_child->name_len = AUDIT_NAME_FULL; |
| 2022 | /* don't call __putname() */ | 1921 | found_child->name->refcnt++; |
| 2023 | found_child->name_put = false; | ||
| 2024 | } | 1922 | } |
| 2025 | } | 1923 | } |
| 1924 | |||
| 2026 | if (inode) | 1925 | if (inode) |
| 2027 | audit_copy_inode(found_child, dentry, inode); | 1926 | audit_copy_inode(found_child, dentry, inode); |
| 2028 | else | 1927 | else |
| @@ -2405,7 +2304,6 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, | |||
| 2405 | struct audit_aux_data_bprm_fcaps *ax; | 2304 | struct audit_aux_data_bprm_fcaps *ax; |
| 2406 | struct audit_context *context = current->audit_context; | 2305 | struct audit_context *context = current->audit_context; |
| 2407 | struct cpu_vfs_cap_data vcaps; | 2306 | struct cpu_vfs_cap_data vcaps; |
| 2408 | struct dentry *dentry; | ||
| 2409 | 2307 | ||
| 2410 | ax = kmalloc(sizeof(*ax), GFP_KERNEL); | 2308 | ax = kmalloc(sizeof(*ax), GFP_KERNEL); |
| 2411 | if (!ax) | 2309 | if (!ax) |
| @@ -2415,9 +2313,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, | |||
| 2415 | ax->d.next = context->aux; | 2313 | ax->d.next = context->aux; |
| 2416 | context->aux = (void *)ax; | 2314 | context->aux = (void *)ax; |
| 2417 | 2315 | ||
| 2418 | dentry = dget(bprm->file->f_path.dentry); | 2316 | get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps); |
| 2419 | get_vfs_caps_from_disk(dentry, &vcaps); | ||
| 2420 | dput(dentry); | ||
| 2421 | 2317 | ||
| 2422 | ax->fcap.permitted = vcaps.permitted; | 2318 | ax->fcap.permitted = vcaps.permitted; |
| 2423 | ax->fcap.inheritable = vcaps.inheritable; | 2319 | ax->fcap.inheritable = vcaps.inheritable; |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 04cfe8ace520..29a7b2cc593e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
| @@ -3077,7 +3077,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) | |||
| 3077 | #endif | 3077 | #endif |
| 3078 | kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), | 3078 | kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), |
| 3079 | cgroup_file_mode(cft), 0, cft->kf_ops, cft, | 3079 | cgroup_file_mode(cft), 0, cft->kf_ops, cft, |
| 3080 | NULL, false, key); | 3080 | NULL, key); |
| 3081 | if (IS_ERR(kn)) | 3081 | if (IS_ERR(kn)) |
| 3082 | return PTR_ERR(kn); | 3082 | return PTR_ERR(kn); |
| 3083 | 3083 | ||
| @@ -4373,16 +4373,20 @@ static void css_free_work_fn(struct work_struct *work) | |||
| 4373 | { | 4373 | { |
| 4374 | struct cgroup_subsys_state *css = | 4374 | struct cgroup_subsys_state *css = |
| 4375 | container_of(work, struct cgroup_subsys_state, destroy_work); | 4375 | container_of(work, struct cgroup_subsys_state, destroy_work); |
| 4376 | struct cgroup_subsys *ss = css->ss; | ||
| 4376 | struct cgroup *cgrp = css->cgroup; | 4377 | struct cgroup *cgrp = css->cgroup; |
| 4377 | 4378 | ||
| 4378 | percpu_ref_exit(&css->refcnt); | 4379 | percpu_ref_exit(&css->refcnt); |
| 4379 | 4380 | ||
| 4380 | if (css->ss) { | 4381 | if (ss) { |
| 4381 | /* css free path */ | 4382 | /* css free path */ |
| 4383 | int id = css->id; | ||
| 4384 | |||
| 4382 | if (css->parent) | 4385 | if (css->parent) |
| 4383 | css_put(css->parent); | 4386 | css_put(css->parent); |
| 4384 | 4387 | ||
| 4385 | css->ss->css_free(css); | 4388 | ss->css_free(css); |
| 4389 | cgroup_idr_remove(&ss->css_idr, id); | ||
| 4386 | cgroup_put(cgrp); | 4390 | cgroup_put(cgrp); |
| 4387 | } else { | 4391 | } else { |
| 4388 | /* cgroup free path */ | 4392 | /* cgroup free path */ |
| @@ -4434,7 +4438,7 @@ static void css_release_work_fn(struct work_struct *work) | |||
| 4434 | 4438 | ||
| 4435 | if (ss) { | 4439 | if (ss) { |
| 4436 | /* css release path */ | 4440 | /* css release path */ |
| 4437 | cgroup_idr_remove(&ss->css_idr, css->id); | 4441 | cgroup_idr_replace(&ss->css_idr, NULL, css->id); |
| 4438 | if (ss->css_released) | 4442 | if (ss->css_released) |
| 4439 | ss->css_released(css); | 4443 | ss->css_released(css); |
| 4440 | } else { | 4444 | } else { |
diff --git a/kernel/compat.c b/kernel/compat.c index ebb3c369d03d..24f00610c575 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
| @@ -276,8 +276,7 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, | |||
| 276 | * core implementation decides to return random nonsense. | 276 | * core implementation decides to return random nonsense. |
| 277 | */ | 277 | */ |
| 278 | if (ret == -ERESTART_RESTARTBLOCK) { | 278 | if (ret == -ERESTART_RESTARTBLOCK) { |
| 279 | struct restart_block *restart | 279 | struct restart_block *restart = ¤t->restart_block; |
| 280 | = ¤t_thread_info()->restart_block; | ||
| 281 | 280 | ||
| 282 | restart->fn = compat_nanosleep_restart; | 281 | restart->fn = compat_nanosleep_restart; |
| 283 | restart->nanosleep.compat_rmtp = rmtp; | 282 | restart->nanosleep.compat_rmtp = rmtp; |
| @@ -860,7 +859,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, | |||
| 860 | return -EFAULT; | 859 | return -EFAULT; |
| 861 | 860 | ||
| 862 | if (err == -ERESTART_RESTARTBLOCK) { | 861 | if (err == -ERESTART_RESTARTBLOCK) { |
| 863 | restart = ¤t_thread_info()->restart_block; | 862 | restart = ¤t->restart_block; |
| 864 | restart->fn = compat_clock_nanosleep_restart; | 863 | restart->fn = compat_clock_nanosleep_restart; |
| 865 | restart->nanosleep.compat_rmtp = rmtp; | 864 | restart->nanosleep.compat_rmtp = rmtp; |
| 866 | } | 865 | } |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 5d220234b3ca..1972b161c61e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -58,22 +58,23 @@ static int cpu_hotplug_disabled; | |||
| 58 | 58 | ||
| 59 | static struct { | 59 | static struct { |
| 60 | struct task_struct *active_writer; | 60 | struct task_struct *active_writer; |
| 61 | struct mutex lock; /* Synchronizes accesses to refcount, */ | 61 | /* wait queue to wake up the active_writer */ |
| 62 | wait_queue_head_t wq; | ||
| 63 | /* verifies that no writer will get active while readers are active */ | ||
| 64 | struct mutex lock; | ||
| 62 | /* | 65 | /* |
| 63 | * Also blocks the new readers during | 66 | * Also blocks the new readers during |
| 64 | * an ongoing cpu hotplug operation. | 67 | * an ongoing cpu hotplug operation. |
| 65 | */ | 68 | */ |
| 66 | int refcount; | 69 | atomic_t refcount; |
| 67 | /* And allows lockless put_online_cpus(). */ | ||
| 68 | atomic_t puts_pending; | ||
| 69 | 70 | ||
| 70 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 71 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
| 71 | struct lockdep_map dep_map; | 72 | struct lockdep_map dep_map; |
| 72 | #endif | 73 | #endif |
| 73 | } cpu_hotplug = { | 74 | } cpu_hotplug = { |
| 74 | .active_writer = NULL, | 75 | .active_writer = NULL, |
| 76 | .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), | ||
| 75 | .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), | 77 | .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), |
| 76 | .refcount = 0, | ||
| 77 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 78 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
| 78 | .dep_map = {.name = "cpu_hotplug.lock" }, | 79 | .dep_map = {.name = "cpu_hotplug.lock" }, |
| 79 | #endif | 80 | #endif |
| @@ -86,15 +87,6 @@ static struct { | |||
| 86 | #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) | 87 | #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) |
| 87 | #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) | 88 | #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) |
| 88 | 89 | ||
| 89 | static void apply_puts_pending(int max) | ||
| 90 | { | ||
| 91 | int delta; | ||
| 92 | |||
| 93 | if (atomic_read(&cpu_hotplug.puts_pending) >= max) { | ||
| 94 | delta = atomic_xchg(&cpu_hotplug.puts_pending, 0); | ||
| 95 | cpu_hotplug.refcount -= delta; | ||
| 96 | } | ||
| 97 | } | ||
| 98 | 90 | ||
| 99 | void get_online_cpus(void) | 91 | void get_online_cpus(void) |
| 100 | { | 92 | { |
| @@ -103,8 +95,7 @@ void get_online_cpus(void) | |||
| 103 | return; | 95 | return; |
| 104 | cpuhp_lock_acquire_read(); | 96 | cpuhp_lock_acquire_read(); |
| 105 | mutex_lock(&cpu_hotplug.lock); | 97 | mutex_lock(&cpu_hotplug.lock); |
| 106 | apply_puts_pending(65536); | 98 | atomic_inc(&cpu_hotplug.refcount); |
| 107 | cpu_hotplug.refcount++; | ||
| 108 | mutex_unlock(&cpu_hotplug.lock); | 99 | mutex_unlock(&cpu_hotplug.lock); |
| 109 | } | 100 | } |
| 110 | EXPORT_SYMBOL_GPL(get_online_cpus); | 101 | EXPORT_SYMBOL_GPL(get_online_cpus); |
| @@ -116,8 +107,7 @@ bool try_get_online_cpus(void) | |||
| 116 | if (!mutex_trylock(&cpu_hotplug.lock)) | 107 | if (!mutex_trylock(&cpu_hotplug.lock)) |
| 117 | return false; | 108 | return false; |
| 118 | cpuhp_lock_acquire_tryread(); | 109 | cpuhp_lock_acquire_tryread(); |
| 119 | apply_puts_pending(65536); | 110 | atomic_inc(&cpu_hotplug.refcount); |
| 120 | cpu_hotplug.refcount++; | ||
| 121 | mutex_unlock(&cpu_hotplug.lock); | 111 | mutex_unlock(&cpu_hotplug.lock); |
| 122 | return true; | 112 | return true; |
| 123 | } | 113 | } |
| @@ -125,20 +115,18 @@ EXPORT_SYMBOL_GPL(try_get_online_cpus); | |||
| 125 | 115 | ||
| 126 | void put_online_cpus(void) | 116 | void put_online_cpus(void) |
| 127 | { | 117 | { |
| 118 | int refcount; | ||
| 119 | |||
| 128 | if (cpu_hotplug.active_writer == current) | 120 | if (cpu_hotplug.active_writer == current) |
| 129 | return; | 121 | return; |
| 130 | if (!mutex_trylock(&cpu_hotplug.lock)) { | ||
| 131 | atomic_inc(&cpu_hotplug.puts_pending); | ||
| 132 | cpuhp_lock_release(); | ||
| 133 | return; | ||
| 134 | } | ||
| 135 | 122 | ||
| 136 | if (WARN_ON(!cpu_hotplug.refcount)) | 123 | refcount = atomic_dec_return(&cpu_hotplug.refcount); |
| 137 | cpu_hotplug.refcount++; /* try to fix things up */ | 124 | if (WARN_ON(refcount < 0)) /* try to fix things up */ |
| 125 | atomic_inc(&cpu_hotplug.refcount); | ||
| 126 | |||
| 127 | if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq)) | ||
| 128 | wake_up(&cpu_hotplug.wq); | ||
| 138 | 129 | ||
| 139 | if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) | ||
| 140 | wake_up_process(cpu_hotplug.active_writer); | ||
| 141 | mutex_unlock(&cpu_hotplug.lock); | ||
| 142 | cpuhp_lock_release(); | 130 | cpuhp_lock_release(); |
| 143 | 131 | ||
| 144 | } | 132 | } |
| @@ -168,18 +156,20 @@ EXPORT_SYMBOL_GPL(put_online_cpus); | |||
| 168 | */ | 156 | */ |
| 169 | void cpu_hotplug_begin(void) | 157 | void cpu_hotplug_begin(void) |
| 170 | { | 158 | { |
| 171 | cpu_hotplug.active_writer = current; | 159 | DEFINE_WAIT(wait); |
| 172 | 160 | ||
| 161 | cpu_hotplug.active_writer = current; | ||
| 173 | cpuhp_lock_acquire(); | 162 | cpuhp_lock_acquire(); |
| 163 | |||
| 174 | for (;;) { | 164 | for (;;) { |
| 175 | mutex_lock(&cpu_hotplug.lock); | 165 | mutex_lock(&cpu_hotplug.lock); |
| 176 | apply_puts_pending(1); | 166 | prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE); |
| 177 | if (likely(!cpu_hotplug.refcount)) | 167 | if (likely(!atomic_read(&cpu_hotplug.refcount))) |
| 178 | break; | 168 | break; |
| 179 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
| 180 | mutex_unlock(&cpu_hotplug.lock); | 169 | mutex_unlock(&cpu_hotplug.lock); |
| 181 | schedule(); | 170 | schedule(); |
| 182 | } | 171 | } |
| 172 | finish_wait(&cpu_hotplug.wq, &wait); | ||
| 183 | } | 173 | } |
| 184 | 174 | ||
| 185 | void cpu_hotplug_done(void) | 175 | void cpu_hotplug_done(void) |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 64b257f6bca2..1d1fe9361d29 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -1707,40 +1707,27 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) | |||
| 1707 | { | 1707 | { |
| 1708 | struct cpuset *cs = css_cs(seq_css(sf)); | 1708 | struct cpuset *cs = css_cs(seq_css(sf)); |
| 1709 | cpuset_filetype_t type = seq_cft(sf)->private; | 1709 | cpuset_filetype_t type = seq_cft(sf)->private; |
| 1710 | ssize_t count; | ||
| 1711 | char *buf, *s; | ||
| 1712 | int ret = 0; | 1710 | int ret = 0; |
| 1713 | 1711 | ||
| 1714 | count = seq_get_buf(sf, &buf); | ||
| 1715 | s = buf; | ||
| 1716 | |||
| 1717 | spin_lock_irq(&callback_lock); | 1712 | spin_lock_irq(&callback_lock); |
| 1718 | 1713 | ||
| 1719 | switch (type) { | 1714 | switch (type) { |
| 1720 | case FILE_CPULIST: | 1715 | case FILE_CPULIST: |
| 1721 | s += cpulist_scnprintf(s, count, cs->cpus_allowed); | 1716 | seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); |
| 1722 | break; | 1717 | break; |
| 1723 | case FILE_MEMLIST: | 1718 | case FILE_MEMLIST: |
| 1724 | s += nodelist_scnprintf(s, count, cs->mems_allowed); | 1719 | seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); |
| 1725 | break; | 1720 | break; |
| 1726 | case FILE_EFFECTIVE_CPULIST: | 1721 | case FILE_EFFECTIVE_CPULIST: |
| 1727 | s += cpulist_scnprintf(s, count, cs->effective_cpus); | 1722 | seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus)); |
| 1728 | break; | 1723 | break; |
| 1729 | case FILE_EFFECTIVE_MEMLIST: | 1724 | case FILE_EFFECTIVE_MEMLIST: |
| 1730 | s += nodelist_scnprintf(s, count, cs->effective_mems); | 1725 | seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); |
| 1731 | break; | 1726 | break; |
| 1732 | default: | 1727 | default: |
| 1733 | ret = -EINVAL; | 1728 | ret = -EINVAL; |
| 1734 | goto out_unlock; | ||
| 1735 | } | 1729 | } |
| 1736 | 1730 | ||
| 1737 | if (s < buf + count - 1) { | ||
| 1738 | *s++ = '\n'; | ||
| 1739 | seq_commit(sf, s - buf); | ||
| 1740 | } else { | ||
| 1741 | seq_commit(sf, -1); | ||
| 1742 | } | ||
| 1743 | out_unlock: | ||
| 1744 | spin_unlock_irq(&callback_lock); | 1731 | spin_unlock_irq(&callback_lock); |
| 1745 | return ret; | 1732 | return ret; |
| 1746 | } | 1733 | } |
| @@ -2400,7 +2387,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk) | |||
| 2400 | */ | 2387 | */ |
| 2401 | } | 2388 | } |
| 2402 | 2389 | ||
| 2403 | void cpuset_init_current_mems_allowed(void) | 2390 | void __init cpuset_init_current_mems_allowed(void) |
| 2404 | { | 2391 | { |
| 2405 | nodes_setall(current->mems_allowed); | 2392 | nodes_setall(current->mems_allowed); |
| 2406 | } | 2393 | } |
| @@ -2610,8 +2597,6 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, | |||
| 2610 | return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); | 2597 | return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); |
| 2611 | } | 2598 | } |
| 2612 | 2599 | ||
| 2613 | #define CPUSET_NODELIST_LEN (256) | ||
| 2614 | |||
| 2615 | /** | 2600 | /** |
| 2616 | * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed | 2601 | * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed |
| 2617 | * @tsk: pointer to task_struct of some task. | 2602 | * @tsk: pointer to task_struct of some task. |
| @@ -2621,23 +2606,16 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, | |||
| 2621 | */ | 2606 | */ |
| 2622 | void cpuset_print_task_mems_allowed(struct task_struct *tsk) | 2607 | void cpuset_print_task_mems_allowed(struct task_struct *tsk) |
| 2623 | { | 2608 | { |
| 2624 | /* Statically allocated to prevent using excess stack. */ | ||
| 2625 | static char cpuset_nodelist[CPUSET_NODELIST_LEN]; | ||
| 2626 | static DEFINE_SPINLOCK(cpuset_buffer_lock); | ||
| 2627 | struct cgroup *cgrp; | 2609 | struct cgroup *cgrp; |
| 2628 | 2610 | ||
| 2629 | spin_lock(&cpuset_buffer_lock); | ||
| 2630 | rcu_read_lock(); | 2611 | rcu_read_lock(); |
| 2631 | 2612 | ||
| 2632 | cgrp = task_cs(tsk)->css.cgroup; | 2613 | cgrp = task_cs(tsk)->css.cgroup; |
| 2633 | nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, | ||
| 2634 | tsk->mems_allowed); | ||
| 2635 | pr_info("%s cpuset=", tsk->comm); | 2614 | pr_info("%s cpuset=", tsk->comm); |
| 2636 | pr_cont_cgroup_name(cgrp); | 2615 | pr_cont_cgroup_name(cgrp); |
| 2637 | pr_cont(" mems_allowed=%s\n", cpuset_nodelist); | 2616 | pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed)); |
| 2638 | 2617 | ||
| 2639 | rcu_read_unlock(); | 2618 | rcu_read_unlock(); |
| 2640 | spin_unlock(&cpuset_buffer_lock); | ||
| 2641 | } | 2619 | } |
| 2642 | 2620 | ||
| 2643 | /* | 2621 | /* |
| @@ -2715,10 +2693,8 @@ out: | |||
| 2715 | /* Display task mems_allowed in /proc/<pid>/status file. */ | 2693 | /* Display task mems_allowed in /proc/<pid>/status file. */ |
| 2716 | void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) | 2694 | void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) |
| 2717 | { | 2695 | { |
| 2718 | seq_puts(m, "Mems_allowed:\t"); | 2696 | seq_printf(m, "Mems_allowed:\t%*pb\n", |
| 2719 | seq_nodemask(m, &task->mems_allowed); | 2697 | nodemask_pr_args(&task->mems_allowed)); |
| 2720 | seq_puts(m, "\n"); | 2698 | seq_printf(m, "Mems_allowed_list:\t%*pbl\n", |
| 2721 | seq_puts(m, "Mems_allowed_list:\t"); | 2699 | nodemask_pr_args(&task->mems_allowed)); |
| 2722 | seq_nodemask_list(m, &task->mems_allowed); | ||
| 2723 | seq_puts(m, "\n"); | ||
| 2724 | } | 2700 | } |
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 07ce18ca71e0..0874e2edd275 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
| @@ -604,7 +604,7 @@ return_normal: | |||
| 604 | online_cpus) | 604 | online_cpus) |
| 605 | cpu_relax(); | 605 | cpu_relax(); |
| 606 | if (!time_left) | 606 | if (!time_left) |
| 607 | pr_crit("KGDB: Timed out waiting for secondary CPUs.\n"); | 607 | pr_crit("Timed out waiting for secondary CPUs.\n"); |
| 608 | 608 | ||
| 609 | /* | 609 | /* |
| 610 | * At this point the primary processor is completely | 610 | * At this point the primary processor is completely |
| @@ -696,6 +696,14 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) | |||
| 696 | 696 | ||
| 697 | if (arch_kgdb_ops.enable_nmi) | 697 | if (arch_kgdb_ops.enable_nmi) |
| 698 | arch_kgdb_ops.enable_nmi(0); | 698 | arch_kgdb_ops.enable_nmi(0); |
| 699 | /* | ||
| 700 | * Avoid entering the debugger if we were triggered due to an oops | ||
| 701 | * but panic_timeout indicates the system should automatically | ||
| 702 | * reboot on panic. We don't want to get stuck waiting for input | ||
| 703 | * on such systems, especially if its "just" an oops. | ||
| 704 | */ | ||
| 705 | if (signo != SIGTRAP && panic_timeout) | ||
| 706 | return 1; | ||
| 699 | 707 | ||
| 700 | memset(ks, 0, sizeof(struct kgdb_state)); | 708 | memset(ks, 0, sizeof(struct kgdb_state)); |
| 701 | ks->cpu = raw_smp_processor_id(); | 709 | ks->cpu = raw_smp_processor_id(); |
| @@ -828,6 +836,15 @@ static int kgdb_panic_event(struct notifier_block *self, | |||
| 828 | unsigned long val, | 836 | unsigned long val, |
| 829 | void *data) | 837 | void *data) |
| 830 | { | 838 | { |
| 839 | /* | ||
| 840 | * Avoid entering the debugger if we were triggered due to a panic | ||
| 841 | * We don't want to get stuck waiting for input from user in such case. | ||
| 842 | * panic_timeout indicates the system should automatically | ||
| 843 | * reboot on panic. | ||
| 844 | */ | ||
| 845 | if (panic_timeout) | ||
| 846 | return NOTIFY_DONE; | ||
| 847 | |||
| 831 | if (dbg_kdb_mode) | 848 | if (dbg_kdb_mode) |
| 832 | kdb_printf("PANIC: %s\n", (char *)data); | 849 | kdb_printf("PANIC: %s\n", (char *)data); |
| 833 | kgdb_breakpoint(); | 850 | kgdb_breakpoint(); |
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 7c70812caea5..fc1ef736253c 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c | |||
| @@ -439,7 +439,7 @@ poll_again: | |||
| 439 | * substituted for %d, %x or %o in the prompt. | 439 | * substituted for %d, %x or %o in the prompt. |
| 440 | */ | 440 | */ |
| 441 | 441 | ||
| 442 | char *kdb_getstr(char *buffer, size_t bufsize, char *prompt) | 442 | char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt) |
| 443 | { | 443 | { |
| 444 | if (prompt && kdb_prompt_str != prompt) | 444 | if (prompt && kdb_prompt_str != prompt) |
| 445 | strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); | 445 | strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); |
| @@ -548,7 +548,7 @@ static int kdb_search_string(char *searched, char *searchfor) | |||
| 548 | return 0; | 548 | return 0; |
| 549 | } | 549 | } |
| 550 | 550 | ||
| 551 | int vkdb_printf(const char *fmt, va_list ap) | 551 | int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) |
| 552 | { | 552 | { |
| 553 | int diag; | 553 | int diag; |
| 554 | int linecount; | 554 | int linecount; |
| @@ -680,6 +680,12 @@ int vkdb_printf(const char *fmt, va_list ap) | |||
| 680 | size_avail = sizeof(kdb_buffer) - len; | 680 | size_avail = sizeof(kdb_buffer) - len; |
| 681 | goto kdb_print_out; | 681 | goto kdb_print_out; |
| 682 | } | 682 | } |
| 683 | if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH) | ||
| 684 | /* | ||
| 685 | * This was a interactive search (using '/' at more | ||
| 686 | * prompt) and it has completed. Clear the flag. | ||
| 687 | */ | ||
| 688 | kdb_grepping_flag = 0; | ||
| 683 | /* | 689 | /* |
| 684 | * at this point the string is a full line and | 690 | * at this point the string is a full line and |
| 685 | * should be printed, up to the null. | 691 | * should be printed, up to the null. |
| @@ -691,19 +697,20 @@ kdb_printit: | |||
| 691 | * Write to all consoles. | 697 | * Write to all consoles. |
| 692 | */ | 698 | */ |
| 693 | retlen = strlen(kdb_buffer); | 699 | retlen = strlen(kdb_buffer); |
| 700 | cp = (char *) printk_skip_level(kdb_buffer); | ||
| 694 | if (!dbg_kdb_mode && kgdb_connected) { | 701 | if (!dbg_kdb_mode && kgdb_connected) { |
| 695 | gdbstub_msg_write(kdb_buffer, retlen); | 702 | gdbstub_msg_write(cp, retlen - (cp - kdb_buffer)); |
| 696 | } else { | 703 | } else { |
| 697 | if (dbg_io_ops && !dbg_io_ops->is_console) { | 704 | if (dbg_io_ops && !dbg_io_ops->is_console) { |
| 698 | len = retlen; | 705 | len = retlen - (cp - kdb_buffer); |
| 699 | cp = kdb_buffer; | 706 | cp2 = cp; |
| 700 | while (len--) { | 707 | while (len--) { |
| 701 | dbg_io_ops->write_char(*cp); | 708 | dbg_io_ops->write_char(*cp2); |
| 702 | cp++; | 709 | cp2++; |
| 703 | } | 710 | } |
| 704 | } | 711 | } |
| 705 | while (c) { | 712 | while (c) { |
| 706 | c->write(c, kdb_buffer, retlen); | 713 | c->write(c, cp, retlen - (cp - kdb_buffer)); |
| 707 | touch_nmi_watchdog(); | 714 | touch_nmi_watchdog(); |
| 708 | c = c->next; | 715 | c = c->next; |
| 709 | } | 716 | } |
| @@ -711,7 +718,10 @@ kdb_printit: | |||
| 711 | if (logging) { | 718 | if (logging) { |
| 712 | saved_loglevel = console_loglevel; | 719 | saved_loglevel = console_loglevel; |
| 713 | console_loglevel = CONSOLE_LOGLEVEL_SILENT; | 720 | console_loglevel = CONSOLE_LOGLEVEL_SILENT; |
| 714 | printk(KERN_INFO "%s", kdb_buffer); | 721 | if (printk_get_level(kdb_buffer) || src == KDB_MSGSRC_PRINTK) |
| 722 | printk("%s", kdb_buffer); | ||
| 723 | else | ||
| 724 | pr_info("%s", kdb_buffer); | ||
| 715 | } | 725 | } |
| 716 | 726 | ||
| 717 | if (KDB_STATE(PAGER)) { | 727 | if (KDB_STATE(PAGER)) { |
| @@ -794,11 +804,23 @@ kdb_printit: | |||
| 794 | kdb_nextline = linecount - 1; | 804 | kdb_nextline = linecount - 1; |
| 795 | kdb_printf("\r"); | 805 | kdb_printf("\r"); |
| 796 | suspend_grep = 1; /* for this recursion */ | 806 | suspend_grep = 1; /* for this recursion */ |
| 807 | } else if (buf1[0] == '/' && !kdb_grepping_flag) { | ||
| 808 | kdb_printf("\r"); | ||
| 809 | kdb_getstr(kdb_grep_string, KDB_GREP_STRLEN, | ||
| 810 | kdbgetenv("SEARCHPROMPT") ?: "search> "); | ||
| 811 | *strchrnul(kdb_grep_string, '\n') = '\0'; | ||
| 812 | kdb_grepping_flag += KDB_GREPPING_FLAG_SEARCH; | ||
| 813 | suspend_grep = 1; /* for this recursion */ | ||
| 797 | } else if (buf1[0] && buf1[0] != '\n') { | 814 | } else if (buf1[0] && buf1[0] != '\n') { |
| 798 | /* user hit something other than enter */ | 815 | /* user hit something other than enter */ |
| 799 | suspend_grep = 1; /* for this recursion */ | 816 | suspend_grep = 1; /* for this recursion */ |
| 800 | kdb_printf("\nOnly 'q' or 'Q' are processed at more " | 817 | if (buf1[0] != '/') |
| 801 | "prompt, input ignored\n"); | 818 | kdb_printf( |
| 819 | "\nOnly 'q', 'Q' or '/' are processed at " | ||
| 820 | "more prompt, input ignored\n"); | ||
| 821 | else | ||
| 822 | kdb_printf("\n'/' cannot be used during | " | ||
| 823 | "grep filtering, input ignored\n"); | ||
| 802 | } else if (kdb_grepping_flag) { | 824 | } else if (kdb_grepping_flag) { |
| 803 | /* user hit enter */ | 825 | /* user hit enter */ |
| 804 | suspend_grep = 1; /* for this recursion */ | 826 | suspend_grep = 1; /* for this recursion */ |
| @@ -844,7 +866,7 @@ int kdb_printf(const char *fmt, ...) | |||
| 844 | int r; | 866 | int r; |
| 845 | 867 | ||
| 846 | va_start(ap, fmt); | 868 | va_start(ap, fmt); |
| 847 | r = vkdb_printf(fmt, ap); | 869 | r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap); |
| 848 | va_end(ap); | 870 | va_end(ap); |
| 849 | 871 | ||
| 850 | return r; | 872 | return r; |
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 7b40c5f07dce..4121345498e0 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
| @@ -50,8 +50,7 @@ | |||
| 50 | static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE; | 50 | static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE; |
| 51 | module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600); | 51 | module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600); |
| 52 | 52 | ||
| 53 | #define GREP_LEN 256 | 53 | char kdb_grep_string[KDB_GREP_STRLEN]; |
| 54 | char kdb_grep_string[GREP_LEN]; | ||
| 55 | int kdb_grepping_flag; | 54 | int kdb_grepping_flag; |
| 56 | EXPORT_SYMBOL(kdb_grepping_flag); | 55 | EXPORT_SYMBOL(kdb_grepping_flag); |
| 57 | int kdb_grep_leading; | 56 | int kdb_grep_leading; |
| @@ -870,7 +869,7 @@ static void parse_grep(const char *str) | |||
| 870 | len = strlen(cp); | 869 | len = strlen(cp); |
| 871 | if (!len) | 870 | if (!len) |
| 872 | return; | 871 | return; |
| 873 | if (len >= GREP_LEN) { | 872 | if (len >= KDB_GREP_STRLEN) { |
| 874 | kdb_printf("search string too long\n"); | 873 | kdb_printf("search string too long\n"); |
| 875 | return; | 874 | return; |
| 876 | } | 875 | } |
| @@ -915,13 +914,12 @@ int kdb_parse(const char *cmdstr) | |||
| 915 | char *cp; | 914 | char *cp; |
| 916 | char *cpp, quoted; | 915 | char *cpp, quoted; |
| 917 | kdbtab_t *tp; | 916 | kdbtab_t *tp; |
| 918 | int i, escaped, ignore_errors = 0, check_grep; | 917 | int i, escaped, ignore_errors = 0, check_grep = 0; |
| 919 | 918 | ||
| 920 | /* | 919 | /* |
| 921 | * First tokenize the command string. | 920 | * First tokenize the command string. |
| 922 | */ | 921 | */ |
| 923 | cp = (char *)cmdstr; | 922 | cp = (char *)cmdstr; |
| 924 | kdb_grepping_flag = check_grep = 0; | ||
| 925 | 923 | ||
| 926 | if (KDB_FLAG(CMD_INTERRUPT)) { | 924 | if (KDB_FLAG(CMD_INTERRUPT)) { |
| 927 | /* Previous command was interrupted, newline must not | 925 | /* Previous command was interrupted, newline must not |
| @@ -1247,7 +1245,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, | |||
| 1247 | kdb_printf("due to NonMaskable Interrupt @ " | 1245 | kdb_printf("due to NonMaskable Interrupt @ " |
| 1248 | kdb_machreg_fmt "\n", | 1246 | kdb_machreg_fmt "\n", |
| 1249 | instruction_pointer(regs)); | 1247 | instruction_pointer(regs)); |
| 1250 | kdb_dumpregs(regs); | ||
| 1251 | break; | 1248 | break; |
| 1252 | case KDB_REASON_SSTEP: | 1249 | case KDB_REASON_SSTEP: |
| 1253 | case KDB_REASON_BREAK: | 1250 | case KDB_REASON_BREAK: |
| @@ -1281,6 +1278,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, | |||
| 1281 | */ | 1278 | */ |
| 1282 | kdb_nextline = 1; | 1279 | kdb_nextline = 1; |
| 1283 | KDB_STATE_CLEAR(SUPPRESS); | 1280 | KDB_STATE_CLEAR(SUPPRESS); |
| 1281 | kdb_grepping_flag = 0; | ||
| 1282 | /* ensure the old search does not leak into '/' commands */ | ||
| 1283 | kdb_grep_string[0] = '\0'; | ||
| 1284 | 1284 | ||
| 1285 | cmdbuf = cmd_cur; | 1285 | cmdbuf = cmd_cur; |
| 1286 | *cmdbuf = '\0'; | 1286 | *cmdbuf = '\0'; |
| @@ -2256,7 +2256,7 @@ static int kdb_cpu(int argc, const char **argv) | |||
| 2256 | /* | 2256 | /* |
| 2257 | * Validate cpunum | 2257 | * Validate cpunum |
| 2258 | */ | 2258 | */ |
| 2259 | if ((cpunum > NR_CPUS) || !kgdb_info[cpunum].enter_kgdb) | 2259 | if ((cpunum >= CONFIG_NR_CPUS) || !kgdb_info[cpunum].enter_kgdb) |
| 2260 | return KDB_BADCPUNUM; | 2260 | return KDB_BADCPUNUM; |
| 2261 | 2261 | ||
| 2262 | dbg_switch_cpu = cpunum; | 2262 | dbg_switch_cpu = cpunum; |
| @@ -2583,7 +2583,7 @@ static int kdb_summary(int argc, const char **argv) | |||
| 2583 | #define K(x) ((x) << (PAGE_SHIFT - 10)) | 2583 | #define K(x) ((x) << (PAGE_SHIFT - 10)) |
| 2584 | kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" | 2584 | kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" |
| 2585 | "Buffers: %8lu kB\n", | 2585 | "Buffers: %8lu kB\n", |
| 2586 | val.totalram, val.freeram, val.bufferram); | 2586 | K(val.totalram), K(val.freeram), K(val.bufferram)); |
| 2587 | return 0; | 2587 | return 0; |
| 2588 | } | 2588 | } |
| 2589 | 2589 | ||
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index eaacd1693954..75014d7f4568 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h | |||
| @@ -196,7 +196,9 @@ extern int kdb_main_loop(kdb_reason_t, kdb_reason_t, | |||
| 196 | 196 | ||
| 197 | /* Miscellaneous functions and data areas */ | 197 | /* Miscellaneous functions and data areas */ |
| 198 | extern int kdb_grepping_flag; | 198 | extern int kdb_grepping_flag; |
| 199 | #define KDB_GREPPING_FLAG_SEARCH 0x8000 | ||
| 199 | extern char kdb_grep_string[]; | 200 | extern char kdb_grep_string[]; |
| 201 | #define KDB_GREP_STRLEN 256 | ||
| 200 | extern int kdb_grep_leading; | 202 | extern int kdb_grep_leading; |
| 201 | extern int kdb_grep_trailing; | 203 | extern int kdb_grep_trailing; |
| 202 | extern char *kdb_cmds[]; | 204 | extern char *kdb_cmds[]; |
| @@ -209,7 +211,7 @@ extern void kdb_ps1(const struct task_struct *p); | |||
| 209 | extern void kdb_print_nameval(const char *name, unsigned long val); | 211 | extern void kdb_print_nameval(const char *name, unsigned long val); |
| 210 | extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); | 212 | extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); |
| 211 | extern void kdb_meminfo_proc_show(void); | 213 | extern void kdb_meminfo_proc_show(void); |
| 212 | extern char *kdb_getstr(char *, size_t, char *); | 214 | extern char *kdb_getstr(char *, size_t, const char *); |
| 213 | extern void kdb_gdb_state_pass(char *buf); | 215 | extern void kdb_gdb_state_pass(char *buf); |
| 214 | 216 | ||
| 215 | /* Defines for kdb_symbol_print */ | 217 | /* Defines for kdb_symbol_print */ |
diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 103f5d147b2f..2925188f50ea 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | ifdef CONFIG_FUNCTION_TRACER | 1 | ifdef CONFIG_FUNCTION_TRACER |
| 2 | CFLAGS_REMOVE_core.o = -pg | 2 | CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) |
| 3 | endif | 3 | endif |
| 4 | 4 | ||
| 5 | obj-y := core.o ring_buffer.o callchain.o | 5 | obj-y := core.o ring_buffer.o callchain.o |
diff --git a/kernel/events/core.c b/kernel/events/core.c index 19efcf13375a..f04daabfd1cf 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
| @@ -872,22 +872,32 @@ void perf_pmu_enable(struct pmu *pmu) | |||
| 872 | pmu->pmu_enable(pmu); | 872 | pmu->pmu_enable(pmu); |
| 873 | } | 873 | } |
| 874 | 874 | ||
| 875 | static DEFINE_PER_CPU(struct list_head, rotation_list); | 875 | static DEFINE_PER_CPU(struct list_head, active_ctx_list); |
| 876 | 876 | ||
| 877 | /* | 877 | /* |
| 878 | * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized | 878 | * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and |
| 879 | * because they're strictly cpu affine and rotate_start is called with IRQs | 879 | * perf_event_task_tick() are fully serialized because they're strictly cpu |
| 880 | * disabled, while rotate_context is called from IRQ context. | 880 | * affine and perf_event_ctx{activate,deactivate} are called with IRQs |
| 881 | * disabled, while perf_event_task_tick is called from IRQ context. | ||
| 881 | */ | 882 | */ |
| 882 | static void perf_pmu_rotate_start(struct pmu *pmu) | 883 | static void perf_event_ctx_activate(struct perf_event_context *ctx) |
| 883 | { | 884 | { |
| 884 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | 885 | struct list_head *head = this_cpu_ptr(&active_ctx_list); |
| 885 | struct list_head *head = this_cpu_ptr(&rotation_list); | ||
| 886 | 886 | ||
| 887 | WARN_ON(!irqs_disabled()); | 887 | WARN_ON(!irqs_disabled()); |
| 888 | 888 | ||
| 889 | if (list_empty(&cpuctx->rotation_list)) | 889 | WARN_ON(!list_empty(&ctx->active_ctx_list)); |
| 890 | list_add(&cpuctx->rotation_list, head); | 890 | |
| 891 | list_add(&ctx->active_ctx_list, head); | ||
| 892 | } | ||
| 893 | |||
| 894 | static void perf_event_ctx_deactivate(struct perf_event_context *ctx) | ||
| 895 | { | ||
| 896 | WARN_ON(!irqs_disabled()); | ||
| 897 | |||
| 898 | WARN_ON(list_empty(&ctx->active_ctx_list)); | ||
| 899 | |||
| 900 | list_del_init(&ctx->active_ctx_list); | ||
| 891 | } | 901 | } |
| 892 | 902 | ||
| 893 | static void get_ctx(struct perf_event_context *ctx) | 903 | static void get_ctx(struct perf_event_context *ctx) |
| @@ -907,6 +917,84 @@ static void put_ctx(struct perf_event_context *ctx) | |||
| 907 | } | 917 | } |
| 908 | 918 | ||
| 909 | /* | 919 | /* |
| 920 | * Because of perf_event::ctx migration in sys_perf_event_open::move_group and | ||
| 921 | * perf_pmu_migrate_context() we need some magic. | ||
| 922 | * | ||
| 923 | * Those places that change perf_event::ctx will hold both | ||
| 924 | * perf_event_ctx::mutex of the 'old' and 'new' ctx value. | ||
| 925 | * | ||
| 926 | * Lock ordering is by mutex address. There is one other site where | ||
| 927 | * perf_event_context::mutex nests and that is put_event(). But remember that | ||
| 928 | * that is a parent<->child context relation, and migration does not affect | ||
| 929 | * children, therefore these two orderings should not interact. | ||
| 930 | * | ||
| 931 | * The change in perf_event::ctx does not affect children (as claimed above) | ||
| 932 | * because the sys_perf_event_open() case will install a new event and break | ||
| 933 | * the ctx parent<->child relation, and perf_pmu_migrate_context() is only | ||
| 934 | * concerned with cpuctx and that doesn't have children. | ||
| 935 | * | ||
| 936 | * The places that change perf_event::ctx will issue: | ||
| 937 | * | ||
| 938 | * perf_remove_from_context(); | ||
| 939 | * synchronize_rcu(); | ||
| 940 | * perf_install_in_context(); | ||
| 941 | * | ||
| 942 | * to affect the change. The remove_from_context() + synchronize_rcu() should | ||
| 943 | * quiesce the event, after which we can install it in the new location. This | ||
| 944 | * means that only external vectors (perf_fops, prctl) can perturb the event | ||
| 945 | * while in transit. Therefore all such accessors should also acquire | ||
| 946 | * perf_event_context::mutex to serialize against this. | ||
| 947 | * | ||
| 948 | * However; because event->ctx can change while we're waiting to acquire | ||
| 949 | * ctx->mutex we must be careful and use the below perf_event_ctx_lock() | ||
| 950 | * function. | ||
| 951 | * | ||
| 952 | * Lock order: | ||
| 953 | * task_struct::perf_event_mutex | ||
| 954 | * perf_event_context::mutex | ||
| 955 | * perf_event_context::lock | ||
| 956 | * perf_event::child_mutex; | ||
| 957 | * perf_event::mmap_mutex | ||
| 958 | * mmap_sem | ||
| 959 | */ | ||
| 960 | static struct perf_event_context * | ||
| 961 | perf_event_ctx_lock_nested(struct perf_event *event, int nesting) | ||
| 962 | { | ||
| 963 | struct perf_event_context *ctx; | ||
| 964 | |||
| 965 | again: | ||
| 966 | rcu_read_lock(); | ||
| 967 | ctx = ACCESS_ONCE(event->ctx); | ||
| 968 | if (!atomic_inc_not_zero(&ctx->refcount)) { | ||
| 969 | rcu_read_unlock(); | ||
| 970 | goto again; | ||
| 971 | } | ||
| 972 | rcu_read_unlock(); | ||
| 973 | |||
| 974 | mutex_lock_nested(&ctx->mutex, nesting); | ||
| 975 | if (event->ctx != ctx) { | ||
| 976 | mutex_unlock(&ctx->mutex); | ||
| 977 | put_ctx(ctx); | ||
| 978 | goto again; | ||
| 979 | } | ||
| 980 | |||
| 981 | return ctx; | ||
| 982 | } | ||
| 983 | |||
| 984 | static inline struct perf_event_context * | ||
| 985 | perf_event_ctx_lock(struct perf_event *event) | ||
| 986 | { | ||
| 987 | return perf_event_ctx_lock_nested(event, 0); | ||
| 988 | } | ||
| 989 | |||
| 990 | static void perf_event_ctx_unlock(struct perf_event *event, | ||
| 991 | struct perf_event_context *ctx) | ||
| 992 | { | ||
| 993 | mutex_unlock(&ctx->mutex); | ||
| 994 | put_ctx(ctx); | ||
| 995 | } | ||
| 996 | |||
| 997 | /* | ||
| 910 | * This must be done under the ctx->lock, such as to serialize against | 998 | * This must be done under the ctx->lock, such as to serialize against |
| 911 | * context_equiv(), therefore we cannot call put_ctx() since that might end up | 999 | * context_equiv(), therefore we cannot call put_ctx() since that might end up |
| 912 | * calling scheduler related locks and ctx->lock nests inside those. | 1000 | * calling scheduler related locks and ctx->lock nests inside those. |
| @@ -1155,8 +1243,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 1155 | ctx->nr_branch_stack++; | 1243 | ctx->nr_branch_stack++; |
| 1156 | 1244 | ||
| 1157 | list_add_rcu(&event->event_entry, &ctx->event_list); | 1245 | list_add_rcu(&event->event_entry, &ctx->event_list); |
| 1158 | if (!ctx->nr_events) | ||
| 1159 | perf_pmu_rotate_start(ctx->pmu); | ||
| 1160 | ctx->nr_events++; | 1246 | ctx->nr_events++; |
| 1161 | if (event->attr.inherit_stat) | 1247 | if (event->attr.inherit_stat) |
| 1162 | ctx->nr_stat++; | 1248 | ctx->nr_stat++; |
| @@ -1275,6 +1361,8 @@ static void perf_group_attach(struct perf_event *event) | |||
| 1275 | if (group_leader == event) | 1361 | if (group_leader == event) |
| 1276 | return; | 1362 | return; |
| 1277 | 1363 | ||
| 1364 | WARN_ON_ONCE(group_leader->ctx != event->ctx); | ||
| 1365 | |||
| 1278 | if (group_leader->group_flags & PERF_GROUP_SOFTWARE && | 1366 | if (group_leader->group_flags & PERF_GROUP_SOFTWARE && |
| 1279 | !is_software_event(event)) | 1367 | !is_software_event(event)) |
| 1280 | group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; | 1368 | group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; |
| @@ -1296,6 +1384,10 @@ static void | |||
| 1296 | list_del_event(struct perf_event *event, struct perf_event_context *ctx) | 1384 | list_del_event(struct perf_event *event, struct perf_event_context *ctx) |
| 1297 | { | 1385 | { |
| 1298 | struct perf_cpu_context *cpuctx; | 1386 | struct perf_cpu_context *cpuctx; |
| 1387 | |||
| 1388 | WARN_ON_ONCE(event->ctx != ctx); | ||
| 1389 | lockdep_assert_held(&ctx->lock); | ||
| 1390 | |||
| 1299 | /* | 1391 | /* |
| 1300 | * We can have double detach due to exit/hot-unplug + close. | 1392 | * We can have double detach due to exit/hot-unplug + close. |
| 1301 | */ | 1393 | */ |
| @@ -1380,6 +1472,8 @@ static void perf_group_detach(struct perf_event *event) | |||
| 1380 | 1472 | ||
| 1381 | /* Inherit group flags from the previous leader */ | 1473 | /* Inherit group flags from the previous leader */ |
| 1382 | sibling->group_flags = event->group_flags; | 1474 | sibling->group_flags = event->group_flags; |
| 1475 | |||
| 1476 | WARN_ON_ONCE(sibling->ctx != event->ctx); | ||
| 1383 | } | 1477 | } |
| 1384 | 1478 | ||
| 1385 | out: | 1479 | out: |
| @@ -1442,6 +1536,10 @@ event_sched_out(struct perf_event *event, | |||
| 1442 | { | 1536 | { |
| 1443 | u64 tstamp = perf_event_time(event); | 1537 | u64 tstamp = perf_event_time(event); |
| 1444 | u64 delta; | 1538 | u64 delta; |
| 1539 | |||
| 1540 | WARN_ON_ONCE(event->ctx != ctx); | ||
| 1541 | lockdep_assert_held(&ctx->lock); | ||
| 1542 | |||
| 1445 | /* | 1543 | /* |
| 1446 | * An event which could not be activated because of | 1544 | * An event which could not be activated because of |
| 1447 | * filter mismatch still needs to have its timings | 1545 | * filter mismatch still needs to have its timings |
| @@ -1471,7 +1569,8 @@ event_sched_out(struct perf_event *event, | |||
| 1471 | 1569 | ||
| 1472 | if (!is_software_event(event)) | 1570 | if (!is_software_event(event)) |
| 1473 | cpuctx->active_oncpu--; | 1571 | cpuctx->active_oncpu--; |
| 1474 | ctx->nr_active--; | 1572 | if (!--ctx->nr_active) |
| 1573 | perf_event_ctx_deactivate(ctx); | ||
| 1475 | if (event->attr.freq && event->attr.sample_freq) | 1574 | if (event->attr.freq && event->attr.sample_freq) |
| 1476 | ctx->nr_freq--; | 1575 | ctx->nr_freq--; |
| 1477 | if (event->attr.exclusive || !cpuctx->active_oncpu) | 1576 | if (event->attr.exclusive || !cpuctx->active_oncpu) |
| @@ -1654,7 +1753,7 @@ int __perf_event_disable(void *info) | |||
| 1654 | * is the current context on this CPU and preemption is disabled, | 1753 | * is the current context on this CPU and preemption is disabled, |
| 1655 | * hence we can't get into perf_event_task_sched_out for this context. | 1754 | * hence we can't get into perf_event_task_sched_out for this context. |
| 1656 | */ | 1755 | */ |
| 1657 | void perf_event_disable(struct perf_event *event) | 1756 | static void _perf_event_disable(struct perf_event *event) |
| 1658 | { | 1757 | { |
| 1659 | struct perf_event_context *ctx = event->ctx; | 1758 | struct perf_event_context *ctx = event->ctx; |
| 1660 | struct task_struct *task = ctx->task; | 1759 | struct task_struct *task = ctx->task; |
| @@ -1695,6 +1794,19 @@ retry: | |||
| 1695 | } | 1794 | } |
| 1696 | raw_spin_unlock_irq(&ctx->lock); | 1795 | raw_spin_unlock_irq(&ctx->lock); |
| 1697 | } | 1796 | } |
| 1797 | |||
| 1798 | /* | ||
| 1799 | * Strictly speaking kernel users cannot create groups and therefore this | ||
| 1800 | * interface does not need the perf_event_ctx_lock() magic. | ||
| 1801 | */ | ||
| 1802 | void perf_event_disable(struct perf_event *event) | ||
| 1803 | { | ||
| 1804 | struct perf_event_context *ctx; | ||
| 1805 | |||
| 1806 | ctx = perf_event_ctx_lock(event); | ||
| 1807 | _perf_event_disable(event); | ||
| 1808 | perf_event_ctx_unlock(event, ctx); | ||
| 1809 | } | ||
| 1698 | EXPORT_SYMBOL_GPL(perf_event_disable); | 1810 | EXPORT_SYMBOL_GPL(perf_event_disable); |
| 1699 | 1811 | ||
| 1700 | static void perf_set_shadow_time(struct perf_event *event, | 1812 | static void perf_set_shadow_time(struct perf_event *event, |
| @@ -1782,7 +1894,8 @@ event_sched_in(struct perf_event *event, | |||
| 1782 | 1894 | ||
| 1783 | if (!is_software_event(event)) | 1895 | if (!is_software_event(event)) |
| 1784 | cpuctx->active_oncpu++; | 1896 | cpuctx->active_oncpu++; |
| 1785 | ctx->nr_active++; | 1897 | if (!ctx->nr_active++) |
| 1898 | perf_event_ctx_activate(ctx); | ||
| 1786 | if (event->attr.freq && event->attr.sample_freq) | 1899 | if (event->attr.freq && event->attr.sample_freq) |
| 1787 | ctx->nr_freq++; | 1900 | ctx->nr_freq++; |
| 1788 | 1901 | ||
| @@ -2158,7 +2271,7 @@ unlock: | |||
| 2158 | * perf_event_for_each_child or perf_event_for_each as described | 2271 | * perf_event_for_each_child or perf_event_for_each as described |
| 2159 | * for perf_event_disable. | 2272 | * for perf_event_disable. |
| 2160 | */ | 2273 | */ |
| 2161 | void perf_event_enable(struct perf_event *event) | 2274 | static void _perf_event_enable(struct perf_event *event) |
| 2162 | { | 2275 | { |
| 2163 | struct perf_event_context *ctx = event->ctx; | 2276 | struct perf_event_context *ctx = event->ctx; |
| 2164 | struct task_struct *task = ctx->task; | 2277 | struct task_struct *task = ctx->task; |
| @@ -2214,9 +2327,21 @@ retry: | |||
| 2214 | out: | 2327 | out: |
| 2215 | raw_spin_unlock_irq(&ctx->lock); | 2328 | raw_spin_unlock_irq(&ctx->lock); |
| 2216 | } | 2329 | } |
| 2330 | |||
| 2331 | /* | ||
| 2332 | * See perf_event_disable(); | ||
| 2333 | */ | ||
| 2334 | void perf_event_enable(struct perf_event *event) | ||
| 2335 | { | ||
| 2336 | struct perf_event_context *ctx; | ||
| 2337 | |||
| 2338 | ctx = perf_event_ctx_lock(event); | ||
| 2339 | _perf_event_enable(event); | ||
| 2340 | perf_event_ctx_unlock(event, ctx); | ||
| 2341 | } | ||
| 2217 | EXPORT_SYMBOL_GPL(perf_event_enable); | 2342 | EXPORT_SYMBOL_GPL(perf_event_enable); |
| 2218 | 2343 | ||
| 2219 | int perf_event_refresh(struct perf_event *event, int refresh) | 2344 | static int _perf_event_refresh(struct perf_event *event, int refresh) |
| 2220 | { | 2345 | { |
| 2221 | /* | 2346 | /* |
| 2222 | * not supported on inherited events | 2347 | * not supported on inherited events |
| @@ -2225,10 +2350,25 @@ int perf_event_refresh(struct perf_event *event, int refresh) | |||
| 2225 | return -EINVAL; | 2350 | return -EINVAL; |
| 2226 | 2351 | ||
| 2227 | atomic_add(refresh, &event->event_limit); | 2352 | atomic_add(refresh, &event->event_limit); |
| 2228 | perf_event_enable(event); | 2353 | _perf_event_enable(event); |
| 2229 | 2354 | ||
| 2230 | return 0; | 2355 | return 0; |
| 2231 | } | 2356 | } |
| 2357 | |||
| 2358 | /* | ||
| 2359 | * See perf_event_disable() | ||
| 2360 | */ | ||
| 2361 | int perf_event_refresh(struct perf_event *event, int refresh) | ||
| 2362 | { | ||
| 2363 | struct perf_event_context *ctx; | ||
| 2364 | int ret; | ||
| 2365 | |||
| 2366 | ctx = perf_event_ctx_lock(event); | ||
| 2367 | ret = _perf_event_refresh(event, refresh); | ||
| 2368 | perf_event_ctx_unlock(event, ctx); | ||
| 2369 | |||
| 2370 | return ret; | ||
| 2371 | } | ||
| 2232 | EXPORT_SYMBOL_GPL(perf_event_refresh); | 2372 | EXPORT_SYMBOL_GPL(perf_event_refresh); |
| 2233 | 2373 | ||
| 2234 | static void ctx_sched_out(struct perf_event_context *ctx, | 2374 | static void ctx_sched_out(struct perf_event_context *ctx, |
| @@ -2612,12 +2752,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, | |||
| 2612 | 2752 | ||
| 2613 | perf_pmu_enable(ctx->pmu); | 2753 | perf_pmu_enable(ctx->pmu); |
| 2614 | perf_ctx_unlock(cpuctx, ctx); | 2754 | perf_ctx_unlock(cpuctx, ctx); |
| 2615 | |||
| 2616 | /* | ||
| 2617 | * Since these rotations are per-cpu, we need to ensure the | ||
| 2618 | * cpu-context we got scheduled on is actually rotating. | ||
| 2619 | */ | ||
| 2620 | perf_pmu_rotate_start(ctx->pmu); | ||
| 2621 | } | 2755 | } |
| 2622 | 2756 | ||
| 2623 | /* | 2757 | /* |
| @@ -2905,25 +3039,18 @@ static void rotate_ctx(struct perf_event_context *ctx) | |||
| 2905 | list_rotate_left(&ctx->flexible_groups); | 3039 | list_rotate_left(&ctx->flexible_groups); |
| 2906 | } | 3040 | } |
| 2907 | 3041 | ||
| 2908 | /* | ||
| 2909 | * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized | ||
| 2910 | * because they're strictly cpu affine and rotate_start is called with IRQs | ||
| 2911 | * disabled, while rotate_context is called from IRQ context. | ||
| 2912 | */ | ||
| 2913 | static int perf_rotate_context(struct perf_cpu_context *cpuctx) | 3042 | static int perf_rotate_context(struct perf_cpu_context *cpuctx) |
| 2914 | { | 3043 | { |
| 2915 | struct perf_event_context *ctx = NULL; | 3044 | struct perf_event_context *ctx = NULL; |
| 2916 | int rotate = 0, remove = 1; | 3045 | int rotate = 0; |
| 2917 | 3046 | ||
| 2918 | if (cpuctx->ctx.nr_events) { | 3047 | if (cpuctx->ctx.nr_events) { |
| 2919 | remove = 0; | ||
| 2920 | if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) | 3048 | if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) |
| 2921 | rotate = 1; | 3049 | rotate = 1; |
| 2922 | } | 3050 | } |
| 2923 | 3051 | ||
| 2924 | ctx = cpuctx->task_ctx; | 3052 | ctx = cpuctx->task_ctx; |
| 2925 | if (ctx && ctx->nr_events) { | 3053 | if (ctx && ctx->nr_events) { |
| 2926 | remove = 0; | ||
| 2927 | if (ctx->nr_events != ctx->nr_active) | 3054 | if (ctx->nr_events != ctx->nr_active) |
| 2928 | rotate = 1; | 3055 | rotate = 1; |
| 2929 | } | 3056 | } |
| @@ -2947,8 +3074,6 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
| 2947 | perf_pmu_enable(cpuctx->ctx.pmu); | 3074 | perf_pmu_enable(cpuctx->ctx.pmu); |
| 2948 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | 3075 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); |
| 2949 | done: | 3076 | done: |
| 2950 | if (remove) | ||
| 2951 | list_del_init(&cpuctx->rotation_list); | ||
| 2952 | 3077 | ||
| 2953 | return rotate; | 3078 | return rotate; |
| 2954 | } | 3079 | } |
| @@ -2966,9 +3091,8 @@ bool perf_event_can_stop_tick(void) | |||
| 2966 | 3091 | ||
| 2967 | void perf_event_task_tick(void) | 3092 | void perf_event_task_tick(void) |
| 2968 | { | 3093 | { |
| 2969 | struct list_head *head = this_cpu_ptr(&rotation_list); | 3094 | struct list_head *head = this_cpu_ptr(&active_ctx_list); |
| 2970 | struct perf_cpu_context *cpuctx, *tmp; | 3095 | struct perf_event_context *ctx, *tmp; |
| 2971 | struct perf_event_context *ctx; | ||
| 2972 | int throttled; | 3096 | int throttled; |
| 2973 | 3097 | ||
| 2974 | WARN_ON(!irqs_disabled()); | 3098 | WARN_ON(!irqs_disabled()); |
| @@ -2976,14 +3100,8 @@ void perf_event_task_tick(void) | |||
| 2976 | __this_cpu_inc(perf_throttled_seq); | 3100 | __this_cpu_inc(perf_throttled_seq); |
| 2977 | throttled = __this_cpu_xchg(perf_throttled_count, 0); | 3101 | throttled = __this_cpu_xchg(perf_throttled_count, 0); |
| 2978 | 3102 | ||
| 2979 | list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { | 3103 | list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) |
| 2980 | ctx = &cpuctx->ctx; | ||
| 2981 | perf_adjust_freq_unthr_context(ctx, throttled); | 3104 | perf_adjust_freq_unthr_context(ctx, throttled); |
| 2982 | |||
| 2983 | ctx = cpuctx->task_ctx; | ||
| 2984 | if (ctx) | ||
| 2985 | perf_adjust_freq_unthr_context(ctx, throttled); | ||
| 2986 | } | ||
| 2987 | } | 3105 | } |
| 2988 | 3106 | ||
| 2989 | static int event_enable_on_exec(struct perf_event *event, | 3107 | static int event_enable_on_exec(struct perf_event *event, |
| @@ -3142,6 +3260,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx) | |||
| 3142 | { | 3260 | { |
| 3143 | raw_spin_lock_init(&ctx->lock); | 3261 | raw_spin_lock_init(&ctx->lock); |
| 3144 | mutex_init(&ctx->mutex); | 3262 | mutex_init(&ctx->mutex); |
| 3263 | INIT_LIST_HEAD(&ctx->active_ctx_list); | ||
| 3145 | INIT_LIST_HEAD(&ctx->pinned_groups); | 3264 | INIT_LIST_HEAD(&ctx->pinned_groups); |
| 3146 | INIT_LIST_HEAD(&ctx->flexible_groups); | 3265 | INIT_LIST_HEAD(&ctx->flexible_groups); |
| 3147 | INIT_LIST_HEAD(&ctx->event_list); | 3266 | INIT_LIST_HEAD(&ctx->event_list); |
| @@ -3421,7 +3540,16 @@ static void perf_remove_from_owner(struct perf_event *event) | |||
| 3421 | rcu_read_unlock(); | 3540 | rcu_read_unlock(); |
| 3422 | 3541 | ||
| 3423 | if (owner) { | 3542 | if (owner) { |
| 3424 | mutex_lock(&owner->perf_event_mutex); | 3543 | /* |
| 3544 | * If we're here through perf_event_exit_task() we're already | ||
| 3545 | * holding ctx->mutex which would be an inversion wrt. the | ||
| 3546 | * normal lock order. | ||
| 3547 | * | ||
| 3548 | * However we can safely take this lock because its the child | ||
| 3549 | * ctx->mutex. | ||
| 3550 | */ | ||
| 3551 | mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING); | ||
| 3552 | |||
| 3425 | /* | 3553 | /* |
| 3426 | * We have to re-check the event->owner field, if it is cleared | 3554 | * We have to re-check the event->owner field, if it is cleared |
| 3427 | * we raced with perf_event_exit_task(), acquiring the mutex | 3555 | * we raced with perf_event_exit_task(), acquiring the mutex |
| @@ -3440,7 +3568,7 @@ static void perf_remove_from_owner(struct perf_event *event) | |||
| 3440 | */ | 3568 | */ |
| 3441 | static void put_event(struct perf_event *event) | 3569 | static void put_event(struct perf_event *event) |
| 3442 | { | 3570 | { |
| 3443 | struct perf_event_context *ctx = event->ctx; | 3571 | struct perf_event_context *ctx; |
| 3444 | 3572 | ||
| 3445 | if (!atomic_long_dec_and_test(&event->refcount)) | 3573 | if (!atomic_long_dec_and_test(&event->refcount)) |
| 3446 | return; | 3574 | return; |
| @@ -3448,7 +3576,6 @@ static void put_event(struct perf_event *event) | |||
| 3448 | if (!is_kernel_event(event)) | 3576 | if (!is_kernel_event(event)) |
| 3449 | perf_remove_from_owner(event); | 3577 | perf_remove_from_owner(event); |
| 3450 | 3578 | ||
| 3451 | WARN_ON_ONCE(ctx->parent_ctx); | ||
| 3452 | /* | 3579 | /* |
| 3453 | * There are two ways this annotation is useful: | 3580 | * There are two ways this annotation is useful: |
| 3454 | * | 3581 | * |
| @@ -3461,7 +3588,8 @@ static void put_event(struct perf_event *event) | |||
| 3461 | * the last filedesc died, so there is no possibility | 3588 | * the last filedesc died, so there is no possibility |
| 3462 | * to trigger the AB-BA case. | 3589 | * to trigger the AB-BA case. |
| 3463 | */ | 3590 | */ |
| 3464 | mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); | 3591 | ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); |
| 3592 | WARN_ON_ONCE(ctx->parent_ctx); | ||
| 3465 | perf_remove_from_context(event, true); | 3593 | perf_remove_from_context(event, true); |
| 3466 | mutex_unlock(&ctx->mutex); | 3594 | mutex_unlock(&ctx->mutex); |
| 3467 | 3595 | ||
| @@ -3547,12 +3675,13 @@ static int perf_event_read_group(struct perf_event *event, | |||
| 3547 | u64 read_format, char __user *buf) | 3675 | u64 read_format, char __user *buf) |
| 3548 | { | 3676 | { |
| 3549 | struct perf_event *leader = event->group_leader, *sub; | 3677 | struct perf_event *leader = event->group_leader, *sub; |
| 3550 | int n = 0, size = 0, ret = -EFAULT; | ||
| 3551 | struct perf_event_context *ctx = leader->ctx; | 3678 | struct perf_event_context *ctx = leader->ctx; |
| 3552 | u64 values[5]; | 3679 | int n = 0, size = 0, ret; |
| 3553 | u64 count, enabled, running; | 3680 | u64 count, enabled, running; |
| 3681 | u64 values[5]; | ||
| 3682 | |||
| 3683 | lockdep_assert_held(&ctx->mutex); | ||
| 3554 | 3684 | ||
| 3555 | mutex_lock(&ctx->mutex); | ||
| 3556 | count = perf_event_read_value(leader, &enabled, &running); | 3685 | count = perf_event_read_value(leader, &enabled, &running); |
| 3557 | 3686 | ||
| 3558 | values[n++] = 1 + leader->nr_siblings; | 3687 | values[n++] = 1 + leader->nr_siblings; |
| @@ -3567,7 +3696,7 @@ static int perf_event_read_group(struct perf_event *event, | |||
| 3567 | size = n * sizeof(u64); | 3696 | size = n * sizeof(u64); |
| 3568 | 3697 | ||
| 3569 | if (copy_to_user(buf, values, size)) | 3698 | if (copy_to_user(buf, values, size)) |
| 3570 | goto unlock; | 3699 | return -EFAULT; |
| 3571 | 3700 | ||
| 3572 | ret = size; | 3701 | ret = size; |
| 3573 | 3702 | ||
| @@ -3581,14 +3710,11 @@ static int perf_event_read_group(struct perf_event *event, | |||
| 3581 | size = n * sizeof(u64); | 3710 | size = n * sizeof(u64); |
| 3582 | 3711 | ||
| 3583 | if (copy_to_user(buf + ret, values, size)) { | 3712 | if (copy_to_user(buf + ret, values, size)) { |
| 3584 | ret = -EFAULT; | 3713 | return -EFAULT; |
| 3585 | goto unlock; | ||
| 3586 | } | 3714 | } |
| 3587 | 3715 | ||
| 3588 | ret += size; | 3716 | ret += size; |
| 3589 | } | 3717 | } |
| 3590 | unlock: | ||
| 3591 | mutex_unlock(&ctx->mutex); | ||
| 3592 | 3718 | ||
| 3593 | return ret; | 3719 | return ret; |
| 3594 | } | 3720 | } |
| @@ -3660,8 +3786,14 @@ static ssize_t | |||
| 3660 | perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) | 3786 | perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) |
| 3661 | { | 3787 | { |
| 3662 | struct perf_event *event = file->private_data; | 3788 | struct perf_event *event = file->private_data; |
| 3789 | struct perf_event_context *ctx; | ||
| 3790 | int ret; | ||
| 3791 | |||
| 3792 | ctx = perf_event_ctx_lock(event); | ||
| 3793 | ret = perf_read_hw(event, buf, count); | ||
| 3794 | perf_event_ctx_unlock(event, ctx); | ||
| 3663 | 3795 | ||
| 3664 | return perf_read_hw(event, buf, count); | 3796 | return ret; |
| 3665 | } | 3797 | } |
| 3666 | 3798 | ||
| 3667 | static unsigned int perf_poll(struct file *file, poll_table *wait) | 3799 | static unsigned int perf_poll(struct file *file, poll_table *wait) |
| @@ -3687,7 +3819,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) | |||
| 3687 | return events; | 3819 | return events; |
| 3688 | } | 3820 | } |
| 3689 | 3821 | ||
| 3690 | static void perf_event_reset(struct perf_event *event) | 3822 | static void _perf_event_reset(struct perf_event *event) |
| 3691 | { | 3823 | { |
| 3692 | (void)perf_event_read(event); | 3824 | (void)perf_event_read(event); |
| 3693 | local64_set(&event->count, 0); | 3825 | local64_set(&event->count, 0); |
| @@ -3706,6 +3838,7 @@ static void perf_event_for_each_child(struct perf_event *event, | |||
| 3706 | struct perf_event *child; | 3838 | struct perf_event *child; |
| 3707 | 3839 | ||
| 3708 | WARN_ON_ONCE(event->ctx->parent_ctx); | 3840 | WARN_ON_ONCE(event->ctx->parent_ctx); |
| 3841 | |||
| 3709 | mutex_lock(&event->child_mutex); | 3842 | mutex_lock(&event->child_mutex); |
| 3710 | func(event); | 3843 | func(event); |
| 3711 | list_for_each_entry(child, &event->child_list, child_list) | 3844 | list_for_each_entry(child, &event->child_list, child_list) |
| @@ -3719,14 +3852,13 @@ static void perf_event_for_each(struct perf_event *event, | |||
| 3719 | struct perf_event_context *ctx = event->ctx; | 3852 | struct perf_event_context *ctx = event->ctx; |
| 3720 | struct perf_event *sibling; | 3853 | struct perf_event *sibling; |
| 3721 | 3854 | ||
| 3722 | WARN_ON_ONCE(ctx->parent_ctx); | 3855 | lockdep_assert_held(&ctx->mutex); |
| 3723 | mutex_lock(&ctx->mutex); | 3856 | |
| 3724 | event = event->group_leader; | 3857 | event = event->group_leader; |
| 3725 | 3858 | ||
| 3726 | perf_event_for_each_child(event, func); | 3859 | perf_event_for_each_child(event, func); |
| 3727 | list_for_each_entry(sibling, &event->sibling_list, group_entry) | 3860 | list_for_each_entry(sibling, &event->sibling_list, group_entry) |
| 3728 | perf_event_for_each_child(sibling, func); | 3861 | perf_event_for_each_child(sibling, func); |
| 3729 | mutex_unlock(&ctx->mutex); | ||
| 3730 | } | 3862 | } |
| 3731 | 3863 | ||
| 3732 | static int perf_event_period(struct perf_event *event, u64 __user *arg) | 3864 | static int perf_event_period(struct perf_event *event, u64 __user *arg) |
| @@ -3796,25 +3928,24 @@ static int perf_event_set_output(struct perf_event *event, | |||
| 3796 | struct perf_event *output_event); | 3928 | struct perf_event *output_event); |
| 3797 | static int perf_event_set_filter(struct perf_event *event, void __user *arg); | 3929 | static int perf_event_set_filter(struct perf_event *event, void __user *arg); |
| 3798 | 3930 | ||
| 3799 | static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | 3931 | static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) |
| 3800 | { | 3932 | { |
| 3801 | struct perf_event *event = file->private_data; | ||
| 3802 | void (*func)(struct perf_event *); | 3933 | void (*func)(struct perf_event *); |
| 3803 | u32 flags = arg; | 3934 | u32 flags = arg; |
| 3804 | 3935 | ||
| 3805 | switch (cmd) { | 3936 | switch (cmd) { |
| 3806 | case PERF_EVENT_IOC_ENABLE: | 3937 | case PERF_EVENT_IOC_ENABLE: |
| 3807 | func = perf_event_enable; | 3938 | func = _perf_event_enable; |
| 3808 | break; | 3939 | break; |
| 3809 | case PERF_EVENT_IOC_DISABLE: | 3940 | case PERF_EVENT_IOC_DISABLE: |
| 3810 | func = perf_event_disable; | 3941 | func = _perf_event_disable; |
| 3811 | break; | 3942 | break; |
| 3812 | case PERF_EVENT_IOC_RESET: | 3943 | case PERF_EVENT_IOC_RESET: |
| 3813 | func = perf_event_reset; | 3944 | func = _perf_event_reset; |
| 3814 | break; | 3945 | break; |
| 3815 | 3946 | ||
| 3816 | case PERF_EVENT_IOC_REFRESH: | 3947 | case PERF_EVENT_IOC_REFRESH: |
| 3817 | return perf_event_refresh(event, arg); | 3948 | return _perf_event_refresh(event, arg); |
| 3818 | 3949 | ||
| 3819 | case PERF_EVENT_IOC_PERIOD: | 3950 | case PERF_EVENT_IOC_PERIOD: |
| 3820 | return perf_event_period(event, (u64 __user *)arg); | 3951 | return perf_event_period(event, (u64 __user *)arg); |
| @@ -3861,6 +3992,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
| 3861 | return 0; | 3992 | return 0; |
| 3862 | } | 3993 | } |
| 3863 | 3994 | ||
| 3995 | static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | ||
| 3996 | { | ||
| 3997 | struct perf_event *event = file->private_data; | ||
| 3998 | struct perf_event_context *ctx; | ||
| 3999 | long ret; | ||
| 4000 | |||
| 4001 | ctx = perf_event_ctx_lock(event); | ||
| 4002 | ret = _perf_ioctl(event, cmd, arg); | ||
| 4003 | perf_event_ctx_unlock(event, ctx); | ||
| 4004 | |||
| 4005 | return ret; | ||
| 4006 | } | ||
| 4007 | |||
| 3864 | #ifdef CONFIG_COMPAT | 4008 | #ifdef CONFIG_COMPAT |
| 3865 | static long perf_compat_ioctl(struct file *file, unsigned int cmd, | 4009 | static long perf_compat_ioctl(struct file *file, unsigned int cmd, |
| 3866 | unsigned long arg) | 4010 | unsigned long arg) |
| @@ -3883,11 +4027,15 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd, | |||
| 3883 | 4027 | ||
| 3884 | int perf_event_task_enable(void) | 4028 | int perf_event_task_enable(void) |
| 3885 | { | 4029 | { |
| 4030 | struct perf_event_context *ctx; | ||
| 3886 | struct perf_event *event; | 4031 | struct perf_event *event; |
| 3887 | 4032 | ||
| 3888 | mutex_lock(¤t->perf_event_mutex); | 4033 | mutex_lock(¤t->perf_event_mutex); |
| 3889 | list_for_each_entry(event, ¤t->perf_event_list, owner_entry) | 4034 | list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { |
| 3890 | perf_event_for_each_child(event, perf_event_enable); | 4035 | ctx = perf_event_ctx_lock(event); |
| 4036 | perf_event_for_each_child(event, _perf_event_enable); | ||
| 4037 | perf_event_ctx_unlock(event, ctx); | ||
| 4038 | } | ||
| 3891 | mutex_unlock(¤t->perf_event_mutex); | 4039 | mutex_unlock(¤t->perf_event_mutex); |
| 3892 | 4040 | ||
| 3893 | return 0; | 4041 | return 0; |
| @@ -3895,11 +4043,15 @@ int perf_event_task_enable(void) | |||
| 3895 | 4043 | ||
| 3896 | int perf_event_task_disable(void) | 4044 | int perf_event_task_disable(void) |
| 3897 | { | 4045 | { |
| 4046 | struct perf_event_context *ctx; | ||
| 3898 | struct perf_event *event; | 4047 | struct perf_event *event; |
| 3899 | 4048 | ||
| 3900 | mutex_lock(¤t->perf_event_mutex); | 4049 | mutex_lock(¤t->perf_event_mutex); |
| 3901 | list_for_each_entry(event, ¤t->perf_event_list, owner_entry) | 4050 | list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { |
| 3902 | perf_event_for_each_child(event, perf_event_disable); | 4051 | ctx = perf_event_ctx_lock(event); |
| 4052 | perf_event_for_each_child(event, _perf_event_disable); | ||
| 4053 | perf_event_ctx_unlock(event, ctx); | ||
| 4054 | } | ||
| 3903 | mutex_unlock(¤t->perf_event_mutex); | 4055 | mutex_unlock(¤t->perf_event_mutex); |
| 3904 | 4056 | ||
| 3905 | return 0; | 4057 | return 0; |
| @@ -3949,7 +4101,8 @@ unlock: | |||
| 3949 | rcu_read_unlock(); | 4101 | rcu_read_unlock(); |
| 3950 | } | 4102 | } |
| 3951 | 4103 | ||
| 3952 | void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) | 4104 | void __weak arch_perf_update_userpage( |
| 4105 | struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) | ||
| 3953 | { | 4106 | { |
| 3954 | } | 4107 | } |
| 3955 | 4108 | ||
| @@ -3999,7 +4152,7 @@ void perf_event_update_userpage(struct perf_event *event) | |||
| 3999 | userpg->time_running = running + | 4152 | userpg->time_running = running + |
| 4000 | atomic64_read(&event->child_total_time_running); | 4153 | atomic64_read(&event->child_total_time_running); |
| 4001 | 4154 | ||
| 4002 | arch_perf_update_userpage(userpg, now); | 4155 | arch_perf_update_userpage(event, userpg, now); |
| 4003 | 4156 | ||
| 4004 | barrier(); | 4157 | barrier(); |
| 4005 | ++userpg->lock; | 4158 | ++userpg->lock; |
| @@ -4141,6 +4294,9 @@ static void perf_mmap_open(struct vm_area_struct *vma) | |||
| 4141 | 4294 | ||
| 4142 | atomic_inc(&event->mmap_count); | 4295 | atomic_inc(&event->mmap_count); |
| 4143 | atomic_inc(&event->rb->mmap_count); | 4296 | atomic_inc(&event->rb->mmap_count); |
| 4297 | |||
| 4298 | if (event->pmu->event_mapped) | ||
| 4299 | event->pmu->event_mapped(event); | ||
| 4144 | } | 4300 | } |
| 4145 | 4301 | ||
| 4146 | /* | 4302 | /* |
| @@ -4160,6 +4316,9 @@ static void perf_mmap_close(struct vm_area_struct *vma) | |||
| 4160 | int mmap_locked = rb->mmap_locked; | 4316 | int mmap_locked = rb->mmap_locked; |
| 4161 | unsigned long size = perf_data_size(rb); | 4317 | unsigned long size = perf_data_size(rb); |
| 4162 | 4318 | ||
| 4319 | if (event->pmu->event_unmapped) | ||
| 4320 | event->pmu->event_unmapped(event); | ||
| 4321 | |||
| 4163 | atomic_dec(&rb->mmap_count); | 4322 | atomic_dec(&rb->mmap_count); |
| 4164 | 4323 | ||
| 4165 | if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) | 4324 | if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) |
| @@ -4361,6 +4520,9 @@ unlock: | |||
| 4361 | vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; | 4520 | vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; |
| 4362 | vma->vm_ops = &perf_mmap_vmops; | 4521 | vma->vm_ops = &perf_mmap_vmops; |
| 4363 | 4522 | ||
| 4523 | if (event->pmu->event_mapped) | ||
| 4524 | event->pmu->event_mapped(event); | ||
| 4525 | |||
| 4364 | return ret; | 4526 | return ret; |
| 4365 | } | 4527 | } |
| 4366 | 4528 | ||
| @@ -5889,6 +6051,8 @@ end: | |||
| 5889 | rcu_read_unlock(); | 6051 | rcu_read_unlock(); |
| 5890 | } | 6052 | } |
| 5891 | 6053 | ||
| 6054 | DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); | ||
| 6055 | |||
| 5892 | int perf_swevent_get_recursion_context(void) | 6056 | int perf_swevent_get_recursion_context(void) |
| 5893 | { | 6057 | { |
| 5894 | struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); | 6058 | struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); |
| @@ -5904,21 +6068,30 @@ inline void perf_swevent_put_recursion_context(int rctx) | |||
| 5904 | put_recursion_context(swhash->recursion, rctx); | 6068 | put_recursion_context(swhash->recursion, rctx); |
| 5905 | } | 6069 | } |
| 5906 | 6070 | ||
| 5907 | void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) | 6071 | void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) |
| 5908 | { | 6072 | { |
| 5909 | struct perf_sample_data data; | 6073 | struct perf_sample_data data; |
| 5910 | int rctx; | ||
| 5911 | 6074 | ||
| 5912 | preempt_disable_notrace(); | 6075 | if (WARN_ON_ONCE(!regs)) |
| 5913 | rctx = perf_swevent_get_recursion_context(); | ||
| 5914 | if (rctx < 0) | ||
| 5915 | return; | 6076 | return; |
| 5916 | 6077 | ||
| 5917 | perf_sample_data_init(&data, addr, 0); | 6078 | perf_sample_data_init(&data, addr, 0); |
| 5918 | |||
| 5919 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); | 6079 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); |
| 6080 | } | ||
| 6081 | |||
| 6082 | void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) | ||
| 6083 | { | ||
| 6084 | int rctx; | ||
| 6085 | |||
| 6086 | preempt_disable_notrace(); | ||
| 6087 | rctx = perf_swevent_get_recursion_context(); | ||
| 6088 | if (unlikely(rctx < 0)) | ||
| 6089 | goto fail; | ||
| 6090 | |||
| 6091 | ___perf_sw_event(event_id, nr, regs, addr); | ||
| 5920 | 6092 | ||
| 5921 | perf_swevent_put_recursion_context(rctx); | 6093 | perf_swevent_put_recursion_context(rctx); |
| 6094 | fail: | ||
| 5922 | preempt_enable_notrace(); | 6095 | preempt_enable_notrace(); |
| 5923 | } | 6096 | } |
| 5924 | 6097 | ||
| @@ -6780,7 +6953,6 @@ skip_type: | |||
| 6780 | 6953 | ||
| 6781 | __perf_cpu_hrtimer_init(cpuctx, cpu); | 6954 | __perf_cpu_hrtimer_init(cpuctx, cpu); |
| 6782 | 6955 | ||
| 6783 | INIT_LIST_HEAD(&cpuctx->rotation_list); | ||
| 6784 | cpuctx->unique_pmu = pmu; | 6956 | cpuctx->unique_pmu = pmu; |
| 6785 | } | 6957 | } |
| 6786 | 6958 | ||
| @@ -6853,6 +7025,20 @@ void perf_pmu_unregister(struct pmu *pmu) | |||
| 6853 | } | 7025 | } |
| 6854 | EXPORT_SYMBOL_GPL(perf_pmu_unregister); | 7026 | EXPORT_SYMBOL_GPL(perf_pmu_unregister); |
| 6855 | 7027 | ||
| 7028 | static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) | ||
| 7029 | { | ||
| 7030 | int ret; | ||
| 7031 | |||
| 7032 | if (!try_module_get(pmu->module)) | ||
| 7033 | return -ENODEV; | ||
| 7034 | event->pmu = pmu; | ||
| 7035 | ret = pmu->event_init(event); | ||
| 7036 | if (ret) | ||
| 7037 | module_put(pmu->module); | ||
| 7038 | |||
| 7039 | return ret; | ||
| 7040 | } | ||
| 7041 | |||
| 6856 | struct pmu *perf_init_event(struct perf_event *event) | 7042 | struct pmu *perf_init_event(struct perf_event *event) |
| 6857 | { | 7043 | { |
| 6858 | struct pmu *pmu = NULL; | 7044 | struct pmu *pmu = NULL; |
| @@ -6865,24 +7051,14 @@ struct pmu *perf_init_event(struct perf_event *event) | |||
| 6865 | pmu = idr_find(&pmu_idr, event->attr.type); | 7051 | pmu = idr_find(&pmu_idr, event->attr.type); |
| 6866 | rcu_read_unlock(); | 7052 | rcu_read_unlock(); |
| 6867 | if (pmu) { | 7053 | if (pmu) { |
| 6868 | if (!try_module_get(pmu->module)) { | 7054 | ret = perf_try_init_event(pmu, event); |
| 6869 | pmu = ERR_PTR(-ENODEV); | ||
| 6870 | goto unlock; | ||
| 6871 | } | ||
| 6872 | event->pmu = pmu; | ||
| 6873 | ret = pmu->event_init(event); | ||
| 6874 | if (ret) | 7055 | if (ret) |
| 6875 | pmu = ERR_PTR(ret); | 7056 | pmu = ERR_PTR(ret); |
| 6876 | goto unlock; | 7057 | goto unlock; |
| 6877 | } | 7058 | } |
| 6878 | 7059 | ||
| 6879 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 7060 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
| 6880 | if (!try_module_get(pmu->module)) { | 7061 | ret = perf_try_init_event(pmu, event); |
| 6881 | pmu = ERR_PTR(-ENODEV); | ||
| 6882 | goto unlock; | ||
| 6883 | } | ||
| 6884 | event->pmu = pmu; | ||
| 6885 | ret = pmu->event_init(event); | ||
| 6886 | if (!ret) | 7062 | if (!ret) |
| 6887 | goto unlock; | 7063 | goto unlock; |
| 6888 | 7064 | ||
| @@ -7246,6 +7422,15 @@ out: | |||
| 7246 | return ret; | 7422 | return ret; |
| 7247 | } | 7423 | } |
| 7248 | 7424 | ||
| 7425 | static void mutex_lock_double(struct mutex *a, struct mutex *b) | ||
| 7426 | { | ||
| 7427 | if (b < a) | ||
| 7428 | swap(a, b); | ||
| 7429 | |||
| 7430 | mutex_lock(a); | ||
| 7431 | mutex_lock_nested(b, SINGLE_DEPTH_NESTING); | ||
| 7432 | } | ||
| 7433 | |||
| 7249 | /** | 7434 | /** |
| 7250 | * sys_perf_event_open - open a performance event, associate it to a task/cpu | 7435 | * sys_perf_event_open - open a performance event, associate it to a task/cpu |
| 7251 | * | 7436 | * |
| @@ -7261,7 +7446,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7261 | struct perf_event *group_leader = NULL, *output_event = NULL; | 7446 | struct perf_event *group_leader = NULL, *output_event = NULL; |
| 7262 | struct perf_event *event, *sibling; | 7447 | struct perf_event *event, *sibling; |
| 7263 | struct perf_event_attr attr; | 7448 | struct perf_event_attr attr; |
| 7264 | struct perf_event_context *ctx; | 7449 | struct perf_event_context *ctx, *uninitialized_var(gctx); |
| 7265 | struct file *event_file = NULL; | 7450 | struct file *event_file = NULL; |
| 7266 | struct fd group = {NULL, 0}; | 7451 | struct fd group = {NULL, 0}; |
| 7267 | struct task_struct *task = NULL; | 7452 | struct task_struct *task = NULL; |
| @@ -7459,43 +7644,68 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7459 | } | 7644 | } |
| 7460 | 7645 | ||
| 7461 | if (move_group) { | 7646 | if (move_group) { |
| 7462 | struct perf_event_context *gctx = group_leader->ctx; | 7647 | gctx = group_leader->ctx; |
| 7463 | |||
| 7464 | mutex_lock(&gctx->mutex); | ||
| 7465 | perf_remove_from_context(group_leader, false); | ||
| 7466 | 7648 | ||
| 7467 | /* | 7649 | /* |
| 7468 | * Removing from the context ends up with disabled | 7650 | * See perf_event_ctx_lock() for comments on the details |
| 7469 | * event. What we want here is event in the initial | 7651 | * of swizzling perf_event::ctx. |
| 7470 | * startup state, ready to be add into new context. | ||
| 7471 | */ | 7652 | */ |
| 7472 | perf_event__state_init(group_leader); | 7653 | mutex_lock_double(&gctx->mutex, &ctx->mutex); |
| 7654 | |||
| 7655 | perf_remove_from_context(group_leader, false); | ||
| 7656 | |||
| 7473 | list_for_each_entry(sibling, &group_leader->sibling_list, | 7657 | list_for_each_entry(sibling, &group_leader->sibling_list, |
| 7474 | group_entry) { | 7658 | group_entry) { |
| 7475 | perf_remove_from_context(sibling, false); | 7659 | perf_remove_from_context(sibling, false); |
| 7476 | perf_event__state_init(sibling); | ||
| 7477 | put_ctx(gctx); | 7660 | put_ctx(gctx); |
| 7478 | } | 7661 | } |
| 7479 | mutex_unlock(&gctx->mutex); | 7662 | } else { |
| 7480 | put_ctx(gctx); | 7663 | mutex_lock(&ctx->mutex); |
| 7481 | } | 7664 | } |
| 7482 | 7665 | ||
| 7483 | WARN_ON_ONCE(ctx->parent_ctx); | 7666 | WARN_ON_ONCE(ctx->parent_ctx); |
| 7484 | mutex_lock(&ctx->mutex); | ||
| 7485 | 7667 | ||
| 7486 | if (move_group) { | 7668 | if (move_group) { |
| 7669 | /* | ||
| 7670 | * Wait for everybody to stop referencing the events through | ||
| 7671 | * the old lists, before installing it on new lists. | ||
| 7672 | */ | ||
| 7487 | synchronize_rcu(); | 7673 | synchronize_rcu(); |
| 7488 | perf_install_in_context(ctx, group_leader, group_leader->cpu); | 7674 | |
| 7489 | get_ctx(ctx); | 7675 | /* |
| 7676 | * Install the group siblings before the group leader. | ||
| 7677 | * | ||
| 7678 | * Because a group leader will try and install the entire group | ||
| 7679 | * (through the sibling list, which is still in-tact), we can | ||
| 7680 | * end up with siblings installed in the wrong context. | ||
| 7681 | * | ||
| 7682 | * By installing siblings first we NO-OP because they're not | ||
| 7683 | * reachable through the group lists. | ||
| 7684 | */ | ||
| 7490 | list_for_each_entry(sibling, &group_leader->sibling_list, | 7685 | list_for_each_entry(sibling, &group_leader->sibling_list, |
| 7491 | group_entry) { | 7686 | group_entry) { |
| 7687 | perf_event__state_init(sibling); | ||
| 7492 | perf_install_in_context(ctx, sibling, sibling->cpu); | 7688 | perf_install_in_context(ctx, sibling, sibling->cpu); |
| 7493 | get_ctx(ctx); | 7689 | get_ctx(ctx); |
| 7494 | } | 7690 | } |
| 7691 | |||
| 7692 | /* | ||
| 7693 | * Removing from the context ends up with disabled | ||
| 7694 | * event. What we want here is event in the initial | ||
| 7695 | * startup state, ready to be add into new context. | ||
| 7696 | */ | ||
| 7697 | perf_event__state_init(group_leader); | ||
| 7698 | perf_install_in_context(ctx, group_leader, group_leader->cpu); | ||
| 7699 | get_ctx(ctx); | ||
| 7495 | } | 7700 | } |
| 7496 | 7701 | ||
| 7497 | perf_install_in_context(ctx, event, event->cpu); | 7702 | perf_install_in_context(ctx, event, event->cpu); |
| 7498 | perf_unpin_context(ctx); | 7703 | perf_unpin_context(ctx); |
| 7704 | |||
| 7705 | if (move_group) { | ||
| 7706 | mutex_unlock(&gctx->mutex); | ||
| 7707 | put_ctx(gctx); | ||
| 7708 | } | ||
| 7499 | mutex_unlock(&ctx->mutex); | 7709 | mutex_unlock(&ctx->mutex); |
| 7500 | 7710 | ||
| 7501 | put_online_cpus(); | 7711 | put_online_cpus(); |
| @@ -7603,7 +7813,11 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) | |||
| 7603 | src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; | 7813 | src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; |
| 7604 | dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; | 7814 | dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; |
| 7605 | 7815 | ||
| 7606 | mutex_lock(&src_ctx->mutex); | 7816 | /* |
| 7817 | * See perf_event_ctx_lock() for comments on the details | ||
| 7818 | * of swizzling perf_event::ctx. | ||
| 7819 | */ | ||
| 7820 | mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); | ||
| 7607 | list_for_each_entry_safe(event, tmp, &src_ctx->event_list, | 7821 | list_for_each_entry_safe(event, tmp, &src_ctx->event_list, |
| 7608 | event_entry) { | 7822 | event_entry) { |
| 7609 | perf_remove_from_context(event, false); | 7823 | perf_remove_from_context(event, false); |
| @@ -7611,11 +7825,36 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) | |||
| 7611 | put_ctx(src_ctx); | 7825 | put_ctx(src_ctx); |
| 7612 | list_add(&event->migrate_entry, &events); | 7826 | list_add(&event->migrate_entry, &events); |
| 7613 | } | 7827 | } |
| 7614 | mutex_unlock(&src_ctx->mutex); | ||
| 7615 | 7828 | ||
| 7829 | /* | ||
| 7830 | * Wait for the events to quiesce before re-instating them. | ||
| 7831 | */ | ||
| 7616 | synchronize_rcu(); | 7832 | synchronize_rcu(); |
| 7617 | 7833 | ||
| 7618 | mutex_lock(&dst_ctx->mutex); | 7834 | /* |
| 7835 | * Re-instate events in 2 passes. | ||
| 7836 | * | ||
| 7837 | * Skip over group leaders and only install siblings on this first | ||
| 7838 | * pass, siblings will not get enabled without a leader, however a | ||
| 7839 | * leader will enable its siblings, even if those are still on the old | ||
| 7840 | * context. | ||
| 7841 | */ | ||
| 7842 | list_for_each_entry_safe(event, tmp, &events, migrate_entry) { | ||
| 7843 | if (event->group_leader == event) | ||
| 7844 | continue; | ||
| 7845 | |||
| 7846 | list_del(&event->migrate_entry); | ||
| 7847 | if (event->state >= PERF_EVENT_STATE_OFF) | ||
| 7848 | event->state = PERF_EVENT_STATE_INACTIVE; | ||
| 7849 | account_event_cpu(event, dst_cpu); | ||
| 7850 | perf_install_in_context(dst_ctx, event, dst_cpu); | ||
| 7851 | get_ctx(dst_ctx); | ||
| 7852 | } | ||
| 7853 | |||
| 7854 | /* | ||
| 7855 | * Once all the siblings are setup properly, install the group leaders | ||
| 7856 | * to make it go. | ||
| 7857 | */ | ||
| 7619 | list_for_each_entry_safe(event, tmp, &events, migrate_entry) { | 7858 | list_for_each_entry_safe(event, tmp, &events, migrate_entry) { |
| 7620 | list_del(&event->migrate_entry); | 7859 | list_del(&event->migrate_entry); |
| 7621 | if (event->state >= PERF_EVENT_STATE_OFF) | 7860 | if (event->state >= PERF_EVENT_STATE_OFF) |
| @@ -7625,6 +7864,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) | |||
| 7625 | get_ctx(dst_ctx); | 7864 | get_ctx(dst_ctx); |
| 7626 | } | 7865 | } |
| 7627 | mutex_unlock(&dst_ctx->mutex); | 7866 | mutex_unlock(&dst_ctx->mutex); |
| 7867 | mutex_unlock(&src_ctx->mutex); | ||
| 7628 | } | 7868 | } |
| 7629 | EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); | 7869 | EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); |
| 7630 | 7870 | ||
| @@ -7811,14 +8051,19 @@ static void perf_free_event(struct perf_event *event, | |||
| 7811 | 8051 | ||
| 7812 | put_event(parent); | 8052 | put_event(parent); |
| 7813 | 8053 | ||
| 8054 | raw_spin_lock_irq(&ctx->lock); | ||
| 7814 | perf_group_detach(event); | 8055 | perf_group_detach(event); |
| 7815 | list_del_event(event, ctx); | 8056 | list_del_event(event, ctx); |
| 8057 | raw_spin_unlock_irq(&ctx->lock); | ||
| 7816 | free_event(event); | 8058 | free_event(event); |
| 7817 | } | 8059 | } |
| 7818 | 8060 | ||
| 7819 | /* | 8061 | /* |
| 7820 | * free an unexposed, unused context as created by inheritance by | 8062 | * Free an unexposed, unused context as created by inheritance by |
| 7821 | * perf_event_init_task below, used by fork() in case of fail. | 8063 | * perf_event_init_task below, used by fork() in case of fail. |
| 8064 | * | ||
| 8065 | * Not all locks are strictly required, but take them anyway to be nice and | ||
| 8066 | * help out with the lockdep assertions. | ||
| 7822 | */ | 8067 | */ |
| 7823 | void perf_event_free_task(struct task_struct *task) | 8068 | void perf_event_free_task(struct task_struct *task) |
| 7824 | { | 8069 | { |
| @@ -8137,7 +8382,7 @@ static void __init perf_event_init_all_cpus(void) | |||
| 8137 | for_each_possible_cpu(cpu) { | 8382 | for_each_possible_cpu(cpu) { |
| 8138 | swhash = &per_cpu(swevent_htable, cpu); | 8383 | swhash = &per_cpu(swevent_htable, cpu); |
| 8139 | mutex_init(&swhash->hlist_mutex); | 8384 | mutex_init(&swhash->hlist_mutex); |
| 8140 | INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); | 8385 | INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); |
| 8141 | } | 8386 | } |
| 8142 | } | 8387 | } |
| 8143 | 8388 | ||
| @@ -8158,22 +8403,11 @@ static void perf_event_init_cpu(int cpu) | |||
| 8158 | } | 8403 | } |
| 8159 | 8404 | ||
| 8160 | #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC | 8405 | #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC |
| 8161 | static void perf_pmu_rotate_stop(struct pmu *pmu) | ||
| 8162 | { | ||
| 8163 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
| 8164 | |||
| 8165 | WARN_ON(!irqs_disabled()); | ||
| 8166 | |||
| 8167 | list_del_init(&cpuctx->rotation_list); | ||
| 8168 | } | ||
| 8169 | |||
| 8170 | static void __perf_event_exit_context(void *__info) | 8406 | static void __perf_event_exit_context(void *__info) |
| 8171 | { | 8407 | { |
| 8172 | struct remove_event re = { .detach_group = true }; | 8408 | struct remove_event re = { .detach_group = true }; |
| 8173 | struct perf_event_context *ctx = __info; | 8409 | struct perf_event_context *ctx = __info; |
| 8174 | 8410 | ||
| 8175 | perf_pmu_rotate_stop(ctx->pmu); | ||
| 8176 | |||
| 8177 | rcu_read_lock(); | 8411 | rcu_read_lock(); |
| 8178 | list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) | 8412 | list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) |
| 8179 | __perf_remove_from_context(&re); | 8413 | __perf_remove_from_context(&re); |
| @@ -8284,6 +8518,18 @@ void __init perf_event_init(void) | |||
| 8284 | != 1024); | 8518 | != 1024); |
| 8285 | } | 8519 | } |
| 8286 | 8520 | ||
| 8521 | ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, | ||
| 8522 | char *page) | ||
| 8523 | { | ||
| 8524 | struct perf_pmu_events_attr *pmu_attr = | ||
| 8525 | container_of(attr, struct perf_pmu_events_attr, attr); | ||
| 8526 | |||
| 8527 | if (pmu_attr->event_str) | ||
| 8528 | return sprintf(page, "%s\n", pmu_attr->event_str); | ||
| 8529 | |||
| 8530 | return 0; | ||
| 8531 | } | ||
| 8532 | |||
| 8287 | static int __init perf_event_sysfs_init(void) | 8533 | static int __init perf_event_sysfs_init(void) |
| 8288 | { | 8534 | { |
| 8289 | struct pmu *pmu; | 8535 | struct pmu *pmu; |
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 146a5792b1d2..eadb95ce7aac 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
| @@ -13,12 +13,13 @@ | |||
| 13 | #include <linux/vmalloc.h> | 13 | #include <linux/vmalloc.h> |
| 14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
| 15 | #include <linux/circ_buf.h> | 15 | #include <linux/circ_buf.h> |
| 16 | #include <linux/poll.h> | ||
| 16 | 17 | ||
| 17 | #include "internal.h" | 18 | #include "internal.h" |
| 18 | 19 | ||
| 19 | static void perf_output_wakeup(struct perf_output_handle *handle) | 20 | static void perf_output_wakeup(struct perf_output_handle *handle) |
| 20 | { | 21 | { |
| 21 | atomic_set(&handle->rb->poll, POLL_IN); | 22 | atomic_set(&handle->rb->poll, POLLIN); |
| 22 | 23 | ||
| 23 | handle->event->pending_wakeup = 1; | 24 | handle->event->pending_wakeup = 1; |
| 24 | irq_work_queue(&handle->event->pending); | 25 | irq_work_queue(&handle->event->pending); |
diff --git a/kernel/exit.c b/kernel/exit.c index 6806c55475ee..feff10bbb307 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -435,7 +435,8 @@ static void exit_mm(struct task_struct *tsk) | |||
| 435 | task_unlock(tsk); | 435 | task_unlock(tsk); |
| 436 | mm_update_next_owner(mm); | 436 | mm_update_next_owner(mm); |
| 437 | mmput(mm); | 437 | mmput(mm); |
| 438 | clear_thread_flag(TIF_MEMDIE); | 438 | if (test_thread_flag(TIF_MEMDIE)) |
| 439 | unmark_oom_victim(); | ||
| 439 | } | 440 | } |
| 440 | 441 | ||
| 441 | static struct task_struct *find_alive_thread(struct task_struct *p) | 442 | static struct task_struct *find_alive_thread(struct task_struct *p) |
diff --git a/kernel/fork.c b/kernel/fork.c index 4dc2ddade9f1..cf65139615a0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -438,12 +438,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 438 | atomic_inc(&mapping->i_mmap_writable); | 438 | atomic_inc(&mapping->i_mmap_writable); |
| 439 | flush_dcache_mmap_lock(mapping); | 439 | flush_dcache_mmap_lock(mapping); |
| 440 | /* insert tmp into the share list, just after mpnt */ | 440 | /* insert tmp into the share list, just after mpnt */ |
| 441 | if (unlikely(tmp->vm_flags & VM_NONLINEAR)) | 441 | vma_interval_tree_insert_after(tmp, mpnt, |
| 442 | vma_nonlinear_insert(tmp, | 442 | &mapping->i_mmap); |
| 443 | &mapping->i_mmap_nonlinear); | ||
| 444 | else | ||
| 445 | vma_interval_tree_insert_after(tmp, mpnt, | ||
| 446 | &mapping->i_mmap); | ||
| 447 | flush_dcache_mmap_unlock(mapping); | 443 | flush_dcache_mmap_unlock(mapping); |
| 448 | i_mmap_unlock_write(mapping); | 444 | i_mmap_unlock_write(mapping); |
| 449 | } | 445 | } |
| @@ -559,6 +555,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) | |||
| 559 | INIT_LIST_HEAD(&mm->mmlist); | 555 | INIT_LIST_HEAD(&mm->mmlist); |
| 560 | mm->core_state = NULL; | 556 | mm->core_state = NULL; |
| 561 | atomic_long_set(&mm->nr_ptes, 0); | 557 | atomic_long_set(&mm->nr_ptes, 0); |
| 558 | mm_nr_pmds_init(mm); | ||
| 562 | mm->map_count = 0; | 559 | mm->map_count = 0; |
| 563 | mm->locked_vm = 0; | 560 | mm->locked_vm = 0; |
| 564 | mm->pinned_vm = 0; | 561 | mm->pinned_vm = 0; |
| @@ -607,6 +604,14 @@ static void check_mm(struct mm_struct *mm) | |||
| 607 | printk(KERN_ALERT "BUG: Bad rss-counter state " | 604 | printk(KERN_ALERT "BUG: Bad rss-counter state " |
| 608 | "mm:%p idx:%d val:%ld\n", mm, i, x); | 605 | "mm:%p idx:%d val:%ld\n", mm, i, x); |
| 609 | } | 606 | } |
| 607 | |||
| 608 | if (atomic_long_read(&mm->nr_ptes)) | ||
| 609 | pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", | ||
| 610 | atomic_long_read(&mm->nr_ptes)); | ||
| 611 | if (mm_nr_pmds(mm)) | ||
| 612 | pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", | ||
| 613 | mm_nr_pmds(mm)); | ||
| 614 | |||
| 610 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS | 615 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS |
| 611 | VM_BUG_ON_MM(mm->pmd_huge_pte, mm); | 616 | VM_BUG_ON_MM(mm->pmd_huge_pte, mm); |
| 612 | #endif | 617 | #endif |
diff --git a/kernel/futex.c b/kernel/futex.c index 63678b573d61..2a5e3830e953 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -2217,7 +2217,7 @@ retry: | |||
| 2217 | if (!abs_time) | 2217 | if (!abs_time) |
| 2218 | goto out; | 2218 | goto out; |
| 2219 | 2219 | ||
| 2220 | restart = ¤t_thread_info()->restart_block; | 2220 | restart = ¤t->restart_block; |
| 2221 | restart->fn = futex_wait_restart; | 2221 | restart->fn = futex_wait_restart; |
| 2222 | restart->futex.uaddr = uaddr; | 2222 | restart->futex.uaddr = uaddr; |
| 2223 | restart->futex.val = val; | 2223 | restart->futex.val = val; |
| @@ -2258,7 +2258,7 @@ static long futex_wait_restart(struct restart_block *restart) | |||
| 2258 | * if there are waiters then it will block, it does PI, etc. (Due to | 2258 | * if there are waiters then it will block, it does PI, etc. (Due to |
| 2259 | * races the kernel might see a 0 value of the futex too.) | 2259 | * races the kernel might see a 0 value of the futex too.) |
| 2260 | */ | 2260 | */ |
| 2261 | static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, | 2261 | static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, |
| 2262 | ktime_t *time, int trylock) | 2262 | ktime_t *time, int trylock) |
| 2263 | { | 2263 | { |
| 2264 | struct hrtimer_sleeper timeout, *to = NULL; | 2264 | struct hrtimer_sleeper timeout, *to = NULL; |
| @@ -2953,11 +2953,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | |||
| 2953 | case FUTEX_WAKE_OP: | 2953 | case FUTEX_WAKE_OP: |
| 2954 | return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); | 2954 | return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); |
| 2955 | case FUTEX_LOCK_PI: | 2955 | case FUTEX_LOCK_PI: |
| 2956 | return futex_lock_pi(uaddr, flags, val, timeout, 0); | 2956 | return futex_lock_pi(uaddr, flags, timeout, 0); |
| 2957 | case FUTEX_UNLOCK_PI: | 2957 | case FUTEX_UNLOCK_PI: |
| 2958 | return futex_unlock_pi(uaddr, flags); | 2958 | return futex_unlock_pi(uaddr, flags); |
| 2959 | case FUTEX_TRYLOCK_PI: | 2959 | case FUTEX_TRYLOCK_PI: |
| 2960 | return futex_lock_pi(uaddr, flags, 0, timeout, 1); | 2960 | return futex_lock_pi(uaddr, flags, NULL, 1); |
| 2961 | case FUTEX_WAIT_REQUEUE_PI: | 2961 | case FUTEX_WAIT_REQUEUE_PI: |
| 2962 | val3 = FUTEX_BITSET_MATCH_ANY; | 2962 | val3 = FUTEX_BITSET_MATCH_ANY; |
| 2963 | return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, | 2963 | return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, |
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index 52aa7e8de927..752d6486b67e 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile | |||
| @@ -1,33 +1,7 @@ | |||
| 1 | ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' | 1 | ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' |
| 2 | 2 | ||
| 3 | # if-lt | 3 | obj-y := base.o fs.o |
| 4 | # Usage VAR := $(call if-lt, $(a), $(b)) | 4 | obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o |
| 5 | # Returns 1 if (a < b) | 5 | obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o |
| 6 | if-lt = $(shell [ $(1) -lt $(2) ] && echo 1) | 6 | obj-$(CONFIG_GCOV_FORMAT_AUTODETECT) += $(call cc-ifversion, -lt, 0407, \ |
| 7 | 7 | gcc_3_4.o, gcc_4_7.o) | |
| 8 | ifeq ($(CONFIG_GCOV_FORMAT_3_4),y) | ||
| 9 | cc-ver := 0304 | ||
| 10 | else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y) | ||
| 11 | cc-ver := 0407 | ||
| 12 | else | ||
| 13 | # Use cc-version if available, otherwise set 0 | ||
| 14 | # | ||
| 15 | # scripts/Kbuild.include, which contains cc-version function, is not included | ||
| 16 | # during make clean "make -f scripts/Makefile.clean obj=kernel/gcov" | ||
| 17 | # Meaning cc-ver is empty causing if-lt test to fail with | ||
| 18 | # "/bin/sh: line 0: [: -lt: unary operator expected" error mesage. | ||
| 19 | # This has no affect on the clean phase, but the error message could be | ||
| 20 | # confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version | ||
| 21 | # is not available. We can probably move if-lt to Kbuild.include, so it's also | ||
| 22 | # not defined during clean or to include Kbuild.include in | ||
| 23 | # scripts/Makefile.clean. But the following workaround seems least invasive. | ||
| 24 | cc-ver := $(if $(call cc-version),$(call cc-version),0) | ||
| 25 | endif | ||
| 26 | |||
| 27 | obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o | ||
| 28 | |||
| 29 | ifeq ($(call if-lt, $(cc-ver), 0407),1) | ||
| 30 | obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o | ||
| 31 | else | ||
| 32 | obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o | ||
| 33 | endif | ||
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 80692373abd6..196a06fbc122 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -243,6 +243,9 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) | |||
| 243 | return -EINVAL; | 243 | return -EINVAL; |
| 244 | desc->affinity_hint = m; | 244 | desc->affinity_hint = m; |
| 245 | irq_put_desc_unlock(desc, flags); | 245 | irq_put_desc_unlock(desc, flags); |
| 246 | /* set the initial affinity to prevent every interrupt being on CPU0 */ | ||
| 247 | if (m) | ||
| 248 | __irq_set_affinity(irq, m, false); | ||
| 246 | return 0; | 249 | return 0; |
| 247 | } | 250 | } |
| 248 | EXPORT_SYMBOL_GPL(irq_set_affinity_hint); | 251 | EXPORT_SYMBOL_GPL(irq_set_affinity_hint); |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 9dc9bfd8a678..df2f4642d1e7 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
| @@ -46,10 +46,9 @@ static int show_irq_affinity(int type, struct seq_file *m, void *v) | |||
| 46 | mask = desc->pending_mask; | 46 | mask = desc->pending_mask; |
| 47 | #endif | 47 | #endif |
| 48 | if (type) | 48 | if (type) |
| 49 | seq_cpumask_list(m, mask); | 49 | seq_printf(m, "%*pbl\n", cpumask_pr_args(mask)); |
| 50 | else | 50 | else |
| 51 | seq_cpumask(m, mask); | 51 | seq_printf(m, "%*pb\n", cpumask_pr_args(mask)); |
| 52 | seq_putc(m, '\n'); | ||
| 53 | return 0; | 52 | return 0; |
| 54 | } | 53 | } |
| 55 | 54 | ||
| @@ -67,8 +66,7 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) | |||
| 67 | cpumask_copy(mask, desc->affinity_hint); | 66 | cpumask_copy(mask, desc->affinity_hint); |
| 68 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 67 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 69 | 68 | ||
| 70 | seq_cpumask(m, mask); | 69 | seq_printf(m, "%*pb\n", cpumask_pr_args(mask)); |
| 71 | seq_putc(m, '\n'); | ||
| 72 | free_cpumask_var(mask); | 70 | free_cpumask_var(mask); |
| 73 | 71 | ||
| 74 | return 0; | 72 | return 0; |
| @@ -186,8 +184,7 @@ static const struct file_operations irq_affinity_list_proc_fops = { | |||
| 186 | 184 | ||
| 187 | static int default_affinity_show(struct seq_file *m, void *v) | 185 | static int default_affinity_show(struct seq_file *m, void *v) |
| 188 | { | 186 | { |
| 189 | seq_cpumask(m, irq_default_affinity); | 187 | seq_printf(m, "%*pb\n", cpumask_pr_args(irq_default_affinity)); |
| 190 | seq_putc(m, '\n'); | ||
| 191 | return 0; | 188 | return 0; |
| 192 | } | 189 | } |
| 193 | 190 | ||
diff --git a/kernel/kexec.c b/kernel/kexec.c index 9a8a01abbaed..38c25b1f2fd5 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -444,7 +444,7 @@ arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, | |||
| 444 | } | 444 | } |
| 445 | 445 | ||
| 446 | /* | 446 | /* |
| 447 | * Free up memory used by kernel, initrd, and comand line. This is temporary | 447 | * Free up memory used by kernel, initrd, and command line. This is temporary |
| 448 | * memory allocation which is not needed any more after these buffers have | 448 | * memory allocation which is not needed any more after these buffers have |
| 449 | * been loaded into separate segments and have been copied elsewhere. | 449 | * been loaded into separate segments and have been copied elsewhere. |
| 450 | */ | 450 | */ |
| @@ -856,8 +856,6 @@ static int kimage_set_destination(struct kimage *image, | |||
| 856 | 856 | ||
| 857 | destination &= PAGE_MASK; | 857 | destination &= PAGE_MASK; |
| 858 | result = kimage_add_entry(image, destination | IND_DESTINATION); | 858 | result = kimage_add_entry(image, destination | IND_DESTINATION); |
| 859 | if (result == 0) | ||
| 860 | image->destination = destination; | ||
| 861 | 859 | ||
| 862 | return result; | 860 | return result; |
| 863 | } | 861 | } |
| @@ -869,8 +867,6 @@ static int kimage_add_page(struct kimage *image, unsigned long page) | |||
| 869 | 867 | ||
| 870 | page &= PAGE_MASK; | 868 | page &= PAGE_MASK; |
| 871 | result = kimage_add_entry(image, page | IND_SOURCE); | 869 | result = kimage_add_entry(image, page | IND_SOURCE); |
| 872 | if (result == 0) | ||
| 873 | image->destination += PAGE_SIZE; | ||
| 874 | 870 | ||
| 875 | return result; | 871 | return result; |
| 876 | } | 872 | } |
| @@ -1288,19 +1284,22 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, | |||
| 1288 | if (nr_segments > 0) { | 1284 | if (nr_segments > 0) { |
| 1289 | unsigned long i; | 1285 | unsigned long i; |
| 1290 | 1286 | ||
| 1291 | /* Loading another kernel to reboot into */ | 1287 | if (flags & KEXEC_ON_CRASH) { |
| 1292 | if ((flags & KEXEC_ON_CRASH) == 0) | 1288 | /* |
| 1293 | result = kimage_alloc_init(&image, entry, nr_segments, | 1289 | * Loading another kernel to switch to if this one |
| 1294 | segments, flags); | 1290 | * crashes. Free any current crash dump kernel before |
| 1295 | /* Loading another kernel to switch to if this one crashes */ | ||
| 1296 | else if (flags & KEXEC_ON_CRASH) { | ||
| 1297 | /* Free any current crash dump kernel before | ||
| 1298 | * we corrupt it. | 1291 | * we corrupt it. |
| 1299 | */ | 1292 | */ |
| 1293 | |||
| 1300 | kimage_free(xchg(&kexec_crash_image, NULL)); | 1294 | kimage_free(xchg(&kexec_crash_image, NULL)); |
| 1301 | result = kimage_alloc_init(&image, entry, nr_segments, | 1295 | result = kimage_alloc_init(&image, entry, nr_segments, |
| 1302 | segments, flags); | 1296 | segments, flags); |
| 1303 | crash_map_reserved_pages(); | 1297 | crash_map_reserved_pages(); |
| 1298 | } else { | ||
| 1299 | /* Loading another kernel to reboot into. */ | ||
| 1300 | |||
| 1301 | result = kimage_alloc_init(&image, entry, nr_segments, | ||
| 1302 | segments, flags); | ||
| 1304 | } | 1303 | } |
| 1305 | if (result) | 1304 | if (result) |
| 1306 | goto out; | 1305 | goto out; |
| @@ -2512,7 +2511,7 @@ static int kexec_apply_relocations(struct kimage *image) | |||
| 2512 | continue; | 2511 | continue; |
| 2513 | 2512 | ||
| 2514 | /* | 2513 | /* |
| 2515 | * Respective archicture needs to provide support for applying | 2514 | * Respective architecture needs to provide support for applying |
| 2516 | * relocations of type SHT_RELA/SHT_REL. | 2515 | * relocations of type SHT_RELA/SHT_REL. |
| 2517 | */ | 2516 | */ |
| 2518 | if (sechdrs[i].sh_type == SHT_RELA) | 2517 | if (sechdrs[i].sh_type == SHT_RELA) |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ee619929cf90..c90e417bb963 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -717,7 +717,7 @@ static void prepare_optimized_kprobe(struct kprobe *p) | |||
| 717 | struct optimized_kprobe *op; | 717 | struct optimized_kprobe *op; |
| 718 | 718 | ||
| 719 | op = container_of(p, struct optimized_kprobe, kp); | 719 | op = container_of(p, struct optimized_kprobe, kp); |
| 720 | arch_prepare_optimized_kprobe(op); | 720 | arch_prepare_optimized_kprobe(op, p); |
| 721 | } | 721 | } |
| 722 | 722 | ||
| 723 | /* Allocate new optimized_kprobe and try to prepare optimized instructions */ | 723 | /* Allocate new optimized_kprobe and try to prepare optimized instructions */ |
| @@ -731,7 +731,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | |||
| 731 | 731 | ||
| 732 | INIT_LIST_HEAD(&op->list); | 732 | INIT_LIST_HEAD(&op->list); |
| 733 | op->kp.addr = p->addr; | 733 | op->kp.addr = p->addr; |
| 734 | arch_prepare_optimized_kprobe(op); | 734 | arch_prepare_optimized_kprobe(op, p); |
| 735 | 735 | ||
| 736 | return &op->kp; | 736 | return &op->kp; |
| 737 | } | 737 | } |
| @@ -869,7 +869,8 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt) | |||
| 869 | { | 869 | { |
| 870 | struct kprobe *_p; | 870 | struct kprobe *_p; |
| 871 | 871 | ||
| 872 | unoptimize_kprobe(p, false); /* Try to unoptimize */ | 872 | /* Try to unoptimize */ |
| 873 | unoptimize_kprobe(p, kprobes_all_disarmed); | ||
| 873 | 874 | ||
| 874 | if (!kprobe_queued(p)) { | 875 | if (!kprobe_queued(p)) { |
| 875 | arch_disarm_kprobe(p); | 876 | arch_disarm_kprobe(p); |
| @@ -1571,7 +1572,13 @@ static struct kprobe *__disable_kprobe(struct kprobe *p) | |||
| 1571 | 1572 | ||
| 1572 | /* Try to disarm and disable this/parent probe */ | 1573 | /* Try to disarm and disable this/parent probe */ |
| 1573 | if (p == orig_p || aggr_kprobe_disabled(orig_p)) { | 1574 | if (p == orig_p || aggr_kprobe_disabled(orig_p)) { |
| 1574 | disarm_kprobe(orig_p, true); | 1575 | /* |
| 1576 | * If kprobes_all_disarmed is set, orig_p | ||
| 1577 | * should have already been disarmed, so | ||
| 1578 | * skip unneed disarming process. | ||
| 1579 | */ | ||
| 1580 | if (!kprobes_all_disarmed) | ||
| 1581 | disarm_kprobe(orig_p, true); | ||
| 1575 | orig_p->flags |= KPROBE_FLAG_DISABLED; | 1582 | orig_p->flags |= KPROBE_FLAG_DISABLED; |
| 1576 | } | 1583 | } |
| 1577 | } | 1584 | } |
| @@ -2320,6 +2327,12 @@ static void arm_all_kprobes(void) | |||
| 2320 | if (!kprobes_all_disarmed) | 2327 | if (!kprobes_all_disarmed) |
| 2321 | goto already_enabled; | 2328 | goto already_enabled; |
| 2322 | 2329 | ||
| 2330 | /* | ||
| 2331 | * optimize_kprobe() called by arm_kprobe() checks | ||
| 2332 | * kprobes_all_disarmed, so set kprobes_all_disarmed before | ||
| 2333 | * arm_kprobe. | ||
| 2334 | */ | ||
| 2335 | kprobes_all_disarmed = false; | ||
| 2323 | /* Arming kprobes doesn't optimize kprobe itself */ | 2336 | /* Arming kprobes doesn't optimize kprobe itself */ |
| 2324 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 2337 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
| 2325 | head = &kprobe_table[i]; | 2338 | head = &kprobe_table[i]; |
| @@ -2328,7 +2341,6 @@ static void arm_all_kprobes(void) | |||
| 2328 | arm_kprobe(p); | 2341 | arm_kprobe(p); |
| 2329 | } | 2342 | } |
| 2330 | 2343 | ||
| 2331 | kprobes_all_disarmed = false; | ||
| 2332 | printk(KERN_INFO "Kprobes globally enabled\n"); | 2344 | printk(KERN_INFO "Kprobes globally enabled\n"); |
| 2333 | 2345 | ||
| 2334 | already_enabled: | 2346 | already_enabled: |
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig new file mode 100644 index 000000000000..045022557936 --- /dev/null +++ b/kernel/livepatch/Kconfig | |||
| @@ -0,0 +1,18 @@ | |||
| 1 | config HAVE_LIVEPATCH | ||
| 2 | bool | ||
| 3 | help | ||
| 4 | Arch supports kernel live patching | ||
| 5 | |||
| 6 | config LIVEPATCH | ||
| 7 | bool "Kernel Live Patching" | ||
| 8 | depends on DYNAMIC_FTRACE_WITH_REGS | ||
| 9 | depends on MODULES | ||
| 10 | depends on SYSFS | ||
| 11 | depends on KALLSYMS_ALL | ||
| 12 | depends on HAVE_LIVEPATCH | ||
| 13 | help | ||
| 14 | Say Y here if you want to support kernel live patching. | ||
| 15 | This option has no runtime impact until a kernel "patch" | ||
| 16 | module uses the interface provided by this option to register | ||
| 17 | a patch, causing calls to patched functions to be redirected | ||
| 18 | to new function code contained in the patch module. | ||
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile new file mode 100644 index 000000000000..e8780c0901d9 --- /dev/null +++ b/kernel/livepatch/Makefile | |||
| @@ -0,0 +1,3 @@ | |||
| 1 | obj-$(CONFIG_LIVEPATCH) += livepatch.o | ||
| 2 | |||
| 3 | livepatch-objs := core.o | ||
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c new file mode 100644 index 000000000000..ff7f47d026ac --- /dev/null +++ b/kernel/livepatch/core.c | |||
| @@ -0,0 +1,1015 @@ | |||
| 1 | /* | ||
| 2 | * core.c - Kernel Live Patching Core | ||
| 3 | * | ||
| 4 | * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> | ||
| 5 | * Copyright (C) 2014 SUSE | ||
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or | ||
| 8 | * modify it under the terms of the GNU General Public License | ||
| 9 | * as published by the Free Software Foundation; either version 2 | ||
| 10 | * of the License, or (at your option) any later version. | ||
| 11 | * | ||
| 12 | * This program is distributed in the hope that it will be useful, | ||
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 15 | * GNU General Public License for more details. | ||
| 16 | * | ||
| 17 | * You should have received a copy of the GNU General Public License | ||
| 18 | * along with this program; if not, see <http://www.gnu.org/licenses/>. | ||
| 19 | */ | ||
| 20 | |||
| 21 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
| 22 | |||
| 23 | #include <linux/module.h> | ||
| 24 | #include <linux/kernel.h> | ||
| 25 | #include <linux/mutex.h> | ||
| 26 | #include <linux/slab.h> | ||
| 27 | #include <linux/ftrace.h> | ||
| 28 | #include <linux/list.h> | ||
| 29 | #include <linux/kallsyms.h> | ||
| 30 | #include <linux/livepatch.h> | ||
| 31 | |||
| 32 | /** | ||
| 33 | * struct klp_ops - structure for tracking registered ftrace ops structs | ||
| 34 | * | ||
| 35 | * A single ftrace_ops is shared between all enabled replacement functions | ||
| 36 | * (klp_func structs) which have the same old_addr. This allows the switch | ||
| 37 | * between function versions to happen instantaneously by updating the klp_ops | ||
| 38 | * struct's func_stack list. The winner is the klp_func at the top of the | ||
| 39 | * func_stack (front of the list). | ||
| 40 | * | ||
| 41 | * @node: node for the global klp_ops list | ||
| 42 | * @func_stack: list head for the stack of klp_func's (active func is on top) | ||
| 43 | * @fops: registered ftrace ops struct | ||
| 44 | */ | ||
| 45 | struct klp_ops { | ||
| 46 | struct list_head node; | ||
| 47 | struct list_head func_stack; | ||
| 48 | struct ftrace_ops fops; | ||
| 49 | }; | ||
| 50 | |||
| 51 | /* | ||
| 52 | * The klp_mutex protects the global lists and state transitions of any | ||
| 53 | * structure reachable from them. References to any structure must be obtained | ||
| 54 | * under mutex protection (except in klp_ftrace_handler(), which uses RCU to | ||
| 55 | * ensure it gets consistent data). | ||
| 56 | */ | ||
| 57 | static DEFINE_MUTEX(klp_mutex); | ||
| 58 | |||
| 59 | static LIST_HEAD(klp_patches); | ||
| 60 | static LIST_HEAD(klp_ops); | ||
| 61 | |||
| 62 | static struct kobject *klp_root_kobj; | ||
| 63 | |||
| 64 | static struct klp_ops *klp_find_ops(unsigned long old_addr) | ||
| 65 | { | ||
| 66 | struct klp_ops *ops; | ||
| 67 | struct klp_func *func; | ||
| 68 | |||
| 69 | list_for_each_entry(ops, &klp_ops, node) { | ||
| 70 | func = list_first_entry(&ops->func_stack, struct klp_func, | ||
| 71 | stack_node); | ||
| 72 | if (func->old_addr == old_addr) | ||
| 73 | return ops; | ||
| 74 | } | ||
| 75 | |||
| 76 | return NULL; | ||
| 77 | } | ||
| 78 | |||
| 79 | static bool klp_is_module(struct klp_object *obj) | ||
| 80 | { | ||
| 81 | return obj->name; | ||
| 82 | } | ||
| 83 | |||
| 84 | static bool klp_is_object_loaded(struct klp_object *obj) | ||
| 85 | { | ||
| 86 | return !obj->name || obj->mod; | ||
| 87 | } | ||
| 88 | |||
| 89 | /* sets obj->mod if object is not vmlinux and module is found */ | ||
| 90 | static void klp_find_object_module(struct klp_object *obj) | ||
| 91 | { | ||
| 92 | if (!klp_is_module(obj)) | ||
| 93 | return; | ||
| 94 | |||
| 95 | mutex_lock(&module_mutex); | ||
| 96 | /* | ||
| 97 | * We don't need to take a reference on the module here because we have | ||
| 98 | * the klp_mutex, which is also taken by the module notifier. This | ||
| 99 | * prevents any module from unloading until we release the klp_mutex. | ||
| 100 | */ | ||
| 101 | obj->mod = find_module(obj->name); | ||
| 102 | mutex_unlock(&module_mutex); | ||
| 103 | } | ||
| 104 | |||
| 105 | /* klp_mutex must be held by caller */ | ||
| 106 | static bool klp_is_patch_registered(struct klp_patch *patch) | ||
| 107 | { | ||
| 108 | struct klp_patch *mypatch; | ||
| 109 | |||
| 110 | list_for_each_entry(mypatch, &klp_patches, list) | ||
| 111 | if (mypatch == patch) | ||
| 112 | return true; | ||
| 113 | |||
| 114 | return false; | ||
| 115 | } | ||
| 116 | |||
| 117 | static bool klp_initialized(void) | ||
| 118 | { | ||
| 119 | return klp_root_kobj; | ||
| 120 | } | ||
| 121 | |||
| 122 | struct klp_find_arg { | ||
| 123 | const char *objname; | ||
| 124 | const char *name; | ||
| 125 | unsigned long addr; | ||
| 126 | /* | ||
| 127 | * If count == 0, the symbol was not found. If count == 1, a unique | ||
| 128 | * match was found and addr is set. If count > 1, there is | ||
| 129 | * unresolvable ambiguity among "count" number of symbols with the same | ||
| 130 | * name in the same object. | ||
| 131 | */ | ||
| 132 | unsigned long count; | ||
| 133 | }; | ||
| 134 | |||
| 135 | static int klp_find_callback(void *data, const char *name, | ||
| 136 | struct module *mod, unsigned long addr) | ||
| 137 | { | ||
| 138 | struct klp_find_arg *args = data; | ||
| 139 | |||
| 140 | if ((mod && !args->objname) || (!mod && args->objname)) | ||
| 141 | return 0; | ||
| 142 | |||
| 143 | if (strcmp(args->name, name)) | ||
| 144 | return 0; | ||
| 145 | |||
| 146 | if (args->objname && strcmp(args->objname, mod->name)) | ||
| 147 | return 0; | ||
| 148 | |||
| 149 | /* | ||
| 150 | * args->addr might be overwritten if another match is found | ||
| 151 | * but klp_find_object_symbol() handles this and only returns the | ||
| 152 | * addr if count == 1. | ||
| 153 | */ | ||
| 154 | args->addr = addr; | ||
| 155 | args->count++; | ||
| 156 | |||
| 157 | return 0; | ||
| 158 | } | ||
| 159 | |||
| 160 | static int klp_find_object_symbol(const char *objname, const char *name, | ||
| 161 | unsigned long *addr) | ||
| 162 | { | ||
| 163 | struct klp_find_arg args = { | ||
| 164 | .objname = objname, | ||
| 165 | .name = name, | ||
| 166 | .addr = 0, | ||
| 167 | .count = 0 | ||
| 168 | }; | ||
| 169 | |||
| 170 | kallsyms_on_each_symbol(klp_find_callback, &args); | ||
| 171 | |||
| 172 | if (args.count == 0) | ||
| 173 | pr_err("symbol '%s' not found in symbol table\n", name); | ||
| 174 | else if (args.count > 1) | ||
| 175 | pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n", | ||
| 176 | args.count, name, objname); | ||
| 177 | else { | ||
| 178 | *addr = args.addr; | ||
| 179 | return 0; | ||
| 180 | } | ||
| 181 | |||
| 182 | *addr = 0; | ||
| 183 | return -EINVAL; | ||
| 184 | } | ||
| 185 | |||
| 186 | struct klp_verify_args { | ||
| 187 | const char *name; | ||
| 188 | const unsigned long addr; | ||
| 189 | }; | ||
| 190 | |||
| 191 | static int klp_verify_callback(void *data, const char *name, | ||
| 192 | struct module *mod, unsigned long addr) | ||
| 193 | { | ||
| 194 | struct klp_verify_args *args = data; | ||
| 195 | |||
| 196 | if (!mod && | ||
| 197 | !strcmp(args->name, name) && | ||
| 198 | args->addr == addr) | ||
| 199 | return 1; | ||
| 200 | |||
| 201 | return 0; | ||
| 202 | } | ||
| 203 | |||
| 204 | static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr) | ||
| 205 | { | ||
| 206 | struct klp_verify_args args = { | ||
| 207 | .name = name, | ||
| 208 | .addr = addr, | ||
| 209 | }; | ||
| 210 | |||
| 211 | if (kallsyms_on_each_symbol(klp_verify_callback, &args)) | ||
| 212 | return 0; | ||
| 213 | |||
| 214 | pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n", | ||
| 215 | name, addr); | ||
| 216 | return -EINVAL; | ||
| 217 | } | ||
| 218 | |||
| 219 | static int klp_find_verify_func_addr(struct klp_object *obj, | ||
| 220 | struct klp_func *func) | ||
| 221 | { | ||
| 222 | int ret; | ||
| 223 | |||
| 224 | #if defined(CONFIG_RANDOMIZE_BASE) | ||
| 225 | /* KASLR is enabled, disregard old_addr from user */ | ||
| 226 | func->old_addr = 0; | ||
| 227 | #endif | ||
| 228 | |||
| 229 | if (!func->old_addr || klp_is_module(obj)) | ||
| 230 | ret = klp_find_object_symbol(obj->name, func->old_name, | ||
| 231 | &func->old_addr); | ||
| 232 | else | ||
| 233 | ret = klp_verify_vmlinux_symbol(func->old_name, | ||
| 234 | func->old_addr); | ||
| 235 | |||
| 236 | return ret; | ||
| 237 | } | ||
| 238 | |||
| 239 | /* | ||
| 240 | * external symbols are located outside the parent object (where the parent | ||
| 241 | * object is either vmlinux or the kmod being patched). | ||
| 242 | */ | ||
| 243 | static int klp_find_external_symbol(struct module *pmod, const char *name, | ||
| 244 | unsigned long *addr) | ||
| 245 | { | ||
| 246 | const struct kernel_symbol *sym; | ||
| 247 | |||
| 248 | /* first, check if it's an exported symbol */ | ||
| 249 | preempt_disable(); | ||
| 250 | sym = find_symbol(name, NULL, NULL, true, true); | ||
| 251 | preempt_enable(); | ||
| 252 | if (sym) { | ||
| 253 | *addr = sym->value; | ||
| 254 | return 0; | ||
| 255 | } | ||
| 256 | |||
| 257 | /* otherwise check if it's in another .o within the patch module */ | ||
| 258 | return klp_find_object_symbol(pmod->name, name, addr); | ||
| 259 | } | ||
| 260 | |||
| 261 | static int klp_write_object_relocations(struct module *pmod, | ||
| 262 | struct klp_object *obj) | ||
| 263 | { | ||
| 264 | int ret; | ||
| 265 | struct klp_reloc *reloc; | ||
| 266 | |||
| 267 | if (WARN_ON(!klp_is_object_loaded(obj))) | ||
| 268 | return -EINVAL; | ||
| 269 | |||
| 270 | if (WARN_ON(!obj->relocs)) | ||
| 271 | return -EINVAL; | ||
| 272 | |||
| 273 | for (reloc = obj->relocs; reloc->name; reloc++) { | ||
| 274 | if (!klp_is_module(obj)) { | ||
| 275 | ret = klp_verify_vmlinux_symbol(reloc->name, | ||
| 276 | reloc->val); | ||
| 277 | if (ret) | ||
| 278 | return ret; | ||
| 279 | } else { | ||
| 280 | /* module, reloc->val needs to be discovered */ | ||
| 281 | if (reloc->external) | ||
| 282 | ret = klp_find_external_symbol(pmod, | ||
| 283 | reloc->name, | ||
| 284 | &reloc->val); | ||
| 285 | else | ||
| 286 | ret = klp_find_object_symbol(obj->mod->name, | ||
| 287 | reloc->name, | ||
| 288 | &reloc->val); | ||
| 289 | if (ret) | ||
| 290 | return ret; | ||
| 291 | } | ||
| 292 | ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc, | ||
| 293 | reloc->val + reloc->addend); | ||
| 294 | if (ret) { | ||
| 295 | pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n", | ||
| 296 | reloc->name, reloc->val, ret); | ||
| 297 | return ret; | ||
| 298 | } | ||
| 299 | } | ||
| 300 | |||
| 301 | return 0; | ||
| 302 | } | ||
| 303 | |||
| 304 | static void notrace klp_ftrace_handler(unsigned long ip, | ||
| 305 | unsigned long parent_ip, | ||
| 306 | struct ftrace_ops *fops, | ||
| 307 | struct pt_regs *regs) | ||
| 308 | { | ||
| 309 | struct klp_ops *ops; | ||
| 310 | struct klp_func *func; | ||
| 311 | |||
| 312 | ops = container_of(fops, struct klp_ops, fops); | ||
| 313 | |||
| 314 | rcu_read_lock(); | ||
| 315 | func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, | ||
| 316 | stack_node); | ||
| 317 | rcu_read_unlock(); | ||
| 318 | |||
| 319 | if (WARN_ON_ONCE(!func)) | ||
| 320 | return; | ||
| 321 | |||
| 322 | klp_arch_set_pc(regs, (unsigned long)func->new_func); | ||
| 323 | } | ||
| 324 | |||
| 325 | static int klp_disable_func(struct klp_func *func) | ||
| 326 | { | ||
| 327 | struct klp_ops *ops; | ||
| 328 | int ret; | ||
| 329 | |||
| 330 | if (WARN_ON(func->state != KLP_ENABLED)) | ||
| 331 | return -EINVAL; | ||
| 332 | |||
| 333 | if (WARN_ON(!func->old_addr)) | ||
| 334 | return -EINVAL; | ||
| 335 | |||
| 336 | ops = klp_find_ops(func->old_addr); | ||
| 337 | if (WARN_ON(!ops)) | ||
| 338 | return -EINVAL; | ||
| 339 | |||
| 340 | if (list_is_singular(&ops->func_stack)) { | ||
| 341 | ret = unregister_ftrace_function(&ops->fops); | ||
| 342 | if (ret) { | ||
| 343 | pr_err("failed to unregister ftrace handler for function '%s' (%d)\n", | ||
| 344 | func->old_name, ret); | ||
| 345 | return ret; | ||
| 346 | } | ||
| 347 | |||
| 348 | ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0); | ||
| 349 | if (ret) | ||
| 350 | pr_warn("function unregister succeeded but failed to clear the filter\n"); | ||
| 351 | |||
| 352 | list_del_rcu(&func->stack_node); | ||
| 353 | list_del(&ops->node); | ||
| 354 | kfree(ops); | ||
| 355 | } else { | ||
| 356 | list_del_rcu(&func->stack_node); | ||
| 357 | } | ||
| 358 | |||
| 359 | func->state = KLP_DISABLED; | ||
| 360 | |||
| 361 | return 0; | ||
| 362 | } | ||
| 363 | |||
| 364 | static int klp_enable_func(struct klp_func *func) | ||
| 365 | { | ||
| 366 | struct klp_ops *ops; | ||
| 367 | int ret; | ||
| 368 | |||
| 369 | if (WARN_ON(!func->old_addr)) | ||
| 370 | return -EINVAL; | ||
| 371 | |||
| 372 | if (WARN_ON(func->state != KLP_DISABLED)) | ||
| 373 | return -EINVAL; | ||
| 374 | |||
| 375 | ops = klp_find_ops(func->old_addr); | ||
| 376 | if (!ops) { | ||
| 377 | ops = kzalloc(sizeof(*ops), GFP_KERNEL); | ||
| 378 | if (!ops) | ||
| 379 | return -ENOMEM; | ||
| 380 | |||
| 381 | ops->fops.func = klp_ftrace_handler; | ||
| 382 | ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | | ||
| 383 | FTRACE_OPS_FL_DYNAMIC | | ||
| 384 | FTRACE_OPS_FL_IPMODIFY; | ||
| 385 | |||
| 386 | list_add(&ops->node, &klp_ops); | ||
| 387 | |||
| 388 | INIT_LIST_HEAD(&ops->func_stack); | ||
| 389 | list_add_rcu(&func->stack_node, &ops->func_stack); | ||
| 390 | |||
| 391 | ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 0, 0); | ||
| 392 | if (ret) { | ||
| 393 | pr_err("failed to set ftrace filter for function '%s' (%d)\n", | ||
| 394 | func->old_name, ret); | ||
| 395 | goto err; | ||
| 396 | } | ||
| 397 | |||
| 398 | ret = register_ftrace_function(&ops->fops); | ||
| 399 | if (ret) { | ||
| 400 | pr_err("failed to register ftrace handler for function '%s' (%d)\n", | ||
| 401 | func->old_name, ret); | ||
| 402 | ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0); | ||
| 403 | goto err; | ||
| 404 | } | ||
| 405 | |||
| 406 | |||
| 407 | } else { | ||
| 408 | list_add_rcu(&func->stack_node, &ops->func_stack); | ||
| 409 | } | ||
| 410 | |||
| 411 | func->state = KLP_ENABLED; | ||
| 412 | |||
| 413 | return 0; | ||
| 414 | |||
| 415 | err: | ||
| 416 | list_del_rcu(&func->stack_node); | ||
| 417 | list_del(&ops->node); | ||
| 418 | kfree(ops); | ||
| 419 | return ret; | ||
| 420 | } | ||
| 421 | |||
| 422 | static int klp_disable_object(struct klp_object *obj) | ||
| 423 | { | ||
| 424 | struct klp_func *func; | ||
| 425 | int ret; | ||
| 426 | |||
| 427 | for (func = obj->funcs; func->old_name; func++) { | ||
| 428 | if (func->state != KLP_ENABLED) | ||
| 429 | continue; | ||
| 430 | |||
| 431 | ret = klp_disable_func(func); | ||
| 432 | if (ret) | ||
| 433 | return ret; | ||
| 434 | } | ||
| 435 | |||
| 436 | obj->state = KLP_DISABLED; | ||
| 437 | |||
| 438 | return 0; | ||
| 439 | } | ||
| 440 | |||
| 441 | static int klp_enable_object(struct klp_object *obj) | ||
| 442 | { | ||
| 443 | struct klp_func *func; | ||
| 444 | int ret; | ||
| 445 | |||
| 446 | if (WARN_ON(obj->state != KLP_DISABLED)) | ||
| 447 | return -EINVAL; | ||
| 448 | |||
| 449 | if (WARN_ON(!klp_is_object_loaded(obj))) | ||
| 450 | return -EINVAL; | ||
| 451 | |||
| 452 | for (func = obj->funcs; func->old_name; func++) { | ||
| 453 | ret = klp_enable_func(func); | ||
| 454 | if (ret) | ||
| 455 | goto unregister; | ||
| 456 | } | ||
| 457 | obj->state = KLP_ENABLED; | ||
| 458 | |||
| 459 | return 0; | ||
| 460 | |||
| 461 | unregister: | ||
| 462 | WARN_ON(klp_disable_object(obj)); | ||
| 463 | return ret; | ||
| 464 | } | ||
| 465 | |||
| 466 | static int __klp_disable_patch(struct klp_patch *patch) | ||
| 467 | { | ||
| 468 | struct klp_object *obj; | ||
| 469 | int ret; | ||
| 470 | |||
| 471 | /* enforce stacking: only the last enabled patch can be disabled */ | ||
| 472 | if (!list_is_last(&patch->list, &klp_patches) && | ||
| 473 | list_next_entry(patch, list)->state == KLP_ENABLED) | ||
| 474 | return -EBUSY; | ||
| 475 | |||
| 476 | pr_notice("disabling patch '%s'\n", patch->mod->name); | ||
| 477 | |||
| 478 | for (obj = patch->objs; obj->funcs; obj++) { | ||
| 479 | if (obj->state != KLP_ENABLED) | ||
| 480 | continue; | ||
| 481 | |||
| 482 | ret = klp_disable_object(obj); | ||
| 483 | if (ret) | ||
| 484 | return ret; | ||
| 485 | } | ||
| 486 | |||
| 487 | patch->state = KLP_DISABLED; | ||
| 488 | |||
| 489 | return 0; | ||
| 490 | } | ||
| 491 | |||
| 492 | /** | ||
| 493 | * klp_disable_patch() - disables a registered patch | ||
| 494 | * @patch: The registered, enabled patch to be disabled | ||
| 495 | * | ||
| 496 | * Unregisters the patched functions from ftrace. | ||
| 497 | * | ||
| 498 | * Return: 0 on success, otherwise error | ||
| 499 | */ | ||
| 500 | int klp_disable_patch(struct klp_patch *patch) | ||
| 501 | { | ||
| 502 | int ret; | ||
| 503 | |||
| 504 | mutex_lock(&klp_mutex); | ||
| 505 | |||
| 506 | if (!klp_is_patch_registered(patch)) { | ||
| 507 | ret = -EINVAL; | ||
| 508 | goto err; | ||
| 509 | } | ||
| 510 | |||
| 511 | if (patch->state == KLP_DISABLED) { | ||
| 512 | ret = -EINVAL; | ||
| 513 | goto err; | ||
| 514 | } | ||
| 515 | |||
| 516 | ret = __klp_disable_patch(patch); | ||
| 517 | |||
| 518 | err: | ||
| 519 | mutex_unlock(&klp_mutex); | ||
| 520 | return ret; | ||
| 521 | } | ||
| 522 | EXPORT_SYMBOL_GPL(klp_disable_patch); | ||
| 523 | |||
| 524 | static int __klp_enable_patch(struct klp_patch *patch) | ||
| 525 | { | ||
| 526 | struct klp_object *obj; | ||
| 527 | int ret; | ||
| 528 | |||
| 529 | if (WARN_ON(patch->state != KLP_DISABLED)) | ||
| 530 | return -EINVAL; | ||
| 531 | |||
| 532 | /* enforce stacking: only the first disabled patch can be enabled */ | ||
| 533 | if (patch->list.prev != &klp_patches && | ||
| 534 | list_prev_entry(patch, list)->state == KLP_DISABLED) | ||
| 535 | return -EBUSY; | ||
| 536 | |||
| 537 | pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n"); | ||
| 538 | add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK); | ||
| 539 | |||
| 540 | pr_notice("enabling patch '%s'\n", patch->mod->name); | ||
| 541 | |||
| 542 | for (obj = patch->objs; obj->funcs; obj++) { | ||
| 543 | klp_find_object_module(obj); | ||
| 544 | |||
| 545 | if (!klp_is_object_loaded(obj)) | ||
| 546 | continue; | ||
| 547 | |||
| 548 | ret = klp_enable_object(obj); | ||
| 549 | if (ret) | ||
| 550 | goto unregister; | ||
| 551 | } | ||
| 552 | |||
| 553 | patch->state = KLP_ENABLED; | ||
| 554 | |||
| 555 | return 0; | ||
| 556 | |||
| 557 | unregister: | ||
| 558 | WARN_ON(__klp_disable_patch(patch)); | ||
| 559 | return ret; | ||
| 560 | } | ||
| 561 | |||
| 562 | /** | ||
| 563 | * klp_enable_patch() - enables a registered patch | ||
| 564 | * @patch: The registered, disabled patch to be enabled | ||
| 565 | * | ||
| 566 | * Performs the needed symbol lookups and code relocations, | ||
| 567 | * then registers the patched functions with ftrace. | ||
| 568 | * | ||
| 569 | * Return: 0 on success, otherwise error | ||
| 570 | */ | ||
| 571 | int klp_enable_patch(struct klp_patch *patch) | ||
| 572 | { | ||
| 573 | int ret; | ||
| 574 | |||
| 575 | mutex_lock(&klp_mutex); | ||
| 576 | |||
| 577 | if (!klp_is_patch_registered(patch)) { | ||
| 578 | ret = -EINVAL; | ||
| 579 | goto err; | ||
| 580 | } | ||
| 581 | |||
| 582 | ret = __klp_enable_patch(patch); | ||
| 583 | |||
| 584 | err: | ||
| 585 | mutex_unlock(&klp_mutex); | ||
| 586 | return ret; | ||
| 587 | } | ||
| 588 | EXPORT_SYMBOL_GPL(klp_enable_patch); | ||
| 589 | |||
| 590 | /* | ||
| 591 | * Sysfs Interface | ||
| 592 | * | ||
| 593 | * /sys/kernel/livepatch | ||
| 594 | * /sys/kernel/livepatch/<patch> | ||
| 595 | * /sys/kernel/livepatch/<patch>/enabled | ||
| 596 | * /sys/kernel/livepatch/<patch>/<object> | ||
| 597 | * /sys/kernel/livepatch/<patch>/<object>/<func> | ||
| 598 | */ | ||
| 599 | |||
| 600 | static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, | ||
| 601 | const char *buf, size_t count) | ||
| 602 | { | ||
| 603 | struct klp_patch *patch; | ||
| 604 | int ret; | ||
| 605 | unsigned long val; | ||
| 606 | |||
| 607 | ret = kstrtoul(buf, 10, &val); | ||
| 608 | if (ret) | ||
| 609 | return -EINVAL; | ||
| 610 | |||
| 611 | if (val != KLP_DISABLED && val != KLP_ENABLED) | ||
| 612 | return -EINVAL; | ||
| 613 | |||
| 614 | patch = container_of(kobj, struct klp_patch, kobj); | ||
| 615 | |||
| 616 | mutex_lock(&klp_mutex); | ||
| 617 | |||
| 618 | if (val == patch->state) { | ||
| 619 | /* already in requested state */ | ||
| 620 | ret = -EINVAL; | ||
| 621 | goto err; | ||
| 622 | } | ||
| 623 | |||
| 624 | if (val == KLP_ENABLED) { | ||
| 625 | ret = __klp_enable_patch(patch); | ||
| 626 | if (ret) | ||
| 627 | goto err; | ||
| 628 | } else { | ||
| 629 | ret = __klp_disable_patch(patch); | ||
| 630 | if (ret) | ||
| 631 | goto err; | ||
| 632 | } | ||
| 633 | |||
| 634 | mutex_unlock(&klp_mutex); | ||
| 635 | |||
| 636 | return count; | ||
| 637 | |||
| 638 | err: | ||
| 639 | mutex_unlock(&klp_mutex); | ||
| 640 | return ret; | ||
| 641 | } | ||
| 642 | |||
| 643 | static ssize_t enabled_show(struct kobject *kobj, | ||
| 644 | struct kobj_attribute *attr, char *buf) | ||
| 645 | { | ||
| 646 | struct klp_patch *patch; | ||
| 647 | |||
| 648 | patch = container_of(kobj, struct klp_patch, kobj); | ||
| 649 | return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state); | ||
| 650 | } | ||
| 651 | |||
| 652 | static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); | ||
| 653 | static struct attribute *klp_patch_attrs[] = { | ||
| 654 | &enabled_kobj_attr.attr, | ||
| 655 | NULL | ||
| 656 | }; | ||
| 657 | |||
| 658 | static void klp_kobj_release_patch(struct kobject *kobj) | ||
| 659 | { | ||
| 660 | /* | ||
| 661 | * Once we have a consistency model we'll need to module_put() the | ||
| 662 | * patch module here. See klp_register_patch() for more details. | ||
| 663 | */ | ||
| 664 | } | ||
| 665 | |||
| 666 | static struct kobj_type klp_ktype_patch = { | ||
| 667 | .release = klp_kobj_release_patch, | ||
| 668 | .sysfs_ops = &kobj_sysfs_ops, | ||
| 669 | .default_attrs = klp_patch_attrs, | ||
| 670 | }; | ||
| 671 | |||
| 672 | static void klp_kobj_release_func(struct kobject *kobj) | ||
| 673 | { | ||
| 674 | } | ||
| 675 | |||
| 676 | static struct kobj_type klp_ktype_func = { | ||
| 677 | .release = klp_kobj_release_func, | ||
| 678 | .sysfs_ops = &kobj_sysfs_ops, | ||
| 679 | }; | ||
| 680 | |||
| 681 | /* | ||
| 682 | * Free all functions' kobjects in the array up to some limit. When limit is | ||
| 683 | * NULL, all kobjects are freed. | ||
| 684 | */ | ||
| 685 | static void klp_free_funcs_limited(struct klp_object *obj, | ||
| 686 | struct klp_func *limit) | ||
| 687 | { | ||
| 688 | struct klp_func *func; | ||
| 689 | |||
| 690 | for (func = obj->funcs; func->old_name && func != limit; func++) | ||
| 691 | kobject_put(&func->kobj); | ||
| 692 | } | ||
| 693 | |||
| 694 | /* Clean up when a patched object is unloaded */ | ||
| 695 | static void klp_free_object_loaded(struct klp_object *obj) | ||
| 696 | { | ||
| 697 | struct klp_func *func; | ||
| 698 | |||
| 699 | obj->mod = NULL; | ||
| 700 | |||
| 701 | for (func = obj->funcs; func->old_name; func++) | ||
| 702 | func->old_addr = 0; | ||
| 703 | } | ||
| 704 | |||
| 705 | /* | ||
| 706 | * Free all objects' kobjects in the array up to some limit. When limit is | ||
| 707 | * NULL, all kobjects are freed. | ||
| 708 | */ | ||
| 709 | static void klp_free_objects_limited(struct klp_patch *patch, | ||
| 710 | struct klp_object *limit) | ||
| 711 | { | ||
| 712 | struct klp_object *obj; | ||
| 713 | |||
| 714 | for (obj = patch->objs; obj->funcs && obj != limit; obj++) { | ||
| 715 | klp_free_funcs_limited(obj, NULL); | ||
| 716 | kobject_put(obj->kobj); | ||
| 717 | } | ||
| 718 | } | ||
| 719 | |||
| 720 | static void klp_free_patch(struct klp_patch *patch) | ||
| 721 | { | ||
| 722 | klp_free_objects_limited(patch, NULL); | ||
| 723 | if (!list_empty(&patch->list)) | ||
| 724 | list_del(&patch->list); | ||
| 725 | kobject_put(&patch->kobj); | ||
| 726 | } | ||
| 727 | |||
| 728 | static int klp_init_func(struct klp_object *obj, struct klp_func *func) | ||
| 729 | { | ||
| 730 | INIT_LIST_HEAD(&func->stack_node); | ||
| 731 | func->state = KLP_DISABLED; | ||
| 732 | |||
| 733 | return kobject_init_and_add(&func->kobj, &klp_ktype_func, | ||
| 734 | obj->kobj, func->old_name); | ||
| 735 | } | ||
| 736 | |||
| 737 | /* parts of the initialization that is done only when the object is loaded */ | ||
| 738 | static int klp_init_object_loaded(struct klp_patch *patch, | ||
| 739 | struct klp_object *obj) | ||
| 740 | { | ||
| 741 | struct klp_func *func; | ||
| 742 | int ret; | ||
| 743 | |||
| 744 | if (obj->relocs) { | ||
| 745 | ret = klp_write_object_relocations(patch->mod, obj); | ||
| 746 | if (ret) | ||
| 747 | return ret; | ||
| 748 | } | ||
| 749 | |||
| 750 | for (func = obj->funcs; func->old_name; func++) { | ||
| 751 | ret = klp_find_verify_func_addr(obj, func); | ||
| 752 | if (ret) | ||
| 753 | return ret; | ||
| 754 | } | ||
| 755 | |||
| 756 | return 0; | ||
| 757 | } | ||
| 758 | |||
| 759 | static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) | ||
| 760 | { | ||
| 761 | struct klp_func *func; | ||
| 762 | int ret; | ||
| 763 | const char *name; | ||
| 764 | |||
| 765 | if (!obj->funcs) | ||
| 766 | return -EINVAL; | ||
| 767 | |||
| 768 | obj->state = KLP_DISABLED; | ||
| 769 | |||
| 770 | klp_find_object_module(obj); | ||
| 771 | |||
| 772 | name = klp_is_module(obj) ? obj->name : "vmlinux"; | ||
| 773 | obj->kobj = kobject_create_and_add(name, &patch->kobj); | ||
| 774 | if (!obj->kobj) | ||
| 775 | return -ENOMEM; | ||
| 776 | |||
| 777 | for (func = obj->funcs; func->old_name; func++) { | ||
| 778 | ret = klp_init_func(obj, func); | ||
| 779 | if (ret) | ||
| 780 | goto free; | ||
| 781 | } | ||
| 782 | |||
| 783 | if (klp_is_object_loaded(obj)) { | ||
| 784 | ret = klp_init_object_loaded(patch, obj); | ||
| 785 | if (ret) | ||
| 786 | goto free; | ||
| 787 | } | ||
| 788 | |||
| 789 | return 0; | ||
| 790 | |||
| 791 | free: | ||
| 792 | klp_free_funcs_limited(obj, func); | ||
| 793 | kobject_put(obj->kobj); | ||
| 794 | return ret; | ||
| 795 | } | ||
| 796 | |||
| 797 | static int klp_init_patch(struct klp_patch *patch) | ||
| 798 | { | ||
| 799 | struct klp_object *obj; | ||
| 800 | int ret; | ||
| 801 | |||
| 802 | if (!patch->objs) | ||
| 803 | return -EINVAL; | ||
| 804 | |||
| 805 | mutex_lock(&klp_mutex); | ||
| 806 | |||
| 807 | patch->state = KLP_DISABLED; | ||
| 808 | |||
| 809 | ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, | ||
| 810 | klp_root_kobj, patch->mod->name); | ||
| 811 | if (ret) | ||
| 812 | goto unlock; | ||
| 813 | |||
| 814 | for (obj = patch->objs; obj->funcs; obj++) { | ||
| 815 | ret = klp_init_object(patch, obj); | ||
| 816 | if (ret) | ||
| 817 | goto free; | ||
| 818 | } | ||
| 819 | |||
| 820 | list_add_tail(&patch->list, &klp_patches); | ||
| 821 | |||
| 822 | mutex_unlock(&klp_mutex); | ||
| 823 | |||
| 824 | return 0; | ||
| 825 | |||
| 826 | free: | ||
| 827 | klp_free_objects_limited(patch, obj); | ||
| 828 | kobject_put(&patch->kobj); | ||
| 829 | unlock: | ||
| 830 | mutex_unlock(&klp_mutex); | ||
| 831 | return ret; | ||
| 832 | } | ||
| 833 | |||
| 834 | /** | ||
| 835 | * klp_unregister_patch() - unregisters a patch | ||
| 836 | * @patch: Disabled patch to be unregistered | ||
| 837 | * | ||
| 838 | * Frees the data structures and removes the sysfs interface. | ||
| 839 | * | ||
| 840 | * Return: 0 on success, otherwise error | ||
| 841 | */ | ||
| 842 | int klp_unregister_patch(struct klp_patch *patch) | ||
| 843 | { | ||
| 844 | int ret = 0; | ||
| 845 | |||
| 846 | mutex_lock(&klp_mutex); | ||
| 847 | |||
| 848 | if (!klp_is_patch_registered(patch)) { | ||
| 849 | ret = -EINVAL; | ||
| 850 | goto out; | ||
| 851 | } | ||
| 852 | |||
| 853 | if (patch->state == KLP_ENABLED) { | ||
| 854 | ret = -EBUSY; | ||
| 855 | goto out; | ||
| 856 | } | ||
| 857 | |||
| 858 | klp_free_patch(patch); | ||
| 859 | |||
| 860 | out: | ||
| 861 | mutex_unlock(&klp_mutex); | ||
| 862 | return ret; | ||
| 863 | } | ||
| 864 | EXPORT_SYMBOL_GPL(klp_unregister_patch); | ||
| 865 | |||
| 866 | /** | ||
| 867 | * klp_register_patch() - registers a patch | ||
| 868 | * @patch: Patch to be registered | ||
| 869 | * | ||
| 870 | * Initializes the data structure associated with the patch and | ||
| 871 | * creates the sysfs interface. | ||
| 872 | * | ||
| 873 | * Return: 0 on success, otherwise error | ||
| 874 | */ | ||
| 875 | int klp_register_patch(struct klp_patch *patch) | ||
| 876 | { | ||
| 877 | int ret; | ||
| 878 | |||
| 879 | if (!klp_initialized()) | ||
| 880 | return -ENODEV; | ||
| 881 | |||
| 882 | if (!patch || !patch->mod) | ||
| 883 | return -EINVAL; | ||
| 884 | |||
| 885 | /* | ||
| 886 | * A reference is taken on the patch module to prevent it from being | ||
| 887 | * unloaded. Right now, we don't allow patch modules to unload since | ||
| 888 | * there is currently no method to determine if a thread is still | ||
| 889 | * running in the patched code contained in the patch module once | ||
| 890 | * the ftrace registration is successful. | ||
| 891 | */ | ||
| 892 | if (!try_module_get(patch->mod)) | ||
| 893 | return -ENODEV; | ||
| 894 | |||
| 895 | ret = klp_init_patch(patch); | ||
| 896 | if (ret) | ||
| 897 | module_put(patch->mod); | ||
| 898 | |||
| 899 | return ret; | ||
| 900 | } | ||
| 901 | EXPORT_SYMBOL_GPL(klp_register_patch); | ||
| 902 | |||
| 903 | static void klp_module_notify_coming(struct klp_patch *patch, | ||
| 904 | struct klp_object *obj) | ||
| 905 | { | ||
| 906 | struct module *pmod = patch->mod; | ||
| 907 | struct module *mod = obj->mod; | ||
| 908 | int ret; | ||
| 909 | |||
| 910 | ret = klp_init_object_loaded(patch, obj); | ||
| 911 | if (ret) | ||
| 912 | goto err; | ||
| 913 | |||
| 914 | if (patch->state == KLP_DISABLED) | ||
| 915 | return; | ||
| 916 | |||
| 917 | pr_notice("applying patch '%s' to loading module '%s'\n", | ||
| 918 | pmod->name, mod->name); | ||
| 919 | |||
| 920 | ret = klp_enable_object(obj); | ||
| 921 | if (!ret) | ||
| 922 | return; | ||
| 923 | |||
| 924 | err: | ||
| 925 | pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", | ||
| 926 | pmod->name, mod->name, ret); | ||
| 927 | } | ||
| 928 | |||
| 929 | static void klp_module_notify_going(struct klp_patch *patch, | ||
| 930 | struct klp_object *obj) | ||
| 931 | { | ||
| 932 | struct module *pmod = patch->mod; | ||
| 933 | struct module *mod = obj->mod; | ||
| 934 | int ret; | ||
| 935 | |||
| 936 | if (patch->state == KLP_DISABLED) | ||
| 937 | goto disabled; | ||
| 938 | |||
| 939 | pr_notice("reverting patch '%s' on unloading module '%s'\n", | ||
| 940 | pmod->name, mod->name); | ||
| 941 | |||
| 942 | ret = klp_disable_object(obj); | ||
| 943 | if (ret) | ||
| 944 | pr_warn("failed to revert patch '%s' on module '%s' (%d)\n", | ||
| 945 | pmod->name, mod->name, ret); | ||
| 946 | |||
| 947 | disabled: | ||
| 948 | klp_free_object_loaded(obj); | ||
| 949 | } | ||
| 950 | |||
| 951 | static int klp_module_notify(struct notifier_block *nb, unsigned long action, | ||
| 952 | void *data) | ||
| 953 | { | ||
| 954 | struct module *mod = data; | ||
| 955 | struct klp_patch *patch; | ||
| 956 | struct klp_object *obj; | ||
| 957 | |||
| 958 | if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING) | ||
| 959 | return 0; | ||
| 960 | |||
| 961 | mutex_lock(&klp_mutex); | ||
| 962 | |||
| 963 | list_for_each_entry(patch, &klp_patches, list) { | ||
| 964 | for (obj = patch->objs; obj->funcs; obj++) { | ||
| 965 | if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) | ||
| 966 | continue; | ||
| 967 | |||
| 968 | if (action == MODULE_STATE_COMING) { | ||
| 969 | obj->mod = mod; | ||
| 970 | klp_module_notify_coming(patch, obj); | ||
| 971 | } else /* MODULE_STATE_GOING */ | ||
| 972 | klp_module_notify_going(patch, obj); | ||
| 973 | |||
| 974 | break; | ||
| 975 | } | ||
| 976 | } | ||
| 977 | |||
| 978 | mutex_unlock(&klp_mutex); | ||
| 979 | |||
| 980 | return 0; | ||
| 981 | } | ||
| 982 | |||
| 983 | static struct notifier_block klp_module_nb = { | ||
| 984 | .notifier_call = klp_module_notify, | ||
| 985 | .priority = INT_MIN+1, /* called late but before ftrace notifier */ | ||
| 986 | }; | ||
| 987 | |||
| 988 | static int klp_init(void) | ||
| 989 | { | ||
| 990 | int ret; | ||
| 991 | |||
| 992 | ret = klp_check_compiler_support(); | ||
| 993 | if (ret) { | ||
| 994 | pr_info("Your compiler is too old; turning off.\n"); | ||
| 995 | return -EINVAL; | ||
| 996 | } | ||
| 997 | |||
| 998 | ret = register_module_notifier(&klp_module_nb); | ||
| 999 | if (ret) | ||
| 1000 | return ret; | ||
| 1001 | |||
| 1002 | klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); | ||
| 1003 | if (!klp_root_kobj) { | ||
| 1004 | ret = -ENOMEM; | ||
| 1005 | goto unregister; | ||
| 1006 | } | ||
| 1007 | |||
| 1008 | return 0; | ||
| 1009 | |||
| 1010 | unregister: | ||
| 1011 | unregister_module_notifier(&klp_module_nb); | ||
| 1012 | return ret; | ||
| 1013 | } | ||
| 1014 | |||
| 1015 | module_init(klp_init); | ||
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 8541bfdfd232..de7a416cca2a 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile | |||
| @@ -1,11 +1,11 @@ | |||
| 1 | 1 | ||
| 2 | obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o | 2 | obj-y += mutex.o semaphore.o rwsem.o |
| 3 | 3 | ||
| 4 | ifdef CONFIG_FUNCTION_TRACER | 4 | ifdef CONFIG_FUNCTION_TRACER |
| 5 | CFLAGS_REMOVE_lockdep.o = -pg | 5 | CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) |
| 6 | CFLAGS_REMOVE_lockdep_proc.o = -pg | 6 | CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE) |
| 7 | CFLAGS_REMOVE_mutex-debug.o = -pg | 7 | CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE) |
| 8 | CFLAGS_REMOVE_rtmutex-debug.o = -pg | 8 | CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE) |
| 9 | endif | 9 | endif |
| 10 | 10 | ||
| 11 | obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o | 11 | obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o |
| @@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y) | |||
| 14 | obj-$(CONFIG_LOCKDEP) += lockdep_proc.o | 14 | obj-$(CONFIG_LOCKDEP) += lockdep_proc.o |
| 15 | endif | 15 | endif |
| 16 | obj-$(CONFIG_SMP) += spinlock.o | 16 | obj-$(CONFIG_SMP) += spinlock.o |
| 17 | obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o | ||
| 17 | obj-$(CONFIG_SMP) += lglock.o | 18 | obj-$(CONFIG_SMP) += lglock.o |
| 18 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o | 19 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o |
| 19 | obj-$(CONFIG_RT_MUTEXES) += rtmutex.o | 20 | obj-$(CONFIG_RT_MUTEXES) += rtmutex.o |
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 4d60986fcbee..d1fe2ba5bac9 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h | |||
| @@ -108,20 +108,4 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | |||
| 108 | arch_mcs_spin_unlock_contended(&next->locked); | 108 | arch_mcs_spin_unlock_contended(&next->locked); |
| 109 | } | 109 | } |
| 110 | 110 | ||
| 111 | /* | ||
| 112 | * Cancellable version of the MCS lock above. | ||
| 113 | * | ||
| 114 | * Intended for adaptive spinning of sleeping locks: | ||
| 115 | * mutex_lock()/rwsem_down_{read,write}() etc. | ||
| 116 | */ | ||
| 117 | |||
| 118 | struct optimistic_spin_node { | ||
| 119 | struct optimistic_spin_node *next, *prev; | ||
| 120 | int locked; /* 1 if lock acquired */ | ||
| 121 | int cpu; /* encoded CPU # value */ | ||
| 122 | }; | ||
| 123 | |||
| 124 | extern bool osq_lock(struct optimistic_spin_queue *lock); | ||
| 125 | extern void osq_unlock(struct optimistic_spin_queue *lock); | ||
| 126 | |||
| 127 | #endif /* __LINUX_MCS_SPINLOCK_H */ | 111 | #endif /* __LINUX_MCS_SPINLOCK_H */ |
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 454195194d4a..94674e5919cb 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c | |||
| @@ -81,7 +81,7 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); | |||
| 81 | * The mutex must later on be released by the same task that | 81 | * The mutex must later on be released by the same task that |
| 82 | * acquired it. Recursive locking is not allowed. The task | 82 | * acquired it. Recursive locking is not allowed. The task |
| 83 | * may not exit without first unlocking the mutex. Also, kernel | 83 | * may not exit without first unlocking the mutex. Also, kernel |
| 84 | * memory where the mutex resides mutex must not be freed with | 84 | * memory where the mutex resides must not be freed with |
| 85 | * the mutex still locked. The mutex must first be initialized | 85 | * the mutex still locked. The mutex must first be initialized |
| 86 | * (or statically defined) before it can be locked. memset()-ing | 86 | * (or statically defined) before it can be locked. memset()-ing |
| 87 | * the mutex to 0 is not allowed. | 87 | * the mutex to 0 is not allowed. |
| @@ -147,7 +147,7 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, | |||
| 147 | } | 147 | } |
| 148 | 148 | ||
| 149 | /* | 149 | /* |
| 150 | * after acquiring lock with fastpath or when we lost out in contested | 150 | * After acquiring lock with fastpath or when we lost out in contested |
| 151 | * slowpath, set ctx and wake up any waiters so they can recheck. | 151 | * slowpath, set ctx and wake up any waiters so they can recheck. |
| 152 | * | 152 | * |
| 153 | * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, | 153 | * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, |
| @@ -191,19 +191,32 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, | |||
| 191 | spin_unlock_mutex(&lock->base.wait_lock, flags); | 191 | spin_unlock_mutex(&lock->base.wait_lock, flags); |
| 192 | } | 192 | } |
| 193 | 193 | ||
| 194 | |||
| 195 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | ||
| 196 | /* | 194 | /* |
| 197 | * In order to avoid a stampede of mutex spinners from acquiring the mutex | 195 | * After acquiring lock in the slowpath set ctx and wake up any |
| 198 | * more or less simultaneously, the spinners need to acquire a MCS lock | 196 | * waiters so they can recheck. |
| 199 | * first before spinning on the owner field. | ||
| 200 | * | 197 | * |
| 198 | * Callers must hold the mutex wait_lock. | ||
| 201 | */ | 199 | */ |
| 200 | static __always_inline void | ||
| 201 | ww_mutex_set_context_slowpath(struct ww_mutex *lock, | ||
| 202 | struct ww_acquire_ctx *ctx) | ||
| 203 | { | ||
| 204 | struct mutex_waiter *cur; | ||
| 202 | 205 | ||
| 203 | /* | 206 | ww_mutex_lock_acquired(lock, ctx); |
| 204 | * Mutex spinning code migrated from kernel/sched/core.c | 207 | lock->ctx = ctx; |
| 205 | */ | 208 | |
| 209 | /* | ||
| 210 | * Give any possible sleeping processes the chance to wake up, | ||
| 211 | * so they can recheck if they have to back off. | ||
| 212 | */ | ||
| 213 | list_for_each_entry(cur, &lock->base.wait_list, list) { | ||
| 214 | debug_mutex_wake_waiter(&lock->base, cur); | ||
| 215 | wake_up_process(cur->task); | ||
| 216 | } | ||
| 217 | } | ||
| 206 | 218 | ||
| 219 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | ||
| 207 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) | 220 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) |
| 208 | { | 221 | { |
| 209 | if (lock->owner != owner) | 222 | if (lock->owner != owner) |
| @@ -307,6 +320,11 @@ static bool mutex_optimistic_spin(struct mutex *lock, | |||
| 307 | if (!mutex_can_spin_on_owner(lock)) | 320 | if (!mutex_can_spin_on_owner(lock)) |
| 308 | goto done; | 321 | goto done; |
| 309 | 322 | ||
| 323 | /* | ||
| 324 | * In order to avoid a stampede of mutex spinners trying to | ||
| 325 | * acquire the mutex all at once, the spinners need to take a | ||
| 326 | * MCS (queued) lock first before spinning on the owner field. | ||
| 327 | */ | ||
| 310 | if (!osq_lock(&lock->osq)) | 328 | if (!osq_lock(&lock->osq)) |
| 311 | goto done; | 329 | goto done; |
| 312 | 330 | ||
| @@ -469,7 +487,7 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) | |||
| 469 | EXPORT_SYMBOL(ww_mutex_unlock); | 487 | EXPORT_SYMBOL(ww_mutex_unlock); |
| 470 | 488 | ||
| 471 | static inline int __sched | 489 | static inline int __sched |
| 472 | __mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) | 490 | __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) |
| 473 | { | 491 | { |
| 474 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); | 492 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); |
| 475 | struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); | 493 | struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); |
| @@ -557,7 +575,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
| 557 | } | 575 | } |
| 558 | 576 | ||
| 559 | if (use_ww_ctx && ww_ctx->acquired > 0) { | 577 | if (use_ww_ctx && ww_ctx->acquired > 0) { |
| 560 | ret = __mutex_lock_check_stamp(lock, ww_ctx); | 578 | ret = __ww_mutex_lock_check_stamp(lock, ww_ctx); |
| 561 | if (ret) | 579 | if (ret) |
| 562 | goto err; | 580 | goto err; |
| 563 | } | 581 | } |
| @@ -569,6 +587,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
| 569 | schedule_preempt_disabled(); | 587 | schedule_preempt_disabled(); |
| 570 | spin_lock_mutex(&lock->wait_lock, flags); | 588 | spin_lock_mutex(&lock->wait_lock, flags); |
| 571 | } | 589 | } |
| 590 | __set_task_state(task, TASK_RUNNING); | ||
| 591 | |||
| 572 | mutex_remove_waiter(lock, &waiter, current_thread_info()); | 592 | mutex_remove_waiter(lock, &waiter, current_thread_info()); |
| 573 | /* set it to 0 if there are no waiters left: */ | 593 | /* set it to 0 if there are no waiters left: */ |
| 574 | if (likely(list_empty(&lock->wait_list))) | 594 | if (likely(list_empty(&lock->wait_list))) |
| @@ -582,23 +602,7 @@ skip_wait: | |||
| 582 | 602 | ||
| 583 | if (use_ww_ctx) { | 603 | if (use_ww_ctx) { |
| 584 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); | 604 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); |
| 585 | struct mutex_waiter *cur; | 605 | ww_mutex_set_context_slowpath(ww, ww_ctx); |
| 586 | |||
| 587 | /* | ||
| 588 | * This branch gets optimized out for the common case, | ||
| 589 | * and is only important for ww_mutex_lock. | ||
| 590 | */ | ||
| 591 | ww_mutex_lock_acquired(ww, ww_ctx); | ||
| 592 | ww->ctx = ww_ctx; | ||
| 593 | |||
| 594 | /* | ||
| 595 | * Give any possible sleeping processes the chance to wake up, | ||
| 596 | * so they can recheck if they have to back off. | ||
| 597 | */ | ||
| 598 | list_for_each_entry(cur, &lock->wait_list, list) { | ||
| 599 | debug_mutex_wake_waiter(lock, cur); | ||
| 600 | wake_up_process(cur->task); | ||
| 601 | } | ||
| 602 | } | 606 | } |
| 603 | 607 | ||
| 604 | spin_unlock_mutex(&lock->wait_lock, flags); | 608 | spin_unlock_mutex(&lock->wait_lock, flags); |
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/osq_lock.c index 9887a905a762..c112d00341b0 100644 --- a/kernel/locking/mcs_spinlock.c +++ b/kernel/locking/osq_lock.c | |||
| @@ -1,8 +1,6 @@ | |||
| 1 | #include <linux/percpu.h> | 1 | #include <linux/percpu.h> |
| 2 | #include <linux/sched.h> | 2 | #include <linux/sched.h> |
| 3 | #include "mcs_spinlock.h" | 3 | #include <linux/osq_lock.h> |
| 4 | |||
| 5 | #ifdef CONFIG_SMP | ||
| 6 | 4 | ||
| 7 | /* | 5 | /* |
| 8 | * An MCS like lock especially tailored for optimistic spinning for sleeping | 6 | * An MCS like lock especially tailored for optimistic spinning for sleeping |
| @@ -111,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) | |||
| 111 | * cmpxchg in an attempt to undo our queueing. | 109 | * cmpxchg in an attempt to undo our queueing. |
| 112 | */ | 110 | */ |
| 113 | 111 | ||
| 114 | while (!smp_load_acquire(&node->locked)) { | 112 | while (!ACCESS_ONCE(node->locked)) { |
| 115 | /* | 113 | /* |
| 116 | * If we need to reschedule bail... so we can block. | 114 | * If we need to reschedule bail... so we can block. |
| 117 | */ | 115 | */ |
| @@ -203,6 +201,3 @@ void osq_unlock(struct optimistic_spin_queue *lock) | |||
| 203 | if (next) | 201 | if (next) |
| 204 | ACCESS_ONCE(next->locked) = 1; | 202 | ACCESS_ONCE(next->locked) = 1; |
| 205 | } | 203 | } |
| 206 | |||
| 207 | #endif | ||
| 208 | |||
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 7c98873a3077..e16e5542bf13 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
| @@ -1130,6 +1130,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
| 1130 | set_current_state(state); | 1130 | set_current_state(state); |
| 1131 | } | 1131 | } |
| 1132 | 1132 | ||
| 1133 | __set_current_state(TASK_RUNNING); | ||
| 1133 | return ret; | 1134 | return ret; |
| 1134 | } | 1135 | } |
| 1135 | 1136 | ||
| @@ -1188,12 +1189,12 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
| 1188 | ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); | 1189 | ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); |
| 1189 | 1190 | ||
| 1190 | if (likely(!ret)) | 1191 | if (likely(!ret)) |
| 1192 | /* sleep on the mutex */ | ||
| 1191 | ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); | 1193 | ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); |
| 1192 | 1194 | ||
| 1193 | set_current_state(TASK_RUNNING); | ||
| 1194 | |||
| 1195 | if (unlikely(ret)) { | 1195 | if (unlikely(ret)) { |
| 1196 | remove_waiter(lock, &waiter); | 1196 | if (rt_mutex_has_waiters(lock)) |
| 1197 | remove_waiter(lock, &waiter); | ||
| 1197 | rt_mutex_handle_deadlock(ret, chwalk, &waiter); | 1198 | rt_mutex_handle_deadlock(ret, chwalk, &waiter); |
| 1198 | } | 1199 | } |
| 1199 | 1200 | ||
| @@ -1626,10 +1627,9 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, | |||
| 1626 | 1627 | ||
| 1627 | set_current_state(TASK_INTERRUPTIBLE); | 1628 | set_current_state(TASK_INTERRUPTIBLE); |
| 1628 | 1629 | ||
| 1630 | /* sleep on the mutex */ | ||
| 1629 | ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); | 1631 | ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); |
| 1630 | 1632 | ||
| 1631 | set_current_state(TASK_RUNNING); | ||
| 1632 | |||
| 1633 | if (unlikely(ret)) | 1633 | if (unlikely(ret)) |
| 1634 | remove_waiter(lock, waiter); | 1634 | remove_waiter(lock, waiter); |
| 1635 | 1635 | ||
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 2c93571162cb..2555ae15ec14 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c | |||
| @@ -154,7 +154,7 @@ void __sched __down_read(struct rw_semaphore *sem) | |||
| 154 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | 154 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); |
| 155 | } | 155 | } |
| 156 | 156 | ||
| 157 | tsk->state = TASK_RUNNING; | 157 | __set_task_state(tsk, TASK_RUNNING); |
| 158 | out: | 158 | out: |
| 159 | ; | 159 | ; |
| 160 | } | 160 | } |
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 7628c3fc37ca..2f7cc4076f50 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c | |||
| @@ -242,8 +242,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
| 242 | schedule(); | 242 | schedule(); |
| 243 | } | 243 | } |
| 244 | 244 | ||
| 245 | tsk->state = TASK_RUNNING; | 245 | __set_task_state(tsk, TASK_RUNNING); |
| 246 | |||
| 247 | return sem; | 246 | return sem; |
| 248 | } | 247 | } |
| 249 | EXPORT_SYMBOL(rwsem_down_read_failed); | 248 | EXPORT_SYMBOL(rwsem_down_read_failed); |
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 4b082b5cac9e..db3ccb1dd614 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c | |||
| @@ -363,6 +363,14 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) | |||
| 363 | } | 363 | } |
| 364 | EXPORT_SYMBOL(_raw_spin_lock_nested); | 364 | EXPORT_SYMBOL(_raw_spin_lock_nested); |
| 365 | 365 | ||
| 366 | void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass) | ||
| 367 | { | ||
| 368 | __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); | ||
| 369 | spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); | ||
| 370 | LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); | ||
| 371 | } | ||
| 372 | EXPORT_SYMBOL(_raw_spin_lock_bh_nested); | ||
| 373 | |||
| 366 | unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, | 374 | unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, |
| 367 | int subclass) | 375 | int subclass) |
| 368 | { | 376 | { |
diff --git a/kernel/module.c b/kernel/module.c index d856e96a3cce..b34813f725e9 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -56,6 +56,7 @@ | |||
| 56 | #include <linux/async.h> | 56 | #include <linux/async.h> |
| 57 | #include <linux/percpu.h> | 57 | #include <linux/percpu.h> |
| 58 | #include <linux/kmemleak.h> | 58 | #include <linux/kmemleak.h> |
| 59 | #include <linux/kasan.h> | ||
| 59 | #include <linux/jump_label.h> | 60 | #include <linux/jump_label.h> |
| 60 | #include <linux/pfn.h> | 61 | #include <linux/pfn.h> |
| 61 | #include <linux/bsearch.h> | 62 | #include <linux/bsearch.h> |
| @@ -1225,6 +1226,12 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, | |||
| 1225 | const unsigned long *crc; | 1226 | const unsigned long *crc; |
| 1226 | int err; | 1227 | int err; |
| 1227 | 1228 | ||
| 1229 | /* | ||
| 1230 | * The module_mutex should not be a heavily contended lock; | ||
| 1231 | * if we get the occasional sleep here, we'll go an extra iteration | ||
| 1232 | * in the wait_event_interruptible(), which is harmless. | ||
| 1233 | */ | ||
| 1234 | sched_annotate_sleep(); | ||
| 1228 | mutex_lock(&module_mutex); | 1235 | mutex_lock(&module_mutex); |
| 1229 | sym = find_symbol(name, &owner, &crc, | 1236 | sym = find_symbol(name, &owner, &crc, |
| 1230 | !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); | 1237 | !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); |
| @@ -1807,6 +1814,7 @@ static void unset_module_init_ro_nx(struct module *mod) { } | |||
| 1807 | void __weak module_memfree(void *module_region) | 1814 | void __weak module_memfree(void *module_region) |
| 1808 | { | 1815 | { |
| 1809 | vfree(module_region); | 1816 | vfree(module_region); |
| 1817 | kasan_module_free(module_region); | ||
| 1810 | } | 1818 | } |
| 1811 | 1819 | ||
| 1812 | void __weak module_arch_cleanup(struct module *mod) | 1820 | void __weak module_arch_cleanup(struct module *mod) |
| @@ -2978,6 +2986,12 @@ static bool finished_loading(const char *name) | |||
| 2978 | struct module *mod; | 2986 | struct module *mod; |
| 2979 | bool ret; | 2987 | bool ret; |
| 2980 | 2988 | ||
| 2989 | /* | ||
| 2990 | * The module_mutex should not be a heavily contended lock; | ||
| 2991 | * if we get the occasional sleep here, we'll go an extra iteration | ||
| 2992 | * in the wait_event_interruptible(), which is harmless. | ||
| 2993 | */ | ||
| 2994 | sched_annotate_sleep(); | ||
| 2981 | mutex_lock(&module_mutex); | 2995 | mutex_lock(&module_mutex); |
| 2982 | mod = find_module_all(name, strlen(name), true); | 2996 | mod = find_module_all(name, strlen(name), true); |
| 2983 | ret = !mod || mod->state == MODULE_STATE_LIVE | 2997 | ret = !mod || mod->state == MODULE_STATE_LIVE |
| @@ -3011,8 +3025,13 @@ static void do_free_init(struct rcu_head *head) | |||
| 3011 | kfree(m); | 3025 | kfree(m); |
| 3012 | } | 3026 | } |
| 3013 | 3027 | ||
| 3014 | /* This is where the real work happens */ | 3028 | /* |
| 3015 | static int do_init_module(struct module *mod) | 3029 | * This is where the real work happens. |
| 3030 | * | ||
| 3031 | * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb | ||
| 3032 | * helper command 'lx-symbols'. | ||
| 3033 | */ | ||
| 3034 | static noinline int do_init_module(struct module *mod) | ||
| 3016 | { | 3035 | { |
| 3017 | int ret = 0; | 3036 | int ret = 0; |
| 3018 | struct mod_initfree *freeinit; | 3037 | struct mod_initfree *freeinit; |
| @@ -3120,32 +3139,6 @@ static int may_init_module(void) | |||
| 3120 | } | 3139 | } |
| 3121 | 3140 | ||
| 3122 | /* | 3141 | /* |
| 3123 | * Can't use wait_event_interruptible() because our condition | ||
| 3124 | * 'finished_loading()' contains a blocking primitive itself (mutex_lock). | ||
| 3125 | */ | ||
| 3126 | static int wait_finished_loading(struct module *mod) | ||
| 3127 | { | ||
| 3128 | DEFINE_WAIT_FUNC(wait, woken_wake_function); | ||
| 3129 | int ret = 0; | ||
| 3130 | |||
| 3131 | add_wait_queue(&module_wq, &wait); | ||
| 3132 | for (;;) { | ||
| 3133 | if (finished_loading(mod->name)) | ||
| 3134 | break; | ||
| 3135 | |||
| 3136 | if (signal_pending(current)) { | ||
| 3137 | ret = -ERESTARTSYS; | ||
| 3138 | break; | ||
| 3139 | } | ||
| 3140 | |||
| 3141 | wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); | ||
| 3142 | } | ||
| 3143 | remove_wait_queue(&module_wq, &wait); | ||
| 3144 | |||
| 3145 | return ret; | ||
| 3146 | } | ||
| 3147 | |||
| 3148 | /* | ||
| 3149 | * We try to place it in the list now to make sure it's unique before | 3142 | * We try to place it in the list now to make sure it's unique before |
| 3150 | * we dedicate too many resources. In particular, temporary percpu | 3143 | * we dedicate too many resources. In particular, temporary percpu |
| 3151 | * memory exhaustion. | 3144 | * memory exhaustion. |
| @@ -3165,8 +3158,8 @@ again: | |||
| 3165 | || old->state == MODULE_STATE_UNFORMED) { | 3158 | || old->state == MODULE_STATE_UNFORMED) { |
| 3166 | /* Wait in case it fails to load. */ | 3159 | /* Wait in case it fails to load. */ |
| 3167 | mutex_unlock(&module_mutex); | 3160 | mutex_unlock(&module_mutex); |
| 3168 | 3161 | err = wait_event_interruptible(module_wq, | |
| 3169 | err = wait_finished_loading(mod); | 3162 | finished_loading(mod->name)); |
| 3170 | if (err) | 3163 | if (err) |
| 3171 | goto out_unlocked; | 3164 | goto out_unlocked; |
| 3172 | goto again; | 3165 | goto again; |
| @@ -3265,7 +3258,7 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
| 3265 | mod->sig_ok = info->sig_ok; | 3258 | mod->sig_ok = info->sig_ok; |
| 3266 | if (!mod->sig_ok) { | 3259 | if (!mod->sig_ok) { |
| 3267 | pr_notice_once("%s: module verification failed: signature " | 3260 | pr_notice_once("%s: module verification failed: signature " |
| 3268 | "and/or required key missing - tainting " | 3261 | "and/or required key missing - tainting " |
| 3269 | "kernel\n", mod->name); | 3262 | "kernel\n", mod->name); |
| 3270 | add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK); | 3263 | add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK); |
| 3271 | } | 3264 | } |
| @@ -3356,6 +3349,9 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
| 3356 | module_bug_cleanup(mod); | 3349 | module_bug_cleanup(mod); |
| 3357 | mutex_unlock(&module_mutex); | 3350 | mutex_unlock(&module_mutex); |
| 3358 | 3351 | ||
| 3352 | /* Free lock-classes: */ | ||
| 3353 | lockdep_free_key_range(mod->module_core, mod->core_size); | ||
| 3354 | |||
| 3359 | /* we can't deallocate the module until we clear memory protection */ | 3355 | /* we can't deallocate the module until we clear memory protection */ |
| 3360 | unset_module_init_ro_nx(mod); | 3356 | unset_module_init_ro_nx(mod); |
| 3361 | unset_module_core_ro_nx(mod); | 3357 | unset_module_core_ro_nx(mod); |
diff --git a/kernel/notifier.c b/kernel/notifier.c index 4803da6eab62..ae9fc7cc360e 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c | |||
| @@ -402,6 +402,7 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh, | |||
| 402 | } | 402 | } |
| 403 | EXPORT_SYMBOL_GPL(raw_notifier_call_chain); | 403 | EXPORT_SYMBOL_GPL(raw_notifier_call_chain); |
| 404 | 404 | ||
| 405 | #ifdef CONFIG_SRCU | ||
| 405 | /* | 406 | /* |
| 406 | * SRCU notifier chain routines. Registration and unregistration | 407 | * SRCU notifier chain routines. Registration and unregistration |
| 407 | * use a mutex, and call_chain is synchronized by SRCU (no locks). | 408 | * use a mutex, and call_chain is synchronized by SRCU (no locks). |
| @@ -528,6 +529,8 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh) | |||
| 528 | } | 529 | } |
| 529 | EXPORT_SYMBOL_GPL(srcu_init_notifier_head); | 530 | EXPORT_SYMBOL_GPL(srcu_init_notifier_head); |
| 530 | 531 | ||
| 532 | #endif /* CONFIG_SRCU */ | ||
| 533 | |||
| 531 | static ATOMIC_NOTIFIER_HEAD(die_chain); | 534 | static ATOMIC_NOTIFIER_HEAD(die_chain); |
| 532 | 535 | ||
| 533 | int notrace notify_die(enum die_val val, const char *str, | 536 | int notrace notify_die(enum die_val val, const char *str, |
diff --git a/kernel/padata.c b/kernel/padata.c index 161402f0b517..b38bea9c466a 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
| @@ -917,15 +917,10 @@ static ssize_t show_cpumask(struct padata_instance *pinst, | |||
| 917 | else | 917 | else |
| 918 | cpumask = pinst->cpumask.pcpu; | 918 | cpumask = pinst->cpumask.pcpu; |
| 919 | 919 | ||
| 920 | len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask), | 920 | len = snprintf(buf, PAGE_SIZE, "%*pb\n", |
| 921 | nr_cpu_ids); | 921 | nr_cpu_ids, cpumask_bits(cpumask)); |
| 922 | if (PAGE_SIZE - len < 2) | ||
| 923 | len = -EINVAL; | ||
| 924 | else | ||
| 925 | len += sprintf(buf + len, "\n"); | ||
| 926 | |||
| 927 | mutex_unlock(&pinst->lock); | 922 | mutex_unlock(&pinst->lock); |
| 928 | return len; | 923 | return len < PAGE_SIZE ? len : -EINVAL; |
| 929 | } | 924 | } |
| 930 | 925 | ||
| 931 | static ssize_t store_cpumask(struct padata_instance *pinst, | 926 | static ssize_t store_cpumask(struct padata_instance *pinst, |
diff --git a/kernel/panic.c b/kernel/panic.c index 4d8d6f906dec..8136ad76e5fd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -226,6 +226,7 @@ static const struct tnt tnts[] = { | |||
| 226 | { TAINT_OOT_MODULE, 'O', ' ' }, | 226 | { TAINT_OOT_MODULE, 'O', ' ' }, |
| 227 | { TAINT_UNSIGNED_MODULE, 'E', ' ' }, | 227 | { TAINT_UNSIGNED_MODULE, 'E', ' ' }, |
| 228 | { TAINT_SOFTLOCKUP, 'L', ' ' }, | 228 | { TAINT_SOFTLOCKUP, 'L', ' ' }, |
| 229 | { TAINT_LIVEPATCH, 'K', ' ' }, | ||
| 229 | }; | 230 | }; |
| 230 | 231 | ||
| 231 | /** | 232 | /** |
| @@ -246,6 +247,7 @@ static const struct tnt tnts[] = { | |||
| 246 | * 'O' - Out-of-tree module has been loaded. | 247 | * 'O' - Out-of-tree module has been loaded. |
| 247 | * 'E' - Unsigned module has been loaded. | 248 | * 'E' - Unsigned module has been loaded. |
| 248 | * 'L' - A soft lockup has previously occurred. | 249 | * 'L' - A soft lockup has previously occurred. |
| 250 | * 'K' - Kernel has been live patched. | ||
| 249 | * | 251 | * |
| 250 | * The string is overwritten by the next call to print_tainted(). | 252 | * The string is overwritten by the next call to print_tainted(). |
| 251 | */ | 253 | */ |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 48b28d387c7f..7e01f78f0417 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -251,6 +251,7 @@ config APM_EMULATION | |||
| 251 | 251 | ||
| 252 | config PM_OPP | 252 | config PM_OPP |
| 253 | bool | 253 | bool |
| 254 | select SRCU | ||
| 254 | ---help--- | 255 | ---help--- |
| 255 | SOCs have a standard set of tuples consisting of frequency and | 256 | SOCs have a standard set of tuples consisting of frequency and |
| 256 | voltage pairs that the device will support per voltage domain. This | 257 | voltage pairs that the device will support per voltage domain. This |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 5a6ec8678b9a..564f786df470 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
| @@ -84,8 +84,8 @@ static int try_to_freeze_tasks(bool user_only) | |||
| 84 | elapsed_msecs = elapsed_msecs64; | 84 | elapsed_msecs = elapsed_msecs64; |
| 85 | 85 | ||
| 86 | if (todo) { | 86 | if (todo) { |
| 87 | printk("\n"); | 87 | pr_cont("\n"); |
| 88 | printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds " | 88 | pr_err("Freezing of tasks %s after %d.%03d seconds " |
| 89 | "(%d tasks refusing to freeze, wq_busy=%d):\n", | 89 | "(%d tasks refusing to freeze, wq_busy=%d):\n", |
| 90 | wakeup ? "aborted" : "failed", | 90 | wakeup ? "aborted" : "failed", |
| 91 | elapsed_msecs / 1000, elapsed_msecs % 1000, | 91 | elapsed_msecs / 1000, elapsed_msecs % 1000, |
| @@ -101,37 +101,13 @@ static int try_to_freeze_tasks(bool user_only) | |||
| 101 | read_unlock(&tasklist_lock); | 101 | read_unlock(&tasklist_lock); |
| 102 | } | 102 | } |
| 103 | } else { | 103 | } else { |
| 104 | printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, | 104 | pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, |
| 105 | elapsed_msecs % 1000); | 105 | elapsed_msecs % 1000); |
| 106 | } | 106 | } |
| 107 | 107 | ||
| 108 | return todo ? -EBUSY : 0; | 108 | return todo ? -EBUSY : 0; |
| 109 | } | 109 | } |
| 110 | 110 | ||
| 111 | static bool __check_frozen_processes(void) | ||
| 112 | { | ||
| 113 | struct task_struct *g, *p; | ||
| 114 | |||
| 115 | for_each_process_thread(g, p) | ||
| 116 | if (p != current && !freezer_should_skip(p) && !frozen(p)) | ||
| 117 | return false; | ||
| 118 | |||
| 119 | return true; | ||
| 120 | } | ||
| 121 | |||
| 122 | /* | ||
| 123 | * Returns true if all freezable tasks (except for current) are frozen already | ||
| 124 | */ | ||
| 125 | static bool check_frozen_processes(void) | ||
| 126 | { | ||
| 127 | bool ret; | ||
| 128 | |||
| 129 | read_lock(&tasklist_lock); | ||
| 130 | ret = __check_frozen_processes(); | ||
| 131 | read_unlock(&tasklist_lock); | ||
| 132 | return ret; | ||
| 133 | } | ||
| 134 | |||
| 135 | /** | 111 | /** |
| 136 | * freeze_processes - Signal user space processes to enter the refrigerator. | 112 | * freeze_processes - Signal user space processes to enter the refrigerator. |
| 137 | * The current thread will not be frozen. The same process that calls | 113 | * The current thread will not be frozen. The same process that calls |
| @@ -142,7 +118,6 @@ static bool check_frozen_processes(void) | |||
| 142 | int freeze_processes(void) | 118 | int freeze_processes(void) |
| 143 | { | 119 | { |
| 144 | int error; | 120 | int error; |
| 145 | int oom_kills_saved; | ||
| 146 | 121 | ||
| 147 | error = __usermodehelper_disable(UMH_FREEZING); | 122 | error = __usermodehelper_disable(UMH_FREEZING); |
| 148 | if (error) | 123 | if (error) |
| @@ -155,31 +130,24 @@ int freeze_processes(void) | |||
| 155 | atomic_inc(&system_freezing_cnt); | 130 | atomic_inc(&system_freezing_cnt); |
| 156 | 131 | ||
| 157 | pm_wakeup_clear(); | 132 | pm_wakeup_clear(); |
| 158 | printk("Freezing user space processes ... "); | 133 | pr_info("Freezing user space processes ... "); |
| 159 | pm_freezing = true; | 134 | pm_freezing = true; |
| 160 | oom_kills_saved = oom_kills_count(); | ||
| 161 | error = try_to_freeze_tasks(true); | 135 | error = try_to_freeze_tasks(true); |
| 162 | if (!error) { | 136 | if (!error) { |
| 163 | __usermodehelper_set_disable_depth(UMH_DISABLED); | 137 | __usermodehelper_set_disable_depth(UMH_DISABLED); |
| 164 | oom_killer_disable(); | 138 | pr_cont("done."); |
| 165 | |||
| 166 | /* | ||
| 167 | * There might have been an OOM kill while we were | ||
| 168 | * freezing tasks and the killed task might be still | ||
| 169 | * on the way out so we have to double check for race. | ||
| 170 | */ | ||
| 171 | if (oom_kills_count() != oom_kills_saved && | ||
| 172 | !check_frozen_processes()) { | ||
| 173 | __usermodehelper_set_disable_depth(UMH_ENABLED); | ||
| 174 | printk("OOM in progress."); | ||
| 175 | error = -EBUSY; | ||
| 176 | } else { | ||
| 177 | printk("done."); | ||
| 178 | } | ||
| 179 | } | 139 | } |
| 180 | printk("\n"); | 140 | pr_cont("\n"); |
| 181 | BUG_ON(in_atomic()); | 141 | BUG_ON(in_atomic()); |
| 182 | 142 | ||
| 143 | /* | ||
| 144 | * Now that the whole userspace is frozen we need to disable | ||
| 145 | * the OOM killer to disallow any further interference with | ||
| 146 | * killable tasks. | ||
| 147 | */ | ||
| 148 | if (!error && !oom_killer_disable()) | ||
| 149 | error = -EBUSY; | ||
| 150 | |||
| 183 | if (error) | 151 | if (error) |
| 184 | thaw_processes(); | 152 | thaw_processes(); |
| 185 | return error; | 153 | return error; |
| @@ -197,13 +165,14 @@ int freeze_kernel_threads(void) | |||
| 197 | { | 165 | { |
| 198 | int error; | 166 | int error; |
| 199 | 167 | ||
| 200 | printk("Freezing remaining freezable tasks ... "); | 168 | pr_info("Freezing remaining freezable tasks ... "); |
| 169 | |||
| 201 | pm_nosig_freezing = true; | 170 | pm_nosig_freezing = true; |
| 202 | error = try_to_freeze_tasks(false); | 171 | error = try_to_freeze_tasks(false); |
| 203 | if (!error) | 172 | if (!error) |
| 204 | printk("done."); | 173 | pr_cont("done."); |
| 205 | 174 | ||
| 206 | printk("\n"); | 175 | pr_cont("\n"); |
| 207 | BUG_ON(in_atomic()); | 176 | BUG_ON(in_atomic()); |
| 208 | 177 | ||
| 209 | if (error) | 178 | if (error) |
| @@ -224,7 +193,7 @@ void thaw_processes(void) | |||
| 224 | 193 | ||
| 225 | oom_killer_enable(); | 194 | oom_killer_enable(); |
| 226 | 195 | ||
| 227 | printk("Restarting tasks ... "); | 196 | pr_info("Restarting tasks ... "); |
| 228 | 197 | ||
| 229 | __usermodehelper_set_disable_depth(UMH_FREEZING); | 198 | __usermodehelper_set_disable_depth(UMH_FREEZING); |
| 230 | thaw_workqueues(); | 199 | thaw_workqueues(); |
| @@ -243,7 +212,7 @@ void thaw_processes(void) | |||
| 243 | usermodehelper_enable(); | 212 | usermodehelper_enable(); |
| 244 | 213 | ||
| 245 | schedule(); | 214 | schedule(); |
| 246 | printk("done.\n"); | 215 | pr_cont("done.\n"); |
| 247 | trace_suspend_resume(TPS("thaw_processes"), 0, false); | 216 | trace_suspend_resume(TPS("thaw_processes"), 0, false); |
| 248 | } | 217 | } |
| 249 | 218 | ||
| @@ -252,7 +221,7 @@ void thaw_kernel_threads(void) | |||
| 252 | struct task_struct *g, *p; | 221 | struct task_struct *g, *p; |
| 253 | 222 | ||
| 254 | pm_nosig_freezing = false; | 223 | pm_nosig_freezing = false; |
| 255 | printk("Restarting kernel threads ... "); | 224 | pr_info("Restarting kernel threads ... "); |
| 256 | 225 | ||
| 257 | thaw_workqueues(); | 226 | thaw_workqueues(); |
| 258 | 227 | ||
| @@ -264,5 +233,5 @@ void thaw_kernel_threads(void) | |||
| 264 | read_unlock(&tasklist_lock); | 233 | read_unlock(&tasklist_lock); |
| 265 | 234 | ||
| 266 | schedule(); | 235 | schedule(); |
| 267 | printk("done.\n"); | 236 | pr_cont("done.\n"); |
| 268 | } | 237 | } |
diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 5f4c006c4b1e..97b0df71303e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c | |||
| @@ -41,6 +41,8 @@ | |||
| 41 | #include <linux/platform_device.h> | 41 | #include <linux/platform_device.h> |
| 42 | #include <linux/init.h> | 42 | #include <linux/init.h> |
| 43 | #include <linux/kernel.h> | 43 | #include <linux/kernel.h> |
| 44 | #include <linux/debugfs.h> | ||
| 45 | #include <linux/seq_file.h> | ||
| 44 | 46 | ||
| 45 | #include <linux/uaccess.h> | 47 | #include <linux/uaccess.h> |
| 46 | #include <linux/export.h> | 48 | #include <linux/export.h> |
| @@ -182,6 +184,81 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) | |||
| 182 | c->target_value = value; | 184 | c->target_value = value; |
| 183 | } | 185 | } |
| 184 | 186 | ||
| 187 | static inline int pm_qos_get_value(struct pm_qos_constraints *c); | ||
| 188 | static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused) | ||
| 189 | { | ||
| 190 | struct pm_qos_object *qos = (struct pm_qos_object *)s->private; | ||
| 191 | struct pm_qos_constraints *c; | ||
| 192 | struct pm_qos_request *req; | ||
| 193 | char *type; | ||
| 194 | unsigned long flags; | ||
| 195 | int tot_reqs = 0; | ||
| 196 | int active_reqs = 0; | ||
| 197 | |||
| 198 | if (IS_ERR_OR_NULL(qos)) { | ||
| 199 | pr_err("%s: bad qos param!\n", __func__); | ||
| 200 | return -EINVAL; | ||
| 201 | } | ||
| 202 | c = qos->constraints; | ||
| 203 | if (IS_ERR_OR_NULL(c)) { | ||
| 204 | pr_err("%s: Bad constraints on qos?\n", __func__); | ||
| 205 | return -EINVAL; | ||
| 206 | } | ||
| 207 | |||
| 208 | /* Lock to ensure we have a snapshot */ | ||
| 209 | spin_lock_irqsave(&pm_qos_lock, flags); | ||
| 210 | if (plist_head_empty(&c->list)) { | ||
| 211 | seq_puts(s, "Empty!\n"); | ||
| 212 | goto out; | ||
| 213 | } | ||
| 214 | |||
| 215 | switch (c->type) { | ||
| 216 | case PM_QOS_MIN: | ||
| 217 | type = "Minimum"; | ||
| 218 | break; | ||
| 219 | case PM_QOS_MAX: | ||
| 220 | type = "Maximum"; | ||
| 221 | break; | ||
| 222 | case PM_QOS_SUM: | ||
| 223 | type = "Sum"; | ||
| 224 | break; | ||
| 225 | default: | ||
| 226 | type = "Unknown"; | ||
| 227 | } | ||
| 228 | |||
| 229 | plist_for_each_entry(req, &c->list, node) { | ||
| 230 | char *state = "Default"; | ||
| 231 | |||
| 232 | if ((req->node).prio != c->default_value) { | ||
| 233 | active_reqs++; | ||
| 234 | state = "Active"; | ||
| 235 | } | ||
| 236 | tot_reqs++; | ||
| 237 | seq_printf(s, "%d: %d: %s\n", tot_reqs, | ||
| 238 | (req->node).prio, state); | ||
| 239 | } | ||
| 240 | |||
| 241 | seq_printf(s, "Type=%s, Value=%d, Requests: active=%d / total=%d\n", | ||
| 242 | type, pm_qos_get_value(c), active_reqs, tot_reqs); | ||
| 243 | |||
| 244 | out: | ||
| 245 | spin_unlock_irqrestore(&pm_qos_lock, flags); | ||
| 246 | return 0; | ||
| 247 | } | ||
| 248 | |||
| 249 | static int pm_qos_dbg_open(struct inode *inode, struct file *file) | ||
| 250 | { | ||
| 251 | return single_open(file, pm_qos_dbg_show_requests, | ||
| 252 | inode->i_private); | ||
| 253 | } | ||
| 254 | |||
| 255 | static const struct file_operations pm_qos_debug_fops = { | ||
| 256 | .open = pm_qos_dbg_open, | ||
| 257 | .read = seq_read, | ||
| 258 | .llseek = seq_lseek, | ||
| 259 | .release = single_release, | ||
| 260 | }; | ||
| 261 | |||
| 185 | /** | 262 | /** |
| 186 | * pm_qos_update_target - manages the constraints list and calls the notifiers | 263 | * pm_qos_update_target - manages the constraints list and calls the notifiers |
| 187 | * if needed | 264 | * if needed |
| @@ -509,12 +586,17 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) | |||
| 509 | EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); | 586 | EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); |
| 510 | 587 | ||
| 511 | /* User space interface to PM QoS classes via misc devices */ | 588 | /* User space interface to PM QoS classes via misc devices */ |
| 512 | static int register_pm_qos_misc(struct pm_qos_object *qos) | 589 | static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d) |
| 513 | { | 590 | { |
| 514 | qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; | 591 | qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; |
| 515 | qos->pm_qos_power_miscdev.name = qos->name; | 592 | qos->pm_qos_power_miscdev.name = qos->name; |
| 516 | qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; | 593 | qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; |
| 517 | 594 | ||
| 595 | if (d) { | ||
| 596 | (void)debugfs_create_file(qos->name, S_IRUGO, d, | ||
| 597 | (void *)qos, &pm_qos_debug_fops); | ||
| 598 | } | ||
| 599 | |||
| 518 | return misc_register(&qos->pm_qos_power_miscdev); | 600 | return misc_register(&qos->pm_qos_power_miscdev); |
| 519 | } | 601 | } |
| 520 | 602 | ||
| @@ -608,11 +690,16 @@ static int __init pm_qos_power_init(void) | |||
| 608 | { | 690 | { |
| 609 | int ret = 0; | 691 | int ret = 0; |
| 610 | int i; | 692 | int i; |
| 693 | struct dentry *d; | ||
| 611 | 694 | ||
| 612 | BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); | 695 | BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); |
| 613 | 696 | ||
| 697 | d = debugfs_create_dir("pm_qos", NULL); | ||
| 698 | if (IS_ERR_OR_NULL(d)) | ||
| 699 | d = NULL; | ||
| 700 | |||
| 614 | for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { | 701 | for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { |
| 615 | ret = register_pm_qos_misc(pm_qos_array[i]); | 702 | ret = register_pm_qos_misc(pm_qos_array[i], d); |
| 616 | if (ret < 0) { | 703 | if (ret < 0) { |
| 617 | printk(KERN_ERR "pm_qos_param: %s setup failed\n", | 704 | printk(KERN_ERR "pm_qos_param: %s setup failed\n", |
| 618 | pm_qos_array[i]->name); | 705 | pm_qos_array[i]->name); |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0c40c16174b4..c24d5a23bf93 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -1472,9 +1472,9 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, | |||
| 1472 | /** | 1472 | /** |
| 1473 | * free_unnecessary_pages - Release preallocated pages not needed for the image | 1473 | * free_unnecessary_pages - Release preallocated pages not needed for the image |
| 1474 | */ | 1474 | */ |
| 1475 | static void free_unnecessary_pages(void) | 1475 | static unsigned long free_unnecessary_pages(void) |
| 1476 | { | 1476 | { |
| 1477 | unsigned long save, to_free_normal, to_free_highmem; | 1477 | unsigned long save, to_free_normal, to_free_highmem, free; |
| 1478 | 1478 | ||
| 1479 | save = count_data_pages(); | 1479 | save = count_data_pages(); |
| 1480 | if (alloc_normal >= save) { | 1480 | if (alloc_normal >= save) { |
| @@ -1495,6 +1495,7 @@ static void free_unnecessary_pages(void) | |||
| 1495 | else | 1495 | else |
| 1496 | to_free_normal = 0; | 1496 | to_free_normal = 0; |
| 1497 | } | 1497 | } |
| 1498 | free = to_free_normal + to_free_highmem; | ||
| 1498 | 1499 | ||
| 1499 | memory_bm_position_reset(©_bm); | 1500 | memory_bm_position_reset(©_bm); |
| 1500 | 1501 | ||
| @@ -1518,6 +1519,8 @@ static void free_unnecessary_pages(void) | |||
| 1518 | swsusp_unset_page_free(page); | 1519 | swsusp_unset_page_free(page); |
| 1519 | __free_page(page); | 1520 | __free_page(page); |
| 1520 | } | 1521 | } |
| 1522 | |||
| 1523 | return free; | ||
| 1521 | } | 1524 | } |
| 1522 | 1525 | ||
| 1523 | /** | 1526 | /** |
| @@ -1707,7 +1710,7 @@ int hibernate_preallocate_memory(void) | |||
| 1707 | * pages in memory, but we have allocated more. Release the excessive | 1710 | * pages in memory, but we have allocated more. Release the excessive |
| 1708 | * ones now. | 1711 | * ones now. |
| 1709 | */ | 1712 | */ |
| 1710 | free_unnecessary_pages(); | 1713 | pages -= free_unnecessary_pages(); |
| 1711 | 1714 | ||
| 1712 | out: | 1715 | out: |
| 1713 | stop = ktime_get(); | 1716 | stop = ktime_get(); |
| @@ -2310,8 +2313,6 @@ static inline void free_highmem_data(void) | |||
| 2310 | free_image_page(buffer, PG_UNSAFE_CLEAR); | 2313 | free_image_page(buffer, PG_UNSAFE_CLEAR); |
| 2311 | } | 2314 | } |
| 2312 | #else | 2315 | #else |
| 2313 | static inline int get_safe_write_buffer(void) { return 0; } | ||
| 2314 | |||
| 2315 | static unsigned int | 2316 | static unsigned int |
| 2316 | count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } | 2317 | count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } |
| 2317 | 2318 | ||
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c347e3ce3a55..b7d6b3a721b1 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
| @@ -37,7 +37,9 @@ const char *pm_states[PM_SUSPEND_MAX]; | |||
| 37 | static const struct platform_suspend_ops *suspend_ops; | 37 | static const struct platform_suspend_ops *suspend_ops; |
| 38 | static const struct platform_freeze_ops *freeze_ops; | 38 | static const struct platform_freeze_ops *freeze_ops; |
| 39 | static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); | 39 | static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); |
| 40 | static bool suspend_freeze_wake; | 40 | |
| 41 | enum freeze_state __read_mostly suspend_freeze_state; | ||
| 42 | static DEFINE_SPINLOCK(suspend_freeze_lock); | ||
| 41 | 43 | ||
| 42 | void freeze_set_ops(const struct platform_freeze_ops *ops) | 44 | void freeze_set_ops(const struct platform_freeze_ops *ops) |
| 43 | { | 45 | { |
| @@ -48,22 +50,49 @@ void freeze_set_ops(const struct platform_freeze_ops *ops) | |||
| 48 | 50 | ||
| 49 | static void freeze_begin(void) | 51 | static void freeze_begin(void) |
| 50 | { | 52 | { |
| 51 | suspend_freeze_wake = false; | 53 | suspend_freeze_state = FREEZE_STATE_NONE; |
| 52 | } | 54 | } |
| 53 | 55 | ||
| 54 | static void freeze_enter(void) | 56 | static void freeze_enter(void) |
| 55 | { | 57 | { |
| 56 | cpuidle_use_deepest_state(true); | 58 | spin_lock_irq(&suspend_freeze_lock); |
| 59 | if (pm_wakeup_pending()) | ||
| 60 | goto out; | ||
| 61 | |||
| 62 | suspend_freeze_state = FREEZE_STATE_ENTER; | ||
| 63 | spin_unlock_irq(&suspend_freeze_lock); | ||
| 64 | |||
| 65 | get_online_cpus(); | ||
| 57 | cpuidle_resume(); | 66 | cpuidle_resume(); |
| 58 | wait_event(suspend_freeze_wait_head, suspend_freeze_wake); | 67 | |
| 68 | /* Push all the CPUs into the idle loop. */ | ||
| 69 | wake_up_all_idle_cpus(); | ||
| 70 | pr_debug("PM: suspend-to-idle\n"); | ||
| 71 | /* Make the current CPU wait so it can enter the idle loop too. */ | ||
| 72 | wait_event(suspend_freeze_wait_head, | ||
| 73 | suspend_freeze_state == FREEZE_STATE_WAKE); | ||
| 74 | pr_debug("PM: resume from suspend-to-idle\n"); | ||
| 75 | |||
| 59 | cpuidle_pause(); | 76 | cpuidle_pause(); |
| 60 | cpuidle_use_deepest_state(false); | 77 | put_online_cpus(); |
| 78 | |||
| 79 | spin_lock_irq(&suspend_freeze_lock); | ||
| 80 | |||
| 81 | out: | ||
| 82 | suspend_freeze_state = FREEZE_STATE_NONE; | ||
| 83 | spin_unlock_irq(&suspend_freeze_lock); | ||
| 61 | } | 84 | } |
| 62 | 85 | ||
| 63 | void freeze_wake(void) | 86 | void freeze_wake(void) |
| 64 | { | 87 | { |
| 65 | suspend_freeze_wake = true; | 88 | unsigned long flags; |
| 66 | wake_up(&suspend_freeze_wait_head); | 89 | |
| 90 | spin_lock_irqsave(&suspend_freeze_lock, flags); | ||
| 91 | if (suspend_freeze_state > FREEZE_STATE_NONE) { | ||
| 92 | suspend_freeze_state = FREEZE_STATE_WAKE; | ||
| 93 | wake_up(&suspend_freeze_wait_head); | ||
| 94 | } | ||
| 95 | spin_unlock_irqrestore(&suspend_freeze_lock, flags); | ||
| 67 | } | 96 | } |
| 68 | EXPORT_SYMBOL_GPL(freeze_wake); | 97 | EXPORT_SYMBOL_GPL(freeze_wake); |
| 69 | 98 | ||
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 02d6b6d28796..01cfd69c54c6 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
| @@ -935,8 +935,8 @@ static int __init ignore_loglevel_setup(char *str) | |||
| 935 | 935 | ||
| 936 | early_param("ignore_loglevel", ignore_loglevel_setup); | 936 | early_param("ignore_loglevel", ignore_loglevel_setup); |
| 937 | module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); | 937 | module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); |
| 938 | MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" | 938 | MODULE_PARM_DESC(ignore_loglevel, |
| 939 | "print all kernel messages to the console."); | 939 | "ignore loglevel setting (prints all kernel messages to the console)"); |
| 940 | 940 | ||
| 941 | #ifdef CONFIG_BOOT_PRINTK_DELAY | 941 | #ifdef CONFIG_BOOT_PRINTK_DELAY |
| 942 | 942 | ||
| @@ -1419,16 +1419,16 @@ static void call_console_drivers(int level, const char *text, size_t len) | |||
| 1419 | } | 1419 | } |
| 1420 | 1420 | ||
| 1421 | /* | 1421 | /* |
| 1422 | * Zap console related locks when oopsing. Only zap at most once | 1422 | * Zap console related locks when oopsing. |
| 1423 | * every 10 seconds, to leave time for slow consoles to print a | 1423 | * To leave time for slow consoles to print a full oops, |
| 1424 | * full oops. | 1424 | * only zap at most once every 30 seconds. |
| 1425 | */ | 1425 | */ |
| 1426 | static void zap_locks(void) | 1426 | static void zap_locks(void) |
| 1427 | { | 1427 | { |
| 1428 | static unsigned long oops_timestamp; | 1428 | static unsigned long oops_timestamp; |
| 1429 | 1429 | ||
| 1430 | if (time_after_eq(jiffies, oops_timestamp) && | 1430 | if (time_after_eq(jiffies, oops_timestamp) && |
| 1431 | !time_after(jiffies, oops_timestamp + 30 * HZ)) | 1431 | !time_after(jiffies, oops_timestamp + 30 * HZ)) |
| 1432 | return; | 1432 | return; |
| 1433 | 1433 | ||
| 1434 | oops_timestamp = jiffies; | 1434 | oops_timestamp = jiffies; |
| @@ -1811,7 +1811,7 @@ int vprintk_default(const char *fmt, va_list args) | |||
| 1811 | 1811 | ||
| 1812 | #ifdef CONFIG_KGDB_KDB | 1812 | #ifdef CONFIG_KGDB_KDB |
| 1813 | if (unlikely(kdb_trap_printk)) { | 1813 | if (unlikely(kdb_trap_printk)) { |
| 1814 | r = vkdb_printf(fmt, args); | 1814 | r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); |
| 1815 | return r; | 1815 | return r; |
| 1816 | } | 1816 | } |
| 1817 | #endif | 1817 | #endif |
diff --git a/kernel/profile.c b/kernel/profile.c index 54bf5ba26420..a7bcd28d6e9f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
| @@ -422,8 +422,7 @@ void profile_tick(int type) | |||
| 422 | 422 | ||
| 423 | static int prof_cpu_mask_proc_show(struct seq_file *m, void *v) | 423 | static int prof_cpu_mask_proc_show(struct seq_file *m, void *v) |
| 424 | { | 424 | { |
| 425 | seq_cpumask(m, prof_cpu_mask); | 425 | seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask)); |
| 426 | seq_putc(m, '\n'); | ||
| 427 | return 0; | 426 | return 0; |
| 428 | } | 427 | } |
| 429 | 428 | ||
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1eb9d90c3af9..227fec36b12a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
| @@ -1077,7 +1077,6 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, | |||
| 1077 | } | 1077 | } |
| 1078 | 1078 | ||
| 1079 | #if defined CONFIG_COMPAT | 1079 | #if defined CONFIG_COMPAT |
| 1080 | #include <linux/compat.h> | ||
| 1081 | 1080 | ||
| 1082 | int compat_ptrace_request(struct task_struct *child, compat_long_t request, | 1081 | int compat_ptrace_request(struct task_struct *child, compat_long_t request, |
| 1083 | compat_ulong_t addr, compat_ulong_t data) | 1082 | compat_ulong_t addr, compat_ulong_t data) |
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index e6fae503d1bc..50a808424b06 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile | |||
| @@ -1,4 +1,5 @@ | |||
| 1 | obj-y += update.o srcu.o | 1 | obj-y += update.o |
| 2 | obj-$(CONFIG_SRCU) += srcu.o | ||
| 2 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 3 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
| 3 | obj-$(CONFIG_TREE_RCU) += tree.o | 4 | obj-$(CONFIG_TREE_RCU) += tree.o |
| 4 | obj-$(CONFIG_PREEMPT_RCU) += tree.o | 5 | obj-$(CONFIG_PREEMPT_RCU) += tree.o |
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 07bb02eda844..80adef7d4c3d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h | |||
| @@ -137,4 +137,10 @@ int rcu_jiffies_till_stall_check(void); | |||
| 137 | 137 | ||
| 138 | void rcu_early_boot_tests(void); | 138 | void rcu_early_boot_tests(void); |
| 139 | 139 | ||
| 140 | /* | ||
| 141 | * This function really isn't for public consumption, but RCU is special in | ||
| 142 | * that context switches can allow the state machine to make progress. | ||
| 143 | */ | ||
| 144 | extern void resched_cpu(int cpu); | ||
| 145 | |||
| 140 | #endif /* __LINUX_RCU_H */ | 146 | #endif /* __LINUX_RCU_H */ |
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 4d559baf06e0..30d42aa55d83 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
| @@ -244,7 +244,8 @@ struct rcu_torture_ops { | |||
| 244 | int (*readlock)(void); | 244 | int (*readlock)(void); |
| 245 | void (*read_delay)(struct torture_random_state *rrsp); | 245 | void (*read_delay)(struct torture_random_state *rrsp); |
| 246 | void (*readunlock)(int idx); | 246 | void (*readunlock)(int idx); |
| 247 | int (*completed)(void); | 247 | unsigned long (*started)(void); |
| 248 | unsigned long (*completed)(void); | ||
| 248 | void (*deferred_free)(struct rcu_torture *p); | 249 | void (*deferred_free)(struct rcu_torture *p); |
| 249 | void (*sync)(void); | 250 | void (*sync)(void); |
| 250 | void (*exp_sync)(void); | 251 | void (*exp_sync)(void); |
| @@ -296,11 +297,6 @@ static void rcu_torture_read_unlock(int idx) __releases(RCU) | |||
| 296 | rcu_read_unlock(); | 297 | rcu_read_unlock(); |
| 297 | } | 298 | } |
| 298 | 299 | ||
| 299 | static int rcu_torture_completed(void) | ||
| 300 | { | ||
| 301 | return rcu_batches_completed(); | ||
| 302 | } | ||
| 303 | |||
| 304 | /* | 300 | /* |
| 305 | * Update callback in the pipe. This should be invoked after a grace period. | 301 | * Update callback in the pipe. This should be invoked after a grace period. |
| 306 | */ | 302 | */ |
| @@ -356,7 +352,7 @@ rcu_torture_cb(struct rcu_head *p) | |||
| 356 | cur_ops->deferred_free(rp); | 352 | cur_ops->deferred_free(rp); |
| 357 | } | 353 | } |
| 358 | 354 | ||
| 359 | static int rcu_no_completed(void) | 355 | static unsigned long rcu_no_completed(void) |
| 360 | { | 356 | { |
| 361 | return 0; | 357 | return 0; |
| 362 | } | 358 | } |
| @@ -377,7 +373,8 @@ static struct rcu_torture_ops rcu_ops = { | |||
| 377 | .readlock = rcu_torture_read_lock, | 373 | .readlock = rcu_torture_read_lock, |
| 378 | .read_delay = rcu_read_delay, | 374 | .read_delay = rcu_read_delay, |
| 379 | .readunlock = rcu_torture_read_unlock, | 375 | .readunlock = rcu_torture_read_unlock, |
| 380 | .completed = rcu_torture_completed, | 376 | .started = rcu_batches_started, |
| 377 | .completed = rcu_batches_completed, | ||
| 381 | .deferred_free = rcu_torture_deferred_free, | 378 | .deferred_free = rcu_torture_deferred_free, |
| 382 | .sync = synchronize_rcu, | 379 | .sync = synchronize_rcu, |
| 383 | .exp_sync = synchronize_rcu_expedited, | 380 | .exp_sync = synchronize_rcu_expedited, |
| @@ -407,11 +404,6 @@ static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) | |||
| 407 | rcu_read_unlock_bh(); | 404 | rcu_read_unlock_bh(); |
| 408 | } | 405 | } |
| 409 | 406 | ||
| 410 | static int rcu_bh_torture_completed(void) | ||
| 411 | { | ||
| 412 | return rcu_batches_completed_bh(); | ||
| 413 | } | ||
| 414 | |||
| 415 | static void rcu_bh_torture_deferred_free(struct rcu_torture *p) | 407 | static void rcu_bh_torture_deferred_free(struct rcu_torture *p) |
| 416 | { | 408 | { |
| 417 | call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); | 409 | call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); |
| @@ -423,7 +415,8 @@ static struct rcu_torture_ops rcu_bh_ops = { | |||
| 423 | .readlock = rcu_bh_torture_read_lock, | 415 | .readlock = rcu_bh_torture_read_lock, |
| 424 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 416 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
| 425 | .readunlock = rcu_bh_torture_read_unlock, | 417 | .readunlock = rcu_bh_torture_read_unlock, |
| 426 | .completed = rcu_bh_torture_completed, | 418 | .started = rcu_batches_started_bh, |
| 419 | .completed = rcu_batches_completed_bh, | ||
| 427 | .deferred_free = rcu_bh_torture_deferred_free, | 420 | .deferred_free = rcu_bh_torture_deferred_free, |
| 428 | .sync = synchronize_rcu_bh, | 421 | .sync = synchronize_rcu_bh, |
| 429 | .exp_sync = synchronize_rcu_bh_expedited, | 422 | .exp_sync = synchronize_rcu_bh_expedited, |
| @@ -466,6 +459,7 @@ static struct rcu_torture_ops rcu_busted_ops = { | |||
| 466 | .readlock = rcu_torture_read_lock, | 459 | .readlock = rcu_torture_read_lock, |
| 467 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 460 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
| 468 | .readunlock = rcu_torture_read_unlock, | 461 | .readunlock = rcu_torture_read_unlock, |
| 462 | .started = rcu_no_completed, | ||
| 469 | .completed = rcu_no_completed, | 463 | .completed = rcu_no_completed, |
| 470 | .deferred_free = rcu_busted_torture_deferred_free, | 464 | .deferred_free = rcu_busted_torture_deferred_free, |
| 471 | .sync = synchronize_rcu_busted, | 465 | .sync = synchronize_rcu_busted, |
| @@ -510,7 +504,7 @@ static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) | |||
| 510 | srcu_read_unlock(&srcu_ctl, idx); | 504 | srcu_read_unlock(&srcu_ctl, idx); |
| 511 | } | 505 | } |
| 512 | 506 | ||
| 513 | static int srcu_torture_completed(void) | 507 | static unsigned long srcu_torture_completed(void) |
| 514 | { | 508 | { |
| 515 | return srcu_batches_completed(&srcu_ctl); | 509 | return srcu_batches_completed(&srcu_ctl); |
| 516 | } | 510 | } |
| @@ -564,6 +558,7 @@ static struct rcu_torture_ops srcu_ops = { | |||
| 564 | .readlock = srcu_torture_read_lock, | 558 | .readlock = srcu_torture_read_lock, |
| 565 | .read_delay = srcu_read_delay, | 559 | .read_delay = srcu_read_delay, |
| 566 | .readunlock = srcu_torture_read_unlock, | 560 | .readunlock = srcu_torture_read_unlock, |
| 561 | .started = NULL, | ||
| 567 | .completed = srcu_torture_completed, | 562 | .completed = srcu_torture_completed, |
| 568 | .deferred_free = srcu_torture_deferred_free, | 563 | .deferred_free = srcu_torture_deferred_free, |
| 569 | .sync = srcu_torture_synchronize, | 564 | .sync = srcu_torture_synchronize, |
| @@ -600,7 +595,8 @@ static struct rcu_torture_ops sched_ops = { | |||
| 600 | .readlock = sched_torture_read_lock, | 595 | .readlock = sched_torture_read_lock, |
| 601 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 596 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
| 602 | .readunlock = sched_torture_read_unlock, | 597 | .readunlock = sched_torture_read_unlock, |
| 603 | .completed = rcu_no_completed, | 598 | .started = rcu_batches_started_sched, |
| 599 | .completed = rcu_batches_completed_sched, | ||
| 604 | .deferred_free = rcu_sched_torture_deferred_free, | 600 | .deferred_free = rcu_sched_torture_deferred_free, |
| 605 | .sync = synchronize_sched, | 601 | .sync = synchronize_sched, |
| 606 | .exp_sync = synchronize_sched_expedited, | 602 | .exp_sync = synchronize_sched_expedited, |
| @@ -638,6 +634,7 @@ static struct rcu_torture_ops tasks_ops = { | |||
| 638 | .readlock = tasks_torture_read_lock, | 634 | .readlock = tasks_torture_read_lock, |
| 639 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 635 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
| 640 | .readunlock = tasks_torture_read_unlock, | 636 | .readunlock = tasks_torture_read_unlock, |
| 637 | .started = rcu_no_completed, | ||
| 641 | .completed = rcu_no_completed, | 638 | .completed = rcu_no_completed, |
| 642 | .deferred_free = rcu_tasks_torture_deferred_free, | 639 | .deferred_free = rcu_tasks_torture_deferred_free, |
| 643 | .sync = synchronize_rcu_tasks, | 640 | .sync = synchronize_rcu_tasks, |
| @@ -1015,8 +1012,8 @@ static void rcutorture_trace_dump(void) | |||
| 1015 | static void rcu_torture_timer(unsigned long unused) | 1012 | static void rcu_torture_timer(unsigned long unused) |
| 1016 | { | 1013 | { |
| 1017 | int idx; | 1014 | int idx; |
| 1018 | int completed; | 1015 | unsigned long started; |
| 1019 | int completed_end; | 1016 | unsigned long completed; |
| 1020 | static DEFINE_TORTURE_RANDOM(rand); | 1017 | static DEFINE_TORTURE_RANDOM(rand); |
| 1021 | static DEFINE_SPINLOCK(rand_lock); | 1018 | static DEFINE_SPINLOCK(rand_lock); |
| 1022 | struct rcu_torture *p; | 1019 | struct rcu_torture *p; |
| @@ -1024,7 +1021,10 @@ static void rcu_torture_timer(unsigned long unused) | |||
| 1024 | unsigned long long ts; | 1021 | unsigned long long ts; |
| 1025 | 1022 | ||
| 1026 | idx = cur_ops->readlock(); | 1023 | idx = cur_ops->readlock(); |
| 1027 | completed = cur_ops->completed(); | 1024 | if (cur_ops->started) |
| 1025 | started = cur_ops->started(); | ||
| 1026 | else | ||
| 1027 | started = cur_ops->completed(); | ||
| 1028 | ts = rcu_trace_clock_local(); | 1028 | ts = rcu_trace_clock_local(); |
| 1029 | p = rcu_dereference_check(rcu_torture_current, | 1029 | p = rcu_dereference_check(rcu_torture_current, |
| 1030 | rcu_read_lock_bh_held() || | 1030 | rcu_read_lock_bh_held() || |
| @@ -1047,14 +1047,16 @@ static void rcu_torture_timer(unsigned long unused) | |||
| 1047 | /* Should not happen, but... */ | 1047 | /* Should not happen, but... */ |
| 1048 | pipe_count = RCU_TORTURE_PIPE_LEN; | 1048 | pipe_count = RCU_TORTURE_PIPE_LEN; |
| 1049 | } | 1049 | } |
| 1050 | completed_end = cur_ops->completed(); | 1050 | completed = cur_ops->completed(); |
| 1051 | if (pipe_count > 1) { | 1051 | if (pipe_count > 1) { |
| 1052 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, | 1052 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, |
| 1053 | completed, completed_end); | 1053 | started, completed); |
| 1054 | rcutorture_trace_dump(); | 1054 | rcutorture_trace_dump(); |
| 1055 | } | 1055 | } |
| 1056 | __this_cpu_inc(rcu_torture_count[pipe_count]); | 1056 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
| 1057 | completed = completed_end - completed; | 1057 | completed = completed - started; |
| 1058 | if (cur_ops->started) | ||
| 1059 | completed++; | ||
| 1058 | if (completed > RCU_TORTURE_PIPE_LEN) { | 1060 | if (completed > RCU_TORTURE_PIPE_LEN) { |
| 1059 | /* Should not happen, but... */ | 1061 | /* Should not happen, but... */ |
| 1060 | completed = RCU_TORTURE_PIPE_LEN; | 1062 | completed = RCU_TORTURE_PIPE_LEN; |
| @@ -1073,8 +1075,8 @@ static void rcu_torture_timer(unsigned long unused) | |||
| 1073 | static int | 1075 | static int |
| 1074 | rcu_torture_reader(void *arg) | 1076 | rcu_torture_reader(void *arg) |
| 1075 | { | 1077 | { |
| 1076 | int completed; | 1078 | unsigned long started; |
| 1077 | int completed_end; | 1079 | unsigned long completed; |
| 1078 | int idx; | 1080 | int idx; |
| 1079 | DEFINE_TORTURE_RANDOM(rand); | 1081 | DEFINE_TORTURE_RANDOM(rand); |
| 1080 | struct rcu_torture *p; | 1082 | struct rcu_torture *p; |
| @@ -1093,7 +1095,10 @@ rcu_torture_reader(void *arg) | |||
| 1093 | mod_timer(&t, jiffies + 1); | 1095 | mod_timer(&t, jiffies + 1); |
| 1094 | } | 1096 | } |
| 1095 | idx = cur_ops->readlock(); | 1097 | idx = cur_ops->readlock(); |
| 1096 | completed = cur_ops->completed(); | 1098 | if (cur_ops->started) |
| 1099 | started = cur_ops->started(); | ||
| 1100 | else | ||
| 1101 | started = cur_ops->completed(); | ||
| 1097 | ts = rcu_trace_clock_local(); | 1102 | ts = rcu_trace_clock_local(); |
| 1098 | p = rcu_dereference_check(rcu_torture_current, | 1103 | p = rcu_dereference_check(rcu_torture_current, |
| 1099 | rcu_read_lock_bh_held() || | 1104 | rcu_read_lock_bh_held() || |
| @@ -1114,14 +1119,16 @@ rcu_torture_reader(void *arg) | |||
| 1114 | /* Should not happen, but... */ | 1119 | /* Should not happen, but... */ |
| 1115 | pipe_count = RCU_TORTURE_PIPE_LEN; | 1120 | pipe_count = RCU_TORTURE_PIPE_LEN; |
| 1116 | } | 1121 | } |
| 1117 | completed_end = cur_ops->completed(); | 1122 | completed = cur_ops->completed(); |
| 1118 | if (pipe_count > 1) { | 1123 | if (pipe_count > 1) { |
| 1119 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, | 1124 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, |
| 1120 | ts, completed, completed_end); | 1125 | ts, started, completed); |
| 1121 | rcutorture_trace_dump(); | 1126 | rcutorture_trace_dump(); |
| 1122 | } | 1127 | } |
| 1123 | __this_cpu_inc(rcu_torture_count[pipe_count]); | 1128 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
| 1124 | completed = completed_end - completed; | 1129 | completed = completed - started; |
| 1130 | if (cur_ops->started) | ||
| 1131 | completed++; | ||
| 1125 | if (completed > RCU_TORTURE_PIPE_LEN) { | 1132 | if (completed > RCU_TORTURE_PIPE_LEN) { |
| 1126 | /* Should not happen, but... */ | 1133 | /* Should not happen, but... */ |
| 1127 | completed = RCU_TORTURE_PIPE_LEN; | 1134 | completed = RCU_TORTURE_PIPE_LEN; |
| @@ -1420,6 +1427,9 @@ static int rcu_torture_barrier(void *arg) | |||
| 1420 | cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */ | 1427 | cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */ |
| 1421 | if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { | 1428 | if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { |
| 1422 | n_rcu_torture_barrier_error++; | 1429 | n_rcu_torture_barrier_error++; |
| 1430 | pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n", | ||
| 1431 | atomic_read(&barrier_cbs_invoked), | ||
| 1432 | n_barrier_cbs); | ||
| 1423 | WARN_ON_ONCE(1); | 1433 | WARN_ON_ONCE(1); |
| 1424 | } | 1434 | } |
| 1425 | n_barrier_successes++; | 1435 | n_barrier_successes++; |
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index e037f3eb2f7b..445bf8ffe3fb 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c | |||
| @@ -546,7 +546,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); | |||
| 546 | * Report the number of batches, correlated with, but not necessarily | 546 | * Report the number of batches, correlated with, but not necessarily |
| 547 | * precisely the same as, the number of grace periods that have elapsed. | 547 | * precisely the same as, the number of grace periods that have elapsed. |
| 548 | */ | 548 | */ |
| 549 | long srcu_batches_completed(struct srcu_struct *sp) | 549 | unsigned long srcu_batches_completed(struct srcu_struct *sp) |
| 550 | { | 550 | { |
| 551 | return sp->completed; | 551 | return sp->completed; |
| 552 | } | 552 | } |
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 0db5649f8817..cc9ceca7bde1 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c | |||
| @@ -47,54 +47,14 @@ static void __call_rcu(struct rcu_head *head, | |||
| 47 | void (*func)(struct rcu_head *rcu), | 47 | void (*func)(struct rcu_head *rcu), |
| 48 | struct rcu_ctrlblk *rcp); | 48 | struct rcu_ctrlblk *rcp); |
| 49 | 49 | ||
| 50 | static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
| 51 | |||
| 52 | #include "tiny_plugin.h" | 50 | #include "tiny_plugin.h" |
| 53 | 51 | ||
| 54 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */ | ||
| 55 | static void rcu_idle_enter_common(long long newval) | ||
| 56 | { | ||
| 57 | if (newval) { | ||
| 58 | RCU_TRACE(trace_rcu_dyntick(TPS("--="), | ||
| 59 | rcu_dynticks_nesting, newval)); | ||
| 60 | rcu_dynticks_nesting = newval; | ||
| 61 | return; | ||
| 62 | } | ||
| 63 | RCU_TRACE(trace_rcu_dyntick(TPS("Start"), | ||
| 64 | rcu_dynticks_nesting, newval)); | ||
| 65 | if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) { | ||
| 66 | struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); | ||
| 67 | |||
| 68 | RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), | ||
| 69 | rcu_dynticks_nesting, newval)); | ||
| 70 | ftrace_dump(DUMP_ALL); | ||
| 71 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | ||
| 72 | current->pid, current->comm, | ||
| 73 | idle->pid, idle->comm); /* must be idle task! */ | ||
| 74 | } | ||
| 75 | rcu_sched_qs(); /* implies rcu_bh_inc() */ | ||
| 76 | barrier(); | ||
| 77 | rcu_dynticks_nesting = newval; | ||
| 78 | } | ||
| 79 | |||
| 80 | /* | 52 | /* |
| 81 | * Enter idle, which is an extended quiescent state if we have fully | 53 | * Enter idle, which is an extended quiescent state if we have fully |
| 82 | * entered that mode (i.e., if the new value of dynticks_nesting is zero). | 54 | * entered that mode. |
| 83 | */ | 55 | */ |
| 84 | void rcu_idle_enter(void) | 56 | void rcu_idle_enter(void) |
| 85 | { | 57 | { |
| 86 | unsigned long flags; | ||
| 87 | long long newval; | ||
| 88 | |||
| 89 | local_irq_save(flags); | ||
| 90 | WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); | ||
| 91 | if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == | ||
| 92 | DYNTICK_TASK_NEST_VALUE) | ||
| 93 | newval = 0; | ||
| 94 | else | ||
| 95 | newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE; | ||
| 96 | rcu_idle_enter_common(newval); | ||
| 97 | local_irq_restore(flags); | ||
| 98 | } | 58 | } |
| 99 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | 59 | EXPORT_SYMBOL_GPL(rcu_idle_enter); |
| 100 | 60 | ||
| @@ -103,55 +63,14 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); | |||
| 103 | */ | 63 | */ |
| 104 | void rcu_irq_exit(void) | 64 | void rcu_irq_exit(void) |
| 105 | { | 65 | { |
| 106 | unsigned long flags; | ||
| 107 | long long newval; | ||
| 108 | |||
| 109 | local_irq_save(flags); | ||
| 110 | newval = rcu_dynticks_nesting - 1; | ||
| 111 | WARN_ON_ONCE(newval < 0); | ||
| 112 | rcu_idle_enter_common(newval); | ||
| 113 | local_irq_restore(flags); | ||
| 114 | } | 66 | } |
| 115 | EXPORT_SYMBOL_GPL(rcu_irq_exit); | 67 | EXPORT_SYMBOL_GPL(rcu_irq_exit); |
| 116 | 68 | ||
| 117 | /* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */ | ||
| 118 | static void rcu_idle_exit_common(long long oldval) | ||
| 119 | { | ||
| 120 | if (oldval) { | ||
| 121 | RCU_TRACE(trace_rcu_dyntick(TPS("++="), | ||
| 122 | oldval, rcu_dynticks_nesting)); | ||
| 123 | return; | ||
| 124 | } | ||
| 125 | RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); | ||
| 126 | if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) { | ||
| 127 | struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); | ||
| 128 | |||
| 129 | RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), | ||
| 130 | oldval, rcu_dynticks_nesting)); | ||
| 131 | ftrace_dump(DUMP_ALL); | ||
| 132 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | ||
| 133 | current->pid, current->comm, | ||
| 134 | idle->pid, idle->comm); /* must be idle task! */ | ||
| 135 | } | ||
| 136 | } | ||
| 137 | |||
| 138 | /* | 69 | /* |
| 139 | * Exit idle, so that we are no longer in an extended quiescent state. | 70 | * Exit idle, so that we are no longer in an extended quiescent state. |
| 140 | */ | 71 | */ |
| 141 | void rcu_idle_exit(void) | 72 | void rcu_idle_exit(void) |
| 142 | { | 73 | { |
| 143 | unsigned long flags; | ||
| 144 | long long oldval; | ||
| 145 | |||
| 146 | local_irq_save(flags); | ||
| 147 | oldval = rcu_dynticks_nesting; | ||
| 148 | WARN_ON_ONCE(rcu_dynticks_nesting < 0); | ||
| 149 | if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) | ||
| 150 | rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE; | ||
| 151 | else | ||
| 152 | rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
| 153 | rcu_idle_exit_common(oldval); | ||
| 154 | local_irq_restore(flags); | ||
| 155 | } | 74 | } |
| 156 | EXPORT_SYMBOL_GPL(rcu_idle_exit); | 75 | EXPORT_SYMBOL_GPL(rcu_idle_exit); |
| 157 | 76 | ||
| @@ -160,15 +79,6 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit); | |||
| 160 | */ | 79 | */ |
| 161 | void rcu_irq_enter(void) | 80 | void rcu_irq_enter(void) |
| 162 | { | 81 | { |
| 163 | unsigned long flags; | ||
| 164 | long long oldval; | ||
| 165 | |||
| 166 | local_irq_save(flags); | ||
| 167 | oldval = rcu_dynticks_nesting; | ||
| 168 | rcu_dynticks_nesting++; | ||
| 169 | WARN_ON_ONCE(rcu_dynticks_nesting == 0); | ||
| 170 | rcu_idle_exit_common(oldval); | ||
| 171 | local_irq_restore(flags); | ||
| 172 | } | 82 | } |
| 173 | EXPORT_SYMBOL_GPL(rcu_irq_enter); | 83 | EXPORT_SYMBOL_GPL(rcu_irq_enter); |
| 174 | 84 | ||
| @@ -179,23 +89,13 @@ EXPORT_SYMBOL_GPL(rcu_irq_enter); | |||
| 179 | */ | 89 | */ |
| 180 | bool notrace __rcu_is_watching(void) | 90 | bool notrace __rcu_is_watching(void) |
| 181 | { | 91 | { |
| 182 | return rcu_dynticks_nesting; | 92 | return true; |
| 183 | } | 93 | } |
| 184 | EXPORT_SYMBOL(__rcu_is_watching); | 94 | EXPORT_SYMBOL(__rcu_is_watching); |
| 185 | 95 | ||
| 186 | #endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ | 96 | #endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ |
| 187 | 97 | ||
| 188 | /* | 98 | /* |
| 189 | * Test whether the current CPU was interrupted from idle. Nested | ||
| 190 | * interrupts don't count, we must be running at the first interrupt | ||
| 191 | * level. | ||
| 192 | */ | ||
| 193 | static int rcu_is_cpu_rrupt_from_idle(void) | ||
| 194 | { | ||
| 195 | return rcu_dynticks_nesting <= 1; | ||
| 196 | } | ||
| 197 | |||
| 198 | /* | ||
| 199 | * Helper function for rcu_sched_qs() and rcu_bh_qs(). | 99 | * Helper function for rcu_sched_qs() and rcu_bh_qs(). |
| 200 | * Also irqs are disabled to avoid confusion due to interrupt handlers | 100 | * Also irqs are disabled to avoid confusion due to interrupt handlers |
| 201 | * invoking call_rcu(). | 101 | * invoking call_rcu(). |
| @@ -250,7 +150,7 @@ void rcu_bh_qs(void) | |||
| 250 | void rcu_check_callbacks(int user) | 150 | void rcu_check_callbacks(int user) |
| 251 | { | 151 | { |
| 252 | RCU_TRACE(check_cpu_stalls()); | 152 | RCU_TRACE(check_cpu_stalls()); |
| 253 | if (user || rcu_is_cpu_rrupt_from_idle()) | 153 | if (user) |
| 254 | rcu_sched_qs(); | 154 | rcu_sched_qs(); |
| 255 | else if (!in_softirq()) | 155 | else if (!in_softirq()) |
| 256 | rcu_bh_qs(); | 156 | rcu_bh_qs(); |
| @@ -357,6 +257,11 @@ static void __call_rcu(struct rcu_head *head, | |||
| 357 | rcp->curtail = &head->next; | 257 | rcp->curtail = &head->next; |
| 358 | RCU_TRACE(rcp->qlen++); | 258 | RCU_TRACE(rcp->qlen++); |
| 359 | local_irq_restore(flags); | 259 | local_irq_restore(flags); |
| 260 | |||
| 261 | if (unlikely(is_idle_task(current))) { | ||
| 262 | /* force scheduling for rcu_sched_qs() */ | ||
| 263 | resched_cpu(0); | ||
| 264 | } | ||
| 360 | } | 265 | } |
| 361 | 266 | ||
| 362 | /* | 267 | /* |
| @@ -383,6 +288,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); | |||
| 383 | void __init rcu_init(void) | 288 | void __init rcu_init(void) |
| 384 | { | 289 | { |
| 385 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 290 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
| 291 | RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk)); | ||
| 292 | RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk)); | ||
| 386 | 293 | ||
| 387 | rcu_early_boot_tests(); | 294 | rcu_early_boot_tests(); |
| 388 | } | 295 | } |
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 858c56569127..f94e209a10d6 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h | |||
| @@ -145,17 +145,16 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) | |||
| 145 | rcp->ticks_this_gp++; | 145 | rcp->ticks_this_gp++; |
| 146 | j = jiffies; | 146 | j = jiffies; |
| 147 | js = ACCESS_ONCE(rcp->jiffies_stall); | 147 | js = ACCESS_ONCE(rcp->jiffies_stall); |
| 148 | if (*rcp->curtail && ULONG_CMP_GE(j, js)) { | 148 | if (rcp->rcucblist && ULONG_CMP_GE(j, js)) { |
| 149 | pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", | 149 | pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", |
| 150 | rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, | 150 | rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE, |
| 151 | jiffies - rcp->gp_start, rcp->qlen); | 151 | jiffies - rcp->gp_start, rcp->qlen); |
| 152 | dump_stack(); | 152 | dump_stack(); |
| 153 | } | ||
| 154 | if (*rcp->curtail && ULONG_CMP_GE(j, js)) | ||
| 155 | ACCESS_ONCE(rcp->jiffies_stall) = jiffies + | 153 | ACCESS_ONCE(rcp->jiffies_stall) = jiffies + |
| 156 | 3 * rcu_jiffies_till_stall_check() + 3; | 154 | 3 * rcu_jiffies_till_stall_check() + 3; |
| 157 | else if (ULONG_CMP_GE(j, js)) | 155 | } else if (ULONG_CMP_GE(j, js)) { |
| 158 | ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); | 156 | ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); |
| 157 | } | ||
| 159 | } | 158 | } |
| 160 | 159 | ||
| 161 | static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) | 160 | static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) |
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7680fc275036..48d640ca1a05 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
| @@ -156,6 +156,10 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | |||
| 156 | static void invoke_rcu_core(void); | 156 | static void invoke_rcu_core(void); |
| 157 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | 157 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); |
| 158 | 158 | ||
| 159 | /* rcuc/rcub kthread realtime priority */ | ||
| 160 | static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; | ||
| 161 | module_param(kthread_prio, int, 0644); | ||
| 162 | |||
| 159 | /* | 163 | /* |
| 160 | * Track the rcutorture test sequence number and the update version | 164 | * Track the rcutorture test sequence number and the update version |
| 161 | * number within a given test. The rcutorture_testseq is incremented | 165 | * number within a given test. The rcutorture_testseq is incremented |
| @@ -215,6 +219,9 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | |||
| 215 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | 219 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ |
| 216 | }; | 220 | }; |
| 217 | 221 | ||
| 222 | DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); | ||
| 223 | EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); | ||
| 224 | |||
| 218 | /* | 225 | /* |
| 219 | * Let the RCU core know that this CPU has gone through the scheduler, | 226 | * Let the RCU core know that this CPU has gone through the scheduler, |
| 220 | * which is a quiescent state. This is called when the need for a | 227 | * which is a quiescent state. This is called when the need for a |
| @@ -284,6 +291,22 @@ void rcu_note_context_switch(void) | |||
| 284 | } | 291 | } |
| 285 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 292 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
| 286 | 293 | ||
| 294 | /* | ||
| 295 | * Register a quiesecent state for all RCU flavors. If there is an | ||
| 296 | * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight | ||
| 297 | * dyntick-idle quiescent state visible to other CPUs (but only for those | ||
| 298 | * RCU flavors in desparate need of a quiescent state, which will normally | ||
| 299 | * be none of them). Either way, do a lightweight quiescent state for | ||
| 300 | * all RCU flavors. | ||
| 301 | */ | ||
| 302 | void rcu_all_qs(void) | ||
| 303 | { | ||
| 304 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) | ||
| 305 | rcu_momentary_dyntick_idle(); | ||
| 306 | this_cpu_inc(rcu_qs_ctr); | ||
| 307 | } | ||
| 308 | EXPORT_SYMBOL_GPL(rcu_all_qs); | ||
| 309 | |||
| 287 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ | 310 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ |
| 288 | static long qhimark = 10000; /* If this many pending, ignore blimit. */ | 311 | static long qhimark = 10000; /* If this many pending, ignore blimit. */ |
| 289 | static long qlowmark = 100; /* Once only this many pending, use blimit. */ | 312 | static long qlowmark = 100; /* Once only this many pending, use blimit. */ |
| @@ -315,18 +338,54 @@ static void force_quiescent_state(struct rcu_state *rsp); | |||
| 315 | static int rcu_pending(void); | 338 | static int rcu_pending(void); |
| 316 | 339 | ||
| 317 | /* | 340 | /* |
| 318 | * Return the number of RCU-sched batches processed thus far for debug & stats. | 341 | * Return the number of RCU batches started thus far for debug & stats. |
| 342 | */ | ||
| 343 | unsigned long rcu_batches_started(void) | ||
| 344 | { | ||
| 345 | return rcu_state_p->gpnum; | ||
| 346 | } | ||
| 347 | EXPORT_SYMBOL_GPL(rcu_batches_started); | ||
| 348 | |||
| 349 | /* | ||
| 350 | * Return the number of RCU-sched batches started thus far for debug & stats. | ||
| 351 | */ | ||
| 352 | unsigned long rcu_batches_started_sched(void) | ||
| 353 | { | ||
| 354 | return rcu_sched_state.gpnum; | ||
| 355 | } | ||
| 356 | EXPORT_SYMBOL_GPL(rcu_batches_started_sched); | ||
| 357 | |||
| 358 | /* | ||
| 359 | * Return the number of RCU BH batches started thus far for debug & stats. | ||
| 319 | */ | 360 | */ |
| 320 | long rcu_batches_completed_sched(void) | 361 | unsigned long rcu_batches_started_bh(void) |
| 362 | { | ||
| 363 | return rcu_bh_state.gpnum; | ||
| 364 | } | ||
| 365 | EXPORT_SYMBOL_GPL(rcu_batches_started_bh); | ||
| 366 | |||
| 367 | /* | ||
| 368 | * Return the number of RCU batches completed thus far for debug & stats. | ||
| 369 | */ | ||
| 370 | unsigned long rcu_batches_completed(void) | ||
| 371 | { | ||
| 372 | return rcu_state_p->completed; | ||
| 373 | } | ||
| 374 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
| 375 | |||
| 376 | /* | ||
| 377 | * Return the number of RCU-sched batches completed thus far for debug & stats. | ||
| 378 | */ | ||
| 379 | unsigned long rcu_batches_completed_sched(void) | ||
| 321 | { | 380 | { |
| 322 | return rcu_sched_state.completed; | 381 | return rcu_sched_state.completed; |
| 323 | } | 382 | } |
| 324 | EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); | 383 | EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); |
| 325 | 384 | ||
| 326 | /* | 385 | /* |
| 327 | * Return the number of RCU BH batches processed thus far for debug & stats. | 386 | * Return the number of RCU BH batches completed thus far for debug & stats. |
| 328 | */ | 387 | */ |
| 329 | long rcu_batches_completed_bh(void) | 388 | unsigned long rcu_batches_completed_bh(void) |
| 330 | { | 389 | { |
| 331 | return rcu_bh_state.completed; | 390 | return rcu_bh_state.completed; |
| 332 | } | 391 | } |
| @@ -759,39 +818,71 @@ void rcu_irq_enter(void) | |||
| 759 | /** | 818 | /** |
| 760 | * rcu_nmi_enter - inform RCU of entry to NMI context | 819 | * rcu_nmi_enter - inform RCU of entry to NMI context |
| 761 | * | 820 | * |
| 762 | * If the CPU was idle with dynamic ticks active, and there is no | 821 | * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and |
| 763 | * irq handler running, this updates rdtp->dynticks_nmi to let the | 822 | * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know |
| 764 | * RCU grace-period handling know that the CPU is active. | 823 | * that the CPU is active. This implementation permits nested NMIs, as |
| 824 | * long as the nesting level does not overflow an int. (You will probably | ||
| 825 | * run out of stack space first.) | ||
| 765 | */ | 826 | */ |
| 766 | void rcu_nmi_enter(void) | 827 | void rcu_nmi_enter(void) |
| 767 | { | 828 | { |
| 768 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 829 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
| 830 | int incby = 2; | ||
| 769 | 831 | ||
| 770 | if (rdtp->dynticks_nmi_nesting == 0 && | 832 | /* Complain about underflow. */ |
| 771 | (atomic_read(&rdtp->dynticks) & 0x1)) | 833 | WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0); |
| 772 | return; | 834 | |
| 773 | rdtp->dynticks_nmi_nesting++; | 835 | /* |
| 774 | smp_mb__before_atomic(); /* Force delay from prior write. */ | 836 | * If idle from RCU viewpoint, atomically increment ->dynticks |
| 775 | atomic_inc(&rdtp->dynticks); | 837 | * to mark non-idle and increment ->dynticks_nmi_nesting by one. |
| 776 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | 838 | * Otherwise, increment ->dynticks_nmi_nesting by two. This means |
| 777 | smp_mb__after_atomic(); /* See above. */ | 839 | * if ->dynticks_nmi_nesting is equal to one, we are guaranteed |
| 778 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | 840 | * to be in the outermost NMI handler that interrupted an RCU-idle |
| 841 | * period (observation due to Andy Lutomirski). | ||
| 842 | */ | ||
| 843 | if (!(atomic_read(&rdtp->dynticks) & 0x1)) { | ||
| 844 | smp_mb__before_atomic(); /* Force delay from prior write. */ | ||
| 845 | atomic_inc(&rdtp->dynticks); | ||
| 846 | /* atomic_inc() before later RCU read-side crit sects */ | ||
| 847 | smp_mb__after_atomic(); /* See above. */ | ||
| 848 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | ||
| 849 | incby = 1; | ||
| 850 | } | ||
| 851 | rdtp->dynticks_nmi_nesting += incby; | ||
| 852 | barrier(); | ||
| 779 | } | 853 | } |
| 780 | 854 | ||
| 781 | /** | 855 | /** |
| 782 | * rcu_nmi_exit - inform RCU of exit from NMI context | 856 | * rcu_nmi_exit - inform RCU of exit from NMI context |
| 783 | * | 857 | * |
| 784 | * If the CPU was idle with dynamic ticks active, and there is no | 858 | * If we are returning from the outermost NMI handler that interrupted an |
| 785 | * irq handler running, this updates rdtp->dynticks_nmi to let the | 859 | * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting |
| 786 | * RCU grace-period handling know that the CPU is no longer active. | 860 | * to let the RCU grace-period handling know that the CPU is back to |
| 861 | * being RCU-idle. | ||
| 787 | */ | 862 | */ |
| 788 | void rcu_nmi_exit(void) | 863 | void rcu_nmi_exit(void) |
| 789 | { | 864 | { |
| 790 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 865 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
| 791 | 866 | ||
| 792 | if (rdtp->dynticks_nmi_nesting == 0 || | 867 | /* |
| 793 | --rdtp->dynticks_nmi_nesting != 0) | 868 | * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. |
| 869 | * (We are exiting an NMI handler, so RCU better be paying attention | ||
| 870 | * to us!) | ||
| 871 | */ | ||
| 872 | WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); | ||
| 873 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | ||
| 874 | |||
| 875 | /* | ||
| 876 | * If the nesting level is not 1, the CPU wasn't RCU-idle, so | ||
| 877 | * leave it in non-RCU-idle state. | ||
| 878 | */ | ||
| 879 | if (rdtp->dynticks_nmi_nesting != 1) { | ||
| 880 | rdtp->dynticks_nmi_nesting -= 2; | ||
| 794 | return; | 881 | return; |
| 882 | } | ||
| 883 | |||
| 884 | /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ | ||
| 885 | rdtp->dynticks_nmi_nesting = 0; | ||
| 795 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | 886 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ |
| 796 | smp_mb__before_atomic(); /* See above. */ | 887 | smp_mb__before_atomic(); /* See above. */ |
| 797 | atomic_inc(&rdtp->dynticks); | 888 | atomic_inc(&rdtp->dynticks); |
| @@ -898,17 +989,14 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, | |||
| 898 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); | 989 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); |
| 899 | return 1; | 990 | return 1; |
| 900 | } else { | 991 | } else { |
| 992 | if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4, | ||
| 993 | rdp->mynode->gpnum)) | ||
| 994 | ACCESS_ONCE(rdp->gpwrap) = true; | ||
| 901 | return 0; | 995 | return 0; |
| 902 | } | 996 | } |
| 903 | } | 997 | } |
| 904 | 998 | ||
| 905 | /* | 999 | /* |
| 906 | * This function really isn't for public consumption, but RCU is special in | ||
| 907 | * that context switches can allow the state machine to make progress. | ||
| 908 | */ | ||
| 909 | extern void resched_cpu(int cpu); | ||
| 910 | |||
| 911 | /* | ||
| 912 | * Return true if the specified CPU has passed through a quiescent | 1000 | * Return true if the specified CPU has passed through a quiescent |
| 913 | * state by virtue of being in or having passed through an dynticks | 1001 | * state by virtue of being in or having passed through an dynticks |
| 914 | * idle state since the last call to dyntick_save_progress_counter() | 1002 | * idle state since the last call to dyntick_save_progress_counter() |
| @@ -1011,6 +1099,22 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) | |||
| 1011 | j1 = rcu_jiffies_till_stall_check(); | 1099 | j1 = rcu_jiffies_till_stall_check(); |
| 1012 | ACCESS_ONCE(rsp->jiffies_stall) = j + j1; | 1100 | ACCESS_ONCE(rsp->jiffies_stall) = j + j1; |
| 1013 | rsp->jiffies_resched = j + j1 / 2; | 1101 | rsp->jiffies_resched = j + j1 / 2; |
| 1102 | rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs); | ||
| 1103 | } | ||
| 1104 | |||
| 1105 | /* | ||
| 1106 | * Complain about starvation of grace-period kthread. | ||
| 1107 | */ | ||
| 1108 | static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) | ||
| 1109 | { | ||
| 1110 | unsigned long gpa; | ||
| 1111 | unsigned long j; | ||
| 1112 | |||
| 1113 | j = jiffies; | ||
| 1114 | gpa = ACCESS_ONCE(rsp->gp_activity); | ||
| 1115 | if (j - gpa > 2 * HZ) | ||
| 1116 | pr_err("%s kthread starved for %ld jiffies!\n", | ||
| 1117 | rsp->name, j - gpa); | ||
| 1014 | } | 1118 | } |
| 1015 | 1119 | ||
| 1016 | /* | 1120 | /* |
| @@ -1033,11 +1137,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) | |||
| 1033 | } | 1137 | } |
| 1034 | } | 1138 | } |
| 1035 | 1139 | ||
| 1036 | static void print_other_cpu_stall(struct rcu_state *rsp) | 1140 | static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) |
| 1037 | { | 1141 | { |
| 1038 | int cpu; | 1142 | int cpu; |
| 1039 | long delta; | 1143 | long delta; |
| 1040 | unsigned long flags; | 1144 | unsigned long flags; |
| 1145 | unsigned long gpa; | ||
| 1146 | unsigned long j; | ||
| 1041 | int ndetected = 0; | 1147 | int ndetected = 0; |
| 1042 | struct rcu_node *rnp = rcu_get_root(rsp); | 1148 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 1043 | long totqlen = 0; | 1149 | long totqlen = 0; |
| @@ -1075,30 +1181,34 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
| 1075 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1181 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 1076 | } | 1182 | } |
| 1077 | 1183 | ||
| 1078 | /* | ||
| 1079 | * Now rat on any tasks that got kicked up to the root rcu_node | ||
| 1080 | * due to CPU offlining. | ||
| 1081 | */ | ||
| 1082 | rnp = rcu_get_root(rsp); | ||
| 1083 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 1084 | ndetected += rcu_print_task_stall(rnp); | ||
| 1085 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1086 | |||
| 1087 | print_cpu_stall_info_end(); | 1184 | print_cpu_stall_info_end(); |
| 1088 | for_each_possible_cpu(cpu) | 1185 | for_each_possible_cpu(cpu) |
| 1089 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; | 1186 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; |
| 1090 | pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", | 1187 | pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", |
| 1091 | smp_processor_id(), (long)(jiffies - rsp->gp_start), | 1188 | smp_processor_id(), (long)(jiffies - rsp->gp_start), |
| 1092 | (long)rsp->gpnum, (long)rsp->completed, totqlen); | 1189 | (long)rsp->gpnum, (long)rsp->completed, totqlen); |
| 1093 | if (ndetected == 0) | 1190 | if (ndetected) { |
| 1094 | pr_err("INFO: Stall ended before state dump start\n"); | ||
| 1095 | else | ||
| 1096 | rcu_dump_cpu_stacks(rsp); | 1191 | rcu_dump_cpu_stacks(rsp); |
| 1192 | } else { | ||
| 1193 | if (ACCESS_ONCE(rsp->gpnum) != gpnum || | ||
| 1194 | ACCESS_ONCE(rsp->completed) == gpnum) { | ||
| 1195 | pr_err("INFO: Stall ended before state dump start\n"); | ||
| 1196 | } else { | ||
| 1197 | j = jiffies; | ||
| 1198 | gpa = ACCESS_ONCE(rsp->gp_activity); | ||
| 1199 | pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n", | ||
| 1200 | rsp->name, j - gpa, j, gpa, | ||
| 1201 | jiffies_till_next_fqs); | ||
| 1202 | /* In this case, the current CPU might be at fault. */ | ||
| 1203 | sched_show_task(current); | ||
| 1204 | } | ||
| 1205 | } | ||
| 1097 | 1206 | ||
| 1098 | /* Complain about tasks blocking the grace period. */ | 1207 | /* Complain about tasks blocking the grace period. */ |
| 1099 | |||
| 1100 | rcu_print_detail_task_stall(rsp); | 1208 | rcu_print_detail_task_stall(rsp); |
| 1101 | 1209 | ||
| 1210 | rcu_check_gp_kthread_starvation(rsp); | ||
| 1211 | |||
| 1102 | force_quiescent_state(rsp); /* Kick them all. */ | 1212 | force_quiescent_state(rsp); /* Kick them all. */ |
| 1103 | } | 1213 | } |
| 1104 | 1214 | ||
| @@ -1123,6 +1233,9 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
| 1123 | pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", | 1233 | pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", |
| 1124 | jiffies - rsp->gp_start, | 1234 | jiffies - rsp->gp_start, |
| 1125 | (long)rsp->gpnum, (long)rsp->completed, totqlen); | 1235 | (long)rsp->gpnum, (long)rsp->completed, totqlen); |
| 1236 | |||
| 1237 | rcu_check_gp_kthread_starvation(rsp); | ||
| 1238 | |||
| 1126 | rcu_dump_cpu_stacks(rsp); | 1239 | rcu_dump_cpu_stacks(rsp); |
| 1127 | 1240 | ||
| 1128 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1241 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| @@ -1193,7 +1306,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1193 | ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { | 1306 | ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { |
| 1194 | 1307 | ||
| 1195 | /* They had a few time units to dump stack, so complain. */ | 1308 | /* They had a few time units to dump stack, so complain. */ |
| 1196 | print_other_cpu_stall(rsp); | 1309 | print_other_cpu_stall(rsp, gpnum); |
| 1197 | } | 1310 | } |
| 1198 | } | 1311 | } |
| 1199 | 1312 | ||
| @@ -1530,7 +1643,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1530 | bool ret; | 1643 | bool ret; |
| 1531 | 1644 | ||
| 1532 | /* Handle the ends of any preceding grace periods first. */ | 1645 | /* Handle the ends of any preceding grace periods first. */ |
| 1533 | if (rdp->completed == rnp->completed) { | 1646 | if (rdp->completed == rnp->completed && |
| 1647 | !unlikely(ACCESS_ONCE(rdp->gpwrap))) { | ||
| 1534 | 1648 | ||
| 1535 | /* No grace period end, so just accelerate recent callbacks. */ | 1649 | /* No grace period end, so just accelerate recent callbacks. */ |
| 1536 | ret = rcu_accelerate_cbs(rsp, rnp, rdp); | 1650 | ret = rcu_accelerate_cbs(rsp, rnp, rdp); |
| @@ -1545,7 +1659,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1545 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); | 1659 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); |
| 1546 | } | 1660 | } |
| 1547 | 1661 | ||
| 1548 | if (rdp->gpnum != rnp->gpnum) { | 1662 | if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) { |
| 1549 | /* | 1663 | /* |
| 1550 | * If the current grace period is waiting for this CPU, | 1664 | * If the current grace period is waiting for this CPU, |
| 1551 | * set up to detect a quiescent state, otherwise don't | 1665 | * set up to detect a quiescent state, otherwise don't |
| @@ -1554,8 +1668,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1554 | rdp->gpnum = rnp->gpnum; | 1668 | rdp->gpnum = rnp->gpnum; |
| 1555 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); | 1669 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); |
| 1556 | rdp->passed_quiesce = 0; | 1670 | rdp->passed_quiesce = 0; |
| 1671 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | ||
| 1557 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); | 1672 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); |
| 1558 | zero_cpu_stall_ticks(rdp); | 1673 | zero_cpu_stall_ticks(rdp); |
| 1674 | ACCESS_ONCE(rdp->gpwrap) = false; | ||
| 1559 | } | 1675 | } |
| 1560 | return ret; | 1676 | return ret; |
| 1561 | } | 1677 | } |
| @@ -1569,7 +1685,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1569 | local_irq_save(flags); | 1685 | local_irq_save(flags); |
| 1570 | rnp = rdp->mynode; | 1686 | rnp = rdp->mynode; |
| 1571 | if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && | 1687 | if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && |
| 1572 | rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ | 1688 | rdp->completed == ACCESS_ONCE(rnp->completed) && |
| 1689 | !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */ | ||
| 1573 | !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ | 1690 | !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ |
| 1574 | local_irq_restore(flags); | 1691 | local_irq_restore(flags); |
| 1575 | return; | 1692 | return; |
| @@ -1589,6 +1706,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1589 | struct rcu_data *rdp; | 1706 | struct rcu_data *rdp; |
| 1590 | struct rcu_node *rnp = rcu_get_root(rsp); | 1707 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 1591 | 1708 | ||
| 1709 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
| 1592 | rcu_bind_gp_kthread(); | 1710 | rcu_bind_gp_kthread(); |
| 1593 | raw_spin_lock_irq(&rnp->lock); | 1711 | raw_spin_lock_irq(&rnp->lock); |
| 1594 | smp_mb__after_unlock_lock(); | 1712 | smp_mb__after_unlock_lock(); |
| @@ -1649,6 +1767,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1649 | rnp->grphi, rnp->qsmask); | 1767 | rnp->grphi, rnp->qsmask); |
| 1650 | raw_spin_unlock_irq(&rnp->lock); | 1768 | raw_spin_unlock_irq(&rnp->lock); |
| 1651 | cond_resched_rcu_qs(); | 1769 | cond_resched_rcu_qs(); |
| 1770 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
| 1652 | } | 1771 | } |
| 1653 | 1772 | ||
| 1654 | mutex_unlock(&rsp->onoff_mutex); | 1773 | mutex_unlock(&rsp->onoff_mutex); |
| @@ -1665,6 +1784,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | |||
| 1665 | unsigned long maxj; | 1784 | unsigned long maxj; |
| 1666 | struct rcu_node *rnp = rcu_get_root(rsp); | 1785 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 1667 | 1786 | ||
| 1787 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
| 1668 | rsp->n_force_qs++; | 1788 | rsp->n_force_qs++; |
| 1669 | if (fqs_state == RCU_SAVE_DYNTICK) { | 1789 | if (fqs_state == RCU_SAVE_DYNTICK) { |
| 1670 | /* Collect dyntick-idle snapshots. */ | 1790 | /* Collect dyntick-idle snapshots. */ |
| @@ -1703,6 +1823,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
| 1703 | struct rcu_data *rdp; | 1823 | struct rcu_data *rdp; |
| 1704 | struct rcu_node *rnp = rcu_get_root(rsp); | 1824 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 1705 | 1825 | ||
| 1826 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
| 1706 | raw_spin_lock_irq(&rnp->lock); | 1827 | raw_spin_lock_irq(&rnp->lock); |
| 1707 | smp_mb__after_unlock_lock(); | 1828 | smp_mb__after_unlock_lock(); |
| 1708 | gp_duration = jiffies - rsp->gp_start; | 1829 | gp_duration = jiffies - rsp->gp_start; |
| @@ -1739,6 +1860,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
| 1739 | nocb += rcu_future_gp_cleanup(rsp, rnp); | 1860 | nocb += rcu_future_gp_cleanup(rsp, rnp); |
| 1740 | raw_spin_unlock_irq(&rnp->lock); | 1861 | raw_spin_unlock_irq(&rnp->lock); |
| 1741 | cond_resched_rcu_qs(); | 1862 | cond_resched_rcu_qs(); |
| 1863 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
| 1742 | } | 1864 | } |
| 1743 | rnp = rcu_get_root(rsp); | 1865 | rnp = rcu_get_root(rsp); |
| 1744 | raw_spin_lock_irq(&rnp->lock); | 1866 | raw_spin_lock_irq(&rnp->lock); |
| @@ -1788,6 +1910,7 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 1788 | if (rcu_gp_init(rsp)) | 1910 | if (rcu_gp_init(rsp)) |
| 1789 | break; | 1911 | break; |
| 1790 | cond_resched_rcu_qs(); | 1912 | cond_resched_rcu_qs(); |
| 1913 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
| 1791 | WARN_ON(signal_pending(current)); | 1914 | WARN_ON(signal_pending(current)); |
| 1792 | trace_rcu_grace_period(rsp->name, | 1915 | trace_rcu_grace_period(rsp->name, |
| 1793 | ACCESS_ONCE(rsp->gpnum), | 1916 | ACCESS_ONCE(rsp->gpnum), |
| @@ -1831,9 +1954,11 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 1831 | ACCESS_ONCE(rsp->gpnum), | 1954 | ACCESS_ONCE(rsp->gpnum), |
| 1832 | TPS("fqsend")); | 1955 | TPS("fqsend")); |
| 1833 | cond_resched_rcu_qs(); | 1956 | cond_resched_rcu_qs(); |
| 1957 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
| 1834 | } else { | 1958 | } else { |
| 1835 | /* Deal with stray signal. */ | 1959 | /* Deal with stray signal. */ |
| 1836 | cond_resched_rcu_qs(); | 1960 | cond_resched_rcu_qs(); |
| 1961 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
| 1837 | WARN_ON(signal_pending(current)); | 1962 | WARN_ON(signal_pending(current)); |
| 1838 | trace_rcu_grace_period(rsp->name, | 1963 | trace_rcu_grace_period(rsp->name, |
| 1839 | ACCESS_ONCE(rsp->gpnum), | 1964 | ACCESS_ONCE(rsp->gpnum), |
| @@ -2010,8 +2135,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 2010 | rnp = rdp->mynode; | 2135 | rnp = rdp->mynode; |
| 2011 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2136 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 2012 | smp_mb__after_unlock_lock(); | 2137 | smp_mb__after_unlock_lock(); |
| 2013 | if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || | 2138 | if ((rdp->passed_quiesce == 0 && |
| 2014 | rnp->completed == rnp->gpnum) { | 2139 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || |
| 2140 | rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || | ||
| 2141 | rdp->gpwrap) { | ||
| 2015 | 2142 | ||
| 2016 | /* | 2143 | /* |
| 2017 | * The grace period in which this quiescent state was | 2144 | * The grace period in which this quiescent state was |
| @@ -2020,6 +2147,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 2020 | * within the current grace period. | 2147 | * within the current grace period. |
| 2021 | */ | 2148 | */ |
| 2022 | rdp->passed_quiesce = 0; /* need qs for new gp. */ | 2149 | rdp->passed_quiesce = 0; /* need qs for new gp. */ |
| 2150 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | ||
| 2023 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2151 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 2024 | return; | 2152 | return; |
| 2025 | } | 2153 | } |
| @@ -2064,7 +2192,8 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 2064 | * Was there a quiescent state since the beginning of the grace | 2192 | * Was there a quiescent state since the beginning of the grace |
| 2065 | * period? If no, then exit and wait for the next call. | 2193 | * period? If no, then exit and wait for the next call. |
| 2066 | */ | 2194 | */ |
| 2067 | if (!rdp->passed_quiesce) | 2195 | if (!rdp->passed_quiesce && |
| 2196 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) | ||
| 2068 | return; | 2197 | return; |
| 2069 | 2198 | ||
| 2070 | /* | 2199 | /* |
| @@ -2195,6 +2324,46 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | |||
| 2195 | } | 2324 | } |
| 2196 | 2325 | ||
| 2197 | /* | 2326 | /* |
| 2327 | * All CPUs for the specified rcu_node structure have gone offline, | ||
| 2328 | * and all tasks that were preempted within an RCU read-side critical | ||
| 2329 | * section while running on one of those CPUs have since exited their RCU | ||
| 2330 | * read-side critical section. Some other CPU is reporting this fact with | ||
| 2331 | * the specified rcu_node structure's ->lock held and interrupts disabled. | ||
| 2332 | * This function therefore goes up the tree of rcu_node structures, | ||
| 2333 | * clearing the corresponding bits in the ->qsmaskinit fields. Note that | ||
| 2334 | * the leaf rcu_node structure's ->qsmaskinit field has already been | ||
| 2335 | * updated | ||
| 2336 | * | ||
| 2337 | * This function does check that the specified rcu_node structure has | ||
| 2338 | * all CPUs offline and no blocked tasks, so it is OK to invoke it | ||
| 2339 | * prematurely. That said, invoking it after the fact will cost you | ||
| 2340 | * a needless lock acquisition. So once it has done its work, don't | ||
| 2341 | * invoke it again. | ||
| 2342 | */ | ||
| 2343 | static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) | ||
| 2344 | { | ||
| 2345 | long mask; | ||
| 2346 | struct rcu_node *rnp = rnp_leaf; | ||
| 2347 | |||
| 2348 | if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) | ||
| 2349 | return; | ||
| 2350 | for (;;) { | ||
| 2351 | mask = rnp->grpmask; | ||
| 2352 | rnp = rnp->parent; | ||
| 2353 | if (!rnp) | ||
| 2354 | break; | ||
| 2355 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | ||
| 2356 | smp_mb__after_unlock_lock(); /* GP memory ordering. */ | ||
| 2357 | rnp->qsmaskinit &= ~mask; | ||
| 2358 | if (rnp->qsmaskinit) { | ||
| 2359 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 2360 | return; | ||
| 2361 | } | ||
| 2362 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 2363 | } | ||
| 2364 | } | ||
| 2365 | |||
| 2366 | /* | ||
| 2198 | * The CPU has been completely removed, and some other CPU is reporting | 2367 | * The CPU has been completely removed, and some other CPU is reporting |
| 2199 | * this fact from process context. Do the remainder of the cleanup, | 2368 | * this fact from process context. Do the remainder of the cleanup, |
| 2200 | * including orphaning the outgoing CPU's RCU callbacks, and also | 2369 | * including orphaning the outgoing CPU's RCU callbacks, and also |
| @@ -2204,8 +2373,6 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | |||
| 2204 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | 2373 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) |
| 2205 | { | 2374 | { |
| 2206 | unsigned long flags; | 2375 | unsigned long flags; |
| 2207 | unsigned long mask; | ||
| 2208 | int need_report = 0; | ||
| 2209 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 2376 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
| 2210 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ | 2377 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ |
| 2211 | 2378 | ||
| @@ -2219,40 +2386,15 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
| 2219 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ | 2386 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ |
| 2220 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); | 2387 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); |
| 2221 | rcu_adopt_orphan_cbs(rsp, flags); | 2388 | rcu_adopt_orphan_cbs(rsp, flags); |
| 2389 | raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); | ||
| 2222 | 2390 | ||
| 2223 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ | 2391 | /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ |
| 2224 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 2392 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 2225 | do { | 2393 | smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ |
| 2226 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 2394 | rnp->qsmaskinit &= ~rdp->grpmask; |
| 2227 | smp_mb__after_unlock_lock(); | 2395 | if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp)) |
| 2228 | rnp->qsmaskinit &= ~mask; | 2396 | rcu_cleanup_dead_rnp(rnp); |
| 2229 | if (rnp->qsmaskinit != 0) { | 2397 | rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */ |
| 2230 | if (rnp != rdp->mynode) | ||
| 2231 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 2232 | break; | ||
| 2233 | } | ||
| 2234 | if (rnp == rdp->mynode) | ||
| 2235 | need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); | ||
| 2236 | else | ||
| 2237 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 2238 | mask = rnp->grpmask; | ||
| 2239 | rnp = rnp->parent; | ||
| 2240 | } while (rnp != NULL); | ||
| 2241 | |||
| 2242 | /* | ||
| 2243 | * We still hold the leaf rcu_node structure lock here, and | ||
| 2244 | * irqs are still disabled. The reason for this subterfuge is | ||
| 2245 | * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock | ||
| 2246 | * held leads to deadlock. | ||
| 2247 | */ | ||
| 2248 | raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */ | ||
| 2249 | rnp = rdp->mynode; | ||
| 2250 | if (need_report & RCU_OFL_TASKS_NORM_GP) | ||
| 2251 | rcu_report_unblock_qs_rnp(rnp, flags); | ||
| 2252 | else | ||
| 2253 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 2254 | if (need_report & RCU_OFL_TASKS_EXP_GP) | ||
| 2255 | rcu_report_exp_rnp(rsp, rnp, true); | ||
| 2256 | WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, | 2398 | WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, |
| 2257 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", | 2399 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", |
| 2258 | cpu, rdp->qlen, rdp->nxtlist); | 2400 | cpu, rdp->qlen, rdp->nxtlist); |
| @@ -2268,6 +2410,10 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | |||
| 2268 | { | 2410 | { |
| 2269 | } | 2411 | } |
| 2270 | 2412 | ||
| 2413 | static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) | ||
| 2414 | { | ||
| 2415 | } | ||
| 2416 | |||
| 2271 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | 2417 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) |
| 2272 | { | 2418 | { |
| 2273 | } | 2419 | } |
| @@ -2464,12 +2610,6 @@ static void force_qs_rnp(struct rcu_state *rsp, | |||
| 2464 | } | 2610 | } |
| 2465 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2611 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 2466 | } | 2612 | } |
| 2467 | rnp = rcu_get_root(rsp); | ||
| 2468 | if (rnp->qsmask == 0) { | ||
| 2469 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 2470 | smp_mb__after_unlock_lock(); | ||
| 2471 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ | ||
| 2472 | } | ||
| 2473 | } | 2613 | } |
| 2474 | 2614 | ||
| 2475 | /* | 2615 | /* |
| @@ -2569,7 +2709,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) | |||
| 2569 | * Schedule RCU callback invocation. If the specified type of RCU | 2709 | * Schedule RCU callback invocation. If the specified type of RCU |
| 2570 | * does not support RCU priority boosting, just do a direct call, | 2710 | * does not support RCU priority boosting, just do a direct call, |
| 2571 | * otherwise wake up the per-CPU kernel kthread. Note that because we | 2711 | * otherwise wake up the per-CPU kernel kthread. Note that because we |
| 2572 | * are running on the current CPU with interrupts disabled, the | 2712 | * are running on the current CPU with softirqs disabled, the |
| 2573 | * rcu_cpu_kthread_task cannot disappear out from under us. | 2713 | * rcu_cpu_kthread_task cannot disappear out from under us. |
| 2574 | */ | 2714 | */ |
| 2575 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | 2715 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) |
| @@ -3109,9 +3249,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 3109 | 3249 | ||
| 3110 | /* Is the RCU core waiting for a quiescent state from this CPU? */ | 3250 | /* Is the RCU core waiting for a quiescent state from this CPU? */ |
| 3111 | if (rcu_scheduler_fully_active && | 3251 | if (rcu_scheduler_fully_active && |
| 3112 | rdp->qs_pending && !rdp->passed_quiesce) { | 3252 | rdp->qs_pending && !rdp->passed_quiesce && |
| 3253 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { | ||
| 3113 | rdp->n_rp_qs_pending++; | 3254 | rdp->n_rp_qs_pending++; |
| 3114 | } else if (rdp->qs_pending && rdp->passed_quiesce) { | 3255 | } else if (rdp->qs_pending && |
| 3256 | (rdp->passed_quiesce || | ||
| 3257 | rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { | ||
| 3115 | rdp->n_rp_report_qs++; | 3258 | rdp->n_rp_report_qs++; |
| 3116 | return 1; | 3259 | return 1; |
| 3117 | } | 3260 | } |
| @@ -3135,7 +3278,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 3135 | } | 3278 | } |
| 3136 | 3279 | ||
| 3137 | /* Has a new RCU grace period started? */ | 3280 | /* Has a new RCU grace period started? */ |
| 3138 | if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ | 3281 | if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum || |
| 3282 | unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */ | ||
| 3139 | rdp->n_rp_gp_started++; | 3283 | rdp->n_rp_gp_started++; |
| 3140 | return 1; | 3284 | return 1; |
| 3141 | } | 3285 | } |
| @@ -3318,6 +3462,7 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
| 3318 | } else { | 3462 | } else { |
| 3319 | _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, | 3463 | _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, |
| 3320 | rsp->n_barrier_done); | 3464 | rsp->n_barrier_done); |
| 3465 | smp_mb__before_atomic(); | ||
| 3321 | atomic_inc(&rsp->barrier_cpu_count); | 3466 | atomic_inc(&rsp->barrier_cpu_count); |
| 3322 | __call_rcu(&rdp->barrier_head, | 3467 | __call_rcu(&rdp->barrier_head, |
| 3323 | rcu_barrier_callback, rsp, cpu, 0); | 3468 | rcu_barrier_callback, rsp, cpu, 0); |
| @@ -3385,9 +3530,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 3385 | /* Set up local state, ensuring consistent view of global state. */ | 3530 | /* Set up local state, ensuring consistent view of global state. */ |
| 3386 | raw_spin_lock_irqsave(&rnp->lock, flags); | 3531 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 3387 | rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); | 3532 | rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); |
| 3388 | init_callback_list(rdp); | ||
| 3389 | rdp->qlen_lazy = 0; | ||
| 3390 | ACCESS_ONCE(rdp->qlen) = 0; | ||
| 3391 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | 3533 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); |
| 3392 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); | 3534 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); |
| 3393 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); | 3535 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); |
| @@ -3444,6 +3586,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 3444 | rdp->gpnum = rnp->completed; | 3586 | rdp->gpnum = rnp->completed; |
| 3445 | rdp->completed = rnp->completed; | 3587 | rdp->completed = rnp->completed; |
| 3446 | rdp->passed_quiesce = 0; | 3588 | rdp->passed_quiesce = 0; |
| 3589 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | ||
| 3447 | rdp->qs_pending = 0; | 3590 | rdp->qs_pending = 0; |
| 3448 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); | 3591 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); |
| 3449 | } | 3592 | } |
| @@ -3535,17 +3678,35 @@ static int rcu_pm_notify(struct notifier_block *self, | |||
| 3535 | static int __init rcu_spawn_gp_kthread(void) | 3678 | static int __init rcu_spawn_gp_kthread(void) |
| 3536 | { | 3679 | { |
| 3537 | unsigned long flags; | 3680 | unsigned long flags; |
| 3681 | int kthread_prio_in = kthread_prio; | ||
| 3538 | struct rcu_node *rnp; | 3682 | struct rcu_node *rnp; |
| 3539 | struct rcu_state *rsp; | 3683 | struct rcu_state *rsp; |
| 3684 | struct sched_param sp; | ||
| 3540 | struct task_struct *t; | 3685 | struct task_struct *t; |
| 3541 | 3686 | ||
| 3687 | /* Force priority into range. */ | ||
| 3688 | if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) | ||
| 3689 | kthread_prio = 1; | ||
| 3690 | else if (kthread_prio < 0) | ||
| 3691 | kthread_prio = 0; | ||
| 3692 | else if (kthread_prio > 99) | ||
| 3693 | kthread_prio = 99; | ||
| 3694 | if (kthread_prio != kthread_prio_in) | ||
| 3695 | pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n", | ||
| 3696 | kthread_prio, kthread_prio_in); | ||
| 3697 | |||
| 3542 | rcu_scheduler_fully_active = 1; | 3698 | rcu_scheduler_fully_active = 1; |
| 3543 | for_each_rcu_flavor(rsp) { | 3699 | for_each_rcu_flavor(rsp) { |
| 3544 | t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); | 3700 | t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); |
| 3545 | BUG_ON(IS_ERR(t)); | 3701 | BUG_ON(IS_ERR(t)); |
| 3546 | rnp = rcu_get_root(rsp); | 3702 | rnp = rcu_get_root(rsp); |
| 3547 | raw_spin_lock_irqsave(&rnp->lock, flags); | 3703 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 3548 | rsp->gp_kthread = t; | 3704 | rsp->gp_kthread = t; |
| 3705 | if (kthread_prio) { | ||
| 3706 | sp.sched_priority = kthread_prio; | ||
| 3707 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
| 3708 | } | ||
| 3709 | wake_up_process(t); | ||
| 3549 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 3710 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 3550 | } | 3711 | } |
| 3551 | rcu_spawn_nocb_kthreads(); | 3712 | rcu_spawn_nocb_kthreads(); |
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e7b1843896e..119de399eb2f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
| @@ -27,7 +27,6 @@ | |||
| 27 | #include <linux/threads.h> | 27 | #include <linux/threads.h> |
| 28 | #include <linux/cpumask.h> | 28 | #include <linux/cpumask.h> |
| 29 | #include <linux/seqlock.h> | 29 | #include <linux/seqlock.h> |
| 30 | #include <linux/irq_work.h> | ||
| 31 | 30 | ||
| 32 | /* | 31 | /* |
| 33 | * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and | 32 | * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and |
| @@ -172,11 +171,6 @@ struct rcu_node { | |||
| 172 | /* queued on this rcu_node structure that */ | 171 | /* queued on this rcu_node structure that */ |
| 173 | /* are blocking the current grace period, */ | 172 | /* are blocking the current grace period, */ |
| 174 | /* there can be no such task. */ | 173 | /* there can be no such task. */ |
| 175 | struct completion boost_completion; | ||
| 176 | /* Used to ensure that the rt_mutex used */ | ||
| 177 | /* to carry out the boosting is fully */ | ||
| 178 | /* released with no future boostee accesses */ | ||
| 179 | /* before that rt_mutex is re-initialized. */ | ||
| 180 | struct rt_mutex boost_mtx; | 174 | struct rt_mutex boost_mtx; |
| 181 | /* Used only for the priority-boosting */ | 175 | /* Used only for the priority-boosting */ |
| 182 | /* side effect, not as a lock. */ | 176 | /* side effect, not as a lock. */ |
| @@ -257,9 +251,12 @@ struct rcu_data { | |||
| 257 | /* in order to detect GP end. */ | 251 | /* in order to detect GP end. */ |
| 258 | unsigned long gpnum; /* Highest gp number that this CPU */ | 252 | unsigned long gpnum; /* Highest gp number that this CPU */ |
| 259 | /* is aware of having started. */ | 253 | /* is aware of having started. */ |
| 254 | unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ | ||
| 255 | /* for rcu_all_qs() invocations. */ | ||
| 260 | bool passed_quiesce; /* User-mode/idle loop etc. */ | 256 | bool passed_quiesce; /* User-mode/idle loop etc. */ |
| 261 | bool qs_pending; /* Core waits for quiesc state. */ | 257 | bool qs_pending; /* Core waits for quiesc state. */ |
| 262 | bool beenonline; /* CPU online at least once. */ | 258 | bool beenonline; /* CPU online at least once. */ |
| 259 | bool gpwrap; /* Possible gpnum/completed wrap. */ | ||
| 263 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ | 260 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ |
| 264 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ | 261 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ |
| 265 | #ifdef CONFIG_RCU_CPU_STALL_INFO | 262 | #ifdef CONFIG_RCU_CPU_STALL_INFO |
| @@ -340,14 +337,10 @@ struct rcu_data { | |||
| 340 | #ifdef CONFIG_RCU_NOCB_CPU | 337 | #ifdef CONFIG_RCU_NOCB_CPU |
| 341 | struct rcu_head *nocb_head; /* CBs waiting for kthread. */ | 338 | struct rcu_head *nocb_head; /* CBs waiting for kthread. */ |
| 342 | struct rcu_head **nocb_tail; | 339 | struct rcu_head **nocb_tail; |
| 343 | atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ | 340 | atomic_long_t nocb_q_count; /* # CBs waiting for nocb */ |
| 344 | atomic_long_t nocb_q_count_lazy; /* (approximate). */ | 341 | atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ |
| 345 | struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ | 342 | struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ |
| 346 | struct rcu_head **nocb_follower_tail; | 343 | struct rcu_head **nocb_follower_tail; |
| 347 | atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */ | ||
| 348 | atomic_long_t nocb_follower_count_lazy; /* (approximate). */ | ||
| 349 | int nocb_p_count; /* # CBs being invoked by kthread */ | ||
| 350 | int nocb_p_count_lazy; /* (approximate). */ | ||
| 351 | wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ | 344 | wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ |
| 352 | struct task_struct *nocb_kthread; | 345 | struct task_struct *nocb_kthread; |
| 353 | int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ | 346 | int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ |
| @@ -356,8 +349,6 @@ struct rcu_data { | |||
| 356 | struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; | 349 | struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; |
| 357 | /* CBs waiting for GP. */ | 350 | /* CBs waiting for GP. */ |
| 358 | struct rcu_head **nocb_gp_tail; | 351 | struct rcu_head **nocb_gp_tail; |
| 359 | long nocb_gp_count; | ||
| 360 | long nocb_gp_count_lazy; | ||
| 361 | bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ | 352 | bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ |
| 362 | struct rcu_data *nocb_next_follower; | 353 | struct rcu_data *nocb_next_follower; |
| 363 | /* Next follower in wakeup chain. */ | 354 | /* Next follower in wakeup chain. */ |
| @@ -488,10 +479,14 @@ struct rcu_state { | |||
| 488 | /* due to no GP active. */ | 479 | /* due to no GP active. */ |
| 489 | unsigned long gp_start; /* Time at which GP started, */ | 480 | unsigned long gp_start; /* Time at which GP started, */ |
| 490 | /* but in jiffies. */ | 481 | /* but in jiffies. */ |
| 482 | unsigned long gp_activity; /* Time of last GP kthread */ | ||
| 483 | /* activity in jiffies. */ | ||
| 491 | unsigned long jiffies_stall; /* Time at which to check */ | 484 | unsigned long jiffies_stall; /* Time at which to check */ |
| 492 | /* for CPU stalls. */ | 485 | /* for CPU stalls. */ |
| 493 | unsigned long jiffies_resched; /* Time at which to resched */ | 486 | unsigned long jiffies_resched; /* Time at which to resched */ |
| 494 | /* a reluctant CPU. */ | 487 | /* a reluctant CPU. */ |
| 488 | unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ | ||
| 489 | /* GP start. */ | ||
| 495 | unsigned long gp_max; /* Maximum GP duration in */ | 490 | unsigned long gp_max; /* Maximum GP duration in */ |
| 496 | /* jiffies. */ | 491 | /* jiffies. */ |
| 497 | const char *name; /* Name of structure. */ | 492 | const char *name; /* Name of structure. */ |
| @@ -514,13 +509,6 @@ extern struct list_head rcu_struct_flavors; | |||
| 514 | #define for_each_rcu_flavor(rsp) \ | 509 | #define for_each_rcu_flavor(rsp) \ |
| 515 | list_for_each_entry((rsp), &rcu_struct_flavors, flavors) | 510 | list_for_each_entry((rsp), &rcu_struct_flavors, flavors) |
| 516 | 511 | ||
| 517 | /* Return values for rcu_preempt_offline_tasks(). */ | ||
| 518 | |||
| 519 | #define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ | ||
| 520 | /* GP were moved to root. */ | ||
| 521 | #define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ | ||
| 522 | /* GP were moved to root. */ | ||
| 523 | |||
| 524 | /* | 512 | /* |
| 525 | * RCU implementation internal declarations: | 513 | * RCU implementation internal declarations: |
| 526 | */ | 514 | */ |
| @@ -546,27 +534,16 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); | |||
| 546 | 534 | ||
| 547 | /* Forward declarations for rcutree_plugin.h */ | 535 | /* Forward declarations for rcutree_plugin.h */ |
| 548 | static void rcu_bootup_announce(void); | 536 | static void rcu_bootup_announce(void); |
| 549 | long rcu_batches_completed(void); | ||
| 550 | static void rcu_preempt_note_context_switch(void); | 537 | static void rcu_preempt_note_context_switch(void); |
| 551 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); | 538 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); |
| 552 | #ifdef CONFIG_HOTPLUG_CPU | 539 | #ifdef CONFIG_HOTPLUG_CPU |
| 553 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | 540 | static bool rcu_preempt_has_tasks(struct rcu_node *rnp); |
| 554 | unsigned long flags); | ||
| 555 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 541 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 556 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); | 542 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); |
| 557 | static int rcu_print_task_stall(struct rcu_node *rnp); | 543 | static int rcu_print_task_stall(struct rcu_node *rnp); |
| 558 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); | 544 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); |
| 559 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 560 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | ||
| 561 | struct rcu_node *rnp, | ||
| 562 | struct rcu_data *rdp); | ||
| 563 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 564 | static void rcu_preempt_check_callbacks(void); | 545 | static void rcu_preempt_check_callbacks(void); |
| 565 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 546 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); |
| 566 | #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) | ||
| 567 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | ||
| 568 | bool wake); | ||
| 569 | #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */ | ||
| 570 | static void __init __rcu_init_preempt(void); | 547 | static void __init __rcu_init_preempt(void); |
| 571 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); | 548 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); |
| 572 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); | 549 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); |
| @@ -622,24 +599,15 @@ static void rcu_dynticks_task_exit(void); | |||
| 622 | #endif /* #ifndef RCU_TREE_NONCORE */ | 599 | #endif /* #ifndef RCU_TREE_NONCORE */ |
| 623 | 600 | ||
| 624 | #ifdef CONFIG_RCU_TRACE | 601 | #ifdef CONFIG_RCU_TRACE |
| 625 | #ifdef CONFIG_RCU_NOCB_CPU | 602 | /* Read out queue lengths for tracing. */ |
| 626 | /* Sum up queue lengths for tracing. */ | ||
| 627 | static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) | 603 | static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) |
| 628 | { | 604 | { |
| 629 | *ql = atomic_long_read(&rdp->nocb_q_count) + | 605 | #ifdef CONFIG_RCU_NOCB_CPU |
| 630 | rdp->nocb_p_count + | 606 | *ql = atomic_long_read(&rdp->nocb_q_count); |
| 631 | atomic_long_read(&rdp->nocb_follower_count) + | 607 | *qll = atomic_long_read(&rdp->nocb_q_count_lazy); |
| 632 | rdp->nocb_p_count + rdp->nocb_gp_count; | ||
| 633 | *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + | ||
| 634 | rdp->nocb_p_count_lazy + | ||
| 635 | atomic_long_read(&rdp->nocb_follower_count_lazy) + | ||
| 636 | rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy; | ||
| 637 | } | ||
| 638 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | 608 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ |
| 639 | static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) | ||
| 640 | { | ||
| 641 | *ql = 0; | 609 | *ql = 0; |
| 642 | *qll = 0; | 610 | *qll = 0; |
| 643 | } | ||
| 644 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ | 611 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ |
| 612 | } | ||
| 645 | #endif /* #ifdef CONFIG_RCU_TRACE */ | 613 | #endif /* #ifdef CONFIG_RCU_TRACE */ |
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb5d544..0a571e9a0f1d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
| @@ -34,10 +34,6 @@ | |||
| 34 | 34 | ||
| 35 | #include "../locking/rtmutex_common.h" | 35 | #include "../locking/rtmutex_common.h" |
| 36 | 36 | ||
| 37 | /* rcuc/rcub kthread realtime priority */ | ||
| 38 | static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; | ||
| 39 | module_param(kthread_prio, int, 0644); | ||
| 40 | |||
| 41 | /* | 37 | /* |
| 42 | * Control variables for per-CPU and per-rcu_node kthreads. These | 38 | * Control variables for per-CPU and per-rcu_node kthreads. These |
| 43 | * handle all flavors of RCU. | 39 | * handle all flavors of RCU. |
| @@ -53,7 +49,6 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work); | |||
| 53 | static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ | 49 | static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ |
| 54 | static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ | 50 | static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ |
| 55 | static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ | 51 | static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ |
| 56 | static char __initdata nocb_buf[NR_CPUS * 5]; | ||
| 57 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | 52 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ |
| 58 | 53 | ||
| 59 | /* | 54 | /* |
| @@ -103,6 +98,8 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); | |||
| 103 | static struct rcu_state *rcu_state_p = &rcu_preempt_state; | 98 | static struct rcu_state *rcu_state_p = &rcu_preempt_state; |
| 104 | 99 | ||
| 105 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); | 100 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); |
| 101 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | ||
| 102 | bool wake); | ||
| 106 | 103 | ||
| 107 | /* | 104 | /* |
| 108 | * Tell them what RCU they are running. | 105 | * Tell them what RCU they are running. |
| @@ -114,25 +111,6 @@ static void __init rcu_bootup_announce(void) | |||
| 114 | } | 111 | } |
| 115 | 112 | ||
| 116 | /* | 113 | /* |
| 117 | * Return the number of RCU-preempt batches processed thus far | ||
| 118 | * for debug and statistics. | ||
| 119 | */ | ||
| 120 | static long rcu_batches_completed_preempt(void) | ||
| 121 | { | ||
| 122 | return rcu_preempt_state.completed; | ||
| 123 | } | ||
| 124 | EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); | ||
| 125 | |||
| 126 | /* | ||
| 127 | * Return the number of RCU batches processed thus far for debug & stats. | ||
| 128 | */ | ||
| 129 | long rcu_batches_completed(void) | ||
| 130 | { | ||
| 131 | return rcu_batches_completed_preempt(); | ||
| 132 | } | ||
| 133 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
| 134 | |||
| 135 | /* | ||
| 136 | * Record a preemptible-RCU quiescent state for the specified CPU. Note | 114 | * Record a preemptible-RCU quiescent state for the specified CPU. Note |
| 137 | * that this just means that the task currently running on the CPU is | 115 | * that this just means that the task currently running on the CPU is |
| 138 | * not in a quiescent state. There might be any number of tasks blocked | 116 | * not in a quiescent state. There might be any number of tasks blocked |
| @@ -307,15 +285,25 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t, | |||
| 307 | } | 285 | } |
| 308 | 286 | ||
| 309 | /* | 287 | /* |
| 288 | * Return true if the specified rcu_node structure has tasks that were | ||
| 289 | * preempted within an RCU read-side critical section. | ||
| 290 | */ | ||
| 291 | static bool rcu_preempt_has_tasks(struct rcu_node *rnp) | ||
| 292 | { | ||
| 293 | return !list_empty(&rnp->blkd_tasks); | ||
| 294 | } | ||
| 295 | |||
| 296 | /* | ||
| 310 | * Handle special cases during rcu_read_unlock(), such as needing to | 297 | * Handle special cases during rcu_read_unlock(), such as needing to |
| 311 | * notify RCU core processing or task having blocked during the RCU | 298 | * notify RCU core processing or task having blocked during the RCU |
| 312 | * read-side critical section. | 299 | * read-side critical section. |
| 313 | */ | 300 | */ |
| 314 | void rcu_read_unlock_special(struct task_struct *t) | 301 | void rcu_read_unlock_special(struct task_struct *t) |
| 315 | { | 302 | { |
| 316 | int empty; | 303 | bool empty; |
| 317 | int empty_exp; | 304 | bool empty_exp; |
| 318 | int empty_exp_now; | 305 | bool empty_norm; |
| 306 | bool empty_exp_now; | ||
| 319 | unsigned long flags; | 307 | unsigned long flags; |
| 320 | struct list_head *np; | 308 | struct list_head *np; |
| 321 | #ifdef CONFIG_RCU_BOOST | 309 | #ifdef CONFIG_RCU_BOOST |
| @@ -338,6 +326,7 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 338 | special = t->rcu_read_unlock_special; | 326 | special = t->rcu_read_unlock_special; |
| 339 | if (special.b.need_qs) { | 327 | if (special.b.need_qs) { |
| 340 | rcu_preempt_qs(); | 328 | rcu_preempt_qs(); |
| 329 | t->rcu_read_unlock_special.b.need_qs = false; | ||
| 341 | if (!t->rcu_read_unlock_special.s) { | 330 | if (!t->rcu_read_unlock_special.s) { |
| 342 | local_irq_restore(flags); | 331 | local_irq_restore(flags); |
| 343 | return; | 332 | return; |
| @@ -367,7 +356,8 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 367 | break; | 356 | break; |
| 368 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 357 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
| 369 | } | 358 | } |
| 370 | empty = !rcu_preempt_blocked_readers_cgp(rnp); | 359 | empty = !rcu_preempt_has_tasks(rnp); |
| 360 | empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); | ||
| 371 | empty_exp = !rcu_preempted_readers_exp(rnp); | 361 | empty_exp = !rcu_preempted_readers_exp(rnp); |
| 372 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ | 362 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ |
| 373 | np = rcu_next_node_entry(t, rnp); | 363 | np = rcu_next_node_entry(t, rnp); |
| @@ -387,13 +377,21 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 387 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 377 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 388 | 378 | ||
| 389 | /* | 379 | /* |
| 380 | * If this was the last task on the list, go see if we | ||
| 381 | * need to propagate ->qsmaskinit bit clearing up the | ||
| 382 | * rcu_node tree. | ||
| 383 | */ | ||
| 384 | if (!empty && !rcu_preempt_has_tasks(rnp)) | ||
| 385 | rcu_cleanup_dead_rnp(rnp); | ||
| 386 | |||
| 387 | /* | ||
| 390 | * If this was the last task on the current list, and if | 388 | * If this was the last task on the current list, and if |
| 391 | * we aren't waiting on any CPUs, report the quiescent state. | 389 | * we aren't waiting on any CPUs, report the quiescent state. |
| 392 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, | 390 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, |
| 393 | * so we must take a snapshot of the expedited state. | 391 | * so we must take a snapshot of the expedited state. |
| 394 | */ | 392 | */ |
| 395 | empty_exp_now = !rcu_preempted_readers_exp(rnp); | 393 | empty_exp_now = !rcu_preempted_readers_exp(rnp); |
| 396 | if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { | 394 | if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { |
| 397 | trace_rcu_quiescent_state_report(TPS("preempt_rcu"), | 395 | trace_rcu_quiescent_state_report(TPS("preempt_rcu"), |
| 398 | rnp->gpnum, | 396 | rnp->gpnum, |
| 399 | 0, rnp->qsmask, | 397 | 0, rnp->qsmask, |
| @@ -408,10 +406,8 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 408 | 406 | ||
| 409 | #ifdef CONFIG_RCU_BOOST | 407 | #ifdef CONFIG_RCU_BOOST |
| 410 | /* Unboost if we were boosted. */ | 408 | /* Unboost if we were boosted. */ |
| 411 | if (drop_boost_mutex) { | 409 | if (drop_boost_mutex) |
| 412 | rt_mutex_unlock(&rnp->boost_mtx); | 410 | rt_mutex_unlock(&rnp->boost_mtx); |
| 413 | complete(&rnp->boost_completion); | ||
| 414 | } | ||
| 415 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 411 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 416 | 412 | ||
| 417 | /* | 413 | /* |
| @@ -519,99 +515,13 @@ static int rcu_print_task_stall(struct rcu_node *rnp) | |||
| 519 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | 515 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) |
| 520 | { | 516 | { |
| 521 | WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); | 517 | WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); |
| 522 | if (!list_empty(&rnp->blkd_tasks)) | 518 | if (rcu_preempt_has_tasks(rnp)) |
| 523 | rnp->gp_tasks = rnp->blkd_tasks.next; | 519 | rnp->gp_tasks = rnp->blkd_tasks.next; |
| 524 | WARN_ON_ONCE(rnp->qsmask); | 520 | WARN_ON_ONCE(rnp->qsmask); |
| 525 | } | 521 | } |
| 526 | 522 | ||
| 527 | #ifdef CONFIG_HOTPLUG_CPU | 523 | #ifdef CONFIG_HOTPLUG_CPU |
| 528 | 524 | ||
| 529 | /* | ||
| 530 | * Handle tasklist migration for case in which all CPUs covered by the | ||
| 531 | * specified rcu_node have gone offline. Move them up to the root | ||
| 532 | * rcu_node. The reason for not just moving them to the immediate | ||
| 533 | * parent is to remove the need for rcu_read_unlock_special() to | ||
| 534 | * make more than two attempts to acquire the target rcu_node's lock. | ||
| 535 | * Returns true if there were tasks blocking the current RCU grace | ||
| 536 | * period. | ||
| 537 | * | ||
| 538 | * Returns 1 if there was previously a task blocking the current grace | ||
| 539 | * period on the specified rcu_node structure. | ||
| 540 | * | ||
| 541 | * The caller must hold rnp->lock with irqs disabled. | ||
| 542 | */ | ||
| 543 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | ||
| 544 | struct rcu_node *rnp, | ||
| 545 | struct rcu_data *rdp) | ||
| 546 | { | ||
| 547 | struct list_head *lp; | ||
| 548 | struct list_head *lp_root; | ||
| 549 | int retval = 0; | ||
| 550 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
| 551 | struct task_struct *t; | ||
| 552 | |||
| 553 | if (rnp == rnp_root) { | ||
| 554 | WARN_ONCE(1, "Last CPU thought to be offlined?"); | ||
| 555 | return 0; /* Shouldn't happen: at least one CPU online. */ | ||
| 556 | } | ||
| 557 | |||
| 558 | /* If we are on an internal node, complain bitterly. */ | ||
| 559 | WARN_ON_ONCE(rnp != rdp->mynode); | ||
| 560 | |||
| 561 | /* | ||
| 562 | * Move tasks up to root rcu_node. Don't try to get fancy for | ||
| 563 | * this corner-case operation -- just put this node's tasks | ||
| 564 | * at the head of the root node's list, and update the root node's | ||
| 565 | * ->gp_tasks and ->exp_tasks pointers to those of this node's, | ||
| 566 | * if non-NULL. This might result in waiting for more tasks than | ||
| 567 | * absolutely necessary, but this is a good performance/complexity | ||
| 568 | * tradeoff. | ||
| 569 | */ | ||
| 570 | if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0) | ||
| 571 | retval |= RCU_OFL_TASKS_NORM_GP; | ||
| 572 | if (rcu_preempted_readers_exp(rnp)) | ||
| 573 | retval |= RCU_OFL_TASKS_EXP_GP; | ||
| 574 | lp = &rnp->blkd_tasks; | ||
| 575 | lp_root = &rnp_root->blkd_tasks; | ||
| 576 | while (!list_empty(lp)) { | ||
| 577 | t = list_entry(lp->next, typeof(*t), rcu_node_entry); | ||
| 578 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ | ||
| 579 | smp_mb__after_unlock_lock(); | ||
| 580 | list_del(&t->rcu_node_entry); | ||
| 581 | t->rcu_blocked_node = rnp_root; | ||
| 582 | list_add(&t->rcu_node_entry, lp_root); | ||
| 583 | if (&t->rcu_node_entry == rnp->gp_tasks) | ||
| 584 | rnp_root->gp_tasks = rnp->gp_tasks; | ||
| 585 | if (&t->rcu_node_entry == rnp->exp_tasks) | ||
| 586 | rnp_root->exp_tasks = rnp->exp_tasks; | ||
| 587 | #ifdef CONFIG_RCU_BOOST | ||
| 588 | if (&t->rcu_node_entry == rnp->boost_tasks) | ||
| 589 | rnp_root->boost_tasks = rnp->boost_tasks; | ||
| 590 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 591 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ | ||
| 592 | } | ||
| 593 | |||
| 594 | rnp->gp_tasks = NULL; | ||
| 595 | rnp->exp_tasks = NULL; | ||
| 596 | #ifdef CONFIG_RCU_BOOST | ||
| 597 | rnp->boost_tasks = NULL; | ||
| 598 | /* | ||
| 599 | * In case root is being boosted and leaf was not. Make sure | ||
| 600 | * that we boost the tasks blocking the current grace period | ||
| 601 | * in this case. | ||
| 602 | */ | ||
| 603 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ | ||
| 604 | smp_mb__after_unlock_lock(); | ||
| 605 | if (rnp_root->boost_tasks != NULL && | ||
| 606 | rnp_root->boost_tasks != rnp_root->gp_tasks && | ||
| 607 | rnp_root->boost_tasks != rnp_root->exp_tasks) | ||
| 608 | rnp_root->boost_tasks = rnp_root->gp_tasks; | ||
| 609 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ | ||
| 610 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 611 | |||
| 612 | return retval; | ||
| 613 | } | ||
| 614 | |||
| 615 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 525 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 616 | 526 | ||
| 617 | /* | 527 | /* |
| @@ -771,7 +681,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
| 771 | 681 | ||
| 772 | raw_spin_lock_irqsave(&rnp->lock, flags); | 682 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 773 | smp_mb__after_unlock_lock(); | 683 | smp_mb__after_unlock_lock(); |
| 774 | if (list_empty(&rnp->blkd_tasks)) { | 684 | if (!rcu_preempt_has_tasks(rnp)) { |
| 775 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 685 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 776 | } else { | 686 | } else { |
| 777 | rnp->exp_tasks = rnp->blkd_tasks.next; | 687 | rnp->exp_tasks = rnp->blkd_tasks.next; |
| @@ -933,15 +843,6 @@ static void __init rcu_bootup_announce(void) | |||
| 933 | } | 843 | } |
| 934 | 844 | ||
| 935 | /* | 845 | /* |
| 936 | * Return the number of RCU batches processed thus far for debug & stats. | ||
| 937 | */ | ||
| 938 | long rcu_batches_completed(void) | ||
| 939 | { | ||
| 940 | return rcu_batches_completed_sched(); | ||
| 941 | } | ||
| 942 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
| 943 | |||
| 944 | /* | ||
| 945 | * Because preemptible RCU does not exist, we never have to check for | 846 | * Because preemptible RCU does not exist, we never have to check for |
| 946 | * CPUs being in quiescent states. | 847 | * CPUs being in quiescent states. |
| 947 | */ | 848 | */ |
| @@ -960,11 +861,12 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) | |||
| 960 | 861 | ||
| 961 | #ifdef CONFIG_HOTPLUG_CPU | 862 | #ifdef CONFIG_HOTPLUG_CPU |
| 962 | 863 | ||
| 963 | /* Because preemptible RCU does not exist, no quieting of tasks. */ | 864 | /* |
| 964 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | 865 | * Because there is no preemptible RCU, there can be no readers blocked. |
| 965 | __releases(rnp->lock) | 866 | */ |
| 867 | static bool rcu_preempt_has_tasks(struct rcu_node *rnp) | ||
| 966 | { | 868 | { |
| 967 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 869 | return false; |
| 968 | } | 870 | } |
| 969 | 871 | ||
| 970 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 872 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| @@ -996,23 +898,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | |||
| 996 | WARN_ON_ONCE(rnp->qsmask); | 898 | WARN_ON_ONCE(rnp->qsmask); |
| 997 | } | 899 | } |
| 998 | 900 | ||
| 999 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1000 | |||
| 1001 | /* | ||
| 1002 | * Because preemptible RCU does not exist, it never needs to migrate | ||
| 1003 | * tasks that were blocked within RCU read-side critical sections, and | ||
| 1004 | * such non-existent tasks cannot possibly have been blocking the current | ||
| 1005 | * grace period. | ||
| 1006 | */ | ||
| 1007 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | ||
| 1008 | struct rcu_node *rnp, | ||
| 1009 | struct rcu_data *rdp) | ||
| 1010 | { | ||
| 1011 | return 0; | ||
| 1012 | } | ||
| 1013 | |||
| 1014 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 1015 | |||
| 1016 | /* | 901 | /* |
| 1017 | * Because preemptible RCU does not exist, it never has any callbacks | 902 | * Because preemptible RCU does not exist, it never has any callbacks |
| 1018 | * to check. | 903 | * to check. |
| @@ -1031,20 +916,6 @@ void synchronize_rcu_expedited(void) | |||
| 1031 | } | 916 | } |
| 1032 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | 917 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); |
| 1033 | 918 | ||
| 1034 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1035 | |||
| 1036 | /* | ||
| 1037 | * Because preemptible RCU does not exist, there is never any need to | ||
| 1038 | * report on tasks preempted in RCU read-side critical sections during | ||
| 1039 | * expedited RCU grace periods. | ||
| 1040 | */ | ||
| 1041 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | ||
| 1042 | bool wake) | ||
| 1043 | { | ||
| 1044 | } | ||
| 1045 | |||
| 1046 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 1047 | |||
| 1048 | /* | 919 | /* |
| 1049 | * Because preemptible RCU does not exist, rcu_barrier() is just | 920 | * Because preemptible RCU does not exist, rcu_barrier() is just |
| 1050 | * another name for rcu_barrier_sched(). | 921 | * another name for rcu_barrier_sched(). |
| @@ -1080,7 +951,7 @@ void exit_rcu(void) | |||
| 1080 | 951 | ||
| 1081 | static void rcu_initiate_boost_trace(struct rcu_node *rnp) | 952 | static void rcu_initiate_boost_trace(struct rcu_node *rnp) |
| 1082 | { | 953 | { |
| 1083 | if (list_empty(&rnp->blkd_tasks)) | 954 | if (!rcu_preempt_has_tasks(rnp)) |
| 1084 | rnp->n_balk_blkd_tasks++; | 955 | rnp->n_balk_blkd_tasks++; |
| 1085 | else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) | 956 | else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) |
| 1086 | rnp->n_balk_exp_gp_tasks++; | 957 | rnp->n_balk_exp_gp_tasks++; |
| @@ -1127,7 +998,8 @@ static int rcu_boost(struct rcu_node *rnp) | |||
| 1127 | struct task_struct *t; | 998 | struct task_struct *t; |
| 1128 | struct list_head *tb; | 999 | struct list_head *tb; |
| 1129 | 1000 | ||
| 1130 | if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) | 1001 | if (ACCESS_ONCE(rnp->exp_tasks) == NULL && |
| 1002 | ACCESS_ONCE(rnp->boost_tasks) == NULL) | ||
| 1131 | return 0; /* Nothing left to boost. */ | 1003 | return 0; /* Nothing left to boost. */ |
| 1132 | 1004 | ||
| 1133 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1005 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| @@ -1175,15 +1047,11 @@ static int rcu_boost(struct rcu_node *rnp) | |||
| 1175 | */ | 1047 | */ |
| 1176 | t = container_of(tb, struct task_struct, rcu_node_entry); | 1048 | t = container_of(tb, struct task_struct, rcu_node_entry); |
| 1177 | rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); | 1049 | rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); |
| 1178 | init_completion(&rnp->boost_completion); | ||
| 1179 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1050 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 1180 | /* Lock only for side effect: boosts task t's priority. */ | 1051 | /* Lock only for side effect: boosts task t's priority. */ |
| 1181 | rt_mutex_lock(&rnp->boost_mtx); | 1052 | rt_mutex_lock(&rnp->boost_mtx); |
| 1182 | rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ | 1053 | rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ |
| 1183 | 1054 | ||
| 1184 | /* Wait for boostee to be done w/boost_mtx before reinitializing. */ | ||
| 1185 | wait_for_completion(&rnp->boost_completion); | ||
| 1186 | |||
| 1187 | return ACCESS_ONCE(rnp->exp_tasks) != NULL || | 1055 | return ACCESS_ONCE(rnp->exp_tasks) != NULL || |
| 1188 | ACCESS_ONCE(rnp->boost_tasks) != NULL; | 1056 | ACCESS_ONCE(rnp->boost_tasks) != NULL; |
| 1189 | } | 1057 | } |
| @@ -1416,12 +1284,8 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | |||
| 1416 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) | 1284 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) |
| 1417 | if ((mask & 0x1) && cpu != outgoingcpu) | 1285 | if ((mask & 0x1) && cpu != outgoingcpu) |
| 1418 | cpumask_set_cpu(cpu, cm); | 1286 | cpumask_set_cpu(cpu, cm); |
| 1419 | if (cpumask_weight(cm) == 0) { | 1287 | if (cpumask_weight(cm) == 0) |
| 1420 | cpumask_setall(cm); | 1288 | cpumask_setall(cm); |
| 1421 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) | ||
| 1422 | cpumask_clear_cpu(cpu, cm); | ||
| 1423 | WARN_ON_ONCE(cpumask_weight(cm) == 0); | ||
| 1424 | } | ||
| 1425 | set_cpus_allowed_ptr(t, cm); | 1289 | set_cpus_allowed_ptr(t, cm); |
| 1426 | free_cpumask_var(cm); | 1290 | free_cpumask_var(cm); |
| 1427 | } | 1291 | } |
| @@ -1446,12 +1310,8 @@ static void __init rcu_spawn_boost_kthreads(void) | |||
| 1446 | for_each_possible_cpu(cpu) | 1310 | for_each_possible_cpu(cpu) |
| 1447 | per_cpu(rcu_cpu_has_work, cpu) = 0; | 1311 | per_cpu(rcu_cpu_has_work, cpu) = 0; |
| 1448 | BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); | 1312 | BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); |
| 1449 | rnp = rcu_get_root(rcu_state_p); | 1313 | rcu_for_each_leaf_node(rcu_state_p, rnp) |
| 1450 | (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); | 1314 | (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); |
| 1451 | if (NUM_RCU_NODES > 1) { | ||
| 1452 | rcu_for_each_leaf_node(rcu_state_p, rnp) | ||
| 1453 | (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); | ||
| 1454 | } | ||
| 1455 | } | 1315 | } |
| 1456 | 1316 | ||
| 1457 | static void rcu_prepare_kthreads(int cpu) | 1317 | static void rcu_prepare_kthreads(int cpu) |
| @@ -1605,7 +1465,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) | |||
| 1605 | * completed since we last checked and there are | 1465 | * completed since we last checked and there are |
| 1606 | * callbacks not yet ready to invoke. | 1466 | * callbacks not yet ready to invoke. |
| 1607 | */ | 1467 | */ |
| 1608 | if (rdp->completed != rnp->completed && | 1468 | if ((rdp->completed != rnp->completed || |
| 1469 | unlikely(ACCESS_ONCE(rdp->gpwrap))) && | ||
| 1609 | rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) | 1470 | rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) |
| 1610 | note_gp_changes(rsp, rdp); | 1471 | note_gp_changes(rsp, rdp); |
| 1611 | 1472 | ||
| @@ -1898,11 +1759,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) | |||
| 1898 | ticks_value = rsp->gpnum - rdp->gpnum; | 1759 | ticks_value = rsp->gpnum - rdp->gpnum; |
| 1899 | } | 1760 | } |
| 1900 | print_cpu_stall_fast_no_hz(fast_no_hz, cpu); | 1761 | print_cpu_stall_fast_no_hz(fast_no_hz, cpu); |
| 1901 | pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", | 1762 | pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", |
| 1902 | cpu, ticks_value, ticks_title, | 1763 | cpu, ticks_value, ticks_title, |
| 1903 | atomic_read(&rdtp->dynticks) & 0xfff, | 1764 | atomic_read(&rdtp->dynticks) & 0xfff, |
| 1904 | rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, | 1765 | rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, |
| 1905 | rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), | 1766 | rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), |
| 1767 | ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, | ||
| 1906 | fast_no_hz); | 1768 | fast_no_hz); |
| 1907 | } | 1769 | } |
| 1908 | 1770 | ||
| @@ -2056,9 +1918,26 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) | |||
| 2056 | static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) | 1918 | static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) |
| 2057 | { | 1919 | { |
| 2058 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 1920 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
| 1921 | unsigned long ret; | ||
| 1922 | #ifdef CONFIG_PROVE_RCU | ||
| 2059 | struct rcu_head *rhp; | 1923 | struct rcu_head *rhp; |
| 1924 | #endif /* #ifdef CONFIG_PROVE_RCU */ | ||
| 2060 | 1925 | ||
| 2061 | /* No-CBs CPUs might have callbacks on any of three lists. */ | 1926 | /* |
| 1927 | * Check count of all no-CBs callbacks awaiting invocation. | ||
| 1928 | * There needs to be a barrier before this function is called, | ||
| 1929 | * but associated with a prior determination that no more | ||
| 1930 | * callbacks would be posted. In the worst case, the first | ||
| 1931 | * barrier in _rcu_barrier() suffices (but the caller cannot | ||
| 1932 | * necessarily rely on this, not a substitute for the caller | ||
| 1933 | * getting the concurrency design right!). There must also be | ||
| 1934 | * a barrier between the following load an posting of a callback | ||
| 1935 | * (if a callback is in fact needed). This is associated with an | ||
| 1936 | * atomic_inc() in the caller. | ||
| 1937 | */ | ||
| 1938 | ret = atomic_long_read(&rdp->nocb_q_count); | ||
| 1939 | |||
| 1940 | #ifdef CONFIG_PROVE_RCU | ||
| 2062 | rhp = ACCESS_ONCE(rdp->nocb_head); | 1941 | rhp = ACCESS_ONCE(rdp->nocb_head); |
| 2063 | if (!rhp) | 1942 | if (!rhp) |
| 2064 | rhp = ACCESS_ONCE(rdp->nocb_gp_head); | 1943 | rhp = ACCESS_ONCE(rdp->nocb_gp_head); |
| @@ -2072,8 +1951,9 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) | |||
| 2072 | cpu, rhp->func); | 1951 | cpu, rhp->func); |
| 2073 | WARN_ON_ONCE(1); | 1952 | WARN_ON_ONCE(1); |
| 2074 | } | 1953 | } |
| 1954 | #endif /* #ifdef CONFIG_PROVE_RCU */ | ||
| 2075 | 1955 | ||
| 2076 | return !!rhp; | 1956 | return !!ret; |
| 2077 | } | 1957 | } |
| 2078 | 1958 | ||
| 2079 | /* | 1959 | /* |
| @@ -2095,9 +1975,10 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
| 2095 | struct task_struct *t; | 1975 | struct task_struct *t; |
| 2096 | 1976 | ||
| 2097 | /* Enqueue the callback on the nocb list and update counts. */ | 1977 | /* Enqueue the callback on the nocb list and update counts. */ |
| 1978 | atomic_long_add(rhcount, &rdp->nocb_q_count); | ||
| 1979 | /* rcu_barrier() relies on ->nocb_q_count add before xchg. */ | ||
| 2098 | old_rhpp = xchg(&rdp->nocb_tail, rhtp); | 1980 | old_rhpp = xchg(&rdp->nocb_tail, rhtp); |
| 2099 | ACCESS_ONCE(*old_rhpp) = rhp; | 1981 | ACCESS_ONCE(*old_rhpp) = rhp; |
| 2100 | atomic_long_add(rhcount, &rdp->nocb_q_count); | ||
| 2101 | atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); | 1982 | atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); |
| 2102 | smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ | 1983 | smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ |
| 2103 | 1984 | ||
| @@ -2288,9 +2169,6 @@ wait_again: | |||
| 2288 | /* Move callbacks to wait-for-GP list, which is empty. */ | 2169 | /* Move callbacks to wait-for-GP list, which is empty. */ |
| 2289 | ACCESS_ONCE(rdp->nocb_head) = NULL; | 2170 | ACCESS_ONCE(rdp->nocb_head) = NULL; |
| 2290 | rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); | 2171 | rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); |
| 2291 | rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0); | ||
| 2292 | rdp->nocb_gp_count_lazy = | ||
| 2293 | atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); | ||
| 2294 | gotcbs = true; | 2172 | gotcbs = true; |
| 2295 | } | 2173 | } |
| 2296 | 2174 | ||
| @@ -2338,9 +2216,6 @@ wait_again: | |||
| 2338 | /* Append callbacks to follower's "done" list. */ | 2216 | /* Append callbacks to follower's "done" list. */ |
| 2339 | tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); | 2217 | tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); |
| 2340 | *tail = rdp->nocb_gp_head; | 2218 | *tail = rdp->nocb_gp_head; |
| 2341 | atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); | ||
| 2342 | atomic_long_add(rdp->nocb_gp_count_lazy, | ||
| 2343 | &rdp->nocb_follower_count_lazy); | ||
| 2344 | smp_mb__after_atomic(); /* Store *tail before wakeup. */ | 2219 | smp_mb__after_atomic(); /* Store *tail before wakeup. */ |
| 2345 | if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { | 2220 | if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { |
| 2346 | /* | 2221 | /* |
| @@ -2415,13 +2290,11 @@ static int rcu_nocb_kthread(void *arg) | |||
| 2415 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); | 2290 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); |
| 2416 | ACCESS_ONCE(rdp->nocb_follower_head) = NULL; | 2291 | ACCESS_ONCE(rdp->nocb_follower_head) = NULL; |
| 2417 | tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); | 2292 | tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); |
| 2418 | c = atomic_long_xchg(&rdp->nocb_follower_count, 0); | ||
| 2419 | cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0); | ||
| 2420 | rdp->nocb_p_count += c; | ||
| 2421 | rdp->nocb_p_count_lazy += cl; | ||
| 2422 | 2293 | ||
| 2423 | /* Each pass through the following loop invokes a callback. */ | 2294 | /* Each pass through the following loop invokes a callback. */ |
| 2424 | trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); | 2295 | trace_rcu_batch_start(rdp->rsp->name, |
| 2296 | atomic_long_read(&rdp->nocb_q_count_lazy), | ||
| 2297 | atomic_long_read(&rdp->nocb_q_count), -1); | ||
| 2425 | c = cl = 0; | 2298 | c = cl = 0; |
| 2426 | while (list) { | 2299 | while (list) { |
| 2427 | next = list->next; | 2300 | next = list->next; |
| @@ -2443,9 +2316,9 @@ static int rcu_nocb_kthread(void *arg) | |||
| 2443 | list = next; | 2316 | list = next; |
| 2444 | } | 2317 | } |
| 2445 | trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); | 2318 | trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); |
| 2446 | ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c; | 2319 | smp_mb__before_atomic(); /* _add after CB invocation. */ |
| 2447 | ACCESS_ONCE(rdp->nocb_p_count_lazy) = | 2320 | atomic_long_add(-c, &rdp->nocb_q_count); |
| 2448 | rdp->nocb_p_count_lazy - cl; | 2321 | atomic_long_add(-cl, &rdp->nocb_q_count_lazy); |
| 2449 | rdp->n_nocbs_invoked += c; | 2322 | rdp->n_nocbs_invoked += c; |
| 2450 | } | 2323 | } |
| 2451 | return 0; | 2324 | return 0; |
| @@ -2513,8 +2386,8 @@ void __init rcu_init_nohz(void) | |||
| 2513 | cpumask_and(rcu_nocb_mask, cpu_possible_mask, | 2386 | cpumask_and(rcu_nocb_mask, cpu_possible_mask, |
| 2514 | rcu_nocb_mask); | 2387 | rcu_nocb_mask); |
| 2515 | } | 2388 | } |
| 2516 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); | 2389 | pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", |
| 2517 | pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); | 2390 | cpumask_pr_args(rcu_nocb_mask)); |
| 2518 | if (rcu_nocb_poll) | 2391 | if (rcu_nocb_poll) |
| 2519 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); | 2392 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); |
| 2520 | 2393 | ||
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 5cdc62e1beeb..fbb6240509ea 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c | |||
| @@ -46,6 +46,8 @@ | |||
| 46 | #define RCU_TREE_NONCORE | 46 | #define RCU_TREE_NONCORE |
| 47 | #include "tree.h" | 47 | #include "tree.h" |
| 48 | 48 | ||
| 49 | DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); | ||
| 50 | |||
| 49 | static int r_open(struct inode *inode, struct file *file, | 51 | static int r_open(struct inode *inode, struct file *file, |
| 50 | const struct seq_operations *op) | 52 | const struct seq_operations *op) |
| 51 | { | 53 | { |
| @@ -115,11 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
| 115 | 117 | ||
| 116 | if (!rdp->beenonline) | 118 | if (!rdp->beenonline) |
| 117 | return; | 119 | return; |
| 118 | seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", | 120 | seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", |
| 119 | rdp->cpu, | 121 | rdp->cpu, |
| 120 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 122 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
| 121 | ulong2long(rdp->completed), ulong2long(rdp->gpnum), | 123 | ulong2long(rdp->completed), ulong2long(rdp->gpnum), |
| 122 | rdp->passed_quiesce, rdp->qs_pending); | 124 | rdp->passed_quiesce, |
| 125 | rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), | ||
| 126 | rdp->qs_pending); | ||
| 123 | seq_printf(m, " dt=%d/%llx/%d df=%lu", | 127 | seq_printf(m, " dt=%d/%llx/%d df=%lu", |
| 124 | atomic_read(&rdp->dynticks->dynticks), | 128 | atomic_read(&rdp->dynticks->dynticks), |
| 125 | rdp->dynticks->dynticks_nesting, | 129 | rdp->dynticks->dynticks_nesting, |
diff --git a/kernel/resource.c b/kernel/resource.c index 0bcebffc4e77..19f2357dfda3 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/device.h> | 22 | #include <linux/device.h> |
| 23 | #include <linux/pfn.h> | 23 | #include <linux/pfn.h> |
| 24 | #include <linux/mm.h> | 24 | #include <linux/mm.h> |
| 25 | #include <linux/resource_ext.h> | ||
| 25 | #include <asm/io.h> | 26 | #include <asm/io.h> |
| 26 | 27 | ||
| 27 | 28 | ||
| @@ -1529,6 +1530,30 @@ int iomem_is_exclusive(u64 addr) | |||
| 1529 | return err; | 1530 | return err; |
| 1530 | } | 1531 | } |
| 1531 | 1532 | ||
| 1533 | struct resource_entry *resource_list_create_entry(struct resource *res, | ||
| 1534 | size_t extra_size) | ||
| 1535 | { | ||
| 1536 | struct resource_entry *entry; | ||
| 1537 | |||
| 1538 | entry = kzalloc(sizeof(*entry) + extra_size, GFP_KERNEL); | ||
| 1539 | if (entry) { | ||
| 1540 | INIT_LIST_HEAD(&entry->node); | ||
| 1541 | entry->res = res ? res : &entry->__res; | ||
| 1542 | } | ||
| 1543 | |||
| 1544 | return entry; | ||
| 1545 | } | ||
| 1546 | EXPORT_SYMBOL(resource_list_create_entry); | ||
| 1547 | |||
| 1548 | void resource_list_free(struct list_head *head) | ||
| 1549 | { | ||
| 1550 | struct resource_entry *entry, *tmp; | ||
| 1551 | |||
| 1552 | list_for_each_entry_safe(entry, tmp, head, node) | ||
| 1553 | resource_list_destroy_entry(entry); | ||
| 1554 | } | ||
| 1555 | EXPORT_SYMBOL(resource_list_free); | ||
| 1556 | |||
| 1532 | static int __init strict_iomem(char *str) | 1557 | static int __init strict_iomem(char *str) |
| 1533 | { | 1558 | { |
| 1534 | if (strstr(str, "relaxed")) | 1559 | if (strstr(str, "relaxed")) |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index ab32b7b0db5c..46be87024875 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | ifdef CONFIG_FUNCTION_TRACER | 1 | ifdef CONFIG_FUNCTION_TRACER |
| 2 | CFLAGS_REMOVE_clock.o = -pg | 2 | CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE) |
| 3 | endif | 3 | endif |
| 4 | 4 | ||
| 5 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | 5 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) |
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 8a2e230fb86a..eae160dd669d 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c | |||
| @@ -87,8 +87,7 @@ static inline struct autogroup *autogroup_create(void) | |||
| 87 | * so we don't have to move tasks around upon policy change, | 87 | * so we don't have to move tasks around upon policy change, |
| 88 | * or flail around trying to allocate bandwidth on the fly. | 88 | * or flail around trying to allocate bandwidth on the fly. |
| 89 | * A bandwidth exception in __sched_setscheduler() allows | 89 | * A bandwidth exception in __sched_setscheduler() allows |
| 90 | * the policy change to proceed. Thereafter, task_group() | 90 | * the policy change to proceed. |
| 91 | * returns &root_task_group, so zero bandwidth is required. | ||
| 92 | */ | 91 | */ |
| 93 | free_rt_sched_group(tg); | 92 | free_rt_sched_group(tg); |
| 94 | tg->rt_se = root_task_group.rt_se; | 93 | tg->rt_se = root_task_group.rt_se; |
| @@ -115,9 +114,6 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) | |||
| 115 | if (tg != &root_task_group) | 114 | if (tg != &root_task_group) |
| 116 | return false; | 115 | return false; |
| 117 | 116 | ||
| 118 | if (p->sched_class != &fair_sched_class) | ||
| 119 | return false; | ||
| 120 | |||
| 121 | /* | 117 | /* |
| 122 | * We can only assume the task group can't go away on us if | 118 | * We can only assume the task group can't go away on us if |
| 123 | * autogroup_move_group() can see us on ->thread_group list. | 119 | * autogroup_move_group() can see us on ->thread_group list. |
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c27e4f8f4879..c0a205101c23 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c | |||
| @@ -420,3 +420,16 @@ u64 local_clock(void) | |||
| 420 | 420 | ||
| 421 | EXPORT_SYMBOL_GPL(cpu_clock); | 421 | EXPORT_SYMBOL_GPL(cpu_clock); |
| 422 | EXPORT_SYMBOL_GPL(local_clock); | 422 | EXPORT_SYMBOL_GPL(local_clock); |
| 423 | |||
| 424 | /* | ||
| 425 | * Running clock - returns the time that has elapsed while a guest has been | ||
| 426 | * running. | ||
| 427 | * On a guest this value should be local_clock minus the time the guest was | ||
| 428 | * suspended by the hypervisor (for any reason). | ||
| 429 | * On bare metal this function should return the same as local_clock. | ||
| 430 | * Architectures and sub-architectures can override this. | ||
| 431 | */ | ||
| 432 | u64 __weak running_clock(void) | ||
| 433 | { | ||
| 434 | return local_clock(); | ||
| 435 | } | ||
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 607f852b4d04..8d0f35debf35 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c | |||
| @@ -268,6 +268,15 @@ bool try_wait_for_completion(struct completion *x) | |||
| 268 | unsigned long flags; | 268 | unsigned long flags; |
| 269 | int ret = 1; | 269 | int ret = 1; |
| 270 | 270 | ||
| 271 | /* | ||
| 272 | * Since x->done will need to be locked only | ||
| 273 | * in the non-blocking case, we check x->done | ||
| 274 | * first without taking the lock so we can | ||
| 275 | * return early in the blocking case. | ||
| 276 | */ | ||
| 277 | if (!READ_ONCE(x->done)) | ||
| 278 | return 0; | ||
| 279 | |||
| 271 | spin_lock_irqsave(&x->wait.lock, flags); | 280 | spin_lock_irqsave(&x->wait.lock, flags); |
| 272 | if (!x->done) | 281 | if (!x->done) |
| 273 | ret = 0; | 282 | ret = 0; |
| @@ -288,13 +297,21 @@ EXPORT_SYMBOL(try_wait_for_completion); | |||
| 288 | */ | 297 | */ |
| 289 | bool completion_done(struct completion *x) | 298 | bool completion_done(struct completion *x) |
| 290 | { | 299 | { |
| 291 | unsigned long flags; | 300 | if (!READ_ONCE(x->done)) |
| 292 | int ret = 1; | 301 | return false; |
| 293 | 302 | ||
| 294 | spin_lock_irqsave(&x->wait.lock, flags); | 303 | /* |
| 295 | if (!x->done) | 304 | * If ->done, we need to wait for complete() to release ->wait.lock |
| 296 | ret = 0; | 305 | * otherwise we can end up freeing the completion before complete() |
| 297 | spin_unlock_irqrestore(&x->wait.lock, flags); | 306 | * is done referencing it. |
| 298 | return ret; | 307 | * |
| 308 | * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders | ||
| 309 | * the loads of ->done and ->wait.lock such that we cannot observe | ||
| 310 | * the lock before complete() acquires it while observing the ->done | ||
| 311 | * after it's acquired the lock. | ||
| 312 | */ | ||
| 313 | smp_rmb(); | ||
| 314 | spin_unlock_wait(&x->wait.lock); | ||
| 315 | return true; | ||
| 299 | } | 316 | } |
| 300 | EXPORT_SYMBOL(completion_done); | 317 | EXPORT_SYMBOL(completion_done); |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5eab11d4b747..f0f831e8a345 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -119,7 +119,9 @@ void update_rq_clock(struct rq *rq) | |||
| 119 | { | 119 | { |
| 120 | s64 delta; | 120 | s64 delta; |
| 121 | 121 | ||
| 122 | if (rq->skip_clock_update > 0) | 122 | lockdep_assert_held(&rq->lock); |
| 123 | |||
| 124 | if (rq->clock_skip_update & RQCF_ACT_SKIP) | ||
| 123 | return; | 125 | return; |
| 124 | 126 | ||
| 125 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | 127 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; |
| @@ -305,66 +307,6 @@ __read_mostly int scheduler_running; | |||
| 305 | int sysctl_sched_rt_runtime = 950000; | 307 | int sysctl_sched_rt_runtime = 950000; |
| 306 | 308 | ||
| 307 | /* | 309 | /* |
| 308 | * __task_rq_lock - lock the rq @p resides on. | ||
| 309 | */ | ||
| 310 | static inline struct rq *__task_rq_lock(struct task_struct *p) | ||
| 311 | __acquires(rq->lock) | ||
| 312 | { | ||
| 313 | struct rq *rq; | ||
| 314 | |||
| 315 | lockdep_assert_held(&p->pi_lock); | ||
| 316 | |||
| 317 | for (;;) { | ||
| 318 | rq = task_rq(p); | ||
| 319 | raw_spin_lock(&rq->lock); | ||
| 320 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) | ||
| 321 | return rq; | ||
| 322 | raw_spin_unlock(&rq->lock); | ||
| 323 | |||
| 324 | while (unlikely(task_on_rq_migrating(p))) | ||
| 325 | cpu_relax(); | ||
| 326 | } | ||
| 327 | } | ||
| 328 | |||
| 329 | /* | ||
| 330 | * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. | ||
| 331 | */ | ||
| 332 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | ||
| 333 | __acquires(p->pi_lock) | ||
| 334 | __acquires(rq->lock) | ||
| 335 | { | ||
| 336 | struct rq *rq; | ||
| 337 | |||
| 338 | for (;;) { | ||
| 339 | raw_spin_lock_irqsave(&p->pi_lock, *flags); | ||
| 340 | rq = task_rq(p); | ||
| 341 | raw_spin_lock(&rq->lock); | ||
| 342 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) | ||
| 343 | return rq; | ||
| 344 | raw_spin_unlock(&rq->lock); | ||
| 345 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | ||
| 346 | |||
| 347 | while (unlikely(task_on_rq_migrating(p))) | ||
| 348 | cpu_relax(); | ||
| 349 | } | ||
| 350 | } | ||
| 351 | |||
| 352 | static void __task_rq_unlock(struct rq *rq) | ||
| 353 | __releases(rq->lock) | ||
| 354 | { | ||
| 355 | raw_spin_unlock(&rq->lock); | ||
| 356 | } | ||
| 357 | |||
| 358 | static inline void | ||
| 359 | task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) | ||
| 360 | __releases(rq->lock) | ||
| 361 | __releases(p->pi_lock) | ||
| 362 | { | ||
| 363 | raw_spin_unlock(&rq->lock); | ||
| 364 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | ||
| 365 | } | ||
| 366 | |||
| 367 | /* | ||
| 368 | * this_rq_lock - lock this runqueue and disable interrupts. | 310 | * this_rq_lock - lock this runqueue and disable interrupts. |
| 369 | */ | 311 | */ |
| 370 | static struct rq *this_rq_lock(void) | 312 | static struct rq *this_rq_lock(void) |
| @@ -490,6 +432,11 @@ static __init void init_hrtick(void) | |||
| 490 | */ | 432 | */ |
| 491 | void hrtick_start(struct rq *rq, u64 delay) | 433 | void hrtick_start(struct rq *rq, u64 delay) |
| 492 | { | 434 | { |
| 435 | /* | ||
| 436 | * Don't schedule slices shorter than 10000ns, that just | ||
| 437 | * doesn't make sense. Rely on vruntime for fairness. | ||
| 438 | */ | ||
| 439 | delay = max_t(u64, delay, 10000LL); | ||
| 493 | __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, | 440 | __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, |
| 494 | HRTIMER_MODE_REL_PINNED, 0); | 441 | HRTIMER_MODE_REL_PINNED, 0); |
| 495 | } | 442 | } |
| @@ -1046,7 +993,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
| 1046 | * this case, we can save a useless back to back clock update. | 993 | * this case, we can save a useless back to back clock update. |
| 1047 | */ | 994 | */ |
| 1048 | if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) | 995 | if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) |
| 1049 | rq->skip_clock_update = 1; | 996 | rq_clock_skip_update(rq, true); |
| 1050 | } | 997 | } |
| 1051 | 998 | ||
| 1052 | #ifdef CONFIG_SMP | 999 | #ifdef CONFIG_SMP |
| @@ -1082,7 +1029,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 1082 | if (p->sched_class->migrate_task_rq) | 1029 | if (p->sched_class->migrate_task_rq) |
| 1083 | p->sched_class->migrate_task_rq(p, new_cpu); | 1030 | p->sched_class->migrate_task_rq(p, new_cpu); |
| 1084 | p->se.nr_migrations++; | 1031 | p->se.nr_migrations++; |
| 1085 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); | 1032 | perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); |
| 1086 | } | 1033 | } |
| 1087 | 1034 | ||
| 1088 | __set_task_cpu(p, new_cpu); | 1035 | __set_task_cpu(p, new_cpu); |
| @@ -1836,6 +1783,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 1836 | p->se.prev_sum_exec_runtime = 0; | 1783 | p->se.prev_sum_exec_runtime = 0; |
| 1837 | p->se.nr_migrations = 0; | 1784 | p->se.nr_migrations = 0; |
| 1838 | p->se.vruntime = 0; | 1785 | p->se.vruntime = 0; |
| 1786 | #ifdef CONFIG_SMP | ||
| 1787 | p->se.avg.decay_count = 0; | ||
| 1788 | #endif | ||
| 1839 | INIT_LIST_HEAD(&p->se.group_node); | 1789 | INIT_LIST_HEAD(&p->se.group_node); |
| 1840 | 1790 | ||
| 1841 | #ifdef CONFIG_SCHEDSTATS | 1791 | #ifdef CONFIG_SCHEDSTATS |
| @@ -2755,6 +2705,10 @@ again: | |||
| 2755 | * - explicit schedule() call | 2705 | * - explicit schedule() call |
| 2756 | * - return from syscall or exception to user-space | 2706 | * - return from syscall or exception to user-space |
| 2757 | * - return from interrupt-handler to user-space | 2707 | * - return from interrupt-handler to user-space |
| 2708 | * | ||
| 2709 | * WARNING: all callers must re-check need_resched() afterward and reschedule | ||
| 2710 | * accordingly in case an event triggered the need for rescheduling (such as | ||
| 2711 | * an interrupt waking up a task) while preemption was disabled in __schedule(). | ||
| 2758 | */ | 2712 | */ |
| 2759 | static void __sched __schedule(void) | 2713 | static void __sched __schedule(void) |
| 2760 | { | 2714 | { |
| @@ -2763,7 +2717,6 @@ static void __sched __schedule(void) | |||
| 2763 | struct rq *rq; | 2717 | struct rq *rq; |
| 2764 | int cpu; | 2718 | int cpu; |
| 2765 | 2719 | ||
| 2766 | need_resched: | ||
| 2767 | preempt_disable(); | 2720 | preempt_disable(); |
| 2768 | cpu = smp_processor_id(); | 2721 | cpu = smp_processor_id(); |
| 2769 | rq = cpu_rq(cpu); | 2722 | rq = cpu_rq(cpu); |
| @@ -2783,6 +2736,8 @@ need_resched: | |||
| 2783 | smp_mb__before_spinlock(); | 2736 | smp_mb__before_spinlock(); |
| 2784 | raw_spin_lock_irq(&rq->lock); | 2737 | raw_spin_lock_irq(&rq->lock); |
| 2785 | 2738 | ||
| 2739 | rq->clock_skip_update <<= 1; /* promote REQ to ACT */ | ||
| 2740 | |||
| 2786 | switch_count = &prev->nivcsw; | 2741 | switch_count = &prev->nivcsw; |
| 2787 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 2742 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
| 2788 | if (unlikely(signal_pending_state(prev->state, prev))) { | 2743 | if (unlikely(signal_pending_state(prev->state, prev))) { |
| @@ -2807,13 +2762,13 @@ need_resched: | |||
| 2807 | switch_count = &prev->nvcsw; | 2762 | switch_count = &prev->nvcsw; |
| 2808 | } | 2763 | } |
| 2809 | 2764 | ||
| 2810 | if (task_on_rq_queued(prev) || rq->skip_clock_update < 0) | 2765 | if (task_on_rq_queued(prev)) |
| 2811 | update_rq_clock(rq); | 2766 | update_rq_clock(rq); |
| 2812 | 2767 | ||
| 2813 | next = pick_next_task(rq, prev); | 2768 | next = pick_next_task(rq, prev); |
| 2814 | clear_tsk_need_resched(prev); | 2769 | clear_tsk_need_resched(prev); |
| 2815 | clear_preempt_need_resched(); | 2770 | clear_preempt_need_resched(); |
| 2816 | rq->skip_clock_update = 0; | 2771 | rq->clock_skip_update = 0; |
| 2817 | 2772 | ||
| 2818 | if (likely(prev != next)) { | 2773 | if (likely(prev != next)) { |
| 2819 | rq->nr_switches++; | 2774 | rq->nr_switches++; |
| @@ -2828,8 +2783,6 @@ need_resched: | |||
| 2828 | post_schedule(rq); | 2783 | post_schedule(rq); |
| 2829 | 2784 | ||
| 2830 | sched_preempt_enable_no_resched(); | 2785 | sched_preempt_enable_no_resched(); |
| 2831 | if (need_resched()) | ||
| 2832 | goto need_resched; | ||
| 2833 | } | 2786 | } |
| 2834 | 2787 | ||
| 2835 | static inline void sched_submit_work(struct task_struct *tsk) | 2788 | static inline void sched_submit_work(struct task_struct *tsk) |
| @@ -2849,7 +2802,9 @@ asmlinkage __visible void __sched schedule(void) | |||
| 2849 | struct task_struct *tsk = current; | 2802 | struct task_struct *tsk = current; |
| 2850 | 2803 | ||
| 2851 | sched_submit_work(tsk); | 2804 | sched_submit_work(tsk); |
| 2852 | __schedule(); | 2805 | do { |
| 2806 | __schedule(); | ||
| 2807 | } while (need_resched()); | ||
| 2853 | } | 2808 | } |
| 2854 | EXPORT_SYMBOL(schedule); | 2809 | EXPORT_SYMBOL(schedule); |
| 2855 | 2810 | ||
| @@ -2884,6 +2839,21 @@ void __sched schedule_preempt_disabled(void) | |||
| 2884 | preempt_disable(); | 2839 | preempt_disable(); |
| 2885 | } | 2840 | } |
| 2886 | 2841 | ||
| 2842 | static void __sched notrace preempt_schedule_common(void) | ||
| 2843 | { | ||
| 2844 | do { | ||
| 2845 | __preempt_count_add(PREEMPT_ACTIVE); | ||
| 2846 | __schedule(); | ||
| 2847 | __preempt_count_sub(PREEMPT_ACTIVE); | ||
| 2848 | |||
| 2849 | /* | ||
| 2850 | * Check again in case we missed a preemption opportunity | ||
| 2851 | * between schedule and now. | ||
| 2852 | */ | ||
| 2853 | barrier(); | ||
| 2854 | } while (need_resched()); | ||
| 2855 | } | ||
| 2856 | |||
| 2887 | #ifdef CONFIG_PREEMPT | 2857 | #ifdef CONFIG_PREEMPT |
| 2888 | /* | 2858 | /* |
| 2889 | * this is the entry point to schedule() from in-kernel preemption | 2859 | * this is the entry point to schedule() from in-kernel preemption |
| @@ -2899,17 +2869,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) | |||
| 2899 | if (likely(!preemptible())) | 2869 | if (likely(!preemptible())) |
| 2900 | return; | 2870 | return; |
| 2901 | 2871 | ||
| 2902 | do { | 2872 | preempt_schedule_common(); |
| 2903 | __preempt_count_add(PREEMPT_ACTIVE); | ||
| 2904 | __schedule(); | ||
| 2905 | __preempt_count_sub(PREEMPT_ACTIVE); | ||
| 2906 | |||
| 2907 | /* | ||
| 2908 | * Check again in case we missed a preemption opportunity | ||
| 2909 | * between schedule and now. | ||
| 2910 | */ | ||
| 2911 | barrier(); | ||
| 2912 | } while (need_resched()); | ||
| 2913 | } | 2873 | } |
| 2914 | NOKPROBE_SYMBOL(preempt_schedule); | 2874 | NOKPROBE_SYMBOL(preempt_schedule); |
| 2915 | EXPORT_SYMBOL(preempt_schedule); | 2875 | EXPORT_SYMBOL(preempt_schedule); |
| @@ -3405,6 +3365,20 @@ static bool check_same_owner(struct task_struct *p) | |||
| 3405 | return match; | 3365 | return match; |
| 3406 | } | 3366 | } |
| 3407 | 3367 | ||
| 3368 | static bool dl_param_changed(struct task_struct *p, | ||
| 3369 | const struct sched_attr *attr) | ||
| 3370 | { | ||
| 3371 | struct sched_dl_entity *dl_se = &p->dl; | ||
| 3372 | |||
| 3373 | if (dl_se->dl_runtime != attr->sched_runtime || | ||
| 3374 | dl_se->dl_deadline != attr->sched_deadline || | ||
| 3375 | dl_se->dl_period != attr->sched_period || | ||
| 3376 | dl_se->flags != attr->sched_flags) | ||
| 3377 | return true; | ||
| 3378 | |||
| 3379 | return false; | ||
| 3380 | } | ||
| 3381 | |||
| 3408 | static int __sched_setscheduler(struct task_struct *p, | 3382 | static int __sched_setscheduler(struct task_struct *p, |
| 3409 | const struct sched_attr *attr, | 3383 | const struct sched_attr *attr, |
| 3410 | bool user) | 3384 | bool user) |
| @@ -3533,7 +3507,7 @@ recheck: | |||
| 3533 | goto change; | 3507 | goto change; |
| 3534 | if (rt_policy(policy) && attr->sched_priority != p->rt_priority) | 3508 | if (rt_policy(policy) && attr->sched_priority != p->rt_priority) |
| 3535 | goto change; | 3509 | goto change; |
| 3536 | if (dl_policy(policy)) | 3510 | if (dl_policy(policy) && dl_param_changed(p, attr)) |
| 3537 | goto change; | 3511 | goto change; |
| 3538 | 3512 | ||
| 3539 | p->sched_reset_on_fork = reset_on_fork; | 3513 | p->sched_reset_on_fork = reset_on_fork; |
| @@ -4225,17 +4199,10 @@ SYSCALL_DEFINE0(sched_yield) | |||
| 4225 | return 0; | 4199 | return 0; |
| 4226 | } | 4200 | } |
| 4227 | 4201 | ||
| 4228 | static void __cond_resched(void) | ||
| 4229 | { | ||
| 4230 | __preempt_count_add(PREEMPT_ACTIVE); | ||
| 4231 | __schedule(); | ||
| 4232 | __preempt_count_sub(PREEMPT_ACTIVE); | ||
| 4233 | } | ||
| 4234 | |||
| 4235 | int __sched _cond_resched(void) | 4202 | int __sched _cond_resched(void) |
| 4236 | { | 4203 | { |
| 4237 | if (should_resched()) { | 4204 | if (should_resched()) { |
| 4238 | __cond_resched(); | 4205 | preempt_schedule_common(); |
| 4239 | return 1; | 4206 | return 1; |
| 4240 | } | 4207 | } |
| 4241 | return 0; | 4208 | return 0; |
| @@ -4260,7 +4227,7 @@ int __cond_resched_lock(spinlock_t *lock) | |||
| 4260 | if (spin_needbreak(lock) || resched) { | 4227 | if (spin_needbreak(lock) || resched) { |
| 4261 | spin_unlock(lock); | 4228 | spin_unlock(lock); |
| 4262 | if (resched) | 4229 | if (resched) |
| 4263 | __cond_resched(); | 4230 | preempt_schedule_common(); |
| 4264 | else | 4231 | else |
| 4265 | cpu_relax(); | 4232 | cpu_relax(); |
| 4266 | ret = 1; | 4233 | ret = 1; |
| @@ -4276,7 +4243,7 @@ int __sched __cond_resched_softirq(void) | |||
| 4276 | 4243 | ||
| 4277 | if (should_resched()) { | 4244 | if (should_resched()) { |
| 4278 | local_bh_enable(); | 4245 | local_bh_enable(); |
| 4279 | __cond_resched(); | 4246 | preempt_schedule_common(); |
| 4280 | local_bh_disable(); | 4247 | local_bh_disable(); |
| 4281 | return 1; | 4248 | return 1; |
| 4282 | } | 4249 | } |
| @@ -4391,36 +4358,29 @@ EXPORT_SYMBOL_GPL(yield_to); | |||
| 4391 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | 4358 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so |
| 4392 | * that process accounting knows that this is a task in IO wait state. | 4359 | * that process accounting knows that this is a task in IO wait state. |
| 4393 | */ | 4360 | */ |
| 4394 | void __sched io_schedule(void) | ||
| 4395 | { | ||
| 4396 | struct rq *rq = raw_rq(); | ||
| 4397 | |||
| 4398 | delayacct_blkio_start(); | ||
| 4399 | atomic_inc(&rq->nr_iowait); | ||
| 4400 | blk_flush_plug(current); | ||
| 4401 | current->in_iowait = 1; | ||
| 4402 | schedule(); | ||
| 4403 | current->in_iowait = 0; | ||
| 4404 | atomic_dec(&rq->nr_iowait); | ||
| 4405 | delayacct_blkio_end(); | ||
| 4406 | } | ||
| 4407 | EXPORT_SYMBOL(io_schedule); | ||
| 4408 | |||
| 4409 | long __sched io_schedule_timeout(long timeout) | 4361 | long __sched io_schedule_timeout(long timeout) |
| 4410 | { | 4362 | { |
| 4411 | struct rq *rq = raw_rq(); | 4363 | int old_iowait = current->in_iowait; |
| 4364 | struct rq *rq; | ||
| 4412 | long ret; | 4365 | long ret; |
| 4413 | 4366 | ||
| 4367 | current->in_iowait = 1; | ||
| 4368 | if (old_iowait) | ||
| 4369 | blk_schedule_flush_plug(current); | ||
| 4370 | else | ||
| 4371 | blk_flush_plug(current); | ||
| 4372 | |||
| 4414 | delayacct_blkio_start(); | 4373 | delayacct_blkio_start(); |
| 4374 | rq = raw_rq(); | ||
| 4415 | atomic_inc(&rq->nr_iowait); | 4375 | atomic_inc(&rq->nr_iowait); |
| 4416 | blk_flush_plug(current); | ||
| 4417 | current->in_iowait = 1; | ||
| 4418 | ret = schedule_timeout(timeout); | 4376 | ret = schedule_timeout(timeout); |
| 4419 | current->in_iowait = 0; | 4377 | current->in_iowait = old_iowait; |
| 4420 | atomic_dec(&rq->nr_iowait); | 4378 | atomic_dec(&rq->nr_iowait); |
| 4421 | delayacct_blkio_end(); | 4379 | delayacct_blkio_end(); |
| 4380 | |||
| 4422 | return ret; | 4381 | return ret; |
| 4423 | } | 4382 | } |
| 4383 | EXPORT_SYMBOL(io_schedule_timeout); | ||
| 4424 | 4384 | ||
| 4425 | /** | 4385 | /** |
| 4426 | * sys_sched_get_priority_max - return maximum RT priority. | 4386 | * sys_sched_get_priority_max - return maximum RT priority. |
| @@ -4531,9 +4491,10 @@ void sched_show_task(struct task_struct *p) | |||
| 4531 | { | 4491 | { |
| 4532 | unsigned long free = 0; | 4492 | unsigned long free = 0; |
| 4533 | int ppid; | 4493 | int ppid; |
| 4534 | unsigned state; | 4494 | unsigned long state = p->state; |
| 4535 | 4495 | ||
| 4536 | state = p->state ? __ffs(p->state) + 1 : 0; | 4496 | if (state) |
| 4497 | state = __ffs(state) + 1; | ||
| 4537 | printk(KERN_INFO "%-15.15s %c", p->comm, | 4498 | printk(KERN_INFO "%-15.15s %c", p->comm, |
| 4538 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); | 4499 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
| 4539 | #if BITS_PER_LONG == 32 | 4500 | #if BITS_PER_LONG == 32 |
| @@ -4766,7 +4727,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu) | |||
| 4766 | 4727 | ||
| 4767 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | 4728 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) |
| 4768 | { | 4729 | { |
| 4769 | if (p->sched_class && p->sched_class->set_cpus_allowed) | 4730 | if (p->sched_class->set_cpus_allowed) |
| 4770 | p->sched_class->set_cpus_allowed(p, new_mask); | 4731 | p->sched_class->set_cpus_allowed(p, new_mask); |
| 4771 | 4732 | ||
| 4772 | cpumask_copy(&p->cpus_allowed, new_mask); | 4733 | cpumask_copy(&p->cpus_allowed, new_mask); |
| @@ -5434,9 +5395,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 5434 | struct cpumask *groupmask) | 5395 | struct cpumask *groupmask) |
| 5435 | { | 5396 | { |
| 5436 | struct sched_group *group = sd->groups; | 5397 | struct sched_group *group = sd->groups; |
| 5437 | char str[256]; | ||
| 5438 | 5398 | ||
| 5439 | cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); | ||
| 5440 | cpumask_clear(groupmask); | 5399 | cpumask_clear(groupmask); |
| 5441 | 5400 | ||
| 5442 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); | 5401 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); |
| @@ -5449,7 +5408,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 5449 | return -1; | 5408 | return -1; |
| 5450 | } | 5409 | } |
| 5451 | 5410 | ||
| 5452 | printk(KERN_CONT "span %s level %s\n", str, sd->name); | 5411 | printk(KERN_CONT "span %*pbl level %s\n", |
| 5412 | cpumask_pr_args(sched_domain_span(sd)), sd->name); | ||
| 5453 | 5413 | ||
| 5454 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { | 5414 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { |
| 5455 | printk(KERN_ERR "ERROR: domain->span does not contain " | 5415 | printk(KERN_ERR "ERROR: domain->span does not contain " |
| @@ -5494,9 +5454,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 5494 | 5454 | ||
| 5495 | cpumask_or(groupmask, groupmask, sched_group_cpus(group)); | 5455 | cpumask_or(groupmask, groupmask, sched_group_cpus(group)); |
| 5496 | 5456 | ||
| 5497 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); | 5457 | printk(KERN_CONT " %*pbl", |
| 5498 | 5458 | cpumask_pr_args(sched_group_cpus(group))); | |
| 5499 | printk(KERN_CONT " %s", str); | ||
| 5500 | if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { | 5459 | if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { |
| 5501 | printk(KERN_CONT " (cpu_capacity = %d)", | 5460 | printk(KERN_CONT " (cpu_capacity = %d)", |
| 5502 | group->sgc->capacity); | 5461 | group->sgc->capacity); |
| @@ -7276,6 +7235,11 @@ void __init sched_init(void) | |||
| 7276 | enter_lazy_tlb(&init_mm, current); | 7235 | enter_lazy_tlb(&init_mm, current); |
| 7277 | 7236 | ||
| 7278 | /* | 7237 | /* |
| 7238 | * During early bootup we pretend to be a normal task: | ||
| 7239 | */ | ||
| 7240 | current->sched_class = &fair_sched_class; | ||
| 7241 | |||
| 7242 | /* | ||
| 7279 | * Make us the idle thread. Technically, schedule() should not be | 7243 | * Make us the idle thread. Technically, schedule() should not be |
| 7280 | * called from this thread, however somewhere below it might be, | 7244 | * called from this thread, however somewhere below it might be, |
| 7281 | * but because we are the idle thread, we just pick up running again | 7245 | * but because we are the idle thread, we just pick up running again |
| @@ -7285,11 +7249,6 @@ void __init sched_init(void) | |||
| 7285 | 7249 | ||
| 7286 | calc_load_update = jiffies + LOAD_FREQ; | 7250 | calc_load_update = jiffies + LOAD_FREQ; |
| 7287 | 7251 | ||
| 7288 | /* | ||
| 7289 | * During early bootup we pretend to be a normal task: | ||
| 7290 | */ | ||
| 7291 | current->sched_class = &fair_sched_class; | ||
| 7292 | |||
| 7293 | #ifdef CONFIG_SMP | 7252 | #ifdef CONFIG_SMP |
| 7294 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); | 7253 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); |
| 7295 | /* May be allocated at isolcpus cmdline parse time */ | 7254 | /* May be allocated at isolcpus cmdline parse time */ |
| @@ -7350,6 +7309,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset) | |||
| 7350 | in_atomic(), irqs_disabled(), | 7309 | in_atomic(), irqs_disabled(), |
| 7351 | current->pid, current->comm); | 7310 | current->pid, current->comm); |
| 7352 | 7311 | ||
| 7312 | if (task_stack_end_corrupted(current)) | ||
| 7313 | printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); | ||
| 7314 | |||
| 7353 | debug_show_held_locks(current); | 7315 | debug_show_held_locks(current); |
| 7354 | if (irqs_disabled()) | 7316 | if (irqs_disabled()) |
| 7355 | print_irqtrace_events(current); | 7317 | print_irqtrace_events(current); |
| @@ -7613,6 +7575,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg) | |||
| 7613 | { | 7575 | { |
| 7614 | struct task_struct *g, *p; | 7576 | struct task_struct *g, *p; |
| 7615 | 7577 | ||
| 7578 | /* | ||
| 7579 | * Autogroups do not have RT tasks; see autogroup_create(). | ||
| 7580 | */ | ||
| 7581 | if (task_group_is_autogroup(tg)) | ||
| 7582 | return 0; | ||
| 7583 | |||
| 7616 | for_each_process_thread(g, p) { | 7584 | for_each_process_thread(g, p) { |
| 7617 | if (rt_task(p) && task_group(p) == tg) | 7585 | if (rt_task(p) && task_group(p) == tg) |
| 7618 | return 1; | 7586 | return 1; |
| @@ -7705,6 +7673,17 @@ static int tg_set_rt_bandwidth(struct task_group *tg, | |||
| 7705 | { | 7673 | { |
| 7706 | int i, err = 0; | 7674 | int i, err = 0; |
| 7707 | 7675 | ||
| 7676 | /* | ||
| 7677 | * Disallowing the root group RT runtime is BAD, it would disallow the | ||
| 7678 | * kernel creating (and or operating) RT threads. | ||
| 7679 | */ | ||
| 7680 | if (tg == &root_task_group && rt_runtime == 0) | ||
| 7681 | return -EINVAL; | ||
| 7682 | |||
| 7683 | /* No period doesn't make any sense. */ | ||
| 7684 | if (rt_period == 0) | ||
| 7685 | return -EINVAL; | ||
| 7686 | |||
| 7708 | mutex_lock(&rt_constraints_mutex); | 7687 | mutex_lock(&rt_constraints_mutex); |
| 7709 | read_lock(&tasklist_lock); | 7688 | read_lock(&tasklist_lock); |
| 7710 | err = __rt_schedulable(tg, rt_period, rt_runtime); | 7689 | err = __rt_schedulable(tg, rt_period, rt_runtime); |
| @@ -7761,9 +7740,6 @@ static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | |||
| 7761 | rt_period = (u64)rt_period_us * NSEC_PER_USEC; | 7740 | rt_period = (u64)rt_period_us * NSEC_PER_USEC; |
| 7762 | rt_runtime = tg->rt_bandwidth.rt_runtime; | 7741 | rt_runtime = tg->rt_bandwidth.rt_runtime; |
| 7763 | 7742 | ||
| 7764 | if (rt_period == 0) | ||
| 7765 | return -EINVAL; | ||
| 7766 | |||
| 7767 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); | 7743 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
| 7768 | } | 7744 | } |
| 7769 | 7745 | ||
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 539ca3ce071b..c6acb07466bb 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c | |||
| @@ -107,7 +107,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
| 107 | int best_cpu = -1; | 107 | int best_cpu = -1; |
| 108 | const struct sched_dl_entity *dl_se = &p->dl; | 108 | const struct sched_dl_entity *dl_se = &p->dl; |
| 109 | 109 | ||
| 110 | if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) { | 110 | if (later_mask && |
| 111 | cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { | ||
| 111 | best_cpu = cpumask_any(later_mask); | 112 | best_cpu = cpumask_any(later_mask); |
| 112 | goto out; | 113 | goto out; |
| 113 | } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && | 114 | } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && |
| @@ -186,6 +187,26 @@ out: | |||
| 186 | } | 187 | } |
| 187 | 188 | ||
| 188 | /* | 189 | /* |
| 190 | * cpudl_set_freecpu - Set the cpudl.free_cpus | ||
| 191 | * @cp: the cpudl max-heap context | ||
| 192 | * @cpu: rd attached cpu | ||
| 193 | */ | ||
| 194 | void cpudl_set_freecpu(struct cpudl *cp, int cpu) | ||
| 195 | { | ||
| 196 | cpumask_set_cpu(cpu, cp->free_cpus); | ||
| 197 | } | ||
| 198 | |||
| 199 | /* | ||
| 200 | * cpudl_clear_freecpu - Clear the cpudl.free_cpus | ||
| 201 | * @cp: the cpudl max-heap context | ||
| 202 | * @cpu: rd attached cpu | ||
| 203 | */ | ||
| 204 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu) | ||
| 205 | { | ||
| 206 | cpumask_clear_cpu(cpu, cp->free_cpus); | ||
| 207 | } | ||
| 208 | |||
| 209 | /* | ||
| 189 | * cpudl_init - initialize the cpudl structure | 210 | * cpudl_init - initialize the cpudl structure |
| 190 | * @cp: the cpudl max-heap context | 211 | * @cp: the cpudl max-heap context |
| 191 | */ | 212 | */ |
| @@ -203,7 +224,7 @@ int cpudl_init(struct cpudl *cp) | |||
| 203 | if (!cp->elements) | 224 | if (!cp->elements) |
| 204 | return -ENOMEM; | 225 | return -ENOMEM; |
| 205 | 226 | ||
| 206 | if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { | 227 | if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { |
| 207 | kfree(cp->elements); | 228 | kfree(cp->elements); |
| 208 | return -ENOMEM; | 229 | return -ENOMEM; |
| 209 | } | 230 | } |
| @@ -211,8 +232,6 @@ int cpudl_init(struct cpudl *cp) | |||
| 211 | for_each_possible_cpu(i) | 232 | for_each_possible_cpu(i) |
| 212 | cp->elements[i].idx = IDX_INVALID; | 233 | cp->elements[i].idx = IDX_INVALID; |
| 213 | 234 | ||
| 214 | cpumask_setall(cp->free_cpus); | ||
| 215 | |||
| 216 | return 0; | 235 | return 0; |
| 217 | } | 236 | } |
| 218 | 237 | ||
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 020039bd1326..1a0a6ef2fbe1 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h | |||
| @@ -24,6 +24,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
| 24 | struct cpumask *later_mask); | 24 | struct cpumask *later_mask); |
| 25 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); | 25 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); |
| 26 | int cpudl_init(struct cpudl *cp); | 26 | int cpudl_init(struct cpudl *cp); |
| 27 | void cpudl_set_freecpu(struct cpudl *cp, int cpu); | ||
| 28 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu); | ||
| 27 | void cpudl_cleanup(struct cpudl *cp); | 29 | void cpudl_cleanup(struct cpudl *cp); |
| 28 | #endif /* CONFIG_SMP */ | 30 | #endif /* CONFIG_SMP */ |
| 29 | 31 | ||
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 726470d47f87..3fa8fa6d9403 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
| @@ -350,6 +350,11 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, | |||
| 350 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | 350 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; |
| 351 | dl_se->runtime = pi_se->dl_runtime; | 351 | dl_se->runtime = pi_se->dl_runtime; |
| 352 | } | 352 | } |
| 353 | |||
| 354 | if (dl_se->dl_yielded) | ||
| 355 | dl_se->dl_yielded = 0; | ||
| 356 | if (dl_se->dl_throttled) | ||
| 357 | dl_se->dl_throttled = 0; | ||
| 353 | } | 358 | } |
| 354 | 359 | ||
| 355 | /* | 360 | /* |
| @@ -506,16 +511,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
| 506 | struct sched_dl_entity, | 511 | struct sched_dl_entity, |
| 507 | dl_timer); | 512 | dl_timer); |
| 508 | struct task_struct *p = dl_task_of(dl_se); | 513 | struct task_struct *p = dl_task_of(dl_se); |
| 514 | unsigned long flags; | ||
| 509 | struct rq *rq; | 515 | struct rq *rq; |
| 510 | again: | ||
| 511 | rq = task_rq(p); | ||
| 512 | raw_spin_lock(&rq->lock); | ||
| 513 | 516 | ||
| 514 | if (rq != task_rq(p)) { | 517 | rq = task_rq_lock(current, &flags); |
| 515 | /* Task was moved, retrying. */ | ||
| 516 | raw_spin_unlock(&rq->lock); | ||
| 517 | goto again; | ||
| 518 | } | ||
| 519 | 518 | ||
| 520 | /* | 519 | /* |
| 521 | * We need to take care of several possible races here: | 520 | * We need to take care of several possible races here: |
| @@ -536,25 +535,41 @@ again: | |||
| 536 | 535 | ||
| 537 | sched_clock_tick(); | 536 | sched_clock_tick(); |
| 538 | update_rq_clock(rq); | 537 | update_rq_clock(rq); |
| 539 | dl_se->dl_throttled = 0; | 538 | |
| 540 | dl_se->dl_yielded = 0; | 539 | /* |
| 541 | if (task_on_rq_queued(p)) { | 540 | * If the throttle happened during sched-out; like: |
| 542 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | 541 | * |
| 543 | if (dl_task(rq->curr)) | 542 | * schedule() |
| 544 | check_preempt_curr_dl(rq, p, 0); | 543 | * deactivate_task() |
| 545 | else | 544 | * dequeue_task_dl() |
| 546 | resched_curr(rq); | 545 | * update_curr_dl() |
| 546 | * start_dl_timer() | ||
| 547 | * __dequeue_task_dl() | ||
| 548 | * prev->on_rq = 0; | ||
| 549 | * | ||
| 550 | * We can be both throttled and !queued. Replenish the counter | ||
| 551 | * but do not enqueue -- wait for our wakeup to do that. | ||
| 552 | */ | ||
| 553 | if (!task_on_rq_queued(p)) { | ||
| 554 | replenish_dl_entity(dl_se, dl_se); | ||
| 555 | goto unlock; | ||
| 556 | } | ||
| 557 | |||
| 558 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | ||
| 559 | if (dl_task(rq->curr)) | ||
| 560 | check_preempt_curr_dl(rq, p, 0); | ||
| 561 | else | ||
| 562 | resched_curr(rq); | ||
| 547 | #ifdef CONFIG_SMP | 563 | #ifdef CONFIG_SMP |
| 548 | /* | 564 | /* |
| 549 | * Queueing this task back might have overloaded rq, | 565 | * Queueing this task back might have overloaded rq, |
| 550 | * check if we need to kick someone away. | 566 | * check if we need to kick someone away. |
| 551 | */ | 567 | */ |
| 552 | if (has_pushable_dl_tasks(rq)) | 568 | if (has_pushable_dl_tasks(rq)) |
| 553 | push_dl_task(rq); | 569 | push_dl_task(rq); |
| 554 | #endif | 570 | #endif |
| 555 | } | ||
| 556 | unlock: | 571 | unlock: |
| 557 | raw_spin_unlock(&rq->lock); | 572 | task_rq_unlock(rq, current, &flags); |
| 558 | 573 | ||
| 559 | return HRTIMER_NORESTART; | 574 | return HRTIMER_NORESTART; |
| 560 | } | 575 | } |
| @@ -613,10 +628,9 @@ static void update_curr_dl(struct rq *rq) | |||
| 613 | 628 | ||
| 614 | dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; | 629 | dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; |
| 615 | if (dl_runtime_exceeded(rq, dl_se)) { | 630 | if (dl_runtime_exceeded(rq, dl_se)) { |
| 631 | dl_se->dl_throttled = 1; | ||
| 616 | __dequeue_task_dl(rq, curr, 0); | 632 | __dequeue_task_dl(rq, curr, 0); |
| 617 | if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) | 633 | if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted))) |
| 618 | dl_se->dl_throttled = 1; | ||
| 619 | else | ||
| 620 | enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); | 634 | enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); |
| 621 | 635 | ||
| 622 | if (!is_leftmost(curr, &rq->dl)) | 636 | if (!is_leftmost(curr, &rq->dl)) |
| @@ -853,7 +867,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) | |||
| 853 | * its rq, the bandwidth timer callback (which clearly has not | 867 | * its rq, the bandwidth timer callback (which clearly has not |
| 854 | * run yet) will take care of this. | 868 | * run yet) will take care of this. |
| 855 | */ | 869 | */ |
| 856 | if (p->dl.dl_throttled) | 870 | if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) |
| 857 | return; | 871 | return; |
| 858 | 872 | ||
| 859 | enqueue_dl_entity(&p->dl, pi_se, flags); | 873 | enqueue_dl_entity(&p->dl, pi_se, flags); |
| @@ -898,6 +912,7 @@ static void yield_task_dl(struct rq *rq) | |||
| 898 | rq->curr->dl.dl_yielded = 1; | 912 | rq->curr->dl.dl_yielded = 1; |
| 899 | p->dl.runtime = 0; | 913 | p->dl.runtime = 0; |
| 900 | } | 914 | } |
| 915 | update_rq_clock(rq); | ||
| 901 | update_curr_dl(rq); | 916 | update_curr_dl(rq); |
| 902 | } | 917 | } |
| 903 | 918 | ||
| @@ -1073,7 +1088,13 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) | |||
| 1073 | { | 1088 | { |
| 1074 | update_curr_dl(rq); | 1089 | update_curr_dl(rq); |
| 1075 | 1090 | ||
| 1076 | if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) | 1091 | /* |
| 1092 | * Even when we have runtime, update_curr_dl() might have resulted in us | ||
| 1093 | * not being the leftmost task anymore. In that case NEED_RESCHED will | ||
| 1094 | * be set and schedule() will start a new hrtick for the next task. | ||
| 1095 | */ | ||
| 1096 | if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 && | ||
| 1097 | is_leftmost(p, &rq->dl)) | ||
| 1077 | start_hrtick_dl(rq, p); | 1098 | start_hrtick_dl(rq, p); |
| 1078 | } | 1099 | } |
| 1079 | 1100 | ||
| @@ -1166,9 +1187,6 @@ static int find_later_rq(struct task_struct *task) | |||
| 1166 | * We have to consider system topology and task affinity | 1187 | * We have to consider system topology and task affinity |
| 1167 | * first, then we can look for a suitable cpu. | 1188 | * first, then we can look for a suitable cpu. |
| 1168 | */ | 1189 | */ |
| 1169 | cpumask_copy(later_mask, task_rq(task)->rd->span); | ||
| 1170 | cpumask_and(later_mask, later_mask, cpu_active_mask); | ||
| 1171 | cpumask_and(later_mask, later_mask, &task->cpus_allowed); | ||
| 1172 | best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, | 1190 | best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, |
| 1173 | task, later_mask); | 1191 | task, later_mask); |
| 1174 | if (best_cpu == -1) | 1192 | if (best_cpu == -1) |
| @@ -1563,6 +1581,7 @@ static void rq_online_dl(struct rq *rq) | |||
| 1563 | if (rq->dl.overloaded) | 1581 | if (rq->dl.overloaded) |
| 1564 | dl_set_overload(rq); | 1582 | dl_set_overload(rq); |
| 1565 | 1583 | ||
| 1584 | cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); | ||
| 1566 | if (rq->dl.dl_nr_running > 0) | 1585 | if (rq->dl.dl_nr_running > 0) |
| 1567 | cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); | 1586 | cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); |
| 1568 | } | 1587 | } |
| @@ -1574,6 +1593,7 @@ static void rq_offline_dl(struct rq *rq) | |||
| 1574 | dl_clear_overload(rq); | 1593 | dl_clear_overload(rq); |
| 1575 | 1594 | ||
| 1576 | cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); | 1595 | cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); |
| 1596 | cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); | ||
| 1577 | } | 1597 | } |
| 1578 | 1598 | ||
| 1579 | void init_sched_dl_class(void) | 1599 | void init_sched_dl_class(void) |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 92cc52001e74..8baaf858d25c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
| @@ -305,6 +305,7 @@ do { \ | |||
| 305 | PN(next_balance); | 305 | PN(next_balance); |
| 306 | SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); | 306 | SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); |
| 307 | PN(clock); | 307 | PN(clock); |
| 308 | PN(clock_task); | ||
| 308 | P(cpu_load[0]); | 309 | P(cpu_load[0]); |
| 309 | P(cpu_load[1]); | 310 | P(cpu_load[1]); |
| 310 | P(cpu_load[2]); | 311 | P(cpu_load[2]); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe331fc391f5..7ce18f3c097a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -676,7 +676,6 @@ void init_task_runnable_average(struct task_struct *p) | |||
| 676 | { | 676 | { |
| 677 | u32 slice; | 677 | u32 slice; |
| 678 | 678 | ||
| 679 | p->se.avg.decay_count = 0; | ||
| 680 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; | 679 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; |
| 681 | p->se.avg.runnable_avg_sum = slice; | 680 | p->se.avg.runnable_avg_sum = slice; |
| 682 | p->se.avg.runnable_avg_period = slice; | 681 | p->se.avg.runnable_avg_period = slice; |
| @@ -2574,11 +2573,11 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) | |||
| 2574 | u64 decays = atomic64_read(&cfs_rq->decay_counter); | 2573 | u64 decays = atomic64_read(&cfs_rq->decay_counter); |
| 2575 | 2574 | ||
| 2576 | decays -= se->avg.decay_count; | 2575 | decays -= se->avg.decay_count; |
| 2576 | se->avg.decay_count = 0; | ||
| 2577 | if (!decays) | 2577 | if (!decays) |
| 2578 | return 0; | 2578 | return 0; |
| 2579 | 2579 | ||
| 2580 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); | 2580 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); |
| 2581 | se->avg.decay_count = 0; | ||
| 2582 | 2581 | ||
| 2583 | return decays; | 2582 | return decays; |
| 2584 | } | 2583 | } |
| @@ -5157,7 +5156,7 @@ static void yield_task_fair(struct rq *rq) | |||
| 5157 | * so we don't do microscopic update in schedule() | 5156 | * so we don't do microscopic update in schedule() |
| 5158 | * and double the fastpath cost. | 5157 | * and double the fastpath cost. |
| 5159 | */ | 5158 | */ |
| 5160 | rq->skip_clock_update = 1; | 5159 | rq_clock_skip_update(rq, true); |
| 5161 | } | 5160 | } |
| 5162 | 5161 | ||
| 5163 | set_skip_buddy(se); | 5162 | set_skip_buddy(se); |
| @@ -5949,8 +5948,8 @@ static unsigned long scale_rt_capacity(int cpu) | |||
| 5949 | */ | 5948 | */ |
| 5950 | age_stamp = ACCESS_ONCE(rq->age_stamp); | 5949 | age_stamp = ACCESS_ONCE(rq->age_stamp); |
| 5951 | avg = ACCESS_ONCE(rq->rt_avg); | 5950 | avg = ACCESS_ONCE(rq->rt_avg); |
| 5951 | delta = __rq_clock_broken(rq) - age_stamp; | ||
| 5952 | 5952 | ||
| 5953 | delta = rq_clock(rq) - age_stamp; | ||
| 5954 | if (unlikely(delta < 0)) | 5953 | if (unlikely(delta < 0)) |
| 5955 | delta = 0; | 5954 | delta = 0; |
| 5956 | 5955 | ||
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c47fce75e666..94b2d7b88a27 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
| @@ -7,6 +7,7 @@ | |||
| 7 | #include <linux/tick.h> | 7 | #include <linux/tick.h> |
| 8 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
| 9 | #include <linux/stackprotector.h> | 9 | #include <linux/stackprotector.h> |
| 10 | #include <linux/suspend.h> | ||
| 10 | 11 | ||
| 11 | #include <asm/tlb.h> | 12 | #include <asm/tlb.h> |
| 12 | 13 | ||
| @@ -47,7 +48,8 @@ static inline int cpu_idle_poll(void) | |||
| 47 | rcu_idle_enter(); | 48 | rcu_idle_enter(); |
| 48 | trace_cpu_idle_rcuidle(0, smp_processor_id()); | 49 | trace_cpu_idle_rcuidle(0, smp_processor_id()); |
| 49 | local_irq_enable(); | 50 | local_irq_enable(); |
| 50 | while (!tif_need_resched()) | 51 | while (!tif_need_resched() && |
| 52 | (cpu_idle_force_poll || tick_check_broadcast_expired())) | ||
| 51 | cpu_relax(); | 53 | cpu_relax(); |
| 52 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); | 54 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); |
| 53 | rcu_idle_exit(); | 55 | rcu_idle_exit(); |
| @@ -104,6 +106,21 @@ static void cpuidle_idle_call(void) | |||
| 104 | rcu_idle_enter(); | 106 | rcu_idle_enter(); |
| 105 | 107 | ||
| 106 | /* | 108 | /* |
| 109 | * Suspend-to-idle ("freeze") is a system state in which all user space | ||
| 110 | * has been frozen, all I/O devices have been suspended and the only | ||
| 111 | * activity happens here and in iterrupts (if any). In that case bypass | ||
| 112 | * the cpuidle governor and go stratight for the deepest idle state | ||
| 113 | * available. Possibly also suspend the local tick and the entire | ||
| 114 | * timekeeping to prevent timer interrupts from kicking us out of idle | ||
| 115 | * until a proper wakeup interrupt happens. | ||
| 116 | */ | ||
| 117 | if (idle_should_freeze()) { | ||
| 118 | cpuidle_enter_freeze(); | ||
| 119 | local_irq_enable(); | ||
| 120 | goto exit_idle; | ||
| 121 | } | ||
| 122 | |||
| 123 | /* | ||
| 107 | * Ask the cpuidle framework to choose a convenient idle state. | 124 | * Ask the cpuidle framework to choose a convenient idle state. |
| 108 | * Fall back to the default arch idle method on errors. | 125 | * Fall back to the default arch idle method on errors. |
| 109 | */ | 126 | */ |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ee15f5a0d1c1..f4d4b077eba0 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -831,11 +831,14 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | |||
| 831 | enqueue = 1; | 831 | enqueue = 1; |
| 832 | 832 | ||
| 833 | /* | 833 | /* |
| 834 | * Force a clock update if the CPU was idle, | 834 | * When we're idle and a woken (rt) task is |
| 835 | * lest wakeup -> unthrottle time accumulate. | 835 | * throttled check_preempt_curr() will set |
| 836 | * skip_update and the time between the wakeup | ||
| 837 | * and this unthrottle will get accounted as | ||
| 838 | * 'runtime'. | ||
| 836 | */ | 839 | */ |
| 837 | if (rt_rq->rt_nr_running && rq->curr == rq->idle) | 840 | if (rt_rq->rt_nr_running && rq->curr == rq->idle) |
| 838 | rq->skip_clock_update = -1; | 841 | rq_clock_skip_update(rq, false); |
| 839 | } | 842 | } |
| 840 | if (rt_rq->rt_time || rt_rq->rt_nr_running) | 843 | if (rt_rq->rt_time || rt_rq->rt_nr_running) |
| 841 | idle = 0; | 844 | idle = 0; |
| @@ -1337,7 +1340,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) | |||
| 1337 | curr->prio <= p->prio)) { | 1340 | curr->prio <= p->prio)) { |
| 1338 | int target = find_lowest_rq(p); | 1341 | int target = find_lowest_rq(p); |
| 1339 | 1342 | ||
| 1340 | if (target != -1) | 1343 | /* |
| 1344 | * Don't bother moving it if the destination CPU is | ||
| 1345 | * not running a lower priority task. | ||
| 1346 | */ | ||
| 1347 | if (target != -1 && | ||
| 1348 | p->prio < cpu_rq(target)->rt.highest_prio.curr) | ||
| 1341 | cpu = target; | 1349 | cpu = target; |
| 1342 | } | 1350 | } |
| 1343 | rcu_read_unlock(); | 1351 | rcu_read_unlock(); |
| @@ -1614,6 +1622,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
| 1614 | 1622 | ||
| 1615 | lowest_rq = cpu_rq(cpu); | 1623 | lowest_rq = cpu_rq(cpu); |
| 1616 | 1624 | ||
| 1625 | if (lowest_rq->rt.highest_prio.curr <= task->prio) { | ||
| 1626 | /* | ||
| 1627 | * Target rq has tasks of equal or higher priority, | ||
| 1628 | * retrying does not release any lock and is unlikely | ||
| 1629 | * to yield a different result. | ||
| 1630 | */ | ||
| 1631 | lowest_rq = NULL; | ||
| 1632 | break; | ||
| 1633 | } | ||
| 1634 | |||
| 1617 | /* if the prio of this runqueue changed, try again */ | 1635 | /* if the prio of this runqueue changed, try again */ |
| 1618 | if (double_lock_balance(rq, lowest_rq)) { | 1636 | if (double_lock_balance(rq, lowest_rq)) { |
| 1619 | /* | 1637 | /* |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9a2a45c970e7..dc0f435a2779 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -558,8 +558,6 @@ struct rq { | |||
| 558 | #ifdef CONFIG_NO_HZ_FULL | 558 | #ifdef CONFIG_NO_HZ_FULL |
| 559 | unsigned long last_sched_tick; | 559 | unsigned long last_sched_tick; |
| 560 | #endif | 560 | #endif |
| 561 | int skip_clock_update; | ||
| 562 | |||
| 563 | /* capture load from *all* tasks on this cpu: */ | 561 | /* capture load from *all* tasks on this cpu: */ |
| 564 | struct load_weight load; | 562 | struct load_weight load; |
| 565 | unsigned long nr_load_updates; | 563 | unsigned long nr_load_updates; |
| @@ -588,6 +586,7 @@ struct rq { | |||
| 588 | unsigned long next_balance; | 586 | unsigned long next_balance; |
| 589 | struct mm_struct *prev_mm; | 587 | struct mm_struct *prev_mm; |
| 590 | 588 | ||
| 589 | unsigned int clock_skip_update; | ||
| 591 | u64 clock; | 590 | u64 clock; |
| 592 | u64 clock_task; | 591 | u64 clock_task; |
| 593 | 592 | ||
| @@ -687,16 +686,35 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | |||
| 687 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 686 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
| 688 | #define raw_rq() raw_cpu_ptr(&runqueues) | 687 | #define raw_rq() raw_cpu_ptr(&runqueues) |
| 689 | 688 | ||
| 689 | static inline u64 __rq_clock_broken(struct rq *rq) | ||
| 690 | { | ||
| 691 | return ACCESS_ONCE(rq->clock); | ||
| 692 | } | ||
| 693 | |||
| 690 | static inline u64 rq_clock(struct rq *rq) | 694 | static inline u64 rq_clock(struct rq *rq) |
| 691 | { | 695 | { |
| 696 | lockdep_assert_held(&rq->lock); | ||
| 692 | return rq->clock; | 697 | return rq->clock; |
| 693 | } | 698 | } |
| 694 | 699 | ||
| 695 | static inline u64 rq_clock_task(struct rq *rq) | 700 | static inline u64 rq_clock_task(struct rq *rq) |
| 696 | { | 701 | { |
| 702 | lockdep_assert_held(&rq->lock); | ||
| 697 | return rq->clock_task; | 703 | return rq->clock_task; |
| 698 | } | 704 | } |
| 699 | 705 | ||
| 706 | #define RQCF_REQ_SKIP 0x01 | ||
| 707 | #define RQCF_ACT_SKIP 0x02 | ||
| 708 | |||
| 709 | static inline void rq_clock_skip_update(struct rq *rq, bool skip) | ||
| 710 | { | ||
| 711 | lockdep_assert_held(&rq->lock); | ||
| 712 | if (skip) | ||
| 713 | rq->clock_skip_update |= RQCF_REQ_SKIP; | ||
| 714 | else | ||
| 715 | rq->clock_skip_update &= ~RQCF_REQ_SKIP; | ||
| 716 | } | ||
| 717 | |||
| 700 | #ifdef CONFIG_NUMA | 718 | #ifdef CONFIG_NUMA |
| 701 | enum numa_topology_type { | 719 | enum numa_topology_type { |
| 702 | NUMA_DIRECT, | 720 | NUMA_DIRECT, |
| @@ -1362,6 +1380,82 @@ static inline void sched_avg_update(struct rq *rq) { } | |||
| 1362 | 1380 | ||
| 1363 | extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); | 1381 | extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); |
| 1364 | 1382 | ||
| 1383 | /* | ||
| 1384 | * __task_rq_lock - lock the rq @p resides on. | ||
| 1385 | */ | ||
| 1386 | static inline struct rq *__task_rq_lock(struct task_struct *p) | ||
| 1387 | __acquires(rq->lock) | ||
| 1388 | { | ||
| 1389 | struct rq *rq; | ||
| 1390 | |||
| 1391 | lockdep_assert_held(&p->pi_lock); | ||
| 1392 | |||
| 1393 | for (;;) { | ||
| 1394 | rq = task_rq(p); | ||
| 1395 | raw_spin_lock(&rq->lock); | ||
| 1396 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) | ||
| 1397 | return rq; | ||
| 1398 | raw_spin_unlock(&rq->lock); | ||
| 1399 | |||
| 1400 | while (unlikely(task_on_rq_migrating(p))) | ||
| 1401 | cpu_relax(); | ||
| 1402 | } | ||
| 1403 | } | ||
| 1404 | |||
| 1405 | /* | ||
| 1406 | * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. | ||
| 1407 | */ | ||
| 1408 | static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | ||
| 1409 | __acquires(p->pi_lock) | ||
| 1410 | __acquires(rq->lock) | ||
| 1411 | { | ||
| 1412 | struct rq *rq; | ||
| 1413 | |||
| 1414 | for (;;) { | ||
| 1415 | raw_spin_lock_irqsave(&p->pi_lock, *flags); | ||
| 1416 | rq = task_rq(p); | ||
| 1417 | raw_spin_lock(&rq->lock); | ||
| 1418 | /* | ||
| 1419 | * move_queued_task() task_rq_lock() | ||
| 1420 | * | ||
| 1421 | * ACQUIRE (rq->lock) | ||
| 1422 | * [S] ->on_rq = MIGRATING [L] rq = task_rq() | ||
| 1423 | * WMB (__set_task_cpu()) ACQUIRE (rq->lock); | ||
| 1424 | * [S] ->cpu = new_cpu [L] task_rq() | ||
| 1425 | * [L] ->on_rq | ||
| 1426 | * RELEASE (rq->lock) | ||
| 1427 | * | ||
| 1428 | * If we observe the old cpu in task_rq_lock, the acquire of | ||
| 1429 | * the old rq->lock will fully serialize against the stores. | ||
| 1430 | * | ||
| 1431 | * If we observe the new cpu in task_rq_lock, the acquire will | ||
| 1432 | * pair with the WMB to ensure we must then also see migrating. | ||
| 1433 | */ | ||
| 1434 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) | ||
| 1435 | return rq; | ||
| 1436 | raw_spin_unlock(&rq->lock); | ||
| 1437 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | ||
| 1438 | |||
| 1439 | while (unlikely(task_on_rq_migrating(p))) | ||
| 1440 | cpu_relax(); | ||
| 1441 | } | ||
| 1442 | } | ||
| 1443 | |||
| 1444 | static inline void __task_rq_unlock(struct rq *rq) | ||
| 1445 | __releases(rq->lock) | ||
| 1446 | { | ||
| 1447 | raw_spin_unlock(&rq->lock); | ||
| 1448 | } | ||
| 1449 | |||
| 1450 | static inline void | ||
| 1451 | task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) | ||
| 1452 | __releases(rq->lock) | ||
| 1453 | __releases(p->pi_lock) | ||
| 1454 | { | ||
| 1455 | raw_spin_unlock(&rq->lock); | ||
| 1456 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | ||
| 1457 | } | ||
| 1458 | |||
| 1365 | #ifdef CONFIG_SMP | 1459 | #ifdef CONFIG_SMP |
| 1366 | #ifdef CONFIG_PREEMPT | 1460 | #ifdef CONFIG_PREEMPT |
| 1367 | 1461 | ||
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index a476bea17fbc..87e2c9f0c33e 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c | |||
| @@ -15,11 +15,6 @@ | |||
| 15 | static int show_schedstat(struct seq_file *seq, void *v) | 15 | static int show_schedstat(struct seq_file *seq, void *v) |
| 16 | { | 16 | { |
| 17 | int cpu; | 17 | int cpu; |
| 18 | int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; | ||
| 19 | char *mask_str = kmalloc(mask_len, GFP_KERNEL); | ||
| 20 | |||
| 21 | if (mask_str == NULL) | ||
| 22 | return -ENOMEM; | ||
| 23 | 18 | ||
| 24 | if (v == (void *)1) { | 19 | if (v == (void *)1) { |
| 25 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | 20 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); |
| @@ -50,9 +45,8 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
| 50 | for_each_domain(cpu, sd) { | 45 | for_each_domain(cpu, sd) { |
| 51 | enum cpu_idle_type itype; | 46 | enum cpu_idle_type itype; |
| 52 | 47 | ||
| 53 | cpumask_scnprintf(mask_str, mask_len, | 48 | seq_printf(seq, "domain%d %*pb", dcount++, |
| 54 | sched_domain_span(sd)); | 49 | cpumask_pr_args(sched_domain_span(sd))); |
| 55 | seq_printf(seq, "domain%d %s", dcount++, mask_str); | ||
| 56 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; | 50 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; |
| 57 | itype++) { | 51 | itype++) { |
| 58 | seq_printf(seq, " %u %u %u %u %u %u %u %u", | 52 | seq_printf(seq, " %u %u %u %u %u %u %u %u", |
| @@ -76,7 +70,6 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
| 76 | rcu_read_unlock(); | 70 | rcu_read_unlock(); |
| 77 | #endif | 71 | #endif |
| 78 | } | 72 | } |
| 79 | kfree(mask_str); | ||
| 80 | return 0; | 73 | return 0; |
| 81 | } | 74 | } |
| 82 | 75 | ||
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 4ef9687ac115..4f44028943e6 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
| @@ -629,7 +629,9 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) | |||
| 629 | 629 | ||
| 630 | switch (action) { | 630 | switch (action) { |
| 631 | case SECCOMP_RET_ERRNO: | 631 | case SECCOMP_RET_ERRNO: |
| 632 | /* Set the low-order 16-bits as a errno. */ | 632 | /* Set low-order bits as an errno, capped at MAX_ERRNO. */ |
| 633 | if (data > MAX_ERRNO) | ||
| 634 | data = MAX_ERRNO; | ||
| 633 | syscall_set_return_value(current, task_pt_regs(current), | 635 | syscall_set_return_value(current, task_pt_regs(current), |
| 634 | -data, 0); | 636 | -data, 0); |
| 635 | goto skip; | 637 | goto skip; |
diff --git a/kernel/signal.c b/kernel/signal.c index 16a305295256..a390499943e4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -2501,7 +2501,7 @@ EXPORT_SYMBOL(unblock_all_signals); | |||
| 2501 | */ | 2501 | */ |
| 2502 | SYSCALL_DEFINE0(restart_syscall) | 2502 | SYSCALL_DEFINE0(restart_syscall) |
| 2503 | { | 2503 | { |
| 2504 | struct restart_block *restart = ¤t_thread_info()->restart_block; | 2504 | struct restart_block *restart = ¤t->restart_block; |
| 2505 | return restart->fn(restart); | 2505 | return restart->fn(restart); |
| 2506 | } | 2506 | } |
| 2507 | 2507 | ||
| @@ -3550,7 +3550,7 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler) | |||
| 3550 | SYSCALL_DEFINE0(pause) | 3550 | SYSCALL_DEFINE0(pause) |
| 3551 | { | 3551 | { |
| 3552 | while (!signal_pending(current)) { | 3552 | while (!signal_pending(current)) { |
| 3553 | current->state = TASK_INTERRUPTIBLE; | 3553 | __set_current_state(TASK_INTERRUPTIBLE); |
| 3554 | schedule(); | 3554 | schedule(); |
| 3555 | } | 3555 | } |
| 3556 | return -ERESTARTNOHAND; | 3556 | return -ERESTARTNOHAND; |
| @@ -3563,7 +3563,7 @@ int sigsuspend(sigset_t *set) | |||
| 3563 | current->saved_sigmask = current->blocked; | 3563 | current->saved_sigmask = current->blocked; |
| 3564 | set_current_blocked(set); | 3564 | set_current_blocked(set); |
| 3565 | 3565 | ||
| 3566 | current->state = TASK_INTERRUPTIBLE; | 3566 | __set_current_state(TASK_INTERRUPTIBLE); |
| 3567 | schedule(); | 3567 | schedule(); |
| 3568 | set_restore_sigmask(); | 3568 | set_restore_sigmask(); |
| 3569 | return -ERESTARTNOHAND; | 3569 | return -ERESTARTNOHAND; |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 501baa9ac1be..479e4436f787 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -114,8 +114,12 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) | |||
| 114 | trace_softirqs_off(ip); | 114 | trace_softirqs_off(ip); |
| 115 | raw_local_irq_restore(flags); | 115 | raw_local_irq_restore(flags); |
| 116 | 116 | ||
| 117 | if (preempt_count() == cnt) | 117 | if (preempt_count() == cnt) { |
| 118 | #ifdef CONFIG_DEBUG_PREEMPT | ||
| 119 | current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1); | ||
| 120 | #endif | ||
| 118 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 121 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); |
| 122 | } | ||
| 119 | } | 123 | } |
| 120 | EXPORT_SYMBOL(__local_bh_disable_ip); | 124 | EXPORT_SYMBOL(__local_bh_disable_ip); |
| 121 | #endif /* CONFIG_TRACE_IRQFLAGS */ | 125 | #endif /* CONFIG_TRACE_IRQFLAGS */ |
| @@ -656,9 +660,8 @@ static void run_ksoftirqd(unsigned int cpu) | |||
| 656 | * in the task stack here. | 660 | * in the task stack here. |
| 657 | */ | 661 | */ |
| 658 | __do_softirq(); | 662 | __do_softirq(); |
| 659 | rcu_note_context_switch(); | ||
| 660 | local_irq_enable(); | 663 | local_irq_enable(); |
| 661 | cond_resched(); | 664 | cond_resched_rcu_qs(); |
| 662 | return; | 665 | return; |
| 663 | } | 666 | } |
| 664 | local_irq_enable(); | 667 | local_irq_enable(); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 137c7f69b264..88ea2d6e0031 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -1248,7 +1248,6 @@ static struct ctl_table vm_table[] = { | |||
| 1248 | .maxlen = sizeof(unsigned long), | 1248 | .maxlen = sizeof(unsigned long), |
| 1249 | .mode = 0644, | 1249 | .mode = 0644, |
| 1250 | .proc_handler = hugetlb_sysctl_handler, | 1250 | .proc_handler = hugetlb_sysctl_handler, |
| 1251 | .extra1 = &zero, | ||
| 1252 | }, | 1251 | }, |
| 1253 | #ifdef CONFIG_NUMA | 1252 | #ifdef CONFIG_NUMA |
| 1254 | { | 1253 | { |
| @@ -1257,7 +1256,6 @@ static struct ctl_table vm_table[] = { | |||
| 1257 | .maxlen = sizeof(unsigned long), | 1256 | .maxlen = sizeof(unsigned long), |
| 1258 | .mode = 0644, | 1257 | .mode = 0644, |
| 1259 | .proc_handler = &hugetlb_mempolicy_sysctl_handler, | 1258 | .proc_handler = &hugetlb_mempolicy_sysctl_handler, |
| 1260 | .extra1 = &zero, | ||
| 1261 | }, | 1259 | }, |
| 1262 | #endif | 1260 | #endif |
| 1263 | { | 1261 | { |
| @@ -1280,7 +1278,6 @@ static struct ctl_table vm_table[] = { | |||
| 1280 | .maxlen = sizeof(unsigned long), | 1278 | .maxlen = sizeof(unsigned long), |
| 1281 | .mode = 0644, | 1279 | .mode = 0644, |
| 1282 | .proc_handler = hugetlb_overcommit_handler, | 1280 | .proc_handler = hugetlb_overcommit_handler, |
| 1283 | .extra1 = &zero, | ||
| 1284 | }, | 1281 | }, |
| 1285 | #endif | 1282 | #endif |
| 1286 | { | 1283 | { |
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 670fff88a961..21f82c29c914 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
| @@ -111,13 +111,8 @@ static int send_reply(struct sk_buff *skb, struct genl_info *info) | |||
| 111 | { | 111 | { |
| 112 | struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); | 112 | struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); |
| 113 | void *reply = genlmsg_data(genlhdr); | 113 | void *reply = genlmsg_data(genlhdr); |
| 114 | int rc; | ||
| 115 | 114 | ||
| 116 | rc = genlmsg_end(skb, reply); | 115 | genlmsg_end(skb, reply); |
| 117 | if (rc < 0) { | ||
| 118 | nlmsg_free(skb); | ||
| 119 | return rc; | ||
| 120 | } | ||
| 121 | 116 | ||
| 122 | return genlmsg_reply(skb, info); | 117 | return genlmsg_reply(skb, info); |
| 123 | } | 118 | } |
| @@ -134,11 +129,7 @@ static void send_cpu_listeners(struct sk_buff *skb, | |||
| 134 | void *reply = genlmsg_data(genlhdr); | 129 | void *reply = genlmsg_data(genlhdr); |
| 135 | int rc, delcount = 0; | 130 | int rc, delcount = 0; |
| 136 | 131 | ||
| 137 | rc = genlmsg_end(skb, reply); | 132 | genlmsg_end(skb, reply); |
| 138 | if (rc < 0) { | ||
| 139 | nlmsg_free(skb); | ||
| 140 | return; | ||
| 141 | } | ||
| 142 | 133 | ||
| 143 | rc = 0; | 134 | rc = 0; |
| 144 | down_read(&listeners->sem); | 135 | down_read(&listeners->sem); |
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index f622cf28628a..c09c07817d7a 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o | 1 | obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o |
| 2 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o | 2 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o |
| 3 | obj-y += timeconv.o posix-clock.o alarmtimer.o | 3 | obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o |
| 4 | 4 | ||
| 5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o | 5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o |
| 6 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o | 6 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o |
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index a7077d3ae52f..1b001ed1edb9 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
| @@ -788,7 +788,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, | |||
| 788 | goto out; | 788 | goto out; |
| 789 | } | 789 | } |
| 790 | 790 | ||
| 791 | restart = ¤t_thread_info()->restart_block; | 791 | restart = ¤t->restart_block; |
| 792 | restart->fn = alarm_timer_nsleep_restart; | 792 | restart->fn = alarm_timer_nsleep_restart; |
| 793 | restart->nanosleep.clockid = type; | 793 | restart->nanosleep.clockid = type; |
| 794 | restart->nanosleep.expires = exp.tv64; | 794 | restart->nanosleep.expires = exp.tv64; |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b79f39bda7e1..4892352f0e49 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -34,82 +34,6 @@ | |||
| 34 | #include "tick-internal.h" | 34 | #include "tick-internal.h" |
| 35 | #include "timekeeping_internal.h" | 35 | #include "timekeeping_internal.h" |
| 36 | 36 | ||
| 37 | void timecounter_init(struct timecounter *tc, | ||
| 38 | const struct cyclecounter *cc, | ||
| 39 | u64 start_tstamp) | ||
| 40 | { | ||
| 41 | tc->cc = cc; | ||
| 42 | tc->cycle_last = cc->read(cc); | ||
| 43 | tc->nsec = start_tstamp; | ||
| 44 | } | ||
| 45 | EXPORT_SYMBOL_GPL(timecounter_init); | ||
| 46 | |||
| 47 | /** | ||
| 48 | * timecounter_read_delta - get nanoseconds since last call of this function | ||
| 49 | * @tc: Pointer to time counter | ||
| 50 | * | ||
| 51 | * When the underlying cycle counter runs over, this will be handled | ||
| 52 | * correctly as long as it does not run over more than once between | ||
| 53 | * calls. | ||
| 54 | * | ||
| 55 | * The first call to this function for a new time counter initializes | ||
| 56 | * the time tracking and returns an undefined result. | ||
| 57 | */ | ||
| 58 | static u64 timecounter_read_delta(struct timecounter *tc) | ||
| 59 | { | ||
| 60 | cycle_t cycle_now, cycle_delta; | ||
| 61 | u64 ns_offset; | ||
| 62 | |||
| 63 | /* read cycle counter: */ | ||
| 64 | cycle_now = tc->cc->read(tc->cc); | ||
| 65 | |||
| 66 | /* calculate the delta since the last timecounter_read_delta(): */ | ||
| 67 | cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; | ||
| 68 | |||
| 69 | /* convert to nanoseconds: */ | ||
| 70 | ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta); | ||
| 71 | |||
| 72 | /* update time stamp of timecounter_read_delta() call: */ | ||
| 73 | tc->cycle_last = cycle_now; | ||
| 74 | |||
| 75 | return ns_offset; | ||
| 76 | } | ||
| 77 | |||
| 78 | u64 timecounter_read(struct timecounter *tc) | ||
| 79 | { | ||
| 80 | u64 nsec; | ||
| 81 | |||
| 82 | /* increment time by nanoseconds since last call */ | ||
| 83 | nsec = timecounter_read_delta(tc); | ||
| 84 | nsec += tc->nsec; | ||
| 85 | tc->nsec = nsec; | ||
| 86 | |||
| 87 | return nsec; | ||
| 88 | } | ||
| 89 | EXPORT_SYMBOL_GPL(timecounter_read); | ||
| 90 | |||
| 91 | u64 timecounter_cyc2time(struct timecounter *tc, | ||
| 92 | cycle_t cycle_tstamp) | ||
| 93 | { | ||
| 94 | u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; | ||
| 95 | u64 nsec; | ||
| 96 | |||
| 97 | /* | ||
| 98 | * Instead of always treating cycle_tstamp as more recent | ||
| 99 | * than tc->cycle_last, detect when it is too far in the | ||
| 100 | * future and treat it as old time stamp instead. | ||
| 101 | */ | ||
| 102 | if (cycle_delta > tc->cc->mask / 2) { | ||
| 103 | cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; | ||
| 104 | nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta); | ||
| 105 | } else { | ||
| 106 | nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec; | ||
| 107 | } | ||
| 108 | |||
| 109 | return nsec; | ||
| 110 | } | ||
| 111 | EXPORT_SYMBOL_GPL(timecounter_cyc2time); | ||
| 112 | |||
| 113 | /** | 37 | /** |
| 114 | * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks | 38 | * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks |
| 115 | * @mult: pointer to mult variable | 39 | * @mult: pointer to mult variable |
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d8c724cda37b..bee0c1f78091 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
| @@ -266,7 +266,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) | |||
| 266 | /* | 266 | /* |
| 267 | * Divide a ktime value by a nanosecond value | 267 | * Divide a ktime value by a nanosecond value |
| 268 | */ | 268 | */ |
| 269 | u64 ktime_divns(const ktime_t kt, s64 div) | 269 | u64 __ktime_divns(const ktime_t kt, s64 div) |
| 270 | { | 270 | { |
| 271 | u64 dclc; | 271 | u64 dclc; |
| 272 | int sft = 0; | 272 | int sft = 0; |
| @@ -282,7 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div) | |||
| 282 | 282 | ||
| 283 | return dclc; | 283 | return dclc; |
| 284 | } | 284 | } |
| 285 | EXPORT_SYMBOL_GPL(ktime_divns); | 285 | EXPORT_SYMBOL_GPL(__ktime_divns); |
| 286 | #endif /* BITS_PER_LONG >= 64 */ | 286 | #endif /* BITS_PER_LONG >= 64 */ |
| 287 | 287 | ||
| 288 | /* | 288 | /* |
| @@ -440,6 +440,37 @@ static inline void debug_deactivate(struct hrtimer *timer) | |||
| 440 | trace_hrtimer_cancel(timer); | 440 | trace_hrtimer_cancel(timer); |
| 441 | } | 441 | } |
| 442 | 442 | ||
| 443 | #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) | ||
| 444 | static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) | ||
| 445 | { | ||
| 446 | struct hrtimer_clock_base *base = cpu_base->clock_base; | ||
| 447 | ktime_t expires, expires_next = { .tv64 = KTIME_MAX }; | ||
| 448 | int i; | ||
| 449 | |||
| 450 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { | ||
| 451 | struct timerqueue_node *next; | ||
| 452 | struct hrtimer *timer; | ||
| 453 | |||
| 454 | next = timerqueue_getnext(&base->active); | ||
| 455 | if (!next) | ||
| 456 | continue; | ||
| 457 | |||
| 458 | timer = container_of(next, struct hrtimer, node); | ||
| 459 | expires = ktime_sub(hrtimer_get_expires(timer), base->offset); | ||
| 460 | if (expires.tv64 < expires_next.tv64) | ||
| 461 | expires_next = expires; | ||
| 462 | } | ||
| 463 | /* | ||
| 464 | * clock_was_set() might have changed base->offset of any of | ||
| 465 | * the clock bases so the result might be negative. Fix it up | ||
| 466 | * to prevent a false positive in clockevents_program_event(). | ||
| 467 | */ | ||
| 468 | if (expires_next.tv64 < 0) | ||
| 469 | expires_next.tv64 = 0; | ||
| 470 | return expires_next; | ||
| 471 | } | ||
| 472 | #endif | ||
| 473 | |||
| 443 | /* High resolution timer related functions */ | 474 | /* High resolution timer related functions */ |
| 444 | #ifdef CONFIG_HIGH_RES_TIMERS | 475 | #ifdef CONFIG_HIGH_RES_TIMERS |
| 445 | 476 | ||
| @@ -488,32 +519,7 @@ static inline int hrtimer_hres_active(void) | |||
| 488 | static void | 519 | static void |
| 489 | hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) | 520 | hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) |
| 490 | { | 521 | { |
| 491 | int i; | 522 | ktime_t expires_next = __hrtimer_get_next_event(cpu_base); |
| 492 | struct hrtimer_clock_base *base = cpu_base->clock_base; | ||
| 493 | ktime_t expires, expires_next; | ||
| 494 | |||
| 495 | expires_next.tv64 = KTIME_MAX; | ||
| 496 | |||
| 497 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { | ||
| 498 | struct hrtimer *timer; | ||
| 499 | struct timerqueue_node *next; | ||
| 500 | |||
| 501 | next = timerqueue_getnext(&base->active); | ||
| 502 | if (!next) | ||
| 503 | continue; | ||
| 504 | timer = container_of(next, struct hrtimer, node); | ||
| 505 | |||
| 506 | expires = ktime_sub(hrtimer_get_expires(timer), base->offset); | ||
| 507 | /* | ||
| 508 | * clock_was_set() has changed base->offset so the | ||
| 509 | * result might be negative. Fix it up to prevent a | ||
| 510 | * false positive in clockevents_program_event() | ||
| 511 | */ | ||
| 512 | if (expires.tv64 < 0) | ||
| 513 | expires.tv64 = 0; | ||
| 514 | if (expires.tv64 < expires_next.tv64) | ||
| 515 | expires_next = expires; | ||
| 516 | } | ||
| 517 | 523 | ||
| 518 | if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) | 524 | if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) |
| 519 | return; | 525 | return; |
| @@ -587,6 +593,15 @@ static int hrtimer_reprogram(struct hrtimer *timer, | |||
| 587 | return 0; | 593 | return 0; |
| 588 | 594 | ||
| 589 | /* | 595 | /* |
| 596 | * When the target cpu of the timer is currently executing | ||
| 597 | * hrtimer_interrupt(), then we do not touch the clock event | ||
| 598 | * device. hrtimer_interrupt() will reevaluate all clock bases | ||
| 599 | * before reprogramming the device. | ||
| 600 | */ | ||
| 601 | if (cpu_base->in_hrtirq) | ||
| 602 | return 0; | ||
| 603 | |||
| 604 | /* | ||
| 590 | * If a hang was detected in the last timer interrupt then we | 605 | * If a hang was detected in the last timer interrupt then we |
| 591 | * do not schedule a timer which is earlier than the expiry | 606 | * do not schedule a timer which is earlier than the expiry |
| 592 | * which we enforced in the hang detection. We want the system | 607 | * which we enforced in the hang detection. We want the system |
| @@ -1104,29 +1119,14 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); | |||
| 1104 | ktime_t hrtimer_get_next_event(void) | 1119 | ktime_t hrtimer_get_next_event(void) |
| 1105 | { | 1120 | { |
| 1106 | struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); | 1121 | struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); |
| 1107 | struct hrtimer_clock_base *base = cpu_base->clock_base; | 1122 | ktime_t mindelta = { .tv64 = KTIME_MAX }; |
| 1108 | ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; | ||
| 1109 | unsigned long flags; | 1123 | unsigned long flags; |
| 1110 | int i; | ||
| 1111 | 1124 | ||
| 1112 | raw_spin_lock_irqsave(&cpu_base->lock, flags); | 1125 | raw_spin_lock_irqsave(&cpu_base->lock, flags); |
| 1113 | 1126 | ||
| 1114 | if (!hrtimer_hres_active()) { | 1127 | if (!hrtimer_hres_active()) |
| 1115 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { | 1128 | mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base), |
| 1116 | struct hrtimer *timer; | 1129 | ktime_get()); |
| 1117 | struct timerqueue_node *next; | ||
| 1118 | |||
| 1119 | next = timerqueue_getnext(&base->active); | ||
| 1120 | if (!next) | ||
| 1121 | continue; | ||
| 1122 | |||
| 1123 | timer = container_of(next, struct hrtimer, node); | ||
| 1124 | delta.tv64 = hrtimer_get_expires_tv64(timer); | ||
| 1125 | delta = ktime_sub(delta, base->get_time()); | ||
| 1126 | if (delta.tv64 < mindelta.tv64) | ||
| 1127 | mindelta.tv64 = delta.tv64; | ||
| 1128 | } | ||
| 1129 | } | ||
| 1130 | 1130 | ||
| 1131 | raw_spin_unlock_irqrestore(&cpu_base->lock, flags); | 1131 | raw_spin_unlock_irqrestore(&cpu_base->lock, flags); |
| 1132 | 1132 | ||
| @@ -1253,7 +1253,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
| 1253 | raw_spin_lock(&cpu_base->lock); | 1253 | raw_spin_lock(&cpu_base->lock); |
| 1254 | entry_time = now = hrtimer_update_base(cpu_base); | 1254 | entry_time = now = hrtimer_update_base(cpu_base); |
| 1255 | retry: | 1255 | retry: |
| 1256 | expires_next.tv64 = KTIME_MAX; | 1256 | cpu_base->in_hrtirq = 1; |
| 1257 | /* | 1257 | /* |
| 1258 | * We set expires_next to KTIME_MAX here with cpu_base->lock | 1258 | * We set expires_next to KTIME_MAX here with cpu_base->lock |
| 1259 | * held to prevent that a timer is enqueued in our queue via | 1259 | * held to prevent that a timer is enqueued in our queue via |
| @@ -1291,28 +1291,20 @@ retry: | |||
| 1291 | * are right-of a not yet expired timer, because that | 1291 | * are right-of a not yet expired timer, because that |
| 1292 | * timer will have to trigger a wakeup anyway. | 1292 | * timer will have to trigger a wakeup anyway. |
| 1293 | */ | 1293 | */ |
| 1294 | 1294 | if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) | |
| 1295 | if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) { | ||
| 1296 | ktime_t expires; | ||
| 1297 | |||
| 1298 | expires = ktime_sub(hrtimer_get_expires(timer), | ||
| 1299 | base->offset); | ||
| 1300 | if (expires.tv64 < 0) | ||
| 1301 | expires.tv64 = KTIME_MAX; | ||
| 1302 | if (expires.tv64 < expires_next.tv64) | ||
| 1303 | expires_next = expires; | ||
| 1304 | break; | 1295 | break; |
| 1305 | } | ||
| 1306 | 1296 | ||
| 1307 | __run_hrtimer(timer, &basenow); | 1297 | __run_hrtimer(timer, &basenow); |
| 1308 | } | 1298 | } |
| 1309 | } | 1299 | } |
| 1310 | 1300 | /* Reevaluate the clock bases for the next expiry */ | |
| 1301 | expires_next = __hrtimer_get_next_event(cpu_base); | ||
| 1311 | /* | 1302 | /* |
| 1312 | * Store the new expiry value so the migration code can verify | 1303 | * Store the new expiry value so the migration code can verify |
| 1313 | * against it. | 1304 | * against it. |
| 1314 | */ | 1305 | */ |
| 1315 | cpu_base->expires_next = expires_next; | 1306 | cpu_base->expires_next = expires_next; |
| 1307 | cpu_base->in_hrtirq = 0; | ||
| 1316 | raw_spin_unlock(&cpu_base->lock); | 1308 | raw_spin_unlock(&cpu_base->lock); |
| 1317 | 1309 | ||
| 1318 | /* Reprogramming necessary ? */ | 1310 | /* Reprogramming necessary ? */ |
| @@ -1591,7 +1583,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, | |||
| 1591 | goto out; | 1583 | goto out; |
| 1592 | } | 1584 | } |
| 1593 | 1585 | ||
| 1594 | restart = ¤t_thread_info()->restart_block; | 1586 | restart = ¤t->restart_block; |
| 1595 | restart->fn = hrtimer_nanosleep_restart; | 1587 | restart->fn = hrtimer_nanosleep_restart; |
| 1596 | restart->nanosleep.clockid = t.timer.base->clockid; | 1588 | restart->nanosleep.clockid = t.timer.base->clockid; |
| 1597 | restart->nanosleep.rmtp = rmtp; | 1589 | restart->nanosleep.rmtp = rmtp; |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 28bf91c60a0b..0f60b08a4f07 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
| @@ -488,13 +488,13 @@ static void sync_cmos_clock(struct work_struct *work) | |||
| 488 | 488 | ||
| 489 | getnstimeofday64(&now); | 489 | getnstimeofday64(&now); |
| 490 | if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { | 490 | if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { |
| 491 | struct timespec adjust = timespec64_to_timespec(now); | 491 | struct timespec64 adjust = now; |
| 492 | 492 | ||
| 493 | fail = -ENODEV; | 493 | fail = -ENODEV; |
| 494 | if (persistent_clock_is_local) | 494 | if (persistent_clock_is_local) |
| 495 | adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); | 495 | adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); |
| 496 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | 496 | #ifdef CONFIG_GENERIC_CMOS_UPDATE |
| 497 | fail = update_persistent_clock(adjust); | 497 | fail = update_persistent_clock(timespec64_to_timespec(adjust)); |
| 498 | #endif | 498 | #endif |
| 499 | #ifdef CONFIG_RTC_SYSTOHC | 499 | #ifdef CONFIG_RTC_SYSTOHC |
| 500 | if (fail == -ENODEV) | 500 | if (fail == -ENODEV) |
| @@ -633,10 +633,14 @@ int ntp_validate_timex(struct timex *txc) | |||
| 633 | if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) | 633 | if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) |
| 634 | return -EPERM; | 634 | return -EPERM; |
| 635 | 635 | ||
| 636 | if (txc->modes & ADJ_FREQUENCY) { | 636 | /* |
| 637 | if (LONG_MIN / PPM_SCALE > txc->freq) | 637 | * Check for potential multiplication overflows that can |
| 638 | * only happen on 64-bit systems: | ||
| 639 | */ | ||
| 640 | if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { | ||
| 641 | if (LLONG_MIN / PPM_SCALE > txc->freq) | ||
| 638 | return -EINVAL; | 642 | return -EINVAL; |
| 639 | if (LONG_MAX / PPM_SCALE < txc->freq) | 643 | if (LLONG_MAX / PPM_SCALE < txc->freq) |
| 640 | return -EINVAL; | 644 | return -EINVAL; |
| 641 | } | 645 | } |
| 642 | 646 | ||
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a16b67859e2a..0075da74abf0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c | |||
| @@ -1334,8 +1334,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block); | |||
| 1334 | static int posix_cpu_nsleep(const clockid_t which_clock, int flags, | 1334 | static int posix_cpu_nsleep(const clockid_t which_clock, int flags, |
| 1335 | struct timespec *rqtp, struct timespec __user *rmtp) | 1335 | struct timespec *rqtp, struct timespec __user *rmtp) |
| 1336 | { | 1336 | { |
| 1337 | struct restart_block *restart_block = | 1337 | struct restart_block *restart_block = ¤t->restart_block; |
| 1338 | ¤t_thread_info()->restart_block; | ||
| 1339 | struct itimerspec it; | 1338 | struct itimerspec it; |
| 1340 | int error; | 1339 | int error; |
| 1341 | 1340 | ||
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 7efeedf53ebd..f7c515595b42 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
| @@ -394,6 +394,56 @@ void tick_resume(void) | |||
| 394 | } | 394 | } |
| 395 | } | 395 | } |
| 396 | 396 | ||
| 397 | static DEFINE_RAW_SPINLOCK(tick_freeze_lock); | ||
| 398 | static unsigned int tick_freeze_depth; | ||
| 399 | |||
| 400 | /** | ||
| 401 | * tick_freeze - Suspend the local tick and (possibly) timekeeping. | ||
| 402 | * | ||
| 403 | * Check if this is the last online CPU executing the function and if so, | ||
| 404 | * suspend timekeeping. Otherwise suspend the local tick. | ||
| 405 | * | ||
| 406 | * Call with interrupts disabled. Must be balanced with %tick_unfreeze(). | ||
| 407 | * Interrupts must not be enabled before the subsequent %tick_unfreeze(). | ||
| 408 | */ | ||
| 409 | void tick_freeze(void) | ||
| 410 | { | ||
| 411 | raw_spin_lock(&tick_freeze_lock); | ||
| 412 | |||
| 413 | tick_freeze_depth++; | ||
| 414 | if (tick_freeze_depth == num_online_cpus()) { | ||
| 415 | timekeeping_suspend(); | ||
| 416 | } else { | ||
| 417 | tick_suspend(); | ||
| 418 | tick_suspend_broadcast(); | ||
| 419 | } | ||
| 420 | |||
| 421 | raw_spin_unlock(&tick_freeze_lock); | ||
| 422 | } | ||
| 423 | |||
| 424 | /** | ||
| 425 | * tick_unfreeze - Resume the local tick and (possibly) timekeeping. | ||
| 426 | * | ||
| 427 | * Check if this is the first CPU executing the function and if so, resume | ||
| 428 | * timekeeping. Otherwise resume the local tick. | ||
| 429 | * | ||
| 430 | * Call with interrupts disabled. Must be balanced with %tick_freeze(). | ||
| 431 | * Interrupts must not be enabled after the preceding %tick_freeze(). | ||
| 432 | */ | ||
| 433 | void tick_unfreeze(void) | ||
| 434 | { | ||
| 435 | raw_spin_lock(&tick_freeze_lock); | ||
| 436 | |||
| 437 | if (tick_freeze_depth == num_online_cpus()) | ||
| 438 | timekeeping_resume(); | ||
| 439 | else | ||
| 440 | tick_resume(); | ||
| 441 | |||
| 442 | tick_freeze_depth--; | ||
| 443 | |||
| 444 | raw_spin_unlock(&tick_freeze_lock); | ||
| 445 | } | ||
| 446 | |||
| 397 | /** | 447 | /** |
| 398 | * tick_init - initialize the tick control | 448 | * tick_init - initialize the tick control |
| 399 | */ | 449 | */ |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1363d58f07e9..a4c4edac4528 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -326,13 +326,6 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, | |||
| 326 | return NOTIFY_OK; | 326 | return NOTIFY_OK; |
| 327 | } | 327 | } |
| 328 | 328 | ||
| 329 | /* | ||
| 330 | * Worst case string length in chunks of CPU range seems 2 steps | ||
| 331 | * separations: 0,2,4,6,... | ||
| 332 | * This is NR_CPUS + sizeof('\0') | ||
| 333 | */ | ||
| 334 | static char __initdata nohz_full_buf[NR_CPUS + 1]; | ||
| 335 | |||
| 336 | static int tick_nohz_init_all(void) | 329 | static int tick_nohz_init_all(void) |
| 337 | { | 330 | { |
| 338 | int err = -1; | 331 | int err = -1; |
| @@ -393,8 +386,8 @@ void __init tick_nohz_init(void) | |||
| 393 | context_tracking_cpu_set(cpu); | 386 | context_tracking_cpu_set(cpu); |
| 394 | 387 | ||
| 395 | cpu_notifier(tick_nohz_cpu_down_callback, 0); | 388 | cpu_notifier(tick_nohz_cpu_down_callback, 0); |
| 396 | cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask); | 389 | pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", |
| 397 | pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); | 390 | cpumask_pr_args(tick_nohz_full_mask)); |
| 398 | } | 391 | } |
| 399 | #endif | 392 | #endif |
| 400 | 393 | ||
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c new file mode 100644 index 000000000000..4687b3104bae --- /dev/null +++ b/kernel/time/timecounter.c | |||
| @@ -0,0 +1,112 @@ | |||
| 1 | /* | ||
| 2 | * linux/kernel/time/timecounter.c | ||
| 3 | * | ||
| 4 | * based on code that migrated away from | ||
| 5 | * linux/kernel/time/clocksource.c | ||
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify | ||
| 8 | * it under the terms of the GNU General Public License as published by | ||
| 9 | * the Free Software Foundation; either version 2 of the License, or | ||
| 10 | * (at your option) any later version. | ||
| 11 | * | ||
| 12 | * This program is distributed in the hope that it will be useful, | ||
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 15 | * GNU General Public License for more details. | ||
| 16 | */ | ||
| 17 | |||
| 18 | #include <linux/export.h> | ||
| 19 | #include <linux/timecounter.h> | ||
| 20 | |||
| 21 | void timecounter_init(struct timecounter *tc, | ||
| 22 | const struct cyclecounter *cc, | ||
| 23 | u64 start_tstamp) | ||
| 24 | { | ||
| 25 | tc->cc = cc; | ||
| 26 | tc->cycle_last = cc->read(cc); | ||
| 27 | tc->nsec = start_tstamp; | ||
| 28 | tc->mask = (1ULL << cc->shift) - 1; | ||
| 29 | tc->frac = 0; | ||
| 30 | } | ||
| 31 | EXPORT_SYMBOL_GPL(timecounter_init); | ||
| 32 | |||
| 33 | /** | ||
| 34 | * timecounter_read_delta - get nanoseconds since last call of this function | ||
| 35 | * @tc: Pointer to time counter | ||
| 36 | * | ||
| 37 | * When the underlying cycle counter runs over, this will be handled | ||
| 38 | * correctly as long as it does not run over more than once between | ||
| 39 | * calls. | ||
| 40 | * | ||
| 41 | * The first call to this function for a new time counter initializes | ||
| 42 | * the time tracking and returns an undefined result. | ||
| 43 | */ | ||
| 44 | static u64 timecounter_read_delta(struct timecounter *tc) | ||
| 45 | { | ||
| 46 | cycle_t cycle_now, cycle_delta; | ||
| 47 | u64 ns_offset; | ||
| 48 | |||
| 49 | /* read cycle counter: */ | ||
| 50 | cycle_now = tc->cc->read(tc->cc); | ||
| 51 | |||
| 52 | /* calculate the delta since the last timecounter_read_delta(): */ | ||
| 53 | cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; | ||
| 54 | |||
| 55 | /* convert to nanoseconds: */ | ||
| 56 | ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta, | ||
| 57 | tc->mask, &tc->frac); | ||
| 58 | |||
| 59 | /* update time stamp of timecounter_read_delta() call: */ | ||
| 60 | tc->cycle_last = cycle_now; | ||
| 61 | |||
| 62 | return ns_offset; | ||
| 63 | } | ||
| 64 | |||
| 65 | u64 timecounter_read(struct timecounter *tc) | ||
| 66 | { | ||
| 67 | u64 nsec; | ||
| 68 | |||
| 69 | /* increment time by nanoseconds since last call */ | ||
| 70 | nsec = timecounter_read_delta(tc); | ||
| 71 | nsec += tc->nsec; | ||
| 72 | tc->nsec = nsec; | ||
| 73 | |||
| 74 | return nsec; | ||
| 75 | } | ||
| 76 | EXPORT_SYMBOL_GPL(timecounter_read); | ||
| 77 | |||
| 78 | /* | ||
| 79 | * This is like cyclecounter_cyc2ns(), but it is used for computing a | ||
| 80 | * time previous to the time stored in the cycle counter. | ||
| 81 | */ | ||
| 82 | static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc, | ||
| 83 | cycle_t cycles, u64 mask, u64 frac) | ||
| 84 | { | ||
| 85 | u64 ns = (u64) cycles; | ||
| 86 | |||
| 87 | ns = ((ns * cc->mult) - frac) >> cc->shift; | ||
| 88 | |||
| 89 | return ns; | ||
| 90 | } | ||
| 91 | |||
| 92 | u64 timecounter_cyc2time(struct timecounter *tc, | ||
| 93 | cycle_t cycle_tstamp) | ||
| 94 | { | ||
| 95 | u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; | ||
| 96 | u64 nsec = tc->nsec, frac = tc->frac; | ||
| 97 | |||
| 98 | /* | ||
| 99 | * Instead of always treating cycle_tstamp as more recent | ||
| 100 | * than tc->cycle_last, detect when it is too far in the | ||
| 101 | * future and treat it as old time stamp instead. | ||
| 102 | */ | ||
| 103 | if (delta > tc->cc->mask / 2) { | ||
| 104 | delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; | ||
| 105 | nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac); | ||
| 106 | } else { | ||
| 107 | nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac); | ||
| 108 | } | ||
| 109 | |||
| 110 | return nsec; | ||
| 111 | } | ||
| 112 | EXPORT_SYMBOL_GPL(timecounter_cyc2time); | ||
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6a931852082f..91db94136c10 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -230,9 +230,7 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | |||
| 230 | 230 | ||
| 231 | /** | 231 | /** |
| 232 | * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. | 232 | * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. |
| 233 | * @tk: The timekeeper from which we take the update | 233 | * @tkr: Timekeeping readout base from which we take the update |
| 234 | * @tkf: The fast timekeeper to update | ||
| 235 | * @tbase: The time base for the fast timekeeper (mono/raw) | ||
| 236 | * | 234 | * |
| 237 | * We want to use this from any context including NMI and tracing / | 235 | * We want to use this from any context including NMI and tracing / |
| 238 | * instrumenting the timekeeping code itself. | 236 | * instrumenting the timekeeping code itself. |
| @@ -244,11 +242,11 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | |||
| 244 | * smp_wmb(); <- Ensure that the last base[1] update is visible | 242 | * smp_wmb(); <- Ensure that the last base[1] update is visible |
| 245 | * tkf->seq++; | 243 | * tkf->seq++; |
| 246 | * smp_wmb(); <- Ensure that the seqcount update is visible | 244 | * smp_wmb(); <- Ensure that the seqcount update is visible |
| 247 | * update(tkf->base[0], tk); | 245 | * update(tkf->base[0], tkr); |
| 248 | * smp_wmb(); <- Ensure that the base[0] update is visible | 246 | * smp_wmb(); <- Ensure that the base[0] update is visible |
| 249 | * tkf->seq++; | 247 | * tkf->seq++; |
| 250 | * smp_wmb(); <- Ensure that the seqcount update is visible | 248 | * smp_wmb(); <- Ensure that the seqcount update is visible |
| 251 | * update(tkf->base[1], tk); | 249 | * update(tkf->base[1], tkr); |
| 252 | * | 250 | * |
| 253 | * The reader side does: | 251 | * The reader side does: |
| 254 | * | 252 | * |
| @@ -269,7 +267,7 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | |||
| 269 | * slightly wrong timestamp (a few nanoseconds). See | 267 | * slightly wrong timestamp (a few nanoseconds). See |
| 270 | * @ktime_get_mono_fast_ns. | 268 | * @ktime_get_mono_fast_ns. |
| 271 | */ | 269 | */ |
| 272 | static void update_fast_timekeeper(struct timekeeper *tk) | 270 | static void update_fast_timekeeper(struct tk_read_base *tkr) |
| 273 | { | 271 | { |
| 274 | struct tk_read_base *base = tk_fast_mono.base; | 272 | struct tk_read_base *base = tk_fast_mono.base; |
| 275 | 273 | ||
| @@ -277,7 +275,7 @@ static void update_fast_timekeeper(struct timekeeper *tk) | |||
| 277 | raw_write_seqcount_latch(&tk_fast_mono.seq); | 275 | raw_write_seqcount_latch(&tk_fast_mono.seq); |
| 278 | 276 | ||
| 279 | /* Update base[0] */ | 277 | /* Update base[0] */ |
| 280 | memcpy(base, &tk->tkr, sizeof(*base)); | 278 | memcpy(base, tkr, sizeof(*base)); |
| 281 | 279 | ||
| 282 | /* Force readers back to base[0] */ | 280 | /* Force readers back to base[0] */ |
| 283 | raw_write_seqcount_latch(&tk_fast_mono.seq); | 281 | raw_write_seqcount_latch(&tk_fast_mono.seq); |
| @@ -334,6 +332,35 @@ u64 notrace ktime_get_mono_fast_ns(void) | |||
| 334 | } | 332 | } |
| 335 | EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); | 333 | EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); |
| 336 | 334 | ||
| 335 | /* Suspend-time cycles value for halted fast timekeeper. */ | ||
| 336 | static cycle_t cycles_at_suspend; | ||
| 337 | |||
| 338 | static cycle_t dummy_clock_read(struct clocksource *cs) | ||
| 339 | { | ||
| 340 | return cycles_at_suspend; | ||
| 341 | } | ||
| 342 | |||
| 343 | /** | ||
| 344 | * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. | ||
| 345 | * @tk: Timekeeper to snapshot. | ||
| 346 | * | ||
| 347 | * It generally is unsafe to access the clocksource after timekeeping has been | ||
| 348 | * suspended, so take a snapshot of the readout base of @tk and use it as the | ||
| 349 | * fast timekeeper's readout base while suspended. It will return the same | ||
| 350 | * number of cycles every time until timekeeping is resumed at which time the | ||
| 351 | * proper readout base for the fast timekeeper will be restored automatically. | ||
| 352 | */ | ||
| 353 | static void halt_fast_timekeeper(struct timekeeper *tk) | ||
| 354 | { | ||
| 355 | static struct tk_read_base tkr_dummy; | ||
| 356 | struct tk_read_base *tkr = &tk->tkr; | ||
| 357 | |||
| 358 | memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); | ||
| 359 | cycles_at_suspend = tkr->read(tkr->clock); | ||
| 360 | tkr_dummy.read = dummy_clock_read; | ||
| 361 | update_fast_timekeeper(&tkr_dummy); | ||
| 362 | } | ||
| 363 | |||
| 337 | #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD | 364 | #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD |
| 338 | 365 | ||
| 339 | static inline void update_vsyscall(struct timekeeper *tk) | 366 | static inline void update_vsyscall(struct timekeeper *tk) |
| @@ -462,7 +489,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) | |||
| 462 | memcpy(&shadow_timekeeper, &tk_core.timekeeper, | 489 | memcpy(&shadow_timekeeper, &tk_core.timekeeper, |
| 463 | sizeof(tk_core.timekeeper)); | 490 | sizeof(tk_core.timekeeper)); |
| 464 | 491 | ||
| 465 | update_fast_timekeeper(tk); | 492 | update_fast_timekeeper(&tk->tkr); |
| 466 | } | 493 | } |
| 467 | 494 | ||
| 468 | /** | 495 | /** |
| @@ -1170,7 +1197,7 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta) | |||
| 1170 | * xtime/wall_to_monotonic/jiffies/etc are | 1197 | * xtime/wall_to_monotonic/jiffies/etc are |
| 1171 | * still managed by arch specific suspend/resume code. | 1198 | * still managed by arch specific suspend/resume code. |
| 1172 | */ | 1199 | */ |
| 1173 | static void timekeeping_resume(void) | 1200 | void timekeeping_resume(void) |
| 1174 | { | 1201 | { |
| 1175 | struct timekeeper *tk = &tk_core.timekeeper; | 1202 | struct timekeeper *tk = &tk_core.timekeeper; |
| 1176 | struct clocksource *clock = tk->tkr.clock; | 1203 | struct clocksource *clock = tk->tkr.clock; |
| @@ -1251,7 +1278,7 @@ static void timekeeping_resume(void) | |||
| 1251 | hrtimers_resume(); | 1278 | hrtimers_resume(); |
| 1252 | } | 1279 | } |
| 1253 | 1280 | ||
| 1254 | static int timekeeping_suspend(void) | 1281 | int timekeeping_suspend(void) |
| 1255 | { | 1282 | { |
| 1256 | struct timekeeper *tk = &tk_core.timekeeper; | 1283 | struct timekeeper *tk = &tk_core.timekeeper; |
| 1257 | unsigned long flags; | 1284 | unsigned long flags; |
| @@ -1296,6 +1323,7 @@ static int timekeeping_suspend(void) | |||
| 1296 | } | 1323 | } |
| 1297 | 1324 | ||
| 1298 | timekeeping_update(tk, TK_MIRROR); | 1325 | timekeeping_update(tk, TK_MIRROR); |
| 1326 | halt_fast_timekeeper(tk); | ||
| 1299 | write_seqcount_end(&tk_core.seq); | 1327 | write_seqcount_end(&tk_core.seq); |
| 1300 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1328 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
| 1301 | 1329 | ||
| @@ -1659,24 +1687,24 @@ out: | |||
| 1659 | } | 1687 | } |
| 1660 | 1688 | ||
| 1661 | /** | 1689 | /** |
| 1662 | * getboottime - Return the real time of system boot. | 1690 | * getboottime64 - Return the real time of system boot. |
| 1663 | * @ts: pointer to the timespec to be set | 1691 | * @ts: pointer to the timespec64 to be set |
| 1664 | * | 1692 | * |
| 1665 | * Returns the wall-time of boot in a timespec. | 1693 | * Returns the wall-time of boot in a timespec64. |
| 1666 | * | 1694 | * |
| 1667 | * This is based on the wall_to_monotonic offset and the total suspend | 1695 | * This is based on the wall_to_monotonic offset and the total suspend |
| 1668 | * time. Calls to settimeofday will affect the value returned (which | 1696 | * time. Calls to settimeofday will affect the value returned (which |
| 1669 | * basically means that however wrong your real time clock is at boot time, | 1697 | * basically means that however wrong your real time clock is at boot time, |
| 1670 | * you get the right time here). | 1698 | * you get the right time here). |
| 1671 | */ | 1699 | */ |
| 1672 | void getboottime(struct timespec *ts) | 1700 | void getboottime64(struct timespec64 *ts) |
| 1673 | { | 1701 | { |
| 1674 | struct timekeeper *tk = &tk_core.timekeeper; | 1702 | struct timekeeper *tk = &tk_core.timekeeper; |
| 1675 | ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); | 1703 | ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); |
| 1676 | 1704 | ||
| 1677 | *ts = ktime_to_timespec(t); | 1705 | *ts = ktime_to_timespec64(t); |
| 1678 | } | 1706 | } |
| 1679 | EXPORT_SYMBOL_GPL(getboottime); | 1707 | EXPORT_SYMBOL_GPL(getboottime64); |
| 1680 | 1708 | ||
| 1681 | unsigned long get_seconds(void) | 1709 | unsigned long get_seconds(void) |
| 1682 | { | 1710 | { |
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index adc1fc98bde3..1d91416055d5 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h | |||
| @@ -16,5 +16,7 @@ extern int timekeeping_inject_offset(struct timespec *ts); | |||
| 16 | extern s32 timekeeping_get_tai_offset(void); | 16 | extern s32 timekeeping_get_tai_offset(void); |
| 17 | extern void timekeeping_set_tai_offset(s32 tai_offset); | 17 | extern void timekeeping_set_tai_offset(s32 tai_offset); |
| 18 | extern void timekeeping_clocktai(struct timespec *ts); | 18 | extern void timekeeping_clocktai(struct timespec *ts); |
| 19 | extern int timekeeping_suspend(void); | ||
| 20 | extern void timekeeping_resume(void); | ||
| 19 | 21 | ||
| 20 | #endif | 22 | #endif |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 979ccde26720..98f26588255e 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
| @@ -3,11 +3,11 @@ | |||
| 3 | 3 | ||
| 4 | ifdef CONFIG_FUNCTION_TRACER | 4 | ifdef CONFIG_FUNCTION_TRACER |
| 5 | ORIG_CFLAGS := $(KBUILD_CFLAGS) | 5 | ORIG_CFLAGS := $(KBUILD_CFLAGS) |
| 6 | KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) | 6 | KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) |
| 7 | 7 | ||
| 8 | ifdef CONFIG_FTRACE_SELFTEST | 8 | ifdef CONFIG_FTRACE_SELFTEST |
| 9 | # selftest needs instrumentation | 9 | # selftest needs instrumentation |
| 10 | CFLAGS_trace_selftest_dynamic.o = -pg | 10 | CFLAGS_trace_selftest_dynamic.o = $(CC_FLAGS_FTRACE) |
| 11 | obj-y += trace_selftest_dynamic.o | 11 | obj-y += trace_selftest_dynamic.o |
| 12 | endif | 12 | endif |
| 13 | endif | 13 | endif |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 224e768bdc73..45e5cb143d17 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -5456,7 +5456,7 @@ static __init int ftrace_init_debugfs(void) | |||
| 5456 | struct dentry *d_tracer; | 5456 | struct dentry *d_tracer; |
| 5457 | 5457 | ||
| 5458 | d_tracer = tracing_init_dentry(); | 5458 | d_tracer = tracing_init_dentry(); |
| 5459 | if (!d_tracer) | 5459 | if (IS_ERR(d_tracer)) |
| 5460 | return 0; | 5460 | return 0; |
| 5461 | 5461 | ||
| 5462 | ftrace_init_dyn_debugfs(d_tracer); | 5462 | ftrace_init_dyn_debugfs(d_tracer); |
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 1c71382b283d..eb4220a132ec 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c | |||
| @@ -13,5 +13,6 @@ | |||
| 13 | #define CREATE_TRACE_POINTS | 13 | #define CREATE_TRACE_POINTS |
| 14 | #include <trace/events/power.h> | 14 | #include <trace/events/power.h> |
| 15 | 15 | ||
| 16 | EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); | ||
| 16 | EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); | 17 | EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); |
| 17 | 18 | ||
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7a4104cb95cb..5040d44fe5a3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -9,7 +9,6 @@ | |||
| 9 | #include <linux/trace_seq.h> | 9 | #include <linux/trace_seq.h> |
| 10 | #include <linux/spinlock.h> | 10 | #include <linux/spinlock.h> |
| 11 | #include <linux/irq_work.h> | 11 | #include <linux/irq_work.h> |
| 12 | #include <linux/debugfs.h> | ||
| 13 | #include <linux/uaccess.h> | 12 | #include <linux/uaccess.h> |
| 14 | #include <linux/hardirq.h> | 13 | #include <linux/hardirq.h> |
| 15 | #include <linux/kthread.h> /* for self test */ | 14 | #include <linux/kthread.h> /* for self test */ |
| @@ -23,7 +22,6 @@ | |||
| 23 | #include <linux/hash.h> | 22 | #include <linux/hash.h> |
| 24 | #include <linux/list.h> | 23 | #include <linux/list.h> |
| 25 | #include <linux/cpu.h> | 24 | #include <linux/cpu.h> |
| 26 | #include <linux/fs.h> | ||
| 27 | 25 | ||
| 28 | #include <asm/local.h> | 26 | #include <asm/local.h> |
| 29 | 27 | ||
| @@ -447,7 +445,10 @@ int ring_buffer_print_page_header(struct trace_seq *s) | |||
| 447 | struct rb_irq_work { | 445 | struct rb_irq_work { |
| 448 | struct irq_work work; | 446 | struct irq_work work; |
| 449 | wait_queue_head_t waiters; | 447 | wait_queue_head_t waiters; |
| 448 | wait_queue_head_t full_waiters; | ||
| 450 | bool waiters_pending; | 449 | bool waiters_pending; |
| 450 | bool full_waiters_pending; | ||
| 451 | bool wakeup_full; | ||
| 451 | }; | 452 | }; |
| 452 | 453 | ||
| 453 | /* | 454 | /* |
| @@ -529,6 +530,10 @@ static void rb_wake_up_waiters(struct irq_work *work) | |||
| 529 | struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); | 530 | struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); |
| 530 | 531 | ||
| 531 | wake_up_all(&rbwork->waiters); | 532 | wake_up_all(&rbwork->waiters); |
| 533 | if (rbwork->wakeup_full) { | ||
| 534 | rbwork->wakeup_full = false; | ||
| 535 | wake_up_all(&rbwork->full_waiters); | ||
| 536 | } | ||
| 532 | } | 537 | } |
| 533 | 538 | ||
| 534 | /** | 539 | /** |
| @@ -553,9 +558,11 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) | |||
| 553 | * data in any cpu buffer, or a specific buffer, put the | 558 | * data in any cpu buffer, or a specific buffer, put the |
| 554 | * caller on the appropriate wait queue. | 559 | * caller on the appropriate wait queue. |
| 555 | */ | 560 | */ |
| 556 | if (cpu == RING_BUFFER_ALL_CPUS) | 561 | if (cpu == RING_BUFFER_ALL_CPUS) { |
| 557 | work = &buffer->irq_work; | 562 | work = &buffer->irq_work; |
| 558 | else { | 563 | /* Full only makes sense on per cpu reads */ |
| 564 | full = false; | ||
| 565 | } else { | ||
| 559 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | 566 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) |
| 560 | return -ENODEV; | 567 | return -ENODEV; |
| 561 | cpu_buffer = buffer->buffers[cpu]; | 568 | cpu_buffer = buffer->buffers[cpu]; |
| @@ -564,7 +571,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) | |||
| 564 | 571 | ||
| 565 | 572 | ||
| 566 | while (true) { | 573 | while (true) { |
| 567 | prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); | 574 | if (full) |
| 575 | prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE); | ||
| 576 | else | ||
| 577 | prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); | ||
| 568 | 578 | ||
| 569 | /* | 579 | /* |
| 570 | * The events can happen in critical sections where | 580 | * The events can happen in critical sections where |
| @@ -586,7 +596,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) | |||
| 586 | * that is necessary is that the wake up happens after | 596 | * that is necessary is that the wake up happens after |
| 587 | * a task has been queued. It's OK for spurious wake ups. | 597 | * a task has been queued. It's OK for spurious wake ups. |
| 588 | */ | 598 | */ |
| 589 | work->waiters_pending = true; | 599 | if (full) |
| 600 | work->full_waiters_pending = true; | ||
| 601 | else | ||
| 602 | work->waiters_pending = true; | ||
| 590 | 603 | ||
| 591 | if (signal_pending(current)) { | 604 | if (signal_pending(current)) { |
| 592 | ret = -EINTR; | 605 | ret = -EINTR; |
| @@ -615,7 +628,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) | |||
| 615 | schedule(); | 628 | schedule(); |
| 616 | } | 629 | } |
| 617 | 630 | ||
| 618 | finish_wait(&work->waiters, &wait); | 631 | if (full) |
| 632 | finish_wait(&work->full_waiters, &wait); | ||
| 633 | else | ||
| 634 | finish_wait(&work->waiters, &wait); | ||
| 619 | 635 | ||
| 620 | return ret; | 636 | return ret; |
| 621 | } | 637 | } |
| @@ -1230,6 +1246,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) | |||
| 1230 | init_completion(&cpu_buffer->update_done); | 1246 | init_completion(&cpu_buffer->update_done); |
| 1231 | init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); | 1247 | init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); |
| 1232 | init_waitqueue_head(&cpu_buffer->irq_work.waiters); | 1248 | init_waitqueue_head(&cpu_buffer->irq_work.waiters); |
| 1249 | init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); | ||
| 1233 | 1250 | ||
| 1234 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), | 1251 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), |
| 1235 | GFP_KERNEL, cpu_to_node(cpu)); | 1252 | GFP_KERNEL, cpu_to_node(cpu)); |
| @@ -2801,6 +2818,8 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, | |||
| 2801 | static __always_inline void | 2818 | static __always_inline void |
| 2802 | rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) | 2819 | rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) |
| 2803 | { | 2820 | { |
| 2821 | bool pagebusy; | ||
| 2822 | |||
| 2804 | if (buffer->irq_work.waiters_pending) { | 2823 | if (buffer->irq_work.waiters_pending) { |
| 2805 | buffer->irq_work.waiters_pending = false; | 2824 | buffer->irq_work.waiters_pending = false; |
| 2806 | /* irq_work_queue() supplies it's own memory barriers */ | 2825 | /* irq_work_queue() supplies it's own memory barriers */ |
| @@ -2812,6 +2831,15 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) | |||
| 2812 | /* irq_work_queue() supplies it's own memory barriers */ | 2831 | /* irq_work_queue() supplies it's own memory barriers */ |
| 2813 | irq_work_queue(&cpu_buffer->irq_work.work); | 2832 | irq_work_queue(&cpu_buffer->irq_work.work); |
| 2814 | } | 2833 | } |
| 2834 | |||
| 2835 | pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; | ||
| 2836 | |||
| 2837 | if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { | ||
| 2838 | cpu_buffer->irq_work.wakeup_full = true; | ||
| 2839 | cpu_buffer->irq_work.full_waiters_pending = false; | ||
| 2840 | /* irq_work_queue() supplies it's own memory barriers */ | ||
| 2841 | irq_work_queue(&cpu_buffer->irq_work.work); | ||
| 2842 | } | ||
| 2815 | } | 2843 | } |
| 2816 | 2844 | ||
| 2817 | /** | 2845 | /** |
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 3f9e328c30b5..13d945c0d03f 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c | |||
| @@ -7,7 +7,7 @@ | |||
| 7 | #include <linux/completion.h> | 7 | #include <linux/completion.h> |
| 8 | #include <linux/kthread.h> | 8 | #include <linux/kthread.h> |
| 9 | #include <linux/module.h> | 9 | #include <linux/module.h> |
| 10 | #include <linux/time.h> | 10 | #include <linux/ktime.h> |
| 11 | #include <asm/local.h> | 11 | #include <asm/local.h> |
| 12 | 12 | ||
| 13 | struct rb_page { | 13 | struct rb_page { |
| @@ -17,7 +17,7 @@ struct rb_page { | |||
| 17 | }; | 17 | }; |
| 18 | 18 | ||
| 19 | /* run time and sleep time in seconds */ | 19 | /* run time and sleep time in seconds */ |
| 20 | #define RUN_TIME 10 | 20 | #define RUN_TIME 10ULL |
| 21 | #define SLEEP_TIME 10 | 21 | #define SLEEP_TIME 10 |
| 22 | 22 | ||
| 23 | /* number of events for writer to wake up the reader */ | 23 | /* number of events for writer to wake up the reader */ |
| @@ -212,8 +212,7 @@ static void ring_buffer_consumer(void) | |||
| 212 | 212 | ||
| 213 | static void ring_buffer_producer(void) | 213 | static void ring_buffer_producer(void) |
| 214 | { | 214 | { |
| 215 | struct timeval start_tv; | 215 | ktime_t start_time, end_time, timeout; |
| 216 | struct timeval end_tv; | ||
| 217 | unsigned long long time; | 216 | unsigned long long time; |
| 218 | unsigned long long entries; | 217 | unsigned long long entries; |
| 219 | unsigned long long overruns; | 218 | unsigned long long overruns; |
| @@ -227,7 +226,8 @@ static void ring_buffer_producer(void) | |||
| 227 | * make the system stall) | 226 | * make the system stall) |
| 228 | */ | 227 | */ |
| 229 | trace_printk("Starting ring buffer hammer\n"); | 228 | trace_printk("Starting ring buffer hammer\n"); |
| 230 | do_gettimeofday(&start_tv); | 229 | start_time = ktime_get(); |
| 230 | timeout = ktime_add_ns(start_time, RUN_TIME * NSEC_PER_SEC); | ||
| 231 | do { | 231 | do { |
| 232 | struct ring_buffer_event *event; | 232 | struct ring_buffer_event *event; |
| 233 | int *entry; | 233 | int *entry; |
| @@ -244,7 +244,7 @@ static void ring_buffer_producer(void) | |||
| 244 | ring_buffer_unlock_commit(buffer, event); | 244 | ring_buffer_unlock_commit(buffer, event); |
| 245 | } | 245 | } |
| 246 | } | 246 | } |
| 247 | do_gettimeofday(&end_tv); | 247 | end_time = ktime_get(); |
| 248 | 248 | ||
| 249 | cnt++; | 249 | cnt++; |
| 250 | if (consumer && !(cnt % wakeup_interval)) | 250 | if (consumer && !(cnt % wakeup_interval)) |
| @@ -264,7 +264,7 @@ static void ring_buffer_producer(void) | |||
| 264 | cond_resched(); | 264 | cond_resched(); |
| 265 | #endif | 265 | #endif |
| 266 | 266 | ||
| 267 | } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); | 267 | } while (ktime_before(end_time, timeout) && !kill_test); |
| 268 | trace_printk("End ring buffer hammer\n"); | 268 | trace_printk("End ring buffer hammer\n"); |
| 269 | 269 | ||
| 270 | if (consumer) { | 270 | if (consumer) { |
| @@ -280,9 +280,7 @@ static void ring_buffer_producer(void) | |||
| 280 | wait_for_completion(&read_done); | 280 | wait_for_completion(&read_done); |
| 281 | } | 281 | } |
| 282 | 282 | ||
| 283 | time = end_tv.tv_sec - start_tv.tv_sec; | 283 | time = ktime_us_delta(end_time, start_time); |
| 284 | time *= USEC_PER_SEC; | ||
| 285 | time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec); | ||
| 286 | 284 | ||
| 287 | entries = ring_buffer_entries(buffer); | 285 | entries = ring_buffer_entries(buffer); |
| 288 | overruns = ring_buffer_overruns(buffer); | 286 | overruns = ring_buffer_overruns(buffer); |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4a9079b9f082..62c6506d663f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -2036,7 +2036,8 @@ void trace_printk_init_buffers(void) | |||
| 2036 | 2036 | ||
| 2037 | /* trace_printk() is for debug use only. Don't use it in production. */ | 2037 | /* trace_printk() is for debug use only. Don't use it in production. */ |
| 2038 | 2038 | ||
| 2039 | pr_warning("\n**********************************************************\n"); | 2039 | pr_warning("\n"); |
| 2040 | pr_warning("**********************************************************\n"); | ||
| 2040 | pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); | 2041 | pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); |
| 2041 | pr_warning("** **\n"); | 2042 | pr_warning("** **\n"); |
| 2042 | pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); | 2043 | pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); |
| @@ -3352,12 +3353,12 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf, | |||
| 3352 | 3353 | ||
| 3353 | mutex_lock(&tracing_cpumask_update_lock); | 3354 | mutex_lock(&tracing_cpumask_update_lock); |
| 3354 | 3355 | ||
| 3355 | len = cpumask_scnprintf(mask_str, count, tr->tracing_cpumask); | 3356 | len = snprintf(mask_str, count, "%*pb\n", |
| 3356 | if (count - len < 2) { | 3357 | cpumask_pr_args(tr->tracing_cpumask)); |
| 3358 | if (len >= count) { | ||
| 3357 | count = -EINVAL; | 3359 | count = -EINVAL; |
| 3358 | goto out_err; | 3360 | goto out_err; |
| 3359 | } | 3361 | } |
| 3360 | len += sprintf(mask_str + len, "\n"); | ||
| 3361 | count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); | 3362 | count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); |
| 3362 | 3363 | ||
| 3363 | out_err: | 3364 | out_err: |
| @@ -4140,6 +4141,12 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) | |||
| 4140 | goto out; | 4141 | goto out; |
| 4141 | } | 4142 | } |
| 4142 | 4143 | ||
| 4144 | /* If trace pipe files are being read, we can't change the tracer */ | ||
| 4145 | if (tr->current_trace->ref) { | ||
| 4146 | ret = -EBUSY; | ||
| 4147 | goto out; | ||
| 4148 | } | ||
| 4149 | |||
| 4143 | trace_branch_disable(); | 4150 | trace_branch_disable(); |
| 4144 | 4151 | ||
| 4145 | tr->current_trace->enabled--; | 4152 | tr->current_trace->enabled--; |
| @@ -4326,17 +4333,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) | |||
| 4326 | } | 4333 | } |
| 4327 | 4334 | ||
| 4328 | trace_seq_init(&iter->seq); | 4335 | trace_seq_init(&iter->seq); |
| 4329 | 4336 | iter->trace = tr->current_trace; | |
| 4330 | /* | ||
| 4331 | * We make a copy of the current tracer to avoid concurrent | ||
| 4332 | * changes on it while we are reading. | ||
| 4333 | */ | ||
| 4334 | iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL); | ||
| 4335 | if (!iter->trace) { | ||
| 4336 | ret = -ENOMEM; | ||
| 4337 | goto fail; | ||
| 4338 | } | ||
| 4339 | *iter->trace = *tr->current_trace; | ||
| 4340 | 4337 | ||
| 4341 | if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { | 4338 | if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { |
| 4342 | ret = -ENOMEM; | 4339 | ret = -ENOMEM; |
| @@ -4363,6 +4360,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) | |||
| 4363 | iter->trace->pipe_open(iter); | 4360 | iter->trace->pipe_open(iter); |
| 4364 | 4361 | ||
| 4365 | nonseekable_open(inode, filp); | 4362 | nonseekable_open(inode, filp); |
| 4363 | |||
| 4364 | tr->current_trace->ref++; | ||
| 4366 | out: | 4365 | out: |
| 4367 | mutex_unlock(&trace_types_lock); | 4366 | mutex_unlock(&trace_types_lock); |
| 4368 | return ret; | 4367 | return ret; |
| @@ -4382,6 +4381,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) | |||
| 4382 | 4381 | ||
| 4383 | mutex_lock(&trace_types_lock); | 4382 | mutex_lock(&trace_types_lock); |
| 4384 | 4383 | ||
| 4384 | tr->current_trace->ref--; | ||
| 4385 | |||
| 4385 | if (iter->trace->pipe_close) | 4386 | if (iter->trace->pipe_close) |
| 4386 | iter->trace->pipe_close(iter); | 4387 | iter->trace->pipe_close(iter); |
| 4387 | 4388 | ||
| @@ -4389,7 +4390,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) | |||
| 4389 | 4390 | ||
| 4390 | free_cpumask_var(iter->started); | 4391 | free_cpumask_var(iter->started); |
| 4391 | mutex_destroy(&iter->mutex); | 4392 | mutex_destroy(&iter->mutex); |
| 4392 | kfree(iter->trace); | ||
| 4393 | kfree(iter); | 4393 | kfree(iter); |
| 4394 | 4394 | ||
| 4395 | trace_array_put(tr); | 4395 | trace_array_put(tr); |
| @@ -4422,7 +4422,7 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) | |||
| 4422 | return trace_poll(iter, filp, poll_table); | 4422 | return trace_poll(iter, filp, poll_table); |
| 4423 | } | 4423 | } |
| 4424 | 4424 | ||
| 4425 | /* Must be called with trace_types_lock mutex held. */ | 4425 | /* Must be called with iter->mutex held. */ |
| 4426 | static int tracing_wait_pipe(struct file *filp) | 4426 | static int tracing_wait_pipe(struct file *filp) |
| 4427 | { | 4427 | { |
| 4428 | struct trace_iterator *iter = filp->private_data; | 4428 | struct trace_iterator *iter = filp->private_data; |
| @@ -4467,7 +4467,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, | |||
| 4467 | size_t cnt, loff_t *ppos) | 4467 | size_t cnt, loff_t *ppos) |
| 4468 | { | 4468 | { |
| 4469 | struct trace_iterator *iter = filp->private_data; | 4469 | struct trace_iterator *iter = filp->private_data; |
| 4470 | struct trace_array *tr = iter->tr; | ||
| 4471 | ssize_t sret; | 4470 | ssize_t sret; |
| 4472 | 4471 | ||
| 4473 | /* return any leftover data */ | 4472 | /* return any leftover data */ |
| @@ -4477,12 +4476,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, | |||
| 4477 | 4476 | ||
| 4478 | trace_seq_init(&iter->seq); | 4477 | trace_seq_init(&iter->seq); |
| 4479 | 4478 | ||
| 4480 | /* copy the tracer to avoid using a global lock all around */ | ||
| 4481 | mutex_lock(&trace_types_lock); | ||
| 4482 | if (unlikely(iter->trace->name != tr->current_trace->name)) | ||
| 4483 | *iter->trace = *tr->current_trace; | ||
| 4484 | mutex_unlock(&trace_types_lock); | ||
| 4485 | |||
| 4486 | /* | 4479 | /* |
| 4487 | * Avoid more than one consumer on a single file descriptor | 4480 | * Avoid more than one consumer on a single file descriptor |
| 4488 | * This is just a matter of traces coherency, the ring buffer itself | 4481 | * This is just a matter of traces coherency, the ring buffer itself |
| @@ -4642,7 +4635,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
| 4642 | .ops = &tracing_pipe_buf_ops, | 4635 | .ops = &tracing_pipe_buf_ops, |
| 4643 | .spd_release = tracing_spd_release_pipe, | 4636 | .spd_release = tracing_spd_release_pipe, |
| 4644 | }; | 4637 | }; |
| 4645 | struct trace_array *tr = iter->tr; | ||
| 4646 | ssize_t ret; | 4638 | ssize_t ret; |
| 4647 | size_t rem; | 4639 | size_t rem; |
| 4648 | unsigned int i; | 4640 | unsigned int i; |
| @@ -4650,12 +4642,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
| 4650 | if (splice_grow_spd(pipe, &spd)) | 4642 | if (splice_grow_spd(pipe, &spd)) |
| 4651 | return -ENOMEM; | 4643 | return -ENOMEM; |
| 4652 | 4644 | ||
| 4653 | /* copy the tracer to avoid using a global lock all around */ | ||
| 4654 | mutex_lock(&trace_types_lock); | ||
| 4655 | if (unlikely(iter->trace->name != tr->current_trace->name)) | ||
| 4656 | *iter->trace = *tr->current_trace; | ||
| 4657 | mutex_unlock(&trace_types_lock); | ||
| 4658 | |||
| 4659 | mutex_lock(&iter->mutex); | 4645 | mutex_lock(&iter->mutex); |
| 4660 | 4646 | ||
| 4661 | if (iter->trace->splice_read) { | 4647 | if (iter->trace->splice_read) { |
| @@ -4942,7 +4928,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
| 4942 | *fpos += written; | 4928 | *fpos += written; |
| 4943 | 4929 | ||
| 4944 | out_unlock: | 4930 | out_unlock: |
| 4945 | for (i = 0; i < nr_pages; i++){ | 4931 | for (i = nr_pages - 1; i >= 0; i--) { |
| 4946 | kunmap_atomic(map_page[i]); | 4932 | kunmap_atomic(map_page[i]); |
| 4947 | put_page(pages[i]); | 4933 | put_page(pages[i]); |
| 4948 | } | 4934 | } |
| @@ -5331,6 +5317,8 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) | |||
| 5331 | 5317 | ||
| 5332 | filp->private_data = info; | 5318 | filp->private_data = info; |
| 5333 | 5319 | ||
| 5320 | tr->current_trace->ref++; | ||
| 5321 | |||
| 5334 | mutex_unlock(&trace_types_lock); | 5322 | mutex_unlock(&trace_types_lock); |
| 5335 | 5323 | ||
| 5336 | ret = nonseekable_open(inode, filp); | 5324 | ret = nonseekable_open(inode, filp); |
| @@ -5361,21 +5349,16 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, | |||
| 5361 | if (!count) | 5349 | if (!count) |
| 5362 | return 0; | 5350 | return 0; |
| 5363 | 5351 | ||
| 5364 | mutex_lock(&trace_types_lock); | ||
| 5365 | |||
| 5366 | #ifdef CONFIG_TRACER_MAX_TRACE | 5352 | #ifdef CONFIG_TRACER_MAX_TRACE |
| 5367 | if (iter->snapshot && iter->tr->current_trace->use_max_tr) { | 5353 | if (iter->snapshot && iter->tr->current_trace->use_max_tr) |
| 5368 | size = -EBUSY; | 5354 | return -EBUSY; |
| 5369 | goto out_unlock; | ||
| 5370 | } | ||
| 5371 | #endif | 5355 | #endif |
| 5372 | 5356 | ||
| 5373 | if (!info->spare) | 5357 | if (!info->spare) |
| 5374 | info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, | 5358 | info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, |
| 5375 | iter->cpu_file); | 5359 | iter->cpu_file); |
| 5376 | size = -ENOMEM; | ||
| 5377 | if (!info->spare) | 5360 | if (!info->spare) |
| 5378 | goto out_unlock; | 5361 | return -ENOMEM; |
| 5379 | 5362 | ||
| 5380 | /* Do we have previous read data to read? */ | 5363 | /* Do we have previous read data to read? */ |
| 5381 | if (info->read < PAGE_SIZE) | 5364 | if (info->read < PAGE_SIZE) |
| @@ -5391,21 +5374,16 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, | |||
| 5391 | 5374 | ||
| 5392 | if (ret < 0) { | 5375 | if (ret < 0) { |
| 5393 | if (trace_empty(iter)) { | 5376 | if (trace_empty(iter)) { |
| 5394 | if ((filp->f_flags & O_NONBLOCK)) { | 5377 | if ((filp->f_flags & O_NONBLOCK)) |
| 5395 | size = -EAGAIN; | 5378 | return -EAGAIN; |
| 5396 | goto out_unlock; | 5379 | |
| 5397 | } | ||
| 5398 | mutex_unlock(&trace_types_lock); | ||
| 5399 | ret = wait_on_pipe(iter, false); | 5380 | ret = wait_on_pipe(iter, false); |
| 5400 | mutex_lock(&trace_types_lock); | 5381 | if (ret) |
| 5401 | if (ret) { | 5382 | return ret; |
| 5402 | size = ret; | 5383 | |
| 5403 | goto out_unlock; | ||
| 5404 | } | ||
| 5405 | goto again; | 5384 | goto again; |
| 5406 | } | 5385 | } |
| 5407 | size = 0; | 5386 | return 0; |
| 5408 | goto out_unlock; | ||
| 5409 | } | 5387 | } |
| 5410 | 5388 | ||
| 5411 | info->read = 0; | 5389 | info->read = 0; |
| @@ -5415,18 +5393,14 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, | |||
| 5415 | size = count; | 5393 | size = count; |
| 5416 | 5394 | ||
| 5417 | ret = copy_to_user(ubuf, info->spare + info->read, size); | 5395 | ret = copy_to_user(ubuf, info->spare + info->read, size); |
| 5418 | if (ret == size) { | 5396 | if (ret == size) |
| 5419 | size = -EFAULT; | 5397 | return -EFAULT; |
| 5420 | goto out_unlock; | 5398 | |
| 5421 | } | ||
| 5422 | size -= ret; | 5399 | size -= ret; |
| 5423 | 5400 | ||
| 5424 | *ppos += size; | 5401 | *ppos += size; |
| 5425 | info->read += size; | 5402 | info->read += size; |
| 5426 | 5403 | ||
| 5427 | out_unlock: | ||
| 5428 | mutex_unlock(&trace_types_lock); | ||
| 5429 | |||
| 5430 | return size; | 5404 | return size; |
| 5431 | } | 5405 | } |
| 5432 | 5406 | ||
| @@ -5437,6 +5411,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) | |||
| 5437 | 5411 | ||
| 5438 | mutex_lock(&trace_types_lock); | 5412 | mutex_lock(&trace_types_lock); |
| 5439 | 5413 | ||
| 5414 | iter->tr->current_trace->ref--; | ||
| 5415 | |||
| 5440 | __trace_array_put(iter->tr); | 5416 | __trace_array_put(iter->tr); |
| 5441 | 5417 | ||
| 5442 | if (info->spare) | 5418 | if (info->spare) |
| @@ -5522,30 +5498,20 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
| 5522 | int entries, size, i; | 5498 | int entries, size, i; |
| 5523 | ssize_t ret = 0; | 5499 | ssize_t ret = 0; |
| 5524 | 5500 | ||
| 5525 | mutex_lock(&trace_types_lock); | ||
| 5526 | |||
| 5527 | #ifdef CONFIG_TRACER_MAX_TRACE | 5501 | #ifdef CONFIG_TRACER_MAX_TRACE |
| 5528 | if (iter->snapshot && iter->tr->current_trace->use_max_tr) { | 5502 | if (iter->snapshot && iter->tr->current_trace->use_max_tr) |
| 5529 | ret = -EBUSY; | 5503 | return -EBUSY; |
| 5530 | goto out; | ||
| 5531 | } | ||
| 5532 | #endif | 5504 | #endif |
| 5533 | 5505 | ||
| 5534 | if (splice_grow_spd(pipe, &spd)) { | 5506 | if (splice_grow_spd(pipe, &spd)) |
| 5535 | ret = -ENOMEM; | 5507 | return -ENOMEM; |
| 5536 | goto out; | ||
| 5537 | } | ||
| 5538 | 5508 | ||
| 5539 | if (*ppos & (PAGE_SIZE - 1)) { | 5509 | if (*ppos & (PAGE_SIZE - 1)) |
| 5540 | ret = -EINVAL; | 5510 | return -EINVAL; |
| 5541 | goto out; | ||
| 5542 | } | ||
| 5543 | 5511 | ||
| 5544 | if (len & (PAGE_SIZE - 1)) { | 5512 | if (len & (PAGE_SIZE - 1)) { |
| 5545 | if (len < PAGE_SIZE) { | 5513 | if (len < PAGE_SIZE) |
| 5546 | ret = -EINVAL; | 5514 | return -EINVAL; |
| 5547 | goto out; | ||
| 5548 | } | ||
| 5549 | len &= PAGE_MASK; | 5515 | len &= PAGE_MASK; |
| 5550 | } | 5516 | } |
| 5551 | 5517 | ||
| @@ -5606,25 +5572,20 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
| 5606 | /* did we read anything? */ | 5572 | /* did we read anything? */ |
| 5607 | if (!spd.nr_pages) { | 5573 | if (!spd.nr_pages) { |
| 5608 | if (ret) | 5574 | if (ret) |
| 5609 | goto out; | 5575 | return ret; |
| 5576 | |||
| 5577 | if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) | ||
| 5578 | return -EAGAIN; | ||
| 5610 | 5579 | ||
| 5611 | if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { | ||
| 5612 | ret = -EAGAIN; | ||
| 5613 | goto out; | ||
| 5614 | } | ||
| 5615 | mutex_unlock(&trace_types_lock); | ||
| 5616 | ret = wait_on_pipe(iter, true); | 5580 | ret = wait_on_pipe(iter, true); |
| 5617 | mutex_lock(&trace_types_lock); | ||
| 5618 | if (ret) | 5581 | if (ret) |
| 5619 | goto out; | 5582 | return ret; |
| 5620 | 5583 | ||
| 5621 | goto again; | 5584 | goto again; |
| 5622 | } | 5585 | } |
| 5623 | 5586 | ||
| 5624 | ret = splice_to_pipe(pipe, &spd); | 5587 | ret = splice_to_pipe(pipe, &spd); |
| 5625 | splice_shrink_spd(&spd); | 5588 | splice_shrink_spd(&spd); |
| 5626 | out: | ||
| 5627 | mutex_unlock(&trace_types_lock); | ||
| 5628 | 5589 | ||
| 5629 | return ret; | 5590 | return ret; |
| 5630 | } | 5591 | } |
| @@ -5854,28 +5815,11 @@ static __init int register_snapshot_cmd(void) | |||
| 5854 | static inline __init int register_snapshot_cmd(void) { return 0; } | 5815 | static inline __init int register_snapshot_cmd(void) { return 0; } |
| 5855 | #endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ | 5816 | #endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ |
| 5856 | 5817 | ||
| 5857 | struct dentry *tracing_init_dentry_tr(struct trace_array *tr) | 5818 | static struct dentry *tracing_get_dentry(struct trace_array *tr) |
| 5858 | { | 5819 | { |
| 5859 | if (tr->dir) | ||
| 5860 | return tr->dir; | ||
| 5861 | |||
| 5862 | if (!debugfs_initialized()) | ||
| 5863 | return NULL; | ||
| 5864 | |||
| 5865 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL) | ||
| 5866 | tr->dir = debugfs_create_dir("tracing", NULL); | ||
| 5867 | |||
| 5868 | if (!tr->dir) | ||
| 5869 | pr_warn_once("Could not create debugfs directory 'tracing'\n"); | ||
| 5870 | |||
| 5871 | return tr->dir; | 5820 | return tr->dir; |
| 5872 | } | 5821 | } |
| 5873 | 5822 | ||
| 5874 | struct dentry *tracing_init_dentry(void) | ||
| 5875 | { | ||
| 5876 | return tracing_init_dentry_tr(&global_trace); | ||
| 5877 | } | ||
| 5878 | |||
| 5879 | static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) | 5823 | static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) |
| 5880 | { | 5824 | { |
| 5881 | struct dentry *d_tracer; | 5825 | struct dentry *d_tracer; |
| @@ -5883,8 +5827,8 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) | |||
| 5883 | if (tr->percpu_dir) | 5827 | if (tr->percpu_dir) |
| 5884 | return tr->percpu_dir; | 5828 | return tr->percpu_dir; |
| 5885 | 5829 | ||
| 5886 | d_tracer = tracing_init_dentry_tr(tr); | 5830 | d_tracer = tracing_get_dentry(tr); |
| 5887 | if (!d_tracer) | 5831 | if (IS_ERR(d_tracer)) |
| 5888 | return NULL; | 5832 | return NULL; |
| 5889 | 5833 | ||
| 5890 | tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); | 5834 | tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); |
| @@ -6086,8 +6030,8 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) | |||
| 6086 | if (tr->options) | 6030 | if (tr->options) |
| 6087 | return tr->options; | 6031 | return tr->options; |
| 6088 | 6032 | ||
| 6089 | d_tracer = tracing_init_dentry_tr(tr); | 6033 | d_tracer = tracing_get_dentry(tr); |
| 6090 | if (!d_tracer) | 6034 | if (IS_ERR(d_tracer)) |
| 6091 | return NULL; | 6035 | return NULL; |
| 6092 | 6036 | ||
| 6093 | tr->options = debugfs_create_dir("options", d_tracer); | 6037 | tr->options = debugfs_create_dir("options", d_tracer); |
| @@ -6416,7 +6360,7 @@ static int instance_delete(const char *name) | |||
| 6416 | goto out_unlock; | 6360 | goto out_unlock; |
| 6417 | 6361 | ||
| 6418 | ret = -EBUSY; | 6362 | ret = -EBUSY; |
| 6419 | if (tr->ref) | 6363 | if (tr->ref || (tr->current_trace && tr->current_trace->ref)) |
| 6420 | goto out_unlock; | 6364 | goto out_unlock; |
| 6421 | 6365 | ||
| 6422 | list_del(&tr->list); | 6366 | list_del(&tr->list); |
| @@ -6571,6 +6515,33 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) | |||
| 6571 | 6515 | ||
| 6572 | } | 6516 | } |
| 6573 | 6517 | ||
| 6518 | /** | ||
| 6519 | * tracing_init_dentry - initialize top level trace array | ||
| 6520 | * | ||
| 6521 | * This is called when creating files or directories in the tracing | ||
| 6522 | * directory. It is called via fs_initcall() by any of the boot up code | ||
| 6523 | * and expects to return the dentry of the top level tracing directory. | ||
| 6524 | */ | ||
| 6525 | struct dentry *tracing_init_dentry(void) | ||
| 6526 | { | ||
| 6527 | struct trace_array *tr = &global_trace; | ||
| 6528 | |||
| 6529 | if (tr->dir) | ||
| 6530 | return tr->dir; | ||
| 6531 | |||
| 6532 | if (WARN_ON(!debugfs_initialized())) | ||
| 6533 | return ERR_PTR(-ENODEV); | ||
| 6534 | |||
| 6535 | tr->dir = debugfs_create_dir("tracing", NULL); | ||
| 6536 | |||
| 6537 | if (!tr->dir) { | ||
| 6538 | pr_warn_once("Could not create debugfs directory 'tracing'\n"); | ||
| 6539 | return ERR_PTR(-ENOMEM); | ||
| 6540 | } | ||
| 6541 | |||
| 6542 | return tr->dir; | ||
| 6543 | } | ||
| 6544 | |||
| 6574 | static __init int tracer_init_debugfs(void) | 6545 | static __init int tracer_init_debugfs(void) |
| 6575 | { | 6546 | { |
| 6576 | struct dentry *d_tracer; | 6547 | struct dentry *d_tracer; |
| @@ -6578,7 +6549,7 @@ static __init int tracer_init_debugfs(void) | |||
| 6578 | trace_access_lock_init(); | 6549 | trace_access_lock_init(); |
| 6579 | 6550 | ||
| 6580 | d_tracer = tracing_init_dentry(); | 6551 | d_tracer = tracing_init_dentry(); |
| 6581 | if (!d_tracer) | 6552 | if (IS_ERR(d_tracer)) |
| 6582 | return 0; | 6553 | return 0; |
| 6583 | 6554 | ||
| 6584 | init_tracer_debugfs(&global_trace, d_tracer); | 6555 | init_tracer_debugfs(&global_trace, d_tracer); |
| @@ -6811,7 +6782,6 @@ __init static int tracer_alloc_buffers(void) | |||
| 6811 | int ring_buf_size; | 6782 | int ring_buf_size; |
| 6812 | int ret = -ENOMEM; | 6783 | int ret = -ENOMEM; |
| 6813 | 6784 | ||
| 6814 | |||
| 6815 | if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) | 6785 | if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) |
| 6816 | goto out; | 6786 | goto out; |
| 6817 | 6787 | ||
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8de48bac1ce2..dd8205a35760 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -388,6 +388,7 @@ struct tracer { | |||
| 388 | struct tracer *next; | 388 | struct tracer *next; |
| 389 | struct tracer_flags *flags; | 389 | struct tracer_flags *flags; |
| 390 | int enabled; | 390 | int enabled; |
| 391 | int ref; | ||
| 391 | bool print_max; | 392 | bool print_max; |
| 392 | bool allow_instances; | 393 | bool allow_instances; |
| 393 | #ifdef CONFIG_TRACER_MAX_TRACE | 394 | #ifdef CONFIG_TRACER_MAX_TRACE |
| @@ -541,7 +542,6 @@ struct dentry *trace_create_file(const char *name, | |||
| 541 | void *data, | 542 | void *data, |
| 542 | const struct file_operations *fops); | 543 | const struct file_operations *fops); |
| 543 | 544 | ||
| 544 | struct dentry *tracing_init_dentry_tr(struct trace_array *tr); | ||
| 545 | struct dentry *tracing_init_dentry(void); | 545 | struct dentry *tracing_init_dentry(void); |
| 546 | 546 | ||
| 547 | struct ring_buffer_event; | 547 | struct ring_buffer_event; |
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 7d6e2afde669..57cbf1efdd44 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c | |||
| @@ -7,7 +7,6 @@ | |||
| 7 | #include <linux/seq_file.h> | 7 | #include <linux/seq_file.h> |
| 8 | #include <linux/spinlock.h> | 8 | #include <linux/spinlock.h> |
| 9 | #include <linux/irqflags.h> | 9 | #include <linux/irqflags.h> |
| 10 | #include <linux/debugfs.h> | ||
| 11 | #include <linux/uaccess.h> | 10 | #include <linux/uaccess.h> |
| 12 | #include <linux/module.h> | 11 | #include <linux/module.h> |
| 13 | #include <linux/ftrace.h> | 12 | #include <linux/ftrace.h> |
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 4b9c114ee9de..6fa484de2ba1 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
| @@ -261,7 +261,7 @@ void perf_trace_del(struct perf_event *p_event, int flags) | |||
| 261 | } | 261 | } |
| 262 | 262 | ||
| 263 | void *perf_trace_buf_prepare(int size, unsigned short type, | 263 | void *perf_trace_buf_prepare(int size, unsigned short type, |
| 264 | struct pt_regs *regs, int *rctxp) | 264 | struct pt_regs **regs, int *rctxp) |
| 265 | { | 265 | { |
| 266 | struct trace_entry *entry; | 266 | struct trace_entry *entry; |
| 267 | unsigned long flags; | 267 | unsigned long flags; |
| @@ -280,6 +280,8 @@ void *perf_trace_buf_prepare(int size, unsigned short type, | |||
| 280 | if (*rctxp < 0) | 280 | if (*rctxp < 0) |
| 281 | return NULL; | 281 | return NULL; |
| 282 | 282 | ||
| 283 | if (regs) | ||
| 284 | *regs = this_cpu_ptr(&__perf_regs[*rctxp]); | ||
| 283 | raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); | 285 | raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); |
| 284 | 286 | ||
| 285 | /* zero the dead bytes from align to not leak stack to user */ | 287 | /* zero the dead bytes from align to not leak stack to user */ |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b03a0ea77b99..db54dda10ccc 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -2531,7 +2531,7 @@ static __init int event_trace_init(void) | |||
| 2531 | return -ENODEV; | 2531 | return -ENODEV; |
| 2532 | 2532 | ||
| 2533 | d_tracer = tracing_init_dentry(); | 2533 | d_tracer = tracing_init_dentry(); |
| 2534 | if (!d_tracer) | 2534 | if (IS_ERR(d_tracer)) |
| 2535 | return 0; | 2535 | return 0; |
| 2536 | 2536 | ||
| 2537 | entry = debugfs_create_file("available_events", 0444, d_tracer, | 2537 | entry = debugfs_create_file("available_events", 0444, d_tracer, |
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d4ddde28a81a..12e2b99be862 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
| @@ -6,12 +6,10 @@ | |||
| 6 | #include <linux/stringify.h> | 6 | #include <linux/stringify.h> |
| 7 | #include <linux/kallsyms.h> | 7 | #include <linux/kallsyms.h> |
| 8 | #include <linux/seq_file.h> | 8 | #include <linux/seq_file.h> |
| 9 | #include <linux/debugfs.h> | ||
| 10 | #include <linux/uaccess.h> | 9 | #include <linux/uaccess.h> |
| 11 | #include <linux/ftrace.h> | 10 | #include <linux/ftrace.h> |
| 12 | #include <linux/module.h> | 11 | #include <linux/module.h> |
| 13 | #include <linux/init.h> | 12 | #include <linux/init.h> |
| 14 | #include <linux/fs.h> | ||
| 15 | 13 | ||
| 16 | #include "trace_output.h" | 14 | #include "trace_output.h" |
| 17 | 15 | ||
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index ba476009e5de..2d25ad1526bb 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
| @@ -1437,7 +1437,7 @@ static __init int init_graph_debugfs(void) | |||
| 1437 | struct dentry *d_tracer; | 1437 | struct dentry *d_tracer; |
| 1438 | 1438 | ||
| 1439 | d_tracer = tracing_init_dentry(); | 1439 | d_tracer = tracing_init_dentry(); |
| 1440 | if (!d_tracer) | 1440 | if (IS_ERR(d_tracer)) |
| 1441 | return 0; | 1441 | return 0; |
| 1442 | 1442 | ||
| 1443 | trace_create_file("max_graph_depth", 0644, d_tracer, | 1443 | trace_create_file("max_graph_depth", 0644, d_tracer, |
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 9bb104f748d0..8523ea345f2b 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
| @@ -10,11 +10,9 @@ | |||
| 10 | * Copyright (C) 2004 Nadia Yvette Chambers | 10 | * Copyright (C) 2004 Nadia Yvette Chambers |
| 11 | */ | 11 | */ |
| 12 | #include <linux/kallsyms.h> | 12 | #include <linux/kallsyms.h> |
| 13 | #include <linux/debugfs.h> | ||
| 14 | #include <linux/uaccess.h> | 13 | #include <linux/uaccess.h> |
| 15 | #include <linux/module.h> | 14 | #include <linux/module.h> |
| 16 | #include <linux/ftrace.h> | 15 | #include <linux/ftrace.h> |
| 17 | #include <linux/fs.h> | ||
| 18 | 16 | ||
| 19 | #include "trace.h" | 17 | #include "trace.h" |
| 20 | 18 | ||
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5edb518be345..d73f565b4e06 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -1148,7 +1148,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) | |||
| 1148 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1148 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
| 1149 | size -= sizeof(u32); | 1149 | size -= sizeof(u32); |
| 1150 | 1150 | ||
| 1151 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); | 1151 | entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); |
| 1152 | if (!entry) | 1152 | if (!entry) |
| 1153 | return; | 1153 | return; |
| 1154 | 1154 | ||
| @@ -1179,7 +1179,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, | |||
| 1179 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1179 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
| 1180 | size -= sizeof(u32); | 1180 | size -= sizeof(u32); |
| 1181 | 1181 | ||
| 1182 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); | 1182 | entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); |
| 1183 | if (!entry) | 1183 | if (!entry) |
| 1184 | return; | 1184 | return; |
| 1185 | 1185 | ||
| @@ -1320,7 +1320,7 @@ static __init int init_kprobe_trace(void) | |||
| 1320 | return -EINVAL; | 1320 | return -EINVAL; |
| 1321 | 1321 | ||
| 1322 | d_tracer = tracing_init_dentry(); | 1322 | d_tracer = tracing_init_dentry(); |
| 1323 | if (!d_tracer) | 1323 | if (IS_ERR(d_tracer)) |
| 1324 | return 0; | 1324 | return 0; |
| 1325 | 1325 | ||
| 1326 | entry = debugfs_create_file("kprobe_events", 0644, d_tracer, | 1326 | entry = debugfs_create_file("kprobe_events", 0644, d_tracer, |
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index fcf0a9e48916..8bb2071474dd 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c | |||
| @@ -6,8 +6,6 @@ | |||
| 6 | */ | 6 | */ |
| 7 | 7 | ||
| 8 | #include <linux/module.h> | 8 | #include <linux/module.h> |
| 9 | #include <linux/fs.h> | ||
| 10 | #include <linux/debugfs.h> | ||
| 11 | #include <linux/ftrace.h> | 9 | #include <linux/ftrace.h> |
| 12 | 10 | ||
| 13 | #include "trace.h" | 11 | #include "trace.h" |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b77b9a697619..692bf7184c8c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
| @@ -177,6 +177,50 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) | |||
| 177 | } | 177 | } |
| 178 | EXPORT_SYMBOL(ftrace_print_hex_seq); | 178 | EXPORT_SYMBOL(ftrace_print_hex_seq); |
| 179 | 179 | ||
| 180 | const char * | ||
| 181 | ftrace_print_array_seq(struct trace_seq *p, const void *buf, int buf_len, | ||
| 182 | size_t el_size) | ||
| 183 | { | ||
| 184 | const char *ret = trace_seq_buffer_ptr(p); | ||
| 185 | const char *prefix = ""; | ||
| 186 | void *ptr = (void *)buf; | ||
| 187 | |||
| 188 | trace_seq_putc(p, '{'); | ||
| 189 | |||
| 190 | while (ptr < buf + buf_len) { | ||
| 191 | switch (el_size) { | ||
| 192 | case 1: | ||
| 193 | trace_seq_printf(p, "%s0x%x", prefix, | ||
| 194 | *(u8 *)ptr); | ||
| 195 | break; | ||
| 196 | case 2: | ||
| 197 | trace_seq_printf(p, "%s0x%x", prefix, | ||
| 198 | *(u16 *)ptr); | ||
| 199 | break; | ||
| 200 | case 4: | ||
| 201 | trace_seq_printf(p, "%s0x%x", prefix, | ||
| 202 | *(u32 *)ptr); | ||
| 203 | break; | ||
| 204 | case 8: | ||
| 205 | trace_seq_printf(p, "%s0x%llx", prefix, | ||
| 206 | *(u64 *)ptr); | ||
| 207 | break; | ||
| 208 | default: | ||
| 209 | trace_seq_printf(p, "BAD SIZE:%zu 0x%x", el_size, | ||
| 210 | *(u8 *)ptr); | ||
| 211 | el_size = 1; | ||
| 212 | } | ||
| 213 | prefix = ","; | ||
| 214 | ptr += el_size; | ||
| 215 | } | ||
| 216 | |||
| 217 | trace_seq_putc(p, '}'); | ||
| 218 | trace_seq_putc(p, 0); | ||
| 219 | |||
| 220 | return ret; | ||
| 221 | } | ||
| 222 | EXPORT_SYMBOL(ftrace_print_array_seq); | ||
| 223 | |||
| 180 | int ftrace_raw_output_prep(struct trace_iterator *iter, | 224 | int ftrace_raw_output_prep(struct trace_iterator *iter, |
| 181 | struct trace_event *trace_event) | 225 | struct trace_event *trace_event) |
| 182 | { | 226 | { |
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index c4e70b6bd7fa..36c1455b7567 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
| @@ -5,7 +5,6 @@ | |||
| 5 | * | 5 | * |
| 6 | */ | 6 | */ |
| 7 | #include <linux/seq_file.h> | 7 | #include <linux/seq_file.h> |
| 8 | #include <linux/debugfs.h> | ||
| 9 | #include <linux/uaccess.h> | 8 | #include <linux/uaccess.h> |
| 10 | #include <linux/kernel.h> | 9 | #include <linux/kernel.h> |
| 11 | #include <linux/ftrace.h> | 10 | #include <linux/ftrace.h> |
| @@ -15,7 +14,6 @@ | |||
| 15 | #include <linux/ctype.h> | 14 | #include <linux/ctype.h> |
| 16 | #include <linux/list.h> | 15 | #include <linux/list.h> |
| 17 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
| 18 | #include <linux/fs.h> | ||
| 19 | 17 | ||
| 20 | #include "trace.h" | 18 | #include "trace.h" |
| 21 | 19 | ||
| @@ -349,7 +347,7 @@ static __init int init_trace_printk_function_export(void) | |||
| 349 | struct dentry *d_tracer; | 347 | struct dentry *d_tracer; |
| 350 | 348 | ||
| 351 | d_tracer = tracing_init_dentry(); | 349 | d_tracer = tracing_init_dentry(); |
| 352 | if (!d_tracer) | 350 | if (IS_ERR(d_tracer)) |
| 353 | return 0; | 351 | return 0; |
| 354 | 352 | ||
| 355 | trace_create_file("printk_formats", 0444, d_tracer, | 353 | trace_create_file("printk_formats", 0444, d_tracer, |
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 2e293beb186e..419ca37e72c9 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c | |||
| @@ -5,8 +5,6 @@ | |||
| 5 | * | 5 | * |
| 6 | */ | 6 | */ |
| 7 | #include <linux/module.h> | 7 | #include <linux/module.h> |
| 8 | #include <linux/fs.h> | ||
| 9 | #include <linux/debugfs.h> | ||
| 10 | #include <linux/kallsyms.h> | 8 | #include <linux/kallsyms.h> |
| 11 | #include <linux/uaccess.h> | 9 | #include <linux/uaccess.h> |
| 12 | #include <linux/ftrace.h> | 10 | #include <linux/ftrace.h> |
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 8fb84b362816..d6e1003724e9 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
| @@ -10,8 +10,6 @@ | |||
| 10 | * Copyright (C) 2004 Nadia Yvette Chambers | 10 | * Copyright (C) 2004 Nadia Yvette Chambers |
| 11 | */ | 11 | */ |
| 12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
| 13 | #include <linux/fs.h> | ||
| 14 | #include <linux/debugfs.h> | ||
| 15 | #include <linux/kallsyms.h> | 13 | #include <linux/kallsyms.h> |
| 16 | #include <linux/uaccess.h> | 14 | #include <linux/uaccess.h> |
| 17 | #include <linux/ftrace.h> | 15 | #include <linux/ftrace.h> |
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index f8b45d8792f9..e694c9f9efa4 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c | |||
| @@ -120,7 +120,7 @@ void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, | |||
| 120 | 120 | ||
| 121 | __trace_seq_init(s); | 121 | __trace_seq_init(s); |
| 122 | 122 | ||
| 123 | seq_buf_bitmask(&s->seq, maskp, nmaskbits); | 123 | seq_buf_printf(&s->seq, "%*pb", nmaskbits, maskp); |
| 124 | 124 | ||
| 125 | if (unlikely(seq_buf_has_overflowed(&s->seq))) { | 125 | if (unlikely(seq_buf_has_overflowed(&s->seq))) { |
| 126 | s->seq.len = save_len; | 126 | s->seq.len = save_len; |
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 16eddb308c33..c3e4fcfddd45 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
| @@ -7,12 +7,10 @@ | |||
| 7 | #include <linux/seq_file.h> | 7 | #include <linux/seq_file.h> |
| 8 | #include <linux/spinlock.h> | 8 | #include <linux/spinlock.h> |
| 9 | #include <linux/uaccess.h> | 9 | #include <linux/uaccess.h> |
| 10 | #include <linux/debugfs.h> | ||
| 11 | #include <linux/ftrace.h> | 10 | #include <linux/ftrace.h> |
| 12 | #include <linux/module.h> | 11 | #include <linux/module.h> |
| 13 | #include <linux/sysctl.h> | 12 | #include <linux/sysctl.h> |
| 14 | #include <linux/init.h> | 13 | #include <linux/init.h> |
| 15 | #include <linux/fs.h> | ||
| 16 | 14 | ||
| 17 | #include <asm/setup.h> | 15 | #include <asm/setup.h> |
| 18 | 16 | ||
| @@ -462,7 +460,7 @@ static __init int stack_trace_init(void) | |||
| 462 | struct dentry *d_tracer; | 460 | struct dentry *d_tracer; |
| 463 | 461 | ||
| 464 | d_tracer = tracing_init_dentry(); | 462 | d_tracer = tracing_init_dentry(); |
| 465 | if (!d_tracer) | 463 | if (IS_ERR(d_tracer)) |
| 466 | return 0; | 464 | return 0; |
| 467 | 465 | ||
| 468 | trace_create_file("stack_max_size", 0644, d_tracer, | 466 | trace_create_file("stack_max_size", 0644, d_tracer, |
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 7af67360b330..75e19e86c954 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c | |||
| @@ -276,7 +276,7 @@ static int tracing_stat_init(void) | |||
| 276 | struct dentry *d_tracing; | 276 | struct dentry *d_tracing; |
| 277 | 277 | ||
| 278 | d_tracing = tracing_init_dentry(); | 278 | d_tracing = tracing_init_dentry(); |
| 279 | if (!d_tracing) | 279 | if (IS_ERR(d_tracing)) |
| 280 | return 0; | 280 | return 0; |
| 281 | 281 | ||
| 282 | stat_dir = debugfs_create_dir("trace_stat", d_tracing); | 282 | stat_dir = debugfs_create_dir("trace_stat", d_tracing); |
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index c6ee36fcbf90..f97f6e3a676c 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
| @@ -574,7 +574,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
| 574 | size -= sizeof(u32); | 574 | size -= sizeof(u32); |
| 575 | 575 | ||
| 576 | rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, | 576 | rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, |
| 577 | sys_data->enter_event->event.type, regs, &rctx); | 577 | sys_data->enter_event->event.type, NULL, &rctx); |
| 578 | if (!rec) | 578 | if (!rec) |
| 579 | return; | 579 | return; |
| 580 | 580 | ||
| @@ -647,7 +647,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
| 647 | size -= sizeof(u32); | 647 | size -= sizeof(u32); |
| 648 | 648 | ||
| 649 | rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, | 649 | rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, |
| 650 | sys_data->exit_event->event.type, regs, &rctx); | 650 | sys_data->exit_event->event.type, NULL, &rctx); |
| 651 | if (!rec) | 651 | if (!rec) |
| 652 | return; | 652 | return; |
| 653 | 653 | ||
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8520acc34b18..7dc1c8abecd6 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
| @@ -1111,7 +1111,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, | |||
| 1111 | if (hlist_empty(head)) | 1111 | if (hlist_empty(head)) |
| 1112 | goto out; | 1112 | goto out; |
| 1113 | 1113 | ||
| 1114 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); | 1114 | entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); |
| 1115 | if (!entry) | 1115 | if (!entry) |
| 1116 | goto out; | 1116 | goto out; |
| 1117 | 1117 | ||
| @@ -1321,7 +1321,7 @@ static __init int init_uprobe_trace(void) | |||
| 1321 | struct dentry *d_tracer; | 1321 | struct dentry *d_tracer; |
| 1322 | 1322 | ||
| 1323 | d_tracer = tracing_init_dentry(); | 1323 | d_tracer = tracing_init_dentry(); |
| 1324 | if (!d_tracer) | 1324 | if (IS_ERR(d_tracer)) |
| 1325 | return 0; | 1325 | return 0; |
| 1326 | 1326 | ||
| 1327 | trace_create_file("uprobe_events", 0644, d_tracer, | 1327 | trace_create_file("uprobe_events", 0644, d_tracer, |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 70bf11815f84..3174bf8e3538 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
| @@ -154,7 +154,7 @@ static int get_softlockup_thresh(void) | |||
| 154 | */ | 154 | */ |
| 155 | static unsigned long get_timestamp(void) | 155 | static unsigned long get_timestamp(void) |
| 156 | { | 156 | { |
| 157 | return local_clock() >> 30LL; /* 2^30 ~= 10^9 */ | 157 | return running_clock() >> 30LL; /* 2^30 ~= 10^9 */ |
| 158 | } | 158 | } |
| 159 | 159 | ||
| 160 | static void set_sample_period(void) | 160 | static void set_sample_period(void) |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index beeeac9e0e3e..f28849394791 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -3083,10 +3083,9 @@ static ssize_t wq_cpumask_show(struct device *dev, | |||
| 3083 | int written; | 3083 | int written; |
| 3084 | 3084 | ||
| 3085 | mutex_lock(&wq->mutex); | 3085 | mutex_lock(&wq->mutex); |
| 3086 | written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask); | 3086 | written = scnprintf(buf, PAGE_SIZE, "%*pb\n", |
| 3087 | cpumask_pr_args(wq->unbound_attrs->cpumask)); | ||
| 3087 | mutex_unlock(&wq->mutex); | 3088 | mutex_unlock(&wq->mutex); |
| 3088 | |||
| 3089 | written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); | ||
| 3090 | return written; | 3089 | return written; |
| 3091 | } | 3090 | } |
| 3092 | 3091 | ||
