Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 5
-rw-r--r--  kernel/audit.c | 43
-rw-r--r--  kernel/audit.h | 2
-rw-r--r--  kernel/audit_tree.c | 2
-rw-r--r--  kernel/audit_watch.c | 2
-rw-r--r--  kernel/auditfilter.c | 10
-rw-r--r--  kernel/cgroup.c | 71
-rw-r--r--  kernel/compat.c | 100
-rw-r--r--  kernel/cpu/Makefile | 1
-rw-r--r--  kernel/cpuset.c | 10
-rw-r--r--  kernel/debug/debug_core.c | 2
-rw-r--r--  kernel/events/core.c | 51
-rw-r--r--  kernel/extable.c | 2
-rw-r--r--  kernel/fork.c | 1
-rw-r--r--  kernel/futex.c | 90
-rw-r--r--  kernel/irq/irqdomain.c | 1
-rw-r--r--  kernel/irq/manage.c | 3
-rw-r--r--  kernel/irq_work.c | 6
-rw-r--r--  kernel/kexec.c | 8
-rw-r--r--  kernel/ksysfs.c | 2
-rw-r--r--  kernel/locking/Makefile | 3
-rw-r--r--  kernel/locking/lockdep.c | 23
-rw-r--r--  kernel/locking/locktorture.c | 452
-rw-r--r--  kernel/locking/mcs_spinlock.c | 178
-rw-r--r--  kernel/locking/mcs_spinlock.h | 129
-rw-r--r--  kernel/locking/mutex-debug.c | 6
-rw-r--r--  kernel/locking/mutex.c | 104
-rw-r--r--  kernel/locking/rtmutex.c | 12
-rw-r--r--  kernel/locking/rwsem-xadd.c | 4
-rw-r--r--  kernel/module.c | 6
-rw-r--r--  kernel/notifier.c | 2
-rw-r--r--  kernel/panic.c | 4
-rw-r--r--  kernel/power/console.c | 1
-rw-r--r--  kernel/printk/printk.c | 2
-rw-r--r--  kernel/profile.c | 4
-rw-r--r--  kernel/ptrace.c | 4
-rw-r--r--  kernel/rcu/Makefile | 2
-rw-r--r--  kernel/rcu/rcu.h | 7
-rw-r--r--  kernel/rcu/rcutorture.c (renamed from kernel/rcu/torture.c) | 1004
-rw-r--r--  kernel/rcu/srcu.c | 11
-rw-r--r--  kernel/rcu/tiny.c | 8
-rw-r--r--  kernel/rcu/tiny_plugin.h | 4
-rw-r--r--  kernel/rcu/tree.c | 80
-rw-r--r--  kernel/rcu/tree.h | 4
-rw-r--r--  kernel/rcu/tree_plugin.h | 19
-rw-r--r--  kernel/rcu/tree_trace.c | 6
-rw-r--r--  kernel/rcu/update.c | 5
-rw-r--r--  kernel/sched/Makefile | 2
-rw-r--r--  kernel/sched/auto_group.c | 2
-rw-r--r--  kernel/sched/clock.c | 4
-rw-r--r--  kernel/sched/core.c | 248
-rw-r--r--  kernel/sched/cpudeadline.c | 6
-rw-r--r--  kernel/sched/cputime.c | 4
-rw-r--r--  kernel/sched/deadline.c | 76
-rw-r--r--  kernel/sched/debug.c | 7
-rw-r--r--  kernel/sched/fair.c | 610
-rw-r--r--  kernel/sched/idle.c (renamed from kernel/cpu/idle.c) | 7
-rw-r--r--  kernel/sched/idle_task.c | 25
-rw-r--r--  kernel/sched/rt.c | 110
-rw-r--r--  kernel/sched/sched.h | 66
-rw-r--r--  kernel/sched/stop_task.c | 15
-rw-r--r--  kernel/stop_machine.c | 2
-rw-r--r--  kernel/sys.c | 8
-rw-r--r--  kernel/sysctl.c | 7
-rw-r--r--  kernel/time/sched_clock.c | 46
-rw-r--r--  kernel/time/timekeeping.c | 3
-rw-r--r--  kernel/timer.c | 2
-rw-r--r--  kernel/torture.c | 719
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 6
-rw-r--r--  kernel/trace/trace.c | 27
-rw-r--r--  kernel/trace/trace_event_perf.c | 22
-rw-r--r--  kernel/trace/trace_events.c | 16
-rw-r--r--  kernel/trace/trace_export.c | 7
-rw-r--r--  kernel/trace/trace_irqsoff.c | 4
-rw-r--r--  kernel/tracepoint.c | 7
-rw-r--r--  kernel/user_namespace.c | 2
-rw-r--r--  kernel/workqueue.c | 9
77 files changed, 3062 insertions, 1503 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index bc010ee272b6..f2a8b6246ce9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -18,11 +18,13 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_irq_work.o = -pg
 endif
 
+# cond_syscall is currently not LTO compatible
+CFLAGS_sys_ni.o = $(DISABLE_LTO)
+
 obj-y += sched/
 obj-y += locking/
 obj-y += power/
 obj-y += printk/
-obj-y += cpu/
 obj-y += irq/
 obj-y += rcu/
 
@@ -93,6 +95,7 @@ obj-$(CONFIG_PADATA) += padata.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
+obj-$(CONFIG_TORTURE_TEST) += torture.o
 
 $(obj)/configs.o: $(obj)/config_data.h
 
diff --git a/kernel/audit.c b/kernel/audit.c
index 34c5a2310fbf..95a20f3f52f1 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -182,7 +182,7 @@ struct audit_buffer {
 
 struct audit_reply {
 	__u32 portid;
-	pid_t pid;
+	struct net *net;
 	struct sk_buff *skb;
 };
 
@@ -500,7 +500,7 @@ int audit_send_list(void *_dest)
 {
 	struct audit_netlink_list *dest = _dest;
 	struct sk_buff *skb;
-	struct net *net = get_net_ns_by_pid(dest->pid);
+	struct net *net = dest->net;
 	struct audit_net *aunet = net_generic(net, audit_net_id);
 
 	/* wait for parent to finish and send an ACK */
@@ -510,6 +510,7 @@ int audit_send_list(void *_dest)
 	while ((skb = __skb_dequeue(&dest->q)) != NULL)
 		netlink_unicast(aunet->nlsk, skb, dest->portid, 0);
 
+	put_net(net);
 	kfree(dest);
 
 	return 0;
@@ -543,7 +544,7 @@ out_kfree_skb:
 static int audit_send_reply_thread(void *arg)
 {
 	struct audit_reply *reply = (struct audit_reply *)arg;
-	struct net *net = get_net_ns_by_pid(reply->pid);
+	struct net *net = reply->net;
 	struct audit_net *aunet = net_generic(net, audit_net_id);
 
 	mutex_lock(&audit_cmd_mutex);
@@ -552,12 +553,13 @@ static int audit_send_reply_thread(void *arg)
 	/* Ignore failure. It'll only happen if the sender goes away,
 	   because our timeout is set to infinite. */
 	netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0);
+	put_net(net);
 	kfree(reply);
 	return 0;
 }
 /**
  * audit_send_reply - send an audit reply message via netlink
- * @portid: netlink port to which to send reply
+ * @request_skb: skb of request we are replying to (used to target the reply)
  * @seq: sequence number
  * @type: audit message type
  * @done: done (last) flag
@@ -568,9 +570,11 @@ static int audit_send_reply_thread(void *arg)
  * Allocates an skb, builds the netlink message, and sends it to the port id.
  * No failure notifications.
  */
-static void audit_send_reply(__u32 portid, int seq, int type, int done,
+static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
 			     int multi, const void *payload, int size)
 {
+	u32 portid = NETLINK_CB(request_skb).portid;
+	struct net *net = sock_net(NETLINK_CB(request_skb).sk);
 	struct sk_buff *skb;
 	struct task_struct *tsk;
 	struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
@@ -583,8 +587,8 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done,
 	if (!skb)
 		goto out;
 
+	reply->net = get_net(net);
 	reply->portid = portid;
-	reply->pid = task_pid_vnr(current);
 	reply->skb = skb;
 
 	tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
@@ -604,9 +608,19 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
 	int err = 0;
 
 	/* Only support the initial namespaces for now. */
+	/*
+	 * We return ECONNREFUSED because it tricks userspace into thinking
+	 * that audit was not configured into the kernel. Lots of users
+	 * configure their PAM stack (because that's what the distro does)
+	 * to reject login if unable to send messages to audit. If we return
+	 * ECONNREFUSED the PAM stack thinks the kernel does not have audit
+	 * configured in and will let login proceed. If we return EPERM
+	 * userspace will reject all logins. This should be removed when we
+	 * support non init namespaces!!
+	 */
 	if ((current_user_ns() != &init_user_ns) ||
 	    (task_active_pid_ns(current) != &init_pid_ns))
-		return -EPERM;
+		return -ECONNREFUSED;
 
 	switch (msg_type) {
 	case AUDIT_LIST:
@@ -673,8 +687,7 @@ static int audit_get_feature(struct sk_buff *skb)
 
 	seq = nlmsg_hdr(skb)->nlmsg_seq;
 
-	audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
-			 &af, sizeof(af));
+	audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af));
 
 	return 0;
 }
@@ -794,8 +807,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		s.backlog = skb_queue_len(&audit_skb_queue);
 		s.version = AUDIT_VERSION_LATEST;
 		s.backlog_wait_time = audit_backlog_wait_time;
-		audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
-				 &s, sizeof(s));
+		audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
 		break;
 	}
 	case AUDIT_SET: {
@@ -905,7 +917,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 				   seq, data, nlmsg_len(nlh));
 		break;
 	case AUDIT_LIST_RULES:
-		err = audit_list_rules_send(NETLINK_CB(skb).portid, seq);
+		err = audit_list_rules_send(skb, seq);
 		break;
 	case AUDIT_TRIM:
 		audit_trim_trees();
@@ -970,8 +982,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			memcpy(sig_data->ctx, ctx, len);
 			security_release_secctx(ctx, len);
 		}
-		audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO,
-				 0, 0, sig_data, sizeof(*sig_data) + len);
+		audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
+				 sig_data, sizeof(*sig_data) + len);
 		kfree(sig_data);
 		break;
 	case AUDIT_TTY_GET: {
@@ -983,8 +995,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		s.log_passwd = tsk->signal->audit_tty_log_passwd;
 		spin_unlock(&tsk->sighand->siglock);
 
-		audit_send_reply(NETLINK_CB(skb).portid, seq,
-				 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
+		audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
 		break;
 	}
 	case AUDIT_TTY_SET: {
diff --git a/kernel/audit.h b/kernel/audit.h
index 57cc64d67718..8df132214606 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -247,7 +247,7 @@ extern void audit_panic(const char *message);
 
 struct audit_netlink_list {
 	__u32 portid;
-	pid_t pid;
+	struct net *net;
 	struct sk_buff_head q;
 };
 
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 67ccf0e7cca9..135944a7b28a 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -916,7 +916,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group,
 				   struct fsnotify_mark *inode_mark,
 				   struct fsnotify_mark *vfsmount_mark,
 				   u32 mask, void *data, int data_type,
-				   const unsigned char *file_name)
+				   const unsigned char *file_name, u32 cookie)
 {
 	return 0;
 }
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 2596fac5dcb4..70b4554d2fbe 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -471,7 +471,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
 				    struct fsnotify_mark *inode_mark,
 				    struct fsnotify_mark *vfsmount_mark,
 				    u32 mask, void *data, int data_type,
-				    const unsigned char *dname)
+				    const unsigned char *dname, u32 cookie)
 {
 	struct inode *inode;
 	struct audit_parent *parent;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 14a78cca384e..92062fd6cc8c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -29,6 +29,8 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/security.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
 #include "audit.h"
 
 /*
@@ -1065,11 +1067,13 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data,
 
 /**
 * audit_list_rules_send - list the audit rules
- * @portid: target portid for netlink audit messages
+ * @request_skb: skb of request we are replying to (used to target the reply)
 * @seq: netlink audit message sequence (serial) number
 */
-int audit_list_rules_send(__u32 portid, int seq)
+int audit_list_rules_send(struct sk_buff *request_skb, int seq)
 {
+	u32 portid = NETLINK_CB(request_skb).portid;
+	struct net *net = sock_net(NETLINK_CB(request_skb).sk);
 	struct task_struct *tsk;
 	struct audit_netlink_list *dest;
 	int err = 0;
@@ -1083,8 +1087,8 @@ int audit_list_rules_send(__u32 portid, int seq)
 	dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
 	if (!dest)
 		return -ENOMEM;
+	dest->net = get_net(net);
 	dest->portid = portid;
-	dest->pid = task_pid_vnr(current);
 	skb_queue_head_init(&dest->q);
 
 	mutex_lock(&audit_filter_mutex);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e2f46ba37f72..0c753ddd223b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -886,7 +886,9 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	 * per-subsystem and moved to css->id so that lookups are
 	 * successful until the target css is released.
 	 */
+	mutex_lock(&cgroup_mutex);
 	idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+	mutex_unlock(&cgroup_mutex);
 	cgrp->id = -1;
 
 	call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
@@ -1566,10 +1568,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	mutex_lock(&cgroup_mutex);
 	mutex_lock(&cgroup_root_mutex);
 
-	root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
-				  0, 1, GFP_KERNEL);
-	if (root_cgrp->id < 0)
+	ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
+	if (ret < 0)
 		goto unlock_drop;
+	root_cgrp->id = ret;
 
 	/* Check for name clashes with existing mounts */
 	ret = -EBUSY;
@@ -2763,10 +2765,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
 	 */
 	update_before = cgroup_serial_nr_next;
 
-	mutex_unlock(&cgroup_mutex);
-
 	/* add/rm files for all cgroups created before */
-	rcu_read_lock();
 	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
 		struct cgroup *cgrp = css->cgroup;
 
@@ -2775,23 +2774,19 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
 
 		inode = cgrp->dentry->d_inode;
 		dget(cgrp->dentry);
-		rcu_read_unlock();
-
 		dput(prev);
 		prev = cgrp->dentry;
 
+		mutex_unlock(&cgroup_mutex);
 		mutex_lock(&inode->i_mutex);
 		mutex_lock(&cgroup_mutex);
 		if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
 			ret = cgroup_addrm_files(cgrp, cfts, is_add);
-		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);
-
-		rcu_read_lock();
 		if (ret)
 			break;
 	}
-	rcu_read_unlock();
+	mutex_unlock(&cgroup_mutex);
 	dput(prev);
 	deactivate_super(sb);
 	return ret;
@@ -2910,9 +2905,14 @@ static void cgroup_enable_task_cg_lists(void)
 		 * We should check if the process is exiting, otherwise
 		 * it will race with cgroup_exit() in that the list
 		 * entry won't be deleted though the process has exited.
+		 * Do it while holding siglock so that we don't end up
+		 * racing against cgroup_exit().
 		 */
+		spin_lock_irq(&p->sighand->siglock);
 		if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
 			list_add(&p->cg_list, &task_css_set(p)->tasks);
+		spin_unlock_irq(&p->sighand->siglock);
+
 		task_unlock(p);
 	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
@@ -4112,17 +4112,17 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
 
 	err = percpu_ref_init(&css->refcnt, css_release);
 	if (err)
-		goto err_free;
+		goto err_free_css;
 
 	init_css(css, ss, cgrp);
 
 	err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id);
 	if (err)
-		goto err_free;
+		goto err_free_percpu_ref;
 
 	err = online_css(css);
 	if (err)
-		goto err_free;
+		goto err_clear_dir;
 
 	dget(cgrp->dentry);
 	css_get(css->parent);
@@ -4138,8 +4138,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
 
 	return 0;
 
-err_free:
+err_clear_dir:
+	cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);
+err_free_percpu_ref:
 	percpu_ref_cancel_init(&css->refcnt);
+err_free_css:
 	ss->css_free(css);
 	return err;
 }
@@ -4158,7 +4161,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	struct cgroup *cgrp;
 	struct cgroup_name *name;
 	struct cgroupfs_root *root = parent->root;
-	int ssid, err = 0;
+	int ssid, err;
 	struct cgroup_subsys *ss;
 	struct super_block *sb = root->sb;
 
@@ -4168,19 +4171,13 @@
 		return -ENOMEM;
 
 	name = cgroup_alloc_name(dentry);
-	if (!name)
+	if (!name) {
+		err = -ENOMEM;
 		goto err_free_cgrp;
+	}
 	rcu_assign_pointer(cgrp->name, name);
 
 	/*
-	 * Temporarily set the pointer to NULL, so idr_find() won't return
-	 * a half-baked cgroup.
-	 */
-	cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
-	if (cgrp->id < 0)
-		goto err_free_name;
-
-	/*
 	 * Only live parents can have children. Note that the liveliness
 	 * check isn't strictly necessary because cgroup_mkdir() and
 	 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
@@ -4189,7 +4186,17 @@
 	 */
 	if (!cgroup_lock_live_group(parent)) {
 		err = -ENODEV;
-		goto err_free_id;
+		goto err_free_name;
+	}
+
+	/*
+	 * Temporarily set the pointer to NULL, so idr_find() won't return
+	 * a half-baked cgroup.
+	 */
+	cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
+	if (cgrp->id < 0) {
+		err = -ENOMEM;
+		goto err_unlock;
 	}
 
 	/* Grab a reference on the superblock so the hierarchy doesn't
@@ -4221,7 +4228,7 @@
 	 */
 	err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
 	if (err < 0)
-		goto err_unlock;
+		goto err_free_id;
 	lockdep_assert_held(&dentry->d_inode->i_mutex);
 
 	cgrp->serial_nr = cgroup_serial_nr_next++;
@@ -4257,12 +4264,12 @@
 
 	return 0;
 
-err_unlock:
-	mutex_unlock(&cgroup_mutex);
-	/* Release the reference count that we took on the superblock */
-	deactivate_super(sb);
 err_free_id:
 	idr_remove(&root->cgroup_idr, cgrp->id);
+	/* Release the reference count that we took on the superblock */
+	deactivate_super(sb);
+err_unlock:
+	mutex_unlock(&cgroup_mutex);
 err_free_name:
 	kfree(rcu_dereference_raw(cgrp->name));
 err_free_cgrp:
diff --git a/kernel/compat.c b/kernel/compat.c
index 0a09e481b70b..488ff8c4cf48 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -110,8 +110,8 @@ static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
 	return 0;
 }
 
-asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
-		struct timezone __user *tz)
+COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv,
+		       struct timezone __user *, tz)
 {
 	if (tv) {
 		struct timeval ktv;
@@ -127,8 +127,8 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
 	return 0;
 }
 
-asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
-		struct timezone __user *tz)
+COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
+		       struct timezone __user *, tz)
 {
 	struct timespec kts;
 	struct timezone ktz;
@@ -236,8 +236,8 @@ static long compat_nanosleep_restart(struct restart_block *restart)
 	return ret;
 }
 
-asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
-		struct compat_timespec __user *rmtp)
+COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
+		       struct compat_timespec __user *, rmtp)
 {
 	struct timespec tu, rmt;
 	mm_segment_t oldfs;
@@ -328,7 +328,7 @@ static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
 	return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
 }
 
-asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
+COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
 {
 	if (tbuf) {
 		struct tms tms;
@@ -354,7 +354,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
 * types that can be passed to put_user()/get_user().
 */
 
-asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
+COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set)
 {
 	old_sigset_t s;
 	long ret;
@@ -424,8 +424,8 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
 
 #endif
 
-asmlinkage long compat_sys_setrlimit(unsigned int resource,
-		struct compat_rlimit __user *rlim)
+COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
+		       struct compat_rlimit __user *, rlim)
 {
 	struct rlimit r;
 
@@ -443,8 +443,8 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
 
 #ifdef COMPAT_RLIM_OLD_INFINITY
 
-asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
-		struct compat_rlimit __user *rlim)
+COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
+		       struct compat_rlimit __user *, rlim)
 {
 	struct rlimit r;
 	int ret;
@@ -470,8 +470,8 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
 
 #endif
 
-asmlinkage long compat_sys_getrlimit(unsigned int resource,
-		struct compat_rlimit __user *rlim)
+COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
+		       struct compat_rlimit __user *, rlim)
 {
 	struct rlimit r;
 	int ret;
@@ -596,9 +596,9 @@ static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
 	return compat_get_bitmap(k, user_mask_ptr, len * 8);
 }
 
-asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
-					     unsigned int len,
-					     compat_ulong_t __user *user_mask_ptr)
+COMPAT_SYSCALL_DEFINE3(sched_setaffinity, compat_pid_t, pid,
+		       unsigned int, len,
+		       compat_ulong_t __user *, user_mask_ptr)
 {
 	cpumask_var_t new_mask;
 	int retval;
@@ -616,8 +616,8 @@ out:
 	return retval;
 }
 
-asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
-					     compat_ulong_t __user *user_mask_ptr)
+COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
+		       compat_ulong_t __user *, user_mask_ptr)
 {
 	int ret;
 	cpumask_var_t mask;
@@ -662,9 +662,9 @@ int put_compat_itimerspec(struct compat_itimerspec __user *dst,
 	return 0;
 }
 
-long compat_sys_timer_create(clockid_t which_clock,
-			struct compat_sigevent __user *timer_event_spec,
-			timer_t __user *created_timer_id)
+COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
+		       struct compat_sigevent __user *, timer_event_spec,
+		       timer_t __user *, created_timer_id)
 {
 	struct sigevent __user *event = NULL;
 
@@ -680,9 +680,9 @@ long compat_sys_timer_create(clockid_t which_clock,
 	return sys_timer_create(which_clock, event, created_timer_id);
 }
 
-long compat_sys_timer_settime(timer_t timer_id, int flags,
-			struct compat_itimerspec __user *new,
-			struct compat_itimerspec __user *old)
+COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
+		       struct compat_itimerspec __user *, new,
+		       struct compat_itimerspec __user *, old)
 {
 	long err;
 	mm_segment_t oldfs;
@@ -703,8 +703,8 @@ long compat_sys_timer_settime(timer_t timer_id, int flags,
 	return err;
 }
 
-long compat_sys_timer_gettime(timer_t timer_id,
-		struct compat_itimerspec __user *setting)
+COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
+		       struct compat_itimerspec __user *, setting)
 {
 	long err;
 	mm_segment_t oldfs;
@@ -720,8 +720,8 @@ long compat_sys_timer_gettime(timer_t timer_id,
 	return err;
 }
 
-long compat_sys_clock_settime(clockid_t which_clock,
-		struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
+		       struct compat_timespec __user *, tp)
 {
 	long err;
 	mm_segment_t oldfs;
@@ -737,8 +737,8 @@ long compat_sys_clock_settime(clockid_t which_clock,
 	return err;
 }
 
-long compat_sys_clock_gettime(clockid_t which_clock,
-		struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
+		       struct compat_timespec __user *, tp)
 {
 	long err;
 	mm_segment_t oldfs;
@@ -754,8 +754,8 @@ long compat_sys_clock_gettime(clockid_t which_clock,
 	return err;
 }
 
-long compat_sys_clock_adjtime(clockid_t which_clock,
-		struct compat_timex __user *utp)
+COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
+		       struct compat_timex __user *, utp)
 {
 	struct timex txc;
 	mm_segment_t oldfs;
@@ -777,8 +777,8 @@ long compat_sys_clock_adjtime(clockid_t which_clock,
 	return ret;
 }
 
-long compat_sys_clock_getres(clockid_t which_clock,
-		struct compat_timespec __user *tp)
+COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
+		       struct compat_timespec __user *, tp)
 {
 	long err;
 	mm_segment_t oldfs;
@@ -818,9 +818,9 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
 	return err;
 }
 
-long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
-			struct compat_timespec __user *rqtp,
-			struct compat_timespec __user *rmtp)
+COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
+		       struct compat_timespec __user *, rqtp,
+		       struct compat_timespec __user *, rmtp)
 {
 	long err;
 	mm_segment_t oldfs;
@@ -1010,7 +1010,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
 
 /* compat_time_t is a 32 bit "long" and needs to get converted. */
 
-asmlinkage long compat_sys_time(compat_time_t __user * tloc)
+COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc)
 {
 	compat_time_t i;
 	struct timeval tv;
@@ -1026,7 +1026,7 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc)
 	return i;
 }
 
-asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
+COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
 {
 	struct timespec tv;
 	int err;
@@ -1046,7 +1046,7 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
 
 #endif /* __ARCH_WANT_COMPAT_SYS_TIME */
 
-asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
+COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp)
 {
 	struct timex txc;
 	int err, ret;
@@ -1065,11 +1065,11 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
 }
 
 #ifdef CONFIG_NUMA
-asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
-		compat_uptr_t __user *pages32,
-		const int __user *nodes,
-		int __user *status,
-		int flags)
+COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
+		       compat_uptr_t __user *, pages32,
+		       const int __user *, nodes,
+		       int __user *, status,
+		       int, flags)
 {
 	const void __user * __user *pages;
 	int i;
@@ -1085,10 +1085,10 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
 	return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
 }
 
-asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
-		compat_ulong_t maxnode,
-		const compat_ulong_t __user *old_nodes,
-		const compat_ulong_t __user *new_nodes)
+COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
+		       compat_ulong_t, maxnode,
+		       const compat_ulong_t __user *, old_nodes,
+		       const compat_ulong_t __user *, new_nodes)
 {
 	unsigned long __user *old = NULL;
 	unsigned long __user *new = NULL;
diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile
deleted file mode 100644
index 59ab052ef7a0..000000000000
--- a/kernel/cpu/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-y = idle.o
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4410ac6a55f1..e6b1b66afe52 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -974,12 +974,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 * Temporarilly set tasks mems_allowed to target nodes of migration,
 * so that the migration code can allocate pages on these nodes.
 *
- * Call holding cpuset_mutex, so current's cpuset won't change
- * during this call, as manage_mutex holds off any cpuset_attach()
- * calls. Therefore we don't need to take task_lock around the
- * call to guarantee_online_mems(), as we know no one is changing
- * our task's cpuset.
- *
 * While the mm_struct we are migrating is typically from some
 * other task, the task_struct mems_allowed that we are hacking
 * is for our current task, which must allocate new pages for that
@@ -996,8 +990,10 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 
 	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
 
+	rcu_read_lock();
 	mems_cs = effective_nodemask_cpuset(task_cs(tsk));
 	guarantee_online_mems(mems_cs, &tsk->mems_allowed);
+	rcu_read_unlock();
 }
 
 /*
@@ -2486,9 +2482,9 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
 
 	task_lock(current);
 	cs = nearest_hardwall_ancestor(task_cs(current));
+	allowed = node_isset(node, cs->mems_allowed);
 	task_unlock(current);
 
-	allowed = node_isset(node, cs->mems_allowed);
 	mutex_unlock(&callback_mutex);
 	return allowed;
 }
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 334b3980ffc1..99982a70ddad 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -1035,7 +1035,7 @@ int dbg_io_get_char(void)
 * otherwise as a quick means to stop program execution and "break" into
 * the debugger.
 */
-void kgdb_breakpoint(void)
+noinline void kgdb_breakpoint(void)
 {
 	atomic_inc(&kgdb_setting_breakpoint);
 	wmb(); /* Sync point before breakpoint */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 56003c6edfd3..661951ab8ae7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -231,11 +231,29 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 #define NR_ACCUMULATED_SAMPLES 128
 static DEFINE_PER_CPU(u64, running_sample_length);
 
-void perf_sample_event_took(u64 sample_len_ns)
+static void perf_duration_warn(struct irq_work *w)
 {
+	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
 	u64 avg_local_sample_len;
 	u64 local_samples_len;
+
+	local_samples_len = __get_cpu_var(running_sample_length);
+	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+	printk_ratelimited(KERN_WARNING
+			"perf interrupt took too long (%lld > %lld), lowering "
+			"kernel.perf_event_max_sample_rate to %d\n",
+			avg_local_sample_len, allowed_ns >> 1,
+			sysctl_perf_event_sample_rate);
+}
+
+static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
+
+void perf_sample_event_took(u64 sample_len_ns)
+{
 	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
+	u64 avg_local_sample_len;
+	u64 local_samples_len;
 
 	if (allowed_ns == 0)
 		return;
@@ -263,13 +281,14 @@ void perf_sample_event_took(u64 sample_len_ns)
 	sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
 	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 
-	printk_ratelimited(KERN_WARNING
-			"perf samples too long (%lld > %lld), lowering "
-			"kernel.perf_event_max_sample_rate to %d\n",
-			avg_local_sample_len, allowed_ns,
-			sysctl_perf_event_sample_rate);
-
 	update_perf_cpu_limits();
+
+	if (!irq_work_queue(&perf_duration_work)) {
+		early_printk("perf interrupt took too long (%lld > %lld), lowering "
+			     "kernel.perf_event_max_sample_rate to %d\n",
+			     avg_local_sample_len, allowed_ns >> 1,
+			     sysctl_perf_event_sample_rate);
+	}
 }
 
 static atomic64_t perf_event_id;
@@ -1714,7 +1733,7 @@ group_sched_in(struct perf_event *group_event,
 	       struct perf_event_context *ctx)
 {
 	struct perf_event *event, *partial_group = NULL;
-	struct pmu *pmu = group_event->pmu;
+	struct pmu *pmu = ctx->pmu;
 	u64 now = ctx->time;
 	bool simulate = false;
 
@@ -2563,8 +2582,6 @@ static void perf_branch_stack_sched_in(struct task_struct *prev,
 	if (cpuctx->ctx.nr_branch_stack > 0
 	    && pmu->flush_branch_stack) {
 
-		pmu = cpuctx->ctx.pmu;
-
 		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 
 		perf_pmu_disable(pmu);
@@ -6294,7 +6311,7 @@ static int perf_event_idx_default(struct perf_event *event)
 * Ensures all contexts with the same task_ctx_nr have the same
 * pmu_cpu_context too.
 */
-static void *find_pmu_context(int ctxn)
+static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
 {
 	struct pmu *pmu;
 
@@ -7856,14 +7873,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
 static void __perf_event_exit_context(void *__info)
 {
 	struct perf_event_context *ctx = __info;
-	struct perf_event *event, *tmp;
+	struct perf_event *event;
 
 	perf_pmu_rotate_stop(ctx->pmu);
 
-	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
-		__perf_remove_from_context(event);
-	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
+	rcu_read_lock();
+	list_for_each_entry_rcu(event, &ctx->event_list, event_entry)
 		__perf_remove_from_context(event);
+	rcu_read_unlock();
 }
 
 static void perf_event_exit_cpu_context(int cpu)
@@ -7887,11 +7904,11 @@ static void perf_event_exit_cpu(int cpu)
 {
 	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
+	perf_event_exit_cpu_context(cpu);
+
 	mutex_lock(&swhash->hlist_mutex);
 	swevent_hlist_release(swhash);
 	mutex_unlock(&swhash->hlist_mutex);
-
-	perf_event_exit_cpu_context(cpu);
 }
 #else
 static inline void perf_event_exit_cpu(int cpu) { }
diff --git a/kernel/extable.c b/kernel/extable.c
index 763faf037ec1..d8a6446adbcb 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -36,7 +36,7 @@ extern struct exception_table_entry __start___ex_table[];
 extern struct exception_table_entry __stop___ex_table[];
 
 /* Cleared by build time tools if the table is already sorted. */
-u32 __initdata main_extable_sort_needed = 1;
+u32 __initdata __visible main_extable_sort_needed = 1;
 
 /* Sort the kernel's built-in exception table */
 void __init sort_main_extable(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index a17621c6cd42..332688e5e7b4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -237,6 +237,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	task_numa_free(tsk);
 	security_task_free(tsk);
 	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
diff --git a/kernel/futex.c b/kernel/futex.c
index 44a1261cb9ff..67dacaf93e56 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -157,7 +157,9 @@
 * enqueue.
 */
 
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
 int __read_mostly futex_cmpxchg_enabled;
+#endif
 
 /*
 * Futex flags used to encode options to functions and preserve them across
@@ -234,6 +236,7 @@ static const struct futex_q futex_q_init = {
 * waiting on a futex.
 */
 struct futex_hash_bucket {
+	atomic_t waiters;
 	spinlock_t lock;
 	struct plist_head chain;
 } ____cacheline_aligned_in_smp;
@@ -253,22 +256,37 @@ static inline void futex_get_mm(union futex_key *key)
 	smp_mb__after_atomic_inc();
 }
 
-static inline bool hb_waiters_pending(struct futex_hash_bucket *hb)
+/*
+ * Reflects a new waiter being added to the waitqueue.
+ */
+static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
 {
 #ifdef CONFIG_SMP
+	atomic_inc(&hb->waiters);
 	/*
-	 * Tasks trying to enter the critical region are most likely
-	 * potential waiters that will be added to the plist. Ensure
-	 * that wakers won't miss to-be-slept tasks in the window between
-	 * the wait call and the actual plist_add.
+	 * Full barrier (A), see the ordering comment above.
 	 */
-	if (spin_is_locked(&hb->lock))
-		return true;
-	smp_rmb(); /* Make sure we check the lock state first */
+	smp_mb__after_atomic_inc();
+#endif
+}
+
+/*
+ * Reflects a waiter being removed from the waitqueue by wakeup
+ * paths.
+ */
+static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+	atomic_dec(&hb->waiters);
+#endif
+}
 
-	return !plist_head_empty(&hb->chain);
+static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+	return atomic_read(&hb->waiters);
 #else
-	return true;
+	return 1;
 #endif
 }
 
@@ -954,6 +972,7 @@ static void __unqueue_futex(struct futex_q *q)
 
 	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
 	plist_del(&q->list, &hb->chain);
+	hb_waiters_dec(hb);
 }
 
 /*
@@ -1257,7 +1276,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
 	 */
 	if (likely(&hb1->chain != &hb2->chain)) {
 		plist_del(&q->list, &hb1->chain);
+		hb_waiters_dec(hb1);
 		plist_add(&q->list, &hb2->chain);
+		hb_waiters_inc(hb2);
 		q->lock_ptr = &hb2->lock;
 	}
 	get_futex_key_refs(key2);
@@ -1600,6 +1621,17 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 	struct futex_hash_bucket *hb;
 
 	hb = hash_futex(&q->key);
+
+	/*
+	 * Increment the counter before taking the lock so that
+	 * a potential waker won't miss a to-be-slept task that is
+	 * waiting for the spinlock. This is safe as all queue_lock()
+	 * users end up calling queue_me(). Similarly, for housekeeping,
+	 * decrement the counter at queue_unlock() when some error has
+	 * occurred and we don't end up adding the task to the list.
+	 */
+	hb_waiters_inc(hb);
+
 	q->lock_ptr = &hb->lock;
 
 	spin_lock(&hb->lock); /* implies MB (A) */
@@ -1611,6 +1643,7 @@ queue_unlock(struct futex_hash_bucket *hb)
 	__releases(&hb->lock)
 {
 	spin_unlock(&hb->lock);
+	hb_waiters_dec(hb);
 }
 
 /**
@@ -2342,6 +2375,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 	 * Unqueue the futex_q and determine which it was.
 	 */
 	plist_del(&q->list, &hb->chain);
+	hb_waiters_dec(hb);
 
 	/* Handle spurious wakeups gracefully */
 	ret = -EWOULDBLOCK;
@@ -2843,9 +2877,28 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
 }
 
-static int __init futex_init(void)
+static void __init futex_detect_cmpxchg(void)
 {
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
 	u32 curval;
+
+	/*
+	 * This will fail and we want it. Some arch implementations do
+	 * runtime detection of the futex_atomic_cmpxchg_inatomic()
+	 * functionality. We want to know that before we call in any
+	 * of the complex code paths. Also we want to prevent
+	 * registration of robust lists in that case. NULL is
+	 * guaranteed to fault and we get -EFAULT on functional
+	 * implementation, the non-functional ones will return
+	 * -ENOSYS.
+	 */
+	if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
+		futex_cmpxchg_enabled = 1;
+#endif
+}
+
+static int __init futex_init(void)
+{
 	unsigned int futex_shift;
 	unsigned long i;
 
@@ -2861,20 +2914,11 @@ static int __init futex_init(void)
 					&futex_shift, NULL,
 					futex_hashsize, futex_hashsize);
 	futex_hashsize = 1UL << futex_shift;
-	/*
-	 * This will fail and we want it. Some arch implementations do
-	 * runtime detection of the futex_atomic_cmpxchg_inatomic()
-	 * functionality. We want to know that before we call in any
-	 * of the complex code paths. Also we want to prevent
-	 * registration of robust lists in that case. NULL is
-	 * guaranteed to fault and we get -EFAULT on functional
-	 * implementation, the non-functional ones will return
-	 * -ENOSYS.
-	 */
-	if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
-		futex_cmpxchg_enabled = 1;
+
+	futex_detect_cmpxchg();
 
 	for (i = 0; i < futex_hashsize; i++) {
+		atomic_set(&futex_queues[i].waiters, 0);
 		plist_head_init(&futex_queues[i].chain);
 		spin_lock_init(&futex_queues[i].lock);
 	}
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index cf68bb36fe58..f14033700c25 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -10,6 +10,7 @@
 #include <linux/mutex.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
+#include <linux/of_irq.h>
 #include <linux/topology.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 481a13c43b17..d3bf660cb57f 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -802,8 +802,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc,
 
 static void wake_threads_waitq(struct irq_desc *desc)
 {
-	if (atomic_dec_and_test(&desc->threads_active) &&
-	    waitqueue_active(&desc->wait_for_threads))
+	if (atomic_dec_and_test(&desc->threads_active))
 		wake_up(&desc->wait_for_threads);
 }
 
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 55fcce6065cf..a82170e2fa78 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -61,11 +61,11 @@ void __weak arch_irq_work_raise(void)
 *
 * Can be re-enqueued while the callback is still in progress.
 */
-void irq_work_queue(struct irq_work *work)
+bool irq_work_queue(struct irq_work *work)
 {
 	/* Only queue if not already pending */
 	if (!irq_work_claim(work))
-		return;
+		return false;
 
 	/* Queue the entry and raise the IPI if needed. */
 	preempt_disable();
@@ -83,6 +83,8 @@ void irq_work_queue(struct irq_work *work)
 	}
 
 	preempt_enable();
+
+	return true;
 }
 EXPORT_SYMBOL_GPL(irq_work_queue);
 
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 60bafbed06ab..45601cf41bee 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1039,10 +1039,10 @@ void __weak crash_unmap_reserved_pages(void)
 {}
 
 #ifdef CONFIG_COMPAT
-asmlinkage long compat_sys_kexec_load(unsigned long entry,
-				unsigned long nr_segments,
-				struct compat_kexec_segment __user *segments,
-				unsigned long flags)
+COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
+		       compat_ulong_t, nr_segments,
+		       struct compat_kexec_segment __user *, segments,
+		       compat_ulong_t, flags)
 {
 	struct compat_kexec_segment in;
 	struct kexec_segment out, __user *ksegments;
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index d945a949760f..e660964086e2 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -19,6 +19,8 @@
 #include <linux/sched.h>
 #include <linux/capability.h>
 
+#include <linux/rcupdate.h>	/* rcu_expedited */
+
 #define KERNEL_ATTR_RO(_name) \
 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
 
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index baab8e5e7f66..306a76b51e0f 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,5 +1,5 @@
 
-obj-y += mutex.o semaphore.o rwsem.o lglock.o
+obj-y += mutex.o semaphore.o rwsem.o lglock.o mcs_spinlock.o
 
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_lockdep.o = -pg
@@ -23,3 +23,4 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
 obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
 obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
+obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index eb8a54783fa0..b0e9467922e1 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1936,12 +1936,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1936 1936
1937 for (;;) { 1937 for (;;) {
1938 int distance = curr->lockdep_depth - depth + 1; 1938 int distance = curr->lockdep_depth - depth + 1;
1939 hlock = curr->held_locks + depth-1; 1939 hlock = curr->held_locks + depth - 1;
1940 /* 1940 /*
1941 * Only non-recursive-read entries get new dependencies 1941 * Only non-recursive-read entries get new dependencies
1942 * added: 1942 * added:
1943 */ 1943 */
1944 if (hlock->read != 2) { 1944 if (hlock->read != 2 && hlock->check) {
1945 if (!check_prev_add(curr, hlock, next, 1945 if (!check_prev_add(curr, hlock, next,
1946 distance, trylock_loop)) 1946 distance, trylock_loop))
1947 return 0; 1947 return 0;
@@ -2098,7 +2098,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
2098 * (If lookup_chain_cache() returns with 1 it acquires 2098 * (If lookup_chain_cache() returns with 1 it acquires
2099 * graph_lock for us) 2099 * graph_lock for us)
2100 */ 2100 */
2101 if (!hlock->trylock && (hlock->check == 2) && 2101 if (!hlock->trylock && hlock->check &&
2102 lookup_chain_cache(curr, hlock, chain_key)) { 2102 lookup_chain_cache(curr, hlock, chain_key)) {
2103 /* 2103 /*
2104 * Check whether last held lock: 2104 * Check whether last held lock:
@@ -2517,7 +2517,7 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2517 2517
2518 BUG_ON(usage_bit >= LOCK_USAGE_STATES); 2518 BUG_ON(usage_bit >= LOCK_USAGE_STATES);
2519 2519
2520 if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys) 2520 if (!hlock->check)
2521 continue; 2521 continue;
2522 2522
2523 if (!mark_lock(curr, hlock, usage_bit)) 2523 if (!mark_lock(curr, hlock, usage_bit))
@@ -2557,7 +2557,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip)
2557 debug_atomic_inc(hardirqs_on_events); 2557 debug_atomic_inc(hardirqs_on_events);
2558} 2558}
2559 2559
2560void trace_hardirqs_on_caller(unsigned long ip) 2560__visible void trace_hardirqs_on_caller(unsigned long ip)
2561{ 2561{
2562 time_hardirqs_on(CALLER_ADDR0, ip); 2562 time_hardirqs_on(CALLER_ADDR0, ip);
2563 2563
@@ -2610,7 +2610,7 @@ EXPORT_SYMBOL(trace_hardirqs_on);
2610/* 2610/*
2611 * Hardirqs were disabled: 2611 * Hardirqs were disabled:
2612 */ 2612 */
2613void trace_hardirqs_off_caller(unsigned long ip) 2613__visible void trace_hardirqs_off_caller(unsigned long ip)
2614{ 2614{
2615 struct task_struct *curr = current; 2615 struct task_struct *curr = current;
2616 2616
@@ -3055,9 +3055,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3055 int class_idx; 3055 int class_idx;
3056 u64 chain_key; 3056 u64 chain_key;
3057 3057
3058 if (!prove_locking)
3059 check = 1;
3060
3061 if (unlikely(!debug_locks)) 3058 if (unlikely(!debug_locks))
3062 return 0; 3059 return 0;
3063 3060
@@ -3069,8 +3066,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3069 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 3066 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
3070 return 0; 3067 return 0;
3071 3068
3072 if (lock->key == &__lockdep_no_validate__) 3069 if (!prove_locking || lock->key == &__lockdep_no_validate__)
3073 check = 1; 3070 check = 0;
3074 3071
3075 if (subclass < NR_LOCKDEP_CACHING_CLASSES) 3072 if (subclass < NR_LOCKDEP_CACHING_CLASSES)
3076 class = lock->class_cache[subclass]; 3073 class = lock->class_cache[subclass];
@@ -3138,7 +3135,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3138 hlock->holdtime_stamp = lockstat_clock(); 3135 hlock->holdtime_stamp = lockstat_clock();
3139#endif 3136#endif
3140 3137
3141 if (check == 2 && !mark_irqflags(curr, hlock)) 3138 if (check && !mark_irqflags(curr, hlock))
3142 return 0; 3139 return 0;
3143 3140
3144 /* mark it as used: */ 3141 /* mark it as used: */
@@ -4191,7 +4188,7 @@ void debug_show_held_locks(struct task_struct *task)
4191} 4188}
4192EXPORT_SYMBOL_GPL(debug_show_held_locks); 4189EXPORT_SYMBOL_GPL(debug_show_held_locks);
4193 4190
4194void lockdep_sys_exit(void) 4191asmlinkage void lockdep_sys_exit(void)
4195{ 4192{
4196 struct task_struct *curr = current; 4193 struct task_struct *curr = current;
4197 4194
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
new file mode 100644
index 000000000000..f26b1a18e34e
--- /dev/null
+++ b/kernel/locking/locktorture.c
@@ -0,0 +1,452 @@
1/*
2 * Module-based torture test facility for locking
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright (C) IBM Corporation, 2014
19 *
20 * Author: Paul E. McKenney <paulmck@us.ibm.com>
21 * Based on kernel/rcu/torture.c.
22 */
23#include <linux/types.h>
24#include <linux/kernel.h>
25#include <linux/init.h>
26#include <linux/module.h>
27#include <linux/kthread.h>
28#include <linux/err.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/interrupt.h>
32#include <linux/sched.h>
33#include <linux/atomic.h>
34#include <linux/bitops.h>
35#include <linux/completion.h>
36#include <linux/moduleparam.h>
37#include <linux/percpu.h>
38#include <linux/notifier.h>
39#include <linux/reboot.h>
40#include <linux/freezer.h>
41#include <linux/cpu.h>
42#include <linux/delay.h>
43#include <linux/stat.h>
44#include <linux/slab.h>
45#include <linux/trace_clock.h>
46#include <asm/byteorder.h>
47#include <linux/torture.h>
48
49MODULE_LICENSE("GPL");
50MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
51
52torture_param(int, nwriters_stress, -1,
53 "Number of write-locking stress-test threads");
54torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
55torture_param(int, onoff_interval, 0,
56 "Time between CPU hotplugs (s), 0=disable");
57torture_param(int, shuffle_interval, 3,
58 "Number of jiffies between shuffles, 0=disable");
 59torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
60torture_param(int, stat_interval, 60,
61 "Number of seconds between stats printk()s");
62torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
63torture_param(bool, verbose, true,
64 "Enable verbose debugging printk()s");
65
66static char *torture_type = "spin_lock";
67module_param(torture_type, charp, 0444);
68MODULE_PARM_DESC(torture_type,
69 "Type of lock to torture (spin_lock, spin_lock_irq, ...)");
70
71static atomic_t n_lock_torture_errors;
72
73static struct task_struct *stats_task;
74static struct task_struct **writer_tasks;
75
76static int nrealwriters_stress;
77static bool lock_is_write_held;
78
79struct lock_writer_stress_stats {
80 long n_write_lock_fail;
81 long n_write_lock_acquired;
82};
83static struct lock_writer_stress_stats *lwsa;
84
85#if defined(MODULE) || defined(CONFIG_LOCK_TORTURE_TEST_RUNNABLE)
86#define LOCKTORTURE_RUNNABLE_INIT 1
87#else
88#define LOCKTORTURE_RUNNABLE_INIT 0
89#endif
90int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT;
91module_param(locktorture_runnable, int, 0444);
92MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at boot");
93
94/* Forward reference. */
95static void lock_torture_cleanup(void);
96
97/*
98 * Operations vector for selecting different types of tests.
99 */
100struct lock_torture_ops {
101 void (*init)(void);
102 int (*writelock)(void);
103 void (*write_delay)(struct torture_random_state *trsp);
104 void (*writeunlock)(void);
105 unsigned long flags;
106 const char *name;
107};
108
109static struct lock_torture_ops *cur_ops;
110
111/*
112 * Definitions for lock torture testing.
113 */
114
115static int torture_lock_busted_write_lock(void)
116{
117 return 0; /* BUGGY, do not use in real life!!! */
118}
119
120static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
121{
122 const unsigned long longdelay_us = 100;
123
124 /* We want a long delay occasionally to force massive contention. */
125 if (!(torture_random(trsp) %
126 (nrealwriters_stress * 2000 * longdelay_us)))
127 mdelay(longdelay_us);
128#ifdef CONFIG_PREEMPT
129 if (!(torture_random(trsp) % (nrealwriters_stress * 20000)))
130 preempt_schedule(); /* Allow test to be preempted. */
131#endif
132}
133
134static void torture_lock_busted_write_unlock(void)
135{
136 /* BUGGY, do not use in real life!!! */
137}
138
139static struct lock_torture_ops lock_busted_ops = {
140 .writelock = torture_lock_busted_write_lock,
141 .write_delay = torture_lock_busted_write_delay,
142 .writeunlock = torture_lock_busted_write_unlock,
143 .name = "lock_busted"
144};
145
146static DEFINE_SPINLOCK(torture_spinlock);
147
148static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
149{
150 spin_lock(&torture_spinlock);
151 return 0;
152}
153
154static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
155{
156 const unsigned long shortdelay_us = 2;
157 const unsigned long longdelay_us = 100;
158
159 /* We want a short delay mostly to emulate likely code, and
160 * we want a long delay occasionally to force massive contention.
161 */
162 if (!(torture_random(trsp) %
163 (nrealwriters_stress * 2000 * longdelay_us)))
164 mdelay(longdelay_us);
165 if (!(torture_random(trsp) %
166 (nrealwriters_stress * 2 * shortdelay_us)))
167 udelay(shortdelay_us);
168#ifdef CONFIG_PREEMPT
169 if (!(torture_random(trsp) % (nrealwriters_stress * 20000)))
170 preempt_schedule(); /* Allow test to be preempted. */
171#endif
172}
173
174static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
175{
176 spin_unlock(&torture_spinlock);
177}
178
179static struct lock_torture_ops spin_lock_ops = {
180 .writelock = torture_spin_lock_write_lock,
181 .write_delay = torture_spin_lock_write_delay,
182 .writeunlock = torture_spin_lock_write_unlock,
183 .name = "spin_lock"
184};
185
186static int torture_spin_lock_write_lock_irq(void)
187__acquires(torture_spinlock_irq)
188{
189 unsigned long flags;
190
191 spin_lock_irqsave(&torture_spinlock, flags);
192 cur_ops->flags = flags;
193 return 0;
194}
195
196static void torture_lock_spin_write_unlock_irq(void)
197__releases(torture_spinlock)
198{
199 spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags);
200}
201
202static struct lock_torture_ops spin_lock_irq_ops = {
203 .writelock = torture_spin_lock_write_lock_irq,
204 .write_delay = torture_spin_lock_write_delay,
205 .writeunlock = torture_lock_spin_write_unlock_irq,
206 .name = "spin_lock_irq"
207};
208
209/*
210 * Lock torture writer kthread. Repeatedly acquires and releases
211 * the lock, checking for duplicate acquisitions.
212 */
213static int lock_torture_writer(void *arg)
214{
215 struct lock_writer_stress_stats *lwsp = arg;
216 static DEFINE_TORTURE_RANDOM(rand);
217
218 VERBOSE_TOROUT_STRING("lock_torture_writer task started");
219 set_user_nice(current, 19);
220
221 do {
222 schedule_timeout_uninterruptible(1);
223 cur_ops->writelock();
224 if (WARN_ON_ONCE(lock_is_write_held))
225 lwsp->n_write_lock_fail++;
226 lock_is_write_held = 1;
227 lwsp->n_write_lock_acquired++;
228 cur_ops->write_delay(&rand);
229 lock_is_write_held = 0;
230 cur_ops->writeunlock();
231 stutter_wait("lock_torture_writer");
232 } while (!torture_must_stop());
233 torture_kthread_stopping("lock_torture_writer");
234 return 0;
235}
236
237/*
238 * Create a lock-torture-statistics message in the specified buffer.
239 */
240static void lock_torture_printk(char *page)
241{
242 bool fail = 0;
243 int i;
244 long max = 0;
245 long min = lwsa[0].n_write_lock_acquired;
246 long long sum = 0;
247
248 for (i = 0; i < nrealwriters_stress; i++) {
249 if (lwsa[i].n_write_lock_fail)
250 fail = true;
251 sum += lwsa[i].n_write_lock_acquired;
252 if (max < lwsa[i].n_write_lock_acquired)
253 max = lwsa[i].n_write_lock_acquired;
254 if (min > lwsa[i].n_write_lock_acquired)
255 min = lwsa[i].n_write_lock_acquired;
256 }
257 page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG);
258 page += sprintf(page,
259 "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
260 sum, max, min, max / 2 > min ? "???" : "",
261 fail, fail ? "!!!" : "");
262 if (fail)
263 atomic_inc(&n_lock_torture_errors);
264}
265
266/*
267 * Print torture statistics. Caller must ensure that there is only one
268 * call to this function at a given time!!! This is normally accomplished
269 * by relying on the module system to only have one copy of the module
270 * loaded, and then by giving the lock_torture_stats kthread full control
271 * (or the init/cleanup functions when lock_torture_stats thread is not
272 * running).
273 */
274static void lock_torture_stats_print(void)
275{
276 int size = nrealwriters_stress * 200 + 8192;
277 char *buf;
278
279 buf = kmalloc(size, GFP_KERNEL);
280 if (!buf) {
281 pr_err("lock_torture_stats_print: Out of memory, need: %d",
282 size);
283 return;
284 }
285 lock_torture_printk(buf);
286 pr_alert("%s", buf);
287 kfree(buf);
288}
289
290/*
291 * Periodically prints torture statistics, if periodic statistics printing
292 * was specified via the stat_interval module parameter.
293 *
294 * No need to worry about fullstop here, since this one doesn't reference
295 * volatile state or register callbacks.
296 */
297static int lock_torture_stats(void *arg)
298{
299 VERBOSE_TOROUT_STRING("lock_torture_stats task started");
300 do {
301 schedule_timeout_interruptible(stat_interval * HZ);
302 lock_torture_stats_print();
303 torture_shutdown_absorb("lock_torture_stats");
304 } while (!torture_must_stop());
305 torture_kthread_stopping("lock_torture_stats");
306 return 0;
307}
308
309static inline void
310lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
311 const char *tag)
312{
313 pr_alert("%s" TORTURE_FLAG
314 "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
315 torture_type, tag, nrealwriters_stress, stat_interval, verbose,
316 shuffle_interval, stutter, shutdown_secs,
317 onoff_interval, onoff_holdoff);
318}
319
320static void lock_torture_cleanup(void)
321{
322 int i;
323
324 if (torture_cleanup())
325 return;
326
327 if (writer_tasks) {
328 for (i = 0; i < nrealwriters_stress; i++)
329 torture_stop_kthread(lock_torture_writer,
330 writer_tasks[i]);
331 kfree(writer_tasks);
332 writer_tasks = NULL;
333 }
334
335 torture_stop_kthread(lock_torture_stats, stats_task);
336 lock_torture_stats_print(); /* -After- the stats thread is stopped! */
337
338 if (atomic_read(&n_lock_torture_errors))
339 lock_torture_print_module_parms(cur_ops,
340 "End of test: FAILURE");
341 else if (torture_onoff_failures())
342 lock_torture_print_module_parms(cur_ops,
343 "End of test: LOCK_HOTPLUG");
344 else
345 lock_torture_print_module_parms(cur_ops,
346 "End of test: SUCCESS");
347}
348
349static int __init lock_torture_init(void)
350{
351 int i;
352 int firsterr = 0;
353 static struct lock_torture_ops *torture_ops[] = {
354 &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops,
355 };
356
357 torture_init_begin(torture_type, verbose, &locktorture_runnable);
358
359 /* Process args and tell the world that the torturer is on the job. */
360 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
361 cur_ops = torture_ops[i];
362 if (strcmp(torture_type, cur_ops->name) == 0)
363 break;
364 }
365 if (i == ARRAY_SIZE(torture_ops)) {
366 pr_alert("lock-torture: invalid torture type: \"%s\"\n",
367 torture_type);
368 pr_alert("lock-torture types:");
369 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
370 pr_alert(" %s", torture_ops[i]->name);
371 pr_alert("\n");
372 torture_init_end();
373 return -EINVAL;
374 }
375 if (cur_ops->init)
376 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
377
378 if (nwriters_stress >= 0)
379 nrealwriters_stress = nwriters_stress;
380 else
381 nrealwriters_stress = 2 * num_online_cpus();
382 lock_torture_print_module_parms(cur_ops, "Start of test");
383
384 /* Initialize the statistics so that each run gets its own numbers. */
385
386 lock_is_write_held = 0;
387 lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL);
388 if (lwsa == NULL) {
389 VERBOSE_TOROUT_STRING("lwsa: Out of memory");
390 firsterr = -ENOMEM;
391 goto unwind;
392 }
393 for (i = 0; i < nrealwriters_stress; i++) {
394 lwsa[i].n_write_lock_fail = 0;
395 lwsa[i].n_write_lock_acquired = 0;
396 }
397
398 /* Start up the kthreads. */
399
400 if (onoff_interval > 0) {
401 firsterr = torture_onoff_init(onoff_holdoff * HZ,
402 onoff_interval * HZ);
403 if (firsterr)
404 goto unwind;
405 }
406 if (shuffle_interval > 0) {
407 firsterr = torture_shuffle_init(shuffle_interval);
408 if (firsterr)
409 goto unwind;
410 }
411 if (shutdown_secs > 0) {
412 firsterr = torture_shutdown_init(shutdown_secs,
413 lock_torture_cleanup);
414 if (firsterr)
415 goto unwind;
416 }
417 if (stutter > 0) {
418 firsterr = torture_stutter_init(stutter);
419 if (firsterr)
420 goto unwind;
421 }
422
423 writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]),
424 GFP_KERNEL);
425 if (writer_tasks == NULL) {
426 VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
427 firsterr = -ENOMEM;
428 goto unwind;
429 }
430 for (i = 0; i < nrealwriters_stress; i++) {
431 firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i],
432 writer_tasks[i]);
433 if (firsterr)
434 goto unwind;
435 }
436 if (stat_interval > 0) {
437 firsterr = torture_create_kthread(lock_torture_stats, NULL,
438 stats_task);
439 if (firsterr)
440 goto unwind;
441 }
442 torture_init_end();
443 return 0;
444
445unwind:
446 torture_init_end();
447 lock_torture_cleanup();
448 return firsterr;
449}
450
451module_init(lock_torture_init);
452module_exit(lock_torture_cleanup);
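lock_torture_init() selects the lock flavour by matching the torture_type module parameter against the name field of one of the lock_torture_ops entries, so covering another primitive only means supplying writelock/write_delay/writeunlock callbacks and listing the new ops in torture_ops[]. As a hedged illustration (not part of this patch), a mutex flavour could look roughly like this, reusing the existing spinlock delay function:

    /* Hypothetical extra flavour for kernel/locking/locktorture.c. */
    #include <linux/mutex.h>        /* not currently included by the file */

    static DEFINE_MUTEX(torture_mutex);

    static int torture_mutex_lock(void) __acquires(torture_mutex)
    {
            mutex_lock(&torture_mutex);
            return 0;
    }

    static void torture_mutex_unlock(void) __releases(torture_mutex)
    {
            mutex_unlock(&torture_mutex);
    }

    static struct lock_torture_ops mutex_lock_ops = {
            .writelock      = torture_mutex_lock,
            .write_delay    = torture_spin_lock_write_delay,
            .writeunlock    = torture_mutex_unlock,
            .name           = "mutex_lock"
    };

    /* ...and &mutex_lock_ops would be added to torture_ops[] in lock_torture_init(). */

The module would then be exercised with something like torture_type=mutex_lock nwriters_stress=8 on the modprobe command line.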
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
new file mode 100644
index 000000000000..838dc9e00669
--- /dev/null
+++ b/kernel/locking/mcs_spinlock.c
@@ -0,0 +1,178 @@
1
2#include <linux/percpu.h>
3#include <linux/mutex.h>
4#include <linux/sched.h>
5#include "mcs_spinlock.h"
6
7#ifdef CONFIG_SMP
8
9/*
10 * An MCS like lock especially tailored for optimistic spinning for sleeping
11 * lock implementations (mutex, rwsem, etc).
12 *
13 * Using a single mcs node per CPU is safe because sleeping locks should not be
14 * called from interrupt context and we have preemption disabled while
15 * spinning.
16 */
17static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node);
18
19/*
20 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
21 * Can return NULL in case we were the last queued and we updated @lock instead.
22 */
23static inline struct optimistic_spin_queue *
24osq_wait_next(struct optimistic_spin_queue **lock,
25 struct optimistic_spin_queue *node,
26 struct optimistic_spin_queue *prev)
27{
28 struct optimistic_spin_queue *next = NULL;
29
30 for (;;) {
31 if (*lock == node && cmpxchg(lock, node, prev) == node) {
32 /*
33 * We were the last queued, we moved @lock back. @prev
34 * will now observe @lock and will complete its
35 * unlock()/unqueue().
36 */
37 break;
38 }
39
40 /*
41 * We must xchg() the @node->next value, because if we were to
42 * leave it in, a concurrent unlock()/unqueue() from
43 * @node->next might complete Step-A and think its @prev is
44 * still valid.
45 *
46 * If the concurrent unlock()/unqueue() wins the race, we'll
47 * wait for either @lock to point to us, through its Step-B, or
48 * wait for a new @node->next from its Step-C.
49 */
50 if (node->next) {
51 next = xchg(&node->next, NULL);
52 if (next)
53 break;
54 }
55
56 arch_mutex_cpu_relax();
57 }
58
59 return next;
60}
61
62bool osq_lock(struct optimistic_spin_queue **lock)
63{
64 struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
65 struct optimistic_spin_queue *prev, *next;
66
67 node->locked = 0;
68 node->next = NULL;
69
70 node->prev = prev = xchg(lock, node);
71 if (likely(prev == NULL))
72 return true;
73
74 ACCESS_ONCE(prev->next) = node;
75
76 /*
77 * Normally @prev is untouchable after the above store; because at that
78 * moment unlock can proceed and wipe the node element from stack.
79 *
80 * However, since our nodes are static per-cpu storage, we're
81 * guaranteed their existence -- this allows us to apply
82 * cmpxchg in an attempt to undo our queueing.
83 */
84
85 while (!smp_load_acquire(&node->locked)) {
86 /*
 87 * If we need to reschedule, bail out so we can block.
88 */
89 if (need_resched())
90 goto unqueue;
91
92 arch_mutex_cpu_relax();
93 }
94 return true;
95
96unqueue:
97 /*
98 * Step - A -- stabilize @prev
99 *
100 * Undo our @prev->next assignment; this will make @prev's
101 * unlock()/unqueue() wait for a next pointer since @lock points to us
102 * (or later).
103 */
104
105 for (;;) {
106 if (prev->next == node &&
107 cmpxchg(&prev->next, node, NULL) == node)
108 break;
109
110 /*
111 * We can only fail the cmpxchg() racing against an unlock(),
112 * in which case we should observe @node->locked becoming
113 * true.
114 */
115 if (smp_load_acquire(&node->locked))
116 return true;
117
118 arch_mutex_cpu_relax();
119
120 /*
121 * Or we race against a concurrent unqueue()'s step-B, in which
122 * case its step-C will write us a new @node->prev pointer.
123 */
124 prev = ACCESS_ONCE(node->prev);
125 }
126
127 /*
128 * Step - B -- stabilize @next
129 *
130 * Similar to unlock(), wait for @node->next or move @lock from @node
131 * back to @prev.
132 */
133
134 next = osq_wait_next(lock, node, prev);
135 if (!next)
136 return false;
137
138 /*
139 * Step - C -- unlink
140 *
141 * @prev is stable because it's still waiting for a new @prev->next
142 * pointer, @next is stable because our @node->next pointer is NULL and
143 * it will wait in Step-A.
144 */
145
146 ACCESS_ONCE(next->prev) = prev;
147 ACCESS_ONCE(prev->next) = next;
148
149 return false;
150}
151
152void osq_unlock(struct optimistic_spin_queue **lock)
153{
154 struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
155 struct optimistic_spin_queue *next;
156
157 /*
158 * Fast path for the uncontended case.
159 */
160 if (likely(cmpxchg(lock, node, NULL) == node))
161 return;
162
163 /*
164 * Second most likely case.
165 */
166 next = xchg(&node->next, NULL);
167 if (next) {
168 ACCESS_ONCE(next->locked) = 1;
169 return;
170 }
171
172 next = osq_wait_next(lock, node, NULL);
173 if (next)
174 ACCESS_ONCE(next->locked) = 1;
175}
176
177#endif
178
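Unlike the plain MCS lock, osq_lock() can give up: when need_resched() is observed while waiting, it unqueues itself (Steps A-C above) and returns false, and the caller is expected to stop optimistic spinning and fall back to its sleeping slowpath. A schematic of that calling pattern is sketched below; struct demo_sleeping_lock and demo_try_spin() are illustrative stand-ins for the real users (the mutex conversion later in this patch is the first one).

    #include <linux/sched.h>
    #include <linux/mutex.h>        /* arch_mutex_cpu_relax() fallback */
    #include "mcs_spinlock.h"

    /* Illustrative only: the shape of an osq-based optimistic spin. */
    struct demo_sleeping_lock {
            struct task_struct              *owner;
            struct optimistic_spin_queue    *osq;
    };

    static bool demo_try_spin(struct demo_sleeping_lock *lock)
    {
            if (!osq_lock(&lock->osq))
                    return false;   /* unqueued (need_resched()); go block instead */

            while (ACCESS_ONCE(lock->owner)) {
                    if (need_resched())
                            break;  /* stop spinning so this CPU can reschedule */
                    arch_mutex_cpu_relax();
            }

            osq_unlock(&lock->osq);
            return true;            /* caller now try-locks or falls into its slowpath */
    }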
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
new file mode 100644
index 000000000000..a2dbac4aca6b
--- /dev/null
+++ b/kernel/locking/mcs_spinlock.h
@@ -0,0 +1,129 @@
1/*
2 * MCS lock defines
3 *
4 * This file contains the main data structure and API definitions of MCS lock.
5 *
6 * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
7 * with the desirable properties of being fair, and with each cpu trying
8 * to acquire the lock spinning on a local variable.
9 * It avoids expensive cache bouncings that common test-and-set spin-lock
10 * implementations incur.
11 */
12#ifndef __LINUX_MCS_SPINLOCK_H
13#define __LINUX_MCS_SPINLOCK_H
14
15#include <asm/mcs_spinlock.h>
16
17struct mcs_spinlock {
18 struct mcs_spinlock *next;
19 int locked; /* 1 if lock acquired */
20};
21
22#ifndef arch_mcs_spin_lock_contended
23/*
24 * Using smp_load_acquire() provides a memory barrier that ensures
25 * subsequent operations happen after the lock is acquired.
26 */
27#define arch_mcs_spin_lock_contended(l) \
28do { \
29 while (!(smp_load_acquire(l))) \
30 arch_mutex_cpu_relax(); \
31} while (0)
32#endif
33
34#ifndef arch_mcs_spin_unlock_contended
35/*
36 * smp_store_release() provides a memory barrier to ensure all
 37 * operations in the critical section have been completed before
38 * unlocking.
39 */
40#define arch_mcs_spin_unlock_contended(l) \
41 smp_store_release((l), 1)
42#endif
43
44/*
45 * Note: the smp_load_acquire/smp_store_release pair is not
46 * sufficient to form a full memory barrier across
47 * cpus for many architectures (except x86) for mcs_unlock and mcs_lock.
48 * For applications that need a full barrier across multiple cpus
49 * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be
50 * used after mcs_lock.
51 */
52
53/*
54 * In order to acquire the lock, the caller should declare a local node and
55 * pass a reference of the node to this function in addition to the lock.
56 * If the lock has already been acquired, then this will proceed to spin
57 * on this node->locked until the previous lock holder sets the node->locked
58 * in mcs_spin_unlock().
59 *
60 * We don't inline mcs_spin_lock() so that perf can correctly account for the
61 * time spent in this lock function.
62 */
63static inline
64void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
65{
66 struct mcs_spinlock *prev;
67
68 /* Init node */
69 node->locked = 0;
70 node->next = NULL;
71
72 prev = xchg(lock, node);
73 if (likely(prev == NULL)) {
74 /*
75 * Lock acquired, don't need to set node->locked to 1. Threads
76 * only spin on its own node->locked value for lock acquisition.
77 * However, since this thread can immediately acquire the lock
78 * and does not proceed to spin on its own node->locked, this
79 * value won't be used. If a debug mode is needed to
80 * audit lock status, then set node->locked value here.
81 */
82 return;
83 }
84 ACCESS_ONCE(prev->next) = node;
85
86 /* Wait until the lock holder passes the lock down. */
87 arch_mcs_spin_lock_contended(&node->locked);
88}
89
90/*
91 * Releases the lock. The caller should pass in the corresponding node that
92 * was used to acquire the lock.
93 */
94static inline
95void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
96{
97 struct mcs_spinlock *next = ACCESS_ONCE(node->next);
98
99 if (likely(!next)) {
100 /*
101 * Release the lock by setting it to NULL
102 */
103 if (likely(cmpxchg(lock, node, NULL) == node))
104 return;
105 /* Wait until the next pointer is set */
106 while (!(next = ACCESS_ONCE(node->next)))
107 arch_mutex_cpu_relax();
108 }
109
110 /* Pass lock to next waiter. */
111 arch_mcs_spin_unlock_contended(&next->locked);
112}
113
114/*
115 * Cancellable version of the MCS lock above.
116 *
117 * Intended for adaptive spinning of sleeping locks:
118 * mutex_lock()/rwsem_down_{read,write}() etc.
119 */
120
121struct optimistic_spin_queue {
122 struct optimistic_spin_queue *next, *prev;
123 int locked; /* 1 if lock acquired */
124};
125
126extern bool osq_lock(struct optimistic_spin_queue **lock);
127extern void osq_unlock(struct optimistic_spin_queue **lock);
128
129#endif /* __LINUX_MCS_SPINLOCK_H */
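As the header comment says, the non-cancellable lock is used with one queue node per acquisition, typically on the caller's stack: the node is appended at the tail, the CPU spins on its own node->locked, and the previous holder hands the lock over through arch_mcs_spin_unlock_contended(). A minimal usage sketch, with the lock pointer and function invented for illustration:

    #include "mcs_spinlock.h"

    static struct mcs_spinlock *demo_mcs;   /* NULL = unlocked, otherwise tail of the queue */

    static void demo_locked_work(void)
    {
            struct mcs_spinlock node;       /* one node per acquisition, lives on this stack */

            mcs_spin_lock(&demo_mcs, &node);
            /* critical section: exactly one CPU at a time runs this */
            mcs_spin_unlock(&demo_mcs, &node);
    }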
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index faf6f5b53e77..e1191c996c59 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -83,6 +83,12 @@ void debug_mutex_unlock(struct mutex *lock)
83 83
84 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 84 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
85 mutex_clear_owner(lock); 85 mutex_clear_owner(lock);
86
87 /*
88 * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
89 * mutexes so that we can do it here after we've verified state.
90 */
91 atomic_set(&lock->count, 1);
86} 92}
87 93
88void debug_mutex_init(struct mutex *lock, const char *name, 94void debug_mutex_init(struct mutex *lock, const char *name,
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 4dd6e4c219de..bc73d33c6760 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -25,6 +25,7 @@
25#include <linux/spinlock.h> 25#include <linux/spinlock.h>
26#include <linux/interrupt.h> 26#include <linux/interrupt.h>
27#include <linux/debug_locks.h> 27#include <linux/debug_locks.h>
28#include "mcs_spinlock.h"
28 29
29/* 30/*
30 * In the DEBUG case we are using the "NULL fastpath" for mutexes, 31 * In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -33,6 +34,13 @@
33#ifdef CONFIG_DEBUG_MUTEXES 34#ifdef CONFIG_DEBUG_MUTEXES
34# include "mutex-debug.h" 35# include "mutex-debug.h"
35# include <asm-generic/mutex-null.h> 36# include <asm-generic/mutex-null.h>
37/*
38 * Must be 0 for the debug case so we do not do the unlock outside of the
39 * wait_lock region. debug_mutex_unlock() will do the actual unlock in this
40 * case.
41 */
42# undef __mutex_slowpath_needs_to_unlock
43# define __mutex_slowpath_needs_to_unlock() 0
36#else 44#else
37# include "mutex.h" 45# include "mutex.h"
38# include <asm/mutex.h> 46# include <asm/mutex.h>
@@ -52,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
52 INIT_LIST_HEAD(&lock->wait_list); 60 INIT_LIST_HEAD(&lock->wait_list);
53 mutex_clear_owner(lock); 61 mutex_clear_owner(lock);
54#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 62#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
55 lock->spin_mlock = NULL; 63 lock->osq = NULL;
56#endif 64#endif
57 65
58 debug_mutex_init(lock, name, key); 66 debug_mutex_init(lock, name, key);
@@ -67,8 +75,7 @@ EXPORT_SYMBOL(__mutex_init);
67 * We also put the fastpath first in the kernel image, to make sure the 75 * We also put the fastpath first in the kernel image, to make sure the
68 * branch is predicted by the CPU as default-untaken. 76 * branch is predicted by the CPU as default-untaken.
69 */ 77 */
70static __used noinline void __sched 78__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
71__mutex_lock_slowpath(atomic_t *lock_count);
72 79
73/** 80/**
74 * mutex_lock - acquire the mutex 81 * mutex_lock - acquire the mutex
@@ -111,54 +118,7 @@ EXPORT_SYMBOL(mutex_lock);
111 * more or less simultaneously, the spinners need to acquire a MCS lock 118 * more or less simultaneously, the spinners need to acquire a MCS lock
112 * first before spinning on the owner field. 119 * first before spinning on the owner field.
113 * 120 *
114 * We don't inline mspin_lock() so that perf can correctly account for the
115 * time spent in this lock function.
116 */ 121 */
117struct mspin_node {
118 struct mspin_node *next ;
119 int locked; /* 1 if lock acquired */
120};
121#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
122
123static noinline
124void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
125{
126 struct mspin_node *prev;
127
128 /* Init node */
129 node->locked = 0;
130 node->next = NULL;
131
132 prev = xchg(lock, node);
133 if (likely(prev == NULL)) {
134 /* Lock acquired */
135 node->locked = 1;
136 return;
137 }
138 ACCESS_ONCE(prev->next) = node;
139 smp_wmb();
140 /* Wait until the lock holder passes the lock down */
141 while (!ACCESS_ONCE(node->locked))
142 arch_mutex_cpu_relax();
143}
144
145static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
146{
147 struct mspin_node *next = ACCESS_ONCE(node->next);
148
149 if (likely(!next)) {
150 /*
151 * Release the lock by setting it to NULL
152 */
153 if (cmpxchg(lock, node, NULL) == node)
154 return;
155 /* Wait until the next pointer is set */
156 while (!(next = ACCESS_ONCE(node->next)))
157 arch_mutex_cpu_relax();
158 }
159 ACCESS_ONCE(next->locked) = 1;
160 smp_wmb();
161}
162 122
163/* 123/*
164 * Mutex spinning code migrated from kernel/sched/core.c 124 * Mutex spinning code migrated from kernel/sched/core.c
@@ -212,6 +172,9 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
212 struct task_struct *owner; 172 struct task_struct *owner;
213 int retval = 1; 173 int retval = 1;
214 174
175 if (need_resched())
176 return 0;
177
215 rcu_read_lock(); 178 rcu_read_lock();
216 owner = ACCESS_ONCE(lock->owner); 179 owner = ACCESS_ONCE(lock->owner);
217 if (owner) 180 if (owner)
@@ -225,7 +188,8 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
225} 188}
226#endif 189#endif
227 190
228static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 191__visible __used noinline
192void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
229 193
230/** 194/**
231 * mutex_unlock - release the mutex 195 * mutex_unlock - release the mutex
@@ -446,9 +410,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
446 if (!mutex_can_spin_on_owner(lock)) 410 if (!mutex_can_spin_on_owner(lock))
447 goto slowpath; 411 goto slowpath;
448 412
413 if (!osq_lock(&lock->osq))
414 goto slowpath;
415
449 for (;;) { 416 for (;;) {
450 struct task_struct *owner; 417 struct task_struct *owner;
451 struct mspin_node node;
452 418
453 if (use_ww_ctx && ww_ctx->acquired > 0) { 419 if (use_ww_ctx && ww_ctx->acquired > 0) {
454 struct ww_mutex *ww; 420 struct ww_mutex *ww;
@@ -463,19 +429,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
463 * performed the optimistic spinning cannot be done. 429 * performed the optimistic spinning cannot be done.
464 */ 430 */
465 if (ACCESS_ONCE(ww->ctx)) 431 if (ACCESS_ONCE(ww->ctx))
466 goto slowpath; 432 break;
467 } 433 }
468 434
469 /* 435 /*
470 * If there's an owner, wait for it to either 436 * If there's an owner, wait for it to either
471 * release the lock or go to sleep. 437 * release the lock or go to sleep.
472 */ 438 */
473 mspin_lock(MLOCK(lock), &node);
474 owner = ACCESS_ONCE(lock->owner); 439 owner = ACCESS_ONCE(lock->owner);
475 if (owner && !mutex_spin_on_owner(lock, owner)) { 440 if (owner && !mutex_spin_on_owner(lock, owner))
476 mspin_unlock(MLOCK(lock), &node); 441 break;
477 goto slowpath;
478 }
479 442
480 if ((atomic_read(&lock->count) == 1) && 443 if ((atomic_read(&lock->count) == 1) &&
481 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { 444 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
@@ -488,11 +451,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
488 } 451 }
489 452
490 mutex_set_owner(lock); 453 mutex_set_owner(lock);
491 mspin_unlock(MLOCK(lock), &node); 454 osq_unlock(&lock->osq);
492 preempt_enable(); 455 preempt_enable();
493 return 0; 456 return 0;
494 } 457 }
495 mspin_unlock(MLOCK(lock), &node);
496 458
497 /* 459 /*
498 * When there's no owner, we might have preempted between the 460 * When there's no owner, we might have preempted between the
@@ -501,7 +463,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
501 * the owner complete. 463 * the owner complete.
502 */ 464 */
503 if (!owner && (need_resched() || rt_task(task))) 465 if (!owner && (need_resched() || rt_task(task)))
504 goto slowpath; 466 break;
505 467
506 /* 468 /*
507 * The cpu_relax() call is a compiler barrier which forces 469 * The cpu_relax() call is a compiler barrier which forces
@@ -511,7 +473,15 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
511 */ 473 */
512 arch_mutex_cpu_relax(); 474 arch_mutex_cpu_relax();
513 } 475 }
476 osq_unlock(&lock->osq);
514slowpath: 477slowpath:
478 /*
479 * If we fell out of the spin path because of need_resched(),
480 * reschedule now, before we try-lock the mutex. This avoids getting
481 * scheduled out right after we obtained the mutex.
482 */
483 if (need_resched())
484 schedule_preempt_disabled();
515#endif 485#endif
516 spin_lock_mutex(&lock->wait_lock, flags); 486 spin_lock_mutex(&lock->wait_lock, flags);
517 487
@@ -717,10 +687,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
717 struct mutex *lock = container_of(lock_count, struct mutex, count); 687 struct mutex *lock = container_of(lock_count, struct mutex, count);
718 unsigned long flags; 688 unsigned long flags;
719 689
720 spin_lock_mutex(&lock->wait_lock, flags);
721 mutex_release(&lock->dep_map, nested, _RET_IP_);
722 debug_mutex_unlock(lock);
723
724 /* 690 /*
725 * some architectures leave the lock unlocked in the fastpath failure 691 * some architectures leave the lock unlocked in the fastpath failure
726 * case, others need to leave it locked. In the later case we have to 692 * case, others need to leave it locked. In the later case we have to
@@ -729,6 +695,10 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
729 if (__mutex_slowpath_needs_to_unlock()) 695 if (__mutex_slowpath_needs_to_unlock())
730 atomic_set(&lock->count, 1); 696 atomic_set(&lock->count, 1);
731 697
698 spin_lock_mutex(&lock->wait_lock, flags);
699 mutex_release(&lock->dep_map, nested, _RET_IP_);
700 debug_mutex_unlock(lock);
701
732 if (!list_empty(&lock->wait_list)) { 702 if (!list_empty(&lock->wait_list)) {
733 /* get the first entry from the wait-list: */ 703 /* get the first entry from the wait-list: */
734 struct mutex_waiter *waiter = 704 struct mutex_waiter *waiter =
@@ -746,7 +716,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
746/* 716/*
747 * Release the lock, slowpath: 717 * Release the lock, slowpath:
748 */ 718 */
749static __used noinline void 719__visible void
750__mutex_unlock_slowpath(atomic_t *lock_count) 720__mutex_unlock_slowpath(atomic_t *lock_count)
751{ 721{
752 __mutex_unlock_common_slowpath(lock_count, 1); 722 __mutex_unlock_common_slowpath(lock_count, 1);
@@ -803,7 +773,7 @@ int __sched mutex_lock_killable(struct mutex *lock)
803} 773}
804EXPORT_SYMBOL(mutex_lock_killable); 774EXPORT_SYMBOL(mutex_lock_killable);
805 775
806static __used noinline void __sched 776__visible void __sched
807__mutex_lock_slowpath(atomic_t *lock_count) 777__mutex_lock_slowpath(atomic_t *lock_count)
808{ 778{
809 struct mutex *lock = container_of(lock_count, struct mutex, count); 779 struct mutex *lock = container_of(lock_count, struct mutex, count);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 2e960a2bab81..aa4dff04b594 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -213,6 +213,18 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
213} 213}
214 214
215/* 215/*
216 * Called by sched_setscheduler() to check whether the priority change
217 * is overruled by a possible priority boosting.
218 */
219int rt_mutex_check_prio(struct task_struct *task, int newprio)
220{
221 if (!task_has_pi_waiters(task))
222 return 0;
223
224 return task_top_pi_waiter(task)->task->prio <= newprio;
225}
226
227/*
216 * Adjust the priority of a task, after its pi_waiters got modified. 228 * Adjust the priority of a task, after its pi_waiters got modified.
217 * 229 *
218 * This can be both boosting and unboosting. task->pi_lock must be held. 230 * This can be both boosting and unboosting. task->pi_lock must be held.
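rt_mutex_check_prio() lets the scheduler ask whether a requested priority change would be masked by priority inheritance: it returns non-zero when the task's highest-priority PI waiter already demands newprio or better (lower prio value means higher priority, hence the <= comparison). A hedged sketch of the kind of caller this is intended for; the demo_* helpers are placeholders, not the actual sched_setscheduler() code:

    #include <linux/sched.h>

    /* Mirrors the definition added above; the real prototype lives in the headers. */
    extern int rt_mutex_check_prio(struct task_struct *task, int newprio);

    /* Placeholder stand-ins for the real scheduler-side bookkeeping. */
    static void demo_store_params(struct task_struct *p, int newprio) { }
    static void demo_requeue(struct task_struct *p) { }

    static int demo_setscheduler(struct task_struct *p, int newprio)
    {
            if (rt_mutex_check_prio(p, newprio)) {
                    /*
                     * A boosted waiter keeps the effective priority at or
                     * above newprio: record the new parameters only and let
                     * rt_mutex deboosting apply them later.
                     */
                    demo_store_params(p, newprio);
                    return 0;
            }

            demo_store_params(p, newprio);
            demo_requeue(p);        /* the effective priority really changes */
            return 0;
    }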
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 19c5fa95e0b4..1d66e08e897d 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -143,6 +143,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
143/* 143/*
144 * wait for the read lock to be granted 144 * wait for the read lock to be granted
145 */ 145 */
146__visible
146struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) 147struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
147{ 148{
148 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; 149 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
@@ -190,6 +191,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
190/* 191/*
191 * wait until we successfully acquire the write lock 192 * wait until we successfully acquire the write lock
192 */ 193 */
194__visible
193struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) 195struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
194{ 196{
195 long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; 197 long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS;
@@ -252,6 +254,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
252 * handle waking up a waiter on the semaphore 254 * handle waking up a waiter on the semaphore
253 * - up_read/up_write has decremented the active part of count if we come here 255 * - up_read/up_write has decremented the active part of count if we come here
254 */ 256 */
257__visible
255struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) 258struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
256{ 259{
257 unsigned long flags; 260 unsigned long flags;
@@ -272,6 +275,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
272 * - caller incremented waiting part of count and discovered it still negative 275 * - caller incremented waiting part of count and discovered it still negative
273 * - just wake up any readers at the front of the queue 276 * - just wake up any readers at the front of the queue
274 */ 277 */
278__visible
275struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) 279struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
276{ 280{
277 unsigned long flags; 281 unsigned long flags;
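The __visible annotations added here (and in the lockdep, mutex and panic hunks elsewhere in this diff) mark slow-path entry points that have no ordinary C caller in their own translation unit, typically because they are reached from fast-path assembly or register-call glue; presumably the point is to keep link-time optimization from localizing or discarding the symbols. The pattern, sketched with a placeholder function (the mapping of __visible to gcc's externally_visible attribute is the usual compiler-header definition, not something this patch shows):

    #include <linux/compiler.h>

    /*
     * Entry point reached only from assembly / fastpath glue: annotate it
     * so its symbol stays externally visible under LTO.
     */
    __visible void demo_slowpath_entry(void)
    {
            /* the real slow-path work would live here */
    }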
diff --git a/kernel/module.c b/kernel/module.c
index d24fcf29cb64..8dc7f5e80dd8 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1015,7 +1015,7 @@ static size_t module_flags_taint(struct module *mod, char *buf)
1015 buf[l++] = 'C'; 1015 buf[l++] = 'C';
1016 /* 1016 /*
1017 * TAINT_FORCED_RMMOD: could be added. 1017 * TAINT_FORCED_RMMOD: could be added.
1018 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't 1018 * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
1019 * apply to modules. 1019 * apply to modules.
1020 */ 1020 */
1021 return l; 1021 return l;
@@ -1948,6 +1948,10 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1948 1948
1949 switch (sym[i].st_shndx) { 1949 switch (sym[i].st_shndx) {
1950 case SHN_COMMON: 1950 case SHN_COMMON:
1951 /* Ignore common symbols */
1952 if (!strncmp(name, "__gnu_lto", 9))
1953 break;
1954
1951 /* We compiled with -fno-common. These are not 1955 /* We compiled with -fno-common. These are not
1952 supposed to happen. */ 1956 supposed to happen. */
1953 pr_debug("Common symbol: %s\n", name); 1957 pr_debug("Common symbol: %s\n", name);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 2d5cc4ccff7f..db4c8b08a50c 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
309 * racy then it does not matter what the result of the test 309 * racy then it does not matter what the result of the test
310 * is, we re-check the list after having taken the lock anyway: 310 * is, we re-check the list after having taken the lock anyway:
311 */ 311 */
312 if (rcu_dereference_raw(nh->head)) { 312 if (rcu_access_pointer(nh->head)) {
313 down_read(&nh->rwsem); 313 down_read(&nh->rwsem);
314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, 314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
315 nr_calls); 315 nr_calls);
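rcu_access_pointer() fits here because nh->head is only being tested against NULL, never dereferenced: unlike rcu_dereference_raw() it carries no implication that a dereference follows, so it needs no RCU read-side critical section and better documents the intent. A small contrast sketch with an invented RCU-protected pointer:

    #include <linux/rcupdate.h>

    struct demo_cfg { int value; };
    static struct demo_cfg __rcu *demo_cfg_ptr;     /* illustrative global */

    static int demo_read_value(void)
    {
            struct demo_cfg *cfg;
            int val = 0;

            /* Cheap emptiness test: no rcu_read_lock() required. */
            if (!rcu_access_pointer(demo_cfg_ptr))
                    return 0;

            rcu_read_lock();
            cfg = rcu_dereference(demo_cfg_ptr);    /* actual dereference */
            if (cfg)
                    val = cfg->value;
            rcu_read_unlock();

            return val;
    }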
diff --git a/kernel/panic.c b/kernel/panic.c
index 6d6300375090..cca8a913ae7c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -199,7 +199,7 @@ struct tnt {
199static const struct tnt tnts[] = { 199static const struct tnt tnts[] = {
200 { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, 200 { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
201 { TAINT_FORCED_MODULE, 'F', ' ' }, 201 { TAINT_FORCED_MODULE, 'F', ' ' },
202 { TAINT_UNSAFE_SMP, 'S', ' ' }, 202 { TAINT_CPU_OUT_OF_SPEC, 'S', ' ' },
203 { TAINT_FORCED_RMMOD, 'R', ' ' }, 203 { TAINT_FORCED_RMMOD, 'R', ' ' },
204 { TAINT_MACHINE_CHECK, 'M', ' ' }, 204 { TAINT_MACHINE_CHECK, 'M', ' ' },
205 { TAINT_BAD_PAGE, 'B', ' ' }, 205 { TAINT_BAD_PAGE, 'B', ' ' },
@@ -459,7 +459,7 @@ EXPORT_SYMBOL(warn_slowpath_null);
459 * Called when gcc's -fstack-protector feature is used, and 459 * Called when gcc's -fstack-protector feature is used, and
460 * gcc detects corruption of the on-stack canary value 460 * gcc detects corruption of the on-stack canary value
461 */ 461 */
462void __stack_chk_fail(void) 462__visible void __stack_chk_fail(void)
463{ 463{
464 panic("stack-protector: Kernel stack is corrupted in: %p\n", 464 panic("stack-protector: Kernel stack is corrupted in: %p\n",
465 __builtin_return_address(0)); 465 __builtin_return_address(0));
diff --git a/kernel/power/console.c b/kernel/power/console.c
index eacb8bd8cab4..aba9c545a0e3 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -9,6 +9,7 @@
9#include <linux/kbd_kern.h> 9#include <linux/kbd_kern.h>
10#include <linux/vt.h> 10#include <linux/vt.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/slab.h>
12#include "power.h" 13#include "power.h"
13 14
14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 15#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b1d255f04135..4dae9cbe9259 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1076,7 +1076,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1076 next_seq = log_next_seq; 1076 next_seq = log_next_seq;
1077 1077
1078 len = 0; 1078 len = 0;
1079 prev = 0;
1080 while (len >= 0 && seq < next_seq) { 1079 while (len >= 0 && seq < next_seq) {
1081 struct printk_log *msg = log_from_idx(idx); 1080 struct printk_log *msg = log_from_idx(idx);
1082 int textlen; 1081 int textlen;
@@ -2788,7 +2787,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2788 next_idx = idx; 2787 next_idx = idx;
2789 2788
2790 l = 0; 2789 l = 0;
2791 prev = 0;
2792 while (seq < dumper->next_seq) { 2790 while (seq < dumper->next_seq) {
2793 struct printk_log *msg = log_from_idx(idx); 2791 struct printk_log *msg = log_from_idx(idx);
2794 2792
diff --git a/kernel/profile.c b/kernel/profile.c
index 6631e1ef55ab..ebdd9c1a86b4 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -549,14 +549,14 @@ static int create_hash_tables(void)
549 struct page *page; 549 struct page *page;
550 550
551 page = alloc_pages_exact_node(node, 551 page = alloc_pages_exact_node(node,
552 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 552 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
553 0); 553 0);
554 if (!page) 554 if (!page)
555 goto out_cleanup; 555 goto out_cleanup;
556 per_cpu(cpu_profile_hits, cpu)[1] 556 per_cpu(cpu_profile_hits, cpu)[1]
557 = (struct profile_hit *)page_address(page); 557 = (struct profile_hit *)page_address(page);
558 page = alloc_pages_exact_node(node, 558 page = alloc_pages_exact_node(node,
559 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 559 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
560 0); 560 0);
561 if (!page) 561 if (!page)
562 goto out_cleanup; 562 goto out_cleanup;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1f4bcb3cc21c..adf98622cb32 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1180,8 +1180,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
1180 return ret; 1180 return ret;
1181} 1181}
1182 1182
1183asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, 1183COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid,
1184 compat_long_t addr, compat_long_t data) 1184 compat_long_t, addr, compat_long_t, data)
1185{ 1185{
1186 struct task_struct *child; 1186 struct task_struct *child;
1187 long ret; 1187 long ret;
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 01e9ec37a3e3..807ccfbf69b3 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,5 +1,5 @@
1obj-y += update.o srcu.o 1obj-y += update.o srcu.o
2obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o 2obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
3obj-$(CONFIG_TREE_RCU) += tree.o 3obj-$(CONFIG_TREE_RCU) += tree.o
4obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o 4obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o
5obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o 5obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 79c3877e9c5b..bfda2726ca45 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright IBM Corporation, 2011 18 * Copyright IBM Corporation, 2011
19 * 19 *
@@ -23,6 +23,7 @@
23#ifndef __LINUX_RCU_H 23#ifndef __LINUX_RCU_H
24#define __LINUX_RCU_H 24#define __LINUX_RCU_H
25 25
26#include <trace/events/rcu.h>
26#ifdef CONFIG_RCU_TRACE 27#ifdef CONFIG_RCU_TRACE
27#define RCU_TRACE(stmt) stmt 28#define RCU_TRACE(stmt) stmt
28#else /* #ifdef CONFIG_RCU_TRACE */ 29#else /* #ifdef CONFIG_RCU_TRACE */
@@ -116,8 +117,6 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
116 } 117 }
117} 118}
118 119
119extern int rcu_expedited;
120
121#ifdef CONFIG_RCU_STALL_COMMON 120#ifdef CONFIG_RCU_STALL_COMMON
122 121
123extern int rcu_cpu_stall_suppress; 122extern int rcu_cpu_stall_suppress;
diff --git a/kernel/rcu/torture.c b/kernel/rcu/rcutorture.c
index 732f8ae3086a..bd30bc61bc05 100644
--- a/kernel/rcu/torture.c
+++ b/kernel/rcu/rcutorture.c
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2005, 2006 18 * Copyright (C) IBM Corporation, 2005, 2006
19 * 19 *
@@ -48,110 +48,58 @@
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <linux/trace_clock.h> 49#include <linux/trace_clock.h>
50#include <asm/byteorder.h> 50#include <asm/byteorder.h>
51#include <linux/torture.h>
51 52
52MODULE_LICENSE("GPL"); 53MODULE_LICENSE("GPL");
53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); 54MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
54 55
55MODULE_ALIAS("rcutorture"); 56
56#ifdef MODULE_PARAM_PREFIX 57torture_param(int, fqs_duration, 0,
57#undef MODULE_PARAM_PREFIX 58 "Duration of fqs bursts (us), 0 to disable");
58#endif 59torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
59#define MODULE_PARAM_PREFIX "rcutorture." 60torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
60 61torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
61static int fqs_duration; 62torture_param(bool, gp_normal, false,
62module_param(fqs_duration, int, 0444); 63 "Use normal (non-expedited) GP wait primitives");
63MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); 64torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
64static int fqs_holdoff; 65torture_param(int, n_barrier_cbs, 0,
65module_param(fqs_holdoff, int, 0444); 66 "# of callbacks/kthreads for barrier testing");
66MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 67torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads");
67static int fqs_stutter = 3; 68torture_param(int, nreaders, -1, "Number of RCU reader threads");
68module_param(fqs_stutter, int, 0444); 69torture_param(int, object_debug, 0,
69MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 70 "Enable debug-object double call_rcu() testing");
70static bool gp_exp; 71torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
71module_param(gp_exp, bool, 0444); 72torture_param(int, onoff_interval, 0,
72MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); 73 "Time between CPU hotplugs (s), 0=disable");
73static bool gp_normal; 74torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
74module_param(gp_normal, bool, 0444); 75torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
75MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); 76torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
76static int irqreader = 1; 77torture_param(int, stall_cpu_holdoff, 10,
77module_param(irqreader, int, 0444); 78 "Time to wait before starting stall (s).");
78MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); 79torture_param(int, stat_interval, 60,
79static int n_barrier_cbs; 80 "Number of seconds between stats printk()s");
80module_param(n_barrier_cbs, int, 0444); 81torture_param(int, stutter, 5, "Number of seconds to run/halt test");
81MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); 82torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
82static int nfakewriters = 4; 83torture_param(int, test_boost_duration, 4,
83module_param(nfakewriters, int, 0444); 84 "Duration of each boost test, seconds.");
84MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); 85torture_param(int, test_boost_interval, 7,
85static int nreaders = -1; 86 "Interval between boost tests, seconds.");
86module_param(nreaders, int, 0444); 87torture_param(bool, test_no_idle_hz, true,
87MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 88 "Test support for tickless idle CPUs");
88static int object_debug; 89torture_param(bool, verbose, true,
89module_param(object_debug, int, 0444); 90 "Enable verbose debugging printk()s");
90MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); 91
91static int onoff_holdoff;
92module_param(onoff_holdoff, int, 0444);
93MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)");
94static int onoff_interval;
95module_param(onoff_interval, int, 0444);
96MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
97static int shuffle_interval = 3;
98module_param(shuffle_interval, int, 0444);
99MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
100static int shutdown_secs;
101module_param(shutdown_secs, int, 0444);
102MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable.");
103static int stall_cpu;
104module_param(stall_cpu, int, 0444);
105MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable.");
106static int stall_cpu_holdoff = 10;
107module_param(stall_cpu_holdoff, int, 0444);
108MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s).");
109static int stat_interval = 60;
110module_param(stat_interval, int, 0644);
111MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
112static int stutter = 5;
113module_param(stutter, int, 0444);
114MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
115static int test_boost = 1;
116module_param(test_boost, int, 0444);
117MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
118static int test_boost_duration = 4;
119module_param(test_boost_duration, int, 0444);
120MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
121static int test_boost_interval = 7;
122module_param(test_boost_interval, int, 0444);
123MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
124static bool test_no_idle_hz = true;
125module_param(test_no_idle_hz, bool, 0444);
126MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
127static char *torture_type = "rcu"; 92static char *torture_type = "rcu";
128module_param(torture_type, charp, 0444); 93module_param(torture_type, charp, 0444);
129MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); 94MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)");
130static bool verbose;
131module_param(verbose, bool, 0444);
132MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
133
134#define TORTURE_FLAG "-torture:"
135#define PRINTK_STRING(s) \
136 do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0)
137#define VERBOSE_PRINTK_STRING(s) \
138 do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0)
139#define VERBOSE_PRINTK_ERRSTRING(s) \
140 do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
141 95
142static int nrealreaders; 96static int nrealreaders;
143static struct task_struct *writer_task; 97static struct task_struct *writer_task;
144static struct task_struct **fakewriter_tasks; 98static struct task_struct **fakewriter_tasks;
145static struct task_struct **reader_tasks; 99static struct task_struct **reader_tasks;
146static struct task_struct *stats_task; 100static struct task_struct *stats_task;
147static struct task_struct *shuffler_task;
148static struct task_struct *stutter_task;
149static struct task_struct *fqs_task; 101static struct task_struct *fqs_task;
150static struct task_struct *boost_tasks[NR_CPUS]; 102static struct task_struct *boost_tasks[NR_CPUS];
151static struct task_struct *shutdown_task;
152#ifdef CONFIG_HOTPLUG_CPU
153static struct task_struct *onoff_task;
154#endif /* #ifdef CONFIG_HOTPLUG_CPU */
155static struct task_struct *stall_task; 103static struct task_struct *stall_task;
156static struct task_struct **barrier_cbs_tasks; 104static struct task_struct **barrier_cbs_tasks;
157static struct task_struct *barrier_task; 105static struct task_struct *barrier_task;
@@ -170,10 +118,10 @@ static struct rcu_torture __rcu *rcu_torture_current;
170static unsigned long rcu_torture_current_version; 118static unsigned long rcu_torture_current_version;
171static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 119static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
172static DEFINE_SPINLOCK(rcu_torture_lock); 120static DEFINE_SPINLOCK(rcu_torture_lock);
173static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 121static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
174 { 0 }; 122 rcu_torture_count) = { 0 };
175static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = 123static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
176 { 0 }; 124 rcu_torture_batch) = { 0 };
177static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; 125static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
178static atomic_t n_rcu_torture_alloc; 126static atomic_t n_rcu_torture_alloc;
179static atomic_t n_rcu_torture_alloc_fail; 127static atomic_t n_rcu_torture_alloc_fail;
@@ -186,22 +134,9 @@ static long n_rcu_torture_boost_rterror;
186static long n_rcu_torture_boost_failure; 134static long n_rcu_torture_boost_failure;
187static long n_rcu_torture_boosts; 135static long n_rcu_torture_boosts;
188static long n_rcu_torture_timers; 136static long n_rcu_torture_timers;
189static long n_offline_attempts;
190static long n_offline_successes;
191static unsigned long sum_offline;
192static int min_offline = -1;
193static int max_offline;
194static long n_online_attempts;
195static long n_online_successes;
196static unsigned long sum_online;
197static int min_online = -1;
198static int max_online;
199static long n_barrier_attempts; 137static long n_barrier_attempts;
200static long n_barrier_successes; 138static long n_barrier_successes;
201static struct list_head rcu_torture_removed; 139static struct list_head rcu_torture_removed;
202static cpumask_var_t shuffle_tmp_mask;
203
204static int stutter_pause_test;
205 140
206#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) 141#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
207#define RCUTORTURE_RUNNABLE_INIT 1 142#define RCUTORTURE_RUNNABLE_INIT 1
@@ -232,7 +167,6 @@ static u64 notrace rcu_trace_clock_local(void)
232} 167}
233#endif /* #else #ifdef CONFIG_RCU_TRACE */ 168#endif /* #else #ifdef CONFIG_RCU_TRACE */
234 169
235static unsigned long shutdown_time; /* jiffies to system shutdown. */
236static unsigned long boost_starttime; /* jiffies of next boost test start. */ 170static unsigned long boost_starttime; /* jiffies of next boost test start. */
237DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
238 /* and boost task create/destroy. */ 172 /* and boost task create/destroy. */
@@ -242,51 +176,6 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
242static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ 176static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
243static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); 177static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
244 178
245/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
246
247#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
248#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */
249#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */
250static int fullstop = FULLSTOP_RMMOD;
251/*
252 * Protect fullstop transitions and spawning of kthreads.
253 */
254static DEFINE_MUTEX(fullstop_mutex);
255
256/* Forward reference. */
257static void rcu_torture_cleanup(void);
258
259/*
260 * Detect and respond to a system shutdown.
261 */
262static int
263rcutorture_shutdown_notify(struct notifier_block *unused1,
264 unsigned long unused2, void *unused3)
265{
266 mutex_lock(&fullstop_mutex);
267 if (fullstop == FULLSTOP_DONTSTOP)
268 fullstop = FULLSTOP_SHUTDOWN;
269 else
270 pr_warn(/* but going down anyway, so... */
271 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
272 mutex_unlock(&fullstop_mutex);
273 return NOTIFY_DONE;
274}
275
276/*
277 * Absorb kthreads into a kernel function that won't return, so that
278 * they won't ever access module text or data again.
279 */
280static void rcutorture_shutdown_absorb(const char *title)
281{
282 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
283 pr_notice(
284 "rcutorture thread %s parking due to system shutdown\n",
285 title);
286 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
287 }
288}
289
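Note on the conversion: throughout this patch the open-coded "kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP" tests become torture_must_stop() and torture_must_stop_irq(). The following is a minimal sketch of the equivalent predicates, reconstructed from the removed code above; it assumes the new kernel/torture.c keeps the FULLSTOP state shown here, and is not the kernel source itself.

#include <linux/kthread.h>

#define FULLSTOP_DONTSTOP 0	/* Normal operation. */
#define FULLSTOP_SHUTDOWN 1	/* System shutdown with torture test running. */
#define FULLSTOP_RMMOD    2	/* Normal rmmod of the torture module. */

static int fullstop = FULLSTOP_RMMOD;	/* Now owned by kernel/torture.c. */

/* Callback/irq context cannot call kthread_should_stop(); see rcu_torture_cb(). */
static inline bool torture_must_stop_irq(void)
{
	return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP;
}

/* True when a torture kthread must break out of its main loop. */
static inline bool torture_must_stop(void)
{
	return torture_must_stop_irq() || kthread_should_stop();
}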
290/* 179/*
291 * Allocate an element from the rcu_tortures pool. 180 * Allocate an element from the rcu_tortures pool.
292 */ 181 */
@@ -320,44 +209,6 @@ rcu_torture_free(struct rcu_torture *p)
320 spin_unlock_bh(&rcu_torture_lock); 209 spin_unlock_bh(&rcu_torture_lock);
321} 210}
322 211
323struct rcu_random_state {
324 unsigned long rrs_state;
325 long rrs_count;
326};
327
328#define RCU_RANDOM_MULT 39916801 /* prime */
329#define RCU_RANDOM_ADD 479001701 /* prime */
330#define RCU_RANDOM_REFRESH 10000
331
332#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 }
333
334/*
335 * Crude but fast random-number generator. Uses a linear congruential
336 * generator, with occasional help from cpu_clock().
337 */
338static unsigned long
339rcu_random(struct rcu_random_state *rrsp)
340{
341 if (--rrsp->rrs_count < 0) {
342 rrsp->rrs_state += (unsigned long)local_clock();
343 rrsp->rrs_count = RCU_RANDOM_REFRESH;
344 }
345 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
346 return swahw32(rrsp->rrs_state);
347}
348
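For reference, the rcu_random() helper above moves to kernel/torture.c as torture_random(). Below is a standalone userspace sketch of the same linear-congruential scheme, using the constants shown above; clock_gettime() and an explicit halfword swap stand in for local_clock() and swahw32(), so this is an approximation rather than the kernel source.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define TORTURE_RANDOM_MULT	39916801	/* prime */
#define TORTURE_RANDOM_ADD	479001701	/* prime */
#define TORTURE_RANDOM_REFRESH	10000

struct torture_random_state {
	unsigned long trs_state;
	long trs_count;
};

/* Crude but fast LCG, periodically reseeded from the clock. */
static unsigned long torture_random(struct torture_random_state *trsp)
{
	if (--trsp->trs_count < 0) {
		struct timespec ts;

		/* Fold in a little clock entropy, as local_clock() did. */
		clock_gettime(CLOCK_MONOTONIC, &ts);
		trsp->trs_state += (unsigned long)ts.tv_nsec;
		trsp->trs_count = TORTURE_RANDOM_REFRESH;
	}
	trsp->trs_state = trsp->trs_state * TORTURE_RANDOM_MULT +
			  TORTURE_RANDOM_ADD;
	/* swahw32() stand-in: swap the 16-bit halfwords of the low 32 bits. */
	return ((trsp->trs_state & 0xffff) << 16) |
	       ((trsp->trs_state >> 16) & 0xffff);
}

int main(void)
{
	struct torture_random_state trs = { 0, 0 };
	int i;

	for (i = 0; i < 4; i++)
		printf("%lu\n", torture_random(&trs) % 10);
	return 0;
}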
349static void
350rcu_stutter_wait(const char *title)
351{
352 while (stutter_pause_test || !rcutorture_runnable) {
353 if (rcutorture_runnable)
354 schedule_timeout_interruptible(1);
355 else
356 schedule_timeout_interruptible(round_jiffies_relative(HZ));
357 rcutorture_shutdown_absorb(title);
358 }
359}
360
361/* 212/*
362 * Operations vector for selecting different types of tests. 213 * Operations vector for selecting different types of tests.
363 */ 214 */
@@ -365,7 +216,7 @@ rcu_stutter_wait(const char *title)
365struct rcu_torture_ops { 216struct rcu_torture_ops {
366 void (*init)(void); 217 void (*init)(void);
367 int (*readlock)(void); 218 int (*readlock)(void);
368 void (*read_delay)(struct rcu_random_state *rrsp); 219 void (*read_delay)(struct torture_random_state *rrsp);
369 void (*readunlock)(int idx); 220 void (*readunlock)(int idx);
370 int (*completed)(void); 221 int (*completed)(void);
371 void (*deferred_free)(struct rcu_torture *p); 222 void (*deferred_free)(struct rcu_torture *p);
@@ -392,7 +243,7 @@ static int rcu_torture_read_lock(void) __acquires(RCU)
392 return 0; 243 return 0;
393} 244}
394 245
395static void rcu_read_delay(struct rcu_random_state *rrsp) 246static void rcu_read_delay(struct torture_random_state *rrsp)
396{ 247{
397 const unsigned long shortdelay_us = 200; 248 const unsigned long shortdelay_us = 200;
398 const unsigned long longdelay_ms = 50; 249 const unsigned long longdelay_ms = 50;
@@ -401,12 +252,13 @@ static void rcu_read_delay(struct rcu_random_state *rrsp)
401 * period, and we want a long delay occasionally to trigger 252 * period, and we want a long delay occasionally to trigger
402 * force_quiescent_state. */ 253 * force_quiescent_state. */
403 254
404 if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) 255 if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
405 mdelay(longdelay_ms); 256 mdelay(longdelay_ms);
406 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) 257 if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
407 udelay(shortdelay_us); 258 udelay(shortdelay_us);
408#ifdef CONFIG_PREEMPT 259#ifdef CONFIG_PREEMPT
409 if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) 260 if (!preempt_count() &&
261 !(torture_random(rrsp) % (nrealreaders * 20000)))
410 preempt_schedule(); /* No QS if preempt_disable() in effect */ 262 preempt_schedule(); /* No QS if preempt_disable() in effect */
411#endif 263#endif
412} 264}
@@ -427,7 +279,7 @@ rcu_torture_cb(struct rcu_head *p)
427 int i; 279 int i;
428 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); 280 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
429 281
430 if (fullstop != FULLSTOP_DONTSTOP) { 282 if (torture_must_stop_irq()) {
431 /* Test is ending, just drop callbacks on the floor. */ 283 /* Test is ending, just drop callbacks on the floor. */
432 /* The next initialization will pick up the pieces. */ 284 /* The next initialization will pick up the pieces. */
433 return; 285 return;
@@ -520,6 +372,48 @@ static struct rcu_torture_ops rcu_bh_ops = {
520}; 372};
521 373
522/* 374/*
375 * Don't even think about trying any of these in real life!!!
 376 * The names include "busted", and they really mean it!
377 * The only purpose of these functions is to provide a buggy RCU
378 * implementation to make sure that rcutorture correctly emits
379 * buggy-RCU error messages.
380 */
381static void rcu_busted_torture_deferred_free(struct rcu_torture *p)
382{
383 /* This is a deliberate bug for testing purposes only! */
384 rcu_torture_cb(&p->rtort_rcu);
385}
386
387static void synchronize_rcu_busted(void)
388{
389 /* This is a deliberate bug for testing purposes only! */
390}
391
392static void
393call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
394{
395 /* This is a deliberate bug for testing purposes only! */
396 func(head);
397}
398
399static struct rcu_torture_ops rcu_busted_ops = {
400 .init = rcu_sync_torture_init,
401 .readlock = rcu_torture_read_lock,
402 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
403 .readunlock = rcu_torture_read_unlock,
404 .completed = rcu_no_completed,
405 .deferred_free = rcu_busted_torture_deferred_free,
406 .sync = synchronize_rcu_busted,
407 .exp_sync = synchronize_rcu_busted,
408 .call = call_rcu_busted,
409 .cb_barrier = NULL,
410 .fqs = NULL,
411 .stats = NULL,
412 .irq_capable = 1,
413 .name = "rcu_busted"
414};
415
416/*
523 * Definitions for srcu torture testing. 417 * Definitions for srcu torture testing.
524 */ 418 */
525 419
@@ -530,7 +424,7 @@ static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
530 return srcu_read_lock(&srcu_ctl); 424 return srcu_read_lock(&srcu_ctl);
531} 425}
532 426
533static void srcu_read_delay(struct rcu_random_state *rrsp) 427static void srcu_read_delay(struct torture_random_state *rrsp)
534{ 428{
535 long delay; 429 long delay;
536 const long uspertick = 1000000 / HZ; 430 const long uspertick = 1000000 / HZ;
@@ -538,7 +432,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
538 432
539 /* We want there to be long-running readers, but not all the time. */ 433 /* We want there to be long-running readers, but not all the time. */
540 434
541 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); 435 delay = torture_random(rrsp) %
436 (nrealreaders * 2 * longdelay * uspertick);
542 if (!delay) 437 if (!delay)
543 schedule_timeout_interruptible(longdelay); 438 schedule_timeout_interruptible(longdelay);
544 else 439 else
@@ -677,12 +572,12 @@ static int rcu_torture_boost(void *arg)
677 struct rcu_boost_inflight rbi = { .inflight = 0 }; 572 struct rcu_boost_inflight rbi = { .inflight = 0 };
678 struct sched_param sp; 573 struct sched_param sp;
679 574
680 VERBOSE_PRINTK_STRING("rcu_torture_boost started"); 575 VERBOSE_TOROUT_STRING("rcu_torture_boost started");
681 576
682 /* Set real-time priority. */ 577 /* Set real-time priority. */
683 sp.sched_priority = 1; 578 sp.sched_priority = 1;
684 if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { 579 if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
685 VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); 580 VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!");
686 n_rcu_torture_boost_rterror++; 581 n_rcu_torture_boost_rterror++;
687 } 582 }
688 583
@@ -693,9 +588,8 @@ static int rcu_torture_boost(void *arg)
693 oldstarttime = boost_starttime; 588 oldstarttime = boost_starttime;
694 while (ULONG_CMP_LT(jiffies, oldstarttime)) { 589 while (ULONG_CMP_LT(jiffies, oldstarttime)) {
695 schedule_timeout_interruptible(oldstarttime - jiffies); 590 schedule_timeout_interruptible(oldstarttime - jiffies);
696 rcu_stutter_wait("rcu_torture_boost"); 591 stutter_wait("rcu_torture_boost");
697 if (kthread_should_stop() || 592 if (torture_must_stop())
698 fullstop != FULLSTOP_DONTSTOP)
699 goto checkwait; 593 goto checkwait;
700 } 594 }
701 595
@@ -710,15 +604,14 @@ static int rcu_torture_boost(void *arg)
710 call_rcu(&rbi.rcu, rcu_torture_boost_cb); 604 call_rcu(&rbi.rcu, rcu_torture_boost_cb);
711 if (jiffies - call_rcu_time > 605 if (jiffies - call_rcu_time >
712 test_boost_duration * HZ - HZ / 2) { 606 test_boost_duration * HZ - HZ / 2) {
713 VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); 607 VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed");
714 n_rcu_torture_boost_failure++; 608 n_rcu_torture_boost_failure++;
715 } 609 }
716 call_rcu_time = jiffies; 610 call_rcu_time = jiffies;
717 } 611 }
718 cond_resched(); 612 cond_resched();
719 rcu_stutter_wait("rcu_torture_boost"); 613 stutter_wait("rcu_torture_boost");
720 if (kthread_should_stop() || 614 if (torture_must_stop())
721 fullstop != FULLSTOP_DONTSTOP)
722 goto checkwait; 615 goto checkwait;
723 } 616 }
724 617
@@ -742,16 +635,17 @@ static int rcu_torture_boost(void *arg)
742 } 635 }
743 636
744 /* Go do the stutter. */ 637 /* Go do the stutter. */
745checkwait: rcu_stutter_wait("rcu_torture_boost"); 638checkwait: stutter_wait("rcu_torture_boost");
746 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 639 } while (!torture_must_stop());
747 640
748 /* Clean up and exit. */ 641 /* Clean up and exit. */
749 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); 642 while (!kthread_should_stop() || rbi.inflight) {
750 rcutorture_shutdown_absorb("rcu_torture_boost"); 643 torture_shutdown_absorb("rcu_torture_boost");
751 while (!kthread_should_stop() || rbi.inflight)
752 schedule_timeout_uninterruptible(1); 644 schedule_timeout_uninterruptible(1);
645 }
753 smp_mb(); /* order accesses to ->inflight before stack-frame death. */ 646 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
754 destroy_rcu_head_on_stack(&rbi.rcu); 647 destroy_rcu_head_on_stack(&rbi.rcu);
648 torture_kthread_stopping("rcu_torture_boost");
755 return 0; 649 return 0;
756} 650}
757 651
@@ -766,7 +660,7 @@ rcu_torture_fqs(void *arg)
766 unsigned long fqs_resume_time; 660 unsigned long fqs_resume_time;
767 int fqs_burst_remaining; 661 int fqs_burst_remaining;
768 662
769 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); 663 VERBOSE_TOROUT_STRING("rcu_torture_fqs task started");
770 do { 664 do {
771 fqs_resume_time = jiffies + fqs_stutter * HZ; 665 fqs_resume_time = jiffies + fqs_stutter * HZ;
772 while (ULONG_CMP_LT(jiffies, fqs_resume_time) && 666 while (ULONG_CMP_LT(jiffies, fqs_resume_time) &&
@@ -780,12 +674,9 @@ rcu_torture_fqs(void *arg)
780 udelay(fqs_holdoff); 674 udelay(fqs_holdoff);
781 fqs_burst_remaining -= fqs_holdoff; 675 fqs_burst_remaining -= fqs_holdoff;
782 } 676 }
783 rcu_stutter_wait("rcu_torture_fqs"); 677 stutter_wait("rcu_torture_fqs");
784 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 678 } while (!torture_must_stop());
785 VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); 679 torture_kthread_stopping("rcu_torture_fqs");
786 rcutorture_shutdown_absorb("rcu_torture_fqs");
787 while (!kthread_should_stop())
788 schedule_timeout_uninterruptible(1);
789 return 0; 680 return 0;
790} 681}
791 682
@@ -802,10 +693,10 @@ rcu_torture_writer(void *arg)
802 struct rcu_torture *rp; 693 struct rcu_torture *rp;
803 struct rcu_torture *rp1; 694 struct rcu_torture *rp1;
804 struct rcu_torture *old_rp; 695 struct rcu_torture *old_rp;
805 static DEFINE_RCU_RANDOM(rand); 696 static DEFINE_TORTURE_RANDOM(rand);
806 697
807 VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); 698 VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
808 set_user_nice(current, 19); 699 set_user_nice(current, MAX_NICE);
809 700
810 do { 701 do {
811 schedule_timeout_uninterruptible(1); 702 schedule_timeout_uninterruptible(1);
@@ -813,7 +704,7 @@ rcu_torture_writer(void *arg)
813 if (rp == NULL) 704 if (rp == NULL)
814 continue; 705 continue;
815 rp->rtort_pipe_count = 0; 706 rp->rtort_pipe_count = 0;
816 udelay(rcu_random(&rand) & 0x3ff); 707 udelay(torture_random(&rand) & 0x3ff);
817 old_rp = rcu_dereference_check(rcu_torture_current, 708 old_rp = rcu_dereference_check(rcu_torture_current,
818 current == writer_task); 709 current == writer_task);
819 rp->rtort_mbtest = 1; 710 rp->rtort_mbtest = 1;
@@ -826,7 +717,7 @@ rcu_torture_writer(void *arg)
826 atomic_inc(&rcu_torture_wcount[i]); 717 atomic_inc(&rcu_torture_wcount[i]);
827 old_rp->rtort_pipe_count++; 718 old_rp->rtort_pipe_count++;
828 if (gp_normal == gp_exp) 719 if (gp_normal == gp_exp)
829 exp = !!(rcu_random(&rand) & 0x80); 720 exp = !!(torture_random(&rand) & 0x80);
830 else 721 else
831 exp = gp_exp; 722 exp = gp_exp;
832 if (!exp) { 723 if (!exp) {
@@ -852,12 +743,9 @@ rcu_torture_writer(void *arg)
852 } 743 }
853 } 744 }
854 rcutorture_record_progress(++rcu_torture_current_version); 745 rcutorture_record_progress(++rcu_torture_current_version);
855 rcu_stutter_wait("rcu_torture_writer"); 746 stutter_wait("rcu_torture_writer");
856 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 747 } while (!torture_must_stop());
857 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 748 torture_kthread_stopping("rcu_torture_writer");
858 rcutorture_shutdown_absorb("rcu_torture_writer");
859 while (!kthread_should_stop())
860 schedule_timeout_uninterruptible(1);
861 return 0; 749 return 0;
862} 750}
863 751
@@ -868,19 +756,19 @@ rcu_torture_writer(void *arg)
868static int 756static int
869rcu_torture_fakewriter(void *arg) 757rcu_torture_fakewriter(void *arg)
870{ 758{
871 DEFINE_RCU_RANDOM(rand); 759 DEFINE_TORTURE_RANDOM(rand);
872 760
873 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); 761 VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
874 set_user_nice(current, 19); 762 set_user_nice(current, MAX_NICE);
875 763
876 do { 764 do {
877 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 765 schedule_timeout_uninterruptible(1 + torture_random(&rand)%10);
878 udelay(rcu_random(&rand) & 0x3ff); 766 udelay(torture_random(&rand) & 0x3ff);
879 if (cur_ops->cb_barrier != NULL && 767 if (cur_ops->cb_barrier != NULL &&
880 rcu_random(&rand) % (nfakewriters * 8) == 0) { 768 torture_random(&rand) % (nfakewriters * 8) == 0) {
881 cur_ops->cb_barrier(); 769 cur_ops->cb_barrier();
882 } else if (gp_normal == gp_exp) { 770 } else if (gp_normal == gp_exp) {
883 if (rcu_random(&rand) & 0x80) 771 if (torture_random(&rand) & 0x80)
884 cur_ops->sync(); 772 cur_ops->sync();
885 else 773 else
886 cur_ops->exp_sync(); 774 cur_ops->exp_sync();
@@ -889,13 +777,10 @@ rcu_torture_fakewriter(void *arg)
889 } else { 777 } else {
890 cur_ops->exp_sync(); 778 cur_ops->exp_sync();
891 } 779 }
892 rcu_stutter_wait("rcu_torture_fakewriter"); 780 stutter_wait("rcu_torture_fakewriter");
893 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 781 } while (!torture_must_stop());
894 782
895 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); 783 torture_kthread_stopping("rcu_torture_fakewriter");
896 rcutorture_shutdown_absorb("rcu_torture_fakewriter");
897 while (!kthread_should_stop())
898 schedule_timeout_uninterruptible(1);
899 return 0; 784 return 0;
900} 785}
901 786
@@ -921,7 +806,7 @@ static void rcu_torture_timer(unsigned long unused)
921 int idx; 806 int idx;
922 int completed; 807 int completed;
923 int completed_end; 808 int completed_end;
924 static DEFINE_RCU_RANDOM(rand); 809 static DEFINE_TORTURE_RANDOM(rand);
925 static DEFINE_SPINLOCK(rand_lock); 810 static DEFINE_SPINLOCK(rand_lock);
926 struct rcu_torture *p; 811 struct rcu_torture *p;
927 int pipe_count; 812 int pipe_count;
@@ -980,14 +865,14 @@ rcu_torture_reader(void *arg)
980 int completed; 865 int completed;
981 int completed_end; 866 int completed_end;
982 int idx; 867 int idx;
983 DEFINE_RCU_RANDOM(rand); 868 DEFINE_TORTURE_RANDOM(rand);
984 struct rcu_torture *p; 869 struct rcu_torture *p;
985 int pipe_count; 870 int pipe_count;
986 struct timer_list t; 871 struct timer_list t;
987 unsigned long long ts; 872 unsigned long long ts;
988 873
989 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 874 VERBOSE_TOROUT_STRING("rcu_torture_reader task started");
990 set_user_nice(current, 19); 875 set_user_nice(current, MAX_NICE);
991 if (irqreader && cur_ops->irq_capable) 876 if (irqreader && cur_ops->irq_capable)
992 setup_timer_on_stack(&t, rcu_torture_timer, 0); 877 setup_timer_on_stack(&t, rcu_torture_timer, 0);
993 878
@@ -1034,14 +919,11 @@ rcu_torture_reader(void *arg)
1034 preempt_enable(); 919 preempt_enable();
1035 cur_ops->readunlock(idx); 920 cur_ops->readunlock(idx);
1036 schedule(); 921 schedule();
1037 rcu_stutter_wait("rcu_torture_reader"); 922 stutter_wait("rcu_torture_reader");
1038 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 923 } while (!torture_must_stop());
1039 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
1040 rcutorture_shutdown_absorb("rcu_torture_reader");
1041 if (irqreader && cur_ops->irq_capable) 924 if (irqreader && cur_ops->irq_capable)
1042 del_timer_sync(&t); 925 del_timer_sync(&t);
1043 while (!kthread_should_stop()) 926 torture_kthread_stopping("rcu_torture_reader");
1044 schedule_timeout_uninterruptible(1);
1045 return 0; 927 return 0;
1046} 928}
1047 929
@@ -1083,13 +965,7 @@ rcu_torture_printk(char *page)
1083 n_rcu_torture_boost_failure, 965 n_rcu_torture_boost_failure,
1084 n_rcu_torture_boosts, 966 n_rcu_torture_boosts,
1085 n_rcu_torture_timers); 967 n_rcu_torture_timers);
1086 page += sprintf(page, 968 page = torture_onoff_stats(page);
1087 "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
1088 n_online_successes, n_online_attempts,
1089 n_offline_successes, n_offline_attempts,
1090 min_online, max_online,
1091 min_offline, max_offline,
1092 sum_online, sum_offline, HZ);
1093 page += sprintf(page, "barrier: %ld/%ld:%ld", 969 page += sprintf(page, "barrier: %ld/%ld:%ld",
1094 n_barrier_successes, 970 n_barrier_successes,
1095 n_barrier_attempts, 971 n_barrier_attempts,
@@ -1150,123 +1026,17 @@ rcu_torture_stats_print(void)
1150/* 1026/*
1151 * Periodically prints torture statistics, if periodic statistics printing 1027 * Periodically prints torture statistics, if periodic statistics printing
1152 * was specified via the stat_interval module parameter. 1028 * was specified via the stat_interval module parameter.
1153 *
1154 * No need to worry about fullstop here, since this one doesn't reference
1155 * volatile state or register callbacks.
1156 */ 1029 */
1157static int 1030static int
1158rcu_torture_stats(void *arg) 1031rcu_torture_stats(void *arg)
1159{ 1032{
1160 VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); 1033 VERBOSE_TOROUT_STRING("rcu_torture_stats task started");
1161 do { 1034 do {
1162 schedule_timeout_interruptible(stat_interval * HZ); 1035 schedule_timeout_interruptible(stat_interval * HZ);
1163 rcu_torture_stats_print(); 1036 rcu_torture_stats_print();
1164 rcutorture_shutdown_absorb("rcu_torture_stats"); 1037 torture_shutdown_absorb("rcu_torture_stats");
1165 } while (!kthread_should_stop()); 1038 } while (!torture_must_stop());
1166 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); 1039 torture_kthread_stopping("rcu_torture_stats");
1167 return 0;
1168}
1169
1170static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
1171
1172/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case
1173 * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs.
1174 */
1175static void rcu_torture_shuffle_tasks(void)
1176{
1177 int i;
1178
1179 cpumask_setall(shuffle_tmp_mask);
1180 get_online_cpus();
1181
1182 /* No point in shuffling if there is only one online CPU (ex: UP) */
1183 if (num_online_cpus() == 1) {
1184 put_online_cpus();
1185 return;
1186 }
1187
1188 if (rcu_idle_cpu != -1)
1189 cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask);
1190
1191 set_cpus_allowed_ptr(current, shuffle_tmp_mask);
1192
1193 if (reader_tasks) {
1194 for (i = 0; i < nrealreaders; i++)
1195 if (reader_tasks[i])
1196 set_cpus_allowed_ptr(reader_tasks[i],
1197 shuffle_tmp_mask);
1198 }
1199 if (fakewriter_tasks) {
1200 for (i = 0; i < nfakewriters; i++)
1201 if (fakewriter_tasks[i])
1202 set_cpus_allowed_ptr(fakewriter_tasks[i],
1203 shuffle_tmp_mask);
1204 }
1205 if (writer_task)
1206 set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
1207 if (stats_task)
1208 set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
1209 if (stutter_task)
1210 set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask);
1211 if (fqs_task)
1212 set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask);
1213 if (shutdown_task)
1214 set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask);
1215#ifdef CONFIG_HOTPLUG_CPU
1216 if (onoff_task)
1217 set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask);
1218#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1219 if (stall_task)
1220 set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask);
1221 if (barrier_cbs_tasks)
1222 for (i = 0; i < n_barrier_cbs; i++)
1223 if (barrier_cbs_tasks[i])
1224 set_cpus_allowed_ptr(barrier_cbs_tasks[i],
1225 shuffle_tmp_mask);
1226 if (barrier_task)
1227 set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask);
1228
1229 if (rcu_idle_cpu == -1)
1230 rcu_idle_cpu = num_online_cpus() - 1;
1231 else
1232 rcu_idle_cpu--;
1233
1234 put_online_cpus();
1235}
1236
1237/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
1238 * system to become idle at a time and cut off its timer ticks. This is meant
1239 * to test the support for such tickless idle CPU in RCU.
1240 */
1241static int
1242rcu_torture_shuffle(void *arg)
1243{
1244 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started");
1245 do {
1246 schedule_timeout_interruptible(shuffle_interval * HZ);
1247 rcu_torture_shuffle_tasks();
1248 rcutorture_shutdown_absorb("rcu_torture_shuffle");
1249 } while (!kthread_should_stop());
1250 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
1251 return 0;
1252}
1253
1254/* Cause the rcutorture test to "stutter", starting and stopping all
1255 * threads periodically.
1256 */
1257static int
1258rcu_torture_stutter(void *arg)
1259{
1260 VERBOSE_PRINTK_STRING("rcu_torture_stutter task started");
1261 do {
1262 schedule_timeout_interruptible(stutter * HZ);
1263 stutter_pause_test = 1;
1264 if (!kthread_should_stop())
1265 schedule_timeout_interruptible(stutter * HZ);
1266 stutter_pause_test = 0;
1267 rcutorture_shutdown_absorb("rcu_torture_stutter");
1268 } while (!kthread_should_stop());
1269 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
1270 return 0; 1040 return 0;
1271} 1041}
1272 1042
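The stutter machinery (stutter_pause_test, rcu_stutter_wait(), and the controller kthread removed above) likewise moves behind stutter_wait() and torture_stutter_init(). A hedged sketch of the handshake, reconstructed from the removed code; the runnable-module-parameter check is omitted here for brevity, and the real helpers live in kernel/torture.c.

#include <linux/kthread.h>
#include <linux/sched.h>

static int stutter_pause_test;	/* Nonzero while worker kthreads should idle. */
static int stutter_gap;		/* Jiffies to run, then to pause; set by torture_stutter_init(). */

/* Each torture kthread calls this once per loop pass and parks here while paused. */
static void stutter_wait(const char *title)
{
	while (ACCESS_ONCE(stutter_pause_test)) {
		schedule_timeout_interruptible(1);
		torture_shutdown_absorb(title);
	}
}

/* Controller kthread: let the workers run, then pause them, alternating. */
static int torture_stutter(void *arg)
{
	do {
		schedule_timeout_interruptible(stutter_gap);
		stutter_pause_test = 1;
		if (!torture_must_stop())
			schedule_timeout_interruptible(stutter_gap);
		stutter_pause_test = 0;
		torture_shutdown_absorb("torture_stutter");
	} while (!torture_must_stop());
	torture_kthread_stopping("torture_stutter");
	return 0;
}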
@@ -1293,10 +1063,6 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
1293 onoff_interval, onoff_holdoff); 1063 onoff_interval, onoff_holdoff);
1294} 1064}
1295 1065
1296static struct notifier_block rcutorture_shutdown_nb = {
1297 .notifier_call = rcutorture_shutdown_notify,
1298};
1299
1300static void rcutorture_booster_cleanup(int cpu) 1066static void rcutorture_booster_cleanup(int cpu)
1301{ 1067{
1302 struct task_struct *t; 1068 struct task_struct *t;
@@ -1304,14 +1070,12 @@ static void rcutorture_booster_cleanup(int cpu)
1304 if (boost_tasks[cpu] == NULL) 1070 if (boost_tasks[cpu] == NULL)
1305 return; 1071 return;
1306 mutex_lock(&boost_mutex); 1072 mutex_lock(&boost_mutex);
1307 VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
1308 t = boost_tasks[cpu]; 1073 t = boost_tasks[cpu];
1309 boost_tasks[cpu] = NULL; 1074 boost_tasks[cpu] = NULL;
1310 mutex_unlock(&boost_mutex); 1075 mutex_unlock(&boost_mutex);
1311 1076
1312 /* This must be outside of the mutex, otherwise deadlock! */ 1077 /* This must be outside of the mutex, otherwise deadlock! */
1313 kthread_stop(t); 1078 torture_stop_kthread(rcu_torture_boost, t);
1314 boost_tasks[cpu] = NULL;
1315} 1079}
1316 1080
1317static int rcutorture_booster_init(int cpu) 1081static int rcutorture_booster_init(int cpu)
@@ -1323,13 +1087,13 @@ static int rcutorture_booster_init(int cpu)
1323 1087
1324 /* Don't allow time recalculation while creating a new task. */ 1088 /* Don't allow time recalculation while creating a new task. */
1325 mutex_lock(&boost_mutex); 1089 mutex_lock(&boost_mutex);
1326 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); 1090 VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task");
1327 boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, 1091 boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
1328 cpu_to_node(cpu), 1092 cpu_to_node(cpu),
1329 "rcu_torture_boost"); 1093 "rcu_torture_boost");
1330 if (IS_ERR(boost_tasks[cpu])) { 1094 if (IS_ERR(boost_tasks[cpu])) {
1331 retval = PTR_ERR(boost_tasks[cpu]); 1095 retval = PTR_ERR(boost_tasks[cpu]);
1332 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); 1096 VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed");
1333 n_rcu_torture_boost_ktrerror++; 1097 n_rcu_torture_boost_ktrerror++;
1334 boost_tasks[cpu] = NULL; 1098 boost_tasks[cpu] = NULL;
1335 mutex_unlock(&boost_mutex); 1099 mutex_unlock(&boost_mutex);
@@ -1342,175 +1106,6 @@ static int rcutorture_booster_init(int cpu)
1342} 1106}
1343 1107
1344/* 1108/*
1345 * Cause the rcutorture test to shutdown the system after the test has
1346 * run for the time specified by the shutdown_secs module parameter.
1347 */
1348static int
1349rcu_torture_shutdown(void *arg)
1350{
1351 long delta;
1352 unsigned long jiffies_snap;
1353
1354 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started");
1355 jiffies_snap = ACCESS_ONCE(jiffies);
1356 while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
1357 !kthread_should_stop()) {
1358 delta = shutdown_time - jiffies_snap;
1359 if (verbose)
1360 pr_alert("%s" TORTURE_FLAG
1361 "rcu_torture_shutdown task: %lu jiffies remaining\n",
1362 torture_type, delta);
1363 schedule_timeout_interruptible(delta);
1364 jiffies_snap = ACCESS_ONCE(jiffies);
1365 }
1366 if (kthread_should_stop()) {
1367 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping");
1368 return 0;
1369 }
1370
1371 /* OK, shut down the system. */
1372
1373 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system");
1374 shutdown_task = NULL; /* Avoid self-kill deadlock. */
1375 rcu_torture_cleanup(); /* Get the success/failure message. */
1376 kernel_power_off(); /* Shut down the system. */
1377 return 0;
1378}
1379
1380#ifdef CONFIG_HOTPLUG_CPU
1381
1382/*
1383 * Execute random CPU-hotplug operations at the interval specified
1384 * by the onoff_interval.
1385 */
1386static int
1387rcu_torture_onoff(void *arg)
1388{
1389 int cpu;
1390 unsigned long delta;
1391 int maxcpu = -1;
1392 DEFINE_RCU_RANDOM(rand);
1393 int ret;
1394 unsigned long starttime;
1395
1396 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
1397 for_each_online_cpu(cpu)
1398 maxcpu = cpu;
1399 WARN_ON(maxcpu < 0);
1400 if (onoff_holdoff > 0) {
1401 VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff");
1402 schedule_timeout_interruptible(onoff_holdoff * HZ);
1403 VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff");
1404 }
1405 while (!kthread_should_stop()) {
1406 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
1407 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
1408 if (verbose)
1409 pr_alert("%s" TORTURE_FLAG
1410 "rcu_torture_onoff task: offlining %d\n",
1411 torture_type, cpu);
1412 starttime = jiffies;
1413 n_offline_attempts++;
1414 ret = cpu_down(cpu);
1415 if (ret) {
1416 if (verbose)
1417 pr_alert("%s" TORTURE_FLAG
1418 "rcu_torture_onoff task: offline %d failed: errno %d\n",
1419 torture_type, cpu, ret);
1420 } else {
1421 if (verbose)
1422 pr_alert("%s" TORTURE_FLAG
1423 "rcu_torture_onoff task: offlined %d\n",
1424 torture_type, cpu);
1425 n_offline_successes++;
1426 delta = jiffies - starttime;
1427 sum_offline += delta;
1428 if (min_offline < 0) {
1429 min_offline = delta;
1430 max_offline = delta;
1431 }
1432 if (min_offline > delta)
1433 min_offline = delta;
1434 if (max_offline < delta)
1435 max_offline = delta;
1436 }
1437 } else if (cpu_is_hotpluggable(cpu)) {
1438 if (verbose)
1439 pr_alert("%s" TORTURE_FLAG
1440 "rcu_torture_onoff task: onlining %d\n",
1441 torture_type, cpu);
1442 starttime = jiffies;
1443 n_online_attempts++;
1444 ret = cpu_up(cpu);
1445 if (ret) {
1446 if (verbose)
1447 pr_alert("%s" TORTURE_FLAG
1448 "rcu_torture_onoff task: online %d failed: errno %d\n",
1449 torture_type, cpu, ret);
1450 } else {
1451 if (verbose)
1452 pr_alert("%s" TORTURE_FLAG
1453 "rcu_torture_onoff task: onlined %d\n",
1454 torture_type, cpu);
1455 n_online_successes++;
1456 delta = jiffies - starttime;
1457 sum_online += delta;
1458 if (min_online < 0) {
1459 min_online = delta;
1460 max_online = delta;
1461 }
1462 if (min_online > delta)
1463 min_online = delta;
1464 if (max_online < delta)
1465 max_online = delta;
1466 }
1467 }
1468 schedule_timeout_interruptible(onoff_interval * HZ);
1469 }
1470 VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping");
1471 return 0;
1472}
1473
1474static int
1475rcu_torture_onoff_init(void)
1476{
1477 int ret;
1478
1479 if (onoff_interval <= 0)
1480 return 0;
1481 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
1482 if (IS_ERR(onoff_task)) {
1483 ret = PTR_ERR(onoff_task);
1484 onoff_task = NULL;
1485 return ret;
1486 }
1487 return 0;
1488}
1489
1490static void rcu_torture_onoff_cleanup(void)
1491{
1492 if (onoff_task == NULL)
1493 return;
1494 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
1495 kthread_stop(onoff_task);
1496 onoff_task = NULL;
1497}
1498
1499#else /* #ifdef CONFIG_HOTPLUG_CPU */
1500
1501static int
1502rcu_torture_onoff_init(void)
1503{
1504 return 0;
1505}
1506
1507static void rcu_torture_onoff_cleanup(void)
1508{
1509}
1510
1511#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1512
1513/*
1514 * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then 1109 * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
1515 * induces a CPU stall for the time specified by stall_cpu. 1110 * induces a CPU stall for the time specified by stall_cpu.
1516 */ 1111 */
@@ -1518,11 +1113,11 @@ static int rcu_torture_stall(void *args)
1518{ 1113{
1519 unsigned long stop_at; 1114 unsigned long stop_at;
1520 1115
1521 VERBOSE_PRINTK_STRING("rcu_torture_stall task started"); 1116 VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
1522 if (stall_cpu_holdoff > 0) { 1117 if (stall_cpu_holdoff > 0) {
1523 VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff"); 1118 VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff");
1524 schedule_timeout_interruptible(stall_cpu_holdoff * HZ); 1119 schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
1525 VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff"); 1120 VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff");
1526 } 1121 }
1527 if (!kthread_should_stop()) { 1122 if (!kthread_should_stop()) {
1528 stop_at = get_seconds() + stall_cpu; 1123 stop_at = get_seconds() + stall_cpu;
@@ -1536,7 +1131,7 @@ static int rcu_torture_stall(void *args)
1536 rcu_read_unlock(); 1131 rcu_read_unlock();
1537 pr_alert("rcu_torture_stall end.\n"); 1132 pr_alert("rcu_torture_stall end.\n");
1538 } 1133 }
1539 rcutorture_shutdown_absorb("rcu_torture_stall"); 1134 torture_shutdown_absorb("rcu_torture_stall");
1540 while (!kthread_should_stop()) 1135 while (!kthread_should_stop())
1541 schedule_timeout_interruptible(10 * HZ); 1136 schedule_timeout_interruptible(10 * HZ);
1542 return 0; 1137 return 0;
@@ -1545,27 +1140,9 @@ static int rcu_torture_stall(void *args)
1545/* Spawn CPU-stall kthread, if stall_cpu specified. */ 1140/* Spawn CPU-stall kthread, if stall_cpu specified. */
1546static int __init rcu_torture_stall_init(void) 1141static int __init rcu_torture_stall_init(void)
1547{ 1142{
1548 int ret;
1549
1550 if (stall_cpu <= 0) 1143 if (stall_cpu <= 0)
1551 return 0; 1144 return 0;
1552 stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); 1145 return torture_create_kthread(rcu_torture_stall, NULL, stall_task);
1553 if (IS_ERR(stall_task)) {
1554 ret = PTR_ERR(stall_task);
1555 stall_task = NULL;
1556 return ret;
1557 }
1558 return 0;
1559}
1560
1561/* Clean up after the CPU-stall kthread, if one was spawned. */
1562static void rcu_torture_stall_cleanup(void)
1563{
1564 if (stall_task == NULL)
1565 return;
1566 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
1567 kthread_stop(stall_task);
1568 stall_task = NULL;
1569} 1146}
1570 1147
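As in the stall hunk above, the open-coded kthread_run()/IS_ERR()/kthread_stop() boilerplate collapses into torture_create_kthread() and torture_stop_kthread() everywhere in this patch. A rough sketch of what those helpers plausibly wrap, reconstructed from the removed code; the real versions ship in the new kernel/torture.c and its header, and also print per-thread progress messages.

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/torture.h>	/* New header added alongside kernel/torture.c. */

/*
 * Create a torture kthread named after its function; on failure report it,
 * NULL out the task pointer, and return the error so callers can "goto unwind".
 * Written as a macro so the thread name can be stringified.
 */
#define torture_create_kthread(fn, arg, tp)				\
({									\
	int __ret = 0;							\
									\
	(tp) = kthread_run(fn, arg, #fn);				\
	if (IS_ERR(tp)) {						\
		VERBOSE_TOROUT_ERRSTRING("Failed to create " #fn);	\
		__ret = PTR_ERR(tp);					\
		(tp) = NULL;						\
	}								\
	__ret;								\
})

/* Stop a torture kthread if it was created, then forget it. */
#define torture_stop_kthread(name, tp)					\
do {									\
	if (tp) {							\
		VERBOSE_TOROUT_STRING("Stopping " #name " task");	\
		kthread_stop(tp);					\
		(tp) = NULL;						\
	}								\
} while (0)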
1571/* Callback function for RCU barrier testing. */ 1148/* Callback function for RCU barrier testing. */
@@ -1583,28 +1160,24 @@ static int rcu_torture_barrier_cbs(void *arg)
1583 struct rcu_head rcu; 1160 struct rcu_head rcu;
1584 1161
1585 init_rcu_head_on_stack(&rcu); 1162 init_rcu_head_on_stack(&rcu);
1586 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); 1163 VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task started");
1587 set_user_nice(current, 19); 1164 set_user_nice(current, MAX_NICE);
1588 do { 1165 do {
1589 wait_event(barrier_cbs_wq[myid], 1166 wait_event(barrier_cbs_wq[myid],
1590 (newphase = 1167 (newphase =
1591 ACCESS_ONCE(barrier_phase)) != lastphase || 1168 ACCESS_ONCE(barrier_phase)) != lastphase ||
1592 kthread_should_stop() || 1169 torture_must_stop());
1593 fullstop != FULLSTOP_DONTSTOP);
1594 lastphase = newphase; 1170 lastphase = newphase;
1595 smp_mb(); /* ensure barrier_phase load before ->call(). */ 1171 smp_mb(); /* ensure barrier_phase load before ->call(). */
1596 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) 1172 if (torture_must_stop())
1597 break; 1173 break;
1598 cur_ops->call(&rcu, rcu_torture_barrier_cbf); 1174 cur_ops->call(&rcu, rcu_torture_barrier_cbf);
1599 if (atomic_dec_and_test(&barrier_cbs_count)) 1175 if (atomic_dec_and_test(&barrier_cbs_count))
1600 wake_up(&barrier_wq); 1176 wake_up(&barrier_wq);
1601 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 1177 } while (!torture_must_stop());
1602 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping");
1603 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
1604 while (!kthread_should_stop())
1605 schedule_timeout_interruptible(1);
1606 cur_ops->cb_barrier(); 1178 cur_ops->cb_barrier();
1607 destroy_rcu_head_on_stack(&rcu); 1179 destroy_rcu_head_on_stack(&rcu);
1180 torture_kthread_stopping("rcu_torture_barrier_cbs");
1608 return 0; 1181 return 0;
1609} 1182}
1610 1183
@@ -1613,7 +1186,7 @@ static int rcu_torture_barrier(void *arg)
1613{ 1186{
1614 int i; 1187 int i;
1615 1188
1616 VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); 1189 VERBOSE_TOROUT_STRING("rcu_torture_barrier task starting");
1617 do { 1190 do {
1618 atomic_set(&barrier_cbs_invoked, 0); 1191 atomic_set(&barrier_cbs_invoked, 0);
1619 atomic_set(&barrier_cbs_count, n_barrier_cbs); 1192 atomic_set(&barrier_cbs_count, n_barrier_cbs);
@@ -1623,9 +1196,8 @@ static int rcu_torture_barrier(void *arg)
1623 wake_up(&barrier_cbs_wq[i]); 1196 wake_up(&barrier_cbs_wq[i]);
1624 wait_event(barrier_wq, 1197 wait_event(barrier_wq,
1625 atomic_read(&barrier_cbs_count) == 0 || 1198 atomic_read(&barrier_cbs_count) == 0 ||
1626 kthread_should_stop() || 1199 torture_must_stop());
1627 fullstop != FULLSTOP_DONTSTOP); 1200 if (torture_must_stop())
1628 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1629 break; 1201 break;
1630 n_barrier_attempts++; 1202 n_barrier_attempts++;
1631 cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */ 1203 cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
@@ -1635,11 +1207,8 @@ static int rcu_torture_barrier(void *arg)
1635 } 1207 }
1636 n_barrier_successes++; 1208 n_barrier_successes++;
1637 schedule_timeout_interruptible(HZ / 10); 1209 schedule_timeout_interruptible(HZ / 10);
1638 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 1210 } while (!torture_must_stop());
1639 VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); 1211 torture_kthread_stopping("rcu_torture_barrier");
1640 rcutorture_shutdown_absorb("rcu_torture_barrier");
1641 while (!kthread_should_stop())
1642 schedule_timeout_interruptible(1);
1643 return 0; 1212 return 0;
1644} 1213}
1645 1214
@@ -1672,24 +1241,13 @@ static int rcu_torture_barrier_init(void)
1672 return -ENOMEM; 1241 return -ENOMEM;
1673 for (i = 0; i < n_barrier_cbs; i++) { 1242 for (i = 0; i < n_barrier_cbs; i++) {
1674 init_waitqueue_head(&barrier_cbs_wq[i]); 1243 init_waitqueue_head(&barrier_cbs_wq[i]);
1675 barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, 1244 ret = torture_create_kthread(rcu_torture_barrier_cbs,
1676 (void *)(long)i, 1245 (void *)(long)i,
1677 "rcu_torture_barrier_cbs"); 1246 barrier_cbs_tasks[i]);
1678 if (IS_ERR(barrier_cbs_tasks[i])) { 1247 if (ret)
1679 ret = PTR_ERR(barrier_cbs_tasks[i]);
1680 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs");
1681 barrier_cbs_tasks[i] = NULL;
1682 return ret; 1248 return ret;
1683 }
1684 } 1249 }
1685 barrier_task = kthread_run(rcu_torture_barrier, NULL, 1250 return torture_create_kthread(rcu_torture_barrier, NULL, barrier_task);
1686 "rcu_torture_barrier");
1687 if (IS_ERR(barrier_task)) {
1688 ret = PTR_ERR(barrier_task);
1689 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier");
1690 barrier_task = NULL;
1691 }
1692 return 0;
1693} 1251}
1694 1252
1695/* Clean up after RCU barrier testing. */ 1253/* Clean up after RCU barrier testing. */
@@ -1697,19 +1255,11 @@ static void rcu_torture_barrier_cleanup(void)
1697{ 1255{
1698 int i; 1256 int i;
1699 1257
1700 if (barrier_task != NULL) { 1258 torture_stop_kthread(rcu_torture_barrier, barrier_task);
1701 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task");
1702 kthread_stop(barrier_task);
1703 barrier_task = NULL;
1704 }
1705 if (barrier_cbs_tasks != NULL) { 1259 if (barrier_cbs_tasks != NULL) {
1706 for (i = 0; i < n_barrier_cbs; i++) { 1260 for (i = 0; i < n_barrier_cbs; i++)
1707 if (barrier_cbs_tasks[i] != NULL) { 1261 torture_stop_kthread(rcu_torture_barrier_cbs,
1708 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); 1262 barrier_cbs_tasks[i]);
1709 kthread_stop(barrier_cbs_tasks[i]);
1710 barrier_cbs_tasks[i] = NULL;
1711 }
1712 }
1713 kfree(barrier_cbs_tasks); 1263 kfree(barrier_cbs_tasks);
1714 barrier_cbs_tasks = NULL; 1264 barrier_cbs_tasks = NULL;
1715 } 1265 }
@@ -1747,90 +1297,42 @@ rcu_torture_cleanup(void)
1747{ 1297{
1748 int i; 1298 int i;
1749 1299
1750 mutex_lock(&fullstop_mutex);
1751 rcutorture_record_test_transition(); 1300 rcutorture_record_test_transition();
1752 if (fullstop == FULLSTOP_SHUTDOWN) { 1301 if (torture_cleanup()) {
1753 pr_warn(/* but going down anyway, so... */
1754 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
1755 mutex_unlock(&fullstop_mutex);
1756 schedule_timeout_uninterruptible(10);
1757 if (cur_ops->cb_barrier != NULL) 1302 if (cur_ops->cb_barrier != NULL)
1758 cur_ops->cb_barrier(); 1303 cur_ops->cb_barrier();
1759 return; 1304 return;
1760 } 1305 }
1761 fullstop = FULLSTOP_RMMOD;
1762 mutex_unlock(&fullstop_mutex);
1763 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1764 rcu_torture_barrier_cleanup();
1765 rcu_torture_stall_cleanup();
1766 if (stutter_task) {
1767 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1768 kthread_stop(stutter_task);
1769 }
1770 stutter_task = NULL;
1771 if (shuffler_task) {
1772 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
1773 kthread_stop(shuffler_task);
1774 free_cpumask_var(shuffle_tmp_mask);
1775 }
1776 shuffler_task = NULL;
1777 1306
1778 if (writer_task) { 1307 rcu_torture_barrier_cleanup();
1779 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); 1308 torture_stop_kthread(rcu_torture_stall, stall_task);
1780 kthread_stop(writer_task); 1309 torture_stop_kthread(rcu_torture_writer, writer_task);
1781 }
1782 writer_task = NULL;
1783 1310
1784 if (reader_tasks) { 1311 if (reader_tasks) {
1785 for (i = 0; i < nrealreaders; i++) { 1312 for (i = 0; i < nrealreaders; i++)
1786 if (reader_tasks[i]) { 1313 torture_stop_kthread(rcu_torture_reader,
1787 VERBOSE_PRINTK_STRING( 1314 reader_tasks[i]);
1788 "Stopping rcu_torture_reader task");
1789 kthread_stop(reader_tasks[i]);
1790 }
1791 reader_tasks[i] = NULL;
1792 }
1793 kfree(reader_tasks); 1315 kfree(reader_tasks);
1794 reader_tasks = NULL;
1795 } 1316 }
1796 rcu_torture_current = NULL; 1317 rcu_torture_current = NULL;
1797 1318
1798 if (fakewriter_tasks) { 1319 if (fakewriter_tasks) {
1799 for (i = 0; i < nfakewriters; i++) { 1320 for (i = 0; i < nfakewriters; i++) {
1800 if (fakewriter_tasks[i]) { 1321 torture_stop_kthread(rcu_torture_fakewriter,
1801 VERBOSE_PRINTK_STRING( 1322 fakewriter_tasks[i]);
1802 "Stopping rcu_torture_fakewriter task");
1803 kthread_stop(fakewriter_tasks[i]);
1804 }
1805 fakewriter_tasks[i] = NULL;
1806 } 1323 }
1807 kfree(fakewriter_tasks); 1324 kfree(fakewriter_tasks);
1808 fakewriter_tasks = NULL; 1325 fakewriter_tasks = NULL;
1809 } 1326 }
1810 1327
1811 if (stats_task) { 1328 torture_stop_kthread(rcu_torture_stats, stats_task);
1812 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); 1329 torture_stop_kthread(rcu_torture_fqs, fqs_task);
1813 kthread_stop(stats_task);
1814 }
1815 stats_task = NULL;
1816
1817 if (fqs_task) {
1818 VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task");
1819 kthread_stop(fqs_task);
1820 }
1821 fqs_task = NULL;
1822 if ((test_boost == 1 && cur_ops->can_boost) || 1330 if ((test_boost == 1 && cur_ops->can_boost) ||
1823 test_boost == 2) { 1331 test_boost == 2) {
1824 unregister_cpu_notifier(&rcutorture_cpu_nb); 1332 unregister_cpu_notifier(&rcutorture_cpu_nb);
1825 for_each_possible_cpu(i) 1333 for_each_possible_cpu(i)
1826 rcutorture_booster_cleanup(i); 1334 rcutorture_booster_cleanup(i);
1827 } 1335 }
1828 if (shutdown_task != NULL) {
1829 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
1830 kthread_stop(shutdown_task);
1831 }
1832 shutdown_task = NULL;
1833 rcu_torture_onoff_cleanup();
1834 1336
1835 /* Wait for all RCU callbacks to fire. */ 1337 /* Wait for all RCU callbacks to fire. */
1836 1338
@@ -1841,8 +1343,7 @@ rcu_torture_cleanup(void)
1841 1343
1842 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) 1344 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
1843 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1345 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1844 else if (n_online_successes != n_online_attempts || 1346 else if (torture_onoff_failures())
1845 n_offline_successes != n_offline_attempts)
1846 rcu_torture_print_module_parms(cur_ops, 1347 rcu_torture_print_module_parms(cur_ops,
1847 "End of test: RCU_HOTPLUG"); 1348 "End of test: RCU_HOTPLUG");
1848 else 1349 else
@@ -1911,12 +1412,11 @@ rcu_torture_init(void)
1911 int i; 1412 int i;
1912 int cpu; 1413 int cpu;
1913 int firsterr = 0; 1414 int firsterr = 0;
1914 int retval;
1915 static struct rcu_torture_ops *torture_ops[] = { 1415 static struct rcu_torture_ops *torture_ops[] = {
1916 &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, 1416 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
1917 }; 1417 };
1918 1418
1919 mutex_lock(&fullstop_mutex); 1419 torture_init_begin(torture_type, verbose, &rcutorture_runnable);
1920 1420
1921 /* Process args and tell the world that the torturer is on the job. */ 1421 /* Process args and tell the world that the torturer is on the job. */
1922 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 1422 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
@@ -1931,7 +1431,7 @@ rcu_torture_init(void)
1931 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) 1431 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
1932 pr_alert(" %s", torture_ops[i]->name); 1432 pr_alert(" %s", torture_ops[i]->name);
1933 pr_alert("\n"); 1433 pr_alert("\n");
1934 mutex_unlock(&fullstop_mutex); 1434 torture_init_end();
1935 return -EINVAL; 1435 return -EINVAL;
1936 } 1436 }
1937 if (cur_ops->fqs == NULL && fqs_duration != 0) { 1437 if (cur_ops->fqs == NULL && fqs_duration != 0) {
@@ -1946,7 +1446,6 @@ rcu_torture_init(void)
1946 else 1446 else
1947 nrealreaders = 2 * num_online_cpus(); 1447 nrealreaders = 2 * num_online_cpus();
1948 rcu_torture_print_module_parms(cur_ops, "Start of test"); 1448 rcu_torture_print_module_parms(cur_ops, "Start of test");
1949 fullstop = FULLSTOP_DONTSTOP;
1950 1449
1951 /* Set up the freelist. */ 1450 /* Set up the freelist. */
1952 1451
@@ -1982,108 +1481,61 @@ rcu_torture_init(void)
1982 1481
1983 /* Start up the kthreads. */ 1482 /* Start up the kthreads. */
1984 1483
1985 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); 1484 firsterr = torture_create_kthread(rcu_torture_writer, NULL,
1986 writer_task = kthread_create(rcu_torture_writer, NULL, 1485 writer_task);
1987 "rcu_torture_writer"); 1486 if (firsterr)
1988 if (IS_ERR(writer_task)) {
1989 firsterr = PTR_ERR(writer_task);
1990 VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
1991 writer_task = NULL;
1992 goto unwind; 1487 goto unwind;
1993 }
1994 wake_up_process(writer_task);
1995 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), 1488 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
1996 GFP_KERNEL); 1489 GFP_KERNEL);
1997 if (fakewriter_tasks == NULL) { 1490 if (fakewriter_tasks == NULL) {
1998 VERBOSE_PRINTK_ERRSTRING("out of memory"); 1491 VERBOSE_TOROUT_ERRSTRING("out of memory");
1999 firsterr = -ENOMEM; 1492 firsterr = -ENOMEM;
2000 goto unwind; 1493 goto unwind;
2001 } 1494 }
2002 for (i = 0; i < nfakewriters; i++) { 1495 for (i = 0; i < nfakewriters; i++) {
2003 VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); 1496 firsterr = torture_create_kthread(rcu_torture_fakewriter,
2004 fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, 1497 NULL, fakewriter_tasks[i]);
2005 "rcu_torture_fakewriter"); 1498 if (firsterr)
2006 if (IS_ERR(fakewriter_tasks[i])) {
2007 firsterr = PTR_ERR(fakewriter_tasks[i]);
2008 VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
2009 fakewriter_tasks[i] = NULL;
2010 goto unwind; 1499 goto unwind;
2011 }
2012 } 1500 }
2013 reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), 1501 reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
2014 GFP_KERNEL); 1502 GFP_KERNEL);
2015 if (reader_tasks == NULL) { 1503 if (reader_tasks == NULL) {
2016 VERBOSE_PRINTK_ERRSTRING("out of memory"); 1504 VERBOSE_TOROUT_ERRSTRING("out of memory");
2017 firsterr = -ENOMEM; 1505 firsterr = -ENOMEM;
2018 goto unwind; 1506 goto unwind;
2019 } 1507 }
2020 for (i = 0; i < nrealreaders; i++) { 1508 for (i = 0; i < nrealreaders; i++) {
2021 VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); 1509 firsterr = torture_create_kthread(rcu_torture_reader, NULL,
2022 reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, 1510 reader_tasks[i]);
2023 "rcu_torture_reader"); 1511 if (firsterr)
2024 if (IS_ERR(reader_tasks[i])) {
2025 firsterr = PTR_ERR(reader_tasks[i]);
2026 VERBOSE_PRINTK_ERRSTRING("Failed to create reader");
2027 reader_tasks[i] = NULL;
2028 goto unwind; 1512 goto unwind;
2029 }
2030 } 1513 }
2031 if (stat_interval > 0) { 1514 if (stat_interval > 0) {
2032 VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); 1515 firsterr = torture_create_kthread(rcu_torture_stats, NULL,
2033 stats_task = kthread_run(rcu_torture_stats, NULL, 1516 stats_task);
2034 "rcu_torture_stats"); 1517 if (firsterr)
2035 if (IS_ERR(stats_task)) {
2036 firsterr = PTR_ERR(stats_task);
2037 VERBOSE_PRINTK_ERRSTRING("Failed to create stats");
2038 stats_task = NULL;
2039 goto unwind; 1518 goto unwind;
2040 }
2041 } 1519 }
2042 if (test_no_idle_hz) { 1520 if (test_no_idle_hz) {
2043 rcu_idle_cpu = num_online_cpus() - 1; 1521 firsterr = torture_shuffle_init(shuffle_interval * HZ);
2044 1522 if (firsterr)
2045 if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
2046 firsterr = -ENOMEM;
2047 VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask");
2048 goto unwind;
2049 }
2050
2051 /* Create the shuffler thread */
2052 shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
2053 "rcu_torture_shuffle");
2054 if (IS_ERR(shuffler_task)) {
2055 free_cpumask_var(shuffle_tmp_mask);
2056 firsterr = PTR_ERR(shuffler_task);
2057 VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
2058 shuffler_task = NULL;
2059 goto unwind; 1523 goto unwind;
2060 }
2061 } 1524 }
2062 if (stutter < 0) 1525 if (stutter < 0)
2063 stutter = 0; 1526 stutter = 0;
2064 if (stutter) { 1527 if (stutter) {
2065 /* Create the stutter thread */ 1528 firsterr = torture_stutter_init(stutter * HZ);
2066 stutter_task = kthread_run(rcu_torture_stutter, NULL, 1529 if (firsterr)
2067 "rcu_torture_stutter");
2068 if (IS_ERR(stutter_task)) {
2069 firsterr = PTR_ERR(stutter_task);
2070 VERBOSE_PRINTK_ERRSTRING("Failed to create stutter");
2071 stutter_task = NULL;
2072 goto unwind; 1530 goto unwind;
2073 }
2074 } 1531 }
2075 if (fqs_duration < 0) 1532 if (fqs_duration < 0)
2076 fqs_duration = 0; 1533 fqs_duration = 0;
2077 if (fqs_duration) { 1534 if (fqs_duration) {
2078 /* Create the stutter thread */ 1535 /* Create the fqs thread */
 2079 fqs_task = kthread_run(rcu_torture_fqs, NULL, 1536 firsterr = torture_create_kthread(rcu_torture_fqs, NULL, fqs_task);
2080 "rcu_torture_fqs"); 1537 if (firsterr)
2081 if (IS_ERR(fqs_task)) {
2082 firsterr = PTR_ERR(fqs_task);
2083 VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
2084 fqs_task = NULL;
2085 goto unwind; 1538 goto unwind;
2086 }
2087 } 1539 }
2088 if (test_boost_interval < 1) 1540 if (test_boost_interval < 1)
2089 test_boost_interval = 1; 1541 test_boost_interval = 1;
@@ -2097,49 +1549,31 @@ rcu_torture_init(void)
2097 for_each_possible_cpu(i) { 1549 for_each_possible_cpu(i) {
2098 if (cpu_is_offline(i)) 1550 if (cpu_is_offline(i))
2099 continue; /* Heuristic: CPU can go offline. */ 1551 continue; /* Heuristic: CPU can go offline. */
2100 retval = rcutorture_booster_init(i); 1552 firsterr = rcutorture_booster_init(i);
2101 if (retval < 0) { 1553 if (firsterr)
2102 firsterr = retval;
2103 goto unwind; 1554 goto unwind;
2104 }
2105 } 1555 }
2106 } 1556 }
2107 if (shutdown_secs > 0) { 1557 firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
2108 shutdown_time = jiffies + shutdown_secs * HZ; 1558 if (firsterr)
2109 shutdown_task = kthread_create(rcu_torture_shutdown, NULL,
2110 "rcu_torture_shutdown");
2111 if (IS_ERR(shutdown_task)) {
2112 firsterr = PTR_ERR(shutdown_task);
2113 VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
2114 shutdown_task = NULL;
2115 goto unwind;
2116 }
2117 wake_up_process(shutdown_task);
2118 }
2119 i = rcu_torture_onoff_init();
2120 if (i != 0) {
2121 firsterr = i;
2122 goto unwind; 1559 goto unwind;
2123 } 1560 firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ);
2124 register_reboot_notifier(&rcutorture_shutdown_nb); 1561 if (firsterr)
2125 i = rcu_torture_stall_init();
2126 if (i != 0) {
2127 firsterr = i;
2128 goto unwind; 1562 goto unwind;
2129 } 1563 firsterr = rcu_torture_stall_init();
2130 retval = rcu_torture_barrier_init(); 1564 if (firsterr)
2131 if (retval != 0) { 1565 goto unwind;
2132 firsterr = retval; 1566 firsterr = rcu_torture_barrier_init();
1567 if (firsterr)
2133 goto unwind; 1568 goto unwind;
2134 }
2135 if (object_debug) 1569 if (object_debug)
2136 rcu_test_debug_objects(); 1570 rcu_test_debug_objects();
2137 rcutorture_record_test_transition(); 1571 rcutorture_record_test_transition();
2138 mutex_unlock(&fullstop_mutex); 1572 torture_init_end();
2139 return 0; 1573 return 0;
2140 1574
2141unwind: 1575unwind:
2142 mutex_unlock(&fullstop_mutex); 1576 torture_init_end();
2143 rcu_torture_cleanup(); 1577 rcu_torture_cleanup();
2144 return firsterr; 1578 return firsterr;
2145} 1579}
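
The rcutorture_init() hunks above collapse the repeated kthread_run()/IS_ERR()/PTR_ERR() boilerplate into the new torture_*_init() helpers and torture_create_kthread(), which print their own diagnostics and report a single error code that the caller stores in firsterr. A minimal sketch of that create-and-record pattern, with hypothetical names (the real helpers live in kernel/torture.c and include/linux/torture.h and differ in detail):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/printk.h>

/*
 * Illustrative only: create one torture kthread, report any failure,
 * and hand back an error code suitable for "firsterr" bookkeeping.
 */
static int example_create_torture_kthread(int (*fn)(void *), void *arg,
					  const char *name,
					  struct task_struct **tp)
{
	*tp = kthread_run(fn, arg, "%s", name);
	if (IS_ERR(*tp)) {
		int err = PTR_ERR(*tp);

		pr_alert("torture: failed to create %s: %d\n", name, err);
		*tp = NULL;
		return err;
	}
	return 0;
}

Each converted call site then reduces to the shape "firsterr = helper(...); if (firsterr) goto unwind;", which is why the error paths above shrink so much.
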
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 3318d8284384..c639556f3fa0 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2006 18 * Copyright (C) IBM Corporation, 2006
19 * Copyright (C) Fujitsu, 2012 19 * Copyright (C) Fujitsu, 2012
@@ -36,8 +36,6 @@
36#include <linux/delay.h> 36#include <linux/delay.h>
37#include <linux/srcu.h> 37#include <linux/srcu.h>
38 38
39#include <trace/events/rcu.h>
40
41#include "rcu.h" 39#include "rcu.h"
42 40
43/* 41/*
@@ -398,7 +396,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
398 rcu_batch_queue(&sp->batch_queue, head); 396 rcu_batch_queue(&sp->batch_queue, head);
399 if (!sp->running) { 397 if (!sp->running) {
400 sp->running = true; 398 sp->running = true;
401 schedule_delayed_work(&sp->work, 0); 399 queue_delayed_work(system_power_efficient_wq, &sp->work, 0);
402 } 400 }
403 spin_unlock_irqrestore(&sp->queue_lock, flags); 401 spin_unlock_irqrestore(&sp->queue_lock, flags);
404} 402}
@@ -674,7 +672,8 @@ static void srcu_reschedule(struct srcu_struct *sp)
674 } 672 }
675 673
676 if (pending) 674 if (pending)
677 schedule_delayed_work(&sp->work, SRCU_INTERVAL); 675 queue_delayed_work(system_power_efficient_wq,
676 &sp->work, SRCU_INTERVAL);
678} 677}
679 678
680/* 679/*
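
The srcu.c hunks above stop using schedule_delayed_work(), which always targets system_wq, and instead queue the SRCU state machine on system_power_efficient_wq; when power-efficient workqueues are enabled (CONFIG_WQ_POWER_EFFICIENT_DEFAULT or the workqueue.power_efficient boot parameter) that workqueue is unbound, so idle CPUs need not be woken just to advance SRCU, and otherwise it behaves like system_wq. A hedged sketch of the same substitution in an arbitrary, hypothetical driver:

#include <linux/workqueue.h>

static void my_poll_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(my_poll_work, my_poll_fn);

static void my_poll_fn(struct work_struct *work)
{
	/* ... periodic, non-latency-critical housekeeping ... */

	/* Before: schedule_delayed_work(&my_poll_work, HZ); */
	queue_delayed_work(system_power_efficient_wq, &my_poll_work, HZ);
}
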
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 1254f312d024..d9efcc13008c 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright IBM Corporation, 2008 18 * Copyright IBM Corporation, 2008
19 * 19 *
@@ -37,10 +37,6 @@
37#include <linux/prefetch.h> 37#include <linux/prefetch.h>
38#include <linux/ftrace_event.h> 38#include <linux/ftrace_event.h>
39 39
40#ifdef CONFIG_RCU_TRACE
41#include <trace/events/rcu.h>
42#endif /* #else #ifdef CONFIG_RCU_TRACE */
43
44#include "rcu.h" 40#include "rcu.h"
45 41
46/* Forward declarations for tiny_plugin.h. */ 42/* Forward declarations for tiny_plugin.h. */
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 280d06cae352..431528520562 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -14,8 +14,8 @@
14 * GNU General Public License for more details. 14 * GNU General Public License for more details.
15 * 15 *
16 * You should have received a copy of the GNU General Public License 16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, you can access it online at
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 18 * http://www.gnu.org/licenses/gpl-2.0.html.
19 * 19 *
20 * Copyright (c) 2010 Linaro 20 * Copyright (c) 2010 Linaro
21 * 21 *
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b3d116cd072d..0c47e300210a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright IBM Corporation, 2008 18 * Copyright IBM Corporation, 2008
19 * 19 *
@@ -58,8 +58,6 @@
58#include <linux/suspend.h> 58#include <linux/suspend.h>
59 59
60#include "tree.h" 60#include "tree.h"
61#include <trace/events/rcu.h>
62
63#include "rcu.h" 61#include "rcu.h"
64 62
65MODULE_ALIAS("rcutree"); 63MODULE_ALIAS("rcutree");
@@ -837,7 +835,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
837 * to the next. Only do this for the primary flavor of RCU. 835 * to the next. Only do this for the primary flavor of RCU.
838 */ 836 */
839 if (rdp->rsp == rcu_state && 837 if (rdp->rsp == rcu_state &&
840 ULONG_CMP_GE(ACCESS_ONCE(jiffies), rdp->rsp->jiffies_resched)) { 838 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
841 rdp->rsp->jiffies_resched += 5; 839 rdp->rsp->jiffies_resched += 5;
842 resched_cpu(rdp->cpu); 840 resched_cpu(rdp->cpu);
843 } 841 }
@@ -847,7 +845,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
847 845
848static void record_gp_stall_check_time(struct rcu_state *rsp) 846static void record_gp_stall_check_time(struct rcu_state *rsp)
849{ 847{
850 unsigned long j = ACCESS_ONCE(jiffies); 848 unsigned long j = jiffies;
851 unsigned long j1; 849 unsigned long j1;
852 850
853 rsp->gp_start = j; 851 rsp->gp_start = j;
@@ -1005,7 +1003,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
1005 1003
1006 if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) 1004 if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
1007 return; 1005 return;
1008 j = ACCESS_ONCE(jiffies); 1006 j = jiffies;
1009 1007
1010 /* 1008 /*
1011 * Lots of memory barriers to reject false positives. 1009 * Lots of memory barriers to reject false positives.
@@ -1423,13 +1421,14 @@ static int rcu_gp_init(struct rcu_state *rsp)
1423 1421
1424 /* Advance to a new grace period and initialize state. */ 1422 /* Advance to a new grace period and initialize state. */
1425 record_gp_stall_check_time(rsp); 1423 record_gp_stall_check_time(rsp);
1426 smp_wmb(); /* Record GP times before starting GP. */ 1424 /* Record GP times before starting GP, hence smp_store_release(). */
1427 rsp->gpnum++; 1425 smp_store_release(&rsp->gpnum, rsp->gpnum + 1);
1428 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); 1426 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
1429 raw_spin_unlock_irq(&rnp->lock); 1427 raw_spin_unlock_irq(&rnp->lock);
1430 1428
1431 /* Exclude any concurrent CPU-hotplug operations. */ 1429 /* Exclude any concurrent CPU-hotplug operations. */
1432 mutex_lock(&rsp->onoff_mutex); 1430 mutex_lock(&rsp->onoff_mutex);
1431 smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */
1433 1432
1434 /* 1433 /*
1435 * Set the quiescent-state-needed bits in all the rcu_node 1434 * Set the quiescent-state-needed bits in all the rcu_node
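
The rcu_gp_init() hunk above replaces smp_wmb() plus a plain ->gpnum increment with smp_store_release(); this is the release half that pairs with the smp_load_acquire() of ->gpnum added below in get_state_synchronize_rcu(). A generic, non-RCU illustration of that pairing, using the ACCESS_ONCE() idiom seen elsewhere in this diff (single writer assumed; names are illustrative):

static unsigned long counter;	/* illustrative only */
static int shared_data;

static void writer(int v)
{
	ACCESS_ONCE(shared_data) = v;
	/* All stores above are visible before the new counter value. */
	smp_store_release(&counter, counter + 1);
}

static int reader(unsigned long *snap)
{
	/* The acquire load is ordered before the later data read. */
	*snap = smp_load_acquire(&counter);
	return ACCESS_ONCE(shared_data);
}
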
@@ -1557,10 +1556,11 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1557 } 1556 }
1558 rnp = rcu_get_root(rsp); 1557 rnp = rcu_get_root(rsp);
1559 raw_spin_lock_irq(&rnp->lock); 1558 raw_spin_lock_irq(&rnp->lock);
1560 smp_mb__after_unlock_lock(); 1559 smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */
1561 rcu_nocb_gp_set(rnp, nocb); 1560 rcu_nocb_gp_set(rnp, nocb);
1562 1561
1563 rsp->completed = rsp->gpnum; /* Declare grace period done. */ 1562 /* Declare grace period done. */
1563 ACCESS_ONCE(rsp->completed) = rsp->gpnum;
1564 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); 1564 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
1565 rsp->fqs_state = RCU_GP_IDLE; 1565 rsp->fqs_state = RCU_GP_IDLE;
1566 rdp = this_cpu_ptr(rsp->rda); 1566 rdp = this_cpu_ptr(rsp->rda);
@@ -2304,7 +2304,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
2304 if (rnp_old != NULL) 2304 if (rnp_old != NULL)
2305 raw_spin_unlock(&rnp_old->fqslock); 2305 raw_spin_unlock(&rnp_old->fqslock);
2306 if (ret) { 2306 if (ret) {
2307 rsp->n_force_qs_lh++; 2307 ACCESS_ONCE(rsp->n_force_qs_lh)++;
2308 return; 2308 return;
2309 } 2309 }
2310 rnp_old = rnp; 2310 rnp_old = rnp;
@@ -2316,7 +2316,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
2316 smp_mb__after_unlock_lock(); 2316 smp_mb__after_unlock_lock();
2317 raw_spin_unlock(&rnp_old->fqslock); 2317 raw_spin_unlock(&rnp_old->fqslock);
2318 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 2318 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
2319 rsp->n_force_qs_lh++; 2319 ACCESS_ONCE(rsp->n_force_qs_lh)++;
2320 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2320 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2321 return; /* Someone beat us to it. */ 2321 return; /* Someone beat us to it. */
2322 } 2322 }
@@ -2639,6 +2639,58 @@ void synchronize_rcu_bh(void)
2639} 2639}
2640EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 2640EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
2641 2641
2642/**
2643 * get_state_synchronize_rcu - Snapshot current RCU state
2644 *
2645 * Returns a cookie that is used by a later call to cond_synchronize_rcu()
2646 * to determine whether or not a full grace period has elapsed in the
2647 * meantime.
2648 */
2649unsigned long get_state_synchronize_rcu(void)
2650{
2651 /*
2652 * Any prior manipulation of RCU-protected data must happen
2653 * before the load from ->gpnum.
2654 */
2655 smp_mb(); /* ^^^ */
2656
2657 /*
2658 * Make sure this load happens before the purportedly
2659 * time-consuming work between get_state_synchronize_rcu()
2660 * and cond_synchronize_rcu().
2661 */
2662 return smp_load_acquire(&rcu_state->gpnum);
2663}
2664EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
2665
2666/**
2667 * cond_synchronize_rcu - Conditionally wait for an RCU grace period
2668 *
2669 * @oldstate: return value from earlier call to get_state_synchronize_rcu()
2670 *
2671 * If a full RCU grace period has elapsed since the earlier call to
2672 * get_state_synchronize_rcu(), just return. Otherwise, invoke
2673 * synchronize_rcu() to wait for a full grace period.
2674 *
2675 * Yes, this function does not take counter wrap into account. But
2676 * counter wrap is harmless. If the counter wraps, we have waited for
2677 * more than 2 billion grace periods (and way more on a 64-bit system!),
2678 * so waiting for one additional grace period should be just fine.
2679 */
2680void cond_synchronize_rcu(unsigned long oldstate)
2681{
2682 unsigned long newstate;
2683
2684 /*
2685 * Ensure that this load happens before any RCU-destructive
2686 * actions the caller might carry out after we return.
2687 */
2688 newstate = smp_load_acquire(&rcu_state->completed);
2689 if (ULONG_CMP_GE(oldstate, newstate))
2690 synchronize_rcu();
2691}
2692EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
2693
2642static int synchronize_sched_expedited_cpu_stop(void *data) 2694static int synchronize_sched_expedited_cpu_stop(void *data)
2643{ 2695{
2644 /* 2696 /*
@@ -2880,7 +2932,7 @@ static int rcu_pending(int cpu)
2880 * non-NULL, store an indication of whether all callbacks are lazy. 2932 * non-NULL, store an indication of whether all callbacks are lazy.
2881 * (If there are no callbacks, all of them are deemed to be lazy.) 2933 * (If there are no callbacks, all of them are deemed to be lazy.)
2882 */ 2934 */
2883static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) 2935static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2884{ 2936{
2885 bool al = true; 2937 bool al = true;
2886 bool hc = false; 2938 bool hc = false;
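
The kernel-doc above introduces a grace-period piggybacking API: an updater snapshots RCU state with get_state_synchronize_rcu(), does its slow, unrelated work, and then calls cond_synchronize_rcu(), which returns immediately if a full grace period already elapsed and only otherwise blocks in synchronize_rcu(). A usage sketch; the structure and field names are hypothetical:

struct foo {
	int data;
	unsigned long rcu_cookie;	/* hypothetical field */
};

static void foo_update_prepare(struct foo *fp)
{
	/* Snapshot grace-period state before the slow part. */
	fp->rcu_cookie = get_state_synchronize_rcu();
	/* ... lengthy processing that does not touch RCU readers ... */
}

static void foo_update_finish(struct foo *fp)
{
	/*
	 * Blocks only if no full grace period has elapsed since
	 * foo_update_prepare(); otherwise returns immediately.
	 */
	cond_synchronize_rcu(fp->rcu_cookie);
	/* Old readers are now guaranteed to be done. */
}
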
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8c19873f1ac9..75dc3c39a02a 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -13,8 +13,8 @@
13 * GNU General Public License for more details. 13 * GNU General Public License for more details.
14 * 14 *
15 * You should have received a copy of the GNU General Public License 15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software 16 * along with this program; if not, you can access it online at
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 17 * http://www.gnu.org/licenses/gpl-2.0.html.
18 * 18 *
19 * Copyright IBM Corporation, 2008 19 * Copyright IBM Corporation, 2008
20 * 20 *
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 6e2ef4b2b920..962d1d589929 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -14,8 +14,8 @@
14 * GNU General Public License for more details. 14 * GNU General Public License for more details.
15 * 15 *
16 * You should have received a copy of the GNU General Public License 16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, you can access it online at
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 18 * http://www.gnu.org/licenses/gpl-2.0.html.
19 * 19 *
20 * Copyright Red Hat, 2009 20 * Copyright Red Hat, 2009
21 * Copyright IBM Corporation, 2009 21 * Copyright IBM Corporation, 2009
@@ -1586,11 +1586,13 @@ static void rcu_prepare_kthreads(int cpu)
 1586 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs 1586 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
1587 * any flavor of RCU. 1587 * any flavor of RCU.
1588 */ 1588 */
1589#ifndef CONFIG_RCU_NOCB_CPU_ALL
1589int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) 1590int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1590{ 1591{
1591 *delta_jiffies = ULONG_MAX; 1592 *delta_jiffies = ULONG_MAX;
1592 return rcu_cpu_has_callbacks(cpu, NULL); 1593 return rcu_cpu_has_callbacks(cpu, NULL);
1593} 1594}
1595#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1594 1596
1595/* 1597/*
1596 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up 1598 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
@@ -1656,7 +1658,7 @@ extern int tick_nohz_active;
1656 * only if it has been awhile since the last time we did so. Afterwards, 1658 * only if it has been awhile since the last time we did so. Afterwards,
1657 * if there are any callbacks ready for immediate invocation, return true. 1659 * if there are any callbacks ready for immediate invocation, return true.
1658 */ 1660 */
1659static bool rcu_try_advance_all_cbs(void) 1661static bool __maybe_unused rcu_try_advance_all_cbs(void)
1660{ 1662{
1661 bool cbs_ready = false; 1663 bool cbs_ready = false;
1662 struct rcu_data *rdp; 1664 struct rcu_data *rdp;
@@ -1696,6 +1698,7 @@ static bool rcu_try_advance_all_cbs(void)
1696 * 1698 *
1697 * The caller must have disabled interrupts. 1699 * The caller must have disabled interrupts.
1698 */ 1700 */
1701#ifndef CONFIG_RCU_NOCB_CPU_ALL
1699int rcu_needs_cpu(int cpu, unsigned long *dj) 1702int rcu_needs_cpu(int cpu, unsigned long *dj)
1700{ 1703{
1701 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1704 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
@@ -1726,6 +1729,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
1726 } 1729 }
1727 return 0; 1730 return 0;
1728} 1731}
1732#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1729 1733
1730/* 1734/*
1731 * Prepare a CPU for idle from an RCU perspective. The first major task 1735 * Prepare a CPU for idle from an RCU perspective. The first major task
@@ -1739,6 +1743,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
1739 */ 1743 */
1740static void rcu_prepare_for_idle(int cpu) 1744static void rcu_prepare_for_idle(int cpu)
1741{ 1745{
1746#ifndef CONFIG_RCU_NOCB_CPU_ALL
1742 struct rcu_data *rdp; 1747 struct rcu_data *rdp;
1743 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1748 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1744 struct rcu_node *rnp; 1749 struct rcu_node *rnp;
@@ -1790,6 +1795,7 @@ static void rcu_prepare_for_idle(int cpu)
1790 rcu_accelerate_cbs(rsp, rnp, rdp); 1795 rcu_accelerate_cbs(rsp, rnp, rdp);
1791 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1796 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1792 } 1797 }
1798#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1793} 1799}
1794 1800
1795/* 1801/*
@@ -1799,11 +1805,12 @@ static void rcu_prepare_for_idle(int cpu)
1799 */ 1805 */
1800static void rcu_cleanup_after_idle(int cpu) 1806static void rcu_cleanup_after_idle(int cpu)
1801{ 1807{
1802 1808#ifndef CONFIG_RCU_NOCB_CPU_ALL
1803 if (rcu_is_nocb_cpu(cpu)) 1809 if (rcu_is_nocb_cpu(cpu))
1804 return; 1810 return;
1805 if (rcu_try_advance_all_cbs()) 1811 if (rcu_try_advance_all_cbs())
1806 invoke_rcu_core(); 1812 invoke_rcu_core();
1813#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1807} 1814}
1808 1815
1809/* 1816/*
@@ -2101,6 +2108,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
2101 init_waitqueue_head(&rnp->nocb_gp_wq[1]); 2108 init_waitqueue_head(&rnp->nocb_gp_wq[1]);
2102} 2109}
2103 2110
2111#ifndef CONFIG_RCU_NOCB_CPU_ALL
 2104/* Is the specified CPU a no-CBs CPU? */ 2112/* Is the specified CPU a no-CBs CPU? */
2105bool rcu_is_nocb_cpu(int cpu) 2113bool rcu_is_nocb_cpu(int cpu)
2106{ 2114{
@@ -2108,6 +2116,7 @@ bool rcu_is_nocb_cpu(int cpu)
2108 return cpumask_test_cpu(cpu, rcu_nocb_mask); 2116 return cpumask_test_cpu(cpu, rcu_nocb_mask);
2109 return false; 2117 return false;
2110} 2118}
2119#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
2111 2120
2112/* 2121/*
2113 * Enqueue the specified string of rcu_head structures onto the specified 2122 * Enqueue the specified string of rcu_head structures onto the specified
@@ -2893,7 +2902,7 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2893 * CPU unless the grace period has extended for too long. 2902 * CPU unless the grace period has extended for too long.
2894 * 2903 *
2895 * This code relies on the fact that all NO_HZ_FULL CPUs are also 2904 * This code relies on the fact that all NO_HZ_FULL CPUs are also
2896 * CONFIG_RCU_NOCB_CPUs. 2905 * CONFIG_RCU_NOCB_CPU CPUs.
2897 */ 2906 */
2898static bool rcu_nohz_full_cpu(struct rcu_state *rsp) 2907static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
2899{ 2908{
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 4def475336d4..5cdc62e1beeb 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright IBM Corporation, 2008 18 * Copyright IBM Corporation, 2008
19 * 19 *
@@ -273,7 +273,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
273 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", 273 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
274 rsp->n_force_qs, rsp->n_force_qs_ngp, 274 rsp->n_force_qs, rsp->n_force_qs_ngp,
275 rsp->n_force_qs - rsp->n_force_qs_ngp, 275 rsp->n_force_qs - rsp->n_force_qs_ngp,
276 rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); 276 ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
277 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { 277 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
278 if (rnp->level != level) { 278 if (rnp->level != level) {
279 seq_puts(m, "\n"); 279 seq_puts(m, "\n");
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index c54609faf233..4c0a9b0af469 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright IBM Corporation, 2001 18 * Copyright IBM Corporation, 2001
19 * 19 *
@@ -49,7 +49,6 @@
49#include <linux/module.h> 49#include <linux/module.h>
50 50
51#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
52#include <trace/events/rcu.h>
53 52
54#include "rcu.h" 53#include "rcu.h"
55 54
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a95c8c2af2a..ab32b7b0db5c 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif
13 13
14obj-y += core.o proc.o clock.o cputime.o 14obj-y += core.o proc.o clock.o cputime.o
15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o 15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
16obj-y += wait.o completion.o 16obj-y += wait.o completion.o idle.o
17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o 17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
18obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 18obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
19obj-$(CONFIG_SCHEDSTATS) += stats.o 19obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 4a073539c58e..e73efba98301 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -203,7 +203,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
203 struct autogroup *ag; 203 struct autogroup *ag;
204 int err; 204 int err;
205 205
206 if (nice < -20 || nice > 19) 206 if (nice < MIN_NICE || nice > MAX_NICE)
207 return -EINVAL; 207 return -EINVAL;
208 208
209 err = security_task_setnice(current, nice); 209 err = security_task_setnice(current, nice);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 43c2bcc35761..b30a2924ef14 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -301,14 +301,14 @@ u64 sched_clock_cpu(int cpu)
301 if (unlikely(!sched_clock_running)) 301 if (unlikely(!sched_clock_running))
302 return 0ull; 302 return 0ull;
303 303
304 preempt_disable(); 304 preempt_disable_notrace();
305 scd = cpu_sdc(cpu); 305 scd = cpu_sdc(cpu);
306 306
307 if (cpu != smp_processor_id()) 307 if (cpu != smp_processor_id())
308 clock = sched_clock_remote(scd); 308 clock = sched_clock_remote(scd);
309 else 309 else
310 clock = sched_clock_local(scd); 310 clock = sched_clock_local(scd);
311 preempt_enable(); 311 preempt_enable_notrace();
312 312
313 return clock; 313 return clock;
314} 314}
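
sched_clock_cpu() can itself be called from tracing code, so the clock.c hunk above switches to preempt_disable_notrace()/preempt_enable_notrace(), which toggle preemption without going through the traced preempt-count helpers and thus avoid recursing into the tracer. The same pattern for any helper reachable from tracing, sketched with a hypothetical function:

static u64 traced_path_timestamp(void)
{
	u64 now;

	preempt_disable_notrace();	/* no tracer re-entry from here */
	now = sched_clock();
	preempt_enable_notrace();

	return now;
}
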
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b46131ef6aab..a47902c687ae 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1745,8 +1745,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1745 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1745 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1746 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1746 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1747 p->numa_work.next = &p->numa_work; 1747 p->numa_work.next = &p->numa_work;
1748 p->numa_faults = NULL; 1748 p->numa_faults_memory = NULL;
1749 p->numa_faults_buffer = NULL; 1749 p->numa_faults_buffer_memory = NULL;
1750 p->last_task_numa_placement = 0;
1751 p->last_sum_exec_runtime = 0;
1750 1752
1751 INIT_LIST_HEAD(&p->numa_entry); 1753 INIT_LIST_HEAD(&p->numa_entry);
1752 p->numa_group = NULL; 1754 p->numa_group = NULL;
@@ -1952,7 +1954,7 @@ static int dl_overflow(struct task_struct *p, int policy,
1952{ 1954{
1953 1955
1954 struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); 1956 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1955 u64 period = attr->sched_period; 1957 u64 period = attr->sched_period ?: attr->sched_deadline;
1956 u64 runtime = attr->sched_runtime; 1958 u64 runtime = attr->sched_runtime;
1957 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; 1959 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
1958 int cpus, err = -1; 1960 int cpus, err = -1;
@@ -2149,8 +2151,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2149 if (mm) 2151 if (mm)
2150 mmdrop(mm); 2152 mmdrop(mm);
2151 if (unlikely(prev_state == TASK_DEAD)) { 2153 if (unlikely(prev_state == TASK_DEAD)) {
2152 task_numa_free(prev);
2153
2154 if (prev->sched_class->task_dead) 2154 if (prev->sched_class->task_dead)
2155 prev->sched_class->task_dead(prev); 2155 prev->sched_class->task_dead(prev);
2156 2156
@@ -2167,13 +2167,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2167 2167
2168#ifdef CONFIG_SMP 2168#ifdef CONFIG_SMP
2169 2169
2170/* assumes rq->lock is held */
2171static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2172{
2173 if (prev->sched_class->pre_schedule)
2174 prev->sched_class->pre_schedule(rq, prev);
2175}
2176
2177/* rq->lock is NOT held, but preemption is disabled */ 2170/* rq->lock is NOT held, but preemption is disabled */
2178static inline void post_schedule(struct rq *rq) 2171static inline void post_schedule(struct rq *rq)
2179{ 2172{
@@ -2191,10 +2184,6 @@ static inline void post_schedule(struct rq *rq)
2191 2184
2192#else 2185#else
2193 2186
2194static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2195{
2196}
2197
2198static inline void post_schedule(struct rq *rq) 2187static inline void post_schedule(struct rq *rq)
2199{ 2188{
2200} 2189}
@@ -2510,8 +2499,13 @@ void __kprobes preempt_count_add(int val)
2510 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 2499 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
2511 PREEMPT_MASK - 10); 2500 PREEMPT_MASK - 10);
2512#endif 2501#endif
2513 if (preempt_count() == val) 2502 if (preempt_count() == val) {
2514 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2503 unsigned long ip = get_parent_ip(CALLER_ADDR1);
2504#ifdef CONFIG_DEBUG_PREEMPT
2505 current->preempt_disable_ip = ip;
2506#endif
2507 trace_preempt_off(CALLER_ADDR0, ip);
2508 }
2515} 2509}
2516EXPORT_SYMBOL(preempt_count_add); 2510EXPORT_SYMBOL(preempt_count_add);
2517 2511
@@ -2554,6 +2548,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
2554 print_modules(); 2548 print_modules();
2555 if (irqs_disabled()) 2549 if (irqs_disabled())
2556 print_irqtrace_events(prev); 2550 print_irqtrace_events(prev);
2551#ifdef CONFIG_DEBUG_PREEMPT
2552 if (in_atomic_preempt_off()) {
2553 pr_err("Preemption disabled at:");
2554 print_ip_sym(current->preempt_disable_ip);
2555 pr_cont("\n");
2556 }
2557#endif
2557 dump_stack(); 2558 dump_stack();
2558 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 2559 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
2559} 2560}
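
With the hunk above (and the matching __might_sleep() hunk later in this diff), CONFIG_DEBUG_PREEMPT records the caller of the outermost preempt_disable() in ->preempt_disable_ip and prints it as "Preemption disabled at:" when a scheduling-while-atomic or might-sleep splat fires. A deliberately buggy sketch that such a report would pinpoint:

#include <linux/delay.h>
#include <linux/preempt.h>

static void buggy_sleep_in_atomic(void)
{
	preempt_disable();	/* this call site lands in the splat */
	msleep(10);		/* sleeping here triggers the warning */
	preempt_enable();
}
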
@@ -2577,36 +2578,34 @@ static inline void schedule_debug(struct task_struct *prev)
2577 schedstat_inc(this_rq(), sched_count); 2578 schedstat_inc(this_rq(), sched_count);
2578} 2579}
2579 2580
2580static void put_prev_task(struct rq *rq, struct task_struct *prev)
2581{
2582 if (prev->on_rq || rq->skip_clock_update < 0)
2583 update_rq_clock(rq);
2584 prev->sched_class->put_prev_task(rq, prev);
2585}
2586
2587/* 2581/*
2588 * Pick up the highest-prio task: 2582 * Pick up the highest-prio task:
2589 */ 2583 */
2590static inline struct task_struct * 2584static inline struct task_struct *
2591pick_next_task(struct rq *rq) 2585pick_next_task(struct rq *rq, struct task_struct *prev)
2592{ 2586{
2593 const struct sched_class *class; 2587 const struct sched_class *class = &fair_sched_class;
2594 struct task_struct *p; 2588 struct task_struct *p;
2595 2589
2596 /* 2590 /*
2597 * Optimization: we know that if all tasks are in 2591 * Optimization: we know that if all tasks are in
2598 * the fair class we can call that function directly: 2592 * the fair class we can call that function directly:
2599 */ 2593 */
2600 if (likely(rq->nr_running == rq->cfs.h_nr_running)) { 2594 if (likely(prev->sched_class == class &&
2601 p = fair_sched_class.pick_next_task(rq); 2595 rq->nr_running == rq->cfs.h_nr_running)) {
2602 if (likely(p)) 2596 p = fair_sched_class.pick_next_task(rq, prev);
2597 if (likely(p && p != RETRY_TASK))
2603 return p; 2598 return p;
2604 } 2599 }
2605 2600
2601again:
2606 for_each_class(class) { 2602 for_each_class(class) {
2607 p = class->pick_next_task(rq); 2603 p = class->pick_next_task(rq, prev);
2608 if (p) 2604 if (p) {
2605 if (unlikely(p == RETRY_TASK))
2606 goto again;
2609 return p; 2607 return p;
2608 }
2610 } 2609 }
2611 2610
2612 BUG(); /* the idle class will always have a runnable task */ 2611 BUG(); /* the idle class will always have a runnable task */
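
The rework above changes the pick_next_task() contract: each class now receives prev (so it can do the put_prev_task() bookkeeping itself) and may return the special RETRY_TASK value when it had to drop rq->lock, for example to pull work from other CPUs, and a higher-priority class may have become runnable in the meantime; the caller's again: loop then restarts the class walk from the top. A hypothetical class method showing that contract (helper names are illustrative, not the kernel's):

static struct task_struct *
pick_next_task_example(struct rq *rq, struct task_struct *prev)
{
	if (example_need_pull(rq, prev)) {
		example_pull(rq);		/* may drop and retake rq->lock */
		if (example_higher_class_runnable(rq))
			return RETRY_TASK;	/* restart the class walk */
	}

	if (!example_has_runnable(rq))
		return NULL;			/* let a lower class pick */

	put_prev_task(rq, prev);		/* class takes over prev handling */
	return example_pick_highest(rq);
}
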
@@ -2700,13 +2699,10 @@ need_resched:
2700 switch_count = &prev->nvcsw; 2699 switch_count = &prev->nvcsw;
2701 } 2700 }
2702 2701
2703 pre_schedule(rq, prev); 2702 if (prev->on_rq || rq->skip_clock_update < 0)
2704 2703 update_rq_clock(rq);
2705 if (unlikely(!rq->nr_running))
2706 idle_balance(cpu, rq);
2707 2704
2708 put_prev_task(rq, prev); 2705 next = pick_next_task(rq, prev);
2709 next = pick_next_task(rq);
2710 clear_tsk_need_resched(prev); 2706 clear_tsk_need_resched(prev);
2711 clear_preempt_need_resched(); 2707 clear_preempt_need_resched();
2712 rq->skip_clock_update = 0; 2708 rq->skip_clock_update = 0;
@@ -2908,7 +2904,8 @@ EXPORT_SYMBOL(sleep_on_timeout);
2908 * This function changes the 'effective' priority of a task. It does 2904 * This function changes the 'effective' priority of a task. It does
2909 * not touch ->normal_prio like __setscheduler(). 2905 * not touch ->normal_prio like __setscheduler().
2910 * 2906 *
2911 * Used by the rt_mutex code to implement priority inheritance logic. 2907 * Used by the rt_mutex code to implement priority inheritance
2908 * logic. Call site only calls if the priority of the task changed.
2912 */ 2909 */
2913void rt_mutex_setprio(struct task_struct *p, int prio) 2910void rt_mutex_setprio(struct task_struct *p, int prio)
2914{ 2911{
@@ -2998,7 +2995,7 @@ void set_user_nice(struct task_struct *p, long nice)
2998 unsigned long flags; 2995 unsigned long flags;
2999 struct rq *rq; 2996 struct rq *rq;
3000 2997
3001 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 2998 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
3002 return; 2999 return;
3003 /* 3000 /*
3004 * We have to be careful, if called from sys_setpriority(), 3001 * We have to be careful, if called from sys_setpriority(),
@@ -3076,11 +3073,11 @@ SYSCALL_DEFINE1(nice, int, increment)
3076 if (increment > 40) 3073 if (increment > 40)
3077 increment = 40; 3074 increment = 40;
3078 3075
3079 nice = TASK_NICE(current) + increment; 3076 nice = task_nice(current) + increment;
3080 if (nice < -20) 3077 if (nice < MIN_NICE)
3081 nice = -20; 3078 nice = MIN_NICE;
3082 if (nice > 19) 3079 if (nice > MAX_NICE)
3083 nice = 19; 3080 nice = MAX_NICE;
3084 3081
3085 if (increment < 0 && !can_nice(current, nice)) 3082 if (increment < 0 && !can_nice(current, nice))
3086 return -EPERM; 3083 return -EPERM;
@@ -3109,18 +3106,6 @@ int task_prio(const struct task_struct *p)
3109} 3106}
3110 3107
3111/** 3108/**
3112 * task_nice - return the nice value of a given task.
3113 * @p: the task in question.
3114 *
3115 * Return: The nice value [ -20 ... 0 ... 19 ].
3116 */
3117int task_nice(const struct task_struct *p)
3118{
3119 return TASK_NICE(p);
3120}
3121EXPORT_SYMBOL(task_nice);
3122
3123/**
3124 * idle_cpu - is a given cpu idle currently? 3109 * idle_cpu - is a given cpu idle currently?
3125 * @cpu: the processor in question. 3110 * @cpu: the processor in question.
3126 * 3111 *
@@ -3189,9 +3174,8 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3189 dl_se->dl_new = 1; 3174 dl_se->dl_new = 1;
3190} 3175}
3191 3176
3192/* Actually do priority change: must hold pi & rq lock. */ 3177static void __setscheduler_params(struct task_struct *p,
3193static void __setscheduler(struct rq *rq, struct task_struct *p, 3178 const struct sched_attr *attr)
3194 const struct sched_attr *attr)
3195{ 3179{
3196 int policy = attr->sched_policy; 3180 int policy = attr->sched_policy;
3197 3181
@@ -3211,9 +3195,21 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
3211 * getparam()/getattr() don't report silly values for !rt tasks. 3195 * getparam()/getattr() don't report silly values for !rt tasks.
3212 */ 3196 */
3213 p->rt_priority = attr->sched_priority; 3197 p->rt_priority = attr->sched_priority;
3214
3215 p->normal_prio = normal_prio(p); 3198 p->normal_prio = normal_prio(p);
3216 p->prio = rt_mutex_getprio(p); 3199 set_load_weight(p);
3200}
3201
3202/* Actually do priority change: must hold pi & rq lock. */
3203static void __setscheduler(struct rq *rq, struct task_struct *p,
3204 const struct sched_attr *attr)
3205{
3206 __setscheduler_params(p, attr);
3207
3208 /*
3209 * If we get here, there was no pi waiters boosting the
3210 * task. It is safe to use the normal prio.
3211 */
3212 p->prio = normal_prio(p);
3217 3213
3218 if (dl_prio(p->prio)) 3214 if (dl_prio(p->prio))
3219 p->sched_class = &dl_sched_class; 3215 p->sched_class = &dl_sched_class;
@@ -3221,8 +3217,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
3221 p->sched_class = &rt_sched_class; 3217 p->sched_class = &rt_sched_class;
3222 else 3218 else
3223 p->sched_class = &fair_sched_class; 3219 p->sched_class = &fair_sched_class;
3224
3225 set_load_weight(p);
3226} 3220}
3227 3221
3228static void 3222static void
@@ -3275,6 +3269,8 @@ static int __sched_setscheduler(struct task_struct *p,
3275 const struct sched_attr *attr, 3269 const struct sched_attr *attr,
3276 bool user) 3270 bool user)
3277{ 3271{
3272 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
3273 MAX_RT_PRIO - 1 - attr->sched_priority;
3278 int retval, oldprio, oldpolicy = -1, on_rq, running; 3274 int retval, oldprio, oldpolicy = -1, on_rq, running;
3279 int policy = attr->sched_policy; 3275 int policy = attr->sched_policy;
3280 unsigned long flags; 3276 unsigned long flags;
@@ -3319,7 +3315,7 @@ recheck:
3319 */ 3315 */
3320 if (user && !capable(CAP_SYS_NICE)) { 3316 if (user && !capable(CAP_SYS_NICE)) {
3321 if (fair_policy(policy)) { 3317 if (fair_policy(policy)) {
3322 if (attr->sched_nice < TASK_NICE(p) && 3318 if (attr->sched_nice < task_nice(p) &&
3323 !can_nice(p, attr->sched_nice)) 3319 !can_nice(p, attr->sched_nice))
3324 return -EPERM; 3320 return -EPERM;
3325 } 3321 }
@@ -3338,12 +3334,21 @@ recheck:
3338 return -EPERM; 3334 return -EPERM;
3339 } 3335 }
3340 3336
3337 /*
3338 * Can't set/change SCHED_DEADLINE policy at all for now
3339 * (safest behavior); in the future we would like to allow
3340 * unprivileged DL tasks to increase their relative deadline
3341 * or reduce their runtime (both ways reducing utilization)
3342 */
3343 if (dl_policy(policy))
3344 return -EPERM;
3345
3341 /* 3346 /*
3342 * Treat SCHED_IDLE as nice 20. Only allow a switch to 3347 * Treat SCHED_IDLE as nice 20. Only allow a switch to
3343 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3348 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
3344 */ 3349 */
3345 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 3350 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
3346 if (!can_nice(p, TASK_NICE(p))) 3351 if (!can_nice(p, task_nice(p)))
3347 return -EPERM; 3352 return -EPERM;
3348 } 3353 }
3349 3354
@@ -3380,16 +3385,18 @@ recheck:
3380 } 3385 }
3381 3386
3382 /* 3387 /*
3383 * If not changing anything there's no need to proceed further: 3388 * If not changing anything there's no need to proceed further,
3389 * but store a possible modification of reset_on_fork.
3384 */ 3390 */
3385 if (unlikely(policy == p->policy)) { 3391 if (unlikely(policy == p->policy)) {
3386 if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) 3392 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
3387 goto change; 3393 goto change;
3388 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3394 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3389 goto change; 3395 goto change;
3390 if (dl_policy(policy)) 3396 if (dl_policy(policy))
3391 goto change; 3397 goto change;
3392 3398
3399 p->sched_reset_on_fork = reset_on_fork;
3393 task_rq_unlock(rq, p, &flags); 3400 task_rq_unlock(rq, p, &flags);
3394 return 0; 3401 return 0;
3395 } 3402 }
@@ -3443,6 +3450,24 @@ change:
3443 return -EBUSY; 3450 return -EBUSY;
3444 } 3451 }
3445 3452
3453 p->sched_reset_on_fork = reset_on_fork;
3454 oldprio = p->prio;
3455
3456 /*
3457 * Special case for priority boosted tasks.
3458 *
3459 * If the new priority is lower or equal (user space view)
3460 * than the current (boosted) priority, we just store the new
3461 * normal parameters and do not touch the scheduler class and
 3462 * the runqueue. This will be done when the task deboosts
3463 * itself.
3464 */
3465 if (rt_mutex_check_prio(p, newprio)) {
3466 __setscheduler_params(p, attr);
3467 task_rq_unlock(rq, p, &flags);
3468 return 0;
3469 }
3470
3446 on_rq = p->on_rq; 3471 on_rq = p->on_rq;
3447 running = task_current(rq, p); 3472 running = task_current(rq, p);
3448 if (on_rq) 3473 if (on_rq)
@@ -3450,16 +3475,18 @@ change:
3450 if (running) 3475 if (running)
3451 p->sched_class->put_prev_task(rq, p); 3476 p->sched_class->put_prev_task(rq, p);
3452 3477
3453 p->sched_reset_on_fork = reset_on_fork;
3454
3455 oldprio = p->prio;
3456 prev_class = p->sched_class; 3478 prev_class = p->sched_class;
3457 __setscheduler(rq, p, attr); 3479 __setscheduler(rq, p, attr);
3458 3480
3459 if (running) 3481 if (running)
3460 p->sched_class->set_curr_task(rq); 3482 p->sched_class->set_curr_task(rq);
3461 if (on_rq) 3483 if (on_rq) {
3462 enqueue_task(rq, p, 0); 3484 /*
3485 * We enqueue to tail when the priority of a task is
3486 * increased (user space view).
3487 */
3488 enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
3489 }
3463 3490
3464 check_class_changed(rq, p, prev_class, oldprio); 3491 check_class_changed(rq, p, prev_class, oldprio);
3465 task_rq_unlock(rq, p, &flags); 3492 task_rq_unlock(rq, p, &flags);
@@ -3615,7 +3642,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
3615 * XXX: do we want to be lenient like existing syscalls; or do we want 3642 * XXX: do we want to be lenient like existing syscalls; or do we want
3616 * to be strict and return an error on out-of-bounds values? 3643 * to be strict and return an error on out-of-bounds values?
3617 */ 3644 */
3618 attr->sched_nice = clamp(attr->sched_nice, -20, 19); 3645 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
3619 3646
3620out: 3647out:
3621 return ret; 3648 return ret;
@@ -3661,13 +3688,14 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3661 * @pid: the pid in question. 3688 * @pid: the pid in question.
3662 * @uattr: structure containing the extended parameters. 3689 * @uattr: structure containing the extended parameters.
3663 */ 3690 */
3664SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr) 3691SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
3692 unsigned int, flags)
3665{ 3693{
3666 struct sched_attr attr; 3694 struct sched_attr attr;
3667 struct task_struct *p; 3695 struct task_struct *p;
3668 int retval; 3696 int retval;
3669 3697
3670 if (!uattr || pid < 0) 3698 if (!uattr || pid < 0 || flags)
3671 return -EINVAL; 3699 return -EINVAL;
3672 3700
3673 if (sched_copy_attr(uattr, &attr)) 3701 if (sched_copy_attr(uattr, &attr))
@@ -3786,7 +3814,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
3786 attr->size = usize; 3814 attr->size = usize;
3787 } 3815 }
3788 3816
3789 ret = copy_to_user(uattr, attr, usize); 3817 ret = copy_to_user(uattr, attr, attr->size);
3790 if (ret) 3818 if (ret)
3791 return -EFAULT; 3819 return -EFAULT;
3792 3820
@@ -3804,8 +3832,8 @@ err_size:
3804 * @uattr: structure containing the extended parameters. 3832 * @uattr: structure containing the extended parameters.
3805 * @size: sizeof(attr) for fwd/bwd comp. 3833 * @size: sizeof(attr) for fwd/bwd comp.
3806 */ 3834 */
3807SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 3835SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3808 unsigned int, size) 3836 unsigned int, size, unsigned int, flags)
3809{ 3837{
3810 struct sched_attr attr = { 3838 struct sched_attr attr = {
3811 .size = sizeof(struct sched_attr), 3839 .size = sizeof(struct sched_attr),
@@ -3814,7 +3842,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3814 int retval; 3842 int retval;
3815 3843
3816 if (!uattr || pid < 0 || size > PAGE_SIZE || 3844 if (!uattr || pid < 0 || size > PAGE_SIZE ||
3817 size < SCHED_ATTR_SIZE_VER0) 3845 size < SCHED_ATTR_SIZE_VER0 || flags)
3818 return -EINVAL; 3846 return -EINVAL;
3819 3847
3820 rcu_read_lock(); 3848 rcu_read_lock();
@@ -3835,7 +3863,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3835 else if (task_has_rt_policy(p)) 3863 else if (task_has_rt_policy(p))
3836 attr.sched_priority = p->rt_priority; 3864 attr.sched_priority = p->rt_priority;
3837 else 3865 else
3838 attr.sched_nice = TASK_NICE(p); 3866 attr.sched_nice = task_nice(p);
3839 3867
3840 rcu_read_unlock(); 3868 rcu_read_unlock();
3841 3869
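
The two SYSCALL_DEFINE changes above give sched_setattr() and sched_getattr() a trailing flags argument that must currently be zero (anything else returns -EINVAL), reserving room for future extensions. A userspace sketch of invoking the new syscall directly, assuming the toolchain headers provide __NR_sched_setattr; the struct mirrors include/uapi/linux/sched.h:

#define _GNU_SOURCE
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

static int set_nice_attr(pid_t pid, int nice)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = 0;			/* SCHED_NORMAL */
	attr.sched_nice = nice;

	/* Final argument is the new flags word; non-zero fails. */
	return syscall(__NR_sched_setattr, pid, &attr, 0);
}
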
@@ -4473,6 +4501,7 @@ void init_idle(struct task_struct *idle, int cpu)
4473 rcu_read_unlock(); 4501 rcu_read_unlock();
4474 4502
4475 rq->curr = rq->idle = idle; 4503 rq->curr = rq->idle = idle;
4504 idle->on_rq = 1;
4476#if defined(CONFIG_SMP) 4505#if defined(CONFIG_SMP)
4477 idle->on_cpu = 1; 4506 idle->on_cpu = 1;
4478#endif 4507#endif
@@ -4692,8 +4721,10 @@ void idle_task_exit(void)
4692 4721
4693 BUG_ON(cpu_online(smp_processor_id())); 4722 BUG_ON(cpu_online(smp_processor_id()));
4694 4723
4695 if (mm != &init_mm) 4724 if (mm != &init_mm) {
4696 switch_mm(mm, &init_mm, current); 4725 switch_mm(mm, &init_mm, current);
4726 finish_arch_post_lock_switch();
4727 }
4697 mmdrop(mm); 4728 mmdrop(mm);
4698} 4729}
4699 4730
@@ -4711,6 +4742,22 @@ static void calc_load_migrate(struct rq *rq)
4711 atomic_long_add(delta, &calc_load_tasks); 4742 atomic_long_add(delta, &calc_load_tasks);
4712} 4743}
4713 4744
4745static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
4746{
4747}
4748
4749static const struct sched_class fake_sched_class = {
4750 .put_prev_task = put_prev_task_fake,
4751};
4752
4753static struct task_struct fake_task = {
4754 /*
4755 * Avoid pull_{rt,dl}_task()
4756 */
4757 .prio = MAX_PRIO + 1,
4758 .sched_class = &fake_sched_class,
4759};
4760
4714/* 4761/*
4715 * Migrate all tasks from the rq, sleeping tasks will be migrated by 4762 * Migrate all tasks from the rq, sleeping tasks will be migrated by
4716 * try_to_wake_up()->select_task_rq(). 4763 * try_to_wake_up()->select_task_rq().
@@ -4751,7 +4798,7 @@ static void migrate_tasks(unsigned int dead_cpu)
4751 if (rq->nr_running == 1) 4798 if (rq->nr_running == 1)
4752 break; 4799 break;
4753 4800
4754 next = pick_next_task(rq); 4801 next = pick_next_task(rq, &fake_task);
4755 BUG_ON(!next); 4802 BUG_ON(!next);
4756 next->sched_class->put_prev_task(rq, next); 4803 next->sched_class->put_prev_task(rq, next);
4757 4804
@@ -4841,7 +4888,7 @@ set_table_entry(struct ctl_table *entry,
4841static struct ctl_table * 4888static struct ctl_table *
4842sd_alloc_ctl_domain_table(struct sched_domain *sd) 4889sd_alloc_ctl_domain_table(struct sched_domain *sd)
4843{ 4890{
4844 struct ctl_table *table = sd_alloc_ctl_entry(13); 4891 struct ctl_table *table = sd_alloc_ctl_entry(14);
4845 4892
4846 if (table == NULL) 4893 if (table == NULL)
4847 return NULL; 4894 return NULL;
@@ -4869,9 +4916,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
4869 sizeof(int), 0644, proc_dointvec_minmax, false); 4916 sizeof(int), 0644, proc_dointvec_minmax, false);
4870 set_table_entry(&table[10], "flags", &sd->flags, 4917 set_table_entry(&table[10], "flags", &sd->flags,
4871 sizeof(int), 0644, proc_dointvec_minmax, false); 4918 sizeof(int), 0644, proc_dointvec_minmax, false);
4872 set_table_entry(&table[11], "name", sd->name, 4919 set_table_entry(&table[11], "max_newidle_lb_cost",
4920 &sd->max_newidle_lb_cost,
4921 sizeof(long), 0644, proc_doulongvec_minmax, false);
4922 set_table_entry(&table[12], "name", sd->name,
4873 CORENAME_MAX_SIZE, 0444, proc_dostring, false); 4923 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
4874 /* &table[12] is terminator */ 4924 /* &table[13] is terminator */
4875 4925
4876 return table; 4926 return table;
4877} 4927}
@@ -6848,7 +6898,6 @@ void __init sched_init(void)
6848 6898
6849 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 6899 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6850#ifdef CONFIG_RT_GROUP_SCHED 6900#ifdef CONFIG_RT_GROUP_SCHED
6851 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
6852 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6901 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6853#endif 6902#endif
6854 6903
@@ -6937,7 +6986,8 @@ void __might_sleep(const char *file, int line, int preempt_offset)
6937 static unsigned long prev_jiffy; /* ratelimiting */ 6986 static unsigned long prev_jiffy; /* ratelimiting */
6938 6987
6939 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ 6988 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
6940 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 6989 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6990 !is_idle_task(current)) ||
6941 system_state != SYSTEM_RUNNING || oops_in_progress) 6991 system_state != SYSTEM_RUNNING || oops_in_progress)
6942 return; 6992 return;
6943 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6993 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
@@ -6955,6 +7005,13 @@ void __might_sleep(const char *file, int line, int preempt_offset)
6955 debug_show_held_locks(current); 7005 debug_show_held_locks(current);
6956 if (irqs_disabled()) 7006 if (irqs_disabled())
6957 print_irqtrace_events(current); 7007 print_irqtrace_events(current);
7008#ifdef CONFIG_DEBUG_PREEMPT
7009 if (!preempt_count_equals(preempt_offset)) {
7010 pr_err("Preemption disabled at:");
7011 print_ip_sym(current->preempt_disable_ip);
7012 pr_cont("\n");
7013 }
7014#endif
6958 dump_stack(); 7015 dump_stack();
6959} 7016}
6960EXPORT_SYMBOL(__might_sleep); 7017EXPORT_SYMBOL(__might_sleep);
@@ -7008,7 +7065,7 @@ void normalize_rt_tasks(void)
7008 * Renice negative nice level userspace 7065 * Renice negative nice level userspace
7009 * tasks back to 0: 7066 * tasks back to 0:
7010 */ 7067 */
7011 if (TASK_NICE(p) < 0 && p->mm) 7068 if (task_nice(p) < 0 && p->mm)
7012 set_user_nice(p, 0); 7069 set_user_nice(p, 0);
7013 continue; 7070 continue;
7014 } 7071 }
@@ -7422,6 +7479,7 @@ static int sched_dl_global_constraints(void)
7422 u64 period = global_rt_period(); 7479 u64 period = global_rt_period();
7423 u64 new_bw = to_ratio(period, runtime); 7480 u64 new_bw = to_ratio(period, runtime);
7424 int cpu, ret = 0; 7481 int cpu, ret = 0;
7482 unsigned long flags;
7425 7483
7426 /* 7484 /*
7427 * Here we want to check the bandwidth not being set to some 7485 * Here we want to check the bandwidth not being set to some
@@ -7435,10 +7493,10 @@ static int sched_dl_global_constraints(void)
7435 for_each_possible_cpu(cpu) { 7493 for_each_possible_cpu(cpu) {
7436 struct dl_bw *dl_b = dl_bw_of(cpu); 7494 struct dl_bw *dl_b = dl_bw_of(cpu);
7437 7495
7438 raw_spin_lock(&dl_b->lock); 7496 raw_spin_lock_irqsave(&dl_b->lock, flags);
7439 if (new_bw < dl_b->total_bw) 7497 if (new_bw < dl_b->total_bw)
7440 ret = -EBUSY; 7498 ret = -EBUSY;
7441 raw_spin_unlock(&dl_b->lock); 7499 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7442 7500
7443 if (ret) 7501 if (ret)
7444 break; 7502 break;
@@ -7451,6 +7509,7 @@ static void sched_dl_do_global(void)
7451{ 7509{
7452 u64 new_bw = -1; 7510 u64 new_bw = -1;
7453 int cpu; 7511 int cpu;
7512 unsigned long flags;
7454 7513
7455 def_dl_bandwidth.dl_period = global_rt_period(); 7514 def_dl_bandwidth.dl_period = global_rt_period();
7456 def_dl_bandwidth.dl_runtime = global_rt_runtime(); 7515 def_dl_bandwidth.dl_runtime = global_rt_runtime();
@@ -7464,9 +7523,9 @@ static void sched_dl_do_global(void)
7464 for_each_possible_cpu(cpu) { 7523 for_each_possible_cpu(cpu) {
7465 struct dl_bw *dl_b = dl_bw_of(cpu); 7524 struct dl_bw *dl_b = dl_bw_of(cpu);
7466 7525
7467 raw_spin_lock(&dl_b->lock); 7526 raw_spin_lock_irqsave(&dl_b->lock, flags);
7468 dl_b->bw = new_bw; 7527 dl_b->bw = new_bw;
7469 raw_spin_unlock(&dl_b->lock); 7528 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7470 } 7529 }
7471} 7530}
7472 7531
@@ -7475,7 +7534,8 @@ static int sched_rt_global_validate(void)
7475 if (sysctl_sched_rt_period <= 0) 7534 if (sysctl_sched_rt_period <= 0)
7476 return -EINVAL; 7535 return -EINVAL;
7477 7536
7478 if (sysctl_sched_rt_runtime > sysctl_sched_rt_period) 7537 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
7538 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
7479 return -EINVAL; 7539 return -EINVAL;
7480 7540
7481 return 0; 7541 return 0;
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 045fc74e3f09..5b9bb42b2d47 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -70,7 +70,7 @@ static void cpudl_heapify(struct cpudl *cp, int idx)
70 70
71static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) 71static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
72{ 72{
73 WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID); 73 WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
74 74
75 if (dl_time_before(new_dl, cp->elements[idx].dl)) { 75 if (dl_time_before(new_dl, cp->elements[idx].dl)) {
76 cp->elements[idx].dl = new_dl; 76 cp->elements[idx].dl = new_dl;
@@ -117,7 +117,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
117 } 117 }
118 118
119out: 119out:
120 WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1); 120 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
121 121
122 return best_cpu; 122 return best_cpu;
123} 123}
@@ -137,7 +137,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
137 int old_idx, new_cpu; 137 int old_idx, new_cpu;
138 unsigned long flags; 138 unsigned long flags;
139 139
140 WARN_ON(cpu > num_present_cpus()); 140 WARN_ON(!cpu_present(cpu));
141 141
142 raw_spin_lock_irqsave(&cp->lock, flags); 142 raw_spin_lock_irqsave(&cp->lock, flags);
143 old_idx = cp->cpu_to_idx[cpu]; 143 old_idx = cp->cpu_to_idx[cpu];
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 99947919e30b..58624a65f124 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
142 p->utimescaled += cputime_scaled; 142 p->utimescaled += cputime_scaled;
143 account_group_user_time(p, cputime); 143 account_group_user_time(p, cputime);
144 144
145 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; 145 index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
146 146
147 /* Add user time to cpustat. */ 147 /* Add user time to cpustat. */
148 task_group_account_field(p, index, (__force u64) cputime); 148 task_group_account_field(p, index, (__force u64) cputime);
@@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
169 p->gtime += cputime; 169 p->gtime += cputime;
170 170
171 /* Add guest time to cpustat. */ 171 /* Add guest time to cpustat. */
172 if (TASK_NICE(p) > 0) { 172 if (task_nice(p) > 0) {
173 cpustat[CPUTIME_NICE] += (__force u64) cputime; 173 cpustat[CPUTIME_NICE] += (__force u64) cputime;
174 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; 174 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
175 } else { 175 } else {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0dd5e0971a07..27ef40925525 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -121,7 +121,7 @@ static inline void dl_clear_overload(struct rq *rq)
121 121
122static void update_dl_migration(struct dl_rq *dl_rq) 122static void update_dl_migration(struct dl_rq *dl_rq)
123{ 123{
124 if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) { 124 if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
125 if (!dl_rq->overloaded) { 125 if (!dl_rq->overloaded) {
126 dl_set_overload(rq_of_dl_rq(dl_rq)); 126 dl_set_overload(rq_of_dl_rq(dl_rq));
127 dl_rq->overloaded = 1; 127 dl_rq->overloaded = 1;
@@ -135,9 +135,7 @@ static void update_dl_migration(struct dl_rq *dl_rq)
135static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) 135static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
136{ 136{
137 struct task_struct *p = dl_task_of(dl_se); 137 struct task_struct *p = dl_task_of(dl_se);
138 dl_rq = &rq_of_dl_rq(dl_rq)->dl;
139 138
140 dl_rq->dl_nr_total++;
141 if (p->nr_cpus_allowed > 1) 139 if (p->nr_cpus_allowed > 1)
142 dl_rq->dl_nr_migratory++; 140 dl_rq->dl_nr_migratory++;
143 141
@@ -147,9 +145,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
147static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) 145static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
148{ 146{
149 struct task_struct *p = dl_task_of(dl_se); 147 struct task_struct *p = dl_task_of(dl_se);
150 dl_rq = &rq_of_dl_rq(dl_rq)->dl;
151 148
152 dl_rq->dl_nr_total--;
153 if (p->nr_cpus_allowed > 1) 149 if (p->nr_cpus_allowed > 1)
154 dl_rq->dl_nr_migratory--; 150 dl_rq->dl_nr_migratory--;
155 151
@@ -214,6 +210,16 @@ static inline int has_pushable_dl_tasks(struct rq *rq)
214 210
215static int push_dl_task(struct rq *rq); 211static int push_dl_task(struct rq *rq);
216 212
213static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
214{
215 return dl_task(prev);
216}
217
218static inline void set_post_schedule(struct rq *rq)
219{
220 rq->post_schedule = has_pushable_dl_tasks(rq);
221}
222
217#else 223#else
218 224
219static inline 225static inline
@@ -236,6 +242,19 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
236{ 242{
237} 243}
238 244
245static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
246{
247 return false;
248}
249
250static inline int pull_dl_task(struct rq *rq)
251{
252 return 0;
253}
254
255static inline void set_post_schedule(struct rq *rq)
256{
257}
239#endif /* CONFIG_SMP */ 258#endif /* CONFIG_SMP */
240 259
241static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); 260static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -566,6 +585,8 @@ int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
566 return 1; 585 return 1;
567} 586}
568 587
588extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
589
569/* 590/*
570 * Update the current task's runtime statistics (provided it is still 591 * Update the current task's runtime statistics (provided it is still
571 * a -deadline task and has not been removed from the dl_rq). 592 * a -deadline task and has not been removed from the dl_rq).
@@ -588,8 +609,8 @@ static void update_curr_dl(struct rq *rq)
588 * approach need further study. 609 * approach need further study.
589 */ 610 */
590 delta_exec = rq_clock_task(rq) - curr->se.exec_start; 611 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
591 if (unlikely((s64)delta_exec < 0)) 612 if (unlikely((s64)delta_exec <= 0))
592 delta_exec = 0; 613 return;
593 614
594 schedstat_set(curr->se.statistics.exec_max, 615 schedstat_set(curr->se.statistics.exec_max,
595 max(curr->se.statistics.exec_max, delta_exec)); 616 max(curr->se.statistics.exec_max, delta_exec));
@@ -629,11 +650,13 @@ static void update_curr_dl(struct rq *rq)
629 struct rt_rq *rt_rq = &rq->rt; 650 struct rt_rq *rt_rq = &rq->rt;
630 651
631 raw_spin_lock(&rt_rq->rt_runtime_lock); 652 raw_spin_lock(&rt_rq->rt_runtime_lock);
632 rt_rq->rt_time += delta_exec;
633 /* 653 /*
634 * We'll let actual RT tasks worry about the overflow here, we 654 * We'll let actual RT tasks worry about the overflow here, we
635 * have our own CBS to keep us inline -- see above. 655 * have our own CBS to keep us inline; only account when RT
656 * bandwidth is relevant.
636 */ 657 */
658 if (sched_rt_bandwidth_account(rt_rq))
659 rt_rq->rt_time += delta_exec;
637 raw_spin_unlock(&rt_rq->rt_runtime_lock); 660 raw_spin_unlock(&rt_rq->rt_runtime_lock);
638 } 661 }
639} 662}
@@ -717,6 +740,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
717 740
718 WARN_ON(!dl_prio(prio)); 741 WARN_ON(!dl_prio(prio));
719 dl_rq->dl_nr_running++; 742 dl_rq->dl_nr_running++;
743 inc_nr_running(rq_of_dl_rq(dl_rq));
720 744
721 inc_dl_deadline(dl_rq, deadline); 745 inc_dl_deadline(dl_rq, deadline);
722 inc_dl_migration(dl_se, dl_rq); 746 inc_dl_migration(dl_se, dl_rq);
@@ -730,6 +754,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
730 WARN_ON(!dl_prio(prio)); 754 WARN_ON(!dl_prio(prio));
731 WARN_ON(!dl_rq->dl_nr_running); 755 WARN_ON(!dl_rq->dl_nr_running);
732 dl_rq->dl_nr_running--; 756 dl_rq->dl_nr_running--;
757 dec_nr_running(rq_of_dl_rq(dl_rq));
733 758
734 dec_dl_deadline(dl_rq, dl_se->deadline); 759 dec_dl_deadline(dl_rq, dl_se->deadline);
735 dec_dl_migration(dl_se, dl_rq); 760 dec_dl_migration(dl_se, dl_rq);
@@ -836,8 +861,6 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
836 861
837 if (!task_current(rq, p) && p->nr_cpus_allowed > 1) 862 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
838 enqueue_pushable_dl_task(rq, p); 863 enqueue_pushable_dl_task(rq, p);
839
840 inc_nr_running(rq);
841} 864}
842 865
843static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) 866static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -850,8 +873,6 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
850{ 873{
851 update_curr_dl(rq); 874 update_curr_dl(rq);
852 __dequeue_task_dl(rq, p, flags); 875 __dequeue_task_dl(rq, p, flags);
853
854 dec_nr_running(rq);
855} 876}
856 877
857/* 878/*
@@ -944,6 +965,8 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
944 resched_task(rq->curr); 965 resched_task(rq->curr);
945} 966}
946 967
968static int pull_dl_task(struct rq *this_rq);
969
947#endif /* CONFIG_SMP */ 970#endif /* CONFIG_SMP */
948 971
949/* 972/*
@@ -990,7 +1013,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
990 return rb_entry(left, struct sched_dl_entity, rb_node); 1013 return rb_entry(left, struct sched_dl_entity, rb_node);
991} 1014}
992 1015
993struct task_struct *pick_next_task_dl(struct rq *rq) 1016struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
994{ 1017{
995 struct sched_dl_entity *dl_se; 1018 struct sched_dl_entity *dl_se;
996 struct task_struct *p; 1019 struct task_struct *p;
@@ -998,9 +1021,20 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
998 1021
999 dl_rq = &rq->dl; 1022 dl_rq = &rq->dl;
1000 1023
1024 if (need_pull_dl_task(rq, prev))
1025 pull_dl_task(rq);
1026 /*
1027 * When prev is DL, we may throttle it in put_prev_task().
1028 * So, we update time before we check for dl_nr_running.
1029 */
1030 if (prev->sched_class == &dl_sched_class)
1031 update_curr_dl(rq);
1032
1001 if (unlikely(!dl_rq->dl_nr_running)) 1033 if (unlikely(!dl_rq->dl_nr_running))
1002 return NULL; 1034 return NULL;
1003 1035
1036 put_prev_task(rq, prev);
1037
1004 dl_se = pick_next_dl_entity(rq, dl_rq); 1038 dl_se = pick_next_dl_entity(rq, dl_rq);
1005 BUG_ON(!dl_se); 1039 BUG_ON(!dl_se);
1006 1040
@@ -1015,9 +1049,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
1015 start_hrtick_dl(rq, p); 1049 start_hrtick_dl(rq, p);
1016#endif 1050#endif
1017 1051
1018#ifdef CONFIG_SMP 1052 set_post_schedule(rq);
1019 rq->post_schedule = has_pushable_dl_tasks(rq);
1020#endif /* CONFIG_SMP */
1021 1053
1022 return p; 1054 return p;
1023} 1055}
@@ -1426,13 +1458,6 @@ skip:
1426 return ret; 1458 return ret;
1427} 1459}
1428 1460
1429static void pre_schedule_dl(struct rq *rq, struct task_struct *prev)
1430{
1431 /* Try to pull other tasks here */
1432 if (dl_task(prev))
1433 pull_dl_task(rq);
1434}
1435
1436static void post_schedule_dl(struct rq *rq) 1461static void post_schedule_dl(struct rq *rq)
1437{ 1462{
1438 push_dl_tasks(rq); 1463 push_dl_tasks(rq);
@@ -1560,7 +1585,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1560 if (unlikely(p->dl.dl_throttled)) 1585 if (unlikely(p->dl.dl_throttled))
1561 return; 1586 return;
1562 1587
1563 if (p->on_rq || rq->curr != p) { 1588 if (p->on_rq && rq->curr != p) {
1564#ifdef CONFIG_SMP 1589#ifdef CONFIG_SMP
1565 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) 1590 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
1566 /* Only reschedule if pushing failed */ 1591 /* Only reschedule if pushing failed */
@@ -1625,7 +1650,6 @@ const struct sched_class dl_sched_class = {
1625 .set_cpus_allowed = set_cpus_allowed_dl, 1650 .set_cpus_allowed = set_cpus_allowed_dl,
1626 .rq_online = rq_online_dl, 1651 .rq_online = rq_online_dl,
1627 .rq_offline = rq_offline_dl, 1652 .rq_offline = rq_offline_dl,
1628 .pre_schedule = pre_schedule_dl,
1629 .post_schedule = post_schedule_dl, 1653 .post_schedule = post_schedule_dl,
1630 .task_woken = task_woken_dl, 1654 .task_woken = task_woken_dl,
1631#endif 1655#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index dd52e7ffb10e..f3344c31632a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -321,6 +321,7 @@ do { \
321 P(sched_goidle); 321 P(sched_goidle);
322#ifdef CONFIG_SMP 322#ifdef CONFIG_SMP
323 P64(avg_idle); 323 P64(avg_idle);
324 P64(max_idle_balance_cost);
324#endif 325#endif
325 326
326 P(ttwu_count); 327 P(ttwu_count);
@@ -533,15 +534,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
533 unsigned long nr_faults = -1; 534 unsigned long nr_faults = -1;
534 int cpu_current, home_node; 535 int cpu_current, home_node;
535 536
536 if (p->numa_faults) 537 if (p->numa_faults_memory)
537 nr_faults = p->numa_faults[2*node + i]; 538 nr_faults = p->numa_faults_memory[2*node + i];
538 539
539 cpu_current = !i ? (task_node(p) == node) : 540 cpu_current = !i ? (task_node(p) == node) :
540 (pol && node_isset(node, pol->v.nodes)); 541 (pol && node_isset(node, pol->v.nodes));
541 542
542 home_node = (p->numa_preferred_nid == node); 543 home_node = (p->numa_preferred_nid == node);
543 544
544 SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", 545 SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
545 i, node, cpu_current, home_node, nr_faults); 546 i, node, cpu_current, home_node, nr_faults);
546 } 547 }
547 } 548 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 966cc2bfcb77..7e9bd0b1fa9e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
322 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 322 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
323 323
324/* Do the two (enqueued) entities belong to the same group ? */ 324/* Do the two (enqueued) entities belong to the same group ? */
325static inline int 325static inline struct cfs_rq *
326is_same_group(struct sched_entity *se, struct sched_entity *pse) 326is_same_group(struct sched_entity *se, struct sched_entity *pse)
327{ 327{
328 if (se->cfs_rq == pse->cfs_rq) 328 if (se->cfs_rq == pse->cfs_rq)
329 return 1; 329 return se->cfs_rq;
330 330
331 return 0; 331 return NULL;
332} 332}
333 333
334static inline struct sched_entity *parent_entity(struct sched_entity *se) 334static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
336 return se->parent; 336 return se->parent;
337} 337}
338 338
339/* return depth at which a sched entity is present in the hierarchy */
340static inline int depth_se(struct sched_entity *se)
341{
342 int depth = 0;
343
344 for_each_sched_entity(se)
345 depth++;
346
347 return depth;
348}
349
350static void 339static void
351find_matching_se(struct sched_entity **se, struct sched_entity **pse) 340find_matching_se(struct sched_entity **se, struct sched_entity **pse)
352{ 341{
@@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
360 */ 349 */
361 350
362 /* First walk up until both entities are at same depth */ 351 /* First walk up until both entities are at same depth */
363 se_depth = depth_se(*se); 352 se_depth = (*se)->depth;
364 pse_depth = depth_se(*pse); 353 pse_depth = (*pse)->depth;
365 354
366 while (se_depth > pse_depth) { 355 while (se_depth > pse_depth) {
367 se_depth--; 356 se_depth--;
@@ -426,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
426#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 415#define for_each_leaf_cfs_rq(rq, cfs_rq) \
427 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 416 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
428 417
429static inline int
430is_same_group(struct sched_entity *se, struct sched_entity *pse)
431{
432 return 1;
433}
434
435static inline struct sched_entity *parent_entity(struct sched_entity *se) 418static inline struct sched_entity *parent_entity(struct sched_entity *se)
436{ 419{
437 return NULL; 420 return NULL;
@@ -819,14 +802,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
819/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 802/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
820unsigned int sysctl_numa_balancing_scan_delay = 1000; 803unsigned int sysctl_numa_balancing_scan_delay = 1000;
821 804
822/*
823 * After skipping a page migration on a shared page, skip N more numa page
824 * migrations unconditionally. This reduces the number of NUMA migrations
825 * in shared memory workloads, and has the effect of pulling tasks towards
826 * where their memory lives, over pulling the memory towards the task.
827 */
828unsigned int sysctl_numa_balancing_migrate_deferred = 16;
829
830static unsigned int task_nr_scan_windows(struct task_struct *p) 805static unsigned int task_nr_scan_windows(struct task_struct *p)
831{ 806{
832 unsigned long rss = 0; 807 unsigned long rss = 0;
@@ -893,10 +868,26 @@ struct numa_group {
893 struct list_head task_list; 868 struct list_head task_list;
894 869
895 struct rcu_head rcu; 870 struct rcu_head rcu;
871 nodemask_t active_nodes;
896 unsigned long total_faults; 872 unsigned long total_faults;
873 /*
874 * Faults_cpu is used to decide whether memory should move
875 * towards the CPU. As a consequence, these stats are weighted
876 * more by CPU use than by memory faults.
877 */
878 unsigned long *faults_cpu;
897 unsigned long faults[0]; 879 unsigned long faults[0];
898}; 880};
899 881
882/* Shared or private faults. */
883#define NR_NUMA_HINT_FAULT_TYPES 2
884
885/* Memory and CPU locality */
886#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
887
888/* Averaged statistics, and temporary buffers. */
889#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
890
900pid_t task_numa_group_id(struct task_struct *p) 891pid_t task_numa_group_id(struct task_struct *p)
901{ 892{
902 return p->numa_group ? p->numa_group->gid : 0; 893 return p->numa_group ? p->numa_group->gid : 0;
@@ -904,16 +895,16 @@ pid_t task_numa_group_id(struct task_struct *p)
904 895
905static inline int task_faults_idx(int nid, int priv) 896static inline int task_faults_idx(int nid, int priv)
906{ 897{
907 return 2 * nid + priv; 898 return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
908} 899}
909 900
910static inline unsigned long task_faults(struct task_struct *p, int nid) 901static inline unsigned long task_faults(struct task_struct *p, int nid)
911{ 902{
912 if (!p->numa_faults) 903 if (!p->numa_faults_memory)
913 return 0; 904 return 0;
914 905
915 return p->numa_faults[task_faults_idx(nid, 0)] + 906 return p->numa_faults_memory[task_faults_idx(nid, 0)] +
916 p->numa_faults[task_faults_idx(nid, 1)]; 907 p->numa_faults_memory[task_faults_idx(nid, 1)];
917} 908}
918 909
919static inline unsigned long group_faults(struct task_struct *p, int nid) 910static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -925,6 +916,12 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
925 p->numa_group->faults[task_faults_idx(nid, 1)]; 916 p->numa_group->faults[task_faults_idx(nid, 1)];
926} 917}
927 918
919static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
920{
921 return group->faults_cpu[task_faults_idx(nid, 0)] +
922 group->faults_cpu[task_faults_idx(nid, 1)];
923}
924
928/* 925/*
929 * These return the fraction of accesses done by a particular task, or 926 * These return the fraction of accesses done by a particular task, or
930 * task group, on a particular numa node. The group weight is given a 927 * task group, on a particular numa node. The group weight is given a
@@ -935,7 +932,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
935{ 932{
936 unsigned long total_faults; 933 unsigned long total_faults;
937 934
938 if (!p->numa_faults) 935 if (!p->numa_faults_memory)
939 return 0; 936 return 0;
940 937
941 total_faults = p->total_numa_faults; 938 total_faults = p->total_numa_faults;
@@ -954,6 +951,69 @@ static inline unsigned long group_weight(struct task_struct *p, int nid)
954 return 1000 * group_faults(p, nid) / p->numa_group->total_faults; 951 return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
955} 952}
956 953
954bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
955 int src_nid, int dst_cpu)
956{
957 struct numa_group *ng = p->numa_group;
958 int dst_nid = cpu_to_node(dst_cpu);
959 int last_cpupid, this_cpupid;
960
961 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
962
963 /*
964 * Multi-stage node selection is used in conjunction with a periodic
965 * migration fault to build a temporal task<->page relation. By using
966 * a two-stage filter we remove short/unlikely relations.
967 *
968 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
969 * a task's usage of a particular page (n_p) per total usage of this
970 * page (n_t) (in a given time-span) to a probability.
971 *
972 * Our periodic faults will sample this probability and getting the
973 * same result twice in a row, given these samples are fully
974 * independent, is then given by P(n)^2, provided our sample period
975 * is sufficiently short compared to the usage pattern.
976 *
 977 * This quadratic squishes small probabilities, making it less likely we
978 * act on an unlikely task<->page relation.
979 */
980 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
981 if (!cpupid_pid_unset(last_cpupid) &&
982 cpupid_to_nid(last_cpupid) != dst_nid)
983 return false;
984
985 /* Always allow migrate on private faults */
986 if (cpupid_match_pid(p, last_cpupid))
987 return true;
988
989 /* A shared fault, but p->numa_group has not been set up yet. */
990 if (!ng)
991 return true;
992
993 /*
994 * Do not migrate if the destination is not a node that
995 * is actively used by this numa group.
996 */
997 if (!node_isset(dst_nid, ng->active_nodes))
998 return false;
999
1000 /*
1001 * Source is a node that is not actively used by this
1002 * numa group, while the destination is. Migrate.
1003 */
1004 if (!node_isset(src_nid, ng->active_nodes))
1005 return true;
1006
1007 /*
1008 * Both source and destination are nodes in active
1009 * use by this numa group. Maximize memory bandwidth
1010 * by migrating from more heavily used groups, to less
1011 * heavily used ones, spreading the load around.
1012 * Use a 1/4 hysteresis to avoid spurious page movement.
1013 */
1014 return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
1015}
1016
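
To make the final hysteresis above concrete: between two nodes that are both in the group's active set, a page only migrates when the destination node has noticeably fewer group faults than the source. A minimal user-space sketch with invented fault counts (not kernel code, reduced to the one comparison):

/* Standalone model of the 3/4 hysteresis above; fault counts are invented. */
#include <stdio.h>
#include <stdbool.h>

static bool migrate_between_active_nodes(unsigned long src_faults,
                                         unsigned long dst_faults)
{
        /* Migrate only if the destination is markedly less loaded. */
        return dst_faults < src_faults * 3 / 4;
}

int main(void)
{
        /* 290 < 400*3/4 == 300: migrate; 310 is within the hysteresis: stay. */
        printf("%d\n", migrate_between_active_nodes(400, 290));  /* 1 */
        printf("%d\n", migrate_between_active_nodes(400, 310));  /* 0 */
        return 0;
}

The 1/4 margin is what keeps a roughly balanced workload from ping-ponging pages between two equally busy nodes.
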
957static unsigned long weighted_cpuload(const int cpu); 1017static unsigned long weighted_cpuload(const int cpu);
958static unsigned long source_load(int cpu, int type); 1018static unsigned long source_load(int cpu, int type);
959static unsigned long target_load(int cpu, int type); 1019static unsigned long target_load(int cpu, int type);
@@ -1267,7 +1327,7 @@ static int task_numa_migrate(struct task_struct *p)
1267static void numa_migrate_preferred(struct task_struct *p) 1327static void numa_migrate_preferred(struct task_struct *p)
1268{ 1328{
1269 /* This task has no NUMA fault statistics yet */ 1329 /* This task has no NUMA fault statistics yet */
1270 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) 1330 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
1271 return; 1331 return;
1272 1332
1273 /* Periodically retry migrating the task to the preferred node */ 1333 /* Periodically retry migrating the task to the preferred node */
@@ -1282,6 +1342,38 @@ static void numa_migrate_preferred(struct task_struct *p)
1282} 1342}
1283 1343
1284/* 1344/*
1345 * Find the nodes on which the workload is actively running. We do this by
1346 * tracking the nodes from which NUMA hinting faults are triggered. This can
1347 * be different from the set of nodes where the workload's memory is currently
1348 * located.
1349 *
1350 * The bitmask is used to make smarter decisions on when to do NUMA page
 1351 * migrations. To prevent flip-flopping and excessive page migrations, nodes
1352 * are added when they cause over 6/16 of the maximum number of faults, but
1353 * only removed when they drop below 3/16.
1354 */
1355static void update_numa_active_node_mask(struct numa_group *numa_group)
1356{
1357 unsigned long faults, max_faults = 0;
1358 int nid;
1359
1360 for_each_online_node(nid) {
1361 faults = group_faults_cpu(numa_group, nid);
1362 if (faults > max_faults)
1363 max_faults = faults;
1364 }
1365
1366 for_each_online_node(nid) {
1367 faults = group_faults_cpu(numa_group, nid);
1368 if (!node_isset(nid, numa_group->active_nodes)) {
1369 if (faults > max_faults * 6 / 16)
1370 node_set(nid, numa_group->active_nodes);
1371 } else if (faults < max_faults * 3 / 16)
1372 node_clear(nid, numa_group->active_nodes);
1373 }
1374}
1375
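
The add/remove thresholds above are deliberately asymmetric. A small stand-alone model of the same pass, with an invented four-node system whose per-node CPU fault counts peak at 1600, so a node is added above 600 faults and only dropped below 300:

#include <stdio.h>

#define NR_NODES 4

int main(void)
{
        /* Invented per-node CPU fault counts; the maximum is 1600. */
        unsigned long faults[NR_NODES] = { 1600, 700, 250, 500 };
        int active[NR_NODES] = { 1, 0, 1, 0 };   /* current active mask */
        unsigned long max_faults = 0;
        int nid;

        for (nid = 0; nid < NR_NODES; nid++)
                if (faults[nid] > max_faults)
                        max_faults = faults[nid];

        for (nid = 0; nid < NR_NODES; nid++) {
                if (!active[nid]) {
                        if (faults[nid] > max_faults * 6 / 16)   /* > 600: add */
                                active[nid] = 1;
                } else if (faults[nid] < max_faults * 3 / 16)    /* < 300: drop */
                        active[nid] = 0;
        }

        /* node 1 (700 > 600) is added, node 2 (250 < 300) is dropped. */
        for (nid = 0; nid < NR_NODES; nid++)
                printf("node %d active=%d\n", nid, active[nid]);
        return 0;
}
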
1376/*
1285 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS 1377 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1286 * increments. The more local the fault statistics are, the higher the scan 1378 * increments. The more local the fault statistics are, the higher the scan
1287 * period will be for the next scan window. If local/remote ratio is below 1379 * period will be for the next scan window. If local/remote ratio is below
@@ -1355,11 +1447,41 @@ static void update_task_scan_period(struct task_struct *p,
1355 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 1447 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1356} 1448}
1357 1449
1450/*
1451 * Get the fraction of time the task has been running since the last
1452 * NUMA placement cycle. The scheduler keeps similar statistics, but
1453 * decays those on a 32ms period, which is orders of magnitude off
1454 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1455 * stats only if the task is so new there are no NUMA statistics yet.
1456 */
1457static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1458{
1459 u64 runtime, delta, now;
1460 /* Use the start of this time slice to avoid calculations. */
1461 now = p->se.exec_start;
1462 runtime = p->se.sum_exec_runtime;
1463
1464 if (p->last_task_numa_placement) {
1465 delta = runtime - p->last_sum_exec_runtime;
1466 *period = now - p->last_task_numa_placement;
1467 } else {
1468 delta = p->se.avg.runnable_avg_sum;
1469 *period = p->se.avg.runnable_avg_period;
1470 }
1471
1472 p->last_sum_exec_runtime = runtime;
1473 p->last_task_numa_placement = now;
1474
1475 return delta;
1476}
1477
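
Roughly, numa_get_avg_runtime() returns how much CPU time the task consumed and, via *period, the wall-clock span it was measured over, falling back to the decayed runnable averages for tasks with no placement history. A simplified stand-alone sketch (illustrative struct and nanosecond values, not the kernel types; the new-task fallback is stubbed out):

#include <stdio.h>

/* Illustrative stand-in for the handful of task fields used above. */
struct task_sample {
        unsigned long long sum_exec_runtime;          /* total CPU time, ns */
        unsigned long long exec_start;                /* start of current slice, ns */
        unsigned long long last_sum_exec_runtime;
        unsigned long long last_task_numa_placement;
};

static unsigned long long avg_runtime(struct task_sample *p,
                                      unsigned long long *period)
{
        unsigned long long delta;

        if (p->last_task_numa_placement) {
                delta = p->sum_exec_runtime - p->last_sum_exec_runtime;
                *period = p->exec_start - p->last_task_numa_placement;
        } else {
                /* New task: no history yet; the kernel uses the decayed
                 * runnable averages here instead (omitted in this sketch). */
                delta = 0;
                *period = 1;
        }
        p->last_sum_exec_runtime = p->sum_exec_runtime;
        p->last_task_numa_placement = p->exec_start;
        return delta;
}

int main(void)
{
        struct task_sample t = { 5000000000ULL, 12000000000ULL,
                                 3000000000ULL,  4000000000ULL };
        unsigned long long period;
        unsigned long long runtime = avg_runtime(&t, &period);

        /* Ran 2s of CPU time over an 8s window since the last placement. */
        printf("runtime=%llu period=%llu\n", runtime, period);
        return 0;
}
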
1358static void task_numa_placement(struct task_struct *p) 1478static void task_numa_placement(struct task_struct *p)
1359{ 1479{
1360 int seq, nid, max_nid = -1, max_group_nid = -1; 1480 int seq, nid, max_nid = -1, max_group_nid = -1;
1361 unsigned long max_faults = 0, max_group_faults = 0; 1481 unsigned long max_faults = 0, max_group_faults = 0;
1362 unsigned long fault_types[2] = { 0, 0 }; 1482 unsigned long fault_types[2] = { 0, 0 };
1483 unsigned long total_faults;
1484 u64 runtime, period;
1363 spinlock_t *group_lock = NULL; 1485 spinlock_t *group_lock = NULL;
1364 1486
1365 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1487 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -1368,6 +1490,10 @@ static void task_numa_placement(struct task_struct *p)
1368 p->numa_scan_seq = seq; 1490 p->numa_scan_seq = seq;
1369 p->numa_scan_period_max = task_scan_max(p); 1491 p->numa_scan_period_max = task_scan_max(p);
1370 1492
1493 total_faults = p->numa_faults_locality[0] +
1494 p->numa_faults_locality[1];
1495 runtime = numa_get_avg_runtime(p, &period);
1496
1371 /* If the task is part of a group prevent parallel updates to group stats */ 1497 /* If the task is part of a group prevent parallel updates to group stats */
1372 if (p->numa_group) { 1498 if (p->numa_group) {
1373 group_lock = &p->numa_group->lock; 1499 group_lock = &p->numa_group->lock;
@@ -1379,24 +1505,37 @@ static void task_numa_placement(struct task_struct *p)
1379 unsigned long faults = 0, group_faults = 0; 1505 unsigned long faults = 0, group_faults = 0;
1380 int priv, i; 1506 int priv, i;
1381 1507
1382 for (priv = 0; priv < 2; priv++) { 1508 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1383 long diff; 1509 long diff, f_diff, f_weight;
1384 1510
1385 i = task_faults_idx(nid, priv); 1511 i = task_faults_idx(nid, priv);
1386 diff = -p->numa_faults[i];
1387 1512
1388 /* Decay existing window, copy faults since last scan */ 1513 /* Decay existing window, copy faults since last scan */
1389 p->numa_faults[i] >>= 1; 1514 diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
1390 p->numa_faults[i] += p->numa_faults_buffer[i]; 1515 fault_types[priv] += p->numa_faults_buffer_memory[i];
1391 fault_types[priv] += p->numa_faults_buffer[i]; 1516 p->numa_faults_buffer_memory[i] = 0;
1392 p->numa_faults_buffer[i] = 0;
1393 1517
1394 faults += p->numa_faults[i]; 1518 /*
 1395 diff += p->numa_faults[i]; 1519 * Normalize the CPU faults, so all tasks in a group
1520 * count according to CPU use, instead of by the raw
1521 * number of faults. Tasks with little runtime have
1522 * little over-all impact on throughput, and thus their
1523 * faults are less important.
1524 */
1525 f_weight = div64_u64(runtime << 16, period + 1);
1526 f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
1527 (total_faults + 1);
1528 f_diff = f_weight - p->numa_faults_cpu[i] / 2;
1529 p->numa_faults_buffer_cpu[i] = 0;
1530
1531 p->numa_faults_memory[i] += diff;
1532 p->numa_faults_cpu[i] += f_diff;
1533 faults += p->numa_faults_memory[i];
1396 p->total_numa_faults += diff; 1534 p->total_numa_faults += diff;
1397 if (p->numa_group) { 1535 if (p->numa_group) {
1398 /* safe because we can only change our own group */ 1536 /* safe because we can only change our own group */
1399 p->numa_group->faults[i] += diff; 1537 p->numa_group->faults[i] += diff;
1538 p->numa_group->faults_cpu[i] += f_diff;
1400 p->numa_group->total_faults += diff; 1539 p->numa_group->total_faults += diff;
1401 group_faults += p->numa_group->faults[i]; 1540 group_faults += p->numa_group->faults[i];
1402 } 1541 }
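
The normalization introduced above scales a task's buffered CPU faults by the fraction of the period it actually ran, in 16.16 fixed point. Back-of-the-envelope with invented numbers: 2s of runtime over an 8s window gives an f_weight of roughly a quarter (about 16383), so a mostly idle task contributes only a fraction of its raw CPU faults:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Invented sample: 2s of runtime in an 8s window, 200 buffered
         * CPU faults on this node out of 1000 total hinting faults. */
        uint64_t runtime = 2000000000ULL, period = 8000000000ULL;
        uint64_t buffer_cpu = 200, total_faults = 1000;

        uint64_t f_weight = (runtime << 16) / (period + 1);     /* ~0.25 in 16.16 */
        f_weight = f_weight * buffer_cpu / (total_faults + 1);  /* ~3273 */

        printf("f_weight=%llu\n", (unsigned long long)f_weight);
        return 0;
}
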
@@ -1416,6 +1555,7 @@ static void task_numa_placement(struct task_struct *p)
1416 update_task_scan_period(p, fault_types[0], fault_types[1]); 1555 update_task_scan_period(p, fault_types[0], fault_types[1]);
1417 1556
1418 if (p->numa_group) { 1557 if (p->numa_group) {
1558 update_numa_active_node_mask(p->numa_group);
1419 /* 1559 /*
1420 * If the preferred task and group nids are different, 1560 * If the preferred task and group nids are different,
1421 * iterate over the nodes again to find the best place. 1561 * iterate over the nodes again to find the best place.
@@ -1465,7 +1605,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1465 1605
1466 if (unlikely(!p->numa_group)) { 1606 if (unlikely(!p->numa_group)) {
1467 unsigned int size = sizeof(struct numa_group) + 1607 unsigned int size = sizeof(struct numa_group) +
1468 2*nr_node_ids*sizeof(unsigned long); 1608 4*nr_node_ids*sizeof(unsigned long);
1469 1609
1470 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); 1610 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1471 if (!grp) 1611 if (!grp)
@@ -1475,9 +1615,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1475 spin_lock_init(&grp->lock); 1615 spin_lock_init(&grp->lock);
1476 INIT_LIST_HEAD(&grp->task_list); 1616 INIT_LIST_HEAD(&grp->task_list);
1477 grp->gid = p->pid; 1617 grp->gid = p->pid;
1618 /* Second half of the array tracks nids where faults happen */
1619 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
1620 nr_node_ids;
1621
1622 node_set(task_node(current), grp->active_nodes);
1478 1623
1479 for (i = 0; i < 2*nr_node_ids; i++) 1624 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1480 grp->faults[i] = p->numa_faults[i]; 1625 grp->faults[i] = p->numa_faults_memory[i];
1481 1626
1482 grp->total_faults = p->total_numa_faults; 1627 grp->total_faults = p->total_numa_faults;
1483 1628
@@ -1534,9 +1679,9 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1534 1679
1535 double_lock(&my_grp->lock, &grp->lock); 1680 double_lock(&my_grp->lock, &grp->lock);
1536 1681
1537 for (i = 0; i < 2*nr_node_ids; i++) { 1682 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
1538 my_grp->faults[i] -= p->numa_faults[i]; 1683 my_grp->faults[i] -= p->numa_faults_memory[i];
1539 grp->faults[i] += p->numa_faults[i]; 1684 grp->faults[i] += p->numa_faults_memory[i];
1540 } 1685 }
1541 my_grp->total_faults -= p->total_numa_faults; 1686 my_grp->total_faults -= p->total_numa_faults;
1542 grp->total_faults += p->total_numa_faults; 1687 grp->total_faults += p->total_numa_faults;
@@ -1562,12 +1707,12 @@ void task_numa_free(struct task_struct *p)
1562{ 1707{
1563 struct numa_group *grp = p->numa_group; 1708 struct numa_group *grp = p->numa_group;
1564 int i; 1709 int i;
1565 void *numa_faults = p->numa_faults; 1710 void *numa_faults = p->numa_faults_memory;
1566 1711
1567 if (grp) { 1712 if (grp) {
1568 spin_lock(&grp->lock); 1713 spin_lock(&grp->lock);
1569 for (i = 0; i < 2*nr_node_ids; i++) 1714 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1570 grp->faults[i] -= p->numa_faults[i]; 1715 grp->faults[i] -= p->numa_faults_memory[i];
1571 grp->total_faults -= p->total_numa_faults; 1716 grp->total_faults -= p->total_numa_faults;
1572 1717
1573 list_del(&p->numa_entry); 1718 list_del(&p->numa_entry);
@@ -1577,18 +1722,21 @@ void task_numa_free(struct task_struct *p)
1577 put_numa_group(grp); 1722 put_numa_group(grp);
1578 } 1723 }
1579 1724
1580 p->numa_faults = NULL; 1725 p->numa_faults_memory = NULL;
1581 p->numa_faults_buffer = NULL; 1726 p->numa_faults_buffer_memory = NULL;
 1727 p->numa_faults_cpu = NULL;
1728 p->numa_faults_buffer_cpu = NULL;
1582 kfree(numa_faults); 1729 kfree(numa_faults);
1583} 1730}
1584 1731
1585/* 1732/*
1586 * Got a PROT_NONE fault for a page on @node. 1733 * Got a PROT_NONE fault for a page on @node.
1587 */ 1734 */
1588void task_numa_fault(int last_cpupid, int node, int pages, int flags) 1735void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1589{ 1736{
1590 struct task_struct *p = current; 1737 struct task_struct *p = current;
1591 bool migrated = flags & TNF_MIGRATED; 1738 bool migrated = flags & TNF_MIGRATED;
1739 int cpu_node = task_node(current);
1592 int priv; 1740 int priv;
1593 1741
1594 if (!numabalancing_enabled) 1742 if (!numabalancing_enabled)
@@ -1603,16 +1751,24 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1603 return; 1751 return;
1604 1752
1605 /* Allocate buffer to track faults on a per-node basis */ 1753 /* Allocate buffer to track faults on a per-node basis */
1606 if (unlikely(!p->numa_faults)) { 1754 if (unlikely(!p->numa_faults_memory)) {
1607 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; 1755 int size = sizeof(*p->numa_faults_memory) *
1756 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
1608 1757
1609 /* numa_faults and numa_faults_buffer share the allocation */ 1758 p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
1610 p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); 1759 if (!p->numa_faults_memory)
1611 if (!p->numa_faults)
1612 return; 1760 return;
1613 1761
1614 BUG_ON(p->numa_faults_buffer); 1762 BUG_ON(p->numa_faults_buffer_memory);
1615 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); 1763 /*
1764 * The averaged statistics, shared & private, memory & cpu,
1765 * occupy the first half of the array. The second half of the
1766 * array is for current counters, which are averaged into the
1767 * first set by task_numa_placement.
1768 */
1769 p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
1770 p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
1771 p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
1616 p->total_numa_faults = 0; 1772 p->total_numa_faults = 0;
1617 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 1773 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1618 } 1774 }
@@ -1641,7 +1797,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1641 if (migrated) 1797 if (migrated)
1642 p->numa_pages_migrated += pages; 1798 p->numa_pages_migrated += pages;
1643 1799
1644 p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; 1800 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
1801 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
1645 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; 1802 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
1646} 1803}
1647 1804
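
The single allocation above now holds four equal quarters: averaged memory faults, averaged CPU faults, and the two corresponding per-scan buffers. A short sketch of the pointer arithmetic (nr_node_ids is an invented value here; the kernel derives it from the node map):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        int nr_node_ids = 4;    /* invented node count */
        /* 2 fault types (priv/shared) * 2 stats (mem/cpu) * 2 (avg/buffer) */
        size_t entries = 8UL * nr_node_ids;
        unsigned long *numa_faults_memory = calloc(entries, sizeof(unsigned long));
        unsigned long *numa_faults_cpu, *buf_memory, *buf_cpu;

        if (!numa_faults_memory)
                return 1;

        numa_faults_cpu = numa_faults_memory + 2 * nr_node_ids;
        buf_memory      = numa_faults_memory + 4 * nr_node_ids;
        buf_cpu         = numa_faults_memory + 6 * nr_node_ids;

        printf("quarter size = %d entries each\n", 2 * nr_node_ids);
        printf("cpu offset %td, buf_mem %td, buf_cpu %td\n",
               numa_faults_cpu - numa_faults_memory,
               buf_memory - numa_faults_memory,
               buf_cpu - numa_faults_memory);
        free(numa_faults_memory);
        return 0;
}
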
@@ -1757,6 +1914,8 @@ void task_numa_work(struct callback_head *work)
1757 start = end; 1914 start = end;
1758 if (pages <= 0) 1915 if (pages <= 0)
1759 goto out; 1916 goto out;
1917
1918 cond_resched();
1760 } while (end != vma->vm_end); 1919 } while (end != vma->vm_end);
1761 } 1920 }
1762 1921
@@ -2217,13 +2376,20 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
2217 se->avg.load_avg_contrib >>= NICE_0_SHIFT; 2376 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
2218 } 2377 }
2219} 2378}
2220#else 2379
2380static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2381{
2382 __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
2383 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
2384}
2385#else /* CONFIG_FAIR_GROUP_SCHED */
2221static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, 2386static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2222 int force_update) {} 2387 int force_update) {}
2223static inline void __update_tg_runnable_avg(struct sched_avg *sa, 2388static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2224 struct cfs_rq *cfs_rq) {} 2389 struct cfs_rq *cfs_rq) {}
2225static inline void __update_group_entity_contrib(struct sched_entity *se) {} 2390static inline void __update_group_entity_contrib(struct sched_entity *se) {}
2226#endif 2391static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2392#endif /* CONFIG_FAIR_GROUP_SCHED */
2227 2393
2228static inline void __update_task_entity_contrib(struct sched_entity *se) 2394static inline void __update_task_entity_contrib(struct sched_entity *se)
2229{ 2395{
@@ -2321,12 +2487,6 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
2321 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); 2487 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
2322} 2488}
2323 2489
2324static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2325{
2326 __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
2327 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
2328}
2329
2330/* Add the load generated by se into cfs_rq's child load-average */ 2490/* Add the load generated by se into cfs_rq's child load-average */
2331static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, 2491static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2332 struct sched_entity *se, 2492 struct sched_entity *se,
@@ -2414,7 +2574,10 @@ void idle_exit_fair(struct rq *this_rq)
2414 update_rq_runnable_avg(this_rq, 0); 2574 update_rq_runnable_avg(this_rq, 0);
2415} 2575}
2416 2576
2417#else 2577static int idle_balance(struct rq *this_rq);
2578
2579#else /* CONFIG_SMP */
2580
2418static inline void update_entity_load_avg(struct sched_entity *se, 2581static inline void update_entity_load_avg(struct sched_entity *se,
2419 int update_cfs_rq) {} 2582 int update_cfs_rq) {}
2420static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} 2583static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
@@ -2426,7 +2589,13 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2426 int sleep) {} 2589 int sleep) {}
2427static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, 2590static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
2428 int force_update) {} 2591 int force_update) {}
2429#endif 2592
2593static inline int idle_balance(struct rq *rq)
2594{
2595 return 0;
2596}
2597
2598#endif /* CONFIG_SMP */
2430 2599
2431static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 2600static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2432{ 2601{
@@ -2576,10 +2745,10 @@ static void __clear_buddies_last(struct sched_entity *se)
2576{ 2745{
2577 for_each_sched_entity(se) { 2746 for_each_sched_entity(se) {
2578 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2747 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2579 if (cfs_rq->last == se) 2748 if (cfs_rq->last != se)
2580 cfs_rq->last = NULL;
2581 else
2582 break; 2749 break;
2750
2751 cfs_rq->last = NULL;
2583 } 2752 }
2584} 2753}
2585 2754
@@ -2587,10 +2756,10 @@ static void __clear_buddies_next(struct sched_entity *se)
2587{ 2756{
2588 for_each_sched_entity(se) { 2757 for_each_sched_entity(se) {
2589 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2758 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2590 if (cfs_rq->next == se) 2759 if (cfs_rq->next != se)
2591 cfs_rq->next = NULL;
2592 else
2593 break; 2760 break;
2761
2762 cfs_rq->next = NULL;
2594 } 2763 }
2595} 2764}
2596 2765
@@ -2598,10 +2767,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
2598{ 2767{
2599 for_each_sched_entity(se) { 2768 for_each_sched_entity(se) {
2600 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2769 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2601 if (cfs_rq->skip == se) 2770 if (cfs_rq->skip != se)
2602 cfs_rq->skip = NULL;
2603 else
2604 break; 2771 break;
2772
2773 cfs_rq->skip = NULL;
2605 } 2774 }
2606} 2775}
2607 2776
@@ -2744,17 +2913,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
2744 * 3) pick the "last" process, for cache locality 2913 * 3) pick the "last" process, for cache locality
2745 * 4) do not run the "skip" process, if something else is available 2914 * 4) do not run the "skip" process, if something else is available
2746 */ 2915 */
2747static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 2916static struct sched_entity *
2917pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2748{ 2918{
2749 struct sched_entity *se = __pick_first_entity(cfs_rq); 2919 struct sched_entity *left = __pick_first_entity(cfs_rq);
2750 struct sched_entity *left = se; 2920 struct sched_entity *se;
2921
2922 /*
2923 * If curr is set we have to see if its left of the leftmost entity
2924 * still in the tree, provided there was anything in the tree at all.
2925 */
2926 if (!left || (curr && entity_before(curr, left)))
2927 left = curr;
2928
2929 se = left; /* ideally we run the leftmost entity */
2751 2930
2752 /* 2931 /*
2753 * Avoid running the skip buddy, if running something else can 2932 * Avoid running the skip buddy, if running something else can
2754 * be done without getting too unfair. 2933 * be done without getting too unfair.
2755 */ 2934 */
2756 if (cfs_rq->skip == se) { 2935 if (cfs_rq->skip == se) {
2757 struct sched_entity *second = __pick_next_entity(se); 2936 struct sched_entity *second;
2937
2938 if (se == curr) {
2939 second = __pick_first_entity(cfs_rq);
2940 } else {
2941 second = __pick_next_entity(se);
2942 if (!second || (curr && entity_before(curr, second)))
2943 second = curr;
2944 }
2945
2758 if (second && wakeup_preempt_entity(second, left) < 1) 2946 if (second && wakeup_preempt_entity(second, left) < 1)
2759 se = second; 2947 se = second;
2760 } 2948 }
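
Because the currently running entity is no longer kept in the rbtree, the reworked pick_next_entity() above has to weigh cfs_rq->curr against the leftmost queued entity by hand. A toy comparison using the same wrap-safe vruntime ordering (numbers invented):

#include <stdio.h>

struct ent { const char *name; unsigned long long vruntime; };

/* entity_before(): the smaller vruntime runs first, wrap-safe. */
static int before(const struct ent *a, const struct ent *b)
{
        return (long long)(a->vruntime - b->vruntime) < 0;
}

int main(void)
{
        struct ent leftmost = { "queued-left", 1000 };
        struct ent curr     = { "curr",         940 };
        const struct ent *left = &leftmost;

        /* If curr is still runnable and further left, prefer it. */
        if (before(&curr, left))
                left = &curr;

        printf("pick %s\n", left->name);        /* curr: 940 < 1000 */
        return 0;
}
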
@@ -2776,7 +2964,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
2776 return se; 2964 return se;
2777} 2965}
2778 2966
2779static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); 2967static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2780 2968
2781static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 2969static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
2782{ 2970{
@@ -3431,22 +3619,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3431} 3619}
3432 3620
3433/* conditionally throttle active cfs_rq's from put_prev_entity() */ 3621/* conditionally throttle active cfs_rq's from put_prev_entity() */
3434static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 3622static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3435{ 3623{
3436 if (!cfs_bandwidth_used()) 3624 if (!cfs_bandwidth_used())
3437 return; 3625 return false;
3438 3626
3439 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) 3627 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
3440 return; 3628 return false;
3441 3629
3442 /* 3630 /*
3443 * it's possible for a throttled entity to be forced into a running 3631 * it's possible for a throttled entity to be forced into a running
3444 * state (e.g. set_curr_task), in this case we're finished. 3632 * state (e.g. set_curr_task), in this case we're finished.
3445 */ 3633 */
3446 if (cfs_rq_throttled(cfs_rq)) 3634 if (cfs_rq_throttled(cfs_rq))
3447 return; 3635 return true;
3448 3636
3449 throttle_cfs_rq(cfs_rq); 3637 throttle_cfs_rq(cfs_rq);
3638 return true;
3450} 3639}
3451 3640
3452static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) 3641static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -3556,7 +3745,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3556} 3745}
3557 3746
3558static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} 3747static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
3559static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3748static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
3560static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 3749static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
3561static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3750static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3562 3751
@@ -4211,13 +4400,14 @@ done:
4211} 4400}
4212 4401
4213/* 4402/*
4214 * sched_balance_self: balance the current task (running on cpu) in domains 4403 * select_task_rq_fair: Select target runqueue for the waking task in domains
4215 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and 4404 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
4216 * SD_BALANCE_EXEC. 4405 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
4217 * 4406 *
4218 * Balance, ie. select the least loaded group. 4407 * Balances load by selecting the idlest cpu in the idlest group, or under
4408 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
4219 * 4409 *
4220 * Returns the target CPU number, or the same CPU if no balancing is needed. 4410 * Returns the target cpu number.
4221 * 4411 *
4222 * preempt must be disabled. 4412 * preempt must be disabled.
4223 */ 4413 */
@@ -4492,26 +4682,124 @@ preempt:
4492 set_last_buddy(se); 4682 set_last_buddy(se);
4493} 4683}
4494 4684
4495static struct task_struct *pick_next_task_fair(struct rq *rq) 4685static struct task_struct *
4686pick_next_task_fair(struct rq *rq, struct task_struct *prev)
4496{ 4687{
4497 struct task_struct *p;
4498 struct cfs_rq *cfs_rq = &rq->cfs; 4688 struct cfs_rq *cfs_rq = &rq->cfs;
4499 struct sched_entity *se; 4689 struct sched_entity *se;
4690 struct task_struct *p;
4691 int new_tasks;
4500 4692
4693again:
4694#ifdef CONFIG_FAIR_GROUP_SCHED
4501 if (!cfs_rq->nr_running) 4695 if (!cfs_rq->nr_running)
4502 return NULL; 4696 goto idle;
4697
4698 if (prev->sched_class != &fair_sched_class)
4699 goto simple;
4700
4701 /*
4702 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
4703 * likely that a next task is from the same cgroup as the current.
4704 *
4705 * Therefore attempt to avoid putting and setting the entire cgroup
4706 * hierarchy, only change the part that actually changes.
4707 */
4503 4708
4504 do { 4709 do {
4505 se = pick_next_entity(cfs_rq); 4710 struct sched_entity *curr = cfs_rq->curr;
4711
4712 /*
4713 * Since we got here without doing put_prev_entity() we also
4714 * have to consider cfs_rq->curr. If it is still a runnable
4715 * entity, update_curr() will update its vruntime, otherwise
4716 * forget we've ever seen it.
4717 */
4718 if (curr && curr->on_rq)
4719 update_curr(cfs_rq);
4720 else
4721 curr = NULL;
4722
4723 /*
4724 * This call to check_cfs_rq_runtime() will do the throttle and
4725 * dequeue its entity in the parent(s). Therefore the 'simple'
4726 * nr_running test will indeed be correct.
4727 */
4728 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
4729 goto simple;
4730
4731 se = pick_next_entity(cfs_rq, curr);
4732 cfs_rq = group_cfs_rq(se);
4733 } while (cfs_rq);
4734
4735 p = task_of(se);
4736
4737 /*
 4738 * Since we haven't yet done put_prev_entity(), if the selected task
 4739 * differs from the one we started out with, try to touch the
 4740 * fewest cfs_rqs possible.
4741 */
4742 if (prev != p) {
4743 struct sched_entity *pse = &prev->se;
4744
4745 while (!(cfs_rq = is_same_group(se, pse))) {
4746 int se_depth = se->depth;
4747 int pse_depth = pse->depth;
4748
4749 if (se_depth <= pse_depth) {
4750 put_prev_entity(cfs_rq_of(pse), pse);
4751 pse = parent_entity(pse);
4752 }
4753 if (se_depth >= pse_depth) {
4754 set_next_entity(cfs_rq_of(se), se);
4755 se = parent_entity(se);
4756 }
4757 }
4758
4759 put_prev_entity(cfs_rq, pse);
4760 set_next_entity(cfs_rq, se);
4761 }
4762
4763 if (hrtick_enabled(rq))
4764 hrtick_start_fair(rq, p);
4765
4766 return p;
4767simple:
4768 cfs_rq = &rq->cfs;
4769#endif
4770
4771 if (!cfs_rq->nr_running)
4772 goto idle;
4773
4774 put_prev_task(rq, prev);
4775
4776 do {
4777 se = pick_next_entity(cfs_rq, NULL);
4506 set_next_entity(cfs_rq, se); 4778 set_next_entity(cfs_rq, se);
4507 cfs_rq = group_cfs_rq(se); 4779 cfs_rq = group_cfs_rq(se);
4508 } while (cfs_rq); 4780 } while (cfs_rq);
4509 4781
4510 p = task_of(se); 4782 p = task_of(se);
4783
4511 if (hrtick_enabled(rq)) 4784 if (hrtick_enabled(rq))
4512 hrtick_start_fair(rq, p); 4785 hrtick_start_fair(rq, p);
4513 4786
4514 return p; 4787 return p;
4788
4789idle:
4790 new_tasks = idle_balance(rq);
4791 /*
4792 * Because idle_balance() releases (and re-acquires) rq->lock, it is
4793 * possible for any higher priority task to appear. In that case we
4794 * must re-start the pick_next_entity() loop.
4795 */
4796 if (new_tasks < 0)
4797 return RETRY_TASK;
4798
4799 if (new_tasks > 0)
4800 goto again;
4801
4802 return NULL;
4515} 4803}
4516 4804
4517/* 4805/*
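
The while (!(cfs_rq = is_same_group(se, pse))) loop above walks the incoming and outgoing entities up the hierarchy until they share a cfs_rq, so only the differing part of the cgroup tree gets put/set. A toy model with explicit parent pointers and a made-up three-entity hierarchy (printf stands in for put_prev_entity()/set_next_entity()):

#include <stdio.h>

struct se { const char *name; int depth; struct se *parent; };

/* In this toy, "same group" means the two entities share a parent. */
static int same_group(struct se *a, struct se *b)
{
        return a->parent == b->parent;
}

int main(void)
{
        /* Invented hierarchy: A1 sits under A; B is top-level. */
        struct se A  = { "A",  0, NULL };
        struct se A1 = { "A1", 1, &A };
        struct se B  = { "B",  0, NULL };
        struct se *se = &A1, *pse = &B;         /* next task vs. prev task */

        while (!same_group(se, pse)) {
                int sd = se->depth, pd = pse->depth;

                if (sd <= pd) {                 /* put prev's entity, go up */
                        printf("put_prev_entity(%s)\n", pse->name);
                        pse = pse->parent;
                }
                if (sd >= pd) {                 /* set next's entity, go up */
                        printf("set_next_entity(%s)\n", se->name);
                        se = se->parent;
                }
        }
        printf("put_prev_entity(%s); set_next_entity(%s)\n",
               pse->name, se->name);
        return 0;
}
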
@@ -4749,7 +5037,7 @@ static void move_task(struct task_struct *p, struct lb_env *env)
4749 * Is this task likely cache-hot: 5037 * Is this task likely cache-hot:
4750 */ 5038 */
4751static int 5039static int
4752task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 5040task_hot(struct task_struct *p, u64 now)
4753{ 5041{
4754 s64 delta; 5042 s64 delta;
4755 5043
@@ -4783,7 +5071,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4783{ 5071{
4784 int src_nid, dst_nid; 5072 int src_nid, dst_nid;
4785 5073
4786 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || 5074 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
4787 !(env->sd->flags & SD_NUMA)) { 5075 !(env->sd->flags & SD_NUMA)) {
4788 return false; 5076 return false;
4789 } 5077 }
@@ -4814,7 +5102,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4814 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5102 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4815 return false; 5103 return false;
4816 5104
4817 if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) 5105 if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
4818 return false; 5106 return false;
4819 5107
4820 src_nid = cpu_to_node(env->src_cpu); 5108 src_nid = cpu_to_node(env->src_cpu);
@@ -4910,7 +5198,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4910 * 2) task is cache cold, or 5198 * 2) task is cache cold, or
4911 * 3) too many balance attempts have failed. 5199 * 3) too many balance attempts have failed.
4912 */ 5200 */
4913 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); 5201 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));
4914 if (!tsk_cache_hot) 5202 if (!tsk_cache_hot)
4915 tsk_cache_hot = migrate_degrades_locality(p, env); 5203 tsk_cache_hot = migrate_degrades_locality(p, env);
4916 5204
@@ -5773,12 +6061,10 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
5773 pwr_now /= SCHED_POWER_SCALE; 6061 pwr_now /= SCHED_POWER_SCALE;
5774 6062
5775 /* Amount of load we'd subtract */ 6063 /* Amount of load we'd subtract */
5776 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / 6064 if (busiest->avg_load > scaled_busy_load_per_task) {
5777 busiest->group_power;
5778 if (busiest->avg_load > tmp) {
5779 pwr_move += busiest->group_power * 6065 pwr_move += busiest->group_power *
5780 min(busiest->load_per_task, 6066 min(busiest->load_per_task,
5781 busiest->avg_load - tmp); 6067 busiest->avg_load - scaled_busy_load_per_task);
5782 } 6068 }
5783 6069
5784 /* Amount of load we'd add */ 6070 /* Amount of load we'd add */
@@ -6357,17 +6643,23 @@ out:
6357 * idle_balance is called by schedule() if this_cpu is about to become 6643 * idle_balance is called by schedule() if this_cpu is about to become
6358 * idle. Attempts to pull tasks from other CPUs. 6644 * idle. Attempts to pull tasks from other CPUs.
6359 */ 6645 */
6360void idle_balance(int this_cpu, struct rq *this_rq) 6646static int idle_balance(struct rq *this_rq)
6361{ 6647{
6362 struct sched_domain *sd; 6648 struct sched_domain *sd;
6363 int pulled_task = 0; 6649 int pulled_task = 0;
6364 unsigned long next_balance = jiffies + HZ; 6650 unsigned long next_balance = jiffies + HZ;
6365 u64 curr_cost = 0; 6651 u64 curr_cost = 0;
6652 int this_cpu = this_rq->cpu;
6366 6653
6654 idle_enter_fair(this_rq);
6655 /*
6656 * We must set idle_stamp _before_ calling idle_balance(), such that we
6657 * measure the duration of idle_balance() as idle time.
6658 */
6367 this_rq->idle_stamp = rq_clock(this_rq); 6659 this_rq->idle_stamp = rq_clock(this_rq);
6368 6660
6369 if (this_rq->avg_idle < sysctl_sched_migration_cost) 6661 if (this_rq->avg_idle < sysctl_sched_migration_cost)
6370 return; 6662 goto out;
6371 6663
6372 /* 6664 /*
6373 * Drop the rq->lock, but keep IRQ/preempt disabled. 6665 * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6405,15 +6697,22 @@ void idle_balance(int this_cpu, struct rq *this_rq)
6405 interval = msecs_to_jiffies(sd->balance_interval); 6697 interval = msecs_to_jiffies(sd->balance_interval);
6406 if (time_after(next_balance, sd->last_balance + interval)) 6698 if (time_after(next_balance, sd->last_balance + interval))
6407 next_balance = sd->last_balance + interval; 6699 next_balance = sd->last_balance + interval;
6408 if (pulled_task) { 6700 if (pulled_task)
6409 this_rq->idle_stamp = 0;
6410 break; 6701 break;
6411 }
6412 } 6702 }
6413 rcu_read_unlock(); 6703 rcu_read_unlock();
6414 6704
6415 raw_spin_lock(&this_rq->lock); 6705 raw_spin_lock(&this_rq->lock);
6416 6706
6707 /*
6708 * While browsing the domains, we released the rq lock.
 6709 * A task could have been enqueued in the meantime
6710 */
6711 if (this_rq->cfs.h_nr_running && !pulled_task) {
6712 pulled_task = 1;
6713 goto out;
6714 }
6715
6417 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 6716 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
6418 /* 6717 /*
6419 * We are going idle. next_balance may be set based on 6718 * We are going idle. next_balance may be set based on
@@ -6424,6 +6723,20 @@ void idle_balance(int this_cpu, struct rq *this_rq)
6424 6723
6425 if (curr_cost > this_rq->max_idle_balance_cost) 6724 if (curr_cost > this_rq->max_idle_balance_cost)
6426 this_rq->max_idle_balance_cost = curr_cost; 6725 this_rq->max_idle_balance_cost = curr_cost;
6726
6727out:
6728 /* Is there a task of a high priority class? */
6729 if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
6730 (this_rq->dl.dl_nr_running ||
6731 (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
6732 pulled_task = -1;
6733
6734 if (pulled_task) {
6735 idle_exit_fair(this_rq);
6736 this_rq->idle_stamp = 0;
6737 }
6738
6739 return pulled_task;
6427} 6740}
6428 6741
6429/* 6742/*
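
With this change idle_balance() feeds a tri-state result back into the fair pick path: positive means fair tasks were pulled and the pick should be retried, negative means a deadline or RT task appeared while the lock was dropped (the caller returns RETRY_TASK so the class loop restarts), and zero means the CPU really goes idle. A compact sketch of that dispatch, using the RETRY_TASK convention from the pick_next_task_fair() hunk earlier in this patch:

#include <stdio.h>

enum pick_result { PICK_IDLE, PICK_RETRY_FAIR, PICK_RETRY_TASK };

/* Toy stand-in for the idle: label at the end of pick_next_task_fair(). */
static enum pick_result handle_idle(int new_tasks)
{
        if (new_tasks < 0)
                return PICK_RETRY_TASK;  /* higher class woke: restart classes */
        if (new_tasks > 0)
                return PICK_RETRY_FAIR;  /* pulled fair tasks: goto again */
        return PICK_IDLE;                /* nothing to run */
}

int main(void)
{
        printf("%d %d %d\n", handle_idle(2), handle_idle(-1), handle_idle(0));
        return 0;
}
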
@@ -6494,6 +6807,11 @@ out_unlock:
6494 return 0; 6807 return 0;
6495} 6808}
6496 6809
6810static inline int on_null_domain(struct rq *rq)
6811{
6812 return unlikely(!rcu_dereference_sched(rq->sd));
6813}
6814
6497#ifdef CONFIG_NO_HZ_COMMON 6815#ifdef CONFIG_NO_HZ_COMMON
6498/* 6816/*
6499 * idle load balancing details 6817 * idle load balancing details
@@ -6548,8 +6866,13 @@ static void nohz_balancer_kick(void)
6548static inline void nohz_balance_exit_idle(int cpu) 6866static inline void nohz_balance_exit_idle(int cpu)
6549{ 6867{
6550 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 6868 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
6551 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 6869 /*
 6552 atomic_dec(&nohz.nr_cpus); 6870 * Completely isolated CPUs never set the idle_cpus_mask bit, so we must test.
6871 */
6872 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
6873 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
6874 atomic_dec(&nohz.nr_cpus);
6875 }
6553 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 6876 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
6554 } 6877 }
6555} 6878}
@@ -6603,6 +6926,12 @@ void nohz_balance_enter_idle(int cpu)
6603 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) 6926 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
6604 return; 6927 return;
6605 6928
6929 /*
6930 * If we're a completely isolated CPU, we don't play.
6931 */
6932 if (on_null_domain(cpu_rq(cpu)))
6933 return;
6934
6606 cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 6935 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
6607 atomic_inc(&nohz.nr_cpus); 6936 atomic_inc(&nohz.nr_cpus);
6608 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 6937 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
@@ -6865,11 +7194,6 @@ static void run_rebalance_domains(struct softirq_action *h)
6865 nohz_idle_balance(this_rq, idle); 7194 nohz_idle_balance(this_rq, idle);
6866} 7195}
6867 7196
6868static inline int on_null_domain(struct rq *rq)
6869{
6870 return !rcu_dereference_sched(rq->sd);
6871}
6872
6873/* 7197/*
6874 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 7198 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
6875 */ 7199 */
@@ -6999,15 +7323,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
6999 struct cfs_rq *cfs_rq = cfs_rq_of(se); 7323 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7000 7324
7001 /* 7325 /*
7002 * Ensure the task's vruntime is normalized, so that when its 7326 * Ensure the task's vruntime is normalized, so that when it's
7003 * switched back to the fair class the enqueue_entity(.flags=0) will 7327 * switched back to the fair class the enqueue_entity(.flags=0) will
7004 * do the right thing. 7328 * do the right thing.
7005 * 7329 *
7006 * If it was on_rq, then the dequeue_entity(.flags=0) will already 7330 * If it's on_rq, then the dequeue_entity(.flags=0) will already
7007 * have normalized the vruntime, if it was !on_rq, then only when 7331 * have normalized the vruntime, if it's !on_rq, then only when
7008 * the task is sleeping will it still have non-normalized vruntime. 7332 * the task is sleeping will it still have non-normalized vruntime.
7009 */ 7333 */
7010 if (!se->on_rq && p->state != TASK_RUNNING) { 7334 if (!p->on_rq && p->state != TASK_RUNNING) {
7011 /* 7335 /*
7012 * Fix up our vruntime so that the current sleep doesn't 7336 * Fix up our vruntime so that the current sleep doesn't
7013 * cause 'unlimited' sleep bonus. 7337 * cause 'unlimited' sleep bonus.
@@ -7034,7 +7358,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7034 */ 7358 */
7035static void switched_to_fair(struct rq *rq, struct task_struct *p) 7359static void switched_to_fair(struct rq *rq, struct task_struct *p)
7036{ 7360{
7037 if (!p->se.on_rq) 7361 struct sched_entity *se = &p->se;
7362#ifdef CONFIG_FAIR_GROUP_SCHED
7363 /*
7364 * Since the real-depth could have been changed (only FAIR
7365 * class maintain depth value), reset depth properly.
7366 */
7367 se->depth = se->parent ? se->parent->depth + 1 : 0;
7368#endif
7369 if (!se->on_rq)
7038 return; 7370 return;
7039 7371
7040 /* 7372 /*
@@ -7082,7 +7414,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
7082#ifdef CONFIG_FAIR_GROUP_SCHED 7414#ifdef CONFIG_FAIR_GROUP_SCHED
7083static void task_move_group_fair(struct task_struct *p, int on_rq) 7415static void task_move_group_fair(struct task_struct *p, int on_rq)
7084{ 7416{
7417 struct sched_entity *se = &p->se;
7085 struct cfs_rq *cfs_rq; 7418 struct cfs_rq *cfs_rq;
7419
7086 /* 7420 /*
7087 * If the task was not on the rq at the time of this cgroup movement 7421 * If the task was not on the rq at the time of this cgroup movement
7088 * it must have been asleep, sleeping tasks keep their ->vruntime 7422 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -7108,23 +7442,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7108 * To prevent boost or penalty in the new cfs_rq caused by delta 7442 * To prevent boost or penalty in the new cfs_rq caused by delta
7109 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. 7443 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7110 */ 7444 */
7111 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) 7445 if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
7112 on_rq = 1; 7446 on_rq = 1;
7113 7447
7114 if (!on_rq) 7448 if (!on_rq)
7115 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 7449 se->vruntime -= cfs_rq_of(se)->min_vruntime;
7116 set_task_rq(p, task_cpu(p)); 7450 set_task_rq(p, task_cpu(p));
7451 se->depth = se->parent ? se->parent->depth + 1 : 0;
7117 if (!on_rq) { 7452 if (!on_rq) {
7118 cfs_rq = cfs_rq_of(&p->se); 7453 cfs_rq = cfs_rq_of(se);
7119 p->se.vruntime += cfs_rq->min_vruntime; 7454 se->vruntime += cfs_rq->min_vruntime;
7120#ifdef CONFIG_SMP 7455#ifdef CONFIG_SMP
7121 /* 7456 /*
7122 * migrate_task_rq_fair() will have removed our previous 7457 * migrate_task_rq_fair() will have removed our previous
7123 * contribution, but we must synchronize for ongoing future 7458 * contribution, but we must synchronize for ongoing future
7124 * decay. 7459 * decay.
7125 */ 7460 */
7126 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 7461 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
7127 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; 7462 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
7128#endif 7463#endif
7129 } 7464 }
7130} 7465}
@@ -7220,10 +7555,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7220 if (!se) 7555 if (!se)
7221 return; 7556 return;
7222 7557
7223 if (!parent) 7558 if (!parent) {
7224 se->cfs_rq = &rq->cfs; 7559 se->cfs_rq = &rq->cfs;
7225 else 7560 se->depth = 0;
7561 } else {
7226 se->cfs_rq = parent->my_q; 7562 se->cfs_rq = parent->my_q;
7563 se->depth = parent->depth + 1;
7564 }
7227 7565
7228 se->my_q = cfs_rq; 7566 se->my_q = cfs_rq;
7229 /* guarantee group entities always have weight */ 7567 /* guarantee group entities always have weight */
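
The new se->depth bookkeeping above exists so that hierarchy walks can line up two entities from different cgroups without recomputing their nesting level each time. A minimal sketch of that consumer, assuming fair.c's existing parent_entity() and is_same_group() helpers (this mirrors find_matching_se(); it is not the in-tree code itself):

static void equalize_entities(struct sched_entity **se, struct sched_entity **pse)
{
	/* First bring both entities to the same nesting depth... */
	while ((*se)->depth > (*pse)->depth)
		*se = parent_entity(*se);
	while ((*pse)->depth > (*se)->depth)
		*pse = parent_entity(*pse);

	/* ...then walk them up in lockstep until they share a cfs_rq. */
	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}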
diff --git a/kernel/cpu/idle.c b/kernel/sched/idle.c
index 277f494c2a9a..b7976a127178 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/sched/idle.c
@@ -3,6 +3,7 @@
3 */ 3 */
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/cpu.h> 5#include <linux/cpu.h>
6#include <linux/cpuidle.h>
6#include <linux/tick.h> 7#include <linux/tick.h>
7#include <linux/mm.h> 8#include <linux/mm.h>
8#include <linux/stackprotector.h> 9#include <linux/stackprotector.h>
@@ -95,8 +96,10 @@ static void cpu_idle_loop(void)
95 if (!current_clr_polling_and_test()) { 96 if (!current_clr_polling_and_test()) {
96 stop_critical_timings(); 97 stop_critical_timings();
97 rcu_idle_enter(); 98 rcu_idle_enter();
98 arch_cpu_idle(); 99 if (cpuidle_idle_call())
99 WARN_ON_ONCE(irqs_disabled()); 100 arch_cpu_idle();
101 if (WARN_ON_ONCE(irqs_disabled()))
102 local_irq_enable();
100 rcu_idle_exit(); 103 rcu_idle_exit();
101 start_critical_timings(); 104 start_critical_timings();
102 } else { 105 } else {
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 516c3d9ceea1..879f2b75266a 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,18 +13,8 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
13{ 13{
 14	return task_cpu(p); /* IDLE tasks are never migrated */ 14	return task_cpu(p); /* IDLE tasks are never migrated */
15} 15}
16
17static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
18{
19 idle_exit_fair(rq);
20 rq_last_tick_reset(rq);
21}
22
23static void post_schedule_idle(struct rq *rq)
24{
25 idle_enter_fair(rq);
26}
27#endif /* CONFIG_SMP */ 16#endif /* CONFIG_SMP */
17
28/* 18/*
29 * Idle tasks are unconditionally rescheduled: 19 * Idle tasks are unconditionally rescheduled:
30 */ 20 */
@@ -33,13 +23,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
33 resched_task(rq->idle); 23 resched_task(rq->idle);
34} 24}
35 25
36static struct task_struct *pick_next_task_idle(struct rq *rq) 26static struct task_struct *
27pick_next_task_idle(struct rq *rq, struct task_struct *prev)
37{ 28{
29 put_prev_task(rq, prev);
30
38 schedstat_inc(rq, sched_goidle); 31 schedstat_inc(rq, sched_goidle);
39#ifdef CONFIG_SMP
40 /* Trigger the post schedule to do an idle_enter for CFS */
41 rq->post_schedule = 1;
42#endif
43 return rq->idle; 32 return rq->idle;
44} 33}
45 34
@@ -58,6 +47,8 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
58 47
59static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 48static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
60{ 49{
50 idle_exit_fair(rq);
51 rq_last_tick_reset(rq);
61} 52}
62 53
63static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 54static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
@@ -101,8 +92,6 @@ const struct sched_class idle_sched_class = {
101 92
102#ifdef CONFIG_SMP 93#ifdef CONFIG_SMP
103 .select_task_rq = select_task_rq_idle, 94 .select_task_rq = select_task_rq_idle,
104 .pre_schedule = pre_schedule_idle,
105 .post_schedule = post_schedule_idle,
106#endif 95#endif
107 96
108 .set_curr_task = set_curr_task_idle, 97 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a2740b775b45..d8cdf1618551 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -229,6 +229,14 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
229 229
230#ifdef CONFIG_SMP 230#ifdef CONFIG_SMP
231 231
232static int pull_rt_task(struct rq *this_rq);
233
234static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
235{
236 /* Try to pull RT tasks here if we lower this rq's prio */
237 return rq->rt.highest_prio.curr > prev->prio;
238}
239
232static inline int rt_overloaded(struct rq *rq) 240static inline int rt_overloaded(struct rq *rq)
233{ 241{
234 return atomic_read(&rq->rd->rto_count); 242 return atomic_read(&rq->rd->rto_count);
@@ -315,6 +323,15 @@ static inline int has_pushable_tasks(struct rq *rq)
315 return !plist_head_empty(&rq->rt.pushable_tasks); 323 return !plist_head_empty(&rq->rt.pushable_tasks);
316} 324}
317 325
326static inline void set_post_schedule(struct rq *rq)
327{
328 /*
329 * We detect this state here so that we can avoid taking the RQ
330 * lock again later if there is no need to push
331 */
332 rq->post_schedule = has_pushable_tasks(rq);
333}
334
318static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 335static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
319{ 336{
320 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 337 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
@@ -359,6 +376,19 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
359{ 376{
360} 377}
361 378
379static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
380{
381 return false;
382}
383
384static inline int pull_rt_task(struct rq *this_rq)
385{
386 return 0;
387}
388
389static inline void set_post_schedule(struct rq *rq)
390{
391}
362#endif /* CONFIG_SMP */ 392#endif /* CONFIG_SMP */
363 393
364static inline int on_rt_rq(struct sched_rt_entity *rt_se) 394static inline int on_rt_rq(struct sched_rt_entity *rt_se)
@@ -440,11 +470,6 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
440 dequeue_rt_entity(rt_se); 470 dequeue_rt_entity(rt_se);
441} 471}
442 472
443static inline int rt_rq_throttled(struct rt_rq *rt_rq)
444{
445 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
446}
447
448static int rt_se_boosted(struct sched_rt_entity *rt_se) 473static int rt_se_boosted(struct sched_rt_entity *rt_se)
449{ 474{
450 struct rt_rq *rt_rq = group_rt_rq(rt_se); 475 struct rt_rq *rt_rq = group_rt_rq(rt_se);
@@ -515,11 +540,6 @@ static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
515{ 540{
516} 541}
517 542
518static inline int rt_rq_throttled(struct rt_rq *rt_rq)
519{
520 return rt_rq->rt_throttled;
521}
522
523static inline const struct cpumask *sched_rt_period_mask(void) 543static inline const struct cpumask *sched_rt_period_mask(void)
524{ 544{
525 return cpu_online_mask; 545 return cpu_online_mask;
@@ -538,6 +558,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
538 558
539#endif /* CONFIG_RT_GROUP_SCHED */ 559#endif /* CONFIG_RT_GROUP_SCHED */
540 560
561bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
562{
563 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
564
565 return (hrtimer_active(&rt_b->rt_period_timer) ||
566 rt_rq->rt_time < rt_b->rt_runtime);
567}
568
541#ifdef CONFIG_SMP 569#ifdef CONFIG_SMP
542/* 570/*
543 * We ran out of runtime, see if we can borrow some from our neighbours. 571 * We ran out of runtime, see if we can borrow some from our neighbours.
@@ -1310,15 +1338,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1310{ 1338{
1311 struct sched_rt_entity *rt_se; 1339 struct sched_rt_entity *rt_se;
1312 struct task_struct *p; 1340 struct task_struct *p;
1313 struct rt_rq *rt_rq; 1341 struct rt_rq *rt_rq = &rq->rt;
1314
1315 rt_rq = &rq->rt;
1316
1317 if (!rt_rq->rt_nr_running)
1318 return NULL;
1319
1320 if (rt_rq_throttled(rt_rq))
1321 return NULL;
1322 1342
1323 do { 1343 do {
1324 rt_se = pick_next_rt_entity(rq, rt_rq); 1344 rt_se = pick_next_rt_entity(rq, rt_rq);
@@ -1332,21 +1352,45 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1332 return p; 1352 return p;
1333} 1353}
1334 1354
1335static struct task_struct *pick_next_task_rt(struct rq *rq) 1355static struct task_struct *
1356pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1336{ 1357{
1337 struct task_struct *p = _pick_next_task_rt(rq); 1358 struct task_struct *p;
1359 struct rt_rq *rt_rq = &rq->rt;
1360
1361 if (need_pull_rt_task(rq, prev)) {
1362 pull_rt_task(rq);
1363 /*
1364 * pull_rt_task() can drop (and re-acquire) rq->lock; this
1365 * means a dl task can slip in, in which case we need to
1366 * re-start task selection.
1367 */
1368 if (unlikely(rq->dl.dl_nr_running))
1369 return RETRY_TASK;
1370 }
1371
1372 /*
1373 * We may dequeue prev's rt_rq in put_prev_task().
1374 * So, we update time before rt_nr_running check.
1375 */
1376 if (prev->sched_class == &rt_sched_class)
1377 update_curr_rt(rq);
1378
1379 if (!rt_rq->rt_nr_running)
1380 return NULL;
1381
1382 if (rt_rq_throttled(rt_rq))
1383 return NULL;
1384
1385 put_prev_task(rq, prev);
1386
1387 p = _pick_next_task_rt(rq);
1338 1388
1339 /* The running task is never eligible for pushing */ 1389 /* The running task is never eligible for pushing */
1340 if (p) 1390 if (p)
1341 dequeue_pushable_task(rq, p); 1391 dequeue_pushable_task(rq, p);
1342 1392
1343#ifdef CONFIG_SMP 1393 set_post_schedule(rq);
1344 /*
1345 * We detect this state here so that we can avoid taking the RQ
1346 * lock again later if there is no need to push
1347 */
1348 rq->post_schedule = has_pushable_tasks(rq);
1349#endif
1350 1394
1351 return p; 1395 return p;
1352} 1396}
@@ -1716,13 +1760,6 @@ skip:
1716 return ret; 1760 return ret;
1717} 1761}
1718 1762
1719static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1720{
1721 /* Try to pull RT tasks here if we lower this rq's prio */
1722 if (rq->rt.highest_prio.curr > prev->prio)
1723 pull_rt_task(rq);
1724}
1725
1726static void post_schedule_rt(struct rq *rq) 1763static void post_schedule_rt(struct rq *rq)
1727{ 1764{
1728 push_rt_tasks(rq); 1765 push_rt_tasks(rq);
@@ -1825,7 +1862,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1825 resched_task(rq->curr); 1862 resched_task(rq->curr);
1826} 1863}
1827 1864
1828void init_sched_rt_class(void) 1865void __init init_sched_rt_class(void)
1829{ 1866{
1830 unsigned int i; 1867 unsigned int i;
1831 1868
@@ -1999,7 +2036,6 @@ const struct sched_class rt_sched_class = {
1999 .set_cpus_allowed = set_cpus_allowed_rt, 2036 .set_cpus_allowed = set_cpus_allowed_rt,
2000 .rq_online = rq_online_rt, 2037 .rq_online = rq_online_rt,
2001 .rq_offline = rq_offline_rt, 2038 .rq_offline = rq_offline_rt,
2002 .pre_schedule = pre_schedule_rt,
2003 .post_schedule = post_schedule_rt, 2039 .post_schedule = post_schedule_rt,
2004 .task_woken = task_woken_rt, 2040 .task_woken = task_woken_rt,
2005 .switched_from = switched_from_rt, 2041 .switched_from = switched_from_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c2119fd20f8b..f2de7a175620 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -24,24 +24,6 @@ extern long calc_load_fold_active(struct rq *this_rq);
24extern void update_cpu_load_active(struct rq *this_rq); 24extern void update_cpu_load_active(struct rq *this_rq);
25 25
26/* 26/*
27 * Convert user-nice values [ -20 ... 0 ... 19 ]
28 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
29 * and back.
30 */
31#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
32#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
33#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
34
35/*
36 * 'User priority' is the nice value converted to something we
37 * can work with better when scaling various scheduler parameters,
38 * it's a [ 0 ... 39 ] range.
39 */
40#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
41#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
42#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
43
44/*
45 * Helpers for converting nanosecond timing to jiffy resolution 27 * Helpers for converting nanosecond timing to jiffy resolution
46 */ 28 */
47#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 29#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
@@ -441,6 +423,18 @@ struct rt_rq {
441#endif 423#endif
442}; 424};
443 425
426#ifdef CONFIG_RT_GROUP_SCHED
427static inline int rt_rq_throttled(struct rt_rq *rt_rq)
428{
429 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
430}
431#else
432static inline int rt_rq_throttled(struct rt_rq *rt_rq)
433{
434 return rt_rq->rt_throttled;
435}
436#endif
437
444/* Deadline class' related fields in a runqueue */ 438/* Deadline class' related fields in a runqueue */
445struct dl_rq { 439struct dl_rq {
446 /* runqueue is an rbtree, ordered by deadline */ 440 /* runqueue is an rbtree, ordered by deadline */
@@ -462,7 +456,6 @@ struct dl_rq {
462 } earliest_dl; 456 } earliest_dl;
463 457
464 unsigned long dl_nr_migratory; 458 unsigned long dl_nr_migratory;
465 unsigned long dl_nr_total;
466 int overloaded; 459 int overloaded;
467 460
468 /* 461 /*
@@ -559,11 +552,9 @@ struct rq {
559#ifdef CONFIG_FAIR_GROUP_SCHED 552#ifdef CONFIG_FAIR_GROUP_SCHED
560 /* list of leaf cfs_rq on this cpu: */ 553 /* list of leaf cfs_rq on this cpu: */
561 struct list_head leaf_cfs_rq_list; 554 struct list_head leaf_cfs_rq_list;
562#endif /* CONFIG_FAIR_GROUP_SCHED */
563 555
564#ifdef CONFIG_RT_GROUP_SCHED 556 struct sched_avg avg;
565 struct list_head leaf_rt_rq_list; 557#endif /* CONFIG_FAIR_GROUP_SCHED */
566#endif
567 558
568 /* 559 /*
569 * This is part of a global counter where only the total sum 560 * This is part of a global counter where only the total sum
@@ -652,8 +643,6 @@ struct rq {
652#ifdef CONFIG_SMP 643#ifdef CONFIG_SMP
653 struct llist_head wake_list; 644 struct llist_head wake_list;
654#endif 645#endif
655
656 struct sched_avg avg;
657}; 646};
658 647
659static inline int cpu_of(struct rq *rq) 648static inline int cpu_of(struct rq *rq)
@@ -1113,6 +1102,8 @@ static const u32 prio_to_wmult[40] = {
1113 1102
1114#define DEQUEUE_SLEEP 1 1103#define DEQUEUE_SLEEP 1
1115 1104
1105#define RETRY_TASK ((void *)-1UL)
1106
1116struct sched_class { 1107struct sched_class {
1117 const struct sched_class *next; 1108 const struct sched_class *next;
1118 1109
@@ -1123,14 +1114,22 @@ struct sched_class {
1123 1114
1124 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1115 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
1125 1116
1126 struct task_struct * (*pick_next_task) (struct rq *rq); 1117 /*
1118 * It is the responsibility of the pick_next_task() method that will
1119 * return the next task to call put_prev_task() on the @prev task or
1120 * something equivalent.
1121 *
1122 * May return RETRY_TASK when it finds a higher prio class has runnable
1123 * tasks.
1124 */
1125 struct task_struct * (*pick_next_task) (struct rq *rq,
1126 struct task_struct *prev);
1127 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1127 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
1128 1128
1129#ifdef CONFIG_SMP 1129#ifdef CONFIG_SMP
1130 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1130 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1131 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1131 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
1132 1132
1133 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1134 void (*post_schedule) (struct rq *this_rq); 1133 void (*post_schedule) (struct rq *this_rq);
1135 void (*task_waking) (struct task_struct *task); 1134 void (*task_waking) (struct task_struct *task);
1136 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1135 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1160,6 +1159,11 @@ struct sched_class {
1160#endif 1159#endif
1161}; 1160};
1162 1161
1162static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
1163{
1164 prev->sched_class->put_prev_task(rq, prev);
1165}
1166
1163#define sched_class_highest (&stop_sched_class) 1167#define sched_class_highest (&stop_sched_class)
1164#define for_each_class(class) \ 1168#define for_each_class(class) \
1165 for (class = sched_class_highest; class; class = class->next) 1169 for (class = sched_class_highest; class; class = class->next)
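
The comment above redefines the pick_next_task() contract: the class itself is now responsible for put_prev_task(), and it may return RETRY_TASK when it dropped rq->lock and a higher-priority class became runnable in the meantime. A minimal sketch of how a caller might honor that contract, using for_each_class() from this header (illustrative only, not the actual core.c picker):

static struct task_struct *
__pick_next(struct rq *rq, struct task_struct *prev)
{
	const struct sched_class *class;
	struct task_struct *p;

again:
	for_each_class(class) {
		p = class->pick_next_task(rq, prev);
		if (p) {
			/*
			 * The class dropped rq->lock and higher-priority
			 * work appeared: restart from the top.
			 */
			if (unlikely(p == RETRY_TASK))
				goto again;
			return p;
		}
	}
	BUG();	/* The idle class always has a task to return. */
}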
@@ -1176,16 +1180,14 @@ extern const struct sched_class idle_sched_class;
1176extern void update_group_power(struct sched_domain *sd, int cpu); 1180extern void update_group_power(struct sched_domain *sd, int cpu);
1177 1181
1178extern void trigger_load_balance(struct rq *rq); 1182extern void trigger_load_balance(struct rq *rq);
1179extern void idle_balance(int this_cpu, struct rq *this_rq);
1180 1183
1181extern void idle_enter_fair(struct rq *this_rq); 1184extern void idle_enter_fair(struct rq *this_rq);
1182extern void idle_exit_fair(struct rq *this_rq); 1185extern void idle_exit_fair(struct rq *this_rq);
1183 1186
1184#else /* CONFIG_SMP */ 1187#else
1185 1188
1186static inline void idle_balance(int cpu, struct rq *rq) 1189static inline void idle_enter_fair(struct rq *rq) { }
1187{ 1190static inline void idle_exit_fair(struct rq *rq) { }
1188}
1189 1191
1190#endif 1192#endif
1191 1193
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index fdb6bb0b3356..d6ce65dde541 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,16 +23,19 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
23 /* we're never preempted */ 23 /* we're never preempted */
24} 24}
25 25
26static struct task_struct *pick_next_task_stop(struct rq *rq) 26static struct task_struct *
27pick_next_task_stop(struct rq *rq, struct task_struct *prev)
27{ 28{
28 struct task_struct *stop = rq->stop; 29 struct task_struct *stop = rq->stop;
29 30
30 if (stop && stop->on_rq) { 31 if (!stop || !stop->on_rq)
31 stop->se.exec_start = rq_clock_task(rq); 32 return NULL;
32 return stop;
33 }
34 33
35 return NULL; 34 put_prev_task(rq, prev);
35
36 stop->se.exec_start = rq_clock_task(rq);
37
38 return stop;
36} 39}
37 40
38static void 41static void
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 84571e09c907..01fbae5b97b7 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -293,7 +293,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
293 */ 293 */
294 smp_call_function_single(min(cpu1, cpu2), 294 smp_call_function_single(min(cpu1, cpu2),
295 &irq_cpu_stop_queue_work, 295 &irq_cpu_stop_queue_work,
296 &call_args, 0); 296 &call_args, 1);
297 lg_local_unlock(&stop_cpus_lock); 297 lg_local_unlock(&stop_cpus_lock);
298 preempt_enable(); 298 preempt_enable();
299 299
diff --git a/kernel/sys.c b/kernel/sys.c
index c0a58be780a4..adaeab6f7a87 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -174,10 +174,10 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
174 174
175 /* normalize: avoid signed division (rounding problems) */ 175 /* normalize: avoid signed division (rounding problems) */
176 error = -ESRCH; 176 error = -ESRCH;
177 if (niceval < -20) 177 if (niceval < MIN_NICE)
178 niceval = -20; 178 niceval = MIN_NICE;
179 if (niceval > 19) 179 if (niceval > MAX_NICE)
180 niceval = 19; 180 niceval = MAX_NICE;
181 181
182 rcu_read_lock(); 182 rcu_read_lock();
183 read_lock(&tasklist_lock); 183 read_lock(&tasklist_lock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 49e13e1f8fe6..7754ff16f334 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -386,13 +386,6 @@ static struct ctl_table kern_table[] = {
386 .proc_handler = proc_dointvec, 386 .proc_handler = proc_dointvec,
387 }, 387 },
388 { 388 {
389 .procname = "numa_balancing_migrate_deferred",
390 .data = &sysctl_numa_balancing_migrate_deferred,
391 .maxlen = sizeof(unsigned int),
392 .mode = 0644,
393 .proc_handler = proc_dointvec,
394 },
395 {
396 .procname = "numa_balancing", 389 .procname = "numa_balancing",
397 .data = NULL, /* filled in by handler */ 390 .data = NULL, /* filled in by handler */
398 .maxlen = sizeof(unsigned int), 391 .maxlen = sizeof(unsigned int),
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 0abb36464281..4d23dc4d8139 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -116,20 +116,42 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
116void __init sched_clock_register(u64 (*read)(void), int bits, 116void __init sched_clock_register(u64 (*read)(void), int bits,
117 unsigned long rate) 117 unsigned long rate)
118{ 118{
119 u64 res, wrap, new_mask, new_epoch, cyc, ns;
120 u32 new_mult, new_shift;
121 ktime_t new_wrap_kt;
119 unsigned long r; 122 unsigned long r;
120 u64 res, wrap;
121 char r_unit; 123 char r_unit;
122 124
123 if (cd.rate > rate) 125 if (cd.rate > rate)
124 return; 126 return;
125 127
126 WARN_ON(!irqs_disabled()); 128 WARN_ON(!irqs_disabled());
127 read_sched_clock = read;
128 sched_clock_mask = CLOCKSOURCE_MASK(bits);
129 cd.rate = rate;
130 129
131 /* calculate the mult/shift to convert counter ticks to ns. */ 130 /* calculate the mult/shift to convert counter ticks to ns. */
132 clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600); 131 clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
132
133 new_mask = CLOCKSOURCE_MASK(bits);
134
135 /* calculate how many ns until we wrap */
136 wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask);
137 new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
138
139 /* update epoch for new counter and update epoch_ns from old counter*/
140 new_epoch = read();
141 cyc = read_sched_clock();
142 ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
143 cd.mult, cd.shift);
144
145 raw_write_seqcount_begin(&cd.seq);
146 read_sched_clock = read;
147 sched_clock_mask = new_mask;
148 cd.rate = rate;
149 cd.wrap_kt = new_wrap_kt;
150 cd.mult = new_mult;
151 cd.shift = new_shift;
152 cd.epoch_cyc = new_epoch;
153 cd.epoch_ns = ns;
154 raw_write_seqcount_end(&cd.seq);
133 155
134 r = rate; 156 r = rate;
135 if (r >= 4000000) { 157 if (r >= 4000000) {
@@ -141,22 +163,12 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
141 } else 163 } else
142 r_unit = ' '; 164 r_unit = ' ';
143 165
144 /* calculate how many ns until we wrap */
145 wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask);
146 cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
147
148 /* calculate the ns resolution of this counter */ 166 /* calculate the ns resolution of this counter */
149 res = cyc_to_ns(1ULL, cd.mult, cd.shift); 167 res = cyc_to_ns(1ULL, new_mult, new_shift);
168
150 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", 169 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
151 bits, r, r_unit, res, wrap); 170 bits, r, r_unit, res, wrap);
152 171
153 update_sched_clock();
154
155 /*
156 * Ensure that sched_clock() starts off at 0ns
157 */
158 cd.epoch_ns = 0;
159
160 /* Enable IRQ time accounting if we have a fast enough sched_clock */ 172 /* Enable IRQ time accounting if we have a fast enough sched_clock */
161 if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) 173 if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
162 enable_sched_clock_irqtime(); 174 enable_sched_clock_irqtime();
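
For reference, a platform clocksource driver still registers its counter the same way after this rework; only the internals changed. A hedged sketch of a typical caller (the counter base, width, and rate are made up for illustration; assumes the usual <linux/sched_clock.h> and <linux/io.h> includes):

static void __iomem *example_counter_base;	/* mapped by the driver elsewhere */

static u64 notrace example_read_counter(void)
{
	/* 32-bit free-running up-counter. */
	return readl_relaxed(example_counter_base);
}

static void __init example_sched_clock_init(void)
{
	/* 32 bits wide, ticking at 24 MHz -- illustrative values only. */
	sched_clock_register(example_read_counter, 32, 24000000);
}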
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 0aa4ce81bc16..5b40279ecd71 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1435,7 +1435,8 @@ void update_wall_time(void)
1435out: 1435out:
1436 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1436 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1437 if (clock_set) 1437 if (clock_set)
 1438	 clock_was_set(); 1438	 /* Have to call _delayed version, since in irq context */
1439 clock_was_set_delayed();
1439} 1440}
1440 1441
1441/** 1442/**
diff --git a/kernel/timer.c b/kernel/timer.c
index accfd241b9e5..d78de047599b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -52,7 +52,7 @@
52#define CREATE_TRACE_POINTS 52#define CREATE_TRACE_POINTS
53#include <trace/events/timer.h> 53#include <trace/events/timer.h>
54 54
55u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; 55__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
56 56
57EXPORT_SYMBOL(jiffies_64); 57EXPORT_SYMBOL(jiffies_64);
58 58
diff --git a/kernel/torture.c b/kernel/torture.c
new file mode 100644
index 000000000000..acc9afc2f26e
--- /dev/null
+++ b/kernel/torture.c
@@ -0,0 +1,719 @@
1/*
2 * Common functions for in-kernel torture tests.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright (C) IBM Corporation, 2014
19 *
20 * Author: Paul E. McKenney <paulmck@us.ibm.com>
21 * Based on kernel/rcu/torture.c.
22 */
23#include <linux/types.h>
24#include <linux/kernel.h>
25#include <linux/init.h>
26#include <linux/module.h>
27#include <linux/kthread.h>
28#include <linux/err.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/interrupt.h>
32#include <linux/sched.h>
33#include <linux/atomic.h>
34#include <linux/bitops.h>
35#include <linux/completion.h>
36#include <linux/moduleparam.h>
37#include <linux/percpu.h>
38#include <linux/notifier.h>
39#include <linux/reboot.h>
40#include <linux/freezer.h>
41#include <linux/cpu.h>
42#include <linux/delay.h>
43#include <linux/stat.h>
44#include <linux/slab.h>
45#include <linux/trace_clock.h>
46#include <asm/byteorder.h>
47#include <linux/torture.h>
48
49MODULE_LICENSE("GPL");
50MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
51
52static char *torture_type;
53static bool verbose;
54
55/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
56#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
57#define FULLSTOP_SHUTDOWN 1 /* System shutdown with torture running. */
58#define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */
59static int fullstop = FULLSTOP_RMMOD;
60static DEFINE_MUTEX(fullstop_mutex);
61static int *torture_runnable;
62
63#ifdef CONFIG_HOTPLUG_CPU
64
65/*
66 * Variables for online-offline handling. Only present if CPU hotplug
67 * is enabled, otherwise does nothing.
68 */
69
70static struct task_struct *onoff_task;
71static long onoff_holdoff;
72static long onoff_interval;
73static long n_offline_attempts;
74static long n_offline_successes;
75static unsigned long sum_offline;
76static int min_offline = -1;
77static int max_offline;
78static long n_online_attempts;
79static long n_online_successes;
80static unsigned long sum_online;
81static int min_online = -1;
82static int max_online;
83
84/*
85 * Execute random CPU-hotplug operations at the interval specified
86 * by the onoff_interval.
87 */
88static int
89torture_onoff(void *arg)
90{
91 int cpu;
92 unsigned long delta;
93 int maxcpu = -1;
94 DEFINE_TORTURE_RANDOM(rand);
95 int ret;
96 unsigned long starttime;
97
98 VERBOSE_TOROUT_STRING("torture_onoff task started");
99 for_each_online_cpu(cpu)
100 maxcpu = cpu;
101 WARN_ON(maxcpu < 0);
102 if (onoff_holdoff > 0) {
103 VERBOSE_TOROUT_STRING("torture_onoff begin holdoff");
104 schedule_timeout_interruptible(onoff_holdoff);
105 VERBOSE_TOROUT_STRING("torture_onoff end holdoff");
106 }
107 while (!torture_must_stop()) {
108 cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
109 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
110 if (verbose)
111 pr_alert("%s" TORTURE_FLAG
112 "torture_onoff task: offlining %d\n",
113 torture_type, cpu);
114 starttime = jiffies;
115 n_offline_attempts++;
116 ret = cpu_down(cpu);
117 if (ret) {
118 if (verbose)
119 pr_alert("%s" TORTURE_FLAG
120 "torture_onoff task: offline %d failed: errno %d\n",
121 torture_type, cpu, ret);
122 } else {
123 if (verbose)
124 pr_alert("%s" TORTURE_FLAG
125 "torture_onoff task: offlined %d\n",
126 torture_type, cpu);
127 n_offline_successes++;
128 delta = jiffies - starttime;
129 sum_offline += delta;
130 if (min_offline < 0) {
131 min_offline = delta;
132 max_offline = delta;
133 }
134 if (min_offline > delta)
135 min_offline = delta;
136 if (max_offline < delta)
137 max_offline = delta;
138 }
139 } else if (cpu_is_hotpluggable(cpu)) {
140 if (verbose)
141 pr_alert("%s" TORTURE_FLAG
142 "torture_onoff task: onlining %d\n",
143 torture_type, cpu);
144 starttime = jiffies;
145 n_online_attempts++;
146 ret = cpu_up(cpu);
147 if (ret) {
148 if (verbose)
149 pr_alert("%s" TORTURE_FLAG
150 "torture_onoff task: online %d failed: errno %d\n",
151 torture_type, cpu, ret);
152 } else {
153 if (verbose)
154 pr_alert("%s" TORTURE_FLAG
155 "torture_onoff task: onlined %d\n",
156 torture_type, cpu);
157 n_online_successes++;
158 delta = jiffies - starttime;
159 sum_online += delta;
160 if (min_online < 0) {
161 min_online = delta;
162 max_online = delta;
163 }
164 if (min_online > delta)
165 min_online = delta;
166 if (max_online < delta)
167 max_online = delta;
168 }
169 }
170 schedule_timeout_interruptible(onoff_interval);
171 }
172 torture_kthread_stopping("torture_onoff");
173 return 0;
174}
175
176#endif /* #ifdef CONFIG_HOTPLUG_CPU */
177
178/*
179 * Initiate online-offline handling.
180 */
181int torture_onoff_init(long ooholdoff, long oointerval)
182{
183 int ret = 0;
184
185#ifdef CONFIG_HOTPLUG_CPU
186 onoff_holdoff = ooholdoff;
187 onoff_interval = oointerval;
188 if (onoff_interval <= 0)
189 return 0;
190 ret = torture_create_kthread(torture_onoff, NULL, onoff_task);
191#endif /* #ifdef CONFIG_HOTPLUG_CPU */
192 return ret;
193}
194EXPORT_SYMBOL_GPL(torture_onoff_init);
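
A client torture module would typically start the hotplug stressor from its init path, converting module parameters given in seconds into jiffies. A minimal sketch under that assumption (parameter names are illustrative, not taken from any particular client):

static int ex_onoff_holdoff;	/* seconds before the first hotplug attempt */
static int ex_onoff_interval;	/* seconds between attempts; 0 disables */

static int __init example_torture_init(void)
{
	int ret;

	/* No-op when the interval is 0 or CPU hotplug is not configured. */
	ret = torture_onoff_init(ex_onoff_holdoff * HZ, ex_onoff_interval * HZ);
	if (ret)
		return ret;

	/* ... create the module's own reader/writer kthreads here ... */
	return 0;
}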
195
196/*
197 * Clean up after online/offline testing.
198 */
199static void torture_onoff_cleanup(void)
200{
201#ifdef CONFIG_HOTPLUG_CPU
202 if (onoff_task == NULL)
203 return;
204 VERBOSE_TOROUT_STRING("Stopping torture_onoff task");
205 kthread_stop(onoff_task);
206 onoff_task = NULL;
207#endif /* #ifdef CONFIG_HOTPLUG_CPU */
208}
209EXPORT_SYMBOL_GPL(torture_onoff_cleanup);
210
211/*
212 * Print online/offline testing statistics.
213 */
214char *torture_onoff_stats(char *page)
215{
216#ifdef CONFIG_HOTPLUG_CPU
217 page += sprintf(page,
218 "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
219 n_online_successes, n_online_attempts,
220 n_offline_successes, n_offline_attempts,
221 min_online, max_online,
222 min_offline, max_offline,
223 sum_online, sum_offline, HZ);
224#endif /* #ifdef CONFIG_HOTPLUG_CPU */
225 return page;
226}
227EXPORT_SYMBOL_GPL(torture_onoff_stats);
228
229/*
230 * Were all the online/offline operations successful?
231 */
232bool torture_onoff_failures(void)
233{
234#ifdef CONFIG_HOTPLUG_CPU
235 return n_online_successes != n_online_attempts ||
236 n_offline_successes != n_offline_attempts;
237#else /* #ifdef CONFIG_HOTPLUG_CPU */
238 return false;
239#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
240}
241EXPORT_SYMBOL_GPL(torture_onoff_failures);
242
243#define TORTURE_RANDOM_MULT 39916801 /* prime */
244#define TORTURE_RANDOM_ADD 479001701 /* prime */
245#define TORTURE_RANDOM_REFRESH 10000
246
247/*
248 * Crude but fast random-number generator. Uses a linear congruential
249 * generator, with occasional help from cpu_clock().
250 */
251unsigned long
252torture_random(struct torture_random_state *trsp)
253{
254 if (--trsp->trs_count < 0) {
255 trsp->trs_state += (unsigned long)local_clock();
256 trsp->trs_count = TORTURE_RANDOM_REFRESH;
257 }
258 trsp->trs_state = trsp->trs_state * TORTURE_RANDOM_MULT +
259 TORTURE_RANDOM_ADD;
260 return swahw32(trsp->trs_state);
261}
262EXPORT_SYMBOL_GPL(torture_random);
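
torture_random() is meant for cheap, non-cryptographic decisions inside tight test loops, not for anything needing real entropy. A short usage sketch following the DEFINE_TORTURE_RANDOM() pattern already used by torture_onoff() above:

static int example_loop(void *arg)
{
	DEFINE_TORTURE_RANDOM(rand);

	while (!torture_must_stop()) {
		/* Roughly one iteration in four takes a short nap. */
		if ((torture_random(&rand) & 0x3) == 0)
			schedule_timeout_interruptible(1);

		/* ... exercise the code under test ... */
	}
	torture_kthread_stopping("example_loop");
	return 0;
}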
263
264/*
265 * Variables for shuffling. The idea is to ensure that each CPU stays
266 * idle for an extended period to test interactions with dyntick idle,
 267 * as well as interactions with any per-CPU variables.
268 */
269struct shuffle_task {
270 struct list_head st_l;
271 struct task_struct *st_t;
272};
273
274static long shuffle_interval; /* In jiffies. */
275static struct task_struct *shuffler_task;
276static cpumask_var_t shuffle_tmp_mask;
277static int shuffle_idle_cpu; /* Force all torture tasks off this CPU */
278static struct list_head shuffle_task_list = LIST_HEAD_INIT(shuffle_task_list);
279static DEFINE_MUTEX(shuffle_task_mutex);
280
281/*
282 * Register a task to be shuffled. If there is no memory, just splat
283 * and don't bother registering.
284 */
285void torture_shuffle_task_register(struct task_struct *tp)
286{
287 struct shuffle_task *stp;
288
289 if (WARN_ON_ONCE(tp == NULL))
290 return;
291 stp = kmalloc(sizeof(*stp), GFP_KERNEL);
292 if (WARN_ON_ONCE(stp == NULL))
293 return;
294 stp->st_t = tp;
295 mutex_lock(&shuffle_task_mutex);
296 list_add(&stp->st_l, &shuffle_task_list);
297 mutex_unlock(&shuffle_task_mutex);
298}
299EXPORT_SYMBOL_GPL(torture_shuffle_task_register);
300
301/*
302 * Unregister all tasks, for example, at the end of the torture run.
303 */
304static void torture_shuffle_task_unregister_all(void)
305{
306 struct shuffle_task *stp;
307 struct shuffle_task *p;
308
309 mutex_lock(&shuffle_task_mutex);
310 list_for_each_entry_safe(stp, p, &shuffle_task_list, st_l) {
311 list_del(&stp->st_l);
312 kfree(stp);
313 }
314 mutex_unlock(&shuffle_task_mutex);
315}
316
317/* Shuffle tasks such that we allow shuffle_idle_cpu to become idle.
318 * A special case is when shuffle_idle_cpu = -1, in which case we allow
319 * the tasks to run on all CPUs.
320 */
321static void torture_shuffle_tasks(void)
322{
323 struct shuffle_task *stp;
324
325 cpumask_setall(shuffle_tmp_mask);
326 get_online_cpus();
327
328 /* No point in shuffling if there is only one online CPU (ex: UP) */
329 if (num_online_cpus() == 1) {
330 put_online_cpus();
331 return;
332 }
333
334 /* Advance to the next CPU. Upon overflow, don't idle any CPUs. */
335 shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask);
336 if (shuffle_idle_cpu >= nr_cpu_ids)
337 shuffle_idle_cpu = -1;
338 if (shuffle_idle_cpu != -1) {
339 cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask);
340 if (cpumask_empty(shuffle_tmp_mask)) {
341 put_online_cpus();
342 return;
343 }
344 }
345
346 mutex_lock(&shuffle_task_mutex);
347 list_for_each_entry(stp, &shuffle_task_list, st_l)
348 set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask);
349 mutex_unlock(&shuffle_task_mutex);
350
351 put_online_cpus();
352}
353
354/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
355 * system to become idle at a time and cut off its timer ticks. This is meant
356 * to test the support for such tickless idle CPU in RCU.
357 */
358static int torture_shuffle(void *arg)
359{
360 VERBOSE_TOROUT_STRING("torture_shuffle task started");
361 do {
362 schedule_timeout_interruptible(shuffle_interval);
363 torture_shuffle_tasks();
364 torture_shutdown_absorb("torture_shuffle");
365 } while (!torture_must_stop());
366 torture_kthread_stopping("torture_shuffle");
367 return 0;
368}
369
370/*
371 * Start the shuffler, with shuffint in jiffies.
372 */
373int torture_shuffle_init(long shuffint)
374{
375 shuffle_interval = shuffint;
376
377 shuffle_idle_cpu = -1;
378
379 if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
380 VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask");
381 return -ENOMEM;
382 }
383
384 /* Create the shuffler thread */
385 return torture_create_kthread(torture_shuffle, NULL, shuffler_task);
386}
387EXPORT_SYMBOL_GPL(torture_shuffle_init);
388
389/*
390 * Stop the shuffling.
391 */
392static void torture_shuffle_cleanup(void)
393{
394 torture_shuffle_task_unregister_all();
395 if (shuffler_task) {
396 VERBOSE_TOROUT_STRING("Stopping torture_shuffle task");
397 kthread_stop(shuffler_task);
398 free_cpumask_var(shuffle_tmp_mask);
399 }
400 shuffler_task = NULL;
401}
402EXPORT_SYMBOL_GPL(torture_shuffle_cleanup);
403
404/*
405 * Variables for auto-shutdown. This allows "lights out" torture runs
406 * to be fully scripted.
407 */
408static int shutdown_secs; /* desired test duration in seconds. */
409static struct task_struct *shutdown_task;
410static unsigned long shutdown_time; /* jiffies to system shutdown. */
411static void (*torture_shutdown_hook)(void);
412
413/*
414 * Absorb kthreads into a kernel function that won't return, so that
415 * they won't ever access module text or data again.
416 */
417void torture_shutdown_absorb(const char *title)
418{
419 while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
420 pr_notice("torture thread %s parking due to system shutdown\n",
421 title);
422 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
423 }
424}
425EXPORT_SYMBOL_GPL(torture_shutdown_absorb);
426
427/*
 428 * Cause the torture test to shut down the system after the test has
429 * run for the time specified by the shutdown_secs parameter.
430 */
431static int torture_shutdown(void *arg)
432{
433 long delta;
434 unsigned long jiffies_snap;
435
436 VERBOSE_TOROUT_STRING("torture_shutdown task started");
437 jiffies_snap = jiffies;
438 while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
439 !torture_must_stop()) {
440 delta = shutdown_time - jiffies_snap;
441 if (verbose)
442 pr_alert("%s" TORTURE_FLAG
443 "torture_shutdown task: %lu jiffies remaining\n",
444 torture_type, delta);
445 schedule_timeout_interruptible(delta);
446 jiffies_snap = jiffies;
447 }
448 if (torture_must_stop()) {
449 torture_kthread_stopping("torture_shutdown");
450 return 0;
451 }
452
453 /* OK, shut down the system. */
454
455 VERBOSE_TOROUT_STRING("torture_shutdown task shutting down system");
456 shutdown_task = NULL; /* Avoid self-kill deadlock. */
457 if (torture_shutdown_hook)
458 torture_shutdown_hook();
459 else
460 VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping.");
461 kernel_power_off(); /* Shut down the system. */
462 return 0;
463}
464
465/*
466 * Start up the shutdown task.
467 */
468int torture_shutdown_init(int ssecs, void (*cleanup)(void))
469{
470 int ret = 0;
471
472 shutdown_secs = ssecs;
473 torture_shutdown_hook = cleanup;
474 if (shutdown_secs > 0) {
475 shutdown_time = jiffies + shutdown_secs * HZ;
476 ret = torture_create_kthread(torture_shutdown, NULL,
477 shutdown_task);
478 }
479 return ret;
480}
481EXPORT_SYMBOL_GPL(torture_shutdown_init);
482
483/*
484 * Detect and respond to a system shutdown.
485 */
486static int torture_shutdown_notify(struct notifier_block *unused1,
487 unsigned long unused2, void *unused3)
488{
489 mutex_lock(&fullstop_mutex);
490 if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
491 VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected");
492 ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN;
493 } else {
494 pr_warn("Concurrent rmmod and shutdown illegal!\n");
495 }
496 mutex_unlock(&fullstop_mutex);
497 return NOTIFY_DONE;
498}
499
500static struct notifier_block torture_shutdown_nb = {
501 .notifier_call = torture_shutdown_notify,
502};
503
504/*
505 * Shut down the shutdown task. Say what??? Heh! This can happen if
506 * the torture module gets an rmmod before the shutdown time arrives. ;-)
507 */
508static void torture_shutdown_cleanup(void)
509{
510 unregister_reboot_notifier(&torture_shutdown_nb);
511 if (shutdown_task != NULL) {
512 VERBOSE_TOROUT_STRING("Stopping torture_shutdown task");
513 kthread_stop(shutdown_task);
514 }
515 shutdown_task = NULL;
516}
517
518/*
519 * Variables for stuttering, which means to periodically pause and
520 * restart testing in order to catch bugs that appear when load is
521 * suddenly applied to or removed from the system.
522 */
523static struct task_struct *stutter_task;
524static int stutter_pause_test;
525static int stutter;
526
527/*
528 * Block until the stutter interval ends. This must be called periodically
529 * by all running kthreads that need to be subject to stuttering.
530 */
531void stutter_wait(const char *title)
532{
533 while (ACCESS_ONCE(stutter_pause_test) ||
534 (torture_runnable && !ACCESS_ONCE(*torture_runnable))) {
535 if (stutter_pause_test)
536 schedule_timeout_interruptible(1);
537 else
538 schedule_timeout_interruptible(round_jiffies_relative(HZ));
539 torture_shutdown_absorb(title);
540 }
541}
542EXPORT_SYMBOL_GPL(stutter_wait);
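
Each torture kthread is expected to poll stutter_wait() once per iteration so the stutter task can pause and resume the whole test in lockstep. A minimal sketch of that call pattern in a client kthread:

static int example_stress(void *arg)
{
	do {
		/* ... one unit of work against the code under test ... */
		stutter_wait("example_stress");
	} while (!torture_must_stop());
	torture_kthread_stopping("example_stress");
	return 0;
}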
543
544/*
545 * Cause the torture test to "stutter", starting and stopping all
546 * threads periodically.
547 */
548static int torture_stutter(void *arg)
549{
550 VERBOSE_TOROUT_STRING("torture_stutter task started");
551 do {
552 if (!torture_must_stop()) {
553 schedule_timeout_interruptible(stutter);
554 ACCESS_ONCE(stutter_pause_test) = 1;
555 }
556 if (!torture_must_stop())
557 schedule_timeout_interruptible(stutter);
558 ACCESS_ONCE(stutter_pause_test) = 0;
559 torture_shutdown_absorb("torture_stutter");
560 } while (!torture_must_stop());
561 torture_kthread_stopping("torture_stutter");
562 return 0;
563}
564
565/*
566 * Initialize and kick off the torture_stutter kthread.
567 */
568int torture_stutter_init(int s)
569{
570 int ret;
571
572 stutter = s;
573 ret = torture_create_kthread(torture_stutter, NULL, stutter_task);
574 return ret;
575}
576EXPORT_SYMBOL_GPL(torture_stutter_init);
577
578/*
579 * Cleanup after the torture_stutter kthread.
580 */
581static void torture_stutter_cleanup(void)
582{
583 if (!stutter_task)
584 return;
585 VERBOSE_TOROUT_STRING("Stopping torture_stutter task");
586 kthread_stop(stutter_task);
587 stutter_task = NULL;
588}
589
590/*
591 * Initialize torture module. Please note that this is -not- invoked via
592 * the usual module_init() mechanism, but rather by an explicit call from
593 * the client torture module. This call must be paired with a later
594 * torture_init_end().
595 *
596 * The runnable parameter points to a flag that controls whether or not
597 * the test is currently runnable. If there is no such flag, pass in NULL.
598 */
599void __init torture_init_begin(char *ttype, bool v, int *runnable)
600{
601 mutex_lock(&fullstop_mutex);
602 torture_type = ttype;
603 verbose = v;
604 torture_runnable = runnable;
605 fullstop = FULLSTOP_DONTSTOP;
606
607}
608EXPORT_SYMBOL_GPL(torture_init_begin);
609
610/*
611 * Tell the torture module that initialization is complete.
612 */
613void __init torture_init_end(void)
614{
615 mutex_unlock(&fullstop_mutex);
616 register_reboot_notifier(&torture_shutdown_nb);
617}
618EXPORT_SYMBOL_GPL(torture_init_end);
619
620/*
621 * Clean up torture module. Please note that this is -not- invoked via
622 * the usual module_exit() mechanism, but rather by an explicit call from
623 * the client torture module. Returns true if a race with system shutdown
624 * is detected, otherwise, all kthreads started by functions in this file
625 * will be shut down.
626 *
627 * This must be called before the caller starts shutting down its own
628 * kthreads.
629 */
630bool torture_cleanup(void)
631{
632 mutex_lock(&fullstop_mutex);
633 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
634 pr_warn("Concurrent rmmod and shutdown illegal!\n");
635 mutex_unlock(&fullstop_mutex);
636 schedule_timeout_uninterruptible(10);
637 return true;
638 }
639 ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD;
640 mutex_unlock(&fullstop_mutex);
641 torture_shutdown_cleanup();
642 torture_shuffle_cleanup();
643 torture_stutter_cleanup();
644 torture_onoff_cleanup();
645 return false;
646}
647EXPORT_SYMBOL_GPL(torture_cleanup);
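
Putting the pieces together, a client module brackets its own setup with torture_init_begin()/torture_init_end() and calls torture_cleanup() first on the way out. A hedged sketch of that lifecycle (the "example" type string, verbose flag, and runnable flag are illustrative; assumes the usual <linux/module.h> and <linux/torture.h> includes):

static char example_type[] = "example";
static bool example_verbose;
static int example_runnable = 1;

static int __init example_init(void)
{
	torture_init_begin(example_type, example_verbose, &example_runnable);
	/* ... allocate state, start kthreads, torture_*_init() calls ... */
	torture_init_end();
	return 0;
}

static void example_exit(void)
{
	if (torture_cleanup())
		return;	/* Raced with system shutdown; kthreads already parked. */
	/* ... stop this module's own kthreads, then free its state ... */
}

module_init(example_init);
module_exit(example_exit);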
648
649/*
650 * Is it time for the current torture test to stop?
651 */
652bool torture_must_stop(void)
653{
654 return torture_must_stop_irq() || kthread_should_stop();
655}
656EXPORT_SYMBOL_GPL(torture_must_stop);
657
658/*
659 * Is it time for the current torture test to stop? This is the irq-safe
660 * version, hence no check for kthread_should_stop().
661 */
662bool torture_must_stop_irq(void)
663{
664 return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP;
665}
666EXPORT_SYMBOL_GPL(torture_must_stop_irq);
667
668/*
669 * Each kthread must wait for kthread_should_stop() before returning from
670 * its top-level function, otherwise segfaults ensue. This function
671 * prints a "stopping" message and waits for kthread_should_stop(), and
672 * should be called from all torture kthreads immediately prior to
673 * returning.
674 */
675void torture_kthread_stopping(char *title)
676{
677 if (verbose)
678 VERBOSE_TOROUT_STRING(title);
679 while (!kthread_should_stop()) {
680 torture_shutdown_absorb(title);
681 schedule_timeout_uninterruptible(1);
682 }
683}
684EXPORT_SYMBOL_GPL(torture_kthread_stopping);
685
686/*
687 * Create a generic torture kthread that is immediately runnable. If you
688 * need the kthread to be stopped so that you can do something to it before
689 * it starts, you will need to open-code your own.
690 */
691int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
692 char *f, struct task_struct **tp)
693{
694 int ret = 0;
695
696 VERBOSE_TOROUT_STRING(m);
697 *tp = kthread_run(fn, arg, s);
698 if (IS_ERR(*tp)) {
699 ret = PTR_ERR(*tp);
700 VERBOSE_TOROUT_ERRSTRING(f);
701 *tp = NULL;
702 }
703 torture_shuffle_task_register(*tp);
704 return ret;
705}
706EXPORT_SYMBOL_GPL(_torture_create_kthread);
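
Callers normally go through the torture_create_kthread() wrapper seen in torture_onoff_init() above, which is assumed to live in linux/torture.h and to stringify the function name into the messages; a matching torture_stop_kthread() wrapper around _torture_stop_kthread() is assumed as well. A minimal start/stop pair under those assumptions:

static struct task_struct *example_task;

static int example_kthread(void *arg)
{
	do {
		schedule_timeout_interruptible(HZ);
	} while (!torture_must_stop());
	torture_kthread_stopping("example_kthread");
	return 0;
}

static int example_start(void)
{
	/* Creates and wakes the kthread, registers it for shuffling. */
	return torture_create_kthread(example_kthread, NULL, example_task);
}

static void example_stop(void)
{
	/* Assumed companion wrapper; stops the kthread and NULLs the pointer. */
	torture_stop_kthread(example_kthread, example_task);
}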
707
708/*
709 * Stop a generic kthread, emitting a message.
710 */
711void _torture_stop_kthread(char *m, struct task_struct **tp)
712{
713 if (*tp == NULL)
714 return;
715 VERBOSE_TOROUT_STRING(m);
716 kthread_stop(*tp);
717 *tp = NULL;
718}
719EXPORT_SYMBOL_GPL(_torture_stop_kthread);
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index a5457d577b98..0434ff1b808e 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -40,8 +40,8 @@ static int write_iteration = 50;
40module_param(write_iteration, uint, 0644); 40module_param(write_iteration, uint, 0644);
41MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); 41MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
42 42
43static int producer_nice = 19; 43static int producer_nice = MAX_NICE;
44static int consumer_nice = 19; 44static int consumer_nice = MAX_NICE;
45 45
46static int producer_fifo = -1; 46static int producer_fifo = -1;
47static int consumer_fifo = -1; 47static int consumer_fifo = -1;
@@ -308,7 +308,7 @@ static void ring_buffer_producer(void)
308 308
309 /* Let the user know that the test is running at low priority */ 309 /* Let the user know that the test is running at low priority */
310 if (producer_fifo < 0 && consumer_fifo < 0 && 310 if (producer_fifo < 0 && consumer_fifo < 0 &&
311 producer_nice == 19 && consumer_nice == 19) 311 producer_nice == MAX_NICE && consumer_nice == MAX_NICE)
312 trace_printk("WARNING!!! This test is running at lowest priority.\n"); 312 trace_printk("WARNING!!! This test is running at lowest priority.\n");
313 313
314 trace_printk("Time: %lld (usecs)\n", time); 314 trace_printk("Time: %lld (usecs)\n", time);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 815c878f409b..24c1f2382557 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1600,15 +1600,31 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer,
1600} 1600}
1601EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); 1601EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
1602 1602
1603static struct ring_buffer *temp_buffer;
1604
1603struct ring_buffer_event * 1605struct ring_buffer_event *
1604trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, 1606trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
1605 struct ftrace_event_file *ftrace_file, 1607 struct ftrace_event_file *ftrace_file,
1606 int type, unsigned long len, 1608 int type, unsigned long len,
1607 unsigned long flags, int pc) 1609 unsigned long flags, int pc)
1608{ 1610{
1611 struct ring_buffer_event *entry;
1612
1609 *current_rb = ftrace_file->tr->trace_buffer.buffer; 1613 *current_rb = ftrace_file->tr->trace_buffer.buffer;
1610 return trace_buffer_lock_reserve(*current_rb, 1614 entry = trace_buffer_lock_reserve(*current_rb,
1611 type, len, flags, pc); 1615 type, len, flags, pc);
1616 /*
1617 * If tracing is off, but we have triggers enabled
1618 * we still need to look at the event data. Use the temp_buffer
 1619	 * to store the trace event for the trigger to use. It's recursion
1620 * safe and will not be recorded anywhere.
1621 */
1622 if (!entry && ftrace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) {
1623 *current_rb = temp_buffer;
1624 entry = trace_buffer_lock_reserve(*current_rb,
1625 type, len, flags, pc);
1626 }
1627 return entry;
1612} 1628}
1613EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); 1629EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
1614 1630
@@ -6494,11 +6510,16 @@ __init static int tracer_alloc_buffers(void)
6494 6510
6495 raw_spin_lock_init(&global_trace.start_lock); 6511 raw_spin_lock_init(&global_trace.start_lock);
6496 6512
6513 /* Used for event triggers */
6514 temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE);
6515 if (!temp_buffer)
6516 goto out_free_cpumask;
6517
6497 /* TODO: make the number of buffers hot pluggable with CPUS */ 6518 /* TODO: make the number of buffers hot pluggable with CPUS */
6498 if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { 6519 if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
6499 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 6520 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
6500 WARN_ON(1); 6521 WARN_ON(1);
6501 goto out_free_cpumask; 6522 goto out_free_temp_buffer;
6502 } 6523 }
6503 6524
6504 if (global_trace.buffer_disabled) 6525 if (global_trace.buffer_disabled)
@@ -6540,6 +6561,8 @@ __init static int tracer_alloc_buffers(void)
6540 6561
6541 return 0; 6562 return 0;
6542 6563
6564out_free_temp_buffer:
6565 ring_buffer_free(temp_buffer);
6543out_free_cpumask: 6566out_free_cpumask:
6544 free_percpu(global_trace.trace_buffer.data); 6567 free_percpu(global_trace.trace_buffer.data);
6545#ifdef CONFIG_TRACER_MAX_TRACE 6568#ifdef CONFIG_TRACER_MAX_TRACE
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index e854f420e033..c894614de14d 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -31,9 +31,25 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
31 } 31 }
32 32
33 /* The ftrace function trace is allowed only for root. */ 33 /* The ftrace function trace is allowed only for root. */
34 if (ftrace_event_is_function(tp_event) && 34 if (ftrace_event_is_function(tp_event)) {
35 perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) 35 if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
36 return -EPERM; 36 return -EPERM;
37
38 /*
39 * We don't allow user space callchains for function trace
40 * event, due to issues with page faults while tracing page
 41	 * fault handler and its overall tricky nature.
42 */
43 if (!p_event->attr.exclude_callchain_user)
44 return -EINVAL;
45
46 /*
47 * Same reason to disable user stack dump as for user space
48 * callchains above.
49 */
50 if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER)
51 return -EINVAL;
52 }
37 53
38 /* No tracing, just counting, so no obvious leak */ 54 /* No tracing, just counting, so no obvious leak */
39 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) 55 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e71ffd4eccb5..7b16d40bd64d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -27,12 +27,6 @@
27 27
28DEFINE_MUTEX(event_mutex); 28DEFINE_MUTEX(event_mutex);
29 29
30DEFINE_MUTEX(event_storage_mutex);
31EXPORT_SYMBOL_GPL(event_storage_mutex);
32
33char event_storage[EVENT_STORAGE_SIZE];
34EXPORT_SYMBOL_GPL(event_storage);
35
36LIST_HEAD(ftrace_events); 30LIST_HEAD(ftrace_events);
37static LIST_HEAD(ftrace_common_fields); 31static LIST_HEAD(ftrace_common_fields);
38 32
@@ -1777,6 +1771,16 @@ static void trace_module_add_events(struct module *mod)
1777{ 1771{
1778 struct ftrace_event_call **call, **start, **end; 1772 struct ftrace_event_call **call, **start, **end;
1779 1773
1774 if (!mod->num_trace_events)
1775 return;
1776
1777 /* Don't add infrastructure for mods without tracepoints */
1778 if (trace_module_has_bad_taint(mod)) {
1779 pr_err("%s: module has bad taint, not creating trace events\n",
1780 mod->name);
1781 return;
1782 }
1783
1780 start = mod->trace_events; 1784 start = mod->trace_events;
1781 end = mod->trace_events + mod->num_trace_events; 1785 end = mod->trace_events + mod->num_trace_events;
1782 1786
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 7c3e3e72e2b6..ee0a5098ac43 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -95,15 +95,12 @@ static void __always_unused ____ftrace_check_##name(void) \
95#undef __array 95#undef __array
96#define __array(type, item, len) \ 96#define __array(type, item, len) \
97 do { \ 97 do { \
98 char *type_str = #type"["__stringify(len)"]"; \
98 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 99 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
99 mutex_lock(&event_storage_mutex); \ 100 ret = trace_define_field(event_call, type_str, #item, \
100 snprintf(event_storage, sizeof(event_storage), \
101 "%s[%d]", #type, len); \
102 ret = trace_define_field(event_call, event_storage, #item, \
103 offsetof(typeof(field), item), \ 101 offsetof(typeof(field), item), \
104 sizeof(field.item), \ 102 sizeof(field.item), \
105 is_signed_type(type), filter_type); \ 103 is_signed_type(type), filter_type); \
106 mutex_unlock(&event_storage_mutex); \
107 if (ret) \ 104 if (ret) \
108 return ret; \ 105 return ret; \
109 } while (0); 106 } while (0);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 2aefbee93a6d..887ef88b0bc7 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -498,14 +498,14 @@ void trace_hardirqs_off(void)
498} 498}
499EXPORT_SYMBOL(trace_hardirqs_off); 499EXPORT_SYMBOL(trace_hardirqs_off);
500 500
501void trace_hardirqs_on_caller(unsigned long caller_addr) 501__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
502{ 502{
503 if (!preempt_trace() && irq_trace()) 503 if (!preempt_trace() && irq_trace())
504 stop_critical_timing(CALLER_ADDR0, caller_addr); 504 stop_critical_timing(CALLER_ADDR0, caller_addr);
505} 505}
506EXPORT_SYMBOL(trace_hardirqs_on_caller); 506EXPORT_SYMBOL(trace_hardirqs_on_caller);
507 507
508void trace_hardirqs_off_caller(unsigned long caller_addr) 508__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
509{ 509{
510 if (!preempt_trace() && irq_trace()) 510 if (!preempt_trace() && irq_trace())
511 start_critical_timing(CALLER_ADDR0, caller_addr); 511 start_critical_timing(CALLER_ADDR0, caller_addr);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 29f26540e9c9..031cc5655a51 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -631,6 +631,11 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter)
631EXPORT_SYMBOL_GPL(tracepoint_iter_reset); 631EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
632 632
633#ifdef CONFIG_MODULES 633#ifdef CONFIG_MODULES
634bool trace_module_has_bad_taint(struct module *mod)
635{
636 return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP));
637}
638
634static int tracepoint_module_coming(struct module *mod) 639static int tracepoint_module_coming(struct module *mod)
635{ 640{
636 struct tp_module *tp_mod, *iter; 641 struct tp_module *tp_mod, *iter;
@@ -641,7 +646,7 @@ static int tracepoint_module_coming(struct module *mod)
641 * module headers (for forced load), to make sure we don't cause a crash. 646 * module headers (for forced load), to make sure we don't cause a crash.
642 * Staging and out-of-tree GPL modules are fine. 647 * Staging and out-of-tree GPL modules are fine.
643 */ 648 */
644 if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP))) 649 if (trace_module_has_bad_taint(mod))
645 return 0; 650 return 0;
646 mutex_lock(&tracepoints_mutex); 651 mutex_lock(&tracepoints_mutex);
647 tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); 652 tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 240fb62cf394..dd06439b9c84 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -225,7 +225,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id)
225 * 225 *
226 * When there is no mapping defined for the user-namespace uid 226 * When there is no mapping defined for the user-namespace uid
227 * pair INVALID_UID is returned. Callers are expected to test 227 * pair INVALID_UID is returned. Callers are expected to test
228 * for and handle handle INVALID_UID being returned. INVALID_UID 228 * for and handle INVALID_UID being returned. INVALID_UID
229 * may be tested for using uid_valid(). 229 * may be tested for using uid_valid().
230 */ 230 */
231kuid_t make_kuid(struct user_namespace *ns, uid_t uid) 231kuid_t make_kuid(struct user_namespace *ns, uid_t uid)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 82ef9f3b7473..3fa5b8f3aae3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1851,6 +1851,12 @@ static void destroy_worker(struct worker *worker)
1851 if (worker->flags & WORKER_IDLE) 1851 if (worker->flags & WORKER_IDLE)
1852 pool->nr_idle--; 1852 pool->nr_idle--;
1853 1853
1854 /*
1855 * Once WORKER_DIE is set, the kworker may destroy itself at any
1856 * point. Pin to ensure the task stays until we're done with it.
1857 */
1858 get_task_struct(worker->task);
1859
1854 list_del_init(&worker->entry); 1860 list_del_init(&worker->entry);
1855 worker->flags |= WORKER_DIE; 1861 worker->flags |= WORKER_DIE;
1856 1862
@@ -1859,6 +1865,7 @@ static void destroy_worker(struct worker *worker)
1859 spin_unlock_irq(&pool->lock); 1865 spin_unlock_irq(&pool->lock);
1860 1866
1861 kthread_stop(worker->task); 1867 kthread_stop(worker->task);
1868 put_task_struct(worker->task);
1862 kfree(worker); 1869 kfree(worker);
1863 1870
1864 spin_lock_irq(&pool->lock); 1871 spin_lock_irq(&pool->lock);
@@ -3218,7 +3225,7 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
3218 return -ENOMEM; 3225 return -ENOMEM;
3219 3226
3220 if (sscanf(buf, "%d", &attrs->nice) == 1 && 3227 if (sscanf(buf, "%d", &attrs->nice) == 1 &&
3221 attrs->nice >= -20 && attrs->nice <= 19) 3228 attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
3222 ret = apply_workqueue_attrs(wq, attrs); 3229 ret = apply_workqueue_attrs(wq, attrs);
3223 else 3230 else
3224 ret = -EINVAL; 3231 ret = -EINVAL;