author     Jiri Kosina <jkosina@suse.cz>  2014-11-20 08:42:02 -0500
committer  Jiri Kosina <jkosina@suse.cz>  2014-11-20 08:42:02 -0500
commit     a02001086bbfb4da35d1228bebc2f1b442db455f (patch)
tree       62ab47936cef06fd08657ca5b6cd1df98c19be57 /kernel
parent     eff264efeeb0898408e8c9df72d8a32621035bed (diff)
parent     fc14f9c1272f62c3e8d01300f52467c0d9af50f9 (diff)
Merge Linus' tree to be able to apply submitted patches to newer code than
current trivial.git base
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 4
-rw-r--r--  kernel/acct.c | 494
-rw-r--r--  kernel/async.c | 8
-rw-r--r--  kernel/audit.c | 32
-rw-r--r--  kernel/audit.h | 1
-rw-r--r--  kernel/audit_tree.c | 7
-rw-r--r--  kernel/audit_watch.c | 4
-rw-r--r--  kernel/auditfilter.c | 60
-rw-r--r--  kernel/auditsc.c | 28
-rw-r--r--  kernel/bounds.c | 2
-rw-r--r--  kernel/bpf/Makefile | 4
-rw-r--r--  kernel/bpf/core.c | 136
-rw-r--r--  kernel/bpf/syscall.c | 606
-rw-r--r--  kernel/bpf/test_stub.c | 116
-rw-r--r--  kernel/bpf/verifier.c | 1924
-rw-r--r--  kernel/cgroup.c | 239
-rw-r--r--  kernel/compat.c | 24
-rw-r--r--  kernel/configs/tiny.config | 4
-rw-r--r--  kernel/context_tracking.c | 40
-rw-r--r--  kernel/cpu.c | 30
-rw-r--r--  kernel/cpuset.c | 24
-rw-r--r--  kernel/crash_dump.c | 1
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 6
-rw-r--r--  kernel/events/callchain.c | 6
-rw-r--r--  kernel/events/core.c | 295
-rw-r--r--  kernel/events/hw_breakpoint.c | 7
-rw-r--r--  kernel/events/uprobes.c | 15
-rw-r--r--  kernel/exit.c | 100
-rw-r--r--  kernel/fork.c | 102
-rw-r--r--  kernel/freezer.c | 9
-rw-r--r--  kernel/futex.c | 39
-rw-r--r--  kernel/gcov/Kconfig | 2
-rw-r--r--  kernel/gcov/fs.c | 3
-rw-r--r--  kernel/irq/Kconfig | 3
-rw-r--r--  kernel/irq/chip.c | 88
-rw-r--r--  kernel/irq/internals.h | 16
-rw-r--r--  kernel/irq/irqdesc.c | 42
-rw-r--r--  kernel/irq/manage.c | 32
-rw-r--r--  kernel/irq/pm.c | 159
-rw-r--r--  kernel/irq_work.c | 27
-rw-r--r--  kernel/kallsyms.c | 13
-rw-r--r--  kernel/kcmp.c | 7
-rw-r--r--  kernel/kexec.c | 1292
-rw-r--r--  kernel/kmod.c | 76
-rw-r--r--  kernel/kprobes.c | 13
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  kernel/locking/locktorture.c | 529
-rw-r--r--  kernel/locking/mcs_spinlock.h | 3
-rw-r--r--  kernel/locking/mutex.c | 416
-rw-r--r--  kernel/locking/mutex.h | 2
-rw-r--r--  kernel/locking/rtmutex.c | 2
-rw-r--r--  kernel/locking/rwsem-xadd.c | 27
-rw-r--r--  kernel/locking/semaphore.c | 12
-rw-r--r--  kernel/module.c | 25
-rw-r--r--  kernel/nsproxy.c | 15
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/params.c | 24
-rw-r--r--  kernel/power/Kconfig | 4
-rw-r--r--  kernel/power/hibernate.c | 8
-rw-r--r--  kernel/power/power.h | 1
-rw-r--r--  kernel/power/process.c | 58
-rw-r--r--  kernel/power/qos.c | 27
-rw-r--r--  kernel/power/snapshot.c | 25
-rw-r--r--  kernel/power/suspend.c | 53
-rw-r--r--  kernel/power/suspend_test.c | 61
-rw-r--r--  kernel/printk/printk.c | 211
-rw-r--r--  kernel/rcu/rcutorture.c | 278
-rw-r--r--  kernel/rcu/tiny.c | 20
-rw-r--r--  kernel/rcu/tree.c | 130
-rw-r--r--  kernel/rcu/tree.h | 21
-rw-r--r--  kernel/rcu/tree_plugin.h | 459
-rw-r--r--  kernel/rcu/update.c | 345
-rw-r--r--  kernel/reboot.c | 81
-rw-r--r--  kernel/resource.c | 204
-rw-r--r--  kernel/sched/auto_group.c | 5
-rw-r--r--  kernel/sched/clock.c | 2
-rw-r--r--  kernel/sched/core.c | 377
-rw-r--r--  kernel/sched/cpudeadline.c | 4
-rw-r--r--  kernel/sched/cputime.c | 64
-rw-r--r--  kernel/sched/deadline.c | 76
-rw-r--r--  kernel/sched/debug.c | 13
-rw-r--r--  kernel/sched/fair.c | 504
-rw-r--r--  kernel/sched/idle.c | 6
-rw-r--r--  kernel/sched/proc.c | 7
-rw-r--r--  kernel/sched/rt.c | 23
-rw-r--r--  kernel/sched/sched.h | 84
-rw-r--r--  kernel/sched/stop_task.c | 2
-rw-r--r--  kernel/sched/wait.c | 36
-rw-r--r--  kernel/seccomp.c | 269
-rw-r--r--  kernel/signal.c | 46
-rw-r--r--  kernel/smp.c | 30
-rw-r--r--  kernel/softirq.c | 6
-rw-r--r--  kernel/sys.c | 491
-rw-r--r--  kernel/sys_ni.c | 8
-rw-r--r--  kernel/sysctl.c | 28
-rw-r--r--  kernel/sysctl_binary.c | 1
-rw-r--r--  kernel/taskstats.c | 2
-rw-r--r--  kernel/test_kprobes.c | 87
-rw-r--r--  kernel/time/alarmtimer.c | 34
-rw-r--r--  kernel/time/clockevents.c | 2
-rw-r--r--  kernel/time/hrtimer.c | 23
-rw-r--r--  kernel/time/posix-cpu-timers.c | 14
-rw-r--r--  kernel/time/posix-timers.c | 1
-rw-r--r--  kernel/time/tick-broadcast.c | 2
-rw-r--r--  kernel/time/tick-common.c | 7
-rw-r--r--  kernel/time/tick-internal.h | 7
-rw-r--r--  kernel/time/tick-oneshot.c | 2
-rw-r--r--  kernel/time/tick-sched.c | 100
-rw-r--r--  kernel/time/time.c | 56
-rw-r--r--  kernel/time/timekeeping.c | 10
-rw-r--r--  kernel/time/timer.c | 4
-rw-r--r--  kernel/torture.c | 32
-rw-r--r--  kernel/trace/ftrace.c | 640
-rw-r--r--  kernel/trace/ring_buffer.c | 128
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 3
-rw-r--r--  kernel/trace/trace.c | 33
-rw-r--r--  kernel/trace/trace_events.c | 5
-rw-r--r--  kernel/trace/trace_selftest.c | 51
-rw-r--r--  kernel/trace/trace_stack.c | 4
-rw-r--r--  kernel/trace/trace_syscalls.c | 12
-rw-r--r--  kernel/user-return-notifier.c | 4
-rw-r--r--  kernel/user_namespace.c | 6
-rw-r--r--  kernel/utsname.c | 6
-rw-r--r--  kernel/watchdog.c | 101
-rw-r--r--  kernel/workqueue.c | 5
125 files changed, 9587 insertions, 3061 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 0026cf531769..17ea6d4a9a24 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -86,7 +86,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
86obj-$(CONFIG_TRACEPOINTS) += trace/ 86obj-$(CONFIG_TRACEPOINTS) += trace/
87obj-$(CONFIG_IRQ_WORK) += irq_work.o 87obj-$(CONFIG_IRQ_WORK) += irq_work.o
88obj-$(CONFIG_CPU_PM) += cpu_pm.o 88obj-$(CONFIG_CPU_PM) += cpu_pm.o
89obj-$(CONFIG_NET) += bpf/ 89obj-$(CONFIG_BPF) += bpf/
90 90
91obj-$(CONFIG_PERF_EVENTS) += events/ 91obj-$(CONFIG_PERF_EVENTS) += events/
92 92
@@ -105,7 +105,7 @@ targets += config_data.gz
105$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE 105$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
106 $(call if_changed,gzip) 106 $(call if_changed,gzip)
107 107
108 filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") 108 filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;")
109targets += config_data.h 109targets += config_data.h
110$(obj)/config_data.h: $(obj)/config_data.gz FORCE 110$(obj)/config_data.h: $(obj)/config_data.gz FORCE
111 $(call filechk,ikconfiggz) 111 $(call filechk,ikconfiggz)
diff --git a/kernel/acct.c b/kernel/acct.c
index a1844f14c6d6..33738ef972f3 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -59,6 +59,7 @@
59#include <asm/div64.h> 59#include <asm/div64.h>
60#include <linux/blkdev.h> /* sector_div */ 60#include <linux/blkdev.h> /* sector_div */
61#include <linux/pid_namespace.h> 61#include <linux/pid_namespace.h>
62#include <linux/fs_pin.h>
62 63
63/* 64/*
64 * These constants control the amount of freespace that suspend and 65 * These constants control the amount of freespace that suspend and
@@ -75,172 +76,190 @@ int acct_parm[3] = {4, 2, 30};
75/* 76/*
76 * External references and all of the globals. 77 * External references and all of the globals.
77 */ 78 */
78static void do_acct_process(struct bsd_acct_struct *acct, 79static void do_acct_process(struct bsd_acct_struct *acct);
79 struct pid_namespace *ns, struct file *);
80 80
81/*
82 * This structure is used so that all the data protected by lock
83 * can be placed in the same cache line as the lock. This primes
84 * the cache line to have the data after getting the lock.
85 */
86struct bsd_acct_struct { 81struct bsd_acct_struct {
82 struct fs_pin pin;
83 struct mutex lock;
87 int active; 84 int active;
88 unsigned long needcheck; 85 unsigned long needcheck;
89 struct file *file; 86 struct file *file;
90 struct pid_namespace *ns; 87 struct pid_namespace *ns;
91 struct list_head list; 88 struct work_struct work;
89 struct completion done;
92}; 90};
93 91
94static DEFINE_SPINLOCK(acct_lock);
95static LIST_HEAD(acct_list);
96
97/* 92/*
98 * Check the amount of free space and suspend/resume accordingly. 93 * Check the amount of free space and suspend/resume accordingly.
99 */ 94 */
100static int check_free_space(struct bsd_acct_struct *acct, struct file *file) 95static int check_free_space(struct bsd_acct_struct *acct)
101{ 96{
102 struct kstatfs sbuf; 97 struct kstatfs sbuf;
103 int res; 98
104 int act; 99 if (time_is_before_jiffies(acct->needcheck))
105 u64 resume;
106 u64 suspend;
107
108 spin_lock(&acct_lock);
109 res = acct->active;
110 if (!file || time_is_before_jiffies(acct->needcheck))
111 goto out; 100 goto out;
112 spin_unlock(&acct_lock);
113 101
114 /* May block */ 102 /* May block */
115 if (vfs_statfs(&file->f_path, &sbuf)) 103 if (vfs_statfs(&acct->file->f_path, &sbuf))
116 return res;
117 suspend = sbuf.f_blocks * SUSPEND;
118 resume = sbuf.f_blocks * RESUME;
119
120 do_div(suspend, 100);
121 do_div(resume, 100);
122
123 if (sbuf.f_bavail <= suspend)
124 act = -1;
125 else if (sbuf.f_bavail >= resume)
126 act = 1;
127 else
128 act = 0;
129
130 /*
131 * If some joker switched acct->file under us we'ld better be
132 * silent and _not_ touch anything.
133 */
134 spin_lock(&acct_lock);
135 if (file != acct->file) {
136 if (act)
137 res = act > 0;
138 goto out; 104 goto out;
139 }
140 105
141 if (acct->active) { 106 if (acct->active) {
142 if (act < 0) { 107 u64 suspend = sbuf.f_blocks * SUSPEND;
108 do_div(suspend, 100);
109 if (sbuf.f_bavail <= suspend) {
143 acct->active = 0; 110 acct->active = 0;
144 printk(KERN_INFO "Process accounting paused\n"); 111 pr_info("Process accounting paused\n");
145 } 112 }
146 } else { 113 } else {
147 if (act > 0) { 114 u64 resume = sbuf.f_blocks * RESUME;
115 do_div(resume, 100);
116 if (sbuf.f_bavail >= resume) {
148 acct->active = 1; 117 acct->active = 1;
149 printk(KERN_INFO "Process accounting resumed\n"); 118 pr_info("Process accounting resumed\n");
150 } 119 }
151 } 120 }
152 121
153 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; 122 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
154 res = acct->active;
155out: 123out:
156 spin_unlock(&acct_lock); 124 return acct->active;
125}
126
127static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
128{
129 struct bsd_acct_struct *res;
130again:
131 smp_rmb();
132 rcu_read_lock();
133 res = ACCESS_ONCE(ns->bacct);
134 if (!res) {
135 rcu_read_unlock();
136 return NULL;
137 }
138 if (!atomic_long_inc_not_zero(&res->pin.count)) {
139 rcu_read_unlock();
140 cpu_relax();
141 goto again;
142 }
143 rcu_read_unlock();
144 mutex_lock(&res->lock);
145 if (!res->ns) {
146 mutex_unlock(&res->lock);
147 pin_put(&res->pin);
148 goto again;
149 }
157 return res; 150 return res;
158} 151}
159 152
160/* 153static void close_work(struct work_struct *work)
161 * Close the old accounting file (if currently open) and then replace 154{
162 * it with file (if non-NULL). 155 struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
163 * 156 struct file *file = acct->file;
164 * NOTE: acct_lock MUST be held on entry and exit. 157 if (file->f_op->flush)
165 */ 158 file->f_op->flush(file, NULL);
166static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, 159 __fput_sync(file);
167 struct pid_namespace *ns) 160 complete(&acct->done);
161}
162
163static void acct_kill(struct bsd_acct_struct *acct,
164 struct bsd_acct_struct *new)
168{ 165{
169 struct file *old_acct = NULL; 166 if (acct) {
170 struct pid_namespace *old_ns = NULL; 167 struct pid_namespace *ns = acct->ns;
171 168 do_acct_process(acct);
172 if (acct->file) { 169 INIT_WORK(&acct->work, close_work);
173 old_acct = acct->file; 170 init_completion(&acct->done);
174 old_ns = acct->ns; 171 schedule_work(&acct->work);
175 acct->active = 0; 172 wait_for_completion(&acct->done);
176 acct->file = NULL; 173 pin_remove(&acct->pin);
174 ns->bacct = new;
177 acct->ns = NULL; 175 acct->ns = NULL;
178 list_del(&acct->list); 176 atomic_long_dec(&acct->pin.count);
177 mutex_unlock(&acct->lock);
178 pin_put(&acct->pin);
179 } 179 }
180 if (file) { 180}
181 acct->file = file; 181
182 acct->ns = ns; 182static void acct_pin_kill(struct fs_pin *pin)
183 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; 183{
184 acct->active = 1; 184 struct bsd_acct_struct *acct;
185 list_add(&acct->list, &acct_list); 185 acct = container_of(pin, struct bsd_acct_struct, pin);
186 } 186 mutex_lock(&acct->lock);
187 if (old_acct) { 187 if (!acct->ns) {
188 mnt_unpin(old_acct->f_path.mnt); 188 mutex_unlock(&acct->lock);
189 spin_unlock(&acct_lock); 189 pin_put(pin);
190 do_acct_process(acct, old_ns, old_acct); 190 acct = NULL;
191 filp_close(old_acct, NULL);
192 spin_lock(&acct_lock);
193 } 191 }
192 acct_kill(acct, NULL);
194} 193}
195 194
196static int acct_on(struct filename *pathname) 195static int acct_on(struct filename *pathname)
197{ 196{
198 struct file *file; 197 struct file *file;
199 struct vfsmount *mnt; 198 struct vfsmount *mnt, *internal;
200 struct pid_namespace *ns; 199 struct pid_namespace *ns = task_active_pid_ns(current);
201 struct bsd_acct_struct *acct = NULL; 200 struct bsd_acct_struct *acct, *old;
201 int err;
202
203 acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
204 if (!acct)
205 return -ENOMEM;
202 206
203 /* Difference from BSD - they don't do O_APPEND */ 207 /* Difference from BSD - they don't do O_APPEND */
204 file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); 208 file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
205 if (IS_ERR(file)) 209 if (IS_ERR(file)) {
210 kfree(acct);
206 return PTR_ERR(file); 211 return PTR_ERR(file);
212 }
207 213
208 if (!S_ISREG(file_inode(file)->i_mode)) { 214 if (!S_ISREG(file_inode(file)->i_mode)) {
215 kfree(acct);
209 filp_close(file, NULL); 216 filp_close(file, NULL);
210 return -EACCES; 217 return -EACCES;
211 } 218 }
212 219
213 if (!file->f_op->write) { 220 if (!file->f_op->write) {
221 kfree(acct);
214 filp_close(file, NULL); 222 filp_close(file, NULL);
215 return -EIO; 223 return -EIO;
216 } 224 }
217 225 internal = mnt_clone_internal(&file->f_path);
218 ns = task_active_pid_ns(current); 226 if (IS_ERR(internal)) {
219 if (ns->bacct == NULL) { 227 kfree(acct);
220 acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); 228 filp_close(file, NULL);
221 if (acct == NULL) { 229 return PTR_ERR(internal);
222 filp_close(file, NULL);
223 return -ENOMEM;
224 }
225 } 230 }
226 231 err = mnt_want_write(internal);
227 spin_lock(&acct_lock); 232 if (err) {
228 if (ns->bacct == NULL) { 233 mntput(internal);
229 ns->bacct = acct; 234 kfree(acct);
230 acct = NULL; 235 filp_close(file, NULL);
236 return err;
231 } 237 }
232
233 mnt = file->f_path.mnt; 238 mnt = file->f_path.mnt;
234 mnt_pin(mnt); 239 file->f_path.mnt = internal;
235 acct_file_reopen(ns->bacct, file, ns); 240
236 spin_unlock(&acct_lock); 241 atomic_long_set(&acct->pin.count, 1);
237 242 acct->pin.kill = acct_pin_kill;
238 mntput(mnt); /* it's pinned, now give up active reference */ 243 acct->file = file;
239 kfree(acct); 244 acct->needcheck = jiffies;
240 245 acct->ns = ns;
246 mutex_init(&acct->lock);
247 mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
248 pin_insert(&acct->pin, mnt);
249
250 old = acct_get(ns);
251 if (old)
252 acct_kill(old, acct);
253 else
254 ns->bacct = acct;
255 mutex_unlock(&acct->lock);
256 mnt_drop_write(mnt);
257 mntput(mnt);
241 return 0; 258 return 0;
242} 259}
243 260
261static DEFINE_MUTEX(acct_on_mutex);
262
244/** 263/**
245 * sys_acct - enable/disable process accounting 264 * sys_acct - enable/disable process accounting
246 * @name: file name for accounting records or NULL to shutdown accounting 265 * @name: file name for accounting records or NULL to shutdown accounting
@@ -261,80 +280,23 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
261 280
262 if (name) { 281 if (name) {
263 struct filename *tmp = getname(name); 282 struct filename *tmp = getname(name);
283
264 if (IS_ERR(tmp)) 284 if (IS_ERR(tmp))
265 return PTR_ERR(tmp); 285 return PTR_ERR(tmp);
286 mutex_lock(&acct_on_mutex);
266 error = acct_on(tmp); 287 error = acct_on(tmp);
288 mutex_unlock(&acct_on_mutex);
267 putname(tmp); 289 putname(tmp);
268 } else { 290 } else {
269 struct bsd_acct_struct *acct; 291 acct_kill(acct_get(task_active_pid_ns(current)), NULL);
270
271 acct = task_active_pid_ns(current)->bacct;
272 if (acct == NULL)
273 return 0;
274
275 spin_lock(&acct_lock);
276 acct_file_reopen(acct, NULL, NULL);
277 spin_unlock(&acct_lock);
278 } 292 }
279 293
280 return error; 294 return error;
281} 295}
282 296
283/**
284 * acct_auto_close - turn off a filesystem's accounting if it is on
285 * @m: vfsmount being shut down
286 *
287 * If the accounting is turned on for a file in the subtree pointed to
288 * to by m, turn accounting off. Done when m is about to die.
289 */
290void acct_auto_close_mnt(struct vfsmount *m)
291{
292 struct bsd_acct_struct *acct;
293
294 spin_lock(&acct_lock);
295restart:
296 list_for_each_entry(acct, &acct_list, list)
297 if (acct->file && acct->file->f_path.mnt == m) {
298 acct_file_reopen(acct, NULL, NULL);
299 goto restart;
300 }
301 spin_unlock(&acct_lock);
302}
303
304/**
305 * acct_auto_close - turn off a filesystem's accounting if it is on
306 * @sb: super block for the filesystem
307 *
308 * If the accounting is turned on for a file in the filesystem pointed
309 * to by sb, turn accounting off.
310 */
311void acct_auto_close(struct super_block *sb)
312{
313 struct bsd_acct_struct *acct;
314
315 spin_lock(&acct_lock);
316restart:
317 list_for_each_entry(acct, &acct_list, list)
318 if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
319 acct_file_reopen(acct, NULL, NULL);
320 goto restart;
321 }
322 spin_unlock(&acct_lock);
323}
324
325void acct_exit_ns(struct pid_namespace *ns) 297void acct_exit_ns(struct pid_namespace *ns)
326{ 298{
327 struct bsd_acct_struct *acct = ns->bacct; 299 acct_kill(acct_get(ns), NULL);
328
329 if (acct == NULL)
330 return;
331
332 spin_lock(&acct_lock);
333 if (acct->file != NULL)
334 acct_file_reopen(acct, NULL, NULL);
335 spin_unlock(&acct_lock);
336
337 kfree(acct);
338} 300}
339 301
340/* 302/*
@@ -376,7 +338,7 @@ static comp_t encode_comp_t(unsigned long value)
376 return exp; 338 return exp;
377} 339}
378 340
379#if ACCT_VERSION==1 || ACCT_VERSION==2 341#if ACCT_VERSION == 1 || ACCT_VERSION == 2
380/* 342/*
381 * encode an u64 into a comp2_t (24 bits) 343 * encode an u64 into a comp2_t (24 bits)
382 * 344 *
@@ -389,7 +351,7 @@ static comp_t encode_comp_t(unsigned long value)
389#define MANTSIZE2 20 /* 20 bit mantissa. */ 351#define MANTSIZE2 20 /* 20 bit mantissa. */
390#define EXPSIZE2 5 /* 5 bit base 2 exponent. */ 352#define EXPSIZE2 5 /* 5 bit base 2 exponent. */
391#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ 353#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
392#define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */ 354#define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */
393 355
394static comp2_t encode_comp2_t(u64 value) 356static comp2_t encode_comp2_t(u64 value)
395{ 357{
@@ -420,7 +382,7 @@ static comp2_t encode_comp2_t(u64 value)
420} 382}
421#endif 383#endif
422 384
423#if ACCT_VERSION==3 385#if ACCT_VERSION == 3
424/* 386/*
425 * encode an u64 into a 32 bit IEEE float 387 * encode an u64 into a 32 bit IEEE float
426 */ 388 */
@@ -429,8 +391,9 @@ static u32 encode_float(u64 value)
429 unsigned exp = 190; 391 unsigned exp = 190;
430 unsigned u; 392 unsigned u;
431 393
432 if (value==0) return 0; 394 if (value == 0)
433 while ((s64)value > 0){ 395 return 0;
396 while ((s64)value > 0) {
434 value <<= 1; 397 value <<= 1;
435 exp--; 398 exp--;
436 } 399 }
@@ -448,116 +411,116 @@ static u32 encode_float(u64 value)
448 * do_exit() or when switching to a different output file. 411 * do_exit() or when switching to a different output file.
449 */ 412 */
450 413
451/* 414static void fill_ac(acct_t *ac)
452 * do_acct_process does all actual work. Caller holds the reference to file.
453 */
454static void do_acct_process(struct bsd_acct_struct *acct,
455 struct pid_namespace *ns, struct file *file)
456{ 415{
457 struct pacct_struct *pacct = &current->signal->pacct; 416 struct pacct_struct *pacct = &current->signal->pacct;
458 acct_t ac;
459 mm_segment_t fs;
460 unsigned long flim;
461 u64 elapsed, run_time; 417 u64 elapsed, run_time;
462 struct tty_struct *tty; 418 struct tty_struct *tty;
463 const struct cred *orig_cred;
464
465 /* Perform file operations on behalf of whoever enabled accounting */
466 orig_cred = override_creds(file->f_cred);
467
468 /*
469 * First check to see if there is enough free_space to continue
470 * the process accounting system.
471 */
472 if (!check_free_space(acct, file))
473 goto out;
474 419
475 /* 420 /*
476 * Fill the accounting struct with the needed info as recorded 421 * Fill the accounting struct with the needed info as recorded
477 * by the different kernel functions. 422 * by the different kernel functions.
478 */ 423 */
479 memset(&ac, 0, sizeof(acct_t)); 424 memset(ac, 0, sizeof(acct_t));
480 425
481 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; 426 ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
482 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); 427 strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
483 428
484 /* calculate run_time in nsec*/ 429 /* calculate run_time in nsec*/
485 run_time = ktime_get_ns(); 430 run_time = ktime_get_ns();
486 run_time -= current->group_leader->start_time; 431 run_time -= current->group_leader->start_time;
487 /* convert nsec -> AHZ */ 432 /* convert nsec -> AHZ */
488 elapsed = nsec_to_AHZ(run_time); 433 elapsed = nsec_to_AHZ(run_time);
489#if ACCT_VERSION==3 434#if ACCT_VERSION == 3
490 ac.ac_etime = encode_float(elapsed); 435 ac->ac_etime = encode_float(elapsed);
491#else 436#else
492 ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? 437 ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
493 (unsigned long) elapsed : (unsigned long) -1l); 438 (unsigned long) elapsed : (unsigned long) -1l);
494#endif 439#endif
495#if ACCT_VERSION==1 || ACCT_VERSION==2 440#if ACCT_VERSION == 1 || ACCT_VERSION == 2
496 { 441 {
497 /* new enlarged etime field */ 442 /* new enlarged etime field */
498 comp2_t etime = encode_comp2_t(elapsed); 443 comp2_t etime = encode_comp2_t(elapsed);
499 ac.ac_etime_hi = etime >> 16; 444
500 ac.ac_etime_lo = (u16) etime; 445 ac->ac_etime_hi = etime >> 16;
446 ac->ac_etime_lo = (u16) etime;
501 } 447 }
502#endif 448#endif
503 do_div(elapsed, AHZ); 449 do_div(elapsed, AHZ);
504 ac.ac_btime = get_seconds() - elapsed; 450 ac->ac_btime = get_seconds() - elapsed;
505 /* we really need to bite the bullet and change layout */
506 ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
507 ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
508#if ACCT_VERSION==2 451#if ACCT_VERSION==2
509 ac.ac_ahz = AHZ; 452 ac->ac_ahz = AHZ;
510#endif
511#if ACCT_VERSION==1 || ACCT_VERSION==2
512 /* backward-compatible 16 bit fields */
513 ac.ac_uid16 = ac.ac_uid;
514 ac.ac_gid16 = ac.ac_gid;
515#endif
516#if ACCT_VERSION==3
517 ac.ac_pid = task_tgid_nr_ns(current, ns);
518 rcu_read_lock();
519 ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
520 rcu_read_unlock();
521#endif 453#endif
522 454
523 spin_lock_irq(&current->sighand->siglock); 455 spin_lock_irq(&current->sighand->siglock);
524 tty = current->signal->tty; /* Safe as we hold the siglock */ 456 tty = current->signal->tty; /* Safe as we hold the siglock */
525 ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; 457 ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
526 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); 458 ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
527 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); 459 ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
528 ac.ac_flag = pacct->ac_flag; 460 ac->ac_flag = pacct->ac_flag;
529 ac.ac_mem = encode_comp_t(pacct->ac_mem); 461 ac->ac_mem = encode_comp_t(pacct->ac_mem);
530 ac.ac_minflt = encode_comp_t(pacct->ac_minflt); 462 ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
531 ac.ac_majflt = encode_comp_t(pacct->ac_majflt); 463 ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
532 ac.ac_exitcode = pacct->ac_exitcode; 464 ac->ac_exitcode = pacct->ac_exitcode;
533 spin_unlock_irq(&current->sighand->siglock); 465 spin_unlock_irq(&current->sighand->siglock);
534 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ 466}
535 ac.ac_rw = encode_comp_t(ac.ac_io / 1024); 467/*
536 ac.ac_swaps = encode_comp_t(0); 468 * do_acct_process does all actual work. Caller holds the reference to file.
469 */
470static void do_acct_process(struct bsd_acct_struct *acct)
471{
472 acct_t ac;
473 unsigned long flim;
474 const struct cred *orig_cred;
475 struct file *file = acct->file;
537 476
538 /* 477 /*
539 * Get freeze protection. If the fs is frozen, just skip the write 478 * Accounting records are not subject to resource limits.
540 * as we could deadlock the system otherwise.
541 */ 479 */
542 if (!file_start_write_trylock(file)) 480 flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
543 goto out; 481 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
482 /* Perform file operations on behalf of whoever enabled accounting */
483 orig_cred = override_creds(file->f_cred);
484
544 /* 485 /*
545 * Kernel segment override to datasegment and write it 486 * First check to see if there is enough free_space to continue
546 * to the accounting file. 487 * the process accounting system.
547 */ 488 */
548 fs = get_fs(); 489 if (!check_free_space(acct))
549 set_fs(KERNEL_DS); 490 goto out;
491
492 fill_ac(&ac);
493 /* we really need to bite the bullet and change layout */
494 ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
495 ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
496#if ACCT_VERSION == 1 || ACCT_VERSION == 2
497 /* backward-compatible 16 bit fields */
498 ac.ac_uid16 = ac.ac_uid;
499 ac.ac_gid16 = ac.ac_gid;
500#endif
501#if ACCT_VERSION == 3
502 {
503 struct pid_namespace *ns = acct->ns;
504
505 ac.ac_pid = task_tgid_nr_ns(current, ns);
506 rcu_read_lock();
507 ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
508 ns);
509 rcu_read_unlock();
510 }
511#endif
550 /* 512 /*
551 * Accounting records are not subject to resource limits. 513 * Get freeze protection. If the fs is frozen, just skip the write
514 * as we could deadlock the system otherwise.
552 */ 515 */
553 flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 516 if (file_start_write_trylock(file)) {
554 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; 517 /* it's been opened O_APPEND, so position is irrelevant */
555 file->f_op->write(file, (char *)&ac, 518 loff_t pos = 0;
556 sizeof(acct_t), &file->f_pos); 519 __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
557 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; 520 file_end_write(file);
558 set_fs(fs); 521 }
559 file_end_write(file);
560out: 522out:
523 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
561 revert_creds(orig_cred); 524 revert_creds(orig_cred);
562} 525}
563 526
@@ -574,6 +537,7 @@ void acct_collect(long exitcode, int group_dead)
574 537
575 if (group_dead && current->mm) { 538 if (group_dead && current->mm) {
576 struct vm_area_struct *vma; 539 struct vm_area_struct *vma;
540
577 down_read(&current->mm->mmap_sem); 541 down_read(&current->mm->mmap_sem);
578 vma = current->mm->mmap; 542 vma = current->mm->mmap;
579 while (vma) { 543 while (vma) {
@@ -605,34 +569,20 @@ void acct_collect(long exitcode, int group_dead)
605 spin_unlock_irq(&current->sighand->siglock); 569 spin_unlock_irq(&current->sighand->siglock);
606} 570}
607 571
608static void acct_process_in_ns(struct pid_namespace *ns) 572static void slow_acct_process(struct pid_namespace *ns)
609{ 573{
610 struct file *file = NULL; 574 for ( ; ns; ns = ns->parent) {
611 struct bsd_acct_struct *acct; 575 struct bsd_acct_struct *acct = acct_get(ns);
612 576 if (acct) {
613 acct = ns->bacct; 577 do_acct_process(acct);
614 /* 578 mutex_unlock(&acct->lock);
615 * accelerate the common fastpath: 579 pin_put(&acct->pin);
616 */ 580 }
617 if (!acct || !acct->file)
618 return;
619
620 spin_lock(&acct_lock);
621 file = acct->file;
622 if (unlikely(!file)) {
623 spin_unlock(&acct_lock);
624 return;
625 } 581 }
626 get_file(file);
627 spin_unlock(&acct_lock);
628
629 do_acct_process(acct, ns, file);
630 fput(file);
631} 582}
632 583
633/** 584/**
634 * acct_process - now just a wrapper around acct_process_in_ns, 585 * acct_process
635 * which in turn is a wrapper around do_acct_process.
636 * 586 *
637 * handles process accounting for an exiting task 587 * handles process accounting for an exiting task
638 */ 588 */
@@ -645,6 +595,10 @@ void acct_process(void)
645 * alive and holds its namespace, which in turn holds 595 * alive and holds its namespace, which in turn holds
646 * its parent. 596 * its parent.
647 */ 597 */
648 for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) 598 for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
649 acct_process_in_ns(ns); 599 if (ns->bacct)
600 break;
601 }
602 if (unlikely(ns))
603 slow_acct_process(ns);
650} 604}
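
A note on the kernel/acct.c rework above: acct_kill() no longer closes the accounting file inline. It queues close_work() on the system workqueue and blocks on a completion until that worker has flushed and closed the file, so the potentially sleeping close always runs from clean worker context. Below is a minimal, hypothetical sketch of that defer-and-wait pattern (the struct and function names are invented for illustration; the real code is in the hunk above):

    #include <linux/workqueue.h>
    #include <linux/completion.h>

    /* Hypothetical carrier for the defer-and-wait pattern used by
     * acct_kill()/close_work() above. */
    struct deferred_close {
            struct work_struct work;
            struct completion done;
    };

    static void deferred_close_fn(struct work_struct *work)
    {
            struct deferred_close *dc =
                    container_of(work, struct deferred_close, work);

            /* ... flush and release the resource from worker context ... */
            complete(&dc->done);
    }

    static void close_synchronously(struct deferred_close *dc)
    {
            INIT_WORK(&dc->work, deferred_close_fn);
            init_completion(&dc->done);
            schedule_work(&dc->work);
            wait_for_completion(&dc->done);  /* returns once the worker ran */
    }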
diff --git a/kernel/async.c b/kernel/async.c
index 61f023ce0228..4c3773c0bf63 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -115,7 +115,7 @@ static void async_run_entry_fn(struct work_struct *work)
115 115
116 /* 1) run (and print duration) */ 116 /* 1) run (and print duration) */
117 if (initcall_debug && system_state == SYSTEM_BOOTING) { 117 if (initcall_debug && system_state == SYSTEM_BOOTING) {
118 printk(KERN_DEBUG "calling %lli_%pF @ %i\n", 118 pr_debug("calling %lli_%pF @ %i\n",
119 (long long)entry->cookie, 119 (long long)entry->cookie,
120 entry->func, task_pid_nr(current)); 120 entry->func, task_pid_nr(current));
121 calltime = ktime_get(); 121 calltime = ktime_get();
@@ -124,7 +124,7 @@ static void async_run_entry_fn(struct work_struct *work)
124 if (initcall_debug && system_state == SYSTEM_BOOTING) { 124 if (initcall_debug && system_state == SYSTEM_BOOTING) {
125 rettime = ktime_get(); 125 rettime = ktime_get();
126 delta = ktime_sub(rettime, calltime); 126 delta = ktime_sub(rettime, calltime);
127 printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n", 127 pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
128 (long long)entry->cookie, 128 (long long)entry->cookie,
129 entry->func, 129 entry->func,
130 (long long)ktime_to_ns(delta) >> 10); 130 (long long)ktime_to_ns(delta) >> 10);
@@ -285,7 +285,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
285 ktime_t uninitialized_var(starttime), delta, endtime; 285 ktime_t uninitialized_var(starttime), delta, endtime;
286 286
287 if (initcall_debug && system_state == SYSTEM_BOOTING) { 287 if (initcall_debug && system_state == SYSTEM_BOOTING) {
288 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); 288 pr_debug("async_waiting @ %i\n", task_pid_nr(current));
289 starttime = ktime_get(); 289 starttime = ktime_get();
290 } 290 }
291 291
@@ -295,7 +295,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
295 endtime = ktime_get(); 295 endtime = ktime_get();
296 delta = ktime_sub(endtime, starttime); 296 delta = ktime_sub(endtime, starttime);
297 297
298 printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n", 298 pr_debug("async_continuing @ %i after %lli usec\n",
299 task_pid_nr(current), 299 task_pid_nr(current),
300 (long long)ktime_to_ns(delta) >> 10); 300 (long long)ktime_to_ns(delta) >> 10);
301 } 301 }
diff --git a/kernel/audit.c b/kernel/audit.c
index ba2ff5a5c600..cebb11db4d34 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -126,7 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
126 126
127/* The netlink socket. */ 127/* The netlink socket. */
128static struct sock *audit_sock; 128static struct sock *audit_sock;
129int audit_net_id; 129static int audit_net_id;
130 130
131/* Hash for inode-based rules */ 131/* Hash for inode-based rules */
132struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; 132struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -724,7 +724,7 @@ static int audit_get_feature(struct sk_buff *skb)
724 724
725 seq = nlmsg_hdr(skb)->nlmsg_seq; 725 seq = nlmsg_hdr(skb)->nlmsg_seq;
726 726
727 audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af)); 727 audit_send_reply(skb, seq, AUDIT_GET_FEATURE, 0, 0, &af, sizeof(af));
728 728
729 return 0; 729 return 0;
730} 730}
@@ -739,7 +739,7 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
739 739
740 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); 740 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
741 audit_log_task_info(ab, current); 741 audit_log_task_info(ab, current);
742 audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", 742 audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
743 audit_feature_names[which], !!old_feature, !!new_feature, 743 audit_feature_names[which], !!old_feature, !!new_feature,
744 !!old_lock, !!new_lock, res); 744 !!old_lock, !!new_lock, res);
745 audit_log_end(ab); 745 audit_log_end(ab);
@@ -750,7 +750,7 @@ static int audit_set_feature(struct sk_buff *skb)
750 struct audit_features *uaf; 750 struct audit_features *uaf;
751 int i; 751 int i;
752 752
753 BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0])); 753 BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names));
754 uaf = nlmsg_data(nlmsg_hdr(skb)); 754 uaf = nlmsg_data(nlmsg_hdr(skb));
755 755
756 /* if there is ever a version 2 we should handle that here */ 756 /* if there is ever a version 2 we should handle that here */
@@ -1301,19 +1301,9 @@ err:
1301 */ 1301 */
1302unsigned int audit_serial(void) 1302unsigned int audit_serial(void)
1303{ 1303{
1304 static DEFINE_SPINLOCK(serial_lock); 1304 static atomic_t serial = ATOMIC_INIT(0);
1305 static unsigned int serial = 0;
1306 1305
1307 unsigned long flags; 1306 return atomic_add_return(1, &serial);
1308 unsigned int ret;
1309
1310 spin_lock_irqsave(&serial_lock, flags);
1311 do {
1312 ret = ++serial;
1313 } while (unlikely(!ret));
1314 spin_unlock_irqrestore(&serial_lock, flags);
1315
1316 return ret;
1317} 1307}
1318 1308
1319static inline void audit_get_stamp(struct audit_context *ctx, 1309static inline void audit_get_stamp(struct audit_context *ctx,
@@ -1681,7 +1671,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
1681 } 1671 }
1682} 1672}
1683 1673
1684void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) 1674static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
1685{ 1675{
1686 kernel_cap_t *perm = &name->fcap.permitted; 1676 kernel_cap_t *perm = &name->fcap.permitted;
1687 kernel_cap_t *inh = &name->fcap.inheritable; 1677 kernel_cap_t *inh = &name->fcap.inheritable;
@@ -1860,7 +1850,7 @@ EXPORT_SYMBOL(audit_log_task_context);
1860void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) 1850void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1861{ 1851{
1862 const struct cred *cred; 1852 const struct cred *cred;
1863 char name[sizeof(tsk->comm)]; 1853 char comm[sizeof(tsk->comm)];
1864 struct mm_struct *mm = tsk->mm; 1854 struct mm_struct *mm = tsk->mm;
1865 char *tty; 1855 char *tty;
1866 1856
@@ -1894,9 +1884,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1894 from_kgid(&init_user_ns, cred->fsgid), 1884 from_kgid(&init_user_ns, cred->fsgid),
1895 tty, audit_get_sessionid(tsk)); 1885 tty, audit_get_sessionid(tsk));
1896 1886
1897 get_task_comm(name, tsk);
1898 audit_log_format(ab, " comm="); 1887 audit_log_format(ab, " comm=");
1899 audit_log_untrustedstring(ab, name); 1888 audit_log_untrustedstring(ab, get_task_comm(comm, tsk));
1900 1889
1901 if (mm) { 1890 if (mm) {
1902 down_read(&mm->mmap_sem); 1891 down_read(&mm->mmap_sem);
@@ -1959,6 +1948,7 @@ void audit_log_end(struct audit_buffer *ab)
1959 } else { 1948 } else {
1960 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); 1949 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
1961 1950
1951 nlh->nlmsg_len = ab->skb->len;
1962 kauditd_send_multicast_skb(ab->skb); 1952 kauditd_send_multicast_skb(ab->skb);
1963 1953
1964 /* 1954 /*
@@ -1970,7 +1960,7 @@ void audit_log_end(struct audit_buffer *ab)
1970 * protocol between the kaudit kernel subsystem and the auditd 1960 * protocol between the kaudit kernel subsystem and the auditd
1971 * userspace code. 1961 * userspace code.
1972 */ 1962 */
1973 nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; 1963 nlh->nlmsg_len -= NLMSG_HDRLEN;
1974 1964
1975 if (audit_pid) { 1965 if (audit_pid) {
1976 skb_queue_tail(&audit_skb_queue, ab->skb); 1966 skb_queue_tail(&audit_skb_queue, ab->skb);
diff --git a/kernel/audit.h b/kernel/audit.h
index 7bb65730c890..3cdffad5a1d9 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -222,7 +222,6 @@ extern void audit_copy_inode(struct audit_names *name,
222 const struct inode *inode); 222 const struct inode *inode);
223extern void audit_log_cap(struct audit_buffer *ab, char *prefix, 223extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
224 kernel_cap_t *cap); 224 kernel_cap_t *cap);
225extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name);
226extern void audit_log_name(struct audit_context *context, 225extern void audit_log_name(struct audit_context *context,
227 struct audit_names *n, struct path *path, 226 struct audit_names *n, struct path *path,
228 int record_num, int *call_panic); 227 int record_num, int *call_panic);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 135944a7b28a..80f29e015570 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -154,6 +154,7 @@ static struct audit_chunk *alloc_chunk(int count)
154 chunk->owners[i].index = i; 154 chunk->owners[i].index = i;
155 } 155 }
156 fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); 156 fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
157 chunk->mark.mask = FS_IN_IGNORED;
157 return chunk; 158 return chunk;
158} 159}
159 160
@@ -449,7 +450,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
449 return 0; 450 return 0;
450} 451}
451 452
452static void audit_log_remove_rule(struct audit_krule *rule) 453static void audit_tree_log_remove_rule(struct audit_krule *rule)
453{ 454{
454 struct audit_buffer *ab; 455 struct audit_buffer *ab;
455 456
@@ -457,7 +458,7 @@ static void audit_log_remove_rule(struct audit_krule *rule)
457 if (unlikely(!ab)) 458 if (unlikely(!ab))
458 return; 459 return;
459 audit_log_format(ab, "op="); 460 audit_log_format(ab, "op=");
460 audit_log_string(ab, "remove rule"); 461 audit_log_string(ab, "remove_rule");
461 audit_log_format(ab, " dir="); 462 audit_log_format(ab, " dir=");
462 audit_log_untrustedstring(ab, rule->tree->pathname); 463 audit_log_untrustedstring(ab, rule->tree->pathname);
463 audit_log_key(ab, rule->filterkey); 464 audit_log_key(ab, rule->filterkey);
@@ -476,7 +477,7 @@ static void kill_rules(struct audit_tree *tree)
476 list_del_init(&rule->rlist); 477 list_del_init(&rule->rlist);
477 if (rule->tree) { 478 if (rule->tree) {
478 /* not a half-baked one */ 479 /* not a half-baked one */
479 audit_log_remove_rule(rule); 480 audit_tree_log_remove_rule(rule);
480 rule->tree = NULL; 481 rule->tree = NULL;
481 list_del_rcu(&entry->list); 482 list_del_rcu(&entry->list);
482 list_del(&entry->rule.list); 483 list_del(&entry->rule.list);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 70b4554d2fbe..ad9c1682f616 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -314,7 +314,7 @@ static void audit_update_watch(struct audit_parent *parent,
314 &nentry->rule.list); 314 &nentry->rule.list);
315 } 315 }
316 316
317 audit_watch_log_rule_change(r, owatch, "updated rules"); 317 audit_watch_log_rule_change(r, owatch, "updated_rules");
318 318
319 call_rcu(&oentry->rcu, audit_free_rule_rcu); 319 call_rcu(&oentry->rcu, audit_free_rule_rcu);
320 } 320 }
@@ -342,7 +342,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
342 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { 342 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
343 list_for_each_entry_safe(r, nextr, &w->rules, rlist) { 343 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
344 e = container_of(r, struct audit_entry, rule); 344 e = container_of(r, struct audit_entry, rule);
345 audit_watch_log_rule_change(r, w, "remove rule"); 345 audit_watch_log_rule_change(r, w, "remove_rule");
346 list_del(&r->rlist); 346 list_del(&r->rlist);
347 list_del(&r->list); 347 list_del(&r->list);
348 list_del_rcu(&e->list); 348 list_del_rcu(&e->list);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 8e9bc9c3dbb7..3598e13f2a65 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -71,6 +71,24 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
71 71
72DEFINE_MUTEX(audit_filter_mutex); 72DEFINE_MUTEX(audit_filter_mutex);
73 73
74static void audit_free_lsm_field(struct audit_field *f)
75{
76 switch (f->type) {
77 case AUDIT_SUBJ_USER:
78 case AUDIT_SUBJ_ROLE:
79 case AUDIT_SUBJ_TYPE:
80 case AUDIT_SUBJ_SEN:
81 case AUDIT_SUBJ_CLR:
82 case AUDIT_OBJ_USER:
83 case AUDIT_OBJ_ROLE:
84 case AUDIT_OBJ_TYPE:
85 case AUDIT_OBJ_LEV_LOW:
86 case AUDIT_OBJ_LEV_HIGH:
87 kfree(f->lsm_str);
88 security_audit_rule_free(f->lsm_rule);
89 }
90}
91
74static inline void audit_free_rule(struct audit_entry *e) 92static inline void audit_free_rule(struct audit_entry *e)
75{ 93{
76 int i; 94 int i;
@@ -80,11 +98,8 @@ static inline void audit_free_rule(struct audit_entry *e)
80 if (erule->watch) 98 if (erule->watch)
81 audit_put_watch(erule->watch); 99 audit_put_watch(erule->watch);
82 if (erule->fields) 100 if (erule->fields)
83 for (i = 0; i < erule->field_count; i++) { 101 for (i = 0; i < erule->field_count; i++)
84 struct audit_field *f = &erule->fields[i]; 102 audit_free_lsm_field(&erule->fields[i]);
85 kfree(f->lsm_str);
86 security_audit_rule_free(f->lsm_rule);
87 }
88 kfree(erule->fields); 103 kfree(erule->fields);
89 kfree(erule->filterkey); 104 kfree(erule->filterkey);
90 kfree(e); 105 kfree(e);
@@ -106,7 +121,7 @@ static inline struct audit_entry *audit_init_entry(u32 field_count)
106 if (unlikely(!entry)) 121 if (unlikely(!entry))
107 return NULL; 122 return NULL;
108 123
109 fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL); 124 fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL);
110 if (unlikely(!fields)) { 125 if (unlikely(!fields)) {
111 kfree(entry); 126 kfree(entry);
112 return NULL; 127 return NULL;
@@ -148,7 +163,7 @@ static inline int audit_to_inode(struct audit_krule *krule,
148 struct audit_field *f) 163 struct audit_field *f)
149{ 164{
150 if (krule->listnr != AUDIT_FILTER_EXIT || 165 if (krule->listnr != AUDIT_FILTER_EXIT ||
151 krule->watch || krule->inode_f || krule->tree || 166 krule->inode_f || krule->watch || krule->tree ||
152 (f->op != Audit_equal && f->op != Audit_not_equal)) 167 (f->op != Audit_equal && f->op != Audit_not_equal))
153 return -EINVAL; 168 return -EINVAL;
154 169
@@ -160,7 +175,7 @@ static __u32 *classes[AUDIT_SYSCALL_CLASSES];
160 175
161int __init audit_register_class(int class, unsigned *list) 176int __init audit_register_class(int class, unsigned *list)
162{ 177{
163 __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL); 178 __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL);
164 if (!p) 179 if (!p)
165 return -ENOMEM; 180 return -ENOMEM;
166 while (*list != ~0U) { 181 while (*list != ~0U) {
@@ -422,10 +437,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
422 437
423 f->type = data->fields[i]; 438 f->type = data->fields[i];
424 f->val = data->values[i]; 439 f->val = data->values[i];
425 f->uid = INVALID_UID;
426 f->gid = INVALID_GID;
427 f->lsm_str = NULL;
428 f->lsm_rule = NULL;
429 440
430 /* Support legacy tests for a valid loginuid */ 441 /* Support legacy tests for a valid loginuid */
431 if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { 442 if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
@@ -1053,30 +1064,27 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data,
1053 int err = 0; 1064 int err = 0;
1054 struct audit_entry *entry; 1065 struct audit_entry *entry;
1055 1066
1067 entry = audit_data_to_entry(data, datasz);
1068 if (IS_ERR(entry))
1069 return PTR_ERR(entry);
1070
1056 switch (type) { 1071 switch (type) {
1057 case AUDIT_ADD_RULE: 1072 case AUDIT_ADD_RULE:
1058 entry = audit_data_to_entry(data, datasz);
1059 if (IS_ERR(entry))
1060 return PTR_ERR(entry);
1061
1062 err = audit_add_rule(entry); 1073 err = audit_add_rule(entry);
1063 audit_log_rule_change("add rule", &entry->rule, !err); 1074 audit_log_rule_change("add_rule", &entry->rule, !err);
1064 if (err)
1065 audit_free_rule(entry);
1066 break; 1075 break;
1067 case AUDIT_DEL_RULE: 1076 case AUDIT_DEL_RULE:
1068 entry = audit_data_to_entry(data, datasz);
1069 if (IS_ERR(entry))
1070 return PTR_ERR(entry);
1071
1072 err = audit_del_rule(entry); 1077 err = audit_del_rule(entry);
1073 audit_log_rule_change("remove rule", &entry->rule, !err); 1078 audit_log_rule_change("remove_rule", &entry->rule, !err);
1074 audit_free_rule(entry);
1075 break; 1079 break;
1076 default: 1080 default:
1077 return -EINVAL; 1081 err = -EINVAL;
1082 WARN_ON(1);
1078 } 1083 }
1079 1084
1085 if (err || type == AUDIT_DEL_RULE)
1086 audit_free_rule(entry);
1087
1080 return err; 1088 return err;
1081} 1089}
1082 1090
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7208c1df248d..e420a0c41b5f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -67,6 +67,7 @@
67#include <linux/binfmts.h> 67#include <linux/binfmts.h>
68#include <linux/highmem.h> 68#include <linux/highmem.h>
69#include <linux/syscalls.h> 69#include <linux/syscalls.h>
70#include <asm/syscall.h>
70#include <linux/capability.h> 71#include <linux/capability.h>
71#include <linux/fs_struct.h> 72#include <linux/fs_struct.h>
72#include <linux/compat.h> 73#include <linux/compat.h>
@@ -125,14 +126,6 @@ struct audit_tree_refs {
125 struct audit_chunk *c[31]; 126 struct audit_chunk *c[31];
126}; 127};
127 128
128static inline int open_arg(int flags, int mask)
129{
130 int n = ACC_MODE(flags);
131 if (flags & (O_TRUNC | O_CREAT))
132 n |= AUDIT_PERM_WRITE;
133 return n & mask;
134}
135
136static int audit_match_perm(struct audit_context *ctx, int mask) 129static int audit_match_perm(struct audit_context *ctx, int mask)
137{ 130{
138 unsigned n; 131 unsigned n;
@@ -1505,7 +1498,6 @@ void __audit_free(struct task_struct *tsk)
1505 1498
1506/** 1499/**
1507 * audit_syscall_entry - fill in an audit record at syscall entry 1500 * audit_syscall_entry - fill in an audit record at syscall entry
1508 * @arch: architecture type
1509 * @major: major syscall type (function) 1501 * @major: major syscall type (function)
1510 * @a1: additional syscall register 1 1502 * @a1: additional syscall register 1
1511 * @a2: additional syscall register 2 1503 * @a2: additional syscall register 2
@@ -1520,9 +1512,8 @@ void __audit_free(struct task_struct *tsk)
1520 * will only be written if another part of the kernel requests that it 1512 * will only be written if another part of the kernel requests that it
1521 * be written). 1513 * be written).
1522 */ 1514 */
1523void __audit_syscall_entry(int arch, int major, 1515void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
1524 unsigned long a1, unsigned long a2, 1516 unsigned long a3, unsigned long a4)
1525 unsigned long a3, unsigned long a4)
1526{ 1517{
1527 struct task_struct *tsk = current; 1518 struct task_struct *tsk = current;
1528 struct audit_context *context = tsk->audit_context; 1519 struct audit_context *context = tsk->audit_context;
@@ -1536,7 +1527,7 @@ void __audit_syscall_entry(int arch, int major,
1536 if (!audit_enabled) 1527 if (!audit_enabled)
1537 return; 1528 return;
1538 1529
1539 context->arch = arch; 1530 context->arch = syscall_get_arch();
1540 context->major = major; 1531 context->major = major;
1541 context->argv[0] = a1; 1532 context->argv[0] = a1;
1542 context->argv[1] = a2; 1533 context->argv[1] = a2;
@@ -2433,6 +2424,7 @@ static void audit_log_task(struct audit_buffer *ab)
2433 kgid_t gid; 2424 kgid_t gid;
2434 unsigned int sessionid; 2425 unsigned int sessionid;
2435 struct mm_struct *mm = current->mm; 2426 struct mm_struct *mm = current->mm;
2427 char comm[sizeof(current->comm)];
2436 2428
2437 auid = audit_get_loginuid(current); 2429 auid = audit_get_loginuid(current);
2438 sessionid = audit_get_sessionid(current); 2430 sessionid = audit_get_sessionid(current);
@@ -2445,7 +2437,7 @@ static void audit_log_task(struct audit_buffer *ab)
2445 sessionid); 2437 sessionid);
2446 audit_log_task_context(ab); 2438 audit_log_task_context(ab);
2447 audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); 2439 audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
2448 audit_log_untrustedstring(ab, current->comm); 2440 audit_log_untrustedstring(ab, get_task_comm(comm, current));
2449 if (mm) { 2441 if (mm) {
2450 down_read(&mm->mmap_sem); 2442 down_read(&mm->mmap_sem);
2451 if (mm->exe_file) 2443 if (mm->exe_file)
@@ -2488,11 +2480,9 @@ void __audit_seccomp(unsigned long syscall, long signr, int code)
2488 if (unlikely(!ab)) 2480 if (unlikely(!ab))
2489 return; 2481 return;
2490 audit_log_task(ab); 2482 audit_log_task(ab);
2491 audit_log_format(ab, " sig=%ld", signr); 2483 audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x",
2492 audit_log_format(ab, " syscall=%ld", syscall); 2484 signr, syscall_get_arch(), syscall, is_compat_task(),
2493 audit_log_format(ab, " compat=%d", is_compat_task()); 2485 KSTK_EIP(current), code);
2494 audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
2495 audit_log_format(ab, " code=0x%x", code);
2496 audit_log_end(ab); 2486 audit_log_end(ab);
2497} 2487}
2498 2488
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 9fd4246b04b8..e1d1d1952bfa 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,7 +9,6 @@
9#include <linux/page-flags.h> 9#include <linux/page-flags.h>
10#include <linux/mmzone.h> 10#include <linux/mmzone.h>
11#include <linux/kbuild.h> 11#include <linux/kbuild.h>
12#include <linux/page_cgroup.h>
13#include <linux/log2.h> 12#include <linux/log2.h>
14#include <linux/spinlock_types.h> 13#include <linux/spinlock_types.h>
15 14
@@ -18,7 +17,6 @@ void foo(void)
18 /* The enum constants to put into include/generated/bounds.h */ 17 /* The enum constants to put into include/generated/bounds.h */
19 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); 18 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
20 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); 19 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
21 DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
22#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
23 DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); 21 DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
24#endif 22#endif
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 6a71145e2769..0daf7f6ae7df 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1 +1,5 @@
1obj-y := core.o 1obj-y := core.o
2obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o
3ifdef CONFIG_TEST_BPF
4obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
5endif
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7f0dbcbb34af..d6594e457a25 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -20,9 +20,14 @@
20 * Andi Kleen - Fix a few bad bugs and races. 20 * Andi Kleen - Fix a few bad bugs and races.
21 * Kris Katterjohn - Added many additional checks in bpf_check_classic() 21 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
22 */ 22 */
23
23#include <linux/filter.h> 24#include <linux/filter.h>
24#include <linux/skbuff.h> 25#include <linux/skbuff.h>
26#include <linux/vmalloc.h>
27#include <linux/random.h>
28#include <linux/moduleloader.h>
25#include <asm/unaligned.h> 29#include <asm/unaligned.h>
30#include <linux/bpf.h>
26 31
27/* Registers */ 32/* Registers */
28#define BPF_R0 regs[BPF_REG_0] 33#define BPF_R0 regs[BPF_REG_0]
@@ -63,6 +68,105 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
63 return NULL; 68 return NULL;
64} 69}
65 70
71struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
72{
73 gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
74 gfp_extra_flags;
75 struct bpf_prog_aux *aux;
76 struct bpf_prog *fp;
77
78 size = round_up(size, PAGE_SIZE);
79 fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
80 if (fp == NULL)
81 return NULL;
82
83 aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
84 if (aux == NULL) {
85 vfree(fp);
86 return NULL;
87 }
88
89 fp->pages = size / PAGE_SIZE;
90 fp->aux = aux;
91
92 return fp;
93}
94EXPORT_SYMBOL_GPL(bpf_prog_alloc);
95
96struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
97 gfp_t gfp_extra_flags)
98{
99 gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
100 gfp_extra_flags;
101 struct bpf_prog *fp;
102
103 BUG_ON(fp_old == NULL);
104
105 size = round_up(size, PAGE_SIZE);
106 if (size <= fp_old->pages * PAGE_SIZE)
107 return fp_old;
108
109 fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
110 if (fp != NULL) {
111 memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
112 fp->pages = size / PAGE_SIZE;
113
114 /* We keep fp->aux from fp_old around in the new
115 * reallocated structure.
116 */
117 fp_old->aux = NULL;
118 __bpf_prog_free(fp_old);
119 }
120
121 return fp;
122}
123EXPORT_SYMBOL_GPL(bpf_prog_realloc);
124
125void __bpf_prog_free(struct bpf_prog *fp)
126{
127 kfree(fp->aux);
128 vfree(fp);
129}
130EXPORT_SYMBOL_GPL(__bpf_prog_free);
131
132#ifdef CONFIG_BPF_JIT
133struct bpf_binary_header *
134bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
135 unsigned int alignment,
136 bpf_jit_fill_hole_t bpf_fill_ill_insns)
137{
138 struct bpf_binary_header *hdr;
139 unsigned int size, hole, start;
140
141 /* Most of BPF filters are really small, but if some of them
142 * fill a page, allow at least 128 extra bytes to insert a
143 * random section of illegal instructions.
144 */
145 size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
146 hdr = module_alloc(size);
147 if (hdr == NULL)
148 return NULL;
149
150 /* Fill space with illegal/arch-dep instructions. */
151 bpf_fill_ill_insns(hdr, size);
152
153 hdr->pages = size / PAGE_SIZE;
154 hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
155 PAGE_SIZE - sizeof(*hdr));
156 start = (prandom_u32() % hole) & ~(alignment - 1);
157
158 /* Leave a random number of instructions before BPF code. */
159 *image_ptr = &hdr->image[start];
160
161 return hdr;
162}
163
164void bpf_jit_binary_free(struct bpf_binary_header *hdr)
165{
166 module_free(NULL, hdr);
167}
168#endif /* CONFIG_BPF_JIT */
169
66 170/* Base function for offset calculation. Needs to go into .text section,
67 171 * therefore keeping it non-static as well; will also be used by JITs
68 172 * anyway later on, so do not let the compiler omit it.
@@ -180,6 +284,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
180 284 [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
181 285 [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
182 286 [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
287 [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
183 288 };
184 289 void *ptr;
185 290 int off;
@@ -239,6 +344,10 @@ select_insn:
239 344 ALU64_MOV_K:
240 345 DST = IMM;
241 346 CONT;
347 LD_IMM_DW:
348 DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
349 insn++;
350 CONT;
242 351 ALU64_ARSH_X:
243 352 (*(s64 *) &DST) >>= SRC;
244 353 CONT;
@@ -523,12 +632,35 @@ void bpf_prog_select_runtime(struct bpf_prog *fp)
523 632
524 633 /* Probe if internal BPF can be JITed */
525 634 bpf_int_jit_compile(fp);
635 /* Lock whole bpf_prog as read-only */
636 bpf_prog_lock_ro(fp);
526 637}
527 638EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
528 639
529/* free internal BPF program */
640static void bpf_prog_free_deferred(struct work_struct *work)
641{
642 struct bpf_prog_aux *aux;
643
644 aux = container_of(work, struct bpf_prog_aux, work);
645 bpf_jit_free(aux->prog);
646}
647
648/* Free internal BPF program */
530 649void bpf_prog_free(struct bpf_prog *fp)
531 650{
532 bpf_jit_free(fp);
651 struct bpf_prog_aux *aux = fp->aux;
652
653 INIT_WORK(&aux->work, bpf_prog_free_deferred);
654 aux->prog = fp;
655 schedule_work(&aux->work);
533 656}
534 657EXPORT_SYMBOL_GPL(bpf_prog_free);
658
659/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
660 * skb_copy_bits(), so provide a weak definition of it for NET-less config.
661 */
662int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
663 int len)
664{
665 return -EFAULT;
666}
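The LD_IMM_DW handler added above consumes two consecutive instructions and glues their 32-bit imm fields into one 64-bit constant. A minimal user-space sketch of the same bit manipulation (the example values are arbitrary):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* the two imm fields of a BPF_LD | BPF_IMM | BPF_DW instruction pair */
	int32_t imm_lo = 0x11223344;	/* insn[0].imm: low 32 bits */
	int32_t imm_hi = 0x55667788;	/* insn[1].imm: high 32 bits */

	/* same expression as the interpreter: zero-extend each half, then combine */
	uint64_t dst = (uint64_t)(uint32_t)imm_lo |
		       ((uint64_t)(uint32_t)imm_hi) << 32;

	printf("0x%016llx\n", (unsigned long long)dst);	/* prints 0x5566778811223344 */
	return 0;
}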
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
new file mode 100644
index 000000000000..ba61c8c16032
--- /dev/null
+++ b/kernel/bpf/syscall.c
@@ -0,0 +1,606 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/bpf.h>
13#include <linux/syscalls.h>
14#include <linux/slab.h>
15#include <linux/anon_inodes.h>
16#include <linux/file.h>
17#include <linux/license.h>
18#include <linux/filter.h>
19
20static LIST_HEAD(bpf_map_types);
21
22static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
23{
24 struct bpf_map_type_list *tl;
25 struct bpf_map *map;
26
27 list_for_each_entry(tl, &bpf_map_types, list_node) {
28 if (tl->type == attr->map_type) {
29 map = tl->ops->map_alloc(attr);
30 if (IS_ERR(map))
31 return map;
32 map->ops = tl->ops;
33 map->map_type = attr->map_type;
34 return map;
35 }
36 }
37 return ERR_PTR(-EINVAL);
38}
39
40/* boot time registration of different map implementations */
41void bpf_register_map_type(struct bpf_map_type_list *tl)
42{
43 list_add(&tl->list_node, &bpf_map_types);
44}
45
46/* called from workqueue */
47static void bpf_map_free_deferred(struct work_struct *work)
48{
49 struct bpf_map *map = container_of(work, struct bpf_map, work);
50
51 /* implementation dependent freeing */
52 map->ops->map_free(map);
53}
54
55/* decrement map refcnt and schedule it for freeing via workqueue
56 * (underlying map implementation ops->map_free() might sleep)
57 */
58void bpf_map_put(struct bpf_map *map)
59{
60 if (atomic_dec_and_test(&map->refcnt)) {
61 INIT_WORK(&map->work, bpf_map_free_deferred);
62 schedule_work(&map->work);
63 }
64}
65
66static int bpf_map_release(struct inode *inode, struct file *filp)
67{
68 struct bpf_map *map = filp->private_data;
69
70 bpf_map_put(map);
71 return 0;
72}
73
74static const struct file_operations bpf_map_fops = {
75 .release = bpf_map_release,
76};
77
78/* helper macro to check that unused fields of 'union bpf_attr' are zero */
79#define CHECK_ATTR(CMD) \
80 memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
81 sizeof(attr->CMD##_LAST_FIELD), 0, \
82 sizeof(*attr) - \
83 offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
84 sizeof(attr->CMD##_LAST_FIELD)) != NULL
85
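To make the macro concrete: with BPF_MAP_CREATE_LAST_FIELD defined as max_entries just below, CHECK_ATTR(BPF_MAP_CREATE) should expand to roughly the following (mechanical expansion shown for illustration only), i.e. the command is rejected whenever any byte past the last field it understands is non-zero:

	memchr_inv((void *) &attr->max_entries + sizeof(attr->max_entries), 0,
		   sizeof(*attr) - offsetof(union bpf_attr, max_entries) -
		   sizeof(attr->max_entries)) != NULL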
86#define BPF_MAP_CREATE_LAST_FIELD max_entries
87/* called via syscall */
88static int map_create(union bpf_attr *attr)
89{
90 struct bpf_map *map;
91 int err;
92
93 err = CHECK_ATTR(BPF_MAP_CREATE);
94 if (err)
95 return -EINVAL;
96
97 /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
98 map = find_and_alloc_map(attr);
99 if (IS_ERR(map))
100 return PTR_ERR(map);
101
102 atomic_set(&map->refcnt, 1);
103
104 err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
105
106 if (err < 0)
107 /* failed to allocate fd */
108 goto free_map;
109
110 return err;
111
112free_map:
113 map->ops->map_free(map);
114 return err;
115}
116
117/* if error is returned, fd is released.
118 * On success caller should complete fd access with matching fdput()
119 */
120struct bpf_map *bpf_map_get(struct fd f)
121{
122 struct bpf_map *map;
123
124 if (!f.file)
125 return ERR_PTR(-EBADF);
126
127 if (f.file->f_op != &bpf_map_fops) {
128 fdput(f);
129 return ERR_PTR(-EINVAL);
130 }
131
132 map = f.file->private_data;
133
134 return map;
135}
136
137/* helper to convert user pointers passed inside __aligned_u64 fields */
138static void __user *u64_to_ptr(__u64 val)
139{
140 return (void __user *) (unsigned long) val;
141}
142
143/* last field in 'union bpf_attr' used by this command */
144#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
145
146static int map_lookup_elem(union bpf_attr *attr)
147{
148 void __user *ukey = u64_to_ptr(attr->key);
149 void __user *uvalue = u64_to_ptr(attr->value);
150 int ufd = attr->map_fd;
151 struct fd f = fdget(ufd);
152 struct bpf_map *map;
153 void *key, *value;
154 int err;
155
156 if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
157 return -EINVAL;
158
159 map = bpf_map_get(f);
160 if (IS_ERR(map))
161 return PTR_ERR(map);
162
163 err = -ENOMEM;
164 key = kmalloc(map->key_size, GFP_USER);
165 if (!key)
166 goto err_put;
167
168 err = -EFAULT;
169 if (copy_from_user(key, ukey, map->key_size) != 0)
170 goto free_key;
171
172 err = -ESRCH;
173 rcu_read_lock();
174 value = map->ops->map_lookup_elem(map, key);
175 if (!value)
176 goto err_unlock;
177
178 err = -EFAULT;
179 if (copy_to_user(uvalue, value, map->value_size) != 0)
180 goto err_unlock;
181
182 err = 0;
183
184err_unlock:
185 rcu_read_unlock();
186free_key:
187 kfree(key);
188err_put:
189 fdput(f);
190 return err;
191}
192
193#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value
194
195static int map_update_elem(union bpf_attr *attr)
196{
197 void __user *ukey = u64_to_ptr(attr->key);
198 void __user *uvalue = u64_to_ptr(attr->value);
199 int ufd = attr->map_fd;
200 struct fd f = fdget(ufd);
201 struct bpf_map *map;
202 void *key, *value;
203 int err;
204
205 if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
206 return -EINVAL;
207
208 map = bpf_map_get(f);
209 if (IS_ERR(map))
210 return PTR_ERR(map);
211
212 err = -ENOMEM;
213 key = kmalloc(map->key_size, GFP_USER);
214 if (!key)
215 goto err_put;
216
217 err = -EFAULT;
218 if (copy_from_user(key, ukey, map->key_size) != 0)
219 goto free_key;
220
221 err = -ENOMEM;
222 value = kmalloc(map->value_size, GFP_USER);
223 if (!value)
224 goto free_key;
225
226 err = -EFAULT;
227 if (copy_from_user(value, uvalue, map->value_size) != 0)
228 goto free_value;
229
230	/* eBPF programs that use maps run under rcu_read_lock(),
231 * therefore all map accessors rely on this fact, so do the same here
232 */
233 rcu_read_lock();
234 err = map->ops->map_update_elem(map, key, value);
235 rcu_read_unlock();
236
237free_value:
238 kfree(value);
239free_key:
240 kfree(key);
241err_put:
242 fdput(f);
243 return err;
244}
245
246#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
247
248static int map_delete_elem(union bpf_attr *attr)
249{
250 void __user *ukey = u64_to_ptr(attr->key);
251 int ufd = attr->map_fd;
252 struct fd f = fdget(ufd);
253 struct bpf_map *map;
254 void *key;
255 int err;
256
257 if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
258 return -EINVAL;
259
260 map = bpf_map_get(f);
261 if (IS_ERR(map))
262 return PTR_ERR(map);
263
264 err = -ENOMEM;
265 key = kmalloc(map->key_size, GFP_USER);
266 if (!key)
267 goto err_put;
268
269 err = -EFAULT;
270 if (copy_from_user(key, ukey, map->key_size) != 0)
271 goto free_key;
272
273 rcu_read_lock();
274 err = map->ops->map_delete_elem(map, key);
275 rcu_read_unlock();
276
277free_key:
278 kfree(key);
279err_put:
280 fdput(f);
281 return err;
282}
283
284/* last field in 'union bpf_attr' used by this command */
285#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
286
287static int map_get_next_key(union bpf_attr *attr)
288{
289 void __user *ukey = u64_to_ptr(attr->key);
290 void __user *unext_key = u64_to_ptr(attr->next_key);
291 int ufd = attr->map_fd;
292 struct fd f = fdget(ufd);
293 struct bpf_map *map;
294 void *key, *next_key;
295 int err;
296
297 if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
298 return -EINVAL;
299
300 map = bpf_map_get(f);
301 if (IS_ERR(map))
302 return PTR_ERR(map);
303
304 err = -ENOMEM;
305 key = kmalloc(map->key_size, GFP_USER);
306 if (!key)
307 goto err_put;
308
309 err = -EFAULT;
310 if (copy_from_user(key, ukey, map->key_size) != 0)
311 goto free_key;
312
313 err = -ENOMEM;
314 next_key = kmalloc(map->key_size, GFP_USER);
315 if (!next_key)
316 goto free_key;
317
318 rcu_read_lock();
319 err = map->ops->map_get_next_key(map, key, next_key);
320 rcu_read_unlock();
321 if (err)
322 goto free_next_key;
323
324 err = -EFAULT;
325 if (copy_to_user(unext_key, next_key, map->key_size) != 0)
326 goto free_next_key;
327
328 err = 0;
329
330free_next_key:
331 kfree(next_key);
332free_key:
333 kfree(key);
334err_put:
335 fdput(f);
336 return err;
337}
338
339static LIST_HEAD(bpf_prog_types);
340
341static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
342{
343 struct bpf_prog_type_list *tl;
344
345 list_for_each_entry(tl, &bpf_prog_types, list_node) {
346 if (tl->type == type) {
347 prog->aux->ops = tl->ops;
348 prog->aux->prog_type = type;
349 return 0;
350 }
351 }
352 return -EINVAL;
353}
354
355void bpf_register_prog_type(struct bpf_prog_type_list *tl)
356{
357 list_add(&tl->list_node, &bpf_prog_types);
358}
359
360/* fixup insn->imm field of bpf_call instructions:
361 * if (insn->imm == BPF_FUNC_map_lookup_elem)
362 * insn->imm = bpf_map_lookup_elem - __bpf_call_base;
363 * else if (insn->imm == BPF_FUNC_map_update_elem)
364 * insn->imm = bpf_map_update_elem - __bpf_call_base;
365 * else ...
366 *
367 * this function is called after eBPF program passed verification
368 */
369static void fixup_bpf_calls(struct bpf_prog *prog)
370{
371 const struct bpf_func_proto *fn;
372 int i;
373
374 for (i = 0; i < prog->len; i++) {
375 struct bpf_insn *insn = &prog->insnsi[i];
376
377 if (insn->code == (BPF_JMP | BPF_CALL)) {
378 /* we reach here when program has bpf_call instructions
379 * and it passed bpf_check(), means that
380 * ops->get_func_proto must have been supplied, check it
381 */
382 BUG_ON(!prog->aux->ops->get_func_proto);
383
384 fn = prog->aux->ops->get_func_proto(insn->imm);
385 /* all functions that have prototype and verifier allowed
386 * programs to call them, must be real in-kernel functions
387 */
388 BUG_ON(!fn->func);
389 insn->imm = fn->func - __bpf_call_base;
390 }
391 }
392}
393
394/* drop refcnt on maps used by eBPF program and free auxiliary data */
395static void free_used_maps(struct bpf_prog_aux *aux)
396{
397 int i;
398
399 for (i = 0; i < aux->used_map_cnt; i++)
400 bpf_map_put(aux->used_maps[i]);
401
402 kfree(aux->used_maps);
403}
404
405void bpf_prog_put(struct bpf_prog *prog)
406{
407 if (atomic_dec_and_test(&prog->aux->refcnt)) {
408 free_used_maps(prog->aux);
409 bpf_prog_free(prog);
410 }
411}
412
413static int bpf_prog_release(struct inode *inode, struct file *filp)
414{
415 struct bpf_prog *prog = filp->private_data;
416
417 bpf_prog_put(prog);
418 return 0;
419}
420
421static const struct file_operations bpf_prog_fops = {
422 .release = bpf_prog_release,
423};
424
425static struct bpf_prog *get_prog(struct fd f)
426{
427 struct bpf_prog *prog;
428
429 if (!f.file)
430 return ERR_PTR(-EBADF);
431
432 if (f.file->f_op != &bpf_prog_fops) {
433 fdput(f);
434 return ERR_PTR(-EINVAL);
435 }
436
437 prog = f.file->private_data;
438
439 return prog;
440}
441
442/* called by sockets/tracing/seccomp before attaching program to an event
443 * pairs with bpf_prog_put()
444 */
445struct bpf_prog *bpf_prog_get(u32 ufd)
446{
447 struct fd f = fdget(ufd);
448 struct bpf_prog *prog;
449
450 prog = get_prog(f);
451
452 if (IS_ERR(prog))
453 return prog;
454
455 atomic_inc(&prog->aux->refcnt);
456 fdput(f);
457 return prog;
458}
459
460/* last field in 'union bpf_attr' used by this command */
461#define BPF_PROG_LOAD_LAST_FIELD log_buf
462
463static int bpf_prog_load(union bpf_attr *attr)
464{
465 enum bpf_prog_type type = attr->prog_type;
466 struct bpf_prog *prog;
467 int err;
468 char license[128];
469 bool is_gpl;
470
471 if (CHECK_ATTR(BPF_PROG_LOAD))
472 return -EINVAL;
473
474 /* copy eBPF program license from user space */
475 if (strncpy_from_user(license, u64_to_ptr(attr->license),
476 sizeof(license) - 1) < 0)
477 return -EFAULT;
478 license[sizeof(license) - 1] = 0;
479
480 /* eBPF programs must be GPL compatible to use GPL-ed functions */
481 is_gpl = license_is_gpl_compatible(license);
482
483 if (attr->insn_cnt >= BPF_MAXINSNS)
484 return -EINVAL;
485
486 /* plain bpf_prog allocation */
487 prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
488 if (!prog)
489 return -ENOMEM;
490
491 prog->len = attr->insn_cnt;
492
493 err = -EFAULT;
494 if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
495 prog->len * sizeof(struct bpf_insn)) != 0)
496 goto free_prog;
497
498 prog->orig_prog = NULL;
499 prog->jited = false;
500
501 atomic_set(&prog->aux->refcnt, 1);
502 prog->aux->is_gpl_compatible = is_gpl;
503
504 /* find program type: socket_filter vs tracing_filter */
505 err = find_prog_type(type, prog);
506 if (err < 0)
507 goto free_prog;
508
509 /* run eBPF verifier */
510 err = bpf_check(prog, attr);
511
512 if (err < 0)
513 goto free_used_maps;
514
515 /* fixup BPF_CALL->imm field */
516 fixup_bpf_calls(prog);
517
518 /* eBPF program is ready to be JITed */
519 bpf_prog_select_runtime(prog);
520
521 err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
522
523 if (err < 0)
524 /* failed to allocate fd */
525 goto free_used_maps;
526
527 return err;
528
529free_used_maps:
530 free_used_maps(prog->aux);
531free_prog:
532 bpf_prog_free(prog);
533 return err;
534}
535
536SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
537{
538 union bpf_attr attr = {};
539 int err;
540
541 /* the syscall is limited to root temporarily. This restriction will be
542 * lifted when security audit is clean. Note that eBPF+tracing must have
543 * this restriction, since it may pass kernel data to user space
544 */
545 if (!capable(CAP_SYS_ADMIN))
546 return -EPERM;
547
548 if (!access_ok(VERIFY_READ, uattr, 1))
549 return -EFAULT;
550
551 if (size > PAGE_SIZE) /* silly large */
552 return -E2BIG;
553
554 /* If we're handed a bigger struct than we know of,
555 * ensure all the unknown bits are 0 - i.e. new
556 * user-space does not rely on any kernel feature
557	 * extensions we don't know about yet.
558 */
559 if (size > sizeof(attr)) {
560 unsigned char __user *addr;
561 unsigned char __user *end;
562 unsigned char val;
563
564 addr = (void __user *)uattr + sizeof(attr);
565 end = (void __user *)uattr + size;
566
567 for (; addr < end; addr++) {
568 err = get_user(val, addr);
569 if (err)
570 return err;
571 if (val)
572 return -E2BIG;
573 }
574 size = sizeof(attr);
575 }
576
577 /* copy attributes from user space, may be less than sizeof(bpf_attr) */
578 if (copy_from_user(&attr, uattr, size) != 0)
579 return -EFAULT;
580
581 switch (cmd) {
582 case BPF_MAP_CREATE:
583 err = map_create(&attr);
584 break;
585 case BPF_MAP_LOOKUP_ELEM:
586 err = map_lookup_elem(&attr);
587 break;
588 case BPF_MAP_UPDATE_ELEM:
589 err = map_update_elem(&attr);
590 break;
591 case BPF_MAP_DELETE_ELEM:
592 err = map_delete_elem(&attr);
593 break;
594 case BPF_MAP_GET_NEXT_KEY:
595 err = map_get_next_key(&attr);
596 break;
597 case BPF_PROG_LOAD:
598 err = bpf_prog_load(&attr);
599 break;
600 default:
601 err = -EINVAL;
602 break;
603 }
604
605 return err;
606}
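Putting the command handlers above together, user space would drive this interface roughly as sketched below. This is only an illustration under assumptions: it presumes a uapi <linux/bpf.h> exposing union bpf_attr and the BPF_* command/map-type constants used in this file, a wired-up __NR_bpf syscall number, and a map type whose ops implement lookup/update (the test stub in the next file only implements alloc/free); none of that is part of this diff.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>		/* union bpf_attr, BPF_MAP_CREATE, ... (assumed uapi header) */

static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);	/* __NR_bpf assumed to be defined */
}

int main(void)
{
	union bpf_attr attr;
	int map_fd, key = 1, value = 42, out;

	/* BPF_MAP_CREATE: the unused tail of bpf_attr must be zero (see CHECK_ATTR) */
	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_UNSPEC;	/* placeholder; full map types come in later patches */
	attr.key_size = sizeof(key);
	attr.value_size = sizeof(value);
	attr.max_entries = 16;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0) {
		perror("BPF_MAP_CREATE");
		return 1;
	}

	/* BPF_MAP_UPDATE_ELEM: key/value are passed as 64-bit user pointers */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (unsigned long) &key;
	attr.value = (unsigned long) &value;
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)))
		perror("BPF_MAP_UPDATE_ELEM");

	/* BPF_MAP_LOOKUP_ELEM: the kernel copies the element back into 'out' */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (unsigned long) &key;
	attr.value = (unsigned long) &out;
	if (sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)) == 0)
		printf("value = %d\n", out);

	close(map_fd);
	return 0;
}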
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
new file mode 100644
index 000000000000..fcaddff4003e
--- /dev/null
+++ b/kernel/bpf/test_stub.c
@@ -0,0 +1,116 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <linux/kernel.h>
8#include <linux/types.h>
9#include <linux/slab.h>
10#include <linux/err.h>
11#include <linux/bpf.h>
12
13/* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC
14 * to be used by user space verifier testsuite
15 */
16struct bpf_context {
17 u64 arg1;
18 u64 arg2;
19};
20
21static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
22{
23 return 0;
24}
25
26static struct bpf_func_proto test_funcs[] = {
27 [BPF_FUNC_unspec] = {
28 .func = test_func,
29 .gpl_only = true,
30 .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
31 .arg1_type = ARG_CONST_MAP_PTR,
32 .arg2_type = ARG_PTR_TO_MAP_KEY,
33 },
34};
35
36static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
37{
38 if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs))
39 return NULL;
40 return &test_funcs[func_id];
41}
42
43static const struct bpf_context_access {
44 int size;
45 enum bpf_access_type type;
46} test_ctx_access[] = {
47 [offsetof(struct bpf_context, arg1)] = {
48 FIELD_SIZEOF(struct bpf_context, arg1),
49 BPF_READ
50 },
51 [offsetof(struct bpf_context, arg2)] = {
52 FIELD_SIZEOF(struct bpf_context, arg2),
53 BPF_READ
54 },
55};
56
57static bool test_is_valid_access(int off, int size, enum bpf_access_type type)
58{
59 const struct bpf_context_access *access;
60
61 if (off < 0 || off >= ARRAY_SIZE(test_ctx_access))
62 return false;
63
64 access = &test_ctx_access[off];
65 if (access->size == size && (access->type & type))
66 return true;
67
68 return false;
69}
70
71static struct bpf_verifier_ops test_ops = {
72 .get_func_proto = test_func_proto,
73 .is_valid_access = test_is_valid_access,
74};
75
76static struct bpf_prog_type_list tl_prog = {
77 .ops = &test_ops,
78 .type = BPF_PROG_TYPE_UNSPEC,
79};
80
81static struct bpf_map *test_map_alloc(union bpf_attr *attr)
82{
83 struct bpf_map *map;
84
85 map = kzalloc(sizeof(*map), GFP_USER);
86 if (!map)
87 return ERR_PTR(-ENOMEM);
88
89 map->key_size = attr->key_size;
90 map->value_size = attr->value_size;
91 map->max_entries = attr->max_entries;
92 return map;
93}
94
95static void test_map_free(struct bpf_map *map)
96{
97 kfree(map);
98}
99
100static struct bpf_map_ops test_map_ops = {
101 .map_alloc = test_map_alloc,
102 .map_free = test_map_free,
103};
104
105static struct bpf_map_type_list tl_map = {
106 .ops = &test_map_ops,
107 .type = BPF_MAP_TYPE_UNSPEC,
108};
109
110static int __init register_test_ops(void)
111{
112 bpf_register_map_type(&tl_map);
113 bpf_register_prog_type(&tl_prog);
114 return 0;
115}
116late_initcall(register_test_ops);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
new file mode 100644
index 000000000000..9f81818f2941
--- /dev/null
+++ b/kernel/bpf/verifier.c
@@ -0,0 +1,1924 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/kernel.h>
13#include <linux/types.h>
14#include <linux/slab.h>
15#include <linux/bpf.h>
16#include <linux/filter.h>
17#include <net/netlink.h>
18#include <linux/file.h>
19#include <linux/vmalloc.h>
20
21/* bpf_check() is a static code analyzer that walks eBPF program
22 * instruction by instruction and updates register/stack state.
23 * All paths of conditional branches are analyzed until 'bpf_exit' insn.
24 *
25 * The first pass is depth-first-search to check that the program is a DAG.
26 * It rejects the following programs:
27 * - larger than BPF_MAXINSNS insns
28 * - if loop is present (detected via back-edge)
29 * - unreachable insns exist (shouldn't be a forest. program = one function)
30 * - out of bounds or malformed jumps
31 * The second pass is all possible path descent from the 1st insn.
32 * Since it's analyzing all paths through the program, the length of the
33 * analysis is limited to 32k insn, which may be hit even if the total number of
34 * insn is less than 4K, but there are too many branches that change stack/regs.
35 * Number of 'branches to be analyzed' is limited to 1k
36 *
37 * On entry to each instruction, each register has a type, and the instruction
38 * changes the types of the registers depending on instruction semantics.
39 * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is
40 * copied to R1.
41 *
42 * All registers are 64-bit.
43 * R0 - return register
44 * R1-R5 argument passing registers
45 * R6-R9 callee saved registers
46 * R10 - frame pointer read-only
47 *
48 * At the start of BPF program the register R1 contains a pointer to bpf_context
49 * and has type PTR_TO_CTX.
50 *
51 * Verifier tracks arithmetic operations on pointers in case:
52 * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
53 * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20),
54 * 1st insn copies R10 (which has FRAME_PTR) type into R1
55 * and 2nd arithmetic instruction is pattern matched to recognize
56 * that it wants to construct a pointer to some element within stack.
57 * So after 2nd insn, the register R1 has type PTR_TO_STACK
58 * (and -20 constant is saved for further stack bounds checking).
59 * Meaning that this reg is a pointer to stack plus known immediate constant.
60 *
61 * Most of the time the registers have UNKNOWN_VALUE type, which
62 * means the register has some value, but it's not a valid pointer.
63 * (like pointer plus pointer becomes UNKNOWN_VALUE type)
64 *
65 * When verifier sees load or store instructions the type of base register
66 * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer
67 * types recognized by check_mem_access() function.
68 *
69 * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
70 * and the range of [ptr, ptr + map's value_size) is accessible.
71 *
72 * registers used to pass values to function calls are checked against
73 * function argument constraints.
74 *
75 * ARG_PTR_TO_MAP_KEY is one of such argument constraints.
76 * It means that the register type passed to this function must be
77 * PTR_TO_STACK and it will be used inside the function as
78 * 'pointer to map element key'
79 *
80 * For example the argument constraints for bpf_map_lookup_elem():
81 * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
82 * .arg1_type = ARG_CONST_MAP_PTR,
83 * .arg2_type = ARG_PTR_TO_MAP_KEY,
84 *
85 * ret_type says that this function returns 'pointer to map elem value or null'
86 * function expects 1st argument to be a const pointer to 'struct bpf_map' and
87 * 2nd argument should be a pointer to stack, which will be used inside
88 * the helper function as a pointer to map element key.
89 *
90 * On the kernel side the helper function looks like:
91 * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
92 * {
93 * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
94 * void *key = (void *) (unsigned long) r2;
95 * void *value;
96 *
97 * here kernel can access 'key' and 'map' pointers safely, knowing that
98 * [key, key + map->key_size) bytes are valid and were initialized on
99 * the stack of eBPF program.
100 * }
101 *
102 * Corresponding eBPF program may look like:
103 * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR
104 * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK
105 * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP
106 * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
107 * here verifier looks at prototype of map_lookup_elem() and sees:
108 * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok,
109 * Now verifier knows that this map has key of R1->map_ptr->key_size bytes
110 *
111 * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far,
112 * Now verifier checks that [R2, R2 + map's key_size) are within stack limits
113 * and were initialized prior to this call.
114 * If it's ok, then verifier allows this BPF_CALL insn and looks at
115 * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
116 * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
117 * returns either a pointer to the map value or NULL.
118 *
119 * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
120 * insn, the register holding that pointer in the true branch changes state to
121 * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false
122 * branch. See check_cond_jmp_op().
123 *
124 * After the call R0 is set to return type of the function and registers R1-R5
125 * are set to NOT_INIT to indicate that they are no longer readable.
126 */
127
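As a concrete version of the lookup sequence walked through in the comment above, a program this verifier is meant to accept could be assembled with the kernel's instruction-building macros (BPF_MOV64_REG() and friends from linux/filter.h) roughly as follows. This is a sketch only: MAP_FD is a placeholder for the file descriptor of an already-created eBPF map, and the map is assumed to have a 4-byte key and an at-least-8-byte value so both stores stay in bounds.

#include <linux/filter.h>	/* kernel-side instruction macros (BPF_MOV64_REG() etc.) */

#define MAP_FD 42	/* placeholder: fd of an already-created eBPF map */

static const struct bpf_insn lookup_prog[] = {
	BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),	/* init 4-byte key at fp-4 (slot becomes STACK_MISC) */
	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),	/* r2 = fp            -> R2 is FRAME_PTR */
	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),	/* r2 += -4           -> R2 is PTR_TO_STACK */
	BPF_LD_MAP_FD(BPF_REG_1, MAP_FD),	/* r1 = pseudo map fd -> R1 is CONST_PTR_TO_MAP */
	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),	/* NULL check narrows R0 per check_cond_jmp_op() */
	BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),	/* fall-through only: R0 is PTR_TO_MAP_VALUE here */
	BPF_MOV64_IMM(BPF_REG_0, 0),
	BPF_EXIT_INSN(),
};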
128/* types of values stored in eBPF registers */
129enum bpf_reg_type {
130 NOT_INIT = 0, /* nothing was written into register */
131 UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */
132 PTR_TO_CTX, /* reg points to bpf_context */
133 CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
134 PTR_TO_MAP_VALUE, /* reg points to map element value */
135 PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
136 FRAME_PTR, /* reg == frame_pointer */
137 PTR_TO_STACK, /* reg == frame_pointer + imm */
138 CONST_IMM, /* constant integer value */
139};
140
141struct reg_state {
142 enum bpf_reg_type type;
143 union {
144 /* valid when type == CONST_IMM | PTR_TO_STACK */
145 int imm;
146
147 /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
148 * PTR_TO_MAP_VALUE_OR_NULL
149 */
150 struct bpf_map *map_ptr;
151 };
152};
153
154enum bpf_stack_slot_type {
155 STACK_INVALID, /* nothing was stored in this stack slot */
156 STACK_SPILL, /* 1st byte of register spilled into stack */
157 STACK_SPILL_PART, /* other 7 bytes of register spill */
158 STACK_MISC /* BPF program wrote some data into this slot */
159};
160
161struct bpf_stack_slot {
162 enum bpf_stack_slot_type stype;
163 struct reg_state reg_st;
164};
165
166/* state of the program:
167 * type of all registers and stack info
168 */
169struct verifier_state {
170 struct reg_state regs[MAX_BPF_REG];
171 struct bpf_stack_slot stack[MAX_BPF_STACK];
172};
173
174/* linked list of verifier states used to prune search */
175struct verifier_state_list {
176 struct verifier_state state;
177 struct verifier_state_list *next;
178};
179
180/* verifier_state + insn_idx are pushed to stack when branch is encountered */
181struct verifier_stack_elem {
182	/* verifier state is 'st'
183 * before processing instruction 'insn_idx'
184 * and after processing instruction 'prev_insn_idx'
185 */
186 struct verifier_state st;
187 int insn_idx;
188 int prev_insn_idx;
189 struct verifier_stack_elem *next;
190};
191
192#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
193
194/* single container for all structs
195 * one verifier_env per bpf_check() call
196 */
197struct verifier_env {
198 struct bpf_prog *prog; /* eBPF program being verified */
199 struct verifier_stack_elem *head; /* stack of verifier states to be processed */
200 int stack_size; /* number of states to be processed */
201 struct verifier_state cur_state; /* current verifier state */
202 struct verifier_state_list **explored_states; /* search pruning optimization */
203	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of maps used by eBPF program */
204 u32 used_map_cnt; /* number of used maps */
205};
206
207/* verbose verifier prints what it's seeing
208 * bpf_check() is called under lock, so no race to access these global vars
209 */
210static u32 log_level, log_size, log_len;
211static char *log_buf;
212
213static DEFINE_MUTEX(bpf_verifier_lock);
214
215/* log_level controls verbosity level of eBPF verifier.
216 * verbose() is used to dump the verification trace to the log, so the user
217 * can figure out what's wrong with the program
218 */
219static void verbose(const char *fmt, ...)
220{
221 va_list args;
222
223 if (log_level == 0 || log_len >= log_size - 1)
224 return;
225
226 va_start(args, fmt);
227 log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
228 va_end(args);
229}
230
231/* string representation of 'enum bpf_reg_type' */
232static const char * const reg_type_str[] = {
233 [NOT_INIT] = "?",
234 [UNKNOWN_VALUE] = "inv",
235 [PTR_TO_CTX] = "ctx",
236 [CONST_PTR_TO_MAP] = "map_ptr",
237 [PTR_TO_MAP_VALUE] = "map_value",
238 [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
239 [FRAME_PTR] = "fp",
240 [PTR_TO_STACK] = "fp",
241 [CONST_IMM] = "imm",
242};
243
244static void print_verifier_state(struct verifier_env *env)
245{
246 enum bpf_reg_type t;
247 int i;
248
249 for (i = 0; i < MAX_BPF_REG; i++) {
250 t = env->cur_state.regs[i].type;
251 if (t == NOT_INIT)
252 continue;
253 verbose(" R%d=%s", i, reg_type_str[t]);
254 if (t == CONST_IMM || t == PTR_TO_STACK)
255 verbose("%d", env->cur_state.regs[i].imm);
256 else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
257 t == PTR_TO_MAP_VALUE_OR_NULL)
258 verbose("(ks=%d,vs=%d)",
259 env->cur_state.regs[i].map_ptr->key_size,
260 env->cur_state.regs[i].map_ptr->value_size);
261 }
262 for (i = 0; i < MAX_BPF_STACK; i++) {
263 if (env->cur_state.stack[i].stype == STACK_SPILL)
264 verbose(" fp%d=%s", -MAX_BPF_STACK + i,
265 reg_type_str[env->cur_state.stack[i].reg_st.type]);
266 }
267 verbose("\n");
268}
269
270static const char *const bpf_class_string[] = {
271 [BPF_LD] = "ld",
272 [BPF_LDX] = "ldx",
273 [BPF_ST] = "st",
274 [BPF_STX] = "stx",
275 [BPF_ALU] = "alu",
276 [BPF_JMP] = "jmp",
277 [BPF_RET] = "BUG",
278 [BPF_ALU64] = "alu64",
279};
280
281static const char *const bpf_alu_string[] = {
282 [BPF_ADD >> 4] = "+=",
283 [BPF_SUB >> 4] = "-=",
284 [BPF_MUL >> 4] = "*=",
285 [BPF_DIV >> 4] = "/=",
286 [BPF_OR >> 4] = "|=",
287 [BPF_AND >> 4] = "&=",
288 [BPF_LSH >> 4] = "<<=",
289 [BPF_RSH >> 4] = ">>=",
290 [BPF_NEG >> 4] = "neg",
291 [BPF_MOD >> 4] = "%=",
292 [BPF_XOR >> 4] = "^=",
293 [BPF_MOV >> 4] = "=",
294 [BPF_ARSH >> 4] = "s>>=",
295 [BPF_END >> 4] = "endian",
296};
297
298static const char *const bpf_ldst_string[] = {
299 [BPF_W >> 3] = "u32",
300 [BPF_H >> 3] = "u16",
301 [BPF_B >> 3] = "u8",
302 [BPF_DW >> 3] = "u64",
303};
304
305static const char *const bpf_jmp_string[] = {
306 [BPF_JA >> 4] = "jmp",
307 [BPF_JEQ >> 4] = "==",
308 [BPF_JGT >> 4] = ">",
309 [BPF_JGE >> 4] = ">=",
310 [BPF_JSET >> 4] = "&",
311 [BPF_JNE >> 4] = "!=",
312 [BPF_JSGT >> 4] = "s>",
313 [BPF_JSGE >> 4] = "s>=",
314 [BPF_CALL >> 4] = "call",
315 [BPF_EXIT >> 4] = "exit",
316};
317
318static void print_bpf_insn(struct bpf_insn *insn)
319{
320 u8 class = BPF_CLASS(insn->code);
321
322 if (class == BPF_ALU || class == BPF_ALU64) {
323 if (BPF_SRC(insn->code) == BPF_X)
324 verbose("(%02x) %sr%d %s %sr%d\n",
325 insn->code, class == BPF_ALU ? "(u32) " : "",
326 insn->dst_reg,
327 bpf_alu_string[BPF_OP(insn->code) >> 4],
328 class == BPF_ALU ? "(u32) " : "",
329 insn->src_reg);
330 else
331 verbose("(%02x) %sr%d %s %s%d\n",
332 insn->code, class == BPF_ALU ? "(u32) " : "",
333 insn->dst_reg,
334 bpf_alu_string[BPF_OP(insn->code) >> 4],
335 class == BPF_ALU ? "(u32) " : "",
336 insn->imm);
337 } else if (class == BPF_STX) {
338 if (BPF_MODE(insn->code) == BPF_MEM)
339 verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
340 insn->code,
341 bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
342 insn->dst_reg,
343 insn->off, insn->src_reg);
344 else if (BPF_MODE(insn->code) == BPF_XADD)
345 verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
346 insn->code,
347 bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
348 insn->dst_reg, insn->off,
349 insn->src_reg);
350 else
351 verbose("BUG_%02x\n", insn->code);
352 } else if (class == BPF_ST) {
353 if (BPF_MODE(insn->code) != BPF_MEM) {
354 verbose("BUG_st_%02x\n", insn->code);
355 return;
356 }
357 verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
358 insn->code,
359 bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
360 insn->dst_reg,
361 insn->off, insn->imm);
362 } else if (class == BPF_LDX) {
363 if (BPF_MODE(insn->code) != BPF_MEM) {
364 verbose("BUG_ldx_%02x\n", insn->code);
365 return;
366 }
367 verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
368 insn->code, insn->dst_reg,
369 bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
370 insn->src_reg, insn->off);
371 } else if (class == BPF_LD) {
372 if (BPF_MODE(insn->code) == BPF_ABS) {
373 verbose("(%02x) r0 = *(%s *)skb[%d]\n",
374 insn->code,
375 bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
376 insn->imm);
377 } else if (BPF_MODE(insn->code) == BPF_IND) {
378 verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
379 insn->code,
380 bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
381 insn->src_reg, insn->imm);
382 } else if (BPF_MODE(insn->code) == BPF_IMM) {
383 verbose("(%02x) r%d = 0x%x\n",
384 insn->code, insn->dst_reg, insn->imm);
385 } else {
386 verbose("BUG_ld_%02x\n", insn->code);
387 return;
388 }
389 } else if (class == BPF_JMP) {
390 u8 opcode = BPF_OP(insn->code);
391
392 if (opcode == BPF_CALL) {
393 verbose("(%02x) call %d\n", insn->code, insn->imm);
394 } else if (insn->code == (BPF_JMP | BPF_JA)) {
395 verbose("(%02x) goto pc%+d\n",
396 insn->code, insn->off);
397 } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
398 verbose("(%02x) exit\n", insn->code);
399 } else if (BPF_SRC(insn->code) == BPF_X) {
400 verbose("(%02x) if r%d %s r%d goto pc%+d\n",
401 insn->code, insn->dst_reg,
402 bpf_jmp_string[BPF_OP(insn->code) >> 4],
403 insn->src_reg, insn->off);
404 } else {
405 verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
406 insn->code, insn->dst_reg,
407 bpf_jmp_string[BPF_OP(insn->code) >> 4],
408 insn->imm, insn->off);
409 }
410 } else {
411 verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
412 }
413}
414
415static int pop_stack(struct verifier_env *env, int *prev_insn_idx)
416{
417 struct verifier_stack_elem *elem;
418 int insn_idx;
419
420 if (env->head == NULL)
421 return -1;
422
423 memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
424 insn_idx = env->head->insn_idx;
425 if (prev_insn_idx)
426 *prev_insn_idx = env->head->prev_insn_idx;
427 elem = env->head->next;
428 kfree(env->head);
429 env->head = elem;
430 env->stack_size--;
431 return insn_idx;
432}
433
434static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx,
435 int prev_insn_idx)
436{
437 struct verifier_stack_elem *elem;
438
439 elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL);
440 if (!elem)
441 goto err;
442
443 memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
444 elem->insn_idx = insn_idx;
445 elem->prev_insn_idx = prev_insn_idx;
446 elem->next = env->head;
447 env->head = elem;
448 env->stack_size++;
449 if (env->stack_size > 1024) {
450 verbose("BPF program is too complex\n");
451 goto err;
452 }
453 return &elem->st;
454err:
455 /* pop all elements and return */
456 while (pop_stack(env, NULL) >= 0);
457 return NULL;
458}
459
460#define CALLER_SAVED_REGS 6
461static const int caller_saved[CALLER_SAVED_REGS] = {
462 BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
463};
464
465static void init_reg_state(struct reg_state *regs)
466{
467 int i;
468
469 for (i = 0; i < MAX_BPF_REG; i++) {
470 regs[i].type = NOT_INIT;
471 regs[i].imm = 0;
472 regs[i].map_ptr = NULL;
473 }
474
475 /* frame pointer */
476 regs[BPF_REG_FP].type = FRAME_PTR;
477
478 /* 1st arg to a function */
479 regs[BPF_REG_1].type = PTR_TO_CTX;
480}
481
482static void mark_reg_unknown_value(struct reg_state *regs, u32 regno)
483{
484 BUG_ON(regno >= MAX_BPF_REG);
485 regs[regno].type = UNKNOWN_VALUE;
486 regs[regno].imm = 0;
487 regs[regno].map_ptr = NULL;
488}
489
490enum reg_arg_type {
491 SRC_OP, /* register is used as source operand */
492 DST_OP, /* register is used as destination operand */
493 DST_OP_NO_MARK /* same as above, check only, don't mark */
494};
495
496static int check_reg_arg(struct reg_state *regs, u32 regno,
497 enum reg_arg_type t)
498{
499 if (regno >= MAX_BPF_REG) {
500 verbose("R%d is invalid\n", regno);
501 return -EINVAL;
502 }
503
504 if (t == SRC_OP) {
505 /* check whether register used as source operand can be read */
506 if (regs[regno].type == NOT_INIT) {
507 verbose("R%d !read_ok\n", regno);
508 return -EACCES;
509 }
510 } else {
511 /* check whether register used as dest operand can be written to */
512 if (regno == BPF_REG_FP) {
513 verbose("frame pointer is read only\n");
514 return -EACCES;
515 }
516 if (t == DST_OP)
517 mark_reg_unknown_value(regs, regno);
518 }
519 return 0;
520}
521
522static int bpf_size_to_bytes(int bpf_size)
523{
524 if (bpf_size == BPF_W)
525 return 4;
526 else if (bpf_size == BPF_H)
527 return 2;
528 else if (bpf_size == BPF_B)
529 return 1;
530 else if (bpf_size == BPF_DW)
531 return 8;
532 else
533 return -EINVAL;
534}
535
536/* check_stack_read/write functions track spill/fill of registers,
537 * stack boundary and alignment are checked in check_mem_access()
538 */
539static int check_stack_write(struct verifier_state *state, int off, int size,
540 int value_regno)
541{
542 struct bpf_stack_slot *slot;
543 int i;
544
545 if (value_regno >= 0 &&
546 (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
547 state->regs[value_regno].type == PTR_TO_STACK ||
548 state->regs[value_regno].type == PTR_TO_CTX)) {
549
550 /* register containing pointer is being spilled into stack */
551 if (size != 8) {
552 verbose("invalid size of register spill\n");
553 return -EACCES;
554 }
555
556 slot = &state->stack[MAX_BPF_STACK + off];
557 slot->stype = STACK_SPILL;
558 /* save register state */
559 slot->reg_st = state->regs[value_regno];
560 for (i = 1; i < 8; i++) {
561 slot = &state->stack[MAX_BPF_STACK + off + i];
562 slot->stype = STACK_SPILL_PART;
563 slot->reg_st.type = UNKNOWN_VALUE;
564 slot->reg_st.map_ptr = NULL;
565 }
566 } else {
567
568 /* regular write of data into stack */
569 for (i = 0; i < size; i++) {
570 slot = &state->stack[MAX_BPF_STACK + off + i];
571 slot->stype = STACK_MISC;
572 slot->reg_st.type = UNKNOWN_VALUE;
573 slot->reg_st.map_ptr = NULL;
574 }
575 }
576 return 0;
577}
578
579static int check_stack_read(struct verifier_state *state, int off, int size,
580 int value_regno)
581{
582 int i;
583 struct bpf_stack_slot *slot;
584
585 slot = &state->stack[MAX_BPF_STACK + off];
586
587 if (slot->stype == STACK_SPILL) {
588 if (size != 8) {
589 verbose("invalid size of register spill\n");
590 return -EACCES;
591 }
592 for (i = 1; i < 8; i++) {
593 if (state->stack[MAX_BPF_STACK + off + i].stype !=
594 STACK_SPILL_PART) {
595 verbose("corrupted spill memory\n");
596 return -EACCES;
597 }
598 }
599
600 if (value_regno >= 0)
601 /* restore register state from stack */
602 state->regs[value_regno] = slot->reg_st;
603 return 0;
604 } else {
605 for (i = 0; i < size; i++) {
606 if (state->stack[MAX_BPF_STACK + off + i].stype !=
607 STACK_MISC) {
608 verbose("invalid read from stack off %d+%d size %d\n",
609 off, i, size);
610 return -EACCES;
611 }
612 }
613 if (value_regno >= 0)
614 /* have read misc data from the stack */
615 mark_reg_unknown_value(state->regs, value_regno);
616 return 0;
617 }
618}
619
620/* check read/write into map element returned by bpf_map_lookup_elem() */
621static int check_map_access(struct verifier_env *env, u32 regno, int off,
622 int size)
623{
624 struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
625
626 if (off < 0 || off + size > map->value_size) {
627 verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
628 map->value_size, off, size);
629 return -EACCES;
630 }
631 return 0;
632}
633
634/* check access to 'struct bpf_context' fields */
635static int check_ctx_access(struct verifier_env *env, int off, int size,
636 enum bpf_access_type t)
637{
638 if (env->prog->aux->ops->is_valid_access &&
639 env->prog->aux->ops->is_valid_access(off, size, t))
640 return 0;
641
642 verbose("invalid bpf_context access off=%d size=%d\n", off, size);
643 return -EACCES;
644}
645
646/* check whether memory at (regno + off) is accessible for t = (read | write)
647 * if t==write, value_regno is a register whose value is stored into memory
648 * if t==read, value_regno is a register which will receive the value from memory
649 * if t==write && value_regno==-1, some unknown value is stored into memory
650 * if t==read && value_regno==-1, don't care what we read from memory
651 */
652static int check_mem_access(struct verifier_env *env, u32 regno, int off,
653 int bpf_size, enum bpf_access_type t,
654 int value_regno)
655{
656 struct verifier_state *state = &env->cur_state;
657 int size, err = 0;
658
659 size = bpf_size_to_bytes(bpf_size);
660 if (size < 0)
661 return size;
662
663 if (off % size != 0) {
664 verbose("misaligned access off %d size %d\n", off, size);
665 return -EACCES;
666 }
667
668 if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
669 err = check_map_access(env, regno, off, size);
670 if (!err && t == BPF_READ && value_regno >= 0)
671 mark_reg_unknown_value(state->regs, value_regno);
672
673 } else if (state->regs[regno].type == PTR_TO_CTX) {
674 err = check_ctx_access(env, off, size, t);
675 if (!err && t == BPF_READ && value_regno >= 0)
676 mark_reg_unknown_value(state->regs, value_regno);
677
678 } else if (state->regs[regno].type == FRAME_PTR) {
679 if (off >= 0 || off < -MAX_BPF_STACK) {
680 verbose("invalid stack off=%d size=%d\n", off, size);
681 return -EACCES;
682 }
683 if (t == BPF_WRITE)
684 err = check_stack_write(state, off, size, value_regno);
685 else
686 err = check_stack_read(state, off, size, value_regno);
687 } else {
688 verbose("R%d invalid mem access '%s'\n",
689 regno, reg_type_str[state->regs[regno].type]);
690 return -EACCES;
691 }
692 return err;
693}
694
695static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
696{
697 struct reg_state *regs = env->cur_state.regs;
698 int err;
699
700 if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
701 insn->imm != 0) {
702 verbose("BPF_XADD uses reserved fields\n");
703 return -EINVAL;
704 }
705
706 /* check src1 operand */
707 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
708 if (err)
709 return err;
710
711 /* check src2 operand */
712 err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
713 if (err)
714 return err;
715
716 /* check whether atomic_add can read the memory */
717 err = check_mem_access(env, insn->dst_reg, insn->off,
718 BPF_SIZE(insn->code), BPF_READ, -1);
719 if (err)
720 return err;
721
722 /* check whether atomic_add can write into the same memory */
723 return check_mem_access(env, insn->dst_reg, insn->off,
724 BPF_SIZE(insn->code), BPF_WRITE, -1);
725}
726
727/* when register 'regno' is passed into function that will read 'access_size'
728 * bytes from that pointer, make sure that it's within stack boundary
729 * and all elements of stack are initialized
730 */
731static int check_stack_boundary(struct verifier_env *env,
732 int regno, int access_size)
733{
734 struct verifier_state *state = &env->cur_state;
735 struct reg_state *regs = state->regs;
736 int off, i;
737
738 if (regs[regno].type != PTR_TO_STACK)
739 return -EACCES;
740
741 off = regs[regno].imm;
742 if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
743 access_size <= 0) {
744 verbose("invalid stack type R%d off=%d access_size=%d\n",
745 regno, off, access_size);
746 return -EACCES;
747 }
748
749 for (i = 0; i < access_size; i++) {
750 if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) {
751 verbose("invalid indirect read from stack off %d+%d size %d\n",
752 off, i, access_size);
753 return -EACCES;
754 }
755 }
756 return 0;
757}
758
759static int check_func_arg(struct verifier_env *env, u32 regno,
760 enum bpf_arg_type arg_type, struct bpf_map **mapp)
761{
762 struct reg_state *reg = env->cur_state.regs + regno;
763 enum bpf_reg_type expected_type;
764 int err = 0;
765
766 if (arg_type == ARG_ANYTHING)
767 return 0;
768
769 if (reg->type == NOT_INIT) {
770 verbose("R%d !read_ok\n", regno);
771 return -EACCES;
772 }
773
774 if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
775 arg_type == ARG_PTR_TO_MAP_VALUE) {
776 expected_type = PTR_TO_STACK;
777 } else if (arg_type == ARG_CONST_STACK_SIZE) {
778 expected_type = CONST_IMM;
779 } else if (arg_type == ARG_CONST_MAP_PTR) {
780 expected_type = CONST_PTR_TO_MAP;
781 } else {
782 verbose("unsupported arg_type %d\n", arg_type);
783 return -EFAULT;
784 }
785
786 if (reg->type != expected_type) {
787 verbose("R%d type=%s expected=%s\n", regno,
788 reg_type_str[reg->type], reg_type_str[expected_type]);
789 return -EACCES;
790 }
791
792 if (arg_type == ARG_CONST_MAP_PTR) {
793 /* bpf_map_xxx(map_ptr) call: remember that map_ptr */
794 *mapp = reg->map_ptr;
795
796 } else if (arg_type == ARG_PTR_TO_MAP_KEY) {
797 /* bpf_map_xxx(..., map_ptr, ..., key) call:
798 * check that [key, key + map->key_size) are within
799 * stack limits and initialized
800 */
801 if (!*mapp) {
802 /* in function declaration map_ptr must come before
803 * map_key, so that it's verified and known before
804 * we have to check map_key here. Otherwise it means
805 * that kernel subsystem misconfigured verifier
806 */
807 verbose("invalid map_ptr to access map->key\n");
808 return -EACCES;
809 }
810 err = check_stack_boundary(env, regno, (*mapp)->key_size);
811
812 } else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
813 /* bpf_map_xxx(..., map_ptr, ..., value) call:
814 * check [value, value + map->value_size) validity
815 */
816 if (!*mapp) {
817 /* kernel subsystem misconfigured verifier */
818 verbose("invalid map_ptr to access map->value\n");
819 return -EACCES;
820 }
821 err = check_stack_boundary(env, regno, (*mapp)->value_size);
822
823 } else if (arg_type == ARG_CONST_STACK_SIZE) {
824 /* bpf_xxx(..., buf, len) call will access 'len' bytes
825 * from stack pointer 'buf'. Check it
826 * note: regno == len, regno - 1 == buf
827 */
828 if (regno == 0) {
829 /* kernel subsystem misconfigured verifier */
830 verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
831 return -EACCES;
832 }
833 err = check_stack_boundary(env, regno - 1, reg->imm);
834 }
835
836 return err;
837}
838
839static int check_call(struct verifier_env *env, int func_id)
840{
841 struct verifier_state *state = &env->cur_state;
842 const struct bpf_func_proto *fn = NULL;
843 struct reg_state *regs = state->regs;
844 struct bpf_map *map = NULL;
845 struct reg_state *reg;
846 int i, err;
847
848 /* find function prototype */
849 if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
850 verbose("invalid func %d\n", func_id);
851 return -EINVAL;
852 }
853
854 if (env->prog->aux->ops->get_func_proto)
855 fn = env->prog->aux->ops->get_func_proto(func_id);
856
857 if (!fn) {
858 verbose("unknown func %d\n", func_id);
859 return -EINVAL;
860 }
861
862 /* eBPF programs must be GPL compatible to use GPL-ed functions */
863 if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) {
864 verbose("cannot call GPL only function from proprietary program\n");
865 return -EINVAL;
866 }
867
868 /* check args */
869 err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map);
870 if (err)
871 return err;
872 err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map);
873 if (err)
874 return err;
875 err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map);
876 if (err)
877 return err;
878 err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map);
879 if (err)
880 return err;
881 err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map);
882 if (err)
883 return err;
884
885 /* reset caller saved regs */
886 for (i = 0; i < CALLER_SAVED_REGS; i++) {
887 reg = regs + caller_saved[i];
888 reg->type = NOT_INIT;
889 reg->imm = 0;
890 }
891
892 /* update return register */
893 if (fn->ret_type == RET_INTEGER) {
894 regs[BPF_REG_0].type = UNKNOWN_VALUE;
895 } else if (fn->ret_type == RET_VOID) {
896 regs[BPF_REG_0].type = NOT_INIT;
897 } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
898 regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
899 /* remember map_ptr, so that check_map_access()
900 * can check 'value_size' boundary of memory access
901 * to map element returned from bpf_map_lookup_elem()
902 */
903 if (map == NULL) {
904 verbose("kernel subsystem misconfigured verifier\n");
905 return -EINVAL;
906 }
907 regs[BPF_REG_0].map_ptr = map;
908 } else {
909 verbose("unknown return type %d of func %d\n",
910 fn->ret_type, func_id);
911 return -EINVAL;
912 }
913 return 0;
914}
915
916/* check validity of 32-bit and 64-bit arithmetic operations */
917static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
918{
919 u8 opcode = BPF_OP(insn->code);
920 int err;
921
922 if (opcode == BPF_END || opcode == BPF_NEG) {
923 if (opcode == BPF_NEG) {
924 if (BPF_SRC(insn->code) != 0 ||
925 insn->src_reg != BPF_REG_0 ||
926 insn->off != 0 || insn->imm != 0) {
927 verbose("BPF_NEG uses reserved fields\n");
928 return -EINVAL;
929 }
930 } else {
931 if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
932 (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) {
933 verbose("BPF_END uses reserved fields\n");
934 return -EINVAL;
935 }
936 }
937
938 /* check src operand */
939 err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
940 if (err)
941 return err;
942
943 /* check dest operand */
944 err = check_reg_arg(regs, insn->dst_reg, DST_OP);
945 if (err)
946 return err;
947
948 } else if (opcode == BPF_MOV) {
949
950 if (BPF_SRC(insn->code) == BPF_X) {
951 if (insn->imm != 0 || insn->off != 0) {
952 verbose("BPF_MOV uses reserved fields\n");
953 return -EINVAL;
954 }
955
956 /* check src operand */
957 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
958 if (err)
959 return err;
960 } else {
961 if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
962 verbose("BPF_MOV uses reserved fields\n");
963 return -EINVAL;
964 }
965 }
966
967 /* check dest operand */
968 err = check_reg_arg(regs, insn->dst_reg, DST_OP);
969 if (err)
970 return err;
971
972 if (BPF_SRC(insn->code) == BPF_X) {
973 if (BPF_CLASS(insn->code) == BPF_ALU64) {
974 /* case: R1 = R2
975 * copy register state to dest reg
976 */
977 regs[insn->dst_reg] = regs[insn->src_reg];
978 } else {
979 regs[insn->dst_reg].type = UNKNOWN_VALUE;
980 regs[insn->dst_reg].map_ptr = NULL;
981 }
982 } else {
983 /* case: R = imm
984 * remember the value we stored into this reg
985 */
986 regs[insn->dst_reg].type = CONST_IMM;
987 regs[insn->dst_reg].imm = insn->imm;
988 }
989
990 } else if (opcode > BPF_END) {
991 verbose("invalid BPF_ALU opcode %x\n", opcode);
992 return -EINVAL;
993
994 } else { /* all other ALU ops: and, sub, xor, add, ... */
995
996 bool stack_relative = false;
997
998 if (BPF_SRC(insn->code) == BPF_X) {
999 if (insn->imm != 0 || insn->off != 0) {
1000 verbose("BPF_ALU uses reserved fields\n");
1001 return -EINVAL;
1002 }
1003 /* check src1 operand */
1004 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
1005 if (err)
1006 return err;
1007 } else {
1008 if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
1009 verbose("BPF_ALU uses reserved fields\n");
1010 return -EINVAL;
1011 }
1012 }
1013
1014 /* check src2 operand */
1015 err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
1016 if (err)
1017 return err;
1018
1019 if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
1020 BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
1021 verbose("div by zero\n");
1022 return -EINVAL;
1023 }
1024
1025 /* pattern match 'bpf_add Rx, imm' instruction */
1026 if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
1027 regs[insn->dst_reg].type == FRAME_PTR &&
1028 BPF_SRC(insn->code) == BPF_K)
1029 stack_relative = true;
1030
1031 /* check dest operand */
1032 err = check_reg_arg(regs, insn->dst_reg, DST_OP);
1033 if (err)
1034 return err;
1035
1036 if (stack_relative) {
1037 regs[insn->dst_reg].type = PTR_TO_STACK;
1038 regs[insn->dst_reg].imm = insn->imm;
1039 }
1040 }
1041
1042 return 0;
1043}
1044
1045static int check_cond_jmp_op(struct verifier_env *env,
1046 struct bpf_insn *insn, int *insn_idx)
1047{
1048 struct reg_state *regs = env->cur_state.regs;
1049 struct verifier_state *other_branch;
1050 u8 opcode = BPF_OP(insn->code);
1051 int err;
1052
1053 if (opcode > BPF_EXIT) {
1054 verbose("invalid BPF_JMP opcode %x\n", opcode);
1055 return -EINVAL;
1056 }
1057
1058 if (BPF_SRC(insn->code) == BPF_X) {
1059 if (insn->imm != 0) {
1060 verbose("BPF_JMP uses reserved fields\n");
1061 return -EINVAL;
1062 }
1063
1064 /* check src1 operand */
1065 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
1066 if (err)
1067 return err;
1068 } else {
1069 if (insn->src_reg != BPF_REG_0) {
1070 verbose("BPF_JMP uses reserved fields\n");
1071 return -EINVAL;
1072 }
1073 }
1074
1075 /* check src2 operand */
1076 err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
1077 if (err)
1078 return err;
1079
1080 /* detect if R == 0 where R was initialized to zero earlier */
1081 if (BPF_SRC(insn->code) == BPF_K &&
1082 (opcode == BPF_JEQ || opcode == BPF_JNE) &&
1083 regs[insn->dst_reg].type == CONST_IMM &&
1084 regs[insn->dst_reg].imm == insn->imm) {
1085 if (opcode == BPF_JEQ) {
1086 /* if (imm == imm) goto pc+off;
1087 * only follow the goto, ignore fall-through
1088 */
1089 *insn_idx += insn->off;
1090 return 0;
1091 } else {
1092 /* if (imm != imm) goto pc+off;
1093 * only follow fall-through branch, since
1094 * that's where the program will go
1095 */
1096 return 0;
1097 }
1098 }
1099
1100 other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
1101 if (!other_branch)
1102 return -EFAULT;
1103
1104 /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */
1105 if (BPF_SRC(insn->code) == BPF_K &&
1106 insn->imm == 0 && (opcode == BPF_JEQ ||
1107 opcode == BPF_JNE) &&
1108 regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) {
1109 if (opcode == BPF_JEQ) {
1110 /* next fallthrough insn can access memory via
1111 * this register
1112 */
1113 regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
1114			 * branch target cannot access it, since reg == 0
1115 other_branch->regs[insn->dst_reg].type = CONST_IMM;
1116 other_branch->regs[insn->dst_reg].imm = 0;
1117 } else {
1118 other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
1119 regs[insn->dst_reg].type = CONST_IMM;
1120 regs[insn->dst_reg].imm = 0;
1121 }
1122 } else if (BPF_SRC(insn->code) == BPF_K &&
1123 (opcode == BPF_JEQ || opcode == BPF_JNE)) {
1124
1125 if (opcode == BPF_JEQ) {
1126 /* detect if (R == imm) goto
1127 * and in the target state recognize that R = imm
1128 */
1129 other_branch->regs[insn->dst_reg].type = CONST_IMM;
1130 other_branch->regs[insn->dst_reg].imm = insn->imm;
1131 } else {
1132 /* detect if (R != imm) goto
1133 * and in the fall-through state recognize that R = imm
1134 */
1135 regs[insn->dst_reg].type = CONST_IMM;
1136 regs[insn->dst_reg].imm = insn->imm;
1137 }
1138 }
1139 if (log_level)
1140 print_verifier_state(env);
1141 return 0;
1142}
1143
1144/* return the map pointer stored inside BPF_LD_IMM64 instruction */
1145static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
1146{
1147 u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
1148
1149 return (struct bpf_map *) (unsigned long) imm64;
1150}
1151
1152/* verify BPF_LD_IMM64 instruction */
1153static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
1154{
1155 struct reg_state *regs = env->cur_state.regs;
1156 int err;
1157
1158 if (BPF_SIZE(insn->code) != BPF_DW) {
1159 verbose("invalid BPF_LD_IMM insn\n");
1160 return -EINVAL;
1161 }
1162 if (insn->off != 0) {
1163 verbose("BPF_LD_IMM64 uses reserved fields\n");
1164 return -EINVAL;
1165 }
1166
1167 err = check_reg_arg(regs, insn->dst_reg, DST_OP);
1168 if (err)
1169 return err;
1170
1171 if (insn->src_reg == 0)
1172 /* generic move 64-bit immediate into a register */
1173 return 0;
1174
1175 /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
1176 BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
1177
1178 regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
1179 regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
1180 return 0;
1181}
1182
1183/* non-recursive DFS pseudo code
1184 * 1 procedure DFS-iterative(G,v):
1185 * 2 label v as discovered
1186 * 3 let S be a stack
1187 * 4 S.push(v)
1188 * 5 while S is not empty
1189 * 6 t <- S.pop()
1190 * 7 if t is what we're looking for:
1191 * 8 return t
1192 * 9 for all edges e in G.adjacentEdges(t) do
1193 * 10 if edge e is already labelled
1194 * 11 continue with the next edge
1195 * 12 w <- G.adjacentVertex(t,e)
1196 * 13 if vertex w is not discovered and not explored
1197 * 14 label e as tree-edge
1198 * 15 label w as discovered
1199 * 16 S.push(w)
1200 * 17 continue at 5
1201 * 18 else if vertex w is discovered
1202 * 19 label e as back-edge
1203 * 20 else
1204 * 21 // vertex w is explored
1205 * 22 label e as forward- or cross-edge
1206 * 23 label t as explored
1207 * 24 S.pop()
1208 *
1209 * convention:
1210 * 0x10 - discovered
1211 * 0x11 - discovered and fall-through edge labelled
1212 * 0x12 - discovered and fall-through and branch edges labelled
1213 * 0x20 - explored
1214 */
1215
1216enum {
1217 DISCOVERED = 0x10,
1218 EXPLORED = 0x20,
1219 FALLTHROUGH = 1,
1220 BRANCH = 2,
1221};
1222
1223#define STATE_LIST_MARK ((struct verifier_state_list *) -1L)
1224
1225static int *insn_stack; /* stack of insns to process */
1226static int cur_stack; /* current stack index */
1227static int *insn_state;
1228
1229/* t, w, e - match pseudo-code above:
1230 * t - index of current instruction
1231 * w - next instruction
1232 * e - edge
1233 */
1234static int push_insn(int t, int w, int e, struct verifier_env *env)
1235{
1236 if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
1237 return 0;
1238
1239 if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
1240 return 0;
1241
1242 if (w < 0 || w >= env->prog->len) {
1243 verbose("jump out of range from insn %d to %d\n", t, w);
1244 return -EINVAL;
1245 }
1246
1247 if (e == BRANCH)
1248 /* mark branch target for state pruning */
1249 env->explored_states[w] = STATE_LIST_MARK;
1250
1251 if (insn_state[w] == 0) {
1252 /* tree-edge */
1253 insn_state[t] = DISCOVERED | e;
1254 insn_state[w] = DISCOVERED;
1255 if (cur_stack >= env->prog->len)
1256 return -E2BIG;
1257 insn_stack[cur_stack++] = w;
1258 return 1;
1259 } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
1260 verbose("back-edge from insn %d to %d\n", t, w);
1261 return -EINVAL;
1262 } else if (insn_state[w] == EXPLORED) {
1263 /* forward- or cross-edge */
1264 insn_state[t] = DISCOVERED | e;
1265 } else {
1266 verbose("insn state internal bug\n");
1267 return -EFAULT;
1268 }
1269 return 0;
1270}
1271
1272/* non-recursive depth-first-search to detect loops in BPF program
1273 * loop == back-edge in directed graph
1274 */
1275static int check_cfg(struct verifier_env *env)
1276{
1277 struct bpf_insn *insns = env->prog->insnsi;
1278 int insn_cnt = env->prog->len;
1279 int ret = 0;
1280 int i, t;
1281
1282 insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
1283 if (!insn_state)
1284 return -ENOMEM;
1285
1286 insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
1287 if (!insn_stack) {
1288 kfree(insn_state);
1289 return -ENOMEM;
1290 }
1291
1292 insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
1293 insn_stack[0] = 0; /* 0 is the first instruction */
1294 cur_stack = 1;
1295
1296peek_stack:
1297 if (cur_stack == 0)
1298 goto check_state;
1299 t = insn_stack[cur_stack - 1];
1300
1301 if (BPF_CLASS(insns[t].code) == BPF_JMP) {
1302 u8 opcode = BPF_OP(insns[t].code);
1303
1304 if (opcode == BPF_EXIT) {
1305 goto mark_explored;
1306 } else if (opcode == BPF_CALL) {
1307 ret = push_insn(t, t + 1, FALLTHROUGH, env);
1308 if (ret == 1)
1309 goto peek_stack;
1310 else if (ret < 0)
1311 goto err_free;
1312 } else if (opcode == BPF_JA) {
1313 if (BPF_SRC(insns[t].code) != BPF_K) {
1314 ret = -EINVAL;
1315 goto err_free;
1316 }
1317 /* unconditional jump with single edge */
1318 ret = push_insn(t, t + insns[t].off + 1,
1319 FALLTHROUGH, env);
1320 if (ret == 1)
1321 goto peek_stack;
1322 else if (ret < 0)
1323 goto err_free;
1324 /* tell verifier to check for equivalent states
1325 * after every call and jump
1326 */
1327 env->explored_states[t + 1] = STATE_LIST_MARK;
1328 } else {
1329 /* conditional jump with two edges */
1330 ret = push_insn(t, t + 1, FALLTHROUGH, env);
1331 if (ret == 1)
1332 goto peek_stack;
1333 else if (ret < 0)
1334 goto err_free;
1335
1336 ret = push_insn(t, t + insns[t].off + 1, BRANCH, env);
1337 if (ret == 1)
1338 goto peek_stack;
1339 else if (ret < 0)
1340 goto err_free;
1341 }
1342 } else {
1343 /* all other non-branch instructions with single
1344 * fall-through edge
1345 */
1346 ret = push_insn(t, t + 1, FALLTHROUGH, env);
1347 if (ret == 1)
1348 goto peek_stack;
1349 else if (ret < 0)
1350 goto err_free;
1351 }
1352
1353mark_explored:
1354 insn_state[t] = EXPLORED;
1355 if (cur_stack-- <= 0) {
1356 verbose("pop stack internal bug\n");
1357 ret = -EFAULT;
1358 goto err_free;
1359 }
1360 goto peek_stack;
1361
1362check_state:
1363 for (i = 0; i < insn_cnt; i++) {
1364 if (insn_state[i] != EXPLORED) {
1365 verbose("unreachable insn %d\n", i);
1366 ret = -EINVAL;
1367 goto err_free;
1368 }
1369 }
1370 ret = 0; /* cfg looks good */
1371
1372err_free:
1373 kfree(insn_state);
1374 kfree(insn_stack);
1375 return ret;
1376}
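
The same colouring scheme can be exercised outside the kernel. Below is an editor-added toy, not part of this file: an iterative DFS over a four-node graph containing one loop (3 -> 1), using the DISCOVERED/EXPLORED idea from the pseudo-code above in simplified form; it reports the back-edge just as check_cfg() would reject it. Names prefixed 'toy_' are invented for the sketch.

#include <stdio.h>

#define TOY_N 4

static const int toy_edges[TOY_N][2] = {	/* up to two successors per node, -1 = none */
	{ 1, -1 },	/* 0 -> 1           */
	{ 2, -1 },	/* 1 -> 2           */
	{ 3, -1 },	/* 2 -> 3           */
	{ 1, -1 },	/* 3 -> 1  (loop!)  */
};

int main(void)
{
	int state[TOY_N] = { 0 };	/* 0 = unseen, 1 = discovered (on stack), 2 = explored */
	int stack[TOY_N], top = 0;
	int t, w, i, pushed;

	state[0] = 1;
	stack[top++] = 0;
	while (top) {
		t = stack[top - 1];
		pushed = 0;
		for (i = 0; i < 2 && toy_edges[t][i] >= 0; i++) {
			w = toy_edges[t][i];
			if (state[w] == 1) {	/* discovered == still on the stack == ancestor */
				printf("back-edge %d -> %d: loop rejected\n", t, w);
				return 1;
			}
			if (state[w] == 0) {	/* tree-edge: descend */
				state[w] = 1;
				stack[top++] = w;
				pushed = 1;
				break;
			}
			/* state[w] == 2: forward- or cross-edge, nothing to do */
		}
		if (!pushed) {
			state[t] = 2;	/* explored */
			top--;
		}
	}
	printf("no back-edges: CFG is loop-free\n");
	return 0;
}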
1377
1378/* compare two verifier states
1379 *
1380 * all states stored in state_list are known to be valid, since
1381 * verifier reached 'bpf_exit' instruction through them
1382 *
1383 * this function is called when the verifier explores different branches of
1384 * execution popped from the state stack. If it sees an old state that has
1385 * a more strict register state and a more strict stack state, then this execution
1386 * branch doesn't need to be explored further, since the verifier already
1387 * concluded that the more strict state leads to a valid finish.
1388 *
1389 * Therefore two states are equivalent if register state is more conservative
1390 * and explored stack state is more conservative than the current one.
1391 * Example:
1392 * explored current
1393 * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC)
1394 * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC)
1395 *
1396 * In other words, if the current stack state (the one being explored) has more
1397 * valid slots than the old one that already passed validation, it means
1398 * the verifier can stop exploring and conclude that the current state is valid too.
1399 *
1400 * Similarly with registers. If the explored state has an invalid register type
1401 * whereas the register type in the current state is meaningful, it means that
1402 * the current state will reach the 'bpf_exit' instruction safely.
1403 */
1404static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
1405{
1406 int i;
1407
1408 for (i = 0; i < MAX_BPF_REG; i++) {
1409 if (memcmp(&old->regs[i], &cur->regs[i],
1410 sizeof(old->regs[0])) != 0) {
1411 if (old->regs[i].type == NOT_INIT ||
1412 (old->regs[i].type == UNKNOWN_VALUE &&
1413 cur->regs[i].type != NOT_INIT))
1414 continue;
1415 return false;
1416 }
1417 }
1418
1419 for (i = 0; i < MAX_BPF_STACK; i++) {
1420 if (memcmp(&old->stack[i], &cur->stack[i],
1421 sizeof(old->stack[0])) != 0) {
1422 if (old->stack[i].stype == STACK_INVALID)
1423 continue;
1424 return false;
1425 }
1426 }
1427 return true;
1428}
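
To make the asymmetry of this check concrete, here is an editor-added sketch reduced to a single stack slot: an explored STACK_INVALID slot is "more conservative" and therefore matches any current slot, while the reverse comparison fails. The toy_* enum only loosely mirrors the real slot types.

#include <assert.h>
#include <stdbool.h>

enum toy_stype { TOY_STACK_INVALID, TOY_STACK_MISC, TOY_STACK_SPILL };

/* "explored is at least as conservative as current" for one slot */
static bool toy_slot_equal(enum toy_stype explored, enum toy_stype cur)
{
	return explored == cur || explored == TOY_STACK_INVALID;
}

int main(void)
{
	/* (slot=INV) explored  vs (slot=MISC) current -> prune the search */
	assert(toy_slot_equal(TOY_STACK_INVALID, TOY_STACK_MISC));
	/* (slot=MISC) explored vs (slot=INV) current  -> keep exploring */
	assert(!toy_slot_equal(TOY_STACK_MISC, TOY_STACK_INVALID));
	return 0;
}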
1429
1430static int is_state_visited(struct verifier_env *env, int insn_idx)
1431{
1432 struct verifier_state_list *new_sl;
1433 struct verifier_state_list *sl;
1434
1435 sl = env->explored_states[insn_idx];
1436 if (!sl)
1437 /* this 'insn_idx' instruction wasn't marked, so we will not
1438 * be doing state search here
1439 */
1440 return 0;
1441
1442 while (sl != STATE_LIST_MARK) {
1443 if (states_equal(&sl->state, &env->cur_state))
1444 /* reached equivalent register/stack state,
1445 * prune the search
1446 */
1447 return 1;
1448 sl = sl->next;
1449 }
1450
1451 /* there were no equivalent states, remember current one.
1452 * technically the current state is not proven to be safe yet,
1453 * but it will either reach bpf_exit (which means it's safe) or
1454 * it will be rejected. Since there are no loops, we won't be
1455 * seeing this 'insn_idx' instruction again on the way to bpf_exit
1456 */
1457 new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER);
1458 if (!new_sl)
1459 return -ENOMEM;
1460
1461 /* add new state to the head of linked list */
1462 memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
1463 new_sl->next = env->explored_states[insn_idx];
1464 env->explored_states[insn_idx] = new_sl;
1465 return 0;
1466}
1467
1468static int do_check(struct verifier_env *env)
1469{
1470 struct verifier_state *state = &env->cur_state;
1471 struct bpf_insn *insns = env->prog->insnsi;
1472 struct reg_state *regs = state->regs;
1473 int insn_cnt = env->prog->len;
1474 int insn_idx, prev_insn_idx = 0;
1475 int insn_processed = 0;
1476 bool do_print_state = false;
1477
1478 init_reg_state(regs);
1479 insn_idx = 0;
1480 for (;;) {
1481 struct bpf_insn *insn;
1482 u8 class;
1483 int err;
1484
1485 if (insn_idx >= insn_cnt) {
1486 verbose("invalid insn idx %d insn_cnt %d\n",
1487 insn_idx, insn_cnt);
1488 return -EFAULT;
1489 }
1490
1491 insn = &insns[insn_idx];
1492 class = BPF_CLASS(insn->code);
1493
1494 if (++insn_processed > 32768) {
1495 verbose("BPF program is too large. Processed %d insn\n",
1496 insn_processed);
1497 return -E2BIG;
1498 }
1499
1500 err = is_state_visited(env, insn_idx);
1501 if (err < 0)
1502 return err;
1503 if (err == 1) {
1504 /* found equivalent state, can prune the search */
1505 if (log_level) {
1506 if (do_print_state)
1507 verbose("\nfrom %d to %d: safe\n",
1508 prev_insn_idx, insn_idx);
1509 else
1510 verbose("%d: safe\n", insn_idx);
1511 }
1512 goto process_bpf_exit;
1513 }
1514
1515 if (log_level && do_print_state) {
1516 verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
1517 print_verifier_state(env);
1518 do_print_state = false;
1519 }
1520
1521 if (log_level) {
1522 verbose("%d: ", insn_idx);
1523 print_bpf_insn(insn);
1524 }
1525
1526 if (class == BPF_ALU || class == BPF_ALU64) {
1527 err = check_alu_op(regs, insn);
1528 if (err)
1529 return err;
1530
1531 } else if (class == BPF_LDX) {
1532 if (BPF_MODE(insn->code) != BPF_MEM ||
1533 insn->imm != 0) {
1534 verbose("BPF_LDX uses reserved fields\n");
1535 return -EINVAL;
1536 }
1537 /* check src operand */
1538 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
1539 if (err)
1540 return err;
1541
1542 err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
1543 if (err)
1544 return err;
1545
1546 /* check that memory (src_reg + off) is readable,
1547 * the state of dst_reg will be updated by this func
1548 */
1549 err = check_mem_access(env, insn->src_reg, insn->off,
1550 BPF_SIZE(insn->code), BPF_READ,
1551 insn->dst_reg);
1552 if (err)
1553 return err;
1554
1555 } else if (class == BPF_STX) {
1556 if (BPF_MODE(insn->code) == BPF_XADD) {
1557 err = check_xadd(env, insn);
1558 if (err)
1559 return err;
1560 insn_idx++;
1561 continue;
1562 }
1563
1564 if (BPF_MODE(insn->code) != BPF_MEM ||
1565 insn->imm != 0) {
1566 verbose("BPF_STX uses reserved fields\n");
1567 return -EINVAL;
1568 }
1569 /* check src1 operand */
1570 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
1571 if (err)
1572 return err;
1573 /* check src2 operand */
1574 err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
1575 if (err)
1576 return err;
1577
1578 /* check that memory (dst_reg + off) is writeable */
1579 err = check_mem_access(env, insn->dst_reg, insn->off,
1580 BPF_SIZE(insn->code), BPF_WRITE,
1581 insn->src_reg);
1582 if (err)
1583 return err;
1584
1585 } else if (class == BPF_ST) {
1586 if (BPF_MODE(insn->code) != BPF_MEM ||
1587 insn->src_reg != BPF_REG_0) {
1588 verbose("BPF_ST uses reserved fields\n");
1589 return -EINVAL;
1590 }
1591 /* check src operand */
1592 err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
1593 if (err)
1594 return err;
1595
1596 /* check that memory (dst_reg + off) is writeable */
1597 err = check_mem_access(env, insn->dst_reg, insn->off,
1598 BPF_SIZE(insn->code), BPF_WRITE,
1599 -1);
1600 if (err)
1601 return err;
1602
1603 } else if (class == BPF_JMP) {
1604 u8 opcode = BPF_OP(insn->code);
1605
1606 if (opcode == BPF_CALL) {
1607 if (BPF_SRC(insn->code) != BPF_K ||
1608 insn->off != 0 ||
1609 insn->src_reg != BPF_REG_0 ||
1610 insn->dst_reg != BPF_REG_0) {
1611 verbose("BPF_CALL uses reserved fields\n");
1612 return -EINVAL;
1613 }
1614
1615 err = check_call(env, insn->imm);
1616 if (err)
1617 return err;
1618
1619 } else if (opcode == BPF_JA) {
1620 if (BPF_SRC(insn->code) != BPF_K ||
1621 insn->imm != 0 ||
1622 insn->src_reg != BPF_REG_0 ||
1623 insn->dst_reg != BPF_REG_0) {
1624 verbose("BPF_JA uses reserved fields\n");
1625 return -EINVAL;
1626 }
1627
1628 insn_idx += insn->off + 1;
1629 continue;
1630
1631 } else if (opcode == BPF_EXIT) {
1632 if (BPF_SRC(insn->code) != BPF_K ||
1633 insn->imm != 0 ||
1634 insn->src_reg != BPF_REG_0 ||
1635 insn->dst_reg != BPF_REG_0) {
1636 verbose("BPF_EXIT uses reserved fields\n");
1637 return -EINVAL;
1638 }
1639
1640 /* eBPF calling convention is such that R0 is used
1641 * to return the value from eBPF program.
1642 * Make sure that it's readable at this time
1643 * of bpf_exit, which means that program wrote
1644 * something into it earlier
1645 */
1646 err = check_reg_arg(regs, BPF_REG_0, SRC_OP);
1647 if (err)
1648 return err;
1649
1650process_bpf_exit:
1651 insn_idx = pop_stack(env, &prev_insn_idx);
1652 if (insn_idx < 0) {
1653 break;
1654 } else {
1655 do_print_state = true;
1656 continue;
1657 }
1658 } else {
1659 err = check_cond_jmp_op(env, insn, &insn_idx);
1660 if (err)
1661 return err;
1662 }
1663 } else if (class == BPF_LD) {
1664 u8 mode = BPF_MODE(insn->code);
1665
1666 if (mode == BPF_ABS || mode == BPF_IND) {
1667 verbose("LD_ABS is not supported yet\n");
1668 return -EINVAL;
1669 } else if (mode == BPF_IMM) {
1670 err = check_ld_imm(env, insn);
1671 if (err)
1672 return err;
1673
1674 insn_idx++;
1675 } else {
1676 verbose("invalid BPF_LD mode\n");
1677 return -EINVAL;
1678 }
1679 } else {
1680 verbose("unknown insn class %d\n", class);
1681 return -EINVAL;
1682 }
1683
1684 insn_idx++;
1685 }
1686
1687 return 0;
1688}
1689
1690/* look for pseudo eBPF instructions that access map FDs and
1691 * replace them with actual map pointers
1692 */
1693static int replace_map_fd_with_map_ptr(struct verifier_env *env)
1694{
1695 struct bpf_insn *insn = env->prog->insnsi;
1696 int insn_cnt = env->prog->len;
1697 int i, j;
1698
1699 for (i = 0; i < insn_cnt; i++, insn++) {
1700 if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
1701 struct bpf_map *map;
1702 struct fd f;
1703
1704 if (i == insn_cnt - 1 || insn[1].code != 0 ||
1705 insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
1706 insn[1].off != 0) {
1707 verbose("invalid bpf_ld_imm64 insn\n");
1708 return -EINVAL;
1709 }
1710
1711 if (insn->src_reg == 0)
1712 /* valid generic load 64-bit imm */
1713 goto next_insn;
1714
1715 if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
1716 verbose("unrecognized bpf_ld_imm64 insn\n");
1717 return -EINVAL;
1718 }
1719
1720 f = fdget(insn->imm);
1721
1722 map = bpf_map_get(f);
1723 if (IS_ERR(map)) {
1724 verbose("fd %d is not pointing to valid bpf_map\n",
1725 insn->imm);
1726 fdput(f);
1727 return PTR_ERR(map);
1728 }
1729
1730 /* store map pointer inside BPF_LD_IMM64 instruction */
1731 insn[0].imm = (u32) (unsigned long) map;
1732 insn[1].imm = ((u64) (unsigned long) map) >> 32;
1733
1734 /* check whether we recorded this map already */
1735 for (j = 0; j < env->used_map_cnt; j++)
1736 if (env->used_maps[j] == map) {
1737 fdput(f);
1738 goto next_insn;
1739 }
1740
1741 if (env->used_map_cnt >= MAX_USED_MAPS) {
1742 fdput(f);
1743 return -E2BIG;
1744 }
1745
1746 /* remember this map */
1747 env->used_maps[env->used_map_cnt++] = map;
1748
1749 /* hold the map. If the program is rejected by verifier,
1750 * the map will be released by release_maps() or it
1751 * will be used by the valid program until it's unloaded
1752 * and all maps are released in free_bpf_prog_info()
1753 */
1754 atomic_inc(&map->refcnt);
1755
1756 fdput(f);
1757next_insn:
1758 insn++;
1759 i++;
1760 }
1761 }
1762
1763 /* now all pseudo BPF_LD_IMM64 instructions load valid
1764 * 'struct bpf_map *' into a register instead of user map_fd.
1765 * These pointers will be used later by verifier to validate map access.
1766 */
1767 return 0;
1768}
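
For completeness, a hedged sketch (editor's addition) of the two-instruction pseudo ld_imm64 that a user-space loader emits and that the loop above rewrites: the first half carries BPF_PSEUDO_MAP_FD in src_reg and the map fd in imm, the second half must be all zeroes (its imm is the high 32 bits, which are zero for an fd). Raw struct bpf_insn field initializers per uapi/linux/bpf.h; BPF_REG_1 as the destination is an arbitrary choice.

#include <linux/bpf.h>	/* struct bpf_insn, BPF_PSEUDO_MAP_FD, BPF_REG_* */

static void emit_pseudo_ld_map_fd(struct bpf_insn insn[2], int map_fd)
{
	insn[0] = (struct bpf_insn) {
		.code    = BPF_LD | BPF_DW | BPF_IMM,
		.dst_reg = BPF_REG_1,
		.src_reg = BPF_PSEUDO_MAP_FD,	/* imm holds a map fd, not a plain constant */
		.off     = 0,
		.imm     = map_fd,		/* low 32 bits of the immediate */
	};
	insn[1] = (struct bpf_insn) { 0 };	/* second half: code/regs/off/imm must be 0 */
}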
1769
1770/* drop refcnt of maps used by the rejected program */
1771static void release_maps(struct verifier_env *env)
1772{
1773 int i;
1774
1775 for (i = 0; i < env->used_map_cnt; i++)
1776 bpf_map_put(env->used_maps[i]);
1777}
1778
1779/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
1780static void convert_pseudo_ld_imm64(struct verifier_env *env)
1781{
1782 struct bpf_insn *insn = env->prog->insnsi;
1783 int insn_cnt = env->prog->len;
1784 int i;
1785
1786 for (i = 0; i < insn_cnt; i++, insn++)
1787 if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
1788 insn->src_reg = 0;
1789}
1790
1791static void free_states(struct verifier_env *env)
1792{
1793 struct verifier_state_list *sl, *sln;
1794 int i;
1795
1796 if (!env->explored_states)
1797 return;
1798
1799 for (i = 0; i < env->prog->len; i++) {
1800 sl = env->explored_states[i];
1801
1802 if (sl)
1803 while (sl != STATE_LIST_MARK) {
1804 sln = sl->next;
1805 kfree(sl);
1806 sl = sln;
1807 }
1808 }
1809
1810 kfree(env->explored_states);
1811}
1812
1813int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
1814{
1815 char __user *log_ubuf = NULL;
1816 struct verifier_env *env;
1817 int ret = -EINVAL;
1818
1819 if (prog->len <= 0 || prog->len > BPF_MAXINSNS)
1820 return -E2BIG;
1821
1822 /* 'struct verifier_env' can be global, but since it's not small,
1823 * allocate/free it every time bpf_check() is called
1824 */
1825 env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
1826 if (!env)
1827 return -ENOMEM;
1828
1829 env->prog = prog;
1830
1831 /* grab the mutex to protect a few globals used by the verifier */
1832 mutex_lock(&bpf_verifier_lock);
1833
1834 if (attr->log_level || attr->log_buf || attr->log_size) {
1835 /* user requested verbose verifier output
1836 * and supplied buffer to store the verification trace
1837 */
1838 log_level = attr->log_level;
1839 log_ubuf = (char __user *) (unsigned long) attr->log_buf;
1840 log_size = attr->log_size;
1841 log_len = 0;
1842
1843 ret = -EINVAL;
1844 /* log_* values have to be sane */
1845 if (log_size < 128 || log_size > UINT_MAX >> 8 ||
1846 log_level == 0 || log_ubuf == NULL)
1847 goto free_env;
1848
1849 ret = -ENOMEM;
1850 log_buf = vmalloc(log_size);
1851 if (!log_buf)
1852 goto free_env;
1853 } else {
1854 log_level = 0;
1855 }
1856
1857 ret = replace_map_fd_with_map_ptr(env);
1858 if (ret < 0)
1859 goto skip_full_check;
1860
1861 env->explored_states = kcalloc(prog->len,
1862 sizeof(struct verifier_state_list *),
1863 GFP_USER);
1864 ret = -ENOMEM;
1865 if (!env->explored_states)
1866 goto skip_full_check;
1867
1868 ret = check_cfg(env);
1869 if (ret < 0)
1870 goto skip_full_check;
1871
1872 ret = do_check(env);
1873
1874skip_full_check:
1875 while (pop_stack(env, NULL) >= 0);
1876 free_states(env);
1877
1878 if (log_level && log_len >= log_size - 1) {
1879 BUG_ON(log_len >= log_size);
1880 /* verifier log exceeded user supplied buffer */
1881 ret = -ENOSPC;
1882 /* fall through to return what was recorded */
1883 }
1884
1885 /* copy verifier log back to user space including trailing zero */
1886 if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
1887 ret = -EFAULT;
1888 goto free_log_buf;
1889 }
1890
1891 if (ret == 0 && env->used_map_cnt) {
1892 /* if program passed verifier, update used_maps in bpf_prog_info */
1893 prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
1894 sizeof(env->used_maps[0]),
1895 GFP_KERNEL);
1896
1897 if (!prog->aux->used_maps) {
1898 ret = -ENOMEM;
1899 goto free_log_buf;
1900 }
1901
1902 memcpy(prog->aux->used_maps, env->used_maps,
1903 sizeof(env->used_maps[0]) * env->used_map_cnt);
1904 prog->aux->used_map_cnt = env->used_map_cnt;
1905
1906 /* program is valid. Convert pseudo bpf_ld_imm64 into generic
1907 * bpf_ld_imm64 instructions
1908 */
1909 convert_pseudo_ld_imm64(env);
1910 }
1911
1912free_log_buf:
1913 if (log_level)
1914 vfree(log_buf);
1915free_env:
1916 if (!prog->aux->used_maps)
1917 /* if we didn't copy map pointers into bpf_prog_info, release
1918 * them now. Otherwise free_bpf_prog_info() will release them.
1919 */
1920 release_maps(env);
1921 kfree(env);
1922 mutex_unlock(&bpf_verifier_lock);
1923 return ret;
1924}
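
Rounding off the verifier listing, an editor-added user-space sketch of the log path bpf_check() implements: pass log_level, log_buf and log_size through the BPF_PROG_LOAD attributes and print the trace when loading fails. Field names follow uapi/linux/bpf.h at this point in time; the raw syscall(__NR_bpf, ...) invocation and BPF_PROG_TYPE_UNSPEC as the program type are assumptions (no libbpf wrapper existed yet), so treat this as a sketch rather than a reference loader.

#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

static char verifier_log[65536];	/* must be >= 128 bytes, see bpf_check() */

static int load_prog(const struct bpf_insn *insns, int insn_cnt)
{
	union bpf_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_UNSPEC;		/* handled by test_stub.c in this tree */
	attr.insns     = (unsigned long) insns;
	attr.insn_cnt  = insn_cnt;
	attr.license   = (unsigned long) "GPL";
	attr.log_level = 1;				/* ask for the verification trace */
	attr.log_buf   = (unsigned long) verifier_log;
	attr.log_size  = sizeof(verifier_log);

	fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
	if (fd < 0)
		fprintf(stderr, "verifier said:\n%s\n", verifier_log);
	return fd;
}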
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7dc8788cfd52..136eceadeed1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -185,7 +185,6 @@ static int need_forkexit_callback __read_mostly;
185static struct cftype cgroup_dfl_base_files[]; 185static struct cftype cgroup_dfl_base_files[];
186static struct cftype cgroup_legacy_base_files[]; 186static struct cftype cgroup_legacy_base_files[];
187 187
188static void cgroup_put(struct cgroup *cgrp);
189static int rebind_subsystems(struct cgroup_root *dst_root, 188static int rebind_subsystems(struct cgroup_root *dst_root,
190 unsigned int ss_mask); 189 unsigned int ss_mask);
191static int cgroup_destroy_locked(struct cgroup *cgrp); 190static int cgroup_destroy_locked(struct cgroup *cgrp);
@@ -195,7 +194,6 @@ static void css_release(struct percpu_ref *ref);
195static void kill_css(struct cgroup_subsys_state *css); 194static void kill_css(struct cgroup_subsys_state *css);
196static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 195static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
197 bool is_add); 196 bool is_add);
198static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
199 197
200/* IDR wrappers which synchronize using cgroup_idr_lock */ 198/* IDR wrappers which synchronize using cgroup_idr_lock */
201static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, 199static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
@@ -331,14 +329,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
331 return false; 329 return false;
332} 330}
333 331
334static int cgroup_is_releasable(const struct cgroup *cgrp)
335{
336 const int bits =
337 (1 << CGRP_RELEASABLE) |
338 (1 << CGRP_NOTIFY_ON_RELEASE);
339 return (cgrp->flags & bits) == bits;
340}
341
342static int notify_on_release(const struct cgroup *cgrp) 332static int notify_on_release(const struct cgroup *cgrp)
343{ 333{
344 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 334 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -394,12 +384,7 @@ static int notify_on_release(const struct cgroup *cgrp)
394 ; \ 384 ; \
395 else 385 else
396 386
397/* the list of cgroups eligible for automatic release. Protected by
398 * release_list_lock */
399static LIST_HEAD(release_list);
400static DEFINE_RAW_SPINLOCK(release_list_lock);
401static void cgroup_release_agent(struct work_struct *work); 387static void cgroup_release_agent(struct work_struct *work);
402static DECLARE_WORK(release_agent_work, cgroup_release_agent);
403static void check_for_release(struct cgroup *cgrp); 388static void check_for_release(struct cgroup *cgrp);
404 389
405/* 390/*
@@ -498,7 +483,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
498 return key; 483 return key;
499} 484}
500 485
501static void put_css_set_locked(struct css_set *cset, bool taskexit) 486static void put_css_set_locked(struct css_set *cset)
502{ 487{
503 struct cgrp_cset_link *link, *tmp_link; 488 struct cgrp_cset_link *link, *tmp_link;
504 struct cgroup_subsys *ss; 489 struct cgroup_subsys *ss;
@@ -524,11 +509,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
524 /* @cgrp can't go away while we're holding css_set_rwsem */ 509 /* @cgrp can't go away while we're holding css_set_rwsem */
525 if (list_empty(&cgrp->cset_links)) { 510 if (list_empty(&cgrp->cset_links)) {
526 cgroup_update_populated(cgrp, false); 511 cgroup_update_populated(cgrp, false);
527 if (notify_on_release(cgrp)) { 512 check_for_release(cgrp);
528 if (taskexit)
529 set_bit(CGRP_RELEASABLE, &cgrp->flags);
530 check_for_release(cgrp);
531 }
532 } 513 }
533 514
534 kfree(link); 515 kfree(link);
@@ -537,7 +518,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
537 kfree_rcu(cset, rcu_head); 518 kfree_rcu(cset, rcu_head);
538} 519}
539 520
540static void put_css_set(struct css_set *cset, bool taskexit) 521static void put_css_set(struct css_set *cset)
541{ 522{
542 /* 523 /*
543 * Ensure that the refcount doesn't hit zero while any readers 524 * Ensure that the refcount doesn't hit zero while any readers
@@ -548,7 +529,7 @@ static void put_css_set(struct css_set *cset, bool taskexit)
548 return; 529 return;
549 530
550 down_write(&css_set_rwsem); 531 down_write(&css_set_rwsem);
551 put_css_set_locked(cset, taskexit); 532 put_css_set_locked(cset);
552 up_write(&css_set_rwsem); 533 up_write(&css_set_rwsem);
553} 534}
554 535
@@ -969,14 +950,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
969 * knows that the cgroup won't be removed, as cgroup_rmdir() 950 * knows that the cgroup won't be removed, as cgroup_rmdir()
970 * needs that mutex. 951 * needs that mutex.
971 * 952 *
972 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
973 * (usually) take cgroup_mutex. These are the two most performance
974 * critical pieces of code here. The exception occurs on cgroup_exit(),
975 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
976 * is taken, and if the cgroup count is zero, a usermode call made
977 * to the release agent with the name of the cgroup (path relative to
978 * the root of cgroup file system) as the argument.
979 *
980 * A cgroup can only be deleted if both its 'count' of using tasks 953 * A cgroup can only be deleted if both its 'count' of using tasks
981 * is zero, and its list of 'children' cgroups is empty. Since all 954 * is zero, and its list of 'children' cgroups is empty. Since all
982 * tasks in the system use _some_ cgroup, and since there is always at 955 * tasks in the system use _some_ cgroup, and since there is always at
@@ -1035,6 +1008,11 @@ static void cgroup_get(struct cgroup *cgrp)
1035 css_get(&cgrp->self); 1008 css_get(&cgrp->self);
1036} 1009}
1037 1010
1011static bool cgroup_tryget(struct cgroup *cgrp)
1012{
1013 return css_tryget(&cgrp->self);
1014}
1015
1038static void cgroup_put(struct cgroup *cgrp) 1016static void cgroup_put(struct cgroup *cgrp)
1039{ 1017{
1040 css_put(&cgrp->self); 1018 css_put(&cgrp->self);
@@ -1147,7 +1125,8 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
1147 * protection against removal. Ensure @cgrp stays accessible and 1125 * protection against removal. Ensure @cgrp stays accessible and
1148 * break the active_ref protection. 1126 * break the active_ref protection.
1149 */ 1127 */
1150 cgroup_get(cgrp); 1128 if (!cgroup_tryget(cgrp))
1129 return NULL;
1151 kernfs_break_active_protection(kn); 1130 kernfs_break_active_protection(kn);
1152 1131
1153 mutex_lock(&cgroup_mutex); 1132 mutex_lock(&cgroup_mutex);
@@ -1581,7 +1560,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1581 INIT_LIST_HEAD(&cgrp->self.sibling); 1560 INIT_LIST_HEAD(&cgrp->self.sibling);
1582 INIT_LIST_HEAD(&cgrp->self.children); 1561 INIT_LIST_HEAD(&cgrp->self.children);
1583 INIT_LIST_HEAD(&cgrp->cset_links); 1562 INIT_LIST_HEAD(&cgrp->cset_links);
1584 INIT_LIST_HEAD(&cgrp->release_list);
1585 INIT_LIST_HEAD(&cgrp->pidlists); 1563 INIT_LIST_HEAD(&cgrp->pidlists);
1586 mutex_init(&cgrp->pidlist_mutex); 1564 mutex_init(&cgrp->pidlist_mutex);
1587 cgrp->self.cgroup = cgrp; 1565 cgrp->self.cgroup = cgrp;
@@ -1591,6 +1569,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1591 INIT_LIST_HEAD(&cgrp->e_csets[ssid]); 1569 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1592 1570
1593 init_waitqueue_head(&cgrp->offline_waitq); 1571 init_waitqueue_head(&cgrp->offline_waitq);
1572 INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
1594} 1573}
1595 1574
1596static void init_cgroup_root(struct cgroup_root *root, 1575static void init_cgroup_root(struct cgroup_root *root,
@@ -1628,7 +1607,8 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1628 goto out; 1607 goto out;
1629 root_cgrp->id = ret; 1608 root_cgrp->id = ret;
1630 1609
1631 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); 1610 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
1611 GFP_KERNEL);
1632 if (ret) 1612 if (ret)
1633 goto out; 1613 goto out;
1634 1614
@@ -2046,8 +2026,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
2046 * task. As trading it for new_cset is protected by cgroup_mutex, 2026 * task. As trading it for new_cset is protected by cgroup_mutex,
2047 * we're safe to drop it here; it will be freed under RCU. 2027 * we're safe to drop it here; it will be freed under RCU.
2048 */ 2028 */
2049 set_bit(CGRP_RELEASABLE, &old_cgrp->flags); 2029 put_css_set_locked(old_cset);
2050 put_css_set_locked(old_cset, false);
2051} 2030}
2052 2031
2053/** 2032/**
@@ -2068,7 +2047,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
2068 cset->mg_src_cgrp = NULL; 2047 cset->mg_src_cgrp = NULL;
2069 cset->mg_dst_cset = NULL; 2048 cset->mg_dst_cset = NULL;
2070 list_del_init(&cset->mg_preload_node); 2049 list_del_init(&cset->mg_preload_node);
2071 put_css_set_locked(cset, false); 2050 put_css_set_locked(cset);
2072 } 2051 }
2073 up_write(&css_set_rwsem); 2052 up_write(&css_set_rwsem);
2074} 2053}
@@ -2162,8 +2141,8 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
2162 if (src_cset == dst_cset) { 2141 if (src_cset == dst_cset) {
2163 src_cset->mg_src_cgrp = NULL; 2142 src_cset->mg_src_cgrp = NULL;
2164 list_del_init(&src_cset->mg_preload_node); 2143 list_del_init(&src_cset->mg_preload_node);
2165 put_css_set(src_cset, false); 2144 put_css_set(src_cset);
2166 put_css_set(dst_cset, false); 2145 put_css_set(dst_cset);
2167 continue; 2146 continue;
2168 } 2147 }
2169 2148
@@ -2172,7 +2151,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
2172 if (list_empty(&dst_cset->mg_preload_node)) 2151 if (list_empty(&dst_cset->mg_preload_node))
2173 list_add(&dst_cset->mg_preload_node, &csets); 2152 list_add(&dst_cset->mg_preload_node, &csets);
2174 else 2153 else
2175 put_css_set(dst_cset, false); 2154 put_css_set(dst_cset);
2176 } 2155 }
2177 2156
2178 list_splice_tail(&csets, preloaded_csets); 2157 list_splice_tail(&csets, preloaded_csets);
@@ -3271,8 +3250,17 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3271{ 3250{
3272 struct cftype *cft; 3251 struct cftype *cft;
3273 3252
3274 for (cft = cfts; cft && cft->name[0] != '\0'; cft++) 3253 /*
3275 cft->flags |= __CFTYPE_NOT_ON_DFL; 3254 * If legacy_files_on_dfl, we want to show the legacy files on the
3255 * dfl hierarchy but iff the target subsystem hasn't been updated
3256 * for the dfl hierarchy yet.
3257 */
3258 if (!cgroup_legacy_files_on_dfl ||
3259 ss->dfl_cftypes != ss->legacy_cftypes) {
3260 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3261 cft->flags |= __CFTYPE_NOT_ON_DFL;
3262 }
3263
3276 return cgroup_add_cftypes(ss, cfts); 3264 return cgroup_add_cftypes(ss, cfts);
3277} 3265}
3278 3266
@@ -3970,7 +3958,6 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3970 3958
3971 l = cgroup_pidlist_find_create(cgrp, type); 3959 l = cgroup_pidlist_find_create(cgrp, type);
3972 if (!l) { 3960 if (!l) {
3973 mutex_unlock(&cgrp->pidlist_mutex);
3974 pidlist_free(array); 3961 pidlist_free(array);
3975 return -ENOMEM; 3962 return -ENOMEM;
3976 } 3963 }
@@ -4159,7 +4146,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
4159static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, 4146static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
4160 struct cftype *cft, u64 val) 4147 struct cftype *cft, u64 val)
4161{ 4148{
4162 clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
4163 if (val) 4149 if (val)
4164 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); 4150 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
4165 else 4151 else
@@ -4337,6 +4323,7 @@ static void css_free_work_fn(struct work_struct *work)
4337 /* cgroup free path */ 4323 /* cgroup free path */
4338 atomic_dec(&cgrp->root->nr_cgrps); 4324 atomic_dec(&cgrp->root->nr_cgrps);
4339 cgroup_pidlist_destroy_all(cgrp); 4325 cgroup_pidlist_destroy_all(cgrp);
4326 cancel_work_sync(&cgrp->release_agent_work);
4340 4327
4341 if (cgroup_parent(cgrp)) { 4328 if (cgroup_parent(cgrp)) {
4342 /* 4329 /*
@@ -4387,6 +4374,15 @@ static void css_release_work_fn(struct work_struct *work)
4387 /* cgroup release path */ 4374 /* cgroup release path */
4388 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); 4375 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4389 cgrp->id = -1; 4376 cgrp->id = -1;
4377
4378 /*
4379 * There are two control paths which try to determine
4380 * cgroup from dentry without going through kernfs -
4381 * cgroupstats_build() and css_tryget_online_from_dir().
4382 * Those are supported by RCU protecting clearing of
4383 * cgrp->kn->priv backpointer.
4384 */
4385 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
4390 } 4386 }
4391 4387
4392 mutex_unlock(&cgroup_mutex); 4388 mutex_unlock(&cgroup_mutex);
@@ -4487,7 +4483,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
4487 4483
4488 init_and_link_css(css, ss, cgrp); 4484 init_and_link_css(css, ss, cgrp);
4489 4485
4490 err = percpu_ref_init(&css->refcnt, css_release); 4486 err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
4491 if (err) 4487 if (err)
4492 goto err_free_css; 4488 goto err_free_css;
4493 4489
@@ -4543,6 +4539,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4543 struct cftype *base_files; 4539 struct cftype *base_files;
4544 int ssid, ret; 4540 int ssid, ret;
4545 4541
4542 /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
4543 */
4544 if (strchr(name, '\n'))
4545 return -EINVAL;
4546
4546 parent = cgroup_kn_lock_live(parent_kn); 4547 parent = cgroup_kn_lock_live(parent_kn);
4547 if (!parent) 4548 if (!parent)
4548 return -ENODEV; 4549 return -ENODEV;
@@ -4555,7 +4556,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4555 goto out_unlock; 4556 goto out_unlock;
4556 } 4557 }
4557 4558
4558 ret = percpu_ref_init(&cgrp->self.refcnt, css_release); 4559 ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
4559 if (ret) 4560 if (ret)
4560 goto out_free_cgrp; 4561 goto out_free_cgrp;
4561 4562
@@ -4785,19 +4786,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4785 for_each_css(css, ssid, cgrp) 4786 for_each_css(css, ssid, cgrp)
4786 kill_css(css); 4787 kill_css(css);
4787 4788
4788 /* CSS_ONLINE is clear, remove from ->release_list for the last time */
4789 raw_spin_lock(&release_list_lock);
4790 if (!list_empty(&cgrp->release_list))
4791 list_del_init(&cgrp->release_list);
4792 raw_spin_unlock(&release_list_lock);
4793
4794 /* 4789 /*
4795 * Remove @cgrp directory along with the base files. @cgrp has an 4790 * Remove @cgrp directory along with the base files. @cgrp has an
4796 * extra ref on its kn. 4791 * extra ref on its kn.
4797 */ 4792 */
4798 kernfs_remove(cgrp->kn); 4793 kernfs_remove(cgrp->kn);
4799 4794
4800 set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags);
4801 check_for_release(cgroup_parent(cgrp)); 4795 check_for_release(cgroup_parent(cgrp));
4802 4796
4803 /* put the base reference */ 4797 /* put the base reference */
@@ -4814,23 +4808,10 @@ static int cgroup_rmdir(struct kernfs_node *kn)
4814 cgrp = cgroup_kn_lock_live(kn); 4808 cgrp = cgroup_kn_lock_live(kn);
4815 if (!cgrp) 4809 if (!cgrp)
4816 return 0; 4810 return 0;
4817 cgroup_get(cgrp); /* for @kn->priv clearing */
4818 4811
4819 ret = cgroup_destroy_locked(cgrp); 4812 ret = cgroup_destroy_locked(cgrp);
4820 4813
4821 cgroup_kn_unlock(kn); 4814 cgroup_kn_unlock(kn);
4822
4823 /*
4824 * There are two control paths which try to determine cgroup from
4825 * dentry without going through kernfs - cgroupstats_build() and
4826 * css_tryget_online_from_dir(). Those are supported by RCU
4827 * protecting clearing of cgrp->kn->priv backpointer, which should
4828 * happen after all files under it have been removed.
4829 */
4830 if (!ret)
4831 RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
4832
4833 cgroup_put(cgrp);
4834 return ret; 4815 return ret;
4835} 4816}
4836 4817
@@ -5034,12 +5015,9 @@ core_initcall(cgroup_wq_init);
5034 * - Print task's cgroup paths into seq_file, one line for each hierarchy 5015 * - Print task's cgroup paths into seq_file, one line for each hierarchy
5035 * - Used for /proc/<pid>/cgroup. 5016 * - Used for /proc/<pid>/cgroup.
5036 */ 5017 */
5037 5018int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5038/* TODO: Use a proper seq_file iterator */ 5019 struct pid *pid, struct task_struct *tsk)
5039int proc_cgroup_show(struct seq_file *m, void *v)
5040{ 5020{
5041 struct pid *pid;
5042 struct task_struct *tsk;
5043 char *buf, *path; 5021 char *buf, *path;
5044 int retval; 5022 int retval;
5045 struct cgroup_root *root; 5023 struct cgroup_root *root;
@@ -5049,14 +5027,6 @@ int proc_cgroup_show(struct seq_file *m, void *v)
5049 if (!buf) 5027 if (!buf)
5050 goto out; 5028 goto out;
5051 5029
5052 retval = -ESRCH;
5053 pid = m->private;
5054 tsk = get_pid_task(pid, PIDTYPE_PID);
5055 if (!tsk)
5056 goto out_free;
5057
5058 retval = 0;
5059
5060 mutex_lock(&cgroup_mutex); 5030 mutex_lock(&cgroup_mutex);
5061 down_read(&css_set_rwsem); 5031 down_read(&css_set_rwsem);
5062 5032
@@ -5086,11 +5056,10 @@ int proc_cgroup_show(struct seq_file *m, void *v)
5086 seq_putc(m, '\n'); 5056 seq_putc(m, '\n');
5087 } 5057 }
5088 5058
5059 retval = 0;
5089out_unlock: 5060out_unlock:
5090 up_read(&css_set_rwsem); 5061 up_read(&css_set_rwsem);
5091 mutex_unlock(&cgroup_mutex); 5062 mutex_unlock(&cgroup_mutex);
5092 put_task_struct(tsk);
5093out_free:
5094 kfree(buf); 5063 kfree(buf);
5095out: 5064out:
5096 return retval; 5065 return retval;
@@ -5161,7 +5130,7 @@ void cgroup_post_fork(struct task_struct *child)
5161 int i; 5130 int i;
5162 5131
5163 /* 5132 /*
5164 * This may race against cgroup_enable_task_cg_links(). As that 5133 * This may race against cgroup_enable_task_cg_lists(). As that
5165 * function sets use_task_css_set_links before grabbing 5134 * function sets use_task_css_set_links before grabbing
5166 * tasklist_lock and we just went through tasklist_lock to add 5135 * tasklist_lock and we just went through tasklist_lock to add
5167 * @child, it's guaranteed that either we see the set 5136 * @child, it's guaranteed that either we see the set
@@ -5176,7 +5145,7 @@ void cgroup_post_fork(struct task_struct *child)
5176 * when implementing operations which need to migrate all tasks of 5145 * when implementing operations which need to migrate all tasks of
5177 * a cgroup to another. 5146 * a cgroup to another.
5178 * 5147 *
5179 * Note that if we lose to cgroup_enable_task_cg_links(), @child 5148 * Note that if we lose to cgroup_enable_task_cg_lists(), @child
5180 * will remain in init_css_set. This is safe because all tasks are 5149 * will remain in init_css_set. This is safe because all tasks are
5181 * in the init_css_set before cg_links is enabled and there's no 5150 * in the init_css_set before cg_links is enabled and there's no
5182 * operation which transfers all tasks out of init_css_set. 5151 * operation which transfers all tasks out of init_css_set.
@@ -5260,30 +5229,14 @@ void cgroup_exit(struct task_struct *tsk)
5260 } 5229 }
5261 5230
5262 if (put_cset) 5231 if (put_cset)
5263 put_css_set(cset, true); 5232 put_css_set(cset);
5264} 5233}
5265 5234
5266static void check_for_release(struct cgroup *cgrp) 5235static void check_for_release(struct cgroup *cgrp)
5267{ 5236{
5268 if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) && 5237 if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
5269 !css_has_online_children(&cgrp->self)) { 5238 !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
5270 /* 5239 schedule_work(&cgrp->release_agent_work);
5271 * Control Group is currently removeable. If it's not
5272 * already queued for a userspace notification, queue
5273 * it now
5274 */
5275 int need_schedule_work = 0;
5276
5277 raw_spin_lock(&release_list_lock);
5278 if (!cgroup_is_dead(cgrp) &&
5279 list_empty(&cgrp->release_list)) {
5280 list_add(&cgrp->release_list, &release_list);
5281 need_schedule_work = 1;
5282 }
5283 raw_spin_unlock(&release_list_lock);
5284 if (need_schedule_work)
5285 schedule_work(&release_agent_work);
5286 }
5287} 5240}
5288 5241
5289/* 5242/*
@@ -5311,52 +5264,39 @@ static void check_for_release(struct cgroup *cgrp)
5311 */ 5264 */
5312static void cgroup_release_agent(struct work_struct *work) 5265static void cgroup_release_agent(struct work_struct *work)
5313{ 5266{
5314 BUG_ON(work != &release_agent_work); 5267 struct cgroup *cgrp =
5268 container_of(work, struct cgroup, release_agent_work);
5269 char *pathbuf = NULL, *agentbuf = NULL, *path;
5270 char *argv[3], *envp[3];
5271
5315 mutex_lock(&cgroup_mutex); 5272 mutex_lock(&cgroup_mutex);
5316 raw_spin_lock(&release_list_lock); 5273
5317 while (!list_empty(&release_list)) { 5274 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
5318 char *argv[3], *envp[3]; 5275 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
5319 int i; 5276 if (!pathbuf || !agentbuf)
5320 char *pathbuf = NULL, *agentbuf = NULL, *path; 5277 goto out;
5321 struct cgroup *cgrp = list_entry(release_list.next, 5278
5322 struct cgroup, 5279 path = cgroup_path(cgrp, pathbuf, PATH_MAX);
5323 release_list); 5280 if (!path)
5324 list_del_init(&cgrp->release_list); 5281 goto out;
5325 raw_spin_unlock(&release_list_lock); 5282
5326 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); 5283 argv[0] = agentbuf;
5327 if (!pathbuf) 5284 argv[1] = path;
5328 goto continue_free; 5285 argv[2] = NULL;
5329 path = cgroup_path(cgrp, pathbuf, PATH_MAX); 5286
5330 if (!path) 5287 /* minimal command environment */
5331 goto continue_free; 5288 envp[0] = "HOME=/";
5332 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); 5289 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
5333 if (!agentbuf) 5290 envp[2] = NULL;
5334 goto continue_free; 5291
5335
5336 i = 0;
5337 argv[i++] = agentbuf;
5338 argv[i++] = path;
5339 argv[i] = NULL;
5340
5341 i = 0;
5342 /* minimal command environment */
5343 envp[i++] = "HOME=/";
5344 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
5345 envp[i] = NULL;
5346
5347 /* Drop the lock while we invoke the usermode helper,
5348 * since the exec could involve hitting disk and hence
5349 * be a slow process */
5350 mutex_unlock(&cgroup_mutex);
5351 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
5352 mutex_lock(&cgroup_mutex);
5353 continue_free:
5354 kfree(pathbuf);
5355 kfree(agentbuf);
5356 raw_spin_lock(&release_list_lock);
5357 }
5358 raw_spin_unlock(&release_list_lock);
5359 mutex_unlock(&cgroup_mutex); 5292 mutex_unlock(&cgroup_mutex);
5293 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
5294 goto out_free;
5295out:
5296 mutex_unlock(&cgroup_mutex);
5297out_free:
5298 kfree(agentbuf);
5299 kfree(pathbuf);
5360} 5300}
5361 5301
5362static int __init cgroup_disable(char *str) 5302static int __init cgroup_disable(char *str)
@@ -5416,7 +5356,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
5416 /* 5356 /*
5417 * This path doesn't originate from kernfs and @kn could already 5357 * This path doesn't originate from kernfs and @kn could already
5418 * have been or be removed at any point. @kn->priv is RCU 5358 * have been or be removed at any point. @kn->priv is RCU
5419 * protected for this access. See cgroup_rmdir() for details. 5359 * protected for this access. See css_release_work_fn() for details.
5420 */ 5360 */
5421 cgrp = rcu_dereference(kn->priv); 5361 cgrp = rcu_dereference(kn->priv);
5422 if (cgrp) 5362 if (cgrp)
@@ -5544,7 +5484,8 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
5544 5484
5545static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) 5485static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5546{ 5486{
5547 return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); 5487 return (!cgroup_has_tasks(css->cgroup) &&
5488 !css_has_online_children(&css->cgroup->self));
5548} 5489}
5549 5490
5550static struct cftype debug_files[] = { 5491static struct cftype debug_files[] = {
diff --git a/kernel/compat.c b/kernel/compat.c
index 633394f442f8..ebb3c369d03d 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -226,7 +226,7 @@ static long compat_nanosleep_restart(struct restart_block *restart)
226 ret = hrtimer_nanosleep_restart(restart); 226 ret = hrtimer_nanosleep_restart(restart);
227 set_fs(oldfs); 227 set_fs(oldfs);
228 228
229 if (ret) { 229 if (ret == -ERESTART_RESTARTBLOCK) {
230 rmtp = restart->nanosleep.compat_rmtp; 230 rmtp = restart->nanosleep.compat_rmtp;
231 231
232 if (rmtp && compat_put_timespec(&rmt, rmtp)) 232 if (rmtp && compat_put_timespec(&rmt, rmtp))
@@ -256,7 +256,26 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
256 HRTIMER_MODE_REL, CLOCK_MONOTONIC); 256 HRTIMER_MODE_REL, CLOCK_MONOTONIC);
257 set_fs(oldfs); 257 set_fs(oldfs);
258 258
259 if (ret) { 259 /*
260 * hrtimer_nanosleep() can only return 0 or
261 * -ERESTART_RESTARTBLOCK here because:
262 *
263 * - we call it with HRTIMER_MODE_REL and therefore exclude the
264 * -ERESTARTNOHAND return path.
265 *
266 * - we supply the rmtp argument from the task stack (due to
267 * the necessary compat conversion), so the update cannot
268 * fail, which excludes the -EFAULT return path as well. If
269 * it fails nevertheless we have a bigger problem and won't
270 * reach this place anymore.
271 *
272 * - if the return value is 0, we do not have to update rmtp
273 * because there is no remaining time.
274 *
275 * We check for -ERESTART_RESTARTBLOCK nevertheless if the
276 * core implementation decides to return random nonsense.
277 */
278 if (ret == -ERESTART_RESTARTBLOCK) {
260 struct restart_block *restart 279 struct restart_block *restart
261 = &current_thread_info()->restart_block; 280 = &current_thread_info()->restart_block;
262 281
@@ -266,7 +285,6 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
266 if (rmtp && compat_put_timespec(&rmt, rmtp)) 285 if (rmtp && compat_put_timespec(&rmt, rmtp))
267 return -EFAULT; 286 return -EFAULT;
268 } 287 }
269
270 return ret; 288 return ret;
271} 289}
272 290
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
new file mode 100644
index 000000000000..c2de56ab0fce
--- /dev/null
+++ b/kernel/configs/tiny.config
@@ -0,0 +1,4 @@
1CONFIG_CC_OPTIMIZE_FOR_SIZE=y
2CONFIG_KERNEL_XZ=y
3CONFIG_OPTIMIZE_INLINING=y
4CONFIG_SLOB=y
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 5664985c46a0..937ecdfdf258 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -107,46 +107,6 @@ void context_tracking_user_enter(void)
107} 107}
108NOKPROBE_SYMBOL(context_tracking_user_enter); 108NOKPROBE_SYMBOL(context_tracking_user_enter);
109 109
110#ifdef CONFIG_PREEMPT
111/**
112 * preempt_schedule_context - preempt_schedule called by tracing
113 *
114 * The tracing infrastructure uses preempt_enable_notrace to prevent
115 * recursion and tracing preempt enabling caused by the tracing
116 * infrastructure itself. But as tracing can happen in areas coming
117 * from userspace or just about to enter userspace, a preempt enable
118 * can occur before user_exit() is called. This will cause the scheduler
119 * to be called when the system is still in usermode.
120 *
121 * To prevent this, the preempt_enable_notrace will use this function
122 * instead of preempt_schedule() to exit user context if needed before
123 * calling the scheduler.
124 */
125asmlinkage __visible void __sched notrace preempt_schedule_context(void)
126{
127 enum ctx_state prev_ctx;
128
129 if (likely(!preemptible()))
130 return;
131
132 /*
133 * Need to disable preemption in case user_exit() is traced
134 * and the tracer calls preempt_enable_notrace() causing
135 * an infinite recursion.
136 */
137 preempt_disable_notrace();
138 prev_ctx = exception_enter();
139 preempt_enable_no_resched_notrace();
140
141 preempt_schedule();
142
143 preempt_disable_notrace();
144 exception_exit(prev_ctx);
145 preempt_enable_notrace();
146}
147EXPORT_SYMBOL_GPL(preempt_schedule_context);
148#endif /* CONFIG_PREEMPT */
149
150/** 110/**
151 * context_tracking_user_exit - Inform the context tracking that the CPU is 111 * context_tracking_user_exit - Inform the context tracking that the CPU is
152 * exiting userspace mode and entering the kernel. 112 * exiting userspace mode and entering the kernel.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 81e2a388a0f6..90a3d017b90c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -64,6 +64,8 @@ static struct {
64 * an ongoing cpu hotplug operation. 64 * an ongoing cpu hotplug operation.
65 */ 65 */
66 int refcount; 66 int refcount;
67 /* And allows lockless put_online_cpus(). */
68 atomic_t puts_pending;
67 69
68#ifdef CONFIG_DEBUG_LOCK_ALLOC 70#ifdef CONFIG_DEBUG_LOCK_ALLOC
69 struct lockdep_map dep_map; 71 struct lockdep_map dep_map;
@@ -79,6 +81,8 @@ static struct {
79 81
80/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ 82/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
81#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) 83#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
84#define cpuhp_lock_acquire_tryread() \
85 lock_map_acquire_tryread(&cpu_hotplug.dep_map)
82#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) 86#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
83#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) 87#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
84 88
@@ -91,15 +95,31 @@ void get_online_cpus(void)
91 mutex_lock(&cpu_hotplug.lock); 95 mutex_lock(&cpu_hotplug.lock);
92 cpu_hotplug.refcount++; 96 cpu_hotplug.refcount++;
93 mutex_unlock(&cpu_hotplug.lock); 97 mutex_unlock(&cpu_hotplug.lock);
94
95} 98}
96EXPORT_SYMBOL_GPL(get_online_cpus); 99EXPORT_SYMBOL_GPL(get_online_cpus);
97 100
101bool try_get_online_cpus(void)
102{
103 if (cpu_hotplug.active_writer == current)
104 return true;
105 if (!mutex_trylock(&cpu_hotplug.lock))
106 return false;
107 cpuhp_lock_acquire_tryread();
108 cpu_hotplug.refcount++;
109 mutex_unlock(&cpu_hotplug.lock);
110 return true;
111}
112EXPORT_SYMBOL_GPL(try_get_online_cpus);
113
98void put_online_cpus(void) 114void put_online_cpus(void)
99{ 115{
100 if (cpu_hotplug.active_writer == current) 116 if (cpu_hotplug.active_writer == current)
101 return; 117 return;
102 mutex_lock(&cpu_hotplug.lock); 118 if (!mutex_trylock(&cpu_hotplug.lock)) {
119 atomic_inc(&cpu_hotplug.puts_pending);
120 cpuhp_lock_release();
121 return;
122 }
103 123
104 if (WARN_ON(!cpu_hotplug.refcount)) 124 if (WARN_ON(!cpu_hotplug.refcount))
105 cpu_hotplug.refcount++; /* try to fix things up */ 125 cpu_hotplug.refcount++; /* try to fix things up */
@@ -141,6 +161,12 @@ void cpu_hotplug_begin(void)
141 cpuhp_lock_acquire(); 161 cpuhp_lock_acquire();
142 for (;;) { 162 for (;;) {
143 mutex_lock(&cpu_hotplug.lock); 163 mutex_lock(&cpu_hotplug.lock);
164 if (atomic_read(&cpu_hotplug.puts_pending)) {
165 int delta;
166
167 delta = atomic_xchg(&cpu_hotplug.puts_pending, 0);
168 cpu_hotplug.refcount -= delta;
169 }
144 if (likely(!cpu_hotplug.refcount)) 170 if (likely(!cpu_hotplug.refcount))
145 break; 171 break;
146 __set_current_state(TASK_UNINTERRUPTIBLE); 172 __set_current_state(TASK_UNINTERRUPTIBLE);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 22874d7cf2c0..1f107c74087b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -365,13 +365,14 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
365 struct task_struct *tsk) 365 struct task_struct *tsk)
366{ 366{
367 if (is_spread_page(cs)) 367 if (is_spread_page(cs))
368 tsk->flags |= PF_SPREAD_PAGE; 368 task_set_spread_page(tsk);
369 else 369 else
370 tsk->flags &= ~PF_SPREAD_PAGE; 370 task_clear_spread_page(tsk);
371
371 if (is_spread_slab(cs)) 372 if (is_spread_slab(cs))
372 tsk->flags |= PF_SPREAD_SLAB; 373 task_set_spread_slab(tsk);
373 else 374 else
374 tsk->flags &= ~PF_SPREAD_SLAB; 375 task_clear_spread_slab(tsk);
375} 376}
376 377
377/* 378/*
@@ -2729,10 +2730,9 @@ void __cpuset_memory_pressure_bump(void)
2729 * and we take cpuset_mutex, keeping cpuset_attach() from changing it 2730 * and we take cpuset_mutex, keeping cpuset_attach() from changing it
2730 * anyway. 2731 * anyway.
2731 */ 2732 */
2732int proc_cpuset_show(struct seq_file *m, void *unused_v) 2733int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
2734 struct pid *pid, struct task_struct *tsk)
2733{ 2735{
2734 struct pid *pid;
2735 struct task_struct *tsk;
2736 char *buf, *p; 2736 char *buf, *p;
2737 struct cgroup_subsys_state *css; 2737 struct cgroup_subsys_state *css;
2738 int retval; 2738 int retval;
@@ -2742,24 +2742,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2742 if (!buf) 2742 if (!buf)
2743 goto out; 2743 goto out;
2744 2744
2745 retval = -ESRCH;
2746 pid = m->private;
2747 tsk = get_pid_task(pid, PIDTYPE_PID);
2748 if (!tsk)
2749 goto out_free;
2750
2751 retval = -ENAMETOOLONG; 2745 retval = -ENAMETOOLONG;
2752 rcu_read_lock(); 2746 rcu_read_lock();
2753 css = task_css(tsk, cpuset_cgrp_id); 2747 css = task_css(tsk, cpuset_cgrp_id);
2754 p = cgroup_path(css->cgroup, buf, PATH_MAX); 2748 p = cgroup_path(css->cgroup, buf, PATH_MAX);
2755 rcu_read_unlock(); 2749 rcu_read_unlock();
2756 if (!p) 2750 if (!p)
2757 goto out_put_task; 2751 goto out_free;
2758 seq_puts(m, p); 2752 seq_puts(m, p);
2759 seq_putc(m, '\n'); 2753 seq_putc(m, '\n');
2760 retval = 0; 2754 retval = 0;
2761out_put_task:
2762 put_task_struct(tsk);
2763out_free: 2755out_free:
2764 kfree(buf); 2756 kfree(buf);
2765out: 2757out:
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index c766ee54c0b1..b64e238b553b 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -18,6 +18,7 @@ unsigned long saved_max_pfn;
18 * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. 18 * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
19 */ 19 */
20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; 20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
21EXPORT_SYMBOL_GPL(elfcorehdr_addr);
21 22
22/* 23/*
23 * stores the size of elf header of crash image 24 * stores the size of elf header of crash image
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 70a504601dc3..b20d544f20c2 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -52,11 +52,11 @@ static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp)
52 52
53 bp->bph_length = 1; 53 bp->bph_length = 1;
54 if ((argc + 1) != nextarg) { 54 if ((argc + 1) != nextarg) {
55 if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0) 55 if (strncasecmp(argv[nextarg], "datar", sizeof("datar")) == 0)
56 bp->bp_type = BP_ACCESS_WATCHPOINT; 56 bp->bp_type = BP_ACCESS_WATCHPOINT;
57 else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0) 57 else if (strncasecmp(argv[nextarg], "dataw", sizeof("dataw")) == 0)
58 bp->bp_type = BP_WRITE_WATCHPOINT; 58 bp->bp_type = BP_WRITE_WATCHPOINT;
59 else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0) 59 else if (strncasecmp(argv[nextarg], "inst", sizeof("inst")) == 0)
60 bp->bp_type = BP_HARDWARE_BREAKPOINT; 60 bp->bp_type = BP_HARDWARE_BREAKPOINT;
61 else 61 else
62 return KDB_ARGCOUNT; 62 return KDB_ARGCOUNT;
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 97b67df8fbfe..d659487254d5 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -52,7 +52,7 @@ static void release_callchain_buffers(void)
52 struct callchain_cpus_entries *entries; 52 struct callchain_cpus_entries *entries;
53 53
54 entries = callchain_cpus_entries; 54 entries = callchain_cpus_entries;
55 rcu_assign_pointer(callchain_cpus_entries, NULL); 55 RCU_INIT_POINTER(callchain_cpus_entries, NULL);
56 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); 56 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
57} 57}
58 58
@@ -137,7 +137,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
137 int cpu; 137 int cpu;
138 struct callchain_cpus_entries *entries; 138 struct callchain_cpus_entries *entries;
139 139
140 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); 140 *rctx = get_recursion_context(this_cpu_ptr(callchain_recursion));
141 if (*rctx == -1) 141 if (*rctx == -1)
142 return NULL; 142 return NULL;
143 143
@@ -153,7 +153,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
153static void 153static void
154put_callchain_entry(int rctx) 154put_callchain_entry(int rctx)
155{ 155{
156 put_recursion_context(__get_cpu_var(callchain_recursion), rctx); 156 put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
157} 157}
158 158
159struct perf_callchain_entry * 159struct perf_callchain_entry *
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1cf24b3e42ec..2b02c9fda790 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -41,11 +41,14 @@
41#include <linux/cgroup.h> 41#include <linux/cgroup.h>
42#include <linux/module.h> 42#include <linux/module.h>
43#include <linux/mman.h> 43#include <linux/mman.h>
44#include <linux/compat.h>
44 45
45#include "internal.h" 46#include "internal.h"
46 47
47#include <asm/irq_regs.h> 48#include <asm/irq_regs.h>
48 49
50static struct workqueue_struct *perf_wq;
51
49struct remote_function_call { 52struct remote_function_call {
50 struct task_struct *p; 53 struct task_struct *p;
51 int (*func)(void *info); 54 int (*func)(void *info);
@@ -119,6 +122,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
119 return data.ret; 122 return data.ret;
120} 123}
121 124
125#define EVENT_OWNER_KERNEL ((void *) -1)
126
127static bool is_kernel_event(struct perf_event *event)
128{
129 return event->owner == EVENT_OWNER_KERNEL;
130}
131
122#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ 132#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
123 PERF_FLAG_FD_OUTPUT |\ 133 PERF_FLAG_FD_OUTPUT |\
124 PERF_FLAG_PID_CGROUP |\ 134 PERF_FLAG_PID_CGROUP |\
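
EVENT_OWNER_KERNEL is a sentinel pointer value, not a new flag field: (void *)-1 can never alias a real task_struct, so kernel-created events can be told apart from user-owned ones without growing struct perf_event. The idiom in isolation (generic C with a hypothetical struct, not perf code):

#include <stdbool.h>
#include <stdio.h>

#define OWNER_KERNEL    ((void *)-1)    /* sentinel, never a valid pointer */

struct event {                          /* hypothetical, not struct perf_event */
        void *owner;                    /* NULL, OWNER_KERNEL, or a task */
};

static bool is_kernel_owned(const struct event *e)
{
        return e->owner == OWNER_KERNEL;
}

int main(void)
{
        struct event user = { .owner = &user };
        struct event kern = { .owner = OWNER_KERNEL };

        printf("%d %d\n", is_kernel_owned(&user), is_kernel_owned(&kern));
        return 0;
}
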
@@ -239,7 +249,7 @@ static void perf_duration_warn(struct irq_work *w)
239 u64 avg_local_sample_len; 249 u64 avg_local_sample_len;
240 u64 local_samples_len; 250 u64 local_samples_len;
241 251
242 local_samples_len = __get_cpu_var(running_sample_length); 252 local_samples_len = __this_cpu_read(running_sample_length);
243 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; 253 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
244 254
245 printk_ratelimited(KERN_WARNING 255 printk_ratelimited(KERN_WARNING
@@ -261,10 +271,10 @@ void perf_sample_event_took(u64 sample_len_ns)
261 return; 271 return;
262 272
263 /* decay the counter by 1 average sample */ 273 /* decay the counter by 1 average sample */
264 local_samples_len = __get_cpu_var(running_sample_length); 274 local_samples_len = __this_cpu_read(running_sample_length);
265 local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; 275 local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
266 local_samples_len += sample_len_ns; 276 local_samples_len += sample_len_ns;
267 __get_cpu_var(running_sample_length) = local_samples_len; 277 __this_cpu_write(running_sample_length, local_samples_len);
268 278
269 /* 279 /*
270 * note: this will be biased artifically low until we have 280 * note: this will be biased artifically low until we have
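
The __get_cpu_var() conversions in this file (and in kernel/events/callchain.c above) follow the tree-wide switch to the explicit per-CPU accessors: __this_cpu_read()/__this_cpu_write() for scalar updates and this_cpu_ptr() where an address is needed. A hedged kernel-style sketch with a hypothetical per-CPU counter:

#include <linux/percpu.h>

static DEFINE_PER_CPU(u64, sample_len);         /* hypothetical counter */

/* Decay-and-add on this CPU's copy; caller has preemption disabled. */
static void sample_account(u64 len_ns)
{
        u64 v = __this_cpu_read(sample_len);

        v -= v / 8;
        v += len_ns;
        __this_cpu_write(sample_len, v);
}

/* Address of this CPU's copy, e.g. to pass to list or hash helpers. */
static u64 *sample_slot(void)
{
        return this_cpu_ptr(&sample_len);
}
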
@@ -391,14 +401,9 @@ perf_cgroup_match(struct perf_event *event)
391 event->cgrp->css.cgroup); 401 event->cgrp->css.cgroup);
392} 402}
393 403
394static inline void perf_put_cgroup(struct perf_event *event)
395{
396 css_put(&event->cgrp->css);
397}
398
399static inline void perf_detach_cgroup(struct perf_event *event) 404static inline void perf_detach_cgroup(struct perf_event *event)
400{ 405{
401 perf_put_cgroup(event); 406 css_put(&event->cgrp->css);
402 event->cgrp = NULL; 407 event->cgrp = NULL;
403} 408}
404 409
@@ -877,7 +882,7 @@ static DEFINE_PER_CPU(struct list_head, rotation_list);
877static void perf_pmu_rotate_start(struct pmu *pmu) 882static void perf_pmu_rotate_start(struct pmu *pmu)
878{ 883{
879 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 884 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
880 struct list_head *head = &__get_cpu_var(rotation_list); 885 struct list_head *head = this_cpu_ptr(&rotation_list);
881 886
882 WARN_ON(!irqs_disabled()); 887 WARN_ON(!irqs_disabled());
883 888
@@ -901,13 +906,23 @@ static void put_ctx(struct perf_event_context *ctx)
901 } 906 }
902} 907}
903 908
904static void unclone_ctx(struct perf_event_context *ctx) 909/*
910 * This must be done under the ctx->lock, such as to serialize against
911 * context_equiv(), therefore we cannot call put_ctx() since that might end up
912 * calling scheduler related locks and ctx->lock nests inside those.
913 */
914static __must_check struct perf_event_context *
915unclone_ctx(struct perf_event_context *ctx)
905{ 916{
906 if (ctx->parent_ctx) { 917 struct perf_event_context *parent_ctx = ctx->parent_ctx;
907 put_ctx(ctx->parent_ctx); 918
919 lockdep_assert_held(&ctx->lock);
920
921 if (parent_ctx)
908 ctx->parent_ctx = NULL; 922 ctx->parent_ctx = NULL;
909 }
910 ctx->generation++; 923 ctx->generation++;
924
925 return parent_ctx;
911} 926}
912 927
913static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) 928static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
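
unclone_ctx() now hands the detached parent context back to its callers instead of dropping the reference itself, because put_ctx() can end up in scheduler locks that must not nest inside ctx->lock; every caller drops the reference only after unlocking. The same "detach under the lock, release after unlocking" shape in plain pthreads (hypothetical types, not perf code):

#include <pthread.h>
#include <stdlib.h>

struct ctx {
        pthread_mutex_t lock;
        struct ctx *parent;             /* reference owned by this ctx */
};

/* Stand-in for put_ctx(): may sleep, take other locks, or free. */
static void ctx_put(struct ctx *c)
{
        free(c);
}

/* Detach the parent while holding c->lock, drop the reference later. */
static void ctx_unclone(struct ctx *c)
{
        struct ctx *parent;

        pthread_mutex_lock(&c->lock);
        parent = c->parent;
        c->parent = NULL;
        pthread_mutex_unlock(&c->lock);

        if (parent)
                ctx_put(parent);        /* safe: c->lock no longer held */
}

int main(void)
{
        struct ctx *parent = calloc(1, sizeof(*parent));
        struct ctx child = { .parent = parent };

        pthread_mutex_init(&child.lock, NULL);
        ctx_unclone(&child);
        return 0;
}
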
@@ -1374,6 +1389,45 @@ out:
1374 perf_event__header_size(tmp); 1389 perf_event__header_size(tmp);
1375} 1390}
1376 1391
1392/*
1393 * User event without the task.
1394 */
1395static bool is_orphaned_event(struct perf_event *event)
1396{
1397 return event && !is_kernel_event(event) && !event->owner;
1398}
1399
1400/*
1401 * Event has a parent but parent's task finished and it's
 1402 * alive only because of children holding a reference.
 1402 * alive only because of children holding a reference.
1403 */
1404static bool is_orphaned_child(struct perf_event *event)
1405{
1406 return is_orphaned_event(event->parent);
1407}
1408
1409static void orphans_remove_work(struct work_struct *work);
1410
1411static void schedule_orphans_remove(struct perf_event_context *ctx)
1412{
1413 if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
1414 return;
1415
1416 if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
1417 get_ctx(ctx);
1418 ctx->orphans_remove_sched = true;
1419 }
1420}
1421
1422static int __init perf_workqueue_init(void)
1423{
1424 perf_wq = create_singlethread_workqueue("perf");
1425 WARN(!perf_wq, "failed to create perf workqueue\n");
1426 return perf_wq ? 0 : -1;
1427}
1428
1429core_initcall(perf_workqueue_init);
1430
1377static inline int 1431static inline int
1378event_filter_match(struct perf_event *event) 1432event_filter_match(struct perf_event *event)
1379{ 1433{
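
The orphan reaping added above is a conventional delayed-work arrangement: a singlethread workqueue created from a core_initcall, a delayed_work embedded in the context, and queue_delayed_work() gated by an "already scheduled" flag plus a reference on the containing object so it cannot disappear while queued. A hedged kernel-style sketch with hypothetical names:

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *cleanup_wq;     /* hypothetical */

struct my_ctx {
        struct delayed_work reap_work;
        bool reap_scheduled;
};

static void reap_fn(struct work_struct *work)
{
        struct my_ctx *ctx = container_of(work, struct my_ctx,
                                          reap_work.work);

        /* ... reap whatever became orphaned ... */
        ctx->reap_scheduled = false;
}

static void my_ctx_init(struct my_ctx *ctx)
{
        INIT_DELAYED_WORK(&ctx->reap_work, reap_fn);
        ctx->reap_scheduled = false;
}

static void schedule_reap(struct my_ctx *ctx)
{
        if (ctx->reap_scheduled || !cleanup_wq)
                return;
        if (queue_delayed_work(cleanup_wq, &ctx->reap_work, 1))
                ctx->reap_scheduled = true;
}

static int __init cleanup_wq_init(void)
{
        cleanup_wq = create_singlethread_workqueue("cleanup");
        return cleanup_wq ? 0 : -ENOMEM;
}
core_initcall(cleanup_wq_init);
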
@@ -1423,6 +1477,9 @@ event_sched_out(struct perf_event *event,
1423 if (event->attr.exclusive || !cpuctx->active_oncpu) 1477 if (event->attr.exclusive || !cpuctx->active_oncpu)
1424 cpuctx->exclusive = 0; 1478 cpuctx->exclusive = 0;
1425 1479
1480 if (is_orphaned_child(event))
1481 schedule_orphans_remove(ctx);
1482
1426 perf_pmu_enable(event->pmu); 1483 perf_pmu_enable(event->pmu);
1427} 1484}
1428 1485
@@ -1523,6 +1580,11 @@ retry:
1523 */ 1580 */
1524 if (ctx->is_active) { 1581 if (ctx->is_active) {
1525 raw_spin_unlock_irq(&ctx->lock); 1582 raw_spin_unlock_irq(&ctx->lock);
1583 /*
1584 * Reload the task pointer, it might have been changed by
1585 * a concurrent perf_event_context_sched_out().
1586 */
1587 task = ctx->task;
1526 goto retry; 1588 goto retry;
1527 } 1589 }
1528 1590
@@ -1725,6 +1787,9 @@ event_sched_in(struct perf_event *event,
1725 if (event->attr.exclusive) 1787 if (event->attr.exclusive)
1726 cpuctx->exclusive = 1; 1788 cpuctx->exclusive = 1;
1727 1789
1790 if (is_orphaned_child(event))
1791 schedule_orphans_remove(ctx);
1792
1728out: 1793out:
1729 perf_pmu_enable(event->pmu); 1794 perf_pmu_enable(event->pmu);
1730 1795
@@ -1966,6 +2031,11 @@ retry:
1966 */ 2031 */
1967 if (ctx->is_active) { 2032 if (ctx->is_active) {
1968 raw_spin_unlock_irq(&ctx->lock); 2033 raw_spin_unlock_irq(&ctx->lock);
2034 /*
2035 * Reload the task pointer, it might have been changed by
2036 * a concurrent perf_event_context_sched_out().
2037 */
2038 task = ctx->task;
1969 goto retry; 2039 goto retry;
1970 } 2040 }
1971 2041
@@ -2199,6 +2269,9 @@ static void ctx_sched_out(struct perf_event_context *ctx,
2199static int context_equiv(struct perf_event_context *ctx1, 2269static int context_equiv(struct perf_event_context *ctx1,
2200 struct perf_event_context *ctx2) 2270 struct perf_event_context *ctx2)
2201{ 2271{
2272 lockdep_assert_held(&ctx1->lock);
2273 lockdep_assert_held(&ctx2->lock);
2274
2202 /* Pinning disables the swap optimization */ 2275 /* Pinning disables the swap optimization */
2203 if (ctx1->pin_count || ctx2->pin_count) 2276 if (ctx1->pin_count || ctx2->pin_count)
2204 return 0; 2277 return 0;
@@ -2320,7 +2393,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2320 next_parent = rcu_dereference(next_ctx->parent_ctx); 2393 next_parent = rcu_dereference(next_ctx->parent_ctx);
2321 2394
2322 /* If neither context have a parent context; they cannot be clones. */ 2395 /* If neither context have a parent context; they cannot be clones. */
2323 if (!parent || !next_parent) 2396 if (!parent && !next_parent)
2324 goto unlock; 2397 goto unlock;
2325 2398
2326 if (next_parent == ctx || next_ctx == parent || next_parent == parent) { 2399 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
@@ -2389,7 +2462,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
2389 * to check if we have to switch out PMU state. 2462 * to check if we have to switch out PMU state.
2390 * cgroup event are system-wide mode only 2463 * cgroup event are system-wide mode only
2391 */ 2464 */
2392 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2465 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2393 perf_cgroup_sched_out(task, next); 2466 perf_cgroup_sched_out(task, next);
2394} 2467}
2395 2468
@@ -2632,11 +2705,11 @@ void __perf_event_task_sched_in(struct task_struct *prev,
2632 * to check if we have to switch in PMU state. 2705 * to check if we have to switch in PMU state.
2633 * cgroup event are system-wide mode only 2706 * cgroup event are system-wide mode only
2634 */ 2707 */
2635 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2708 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2636 perf_cgroup_sched_in(prev, task); 2709 perf_cgroup_sched_in(prev, task);
2637 2710
2638 /* check for system-wide branch_stack events */ 2711 /* check for system-wide branch_stack events */
2639 if (atomic_read(&__get_cpu_var(perf_branch_stack_events))) 2712 if (atomic_read(this_cpu_ptr(&perf_branch_stack_events)))
2640 perf_branch_stack_sched_in(prev, task); 2713 perf_branch_stack_sched_in(prev, task);
2641} 2714}
2642 2715
@@ -2891,7 +2964,7 @@ bool perf_event_can_stop_tick(void)
2891 2964
2892void perf_event_task_tick(void) 2965void perf_event_task_tick(void)
2893{ 2966{
2894 struct list_head *head = &__get_cpu_var(rotation_list); 2967 struct list_head *head = this_cpu_ptr(&rotation_list);
2895 struct perf_cpu_context *cpuctx, *tmp; 2968 struct perf_cpu_context *cpuctx, *tmp;
2896 struct perf_event_context *ctx; 2969 struct perf_event_context *ctx;
2897 int throttled; 2970 int throttled;
@@ -2932,6 +3005,7 @@ static int event_enable_on_exec(struct perf_event *event,
2932 */ 3005 */
2933static void perf_event_enable_on_exec(struct perf_event_context *ctx) 3006static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2934{ 3007{
3008 struct perf_event_context *clone_ctx = NULL;
2935 struct perf_event *event; 3009 struct perf_event *event;
2936 unsigned long flags; 3010 unsigned long flags;
2937 int enabled = 0; 3011 int enabled = 0;
@@ -2963,7 +3037,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2963 * Unclone this context if we enabled any event. 3037 * Unclone this context if we enabled any event.
2964 */ 3038 */
2965 if (enabled) 3039 if (enabled)
2966 unclone_ctx(ctx); 3040 clone_ctx = unclone_ctx(ctx);
2967 3041
2968 raw_spin_unlock(&ctx->lock); 3042 raw_spin_unlock(&ctx->lock);
2969 3043
@@ -2973,6 +3047,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2973 perf_event_context_sched_in(ctx, ctx->task); 3047 perf_event_context_sched_in(ctx, ctx->task);
2974out: 3048out:
2975 local_irq_restore(flags); 3049 local_irq_restore(flags);
3050
3051 if (clone_ctx)
3052 put_ctx(clone_ctx);
2976} 3053}
2977 3054
2978void perf_event_exec(void) 3055void perf_event_exec(void)
@@ -3067,6 +3144,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
3067 INIT_LIST_HEAD(&ctx->flexible_groups); 3144 INIT_LIST_HEAD(&ctx->flexible_groups);
3068 INIT_LIST_HEAD(&ctx->event_list); 3145 INIT_LIST_HEAD(&ctx->event_list);
3069 atomic_set(&ctx->refcount, 1); 3146 atomic_set(&ctx->refcount, 1);
3147 INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
3070} 3148}
3071 3149
3072static struct perf_event_context * 3150static struct perf_event_context *
@@ -3124,7 +3202,7 @@ errout:
3124static struct perf_event_context * 3202static struct perf_event_context *
3125find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) 3203find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
3126{ 3204{
3127 struct perf_event_context *ctx; 3205 struct perf_event_context *ctx, *clone_ctx = NULL;
3128 struct perf_cpu_context *cpuctx; 3206 struct perf_cpu_context *cpuctx;
3129 unsigned long flags; 3207 unsigned long flags;
3130 int ctxn, err; 3208 int ctxn, err;
@@ -3158,9 +3236,12 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
3158retry: 3236retry:
3159 ctx = perf_lock_task_context(task, ctxn, &flags); 3237 ctx = perf_lock_task_context(task, ctxn, &flags);
3160 if (ctx) { 3238 if (ctx) {
3161 unclone_ctx(ctx); 3239 clone_ctx = unclone_ctx(ctx);
3162 ++ctx->pin_count; 3240 ++ctx->pin_count;
3163 raw_spin_unlock_irqrestore(&ctx->lock, flags); 3241 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3242
3243 if (clone_ctx)
3244 put_ctx(clone_ctx);
3164 } else { 3245 } else {
3165 ctx = alloc_perf_context(pmu, task); 3246 ctx = alloc_perf_context(pmu, task);
3166 err = -ENOMEM; 3247 err = -ENOMEM;
@@ -3312,16 +3393,12 @@ static void free_event(struct perf_event *event)
3312} 3393}
3313 3394
3314/* 3395/*
3315 * Called when the last reference to the file is gone. 3396 * Remove user event from the owner task.
3316 */ 3397 */
3317static void put_event(struct perf_event *event) 3398static void perf_remove_from_owner(struct perf_event *event)
3318{ 3399{
3319 struct perf_event_context *ctx = event->ctx;
3320 struct task_struct *owner; 3400 struct task_struct *owner;
3321 3401
3322 if (!atomic_long_dec_and_test(&event->refcount))
3323 return;
3324
3325 rcu_read_lock(); 3402 rcu_read_lock();
3326 owner = ACCESS_ONCE(event->owner); 3403 owner = ACCESS_ONCE(event->owner);
3327 /* 3404 /*
@@ -3354,6 +3431,20 @@ static void put_event(struct perf_event *event)
3354 mutex_unlock(&owner->perf_event_mutex); 3431 mutex_unlock(&owner->perf_event_mutex);
3355 put_task_struct(owner); 3432 put_task_struct(owner);
3356 } 3433 }
3434}
3435
3436/*
3437 * Called when the last reference to the file is gone.
3438 */
3439static void put_event(struct perf_event *event)
3440{
3441 struct perf_event_context *ctx = event->ctx;
3442
3443 if (!atomic_long_dec_and_test(&event->refcount))
3444 return;
3445
3446 if (!is_kernel_event(event))
3447 perf_remove_from_owner(event);
3357 3448
3358 WARN_ON_ONCE(ctx->parent_ctx); 3449 WARN_ON_ONCE(ctx->parent_ctx);
3359 /* 3450 /*
@@ -3388,6 +3479,42 @@ static int perf_release(struct inode *inode, struct file *file)
3388 return 0; 3479 return 0;
3389} 3480}
3390 3481
3482/*
 3483 * Remove all orphaned events from the context.
3484 */
3485static void orphans_remove_work(struct work_struct *work)
3486{
3487 struct perf_event_context *ctx;
3488 struct perf_event *event, *tmp;
3489
3490 ctx = container_of(work, struct perf_event_context,
3491 orphans_remove.work);
3492
3493 mutex_lock(&ctx->mutex);
3494 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
3495 struct perf_event *parent_event = event->parent;
3496
3497 if (!is_orphaned_child(event))
3498 continue;
3499
3500 perf_remove_from_context(event, true);
3501
3502 mutex_lock(&parent_event->child_mutex);
3503 list_del_init(&event->child_list);
3504 mutex_unlock(&parent_event->child_mutex);
3505
3506 free_event(event);
3507 put_event(parent_event);
3508 }
3509
3510 raw_spin_lock_irq(&ctx->lock);
3511 ctx->orphans_remove_sched = false;
3512 raw_spin_unlock_irq(&ctx->lock);
3513 mutex_unlock(&ctx->mutex);
3514
3515 put_ctx(ctx);
3516}
3517
3391u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 3518u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3392{ 3519{
3393 struct perf_event *child; 3520 struct perf_event *child;
@@ -3485,6 +3612,19 @@ static int perf_event_read_one(struct perf_event *event,
3485 return n * sizeof(u64); 3612 return n * sizeof(u64);
3486} 3613}
3487 3614
3615static bool is_event_hup(struct perf_event *event)
3616{
3617 bool no_children;
3618
3619 if (event->state != PERF_EVENT_STATE_EXIT)
3620 return false;
3621
3622 mutex_lock(&event->child_mutex);
3623 no_children = list_empty(&event->child_list);
3624 mutex_unlock(&event->child_mutex);
3625 return no_children;
3626}
3627
3488/* 3628/*
3489 * Read the performance event - simple non blocking version for now 3629 * Read the performance event - simple non blocking version for now
3490 */ 3630 */
@@ -3526,7 +3666,12 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
3526{ 3666{
3527 struct perf_event *event = file->private_data; 3667 struct perf_event *event = file->private_data;
3528 struct ring_buffer *rb; 3668 struct ring_buffer *rb;
3529 unsigned int events = POLL_HUP; 3669 unsigned int events = POLLHUP;
3670
3671 poll_wait(file, &event->waitq, wait);
3672
3673 if (is_event_hup(event))
3674 return events;
3530 3675
3531 /* 3676 /*
3532 * Pin the event->rb by taking event->mmap_mutex; otherwise 3677 * Pin the event->rb by taking event->mmap_mutex; otherwise
@@ -3537,9 +3682,6 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
3537 if (rb) 3682 if (rb)
3538 events = atomic_xchg(&rb->poll, 0); 3683 events = atomic_xchg(&rb->poll, 0);
3539 mutex_unlock(&event->mmap_mutex); 3684 mutex_unlock(&event->mmap_mutex);
3540
3541 poll_wait(file, &event->waitq, wait);
3542
3543 return events; 3685 return events;
3544} 3686}
3545 3687
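
perf_poll() gets two fixes: POLL_HUP (an si_code for SIGPOLL) was being returned where the POLLHUP poll(2) bit was intended, and poll_wait() is now called before the early return so the caller is still registered on the waitqueue when the event has already hung up. From userspace the result is an ordinary hang-up indication; a minimal poll(2) consumer (hypothetical helper, any readable descriptor):

#include <poll.h>
#include <stdio.h>
#include <unistd.h>

/* Block until @fd has data or has hung up. */
static int wait_for_event(int fd)
{
        struct pollfd pfd = { .fd = fd, .events = POLLIN };

        if (poll(&pfd, 1, -1) < 0)
                return -1;
        if (pfd.revents & POLLHUP)
                return 0;               /* source is gone, stop reading */
        if (pfd.revents & POLLIN)
                return 1;               /* data available */
        return -1;
}

int main(void)
{
        printf("stdin: %d\n", wait_for_event(STDIN_FILENO));
        return 0;
}
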
@@ -3717,6 +3859,26 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3717 return 0; 3859 return 0;
3718} 3860}
3719 3861
3862#ifdef CONFIG_COMPAT
3863static long perf_compat_ioctl(struct file *file, unsigned int cmd,
3864 unsigned long arg)
3865{
3866 switch (_IOC_NR(cmd)) {
3867 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
3868 case _IOC_NR(PERF_EVENT_IOC_ID):
 3869 /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
3870 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
3871 cmd &= ~IOCSIZE_MASK;
3872 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
3873 }
3874 break;
3875 }
3876 return perf_ioctl(file, cmd, arg);
3877}
3878#else
3879# define perf_compat_ioctl NULL
3880#endif
3881
3720int perf_event_task_enable(void) 3882int perf_event_task_enable(void)
3721{ 3883{
3722 struct perf_event *event; 3884 struct perf_event *event;
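
The new compat handler only touches commands whose argument is a pointer: a 32-bit caller on a 64-bit kernel encodes a 4-byte argument size into the command word, while the native perf_ioctl() switch expects 8, so just the size bits are rewritten before forwarding. The layout being patched is the ordinary _IOC encoding, which can be inspected from userspace with a hypothetical pointer-carrying ioctl:

#include <stdint.h>
#include <stdio.h>
#include <linux/ioctl.h>

/* Hypothetical ioctl definitions: native vs. 32-bit-pointer encoding. */
#define DEMO_SET_FILTER         _IOW('$', 6, void *)
#define DEMO_SET_FILTER32       _IOW('$', 6, uint32_t)

int main(void)
{
        unsigned int cmd = DEMO_SET_FILTER32;   /* as sent by a 32-bit task */

        printf("before: nr=%u size=%u\n", _IOC_NR(cmd), _IOC_SIZE(cmd));

        if (_IOC_SIZE(cmd) == sizeof(uint32_t)) {       /* compat fixup */
                cmd &= ~IOCSIZE_MASK;
                cmd |= sizeof(void *) << IOCSIZE_SHIFT;
        }
        printf("after:  size=%u, %s the native encoding\n", _IOC_SIZE(cmd),
               cmd == DEMO_SET_FILTER ? "matches" : "differs from");
        return 0;
}
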
@@ -4222,7 +4384,7 @@ static const struct file_operations perf_fops = {
4222 .read = perf_read, 4384 .read = perf_read,
4223 .poll = perf_poll, 4385 .poll = perf_poll,
4224 .unlocked_ioctl = perf_ioctl, 4386 .unlocked_ioctl = perf_ioctl,
4225 .compat_ioctl = perf_ioctl, 4387 .compat_ioctl = perf_compat_ioctl,
4226 .mmap = perf_mmap, 4388 .mmap = perf_mmap,
4227 .fasync = perf_fasync, 4389 .fasync = perf_fasync,
4228}; 4390};
@@ -5671,7 +5833,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5671 struct perf_sample_data *data, 5833 struct perf_sample_data *data,
5672 struct pt_regs *regs) 5834 struct pt_regs *regs)
5673{ 5835{
5674 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5836 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
5675 struct perf_event *event; 5837 struct perf_event *event;
5676 struct hlist_head *head; 5838 struct hlist_head *head;
5677 5839
@@ -5690,7 +5852,7 @@ end:
5690 5852
5691int perf_swevent_get_recursion_context(void) 5853int perf_swevent_get_recursion_context(void)
5692{ 5854{
5693 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5855 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
5694 5856
5695 return get_recursion_context(swhash->recursion); 5857 return get_recursion_context(swhash->recursion);
5696} 5858}
@@ -5698,7 +5860,7 @@ EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
5698 5860
5699inline void perf_swevent_put_recursion_context(int rctx) 5861inline void perf_swevent_put_recursion_context(int rctx)
5700{ 5862{
5701 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5863 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
5702 5864
5703 put_recursion_context(swhash->recursion, rctx); 5865 put_recursion_context(swhash->recursion, rctx);
5704} 5866}
@@ -5727,7 +5889,7 @@ static void perf_swevent_read(struct perf_event *event)
5727 5889
5728static int perf_swevent_add(struct perf_event *event, int flags) 5890static int perf_swevent_add(struct perf_event *event, int flags)
5729{ 5891{
5730 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5892 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
5731 struct hw_perf_event *hwc = &event->hw; 5893 struct hw_perf_event *hwc = &event->hw;
5732 struct hlist_head *head; 5894 struct hlist_head *head;
5733 5895
@@ -5783,7 +5945,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash)
5783 if (!hlist) 5945 if (!hlist)
5784 return; 5946 return;
5785 5947
5786 rcu_assign_pointer(swhash->swevent_hlist, NULL); 5948 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
5787 kfree_rcu(hlist, rcu_head); 5949 kfree_rcu(hlist, rcu_head);
5788} 5950}
5789 5951
@@ -5909,11 +6071,6 @@ static int perf_swevent_init(struct perf_event *event)
5909 return 0; 6071 return 0;
5910} 6072}
5911 6073
5912static int perf_swevent_event_idx(struct perf_event *event)
5913{
5914 return 0;
5915}
5916
5917static struct pmu perf_swevent = { 6074static struct pmu perf_swevent = {
5918 .task_ctx_nr = perf_sw_context, 6075 .task_ctx_nr = perf_sw_context,
5919 6076
@@ -5923,8 +6080,6 @@ static struct pmu perf_swevent = {
5923 .start = perf_swevent_start, 6080 .start = perf_swevent_start,
5924 .stop = perf_swevent_stop, 6081 .stop = perf_swevent_stop,
5925 .read = perf_swevent_read, 6082 .read = perf_swevent_read,
5926
5927 .event_idx = perf_swevent_event_idx,
5928}; 6083};
5929 6084
5930#ifdef CONFIG_EVENT_TRACING 6085#ifdef CONFIG_EVENT_TRACING
@@ -6042,8 +6197,6 @@ static struct pmu perf_tracepoint = {
6042 .start = perf_swevent_start, 6197 .start = perf_swevent_start,
6043 .stop = perf_swevent_stop, 6198 .stop = perf_swevent_stop,
6044 .read = perf_swevent_read, 6199 .read = perf_swevent_read,
6045
6046 .event_idx = perf_swevent_event_idx,
6047}; 6200};
6048 6201
6049static inline void perf_tp_register(void) 6202static inline void perf_tp_register(void)
@@ -6269,8 +6422,6 @@ static struct pmu perf_cpu_clock = {
6269 .start = cpu_clock_event_start, 6422 .start = cpu_clock_event_start,
6270 .stop = cpu_clock_event_stop, 6423 .stop = cpu_clock_event_stop,
6271 .read = cpu_clock_event_read, 6424 .read = cpu_clock_event_read,
6272
6273 .event_idx = perf_swevent_event_idx,
6274}; 6425};
6275 6426
6276/* 6427/*
@@ -6349,8 +6500,6 @@ static struct pmu perf_task_clock = {
6349 .start = task_clock_event_start, 6500 .start = task_clock_event_start,
6350 .stop = task_clock_event_stop, 6501 .stop = task_clock_event_stop,
6351 .read = task_clock_event_read, 6502 .read = task_clock_event_read,
6352
6353 .event_idx = perf_swevent_event_idx,
6354}; 6503};
6355 6504
6356static void perf_pmu_nop_void(struct pmu *pmu) 6505static void perf_pmu_nop_void(struct pmu *pmu)
@@ -6380,7 +6529,7 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
6380 6529
6381static int perf_event_idx_default(struct perf_event *event) 6530static int perf_event_idx_default(struct perf_event *event)
6382{ 6531{
6383 return event->hw.idx + 1; 6532 return 0;
6384} 6533}
6385 6534
6386/* 6535/*
@@ -7366,6 +7515,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7366 goto err; 7515 goto err;
7367 } 7516 }
7368 7517
7518 /* Mark owner so we could distinguish it from user events. */
7519 event->owner = EVENT_OWNER_KERNEL;
7520
7369 account_event(event); 7521 account_event(event);
7370 7522
7371 ctx = find_get_context(event->pmu, task, cpu); 7523 ctx = find_get_context(event->pmu, task, cpu);
@@ -7453,6 +7605,12 @@ static void sync_child_event(struct perf_event *child_event,
7453 mutex_unlock(&parent_event->child_mutex); 7605 mutex_unlock(&parent_event->child_mutex);
7454 7606
7455 /* 7607 /*
7608 * Make sure user/parent get notified, that we just
7609 * lost one event.
7610 */
7611 perf_event_wakeup(parent_event);
7612
7613 /*
7456 * Release the parent event, if this was the last 7614 * Release the parent event, if this was the last
7457 * reference to it. 7615 * reference to it.
7458 */ 7616 */
@@ -7486,13 +7644,16 @@ __perf_event_exit_task(struct perf_event *child_event,
7486 if (child_event->parent) { 7644 if (child_event->parent) {
7487 sync_child_event(child_event, child); 7645 sync_child_event(child_event, child);
7488 free_event(child_event); 7646 free_event(child_event);
7647 } else {
7648 child_event->state = PERF_EVENT_STATE_EXIT;
7649 perf_event_wakeup(child_event);
7489 } 7650 }
7490} 7651}
7491 7652
7492static void perf_event_exit_task_context(struct task_struct *child, int ctxn) 7653static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7493{ 7654{
7494 struct perf_event *child_event, *next; 7655 struct perf_event *child_event, *next;
7495 struct perf_event_context *child_ctx, *parent_ctx; 7656 struct perf_event_context *child_ctx, *clone_ctx = NULL;
7496 unsigned long flags; 7657 unsigned long flags;
7497 7658
7498 if (likely(!child->perf_event_ctxp[ctxn])) { 7659 if (likely(!child->perf_event_ctxp[ctxn])) {
@@ -7519,28 +7680,16 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7519 child->perf_event_ctxp[ctxn] = NULL; 7680 child->perf_event_ctxp[ctxn] = NULL;
7520 7681
7521 /* 7682 /*
7522 * In order to avoid freeing: child_ctx->parent_ctx->task
7523 * under perf_event_context::lock, grab another reference.
7524 */
7525 parent_ctx = child_ctx->parent_ctx;
7526 if (parent_ctx)
7527 get_ctx(parent_ctx);
7528
7529 /*
7530 * If this context is a clone; unclone it so it can't get 7683 * If this context is a clone; unclone it so it can't get
7531 * swapped to another process while we're removing all 7684 * swapped to another process while we're removing all
7532 * the events from it. 7685 * the events from it.
7533 */ 7686 */
7534 unclone_ctx(child_ctx); 7687 clone_ctx = unclone_ctx(child_ctx);
7535 update_context_time(child_ctx); 7688 update_context_time(child_ctx);
7536 raw_spin_unlock_irqrestore(&child_ctx->lock, flags); 7689 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
7537 7690
7538 /* 7691 if (clone_ctx)
7539 * Now that we no longer hold perf_event_context::lock, drop 7692 put_ctx(clone_ctx);
7540 * our extra child_ctx->parent_ctx reference.
7541 */
7542 if (parent_ctx)
7543 put_ctx(parent_ctx);
7544 7693
7545 /* 7694 /*
7546 * Report the task dead after unscheduling the events so that we 7695 * Report the task dead after unscheduling the events so that we
@@ -7669,6 +7818,7 @@ inherit_event(struct perf_event *parent_event,
7669 struct perf_event *group_leader, 7818 struct perf_event *group_leader,
7670 struct perf_event_context *child_ctx) 7819 struct perf_event_context *child_ctx)
7671{ 7820{
7821 enum perf_event_active_state parent_state = parent_event->state;
7672 struct perf_event *child_event; 7822 struct perf_event *child_event;
7673 unsigned long flags; 7823 unsigned long flags;
7674 7824
@@ -7689,7 +7839,8 @@ inherit_event(struct perf_event *parent_event,
7689 if (IS_ERR(child_event)) 7839 if (IS_ERR(child_event))
7690 return child_event; 7840 return child_event;
7691 7841
7692 if (!atomic_long_inc_not_zero(&parent_event->refcount)) { 7842 if (is_orphaned_event(parent_event) ||
7843 !atomic_long_inc_not_zero(&parent_event->refcount)) {
7693 free_event(child_event); 7844 free_event(child_event);
7694 return NULL; 7845 return NULL;
7695 } 7846 }
@@ -7701,7 +7852,7 @@ inherit_event(struct perf_event *parent_event,
7701 * not its attr.disabled bit. We hold the parent's mutex, 7852 * not its attr.disabled bit. We hold the parent's mutex,
7702 * so we won't race with perf_event_{en, dis}able_family. 7853 * so we won't race with perf_event_{en, dis}able_family.
7703 */ 7854 */
7704 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) 7855 if (parent_state >= PERF_EVENT_STATE_INACTIVE)
7705 child_event->state = PERF_EVENT_STATE_INACTIVE; 7856 child_event->state = PERF_EVENT_STATE_INACTIVE;
7706 else 7857 else
7707 child_event->state = PERF_EVENT_STATE_OFF; 7858 child_event->state = PERF_EVENT_STATE_OFF;
@@ -7917,8 +8068,10 @@ int perf_event_init_task(struct task_struct *child)
7917 8068
7918 for_each_task_context_nr(ctxn) { 8069 for_each_task_context_nr(ctxn) {
7919 ret = perf_event_init_context(child, ctxn); 8070 ret = perf_event_init_context(child, ctxn);
7920 if (ret) 8071 if (ret) {
8072 perf_event_free_task(child);
7921 return ret; 8073 return ret;
8074 }
7922 } 8075 }
7923 8076
7924 return 0; 8077 return 0;
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 1559fb0b9296..9803a6600d49 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)
605 bp->hw.state = PERF_HES_STOPPED; 605 bp->hw.state = PERF_HES_STOPPED;
606} 606}
607 607
608static int hw_breakpoint_event_idx(struct perf_event *bp)
609{
610 return 0;
611}
612
613static struct pmu perf_breakpoint = { 608static struct pmu perf_breakpoint = {
614 .task_ctx_nr = perf_sw_context, /* could eventually get its own */ 609 .task_ctx_nr = perf_sw_context, /* could eventually get its own */
615 610
@@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = {
619 .start = hw_breakpoint_start, 614 .start = hw_breakpoint_start,
620 .stop = hw_breakpoint_stop, 615 .stop = hw_breakpoint_stop,
621 .read = hw_breakpoint_pmu_read, 616 .read = hw_breakpoint_pmu_read,
622
623 .event_idx = hw_breakpoint_event_idx,
624}; 617};
625 618
626int __init init_hw_breakpoint(void) 619int __init init_hw_breakpoint(void)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6f3254e8c137..1d0af8a2c646 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -167,6 +167,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
167 /* For mmu_notifiers */ 167 /* For mmu_notifiers */
168 const unsigned long mmun_start = addr; 168 const unsigned long mmun_start = addr;
169 const unsigned long mmun_end = addr + PAGE_SIZE; 169 const unsigned long mmun_end = addr + PAGE_SIZE;
170 struct mem_cgroup *memcg;
171
172 err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
173 if (err)
174 return err;
170 175
171 /* For try_to_free_swap() and munlock_vma_page() below */ 176 /* For try_to_free_swap() and munlock_vma_page() below */
172 lock_page(page); 177 lock_page(page);
@@ -179,6 +184,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
179 184
180 get_page(kpage); 185 get_page(kpage);
181 page_add_new_anon_rmap(kpage, vma, addr); 186 page_add_new_anon_rmap(kpage, vma, addr);
187 mem_cgroup_commit_charge(kpage, memcg, false);
188 lru_cache_add_active_or_unevictable(kpage, vma);
182 189
183 if (!PageAnon(page)) { 190 if (!PageAnon(page)) {
184 dec_mm_counter(mm, MM_FILEPAGES); 191 dec_mm_counter(mm, MM_FILEPAGES);
@@ -200,6 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
200 207
201 err = 0; 208 err = 0;
202 unlock: 209 unlock:
210 mem_cgroup_cancel_charge(kpage, memcg);
203 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 211 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
204 unlock_page(page); 212 unlock_page(page);
205 return err; 213 return err;
@@ -315,18 +323,11 @@ retry:
315 if (!new_page) 323 if (!new_page)
316 goto put_old; 324 goto put_old;
317 325
318 if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
319 goto put_new;
320
321 __SetPageUptodate(new_page); 326 __SetPageUptodate(new_page);
322 copy_highpage(new_page, old_page); 327 copy_highpage(new_page, old_page);
323 copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); 328 copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
324 329
325 ret = __replace_page(vma, vaddr, old_page, new_page); 330 ret = __replace_page(vma, vaddr, old_page, new_page);
326 if (ret)
327 mem_cgroup_uncharge_page(new_page);
328
329put_new:
330 page_cache_release(new_page); 331 page_cache_release(new_page);
331put_old: 332put_old:
332 put_page(old_page); 333 put_page(old_page);
diff --git a/kernel/exit.c b/kernel/exit.c
index e5c4668f1799..5d30019ff953 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -59,7 +59,7 @@
59#include <asm/pgtable.h> 59#include <asm/pgtable.h>
60#include <asm/mmu_context.h> 60#include <asm/mmu_context.h>
61 61
62static void exit_mm(struct task_struct * tsk); 62static void exit_mm(struct task_struct *tsk);
63 63
64static void __unhash_process(struct task_struct *p, bool group_dead) 64static void __unhash_process(struct task_struct *p, bool group_dead)
65{ 65{
@@ -115,32 +115,33 @@ static void __exit_signal(struct task_struct *tsk)
115 115
116 if (tsk == sig->curr_target) 116 if (tsk == sig->curr_target)
117 sig->curr_target = next_thread(tsk); 117 sig->curr_target = next_thread(tsk);
118 /*
119 * Accumulate here the counters for all threads but the
120 * group leader as they die, so they can be added into
121 * the process-wide totals when those are taken.
122 * The group leader stays around as a zombie as long
123 * as there are other threads. When it gets reaped,
124 * the exit.c code will add its counts into these totals.
125 * We won't ever get here for the group leader, since it
126 * will have been the last reference on the signal_struct.
127 */
128 task_cputime(tsk, &utime, &stime);
129 sig->utime += utime;
130 sig->stime += stime;
131 sig->gtime += task_gtime(tsk);
132 sig->min_flt += tsk->min_flt;
133 sig->maj_flt += tsk->maj_flt;
134 sig->nvcsw += tsk->nvcsw;
135 sig->nivcsw += tsk->nivcsw;
136 sig->inblock += task_io_get_inblock(tsk);
137 sig->oublock += task_io_get_oublock(tsk);
138 task_io_accounting_add(&sig->ioac, &tsk->ioac);
139 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
140 } 118 }
141 119
120 /*
121 * Accumulate here the counters for all threads but the group leader
122 * as they die, so they can be added into the process-wide totals
123 * when those are taken. The group leader stays around as a zombie as
124 * long as there are other threads. When it gets reaped, the exit.c
125 * code will add its counts into these totals. We won't ever get here
126 * for the group leader, since it will have been the last reference on
127 * the signal_struct.
128 */
129 task_cputime(tsk, &utime, &stime);
130 write_seqlock(&sig->stats_lock);
131 sig->utime += utime;
132 sig->stime += stime;
133 sig->gtime += task_gtime(tsk);
134 sig->min_flt += tsk->min_flt;
135 sig->maj_flt += tsk->maj_flt;
136 sig->nvcsw += tsk->nvcsw;
137 sig->nivcsw += tsk->nivcsw;
138 sig->inblock += task_io_get_inblock(tsk);
139 sig->oublock += task_io_get_oublock(tsk);
140 task_io_accounting_add(&sig->ioac, &tsk->ioac);
141 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
142 sig->nr_threads--; 142 sig->nr_threads--;
143 __unhash_process(tsk, group_dead); 143 __unhash_process(tsk, group_dead);
144 write_sequnlock(&sig->stats_lock);
144 145
145 /* 146 /*
146 * Do this under ->siglock, we can race with another thread 147 * Do this under ->siglock, we can race with another thread
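
The new sig->stats_lock (a seqlock, initialized in copy_signal() in the kernel/fork.c hunks below) exists so readers of the accumulated totals can use a retry loop instead of taking siglock, while exit paths such as this one and wait_task_zombie() bracket their updates with write_seqlock()/write_sequnlock(). A hedged sketch of the writer and reader halves over a hypothetical accounting block:

#include <linux/types.h>
#include <linux/seqlock.h>

struct stats {                          /* hypothetical accounting block */
        seqlock_t lock;
        u64 utime, stime;
};

static void stats_init(struct stats *s)
{
        seqlock_init(&s->lock);
        s->utime = s->stime = 0;
}

static void stats_add(struct stats *s, u64 ut, u64 st)
{
        write_seqlock(&s->lock);        /* writers serialize on the lock */
        s->utime += ut;
        s->stime += st;
        write_sequnlock(&s->lock);
}

static void stats_read(struct stats *s, u64 *ut, u64 *st)
{
        unsigned int seq;

        do {                            /* lockless reader, retry on change */
                seq = read_seqbegin(&s->lock);
                *ut = s->utime;
                *st = s->stime;
        } while (read_seqretry(&s->lock, seq));
}
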
@@ -151,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk)
151 spin_unlock(&sighand->siglock); 152 spin_unlock(&sighand->siglock);
152 153
153 __cleanup_sighand(sighand); 154 __cleanup_sighand(sighand);
154 clear_tsk_thread_flag(tsk,TIF_SIGPENDING); 155 clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
155 if (group_dead) { 156 if (group_dead) {
156 flush_sigqueue(&sig->shared_pending); 157 flush_sigqueue(&sig->shared_pending);
157 tty_kref_put(tty); 158 tty_kref_put(tty);
@@ -168,7 +169,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
168} 169}
169 170
170 171
171void release_task(struct task_struct * p) 172void release_task(struct task_struct *p)
172{ 173{
173 struct task_struct *leader; 174 struct task_struct *leader;
174 int zap_leader; 175 int zap_leader;
@@ -192,7 +193,8 @@ repeat:
192 */ 193 */
193 zap_leader = 0; 194 zap_leader = 0;
194 leader = p->group_leader; 195 leader = p->group_leader;
195 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { 196 if (leader != p && thread_group_empty(leader)
197 && leader->exit_state == EXIT_ZOMBIE) {
196 /* 198 /*
197 * If we were the last child thread and the leader has 199 * If we were the last child thread and the leader has
198 * exited already, and the leader's parent ignores SIGCHLD, 200 * exited already, and the leader's parent ignores SIGCHLD,
@@ -241,7 +243,8 @@ struct pid *session_of_pgrp(struct pid *pgrp)
241 * 243 *
242 * "I ask you, have you ever known what it is to be an orphan?" 244 * "I ask you, have you ever known what it is to be an orphan?"
243 */ 245 */
244static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) 246static int will_become_orphaned_pgrp(struct pid *pgrp,
247 struct task_struct *ignored_task)
245{ 248{
246 struct task_struct *p; 249 struct task_struct *p;
247 250
@@ -294,9 +297,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
294 struct task_struct *ignored_task = tsk; 297 struct task_struct *ignored_task = tsk;
295 298
296 if (!parent) 299 if (!parent)
297 /* exit: our father is in a different pgrp than 300 /* exit: our father is in a different pgrp than
298 * we are and we were the only connection outside. 301 * we are and we were the only connection outside.
299 */ 302 */
300 parent = tsk->real_parent; 303 parent = tsk->real_parent;
301 else 304 else
302 /* reparent: our child is in a different pgrp than 305 /* reparent: our child is in a different pgrp than
@@ -405,7 +408,7 @@ assign_new_owner:
405 * Turn us into a lazy TLB process if we 408 * Turn us into a lazy TLB process if we
406 * aren't already.. 409 * aren't already..
407 */ 410 */
408static void exit_mm(struct task_struct * tsk) 411static void exit_mm(struct task_struct *tsk)
409{ 412{
410 struct mm_struct *mm = tsk->mm; 413 struct mm_struct *mm = tsk->mm;
411 struct core_state *core_state; 414 struct core_state *core_state;
@@ -425,6 +428,7 @@ static void exit_mm(struct task_struct * tsk)
425 core_state = mm->core_state; 428 core_state = mm->core_state;
426 if (core_state) { 429 if (core_state) {
427 struct core_thread self; 430 struct core_thread self;
431
428 up_read(&mm->mmap_sem); 432 up_read(&mm->mmap_sem);
429 433
430 self.task = tsk; 434 self.task = tsk;
@@ -455,6 +459,7 @@ static void exit_mm(struct task_struct * tsk)
455 task_unlock(tsk); 459 task_unlock(tsk);
456 mm_update_next_owner(mm); 460 mm_update_next_owner(mm);
457 mmput(mm); 461 mmput(mm);
462 clear_thread_flag(TIF_MEMDIE);
458} 463}
459 464
460/* 465/*
@@ -565,6 +570,7 @@ static void forget_original_parent(struct task_struct *father)
565 570
566 list_for_each_entry_safe(p, n, &father->children, sibling) { 571 list_for_each_entry_safe(p, n, &father->children, sibling) {
567 struct task_struct *t = p; 572 struct task_struct *t = p;
573
568 do { 574 do {
569 t->real_parent = reaper; 575 t->real_parent = reaper;
570 if (t->parent == father) { 576 if (t->parent == father) {
@@ -598,7 +604,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
598 /* 604 /*
599 * This does two things: 605 * This does two things:
600 * 606 *
601 * A. Make init inherit all the child processes 607 * A. Make init inherit all the child processes
602 * B. Check to see if any process groups have become orphaned 608 * B. Check to see if any process groups have become orphaned
603 * as a result of our exiting, and if they have any stopped 609 * as a result of our exiting, and if they have any stopped
604 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) 610 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
@@ -648,9 +654,8 @@ static void check_stack_usage(void)
648 654
649 spin_lock(&low_water_lock); 655 spin_lock(&low_water_lock);
650 if (free < lowest_to_date) { 656 if (free < lowest_to_date) {
651 printk(KERN_WARNING "%s (%d) used greatest stack depth: " 657 pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
652 "%lu bytes left\n", 658 current->comm, task_pid_nr(current), free);
653 current->comm, task_pid_nr(current), free);
654 lowest_to_date = free; 659 lowest_to_date = free;
655 } 660 }
656 spin_unlock(&low_water_lock); 661 spin_unlock(&low_water_lock);
@@ -663,6 +668,7 @@ void do_exit(long code)
663{ 668{
664 struct task_struct *tsk = current; 669 struct task_struct *tsk = current;
665 int group_dead; 670 int group_dead;
671 TASKS_RCU(int tasks_rcu_i);
666 672
667 profile_task_exit(tsk); 673 profile_task_exit(tsk);
668 674
@@ -691,8 +697,7 @@ void do_exit(long code)
691 * leave this task alone and wait for reboot. 697 * leave this task alone and wait for reboot.
692 */ 698 */
693 if (unlikely(tsk->flags & PF_EXITING)) { 699 if (unlikely(tsk->flags & PF_EXITING)) {
694 printk(KERN_ALERT 700 pr_alert("Fixing recursive fault but reboot is needed!\n");
695 "Fixing recursive fault but reboot is needed!\n");
696 /* 701 /*
697 * We can do this unlocked here. The futex code uses 702 * We can do this unlocked here. The futex code uses
698 * this flag just to verify whether the pi state 703 * this flag just to verify whether the pi state
@@ -716,9 +721,9 @@ void do_exit(long code)
716 raw_spin_unlock_wait(&tsk->pi_lock); 721 raw_spin_unlock_wait(&tsk->pi_lock);
717 722
718 if (unlikely(in_atomic())) 723 if (unlikely(in_atomic()))
719 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 724 pr_info("note: %s[%d] exited with preempt_count %d\n",
720 current->comm, task_pid_nr(current), 725 current->comm, task_pid_nr(current),
721 preempt_count()); 726 preempt_count());
722 727
723 acct_update_integrals(tsk); 728 acct_update_integrals(tsk);
724 /* sync mm's RSS info before statistics gathering */ 729 /* sync mm's RSS info before statistics gathering */
@@ -772,6 +777,7 @@ void do_exit(long code)
772 */ 777 */
773 flush_ptrace_hw_breakpoint(tsk); 778 flush_ptrace_hw_breakpoint(tsk);
774 779
780 TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
775 exit_notify(tsk, group_dead); 781 exit_notify(tsk, group_dead);
776 proc_exit_connector(tsk); 782 proc_exit_connector(tsk);
777#ifdef CONFIG_NUMA 783#ifdef CONFIG_NUMA
@@ -811,6 +817,7 @@ void do_exit(long code)
811 if (tsk->nr_dirtied) 817 if (tsk->nr_dirtied)
812 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); 818 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
813 exit_rcu(); 819 exit_rcu();
820 TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
814 821
815 /* 822 /*
816 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed 823 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
@@ -836,7 +843,6 @@ void do_exit(long code)
836 for (;;) 843 for (;;)
837 cpu_relax(); /* For when BUG is null */ 844 cpu_relax(); /* For when BUG is null */
838} 845}
839
840EXPORT_SYMBOL_GPL(do_exit); 846EXPORT_SYMBOL_GPL(do_exit);
841 847
842void complete_and_exit(struct completion *comp, long code) 848void complete_and_exit(struct completion *comp, long code)
@@ -846,7 +852,6 @@ void complete_and_exit(struct completion *comp, long code)
846 852
847 do_exit(code); 853 do_exit(code);
848} 854}
849
850EXPORT_SYMBOL(complete_and_exit); 855EXPORT_SYMBOL(complete_and_exit);
851 856
852SYSCALL_DEFINE1(exit, int, error_code) 857SYSCALL_DEFINE1(exit, int, error_code)
@@ -869,6 +874,7 @@ do_group_exit(int exit_code)
869 exit_code = sig->group_exit_code; 874 exit_code = sig->group_exit_code;
870 else if (!thread_group_empty(current)) { 875 else if (!thread_group_empty(current)) {
871 struct sighand_struct *const sighand = current->sighand; 876 struct sighand_struct *const sighand = current->sighand;
877
872 spin_lock_irq(&sighand->siglock); 878 spin_lock_irq(&sighand->siglock);
873 if (signal_group_exit(sig)) 879 if (signal_group_exit(sig))
874 /* Another thread got here before we took the lock. */ 880 /* Another thread got here before we took the lock. */
@@ -1033,14 +1039,15 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1033 * as other threads in the parent group can be right 1039 * as other threads in the parent group can be right
1034 * here reaping other children at the same time. 1040 * here reaping other children at the same time.
1035 * 1041 *
1036 * We use thread_group_cputime_adjusted() to get times for the thread 1042 * We use thread_group_cputime_adjusted() to get times for
1037 * group, which consolidates times for all threads in the 1043 * the thread group, which consolidates times for all threads
1038 * group including the group leader. 1044 * in the group including the group leader.
1039 */ 1045 */
1040 thread_group_cputime_adjusted(p, &tgutime, &tgstime); 1046 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1041 spin_lock_irq(&p->real_parent->sighand->siglock); 1047 spin_lock_irq(&p->real_parent->sighand->siglock);
1042 psig = p->real_parent->signal; 1048 psig = p->real_parent->signal;
1043 sig = p->signal; 1049 sig = p->signal;
1050 write_seqlock(&psig->stats_lock);
1044 psig->cutime += tgutime + sig->cutime; 1051 psig->cutime += tgutime + sig->cutime;
1045 psig->cstime += tgstime + sig->cstime; 1052 psig->cstime += tgstime + sig->cstime;
1046 psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; 1053 psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1063,6 +1070,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1063 psig->cmaxrss = maxrss; 1070 psig->cmaxrss = maxrss;
1064 task_io_accounting_add(&psig->ioac, &p->ioac); 1071 task_io_accounting_add(&psig->ioac, &p->ioac);
1065 task_io_accounting_add(&psig->ioac, &sig->ioac); 1072 task_io_accounting_add(&psig->ioac, &sig->ioac);
1073 write_sequnlock(&psig->stats_lock);
1066 spin_unlock_irq(&p->real_parent->sighand->siglock); 1074 spin_unlock_irq(&p->real_parent->sighand->siglock);
1067 } 1075 }
1068 1076
@@ -1417,6 +1425,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1417 1425
1418 list_for_each_entry(p, &tsk->children, sibling) { 1426 list_for_each_entry(p, &tsk->children, sibling) {
1419 int ret = wait_consider_task(wo, 0, p); 1427 int ret = wait_consider_task(wo, 0, p);
1428
1420 if (ret) 1429 if (ret)
1421 return ret; 1430 return ret;
1422 } 1431 }
@@ -1430,6 +1439,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1430 1439
1431 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1440 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1432 int ret = wait_consider_task(wo, 1, p); 1441 int ret = wait_consider_task(wo, 1, p);
1442
1433 if (ret) 1443 if (ret)
1434 return ret; 1444 return ret;
1435 } 1445 }
diff --git a/kernel/fork.c b/kernel/fork.c
index fbd3497b221f..9b7d746d6d62 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -294,11 +294,18 @@ int __weak arch_dup_task_struct(struct task_struct *dst,
294 return 0; 294 return 0;
295} 295}
296 296
297void set_task_stack_end_magic(struct task_struct *tsk)
298{
299 unsigned long *stackend;
300
301 stackend = end_of_stack(tsk);
302 *stackend = STACK_END_MAGIC; /* for overflow detection */
303}
304
297static struct task_struct *dup_task_struct(struct task_struct *orig) 305static struct task_struct *dup_task_struct(struct task_struct *orig)
298{ 306{
299 struct task_struct *tsk; 307 struct task_struct *tsk;
300 struct thread_info *ti; 308 struct thread_info *ti;
301 unsigned long *stackend;
302 int node = tsk_fork_get_node(orig); 309 int node = tsk_fork_get_node(orig);
303 int err; 310 int err;
304 311
@@ -328,8 +335,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
328 setup_thread_stack(tsk, orig); 335 setup_thread_stack(tsk, orig);
329 clear_user_return_notifier(tsk); 336 clear_user_return_notifier(tsk);
330 clear_tsk_need_resched(tsk); 337 clear_tsk_need_resched(tsk);
331 stackend = end_of_stack(tsk); 338 set_task_stack_end_magic(tsk);
332 *stackend = STACK_END_MAGIC; /* for overflow detection */
333 339
334#ifdef CONFIG_CC_STACKPROTECTOR 340#ifdef CONFIG_CC_STACKPROTECTOR
335 tsk->stack_canary = get_random_int(); 341 tsk->stack_canary = get_random_int();
@@ -374,12 +380,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
374 */ 380 */
375 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); 381 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
376 382
377 mm->locked_vm = 0; 383 mm->total_vm = oldmm->total_vm;
378 mm->mmap = NULL; 384 mm->shared_vm = oldmm->shared_vm;
379 mm->vmacache_seqnum = 0; 385 mm->exec_vm = oldmm->exec_vm;
380 mm->map_count = 0; 386 mm->stack_vm = oldmm->stack_vm;
381 cpumask_clear(mm_cpumask(mm)); 387
382 mm->mm_rb = RB_ROOT;
383 rb_link = &mm->mm_rb.rb_node; 388 rb_link = &mm->mm_rb.rb_node;
384 rb_parent = NULL; 389 rb_parent = NULL;
385 pprev = &mm->mmap; 390 pprev = &mm->mmap;
@@ -430,7 +435,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
430 atomic_dec(&inode->i_writecount); 435 atomic_dec(&inode->i_writecount);
431 mutex_lock(&mapping->i_mmap_mutex); 436 mutex_lock(&mapping->i_mmap_mutex);
432 if (tmp->vm_flags & VM_SHARED) 437 if (tmp->vm_flags & VM_SHARED)
433 mapping->i_mmap_writable++; 438 atomic_inc(&mapping->i_mmap_writable);
434 flush_dcache_mmap_lock(mapping); 439 flush_dcache_mmap_lock(mapping);
435 /* insert tmp into the share list, just after mpnt */ 440 /* insert tmp into the share list, just after mpnt */
436 if (unlikely(tmp->vm_flags & VM_NONLINEAR)) 441 if (unlikely(tmp->vm_flags & VM_NONLINEAR))
@@ -536,19 +541,37 @@ static void mm_init_aio(struct mm_struct *mm)
536#endif 541#endif
537} 542}
538 543
544static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
545{
546#ifdef CONFIG_MEMCG
547 mm->owner = p;
548#endif
549}
550
539static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) 551static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
540{ 552{
553 mm->mmap = NULL;
554 mm->mm_rb = RB_ROOT;
555 mm->vmacache_seqnum = 0;
541 atomic_set(&mm->mm_users, 1); 556 atomic_set(&mm->mm_users, 1);
542 atomic_set(&mm->mm_count, 1); 557 atomic_set(&mm->mm_count, 1);
543 init_rwsem(&mm->mmap_sem); 558 init_rwsem(&mm->mmap_sem);
544 INIT_LIST_HEAD(&mm->mmlist); 559 INIT_LIST_HEAD(&mm->mmlist);
545 mm->core_state = NULL; 560 mm->core_state = NULL;
546 atomic_long_set(&mm->nr_ptes, 0); 561 atomic_long_set(&mm->nr_ptes, 0);
562 mm->map_count = 0;
563 mm->locked_vm = 0;
564 mm->pinned_vm = 0;
547 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); 565 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
548 spin_lock_init(&mm->page_table_lock); 566 spin_lock_init(&mm->page_table_lock);
567 mm_init_cpumask(mm);
549 mm_init_aio(mm); 568 mm_init_aio(mm);
550 mm_init_owner(mm, p); 569 mm_init_owner(mm, p);
570 mmu_notifier_mm_init(mm);
551 clear_tlb_flush_pending(mm); 571 clear_tlb_flush_pending(mm);
572#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
573 mm->pmd_huge_pte = NULL;
574#endif
552 575
553 if (current->mm) { 576 if (current->mm) {
554 mm->flags = current->mm->flags & MMF_INIT_MASK; 577 mm->flags = current->mm->flags & MMF_INIT_MASK;
@@ -558,11 +581,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
558 mm->def_flags = 0; 581 mm->def_flags = 0;
559 } 582 }
560 583
561 if (likely(!mm_alloc_pgd(mm))) { 584 if (mm_alloc_pgd(mm))
562 mmu_notifier_mm_init(mm); 585 goto fail_nopgd;
563 return mm;
564 }
565 586
587 if (init_new_context(p, mm))
588 goto fail_nocontext;
589
590 return mm;
591
592fail_nocontext:
593 mm_free_pgd(mm);
594fail_nopgd:
566 free_mm(mm); 595 free_mm(mm);
567 return NULL; 596 return NULL;
568} 597}
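
mm_init() now performs init_new_context() itself and unwinds with the usual layered labels: fail_nocontext undoes the pgd allocation and fail_nopgd only frees the mm, so each exit path tears down exactly what succeeded before it. The same shape with hypothetical resources in plain C:

#include <stdlib.h>

struct thing {
        void *pgd;                      /* first allocation  */
        void *ctx;                      /* second allocation */
};

/* Takes ownership of @t; on failure frees it and returns NULL. */
static struct thing *thing_init(struct thing *t)
{
        t->pgd = malloc(64);
        if (!t->pgd)
                goto fail_nopgd;

        t->ctx = malloc(64);
        if (!t->ctx)
                goto fail_noctx;

        return t;

fail_noctx:
        free(t->pgd);                   /* undo step 1 only */
fail_nopgd:
        free(t);                        /* mirror of free_mm() */
        return NULL;
}

int main(void)
{
        struct thing *t = calloc(1, sizeof(*t));

        if (t)
                t = thing_init(t);
        if (t) {
                free(t->ctx);
                free(t->pgd);
                free(t);
        }
        return 0;
}
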
@@ -578,9 +607,8 @@ static void check_mm(struct mm_struct *mm)
578 printk(KERN_ALERT "BUG: Bad rss-counter state " 607 printk(KERN_ALERT "BUG: Bad rss-counter state "
579 "mm:%p idx:%d val:%ld\n", mm, i, x); 608 "mm:%p idx:%d val:%ld\n", mm, i, x);
580 } 609 }
581
582#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS 610#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
583 VM_BUG_ON(mm->pmd_huge_pte); 611 VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
584#endif 612#endif
585} 613}
586 614
@@ -596,7 +624,6 @@ struct mm_struct *mm_alloc(void)
596 return NULL; 624 return NULL;
597 625
598 memset(mm, 0, sizeof(*mm)); 626 memset(mm, 0, sizeof(*mm));
599 mm_init_cpumask(mm);
600 return mm_init(mm, current); 627 return mm_init(mm, current);
601} 628}
602 629
@@ -828,17 +855,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
828 goto fail_nomem; 855 goto fail_nomem;
829 856
830 memcpy(mm, oldmm, sizeof(*mm)); 857 memcpy(mm, oldmm, sizeof(*mm));
831 mm_init_cpumask(mm);
832 858
833#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
834 mm->pmd_huge_pte = NULL;
835#endif
836 if (!mm_init(mm, tsk)) 859 if (!mm_init(mm, tsk))
837 goto fail_nomem; 860 goto fail_nomem;
838 861
839 if (init_new_context(tsk, mm))
840 goto fail_nocontext;
841
842 dup_mm_exe_file(oldmm, mm); 862 dup_mm_exe_file(oldmm, mm);
843 863
844 err = dup_mmap(mm, oldmm); 864 err = dup_mmap(mm, oldmm);
@@ -860,15 +880,6 @@ free_pt:
860 880
861fail_nomem: 881fail_nomem:
862 return NULL; 882 return NULL;
863
864fail_nocontext:
865 /*
866 * If init_new_context() failed, we cannot use mmput() to free the mm
867 * because it calls destroy_context()
868 */
869 mm_free_pgd(mm);
870 free_mm(mm);
871 return NULL;
872} 883}
873 884
874static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) 885static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
@@ -1062,6 +1073,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1062 sig->curr_target = tsk; 1073 sig->curr_target = tsk;
1063 init_sigpending(&sig->shared_pending); 1074 init_sigpending(&sig->shared_pending);
1064 INIT_LIST_HEAD(&sig->posix_timers); 1075 INIT_LIST_HEAD(&sig->posix_timers);
1076 seqlock_init(&sig->stats_lock);
1065 1077
1066 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1078 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1067 sig->real_timer.function = it_real_fn; 1079 sig->real_timer.function = it_real_fn;
@@ -1099,7 +1111,7 @@ static void copy_seccomp(struct task_struct *p)
1099 * needed because this new task is not yet running and cannot 1111 * needed because this new task is not yet running and cannot
1100 * be racing exec. 1112 * be racing exec.
1101 */ 1113 */
1102 BUG_ON(!spin_is_locked(&current->sighand->siglock)); 1114 assert_spin_locked(&current->sighand->siglock);
1103 1115
1104 /* Ref-count the new filter user, and assign it. */ 1116 /* Ref-count the new filter user, and assign it. */
1105 get_seccomp_filter(current); 1117 get_seccomp_filter(current);
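
The seccomp assertion is changed because spin_is_locked() is hard-wired to return false on uniprocessor builds without spinlock debugging, so the open-coded BUG_ON fired even with siglock correctly held; assert_spin_locked() is the helper intended for this check and behaves sensibly in that configuration. A minimal kernel-style sketch of the intended contract (hypothetical helper):

#include <linux/spinlock.h>

/* Caller must hold @lock; assert_spin_locked() copes with UP configs
 * where open-coded spin_is_locked() checks cannot. */
static void counter_bump_locked(spinlock_t *lock, unsigned long *counter)
{
        assert_spin_locked(lock);
        (*counter)++;
}
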
@@ -1140,13 +1152,6 @@ static void rt_mutex_init_task(struct task_struct *p)
1140#endif 1152#endif
1141} 1153}
1142 1154
1143#ifdef CONFIG_MEMCG
1144void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1145{
1146 mm->owner = p;
1147}
1148#endif /* CONFIG_MEMCG */
1149
1150/* 1155/*
1151 * Initialize POSIX timer handling for a single task. 1156 * Initialize POSIX timer handling for a single task.
1152 */ 1157 */
@@ -1346,10 +1351,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1346#ifdef CONFIG_DEBUG_MUTEXES 1351#ifdef CONFIG_DEBUG_MUTEXES
1347 p->blocked_on = NULL; /* not blocked yet */ 1352 p->blocked_on = NULL; /* not blocked yet */
1348#endif 1353#endif
1349#ifdef CONFIG_MEMCG
1350 p->memcg_batch.do_batch = 0;
1351 p->memcg_batch.memcg = NULL;
1352#endif
1353#ifdef CONFIG_BCACHE 1354#ifdef CONFIG_BCACHE
1354 p->sequential_io = 0; 1355 p->sequential_io = 0;
1355 p->sequential_io_avg = 0; 1356 p->sequential_io_avg = 0;
@@ -1365,8 +1366,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1365 goto bad_fork_cleanup_policy; 1366 goto bad_fork_cleanup_policy;
1366 retval = audit_alloc(p); 1367 retval = audit_alloc(p);
1367 if (retval) 1368 if (retval)
1368 goto bad_fork_cleanup_policy; 1369 goto bad_fork_cleanup_perf;
1369 /* copy all the process information */ 1370 /* copy all the process information */
1371 shm_init_task(p);
1370 retval = copy_semundo(clone_flags, p); 1372 retval = copy_semundo(clone_flags, p);
1371 if (retval) 1373 if (retval)
1372 goto bad_fork_cleanup_audit; 1374 goto bad_fork_cleanup_audit;
@@ -1570,8 +1572,9 @@ bad_fork_cleanup_semundo:
1570 exit_sem(p); 1572 exit_sem(p);
1571bad_fork_cleanup_audit: 1573bad_fork_cleanup_audit:
1572 audit_free(p); 1574 audit_free(p);
1573bad_fork_cleanup_policy: 1575bad_fork_cleanup_perf:
1574 perf_event_free_task(p); 1576 perf_event_free_task(p);
1577bad_fork_cleanup_policy:
1575#ifdef CONFIG_NUMA 1578#ifdef CONFIG_NUMA
1576 mpol_put(p->mempolicy); 1579 mpol_put(p->mempolicy);
1577bad_fork_cleanup_threadgroup_lock: 1580bad_fork_cleanup_threadgroup_lock:
@@ -1918,6 +1921,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1918 */ 1921 */
1919 exit_sem(current); 1922 exit_sem(current);
1920 } 1923 }
1924 if (unshare_flags & CLONE_NEWIPC) {
1925 /* Orphan segments in old ns (see sem above). */
1926 exit_shm(current);
1927 shm_init_task(current);
1928 }
1921 1929
1922 if (new_nsproxy) 1930 if (new_nsproxy)
1923 switch_task_namespaces(current, new_nsproxy); 1931 switch_task_namespaces(current, new_nsproxy);
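The CLONE_NEWIPC branch added above mirrors what unshare() already does for semaphores: exit_shm() orphans the caller's SysV shared-memory segments in the old IPC namespace and shm_init_task() resets the per-task shm list for the new one. As a rough, hedged illustration of the syscall path this hunk sits on (the bookkeeping itself is internal and not directly observable), the userspace sketch below unshares the IPC namespace after creating a segment; it needs root and uses only the documented unshare()/shmget() interfaces:

    /* sketch: the unshare(CLONE_NEWIPC) path extended by the hunk above */
    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <sys/ipc.h>
    #include <sys/shm.h>

    int main(void)
    {
            /* Create a segment in the current IPC namespace. */
            int id = shmget(0x1234, 4096, IPC_CREAT | 0600);
            printf("old ns: shmget -> %d\n", id);

            /*
             * Switch to a fresh IPC namespace. With the change above the
             * kernel also runs exit_shm()/shm_init_task() at this point, so
             * the task's references to old-namespace segments are dropped,
             * just as exit_sem() already drops semaphore undo state.
             */
            if (unshare(CLONE_NEWIPC) < 0) {
                    perror("unshare");
                    return 1;
            }

            /* Keys are per-namespace, so the old segment is not visible. */
            id = shmget(0x1234, 4096, 0600);
            printf("new ns: shmget -> %d\n", id);
            return 0;
    }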
diff --git a/kernel/freezer.c b/kernel/freezer.c
index aa6a8aadb911..a8900a3bc27a 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -42,6 +42,9 @@ bool freezing_slow_path(struct task_struct *p)
42 if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) 42 if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
43 return false; 43 return false;
44 44
45 if (test_thread_flag(TIF_MEMDIE))
46 return false;
47
45 if (pm_nosig_freezing || cgroup_freezing(p)) 48 if (pm_nosig_freezing || cgroup_freezing(p))
46 return true; 49 return true;
47 50
@@ -147,12 +150,6 @@ void __thaw_task(struct task_struct *p)
147{ 150{
148 unsigned long flags; 151 unsigned long flags;
149 152
150 /*
151 * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to
152 * be visible to @p as waking up implies wmb. Waking up inside
153 * freezer_lock also prevents wakeups from leaking outside
154 * refrigerator.
155 */
156 spin_lock_irqsave(&freezer_lock, flags); 153 spin_lock_irqsave(&freezer_lock, flags);
157 if (frozen(p)) 154 if (frozen(p))
158 wake_up_process(p); 155 wake_up_process(p);
diff --git a/kernel/futex.c b/kernel/futex.c
index d3a9d946d0b7..63678b573d61 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -143,9 +143,8 @@
143 * 143 *
144 * Where (A) orders the waiters increment and the futex value read through 144 * Where (A) orders the waiters increment and the futex value read through
145 * atomic operations (see hb_waiters_inc) and where (B) orders the write 145 * atomic operations (see hb_waiters_inc) and where (B) orders the write
146 * to futex and the waiters read -- this is done by the barriers in 146 * to futex and the waiters read -- this is done by the barriers for both
147 * get_futex_key_refs(), through either ihold or atomic_inc, depending on the 147 * shared and private futexes in get_futex_key_refs().
148 * futex type.
149 * 148 *
150 * This yields the following case (where X:=waiters, Y:=futex): 149 * This yields the following case (where X:=waiters, Y:=futex):
151 * 150 *
@@ -343,12 +342,21 @@ static void get_futex_key_refs(union futex_key *key)
343 case FUT_OFF_MMSHARED: 342 case FUT_OFF_MMSHARED:
344 futex_get_mm(key); /* implies MB (B) */ 343 futex_get_mm(key); /* implies MB (B) */
345 break; 344 break;
345 default:
346 /*
347 * Private futexes do not hold reference on an inode or
348 * mm, therefore the only purpose of calling get_futex_key_refs
 349	 * is that we need the barrier for the lockless waiter check.
350 */
351 smp_mb(); /* explicit MB (B) */
346 } 352 }
347} 353}
348 354
349/* 355/*
350 * Drop a reference to the resource addressed by a key. 356 * Drop a reference to the resource addressed by a key.
351 * The hash bucket spinlock must not be held. 357 * The hash bucket spinlock must not be held. This is
358 * a no-op for private futexes, see comment in the get
359 * counterpart.
352 */ 360 */
353static void drop_futex_key_refs(union futex_key *key) 361static void drop_futex_key_refs(union futex_key *key)
354{ 362{
@@ -639,8 +647,14 @@ static struct futex_pi_state * alloc_pi_state(void)
639 return pi_state; 647 return pi_state;
640} 648}
641 649
650/*
651 * Must be called with the hb lock held.
652 */
642static void free_pi_state(struct futex_pi_state *pi_state) 653static void free_pi_state(struct futex_pi_state *pi_state)
643{ 654{
655 if (!pi_state)
656 return;
657
644 if (!atomic_dec_and_test(&pi_state->refcount)) 658 if (!atomic_dec_and_test(&pi_state->refcount))
645 return; 659 return;
646 660
@@ -1519,15 +1533,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1519 } 1533 }
1520 1534
1521retry: 1535retry:
1522 if (pi_state != NULL) {
1523 /*
1524 * We will have to lookup the pi_state again, so free this one
1525 * to keep the accounting correct.
1526 */
1527 free_pi_state(pi_state);
1528 pi_state = NULL;
1529 }
1530
1531 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); 1536 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1532 if (unlikely(ret != 0)) 1537 if (unlikely(ret != 0))
1533 goto out; 1538 goto out;
@@ -1617,6 +1622,8 @@ retry_private:
1617 case 0: 1622 case 0:
1618 break; 1623 break;
1619 case -EFAULT: 1624 case -EFAULT:
1625 free_pi_state(pi_state);
1626 pi_state = NULL;
1620 double_unlock_hb(hb1, hb2); 1627 double_unlock_hb(hb1, hb2);
1621 hb_waiters_dec(hb2); 1628 hb_waiters_dec(hb2);
1622 put_futex_key(&key2); 1629 put_futex_key(&key2);
@@ -1632,6 +1639,8 @@ retry_private:
1632 * exit to complete. 1639 * exit to complete.
1633 * - The user space value changed. 1640 * - The user space value changed.
1634 */ 1641 */
1642 free_pi_state(pi_state);
1643 pi_state = NULL;
1635 double_unlock_hb(hb1, hb2); 1644 double_unlock_hb(hb1, hb2);
1636 hb_waiters_dec(hb2); 1645 hb_waiters_dec(hb2);
1637 put_futex_key(&key2); 1646 put_futex_key(&key2);
@@ -1708,6 +1717,7 @@ retry_private:
1708 } 1717 }
1709 1718
1710out_unlock: 1719out_unlock:
1720 free_pi_state(pi_state);
1711 double_unlock_hb(hb1, hb2); 1721 double_unlock_hb(hb1, hb2);
1712 hb_waiters_dec(hb2); 1722 hb_waiters_dec(hb2);
1713 1723
@@ -1725,8 +1735,6 @@ out_put_keys:
1725out_put_key1: 1735out_put_key1:
1726 put_futex_key(&key1); 1736 put_futex_key(&key1);
1727out: 1737out:
1728 if (pi_state != NULL)
1729 free_pi_state(pi_state);
1730 return ret ? ret : task_count; 1738 return ret ? ret : task_count;
1731} 1739}
1732 1740
@@ -2592,6 +2600,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2592 * shared futexes. We need to compare the keys: 2600 * shared futexes. We need to compare the keys:
2593 */ 2601 */
2594 if (match_futex(&q.key, &key2)) { 2602 if (match_futex(&q.key, &key2)) {
2603 queue_unlock(hb);
2595 ret = -EINVAL; 2604 ret = -EINVAL;
2596 goto out_put_keys; 2605 goto out_put_keys;
2597 } 2606 }
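The default: case added to get_futex_key_refs() gives private futexes the same (B) barrier that shared futexes already pick up from ihold()/atomic_inc(), so the futex-word write is ordered against the waiter-count read for every futex type. The pattern that ordering protects is the ordinary wait/wake pair on a process-private futex; the sketch below is only that userspace pair, with a hand-rolled futex() wrapper (glibc exports no futex() function), and does not exercise the kernel internals directly:

    /* sketch: private-futex wait/wake pair; futex() wrapper is hand-rolled */
    #include <linux/futex.h>
    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int futex(uint32_t *uaddr, int op, uint32_t val)
    {
            return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
    }

    static uint32_t flag;   /* 0 = not ready, 1 = ready */

    static void *waiter(void *arg)
    {
            /*
             * Sleep only while flag is still 0; a stale read is harmless
             * because FUTEX_WAIT re-checks the value under the hb lock.
             */
            while (__atomic_load_n(&flag, __ATOMIC_ACQUIRE) == 0)
                    futex(&flag, FUTEX_WAIT_PRIVATE, 0);
            puts("waiter: woken");
            return NULL;
    }

    int main(void)
    {
            pthread_t t;

            pthread_create(&t, NULL, waiter, NULL);
            sleep(1);
            __atomic_store_n(&flag, 1, __ATOMIC_RELEASE);
            futex(&flag, FUTEX_WAKE_PRIVATE, 1);    /* wake one waiter */
            pthread_join(t, NULL);
            return 0;
    }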
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index d04ce8ac4399..3b7408759bdf 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -35,7 +35,7 @@ config GCOV_KERNEL
35config GCOV_PROFILE_ALL 35config GCOV_PROFILE_ALL
36 bool "Profile entire Kernel" 36 bool "Profile entire Kernel"
37 depends on GCOV_KERNEL 37 depends on GCOV_KERNEL
38 depends on SUPERH || S390 || X86 || PPC || MICROBLAZE 38 depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64
39 default n 39 default n
40 ---help--- 40 ---help---
41 This options activates profiling for the entire kernel. 41 This options activates profiling for the entire kernel.
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 15ff01a76379..edf67c493a8e 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -784,8 +784,7 @@ static __init int gcov_fs_init(void)
784 784
785err_remove: 785err_remove:
786 pr_err("init failed\n"); 786 pr_err("init failed\n");
787 if (root_node.dentry) 787 debugfs_remove(root_node.dentry);
788 debugfs_remove(root_node.dentry);
789 788
790 return rc; 789 return rc;
791} 790}
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index d269cecdfbf0..225086b2652e 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -55,6 +55,9 @@ config GENERIC_IRQ_CHIP
55config IRQ_DOMAIN 55config IRQ_DOMAIN
56 bool 56 bool
57 57
58config HANDLE_DOMAIN_IRQ
59 bool
60
58config IRQ_DOMAIN_DEBUG 61config IRQ_DOMAIN_DEBUG
59 bool "Expose hardware/virtual IRQ mapping via debugfs" 62 bool "Expose hardware/virtual IRQ mapping via debugfs"
60 depends on IRQ_DOMAIN && DEBUG_FS 63 depends on IRQ_DOMAIN && DEBUG_FS
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a2b28a2fd7b1..e5202f00cabc 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -342,6 +342,31 @@ static bool irq_check_poll(struct irq_desc *desc)
342 return irq_wait_for_poll(desc); 342 return irq_wait_for_poll(desc);
343} 343}
344 344
345static bool irq_may_run(struct irq_desc *desc)
346{
347 unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED;
348
349 /*
350 * If the interrupt is not in progress and is not an armed
351 * wakeup interrupt, proceed.
352 */
353 if (!irqd_has_set(&desc->irq_data, mask))
354 return true;
355
356 /*
357 * If the interrupt is an armed wakeup source, mark it pending
358 * and suspended, disable it and notify the pm core about the
359 * event.
360 */
361 if (irq_pm_check_wakeup(desc))
362 return false;
363
364 /*
365 * Handle a potential concurrent poll on a different core.
366 */
367 return irq_check_poll(desc);
368}
369
345/** 370/**
346 * handle_simple_irq - Simple and software-decoded IRQs. 371 * handle_simple_irq - Simple and software-decoded IRQs.
347 * @irq: the interrupt number 372 * @irq: the interrupt number
@@ -359,9 +384,8 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
359{ 384{
360 raw_spin_lock(&desc->lock); 385 raw_spin_lock(&desc->lock);
361 386
362 if (unlikely(irqd_irq_inprogress(&desc->irq_data))) 387 if (!irq_may_run(desc))
363 if (!irq_check_poll(desc)) 388 goto out_unlock;
364 goto out_unlock;
365 389
366 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 390 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
367 kstat_incr_irqs_this_cpu(irq, desc); 391 kstat_incr_irqs_this_cpu(irq, desc);
@@ -412,9 +436,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
412 raw_spin_lock(&desc->lock); 436 raw_spin_lock(&desc->lock);
413 mask_ack_irq(desc); 437 mask_ack_irq(desc);
414 438
415 if (unlikely(irqd_irq_inprogress(&desc->irq_data))) 439 if (!irq_may_run(desc))
416 if (!irq_check_poll(desc)) 440 goto out_unlock;
417 goto out_unlock;
418 441
419 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 442 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
420 kstat_incr_irqs_this_cpu(irq, desc); 443 kstat_incr_irqs_this_cpu(irq, desc);
@@ -485,9 +508,8 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
485 508
486 raw_spin_lock(&desc->lock); 509 raw_spin_lock(&desc->lock);
487 510
488 if (unlikely(irqd_irq_inprogress(&desc->irq_data))) 511 if (!irq_may_run(desc))
489 if (!irq_check_poll(desc)) 512 goto out;
490 goto out;
491 513
492 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 514 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
493 kstat_incr_irqs_this_cpu(irq, desc); 515 kstat_incr_irqs_this_cpu(irq, desc);
@@ -517,6 +539,7 @@ out:
517 chip->irq_eoi(&desc->irq_data); 539 chip->irq_eoi(&desc->irq_data);
518 raw_spin_unlock(&desc->lock); 540 raw_spin_unlock(&desc->lock);
519} 541}
542EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
520 543
521/** 544/**
522 * handle_edge_irq - edge type IRQ handler 545 * handle_edge_irq - edge type IRQ handler
@@ -540,19 +563,23 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
540 raw_spin_lock(&desc->lock); 563 raw_spin_lock(&desc->lock);
541 564
542 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 565 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
566
567 if (!irq_may_run(desc)) {
568 desc->istate |= IRQS_PENDING;
569 mask_ack_irq(desc);
570 goto out_unlock;
571 }
572
543 /* 573 /*
544 * If we're currently running this IRQ, or its disabled, 574 * If its disabled or no action available then mask it and get
545 * we shouldn't process the IRQ. Mark it pending, handle 575 * out of here.
546 * the necessary masking and go out
547 */ 576 */
548 if (unlikely(irqd_irq_disabled(&desc->irq_data) || 577 if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
549 irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { 578 desc->istate |= IRQS_PENDING;
550 if (!irq_check_poll(desc)) { 579 mask_ack_irq(desc);
551 desc->istate |= IRQS_PENDING; 580 goto out_unlock;
552 mask_ack_irq(desc);
553 goto out_unlock;
554 }
555 } 581 }
582
556 kstat_incr_irqs_this_cpu(irq, desc); 583 kstat_incr_irqs_this_cpu(irq, desc);
557 584
558 /* Start handling the irq */ 585 /* Start handling the irq */
@@ -601,18 +628,21 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
601 raw_spin_lock(&desc->lock); 628 raw_spin_lock(&desc->lock);
602 629
603 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 630 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
631
632 if (!irq_may_run(desc)) {
633 desc->istate |= IRQS_PENDING;
634 goto out_eoi;
635 }
636
604 /* 637 /*
605 * If we're currently running this IRQ, or its disabled, 638 * If its disabled or no action available then mask it and get
606 * we shouldn't process the IRQ. Mark it pending, handle 639 * out of here.
607 * the necessary masking and go out
608 */ 640 */
609 if (unlikely(irqd_irq_disabled(&desc->irq_data) || 641 if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
610 irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { 642 desc->istate |= IRQS_PENDING;
611 if (!irq_check_poll(desc)) { 643 goto out_eoi;
612 desc->istate |= IRQS_PENDING;
613 goto out_eoi;
614 }
615 } 644 }
645
616 kstat_incr_irqs_this_cpu(irq, desc); 646 kstat_incr_irqs_this_cpu(irq, desc);
617 647
618 do { 648 do {
@@ -669,7 +699,7 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
669{ 699{
670 struct irq_chip *chip = irq_desc_get_chip(desc); 700 struct irq_chip *chip = irq_desc_get_chip(desc);
671 struct irqaction *action = desc->action; 701 struct irqaction *action = desc->action;
672 void *dev_id = __this_cpu_ptr(action->percpu_dev_id); 702 void *dev_id = raw_cpu_ptr(action->percpu_dev_id);
673 irqreturn_t res; 703 irqreturn_t res;
674 704
675 kstat_incr_irqs_this_cpu(irq, desc); 705 kstat_incr_irqs_this_cpu(irq, desc);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 099ea2e0eb88..4332d766619d 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -63,8 +63,8 @@ enum {
63 63
64extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 64extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
65 unsigned long flags); 65 unsigned long flags);
66extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 66extern void __disable_irq(struct irq_desc *desc, unsigned int irq);
67extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 67extern void __enable_irq(struct irq_desc *desc, unsigned int irq);
68 68
69extern int irq_startup(struct irq_desc *desc, bool resend); 69extern int irq_startup(struct irq_desc *desc, bool resend);
70extern void irq_shutdown(struct irq_desc *desc); 70extern void irq_shutdown(struct irq_desc *desc);
@@ -194,3 +194,15 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d
194 __this_cpu_inc(*desc->kstat_irqs); 194 __this_cpu_inc(*desc->kstat_irqs);
195 __this_cpu_inc(kstat.irqs_sum); 195 __this_cpu_inc(kstat.irqs_sum);
196} 196}
197
198#ifdef CONFIG_PM_SLEEP
199bool irq_pm_check_wakeup(struct irq_desc *desc);
200void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
201void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action);
202#else
203static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; }
204static inline void
205irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { }
206static inline void
207irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { }
208#endif
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 1487a123db5c..a1782f88f0af 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -14,6 +14,7 @@
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/radix-tree.h> 15#include <linux/radix-tree.h>
16#include <linux/bitmap.h> 16#include <linux/bitmap.h>
17#include <linux/irqdomain.h>
17 18
18#include "internals.h" 19#include "internals.h"
19 20
@@ -336,6 +337,47 @@ int generic_handle_irq(unsigned int irq)
336} 337}
337EXPORT_SYMBOL_GPL(generic_handle_irq); 338EXPORT_SYMBOL_GPL(generic_handle_irq);
338 339
340#ifdef CONFIG_HANDLE_DOMAIN_IRQ
341/**
342 * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain
343 * @domain: The domain where to perform the lookup
344 * @hwirq: The HW irq number to convert to a logical one
345 * @lookup: Whether to perform the domain lookup or not
346 * @regs: Register file coming from the low-level handling code
347 *
348 * Returns: 0 on success, or -EINVAL if conversion has failed
349 */
350int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
351 bool lookup, struct pt_regs *regs)
352{
353 struct pt_regs *old_regs = set_irq_regs(regs);
354 unsigned int irq = hwirq;
355 int ret = 0;
356
357 irq_enter();
358
359#ifdef CONFIG_IRQ_DOMAIN
360 if (lookup)
361 irq = irq_find_mapping(domain, hwirq);
362#endif
363
364 /*
365 * Some hardware gives randomly wrong interrupts. Rather
366 * than crashing, do something sensible.
367 */
368 if (unlikely(!irq || irq >= nr_irqs)) {
369 ack_bad_irq(irq);
370 ret = -EINVAL;
371 } else {
372 generic_handle_irq(irq);
373 }
374
375 irq_exit();
376 set_irq_regs(old_regs);
377 return ret;
378}
379#endif
380
339/* Dynamic interrupt handling */ 381/* Dynamic interrupt handling */
340 382
341/** 383/**
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3dc6a61bf06a..0a9104b4608b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -382,14 +382,8 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
382} 382}
383#endif 383#endif
384 384
385void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) 385void __disable_irq(struct irq_desc *desc, unsigned int irq)
386{ 386{
387 if (suspend) {
388 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
389 return;
390 desc->istate |= IRQS_SUSPENDED;
391 }
392
393 if (!desc->depth++) 387 if (!desc->depth++)
394 irq_disable(desc); 388 irq_disable(desc);
395} 389}
@@ -401,7 +395,7 @@ static int __disable_irq_nosync(unsigned int irq)
401 395
402 if (!desc) 396 if (!desc)
403 return -EINVAL; 397 return -EINVAL;
404 __disable_irq(desc, irq, false); 398 __disable_irq(desc, irq);
405 irq_put_desc_busunlock(desc, flags); 399 irq_put_desc_busunlock(desc, flags);
406 return 0; 400 return 0;
407} 401}
@@ -442,20 +436,8 @@ void disable_irq(unsigned int irq)
442} 436}
443EXPORT_SYMBOL(disable_irq); 437EXPORT_SYMBOL(disable_irq);
444 438
445void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) 439void __enable_irq(struct irq_desc *desc, unsigned int irq)
446{ 440{
447 if (resume) {
448 if (!(desc->istate & IRQS_SUSPENDED)) {
449 if (!desc->action)
450 return;
451 if (!(desc->action->flags & IRQF_FORCE_RESUME))
452 return;
453 /* Pretend that it got disabled ! */
454 desc->depth++;
455 }
456 desc->istate &= ~IRQS_SUSPENDED;
457 }
458
459 switch (desc->depth) { 441 switch (desc->depth) {
460 case 0: 442 case 0:
461 err_out: 443 err_out:
@@ -497,7 +479,7 @@ void enable_irq(unsigned int irq)
497 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) 479 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
498 goto out; 480 goto out;
499 481
500 __enable_irq(desc, irq, false); 482 __enable_irq(desc, irq);
501out: 483out:
502 irq_put_desc_busunlock(desc, flags); 484 irq_put_desc_busunlock(desc, flags);
503} 485}
@@ -1218,6 +1200,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1218 new->irq = irq; 1200 new->irq = irq;
1219 *old_ptr = new; 1201 *old_ptr = new;
1220 1202
1203 irq_pm_install_action(desc, new);
1204
1221 /* Reset broken irq detection when installing new handler */ 1205 /* Reset broken irq detection when installing new handler */
1222 desc->irq_count = 0; 1206 desc->irq_count = 0;
1223 desc->irqs_unhandled = 0; 1207 desc->irqs_unhandled = 0;
@@ -1228,7 +1212,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1228 */ 1212 */
1229 if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { 1213 if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
1230 desc->istate &= ~IRQS_SPURIOUS_DISABLED; 1214 desc->istate &= ~IRQS_SPURIOUS_DISABLED;
1231 __enable_irq(desc, irq, false); 1215 __enable_irq(desc, irq);
1232 } 1216 }
1233 1217
1234 raw_spin_unlock_irqrestore(&desc->lock, flags); 1218 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -1336,6 +1320,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1336 /* Found it - now remove it from the list of entries: */ 1320 /* Found it - now remove it from the list of entries: */
1337 *action_ptr = action->next; 1321 *action_ptr = action->next;
1338 1322
1323 irq_pm_remove_action(desc, action);
1324
1339 /* If this was the last handler, shut down the IRQ line: */ 1325 /* If this was the last handler, shut down the IRQ line: */
1340 if (!desc->action) { 1326 if (!desc->action) {
1341 irq_shutdown(desc); 1327 irq_shutdown(desc);
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index abcd6ca86cb7..3ca532592704 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -9,17 +9,105 @@
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/suspend.h>
12#include <linux/syscore_ops.h> 13#include <linux/syscore_ops.h>
13 14
14#include "internals.h" 15#include "internals.h"
15 16
17bool irq_pm_check_wakeup(struct irq_desc *desc)
18{
19 if (irqd_is_wakeup_armed(&desc->irq_data)) {
20 irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
21 desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
22 desc->depth++;
23 irq_disable(desc);
24 pm_system_wakeup();
25 return true;
26 }
27 return false;
28}
29
30/*
31 * Called from __setup_irq() with desc->lock held after @action has
32 * been installed in the action chain.
33 */
34void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action)
35{
36 desc->nr_actions++;
37
38 if (action->flags & IRQF_FORCE_RESUME)
39 desc->force_resume_depth++;
40
41 WARN_ON_ONCE(desc->force_resume_depth &&
42 desc->force_resume_depth != desc->nr_actions);
43
44 if (action->flags & IRQF_NO_SUSPEND)
45 desc->no_suspend_depth++;
46
47 WARN_ON_ONCE(desc->no_suspend_depth &&
48 desc->no_suspend_depth != desc->nr_actions);
49}
50
51/*
52 * Called from __free_irq() with desc->lock held after @action has
53 * been removed from the action chain.
54 */
55void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
56{
57 desc->nr_actions--;
58
59 if (action->flags & IRQF_FORCE_RESUME)
60 desc->force_resume_depth--;
61
62 if (action->flags & IRQF_NO_SUSPEND)
63 desc->no_suspend_depth--;
64}
65
66static bool suspend_device_irq(struct irq_desc *desc, int irq)
67{
68 if (!desc->action || desc->no_suspend_depth)
69 return false;
70
71 if (irqd_is_wakeup_set(&desc->irq_data)) {
72 irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
73 /*
74 * We return true here to force the caller to issue
75 * synchronize_irq(). We need to make sure that the
76 * IRQD_WAKEUP_ARMED is visible before we return from
77 * suspend_device_irqs().
78 */
79 return true;
80 }
81
82 desc->istate |= IRQS_SUSPENDED;
83 __disable_irq(desc, irq);
84
85 /*
86 * Hardware which has no wakeup source configuration facility
87 * requires that the non wakeup interrupts are masked at the
88 * chip level. The chip implementation indicates that with
89 * IRQCHIP_MASK_ON_SUSPEND.
90 */
91 if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
92 mask_irq(desc);
93 return true;
94}
95
16/** 96/**
17 * suspend_device_irqs - disable all currently enabled interrupt lines 97 * suspend_device_irqs - disable all currently enabled interrupt lines
18 * 98 *
19 * During system-wide suspend or hibernation device drivers need to be prevented 99 * During system-wide suspend or hibernation device drivers need to be
20 * from receiving interrupts and this function is provided for this purpose. 100 * prevented from receiving interrupts and this function is provided
21 * It marks all interrupt lines in use, except for the timer ones, as disabled 101 * for this purpose.
22 * and sets the IRQS_SUSPENDED flag for each of them. 102 *
103 * So we disable all interrupts and mark them IRQS_SUSPENDED except
104 * for those which are unused, those which are marked as not
105 * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND
106 * set and those which are marked as active wakeup sources.
107 *
108 * The active wakeup sources are handled by the flow handler entry
109 * code which checks for the IRQD_WAKEUP_ARMED flag, suspends the
110 * interrupt and notifies the pm core about the wakeup.
23 */ 111 */
24void suspend_device_irqs(void) 112void suspend_device_irqs(void)
25{ 113{
@@ -28,18 +116,36 @@ void suspend_device_irqs(void)
28 116
29 for_each_irq_desc(irq, desc) { 117 for_each_irq_desc(irq, desc) {
30 unsigned long flags; 118 unsigned long flags;
119 bool sync;
31 120
32 raw_spin_lock_irqsave(&desc->lock, flags); 121 raw_spin_lock_irqsave(&desc->lock, flags);
33 __disable_irq(desc, irq, true); 122 sync = suspend_device_irq(desc, irq);
34 raw_spin_unlock_irqrestore(&desc->lock, flags); 123 raw_spin_unlock_irqrestore(&desc->lock, flags);
35 }
36 124
37 for_each_irq_desc(irq, desc) 125 if (sync)
38 if (desc->istate & IRQS_SUSPENDED)
39 synchronize_irq(irq); 126 synchronize_irq(irq);
127 }
40} 128}
41EXPORT_SYMBOL_GPL(suspend_device_irqs); 129EXPORT_SYMBOL_GPL(suspend_device_irqs);
42 130
131static void resume_irq(struct irq_desc *desc, int irq)
132{
133 irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
134
135 if (desc->istate & IRQS_SUSPENDED)
136 goto resume;
137
138 /* Force resume the interrupt? */
139 if (!desc->force_resume_depth)
140 return;
141
142 /* Pretend that it got disabled ! */
143 desc->depth++;
144resume:
145 desc->istate &= ~IRQS_SUSPENDED;
146 __enable_irq(desc, irq);
147}
148
43static void resume_irqs(bool want_early) 149static void resume_irqs(bool want_early)
44{ 150{
45 struct irq_desc *desc; 151 struct irq_desc *desc;
@@ -54,7 +160,7 @@ static void resume_irqs(bool want_early)
54 continue; 160 continue;
55 161
56 raw_spin_lock_irqsave(&desc->lock, flags); 162 raw_spin_lock_irqsave(&desc->lock, flags);
57 __enable_irq(desc, irq, true); 163 resume_irq(desc, irq);
58 raw_spin_unlock_irqrestore(&desc->lock, flags); 164 raw_spin_unlock_irqrestore(&desc->lock, flags);
59 } 165 }
60} 166}
@@ -93,38 +199,3 @@ void resume_device_irqs(void)
93 resume_irqs(false); 199 resume_irqs(false);
94} 200}
95EXPORT_SYMBOL_GPL(resume_device_irqs); 201EXPORT_SYMBOL_GPL(resume_device_irqs);
96
97/**
98 * check_wakeup_irqs - check if any wake-up interrupts are pending
99 */
100int check_wakeup_irqs(void)
101{
102 struct irq_desc *desc;
103 int irq;
104
105 for_each_irq_desc(irq, desc) {
106 /*
107 * Only interrupts which are marked as wakeup source
108 * and have not been disabled before the suspend check
109 * can abort suspend.
110 */
111 if (irqd_is_wakeup_set(&desc->irq_data)) {
112 if (desc->depth == 1 && desc->istate & IRQS_PENDING)
113 return -EBUSY;
114 continue;
115 }
116 /*
117 * Check the non wakeup interrupts whether they need
118 * to be masked before finally going into suspend
119 * state. That's for hardware which has no wakeup
120 * source configuration facility. The chip
121 * implementation indicates that with
122 * IRQCHIP_MASK_ON_SUSPEND.
123 */
124 if (desc->istate & IRQS_SUSPENDED &&
125 irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
126 mask_irq(desc);
127 }
128
129 return 0;
130}
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index e6bcbe756663..3ab9048483fa 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -95,11 +95,11 @@ bool irq_work_queue(struct irq_work *work)
95 95
96 /* If the work is "lazy", handle it from next tick if any */ 96 /* If the work is "lazy", handle it from next tick if any */
97 if (work->flags & IRQ_WORK_LAZY) { 97 if (work->flags & IRQ_WORK_LAZY) {
98 if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) && 98 if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
99 tick_nohz_tick_stopped()) 99 tick_nohz_tick_stopped())
100 arch_irq_work_raise(); 100 arch_irq_work_raise();
101 } else { 101 } else {
102 if (llist_add(&work->llnode, &__get_cpu_var(raised_list))) 102 if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
103 arch_irq_work_raise(); 103 arch_irq_work_raise();
104 } 104 }
105 105
@@ -113,10 +113,12 @@ bool irq_work_needs_cpu(void)
113{ 113{
114 struct llist_head *raised, *lazy; 114 struct llist_head *raised, *lazy;
115 115
116 raised = &__get_cpu_var(raised_list); 116 raised = this_cpu_ptr(&raised_list);
117 lazy = &__get_cpu_var(lazy_list); 117 lazy = this_cpu_ptr(&lazy_list);
118 if (llist_empty(raised) && llist_empty(lazy)) 118
119 return false; 119 if (llist_empty(raised) || arch_irq_work_has_interrupt())
120 if (llist_empty(lazy))
121 return false;
120 122
121 /* All work should have been flushed before going offline */ 123 /* All work should have been flushed before going offline */
122 WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); 124 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
@@ -166,11 +168,20 @@ static void irq_work_run_list(struct llist_head *list)
166 */ 168 */
167void irq_work_run(void) 169void irq_work_run(void)
168{ 170{
169 irq_work_run_list(&__get_cpu_var(raised_list)); 171 irq_work_run_list(this_cpu_ptr(&raised_list));
170 irq_work_run_list(&__get_cpu_var(lazy_list)); 172 irq_work_run_list(this_cpu_ptr(&lazy_list));
171} 173}
172EXPORT_SYMBOL_GPL(irq_work_run); 174EXPORT_SYMBOL_GPL(irq_work_run);
173 175
176void irq_work_tick(void)
177{
178 struct llist_head *raised = &__get_cpu_var(raised_list);
179
180 if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
181 irq_work_run_list(raised);
182 irq_work_run_list(&__get_cpu_var(lazy_list));
183}
184
174/* 185/*
175 * Synchronize against the irq_work @entry, ensures the entry is not 186 * Synchronize against the irq_work @entry, ensures the entry is not
176 * currently in use. 187 * currently in use.
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index cb0cf37dac3a..5c5987f10819 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -364,7 +364,7 @@ static int __sprint_symbol(char *buffer, unsigned long address,
364 address += symbol_offset; 364 address += symbol_offset;
365 name = kallsyms_lookup(address, &size, &offset, &modname, buffer); 365 name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
366 if (!name) 366 if (!name)
367 return sprintf(buffer, "0x%lx", address); 367 return sprintf(buffer, "0x%lx", address - symbol_offset);
368 368
369 if (name != buffer) 369 if (name != buffer)
370 strcpy(buffer, name); 370 strcpy(buffer, name);
@@ -565,19 +565,12 @@ static int kallsyms_open(struct inode *inode, struct file *file)
565 * using get_symbol_offset for every symbol. 565 * using get_symbol_offset for every symbol.
566 */ 566 */
567 struct kallsym_iter *iter; 567 struct kallsym_iter *iter;
568 int ret; 568 iter = __seq_open_private(file, &kallsyms_op, sizeof(*iter));
569
570 iter = kmalloc(sizeof(*iter), GFP_KERNEL);
571 if (!iter) 569 if (!iter)
572 return -ENOMEM; 570 return -ENOMEM;
573 reset_iter(iter, 0); 571 reset_iter(iter, 0);
574 572
575 ret = seq_open(file, &kallsyms_op); 573 return 0;
576 if (ret == 0)
577 ((struct seq_file *)file->private_data)->private = iter;
578 else
579 kfree(iter);
580 return ret;
581} 574}
582 575
583#ifdef CONFIG_KGDB_KDB 576#ifdef CONFIG_KGDB_KDB
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index e30ac0fe61c3..0aa69ea1d8fd 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -44,11 +44,12 @@ static long kptr_obfuscate(long v, int type)
44 */ 44 */
45static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) 45static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
46{ 46{
47 long ret; 47 long t1, t2;
48 48
49 ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type); 49 t1 = kptr_obfuscate((long)v1, type);
50 t2 = kptr_obfuscate((long)v2, type);
50 51
51 return (ret < 0) | ((ret > 0) << 1); 52 return (t1 < t2) | ((t1 > t2) << 1);
52} 53}
53 54
54/* The caller must have pinned the task */ 55/* The caller must have pinned the task */
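The kcmp_ptr() rewrite drops the subtraction because the difference of two obfuscated longs can overflow and change sign, yielding the wrong ordering for values that are far apart; comparing them directly cannot. A small, self-contained demonstration of that failure mode (plain userspace C, nothing kcmp-specific):

    /* sketch: why (a - b) is not a safe three-way comparison for longs */
    #include <limits.h>
    #include <stdio.h>

    static int cmp_by_sub(long a, long b)
    {
            long d = a - b;                 /* may overflow: undefined, usually wraps */
            return (d < 0) | ((d > 0) << 1); /* 1 = less, 2 = greater (old kcmp_ptr) */
    }

    static int cmp_direct(long a, long b)
    {
            return (a < b) | ((a > b) << 1); /* what kcmp_ptr() now does */
    }

    int main(void)
    {
            long a = LONG_MIN + 1, b = 2;   /* a is clearly less than b */

            printf("by subtraction: %d\n", cmp_by_sub(a, b)); /* often 2 (wrong) */
            printf("direct compare: %d\n", cmp_direct(a, b)); /* 1 (correct) */
            return 0;
    }

With typical wrap-around behaviour the subtraction path reports "greater" for a value that is actually smaller, which is exactly the class of bug the two-comparison form avoids.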
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4b8f0c925884..2abf9f6e9a61 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -6,6 +6,8 @@
6 * Version 2. See the file COPYING for more details. 6 * Version 2. See the file COPYING for more details.
7 */ 7 */
8 8
9#define pr_fmt(fmt) "kexec: " fmt
10
9#include <linux/capability.h> 11#include <linux/capability.h>
10#include <linux/mm.h> 12#include <linux/mm.h>
11#include <linux/file.h> 13#include <linux/file.h>
@@ -40,6 +42,9 @@
40#include <asm/io.h> 42#include <asm/io.h>
41#include <asm/sections.h> 43#include <asm/sections.h>
42 44
45#include <crypto/hash.h>
46#include <crypto/sha.h>
47
43/* Per cpu memory for storing cpu states in case of system crash. */ 48/* Per cpu memory for storing cpu states in case of system crash. */
44note_buf_t __percpu *crash_notes; 49note_buf_t __percpu *crash_notes;
45 50
@@ -52,6 +57,17 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
52/* Flag to indicate we are going to kexec a new kernel */ 57/* Flag to indicate we are going to kexec a new kernel */
53bool kexec_in_progress = false; 58bool kexec_in_progress = false;
54 59
60/*
61 * Declare these symbols weak so that if architecture provides a purgatory,
62 * these will be overridden.
63 */
64char __weak kexec_purgatory[0];
65size_t __weak kexec_purgatory_size = 0;
66
67#ifdef CONFIG_KEXEC_FILE
68static int kexec_calculate_store_digests(struct kimage *image);
69#endif
70
55/* Location of the reserved area for the crash kernel */ 71/* Location of the reserved area for the crash kernel */
56struct resource crashk_res = { 72struct resource crashk_res = {
57 .name = "Crash kernel", 73 .name = "Crash kernel",
@@ -125,45 +141,27 @@ static struct page *kimage_alloc_page(struct kimage *image,
125 gfp_t gfp_mask, 141 gfp_t gfp_mask,
126 unsigned long dest); 142 unsigned long dest);
127 143
128static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, 144static int copy_user_segment_list(struct kimage *image,
129 unsigned long nr_segments, 145 unsigned long nr_segments,
130 struct kexec_segment __user *segments) 146 struct kexec_segment __user *segments)
131{ 147{
148 int ret;
132 size_t segment_bytes; 149 size_t segment_bytes;
133 struct kimage *image;
134 unsigned long i;
135 int result;
136
137 /* Allocate a controlling structure */
138 result = -ENOMEM;
139 image = kzalloc(sizeof(*image), GFP_KERNEL);
140 if (!image)
141 goto out;
142
143 image->head = 0;
144 image->entry = &image->head;
145 image->last_entry = &image->head;
146 image->control_page = ~0; /* By default this does not apply */
147 image->start = entry;
148 image->type = KEXEC_TYPE_DEFAULT;
149
150 /* Initialize the list of control pages */
151 INIT_LIST_HEAD(&image->control_pages);
152
153 /* Initialize the list of destination pages */
154 INIT_LIST_HEAD(&image->dest_pages);
155
156 /* Initialize the list of unusable pages */
157 INIT_LIST_HEAD(&image->unuseable_pages);
158 150
159 /* Read in the segments */ 151 /* Read in the segments */
160 image->nr_segments = nr_segments; 152 image->nr_segments = nr_segments;
161 segment_bytes = nr_segments * sizeof(*segments); 153 segment_bytes = nr_segments * sizeof(*segments);
162 result = copy_from_user(image->segment, segments, segment_bytes); 154 ret = copy_from_user(image->segment, segments, segment_bytes);
163 if (result) { 155 if (ret)
164 result = -EFAULT; 156 ret = -EFAULT;
165 goto out; 157
166 } 158 return ret;
159}
160
161static int sanity_check_segment_list(struct kimage *image)
162{
163 int result, i;
164 unsigned long nr_segments = image->nr_segments;
167 165
168 /* 166 /*
169 * Verify we have good destination addresses. The caller is 167 * Verify we have good destination addresses. The caller is
@@ -185,9 +183,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
185 mstart = image->segment[i].mem; 183 mstart = image->segment[i].mem;
186 mend = mstart + image->segment[i].memsz; 184 mend = mstart + image->segment[i].memsz;
187 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) 185 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
188 goto out; 186 return result;
189 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) 187 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
190 goto out; 188 return result;
191 } 189 }
192 190
193 /* Verify our destination addresses do not overlap. 191 /* Verify our destination addresses do not overlap.
@@ -208,7 +206,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
208 pend = pstart + image->segment[j].memsz; 206 pend = pstart + image->segment[j].memsz;
209 /* Do the segments overlap ? */ 207 /* Do the segments overlap ? */
210 if ((mend > pstart) && (mstart < pend)) 208 if ((mend > pstart) && (mstart < pend))
211 goto out; 209 return result;
212 } 210 }
213 } 211 }
214 212
@@ -220,131 +218,406 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
220 result = -EINVAL; 218 result = -EINVAL;
221 for (i = 0; i < nr_segments; i++) { 219 for (i = 0; i < nr_segments; i++) {
222 if (image->segment[i].bufsz > image->segment[i].memsz) 220 if (image->segment[i].bufsz > image->segment[i].memsz)
223 goto out; 221 return result;
224 } 222 }
225 223
226 result = 0; 224 /*
227out: 225 * Verify we have good destination addresses. Normally
228 if (result == 0) 226 * the caller is responsible for making certain we don't
229 *rimage = image; 227 * attempt to load the new image into invalid or reserved
230 else 228 * areas of RAM. But crash kernels are preloaded into a
231 kfree(image); 229 * reserved area of ram. We must ensure the addresses
230 * are in the reserved area otherwise preloading the
231 * kernel could corrupt things.
232 */
232 233
233 return result; 234 if (image->type == KEXEC_TYPE_CRASH) {
235 result = -EADDRNOTAVAIL;
236 for (i = 0; i < nr_segments; i++) {
237 unsigned long mstart, mend;
238
239 mstart = image->segment[i].mem;
240 mend = mstart + image->segment[i].memsz - 1;
241 /* Ensure we are within the crash kernel limits */
242 if ((mstart < crashk_res.start) ||
243 (mend > crashk_res.end))
244 return result;
245 }
246 }
247
248 return 0;
249}
250
251static struct kimage *do_kimage_alloc_init(void)
252{
253 struct kimage *image;
234 254
255 /* Allocate a controlling structure */
256 image = kzalloc(sizeof(*image), GFP_KERNEL);
257 if (!image)
258 return NULL;
259
260 image->head = 0;
261 image->entry = &image->head;
262 image->last_entry = &image->head;
263 image->control_page = ~0; /* By default this does not apply */
264 image->type = KEXEC_TYPE_DEFAULT;
265
266 /* Initialize the list of control pages */
267 INIT_LIST_HEAD(&image->control_pages);
268
269 /* Initialize the list of destination pages */
270 INIT_LIST_HEAD(&image->dest_pages);
271
272 /* Initialize the list of unusable pages */
273 INIT_LIST_HEAD(&image->unusable_pages);
274
275 return image;
235} 276}
236 277
237static void kimage_free_page_list(struct list_head *list); 278static void kimage_free_page_list(struct list_head *list);
238 279
239static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, 280static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
240 unsigned long nr_segments, 281 unsigned long nr_segments,
241 struct kexec_segment __user *segments) 282 struct kexec_segment __user *segments,
283 unsigned long flags)
242{ 284{
243 int result; 285 int ret;
244 struct kimage *image; 286 struct kimage *image;
287 bool kexec_on_panic = flags & KEXEC_ON_CRASH;
288
289 if (kexec_on_panic) {
290 /* Verify we have a valid entry point */
291 if ((entry < crashk_res.start) || (entry > crashk_res.end))
292 return -EADDRNOTAVAIL;
293 }
245 294
246 /* Allocate and initialize a controlling structure */ 295 /* Allocate and initialize a controlling structure */
247 image = NULL; 296 image = do_kimage_alloc_init();
248 result = do_kimage_alloc(&image, entry, nr_segments, segments); 297 if (!image)
249 if (result) 298 return -ENOMEM;
250 goto out; 299
300 image->start = entry;
301
302 ret = copy_user_segment_list(image, nr_segments, segments);
303 if (ret)
304 goto out_free_image;
305
306 ret = sanity_check_segment_list(image);
307 if (ret)
308 goto out_free_image;
309
310 /* Enable the special crash kernel control page allocation policy. */
311 if (kexec_on_panic) {
312 image->control_page = crashk_res.start;
313 image->type = KEXEC_TYPE_CRASH;
314 }
251 315
252 /* 316 /*
253 * Find a location for the control code buffer, and add it 317 * Find a location for the control code buffer, and add it
254 * the vector of segments so that it's pages will also be 318 * the vector of segments so that it's pages will also be
255 * counted as destination pages. 319 * counted as destination pages.
256 */ 320 */
257 result = -ENOMEM; 321 ret = -ENOMEM;
258 image->control_code_page = kimage_alloc_control_pages(image, 322 image->control_code_page = kimage_alloc_control_pages(image,
259 get_order(KEXEC_CONTROL_PAGE_SIZE)); 323 get_order(KEXEC_CONTROL_PAGE_SIZE));
260 if (!image->control_code_page) { 324 if (!image->control_code_page) {
261 pr_err("Could not allocate control_code_buffer\n"); 325 pr_err("Could not allocate control_code_buffer\n");
262 goto out_free; 326 goto out_free_image;
263 } 327 }
264 328
265 image->swap_page = kimage_alloc_control_pages(image, 0); 329 if (!kexec_on_panic) {
266 if (!image->swap_page) { 330 image->swap_page = kimage_alloc_control_pages(image, 0);
267 pr_err("Could not allocate swap buffer\n"); 331 if (!image->swap_page) {
268 goto out_free; 332 pr_err("Could not allocate swap buffer\n");
333 goto out_free_control_pages;
334 }
269 } 335 }
270 336
271 *rimage = image; 337 *rimage = image;
272 return 0; 338 return 0;
273 339out_free_control_pages:
274out_free:
275 kimage_free_page_list(&image->control_pages); 340 kimage_free_page_list(&image->control_pages);
341out_free_image:
276 kfree(image); 342 kfree(image);
277out: 343 return ret;
278 return result;
279} 344}
280 345
281static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, 346#ifdef CONFIG_KEXEC_FILE
282 unsigned long nr_segments, 347static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
283 struct kexec_segment __user *segments)
284{ 348{
285 int result; 349 struct fd f = fdget(fd);
286 struct kimage *image; 350 int ret;
287 unsigned long i; 351 struct kstat stat;
352 loff_t pos;
353 ssize_t bytes = 0;
288 354
289 image = NULL; 355 if (!f.file)
290 /* Verify we have a valid entry point */ 356 return -EBADF;
291 if ((entry < crashk_res.start) || (entry > crashk_res.end)) { 357
292 result = -EADDRNOTAVAIL; 358 ret = vfs_getattr(&f.file->f_path, &stat);
359 if (ret)
360 goto out;
361
362 if (stat.size > INT_MAX) {
363 ret = -EFBIG;
293 goto out; 364 goto out;
294 } 365 }
295 366
296 /* Allocate and initialize a controlling structure */ 367 /* Don't hand 0 to vmalloc, it whines. */
297 result = do_kimage_alloc(&image, entry, nr_segments, segments); 368 if (stat.size == 0) {
298 if (result) 369 ret = -EINVAL;
299 goto out; 370 goto out;
371 }
300 372
301 /* Enable the special crash kernel control page 373 *buf = vmalloc(stat.size);
302 * allocation policy. 374 if (!*buf) {
303 */ 375 ret = -ENOMEM;
304 image->control_page = crashk_res.start; 376 goto out;
305 image->type = KEXEC_TYPE_CRASH; 377 }
306 378
307 /* 379 pos = 0;
308 * Verify we have good destination addresses. Normally 380 while (pos < stat.size) {
309 * the caller is responsible for making certain we don't 381 bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
310 * attempt to load the new image into invalid or reserved 382 stat.size - pos);
311 * areas of RAM. But crash kernels are preloaded into a 383 if (bytes < 0) {
312 * reserved area of ram. We must ensure the addresses 384 vfree(*buf);
313 * are in the reserved area otherwise preloading the 385 ret = bytes;
314 * kernel could corrupt things. 386 goto out;
315 */ 387 }
316 result = -EADDRNOTAVAIL;
317 for (i = 0; i < nr_segments; i++) {
318 unsigned long mstart, mend;
319 388
320 mstart = image->segment[i].mem; 389 if (bytes == 0)
321 mend = mstart + image->segment[i].memsz - 1; 390 break;
322 /* Ensure we are within the crash kernel limits */ 391 pos += bytes;
323 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
324 goto out_free;
325 } 392 }
326 393
394 if (pos != stat.size) {
395 ret = -EBADF;
396 vfree(*buf);
397 goto out;
398 }
399
400 *buf_len = pos;
401out:
402 fdput(f);
403 return ret;
404}
405
406/* Architectures can provide this probe function */
407int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
408 unsigned long buf_len)
409{
410 return -ENOEXEC;
411}
412
413void * __weak arch_kexec_kernel_image_load(struct kimage *image)
414{
415 return ERR_PTR(-ENOEXEC);
416}
417
418void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
419{
420}
421
422int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
423 unsigned long buf_len)
424{
425 return -EKEYREJECTED;
426}
427
428/* Apply relocations of type RELA */
429int __weak
430arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
431 unsigned int relsec)
432{
433 pr_err("RELA relocation unsupported.\n");
434 return -ENOEXEC;
435}
436
437/* Apply relocations of type REL */
438int __weak
439arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
440 unsigned int relsec)
441{
442 pr_err("REL relocation unsupported.\n");
443 return -ENOEXEC;
444}
445
446/*
 447 * Free up memory used by the kernel, initrd, and command line. This is temporary
448 * memory allocation which is not needed any more after these buffers have
449 * been loaded into separate segments and have been copied elsewhere.
450 */
451static void kimage_file_post_load_cleanup(struct kimage *image)
452{
453 struct purgatory_info *pi = &image->purgatory_info;
454
455 vfree(image->kernel_buf);
456 image->kernel_buf = NULL;
457
458 vfree(image->initrd_buf);
459 image->initrd_buf = NULL;
460
461 kfree(image->cmdline_buf);
462 image->cmdline_buf = NULL;
463
464 vfree(pi->purgatory_buf);
465 pi->purgatory_buf = NULL;
466
467 vfree(pi->sechdrs);
468 pi->sechdrs = NULL;
469
470 /* See if architecture has anything to cleanup post load */
471 arch_kimage_file_post_load_cleanup(image);
472
327 /* 473 /*
328 * Find a location for the control code buffer, and add 474 * Above call should have called into bootloader to free up
329 * the vector of segments so that it's pages will also be 475 * any data stored in kimage->image_loader_data. It should
330 * counted as destination pages. 476 * be ok now to free it up.
331 */ 477 */
332 result = -ENOMEM; 478 kfree(image->image_loader_data);
479 image->image_loader_data = NULL;
480}
481
482/*
483 * In file mode list of segments is prepared by kernel. Copy relevant
484 * data from user space, do error checking, prepare segment list
485 */
486static int
487kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
488 const char __user *cmdline_ptr,
489 unsigned long cmdline_len, unsigned flags)
490{
491 int ret = 0;
492 void *ldata;
493
494 ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
495 &image->kernel_buf_len);
496 if (ret)
497 return ret;
498
499 /* Call arch image probe handlers */
500 ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
501 image->kernel_buf_len);
502
503 if (ret)
504 goto out;
505
506#ifdef CONFIG_KEXEC_VERIFY_SIG
507 ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
508 image->kernel_buf_len);
509 if (ret) {
510 pr_debug("kernel signature verification failed.\n");
511 goto out;
512 }
513 pr_debug("kernel signature verification successful.\n");
514#endif
 515 /* It is possible that no initramfs is being loaded */
516 if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
517 ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
518 &image->initrd_buf_len);
519 if (ret)
520 goto out;
521 }
522
523 if (cmdline_len) {
524 image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
525 if (!image->cmdline_buf) {
526 ret = -ENOMEM;
527 goto out;
528 }
529
530 ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
531 cmdline_len);
532 if (ret) {
533 ret = -EFAULT;
534 goto out;
535 }
536
537 image->cmdline_buf_len = cmdline_len;
538
539 /* command line should be a string with last byte null */
540 if (image->cmdline_buf[cmdline_len - 1] != '\0') {
541 ret = -EINVAL;
542 goto out;
543 }
544 }
545
546 /* Call arch image load handlers */
547 ldata = arch_kexec_kernel_image_load(image);
548
549 if (IS_ERR(ldata)) {
550 ret = PTR_ERR(ldata);
551 goto out;
552 }
553
554 image->image_loader_data = ldata;
555out:
556 /* In case of error, free up all allocated memory in this function */
557 if (ret)
558 kimage_file_post_load_cleanup(image);
559 return ret;
560}
561
562static int
563kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
564 int initrd_fd, const char __user *cmdline_ptr,
565 unsigned long cmdline_len, unsigned long flags)
566{
567 int ret;
568 struct kimage *image;
569 bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
570
571 image = do_kimage_alloc_init();
572 if (!image)
573 return -ENOMEM;
574
575 image->file_mode = 1;
576
577 if (kexec_on_panic) {
578 /* Enable special crash kernel control page alloc policy. */
579 image->control_page = crashk_res.start;
580 image->type = KEXEC_TYPE_CRASH;
581 }
582
583 ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
584 cmdline_ptr, cmdline_len, flags);
585 if (ret)
586 goto out_free_image;
587
588 ret = sanity_check_segment_list(image);
589 if (ret)
590 goto out_free_post_load_bufs;
591
592 ret = -ENOMEM;
333 image->control_code_page = kimage_alloc_control_pages(image, 593 image->control_code_page = kimage_alloc_control_pages(image,
334 get_order(KEXEC_CONTROL_PAGE_SIZE)); 594 get_order(KEXEC_CONTROL_PAGE_SIZE));
335 if (!image->control_code_page) { 595 if (!image->control_code_page) {
336 pr_err("Could not allocate control_code_buffer\n"); 596 pr_err("Could not allocate control_code_buffer\n");
337 goto out_free; 597 goto out_free_post_load_bufs;
598 }
599
600 if (!kexec_on_panic) {
601 image->swap_page = kimage_alloc_control_pages(image, 0);
602 if (!image->swap_page) {
 603 			pr_err("Could not allocate swap buffer\n");
604 goto out_free_control_pages;
605 }
338 } 606 }
339 607
340 *rimage = image; 608 *rimage = image;
341 return 0; 609 return 0;
342 610out_free_control_pages:
343out_free: 611 kimage_free_page_list(&image->control_pages);
612out_free_post_load_bufs:
613 kimage_file_post_load_cleanup(image);
614out_free_image:
344 kfree(image); 615 kfree(image);
345out: 616 return ret;
346 return result;
347} 617}
618#else /* CONFIG_KEXEC_FILE */
619static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
620#endif /* CONFIG_KEXEC_FILE */
348 621
349static int kimage_is_destination_range(struct kimage *image, 622static int kimage_is_destination_range(struct kimage *image,
350 unsigned long start, 623 unsigned long start,
@@ -609,7 +882,7 @@ static void kimage_free_extra_pages(struct kimage *image)
609 kimage_free_page_list(&image->dest_pages); 882 kimage_free_page_list(&image->dest_pages);
610 883
611 /* Walk through and free any unusable pages I have cached */ 884 /* Walk through and free any unusable pages I have cached */
612 kimage_free_page_list(&image->unuseable_pages); 885 kimage_free_page_list(&image->unusable_pages);
613 886
614} 887}
615static void kimage_terminate(struct kimage *image) 888static void kimage_terminate(struct kimage *image)
@@ -663,6 +936,14 @@ static void kimage_free(struct kimage *image)
663 936
664 /* Free the kexec control pages... */ 937 /* Free the kexec control pages... */
665 kimage_free_page_list(&image->control_pages); 938 kimage_free_page_list(&image->control_pages);
939
940 /*
941 * Free up any temporary buffers allocated. This might hit if
942 * error occurred much later after buffer allocation.
943 */
944 if (image->file_mode)
945 kimage_file_post_load_cleanup(image);
946
666 kfree(image); 947 kfree(image);
667} 948}
668 949
@@ -732,7 +1013,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
732 /* If the page cannot be used file it away */ 1013 /* If the page cannot be used file it away */
733 if (page_to_pfn(page) > 1014 if (page_to_pfn(page) >
734 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { 1015 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
735 list_add(&page->lru, &image->unuseable_pages); 1016 list_add(&page->lru, &image->unusable_pages);
736 continue; 1017 continue;
737 } 1018 }
738 addr = page_to_pfn(page) << PAGE_SHIFT; 1019 addr = page_to_pfn(page) << PAGE_SHIFT;
@@ -791,10 +1072,14 @@ static int kimage_load_normal_segment(struct kimage *image,
791 unsigned long maddr; 1072 unsigned long maddr;
792 size_t ubytes, mbytes; 1073 size_t ubytes, mbytes;
793 int result; 1074 int result;
794 unsigned char __user *buf; 1075 unsigned char __user *buf = NULL;
1076 unsigned char *kbuf = NULL;
795 1077
796 result = 0; 1078 result = 0;
797 buf = segment->buf; 1079 if (image->file_mode)
1080 kbuf = segment->kbuf;
1081 else
1082 buf = segment->buf;
798 ubytes = segment->bufsz; 1083 ubytes = segment->bufsz;
799 mbytes = segment->memsz; 1084 mbytes = segment->memsz;
800 maddr = segment->mem; 1085 maddr = segment->mem;
@@ -826,7 +1111,11 @@ static int kimage_load_normal_segment(struct kimage *image,
826 PAGE_SIZE - (maddr & ~PAGE_MASK)); 1111 PAGE_SIZE - (maddr & ~PAGE_MASK));
827 uchunk = min(ubytes, mchunk); 1112 uchunk = min(ubytes, mchunk);
828 1113
829 result = copy_from_user(ptr, buf, uchunk); 1114 /* For file based kexec, source pages are in kernel memory */
1115 if (image->file_mode)
1116 memcpy(ptr, kbuf, uchunk);
1117 else
1118 result = copy_from_user(ptr, buf, uchunk);
830 kunmap(page); 1119 kunmap(page);
831 if (result) { 1120 if (result) {
832 result = -EFAULT; 1121 result = -EFAULT;
@@ -834,7 +1123,10 @@ static int kimage_load_normal_segment(struct kimage *image,
834 } 1123 }
835 ubytes -= uchunk; 1124 ubytes -= uchunk;
836 maddr += mchunk; 1125 maddr += mchunk;
837 buf += mchunk; 1126 if (image->file_mode)
1127 kbuf += mchunk;
1128 else
1129 buf += mchunk;
838 mbytes -= mchunk; 1130 mbytes -= mchunk;
839 } 1131 }
840out: 1132out:
@@ -851,10 +1143,14 @@ static int kimage_load_crash_segment(struct kimage *image,
851 unsigned long maddr; 1143 unsigned long maddr;
852 size_t ubytes, mbytes; 1144 size_t ubytes, mbytes;
853 int result; 1145 int result;
854 unsigned char __user *buf; 1146 unsigned char __user *buf = NULL;
1147 unsigned char *kbuf = NULL;
855 1148
856 result = 0; 1149 result = 0;
857 buf = segment->buf; 1150 if (image->file_mode)
1151 kbuf = segment->kbuf;
1152 else
1153 buf = segment->buf;
858 ubytes = segment->bufsz; 1154 ubytes = segment->bufsz;
859 mbytes = segment->memsz; 1155 mbytes = segment->memsz;
860 maddr = segment->mem; 1156 maddr = segment->mem;
@@ -877,7 +1173,12 @@ static int kimage_load_crash_segment(struct kimage *image,
877 /* Zero the trailing part of the page */ 1173 /* Zero the trailing part of the page */
878 memset(ptr + uchunk, 0, mchunk - uchunk); 1174 memset(ptr + uchunk, 0, mchunk - uchunk);
879 } 1175 }
880 result = copy_from_user(ptr, buf, uchunk); 1176
1177 /* For file based kexec, source pages are in kernel memory */
1178 if (image->file_mode)
1179 memcpy(ptr, kbuf, uchunk);
1180 else
1181 result = copy_from_user(ptr, buf, uchunk);
881 kexec_flush_icache_page(page); 1182 kexec_flush_icache_page(page);
882 kunmap(page); 1183 kunmap(page);
883 if (result) { 1184 if (result) {
@@ -886,7 +1187,10 @@ static int kimage_load_crash_segment(struct kimage *image,
886 } 1187 }
887 ubytes -= uchunk; 1188 ubytes -= uchunk;
888 maddr += mchunk; 1189 maddr += mchunk;
889 buf += mchunk; 1190 if (image->file_mode)
1191 kbuf += mchunk;
1192 else
1193 buf += mchunk;
890 mbytes -= mchunk; 1194 mbytes -= mchunk;
891 } 1195 }
892out: 1196out:
@@ -986,16 +1290,16 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
986 1290
987 /* Loading another kernel to reboot into */ 1291 /* Loading another kernel to reboot into */
988 if ((flags & KEXEC_ON_CRASH) == 0) 1292 if ((flags & KEXEC_ON_CRASH) == 0)
989 result = kimage_normal_alloc(&image, entry, 1293 result = kimage_alloc_init(&image, entry, nr_segments,
990 nr_segments, segments); 1294 segments, flags);
991 /* Loading another kernel to switch to if this one crashes */ 1295 /* Loading another kernel to switch to if this one crashes */
992 else if (flags & KEXEC_ON_CRASH) { 1296 else if (flags & KEXEC_ON_CRASH) {
993 /* Free any current crash dump kernel before 1297 /* Free any current crash dump kernel before
994 * we corrupt it. 1298 * we corrupt it.
995 */ 1299 */
996 kimage_free(xchg(&kexec_crash_image, NULL)); 1300 kimage_free(xchg(&kexec_crash_image, NULL));
997 result = kimage_crash_alloc(&image, entry, 1301 result = kimage_alloc_init(&image, entry, nr_segments,
998 nr_segments, segments); 1302 segments, flags);
999 crash_map_reserved_pages(); 1303 crash_map_reserved_pages();
1000 } 1304 }
1001 if (result) 1305 if (result)
@@ -1077,6 +1381,85 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
1077} 1381}
1078#endif 1382#endif
1079 1383
1384#ifdef CONFIG_KEXEC_FILE
1385SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
1386 unsigned long, cmdline_len, const char __user *, cmdline_ptr,
1387 unsigned long, flags)
1388{
1389 int ret = 0, i;
1390 struct kimage **dest_image, *image;
1391
1392 /* We only trust the superuser with rebooting the system. */
1393 if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
1394 return -EPERM;
1395
1396 /* Make sure we have a legal set of flags */
1397 if (flags != (flags & KEXEC_FILE_FLAGS))
1398 return -EINVAL;
1399
1400 image = NULL;
1401
1402 if (!mutex_trylock(&kexec_mutex))
1403 return -EBUSY;
1404
1405 dest_image = &kexec_image;
1406 if (flags & KEXEC_FILE_ON_CRASH)
1407 dest_image = &kexec_crash_image;
1408
1409 if (flags & KEXEC_FILE_UNLOAD)
1410 goto exchange;
1411
1412 /*
 1413 * In case of a crash, the new kernel gets loaded into the reserved
 1414 * region, the same memory where the old crash kernel might be loaded.
 1415 * Free any current crash dump kernel before we corrupt it.
1416 */
1417 if (flags & KEXEC_FILE_ON_CRASH)
1418 kimage_free(xchg(&kexec_crash_image, NULL));
1419
1420 ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
1421 cmdline_len, flags);
1422 if (ret)
1423 goto out;
1424
1425 ret = machine_kexec_prepare(image);
1426 if (ret)
1427 goto out;
1428
1429 ret = kexec_calculate_store_digests(image);
1430 if (ret)
1431 goto out;
1432
1433 for (i = 0; i < image->nr_segments; i++) {
1434 struct kexec_segment *ksegment;
1435
1436 ksegment = &image->segment[i];
1437 pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
1438 i, ksegment->buf, ksegment->bufsz, ksegment->mem,
1439 ksegment->memsz);
1440
1441 ret = kimage_load_segment(image, &image->segment[i]);
1442 if (ret)
1443 goto out;
1444 }
1445
1446 kimage_terminate(image);
1447
1448 /*
1449 * Free up any temporary buffers allocated which are not needed
1450 * after image has been loaded
1451 */
1452 kimage_file_post_load_cleanup(image);
1453exchange:
1454 image = xchg(dest_image, image);
1455out:
1456 mutex_unlock(&kexec_mutex);
1457 kimage_free(image);
1458 return ret;
1459}
1460
1461#endif /* CONFIG_KEXEC_FILE */
1462
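As a rough illustration of how the new syscall above might be driven from userspace, consider the following sketch. It is not part of this patch: it assumes a libc without a dedicated wrapper (so the raw __NR_kexec_file_load number is used), and the file paths and command line are invented for the example; cmdline_len is passed including the trailing NUL.

#include <fcntl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* Illustrative paths and command line only. */
	const char cmdline[] = "console=ttyS0 root=/dev/sda1";
	int kernel_fd = open("/boot/vmlinuz", O_RDONLY);
	int initrd_fd = open("/boot/initrd.img", O_RDONLY);

	if (kernel_fd < 0 || initrd_fd < 0)
		return 1;

	/* flags == 0: ordinary (non-crash) load; length counts the NUL. */
	if (syscall(__NR_kexec_file_load, kernel_fd, initrd_fd,
		    sizeof(cmdline), cmdline, 0UL))
		return 1;

	return 0;
}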
1080void crash_kexec(struct pt_regs *regs) 1463void crash_kexec(struct pt_regs *regs)
1081{ 1464{
1082 /* Take the kexec_mutex here to prevent sys_kexec_load 1465 /* Take the kexec_mutex here to prevent sys_kexec_load
@@ -1376,7 +1759,6 @@ static __initdata char *suffix_tbl[] = {
1376 */ 1759 */
1377static int __init parse_crashkernel_suffix(char *cmdline, 1760static int __init parse_crashkernel_suffix(char *cmdline,
1378 unsigned long long *crash_size, 1761 unsigned long long *crash_size,
1379 unsigned long long *crash_base,
1380 const char *suffix) 1762 const char *suffix)
1381{ 1763{
1382 char *cur = cmdline; 1764 char *cur = cmdline;
@@ -1465,7 +1847,7 @@ static int __init __parse_crashkernel(char *cmdline,
1465 1847
1466 if (suffix) 1848 if (suffix)
1467 return parse_crashkernel_suffix(ck_cmdline, crash_size, 1849 return parse_crashkernel_suffix(ck_cmdline, crash_size,
1468 crash_base, suffix); 1850 suffix);
1469 /* 1851 /*
1470 * if the commandline contains a ':', then that's the extended 1852 * if the commandline contains a ':', then that's the extended
1471 * syntax -- if not, it must be the classic syntax 1853 * syntax -- if not, it must be the classic syntax
@@ -1632,6 +2014,672 @@ static int __init crash_save_vmcoreinfo_init(void)
1632 2014
1633subsys_initcall(crash_save_vmcoreinfo_init); 2015subsys_initcall(crash_save_vmcoreinfo_init);
1634 2016
2017#ifdef CONFIG_KEXEC_FILE
2018static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
2019 struct kexec_buf *kbuf)
2020{
2021 struct kimage *image = kbuf->image;
2022 unsigned long temp_start, temp_end;
2023
2024 temp_end = min(end, kbuf->buf_max);
2025 temp_start = temp_end - kbuf->memsz;
2026
2027 do {
2028 /* align down start */
2029 temp_start = temp_start & (~(kbuf->buf_align - 1));
2030
2031 if (temp_start < start || temp_start < kbuf->buf_min)
2032 return 0;
2033
2034 temp_end = temp_start + kbuf->memsz - 1;
2035
2036 /*
2037 * Make sure this does not conflict with any of existing
2038 * segments
2039 */
2040 if (kimage_is_destination_range(image, temp_start, temp_end)) {
2041 temp_start = temp_start - PAGE_SIZE;
2042 continue;
2043 }
2044
2045 /* We found a suitable memory range */
2046 break;
2047 } while (1);
2048
2049 /* If we are here, we found a suitable memory range */
2050 kbuf->mem = temp_start;
2051
2052 /* Success, stop navigating through remaining System RAM ranges */
2053 return 1;
2054}
2055
2056static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
2057 struct kexec_buf *kbuf)
2058{
2059 struct kimage *image = kbuf->image;
2060 unsigned long temp_start, temp_end;
2061
2062 temp_start = max(start, kbuf->buf_min);
2063
2064 do {
2065 temp_start = ALIGN(temp_start, kbuf->buf_align);
2066 temp_end = temp_start + kbuf->memsz - 1;
2067
2068 if (temp_end > end || temp_end > kbuf->buf_max)
2069 return 0;
2070 /*
2071 * Make sure this does not conflict with any of existing
2072 * segments
2073 */
2074 if (kimage_is_destination_range(image, temp_start, temp_end)) {
2075 temp_start = temp_start + PAGE_SIZE;
2076 continue;
2077 }
2078
2079 /* We found a suitable memory range */
2080 break;
2081 } while (1);
2082
2083 /* If we are here, we found a suitable memory range */
2084 kbuf->mem = temp_start;
2085
2086 /* Success, stop navigating through remaining System RAM ranges */
2087 return 1;
2088}
2089
2090static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
2091{
2092 struct kexec_buf *kbuf = (struct kexec_buf *)arg;
2093 unsigned long sz = end - start + 1;
2094
 2095 /* Returning 0 makes the caller move on to the next memory range */
2096 if (sz < kbuf->memsz)
2097 return 0;
2098
2099 if (end < kbuf->buf_min || start > kbuf->buf_max)
2100 return 0;
2101
2102 /*
 2103 * Allocate memory top down within the RAM range; otherwise, use
 2104 * bottom-up allocation.
2105 */
2106 if (kbuf->top_down)
2107 return locate_mem_hole_top_down(start, end, kbuf);
2108 return locate_mem_hole_bottom_up(start, end, kbuf);
2109}
2110
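Both hole locators above rely on the same power-of-two alignment arithmetic: the top-down walk rounds a candidate start down to the alignment, while the bottom-up walk rounds it up via ALIGN(). A throwaway user-space snippet (not from this patch) makes the difference concrete:

#include <stdio.h>

/* Hypothetical stand-ins for the kernel's align-down and ALIGN() round-up;
 * 'a' is assumed to be a power of two, as kbuf->buf_align is above. */
#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))
#define ALIGN_UP(x, a)   (((x) + ((a) - 1)) & ~((a) - 1))

int main(void)
{
	unsigned long start = 0x12345, align = 0x1000;

	printf("top-down rounds down: %#lx\n", ALIGN_DOWN(start, align)); /* 0x12000 */
	printf("bottom-up rounds up:  %#lx\n", ALIGN_UP(start, align));   /* 0x13000 */
	return 0;
}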
2111/*
2112 * Helper function for placing a buffer in a kexec segment. This assumes
2113 * that kexec_mutex is held.
2114 */
2115int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
2116 unsigned long memsz, unsigned long buf_align,
2117 unsigned long buf_min, unsigned long buf_max,
2118 bool top_down, unsigned long *load_addr)
2119{
2120
2121 struct kexec_segment *ksegment;
2122 struct kexec_buf buf, *kbuf;
2123 int ret;
2124
 2125 /* Currently, adding a segment this way is allowed only in file mode */
2126 if (!image->file_mode)
2127 return -EINVAL;
2128
2129 if (image->nr_segments >= KEXEC_SEGMENT_MAX)
2130 return -EINVAL;
2131
2132 /*
 2133 * Make sure we are not trying to add a buffer after allocating
 2134 * control pages. All segments need to be placed before any
 2135 * control pages are allocated, because the control page allocation
 2136 * logic goes through the list of segments to make sure there are
 2137 * no destination overlaps.
2138 */
2139 if (!list_empty(&image->control_pages)) {
2140 WARN_ON(1);
2141 return -EINVAL;
2142 }
2143
2144 memset(&buf, 0, sizeof(struct kexec_buf));
2145 kbuf = &buf;
2146 kbuf->image = image;
2147 kbuf->buffer = buffer;
2148 kbuf->bufsz = bufsz;
2149
2150 kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
2151 kbuf->buf_align = max(buf_align, PAGE_SIZE);
2152 kbuf->buf_min = buf_min;
2153 kbuf->buf_max = buf_max;
2154 kbuf->top_down = top_down;
2155
2156 /* Walk the RAM ranges and allocate a suitable range for the buffer */
2157 if (image->type == KEXEC_TYPE_CRASH)
2158 ret = walk_iomem_res("Crash kernel",
2159 IORESOURCE_MEM | IORESOURCE_BUSY,
2160 crashk_res.start, crashk_res.end, kbuf,
2161 locate_mem_hole_callback);
2162 else
2163 ret = walk_system_ram_res(0, -1, kbuf,
2164 locate_mem_hole_callback);
2165 if (ret != 1) {
2166 /* A suitable memory range could not be found for buffer */
2167 return -EADDRNOTAVAIL;
2168 }
2169
2170 /* Found a suitable memory range */
2171 ksegment = &image->segment[image->nr_segments];
2172 ksegment->kbuf = kbuf->buffer;
2173 ksegment->bufsz = kbuf->bufsz;
2174 ksegment->mem = kbuf->mem;
2175 ksegment->memsz = kbuf->memsz;
2176 image->nr_segments++;
2177 *load_addr = ksegment->mem;
2178 return 0;
2179}
2180
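To show how an architecture's file-mode loader is expected to use this helper, here is a hedged sketch (not from this patch; the function name, the address bounds and the blob being placed are all illustrative). As the comment above requires, kexec_mutex is assumed to be held by the caller.

/* Hypothetical caller: place an initrd blob somewhere above 1 MiB. */
static int example_place_initrd(struct kimage *image, char *initrd,
				unsigned long initrd_len)
{
	unsigned long load_addr;
	int ret;

	ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len,
			       PAGE_SIZE,	/* buf_align */
			       0x100000,	/* buf_min: illustrative lower bound */
			       ULONG_MAX,	/* buf_max: no upper bound */
			       true,		/* prefer a top-down hole */
			       &load_addr);
	if (ret)
		return ret;

	pr_debug("initrd placed at 0x%lx\n", load_addr);
	return 0;
}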
2181/* Calculate and store the digest of segments */
2182static int kexec_calculate_store_digests(struct kimage *image)
2183{
2184 struct crypto_shash *tfm;
2185 struct shash_desc *desc;
2186 int ret = 0, i, j, zero_buf_sz, sha_region_sz;
2187 size_t desc_size, nullsz;
2188 char *digest;
2189 void *zero_buf;
2190 struct kexec_sha_region *sha_regions;
2191 struct purgatory_info *pi = &image->purgatory_info;
2192
2193 zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
2194 zero_buf_sz = PAGE_SIZE;
2195
2196 tfm = crypto_alloc_shash("sha256", 0, 0);
2197 if (IS_ERR(tfm)) {
2198 ret = PTR_ERR(tfm);
2199 goto out;
2200 }
2201
2202 desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
2203 desc = kzalloc(desc_size, GFP_KERNEL);
2204 if (!desc) {
2205 ret = -ENOMEM;
2206 goto out_free_tfm;
2207 }
2208
2209 sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
2210 sha_regions = vzalloc(sha_region_sz);
2211 if (!sha_regions)
2212 goto out_free_desc;
2213
2214 desc->tfm = tfm;
2215 desc->flags = 0;
2216
2217 ret = crypto_shash_init(desc);
2218 if (ret < 0)
2219 goto out_free_sha_regions;
2220
2221 digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
2222 if (!digest) {
2223 ret = -ENOMEM;
2224 goto out_free_sha_regions;
2225 }
2226
2227 for (j = i = 0; i < image->nr_segments; i++) {
2228 struct kexec_segment *ksegment;
2229
2230 ksegment = &image->segment[i];
2231 /*
2232 * Skip purgatory as it will be modified once we put digest
2233 * info in purgatory.
2234 */
2235 if (ksegment->kbuf == pi->purgatory_buf)
2236 continue;
2237
2238 ret = crypto_shash_update(desc, ksegment->kbuf,
2239 ksegment->bufsz);
2240 if (ret)
2241 break;
2242
2243 /*
2244 * Assume rest of the buffer is filled with zero and
2245 * update digest accordingly.
2246 */
2247 nullsz = ksegment->memsz - ksegment->bufsz;
2248 while (nullsz) {
2249 unsigned long bytes = nullsz;
2250
2251 if (bytes > zero_buf_sz)
2252 bytes = zero_buf_sz;
2253 ret = crypto_shash_update(desc, zero_buf, bytes);
2254 if (ret)
2255 break;
2256 nullsz -= bytes;
2257 }
2258
2259 if (ret)
2260 break;
2261
2262 sha_regions[j].start = ksegment->mem;
2263 sha_regions[j].len = ksegment->memsz;
2264 j++;
2265 }
2266
2267 if (!ret) {
2268 ret = crypto_shash_final(desc, digest);
2269 if (ret)
2270 goto out_free_digest;
2271 ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
2272 sha_regions, sha_region_sz, 0);
2273 if (ret)
2274 goto out_free_digest;
2275
2276 ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
2277 digest, SHA256_DIGEST_SIZE, 0);
2278 if (ret)
2279 goto out_free_digest;
2280 }
2281
2282out_free_digest:
2283 kfree(digest);
2284out_free_sha_regions:
2285 vfree(sha_regions);
2286out_free_desc:
2287 kfree(desc);
2288out_free_tfm:
2289 kfree(tfm);
2290out:
2291 return ret;
2292}
2293
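The digest loop above hashes each segment's bufsz bytes and then feeds (memsz - bufsz) zero bytes through the hash, so the padding that will be created at load time is covered as well. The standalone C sketch below (not from this patch) restates that pattern with a generic update callback instead of the kernel crypto API:

#include <stddef.h>

typedef int (*update_fn)(void *ctx, const void *data, size_t len);

/* Hash 'bufsz' real bytes, then account for the zero padding up to 'memsz'. */
static int hash_segment(update_fn update, void *ctx,
			const void *buf, size_t bufsz, size_t memsz)
{
	static const unsigned char zeros[4096];
	size_t remaining = memsz - bufsz;
	int ret = update(ctx, buf, bufsz);

	while (!ret && remaining) {
		size_t chunk = remaining < sizeof(zeros) ? remaining : sizeof(zeros);

		ret = update(ctx, zeros, chunk);
		remaining -= chunk;
	}
	return ret;
}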
2294/* Actually load purgatory. Lot of code taken from kexec-tools */
2295static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
2296 unsigned long max, int top_down)
2297{
2298 struct purgatory_info *pi = &image->purgatory_info;
2299 unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
2300 unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
2301 unsigned char *buf_addr, *src;
2302 int i, ret = 0, entry_sidx = -1;
2303 const Elf_Shdr *sechdrs_c;
2304 Elf_Shdr *sechdrs = NULL;
2305 void *purgatory_buf = NULL;
2306
2307 /*
 2308 * sechdrs_c points to the section headers in purgatory, which are
 2309 * read only. No modifications are allowed.
2310 */
2311 sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
2312
2313 /*
 2314 * We cannot modify sechdrs_c[] or its fields; they are read only.
2315 * Copy it over to a local copy where one can store some temporary
2316 * data and free it at the end. We need to modify ->sh_addr and
2317 * ->sh_offset fields to keep track of permanent and temporary
2318 * locations of sections.
2319 */
2320 sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
2321 if (!sechdrs)
2322 return -ENOMEM;
2323
2324 memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
2325
2326 /*
 2327 * There end up being multiple copies of the sections. The first copy
 2328 * is the one embedded in the kernel's read-only section. Some of these
 2329 * sections will be copied to a temporary buffer and relocated, and
 2330 * those sections will finally be copied to their final destination at
 2331 * segment load time.
2332 *
2333 * Use ->sh_offset to reflect section address in memory. It will
2334 * point to original read only copy if section is not allocatable.
2335 * Otherwise it will point to temporary copy which will be relocated.
2336 *
2337 * Use ->sh_addr to contain final address of the section where it
2338 * will go during execution time.
2339 */
2340 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2341 if (sechdrs[i].sh_type == SHT_NOBITS)
2342 continue;
2343
2344 sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
2345 sechdrs[i].sh_offset;
2346 }
2347
2348 /*
2349 * Identify entry point section and make entry relative to section
2350 * start.
2351 */
2352 entry = pi->ehdr->e_entry;
2353 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2354 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2355 continue;
2356
2357 if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
2358 continue;
2359
2360 /* Make entry section relative */
2361 if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
2362 ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
2363 pi->ehdr->e_entry)) {
2364 entry_sidx = i;
2365 entry -= sechdrs[i].sh_addr;
2366 break;
2367 }
2368 }
2369
2370 /* Determine how much memory is needed to load relocatable object. */
2371 buf_align = 1;
2372 bss_align = 1;
2373 buf_sz = 0;
2374 bss_sz = 0;
2375
2376 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2377 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2378 continue;
2379
2380 align = sechdrs[i].sh_addralign;
2381 if (sechdrs[i].sh_type != SHT_NOBITS) {
2382 if (buf_align < align)
2383 buf_align = align;
2384 buf_sz = ALIGN(buf_sz, align);
2385 buf_sz += sechdrs[i].sh_size;
2386 } else {
2387 /* bss section */
2388 if (bss_align < align)
2389 bss_align = align;
2390 bss_sz = ALIGN(bss_sz, align);
2391 bss_sz += sechdrs[i].sh_size;
2392 }
2393 }
2394
2395 /* Determine the bss padding required to align bss properly */
2396 bss_pad = 0;
2397 if (buf_sz & (bss_align - 1))
2398 bss_pad = bss_align - (buf_sz & (bss_align - 1));
2399
2400 memsz = buf_sz + bss_pad + bss_sz;
2401
2402 /* Allocate buffer for purgatory */
2403 purgatory_buf = vzalloc(buf_sz);
2404 if (!purgatory_buf) {
2405 ret = -ENOMEM;
2406 goto out;
2407 }
2408
2409 if (buf_align < bss_align)
2410 buf_align = bss_align;
2411
2412 /* Add buffer to segment list */
2413 ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
2414 buf_align, min, max, top_down,
2415 &pi->purgatory_load_addr);
2416 if (ret)
2417 goto out;
2418
2419 /* Load SHF_ALLOC sections */
2420 buf_addr = purgatory_buf;
2421 load_addr = curr_load_addr = pi->purgatory_load_addr;
2422 bss_addr = load_addr + buf_sz + bss_pad;
2423
2424 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2425 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2426 continue;
2427
2428 align = sechdrs[i].sh_addralign;
2429 if (sechdrs[i].sh_type != SHT_NOBITS) {
2430 curr_load_addr = ALIGN(curr_load_addr, align);
2431 offset = curr_load_addr - load_addr;
 2432 /* We already modified ->sh_offset to keep the src addr */
2433 src = (char *) sechdrs[i].sh_offset;
2434 memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
2435
2436 /* Store load address and source address of section */
2437 sechdrs[i].sh_addr = curr_load_addr;
2438
2439 /*
2440 * This section got copied to temporary buffer. Update
2441 * ->sh_offset accordingly.
2442 */
2443 sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
2444
2445 /* Advance to the next address */
2446 curr_load_addr += sechdrs[i].sh_size;
2447 } else {
2448 bss_addr = ALIGN(bss_addr, align);
2449 sechdrs[i].sh_addr = bss_addr;
2450 bss_addr += sechdrs[i].sh_size;
2451 }
2452 }
2453
2454 /* Update entry point based on load address of text section */
2455 if (entry_sidx >= 0)
2456 entry += sechdrs[entry_sidx].sh_addr;
2457
2458 /* Make kernel jump to purgatory after shutdown */
2459 image->start = entry;
2460
2461 /* Used later to get/set symbol values */
2462 pi->sechdrs = sechdrs;
2463
2464 /*
2465 * Used later to identify which section is purgatory and skip it
2466 * from checksumming.
2467 */
2468 pi->purgatory_buf = purgatory_buf;
2469 return ret;
2470out:
2471 vfree(sechdrs);
2472 vfree(purgatory_buf);
2473 return ret;
2474}
2475
2476static int kexec_apply_relocations(struct kimage *image)
2477{
2478 int i, ret;
2479 struct purgatory_info *pi = &image->purgatory_info;
2480 Elf_Shdr *sechdrs = pi->sechdrs;
2481
2482 /* Apply relocations */
2483 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2484 Elf_Shdr *section, *symtab;
2485
2486 if (sechdrs[i].sh_type != SHT_RELA &&
2487 sechdrs[i].sh_type != SHT_REL)
2488 continue;
2489
2490 /*
 2491 * For a section of type SHT_RELA/SHT_REL,
 2492 * ->sh_link contains the section header index of the associated
 2493 * symbol table, and ->sh_info contains the section header
 2494 * index of the section to which the relocations apply.
2495 */
2496 if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
2497 sechdrs[i].sh_link >= pi->ehdr->e_shnum)
2498 return -ENOEXEC;
2499
2500 section = &sechdrs[sechdrs[i].sh_info];
2501 symtab = &sechdrs[sechdrs[i].sh_link];
2502
2503 if (!(section->sh_flags & SHF_ALLOC))
2504 continue;
2505
2506 /*
 2507 * symtab->sh_link contains the section header index of the
 2508 * associated string table.
2509 */
2510 if (symtab->sh_link >= pi->ehdr->e_shnum)
2511 /* Invalid section number? */
2512 continue;
2513
2514 /*
 2515 * The respective architecture needs to provide support for applying
2516 * relocations of type SHT_RELA/SHT_REL.
2517 */
2518 if (sechdrs[i].sh_type == SHT_RELA)
2519 ret = arch_kexec_apply_relocations_add(pi->ehdr,
2520 sechdrs, i);
2521 else if (sechdrs[i].sh_type == SHT_REL)
2522 ret = arch_kexec_apply_relocations(pi->ehdr,
2523 sechdrs, i);
2524 if (ret)
2525 return ret;
2526 }
2527
2528 return 0;
2529}
2530
2531/* Load relocatable purgatory object and relocate it appropriately */
2532int kexec_load_purgatory(struct kimage *image, unsigned long min,
2533 unsigned long max, int top_down,
2534 unsigned long *load_addr)
2535{
2536 struct purgatory_info *pi = &image->purgatory_info;
2537 int ret;
2538
2539 if (kexec_purgatory_size <= 0)
2540 return -EINVAL;
2541
2542 if (kexec_purgatory_size < sizeof(Elf_Ehdr))
2543 return -ENOEXEC;
2544
2545 pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
2546
2547 if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
2548 || pi->ehdr->e_type != ET_REL
2549 || !elf_check_arch(pi->ehdr)
2550 || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
2551 return -ENOEXEC;
2552
2553 if (pi->ehdr->e_shoff >= kexec_purgatory_size
2554 || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
2555 kexec_purgatory_size - pi->ehdr->e_shoff))
2556 return -ENOEXEC;
2557
2558 ret = __kexec_load_purgatory(image, min, max, top_down);
2559 if (ret)
2560 return ret;
2561
2562 ret = kexec_apply_relocations(image);
2563 if (ret)
2564 goto out;
2565
2566 *load_addr = pi->purgatory_load_addr;
2567 return 0;
2568out:
2569 vfree(pi->sechdrs);
2570 vfree(pi->purgatory_buf);
2571 return ret;
2572}
2573
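A hedged sketch of the expected call sequence from an architecture loader follows (not from this patch): load purgatory into the image, then patch its symbols with kexec_purgatory_get_set_symbol(), defined further down in this file. The "stack_end" symbol and the address bounds are purely illustrative.

static int example_setup_purgatory(struct kimage *image,
				   unsigned long min, unsigned long max)
{
	unsigned long purgatory_load_addr;
	unsigned long stack_top = 0;	/* hypothetical value to patch in */
	int ret;

	ret = kexec_load_purgatory(image, min, max, 1 /* top_down */,
				   &purgatory_load_addr);
	if (ret)
		return ret;

	/* get_value == 0: write the buffer into the purgatory symbol */
	return kexec_purgatory_get_set_symbol(image, "stack_end", &stack_top,
					      sizeof(stack_top), 0);
}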
2574static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
2575 const char *name)
2576{
2577 Elf_Sym *syms;
2578 Elf_Shdr *sechdrs;
2579 Elf_Ehdr *ehdr;
2580 int i, k;
2581 const char *strtab;
2582
2583 if (!pi->sechdrs || !pi->ehdr)
2584 return NULL;
2585
2586 sechdrs = pi->sechdrs;
2587 ehdr = pi->ehdr;
2588
2589 for (i = 0; i < ehdr->e_shnum; i++) {
2590 if (sechdrs[i].sh_type != SHT_SYMTAB)
2591 continue;
2592
2593 if (sechdrs[i].sh_link >= ehdr->e_shnum)
2594 /* Invalid strtab section number */
2595 continue;
2596 strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
2597 syms = (Elf_Sym *)sechdrs[i].sh_offset;
2598
2599 /* Go through symbols for a match */
2600 for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
2601 if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
2602 continue;
2603
2604 if (strcmp(strtab + syms[k].st_name, name) != 0)
2605 continue;
2606
2607 if (syms[k].st_shndx == SHN_UNDEF ||
2608 syms[k].st_shndx >= ehdr->e_shnum) {
2609 pr_debug("Symbol: %s has bad section index %d.\n",
2610 name, syms[k].st_shndx);
2611 return NULL;
2612 }
2613
2614 /* Found the symbol we are looking for */
2615 return &syms[k];
2616 }
2617 }
2618
2619 return NULL;
2620}
2621
2622void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
2623{
2624 struct purgatory_info *pi = &image->purgatory_info;
2625 Elf_Sym *sym;
2626 Elf_Shdr *sechdr;
2627
2628 sym = kexec_purgatory_find_symbol(pi, name);
2629 if (!sym)
2630 return ERR_PTR(-EINVAL);
2631
2632 sechdr = &pi->sechdrs[sym->st_shndx];
2633
2634 /*
2635 * Returns the address where symbol will finally be loaded after
2636 * kexec_load_segment()
2637 */
2638 return (void *)(sechdr->sh_addr + sym->st_value);
2639}
2640
2641/*
2642 * Get or set value of a symbol. If "get_value" is true, symbol value is
2643 * returned in buf otherwise symbol value is set based on value in buf.
2644 */
2645int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
2646 void *buf, unsigned int size, bool get_value)
2647{
2648 Elf_Sym *sym;
2649 Elf_Shdr *sechdrs;
2650 struct purgatory_info *pi = &image->purgatory_info;
2651 char *sym_buf;
2652
2653 sym = kexec_purgatory_find_symbol(pi, name);
2654 if (!sym)
2655 return -EINVAL;
2656
2657 if (sym->st_size != size) {
2658 pr_err("symbol %s size mismatch: expected %lu actual %u\n",
2659 name, (unsigned long)sym->st_size, size);
2660 return -EINVAL;
2661 }
2662
2663 sechdrs = pi->sechdrs;
2664
2665 if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
2666 pr_err("symbol %s is in a bss section. Cannot %s\n", name,
2667 get_value ? "get" : "set");
2668 return -EINVAL;
2669 }
2670
2671 sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
2672 sym->st_value;
2673
2674 if (get_value)
2675 memcpy((void *)buf, sym_buf, size);
2676 else
2677 memcpy((void *)sym_buf, buf, size);
2678
2679 return 0;
2680}
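For the "get" direction of the helper above, a small hedged example (not from this patch) reads back the digest that kexec_calculate_store_digests() stored, purely for debugging; it assumes it runs before the temporary purgatory buffers are freed.

static void example_dump_digest(struct kimage *image)
{
	u8 digest[SHA256_DIGEST_SIZE];

	if (!kexec_purgatory_get_set_symbol(image, "sha256_digest", digest,
					    sizeof(digest), 1 /* get */))
		print_hex_dump_bytes("kexec digest: ", DUMP_PREFIX_OFFSET,
				     digest, sizeof(digest));
}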
2681#endif /* CONFIG_KEXEC_FILE */
2682
1635/* 2683/*
1636 * Move into place and start executing a preloaded standalone 2684 * Move into place and start executing a preloaded standalone
1637 * executable. If nothing was preloaded return an error. 2685 * executable. If nothing was preloaded return an error.
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8637e041a247..80f7a6d00519 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -196,12 +196,34 @@ int __request_module(bool wait, const char *fmt, ...)
196EXPORT_SYMBOL(__request_module); 196EXPORT_SYMBOL(__request_module);
197#endif /* CONFIG_MODULES */ 197#endif /* CONFIG_MODULES */
198 198
199static void call_usermodehelper_freeinfo(struct subprocess_info *info)
200{
201 if (info->cleanup)
202 (*info->cleanup)(info);
203 kfree(info);
204}
205
206static void umh_complete(struct subprocess_info *sub_info)
207{
208 struct completion *comp = xchg(&sub_info->complete, NULL);
209 /*
 210 * See call_usermodehelper_exec(). If xchg() returns NULL,
 211 * we own sub_info: either the UMH_KILLABLE caller has gone away
 212 * or the caller used UMH_NO_WAIT.
213 */
214 if (comp)
215 complete(comp);
216 else
217 call_usermodehelper_freeinfo(sub_info);
218}
219
199/* 220/*
200 * This is the task which runs the usermode application 221 * This is the task which runs the usermode application
201 */ 222 */
202static int ____call_usermodehelper(void *data) 223static int ____call_usermodehelper(void *data)
203{ 224{
204 struct subprocess_info *sub_info = data; 225 struct subprocess_info *sub_info = data;
226 int wait = sub_info->wait & ~UMH_KILLABLE;
205 struct cred *new; 227 struct cred *new;
206 int retval; 228 int retval;
207 229
@@ -221,7 +243,7 @@ static int ____call_usermodehelper(void *data)
221 retval = -ENOMEM; 243 retval = -ENOMEM;
222 new = prepare_kernel_cred(current); 244 new = prepare_kernel_cred(current);
223 if (!new) 245 if (!new)
224 goto fail; 246 goto out;
225 247
226 spin_lock(&umh_sysctl_lock); 248 spin_lock(&umh_sysctl_lock);
227 new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); 249 new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
@@ -233,7 +255,7 @@ static int ____call_usermodehelper(void *data)
233 retval = sub_info->init(sub_info, new); 255 retval = sub_info->init(sub_info, new);
234 if (retval) { 256 if (retval) {
235 abort_creds(new); 257 abort_creds(new);
236 goto fail; 258 goto out;
237 } 259 }
238 } 260 }
239 261
@@ -242,12 +264,13 @@ static int ____call_usermodehelper(void *data)
242 retval = do_execve(getname_kernel(sub_info->path), 264 retval = do_execve(getname_kernel(sub_info->path),
243 (const char __user *const __user *)sub_info->argv, 265 (const char __user *const __user *)sub_info->argv,
244 (const char __user *const __user *)sub_info->envp); 266 (const char __user *const __user *)sub_info->envp);
267out:
268 sub_info->retval = retval;
 269 /* wait_for_helper() will call umh_complete if UMH_WAIT_PROC. */
270 if (wait != UMH_WAIT_PROC)
271 umh_complete(sub_info);
245 if (!retval) 272 if (!retval)
246 return 0; 273 return 0;
247
248 /* Exec failed? */
249fail:
250 sub_info->retval = retval;
251 do_exit(0); 274 do_exit(0);
252} 275}
253 276
@@ -258,26 +281,6 @@ static int call_helper(void *data)
258 return ____call_usermodehelper(data); 281 return ____call_usermodehelper(data);
259} 282}
260 283
261static void call_usermodehelper_freeinfo(struct subprocess_info *info)
262{
263 if (info->cleanup)
264 (*info->cleanup)(info);
265 kfree(info);
266}
267
268static void umh_complete(struct subprocess_info *sub_info)
269{
270 struct completion *comp = xchg(&sub_info->complete, NULL);
271 /*
272 * See call_usermodehelper_exec(). If xchg() returns NULL
273 * we own sub_info, the UMH_KILLABLE caller has gone away.
274 */
275 if (comp)
276 complete(comp);
277 else
278 call_usermodehelper_freeinfo(sub_info);
279}
280
281/* Keventd can't block, but this (a child) can. */ 284/* Keventd can't block, but this (a child) can. */
282static int wait_for_helper(void *data) 285static int wait_for_helper(void *data)
283{ 286{
@@ -336,18 +339,8 @@ static void __call_usermodehelper(struct work_struct *work)
336 kmod_thread_locker = NULL; 339 kmod_thread_locker = NULL;
337 } 340 }
338 341
339 switch (wait) { 342 if (pid < 0) {
340 case UMH_NO_WAIT: 343 sub_info->retval = pid;
341 call_usermodehelper_freeinfo(sub_info);
342 break;
343
344 case UMH_WAIT_PROC:
345 if (pid > 0)
346 break;
347 /* FALLTHROUGH */
348 case UMH_WAIT_EXEC:
349 if (pid < 0)
350 sub_info->retval = pid;
351 umh_complete(sub_info); 344 umh_complete(sub_info);
352 } 345 }
353} 346}
@@ -588,7 +581,12 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
588 goto out; 581 goto out;
589 } 582 }
590 583
591 sub_info->complete = &done; 584 /*
585 * Set the completion pointer only if there is a waiter.
586 * This makes it possible to use umh_complete to free
587 * the data structure in case of UMH_NO_WAIT.
588 */
589 sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
592 sub_info->wait = wait; 590 sub_info->wait = wait;
593 591
594 queue_work(khelper_wq, &sub_info->work); 592 queue_work(khelper_wq, &sub_info->work);
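To see why the NULL completion pointer matters, consider a hedged fire-and-forget caller (not from this patch; the helper path and arguments are invented). With UMH_NO_WAIT, sub_info->complete stays NULL, so umh_complete() frees the subprocess_info instead of signalling a waiter that no longer exists.

static void example_fire_and_forget(void)
{
	char *argv[] = { "/sbin/example-agent", "--notify", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };

	/* No completion is set up for UMH_NO_WAIT; cleanup happens in umh_complete(). */
	call_usermodehelper(argv[0], argv, envp, UMH_NO_WAIT);
}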
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 734e9a7d280b..3995f546d0f3 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1778,7 +1778,18 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
1778 unsigned long hash, flags = 0; 1778 unsigned long hash, flags = 0;
1779 struct kretprobe_instance *ri; 1779 struct kretprobe_instance *ri;
1780 1780
1781 /*TODO: consider to only swap the RA after the last pre_handler fired */ 1781 /*
1782 * To avoid deadlocks, prohibit return probing in NMI contexts,
1783 * just skip the probe and increase the (inexact) 'nmissed'
1784 * statistical counter, so that the user is informed that
1785 * something happened:
1786 */
1787 if (unlikely(in_nmi())) {
1788 rp->nmissed++;
1789 return 0;
1790 }
1791
1792 /* TODO: consider to only swap the RA after the last pre_handler fired */
1782 hash = hash_ptr(current, KPROBE_HASH_BITS); 1793 hash = hash_ptr(current, KPROBE_HASH_BITS);
1783 raw_spin_lock_irqsave(&rp->lock, flags); 1794 raw_spin_lock_irqsave(&rp->lock, flags);
1784 if (!hlist_empty(&rp->free_instances)) { 1795 if (!hlist_empty(&rp->free_instances)) {
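The new nmissed bump is visible to any kretprobe user; a minimal hedged sketch (not from this patch, and the probed symbol is only illustrative) shows where the counter surfaces:

static int example_ret_handler(struct kretprobe_instance *ri,
			       struct pt_regs *regs)
{
	return 0;
}

static struct kretprobe example_rp = {
	.handler	= example_ret_handler,
	.kp.symbol_name	= "do_fork",
	.maxactive	= 16,
};

/* After register_kretprobe(&example_rp) has run for a while:
 *	pr_info("missed %d returns\n", example_rp.nmissed);
 * and unregister_kretprobe(&example_rp) when done. */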
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ef483220e855..10e489c448fe 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
369{ 369{
370 struct task_struct *p; 370 struct task_struct *p;
371 371
372 p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt, 372 p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
373 cpu); 373 cpu);
374 if (IS_ERR(p)) 374 if (IS_ERR(p))
375 return p; 375 return p;
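The change above only alters which NUMA node is passed to kthread_create_on_node(); callers use the function exactly as before. A hedged usage sketch (not from this patch; the thread function and name format are illustrative):

static int example_cpu_worker(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

/* struct task_struct *t =
 *	kthread_create_on_cpu(example_cpu_worker, NULL, cpu, "example/%u");
 * if (!IS_ERR(t))
 *	wake_up_process(t);	the thread starts stopped until woken */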
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 0955b885d0dc..ec8cce259779 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -20,30 +20,20 @@
20 * Author: Paul E. McKenney <paulmck@us.ibm.com> 20 * Author: Paul E. McKenney <paulmck@us.ibm.com>
21 * Based on kernel/rcu/torture.c. 21 * Based on kernel/rcu/torture.c.
22 */ 22 */
23#include <linux/types.h>
24#include <linux/kernel.h> 23#include <linux/kernel.h>
25#include <linux/init.h>
26#include <linux/module.h> 24#include <linux/module.h>
27#include <linux/kthread.h> 25#include <linux/kthread.h>
28#include <linux/err.h>
29#include <linux/spinlock.h> 26#include <linux/spinlock.h>
27#include <linux/rwlock.h>
28#include <linux/mutex.h>
29#include <linux/rwsem.h>
30#include <linux/smp.h> 30#include <linux/smp.h>
31#include <linux/interrupt.h> 31#include <linux/interrupt.h>
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/atomic.h> 33#include <linux/atomic.h>
34#include <linux/bitops.h>
35#include <linux/completion.h>
36#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
37#include <linux/percpu.h>
38#include <linux/notifier.h>
39#include <linux/reboot.h>
40#include <linux/freezer.h>
41#include <linux/cpu.h>
42#include <linux/delay.h> 35#include <linux/delay.h>
43#include <linux/stat.h>
44#include <linux/slab.h> 36#include <linux/slab.h>
45#include <linux/trace_clock.h>
46#include <asm/byteorder.h>
47#include <linux/torture.h> 37#include <linux/torture.h>
48 38
49MODULE_LICENSE("GPL"); 39MODULE_LICENSE("GPL");
@@ -51,6 +41,8 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
51 41
52torture_param(int, nwriters_stress, -1, 42torture_param(int, nwriters_stress, -1,
53 "Number of write-locking stress-test threads"); 43 "Number of write-locking stress-test threads");
44torture_param(int, nreaders_stress, -1,
45 "Number of read-locking stress-test threads");
54torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); 46torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
55torture_param(int, onoff_interval, 0, 47torture_param(int, onoff_interval, 0,
56 "Time between CPU hotplugs (s), 0=disable"); 48 "Time between CPU hotplugs (s), 0=disable");
@@ -66,30 +58,28 @@ torture_param(bool, verbose, true,
66static char *torture_type = "spin_lock"; 58static char *torture_type = "spin_lock";
67module_param(torture_type, charp, 0444); 59module_param(torture_type, charp, 0444);
68MODULE_PARM_DESC(torture_type, 60MODULE_PARM_DESC(torture_type,
69 "Type of lock to torture (spin_lock, spin_lock_irq, ...)"); 61 "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
70
71static atomic_t n_lock_torture_errors;
72 62
73static struct task_struct *stats_task; 63static struct task_struct *stats_task;
74static struct task_struct **writer_tasks; 64static struct task_struct **writer_tasks;
65static struct task_struct **reader_tasks;
75 66
76static int nrealwriters_stress;
77static bool lock_is_write_held; 67static bool lock_is_write_held;
68static bool lock_is_read_held;
78 69
79struct lock_writer_stress_stats { 70struct lock_stress_stats {
80 long n_write_lock_fail; 71 long n_lock_fail;
81 long n_write_lock_acquired; 72 long n_lock_acquired;
82}; 73};
83static struct lock_writer_stress_stats *lwsa;
84 74
85#if defined(MODULE) 75#if defined(MODULE)
86#define LOCKTORTURE_RUNNABLE_INIT 1 76#define LOCKTORTURE_RUNNABLE_INIT 1
87#else 77#else
88#define LOCKTORTURE_RUNNABLE_INIT 0 78#define LOCKTORTURE_RUNNABLE_INIT 0
89#endif 79#endif
90int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; 80int torture_runnable = LOCKTORTURE_RUNNABLE_INIT;
91module_param(locktorture_runnable, int, 0444); 81module_param(torture_runnable, int, 0444);
92MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init"); 82MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init");
93 83
94/* Forward reference. */ 84/* Forward reference. */
95static void lock_torture_cleanup(void); 85static void lock_torture_cleanup(void);
@@ -102,12 +92,25 @@ struct lock_torture_ops {
102 int (*writelock)(void); 92 int (*writelock)(void);
103 void (*write_delay)(struct torture_random_state *trsp); 93 void (*write_delay)(struct torture_random_state *trsp);
104 void (*writeunlock)(void); 94 void (*writeunlock)(void);
95 int (*readlock)(void);
96 void (*read_delay)(struct torture_random_state *trsp);
97 void (*readunlock)(void);
105 unsigned long flags; 98 unsigned long flags;
106 const char *name; 99 const char *name;
107}; 100};
108 101
109static struct lock_torture_ops *cur_ops; 102struct lock_torture_cxt {
110 103 int nrealwriters_stress;
104 int nrealreaders_stress;
105 bool debug_lock;
106 atomic_t n_lock_torture_errors;
107 struct lock_torture_ops *cur_ops;
108 struct lock_stress_stats *lwsa; /* writer statistics */
109 struct lock_stress_stats *lrsa; /* reader statistics */
110};
111static struct lock_torture_cxt cxt = { 0, 0, false,
112 ATOMIC_INIT(0),
113 NULL, NULL};
111/* 114/*
112 * Definitions for lock torture testing. 115 * Definitions for lock torture testing.
113 */ 116 */
@@ -123,10 +126,10 @@ static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
123 126
124 /* We want a long delay occasionally to force massive contention. */ 127 /* We want a long delay occasionally to force massive contention. */
125 if (!(torture_random(trsp) % 128 if (!(torture_random(trsp) %
126 (nrealwriters_stress * 2000 * longdelay_us))) 129 (cxt.nrealwriters_stress * 2000 * longdelay_us)))
127 mdelay(longdelay_us); 130 mdelay(longdelay_us);
128#ifdef CONFIG_PREEMPT 131#ifdef CONFIG_PREEMPT
129 if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) 132 if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
130 preempt_schedule(); /* Allow test to be preempted. */ 133 preempt_schedule(); /* Allow test to be preempted. */
131#endif 134#endif
132} 135}
@@ -140,6 +143,9 @@ static struct lock_torture_ops lock_busted_ops = {
140 .writelock = torture_lock_busted_write_lock, 143 .writelock = torture_lock_busted_write_lock,
141 .write_delay = torture_lock_busted_write_delay, 144 .write_delay = torture_lock_busted_write_delay,
142 .writeunlock = torture_lock_busted_write_unlock, 145 .writeunlock = torture_lock_busted_write_unlock,
146 .readlock = NULL,
147 .read_delay = NULL,
148 .readunlock = NULL,
143 .name = "lock_busted" 149 .name = "lock_busted"
144}; 150};
145 151
@@ -160,13 +166,13 @@ static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
160 * we want a long delay occasionally to force massive contention. 166 * we want a long delay occasionally to force massive contention.
161 */ 167 */
162 if (!(torture_random(trsp) % 168 if (!(torture_random(trsp) %
163 (nrealwriters_stress * 2000 * longdelay_us))) 169 (cxt.nrealwriters_stress * 2000 * longdelay_us)))
164 mdelay(longdelay_us); 170 mdelay(longdelay_us);
165 if (!(torture_random(trsp) % 171 if (!(torture_random(trsp) %
166 (nrealwriters_stress * 2 * shortdelay_us))) 172 (cxt.nrealwriters_stress * 2 * shortdelay_us)))
167 udelay(shortdelay_us); 173 udelay(shortdelay_us);
168#ifdef CONFIG_PREEMPT 174#ifdef CONFIG_PREEMPT
169 if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) 175 if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
170 preempt_schedule(); /* Allow test to be preempted. */ 176 preempt_schedule(); /* Allow test to be preempted. */
171#endif 177#endif
172} 178}
@@ -180,39 +186,253 @@ static struct lock_torture_ops spin_lock_ops = {
180 .writelock = torture_spin_lock_write_lock, 186 .writelock = torture_spin_lock_write_lock,
181 .write_delay = torture_spin_lock_write_delay, 187 .write_delay = torture_spin_lock_write_delay,
182 .writeunlock = torture_spin_lock_write_unlock, 188 .writeunlock = torture_spin_lock_write_unlock,
189 .readlock = NULL,
190 .read_delay = NULL,
191 .readunlock = NULL,
183 .name = "spin_lock" 192 .name = "spin_lock"
184}; 193};
185 194
186static int torture_spin_lock_write_lock_irq(void) 195static int torture_spin_lock_write_lock_irq(void)
187__acquires(torture_spinlock_irq) 196__acquires(torture_spinlock)
188{ 197{
189 unsigned long flags; 198 unsigned long flags;
190 199
191 spin_lock_irqsave(&torture_spinlock, flags); 200 spin_lock_irqsave(&torture_spinlock, flags);
192 cur_ops->flags = flags; 201 cxt.cur_ops->flags = flags;
193 return 0; 202 return 0;
194} 203}
195 204
196static void torture_lock_spin_write_unlock_irq(void) 205static void torture_lock_spin_write_unlock_irq(void)
197__releases(torture_spinlock) 206__releases(torture_spinlock)
198{ 207{
199 spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags); 208 spin_unlock_irqrestore(&torture_spinlock, cxt.cur_ops->flags);
200} 209}
201 210
202static struct lock_torture_ops spin_lock_irq_ops = { 211static struct lock_torture_ops spin_lock_irq_ops = {
203 .writelock = torture_spin_lock_write_lock_irq, 212 .writelock = torture_spin_lock_write_lock_irq,
204 .write_delay = torture_spin_lock_write_delay, 213 .write_delay = torture_spin_lock_write_delay,
205 .writeunlock = torture_lock_spin_write_unlock_irq, 214 .writeunlock = torture_lock_spin_write_unlock_irq,
215 .readlock = NULL,
216 .read_delay = NULL,
217 .readunlock = NULL,
206 .name = "spin_lock_irq" 218 .name = "spin_lock_irq"
207}; 219};
208 220
221static DEFINE_RWLOCK(torture_rwlock);
222
223static int torture_rwlock_write_lock(void) __acquires(torture_rwlock)
224{
225 write_lock(&torture_rwlock);
226 return 0;
227}
228
229static void torture_rwlock_write_delay(struct torture_random_state *trsp)
230{
231 const unsigned long shortdelay_us = 2;
232 const unsigned long longdelay_ms = 100;
233
234 /* We want a short delay mostly to emulate likely code, and
235 * we want a long delay occasionally to force massive contention.
236 */
237 if (!(torture_random(trsp) %
238 (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
239 mdelay(longdelay_ms);
240 else
241 udelay(shortdelay_us);
242}
243
244static void torture_rwlock_write_unlock(void) __releases(torture_rwlock)
245{
246 write_unlock(&torture_rwlock);
247}
248
249static int torture_rwlock_read_lock(void) __acquires(torture_rwlock)
250{
251 read_lock(&torture_rwlock);
252 return 0;
253}
254
255static void torture_rwlock_read_delay(struct torture_random_state *trsp)
256{
257 const unsigned long shortdelay_us = 10;
258 const unsigned long longdelay_ms = 100;
259
260 /* We want a short delay mostly to emulate likely code, and
261 * we want a long delay occasionally to force massive contention.
262 */
263 if (!(torture_random(trsp) %
264 (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
265 mdelay(longdelay_ms);
266 else
267 udelay(shortdelay_us);
268}
269
270static void torture_rwlock_read_unlock(void) __releases(torture_rwlock)
271{
272 read_unlock(&torture_rwlock);
273}
274
275static struct lock_torture_ops rw_lock_ops = {
276 .writelock = torture_rwlock_write_lock,
277 .write_delay = torture_rwlock_write_delay,
278 .writeunlock = torture_rwlock_write_unlock,
279 .readlock = torture_rwlock_read_lock,
280 .read_delay = torture_rwlock_read_delay,
281 .readunlock = torture_rwlock_read_unlock,
282 .name = "rw_lock"
283};
284
285static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock)
286{
287 unsigned long flags;
288
289 write_lock_irqsave(&torture_rwlock, flags);
290 cxt.cur_ops->flags = flags;
291 return 0;
292}
293
294static void torture_rwlock_write_unlock_irq(void)
295__releases(torture_rwlock)
296{
297 write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
298}
299
300static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
301{
302 unsigned long flags;
303
304 read_lock_irqsave(&torture_rwlock, flags);
305 cxt.cur_ops->flags = flags;
306 return 0;
307}
308
309static void torture_rwlock_read_unlock_irq(void)
310__releases(torture_rwlock)
311{
 312 read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
313}
314
315static struct lock_torture_ops rw_lock_irq_ops = {
316 .writelock = torture_rwlock_write_lock_irq,
317 .write_delay = torture_rwlock_write_delay,
318 .writeunlock = torture_rwlock_write_unlock_irq,
319 .readlock = torture_rwlock_read_lock_irq,
320 .read_delay = torture_rwlock_read_delay,
321 .readunlock = torture_rwlock_read_unlock_irq,
322 .name = "rw_lock_irq"
323};
324
325static DEFINE_MUTEX(torture_mutex);
326
327static int torture_mutex_lock(void) __acquires(torture_mutex)
328{
329 mutex_lock(&torture_mutex);
330 return 0;
331}
332
333static void torture_mutex_delay(struct torture_random_state *trsp)
334{
335 const unsigned long longdelay_ms = 100;
336
337 /* We want a long delay occasionally to force massive contention. */
338 if (!(torture_random(trsp) %
339 (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
340 mdelay(longdelay_ms * 5);
341 else
342 mdelay(longdelay_ms / 5);
343#ifdef CONFIG_PREEMPT
344 if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
345 preempt_schedule(); /* Allow test to be preempted. */
346#endif
347}
348
349static void torture_mutex_unlock(void) __releases(torture_mutex)
350{
351 mutex_unlock(&torture_mutex);
352}
353
354static struct lock_torture_ops mutex_lock_ops = {
355 .writelock = torture_mutex_lock,
356 .write_delay = torture_mutex_delay,
357 .writeunlock = torture_mutex_unlock,
358 .readlock = NULL,
359 .read_delay = NULL,
360 .readunlock = NULL,
361 .name = "mutex_lock"
362};
363
364static DECLARE_RWSEM(torture_rwsem);
365static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
366{
367 down_write(&torture_rwsem);
368 return 0;
369}
370
371static void torture_rwsem_write_delay(struct torture_random_state *trsp)
372{
373 const unsigned long longdelay_ms = 100;
374
375 /* We want a long delay occasionally to force massive contention. */
376 if (!(torture_random(trsp) %
377 (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
378 mdelay(longdelay_ms * 10);
379 else
380 mdelay(longdelay_ms / 10);
381#ifdef CONFIG_PREEMPT
382 if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
383 preempt_schedule(); /* Allow test to be preempted. */
384#endif
385}
386
387static void torture_rwsem_up_write(void) __releases(torture_rwsem)
388{
389 up_write(&torture_rwsem);
390}
391
392static int torture_rwsem_down_read(void) __acquires(torture_rwsem)
393{
394 down_read(&torture_rwsem);
395 return 0;
396}
397
398static void torture_rwsem_read_delay(struct torture_random_state *trsp)
399{
400 const unsigned long longdelay_ms = 100;
401
402 /* We want a long delay occasionally to force massive contention. */
403 if (!(torture_random(trsp) %
404 (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
405 mdelay(longdelay_ms * 2);
406 else
407 mdelay(longdelay_ms / 2);
408#ifdef CONFIG_PREEMPT
409 if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000)))
410 preempt_schedule(); /* Allow test to be preempted. */
411#endif
412}
413
414static void torture_rwsem_up_read(void) __releases(torture_rwsem)
415{
416 up_read(&torture_rwsem);
417}
418
419static struct lock_torture_ops rwsem_lock_ops = {
420 .writelock = torture_rwsem_down_write,
421 .write_delay = torture_rwsem_write_delay,
422 .writeunlock = torture_rwsem_up_write,
423 .readlock = torture_rwsem_down_read,
424 .read_delay = torture_rwsem_read_delay,
425 .readunlock = torture_rwsem_up_read,
426 .name = "rwsem_lock"
427};
428
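The ops table is the extension point for new lock types; everything else in the module is generic. As a hedged illustration (not part of this patch), a hypothetical trylock-based variant would only need the three write hooks, leaving the read hooks NULL like the other writer-only types above:

static DEFINE_SPINLOCK(example_trylock);

static int example_try_write_lock(void) __acquires(example_trylock)
{
	while (!spin_trylock(&example_trylock))
		cpu_relax();	/* spin until the lock is ours */
	return 0;
}

static void example_try_write_unlock(void) __releases(example_trylock)
{
	spin_unlock(&example_trylock);
}

static struct lock_torture_ops example_trylock_ops = {
	.writelock	= example_try_write_lock,
	.write_delay	= torture_spin_lock_write_delay,
	.writeunlock	= example_try_write_unlock,
	.readlock	= NULL,
	.read_delay	= NULL,
	.readunlock	= NULL,
	.name		= "example_trylock"
};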
209/* 429/*
210 * Lock torture writer kthread. Repeatedly acquires and releases 430 * Lock torture writer kthread. Repeatedly acquires and releases
211 * the lock, checking for duplicate acquisitions. 431 * the lock, checking for duplicate acquisitions.
212 */ 432 */
213static int lock_torture_writer(void *arg) 433static int lock_torture_writer(void *arg)
214{ 434{
215 struct lock_writer_stress_stats *lwsp = arg; 435 struct lock_stress_stats *lwsp = arg;
216 static DEFINE_TORTURE_RANDOM(rand); 436 static DEFINE_TORTURE_RANDOM(rand);
217 437
218 VERBOSE_TOROUT_STRING("lock_torture_writer task started"); 438 VERBOSE_TOROUT_STRING("lock_torture_writer task started");
@@ -221,14 +441,19 @@ static int lock_torture_writer(void *arg)
221 do { 441 do {
222 if ((torture_random(&rand) & 0xfffff) == 0) 442 if ((torture_random(&rand) & 0xfffff) == 0)
223 schedule_timeout_uninterruptible(1); 443 schedule_timeout_uninterruptible(1);
224 cur_ops->writelock(); 444
445 cxt.cur_ops->writelock();
225 if (WARN_ON_ONCE(lock_is_write_held)) 446 if (WARN_ON_ONCE(lock_is_write_held))
226 lwsp->n_write_lock_fail++; 447 lwsp->n_lock_fail++;
227 lock_is_write_held = 1; 448 lock_is_write_held = 1;
228 lwsp->n_write_lock_acquired++; 449 if (WARN_ON_ONCE(lock_is_read_held))
229 cur_ops->write_delay(&rand); 450 lwsp->n_lock_fail++; /* rare, but... */
451
452 lwsp->n_lock_acquired++;
453 cxt.cur_ops->write_delay(&rand);
230 lock_is_write_held = 0; 454 lock_is_write_held = 0;
231 cur_ops->writeunlock(); 455 cxt.cur_ops->writeunlock();
456
232 stutter_wait("lock_torture_writer"); 457 stutter_wait("lock_torture_writer");
233 } while (!torture_must_stop()); 458 } while (!torture_must_stop());
234 torture_kthread_stopping("lock_torture_writer"); 459 torture_kthread_stopping("lock_torture_writer");
@@ -236,32 +461,66 @@ static int lock_torture_writer(void *arg)
236} 461}
237 462
238/* 463/*
464 * Lock torture reader kthread. Repeatedly acquires and releases
465 * the reader lock.
466 */
467static int lock_torture_reader(void *arg)
468{
469 struct lock_stress_stats *lrsp = arg;
470 static DEFINE_TORTURE_RANDOM(rand);
471
472 VERBOSE_TOROUT_STRING("lock_torture_reader task started");
473 set_user_nice(current, MAX_NICE);
474
475 do {
476 if ((torture_random(&rand) & 0xfffff) == 0)
477 schedule_timeout_uninterruptible(1);
478
479 cxt.cur_ops->readlock();
480 lock_is_read_held = 1;
481 if (WARN_ON_ONCE(lock_is_write_held))
482 lrsp->n_lock_fail++; /* rare, but... */
483
484 lrsp->n_lock_acquired++;
485 cxt.cur_ops->read_delay(&rand);
486 lock_is_read_held = 0;
487 cxt.cur_ops->readunlock();
488
489 stutter_wait("lock_torture_reader");
490 } while (!torture_must_stop());
491 torture_kthread_stopping("lock_torture_reader");
492 return 0;
493}
494
495/*
 239 * Create a lock-torture-statistics message in the specified buffer. 496
240 */ 497 */
241static void lock_torture_printk(char *page) 498static void __torture_print_stats(char *page,
499 struct lock_stress_stats *statp, bool write)
242{ 500{
243 bool fail = 0; 501 bool fail = 0;
244 int i; 502 int i, n_stress;
245 long max = 0; 503 long max = 0;
246 long min = lwsa[0].n_write_lock_acquired; 504 long min = statp[0].n_lock_acquired;
247 long long sum = 0; 505 long long sum = 0;
248 506
249 for (i = 0; i < nrealwriters_stress; i++) { 507 n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress;
250 if (lwsa[i].n_write_lock_fail) 508 for (i = 0; i < n_stress; i++) {
509 if (statp[i].n_lock_fail)
251 fail = true; 510 fail = true;
252 sum += lwsa[i].n_write_lock_acquired; 511 sum += statp[i].n_lock_acquired;
253 if (max < lwsa[i].n_write_lock_fail) 512 if (max < statp[i].n_lock_fail)
254 max = lwsa[i].n_write_lock_fail; 513 max = statp[i].n_lock_fail;
255 if (min > lwsa[i].n_write_lock_fail) 514 if (min > statp[i].n_lock_fail)
256 min = lwsa[i].n_write_lock_fail; 515 min = statp[i].n_lock_fail;
257 } 516 }
258 page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG);
259 page += sprintf(page, 517 page += sprintf(page,
260 "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", 518 "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
519 write ? "Writes" : "Reads ",
261 sum, max, min, max / 2 > min ? "???" : "", 520 sum, max, min, max / 2 > min ? "???" : "",
262 fail, fail ? "!!!" : ""); 521 fail, fail ? "!!!" : "");
263 if (fail) 522 if (fail)
264 atomic_inc(&n_lock_torture_errors); 523 atomic_inc(&cxt.n_lock_torture_errors);
265} 524}
266 525
267/* 526/*
@@ -274,18 +533,35 @@ static void lock_torture_printk(char *page)
274 */ 533 */
275static void lock_torture_stats_print(void) 534static void lock_torture_stats_print(void)
276{ 535{
277 int size = nrealwriters_stress * 200 + 8192; 536 int size = cxt.nrealwriters_stress * 200 + 8192;
278 char *buf; 537 char *buf;
279 538
539 if (cxt.cur_ops->readlock)
540 size += cxt.nrealreaders_stress * 200 + 8192;
541
280 buf = kmalloc(size, GFP_KERNEL); 542 buf = kmalloc(size, GFP_KERNEL);
281 if (!buf) { 543 if (!buf) {
282 pr_err("lock_torture_stats_print: Out of memory, need: %d", 544 pr_err("lock_torture_stats_print: Out of memory, need: %d",
283 size); 545 size);
284 return; 546 return;
285 } 547 }
286 lock_torture_printk(buf); 548
549 __torture_print_stats(buf, cxt.lwsa, true);
287 pr_alert("%s", buf); 550 pr_alert("%s", buf);
288 kfree(buf); 551 kfree(buf);
552
553 if (cxt.cur_ops->readlock) {
554 buf = kmalloc(size, GFP_KERNEL);
555 if (!buf) {
556 pr_err("lock_torture_stats_print: Out of memory, need: %d",
557 size);
558 return;
559 }
560
561 __torture_print_stats(buf, cxt.lrsa, false);
562 pr_alert("%s", buf);
563 kfree(buf);
564 }
289} 565}
290 566
291/* 567/*
@@ -312,9 +588,10 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
312 const char *tag) 588 const char *tag)
313{ 589{
314 pr_alert("%s" TORTURE_FLAG 590 pr_alert("%s" TORTURE_FLAG
315 "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", 591 "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
316 torture_type, tag, nrealwriters_stress, stat_interval, verbose, 592 torture_type, tag, cxt.debug_lock ? " [debug]": "",
317 shuffle_interval, stutter, shutdown_secs, 593 cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval,
594 verbose, shuffle_interval, stutter, shutdown_secs,
318 onoff_interval, onoff_holdoff); 595 onoff_interval, onoff_holdoff);
319} 596}
320 597
@@ -322,46 +599,59 @@ static void lock_torture_cleanup(void)
322{ 599{
323 int i; 600 int i;
324 601
325 if (torture_cleanup()) 602 if (torture_cleanup_begin())
326 return; 603 return;
327 604
328 if (writer_tasks) { 605 if (writer_tasks) {
329 for (i = 0; i < nrealwriters_stress; i++) 606 for (i = 0; i < cxt.nrealwriters_stress; i++)
330 torture_stop_kthread(lock_torture_writer, 607 torture_stop_kthread(lock_torture_writer,
331 writer_tasks[i]); 608 writer_tasks[i]);
332 kfree(writer_tasks); 609 kfree(writer_tasks);
333 writer_tasks = NULL; 610 writer_tasks = NULL;
334 } 611 }
335 612
613 if (reader_tasks) {
614 for (i = 0; i < cxt.nrealreaders_stress; i++)
615 torture_stop_kthread(lock_torture_reader,
616 reader_tasks[i]);
617 kfree(reader_tasks);
618 reader_tasks = NULL;
619 }
620
336 torture_stop_kthread(lock_torture_stats, stats_task); 621 torture_stop_kthread(lock_torture_stats, stats_task);
337 lock_torture_stats_print(); /* -After- the stats thread is stopped! */ 622 lock_torture_stats_print(); /* -After- the stats thread is stopped! */
338 623
339 if (atomic_read(&n_lock_torture_errors)) 624 if (atomic_read(&cxt.n_lock_torture_errors))
340 lock_torture_print_module_parms(cur_ops, 625 lock_torture_print_module_parms(cxt.cur_ops,
341 "End of test: FAILURE"); 626 "End of test: FAILURE");
342 else if (torture_onoff_failures()) 627 else if (torture_onoff_failures())
343 lock_torture_print_module_parms(cur_ops, 628 lock_torture_print_module_parms(cxt.cur_ops,
344 "End of test: LOCK_HOTPLUG"); 629 "End of test: LOCK_HOTPLUG");
345 else 630 else
346 lock_torture_print_module_parms(cur_ops, 631 lock_torture_print_module_parms(cxt.cur_ops,
347 "End of test: SUCCESS"); 632 "End of test: SUCCESS");
633 torture_cleanup_end();
348} 634}
349 635
350static int __init lock_torture_init(void) 636static int __init lock_torture_init(void)
351{ 637{
352 int i; 638 int i, j;
353 int firsterr = 0; 639 int firsterr = 0;
354 static struct lock_torture_ops *torture_ops[] = { 640 static struct lock_torture_ops *torture_ops[] = {
355 &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, 641 &lock_busted_ops,
642 &spin_lock_ops, &spin_lock_irq_ops,
643 &rw_lock_ops, &rw_lock_irq_ops,
644 &mutex_lock_ops,
645 &rwsem_lock_ops,
356 }; 646 };
357 647
358 if (!torture_init_begin(torture_type, verbose, &locktorture_runnable)) 648 if (!torture_init_begin(torture_type, verbose, &torture_runnable))
359 return -EBUSY; 649 return -EBUSY;
360 650
361 /* Process args and tell the world that the torturer is on the job. */ 651 /* Process args and tell the world that the torturer is on the job. */
362 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 652 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
363 cur_ops = torture_ops[i]; 653 cxt.cur_ops = torture_ops[i];
364 if (strcmp(torture_type, cur_ops->name) == 0) 654 if (strcmp(torture_type, cxt.cur_ops->name) == 0)
365 break; 655 break;
366 } 656 }
367 if (i == ARRAY_SIZE(torture_ops)) { 657 if (i == ARRAY_SIZE(torture_ops)) {
@@ -374,31 +664,69 @@ static int __init lock_torture_init(void)
374 torture_init_end(); 664 torture_init_end();
375 return -EINVAL; 665 return -EINVAL;
376 } 666 }
377 if (cur_ops->init) 667 if (cxt.cur_ops->init)
378 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 668 cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */
379 669
380 if (nwriters_stress >= 0) 670 if (nwriters_stress >= 0)
381 nrealwriters_stress = nwriters_stress; 671 cxt.nrealwriters_stress = nwriters_stress;
382 else 672 else
383 nrealwriters_stress = 2 * num_online_cpus(); 673 cxt.nrealwriters_stress = 2 * num_online_cpus();
384 lock_torture_print_module_parms(cur_ops, "Start of test"); 674
675#ifdef CONFIG_DEBUG_MUTEXES
676 if (strncmp(torture_type, "mutex", 5) == 0)
677 cxt.debug_lock = true;
678#endif
679#ifdef CONFIG_DEBUG_SPINLOCK
680 if ((strncmp(torture_type, "spin", 4) == 0) ||
681 (strncmp(torture_type, "rw_lock", 7) == 0))
682 cxt.debug_lock = true;
683#endif
385 684
386 /* Initialize the statistics so that each run gets its own numbers. */ 685 /* Initialize the statistics so that each run gets its own numbers. */
387 686
388 lock_is_write_held = 0; 687 lock_is_write_held = 0;
389 lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL); 688 cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL);
390 if (lwsa == NULL) { 689 if (cxt.lwsa == NULL) {
391 VERBOSE_TOROUT_STRING("lwsa: Out of memory"); 690 VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory");
392 firsterr = -ENOMEM; 691 firsterr = -ENOMEM;
393 goto unwind; 692 goto unwind;
394 } 693 }
395 for (i = 0; i < nrealwriters_stress; i++) { 694 for (i = 0; i < cxt.nrealwriters_stress; i++) {
396 lwsa[i].n_write_lock_fail = 0; 695 cxt.lwsa[i].n_lock_fail = 0;
397 lwsa[i].n_write_lock_acquired = 0; 696 cxt.lwsa[i].n_lock_acquired = 0;
398 } 697 }
399 698
400 /* Start up the kthreads. */ 699 if (cxt.cur_ops->readlock) {
700 if (nreaders_stress >= 0)
701 cxt.nrealreaders_stress = nreaders_stress;
702 else {
703 /*
704 * By default distribute evenly the number of
705 * readers and writers. We still run the same number
706 * of threads as the writer-only locks default.
707 */
708 if (nwriters_stress < 0) /* user doesn't care */
709 cxt.nrealwriters_stress = num_online_cpus();
710 cxt.nrealreaders_stress = cxt.nrealwriters_stress;
711 }
712
713 lock_is_read_held = 0;
714 cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL);
715 if (cxt.lrsa == NULL) {
716 VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory");
717 firsterr = -ENOMEM;
718 kfree(cxt.lwsa);
719 goto unwind;
720 }
721
722 for (i = 0; i < cxt.nrealreaders_stress; i++) {
723 cxt.lrsa[i].n_lock_fail = 0;
724 cxt.lrsa[i].n_lock_acquired = 0;
725 }
726 }
727 lock_torture_print_module_parms(cxt.cur_ops, "Start of test");
401 728
729 /* Prepare torture context. */
402 if (onoff_interval > 0) { 730 if (onoff_interval > 0) {
403 firsterr = torture_onoff_init(onoff_holdoff * HZ, 731 firsterr = torture_onoff_init(onoff_holdoff * HZ,
404 onoff_interval * HZ); 732 onoff_interval * HZ);
@@ -422,18 +750,51 @@ static int __init lock_torture_init(void)
422 goto unwind; 750 goto unwind;
423 } 751 }
424 752
425 writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]), 753 writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]),
426 GFP_KERNEL); 754 GFP_KERNEL);
427 if (writer_tasks == NULL) { 755 if (writer_tasks == NULL) {
428 VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); 756 VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
429 firsterr = -ENOMEM; 757 firsterr = -ENOMEM;
430 goto unwind; 758 goto unwind;
431 } 759 }
432 for (i = 0; i < nrealwriters_stress; i++) { 760
433 firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i], 761 if (cxt.cur_ops->readlock) {
762 reader_tasks = kzalloc(cxt.nrealreaders_stress * sizeof(reader_tasks[0]),
763 GFP_KERNEL);
764 if (reader_tasks == NULL) {
765 VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
766 firsterr = -ENOMEM;
767 goto unwind;
768 }
769 }
770
771 /*
772 * Create the kthreads and start torturing (oh, those poor little locks).
773 *
774 * TODO: Note that we interleave writers with readers, giving writers a
775 * slight advantage, by creating their kthreads first. This can be modified
776 * for very specific needs, or even let the user choose the policy, if
777 * ever wanted.
778 */
779 for (i = 0, j = 0; i < cxt.nrealwriters_stress ||
780 j < cxt.nrealreaders_stress; i++, j++) {
781 if (i >= cxt.nrealwriters_stress)
782 goto create_reader;
783
784 /* Create writer. */
785 firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i],
434 writer_tasks[i]); 786 writer_tasks[i]);
435 if (firsterr) 787 if (firsterr)
436 goto unwind; 788 goto unwind;
789
790 create_reader:
791 if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress))
792 continue;
793 /* Create reader. */
794 firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j],
795 reader_tasks[j]);
796 if (firsterr)
797 goto unwind;
437 } 798 }
438 if (stat_interval > 0) { 799 if (stat_interval > 0) {
439 firsterr = torture_create_kthread(lock_torture_stats, NULL, 800 firsterr = torture_create_kthread(lock_torture_stats, NULL,
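The loop above interleaves writer and reader kthread creation, giving writers a slight head start by creating each writer just before the matching reader. A rough userspace analogue of that round-robin policy, using POSIX threads instead of the kernel's torture_create_kthread() (writer_fn/reader_fn are placeholder thread bodies):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static void *writer_fn(void *arg) { printf("writer %ld\n", (long)arg); return NULL; }
static void *reader_fn(void *arg) { printf("reader %ld\n", (long)arg); return NULL; }

int main(void)
{
	long nwriters = 2, nreaders = 4, i, j;
	pthread_t *w = calloc(nwriters, sizeof(*w));
	pthread_t *r = calloc(nreaders, sizeof(*r));

	if (!w || !r)
		return 1;
	/* Interleave creation, writer first, mirroring the kernel loop. */
	for (i = 0, j = 0; i < nwriters || j < nreaders; i++, j++) {
		if (i < nwriters)
			pthread_create(&w[i], NULL, writer_fn, (void *)i);
		if (j < nreaders)
			pthread_create(&r[j], NULL, reader_fn, (void *)j);
	}
	for (i = 0; i < nwriters; i++)
		pthread_join(w[i], NULL);
	for (j = 0; j < nreaders; j++)
		pthread_join(r[j], NULL);
	free(w);
	free(r);
	return 0;
}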
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 23e89c5930e9..4d60986fcbee 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -56,9 +56,6 @@ do { \
56 * If the lock has already been acquired, then this will proceed to spin 56 * If the lock has already been acquired, then this will proceed to spin
57 * on this node->locked until the previous lock holder sets the node->locked 57 * on this node->locked until the previous lock holder sets the node->locked
58 * in mcs_spin_unlock(). 58 * in mcs_spin_unlock().
59 *
60 * We don't inline mcs_spin_lock() so that perf can correctly account for the
61 * time spent in this lock function.
62 */ 59 */
63static inline 60static inline
64void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) 61void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index ae712b25e492..dadbf88c22c4 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -15,7 +15,7 @@
15 * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale 15 * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
16 * and Sven Dietrich. 16 * and Sven Dietrich.
17 * 17 *
18 * Also see Documentation/mutex-design.txt. 18 * Also see Documentation/locking/mutex-design.txt.
19 */ 19 */
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/ww_mutex.h> 21#include <linux/ww_mutex.h>
@@ -106,6 +106,92 @@ void __sched mutex_lock(struct mutex *lock)
106EXPORT_SYMBOL(mutex_lock); 106EXPORT_SYMBOL(mutex_lock);
107#endif 107#endif
108 108
109static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
110 struct ww_acquire_ctx *ww_ctx)
111{
112#ifdef CONFIG_DEBUG_MUTEXES
113 /*
114 * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
115 * but released with a normal mutex_unlock in this call.
116 *
117 * This should never happen, always use ww_mutex_unlock.
118 */
119 DEBUG_LOCKS_WARN_ON(ww->ctx);
120
121 /*
122 * Not quite done after calling ww_acquire_done() ?
123 */
124 DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
125
126 if (ww_ctx->contending_lock) {
127 /*
128 * After -EDEADLK you tried to
129 * acquire a different ww_mutex? Bad!
130 */
131 DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
132
133 /*
134 * You called ww_mutex_lock after receiving -EDEADLK,
135 * but 'forgot' to unlock everything else first?
136 */
137 DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
138 ww_ctx->contending_lock = NULL;
139 }
140
141 /*
142 * Naughty, using a different class will lead to undefined behavior!
143 */
144 DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
145#endif
146 ww_ctx->acquired++;
147}
148
149/*
150 * after acquiring lock with fastpath or when we lost out in contested
151 * slowpath, set ctx and wake up any waiters so they can recheck.
152 *
153 * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
154 * as the fastpath and opportunistic spinning are disabled in that case.
155 */
156static __always_inline void
157ww_mutex_set_context_fastpath(struct ww_mutex *lock,
158 struct ww_acquire_ctx *ctx)
159{
160 unsigned long flags;
161 struct mutex_waiter *cur;
162
163 ww_mutex_lock_acquired(lock, ctx);
164
165 lock->ctx = ctx;
166
167 /*
168 * The lock->ctx update should be visible on all cores before
169 * the atomic read is done, otherwise contended waiters might be
170 * missed. The contended waiters will either see ww_ctx == NULL
171 * and keep spinning, or it will acquire wait_lock, add itself
172 * to waiter list and sleep.
173 */
174 smp_mb(); /* ^^^ */
175
176 /*
177 * Check if lock is contended, if not there is nobody to wake up
178 */
179 if (likely(atomic_read(&lock->base.count) == 0))
180 return;
181
182 /*
183 * Uh oh, we raced in fastpath, wake up everyone in this case,
184 * so they can see the new lock->ctx.
185 */
186 spin_lock_mutex(&lock->base.wait_lock, flags);
187 list_for_each_entry(cur, &lock->base.wait_list, list) {
188 debug_mutex_wake_waiter(&lock->base, cur);
189 wake_up_process(cur->task);
190 }
191 spin_unlock_mutex(&lock->base.wait_lock, flags);
192}
193
194
109#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 195#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
110/* 196/*
111 * In order to avoid a stampede of mutex spinners from acquiring the mutex 197 * In order to avoid a stampede of mutex spinners from acquiring the mutex
@@ -180,6 +266,129 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
180 */ 266 */
181 return retval; 267 return retval;
182} 268}
269
270/*
271 * Atomically try to take the lock when it is available
272 */
273static inline bool mutex_try_to_acquire(struct mutex *lock)
274{
275 return !mutex_is_locked(lock) &&
276 (atomic_cmpxchg(&lock->count, 1, 0) == 1);
277}
278
279/*
280 * Optimistic spinning.
281 *
282 * We try to spin for acquisition when we find that the lock owner
283 * is currently running on a (different) CPU and while we don't
284 * need to reschedule. The rationale is that if the lock owner is
285 * running, it is likely to release the lock soon.
286 *
287 * Since this needs the lock owner, and this mutex implementation
288 * doesn't track the owner atomically in the lock field, we need to
289 * track it non-atomically.
290 *
291 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
292 * to serialize everything.
293 *
294 * The mutex spinners are queued up using MCS lock so that only one
295 * spinner can compete for the mutex. However, if mutex spinning isn't
296 * going to happen, there is no point in going through the lock/unlock
297 * overhead.
298 *
299 * Returns true when the lock was taken, otherwise false, indicating
300 * that we need to jump to the slowpath and sleep.
301 */
302static bool mutex_optimistic_spin(struct mutex *lock,
303 struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
304{
305 struct task_struct *task = current;
306
307 if (!mutex_can_spin_on_owner(lock))
308 goto done;
309
310 if (!osq_lock(&lock->osq))
311 goto done;
312
313 while (true) {
314 struct task_struct *owner;
315
316 if (use_ww_ctx && ww_ctx->acquired > 0) {
317 struct ww_mutex *ww;
318
319 ww = container_of(lock, struct ww_mutex, base);
320 /*
321 * If ww->ctx is set the contents are undefined, only
322 * by acquiring wait_lock there is a guarantee that
323 * they are not invalid when reading.
324 *
325 * As such, when deadlock detection needs to be
326 * performed the optimistic spinning cannot be done.
327 */
328 if (ACCESS_ONCE(ww->ctx))
329 break;
330 }
331
332 /*
333 * If there's an owner, wait for it to either
334 * release the lock or go to sleep.
335 */
336 owner = ACCESS_ONCE(lock->owner);
337 if (owner && !mutex_spin_on_owner(lock, owner))
338 break;
339
340 /* Try to acquire the mutex if it is unlocked. */
341 if (mutex_try_to_acquire(lock)) {
342 lock_acquired(&lock->dep_map, ip);
343
344 if (use_ww_ctx) {
345 struct ww_mutex *ww;
346 ww = container_of(lock, struct ww_mutex, base);
347
348 ww_mutex_set_context_fastpath(ww, ww_ctx);
349 }
350
351 mutex_set_owner(lock);
352 osq_unlock(&lock->osq);
353 return true;
354 }
355
356 /*
357 * When there's no owner, we might have preempted between the
358 * owner acquiring the lock and setting the owner field. If
359 * we're an RT task that will live-lock because we won't let
360 * the owner complete.
361 */
362 if (!owner && (need_resched() || rt_task(task)))
363 break;
364
365 /*
366 * The cpu_relax() call is a compiler barrier which forces
367 * everything in this loop to be re-loaded. We don't need
368 * memory barriers as we'll eventually observe the right
369 * values at the cost of a few extra spins.
370 */
371 cpu_relax_lowlatency();
372 }
373
374 osq_unlock(&lock->osq);
375done:
376 /*
377 * If we fell out of the spin path because of need_resched(),
378 * reschedule now, before we try-lock the mutex. This avoids getting
379 * scheduled out right after we obtained the mutex.
380 */
381 if (need_resched())
382 schedule_preempt_disabled();
383
384 return false;
385}
386#else
387static bool mutex_optimistic_spin(struct mutex *lock,
388 struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
389{
390 return false;
391}
183#endif 392#endif
184 393
185__visible __used noinline 394__visible __used noinline
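mutex_optimistic_spin() above keeps polling while the lock owner appears to be running, attempts an atomic 1->0 transition only when the count reads unlocked, and otherwise falls back to the sleeping slowpath. A stripped-down userspace sketch of that acquire attempt in C11 atomics, with a plain spin budget standing in for the owner-is-running and need_resched() checks (and no MCS queueing of spinners):

#include <stdatomic.h>
#include <stdbool.h>

/* 1 == unlocked, 0 == locked, matching the mutex count convention above. */
static bool try_to_acquire(atomic_int *count)
{
	int expected = 1;

	return atomic_load(count) == 1 &&
	       atomic_compare_exchange_strong(count, &expected, 0);
}

bool optimistic_spin(atomic_int *count, int max_spins)
{
	for (int i = 0; i < max_spins; i++) {
		if (try_to_acquire(count))
			return true;	/* got the lock; skip the sleep path */
	}
	return false;			/* give up and take the slowpath */
}

Reading the value before the compare-and-swap keeps the contended case on a cheap load rather than an expensive atomic, which is the same reasoning mutex_try_to_acquire() applies.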
@@ -277,91 +486,6 @@ __mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
277 return 0; 486 return 0;
278} 487}
279 488
280static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
281 struct ww_acquire_ctx *ww_ctx)
282{
283#ifdef CONFIG_DEBUG_MUTEXES
284 /*
285 * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
286 * but released with a normal mutex_unlock in this call.
287 *
288 * This should never happen, always use ww_mutex_unlock.
289 */
290 DEBUG_LOCKS_WARN_ON(ww->ctx);
291
292 /*
293 * Not quite done after calling ww_acquire_done() ?
294 */
295 DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
296
297 if (ww_ctx->contending_lock) {
298 /*
299 * After -EDEADLK you tried to
300 * acquire a different ww_mutex? Bad!
301 */
302 DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
303
304 /*
305 * You called ww_mutex_lock after receiving -EDEADLK,
306 * but 'forgot' to unlock everything else first?
307 */
308 DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
309 ww_ctx->contending_lock = NULL;
310 }
311
312 /*
313 * Naughty, using a different class will lead to undefined behavior!
314 */
315 DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
316#endif
317 ww_ctx->acquired++;
318}
319
320/*
321 * after acquiring lock with fastpath or when we lost out in contested
322 * slowpath, set ctx and wake up any waiters so they can recheck.
323 *
324 * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
325 * as the fastpath and opportunistic spinning are disabled in that case.
326 */
327static __always_inline void
328ww_mutex_set_context_fastpath(struct ww_mutex *lock,
329 struct ww_acquire_ctx *ctx)
330{
331 unsigned long flags;
332 struct mutex_waiter *cur;
333
334 ww_mutex_lock_acquired(lock, ctx);
335
336 lock->ctx = ctx;
337
338 /*
339 * The lock->ctx update should be visible on all cores before
340 * the atomic read is done, otherwise contended waiters might be
341 * missed. The contended waiters will either see ww_ctx == NULL
342 * and keep spinning, or it will acquire wait_lock, add itself
343 * to waiter list and sleep.
344 */
345 smp_mb(); /* ^^^ */
346
347 /*
348 * Check if lock is contended, if not there is nobody to wake up
349 */
350 if (likely(atomic_read(&lock->base.count) == 0))
351 return;
352
353 /*
354 * Uh oh, we raced in fastpath, wake up everyone in this case,
355 * so they can see the new lock->ctx.
356 */
357 spin_lock_mutex(&lock->base.wait_lock, flags);
358 list_for_each_entry(cur, &lock->base.wait_list, list) {
359 debug_mutex_wake_waiter(&lock->base, cur);
360 wake_up_process(cur->task);
361 }
362 spin_unlock_mutex(&lock->base.wait_lock, flags);
363}
364
365/* 489/*
366 * Lock a mutex (possibly interruptible), slowpath: 490 * Lock a mutex (possibly interruptible), slowpath:
367 */ 491 */
@@ -378,104 +502,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
378 preempt_disable(); 502 preempt_disable();
379 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); 503 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
380 504
381#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 505 if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) {
382 /* 506 /* got the lock, yay! */
383 * Optimistic spinning. 507 preempt_enable();
384 * 508 return 0;
385 * We try to spin for acquisition when we find that the lock owner
386 * is currently running on a (different) CPU and while we don't
387 * need to reschedule. The rationale is that if the lock owner is
388 * running, it is likely to release the lock soon.
389 *
390 * Since this needs the lock owner, and this mutex implementation
391 * doesn't track the owner atomically in the lock field, we need to
392 * track it non-atomically.
393 *
394 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
395 * to serialize everything.
396 *
397 * The mutex spinners are queued up using MCS lock so that only one
398 * spinner can compete for the mutex. However, if mutex spinning isn't
399 * going to happen, there is no point in going through the lock/unlock
400 * overhead.
401 */
402 if (!mutex_can_spin_on_owner(lock))
403 goto slowpath;
404
405 if (!osq_lock(&lock->osq))
406 goto slowpath;
407
408 for (;;) {
409 struct task_struct *owner;
410
411 if (use_ww_ctx && ww_ctx->acquired > 0) {
412 struct ww_mutex *ww;
413
414 ww = container_of(lock, struct ww_mutex, base);
415 /*
416 * If ww->ctx is set the contents are undefined, only
417 * by acquiring wait_lock there is a guarantee that
418 * they are not invalid when reading.
419 *
420 * As such, when deadlock detection needs to be
421 * performed the optimistic spinning cannot be done.
422 */
423 if (ACCESS_ONCE(ww->ctx))
424 break;
425 }
426
427 /*
428 * If there's an owner, wait for it to either
429 * release the lock or go to sleep.
430 */
431 owner = ACCESS_ONCE(lock->owner);
432 if (owner && !mutex_spin_on_owner(lock, owner))
433 break;
434
435 /* Try to acquire the mutex if it is unlocked. */
436 if (!mutex_is_locked(lock) &&
437 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
438 lock_acquired(&lock->dep_map, ip);
439 if (use_ww_ctx) {
440 struct ww_mutex *ww;
441 ww = container_of(lock, struct ww_mutex, base);
442
443 ww_mutex_set_context_fastpath(ww, ww_ctx);
444 }
445
446 mutex_set_owner(lock);
447 osq_unlock(&lock->osq);
448 preempt_enable();
449 return 0;
450 }
451
452 /*
453 * When there's no owner, we might have preempted between the
454 * owner acquiring the lock and setting the owner field. If
455 * we're an RT task that will live-lock because we won't let
456 * the owner complete.
457 */
458 if (!owner && (need_resched() || rt_task(task)))
459 break;
460
461 /*
462 * The cpu_relax() call is a compiler barrier which forces
463 * everything in this loop to be re-loaded. We don't need
464 * memory barriers as we'll eventually observe the right
465 * values at the cost of a few extra spins.
466 */
467 cpu_relax_lowlatency();
468 } 509 }
469 osq_unlock(&lock->osq); 510
470slowpath:
471 /*
472 * If we fell out of the spin path because of need_resched(),
473 * reschedule now, before we try-lock the mutex. This avoids getting
474 * scheduled out right after we obtained the mutex.
475 */
476 if (need_resched())
477 schedule_preempt_disabled();
478#endif
479 spin_lock_mutex(&lock->wait_lock, flags); 511 spin_lock_mutex(&lock->wait_lock, flags);
480 512
481 /* 513 /*
@@ -679,15 +711,21 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
679 * Release the lock, slowpath: 711 * Release the lock, slowpath:
680 */ 712 */
681static inline void 713static inline void
682__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) 714__mutex_unlock_common_slowpath(struct mutex *lock, int nested)
683{ 715{
684 struct mutex *lock = container_of(lock_count, struct mutex, count);
685 unsigned long flags; 716 unsigned long flags;
686 717
687 /* 718 /*
688 * some architectures leave the lock unlocked in the fastpath failure 719 * As a performance measurement, release the lock before doing other
720 * wakeup related duties to follow. This allows other tasks to acquire
721 * the lock sooner, while still handling cleanups in past unlock calls.
722 * This can be done as we do not enforce strict equivalence between the
723 * mutex counter and wait_list.
724 *
725 *
726 * Some architectures leave the lock unlocked in the fastpath failure
689 * case, others need to leave it locked. In the latter case we have to 727 * case, others need to leave it locked. In the latter case we have to
690 * unlock it here 728 * unlock it here - as the lock counter is currently 0 or negative.
691 */ 729 */
692 if (__mutex_slowpath_needs_to_unlock()) 730 if (__mutex_slowpath_needs_to_unlock())
693 atomic_set(&lock->count, 1); 731 atomic_set(&lock->count, 1);
@@ -716,7 +754,9 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
716__visible void 754__visible void
717__mutex_unlock_slowpath(atomic_t *lock_count) 755__mutex_unlock_slowpath(atomic_t *lock_count)
718{ 756{
719 __mutex_unlock_common_slowpath(lock_count, 1); 757 struct mutex *lock = container_of(lock_count, struct mutex, count);
758
759 __mutex_unlock_common_slowpath(lock, 1);
720} 760}
721 761
722#ifndef CONFIG_DEBUG_LOCK_ALLOC 762#ifndef CONFIG_DEBUG_LOCK_ALLOC
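The reworked unlock slowpath above publishes the lock as free (count back to 1) before it starts the wakeup housekeeping, so another task can take the mutex while waiters are still being woken. A minimal userspace sketch of that ordering only, with a pthread mutex/condvar pair standing in for wait_lock and the waiter list (struct simple_mutex is a made-up type, not the kernel's):

#include <pthread.h>
#include <stdatomic.h>

struct simple_mutex {
	atomic_int count;		/* 1 unlocked, 0 locked, <0 contended */
	pthread_mutex_t wait_lock;	/* protects the waiter bookkeeping */
	pthread_cond_t waiters;
};

void unlock_slowpath(struct simple_mutex *m)
{
	/* Make the lock available first, like atomic_set(&lock->count, 1). */
	atomic_store(&m->count, 1);

	/* Only then do the wakeup work under the wait lock. */
	pthread_mutex_lock(&m->wait_lock);
	pthread_cond_signal(&m->waiters);	/* wake one waiter to retry */
	pthread_mutex_unlock(&m->wait_lock);
}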
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 4115fbf83b12..5cda397607f2 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -16,7 +16,7 @@
16#define mutex_remove_waiter(lock, waiter, ti) \ 16#define mutex_remove_waiter(lock, waiter, ti) \
17 __list_del((waiter)->list.prev, (waiter)->list.next) 17 __list_del((waiter)->list.prev, (waiter)->list.next)
18 18
19#ifdef CONFIG_SMP 19#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
20static inline void mutex_set_owner(struct mutex *lock) 20static inline void mutex_set_owner(struct mutex *lock)
21{ 21{
22 lock->owner = current; 22 lock->owner = current;
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index a0ea2a141b3b..7c98873a3077 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -8,7 +8,7 @@
8 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt 8 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
9 * Copyright (C) 2006 Esben Nielsen 9 * Copyright (C) 2006 Esben Nielsen
10 * 10 *
11 * See Documentation/rt-mutex-design.txt for details. 11 * See Documentation/locking/rt-mutex-design.txt for details.
12 */ 12 */
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/export.h> 14#include <linux/export.h>
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index d6203faf2eb1..7628c3fc37ca 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -246,19 +246,22 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
246 246
247 return sem; 247 return sem;
248} 248}
249EXPORT_SYMBOL(rwsem_down_read_failed);
249 250
250static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) 251static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
251{ 252{
252 if (!(count & RWSEM_ACTIVE_MASK)) { 253 /*
253 /* try acquiring the write lock */ 254 * Try acquiring the write lock. Check count first in order
254 if (sem->count == RWSEM_WAITING_BIAS && 255 * to reduce unnecessary expensive cmpxchg() operations.
255 cmpxchg(&sem->count, RWSEM_WAITING_BIAS, 256 */
256 RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { 257 if (count == RWSEM_WAITING_BIAS &&
257 if (!list_is_singular(&sem->wait_list)) 258 cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
258 rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); 259 RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
259 return true; 260 if (!list_is_singular(&sem->wait_list))
260 } 261 rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
262 return true;
261 } 263 }
264
262 return false; 265 return false;
263} 266}
264 267
@@ -465,6 +468,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
465 468
466 return sem; 469 return sem;
467} 470}
471EXPORT_SYMBOL(rwsem_down_write_failed);
468 472
469/* 473/*
470 * handle waking up a waiter on the semaphore 474 * handle waking up a waiter on the semaphore
@@ -485,6 +489,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
485 489
486 return sem; 490 return sem;
487} 491}
492EXPORT_SYMBOL(rwsem_wake);
488 493
489/* 494/*
490 * downgrade a write lock into a read lock 495 * downgrade a write lock into a read lock
@@ -506,8 +511,4 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
506 511
507 return sem; 512 return sem;
508} 513}
509
510EXPORT_SYMBOL(rwsem_down_read_failed);
511EXPORT_SYMBOL(rwsem_down_write_failed);
512EXPORT_SYMBOL(rwsem_wake);
513EXPORT_SYMBOL(rwsem_downgrade_wake); 514EXPORT_SYMBOL(rwsem_downgrade_wake);
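rwsem_try_write_lock() above now checks the already-sampled count before issuing the cmpxchg, the usual test-and-test-and-set trick: skip the expensive atomic when it cannot possibly succeed. A hedged C11 sketch of that shape, with the bias values passed in rather than taken from the kernel headers:

#include <stdatomic.h>
#include <stdbool.h>

bool try_write_lock(atomic_long *sem_count, long count,
		    long waiting_bias, long active_write_bias)
{
	long expected = waiting_bias;

	/* Cheap check on the value we already read; bail before the cmpxchg. */
	if (count != waiting_bias)
		return false;

	return atomic_compare_exchange_strong(sem_count, &expected,
					      active_write_bias);
}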
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 6815171a4fff..b8120abe594b 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -36,7 +36,7 @@
36static noinline void __down(struct semaphore *sem); 36static noinline void __down(struct semaphore *sem);
37static noinline int __down_interruptible(struct semaphore *sem); 37static noinline int __down_interruptible(struct semaphore *sem);
38static noinline int __down_killable(struct semaphore *sem); 38static noinline int __down_killable(struct semaphore *sem);
39static noinline int __down_timeout(struct semaphore *sem, long jiffies); 39static noinline int __down_timeout(struct semaphore *sem, long timeout);
40static noinline void __up(struct semaphore *sem); 40static noinline void __up(struct semaphore *sem);
41 41
42/** 42/**
@@ -145,14 +145,14 @@ EXPORT_SYMBOL(down_trylock);
145/** 145/**
146 * down_timeout - acquire the semaphore within a specified time 146 * down_timeout - acquire the semaphore within a specified time
147 * @sem: the semaphore to be acquired 147 * @sem: the semaphore to be acquired
148 * @jiffies: how long to wait before failing 148 * @timeout: how long to wait before failing
149 * 149 *
150 * Attempts to acquire the semaphore. If no more tasks are allowed to 150 * Attempts to acquire the semaphore. If no more tasks are allowed to
151 * acquire the semaphore, calling this function will put the task to sleep. 151 * acquire the semaphore, calling this function will put the task to sleep.
152 * If the semaphore is not released within the specified number of jiffies, 152 * If the semaphore is not released within the specified number of jiffies,
153 * this function returns -ETIME. It returns 0 if the semaphore was acquired. 153 * this function returns -ETIME. It returns 0 if the semaphore was acquired.
154 */ 154 */
155int down_timeout(struct semaphore *sem, long jiffies) 155int down_timeout(struct semaphore *sem, long timeout)
156{ 156{
157 unsigned long flags; 157 unsigned long flags;
158 int result = 0; 158 int result = 0;
@@ -161,7 +161,7 @@ int down_timeout(struct semaphore *sem, long jiffies)
161 if (likely(sem->count > 0)) 161 if (likely(sem->count > 0))
162 sem->count--; 162 sem->count--;
163 else 163 else
164 result = __down_timeout(sem, jiffies); 164 result = __down_timeout(sem, timeout);
165 raw_spin_unlock_irqrestore(&sem->lock, flags); 165 raw_spin_unlock_irqrestore(&sem->lock, flags);
166 166
167 return result; 167 return result;
@@ -248,9 +248,9 @@ static noinline int __sched __down_killable(struct semaphore *sem)
248 return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); 248 return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
249} 249}
250 250
251static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) 251static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
252{ 252{
253 return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); 253 return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
254} 254}
255 255
256static noinline void __sched __up(struct semaphore *sem) 256static noinline void __sched __up(struct semaphore *sem)
diff --git a/kernel/module.c b/kernel/module.c
index ae79ce615cb9..88cec1ddb1e3 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -135,7 +135,7 @@ static int param_set_bool_enable_only(const char *val,
135} 135}
136 136
137static const struct kernel_param_ops param_ops_bool_enable_only = { 137static const struct kernel_param_ops param_ops_bool_enable_only = {
138 .flags = KERNEL_PARAM_FL_NOARG, 138 .flags = KERNEL_PARAM_OPS_FL_NOARG,
139 .set = param_set_bool_enable_only, 139 .set = param_set_bool_enable_only,
140 .get = param_get_bool, 140 .get = param_get_bool,
141}; 141};
@@ -1842,7 +1842,9 @@ static void free_module(struct module *mod)
1842 1842
1843 /* We leave it in list to prevent duplicate loads, but make sure 1843 /* We leave it in list to prevent duplicate loads, but make sure
1844 * that no one uses it while it's being deconstructed. */ 1844 * that no one uses it while it's being deconstructed. */
1845 mutex_lock(&module_mutex);
1845 mod->state = MODULE_STATE_UNFORMED; 1846 mod->state = MODULE_STATE_UNFORMED;
1847 mutex_unlock(&module_mutex);
1846 1848
1847 /* Remove dynamic debug info */ 1849 /* Remove dynamic debug info */
1848 ddebug_remove_module(mod->name); 1850 ddebug_remove_module(mod->name);
@@ -3304,6 +3306,11 @@ static int load_module(struct load_info *info, const char __user *uargs,
3304 mutex_lock(&module_mutex); 3306 mutex_lock(&module_mutex);
3305 module_bug_cleanup(mod); 3307 module_bug_cleanup(mod);
3306 mutex_unlock(&module_mutex); 3308 mutex_unlock(&module_mutex);
3309
3310 /* we can't deallocate the module until we clear memory protection */
3311 unset_module_init_ro_nx(mod);
3312 unset_module_core_ro_nx(mod);
3313
3307 ddebug_cleanup: 3314 ddebug_cleanup:
3308 dynamic_debug_remove(info->debug); 3315 dynamic_debug_remove(info->debug);
3309 synchronize_sched(); 3316 synchronize_sched();
@@ -3381,7 +3388,9 @@ static inline int within(unsigned long addr, void *start, unsigned long size)
3381 */ 3388 */
3382static inline int is_arm_mapping_symbol(const char *str) 3389static inline int is_arm_mapping_symbol(const char *str)
3383{ 3390{
3384 return str[0] == '$' && strchr("atd", str[1]) 3391 if (str[0] == '.' && str[1] == 'L')
3392 return true;
3393 return str[0] == '$' && strchr("axtd", str[1])
3385 && (str[2] == '\0' || str[2] == '.'); 3394 && (str[2] == '\0' || str[2] == '.');
3386} 3395}
3387 3396
@@ -3444,8 +3453,7 @@ const char *module_address_lookup(unsigned long addr,
3444 list_for_each_entry_rcu(mod, &modules, list) { 3453 list_for_each_entry_rcu(mod, &modules, list) {
3445 if (mod->state == MODULE_STATE_UNFORMED) 3454 if (mod->state == MODULE_STATE_UNFORMED)
3446 continue; 3455 continue;
3447 if (within_module_init(addr, mod) || 3456 if (within_module(addr, mod)) {
3448 within_module_core(addr, mod)) {
3449 if (modname) 3457 if (modname)
3450 *modname = mod->name; 3458 *modname = mod->name;
3451 ret = get_ksymbol(mod, addr, size, offset); 3459 ret = get_ksymbol(mod, addr, size, offset);
@@ -3469,8 +3477,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
3469 list_for_each_entry_rcu(mod, &modules, list) { 3477 list_for_each_entry_rcu(mod, &modules, list) {
3470 if (mod->state == MODULE_STATE_UNFORMED) 3478 if (mod->state == MODULE_STATE_UNFORMED)
3471 continue; 3479 continue;
3472 if (within_module_init(addr, mod) || 3480 if (within_module(addr, mod)) {
3473 within_module_core(addr, mod)) {
3474 const char *sym; 3481 const char *sym;
3475 3482
3476 sym = get_ksymbol(mod, addr, NULL, NULL); 3483 sym = get_ksymbol(mod, addr, NULL, NULL);
@@ -3495,8 +3502,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
3495 list_for_each_entry_rcu(mod, &modules, list) { 3502 list_for_each_entry_rcu(mod, &modules, list) {
3496 if (mod->state == MODULE_STATE_UNFORMED) 3503 if (mod->state == MODULE_STATE_UNFORMED)
3497 continue; 3504 continue;
3498 if (within_module_init(addr, mod) || 3505 if (within_module(addr, mod)) {
3499 within_module_core(addr, mod)) {
3500 const char *sym; 3506 const char *sym;
3501 3507
3502 sym = get_ksymbol(mod, addr, size, offset); 3508 sym = get_ksymbol(mod, addr, size, offset);
@@ -3760,8 +3766,7 @@ struct module *__module_address(unsigned long addr)
3760 list_for_each_entry_rcu(mod, &modules, list) { 3766 list_for_each_entry_rcu(mod, &modules, list) {
3761 if (mod->state == MODULE_STATE_UNFORMED) 3767 if (mod->state == MODULE_STATE_UNFORMED)
3762 continue; 3768 continue;
3763 if (within_module_core(addr, mod) 3769 if (within_module(addr, mod))
3764 || within_module_init(addr, mod))
3765 return mod; 3770 return mod;
3766 } 3771 }
3767 return NULL; 3772 return NULL;
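Several lookups above collapse the within_module_init(addr, mod) || within_module_core(addr, mod) pair into a single within_module(addr, mod) call; presumably the helper is just that disjunction over the two address ranges. A standalone sketch with made-up region fields (not the kernel's struct module layout):

#include <stdbool.h>

struct region {
	unsigned long base;
	unsigned long size;
};

struct fake_module {
	struct region init;	/* stand-in for the module init mapping */
	struct region core;	/* stand-in for the module core mapping */
};

static bool in_region(unsigned long addr, const struct region *r)
{
	return addr >= r->base && addr - r->base < r->size;
}

bool within_module_sketch(unsigned long addr, const struct fake_module *mod)
{
	return in_region(addr, &mod->init) || in_region(addr, &mod->core);
}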
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 8e7811086b82..ef42d0ab3115 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
204 204
205 might_sleep(); 205 might_sleep();
206 206
207 task_lock(p);
207 ns = p->nsproxy; 208 ns = p->nsproxy;
209 p->nsproxy = new;
210 task_unlock(p);
208 211
209 rcu_assign_pointer(p->nsproxy, new); 212 if (ns && atomic_dec_and_test(&ns->count))
210
211 if (ns && atomic_dec_and_test(&ns->count)) {
212 /*
213 * wait for others to get what they want from this nsproxy.
214 *
215 * cannot release this nsproxy via the call_rcu() since
216 * put_mnt_ns() will want to sleep
217 */
218 synchronize_rcu();
219 free_nsproxy(ns); 213 free_nsproxy(ns);
220 }
221} 214}
222 215
223void exit_task_namespaces(struct task_struct *p) 216void exit_task_namespaces(struct task_struct *p)
diff --git a/kernel/panic.c b/kernel/panic.c
index 62e16cef9cc2..cf80672b7924 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -224,6 +224,7 @@ static const struct tnt tnts[] = {
224 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, 224 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
225 { TAINT_OOT_MODULE, 'O', ' ' }, 225 { TAINT_OOT_MODULE, 'O', ' ' },
226 { TAINT_UNSIGNED_MODULE, 'E', ' ' }, 226 { TAINT_UNSIGNED_MODULE, 'E', ' ' },
227 { TAINT_SOFTLOCKUP, 'L', ' ' },
227}; 228};
228 229
229/** 230/**
@@ -243,6 +244,7 @@ static const struct tnt tnts[] = {
243 * 'I' - Working around severe firmware bug. 244 * 'I' - Working around severe firmware bug.
244 * 'O' - Out-of-tree module has been loaded. 245 * 'O' - Out-of-tree module has been loaded.
245 * 'E' - Unsigned module has been loaded. 246 * 'E' - Unsigned module has been loaded.
247 * 'L' - A soft lockup has previously occurred.
246 * 248 *
247 * The string is overwritten by the next call to print_tainted(). 249 * The string is overwritten by the next call to print_tainted().
248 */ 250 */
diff --git a/kernel/params.c b/kernel/params.c
index 34f527023794..db97b791390f 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -19,6 +19,7 @@
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/errno.h> 20#include <linux/errno.h>
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/moduleparam.h>
22#include <linux/device.h> 23#include <linux/device.h>
23#include <linux/err.h> 24#include <linux/err.h>
24#include <linux/slab.h> 25#include <linux/slab.h>
@@ -83,6 +84,15 @@ bool parameq(const char *a, const char *b)
83 return parameqn(a, b, strlen(a)+1); 84 return parameqn(a, b, strlen(a)+1);
84} 85}
85 86
87static void param_check_unsafe(const struct kernel_param *kp)
88{
89 if (kp->flags & KERNEL_PARAM_FL_UNSAFE) {
90 pr_warn("Setting dangerous option %s - tainting kernel\n",
91 kp->name);
92 add_taint(TAINT_USER, LOCKDEP_STILL_OK);
93 }
94}
95
86static int parse_one(char *param, 96static int parse_one(char *param,
87 char *val, 97 char *val,
88 const char *doing, 98 const char *doing,
@@ -104,11 +114,12 @@ static int parse_one(char *param,
104 return 0; 114 return 0;
105 /* No one handled NULL, so do it here. */ 115 /* No one handled NULL, so do it here. */
106 if (!val && 116 if (!val &&
107 !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG)) 117 !(params[i].ops->flags & KERNEL_PARAM_OPS_FL_NOARG))
108 return -EINVAL; 118 return -EINVAL;
109 pr_debug("handling %s with %p\n", param, 119 pr_debug("handling %s with %p\n", param,
110 params[i].ops->set); 120 params[i].ops->set);
111 mutex_lock(&param_lock); 121 mutex_lock(&param_lock);
122 param_check_unsafe(&params[i]);
112 err = params[i].ops->set(val, &params[i]); 123 err = params[i].ops->set(val, &params[i]);
113 mutex_unlock(&param_lock); 124 mutex_unlock(&param_lock);
114 return err; 125 return err;
@@ -318,7 +329,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp)
318EXPORT_SYMBOL(param_get_bool); 329EXPORT_SYMBOL(param_get_bool);
319 330
320struct kernel_param_ops param_ops_bool = { 331struct kernel_param_ops param_ops_bool = {
321 .flags = KERNEL_PARAM_FL_NOARG, 332 .flags = KERNEL_PARAM_OPS_FL_NOARG,
322 .set = param_set_bool, 333 .set = param_set_bool,
323 .get = param_get_bool, 334 .get = param_get_bool,
324}; 335};
@@ -369,7 +380,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
369EXPORT_SYMBOL(param_set_bint); 380EXPORT_SYMBOL(param_set_bint);
370 381
371struct kernel_param_ops param_ops_bint = { 382struct kernel_param_ops param_ops_bint = {
372 .flags = KERNEL_PARAM_FL_NOARG, 383 .flags = KERNEL_PARAM_OPS_FL_NOARG,
373 .set = param_set_bint, 384 .set = param_set_bint,
374 .get = param_get_int, 385 .get = param_get_int,
375}; 386};
@@ -503,8 +514,6 @@ EXPORT_SYMBOL(param_ops_string);
503#define to_module_attr(n) container_of(n, struct module_attribute, attr) 514#define to_module_attr(n) container_of(n, struct module_attribute, attr)
504#define to_module_kobject(n) container_of(n, struct module_kobject, kobj) 515#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
505 516
506extern struct kernel_param __start___param[], __stop___param[];
507
508struct param_attribute 517struct param_attribute
509{ 518{
510 struct module_attribute mattr; 519 struct module_attribute mattr;
@@ -552,6 +561,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
552 return -EPERM; 561 return -EPERM;
553 562
554 mutex_lock(&param_lock); 563 mutex_lock(&param_lock);
564 param_check_unsafe(attribute->param);
555 err = attribute->param->ops->set(buf, attribute->param); 565 err = attribute->param->ops->set(buf, attribute->param);
556 mutex_unlock(&param_lock); 566 mutex_unlock(&param_lock);
557 if (!err) 567 if (!err)
@@ -763,7 +773,7 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
763} 773}
764 774
765static void __init kernel_add_sysfs_param(const char *name, 775static void __init kernel_add_sysfs_param(const char *name,
766 struct kernel_param *kparam, 776 const struct kernel_param *kparam,
767 unsigned int name_skip) 777 unsigned int name_skip)
768{ 778{
769 struct module_kobject *mk; 779 struct module_kobject *mk;
@@ -798,7 +808,7 @@ static void __init kernel_add_sysfs_param(const char *name,
798 */ 808 */
799static void __init param_sysfs_builtin(void) 809static void __init param_sysfs_builtin(void)
800{ 810{
801 struct kernel_param *kp; 811 const struct kernel_param *kp;
802 unsigned int name_len; 812 unsigned int name_len;
803 char modname[MODULE_NAME_LEN]; 813 char modname[MODULE_NAME_LEN];
804 814
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index e4e4121fa327..bbef57f5bdfd 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -302,6 +302,10 @@ config PM_GENERIC_DOMAINS_RUNTIME
302 def_bool y 302 def_bool y
303 depends on PM_RUNTIME && PM_GENERIC_DOMAINS 303 depends on PM_RUNTIME && PM_GENERIC_DOMAINS
304 304
305config PM_GENERIC_DOMAINS_OF
306 def_bool y
307 depends on PM_GENERIC_DOMAINS && OF
308
305config CPU_PM 309config CPU_PM
306 bool 310 bool
307 depends on SUSPEND || CPU_IDLE 311 depends on SUSPEND || CPU_IDLE
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a9dfa79b6bab..1f35a3478f3c 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -502,8 +502,14 @@ int hibernation_restore(int platform_mode)
502 error = dpm_suspend_start(PMSG_QUIESCE); 502 error = dpm_suspend_start(PMSG_QUIESCE);
503 if (!error) { 503 if (!error) {
504 error = resume_target_kernel(platform_mode); 504 error = resume_target_kernel(platform_mode);
505 dpm_resume_end(PMSG_RECOVER); 505 /*
506 * The above should either succeed and jump to the new kernel,
507 * or return with an error. Otherwise things are just
508 * undefined, so let's be paranoid.
509 */
510 BUG_ON(!error);
506 } 511 }
512 dpm_resume_end(PMSG_RECOVER);
507 pm_restore_gfp_mask(); 513 pm_restore_gfp_mask();
508 resume_console(); 514 resume_console();
509 pm_restore_console(); 515 pm_restore_console();
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 5d49dcac2537..2df883a9d3cb 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -179,6 +179,7 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
179 179
180#ifdef CONFIG_SUSPEND 180#ifdef CONFIG_SUSPEND
181/* kernel/power/suspend.c */ 181/* kernel/power/suspend.c */
182extern const char *pm_labels[];
182extern const char *pm_states[]; 183extern const char *pm_states[];
183 184
184extern int suspend_devices_and_enter(suspend_state_t state); 185extern int suspend_devices_and_enter(suspend_state_t state);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 4ee194eb524b..5a6ec8678b9a 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -46,13 +46,13 @@ static int try_to_freeze_tasks(bool user_only)
46 while (true) { 46 while (true) {
47 todo = 0; 47 todo = 0;
48 read_lock(&tasklist_lock); 48 read_lock(&tasklist_lock);
49 do_each_thread(g, p) { 49 for_each_process_thread(g, p) {
50 if (p == current || !freeze_task(p)) 50 if (p == current || !freeze_task(p))
51 continue; 51 continue;
52 52
53 if (!freezer_should_skip(p)) 53 if (!freezer_should_skip(p))
54 todo++; 54 todo++;
55 } while_each_thread(g, p); 55 }
56 read_unlock(&tasklist_lock); 56 read_unlock(&tasklist_lock);
57 57
58 if (!user_only) { 58 if (!user_only) {
@@ -93,11 +93,11 @@ static int try_to_freeze_tasks(bool user_only)
93 93
94 if (!wakeup) { 94 if (!wakeup) {
95 read_lock(&tasklist_lock); 95 read_lock(&tasklist_lock);
96 do_each_thread(g, p) { 96 for_each_process_thread(g, p) {
97 if (p != current && !freezer_should_skip(p) 97 if (p != current && !freezer_should_skip(p)
98 && freezing(p) && !frozen(p)) 98 && freezing(p) && !frozen(p))
99 sched_show_task(p); 99 sched_show_task(p);
100 } while_each_thread(g, p); 100 }
101 read_unlock(&tasklist_lock); 101 read_unlock(&tasklist_lock);
102 } 102 }
103 } else { 103 } else {
@@ -108,6 +108,30 @@ static int try_to_freeze_tasks(bool user_only)
108 return todo ? -EBUSY : 0; 108 return todo ? -EBUSY : 0;
109} 109}
110 110
111static bool __check_frozen_processes(void)
112{
113 struct task_struct *g, *p;
114
115 for_each_process_thread(g, p)
116 if (p != current && !freezer_should_skip(p) && !frozen(p))
117 return false;
118
119 return true;
120}
121
122/*
123 * Returns true if all freezable tasks (except for current) are frozen already
124 */
125static bool check_frozen_processes(void)
126{
127 bool ret;
128
129 read_lock(&tasklist_lock);
130 ret = __check_frozen_processes();
131 read_unlock(&tasklist_lock);
132 return ret;
133}
134
111/** 135/**
112 * freeze_processes - Signal user space processes to enter the refrigerator. 136 * freeze_processes - Signal user space processes to enter the refrigerator.
113 * The current thread will not be frozen. The same process that calls 137 * The current thread will not be frozen. The same process that calls
@@ -118,6 +142,7 @@ static int try_to_freeze_tasks(bool user_only)
118int freeze_processes(void) 142int freeze_processes(void)
119{ 143{
120 int error; 144 int error;
145 int oom_kills_saved;
121 146
122 error = __usermodehelper_disable(UMH_FREEZING); 147 error = __usermodehelper_disable(UMH_FREEZING);
123 if (error) 148 if (error)
@@ -129,13 +154,28 @@ int freeze_processes(void)
129 if (!pm_freezing) 154 if (!pm_freezing)
130 atomic_inc(&system_freezing_cnt); 155 atomic_inc(&system_freezing_cnt);
131 156
157 pm_wakeup_clear();
132 printk("Freezing user space processes ... "); 158 printk("Freezing user space processes ... ");
133 pm_freezing = true; 159 pm_freezing = true;
160 oom_kills_saved = oom_kills_count();
134 error = try_to_freeze_tasks(true); 161 error = try_to_freeze_tasks(true);
135 if (!error) { 162 if (!error) {
136 printk("done.");
137 __usermodehelper_set_disable_depth(UMH_DISABLED); 163 __usermodehelper_set_disable_depth(UMH_DISABLED);
138 oom_killer_disable(); 164 oom_killer_disable();
165
166 /*
167 * There might have been an OOM kill while we were
168 * freezing tasks and the killed task might be still
169 * on the way out so we have to double check for race.
170 */
171 if (oom_kills_count() != oom_kills_saved &&
172 !check_frozen_processes()) {
173 __usermodehelper_set_disable_depth(UMH_ENABLED);
174 printk("OOM in progress.");
175 error = -EBUSY;
176 } else {
177 printk("done.");
178 }
139 } 179 }
140 printk("\n"); 180 printk("\n");
141 BUG_ON(in_atomic()); 181 BUG_ON(in_atomic());
@@ -190,11 +230,11 @@ void thaw_processes(void)
190 thaw_workqueues(); 230 thaw_workqueues();
191 231
192 read_lock(&tasklist_lock); 232 read_lock(&tasklist_lock);
193 do_each_thread(g, p) { 233 for_each_process_thread(g, p) {
194 /* No other threads should have PF_SUSPEND_TASK set */ 234 /* No other threads should have PF_SUSPEND_TASK set */
195 WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); 235 WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK));
196 __thaw_task(p); 236 __thaw_task(p);
197 } while_each_thread(g, p); 237 }
198 read_unlock(&tasklist_lock); 238 read_unlock(&tasklist_lock);
199 239
200 WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); 240 WARN_ON(!(curr->flags & PF_SUSPEND_TASK));
@@ -217,10 +257,10 @@ void thaw_kernel_threads(void)
217 thaw_workqueues(); 257 thaw_workqueues();
218 258
219 read_lock(&tasklist_lock); 259 read_lock(&tasklist_lock);
220 do_each_thread(g, p) { 260 for_each_process_thread(g, p) {
221 if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) 261 if (p->flags & (PF_KTHREAD | PF_WQ_WORKER))
222 __thaw_task(p); 262 __thaw_task(p);
223 } while_each_thread(g, p); 263 }
224 read_unlock(&tasklist_lock); 264 read_unlock(&tasklist_lock);
225 265
226 schedule(); 266 schedule();
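freeze_processes() above records oom_kills_count() before freezing and, if the counter moved while tasks were being frozen, re-checks that everything freezable is actually frozen, returning -EBUSY when the race is detected. The distilled snapshot-then-revalidate pattern, with stub helpers (freeze_all() and all_frozen() are hypothetical stand-ins, not kernel functions):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int oom_kills;			/* hypothetical event counter */

static bool freeze_all(void) { return true; }	/* stub: freeze every task */
static bool all_frozen(void) { return true; }	/* stub: re-walk and verify */

int freeze_with_oom_check(void)
{
	int saved = atomic_load(&oom_kills);

	if (!freeze_all())
		return -1;

	/* The counter moved: something was killed mid-freeze, so re-validate. */
	if (atomic_load(&oom_kills) != saved && !all_frozen())
		return -16;			/* like -EBUSY: back out, retry */

	return 0;
}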
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 884b77058864..5f4c006c4b1e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -105,11 +105,27 @@ static struct pm_qos_object network_throughput_pm_qos = {
105}; 105};
106 106
107 107
108static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier);
109static struct pm_qos_constraints memory_bw_constraints = {
110 .list = PLIST_HEAD_INIT(memory_bw_constraints.list),
111 .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
112 .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
113 .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
114 .type = PM_QOS_SUM,
115 .notifiers = &memory_bandwidth_notifier,
116};
117static struct pm_qos_object memory_bandwidth_pm_qos = {
118 .constraints = &memory_bw_constraints,
119 .name = "memory_bandwidth",
120};
121
122
108static struct pm_qos_object *pm_qos_array[] = { 123static struct pm_qos_object *pm_qos_array[] = {
109 &null_pm_qos, 124 &null_pm_qos,
110 &cpu_dma_pm_qos, 125 &cpu_dma_pm_qos,
111 &network_lat_pm_qos, 126 &network_lat_pm_qos,
112 &network_throughput_pm_qos 127 &network_throughput_pm_qos,
128 &memory_bandwidth_pm_qos,
113}; 129};
114 130
115static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 131static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
@@ -130,6 +146,9 @@ static const struct file_operations pm_qos_power_fops = {
130/* unlocked internal variant */ 146/* unlocked internal variant */
131static inline int pm_qos_get_value(struct pm_qos_constraints *c) 147static inline int pm_qos_get_value(struct pm_qos_constraints *c)
132{ 148{
149 struct plist_node *node;
150 int total_value = 0;
151
133 if (plist_head_empty(&c->list)) 152 if (plist_head_empty(&c->list))
134 return c->no_constraint_value; 153 return c->no_constraint_value;
135 154
@@ -140,6 +159,12 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c)
140 case PM_QOS_MAX: 159 case PM_QOS_MAX:
141 return plist_last(&c->list)->prio; 160 return plist_last(&c->list)->prio;
142 161
162 case PM_QOS_SUM:
163 plist_for_each(node, &c->list)
164 total_value += node->prio;
165
166 return total_value;
167
143 default: 168 default:
144 /* runtime check for not using enum */ 169 /* runtime check for not using enum */
145 BUG(); 170 BUG();
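The new PM_QOS_SUM class aggregates the constraint list by adding every request, where the existing classes take the minimum or maximum element. Reduced to a plain array, the sum rule is just:

/* PM_QOS_SUM-style aggregation: the effective value is the total of all requests. */
int pm_qos_sum_sketch(const int *requests, int n, int no_constraint_value)
{
	int i, total = 0;

	if (n == 0)
		return no_constraint_value;	/* empty list: report the default */

	for (i = 0; i < n; i++)
		total += requests[i];

	return total;
}

That fits the memory-bandwidth use it is introduced for: concurrent bandwidth requests add up rather than compete for a single extreme value.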
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 4fc5c32422b3..791a61892bb5 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -954,6 +954,25 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
954 } 954 }
955} 955}
956 956
957static bool is_nosave_page(unsigned long pfn)
958{
959 struct nosave_region *region;
960
961 list_for_each_entry(region, &nosave_regions, list) {
962 if (pfn >= region->start_pfn && pfn < region->end_pfn) {
963 pr_err("PM: %#010llx in e820 nosave region: "
964 "[mem %#010llx-%#010llx]\n",
965 (unsigned long long) pfn << PAGE_SHIFT,
966 (unsigned long long) region->start_pfn << PAGE_SHIFT,
967 ((unsigned long long) region->end_pfn << PAGE_SHIFT)
968 - 1);
969 return true;
970 }
971 }
972
973 return false;
974}
975
957/** 976/**
958 * create_basic_memory_bitmaps - create bitmaps needed for marking page 977 * create_basic_memory_bitmaps - create bitmaps needed for marking page
959 * frames that should not be saved and free page frames. The pointers 978 * frames that should not be saved and free page frames. The pointers
@@ -1324,6 +1343,9 @@ void swsusp_free(void)
1324{ 1343{
1325 unsigned long fb_pfn, fr_pfn; 1344 unsigned long fb_pfn, fr_pfn;
1326 1345
1346 if (!forbidden_pages_map || !free_pages_map)
1347 goto out;
1348
1327 memory_bm_position_reset(forbidden_pages_map); 1349 memory_bm_position_reset(forbidden_pages_map);
1328 memory_bm_position_reset(free_pages_map); 1350 memory_bm_position_reset(free_pages_map);
1329 1351
@@ -1351,6 +1373,7 @@ loop:
1351 goto loop; 1373 goto loop;
1352 } 1374 }
1353 1375
1376out:
1354 nr_copy_pages = 0; 1377 nr_copy_pages = 0;
1355 nr_meta_pages = 0; 1378 nr_meta_pages = 0;
1356 restore_pblist = NULL; 1379 restore_pblist = NULL;
@@ -2015,7 +2038,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
2015 do { 2038 do {
2016 pfn = memory_bm_next_pfn(bm); 2039 pfn = memory_bm_next_pfn(bm);
2017 if (likely(pfn != BM_END_OF_MAP)) { 2040 if (likely(pfn != BM_END_OF_MAP)) {
2018 if (likely(pfn_valid(pfn))) 2041 if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn))
2019 swsusp_set_page_free(pfn_to_page(pfn)); 2042 swsusp_set_page_free(pfn_to_page(pfn));
2020 else 2043 else
2021 return -EFAULT; 2044 return -EFAULT;
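is_nosave_page() above walks the registered nosave regions and reports whether a pfn falls inside any half-open [start_pfn, end_pfn) range. The same membership test over a plain table (a made-up struct, without the kernel's list_head plumbing):

#include <stdbool.h>
#include <stddef.h>

struct nosave_range {
	unsigned long start_pfn;	/* inclusive */
	unsigned long end_pfn;		/* exclusive */
};

bool pfn_is_nosave(unsigned long pfn, const struct nosave_range *r, size_t nr)
{
	size_t i;

	for (i = 0; i < nr; i++)
		if (pfn >= r[i].start_pfn && pfn < r[i].end_pfn)
			return true;

	return false;
}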
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6dadb25cb0d8..c347e3ce3a55 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,7 +31,7 @@
31 31
32#include "power.h" 32#include "power.h"
33 33
34static const char *pm_labels[] = { "mem", "standby", "freeze", }; 34const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
35const char *pm_states[PM_SUSPEND_MAX]; 35const char *pm_states[PM_SUSPEND_MAX];
36 36
37static const struct platform_suspend_ops *suspend_ops; 37static const struct platform_suspend_ops *suspend_ops;
@@ -146,17 +146,29 @@ static int platform_suspend_prepare(suspend_state_t state)
146 146
147static int platform_suspend_prepare_late(suspend_state_t state) 147static int platform_suspend_prepare_late(suspend_state_t state)
148{ 148{
149 return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ?
150 freeze_ops->prepare() : 0;
151}
152
153static int platform_suspend_prepare_noirq(suspend_state_t state)
154{
149 return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ? 155 return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ?
150 suspend_ops->prepare_late() : 0; 156 suspend_ops->prepare_late() : 0;
151} 157}
152 158
153static void platform_suspend_wake(suspend_state_t state) 159static void platform_resume_noirq(suspend_state_t state)
154{ 160{
155 if (state != PM_SUSPEND_FREEZE && suspend_ops->wake) 161 if (state != PM_SUSPEND_FREEZE && suspend_ops->wake)
156 suspend_ops->wake(); 162 suspend_ops->wake();
157} 163}
158 164
159static void platform_suspend_finish(suspend_state_t state) 165static void platform_resume_early(suspend_state_t state)
166{
167 if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore)
168 freeze_ops->restore();
169}
170
171static void platform_resume_finish(suspend_state_t state)
160{ 172{
161 if (state != PM_SUSPEND_FREEZE && suspend_ops->finish) 173 if (state != PM_SUSPEND_FREEZE && suspend_ops->finish)
162 suspend_ops->finish(); 174 suspend_ops->finish();
@@ -172,7 +184,7 @@ static int platform_suspend_begin(suspend_state_t state)
172 return 0; 184 return 0;
173} 185}
174 186
175static void platform_suspend_end(suspend_state_t state) 187static void platform_resume_end(suspend_state_t state)
176{ 188{
177 if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) 189 if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
178 freeze_ops->end(); 190 freeze_ops->end();
@@ -180,7 +192,7 @@ static void platform_suspend_end(suspend_state_t state)
180 suspend_ops->end(); 192 suspend_ops->end();
181} 193}
182 194
183static void platform_suspend_recover(suspend_state_t state) 195static void platform_recover(suspend_state_t state)
184{ 196{
185 if (state != PM_SUSPEND_FREEZE && suspend_ops->recover) 197 if (state != PM_SUSPEND_FREEZE && suspend_ops->recover)
186 suspend_ops->recover(); 198 suspend_ops->recover();
@@ -265,13 +277,22 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
265 if (error) 277 if (error)
266 goto Platform_finish; 278 goto Platform_finish;
267 279
268 error = dpm_suspend_end(PMSG_SUSPEND); 280 error = dpm_suspend_late(PMSG_SUSPEND);
269 if (error) { 281 if (error) {
270 printk(KERN_ERR "PM: Some devices failed to power down\n"); 282 printk(KERN_ERR "PM: late suspend of devices failed\n");
271 goto Platform_finish; 283 goto Platform_finish;
272 } 284 }
273 error = platform_suspend_prepare_late(state); 285 error = platform_suspend_prepare_late(state);
274 if (error) 286 if (error)
287 goto Devices_early_resume;
288
289 error = dpm_suspend_noirq(PMSG_SUSPEND);
290 if (error) {
291 printk(KERN_ERR "PM: noirq suspend of devices failed\n");
292 goto Platform_early_resume;
293 }
294 error = platform_suspend_prepare_noirq(state);
295 if (error)
275 goto Platform_wake; 296 goto Platform_wake;
276 297
277 if (suspend_test(TEST_PLATFORM)) 298 if (suspend_test(TEST_PLATFORM))
@@ -318,11 +339,17 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
318 enable_nonboot_cpus(); 339 enable_nonboot_cpus();
319 340
320 Platform_wake: 341 Platform_wake:
321 platform_suspend_wake(state); 342 platform_resume_noirq(state);
322 dpm_resume_start(PMSG_RESUME); 343 dpm_resume_noirq(PMSG_RESUME);
344
345 Platform_early_resume:
346 platform_resume_early(state);
347
348 Devices_early_resume:
349 dpm_resume_early(PMSG_RESUME);
323 350
324 Platform_finish: 351 Platform_finish:
325 platform_suspend_finish(state); 352 platform_resume_finish(state);
326 return error; 353 return error;
327} 354}
328 355
@@ -361,14 +388,16 @@ int suspend_devices_and_enter(suspend_state_t state)
361 suspend_test_start(); 388 suspend_test_start();
362 dpm_resume_end(PMSG_RESUME); 389 dpm_resume_end(PMSG_RESUME);
363 suspend_test_finish("resume devices"); 390 suspend_test_finish("resume devices");
391 trace_suspend_resume(TPS("resume_console"), state, true);
364 resume_console(); 392 resume_console();
393 trace_suspend_resume(TPS("resume_console"), state, false);
365 394
366 Close: 395 Close:
367 platform_suspend_end(state); 396 platform_resume_end(state);
368 return error; 397 return error;
369 398
370 Recover_platform: 399 Recover_platform:
371 platform_suspend_recover(state); 400 platform_recover(state);
372 goto Resume_devices; 401 goto Resume_devices;
373} 402}
374 403
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 2f524928b6aa..084452e34a12 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -22,6 +22,8 @@
22#define TEST_SUSPEND_SECONDS 10 22#define TEST_SUSPEND_SECONDS 10
23 23
24static unsigned long suspend_test_start_time; 24static unsigned long suspend_test_start_time;
25static u32 test_repeat_count_max = 1;
26static u32 test_repeat_count_current;
25 27
26void suspend_test_start(void) 28void suspend_test_start(void)
27{ 29{
@@ -74,6 +76,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
74 int status; 76 int status;
75 77
76 /* this may fail if the RTC hasn't been initialized */ 78 /* this may fail if the RTC hasn't been initialized */
79repeat:
77 status = rtc_read_time(rtc, &alm.time); 80 status = rtc_read_time(rtc, &alm.time);
78 if (status < 0) { 81 if (status < 0) {
79 printk(err_readtime, dev_name(&rtc->dev), status); 82 printk(err_readtime, dev_name(&rtc->dev), status);
@@ -100,10 +103,21 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
100 if (state == PM_SUSPEND_STANDBY) { 103 if (state == PM_SUSPEND_STANDBY) {
101 printk(info_test, pm_states[state]); 104 printk(info_test, pm_states[state]);
102 status = pm_suspend(state); 105 status = pm_suspend(state);
106 if (status < 0)
107 state = PM_SUSPEND_FREEZE;
103 } 108 }
109 if (state == PM_SUSPEND_FREEZE) {
110 printk(info_test, pm_states[state]);
111 status = pm_suspend(state);
112 }
113
104 if (status < 0) 114 if (status < 0)
105 printk(err_suspend, status); 115 printk(err_suspend, status);
106 116
117 test_repeat_count_current++;
118 if (test_repeat_count_current < test_repeat_count_max)
119 goto repeat;
120
107 /* Some platforms can't detect that the alarm triggered the 121 /* Some platforms can't detect that the alarm triggered the
108 * wakeup, or (accordingly) disable it afterwards. 122 * wakeup, or (accordingly) disable it afterwards.
109 * It's supposed to give oneshot behavior; cope. 123 * It's supposed to give oneshot behavior; cope.
@@ -129,24 +143,36 @@ static int __init has_wakealarm(struct device *dev, const void *data)
129 * at startup time. They're normally disabled, for faster boot and because 143 * at startup time. They're normally disabled, for faster boot and because
130 * we can't know which states really work on this particular system. 144 * we can't know which states really work on this particular system.
131 */ 145 */
132static suspend_state_t test_state __initdata = PM_SUSPEND_ON; 146static const char *test_state_label __initdata;
133 147
134static char warn_bad_state[] __initdata = 148static char warn_bad_state[] __initdata =
135 KERN_WARNING "PM: can't test '%s' suspend state\n"; 149 KERN_WARNING "PM: can't test '%s' suspend state\n";
136 150
137static int __init setup_test_suspend(char *value) 151static int __init setup_test_suspend(char *value)
138{ 152{
139 suspend_state_t i; 153 int i;
154 char *repeat;
155 char *suspend_type;
140 156
141 /* "=mem" ==> "mem" */ 157 /* example : "=mem[,N]" ==> "mem[,N]" */
142 value++; 158 value++;
143 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) 159 suspend_type = strsep(&value, ",");
144 if (!strcmp(pm_states[i], value)) { 160 if (!suspend_type)
145 test_state = i; 161 return 0;
162
163 repeat = strsep(&value, ",");
164 if (repeat) {
165 if (kstrtou32(repeat, 0, &test_repeat_count_max))
166 return 0;
167 }
168
169 for (i = 0; pm_labels[i]; i++)
170 if (!strcmp(pm_labels[i], suspend_type)) {
171 test_state_label = pm_labels[i];
146 return 0; 172 return 0;
147 } 173 }
148 174
149 printk(warn_bad_state, value); 175 printk(warn_bad_state, suspend_type);
150 return 0; 176 return 0;
151} 177}
152__setup("test_suspend", setup_test_suspend); 178__setup("test_suspend", setup_test_suspend);
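
setup_test_suspend() now parses "test_suspend=mem[,N]": the value is split at the first comma into a state label and an optional repeat count (strsep() plus kstrtou32() in the kernel), and the label is looked up in pm_labels[]. A rough userspace equivalent, with a hard-coded value and a made-up labels[] table standing in for pm_labels[]:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Labels this sketch accepts; the kernel walks its pm_labels[] table instead. */
static const char *labels[] = { "freeze", "standby", "mem", NULL };

int main(void)
{
	char value[] = "mem,5";		/* what follows "test_suspend=" */
	char *comma, *type = value;
	unsigned long repeat = 1;	/* analogue of test_repeat_count_max */
	int i;

	/* split "mem,5" into the state label and the optional repeat count */
	comma = strchr(value, ',');
	if (comma) {
		*comma = '\0';
		repeat = strtoul(comma + 1, NULL, 0);	/* kstrtou32() stand-in */
	}

	for (i = 0; labels[i]; i++)
		if (!strcmp(labels[i], type)) {
			printf("testing '%s' %lu time(s)\n", labels[i], repeat);
			return 0;
		}

	fprintf(stderr, "can't test '%s' suspend state\n", type);
	return 1;
}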
@@ -158,13 +184,21 @@ static int __init test_suspend(void)
158 184
159 struct rtc_device *rtc = NULL; 185 struct rtc_device *rtc = NULL;
160 struct device *dev; 186 struct device *dev;
187 suspend_state_t test_state;
161 188
162 /* PM is initialized by now; is that state testable? */ 189 /* PM is initialized by now; is that state testable? */
163 if (test_state == PM_SUSPEND_ON) 190 if (!test_state_label)
164 goto done; 191 return 0;
165 if (!pm_states[test_state]) { 192
166 printk(warn_bad_state, pm_states[test_state]); 193 for (test_state = PM_SUSPEND_MIN; test_state < PM_SUSPEND_MAX; test_state++) {
167 goto done; 194 const char *state_label = pm_states[test_state];
195
196 if (state_label && !strcmp(test_state_label, state_label))
197 break;
198 }
199 if (test_state == PM_SUSPEND_MAX) {
200 printk(warn_bad_state, test_state_label);
201 return 0;
168 } 202 }
169 203
170 /* RTCs have initialized by now too ... can we use one? */ 204 /* RTCs have initialized by now too ... can we use one? */
@@ -173,13 +207,12 @@ static int __init test_suspend(void)
173 rtc = rtc_class_open(dev_name(dev)); 207 rtc = rtc_class_open(dev_name(dev));
174 if (!rtc) { 208 if (!rtc) {
175 printk(warn_no_rtc); 209 printk(warn_no_rtc);
176 goto done; 210 return 0;
177 } 211 }
178 212
179 /* go for it */ 213 /* go for it */
180 test_wakealarm(rtc, test_state); 214 test_wakealarm(rtc, test_state);
181 rtc_class_close(rtc); 215 rtc_class_close(rtc);
182done:
183 return 0; 216 return 0;
184} 217}
185late_initcall(test_suspend); 218late_initcall(test_suspend);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 13e839dbca07..ced2b84b1cb7 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -45,6 +45,7 @@
45#include <linux/poll.h> 45#include <linux/poll.h>
46#include <linux/irq_work.h> 46#include <linux/irq_work.h>
47#include <linux/utsname.h> 47#include <linux/utsname.h>
48#include <linux/ctype.h>
48 49
49#include <asm/uaccess.h> 50#include <asm/uaccess.h>
50 51
@@ -56,7 +57,7 @@
56 57
57int console_printk[4] = { 58int console_printk[4] = {
58 CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ 59 CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
59 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ 60 MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */
60 CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ 61 CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
61 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ 62 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
62}; 63};
@@ -113,9 +114,9 @@ static int __down_trylock_console_sem(unsigned long ip)
113 * This is used for debugging the mess that is the VT code by 114 * This is used for debugging the mess that is the VT code by
114 * keeping track if we have the console semaphore held. It's 115 * keeping track if we have the console semaphore held. It's
115 * definitely not the perfect debug tool (we don't know if _WE_ 116 * definitely not the perfect debug tool (we don't know if _WE_
116 * hold it are racing, but it helps tracking those weird code 117 * hold it and are racing, but it helps tracking those weird code
117 * path in the console code where we end up in places I want 118 * paths in the console code where we end up in places I want
118 * locked without the console sempahore held 119 * locked without the console sempahore held).
119 */ 120 */
120static int console_locked, console_suspended; 121static int console_locked, console_suspended;
121 122
@@ -146,8 +147,8 @@ static int console_may_schedule;
146 * the overall length of the record. 147 * the overall length of the record.
147 * 148 *
148 * The heads to the first and last entry in the buffer, as well as the 149 * The heads to the first and last entry in the buffer, as well as the
149 * sequence numbers of these both entries are maintained when messages 150 * sequence numbers of these entries are maintained when messages are
150 * are stored.. 151 * stored.
151 * 152 *
152 * If the heads indicate available messages, the length in the header 153 * If the heads indicate available messages, the length in the header
153 * tells the start next message. A length == 0 for the next message 154 * tells the start next message. A length == 0 for the next message
@@ -257,7 +258,7 @@ static u64 clear_seq;
257static u32 clear_idx; 258static u32 clear_idx;
258 259
259#define PREFIX_MAX 32 260#define PREFIX_MAX 32
260#define LOG_LINE_MAX 1024 - PREFIX_MAX 261#define LOG_LINE_MAX (1024 - PREFIX_MAX)
261 262
262/* record buffer */ 263/* record buffer */
263#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 264#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
@@ -270,6 +271,18 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
270static char *log_buf = __log_buf; 271static char *log_buf = __log_buf;
271static u32 log_buf_len = __LOG_BUF_LEN; 272static u32 log_buf_len = __LOG_BUF_LEN;
272 273
274/* Return log buffer address */
275char *log_buf_addr_get(void)
276{
277 return log_buf;
278}
279
280/* Return log buffer size */
281u32 log_buf_len_get(void)
282{
283 return log_buf_len;
284}
285
273/* human readable text of the record */ 286/* human readable text of the record */
274static char *log_text(const struct printk_log *msg) 287static char *log_text(const struct printk_log *msg)
275{ 288{
@@ -344,7 +357,7 @@ static int log_make_free_space(u32 msg_size)
344 while (log_first_seq < log_next_seq) { 357 while (log_first_seq < log_next_seq) {
345 if (logbuf_has_space(msg_size, false)) 358 if (logbuf_has_space(msg_size, false))
346 return 0; 359 return 0;
347 /* drop old messages until we have enough continuous space */ 360 /* drop old messages until we have enough contiguous space */
348 log_first_idx = log_next(log_first_idx); 361 log_first_idx = log_next(log_first_idx);
349 log_first_seq++; 362 log_first_seq++;
350 } 363 }
@@ -453,11 +466,7 @@ static int log_store(int facility, int level,
453 return msg->text_len; 466 return msg->text_len;
454} 467}
455 468
456#ifdef CONFIG_SECURITY_DMESG_RESTRICT 469int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);
457int dmesg_restrict = 1;
458#else
459int dmesg_restrict;
460#endif
461 470
462static int syslog_action_restricted(int type) 471static int syslog_action_restricted(int type)
463{ 472{
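
Replacing the #ifdef block with IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT) works because the macro evaluates to a plain 1 or 0 at preprocessing time, so it can sit directly in an initializer. The sketch below is a simplified re-derivation of the trick for symbols that are either undefined or defined to 1; the real macro in include/linux/kconfig.h also covers the =m (module) case.

#include <stdio.h>

/*
 * If the option is defined to 1, __ARG_PLACEHOLDER_1 expands to "0," and the
 * variadic helper sees (0, 1, 0), picking 1; if the option is undefined, the
 * placeholder stays a single junk token and the helper picks the trailing 0.
 */
#define __ARG_PLACEHOLDER_1			0,
#define __take_second_arg(__ignored, val, ...)	val

#define ____is_defined(arg1_or_junk)	__take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val)		____is_defined(__ARG_PLACEHOLDER_##val)
#define __is_defined(x)			___is_defined(x)
#define IS_ENABLED(option)		__is_defined(option)

/* Pretend the build sets this option; remove the define to get 0 instead. */
#define CONFIG_SECURITY_DMESG_RESTRICT 1

int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);

int main(void)
{
	printf("dmesg_restrict = %d\n", dmesg_restrict);
	return 0;
}

Because the result is an ordinary integer constant expression, the same macro also works inside C conditionals, which is how many #ifdef blocks get turned into regular if () statements that the compiler can eliminate.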
@@ -509,14 +518,13 @@ struct devkmsg_user {
509 char buf[8192]; 518 char buf[8192];
510}; 519};
511 520
512static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, 521static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
513 unsigned long count, loff_t pos)
514{ 522{
515 char *buf, *line; 523 char *buf, *line;
516 int i; 524 int i;
517 int level = default_message_loglevel; 525 int level = default_message_loglevel;
518 int facility = 1; /* LOG_USER */ 526 int facility = 1; /* LOG_USER */
519 size_t len = iov_length(iv, count); 527 size_t len = iocb->ki_nbytes;
520 ssize_t ret = len; 528 ssize_t ret = len;
521 529
522 if (len > LOG_LINE_MAX) 530 if (len > LOG_LINE_MAX)
@@ -525,13 +533,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
525 if (buf == NULL) 533 if (buf == NULL)
526 return -ENOMEM; 534 return -ENOMEM;
527 535
528 line = buf; 536 buf[len] = '\0';
529 for (i = 0; i < count; i++) { 537 if (copy_from_iter(buf, len, from) != len) {
530 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) { 538 kfree(buf);
531 ret = -EFAULT; 539 return -EFAULT;
532 goto out;
533 }
534 line += iv[i].iov_len;
535 } 540 }
536 541
537 /* 542 /*
@@ -557,10 +562,8 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
557 line = endp; 562 line = endp;
558 } 563 }
559 } 564 }
560 line[len] = '\0';
561 565
562 printk_emit(facility, level, NULL, 0, "%s", line); 566 printk_emit(facility, level, NULL, 0, "%s", line);
563out:
564 kfree(buf); 567 kfree(buf);
565 return ret; 568 return ret;
566} 569}
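
The devkmsg_write() conversion replaces the per-iovec copy_from_user() loop with a single copy_from_iter() call and terminates the buffer before parsing rather than after. The sketch below only illustrates the data movement, gathering scattered iovec segments into one NUL-terminated line in userspace; it is not the kernel iov_iter API.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

/* Gather all segments into one freshly allocated, NUL-terminated buffer. */
static char *flatten_iov(const struct iovec *iov, int iovcnt)
{
	size_t len = 0, off = 0;
	char *buf;
	int i;

	for (i = 0; i < iovcnt; i++)
		len += iov[i].iov_len;

	buf = malloc(len + 1);
	if (!buf)
		return NULL;

	buf[len] = '\0';	/* terminate up front, as the new code does */
	for (i = 0; i < iovcnt; i++) {
		memcpy(buf + off, iov[i].iov_base, iov[i].iov_len);
		off += iov[i].iov_len;
	}
	return buf;
}

int main(void)
{
	char a[] = "<5>hello ", b[] = "from a scattered write";
	struct iovec iov[] = {
		{ .iov_base = a, .iov_len = strlen(a) },
		{ .iov_base = b, .iov_len = strlen(b) },
	};
	char *line = flatten_iov(iov, 2);

	if (!line)
		return 1;
	printf("%s\n", line);
	free(line);
	return 0;
}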
@@ -792,7 +795,7 @@ static int devkmsg_release(struct inode *inode, struct file *file)
792const struct file_operations kmsg_fops = { 795const struct file_operations kmsg_fops = {
793 .open = devkmsg_open, 796 .open = devkmsg_open,
794 .read = devkmsg_read, 797 .read = devkmsg_read,
795 .aio_write = devkmsg_writev, 798 .write_iter = devkmsg_write,
796 .llseek = devkmsg_llseek, 799 .llseek = devkmsg_llseek,
797 .poll = devkmsg_poll, 800 .poll = devkmsg_poll,
798 .release = devkmsg_release, 801 .release = devkmsg_release,
@@ -828,34 +831,80 @@ void log_buf_kexec_setup(void)
828/* requested log_buf_len from kernel cmdline */ 831/* requested log_buf_len from kernel cmdline */
829static unsigned long __initdata new_log_buf_len; 832static unsigned long __initdata new_log_buf_len;
830 833
831/* save requested log_buf_len since it's too early to process it */ 834/* we practice scaling the ring buffer by powers of 2 */
832static int __init log_buf_len_setup(char *str) 835static void __init log_buf_len_update(unsigned size)
833{ 836{
834 unsigned size = memparse(str, &str);
835
836 if (size) 837 if (size)
837 size = roundup_pow_of_two(size); 838 size = roundup_pow_of_two(size);
838 if (size > log_buf_len) 839 if (size > log_buf_len)
839 new_log_buf_len = size; 840 new_log_buf_len = size;
841}
842
843/* save requested log_buf_len since it's too early to process it */
844static int __init log_buf_len_setup(char *str)
845{
846 unsigned size = memparse(str, &str);
847
848 log_buf_len_update(size);
840 849
841 return 0; 850 return 0;
842} 851}
843early_param("log_buf_len", log_buf_len_setup); 852early_param("log_buf_len", log_buf_len_setup);
844 853
854#ifdef CONFIG_SMP
855#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT)
856
857static void __init log_buf_add_cpu(void)
858{
859 unsigned int cpu_extra;
860
861 /*
862 * archs should set up cpu_possible_bits properly with
863 * set_cpu_possible() after setup_arch() but just in
864 * case lets ensure this is valid.
865 */
866 if (num_possible_cpus() == 1)
867 return;
868
869 cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN;
870
871 /* by default this will only continue through for large > 64 CPUs */
872 if (cpu_extra <= __LOG_BUF_LEN / 2)
873 return;
874
875 pr_info("log_buf_len individual max cpu contribution: %d bytes\n",
876 __LOG_CPU_MAX_BUF_LEN);
877 pr_info("log_buf_len total cpu_extra contributions: %d bytes\n",
878 cpu_extra);
879 pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN);
880
881 log_buf_len_update(cpu_extra + __LOG_BUF_LEN);
882}
883#else /* !CONFIG_SMP */
884static inline void log_buf_add_cpu(void) {}
885#endif /* CONFIG_SMP */
886
845void __init setup_log_buf(int early) 887void __init setup_log_buf(int early)
846{ 888{
847 unsigned long flags; 889 unsigned long flags;
848 char *new_log_buf; 890 char *new_log_buf;
849 int free; 891 int free;
850 892
893 if (log_buf != __log_buf)
894 return;
895
896 if (!early && !new_log_buf_len)
897 log_buf_add_cpu();
898
851 if (!new_log_buf_len) 899 if (!new_log_buf_len)
852 return; 900 return;
853 901
854 if (early) { 902 if (early) {
855 new_log_buf = 903 new_log_buf =
856 memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); 904 memblock_virt_alloc(new_log_buf_len, LOG_ALIGN);
857 } else { 905 } else {
858 new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); 906 new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len,
907 LOG_ALIGN);
859 } 908 }
860 909
861 if (unlikely(!new_log_buf)) { 910 if (unlikely(!new_log_buf)) {
@@ -872,7 +921,7 @@ void __init setup_log_buf(int early)
872 memcpy(log_buf, __log_buf, __LOG_BUF_LEN); 921 memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
873 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 922 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
874 923
875 pr_info("log_buf_len: %d\n", log_buf_len); 924 pr_info("log_buf_len: %d bytes\n", log_buf_len);
876 pr_info("early log buf free: %d(%d%%)\n", 925 pr_info("early log buf free: %d(%d%%)\n",
877 free, (free * 100) / __LOG_BUF_LEN); 926 free, (free * 100) / __LOG_BUF_LEN);
878} 927}
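
The new log_buf_len_update()/log_buf_add_cpu() path grows the ring buffer only when the summed per-CPU contribution dwarfs the static buffer, and always rounds the requested size up to a power of two. A standalone sketch of that arithmetic; the buffer sizes and the CPU count are made-up stand-ins for __LOG_BUF_LEN, __LOG_CPU_MAX_BUF_LEN and num_possible_cpus().

#include <stdio.h>

#define LOG_BUF_LEN		(1u << 17)	/* assume a 128 KiB static buffer */
#define LOG_CPU_MAX_BUF_LEN	(1u << 12)	/* assume a 4 KiB per-CPU cap */

/* Round a 32-bit value up to the next power of two, like roundup_pow_of_two(). */
static unsigned int roundup_pow2(unsigned int v)
{
	v--;
	v |= v >> 1;
	v |= v >> 2;
	v |= v >> 4;
	v |= v >> 8;
	v |= v >> 16;
	return v + 1;
}

int main(void)
{
	unsigned int possible_cpus = 128;	/* pretend num_possible_cpus() */
	unsigned int cpu_extra = (possible_cpus - 1) * LOG_CPU_MAX_BUF_LEN;
	unsigned int new_len = LOG_BUF_LEN;

	/* only grow when the per-CPU share exceeds half the static buffer */
	if (cpu_extra > LOG_BUF_LEN / 2)
		new_len = roundup_pow2(cpu_extra + LOG_BUF_LEN);

	printf("cpu_extra=%u bytes, log_buf_len=%u bytes\n", cpu_extra, new_len);
	return 0;
}

With these example numbers, the 520192-byte per-CPU share pushes the buffer from 128 KiB up to 1 MiB, the next power of two above 651264 bytes.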
@@ -881,7 +930,7 @@ static bool __read_mostly ignore_loglevel;
881 930
882static int __init ignore_loglevel_setup(char *str) 931static int __init ignore_loglevel_setup(char *str)
883{ 932{
884 ignore_loglevel = 1; 933 ignore_loglevel = true;
885 pr_info("debug: ignoring loglevel setting.\n"); 934 pr_info("debug: ignoring loglevel setting.\n");
886 935
887 return 0; 936 return 0;
@@ -947,11 +996,7 @@ static inline void boot_delay_msec(int level)
947} 996}
948#endif 997#endif
949 998
950#if defined(CONFIG_PRINTK_TIME) 999static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
951static bool printk_time = 1;
952#else
953static bool printk_time;
954#endif
955module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 1000module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
956 1001
957static size_t print_time(u64 ts, char *buf) 1002static size_t print_time(u64 ts, char *buf)
@@ -1310,7 +1355,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1310 * for pending data, not the size; return the count of 1355 * for pending data, not the size; return the count of
1311 * records, not the length. 1356 * records, not the length.
1312 */ 1357 */
1313 error = log_next_idx - syslog_idx; 1358 error = log_next_seq - syslog_seq;
1314 } else { 1359 } else {
1315 u64 seq = syslog_seq; 1360 u64 seq = syslog_seq;
1316 u32 idx = syslog_idx; 1361 u32 idx = syslog_idx;
@@ -1416,10 +1461,9 @@ static int have_callable_console(void)
1416/* 1461/*
1417 * Can we actually use the console at this time on this cpu? 1462 * Can we actually use the console at this time on this cpu?
1418 * 1463 *
1419 * Console drivers may assume that per-cpu resources have 1464 * Console drivers may assume that per-cpu resources have been allocated. So
1420 * been allocated. So unless they're explicitly marked as 1465 * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
1421 * being able to cope (CON_ANYTIME) don't call them until 1466 * call them until this CPU is officially up.
1422 * this CPU is officially up.
1423 */ 1467 */
1424static inline int can_use_console(unsigned int cpu) 1468static inline int can_use_console(unsigned int cpu)
1425{ 1469{
@@ -1432,8 +1476,10 @@ static inline int can_use_console(unsigned int cpu)
1432 * console_lock held, and 'console_locked' set) if it 1476 * console_lock held, and 'console_locked' set) if it
1433 * is successful, false otherwise. 1477 * is successful, false otherwise.
1434 */ 1478 */
1435static int console_trylock_for_printk(unsigned int cpu) 1479static int console_trylock_for_printk(void)
1436{ 1480{
1481 unsigned int cpu = smp_processor_id();
1482
1437 if (!console_trylock()) 1483 if (!console_trylock())
1438 return 0; 1484 return 0;
1439 /* 1485 /*
@@ -1476,7 +1522,7 @@ static struct cont {
1476 struct task_struct *owner; /* task of first print*/ 1522 struct task_struct *owner; /* task of first print*/
1477 u64 ts_nsec; /* time of first print */ 1523 u64 ts_nsec; /* time of first print */
1478 u8 level; /* log level of first message */ 1524 u8 level; /* log level of first message */
1479 u8 facility; /* log level of first message */ 1525 u8 facility; /* log facility of first message */
1480 enum log_flags flags; /* prefix, newline flags */ 1526 enum log_flags flags; /* prefix, newline flags */
1481 bool flushed:1; /* buffer sealed and committed */ 1527 bool flushed:1; /* buffer sealed and committed */
1482} cont; 1528} cont;
@@ -1608,7 +1654,8 @@ asmlinkage int vprintk_emit(int facility, int level,
1608 */ 1654 */
1609 if (!oops_in_progress && !lockdep_recursing(current)) { 1655 if (!oops_in_progress && !lockdep_recursing(current)) {
1610 recursion_bug = 1; 1656 recursion_bug = 1;
1611 goto out_restore_irqs; 1657 local_irq_restore(flags);
1658 return 0;
1612 } 1659 }
1613 zap_locks(); 1660 zap_locks();
1614 } 1661 }
@@ -1617,27 +1664,22 @@ asmlinkage int vprintk_emit(int facility, int level,
1617 raw_spin_lock(&logbuf_lock); 1664 raw_spin_lock(&logbuf_lock);
1618 logbuf_cpu = this_cpu; 1665 logbuf_cpu = this_cpu;
1619 1666
1620 if (recursion_bug) { 1667 if (unlikely(recursion_bug)) {
1621 static const char recursion_msg[] = 1668 static const char recursion_msg[] =
1622 "BUG: recent printk recursion!"; 1669 "BUG: recent printk recursion!";
1623 1670
1624 recursion_bug = 0; 1671 recursion_bug = 0;
1625 text_len = strlen(recursion_msg);
1626 /* emit KERN_CRIT message */ 1672 /* emit KERN_CRIT message */
1627 printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, 1673 printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
1628 NULL, 0, recursion_msg, text_len); 1674 NULL, 0, recursion_msg,
1675 strlen(recursion_msg));
1629 } 1676 }
1630 1677
1631 /* 1678 /*
1632 * The printf needs to come first; we need the syslog 1679 * The printf needs to come first; we need the syslog
1633 * prefix which might be passed-in as a parameter. 1680 * prefix which might be passed-in as a parameter.
1634 */ 1681 */
1635 if (in_sched) 1682 text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
1636 text_len = scnprintf(text, sizeof(textbuf),
1637 KERN_WARNING "[sched_delayed] ");
1638
1639 text_len += vscnprintf(text + text_len,
1640 sizeof(textbuf) - text_len, fmt, args);
1641 1683
1642 /* mark and strip a trailing newline */ 1684 /* mark and strip a trailing newline */
1643 if (text_len && text[text_len-1] == '\n') { 1685 if (text_len && text[text_len-1] == '\n') {
@@ -1716,21 +1758,30 @@ asmlinkage int vprintk_emit(int facility, int level,
1716 1758
1717 logbuf_cpu = UINT_MAX; 1759 logbuf_cpu = UINT_MAX;
1718 raw_spin_unlock(&logbuf_lock); 1760 raw_spin_unlock(&logbuf_lock);
1761 lockdep_on();
1762 local_irq_restore(flags);
1719 1763
1720 /* If called from the scheduler, we can not call up(). */ 1764 /* If called from the scheduler, we can not call up(). */
1721 if (!in_sched) { 1765 if (!in_sched) {
1766 lockdep_off();
1767 /*
1768 * Disable preemption to avoid being preempted while holding
1769 * console_sem which would prevent anyone from printing to
1770 * console
1771 */
1772 preempt_disable();
1773
1722 /* 1774 /*
1723 * Try to acquire and then immediately release the console 1775 * Try to acquire and then immediately release the console
1724 * semaphore. The release will print out buffers and wake up 1776 * semaphore. The release will print out buffers and wake up
1725 * /dev/kmsg and syslog() users. 1777 * /dev/kmsg and syslog() users.
1726 */ 1778 */
1727 if (console_trylock_for_printk(this_cpu)) 1779 if (console_trylock_for_printk())
1728 console_unlock(); 1780 console_unlock();
1781 preempt_enable();
1782 lockdep_on();
1729 } 1783 }
1730 1784
1731 lockdep_on();
1732out_restore_irqs:
1733 local_irq_restore(flags);
1734 return printed_len; 1785 return printed_len;
1735} 1786}
1736EXPORT_SYMBOL(vprintk_emit); 1787EXPORT_SYMBOL(vprintk_emit);
@@ -1802,7 +1853,7 @@ EXPORT_SYMBOL(printk);
1802 1853
1803#define LOG_LINE_MAX 0 1854#define LOG_LINE_MAX 0
1804#define PREFIX_MAX 0 1855#define PREFIX_MAX 0
1805#define LOG_LINE_MAX 0 1856
1806static u64 syslog_seq; 1857static u64 syslog_seq;
1807static u32 syslog_idx; 1858static u32 syslog_idx;
1808static u64 console_seq; 1859static u64 console_seq;
@@ -1881,11 +1932,12 @@ static int __add_preferred_console(char *name, int idx, char *options,
1881 return 0; 1932 return 0;
1882} 1933}
1883/* 1934/*
1884 * Set up a list of consoles. Called from init/main.c 1935 * Set up a console. Called via do_early_param() in init/main.c
1936 * for each "console=" parameter in the boot command line.
1885 */ 1937 */
1886static int __init console_setup(char *str) 1938static int __init console_setup(char *str)
1887{ 1939{
1888 char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ 1940 char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */
1889 char *s, *options, *brl_options = NULL; 1941 char *s, *options, *brl_options = NULL;
1890 int idx; 1942 int idx;
1891 1943
@@ -1902,7 +1954,8 @@ static int __init console_setup(char *str)
1902 strncpy(buf, str, sizeof(buf) - 1); 1954 strncpy(buf, str, sizeof(buf) - 1);
1903 } 1955 }
1904 buf[sizeof(buf) - 1] = 0; 1956 buf[sizeof(buf) - 1] = 0;
1905 if ((options = strchr(str, ',')) != NULL) 1957 options = strchr(str, ',');
1958 if (options)
1906 *(options++) = 0; 1959 *(options++) = 0;
1907#ifdef __sparc__ 1960#ifdef __sparc__
1908 if (!strcmp(str, "ttya")) 1961 if (!strcmp(str, "ttya"))
@@ -1911,7 +1964,7 @@ static int __init console_setup(char *str)
1911 strcpy(buf, "ttyS1"); 1964 strcpy(buf, "ttyS1");
1912#endif 1965#endif
1913 for (s = buf; *s; s++) 1966 for (s = buf; *s; s++)
1914 if ((*s >= '0' && *s <= '9') || *s == ',') 1967 if (isdigit(*s) || *s == ',')
1915 break; 1968 break;
1916 idx = simple_strtoul(s, NULL, 10); 1969 idx = simple_strtoul(s, NULL, 10);
1917 *s = 0; 1970 *s = 0;
@@ -1950,7 +2003,6 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
1950 i++, c++) 2003 i++, c++)
1951 if (strcmp(c->name, name) == 0 && c->index == idx) { 2004 if (strcmp(c->name, name) == 0 && c->index == idx) {
1952 strlcpy(c->name, name_new, sizeof(c->name)); 2005 strlcpy(c->name, name_new, sizeof(c->name));
1953 c->name[sizeof(c->name) - 1] = 0;
1954 c->options = options; 2006 c->options = options;
1955 c->index = idx_new; 2007 c->index = idx_new;
1956 return i; 2008 return i;
@@ -1959,12 +2011,12 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
1959 return -1; 2011 return -1;
1960} 2012}
1961 2013
1962bool console_suspend_enabled = 1; 2014bool console_suspend_enabled = true;
1963EXPORT_SYMBOL(console_suspend_enabled); 2015EXPORT_SYMBOL(console_suspend_enabled);
1964 2016
1965static int __init console_suspend_disable(char *str) 2017static int __init console_suspend_disable(char *str)
1966{ 2018{
1967 console_suspend_enabled = 0; 2019 console_suspend_enabled = false;
1968 return 1; 2020 return 1;
1969} 2021}
1970__setup("no_console_suspend", console_suspend_disable); 2022__setup("no_console_suspend", console_suspend_disable);
@@ -2045,8 +2097,8 @@ EXPORT_SYMBOL(console_lock);
2045/** 2097/**
2046 * console_trylock - try to lock the console system for exclusive use. 2098 * console_trylock - try to lock the console system for exclusive use.
2047 * 2099 *
2048 * Tried to acquire a lock which guarantees that the caller has 2100 * Try to acquire a lock which guarantees that the caller has exclusive
2049 * exclusive access to the console system and the console_drivers list. 2101 * access to the console system and the console_drivers list.
2050 * 2102 *
2051 * returns 1 on success, and 0 on failure to acquire the lock. 2103 * returns 1 on success, and 0 on failure to acquire the lock.
2052 */ 2104 */
@@ -2570,7 +2622,7 @@ void wake_up_klogd(void)
2570 preempt_disable(); 2622 preempt_disable();
2571 if (waitqueue_active(&log_wait)) { 2623 if (waitqueue_active(&log_wait)) {
2572 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); 2624 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
2573 irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); 2625 irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
2574 } 2626 }
2575 preempt_enable(); 2627 preempt_enable();
2576} 2628}
@@ -2586,7 +2638,7 @@ int printk_deferred(const char *fmt, ...)
2586 va_end(args); 2638 va_end(args);
2587 2639
2588 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); 2640 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
2589 irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); 2641 irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
2590 preempt_enable(); 2642 preempt_enable();
2591 2643
2592 return r; 2644 return r;
@@ -2618,14 +2670,13 @@ EXPORT_SYMBOL(__printk_ratelimit);
2618bool printk_timed_ratelimit(unsigned long *caller_jiffies, 2670bool printk_timed_ratelimit(unsigned long *caller_jiffies,
2619 unsigned int interval_msecs) 2671 unsigned int interval_msecs)
2620{ 2672{
2621 if (*caller_jiffies == 0 2673 unsigned long elapsed = jiffies - *caller_jiffies;
2622 || !time_in_range(jiffies, *caller_jiffies, 2674
2623 *caller_jiffies 2675 if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs))
2624 + msecs_to_jiffies(interval_msecs))) { 2676 return false;
2625 *caller_jiffies = jiffies; 2677
2626 return true; 2678 *caller_jiffies = jiffies;
2627 } 2679 return true;
2628 return false;
2629} 2680}
2630EXPORT_SYMBOL(printk_timed_ratelimit); 2681EXPORT_SYMBOL(printk_timed_ratelimit);
2631 2682
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 948a7693748e..240fa9094f83 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -49,11 +49,19 @@
49#include <linux/trace_clock.h> 49#include <linux/trace_clock.h>
50#include <asm/byteorder.h> 50#include <asm/byteorder.h>
51#include <linux/torture.h> 51#include <linux/torture.h>
52#include <linux/vmalloc.h>
52 53
53MODULE_LICENSE("GPL"); 54MODULE_LICENSE("GPL");
54MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); 55MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
55 56
56 57
58torture_param(int, cbflood_inter_holdoff, HZ,
59 "Holdoff between floods (jiffies)");
60torture_param(int, cbflood_intra_holdoff, 1,
61 "Holdoff between bursts (jiffies)");
62torture_param(int, cbflood_n_burst, 3, "# bursts in flood, zero to disable");
63torture_param(int, cbflood_n_per_burst, 20000,
64 "# callbacks per burst in flood");
57torture_param(int, fqs_duration, 0, 65torture_param(int, fqs_duration, 0,
58 "Duration of fqs bursts (us), 0 to disable"); 66 "Duration of fqs bursts (us), 0 to disable");
59torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); 67torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
@@ -96,10 +104,12 @@ module_param(torture_type, charp, 0444);
96MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); 104MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)");
97 105
98static int nrealreaders; 106static int nrealreaders;
107static int ncbflooders;
99static struct task_struct *writer_task; 108static struct task_struct *writer_task;
100static struct task_struct **fakewriter_tasks; 109static struct task_struct **fakewriter_tasks;
101static struct task_struct **reader_tasks; 110static struct task_struct **reader_tasks;
102static struct task_struct *stats_task; 111static struct task_struct *stats_task;
112static struct task_struct **cbflood_task;
103static struct task_struct *fqs_task; 113static struct task_struct *fqs_task;
104static struct task_struct *boost_tasks[NR_CPUS]; 114static struct task_struct *boost_tasks[NR_CPUS];
105static struct task_struct *stall_task; 115static struct task_struct *stall_task;
@@ -138,6 +148,7 @@ static long n_rcu_torture_boosts;
138static long n_rcu_torture_timers; 148static long n_rcu_torture_timers;
139static long n_barrier_attempts; 149static long n_barrier_attempts;
140static long n_barrier_successes; 150static long n_barrier_successes;
151static atomic_long_t n_cbfloods;
141static struct list_head rcu_torture_removed; 152static struct list_head rcu_torture_removed;
142 153
143static int rcu_torture_writer_state; 154static int rcu_torture_writer_state;
@@ -157,9 +168,9 @@ static int rcu_torture_writer_state;
157#else 168#else
158#define RCUTORTURE_RUNNABLE_INIT 0 169#define RCUTORTURE_RUNNABLE_INIT 0
159#endif 170#endif
160int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 171static int torture_runnable = RCUTORTURE_RUNNABLE_INIT;
161module_param(rcutorture_runnable, int, 0444); 172module_param(torture_runnable, int, 0444);
162MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); 173MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
163 174
164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) 175#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
165#define rcu_can_boost() 1 176#define rcu_can_boost() 1
@@ -182,7 +193,7 @@ static u64 notrace rcu_trace_clock_local(void)
182#endif /* #else #ifdef CONFIG_RCU_TRACE */ 193#endif /* #else #ifdef CONFIG_RCU_TRACE */
183 194
184static unsigned long boost_starttime; /* jiffies of next boost test start. */ 195static unsigned long boost_starttime; /* jiffies of next boost test start. */
185DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 196static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
186 /* and boost task create/destroy. */ 197 /* and boost task create/destroy. */
187static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ 198static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
188static bool barrier_phase; /* Test phase. */ 199static bool barrier_phase; /* Test phase. */
@@ -242,7 +253,7 @@ struct rcu_torture_ops {
242 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 253 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
243 void (*cb_barrier)(void); 254 void (*cb_barrier)(void);
244 void (*fqs)(void); 255 void (*fqs)(void);
245 void (*stats)(char *page); 256 void (*stats)(void);
246 int irq_capable; 257 int irq_capable;
247 int can_boost; 258 int can_boost;
248 const char *name; 259 const char *name;
@@ -525,21 +536,21 @@ static void srcu_torture_barrier(void)
525 srcu_barrier(&srcu_ctl); 536 srcu_barrier(&srcu_ctl);
526} 537}
527 538
528static void srcu_torture_stats(char *page) 539static void srcu_torture_stats(void)
529{ 540{
530 int cpu; 541 int cpu;
531 int idx = srcu_ctl.completed & 0x1; 542 int idx = srcu_ctl.completed & 0x1;
532 543
533 page += sprintf(page, "%s%s per-CPU(idx=%d):", 544 pr_alert("%s%s per-CPU(idx=%d):",
534 torture_type, TORTURE_FLAG, idx); 545 torture_type, TORTURE_FLAG, idx);
535 for_each_possible_cpu(cpu) { 546 for_each_possible_cpu(cpu) {
536 long c0, c1; 547 long c0, c1;
537 548
538 c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx]; 549 c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
539 c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]; 550 c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
540 page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1); 551 pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
541 } 552 }
542 sprintf(page, "\n"); 553 pr_cont("\n");
543} 554}
544 555
545static void srcu_torture_synchronize_expedited(void) 556static void srcu_torture_synchronize_expedited(void)
@@ -601,6 +612,52 @@ static struct rcu_torture_ops sched_ops = {
601 .name = "sched" 612 .name = "sched"
602}; 613};
603 614
615#ifdef CONFIG_TASKS_RCU
616
617/*
618 * Definitions for RCU-tasks torture testing.
619 */
620
621static int tasks_torture_read_lock(void)
622{
623 return 0;
624}
625
626static void tasks_torture_read_unlock(int idx)
627{
628}
629
630static void rcu_tasks_torture_deferred_free(struct rcu_torture *p)
631{
632 call_rcu_tasks(&p->rtort_rcu, rcu_torture_cb);
633}
634
635static struct rcu_torture_ops tasks_ops = {
636 .ttype = RCU_TASKS_FLAVOR,
637 .init = rcu_sync_torture_init,
638 .readlock = tasks_torture_read_lock,
639 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
640 .readunlock = tasks_torture_read_unlock,
641 .completed = rcu_no_completed,
642 .deferred_free = rcu_tasks_torture_deferred_free,
643 .sync = synchronize_rcu_tasks,
644 .exp_sync = synchronize_rcu_tasks,
645 .call = call_rcu_tasks,
646 .cb_barrier = rcu_barrier_tasks,
647 .fqs = NULL,
648 .stats = NULL,
649 .irq_capable = 1,
650 .name = "tasks"
651};
652
653#define RCUTORTURE_TASKS_OPS &tasks_ops,
654
655#else /* #ifdef CONFIG_TASKS_RCU */
656
657#define RCUTORTURE_TASKS_OPS
658
659#endif /* #else #ifdef CONFIG_TASKS_RCU */
660
604/* 661/*
605 * RCU torture priority-boost testing. Runs one real-time thread per 662 * RCU torture priority-boost testing. Runs one real-time thread per
606 * CPU for moderate bursts, repeatedly registering RCU callbacks and 663 * CPU for moderate bursts, repeatedly registering RCU callbacks and
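
The new tasks_ops table follows the rcu_torture_ops convention: each RCU flavor is described by a struct of function pointers, optional hooks such as .stats and .fqs may be left NULL, and callers check for NULL before invoking them (as the cb_barrier hunk further down also starts doing). A compact userspace sketch of that ops-table pattern with invented hook names:

#include <stdio.h>

struct torture_ops {
	int	(*readlock)(void);
	void	(*readunlock)(int idx);
	void	(*stats)(void);		/* optional hook */
	const char *name;
};

static int dummy_read_lock(void)       { return 0; }
static void dummy_read_unlock(int idx) { (void)idx; }

static const struct torture_ops tasks_like_ops = {
	.readlock   = dummy_read_lock,
	.readunlock = dummy_read_unlock,
	.stats      = NULL,		/* this flavor has no stats hook */
	.name       = "tasks-like",
};

static void run_one(const struct torture_ops *ops)
{
	int idx = ops->readlock();

	printf("torturing %s\n", ops->name);
	if (ops->stats)			/* optional hooks are NULL-checked */
		ops->stats();
	ops->readunlock(idx);
}

int main(void)
{
	run_one(&tasks_like_ops);
	return 0;
}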
@@ -667,7 +724,7 @@ static int rcu_torture_boost(void *arg)
667 } 724 }
668 call_rcu_time = jiffies; 725 call_rcu_time = jiffies;
669 } 726 }
670 cond_resched(); 727 cond_resched_rcu_qs();
671 stutter_wait("rcu_torture_boost"); 728 stutter_wait("rcu_torture_boost");
672 if (torture_must_stop()) 729 if (torture_must_stop())
673 goto checkwait; 730 goto checkwait;
@@ -707,6 +764,58 @@ checkwait: stutter_wait("rcu_torture_boost");
707 return 0; 764 return 0;
708} 765}
709 766
767static void rcu_torture_cbflood_cb(struct rcu_head *rhp)
768{
769}
770
771/*
772 * RCU torture callback-flood kthread. Repeatedly induces bursts of calls
773 * to call_rcu() or analogous, increasing the probability of occurrence
774 * of callback-overflow corner cases.
775 */
776static int
777rcu_torture_cbflood(void *arg)
778{
779 int err = 1;
780 int i;
781 int j;
782 struct rcu_head *rhp;
783
784 if (cbflood_n_per_burst > 0 &&
785 cbflood_inter_holdoff > 0 &&
786 cbflood_intra_holdoff > 0 &&
787 cur_ops->call &&
788 cur_ops->cb_barrier) {
789 rhp = vmalloc(sizeof(*rhp) *
790 cbflood_n_burst * cbflood_n_per_burst);
791 err = !rhp;
792 }
793 if (err) {
794 VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM");
795 while (!torture_must_stop())
796 schedule_timeout_interruptible(HZ);
797 return 0;
798 }
799 VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started");
800 do {
801 schedule_timeout_interruptible(cbflood_inter_holdoff);
802 atomic_long_inc(&n_cbfloods);
803 WARN_ON(signal_pending(current));
804 for (i = 0; i < cbflood_n_burst; i++) {
805 for (j = 0; j < cbflood_n_per_burst; j++) {
806 cur_ops->call(&rhp[i * cbflood_n_per_burst + j],
807 rcu_torture_cbflood_cb);
808 }
809 schedule_timeout_interruptible(cbflood_intra_holdoff);
810 WARN_ON(signal_pending(current));
811 }
812 cur_ops->cb_barrier();
813 stutter_wait("rcu_torture_cbflood");
814 } while (!torture_must_stop());
815 torture_kthread_stopping("rcu_torture_cbflood");
816 return 0;
817}
818
710/* 819/*
711 * RCU torture force-quiescent-state kthread. Repeatedly induces 820 * RCU torture force-quiescent-state kthread. Repeatedly induces
712 * bursts of calls to force_quiescent_state(), increasing the probability 821 * bursts of calls to force_quiescent_state(), increasing the probability
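
rcu_torture_cbflood() preallocates one rcu_head per callback, posts them in cbflood_n_burst bursts of cbflood_n_per_burst with holdoffs in between, and waits for cur_ops->cb_barrier() before reusing the array. The sketch below reproduces only that pacing structure in userspace; post_cb() and the millisecond holdoff are stand-ins for cur_ops->call() and the jiffies-based holdoff parameters.

#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N_BURST		3
#define N_PER_BURST	20000

struct cb { int posted; };

static void post_cb(struct cb *cbp)	/* stands in for cur_ops->call() */
{
	cbp->posted = 1;
}

static void holdoff_ms(long ms)
{
	struct timespec ts = { .tv_sec = ms / 1000,
			       .tv_nsec = (ms % 1000) * 1000000L };
	nanosleep(&ts, NULL);
}

int main(void)
{
	struct cb *cbs = calloc((size_t)N_BURST * N_PER_BURST, sizeof(*cbs));
	long total = 0;
	int i, j;

	if (!cbs)
		return 1;

	for (i = 0; i < N_BURST; i++) {
		for (j = 0; j < N_PER_BURST; j++)
			post_cb(&cbs[i * N_PER_BURST + j]);
		total += N_PER_BURST;
		holdoff_ms(1);		/* intra-burst holdoff analogue */
	}
	/* "barrier": nothing posted above is still outstanding at this point */
	printf("flooded %ld callbacks in %d bursts\n", total, N_BURST);
	free(cbs);
	return 0;
}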
@@ -1019,7 +1128,7 @@ rcu_torture_reader(void *arg)
1019 __this_cpu_inc(rcu_torture_batch[completed]); 1128 __this_cpu_inc(rcu_torture_batch[completed]);
1020 preempt_enable(); 1129 preempt_enable();
1021 cur_ops->readunlock(idx); 1130 cur_ops->readunlock(idx);
1022 cond_resched(); 1131 cond_resched_rcu_qs();
1023 stutter_wait("rcu_torture_reader"); 1132 stutter_wait("rcu_torture_reader");
1024 } while (!torture_must_stop()); 1133 } while (!torture_must_stop());
1025 if (irqreader && cur_ops->irq_capable) { 1134 if (irqreader && cur_ops->irq_capable) {
@@ -1031,10 +1140,15 @@ rcu_torture_reader(void *arg)
1031} 1140}
1032 1141
1033/* 1142/*
1034 * Create an RCU-torture statistics message in the specified buffer. 1143 * Print torture statistics. Caller must ensure that there is only
1144 * one call to this function at a given time!!! This is normally
1145 * accomplished by relying on the module system to only have one copy
1146 * of the module loaded, and then by giving the rcu_torture_stats
1147 * kthread full control (or the init/cleanup functions when rcu_torture_stats
1148 * thread is not running).
1035 */ 1149 */
1036static void 1150static void
1037rcu_torture_printk(char *page) 1151rcu_torture_stats_print(void)
1038{ 1152{
1039 int cpu; 1153 int cpu;
1040 int i; 1154 int i;
@@ -1052,55 +1166,61 @@ rcu_torture_printk(char *page)
1052 if (pipesummary[i] != 0) 1166 if (pipesummary[i] != 0)
1053 break; 1167 break;
1054 } 1168 }
1055 page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); 1169
1056 page += sprintf(page, 1170 pr_alert("%s%s ", torture_type, TORTURE_FLAG);
1057 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", 1171 pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
1058 rcu_torture_current, 1172 rcu_torture_current,
1059 rcu_torture_current_version, 1173 rcu_torture_current_version,
1060 list_empty(&rcu_torture_freelist), 1174 list_empty(&rcu_torture_freelist),
1061 atomic_read(&n_rcu_torture_alloc), 1175 atomic_read(&n_rcu_torture_alloc),
1062 atomic_read(&n_rcu_torture_alloc_fail), 1176 atomic_read(&n_rcu_torture_alloc_fail),
1063 atomic_read(&n_rcu_torture_free)); 1177 atomic_read(&n_rcu_torture_free));
1064 page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ", 1178 pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ",
1065 atomic_read(&n_rcu_torture_mberror), 1179 atomic_read(&n_rcu_torture_mberror),
1066 n_rcu_torture_boost_ktrerror, 1180 n_rcu_torture_boost_ktrerror,
1067 n_rcu_torture_boost_rterror); 1181 n_rcu_torture_boost_rterror);
1068 page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ", 1182 pr_cont("rtbf: %ld rtb: %ld nt: %ld ",
1069 n_rcu_torture_boost_failure, 1183 n_rcu_torture_boost_failure,
1070 n_rcu_torture_boosts, 1184 n_rcu_torture_boosts,
1071 n_rcu_torture_timers); 1185 n_rcu_torture_timers);
1072 page = torture_onoff_stats(page); 1186 torture_onoff_stats();
1073 page += sprintf(page, "barrier: %ld/%ld:%ld", 1187 pr_cont("barrier: %ld/%ld:%ld ",
1074 n_barrier_successes, 1188 n_barrier_successes,
1075 n_barrier_attempts, 1189 n_barrier_attempts,
1076 n_rcu_torture_barrier_error); 1190 n_rcu_torture_barrier_error);
1077 page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); 1191 pr_cont("cbflood: %ld\n", atomic_long_read(&n_cbfloods));
1192
1193 pr_alert("%s%s ", torture_type, TORTURE_FLAG);
1078 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1194 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1079 n_rcu_torture_barrier_error != 0 || 1195 n_rcu_torture_barrier_error != 0 ||
1080 n_rcu_torture_boost_ktrerror != 0 || 1196 n_rcu_torture_boost_ktrerror != 0 ||
1081 n_rcu_torture_boost_rterror != 0 || 1197 n_rcu_torture_boost_rterror != 0 ||
1082 n_rcu_torture_boost_failure != 0 || 1198 n_rcu_torture_boost_failure != 0 ||
1083 i > 1) { 1199 i > 1) {
1084 page += sprintf(page, "!!! "); 1200 pr_cont("%s", "!!! ");
1085 atomic_inc(&n_rcu_torture_error); 1201 atomic_inc(&n_rcu_torture_error);
1086 WARN_ON_ONCE(1); 1202 WARN_ON_ONCE(1);
1087 } 1203 }
1088 page += sprintf(page, "Reader Pipe: "); 1204 pr_cont("Reader Pipe: ");
1089 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1205 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1090 page += sprintf(page, " %ld", pipesummary[i]); 1206 pr_cont(" %ld", pipesummary[i]);
1091 page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); 1207 pr_cont("\n");
1092 page += sprintf(page, "Reader Batch: "); 1208
1209 pr_alert("%s%s ", torture_type, TORTURE_FLAG);
1210 pr_cont("Reader Batch: ");
1093 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1211 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1094 page += sprintf(page, " %ld", batchsummary[i]); 1212 pr_cont(" %ld", batchsummary[i]);
1095 page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); 1213 pr_cont("\n");
1096 page += sprintf(page, "Free-Block Circulation: "); 1214
1215 pr_alert("%s%s ", torture_type, TORTURE_FLAG);
1216 pr_cont("Free-Block Circulation: ");
1097 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 1217 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
1098 page += sprintf(page, " %d", 1218 pr_cont(" %d", atomic_read(&rcu_torture_wcount[i]));
1099 atomic_read(&rcu_torture_wcount[i]));
1100 } 1219 }
1101 page += sprintf(page, "\n"); 1220 pr_cont("\n");
1221
1102 if (cur_ops->stats) 1222 if (cur_ops->stats)
1103 cur_ops->stats(page); 1223 cur_ops->stats();
1104 if (rtcv_snap == rcu_torture_current_version && 1224 if (rtcv_snap == rcu_torture_current_version &&
1105 rcu_torture_current != NULL) { 1225 rcu_torture_current != NULL) {
1106 int __maybe_unused flags; 1226 int __maybe_unused flags;
@@ -1109,10 +1229,9 @@ rcu_torture_printk(char *page)
1109 1229
1110 rcutorture_get_gp_data(cur_ops->ttype, 1230 rcutorture_get_gp_data(cur_ops->ttype,
1111 &flags, &gpnum, &completed); 1231 &flags, &gpnum, &completed);
1112 page += sprintf(page, 1232 pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n",
1113 "??? Writer stall state %d g%lu c%lu f%#x\n", 1233 rcu_torture_writer_state,
1114 rcu_torture_writer_state, 1234 gpnum, completed, flags);
1115 gpnum, completed, flags);
1116 show_rcu_gp_kthreads(); 1235 show_rcu_gp_kthreads();
1117 rcutorture_trace_dump(); 1236 rcutorture_trace_dump();
1118 } 1237 }
@@ -1120,30 +1239,6 @@ rcu_torture_printk(char *page)
1120} 1239}
1121 1240
1122/* 1241/*
1123 * Print torture statistics. Caller must ensure that there is only
1124 * one call to this function at a given time!!! This is normally
1125 * accomplished by relying on the module system to only have one copy
1126 * of the module loaded, and then by giving the rcu_torture_stats
1127 * kthread full control (or the init/cleanup functions when rcu_torture_stats
1128 * thread is not running).
1129 */
1130static void
1131rcu_torture_stats_print(void)
1132{
1133 int size = nr_cpu_ids * 200 + 8192;
1134 char *buf;
1135
1136 buf = kmalloc(size, GFP_KERNEL);
1137 if (!buf) {
1138 pr_err("rcu-torture: Out of memory, need: %d", size);
1139 return;
1140 }
1141 rcu_torture_printk(buf);
1142 pr_alert("%s", buf);
1143 kfree(buf);
1144}
1145
1146/*
1147 * Periodically prints torture statistics, if periodic statistics printing 1242 * Periodically prints torture statistics, if periodic statistics printing
1148 * was specified via the stat_interval module parameter. 1243 * was specified via the stat_interval module parameter.
1149 */ 1244 */
@@ -1295,7 +1390,8 @@ static int rcu_torture_barrier_cbs(void *arg)
1295 if (atomic_dec_and_test(&barrier_cbs_count)) 1390 if (atomic_dec_and_test(&barrier_cbs_count))
1296 wake_up(&barrier_wq); 1391 wake_up(&barrier_wq);
1297 } while (!torture_must_stop()); 1392 } while (!torture_must_stop());
1298 cur_ops->cb_barrier(); 1393 if (cur_ops->cb_barrier != NULL)
1394 cur_ops->cb_barrier();
1299 destroy_rcu_head_on_stack(&rcu); 1395 destroy_rcu_head_on_stack(&rcu);
1300 torture_kthread_stopping("rcu_torture_barrier_cbs"); 1396 torture_kthread_stopping("rcu_torture_barrier_cbs");
1301 return 0; 1397 return 0;
@@ -1418,7 +1514,7 @@ rcu_torture_cleanup(void)
1418 int i; 1514 int i;
1419 1515
1420 rcutorture_record_test_transition(); 1516 rcutorture_record_test_transition();
1421 if (torture_cleanup()) { 1517 if (torture_cleanup_begin()) {
1422 if (cur_ops->cb_barrier != NULL) 1518 if (cur_ops->cb_barrier != NULL)
1423 cur_ops->cb_barrier(); 1519 cur_ops->cb_barrier();
1424 return; 1520 return;
@@ -1447,6 +1543,8 @@ rcu_torture_cleanup(void)
1447 1543
1448 torture_stop_kthread(rcu_torture_stats, stats_task); 1544 torture_stop_kthread(rcu_torture_stats, stats_task);
1449 torture_stop_kthread(rcu_torture_fqs, fqs_task); 1545 torture_stop_kthread(rcu_torture_fqs, fqs_task);
1546 for (i = 0; i < ncbflooders; i++)
1547 torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]);
1450 if ((test_boost == 1 && cur_ops->can_boost) || 1548 if ((test_boost == 1 && cur_ops->can_boost) ||
1451 test_boost == 2) { 1549 test_boost == 2) {
1452 unregister_cpu_notifier(&rcutorture_cpu_nb); 1550 unregister_cpu_notifier(&rcutorture_cpu_nb);
@@ -1468,6 +1566,7 @@ rcu_torture_cleanup(void)
1468 "End of test: RCU_HOTPLUG"); 1566 "End of test: RCU_HOTPLUG");
1469 else 1567 else
1470 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); 1568 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1569 torture_cleanup_end();
1471} 1570}
1472 1571
1473#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD 1572#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
@@ -1534,9 +1633,10 @@ rcu_torture_init(void)
1534 int firsterr = 0; 1633 int firsterr = 0;
1535 static struct rcu_torture_ops *torture_ops[] = { 1634 static struct rcu_torture_ops *torture_ops[] = {
1536 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, 1635 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
1636 RCUTORTURE_TASKS_OPS
1537 }; 1637 };
1538 1638
1539 if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable)) 1639 if (!torture_init_begin(torture_type, verbose, &torture_runnable))
1540 return -EBUSY; 1640 return -EBUSY;
1541 1641
1542 /* Process args and tell the world that the torturer is on the job. */ 1642 /* Process args and tell the world that the torturer is on the job. */
@@ -1693,6 +1793,24 @@ rcu_torture_init(void)
1693 goto unwind; 1793 goto unwind;
1694 if (object_debug) 1794 if (object_debug)
1695 rcu_test_debug_objects(); 1795 rcu_test_debug_objects();
1796 if (cbflood_n_burst > 0) {
1797 /* Create the cbflood threads */
1798 ncbflooders = (num_online_cpus() + 3) / 4;
1799 cbflood_task = kcalloc(ncbflooders, sizeof(*cbflood_task),
1800 GFP_KERNEL);
1801 if (!cbflood_task) {
1802 VERBOSE_TOROUT_ERRSTRING("out of memory");
1803 firsterr = -ENOMEM;
1804 goto unwind;
1805 }
1806 for (i = 0; i < ncbflooders; i++) {
1807 firsterr = torture_create_kthread(rcu_torture_cbflood,
1808 NULL,
1809 cbflood_task[i]);
1810 if (firsterr)
1811 goto unwind;
1812 }
1813 }
1696 rcutorture_record_test_transition(); 1814 rcutorture_record_test_transition();
1697 torture_init_end(); 1815 torture_init_end();
1698 return 0; 1816 return 0;
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index d9efcc13008c..c0623fc47125 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -51,7 +51,7 @@ static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
51 51
52#include "tiny_plugin.h" 52#include "tiny_plugin.h"
53 53
54/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ 54/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */
55static void rcu_idle_enter_common(long long newval) 55static void rcu_idle_enter_common(long long newval)
56{ 56{
57 if (newval) { 57 if (newval) {
@@ -62,7 +62,7 @@ static void rcu_idle_enter_common(long long newval)
62 } 62 }
63 RCU_TRACE(trace_rcu_dyntick(TPS("Start"), 63 RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
64 rcu_dynticks_nesting, newval)); 64 rcu_dynticks_nesting, newval));
65 if (!is_idle_task(current)) { 65 if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
66 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); 66 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
67 67
68 RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), 68 RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
@@ -72,7 +72,7 @@ static void rcu_idle_enter_common(long long newval)
72 current->pid, current->comm, 72 current->pid, current->comm,
73 idle->pid, idle->comm); /* must be idle task! */ 73 idle->pid, idle->comm); /* must be idle task! */
74 } 74 }
75 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ 75 rcu_sched_qs(); /* implies rcu_bh_inc() */
76 barrier(); 76 barrier();
77 rcu_dynticks_nesting = newval; 77 rcu_dynticks_nesting = newval;
78} 78}
@@ -114,7 +114,7 @@ void rcu_irq_exit(void)
114} 114}
115EXPORT_SYMBOL_GPL(rcu_irq_exit); 115EXPORT_SYMBOL_GPL(rcu_irq_exit);
116 116
117/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ 117/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */
118static void rcu_idle_exit_common(long long oldval) 118static void rcu_idle_exit_common(long long oldval)
119{ 119{
120 if (oldval) { 120 if (oldval) {
@@ -123,7 +123,7 @@ static void rcu_idle_exit_common(long long oldval)
123 return; 123 return;
124 } 124 }
125 RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); 125 RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
126 if (!is_idle_task(current)) { 126 if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
127 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); 127 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
128 128
129 RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), 129 RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
@@ -217,7 +217,7 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
217 * are at it, given that any rcu quiescent state is also an rcu_bh 217 * are at it, given that any rcu quiescent state is also an rcu_bh
218 * quiescent state. Use "+" instead of "||" to defeat short circuiting. 218 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
219 */ 219 */
220void rcu_sched_qs(int cpu) 220void rcu_sched_qs(void)
221{ 221{
222 unsigned long flags; 222 unsigned long flags;
223 223
@@ -231,7 +231,7 @@ void rcu_sched_qs(int cpu)
231/* 231/*
232 * Record an rcu_bh quiescent state. 232 * Record an rcu_bh quiescent state.
233 */ 233 */
234void rcu_bh_qs(int cpu) 234void rcu_bh_qs(void)
235{ 235{
236 unsigned long flags; 236 unsigned long flags;
237 237
@@ -251,9 +251,11 @@ void rcu_check_callbacks(int cpu, int user)
251{ 251{
252 RCU_TRACE(check_cpu_stalls()); 252 RCU_TRACE(check_cpu_stalls());
253 if (user || rcu_is_cpu_rrupt_from_idle()) 253 if (user || rcu_is_cpu_rrupt_from_idle())
254 rcu_sched_qs(cpu); 254 rcu_sched_qs();
255 else if (!in_softirq()) 255 else if (!in_softirq())
256 rcu_bh_qs(cpu); 256 rcu_bh_qs();
257 if (user)
258 rcu_note_voluntary_context_switch(current);
257} 259}
258 260
259/* 261/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1b70cb6fbe3c..9815447d22e0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -79,9 +79,18 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
79 * the tracing userspace tools to be able to decipher the string 79 * the tracing userspace tools to be able to decipher the string
80 * address to the matching string. 80 * address to the matching string.
81 */ 81 */
82#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ 82#ifdef CONFIG_TRACING
83# define DEFINE_RCU_TPS(sname) \
83static char sname##_varname[] = #sname; \ 84static char sname##_varname[] = #sname; \
84static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \ 85static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname;
86# define RCU_STATE_NAME(sname) sname##_varname
87#else
88# define DEFINE_RCU_TPS(sname)
89# define RCU_STATE_NAME(sname) __stringify(sname)
90#endif
91
92#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
93DEFINE_RCU_TPS(sname) \
85struct rcu_state sname##_state = { \ 94struct rcu_state sname##_state = { \
86 .level = { &sname##_state.node[0] }, \ 95 .level = { &sname##_state.node[0] }, \
87 .call = cr, \ 96 .call = cr, \
@@ -93,7 +102,7 @@ struct rcu_state sname##_state = { \
93 .orphan_donetail = &sname##_state.orphan_donelist, \ 102 .orphan_donetail = &sname##_state.orphan_donelist, \
94 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 103 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
95 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ 104 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
96 .name = sname##_varname, \ 105 .name = RCU_STATE_NAME(sname), \
97 .abbr = sabbr, \ 106 .abbr = sabbr, \
98}; \ 107}; \
99DEFINE_PER_CPU(struct rcu_data, sname##_data) 108DEFINE_PER_CPU(struct rcu_data, sname##_data)
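
The RCU_STATE_INITIALIZER() rework emits the sname##_varname string variable only when tracing is built in (so the __tracepoint_string machinery has a real address to resolve, per the comment in the surrounding context) and otherwise falls back to a plain stringified name, dropping the variable entirely. A self-contained sketch of that conditional-macro technique; TRACING_ENABLED, DEFINE_STATE_TPS() and STATE_NAME() are invented names mirroring DEFINE_RCU_TPS() and RCU_STATE_NAME().

#include <stdio.h>

#define TRACING_ENABLED 1	/* flip to 0 to drop the name variable */

#define __stringify_1(x)	#x
#define __stringify(x)		__stringify_1(x)

#if TRACING_ENABLED
# define DEFINE_STATE_TPS(sname) \
	static char sname##_varname[] = #sname;
# define STATE_NAME(sname)	sname##_varname
#else
# define DEFINE_STATE_TPS(sname)
# define STATE_NAME(sname)	__stringify(sname)
#endif

struct state {
	const char *name;
};

/* One macro both defines the optional name variable and wires it in. */
#define STATE_INITIALIZER(sname)		\
	DEFINE_STATE_TPS(sname)			\
	static struct state sname##_state = {	\
		.name = STATE_NAME(sname),	\
	}

STATE_INITIALIZER(rcu_sched);

int main(void)
{
	printf("state name: %s (at %p)\n", rcu_sched_state.name,
	       (void *)rcu_sched_state.name);
	return 0;
}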
@@ -188,22 +197,24 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
188 * one since the start of the grace period, this just sets a flag. 197 * one since the start of the grace period, this just sets a flag.
189 * The caller must have disabled preemption. 198 * The caller must have disabled preemption.
190 */ 199 */
191void rcu_sched_qs(int cpu) 200void rcu_sched_qs(void)
192{ 201{
193 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); 202 if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) {
194 203 trace_rcu_grace_period(TPS("rcu_sched"),
195 if (rdp->passed_quiesce == 0) 204 __this_cpu_read(rcu_sched_data.gpnum),
196 trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs")); 205 TPS("cpuqs"));
197 rdp->passed_quiesce = 1; 206 __this_cpu_write(rcu_sched_data.passed_quiesce, 1);
207 }
198} 208}
199 209
200void rcu_bh_qs(int cpu) 210void rcu_bh_qs(void)
201{ 211{
202 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 212 if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) {
203 213 trace_rcu_grace_period(TPS("rcu_bh"),
204 if (rdp->passed_quiesce == 0) 214 __this_cpu_read(rcu_bh_data.gpnum),
205 trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs")); 215 TPS("cpuqs"));
206 rdp->passed_quiesce = 1; 216 __this_cpu_write(rcu_bh_data.passed_quiesce, 1);
217 }
207} 218}
208 219
209static DEFINE_PER_CPU(int, rcu_sched_qs_mask); 220static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
@@ -278,7 +289,7 @@ static void rcu_momentary_dyntick_idle(void)
278void rcu_note_context_switch(int cpu) 289void rcu_note_context_switch(int cpu)
279{ 290{
280 trace_rcu_utilization(TPS("Start context switch")); 291 trace_rcu_utilization(TPS("Start context switch"));
281 rcu_sched_qs(cpu); 292 rcu_sched_qs();
282 rcu_preempt_note_context_switch(cpu); 293 rcu_preempt_note_context_switch(cpu);
283 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) 294 if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
284 rcu_momentary_dyntick_idle(); 295 rcu_momentary_dyntick_idle();
@@ -526,6 +537,7 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
526 atomic_inc(&rdtp->dynticks); 537 atomic_inc(&rdtp->dynticks);
527 smp_mb__after_atomic(); /* Force ordering with next sojourn. */ 538 smp_mb__after_atomic(); /* Force ordering with next sojourn. */
528 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 539 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
540 rcu_dynticks_task_enter();
529 541
530 /* 542 /*
531 * It is illegal to enter an extended quiescent state while 543 * It is illegal to enter an extended quiescent state while
@@ -642,6 +654,7 @@ void rcu_irq_exit(void)
642static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, 654static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
643 int user) 655 int user)
644{ 656{
657 rcu_dynticks_task_exit();
645 smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ 658 smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */
646 atomic_inc(&rdtp->dynticks); 659 atomic_inc(&rdtp->dynticks);
647 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 660 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
@@ -819,7 +832,7 @@ bool notrace __rcu_is_watching(void)
819 */ 832 */
820bool notrace rcu_is_watching(void) 833bool notrace rcu_is_watching(void)
821{ 834{
822 int ret; 835 bool ret;
823 836
824 preempt_disable(); 837 preempt_disable();
825 ret = __rcu_is_watching(); 838 ret = __rcu_is_watching();
@@ -1647,7 +1660,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1647 rnp->level, rnp->grplo, 1660 rnp->level, rnp->grplo,
1648 rnp->grphi, rnp->qsmask); 1661 rnp->grphi, rnp->qsmask);
1649 raw_spin_unlock_irq(&rnp->lock); 1662 raw_spin_unlock_irq(&rnp->lock);
1650 cond_resched(); 1663 cond_resched_rcu_qs();
1651 } 1664 }
1652 1665
1653 mutex_unlock(&rsp->onoff_mutex); 1666 mutex_unlock(&rsp->onoff_mutex);
@@ -1668,7 +1681,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1668 if (fqs_state == RCU_SAVE_DYNTICK) { 1681 if (fqs_state == RCU_SAVE_DYNTICK) {
1669 /* Collect dyntick-idle snapshots. */ 1682 /* Collect dyntick-idle snapshots. */
1670 if (is_sysidle_rcu_state(rsp)) { 1683 if (is_sysidle_rcu_state(rsp)) {
1671 isidle = 1; 1684 isidle = true;
1672 maxj = jiffies - ULONG_MAX / 4; 1685 maxj = jiffies - ULONG_MAX / 4;
1673 } 1686 }
1674 force_qs_rnp(rsp, dyntick_save_progress_counter, 1687 force_qs_rnp(rsp, dyntick_save_progress_counter,
@@ -1677,14 +1690,15 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1677 fqs_state = RCU_FORCE_QS; 1690 fqs_state = RCU_FORCE_QS;
1678 } else { 1691 } else {
1679 /* Handle dyntick-idle and offline CPUs. */ 1692 /* Handle dyntick-idle and offline CPUs. */
1680 isidle = 0; 1693 isidle = false;
1681 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); 1694 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
1682 } 1695 }
1683 /* Clear flag to prevent immediate re-entry. */ 1696 /* Clear flag to prevent immediate re-entry. */
1684 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 1697 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
1685 raw_spin_lock_irq(&rnp->lock); 1698 raw_spin_lock_irq(&rnp->lock);
1686 smp_mb__after_unlock_lock(); 1699 smp_mb__after_unlock_lock();
1687 ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS; 1700 ACCESS_ONCE(rsp->gp_flags) =
1701 ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS;
1688 raw_spin_unlock_irq(&rnp->lock); 1702 raw_spin_unlock_irq(&rnp->lock);
1689 } 1703 }
1690 return fqs_state; 1704 return fqs_state;
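
The gp_flags update is rewritten from a compound ACCESS_ONCE(x) &= ~y into an explicit marked load followed by a marked store, avoiding a read-modify-write through a volatile-qualified lvalue. A small userspace illustration of the two forms; ACCESS_ONCE is re-defined locally here (using the GCC typeof extension) purely for demonstration.

/* Why the compound volatile RMW is split into a load and a store. */
#include <stdio.h>

#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

#define GP_FLAG_FQS 0x2

static unsigned long gp_flags = GP_FLAG_FQS | 0x1;

int main(void)
{
        /* Discouraged: a compound RMW on a volatile lvalue. */
        /* ACCESS_ONCE(gp_flags) &= ~GP_FLAG_FQS; */

        /* Preferred: one marked load, one marked store. */
        ACCESS_ONCE(gp_flags) = ACCESS_ONCE(gp_flags) & ~GP_FLAG_FQS;

        printf("gp_flags = %#lx\n", gp_flags);
        return 0;
}

Only the second form presents the read and the write to the compiler as two individually marked accesses, which is what the patch standardizes on.
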
@@ -1736,7 +1750,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1736 /* smp_mb() provided by prior unlock-lock pair. */ 1750 /* smp_mb() provided by prior unlock-lock pair. */
1737 nocb += rcu_future_gp_cleanup(rsp, rnp); 1751 nocb += rcu_future_gp_cleanup(rsp, rnp);
1738 raw_spin_unlock_irq(&rnp->lock); 1752 raw_spin_unlock_irq(&rnp->lock);
1739 cond_resched(); 1753 cond_resched_rcu_qs();
1740 } 1754 }
1741 rnp = rcu_get_root(rsp); 1755 rnp = rcu_get_root(rsp);
1742 raw_spin_lock_irq(&rnp->lock); 1756 raw_spin_lock_irq(&rnp->lock);
@@ -1785,8 +1799,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
1785 /* Locking provides needed memory barrier. */ 1799 /* Locking provides needed memory barrier. */
1786 if (rcu_gp_init(rsp)) 1800 if (rcu_gp_init(rsp))
1787 break; 1801 break;
1788 cond_resched(); 1802 cond_resched_rcu_qs();
1789 flush_signals(current); 1803 WARN_ON(signal_pending(current));
1790 trace_rcu_grace_period(rsp->name, 1804 trace_rcu_grace_period(rsp->name,
1791 ACCESS_ONCE(rsp->gpnum), 1805 ACCESS_ONCE(rsp->gpnum),
1792 TPS("reqwaitsig")); 1806 TPS("reqwaitsig"));
@@ -1828,11 +1842,11 @@ static int __noreturn rcu_gp_kthread(void *arg)
1828 trace_rcu_grace_period(rsp->name, 1842 trace_rcu_grace_period(rsp->name,
1829 ACCESS_ONCE(rsp->gpnum), 1843 ACCESS_ONCE(rsp->gpnum),
1830 TPS("fqsend")); 1844 TPS("fqsend"));
1831 cond_resched(); 1845 cond_resched_rcu_qs();
1832 } else { 1846 } else {
1833 /* Deal with stray signal. */ 1847 /* Deal with stray signal. */
1834 cond_resched(); 1848 cond_resched_rcu_qs();
1835 flush_signals(current); 1849 WARN_ON(signal_pending(current));
1836 trace_rcu_grace_period(rsp->name, 1850 trace_rcu_grace_period(rsp->name,
1837 ACCESS_ONCE(rsp->gpnum), 1851 ACCESS_ONCE(rsp->gpnum),
1838 TPS("fqswaitsig")); 1852 TPS("fqswaitsig"));
@@ -1928,7 +1942,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
1928{ 1942{
1929 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 1943 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
1930 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); 1944 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
1931 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ 1945 rcu_gp_kthread_wake(rsp);
1932} 1946}
1933 1947
1934/* 1948/*
@@ -2210,8 +2224,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2210 /* Adjust any no-longer-needed kthreads. */ 2224 /* Adjust any no-longer-needed kthreads. */
2211 rcu_boost_kthread_setaffinity(rnp, -1); 2225 rcu_boost_kthread_setaffinity(rnp, -1);
2212 2226
2213 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
2214
2215 /* Exclude any attempts to start a new grace period. */ 2227 /* Exclude any attempts to start a new grace period. */
2216 mutex_lock(&rsp->onoff_mutex); 2228 mutex_lock(&rsp->onoff_mutex);
2217 raw_spin_lock_irqsave(&rsp->orphan_lock, flags); 2229 raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
@@ -2393,8 +2405,8 @@ void rcu_check_callbacks(int cpu, int user)
2393 * at least not while the corresponding CPU is online. 2405 * at least not while the corresponding CPU is online.
2394 */ 2406 */
2395 2407
2396 rcu_sched_qs(cpu); 2408 rcu_sched_qs();
2397 rcu_bh_qs(cpu); 2409 rcu_bh_qs();
2398 2410
2399 } else if (!in_softirq()) { 2411 } else if (!in_softirq()) {
2400 2412
@@ -2405,11 +2417,13 @@ void rcu_check_callbacks(int cpu, int user)
2405 * critical section, so note it. 2417 * critical section, so note it.
2406 */ 2418 */
2407 2419
2408 rcu_bh_qs(cpu); 2420 rcu_bh_qs();
2409 } 2421 }
2410 rcu_preempt_check_callbacks(cpu); 2422 rcu_preempt_check_callbacks(cpu);
2411 if (rcu_pending(cpu)) 2423 if (rcu_pending(cpu))
2412 invoke_rcu_core(); 2424 invoke_rcu_core();
2425 if (user)
2426 rcu_note_voluntary_context_switch(current);
2413 trace_rcu_utilization(TPS("End scheduler-tick")); 2427 trace_rcu_utilization(TPS("End scheduler-tick"));
2414} 2428}
2415 2429
@@ -2432,7 +2446,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
2432 struct rcu_node *rnp; 2446 struct rcu_node *rnp;
2433 2447
2434 rcu_for_each_leaf_node(rsp, rnp) { 2448 rcu_for_each_leaf_node(rsp, rnp) {
2435 cond_resched(); 2449 cond_resched_rcu_qs();
2436 mask = 0; 2450 mask = 0;
2437 raw_spin_lock_irqsave(&rnp->lock, flags); 2451 raw_spin_lock_irqsave(&rnp->lock, flags);
2438 smp_mb__after_unlock_lock(); 2452 smp_mb__after_unlock_lock();
@@ -2449,7 +2463,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
2449 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 2463 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
2450 if ((rnp->qsmask & bit) != 0) { 2464 if ((rnp->qsmask & bit) != 0) {
2451 if ((rnp->qsmaskinit & bit) != 0) 2465 if ((rnp->qsmaskinit & bit) != 0)
2452 *isidle = 0; 2466 *isidle = false;
2453 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) 2467 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
2454 mask |= bit; 2468 mask |= bit;
2455 } 2469 }
@@ -2505,9 +2519,10 @@ static void force_quiescent_state(struct rcu_state *rsp)
2505 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2519 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2506 return; /* Someone beat us to it. */ 2520 return; /* Someone beat us to it. */
2507 } 2521 }
2508 ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS; 2522 ACCESS_ONCE(rsp->gp_flags) =
2523 ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS;
2509 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2524 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2510 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ 2525 rcu_gp_kthread_wake(rsp);
2511} 2526}
2512 2527
2513/* 2528/*
@@ -2925,11 +2940,6 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
2925 * restructure your code to batch your updates, and then use a single 2940 * restructure your code to batch your updates, and then use a single
2926 * synchronize_sched() instead. 2941 * synchronize_sched() instead.
2927 * 2942 *
2928 * Note that it is illegal to call this function while holding any lock
2929 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
2930 * to call this function from a CPU-hotplug notifier. Failing to observe
2931 * these restriction will result in deadlock.
2932 *
2933 * This implementation can be thought of as an application of ticket 2943 * This implementation can be thought of as an application of ticket
2934 * locking to RCU, with sync_sched_expedited_started and 2944 * locking to RCU, with sync_sched_expedited_started and
2935 * sync_sched_expedited_done taking on the roles of the halves 2945 * sync_sched_expedited_done taking on the roles of the halves
@@ -2979,7 +2989,12 @@ void synchronize_sched_expedited(void)
2979 */ 2989 */
2980 snap = atomic_long_inc_return(&rsp->expedited_start); 2990 snap = atomic_long_inc_return(&rsp->expedited_start);
2981 firstsnap = snap; 2991 firstsnap = snap;
2982 get_online_cpus(); 2992 if (!try_get_online_cpus()) {
2993 /* CPU hotplug operation in flight, fall back to normal GP. */
2994 wait_rcu_gp(call_rcu_sched);
2995 atomic_long_inc(&rsp->expedited_normal);
2996 return;
2997 }
2983 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); 2998 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
2984 2999
2985 /* 3000 /*
@@ -3026,7 +3041,12 @@ void synchronize_sched_expedited(void)
3026 * and they started after our first try, so their grace 3041 * and they started after our first try, so their grace
3027 * period works for us. 3042 * period works for us.
3028 */ 3043 */
3029 get_online_cpus(); 3044 if (!try_get_online_cpus()) {
3045 /* CPU hotplug operation in flight, use normal GP. */
3046 wait_rcu_gp(call_rcu_sched);
3047 atomic_long_inc(&rsp->expedited_normal);
3048 return;
3049 }
3030 snap = atomic_long_read(&rsp->expedited_start); 3050 snap = atomic_long_read(&rsp->expedited_start);
3031 smp_mb(); /* ensure read is before try_stop_cpus(). */ 3051 smp_mb(); /* ensure read is before try_stop_cpus(). */
3032 } 3052 }
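
Both expedited paths now use try_get_online_cpus() and fall back to an ordinary grace period when a CPU-hotplug operation holds the lock, which is why the deadlock warning removed from the header comment above is no longer needed. A userspace sketch of the trylock-with-fallback pattern, using a pthread mutex as a stand-in for the hotplug lock:

/* Trylock the "hotplug" lock; if it is busy, take the slow path. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;

static bool try_get_online_cpus_sketch(void)
{
        return pthread_mutex_trylock(&hotplug_lock) == 0;
}

static void put_online_cpus_sketch(void)
{
        pthread_mutex_unlock(&hotplug_lock);
}

static void wait_normal_gp_sketch(void)
{
        printf("fall back to a normal grace period\n");
}

static void synchronize_expedited_sketch(void)
{
        if (!try_get_online_cpus_sketch()) {
                /* Hotplug operation in flight: do not block on it. */
                wait_normal_gp_sketch();
                return;
        }
        printf("run the expedited machinery\n");
        put_online_cpus_sketch();
}

int main(void)
{
        synchronize_expedited_sketch();
        return 0;
}
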
@@ -3279,11 +3299,16 @@ static void _rcu_barrier(struct rcu_state *rsp)
3279 continue; 3299 continue;
3280 rdp = per_cpu_ptr(rsp->rda, cpu); 3300 rdp = per_cpu_ptr(rsp->rda, cpu);
3281 if (rcu_is_nocb_cpu(cpu)) { 3301 if (rcu_is_nocb_cpu(cpu)) {
3282 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, 3302 if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
3283 rsp->n_barrier_done); 3303 _rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
3284 atomic_inc(&rsp->barrier_cpu_count); 3304 rsp->n_barrier_done);
3285 __call_rcu(&rdp->barrier_head, rcu_barrier_callback, 3305 } else {
3286 rsp, cpu, 0); 3306 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
3307 rsp->n_barrier_done);
3308 atomic_inc(&rsp->barrier_cpu_count);
3309 __call_rcu(&rdp->barrier_head,
3310 rcu_barrier_callback, rsp, cpu, 0);
3311 }
3287 } else if (ACCESS_ONCE(rdp->qlen)) { 3312 } else if (ACCESS_ONCE(rdp->qlen)) {
3288 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 3313 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
3289 rsp->n_barrier_done); 3314 rsp->n_barrier_done);
@@ -3442,6 +3467,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
3442 case CPU_UP_PREPARE_FROZEN: 3467 case CPU_UP_PREPARE_FROZEN:
3443 rcu_prepare_cpu(cpu); 3468 rcu_prepare_cpu(cpu);
3444 rcu_prepare_kthreads(cpu); 3469 rcu_prepare_kthreads(cpu);
3470 rcu_spawn_all_nocb_kthreads(cpu);
3445 break; 3471 break;
3446 case CPU_ONLINE: 3472 case CPU_ONLINE:
3447 case CPU_DOWN_FAILED: 3473 case CPU_DOWN_FAILED:
@@ -3489,7 +3515,7 @@ static int rcu_pm_notify(struct notifier_block *self,
3489} 3515}
3490 3516
3491/* 3517/*
3492 * Spawn the kthread that handles this RCU flavor's grace periods. 3518 * Spawn the kthreads that handle each RCU flavor's grace periods.
3493 */ 3519 */
3494static int __init rcu_spawn_gp_kthread(void) 3520static int __init rcu_spawn_gp_kthread(void)
3495{ 3521{
@@ -3498,6 +3524,7 @@ static int __init rcu_spawn_gp_kthread(void)
3498 struct rcu_state *rsp; 3524 struct rcu_state *rsp;
3499 struct task_struct *t; 3525 struct task_struct *t;
3500 3526
3527 rcu_scheduler_fully_active = 1;
3501 for_each_rcu_flavor(rsp) { 3528 for_each_rcu_flavor(rsp) {
3502 t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); 3529 t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name);
3503 BUG_ON(IS_ERR(t)); 3530 BUG_ON(IS_ERR(t));
@@ -3505,8 +3532,9 @@ static int __init rcu_spawn_gp_kthread(void)
3505 raw_spin_lock_irqsave(&rnp->lock, flags); 3532 raw_spin_lock_irqsave(&rnp->lock, flags);
3506 rsp->gp_kthread = t; 3533 rsp->gp_kthread = t;
3507 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3534 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3508 rcu_spawn_nocb_kthreads(rsp);
3509 } 3535 }
3536 rcu_spawn_nocb_kthreads();
3537 rcu_spawn_boost_kthreads();
3510 return 0; 3538 return 0;
3511} 3539}
3512early_initcall(rcu_spawn_gp_kthread); 3540early_initcall(rcu_spawn_gp_kthread);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 71e64c718f75..bbdc45d8d74f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -350,7 +350,7 @@ struct rcu_data {
350 int nocb_p_count_lazy; /* (approximate). */ 350 int nocb_p_count_lazy; /* (approximate). */
351 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ 351 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
352 struct task_struct *nocb_kthread; 352 struct task_struct *nocb_kthread;
353 bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ 353 int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
354 354
355 /* The following fields are used by the leader, hence own cacheline. */ 355 /* The following fields are used by the leader, hence own cacheline. */
356 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; 356 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
@@ -358,7 +358,7 @@ struct rcu_data {
358 struct rcu_head **nocb_gp_tail; 358 struct rcu_head **nocb_gp_tail;
359 long nocb_gp_count; 359 long nocb_gp_count;
360 long nocb_gp_count_lazy; 360 long nocb_gp_count_lazy;
361 bool nocb_leader_wake; /* Is the nocb leader thread awake? */ 361 bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */
362 struct rcu_data *nocb_next_follower; 362 struct rcu_data *nocb_next_follower;
363 /* Next follower in wakeup chain. */ 363 /* Next follower in wakeup chain. */
364 364
@@ -383,6 +383,11 @@ struct rcu_data {
383#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 383#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
384#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 384#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
385 385
386/* Values for nocb_defer_wakeup field in struct rcu_data. */
387#define RCU_NOGP_WAKE_NOT 0
388#define RCU_NOGP_WAKE 1
389#define RCU_NOGP_WAKE_FORCE 2
390
386#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) 391#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
387 /* For jiffies_till_first_fqs and */ 392 /* For jiffies_till_first_fqs and */
388 /* and jiffies_till_next_fqs. */ 393 /* and jiffies_till_next_fqs. */
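
nocb_defer_wakeup grows from a bool into a three-level state so a deferred wakeup can record whether it should be forced. A sketch of how such a tri-state is consumed, mirroring the do_nocb_deferred_wakeup() change later in this diff; the *_sketch names and the plain static variable are illustrative stand-ins for the per-CPU rcu_data field.

/* Consume a three-level deferred-wakeup request. */
#include <stdio.h>
#include <stdbool.h>

#define RCU_NOGP_WAKE_NOT       0       /* No deferred wakeup pending. */
#define RCU_NOGP_WAKE           1       /* Ordinary deferred wakeup. */
#define RCU_NOGP_WAKE_FORCE     2       /* Deferred wakeup, forced. */

static int nocb_defer_wakeup = RCU_NOGP_WAKE_NOT;

static void wake_leader_sketch(bool force)
{
        printf("wake nocb leader%s\n", force ? " (forced)" : "");
}

static void do_deferred_wakeup_sketch(void)
{
        int ndw = nocb_defer_wakeup;

        if (ndw == RCU_NOGP_WAKE_NOT)
                return;
        nocb_defer_wakeup = RCU_NOGP_WAKE_NOT;
        wake_leader_sketch(ndw == RCU_NOGP_WAKE_FORCE);
}

int main(void)
{
        nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE; /* e.g. many callbacks queued */
        do_deferred_wakeup_sketch();
        return 0;
}
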
@@ -572,6 +577,7 @@ static void rcu_preempt_do_callbacks(void);
572static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 577static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
573 struct rcu_node *rnp); 578 struct rcu_node *rnp);
574#endif /* #ifdef CONFIG_RCU_BOOST */ 579#endif /* #ifdef CONFIG_RCU_BOOST */
580static void __init rcu_spawn_boost_kthreads(void);
575static void rcu_prepare_kthreads(int cpu); 581static void rcu_prepare_kthreads(int cpu);
576static void rcu_cleanup_after_idle(int cpu); 582static void rcu_cleanup_after_idle(int cpu);
577static void rcu_prepare_for_idle(int cpu); 583static void rcu_prepare_for_idle(int cpu);
@@ -581,6 +587,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
581static void print_cpu_stall_info_end(void); 587static void print_cpu_stall_info_end(void);
582static void zero_cpu_stall_ticks(struct rcu_data *rdp); 588static void zero_cpu_stall_ticks(struct rcu_data *rdp);
583static void increment_cpu_stall_ticks(void); 589static void increment_cpu_stall_ticks(void);
590static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
584static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); 591static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
585static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); 592static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
586static void rcu_init_one_nocb(struct rcu_node *rnp); 593static void rcu_init_one_nocb(struct rcu_node *rnp);
@@ -589,10 +596,14 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
589static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 596static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
590 struct rcu_data *rdp, 597 struct rcu_data *rdp,
591 unsigned long flags); 598 unsigned long flags);
592static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); 599static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
593static void do_nocb_deferred_wakeup(struct rcu_data *rdp); 600static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
594static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 601static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
595static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 602static void rcu_spawn_all_nocb_kthreads(int cpu);
603static void __init rcu_spawn_nocb_kthreads(void);
604#ifdef CONFIG_RCU_NOCB_CPU
605static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp);
606#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
596static void __maybe_unused rcu_kick_nohz_cpu(int cpu); 607static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
597static bool init_nocb_callback_list(struct rcu_data *rdp); 608static bool init_nocb_callback_list(struct rcu_data *rdp);
598static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); 609static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
@@ -605,6 +616,8 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
605static void rcu_bind_gp_kthread(void); 616static void rcu_bind_gp_kthread(void);
606static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); 617static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
607static bool rcu_nohz_full_cpu(struct rcu_state *rsp); 618static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
619static void rcu_dynticks_task_enter(void);
620static void rcu_dynticks_task_exit(void);
608 621
609#endif /* #ifndef RCU_TREE_NONCORE */ 622#endif /* #ifndef RCU_TREE_NONCORE */
610 623
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 00dc411e9676..c1d7f27bd38f 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -85,33 +85,6 @@ static void __init rcu_bootup_announce_oddness(void)
85 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); 85 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
86 if (nr_cpu_ids != NR_CPUS) 86 if (nr_cpu_ids != NR_CPUS)
87 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 87 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
88#ifdef CONFIG_RCU_NOCB_CPU
89#ifndef CONFIG_RCU_NOCB_CPU_NONE
90 if (!have_rcu_nocb_mask) {
91 zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL);
92 have_rcu_nocb_mask = true;
93 }
94#ifdef CONFIG_RCU_NOCB_CPU_ZERO
95 pr_info("\tOffload RCU callbacks from CPU 0\n");
96 cpumask_set_cpu(0, rcu_nocb_mask);
97#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
98#ifdef CONFIG_RCU_NOCB_CPU_ALL
99 pr_info("\tOffload RCU callbacks from all CPUs\n");
100 cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
101#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
102#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
103 if (have_rcu_nocb_mask) {
104 if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
105 pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
106 cpumask_and(rcu_nocb_mask, cpu_possible_mask,
107 rcu_nocb_mask);
108 }
109 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
110 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
111 if (rcu_nocb_poll)
112 pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
113 }
114#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
115} 88}
116 89
117#ifdef CONFIG_TREE_PREEMPT_RCU 90#ifdef CONFIG_TREE_PREEMPT_RCU
@@ -134,7 +107,7 @@ static void __init rcu_bootup_announce(void)
134 * Return the number of RCU-preempt batches processed thus far 107 * Return the number of RCU-preempt batches processed thus far
135 * for debug and statistics. 108 * for debug and statistics.
136 */ 109 */
137long rcu_batches_completed_preempt(void) 110static long rcu_batches_completed_preempt(void)
138{ 111{
139 return rcu_preempt_state.completed; 112 return rcu_preempt_state.completed;
140} 113}
@@ -155,18 +128,19 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
155 * not in a quiescent state. There might be any number of tasks blocked 128 * not in a quiescent state. There might be any number of tasks blocked
156 * while in an RCU read-side critical section. 129 * while in an RCU read-side critical section.
157 * 130 *
158 * Unlike the other rcu_*_qs() functions, callers to this function 131 * As with the other rcu_*_qs() functions, callers to this function
159 * must disable irqs in order to protect the assignment to 132 * must disable preemption.
160 * ->rcu_read_unlock_special. 133 */
161 */ 134static void rcu_preempt_qs(void)
162static void rcu_preempt_qs(int cpu) 135{
163{ 136 if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) {
164 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 137 trace_rcu_grace_period(TPS("rcu_preempt"),
165 138 __this_cpu_read(rcu_preempt_data.gpnum),
166 if (rdp->passed_quiesce == 0) 139 TPS("cpuqs"));
167 trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); 140 __this_cpu_write(rcu_preempt_data.passed_quiesce, 1);
168 rdp->passed_quiesce = 1; 141 barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
169 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 142 current->rcu_read_unlock_special.b.need_qs = false;
143 }
170} 144}
171 145
172/* 146/*
@@ -190,14 +164,14 @@ static void rcu_preempt_note_context_switch(int cpu)
190 struct rcu_node *rnp; 164 struct rcu_node *rnp;
191 165
192 if (t->rcu_read_lock_nesting > 0 && 166 if (t->rcu_read_lock_nesting > 0 &&
193 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 167 !t->rcu_read_unlock_special.b.blocked) {
194 168
195 /* Possibly blocking in an RCU read-side critical section. */ 169 /* Possibly blocking in an RCU read-side critical section. */
196 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); 170 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
197 rnp = rdp->mynode; 171 rnp = rdp->mynode;
198 raw_spin_lock_irqsave(&rnp->lock, flags); 172 raw_spin_lock_irqsave(&rnp->lock, flags);
199 smp_mb__after_unlock_lock(); 173 smp_mb__after_unlock_lock();
200 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 174 t->rcu_read_unlock_special.b.blocked = true;
201 t->rcu_blocked_node = rnp; 175 t->rcu_blocked_node = rnp;
202 176
203 /* 177 /*
@@ -239,7 +213,7 @@ static void rcu_preempt_note_context_switch(int cpu)
239 : rnp->gpnum + 1); 213 : rnp->gpnum + 1);
240 raw_spin_unlock_irqrestore(&rnp->lock, flags); 214 raw_spin_unlock_irqrestore(&rnp->lock, flags);
241 } else if (t->rcu_read_lock_nesting < 0 && 215 } else if (t->rcu_read_lock_nesting < 0 &&
242 t->rcu_read_unlock_special) { 216 t->rcu_read_unlock_special.s) {
243 217
244 /* 218 /*
245 * Complete exit from RCU read-side critical section on 219 * Complete exit from RCU read-side critical section on
@@ -257,9 +231,7 @@ static void rcu_preempt_note_context_switch(int cpu)
257 * grace period, then the fact that the task has been enqueued 231 * grace period, then the fact that the task has been enqueued
258 * means that we continue to block the current grace period. 232 * means that we continue to block the current grace period.
259 */ 233 */
260 local_irq_save(flags); 234 rcu_preempt_qs();
261 rcu_preempt_qs(cpu);
262 local_irq_restore(flags);
263} 235}
264 236
265/* 237/*
@@ -340,7 +312,7 @@ void rcu_read_unlock_special(struct task_struct *t)
340 bool drop_boost_mutex = false; 312 bool drop_boost_mutex = false;
341#endif /* #ifdef CONFIG_RCU_BOOST */ 313#endif /* #ifdef CONFIG_RCU_BOOST */
342 struct rcu_node *rnp; 314 struct rcu_node *rnp;
343 int special; 315 union rcu_special special;
344 316
345 /* NMI handlers cannot block and cannot safely manipulate state. */ 317 /* NMI handlers cannot block and cannot safely manipulate state. */
346 if (in_nmi()) 318 if (in_nmi())
@@ -350,12 +322,13 @@ void rcu_read_unlock_special(struct task_struct *t)
350 322
351 /* 323 /*
352 * If RCU core is waiting for this CPU to exit critical section, 324 * If RCU core is waiting for this CPU to exit critical section,
353 * let it know that we have done so. 325 * let it know that we have done so. Because irqs are disabled,
326 * t->rcu_read_unlock_special cannot change.
354 */ 327 */
355 special = t->rcu_read_unlock_special; 328 special = t->rcu_read_unlock_special;
356 if (special & RCU_READ_UNLOCK_NEED_QS) { 329 if (special.b.need_qs) {
357 rcu_preempt_qs(smp_processor_id()); 330 rcu_preempt_qs();
358 if (!t->rcu_read_unlock_special) { 331 if (!t->rcu_read_unlock_special.s) {
359 local_irq_restore(flags); 332 local_irq_restore(flags);
360 return; 333 return;
361 } 334 }
@@ -368,8 +341,8 @@ void rcu_read_unlock_special(struct task_struct *t)
368 } 341 }
369 342
370 /* Clean up if blocked during RCU read-side critical section. */ 343 /* Clean up if blocked during RCU read-side critical section. */
371 if (special & RCU_READ_UNLOCK_BLOCKED) { 344 if (special.b.blocked) {
372 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; 345 t->rcu_read_unlock_special.b.blocked = false;
373 346
374 /* 347 /*
375 * Remove this task from the list it blocked on. The 348 * Remove this task from the list it blocked on. The
@@ -653,12 +626,13 @@ static void rcu_preempt_check_callbacks(int cpu)
653 struct task_struct *t = current; 626 struct task_struct *t = current;
654 627
655 if (t->rcu_read_lock_nesting == 0) { 628 if (t->rcu_read_lock_nesting == 0) {
656 rcu_preempt_qs(cpu); 629 rcu_preempt_qs();
657 return; 630 return;
658 } 631 }
659 if (t->rcu_read_lock_nesting > 0 && 632 if (t->rcu_read_lock_nesting > 0 &&
660 per_cpu(rcu_preempt_data, cpu).qs_pending) 633 per_cpu(rcu_preempt_data, cpu).qs_pending &&
661 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 634 !per_cpu(rcu_preempt_data, cpu).passed_quiesce)
635 t->rcu_read_unlock_special.b.need_qs = true;
662} 636}
663 637
664#ifdef CONFIG_RCU_BOOST 638#ifdef CONFIG_RCU_BOOST
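
In these hunks the rcu_read_unlock_special flag word becomes a union: individual conditions are set and cleared through the .b bool fields while .s lets a caller test or clear everything at once. The sketch below shows that access pattern; the exact field layout is an assumption for illustration, not the kernel's definition.

/* Illustrative union: per-condition bools overlaid with one word. */
#include <stdbool.h>
#include <stdio.h>

union rcu_special_sketch {
        struct {
                bool blocked;
                bool need_qs;
        } b;                    /* Individual flags. */
        short s;                /* All flags at once. */
};

int main(void)
{
        union rcu_special_sketch special = { .s = 0 };

        special.b.need_qs = true;               /* set one condition */
        if (special.s)                          /* test "any condition set" */
                printf("special work pending\n");
        special.b.need_qs = false;
        printf("any left? %s\n", special.s ? "yes" : "no");
        return 0;
}
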
@@ -819,11 +793,6 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
819 * In fact, if you are using synchronize_rcu_expedited() in a loop, 793 * In fact, if you are using synchronize_rcu_expedited() in a loop,
820 * please restructure your code to batch your updates, and then Use a 794 * please restructure your code to batch your updates, and then Use a
821 * single synchronize_rcu() instead. 795 * single synchronize_rcu() instead.
822 *
823 * Note that it is illegal to call this function while holding any lock
824 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
825 * to call this function from a CPU-hotplug notifier. Failing to observe
826 * these restriction will result in deadlock.
827 */ 796 */
828void synchronize_rcu_expedited(void) 797void synchronize_rcu_expedited(void)
829{ 798{
@@ -845,7 +814,11 @@ void synchronize_rcu_expedited(void)
845 * being boosted. This simplifies the process of moving tasks 814 * being boosted. This simplifies the process of moving tasks
846 * from leaf to root rcu_node structures. 815 * from leaf to root rcu_node structures.
847 */ 816 */
848 get_online_cpus(); 817 if (!try_get_online_cpus()) {
818 /* CPU-hotplug operation in flight, fall back to normal GP. */
819 wait_rcu_gp(call_rcu);
820 return;
821 }
849 822
850 /* 823 /*
851 * Acquire lock, falling back to synchronize_rcu() if too many 824 * Acquire lock, falling back to synchronize_rcu() if too many
@@ -897,7 +870,8 @@ void synchronize_rcu_expedited(void)
897 870
898 /* Clean up and exit. */ 871 /* Clean up and exit. */
899 smp_mb(); /* ensure expedited GP seen before counter increment. */ 872 smp_mb(); /* ensure expedited GP seen before counter increment. */
900 ACCESS_ONCE(sync_rcu_preempt_exp_count)++; 873 ACCESS_ONCE(sync_rcu_preempt_exp_count) =
874 sync_rcu_preempt_exp_count + 1;
901unlock_mb_ret: 875unlock_mb_ret:
902 mutex_unlock(&sync_rcu_preempt_exp_mutex); 876 mutex_unlock(&sync_rcu_preempt_exp_mutex);
903mb_ret: 877mb_ret:
@@ -941,7 +915,7 @@ void exit_rcu(void)
941 return; 915 return;
942 t->rcu_read_lock_nesting = 1; 916 t->rcu_read_lock_nesting = 1;
943 barrier(); 917 barrier();
944 t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; 918 t->rcu_read_unlock_special.b.blocked = true;
945 __rcu_read_unlock(); 919 __rcu_read_unlock();
946} 920}
947 921
@@ -1462,14 +1436,13 @@ static struct smp_hotplug_thread rcu_cpu_thread_spec = {
1462}; 1436};
1463 1437
1464/* 1438/*
1465 * Spawn all kthreads -- called as soon as the scheduler is running. 1439 * Spawn boost kthreads -- called as soon as the scheduler is running.
1466 */ 1440 */
1467static int __init rcu_spawn_kthreads(void) 1441static void __init rcu_spawn_boost_kthreads(void)
1468{ 1442{
1469 struct rcu_node *rnp; 1443 struct rcu_node *rnp;
1470 int cpu; 1444 int cpu;
1471 1445
1472 rcu_scheduler_fully_active = 1;
1473 for_each_possible_cpu(cpu) 1446 for_each_possible_cpu(cpu)
1474 per_cpu(rcu_cpu_has_work, cpu) = 0; 1447 per_cpu(rcu_cpu_has_work, cpu) = 0;
1475 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); 1448 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
@@ -1479,9 +1452,7 @@ static int __init rcu_spawn_kthreads(void)
1479 rcu_for_each_leaf_node(rcu_state_p, rnp) 1452 rcu_for_each_leaf_node(rcu_state_p, rnp)
1480 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); 1453 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1481 } 1454 }
1482 return 0;
1483} 1455}
1484early_initcall(rcu_spawn_kthreads);
1485 1456
1486static void rcu_prepare_kthreads(int cpu) 1457static void rcu_prepare_kthreads(int cpu)
1487{ 1458{
@@ -1519,12 +1490,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1519{ 1490{
1520} 1491}
1521 1492
1522static int __init rcu_scheduler_really_started(void) 1493static void __init rcu_spawn_boost_kthreads(void)
1523{ 1494{
1524 rcu_scheduler_fully_active = 1;
1525 return 0;
1526} 1495}
1527early_initcall(rcu_scheduler_really_started);
1528 1496
1529static void rcu_prepare_kthreads(int cpu) 1497static void rcu_prepare_kthreads(int cpu)
1530{ 1498{
@@ -1625,7 +1593,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
1625 1593
1626 /* Exit early if we advanced recently. */ 1594 /* Exit early if we advanced recently. */
1627 if (jiffies == rdtp->last_advance_all) 1595 if (jiffies == rdtp->last_advance_all)
1628 return 0; 1596 return false;
1629 rdtp->last_advance_all = jiffies; 1597 rdtp->last_advance_all = jiffies;
1630 1598
1631 for_each_rcu_flavor(rsp) { 1599 for_each_rcu_flavor(rsp) {
@@ -1848,7 +1816,7 @@ static int rcu_oom_notify(struct notifier_block *self,
1848 get_online_cpus(); 1816 get_online_cpus();
1849 for_each_online_cpu(cpu) { 1817 for_each_online_cpu(cpu) {
1850 smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); 1818 smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
1851 cond_resched(); 1819 cond_resched_rcu_qs();
1852 } 1820 }
1853 put_online_cpus(); 1821 put_online_cpus();
1854 1822
@@ -2074,14 +2042,41 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
2074 2042
2075 if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) 2043 if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
2076 return; 2044 return;
2077 if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) { 2045 if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
2078 /* Prior xchg orders against prior callback enqueue. */ 2046 /* Prior smp_mb__after_atomic() orders against prior enqueue. */
2079 ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true; 2047 ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
2080 wake_up(&rdp_leader->nocb_wq); 2048 wake_up(&rdp_leader->nocb_wq);
2081 } 2049 }
2082} 2050}
2083 2051
2084/* 2052/*
2053 * Does the specified CPU need an RCU callback for the specified flavor
2054 * of rcu_barrier()?
2055 */
2056static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
2057{
2058 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2059 struct rcu_head *rhp;
2060
2061 /* No-CBs CPUs might have callbacks on any of three lists. */
2062 rhp = ACCESS_ONCE(rdp->nocb_head);
2063 if (!rhp)
2064 rhp = ACCESS_ONCE(rdp->nocb_gp_head);
2065 if (!rhp)
2066 rhp = ACCESS_ONCE(rdp->nocb_follower_head);
2067
2068 /* Having no rcuo kthread but CBs after scheduler starts is bad! */
2069 if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) {
2070 /* RCU callback enqueued before CPU first came online??? */
2071 pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
2072 cpu, rhp->func);
2073 WARN_ON_ONCE(1);
2074 }
2075
2076 return !!rhp;
2077}
2078
2079/*
2085 * Enqueue the specified string of rcu_head structures onto the specified 2080 * Enqueue the specified string of rcu_head structures onto the specified
2086 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the 2081 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
2087 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy 2082 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
@@ -2104,6 +2099,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2104 ACCESS_ONCE(*old_rhpp) = rhp; 2099 ACCESS_ONCE(*old_rhpp) = rhp;
2105 atomic_long_add(rhcount, &rdp->nocb_q_count); 2100 atomic_long_add(rhcount, &rdp->nocb_q_count);
2106 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); 2101 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
2102 smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
2107 2103
2108 /* If we are not being polled and there is a kthread, awaken it ... */ 2104 /* If we are not being polled and there is a kthread, awaken it ... */
2109 t = ACCESS_ONCE(rdp->nocb_kthread); 2105 t = ACCESS_ONCE(rdp->nocb_kthread);
@@ -2120,16 +2116,23 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2120 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2116 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2121 TPS("WakeEmpty")); 2117 TPS("WakeEmpty"));
2122 } else { 2118 } else {
2123 rdp->nocb_defer_wakeup = true; 2119 rdp->nocb_defer_wakeup = RCU_NOGP_WAKE;
2124 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2120 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2125 TPS("WakeEmptyIsDeferred")); 2121 TPS("WakeEmptyIsDeferred"));
2126 } 2122 }
2127 rdp->qlen_last_fqs_check = 0; 2123 rdp->qlen_last_fqs_check = 0;
2128 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 2124 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2129 /* ... or if many callbacks queued. */ 2125 /* ... or if many callbacks queued. */
2130 wake_nocb_leader(rdp, true); 2126 if (!irqs_disabled_flags(flags)) {
2127 wake_nocb_leader(rdp, true);
2128 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2129 TPS("WakeOvf"));
2130 } else {
2131 rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE;
2132 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2133 TPS("WakeOvfIsDeferred"));
2134 }
2131 rdp->qlen_last_fqs_check = LONG_MAX / 2; 2135 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2132 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
2133 } else { 2136 } else {
2134 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); 2137 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
2135 } 2138 }
@@ -2150,7 +2153,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2150{ 2153{
2151 2154
2152 if (!rcu_is_nocb_cpu(rdp->cpu)) 2155 if (!rcu_is_nocb_cpu(rdp->cpu))
2153 return 0; 2156 return false;
2154 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); 2157 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
2155 if (__is_kfree_rcu_offset((unsigned long)rhp->func)) 2158 if (__is_kfree_rcu_offset((unsigned long)rhp->func))
2156 trace_rcu_kfree_callback(rdp->rsp->name, rhp, 2159 trace_rcu_kfree_callback(rdp->rsp->name, rhp,
@@ -2161,7 +2164,18 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2161 trace_rcu_callback(rdp->rsp->name, rhp, 2164 trace_rcu_callback(rdp->rsp->name, rhp,
2162 -atomic_long_read(&rdp->nocb_q_count_lazy), 2165 -atomic_long_read(&rdp->nocb_q_count_lazy),
2163 -atomic_long_read(&rdp->nocb_q_count)); 2166 -atomic_long_read(&rdp->nocb_q_count));
2164 return 1; 2167
2168 /*
2169 * If called from an extended quiescent state with interrupts
2170 * disabled, invoke the RCU core in order to allow the idle-entry
2171 * deferred-wakeup check to function.
2172 */
2173 if (irqs_disabled_flags(flags) &&
2174 !rcu_is_watching() &&
2175 cpu_online(smp_processor_id()))
2176 invoke_rcu_core();
2177
2178 return true;
2165} 2179}
2166 2180
2167/* 2181/*
@@ -2177,7 +2191,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2177 2191
2178 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ 2192 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
2179 if (!rcu_is_nocb_cpu(smp_processor_id())) 2193 if (!rcu_is_nocb_cpu(smp_processor_id()))
2180 return 0; 2194 return false;
2181 rsp->qlen = 0; 2195 rsp->qlen = 0;
2182 rsp->qlen_lazy = 0; 2196 rsp->qlen_lazy = 0;
2183 2197
@@ -2196,7 +2210,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2196 rsp->orphan_nxtlist = NULL; 2210 rsp->orphan_nxtlist = NULL;
2197 rsp->orphan_nxttail = &rsp->orphan_nxtlist; 2211 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
2198 } 2212 }
2199 return 1; 2213 return true;
2200} 2214}
2201 2215
2202/* 2216/*
@@ -2229,7 +2243,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2229 (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); 2243 (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
2230 if (likely(d)) 2244 if (likely(d))
2231 break; 2245 break;
2232 flush_signals(current); 2246 WARN_ON(signal_pending(current));
2233 trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); 2247 trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
2234 } 2248 }
2235 trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); 2249 trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
@@ -2253,7 +2267,7 @@ wait_again:
2253 if (!rcu_nocb_poll) { 2267 if (!rcu_nocb_poll) {
2254 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); 2268 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
2255 wait_event_interruptible(my_rdp->nocb_wq, 2269 wait_event_interruptible(my_rdp->nocb_wq,
2256 ACCESS_ONCE(my_rdp->nocb_leader_wake)); 2270 !ACCESS_ONCE(my_rdp->nocb_leader_sleep));
2257 /* Memory barrier handled by smp_mb() calls below and repoll. */ 2271 /* Memory barrier handled by smp_mb() calls below and repoll. */
2258 } else if (firsttime) { 2272 } else if (firsttime) {
2259 firsttime = false; /* Don't drown trace log with "Poll"! */ 2273 firsttime = false; /* Don't drown trace log with "Poll"! */
@@ -2288,16 +2302,16 @@ wait_again:
2288 if (!rcu_nocb_poll) 2302 if (!rcu_nocb_poll)
2289 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, 2303 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
2290 "WokeEmpty"); 2304 "WokeEmpty");
2291 flush_signals(current); 2305 WARN_ON(signal_pending(current));
2292 schedule_timeout_interruptible(1); 2306 schedule_timeout_interruptible(1);
2293 2307
2294 /* Rescan in case we were a victim of memory ordering. */ 2308 /* Rescan in case we were a victim of memory ordering. */
2295 my_rdp->nocb_leader_wake = false; 2309 my_rdp->nocb_leader_sleep = true;
2296 smp_mb(); /* Ensure _wake false before scan. */ 2310 smp_mb(); /* Ensure _sleep true before scan. */
2297 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) 2311 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
2298 if (ACCESS_ONCE(rdp->nocb_head)) { 2312 if (ACCESS_ONCE(rdp->nocb_head)) {
2299 /* Found CB, so short-circuit next wait. */ 2313 /* Found CB, so short-circuit next wait. */
2300 my_rdp->nocb_leader_wake = true; 2314 my_rdp->nocb_leader_sleep = false;
2301 break; 2315 break;
2302 } 2316 }
2303 goto wait_again; 2317 goto wait_again;
@@ -2307,17 +2321,17 @@ wait_again:
2307 rcu_nocb_wait_gp(my_rdp); 2321 rcu_nocb_wait_gp(my_rdp);
2308 2322
2309 /* 2323 /*
2310 * We left ->nocb_leader_wake set to reduce cache thrashing. 2324 * We left ->nocb_leader_sleep unset to reduce cache thrashing.
2311 * We clear it now, but recheck for new callbacks while 2325 * We set it now, but recheck for new callbacks while
2312 * traversing our follower list. 2326 * traversing our follower list.
2313 */ 2327 */
2314 my_rdp->nocb_leader_wake = false; 2328 my_rdp->nocb_leader_sleep = true;
2315 smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */ 2329 smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */
2316 2330
2317 /* Each pass through the following loop wakes a follower, if needed. */ 2331 /* Each pass through the following loop wakes a follower, if needed. */
2318 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { 2332 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
2319 if (ACCESS_ONCE(rdp->nocb_head)) 2333 if (ACCESS_ONCE(rdp->nocb_head))
2320 my_rdp->nocb_leader_wake = true; /* No need to wait. */ 2334 my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
2321 if (!rdp->nocb_gp_head) 2335 if (!rdp->nocb_gp_head)
2322 continue; /* No CBs, so no need to wake follower. */ 2336 continue; /* No CBs, so no need to wake follower. */
2323 2337
@@ -2327,6 +2341,7 @@ wait_again:
2327 atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); 2341 atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count);
2328 atomic_long_add(rdp->nocb_gp_count_lazy, 2342 atomic_long_add(rdp->nocb_gp_count_lazy,
2329 &rdp->nocb_follower_count_lazy); 2343 &rdp->nocb_follower_count_lazy);
2344 smp_mb__after_atomic(); /* Store *tail before wakeup. */
2330 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { 2345 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
2331 /* 2346 /*
2332 * List was empty, wake up the follower. 2347 * List was empty, wake up the follower.
@@ -2367,7 +2382,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
2367 if (!rcu_nocb_poll) 2382 if (!rcu_nocb_poll)
2368 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2383 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2369 "WokeEmpty"); 2384 "WokeEmpty");
2370 flush_signals(current); 2385 WARN_ON(signal_pending(current));
2371 schedule_timeout_interruptible(1); 2386 schedule_timeout_interruptible(1);
2372 } 2387 }
2373} 2388}
@@ -2428,15 +2443,16 @@ static int rcu_nocb_kthread(void *arg)
2428 list = next; 2443 list = next;
2429 } 2444 }
2430 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); 2445 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
2431 ACCESS_ONCE(rdp->nocb_p_count) -= c; 2446 ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c;
2432 ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; 2447 ACCESS_ONCE(rdp->nocb_p_count_lazy) =
2448 rdp->nocb_p_count_lazy - cl;
2433 rdp->n_nocbs_invoked += c; 2449 rdp->n_nocbs_invoked += c;
2434 } 2450 }
2435 return 0; 2451 return 0;
2436} 2452}
2437 2453
2438/* Is a deferred wakeup of rcu_nocb_kthread() required? */ 2454/* Is a deferred wakeup of rcu_nocb_kthread() required? */
2439static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) 2455static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2440{ 2456{
2441 return ACCESS_ONCE(rdp->nocb_defer_wakeup); 2457 return ACCESS_ONCE(rdp->nocb_defer_wakeup);
2442} 2458}
@@ -2444,11 +2460,79 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2444/* Do a deferred wakeup of rcu_nocb_kthread(). */ 2460/* Do a deferred wakeup of rcu_nocb_kthread(). */
2445static void do_nocb_deferred_wakeup(struct rcu_data *rdp) 2461static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2446{ 2462{
2463 int ndw;
2464
2447 if (!rcu_nocb_need_deferred_wakeup(rdp)) 2465 if (!rcu_nocb_need_deferred_wakeup(rdp))
2448 return; 2466 return;
2449 ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; 2467 ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup);
2450 wake_nocb_leader(rdp, false); 2468 ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT;
2451 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); 2469 wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
2470 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
2471}
2472
2473void __init rcu_init_nohz(void)
2474{
2475 int cpu;
2476 bool need_rcu_nocb_mask = true;
2477 struct rcu_state *rsp;
2478
2479#ifdef CONFIG_RCU_NOCB_CPU_NONE
2480 need_rcu_nocb_mask = false;
2481#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
2482
2483#if defined(CONFIG_NO_HZ_FULL)
2484 if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
2485 need_rcu_nocb_mask = true;
2486#endif /* #if defined(CONFIG_NO_HZ_FULL) */
2487
2488 if (!have_rcu_nocb_mask && need_rcu_nocb_mask) {
2489 if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
2490 pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
2491 return;
2492 }
2493 have_rcu_nocb_mask = true;
2494 }
2495 if (!have_rcu_nocb_mask)
2496 return;
2497
2498#ifdef CONFIG_RCU_NOCB_CPU_ZERO
2499 pr_info("\tOffload RCU callbacks from CPU 0\n");
2500 cpumask_set_cpu(0, rcu_nocb_mask);
2501#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
2502#ifdef CONFIG_RCU_NOCB_CPU_ALL
2503 pr_info("\tOffload RCU callbacks from all CPUs\n");
2504 cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
2505#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
2506#if defined(CONFIG_NO_HZ_FULL)
2507 if (tick_nohz_full_running)
2508 cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
2509#endif /* #if defined(CONFIG_NO_HZ_FULL) */
2510
2511 if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
2512 pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
2513 cpumask_and(rcu_nocb_mask, cpu_possible_mask,
2514 rcu_nocb_mask);
2515 }
2516 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
2517 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
2518 if (rcu_nocb_poll)
2519 pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
2520
2521 for_each_rcu_flavor(rsp) {
2522 for_each_cpu(cpu, rcu_nocb_mask) {
2523 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2524
2525 /*
2526 * If there are early callbacks, they will need
2527 * to be moved to the nocb lists.
2528 */
2529 WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] !=
2530 &rdp->nxtlist &&
2531 rdp->nxttail[RCU_NEXT_TAIL] != NULL);
2532 init_nocb_callback_list(rdp);
2533 }
2534 rcu_organize_nocb_kthreads(rsp);
2535 }
2452} 2536}
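
rcu_init_nohz() above builds the no-CBs set once at boot: start from the Kconfig choice, fold in any nohz_full= CPUs, and trim CPUs that do not exist before printing the result. A userspace sketch of that composition using plain bitmasks in place of cpumasks; the example values are made up.

/* Compose a no-CBs CPU mask from config, nohz_full, and possible CPUs. */
#include <stdio.h>

int main(void)
{
        unsigned long cpu_possible = 0x0f;      /* CPUs 0-3 exist */
        unsigned long nocb_mask    = 0x01;      /* e.g. offload CPU 0 only */
        unsigned long nohz_full    = 0x0c;      /* CPUs 2-3 run tickless */

        nocb_mask |= nohz_full;                 /* nohz_full CPUs are offloaded too */
        if (nocb_mask & ~cpu_possible) {
                printf("note: mask contains nonexistent CPUs, trimming\n");
                nocb_mask &= cpu_possible;
        }
        printf("offload RCU callbacks from CPUs: %#lx\n", nocb_mask);
        return 0;
}
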
2453 2537
2454/* Initialize per-rcu_data variables for no-CBs CPUs. */ 2538/* Initialize per-rcu_data variables for no-CBs CPUs. */
@@ -2459,15 +2543,85 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2459 rdp->nocb_follower_tail = &rdp->nocb_follower_head; 2543 rdp->nocb_follower_tail = &rdp->nocb_follower_head;
2460} 2544}
2461 2545
2546/*
2547 * If the specified CPU is a no-CBs CPU that does not already have its
2548 * rcuo kthread for the specified RCU flavor, spawn it. If the CPUs are
2549 * brought online out of order, this can require re-organizing the
2550 * leader-follower relationships.
2551 */
2552static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
2553{
2554 struct rcu_data *rdp;
2555 struct rcu_data *rdp_last;
2556 struct rcu_data *rdp_old_leader;
2557 struct rcu_data *rdp_spawn = per_cpu_ptr(rsp->rda, cpu);
2558 struct task_struct *t;
2559
2560 /*
2561 * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
2562 * then nothing to do.
2563 */
2564 if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread)
2565 return;
2566
2567 /* If we didn't spawn the leader first, reorganize! */
2568 rdp_old_leader = rdp_spawn->nocb_leader;
2569 if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) {
2570 rdp_last = NULL;
2571 rdp = rdp_old_leader;
2572 do {
2573 rdp->nocb_leader = rdp_spawn;
2574 if (rdp_last && rdp != rdp_spawn)
2575 rdp_last->nocb_next_follower = rdp;
2576 rdp_last = rdp;
2577 rdp = rdp->nocb_next_follower;
2578 rdp_last->nocb_next_follower = NULL;
2579 } while (rdp);
2580 rdp_spawn->nocb_next_follower = rdp_old_leader;
2581 }
2582
2583 /* Spawn the kthread for this CPU and RCU flavor. */
2584 t = kthread_run(rcu_nocb_kthread, rdp_spawn,
2585 "rcuo%c/%d", rsp->abbr, cpu);
2586 BUG_ON(IS_ERR(t));
2587 ACCESS_ONCE(rdp_spawn->nocb_kthread) = t;
2588}
2589
2590/*
2591 * If the specified CPU is a no-CBs CPU that does not already have its
2592 * rcuo kthreads, spawn them.
2593 */
2594static void rcu_spawn_all_nocb_kthreads(int cpu)
2595{
2596 struct rcu_state *rsp;
2597
2598 if (rcu_scheduler_fully_active)
2599 for_each_rcu_flavor(rsp)
2600 rcu_spawn_one_nocb_kthread(rsp, cpu);
2601}
2602
2603/*
2604 * Once the scheduler is running, spawn rcuo kthreads for all online
2605 * no-CBs CPUs. This assumes that the early_initcall()s happen before
2606 * non-boot CPUs come online -- if this changes, we will need to add
2607 * some mutual exclusion.
2608 */
2609static void __init rcu_spawn_nocb_kthreads(void)
2610{
2611 int cpu;
2612
2613 for_each_online_cpu(cpu)
2614 rcu_spawn_all_nocb_kthreads(cpu);
2615}
2616
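
Because rcuo kthreads are now spawned per-CPU as CPUs come online, rcu_spawn_one_nocb_kthread() above must cope with a follower being brought up before its designated leader: the newly spawned CPU takes over leadership and inherits the old leader's chain. A self-contained sketch of that re-linking step, with the rcu_data structure reduced to the fields involved:

/* Re-link a leader-follower chain under a newly spawned leader. */
#include <stdio.h>
#include <stddef.h>

struct rdp_sketch {
        int cpu;
        struct rdp_sketch *nocb_leader;
        struct rdp_sketch *nocb_next_follower;
};

static void take_over_leadership(struct rdp_sketch *rdp_spawn,
                                 struct rdp_sketch *rdp_old_leader)
{
        struct rdp_sketch *rdp = rdp_old_leader;
        struct rdp_sketch *rdp_last = NULL;

        do {
                rdp->nocb_leader = rdp_spawn;   /* everyone follows the new leader */
                if (rdp_last && rdp != rdp_spawn)
                        rdp_last->nocb_next_follower = rdp;
                rdp_last = rdp;
                rdp = rdp->nocb_next_follower;
                rdp_last->nocb_next_follower = NULL;
        } while (rdp);
        rdp_spawn->nocb_next_follower = rdp_old_leader;
}

int main(void)
{
        struct rdp_sketch a = { .cpu = 0 }, b = { .cpu = 1 }, c = { .cpu = 2 };

        /* Old chain: a leads, b and c follow; c is being spawned first. */
        a.nocb_leader = &a; a.nocb_next_follower = &b;
        b.nocb_leader = &a; b.nocb_next_follower = &c;
        c.nocb_leader = &a;

        take_over_leadership(&c, &a);
        for (struct rdp_sketch *p = &c; p; p = p->nocb_next_follower)
                printf("cpu %d leader %d\n", p->cpu, p->nocb_leader->cpu);
        return 0;
}
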
2462/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ 2617/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
2463static int rcu_nocb_leader_stride = -1; 2618static int rcu_nocb_leader_stride = -1;
2464module_param(rcu_nocb_leader_stride, int, 0444); 2619module_param(rcu_nocb_leader_stride, int, 0444);
2465 2620
2466/* 2621/*
2467 * Create a kthread for each RCU flavor for each no-CBs CPU. 2622 * Initialize leader-follower relationships for all no-CBs CPU.
2468 * Also initialize leader-follower relationships.
2469 */ 2623 */
2470static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) 2624static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp)
2471{ 2625{
2472 int cpu; 2626 int cpu;
2473 int ls = rcu_nocb_leader_stride; 2627 int ls = rcu_nocb_leader_stride;
@@ -2475,14 +2629,9 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2475 struct rcu_data *rdp; 2629 struct rcu_data *rdp;
2476 struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ 2630 struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */
2477 struct rcu_data *rdp_prev = NULL; 2631 struct rcu_data *rdp_prev = NULL;
2478 struct task_struct *t;
2479 2632
2480 if (rcu_nocb_mask == NULL) 2633 if (!have_rcu_nocb_mask)
2481 return; 2634 return;
2482#if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL)
2483 if (tick_nohz_full_running)
2484 cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
2485#endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */
2486 if (ls == -1) { 2635 if (ls == -1) {
2487 ls = int_sqrt(nr_cpu_ids); 2636 ls = int_sqrt(nr_cpu_ids);
2488 rcu_nocb_leader_stride = ls; 2637 rcu_nocb_leader_stride = ls;
@@ -2505,27 +2654,27 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2505 rdp_prev->nocb_next_follower = rdp; 2654 rdp_prev->nocb_next_follower = rdp;
2506 } 2655 }
2507 rdp_prev = rdp; 2656 rdp_prev = rdp;
2508
2509 /* Spawn the kthread for this CPU. */
2510 t = kthread_run(rcu_nocb_kthread, rdp,
2511 "rcuo%c/%d", rsp->abbr, cpu);
2512 BUG_ON(IS_ERR(t));
2513 ACCESS_ONCE(rdp->nocb_kthread) = t;
2514 } 2657 }
2515} 2658}
2516 2659
2517/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ 2660/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2518static bool init_nocb_callback_list(struct rcu_data *rdp) 2661static bool init_nocb_callback_list(struct rcu_data *rdp)
2519{ 2662{
2520 if (rcu_nocb_mask == NULL || 2663 if (!rcu_is_nocb_cpu(rdp->cpu))
2521 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2522 return false; 2664 return false;
2665
2523 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 2666 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2524 return true; 2667 return true;
2525} 2668}
2526 2669
2527#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 2670#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2528 2671
2672static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
2673{
2674 WARN_ON_ONCE(1); /* Should be dead code. */
2675 return false;
2676}
2677
2529static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 2678static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2530{ 2679{
2531} 2680}
@@ -2541,21 +2690,21 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
2541static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 2690static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2542 bool lazy, unsigned long flags) 2691 bool lazy, unsigned long flags)
2543{ 2692{
2544 return 0; 2693 return false;
2545} 2694}
2546 2695
2547static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 2696static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2548 struct rcu_data *rdp, 2697 struct rcu_data *rdp,
2549 unsigned long flags) 2698 unsigned long flags)
2550{ 2699{
2551 return 0; 2700 return false;
2552} 2701}
2553 2702
2554static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2703static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2555{ 2704{
2556} 2705}
2557 2706
2558static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) 2707static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2559{ 2708{
2560 return false; 2709 return false;
2561} 2710}
@@ -2564,7 +2713,11 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2564{ 2713{
2565} 2714}
2566 2715
2567static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) 2716static void rcu_spawn_all_nocb_kthreads(int cpu)
2717{
2718}
2719
2720static void __init rcu_spawn_nocb_kthreads(void)
2568{ 2721{
2569} 2722}
2570 2723
@@ -2595,16 +2748,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
2595 2748
2596#ifdef CONFIG_NO_HZ_FULL_SYSIDLE 2749#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
2597 2750
2598/*
2599 * Define RCU flavor that holds sysidle state. This needs to be the
2600 * most active flavor of RCU.
2601 */
2602#ifdef CONFIG_PREEMPT_RCU
2603static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
2604#else /* #ifdef CONFIG_PREEMPT_RCU */
2605static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
2606#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
2607
2608static int full_sysidle_state; /* Current system-idle state. */ 2751static int full_sysidle_state; /* Current system-idle state. */
2609#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ 2752#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
2610#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ 2753#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
@@ -2622,6 +2765,10 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
2622{ 2765{
2623 unsigned long j; 2766 unsigned long j;
2624 2767
2768 /* If there are no nohz_full= CPUs, no need to track this. */
2769 if (!tick_nohz_full_enabled())
2770 return;
2771
2625 /* Adjust nesting, check for fully idle. */ 2772 /* Adjust nesting, check for fully idle. */
2626 if (irq) { 2773 if (irq) {
2627 rdtp->dynticks_idle_nesting--; 2774 rdtp->dynticks_idle_nesting--;
@@ -2687,6 +2834,10 @@ void rcu_sysidle_force_exit(void)
2687 */ 2834 */
2688static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) 2835static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
2689{ 2836{
2837 /* If there are no nohz_full= CPUs, no need to track this. */
2838 if (!tick_nohz_full_enabled())
2839 return;
2840
2690 /* Adjust nesting, check for already non-idle. */ 2841 /* Adjust nesting, check for already non-idle. */
2691 if (irq) { 2842 if (irq) {
2692 rdtp->dynticks_idle_nesting++; 2843 rdtp->dynticks_idle_nesting++;
@@ -2741,12 +2892,16 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2741 unsigned long j; 2892 unsigned long j;
2742 struct rcu_dynticks *rdtp = rdp->dynticks; 2893 struct rcu_dynticks *rdtp = rdp->dynticks;
2743 2894
2895 /* If there are no nohz_full= CPUs, don't check system-wide idleness. */
2896 if (!tick_nohz_full_enabled())
2897 return;
2898
2744 /* 2899 /*
2745 * If some other CPU has already reported non-idle, if this is 2900 * If some other CPU has already reported non-idle, if this is
2746 * not the flavor of RCU that tracks sysidle state, or if this 2901 * not the flavor of RCU that tracks sysidle state, or if this
2747 * is an offline or the timekeeping CPU, nothing to do. 2902 * is an offline or the timekeeping CPU, nothing to do.
2748 */ 2903 */
2749 if (!*isidle || rdp->rsp != rcu_sysidle_state || 2904 if (!*isidle || rdp->rsp != rcu_state_p ||
2750 cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) 2905 cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
2751 return; 2906 return;
2752 if (rcu_gp_in_progress(rdp->rsp)) 2907 if (rcu_gp_in_progress(rdp->rsp))
@@ -2772,7 +2927,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2772 */ 2927 */
2773static bool is_sysidle_rcu_state(struct rcu_state *rsp) 2928static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2774{ 2929{
2775 return rsp == rcu_sysidle_state; 2930 return rsp == rcu_state_p;
2776} 2931}
2777 2932
2778/* 2933/*
@@ -2850,7 +3005,7 @@ static void rcu_sysidle_cancel(void)
2850static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, 3005static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
2851 unsigned long maxj, bool gpkt) 3006 unsigned long maxj, bool gpkt)
2852{ 3007{
2853 if (rsp != rcu_sysidle_state) 3008 if (rsp != rcu_state_p)
2854 return; /* Wrong flavor, ignore. */ 3009 return; /* Wrong flavor, ignore. */
2855 if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) 3010 if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2856 return; /* Running state machine from timekeeping CPU. */ 3011 return; /* Running state machine from timekeeping CPU. */
@@ -2867,6 +3022,10 @@ static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
2867static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, 3022static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2868 unsigned long maxj) 3023 unsigned long maxj)
2869{ 3024{
3025 /* If there are no nohz_full= CPUs, no need to track this. */
3026 if (!tick_nohz_full_enabled())
3027 return;
3028
2870 rcu_sysidle_report(rsp, isidle, maxj, true); 3029 rcu_sysidle_report(rsp, isidle, maxj, true);
2871} 3030}
2872 3031
@@ -2893,7 +3052,8 @@ static void rcu_sysidle_cb(struct rcu_head *rhp)
2893 3052
2894/* 3053/*
2895 * Check to see if the system is fully idle, other than the timekeeping CPU. 3054 * Check to see if the system is fully idle, other than the timekeeping CPU.
2896 * The caller must have disabled interrupts. 3055 * The caller must have disabled interrupts. This is not intended to be
3056 * called unless tick_nohz_full_enabled().
2897 */ 3057 */
2898bool rcu_sys_is_idle(void) 3058bool rcu_sys_is_idle(void)
2899{ 3059{
@@ -2919,13 +3079,12 @@ bool rcu_sys_is_idle(void)
2919 3079
2920 /* Scan all the CPUs looking for nonidle CPUs. */ 3080 /* Scan all the CPUs looking for nonidle CPUs. */
2921 for_each_possible_cpu(cpu) { 3081 for_each_possible_cpu(cpu) {
2922 rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); 3082 rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
2923 rcu_sysidle_check_cpu(rdp, &isidle, &maxj); 3083 rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
2924 if (!isidle) 3084 if (!isidle)
2925 break; 3085 break;
2926 } 3086 }
2927 rcu_sysidle_report(rcu_sysidle_state, 3087 rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
2928 isidle, maxj, false);
2929 oldrss = rss; 3088 oldrss = rss;
2930 rss = ACCESS_ONCE(full_sysidle_state); 3089 rss = ACCESS_ONCE(full_sysidle_state);
2931 } 3090 }
@@ -2952,7 +3111,7 @@ bool rcu_sys_is_idle(void)
2952 * provided by the memory allocator. 3111 * provided by the memory allocator.
2953 */ 3112 */
2954 if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && 3113 if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
2955 !rcu_gp_in_progress(rcu_sysidle_state) && 3114 !rcu_gp_in_progress(rcu_state_p) &&
2956 !rsh.inuse && xchg(&rsh.inuse, 1) == 0) 3115 !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
2957 call_rcu(&rsh.rh, rcu_sysidle_cb); 3116 call_rcu(&rsh.rh, rcu_sysidle_cb);
2958 return false; 3117 return false;
@@ -3036,3 +3195,19 @@ static void rcu_bind_gp_kthread(void)
3036 housekeeping_affine(current); 3195 housekeeping_affine(current);
3037#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 3196#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
3038} 3197}
3198
3199/* Record the current task on dyntick-idle entry. */
3200static void rcu_dynticks_task_enter(void)
3201{
3202#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
3203 ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id();
3204#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
3205}
3206
3207/* Record no current task on dyntick-idle exit. */
3208static void rcu_dynticks_task_exit(void)
3209{
3210#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
3211 ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1;
3212#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
3213}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 4056d7992a6c..3ef8ba58694e 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -47,6 +47,8 @@
47#include <linux/hardirq.h> 47#include <linux/hardirq.h>
48#include <linux/delay.h> 48#include <linux/delay.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kthread.h>
51#include <linux/tick.h>
50 52
51#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
52 54
@@ -91,7 +93,7 @@ void __rcu_read_unlock(void)
91 barrier(); /* critical section before exit code. */ 93 barrier(); /* critical section before exit code. */
92 t->rcu_read_lock_nesting = INT_MIN; 94 t->rcu_read_lock_nesting = INT_MIN;
93 barrier(); /* assign before ->rcu_read_unlock_special load */ 95 barrier(); /* assign before ->rcu_read_unlock_special load */
94 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 96 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s)))
95 rcu_read_unlock_special(t); 97 rcu_read_unlock_special(t);
96 barrier(); /* ->rcu_read_unlock_special load before assign */ 98 barrier(); /* ->rcu_read_unlock_special load before assign */
97 t->rcu_read_lock_nesting = 0; 99 t->rcu_read_lock_nesting = 0;
@@ -137,6 +139,38 @@ int notrace debug_lockdep_rcu_enabled(void)
137EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); 139EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
138 140
139/** 141/**
142 * rcu_read_lock_held() - might we be in RCU read-side critical section?
143 *
144 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU
145 * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC,
146 * this assumes we are in an RCU read-side critical section unless it can
147 * prove otherwise. This is useful for debug checks in functions that
148 * require that they be called within an RCU read-side critical section.
149 *
150 * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
151 * and while lockdep is disabled.
152 *
153 * Note that rcu_read_lock() and the matching rcu_read_unlock() must
154 * occur in the same context, for example, it is illegal to invoke
155 * rcu_read_unlock() in process context if the matching rcu_read_lock()
156 * was invoked from within an irq handler.
157 *
158 * Note that rcu_read_lock() is disallowed if the CPU is either idle or
159 * offline from an RCU perspective, so check for those as well.
160 */
161int rcu_read_lock_held(void)
162{
163 if (!debug_lockdep_rcu_enabled())
164 return 1;
165 if (!rcu_is_watching())
166 return 0;
167 if (!rcu_lockdep_current_cpu_online())
168 return 0;
169 return lock_is_held(&rcu_lock_map);
170}
171EXPORT_SYMBOL_GPL(rcu_read_lock_held);
172
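
A minimal sketch of how the newly exported rcu_read_lock_held() is typically consumed in a debug check; my_data, my_global_ptr and my_read_val() are hypothetical names, not part of this patch.

#include <linux/bug.h>
#include <linux/rcupdate.h>

struct my_data {
        int val;
};

static struct my_data __rcu *my_global_ptr;     /* hypothetical shared pointer */

/* Caller is expected to hold rcu_read_lock(); warn under lockdep if it does not. */
static int my_read_val(void)
{
        struct my_data *p;

        WARN_ON_ONCE(!rcu_read_lock_held());
        p = rcu_dereference(my_global_ptr);
        return p ? p->val : -1;
}

Callers would wrap my_read_val() in rcu_read_lock()/rcu_read_unlock(); with CONFIG_PROVE_RCU the check fires when they forget, and without lockdep it conservatively reports being inside a read-side critical section.
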
173/**
140 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? 174 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
141 * 175 *
142 * Check for bottom half being disabled, which covers both the 176 * Check for bottom half being disabled, which covers both the
@@ -347,3 +381,312 @@ static int __init check_cpu_stall_init(void)
347early_initcall(check_cpu_stall_init); 381early_initcall(check_cpu_stall_init);
348 382
349#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ 383#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
384
385#ifdef CONFIG_TASKS_RCU
386
387/*
388 * Simple variant of RCU whose quiescent states are voluntary context switch,
389 * user-space execution, and idle. As such, grace periods can take one good
390 * long time. There are no read-side primitives similar to rcu_read_lock()
391 * and rcu_read_unlock() because this implementation is intended to get
392 * the system into a safe state for some of the manipulations involved in
393 * tracing and the like. Finally, this implementation does not support
394 * high call_rcu_tasks() rates from multiple CPUs. If this is required,
395 * per-CPU callback lists will be needed.
396 */
397
398/* Global list of callbacks and associated lock. */
399static struct rcu_head *rcu_tasks_cbs_head;
400static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
401static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);
402static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
403
404/* Track exiting tasks in order to allow them to be waited for. */
405DEFINE_SRCU(tasks_rcu_exit_srcu);
406
407/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
408static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10;
409module_param(rcu_task_stall_timeout, int, 0644);
410
411static void rcu_spawn_tasks_kthread(void);
412
413/*
414 * Post an RCU-tasks callback. First call must be from process context
415 * after the scheduler is fully operational.
416 */
417void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp))
418{
419 unsigned long flags;
420 bool needwake;
421
422 rhp->next = NULL;
423 rhp->func = func;
424 raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
425 needwake = !rcu_tasks_cbs_head;
426 *rcu_tasks_cbs_tail = rhp;
427 rcu_tasks_cbs_tail = &rhp->next;
428 raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
429 if (needwake) {
430 rcu_spawn_tasks_kthread();
431 wake_up(&rcu_tasks_cbs_wq);
432 }
433}
434EXPORT_SYMBOL_GPL(call_rcu_tasks);
435
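
As a usage sketch (not part of this patch), a tracing-style client might retire a code trampoline through call_rcu_tasks() like this; struct my_trampoline and the unhooking step are invented for illustration.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_trampoline {
        void *stub;                     /* hypothetical generated code */
        struct rcu_head rh;             /* embedded head for call_rcu_tasks() */
};

static void my_tramp_free_cb(struct rcu_head *rhp)
{
        struct my_trampoline *tr = container_of(rhp, struct my_trampoline, rh);

        kfree(tr);      /* no task can still be preempted inside tr->stub here */
}

static void my_tramp_retire(struct my_trampoline *tr)
{
        /* ...unhook tr from all call sites first (hypothetical step)... */
        call_rcu_tasks(&tr->rh, my_tramp_free_cb);
}
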
436/**
437 * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed.
438 *
439 * Control will return to the caller some time after a full rcu-tasks
440 * grace period has elapsed, in other words after all currently
441 * executing rcu-tasks read-side critical sections have elapsed. These
442 * read-side critical sections are delimited by calls to schedule(),
443 * cond_resched_rcu_qs(), idle execution, userspace execution, calls
444 * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched().
445 *
446 * This is a very specialized primitive, intended only for a few uses in
447 * tracing and other situations requiring manipulation of function
448 * preambles and profiling hooks. The synchronize_rcu_tasks() function
449 * is not (yet) intended for heavy use from multiple CPUs.
450 *
451 * Note that this guarantee implies further memory-ordering guarantees.
452 * On systems with more than one CPU, when synchronize_rcu_tasks() returns,
453 * each CPU is guaranteed to have executed a full memory barrier since the
454 * end of its last RCU-tasks read-side critical section whose beginning
455 * preceded the call to synchronize_rcu_tasks(). In addition, each CPU
456 * having an RCU-tasks read-side critical section that extends beyond
457 * the return from synchronize_rcu_tasks() is guaranteed to have executed
458 * a full memory barrier after the beginning of synchronize_rcu_tasks()
459 * and before the beginning of that RCU-tasks read-side critical section.
460 * Note that these guarantees include CPUs that are offline, idle, or
461 * executing in user mode, as well as CPUs that are executing in the kernel.
462 *
463 * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned
464 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
465 * to have executed a full memory barrier during the execution of
466 * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU
467 * (but again only if the system has more than one CPU).
468 */
469void synchronize_rcu_tasks(void)
470{
471 /* Complain if the scheduler has not started. */
472 rcu_lockdep_assert(!rcu_scheduler_active,
473 "synchronize_rcu_tasks called too soon");
474
475 /* Wait for the grace period. */
476 wait_rcu_gp(call_rcu_tasks);
477}
478EXPORT_SYMBOL_GPL(synchronize_rcu_tasks);
479
480/**
481 * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks.
482 *
483 * Although the current implementation is guaranteed to wait, it is not
484 * obligated to, for example, if there are no pending callbacks.
485 */
486void rcu_barrier_tasks(void)
487{
488 /* There is only one callback queue, so this is easy. ;-) */
489 synchronize_rcu_tasks();
490}
491EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
492
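
A hedged sketch of how a module teardown path might combine the two primitives above; my_tracer_shutdown() and my_unhook_all_trampolines() are invented names, only the RCU-tasks calls are from this patch.

static void my_tracer_shutdown(void)
{
        my_unhook_all_trampolines();    /* hypothetical: stop publishing stubs */

        /*
         * Wait for every task that might have been preempted inside an old
         * stub to pass through a voluntary context switch, userspace, or idle.
         */
        synchronize_rcu_tasks();

        /* Also wait for previously queued call_rcu_tasks() callbacks to run. */
        rcu_barrier_tasks();
}
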
493/* See if tasks are still holding out, complain if so. */
494static void check_holdout_task(struct task_struct *t,
495 bool needreport, bool *firstreport)
496{
497 int cpu;
498
499 if (!ACCESS_ONCE(t->rcu_tasks_holdout) ||
500 t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) ||
501 !ACCESS_ONCE(t->on_rq) ||
502 (IS_ENABLED(CONFIG_NO_HZ_FULL) &&
503 !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
504 ACCESS_ONCE(t->rcu_tasks_holdout) = false;
505 list_del_init(&t->rcu_tasks_holdout_list);
506 put_task_struct(t);
507 return;
508 }
509 if (!needreport)
510 return;
511 if (*firstreport) {
512 pr_err("INFO: rcu_tasks detected stalls on tasks:\n");
513 *firstreport = false;
514 }
515 cpu = task_cpu(t);
516 pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n",
517 t, ".I"[is_idle_task(t)],
518 "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)],
519 t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout,
520 t->rcu_tasks_idle_cpu, cpu);
521 sched_show_task(t);
522}
523
524/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
525static int __noreturn rcu_tasks_kthread(void *arg)
526{
527 unsigned long flags;
528 struct task_struct *g, *t;
529 unsigned long lastreport;
530 struct rcu_head *list;
531 struct rcu_head *next;
532 LIST_HEAD(rcu_tasks_holdouts);
533
534 /* FIXME: Add housekeeping affinity. */
535
536 /*
537 * Each pass through the following loop makes one check for
538 * newly arrived callbacks, and, if there are some, waits for
539 * one RCU-tasks grace period and then invokes the callbacks.
540 * This loop is terminated by the system going down. ;-)
541 */
542 for (;;) {
543
544 /* Pick up any new callbacks. */
545 raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
546 list = rcu_tasks_cbs_head;
547 rcu_tasks_cbs_head = NULL;
548 rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
549 raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
550
551 /* If there were none, wait a bit and start over. */
552 if (!list) {
553 wait_event_interruptible(rcu_tasks_cbs_wq,
554 rcu_tasks_cbs_head);
555 if (!rcu_tasks_cbs_head) {
556 WARN_ON(signal_pending(current));
557 schedule_timeout_interruptible(HZ/10);
558 }
559 continue;
560 }
561
562 /*
563 * Wait for all pre-existing t->on_rq and t->nvcsw
564 * transitions to complete. Invoking synchronize_sched()
565 * suffices because all these transitions occur with
566 * interrupts disabled. Without this synchronize_sched(),
567 * a read-side critical section that started before the
568 * grace period might be incorrectly seen as having started
569 * after the grace period.
570 *
571 * This synchronize_sched() also dispenses with the
572 * need for a memory barrier on the first store to
573 * ->rcu_tasks_holdout, as it forces the store to happen
574 * after the beginning of the grace period.
575 */
576 synchronize_sched();
577
578 /*
579 * There were callbacks, so we need to wait for an
580 * RCU-tasks grace period. Start off by scanning
581 * the task list for tasks that are not already
582 * voluntarily blocked. Mark these tasks and make
583 * a list of them in rcu_tasks_holdouts.
584 */
585 rcu_read_lock();
586 for_each_process_thread(g, t) {
587 if (t != current && ACCESS_ONCE(t->on_rq) &&
588 !is_idle_task(t)) {
589 get_task_struct(t);
590 t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw);
591 ACCESS_ONCE(t->rcu_tasks_holdout) = true;
592 list_add(&t->rcu_tasks_holdout_list,
593 &rcu_tasks_holdouts);
594 }
595 }
596 rcu_read_unlock();
597
598 /*
599 * Wait for tasks that are in the process of exiting.
600 * This does only part of the job, ensuring that all
601 * tasks that were previously exiting reach the point
602 * where they have disabled preemption, allowing the
603 * later synchronize_sched() to finish the job.
604 */
605 synchronize_srcu(&tasks_rcu_exit_srcu);
606
607 /*
608 * Each pass through the following loop scans the list
609 * of holdout tasks, removing any that are no longer
610 * holdouts. When the list is empty, we are done.
611 */
612 lastreport = jiffies;
613 while (!list_empty(&rcu_tasks_holdouts)) {
614 bool firstreport;
615 bool needreport;
616 int rtst;
617 struct task_struct *t1;
618
619 schedule_timeout_interruptible(HZ);
620 rtst = ACCESS_ONCE(rcu_task_stall_timeout);
621 needreport = rtst > 0 &&
622 time_after(jiffies, lastreport + rtst);
623 if (needreport)
624 lastreport = jiffies;
625 firstreport = true;
626 WARN_ON(signal_pending(current));
627 list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts,
628 rcu_tasks_holdout_list) {
629 check_holdout_task(t, needreport, &firstreport);
630 cond_resched();
631 }
632 }
633
634 /*
635 * Because ->on_rq and ->nvcsw are not guaranteed
 636 * to have full memory barriers prior to them in the
637 * schedule() path, memory reordering on other CPUs could
638 * cause their RCU-tasks read-side critical sections to
639 * extend past the end of the grace period. However,
640 * because these ->nvcsw updates are carried out with
641 * interrupts disabled, we can use synchronize_sched()
642 * to force the needed ordering on all such CPUs.
643 *
644 * This synchronize_sched() also confines all
645 * ->rcu_tasks_holdout accesses to be within the grace
646 * period, avoiding the need for memory barriers for
647 * ->rcu_tasks_holdout accesses.
648 *
649 * In addition, this synchronize_sched() waits for exiting
650 * tasks to complete their final preempt_disable() region
651 * of execution, cleaning up after the synchronize_srcu()
652 * above.
653 */
654 synchronize_sched();
655
656 /* Invoke the callbacks. */
657 while (list) {
658 next = list->next;
659 local_bh_disable();
660 list->func(list);
661 local_bh_enable();
662 list = next;
663 cond_resched();
664 }
665 schedule_timeout_uninterruptible(HZ/10);
666 }
667}
668
669/* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */
670static void rcu_spawn_tasks_kthread(void)
671{
672 static DEFINE_MUTEX(rcu_tasks_kthread_mutex);
673 static struct task_struct *rcu_tasks_kthread_ptr;
674 struct task_struct *t;
675
676 if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) {
677 smp_mb(); /* Ensure caller sees full kthread. */
678 return;
679 }
680 mutex_lock(&rcu_tasks_kthread_mutex);
681 if (rcu_tasks_kthread_ptr) {
682 mutex_unlock(&rcu_tasks_kthread_mutex);
683 return;
684 }
685 t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
686 BUG_ON(IS_ERR(t));
687 smp_mb(); /* Ensure others see full kthread. */
688 ACCESS_ONCE(rcu_tasks_kthread_ptr) = t;
689 mutex_unlock(&rcu_tasks_kthread_mutex);
690}
691
692#endif /* #ifdef CONFIG_TASKS_RCU */
diff --git a/kernel/reboot.c b/kernel/reboot.c
index a3a9e240fcdb..5925f5ae8dff 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,6 +104,87 @@ int unregister_reboot_notifier(struct notifier_block *nb)
104} 104}
105EXPORT_SYMBOL(unregister_reboot_notifier); 105EXPORT_SYMBOL(unregister_reboot_notifier);
106 106
107/*
108 * Notifier list for kernel code which wants to be called
109 * to restart the system.
110 */
111static ATOMIC_NOTIFIER_HEAD(restart_handler_list);
112
113/**
114 * register_restart_handler - Register function to be called to reset
115 * the system
116 * @nb: Info about handler function to be called
117 * @nb->priority: Handler priority. Handlers should follow the
118 * following guidelines for setting priorities.
119 * 0: Restart handler of last resort,
120 * with limited restart capabilities
121 * 128: Default restart handler; use if no other
122 * restart handler is expected to be available,
123 * and/or if restart functionality is
124 * sufficient to restart the entire system
125 * 255: Highest priority restart handler, will
126 * preempt all other restart handlers
127 *
128 * Registers a function with code to be called to restart the
129 * system.
130 *
131 * Registered functions will be called from machine_restart as last
132 * step of the restart sequence (if the architecture specific
133 * machine_restart function calls do_kernel_restart - see below
134 * for details).
135 * Registered functions are expected to restart the system immediately.
136 * If more than one function is registered, the restart handler priority
137 * selects which function will be called first.
138 *
139 * Restart handlers are expected to be registered from non-architecture
140 * code, typically from drivers. A typical use case would be a system
141 * where restart functionality is provided through a watchdog. Multiple
142 * restart handlers may exist; for example, one restart handler might
143 * restart the entire system, while another only restarts the CPU.
144 * In such cases, the restart handler which only restarts part of the
145 * hardware is expected to register with low priority to ensure that
146 * it only runs if no other means to restart the system is available.
147 *
148 * Currently always returns zero, as atomic_notifier_chain_register()
149 * always returns zero.
150 */
151int register_restart_handler(struct notifier_block *nb)
152{
153 return atomic_notifier_chain_register(&restart_handler_list, nb);
154}
155EXPORT_SYMBOL(register_restart_handler);
156
157/**
158 * unregister_restart_handler - Unregister previously registered
159 * restart handler
160 * @nb: Hook to be unregistered
161 *
162 * Unregisters a previously registered restart handler function.
163 *
164 * Returns zero on success, or %-ENOENT on failure.
165 */
166int unregister_restart_handler(struct notifier_block *nb)
167{
168 return atomic_notifier_chain_unregister(&restart_handler_list, nb);
169}
170EXPORT_SYMBOL(unregister_restart_handler);
171
172/**
173 * do_kernel_restart - Execute kernel restart handler call chain
174 *
175 * Calls functions registered with register_restart_handler.
176 *
177 * Expected to be called from machine_restart as last step of the restart
178 * sequence.
179 *
180 * Restarts the system immediately if a restart handler function has been
181 * registered. Otherwise does nothing.
182 */
183void do_kernel_restart(char *cmd)
184{
185 atomic_notifier_call_chain(&restart_handler_list, reboot_mode, cmd);
186}
187
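
To show how the new chain is meant to be consumed, here is a rough sketch of a watchdog driver registering itself at the default priority; my_wdt_force_reset() is a made-up hardware kick, while the notifier calls are the ones added above.

#include <linux/notifier.h>
#include <linux/reboot.h>

static int my_wdt_restart_handler(struct notifier_block *nb,
                                  unsigned long mode, void *cmd)
{
        my_wdt_force_reset();           /* hypothetical device register poke */
        return NOTIFY_DONE;
}

static struct notifier_block my_wdt_restart_nb = {
        .notifier_call = my_wdt_restart_handler,
        .priority = 128,                /* default: can restart the whole system */
};

/* probe():  register_restart_handler(&my_wdt_restart_nb);   */
/* remove(): unregister_restart_handler(&my_wdt_restart_nb); */
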
107void migrate_to_reboot_cpu(void) 188void migrate_to_reboot_cpu(void)
108{ 189{
109 /* The boot cpu is always logical cpu 0 */ 190 /* The boot cpu is always logical cpu 0 */
diff --git a/kernel/resource.c b/kernel/resource.c
index 3c2237ac32db..0bcebffc4e77 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -59,10 +59,12 @@ static DEFINE_RWLOCK(resource_lock);
59static struct resource *bootmem_resource_free; 59static struct resource *bootmem_resource_free;
60static DEFINE_SPINLOCK(bootmem_resource_lock); 60static DEFINE_SPINLOCK(bootmem_resource_lock);
61 61
62static void *r_next(struct seq_file *m, void *v, loff_t *pos) 62static struct resource *next_resource(struct resource *p, bool sibling_only)
63{ 63{
64 struct resource *p = v; 64 /* Caller wants to traverse through siblings only */
65 (*pos)++; 65 if (sibling_only)
66 return p->sibling;
67
66 if (p->child) 68 if (p->child)
67 return p->child; 69 return p->child;
68 while (!p->sibling && p->parent) 70 while (!p->sibling && p->parent)
@@ -70,6 +72,13 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
70 return p->sibling; 72 return p->sibling;
71} 73}
72 74
75static void *r_next(struct seq_file *m, void *v, loff_t *pos)
76{
77 struct resource *p = v;
78 (*pos)++;
79 return (void *)next_resource(p, false);
80}
81
73#ifdef CONFIG_PROC_FS 82#ifdef CONFIG_PROC_FS
74 83
75enum { MAX_IORES_LEVEL = 5 }; 84enum { MAX_IORES_LEVEL = 5 };
@@ -322,16 +331,19 @@ int release_resource(struct resource *old)
322 331
323EXPORT_SYMBOL(release_resource); 332EXPORT_SYMBOL(release_resource);
324 333
325#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
326/* 334/*
327 * Finds the lowest memory resource that exists within [res->start..res->end); 335 * Finds the lowest iomem resource that exists within [res->start..res->end);
328 * the caller must specify res->start, res->end, res->flags and "name". 336 * the caller must specify res->start, res->end, res->flags and "name".
329 * If found, returns 0, res is overwritten, if not found, returns -1. 337 * If found, returns 0, res is overwritten, if not found, returns -1.
338 * This walks through the whole tree, not just the first-level children,
339 * unless first_level_children_only is true.
330 */ 340 */
331static int find_next_system_ram(struct resource *res, char *name) 341static int find_next_iomem_res(struct resource *res, char *name,
342 bool first_level_children_only)
332{ 343{
333 resource_size_t start, end; 344 resource_size_t start, end;
334 struct resource *p; 345 struct resource *p;
346 bool sibling_only = false;
335 347
336 BUG_ON(!res); 348 BUG_ON(!res);
337 349
@@ -339,9 +351,12 @@ static int find_next_system_ram(struct resource *res, char *name)
339 end = res->end; 351 end = res->end;
340 BUG_ON(start >= end); 352 BUG_ON(start >= end);
341 353
354 if (first_level_children_only)
355 sibling_only = true;
356
342 read_lock(&resource_lock); 357 read_lock(&resource_lock);
343 for (p = iomem_resource.child; p ; p = p->sibling) { 358
344 /* system ram is just marked as IORESOURCE_MEM */ 359 for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) {
345 if (p->flags != res->flags) 360 if (p->flags != res->flags)
346 continue; 361 continue;
347 if (name && strcmp(p->name, name)) 362 if (name && strcmp(p->name, name))
@@ -353,6 +368,7 @@ static int find_next_system_ram(struct resource *res, char *name)
353 if ((p->end >= start) && (p->start < end)) 368 if ((p->end >= start) && (p->start < end))
354 break; 369 break;
355 } 370 }
371
356 read_unlock(&resource_lock); 372 read_unlock(&resource_lock);
357 if (!p) 373 if (!p)
358 return -1; 374 return -1;
@@ -365,6 +381,70 @@ static int find_next_system_ram(struct resource *res, char *name)
365} 381}
366 382
367/* 383/*
384 * Walks through iomem resources and calls func() with matching resource
385 * ranges. This walks through the whole tree, not just the first-level children.
386 * All the memory ranges that overlap [start, end] and also match flags and
387 * name are valid candidates.
388 *
389 * @name: name of resource
390 * @flags: resource flags
391 * @start: start addr
392 * @end: end addr
393 */
394int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
395 void *arg, int (*func)(u64, u64, void *))
396{
397 struct resource res;
398 u64 orig_end;
399 int ret = -1;
400
401 res.start = start;
402 res.end = end;
403 res.flags = flags;
404 orig_end = res.end;
405 while ((res.start < res.end) &&
406 (!find_next_iomem_res(&res, name, false))) {
407 ret = (*func)(res.start, res.end, arg);
408 if (ret)
409 break;
410 res.start = res.end + 1;
411 res.end = orig_end;
412 }
413 return ret;
414}
415
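
A small usage sketch for the new walker, assuming a caller that wants the total size of a named iomem resource (the "Crash kernel" name is only an example); my_count_bytes() and my_total_crashk_bytes() are not part of the patch.

#include <linux/ioport.h>

static int my_count_bytes(u64 start, u64 end, void *arg)
{
        u64 *total = arg;

        *total += end - start + 1;
        return 0;                       /* returning non-zero stops the walk */
}

static u64 my_total_crashk_bytes(void)
{
        u64 total = 0;

        walk_iomem_res("Crash kernel", IORESOURCE_MEM | IORESOURCE_BUSY,
                       0, (u64)-1, &total, my_count_bytes);
        return total;
}
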
416/*
417 * This function calls the callback against all "System RAM" memory ranges,
418 * i.e. those marked as both IORESOURCE_MEM and IORESOURCE_BUSY.
419 * For now, this function is only for "System RAM". It deals with
420 * full ranges rather than PFNs; if resources are not PFN-aligned, dealing
421 * with PFNs can truncate ranges.
422 */
423int walk_system_ram_res(u64 start, u64 end, void *arg,
424 int (*func)(u64, u64, void *))
425{
426 struct resource res;
427 u64 orig_end;
428 int ret = -1;
429
430 res.start = start;
431 res.end = end;
432 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
433 orig_end = res.end;
434 while ((res.start < res.end) &&
435 (!find_next_iomem_res(&res, "System RAM", true))) {
436 ret = (*func)(res.start, res.end, arg);
437 if (ret)
438 break;
439 res.start = res.end + 1;
440 res.end = orig_end;
441 }
442 return ret;
443}
444
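
And a parallel sketch for walk_system_ram_res(), simply logging every busy "System RAM" range; my_print_ram_range() is hypothetical, and the walk could sit in a debug initcall.

#include <linux/printk.h>

static int my_print_ram_range(u64 start, u64 end, void *arg)
{
        pr_info("System RAM: %#llx-%#llx\n",
                (unsigned long long)start, (unsigned long long)end);
        return 0;
}

/* e.g. from an initcall:  walk_system_ram_res(0, (u64)-1, NULL, my_print_ram_range); */
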
445#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
446
447/*
368 * This function calls the callback against all "System RAM" memory ranges 448 * This function calls the callback against all "System RAM" memory ranges
369 * which are marked as IORESOURCE_MEM and IORESOURCE_BUSY. 449 * which are marked as IORESOURCE_MEM and IORESOURCE_BUSY.
370 * Now, this function is only for "System RAM". 450 * Now, this function is only for "System RAM".
@@ -382,7 +462,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
382 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; 462 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
383 orig_end = res.end; 463 orig_end = res.end;
384 while ((res.start < res.end) && 464 while ((res.start < res.end) &&
385 (find_next_system_ram(&res, "System RAM") >= 0)) { 465 (find_next_iomem_res(&res, "System RAM", true) >= 0)) {
386 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; 466 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
387 end_pfn = (res.end + 1) >> PAGE_SHIFT; 467 end_pfn = (res.end + 1) >> PAGE_SHIFT;
388 if (end_pfn > pfn) 468 if (end_pfn > pfn)
@@ -411,6 +491,42 @@ int __weak page_is_ram(unsigned long pfn)
411} 491}
412EXPORT_SYMBOL_GPL(page_is_ram); 492EXPORT_SYMBOL_GPL(page_is_ram);
413 493
494/*
495 * Search for a resource entry that fully contains the specified region.
496 * If found, return 1 if it is RAM, 0 if not.
497 * If not found, or region is not fully contained, return -1
498 *
499 * Used by the ioremap functions to ensure the user is not remapping RAM; it is
500 * a vast speed-up over walking through the resource table page by page.
501 */
502int region_is_ram(resource_size_t start, unsigned long size)
503{
504 struct resource *p;
505 resource_size_t end = start + size - 1;
506 int flags = IORESOURCE_MEM | IORESOURCE_BUSY;
507 const char *name = "System RAM";
508 int ret = -1;
509
510 read_lock(&resource_lock);
511 for (p = iomem_resource.child; p ; p = p->sibling) {
512 if (end < p->start)
513 continue;
514
515 if (p->start <= start && end <= p->end) {
516 /* resource fully contains region */
517 if ((p->flags != flags) || strcmp(p->name, name))
518 ret = 0;
519 else
520 ret = 1;
521 break;
522 }
523 if (p->end < start)
524 break; /* not found */
525 }
526 read_unlock(&resource_lock);
527 return ret;
528}
529
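
A sketch of the intended ioremap-side check, under the assumption that an architecture wraps its mapping primitive roughly like this; my_ioremap_checked() is invented, ioremap_nocache() is the mapping interface of this era.

#include <linux/io.h>
#include <linux/ioport.h>

static void __iomem *my_ioremap_checked(resource_size_t phys, unsigned long size)
{
        int ram = region_is_ram(phys, size);

        if (ram == 1) {
                WARN_ONCE(1, "refusing to ioremap System RAM at %pa\n", &phys);
                return NULL;
        }
        /* ram == 0 (not RAM) or -1 (no single covering resource): go ahead. */
        return ioremap_nocache(phys, size);
}
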
414void __weak arch_remove_reservations(struct resource *avail) 530void __weak arch_remove_reservations(struct resource *avail)
415{ 531{
416} 532}
@@ -1165,6 +1281,76 @@ int release_mem_region_adjustable(struct resource *parent,
1165/* 1281/*
1166 * Managed region resource 1282 * Managed region resource
1167 */ 1283 */
1284static void devm_resource_release(struct device *dev, void *ptr)
1285{
1286 struct resource **r = ptr;
1287
1288 release_resource(*r);
1289}
1290
1291/**
1292 * devm_request_resource() - request and reserve an I/O or memory resource
1293 * @dev: device for which to request the resource
1294 * @root: root of the resource tree from which to request the resource
1295 * @new: descriptor of the resource to request
1296 *
1297 * This is a device-managed version of request_resource(). There is usually
1298 * no need to release resources requested by this function explicitly since
1299 * that will be taken care of when the device is unbound from its driver.
1300 * If for some reason the resource needs to be released explicitly, because
1301 * of ordering issues for example, drivers must call devm_release_resource()
1302 * rather than the regular release_resource().
1303 *
1304 * When a conflict is detected between any existing resources and the newly
1305 * requested resource, an error message will be printed.
1306 *
1307 * Returns 0 on success or a negative error code on failure.
1308 */
1309int devm_request_resource(struct device *dev, struct resource *root,
1310 struct resource *new)
1311{
1312 struct resource *conflict, **ptr;
1313
1314 ptr = devres_alloc(devm_resource_release, sizeof(*ptr), GFP_KERNEL);
1315 if (!ptr)
1316 return -ENOMEM;
1317
1318 *ptr = new;
1319
1320 conflict = request_resource_conflict(root, new);
1321 if (conflict) {
1322 dev_err(dev, "resource collision: %pR conflicts with %s %pR\n",
1323 new, conflict->name, conflict);
1324 devres_free(ptr);
1325 return -EBUSY;
1326 }
1327
1328 devres_add(dev, ptr);
1329 return 0;
1330}
1331EXPORT_SYMBOL(devm_request_resource);
1332
1333static int devm_resource_match(struct device *dev, void *res, void *data)
1334{
1335 struct resource **ptr = res;
1336
1337 return *ptr == data;
1338}
1339
1340/**
1341 * devm_release_resource() - release a previously requested resource
1342 * @dev: device for which to release the resource
1343 * @new: descriptor of the resource to release
1344 *
1345 * Releases a resource previously requested using devm_request_resource().
1346 */
1347void devm_release_resource(struct device *dev, struct resource *new)
1348{
1349 WARN_ON(devres_release(dev, devm_resource_release, devm_resource_match,
1350 new));
1351}
1352EXPORT_SYMBOL(devm_release_resource);
1353
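
For the device-managed helpers, a probe-path sketch under the usual platform-driver assumptions; my_drv_probe() is hypothetical, while the devm calls are the ones added above.

#include <linux/ioport.h>
#include <linux/platform_device.h>

static int my_drv_probe(struct platform_device *pdev)
{
        struct resource *res;
        int err;

        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        if (!res)
                return -ENODEV;

        /* Reserved against iomem_resource; released automatically on unbind. */
        err = devm_request_resource(&pdev->dev, &iomem_resource, res);
        if (err)
                return err;

        /* ...map and use the region... */
        return 0;
}
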
1168struct region_devres { 1354struct region_devres {
1169 struct resource *parent; 1355 struct resource *parent;
1170 resource_size_t start; 1356 resource_size_t start;
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index e73efba98301..8a2e230fb86a 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -148,11 +148,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
148 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) 148 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
149 goto out; 149 goto out;
150 150
151 t = p; 151 for_each_thread(p, t)
152 do {
153 sched_move_task(t); 152 sched_move_task(t);
154 } while_each_thread(p, t);
155
156out: 153out:
157 unlock_task_sighand(p, &flags); 154 unlock_task_sighand(p, &flags);
158 autogroup_kref_put(prev); 155 autogroup_kref_put(prev);
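
The hunk above swaps the old do {} while_each_thread() pair for for_each_thread(). A hedged sketch of the pattern follows, here under RCU rather than the siglock the autogroup path already holds; my_for_all_threads() and fn() are illustrative only.

#include <linux/rcupdate.h>
#include <linux/sched.h>

static void my_for_all_threads(struct task_struct *p,
                               void (*fn)(struct task_struct *))
{
        struct task_struct *t;

        rcu_read_lock();                /* iteration needs RCU, tasklist_lock or siglock */
        for_each_thread(p, t)
                fn(t);
        rcu_read_unlock();
}
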
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 3ef6451e972e..c27e4f8f4879 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -134,7 +134,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
134 134
135static inline struct sched_clock_data *this_scd(void) 135static inline struct sched_clock_data *this_scd(void)
136{ 136{
137 return &__get_cpu_var(sched_clock_data); 137 return this_cpu_ptr(&sched_clock_data);
138} 138}
139 139
140static inline struct sched_clock_data *cpu_sdc(int cpu) 140static inline struct sched_clock_data *cpu_sdc(int cpu)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1211575a2208..240157c13ddc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,22 +90,6 @@
90#define CREATE_TRACE_POINTS 90#define CREATE_TRACE_POINTS
91#include <trace/events/sched.h> 91#include <trace/events/sched.h>
92 92
93#ifdef smp_mb__before_atomic
94void __smp_mb__before_atomic(void)
95{
96 smp_mb__before_atomic();
97}
98EXPORT_SYMBOL(__smp_mb__before_atomic);
99#endif
100
101#ifdef smp_mb__after_atomic
102void __smp_mb__after_atomic(void)
103{
104 smp_mb__after_atomic();
105}
106EXPORT_SYMBOL(__smp_mb__after_atomic);
107#endif
108
109void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) 93void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
110{ 94{
111 unsigned long delta; 95 unsigned long delta;
@@ -333,9 +317,12 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
333 for (;;) { 317 for (;;) {
334 rq = task_rq(p); 318 rq = task_rq(p);
335 raw_spin_lock(&rq->lock); 319 raw_spin_lock(&rq->lock);
336 if (likely(rq == task_rq(p))) 320 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
337 return rq; 321 return rq;
338 raw_spin_unlock(&rq->lock); 322 raw_spin_unlock(&rq->lock);
323
324 while (unlikely(task_on_rq_migrating(p)))
325 cpu_relax();
339 } 326 }
340} 327}
341 328
@@ -352,10 +339,13 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
352 raw_spin_lock_irqsave(&p->pi_lock, *flags); 339 raw_spin_lock_irqsave(&p->pi_lock, *flags);
353 rq = task_rq(p); 340 rq = task_rq(p);
354 raw_spin_lock(&rq->lock); 341 raw_spin_lock(&rq->lock);
355 if (likely(rq == task_rq(p))) 342 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
356 return rq; 343 return rq;
357 raw_spin_unlock(&rq->lock); 344 raw_spin_unlock(&rq->lock);
358 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 345 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
346
347 while (unlikely(task_on_rq_migrating(p)))
348 cpu_relax();
359 } 349 }
360} 350}
361 351
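
The retry loops above rely on ->on_rq becoming a small state machine rather than a boolean. The helpers are assumed to be defined in kernel/sched/sched.h elsewhere in this series, roughly as sketched here:

#define TASK_ON_RQ_QUEUED       1       /* queued on its runqueue */
#define TASK_ON_RQ_MIGRATING    2       /* in flight between runqueues */

static inline int task_on_rq_queued(struct task_struct *p)
{
        return p->on_rq == TASK_ON_RQ_QUEUED;
}

static inline int task_on_rq_migrating(struct task_struct *p)
{
        return p->on_rq == TASK_ON_RQ_MIGRATING;
}
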
@@ -449,7 +439,15 @@ static void __hrtick_start(void *arg)
449void hrtick_start(struct rq *rq, u64 delay) 439void hrtick_start(struct rq *rq, u64 delay)
450{ 440{
451 struct hrtimer *timer = &rq->hrtick_timer; 441 struct hrtimer *timer = &rq->hrtick_timer;
452 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 442 ktime_t time;
443 s64 delta;
444
445 /*
446 * Don't schedule slices shorter than 10000ns, that just
447 * doesn't make sense and can cause timer DoS.
448 */
449 delta = max_t(s64, delay, 10000LL);
450 time = ktime_add_ns(timer->base->get_time(), delta);
453 451
454 hrtimer_set_expires(timer, time); 452 hrtimer_set_expires(timer, time);
455 453
@@ -1043,7 +1041,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1043 * A queue event has occurred, and we're going to schedule. In 1041 * A queue event has occurred, and we're going to schedule. In
1044 * this case, we can save a useless back to back clock update. 1042 * this case, we can save a useless back to back clock update.
1045 */ 1043 */
1046 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) 1044 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
1047 rq->skip_clock_update = 1; 1045 rq->skip_clock_update = 1;
1048} 1046}
1049 1047
@@ -1088,7 +1086,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1088 1086
1089static void __migrate_swap_task(struct task_struct *p, int cpu) 1087static void __migrate_swap_task(struct task_struct *p, int cpu)
1090{ 1088{
1091 if (p->on_rq) { 1089 if (task_on_rq_queued(p)) {
1092 struct rq *src_rq, *dst_rq; 1090 struct rq *src_rq, *dst_rq;
1093 1091
1094 src_rq = task_rq(p); 1092 src_rq = task_rq(p);
@@ -1214,7 +1212,7 @@ static int migration_cpu_stop(void *data);
1214unsigned long wait_task_inactive(struct task_struct *p, long match_state) 1212unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1215{ 1213{
1216 unsigned long flags; 1214 unsigned long flags;
1217 int running, on_rq; 1215 int running, queued;
1218 unsigned long ncsw; 1216 unsigned long ncsw;
1219 struct rq *rq; 1217 struct rq *rq;
1220 1218
@@ -1252,7 +1250,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1252 rq = task_rq_lock(p, &flags); 1250 rq = task_rq_lock(p, &flags);
1253 trace_sched_wait_task(p); 1251 trace_sched_wait_task(p);
1254 running = task_running(rq, p); 1252 running = task_running(rq, p);
1255 on_rq = p->on_rq; 1253 queued = task_on_rq_queued(p);
1256 ncsw = 0; 1254 ncsw = 0;
1257 if (!match_state || p->state == match_state) 1255 if (!match_state || p->state == match_state)
1258 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 1256 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
@@ -1284,7 +1282,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1284 * running right now), it's preempted, and we should 1282 * running right now), it's preempted, and we should
1285 * yield - it could be a while. 1283 * yield - it could be a while.
1286 */ 1284 */
1287 if (unlikely(on_rq)) { 1285 if (unlikely(queued)) {
1288 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); 1286 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1289 1287
1290 set_current_state(TASK_UNINTERRUPTIBLE); 1288 set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1478,7 +1476,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1478static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1476static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1479{ 1477{
1480 activate_task(rq, p, en_flags); 1478 activate_task(rq, p, en_flags);
1481 p->on_rq = 1; 1479 p->on_rq = TASK_ON_RQ_QUEUED;
1482 1480
1483 /* if a worker is waking up, notify workqueue */ 1481 /* if a worker is waking up, notify workqueue */
1484 if (p->flags & PF_WQ_WORKER) 1482 if (p->flags & PF_WQ_WORKER)
@@ -1537,7 +1535,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
1537 int ret = 0; 1535 int ret = 0;
1538 1536
1539 rq = __task_rq_lock(p); 1537 rq = __task_rq_lock(p);
1540 if (p->on_rq) { 1538 if (task_on_rq_queued(p)) {
1541 /* check_preempt_curr() may use rq clock */ 1539 /* check_preempt_curr() may use rq clock */
1542 update_rq_clock(rq); 1540 update_rq_clock(rq);
1543 ttwu_do_wakeup(rq, p, wake_flags); 1541 ttwu_do_wakeup(rq, p, wake_flags);
@@ -1620,6 +1618,25 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
1620 } 1618 }
1621} 1619}
1622 1620
1621void wake_up_if_idle(int cpu)
1622{
1623 struct rq *rq = cpu_rq(cpu);
1624 unsigned long flags;
1625
1626 if (!is_idle_task(rq->curr))
1627 return;
1628
1629 if (set_nr_if_polling(rq->idle)) {
1630 trace_sched_wake_idle_without_ipi(cpu);
1631 } else {
1632 raw_spin_lock_irqsave(&rq->lock, flags);
1633 if (is_idle_task(rq->curr))
1634 smp_send_reschedule(cpu);
1635 /* Else cpu is not in idle, do nothing here */
1636 raw_spin_unlock_irqrestore(&rq->lock, flags);
1637 }
1638}
1639
1623bool cpus_share_cache(int this_cpu, int that_cpu) 1640bool cpus_share_cache(int this_cpu, int that_cpu)
1624{ 1641{
1625 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1642 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
@@ -1742,7 +1759,7 @@ static void try_to_wake_up_local(struct task_struct *p)
1742 if (!(p->state & TASK_NORMAL)) 1759 if (!(p->state & TASK_NORMAL))
1743 goto out; 1760 goto out;
1744 1761
1745 if (!p->on_rq) 1762 if (!task_on_rq_queued(p))
1746 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 1763 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1747 1764
1748 ttwu_do_wakeup(rq, p, 0); 1765 ttwu_do_wakeup(rq, p, 0);
@@ -1776,6 +1793,20 @@ int wake_up_state(struct task_struct *p, unsigned int state)
1776} 1793}
1777 1794
1778/* 1795/*
1796 * This function clears the sched_dl_entity static params.
1797 */
1798void __dl_clear_params(struct task_struct *p)
1799{
1800 struct sched_dl_entity *dl_se = &p->dl;
1801
1802 dl_se->dl_runtime = 0;
1803 dl_se->dl_deadline = 0;
1804 dl_se->dl_period = 0;
1805 dl_se->flags = 0;
1806 dl_se->dl_bw = 0;
1807}
1808
1809/*
1779 * Perform scheduler related setup for a newly forked process p. 1810 * Perform scheduler related setup for a newly forked process p.
1780 * p is forked by current. 1811 * p is forked by current.
1781 * 1812 *
@@ -1799,10 +1830,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1799 1830
1800 RB_CLEAR_NODE(&p->dl.rb_node); 1831 RB_CLEAR_NODE(&p->dl.rb_node);
1801 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1832 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1802 p->dl.dl_runtime = p->dl.runtime = 0; 1833 __dl_clear_params(p);
1803 p->dl.dl_deadline = p->dl.deadline = 0;
1804 p->dl.dl_period = 0;
1805 p->dl.flags = 0;
1806 1834
1807 INIT_LIST_HEAD(&p->rt.run_list); 1835 INIT_LIST_HEAD(&p->rt.run_list);
1808 1836
@@ -1977,6 +2005,8 @@ unsigned long to_ratio(u64 period, u64 runtime)
1977#ifdef CONFIG_SMP 2005#ifdef CONFIG_SMP
1978inline struct dl_bw *dl_bw_of(int i) 2006inline struct dl_bw *dl_bw_of(int i)
1979{ 2007{
2008 rcu_lockdep_assert(rcu_read_lock_sched_held(),
2009 "sched RCU must be held");
1980 return &cpu_rq(i)->rd->dl_bw; 2010 return &cpu_rq(i)->rd->dl_bw;
1981} 2011}
1982 2012
@@ -1985,6 +2015,8 @@ static inline int dl_bw_cpus(int i)
1985 struct root_domain *rd = cpu_rq(i)->rd; 2015 struct root_domain *rd = cpu_rq(i)->rd;
1986 int cpus = 0; 2016 int cpus = 0;
1987 2017
2018 rcu_lockdep_assert(rcu_read_lock_sched_held(),
2019 "sched RCU must be held");
1988 for_each_cpu_and(i, rd->span, cpu_active_mask) 2020 for_each_cpu_and(i, rd->span, cpu_active_mask)
1989 cpus++; 2021 cpus++;
1990 2022
@@ -2095,7 +2127,7 @@ void wake_up_new_task(struct task_struct *p)
2095 init_task_runnable_average(p); 2127 init_task_runnable_average(p);
2096 rq = __task_rq_lock(p); 2128 rq = __task_rq_lock(p);
2097 activate_task(rq, p, 0); 2129 activate_task(rq, p, 0);
2098 p->on_rq = 1; 2130 p->on_rq = TASK_ON_RQ_QUEUED;
2099 trace_sched_wakeup_new(p, true); 2131 trace_sched_wakeup_new(p, true);
2100 check_preempt_curr(rq, p, WF_FORK); 2132 check_preempt_curr(rq, p, WF_FORK);
2101#ifdef CONFIG_SMP 2133#ifdef CONFIG_SMP
@@ -2287,10 +2319,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
2287 */ 2319 */
2288 post_schedule(rq); 2320 post_schedule(rq);
2289 2321
2290#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2291 /* In this case, finish_task_switch does not reenable preemption */
2292 preempt_enable();
2293#endif
2294 if (current->set_child_tid) 2322 if (current->set_child_tid)
2295 put_user(task_pid_vnr(current), current->set_child_tid); 2323 put_user(task_pid_vnr(current), current->set_child_tid);
2296} 2324}
@@ -2333,9 +2361,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2333 * of the scheduler it's an obvious special-case), so we 2361 * of the scheduler it's an obvious special-case), so we
2334 * do an early lockdep release here: 2362 * do an early lockdep release here:
2335 */ 2363 */
2336#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2337 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2364 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2338#endif
2339 2365
2340 context_tracking_task_switch(prev, next); 2366 context_tracking_task_switch(prev, next);
2341 /* Here we just switch the register state and the stack. */ 2367 /* Here we just switch the register state and the stack. */
@@ -2366,6 +2392,18 @@ unsigned long nr_running(void)
2366 return sum; 2392 return sum;
2367} 2393}
2368 2394
2395/*
2396 * Check if only the current task is running on the cpu.
2397 */
2398bool single_task_running(void)
2399{
2400 if (cpu_rq(smp_processor_id())->nr_running == 1)
2401 return true;
2402 else
2403 return false;
2404}
2405EXPORT_SYMBOL(single_task_running);
2406
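
single_task_running() is exported, presumably for busy-polling consumers. A hedged sketch of such a caller follows; my_device_done() and the loop are invented, and the answer is only advisory since the task can be migrated right after the check.

static void my_poll_for_completion(struct my_device *dev)
{
        while (!my_device_done(dev)) {          /* hypothetical completion test */
                if (single_task_running())
                        cpu_relax();            /* nobody else wants this CPU */
                else
                        cond_resched();         /* share the CPU when contended */
        }
}
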
2369unsigned long long nr_context_switches(void) 2407unsigned long long nr_context_switches(void)
2370{ 2408{
2371 int i; 2409 int i;
@@ -2393,6 +2431,13 @@ unsigned long nr_iowait_cpu(int cpu)
2393 return atomic_read(&this->nr_iowait); 2431 return atomic_read(&this->nr_iowait);
2394} 2432}
2395 2433
2434void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
2435{
2436 struct rq *this = this_rq();
2437 *nr_waiters = atomic_read(&this->nr_iowait);
2438 *load = this->cpu_load[0];
2439}
2440
2396#ifdef CONFIG_SMP 2441#ifdef CONFIG_SMP
2397 2442
2398/* 2443/*
@@ -2444,7 +2489,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2444 * project cycles that may never be accounted to this 2489 * project cycles that may never be accounted to this
2445 * thread, breaking clock_gettime(). 2490 * thread, breaking clock_gettime().
2446 */ 2491 */
2447 if (task_current(rq, p) && p->on_rq) { 2492 if (task_current(rq, p) && task_on_rq_queued(p)) {
2448 update_rq_clock(rq); 2493 update_rq_clock(rq);
2449 ns = rq_clock_task(rq) - p->se.exec_start; 2494 ns = rq_clock_task(rq) - p->se.exec_start;
2450 if ((s64)ns < 0) 2495 if ((s64)ns < 0)
@@ -2490,7 +2535,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2490 * If we see ->on_cpu without ->on_rq, the task is leaving, and has 2535 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
2491 * been accounted, so we're correct here as well. 2536 * been accounted, so we're correct here as well.
2492 */ 2537 */
2493 if (!p->on_cpu || !p->on_rq) 2538 if (!p->on_cpu || !task_on_rq_queued(p))
2494 return p->se.sum_exec_runtime; 2539 return p->se.sum_exec_runtime;
2495#endif 2540#endif
2496 2541
@@ -2653,6 +2698,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
2653 */ 2698 */
2654static inline void schedule_debug(struct task_struct *prev) 2699static inline void schedule_debug(struct task_struct *prev)
2655{ 2700{
2701#ifdef CONFIG_SCHED_STACK_END_CHECK
2702 BUG_ON(unlikely(task_stack_end_corrupted(prev)));
2703#endif
2656 /* 2704 /*
2657 * Test if we are atomic. Since do_exit() needs to call into 2705 * Test if we are atomic. Since do_exit() needs to call into
2658 * schedule() atomically, we ignore that path. Otherwise whine 2706 * schedule() atomically, we ignore that path. Otherwise whine
@@ -2794,7 +2842,7 @@ need_resched:
2794 switch_count = &prev->nvcsw; 2842 switch_count = &prev->nvcsw;
2795 } 2843 }
2796 2844
2797 if (prev->on_rq || rq->skip_clock_update < 0) 2845 if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
2798 update_rq_clock(rq); 2846 update_rq_clock(rq);
2799 2847
2800 next = pick_next_task(rq, prev); 2848 next = pick_next_task(rq, prev);
@@ -2903,6 +2951,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
2903} 2951}
2904NOKPROBE_SYMBOL(preempt_schedule); 2952NOKPROBE_SYMBOL(preempt_schedule);
2905EXPORT_SYMBOL(preempt_schedule); 2953EXPORT_SYMBOL(preempt_schedule);
2954
2955#ifdef CONFIG_CONTEXT_TRACKING
2956/**
2957 * preempt_schedule_context - preempt_schedule called by tracing
2958 *
2959 * The tracing infrastructure uses preempt_enable_notrace to prevent
2960 * recursion and tracing preempt enabling caused by the tracing
2961 * infrastructure itself. But as tracing can happen in areas coming
2962 * from userspace or just about to enter userspace, a preempt enable
2963 * can occur before user_exit() is called. This will cause the scheduler
2964 * to be called when the system is still in usermode.
2965 *
2966 * To prevent this, the preempt_enable_notrace will use this function
2967 * instead of preempt_schedule() to exit user context if needed before
2968 * calling the scheduler.
2969 */
2970asmlinkage __visible void __sched notrace preempt_schedule_context(void)
2971{
2972 enum ctx_state prev_ctx;
2973
2974 if (likely(!preemptible()))
2975 return;
2976
2977 do {
2978 __preempt_count_add(PREEMPT_ACTIVE);
2979 /*
2980 * Needs preempt disabled in case user_exit() is traced
2981 * and the tracer calls preempt_enable_notrace() causing
2982 * an infinite recursion.
2983 */
2984 prev_ctx = exception_enter();
2985 __schedule();
2986 exception_exit(prev_ctx);
2987
2988 __preempt_count_sub(PREEMPT_ACTIVE);
2989 barrier();
2990 } while (need_resched());
2991}
2992EXPORT_SYMBOL_GPL(preempt_schedule_context);
2993#endif /* CONFIG_CONTEXT_TRACKING */
2994
2906#endif /* CONFIG_PREEMPT */ 2995#endif /* CONFIG_PREEMPT */
2907 2996
2908/* 2997/*
@@ -2959,7 +3048,7 @@ EXPORT_SYMBOL(default_wake_function);
2959 */ 3048 */
2960void rt_mutex_setprio(struct task_struct *p, int prio) 3049void rt_mutex_setprio(struct task_struct *p, int prio)
2961{ 3050{
2962 int oldprio, on_rq, running, enqueue_flag = 0; 3051 int oldprio, queued, running, enqueue_flag = 0;
2963 struct rq *rq; 3052 struct rq *rq;
2964 const struct sched_class *prev_class; 3053 const struct sched_class *prev_class;
2965 3054
@@ -2988,12 +3077,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2988 trace_sched_pi_setprio(p, prio); 3077 trace_sched_pi_setprio(p, prio);
2989 oldprio = p->prio; 3078 oldprio = p->prio;
2990 prev_class = p->sched_class; 3079 prev_class = p->sched_class;
2991 on_rq = p->on_rq; 3080 queued = task_on_rq_queued(p);
2992 running = task_current(rq, p); 3081 running = task_current(rq, p);
2993 if (on_rq) 3082 if (queued)
2994 dequeue_task(rq, p, 0); 3083 dequeue_task(rq, p, 0);
2995 if (running) 3084 if (running)
2996 p->sched_class->put_prev_task(rq, p); 3085 put_prev_task(rq, p);
2997 3086
2998 /* 3087 /*
2999 * Boosting condition are: 3088 * Boosting condition are:
@@ -3030,7 +3119,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3030 3119
3031 if (running) 3120 if (running)
3032 p->sched_class->set_curr_task(rq); 3121 p->sched_class->set_curr_task(rq);
3033 if (on_rq) 3122 if (queued)
3034 enqueue_task(rq, p, enqueue_flag); 3123 enqueue_task(rq, p, enqueue_flag);
3035 3124
3036 check_class_changed(rq, p, prev_class, oldprio); 3125 check_class_changed(rq, p, prev_class, oldprio);
@@ -3041,7 +3130,7 @@ out_unlock:
3041 3130
3042void set_user_nice(struct task_struct *p, long nice) 3131void set_user_nice(struct task_struct *p, long nice)
3043{ 3132{
3044 int old_prio, delta, on_rq; 3133 int old_prio, delta, queued;
3045 unsigned long flags; 3134 unsigned long flags;
3046 struct rq *rq; 3135 struct rq *rq;
3047 3136
@@ -3062,8 +3151,8 @@ void set_user_nice(struct task_struct *p, long nice)
3062 p->static_prio = NICE_TO_PRIO(nice); 3151 p->static_prio = NICE_TO_PRIO(nice);
3063 goto out_unlock; 3152 goto out_unlock;
3064 } 3153 }
3065 on_rq = p->on_rq; 3154 queued = task_on_rq_queued(p);
3066 if (on_rq) 3155 if (queued)
3067 dequeue_task(rq, p, 0); 3156 dequeue_task(rq, p, 0);
3068 3157
3069 p->static_prio = NICE_TO_PRIO(nice); 3158 p->static_prio = NICE_TO_PRIO(nice);
@@ -3072,7 +3161,7 @@ void set_user_nice(struct task_struct *p, long nice)
3072 p->prio = effective_prio(p); 3161 p->prio = effective_prio(p);
3073 delta = p->prio - old_prio; 3162 delta = p->prio - old_prio;
3074 3163
3075 if (on_rq) { 3164 if (queued) {
3076 enqueue_task(rq, p, 0); 3165 enqueue_task(rq, p, 0);
3077 /* 3166 /*
3078 * If the task increased its priority or is running and 3167 * If the task increased its priority or is running and
@@ -3344,7 +3433,7 @@ static int __sched_setscheduler(struct task_struct *p,
3344{ 3433{
3345 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : 3434 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
3346 MAX_RT_PRIO - 1 - attr->sched_priority; 3435 MAX_RT_PRIO - 1 - attr->sched_priority;
3347 int retval, oldprio, oldpolicy = -1, on_rq, running; 3436 int retval, oldprio, oldpolicy = -1, queued, running;
3348 int policy = attr->sched_policy; 3437 int policy = attr->sched_policy;
3349 unsigned long flags; 3438 unsigned long flags;
3350 const struct sched_class *prev_class; 3439 const struct sched_class *prev_class;
@@ -3541,19 +3630,19 @@ change:
3541 return 0; 3630 return 0;
3542 } 3631 }
3543 3632
3544 on_rq = p->on_rq; 3633 queued = task_on_rq_queued(p);
3545 running = task_current(rq, p); 3634 running = task_current(rq, p);
3546 if (on_rq) 3635 if (queued)
3547 dequeue_task(rq, p, 0); 3636 dequeue_task(rq, p, 0);
3548 if (running) 3637 if (running)
3549 p->sched_class->put_prev_task(rq, p); 3638 put_prev_task(rq, p);
3550 3639
3551 prev_class = p->sched_class; 3640 prev_class = p->sched_class;
3552 __setscheduler(rq, p, attr); 3641 __setscheduler(rq, p, attr);
3553 3642
3554 if (running) 3643 if (running)
3555 p->sched_class->set_curr_task(rq); 3644 p->sched_class->set_curr_task(rq);
3556 if (on_rq) { 3645 if (queued) {
3557 /* 3646 /*
3558 * We enqueue to tail when the priority of a task is 3647 * We enqueue to tail when the priority of a task is
3559 * increased (user space view). 3648 * increased (user space view).
@@ -3977,14 +4066,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3977 rcu_read_lock(); 4066 rcu_read_lock();
3978 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { 4067 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
3979 rcu_read_unlock(); 4068 rcu_read_unlock();
3980 goto out_unlock; 4069 goto out_free_new_mask;
3981 } 4070 }
3982 rcu_read_unlock(); 4071 rcu_read_unlock();
3983 } 4072 }
3984 4073
3985 retval = security_task_setscheduler(p); 4074 retval = security_task_setscheduler(p);
3986 if (retval) 4075 if (retval)
3987 goto out_unlock; 4076 goto out_free_new_mask;
3988 4077
3989 4078
3990 cpuset_cpus_allowed(p, cpus_allowed); 4079 cpuset_cpus_allowed(p, cpus_allowed);
@@ -3997,13 +4086,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3997 * root_domain. 4086 * root_domain.
3998 */ 4087 */
3999#ifdef CONFIG_SMP 4088#ifdef CONFIG_SMP
4000 if (task_has_dl_policy(p)) { 4089 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
4001 const struct cpumask *span = task_rq(p)->rd->span; 4090 rcu_read_lock();
4002 4091 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
4003 if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
4004 retval = -EBUSY; 4092 retval = -EBUSY;
4005 goto out_unlock; 4093 rcu_read_unlock();
4094 goto out_free_new_mask;
4006 } 4095 }
4096 rcu_read_unlock();
4007 } 4097 }
4008#endif 4098#endif
4009again: 4099again:
@@ -4021,7 +4111,7 @@ again:
4021 goto again; 4111 goto again;
4022 } 4112 }
4023 } 4113 }
4024out_unlock: 4114out_free_new_mask:
4025 free_cpumask_var(new_mask); 4115 free_cpumask_var(new_mask);
4026out_free_cpus_allowed: 4116out_free_cpus_allowed:
4027 free_cpumask_var(cpus_allowed); 4117 free_cpumask_var(cpus_allowed);
@@ -4505,7 +4595,7 @@ void show_state_filter(unsigned long state_filter)
4505 " task PC stack pid father\n"); 4595 " task PC stack pid father\n");
4506#endif 4596#endif
4507 rcu_read_lock(); 4597 rcu_read_lock();
4508 do_each_thread(g, p) { 4598 for_each_process_thread(g, p) {
4509 /* 4599 /*
4510 * reset the NMI-timeout, listing all files on a slow 4600 * reset the NMI-timeout, listing all files on a slow
4511 * console might take a lot of time: 4601 * console might take a lot of time:
@@ -4513,7 +4603,7 @@ void show_state_filter(unsigned long state_filter)
4513 touch_nmi_watchdog(); 4603 touch_nmi_watchdog();
4514 if (!state_filter || (p->state & state_filter)) 4604 if (!state_filter || (p->state & state_filter))
4515 sched_show_task(p); 4605 sched_show_task(p);
4516 } while_each_thread(g, p); 4606 }
4517 4607
4518 touch_all_softlockup_watchdogs(); 4608 touch_all_softlockup_watchdogs();
4519 4609
@@ -4568,7 +4658,7 @@ void init_idle(struct task_struct *idle, int cpu)
4568 rcu_read_unlock(); 4658 rcu_read_unlock();
4569 4659
4570 rq->curr = rq->idle = idle; 4660 rq->curr = rq->idle = idle;
4571 idle->on_rq = 1; 4661 idle->on_rq = TASK_ON_RQ_QUEUED;
4572#if defined(CONFIG_SMP) 4662#if defined(CONFIG_SMP)
4573 idle->on_cpu = 1; 4663 idle->on_cpu = 1;
4574#endif 4664#endif
@@ -4589,6 +4679,33 @@ void init_idle(struct task_struct *idle, int cpu)
4589} 4679}
4590 4680
4591#ifdef CONFIG_SMP 4681#ifdef CONFIG_SMP
4682/*
4683 * move_queued_task - move a queued task to a new rq.
4684 *
4685 * Returns (locked) new rq. Old rq's lock is released.
4686 */
4687static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
4688{
4689 struct rq *rq = task_rq(p);
4690
4691 lockdep_assert_held(&rq->lock);
4692
4693 dequeue_task(rq, p, 0);
4694 p->on_rq = TASK_ON_RQ_MIGRATING;
4695 set_task_cpu(p, new_cpu);
4696 raw_spin_unlock(&rq->lock);
4697
4698 rq = cpu_rq(new_cpu);
4699
4700 raw_spin_lock(&rq->lock);
4701 BUG_ON(task_cpu(p) != new_cpu);
4702 p->on_rq = TASK_ON_RQ_QUEUED;
4703 enqueue_task(rq, p, 0);
4704 check_preempt_curr(rq, p, 0);
4705
4706 return rq;
4707}
4708
4592void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4709void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4593{ 4710{
4594 if (p->sched_class && p->sched_class->set_cpus_allowed) 4711 if (p->sched_class && p->sched_class->set_cpus_allowed)
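
The new move_queued_task() above is built around a strict lock hand-over: dequeue and mark the task TASK_ON_RQ_MIGRATING while holding the old runqueue lock, drop that lock, and only then take the destination runqueue lock for the enqueue, so both rq locks are never held together. A hedged userspace sketch of the same hand-over between two mutex-protected queues (types and helpers are hypothetical):

    #include <pthread.h>
    #include <stdio.h>

    struct item {
        int val;
        struct item *next;
    };

    struct queue {
        pthread_mutex_t lock;
        struct item *head;
    };

    /* Both helpers assume the caller holds q->lock. */
    static void push(struct queue *q, struct item *it)
    {
        it->next = q->head;
        q->head = it;
    }

    static void unlink_item(struct queue *q, struct item *it)
    {
        struct item **p = &q->head;

        while (*p && *p != it)
            p = &(*p)->next;
        if (*p)
            *p = it->next;
        it->next = NULL;
    }

    /* Move 'it' from src to dst without ever holding both locks. */
    static void move_queued_item(struct queue *src, struct queue *dst,
                                 struct item *it)
    {
        pthread_mutex_lock(&src->lock);
        unlink_item(src, it);            /* dequeue under the old lock */
        pthread_mutex_unlock(&src->lock);

        pthread_mutex_lock(&dst->lock);  /* then take the new lock */
        push(dst, it);
        pthread_mutex_unlock(&dst->lock);
    }

    int main(void)
    {
        struct queue a = { PTHREAD_MUTEX_INITIALIZER, NULL };
        struct queue b = { PTHREAD_MUTEX_INITIALIZER, NULL };
        struct item it = { 42, NULL };

        pthread_mutex_lock(&a.lock);
        push(&a, &it);
        pthread_mutex_unlock(&a.lock);

        move_queued_item(&a, &b, &it);
        printf("b.head->val = %d\n", b.head->val);   /* 42 */
        return 0;
    }
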
@@ -4645,14 +4762,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4645 goto out; 4762 goto out;
4646 4763
4647 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 4764 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4648 if (p->on_rq) { 4765 if (task_running(rq, p) || p->state == TASK_WAKING) {
4649 struct migration_arg arg = { p, dest_cpu }; 4766 struct migration_arg arg = { p, dest_cpu };
4650 /* Need help from migration thread: drop lock and wait. */ 4767 /* Need help from migration thread: drop lock and wait. */
4651 task_rq_unlock(rq, p, &flags); 4768 task_rq_unlock(rq, p, &flags);
4652 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 4769 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4653 tlb_migrate_finish(p->mm); 4770 tlb_migrate_finish(p->mm);
4654 return 0; 4771 return 0;
4655 } 4772 } else if (task_on_rq_queued(p))
4773 rq = move_queued_task(p, dest_cpu);
4656out: 4774out:
4657 task_rq_unlock(rq, p, &flags); 4775 task_rq_unlock(rq, p, &flags);
4658 4776
@@ -4673,20 +4791,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
4673 */ 4791 */
4674static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4792static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4675{ 4793{
4676 struct rq *rq_dest, *rq_src; 4794 struct rq *rq;
4677 int ret = 0; 4795 int ret = 0;
4678 4796
4679 if (unlikely(!cpu_active(dest_cpu))) 4797 if (unlikely(!cpu_active(dest_cpu)))
4680 return ret; 4798 return ret;
4681 4799
4682 rq_src = cpu_rq(src_cpu); 4800 rq = cpu_rq(src_cpu);
4683 rq_dest = cpu_rq(dest_cpu);
4684 4801
4685 raw_spin_lock(&p->pi_lock); 4802 raw_spin_lock(&p->pi_lock);
4686 double_rq_lock(rq_src, rq_dest); 4803 raw_spin_lock(&rq->lock);
4687 /* Already moved. */ 4804 /* Already moved. */
4688 if (task_cpu(p) != src_cpu) 4805 if (task_cpu(p) != src_cpu)
4689 goto done; 4806 goto done;
4807
4690 /* Affinity changed (again). */ 4808 /* Affinity changed (again). */
4691 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 4809 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
4692 goto fail; 4810 goto fail;
@@ -4695,16 +4813,12 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4695 * If we're not on a rq, the next wake-up will ensure we're 4813 * If we're not on a rq, the next wake-up will ensure we're
4696 * placed properly. 4814 * placed properly.
4697 */ 4815 */
4698 if (p->on_rq) { 4816 if (task_on_rq_queued(p))
4699 dequeue_task(rq_src, p, 0); 4817 rq = move_queued_task(p, dest_cpu);
4700 set_task_cpu(p, dest_cpu);
4701 enqueue_task(rq_dest, p, 0);
4702 check_preempt_curr(rq_dest, p, 0);
4703 }
4704done: 4818done:
4705 ret = 1; 4819 ret = 1;
4706fail: 4820fail:
4707 double_rq_unlock(rq_src, rq_dest); 4821 raw_spin_unlock(&rq->lock);
4708 raw_spin_unlock(&p->pi_lock); 4822 raw_spin_unlock(&p->pi_lock);
4709 return ret; 4823 return ret;
4710} 4824}
@@ -4736,22 +4850,22 @@ void sched_setnuma(struct task_struct *p, int nid)
4736{ 4850{
4737 struct rq *rq; 4851 struct rq *rq;
4738 unsigned long flags; 4852 unsigned long flags;
4739 bool on_rq, running; 4853 bool queued, running;
4740 4854
4741 rq = task_rq_lock(p, &flags); 4855 rq = task_rq_lock(p, &flags);
4742 on_rq = p->on_rq; 4856 queued = task_on_rq_queued(p);
4743 running = task_current(rq, p); 4857 running = task_current(rq, p);
4744 4858
4745 if (on_rq) 4859 if (queued)
4746 dequeue_task(rq, p, 0); 4860 dequeue_task(rq, p, 0);
4747 if (running) 4861 if (running)
4748 p->sched_class->put_prev_task(rq, p); 4862 put_prev_task(rq, p);
4749 4863
4750 p->numa_preferred_nid = nid; 4864 p->numa_preferred_nid = nid;
4751 4865
4752 if (running) 4866 if (running)
4753 p->sched_class->set_curr_task(rq); 4867 p->sched_class->set_curr_task(rq);
4754 if (on_rq) 4868 if (queued)
4755 enqueue_task(rq, p, 0); 4869 enqueue_task(rq, p, 0);
4756 task_rq_unlock(rq, p, &flags); 4870 task_rq_unlock(rq, p, &flags);
4757} 4871}
@@ -4771,6 +4885,12 @@ static int migration_cpu_stop(void *data)
4771 * be on another cpu but it doesn't matter. 4885 * be on another cpu but it doesn't matter.
4772 */ 4886 */
4773 local_irq_disable(); 4887 local_irq_disable();
4888 /*
4889 * We need to explicitly wake pending tasks before running
4890 * __migrate_task() such that we will not miss enforcing cpus_allowed
4891 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
4892 */
4893 sched_ttwu_pending();
4774 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 4894 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
4775 local_irq_enable(); 4895 local_irq_enable();
4776 return 0; 4896 return 0;
@@ -5181,6 +5301,7 @@ static int sched_cpu_inactive(struct notifier_block *nfb,
5181{ 5301{
5182 unsigned long flags; 5302 unsigned long flags;
5183 long cpu = (long)hcpu; 5303 long cpu = (long)hcpu;
5304 struct dl_bw *dl_b;
5184 5305
5185 switch (action & ~CPU_TASKS_FROZEN) { 5306 switch (action & ~CPU_TASKS_FROZEN) {
5186 case CPU_DOWN_PREPARE: 5307 case CPU_DOWN_PREPARE:
@@ -5188,15 +5309,19 @@ static int sched_cpu_inactive(struct notifier_block *nfb,
5188 5309
5189 /* explicitly allow suspend */ 5310 /* explicitly allow suspend */
5190 if (!(action & CPU_TASKS_FROZEN)) { 5311 if (!(action & CPU_TASKS_FROZEN)) {
5191 struct dl_bw *dl_b = dl_bw_of(cpu);
5192 bool overflow; 5312 bool overflow;
5193 int cpus; 5313 int cpus;
5194 5314
5315 rcu_read_lock_sched();
5316 dl_b = dl_bw_of(cpu);
5317
5195 raw_spin_lock_irqsave(&dl_b->lock, flags); 5318 raw_spin_lock_irqsave(&dl_b->lock, flags);
5196 cpus = dl_bw_cpus(cpu); 5319 cpus = dl_bw_cpus(cpu);
5197 overflow = __dl_overflow(dl_b, cpus, 0, 0); 5320 overflow = __dl_overflow(dl_b, cpus, 0, 0);
5198 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 5321 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5199 5322
5323 rcu_read_unlock_sched();
5324
5200 if (overflow) 5325 if (overflow)
5201 return notifier_from_errno(-EBUSY); 5326 return notifier_from_errno(-EBUSY);
5202 } 5327 }
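
The sched_cpu_inactive() hunk above widens the scope around dl_bw_of() with rcu_read_lock_sched() because the root-domain bandwidth structure it returns is RCU-managed; the admission decision itself is unchanged: refuse CPU_DOWN_PREPARE when the remaining CPUs could not carry the deadline bandwidth already granted. A simplified, stand-alone version of just that overflow arithmetic (field names and units are illustrative, not the kernel's struct dl_bw layout):

    #include <stdint.h>
    #include <stdio.h>

    /* With 'cpus' CPUs left and a per-CPU bandwidth cap 'bw', can the
     * already granted 'total_bw' still be served?  Nonzero means
     * overflow, i.e. the hot-unplug should be refused with -EBUSY. */
    static int dl_would_overflow(uint64_t bw, uint64_t total_bw, unsigned int cpus)
    {
        return (uint64_t)cpus * bw < total_bw;
    }

    int main(void)
    {
        /* cap of 950/1000 per CPU, 1800 units already admitted */
        printf("2 cpus left: %s\n",
               dl_would_overflow(950, 1800, 2) ? "-EBUSY" : "ok");
        printf("1 cpu  left: %s\n",
               dl_would_overflow(950, 1800, 1) ? "-EBUSY" : "ok");
        return 0;
    }
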
@@ -5739,7 +5864,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5739 const struct cpumask *span = sched_domain_span(sd); 5864 const struct cpumask *span = sched_domain_span(sd);
5740 struct cpumask *covered = sched_domains_tmpmask; 5865 struct cpumask *covered = sched_domains_tmpmask;
5741 struct sd_data *sdd = sd->private; 5866 struct sd_data *sdd = sd->private;
5742 struct sched_domain *child; 5867 struct sched_domain *sibling;
5743 int i; 5868 int i;
5744 5869
5745 cpumask_clear(covered); 5870 cpumask_clear(covered);
@@ -5750,10 +5875,10 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5750 if (cpumask_test_cpu(i, covered)) 5875 if (cpumask_test_cpu(i, covered))
5751 continue; 5876 continue;
5752 5877
5753 child = *per_cpu_ptr(sdd->sd, i); 5878 sibling = *per_cpu_ptr(sdd->sd, i);
5754 5879
5755 /* See the comment near build_group_mask(). */ 5880 /* See the comment near build_group_mask(). */
5756 if (!cpumask_test_cpu(i, sched_domain_span(child))) 5881 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5757 continue; 5882 continue;
5758 5883
5759 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5884 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
@@ -5763,10 +5888,9 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5763 goto fail; 5888 goto fail;
5764 5889
5765 sg_span = sched_group_cpus(sg); 5890 sg_span = sched_group_cpus(sg);
5766 if (child->child) { 5891 if (sibling->child)
5767 child = child->child; 5892 cpumask_copy(sg_span, sched_domain_span(sibling->child));
5768 cpumask_copy(sg_span, sched_domain_span(child)); 5893 else
5769 } else
5770 cpumask_set_cpu(i, sg_span); 5894 cpumask_set_cpu(i, sg_span);
5771 5895
5772 cpumask_or(covered, covered, sg_span); 5896 cpumask_or(covered, covered, sg_span);
@@ -7117,13 +7241,13 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7117 .sched_policy = SCHED_NORMAL, 7241 .sched_policy = SCHED_NORMAL,
7118 }; 7242 };
7119 int old_prio = p->prio; 7243 int old_prio = p->prio;
7120 int on_rq; 7244 int queued;
7121 7245
7122 on_rq = p->on_rq; 7246 queued = task_on_rq_queued(p);
7123 if (on_rq) 7247 if (queued)
7124 dequeue_task(rq, p, 0); 7248 dequeue_task(rq, p, 0);
7125 __setscheduler(rq, p, &attr); 7249 __setscheduler(rq, p, &attr);
7126 if (on_rq) { 7250 if (queued) {
7127 enqueue_task(rq, p, 0); 7251 enqueue_task(rq, p, 0);
7128 resched_curr(rq); 7252 resched_curr(rq);
7129 } 7253 }
@@ -7137,12 +7261,12 @@ void normalize_rt_tasks(void)
7137 unsigned long flags; 7261 unsigned long flags;
7138 struct rq *rq; 7262 struct rq *rq;
7139 7263
7140 read_lock_irqsave(&tasklist_lock, flags); 7264 read_lock(&tasklist_lock);
7141 do_each_thread(g, p) { 7265 for_each_process_thread(g, p) {
7142 /* 7266 /*
7143 * Only normalize user tasks: 7267 * Only normalize user tasks:
7144 */ 7268 */
7145 if (!p->mm) 7269 if (p->flags & PF_KTHREAD)
7146 continue; 7270 continue;
7147 7271
7148 p->se.exec_start = 0; 7272 p->se.exec_start = 0;
@@ -7157,21 +7281,16 @@ void normalize_rt_tasks(void)
7157 * Renice negative nice level userspace 7281 * Renice negative nice level userspace
7158 * tasks back to 0: 7282 * tasks back to 0:
7159 */ 7283 */
7160 if (task_nice(p) < 0 && p->mm) 7284 if (task_nice(p) < 0)
7161 set_user_nice(p, 0); 7285 set_user_nice(p, 0);
7162 continue; 7286 continue;
7163 } 7287 }
7164 7288
7165 raw_spin_lock(&p->pi_lock); 7289 rq = task_rq_lock(p, &flags);
7166 rq = __task_rq_lock(p);
7167
7168 normalize_task(rq, p); 7290 normalize_task(rq, p);
7169 7291 task_rq_unlock(rq, p, &flags);
7170 __task_rq_unlock(rq); 7292 }
7171 raw_spin_unlock(&p->pi_lock); 7293 read_unlock(&tasklist_lock);
7172 } while_each_thread(g, p);
7173
7174 read_unlock_irqrestore(&tasklist_lock, flags);
7175} 7294}
7176 7295
7177#endif /* CONFIG_MAGIC_SYSRQ */ 7296#endif /* CONFIG_MAGIC_SYSRQ */
@@ -7311,19 +7430,19 @@ void sched_offline_group(struct task_group *tg)
7311void sched_move_task(struct task_struct *tsk) 7430void sched_move_task(struct task_struct *tsk)
7312{ 7431{
7313 struct task_group *tg; 7432 struct task_group *tg;
7314 int on_rq, running; 7433 int queued, running;
7315 unsigned long flags; 7434 unsigned long flags;
7316 struct rq *rq; 7435 struct rq *rq;
7317 7436
7318 rq = task_rq_lock(tsk, &flags); 7437 rq = task_rq_lock(tsk, &flags);
7319 7438
7320 running = task_current(rq, tsk); 7439 running = task_current(rq, tsk);
7321 on_rq = tsk->on_rq; 7440 queued = task_on_rq_queued(tsk);
7322 7441
7323 if (on_rq) 7442 if (queued)
7324 dequeue_task(rq, tsk, 0); 7443 dequeue_task(rq, tsk, 0);
7325 if (unlikely(running)) 7444 if (unlikely(running))
7326 tsk->sched_class->put_prev_task(rq, tsk); 7445 put_prev_task(rq, tsk);
7327 7446
7328 tg = container_of(task_css_check(tsk, cpu_cgrp_id, 7447 tg = container_of(task_css_check(tsk, cpu_cgrp_id,
7329 lockdep_is_held(&tsk->sighand->siglock)), 7448 lockdep_is_held(&tsk->sighand->siglock)),
@@ -7333,14 +7452,14 @@ void sched_move_task(struct task_struct *tsk)
7333 7452
7334#ifdef CONFIG_FAIR_GROUP_SCHED 7453#ifdef CONFIG_FAIR_GROUP_SCHED
7335 if (tsk->sched_class->task_move_group) 7454 if (tsk->sched_class->task_move_group)
7336 tsk->sched_class->task_move_group(tsk, on_rq); 7455 tsk->sched_class->task_move_group(tsk, queued);
7337 else 7456 else
7338#endif 7457#endif
7339 set_task_rq(tsk, task_cpu(tsk)); 7458 set_task_rq(tsk, task_cpu(tsk));
7340 7459
7341 if (unlikely(running)) 7460 if (unlikely(running))
7342 tsk->sched_class->set_curr_task(rq); 7461 tsk->sched_class->set_curr_task(rq);
7343 if (on_rq) 7462 if (queued)
7344 enqueue_task(rq, tsk, 0); 7463 enqueue_task(rq, tsk, 0);
7345 7464
7346 task_rq_unlock(rq, tsk, &flags); 7465 task_rq_unlock(rq, tsk, &flags);
@@ -7358,10 +7477,10 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
7358{ 7477{
7359 struct task_struct *g, *p; 7478 struct task_struct *g, *p;
7360 7479
7361 do_each_thread(g, p) { 7480 for_each_process_thread(g, p) {
7362 if (rt_task(p) && task_rq(p)->rt.tg == tg) 7481 if (rt_task(p) && task_group(p) == tg)
7363 return 1; 7482 return 1;
7364 } while_each_thread(g, p); 7483 }
7365 7484
7366 return 0; 7485 return 0;
7367} 7486}
@@ -7570,6 +7689,7 @@ static int sched_dl_global_constraints(void)
7570 u64 runtime = global_rt_runtime(); 7689 u64 runtime = global_rt_runtime();
7571 u64 period = global_rt_period(); 7690 u64 period = global_rt_period();
7572 u64 new_bw = to_ratio(period, runtime); 7691 u64 new_bw = to_ratio(period, runtime);
7692 struct dl_bw *dl_b;
7573 int cpu, ret = 0; 7693 int cpu, ret = 0;
7574 unsigned long flags; 7694 unsigned long flags;
7575 7695
@@ -7583,13 +7703,16 @@ static int sched_dl_global_constraints(void)
7583 * solutions is welcome! 7703 * solutions is welcome!
7584 */ 7704 */
7585 for_each_possible_cpu(cpu) { 7705 for_each_possible_cpu(cpu) {
7586 struct dl_bw *dl_b = dl_bw_of(cpu); 7706 rcu_read_lock_sched();
7707 dl_b = dl_bw_of(cpu);
7587 7708
7588 raw_spin_lock_irqsave(&dl_b->lock, flags); 7709 raw_spin_lock_irqsave(&dl_b->lock, flags);
7589 if (new_bw < dl_b->total_bw) 7710 if (new_bw < dl_b->total_bw)
7590 ret = -EBUSY; 7711 ret = -EBUSY;
7591 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7712 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7592 7713
7714 rcu_read_unlock_sched();
7715
7593 if (ret) 7716 if (ret)
7594 break; 7717 break;
7595 } 7718 }
@@ -7600,6 +7723,7 @@ static int sched_dl_global_constraints(void)
7600static void sched_dl_do_global(void) 7723static void sched_dl_do_global(void)
7601{ 7724{
7602 u64 new_bw = -1; 7725 u64 new_bw = -1;
7726 struct dl_bw *dl_b;
7603 int cpu; 7727 int cpu;
7604 unsigned long flags; 7728 unsigned long flags;
7605 7729
@@ -7613,11 +7737,14 @@ static void sched_dl_do_global(void)
7613 * FIXME: As above... 7737 * FIXME: As above...
7614 */ 7738 */
7615 for_each_possible_cpu(cpu) { 7739 for_each_possible_cpu(cpu) {
7616 struct dl_bw *dl_b = dl_bw_of(cpu); 7740 rcu_read_lock_sched();
7741 dl_b = dl_bw_of(cpu);
7617 7742
7618 raw_spin_lock_irqsave(&dl_b->lock, flags); 7743 raw_spin_lock_irqsave(&dl_b->lock, flags);
7619 dl_b->bw = new_bw; 7744 dl_b->bw = new_bw;
7620 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7745 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7746
7747 rcu_read_unlock_sched();
7621 } 7748 }
7622} 7749}
7623 7750
@@ -7747,6 +7874,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
7747 sched_offline_group(tg); 7874 sched_offline_group(tg);
7748} 7875}
7749 7876
7877static void cpu_cgroup_fork(struct task_struct *task)
7878{
7879 sched_move_task(task);
7880}
7881
7750static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, 7882static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7751 struct cgroup_taskset *tset) 7883 struct cgroup_taskset *tset)
7752{ 7884{
@@ -7998,7 +8130,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
7998 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 8130 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
7999 8131
8000 quota = normalize_cfs_quota(tg, d); 8132 quota = normalize_cfs_quota(tg, d);
8001 parent_quota = parent_b->hierarchal_quota; 8133 parent_quota = parent_b->hierarchical_quota;
8002 8134
8003 /* 8135 /*
8004 * ensure max(child_quota) <= parent_quota, inherit when no 8136 * ensure max(child_quota) <= parent_quota, inherit when no
@@ -8009,7 +8141,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
8009 else if (parent_quota != RUNTIME_INF && quota > parent_quota) 8141 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
8010 return -EINVAL; 8142 return -EINVAL;
8011 } 8143 }
8012 cfs_b->hierarchal_quota = quota; 8144 cfs_b->hierarchical_quota = quota;
8013 8145
8014 return 0; 8146 return 0;
8015} 8147}
@@ -8119,6 +8251,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
8119 .css_free = cpu_cgroup_css_free, 8251 .css_free = cpu_cgroup_css_free,
8120 .css_online = cpu_cgroup_css_online, 8252 .css_online = cpu_cgroup_css_online,
8121 .css_offline = cpu_cgroup_css_offline, 8253 .css_offline = cpu_cgroup_css_offline,
8254 .fork = cpu_cgroup_fork,
8122 .can_attach = cpu_cgroup_can_attach, 8255 .can_attach = cpu_cgroup_can_attach,
8123 .attach = cpu_cgroup_attach, 8256 .attach = cpu_cgroup_attach,
8124 .exit = cpu_cgroup_exit, 8257 .exit = cpu_cgroup_exit,
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index bd95963dae80..539ca3ce071b 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -107,9 +107,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
107 int best_cpu = -1; 107 int best_cpu = -1;
108 const struct sched_dl_entity *dl_se = &p->dl; 108 const struct sched_dl_entity *dl_se = &p->dl;
109 109
110 if (later_mask && cpumask_and(later_mask, cp->free_cpus, 110 if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) {
111 &p->cpus_allowed) && cpumask_and(later_mask,
112 later_mask, cpu_active_mask)) {
113 best_cpu = cpumask_any(later_mask); 111 best_cpu = cpumask_any(later_mask);
114 goto out; 112 goto out;
115 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && 113 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 72fdf06ef865..8394b1ee600c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -288,24 +288,29 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
288 struct signal_struct *sig = tsk->signal; 288 struct signal_struct *sig = tsk->signal;
289 cputime_t utime, stime; 289 cputime_t utime, stime;
290 struct task_struct *t; 290 struct task_struct *t;
291 291 unsigned int seq, nextseq;
292 times->utime = sig->utime; 292 unsigned long flags;
293 times->stime = sig->stime;
294 times->sum_exec_runtime = sig->sum_sched_runtime;
295 293
296 rcu_read_lock(); 294 rcu_read_lock();
297 /* make sure we can trust tsk->thread_group list */ 295 /* Attempt a lockless read on the first round. */
298 if (!likely(pid_alive(tsk))) 296 nextseq = 0;
299 goto out;
300
301 t = tsk;
302 do { 297 do {
303 task_cputime(t, &utime, &stime); 298 seq = nextseq;
304 times->utime += utime; 299 flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
305 times->stime += stime; 300 times->utime = sig->utime;
306 times->sum_exec_runtime += task_sched_runtime(t); 301 times->stime = sig->stime;
307 } while_each_thread(tsk, t); 302 times->sum_exec_runtime = sig->sum_sched_runtime;
308out: 303
304 for_each_thread(tsk, t) {
305 task_cputime(t, &utime, &stime);
306 times->utime += utime;
307 times->stime += stime;
308 times->sum_exec_runtime += task_sched_runtime(t);
309 }
310 /* If lockless access failed, take the lock. */
311 nextseq = 1;
312 } while (need_seqretry(&sig->stats_lock, seq));
313 done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
309 rcu_read_unlock(); 314 rcu_read_unlock();
310} 315}
311 316
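
The rewritten thread_group_cputime() treats sig->stats_lock as a seqlock: the first pass reads the totals locklessly under a sequence count, and only if need_seqretry() reports that a writer raced does the second pass take the lock. The sketch below shows the lockless half of that pattern with a C11 seqcount; it is deliberately simplified (single writer assumed, no fallback-to-lock pass, and a production seqlock would also need atomic or fenced accesses to the payload):

    #include <stdatomic.h>
    #include <stdio.h>

    /* Seqcount-protected totals: even seq = stable, odd = write in progress. */
    struct group_stats {
        atomic_uint seq;
        unsigned long utime;
        unsigned long stime;
    };

    static void stats_write(struct group_stats *s, unsigned long u, unsigned long st)
    {
        atomic_fetch_add(&s->seq, 1);   /* seq becomes odd */
        s->utime = u;
        s->stime = st;
        atomic_fetch_add(&s->seq, 1);   /* seq becomes even again */
    }

    static void stats_read(struct group_stats *s, unsigned long *u, unsigned long *st)
    {
        unsigned int seq;

        do {
            do {                         /* wait for a stable snapshot */
                seq = atomic_load(&s->seq);
            } while (seq & 1);

            *u = s->utime;
            *st = s->stime;
            /* retry if a writer slipped in while we were reading */
        } while (atomic_load(&s->seq) != seq);
    }

    int main(void)
    {
        struct group_stats s = { 0, 0, 0 };
        unsigned long u, st;

        stats_write(&s, 100, 7);
        stats_read(&s, &u, &st);
        printf("utime=%lu stime=%lu\n", u, st);
        return 0;
    }
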
@@ -550,6 +555,23 @@ drop_precision:
550} 555}
551 556
552/* 557/*
558 * Atomically advance counter to the new value. Interrupts, vcpu
559 * scheduling, and scaling inaccuracies can cause cputime_advance
560 * to be occasionally called with a new value smaller than counter.
561 * Let's enforce atomicity.
562 *
563 * Normally a caller will only go through this loop once, or not
564 * at all in case a previous caller updated counter the same jiffy.
565 */
566static void cputime_advance(cputime_t *counter, cputime_t new)
567{
568 cputime_t old;
569
570 while (new > (old = ACCESS_ONCE(*counter)))
571 cmpxchg_cputime(counter, old, new);
572}
573
574/*
553 * Adjust tick based cputime random precision against scheduler 575 * Adjust tick based cputime random precision against scheduler
554 * runtime accounting. 576 * runtime accounting.
555 */ 577 */
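
cputime_advance() above replaces the old max()-based clamping in cputime_adjust() with a compare-and-swap loop, so concurrent updaters can only ever move prev->utime/stime forward and never publish a smaller, stale value. The same monotonic-advance idiom in portable C11, with atomic_compare_exchange_weak standing in for the kernel's cmpxchg_cputime():

    #include <stdatomic.h>
    #include <stdio.h>

    /* Advance *counter to 'new' only if that moves it forward.  Losing
     * the race is fine: someone else already stored an equal or larger
     * value, which is exactly the monotonicity being enforced. */
    static void counter_advance(atomic_ulong *counter, unsigned long new)
    {
        unsigned long old = atomic_load(counter);

        while (new > old) {
            if (atomic_compare_exchange_weak(counter, &old, new))
                break;
            /* on failure 'old' now holds the current value; re-check */
        }
    }

    int main(void)
    {
        atomic_ulong c = 10;

        counter_advance(&c, 25);   /* moves forward */
        counter_advance(&c, 5);    /* ignored: would go backwards */
        printf("%lu\n", (unsigned long)atomic_load(&c));   /* 25 */
        return 0;
    }
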
@@ -594,13 +616,8 @@ static void cputime_adjust(struct task_cputime *curr,
594 utime = rtime - stime; 616 utime = rtime - stime;
595 } 617 }
596 618
597 /* 619 cputime_advance(&prev->stime, stime);
598 * If the tick based count grows faster than the scheduler one, 620 cputime_advance(&prev->utime, utime);
599 * the result of the scaling may go backward.
600 * Let's enforce monotonicity.
601 */
602 prev->stime = max(prev->stime, stime);
603 prev->utime = max(prev->utime, utime);
604 621
605out: 622out:
606 *ut = prev->utime; 623 *ut = prev->utime;
@@ -617,9 +634,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
617 cputime_adjust(&cputime, &p->prev_cputime, ut, st); 634 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
618} 635}
619 636
620/*
621 * Must be called with siglock held.
622 */
623void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 637void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
624{ 638{
625 struct task_cputime cputime; 639 struct task_cputime cputime;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 255ce138b652..5285332392d5 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -518,21 +518,29 @@ again:
518 } 518 }
519 519
520 /* 520 /*
521 * We need to take care of a possible races here. In fact, the 521 * We need to take care of several possible races here:
522 * task might have changed its scheduling policy to something 522 *
523 * different from SCHED_DEADLINE or changed its reservation 523 * - the task might have changed its scheduling policy
524 * parameters (through sched_setattr()). 524 * to something different than SCHED_DEADLINE
525 * - the task might have changed its reservation parameters
526 * (through sched_setattr())
527 * - the task might have been boosted by someone else and
528 * might be in the boosting/deboosting path
529 *
530 * In all these cases we bail out, as the task is already
531 * in the runqueue or is going to be enqueued back anyway.
525 */ 532 */
526 if (!dl_task(p) || dl_se->dl_new) 533 if (!dl_task(p) || dl_se->dl_new ||
534 dl_se->dl_boosted || !dl_se->dl_throttled)
527 goto unlock; 535 goto unlock;
528 536
529 sched_clock_tick(); 537 sched_clock_tick();
530 update_rq_clock(rq); 538 update_rq_clock(rq);
531 dl_se->dl_throttled = 0; 539 dl_se->dl_throttled = 0;
532 dl_se->dl_yielded = 0; 540 dl_se->dl_yielded = 0;
533 if (p->on_rq) { 541 if (task_on_rq_queued(p)) {
534 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); 542 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
535 if (task_has_dl_policy(rq->curr)) 543 if (dl_task(rq->curr))
536 check_preempt_curr_dl(rq, p, 0); 544 check_preempt_curr_dl(rq, p, 0);
537 else 545 else
538 resched_curr(rq); 546 resched_curr(rq);
@@ -847,8 +855,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
847 * smaller than our one... OTW we keep our runtime and 855 * smaller than our one... OTW we keep our runtime and
848 * deadline. 856 * deadline.
849 */ 857 */
850 if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) 858 if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
851 pi_se = &pi_task->dl; 859 pi_se = &pi_task->dl;
860 } else if (!dl_prio(p->normal_prio)) {
861 /*
862 * Special case in which we have a !SCHED_DEADLINE task
863 * that is going to be deboosted, but exceeds its
864 * runtime while doing so. No point in replenishing
865 * it, as it's going to return to its original
866 * scheduling class after this.
867 */
868 BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
869 return;
870 }
852 871
853 /* 872 /*
854 * If p is throttled, we do nothing. In fact, if it exhausted 873 * If p is throttled, we do nothing. In fact, if it exhausted
@@ -997,10 +1016,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
997#ifdef CONFIG_SCHED_HRTICK 1016#ifdef CONFIG_SCHED_HRTICK
998static void start_hrtick_dl(struct rq *rq, struct task_struct *p) 1017static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
999{ 1018{
1000 s64 delta = p->dl.dl_runtime - p->dl.runtime; 1019 hrtick_start(rq, p->dl.runtime);
1001
1002 if (delta > 10000)
1003 hrtick_start(rq, p->dl.runtime);
1004} 1020}
1005#endif 1021#endif
1006 1022
@@ -1030,7 +1046,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
1030 * means a stop task can slip in, in which case we need to 1046 * means a stop task can slip in, in which case we need to
1031 * re-start task selection. 1047 * re-start task selection.
1032 */ 1048 */
1033 if (rq->stop && rq->stop->on_rq) 1049 if (rq->stop && task_on_rq_queued(rq->stop))
1034 return RETRY_TASK; 1050 return RETRY_TASK;
1035 } 1051 }
1036 1052
@@ -1124,10 +1140,8 @@ static void set_curr_task_dl(struct rq *rq)
1124static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) 1140static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
1125{ 1141{
1126 if (!task_running(rq, p) && 1142 if (!task_running(rq, p) &&
1127 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && 1143 cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1128 (p->nr_cpus_allowed > 1))
1129 return 1; 1144 return 1;
1130
1131 return 0; 1145 return 0;
1132} 1146}
1133 1147
@@ -1158,7 +1172,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
1158static int find_later_rq(struct task_struct *task) 1172static int find_later_rq(struct task_struct *task)
1159{ 1173{
1160 struct sched_domain *sd; 1174 struct sched_domain *sd;
1161 struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl); 1175 struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
1162 int this_cpu = smp_processor_id(); 1176 int this_cpu = smp_processor_id();
1163 int best_cpu, cpu = task_cpu(task); 1177 int best_cpu, cpu = task_cpu(task);
1164 1178
@@ -1169,6 +1183,13 @@ static int find_later_rq(struct task_struct *task)
1169 if (task->nr_cpus_allowed == 1) 1183 if (task->nr_cpus_allowed == 1)
1170 return -1; 1184 return -1;
1171 1185
1186 /*
1187 * We have to consider system topology and task affinity
1188 * first, then we can look for a suitable cpu.
1189 */
1190 cpumask_copy(later_mask, task_rq(task)->rd->span);
1191 cpumask_and(later_mask, later_mask, cpu_active_mask);
1192 cpumask_and(later_mask, later_mask, &task->cpus_allowed);
1172 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, 1193 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
1173 task, later_mask); 1194 task, later_mask);
1174 if (best_cpu == -1) 1195 if (best_cpu == -1)
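
find_later_rq() now builds the candidate mask itself, root-domain span first, then active CPUs, then the task's affinity, before handing it to cpudl_find(), which (per the cpudeadline.c hunk earlier in this diff) only intersects it with the free-CPU set. The composition is plain bitwise AND; a toy version with 64-bit masks and made-up values:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t cpumask_t;   /* one bit per CPU, 64 CPUs max */

    /* Compose the candidate mask the way find_later_rq() now does:
     * root-domain span, then active CPUs, then the task's affinity. */
    static cpumask_t later_mask(cpumask_t rd_span, cpumask_t active,
                                cpumask_t allowed)
    {
        cpumask_t m = rd_span;

        m &= active;
        m &= allowed;
        return m;
    }

    int main(void)
    {
        /* hypothetical: CPUs 0-3 in the domain, CPU 3 offline,
         * task allowed on CPUs 1-2 only */
        cpumask_t m = later_mask(0xf, 0x7, 0x6);

        printf("candidates: 0x%llx\n", (unsigned long long)m);   /* 0x6 */
        return 0;
    }
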
@@ -1257,7 +1278,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
1257 if (unlikely(task_rq(task) != rq || 1278 if (unlikely(task_rq(task) != rq ||
1258 !cpumask_test_cpu(later_rq->cpu, 1279 !cpumask_test_cpu(later_rq->cpu,
1259 &task->cpus_allowed) || 1280 &task->cpus_allowed) ||
1260 task_running(rq, task) || !task->on_rq)) { 1281 task_running(rq, task) ||
1282 !task_on_rq_queued(task))) {
1261 double_unlock_balance(rq, later_rq); 1283 double_unlock_balance(rq, later_rq);
1262 later_rq = NULL; 1284 later_rq = NULL;
1263 break; 1285 break;
@@ -1296,7 +1318,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
1296 BUG_ON(task_current(rq, p)); 1318 BUG_ON(task_current(rq, p));
1297 BUG_ON(p->nr_cpus_allowed <= 1); 1319 BUG_ON(p->nr_cpus_allowed <= 1);
1298 1320
1299 BUG_ON(!p->on_rq); 1321 BUG_ON(!task_on_rq_queued(p));
1300 BUG_ON(!dl_task(p)); 1322 BUG_ON(!dl_task(p));
1301 1323
1302 return p; 1324 return p;
@@ -1443,7 +1465,7 @@ static int pull_dl_task(struct rq *this_rq)
1443 dl_time_before(p->dl.deadline, 1465 dl_time_before(p->dl.deadline,
1444 this_rq->dl.earliest_dl.curr))) { 1466 this_rq->dl.earliest_dl.curr))) {
1445 WARN_ON(p == src_rq->curr); 1467 WARN_ON(p == src_rq->curr);
1446 WARN_ON(!p->on_rq); 1468 WARN_ON(!task_on_rq_queued(p));
1447 1469
1448 /* 1470 /*
1449 * Then we pull iff p has actually an earlier 1471 * Then we pull iff p has actually an earlier
@@ -1569,6 +1591,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
1569 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) 1591 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy))
1570 hrtimer_try_to_cancel(&p->dl.dl_timer); 1592 hrtimer_try_to_cancel(&p->dl.dl_timer);
1571 1593
1594 __dl_clear_params(p);
1595
1572#ifdef CONFIG_SMP 1596#ifdef CONFIG_SMP
1573 /* 1597 /*
1574 * Since this might be the only -deadline task on the rq, 1598 * Since this might be the only -deadline task on the rq,
@@ -1596,14 +1620,18 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1596 if (unlikely(p->dl.dl_throttled)) 1620 if (unlikely(p->dl.dl_throttled))
1597 return; 1621 return;
1598 1622
1599 if (p->on_rq && rq->curr != p) { 1623 if (task_on_rq_queued(p) && rq->curr != p) {
1600#ifdef CONFIG_SMP 1624#ifdef CONFIG_SMP
1601 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) 1625 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
1602 /* Only reschedule if pushing failed */ 1626 /* Only reschedule if pushing failed */
1603 check_resched = 0; 1627 check_resched = 0;
1604#endif /* CONFIG_SMP */ 1628#endif /* CONFIG_SMP */
1605 if (check_resched && task_has_dl_policy(rq->curr)) 1629 if (check_resched) {
1606 check_preempt_curr_dl(rq, p, 0); 1630 if (dl_task(rq->curr))
1631 check_preempt_curr_dl(rq, p, 0);
1632 else
1633 resched_curr(rq);
1634 }
1607 } 1635 }
1608} 1636}
1609 1637
@@ -1614,7 +1642,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1614static void prio_changed_dl(struct rq *rq, struct task_struct *p, 1642static void prio_changed_dl(struct rq *rq, struct task_struct *p,
1615 int oldprio) 1643 int oldprio)
1616{ 1644{
1617 if (p->on_rq || rq->curr == p) { 1645 if (task_on_rq_queued(p) || rq->curr == p) {
1618#ifdef CONFIG_SMP 1646#ifdef CONFIG_SMP
1619 /* 1647 /*
1620 * This might be too much, but unfortunately 1648 * This might be too much, but unfortunately
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 627b3c34b821..ce33780d8f20 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -150,7 +150,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
150static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) 150static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
151{ 151{
152 struct task_struct *g, *p; 152 struct task_struct *g, *p;
153 unsigned long flags;
154 153
155 SEQ_printf(m, 154 SEQ_printf(m,
156 "\nrunnable tasks:\n" 155 "\nrunnable tasks:\n"
@@ -159,16 +158,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
159 "------------------------------------------------------" 158 "------------------------------------------------------"
160 "----------------------------------------------------\n"); 159 "----------------------------------------------------\n");
161 160
162 read_lock_irqsave(&tasklist_lock, flags); 161 rcu_read_lock();
163 162 for_each_process_thread(g, p) {
164 do_each_thread(g, p) {
165 if (task_cpu(p) != rq_cpu) 163 if (task_cpu(p) != rq_cpu)
166 continue; 164 continue;
167 165
168 print_task(m, rq, p); 166 print_task(m, rq, p);
169 } while_each_thread(g, p); 167 }
170 168 rcu_read_unlock();
171 read_unlock_irqrestore(&tasklist_lock, flags);
172} 169}
173 170
174void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 171void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
@@ -333,9 +330,7 @@ do { \
333 print_cfs_stats(m, cpu); 330 print_cfs_stats(m, cpu);
334 print_rt_stats(m, cpu); 331 print_rt_stats(m, cpu);
335 332
336 rcu_read_lock();
337 print_rq(m, rq, cpu); 333 print_rq(m, rq, cpu);
338 rcu_read_unlock();
339 spin_unlock_irqrestore(&sched_debug_lock, flags); 334 spin_unlock_irqrestore(&sched_debug_lock, flags);
340 SEQ_printf(m, "\n"); 335 SEQ_printf(m, "\n");
341} 336}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bfa3c86d0d68..34baa60f8a7b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,7 @@
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/cpuidle.h>
26#include <linux/slab.h> 27#include <linux/slab.h>
27#include <linux/profile.h> 28#include <linux/profile.h>
28#include <linux/interrupt.h> 29#include <linux/interrupt.h>
@@ -665,6 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
665} 666}
666 667
667#ifdef CONFIG_SMP 668#ifdef CONFIG_SMP
669static int select_idle_sibling(struct task_struct *p, int cpu);
668static unsigned long task_h_load(struct task_struct *p); 670static unsigned long task_h_load(struct task_struct *p);
669 671
670static inline void __update_task_entity_contrib(struct sched_entity *se); 672static inline void __update_task_entity_contrib(struct sched_entity *se);
@@ -826,11 +828,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
826 828
827static unsigned int task_scan_min(struct task_struct *p) 829static unsigned int task_scan_min(struct task_struct *p)
828{ 830{
831 unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
829 unsigned int scan, floor; 832 unsigned int scan, floor;
830 unsigned int windows = 1; 833 unsigned int windows = 1;
831 834
832 if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) 835 if (scan_size < MAX_SCAN_WINDOW)
833 windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; 836 windows = MAX_SCAN_WINDOW / scan_size;
834 floor = 1000 / windows; 837 floor = 1000 / windows;
835 838
836 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); 839 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
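
task_scan_min() now snapshots sysctl_numa_balancing_scan_size into a local through ACCESS_ONCE(), so the range check and the division are guaranteed to use the same value even if the sysctl is rewritten concurrently; before, the divisor could differ from the value that had just been checked. The same read-once discipline in C11 (the cap value and names below are illustrative):

    #include <stdatomic.h>
    #include <stdio.h>

    #define MAX_SCAN_WINDOW 2560   /* cap, in MB, standing in for the kernel constant */

    /* A tunable that another thread may rewrite at any time. */
    static atomic_uint scan_size_mb = 256;

    static unsigned int scan_windows(void)
    {
        /* Read the tunable exactly once; the check and the division
         * below are guaranteed to see the same value. */
        unsigned int scan_size = atomic_load(&scan_size_mb);
        unsigned int windows = 1;

        if (scan_size < MAX_SCAN_WINDOW)
            windows = MAX_SCAN_WINDOW / scan_size;
        return windows;
    }

    int main(void)
    {
        printf("%u\n", scan_windows());   /* 10 with the default above */
        return 0;
    }
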
@@ -1038,7 +1041,8 @@ struct numa_stats {
1038 */ 1041 */
1039static void update_numa_stats(struct numa_stats *ns, int nid) 1042static void update_numa_stats(struct numa_stats *ns, int nid)
1040{ 1043{
1041 int cpu, cpus = 0; 1044 int smt, cpu, cpus = 0;
1045 unsigned long capacity;
1042 1046
1043 memset(ns, 0, sizeof(*ns)); 1047 memset(ns, 0, sizeof(*ns));
1044 for_each_cpu(cpu, cpumask_of_node(nid)) { 1048 for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1062,8 +1066,12 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1062 if (!cpus) 1066 if (!cpus)
1063 return; 1067 return;
1064 1068
1065 ns->task_capacity = 1069 /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1066 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); 1070 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1071 capacity = cpus / smt; /* cores */
1072
1073 ns->task_capacity = min_t(unsigned, capacity,
1074 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1067 ns->has_free_capacity = (ns->nr_running < ns->task_capacity); 1075 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1068} 1076}
1069 1077
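
The update_numa_stats() hunk above estimates the SMT fan-out of a node as smt = ceil(SCHED_CAPACITY_SCALE * cpus / compute_capacity) and caps task_capacity at the resulting core count, so a node full of hyperthreads is no longer credited with one task slot per hardware thread. The arithmetic, worked through for a hypothetical 8-thread, 4-core node (the per-thread capacity figure is invented):

    #include <stdio.h>

    #define SCHED_CAPACITY_SCALE    1024
    #define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))
    #define DIV_ROUND_CLOSEST(n, d) (((n) + (d) / 2) / (d))

    int main(void)
    {
        /* 8 logical CPUs, each SMT sibling worth ~589 capacity units,
         * so the whole node reports compute_capacity of about 4712 */
        unsigned long cpus = 8;
        unsigned long compute_capacity = 8 * 589;

        unsigned long smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus,
                                         compute_capacity);
        unsigned long cores = cpus / smt;
        unsigned long by_capacity = DIV_ROUND_CLOSEST(compute_capacity,
                                                      SCHED_CAPACITY_SCALE);
        unsigned long task_capacity = cores < by_capacity ? cores : by_capacity;

        /* prints: smt=2 cores=4 task_capacity=4 */
        printf("smt=%lu cores=%lu task_capacity=%lu\n",
               smt, cores, task_capacity);
        return 0;
    }
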
@@ -1157,9 +1165,19 @@ static void task_numa_compare(struct task_numa_env *env,
1157 long moveimp = imp; 1165 long moveimp = imp;
1158 1166
1159 rcu_read_lock(); 1167 rcu_read_lock();
1160 cur = ACCESS_ONCE(dst_rq->curr); 1168
1161 if (cur->pid == 0) /* idle */ 1169 raw_spin_lock_irq(&dst_rq->lock);
1170 cur = dst_rq->curr;
1171 /*
1172 * No need to move the exiting task, and this ensures that ->curr
1173 * wasn't reaped and thus get_task_struct() in task_numa_assign()
1174 * is safe under RCU read lock.
1175 * Note that rcu_read_lock() itself can't protect from the final
1176 * put_task_struct() after the last schedule().
1177 */
1178 if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1162 cur = NULL; 1179 cur = NULL;
1180 raw_spin_unlock_irq(&dst_rq->lock);
1163 1181
1164 /* 1182 /*
1165 * "imp" is the fault differential for the source task between the 1183 * "imp" is the fault differential for the source task between the
@@ -1206,7 +1224,7 @@ static void task_numa_compare(struct task_numa_env *env,
1206 1224
1207 if (!cur) { 1225 if (!cur) {
1208 /* Is there capacity at our destination? */ 1226 /* Is there capacity at our destination? */
1209 if (env->src_stats.has_free_capacity && 1227 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1210 !env->dst_stats.has_free_capacity) 1228 !env->dst_stats.has_free_capacity)
1211 goto unlock; 1229 goto unlock;
1212 1230
@@ -1252,6 +1270,13 @@ balance:
1252 if (load_too_imbalanced(src_load, dst_load, env)) 1270 if (load_too_imbalanced(src_load, dst_load, env))
1253 goto unlock; 1271 goto unlock;
1254 1272
1273 /*
1274 * One idle CPU per node is evaluated for a task numa move.
1275 * Call select_idle_sibling to maybe find a better one.
1276 */
1277 if (!cur)
1278 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
1279
1255assign: 1280assign:
1256 task_numa_assign(env, cur, imp); 1281 task_numa_assign(env, cur, imp);
1257unlock: 1282unlock:
@@ -1506,7 +1531,7 @@ static void update_task_scan_period(struct task_struct *p,
1506 * scanning faster if shared accesses dominate as it may 1531 * scanning faster if shared accesses dominate as it may
1507 * simply bounce migrations uselessly 1532 * simply bounce migrations uselessly
1508 */ 1533 */
1509 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); 1534 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1510 diff = (diff * ratio) / NUMA_PERIOD_SLOTS; 1535 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1511 } 1536 }
1512 1537
@@ -1775,7 +1800,7 @@ void task_numa_free(struct task_struct *p)
1775 list_del(&p->numa_entry); 1800 list_del(&p->numa_entry);
1776 grp->nr_tasks--; 1801 grp->nr_tasks--;
1777 spin_unlock_irqrestore(&grp->lock, flags); 1802 spin_unlock_irqrestore(&grp->lock, flags);
1778 rcu_assign_pointer(p->numa_group, NULL); 1803 RCU_INIT_POINTER(p->numa_group, NULL);
1779 put_numa_group(grp); 1804 put_numa_group(grp);
1780 } 1805 }
1781 1806
@@ -1804,10 +1829,6 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1804 if (!p->mm) 1829 if (!p->mm)
1805 return; 1830 return;
1806 1831
1807 /* Do not worry about placement if exiting */
1808 if (p->state == TASK_DEAD)
1809 return;
1810
1811 /* Allocate buffer to track faults on a per-node basis */ 1832 /* Allocate buffer to track faults on a per-node basis */
1812 if (unlikely(!p->numa_faults_memory)) { 1833 if (unlikely(!p->numa_faults_memory)) {
1813 int size = sizeof(*p->numa_faults_memory) * 1834 int size = sizeof(*p->numa_faults_memory) *
@@ -1946,7 +1967,7 @@ void task_numa_work(struct callback_head *work)
1946 vma = mm->mmap; 1967 vma = mm->mmap;
1947 } 1968 }
1948 for (; vma; vma = vma->vm_next) { 1969 for (; vma; vma = vma->vm_next) {
1949 if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) 1970 if (!vma_migratable(vma) || !vma_policy_mof(vma))
1950 continue; 1971 continue;
1951 1972
1952 /* 1973 /*
@@ -2211,8 +2232,8 @@ static __always_inline u64 decay_load(u64 val, u64 n)
2211 2232
2212 /* 2233 /*
2213 * As y^PERIOD = 1/2, we can combine 2234 * As y^PERIOD = 1/2, we can combine
2214 * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) 2235 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2215 * With a look-up table which covers k^n (n<PERIOD) 2236 * With a look-up table which covers y^n (n<PERIOD)
2216 * 2237 *
2217 * To achieve constant time decay_load. 2238 * To achieve constant time decay_load.
2218 */ 2239 */
@@ -2377,6 +2398,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2377 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; 2398 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
2378 tg_contrib -= cfs_rq->tg_load_contrib; 2399 tg_contrib -= cfs_rq->tg_load_contrib;
2379 2400
2401 if (!tg_contrib)
2402 return;
2403
2380 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { 2404 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
2381 atomic_long_add(tg_contrib, &tg->load_avg); 2405 atomic_long_add(tg_contrib, &tg->load_avg);
2382 cfs_rq->tg_load_contrib += tg_contrib; 2406 cfs_rq->tg_load_contrib += tg_contrib;
@@ -3892,14 +3916,6 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
3892 resched_curr(rq); 3916 resched_curr(rq);
3893 return; 3917 return;
3894 } 3918 }
3895
3896 /*
3897 * Don't schedule slices shorter than 10000ns, that just
3898 * doesn't make sense. Rely on vruntime for fairness.
3899 */
3900 if (rq->curr != p)
3901 delta = max_t(s64, 10000LL, delta);
3902
3903 hrtick_start(rq, delta); 3919 hrtick_start(rq, delta);
3904 } 3920 }
3905} 3921}
@@ -4087,7 +4103,7 @@ static unsigned long capacity_of(int cpu)
4087static unsigned long cpu_avg_load_per_task(int cpu) 4103static unsigned long cpu_avg_load_per_task(int cpu)
4088{ 4104{
4089 struct rq *rq = cpu_rq(cpu); 4105 struct rq *rq = cpu_rq(cpu);
4090 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 4106 unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
4091 unsigned long load_avg = rq->cfs.runnable_load_avg; 4107 unsigned long load_avg = rq->cfs.runnable_load_avg;
4092 4108
4093 if (nr_running) 4109 if (nr_running)
@@ -4276,8 +4292,8 @@ static int wake_wide(struct task_struct *p)
4276static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 4292static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4277{ 4293{
4278 s64 this_load, load; 4294 s64 this_load, load;
4295 s64 this_eff_load, prev_eff_load;
4279 int idx, this_cpu, prev_cpu; 4296 int idx, this_cpu, prev_cpu;
4280 unsigned long tl_per_task;
4281 struct task_group *tg; 4297 struct task_group *tg;
4282 unsigned long weight; 4298 unsigned long weight;
4283 int balanced; 4299 int balanced;
@@ -4320,47 +4336,30 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4320 * Otherwise check if either cpus are near enough in load to allow this 4336 * Otherwise check if either cpus are near enough in load to allow this
4321 * task to be woken on this_cpu. 4337 * task to be woken on this_cpu.
4322 */ 4338 */
4323 if (this_load > 0) { 4339 this_eff_load = 100;
4324 s64 this_eff_load, prev_eff_load; 4340 this_eff_load *= capacity_of(prev_cpu);
4341
4342 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4343 prev_eff_load *= capacity_of(this_cpu);
4325 4344
4326 this_eff_load = 100; 4345 if (this_load > 0) {
4327 this_eff_load *= capacity_of(prev_cpu);
4328 this_eff_load *= this_load + 4346 this_eff_load *= this_load +
4329 effective_load(tg, this_cpu, weight, weight); 4347 effective_load(tg, this_cpu, weight, weight);
4330 4348
4331 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4332 prev_eff_load *= capacity_of(this_cpu);
4333 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); 4349 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4350 }
4334 4351
4335 balanced = this_eff_load <= prev_eff_load; 4352 balanced = this_eff_load <= prev_eff_load;
4336 } else
4337 balanced = true;
4338
4339 /*
4340 * If the currently running task will sleep within
4341 * a reasonable amount of time then attract this newly
4342 * woken task:
4343 */
4344 if (sync && balanced)
4345 return 1;
4346 4353
4347 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); 4354 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
4348 tl_per_task = cpu_avg_load_per_task(this_cpu);
4349 4355
4350 if (balanced || 4356 if (!balanced)
4351 (this_load <= load && 4357 return 0;
4352 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
4353 /*
4354 * This domain has SD_WAKE_AFFINE and
4355 * p is cache cold in this domain, and
4356 * there is no bad imbalance.
4357 */
4358 schedstat_inc(sd, ttwu_move_affine);
4359 schedstat_inc(p, se.statistics.nr_wakeups_affine);
4360 4358
4361 return 1; 4359 schedstat_inc(sd, ttwu_move_affine);
4362 } 4360 schedstat_inc(p, se.statistics.nr_wakeups_affine);
4363 return 0; 4361
4362 return 1;
4364} 4363}
4365 4364
4366/* 4365/*
@@ -4428,20 +4427,46 @@ static int
4428find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 4427find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4429{ 4428{
4430 unsigned long load, min_load = ULONG_MAX; 4429 unsigned long load, min_load = ULONG_MAX;
4431 int idlest = -1; 4430 unsigned int min_exit_latency = UINT_MAX;
4431 u64 latest_idle_timestamp = 0;
4432 int least_loaded_cpu = this_cpu;
4433 int shallowest_idle_cpu = -1;
4432 int i; 4434 int i;
4433 4435
4434 /* Traverse only the allowed CPUs */ 4436 /* Traverse only the allowed CPUs */
4435 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { 4437 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
4436 load = weighted_cpuload(i); 4438 if (idle_cpu(i)) {
4437 4439 struct rq *rq = cpu_rq(i);
4438 if (load < min_load || (load == min_load && i == this_cpu)) { 4440 struct cpuidle_state *idle = idle_get_state(rq);
4439 min_load = load; 4441 if (idle && idle->exit_latency < min_exit_latency) {
4440 idlest = i; 4442 /*
4443 * We give priority to a CPU whose idle state
4444 * has the smallest exit latency irrespective
4445 * of any idle timestamp.
4446 */
4447 min_exit_latency = idle->exit_latency;
4448 latest_idle_timestamp = rq->idle_stamp;
4449 shallowest_idle_cpu = i;
4450 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
4451 rq->idle_stamp > latest_idle_timestamp) {
4452 /*
4453 * If equal or no active idle state, then
4454 * the most recently idled CPU might have
4455 * a warmer cache.
4456 */
4457 latest_idle_timestamp = rq->idle_stamp;
4458 shallowest_idle_cpu = i;
4459 }
4460 } else {
4461 load = weighted_cpuload(i);
4462 if (load < min_load || (load == min_load && i == this_cpu)) {
4463 min_load = load;
4464 least_loaded_cpu = i;
4465 }
4441 } 4466 }
4442 } 4467 }
4443 4468
4444 return idlest; 4469 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
4445} 4470}
4446 4471
4447/* 4472/*
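
The reworked find_idlest_cpu() prefers, among idle CPUs, the one whose idle state has the smallest exit latency, breaks ties in favour of the most recently idled CPU (the warmer cache), and only falls back to the least-loaded CPU when nothing is idle. A condensed userspace rendering of that selection loop over an array of fake per-CPU records; the struct and the sample numbers are invented, and the kernel's extra case of an idle CPU with no cpuidle state is folded into the tie-break here:

    #include <limits.h>
    #include <stdint.h>
    #include <stdio.h>

    struct cpu_info {
        int idle;                    /* is the CPU idle right now? */
        unsigned int exit_latency;   /* of its current idle state */
        uint64_t idle_stamp;         /* when it went idle */
        unsigned long load;          /* weighted load if busy */
    };

    static int pick_idlest_cpu(const struct cpu_info *ci, int nr, int this_cpu)
    {
        unsigned int min_exit_latency = UINT_MAX;
        uint64_t latest_idle_timestamp = 0;
        unsigned long min_load = ULONG_MAX;
        int least_loaded_cpu = this_cpu;
        int shallowest_idle_cpu = -1;

        for (int i = 0; i < nr; i++) {
            if (ci[i].idle) {
                if (ci[i].exit_latency < min_exit_latency) {
                    /* a shallower idle state wins outright */
                    min_exit_latency = ci[i].exit_latency;
                    latest_idle_timestamp = ci[i].idle_stamp;
                    shallowest_idle_cpu = i;
                } else if (ci[i].exit_latency == min_exit_latency &&
                           ci[i].idle_stamp > latest_idle_timestamp) {
                    /* equally shallow: prefer the warmer cache */
                    latest_idle_timestamp = ci[i].idle_stamp;
                    shallowest_idle_cpu = i;
                }
            } else if (ci[i].load < min_load ||
                       (ci[i].load == min_load && i == this_cpu)) {
                min_load = ci[i].load;
                least_loaded_cpu = i;
            }
        }

        return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
    }

    int main(void)
    {
        struct cpu_info ci[] = {
            { 0,  0,  0, 300 },   /* busy, load 300 */
            { 1, 50, 10,   0 },   /* idle, deep C-state */
            { 1, 10,  5,   0 },   /* idle, shallow C-state: expected pick */
        };

        printf("picked cpu %d\n", pick_idlest_cpu(ci, 3, 0));   /* 2 */
        return 0;
    }
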
@@ -4513,11 +4538,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4513 if (p->nr_cpus_allowed == 1) 4538 if (p->nr_cpus_allowed == 1)
4514 return prev_cpu; 4539 return prev_cpu;
4515 4540
4516 if (sd_flag & SD_BALANCE_WAKE) { 4541 if (sd_flag & SD_BALANCE_WAKE)
4517 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) 4542 want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
4518 want_affine = 1;
4519 new_cpu = prev_cpu;
4520 }
4521 4543
4522 rcu_read_lock(); 4544 rcu_read_lock();
4523 for_each_domain(cpu, tmp) { 4545 for_each_domain(cpu, tmp) {
@@ -4704,7 +4726,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
4704 return; 4726 return;
4705 4727
4706 /* 4728 /*
4707 * This is possible from callers such as move_task(), in which we 4729 * This is possible from callers such as attach_tasks(), in which we
4708 * unconditionally check_prempt_curr() after an enqueue (which may have 4730 * unconditionally check_prempt_curr() after an enqueue (which may have
4709 * lead to a throttle). This both saves work and prevents false 4731 * lead to a throttle). This both saves work and prevents false
4710 * next-buddy nomination below. 4732 * next-buddy nomination below.
@@ -5112,27 +5134,18 @@ struct lb_env {
5112 unsigned int loop_max; 5134 unsigned int loop_max;
5113 5135
5114 enum fbq_type fbq_type; 5136 enum fbq_type fbq_type;
5137 struct list_head tasks;
5115}; 5138};
5116 5139
5117/* 5140/*
5118 * move_task - move a task from one runqueue to another runqueue.
5119 * Both runqueues must be locked.
5120 */
5121static void move_task(struct task_struct *p, struct lb_env *env)
5122{
5123 deactivate_task(env->src_rq, p, 0);
5124 set_task_cpu(p, env->dst_cpu);
5125 activate_task(env->dst_rq, p, 0);
5126 check_preempt_curr(env->dst_rq, p, 0);
5127}
5128
5129/*
5130 * Is this task likely cache-hot: 5141 * Is this task likely cache-hot:
5131 */ 5142 */
5132static int task_hot(struct task_struct *p, struct lb_env *env) 5143static int task_hot(struct task_struct *p, struct lb_env *env)
5133{ 5144{
5134 s64 delta; 5145 s64 delta;
5135 5146
5147 lockdep_assert_held(&env->src_rq->lock);
5148
5136 if (p->sched_class != &fair_sched_class) 5149 if (p->sched_class != &fair_sched_class)
5137 return 0; 5150 return 0;
5138 5151
@@ -5252,6 +5265,9 @@ static
5252int can_migrate_task(struct task_struct *p, struct lb_env *env) 5265int can_migrate_task(struct task_struct *p, struct lb_env *env)
5253{ 5266{
5254 int tsk_cache_hot = 0; 5267 int tsk_cache_hot = 0;
5268
5269 lockdep_assert_held(&env->src_rq->lock);
5270
5255 /* 5271 /*
5256 * We do not migrate tasks that are: 5272 * We do not migrate tasks that are:
5257 * 1) throttled_lb_pair, or 5273 * 1) throttled_lb_pair, or
@@ -5310,24 +5326,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
5310 if (!tsk_cache_hot) 5326 if (!tsk_cache_hot)
5311 tsk_cache_hot = migrate_degrades_locality(p, env); 5327 tsk_cache_hot = migrate_degrades_locality(p, env);
5312 5328
5313 if (migrate_improves_locality(p, env)) { 5329 if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
5314#ifdef CONFIG_SCHEDSTATS 5330 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5315 if (tsk_cache_hot) {
5316 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5317 schedstat_inc(p, se.statistics.nr_forced_migrations);
5318 }
5319#endif
5320 return 1;
5321 }
5322
5323 if (!tsk_cache_hot ||
5324 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5325
5326 if (tsk_cache_hot) { 5331 if (tsk_cache_hot) {
5327 schedstat_inc(env->sd, lb_hot_gained[env->idle]); 5332 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5328 schedstat_inc(p, se.statistics.nr_forced_migrations); 5333 schedstat_inc(p, se.statistics.nr_forced_migrations);
5329 } 5334 }
5330
5331 return 1; 5335 return 1;
5332 } 5336 }
5333 5337
@@ -5336,47 +5340,63 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
5336} 5340}
5337 5341
5338/* 5342/*
5339 * move_one_task tries to move exactly one task from busiest to this_rq, as 5343 * detach_task() -- detach the task for the migration specified in env
5344 */
5345static void detach_task(struct task_struct *p, struct lb_env *env)
5346{
5347 lockdep_assert_held(&env->src_rq->lock);
5348
5349 deactivate_task(env->src_rq, p, 0);
5350 p->on_rq = TASK_ON_RQ_MIGRATING;
5351 set_task_cpu(p, env->dst_cpu);
5352}
5353
5354/*
5355 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
5340 * part of active balancing operations within "domain". 5356 * part of active balancing operations within "domain".
5341 * Returns 1 if successful and 0 otherwise.
5342 * 5357 *
5343 * Called with both runqueues locked. 5358 * Returns a task if successful and NULL otherwise.
5344 */ 5359 */
5345static int move_one_task(struct lb_env *env) 5360static struct task_struct *detach_one_task(struct lb_env *env)
5346{ 5361{
5347 struct task_struct *p, *n; 5362 struct task_struct *p, *n;
5348 5363
5364 lockdep_assert_held(&env->src_rq->lock);
5365
5349 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { 5366 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
5350 if (!can_migrate_task(p, env)) 5367 if (!can_migrate_task(p, env))
5351 continue; 5368 continue;
5352 5369
5353 move_task(p, env); 5370 detach_task(p, env);
5371
5354 /* 5372 /*
5355 * Right now, this is only the second place move_task() 5373 * Right now, this is only the second place where
5356 * is called, so we can safely collect move_task() 5374 * lb_gained[env->idle] is updated (other is detach_tasks)
5357 * stats here rather than inside move_task(). 5375 * so we can safely collect stats here rather than
5376 * inside detach_tasks().
5358 */ 5377 */
5359 schedstat_inc(env->sd, lb_gained[env->idle]); 5378 schedstat_inc(env->sd, lb_gained[env->idle]);
5360 return 1; 5379 return p;
5361 } 5380 }
5362 return 0; 5381 return NULL;
5363} 5382}
5364 5383
5365static const unsigned int sched_nr_migrate_break = 32; 5384static const unsigned int sched_nr_migrate_break = 32;
5366 5385
5367/* 5386/*
5368 * move_tasks tries to move up to imbalance weighted load from busiest to 5387 * detach_tasks() -- tries to detach up to imbalance weighted load from
5369 * this_rq, as part of a balancing operation within domain "sd". 5388 * busiest_rq, as part of a balancing operation within domain "sd".
5370 * Returns 1 if successful and 0 otherwise.
5371 * 5389 *
5372 * Called with both runqueues locked. 5390 * Returns number of detached tasks if successful and 0 otherwise.
5373 */ 5391 */
5374static int move_tasks(struct lb_env *env) 5392static int detach_tasks(struct lb_env *env)
5375{ 5393{
5376 struct list_head *tasks = &env->src_rq->cfs_tasks; 5394 struct list_head *tasks = &env->src_rq->cfs_tasks;
5377 struct task_struct *p; 5395 struct task_struct *p;
5378 unsigned long load; 5396 unsigned long load;
5379 int pulled = 0; 5397 int detached = 0;
5398
5399 lockdep_assert_held(&env->src_rq->lock);
5380 5400
5381 if (env->imbalance <= 0) 5401 if (env->imbalance <= 0)
5382 return 0; 5402 return 0;
@@ -5407,14 +5427,16 @@ static int move_tasks(struct lb_env *env)
5407 if ((load / 2) > env->imbalance) 5427 if ((load / 2) > env->imbalance)
5408 goto next; 5428 goto next;
5409 5429
5410 move_task(p, env); 5430 detach_task(p, env);
5411 pulled++; 5431 list_add(&p->se.group_node, &env->tasks);
5432
5433 detached++;
5412 env->imbalance -= load; 5434 env->imbalance -= load;
5413 5435
5414#ifdef CONFIG_PREEMPT 5436#ifdef CONFIG_PREEMPT
5415 /* 5437 /*
5416 * NEWIDLE balancing is a source of latency, so preemptible 5438 * NEWIDLE balancing is a source of latency, so preemptible
5417 * kernels will stop after the first task is pulled to minimize 5439 * kernels will stop after the first task is detached to minimize
5418 * the critical section. 5440 * the critical section.
5419 */ 5441 */
5420 if (env->idle == CPU_NEWLY_IDLE) 5442 if (env->idle == CPU_NEWLY_IDLE)
@@ -5434,13 +5456,58 @@ next:
5434 } 5456 }
5435 5457
5436 /* 5458 /*
5437 * Right now, this is one of only two places move_task() is called, 5459 * Right now, this is one of only two places we collect this stat
5438 * so we can safely collect move_task() stats here rather than 5460 * so we can safely collect detach_one_task() stats here rather
5439 * inside move_task(). 5461 * than inside detach_one_task().
5440 */ 5462 */
5441 schedstat_add(env->sd, lb_gained[env->idle], pulled); 5463 schedstat_add(env->sd, lb_gained[env->idle], detached);
5464
5465 return detached;
5466}
5467
5468/*
5469 * attach_task() -- attach the task detached by detach_task() to its new rq.
5470 */
5471static void attach_task(struct rq *rq, struct task_struct *p)
5472{
5473 lockdep_assert_held(&rq->lock);
5474
5475 BUG_ON(task_rq(p) != rq);
5476 p->on_rq = TASK_ON_RQ_QUEUED;
5477 activate_task(rq, p, 0);
5478 check_preempt_curr(rq, p, 0);
5479}
5480
5481/*
5482 * attach_one_task() -- attaches the task returned from detach_one_task() to
5483 * its new rq.
5484 */
5485static void attach_one_task(struct rq *rq, struct task_struct *p)
5486{
5487 raw_spin_lock(&rq->lock);
5488 attach_task(rq, p);
5489 raw_spin_unlock(&rq->lock);
5490}
5491
5492/*
5493 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
5494 * new rq.
5495 */
5496static void attach_tasks(struct lb_env *env)
5497{
5498 struct list_head *tasks = &env->tasks;
5499 struct task_struct *p;
5500
5501 raw_spin_lock(&env->dst_rq->lock);
5502
5503 while (!list_empty(tasks)) {
5504 p = list_first_entry(tasks, struct task_struct, se.group_node);
5505 list_del_init(&p->se.group_node);
5506
5507 attach_task(env->dst_rq, p);
5508 }
5442 5509
5443 return pulled; 5510 raw_spin_unlock(&env->dst_rq->lock);
5444} 5511}
5445 5512
5446#ifdef CONFIG_FAIR_GROUP_SCHED 5513#ifdef CONFIG_FAIR_GROUP_SCHED
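
For orientation, move_tasks() used to run with both runqueue locks held; the detach_tasks()/attach_tasks() pair introduced above instead unlinks tasks from the busiest runqueue while only that runqueue's lock is held, parks them on env->tasks, and re-queues them on the destination under its own lock. A minimal user-space sketch of that two-phase hand-off follows; struct rq, detach_all() and attach_all() are invented stand-ins for illustration, not the kernel's types or helpers.

#include <pthread.h>
#include <stdio.h>

struct task { int id; struct task *next; };

struct rq {
        pthread_mutex_t lock;
        struct task *head;
};

/* Phase 1: unlink everything from src while holding only src->lock. */
static struct task *detach_all(struct rq *src)
{
        struct task *detached;

        pthread_mutex_lock(&src->lock);
        detached = src->head;
        src->head = NULL;
        pthread_mutex_unlock(&src->lock);

        return detached;
}

/* Phase 2: requeue the private list while holding only dst->lock. */
static void attach_all(struct rq *dst, struct task *list)
{
        pthread_mutex_lock(&dst->lock);
        while (list) {
                struct task *next = list->next;

                list->next = dst->head;
                dst->head = list;
                printf("attached task %d\n", list->id);
                list = next;
        }
        pthread_mutex_unlock(&dst->lock);
}

int main(void)
{
        struct task t1 = { 1, NULL }, t2 = { 2, &t1 };
        struct rq src = { PTHREAD_MUTEX_INITIALIZER, &t2 };
        struct rq dst = { PTHREAD_MUTEX_INITIALIZER, NULL };

        attach_all(&dst, detach_all(&src));
        return 0;
}

The kernel version plays the same locking shape with rq->lock and the cfs_tasks/env->tasks lists; the toy mutexes and singly linked list above only mirror that structure.
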
@@ -5559,6 +5626,13 @@ static unsigned long task_h_load(struct task_struct *p)
5559#endif 5626#endif
5560 5627
5561/********** Helpers for find_busiest_group ************************/ 5628/********** Helpers for find_busiest_group ************************/
5629
5630enum group_type {
5631 group_other = 0,
5632 group_imbalanced,
5633 group_overloaded,
5634};
5635
5562/* 5636/*
5563 * sg_lb_stats - stats of a sched_group required for load_balancing 5637 * sg_lb_stats - stats of a sched_group required for load_balancing
5564 */ 5638 */
@@ -5572,7 +5646,7 @@ struct sg_lb_stats {
5572 unsigned int group_capacity_factor; 5646 unsigned int group_capacity_factor;
5573 unsigned int idle_cpus; 5647 unsigned int idle_cpus;
5574 unsigned int group_weight; 5648 unsigned int group_weight;
5575 int group_imb; /* Is there an imbalance in the group ? */ 5649 enum group_type group_type;
5576 int group_has_free_capacity; 5650 int group_has_free_capacity;
5577#ifdef CONFIG_NUMA_BALANCING 5651#ifdef CONFIG_NUMA_BALANCING
5578 unsigned int nr_numa_running; 5652 unsigned int nr_numa_running;
@@ -5610,6 +5684,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
5610 .total_capacity = 0UL, 5684 .total_capacity = 0UL,
5611 .busiest_stat = { 5685 .busiest_stat = {
5612 .avg_load = 0UL, 5686 .avg_load = 0UL,
5687 .sum_nr_running = 0,
5688 .group_type = group_other,
5613 }, 5689 },
5614 }; 5690 };
5615} 5691}
@@ -5652,19 +5728,17 @@ unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
5652 return default_scale_capacity(sd, cpu); 5728 return default_scale_capacity(sd, cpu);
5653} 5729}
5654 5730
5655static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) 5731static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
5656{ 5732{
5657 unsigned long weight = sd->span_weight; 5733 if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
5658 unsigned long smt_gain = sd->smt_gain; 5734 return sd->smt_gain / sd->span_weight;
5659
5660 smt_gain /= weight;
5661 5735
5662 return smt_gain; 5736 return SCHED_CAPACITY_SCALE;
5663} 5737}
5664 5738
5665unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) 5739unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
5666{ 5740{
5667 return default_scale_smt_capacity(sd, cpu); 5741 return default_scale_cpu_capacity(sd, cpu);
5668} 5742}
5669 5743
5670static unsigned long scale_rt_capacity(int cpu) 5744static unsigned long scale_rt_capacity(int cpu)
@@ -5703,18 +5777,15 @@ static unsigned long scale_rt_capacity(int cpu)
5703 5777
5704static void update_cpu_capacity(struct sched_domain *sd, int cpu) 5778static void update_cpu_capacity(struct sched_domain *sd, int cpu)
5705{ 5779{
5706 unsigned long weight = sd->span_weight;
5707 unsigned long capacity = SCHED_CAPACITY_SCALE; 5780 unsigned long capacity = SCHED_CAPACITY_SCALE;
5708 struct sched_group *sdg = sd->groups; 5781 struct sched_group *sdg = sd->groups;
5709 5782
5710 if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { 5783 if (sched_feat(ARCH_CAPACITY))
5711 if (sched_feat(ARCH_CAPACITY)) 5784 capacity *= arch_scale_cpu_capacity(sd, cpu);
5712 capacity *= arch_scale_smt_capacity(sd, cpu); 5785 else
5713 else 5786 capacity *= default_scale_cpu_capacity(sd, cpu);
5714 capacity *= default_scale_smt_capacity(sd, cpu);
5715 5787
5716 capacity >>= SCHED_CAPACITY_SHIFT; 5788 capacity >>= SCHED_CAPACITY_SHIFT;
5717 }
5718 5789
5719 sdg->sgc->capacity_orig = capacity; 5790 sdg->sgc->capacity_orig = capacity;
5720 5791
@@ -5891,6 +5962,18 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
5891 return capacity_factor; 5962 return capacity_factor;
5892} 5963}
5893 5964
5965static enum group_type
5966group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
5967{
5968 if (sgs->sum_nr_running > sgs->group_capacity_factor)
5969 return group_overloaded;
5970
5971 if (sg_imbalanced(group))
5972 return group_imbalanced;
5973
5974 return group_other;
5975}
5976
5894/** 5977/**
5895 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 5978 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
5896 * @env: The load balancing environment. 5979 * @env: The load balancing environment.
@@ -5920,7 +6003,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5920 load = source_load(i, load_idx); 6003 load = source_load(i, load_idx);
5921 6004
5922 sgs->group_load += load; 6005 sgs->group_load += load;
5923 sgs->sum_nr_running += rq->nr_running; 6006 sgs->sum_nr_running += rq->cfs.h_nr_running;
5924 6007
5925 if (rq->nr_running > 1) 6008 if (rq->nr_running > 1)
5926 *overload = true; 6009 *overload = true;
@@ -5942,9 +6025,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5942 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 6025 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
5943 6026
5944 sgs->group_weight = group->group_weight; 6027 sgs->group_weight = group->group_weight;
5945
5946 sgs->group_imb = sg_imbalanced(group);
5947 sgs->group_capacity_factor = sg_capacity_factor(env, group); 6028 sgs->group_capacity_factor = sg_capacity_factor(env, group);
6029 sgs->group_type = group_classify(group, sgs);
5948 6030
5949 if (sgs->group_capacity_factor > sgs->sum_nr_running) 6031 if (sgs->group_capacity_factor > sgs->sum_nr_running)
5950 sgs->group_has_free_capacity = 1; 6032 sgs->group_has_free_capacity = 1;
@@ -5968,13 +6050,19 @@ static bool update_sd_pick_busiest(struct lb_env *env,
5968 struct sched_group *sg, 6050 struct sched_group *sg,
5969 struct sg_lb_stats *sgs) 6051 struct sg_lb_stats *sgs)
5970{ 6052{
5971 if (sgs->avg_load <= sds->busiest_stat.avg_load) 6053 struct sg_lb_stats *busiest = &sds->busiest_stat;
5972 return false;
5973 6054
5974 if (sgs->sum_nr_running > sgs->group_capacity_factor) 6055 if (sgs->group_type > busiest->group_type)
5975 return true; 6056 return true;
5976 6057
5977 if (sgs->group_imb) 6058 if (sgs->group_type < busiest->group_type)
6059 return false;
6060
6061 if (sgs->avg_load <= busiest->avg_load)
6062 return false;
6063
6064 /* This is the busiest node in its class. */
6065 if (!(env->sd->flags & SD_ASYM_PACKING))
5978 return true; 6066 return true;
5979 6067
5980 /* 6068 /*
@@ -5982,8 +6070,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
5982 * numbered CPUs in the group, therefore mark all groups 6070 * numbered CPUs in the group, therefore mark all groups
5983 * higher than ourself as busy. 6071 * higher than ourself as busy.
5984 */ 6072 */
5985 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && 6073 if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
5986 env->dst_cpu < group_first_cpu(sg)) {
5987 if (!sds->busiest) 6074 if (!sds->busiest)
5988 return true; 6075 return true;
5989 6076
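
The rework above drops the boolean group_imb flag in favour of the ordered enum group_type, so update_sd_pick_busiest() can rank groups by how badly off they are and fall back to avg_load only to break ties within a class. A stand-alone sketch of that comparison, with an invented struct rather than the kernel's sg_lb_stats:

#include <stdbool.h>
#include <stdio.h>

/* Ordered from least to most troubled, mirroring the enum in the hunk. */
enum group_type { group_other = 0, group_imbalanced, group_overloaded };

struct sg_stats {
        enum group_type group_type;
        unsigned long avg_load;
};

/* Higher class always wins; avg_load only breaks ties within a class. */
static bool pick_busiest(const struct sg_stats *busiest,
                         const struct sg_stats *candidate)
{
        if (candidate->group_type > busiest->group_type)
                return true;
        if (candidate->group_type < busiest->group_type)
                return false;

        return candidate->avg_load > busiest->avg_load;
}

int main(void)
{
        struct sg_stats busiest = { group_other, 900 };
        struct sg_stats candidate = { group_overloaded, 100 };

        /* An overloaded group is picked even though its avg_load is lower. */
        printf("replace busiest: %s\n",
               pick_busiest(&busiest, &candidate) ? "yes" : "no");
        return 0;
}
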
@@ -6228,7 +6315,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6228 local = &sds->local_stat; 6315 local = &sds->local_stat;
6229 busiest = &sds->busiest_stat; 6316 busiest = &sds->busiest_stat;
6230 6317
6231 if (busiest->group_imb) { 6318 if (busiest->group_type == group_imbalanced) {
6232 /* 6319 /*
6233 * In the group_imb case we cannot rely on group-wide averages 6320 * In the group_imb case we cannot rely on group-wide averages
6234 * to ensure cpu-load equilibrium, look at wider averages. XXX 6321 * to ensure cpu-load equilibrium, look at wider averages. XXX
@@ -6248,12 +6335,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6248 return fix_small_imbalance(env, sds); 6335 return fix_small_imbalance(env, sds);
6249 } 6336 }
6250 6337
6251 if (!busiest->group_imb) { 6338 /*
6252 /* 6339 * If there aren't any idle cpus, avoid creating some.
6253 * Don't want to pull so many tasks that a group would go idle. 6340 */
6254 * Except of course for the group_imb case, since then we might 6341 if (busiest->group_type == group_overloaded &&
6255 * have to drop below capacity to reach cpu-load equilibrium. 6342 local->group_type == group_overloaded) {
6256 */
6257 load_above_capacity = 6343 load_above_capacity =
6258 (busiest->sum_nr_running - busiest->group_capacity_factor); 6344 (busiest->sum_nr_running - busiest->group_capacity_factor);
6259 6345
@@ -6337,7 +6423,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6337 * work because they assume all things are equal, which typically 6423 * work because they assume all things are equal, which typically
6338 * isn't true due to cpus_allowed constraints and the like. 6424 * isn't true due to cpus_allowed constraints and the like.
6339 */ 6425 */
6340 if (busiest->group_imb) 6426 if (busiest->group_type == group_imbalanced)
6341 goto force_balance; 6427 goto force_balance;
6342 6428
6343 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 6429 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
@@ -6346,7 +6432,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6346 goto force_balance; 6432 goto force_balance;
6347 6433
6348 /* 6434 /*
6349 * If the local group is more busy than the selected busiest group 6435 * If the local group is busier than the selected busiest group
6350 * don't try and pull any tasks. 6436 * don't try and pull any tasks.
6351 */ 6437 */
6352 if (local->avg_load >= busiest->avg_load) 6438 if (local->avg_load >= busiest->avg_load)
@@ -6361,13 +6447,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6361 6447
6362 if (env->idle == CPU_IDLE) { 6448 if (env->idle == CPU_IDLE) {
6363 /* 6449 /*
6364 * This cpu is idle. If the busiest group load doesn't 6450 * This cpu is idle. If the busiest group is not overloaded
6365 * have more tasks than the number of available cpu's and 6451 * and there is no imbalance between this and busiest group
6366 * there is no imbalance between this and busiest group 6452 * wrt idle cpus, it is balanced. The imbalance becomes
 6367 * wrt to idle cpu's, it is balanced. 6453 * significant if the diff is greater than 1; otherwise we
 6454 * might end up just moving the imbalance to another group.
6368 */ 6455 */
6369 if ((local->idle_cpus < busiest->idle_cpus) && 6456 if ((busiest->group_type != group_overloaded) &&
6370 busiest->sum_nr_running <= busiest->group_weight) 6457 (local->idle_cpus <= (busiest->idle_cpus + 1)))
6371 goto out_balanced; 6458 goto out_balanced;
6372 } else { 6459 } else {
6373 /* 6460 /*
@@ -6539,7 +6626,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
6539 struct sched_group *group; 6626 struct sched_group *group;
6540 struct rq *busiest; 6627 struct rq *busiest;
6541 unsigned long flags; 6628 unsigned long flags;
6542 struct cpumask *cpus = __get_cpu_var(load_balance_mask); 6629 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
6543 6630
6544 struct lb_env env = { 6631 struct lb_env env = {
6545 .sd = sd, 6632 .sd = sd,
@@ -6550,6 +6637,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
6550 .loop_break = sched_nr_migrate_break, 6637 .loop_break = sched_nr_migrate_break,
6551 .cpus = cpus, 6638 .cpus = cpus,
6552 .fbq_type = all, 6639 .fbq_type = all,
6640 .tasks = LIST_HEAD_INIT(env.tasks),
6553 }; 6641 };
6554 6642
6555 /* 6643 /*
@@ -6599,23 +6687,30 @@ redo:
6599 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 6687 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
6600 6688
6601more_balance: 6689more_balance:
6602 local_irq_save(flags); 6690 raw_spin_lock_irqsave(&busiest->lock, flags);
6603 double_rq_lock(env.dst_rq, busiest);
6604 6691
6605 /* 6692 /*
6606 * cur_ld_moved - load moved in current iteration 6693 * cur_ld_moved - load moved in current iteration
6607 * ld_moved - cumulative load moved across iterations 6694 * ld_moved - cumulative load moved across iterations
6608 */ 6695 */
6609 cur_ld_moved = move_tasks(&env); 6696 cur_ld_moved = detach_tasks(&env);
6610 ld_moved += cur_ld_moved;
6611 double_rq_unlock(env.dst_rq, busiest);
6612 local_irq_restore(flags);
6613 6697
6614 /* 6698 /*
6615 * some other cpu did the load balance for us. 6699 * We've detached some tasks from busiest_rq. Every
6700 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
6701 * unlock busiest->lock, and we are able to be sure
6702 * that nobody can manipulate the tasks in parallel.
6703 * See task_rq_lock() family for the details.
6616 */ 6704 */
6617 if (cur_ld_moved && env.dst_cpu != smp_processor_id()) 6705
6618 resched_cpu(env.dst_cpu); 6706 raw_spin_unlock(&busiest->lock);
6707
6708 if (cur_ld_moved) {
6709 attach_tasks(&env);
6710 ld_moved += cur_ld_moved;
6711 }
6712
6713 local_irq_restore(flags);
6619 6714
6620 if (env.flags & LBF_NEED_BREAK) { 6715 if (env.flags & LBF_NEED_BREAK) {
6621 env.flags &= ~LBF_NEED_BREAK; 6716 env.flags &= ~LBF_NEED_BREAK;
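
The comment added in this hunk carries the main argument: once detach_tasks() has moved tasks onto env.tasks they are marked TASK_ON_RQ_MIGRATING, so busiest->lock can be dropped and nothing else will manipulate those tasks until attach_tasks() queues them again. A toy model of that in-flight marker, plain C rather than kernel code, with can_touch() standing in for the checks done under task_rq_lock():

#include <stdio.h>

enum on_rq_state { NOT_QUEUED = 0, ON_RQ_QUEUED = 1, ON_RQ_MIGRATING = 2 };

struct task { int id; enum on_rq_state on_rq; };

/* Concurrent users only act on tasks they can see as QUEUED. */
static int can_touch(const struct task *p)
{
        return p->on_rq == ON_RQ_QUEUED;
}

int main(void)
{
        struct task p = { 42, ON_RQ_QUEUED };

        p.on_rq = ON_RQ_MIGRATING;      /* detach: leaves the source rq */
        printf("touchable while migrating: %d\n", can_touch(&p));

        p.on_rq = ON_RQ_QUEUED;         /* attach: queued on the destination */
        printf("touchable after attach: %d\n", can_touch(&p));
        return 0;
}
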
@@ -6665,10 +6760,8 @@ more_balance:
6665 if (sd_parent) { 6760 if (sd_parent) {
6666 int *group_imbalance = &sd_parent->groups->sgc->imbalance; 6761 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
6667 6762
6668 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { 6763 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
6669 *group_imbalance = 1; 6764 *group_imbalance = 1;
6670 } else if (*group_imbalance)
6671 *group_imbalance = 0;
6672 } 6765 }
6673 6766
6674 /* All tasks on this runqueue were pinned by CPU affinity */ 6767 /* All tasks on this runqueue were pinned by CPU affinity */
@@ -6679,7 +6772,7 @@ more_balance:
6679 env.loop_break = sched_nr_migrate_break; 6772 env.loop_break = sched_nr_migrate_break;
6680 goto redo; 6773 goto redo;
6681 } 6774 }
6682 goto out_balanced; 6775 goto out_all_pinned;
6683 } 6776 }
6684 } 6777 }
6685 6778
@@ -6744,7 +6837,7 @@ more_balance:
6744 * If we've begun active balancing, start to back off. This 6837 * If we've begun active balancing, start to back off. This
6745 * case may not be covered by the all_pinned logic if there 6838 * case may not be covered by the all_pinned logic if there
6746 * is only 1 task on the busy runqueue (because we don't call 6839 * is only 1 task on the busy runqueue (because we don't call
6747 * move_tasks). 6840 * detach_tasks).
6748 */ 6841 */
6749 if (sd->balance_interval < sd->max_interval) 6842 if (sd->balance_interval < sd->max_interval)
6750 sd->balance_interval *= 2; 6843 sd->balance_interval *= 2;
@@ -6753,6 +6846,23 @@ more_balance:
6753 goto out; 6846 goto out;
6754 6847
6755out_balanced: 6848out_balanced:
6849 /*
6850 * We reach balance although we may have faced some affinity
6851 * constraints. Clear the imbalance flag if it was set.
6852 */
6853 if (sd_parent) {
6854 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
6855
6856 if (*group_imbalance)
6857 *group_imbalance = 0;
6858 }
6859
6860out_all_pinned:
6861 /*
6862 * We reach balance because all tasks are pinned at this level so
6863 * we can't migrate them. Let the imbalance flag set so parent level
6864 * can try to migrate them.
6865 */
6756 schedstat_inc(sd, lb_balanced[idle]); 6866 schedstat_inc(sd, lb_balanced[idle]);
6757 6867
6758 sd->nr_balance_failed = 0; 6868 sd->nr_balance_failed = 0;
@@ -6914,6 +7024,7 @@ static int active_load_balance_cpu_stop(void *data)
6914 int target_cpu = busiest_rq->push_cpu; 7024 int target_cpu = busiest_rq->push_cpu;
6915 struct rq *target_rq = cpu_rq(target_cpu); 7025 struct rq *target_rq = cpu_rq(target_cpu);
6916 struct sched_domain *sd; 7026 struct sched_domain *sd;
7027 struct task_struct *p = NULL;
6917 7028
6918 raw_spin_lock_irq(&busiest_rq->lock); 7029 raw_spin_lock_irq(&busiest_rq->lock);
6919 7030
@@ -6933,9 +7044,6 @@ static int active_load_balance_cpu_stop(void *data)
6933 */ 7044 */
6934 BUG_ON(busiest_rq == target_rq); 7045 BUG_ON(busiest_rq == target_rq);
6935 7046
6936 /* move a task from busiest_rq to target_rq */
6937 double_lock_balance(busiest_rq, target_rq);
6938
6939 /* Search for an sd spanning us and the target CPU. */ 7047 /* Search for an sd spanning us and the target CPU. */
6940 rcu_read_lock(); 7048 rcu_read_lock();
6941 for_each_domain(target_cpu, sd) { 7049 for_each_domain(target_cpu, sd) {
@@ -6956,16 +7064,22 @@ static int active_load_balance_cpu_stop(void *data)
6956 7064
6957 schedstat_inc(sd, alb_count); 7065 schedstat_inc(sd, alb_count);
6958 7066
6959 if (move_one_task(&env)) 7067 p = detach_one_task(&env);
7068 if (p)
6960 schedstat_inc(sd, alb_pushed); 7069 schedstat_inc(sd, alb_pushed);
6961 else 7070 else
6962 schedstat_inc(sd, alb_failed); 7071 schedstat_inc(sd, alb_failed);
6963 } 7072 }
6964 rcu_read_unlock(); 7073 rcu_read_unlock();
6965 double_unlock_balance(busiest_rq, target_rq);
6966out_unlock: 7074out_unlock:
6967 busiest_rq->active_balance = 0; 7075 busiest_rq->active_balance = 0;
6968 raw_spin_unlock_irq(&busiest_rq->lock); 7076 raw_spin_unlock(&busiest_rq->lock);
7077
7078 if (p)
7079 attach_one_task(target_rq, p);
7080
7081 local_irq_enable();
7082
6969 return 0; 7083 return 0;
6970} 7084}
6971 7085
@@ -7465,7 +7579,7 @@ static void task_fork_fair(struct task_struct *p)
7465static void 7579static void
7466prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) 7580prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7467{ 7581{
7468 if (!p->se.on_rq) 7582 if (!task_on_rq_queued(p))
7469 return; 7583 return;
7470 7584
7471 /* 7585 /*
@@ -7490,11 +7604,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7490 * switched back to the fair class the enqueue_entity(.flags=0) will 7604 * switched back to the fair class the enqueue_entity(.flags=0) will
7491 * do the right thing. 7605 * do the right thing.
7492 * 7606 *
7493 * If it's on_rq, then the dequeue_entity(.flags=0) will already 7607 * If it's queued, then the dequeue_entity(.flags=0) will already
7494 * have normalized the vruntime, if it's !on_rq, then only when 7608 * have normalized the vruntime, if it's !queued, then only when
7495 * the task is sleeping will it still have non-normalized vruntime. 7609 * the task is sleeping will it still have non-normalized vruntime.
7496 */ 7610 */
7497 if (!p->on_rq && p->state != TASK_RUNNING) { 7611 if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
7498 /* 7612 /*
7499 * Fix up our vruntime so that the current sleep doesn't 7613 * Fix up our vruntime so that the current sleep doesn't
7500 * cause 'unlimited' sleep bonus. 7614 * cause 'unlimited' sleep bonus.
@@ -7521,15 +7635,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7521 */ 7635 */
7522static void switched_to_fair(struct rq *rq, struct task_struct *p) 7636static void switched_to_fair(struct rq *rq, struct task_struct *p)
7523{ 7637{
7524 struct sched_entity *se = &p->se;
7525#ifdef CONFIG_FAIR_GROUP_SCHED 7638#ifdef CONFIG_FAIR_GROUP_SCHED
7639 struct sched_entity *se = &p->se;
7526 /* 7640 /*
7527 * Since the real-depth could have been changed (only FAIR 7641 * Since the real-depth could have been changed (only FAIR
7528 * class maintain depth value), reset depth properly. 7642 * class maintain depth value), reset depth properly.
7529 */ 7643 */
7530 se->depth = se->parent ? se->parent->depth + 1 : 0; 7644 se->depth = se->parent ? se->parent->depth + 1 : 0;
7531#endif 7645#endif
7532 if (!se->on_rq) 7646 if (!task_on_rq_queued(p))
7533 return; 7647 return;
7534 7648
7535 /* 7649 /*
@@ -7575,7 +7689,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
7575} 7689}
7576 7690
7577#ifdef CONFIG_FAIR_GROUP_SCHED 7691#ifdef CONFIG_FAIR_GROUP_SCHED
7578static void task_move_group_fair(struct task_struct *p, int on_rq) 7692static void task_move_group_fair(struct task_struct *p, int queued)
7579{ 7693{
7580 struct sched_entity *se = &p->se; 7694 struct sched_entity *se = &p->se;
7581 struct cfs_rq *cfs_rq; 7695 struct cfs_rq *cfs_rq;
@@ -7594,7 +7708,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7594 * fair sleeper stuff for the first placement, but who cares. 7708 * fair sleeper stuff for the first placement, but who cares.
7595 */ 7709 */
7596 /* 7710 /*
7597 * When !on_rq, vruntime of the task has usually NOT been normalized. 7711 * When !queued, vruntime of the task has usually NOT been normalized.
7598 * But there are some cases where it has already been normalized: 7712 * But there are some cases where it has already been normalized:
7599 * 7713 *
7600 * - Moving a forked child which is waiting for being woken up by 7714 * - Moving a forked child which is waiting for being woken up by
@@ -7605,14 +7719,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7605 * To prevent boost or penalty in the new cfs_rq caused by delta 7719 * To prevent boost or penalty in the new cfs_rq caused by delta
7606 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. 7720 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7607 */ 7721 */
7608 if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) 7722 if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
7609 on_rq = 1; 7723 queued = 1;
7610 7724
7611 if (!on_rq) 7725 if (!queued)
7612 se->vruntime -= cfs_rq_of(se)->min_vruntime; 7726 se->vruntime -= cfs_rq_of(se)->min_vruntime;
7613 set_task_rq(p, task_cpu(p)); 7727 set_task_rq(p, task_cpu(p));
7614 se->depth = se->parent ? se->parent->depth + 1 : 0; 7728 se->depth = se->parent ? se->parent->depth + 1 : 0;
7615 if (!on_rq) { 7729 if (!queued) {
7616 cfs_rq = cfs_rq_of(se); 7730 cfs_rq = cfs_rq_of(se);
7617 se->vruntime += cfs_rq->min_vruntime; 7731 se->vruntime += cfs_rq->min_vruntime;
7618#ifdef CONFIG_SMP 7732#ifdef CONFIG_SMP
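
At the end of this file, task_move_group_fair() keeps its existing vruntime handling: a task that is not queued has its vruntime made relative by subtracting the old cfs_rq's min_vruntime and made absolute again by adding the new one. A few lines of arithmetic, with invented numbers, show why:

#include <stdio.h>

int main(void)
{
        unsigned long long vruntime = 1050;     /* task, on the old cfs_rq   */
        unsigned long long old_min  = 1000;     /* old cfs_rq->min_vruntime  */
        unsigned long long new_min  = 5000;     /* new cfs_rq->min_vruntime  */

        vruntime -= old_min;    /* relative lag: 50                     */
        vruntime += new_min;    /* absolute again on the new queue: 5050 */

        /* The task keeps its 50 units of lag instead of gaining a
         * 4000-unit bonus or penalty from the min_vruntime difference. */
        printf("vruntime on new cfs_rq: %llu\n", vruntime);
        return 0;
}
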
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 11e7bc434f43..c47fce75e666 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -147,6 +147,9 @@ use_default:
147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) 147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
148 goto use_default; 148 goto use_default;
149 149
150 /* Take note of the planned idle state. */
151 idle_set_state(this_rq(), &drv->states[next_state]);
152
150 /* 153 /*
151 * Enter the idle state previously returned by the governor decision. 154 * Enter the idle state previously returned by the governor decision.
152 * This function will block until an interrupt occurs and will take 155 * This function will block until an interrupt occurs and will take
@@ -154,6 +157,9 @@ use_default:
154 */ 157 */
155 entered_state = cpuidle_enter(drv, dev, next_state); 158 entered_state = cpuidle_enter(drv, dev, next_state);
156 159
160 /* The cpu is no longer idle or about to enter idle. */
161 idle_set_state(this_rq(), NULL);
162
157 if (broadcast) 163 if (broadcast)
158 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); 164 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
159 165
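
The two small additions above bracket the cpuidle call: the runqueue's idle_state pointer is published just before cpuidle_enter() and cleared as soon as the cpu is back, so a remote observer can see which idle state an idle cpu planned to enter. A user-space sketch of that publish-then-withdraw bracketing, with invented names:

#include <stddef.h>
#include <stdio.h>
#include <unistd.h>

struct idle_state { const char *name; };

/* Published "current plan" that observers may read while we sleep. */
static struct idle_state *planned_state;

static void enter_idle(struct idle_state *state)
{
        planned_state = state;          /* take note of the planned state */
        sleep(1);                       /* stand-in for cpuidle_enter()   */
        planned_state = NULL;           /* no longer idle                 */
}

int main(void)
{
        struct idle_state deep = { "C6" };

        enter_idle(&deep);
        printf("after wakeup, planned state: %s\n",
               planned_state ? planned_state->name : "(none)");
        return 0;
}
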
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
index 16f5a30f9c88..8ecd552fe4f2 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/proc.c
@@ -8,13 +8,6 @@
8 8
9#include "sched.h" 9#include "sched.h"
10 10
11unsigned long this_cpu_load(void)
12{
13 struct rq *this = this_rq();
14 return this->cpu_load[0];
15}
16
17
18/* 11/*
19 * Global load-average calculations 12 * Global load-average calculations
20 * 13 *
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 5f6edca4fafd..d024e6ce30ba 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1448,7 +1448,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1448 * means a dl or stop task can slip in, in which case we need 1448 * means a dl or stop task can slip in, in which case we need
1449 * to re-start task selection. 1449 * to re-start task selection.
1450 */ 1450 */
1451 if (unlikely((rq->stop && rq->stop->on_rq) || 1451 if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
1452 rq->dl.dl_nr_running)) 1452 rq->dl.dl_nr_running))
1453 return RETRY_TASK; 1453 return RETRY_TASK;
1454 } 1454 }
@@ -1468,8 +1468,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1468 p = _pick_next_task_rt(rq); 1468 p = _pick_next_task_rt(rq);
1469 1469
1470 /* The running task is never eligible for pushing */ 1470 /* The running task is never eligible for pushing */
1471 if (p) 1471 dequeue_pushable_task(rq, p);
1472 dequeue_pushable_task(rq, p);
1473 1472
1474 set_post_schedule(rq); 1473 set_post_schedule(rq);
1475 1474
@@ -1526,7 +1525,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1526static int find_lowest_rq(struct task_struct *task) 1525static int find_lowest_rq(struct task_struct *task)
1527{ 1526{
1528 struct sched_domain *sd; 1527 struct sched_domain *sd;
1529 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); 1528 struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1530 int this_cpu = smp_processor_id(); 1529 int this_cpu = smp_processor_id();
1531 int cpu = task_cpu(task); 1530 int cpu = task_cpu(task);
1532 1531
@@ -1624,7 +1623,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1624 !cpumask_test_cpu(lowest_rq->cpu, 1623 !cpumask_test_cpu(lowest_rq->cpu,
1625 tsk_cpus_allowed(task)) || 1624 tsk_cpus_allowed(task)) ||
1626 task_running(rq, task) || 1625 task_running(rq, task) ||
1627 !task->on_rq)) { 1626 !task_on_rq_queued(task))) {
1628 1627
1629 double_unlock_balance(rq, lowest_rq); 1628 double_unlock_balance(rq, lowest_rq);
1630 lowest_rq = NULL; 1629 lowest_rq = NULL;
@@ -1658,7 +1657,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1658 BUG_ON(task_current(rq, p)); 1657 BUG_ON(task_current(rq, p));
1659 BUG_ON(p->nr_cpus_allowed <= 1); 1658 BUG_ON(p->nr_cpus_allowed <= 1);
1660 1659
1661 BUG_ON(!p->on_rq); 1660 BUG_ON(!task_on_rq_queued(p));
1662 BUG_ON(!rt_task(p)); 1661 BUG_ON(!rt_task(p));
1663 1662
1664 return p; 1663 return p;
@@ -1809,7 +1808,7 @@ static int pull_rt_task(struct rq *this_rq)
1809 */ 1808 */
1810 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 1809 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1811 WARN_ON(p == src_rq->curr); 1810 WARN_ON(p == src_rq->curr);
1812 WARN_ON(!p->on_rq); 1811 WARN_ON(!task_on_rq_queued(p));
1813 1812
1814 /* 1813 /*
1815 * There's a chance that p is higher in priority 1814 * There's a chance that p is higher in priority
@@ -1870,7 +1869,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1870 1869
1871 BUG_ON(!rt_task(p)); 1870 BUG_ON(!rt_task(p));
1872 1871
1873 if (!p->on_rq) 1872 if (!task_on_rq_queued(p))
1874 return; 1873 return;
1875 1874
1876 weight = cpumask_weight(new_mask); 1875 weight = cpumask_weight(new_mask);
@@ -1936,7 +1935,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1936 * we may need to handle the pulling of RT tasks 1935 * we may need to handle the pulling of RT tasks
1937 * now. 1936 * now.
1938 */ 1937 */
1939 if (!p->on_rq || rq->rt.rt_nr_running) 1938 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
1940 return; 1939 return;
1941 1940
1942 if (pull_rt_task(rq)) 1941 if (pull_rt_task(rq))
@@ -1970,7 +1969,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1970 * If that current running task is also an RT task 1969 * If that current running task is also an RT task
1971 * then see if we can move to another run queue. 1970 * then see if we can move to another run queue.
1972 */ 1971 */
1973 if (p->on_rq && rq->curr != p) { 1972 if (task_on_rq_queued(p) && rq->curr != p) {
1974#ifdef CONFIG_SMP 1973#ifdef CONFIG_SMP
1975 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && 1974 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
1976 /* Don't resched if we changed runqueues */ 1975 /* Don't resched if we changed runqueues */
@@ -1989,7 +1988,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1989static void 1988static void
1990prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) 1989prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1991{ 1990{
1992 if (!p->on_rq) 1991 if (!task_on_rq_queued(p))
1993 return; 1992 return;
1994 1993
1995 if (rq->curr == p) { 1994 if (rq->curr == p) {
@@ -2073,7 +2072,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2073 for_each_sched_rt_entity(rt_se) { 2072 for_each_sched_rt_entity(rt_se) {
2074 if (rt_se->run_list.prev != rt_se->run_list.next) { 2073 if (rt_se->run_list.prev != rt_se->run_list.next) {
2075 requeue_task_rt(rq, p, 0); 2074 requeue_task_rt(rq, p, 0);
2076 set_tsk_need_resched(p); 2075 resched_curr(rq);
2077 return; 2076 return;
2078 } 2077 }
2079 } 2078 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 579712f4e9d5..24156c8434d1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -14,6 +14,11 @@
14#include "cpuacct.h" 14#include "cpuacct.h"
15 15
16struct rq; 16struct rq;
17struct cpuidle_state;
18
19/* task_struct::on_rq states: */
20#define TASK_ON_RQ_QUEUED 1
21#define TASK_ON_RQ_MIGRATING 2
17 22
18extern __read_mostly int scheduler_running; 23extern __read_mostly int scheduler_running;
19 24
@@ -126,6 +131,9 @@ struct rt_bandwidth {
126 u64 rt_runtime; 131 u64 rt_runtime;
127 struct hrtimer rt_period_timer; 132 struct hrtimer rt_period_timer;
128}; 133};
134
135void __dl_clear_params(struct task_struct *p);
136
129/* 137/*
130 * To keep the bandwidth of -deadline tasks and groups under control 138 * To keep the bandwidth of -deadline tasks and groups under control
131 * we need some place where: 139 * we need some place where:
@@ -184,7 +192,7 @@ struct cfs_bandwidth {
184 raw_spinlock_t lock; 192 raw_spinlock_t lock;
185 ktime_t period; 193 ktime_t period;
186 u64 quota, runtime; 194 u64 quota, runtime;
187 s64 hierarchal_quota; 195 s64 hierarchical_quota;
188 u64 runtime_expires; 196 u64 runtime_expires;
189 197
190 int idle, timer_active; 198 int idle, timer_active;
@@ -636,6 +644,11 @@ struct rq {
636#ifdef CONFIG_SMP 644#ifdef CONFIG_SMP
637 struct llist_head wake_list; 645 struct llist_head wake_list;
638#endif 646#endif
647
648#ifdef CONFIG_CPU_IDLE
649 /* Must be inspected within a rcu lock section */
650 struct cpuidle_state *idle_state;
651#endif
639}; 652};
640 653
641static inline int cpu_of(struct rq *rq) 654static inline int cpu_of(struct rq *rq)
@@ -647,13 +660,13 @@ static inline int cpu_of(struct rq *rq)
647#endif 660#endif
648} 661}
649 662
650DECLARE_PER_CPU(struct rq, runqueues); 663DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
651 664
652#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 665#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
653#define this_rq() (&__get_cpu_var(runqueues)) 666#define this_rq() this_cpu_ptr(&runqueues)
654#define task_rq(p) cpu_rq(task_cpu(p)) 667#define task_rq(p) cpu_rq(task_cpu(p))
655#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 668#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
656#define raw_rq() (&__raw_get_cpu_var(runqueues)) 669#define raw_rq() raw_cpu_ptr(&runqueues)
657 670
658static inline u64 rq_clock(struct rq *rq) 671static inline u64 rq_clock(struct rq *rq)
659{ 672{
@@ -942,6 +955,15 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
942#endif 955#endif
943} 956}
944 957
958static inline int task_on_rq_queued(struct task_struct *p)
959{
960 return p->on_rq == TASK_ON_RQ_QUEUED;
961}
962
963static inline int task_on_rq_migrating(struct task_struct *p)
964{
965 return p->on_rq == TASK_ON_RQ_MIGRATING;
966}
945 967
946#ifndef prepare_arch_switch 968#ifndef prepare_arch_switch
947# define prepare_arch_switch(next) do { } while (0) 969# define prepare_arch_switch(next) do { } while (0)
@@ -953,7 +975,6 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
953# define finish_arch_post_lock_switch() do { } while (0) 975# define finish_arch_post_lock_switch() do { } while (0)
954#endif 976#endif
955 977
956#ifndef __ARCH_WANT_UNLOCKED_CTXSW
957static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 978static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
958{ 979{
959#ifdef CONFIG_SMP 980#ifdef CONFIG_SMP
@@ -991,35 +1012,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
991 raw_spin_unlock_irq(&rq->lock); 1012 raw_spin_unlock_irq(&rq->lock);
992} 1013}
993 1014
994#else /* __ARCH_WANT_UNLOCKED_CTXSW */
995static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
996{
997#ifdef CONFIG_SMP
998 /*
999 * We can optimise this out completely for !SMP, because the
1000 * SMP rebalancing from interrupt is the only thing that cares
1001 * here.
1002 */
1003 next->on_cpu = 1;
1004#endif
1005 raw_spin_unlock(&rq->lock);
1006}
1007
1008static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1009{
1010#ifdef CONFIG_SMP
1011 /*
1012 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1013 * We must ensure this doesn't happen until the switch is completely
1014 * finished.
1015 */
1016 smp_wmb();
1017 prev->on_cpu = 0;
1018#endif
1019 local_irq_enable();
1020}
1021#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1022
1023/* 1015/*
1024 * wake flags 1016 * wake flags
1025 */ 1017 */
@@ -1180,6 +1172,30 @@ static inline void idle_exit_fair(struct rq *rq) { }
1180 1172
1181#endif 1173#endif
1182 1174
1175#ifdef CONFIG_CPU_IDLE
1176static inline void idle_set_state(struct rq *rq,
1177 struct cpuidle_state *idle_state)
1178{
1179 rq->idle_state = idle_state;
1180}
1181
1182static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1183{
1184 WARN_ON(!rcu_read_lock_held());
1185 return rq->idle_state;
1186}
1187#else
1188static inline void idle_set_state(struct rq *rq,
1189 struct cpuidle_state *idle_state)
1190{
1191}
1192
1193static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1194{
1195 return NULL;
1196}
1197#endif
1198
1183extern void sysrq_sched_debug_show(void); 1199extern void sysrq_sched_debug_show(void);
1184extern void sched_init_granularity(void); 1200extern void sched_init_granularity(void);
1185extern void update_max_interval(void); 1201extern void update_max_interval(void);
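
Besides the TASK_ON_RQ_* states, the header now stashes a cpuidle_state pointer in struct rq behind CONFIG_CPU_IDLE and pairs the real idle_set_state()/idle_get_state() accessors with no-op fallbacks, so callers never need their own #ifdef. A compressed illustration of that stub pattern; FEATURE_X and the accessors are invented for the example:

#include <stdio.h>

struct thing { int value; };

#ifdef FEATURE_X
static struct thing *current_thing;

static inline void set_thing(struct thing *t) { current_thing = t; }
static inline struct thing *get_thing(void) { return current_thing; }
#else
/* Fallbacks keep every caller compiling when the feature is off. */
static inline void set_thing(struct thing *t) { (void)t; }
static inline struct thing *get_thing(void) { return NULL; }
#endif

int main(void)
{
        struct thing t = { 7 };

        set_thing(&t);
        printf("have thing: %s\n", get_thing() ? "yes" : "no");
        return 0;
}

Built as-is the program prints "no"; compiling with -DFEATURE_X flips it to "yes", which is the property the header relies on.
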
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index bfe0edadbfbb..67426e529f59 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
28{ 28{
29 struct task_struct *stop = rq->stop; 29 struct task_struct *stop = rq->stop;
30 30
31 if (!stop || !stop->on_rq) 31 if (!stop || !task_on_rq_queued(stop))
32 return NULL; 32 return NULL;
33 33
34 put_prev_task(rq, prev); 34 put_prev_task(rq, prev);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 15cab1a4f84e..5a62915f47a8 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -343,6 +343,18 @@ int __sched out_of_line_wait_on_bit(void *word, int bit,
343} 343}
344EXPORT_SYMBOL(out_of_line_wait_on_bit); 344EXPORT_SYMBOL(out_of_line_wait_on_bit);
345 345
346int __sched out_of_line_wait_on_bit_timeout(
347 void *word, int bit, wait_bit_action_f *action,
348 unsigned mode, unsigned long timeout)
349{
350 wait_queue_head_t *wq = bit_waitqueue(word, bit);
351 DEFINE_WAIT_BIT(wait, word, bit);
352
353 wait.key.timeout = jiffies + timeout;
354 return __wait_on_bit(wq, &wait, action, mode);
355}
356EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
357
346int __sched 358int __sched
347__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, 359__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
348 wait_bit_action_f *action, unsigned mode) 360 wait_bit_action_f *action, unsigned mode)
@@ -520,3 +532,27 @@ __sched int bit_wait_io(struct wait_bit_key *word)
520 return 0; 532 return 0;
521} 533}
522EXPORT_SYMBOL(bit_wait_io); 534EXPORT_SYMBOL(bit_wait_io);
535
536__sched int bit_wait_timeout(struct wait_bit_key *word)
537{
538 unsigned long now = ACCESS_ONCE(jiffies);
539 if (signal_pending_state(current->state, current))
540 return 1;
541 if (time_after_eq(now, word->timeout))
542 return -EAGAIN;
543 schedule_timeout(word->timeout - now);
544 return 0;
545}
546EXPORT_SYMBOL_GPL(bit_wait_timeout);
547
548__sched int bit_wait_io_timeout(struct wait_bit_key *word)
549{
550 unsigned long now = ACCESS_ONCE(jiffies);
551 if (signal_pending_state(current->state, current))
552 return 1;
553 if (time_after_eq(now, word->timeout))
554 return -EAGAIN;
555 io_schedule_timeout(word->timeout - now);
556 return 0;
557}
558EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
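
The new timeout-aware wait actions above all follow one shape: return 1 if a signal is pending, return -EAGAIN once the stored deadline has passed, and otherwise sleep only for the time that remains. A user-space model of that decision, with jiffies, the signal check and the sleep all simulated (the real code also uses time_after_eq() to survive jiffies wrap, which this sketch ignores):

#include <errno.h>
#include <stdio.h>

/* Simulated environment: a "jiffies" counter and a pending-signal flag. */
static unsigned long jiffies;
static int signal_pending;

static int bit_wait_timeout_model(unsigned long deadline)
{
        unsigned long now = jiffies;

        if (signal_pending)
                return 1;               /* interrupted                 */
        if (now >= deadline)
                return -EAGAIN;         /* timed out                   */
        jiffies += deadline - now;      /* "sleep" for the remainder   */
        return 0;                       /* woken, or the timer expired */
}

int main(void)
{
        unsigned long deadline = 100;

        printf("first wait:  %d\n", bit_wait_timeout_model(deadline));
        printf("second wait: %d\n", bit_wait_timeout_model(deadline));
        return 0;
}
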
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 25b0043f4755..4ef9687ac115 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -21,10 +21,11 @@
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/syscalls.h> 22#include <linux/syscalls.h>
23 23
24/* #define SECCOMP_DEBUG 1 */ 24#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
25#include <asm/syscall.h>
26#endif
25 27
26#ifdef CONFIG_SECCOMP_FILTER 28#ifdef CONFIG_SECCOMP_FILTER
27#include <asm/syscall.h>
28#include <linux/filter.h> 29#include <linux/filter.h>
29#include <linux/pid.h> 30#include <linux/pid.h>
30#include <linux/ptrace.h> 31#include <linux/ptrace.h>
@@ -172,10 +173,10 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
172 * 173 *
173 * Returns valid seccomp BPF response codes. 174 * Returns valid seccomp BPF response codes.
174 */ 175 */
175static u32 seccomp_run_filters(int syscall) 176static u32 seccomp_run_filters(struct seccomp_data *sd)
176{ 177{
177 struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); 178 struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
178 struct seccomp_data sd; 179 struct seccomp_data sd_local;
179 u32 ret = SECCOMP_RET_ALLOW; 180 u32 ret = SECCOMP_RET_ALLOW;
180 181
181 /* Ensure unexpected behavior doesn't result in failing open. */ 182 /* Ensure unexpected behavior doesn't result in failing open. */
@@ -185,14 +186,17 @@ static u32 seccomp_run_filters(int syscall)
185 /* Make sure cross-thread synced filter points somewhere sane. */ 186 /* Make sure cross-thread synced filter points somewhere sane. */
186 smp_read_barrier_depends(); 187 smp_read_barrier_depends();
187 188
188 populate_seccomp_data(&sd); 189 if (!sd) {
190 populate_seccomp_data(&sd_local);
191 sd = &sd_local;
192 }
189 193
190 /* 194 /*
191 * All filters in the list are evaluated and the lowest BPF return 195 * All filters in the list are evaluated and the lowest BPF return
192 * value always takes priority (ignoring the DATA). 196 * value always takes priority (ignoring the DATA).
193 */ 197 */
194 for (; f; f = f->prev) { 198 for (; f; f = f->prev) {
195 u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd); 199 u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);
196 200
197 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) 201 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
198 ret = cur_ret; 202 ret = cur_ret;
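
The comment retained above states the aggregation rule for stacked filters: every filter in the list is evaluated and the numerically lowest return action takes priority, ignoring the data bits. A small model of that precedence; the action values are written to mirror the uapi seccomp header of this era, but treat them as illustrative:

#include <stdio.h>

#define RET_KILL        0x00000000u
#define RET_TRAP        0x00030000u
#define RET_ERRNO       0x00050000u
#define RET_ALLOW       0x7fff0000u
#define RET_ACTION      0x7fff0000u     /* action mask */

/* Walk every filter result; the numerically lowest action wins. */
static unsigned int run_filters(const unsigned int *results, int n)
{
        unsigned int ret = RET_ALLOW;
        int i;

        for (i = 0; i < n; i++)
                if ((results[i] & RET_ACTION) < (ret & RET_ACTION))
                        ret = results[i];

        return ret;
}

int main(void)
{
        unsigned int stacked[] = { RET_ALLOW, RET_ERRNO | 13, RET_TRAP };

        /* TRAP < ERRNO < ALLOW, so the trap verdict wins. */
        printf("final action: %#x\n", run_filters(stacked, 3) & RET_ACTION);
        return 0;
}
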
@@ -203,7 +207,7 @@ static u32 seccomp_run_filters(int syscall)
203 207
204static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) 208static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
205{ 209{
206 BUG_ON(!spin_is_locked(&current->sighand->siglock)); 210 assert_spin_locked(&current->sighand->siglock);
207 211
208 if (current->seccomp.mode && current->seccomp.mode != seccomp_mode) 212 if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
209 return false; 213 return false;
@@ -214,7 +218,7 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
214static inline void seccomp_assign_mode(struct task_struct *task, 218static inline void seccomp_assign_mode(struct task_struct *task,
215 unsigned long seccomp_mode) 219 unsigned long seccomp_mode)
216{ 220{
217 BUG_ON(!spin_is_locked(&task->sighand->siglock)); 221 assert_spin_locked(&task->sighand->siglock);
218 222
219 task->seccomp.mode = seccomp_mode; 223 task->seccomp.mode = seccomp_mode;
220 /* 224 /*
@@ -253,7 +257,7 @@ static inline pid_t seccomp_can_sync_threads(void)
253 struct task_struct *thread, *caller; 257 struct task_struct *thread, *caller;
254 258
255 BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex)); 259 BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
256 BUG_ON(!spin_is_locked(&current->sighand->siglock)); 260 assert_spin_locked(&current->sighand->siglock);
257 261
258 /* Validate all threads being eligible for synchronization. */ 262 /* Validate all threads being eligible for synchronization. */
259 caller = current; 263 caller = current;
@@ -294,7 +298,7 @@ static inline void seccomp_sync_threads(void)
294 struct task_struct *thread, *caller; 298 struct task_struct *thread, *caller;
295 299
296 BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex)); 300 BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
297 BUG_ON(!spin_is_locked(&current->sighand->siglock)); 301 assert_spin_locked(&current->sighand->siglock);
298 302
299 /* Synchronize all threads. */ 303 /* Synchronize all threads. */
300 caller = current; 304 caller = current;
@@ -395,16 +399,15 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
395 if (!filter) 399 if (!filter)
396 goto free_prog; 400 goto free_prog;
397 401
398 filter->prog = kzalloc(bpf_prog_size(new_len), 402 filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN);
399 GFP_KERNEL|__GFP_NOWARN);
400 if (!filter->prog) 403 if (!filter->prog)
401 goto free_filter; 404 goto free_filter;
402 405
403 ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); 406 ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
404 if (ret) 407 if (ret)
405 goto free_filter_prog; 408 goto free_filter_prog;
406 kfree(fp);
407 409
410 kfree(fp);
408 atomic_set(&filter->usage, 1); 411 atomic_set(&filter->usage, 1);
409 filter->prog->len = new_len; 412 filter->prog->len = new_len;
410 413
@@ -413,7 +416,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
413 return filter; 416 return filter;
414 417
415free_filter_prog: 418free_filter_prog:
416 kfree(filter->prog); 419 __bpf_prog_free(filter->prog);
417free_filter: 420free_filter:
418 kfree(filter); 421 kfree(filter);
419free_prog: 422free_prog:
@@ -464,7 +467,7 @@ static long seccomp_attach_filter(unsigned int flags,
464 unsigned long total_insns; 467 unsigned long total_insns;
465 struct seccomp_filter *walker; 468 struct seccomp_filter *walker;
466 469
467 BUG_ON(!spin_is_locked(&current->sighand->siglock)); 470 assert_spin_locked(&current->sighand->siglock);
468 471
469 /* Validate resulting filter length. */ 472 /* Validate resulting filter length. */
470 total_insns = filter->prog->len; 473 total_insns = filter->prog->len;
@@ -564,11 +567,55 @@ static int mode1_syscalls_32[] = {
564}; 567};
565#endif 568#endif
566 569
567int __secure_computing(int this_syscall) 570static void __secure_computing_strict(int this_syscall)
571{
572 int *syscall_whitelist = mode1_syscalls;
573#ifdef CONFIG_COMPAT
574 if (is_compat_task())
575 syscall_whitelist = mode1_syscalls_32;
576#endif
577 do {
578 if (*syscall_whitelist == this_syscall)
579 return;
580 } while (*++syscall_whitelist);
581
582#ifdef SECCOMP_DEBUG
583 dump_stack();
584#endif
585 audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL);
586 do_exit(SIGKILL);
587}
588
589#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
590void secure_computing_strict(int this_syscall)
591{
592 int mode = current->seccomp.mode;
593
594 if (mode == 0)
595 return;
596 else if (mode == SECCOMP_MODE_STRICT)
597 __secure_computing_strict(this_syscall);
598 else
599 BUG();
600}
601#else
602int __secure_computing(void)
603{
604 u32 phase1_result = seccomp_phase1(NULL);
605
606 if (likely(phase1_result == SECCOMP_PHASE1_OK))
607 return 0;
608 else if (likely(phase1_result == SECCOMP_PHASE1_SKIP))
609 return -1;
610 else
611 return seccomp_phase2(phase1_result);
612}
613
614#ifdef CONFIG_SECCOMP_FILTER
615static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
568{ 616{
569 int exit_sig = 0; 617 u32 filter_ret, action;
570 int *syscall; 618 int data;
571 u32 ret;
572 619
573 /* 620 /*
574 * Make sure that any changes to mode from another thread have 621 * Make sure that any changes to mode from another thread have
@@ -576,85 +623,127 @@ int __secure_computing(int this_syscall)
576 */ 623 */
577 rmb(); 624 rmb();
578 625
579 switch (current->seccomp.mode) { 626 filter_ret = seccomp_run_filters(sd);
580 case SECCOMP_MODE_STRICT: 627 data = filter_ret & SECCOMP_RET_DATA;
581 syscall = mode1_syscalls; 628 action = filter_ret & SECCOMP_RET_ACTION;
582#ifdef CONFIG_COMPAT 629
583 if (is_compat_task()) 630 switch (action) {
584 syscall = mode1_syscalls_32; 631 case SECCOMP_RET_ERRNO:
 632 /* Set the low-order 16 bits as an errno. */
633 syscall_set_return_value(current, task_pt_regs(current),
634 -data, 0);
635 goto skip;
636
637 case SECCOMP_RET_TRAP:
638 /* Show the handler the original registers. */
639 syscall_rollback(current, task_pt_regs(current));
640 /* Let the filter pass back 16 bits of data. */
641 seccomp_send_sigsys(this_syscall, data);
642 goto skip;
643
644 case SECCOMP_RET_TRACE:
645 return filter_ret; /* Save the rest for phase 2. */
646
647 case SECCOMP_RET_ALLOW:
648 return SECCOMP_PHASE1_OK;
649
650 case SECCOMP_RET_KILL:
651 default:
652 audit_seccomp(this_syscall, SIGSYS, action);
653 do_exit(SIGSYS);
654 }
655
656 unreachable();
657
658skip:
659 audit_seccomp(this_syscall, 0, action);
660 return SECCOMP_PHASE1_SKIP;
661}
585#endif 662#endif
586 do { 663
587 if (*syscall == this_syscall) 664/**
588 return 0; 665 * seccomp_phase1() - run fast path seccomp checks on the current syscall
 589 } while (*++syscall); 666 * @sd: The seccomp_data or NULL
590 exit_sig = SIGKILL; 667 *
591 ret = SECCOMP_RET_KILL; 668 * This only reads pt_regs via the syscall_xyz helpers. The only change
592 break; 669 * it will make to pt_regs is via syscall_set_return_value, and it will
670 * only do that if it returns SECCOMP_PHASE1_SKIP.
671 *
672 * If sd is provided, it will not read pt_regs at all.
673 *
674 * It may also call do_exit or force a signal; these actions must be
675 * safe.
676 *
677 * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should
678 * be processed normally.
679 *
680 * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be
681 * invoked. In this case, seccomp_phase1 will have set the return value
682 * using syscall_set_return_value.
683 *
684 * If it returns anything else, then the return value should be passed
685 * to seccomp_phase2 from a context in which ptrace hooks are safe.
686 */
687u32 seccomp_phase1(struct seccomp_data *sd)
688{
689 int mode = current->seccomp.mode;
690 int this_syscall = sd ? sd->nr :
691 syscall_get_nr(current, task_pt_regs(current));
692
693 switch (mode) {
694 case SECCOMP_MODE_STRICT:
695 __secure_computing_strict(this_syscall); /* may call do_exit */
696 return SECCOMP_PHASE1_OK;
593#ifdef CONFIG_SECCOMP_FILTER 697#ifdef CONFIG_SECCOMP_FILTER
594 case SECCOMP_MODE_FILTER: { 698 case SECCOMP_MODE_FILTER:
595 int data; 699 return __seccomp_phase1_filter(this_syscall, sd);
596 struct pt_regs *regs = task_pt_regs(current);
597 ret = seccomp_run_filters(this_syscall);
598 data = ret & SECCOMP_RET_DATA;
599 ret &= SECCOMP_RET_ACTION;
600 switch (ret) {
601 case SECCOMP_RET_ERRNO:
602 /* Set the low-order 16-bits as a errno. */
603 syscall_set_return_value(current, regs,
604 -data, 0);
605 goto skip;
606 case SECCOMP_RET_TRAP:
607 /* Show the handler the original registers. */
608 syscall_rollback(current, regs);
609 /* Let the filter pass back 16 bits of data. */
610 seccomp_send_sigsys(this_syscall, data);
611 goto skip;
612 case SECCOMP_RET_TRACE:
613 /* Skip these calls if there is no tracer. */
614 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
615 syscall_set_return_value(current, regs,
616 -ENOSYS, 0);
617 goto skip;
618 }
619 /* Allow the BPF to provide the event message */
620 ptrace_event(PTRACE_EVENT_SECCOMP, data);
621 /*
622 * The delivery of a fatal signal during event
623 * notification may silently skip tracer notification.
624 * Terminating the task now avoids executing a system
625 * call that may not be intended.
626 */
627 if (fatal_signal_pending(current))
628 break;
629 if (syscall_get_nr(current, regs) < 0)
630 goto skip; /* Explicit request to skip. */
631
632 return 0;
633 case SECCOMP_RET_ALLOW:
634 return 0;
635 case SECCOMP_RET_KILL:
636 default:
637 break;
638 }
639 exit_sig = SIGSYS;
640 break;
641 }
642#endif 700#endif
643 default: 701 default:
644 BUG(); 702 BUG();
645 } 703 }
704}
646 705
647#ifdef SECCOMP_DEBUG 706/**
648 dump_stack(); 707 * seccomp_phase2() - finish slow path seccomp work for the current syscall
649#endif 708 * @phase1_result: The return value from seccomp_phase1()
650 audit_seccomp(this_syscall, exit_sig, ret); 709 *
651 do_exit(exit_sig); 710 * This must be called from a context in which ptrace hooks can be used.
652#ifdef CONFIG_SECCOMP_FILTER 711 *
653skip: 712 * Returns 0 if the syscall should be processed or -1 to skip the syscall.
654 audit_seccomp(this_syscall, exit_sig, ret); 713 */
655#endif 714int seccomp_phase2(u32 phase1_result)
656 return -1; 715{
716 struct pt_regs *regs = task_pt_regs(current);
717 u32 action = phase1_result & SECCOMP_RET_ACTION;
718 int data = phase1_result & SECCOMP_RET_DATA;
719
720 BUG_ON(action != SECCOMP_RET_TRACE);
721
722 audit_seccomp(syscall_get_nr(current, regs), 0, action);
723
724 /* Skip these calls if there is no tracer. */
725 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
726 syscall_set_return_value(current, regs,
727 -ENOSYS, 0);
728 return -1;
729 }
730
731 /* Allow the BPF to provide the event message */
732 ptrace_event(PTRACE_EVENT_SECCOMP, data);
733 /*
734 * The delivery of a fatal signal during event
735 * notification may silently skip tracer notification.
736 * Terminating the task now avoids executing a system
737 * call that may not be intended.
738 */
739 if (fatal_signal_pending(current))
740 do_exit(SIGSYS);
741 if (syscall_get_nr(current, regs) < 0)
742 return -1; /* Explicit request to skip. */
743
744 return 0;
657} 745}
746#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
658 747
659long prctl_get_seccomp(void) 748long prctl_get_seccomp(void)
660{ 749{
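
The seccomp_phase1()/seccomp_phase2() kerneldoc above describes a two-step protocol: phase 1 is cheap and may allow the syscall, skip it, or hand a TRACE result to phase 2, which must run where ptrace hooks are safe. A compressed model of the dispatch done by the new __secure_computing(); the constants and phase bodies here are placeholders, not the kernel's:

#include <stdio.h>

#define PHASE1_OK       0xffffffffu
#define PHASE1_SKIP     0xfffffffeu

/* Placeholder phase 1: anything else it returns is saved for phase 2. */
static unsigned int phase1(int syscall_nr)
{
        if (syscall_nr == 1)
                return PHASE1_OK;
        if (syscall_nr == 2)
                return PHASE1_SKIP;
        return 0x7ff00000u | (unsigned int)syscall_nr;  /* "trace" + data */
}

static int phase2(unsigned int phase1_result)
{
        printf("phase2 notified tracer, data=%u\n", phase1_result & 0xffffu);
        return 0;               /* tracer let the syscall proceed */
}

/* 0 = run the syscall, -1 = skip it. */
static int secure_computing_model(int syscall_nr)
{
        unsigned int r = phase1(syscall_nr);

        if (r == PHASE1_OK)
                return 0;
        if (r == PHASE1_SKIP)
                return -1;
        return phase2(r);
}

int main(void)
{
        printf("nr 1 -> %d\n", secure_computing_model(1));
        printf("nr 2 -> %d\n", secure_computing_model(2));
        printf("nr 3 -> %d\n", secure_computing_model(3));
        return 0;
}
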
diff --git a/kernel/signal.c b/kernel/signal.c
index 40b76e351e64..8f0876f9f6dd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2170,8 +2170,7 @@ static int ptrace_signal(int signr, siginfo_t *info)
2170 return signr; 2170 return signr;
2171} 2171}
2172 2172
2173int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, 2173int get_signal(struct ksignal *ksig)
2174 struct pt_regs *regs, void *cookie)
2175{ 2174{
2176 struct sighand_struct *sighand = current->sighand; 2175 struct sighand_struct *sighand = current->sighand;
2177 struct signal_struct *signal = current->signal; 2176 struct signal_struct *signal = current->signal;
@@ -2241,13 +2240,13 @@ relock:
2241 goto relock; 2240 goto relock;
2242 } 2241 }
2243 2242
2244 signr = dequeue_signal(current, &current->blocked, info); 2243 signr = dequeue_signal(current, &current->blocked, &ksig->info);
2245 2244
2246 if (!signr) 2245 if (!signr)
2247 break; /* will return 0 */ 2246 break; /* will return 0 */
2248 2247
2249 if (unlikely(current->ptrace) && signr != SIGKILL) { 2248 if (unlikely(current->ptrace) && signr != SIGKILL) {
2250 signr = ptrace_signal(signr, info); 2249 signr = ptrace_signal(signr, &ksig->info);
2251 if (!signr) 2250 if (!signr)
2252 continue; 2251 continue;
2253 } 2252 }
@@ -2255,13 +2254,13 @@ relock:
2255 ka = &sighand->action[signr-1]; 2254 ka = &sighand->action[signr-1];
2256 2255
2257 /* Trace actually delivered signals. */ 2256 /* Trace actually delivered signals. */
2258 trace_signal_deliver(signr, info, ka); 2257 trace_signal_deliver(signr, &ksig->info, ka);
2259 2258
2260 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 2259 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
2261 continue; 2260 continue;
2262 if (ka->sa.sa_handler != SIG_DFL) { 2261 if (ka->sa.sa_handler != SIG_DFL) {
2263 /* Run the handler. */ 2262 /* Run the handler. */
2264 *return_ka = *ka; 2263 ksig->ka = *ka;
2265 2264
2266 if (ka->sa.sa_flags & SA_ONESHOT) 2265 if (ka->sa.sa_flags & SA_ONESHOT)
2267 ka->sa.sa_handler = SIG_DFL; 2266 ka->sa.sa_handler = SIG_DFL;
@@ -2311,7 +2310,7 @@ relock:
2311 spin_lock_irq(&sighand->siglock); 2310 spin_lock_irq(&sighand->siglock);
2312 } 2311 }
2313 2312
2314 if (likely(do_signal_stop(info->si_signo))) { 2313 if (likely(do_signal_stop(ksig->info.si_signo))) {
2315 /* It released the siglock. */ 2314 /* It released the siglock. */
2316 goto relock; 2315 goto relock;
2317 } 2316 }
@@ -2332,7 +2331,7 @@ relock:
2332 2331
2333 if (sig_kernel_coredump(signr)) { 2332 if (sig_kernel_coredump(signr)) {
2334 if (print_fatal_signals) 2333 if (print_fatal_signals)
2335 print_fatal_signal(info->si_signo); 2334 print_fatal_signal(ksig->info.si_signo);
2336 proc_coredump_connector(current); 2335 proc_coredump_connector(current);
2337 /* 2336 /*
2338 * If it was able to dump core, this kills all 2337 * If it was able to dump core, this kills all
@@ -2342,34 +2341,32 @@ relock:
2342 * first and our do_group_exit call below will use 2341 * first and our do_group_exit call below will use
2343 * that value and ignore the one we pass it. 2342 * that value and ignore the one we pass it.
2344 */ 2343 */
2345 do_coredump(info); 2344 do_coredump(&ksig->info);
2346 } 2345 }
2347 2346
2348 /* 2347 /*
2349 * Death signals, no core dump. 2348 * Death signals, no core dump.
2350 */ 2349 */
2351 do_group_exit(info->si_signo); 2350 do_group_exit(ksig->info.si_signo);
2352 /* NOTREACHED */ 2351 /* NOTREACHED */
2353 } 2352 }
2354 spin_unlock_irq(&sighand->siglock); 2353 spin_unlock_irq(&sighand->siglock);
2355 return signr; 2354
2355 ksig->sig = signr;
2356 return ksig->sig > 0;
2356} 2357}
2357 2358
2358/** 2359/**
2359 * signal_delivered - 2360 * signal_delivered -
2360 * @sig: number of signal being delivered 2361 * @ksig: kernel signal struct
2361 * @info: siginfo_t of signal being delivered
2362 * @ka: sigaction setting that chose the handler
2363 * @regs: user register state
2364 * @stepping: nonzero if debugger single-step or block-step in use 2362 * @stepping: nonzero if debugger single-step or block-step in use
2365 * 2363 *
2366 * This function should be called when a signal has successfully been 2364 * This function should be called when a signal has successfully been
2367 * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask 2365 * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask
2368 * is always blocked, and the signal itself is blocked unless %SA_NODEFER 2366 * is always blocked, and the signal itself is blocked unless %SA_NODEFER
2369 * is set in @ka->sa.sa_flags. Tracing is notified. 2367 * is set in @ksig->ka.sa.sa_flags. Tracing is notified.
2370 */ 2368 */
2371void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, 2369static void signal_delivered(struct ksignal *ksig, int stepping)
2372 struct pt_regs *regs, int stepping)
2373{ 2370{
2374 sigset_t blocked; 2371 sigset_t blocked;
2375 2372
@@ -2379,11 +2376,11 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
2379 simply clear the restore sigmask flag. */ 2376 simply clear the restore sigmask flag. */
2380 clear_restore_sigmask(); 2377 clear_restore_sigmask();
2381 2378
2382 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); 2379 sigorsets(&blocked, &current->blocked, &ksig->ka.sa.sa_mask);
2383 if (!(ka->sa.sa_flags & SA_NODEFER)) 2380 if (!(ksig->ka.sa.sa_flags & SA_NODEFER))
2384 sigaddset(&blocked, sig); 2381 sigaddset(&blocked, ksig->sig);
2385 set_current_blocked(&blocked); 2382 set_current_blocked(&blocked);
2386 tracehook_signal_handler(sig, info, ka, regs, stepping); 2383 tracehook_signal_handler(stepping);
2387} 2384}
2388 2385
2389void signal_setup_done(int failed, struct ksignal *ksig, int stepping) 2386void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
@@ -2391,8 +2388,7 @@ void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
2391 if (failed) 2388 if (failed)
2392 force_sigsegv(ksig->sig, current); 2389 force_sigsegv(ksig->sig, current);
2393 else 2390 else
2394 signal_delivered(ksig->sig, &ksig->info, &ksig->ka, 2391 signal_delivered(ksig, stepping);
2395 signal_pt_regs(), stepping);
2396} 2392}
2397 2393
2398/* 2394/*
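The hunks above fold the signal number, siginfo and k_sigaction into struct ksignal: the dequeue path now stores the result in ksig->sig and returns whether a handler must run, and signal_delivered() becomes a static helper behind signal_setup_done(). A minimal sketch of how an architecture's delivery path might use the new interface; arch_setup_rt_frame() is a hypothetical stand-in for the real, arch-specific frame builder, and the stepping flag would typically come from something like TIF_SINGLESTEP:

	/* Sketch only, not taken from any particular architecture. */
	static void handle_signal(struct ksignal *ksig, struct pt_regs *regs,
				  int stepping)
	{
		/* Write the user-space signal frame for ksig->sig / ksig->ka. */
		int failed = arch_setup_rt_frame(ksig, sigmask_to_save(), regs);

		/*
		 * On success this blocks ksig->ka.sa.sa_mask (and the signal
		 * itself unless SA_NODEFER is set); on failure it forces
		 * SIGSEGV on the current task.
		 */
		signal_setup_done(failed, ksig, stepping);
	}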
diff --git a/kernel/smp.c b/kernel/smp.c
index 487653b5844f..f38a1e692259 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,7 @@
13#include <linux/gfp.h> 13#include <linux/gfp.h>
14#include <linux/smp.h> 14#include <linux/smp.h>
15#include <linux/cpu.h> 15#include <linux/cpu.h>
16#include <linux/sched.h>
16 17
17#include "smpboot.h" 18#include "smpboot.h"
18 19
@@ -164,7 +165,7 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
164 if (!csd) { 165 if (!csd) {
165 csd = &csd_stack; 166 csd = &csd_stack;
166 if (!wait) 167 if (!wait)
167 csd = &__get_cpu_var(csd_data); 168 csd = this_cpu_ptr(&csd_data);
168 } 169 }
169 170
170 csd_lock(csd); 171 csd_lock(csd);
@@ -229,7 +230,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
229 230
230 WARN_ON(!irqs_disabled()); 231 WARN_ON(!irqs_disabled());
231 232
232 head = &__get_cpu_var(call_single_queue); 233 head = this_cpu_ptr(&call_single_queue);
233 entry = llist_del_all(head); 234 entry = llist_del_all(head);
234 entry = llist_reverse_order(entry); 235 entry = llist_reverse_order(entry);
235 236
@@ -419,7 +420,7 @@ void smp_call_function_many(const struct cpumask *mask,
419 return; 420 return;
420 } 421 }
421 422
422 cfd = &__get_cpu_var(cfd_data); 423 cfd = this_cpu_ptr(&cfd_data);
423 424
424 cpumask_and(cfd->cpumask, mask, cpu_online_mask); 425 cpumask_and(cfd->cpumask, mask, cpu_online_mask);
425 cpumask_clear_cpu(this_cpu, cfd->cpumask); 426 cpumask_clear_cpu(this_cpu, cfd->cpumask);
@@ -670,7 +671,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
670 if (cond_func(cpu, info)) { 671 if (cond_func(cpu, info)) {
671 ret = smp_call_function_single(cpu, func, 672 ret = smp_call_function_single(cpu, func,
672 info, wait); 673 info, wait);
673 WARN_ON_ONCE(!ret); 674 WARN_ON_ONCE(ret);
674 } 675 }
675 preempt_enable(); 676 preempt_enable();
676 } 677 }
@@ -699,3 +700,24 @@ void kick_all_cpus_sync(void)
699 smp_call_function(do_nothing, NULL, 1); 700 smp_call_function(do_nothing, NULL, 1);
700} 701}
701EXPORT_SYMBOL_GPL(kick_all_cpus_sync); 702EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
703
704/**
705 * wake_up_all_idle_cpus - break all cpus out of idle
 706 * wake_up_all_idle_cpus tries to break all cpus out of the idle state,
 707 * including cpus that are idle polling; cpus that are not idle are left
 708 * alone.
709 */
710void wake_up_all_idle_cpus(void)
711{
712 int cpu;
713
714 preempt_disable();
715 for_each_online_cpu(cpu) {
716 if (cpu == smp_processor_id())
717 continue;
718
719 wake_up_if_idle(cpu);
720 }
721 preempt_enable();
722}
723EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
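wake_up_all_idle_cpus(), added above, kicks every online cpu that is currently idle (including idle pollers) so its idle loop re-evaluates state; busy cpus are left untouched. A hedged sketch of a caller, where update_my_latency_limit() is a hypothetical driver helper that has just tightened a global constraint the idle cpus need to notice:

	#include <linux/smp.h>

	static void my_driver_set_latency(unsigned int new_limit_us)
	{
		update_my_latency_limit(new_limit_us);	/* hypothetical */
		wake_up_all_idle_cpus();	/* idle cpus re-read the limit */
	}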
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 5918d227730f..0699add19164 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -278,7 +278,7 @@ restart:
278 pending >>= softirq_bit; 278 pending >>= softirq_bit;
279 } 279 }
280 280
281 rcu_bh_qs(smp_processor_id()); 281 rcu_bh_qs();
282 local_irq_disable(); 282 local_irq_disable();
283 283
284 pending = local_softirq_pending(); 284 pending = local_softirq_pending();
@@ -485,7 +485,7 @@ static void tasklet_action(struct softirq_action *a)
485 local_irq_disable(); 485 local_irq_disable();
486 list = __this_cpu_read(tasklet_vec.head); 486 list = __this_cpu_read(tasklet_vec.head);
487 __this_cpu_write(tasklet_vec.head, NULL); 487 __this_cpu_write(tasklet_vec.head, NULL);
488 __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); 488 __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
489 local_irq_enable(); 489 local_irq_enable();
490 490
491 while (list) { 491 while (list) {
@@ -521,7 +521,7 @@ static void tasklet_hi_action(struct softirq_action *a)
521 local_irq_disable(); 521 local_irq_disable();
522 list = __this_cpu_read(tasklet_hi_vec.head); 522 list = __this_cpu_read(tasklet_hi_vec.head);
523 __this_cpu_write(tasklet_hi_vec.head, NULL); 523 __this_cpu_write(tasklet_hi_vec.head, NULL);
524 __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); 524 __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
525 local_irq_enable(); 525 local_irq_enable();
526 526
527 while (list) { 527 while (list) {
diff --git a/kernel/sys.c b/kernel/sys.c
index ce8129192a26..1eaa2f0b0246 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -62,28 +62,28 @@
62#include <asm/unistd.h> 62#include <asm/unistd.h>
63 63
64#ifndef SET_UNALIGN_CTL 64#ifndef SET_UNALIGN_CTL
65# define SET_UNALIGN_CTL(a,b) (-EINVAL) 65# define SET_UNALIGN_CTL(a, b) (-EINVAL)
66#endif 66#endif
67#ifndef GET_UNALIGN_CTL 67#ifndef GET_UNALIGN_CTL
68# define GET_UNALIGN_CTL(a,b) (-EINVAL) 68# define GET_UNALIGN_CTL(a, b) (-EINVAL)
69#endif 69#endif
70#ifndef SET_FPEMU_CTL 70#ifndef SET_FPEMU_CTL
71# define SET_FPEMU_CTL(a,b) (-EINVAL) 71# define SET_FPEMU_CTL(a, b) (-EINVAL)
72#endif 72#endif
73#ifndef GET_FPEMU_CTL 73#ifndef GET_FPEMU_CTL
74# define GET_FPEMU_CTL(a,b) (-EINVAL) 74# define GET_FPEMU_CTL(a, b) (-EINVAL)
75#endif 75#endif
76#ifndef SET_FPEXC_CTL 76#ifndef SET_FPEXC_CTL
77# define SET_FPEXC_CTL(a,b) (-EINVAL) 77# define SET_FPEXC_CTL(a, b) (-EINVAL)
78#endif 78#endif
79#ifndef GET_FPEXC_CTL 79#ifndef GET_FPEXC_CTL
80# define GET_FPEXC_CTL(a,b) (-EINVAL) 80# define GET_FPEXC_CTL(a, b) (-EINVAL)
81#endif 81#endif
82#ifndef GET_ENDIAN 82#ifndef GET_ENDIAN
83# define GET_ENDIAN(a,b) (-EINVAL) 83# define GET_ENDIAN(a, b) (-EINVAL)
84#endif 84#endif
85#ifndef SET_ENDIAN 85#ifndef SET_ENDIAN
86# define SET_ENDIAN(a,b) (-EINVAL) 86# define SET_ENDIAN(a, b) (-EINVAL)
87#endif 87#endif
88#ifndef GET_TSC_CTL 88#ifndef GET_TSC_CTL
89# define GET_TSC_CTL(a) (-EINVAL) 89# define GET_TSC_CTL(a) (-EINVAL)
@@ -182,39 +182,40 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
182 rcu_read_lock(); 182 rcu_read_lock();
183 read_lock(&tasklist_lock); 183 read_lock(&tasklist_lock);
184 switch (which) { 184 switch (which) {
185 case PRIO_PROCESS: 185 case PRIO_PROCESS:
186 if (who) 186 if (who)
187 p = find_task_by_vpid(who); 187 p = find_task_by_vpid(who);
188 else 188 else
189 p = current; 189 p = current;
190 if (p) 190 if (p)
191 error = set_one_prio(p, niceval, error); 191 error = set_one_prio(p, niceval, error);
192 break; 192 break;
193 case PRIO_PGRP: 193 case PRIO_PGRP:
194 if (who) 194 if (who)
195 pgrp = find_vpid(who); 195 pgrp = find_vpid(who);
196 else 196 else
197 pgrp = task_pgrp(current); 197 pgrp = task_pgrp(current);
198 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 198 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
199 error = set_one_prio(p, niceval, error); 199 error = set_one_prio(p, niceval, error);
200 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 200 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
201 break; 201 break;
202 case PRIO_USER: 202 case PRIO_USER:
203 uid = make_kuid(cred->user_ns, who); 203 uid = make_kuid(cred->user_ns, who);
204 user = cred->user; 204 user = cred->user;
205 if (!who) 205 if (!who)
206 uid = cred->uid; 206 uid = cred->uid;
207 else if (!uid_eq(uid, cred->uid) && 207 else if (!uid_eq(uid, cred->uid)) {
208 !(user = find_user(uid))) 208 user = find_user(uid);
209 if (!user)
209 goto out_unlock; /* No processes for this user */ 210 goto out_unlock; /* No processes for this user */
210 211 }
211 do_each_thread(g, p) { 212 do_each_thread(g, p) {
212 if (uid_eq(task_uid(p), uid)) 213 if (uid_eq(task_uid(p), uid))
213 error = set_one_prio(p, niceval, error); 214 error = set_one_prio(p, niceval, error);
214 } while_each_thread(g, p); 215 } while_each_thread(g, p);
215 if (!uid_eq(uid, cred->uid)) 216 if (!uid_eq(uid, cred->uid))
216 free_uid(user); /* For find_user() */ 217 free_uid(user); /* For find_user() */
217 break; 218 break;
218 } 219 }
219out_unlock: 220out_unlock:
220 read_unlock(&tasklist_lock); 221 read_unlock(&tasklist_lock);
@@ -244,47 +245,48 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
244 rcu_read_lock(); 245 rcu_read_lock();
245 read_lock(&tasklist_lock); 246 read_lock(&tasklist_lock);
246 switch (which) { 247 switch (which) {
247 case PRIO_PROCESS: 248 case PRIO_PROCESS:
248 if (who) 249 if (who)
249 p = find_task_by_vpid(who); 250 p = find_task_by_vpid(who);
250 else 251 else
251 p = current; 252 p = current;
252 if (p) { 253 if (p) {
254 niceval = nice_to_rlimit(task_nice(p));
255 if (niceval > retval)
256 retval = niceval;
257 }
258 break;
259 case PRIO_PGRP:
260 if (who)
261 pgrp = find_vpid(who);
262 else
263 pgrp = task_pgrp(current);
264 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
265 niceval = nice_to_rlimit(task_nice(p));
266 if (niceval > retval)
267 retval = niceval;
268 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
269 break;
270 case PRIO_USER:
271 uid = make_kuid(cred->user_ns, who);
272 user = cred->user;
273 if (!who)
274 uid = cred->uid;
275 else if (!uid_eq(uid, cred->uid)) {
276 user = find_user(uid);
277 if (!user)
278 goto out_unlock; /* No processes for this user */
279 }
280 do_each_thread(g, p) {
281 if (uid_eq(task_uid(p), uid)) {
253 niceval = nice_to_rlimit(task_nice(p)); 282 niceval = nice_to_rlimit(task_nice(p));
254 if (niceval > retval) 283 if (niceval > retval)
255 retval = niceval; 284 retval = niceval;
256 } 285 }
257 break; 286 } while_each_thread(g, p);
258 case PRIO_PGRP: 287 if (!uid_eq(uid, cred->uid))
259 if (who) 288 free_uid(user); /* for find_user() */
260 pgrp = find_vpid(who); 289 break;
261 else
262 pgrp = task_pgrp(current);
263 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
264 niceval = nice_to_rlimit(task_nice(p));
265 if (niceval > retval)
266 retval = niceval;
267 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
268 break;
269 case PRIO_USER:
270 uid = make_kuid(cred->user_ns, who);
271 user = cred->user;
272 if (!who)
273 uid = cred->uid;
274 else if (!uid_eq(uid, cred->uid) &&
275 !(user = find_user(uid)))
276 goto out_unlock; /* No processes for this user */
277
278 do_each_thread(g, p) {
279 if (uid_eq(task_uid(p), uid)) {
280 niceval = nice_to_rlimit(task_nice(p));
281 if (niceval > retval)
282 retval = niceval;
283 }
284 } while_each_thread(g, p);
285 if (!uid_eq(uid, cred->uid))
286 free_uid(user); /* for find_user() */
287 break;
288 } 290 }
289out_unlock: 291out_unlock:
290 read_unlock(&tasklist_lock); 292 read_unlock(&tasklist_lock);
@@ -306,7 +308,7 @@ out_unlock:
306 * 308 *
307 * The general idea is that a program which uses just setregid() will be 309 * The general idea is that a program which uses just setregid() will be
308 * 100% compatible with BSD. A program which uses just setgid() will be 310 * 100% compatible with BSD. A program which uses just setgid() will be
309 * 100% compatible with POSIX with saved IDs. 311 * 100% compatible with POSIX with saved IDs.
310 * 312 *
311 * SMP: There are not races, the GIDs are checked only by filesystem 313 * SMP: There are not races, the GIDs are checked only by filesystem
312 * operations (as far as semantic preservation is concerned). 314 * operations (as far as semantic preservation is concerned).
@@ -364,7 +366,7 @@ error:
364} 366}
365 367
366/* 368/*
367 * setgid() is implemented like SysV w/ SAVED_IDS 369 * setgid() is implemented like SysV w/ SAVED_IDS
368 * 370 *
369 * SMP: Same implicit races as above. 371 * SMP: Same implicit races as above.
370 */ 372 */
@@ -442,7 +444,7 @@ static int set_user(struct cred *new)
442 * 444 *
443 * The general idea is that a program which uses just setreuid() will be 445 * The general idea is that a program which uses just setreuid() will be
444 * 100% compatible with BSD. A program which uses just setuid() will be 446 * 100% compatible with BSD. A program which uses just setuid() will be
445 * 100% compatible with POSIX with saved IDs. 447 * 100% compatible with POSIX with saved IDs.
446 */ 448 */
447SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) 449SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
448{ 450{
@@ -503,17 +505,17 @@ error:
503 abort_creds(new); 505 abort_creds(new);
504 return retval; 506 return retval;
505} 507}
506 508
507/* 509/*
508 * setuid() is implemented like SysV with SAVED_IDS 510 * setuid() is implemented like SysV with SAVED_IDS
509 * 511 *
510 * Note that SAVED_ID's is deficient in that a setuid root program 512 * Note that SAVED_ID's is deficient in that a setuid root program
511 * like sendmail, for example, cannot set its uid to be a normal 513 * like sendmail, for example, cannot set its uid to be a normal
512 * user and then switch back, because if you're root, setuid() sets 514 * user and then switch back, because if you're root, setuid() sets
513 * the saved uid too. If you don't like this, blame the bright people 515 * the saved uid too. If you don't like this, blame the bright people
514 * in the POSIX committee and/or USG. Note that the BSD-style setreuid() 516 * in the POSIX committee and/or USG. Note that the BSD-style setreuid()
515 * will allow a root program to temporarily drop privileges and be able to 517 * will allow a root program to temporarily drop privileges and be able to
516 * regain them by swapping the real and effective uid. 518 * regain them by swapping the real and effective uid.
517 */ 519 */
518SYSCALL_DEFINE1(setuid, uid_t, uid) 520SYSCALL_DEFINE1(setuid, uid_t, uid)
519{ 521{
@@ -637,10 +639,12 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _
637 euid = from_kuid_munged(cred->user_ns, cred->euid); 639 euid = from_kuid_munged(cred->user_ns, cred->euid);
638 suid = from_kuid_munged(cred->user_ns, cred->suid); 640 suid = from_kuid_munged(cred->user_ns, cred->suid);
639 641
640 if (!(retval = put_user(ruid, ruidp)) && 642 retval = put_user(ruid, ruidp);
641 !(retval = put_user(euid, euidp))) 643 if (!retval) {
642 retval = put_user(suid, suidp); 644 retval = put_user(euid, euidp);
643 645 if (!retval)
646 return put_user(suid, suidp);
647 }
644 return retval; 648 return retval;
645} 649}
646 650
@@ -709,9 +713,12 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _
709 egid = from_kgid_munged(cred->user_ns, cred->egid); 713 egid = from_kgid_munged(cred->user_ns, cred->egid);
710 sgid = from_kgid_munged(cred->user_ns, cred->sgid); 714 sgid = from_kgid_munged(cred->user_ns, cred->sgid);
711 715
712 if (!(retval = put_user(rgid, rgidp)) && 716 retval = put_user(rgid, rgidp);
713 !(retval = put_user(egid, egidp))) 717 if (!retval) {
714 retval = put_user(sgid, sgidp); 718 retval = put_user(egid, egidp);
719 if (!retval)
720 retval = put_user(sgid, sgidp);
721 }
715 722
716 return retval; 723 return retval;
717} 724}
@@ -862,11 +869,9 @@ void do_sys_times(struct tms *tms)
862{ 869{
863 cputime_t tgutime, tgstime, cutime, cstime; 870 cputime_t tgutime, tgstime, cutime, cstime;
864 871
865 spin_lock_irq(&current->sighand->siglock);
866 thread_group_cputime_adjusted(current, &tgutime, &tgstime); 872 thread_group_cputime_adjusted(current, &tgutime, &tgstime);
867 cutime = current->signal->cutime; 873 cutime = current->signal->cutime;
868 cstime = current->signal->cstime; 874 cstime = current->signal->cstime;
869 spin_unlock_irq(&current->sighand->siglock);
870 tms->tms_utime = cputime_to_clock_t(tgutime); 875 tms->tms_utime = cputime_to_clock_t(tgutime);
871 tms->tms_stime = cputime_to_clock_t(tgstime); 876 tms->tms_stime = cputime_to_clock_t(tgstime);
872 tms->tms_cutime = cputime_to_clock_t(cutime); 877 tms->tms_cutime = cputime_to_clock_t(cutime);
@@ -1284,7 +1289,6 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1284/* 1289/*
1285 * Back compatibility for getrlimit. Needed for some apps. 1290 * Back compatibility for getrlimit. Needed for some apps.
1286 */ 1291 */
1287
1288SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, 1292SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1289 struct rlimit __user *, rlim) 1293 struct rlimit __user *, rlim)
1290{ 1294{
@@ -1299,7 +1303,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1299 x.rlim_cur = 0x7FFFFFFF; 1303 x.rlim_cur = 0x7FFFFFFF;
1300 if (x.rlim_max > 0x7FFFFFFF) 1304 if (x.rlim_max > 0x7FFFFFFF)
1301 x.rlim_max = 0x7FFFFFFF; 1305 x.rlim_max = 0x7FFFFFFF;
1302 return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; 1306 return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0;
1303} 1307}
1304 1308
1305#endif 1309#endif
@@ -1527,7 +1531,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1527 cputime_t tgutime, tgstime, utime, stime; 1531 cputime_t tgutime, tgstime, utime, stime;
1528 unsigned long maxrss = 0; 1532 unsigned long maxrss = 0;
1529 1533
1530 memset((char *) r, 0, sizeof *r); 1534 memset((char *)r, 0, sizeof (*r));
1531 utime = stime = 0; 1535 utime = stime = 0;
1532 1536
1533 if (who == RUSAGE_THREAD) { 1537 if (who == RUSAGE_THREAD) {
@@ -1541,41 +1545,41 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1541 return; 1545 return;
1542 1546
1543 switch (who) { 1547 switch (who) {
1544 case RUSAGE_BOTH: 1548 case RUSAGE_BOTH:
1545 case RUSAGE_CHILDREN: 1549 case RUSAGE_CHILDREN:
1546 utime = p->signal->cutime; 1550 utime = p->signal->cutime;
1547 stime = p->signal->cstime; 1551 stime = p->signal->cstime;
1548 r->ru_nvcsw = p->signal->cnvcsw; 1552 r->ru_nvcsw = p->signal->cnvcsw;
1549 r->ru_nivcsw = p->signal->cnivcsw; 1553 r->ru_nivcsw = p->signal->cnivcsw;
1550 r->ru_minflt = p->signal->cmin_flt; 1554 r->ru_minflt = p->signal->cmin_flt;
1551 r->ru_majflt = p->signal->cmaj_flt; 1555 r->ru_majflt = p->signal->cmaj_flt;
1552 r->ru_inblock = p->signal->cinblock; 1556 r->ru_inblock = p->signal->cinblock;
1553 r->ru_oublock = p->signal->coublock; 1557 r->ru_oublock = p->signal->coublock;
1554 maxrss = p->signal->cmaxrss; 1558 maxrss = p->signal->cmaxrss;
1555 1559
1556 if (who == RUSAGE_CHILDREN) 1560 if (who == RUSAGE_CHILDREN)
1557 break;
1558
1559 case RUSAGE_SELF:
1560 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1561 utime += tgutime;
1562 stime += tgstime;
1563 r->ru_nvcsw += p->signal->nvcsw;
1564 r->ru_nivcsw += p->signal->nivcsw;
1565 r->ru_minflt += p->signal->min_flt;
1566 r->ru_majflt += p->signal->maj_flt;
1567 r->ru_inblock += p->signal->inblock;
1568 r->ru_oublock += p->signal->oublock;
1569 if (maxrss < p->signal->maxrss)
1570 maxrss = p->signal->maxrss;
1571 t = p;
1572 do {
1573 accumulate_thread_rusage(t, r);
1574 } while_each_thread(p, t);
1575 break; 1561 break;
1576 1562
1577 default: 1563 case RUSAGE_SELF:
1578 BUG(); 1564 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1565 utime += tgutime;
1566 stime += tgstime;
1567 r->ru_nvcsw += p->signal->nvcsw;
1568 r->ru_nivcsw += p->signal->nivcsw;
1569 r->ru_minflt += p->signal->min_flt;
1570 r->ru_majflt += p->signal->maj_flt;
1571 r->ru_inblock += p->signal->inblock;
1572 r->ru_oublock += p->signal->oublock;
1573 if (maxrss < p->signal->maxrss)
1574 maxrss = p->signal->maxrss;
1575 t = p;
1576 do {
1577 accumulate_thread_rusage(t, r);
1578 } while_each_thread(p, t);
1579 break;
1580
1581 default:
1582 BUG();
1579 } 1583 }
1580 unlock_task_sighand(p, &flags); 1584 unlock_task_sighand(p, &flags);
1581 1585
@@ -1585,6 +1589,7 @@ out:
1585 1589
1586 if (who != RUSAGE_CHILDREN) { 1590 if (who != RUSAGE_CHILDREN) {
1587 struct mm_struct *mm = get_task_mm(p); 1591 struct mm_struct *mm = get_task_mm(p);
1592
1588 if (mm) { 1593 if (mm) {
1589 setmax_mm_hiwater_rss(&maxrss, mm); 1594 setmax_mm_hiwater_rss(&maxrss, mm);
1590 mmput(mm); 1595 mmput(mm);
@@ -1596,6 +1601,7 @@ out:
1596int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1601int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
1597{ 1602{
1598 struct rusage r; 1603 struct rusage r;
1604
1599 k_getrusage(p, who, &r); 1605 k_getrusage(p, who, &r);
1600 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; 1606 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
1601} 1607}
@@ -1628,12 +1634,14 @@ SYSCALL_DEFINE1(umask, int, mask)
1628 return mask; 1634 return mask;
1629} 1635}
1630 1636
1631static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1637static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
1632{ 1638{
1633 struct fd exe; 1639 struct fd exe;
1634 struct inode *inode; 1640 struct inode *inode;
1635 int err; 1641 int err;
1636 1642
1643 VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
1644
1637 exe = fdget(fd); 1645 exe = fdget(fd);
1638 if (!exe.file) 1646 if (!exe.file)
1639 return -EBADF; 1647 return -EBADF;
@@ -1654,8 +1662,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1654 if (err) 1662 if (err)
1655 goto exit; 1663 goto exit;
1656 1664
1657 down_write(&mm->mmap_sem);
1658
1659 /* 1665 /*
1660 * Forbid mm->exe_file change if old file still mapped. 1666 * Forbid mm->exe_file change if old file still mapped.
1661 */ 1667 */
@@ -1667,7 +1673,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1667 if (vma->vm_file && 1673 if (vma->vm_file &&
1668 path_equal(&vma->vm_file->f_path, 1674 path_equal(&vma->vm_file->f_path,
1669 &mm->exe_file->f_path)) 1675 &mm->exe_file->f_path))
1670 goto exit_unlock; 1676 goto exit;
1671 } 1677 }
1672 1678
1673 /* 1679 /*
@@ -1678,34 +1684,222 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1678 */ 1684 */
1679 err = -EPERM; 1685 err = -EPERM;
1680 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) 1686 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
1681 goto exit_unlock; 1687 goto exit;
1682 1688
1683 err = 0; 1689 err = 0;
1684 set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ 1690 set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */
1685exit_unlock:
1686 up_write(&mm->mmap_sem);
1687
1688exit: 1691exit:
1689 fdput(exe); 1692 fdput(exe);
1690 return err; 1693 return err;
1691} 1694}
1692 1695
1696#ifdef CONFIG_CHECKPOINT_RESTORE
1697/*
 1698 * WARNING: we don't require any capability here, so be very careful
 1699 * about what userspace is allowed to modify.
1700 */
1701static int validate_prctl_map(struct prctl_mm_map *prctl_map)
1702{
1703 unsigned long mmap_max_addr = TASK_SIZE;
1704 struct mm_struct *mm = current->mm;
1705 int error = -EINVAL, i;
1706
1707 static const unsigned char offsets[] = {
1708 offsetof(struct prctl_mm_map, start_code),
1709 offsetof(struct prctl_mm_map, end_code),
1710 offsetof(struct prctl_mm_map, start_data),
1711 offsetof(struct prctl_mm_map, end_data),
1712 offsetof(struct prctl_mm_map, start_brk),
1713 offsetof(struct prctl_mm_map, brk),
1714 offsetof(struct prctl_mm_map, start_stack),
1715 offsetof(struct prctl_mm_map, arg_start),
1716 offsetof(struct prctl_mm_map, arg_end),
1717 offsetof(struct prctl_mm_map, env_start),
1718 offsetof(struct prctl_mm_map, env_end),
1719 };
1720
1721 /*
 1722 * Make sure the members are not somewhere outside
 1723 * of the allowed address space.
1724 */
1725 for (i = 0; i < ARRAY_SIZE(offsets); i++) {
1726 u64 val = *(u64 *)((char *)prctl_map + offsets[i]);
1727
1728 if ((unsigned long)val >= mmap_max_addr ||
1729 (unsigned long)val < mmap_min_addr)
1730 goto out;
1731 }
1732
1733 /*
1734 * Make sure the pairs are ordered.
1735 */
1736#define __prctl_check_order(__m1, __op, __m2) \
1737 ((unsigned long)prctl_map->__m1 __op \
1738 (unsigned long)prctl_map->__m2) ? 0 : -EINVAL
1739 error = __prctl_check_order(start_code, <, end_code);
1740 error |= __prctl_check_order(start_data, <, end_data);
1741 error |= __prctl_check_order(start_brk, <=, brk);
1742 error |= __prctl_check_order(arg_start, <=, arg_end);
1743 error |= __prctl_check_order(env_start, <=, env_end);
1744 if (error)
1745 goto out;
1746#undef __prctl_check_order
1747
1748 error = -EINVAL;
1749
1750 /*
1751 * @brk should be after @end_data in traditional maps.
1752 */
1753 if (prctl_map->start_brk <= prctl_map->end_data ||
1754 prctl_map->brk <= prctl_map->end_data)
1755 goto out;
1756
1757 /*
 1758 * Nor should we allow the limits to be overridden if they are set.
1759 */
1760 if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
1761 prctl_map->start_brk, prctl_map->end_data,
1762 prctl_map->start_data))
1763 goto out;
1764
1765 /*
1766 * Someone is trying to cheat the auxv vector.
1767 */
1768 if (prctl_map->auxv_size) {
1769 if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv))
1770 goto out;
1771 }
1772
1773 /*
1774 * Finally, make sure the caller has the rights to
1775 * change /proc/pid/exe link: only local root should
1776 * be allowed to.
1777 */
1778 if (prctl_map->exe_fd != (u32)-1) {
1779 struct user_namespace *ns = current_user_ns();
1780 const struct cred *cred = current_cred();
1781
1782 if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
1783 !gid_eq(cred->gid, make_kgid(ns, 0)))
1784 goto out;
1785 }
1786
1787 error = 0;
1788out:
1789 return error;
1790}
1791
1792static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
1793{
1794 struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
1795 unsigned long user_auxv[AT_VECTOR_SIZE];
1796 struct mm_struct *mm = current->mm;
1797 int error;
1798
1799 BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
1800 BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256);
1801
1802 if (opt == PR_SET_MM_MAP_SIZE)
1803 return put_user((unsigned int)sizeof(prctl_map),
1804 (unsigned int __user *)addr);
1805
1806 if (data_size != sizeof(prctl_map))
1807 return -EINVAL;
1808
1809 if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
1810 return -EFAULT;
1811
1812 error = validate_prctl_map(&prctl_map);
1813 if (error)
1814 return error;
1815
1816 if (prctl_map.auxv_size) {
1817 memset(user_auxv, 0, sizeof(user_auxv));
1818 if (copy_from_user(user_auxv,
1819 (const void __user *)prctl_map.auxv,
1820 prctl_map.auxv_size))
1821 return -EFAULT;
1822
1823 /* Last entry must be AT_NULL as specification requires */
1824 user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL;
1825 user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
1826 }
1827
1828 down_write(&mm->mmap_sem);
1829 if (prctl_map.exe_fd != (u32)-1)
1830 error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd);
1831 downgrade_write(&mm->mmap_sem);
1832 if (error)
1833 goto out;
1834
1835 /*
 1836 * We don't validate whether these members point to real,
 1837 * present VMAs, because the application may already have the
 1838 * corresponding VMAs unmapped, and the kernel mostly uses these
 1839 * members for statistics output in procfs, except for
 1840 *
 1841 * - @start_brk/@brk, which are used in do_brk, but the kernel looks
 1842 * up the VMAs when updating these members, so anything wrong written
 1843 * here makes the kernel swear at the userspace program but won't
 1844 * lead to any problem in the kernel itself
1845 */
1846
1847 mm->start_code = prctl_map.start_code;
1848 mm->end_code = prctl_map.end_code;
1849 mm->start_data = prctl_map.start_data;
1850 mm->end_data = prctl_map.end_data;
1851 mm->start_brk = prctl_map.start_brk;
1852 mm->brk = prctl_map.brk;
1853 mm->start_stack = prctl_map.start_stack;
1854 mm->arg_start = prctl_map.arg_start;
1855 mm->arg_end = prctl_map.arg_end;
1856 mm->env_start = prctl_map.env_start;
1857 mm->env_end = prctl_map.env_end;
1858
1859 /*
 1860 * Note this update of @saved_auxv is lockless, thus
 1861 * if someone reads this member in procfs while we're
 1862 * updating it, they may get partly updated results. It's
 1863 * a known and acceptable trade-off: we leave it as is so as
 1864 * not to introduce additional locks here, which would make the
 1865 * kernel more complex.
1866 */
1867 if (prctl_map.auxv_size)
1868 memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
1869
1870 error = 0;
1871out:
1872 up_read(&mm->mmap_sem);
1873 return error;
1874}
1875#endif /* CONFIG_CHECKPOINT_RESTORE */
1876
1693static int prctl_set_mm(int opt, unsigned long addr, 1877static int prctl_set_mm(int opt, unsigned long addr,
1694 unsigned long arg4, unsigned long arg5) 1878 unsigned long arg4, unsigned long arg5)
1695{ 1879{
1696 unsigned long rlim = rlimit(RLIMIT_DATA);
1697 struct mm_struct *mm = current->mm; 1880 struct mm_struct *mm = current->mm;
1698 struct vm_area_struct *vma; 1881 struct vm_area_struct *vma;
1699 int error; 1882 int error;
1700 1883
1701 if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) 1884 if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
1885 opt != PR_SET_MM_MAP &&
1886 opt != PR_SET_MM_MAP_SIZE)))
1702 return -EINVAL; 1887 return -EINVAL;
1703 1888
1889#ifdef CONFIG_CHECKPOINT_RESTORE
1890 if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
1891 return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
1892#endif
1893
1704 if (!capable(CAP_SYS_RESOURCE)) 1894 if (!capable(CAP_SYS_RESOURCE))
1705 return -EPERM; 1895 return -EPERM;
1706 1896
1707 if (opt == PR_SET_MM_EXE_FILE) 1897 if (opt == PR_SET_MM_EXE_FILE) {
1708 return prctl_set_mm_exe_file(mm, (unsigned int)addr); 1898 down_write(&mm->mmap_sem);
1899 error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr);
1900 up_write(&mm->mmap_sem);
1901 return error;
1902 }
1709 1903
1710 if (addr >= TASK_SIZE || addr < mmap_min_addr) 1904 if (addr >= TASK_SIZE || addr < mmap_min_addr)
1711 return -EINVAL; 1905 return -EINVAL;
@@ -1733,9 +1927,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
1733 if (addr <= mm->end_data) 1927 if (addr <= mm->end_data)
1734 goto out; 1928 goto out;
1735 1929
1736 if (rlim < RLIM_INFINITY && 1930 if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr,
1737 (mm->brk - addr) + 1931 mm->end_data, mm->start_data))
1738 (mm->end_data - mm->start_data) > rlim)
1739 goto out; 1932 goto out;
1740 1933
1741 mm->start_brk = addr; 1934 mm->start_brk = addr;
@@ -1745,9 +1938,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
1745 if (addr <= mm->end_data) 1938 if (addr <= mm->end_data)
1746 goto out; 1939 goto out;
1747 1940
1748 if (rlim < RLIM_INFINITY && 1941 if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk,
1749 (addr - mm->start_brk) + 1942 mm->end_data, mm->start_data))
1750 (mm->end_data - mm->start_data) > rlim)
1751 goto out; 1943 goto out;
1752 1944
1753 mm->brk = addr; 1945 mm->brk = addr;
@@ -2023,6 +2215,7 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
2023{ 2215{
2024 int err = 0; 2216 int err = 0;
2025 int cpu = raw_smp_processor_id(); 2217 int cpu = raw_smp_processor_id();
2218
2026 if (cpup) 2219 if (cpup)
2027 err |= put_user(cpu, cpup); 2220 err |= put_user(cpu, cpup);
2028 if (nodep) 2221 if (nodep)
@@ -2135,7 +2328,7 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
2135 /* Check to see if any memory value is too large for 32-bit and scale 2328 /* Check to see if any memory value is too large for 32-bit and scale
2136 * down if needed 2329 * down if needed
2137 */ 2330 */
2138 if ((s.totalram >> 32) || (s.totalswap >> 32)) { 2331 if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) {
2139 int bitcount = 0; 2332 int bitcount = 0;
2140 2333
2141 while (s.mem_unit < PAGE_SIZE) { 2334 while (s.mem_unit < PAGE_SIZE) {
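The sys.c changes above add prctl_set_mm_map(), which (under CONFIG_CHECKPOINT_RESTORE) validates an entire struct prctl_mm_map and applies it under mmap_sem in one call, instead of one PR_SET_MM_* field at a time. A user-space sketch of the expected calling sequence, assuming the PR_SET_MM_MAP/PR_SET_MM_MAP_SIZE constants and struct prctl_mm_map that this series exposes via <linux/prctl.h>; error handling is minimal and the restored values come from a previously captured map:

	#include <string.h>
	#include <sys/prctl.h>
	#include <linux/prctl.h>

	int restore_mm_layout(const struct prctl_mm_map *saved)
	{
		struct prctl_mm_map map;
		unsigned int size = 0;

		/* Ask the kernel which struct size it expects (ABI check). */
		if (prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, &size, 0, 0))
			return -1;
		if (size != sizeof(map))
			return -1;	/* header and kernel disagree */

		memcpy(&map, saved, sizeof(map));
		map.exe_fd = -1;	/* skip the /proc/self/exe change */

		/* Checked by validate_prctl_map(), then applied in one go. */
		return prctl(PR_SET_MM, PR_SET_MM_MAP, &map, sizeof(map), 0);
	}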
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 2904a2105914..02aa4185b17e 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -25,6 +25,7 @@ cond_syscall(sys_swapon);
25cond_syscall(sys_swapoff); 25cond_syscall(sys_swapoff);
26cond_syscall(sys_kexec_load); 26cond_syscall(sys_kexec_load);
27cond_syscall(compat_sys_kexec_load); 27cond_syscall(compat_sys_kexec_load);
28cond_syscall(sys_kexec_file_load);
28cond_syscall(sys_init_module); 29cond_syscall(sys_init_module);
29cond_syscall(sys_finit_module); 30cond_syscall(sys_finit_module);
30cond_syscall(sys_delete_module); 31cond_syscall(sys_delete_module);
@@ -155,6 +156,9 @@ cond_syscall(sys_process_vm_writev);
155cond_syscall(compat_sys_process_vm_readv); 156cond_syscall(compat_sys_process_vm_readv);
156cond_syscall(compat_sys_process_vm_writev); 157cond_syscall(compat_sys_process_vm_writev);
157cond_syscall(sys_uselib); 158cond_syscall(sys_uselib);
159cond_syscall(sys_fadvise64);
160cond_syscall(sys_fadvise64_64);
161cond_syscall(sys_madvise);
158 162
159/* arch-specific weak syscall entries */ 163/* arch-specific weak syscall entries */
160cond_syscall(sys_pciconfig_read); 164cond_syscall(sys_pciconfig_read);
@@ -197,6 +201,7 @@ cond_syscall(compat_sys_timerfd_settime);
197cond_syscall(compat_sys_timerfd_gettime); 201cond_syscall(compat_sys_timerfd_gettime);
198cond_syscall(sys_eventfd); 202cond_syscall(sys_eventfd);
199cond_syscall(sys_eventfd2); 203cond_syscall(sys_eventfd2);
204cond_syscall(sys_memfd_create);
200 205
201/* performance counters: */ 206/* performance counters: */
202cond_syscall(sys_perf_event_open); 207cond_syscall(sys_perf_event_open);
@@ -216,3 +221,6 @@ cond_syscall(sys_kcmp);
216 221
217/* operate on Secure Computing state */ 222/* operate on Secure Computing state */
218cond_syscall(sys_seccomp); 223cond_syscall(sys_seccomp);
224
225/* access BPF programs and maps */
226cond_syscall(sys_bpf);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 75b22e22a72c..15f2511a1b7c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = {
387 .data = &sysctl_numa_balancing_scan_size, 387 .data = &sysctl_numa_balancing_scan_size,
388 .maxlen = sizeof(unsigned int), 388 .maxlen = sizeof(unsigned int),
389 .mode = 0644, 389 .mode = 0644,
390 .proc_handler = proc_dointvec, 390 .proc_handler = proc_dointvec_minmax,
391 .extra1 = &one,
391 }, 392 },
392 { 393 {
393 .procname = "numa_balancing", 394 .procname = "numa_balancing",
@@ -1055,15 +1056,6 @@ static struct ctl_table kern_table[] = {
1055 .child = key_sysctls, 1056 .child = key_sysctls,
1056 }, 1057 },
1057#endif 1058#endif
1058#ifdef CONFIG_RCU_TORTURE_TEST
1059 {
1060 .procname = "rcutorture_runnable",
1061 .data = &rcutorture_runnable,
1062 .maxlen = sizeof(int),
1063 .mode = 0644,
1064 .proc_handler = proc_dointvec,
1065 },
1066#endif
1067#ifdef CONFIG_PERF_EVENTS 1059#ifdef CONFIG_PERF_EVENTS
1068 /* 1060 /*
1069 * User-space scripts rely on the existence of this file 1061 * User-space scripts rely on the existence of this file
@@ -1240,8 +1232,7 @@ static struct ctl_table vm_table[] = {
1240 .maxlen = sizeof(unsigned long), 1232 .maxlen = sizeof(unsigned long),
1241 .mode = 0644, 1233 .mode = 0644,
1242 .proc_handler = hugetlb_sysctl_handler, 1234 .proc_handler = hugetlb_sysctl_handler,
1243 .extra1 = (void *)&hugetlb_zero, 1235 .extra1 = &zero,
1244 .extra2 = (void *)&hugetlb_infinity,
1245 }, 1236 },
1246#ifdef CONFIG_NUMA 1237#ifdef CONFIG_NUMA
1247 { 1238 {
@@ -1250,8 +1241,7 @@ static struct ctl_table vm_table[] = {
1250 .maxlen = sizeof(unsigned long), 1241 .maxlen = sizeof(unsigned long),
1251 .mode = 0644, 1242 .mode = 0644,
1252 .proc_handler = &hugetlb_mempolicy_sysctl_handler, 1243 .proc_handler = &hugetlb_mempolicy_sysctl_handler,
1253 .extra1 = (void *)&hugetlb_zero, 1244 .extra1 = &zero,
1254 .extra2 = (void *)&hugetlb_infinity,
1255 }, 1245 },
1256#endif 1246#endif
1257 { 1247 {
@@ -1274,8 +1264,7 @@ static struct ctl_table vm_table[] = {
1274 .maxlen = sizeof(unsigned long), 1264 .maxlen = sizeof(unsigned long),
1275 .mode = 0644, 1265 .mode = 0644,
1276 .proc_handler = hugetlb_overcommit_handler, 1266 .proc_handler = hugetlb_overcommit_handler,
1277 .extra1 = (void *)&hugetlb_zero, 1267 .extra1 = &zero,
1278 .extra2 = (void *)&hugetlb_infinity,
1279 }, 1268 },
1280#endif 1269#endif
1281 { 1270 {
@@ -1463,13 +1452,6 @@ static struct ctl_table vm_table[] = {
1463 .extra2 = &one, 1452 .extra2 = &one,
1464 }, 1453 },
1465#endif 1454#endif
1466 {
1467 .procname = "scan_unevictable_pages",
1468 .data = &scan_unevictable_pages,
1469 .maxlen = sizeof(scan_unevictable_pages),
1470 .mode = 0644,
1471 .proc_handler = scan_unevictable_handler,
1472 },
1473#ifdef CONFIG_MEMORY_FAILURE 1455#ifdef CONFIG_MEMORY_FAILURE
1474 { 1456 {
1475 .procname = "memory_failure_early_kill", 1457 .procname = "memory_failure_early_kill",
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index e4ba9a5a5ccb..9a4f750a2963 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -390,7 +390,6 @@ static const struct bin_table bin_net_ipv4_table[] = {
390 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, 390 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
391 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, 391 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
392 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, 392 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
393 { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" },
394 { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, 393 { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
395 { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, 394 { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
396 { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, 395 { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 13d2f7cd65db..b312fcc73024 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -638,7 +638,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
638 fill_tgid_exit(tsk); 638 fill_tgid_exit(tsk);
639 } 639 }
640 640
641 listeners = __this_cpu_ptr(&listener_array); 641 listeners = raw_cpu_ptr(&listener_array);
642 if (list_empty(&listeners->list)) 642 if (list_empty(&listeners->list))
643 return; 643 return;
644 644
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 12d6ebbfdd83..0dbab6d1acb4 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -14,6 +14,8 @@
14 * the GNU General Public License for more details. 14 * the GNU General Public License for more details.
15 */ 15 */
16 16
17#define pr_fmt(fmt) "Kprobe smoke test: " fmt
18
17#include <linux/kernel.h> 19#include <linux/kernel.h>
18#include <linux/kprobes.h> 20#include <linux/kprobes.h>
19#include <linux/random.h> 21#include <linux/random.h>
@@ -41,8 +43,7 @@ static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
41{ 43{
42 if (preh_val != (rand1 / div_factor)) { 44 if (preh_val != (rand1 / div_factor)) {
43 handler_errors++; 45 handler_errors++;
44 printk(KERN_ERR "Kprobe smoke test failed: " 46 pr_err("incorrect value in post_handler\n");
45 "incorrect value in post_handler\n");
46 } 47 }
47 posth_val = preh_val + div_factor; 48 posth_val = preh_val + div_factor;
48} 49}
@@ -59,8 +60,7 @@ static int test_kprobe(void)
59 60
60 ret = register_kprobe(&kp); 61 ret = register_kprobe(&kp);
61 if (ret < 0) { 62 if (ret < 0) {
62 printk(KERN_ERR "Kprobe smoke test failed: " 63 pr_err("register_kprobe returned %d\n", ret);
63 "register_kprobe returned %d\n", ret);
64 return ret; 64 return ret;
65 } 65 }
66 66
@@ -68,14 +68,12 @@ static int test_kprobe(void)
68 unregister_kprobe(&kp); 68 unregister_kprobe(&kp);
69 69
70 if (preh_val == 0) { 70 if (preh_val == 0) {
71 printk(KERN_ERR "Kprobe smoke test failed: " 71 pr_err("kprobe pre_handler not called\n");
72 "kprobe pre_handler not called\n");
73 handler_errors++; 72 handler_errors++;
74 } 73 }
75 74
76 if (posth_val == 0) { 75 if (posth_val == 0) {
77 printk(KERN_ERR "Kprobe smoke test failed: " 76 pr_err("kprobe post_handler not called\n");
78 "kprobe post_handler not called\n");
79 handler_errors++; 77 handler_errors++;
80 } 78 }
81 79
@@ -98,8 +96,7 @@ static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs,
98{ 96{
99 if (preh_val != (rand1 / div_factor) + 1) { 97 if (preh_val != (rand1 / div_factor) + 1) {
100 handler_errors++; 98 handler_errors++;
101 printk(KERN_ERR "Kprobe smoke test failed: " 99 pr_err("incorrect value in post_handler2\n");
102 "incorrect value in post_handler2\n");
103 } 100 }
104 posth_val = preh_val + div_factor; 101 posth_val = preh_val + div_factor;
105} 102}
@@ -120,8 +117,7 @@ static int test_kprobes(void)
120 kp.flags = 0; 117 kp.flags = 0;
121 ret = register_kprobes(kps, 2); 118 ret = register_kprobes(kps, 2);
122 if (ret < 0) { 119 if (ret < 0) {
123 printk(KERN_ERR "Kprobe smoke test failed: " 120 pr_err("register_kprobes returned %d\n", ret);
124 "register_kprobes returned %d\n", ret);
125 return ret; 121 return ret;
126 } 122 }
127 123
@@ -130,14 +126,12 @@ static int test_kprobes(void)
130 ret = target(rand1); 126 ret = target(rand1);
131 127
132 if (preh_val == 0) { 128 if (preh_val == 0) {
133 printk(KERN_ERR "Kprobe smoke test failed: " 129 pr_err("kprobe pre_handler not called\n");
134 "kprobe pre_handler not called\n");
135 handler_errors++; 130 handler_errors++;
136 } 131 }
137 132
138 if (posth_val == 0) { 133 if (posth_val == 0) {
139 printk(KERN_ERR "Kprobe smoke test failed: " 134 pr_err("kprobe post_handler not called\n");
140 "kprobe post_handler not called\n");
141 handler_errors++; 135 handler_errors++;
142 } 136 }
143 137
@@ -146,14 +140,12 @@ static int test_kprobes(void)
146 ret = target2(rand1); 140 ret = target2(rand1);
147 141
148 if (preh_val == 0) { 142 if (preh_val == 0) {
149 printk(KERN_ERR "Kprobe smoke test failed: " 143 pr_err("kprobe pre_handler2 not called\n");
150 "kprobe pre_handler2 not called\n");
151 handler_errors++; 144 handler_errors++;
152 } 145 }
153 146
154 if (posth_val == 0) { 147 if (posth_val == 0) {
155 printk(KERN_ERR "Kprobe smoke test failed: " 148 pr_err("kprobe post_handler2 not called\n");
156 "kprobe post_handler2 not called\n");
157 handler_errors++; 149 handler_errors++;
158 } 150 }
159 151
@@ -166,8 +158,7 @@ static u32 j_kprobe_target(u32 value)
166{ 158{
167 if (value != rand1) { 159 if (value != rand1) {
168 handler_errors++; 160 handler_errors++;
169 printk(KERN_ERR "Kprobe smoke test failed: " 161 pr_err("incorrect value in jprobe handler\n");
170 "incorrect value in jprobe handler\n");
171 } 162 }
172 163
173 jph_val = rand1; 164 jph_val = rand1;
@@ -186,16 +177,14 @@ static int test_jprobe(void)
186 177
187 ret = register_jprobe(&jp); 178 ret = register_jprobe(&jp);
188 if (ret < 0) { 179 if (ret < 0) {
189 printk(KERN_ERR "Kprobe smoke test failed: " 180 pr_err("register_jprobe returned %d\n", ret);
190 "register_jprobe returned %d\n", ret);
191 return ret; 181 return ret;
192 } 182 }
193 183
194 ret = target(rand1); 184 ret = target(rand1);
195 unregister_jprobe(&jp); 185 unregister_jprobe(&jp);
196 if (jph_val == 0) { 186 if (jph_val == 0) {
197 printk(KERN_ERR "Kprobe smoke test failed: " 187 pr_err("jprobe handler not called\n");
198 "jprobe handler not called\n");
199 handler_errors++; 188 handler_errors++;
200 } 189 }
201 190
@@ -217,24 +206,21 @@ static int test_jprobes(void)
217 jp.kp.flags = 0; 206 jp.kp.flags = 0;
218 ret = register_jprobes(jps, 2); 207 ret = register_jprobes(jps, 2);
219 if (ret < 0) { 208 if (ret < 0) {
220 printk(KERN_ERR "Kprobe smoke test failed: " 209 pr_err("register_jprobes returned %d\n", ret);
221 "register_jprobes returned %d\n", ret);
222 return ret; 210 return ret;
223 } 211 }
224 212
225 jph_val = 0; 213 jph_val = 0;
226 ret = target(rand1); 214 ret = target(rand1);
227 if (jph_val == 0) { 215 if (jph_val == 0) {
228 printk(KERN_ERR "Kprobe smoke test failed: " 216 pr_err("jprobe handler not called\n");
229 "jprobe handler not called\n");
230 handler_errors++; 217 handler_errors++;
231 } 218 }
232 219
233 jph_val = 0; 220 jph_val = 0;
234 ret = target2(rand1); 221 ret = target2(rand1);
235 if (jph_val == 0) { 222 if (jph_val == 0) {
236 printk(KERN_ERR "Kprobe smoke test failed: " 223 pr_err("jprobe handler2 not called\n");
237 "jprobe handler2 not called\n");
238 handler_errors++; 224 handler_errors++;
239 } 225 }
240 unregister_jprobes(jps, 2); 226 unregister_jprobes(jps, 2);
@@ -256,13 +242,11 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
256 242
257 if (ret != (rand1 / div_factor)) { 243 if (ret != (rand1 / div_factor)) {
258 handler_errors++; 244 handler_errors++;
259 printk(KERN_ERR "Kprobe smoke test failed: " 245 pr_err("incorrect value in kretprobe handler\n");
260 "incorrect value in kretprobe handler\n");
261 } 246 }
262 if (krph_val == 0) { 247 if (krph_val == 0) {
263 handler_errors++; 248 handler_errors++;
264 printk(KERN_ERR "Kprobe smoke test failed: " 249 pr_err("call to kretprobe entry handler failed\n");
265 "call to kretprobe entry handler failed\n");
266 } 250 }
267 251
268 krph_val = rand1; 252 krph_val = rand1;
@@ -281,16 +265,14 @@ static int test_kretprobe(void)
281 265
282 ret = register_kretprobe(&rp); 266 ret = register_kretprobe(&rp);
283 if (ret < 0) { 267 if (ret < 0) {
284 printk(KERN_ERR "Kprobe smoke test failed: " 268 pr_err("register_kretprobe returned %d\n", ret);
285 "register_kretprobe returned %d\n", ret);
286 return ret; 269 return ret;
287 } 270 }
288 271
289 ret = target(rand1); 272 ret = target(rand1);
290 unregister_kretprobe(&rp); 273 unregister_kretprobe(&rp);
291 if (krph_val != rand1) { 274 if (krph_val != rand1) {
292 printk(KERN_ERR "Kprobe smoke test failed: " 275 pr_err("kretprobe handler not called\n");
293 "kretprobe handler not called\n");
294 handler_errors++; 276 handler_errors++;
295 } 277 }
296 278
@@ -303,13 +285,11 @@ static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs)
303 285
304 if (ret != (rand1 / div_factor) + 1) { 286 if (ret != (rand1 / div_factor) + 1) {
305 handler_errors++; 287 handler_errors++;
306 printk(KERN_ERR "Kprobe smoke test failed: " 288 pr_err("incorrect value in kretprobe handler2\n");
307 "incorrect value in kretprobe handler2\n");
308 } 289 }
309 if (krph_val == 0) { 290 if (krph_val == 0) {
310 handler_errors++; 291 handler_errors++;
311 printk(KERN_ERR "Kprobe smoke test failed: " 292 pr_err("call to kretprobe entry handler failed\n");
312 "call to kretprobe entry handler failed\n");
313 } 293 }
314 294
315 krph_val = rand1; 295 krph_val = rand1;
@@ -332,24 +312,21 @@ static int test_kretprobes(void)
332 rp.kp.flags = 0; 312 rp.kp.flags = 0;
333 ret = register_kretprobes(rps, 2); 313 ret = register_kretprobes(rps, 2);
334 if (ret < 0) { 314 if (ret < 0) {
335 printk(KERN_ERR "Kprobe smoke test failed: " 315 pr_err("register_kretprobe returned %d\n", ret);
336 "register_kretprobe returned %d\n", ret);
337 return ret; 316 return ret;
338 } 317 }
339 318
340 krph_val = 0; 319 krph_val = 0;
341 ret = target(rand1); 320 ret = target(rand1);
342 if (krph_val != rand1) { 321 if (krph_val != rand1) {
343 printk(KERN_ERR "Kprobe smoke test failed: " 322 pr_err("kretprobe handler not called\n");
344 "kretprobe handler not called\n");
345 handler_errors++; 323 handler_errors++;
346 } 324 }
347 325
348 krph_val = 0; 326 krph_val = 0;
349 ret = target2(rand1); 327 ret = target2(rand1);
350 if (krph_val != rand1) { 328 if (krph_val != rand1) {
351 printk(KERN_ERR "Kprobe smoke test failed: " 329 pr_err("kretprobe handler2 not called\n");
352 "kretprobe handler2 not called\n");
353 handler_errors++; 330 handler_errors++;
354 } 331 }
355 unregister_kretprobes(rps, 2); 332 unregister_kretprobes(rps, 2);
@@ -368,7 +345,7 @@ int init_test_probes(void)
368 rand1 = prandom_u32(); 345 rand1 = prandom_u32();
369 } while (rand1 <= div_factor); 346 } while (rand1 <= div_factor);
370 347
371 printk(KERN_INFO "Kprobe smoke test started\n"); 348 pr_info("started\n");
372 num_tests++; 349 num_tests++;
373 ret = test_kprobe(); 350 ret = test_kprobe();
374 if (ret < 0) 351 if (ret < 0)
@@ -402,13 +379,11 @@ int init_test_probes(void)
402#endif /* CONFIG_KRETPROBES */ 379#endif /* CONFIG_KRETPROBES */
403 380
404 if (errors) 381 if (errors)
405 printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " 382 pr_err("BUG: %d out of %d tests failed\n", errors, num_tests);
406 "%d tests failed\n", errors, num_tests);
407 else if (handler_errors) 383 else if (handler_errors)
408 printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " 384 pr_err("BUG: %d error(s) running handlers\n", handler_errors);
409 "running handlers\n", handler_errors);
410 else 385 else
411 printk(KERN_INFO "Kprobe smoke test passed successfully\n"); 386 pr_info("passed successfully\n");
412 387
413 return 0; 388 return 0;
414} 389}
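The test_kprobes.c conversion relies on the pr_fmt() convention: defining the macro before the printk headers are pulled in makes every pr_err()/pr_info() in the file carry the prefix automatically, which is what lets the repeated "Kprobe smoke test failed:" strings collapse. A minimal illustration with an example prefix:

	#define pr_fmt(fmt) "my_module: " fmt	/* must precede the includes */

	#include <linux/kernel.h>
	#include <linux/printk.h>

	static void report_failure(int err)
	{
		/* Logs e.g. "my_module: probe failed: -22" at KERN_ERR level. */
		pr_err("probe failed: %d\n", err);
	}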
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 4aec4a457431..a7077d3ae52f 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -464,18 +464,26 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
464static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, 464static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
465 ktime_t now) 465 ktime_t now)
466{ 466{
467 unsigned long flags;
467 struct k_itimer *ptr = container_of(alarm, struct k_itimer, 468 struct k_itimer *ptr = container_of(alarm, struct k_itimer,
468 it.alarm.alarmtimer); 469 it.alarm.alarmtimer);
469 if (posix_timer_event(ptr, 0) != 0) 470 enum alarmtimer_restart result = ALARMTIMER_NORESTART;
470 ptr->it_overrun++; 471
472 spin_lock_irqsave(&ptr->it_lock, flags);
473 if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) {
474 if (posix_timer_event(ptr, 0) != 0)
475 ptr->it_overrun++;
476 }
471 477
472 /* Re-add periodic timers */ 478 /* Re-add periodic timers */
473 if (ptr->it.alarm.interval.tv64) { 479 if (ptr->it.alarm.interval.tv64) {
474 ptr->it_overrun += alarm_forward(alarm, now, 480 ptr->it_overrun += alarm_forward(alarm, now,
475 ptr->it.alarm.interval); 481 ptr->it.alarm.interval);
476 return ALARMTIMER_RESTART; 482 result = ALARMTIMER_RESTART;
477 } 483 }
478 return ALARMTIMER_NORESTART; 484 spin_unlock_irqrestore(&ptr->it_lock, flags);
485
486 return result;
479} 487}
480 488
481/** 489/**
@@ -541,18 +549,22 @@ static int alarm_timer_create(struct k_itimer *new_timer)
541 * @new_timer: k_itimer pointer 549 * @new_timer: k_itimer pointer
542 * @cur_setting: itimerspec data to fill 550 * @cur_setting: itimerspec data to fill
543 * 551 *
544 * Copies the itimerspec data out from the k_itimer 552 * Copies out the current itimerspec data
545 */ 553 */
546static void alarm_timer_get(struct k_itimer *timr, 554static void alarm_timer_get(struct k_itimer *timr,
547 struct itimerspec *cur_setting) 555 struct itimerspec *cur_setting)
548{ 556{
549 memset(cur_setting, 0, sizeof(struct itimerspec)); 557 ktime_t relative_expiry_time =
558 alarm_expires_remaining(&(timr->it.alarm.alarmtimer));
559
560 if (ktime_to_ns(relative_expiry_time) > 0) {
561 cur_setting->it_value = ktime_to_timespec(relative_expiry_time);
562 } else {
563 cur_setting->it_value.tv_sec = 0;
564 cur_setting->it_value.tv_nsec = 0;
565 }
550 566
551 cur_setting->it_interval = 567 cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval);
552 ktime_to_timespec(timr->it.alarm.interval);
553 cur_setting->it_value =
554 ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires);
555 return;
556} 568}
557 569
558/** 570/**
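The alarm_timer_get() rework above makes timer_gettime() on the alarm clocks report the time remaining until expiry (clamped at zero) instead of leaking the absolute expiry value stored in the hrtimer node. A hedged user-space check of that behaviour; it assumes CLOCK_REALTIME_ALARM is usable (creating alarm timers needs CAP_WAKE_ALARM, and older glibc needs -lrt) and skips error handling:

	#include <signal.h>
	#include <stdio.h>
	#include <time.h>

	#ifndef CLOCK_REALTIME_ALARM
	#define CLOCK_REALTIME_ALARM 8	/* kernel UAPI value */
	#endif

	int main(void)
	{
		timer_t t;
		struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL,
					.sigev_signo  = SIGALRM };
		struct itimerspec its = { .it_value.tv_sec = 30 };	/* one-shot */
		struct itimerspec cur;

		timer_create(CLOCK_REALTIME_ALARM, &sev, &t);
		timer_settime(t, 0, &its, NULL);
		timer_gettime(t, &cur);

		/* With the fix, it_value counts down from ~30s. */
		printf("%lld.%09ld s remaining\n",
		       (long long)cur.it_value.tv_sec, cur.it_value.tv_nsec);
		return 0;
	}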
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 9c94c19f1305..55449909f114 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -72,7 +72,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
72 * Also omit the add if it would overflow the u64 boundary. 72 * Also omit the add if it would overflow the u64 boundary.
73 */ 73 */
74 if ((~0ULL - clc > rnd) && 74 if ((~0ULL - clc > rnd) &&
75 (!ismax || evt->mult <= (1U << evt->shift))) 75 (!ismax || evt->mult <= (1ULL << evt->shift)))
76 clc += rnd; 76 clc += rnd;
77 77
78 do_div(clc, evt->mult); 78 do_div(clc, evt->mult);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 1c2fe7de2842..37e50aadd471 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -558,7 +558,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
558static int hrtimer_reprogram(struct hrtimer *timer, 558static int hrtimer_reprogram(struct hrtimer *timer,
559 struct hrtimer_clock_base *base) 559 struct hrtimer_clock_base *base)
560{ 560{
561 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 561 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
562 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 562 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
563 int res; 563 int res;
564 564
@@ -629,7 +629,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
629 */ 629 */
630static void retrigger_next_event(void *arg) 630static void retrigger_next_event(void *arg)
631{ 631{
632 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); 632 struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
633 633
634 if (!hrtimer_hres_active()) 634 if (!hrtimer_hres_active())
635 return; 635 return;
@@ -903,7 +903,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
903 */ 903 */
904 debug_deactivate(timer); 904 debug_deactivate(timer);
905 timer_stats_hrtimer_clear_start_info(timer); 905 timer_stats_hrtimer_clear_start_info(timer);
906 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); 906 reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
907 /* 907 /*
908 * We must preserve the CALLBACK state flag here, 908 * We must preserve the CALLBACK state flag here,
909 * otherwise we could move the timer base in 909 * otherwise we could move the timer base in
@@ -963,7 +963,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
963 * on dynticks target. 963 * on dynticks target.
964 */ 964 */
965 wake_up_nohz_cpu(new_base->cpu_base->cpu); 965 wake_up_nohz_cpu(new_base->cpu_base->cpu);
966 } else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases) && 966 } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) &&
967 hrtimer_reprogram(timer, new_base)) { 967 hrtimer_reprogram(timer, new_base)) {
968 /* 968 /*
969 * Only allow reprogramming if the new base is on this CPU. 969 * Only allow reprogramming if the new base is on this CPU.
@@ -1103,7 +1103,7 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
1103 */ 1103 */
1104ktime_t hrtimer_get_next_event(void) 1104ktime_t hrtimer_get_next_event(void)
1105{ 1105{
1106 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1106 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1107 struct hrtimer_clock_base *base = cpu_base->clock_base; 1107 struct hrtimer_clock_base *base = cpu_base->clock_base;
1108 ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; 1108 ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
1109 unsigned long flags; 1109 unsigned long flags;
@@ -1144,7 +1144,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1144 1144
1145 memset(timer, 0, sizeof(struct hrtimer)); 1145 memset(timer, 0, sizeof(struct hrtimer));
1146 1146
1147 cpu_base = &__raw_get_cpu_var(hrtimer_bases); 1147 cpu_base = raw_cpu_ptr(&hrtimer_bases);
1148 1148
1149 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) 1149 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
1150 clock_id = CLOCK_MONOTONIC; 1150 clock_id = CLOCK_MONOTONIC;
@@ -1187,7 +1187,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1187 struct hrtimer_cpu_base *cpu_base; 1187 struct hrtimer_cpu_base *cpu_base;
1188 int base = hrtimer_clockid_to_base(which_clock); 1188 int base = hrtimer_clockid_to_base(which_clock);
1189 1189
1190 cpu_base = &__raw_get_cpu_var(hrtimer_bases); 1190 cpu_base = raw_cpu_ptr(&hrtimer_bases);
1191 *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); 1191 *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
1192 1192
1193 return 0; 1193 return 0;
@@ -1242,7 +1242,7 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1242 */ 1242 */
1243void hrtimer_interrupt(struct clock_event_device *dev) 1243void hrtimer_interrupt(struct clock_event_device *dev)
1244{ 1244{
1245 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1245 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1246 ktime_t expires_next, now, entry_time, delta; 1246 ktime_t expires_next, now, entry_time, delta;
1247 int i, retries = 0; 1247 int i, retries = 0;
1248 1248
@@ -1376,7 +1376,7 @@ static void __hrtimer_peek_ahead_timers(void)
1376 if (!hrtimer_hres_active()) 1376 if (!hrtimer_hres_active())
1377 return; 1377 return;
1378 1378
1379 td = &__get_cpu_var(tick_cpu_device); 1379 td = this_cpu_ptr(&tick_cpu_device);
1380 if (td && td->evtdev) 1380 if (td && td->evtdev)
1381 hrtimer_interrupt(td->evtdev); 1381 hrtimer_interrupt(td->evtdev);
1382} 1382}
@@ -1440,7 +1440,7 @@ void hrtimer_run_pending(void)
1440void hrtimer_run_queues(void) 1440void hrtimer_run_queues(void)
1441{ 1441{
1442 struct timerqueue_node *node; 1442 struct timerqueue_node *node;
1443 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1443 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1444 struct hrtimer_clock_base *base; 1444 struct hrtimer_clock_base *base;
1445 int index, gettime = 1; 1445 int index, gettime = 1;
1446 1446
@@ -1679,7 +1679,7 @@ static void migrate_hrtimers(int scpu)
1679 1679
1680 local_irq_disable(); 1680 local_irq_disable();
1681 old_base = &per_cpu(hrtimer_bases, scpu); 1681 old_base = &per_cpu(hrtimer_bases, scpu);
1682 new_base = &__get_cpu_var(hrtimer_bases); 1682 new_base = this_cpu_ptr(&hrtimer_bases);
1683 /* 1683 /*
1684 * The caller is globally serialized and nobody else 1684 * The caller is globally serialized and nobody else
1685 * takes two locks at once, deadlock is not possible. 1685 * takes two locks at once, deadlock is not possible.
@@ -1776,7 +1776,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1776 */ 1776 */
1777 if (!expires) { 1777 if (!expires) {
1778 schedule(); 1778 schedule();
1779 __set_current_state(TASK_RUNNING);
1780 return -EINTR; 1779 return -EINTR;
1781 } 1780 }
1782 1781
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 3b8946416a5f..492b986195d5 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
272 if (same_thread_group(tsk, current)) 272 if (same_thread_group(tsk, current))
273 err = cpu_clock_sample(which_clock, tsk, &rtn); 273 err = cpu_clock_sample(which_clock, tsk, &rtn);
274 } else { 274 } else {
275 unsigned long flags;
276 struct sighand_struct *sighand;
277
278 /*
279 * while_each_thread() is not yet entirely RCU safe,
280 * keep locking the group while sampling process
281 * clock for now.
282 */
283 sighand = lock_task_sighand(tsk, &flags);
284 if (!sighand)
285 return err;
286
287 if (tsk == current || thread_group_leader(tsk)) 275 if (tsk == current || thread_group_leader(tsk))
288 err = cpu_clock_sample_group(which_clock, tsk, &rtn); 276 err = cpu_clock_sample_group(which_clock, tsk, &rtn);
289
290 unlock_task_sighand(tsk, &flags);
291 } 277 }
292 278
293 if (!err) 279 if (!err)
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 42b463ad90f2..31ea01f42e1f 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -636,6 +636,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
636 goto out; 636 goto out;
637 } 637 }
638 } else { 638 } else {
639 memset(&event.sigev_value, 0, sizeof(event.sigev_value));
639 event.sigev_notify = SIGEV_SIGNAL; 640 event.sigev_notify = SIGEV_SIGNAL;
640 event.sigev_signo = SIGALRM; 641 event.sigev_signo = SIGALRM;
641 event.sigev_value.sival_int = new_timer->it_id; 642 event.sigev_value.sival_int = new_timer->it_id;
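
The new memset in the SIGEV_SIGNAL fallback matters because sigev_value is a union: assigning sival_int alone initializes only part of it, and the remaining bytes would otherwise hold stale stack contents that can later reach userspace via the delivered signal's value. A minimal userspace sketch of the same pattern (42 is a stand-in for the timer id, not the kernel's value):

#include <signal.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    struct sigevent event;

    /* Zero the whole union first; writing sival_int alone would leave
     * the rest of sigev_value (e.g. the pointer half on 64-bit)
     * uninitialized. */
    memset(&event.sigev_value, 0, sizeof(event.sigev_value));
    event.sigev_notify = SIGEV_SIGNAL;
    event.sigev_signo = SIGALRM;
    event.sigev_value.sival_int = 42;   /* stand-in for the timer id */

    printf("sival_int=%d\n", event.sigev_value.sival_int);
    return 0;
}
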
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 64c5990fd500..066f0ec05e48 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -554,7 +554,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
554void tick_check_oneshot_broadcast_this_cpu(void) 554void tick_check_oneshot_broadcast_this_cpu(void)
555{ 555{
556 if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { 556 if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) {
557 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 557 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
558 558
559 /* 559 /*
560 * We might be in the middle of switching over from 560 * We might be in the middle of switching over from
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 0a0608edeb26..7efeedf53ebd 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -224,7 +224,7 @@ static void tick_setup_device(struct tick_device *td,
224 224
225void tick_install_replacement(struct clock_event_device *newdev) 225void tick_install_replacement(struct clock_event_device *newdev)
226{ 226{
227 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 227 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
228 int cpu = smp_processor_id(); 228 int cpu = smp_processor_id();
229 229
230 clockevents_exchange_device(td->evtdev, newdev); 230 clockevents_exchange_device(td->evtdev, newdev);
@@ -374,14 +374,14 @@ void tick_shutdown(unsigned int *cpup)
374 374
375void tick_suspend(void) 375void tick_suspend(void)
376{ 376{
377 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 377 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
378 378
379 clockevents_shutdown(td->evtdev); 379 clockevents_shutdown(td->evtdev);
380} 380}
381 381
382void tick_resume(void) 382void tick_resume(void)
383{ 383{
384 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 384 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
385 int broadcast = tick_resume_broadcast(); 385 int broadcast = tick_resume_broadcast();
386 386
387 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); 387 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
@@ -400,4 +400,5 @@ void tick_resume(void)
400void __init tick_init(void) 400void __init tick_init(void)
401{ 401{
402 tick_broadcast_init(); 402 tick_broadcast_init();
403 tick_nohz_init();
403} 404}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index c19c1d84b6f3..366aeb4f2c66 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -99,6 +99,13 @@ static inline int tick_broadcast_oneshot_active(void) { return 0; }
99static inline bool tick_broadcast_oneshot_available(void) { return false; } 99static inline bool tick_broadcast_oneshot_available(void) { return false; }
100#endif /* !TICK_ONESHOT */ 100#endif /* !TICK_ONESHOT */
101 101
102/* NO_HZ_FULL internal */
103#ifdef CONFIG_NO_HZ_FULL
104extern void tick_nohz_init(void);
105# else
106static inline void tick_nohz_init(void) { }
107#endif
108
102/* 109/*
103 * Broadcasting support 110 * Broadcasting support
104 */ 111 */
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 824109060a33..7ce740e78e1b 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -59,7 +59,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
59 */ 59 */
60int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) 60int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
61{ 61{
62 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 62 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
63 struct clock_event_device *dev = td->evtdev; 63 struct clock_event_device *dev = td->evtdev;
64 64
65 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || 65 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 99aa6ee3908f..7b5741fc4110 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -205,7 +205,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
205 */ 205 */
206void __tick_nohz_full_check(void) 206void __tick_nohz_full_check(void)
207{ 207{
208 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 208 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
209 209
210 if (tick_nohz_full_cpu(smp_processor_id())) { 210 if (tick_nohz_full_cpu(smp_processor_id())) {
211 if (ts->tick_stopped && !is_idle_task(current)) { 211 if (ts->tick_stopped && !is_idle_task(current)) {
@@ -225,6 +225,20 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
225}; 225};
226 226
227/* 227/*
228 * Kick this CPU if it's full dynticks in order to force it to
229 * re-evaluate its dependency on the tick and restart it if necessary.
230 * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
231 * is NMI safe.
232 */
233void tick_nohz_full_kick(void)
234{
235 if (!tick_nohz_full_cpu(smp_processor_id()))
236 return;
237
238 irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
239}
240
241/*
228 * Kick the CPU if it's full dynticks in order to force it to 242 * Kick the CPU if it's full dynticks in order to force it to
229 * re-evaluate its dependency on the tick and restart it if necessary. 243 * re-evaluate its dependency on the tick and restart it if necessary.
230 */ 244 */
@@ -281,22 +295,12 @@ out:
281/* Parse the boot-time nohz CPU list from the kernel parameters. */ 295/* Parse the boot-time nohz CPU list from the kernel parameters. */
282static int __init tick_nohz_full_setup(char *str) 296static int __init tick_nohz_full_setup(char *str)
283{ 297{
284 int cpu;
285
286 alloc_bootmem_cpumask_var(&tick_nohz_full_mask); 298 alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
287 alloc_bootmem_cpumask_var(&housekeeping_mask);
288 if (cpulist_parse(str, tick_nohz_full_mask) < 0) { 299 if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
289 pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); 300 pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
301 free_bootmem_cpumask_var(tick_nohz_full_mask);
290 return 1; 302 return 1;
291 } 303 }
292
293 cpu = smp_processor_id();
294 if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
295 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
296 cpumask_clear_cpu(cpu, tick_nohz_full_mask);
297 }
298 cpumask_andnot(housekeeping_mask,
299 cpu_possible_mask, tick_nohz_full_mask);
300 tick_nohz_full_running = true; 304 tick_nohz_full_running = true;
301 305
302 return 1; 306 return 1;
@@ -335,18 +339,11 @@ static int tick_nohz_init_all(void)
335 339
336#ifdef CONFIG_NO_HZ_FULL_ALL 340#ifdef CONFIG_NO_HZ_FULL_ALL
337 if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { 341 if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
338 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); 342 WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n");
339 return err;
340 }
341 if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
342 pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n");
343 return err; 343 return err;
344 } 344 }
345 err = 0; 345 err = 0;
346 cpumask_setall(tick_nohz_full_mask); 346 cpumask_setall(tick_nohz_full_mask);
347 cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
348 cpumask_clear(housekeeping_mask);
349 cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
350 tick_nohz_full_running = true; 347 tick_nohz_full_running = true;
351#endif 348#endif
352 return err; 349 return err;
@@ -361,6 +358,37 @@ void __init tick_nohz_init(void)
361 return; 358 return;
362 } 359 }
363 360
361 if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
362 WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
363 cpumask_clear(tick_nohz_full_mask);
364 tick_nohz_full_running = false;
365 return;
366 }
367
368 /*
369 * Full dynticks uses irq work to drive the tick rescheduling on safe
370 * locking contexts. But then we need irq work to raise its own
371 * interrupts to avoid circular dependency on the tick
372 */
373 if (!arch_irq_work_has_interrupt()) {
374 pr_warning("NO_HZ: Can't run full dynticks because arch doesn't "
375 "support irq work self-IPIs\n");
376 cpumask_clear(tick_nohz_full_mask);
377 cpumask_copy(housekeeping_mask, cpu_possible_mask);
378 tick_nohz_full_running = false;
379 return;
380 }
381
382 cpu = smp_processor_id();
383
384 if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
385 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
386 cpumask_clear_cpu(cpu, tick_nohz_full_mask);
387 }
388
389 cpumask_andnot(housekeeping_mask,
390 cpu_possible_mask, tick_nohz_full_mask);
391
364 for_each_cpu(cpu, tick_nohz_full_mask) 392 for_each_cpu(cpu, tick_nohz_full_mask)
365 context_tracking_cpu_set(cpu); 393 context_tracking_cpu_set(cpu);
366 394
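
The setup relocated into tick_nohz_init() above comes down to cpumask arithmetic: keep the boot CPU ticking for timekeeping and derive the housekeeping set as every possible CPU not in nohz_full. A toy sketch with one byte standing in for a cpumask on a hypothetical 8-CPU machine:

#include <stdio.h>

int main(void)
{
    unsigned char cpu_possible = 0xff;  /* CPUs 0-7 */
    unsigned char nohz_full = 0xff;     /* "nohz_full=0-7" on the command line */
    unsigned int boot_cpu = 0;

    /* The boot CPU must keep the tick for timekeeping, so drop it
     * from the full-dynticks set if it was listed. */
    nohz_full &= (unsigned char)~(1u << boot_cpu);

    /* Everything that is not tickless does the housekeeping work. */
    unsigned char housekeeping = cpu_possible & (unsigned char)~nohz_full;

    printf("nohz_full=%#x housekeeping=%#x\n", nohz_full, housekeeping);
    return 0;
}
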
@@ -545,7 +573,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
545 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; 573 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
546 ktime_t last_update, expires, ret = { .tv64 = 0 }; 574 ktime_t last_update, expires, ret = { .tv64 = 0 };
547 unsigned long rcu_delta_jiffies; 575 unsigned long rcu_delta_jiffies;
548 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 576 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
549 u64 time_delta; 577 u64 time_delta;
550 578
551 time_delta = timekeeping_max_deferment(); 579 time_delta = timekeeping_max_deferment();
@@ -558,7 +586,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
558 } while (read_seqretry(&jiffies_lock, seq)); 586 } while (read_seqretry(&jiffies_lock, seq));
559 587
560 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || 588 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
561 arch_needs_cpu(cpu) || irq_work_needs_cpu()) { 589 arch_needs_cpu() || irq_work_needs_cpu()) {
562 next_jiffies = last_jiffies + 1; 590 next_jiffies = last_jiffies + 1;
563 delta_jiffies = 1; 591 delta_jiffies = 1;
564 } else { 592 } else {
@@ -813,7 +841,7 @@ void tick_nohz_idle_enter(void)
813 841
814 local_irq_disable(); 842 local_irq_disable();
815 843
816 ts = &__get_cpu_var(tick_cpu_sched); 844 ts = this_cpu_ptr(&tick_cpu_sched);
817 ts->inidle = 1; 845 ts->inidle = 1;
818 __tick_nohz_idle_enter(ts); 846 __tick_nohz_idle_enter(ts);
819 847
@@ -831,7 +859,7 @@ EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
831 */ 859 */
832void tick_nohz_irq_exit(void) 860void tick_nohz_irq_exit(void)
833{ 861{
834 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 862 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
835 863
836 if (ts->inidle) 864 if (ts->inidle)
837 __tick_nohz_idle_enter(ts); 865 __tick_nohz_idle_enter(ts);
@@ -846,7 +874,7 @@ void tick_nohz_irq_exit(void)
846 */ 874 */
847ktime_t tick_nohz_get_sleep_length(void) 875ktime_t tick_nohz_get_sleep_length(void)
848{ 876{
849 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 877 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
850 878
851 return ts->sleep_length; 879 return ts->sleep_length;
852} 880}
@@ -924,7 +952,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
924 */ 952 */
925void tick_nohz_idle_exit(void) 953void tick_nohz_idle_exit(void)
926{ 954{
927 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 955 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
928 ktime_t now; 956 ktime_t now;
929 957
930 local_irq_disable(); 958 local_irq_disable();
@@ -959,7 +987,7 @@ static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
959 */ 987 */
960static void tick_nohz_handler(struct clock_event_device *dev) 988static void tick_nohz_handler(struct clock_event_device *dev)
961{ 989{
962 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 990 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
963 struct pt_regs *regs = get_irq_regs(); 991 struct pt_regs *regs = get_irq_regs();
964 ktime_t now = ktime_get(); 992 ktime_t now = ktime_get();
965 993
@@ -968,6 +996,10 @@ static void tick_nohz_handler(struct clock_event_device *dev)
968 tick_sched_do_timer(now); 996 tick_sched_do_timer(now);
969 tick_sched_handle(ts, regs); 997 tick_sched_handle(ts, regs);
970 998
999 /* No need to reprogram if we are running tickless */
1000 if (unlikely(ts->tick_stopped))
1001 return;
1002
971 while (tick_nohz_reprogram(ts, now)) { 1003 while (tick_nohz_reprogram(ts, now)) {
972 now = ktime_get(); 1004 now = ktime_get();
973 tick_do_update_jiffies64(now); 1005 tick_do_update_jiffies64(now);
@@ -979,7 +1011,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
979 */ 1011 */
980static void tick_nohz_switch_to_nohz(void) 1012static void tick_nohz_switch_to_nohz(void)
981{ 1013{
982 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 1014 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
983 ktime_t next; 1015 ktime_t next;
984 1016
985 if (!tick_nohz_enabled) 1017 if (!tick_nohz_enabled)
@@ -1041,7 +1073,7 @@ static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
1041 1073
1042static inline void tick_nohz_irq_enter(void) 1074static inline void tick_nohz_irq_enter(void)
1043{ 1075{
1044 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 1076 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1045 ktime_t now; 1077 ktime_t now;
1046 1078
1047 if (!ts->idle_active && !ts->tick_stopped) 1079 if (!ts->idle_active && !ts->tick_stopped)
@@ -1095,6 +1127,10 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
1095 if (regs) 1127 if (regs)
1096 tick_sched_handle(ts, regs); 1128 tick_sched_handle(ts, regs);
1097 1129
1130 /* No need to reprogram if we are in idle or full dynticks mode */
1131 if (unlikely(ts->tick_stopped))
1132 return HRTIMER_NORESTART;
1133
1098 hrtimer_forward(timer, now, tick_period); 1134 hrtimer_forward(timer, now, tick_period);
1099 1135
1100 return HRTIMER_RESTART; 1136 return HRTIMER_RESTART;
@@ -1115,7 +1151,7 @@ early_param("skew_tick", skew_tick);
1115 */ 1151 */
1116void tick_setup_sched_timer(void) 1152void tick_setup_sched_timer(void)
1117{ 1153{
1118 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 1154 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1119 ktime_t now = ktime_get(); 1155 ktime_t now = ktime_get();
1120 1156
1121 /* 1157 /*
@@ -1184,7 +1220,7 @@ void tick_clock_notify(void)
1184 */ 1220 */
1185void tick_oneshot_notify(void) 1221void tick_oneshot_notify(void)
1186{ 1222{
1187 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 1223 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1188 1224
1189 set_bit(0, &ts->check_clocks); 1225 set_bit(0, &ts->check_clocks);
1190} 1226}
@@ -1199,7 +1235,7 @@ void tick_oneshot_notify(void)
1199 */ 1235 */
1200int tick_check_oneshot_change(int allow_nohz) 1236int tick_check_oneshot_change(int allow_nohz)
1201{ 1237{
1202 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 1238 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1203 1239
1204 if (!test_and_clear_bit(0, &ts->check_clocks)) 1240 if (!test_and_clear_bit(0, &ts->check_clocks))
1205 return 0; 1241 return 0;
diff --git a/kernel/time/time.c b/kernel/time/time.c
index f0294ba14634..a9ae20fb0b11 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -559,17 +559,20 @@ EXPORT_SYMBOL(usecs_to_jiffies);
559 * that a remainder subtract here would not do the right thing as the 559 * that a remainder subtract here would not do the right thing as the
560 * resolution values don't fall on second boundries. I.e. the line: 560 * resolution values don't fall on second boundries. I.e. the line:
561 * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. 561 * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
562 * Note that due to the small error in the multiplier here, this
563 * rounding is incorrect for sufficiently large values of tv_nsec, but
564 * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're
565 * OK.
562 * 566 *
563 * Rather, we just shift the bits off the right. 567 * Rather, we just shift the bits off the right.
564 * 568 *
565 * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec 569 * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
566 * value to a scaled second value. 570 * value to a scaled second value.
567 */ 571 */
568unsigned long 572static unsigned long
569timespec_to_jiffies(const struct timespec *value) 573__timespec_to_jiffies(unsigned long sec, long nsec)
570{ 574{
571 unsigned long sec = value->tv_sec; 575 nsec = nsec + TICK_NSEC - 1;
572 long nsec = value->tv_nsec + TICK_NSEC - 1;
573 576
574 if (sec >= MAX_SEC_IN_JIFFIES){ 577 if (sec >= MAX_SEC_IN_JIFFIES){
575 sec = MAX_SEC_IN_JIFFIES; 578 sec = MAX_SEC_IN_JIFFIES;
@@ -580,6 +583,13 @@ timespec_to_jiffies(const struct timespec *value)
580 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; 583 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
581 584
582} 585}
586
587unsigned long
588timespec_to_jiffies(const struct timespec *value)
589{
590 return __timespec_to_jiffies(value->tv_sec, value->tv_nsec);
591}
592
583EXPORT_SYMBOL(timespec_to_jiffies); 593EXPORT_SYMBOL(timespec_to_jiffies);
584 594
585void 595void
@@ -596,31 +606,27 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
596} 606}
597EXPORT_SYMBOL(jiffies_to_timespec); 607EXPORT_SYMBOL(jiffies_to_timespec);
598 608
599/* Same for "timeval" 609/*
600 * 610 * We could use a similar algorithm to timespec_to_jiffies (with a
601 * Well, almost. The problem here is that the real system resolution is 611 * different multiplier for usec instead of nsec). But this has a
602 * in nanoseconds and the value being converted is in micro seconds. 612 * problem with rounding: we can't exactly add TICK_NSEC - 1 to the
603 * Also for some machines (those that use HZ = 1024, in-particular), 613 * usec value, since it's not necessarily integral.
604 * there is a LARGE error in the tick size in microseconds. 614 *
605 615 * We could instead round in the intermediate scaled representation
606 * The solution we use is to do the rounding AFTER we convert the 616 * (i.e. in units of 1/2^(large scale) jiffies) but that's also
607 * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. 617 * perilous: the scaling introduces a small positive error, which
608 * Instruction wise, this should cost only an additional add with carry 618 * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1
609 * instruction above the way it was done above. 619 * units to the intermediate before shifting) leads to accidental
620 * overflow and overestimates.
621 *
622 * At the cost of one additional multiplication by a constant, just
623 * use the timespec implementation.
610 */ 624 */
611unsigned long 625unsigned long
612timeval_to_jiffies(const struct timeval *value) 626timeval_to_jiffies(const struct timeval *value)
613{ 627{
614 unsigned long sec = value->tv_sec; 628 return __timespec_to_jiffies(value->tv_sec,
615 long usec = value->tv_usec; 629 value->tv_usec * NSEC_PER_USEC);
616
617 if (sec >= MAX_SEC_IN_JIFFIES){
618 sec = MAX_SEC_IN_JIFFIES;
619 usec = 0;
620 }
621 return (((u64)sec * SEC_CONVERSION) +
622 (((u64)usec * USEC_CONVERSION + USEC_ROUND) >>
623 (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
624} 630}
625EXPORT_SYMBOL(timeval_to_jiffies); 631EXPORT_SYMBOL(timeval_to_jiffies);
626 632
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f36b02838a47..ec1791fae965 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -338,10 +338,11 @@ EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
338 338
339static inline void update_vsyscall(struct timekeeper *tk) 339static inline void update_vsyscall(struct timekeeper *tk)
340{ 340{
341 struct timespec xt; 341 struct timespec xt, wm;
342 342
343 xt = timespec64_to_timespec(tk_xtime(tk)); 343 xt = timespec64_to_timespec(tk_xtime(tk));
344 update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->tkr.clock, tk->tkr.mult, 344 wm = timespec64_to_timespec(tk->wall_to_monotonic);
345 update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult,
345 tk->tkr.cycle_last); 346 tk->tkr.cycle_last);
346} 347}
347 348
@@ -441,11 +442,12 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
441 tk->ntp_error = 0; 442 tk->ntp_error = 0;
442 ntp_clear(); 443 ntp_clear();
443 } 444 }
444 update_vsyscall(tk);
445 update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
446 445
447 tk_update_ktime_data(tk); 446 tk_update_ktime_data(tk);
448 447
448 update_vsyscall(tk);
449 update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
450
449 if (action & TK_MIRROR) 451 if (action & TK_MIRROR)
450 memcpy(&shadow_timekeeper, &tk_core.timekeeper, 452 memcpy(&shadow_timekeeper, &tk_core.timekeeper,
451 sizeof(tk_core.timekeeper)); 453 sizeof(tk_core.timekeeper));
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index aca5dfe2fa3d..3260ffdb368f 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -655,7 +655,7 @@ static inline void debug_assert_init(struct timer_list *timer)
655static void do_init_timer(struct timer_list *timer, unsigned int flags, 655static void do_init_timer(struct timer_list *timer, unsigned int flags,
656 const char *name, struct lock_class_key *key) 656 const char *name, struct lock_class_key *key)
657{ 657{
658 struct tvec_base *base = __raw_get_cpu_var(tvec_bases); 658 struct tvec_base *base = raw_cpu_read(tvec_bases);
659 659
660 timer->entry.next = NULL; 660 timer->entry.next = NULL;
661 timer->base = (void *)((unsigned long)base | flags); 661 timer->base = (void *)((unsigned long)base | flags);
@@ -1385,7 +1385,7 @@ void update_process_times(int user_tick)
1385 rcu_check_callbacks(cpu, user_tick); 1385 rcu_check_callbacks(cpu, user_tick);
1386#ifdef CONFIG_IRQ_WORK 1386#ifdef CONFIG_IRQ_WORK
1387 if (in_irq()) 1387 if (in_irq())
1388 irq_work_run(); 1388 irq_work_tick();
1389#endif 1389#endif
1390 scheduler_tick(); 1390 scheduler_tick();
1391 run_posix_cpu_timers(p); 1391 run_posix_cpu_timers(p);
diff --git a/kernel/torture.c b/kernel/torture.c
index d600af21f022..dd70993c266c 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -211,18 +211,16 @@ EXPORT_SYMBOL_GPL(torture_onoff_cleanup);
211/* 211/*
212 * Print online/offline testing statistics. 212 * Print online/offline testing statistics.
213 */ 213 */
214char *torture_onoff_stats(char *page) 214void torture_onoff_stats(void)
215{ 215{
216#ifdef CONFIG_HOTPLUG_CPU 216#ifdef CONFIG_HOTPLUG_CPU
217 page += sprintf(page, 217 pr_cont("onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
218 "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", 218 n_online_successes, n_online_attempts,
219 n_online_successes, n_online_attempts, 219 n_offline_successes, n_offline_attempts,
220 n_offline_successes, n_offline_attempts, 220 min_online, max_online,
221 min_online, max_online, 221 min_offline, max_offline,
222 min_offline, max_offline, 222 sum_online, sum_offline, HZ);
223 sum_online, sum_offline, HZ);
224#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 223#endif /* #ifdef CONFIG_HOTPLUG_CPU */
225 return page;
226} 224}
227EXPORT_SYMBOL_GPL(torture_onoff_stats); 225EXPORT_SYMBOL_GPL(torture_onoff_stats);
228 226
@@ -635,8 +633,13 @@ EXPORT_SYMBOL_GPL(torture_init_end);
635 * 633 *
636 * This must be called before the caller starts shutting down its own 634 * This must be called before the caller starts shutting down its own
637 * kthreads. 635 * kthreads.
636 *
637 * Both torture_cleanup_begin() and torture_cleanup_end() must be paired,
638 * in order to correctly perform the cleanup. They are separated because
639 * threads can still need to reference the torture_type type, thus nullify
640 * only after completing all other relevant calls.
638 */ 641 */
639bool torture_cleanup(void) 642bool torture_cleanup_begin(void)
640{ 643{
641 mutex_lock(&fullstop_mutex); 644 mutex_lock(&fullstop_mutex);
642 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { 645 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
@@ -651,12 +654,17 @@ bool torture_cleanup(void)
651 torture_shuffle_cleanup(); 654 torture_shuffle_cleanup();
652 torture_stutter_cleanup(); 655 torture_stutter_cleanup();
653 torture_onoff_cleanup(); 656 torture_onoff_cleanup();
657 return false;
658}
659EXPORT_SYMBOL_GPL(torture_cleanup_begin);
660
661void torture_cleanup_end(void)
662{
654 mutex_lock(&fullstop_mutex); 663 mutex_lock(&fullstop_mutex);
655 torture_type = NULL; 664 torture_type = NULL;
656 mutex_unlock(&fullstop_mutex); 665 mutex_unlock(&fullstop_mutex);
657 return false;
658} 666}
659EXPORT_SYMBOL_GPL(torture_cleanup); 667EXPORT_SYMBOL_GPL(torture_cleanup_end);
660 668
661/* 669/*
662 * Is it time for the current torture test to stop? 670 * Is it time for the current torture test to stop?
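
The split documented above is easiest to read as a calling pattern: torture_cleanup_begin() tears down the shared infrastructure while torture_type is still valid, the caller then stops its own kthreads, and only torture_cleanup_end() clears torture_type. A sketch with stub functions standing in for the real API:

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the real torture API, so the pairing can be shown. */
static bool torture_cleanup_begin(void) { puts("begin: stop shared kthreads"); return false; }
static void torture_cleanup_end(void)   { puts("end: clear torture_type"); }
static void my_test_cleanup(void)       { puts("stop this test's own kthreads"); }

static void my_torture_cleanup(void)
{
    if (torture_cleanup_begin())
        return;             /* a shutdown path already did the work */
    my_test_cleanup();      /* may still reference torture_type here */
    torture_cleanup_end();  /* only now is torture_type nulled out */
}

int main(void)
{
    my_torture_cleanup();
    return 0;
}
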
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1654b12c891a..31c90fec4158 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -65,15 +65,21 @@
65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) 65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL)
66 66
67#ifdef CONFIG_DYNAMIC_FTRACE 67#ifdef CONFIG_DYNAMIC_FTRACE
68#define INIT_REGEX_LOCK(opsname) \ 68#define INIT_OPS_HASH(opsname) \
69 .regex_lock = __MUTEX_INITIALIZER(opsname.regex_lock), 69 .func_hash = &opsname.local_hash, \
70 .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
71#define ASSIGN_OPS_HASH(opsname, val) \
72 .func_hash = val, \
73 .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
70#else 74#else
71#define INIT_REGEX_LOCK(opsname) 75#define INIT_OPS_HASH(opsname)
76#define ASSIGN_OPS_HASH(opsname, val)
72#endif 77#endif
73 78
74static struct ftrace_ops ftrace_list_end __read_mostly = { 79static struct ftrace_ops ftrace_list_end __read_mostly = {
75 .func = ftrace_stub, 80 .func = ftrace_stub,
76 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, 81 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
82 INIT_OPS_HASH(ftrace_list_end)
77}; 83};
78 84
79/* ftrace_enabled is a method to turn ftrace on or off */ 85/* ftrace_enabled is a method to turn ftrace on or off */
@@ -107,6 +113,9 @@ ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
107static struct ftrace_ops global_ops; 113static struct ftrace_ops global_ops;
108static struct ftrace_ops control_ops; 114static struct ftrace_ops control_ops;
109 115
116static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
117 struct ftrace_ops *op, struct pt_regs *regs);
118
110#if ARCH_SUPPORTS_FTRACE_OPS 119#if ARCH_SUPPORTS_FTRACE_OPS
111static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, 120static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
112 struct ftrace_ops *op, struct pt_regs *regs); 121 struct ftrace_ops *op, struct pt_regs *regs);
@@ -140,7 +149,8 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops)
140{ 149{
141#ifdef CONFIG_DYNAMIC_FTRACE 150#ifdef CONFIG_DYNAMIC_FTRACE
142 if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) { 151 if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) {
143 mutex_init(&ops->regex_lock); 152 mutex_init(&ops->local_hash.regex_lock);
153 ops->func_hash = &ops->local_hash;
144 ops->flags |= FTRACE_OPS_FL_INITIALIZED; 154 ops->flags |= FTRACE_OPS_FL_INITIALIZED;
145 } 155 }
146#endif 156#endif
@@ -244,18 +254,24 @@ static void update_ftrace_function(void)
244 ftrace_func_t func; 254 ftrace_func_t func;
245 255
246 /* 256 /*
257 * Prepare the ftrace_ops that the arch callback will use.
258 * If there's only one ftrace_ops registered, the ftrace_ops_list
259 * will point to the ops we want.
260 */
261 set_function_trace_op = ftrace_ops_list;
262
263 /* If there's no ftrace_ops registered, just call the stub function */
264 if (ftrace_ops_list == &ftrace_list_end) {
265 func = ftrace_stub;
266
267 /*
247 * If we are at the end of the list and this ops is 268 * If we are at the end of the list and this ops is
248 * recursion safe and not dynamic and the arch supports passing ops, 269 * recursion safe and not dynamic and the arch supports passing ops,
249 * then have the mcount trampoline call the function directly. 270 * then have the mcount trampoline call the function directly.
250 */ 271 */
251 if (ftrace_ops_list == &ftrace_list_end || 272 } else if (ftrace_ops_list->next == &ftrace_list_end) {
252 (ftrace_ops_list->next == &ftrace_list_end && 273 func = ftrace_ops_get_func(ftrace_ops_list);
253 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && 274
254 (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&
255 !FTRACE_FORCE_LIST_FUNC)) {
256 /* Set the ftrace_ops that the arch callback uses */
257 set_function_trace_op = ftrace_ops_list;
258 func = ftrace_ops_list->func;
259 } else { 275 } else {
260 /* Just use the default ftrace_ops */ 276 /* Just use the default ftrace_ops */
261 set_function_trace_op = &ftrace_list_end; 277 set_function_trace_op = &ftrace_list_end;
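
The restructured update_ftrace_function() above picks the callback from the shape of the ops list: an empty list means the stub, a single entry can be called directly, anything else goes through the generic list walker. A reduced userspace sketch of that choice (the recursion-safety and saved-regs checks behind ftrace_ops_get_func() are omitted):

#include <stdio.h>

struct toy_ops {
    void (*func)(unsigned long ip);
    struct toy_ops *next;
};

static void toy_stub(unsigned long ip)      { (void)ip; }
static void toy_list_func(unsigned long ip) { printf("walk all ops for %#lx\n", ip); }
static void toy_tracer(unsigned long ip)    { printf("trace %#lx\n", ip); }

static struct toy_ops toy_list_end = { toy_stub, NULL };

static void (*pick_func(struct toy_ops *list))(unsigned long)
{
    if (list == &toy_list_end)
        return toy_stub;        /* nothing registered */
    if (list->next == &toy_list_end)
        return list->func;      /* single ops: call it directly */
    return toy_list_func;       /* several ops: iterate them all */
}

int main(void)
{
    struct toy_ops one = { toy_tracer, &toy_list_end };

    pick_func(&toy_list_end)(0x1000);   /* stub, prints nothing */
    pick_func(&one)(0x1000);            /* direct call */
    return 0;
}
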
@@ -899,7 +915,7 @@ static void unregister_ftrace_profiler(void)
899static struct ftrace_ops ftrace_profile_ops __read_mostly = { 915static struct ftrace_ops ftrace_profile_ops __read_mostly = {
900 .func = function_profile_call, 916 .func = function_profile_call,
901 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, 917 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
902 INIT_REGEX_LOCK(ftrace_profile_ops) 918 INIT_OPS_HASH(ftrace_profile_ops)
903}; 919};
904 920
905static int register_ftrace_profiler(void) 921static int register_ftrace_profiler(void)
@@ -1041,6 +1057,12 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid;
1041 1057
1042static struct ftrace_ops *removed_ops; 1058static struct ftrace_ops *removed_ops;
1043 1059
1060/*
1061 * Set when doing a global update, like enabling all recs or disabling them.
1062 * It is not set when just updating a single ftrace_ops.
1063 */
1064static bool update_all_ops;
1065
1044#ifndef CONFIG_FTRACE_MCOUNT_RECORD 1066#ifndef CONFIG_FTRACE_MCOUNT_RECORD
1045# error Dynamic ftrace depends on MCOUNT_RECORD 1067# error Dynamic ftrace depends on MCOUNT_RECORD
1046#endif 1068#endif
@@ -1081,11 +1103,12 @@ static const struct ftrace_hash empty_hash = {
1081#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) 1103#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash)
1082 1104
1083static struct ftrace_ops global_ops = { 1105static struct ftrace_ops global_ops = {
1084 .func = ftrace_stub, 1106 .func = ftrace_stub,
1085 .notrace_hash = EMPTY_HASH, 1107 .local_hash.notrace_hash = EMPTY_HASH,
1086 .filter_hash = EMPTY_HASH, 1108 .local_hash.filter_hash = EMPTY_HASH,
1087 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, 1109 INIT_OPS_HASH(global_ops)
1088 INIT_REGEX_LOCK(global_ops) 1110 .flags = FTRACE_OPS_FL_RECURSION_SAFE |
1111 FTRACE_OPS_FL_INITIALIZED,
1089}; 1112};
1090 1113
1091struct ftrace_page { 1114struct ftrace_page {
@@ -1226,8 +1249,8 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
1226void ftrace_free_filter(struct ftrace_ops *ops) 1249void ftrace_free_filter(struct ftrace_ops *ops)
1227{ 1250{
1228 ftrace_ops_init(ops); 1251 ftrace_ops_init(ops);
1229 free_ftrace_hash(ops->filter_hash); 1252 free_ftrace_hash(ops->func_hash->filter_hash);
1230 free_ftrace_hash(ops->notrace_hash); 1253 free_ftrace_hash(ops->func_hash->notrace_hash);
1231} 1254}
1232 1255
1233static struct ftrace_hash *alloc_ftrace_hash(int size_bits) 1256static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
@@ -1288,9 +1311,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1288} 1311}
1289 1312
1290static void 1313static void
1291ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); 1314ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash);
1292static void 1315static void
1293ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); 1316ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
1294 1317
1295static int 1318static int
1296ftrace_hash_move(struct ftrace_ops *ops, int enable, 1319ftrace_hash_move(struct ftrace_ops *ops, int enable,
@@ -1299,7 +1322,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1299 struct ftrace_func_entry *entry; 1322 struct ftrace_func_entry *entry;
1300 struct hlist_node *tn; 1323 struct hlist_node *tn;
1301 struct hlist_head *hhd; 1324 struct hlist_head *hhd;
1302 struct ftrace_hash *old_hash;
1303 struct ftrace_hash *new_hash; 1325 struct ftrace_hash *new_hash;
1304 int size = src->count; 1326 int size = src->count;
1305 int bits = 0; 1327 int bits = 0;
@@ -1342,17 +1364,30 @@ update:
1342 * Remove the current set, update the hash and add 1364 * Remove the current set, update the hash and add
1343 * them back. 1365 * them back.
1344 */ 1366 */
1345 ftrace_hash_rec_disable(ops, enable); 1367 ftrace_hash_rec_disable_modify(ops, enable);
1346 1368
1347 old_hash = *dst;
1348 rcu_assign_pointer(*dst, new_hash); 1369 rcu_assign_pointer(*dst, new_hash);
1349 free_ftrace_hash_rcu(old_hash);
1350 1370
1351 ftrace_hash_rec_enable(ops, enable); 1371 ftrace_hash_rec_enable_modify(ops, enable);
1352 1372
1353 return 0; 1373 return 0;
1354} 1374}
1355 1375
1376static bool hash_contains_ip(unsigned long ip,
1377 struct ftrace_ops_hash *hash)
1378{
1379 /*
1380 * The function record is a match if it exists in the filter
 1381	 * hash and not in the notrace hash. Note, an empty hash is

1382 * considered a match for the filter hash, but an empty
1383 * notrace hash is considered not in the notrace hash.
1384 */
1385 return (ftrace_hash_empty(hash->filter_hash) ||
1386 ftrace_lookup_ip(hash->filter_hash, ip)) &&
1387 (ftrace_hash_empty(hash->notrace_hash) ||
1388 !ftrace_lookup_ip(hash->notrace_hash, ip));
1389}
1390
1356/* 1391/*
1357 * Test the hashes for this ops to see if we want to call 1392 * Test the hashes for this ops to see if we want to call
1358 * the ops->func or not. 1393 * the ops->func or not.
@@ -1368,8 +1403,7 @@ update:
1368static int 1403static int
1369ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) 1404ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
1370{ 1405{
1371 struct ftrace_hash *filter_hash; 1406 struct ftrace_ops_hash hash;
1372 struct ftrace_hash *notrace_hash;
1373 int ret; 1407 int ret;
1374 1408
1375#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS 1409#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
@@ -1382,13 +1416,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
1382 return 0; 1416 return 0;
1383#endif 1417#endif
1384 1418
1385 filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); 1419 hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash);
1386 notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); 1420 hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash);
1387 1421
1388 if ((ftrace_hash_empty(filter_hash) || 1422 if (hash_contains_ip(ip, &hash))
1389 ftrace_lookup_ip(filter_hash, ip)) &&
1390 (ftrace_hash_empty(notrace_hash) ||
1391 !ftrace_lookup_ip(notrace_hash, ip)))
1392 ret = 1; 1423 ret = 1;
1393 else 1424 else
1394 ret = 0; 1425 ret = 0;
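
hash_contains_ip() and its use in ftrace_ops_test() above reduce to one predicate: the ip must be in the filter hash (where an empty filter hash means "trace everything") and must not be in the notrace hash (where an empty notrace hash means "exclude nothing"). A userspace sketch of the same rule, with toy array-backed sets standing in for the ftrace hashes:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_hash {
    const unsigned long *ips;
    size_t count;               /* count of 0 == empty set */
};

static bool toy_hash_empty(const struct toy_hash *h)
{
    return h->count == 0;
}

static bool toy_lookup_ip(const struct toy_hash *h, unsigned long ip)
{
    for (size_t i = 0; i < h->count; i++)
        if (h->ips[i] == ip)
            return true;
    return false;
}

static bool toy_hash_contains_ip(unsigned long ip,
                                 const struct toy_hash *filter,
                                 const struct toy_hash *notrace)
{
    return (toy_hash_empty(filter) || toy_lookup_ip(filter, ip)) &&
           (toy_hash_empty(notrace) || !toy_lookup_ip(notrace, ip));
}

int main(void)
{
    unsigned long filtered[] = { 0x1000 };
    struct toy_hash filter = { filtered, 1 };
    struct toy_hash notrace = { NULL, 0 };

    printf("0x1000 traced: %d\n", toy_hash_contains_ip(0x1000, &filter, &notrace));
    printf("0x2000 traced: %d\n", toy_hash_contains_ip(0x2000, &filter, &notrace));
    return 0;
}
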
@@ -1500,33 +1531,6 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec)
1500 return keep_regs; 1531 return keep_regs;
1501} 1532}
1502 1533
1503static void ftrace_remove_tramp(struct ftrace_ops *ops,
1504 struct dyn_ftrace *rec)
1505{
1506 struct ftrace_func_entry *entry;
1507
1508 entry = ftrace_lookup_ip(ops->tramp_hash, rec->ip);
1509 if (!entry)
1510 return;
1511
1512 /*
1513 * The tramp_hash entry will be removed at time
1514 * of update.
1515 */
1516 ops->nr_trampolines--;
1517 rec->flags &= ~FTRACE_FL_TRAMP;
1518}
1519
1520static void ftrace_clear_tramps(struct dyn_ftrace *rec)
1521{
1522 struct ftrace_ops *op;
1523
1524 do_for_each_ftrace_op(op, ftrace_ops_list) {
1525 if (op->nr_trampolines)
1526 ftrace_remove_tramp(op, rec);
1527 } while_for_each_ftrace_op(op);
1528}
1529
1530static void __ftrace_hash_rec_update(struct ftrace_ops *ops, 1534static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1531 int filter_hash, 1535 int filter_hash,
1532 bool inc) 1536 bool inc)
@@ -1554,14 +1558,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1554 * gets inversed. 1558 * gets inversed.
1555 */ 1559 */
1556 if (filter_hash) { 1560 if (filter_hash) {
1557 hash = ops->filter_hash; 1561 hash = ops->func_hash->filter_hash;
1558 other_hash = ops->notrace_hash; 1562 other_hash = ops->func_hash->notrace_hash;
1559 if (ftrace_hash_empty(hash)) 1563 if (ftrace_hash_empty(hash))
1560 all = 1; 1564 all = 1;
1561 } else { 1565 } else {
1562 inc = !inc; 1566 inc = !inc;
1563 hash = ops->notrace_hash; 1567 hash = ops->func_hash->notrace_hash;
1564 other_hash = ops->filter_hash; 1568 other_hash = ops->func_hash->filter_hash;
1565 /* 1569 /*
1566 * If the notrace hash has no items, 1570 * If the notrace hash has no items,
1567 * then there's nothing to do. 1571 * then there's nothing to do.
@@ -1615,22 +1619,17 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1615 * function, and the ops has a trampoline registered 1619 * function, and the ops has a trampoline registered
1616 * for it, then we can call it directly. 1620 * for it, then we can call it directly.
1617 */ 1621 */
1618 if (ftrace_rec_count(rec) == 1 && ops->trampoline) { 1622 if (ftrace_rec_count(rec) == 1 && ops->trampoline)
1619 rec->flags |= FTRACE_FL_TRAMP; 1623 rec->flags |= FTRACE_FL_TRAMP;
1620 ops->nr_trampolines++; 1624 else
1621 } else {
1622 /* 1625 /*
1623 * If we are adding another function callback 1626 * If we are adding another function callback
1624 * to this function, and the previous had a 1627 * to this function, and the previous had a
1625 * trampoline used, then we need to go back to 1628 * custom trampoline in use, then we need to go
1626 * the default trampoline. 1629 * back to the default trampoline.
1627 */ 1630 */
1628 rec->flags &= ~FTRACE_FL_TRAMP; 1631 rec->flags &= ~FTRACE_FL_TRAMP;
1629 1632
1630 /* remove trampolines from any ops for this rec */
1631 ftrace_clear_tramps(rec);
1632 }
1633
1634 /* 1633 /*
1635 * If any ops wants regs saved for this function 1634 * If any ops wants regs saved for this function
1636 * then all ops will get saved regs. 1635 * then all ops will get saved regs.
@@ -1642,9 +1641,6 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1642 return; 1641 return;
1643 rec->flags--; 1642 rec->flags--;
1644 1643
1645 if (ops->trampoline && !ftrace_rec_count(rec))
1646 ftrace_remove_tramp(ops, rec);
1647
1648 /* 1644 /*
1649 * If the rec had REGS enabled and the ops that is 1645 * If the rec had REGS enabled and the ops that is
1650 * being removed had REGS set, then see if there is 1646 * being removed had REGS set, then see if there is
@@ -1659,6 +1655,17 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1659 } 1655 }
1660 1656
1661 /* 1657 /*
1658 * If the rec had TRAMP enabled, then it needs to
1659 * be cleared. As TRAMP can only be enabled iff
1660 * there is only a single ops attached to it.
 1661	 * In other words, always disable it on decrementing.
1662 * In the future, we may set it if rec count is
1663 * decremented to one, and the ops that is left
1664 * has a trampoline.
1665 */
1666 rec->flags &= ~FTRACE_FL_TRAMP;
1667
1668 /*
1662 * flags will be cleared in ftrace_check_record() 1669 * flags will be cleared in ftrace_check_record()
1663 * if rec count is zero. 1670 * if rec count is zero.
1664 */ 1671 */
@@ -1682,6 +1689,41 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1682 __ftrace_hash_rec_update(ops, filter_hash, 1); 1689 __ftrace_hash_rec_update(ops, filter_hash, 1);
1683} 1690}
1684 1691
1692static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops,
1693 int filter_hash, int inc)
1694{
1695 struct ftrace_ops *op;
1696
1697 __ftrace_hash_rec_update(ops, filter_hash, inc);
1698
1699 if (ops->func_hash != &global_ops.local_hash)
1700 return;
1701
1702 /*
1703 * If the ops shares the global_ops hash, then we need to update
1704 * all ops that are enabled and use this hash.
1705 */
1706 do_for_each_ftrace_op(op, ftrace_ops_list) {
1707 /* Already done */
1708 if (op == ops)
1709 continue;
1710 if (op->func_hash == &global_ops.local_hash)
1711 __ftrace_hash_rec_update(op, filter_hash, inc);
1712 } while_for_each_ftrace_op(op);
1713}
1714
1715static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops,
1716 int filter_hash)
1717{
1718 ftrace_hash_rec_update_modify(ops, filter_hash, 0);
1719}
1720
1721static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
1722 int filter_hash)
1723{
1724 ftrace_hash_rec_update_modify(ops, filter_hash, 1);
1725}
1726
1685static void print_ip_ins(const char *fmt, unsigned char *p) 1727static void print_ip_ins(const char *fmt, unsigned char *p)
1686{ 1728{
1687 int i; 1729 int i;
@@ -1842,21 +1884,86 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1842} 1884}
1843 1885
1844static struct ftrace_ops * 1886static struct ftrace_ops *
1887ftrace_find_tramp_ops_any(struct dyn_ftrace *rec)
1888{
1889 struct ftrace_ops *op;
1890 unsigned long ip = rec->ip;
1891
1892 do_for_each_ftrace_op(op, ftrace_ops_list) {
1893
1894 if (!op->trampoline)
1895 continue;
1896
1897 if (hash_contains_ip(ip, op->func_hash))
1898 return op;
1899 } while_for_each_ftrace_op(op);
1900
1901 return NULL;
1902}
1903
1904static struct ftrace_ops *
1845ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) 1905ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
1846{ 1906{
1847 struct ftrace_ops *op; 1907 struct ftrace_ops *op;
1908 unsigned long ip = rec->ip;
1848 1909
1849 /* Removed ops need to be tested first */ 1910 /*
1850 if (removed_ops && removed_ops->tramp_hash) { 1911 * Need to check removed ops first.
1851 if (ftrace_lookup_ip(removed_ops->tramp_hash, rec->ip)) 1912 * If they are being removed, and this rec has a tramp,
1913 * and this rec is in the ops list, then it would be the
1914 * one with the tramp.
1915 */
1916 if (removed_ops) {
1917 if (hash_contains_ip(ip, &removed_ops->old_hash))
1852 return removed_ops; 1918 return removed_ops;
1853 } 1919 }
1854 1920
1921 /*
1922 * Need to find the current trampoline for a rec.
1923 * Now, a trampoline is only attached to a rec if there
1924 * was a single 'ops' attached to it. But this can be called
1925 * when we are adding another op to the rec or removing the
1926 * current one. Thus, if the op is being added, we can
1927 * ignore it because it hasn't attached itself to the rec
1928 * yet.
1929 *
1930 * If an ops is being modified (hooking to different functions)
1931 * then we don't care about the new functions that are being
1932 * added, just the old ones (that are probably being removed).
1933 *
1934 * If we are adding an ops to a function that already is using
1935 * a trampoline, it needs to be removed (trampolines are only
1936 * for single ops connected), then an ops that is not being
1937 * modified also needs to be checked.
1938 */
1855 do_for_each_ftrace_op(op, ftrace_ops_list) { 1939 do_for_each_ftrace_op(op, ftrace_ops_list) {
1856 if (!op->tramp_hash) 1940
1941 if (!op->trampoline)
1857 continue; 1942 continue;
1858 1943
1859 if (ftrace_lookup_ip(op->tramp_hash, rec->ip)) 1944 /*
1945 * If the ops is being added, it hasn't gotten to
1946 * the point to be removed from this tree yet.
1947 */
1948 if (op->flags & FTRACE_OPS_FL_ADDING)
1949 continue;
1950
1951
1952 /*
1953 * If the ops is being modified and is in the old
1954 * hash, then it is probably being removed from this
1955 * function.
1956 */
1957 if ((op->flags & FTRACE_OPS_FL_MODIFYING) &&
1958 hash_contains_ip(ip, &op->old_hash))
1959 return op;
1960 /*
1961 * If the ops is not being added or modified, and it's
1962 * in its normal filter hash, then this must be the one
1963 * we want!
1964 */
1965 if (!(op->flags & FTRACE_OPS_FL_MODIFYING) &&
1966 hash_contains_ip(ip, op->func_hash))
1860 return op; 1967 return op;
1861 1968
1862 } while_for_each_ftrace_op(op); 1969 } while_for_each_ftrace_op(op);
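
The comment block above encodes a priority order for deciding which ops currently owns a record's trampoline: a just-removed ops wins if its old hash matched, ops still being added are ignored, an ops being modified is checked against its old hash, and any other ops against its current hash. A reduced sketch of that order, with the flags and hash lookups collapsed to booleans (the names are illustrative, not the real ftrace types):

#include <stdbool.h>
#include <stdio.h>

struct toy_op {
    const char *name;
    bool has_trampoline;
    bool adding;        /* like FTRACE_OPS_FL_ADDING */
    bool modifying;     /* like FTRACE_OPS_FL_MODIFYING */
    bool old_hash_hit;  /* ip found in the old hash */
    bool cur_hash_hit;  /* ip found in the current hash */
};

static const struct toy_op *find_tramp_owner(const struct toy_op *removed,
                                             const struct toy_op *ops, int n)
{
    if (removed && removed->old_hash_hit)
        return removed;

    for (int i = 0; i < n; i++) {
        const struct toy_op *op = &ops[i];

        if (!op->has_trampoline || op->adding)
            continue;
        if (op->modifying && op->old_hash_hit)
            return op;
        if (!op->modifying && op->cur_hash_hit)
            return op;
    }
    return NULL;
}

int main(void)
{
    struct toy_op ops[] = {
        { "being_added", true, true,  false, false, true },
        { "stable",      true, false, false, false, true },
    };
    const struct toy_op *owner = find_tramp_owner(NULL, ops, 2);

    printf("owner: %s\n", owner ? owner->name : "none");
    return 0;
}
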
@@ -1868,10 +1975,11 @@ static struct ftrace_ops *
1868ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) 1975ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
1869{ 1976{
1870 struct ftrace_ops *op; 1977 struct ftrace_ops *op;
1978 unsigned long ip = rec->ip;
1871 1979
1872 do_for_each_ftrace_op(op, ftrace_ops_list) { 1980 do_for_each_ftrace_op(op, ftrace_ops_list) {
1873 /* pass rec in as regs to have non-NULL val */ 1981 /* pass rec in as regs to have non-NULL val */
1874 if (ftrace_ops_test(op, rec->ip, rec)) 1982 if (hash_contains_ip(ip, op->func_hash))
1875 return op; 1983 return op;
1876 } while_for_each_ftrace_op(op); 1984 } while_for_each_ftrace_op(op);
1877 1985
@@ -1896,8 +2004,8 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
1896 if (rec->flags & FTRACE_FL_TRAMP) { 2004 if (rec->flags & FTRACE_FL_TRAMP) {
1897 ops = ftrace_find_tramp_ops_new(rec); 2005 ops = ftrace_find_tramp_ops_new(rec);
1898 if (FTRACE_WARN_ON(!ops || !ops->trampoline)) { 2006 if (FTRACE_WARN_ON(!ops || !ops->trampoline)) {
1899 pr_warning("Bad trampoline accounting at: %p (%pS)\n", 2007 pr_warn("Bad trampoline accounting at: %p (%pS) (%lx)\n",
1900 (void *)rec->ip, (void *)rec->ip); 2008 (void *)rec->ip, (void *)rec->ip, rec->flags);
1901 /* Ftrace is shutting down, return anything */ 2009 /* Ftrace is shutting down, return anything */
1902 return (unsigned long)FTRACE_ADDR; 2010 return (unsigned long)FTRACE_ADDR;
1903 } 2011 }
@@ -1964,7 +2072,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1964 return ftrace_make_call(rec, ftrace_addr); 2072 return ftrace_make_call(rec, ftrace_addr);
1965 2073
1966 case FTRACE_UPDATE_MAKE_NOP: 2074 case FTRACE_UPDATE_MAKE_NOP:
1967 return ftrace_make_nop(NULL, rec, ftrace_addr); 2075 return ftrace_make_nop(NULL, rec, ftrace_old_addr);
1968 2076
1969 case FTRACE_UPDATE_MODIFY_CALL: 2077 case FTRACE_UPDATE_MODIFY_CALL:
1970 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); 2078 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
@@ -2178,89 +2286,6 @@ void __weak arch_ftrace_update_code(int command)
2178 ftrace_run_stop_machine(command); 2286 ftrace_run_stop_machine(command);
2179} 2287}
2180 2288
2181static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops)
2182{
2183 struct ftrace_page *pg;
2184 struct dyn_ftrace *rec;
2185 int size, bits;
2186 int ret;
2187
2188 size = ops->nr_trampolines;
2189 bits = 0;
2190 /*
2191 * Make the hash size about 1/2 the # found
2192 */
2193 for (size /= 2; size; size >>= 1)
2194 bits++;
2195
2196 ops->tramp_hash = alloc_ftrace_hash(bits);
2197 /*
2198 * TODO: a failed allocation is going to screw up
2199 * the accounting of what needs to be modified
2200 * and not. For now, we kill ftrace if we fail
2201 * to allocate here. But there are ways around this,
2202 * but that will take a little more work.
2203 */
2204 if (!ops->tramp_hash)
2205 return -ENOMEM;
2206
2207 do_for_each_ftrace_rec(pg, rec) {
2208 if (ftrace_rec_count(rec) == 1 &&
2209 ftrace_ops_test(ops, rec->ip, rec)) {
2210
2211 /*
2212 * If another ops adds to a rec, the rec will
2213 * lose its trampoline and never get it back
2214 * until all ops are off of it.
2215 */
2216 if (!(rec->flags & FTRACE_FL_TRAMP))
2217 continue;
2218
2219 /* This record had better have a trampoline */
2220 if (FTRACE_WARN_ON(!(rec->flags & FTRACE_FL_TRAMP_EN)))
2221 return -1;
2222
2223 ret = add_hash_entry(ops->tramp_hash, rec->ip);
2224 if (ret < 0)
2225 return ret;
2226 }
2227 } while_for_each_ftrace_rec();
2228
2229 /* The number of recs in the hash must match nr_trampolines */
2230 FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines);
2231
2232 return 0;
2233}
2234
2235static int ftrace_save_tramp_hashes(void)
2236{
2237 struct ftrace_ops *op;
2238 int ret;
2239
2240 /*
2241 * Now that any trampoline is being used, we need to save the
2242 * hashes for the ops that have them. This allows the mapping
2243 * back from the record to the ops that has the trampoline to
2244 * know what code is being replaced. Modifying code must always
2245 * verify what it is changing.
2246 */
2247 do_for_each_ftrace_op(op, ftrace_ops_list) {
2248
2249 /* The tramp_hash is recreated each time. */
2250 free_ftrace_hash(op->tramp_hash);
2251 op->tramp_hash = NULL;
2252
2253 if (op->nr_trampolines) {
2254 ret = ftrace_save_ops_tramp_hash(op);
2255 if (ret)
2256 return ret;
2257 }
2258
2259 } while_for_each_ftrace_op(op);
2260
2261 return 0;
2262}
2263
2264static void ftrace_run_update_code(int command) 2289static void ftrace_run_update_code(int command)
2265{ 2290{
2266 int ret; 2291 int ret;
@@ -2280,9 +2305,16 @@ static void ftrace_run_update_code(int command)
2280 2305
2281 ret = ftrace_arch_code_modify_post_process(); 2306 ret = ftrace_arch_code_modify_post_process();
2282 FTRACE_WARN_ON(ret); 2307 FTRACE_WARN_ON(ret);
2308}
2283 2309
2284 ret = ftrace_save_tramp_hashes(); 2310static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
2285 FTRACE_WARN_ON(ret); 2311 struct ftrace_hash *old_hash)
2312{
2313 ops->flags |= FTRACE_OPS_FL_MODIFYING;
2314 ops->old_hash.filter_hash = old_hash;
2315 ftrace_run_update_code(command);
2316 ops->old_hash.filter_hash = NULL;
2317 ops->flags &= ~FTRACE_OPS_FL_MODIFYING;
2286} 2318}
2287 2319
2288static ftrace_func_t saved_ftrace_func; 2320static ftrace_func_t saved_ftrace_func;
@@ -2306,6 +2338,13 @@ static void ftrace_startup_enable(int command)
2306 ftrace_run_update_code(command); 2338 ftrace_run_update_code(command);
2307} 2339}
2308 2340
2341static void ftrace_startup_all(int command)
2342{
2343 update_all_ops = true;
2344 ftrace_startup_enable(command);
2345 update_all_ops = false;
2346}
2347
2309static int ftrace_startup(struct ftrace_ops *ops, int command) 2348static int ftrace_startup(struct ftrace_ops *ops, int command)
2310{ 2349{
2311 int ret; 2350 int ret;
@@ -2320,12 +2359,22 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
2320 ftrace_start_up++; 2359 ftrace_start_up++;
2321 command |= FTRACE_UPDATE_CALLS; 2360 command |= FTRACE_UPDATE_CALLS;
2322 2361
2323 ops->flags |= FTRACE_OPS_FL_ENABLED; 2362 /*
2363 * Note that ftrace probes uses this to start up
2364 * and modify functions it will probe. But we still
2365 * set the ADDING flag for modification, as probes
2366 * do not have trampolines. If they add them in the
2367 * future, then the probes will need to distinguish
2368 * between adding and updating probes.
2369 */
2370 ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING;
2324 2371
2325 ftrace_hash_rec_enable(ops, 1); 2372 ftrace_hash_rec_enable(ops, 1);
2326 2373
2327 ftrace_startup_enable(command); 2374 ftrace_startup_enable(command);
2328 2375
2376 ops->flags &= ~FTRACE_OPS_FL_ADDING;
2377
2329 return 0; 2378 return 0;
2330} 2379}
2331 2380
@@ -2375,11 +2424,35 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2375 * If the ops uses a trampoline, then it needs to be 2424 * If the ops uses a trampoline, then it needs to be
2376 * tested first on update. 2425 * tested first on update.
2377 */ 2426 */
2427 ops->flags |= FTRACE_OPS_FL_REMOVING;
2378 removed_ops = ops; 2428 removed_ops = ops;
2379 2429
2430 /* The trampoline logic checks the old hashes */
2431 ops->old_hash.filter_hash = ops->func_hash->filter_hash;
2432 ops->old_hash.notrace_hash = ops->func_hash->notrace_hash;
2433
2380 ftrace_run_update_code(command); 2434 ftrace_run_update_code(command);
2381 2435
2436 /*
2437 * If there's no more ops registered with ftrace, run a
2438 * sanity check to make sure all rec flags are cleared.
2439 */
2440 if (ftrace_ops_list == &ftrace_list_end) {
2441 struct ftrace_page *pg;
2442 struct dyn_ftrace *rec;
2443
2444 do_for_each_ftrace_rec(pg, rec) {
2445 if (FTRACE_WARN_ON_ONCE(rec->flags))
2446 pr_warn(" %pS flags:%lx\n",
2447 (void *)rec->ip, rec->flags);
2448 } while_for_each_ftrace_rec();
2449 }
2450
2451 ops->old_hash.filter_hash = NULL;
2452 ops->old_hash.notrace_hash = NULL;
2453
2382 removed_ops = NULL; 2454 removed_ops = NULL;
2455 ops->flags &= ~FTRACE_OPS_FL_REMOVING;
2383 2456
2384 /* 2457 /*
2385 * Dynamic ops may be freed, we must make sure that all 2458 * Dynamic ops may be freed, we must make sure that all
@@ -2436,8 +2509,8 @@ static inline int ops_traces_mod(struct ftrace_ops *ops)
2436 * Filter_hash being empty will default to trace module. 2509 * Filter_hash being empty will default to trace module.
2437 * But notrace hash requires a test of individual module functions. 2510 * But notrace hash requires a test of individual module functions.
2438 */ 2511 */
2439 return ftrace_hash_empty(ops->filter_hash) && 2512 return ftrace_hash_empty(ops->func_hash->filter_hash) &&
2440 ftrace_hash_empty(ops->notrace_hash); 2513 ftrace_hash_empty(ops->func_hash->notrace_hash);
2441} 2514}
2442 2515
2443/* 2516/*
@@ -2459,12 +2532,12 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
2459 return 0; 2532 return 0;
2460 2533
2461 /* The function must be in the filter */ 2534 /* The function must be in the filter */
2462 if (!ftrace_hash_empty(ops->filter_hash) && 2535 if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
2463 !ftrace_lookup_ip(ops->filter_hash, rec->ip)) 2536 !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
2464 return 0; 2537 return 0;
2465 2538
2466 /* If in notrace hash, we ignore it too */ 2539 /* If in notrace hash, we ignore it too */
2467 if (ftrace_lookup_ip(ops->notrace_hash, rec->ip)) 2540 if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip))
2468 return 0; 2541 return 0;
2469 2542
2470 return 1; 2543 return 1;
@@ -2785,10 +2858,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
2785 } else { 2858 } else {
2786 rec = &iter->pg->records[iter->idx++]; 2859 rec = &iter->pg->records[iter->idx++];
2787 if (((iter->flags & FTRACE_ITER_FILTER) && 2860 if (((iter->flags & FTRACE_ITER_FILTER) &&
2788 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || 2861 !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) ||
2789 2862
2790 ((iter->flags & FTRACE_ITER_NOTRACE) && 2863 ((iter->flags & FTRACE_ITER_NOTRACE) &&
2791 !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || 2864 !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) ||
2792 2865
2793 ((iter->flags & FTRACE_ITER_ENABLED) && 2866 ((iter->flags & FTRACE_ITER_ENABLED) &&
2794 !(rec->flags & FTRACE_FL_ENABLED))) { 2867 !(rec->flags & FTRACE_FL_ENABLED))) {
@@ -2837,9 +2910,9 @@ static void *t_start(struct seq_file *m, loff_t *pos)
2837 * functions are enabled. 2910 * functions are enabled.
2838 */ 2911 */
2839 if ((iter->flags & FTRACE_ITER_FILTER && 2912 if ((iter->flags & FTRACE_ITER_FILTER &&
2840 ftrace_hash_empty(ops->filter_hash)) || 2913 ftrace_hash_empty(ops->func_hash->filter_hash)) ||
2841 (iter->flags & FTRACE_ITER_NOTRACE && 2914 (iter->flags & FTRACE_ITER_NOTRACE &&
2842 ftrace_hash_empty(ops->notrace_hash))) { 2915 ftrace_hash_empty(ops->func_hash->notrace_hash))) {
2843 if (*pos > 0) 2916 if (*pos > 0)
2844 return t_hash_start(m, pos); 2917 return t_hash_start(m, pos);
2845 iter->flags |= FTRACE_ITER_PRINTALL; 2918 iter->flags |= FTRACE_ITER_PRINTALL;
@@ -2904,8 +2977,8 @@ static int t_show(struct seq_file *m, void *v)
2904 if (rec->flags & FTRACE_FL_TRAMP_EN) { 2977 if (rec->flags & FTRACE_FL_TRAMP_EN) {
2905 struct ftrace_ops *ops; 2978 struct ftrace_ops *ops;
2906 2979
2907 ops = ftrace_find_tramp_ops_curr(rec); 2980 ops = ftrace_find_tramp_ops_any(rec);
2908 if (ops && ops->trampoline) 2981 if (ops)
2909 seq_printf(m, "\ttramp: %pS", 2982 seq_printf(m, "\ttramp: %pS",
2910 (void *)ops->trampoline); 2983 (void *)ops->trampoline);
2911 else 2984 else
@@ -3001,12 +3074,12 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
3001 iter->ops = ops; 3074 iter->ops = ops;
3002 iter->flags = flag; 3075 iter->flags = flag;
3003 3076
3004 mutex_lock(&ops->regex_lock); 3077 mutex_lock(&ops->func_hash->regex_lock);
3005 3078
3006 if (flag & FTRACE_ITER_NOTRACE) 3079 if (flag & FTRACE_ITER_NOTRACE)
3007 hash = ops->notrace_hash; 3080 hash = ops->func_hash->notrace_hash;
3008 else 3081 else
3009 hash = ops->filter_hash; 3082 hash = ops->func_hash->filter_hash;
3010 3083
3011 if (file->f_mode & FMODE_WRITE) { 3084 if (file->f_mode & FMODE_WRITE) {
3012 const int size_bits = FTRACE_HASH_DEFAULT_BITS; 3085 const int size_bits = FTRACE_HASH_DEFAULT_BITS;
@@ -3041,7 +3114,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
3041 file->private_data = iter; 3114 file->private_data = iter;
3042 3115
3043 out_unlock: 3116 out_unlock:
3044 mutex_unlock(&ops->regex_lock); 3117 mutex_unlock(&ops->func_hash->regex_lock);
3045 3118
3046 return ret; 3119 return ret;
3047} 3120}
@@ -3279,12 +3352,12 @@ static struct ftrace_ops trace_probe_ops __read_mostly =
3279{ 3352{
3280 .func = function_trace_probe_call, 3353 .func = function_trace_probe_call,
3281 .flags = FTRACE_OPS_FL_INITIALIZED, 3354 .flags = FTRACE_OPS_FL_INITIALIZED,
3282 INIT_REGEX_LOCK(trace_probe_ops) 3355 INIT_OPS_HASH(trace_probe_ops)
3283}; 3356};
3284 3357
3285static int ftrace_probe_registered; 3358static int ftrace_probe_registered;
3286 3359
3287static void __enable_ftrace_function_probe(void) 3360static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash)
3288{ 3361{
3289 int ret; 3362 int ret;
3290 int i; 3363 int i;
@@ -3292,7 +3365,8 @@ static void __enable_ftrace_function_probe(void)
3292 if (ftrace_probe_registered) { 3365 if (ftrace_probe_registered) {
3293 /* still need to update the function call sites */ 3366 /* still need to update the function call sites */
3294 if (ftrace_enabled) 3367 if (ftrace_enabled)
3295 ftrace_run_update_code(FTRACE_UPDATE_CALLS); 3368 ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS,
3369 old_hash);
3296 return; 3370 return;
3297 } 3371 }
3298 3372
@@ -3342,7 +3416,8 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3342 void *data) 3416 void *data)
3343{ 3417{
3344 struct ftrace_func_probe *entry; 3418 struct ftrace_func_probe *entry;
3345 struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; 3419 struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
3420 struct ftrace_hash *old_hash = *orig_hash;
3346 struct ftrace_hash *hash; 3421 struct ftrace_hash *hash;
3347 struct ftrace_page *pg; 3422 struct ftrace_page *pg;
3348 struct dyn_ftrace *rec; 3423 struct dyn_ftrace *rec;
@@ -3359,9 +3434,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3359 if (WARN_ON(not)) 3434 if (WARN_ON(not))
3360 return -EINVAL; 3435 return -EINVAL;
3361 3436
3362 mutex_lock(&trace_probe_ops.regex_lock); 3437 mutex_lock(&trace_probe_ops.func_hash->regex_lock);
3363 3438
3364 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); 3439 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
3365 if (!hash) { 3440 if (!hash) {
3366 count = -ENOMEM; 3441 count = -ENOMEM;
3367 goto out; 3442 goto out;
@@ -3420,15 +3495,18 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3420 } while_for_each_ftrace_rec(); 3495 } while_for_each_ftrace_rec();
3421 3496
3422 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); 3497 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3423 if (ret < 0)
3424 count = ret;
3425 3498
3426 __enable_ftrace_function_probe(); 3499 __enable_ftrace_function_probe(old_hash);
3500
3501 if (!ret)
3502 free_ftrace_hash_rcu(old_hash);
3503 else
3504 count = ret;
3427 3505
3428 out_unlock: 3506 out_unlock:
3429 mutex_unlock(&ftrace_lock); 3507 mutex_unlock(&ftrace_lock);
3430 out: 3508 out:
3431 mutex_unlock(&trace_probe_ops.regex_lock); 3509 mutex_unlock(&trace_probe_ops.func_hash->regex_lock);
3432 free_ftrace_hash(hash); 3510 free_ftrace_hash(hash);
3433 3511
3434 return count; 3512 return count;
@@ -3446,7 +3524,8 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3446 struct ftrace_func_entry *rec_entry; 3524 struct ftrace_func_entry *rec_entry;
3447 struct ftrace_func_probe *entry; 3525 struct ftrace_func_probe *entry;
3448 struct ftrace_func_probe *p; 3526 struct ftrace_func_probe *p;
3449 struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; 3527 struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
3528 struct ftrace_hash *old_hash = *orig_hash;
3450 struct list_head free_list; 3529 struct list_head free_list;
3451 struct ftrace_hash *hash; 3530 struct ftrace_hash *hash;
3452 struct hlist_node *tmp; 3531 struct hlist_node *tmp;
@@ -3454,6 +3533,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3454 int type = MATCH_FULL; 3533 int type = MATCH_FULL;
3455 int i, len = 0; 3534 int i, len = 0;
3456 char *search; 3535 char *search;
3536 int ret;
3457 3537
3458 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) 3538 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
3459 glob = NULL; 3539 glob = NULL;
@@ -3468,7 +3548,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3468 return; 3548 return;
3469 } 3549 }
3470 3550
3471 mutex_lock(&trace_probe_ops.regex_lock); 3551 mutex_lock(&trace_probe_ops.func_hash->regex_lock);
3472 3552
3473 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); 3553 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
3474 if (!hash) 3554 if (!hash)
@@ -3512,8 +3592,11 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3512 * Remove after the disable is called. Otherwise, if the last 3592 * Remove after the disable is called. Otherwise, if the last
3513 * probe is removed, a null hash means *all enabled*. 3593 * probe is removed, a null hash means *all enabled*.
3514 */ 3594 */
3515 ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); 3595 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3516 synchronize_sched(); 3596 synchronize_sched();
3597 if (!ret)
3598 free_ftrace_hash_rcu(old_hash);
3599
3517 list_for_each_entry_safe(entry, p, &free_list, free_list) { 3600 list_for_each_entry_safe(entry, p, &free_list, free_list) {
3518 list_del(&entry->free_list); 3601 list_del(&entry->free_list);
3519 ftrace_free_entry(entry); 3602 ftrace_free_entry(entry);
@@ -3521,7 +3604,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3521 mutex_unlock(&ftrace_lock); 3604 mutex_unlock(&ftrace_lock);
3522 3605
3523 out_unlock: 3606 out_unlock:
3524 mutex_unlock(&trace_probe_ops.regex_lock); 3607 mutex_unlock(&trace_probe_ops.func_hash->regex_lock);
3525 free_ftrace_hash(hash); 3608 free_ftrace_hash(hash);
3526} 3609}
3527 3610
@@ -3700,10 +3783,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
3700 return add_hash_entry(hash, ip); 3783 return add_hash_entry(hash, ip);
3701} 3784}
3702 3785
3703static void ftrace_ops_update_code(struct ftrace_ops *ops) 3786static void ftrace_ops_update_code(struct ftrace_ops *ops,
3787 struct ftrace_hash *old_hash)
3704{ 3788{
3705 if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) 3789 if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled)
3706 ftrace_run_update_code(FTRACE_UPDATE_CALLS); 3790 ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
3707} 3791}
3708 3792
3709static int 3793static int
@@ -3711,18 +3795,19 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3711 unsigned long ip, int remove, int reset, int enable) 3795 unsigned long ip, int remove, int reset, int enable)
3712{ 3796{
3713 struct ftrace_hash **orig_hash; 3797 struct ftrace_hash **orig_hash;
3798 struct ftrace_hash *old_hash;
3714 struct ftrace_hash *hash; 3799 struct ftrace_hash *hash;
3715 int ret; 3800 int ret;
3716 3801
3717 if (unlikely(ftrace_disabled)) 3802 if (unlikely(ftrace_disabled))
3718 return -ENODEV; 3803 return -ENODEV;
3719 3804
3720 mutex_lock(&ops->regex_lock); 3805 mutex_lock(&ops->func_hash->regex_lock);
3721 3806
3722 if (enable) 3807 if (enable)
3723 orig_hash = &ops->filter_hash; 3808 orig_hash = &ops->func_hash->filter_hash;
3724 else 3809 else
3725 orig_hash = &ops->notrace_hash; 3810 orig_hash = &ops->func_hash->notrace_hash;
3726 3811
3727 if (reset) 3812 if (reset)
3728 hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); 3813 hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
@@ -3745,14 +3830,16 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3745 } 3830 }
3746 3831
3747 mutex_lock(&ftrace_lock); 3832 mutex_lock(&ftrace_lock);
3833 old_hash = *orig_hash;
3748 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 3834 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
3749 if (!ret) 3835 if (!ret) {
3750 ftrace_ops_update_code(ops); 3836 ftrace_ops_update_code(ops, old_hash);
3751 3837 free_ftrace_hash_rcu(old_hash);
3838 }
3752 mutex_unlock(&ftrace_lock); 3839 mutex_unlock(&ftrace_lock);
3753 3840
3754 out_regex_unlock: 3841 out_regex_unlock:
3755 mutex_unlock(&ops->regex_lock); 3842 mutex_unlock(&ops->func_hash->regex_lock);
3756 3843
3757 free_ftrace_hash(hash); 3844 free_ftrace_hash(hash);
3758 return ret; 3845 return ret;
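The old_hash bookkeeping added to ftrace_set_hash() follows the usual RCU replace-then-retire pattern: the previous hash may still be in use by code paths being traced, so the new hash is published into the ops first and the old one is only freed after a grace period via free_ftrace_hash_rcu(). A hedged, generic sketch of that pattern for a module-private pointer (struct my_cfg, cfg and cfg_lock are made-up names, not ftrace code):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/mutex.h>

struct my_cfg { int value; };

static struct my_cfg __rcu *cfg;
static DEFINE_MUTEX(cfg_lock);

/* Replace the published pointer, then retire the old copy after a grace period. */
static int cfg_replace(int value)
{
        struct my_cfg *new, *old;

        new = kmalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return -ENOMEM;
        new->value = value;

        mutex_lock(&cfg_lock);
        old = rcu_dereference_protected(cfg, lockdep_is_held(&cfg_lock));
        rcu_assign_pointer(cfg, new);   /* publish to readers */
        mutex_unlock(&cfg_lock);

        synchronize_rcu();              /* wait out pre-existing readers */
        kfree(old);                     /* role of free_ftrace_hash_rcu() here */
        return 0;
}

ftrace follows the same shape, with ftrace_hash_move() doing the publish step and free_ftrace_hash_rcu() deferring the actual free.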
@@ -3957,6 +4044,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
3957 struct seq_file *m = (struct seq_file *)file->private_data; 4044 struct seq_file *m = (struct seq_file *)file->private_data;
3958 struct ftrace_iterator *iter; 4045 struct ftrace_iterator *iter;
3959 struct ftrace_hash **orig_hash; 4046 struct ftrace_hash **orig_hash;
4047 struct ftrace_hash *old_hash;
3960 struct trace_parser *parser; 4048 struct trace_parser *parser;
3961 int filter_hash; 4049 int filter_hash;
3962 int ret; 4050 int ret;
@@ -3975,26 +4063,28 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
3975 4063
3976 trace_parser_put(parser); 4064 trace_parser_put(parser);
3977 4065
3978 mutex_lock(&iter->ops->regex_lock); 4066 mutex_lock(&iter->ops->func_hash->regex_lock);
3979 4067
3980 if (file->f_mode & FMODE_WRITE) { 4068 if (file->f_mode & FMODE_WRITE) {
3981 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); 4069 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
3982 4070
3983 if (filter_hash) 4071 if (filter_hash)
3984 orig_hash = &iter->ops->filter_hash; 4072 orig_hash = &iter->ops->func_hash->filter_hash;
3985 else 4073 else
3986 orig_hash = &iter->ops->notrace_hash; 4074 orig_hash = &iter->ops->func_hash->notrace_hash;
3987 4075
3988 mutex_lock(&ftrace_lock); 4076 mutex_lock(&ftrace_lock);
4077 old_hash = *orig_hash;
3989 ret = ftrace_hash_move(iter->ops, filter_hash, 4078 ret = ftrace_hash_move(iter->ops, filter_hash,
3990 orig_hash, iter->hash); 4079 orig_hash, iter->hash);
3991 if (!ret) 4080 if (!ret) {
3992 ftrace_ops_update_code(iter->ops); 4081 ftrace_ops_update_code(iter->ops, old_hash);
3993 4082 free_ftrace_hash_rcu(old_hash);
4083 }
3994 mutex_unlock(&ftrace_lock); 4084 mutex_unlock(&ftrace_lock);
3995 } 4085 }
3996 4086
3997 mutex_unlock(&iter->ops->regex_lock); 4087 mutex_unlock(&iter->ops->func_hash->regex_lock);
3998 free_ftrace_hash(iter->hash); 4088 free_ftrace_hash(iter->hash);
3999 kfree(iter); 4089 kfree(iter);
4000 4090
@@ -4611,7 +4701,6 @@ void __init ftrace_init(void)
4611static struct ftrace_ops global_ops = { 4701static struct ftrace_ops global_ops = {
4612 .func = ftrace_stub, 4702 .func = ftrace_stub,
4613 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, 4703 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
4614 INIT_REGEX_LOCK(global_ops)
4615}; 4704};
4616 4705
4617static int __init ftrace_nodyn_init(void) 4706static int __init ftrace_nodyn_init(void)
@@ -4623,6 +4712,7 @@ core_initcall(ftrace_nodyn_init);
4623 4712
4624static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 4713static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
4625static inline void ftrace_startup_enable(int command) { } 4714static inline void ftrace_startup_enable(int command) { }
4715static inline void ftrace_startup_all(int command) { }
4626/* Keep as macros so we do not need to define the commands */ 4716/* Keep as macros so we do not need to define the commands */
4627# define ftrace_startup(ops, command) \ 4717# define ftrace_startup(ops, command) \
4628 ({ \ 4718 ({ \
@@ -4713,7 +4803,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4713static struct ftrace_ops control_ops = { 4803static struct ftrace_ops control_ops = {
4714 .func = ftrace_ops_control_func, 4804 .func = ftrace_ops_control_func,
4715 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, 4805 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
4716 INIT_REGEX_LOCK(control_ops) 4806 INIT_OPS_HASH(control_ops)
4717}; 4807};
4718 4808
4719static inline void 4809static inline void
@@ -4772,6 +4862,56 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
4772} 4862}
4773#endif 4863#endif
4774 4864
4865/*
4866 * If there's only one function registered but it does not support
4867 * recursion, this function will be called by the mcount trampoline.
4868 * This function will handle recursion protection.
4869 */
4870static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
4871 struct ftrace_ops *op, struct pt_regs *regs)
4872{
4873 int bit;
4874
4875 bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
4876 if (bit < 0)
4877 return;
4878
4879 op->func(ip, parent_ip, op, regs);
4880
4881 trace_clear_recursion(bit);
4882}
4883
4884/**
4885 * ftrace_ops_get_func - get the function a trampoline should call
4886 * @ops: the ops to get the function for
4887 *
4888 * Normally the mcount trampoline will call the ops->func, but there
4889 * are times that it should not. For example, if the ops does not
4890 * have its own recursion protection, then it should call the
4891 * ftrace_ops_recurs_func() instead.
4892 *
4893 * Returns the function that the trampoline should call for @ops.
4894 */
4895ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
4896{
4897 /*
4898 * If this is a dynamic ops or we force list func,
4899 * then it needs to call the list anyway.
4900 */
4901 if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC)
4902 return ftrace_ops_list_func;
4903
4904 /*
4905 * If the func handles its own recursion, call it directly.
4906 * Otherwise call the recursion protected function that
4907 * will call the ftrace ops function.
4908 */
4909 if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE))
4910 return ftrace_ops_recurs_func;
4911
4912 return ops->func;
4913}
4914
4775static void clear_ftrace_swapper(void) 4915static void clear_ftrace_swapper(void)
4776{ 4916{
4777 struct task_struct *p; 4917 struct task_struct *p;
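The ftrace_ops_recurs_func()/ftrace_ops_get_func() pair added in this hunk wraps callbacks that are not flagged RECURSION_SAFE, so a callback that ends up being traced itself bails out instead of recursing without bound. A runnable userspace analogue of that guard, using a thread-local flag in place of the per-context recursion bits (guarded_call and callback are illustrative names):

#include <stdio.h>
#include <stdbool.h>

static _Thread_local bool in_callback;

/* Analogue of ftrace_ops_recurs_func(): drop re-entrant invocations. */
static void guarded_call(void (*cb)(int), int arg)
{
        if (in_callback)
                return;                 /* already inside the callback */
        in_callback = true;
        cb(arg);
        in_callback = false;
}

static void callback(int depth)
{
        printf("callback depth %d\n", depth);
        if (depth < 3)
                guarded_call(callback, depth + 1);      /* suppressed re-entry */
}

int main(void)
{
        guarded_call(callback, 0);      /* prints exactly one line */
        return 0;
}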
@@ -4872,7 +5012,8 @@ static int ftrace_pid_add(int p)
4872 set_ftrace_pid_task(pid); 5012 set_ftrace_pid_task(pid);
4873 5013
4874 ftrace_update_pid_func(); 5014 ftrace_update_pid_func();
4875 ftrace_startup_enable(0); 5015
5016 ftrace_startup_all(0);
4876 5017
4877 mutex_unlock(&ftrace_lock); 5018 mutex_unlock(&ftrace_lock);
4878 return 0; 5019 return 0;
@@ -4901,7 +5042,7 @@ static void ftrace_pid_reset(void)
4901 } 5042 }
4902 5043
4903 ftrace_update_pid_func(); 5044 ftrace_update_pid_func();
4904 ftrace_startup_enable(0); 5045 ftrace_startup_all(0);
4905 5046
4906 mutex_unlock(&ftrace_lock); 5047 mutex_unlock(&ftrace_lock);
4907} 5048}
@@ -5145,6 +5286,17 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
5145 5286
5146#ifdef CONFIG_FUNCTION_GRAPH_TRACER 5287#ifdef CONFIG_FUNCTION_GRAPH_TRACER
5147 5288
5289static struct ftrace_ops graph_ops = {
5290 .func = ftrace_stub,
5291 .flags = FTRACE_OPS_FL_RECURSION_SAFE |
5292 FTRACE_OPS_FL_INITIALIZED |
5293 FTRACE_OPS_FL_STUB,
5294#ifdef FTRACE_GRAPH_TRAMP_ADDR
5295 .trampoline = FTRACE_GRAPH_TRAMP_ADDR,
5296#endif
5297 ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
5298};
5299
5148static int ftrace_graph_active; 5300static int ftrace_graph_active;
5149 5301
5150int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) 5302int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
@@ -5307,12 +5459,28 @@ static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
5307 */ 5459 */
5308static void update_function_graph_func(void) 5460static void update_function_graph_func(void)
5309{ 5461{
5310 if (ftrace_ops_list == &ftrace_list_end || 5462 struct ftrace_ops *op;
5311 (ftrace_ops_list == &global_ops && 5463 bool do_test = false;
5312 global_ops.next == &ftrace_list_end)) 5464
5313 ftrace_graph_entry = __ftrace_graph_entry; 5465 /*
5314 else 5466 * The graph and global ops share the same set of functions
5467 * to test. If any other ops is on the list, then
5468 * the graph tracing needs to test if it's the function
5469 * it should call.
5470 */
5471 do_for_each_ftrace_op(op, ftrace_ops_list) {
5472 if (op != &global_ops && op != &graph_ops &&
5473 op != &ftrace_list_end) {
5474 do_test = true;
5475 /* in double loop, break out with goto */
5476 goto out;
5477 }
5478 } while_for_each_ftrace_op(op);
5479 out:
5480 if (do_test)
5315 ftrace_graph_entry = ftrace_graph_entry_test; 5481 ftrace_graph_entry = ftrace_graph_entry_test;
5482 else
5483 ftrace_graph_entry = __ftrace_graph_entry;
5316} 5484}
5317 5485
5318static struct notifier_block ftrace_suspend_notifier = { 5486static struct notifier_block ftrace_suspend_notifier = {
@@ -5353,16 +5521,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5353 ftrace_graph_entry = ftrace_graph_entry_test; 5521 ftrace_graph_entry = ftrace_graph_entry_test;
5354 update_function_graph_func(); 5522 update_function_graph_func();
5355 5523
5356 /* Function graph doesn't use the .func field of global_ops */ 5524 ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
5357 global_ops.flags |= FTRACE_OPS_FL_STUB;
5358
5359#ifdef CONFIG_DYNAMIC_FTRACE
5360 /* Optimize function graph calling (if implemented by arch) */
5361 if (FTRACE_GRAPH_TRAMP_ADDR != 0)
5362 global_ops.trampoline = FTRACE_GRAPH_TRAMP_ADDR;
5363#endif
5364
5365 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
5366 5525
5367out: 5526out:
5368 mutex_unlock(&ftrace_lock); 5527 mutex_unlock(&ftrace_lock);
@@ -5380,12 +5539,7 @@ void unregister_ftrace_graph(void)
5380 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 5539 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
5381 ftrace_graph_entry = ftrace_graph_entry_stub; 5540 ftrace_graph_entry = ftrace_graph_entry_stub;
5382 __ftrace_graph_entry = ftrace_graph_entry_stub; 5541 __ftrace_graph_entry = ftrace_graph_entry_stub;
5383 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); 5542 ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET);
5384 global_ops.flags &= ~FTRACE_OPS_FL_STUB;
5385#ifdef CONFIG_DYNAMIC_FTRACE
5386 if (FTRACE_GRAPH_TRAMP_ADDR != 0)
5387 global_ops.trampoline = 0;
5388#endif
5389 unregister_pm_notifier(&ftrace_suspend_notifier); 5543 unregister_pm_notifier(&ftrace_suspend_notifier);
5390 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 5544 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
5391 5545
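With this change the function graph tracer registers its own graph_ops through ftrace_startup()/ftrace_shutdown() instead of temporarily mutating global_ops. For code outside the tracer the equivalent public entry points are register_ftrace_function()/unregister_ftrace_function(); a hedged module-style sketch using the callback signature of kernels from this era (my_ops, my_callback and the module hooks are placeholders, not part of this patch):

#include <linux/ftrace.h>
#include <linux/module.h>

static void my_callback(unsigned long ip, unsigned long parent_ip,
                        struct ftrace_ops *op, struct pt_regs *regs)
{
        /* runs for every traced function entry this ops filters on */
}

static struct ftrace_ops my_ops = {
        .func = my_callback,
        /*
         * No FTRACE_OPS_FL_RECURSION_SAFE: per ftrace_ops_get_func() above,
         * the core then routes calls through ftrace_ops_recurs_func() for us.
         */
};

static int __init my_init(void)
{
        return register_ftrace_function(&my_ops);       /* ftrace_startup() inside */
}

static void __exit my_exit(void)
{
        unregister_ftrace_function(&my_ops);            /* ftrace_shutdown() inside */
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");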
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 925f629658d6..a56e07c8d15b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -538,16 +538,18 @@ static void rb_wake_up_waiters(struct irq_work *work)
538 * ring_buffer_wait - wait for input to the ring buffer 538 * ring_buffer_wait - wait for input to the ring buffer
539 * @buffer: buffer to wait on 539 * @buffer: buffer to wait on
540 * @cpu: the cpu buffer to wait on 540 * @cpu: the cpu buffer to wait on
541 * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS
541 * 542 *
542 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon 543 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
543 * as data is added to any of the @buffer's cpu buffers. Otherwise 544 * as data is added to any of the @buffer's cpu buffers. Otherwise
544 * it will wait for data to be added to a specific cpu buffer. 545 * it will wait for data to be added to a specific cpu buffer.
545 */ 546 */
546int ring_buffer_wait(struct ring_buffer *buffer, int cpu) 547int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
547{ 548{
548 struct ring_buffer_per_cpu *cpu_buffer; 549 struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
549 DEFINE_WAIT(wait); 550 DEFINE_WAIT(wait);
550 struct rb_irq_work *work; 551 struct rb_irq_work *work;
552 int ret = 0;
551 553
552 /* 554 /*
553 * Depending on what the caller is waiting for, either any 555 * Depending on what the caller is waiting for, either any
@@ -564,36 +566,61 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
564 } 566 }
565 567
566 568
567 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); 569 while (true) {
570 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
568 571
569 /* 572 /*
570 * The events can happen in critical sections where 573 * The events can happen in critical sections where
571 * checking a work queue can cause deadlocks. 574 * checking a work queue can cause deadlocks.
572 * After adding a task to the queue, this flag is set 575 * After adding a task to the queue, this flag is set
573 * only to notify events to try to wake up the queue 576 * only to notify events to try to wake up the queue
574 * using irq_work. 577 * using irq_work.
575 * 578 *
576 * We don't clear it even if the buffer is no longer 579 * We don't clear it even if the buffer is no longer
577 * empty. The flag only causes the next event to run 580 * empty. The flag only causes the next event to run
578 * irq_work to do the work queue wake up. The worst 581 * irq_work to do the work queue wake up. The worst
579 * that can happen if we race with !trace_empty() is that 582 * that can happen if we race with !trace_empty() is that
580 * an event will cause an irq_work to try to wake up 583 * an event will cause an irq_work to try to wake up
581 * an empty queue. 584 * an empty queue.
582 * 585 *
583 * There's no reason to protect this flag either, as 586 * There's no reason to protect this flag either, as
584 * the work queue and irq_work logic will do the necessary 587 * the work queue and irq_work logic will do the necessary
585 * synchronization for the wake ups. The only thing 588 * synchronization for the wake ups. The only thing
586 * that is necessary is that the wake up happens after 589 * that is necessary is that the wake up happens after
587 * a task has been queued. It's OK for spurious wake ups. 590 * a task has been queued. It's OK for spurious wake ups.
588 */ 591 */
589 work->waiters_pending = true; 592 work->waiters_pending = true;
593
594 if (signal_pending(current)) {
595 ret = -EINTR;
596 break;
597 }
598
599 if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
600 break;
601
602 if (cpu != RING_BUFFER_ALL_CPUS &&
603 !ring_buffer_empty_cpu(buffer, cpu)) {
604 unsigned long flags;
605 bool pagebusy;
606
607 if (!full)
608 break;
609
610 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
611 pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
612 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
613
614 if (!pagebusy)
615 break;
616 }
590 617
591 if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
592 (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
593 schedule(); 618 schedule();
619 }
594 620
595 finish_wait(&work->waiters, &wait); 621 finish_wait(&work->waiters, &wait);
596 return 0; 622
623 return ret;
597} 624}
598 625
599/** 626/**
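ring_buffer_wait() is now an open-coded wait loop: it re-queues itself with prepare_to_wait() before every test, honours pending signals by returning -EINTR, and with full set only breaks out once the reader page is no longer the commit page. The generic shape of that loop, as a hedged kernel-style sketch (wq and condition are placeholders; the real function additionally sets work->waiters_pending and takes reader_lock for the full-page test):

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(wq);

/* 'condition' stands in for whatever state the caller is waiting on. */
static int wait_for_condition(bool (*condition)(void))
{
        DEFINE_WAIT(wait);
        int ret = 0;

        for (;;) {
                /*
                 * Queue ourselves *before* testing, so a wake-up that races
                 * with the test is not lost between it and schedule().
                 */
                prepare_to_wait(&wq, &wait, TASK_INTERRUPTIBLE);

                if (signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }
                if (condition())
                        break;

                schedule();
        }
        finish_wait(&wq, &wait);
        return ret;
}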
@@ -626,8 +653,22 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
626 work = &cpu_buffer->irq_work; 653 work = &cpu_buffer->irq_work;
627 } 654 }
628 655
629 work->waiters_pending = true;
630 poll_wait(filp, &work->waiters, poll_table); 656 poll_wait(filp, &work->waiters, poll_table);
657 work->waiters_pending = true;
658 /*
659 * There's a tight race between setting the waiters_pending and
660 * checking if the ring buffer is empty. Once the waiters_pending bit
661 * is set, the next event will wake the task up, but we can get stuck
662 * if there's only a single event in.
663 *
664 * FIXME: Ideally, we need a memory barrier on the writer side as well,
665 * but adding a memory barrier to all events will cause too much of a
666 * performance hit in the fast path. We only need a memory barrier when
667 * the buffer goes from empty to having content. But as this race is
668 * extremely small, and it's not a problem if another event comes in, we
669 * will fix it later.
670 */
671 smp_mb();
631 672
632 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || 673 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
633 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) 674 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
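Moving the waiters_pending store after poll_wait() but before the emptiness test, with an smp_mb() in between, is the classic store-my-flag / full-barrier / read-the-other-side handshake; the FIXME notes that the writer side deliberately omits the matching barrier to keep the fast path cheap. A compilable C11 sketch of the waiter-side ordering requirement (both function names and should_wake() as shown are illustrative):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool waiters_pending;
static atomic_int  nr_events;

/* Waiter side: publish intent first, then check state. */
static bool should_sleep(void)
{
        atomic_store_explicit(&waiters_pending, true, memory_order_relaxed);
        /* Role of smp_mb(): order the store above before the load below. */
        atomic_thread_fence(memory_order_seq_cst);
        return atomic_load_explicit(&nr_events, memory_order_relaxed) == 0;
}

/* Writer side (idealised): add the event, then decide whether to wake. */
static bool should_wake(void)
{
        atomic_fetch_add_explicit(&nr_events, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);
        return atomic_load_explicit(&waiters_pending, memory_order_relaxed);
}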
@@ -1968,7 +2009,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1968 2009
1969/** 2010/**
1970 * rb_update_event - update event type and data 2011 * rb_update_event - update event type and data
1971 * @event: the even to update 2012 * @event: the event to update
1972 * @type: the type of event 2013 * @type: the type of event
1973 * @length: the size of the event field in the ring buffer 2014 * @length: the size of the event field in the ring buffer
1974 * 2015 *
@@ -3341,21 +3382,16 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
3341 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3382 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3342 3383
3343 /* Iterator usage is expected to have record disabled */ 3384 /* Iterator usage is expected to have record disabled */
3344 if (list_empty(&cpu_buffer->reader_page->list)) { 3385 iter->head_page = cpu_buffer->reader_page;
3345 iter->head_page = rb_set_head_page(cpu_buffer); 3386 iter->head = cpu_buffer->reader_page->read;
3346 if (unlikely(!iter->head_page)) 3387
3347 return; 3388 iter->cache_reader_page = iter->head_page;
3348 iter->head = iter->head_page->read; 3389 iter->cache_read = cpu_buffer->read;
3349 } else { 3390
3350 iter->head_page = cpu_buffer->reader_page;
3351 iter->head = cpu_buffer->reader_page->read;
3352 }
3353 if (iter->head) 3391 if (iter->head)
3354 iter->read_stamp = cpu_buffer->read_stamp; 3392 iter->read_stamp = cpu_buffer->read_stamp;
3355 else 3393 else
3356 iter->read_stamp = iter->head_page->page->time_stamp; 3394 iter->read_stamp = iter->head_page->page->time_stamp;
3357 iter->cache_reader_page = cpu_buffer->reader_page;
3358 iter->cache_read = cpu_buffer->read;
3359} 3395}
3360 3396
3361/** 3397/**
@@ -3748,12 +3784,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3748 return NULL; 3784 return NULL;
3749 3785
3750 /* 3786 /*
3751 * We repeat when a time extend is encountered. 3787 * We repeat when a time extend is encountered or we hit
3752 * Since the time extend is always attached to a data event, 3788 * the end of the page. Since the time extend is always attached
3753 * we should never loop more than once. 3789 * to a data event, we should never loop more than three times.
3754 * (We never hit the following condition more than twice). 3790 * Once for going to next page, once on time extend, and
3791 * finally once to get the event.
3792 * (We never hit the following condition more than thrice).
3755 */ 3793 */
3756 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) 3794 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3))
3757 return NULL; 3795 return NULL;
3758 3796
3759 if (rb_per_cpu_empty(cpu_buffer)) 3797 if (rb_per_cpu_empty(cpu_buffer))
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 0434ff1b808e..3f9e328c30b5 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -205,7 +205,6 @@ static void ring_buffer_consumer(void)
205 break; 205 break;
206 206
207 schedule(); 207 schedule();
208 __set_current_state(TASK_RUNNING);
209 } 208 }
210 reader_finish = 0; 209 reader_finish = 0;
211 complete(&read_done); 210 complete(&read_done);
@@ -379,7 +378,6 @@ static int ring_buffer_consumer_thread(void *arg)
379 break; 378 break;
380 379
381 schedule(); 380 schedule();
382 __set_current_state(TASK_RUNNING);
383 } 381 }
384 __set_current_state(TASK_RUNNING); 382 __set_current_state(TASK_RUNNING);
385 383
@@ -407,7 +405,6 @@ static int ring_buffer_producer_thread(void *arg)
407 trace_printk("Sleeping for 10 secs\n"); 405 trace_printk("Sleeping for 10 secs\n");
408 set_current_state(TASK_INTERRUPTIBLE); 406 set_current_state(TASK_INTERRUPTIBLE);
409 schedule_timeout(HZ * SLEEP_TIME); 407 schedule_timeout(HZ * SLEEP_TIME);
410 __set_current_state(TASK_RUNNING);
411 } 408 }
412 409
413 if (kill_test) 410 if (kill_test)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f3ef80c8914c..0fa2d2070bd4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1076,13 +1076,14 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
1076} 1076}
1077#endif /* CONFIG_TRACER_MAX_TRACE */ 1077#endif /* CONFIG_TRACER_MAX_TRACE */
1078 1078
1079static int wait_on_pipe(struct trace_iterator *iter) 1079static int wait_on_pipe(struct trace_iterator *iter, bool full)
1080{ 1080{
1081 /* Iterators are static, they should be filled or empty */ 1081 /* Iterators are static, they should be filled or empty */
1082 if (trace_buffer_iter(iter, iter->cpu_file)) 1082 if (trace_buffer_iter(iter, iter->cpu_file))
1083 return 0; 1083 return 0;
1084 1084
1085 return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); 1085 return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file,
1086 full);
1086} 1087}
1087 1088
1088#ifdef CONFIG_FTRACE_STARTUP_TEST 1089#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -4434,15 +4435,12 @@ static int tracing_wait_pipe(struct file *filp)
4434 4435
4435 mutex_unlock(&iter->mutex); 4436 mutex_unlock(&iter->mutex);
4436 4437
4437 ret = wait_on_pipe(iter); 4438 ret = wait_on_pipe(iter, false);
4438 4439
4439 mutex_lock(&iter->mutex); 4440 mutex_lock(&iter->mutex);
4440 4441
4441 if (ret) 4442 if (ret)
4442 return ret; 4443 return ret;
4443
4444 if (signal_pending(current))
4445 return -EINTR;
4446 } 4444 }
4447 4445
4448 return 1; 4446 return 1;
@@ -5372,16 +5370,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5372 goto out_unlock; 5370 goto out_unlock;
5373 } 5371 }
5374 mutex_unlock(&trace_types_lock); 5372 mutex_unlock(&trace_types_lock);
5375 ret = wait_on_pipe(iter); 5373 ret = wait_on_pipe(iter, false);
5376 mutex_lock(&trace_types_lock); 5374 mutex_lock(&trace_types_lock);
5377 if (ret) { 5375 if (ret) {
5378 size = ret; 5376 size = ret;
5379 goto out_unlock; 5377 goto out_unlock;
5380 } 5378 }
5381 if (signal_pending(current)) {
5382 size = -EINTR;
5383 goto out_unlock;
5384 }
5385 goto again; 5379 goto again;
5386 } 5380 }
5387 size = 0; 5381 size = 0;
@@ -5500,7 +5494,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5500 }; 5494 };
5501 struct buffer_ref *ref; 5495 struct buffer_ref *ref;
5502 int entries, size, i; 5496 int entries, size, i;
5503 ssize_t ret; 5497 ssize_t ret = 0;
5504 5498
5505 mutex_lock(&trace_types_lock); 5499 mutex_lock(&trace_types_lock);
5506 5500
@@ -5538,13 +5532,16 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5538 int r; 5532 int r;
5539 5533
5540 ref = kzalloc(sizeof(*ref), GFP_KERNEL); 5534 ref = kzalloc(sizeof(*ref), GFP_KERNEL);
5541 if (!ref) 5535 if (!ref) {
5536 ret = -ENOMEM;
5542 break; 5537 break;
5538 }
5543 5539
5544 ref->ref = 1; 5540 ref->ref = 1;
5545 ref->buffer = iter->trace_buffer->buffer; 5541 ref->buffer = iter->trace_buffer->buffer;
5546 ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); 5542 ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
5547 if (!ref->page) { 5543 if (!ref->page) {
5544 ret = -ENOMEM;
5548 kfree(ref); 5545 kfree(ref);
5549 break; 5546 break;
5550 } 5547 }
@@ -5582,19 +5579,19 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5582 5579
5583 /* did we read anything? */ 5580 /* did we read anything? */
5584 if (!spd.nr_pages) { 5581 if (!spd.nr_pages) {
5582 if (ret)
5583 goto out;
5584
5585 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { 5585 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
5586 ret = -EAGAIN; 5586 ret = -EAGAIN;
5587 goto out; 5587 goto out;
5588 } 5588 }
5589 mutex_unlock(&trace_types_lock); 5589 mutex_unlock(&trace_types_lock);
5590 ret = wait_on_pipe(iter); 5590 ret = wait_on_pipe(iter, true);
5591 mutex_lock(&trace_types_lock); 5591 mutex_lock(&trace_types_lock);
5592 if (ret) 5592 if (ret)
5593 goto out; 5593 goto out;
5594 if (signal_pending(current)) { 5594
5595 ret = -EINTR;
5596 goto out;
5597 }
5598 goto again; 5595 goto again;
5599 } 5596 }
5600 5597
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ef06ce7e9cf8..0cc51edde3a8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2513,8 +2513,11 @@ static __init int event_test_thread(void *unused)
2513 kfree(test_malloc); 2513 kfree(test_malloc);
2514 2514
2515 set_current_state(TASK_INTERRUPTIBLE); 2515 set_current_state(TASK_INTERRUPTIBLE);
2516 while (!kthread_should_stop()) 2516 while (!kthread_should_stop()) {
2517 schedule(); 2517 schedule();
2518 set_current_state(TASK_INTERRUPTIBLE);
2519 }
2520 __set_current_state(TASK_RUNNING);
2518 2521
2519 return 0; 2522 return 0;
2520} 2523}
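The event_test_thread fix above is the canonical idle-kthread wait loop: the task state must be re-armed to TASK_INTERRUPTIBLE after every schedule() (otherwise a single spurious wakeup turns the loop into a busy spin), and TASK_RUNNING must be restored on exit. The same rule explains the __set_current_state(TASK_RUNNING) removals in ring_buffer_benchmark.c above. As a generic, hedged sketch (my_thread is a placeholder):

#include <linux/kthread.h>
#include <linux/sched.h>

static int my_thread(void *data)
{
        set_current_state(TASK_INTERRUPTIBLE);
        while (!kthread_should_stop()) {
                schedule();                             /* sleep until woken */
                set_current_state(TASK_INTERRUPTIBLE);  /* re-arm before re-checking */
        }
        __set_current_state(TASK_RUNNING);              /* never return while sleeping */
        return 0;
}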
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 5ef60499dc8e..b0f86ea77881 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -382,6 +382,8 @@ static int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
382 382
383 /* check the trace buffer */ 383 /* check the trace buffer */
384 ret = trace_test_buffer(&tr->trace_buffer, &count); 384 ret = trace_test_buffer(&tr->trace_buffer, &count);
385
386 ftrace_enabled = 1;
385 tracing_start(); 387 tracing_start();
386 388
387 /* we should only have one item */ 389 /* we should only have one item */
@@ -679,6 +681,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
679 681
680 /* check the trace buffer */ 682 /* check the trace buffer */
681 ret = trace_test_buffer(&tr->trace_buffer, &count); 683 ret = trace_test_buffer(&tr->trace_buffer, &count);
684
685 ftrace_enabled = 1;
682 trace->reset(tr); 686 trace->reset(tr);
683 tracing_start(); 687 tracing_start();
684 688
@@ -1025,6 +1029,12 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
1025#endif 1029#endif
1026 1030
1027#ifdef CONFIG_SCHED_TRACER 1031#ifdef CONFIG_SCHED_TRACER
1032
1033struct wakeup_test_data {
1034 struct completion is_ready;
1035 int go;
1036};
1037
1028static int trace_wakeup_test_thread(void *data) 1038static int trace_wakeup_test_thread(void *data)
1029{ 1039{
1030 /* Make this a -deadline thread */ 1040 /* Make this a -deadline thread */
@@ -1034,51 +1044,56 @@ static int trace_wakeup_test_thread(void *data)
1034 .sched_deadline = 10000000ULL, 1044 .sched_deadline = 10000000ULL,
1035 .sched_period = 10000000ULL 1045 .sched_period = 10000000ULL
1036 }; 1046 };
1037 struct completion *x = data; 1047 struct wakeup_test_data *x = data;
1038 1048
1039 sched_setattr(current, &attr); 1049 sched_setattr(current, &attr);
1040 1050
1041 /* Make it know we have a new prio */ 1051 /* Make it know we have a new prio */
1042 complete(x); 1052 complete(&x->is_ready);
1043 1053
1044 /* now go to sleep and let the test wake us up */ 1054 /* now go to sleep and let the test wake us up */
1045 set_current_state(TASK_INTERRUPTIBLE); 1055 set_current_state(TASK_INTERRUPTIBLE);
1046 schedule(); 1056 while (!x->go) {
1057 schedule();
1058 set_current_state(TASK_INTERRUPTIBLE);
1059 }
1047 1060
1048 complete(x); 1061 complete(&x->is_ready);
1062
1063 set_current_state(TASK_INTERRUPTIBLE);
1049 1064
1050 /* we are awake, now wait to disappear */ 1065 /* we are awake, now wait to disappear */
1051 while (!kthread_should_stop()) { 1066 while (!kthread_should_stop()) {
1052 /* 1067 schedule();
1053 * This will likely be the system top priority 1068 set_current_state(TASK_INTERRUPTIBLE);
1054 * task, do short sleeps to let others run.
1055 */
1056 msleep(100);
1057 } 1069 }
1058 1070
1071 __set_current_state(TASK_RUNNING);
1072
1059 return 0; 1073 return 0;
1060} 1074}
1061
1062int 1075int
1063trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) 1076trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1064{ 1077{
1065 unsigned long save_max = tr->max_latency; 1078 unsigned long save_max = tr->max_latency;
1066 struct task_struct *p; 1079 struct task_struct *p;
1067 struct completion is_ready; 1080 struct wakeup_test_data data;
1068 unsigned long count; 1081 unsigned long count;
1069 int ret; 1082 int ret;
1070 1083
1071 init_completion(&is_ready); 1084 memset(&data, 0, sizeof(data));
1085
1086 init_completion(&data.is_ready);
1072 1087
1073 /* create a -deadline thread */ 1088 /* create a -deadline thread */
1074 p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test"); 1089 p = kthread_run(trace_wakeup_test_thread, &data, "ftrace-test");
1075 if (IS_ERR(p)) { 1090 if (IS_ERR(p)) {
1076 printk(KERN_CONT "Failed to create ftrace wakeup test thread "); 1091 printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
1077 return -1; 1092 return -1;
1078 } 1093 }
1079 1094
1080 /* make sure the thread is running at -deadline policy */ 1095 /* make sure the thread is running at -deadline policy */
1081 wait_for_completion(&is_ready); 1096 wait_for_completion(&data.is_ready);
1082 1097
1083 /* start the tracing */ 1098 /* start the tracing */
1084 ret = tracer_init(trace, tr); 1099 ret = tracer_init(trace, tr);
@@ -1099,18 +1114,20 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1099 msleep(100); 1114 msleep(100);
1100 } 1115 }
1101 1116
1102 init_completion(&is_ready); 1117 init_completion(&data.is_ready);
1118
1119 data.go = 1;
1120 /* memory barrier is in the wake_up_process() */
1103 1121
1104 wake_up_process(p); 1122 wake_up_process(p);
1105 1123
1106 /* Wait for the task to wake up */ 1124 /* Wait for the task to wake up */
1107 wait_for_completion(&is_ready); 1125 wait_for_completion(&data.is_ready);
1108 1126
1109 /* stop the tracing. */ 1127 /* stop the tracing. */
1110 tracing_stop(); 1128 tracing_stop();
1111 /* check both trace buffers */ 1129 /* check both trace buffers */
1112 ret = trace_test_buffer(&tr->trace_buffer, NULL); 1130 ret = trace_test_buffer(&tr->trace_buffer, NULL);
1113 printk("ret = %d\n", ret);
1114 if (!ret) 1131 if (!ret)
1115 ret = trace_test_buffer(&tr->max_buffer, &count); 1132 ret = trace_test_buffer(&tr->max_buffer, &count);
1116 1133
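The wakeup selftest now pairs its completion with an explicit data.go flag and loops until the flag is set, so a spurious or early wakeup of the -deadline thread cannot let it run ahead of the tracer. The same rule in runnable pthread form, waiting on a predicate rather than on the bare wakeup (all names are illustrative):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static bool go;

static void wait_for_go(void)
{
        pthread_mutex_lock(&lock);
        while (!go)                       /* tolerate spurious wakeups */
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
}

static void release_waiter(void)
{
        pthread_mutex_lock(&lock);
        go = true;                        /* set the predicate first... */
        pthread_cond_signal(&cond);       /* ...then wake the waiter */
        pthread_mutex_unlock(&lock);
}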
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 8a4e5cb66a4c..16eddb308c33 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,7 +13,6 @@
13#include <linux/sysctl.h> 13#include <linux/sysctl.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/magic.h>
17 16
18#include <asm/setup.h> 17#include <asm/setup.h>
19 18
@@ -171,8 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack)
171 i++; 170 i++;
172 } 171 }
173 172
174 if ((current != &init_task && 173 if (task_stack_end_corrupted(current)) {
175 *(end_of_stack(current)) != STACK_END_MAGIC)) {
176 print_max_stack(); 174 print_max_stack();
177 BUG(); 175 BUG();
178 } 176 }
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 759d5e004517..29228c4d5696 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -313,7 +313,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
313 int size; 313 int size;
314 314
315 syscall_nr = trace_get_syscall_nr(current, regs); 315 syscall_nr = trace_get_syscall_nr(current, regs);
316 if (syscall_nr < 0) 316 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
317 return; 317 return;
318 318
319 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ 319 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
@@ -360,7 +360,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
360 int syscall_nr; 360 int syscall_nr;
361 361
362 syscall_nr = trace_get_syscall_nr(current, regs); 362 syscall_nr = trace_get_syscall_nr(current, regs);
363 if (syscall_nr < 0) 363 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
364 return; 364 return;
365 365
366 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ 366 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
@@ -425,7 +425,7 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file,
425 return; 425 return;
426 mutex_lock(&syscall_trace_lock); 426 mutex_lock(&syscall_trace_lock);
427 tr->sys_refcount_enter--; 427 tr->sys_refcount_enter--;
428 rcu_assign_pointer(tr->enter_syscall_files[num], NULL); 428 RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL);
429 if (!tr->sys_refcount_enter) 429 if (!tr->sys_refcount_enter)
430 unregister_trace_sys_enter(ftrace_syscall_enter, tr); 430 unregister_trace_sys_enter(ftrace_syscall_enter, tr);
431 mutex_unlock(&syscall_trace_lock); 431 mutex_unlock(&syscall_trace_lock);
@@ -463,7 +463,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,
463 return; 463 return;
464 mutex_lock(&syscall_trace_lock); 464 mutex_lock(&syscall_trace_lock);
465 tr->sys_refcount_exit--; 465 tr->sys_refcount_exit--;
466 rcu_assign_pointer(tr->exit_syscall_files[num], NULL); 466 RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL);
467 if (!tr->sys_refcount_exit) 467 if (!tr->sys_refcount_exit)
468 unregister_trace_sys_exit(ftrace_syscall_exit, tr); 468 unregister_trace_sys_exit(ftrace_syscall_exit, tr);
469 mutex_unlock(&syscall_trace_lock); 469 mutex_unlock(&syscall_trace_lock);
@@ -567,7 +567,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
567 int size; 567 int size;
568 568
569 syscall_nr = trace_get_syscall_nr(current, regs); 569 syscall_nr = trace_get_syscall_nr(current, regs);
570 if (syscall_nr < 0) 570 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
571 return; 571 return;
572 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) 572 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
573 return; 573 return;
@@ -641,7 +641,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
641 int size; 641 int size;
642 642
643 syscall_nr = trace_get_syscall_nr(current, regs); 643 syscall_nr = trace_get_syscall_nr(current, regs);
644 if (syscall_nr < 0) 644 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
645 return; 645 return;
646 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) 646 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
647 return; 647 return;
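Two patterns recur in the trace_syscalls.c hunks: syscall_nr is checked against NR_syscalls before it indexes the per-syscall bitmaps and file arrays (compat or unknown syscalls can legitimately report an out-of-range number), and pointers being cleared use RCU_INIT_POINTER() because storing NULL needs no publication ordering. The bounds-check half as a tiny runnable C example (table and lookup are illustrative):

#include <stddef.h>
#include <stdio.h>

#define NR_ENTRIES 16
static const char *table[NR_ENTRIES];

/* Reject negative and too-large indices before touching the table. */
static const char *lookup(int nr)
{
        if (nr < 0 || nr >= NR_ENTRIES)
                return NULL;
        return table[nr];
}

int main(void)
{
        table[3] = "example";
        printf("%s\n", lookup(3)  ? lookup(3)  : "(out of range)");
        printf("%s\n", lookup(99) ? lookup(99) : "(out of range)");
        return 0;
}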
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 394f70b17162..9586b670a5b2 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -14,7 +14,7 @@ static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
14void user_return_notifier_register(struct user_return_notifier *urn) 14void user_return_notifier_register(struct user_return_notifier *urn)
15{ 15{
16 set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); 16 set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
17 hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list)); 17 hlist_add_head(&urn->link, this_cpu_ptr(&return_notifier_list));
18} 18}
19EXPORT_SYMBOL_GPL(user_return_notifier_register); 19EXPORT_SYMBOL_GPL(user_return_notifier_register);
20 20
@@ -25,7 +25,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register);
25void user_return_notifier_unregister(struct user_return_notifier *urn) 25void user_return_notifier_unregister(struct user_return_notifier *urn)
26{ 26{
27 hlist_del(&urn->link); 27 hlist_del(&urn->link);
28 if (hlist_empty(&__get_cpu_var(return_notifier_list))) 28 if (hlist_empty(this_cpu_ptr(&return_notifier_list)))
29 clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); 29 clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
30} 30}
31EXPORT_SYMBOL_GPL(user_return_notifier_unregister); 31EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
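__get_cpu_var() is being retired in this cycle; the replacements used here are this_cpu_ptr()/raw_cpu_ptr() when the address of this CPU's instance is needed, and the this_cpu_*/raw_cpu_*/__this_cpu_* accessors for plain reads and writes. A hedged sketch of the pairing (my_counter and my_list are made-up per-CPU variables, not part of this patch):

#include <linux/percpu.h>
#include <linux/list.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(unsigned long, my_counter);
static DEFINE_PER_CPU(struct list_head, my_list);

static void touch_this_cpu(struct list_head *node)
{
        /* Plain value update: this_cpu_write() is preemption safe on its own. */
        this_cpu_write(my_counter, 0);

        /*
         * Address access: this_cpu_ptr() replaces &__get_cpu_var(). Preemption
         * must stay disabled while the pointer is used so it keeps referring
         * to the current CPU's instance.
         */
        preempt_disable();
        list_add(node, this_cpu_ptr(&my_list));
        preempt_enable();
}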
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index fcc02560fd6b..aa312b0dc3ec 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -526,21 +526,21 @@ static void m_stop(struct seq_file *seq, void *v)
526 return; 526 return;
527} 527}
528 528
529struct seq_operations proc_uid_seq_operations = { 529const struct seq_operations proc_uid_seq_operations = {
530 .start = uid_m_start, 530 .start = uid_m_start,
531 .stop = m_stop, 531 .stop = m_stop,
532 .next = m_next, 532 .next = m_next,
533 .show = uid_m_show, 533 .show = uid_m_show,
534}; 534};
535 535
536struct seq_operations proc_gid_seq_operations = { 536const struct seq_operations proc_gid_seq_operations = {
537 .start = gid_m_start, 537 .start = gid_m_start,
538 .stop = m_stop, 538 .stop = m_stop,
539 .next = m_next, 539 .next = m_next,
540 .show = gid_m_show, 540 .show = gid_m_show,
541}; 541};
542 542
543struct seq_operations proc_projid_seq_operations = { 543const struct seq_operations proc_projid_seq_operations = {
544 .start = projid_m_start, 544 .start = projid_m_start,
545 .stop = m_stop, 545 .stop = m_stop,
546 .next = m_next, 546 .next = m_next,
diff --git a/kernel/utsname.c b/kernel/utsname.c
index fd393124e507..883aaaa7de8a 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -93,13 +93,13 @@ static void *utsns_get(struct task_struct *task)
93 struct uts_namespace *ns = NULL; 93 struct uts_namespace *ns = NULL;
94 struct nsproxy *nsproxy; 94 struct nsproxy *nsproxy;
95 95
96 rcu_read_lock(); 96 task_lock(task);
97 nsproxy = task_nsproxy(task); 97 nsproxy = task->nsproxy;
98 if (nsproxy) { 98 if (nsproxy) {
99 ns = nsproxy->uts_ns; 99 ns = nsproxy->uts_ns;
100 get_uts_ns(ns); 100 get_uts_ns(ns);
101 } 101 }
102 rcu_read_unlock(); 102 task_unlock(task);
103 103
104 return ns; 104 return ns;
105} 105}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index c3319bd1b040..70bf11815f84 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -15,11 +15,6 @@
15#include <linux/cpu.h> 15#include <linux/cpu.h>
16#include <linux/nmi.h> 16#include <linux/nmi.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/delay.h>
19#include <linux/freezer.h>
20#include <linux/kthread.h>
21#include <linux/lockdep.h>
22#include <linux/notifier.h>
23#include <linux/module.h> 18#include <linux/module.h>
24#include <linux/sysctl.h> 19#include <linux/sysctl.h>
25#include <linux/smpboot.h> 20#include <linux/smpboot.h>
@@ -47,6 +42,7 @@ static DEFINE_PER_CPU(bool, softlockup_touch_sync);
47static DEFINE_PER_CPU(bool, soft_watchdog_warn); 42static DEFINE_PER_CPU(bool, soft_watchdog_warn);
48static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); 43static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
49static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); 44static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
45static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
50#ifdef CONFIG_HARDLOCKUP_DETECTOR 46#ifdef CONFIG_HARDLOCKUP_DETECTOR
51static DEFINE_PER_CPU(bool, hard_watchdog_warn); 47static DEFINE_PER_CPU(bool, hard_watchdog_warn);
52static DEFINE_PER_CPU(bool, watchdog_nmi_touch); 48static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
@@ -63,6 +59,25 @@ static unsigned long soft_lockup_nmi_warn;
63static int hardlockup_panic = 59static int hardlockup_panic =
64 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; 60 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
65 61
62static bool hardlockup_detector_enabled = true;
63/*
64 * We may not want to enable hard lockup detection by default in all cases,
65 * for example when running the kernel as a guest on a hypervisor. In these
66 * cases this function can be called to disable hard lockup detection. This
67 * function should only be executed once by the boot processor before the
68 * kernel command line parameters are parsed, because otherwise it is not
69 * possible to override this in hardlockup_panic_setup().
70 */
71void watchdog_enable_hardlockup_detector(bool val)
72{
73 hardlockup_detector_enabled = val;
74}
75
76bool watchdog_hardlockup_detector_is_enabled(void)
77{
78 return hardlockup_detector_enabled;
79}
80
66static int __init hardlockup_panic_setup(char *str) 81static int __init hardlockup_panic_setup(char *str)
67{ 82{
68 if (!strncmp(str, "panic", 5)) 83 if (!strncmp(str, "panic", 5))
@@ -71,6 +86,14 @@ static int __init hardlockup_panic_setup(char *str)
71 hardlockup_panic = 0; 86 hardlockup_panic = 0;
72 else if (!strncmp(str, "0", 1)) 87 else if (!strncmp(str, "0", 1))
73 watchdog_user_enabled = 0; 88 watchdog_user_enabled = 0;
89 else if (!strncmp(str, "1", 1) || !strncmp(str, "2", 1)) {
90 /*
91 * Setting 'nmi_watchdog=1' or 'nmi_watchdog=2' (legacy option)
92 * has the same effect.
93 */
94 watchdog_user_enabled = 1;
95 watchdog_enable_hardlockup_detector(true);
96 }
74 return 1; 97 return 1;
75} 98}
76__setup("nmi_watchdog=", hardlockup_panic_setup); 99__setup("nmi_watchdog=", hardlockup_panic_setup);
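watchdog_enable_hardlockup_detector() is meant to be called once, on the boot CPU, before the command line is parsed, so that for example a guest kernel can default the perf-based hard lockup detector to off while nmi_watchdog=1 on the command line can still re-enable it (which is what the new '1'/'2' branch above preserves). A hedged sketch of such a caller (my_guest_early_init is a placeholder for platform init code; the declaration lives in <linux/nmi.h> in kernels carrying this change):

#include <linux/nmi.h>
#include <linux/init.h>

/* Runs on the boot CPU, before command-line parameters are handled. */
static void __init my_guest_early_init(void)
{
        /*
         * Perf counters are emulated or unavailable under this hypervisor,
         * so default the hard lockup detector to off; "nmi_watchdog=1" on
         * the command line can still turn it back on.
         */
        watchdog_enable_hardlockup_detector(false);
}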
@@ -185,7 +208,7 @@ void touch_nmi_watchdog(void)
185 * case we shouldn't have to worry about the watchdog 208 * case we shouldn't have to worry about the watchdog
186 * going off. 209 * going off.
187 */ 210 */
188 __raw_get_cpu_var(watchdog_nmi_touch) = true; 211 raw_cpu_write(watchdog_nmi_touch, true);
189 touch_softlockup_watchdog(); 212 touch_softlockup_watchdog();
190} 213}
191EXPORT_SYMBOL(touch_nmi_watchdog); 214EXPORT_SYMBOL(touch_nmi_watchdog);
@@ -194,8 +217,8 @@ EXPORT_SYMBOL(touch_nmi_watchdog);
194 217
195void touch_softlockup_watchdog_sync(void) 218void touch_softlockup_watchdog_sync(void)
196{ 219{
197 __raw_get_cpu_var(softlockup_touch_sync) = true; 220 __this_cpu_write(softlockup_touch_sync, true);
198 __raw_get_cpu_var(watchdog_touch_ts) = 0; 221 __this_cpu_write(watchdog_touch_ts, 0);
199} 222}
200 223
201#ifdef CONFIG_HARDLOCKUP_DETECTOR 224#ifdef CONFIG_HARDLOCKUP_DETECTOR
@@ -260,9 +283,11 @@ static void watchdog_overflow_callback(struct perf_event *event,
260 return; 283 return;
261 284
262 if (hardlockup_panic) 285 if (hardlockup_panic)
263 panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); 286 panic("Watchdog detected hard LOCKUP on cpu %d",
287 this_cpu);
264 else 288 else
265 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); 289 WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
290 this_cpu);
266 291
267 __this_cpu_write(hard_watchdog_warn, true); 292 __this_cpu_write(hard_watchdog_warn, true);
268 return; 293 return;
@@ -331,8 +356,22 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
331 return HRTIMER_RESTART; 356 return HRTIMER_RESTART;
332 357
333 /* only warn once */ 358 /* only warn once */
334 if (__this_cpu_read(soft_watchdog_warn) == true) 359 if (__this_cpu_read(soft_watchdog_warn) == true) {
360 /*
361 * When multiple processes are causing softlockups the
362 * softlockup detector only warns on the first one
363 * because the code relies on a full quiet cycle to
364 * re-arm. The second process prevents the quiet cycle
365 * and never gets reported. Use task pointers to detect
366 * this.
367 */
368 if (__this_cpu_read(softlockup_task_ptr_saved) !=
369 current) {
370 __this_cpu_write(soft_watchdog_warn, false);
371 __touch_watchdog();
372 }
335 return HRTIMER_RESTART; 373 return HRTIMER_RESTART;
374 }
336 375
337 if (softlockup_all_cpu_backtrace) { 376 if (softlockup_all_cpu_backtrace) {
338 /* Prevent multiple soft-lockup reports if one cpu is already 377 /* Prevent multiple soft-lockup reports if one cpu is already
@@ -345,9 +384,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
345 } 384 }
346 } 385 }
347 386
348 printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 387 pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
349 smp_processor_id(), duration, 388 smp_processor_id(), duration,
350 current->comm, task_pid_nr(current)); 389 current->comm, task_pid_nr(current));
390 __this_cpu_write(softlockup_task_ptr_saved, current);
351 print_modules(); 391 print_modules();
352 print_irqtrace_events(current); 392 print_irqtrace_events(current);
353 if (regs) 393 if (regs)
@@ -366,6 +406,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
366 smp_mb__after_atomic(); 406 smp_mb__after_atomic();
367 } 407 }
368 408
409 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
369 if (softlockup_panic) 410 if (softlockup_panic)
370 panic("softlockup: hung tasks"); 411 panic("softlockup: hung tasks");
371 __this_cpu_write(soft_watchdog_warn, true); 412 __this_cpu_write(soft_watchdog_warn, true);
@@ -384,7 +425,7 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio)
384 425
385static void watchdog_enable(unsigned int cpu) 426static void watchdog_enable(unsigned int cpu)
386{ 427{
387 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 428 struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
388 429
389 /* kick off the timer for the hardlockup detector */ 430 /* kick off the timer for the hardlockup detector */
390 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 431 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -404,7 +445,7 @@ static void watchdog_enable(unsigned int cpu)
404 445
405static void watchdog_disable(unsigned int cpu) 446static void watchdog_disable(unsigned int cpu)
406{ 447{
407 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 448 struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
408 449
409 watchdog_set_prio(SCHED_NORMAL, 0); 450 watchdog_set_prio(SCHED_NORMAL, 0);
410 hrtimer_cancel(hrtimer); 451 hrtimer_cancel(hrtimer);
@@ -451,6 +492,15 @@ static int watchdog_nmi_enable(unsigned int cpu)
 	struct perf_event_attr *wd_attr;
 	struct perf_event *event = per_cpu(watchdog_ev, cpu);
 
+	/*
+	 * Some kernels need to default hard lockup detection to
+	 * 'disabled', for example a guest on a hypervisor.
+	 */
+	if (!watchdog_hardlockup_detector_is_enabled()) {
+		event = ERR_PTR(-ENOENT);
+		goto handle_err;
+	}
+
 	/* is it already setup and enabled? */
 	if (event && event->state > PERF_EVENT_STATE_OFF)
 		goto out;
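The early return above routes a disabled hard lockup detector through the same -ENOENT handling as hardware without usable perf events. The diff only adds the kernel-side switch; a hypothetical caller on the guest side (the hook name and header are assumptions, not part of this patch) might look like:

#include <linux/init.h>
#include <linux/nmi.h>	/* assumed home of the new prototypes */

/* Hypothetical guest init hook: opt out of hard lockup detection by
 * default, since a hypervisor can steal time from the perf NMI period. */
static void __init guest_watchdog_init(void)
{
	watchdog_enable_hardlockup_detector(false);
}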
@@ -465,6 +515,7 @@ static int watchdog_nmi_enable(unsigned int cpu)
 	/* Try to register using hardware perf events */
 	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
 
+handle_err:
 	/* save cpu0 error for future comparision */
 	if (cpu == 0 && IS_ERR(event))
 		cpu0_err = PTR_ERR(event);
@@ -484,7 +535,7 @@ static int watchdog_nmi_enable(unsigned int cpu)
 	if (PTR_ERR(event) == -EOPNOTSUPP)
 		pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
 	else if (PTR_ERR(event) == -ENOENT)
-		pr_warning("disabled (cpu%i): hardware events not enabled\n",
+		pr_warn("disabled (cpu%i): hardware events not enabled\n",
 			 cpu);
 	else
 		pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
@@ -511,7 +562,10 @@ static void watchdog_nmi_disable(unsigned int cpu)
 		/* should be in cleanup, but blocks oprofile */
 		perf_event_release_kernel(event);
 	}
-	return;
+	if (cpu == 0) {
+		/* watchdog_nmi_enable() expects this to be zero initially. */
+		cpu0_err = 0;
+	}
 }
 #else
 static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
@@ -531,7 +585,7 @@ static struct smp_hotplug_thread watchdog_threads = {
 
 static void restart_watchdog_hrtimer(void *info)
 {
-	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
+	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
 	int ret;
 
 	/*
@@ -607,11 +661,13 @@ int proc_dowatchdog(struct ctl_table *table, int write,
 		       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int err, old_thresh, old_enabled;
+	bool old_hardlockup;
 	static DEFINE_MUTEX(watchdog_proc_mutex);
 
 	mutex_lock(&watchdog_proc_mutex);
 	old_thresh = ACCESS_ONCE(watchdog_thresh);
 	old_enabled = ACCESS_ONCE(watchdog_user_enabled);
+	old_hardlockup = watchdog_hardlockup_detector_is_enabled();
 
 	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (err || !write)
@@ -623,15 +679,22 @@ int proc_dowatchdog(struct ctl_table *table, int write,
 	 * disabled. The 'watchdog_running' variable check in
 	 * watchdog_*_all_cpus() function takes care of this.
 	 */
-	if (watchdog_user_enabled && watchdog_thresh)
+	if (watchdog_user_enabled && watchdog_thresh) {
+		/*
+		 * Prevent a change in watchdog_thresh accidentally overriding
+		 * the enablement of the hardlockup detector.
+		 */
+		if (watchdog_user_enabled != old_enabled)
+			watchdog_enable_hardlockup_detector(true);
 		err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh);
-	else
+	} else
 		watchdog_disable_all_cpus();
 
 	/* Restore old values on failure */
 	if (err) {
 		watchdog_thresh = old_thresh;
 		watchdog_user_enabled = old_enabled;
+		watchdog_enable_hardlockup_detector(old_hardlockup);
 	}
 out:
 	mutex_unlock(&watchdog_proc_mutex);
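proc_dowatchdog() now snapshots the hard lockup default alongside the old threshold and enable flag, and restores all three if applying the new values fails, while an explicit 0 to 1 flip of the watchdog sysctl re-enables the hard lockup detector. A hypothetical user-space probe of that path through the standard /proc/sys/kernel/watchdog entry:

#include <stdio.h>

/* Write a string to a sysctl file; returns 0 on success. */
static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* Both writes land in proc_dowatchdog(); with this patch the
	 * 0 -> 1 transition also re-enables the hard lockup default. */
	write_sysctl("/proc/sys/kernel/watchdog", "0");
	write_sysctl("/proc/sys/kernel/watchdog", "1");
	return 0;
}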
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5dbe22aa3efd..09b685daee3d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2043,9 +2043,10 @@ __acquires(&pool->lock)
 	 * kernels, where a requeueing work item waiting for something to
 	 * happen could deadlock with stop_machine as such work item could
 	 * indefinitely requeue itself while all other CPUs are trapped in
-	 * stop_machine.
+	 * stop_machine. At the same time, report a quiescent RCU state so
+	 * the same condition doesn't freeze RCU.
 	 */
-	cond_resched();
+	cond_resched_rcu_qs();
 
 	spin_lock_irq(&pool->lock);
 
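cond_resched_rcu_qs() behaves like cond_resched() but additionally notes a voluntary context switch to RCU, so a work item that keeps requeueing itself without ever sleeping cannot stall grace periods. A sketch of the same pattern in an unrelated long-running kernel loop (the helper and list names are illustrative):

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Illustrative long-running loop: without the quiescent-state hint a
 * tight kernel loop that never blocks could hold up RCU indefinitely. */
static void drain_items(struct list_head *items)
{
	struct list_head *pos;

	list_for_each(pos, items) {
		/* ... process one item ... */
		cond_resched_rcu_qs();	/* may schedule(); also reports an RCU QS */
	}
}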