Diffstat (limited to 'kernel')
104 files changed, 11722 insertions, 4271 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index cb41b9547c9f..c0cc67ad764c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -5,12 +5,12 @@ | |||
5 | obj-y = fork.o exec_domain.o panic.o printk.o \ | 5 | obj-y = fork.o exec_domain.o panic.o printk.o \ |
6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ | 6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ |
7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ | 7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ |
8 | signal.o sys.o kmod.o workqueue.o pid.o \ | 8 | signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ |
9 | rcupdate.o extable.o params.o posix-timers.o \ | 9 | rcupdate.o extable.o params.o posix-timers.o \ |
10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ | 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ |
12 | notifier.o ksysfs.o cred.o \ | 12 | notifier.o ksysfs.o cred.o \ |
13 | async.o range.o groups.o | 13 | async.o range.o groups.o lglock.o |
14 | 14 | ||
15 | ifdef CONFIG_FUNCTION_TRACER | 15 | ifdef CONFIG_FUNCTION_TRACER |
16 | # Do not trace debug files and internal ftrace files | 16 | # Do not trace debug files and internal ftrace files |
@@ -25,6 +25,9 @@ endif | |||
25 | obj-y += sched/ | 25 | obj-y += sched/ |
26 | obj-y += power/ | 26 | obj-y += power/ |
27 | 27 | ||
28 | ifeq ($(CONFIG_CHECKPOINT_RESTORE),y) | ||
29 | obj-$(CONFIG_X86) += kcmp.o | ||
30 | endif | ||
28 | obj-$(CONFIG_FREEZER) += freezer.o | 31 | obj-$(CONFIG_FREEZER) += freezer.o |
29 | obj-$(CONFIG_PROFILING) += profile.o | 32 | obj-$(CONFIG_PROFILING) += profile.o |
30 | obj-$(CONFIG_STACKTRACE) += stacktrace.o | 33 | obj-$(CONFIG_STACKTRACE) += stacktrace.o |
@@ -43,6 +46,7 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o | |||
43 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o | 46 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o |
44 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o | 47 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o |
45 | obj-$(CONFIG_SMP) += smp.o | 48 | obj-$(CONFIG_SMP) += smp.o |
49 | obj-$(CONFIG_SMP) += smpboot.o | ||
46 | ifneq ($(CONFIG_SMP),y) | 50 | ifneq ($(CONFIG_SMP),y) |
47 | obj-y += up.o | 51 | obj-y += up.o |
48 | endif | 52 | endif |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index af1de0f34eae..4b96415527b8 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -67,6 +67,7 @@ | |||
67 | #include <linux/syscalls.h> | 67 | #include <linux/syscalls.h> |
68 | #include <linux/capability.h> | 68 | #include <linux/capability.h> |
69 | #include <linux/fs_struct.h> | 69 | #include <linux/fs_struct.h> |
70 | #include <linux/compat.h> | ||
70 | 71 | ||
71 | #include "audit.h" | 72 | #include "audit.h" |
72 | 73 | ||
@@ -2710,13 +2711,16 @@ void audit_core_dumps(long signr) | |||
2710 | audit_log_end(ab); | 2711 | audit_log_end(ab); |
2711 | } | 2712 | } |
2712 | 2713 | ||
2713 | void __audit_seccomp(unsigned long syscall) | 2714 | void __audit_seccomp(unsigned long syscall, long signr, int code) |
2714 | { | 2715 | { |
2715 | struct audit_buffer *ab; | 2716 | struct audit_buffer *ab; |
2716 | 2717 | ||
2717 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); | 2718 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); |
2718 | audit_log_abend(ab, "seccomp", SIGKILL); | 2719 | audit_log_abend(ab, "seccomp", signr); |
2719 | audit_log_format(ab, " syscall=%ld", syscall); | 2720 | audit_log_format(ab, " syscall=%ld", syscall); |
2721 | audit_log_format(ab, " compat=%d", is_compat_task()); | ||
2722 | audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current)); | ||
2723 | audit_log_format(ab, " code=0x%x", code); | ||
2720 | audit_log_end(ab); | 2724 | audit_log_end(ab); |
2721 | } | 2725 | } |
2722 | 2726 | ||
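The hunk above widens __audit_seccomp() so that callers pass the delivered signal and the seccomp return value, and the record gains compat, ip and code fields. A minimal sketch of a call site, assuming a seccomp filter path that kills with SIGSYS; the SECCOMP_RET_KILL constant and the surrounding logic are assumptions, not part of this diff:

#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>

static void report_blocked_syscall(unsigned long this_syscall)
{
	/* signal delivered to the task and the filter's verdict */
	__audit_seccomp(this_syscall, SIGSYS, SECCOMP_RET_KILL);
}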
diff --git a/kernel/capability.c b/kernel/capability.c
index 3f1adb6c6470..493d97259484 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -419,3 +419,24 @@ bool nsown_capable(int cap) | |||
419 | { | 419 | { |
420 | return ns_capable(current_user_ns(), cap); | 420 | return ns_capable(current_user_ns(), cap); |
421 | } | 421 | } |
422 | |||
423 | /** | ||
424 | * inode_capable - Check superior capability over inode | ||
425 | * @inode: The inode in question | ||
426 | * @cap: The capability in question | ||
427 | * | ||
428 | * Return true if the current task has the given superior capability | ||
429 | * targeted at its own user namespace and that the given inode is owned | ||
430 | * by the current user namespace or a child namespace. | ||
431 | * | ||
432 | * Currently we check to see if an inode is owned by the current | ||
433 | * user namespace by seeing if the inode's owner maps into the | ||
434 | * current user namespace. | ||
435 | * | ||
436 | */ | ||
437 | bool inode_capable(const struct inode *inode, int cap) | ||
438 | { | ||
439 | struct user_namespace *ns = current_user_ns(); | ||
440 | |||
441 | return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); | ||
442 | } | ||
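inode_capable() pairs a capability check with a check that the inode's owner maps into the caller's user namespace. A sketch of how an ownership-change check might use it; may_change_owner() and the surrounding policy are illustrative, not taken from this diff:

#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/fs.h>
#include <linux/uidgid.h>

static bool may_change_owner(const struct inode *inode)
{
	/* the owner may always act on the inode... */
	if (uid_eq(current_fsuid(), inode->i_uid))
		return true;
	/* ...otherwise require CAP_CHOWN over a namespace that covers it */
	return inode_capable(inode, CAP_CHOWN);
}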
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ed64ccac67c9..b303dfc7dce0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,9 +60,13 @@ | |||
60 | #include <linux/eventfd.h> | 60 | #include <linux/eventfd.h> |
61 | #include <linux/poll.h> | 61 | #include <linux/poll.h> |
62 | #include <linux/flex_array.h> /* used in cgroup_attach_proc */ | 62 | #include <linux/flex_array.h> /* used in cgroup_attach_proc */ |
63 | #include <linux/kthread.h> | ||
63 | 64 | ||
64 | #include <linux/atomic.h> | 65 | #include <linux/atomic.h> |
65 | 66 | ||
67 | /* css deactivation bias, makes css->refcnt negative to deny new trygets */ | ||
68 | #define CSS_DEACT_BIAS INT_MIN | ||
69 | |||
66 | /* | 70 | /* |
67 | * cgroup_mutex is the master lock. Any modification to cgroup or its | 71 | * cgroup_mutex is the master lock. Any modification to cgroup or its |
68 | * hierarchy must be performed while holding it. | 72 | * hierarchy must be performed while holding it. |
@@ -127,6 +131,9 @@ struct cgroupfs_root { | |||
127 | /* A list running through the active hierarchies */ | 131 | /* A list running through the active hierarchies */ |
128 | struct list_head root_list; | 132 | struct list_head root_list; |
129 | 133 | ||
134 | /* All cgroups on this root, cgroup_mutex protected */ | ||
135 | struct list_head allcg_list; | ||
136 | |||
130 | /* Hierarchy-specific flags */ | 137 | /* Hierarchy-specific flags */ |
131 | unsigned long flags; | 138 | unsigned long flags; |
132 | 139 | ||
@@ -145,6 +152,15 @@ struct cgroupfs_root { | |||
145 | static struct cgroupfs_root rootnode; | 152 | static struct cgroupfs_root rootnode; |
146 | 153 | ||
147 | /* | 154 | /* |
155 | * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. | ||
156 | */ | ||
157 | struct cfent { | ||
158 | struct list_head node; | ||
159 | struct dentry *dentry; | ||
160 | struct cftype *type; | ||
161 | }; | ||
162 | |||
163 | /* | ||
148 | * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when | 164 | * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when |
149 | * cgroup_subsys->use_id != 0. | 165 | * cgroup_subsys->use_id != 0. |
150 | */ | 166 | */ |
@@ -239,6 +255,19 @@ int cgroup_lock_is_held(void) | |||
239 | 255 | ||
240 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); | 256 | EXPORT_SYMBOL_GPL(cgroup_lock_is_held); |
241 | 257 | ||
258 | static int css_unbias_refcnt(int refcnt) | ||
259 | { | ||
260 | return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; | ||
261 | } | ||
262 | |||
263 | /* the current nr of refs, always >= 0 whether @css is deactivated or not */ | ||
264 | static int css_refcnt(struct cgroup_subsys_state *css) | ||
265 | { | ||
266 | int v = atomic_read(&css->refcnt); | ||
267 | |||
268 | return css_unbias_refcnt(v); | ||
269 | } | ||
270 | |||
242 | /* convenient tests for these bits */ | 271 | /* convenient tests for these bits */ |
243 | inline int cgroup_is_removed(const struct cgroup *cgrp) | 272 | inline int cgroup_is_removed(const struct cgroup *cgrp) |
244 | { | 273 | { |
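The deactivation bias relies on simple signed arithmetic: adding INT_MIN pushes the count negative so css_tryget() can tell a dying css from a live one, and css_unbias_refcnt() subtracts the bias back out to recover the true count. A standalone sketch of the same arithmetic (userspace, plain int instead of atomic_t):

#include <limits.h>
#include <stdio.h>

#define CSS_DEACT_BIAS INT_MIN

static int unbias(int refcnt)
{
	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
}

int main(void)
{
	int refcnt = 3;			/* three live references */

	refcnt += CSS_DEACT_BIAS;	/* deactivate: count goes negative */
	printf("biased=%d real=%d\n", refcnt, unbias(refcnt));	/* real=3 */

	refcnt -= CSS_DEACT_BIAS;	/* removal aborted: restore */
	printf("restored=%d\n", refcnt);			/* 3 again */
	return 0;
}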
@@ -279,6 +308,21 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling) | |||
279 | #define for_each_active_root(_root) \ | 308 | #define for_each_active_root(_root) \ |
280 | list_for_each_entry(_root, &roots, root_list) | 309 | list_for_each_entry(_root, &roots, root_list) |
281 | 310 | ||
311 | static inline struct cgroup *__d_cgrp(struct dentry *dentry) | ||
312 | { | ||
313 | return dentry->d_fsdata; | ||
314 | } | ||
315 | |||
316 | static inline struct cfent *__d_cfe(struct dentry *dentry) | ||
317 | { | ||
318 | return dentry->d_fsdata; | ||
319 | } | ||
320 | |||
321 | static inline struct cftype *__d_cft(struct dentry *dentry) | ||
322 | { | ||
323 | return __d_cfe(dentry)->type; | ||
324 | } | ||
325 | |||
282 | /* the list of cgroups eligible for automatic release. Protected by | 326 | /* the list of cgroups eligible for automatic release. Protected by |
283 | * release_list_lock */ | 327 | * release_list_lock */ |
284 | static LIST_HEAD(release_list); | 328 | static LIST_HEAD(release_list); |
@@ -816,12 +860,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) | |||
816 | struct cgroup_subsys *ss; | 860 | struct cgroup_subsys *ss; |
817 | int ret = 0; | 861 | int ret = 0; |
818 | 862 | ||
819 | for_each_subsys(cgrp->root, ss) | 863 | for_each_subsys(cgrp->root, ss) { |
820 | if (ss->pre_destroy) { | 864 | if (!ss->pre_destroy) |
821 | ret = ss->pre_destroy(cgrp); | 865 | continue; |
822 | if (ret) | 866 | |
823 | break; | 867 | ret = ss->pre_destroy(cgrp); |
868 | if (ret) { | ||
869 | /* ->pre_destroy() failure is being deprecated */ | ||
870 | WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs); | ||
871 | break; | ||
824 | } | 872 | } |
873 | } | ||
825 | 874 | ||
826 | return ret; | 875 | return ret; |
827 | } | 876 | } |
@@ -864,6 +913,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
864 | BUG_ON(!list_empty(&cgrp->pidlists)); | 913 | BUG_ON(!list_empty(&cgrp->pidlists)); |
865 | 914 | ||
866 | kfree_rcu(cgrp, rcu_head); | 915 | kfree_rcu(cgrp, rcu_head); |
916 | } else { | ||
917 | struct cfent *cfe = __d_cfe(dentry); | ||
918 | struct cgroup *cgrp = dentry->d_parent->d_fsdata; | ||
919 | |||
920 | WARN_ONCE(!list_empty(&cfe->node) && | ||
921 | cgrp != &cgrp->root->top_cgroup, | ||
922 | "cfe still linked for %s\n", cfe->type->name); | ||
923 | kfree(cfe); | ||
867 | } | 924 | } |
868 | iput(inode); | 925 | iput(inode); |
869 | } | 926 | } |
@@ -882,34 +939,36 @@ static void remove_dir(struct dentry *d) | |||
882 | dput(parent); | 939 | dput(parent); |
883 | } | 940 | } |
884 | 941 | ||
885 | static void cgroup_clear_directory(struct dentry *dentry) | 942 | static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) |
886 | { | 943 | { |
887 | struct list_head *node; | 944 | struct cfent *cfe; |
888 | 945 | ||
889 | BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); | 946 | lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); |
890 | spin_lock(&dentry->d_lock); | 947 | lockdep_assert_held(&cgroup_mutex); |
891 | node = dentry->d_subdirs.next; | 948 | |
892 | while (node != &dentry->d_subdirs) { | 949 | list_for_each_entry(cfe, &cgrp->files, node) { |
893 | struct dentry *d = list_entry(node, struct dentry, d_u.d_child); | 950 | struct dentry *d = cfe->dentry; |
894 | 951 | ||
895 | spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); | 952 | if (cft && cfe->type != cft) |
896 | list_del_init(node); | 953 | continue; |
897 | if (d->d_inode) { | 954 | |
898 | /* This should never be called on a cgroup | 955 | dget(d); |
899 | * directory with child cgroups */ | 956 | d_delete(d); |
900 | BUG_ON(d->d_inode->i_mode & S_IFDIR); | 957 | simple_unlink(d->d_inode, d); |
901 | dget_dlock(d); | 958 | list_del_init(&cfe->node); |
902 | spin_unlock(&d->d_lock); | 959 | dput(d); |
903 | spin_unlock(&dentry->d_lock); | 960 | |
904 | d_delete(d); | 961 | return 0; |
905 | simple_unlink(dentry->d_inode, d); | ||
906 | dput(d); | ||
907 | spin_lock(&dentry->d_lock); | ||
908 | } else | ||
909 | spin_unlock(&d->d_lock); | ||
910 | node = dentry->d_subdirs.next; | ||
911 | } | 962 | } |
912 | spin_unlock(&dentry->d_lock); | 963 | return -ENOENT; |
964 | } | ||
965 | |||
966 | static void cgroup_clear_directory(struct dentry *dir) | ||
967 | { | ||
968 | struct cgroup *cgrp = __d_cgrp(dir); | ||
969 | |||
970 | while (!list_empty(&cgrp->files)) | ||
971 | cgroup_rm_file(cgrp, NULL); | ||
913 | } | 972 | } |
914 | 973 | ||
915 | /* | 974 | /* |
@@ -1294,6 +1353,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1294 | if (ret) | 1353 | if (ret) |
1295 | goto out_unlock; | 1354 | goto out_unlock; |
1296 | 1355 | ||
1356 | /* See feature-removal-schedule.txt */ | ||
1357 | if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent) | ||
1358 | pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", | ||
1359 | task_tgid_nr(current), current->comm); | ||
1360 | |||
1297 | /* Don't allow flags or name to change at remount */ | 1361 | /* Don't allow flags or name to change at remount */ |
1298 | if (opts.flags != root->flags || | 1362 | if (opts.flags != root->flags || |
1299 | (opts.name && strcmp(opts.name, root->name))) { | 1363 | (opts.name && strcmp(opts.name, root->name))) { |
@@ -1308,7 +1372,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1308 | goto out_unlock; | 1372 | goto out_unlock; |
1309 | } | 1373 | } |
1310 | 1374 | ||
1311 | /* (re)populate subsystem files */ | 1375 | /* clear out any existing files and repopulate subsystem files */ |
1376 | cgroup_clear_directory(cgrp->dentry); | ||
1312 | cgroup_populate_dir(cgrp); | 1377 | cgroup_populate_dir(cgrp); |
1313 | 1378 | ||
1314 | if (opts.release_agent) | 1379 | if (opts.release_agent) |
@@ -1333,6 +1398,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1333 | { | 1398 | { |
1334 | INIT_LIST_HEAD(&cgrp->sibling); | 1399 | INIT_LIST_HEAD(&cgrp->sibling); |
1335 | INIT_LIST_HEAD(&cgrp->children); | 1400 | INIT_LIST_HEAD(&cgrp->children); |
1401 | INIT_LIST_HEAD(&cgrp->files); | ||
1336 | INIT_LIST_HEAD(&cgrp->css_sets); | 1402 | INIT_LIST_HEAD(&cgrp->css_sets); |
1337 | INIT_LIST_HEAD(&cgrp->release_list); | 1403 | INIT_LIST_HEAD(&cgrp->release_list); |
1338 | INIT_LIST_HEAD(&cgrp->pidlists); | 1404 | INIT_LIST_HEAD(&cgrp->pidlists); |
@@ -1344,11 +1410,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1344 | static void init_cgroup_root(struct cgroupfs_root *root) | 1410 | static void init_cgroup_root(struct cgroupfs_root *root) |
1345 | { | 1411 | { |
1346 | struct cgroup *cgrp = &root->top_cgroup; | 1412 | struct cgroup *cgrp = &root->top_cgroup; |
1413 | |||
1347 | INIT_LIST_HEAD(&root->subsys_list); | 1414 | INIT_LIST_HEAD(&root->subsys_list); |
1348 | INIT_LIST_HEAD(&root->root_list); | 1415 | INIT_LIST_HEAD(&root->root_list); |
1416 | INIT_LIST_HEAD(&root->allcg_list); | ||
1349 | root->number_of_cgroups = 1; | 1417 | root->number_of_cgroups = 1; |
1350 | cgrp->root = root; | 1418 | cgrp->root = root; |
1351 | cgrp->top_cgroup = cgrp; | 1419 | cgrp->top_cgroup = cgrp; |
1420 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
1352 | init_cgroup_housekeeping(cgrp); | 1421 | init_cgroup_housekeeping(cgrp); |
1353 | } | 1422 | } |
1354 | 1423 | ||
@@ -1692,16 +1761,6 @@ static struct file_system_type cgroup_fs_type = { | |||
1692 | 1761 | ||
1693 | static struct kobject *cgroup_kobj; | 1762 | static struct kobject *cgroup_kobj; |
1694 | 1763 | ||
1695 | static inline struct cgroup *__d_cgrp(struct dentry *dentry) | ||
1696 | { | ||
1697 | return dentry->d_fsdata; | ||
1698 | } | ||
1699 | |||
1700 | static inline struct cftype *__d_cft(struct dentry *dentry) | ||
1701 | { | ||
1702 | return dentry->d_fsdata; | ||
1703 | } | ||
1704 | |||
1705 | /** | 1764 | /** |
1706 | * cgroup_path - generate the path of a cgroup | 1765 | * cgroup_path - generate the path of a cgroup |
1707 | * @cgrp: the cgroup in question | 1766 | * @cgrp: the cgroup in question |
@@ -2160,9 +2219,9 @@ retry_find_task: | |||
2160 | * only need to check permissions on one of them. | 2219 | * only need to check permissions on one of them. |
2161 | */ | 2220 | */ |
2162 | tcred = __task_cred(tsk); | 2221 | tcred = __task_cred(tsk); |
2163 | if (cred->euid && | 2222 | if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && |
2164 | cred->euid != tcred->uid && | 2223 | !uid_eq(cred->euid, tcred->uid) && |
2165 | cred->euid != tcred->suid) { | 2224 | !uid_eq(cred->euid, tcred->suid)) { |
2166 | rcu_read_unlock(); | 2225 | rcu_read_unlock(); |
2167 | ret = -EACCES; | 2226 | ret = -EACCES; |
2168 | goto out_unlock_cgroup; | 2227 | goto out_unlock_cgroup; |
@@ -2172,6 +2231,18 @@ retry_find_task: | |||
2172 | 2231 | ||
2173 | if (threadgroup) | 2232 | if (threadgroup) |
2174 | tsk = tsk->group_leader; | 2233 | tsk = tsk->group_leader; |
2234 | |||
2235 | /* | ||
2236 | * Workqueue threads may acquire PF_THREAD_BOUND and become | ||
2237 | * trapped in a cpuset, or RT worker may be born in a cgroup | ||
2238 | * with no rt_runtime allocated. Just say no. | ||
2239 | */ | ||
2240 | if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { | ||
2241 | ret = -EINVAL; | ||
2242 | rcu_read_unlock(); | ||
2243 | goto out_unlock_cgroup; | ||
2244 | } | ||
2245 | |||
2175 | get_task_struct(tsk); | 2246 | get_task_struct(tsk); |
2176 | rcu_read_unlock(); | 2247 | rcu_read_unlock(); |
2177 | 2248 | ||
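The permission check above switches from raw integer comparisons to uid_eq() because credential uids are now kuid_t structures; testing against 0 becomes a comparison with GLOBAL_ROOT_UID. A sketch of the idiom in isolation (may_touch_task() is a hypothetical name):

#include <linux/cred.h>
#include <linux/uidgid.h>

static bool may_touch_task(const struct cred *cred, const struct cred *tcred)
{
	return uid_eq(cred->euid, GLOBAL_ROOT_UID) ||	/* global root */
	       uid_eq(cred->euid, tcred->uid) ||	/* same real uid */
	       uid_eq(cred->euid, tcred->suid);		/* same saved uid */
}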
@@ -2603,50 +2674,191 @@ static umode_t cgroup_file_mode(const struct cftype *cft) | |||
2603 | return mode; | 2674 | return mode; |
2604 | } | 2675 | } |
2605 | 2676 | ||
2606 | int cgroup_add_file(struct cgroup *cgrp, | 2677 | static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, |
2607 | struct cgroup_subsys *subsys, | 2678 | const struct cftype *cft) |
2608 | const struct cftype *cft) | ||
2609 | { | 2679 | { |
2610 | struct dentry *dir = cgrp->dentry; | 2680 | struct dentry *dir = cgrp->dentry; |
2681 | struct cgroup *parent = __d_cgrp(dir); | ||
2611 | struct dentry *dentry; | 2682 | struct dentry *dentry; |
2683 | struct cfent *cfe; | ||
2612 | int error; | 2684 | int error; |
2613 | umode_t mode; | 2685 | umode_t mode; |
2614 | |||
2615 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; | 2686 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; |
2687 | |||
2688 | /* does @cft->flags tell us to skip creation on @cgrp? */ | ||
2689 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) | ||
2690 | return 0; | ||
2691 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) | ||
2692 | return 0; | ||
2693 | |||
2616 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { | 2694 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { |
2617 | strcpy(name, subsys->name); | 2695 | strcpy(name, subsys->name); |
2618 | strcat(name, "."); | 2696 | strcat(name, "."); |
2619 | } | 2697 | } |
2620 | strcat(name, cft->name); | 2698 | strcat(name, cft->name); |
2699 | |||
2621 | BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); | 2700 | BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); |
2701 | |||
2702 | cfe = kzalloc(sizeof(*cfe), GFP_KERNEL); | ||
2703 | if (!cfe) | ||
2704 | return -ENOMEM; | ||
2705 | |||
2622 | dentry = lookup_one_len(name, dir, strlen(name)); | 2706 | dentry = lookup_one_len(name, dir, strlen(name)); |
2623 | if (!IS_ERR(dentry)) { | 2707 | if (IS_ERR(dentry)) { |
2624 | mode = cgroup_file_mode(cft); | ||
2625 | error = cgroup_create_file(dentry, mode | S_IFREG, | ||
2626 | cgrp->root->sb); | ||
2627 | if (!error) | ||
2628 | dentry->d_fsdata = (void *)cft; | ||
2629 | dput(dentry); | ||
2630 | } else | ||
2631 | error = PTR_ERR(dentry); | 2708 | error = PTR_ERR(dentry); |
2709 | goto out; | ||
2710 | } | ||
2711 | |||
2712 | mode = cgroup_file_mode(cft); | ||
2713 | error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); | ||
2714 | if (!error) { | ||
2715 | cfe->type = (void *)cft; | ||
2716 | cfe->dentry = dentry; | ||
2717 | dentry->d_fsdata = cfe; | ||
2718 | list_add_tail(&cfe->node, &parent->files); | ||
2719 | cfe = NULL; | ||
2720 | } | ||
2721 | dput(dentry); | ||
2722 | out: | ||
2723 | kfree(cfe); | ||
2632 | return error; | 2724 | return error; |
2633 | } | 2725 | } |
2634 | EXPORT_SYMBOL_GPL(cgroup_add_file); | ||
2635 | 2726 | ||
2636 | int cgroup_add_files(struct cgroup *cgrp, | 2727 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, |
2637 | struct cgroup_subsys *subsys, | 2728 | const struct cftype cfts[], bool is_add) |
2638 | const struct cftype cft[], | ||
2639 | int count) | ||
2640 | { | 2729 | { |
2641 | int i, err; | 2730 | const struct cftype *cft; |
2642 | for (i = 0; i < count; i++) { | 2731 | int err, ret = 0; |
2643 | err = cgroup_add_file(cgrp, subsys, &cft[i]); | 2732 | |
2644 | if (err) | 2733 | for (cft = cfts; cft->name[0] != '\0'; cft++) { |
2645 | return err; | 2734 | if (is_add) |
2735 | err = cgroup_add_file(cgrp, subsys, cft); | ||
2736 | else | ||
2737 | err = cgroup_rm_file(cgrp, cft); | ||
2738 | if (err) { | ||
2739 | pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n", | ||
2740 | is_add ? "add" : "remove", cft->name, err); | ||
2741 | ret = err; | ||
2742 | } | ||
2743 | } | ||
2744 | return ret; | ||
2745 | } | ||
2746 | |||
2747 | static DEFINE_MUTEX(cgroup_cft_mutex); | ||
2748 | |||
2749 | static void cgroup_cfts_prepare(void) | ||
2750 | __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) | ||
2751 | { | ||
2752 | /* | ||
2753 | * Thanks to the entanglement with vfs inode locking, we can't walk | ||
2754 | * the existing cgroups under cgroup_mutex and create files. | ||
2755 | * Instead, we increment reference on all cgroups and build list of | ||
2756 | * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure | ||
2757 | * exclusive access to the field. | ||
2758 | */ | ||
2759 | mutex_lock(&cgroup_cft_mutex); | ||
2760 | mutex_lock(&cgroup_mutex); | ||
2761 | } | ||
2762 | |||
2763 | static void cgroup_cfts_commit(struct cgroup_subsys *ss, | ||
2764 | const struct cftype *cfts, bool is_add) | ||
2765 | __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) | ||
2766 | { | ||
2767 | LIST_HEAD(pending); | ||
2768 | struct cgroup *cgrp, *n; | ||
2769 | |||
2770 | /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ | ||
2771 | if (cfts && ss->root != &rootnode) { | ||
2772 | list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { | ||
2773 | dget(cgrp->dentry); | ||
2774 | list_add_tail(&cgrp->cft_q_node, &pending); | ||
2775 | } | ||
2646 | } | 2776 | } |
2777 | |||
2778 | mutex_unlock(&cgroup_mutex); | ||
2779 | |||
2780 | /* | ||
2781 | * All new cgroups will see @cfts update on @ss->cftsets. Add/rm | ||
2782 | * files for all cgroups which were created before. | ||
2783 | */ | ||
2784 | list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { | ||
2785 | struct inode *inode = cgrp->dentry->d_inode; | ||
2786 | |||
2787 | mutex_lock(&inode->i_mutex); | ||
2788 | mutex_lock(&cgroup_mutex); | ||
2789 | if (!cgroup_is_removed(cgrp)) | ||
2790 | cgroup_addrm_files(cgrp, ss, cfts, is_add); | ||
2791 | mutex_unlock(&cgroup_mutex); | ||
2792 | mutex_unlock(&inode->i_mutex); | ||
2793 | |||
2794 | list_del_init(&cgrp->cft_q_node); | ||
2795 | dput(cgrp->dentry); | ||
2796 | } | ||
2797 | |||
2798 | mutex_unlock(&cgroup_cft_mutex); | ||
2799 | } | ||
2800 | |||
2801 | /** | ||
2802 | * cgroup_add_cftypes - add an array of cftypes to a subsystem | ||
2803 | * @ss: target cgroup subsystem | ||
2804 | * @cfts: zero-length name terminated array of cftypes | ||
2805 | * | ||
2806 | * Register @cfts to @ss. Files described by @cfts are created for all | ||
2807 | * existing cgroups to which @ss is attached and all future cgroups will | ||
2808 | * have them too. This function can be called anytime whether @ss is | ||
2809 | * attached or not. | ||
2810 | * | ||
2811 | * Returns 0 on successful registration, -errno on failure. Note that this | ||
2812 | * function currently returns 0 as long as @cfts registration is successful | ||
2813 | * even if some file creation attempts on existing cgroups fail. | ||
2814 | */ | ||
2815 | int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) | ||
2816 | { | ||
2817 | struct cftype_set *set; | ||
2818 | |||
2819 | set = kzalloc(sizeof(*set), GFP_KERNEL); | ||
2820 | if (!set) | ||
2821 | return -ENOMEM; | ||
2822 | |||
2823 | cgroup_cfts_prepare(); | ||
2824 | set->cfts = cfts; | ||
2825 | list_add_tail(&set->node, &ss->cftsets); | ||
2826 | cgroup_cfts_commit(ss, cfts, true); | ||
2827 | |||
2647 | return 0; | 2828 | return 0; |
2648 | } | 2829 | } |
2649 | EXPORT_SYMBOL_GPL(cgroup_add_files); | 2830 | EXPORT_SYMBOL_GPL(cgroup_add_cftypes); |
2831 | |||
2832 | /** | ||
2833 | * cgroup_rm_cftypes - remove an array of cftypes from a subsystem | ||
2834 | * @ss: target cgroup subsystem | ||
2835 | * @cfts: zero-length name terminated array of cftypes | ||
2836 | * | ||
2837 | * Unregister @cfts from @ss. Files described by @cfts are removed from | ||
2838 | * all existing cgroups to which @ss is attached and all future cgroups | ||
2839 | * won't have them either. This function can be called anytime whether @ss | ||
2840 | * is attached or not. | ||
2841 | * | ||
2842 | * Returns 0 on successful unregistration, -ENOENT if @cfts is not | ||
2843 | * registered with @ss. | ||
2844 | */ | ||
2845 | int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) | ||
2846 | { | ||
2847 | struct cftype_set *set; | ||
2848 | |||
2849 | cgroup_cfts_prepare(); | ||
2850 | |||
2851 | list_for_each_entry(set, &ss->cftsets, node) { | ||
2852 | if (set->cfts == cfts) { | ||
2853 | list_del_init(&set->node); | ||
2854 | cgroup_cfts_commit(ss, cfts, false); | ||
2855 | return 0; | ||
2856 | } | ||
2857 | } | ||
2858 | |||
2859 | cgroup_cfts_commit(ss, NULL, false); | ||
2860 | return -ENOENT; | ||
2861 | } | ||
2650 | 2862 | ||
2651 | /** | 2863 | /** |
2652 | * cgroup_task_count - count the number of tasks in a cgroup. | 2864 | * cgroup_task_count - count the number of tasks in a cgroup. |
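With cgroup_add_cftypes()/cgroup_rm_cftypes(), a controller describes its files in a zero-terminated cftype array and either points .base_cftypes at it or registers it at run time; the debug and freezer conversions later in this diff follow the same pattern. A sketch under those assumptions ("example_subsys" and the file it creates are hypothetical):

#include <linux/cgroup.h>
#include <linux/init.h>

extern struct cgroup_subsys example_subsys;	/* assumed controller */

static u64 example_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
	return 0;	/* report some per-cgroup value */
}

static struct cftype example_files[] = {
	{
		.name = "value",
		.flags = CFTYPE_NOT_ON_ROOT,	/* skip the root cgroup */
		.read_u64 = example_read_u64,
	},
	{ }	/* terminating entry replaces the old count argument */
};

/* either set example_subsys.base_cftypes = example_files, or add later: */
static int __init example_files_init(void)
{
	return cgroup_add_cftypes(&example_subsys, example_files);
}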
@@ -3625,13 +3837,14 @@ static struct cftype files[] = { | |||
3625 | .read_u64 = cgroup_clone_children_read, | 3837 | .read_u64 = cgroup_clone_children_read, |
3626 | .write_u64 = cgroup_clone_children_write, | 3838 | .write_u64 = cgroup_clone_children_write, |
3627 | }, | 3839 | }, |
3628 | }; | 3840 | { |
3629 | 3841 | .name = "release_agent", | |
3630 | static struct cftype cft_release_agent = { | 3842 | .flags = CFTYPE_ONLY_ON_ROOT, |
3631 | .name = "release_agent", | 3843 | .read_seq_string = cgroup_release_agent_show, |
3632 | .read_seq_string = cgroup_release_agent_show, | 3844 | .write_string = cgroup_release_agent_write, |
3633 | .write_string = cgroup_release_agent_write, | 3845 | .max_write_len = PATH_MAX, |
3634 | .max_write_len = PATH_MAX, | 3846 | }, |
3847 | { } /* terminate */ | ||
3635 | }; | 3848 | }; |
3636 | 3849 | ||
3637 | static int cgroup_populate_dir(struct cgroup *cgrp) | 3850 | static int cgroup_populate_dir(struct cgroup *cgrp) |
@@ -3639,22 +3852,18 @@ static int cgroup_populate_dir(struct cgroup *cgrp) | |||
3639 | int err; | 3852 | int err; |
3640 | struct cgroup_subsys *ss; | 3853 | struct cgroup_subsys *ss; |
3641 | 3854 | ||
3642 | /* First clear out any existing files */ | 3855 | err = cgroup_addrm_files(cgrp, NULL, files, true); |
3643 | cgroup_clear_directory(cgrp->dentry); | ||
3644 | |||
3645 | err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files)); | ||
3646 | if (err < 0) | 3856 | if (err < 0) |
3647 | return err; | 3857 | return err; |
3648 | 3858 | ||
3649 | if (cgrp == cgrp->top_cgroup) { | 3859 | /* process cftsets of each subsystem */ |
3650 | if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0) | ||
3651 | return err; | ||
3652 | } | ||
3653 | |||
3654 | for_each_subsys(cgrp->root, ss) { | 3860 | for_each_subsys(cgrp->root, ss) { |
3655 | if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) | 3861 | struct cftype_set *set; |
3656 | return err; | 3862 | |
3863 | list_for_each_entry(set, &ss->cftsets, node) | ||
3864 | cgroup_addrm_files(cgrp, ss, set->cfts, true); | ||
3657 | } | 3865 | } |
3866 | |||
3658 | /* This cgroup is ready now */ | 3867 | /* This cgroup is ready now */ |
3659 | for_each_subsys(cgrp->root, ss) { | 3868 | for_each_subsys(cgrp->root, ss) { |
3660 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 3869 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
@@ -3670,6 +3879,18 @@ static int cgroup_populate_dir(struct cgroup *cgrp) | |||
3670 | return 0; | 3879 | return 0; |
3671 | } | 3880 | } |
3672 | 3881 | ||
3882 | static void css_dput_fn(struct work_struct *work) | ||
3883 | { | ||
3884 | struct cgroup_subsys_state *css = | ||
3885 | container_of(work, struct cgroup_subsys_state, dput_work); | ||
3886 | struct dentry *dentry = css->cgroup->dentry; | ||
3887 | struct super_block *sb = dentry->d_sb; | ||
3888 | |||
3889 | atomic_inc(&sb->s_active); | ||
3890 | dput(dentry); | ||
3891 | deactivate_super(sb); | ||
3892 | } | ||
3893 | |||
3673 | static void init_cgroup_css(struct cgroup_subsys_state *css, | 3894 | static void init_cgroup_css(struct cgroup_subsys_state *css, |
3674 | struct cgroup_subsys *ss, | 3895 | struct cgroup_subsys *ss, |
3675 | struct cgroup *cgrp) | 3896 | struct cgroup *cgrp) |
@@ -3682,6 +3903,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, | |||
3682 | set_bit(CSS_ROOT, &css->flags); | 3903 | set_bit(CSS_ROOT, &css->flags); |
3683 | BUG_ON(cgrp->subsys[ss->subsys_id]); | 3904 | BUG_ON(cgrp->subsys[ss->subsys_id]); |
3684 | cgrp->subsys[ss->subsys_id] = css; | 3905 | cgrp->subsys[ss->subsys_id] = css; |
3906 | |||
3907 | /* | ||
3908 | * If !clear_css_refs, css holds an extra ref to @cgrp->dentry | ||
3909 | * which is put on the last css_put(). dput() requires process | ||
3910 | * context, which css_put() may be called without. @css->dput_work | ||
3911 | * will be used to invoke dput() asynchronously from css_put(). | ||
3912 | */ | ||
3913 | INIT_WORK(&css->dput_work, css_dput_fn); | ||
3914 | if (ss->__DEPRECATED_clear_css_refs) | ||
3915 | set_bit(CSS_CLEAR_CSS_REFS, &css->flags); | ||
3685 | } | 3916 | } |
3686 | 3917 | ||
3687 | static void cgroup_lock_hierarchy(struct cgroupfs_root *root) | 3918 | static void cgroup_lock_hierarchy(struct cgroupfs_root *root) |
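css_dput_fn() exists because the final css_put() may run in atomic context while dput() can sleep; the work item moves the dput to process context and pins the superblock via s_active across it. A generic sketch of that defer-to-workqueue shape (struct and function names are illustrative):

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct thing {
	struct work_struct cleanup_work;
	/* ... resources whose release may sleep ... */
};

static void thing_cleanup_fn(struct work_struct *work)
{
	struct thing *t = container_of(work, struct thing, cleanup_work);

	/* sleeping teardown goes here, e.g. dput()/deactivate_super() */
	kfree(t);
}

static struct thing *thing_create(void)
{
	struct thing *t = kzalloc(sizeof(*t), GFP_KERNEL);

	if (t)
		INIT_WORK(&t->cleanup_work, thing_cleanup_fn);
	return t;
}

static void thing_put_final(struct thing *t)
{
	/* last reference dropped, possibly in atomic context */
	schedule_work(&t->cleanup_work);
}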
@@ -3784,9 +4015,16 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
3784 | if (err < 0) | 4015 | if (err < 0) |
3785 | goto err_remove; | 4016 | goto err_remove; |
3786 | 4017 | ||
4018 | /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ | ||
4019 | for_each_subsys(root, ss) | ||
4020 | if (!ss->__DEPRECATED_clear_css_refs) | ||
4021 | dget(dentry); | ||
4022 | |||
3787 | /* The cgroup directory was pre-locked for us */ | 4023 | /* The cgroup directory was pre-locked for us */ |
3788 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); | 4024 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); |
3789 | 4025 | ||
4026 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
4027 | |||
3790 | err = cgroup_populate_dir(cgrp); | 4028 | err = cgroup_populate_dir(cgrp); |
3791 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 4029 | /* If err < 0, we have a half-filled directory - oh well ;) */ |
3792 | 4030 | ||
@@ -3826,18 +4064,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
3826 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); | 4064 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); |
3827 | } | 4065 | } |
3828 | 4066 | ||
4067 | /* | ||
4068 | * Check the reference count on each subsystem. Since we already | ||
4069 | * established that there are no tasks in the cgroup, if the css refcount | ||
4070 | * is also 1, then there should be no outstanding references, so the | ||
4071 | * subsystem is safe to destroy. We scan across all subsystems rather than | ||
4072 | * using the per-hierarchy linked list of mounted subsystems since we can | ||
4073 | * be called via check_for_release() with no synchronization other than | ||
4074 | * RCU, and the subsystem linked list isn't RCU-safe. | ||
4075 | */ | ||
3829 | static int cgroup_has_css_refs(struct cgroup *cgrp) | 4076 | static int cgroup_has_css_refs(struct cgroup *cgrp) |
3830 | { | 4077 | { |
3831 | /* Check the reference count on each subsystem. Since we | ||
3832 | * already established that there are no tasks in the | ||
3833 | * cgroup, if the css refcount is also 1, then there should | ||
3834 | * be no outstanding references, so the subsystem is safe to | ||
3835 | * destroy. We scan across all subsystems rather than using | ||
3836 | * the per-hierarchy linked list of mounted subsystems since | ||
3837 | * we can be called via check_for_release() with no | ||
3838 | * synchronization other than RCU, and the subsystem linked | ||
3839 | * list isn't RCU-safe */ | ||
3840 | int i; | 4078 | int i; |
4079 | |||
3841 | /* | 4080 | /* |
3842 | * We won't need to lock the subsys array, because the subsystems | 4081 | * We won't need to lock the subsys array, because the subsystems |
3843 | * we're concerned about aren't going anywhere since our cgroup root | 4082 | * we're concerned about aren't going anywhere since our cgroup root |
@@ -3846,17 +4085,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) | |||
3846 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 4085 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
3847 | struct cgroup_subsys *ss = subsys[i]; | 4086 | struct cgroup_subsys *ss = subsys[i]; |
3848 | struct cgroup_subsys_state *css; | 4087 | struct cgroup_subsys_state *css; |
4088 | |||
3849 | /* Skip subsystems not present or not in this hierarchy */ | 4089 | /* Skip subsystems not present or not in this hierarchy */ |
3850 | if (ss == NULL || ss->root != cgrp->root) | 4090 | if (ss == NULL || ss->root != cgrp->root) |
3851 | continue; | 4091 | continue; |
4092 | |||
3852 | css = cgrp->subsys[ss->subsys_id]; | 4093 | css = cgrp->subsys[ss->subsys_id]; |
3853 | /* When called from check_for_release() it's possible | 4094 | /* |
4095 | * When called from check_for_release() it's possible | ||
3854 | * that by this point the cgroup has been removed | 4096 | * that by this point the cgroup has been removed |
3855 | * and the css deleted. But a false-positive doesn't | 4097 | * and the css deleted. But a false-positive doesn't |
3856 | * matter, since it can only happen if the cgroup | 4098 | * matter, since it can only happen if the cgroup |
3857 | * has been deleted and hence no longer needs the | 4099 | * has been deleted and hence no longer needs the |
3858 | * release agent to be called anyway. */ | 4100 | * release agent to be called anyway. |
3859 | if (css && (atomic_read(&css->refcnt) > 1)) | 4101 | */ |
4102 | if (css && css_refcnt(css) > 1) | ||
3860 | return 1; | 4103 | return 1; |
3861 | } | 4104 | } |
3862 | return 0; | 4105 | return 0; |
@@ -3866,51 +4109,63 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) | |||
3866 | * Atomically mark all (or else none) of the cgroup's CSS objects as | 4109 | * Atomically mark all (or else none) of the cgroup's CSS objects as |
3867 | * CSS_REMOVED. Return true on success, or false if the cgroup has | 4110 | * CSS_REMOVED. Return true on success, or false if the cgroup has |
3868 | * busy subsystems. Call with cgroup_mutex held | 4111 | * busy subsystems. Call with cgroup_mutex held |
4112 | * | ||
4113 | * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or | ||
4114 | * not, cgroup removal behaves differently. | ||
4115 | * | ||
4116 | * If clear is set, css refcnt for the subsystem should be zero before | ||
4117 | * cgroup removal can be committed. This is implemented by | ||
4118 | * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be | ||
4119 | * called multiple times until all css refcnts reach zero and is allowed to | ||
4120 | * veto removal on any invocation. This behavior is deprecated and will be | ||
4121 | * removed as soon as the existing user (memcg) is updated. | ||
4122 | * | ||
4123 | * If clear is not set, each css holds an extra reference to the cgroup's | ||
4124 | * dentry and cgroup removal proceeds regardless of css refs. | ||
4125 | * ->pre_destroy() will be called at least once and is not allowed to fail. | ||
4126 | * On the last put of each css, whenever that may be, the extra dentry ref | ||
4127 | * is put so that dentry destruction happens only after all css's are | ||
4128 | * released. | ||
3869 | */ | 4129 | */ |
3870 | |||
3871 | static int cgroup_clear_css_refs(struct cgroup *cgrp) | 4130 | static int cgroup_clear_css_refs(struct cgroup *cgrp) |
3872 | { | 4131 | { |
3873 | struct cgroup_subsys *ss; | 4132 | struct cgroup_subsys *ss; |
3874 | unsigned long flags; | 4133 | unsigned long flags; |
3875 | bool failed = false; | 4134 | bool failed = false; |
4135 | |||
3876 | local_irq_save(flags); | 4136 | local_irq_save(flags); |
4137 | |||
4138 | /* | ||
4139 | * Block new css_tryget() by deactivating refcnt. If all refcnts | ||
4140 | * for subsystems w/ clear_css_refs set were 1 at the moment of | ||
4141 | * deactivation, we succeeded. | ||
4142 | */ | ||
3877 | for_each_subsys(cgrp->root, ss) { | 4143 | for_each_subsys(cgrp->root, ss) { |
3878 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4144 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
3879 | int refcnt; | 4145 | |
3880 | while (1) { | 4146 | WARN_ON(atomic_read(&css->refcnt) < 0); |
3881 | /* We can only remove a CSS with a refcnt==1 */ | 4147 | atomic_add(CSS_DEACT_BIAS, &css->refcnt); |
3882 | refcnt = atomic_read(&css->refcnt); | 4148 | |
3883 | if (refcnt > 1) { | 4149 | if (ss->__DEPRECATED_clear_css_refs) |
3884 | failed = true; | 4150 | failed |= css_refcnt(css) != 1; |
3885 | goto done; | ||
3886 | } | ||
3887 | BUG_ON(!refcnt); | ||
3888 | /* | ||
3889 | * Drop the refcnt to 0 while we check other | ||
3890 | * subsystems. This will cause any racing | ||
3891 | * css_tryget() to spin until we set the | ||
3892 | * CSS_REMOVED bits or abort | ||
3893 | */ | ||
3894 | if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt) | ||
3895 | break; | ||
3896 | cpu_relax(); | ||
3897 | } | ||
3898 | } | 4151 | } |
3899 | done: | 4152 | |
4153 | /* | ||
4154 | * If succeeded, set REMOVED and put all the base refs; otherwise, | ||
4155 | * restore refcnts to positive values. Either way, all in-progress | ||
4156 | * css_tryget() will be released. | ||
4157 | */ | ||
3900 | for_each_subsys(cgrp->root, ss) { | 4158 | for_each_subsys(cgrp->root, ss) { |
3901 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4159 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
3902 | if (failed) { | 4160 | |
3903 | /* | 4161 | if (!failed) { |
3904 | * Restore old refcnt if we previously managed | ||
3905 | * to clear it from 1 to 0 | ||
3906 | */ | ||
3907 | if (!atomic_read(&css->refcnt)) | ||
3908 | atomic_set(&css->refcnt, 1); | ||
3909 | } else { | ||
3910 | /* Commit the fact that the CSS is removed */ | ||
3911 | set_bit(CSS_REMOVED, &css->flags); | 4162 | set_bit(CSS_REMOVED, &css->flags); |
4163 | css_put(css); | ||
4164 | } else { | ||
4165 | atomic_sub(CSS_DEACT_BIAS, &css->refcnt); | ||
3912 | } | 4166 | } |
3913 | } | 4167 | } |
4168 | |||
3914 | local_irq_restore(flags); | 4169 | local_irq_restore(flags); |
3915 | return !failed; | 4170 | return !failed; |
3916 | } | 4171 | } |
@@ -3995,6 +4250,8 @@ again: | |||
3995 | list_del_init(&cgrp->sibling); | 4250 | list_del_init(&cgrp->sibling); |
3996 | cgroup_unlock_hierarchy(cgrp->root); | 4251 | cgroup_unlock_hierarchy(cgrp->root); |
3997 | 4252 | ||
4253 | list_del_init(&cgrp->allcg_node); | ||
4254 | |||
3998 | d = dget(cgrp->dentry); | 4255 | d = dget(cgrp->dentry); |
3999 | 4256 | ||
4000 | cgroup_d_remove_dir(d); | 4257 | cgroup_d_remove_dir(d); |
@@ -4021,12 +4278,29 @@ again: | |||
4021 | return 0; | 4278 | return 0; |
4022 | } | 4279 | } |
4023 | 4280 | ||
4281 | static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) | ||
4282 | { | ||
4283 | INIT_LIST_HEAD(&ss->cftsets); | ||
4284 | |||
4285 | /* | ||
4286 | * base_cftset is embedded in subsys itself, no need to worry about | ||
4287 | * deregistration. | ||
4288 | */ | ||
4289 | if (ss->base_cftypes) { | ||
4290 | ss->base_cftset.cfts = ss->base_cftypes; | ||
4291 | list_add_tail(&ss->base_cftset.node, &ss->cftsets); | ||
4292 | } | ||
4293 | } | ||
4294 | |||
4024 | static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | 4295 | static void __init cgroup_init_subsys(struct cgroup_subsys *ss) |
4025 | { | 4296 | { |
4026 | struct cgroup_subsys_state *css; | 4297 | struct cgroup_subsys_state *css; |
4027 | 4298 | ||
4028 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); | 4299 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); |
4029 | 4300 | ||
4301 | /* init base cftset */ | ||
4302 | cgroup_init_cftsets(ss); | ||
4303 | |||
4030 | /* Create the top cgroup state for this subsystem */ | 4304 | /* Create the top cgroup state for this subsystem */ |
4031 | list_add(&ss->sibling, &rootnode.subsys_list); | 4305 | list_add(&ss->sibling, &rootnode.subsys_list); |
4032 | ss->root = &rootnode; | 4306 | ss->root = &rootnode; |
@@ -4096,6 +4370,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4096 | return 0; | 4370 | return 0; |
4097 | } | 4371 | } |
4098 | 4372 | ||
4373 | /* init base cftset */ | ||
4374 | cgroup_init_cftsets(ss); | ||
4375 | |||
4099 | /* | 4376 | /* |
4100 | * need to register a subsys id before anything else - for example, | 4377 | * need to register a subsys id before anything else - for example, |
4101 | * init_cgroup_css needs it. | 4378 | * init_cgroup_css needs it. |
@@ -4685,21 +4962,43 @@ static void check_for_release(struct cgroup *cgrp) | |||
4685 | } | 4962 | } |
4686 | 4963 | ||
4687 | /* Caller must verify that the css is not for root cgroup */ | 4964 | /* Caller must verify that the css is not for root cgroup */ |
4688 | void __css_put(struct cgroup_subsys_state *css, int count) | 4965 | bool __css_tryget(struct cgroup_subsys_state *css) |
4966 | { | ||
4967 | do { | ||
4968 | int v = css_refcnt(css); | ||
4969 | |||
4970 | if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) | ||
4971 | return true; | ||
4972 | cpu_relax(); | ||
4973 | } while (!test_bit(CSS_REMOVED, &css->flags)); | ||
4974 | |||
4975 | return false; | ||
4976 | } | ||
4977 | EXPORT_SYMBOL_GPL(__css_tryget); | ||
4978 | |||
4979 | /* Caller must verify that the css is not for root cgroup */ | ||
4980 | void __css_put(struct cgroup_subsys_state *css) | ||
4689 | { | 4981 | { |
4690 | struct cgroup *cgrp = css->cgroup; | 4982 | struct cgroup *cgrp = css->cgroup; |
4691 | int val; | 4983 | int v; |
4984 | |||
4692 | rcu_read_lock(); | 4985 | rcu_read_lock(); |
4693 | val = atomic_sub_return(count, &css->refcnt); | 4986 | v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); |
4694 | if (val == 1) { | 4987 | |
4988 | switch (v) { | ||
4989 | case 1: | ||
4695 | if (notify_on_release(cgrp)) { | 4990 | if (notify_on_release(cgrp)) { |
4696 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 4991 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
4697 | check_for_release(cgrp); | 4992 | check_for_release(cgrp); |
4698 | } | 4993 | } |
4699 | cgroup_wakeup_rmdir_waiter(cgrp); | 4994 | cgroup_wakeup_rmdir_waiter(cgrp); |
4995 | break; | ||
4996 | case 0: | ||
4997 | if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) | ||
4998 | schedule_work(&css->dput_work); | ||
4999 | break; | ||
4700 | } | 5000 | } |
4701 | rcu_read_unlock(); | 5001 | rcu_read_unlock(); |
4702 | WARN_ON_ONCE(val < 1); | ||
4703 | } | 5002 | } |
4704 | EXPORT_SYMBOL_GPL(__css_put); | 5003 | EXPORT_SYMBOL_GPL(__css_put); |
4705 | 5004 | ||
@@ -4818,7 +5117,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) | |||
4818 | * on this or this is under rcu_read_lock(). Once css->id is allocated, | 5117 | * on this or this is under rcu_read_lock(). Once css->id is allocated, |
4819 | * it's unchanged until freed. | 5118 | * it's unchanged until freed. |
4820 | */ | 5119 | */ |
4821 | cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); | 5120 | cssid = rcu_dereference_check(css->id, css_refcnt(css)); |
4822 | 5121 | ||
4823 | if (cssid) | 5122 | if (cssid) |
4824 | return cssid->id; | 5123 | return cssid->id; |
@@ -4830,7 +5129,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css) | |||
4830 | { | 5129 | { |
4831 | struct css_id *cssid; | 5130 | struct css_id *cssid; |
4832 | 5131 | ||
4833 | cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); | 5132 | cssid = rcu_dereference_check(css->id, css_refcnt(css)); |
4834 | 5133 | ||
4835 | if (cssid) | 5134 | if (cssid) |
4836 | return cssid->depth; | 5135 | return cssid->depth; |
@@ -4844,7 +5143,7 @@ EXPORT_SYMBOL_GPL(css_depth); | |||
4844 | * @root: the css supposed to be an ancestor of the child. | 5143 | * @root: the css supposed to be an ancestor of the child. |
4845 | * | 5144 | * |
4846 | * Returns true if "root" is an ancestor of "child" in its hierarchy. Because | 5145 | * Returns true if "root" is an ancestor of "child" in its hierarchy. Because |
4847 | * this function reads css->id, this use rcu_dereference() and rcu_read_lock(). | 5146 | * this function reads css->id, the caller must hold rcu_read_lock(). |
4848 | * But, considering usual usage, the csses should be valid objects after test. | 5147 | * But, considering usual usage, the csses should be valid objects after test. |
4849 | * Assuming that the caller will do some action to the child if this returns | 5148 | * Assuming that the caller will do some action to the child if this returns |
4850 | * true, the caller must take "child"'s reference count. | 5149 | * true, the caller must take "child"'s reference count. |
@@ -4856,18 +5155,18 @@ bool css_is_ancestor(struct cgroup_subsys_state *child, | |||
4856 | { | 5155 | { |
4857 | struct css_id *child_id; | 5156 | struct css_id *child_id; |
4858 | struct css_id *root_id; | 5157 | struct css_id *root_id; |
4859 | bool ret = true; | ||
4860 | 5158 | ||
4861 | rcu_read_lock(); | ||
4862 | child_id = rcu_dereference(child->id); | 5159 | child_id = rcu_dereference(child->id); |
5160 | if (!child_id) | ||
5161 | return false; | ||
4863 | root_id = rcu_dereference(root->id); | 5162 | root_id = rcu_dereference(root->id); |
4864 | if (!child_id | 5163 | if (!root_id) |
4865 | || !root_id | 5164 | return false; |
4866 | || (child_id->depth < root_id->depth) | 5165 | if (child_id->depth < root_id->depth) |
4867 | || (child_id->stack[root_id->depth] != root_id->id)) | 5166 | return false; |
4868 | ret = false; | 5167 | if (child_id->stack[root_id->depth] != root_id->id) |
4869 | rcu_read_unlock(); | 5168 | return false; |
4870 | return ret; | 5169 | return true; |
4871 | } | 5170 | } |
4872 | 5171 | ||
4873 | void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) | 5172 | void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) |
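Note the behavioural change documented above: css_is_ancestor() no longer takes rcu_read_lock() itself, so callers must hold it around the call. A minimal illustrative call site:

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

static bool css_under(struct cgroup_subsys_state *child,
		      struct cgroup_subsys_state *root)
{
	bool ret;

	rcu_read_lock();
	ret = css_is_ancestor(child, root);
	rcu_read_unlock();

	return ret;
}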
@@ -5211,19 +5510,15 @@ static struct cftype debug_files[] = { | |||
5211 | .name = "releasable", | 5510 | .name = "releasable", |
5212 | .read_u64 = releasable_read, | 5511 | .read_u64 = releasable_read, |
5213 | }, | 5512 | }, |
5214 | }; | ||
5215 | 5513 | ||
5216 | static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 5514 | { } /* terminate */ |
5217 | { | 5515 | }; |
5218 | return cgroup_add_files(cont, ss, debug_files, | ||
5219 | ARRAY_SIZE(debug_files)); | ||
5220 | } | ||
5221 | 5516 | ||
5222 | struct cgroup_subsys debug_subsys = { | 5517 | struct cgroup_subsys debug_subsys = { |
5223 | .name = "debug", | 5518 | .name = "debug", |
5224 | .create = debug_create, | 5519 | .create = debug_create, |
5225 | .destroy = debug_destroy, | 5520 | .destroy = debug_destroy, |
5226 | .populate = debug_populate, | ||
5227 | .subsys_id = debug_subsys_id, | 5521 | .subsys_id = debug_subsys_id, |
5522 | .base_cftypes = debug_files, | ||
5228 | }; | 5523 | }; |
5229 | #endif /* CONFIG_CGROUP_DEBUG */ | 5524 | #endif /* CONFIG_CGROUP_DEBUG */ |
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index f86e93920b62..3649fc6b3eaa 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -358,24 +358,19 @@ static int freezer_write(struct cgroup *cgroup, | |||
358 | static struct cftype files[] = { | 358 | static struct cftype files[] = { |
359 | { | 359 | { |
360 | .name = "state", | 360 | .name = "state", |
361 | .flags = CFTYPE_NOT_ON_ROOT, | ||
361 | .read_seq_string = freezer_read, | 362 | .read_seq_string = freezer_read, |
362 | .write_string = freezer_write, | 363 | .write_string = freezer_write, |
363 | }, | 364 | }, |
365 | { } /* terminate */ | ||
364 | }; | 366 | }; |
365 | 367 | ||
366 | static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) | ||
367 | { | ||
368 | if (!cgroup->parent) | ||
369 | return 0; | ||
370 | return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); | ||
371 | } | ||
372 | |||
373 | struct cgroup_subsys freezer_subsys = { | 368 | struct cgroup_subsys freezer_subsys = { |
374 | .name = "freezer", | 369 | .name = "freezer", |
375 | .create = freezer_create, | 370 | .create = freezer_create, |
376 | .destroy = freezer_destroy, | 371 | .destroy = freezer_destroy, |
377 | .populate = freezer_populate, | ||
378 | .subsys_id = freezer_subsys_id, | 372 | .subsys_id = freezer_subsys_id, |
379 | .can_attach = freezer_can_attach, | 373 | .can_attach = freezer_can_attach, |
380 | .fork = freezer_fork, | 374 | .fork = freezer_fork, |
375 | .base_cftypes = files, | ||
381 | }; | 376 | }; |
diff --git a/kernel/compat.c b/kernel/compat.c
index 74ff8498809a..c28a306ae05c 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -372,25 +372,54 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) | |||
372 | 372 | ||
373 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK | 373 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK |
374 | 374 | ||
375 | asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, | 375 | /* |
376 | compat_old_sigset_t __user *oset) | 376 | * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the |
377 | * blocked set of signals to the supplied signal set | ||
378 | */ | ||
379 | static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set) | ||
377 | { | 380 | { |
378 | old_sigset_t s; | 381 | memcpy(blocked->sig, &set, sizeof(set)); |
379 | long ret; | 382 | } |
380 | mm_segment_t old_fs; | ||
381 | 383 | ||
382 | if (set && get_user(s, set)) | 384 | asmlinkage long compat_sys_sigprocmask(int how, |
383 | return -EFAULT; | 385 | compat_old_sigset_t __user *nset, |
384 | old_fs = get_fs(); | 386 | compat_old_sigset_t __user *oset) |
385 | set_fs(KERNEL_DS); | 387 | { |
386 | ret = sys_sigprocmask(how, | 388 | old_sigset_t old_set, new_set; |
387 | set ? (old_sigset_t __user *) &s : NULL, | 389 | sigset_t new_blocked; |
388 | oset ? (old_sigset_t __user *) &s : NULL); | 390 | |
389 | set_fs(old_fs); | 391 | old_set = current->blocked.sig[0]; |
390 | if (ret == 0) | 392 | |
391 | if (oset) | 393 | if (nset) { |
392 | ret = put_user(s, oset); | 394 | if (get_user(new_set, nset)) |
393 | return ret; | 395 | return -EFAULT; |
396 | new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); | ||
397 | |||
398 | new_blocked = current->blocked; | ||
399 | |||
400 | switch (how) { | ||
401 | case SIG_BLOCK: | ||
402 | sigaddsetmask(&new_blocked, new_set); | ||
403 | break; | ||
404 | case SIG_UNBLOCK: | ||
405 | sigdelsetmask(&new_blocked, new_set); | ||
406 | break; | ||
407 | case SIG_SETMASK: | ||
408 | compat_sig_setmask(&new_blocked, new_set); | ||
409 | break; | ||
410 | default: | ||
411 | return -EINVAL; | ||
412 | } | ||
413 | |||
414 | set_current_blocked(&new_blocked); | ||
415 | } | ||
416 | |||
417 | if (oset) { | ||
418 | if (put_user(old_set, oset)) | ||
419 | return -EFAULT; | ||
420 | } | ||
421 | |||
422 | return 0; | ||
394 | } | 423 | } |
395 | 424 | ||
396 | #endif | 425 | #endif |
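The rewritten compat_sys_sigprocmask() follows the usual sigprocmask semantics: SIG_BLOCK ORs bits into the blocked set, SIG_UNBLOCK clears them, and SIG_SETMASK replaces the first (compat) word outright. A runnable userspace illustration of the same three modes:

#include <signal.h>
#include <stdio.h>

int main(void)
{
	sigset_t set, old;

	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);

	sigprocmask(SIG_BLOCK, &set, &old);	/* OR SIGUSR1 into the mask */
	sigprocmask(SIG_UNBLOCK, &set, NULL);	/* clear SIGUSR1 again */
	sigprocmask(SIG_SETMASK, &old, NULL);	/* replace with the saved mask */

	printf("SIGUSR1 was blocked before: %d\n", sigismember(&old, SIGUSR1));
	return 0;
}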
@@ -1044,15 +1073,7 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat | |||
1044 | if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) | 1073 | if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) |
1045 | return -EFAULT; | 1074 | return -EFAULT; |
1046 | sigset_from_compat(&newset, &newset32); | 1075 | sigset_from_compat(&newset, &newset32); |
1047 | sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); | 1076 | return sigsuspend(&newset); |
1048 | |||
1049 | current->saved_sigmask = current->blocked; | ||
1050 | set_current_blocked(&newset); | ||
1051 | |||
1052 | current->state = TASK_INTERRUPTIBLE; | ||
1053 | schedule(); | ||
1054 | set_restore_sigmask(); | ||
1055 | return -ERESTARTNOHAND; | ||
1056 | } | 1077 | } |
1057 | #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ | 1078 | #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ |
1058 | 1079 | ||
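The compat rt_sigsuspend path now calls the shared sigsuspend() helper instead of open-coding the save/replace/sleep/restore sequence visible in the removed lines. For reference, the classic userspace shape of the same operation (illustrative, not kernel code):

#include <signal.h>

static void handler(int sig) { (void)sig; }

static void wait_for_sigusr1(void)
{
	sigset_t block, orig;

	signal(SIGUSR1, handler);		/* a handler so sigsuspend() returns */

	sigemptyset(&block);
	sigaddset(&block, SIGUSR1);
	sigprocmask(SIG_BLOCK, &block, &orig);	/* block it while deciding to wait */

	/* atomically switch to the pre-block mask and sleep for a signal;
	 * on return the blocked mask is put back */
	sigsuspend(&orig);

	sigprocmask(SIG_SETMASK, &orig, NULL);	/* undo the initial SIG_BLOCK */
}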
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2060c6e57027..a4eb5227a19e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -10,13 +10,18 @@ | |||
10 | #include <linux/sched.h> | 10 | #include <linux/sched.h> |
11 | #include <linux/unistd.h> | 11 | #include <linux/unistd.h> |
12 | #include <linux/cpu.h> | 12 | #include <linux/cpu.h> |
13 | #include <linux/oom.h> | ||
14 | #include <linux/rcupdate.h> | ||
13 | #include <linux/export.h> | 15 | #include <linux/export.h> |
16 | #include <linux/bug.h> | ||
14 | #include <linux/kthread.h> | 17 | #include <linux/kthread.h> |
15 | #include <linux/stop_machine.h> | 18 | #include <linux/stop_machine.h> |
16 | #include <linux/mutex.h> | 19 | #include <linux/mutex.h> |
17 | #include <linux/gfp.h> | 20 | #include <linux/gfp.h> |
18 | #include <linux/suspend.h> | 21 | #include <linux/suspend.h> |
19 | 22 | ||
23 | #include "smpboot.h" | ||
24 | |||
20 | #ifdef CONFIG_SMP | 25 | #ifdef CONFIG_SMP |
21 | /* Serializes the updates to cpu_online_mask, cpu_present_mask */ | 26 | /* Serializes the updates to cpu_online_mask, cpu_present_mask */ |
22 | static DEFINE_MUTEX(cpu_add_remove_lock); | 27 | static DEFINE_MUTEX(cpu_add_remove_lock); |
@@ -171,6 +176,47 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) | |||
171 | } | 176 | } |
172 | EXPORT_SYMBOL(unregister_cpu_notifier); | 177 | EXPORT_SYMBOL(unregister_cpu_notifier); |
173 | 178 | ||
179 | /** | ||
180 | * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU | ||
181 | * @cpu: a CPU id | ||
182 | * | ||
183 | * This function walks all processes, finds a valid mm struct for each one and | ||
184 | * then clears a corresponding bit in mm's cpumask. While this all sounds | ||
185 | * trivial, there are various non-obvious corner cases, which this function | ||
186 | * tries to solve in a safe manner. | ||
187 | * | ||
188 | * Also note that the function uses a somewhat relaxed locking scheme, so it may | ||
189 | * be called only for an already offlined CPU. | ||
190 | */ | ||
191 | void clear_tasks_mm_cpumask(int cpu) | ||
192 | { | ||
193 | struct task_struct *p; | ||
194 | |||
195 | /* | ||
196 | * This function is called after the cpu is taken down and marked | ||
197 | * offline, so it's not like new tasks will ever get this cpu set in | ||
198 | * their mm mask. -- Peter Zijlstra | ||
199 | * Thus, we may use rcu_read_lock() here, instead of grabbing | ||
200 | * full-fledged tasklist_lock. | ||
201 | */ | ||
202 | WARN_ON(cpu_online(cpu)); | ||
203 | rcu_read_lock(); | ||
204 | for_each_process(p) { | ||
205 | struct task_struct *t; | ||
206 | |||
207 | /* | ||
208 | * Main thread might exit, but other threads may still have | ||
209 | * a valid mm. Find one. | ||
210 | */ | ||
211 | t = find_lock_task_mm(p); | ||
212 | if (!t) | ||
213 | continue; | ||
214 | cpumask_clear_cpu(cpu, mm_cpumask(t->mm)); | ||
215 | task_unlock(t); | ||
216 | } | ||
217 | rcu_read_unlock(); | ||
218 | } | ||
219 | |||
174 | static inline void check_for_tasks(int cpu) | 220 | static inline void check_for_tasks(int cpu) |
175 | { | 221 | { |
176 | struct task_struct *p; | 222 | struct task_struct *p; |
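clear_tasks_mm_cpumask() is intended to replace open-coded tasklist walks in architecture CPU-offline paths. A sketch of how such a path might use it once the CPU is already out of cpu_online_mask (the function below and its other steps are illustrative, not taken from any particular arch):

#include <linux/cpu.h>

static void example_cpu_die(unsigned int cpu)
{
	/* the CPU is already marked offline when this runs */
	clear_tasks_mm_cpumask(cpu);

	/* ... arch-specific teardown: flush caches, park the core, ... */
}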
@@ -295,11 +341,19 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
295 | int ret, nr_calls = 0; | 341 | int ret, nr_calls = 0; |
296 | void *hcpu = (void *)(long)cpu; | 342 | void *hcpu = (void *)(long)cpu; |
297 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; | 343 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; |
344 | struct task_struct *idle; | ||
298 | 345 | ||
299 | if (cpu_online(cpu) || !cpu_present(cpu)) | 346 | if (cpu_online(cpu) || !cpu_present(cpu)) |
300 | return -EINVAL; | 347 | return -EINVAL; |
301 | 348 | ||
302 | cpu_hotplug_begin(); | 349 | cpu_hotplug_begin(); |
350 | |||
351 | idle = idle_thread_get(cpu); | ||
352 | if (IS_ERR(idle)) { | ||
353 | ret = PTR_ERR(idle); | ||
354 | goto out; | ||
355 | } | ||
356 | |||
303 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); | 357 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); |
304 | if (ret) { | 358 | if (ret) { |
305 | nr_calls--; | 359 | nr_calls--; |
@@ -309,7 +363,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
309 | } | 363 | } |
310 | 364 | ||
311 | /* Arch-specific enabling code. */ | 365 | /* Arch-specific enabling code. */ |
312 | ret = __cpu_up(cpu); | 366 | ret = __cpu_up(cpu, idle); |
313 | if (ret != 0) | 367 | if (ret != 0) |
314 | goto out_notify; | 368 | goto out_notify; |
315 | BUG_ON(!cpu_online(cpu)); | 369 | BUG_ON(!cpu_online(cpu)); |
@@ -320,6 +374,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
320 | out_notify: | 374 | out_notify: |
321 | if (ret != 0) | 375 | if (ret != 0) |
322 | __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); | 376 | __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); |
377 | out: | ||
323 | cpu_hotplug_done(); | 378 | cpu_hotplug_done(); |
324 | 379 | ||
325 | return ret; | 380 | return ret; |
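
The cpu.c changes above do two things: clear_tasks_mm_cpumask() gives architectures a common helper for dropping a dead CPU from every task's mm_cpumask, and _cpu_up() now fetches a pre-created idle task via idle_thread_get() (backed by the new kernel/smpboot.c) and hands it to the architecture through the widened __cpu_up(cpu, idle) hook. As a rough illustration of where the new helper is meant to slot in, a hypothetical arch teardown path might look like the sketch below; the function name and surrounding steps are assumptions, not part of this patch.

/* Hypothetical arch CPU-teardown path, for illustration only. */
static void example_arch_cpu_teardown(unsigned int cpu)
{
	/* The helper's relaxed locking is only safe once the CPU is offline. */
	BUG_ON(cpu_online(cpu));

	/* Replaces the per-arch loop that used to clear mm_cpumask bits. */
	clear_tasks_mm_cpumask(cpu);

	/* ... remaining arch-specific teardown ... */
}
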
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 249152e15308..9656a3c36503 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c | |||
@@ -81,7 +81,7 @@ int cpu_pm_unregister_notifier(struct notifier_block *nb) | |||
81 | EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); | 81 | EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); |
82 | 82 | ||
83 | /** | 83 | /** |
84 | * cpm_pm_enter - CPU low power entry notifier | 84 | * cpu_pm_enter - CPU low power entry notifier |
85 | * | 85 | * |
86 | * Notifies listeners that a single CPU is entering a low power state that may | 86 | * Notifies listeners that a single CPU is entering a low power state that may |
87 | * cause some blocks in the same power domain as the cpu to reset. | 87 | * cause some blocks in the same power domain as the cpu to reset. |
@@ -89,7 +89,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); | |||
89 | * Must be called on the affected CPU with interrupts disabled. Platform is | 89 | * Must be called on the affected CPU with interrupts disabled. Platform is |
90 | * responsible for ensuring that cpu_pm_enter is not called twice on the same | 90 | * responsible for ensuring that cpu_pm_enter is not called twice on the same |
91 | * CPU before cpu_pm_exit is called. Notified drivers can include VFP | 91 | * CPU before cpu_pm_exit is called. Notified drivers can include VFP |
92 | * co-processor, interrupt controller and it's PM extensions, local CPU | 92 | * co-processor, interrupt controller and its PM extensions, local CPU |
93 | * timers context save/restore which shouldn't be interrupted. Hence it | 93 | * timers context save/restore which shouldn't be interrupted. Hence it |
94 | * must be called with interrupts disabled. | 94 | * must be called with interrupts disabled. |
95 | * | 95 | * |
@@ -115,13 +115,13 @@ int cpu_pm_enter(void) | |||
115 | EXPORT_SYMBOL_GPL(cpu_pm_enter); | 115 | EXPORT_SYMBOL_GPL(cpu_pm_enter); |
116 | 116 | ||
117 | /** | 117 | /** |
118 | * cpm_pm_exit - CPU low power exit notifier | 118 | * cpu_pm_exit - CPU low power exit notifier |
119 | * | 119 | * |
120 | * Notifies listeners that a single CPU is exiting a low power state that may | 120 | * Notifies listeners that a single CPU is exiting a low power state that may |
121 | * have caused some blocks in the same power domain as the cpu to reset. | 121 | * have caused some blocks in the same power domain as the cpu to reset. |
122 | * | 122 | * |
123 | * Notified drivers can include VFP co-processor, interrupt controller | 123 | * Notified drivers can include VFP co-processor, interrupt controller |
124 | * and it's PM extensions, local CPU timers context save/restore which | 124 | * and its PM extensions, local CPU timers context save/restore which |
125 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. | 125 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. |
126 | * | 126 | * |
127 | * Return conditions are same as __raw_notifier_call_chain. | 127 | * Return conditions are same as __raw_notifier_call_chain. |
@@ -139,7 +139,7 @@ int cpu_pm_exit(void) | |||
139 | EXPORT_SYMBOL_GPL(cpu_pm_exit); | 139 | EXPORT_SYMBOL_GPL(cpu_pm_exit); |
140 | 140 | ||
141 | /** | 141 | /** |
142 | * cpm_cluster_pm_enter - CPU cluster low power entry notifier | 142 | * cpu_cluster_pm_enter - CPU cluster low power entry notifier |
143 | * | 143 | * |
144 | * Notifies listeners that all cpus in a power domain are entering a low power | 144 | * Notifies listeners that all cpus in a power domain are entering a low power |
145 | * state that may cause some blocks in the same power domain to reset. | 145 | * state that may cause some blocks in the same power domain to reset. |
@@ -147,7 +147,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit); | |||
147 | * Must be called after cpu_pm_enter has been called on all cpus in the power | 147 | * Must be called after cpu_pm_enter has been called on all cpus in the power |
148 | * domain, and before cpu_pm_exit has been called on any cpu in the power | 148 | * domain, and before cpu_pm_exit has been called on any cpu in the power |
149 | * domain. Notified drivers can include VFP co-processor, interrupt controller | 149 | * domain. Notified drivers can include VFP co-processor, interrupt controller |
150 | * and it's PM extensions, local CPU timers context save/restore which | 150 | * and its PM extensions, local CPU timers context save/restore which |
151 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. | 151 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. |
152 | * | 152 | * |
153 | * Must be called with interrupts disabled. | 153 | * Must be called with interrupts disabled. |
@@ -174,7 +174,7 @@ int cpu_cluster_pm_enter(void) | |||
174 | EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); | 174 | EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); |
175 | 175 | ||
176 | /** | 176 | /** |
177 | * cpm_cluster_pm_exit - CPU cluster low power exit notifier | 177 | * cpu_cluster_pm_exit - CPU cluster low power exit notifier |
178 | * | 178 | * |
179 | * Notifies listeners that all cpus in a power domain are exiting from a | 179 | * Notifies listeners that all cpus in a power domain are exiting from a |
180 | * low power state that may have caused some blocks in the same power domain | 180 | * low power state that may have caused some blocks in the same power domain |
@@ -183,7 +183,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); | |||
183 | * Must be called after cpu_pm_exit has been called on all cpus in the power | 183 | * Must be called after cpu_pm_exit has been called on all cpus in the power |
184 | * domain, and before cpu_pm_exit has been called on any cpu in the power | 184 | * domain, and before cpu_pm_exit has been called on any cpu in the power |
185 | * domain. Notified drivers can include VFP co-processor, interrupt controller | 185 | * domain. Notified drivers can include VFP co-processor, interrupt controller |
186 | * and it's PM extensions, local CPU timers context save/restore which | 186 | * and its PM extensions, local CPU timers context save/restore which |
187 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. | 187 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. |
188 | * | 188 | * |
189 | * Return conditions are same as __raw_notifier_call_chain. | 189 | * Return conditions are same as __raw_notifier_call_chain. |
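
The cpu_pm.c hunks are comment-only fixes (cpm_* -> cpu_* in the kernel-doc names, "it's" -> "its"). For context, a minimal consumer of these notifications looks roughly like the sketch below; it is illustrative, not taken from this patch.

#include <linux/cpu_pm.h>
#include <linux/notifier.h>

static int example_cpu_pm_notify(struct notifier_block *nb,
				 unsigned long action, void *data)
{
	switch (action) {
	case CPU_PM_ENTER:		/* save per-CPU context (VFP, irqchip, timers...) */
		break;
	case CPU_PM_ENTER_FAILED:	/* low-power entry was aborted */
	case CPU_PM_EXIT:		/* restore per-CPU context */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_cpu_pm_nb = {
	.notifier_call = example_cpu_pm_notify,
};

/* Registered from driver init with cpu_pm_register_notifier(&example_cpu_pm_nb). */
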
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 14f7070b4ba2..8c8bd652dd12 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -1765,28 +1765,17 @@ static struct cftype files[] = { | |||
1765 | .write_u64 = cpuset_write_u64, | 1765 | .write_u64 = cpuset_write_u64, |
1766 | .private = FILE_SPREAD_SLAB, | 1766 | .private = FILE_SPREAD_SLAB, |
1767 | }, | 1767 | }, |
1768 | }; | ||
1769 | |||
1770 | static struct cftype cft_memory_pressure_enabled = { | ||
1771 | .name = "memory_pressure_enabled", | ||
1772 | .read_u64 = cpuset_read_u64, | ||
1773 | .write_u64 = cpuset_write_u64, | ||
1774 | .private = FILE_MEMORY_PRESSURE_ENABLED, | ||
1775 | }; | ||
1776 | 1768 | ||
1777 | static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 1769 | { |
1778 | { | 1770 | .name = "memory_pressure_enabled", |
1779 | int err; | 1771 | .flags = CFTYPE_ONLY_ON_ROOT, |
1772 | .read_u64 = cpuset_read_u64, | ||
1773 | .write_u64 = cpuset_write_u64, | ||
1774 | .private = FILE_MEMORY_PRESSURE_ENABLED, | ||
1775 | }, | ||
1780 | 1776 | ||
1781 | err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); | 1777 | { } /* terminate */ |
1782 | if (err) | 1778 | }; |
1783 | return err; | ||
1784 | /* memory_pressure_enabled is in root cpuset only */ | ||
1785 | if (!cont->parent) | ||
1786 | err = cgroup_add_file(cont, ss, | ||
1787 | &cft_memory_pressure_enabled); | ||
1788 | return err; | ||
1789 | } | ||
1790 | 1779 | ||
1791 | /* | 1780 | /* |
1792 | * post_clone() is called during cgroup_create() when the | 1781 | * post_clone() is called during cgroup_create() when the |
@@ -1887,9 +1876,9 @@ struct cgroup_subsys cpuset_subsys = { | |||
1887 | .destroy = cpuset_destroy, | 1876 | .destroy = cpuset_destroy, |
1888 | .can_attach = cpuset_can_attach, | 1877 | .can_attach = cpuset_can_attach, |
1889 | .attach = cpuset_attach, | 1878 | .attach = cpuset_attach, |
1890 | .populate = cpuset_populate, | ||
1891 | .post_clone = cpuset_post_clone, | 1879 | .post_clone = cpuset_post_clone, |
1892 | .subsys_id = cpuset_subsys_id, | 1880 | .subsys_id = cpuset_subsys_id, |
1881 | .base_cftypes = files, | ||
1893 | .early_init = 1, | 1882 | .early_init = 1, |
1894 | }; | 1883 | }; |
1895 | 1884 | ||
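
The cpuset.c hunk switches the subsystem from a populate() callback plus a special-cased root-only file to a single sentinel-terminated cftype array hung off ->base_cftypes, with the CFTYPE_ONLY_ON_ROOT flag replacing the manual !cont->parent check. The same declarative pattern, sketched for a made-up controller (all names below are illustrative):

static u64 example_read_u64(struct cgroup *cgrp, struct cftype *cft);
static int example_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val);

static struct cftype example_files[] = {
	{
		.name = "value",
		.read_u64 = example_read_u64,
		.write_u64 = example_write_u64,
	},
	{
		.name = "root_only_knob",
		.flags = CFTYPE_ONLY_ON_ROOT,	/* created only in the root cgroup */
		.read_u64 = example_read_u64,
	},
	{ }	/* terminate */
};

struct cgroup_subsys example_subsys = {
	.name = "example",
	.base_cftypes = example_files,
	/* .create/.destroy/.attach etc. as needed */
};
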
diff --git a/kernel/cred.c b/kernel/cred.c index e70683d9ec32..de728ac50d82 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
@@ -49,6 +49,14 @@ struct cred init_cred = { | |||
49 | .subscribers = ATOMIC_INIT(2), | 49 | .subscribers = ATOMIC_INIT(2), |
50 | .magic = CRED_MAGIC, | 50 | .magic = CRED_MAGIC, |
51 | #endif | 51 | #endif |
52 | .uid = GLOBAL_ROOT_UID, | ||
53 | .gid = GLOBAL_ROOT_GID, | ||
54 | .suid = GLOBAL_ROOT_UID, | ||
55 | .sgid = GLOBAL_ROOT_GID, | ||
56 | .euid = GLOBAL_ROOT_UID, | ||
57 | .egid = GLOBAL_ROOT_GID, | ||
58 | .fsuid = GLOBAL_ROOT_UID, | ||
59 | .fsgid = GLOBAL_ROOT_GID, | ||
52 | .securebits = SECUREBITS_DEFAULT, | 60 | .securebits = SECUREBITS_DEFAULT, |
53 | .cap_inheritable = CAP_EMPTY_SET, | 61 | .cap_inheritable = CAP_EMPTY_SET, |
54 | .cap_permitted = CAP_FULL_SET, | 62 | .cap_permitted = CAP_FULL_SET, |
@@ -148,6 +156,7 @@ static void put_cred_rcu(struct rcu_head *rcu) | |||
148 | if (cred->group_info) | 156 | if (cred->group_info) |
149 | put_group_info(cred->group_info); | 157 | put_group_info(cred->group_info); |
150 | free_uid(cred->user); | 158 | free_uid(cred->user); |
159 | put_user_ns(cred->user_ns); | ||
151 | kmem_cache_free(cred_jar, cred); | 160 | kmem_cache_free(cred_jar, cred); |
152 | } | 161 | } |
153 | 162 | ||
@@ -198,13 +207,6 @@ void exit_creds(struct task_struct *tsk) | |||
198 | validate_creds(cred); | 207 | validate_creds(cred); |
199 | alter_cred_subscribers(cred, -1); | 208 | alter_cred_subscribers(cred, -1); |
200 | put_cred(cred); | 209 | put_cred(cred); |
201 | |||
202 | cred = (struct cred *) tsk->replacement_session_keyring; | ||
203 | if (cred) { | ||
204 | tsk->replacement_session_keyring = NULL; | ||
205 | validate_creds(cred); | ||
206 | put_cred(cred); | ||
207 | } | ||
208 | } | 210 | } |
209 | 211 | ||
210 | /** | 212 | /** |
@@ -303,6 +305,7 @@ struct cred *prepare_creds(void) | |||
303 | set_cred_subscribers(new, 0); | 305 | set_cred_subscribers(new, 0); |
304 | get_group_info(new->group_info); | 306 | get_group_info(new->group_info); |
305 | get_uid(new->user); | 307 | get_uid(new->user); |
308 | get_user_ns(new->user_ns); | ||
306 | 309 | ||
307 | #ifdef CONFIG_KEYS | 310 | #ifdef CONFIG_KEYS |
308 | key_get(new->thread_keyring); | 311 | key_get(new->thread_keyring); |
@@ -386,8 +389,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) | |||
386 | struct cred *new; | 389 | struct cred *new; |
387 | int ret; | 390 | int ret; |
388 | 391 | ||
389 | p->replacement_session_keyring = NULL; | ||
390 | |||
391 | if ( | 392 | if ( |
392 | #ifdef CONFIG_KEYS | 393 | #ifdef CONFIG_KEYS |
393 | !p->cred->thread_keyring && | 394 | !p->cred->thread_keyring && |
@@ -414,11 +415,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) | |||
414 | goto error_put; | 415 | goto error_put; |
415 | } | 416 | } |
416 | 417 | ||
417 | /* cache user_ns in cred. Doesn't need a refcount because it will | ||
418 | * stay pinned by cred->user | ||
419 | */ | ||
420 | new->user_ns = new->user->user_ns; | ||
421 | |||
422 | #ifdef CONFIG_KEYS | 418 | #ifdef CONFIG_KEYS |
423 | /* new threads get their own thread keyrings if their parent already | 419 | /* new threads get their own thread keyrings if their parent already |
424 | * had one */ | 420 | * had one */ |
@@ -493,10 +489,10 @@ int commit_creds(struct cred *new) | |||
493 | get_cred(new); /* we will require a ref for the subj creds too */ | 489 | get_cred(new); /* we will require a ref for the subj creds too */ |
494 | 490 | ||
495 | /* dumpability changes */ | 491 | /* dumpability changes */ |
496 | if (old->euid != new->euid || | 492 | if (!uid_eq(old->euid, new->euid) || |
497 | old->egid != new->egid || | 493 | !gid_eq(old->egid, new->egid) || |
498 | old->fsuid != new->fsuid || | 494 | !uid_eq(old->fsuid, new->fsuid) || |
499 | old->fsgid != new->fsgid || | 495 | !gid_eq(old->fsgid, new->fsgid) || |
500 | !cap_issubset(new->cap_permitted, old->cap_permitted)) { | 496 | !cap_issubset(new->cap_permitted, old->cap_permitted)) { |
501 | if (task->mm) | 497 | if (task->mm) |
502 | set_dumpable(task->mm, suid_dumpable); | 498 | set_dumpable(task->mm, suid_dumpable); |
@@ -505,9 +501,9 @@ int commit_creds(struct cred *new) | |||
505 | } | 501 | } |
506 | 502 | ||
507 | /* alter the thread keyring */ | 503 | /* alter the thread keyring */ |
508 | if (new->fsuid != old->fsuid) | 504 | if (!uid_eq(new->fsuid, old->fsuid)) |
509 | key_fsuid_changed(task); | 505 | key_fsuid_changed(task); |
510 | if (new->fsgid != old->fsgid) | 506 | if (!gid_eq(new->fsgid, old->fsgid)) |
511 | key_fsgid_changed(task); | 507 | key_fsgid_changed(task); |
512 | 508 | ||
513 | /* do it | 509 | /* do it |
@@ -524,16 +520,16 @@ int commit_creds(struct cred *new) | |||
524 | alter_cred_subscribers(old, -2); | 520 | alter_cred_subscribers(old, -2); |
525 | 521 | ||
526 | /* send notifications */ | 522 | /* send notifications */ |
527 | if (new->uid != old->uid || | 523 | if (!uid_eq(new->uid, old->uid) || |
528 | new->euid != old->euid || | 524 | !uid_eq(new->euid, old->euid) || |
529 | new->suid != old->suid || | 525 | !uid_eq(new->suid, old->suid) || |
530 | new->fsuid != old->fsuid) | 526 | !uid_eq(new->fsuid, old->fsuid)) |
531 | proc_id_connector(task, PROC_EVENT_UID); | 527 | proc_id_connector(task, PROC_EVENT_UID); |
532 | 528 | ||
533 | if (new->gid != old->gid || | 529 | if (!gid_eq(new->gid, old->gid) || |
534 | new->egid != old->egid || | 530 | !gid_eq(new->egid, old->egid) || |
535 | new->sgid != old->sgid || | 531 | !gid_eq(new->sgid, old->sgid) || |
536 | new->fsgid != old->fsgid) | 532 | !gid_eq(new->fsgid, old->fsgid)) |
537 | proc_id_connector(task, PROC_EVENT_GID); | 533 | proc_id_connector(task, PROC_EVENT_GID); |
538 | 534 | ||
539 | /* release the old obj and subj refs both */ | 535 | /* release the old obj and subj refs both */ |
@@ -678,6 +674,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) | |||
678 | atomic_set(&new->usage, 1); | 674 | atomic_set(&new->usage, 1); |
679 | set_cred_subscribers(new, 0); | 675 | set_cred_subscribers(new, 0); |
680 | get_uid(new->user); | 676 | get_uid(new->user); |
677 | get_user_ns(new->user_ns); | ||
681 | get_group_info(new->group_info); | 678 | get_group_info(new->group_info); |
682 | 679 | ||
683 | #ifdef CONFIG_KEYS | 680 | #ifdef CONFIG_KEYS |
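
The uid/gid churn in cred.c comes from the user-namespace conversion: kernel-internal uids and gids become distinct kuid_t/kgid_t types, so raw ==/!= comparisons no longer compile and uid_eq()/gid_eq() must be used instead. A stripped-down sketch of the idea (the real definitions live in include/linux/uidgid.h; this is a simplification, not a quote of them):

typedef struct {
	uid_t val;
} kuid_t;

static inline bool uid_eq(kuid_t left, kuid_t right)
{
	return left.val == right.val;
}

This is also why init_cred now carries explicit GLOBAL_ROOT_UID/GLOBAL_ROOT_GID initializers, and why prepare_creds()/prepare_kernel_cred() take their own user_ns reference (dropped in put_cred_rcu()) instead of relying on cred->user to pin it.
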
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 67b847dfa2bb..1f91413edb87 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/ctype.h> | 14 | #include <linux/ctype.h> |
15 | #include <linux/string.h> | 15 | #include <linux/string.h> |
16 | #include <linux/kernel.h> | 16 | #include <linux/kernel.h> |
17 | #include <linux/kmsg_dump.h> | ||
17 | #include <linux/reboot.h> | 18 | #include <linux/reboot.h> |
18 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
19 | #include <linux/sysrq.h> | 20 | #include <linux/sysrq.h> |
@@ -2040,8 +2041,15 @@ static int kdb_env(int argc, const char **argv) | |||
2040 | */ | 2041 | */ |
2041 | static int kdb_dmesg(int argc, const char **argv) | 2042 | static int kdb_dmesg(int argc, const char **argv) |
2042 | { | 2043 | { |
2043 | char *syslog_data[4], *start, *end, c = '\0', *p; | 2044 | int diag; |
2044 | int diag, logging, logsize, lines = 0, adjust = 0, n; | 2045 | int logging; |
2046 | int lines = 0; | ||
2047 | int adjust = 0; | ||
2048 | int n = 0; | ||
2049 | int skip = 0; | ||
2050 | struct kmsg_dumper dumper = { .active = 1 }; | ||
2051 | size_t len; | ||
2052 | char buf[201]; | ||
2045 | 2053 | ||
2046 | if (argc > 2) | 2054 | if (argc > 2) |
2047 | return KDB_ARGCOUNT; | 2055 | return KDB_ARGCOUNT; |
@@ -2064,22 +2072,10 @@ static int kdb_dmesg(int argc, const char **argv) | |||
2064 | kdb_set(2, setargs); | 2072 | kdb_set(2, setargs); |
2065 | } | 2073 | } |
2066 | 2074 | ||
2067 | /* syslog_data[0,1] physical start, end+1. syslog_data[2,3] | 2075 | kmsg_dump_rewind_nolock(&dumper); |
2068 | * logical start, end+1. */ | 2076 | while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL)) |
2069 | kdb_syslog_data(syslog_data); | 2077 | n++; |
2070 | if (syslog_data[2] == syslog_data[3]) | 2078 | |
2071 | return 0; | ||
2072 | logsize = syslog_data[1] - syslog_data[0]; | ||
2073 | start = syslog_data[2]; | ||
2074 | end = syslog_data[3]; | ||
2075 | #define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0]) | ||
2076 | for (n = 0, p = start; p < end; ++p) { | ||
2077 | c = *KDB_WRAP(p); | ||
2078 | if (c == '\n') | ||
2079 | ++n; | ||
2080 | } | ||
2081 | if (c != '\n') | ||
2082 | ++n; | ||
2083 | if (lines < 0) { | 2079 | if (lines < 0) { |
2084 | if (adjust >= n) | 2080 | if (adjust >= n) |
2085 | kdb_printf("buffer only contains %d lines, nothing " | 2081 | kdb_printf("buffer only contains %d lines, nothing " |
@@ -2087,21 +2083,11 @@ static int kdb_dmesg(int argc, const char **argv) | |||
2087 | else if (adjust - lines >= n) | 2083 | else if (adjust - lines >= n) |
2088 | kdb_printf("buffer only contains %d lines, last %d " | 2084 | kdb_printf("buffer only contains %d lines, last %d " |
2089 | "lines printed\n", n, n - adjust); | 2085 | "lines printed\n", n, n - adjust); |
2090 | if (adjust) { | 2086 | skip = adjust; |
2091 | for (; start < end && adjust; ++start) { | 2087 | lines = abs(lines); |
2092 | if (*KDB_WRAP(start) == '\n') | ||
2093 | --adjust; | ||
2094 | } | ||
2095 | if (start < end) | ||
2096 | ++start; | ||
2097 | } | ||
2098 | for (p = start; p < end && lines; ++p) { | ||
2099 | if (*KDB_WRAP(p) == '\n') | ||
2100 | ++lines; | ||
2101 | } | ||
2102 | end = p; | ||
2103 | } else if (lines > 0) { | 2088 | } else if (lines > 0) { |
2104 | int skip = n - (adjust + lines); | 2089 | skip = n - lines - adjust; |
2090 | lines = abs(lines); | ||
2105 | if (adjust >= n) { | 2091 | if (adjust >= n) { |
2106 | kdb_printf("buffer only contains %d lines, " | 2092 | kdb_printf("buffer only contains %d lines, " |
2107 | "nothing printed\n", n); | 2093 | "nothing printed\n", n); |
@@ -2112,35 +2098,24 @@ static int kdb_dmesg(int argc, const char **argv) | |||
2112 | kdb_printf("buffer only contains %d lines, first " | 2098 | kdb_printf("buffer only contains %d lines, first " |
2113 | "%d lines printed\n", n, lines); | 2099 | "%d lines printed\n", n, lines); |
2114 | } | 2100 | } |
2115 | for (; start < end && skip; ++start) { | 2101 | } else { |
2116 | if (*KDB_WRAP(start) == '\n') | 2102 | lines = n; |
2117 | --skip; | ||
2118 | } | ||
2119 | for (p = start; p < end && lines; ++p) { | ||
2120 | if (*KDB_WRAP(p) == '\n') | ||
2121 | --lines; | ||
2122 | } | ||
2123 | end = p; | ||
2124 | } | 2103 | } |
2125 | /* Do a line at a time (max 200 chars) to reduce protocol overhead */ | 2104 | |
2126 | c = '\n'; | 2105 | if (skip >= n || skip < 0) |
2127 | while (start != end) { | 2106 | return 0; |
2128 | char buf[201]; | 2107 | |
2129 | p = buf; | 2108 | kmsg_dump_rewind_nolock(&dumper); |
2130 | if (KDB_FLAG(CMD_INTERRUPT)) | 2109 | while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) { |
2131 | return 0; | 2110 | if (skip) { |
2132 | while (start < end && (c = *KDB_WRAP(start)) && | 2111 | skip--; |
2133 | (p - buf) < sizeof(buf)-1) { | 2112 | continue; |
2134 | ++start; | ||
2135 | *p++ = c; | ||
2136 | if (c == '\n') | ||
2137 | break; | ||
2138 | } | 2113 | } |
2139 | *p = '\0'; | 2114 | if (!lines--) |
2140 | kdb_printf("%s", buf); | 2115 | break; |
2116 | |||
2117 | kdb_printf("%.*s\n", (int)len - 1, buf); | ||
2141 | } | 2118 | } |
2142 | if (c != '\n') | ||
2143 | kdb_printf("\n"); | ||
2144 | 2119 | ||
2145 | return 0; | 2120 | return 0; |
2146 | } | 2121 | } |
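
The kdb_dmesg() rewrite above drops the raw syslog_data[] pointer walking (and the kdb_syslog_data() hook removed from kdb_private.h below) in favour of the kmsg_dumper iteration API. Reduced to its core, the two-pass pattern it uses is sketched here; the helper name is illustrative.

static void example_print_last_lines(int want)
{
	struct kmsg_dumper dumper = { .active = 1 };
	char buf[201];
	size_t len;
	int n = 0;

	/* First pass: count the lines currently in the log buffer. */
	kmsg_dump_rewind_nolock(&dumper);
	while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
		n++;

	/* Second pass: skip down to the last 'want' lines and print them. */
	kmsg_dump_rewind_nolock(&dumper);
	while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
		if (n-- > want)
			continue;
		kdb_printf("%.*s\n", (int)len - 1, buf);
	}
}
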
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 47c4e56e513b..392ec6a25844 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h | |||
@@ -205,7 +205,6 @@ extern char kdb_grep_string[]; | |||
205 | extern int kdb_grep_leading; | 205 | extern int kdb_grep_leading; |
206 | extern int kdb_grep_trailing; | 206 | extern int kdb_grep_trailing; |
207 | extern char *kdb_cmds[]; | 207 | extern char *kdb_cmds[]; |
208 | extern void kdb_syslog_data(char *syslog_data[]); | ||
209 | extern unsigned long kdb_task_state_string(const char *); | 208 | extern unsigned long kdb_task_state_string(const char *); |
210 | extern char kdb_task_state_char (const struct task_struct *); | 209 | extern char kdb_task_state_char (const struct task_struct *); |
211 | extern unsigned long kdb_task_state(const struct task_struct *p, | 210 | extern unsigned long kdb_task_state(const struct task_struct *p, |
diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 22d901f9caf4..103f5d147b2f 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile | |||
@@ -3,4 +3,7 @@ CFLAGS_REMOVE_core.o = -pg | |||
3 | endif | 3 | endif |
4 | 4 | ||
5 | obj-y := core.o ring_buffer.o callchain.o | 5 | obj-y := core.o ring_buffer.o callchain.o |
6 | |||
6 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o | 7 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o |
8 | obj-$(CONFIG_UPROBES) += uprobes.o | ||
9 | |||
diff --git a/kernel/events/core.c b/kernel/events/core.c index fd126f82b57c..d7d71d6ec972 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -253,9 +253,9 @@ perf_cgroup_match(struct perf_event *event) | |||
253 | return !event->cgrp || event->cgrp == cpuctx->cgrp; | 253 | return !event->cgrp || event->cgrp == cpuctx->cgrp; |
254 | } | 254 | } |
255 | 255 | ||
256 | static inline void perf_get_cgroup(struct perf_event *event) | 256 | static inline bool perf_tryget_cgroup(struct perf_event *event) |
257 | { | 257 | { |
258 | css_get(&event->cgrp->css); | 258 | return css_tryget(&event->cgrp->css); |
259 | } | 259 | } |
260 | 260 | ||
261 | static inline void perf_put_cgroup(struct perf_event *event) | 261 | static inline void perf_put_cgroup(struct perf_event *event) |
@@ -484,7 +484,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
484 | event->cgrp = cgrp; | 484 | event->cgrp = cgrp; |
485 | 485 | ||
486 | /* must be done before we fput() the file */ | 486 | /* must be done before we fput() the file */ |
487 | perf_get_cgroup(event); | 487 | if (!perf_tryget_cgroup(event)) { |
488 | event->cgrp = NULL; | ||
489 | ret = -ENOENT; | ||
490 | goto out; | ||
491 | } | ||
488 | 492 | ||
489 | /* | 493 | /* |
490 | * all events in a group must monitor | 494 | * all events in a group must monitor |
@@ -3181,7 +3185,6 @@ static void perf_event_for_each(struct perf_event *event, | |||
3181 | event = event->group_leader; | 3185 | event = event->group_leader; |
3182 | 3186 | ||
3183 | perf_event_for_each_child(event, func); | 3187 | perf_event_for_each_child(event, func); |
3184 | func(event); | ||
3185 | list_for_each_entry(sibling, &event->sibling_list, group_entry) | 3188 | list_for_each_entry(sibling, &event->sibling_list, group_entry) |
3186 | perf_event_for_each_child(sibling, func); | 3189 | perf_event_for_each_child(sibling, func); |
3187 | mutex_unlock(&ctx->mutex); | 3190 | mutex_unlock(&ctx->mutex); |
@@ -4957,7 +4960,7 @@ void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) | |||
4957 | if (rctx < 0) | 4960 | if (rctx < 0) |
4958 | return; | 4961 | return; |
4959 | 4962 | ||
4960 | perf_sample_data_init(&data, addr); | 4963 | perf_sample_data_init(&data, addr, 0); |
4961 | 4964 | ||
4962 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); | 4965 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); |
4963 | 4966 | ||
@@ -5215,7 +5218,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
5215 | .data = record, | 5218 | .data = record, |
5216 | }; | 5219 | }; |
5217 | 5220 | ||
5218 | perf_sample_data_init(&data, addr); | 5221 | perf_sample_data_init(&data, addr, 0); |
5219 | data.raw = &raw; | 5222 | data.raw = &raw; |
5220 | 5223 | ||
5221 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 5224 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
@@ -5318,7 +5321,7 @@ void perf_bp_event(struct perf_event *bp, void *data) | |||
5318 | struct perf_sample_data sample; | 5321 | struct perf_sample_data sample; |
5319 | struct pt_regs *regs = data; | 5322 | struct pt_regs *regs = data; |
5320 | 5323 | ||
5321 | perf_sample_data_init(&sample, bp->attr.bp_addr); | 5324 | perf_sample_data_init(&sample, bp->attr.bp_addr, 0); |
5322 | 5325 | ||
5323 | if (!bp->hw.state && !perf_exclude_event(bp, regs)) | 5326 | if (!bp->hw.state && !perf_exclude_event(bp, regs)) |
5324 | perf_swevent_event(bp, 1, &sample, regs); | 5327 | perf_swevent_event(bp, 1, &sample, regs); |
@@ -5344,13 +5347,12 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | |||
5344 | 5347 | ||
5345 | event->pmu->read(event); | 5348 | event->pmu->read(event); |
5346 | 5349 | ||
5347 | perf_sample_data_init(&data, 0); | 5350 | perf_sample_data_init(&data, 0, event->hw.last_period); |
5348 | data.period = event->hw.last_period; | ||
5349 | regs = get_irq_regs(); | 5351 | regs = get_irq_regs(); |
5350 | 5352 | ||
5351 | if (regs && !perf_exclude_event(event, regs)) { | 5353 | if (regs && !perf_exclude_event(event, regs)) { |
5352 | if (!(event->attr.exclude_idle && is_idle_task(current))) | 5354 | if (!(event->attr.exclude_idle && is_idle_task(current))) |
5353 | if (perf_event_overflow(event, &data, regs)) | 5355 | if (__perf_event_overflow(event, 1, &data, regs)) |
5354 | ret = HRTIMER_NORESTART; | 5356 | ret = HRTIMER_NORESTART; |
5355 | } | 5357 | } |
5356 | 5358 | ||
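
The repeated perf_sample_data_init(&data, addr) -> perf_sample_data_init(&data, addr, 0) conversions reflect the helper gaining a period parameter, so callers such as perf_swevent_hrtimer() no longer assign data.period by hand (that path also now calls __perf_event_overflow() directly). What the updated inline in include/linux/perf_event.h plausibly looks like; treat the exact field list as an assumption rather than a quote of this patch:

static inline void perf_sample_data_init(struct perf_sample_data *data,
					 u64 addr, u64 period)
{
	/* Fields not listed here are filled in by the output path. */
	data->addr = addr;
	data->raw = NULL;
	data->period = period;
}
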
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c new file mode 100644 index 000000000000..985be4d80fe8 --- /dev/null +++ b/kernel/events/uprobes.c | |||
@@ -0,0 +1,1667 @@ | |||
1 | /* | ||
2 | * User-space Probes (UProbes) | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright (C) IBM Corporation, 2008-2012 | ||
19 | * Authors: | ||
20 | * Srikar Dronamraju | ||
21 | * Jim Keniston | ||
22 | * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
23 | */ | ||
24 | |||
25 | #include <linux/kernel.h> | ||
26 | #include <linux/highmem.h> | ||
27 | #include <linux/pagemap.h> /* read_mapping_page */ | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/sched.h> | ||
30 | #include <linux/rmap.h> /* anon_vma_prepare */ | ||
31 | #include <linux/mmu_notifier.h> /* set_pte_at_notify */ | ||
32 | #include <linux/swap.h> /* try_to_free_swap */ | ||
33 | #include <linux/ptrace.h> /* user_enable_single_step */ | ||
34 | #include <linux/kdebug.h> /* notifier mechanism */ | ||
35 | |||
36 | #include <linux/uprobes.h> | ||
37 | |||
38 | #define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) | ||
39 | #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE | ||
40 | |||
41 | static struct srcu_struct uprobes_srcu; | ||
42 | static struct rb_root uprobes_tree = RB_ROOT; | ||
43 | |||
44 | static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ | ||
45 | |||
46 | #define UPROBES_HASH_SZ 13 | ||
47 | |||
48 | /* serialize (un)register */ | ||
49 | static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; | ||
50 | |||
51 | #define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) | ||
52 | |||
53 | /* serialize uprobe->pending_list */ | ||
54 | static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; | ||
55 | #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) | ||
56 | |||
57 | /* | ||
58 | * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe | ||
59 | * events active at this time. Probably a fine-grained per-inode count is | ||
60 | * better? | ||
61 | */ | ||
62 | static atomic_t uprobe_events = ATOMIC_INIT(0); | ||
63 | |||
64 | /* | ||
65 | * Maintain a temporary per vma info that can be used to search if a vma | ||
66 | * has already been handled. This structure is introduced since extending | ||
67 | * vm_area_struct wasn't recommended. | ||
68 | */ | ||
69 | struct vma_info { | ||
70 | struct list_head probe_list; | ||
71 | struct mm_struct *mm; | ||
72 | loff_t vaddr; | ||
73 | }; | ||
74 | |||
75 | struct uprobe { | ||
76 | struct rb_node rb_node; /* node in the rb tree */ | ||
77 | atomic_t ref; | ||
78 | struct rw_semaphore consumer_rwsem; | ||
79 | struct list_head pending_list; | ||
80 | struct uprobe_consumer *consumers; | ||
81 | struct inode *inode; /* Also hold a ref to inode */ | ||
82 | loff_t offset; | ||
83 | int flags; | ||
84 | struct arch_uprobe arch; | ||
85 | }; | ||
86 | |||
87 | /* | ||
88 | * valid_vma: Verify if the specified vma is an executable vma | ||
89 | * Relax restrictions while unregistering: vm_flags might have | ||
90 | * changed after breakpoint was inserted. | ||
91 | * - is_register: indicates if we are in register context. | ||
92 | * - Return 1 if the specified virtual address is in an | ||
93 | * executable vma. | ||
94 | */ | ||
95 | static bool valid_vma(struct vm_area_struct *vma, bool is_register) | ||
96 | { | ||
97 | if (!vma->vm_file) | ||
98 | return false; | ||
99 | |||
100 | if (!is_register) | ||
101 | return true; | ||
102 | |||
103 | if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC)) | ||
104 | return true; | ||
105 | |||
106 | return false; | ||
107 | } | ||
108 | |||
109 | static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) | ||
110 | { | ||
111 | loff_t vaddr; | ||
112 | |||
113 | vaddr = vma->vm_start + offset; | ||
114 | vaddr -= vma->vm_pgoff << PAGE_SHIFT; | ||
115 | |||
116 | return vaddr; | ||
117 | } | ||
118 | |||
119 | /** | ||
120 | * __replace_page - replace page in vma by new page. | ||
121 | * based on replace_page in mm/ksm.c | ||
122 | * | ||
123 | * @vma: vma that holds the pte pointing to page | ||
124 | * @page: the cowed page we are replacing by kpage | ||
125 | * @kpage: the modified page we replace page by | ||
126 | * | ||
127 | * Returns 0 on success, -EFAULT on failure. | ||
128 | */ | ||
129 | static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) | ||
130 | { | ||
131 | struct mm_struct *mm = vma->vm_mm; | ||
132 | pgd_t *pgd; | ||
133 | pud_t *pud; | ||
134 | pmd_t *pmd; | ||
135 | pte_t *ptep; | ||
136 | spinlock_t *ptl; | ||
137 | unsigned long addr; | ||
138 | int err = -EFAULT; | ||
139 | |||
140 | addr = page_address_in_vma(page, vma); | ||
141 | if (addr == -EFAULT) | ||
142 | goto out; | ||
143 | |||
144 | pgd = pgd_offset(mm, addr); | ||
145 | if (!pgd_present(*pgd)) | ||
146 | goto out; | ||
147 | |||
148 | pud = pud_offset(pgd, addr); | ||
149 | if (!pud_present(*pud)) | ||
150 | goto out; | ||
151 | |||
152 | pmd = pmd_offset(pud, addr); | ||
153 | if (!pmd_present(*pmd)) | ||
154 | goto out; | ||
155 | |||
156 | ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); | ||
157 | if (!ptep) | ||
158 | goto out; | ||
159 | |||
160 | get_page(kpage); | ||
161 | page_add_new_anon_rmap(kpage, vma, addr); | ||
162 | |||
163 | if (!PageAnon(page)) { | ||
164 | dec_mm_counter(mm, MM_FILEPAGES); | ||
165 | inc_mm_counter(mm, MM_ANONPAGES); | ||
166 | } | ||
167 | |||
168 | flush_cache_page(vma, addr, pte_pfn(*ptep)); | ||
169 | ptep_clear_flush(vma, addr, ptep); | ||
170 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); | ||
171 | |||
172 | page_remove_rmap(page); | ||
173 | if (!page_mapped(page)) | ||
174 | try_to_free_swap(page); | ||
175 | put_page(page); | ||
176 | pte_unmap_unlock(ptep, ptl); | ||
177 | err = 0; | ||
178 | |||
179 | out: | ||
180 | return err; | ||
181 | } | ||
182 | |||
183 | /** | ||
184 | * is_swbp_insn - check if instruction is breakpoint instruction. | ||
185 | * @insn: instruction to be checked. | ||
186 | * Default implementation of is_swbp_insn | ||
187 | * Returns true if @insn is a breakpoint instruction. | ||
188 | */ | ||
189 | bool __weak is_swbp_insn(uprobe_opcode_t *insn) | ||
190 | { | ||
191 | return *insn == UPROBE_SWBP_INSN; | ||
192 | } | ||
193 | |||
194 | /* | ||
195 | * NOTE: | ||
196 | * Expect the breakpoint instruction to be the smallest size instruction for | ||
197 | * the architecture. If an arch has variable-length instructions and the | ||
198 | * breakpoint instruction is not of the smallest length supported by | ||
199 | * that architecture, then we need to modify read_opcode / | ||
200 | * write_opcode accordingly. This would never be a problem for archs that | ||
201 | * have fixed length instructions. | ||
202 | */ | ||
203 | |||
204 | /* | ||
205 | * write_opcode - write the opcode at a given virtual address. | ||
206 | * @auprobe: arch breakpointing information. | ||
207 | * @mm: the probed process address space. | ||
208 | * @vaddr: the virtual address to store the opcode. | ||
209 | * @opcode: opcode to be written at @vaddr. | ||
210 | * | ||
211 | * Called with mm->mmap_sem held (for read and with a reference to | ||
212 | * mm). | ||
213 | * | ||
214 | * For mm @mm, write the opcode at @vaddr. | ||
215 | * Return 0 (success) or a negative errno. | ||
216 | */ | ||
217 | static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, | ||
218 | unsigned long vaddr, uprobe_opcode_t opcode) | ||
219 | { | ||
220 | struct page *old_page, *new_page; | ||
221 | struct address_space *mapping; | ||
222 | void *vaddr_old, *vaddr_new; | ||
223 | struct vm_area_struct *vma; | ||
224 | struct uprobe *uprobe; | ||
225 | loff_t addr; | ||
226 | int ret; | ||
227 | |||
228 | /* Read the page with vaddr into memory */ | ||
229 | ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); | ||
230 | if (ret <= 0) | ||
231 | return ret; | ||
232 | |||
233 | ret = -EINVAL; | ||
234 | |||
235 | /* | ||
236 | * We are interested in text pages only. Our pages of interest | ||
237 | * should be mapped for read and execute only. We desist from | ||
238 | * adding probes in write mapped pages since the breakpoints | ||
239 | * might end up in the file copy. | ||
240 | */ | ||
241 | if (!valid_vma(vma, is_swbp_insn(&opcode))) | ||
242 | goto put_out; | ||
243 | |||
244 | uprobe = container_of(auprobe, struct uprobe, arch); | ||
245 | mapping = uprobe->inode->i_mapping; | ||
246 | if (mapping != vma->vm_file->f_mapping) | ||
247 | goto put_out; | ||
248 | |||
249 | addr = vma_address(vma, uprobe->offset); | ||
250 | if (vaddr != (unsigned long)addr) | ||
251 | goto put_out; | ||
252 | |||
253 | ret = -ENOMEM; | ||
254 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); | ||
255 | if (!new_page) | ||
256 | goto put_out; | ||
257 | |||
258 | __SetPageUptodate(new_page); | ||
259 | |||
260 | /* | ||
261 | * lock page will serialize against do_wp_page()'s | ||
262 | * PageAnon() handling | ||
263 | */ | ||
264 | lock_page(old_page); | ||
265 | /* copy the page now that we've got it stable */ | ||
266 | vaddr_old = kmap_atomic(old_page); | ||
267 | vaddr_new = kmap_atomic(new_page); | ||
268 | |||
269 | memcpy(vaddr_new, vaddr_old, PAGE_SIZE); | ||
270 | |||
271 | /* poke the new insn in, ASSUMES we don't cross page boundary */ | ||
272 | vaddr &= ~PAGE_MASK; | ||
273 | BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); | ||
274 | memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); | ||
275 | |||
276 | kunmap_atomic(vaddr_new); | ||
277 | kunmap_atomic(vaddr_old); | ||
278 | |||
279 | ret = anon_vma_prepare(vma); | ||
280 | if (ret) | ||
281 | goto unlock_out; | ||
282 | |||
283 | lock_page(new_page); | ||
284 | ret = __replace_page(vma, old_page, new_page); | ||
285 | unlock_page(new_page); | ||
286 | |||
287 | unlock_out: | ||
288 | unlock_page(old_page); | ||
289 | page_cache_release(new_page); | ||
290 | |||
291 | put_out: | ||
292 | put_page(old_page); | ||
293 | |||
294 | return ret; | ||
295 | } | ||
296 | |||
297 | /** | ||
298 | * read_opcode - read the opcode at a given virtual address. | ||
299 | * @mm: the probed process address space. | ||
300 | * @vaddr: the virtual address to read the opcode. | ||
301 | * @opcode: location to store the read opcode. | ||
302 | * | ||
303 | * Called with mm->mmap_sem held (for read and with a reference to | ||
304 | * mm). | ||
305 | * | ||
306 | * For mm @mm, read the opcode at @vaddr and store it in @opcode. | ||
307 | * Return 0 (success) or a negative errno. | ||
308 | */ | ||
309 | static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode) | ||
310 | { | ||
311 | struct page *page; | ||
312 | void *vaddr_new; | ||
313 | int ret; | ||
314 | |||
315 | ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL); | ||
316 | if (ret <= 0) | ||
317 | return ret; | ||
318 | |||
319 | lock_page(page); | ||
320 | vaddr_new = kmap_atomic(page); | ||
321 | vaddr &= ~PAGE_MASK; | ||
322 | memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE); | ||
323 | kunmap_atomic(vaddr_new); | ||
324 | unlock_page(page); | ||
325 | |||
326 | put_page(page); | ||
327 | |||
328 | return 0; | ||
329 | } | ||
330 | |||
331 | static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) | ||
332 | { | ||
333 | uprobe_opcode_t opcode; | ||
334 | int result; | ||
335 | |||
336 | result = read_opcode(mm, vaddr, &opcode); | ||
337 | if (result) | ||
338 | return result; | ||
339 | |||
340 | if (is_swbp_insn(&opcode)) | ||
341 | return 1; | ||
342 | |||
343 | return 0; | ||
344 | } | ||
345 | |||
346 | /** | ||
347 | * set_swbp - store breakpoint at a given address. | ||
348 | * @auprobe: arch specific probepoint information. | ||
349 | * @mm: the probed process address space. | ||
350 | * @vaddr: the virtual address to insert the opcode. | ||
351 | * | ||
352 | * For mm @mm, store the breakpoint instruction at @vaddr. | ||
353 | * Return 0 (success) or a negative errno. | ||
354 | */ | ||
355 | int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) | ||
356 | { | ||
357 | int result; | ||
358 | |||
359 | result = is_swbp_at_addr(mm, vaddr); | ||
360 | if (result == 1) | ||
361 | return -EEXIST; | ||
362 | |||
363 | if (result) | ||
364 | return result; | ||
365 | |||
366 | return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); | ||
367 | } | ||
368 | |||
369 | /** | ||
370 | * set_orig_insn - Restore the original instruction. | ||
371 | * @mm: the probed process address space. | ||
372 | * @auprobe: arch specific probepoint information. | ||
373 | * @vaddr: the virtual address to insert the opcode. | ||
374 | * @verify: if true, verify existence of breakpoint instruction. | ||
375 | * | ||
376 | * For mm @mm, restore the original opcode (opcode) at @vaddr. | ||
377 | * Return 0 (success) or a negative errno. | ||
378 | */ | ||
379 | int __weak | ||
380 | set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify) | ||
381 | { | ||
382 | if (verify) { | ||
383 | int result; | ||
384 | |||
385 | result = is_swbp_at_addr(mm, vaddr); | ||
386 | if (!result) | ||
387 | return -EINVAL; | ||
388 | |||
389 | if (result != 1) | ||
390 | return result; | ||
391 | } | ||
392 | return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); | ||
393 | } | ||
394 | |||
395 | static int match_uprobe(struct uprobe *l, struct uprobe *r) | ||
396 | { | ||
397 | if (l->inode < r->inode) | ||
398 | return -1; | ||
399 | |||
400 | if (l->inode > r->inode) | ||
401 | return 1; | ||
402 | |||
403 | if (l->offset < r->offset) | ||
404 | return -1; | ||
405 | |||
406 | if (l->offset > r->offset) | ||
407 | return 1; | ||
408 | |||
409 | return 0; | ||
410 | } | ||
411 | |||
412 | static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) | ||
413 | { | ||
414 | struct uprobe u = { .inode = inode, .offset = offset }; | ||
415 | struct rb_node *n = uprobes_tree.rb_node; | ||
416 | struct uprobe *uprobe; | ||
417 | int match; | ||
418 | |||
419 | while (n) { | ||
420 | uprobe = rb_entry(n, struct uprobe, rb_node); | ||
421 | match = match_uprobe(&u, uprobe); | ||
422 | if (!match) { | ||
423 | atomic_inc(&uprobe->ref); | ||
424 | return uprobe; | ||
425 | } | ||
426 | |||
427 | if (match < 0) | ||
428 | n = n->rb_left; | ||
429 | else | ||
430 | n = n->rb_right; | ||
431 | } | ||
432 | return NULL; | ||
433 | } | ||
434 | |||
435 | /* | ||
436 | * Find a uprobe corresponding to a given inode:offset | ||
437 | * Acquires uprobes_treelock | ||
438 | */ | ||
439 | static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) | ||
440 | { | ||
441 | struct uprobe *uprobe; | ||
442 | unsigned long flags; | ||
443 | |||
444 | spin_lock_irqsave(&uprobes_treelock, flags); | ||
445 | uprobe = __find_uprobe(inode, offset); | ||
446 | spin_unlock_irqrestore(&uprobes_treelock, flags); | ||
447 | |||
448 | return uprobe; | ||
449 | } | ||
450 | |||
451 | static struct uprobe *__insert_uprobe(struct uprobe *uprobe) | ||
452 | { | ||
453 | struct rb_node **p = &uprobes_tree.rb_node; | ||
454 | struct rb_node *parent = NULL; | ||
455 | struct uprobe *u; | ||
456 | int match; | ||
457 | |||
458 | while (*p) { | ||
459 | parent = *p; | ||
460 | u = rb_entry(parent, struct uprobe, rb_node); | ||
461 | match = match_uprobe(uprobe, u); | ||
462 | if (!match) { | ||
463 | atomic_inc(&u->ref); | ||
464 | return u; | ||
465 | } | ||
466 | |||
467 | if (match < 0) | ||
468 | p = &parent->rb_left; | ||
469 | else | ||
470 | p = &parent->rb_right; | ||
471 | |||
472 | } | ||
473 | |||
474 | u = NULL; | ||
475 | rb_link_node(&uprobe->rb_node, parent, p); | ||
476 | rb_insert_color(&uprobe->rb_node, &uprobes_tree); | ||
477 | /* get access + creation ref */ | ||
478 | atomic_set(&uprobe->ref, 2); | ||
479 | |||
480 | return u; | ||
481 | } | ||
482 | |||
483 | /* | ||
484 | * Acquire uprobes_treelock. | ||
485 | * Matching uprobe already exists in rbtree; | ||
486 | * increment (access refcount) and return the matching uprobe. | ||
487 | * | ||
488 | * No matching uprobe; insert the uprobe in rb_tree; | ||
489 | * get a double refcount (access + creation) and return NULL. | ||
490 | */ | ||
491 | static struct uprobe *insert_uprobe(struct uprobe *uprobe) | ||
492 | { | ||
493 | unsigned long flags; | ||
494 | struct uprobe *u; | ||
495 | |||
496 | spin_lock_irqsave(&uprobes_treelock, flags); | ||
497 | u = __insert_uprobe(uprobe); | ||
498 | spin_unlock_irqrestore(&uprobes_treelock, flags); | ||
499 | |||
500 | /* For now assume that the instruction need not be single-stepped */ | ||
501 | uprobe->flags |= UPROBE_SKIP_SSTEP; | ||
502 | |||
503 | return u; | ||
504 | } | ||
505 | |||
506 | static void put_uprobe(struct uprobe *uprobe) | ||
507 | { | ||
508 | if (atomic_dec_and_test(&uprobe->ref)) | ||
509 | kfree(uprobe); | ||
510 | } | ||
511 | |||
512 | static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) | ||
513 | { | ||
514 | struct uprobe *uprobe, *cur_uprobe; | ||
515 | |||
516 | uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL); | ||
517 | if (!uprobe) | ||
518 | return NULL; | ||
519 | |||
520 | uprobe->inode = igrab(inode); | ||
521 | uprobe->offset = offset; | ||
522 | init_rwsem(&uprobe->consumer_rwsem); | ||
523 | INIT_LIST_HEAD(&uprobe->pending_list); | ||
524 | |||
525 | /* add to uprobes_tree, sorted on inode:offset */ | ||
526 | cur_uprobe = insert_uprobe(uprobe); | ||
527 | |||
528 | /* a uprobe exists for this inode:offset combination */ | ||
529 | if (cur_uprobe) { | ||
530 | kfree(uprobe); | ||
531 | uprobe = cur_uprobe; | ||
532 | iput(inode); | ||
533 | } else { | ||
534 | atomic_inc(&uprobe_events); | ||
535 | } | ||
536 | |||
537 | return uprobe; | ||
538 | } | ||
539 | |||
540 | static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) | ||
541 | { | ||
542 | struct uprobe_consumer *uc; | ||
543 | |||
544 | if (!(uprobe->flags & UPROBE_RUN_HANDLER)) | ||
545 | return; | ||
546 | |||
547 | down_read(&uprobe->consumer_rwsem); | ||
548 | for (uc = uprobe->consumers; uc; uc = uc->next) { | ||
549 | if (!uc->filter || uc->filter(uc, current)) | ||
550 | uc->handler(uc, regs); | ||
551 | } | ||
552 | up_read(&uprobe->consumer_rwsem); | ||
553 | } | ||
554 | |||
555 | /* Returns the previous consumer */ | ||
556 | static struct uprobe_consumer * | ||
557 | consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) | ||
558 | { | ||
559 | down_write(&uprobe->consumer_rwsem); | ||
560 | uc->next = uprobe->consumers; | ||
561 | uprobe->consumers = uc; | ||
562 | up_write(&uprobe->consumer_rwsem); | ||
563 | |||
564 | return uc->next; | ||
565 | } | ||
566 | |||
567 | /* | ||
568 | * For uprobe @uprobe, delete the consumer @uc. | ||
569 | * Return true if the @uc is deleted successfully | ||
570 | * or return false. | ||
571 | */ | ||
572 | static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) | ||
573 | { | ||
574 | struct uprobe_consumer **con; | ||
575 | bool ret = false; | ||
576 | |||
577 | down_write(&uprobe->consumer_rwsem); | ||
578 | for (con = &uprobe->consumers; *con; con = &(*con)->next) { | ||
579 | if (*con == uc) { | ||
580 | *con = uc->next; | ||
581 | ret = true; | ||
582 | break; | ||
583 | } | ||
584 | } | ||
585 | up_write(&uprobe->consumer_rwsem); | ||
586 | |||
587 | return ret; | ||
588 | } | ||
589 | |||
590 | static int | ||
591 | __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn, | ||
592 | unsigned long nbytes, unsigned long offset) | ||
593 | { | ||
594 | struct file *filp = vma->vm_file; | ||
595 | struct page *page; | ||
596 | void *vaddr; | ||
597 | unsigned long off1; | ||
598 | unsigned long idx; | ||
599 | |||
600 | if (!filp) | ||
601 | return -EINVAL; | ||
602 | |||
603 | idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); | ||
604 | off1 = offset &= ~PAGE_MASK; | ||
605 | |||
606 | /* | ||
607 | * Ensure that the page that has the original instruction is | ||
608 | * populated and in page-cache. | ||
609 | */ | ||
610 | page = read_mapping_page(mapping, idx, filp); | ||
611 | if (IS_ERR(page)) | ||
612 | return PTR_ERR(page); | ||
613 | |||
614 | vaddr = kmap_atomic(page); | ||
615 | memcpy(insn, vaddr + off1, nbytes); | ||
616 | kunmap_atomic(vaddr); | ||
617 | page_cache_release(page); | ||
618 | |||
619 | return 0; | ||
620 | } | ||
621 | |||
622 | static int | ||
623 | copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) | ||
624 | { | ||
625 | struct address_space *mapping; | ||
626 | unsigned long nbytes; | ||
627 | int bytes; | ||
628 | |||
629 | addr &= ~PAGE_MASK; | ||
630 | nbytes = PAGE_SIZE - addr; | ||
631 | mapping = uprobe->inode->i_mapping; | ||
632 | |||
633 | /* Instruction at end of binary; copy only available bytes */ | ||
634 | if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) | ||
635 | bytes = uprobe->inode->i_size - uprobe->offset; | ||
636 | else | ||
637 | bytes = MAX_UINSN_BYTES; | ||
638 | |||
639 | /* Instruction at the page-boundary; copy bytes in second page */ | ||
640 | if (nbytes < bytes) { | ||
641 | if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes, | ||
642 | bytes - nbytes, uprobe->offset + nbytes)) | ||
643 | return -ENOMEM; | ||
644 | |||
645 | bytes = nbytes; | ||
646 | } | ||
647 | return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset); | ||
648 | } | ||
649 | |||
650 | /* | ||
651 | * How mm->uprobes_state.count gets updated | ||
652 | * uprobe_mmap() increments the count if | ||
653 | * - it successfully adds a breakpoint. | ||
654 | * - it cannot add a breakpoint, but sees that there is an underlying | ||
655 | * breakpoint (via a is_swbp_at_addr()). | ||
656 | * | ||
657 | * uprobe_munmap() decrements the count if | ||
658 | * - it sees an underlying breakpoint (via is_swbp_at_addr) | ||
659 | * (Subsequent uprobe_unregister wouldn't find the breakpoint | ||
660 | * unless a uprobe_mmap kicks in, since the old vma would be | ||
661 | * dropped just after uprobe_munmap.) | ||
662 | * | ||
663 | * uprobe_register increments the count if: | ||
664 | * - it successfully adds a breakpoint. | ||
665 | * | ||
666 | * uprobe_unregister decrements the count if: | ||
667 | * - it sees an underlying breakpoint and removes it successfully. | ||
668 | * (via is_swbp_at_addr) | ||
669 | * (Subsequent uprobe_munmap wouldn't find the breakpoint | ||
670 | * since there is no underlying breakpoint after the | ||
671 | * breakpoint removal.) | ||
672 | */ | ||
673 | static int | ||
674 | install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, | ||
675 | struct vm_area_struct *vma, loff_t vaddr) | ||
676 | { | ||
677 | unsigned long addr; | ||
678 | int ret; | ||
679 | |||
680 | /* | ||
681 | * If the probe is being deleted, the unregistering thread may already | ||
682 | * be done with its vma-rmap walk. Adding a probe now can be fatal since | ||
683 | * nobody will be able to clean it up. Also, we could be called from the | ||
684 | * fork or mremap path, where the probe might have already been inserted. | ||
685 | * Hence behave as if the probe already existed. | ||
686 | */ | ||
687 | if (!uprobe->consumers) | ||
688 | return -EEXIST; | ||
689 | |||
690 | addr = (unsigned long)vaddr; | ||
691 | |||
692 | if (!(uprobe->flags & UPROBE_COPY_INSN)) { | ||
693 | ret = copy_insn(uprobe, vma, addr); | ||
694 | if (ret) | ||
695 | return ret; | ||
696 | |||
697 | if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) | ||
698 | return -EEXIST; | ||
699 | |||
700 | ret = arch_uprobe_analyze_insn(&uprobe->arch, mm); | ||
701 | if (ret) | ||
702 | return ret; | ||
703 | |||
704 | uprobe->flags |= UPROBE_COPY_INSN; | ||
705 | } | ||
706 | |||
707 | /* | ||
708 | * Ideally, should be updating the probe count after the breakpoint | ||
709 | * has been successfully inserted. However a thread could hit the | ||
710 | * breakpoint we just inserted even before the probe count is | ||
711 | * incremented. If this is the first breakpoint placed, breakpoint | ||
712 | * notifier might ignore uprobes and pass the trap to the thread. | ||
713 | * Hence increment before and decrement on failure. | ||
714 | */ | ||
715 | atomic_inc(&mm->uprobes_state.count); | ||
716 | ret = set_swbp(&uprobe->arch, mm, addr); | ||
717 | if (ret) | ||
718 | atomic_dec(&mm->uprobes_state.count); | ||
719 | |||
720 | return ret; | ||
721 | } | ||
722 | |||
723 | static void | ||
724 | remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) | ||
725 | { | ||
726 | if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true)) | ||
727 | atomic_dec(&mm->uprobes_state.count); | ||
728 | } | ||
729 | |||
730 | /* | ||
731 | * There could be threads that have hit the breakpoint and are entering the | ||
732 | * notifier code and trying to acquire the uprobes_treelock. The thread | ||
733 | * calling delete_uprobe() that is removing the uprobe from the rb_tree can | ||
734 | * race with these threads and might acquire the uprobes_treelock before | ||
735 | * some of the breakpoint-hit threads. In such a case, the breakpoint | ||
736 | * hit threads will not find the uprobe. The current unregistering thread | ||
737 | * waits till all other threads have hit a breakpoint, to acquire the | ||
738 | * uprobes_treelock before the uprobe is removed from the rbtree. | ||
739 | */ | ||
740 | static void delete_uprobe(struct uprobe *uprobe) | ||
741 | { | ||
742 | unsigned long flags; | ||
743 | |||
744 | synchronize_srcu(&uprobes_srcu); | ||
745 | spin_lock_irqsave(&uprobes_treelock, flags); | ||
746 | rb_erase(&uprobe->rb_node, &uprobes_tree); | ||
747 | spin_unlock_irqrestore(&uprobes_treelock, flags); | ||
748 | iput(uprobe->inode); | ||
749 | put_uprobe(uprobe); | ||
750 | atomic_dec(&uprobe_events); | ||
751 | } | ||
752 | |||
753 | static struct vma_info * | ||
754 | __find_next_vma_info(struct address_space *mapping, struct list_head *head, | ||
755 | struct vma_info *vi, loff_t offset, bool is_register) | ||
756 | { | ||
757 | struct prio_tree_iter iter; | ||
758 | struct vm_area_struct *vma; | ||
759 | struct vma_info *tmpvi; | ||
760 | unsigned long pgoff; | ||
761 | int existing_vma; | ||
762 | loff_t vaddr; | ||
763 | |||
764 | pgoff = offset >> PAGE_SHIFT; | ||
765 | |||
766 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | ||
767 | if (!valid_vma(vma, is_register)) | ||
768 | continue; | ||
769 | |||
770 | existing_vma = 0; | ||
771 | vaddr = vma_address(vma, offset); | ||
772 | |||
773 | list_for_each_entry(tmpvi, head, probe_list) { | ||
774 | if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) { | ||
775 | existing_vma = 1; | ||
776 | break; | ||
777 | } | ||
778 | } | ||
779 | |||
780 | /* | ||
781 | * Another vma needs a probe to be installed. However skip | ||
782 | * installing the probe if the vma is about to be unlinked. | ||
783 | */ | ||
784 | if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) { | ||
785 | vi->mm = vma->vm_mm; | ||
786 | vi->vaddr = vaddr; | ||
787 | list_add(&vi->probe_list, head); | ||
788 | |||
789 | return vi; | ||
790 | } | ||
791 | } | ||
792 | |||
793 | return NULL; | ||
794 | } | ||
795 | |||
796 | /* | ||
797 | * Iterate in the rmap prio tree and find a vma where a probe has not | ||
798 | * yet been inserted. | ||
799 | */ | ||
800 | static struct vma_info * | ||
801 | find_next_vma_info(struct address_space *mapping, struct list_head *head, | ||
802 | loff_t offset, bool is_register) | ||
803 | { | ||
804 | struct vma_info *vi, *retvi; | ||
805 | |||
806 | vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL); | ||
807 | if (!vi) | ||
808 | return ERR_PTR(-ENOMEM); | ||
809 | |||
810 | mutex_lock(&mapping->i_mmap_mutex); | ||
811 | retvi = __find_next_vma_info(mapping, head, vi, offset, is_register); | ||
812 | mutex_unlock(&mapping->i_mmap_mutex); | ||
813 | |||
814 | if (!retvi) | ||
815 | kfree(vi); | ||
816 | |||
817 | return retvi; | ||
818 | } | ||
819 | |||
820 | static int register_for_each_vma(struct uprobe *uprobe, bool is_register) | ||
821 | { | ||
822 | struct list_head try_list; | ||
823 | struct vm_area_struct *vma; | ||
824 | struct address_space *mapping; | ||
825 | struct vma_info *vi, *tmpvi; | ||
826 | struct mm_struct *mm; | ||
827 | loff_t vaddr; | ||
828 | int ret; | ||
829 | |||
830 | mapping = uprobe->inode->i_mapping; | ||
831 | INIT_LIST_HEAD(&try_list); | ||
832 | |||
833 | ret = 0; | ||
834 | |||
835 | for (;;) { | ||
836 | vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register); | ||
837 | if (!vi) | ||
838 | break; | ||
839 | |||
840 | if (IS_ERR(vi)) { | ||
841 | ret = PTR_ERR(vi); | ||
842 | break; | ||
843 | } | ||
844 | |||
845 | mm = vi->mm; | ||
846 | down_read(&mm->mmap_sem); | ||
847 | vma = find_vma(mm, (unsigned long)vi->vaddr); | ||
848 | if (!vma || !valid_vma(vma, is_register)) { | ||
849 | list_del(&vi->probe_list); | ||
850 | kfree(vi); | ||
851 | up_read(&mm->mmap_sem); | ||
852 | mmput(mm); | ||
853 | continue; | ||
854 | } | ||
855 | vaddr = vma_address(vma, uprobe->offset); | ||
856 | if (vma->vm_file->f_mapping->host != uprobe->inode || | ||
857 | vaddr != vi->vaddr) { | ||
858 | list_del(&vi->probe_list); | ||
859 | kfree(vi); | ||
860 | up_read(&mm->mmap_sem); | ||
861 | mmput(mm); | ||
862 | continue; | ||
863 | } | ||
864 | |||
865 | if (is_register) | ||
866 | ret = install_breakpoint(uprobe, mm, vma, vi->vaddr); | ||
867 | else | ||
868 | remove_breakpoint(uprobe, mm, vi->vaddr); | ||
869 | |||
870 | up_read(&mm->mmap_sem); | ||
871 | mmput(mm); | ||
872 | if (is_register) { | ||
873 | if (ret && ret == -EEXIST) | ||
874 | ret = 0; | ||
875 | if (ret) | ||
876 | break; | ||
877 | } | ||
878 | } | ||
879 | |||
880 | list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) { | ||
881 | list_del(&vi->probe_list); | ||
882 | kfree(vi); | ||
883 | } | ||
884 | |||
885 | return ret; | ||
886 | } | ||
887 | |||
888 | static int __uprobe_register(struct uprobe *uprobe) | ||
889 | { | ||
890 | return register_for_each_vma(uprobe, true); | ||
891 | } | ||
892 | |||
893 | static void __uprobe_unregister(struct uprobe *uprobe) | ||
894 | { | ||
895 | if (!register_for_each_vma(uprobe, false)) | ||
896 | delete_uprobe(uprobe); | ||
897 | |||
898 | /* TODO: can't unregister? schedule a worker thread. */ | ||
899 | } | ||
900 | |||
901 | /* | ||
902 | * uprobe_register - register a probe | ||
903 | * @inode: the file in which the probe has to be placed. | ||
904 | * @offset: offset from the start of the file. | ||
905 | * @uc: information on how to handle the probe. | ||
906 | * | ||
907 | * Apart from the access refcount, uprobe_register() takes a creation | ||
908 | * refcount (through alloc_uprobe) if and only if this @uprobe is getting | ||
909 | * inserted into the rbtree (i.e. the first consumer for an @inode:@offset | ||
910 | * tuple). The creation refcount stops uprobe_unregister from freeing the | ||
911 | * @uprobe even before the register operation is complete. The creation | ||
912 | * refcount is released when the last @uc for the @uprobe | ||
913 | * unregisters. | ||
914 | * | ||
915 | * Return an errno if it cannot successfully install probes, | ||
916 | * else return 0 (success). | ||
917 | */ | ||
918 | int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) | ||
919 | { | ||
920 | struct uprobe *uprobe; | ||
921 | int ret; | ||
922 | |||
923 | if (!inode || !uc || uc->next) | ||
924 | return -EINVAL; | ||
925 | |||
926 | if (offset > i_size_read(inode)) | ||
927 | return -EINVAL; | ||
928 | |||
929 | ret = 0; | ||
930 | mutex_lock(uprobes_hash(inode)); | ||
931 | uprobe = alloc_uprobe(inode, offset); | ||
932 | |||
933 | if (uprobe && !consumer_add(uprobe, uc)) { | ||
934 | ret = __uprobe_register(uprobe); | ||
935 | if (ret) { | ||
936 | uprobe->consumers = NULL; | ||
937 | __uprobe_unregister(uprobe); | ||
938 | } else { | ||
939 | uprobe->flags |= UPROBE_RUN_HANDLER; | ||
940 | } | ||
941 | } | ||
942 | |||
943 | mutex_unlock(uprobes_hash(inode)); | ||
944 | put_uprobe(uprobe); | ||
945 | |||
946 | return ret; | ||
947 | } | ||
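For readers new to the API, a minimal consumer might look roughly like the sketch below. It is illustrative only and not part of the patch: the handler prototype follows struct uprobe_consumer from <linux/uprobes.h>, while my_inode and my_offset stand in for values the caller must already hold.

	static int my_handler(struct uprobe_consumer *self, struct pt_regs *regs)
	{
		pr_info("uprobe hit, ip=%lx\n", instruction_pointer(regs));
		return 0;
	}

	static struct uprobe_consumer my_consumer = {
		.handler = my_handler,
	};

	/* arm: my_inode/my_offset identify the probed instruction in the file */
	err = uprobe_register(my_inode, my_offset, &my_consumer);

	/* ... later, disarm with the same (inode, offset, consumer) triple ... */
	uprobe_unregister(my_inode, my_offset, &my_consumer);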
948 | |||
949 | /* | ||
950 | * uprobe_unregister - unregister an already registered probe. | ||
951 | * @inode: the file from which the probe has to be removed. | ||
952 | * @offset: offset from the start of the file. | ||
953 | * @uc: identifies which probe to remove if multiple probes are colocated. | ||
954 | */ | ||
955 | void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) | ||
956 | { | ||
957 | struct uprobe *uprobe; | ||
958 | |||
959 | if (!inode || !uc) | ||
960 | return; | ||
961 | |||
962 | uprobe = find_uprobe(inode, offset); | ||
963 | if (!uprobe) | ||
964 | return; | ||
965 | |||
966 | mutex_lock(uprobes_hash(inode)); | ||
967 | |||
968 | if (consumer_del(uprobe, uc)) { | ||
969 | if (!uprobe->consumers) { | ||
970 | __uprobe_unregister(uprobe); | ||
971 | uprobe->flags &= ~UPROBE_RUN_HANDLER; | ||
972 | } | ||
973 | } | ||
974 | |||
975 | mutex_unlock(uprobes_hash(inode)); | ||
976 | if (uprobe) | ||
977 | put_uprobe(uprobe); | ||
978 | } | ||
979 | |||
980 | /* | ||
981 | * Of all the nodes that correspond to the given inode, return the node | ||
982 | * with the least offset. | ||
983 | */ | ||
984 | static struct rb_node *find_least_offset_node(struct inode *inode) | ||
985 | { | ||
986 | struct uprobe u = { .inode = inode, .offset = 0}; | ||
987 | struct rb_node *n = uprobes_tree.rb_node; | ||
988 | struct rb_node *close_node = NULL; | ||
989 | struct uprobe *uprobe; | ||
990 | int match; | ||
991 | |||
992 | while (n) { | ||
993 | uprobe = rb_entry(n, struct uprobe, rb_node); | ||
994 | match = match_uprobe(&u, uprobe); | ||
995 | |||
996 | if (uprobe->inode == inode) | ||
997 | close_node = n; | ||
998 | |||
999 | if (!match) | ||
1000 | return close_node; | ||
1001 | |||
1002 | if (match < 0) | ||
1003 | n = n->rb_left; | ||
1004 | else | ||
1005 | n = n->rb_right; | ||
1006 | } | ||
1007 | |||
1008 | return close_node; | ||
1009 | } | ||
1010 | |||
1011 | /* | ||
1012 | * For a given inode, build a list of probes that need to be inserted. | ||
1013 | */ | ||
1014 | static void build_probe_list(struct inode *inode, struct list_head *head) | ||
1015 | { | ||
1016 | struct uprobe *uprobe; | ||
1017 | unsigned long flags; | ||
1018 | struct rb_node *n; | ||
1019 | |||
1020 | spin_lock_irqsave(&uprobes_treelock, flags); | ||
1021 | |||
1022 | n = find_least_offset_node(inode); | ||
1023 | |||
1024 | for (; n; n = rb_next(n)) { | ||
1025 | uprobe = rb_entry(n, struct uprobe, rb_node); | ||
1026 | if (uprobe->inode != inode) | ||
1027 | break; | ||
1028 | |||
1029 | list_add(&uprobe->pending_list, head); | ||
1030 | atomic_inc(&uprobe->ref); | ||
1031 | } | ||
1032 | |||
1033 | spin_unlock_irqrestore(&uprobes_treelock, flags); | ||
1034 | } | ||
1035 | |||
1036 | /* | ||
1037 | * Called from mmap_region() and dup_mmap(), | ||
1038 | * with mm->mmap_sem acquired. | ||
1039 | * | ||
1040 | * Return a negative errno if we fail to insert probes and cannot | ||
1041 | * bail out. | ||
1042 | * Return 0 otherwise, i.e. on: | ||
1043 | * | ||
1044 | * - successful insertion of probes, | ||
1045 | * - (or) no probes to insert, | ||
1046 | * - (or) failed insertion of probes from which we can bail out. | ||
1047 | */ | ||
1048 | int uprobe_mmap(struct vm_area_struct *vma) | ||
1049 | { | ||
1050 | struct list_head tmp_list; | ||
1051 | struct uprobe *uprobe, *u; | ||
1052 | struct inode *inode; | ||
1053 | int ret, count; | ||
1054 | |||
1055 | if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) | ||
1056 | return 0; | ||
1057 | |||
1058 | inode = vma->vm_file->f_mapping->host; | ||
1059 | if (!inode) | ||
1060 | return 0; | ||
1061 | |||
1062 | INIT_LIST_HEAD(&tmp_list); | ||
1063 | mutex_lock(uprobes_mmap_hash(inode)); | ||
1064 | build_probe_list(inode, &tmp_list); | ||
1065 | |||
1066 | ret = 0; | ||
1067 | count = 0; | ||
1068 | |||
1069 | list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { | ||
1070 | loff_t vaddr; | ||
1071 | |||
1072 | list_del(&uprobe->pending_list); | ||
1073 | if (!ret) { | ||
1074 | vaddr = vma_address(vma, uprobe->offset); | ||
1075 | |||
1076 | if (vaddr < vma->vm_start || vaddr >= vma->vm_end) { | ||
1077 | put_uprobe(uprobe); | ||
1078 | continue; | ||
1079 | } | ||
1080 | |||
1081 | ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); | ||
1082 | |||
1083 | /* Ignore double add: */ | ||
1084 | if (ret == -EEXIST) { | ||
1085 | ret = 0; | ||
1086 | |||
1087 | if (!is_swbp_at_addr(vma->vm_mm, vaddr)) | ||
1088 | continue; | ||
1089 | |||
1090 | /* | ||
1091 | * Unable to insert a breakpoint, but | ||
1092 | * breakpoint lies underneath. Increment the | ||
1093 | * probe count. | ||
1094 | */ | ||
1095 | atomic_inc(&vma->vm_mm->uprobes_state.count); | ||
1096 | } | ||
1097 | |||
1098 | if (!ret) | ||
1099 | count++; | ||
1100 | } | ||
1101 | put_uprobe(uprobe); | ||
1102 | } | ||
1103 | |||
1104 | mutex_unlock(uprobes_mmap_hash(inode)); | ||
1105 | |||
1106 | if (ret) | ||
1107 | atomic_sub(count, &vma->vm_mm->uprobes_state.count); | ||
1108 | |||
1109 | return ret; | ||
1110 | } | ||
1111 | |||
1112 | /* | ||
1113 | * Called in context of a munmap of a vma. | ||
1114 | */ | ||
1115 | void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) | ||
1116 | { | ||
1117 | struct list_head tmp_list; | ||
1118 | struct uprobe *uprobe, *u; | ||
1119 | struct inode *inode; | ||
1120 | |||
1121 | if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) | ||
1122 | return; | ||
1123 | |||
1124 | if (!atomic_read(&vma->vm_mm->uprobes_state.count)) | ||
1125 | return; | ||
1126 | |||
1127 | inode = vma->vm_file->f_mapping->host; | ||
1128 | if (!inode) | ||
1129 | return; | ||
1130 | |||
1131 | INIT_LIST_HEAD(&tmp_list); | ||
1132 | mutex_lock(uprobes_mmap_hash(inode)); | ||
1133 | build_probe_list(inode, &tmp_list); | ||
1134 | |||
1135 | list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { | ||
1136 | loff_t vaddr; | ||
1137 | |||
1138 | list_del(&uprobe->pending_list); | ||
1139 | vaddr = vma_address(vma, uprobe->offset); | ||
1140 | |||
1141 | if (vaddr >= start && vaddr < end) { | ||
1142 | /* | ||
1143 | * An unregister could have removed the probe before | ||
1144 | * unmap. So check before we decrement the count. | ||
1145 | */ | ||
1146 | if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1) | ||
1147 | atomic_dec(&vma->vm_mm->uprobes_state.count); | ||
1148 | } | ||
1149 | put_uprobe(uprobe); | ||
1150 | } | ||
1151 | mutex_unlock(uprobes_mmap_hash(inode)); | ||
1152 | } | ||
1153 | |||
1154 | /* Slot allocation for XOL */ | ||
1155 | static int xol_add_vma(struct xol_area *area) | ||
1156 | { | ||
1157 | struct mm_struct *mm; | ||
1158 | int ret; | ||
1159 | |||
1160 | area->page = alloc_page(GFP_HIGHUSER); | ||
1161 | if (!area->page) | ||
1162 | return -ENOMEM; | ||
1163 | |||
1164 | ret = -EALREADY; | ||
1165 | mm = current->mm; | ||
1166 | |||
1167 | down_write(&mm->mmap_sem); | ||
1168 | if (mm->uprobes_state.xol_area) | ||
1169 | goto fail; | ||
1170 | |||
1171 | ret = -ENOMEM; | ||
1172 | |||
1173 | /* Try to map as high as possible; this is only a hint. */ | ||
1174 | area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); | ||
1175 | if (area->vaddr & ~PAGE_MASK) { | ||
1176 | ret = area->vaddr; | ||
1177 | goto fail; | ||
1178 | } | ||
1179 | |||
1180 | ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, | ||
1181 | VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page); | ||
1182 | if (ret) | ||
1183 | goto fail; | ||
1184 | |||
1185 | smp_wmb(); /* pairs with get_xol_area() */ | ||
1186 | mm->uprobes_state.xol_area = area; | ||
1187 | ret = 0; | ||
1188 | |||
1189 | fail: | ||
1190 | up_write(&mm->mmap_sem); | ||
1191 | if (ret) | ||
1192 | __free_page(area->page); | ||
1193 | |||
1194 | return ret; | ||
1195 | } | ||
1196 | |||
1197 | static struct xol_area *get_xol_area(struct mm_struct *mm) | ||
1198 | { | ||
1199 | struct xol_area *area; | ||
1200 | |||
1201 | area = mm->uprobes_state.xol_area; | ||
1202 | smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ | ||
1203 | |||
1204 | return area; | ||
1205 | } | ||
1206 | |||
1207 | /* | ||
1208 | * xol_alloc_area - Allocate process's xol_area. | ||
1209 | * This area will be used for storing instructions for execution out of | ||
1210 | * line. | ||
1211 | * | ||
1212 | * Returns the allocated area or NULL. | ||
1213 | */ | ||
1214 | static struct xol_area *xol_alloc_area(void) | ||
1215 | { | ||
1216 | struct xol_area *area; | ||
1217 | |||
1218 | area = kzalloc(sizeof(*area), GFP_KERNEL); | ||
1219 | if (unlikely(!area)) | ||
1220 | return NULL; | ||
1221 | |||
1222 | area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); | ||
1223 | |||
1224 | if (!area->bitmap) | ||
1225 | goto fail; | ||
1226 | |||
1227 | init_waitqueue_head(&area->wq); | ||
1228 | if (!xol_add_vma(area)) | ||
1229 | return area; | ||
1230 | |||
1231 | fail: | ||
1232 | kfree(area->bitmap); | ||
1233 | kfree(area); | ||
1234 | |||
1235 | return get_xol_area(current->mm); | ||
1236 | } | ||
1237 | |||
1238 | /* | ||
1239 | * uprobe_clear_state - Free the area allocated for slots. | ||
1240 | */ | ||
1241 | void uprobe_clear_state(struct mm_struct *mm) | ||
1242 | { | ||
1243 | struct xol_area *area = mm->uprobes_state.xol_area; | ||
1244 | |||
1245 | if (!area) | ||
1246 | return; | ||
1247 | |||
1248 | put_page(area->page); | ||
1249 | kfree(area->bitmap); | ||
1250 | kfree(area); | ||
1251 | } | ||
1252 | |||
1253 | /* | ||
1254 | * uprobe_reset_state - Free the area allocated for slots. | ||
1255 | */ | ||
1256 | void uprobe_reset_state(struct mm_struct *mm) | ||
1257 | { | ||
1258 | mm->uprobes_state.xol_area = NULL; | ||
1259 | atomic_set(&mm->uprobes_state.count, 0); | ||
1260 | } | ||
1261 | |||
1262 | /* | ||
1263 | * - search for a free slot. | ||
1264 | */ | ||
1265 | static unsigned long xol_take_insn_slot(struct xol_area *area) | ||
1266 | { | ||
1267 | unsigned long slot_addr; | ||
1268 | int slot_nr; | ||
1269 | |||
1270 | do { | ||
1271 | slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); | ||
1272 | if (slot_nr < UINSNS_PER_PAGE) { | ||
1273 | if (!test_and_set_bit(slot_nr, area->bitmap)) | ||
1274 | break; | ||
1275 | |||
1276 | slot_nr = UINSNS_PER_PAGE; | ||
1277 | continue; | ||
1278 | } | ||
1279 | wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE)); | ||
1280 | } while (slot_nr >= UINSNS_PER_PAGE); | ||
1281 | |||
1282 | slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES); | ||
1283 | atomic_inc(&area->slot_count); | ||
1284 | |||
1285 | return slot_addr; | ||
1286 | } | ||
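To make the slot arithmetic concrete, assume a hypothetical area->vaddr of 0x7f0000000000 and a slot size (UPROBE_XOL_SLOT_BYTES) of 128 bytes; the exact constant is architecture/configuration dependent, so treat the numbers as an example only:

	/* slot_nr == 5 in that area: */
	slot_addr = 0x7f0000000000UL + (5 * 128);	/* == 0x7f0000000280 */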
1287 | |||
1288 | /* | ||
1289 | * xol_get_insn_slot - allocate an instruction slot from the per-mm XOL area | ||
1290 | * (creating the area if needed) and copy the probed instruction into it. | ||
1291 | * Returns the allocated slot address or 0. | ||
1292 | */ | ||
1293 | static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr) | ||
1294 | { | ||
1295 | struct xol_area *area; | ||
1296 | unsigned long offset; | ||
1297 | void *vaddr; | ||
1298 | |||
1299 | area = get_xol_area(current->mm); | ||
1300 | if (!area) { | ||
1301 | area = xol_alloc_area(); | ||
1302 | if (!area) | ||
1303 | return 0; | ||
1304 | } | ||
1305 | current->utask->xol_vaddr = xol_take_insn_slot(area); | ||
1306 | |||
1307 | /* | ||
1308 | * Initialize the slot if xol_vaddr points to valid | ||
1309 | * instruction slot. | ||
1310 | */ | ||
1311 | if (unlikely(!current->utask->xol_vaddr)) | ||
1312 | return 0; | ||
1313 | |||
1314 | current->utask->vaddr = slot_addr; | ||
1315 | offset = current->utask->xol_vaddr & ~PAGE_MASK; | ||
1316 | vaddr = kmap_atomic(area->page); | ||
1317 | memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); | ||
1318 | kunmap_atomic(vaddr); | ||
1319 | |||
1320 | return current->utask->xol_vaddr; | ||
1321 | } | ||
1322 | |||
1323 | /* | ||
1324 | * xol_free_insn_slot - If a slot was earlier allocated by | ||
1325 | * xol_get_insn_slot(), make the slot available for | ||
1326 | * subsequent requests. | ||
1327 | */ | ||
1328 | static void xol_free_insn_slot(struct task_struct *tsk) | ||
1329 | { | ||
1330 | struct xol_area *area; | ||
1331 | unsigned long vma_end; | ||
1332 | unsigned long slot_addr; | ||
1333 | |||
1334 | if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask) | ||
1335 | return; | ||
1336 | |||
1337 | slot_addr = tsk->utask->xol_vaddr; | ||
1338 | |||
1339 | if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr))) | ||
1340 | return; | ||
1341 | |||
1342 | area = tsk->mm->uprobes_state.xol_area; | ||
1343 | vma_end = area->vaddr + PAGE_SIZE; | ||
1344 | if (area->vaddr <= slot_addr && slot_addr < vma_end) { | ||
1345 | unsigned long offset; | ||
1346 | int slot_nr; | ||
1347 | |||
1348 | offset = slot_addr - area->vaddr; | ||
1349 | slot_nr = offset / UPROBE_XOL_SLOT_BYTES; | ||
1350 | if (slot_nr >= UINSNS_PER_PAGE) | ||
1351 | return; | ||
1352 | |||
1353 | clear_bit(slot_nr, area->bitmap); | ||
1354 | atomic_dec(&area->slot_count); | ||
1355 | if (waitqueue_active(&area->wq)) | ||
1356 | wake_up(&area->wq); | ||
1357 | |||
1358 | tsk->utask->xol_vaddr = 0; | ||
1359 | } | ||
1360 | } | ||
1361 | |||
1362 | /** | ||
1363 | * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs | ||
1364 | * @regs: Reflects the saved state of the task after it has hit a breakpoint | ||
1365 | * instruction. | ||
1366 | * Return the address of the breakpoint instruction. | ||
1367 | */ | ||
1368 | unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) | ||
1369 | { | ||
1370 | return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; | ||
1371 | } | ||
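A quick worked example, assuming x86 where the software breakpoint is the one-byte int3 (UPROBE_SWBP_INSN_SIZE == 1): if the trap leaves regs->ip at 0x400101, the probed address reported here is 0x400100. Architectures with larger breakpoint instructions override this weak default accordingly.

	bp_vaddr = instruction_pointer(regs) - 1;	/* e.g. 0x400101 - 1 == 0x400100 */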
1372 | |||
1373 | /* | ||
1374 | * Called with no locks held. | ||
1375 | * Called in context of an exiting or an exec-ing thread. | ||
1376 | */ | ||
1377 | void uprobe_free_utask(struct task_struct *t) | ||
1378 | { | ||
1379 | struct uprobe_task *utask = t->utask; | ||
1380 | |||
1381 | if (t->uprobe_srcu_id != -1) | ||
1382 | srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id); | ||
1383 | |||
1384 | if (!utask) | ||
1385 | return; | ||
1386 | |||
1387 | if (utask->active_uprobe) | ||
1388 | put_uprobe(utask->active_uprobe); | ||
1389 | |||
1390 | xol_free_insn_slot(t); | ||
1391 | kfree(utask); | ||
1392 | t->utask = NULL; | ||
1393 | } | ||
1394 | |||
1395 | /* | ||
1396 | * Called in context of a new clone/fork from copy_process. | ||
1397 | */ | ||
1398 | void uprobe_copy_process(struct task_struct *t) | ||
1399 | { | ||
1400 | t->utask = NULL; | ||
1401 | t->uprobe_srcu_id = -1; | ||
1402 | } | ||
1403 | |||
1404 | /* | ||
1405 | * Allocate a uprobe_task object for the task. | ||
1406 | * Called when the thread hits a breakpoint for the first time. | ||
1407 | * | ||
1408 | * Returns: | ||
1409 | * - pointer to new uprobe_task on success | ||
1410 | * - NULL otherwise | ||
1411 | */ | ||
1412 | static struct uprobe_task *add_utask(void) | ||
1413 | { | ||
1414 | struct uprobe_task *utask; | ||
1415 | |||
1416 | utask = kzalloc(sizeof *utask, GFP_KERNEL); | ||
1417 | if (unlikely(!utask)) | ||
1418 | return NULL; | ||
1419 | |||
1420 | utask->active_uprobe = NULL; | ||
1421 | current->utask = utask; | ||
1422 | return utask; | ||
1423 | } | ||
1424 | |||
1425 | /* Prepare to single-step probed instruction out of line. */ | ||
1426 | static int | ||
1427 | pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr) | ||
1428 | { | ||
1429 | if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs)) | ||
1430 | return 0; | ||
1431 | |||
1432 | return -EFAULT; | ||
1433 | } | ||
1434 | |||
1435 | /* | ||
1436 | * If we are singlestepping, then ensure this thread is not connected to | ||
1437 | * non-fatal signals until completion of singlestep. When the xol insn itself | ||
1438 | * triggers a signal, restart the original insn even if the task is | ||
1439 | * already SIGKILL'ed (since the coredump should report the correct ip). This | ||
1440 | * is even more important if the task has a handler for SIGSEGV/etc: the | ||
1441 | * _same_ instruction should be repeated again after return from the signal | ||
1442 | * handler, and SSTEP can never finish in this case. | ||
1443 | */ | ||
1444 | bool uprobe_deny_signal(void) | ||
1445 | { | ||
1446 | struct task_struct *t = current; | ||
1447 | struct uprobe_task *utask = t->utask; | ||
1448 | |||
1449 | if (likely(!utask || !utask->active_uprobe)) | ||
1450 | return false; | ||
1451 | |||
1452 | WARN_ON_ONCE(utask->state != UTASK_SSTEP); | ||
1453 | |||
1454 | if (signal_pending(t)) { | ||
1455 | spin_lock_irq(&t->sighand->siglock); | ||
1456 | clear_tsk_thread_flag(t, TIF_SIGPENDING); | ||
1457 | spin_unlock_irq(&t->sighand->siglock); | ||
1458 | |||
1459 | if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { | ||
1460 | utask->state = UTASK_SSTEP_TRAPPED; | ||
1461 | set_tsk_thread_flag(t, TIF_UPROBE); | ||
1462 | set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); | ||
1463 | } | ||
1464 | } | ||
1465 | |||
1466 | return true; | ||
1467 | } | ||
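For context, the signal-delivery path consults this helper early on; the call site added elsewhere in this series looks roughly like the following paraphrased sketch (see kernel/signal.c for the authoritative code):

	/* in get_signal_to_deliver(), paraphrased: */
	if (unlikely(uprobe_deny_signal()))
		return 0;	/* signal stays pending until the singlestep completes */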
1468 | |||
1469 | /* | ||
1470 | * Avoid singlestepping the original instruction if the original instruction | ||
1471 | * is a NOP or can be emulated. | ||
1472 | */ | ||
1473 | static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) | ||
1474 | { | ||
1475 | if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) | ||
1476 | return true; | ||
1477 | |||
1478 | uprobe->flags &= ~UPROBE_SKIP_SSTEP; | ||
1479 | return false; | ||
1480 | } | ||
1481 | |||
1482 | /* | ||
1483 | * Run handler and ask thread to singlestep. | ||
1484 | * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. | ||
1485 | */ | ||
1486 | static void handle_swbp(struct pt_regs *regs) | ||
1487 | { | ||
1488 | struct vm_area_struct *vma; | ||
1489 | struct uprobe_task *utask; | ||
1490 | struct uprobe *uprobe; | ||
1491 | struct mm_struct *mm; | ||
1492 | unsigned long bp_vaddr; | ||
1493 | |||
1494 | uprobe = NULL; | ||
1495 | bp_vaddr = uprobe_get_swbp_addr(regs); | ||
1496 | mm = current->mm; | ||
1497 | down_read(&mm->mmap_sem); | ||
1498 | vma = find_vma(mm, bp_vaddr); | ||
1499 | |||
1500 | if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) { | ||
1501 | struct inode *inode; | ||
1502 | loff_t offset; | ||
1503 | |||
1504 | inode = vma->vm_file->f_mapping->host; | ||
1505 | offset = bp_vaddr - vma->vm_start; | ||
1506 | offset += (vma->vm_pgoff << PAGE_SHIFT); | ||
1507 | uprobe = find_uprobe(inode, offset); | ||
1508 | } | ||
1509 | |||
1510 | srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); | ||
1511 | current->uprobe_srcu_id = -1; | ||
1512 | up_read(&mm->mmap_sem); | ||
1513 | |||
1514 | if (!uprobe) { | ||
1515 | /* No matching uprobe; signal SIGTRAP. */ | ||
1516 | send_sig(SIGTRAP, current, 0); | ||
1517 | return; | ||
1518 | } | ||
1519 | |||
1520 | utask = current->utask; | ||
1521 | if (!utask) { | ||
1522 | utask = add_utask(); | ||
1523 | /* Cannot allocate; re-execute the instruction. */ | ||
1524 | if (!utask) | ||
1525 | goto cleanup_ret; | ||
1526 | } | ||
1527 | utask->active_uprobe = uprobe; | ||
1528 | handler_chain(uprobe, regs); | ||
1529 | if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs)) | ||
1530 | goto cleanup_ret; | ||
1531 | |||
1532 | utask->state = UTASK_SSTEP; | ||
1533 | if (!pre_ssout(uprobe, regs, bp_vaddr)) { | ||
1534 | user_enable_single_step(current); | ||
1535 | return; | ||
1536 | } | ||
1537 | |||
1538 | cleanup_ret: | ||
1539 | if (utask) { | ||
1540 | utask->active_uprobe = NULL; | ||
1541 | utask->state = UTASK_RUNNING; | ||
1542 | } | ||
1543 | if (uprobe) { | ||
1544 | if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) | ||
1545 | |||
1546 | /* | ||
1547 | * cannot singlestep; cannot skip instruction; | ||
1548 | * re-execute the instruction. | ||
1549 | */ | ||
1550 | instruction_pointer_set(regs, bp_vaddr); | ||
1551 | |||
1552 | put_uprobe(uprobe); | ||
1553 | } | ||
1554 | } | ||
1555 | |||
1556 | /* | ||
1557 | * Perform required fix-ups and disable singlestep. | ||
1558 | * Allow pending signals to take effect. | ||
1559 | */ | ||
1560 | static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) | ||
1561 | { | ||
1562 | struct uprobe *uprobe; | ||
1563 | |||
1564 | uprobe = utask->active_uprobe; | ||
1565 | if (utask->state == UTASK_SSTEP_ACK) | ||
1566 | arch_uprobe_post_xol(&uprobe->arch, regs); | ||
1567 | else if (utask->state == UTASK_SSTEP_TRAPPED) | ||
1568 | arch_uprobe_abort_xol(&uprobe->arch, regs); | ||
1569 | else | ||
1570 | WARN_ON_ONCE(1); | ||
1571 | |||
1572 | put_uprobe(uprobe); | ||
1573 | utask->active_uprobe = NULL; | ||
1574 | utask->state = UTASK_RUNNING; | ||
1575 | user_disable_single_step(current); | ||
1576 | xol_free_insn_slot(current); | ||
1577 | |||
1578 | spin_lock_irq(¤t->sighand->siglock); | ||
1579 | recalc_sigpending(); /* see uprobe_deny_signal() */ | ||
1580 | spin_unlock_irq(¤t->sighand->siglock); | ||
1581 | } | ||
1582 | |||
1583 | /* | ||
1584 | * On a breakpoint hit, the breakpoint notifier sets the TIF_UPROBE flag (and, | ||
1585 | * on subsequent probe hits on the thread, sets the state to UTASK_BP_HIT) and | ||
1586 | * allows the thread to return from the interrupt. | ||
1587 | * | ||
1588 | * On a singlestep exception, the singlestep notifier sets the TIF_UPROBE flag, | ||
1589 | * sets the state to UTASK_SSTEP_ACK and allows the thread to return from the | ||
1590 | * interrupt. | ||
1591 | * | ||
1592 | * While returning to userspace, the thread notices the TIF_UPROBE flag and | ||
1593 | * calls uprobe_notify_resume(). | ||
1594 | */ | ||
1595 | void uprobe_notify_resume(struct pt_regs *regs) | ||
1596 | { | ||
1597 | struct uprobe_task *utask; | ||
1598 | |||
1599 | utask = current->utask; | ||
1600 | if (!utask || utask->state == UTASK_BP_HIT) | ||
1601 | handle_swbp(regs); | ||
1602 | else | ||
1603 | handle_singlestep(utask, regs); | ||
1604 | } | ||
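The architecture code is expected to call this from its return-to-userspace path. On x86, for instance, do_notify_resume() does roughly the following (paraphrased sketch, not part of this file):

	if (thread_info_flags & _TIF_UPROBE) {
		clear_thread_flag(TIF_UPROBE);
		uprobe_notify_resume(regs);
	}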
1605 | |||
1606 | /* | ||
1607 | * uprobe_pre_sstep_notifier gets called from interrupt context as part of the | ||
1608 | * notifier mechanism. It sets the TIF_UPROBE flag and indicates a breakpoint hit. | ||
1609 | */ | ||
1610 | int uprobe_pre_sstep_notifier(struct pt_regs *regs) | ||
1611 | { | ||
1612 | struct uprobe_task *utask; | ||
1613 | |||
1614 | if (!current->mm || !atomic_read(¤t->mm->uprobes_state.count)) | ||
1615 | /* task is currently not uprobed */ | ||
1616 | return 0; | ||
1617 | |||
1618 | utask = current->utask; | ||
1619 | if (utask) | ||
1620 | utask->state = UTASK_BP_HIT; | ||
1621 | |||
1622 | set_thread_flag(TIF_UPROBE); | ||
1623 | current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu); | ||
1624 | |||
1625 | return 1; | ||
1626 | } | ||
1627 | |||
1628 | /* | ||
1629 | * uprobe_post_sstep_notifier gets called in interrupt context as part of the | ||
1630 | * notifier mechanism. It sets the TIF_UPROBE flag and indicates completion of singlestep. | ||
1631 | */ | ||
1632 | int uprobe_post_sstep_notifier(struct pt_regs *regs) | ||
1633 | { | ||
1634 | struct uprobe_task *utask = current->utask; | ||
1635 | |||
1636 | if (!current->mm || !utask || !utask->active_uprobe) | ||
1637 | /* task is currently not uprobed */ | ||
1638 | return 0; | ||
1639 | |||
1640 | utask->state = UTASK_SSTEP_ACK; | ||
1641 | set_thread_flag(TIF_UPROBE); | ||
1642 | return 1; | ||
1643 | } | ||
1644 | |||
1645 | static struct notifier_block uprobe_exception_nb = { | ||
1646 | .notifier_call = arch_uprobe_exception_notify, | ||
1647 | .priority = INT_MAX-1, /* notified after kprobes, kgdb */ | ||
1648 | }; | ||
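arch_uprobe_exception_notify() is supplied per architecture; the x86 version, roughly, routes die notifications to the two notifier hooks above (paraphrased sketch):

	switch (val) {
	case DIE_INT3:				/* breakpoint trap */
		if (uprobe_pre_sstep_notifier(regs))
			return NOTIFY_STOP;
		break;
	case DIE_DEBUG:				/* singlestep trap */
		if (uprobe_post_sstep_notifier(regs))
			return NOTIFY_STOP;
		break;
	}
	return NOTIFY_DONE;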
1649 | |||
1650 | static int __init init_uprobes(void) | ||
1651 | { | ||
1652 | int i; | ||
1653 | |||
1654 | for (i = 0; i < UPROBES_HASH_SZ; i++) { | ||
1655 | mutex_init(&uprobes_mutex[i]); | ||
1656 | mutex_init(&uprobes_mmap_mutex[i]); | ||
1657 | } | ||
1658 | init_srcu_struct(&uprobes_srcu); | ||
1659 | |||
1660 | return register_die_notifier(&uprobe_exception_nb); | ||
1661 | } | ||
1662 | module_init(init_uprobes); | ||
1663 | |||
1664 | static void __exit exit_uprobes(void) | ||
1665 | { | ||
1666 | } | ||
1667 | module_exit(exit_uprobes); | ||
diff --git a/kernel/exit.c b/kernel/exit.c
index d8bd3b425fa7..2f59cc334516 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,6 +72,18 @@ static void __unhash_process(struct task_struct *p, bool group_dead) | |||
72 | list_del_rcu(&p->tasks); | 72 | list_del_rcu(&p->tasks); |
73 | list_del_init(&p->sibling); | 73 | list_del_init(&p->sibling); |
74 | __this_cpu_dec(process_counts); | 74 | __this_cpu_dec(process_counts); |
75 | /* | ||
76 | * If we are the last child process in a pid namespace to be | ||
77 | * reaped, notify the reaper sleeping in zap_pid_ns_processes(). | ||
78 | */ | ||
79 | if (IS_ENABLED(CONFIG_PID_NS)) { | ||
80 | struct task_struct *parent = p->real_parent; | ||
81 | |||
82 | if ((task_active_pid_ns(parent)->child_reaper == parent) && | ||
83 | list_empty(&parent->children) && | ||
84 | (parent->flags & PF_EXITING)) | ||
85 | wake_up_process(parent); | ||
86 | } | ||
75 | } | 87 | } |
76 | list_del_rcu(&p->thread_group); | 88 | list_del_rcu(&p->thread_group); |
77 | } | 89 | } |
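The wake_up_process() added above pairs with a wait loop on the zap_pid_ns_processes() side. Simplified (and omitting the tasklist_lock protection of the real code), the namespace reaper ends up doing something like this sketch of the counterpart change in kernel/pid_namespace.c, which is not shown in this diff:

	/* after reaping all zombie children via sys_wait4(): */
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (list_empty(&current->children))
			break;
		schedule();			/* woken by __unhash_process() */
	}
	__set_current_state(TASK_RUNNING);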
@@ -643,6 +655,7 @@ static void exit_mm(struct task_struct * tsk) | |||
643 | mm_release(tsk, mm); | 655 | mm_release(tsk, mm); |
644 | if (!mm) | 656 | if (!mm) |
645 | return; | 657 | return; |
658 | sync_mm_rss(mm); | ||
646 | /* | 659 | /* |
647 | * Serialize with any possible pending coredump. | 660 | * Serialize with any possible pending coredump. |
648 | * We must hold mmap_sem around checking core_state | 661 | * We must hold mmap_sem around checking core_state |
@@ -719,12 +732,6 @@ static struct task_struct *find_new_reaper(struct task_struct *father) | |||
719 | 732 | ||
720 | zap_pid_ns_processes(pid_ns); | 733 | zap_pid_ns_processes(pid_ns); |
721 | write_lock_irq(&tasklist_lock); | 734 | write_lock_irq(&tasklist_lock); |
722 | /* | ||
723 | * We can not clear ->child_reaper or leave it alone. | ||
724 | * There may by stealth EXIT_DEAD tasks on ->children, | ||
725 | * forget_original_parent() must move them somewhere. | ||
726 | */ | ||
727 | pid_ns->child_reaper = init_pid_ns.child_reaper; | ||
728 | } else if (father->signal->has_child_subreaper) { | 735 | } else if (father->signal->has_child_subreaper) { |
729 | struct task_struct *reaper; | 736 | struct task_struct *reaper; |
730 | 737 | ||
@@ -884,9 +891,9 @@ static void check_stack_usage(void) | |||
884 | 891 | ||
885 | spin_lock(&low_water_lock); | 892 | spin_lock(&low_water_lock); |
886 | if (free < lowest_to_date) { | 893 | if (free < lowest_to_date) { |
887 | printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " | 894 | printk(KERN_WARNING "%s (%d) used greatest stack depth: " |
888 | "left\n", | 895 | "%lu bytes left\n", |
889 | current->comm, free); | 896 | current->comm, task_pid_nr(current), free); |
890 | lowest_to_date = free; | 897 | lowest_to_date = free; |
891 | } | 898 | } |
892 | spin_unlock(&low_water_lock); | 899 | spin_unlock(&low_water_lock); |
@@ -946,12 +953,13 @@ void do_exit(long code) | |||
946 | exit_signals(tsk); /* sets PF_EXITING */ | 953 | exit_signals(tsk); /* sets PF_EXITING */ |
947 | /* | 954 | /* |
948 | * tsk->flags are checked in the futex code to protect against | 955 | * tsk->flags are checked in the futex code to protect against |
949 | * an exiting task cleaning up the robust pi futexes. | 956 | * an exiting task cleaning up the robust pi futexes, and in |
957 | * task_work_add() to avoid the race with exit_task_work(). | ||
950 | */ | 958 | */ |
951 | smp_mb(); | 959 | smp_mb(); |
952 | raw_spin_unlock_wait(&tsk->pi_lock); | 960 | raw_spin_unlock_wait(&tsk->pi_lock); |
953 | 961 | ||
954 | exit_irq_thread(); | 962 | exit_task_work(tsk); |
955 | 963 | ||
956 | if (unlikely(in_atomic())) | 964 | if (unlikely(in_atomic())) |
957 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", | 965 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", |
@@ -1214,7 +1222,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1214 | unsigned long state; | 1222 | unsigned long state; |
1215 | int retval, status, traced; | 1223 | int retval, status, traced; |
1216 | pid_t pid = task_pid_vnr(p); | 1224 | pid_t pid = task_pid_vnr(p); |
1217 | uid_t uid = __task_cred(p)->uid; | 1225 | uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); |
1218 | struct siginfo __user *infop; | 1226 | struct siginfo __user *infop; |
1219 | 1227 | ||
1220 | if (!likely(wo->wo_flags & WEXITED)) | 1228 | if (!likely(wo->wo_flags & WEXITED)) |
@@ -1427,7 +1435,7 @@ static int wait_task_stopped(struct wait_opts *wo, | |||
1427 | if (!unlikely(wo->wo_flags & WNOWAIT)) | 1435 | if (!unlikely(wo->wo_flags & WNOWAIT)) |
1428 | *p_code = 0; | 1436 | *p_code = 0; |
1429 | 1437 | ||
1430 | uid = task_uid(p); | 1438 | uid = from_kuid_munged(current_user_ns(), task_uid(p)); |
1431 | unlock_sig: | 1439 | unlock_sig: |
1432 | spin_unlock_irq(&p->sighand->siglock); | 1440 | spin_unlock_irq(&p->sighand->siglock); |
1433 | if (!exit_code) | 1441 | if (!exit_code) |
@@ -1500,7 +1508,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) | |||
1500 | } | 1508 | } |
1501 | if (!unlikely(wo->wo_flags & WNOWAIT)) | 1509 | if (!unlikely(wo->wo_flags & WNOWAIT)) |
1502 | p->signal->flags &= ~SIGNAL_STOP_CONTINUED; | 1510 | p->signal->flags &= ~SIGNAL_STOP_CONTINUED; |
1503 | uid = task_uid(p); | 1511 | uid = from_kuid_munged(current_user_ns(), task_uid(p)); |
1504 | spin_unlock_irq(&p->sighand->siglock); | 1512 | spin_unlock_irq(&p->sighand->siglock); |
1505 | 1513 | ||
1506 | pid = task_pid_vnr(p); | 1514 | pid = task_pid_vnr(p); |
diff --git a/kernel/extable.c b/kernel/extable.c
index 5339705b8241..fe35a634bf76 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -35,10 +35,16 @@ DEFINE_MUTEX(text_mutex); | |||
35 | extern struct exception_table_entry __start___ex_table[]; | 35 | extern struct exception_table_entry __start___ex_table[]; |
36 | extern struct exception_table_entry __stop___ex_table[]; | 36 | extern struct exception_table_entry __stop___ex_table[]; |
37 | 37 | ||
38 | /* Cleared by build-time tools if the table is already sorted. */ | ||
39 | u32 __initdata main_extable_sort_needed = 1; | ||
40 | |||
38 | /* Sort the kernel's built-in exception table */ | 41 | /* Sort the kernel's built-in exception table */ |
39 | void __init sort_main_extable(void) | 42 | void __init sort_main_extable(void) |
40 | { | 43 | { |
41 | sort_extable(__start___ex_table, __stop___ex_table); | 44 | if (main_extable_sort_needed) |
45 | sort_extable(__start___ex_table, __stop___ex_table); | ||
46 | else | ||
47 | pr_notice("__ex_table already sorted, skipping sort\n"); | ||
42 | } | 48 | } |
43 | 49 | ||
44 | /* Given an address, look for it in the exception tables. */ | 50 | /* Given an address, look for it in the exception tables. */ |
diff --git a/kernel/fork.c b/kernel/fork.c
index b9372a0bff18..f00e319d8376 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/cgroup.h> | 34 | #include <linux/cgroup.h> |
35 | #include <linux/security.h> | 35 | #include <linux/security.h> |
36 | #include <linux/hugetlb.h> | 36 | #include <linux/hugetlb.h> |
37 | #include <linux/seccomp.h> | ||
37 | #include <linux/swap.h> | 38 | #include <linux/swap.h> |
38 | #include <linux/syscalls.h> | 39 | #include <linux/syscalls.h> |
39 | #include <linux/jiffies.h> | 40 | #include <linux/jiffies.h> |
@@ -47,6 +48,7 @@ | |||
47 | #include <linux/audit.h> | 48 | #include <linux/audit.h> |
48 | #include <linux/memcontrol.h> | 49 | #include <linux/memcontrol.h> |
49 | #include <linux/ftrace.h> | 50 | #include <linux/ftrace.h> |
51 | #include <linux/proc_fs.h> | ||
50 | #include <linux/profile.h> | 52 | #include <linux/profile.h> |
51 | #include <linux/rmap.h> | 53 | #include <linux/rmap.h> |
52 | #include <linux/ksm.h> | 54 | #include <linux/ksm.h> |
@@ -67,6 +69,7 @@ | |||
67 | #include <linux/oom.h> | 69 | #include <linux/oom.h> |
68 | #include <linux/khugepaged.h> | 70 | #include <linux/khugepaged.h> |
69 | #include <linux/signalfd.h> | 71 | #include <linux/signalfd.h> |
72 | #include <linux/uprobes.h> | ||
70 | 73 | ||
71 | #include <asm/pgtable.h> | 74 | #include <asm/pgtable.h> |
72 | #include <asm/pgalloc.h> | 75 | #include <asm/pgalloc.h> |
@@ -111,32 +114,67 @@ int nr_processes(void) | |||
111 | return total; | 114 | return total; |
112 | } | 115 | } |
113 | 116 | ||
114 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | 117 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
115 | # define alloc_task_struct_node(node) \ | ||
116 | kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) | ||
117 | # define free_task_struct(tsk) \ | ||
118 | kmem_cache_free(task_struct_cachep, (tsk)) | ||
119 | static struct kmem_cache *task_struct_cachep; | 118 | static struct kmem_cache *task_struct_cachep; |
119 | |||
120 | static inline struct task_struct *alloc_task_struct_node(int node) | ||
121 | { | ||
122 | return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); | ||
123 | } | ||
124 | |||
125 | void __weak arch_release_task_struct(struct task_struct *tsk) { } | ||
126 | |||
127 | static inline void free_task_struct(struct task_struct *tsk) | ||
128 | { | ||
129 | arch_release_task_struct(tsk); | ||
130 | kmem_cache_free(task_struct_cachep, tsk); | ||
131 | } | ||
120 | #endif | 132 | #endif |
121 | 133 | ||
122 | #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR | 134 | #ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR |
135 | void __weak arch_release_thread_info(struct thread_info *ti) { } | ||
136 | |||
137 | /* | ||
138 | * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a | ||
139 | * kmemcache based allocator. | ||
140 | */ | ||
141 | # if THREAD_SIZE >= PAGE_SIZE | ||
123 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | 142 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, |
124 | int node) | 143 | int node) |
125 | { | 144 | { |
126 | #ifdef CONFIG_DEBUG_STACK_USAGE | 145 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, |
127 | gfp_t mask = GFP_KERNEL | __GFP_ZERO; | 146 | THREAD_SIZE_ORDER); |
128 | #else | ||
129 | gfp_t mask = GFP_KERNEL; | ||
130 | #endif | ||
131 | struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); | ||
132 | 147 | ||
133 | return page ? page_address(page) : NULL; | 148 | return page ? page_address(page) : NULL; |
134 | } | 149 | } |
135 | 150 | ||
136 | static inline void free_thread_info(struct thread_info *ti) | 151 | static inline void free_thread_info(struct thread_info *ti) |
137 | { | 152 | { |
153 | arch_release_thread_info(ti); | ||
138 | free_pages((unsigned long)ti, THREAD_SIZE_ORDER); | 154 | free_pages((unsigned long)ti, THREAD_SIZE_ORDER); |
139 | } | 155 | } |
156 | # else | ||
157 | static struct kmem_cache *thread_info_cache; | ||
158 | |||
159 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | ||
160 | int node) | ||
161 | { | ||
162 | return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); | ||
163 | } | ||
164 | |||
165 | static void free_thread_info(struct thread_info *ti) | ||
166 | { | ||
167 | arch_release_thread_info(ti); | ||
168 | kmem_cache_free(thread_info_cache, ti); | ||
169 | } | ||
170 | |||
171 | void thread_info_cache_init(void) | ||
172 | { | ||
173 | thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, | ||
174 | THREAD_SIZE, 0, NULL); | ||
175 | BUG_ON(thread_info_cache == NULL); | ||
176 | } | ||
177 | # endif | ||
140 | #endif | 178 | #endif |
141 | 179 | ||
142 | /* SLAB cache for signal_struct structures (tsk->signal) */ | 180 | /* SLAB cache for signal_struct structures (tsk->signal) */ |
@@ -170,6 +208,7 @@ void free_task(struct task_struct *tsk) | |||
170 | free_thread_info(tsk->stack); | 208 | free_thread_info(tsk->stack); |
171 | rt_mutex_debug_task_free(tsk); | 209 | rt_mutex_debug_task_free(tsk); |
172 | ftrace_graph_exit_task(tsk); | 210 | ftrace_graph_exit_task(tsk); |
211 | put_seccomp_filter(tsk); | ||
173 | free_task_struct(tsk); | 212 | free_task_struct(tsk); |
174 | } | 213 | } |
175 | EXPORT_SYMBOL(free_task); | 214 | EXPORT_SYMBOL(free_task); |
@@ -203,17 +242,11 @@ void __put_task_struct(struct task_struct *tsk) | |||
203 | } | 242 | } |
204 | EXPORT_SYMBOL_GPL(__put_task_struct); | 243 | EXPORT_SYMBOL_GPL(__put_task_struct); |
205 | 244 | ||
206 | /* | 245 | void __init __weak arch_task_cache_init(void) { } |
207 | * macro override instead of weak attribute alias, to workaround | ||
208 | * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. | ||
209 | */ | ||
210 | #ifndef arch_task_cache_init | ||
211 | #define arch_task_cache_init() | ||
212 | #endif | ||
213 | 246 | ||
214 | void __init fork_init(unsigned long mempages) | 247 | void __init fork_init(unsigned long mempages) |
215 | { | 248 | { |
216 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | 249 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
217 | #ifndef ARCH_MIN_TASKALIGN | 250 | #ifndef ARCH_MIN_TASKALIGN |
218 | #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES | 251 | #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES |
219 | #endif | 252 | #endif |
@@ -260,8 +293,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
260 | int node = tsk_fork_get_node(orig); | 293 | int node = tsk_fork_get_node(orig); |
261 | int err; | 294 | int err; |
262 | 295 | ||
263 | prepare_to_copy(orig); | ||
264 | |||
265 | tsk = alloc_task_struct_node(node); | 296 | tsk = alloc_task_struct_node(node); |
266 | if (!tsk) | 297 | if (!tsk) |
267 | return NULL; | 298 | return NULL; |
@@ -273,12 +304,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
273 | } | 304 | } |
274 | 305 | ||
275 | err = arch_dup_task_struct(tsk, orig); | 306 | err = arch_dup_task_struct(tsk, orig); |
276 | if (err) | ||
277 | goto out; | ||
278 | 307 | ||
308 | /* | ||
309 | * We defer looking at err, because we will need this setup | ||
310 | * for the cleanup path to work correctly. | ||
311 | */ | ||
279 | tsk->stack = ti; | 312 | tsk->stack = ti; |
280 | |||
281 | setup_thread_stack(tsk, orig); | 313 | setup_thread_stack(tsk, orig); |
314 | |||
315 | if (err) | ||
316 | goto out; | ||
317 | |||
282 | clear_user_return_notifier(tsk); | 318 | clear_user_return_notifier(tsk); |
283 | clear_tsk_need_resched(tsk); | 319 | clear_tsk_need_resched(tsk); |
284 | stackend = end_of_stack(tsk); | 320 | stackend = end_of_stack(tsk); |
@@ -355,7 +391,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
355 | } | 391 | } |
356 | charge = 0; | 392 | charge = 0; |
357 | if (mpnt->vm_flags & VM_ACCOUNT) { | 393 | if (mpnt->vm_flags & VM_ACCOUNT) { |
358 | unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; | 394 | unsigned long len; |
395 | len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; | ||
359 | if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ | 396 | if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ |
360 | goto fail_nomem; | 397 | goto fail_nomem; |
361 | charge = len; | 398 | charge = len; |
@@ -421,6 +458,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
421 | 458 | ||
422 | if (retval) | 459 | if (retval) |
423 | goto out; | 460 | goto out; |
461 | |||
462 | if (file && uprobe_mmap(tmp)) | ||
463 | goto out; | ||
424 | } | 464 | } |
425 | /* a new mm has just been created */ | 465 | /* a new mm has just been created */ |
426 | arch_dup_mmap(oldmm, mm); | 466 | arch_dup_mmap(oldmm, mm); |
@@ -569,6 +609,7 @@ void mmput(struct mm_struct *mm) | |||
569 | might_sleep(); | 609 | might_sleep(); |
570 | 610 | ||
571 | if (atomic_dec_and_test(&mm->mm_users)) { | 611 | if (atomic_dec_and_test(&mm->mm_users)) { |
612 | uprobe_clear_state(mm); | ||
572 | exit_aio(mm); | 613 | exit_aio(mm); |
573 | ksm_exit(mm); | 614 | ksm_exit(mm); |
574 | khugepaged_exit(mm); /* must run before exit_mmap */ | 615 | khugepaged_exit(mm); /* must run before exit_mmap */ |
@@ -579,7 +620,6 @@ void mmput(struct mm_struct *mm) | |||
579 | list_del(&mm->mmlist); | 620 | list_del(&mm->mmlist); |
580 | spin_unlock(&mmlist_lock); | 621 | spin_unlock(&mmlist_lock); |
581 | } | 622 | } |
582 | put_swap_token(mm); | ||
583 | if (mm->binfmt) | 623 | if (mm->binfmt) |
584 | module_put(mm->binfmt->module); | 624 | module_put(mm->binfmt->module); |
585 | mmdrop(mm); | 625 | mmdrop(mm); |
@@ -747,12 +787,11 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) | |||
747 | exit_pi_state_list(tsk); | 787 | exit_pi_state_list(tsk); |
748 | #endif | 788 | #endif |
749 | 789 | ||
790 | uprobe_free_utask(tsk); | ||
791 | |||
750 | /* Get rid of any cached register state */ | 792 | /* Get rid of any cached register state */ |
751 | deactivate_mm(tsk, mm); | 793 | deactivate_mm(tsk, mm); |
752 | 794 | ||
753 | if (tsk->vfork_done) | ||
754 | complete_vfork_done(tsk); | ||
755 | |||
756 | /* | 795 | /* |
757 | * If we're exiting normally, clear a user-space tid field if | 796 | * If we're exiting normally, clear a user-space tid field if |
758 | * requested. We leave this alone when dying by signal, to leave | 797 | * requested. We leave this alone when dying by signal, to leave |
@@ -773,6 +812,13 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) | |||
773 | } | 812 | } |
774 | tsk->clear_child_tid = NULL; | 813 | tsk->clear_child_tid = NULL; |
775 | } | 814 | } |
815 | |||
816 | /* | ||
817 | * All done, finally we can wake up the parent and return this mm to it. | ||
818 | * Also kthread_stop() uses this completion for synchronization. | ||
819 | */ | ||
820 | if (tsk->vfork_done) | ||
821 | complete_vfork_done(tsk); | ||
776 | } | 822 | } |
777 | 823 | ||
778 | /* | 824 | /* |
@@ -794,13 +840,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk) | |||
794 | memcpy(mm, oldmm, sizeof(*mm)); | 840 | memcpy(mm, oldmm, sizeof(*mm)); |
795 | mm_init_cpumask(mm); | 841 | mm_init_cpumask(mm); |
796 | 842 | ||
797 | /* Initializing for Swap token stuff */ | ||
798 | mm->token_priority = 0; | ||
799 | mm->last_interval = 0; | ||
800 | |||
801 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 843 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
802 | mm->pmd_huge_pte = NULL; | 844 | mm->pmd_huge_pte = NULL; |
803 | #endif | 845 | #endif |
846 | uprobe_reset_state(mm); | ||
804 | 847 | ||
805 | if (!mm_init(mm, tsk)) | 848 | if (!mm_init(mm, tsk)) |
806 | goto fail_nomem; | 849 | goto fail_nomem; |
@@ -875,10 +918,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) | |||
875 | goto fail_nomem; | 918 | goto fail_nomem; |
876 | 919 | ||
877 | good_mm: | 920 | good_mm: |
878 | /* Initializing for Swap token stuff */ | ||
879 | mm->token_priority = 0; | ||
880 | mm->last_interval = 0; | ||
881 | |||
882 | tsk->mm = mm; | 921 | tsk->mm = mm; |
883 | tsk->active_mm = mm; | 922 | tsk->active_mm = mm; |
884 | return 0; | 923 | return 0; |
@@ -946,9 +985,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) | |||
946 | * Share io context with parent, if CLONE_IO is set | 985 | * Share io context with parent, if CLONE_IO is set |
947 | */ | 986 | */ |
948 | if (clone_flags & CLONE_IO) { | 987 | if (clone_flags & CLONE_IO) { |
949 | tsk->io_context = ioc_task_link(ioc); | 988 | ioc_task_link(ioc); |
950 | if (unlikely(!tsk->io_context)) | 989 | tsk->io_context = ioc; |
951 | return -ENOMEM; | ||
952 | } else if (ioprio_valid(ioc->ioprio)) { | 990 | } else if (ioprio_valid(ioc->ioprio)) { |
953 | new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); | 991 | new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); |
954 | if (unlikely(!new_ioc)) | 992 | if (unlikely(!new_ioc)) |
@@ -1162,6 +1200,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1162 | goto fork_out; | 1200 | goto fork_out; |
1163 | 1201 | ||
1164 | ftrace_graph_init_task(p); | 1202 | ftrace_graph_init_task(p); |
1203 | get_seccomp_filter(p); | ||
1165 | 1204 | ||
1166 | rt_mutex_init_task(p); | 1205 | rt_mutex_init_task(p); |
1167 | 1206 | ||
@@ -1342,6 +1381,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1342 | INIT_LIST_HEAD(&p->pi_state_list); | 1381 | INIT_LIST_HEAD(&p->pi_state_list); |
1343 | p->pi_state_cache = NULL; | 1382 | p->pi_state_cache = NULL; |
1344 | #endif | 1383 | #endif |
1384 | uprobe_copy_process(p); | ||
1345 | /* | 1385 | /* |
1346 | * sigaltstack should be cleared when sharing the same VM | 1386 | * sigaltstack should be cleared when sharing the same VM |
1347 | */ | 1387 | */ |
@@ -1380,6 +1420,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1380 | */ | 1420 | */ |
1381 | p->group_leader = p; | 1421 | p->group_leader = p; |
1382 | INIT_LIST_HEAD(&p->thread_group); | 1422 | INIT_LIST_HEAD(&p->thread_group); |
1423 | INIT_HLIST_HEAD(&p->task_works); | ||
1383 | 1424 | ||
1384 | /* Now that the task is set up, run cgroup callbacks if | 1425 | /* Now that the task is set up, run cgroup callbacks if |
1385 | * necessary. We need to run them before the task is visible | 1426 | * necessary. We need to run them before the task is visible |
@@ -1464,6 +1505,8 @@ bad_fork_cleanup_io: | |||
1464 | if (p->io_context) | 1505 | if (p->io_context) |
1465 | exit_io_context(p); | 1506 | exit_io_context(p); |
1466 | bad_fork_cleanup_namespaces: | 1507 | bad_fork_cleanup_namespaces: |
1508 | if (unlikely(clone_flags & CLONE_NEWPID)) | ||
1509 | pid_ns_release_proc(p->nsproxy->pid_ns); | ||
1467 | exit_task_namespaces(p); | 1510 | exit_task_namespaces(p); |
1468 | bad_fork_cleanup_mm: | 1511 | bad_fork_cleanup_mm: |
1469 | if (p->mm) | 1512 | if (p->mm) |
diff --git a/kernel/groups.c b/kernel/groups.c
index 99b53d1eb7ea..6b2588dd04ff 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -31,7 +31,7 @@ struct group_info *groups_alloc(int gidsetsize) | |||
31 | group_info->blocks[0] = group_info->small_block; | 31 | group_info->blocks[0] = group_info->small_block; |
32 | else { | 32 | else { |
33 | for (i = 0; i < nblocks; i++) { | 33 | for (i = 0; i < nblocks; i++) { |
34 | gid_t *b; | 34 | kgid_t *b; |
35 | b = (void *)__get_free_page(GFP_USER); | 35 | b = (void *)__get_free_page(GFP_USER); |
36 | if (!b) | 36 | if (!b) |
37 | goto out_undo_partial_alloc; | 37 | goto out_undo_partial_alloc; |
@@ -66,18 +66,15 @@ EXPORT_SYMBOL(groups_free); | |||
66 | static int groups_to_user(gid_t __user *grouplist, | 66 | static int groups_to_user(gid_t __user *grouplist, |
67 | const struct group_info *group_info) | 67 | const struct group_info *group_info) |
68 | { | 68 | { |
69 | struct user_namespace *user_ns = current_user_ns(); | ||
69 | int i; | 70 | int i; |
70 | unsigned int count = group_info->ngroups; | 71 | unsigned int count = group_info->ngroups; |
71 | 72 | ||
72 | for (i = 0; i < group_info->nblocks; i++) { | 73 | for (i = 0; i < count; i++) { |
73 | unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); | 74 | gid_t gid; |
74 | unsigned int len = cp_count * sizeof(*grouplist); | 75 | gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i)); |
75 | 76 | if (put_user(gid, grouplist+i)) | |
76 | if (copy_to_user(grouplist, group_info->blocks[i], len)) | ||
77 | return -EFAULT; | 77 | return -EFAULT; |
78 | |||
79 | grouplist += NGROUPS_PER_BLOCK; | ||
80 | count -= cp_count; | ||
81 | } | 78 | } |
82 | return 0; | 79 | return 0; |
83 | } | 80 | } |
@@ -86,18 +83,21 @@ static int groups_to_user(gid_t __user *grouplist, | |||
86 | static int groups_from_user(struct group_info *group_info, | 83 | static int groups_from_user(struct group_info *group_info, |
87 | gid_t __user *grouplist) | 84 | gid_t __user *grouplist) |
88 | { | 85 | { |
86 | struct user_namespace *user_ns = current_user_ns(); | ||
89 | int i; | 87 | int i; |
90 | unsigned int count = group_info->ngroups; | 88 | unsigned int count = group_info->ngroups; |
91 | 89 | ||
92 | for (i = 0; i < group_info->nblocks; i++) { | 90 | for (i = 0; i < count; i++) { |
93 | unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); | 91 | gid_t gid; |
94 | unsigned int len = cp_count * sizeof(*grouplist); | 92 | kgid_t kgid; |
95 | 93 | if (get_user(gid, grouplist+i)) | |
96 | if (copy_from_user(group_info->blocks[i], grouplist, len)) | ||
97 | return -EFAULT; | 94 | return -EFAULT; |
98 | 95 | ||
99 | grouplist += NGROUPS_PER_BLOCK; | 96 | kgid = make_kgid(user_ns, gid); |
100 | count -= cp_count; | 97 | if (!gid_valid(kgid)) |
98 | return -EINVAL; | ||
99 | |||
100 | GROUP_AT(group_info, i) = kgid; | ||
101 | } | 101 | } |
102 | return 0; | 102 | return 0; |
103 | } | 103 | } |
@@ -117,9 +117,9 @@ static void groups_sort(struct group_info *group_info) | |||
117 | for (base = 0; base < max; base++) { | 117 | for (base = 0; base < max; base++) { |
118 | int left = base; | 118 | int left = base; |
119 | int right = left + stride; | 119 | int right = left + stride; |
120 | gid_t tmp = GROUP_AT(group_info, right); | 120 | kgid_t tmp = GROUP_AT(group_info, right); |
121 | 121 | ||
122 | while (left >= 0 && GROUP_AT(group_info, left) > tmp) { | 122 | while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) { |
123 | GROUP_AT(group_info, right) = | 123 | GROUP_AT(group_info, right) = |
124 | GROUP_AT(group_info, left); | 124 | GROUP_AT(group_info, left); |
125 | right = left; | 125 | right = left; |
@@ -132,7 +132,7 @@ static void groups_sort(struct group_info *group_info) | |||
132 | } | 132 | } |
133 | 133 | ||
134 | /* a simple bsearch */ | 134 | /* a simple bsearch */ |
135 | int groups_search(const struct group_info *group_info, gid_t grp) | 135 | int groups_search(const struct group_info *group_info, kgid_t grp) |
136 | { | 136 | { |
137 | unsigned int left, right; | 137 | unsigned int left, right; |
138 | 138 | ||
@@ -143,9 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp) | |||
143 | right = group_info->ngroups; | 143 | right = group_info->ngroups; |
144 | while (left < right) { | 144 | while (left < right) { |
145 | unsigned int mid = (left+right)/2; | 145 | unsigned int mid = (left+right)/2; |
146 | if (grp > GROUP_AT(group_info, mid)) | 146 | if (gid_gt(grp, GROUP_AT(group_info, mid))) |
147 | left = mid + 1; | 147 | left = mid + 1; |
148 | else if (grp < GROUP_AT(group_info, mid)) | 148 | else if (gid_lt(grp, GROUP_AT(group_info, mid))) |
149 | right = mid; | 149 | right = mid; |
150 | else | 150 | else |
151 | return 1; | 151 | return 1; |
@@ -256,24 +256,24 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) | |||
256 | /* | 256 | /* |
257 | * Check whether we're fsgid/egid or in the supplemental group.. | 257 | * Check whether we're fsgid/egid or in the supplemental group.. |
258 | */ | 258 | */ |
259 | int in_group_p(gid_t grp) | 259 | int in_group_p(kgid_t grp) |
260 | { | 260 | { |
261 | const struct cred *cred = current_cred(); | 261 | const struct cred *cred = current_cred(); |
262 | int retval = 1; | 262 | int retval = 1; |
263 | 263 | ||
264 | if (grp != cred->fsgid) | 264 | if (!gid_eq(grp, cred->fsgid)) |
265 | retval = groups_search(cred->group_info, grp); | 265 | retval = groups_search(cred->group_info, grp); |
266 | return retval; | 266 | return retval; |
267 | } | 267 | } |
268 | 268 | ||
269 | EXPORT_SYMBOL(in_group_p); | 269 | EXPORT_SYMBOL(in_group_p); |
270 | 270 | ||
271 | int in_egroup_p(gid_t grp) | 271 | int in_egroup_p(kgid_t grp) |
272 | { | 272 | { |
273 | const struct cred *cred = current_cred(); | 273 | const struct cred *cred = current_cred(); |
274 | int retval = 1; | 274 | int retval = 1; |
275 | 275 | ||
276 | if (grp != cred->egid) | 276 | if (!gid_eq(grp, cred->egid)) |
277 | retval = groups_search(cred->group_info, grp); | 277 | retval = groups_search(cred->group_info, grp); |
278 | return retval; | 278 | return retval; |
279 | } | 279 | } |
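The kgid_t values used above come from the uidgid helpers; in the initial user namespace, where the mapping is the identity, they behave as in this small sketch (assuming <linux/uidgid.h> semantics):

	kgid_t k1000 = make_kgid(&init_user_ns, 1000);
	kgid_t k1001 = make_kgid(&init_user_ns, 1001);

	bool valid = gid_valid(k1000);				/* true: 1000 maps in init_user_ns */
	bool equal = gid_eq(k1000, k1001);			/* false */
	bool less  = gid_lt(k1000, k1001);			/* true */
	gid_t gid  = from_kgid_munged(&init_user_ns, k1000);	/* 1000 again */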
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ae34bf51682b..6db7a5ed52b5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
657 | return 0; | 657 | return 0; |
658 | } | 658 | } |
659 | 659 | ||
660 | static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) | ||
661 | { | ||
662 | ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; | ||
663 | ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; | ||
664 | |||
665 | return ktime_get_update_offsets(offs_real, offs_boot); | ||
666 | } | ||
667 | |||
660 | /* | 668 | /* |
661 | * Retrigger next event is called after clock was set | 669 | * Retrigger next event is called after clock was set |
662 | * | 670 | * |
@@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
665 | static void retrigger_next_event(void *arg) | 673 | static void retrigger_next_event(void *arg) |
666 | { | 674 | { |
667 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); | 675 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); |
668 | struct timespec realtime_offset, xtim, wtm, sleep; | ||
669 | 676 | ||
670 | if (!hrtimer_hres_active()) | 677 | if (!hrtimer_hres_active()) |
671 | return; | 678 | return; |
672 | 679 | ||
673 | /* Optimized out for !HIGH_RES */ | ||
674 | get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); | ||
675 | set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); | ||
676 | |||
677 | /* Adjust CLOCK_REALTIME offset */ | ||
678 | raw_spin_lock(&base->lock); | 680 | raw_spin_lock(&base->lock); |
679 | base->clock_base[HRTIMER_BASE_REALTIME].offset = | 681 | hrtimer_update_base(base); |
680 | timespec_to_ktime(realtime_offset); | ||
681 | base->clock_base[HRTIMER_BASE_BOOTTIME].offset = | ||
682 | timespec_to_ktime(sleep); | ||
683 | |||
684 | hrtimer_force_reprogram(base, 0); | 682 | hrtimer_force_reprogram(base, 0); |
685 | raw_spin_unlock(&base->lock); | 683 | raw_spin_unlock(&base->lock); |
686 | } | 684 | } |
@@ -710,13 +708,25 @@ static int hrtimer_switch_to_hres(void) | |||
710 | base->clock_base[i].resolution = KTIME_HIGH_RES; | 708 | base->clock_base[i].resolution = KTIME_HIGH_RES; |
711 | 709 | ||
712 | tick_setup_sched_timer(); | 710 | tick_setup_sched_timer(); |
713 | |||
714 | /* "Retrigger" the interrupt to get things going */ | 711 | /* "Retrigger" the interrupt to get things going */ |
715 | retrigger_next_event(NULL); | 712 | retrigger_next_event(NULL); |
716 | local_irq_restore(flags); | 713 | local_irq_restore(flags); |
717 | return 1; | 714 | return 1; |
718 | } | 715 | } |
719 | 716 | ||
717 | /* | ||
718 | * Called from timekeeping code to reprogramm the hrtimer interrupt | ||
719 | * device. If called from the timer interrupt context we defer it to | ||
720 | * softirq context. | ||
721 | */ | ||
722 | void clock_was_set_delayed(void) | ||
723 | { | ||
724 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
725 | |||
726 | cpu_base->clock_was_set = 1; | ||
727 | __raise_softirq_irqoff(HRTIMER_SOFTIRQ); | ||
728 | } | ||
729 | |||
720 | #else | 730 | #else |
721 | 731 | ||
722 | static inline int hrtimer_hres_active(void) { return 0; } | 732 | static inline int hrtimer_hres_active(void) { return 0; } |
@@ -1250,11 +1260,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
1250 | cpu_base->nr_events++; | 1260 | cpu_base->nr_events++; |
1251 | dev->next_event.tv64 = KTIME_MAX; | 1261 | dev->next_event.tv64 = KTIME_MAX; |
1252 | 1262 | ||
1253 | entry_time = now = ktime_get(); | 1263 | raw_spin_lock(&cpu_base->lock); |
1264 | entry_time = now = hrtimer_update_base(cpu_base); | ||
1254 | retry: | 1265 | retry: |
1255 | expires_next.tv64 = KTIME_MAX; | 1266 | expires_next.tv64 = KTIME_MAX; |
1256 | |||
1257 | raw_spin_lock(&cpu_base->lock); | ||
1258 | /* | 1267 | /* |
1259 | * We set expires_next to KTIME_MAX here with cpu_base->lock | 1268 | * We set expires_next to KTIME_MAX here with cpu_base->lock |
1260 | * held to prevent that a timer is enqueued in our queue via | 1269 | * held to prevent that a timer is enqueued in our queue via |
@@ -1330,8 +1339,12 @@ retry: | |||
1330 | * We need to prevent that we loop forever in the hrtimer | 1339 | * We need to prevent that we loop forever in the hrtimer |
1331 | * interrupt routine. We give it 3 attempts to avoid | 1340 | * interrupt routine. We give it 3 attempts to avoid |
1332 | * overreacting on some spurious event. | 1341 | * overreacting on some spurious event. |
1342 | * | ||
1343 | * Acquire base lock for updating the offsets and retrieving | ||
1344 | * the current time. | ||
1333 | */ | 1345 | */ |
1334 | now = ktime_get(); | 1346 | raw_spin_lock(&cpu_base->lock); |
1347 | now = hrtimer_update_base(cpu_base); | ||
1335 | cpu_base->nr_retries++; | 1348 | cpu_base->nr_retries++; |
1336 | if (++retries < 3) | 1349 | if (++retries < 3) |
1337 | goto retry; | 1350 | goto retry; |
@@ -1343,6 +1356,7 @@ retry: | |||
1343 | */ | 1356 | */ |
1344 | cpu_base->nr_hangs++; | 1357 | cpu_base->nr_hangs++; |
1345 | cpu_base->hang_detected = 1; | 1358 | cpu_base->hang_detected = 1; |
1359 | raw_spin_unlock(&cpu_base->lock); | ||
1346 | delta = ktime_sub(now, entry_time); | 1360 | delta = ktime_sub(now, entry_time); |
1347 | if (delta.tv64 > cpu_base->max_hang_time.tv64) | 1361 | if (delta.tv64 > cpu_base->max_hang_time.tv64) |
1348 | cpu_base->max_hang_time = delta; | 1362 | cpu_base->max_hang_time = delta; |
@@ -1395,6 +1409,13 @@ void hrtimer_peek_ahead_timers(void) | |||
1395 | 1409 | ||
1396 | static void run_hrtimer_softirq(struct softirq_action *h) | 1410 | static void run_hrtimer_softirq(struct softirq_action *h) |
1397 | { | 1411 | { |
1412 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
1413 | |||
1414 | if (cpu_base->clock_was_set) { | ||
1415 | cpu_base->clock_was_set = 0; | ||
1416 | clock_was_set(); | ||
1417 | } | ||
1418 | |||
1398 | hrtimer_peek_ahead_timers(); | 1419 | hrtimer_peek_ahead_timers(); |
1399 | } | 1420 | } |
1400 | 1421 | ||
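Two things change in hrtimer.c: the realtime/boottime offsets are now refreshed under cpu_base->lock via hrtimer_update_base(), and clock_was_set_delayed() lets the timekeeping code request the expensive clock_was_set() work from HRTIMER_SOFTIRQ instead of doing it in timer-interrupt context. The userspace toy below models only that second part, the flag-plus-softirq deferral pattern; all names are illustrative, not kernel code.

#include <stdio.h>
#include <stdbool.h>

/* Flags standing in for cpu_base->clock_was_set and the raised softirq. */
static bool clock_was_set_pending;
static bool hrtimer_softirq_raised;

/* The expensive part: retrigger/reprogram the hrtimer bases. */
static void toy_clock_was_set(void)
{
        printf("reprogramming hrtimer bases\n");
}

/* Interrupt-context side: stays cheap, only records the request. */
static void toy_clock_was_set_delayed(void)
{
        clock_was_set_pending = true;
        hrtimer_softirq_raised = true;  /* __raise_softirq_irqoff(HRTIMER_SOFTIRQ) */
}

/* Softirq side, like run_hrtimer_softirq(): do the deferred work first. */
static void toy_run_hrtimer_softirq(void)
{
        if (clock_was_set_pending) {
                clock_was_set_pending = false;
                toy_clock_was_set();
        }
        /* ...then the usual hrtimer_peek_ahead_timers() processing... */
}

int main(void)
{
        toy_clock_was_set_delayed();    /* e.g. timekeeping update in hardirq */
        if (hrtimer_softirq_raised)
                toy_run_hrtimer_softirq();
        return 0;
}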
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index c21449f85a2a..6df614912b9d 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -108,8 +108,10 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
108 | 108 | ||
109 | touch_nmi_watchdog(); | 109 | touch_nmi_watchdog(); |
110 | 110 | ||
111 | if (sysctl_hung_task_panic) | 111 | if (sysctl_hung_task_panic) { |
112 | trigger_all_cpu_backtrace(); | ||
112 | panic("hung_task: blocked tasks"); | 113 | panic("hung_task: blocked tasks"); |
114 | } | ||
113 | } | 115 | } |
114 | 116 | ||
115 | /* | 117 | /* |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6080f6bc8c33..eebd6d5cfb44 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -275,8 +275,10 @@ void handle_nested_irq(unsigned int irq) | |||
275 | kstat_incr_irqs_this_cpu(irq, desc); | 275 | kstat_incr_irqs_this_cpu(irq, desc); |
276 | 276 | ||
277 | action = desc->action; | 277 | action = desc->action; |
278 | if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) | 278 | if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { |
279 | desc->istate |= IRQS_PENDING; | ||
279 | goto out_unlock; | 280 | goto out_unlock; |
281 | } | ||
280 | 282 | ||
281 | irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); | 283 | irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); |
282 | raw_spin_unlock_irq(&desc->lock); | 284 | raw_spin_unlock_irq(&desc->lock); |
@@ -324,8 +326,10 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) | |||
324 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); | 326 | desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); |
325 | kstat_incr_irqs_this_cpu(irq, desc); | 327 | kstat_incr_irqs_this_cpu(irq, desc); |
326 | 328 | ||
327 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) | 329 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { |
330 | desc->istate |= IRQS_PENDING; | ||
328 | goto out_unlock; | 331 | goto out_unlock; |
332 | } | ||
329 | 333 | ||
330 | handle_irq_event(desc); | 334 | handle_irq_event(desc); |
331 | 335 | ||
@@ -379,8 +383,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) | |||
379 | * If its disabled or no action available | 383 | * If its disabled or no action available |
380 | * keep it masked and get out of here | 384 | * keep it masked and get out of here |
381 | */ | 385 | */ |
382 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) | 386 | if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { |
387 | desc->istate |= IRQS_PENDING; | ||
383 | goto out_unlock; | 388 | goto out_unlock; |
389 | } | ||
384 | 390 | ||
385 | handle_irq_event(desc); | 391 | handle_irq_event(desc); |
386 | 392 | ||
@@ -518,6 +524,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) | |||
518 | out_unlock: | 524 | out_unlock: |
519 | raw_spin_unlock(&desc->lock); | 525 | raw_spin_unlock(&desc->lock); |
520 | } | 526 | } |
527 | EXPORT_SYMBOL(handle_edge_irq); | ||
521 | 528 | ||
522 | #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER | 529 | #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER |
523 | /** | 530 | /** |
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 8e5c56b3b7d9..001fa5bab490 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
@@ -101,6 +101,9 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); | |||
101 | 101 | ||
102 | extern void irq_set_thread_affinity(struct irq_desc *desc); | 102 | extern void irq_set_thread_affinity(struct irq_desc *desc); |
103 | 103 | ||
104 | extern int irq_do_set_affinity(struct irq_data *data, | ||
105 | const struct cpumask *dest, bool force); | ||
106 | |||
104 | /* Inline functions for support of irq chips on slow busses */ | 107 | /* Inline functions for support of irq chips on slow busses */ |
105 | static inline void chip_bus_lock(struct irq_desc *desc) | 108 | static inline void chip_bus_lock(struct irq_desc *desc) |
106 | { | 109 | { |
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index d86e254b95eb..192a302d6cfd 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
@@ -112,6 +112,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) | |||
112 | { | 112 | { |
113 | return radix_tree_lookup(&irq_desc_tree, irq); | 113 | return radix_tree_lookup(&irq_desc_tree, irq); |
114 | } | 114 | } |
115 | EXPORT_SYMBOL(irq_to_desc); | ||
115 | 116 | ||
116 | static void delete_irq_desc(unsigned int irq) | 117 | static void delete_irq_desc(unsigned int irq) |
117 | { | 118 | { |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 0e0ba5f840b2..41c1564103f1 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -1,3 +1,5 @@ | |||
1 | #define pr_fmt(fmt) "irq: " fmt | ||
2 | |||
1 | #include <linux/debugfs.h> | 3 | #include <linux/debugfs.h> |
2 | #include <linux/hardirq.h> | 4 | #include <linux/hardirq.h> |
3 | #include <linux/interrupt.h> | 5 | #include <linux/interrupt.h> |
@@ -56,14 +58,73 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, | |||
56 | return domain; | 58 | return domain; |
57 | } | 59 | } |
58 | 60 | ||
61 | static void irq_domain_free(struct irq_domain *domain) | ||
62 | { | ||
63 | of_node_put(domain->of_node); | ||
64 | kfree(domain); | ||
65 | } | ||
66 | |||
59 | static void irq_domain_add(struct irq_domain *domain) | 67 | static void irq_domain_add(struct irq_domain *domain) |
60 | { | 68 | { |
61 | mutex_lock(&irq_domain_mutex); | 69 | mutex_lock(&irq_domain_mutex); |
62 | list_add(&domain->link, &irq_domain_list); | 70 | list_add(&domain->link, &irq_domain_list); |
63 | mutex_unlock(&irq_domain_mutex); | 71 | mutex_unlock(&irq_domain_mutex); |
64 | pr_debug("irq: Allocated domain of type %d @0x%p\n", | 72 | pr_debug("Allocated domain of type %d @0x%p\n", |
73 | domain->revmap_type, domain); | ||
74 | } | ||
75 | |||
76 | /** | ||
77 | * irq_domain_remove() - Remove an irq domain. | ||
78 | * @domain: domain to remove | ||
79 | * | ||
80 | * This routine is used to remove an irq domain. The caller must ensure | ||
81 | * that all mappings within the domain have been disposed of prior to | ||
82 | * use, depending on the revmap type. | ||
83 | */ | ||
84 | void irq_domain_remove(struct irq_domain *domain) | ||
85 | { | ||
86 | mutex_lock(&irq_domain_mutex); | ||
87 | |||
88 | switch (domain->revmap_type) { | ||
89 | case IRQ_DOMAIN_MAP_LEGACY: | ||
90 | /* | ||
91 | * Legacy domains don't manage their own irq_desc | ||
92 | * allocations, we expect the caller to handle irq_desc | ||
93 | * freeing on their own. | ||
94 | */ | ||
95 | break; | ||
96 | case IRQ_DOMAIN_MAP_TREE: | ||
97 | /* | ||
98 | * radix_tree_delete() takes care of destroying the root | ||
99 | * node when all entries are removed. Shout if there are | ||
100 | * any mappings left. | ||
101 | */ | ||
102 | WARN_ON(domain->revmap_data.tree.height); | ||
103 | break; | ||
104 | case IRQ_DOMAIN_MAP_LINEAR: | ||
105 | kfree(domain->revmap_data.linear.revmap); | ||
106 | domain->revmap_data.linear.size = 0; | ||
107 | break; | ||
108 | case IRQ_DOMAIN_MAP_NOMAP: | ||
109 | break; | ||
110 | } | ||
111 | |||
112 | list_del(&domain->link); | ||
113 | |||
114 | /* | ||
115 | * If the going away domain is the default one, reset it. | ||
116 | */ | ||
117 | if (unlikely(irq_default_domain == domain)) | ||
118 | irq_set_default_host(NULL); | ||
119 | |||
120 | mutex_unlock(&irq_domain_mutex); | ||
121 | |||
122 | pr_debug("Removed domain of type %d @0x%p\n", | ||
65 | domain->revmap_type, domain); | 123 | domain->revmap_type, domain); |
124 | |||
125 | irq_domain_free(domain); | ||
66 | } | 126 | } |
127 | EXPORT_SYMBOL_GPL(irq_domain_remove); | ||
67 | 128 | ||
68 | static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, | 129 | static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, |
69 | irq_hw_number_t hwirq) | 130 | irq_hw_number_t hwirq) |
@@ -117,8 +178,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, | |||
117 | 178 | ||
118 | if (WARN_ON(!irq_data || irq_data->domain)) { | 179 | if (WARN_ON(!irq_data || irq_data->domain)) { |
119 | mutex_unlock(&irq_domain_mutex); | 180 | mutex_unlock(&irq_domain_mutex); |
120 | of_node_put(domain->of_node); | 181 | irq_domain_free(domain); |
121 | kfree(domain); | ||
122 | return NULL; | 182 | return NULL; |
123 | } | 183 | } |
124 | } | 184 | } |
@@ -152,10 +212,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, | |||
152 | irq_domain_add(domain); | 212 | irq_domain_add(domain); |
153 | return domain; | 213 | return domain; |
154 | } | 214 | } |
215 | EXPORT_SYMBOL_GPL(irq_domain_add_legacy); | ||
155 | 216 | ||
156 | /** | 217 | /** |
157 | * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain. | 218 | * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain. |
158 | * @of_node: pointer to interrupt controller's device tree node. | 219 | * @of_node: pointer to interrupt controller's device tree node. |
220 | * @size: Number of interrupts in the domain. | ||
159 | * @ops: map/unmap domain callbacks | 221 | * @ops: map/unmap domain callbacks |
160 | * @host_data: Controller private data pointer | 222 | * @host_data: Controller private data pointer |
161 | */ | 223 | */ |
@@ -181,6 +243,7 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, | |||
181 | irq_domain_add(domain); | 243 | irq_domain_add(domain); |
182 | return domain; | 244 | return domain; |
183 | } | 245 | } |
246 | EXPORT_SYMBOL_GPL(irq_domain_add_linear); | ||
184 | 247 | ||
185 | struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, | 248 | struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, |
186 | unsigned int max_irq, | 249 | unsigned int max_irq, |
@@ -195,6 +258,7 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, | |||
195 | } | 258 | } |
196 | return domain; | 259 | return domain; |
197 | } | 260 | } |
261 | EXPORT_SYMBOL_GPL(irq_domain_add_nomap); | ||
198 | 262 | ||
199 | /** | 263 | /** |
200 | * irq_domain_add_tree() | 264 | * irq_domain_add_tree() |
@@ -216,6 +280,7 @@ struct irq_domain *irq_domain_add_tree(struct device_node *of_node, | |||
216 | } | 280 | } |
217 | return domain; | 281 | return domain; |
218 | } | 282 | } |
283 | EXPORT_SYMBOL_GPL(irq_domain_add_tree); | ||
219 | 284 | ||
220 | /** | 285 | /** |
221 | * irq_find_host() - Locates a domain for a given device node | 286 | * irq_find_host() - Locates a domain for a given device node |
@@ -259,10 +324,11 @@ EXPORT_SYMBOL_GPL(irq_find_host); | |||
259 | */ | 324 | */ |
260 | void irq_set_default_host(struct irq_domain *domain) | 325 | void irq_set_default_host(struct irq_domain *domain) |
261 | { | 326 | { |
262 | pr_debug("irq: Default domain set to @0x%p\n", domain); | 327 | pr_debug("Default domain set to @0x%p\n", domain); |
263 | 328 | ||
264 | irq_default_domain = domain; | 329 | irq_default_domain = domain; |
265 | } | 330 | } |
331 | EXPORT_SYMBOL_GPL(irq_set_default_host); | ||
266 | 332 | ||
267 | static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, | 333 | static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, |
268 | irq_hw_number_t hwirq) | 334 | irq_hw_number_t hwirq) |
@@ -272,7 +338,7 @@ static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, | |||
272 | irq_data->hwirq = hwirq; | 338 | irq_data->hwirq = hwirq; |
273 | irq_data->domain = domain; | 339 | irq_data->domain = domain; |
274 | if (domain->ops->map(domain, virq, hwirq)) { | 340 | if (domain->ops->map(domain, virq, hwirq)) { |
275 | pr_debug("irq: -> mapping failed, freeing\n"); | 341 | pr_debug("irq-%i==>hwirq-0x%lx mapping failed\n", virq, hwirq); |
276 | irq_data->domain = NULL; | 342 | irq_data->domain = NULL; |
277 | irq_data->hwirq = 0; | 343 | irq_data->hwirq = 0; |
278 | return -1; | 344 | return -1; |
@@ -303,7 +369,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) | |||
303 | 369 | ||
304 | virq = irq_alloc_desc_from(1, 0); | 370 | virq = irq_alloc_desc_from(1, 0); |
305 | if (!virq) { | 371 | if (!virq) { |
306 | pr_debug("irq: create_direct virq allocation failed\n"); | 372 | pr_debug("create_direct virq allocation failed\n"); |
307 | return 0; | 373 | return 0; |
308 | } | 374 | } |
309 | if (virq >= domain->revmap_data.nomap.max_irq) { | 375 | if (virq >= domain->revmap_data.nomap.max_irq) { |
@@ -312,7 +378,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) | |||
312 | irq_free_desc(virq); | 378 | irq_free_desc(virq); |
313 | return 0; | 379 | return 0; |
314 | } | 380 | } |
315 | pr_debug("irq: create_direct obtained virq %d\n", virq); | 381 | pr_debug("create_direct obtained virq %d\n", virq); |
316 | 382 | ||
317 | if (irq_setup_virq(domain, virq, virq)) { | 383 | if (irq_setup_virq(domain, virq, virq)) { |
318 | irq_free_desc(virq); | 384 | irq_free_desc(virq); |
@@ -321,6 +387,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) | |||
321 | 387 | ||
322 | return virq; | 388 | return virq; |
323 | } | 389 | } |
390 | EXPORT_SYMBOL_GPL(irq_create_direct_mapping); | ||
324 | 391 | ||
325 | /** | 392 | /** |
326 | * irq_create_mapping() - Map a hardware interrupt into linux irq space | 393 | * irq_create_mapping() - Map a hardware interrupt into linux irq space |
@@ -338,23 +405,23 @@ unsigned int irq_create_mapping(struct irq_domain *domain, | |||
338 | unsigned int hint; | 405 | unsigned int hint; |
339 | int virq; | 406 | int virq; |
340 | 407 | ||
341 | pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); | 408 | pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); |
342 | 409 | ||
343 | /* Look for default domain if necessary */ | 410 | /* Look for default domain if necessary */ |
344 | if (domain == NULL) | 411 | if (domain == NULL) |
345 | domain = irq_default_domain; | 412 | domain = irq_default_domain; |
346 | if (domain == NULL) { | 413 | if (domain == NULL) { |
347 | printk(KERN_WARNING "irq_create_mapping called for" | 414 | pr_warning("irq_create_mapping called for" |
348 | " NULL domain, hwirq=%lx\n", hwirq); | 415 | " NULL domain, hwirq=%lx\n", hwirq); |
349 | WARN_ON(1); | 416 | WARN_ON(1); |
350 | return 0; | 417 | return 0; |
351 | } | 418 | } |
352 | pr_debug("irq: -> using domain @%p\n", domain); | 419 | pr_debug("-> using domain @%p\n", domain); |
353 | 420 | ||
354 | /* Check if mapping already exists */ | 421 | /* Check if mapping already exists */ |
355 | virq = irq_find_mapping(domain, hwirq); | 422 | virq = irq_find_mapping(domain, hwirq); |
356 | if (virq) { | 423 | if (virq) { |
357 | pr_debug("irq: -> existing mapping on virq %d\n", virq); | 424 | pr_debug("-> existing mapping on virq %d\n", virq); |
358 | return virq; | 425 | return virq; |
359 | } | 426 | } |
360 | 427 | ||
@@ -370,7 +437,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, | |||
370 | if (virq <= 0) | 437 | if (virq <= 0) |
371 | virq = irq_alloc_desc_from(1, 0); | 438 | virq = irq_alloc_desc_from(1, 0); |
372 | if (virq <= 0) { | 439 | if (virq <= 0) { |
373 | pr_debug("irq: -> virq allocation failed\n"); | 440 | pr_debug("-> virq allocation failed\n"); |
374 | return 0; | 441 | return 0; |
375 | } | 442 | } |
376 | 443 | ||
@@ -380,7 +447,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, | |||
380 | return 0; | 447 | return 0; |
381 | } | 448 | } |
382 | 449 | ||
383 | pr_debug("irq: irq %lu on domain %s mapped to virtual irq %u\n", | 450 | pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", |
384 | hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); | 451 | hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); |
385 | 452 | ||
386 | return virq; | 453 | return virq; |
@@ -409,8 +476,8 @@ unsigned int irq_create_of_mapping(struct device_node *controller, | |||
409 | if (intsize > 0) | 476 | if (intsize > 0) |
410 | return intspec[0]; | 477 | return intspec[0]; |
411 | #endif | 478 | #endif |
412 | printk(KERN_WARNING "irq: no irq domain found for %s !\n", | 479 | pr_warning("no irq domain found for %s !\n", |
413 | controller->full_name); | 480 | controller->full_name); |
414 | return 0; | 481 | return 0; |
415 | } | 482 | } |
416 | 483 | ||
@@ -560,6 +627,7 @@ unsigned int irq_radix_revmap_lookup(struct irq_domain *domain, | |||
560 | */ | 627 | */ |
561 | return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq); | 628 | return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq); |
562 | } | 629 | } |
630 | EXPORT_SYMBOL_GPL(irq_radix_revmap_lookup); | ||
563 | 631 | ||
564 | /** | 632 | /** |
565 | * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. | 633 | * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. |
@@ -584,6 +652,7 @@ void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq, | |||
584 | mutex_unlock(&revmap_trees_mutex); | 652 | mutex_unlock(&revmap_trees_mutex); |
585 | } | 653 | } |
586 | } | 654 | } |
655 | EXPORT_SYMBOL_GPL(irq_radix_revmap_insert); | ||
587 | 656 | ||
588 | /** | 657 | /** |
589 | * irq_linear_revmap() - Find a linux irq from a hw irq number. | 658 | * irq_linear_revmap() - Find a linux irq from a hw irq number. |
@@ -617,6 +686,7 @@ unsigned int irq_linear_revmap(struct irq_domain *domain, | |||
617 | 686 | ||
618 | return revmap[hwirq]; | 687 | return revmap[hwirq]; |
619 | } | 688 | } |
689 | EXPORT_SYMBOL_GPL(irq_linear_revmap); | ||
620 | 690 | ||
621 | #ifdef CONFIG_IRQ_DOMAIN_DEBUG | 691 | #ifdef CONFIG_IRQ_DOMAIN_DEBUG |
622 | static int virq_debug_show(struct seq_file *m, void *private) | 692 | static int virq_debug_show(struct seq_file *m, void *private) |
@@ -691,8 +761,8 @@ static int __init irq_debugfs_init(void) | |||
691 | __initcall(irq_debugfs_init); | 761 | __initcall(irq_debugfs_init); |
692 | #endif /* CONFIG_IRQ_DOMAIN_DEBUG */ | 762 | #endif /* CONFIG_IRQ_DOMAIN_DEBUG */ |
693 | 763 | ||
694 | int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, | 764 | static int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, |
695 | irq_hw_number_t hwirq) | 765 | irq_hw_number_t hwirq) |
696 | { | 766 | { |
697 | return 0; | 767 | return 0; |
698 | } | 768 | } |
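irqdomain.c gains irq_domain_remove() and exports the add/remove and revmap entry points, so a modular interrupt-controller driver can now tear its domain down cleanly. A hedged sketch of that lifecycle for a linear domain follows; my_intc_*, the 32-interrupt size and the empty map callback are illustrative and not taken from this patch.

/* Sketch only: lifecycle of a linear irq_domain in a modular driver. */
#include <linux/errno.h>
#include <linux/irqdomain.h>
#include <linux/module.h>

static int my_intc_map(struct irq_domain *d, unsigned int virq,
                       irq_hw_number_t hwirq)
{
        /* A real driver would set chip/handler data for virq here. */
        return 0;
}

static const struct irq_domain_ops my_intc_ops = {
        .map = my_intc_map,
};

static struct irq_domain *my_domain;

static int __init my_intc_init(void)
{
        /* 32 hwirqs, no device-tree node in this sketch. */
        my_domain = irq_domain_add_linear(NULL, 32, &my_intc_ops, NULL);
        return my_domain ? 0 : -ENOMEM;
}

static void __exit my_intc_exit(void)
{
        /* All mappings must have been disposed of before removal. */
        irq_domain_remove(my_domain);
}

module_init(my_intc_init);
module_exit(my_intc_exit);
MODULE_LICENSE("GPL");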
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 89a3ea82569b..8c548232ba39 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -7,6 +7,8 @@ | |||
7 | * This file contains driver APIs to the irq subsystem. | 7 | * This file contains driver APIs to the irq subsystem. |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #define pr_fmt(fmt) "genirq: " fmt | ||
11 | |||
10 | #include <linux/irq.h> | 12 | #include <linux/irq.h> |
11 | #include <linux/kthread.h> | 13 | #include <linux/kthread.h> |
12 | #include <linux/module.h> | 14 | #include <linux/module.h> |
@@ -14,6 +16,7 @@ | |||
14 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
15 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
16 | #include <linux/sched.h> | 18 | #include <linux/sched.h> |
19 | #include <linux/task_work.h> | ||
17 | 20 | ||
18 | #include "internals.h" | 21 | #include "internals.h" |
19 | 22 | ||
@@ -139,6 +142,25 @@ static inline void | |||
139 | irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } | 142 | irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } |
140 | #endif | 143 | #endif |
141 | 144 | ||
145 | int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, | ||
146 | bool force) | ||
147 | { | ||
148 | struct irq_desc *desc = irq_data_to_desc(data); | ||
149 | struct irq_chip *chip = irq_data_get_irq_chip(data); | ||
150 | int ret; | ||
151 | |||
152 | ret = chip->irq_set_affinity(data, mask, false); | ||
153 | switch (ret) { | ||
154 | case IRQ_SET_MASK_OK: | ||
155 | cpumask_copy(data->affinity, mask); | ||
156 | case IRQ_SET_MASK_OK_NOCOPY: | ||
157 | irq_set_thread_affinity(desc); | ||
158 | ret = 0; | ||
159 | } | ||
160 | |||
161 | return ret; | ||
162 | } | ||
163 | |||
142 | int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) | 164 | int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) |
143 | { | 165 | { |
144 | struct irq_chip *chip = irq_data_get_irq_chip(data); | 166 | struct irq_chip *chip = irq_data_get_irq_chip(data); |
@@ -149,14 +171,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) | |||
149 | return -EINVAL; | 171 | return -EINVAL; |
150 | 172 | ||
151 | if (irq_can_move_pcntxt(data)) { | 173 | if (irq_can_move_pcntxt(data)) { |
152 | ret = chip->irq_set_affinity(data, mask, false); | 174 | ret = irq_do_set_affinity(data, mask, false); |
153 | switch (ret) { | ||
154 | case IRQ_SET_MASK_OK: | ||
155 | cpumask_copy(data->affinity, mask); | ||
156 | case IRQ_SET_MASK_OK_NOCOPY: | ||
157 | irq_set_thread_affinity(desc); | ||
158 | ret = 0; | ||
159 | } | ||
160 | } else { | 175 | } else { |
161 | irqd_set_move_pending(data); | 176 | irqd_set_move_pending(data); |
162 | irq_copy_pending(desc, mask); | 177 | irq_copy_pending(desc, mask); |
@@ -280,9 +295,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); | |||
280 | static int | 295 | static int |
281 | setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | 296 | setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) |
282 | { | 297 | { |
283 | struct irq_chip *chip = irq_desc_get_chip(desc); | ||
284 | struct cpumask *set = irq_default_affinity; | 298 | struct cpumask *set = irq_default_affinity; |
285 | int ret, node = desc->irq_data.node; | 299 | int node = desc->irq_data.node; |
286 | 300 | ||
287 | /* Excludes PER_CPU and NO_BALANCE interrupts */ | 301 | /* Excludes PER_CPU and NO_BALANCE interrupts */ |
288 | if (!irq_can_set_affinity(irq)) | 302 | if (!irq_can_set_affinity(irq)) |
@@ -308,13 +322,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) | |||
308 | if (cpumask_intersects(mask, nodemask)) | 322 | if (cpumask_intersects(mask, nodemask)) |
309 | cpumask_and(mask, mask, nodemask); | 323 | cpumask_and(mask, mask, nodemask); |
310 | } | 324 | } |
311 | ret = chip->irq_set_affinity(&desc->irq_data, mask, false); | 325 | irq_do_set_affinity(&desc->irq_data, mask, false); |
312 | switch (ret) { | ||
313 | case IRQ_SET_MASK_OK: | ||
314 | cpumask_copy(desc->irq_data.affinity, mask); | ||
315 | case IRQ_SET_MASK_OK_NOCOPY: | ||
316 | irq_set_thread_affinity(desc); | ||
317 | } | ||
318 | return 0; | 326 | return 0; |
319 | } | 327 | } |
320 | #else | 328 | #else |
@@ -566,7 +574,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
566 | * flow-types? | 574 | * flow-types? |
567 | */ | 575 | */ |
568 | pr_debug("No set_type function for IRQ %d (%s)\n", irq, | 576 | pr_debug("No set_type function for IRQ %d (%s)\n", irq, |
569 | chip ? (chip->name ? : "unknown") : "unknown"); | 577 | chip ? (chip->name ? : "unknown") : "unknown"); |
570 | return 0; | 578 | return 0; |
571 | } | 579 | } |
572 | 580 | ||
@@ -600,7 +608,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, | |||
600 | ret = 0; | 608 | ret = 0; |
601 | break; | 609 | break; |
602 | default: | 610 | default: |
603 | pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", | 611 | pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", |
604 | flags, irq, chip->irq_set_type); | 612 | flags, irq, chip->irq_set_type); |
605 | } | 613 | } |
606 | if (unmask) | 614 | if (unmask) |
@@ -773,11 +781,39 @@ static void wake_threads_waitq(struct irq_desc *desc) | |||
773 | wake_up(&desc->wait_for_threads); | 781 | wake_up(&desc->wait_for_threads); |
774 | } | 782 | } |
775 | 783 | ||
784 | static void irq_thread_dtor(struct task_work *unused) | ||
785 | { | ||
786 | struct task_struct *tsk = current; | ||
787 | struct irq_desc *desc; | ||
788 | struct irqaction *action; | ||
789 | |||
790 | if (WARN_ON_ONCE(!(current->flags & PF_EXITING))) | ||
791 | return; | ||
792 | |||
793 | action = kthread_data(tsk); | ||
794 | |||
795 | pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", | ||
796 | tsk->comm ? tsk->comm : "", tsk->pid, action->irq); | ||
797 | |||
798 | |||
799 | desc = irq_to_desc(action->irq); | ||
800 | /* | ||
801 | * If IRQTF_RUNTHREAD is set, we need to decrement | ||
802 | * desc->threads_active and wake possible waiters. | ||
803 | */ | ||
804 | if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) | ||
805 | wake_threads_waitq(desc); | ||
806 | |||
807 | /* Prevent a stale desc->threads_oneshot */ | ||
808 | irq_finalize_oneshot(desc, action); | ||
809 | } | ||
810 | |||
776 | /* | 811 | /* |
777 | * Interrupt handler thread | 812 | * Interrupt handler thread |
778 | */ | 813 | */ |
779 | static int irq_thread(void *data) | 814 | static int irq_thread(void *data) |
780 | { | 815 | { |
816 | struct task_work on_exit_work; | ||
781 | static const struct sched_param param = { | 817 | static const struct sched_param param = { |
782 | .sched_priority = MAX_USER_RT_PRIO/2, | 818 | .sched_priority = MAX_USER_RT_PRIO/2, |
783 | }; | 819 | }; |
@@ -793,7 +829,9 @@ static int irq_thread(void *data) | |||
793 | handler_fn = irq_thread_fn; | 829 | handler_fn = irq_thread_fn; |
794 | 830 | ||
795 | sched_setscheduler(current, SCHED_FIFO, ¶m); | 831 | sched_setscheduler(current, SCHED_FIFO, ¶m); |
796 | current->irq_thread = 1; | 832 | |
833 | init_task_work(&on_exit_work, irq_thread_dtor, NULL); | ||
834 | task_work_add(current, &on_exit_work, false); | ||
797 | 835 | ||
798 | while (!irq_wait_for_interrupt(action)) { | 836 | while (!irq_wait_for_interrupt(action)) { |
799 | irqreturn_t action_ret; | 837 | irqreturn_t action_ret; |
@@ -815,45 +853,11 @@ static int irq_thread(void *data) | |||
815 | * cannot touch the oneshot mask at this point anymore as | 853 | * cannot touch the oneshot mask at this point anymore as |
816 | * __setup_irq() might have given out currents thread_mask | 854 | * __setup_irq() might have given out currents thread_mask |
817 | * again. | 855 | * again. |
818 | * | ||
819 | * Clear irq_thread. Otherwise exit_irq_thread() would make | ||
820 | * fuzz about an active irq thread going into nirvana. | ||
821 | */ | 856 | */ |
822 | current->irq_thread = 0; | 857 | task_work_cancel(current, irq_thread_dtor); |
823 | return 0; | 858 | return 0; |
824 | } | 859 | } |
825 | 860 | ||
826 | /* | ||
827 | * Called from do_exit() | ||
828 | */ | ||
829 | void exit_irq_thread(void) | ||
830 | { | ||
831 | struct task_struct *tsk = current; | ||
832 | struct irq_desc *desc; | ||
833 | struct irqaction *action; | ||
834 | |||
835 | if (!tsk->irq_thread) | ||
836 | return; | ||
837 | |||
838 | action = kthread_data(tsk); | ||
839 | |||
840 | printk(KERN_ERR | ||
841 | "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", | ||
842 | tsk->comm ? tsk->comm : "", tsk->pid, action->irq); | ||
843 | |||
844 | desc = irq_to_desc(action->irq); | ||
845 | |||
846 | /* | ||
847 | * If IRQTF_RUNTHREAD is set, we need to decrement | ||
848 | * desc->threads_active and wake possible waiters. | ||
849 | */ | ||
850 | if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) | ||
851 | wake_threads_waitq(desc); | ||
852 | |||
853 | /* Prevent a stale desc->threads_oneshot */ | ||
854 | irq_finalize_oneshot(desc, action); | ||
855 | } | ||
856 | |||
857 | static void irq_setup_forced_threading(struct irqaction *new) | 861 | static void irq_setup_forced_threading(struct irqaction *new) |
858 | { | 862 | { |
859 | if (!force_irqthreads) | 863 | if (!force_irqthreads) |
@@ -878,7 +882,6 @@ static int | |||
878 | __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | 882 | __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) |
879 | { | 883 | { |
880 | struct irqaction *old, **old_ptr; | 884 | struct irqaction *old, **old_ptr; |
881 | const char *old_name = NULL; | ||
882 | unsigned long flags, thread_mask = 0; | 885 | unsigned long flags, thread_mask = 0; |
883 | int ret, nested, shared = 0; | 886 | int ret, nested, shared = 0; |
884 | cpumask_var_t mask; | 887 | cpumask_var_t mask; |
@@ -972,10 +975,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
972 | */ | 975 | */ |
973 | if (!((old->flags & new->flags) & IRQF_SHARED) || | 976 | if (!((old->flags & new->flags) & IRQF_SHARED) || |
974 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || | 977 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || |
975 | ((old->flags ^ new->flags) & IRQF_ONESHOT)) { | 978 | ((old->flags ^ new->flags) & IRQF_ONESHOT)) |
976 | old_name = old->name; | ||
977 | goto mismatch; | 979 | goto mismatch; |
978 | } | ||
979 | 980 | ||
980 | /* All handlers must agree on per-cpuness */ | 981 | /* All handlers must agree on per-cpuness */ |
981 | if ((old->flags & IRQF_PERCPU) != | 982 | if ((old->flags & IRQF_PERCPU) != |
@@ -1031,6 +1032,27 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1031 | * all existing action->thread_mask bits. | 1032 | * all existing action->thread_mask bits. |
1032 | */ | 1033 | */ |
1033 | new->thread_mask = 1 << ffz(thread_mask); | 1034 | new->thread_mask = 1 << ffz(thread_mask); |
1035 | |||
1036 | } else if (new->handler == irq_default_primary_handler) { | ||
1037 | /* | ||
1038 | * The interrupt was requested with handler = NULL, so | ||
1039 | * we use the default primary handler for it. But it | ||
1040 | * does not have the oneshot flag set. In combination | ||
1041 | * with level interrupts this is deadly, because the | ||
1042 | * default primary handler just wakes the thread, then | ||
1043 | * the irq lines is reenabled, but the device still | ||
1044 | * has the level irq asserted. Rinse and repeat.... | ||
1045 | * | ||
1046 | * While this works for edge type interrupts, we play | ||
1047 | * it safe and reject unconditionally because we can't | ||
1048 | * say for sure which type this interrupt really | ||
1049 | * has. The type flags are unreliable as the | ||
1050 | * underlying chip implementation can override them. | ||
1051 | */ | ||
1052 | pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", | ||
1053 | irq); | ||
1054 | ret = -EINVAL; | ||
1055 | goto out_mask; | ||
1034 | } | 1056 | } |
1035 | 1057 | ||
1036 | if (!shared) { | 1058 | if (!shared) { |
@@ -1078,7 +1100,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1078 | 1100 | ||
1079 | if (nmsk != omsk) | 1101 | if (nmsk != omsk) |
1080 | /* hope the handler works with current trigger mode */ | 1102 | /* hope the handler works with current trigger mode */ |
1081 | pr_warning("IRQ %d uses trigger mode %u; requested %u\n", | 1103 | pr_warning("irq %d uses trigger mode %u; requested %u\n", |
1082 | irq, nmsk, omsk); | 1104 | irq, nmsk, omsk); |
1083 | } | 1105 | } |
1084 | 1106 | ||
@@ -1115,14 +1137,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1115 | return 0; | 1137 | return 0; |
1116 | 1138 | ||
1117 | mismatch: | 1139 | mismatch: |
1118 | #ifdef CONFIG_DEBUG_SHIRQ | ||
1119 | if (!(new->flags & IRQF_PROBE_SHARED)) { | 1140 | if (!(new->flags & IRQF_PROBE_SHARED)) { |
1120 | printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); | 1141 | pr_err("Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n", |
1121 | if (old_name) | 1142 | irq, new->flags, new->name, old->flags, old->name); |
1122 | printk(KERN_ERR "current handler: %s\n", old_name); | 1143 | #ifdef CONFIG_DEBUG_SHIRQ |
1123 | dump_stack(); | 1144 | dump_stack(); |
1124 | } | ||
1125 | #endif | 1145 | #endif |
1146 | } | ||
1126 | ret = -EBUSY; | 1147 | ret = -EBUSY; |
1127 | 1148 | ||
1128 | out_mask: | 1149 | out_mask: |
@@ -1204,12 +1225,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
1204 | /* Found it - now remove it from the list of entries: */ | 1225 | /* Found it - now remove it from the list of entries: */ |
1205 | *action_ptr = action->next; | 1226 | *action_ptr = action->next; |
1206 | 1227 | ||
1207 | /* Currently used only by UML, might disappear one day: */ | ||
1208 | #ifdef CONFIG_IRQ_RELEASE_METHOD | ||
1209 | if (desc->irq_data.chip->release) | ||
1210 | desc->irq_data.chip->release(irq, dev_id); | ||
1211 | #endif | ||
1212 | |||
1213 | /* If this was the last handler, shut down the IRQ line: */ | 1228 | /* If this was the last handler, shut down the IRQ line: */ |
1214 | if (!desc->action) | 1229 | if (!desc->action) |
1215 | irq_shutdown(desc); | 1230 | irq_shutdown(desc); |
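The manage.c changes replace the per-task irq_thread flag and the do_exit() hook exit_irq_thread() with a task_work item (irq_thread_dtor) that runs automatically if the irq thread dies, and is cancelled on the normal return path. The sketch below applies the same pattern to an ordinary kthread, using the task_work calls exactly as this patch uses them; my_thread_fn and my_dtor are illustrative names.

#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/task_work.h>

/* Exit callback: fires from the task_work machinery if the thread dies
 * without reaching its normal return path. */
static void my_dtor(struct task_work *unused)
{
        pr_err("kthread \"%s\" (%d) exited with cleanup outstanding\n",
               current->comm, current->pid);
}

static int my_thread_fn(void *data)
{
        struct task_work on_exit_work;

        init_task_work(&on_exit_work, my_dtor, NULL);
        task_work_add(current, &on_exit_work, false);

        while (!kthread_should_stop()) {
                set_current_state(TASK_INTERRUPTIBLE);
                schedule();
        }
        __set_current_state(TASK_RUNNING);

        /* Normal exit: make sure the destructor never runs. */
        task_work_cancel(current, my_dtor);
        return 0;
}

static struct task_struct *my_task;

static int __init my_thread_init(void)
{
        my_task = kthread_run(my_thread_fn, NULL, "my-worker");
        return IS_ERR(my_task) ? PTR_ERR(my_task) : 0;
}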
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index c3c89751b327..ca3f4aaff707 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c | |||
@@ -42,17 +42,8 @@ void irq_move_masked_irq(struct irq_data *idata) | |||
42 | * For correct operation this depends on the caller | 42 | * For correct operation this depends on the caller |
43 | * masking the irqs. | 43 | * masking the irqs. |
44 | */ | 44 | */ |
45 | if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) | 45 | if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) |
46 | < nr_cpu_ids)) { | 46 | irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false); |
47 | int ret = chip->irq_set_affinity(&desc->irq_data, | ||
48 | desc->pending_mask, false); | ||
49 | switch (ret) { | ||
50 | case IRQ_SET_MASK_OK: | ||
51 | cpumask_copy(desc->irq_data.affinity, desc->pending_mask); | ||
52 | case IRQ_SET_MASK_OK_NOCOPY: | ||
53 | irq_set_thread_affinity(desc); | ||
54 | } | ||
55 | } | ||
56 | 47 | ||
57 | cpumask_clear(desc->pending_mask); | 48 | cpumask_clear(desc->pending_mask); |
58 | } | 49 | } |
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 15e53b1766a6..cb228bf21760 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c | |||
@@ -103,8 +103,13 @@ int check_wakeup_irqs(void) | |||
103 | int irq; | 103 | int irq; |
104 | 104 | ||
105 | for_each_irq_desc(irq, desc) { | 105 | for_each_irq_desc(irq, desc) { |
106 | /* | ||
107 | * Only interrupts which are marked as wakeup source | ||
108 | * and have not been disabled before the suspend check | ||
109 | * can abort suspend. | ||
110 | */ | ||
106 | if (irqd_is_wakeup_set(&desc->irq_data)) { | 111 | if (irqd_is_wakeup_set(&desc->irq_data)) { |
107 | if (desc->istate & IRQS_PENDING) | 112 | if (desc->depth == 1 && desc->istate & IRQS_PENDING) |
108 | return -EBUSY; | 113 | return -EBUSY; |
109 | continue; | 114 | continue; |
110 | } | 115 | } |
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 14dd5761e8c9..6454db7b6a4d 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c | |||
@@ -58,10 +58,13 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) | |||
58 | /* | 58 | /* |
59 | * We do not resend level type interrupts. Level type | 59 | * We do not resend level type interrupts. Level type |
60 | * interrupts are resent by hardware when they are still | 60 | * interrupts are resent by hardware when they are still |
61 | * active. | 61 | * active. Clear the pending bit so suspend/resume does not |
62 | * get confused. | ||
62 | */ | 63 | */ |
63 | if (irq_settings_is_level(desc)) | 64 | if (irq_settings_is_level(desc)) { |
65 | desc->istate &= ~IRQS_PENDING; | ||
64 | return; | 66 | return; |
67 | } | ||
65 | if (desc->istate & IRQS_REPLAY) | 68 | if (desc->istate & IRQS_REPLAY) |
66 | return; | 69 | return; |
67 | if (desc->istate & IRQS_PENDING) { | 70 | if (desc->istate & IRQS_PENDING) { |
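Taken together, the chip.c, pm.c and resend.c hunks make a disabled interrupt that fires during suspend visible to the wakeup path: the flow handlers set IRQS_PENDING, check_wakeup_irqs() aborts suspend when a wakeup interrupt at disable depth 1 is pending, and check_irq_resend() clears the bit for level interrupts so it cannot go stale. The standalone toy below models that flow; the struct fields only loosely mirror irq_desc and are not kernel code.

#include <stdio.h>
#include <stdbool.h>

struct toy_desc {
        bool has_action;    /* a handler is installed */
        bool disabled;      /* irqd_irq_disabled() */
        bool wakeup;        /* irqd_is_wakeup_set() */
        bool pending;       /* IRQS_PENDING */
        unsigned int depth; /* disable depth */
};

/* Flow handler: cannot run the handler, so remember the event. */
static void toy_handle_irq(struct toy_desc *d)
{
        if (!d->has_action || d->disabled) {
                d->pending = true;
                return;
        }
        /* ...handle_irq_event()... */
}

/* Suspend check: a pending wakeup interrupt (depth 1) aborts suspend. */
static int toy_check_wakeup_irqs(const struct toy_desc *descs, int n)
{
        int i;

        for (i = 0; i < n; i++)
                if (descs[i].wakeup && descs[i].depth == 1 && descs[i].pending)
                        return -1;      /* -EBUSY */
        return 0;
}

int main(void)
{
        struct toy_desc wake_irq = {
                .has_action = true, .disabled = true,
                .wakeup = true, .depth = 1,
        };

        toy_handle_irq(&wake_irq);      /* fires while devices are suspending */
        printf("suspend %s\n",
               toy_check_wakeup_irqs(&wake_irq, 1) ? "aborted" : "continues");
        return 0;
}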
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 079f1d39a8b8..2169feeba529 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
@@ -343,7 +343,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, | |||
343 | 343 | ||
344 | /* Look up a kernel symbol and return it in a text buffer. */ | 344 | /* Look up a kernel symbol and return it in a text buffer. */ |
345 | static int __sprint_symbol(char *buffer, unsigned long address, | 345 | static int __sprint_symbol(char *buffer, unsigned long address, |
346 | int symbol_offset) | 346 | int symbol_offset, int add_offset) |
347 | { | 347 | { |
348 | char *modname; | 348 | char *modname; |
349 | const char *name; | 349 | const char *name; |
@@ -358,13 +358,13 @@ static int __sprint_symbol(char *buffer, unsigned long address, | |||
358 | if (name != buffer) | 358 | if (name != buffer) |
359 | strcpy(buffer, name); | 359 | strcpy(buffer, name); |
360 | len = strlen(buffer); | 360 | len = strlen(buffer); |
361 | buffer += len; | ||
362 | offset -= symbol_offset; | 361 | offset -= symbol_offset; |
363 | 362 | ||
363 | if (add_offset) | ||
364 | len += sprintf(buffer + len, "+%#lx/%#lx", offset, size); | ||
365 | |||
364 | if (modname) | 366 | if (modname) |
365 | len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname); | 367 | len += sprintf(buffer + len, " [%s]", modname); |
366 | else | ||
367 | len += sprintf(buffer, "+%#lx/%#lx", offset, size); | ||
368 | 368 | ||
369 | return len; | 369 | return len; |
370 | } | 370 | } |
@@ -382,12 +382,28 @@ static int __sprint_symbol(char *buffer, unsigned long address, | |||
382 | */ | 382 | */ |
383 | int sprint_symbol(char *buffer, unsigned long address) | 383 | int sprint_symbol(char *buffer, unsigned long address) |
384 | { | 384 | { |
385 | return __sprint_symbol(buffer, address, 0); | 385 | return __sprint_symbol(buffer, address, 0, 1); |
386 | } | 386 | } |
387 | |||
388 | EXPORT_SYMBOL_GPL(sprint_symbol); | 387 | EXPORT_SYMBOL_GPL(sprint_symbol); |
389 | 388 | ||
390 | /** | 389 | /** |
390 | * sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer | ||
391 | * @buffer: buffer to be stored | ||
392 | * @address: address to lookup | ||
393 | * | ||
394 | * This function looks up a kernel symbol with @address and stores its name | ||
395 | * and module name to @buffer if possible. If no symbol was found, just saves | ||
396 | * its @address as is. | ||
397 | * | ||
398 | * This function returns the number of bytes stored in @buffer. | ||
399 | */ | ||
400 | int sprint_symbol_no_offset(char *buffer, unsigned long address) | ||
401 | { | ||
402 | return __sprint_symbol(buffer, address, 0, 0); | ||
403 | } | ||
404 | EXPORT_SYMBOL_GPL(sprint_symbol_no_offset); | ||
405 | |||
406 | /** | ||
391 | * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer | 407 | * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer |
392 | * @buffer: buffer to be stored | 408 | * @buffer: buffer to be stored |
393 | * @address: address to lookup | 409 | * @address: address to lookup |
@@ -403,7 +419,7 @@ EXPORT_SYMBOL_GPL(sprint_symbol); | |||
403 | */ | 419 | */ |
404 | int sprint_backtrace(char *buffer, unsigned long address) | 420 | int sprint_backtrace(char *buffer, unsigned long address) |
405 | { | 421 | { |
406 | return __sprint_symbol(buffer, address, -1); | 422 | return __sprint_symbol(buffer, address, -1, 1); |
407 | } | 423 | } |
408 | 424 | ||
409 | /* Look up a kernel symbol and print it to the kernel messages. */ | 425 | /* Look up a kernel symbol and print it to the kernel messages. */ |
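kallsyms.c refactors __sprint_symbol() to take an add_offset flag: sprint_symbol() keeps the "name+0xoff/0xsize [module]" form, while the new sprint_symbol_no_offset() drops the offset/size part. A small in-kernel sketch comparing the two follows; the KSYM_SYMBOL_LEN buffer size is assumed from <linux/kallsyms.h>, and the address passed in is arbitrary.

#include <linux/kallsyms.h>
#include <linux/kernel.h>

static void show_symbol_formats(unsigned long addr)
{
        char with_offset[KSYM_SYMBOL_LEN];
        char name_only[KSYM_SYMBOL_LEN];

        sprint_symbol(with_offset, addr);          /* "func+0x1c/0x2a0 [mod]" */
        sprint_symbol_no_offset(name_only, addr);  /* "func [mod]"            */

        pr_info("%#lx -> '%s' vs '%s'\n", addr, with_offset, name_only);
}

/* Example: show_symbol_formats((unsigned long)schedule); */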
diff --git a/kernel/kcmp.c b/kernel/kcmp.c new file mode 100644 index 000000000000..30b7b225306c --- /dev/null +++ b/kernel/kcmp.c | |||
@@ -0,0 +1,196 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/syscalls.h> | ||
3 | #include <linux/fdtable.h> | ||
4 | #include <linux/string.h> | ||
5 | #include <linux/random.h> | ||
6 | #include <linux/module.h> | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/errno.h> | ||
9 | #include <linux/cache.h> | ||
10 | #include <linux/bug.h> | ||
11 | #include <linux/err.h> | ||
12 | #include <linux/kcmp.h> | ||
13 | |||
14 | #include <asm/unistd.h> | ||
15 | |||
16 | /* | ||
17 | * We don't expose the real in-memory order of objects for security reasons. | ||
18 | * But still the comparison results should be suitable for sorting. So we | ||
19 | * obfuscate kernel pointers values and compare the production instead. | ||
20 | * | ||
21 | * The obfuscation is done in two steps. First we xor the kernel pointer with | ||
22 | * a random value, which puts pointer into a new position in a reordered space. | ||
23 | * Secondly we multiply the xor production with a large odd random number to | ||
24 | * permute its bits even more (the odd multiplier guarantees that the product | ||
25 | * is unique even after the high bits are truncated, since any odd number is | ||
26 | * relatively prime to 2^n). | ||
27 | * | ||
28 | * Note also that the obfuscation itself is invisible to userspace and if needed | ||
29 | * it can be changed to an alternate scheme. | ||
30 | */ | ||
31 | static unsigned long cookies[KCMP_TYPES][2] __read_mostly; | ||
32 | |||
33 | static long kptr_obfuscate(long v, int type) | ||
34 | { | ||
35 | return (v ^ cookies[type][0]) * cookies[type][1]; | ||
36 | } | ||
37 | |||
38 | /* | ||
39 | * 0 - equal, i.e. v1 = v2 | ||
40 | * 1 - less than, i.e. v1 < v2 | ||
41 | * 2 - greater than, i.e. v1 > v2 | ||
42 | * 3 - not equal but ordering unavailable (reserved for future) | ||
43 | */ | ||
44 | static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) | ||
45 | { | ||
46 | long ret; | ||
47 | |||
48 | ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type); | ||
49 | |||
50 | return (ret < 0) | ((ret > 0) << 1); | ||
51 | } | ||
52 | |||
53 | /* The caller must have pinned the task */ | ||
54 | static struct file * | ||
55 | get_file_raw_ptr(struct task_struct *task, unsigned int idx) | ||
56 | { | ||
57 | struct file *file = NULL; | ||
58 | |||
59 | task_lock(task); | ||
60 | rcu_read_lock(); | ||
61 | |||
62 | if (task->files) | ||
63 | file = fcheck_files(task->files, idx); | ||
64 | |||
65 | rcu_read_unlock(); | ||
66 | task_unlock(task); | ||
67 | |||
68 | return file; | ||
69 | } | ||
70 | |||
71 | static void kcmp_unlock(struct mutex *m1, struct mutex *m2) | ||
72 | { | ||
73 | if (likely(m2 != m1)) | ||
74 | mutex_unlock(m2); | ||
75 | mutex_unlock(m1); | ||
76 | } | ||
77 | |||
78 | static int kcmp_lock(struct mutex *m1, struct mutex *m2) | ||
79 | { | ||
80 | int err; | ||
81 | |||
82 | if (m2 > m1) | ||
83 | swap(m1, m2); | ||
84 | |||
85 | err = mutex_lock_killable(m1); | ||
86 | if (!err && likely(m1 != m2)) { | ||
87 | err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING); | ||
88 | if (err) | ||
89 | mutex_unlock(m1); | ||
90 | } | ||
91 | |||
92 | return err; | ||
93 | } | ||
94 | |||
95 | SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, | ||
96 | unsigned long, idx1, unsigned long, idx2) | ||
97 | { | ||
98 | struct task_struct *task1, *task2; | ||
99 | int ret; | ||
100 | |||
101 | rcu_read_lock(); | ||
102 | |||
103 | /* | ||
104 | * Tasks are looked up in caller's PID namespace only. | ||
105 | */ | ||
106 | task1 = find_task_by_vpid(pid1); | ||
107 | task2 = find_task_by_vpid(pid2); | ||
108 | if (!task1 || !task2) | ||
109 | goto err_no_task; | ||
110 | |||
111 | get_task_struct(task1); | ||
112 | get_task_struct(task2); | ||
113 | |||
114 | rcu_read_unlock(); | ||
115 | |||
116 | /* | ||
117 | * One should have enough rights to inspect task details. | ||
118 | */ | ||
119 | ret = kcmp_lock(&task1->signal->cred_guard_mutex, | ||
120 | &task2->signal->cred_guard_mutex); | ||
121 | if (ret) | ||
122 | goto err; | ||
123 | if (!ptrace_may_access(task1, PTRACE_MODE_READ) || | ||
124 | !ptrace_may_access(task2, PTRACE_MODE_READ)) { | ||
125 | ret = -EPERM; | ||
126 | goto err_unlock; | ||
127 | } | ||
128 | |||
129 | switch (type) { | ||
130 | case KCMP_FILE: { | ||
131 | struct file *filp1, *filp2; | ||
132 | |||
133 | filp1 = get_file_raw_ptr(task1, idx1); | ||
134 | filp2 = get_file_raw_ptr(task2, idx2); | ||
135 | |||
136 | if (filp1 && filp2) | ||
137 | ret = kcmp_ptr(filp1, filp2, KCMP_FILE); | ||
138 | else | ||
139 | ret = -EBADF; | ||
140 | break; | ||
141 | } | ||
142 | case KCMP_VM: | ||
143 | ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM); | ||
144 | break; | ||
145 | case KCMP_FILES: | ||
146 | ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES); | ||
147 | break; | ||
148 | case KCMP_FS: | ||
149 | ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS); | ||
150 | break; | ||
151 | case KCMP_SIGHAND: | ||
152 | ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND); | ||
153 | break; | ||
154 | case KCMP_IO: | ||
155 | ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO); | ||
156 | break; | ||
157 | case KCMP_SYSVSEM: | ||
158 | #ifdef CONFIG_SYSVIPC | ||
159 | ret = kcmp_ptr(task1->sysvsem.undo_list, | ||
160 | task2->sysvsem.undo_list, | ||
161 | KCMP_SYSVSEM); | ||
162 | #else | ||
163 | ret = -EOPNOTSUPP; | ||
164 | #endif | ||
165 | break; | ||
166 | default: | ||
167 | ret = -EINVAL; | ||
168 | break; | ||
169 | } | ||
170 | |||
171 | err_unlock: | ||
172 | kcmp_unlock(&task1->signal->cred_guard_mutex, | ||
173 | &task2->signal->cred_guard_mutex); | ||
174 | err: | ||
175 | put_task_struct(task1); | ||
176 | put_task_struct(task2); | ||
177 | |||
178 | return ret; | ||
179 | |||
180 | err_no_task: | ||
181 | rcu_read_unlock(); | ||
182 | return -ESRCH; | ||
183 | } | ||
184 | |||
185 | static __init int kcmp_cookies_init(void) | ||
186 | { | ||
187 | int i; | ||
188 | |||
189 | get_random_bytes(cookies, sizeof(cookies)); | ||
190 | |||
191 | for (i = 0; i < KCMP_TYPES; i++) | ||
192 | cookies[i][1] |= (~(~0UL >> 1) | 1); | ||
193 | |||
194 | return 0; | ||
195 | } | ||
196 | arch_initcall(kcmp_cookies_init); | ||
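The new kcmp(2) syscall lets checkpoint/restore tooling ask whether two tasks share a kernel object (file, mm, fd table, ...) without exposing kernel pointers. A minimal userspace sketch follows; it assumes __NR_kcmp and <linux/kcmp.h> are available from the installed headers (in this series the syscall is wired up for x86 under CONFIG_CHECKPOINT_RESTORE only). After fork(), fd 0 refers to the same struct file (result 0) while the mm is a copy (result 1 or 2).

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <linux/kcmp.h>         /* KCMP_FILE, KCMP_VM, ... */

/* Assumes __NR_kcmp is defined by the installed kernel headers. */
static long sys_kcmp(pid_t pid1, pid_t pid2, int type,
                     unsigned long idx1, unsigned long idx2)
{
        return syscall(__NR_kcmp, pid1, pid2, type, idx1, idx2);
}

int main(void)
{
        pid_t child = fork();

        if (child == 0) {
                pause();        /* keep the child around for the comparison */
                _exit(0);
        }

        /* 0: same object, 1/2: ordered "less"/"greater", <0: error */
        printf("fd 0 shared with child: %ld\n",
               sys_kcmp(getpid(), child, KCMP_FILE, 0, 0));
        printf("mm shared with child:   %ld\n",
               sys_kcmp(getpid(), child, KCMP_VM, 0, 0));

        kill(child, SIGKILL);
        waitpid(child, NULL, 0);
        return 0;
}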
diff --git a/kernel/kfifo.c b/kernel/kfifo.c index c744b88c44e2..59dcf5b81d24 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c | |||
@@ -402,6 +402,7 @@ unsigned int __kfifo_max_r(unsigned int len, size_t recsize) | |||
402 | return max; | 402 | return max; |
403 | return len; | 403 | return len; |
404 | } | 404 | } |
405 | EXPORT_SYMBOL(__kfifo_max_r); | ||
405 | 406 | ||
406 | #define __KFIFO_PEEK(data, out, mask) \ | 407 | #define __KFIFO_PEEK(data, out, mask) \ |
407 | ((data)[(out) & (mask)]) | 408 | ((data)[(out) & (mask)]) |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 05698a7415fe..ff2c7cb86d77 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -221,13 +221,12 @@ fail: | |||
221 | return 0; | 221 | return 0; |
222 | } | 222 | } |
223 | 223 | ||
224 | void call_usermodehelper_freeinfo(struct subprocess_info *info) | 224 | static void call_usermodehelper_freeinfo(struct subprocess_info *info) |
225 | { | 225 | { |
226 | if (info->cleanup) | 226 | if (info->cleanup) |
227 | (*info->cleanup)(info); | 227 | (*info->cleanup)(info); |
228 | kfree(info); | 228 | kfree(info); |
229 | } | 229 | } |
230 | EXPORT_SYMBOL(call_usermodehelper_freeinfo); | ||
231 | 230 | ||
232 | static void umh_complete(struct subprocess_info *sub_info) | 231 | static void umh_complete(struct subprocess_info *sub_info) |
233 | { | 232 | { |
@@ -410,7 +409,7 @@ EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); | |||
410 | 409 | ||
411 | /** | 410 | /** |
412 | * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. | 411 | * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. |
413 | * depth: New value to assign to usermodehelper_disabled. | 412 | * @depth: New value to assign to usermodehelper_disabled. |
414 | * | 413 | * |
415 | * Change the value of usermodehelper_disabled (under umhelper_sem locked for | 414 | * Change the value of usermodehelper_disabled (under umhelper_sem locked for |
416 | * writing) and wakeup tasks waiting for it to change. | 415 | * writing) and wakeup tasks waiting for it to change. |
@@ -479,6 +478,7 @@ static void helper_unlock(void) | |||
479 | * structure. This should be passed to call_usermodehelper_exec to | 478 | * structure. This should be passed to call_usermodehelper_exec to |
480 | * exec the process and free the structure. | 479 | * exec the process and free the structure. |
481 | */ | 480 | */ |
481 | static | ||
482 | struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, | 482 | struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, |
483 | char **envp, gfp_t gfp_mask) | 483 | char **envp, gfp_t gfp_mask) |
484 | { | 484 | { |
@@ -494,7 +494,6 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, | |||
494 | out: | 494 | out: |
495 | return sub_info; | 495 | return sub_info; |
496 | } | 496 | } |
497 | EXPORT_SYMBOL(call_usermodehelper_setup); | ||
498 | 497 | ||
499 | /** | 498 | /** |
500 | * call_usermodehelper_setfns - set a cleanup/init function | 499 | * call_usermodehelper_setfns - set a cleanup/init function |
@@ -512,6 +511,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); | |||
512 | * Function must be runnable in either a process context or the | 511 | * Function must be runnable in either a process context or the |
513 | * context in which call_usermodehelper_exec is called. | 512 | * context in which call_usermodehelper_exec is called. |
514 | */ | 513 | */ |
514 | static | ||
515 | void call_usermodehelper_setfns(struct subprocess_info *info, | 515 | void call_usermodehelper_setfns(struct subprocess_info *info, |
516 | int (*init)(struct subprocess_info *info, struct cred *new), | 516 | int (*init)(struct subprocess_info *info, struct cred *new), |
517 | void (*cleanup)(struct subprocess_info *info), | 517 | void (*cleanup)(struct subprocess_info *info), |
@@ -521,7 +521,6 @@ void call_usermodehelper_setfns(struct subprocess_info *info, | |||
521 | info->init = init; | 521 | info->init = init; |
522 | info->data = data; | 522 | info->data = data; |
523 | } | 523 | } |
524 | EXPORT_SYMBOL(call_usermodehelper_setfns); | ||
525 | 524 | ||
526 | /** | 525 | /** |
527 | * call_usermodehelper_exec - start a usermode application | 526 | * call_usermodehelper_exec - start a usermode application |
@@ -535,6 +534,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns); | |||
535 | * asynchronously if wait is not set, and runs as a child of keventd. | 534 | * asynchronously if wait is not set, and runs as a child of keventd. |
536 | * (ie. it runs with full root capabilities). | 535 | * (ie. it runs with full root capabilities). |
537 | */ | 536 | */ |
537 | static | ||
538 | int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) | 538 | int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) |
539 | { | 539 | { |
540 | DECLARE_COMPLETION_ONSTACK(done); | 540 | DECLARE_COMPLETION_ONSTACK(done); |
@@ -576,7 +576,25 @@ unlock: | |||
576 | helper_unlock(); | 576 | helper_unlock(); |
577 | return retval; | 577 | return retval; |
578 | } | 578 | } |
579 | EXPORT_SYMBOL(call_usermodehelper_exec); | 579 | |
580 | int call_usermodehelper_fns( | ||
581 | char *path, char **argv, char **envp, int wait, | ||
582 | int (*init)(struct subprocess_info *info, struct cred *new), | ||
583 | void (*cleanup)(struct subprocess_info *), void *data) | ||
584 | { | ||
585 | struct subprocess_info *info; | ||
586 | gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; | ||
587 | |||
588 | info = call_usermodehelper_setup(path, argv, envp, gfp_mask); | ||
589 | |||
590 | if (info == NULL) | ||
591 | return -ENOMEM; | ||
592 | |||
593 | call_usermodehelper_setfns(info, init, cleanup, data); | ||
594 | |||
595 | return call_usermodehelper_exec(info, wait); | ||
596 | } | ||
597 | EXPORT_SYMBOL(call_usermodehelper_fns); | ||
580 | 598 | ||
581 | static int proc_cap_handler(struct ctl_table *table, int write, | 599 | static int proc_cap_handler(struct ctl_table *table, int write, |
582 | void __user *buffer, size_t *lenp, loff_t *ppos) | 600 | void __user *buffer, size_t *lenp, loff_t *ppos) |
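[Editor's sketch, not part of the series] With call_usermodehelper_setup(), _setfns() and _exec() made static, call_usermodehelper_fns() becomes the single exported entry point. A minimal caller might look like the following; the helper path, arguments and function name are illustrative only:

	#include <linux/kmod.h>

	/* Hypothetical caller: run a user-space helper and wait for it to start. */
	static int run_example_helper(void)
	{
		char *argv[] = { "/sbin/example-helper", "--oneshot", NULL };
		char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

		/* No init/cleanup callbacks and no private data are needed here. */
		return call_usermodehelper_fns("/sbin/example-helper", argv, envp,
					       UMH_WAIT_EXEC, NULL, NULL, NULL);
	}

Passing UMH_WAIT_PROC instead would make the call wait for the helper to exit rather than just to exec.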
diff --git a/kernel/lglock.c b/kernel/lglock.c new file mode 100644 index 000000000000..6535a667a5a7 --- /dev/null +++ b/kernel/lglock.c | |||
@@ -0,0 +1,89 @@ | |||
1 | /* See include/linux/lglock.h for description */ | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/lglock.h> | ||
4 | #include <linux/cpu.h> | ||
5 | #include <linux/string.h> | ||
6 | |||
7 | /* | ||
8 | * Note there is no uninit, so lglocks cannot be defined in | ||
9 | * modules (but it's fine to use them from there). | ||
10 | * It could be added, though; just undo lg_lock_init. | ||
Wait, formatting note removed.
11 | */ | ||
12 | |||
13 | void lg_lock_init(struct lglock *lg, char *name) | ||
14 | { | ||
15 | LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); | ||
16 | } | ||
17 | EXPORT_SYMBOL(lg_lock_init); | ||
18 | |||
19 | void lg_local_lock(struct lglock *lg) | ||
20 | { | ||
21 | arch_spinlock_t *lock; | ||
22 | |||
23 | preempt_disable(); | ||
24 | rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); | ||
25 | lock = this_cpu_ptr(lg->lock); | ||
26 | arch_spin_lock(lock); | ||
27 | } | ||
28 | EXPORT_SYMBOL(lg_local_lock); | ||
29 | |||
30 | void lg_local_unlock(struct lglock *lg) | ||
31 | { | ||
32 | arch_spinlock_t *lock; | ||
33 | |||
34 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
35 | lock = this_cpu_ptr(lg->lock); | ||
36 | arch_spin_unlock(lock); | ||
37 | preempt_enable(); | ||
38 | } | ||
39 | EXPORT_SYMBOL(lg_local_unlock); | ||
40 | |||
41 | void lg_local_lock_cpu(struct lglock *lg, int cpu) | ||
42 | { | ||
43 | arch_spinlock_t *lock; | ||
44 | |||
45 | preempt_disable(); | ||
46 | rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); | ||
47 | lock = per_cpu_ptr(lg->lock, cpu); | ||
48 | arch_spin_lock(lock); | ||
49 | } | ||
50 | EXPORT_SYMBOL(lg_local_lock_cpu); | ||
51 | |||
52 | void lg_local_unlock_cpu(struct lglock *lg, int cpu) | ||
53 | { | ||
54 | arch_spinlock_t *lock; | ||
55 | |||
56 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
57 | lock = per_cpu_ptr(lg->lock, cpu); | ||
58 | arch_spin_unlock(lock); | ||
59 | preempt_enable(); | ||
60 | } | ||
61 | EXPORT_SYMBOL(lg_local_unlock_cpu); | ||
62 | |||
63 | void lg_global_lock(struct lglock *lg) | ||
64 | { | ||
65 | int i; | ||
66 | |||
67 | preempt_disable(); | ||
68 | rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); | ||
69 | for_each_possible_cpu(i) { | ||
70 | arch_spinlock_t *lock; | ||
71 | lock = per_cpu_ptr(lg->lock, i); | ||
72 | arch_spin_lock(lock); | ||
73 | } | ||
74 | } | ||
75 | EXPORT_SYMBOL(lg_global_lock); | ||
76 | |||
77 | void lg_global_unlock(struct lglock *lg) | ||
78 | { | ||
79 | int i; | ||
80 | |||
81 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | ||
82 | for_each_possible_cpu(i) { | ||
83 | arch_spinlock_t *lock; | ||
84 | lock = per_cpu_ptr(lg->lock, i); | ||
85 | arch_spin_unlock(lock); | ||
86 | } | ||
87 | preempt_enable(); | ||
88 | } | ||
89 | EXPORT_SYMBOL(lg_global_unlock); | ||
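[Editor's sketch, not part of the series] For context, this is how the new lglock functions are meant to be consumed. DEFINE_LGLOCK() is assumed to be provided by <linux/lglock.h> alongside these functions; the lock name and the code touching it are made up:

	#include <linux/init.h>
	#include <linux/lglock.h>

	DEFINE_LGLOCK(example_lglock);

	static int __init example_init(void)
	{
		/* Hook the lock up to lockdep before first use. */
		lg_lock_init(&example_lglock, "example_lglock");
		return 0;
	}

	static void example_touch_local(void)
	{
		/* Fast path: only this CPU's spinlock is taken, no cache-line bouncing. */
		lg_local_lock(&example_lglock);
		/* ... modify this CPU's share of the protected data ... */
		lg_local_unlock(&example_lglock);
	}

	static void example_walk_all(void)
	{
		/* Slow path: takes every CPU's lock, excluding all local holders. */
		lg_global_lock(&example_lglock);
		/* ... traverse the data of all CPUs ... */
		lg_global_unlock(&example_lglock);
	}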
diff --git a/kernel/module.c b/kernel/module.c index 78ac6ec1e425..4edbd9c11aca 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -2429,7 +2429,8 @@ static int copy_and_check(struct load_info *info, | |||
2429 | goto free_hdr; | 2429 | goto free_hdr; |
2430 | } | 2430 | } |
2431 | 2431 | ||
2432 | if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) { | 2432 | if (hdr->e_shoff >= len || |
2433 | hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) { | ||
2433 | err = -ENOEXEC; | 2434 | err = -ENOEXEC; |
2434 | goto free_hdr; | 2435 | goto free_hdr; |
2435 | } | 2436 | } |
@@ -2953,7 +2954,7 @@ static struct module *load_module(void __user *umod, | |||
2953 | 2954 | ||
2954 | /* Module is ready to execute: parsing args may do that. */ | 2955 | /* Module is ready to execute: parsing args may do that. */ |
2955 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, | 2956 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, |
2956 | -32768, 32767, NULL); | 2957 | -32768, 32767, &ddebug_dyndbg_module_param_cb); |
2957 | if (err < 0) | 2958 | if (err < 0) |
2958 | goto unlink; | 2959 | goto unlink; |
2959 | 2960 | ||
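[Editor's note] The reworked section-header check avoids computing hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr), whose wrap-around could let a malformed image pass the old comparison. The same pattern in isolation, as a hedged sketch with made-up names (the division below also guards the multiplication, which the kernel code does not need because e_shnum is a 16-bit field):

	#include <linux/types.h>

	/*
	 * Illustrative only: does a table of 'count' entries of 'entry_size'
	 * bytes, starting at 'offset', fit in a buffer of 'len' bytes?  No sum
	 * or product of untrusted values is formed, so nothing can overflow.
	 * entry_size is assumed non-zero (a sizeof() in practice).
	 */
	static bool table_fits(size_t len, size_t offset, size_t count, size_t entry_size)
	{
		if (offset >= len)
			return false;
		return count <= (len - offset) / entry_size;
	}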
diff --git a/kernel/panic.c b/kernel/panic.c index 8ed89a175d79..d2a5f4ecc6dd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -27,7 +27,7 @@ | |||
27 | #define PANIC_TIMER_STEP 100 | 27 | #define PANIC_TIMER_STEP 100 |
28 | #define PANIC_BLINK_SPD 18 | 28 | #define PANIC_BLINK_SPD 18 |
29 | 29 | ||
30 | int panic_on_oops; | 30 | int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; |
31 | static unsigned long tainted_mask; | 31 | static unsigned long tainted_mask; |
32 | static int pause_on_oops; | 32 | static int pause_on_oops; |
33 | static int pause_on_oops_flag; | 33 | static int pause_on_oops_flag; |
@@ -108,8 +108,6 @@ void panic(const char *fmt, ...) | |||
108 | */ | 108 | */ |
109 | crash_kexec(NULL); | 109 | crash_kexec(NULL); |
110 | 110 | ||
111 | kmsg_dump(KMSG_DUMP_PANIC); | ||
112 | |||
113 | /* | 111 | /* |
114 | * Note smp_send_stop is the usual smp shutdown function, which | 112 | * Note smp_send_stop is the usual smp shutdown function, which |
115 | * unfortunately means it may not be hardened to work in a panic | 113 | * unfortunately means it may not be hardened to work in a panic |
@@ -117,6 +115,8 @@ void panic(const char *fmt, ...) | |||
117 | */ | 115 | */ |
118 | smp_send_stop(); | 116 | smp_send_stop(); |
119 | 117 | ||
118 | kmsg_dump(KMSG_DUMP_PANIC); | ||
119 | |||
120 | atomic_notifier_call_chain(&panic_notifier_list, 0, buf); | 120 | atomic_notifier_call_chain(&panic_notifier_list, 0, buf); |
121 | 121 | ||
122 | bust_spinlocks(0); | 122 | bust_spinlocks(0); |
diff --git a/kernel/params.c b/kernel/params.c index f37d82631347..ed35345be536 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -85,11 +85,13 @@ bool parameq(const char *a, const char *b) | |||
85 | 85 | ||
86 | static int parse_one(char *param, | 86 | static int parse_one(char *param, |
87 | char *val, | 87 | char *val, |
88 | const char *doing, | ||
88 | const struct kernel_param *params, | 89 | const struct kernel_param *params, |
89 | unsigned num_params, | 90 | unsigned num_params, |
90 | s16 min_level, | 91 | s16 min_level, |
91 | s16 max_level, | 92 | s16 max_level, |
92 | int (*handle_unknown)(char *param, char *val)) | 93 | int (*handle_unknown)(char *param, char *val, |
94 | const char *doing)) | ||
93 | { | 95 | { |
94 | unsigned int i; | 96 | unsigned int i; |
95 | int err; | 97 | int err; |
@@ -104,8 +106,8 @@ static int parse_one(char *param, | |||
104 | if (!val && params[i].ops->set != param_set_bool | 106 | if (!val && params[i].ops->set != param_set_bool |
105 | && params[i].ops->set != param_set_bint) | 107 | && params[i].ops->set != param_set_bint) |
106 | return -EINVAL; | 108 | return -EINVAL; |
107 | pr_debug("They are equal! Calling %p\n", | 109 | pr_debug("handling %s with %p\n", param, |
108 | params[i].ops->set); | 110 | params[i].ops->set); |
109 | mutex_lock(¶m_lock); | 111 | mutex_lock(¶m_lock); |
110 | err = params[i].ops->set(val, ¶ms[i]); | 112 | err = params[i].ops->set(val, ¶ms[i]); |
111 | mutex_unlock(¶m_lock); | 113 | mutex_unlock(¶m_lock); |
@@ -114,11 +116,11 @@ static int parse_one(char *param, | |||
114 | } | 116 | } |
115 | 117 | ||
116 | if (handle_unknown) { | 118 | if (handle_unknown) { |
117 | pr_debug("Unknown argument: calling %p\n", handle_unknown); | 119 | pr_debug("doing %s: %s='%s'\n", doing, param, val); |
118 | return handle_unknown(param, val); | 120 | return handle_unknown(param, val, doing); |
119 | } | 121 | } |
120 | 122 | ||
121 | pr_debug("Unknown argument `%s'\n", param); | 123 | pr_debug("Unknown argument '%s'\n", param); |
122 | return -ENOENT; | 124 | return -ENOENT; |
123 | } | 125 | } |
124 | 126 | ||
@@ -175,49 +177,47 @@ static char *next_arg(char *args, char **param, char **val) | |||
175 | } | 177 | } |
176 | 178 | ||
177 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ | 179 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ |
178 | int parse_args(const char *name, | 180 | int parse_args(const char *doing, |
179 | char *args, | 181 | char *args, |
180 | const struct kernel_param *params, | 182 | const struct kernel_param *params, |
181 | unsigned num, | 183 | unsigned num, |
182 | s16 min_level, | 184 | s16 min_level, |
183 | s16 max_level, | 185 | s16 max_level, |
184 | int (*unknown)(char *param, char *val)) | 186 | int (*unknown)(char *param, char *val, const char *doing)) |
185 | { | 187 | { |
186 | char *param, *val; | 188 | char *param, *val; |
187 | 189 | ||
188 | pr_debug("Parsing ARGS: %s\n", args); | ||
189 | |||
190 | /* Chew leading spaces */ | 190 | /* Chew leading spaces */ |
191 | args = skip_spaces(args); | 191 | args = skip_spaces(args); |
192 | 192 | ||
193 | if (*args) | ||
194 | pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args); | ||
195 | |||
193 | while (*args) { | 196 | while (*args) { |
194 | int ret; | 197 | int ret; |
195 | int irq_was_disabled; | 198 | int irq_was_disabled; |
196 | 199 | ||
197 | args = next_arg(args, ¶m, &val); | 200 | args = next_arg(args, ¶m, &val); |
198 | irq_was_disabled = irqs_disabled(); | 201 | irq_was_disabled = irqs_disabled(); |
199 | ret = parse_one(param, val, params, num, | 202 | ret = parse_one(param, val, doing, params, num, |
200 | min_level, max_level, unknown); | 203 | min_level, max_level, unknown); |
201 | if (irq_was_disabled && !irqs_disabled()) { | 204 | if (irq_was_disabled && !irqs_disabled()) |
202 | printk(KERN_WARNING "parse_args(): option '%s' enabled " | 205 | pr_warn("%s: option '%s' enabled irq's!\n", |
203 | "irq's!\n", param); | 206 | doing, param); |
204 | } | 207 | |
205 | switch (ret) { | 208 | switch (ret) { |
206 | case -ENOENT: | 209 | case -ENOENT: |
207 | printk(KERN_ERR "%s: Unknown parameter `%s'\n", | 210 | pr_err("%s: Unknown parameter `%s'\n", doing, param); |
208 | name, param); | ||
209 | return ret; | 211 | return ret; |
210 | case -ENOSPC: | 212 | case -ENOSPC: |
211 | printk(KERN_ERR | 213 | pr_err("%s: `%s' too large for parameter `%s'\n", |
212 | "%s: `%s' too large for parameter `%s'\n", | 214 | doing, val ?: "", param); |
213 | name, val ?: "", param); | ||
214 | return ret; | 215 | return ret; |
215 | case 0: | 216 | case 0: |
216 | break; | 217 | break; |
217 | default: | 218 | default: |
218 | printk(KERN_ERR | 219 | pr_err("%s: `%s' invalid for parameter `%s'\n", |
219 | "%s: `%s' invalid for parameter `%s'\n", | 220 | doing, val ?: "", param); |
220 | name, val ?: "", param); | ||
221 | return ret; | 221 | return ret; |
222 | } | 222 | } |
223 | } | 223 | } |
@@ -263,8 +263,7 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); | |||
263 | int param_set_charp(const char *val, const struct kernel_param *kp) | 263 | int param_set_charp(const char *val, const struct kernel_param *kp) |
264 | { | 264 | { |
265 | if (strlen(val) > 1024) { | 265 | if (strlen(val) > 1024) { |
266 | printk(KERN_ERR "%s: string parameter too long\n", | 266 | pr_err("%s: string parameter too long\n", kp->name); |
267 | kp->name); | ||
268 | return -ENOSPC; | 267 | return -ENOSPC; |
269 | } | 268 | } |
270 | 269 | ||
@@ -400,8 +399,7 @@ static int param_array(const char *name, | |||
400 | int len; | 399 | int len; |
401 | 400 | ||
402 | if (*num == max) { | 401 | if (*num == max) { |
403 | printk(KERN_ERR "%s: can only take %i arguments\n", | 402 | pr_err("%s: can only take %i arguments\n", name, max); |
404 | name, max); | ||
405 | return -EINVAL; | 403 | return -EINVAL; |
406 | } | 404 | } |
407 | len = strcspn(val, ","); | 405 | len = strcspn(val, ","); |
@@ -420,8 +418,7 @@ static int param_array(const char *name, | |||
420 | } while (save == ','); | 418 | } while (save == ','); |
421 | 419 | ||
422 | if (*num < min) { | 420 | if (*num < min) { |
423 | printk(KERN_ERR "%s: needs at least %i arguments\n", | 421 | pr_err("%s: needs at least %i arguments\n", name, min); |
424 | name, min); | ||
425 | return -EINVAL; | 422 | return -EINVAL; |
426 | } | 423 | } |
427 | return 0; | 424 | return 0; |
@@ -480,7 +477,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp) | |||
480 | const struct kparam_string *kps = kp->str; | 477 | const struct kparam_string *kps = kp->str; |
481 | 478 | ||
482 | if (strlen(val)+1 > kps->maxlen) { | 479 | if (strlen(val)+1 > kps->maxlen) { |
483 | printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", | 480 | pr_err("%s: string doesn't fit in %u chars.\n", |
484 | kp->name, kps->maxlen-1); | 481 | kp->name, kps->maxlen-1); |
485 | return -ENOSPC; | 482 | return -ENOSPC; |
486 | } | 483 | } |
@@ -750,11 +747,8 @@ static struct module_kobject * __init locate_module_kobject(const char *name) | |||
750 | #endif | 747 | #endif |
751 | if (err) { | 748 | if (err) { |
752 | kobject_put(&mk->kobj); | 749 | kobject_put(&mk->kobj); |
753 | printk(KERN_ERR | 750 | pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", |
754 | "Module '%s' failed add to sysfs, error number %d\n", | ||
755 | name, err); | 751 | name, err); |
756 | printk(KERN_ERR | ||
757 | "The system will be unstable now.\n"); | ||
758 | return NULL; | 752 | return NULL; |
759 | } | 753 | } |
760 | 754 | ||
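[Editor's sketch, not part of the series] Callbacks passed as the 'unknown' argument of parse_args() now receive the new 'doing' string as well. A minimal handler against the updated signature; the function name and message text are illustrative:

	#include <linux/kernel.h>
	#include <linux/moduleparam.h>

	/* Called by parse_one() for parameters matching no registered kernel_param. */
	static int example_unknown_param_cb(char *param, char *val, const char *doing)
	{
		/* 'doing' identifies what is being parsed, e.g. a module name. */
		pr_info("%s: ignoring unknown option '%s=%s'\n",
			doing, param, val ? val : "");
		return 0;	/* 0 means handled; an error here aborts parse_args() */
	}

Such a handler would be passed as the last argument of parse_args(), in place of the NULL most callers use.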
diff --git a/kernel/pid.c b/kernel/pid.c index 9f08dfabaf13..e86b291ad834 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -547,7 +547,8 @@ void __init pidhash_init(void) | |||
547 | 547 | ||
548 | pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, | 548 | pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, |
549 | HASH_EARLY | HASH_SMALL, | 549 | HASH_EARLY | HASH_SMALL, |
550 | &pidhash_shift, NULL, 4096); | 550 | &pidhash_shift, NULL, |
551 | 0, 4096); | ||
551 | pidhash_size = 1U << pidhash_shift; | 552 | pidhash_size = 1U << pidhash_shift; |
552 | 553 | ||
553 | for (i = 0; i < pidhash_size; i++) | 554 | for (i = 0; i < pidhash_size; i++) |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 57bc1fd35b3c..b3c7fd554250 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -149,7 +149,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
149 | { | 149 | { |
150 | int nr; | 150 | int nr; |
151 | int rc; | 151 | int rc; |
152 | struct task_struct *task; | 152 | struct task_struct *task, *me = current; |
153 | |||
154 | /* Ignore SIGCHLD, causing any terminated children to autoreap */ | ||
155 | spin_lock_irq(&me->sighand->siglock); | ||
156 | me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; | ||
157 | spin_unlock_irq(&me->sighand->siglock); | ||
153 | 158 | ||
154 | /* | 159 | /* |
155 | * The last thread in the cgroup-init thread group is terminating. | 160 | * The last thread in the cgroup-init thread group is terminating. |
@@ -179,11 +184,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
179 | } | 184 | } |
180 | read_unlock(&tasklist_lock); | 185 | read_unlock(&tasklist_lock); |
181 | 186 | ||
187 | /* First, reap the EXIT_ZOMBIE children we may have. */ | ||
182 | do { | 188 | do { |
183 | clear_thread_flag(TIF_SIGPENDING); | 189 | clear_thread_flag(TIF_SIGPENDING); |
184 | rc = sys_wait4(-1, NULL, __WALL, NULL); | 190 | rc = sys_wait4(-1, NULL, __WALL, NULL); |
185 | } while (rc != -ECHILD); | 191 | } while (rc != -ECHILD); |
186 | 192 | ||
193 | /* | ||
194 | * sys_wait4() above can't reap the TASK_DEAD children. | ||
195 | * Make sure they all go away, see __unhash_process(). | ||
196 | */ | ||
197 | for (;;) { | ||
198 | bool need_wait = false; | ||
199 | |||
200 | read_lock(&tasklist_lock); | ||
201 | if (!list_empty(¤t->children)) { | ||
202 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
203 | need_wait = true; | ||
204 | } | ||
205 | read_unlock(&tasklist_lock); | ||
206 | |||
207 | if (!need_wait) | ||
208 | break; | ||
209 | schedule(); | ||
210 | } | ||
211 | |||
187 | if (pid_ns->reboot) | 212 | if (pid_ns->reboot) |
188 | current->signal->group_exit_code = pid_ns->reboot; | 213 | current->signal->group_exit_code = pid_ns->reboot; |
189 | 214 | ||
@@ -191,6 +216,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
191 | return; | 216 | return; |
192 | } | 217 | } |
193 | 218 | ||
219 | #ifdef CONFIG_CHECKPOINT_RESTORE | ||
194 | static int pid_ns_ctl_handler(struct ctl_table *table, int write, | 220 | static int pid_ns_ctl_handler(struct ctl_table *table, int write, |
195 | void __user *buffer, size_t *lenp, loff_t *ppos) | 221 | void __user *buffer, size_t *lenp, loff_t *ppos) |
196 | { | 222 | { |
@@ -218,8 +244,8 @@ static struct ctl_table pid_ns_ctl_table[] = { | |||
218 | }, | 244 | }, |
219 | { } | 245 | { } |
220 | }; | 246 | }; |
221 | |||
222 | static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; | 247 | static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; |
248 | #endif /* CONFIG_CHECKPOINT_RESTORE */ | ||
223 | 249 | ||
224 | int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) | 250 | int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) |
225 | { | 251 | { |
@@ -253,7 +279,10 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) | |||
253 | static __init int pid_namespaces_init(void) | 279 | static __init int pid_namespaces_init(void) |
254 | { | 280 | { |
255 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); | 281 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); |
282 | |||
283 | #ifdef CONFIG_CHECKPOINT_RESTORE | ||
256 | register_sysctl_paths(kern_path, pid_ns_ctl_table); | 284 | register_sysctl_paths(kern_path, pid_ns_ctl_table); |
285 | #endif | ||
257 | return 0; | 286 | return 0; |
258 | } | 287 | } |
259 | 288 | ||
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index deb5461e3216..8f9b4eb974e0 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -103,6 +103,33 @@ config PM_SLEEP_SMP | |||
103 | select HOTPLUG | 103 | select HOTPLUG |
104 | select HOTPLUG_CPU | 104 | select HOTPLUG_CPU |
105 | 105 | ||
106 | config PM_AUTOSLEEP | ||
107 | bool "Opportunistic sleep" | ||
108 | depends on PM_SLEEP | ||
109 | default n | ||
110 | ---help--- | ||
111 | Allow the kernel to trigger a system transition into a global sleep | ||
112 | state automatically whenever there are no active wakeup sources. | ||
113 | |||
114 | config PM_WAKELOCKS | ||
115 | bool "User space wakeup sources interface" | ||
116 | depends on PM_SLEEP | ||
117 | default n | ||
118 | ---help--- | ||
119 | Allow user space to create, activate and deactivate wakeup source | ||
120 | objects with the help of a sysfs-based interface. | ||
121 | |||
122 | config PM_WAKELOCKS_LIMIT | ||
123 | int "Maximum number of user space wakeup sources (0 = no limit)" | ||
124 | range 0 100000 | ||
125 | default 100 | ||
126 | depends on PM_WAKELOCKS | ||
127 | |||
128 | config PM_WAKELOCKS_GC | ||
129 | bool "Garbage collector for user space wakeup sources" | ||
130 | depends on PM_WAKELOCKS | ||
131 | default y | ||
132 | |||
106 | config PM_RUNTIME | 133 | config PM_RUNTIME |
107 | bool "Run-time PM core functionality" | 134 | bool "Run-time PM core functionality" |
108 | depends on !IA64_HP_SIM | 135 | depends on !IA64_HP_SIM |
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 66d808ec5252..29472bff11ef 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -9,5 +9,7 @@ obj-$(CONFIG_SUSPEND) += suspend.o | |||
9 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o | 9 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o |
10 | obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ | 10 | obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ |
11 | block_io.o | 11 | block_io.o |
12 | obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o | ||
13 | obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o | ||
12 | 14 | ||
13 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o | 15 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o |
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c new file mode 100644 index 000000000000..ca304046d9e2 --- /dev/null +++ b/kernel/power/autosleep.c | |||
@@ -0,0 +1,127 @@ | |||
1 | /* | ||
2 | * kernel/power/autosleep.c | ||
3 | * | ||
4 | * Opportunistic sleep support. | ||
5 | * | ||
6 | * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> | ||
7 | */ | ||
8 | |||
9 | #include <linux/device.h> | ||
10 | #include <linux/mutex.h> | ||
11 | #include <linux/pm_wakeup.h> | ||
12 | |||
13 | #include "power.h" | ||
14 | |||
15 | static suspend_state_t autosleep_state; | ||
16 | static struct workqueue_struct *autosleep_wq; | ||
17 | /* | ||
18 | * Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source | ||
19 | * is active, otherwise a deadlock with try_to_suspend() is possible. | ||
20 | * Alternatively, mutex_lock_interruptible() can be used; it will then fail | ||
21 | * if an autosleep cycle tries to freeze processes. | ||
22 | */ | ||
23 | static DEFINE_MUTEX(autosleep_lock); | ||
24 | static struct wakeup_source *autosleep_ws; | ||
25 | |||
26 | static void try_to_suspend(struct work_struct *work) | ||
27 | { | ||
28 | unsigned int initial_count, final_count; | ||
29 | |||
30 | if (!pm_get_wakeup_count(&initial_count, true)) | ||
31 | goto out; | ||
32 | |||
33 | mutex_lock(&autosleep_lock); | ||
34 | |||
35 | if (!pm_save_wakeup_count(initial_count)) { | ||
36 | mutex_unlock(&autosleep_lock); | ||
37 | goto out; | ||
38 | } | ||
39 | |||
40 | if (autosleep_state == PM_SUSPEND_ON) { | ||
41 | mutex_unlock(&autosleep_lock); | ||
42 | return; | ||
43 | } | ||
44 | if (autosleep_state >= PM_SUSPEND_MAX) | ||
45 | hibernate(); | ||
46 | else | ||
47 | pm_suspend(autosleep_state); | ||
48 | |||
49 | mutex_unlock(&autosleep_lock); | ||
50 | |||
51 | if (!pm_get_wakeup_count(&final_count, false)) | ||
52 | goto out; | ||
53 | |||
54 | /* | ||
55 | * If the wakeup occurred for an unknown reason, wait to prevent the | ||
56 | * system from trying to suspend and waking up in a tight loop. | ||
57 | */ | ||
58 | if (final_count == initial_count) | ||
59 | schedule_timeout_uninterruptible(HZ / 2); | ||
60 | |||
61 | out: | ||
62 | queue_up_suspend_work(); | ||
63 | } | ||
64 | |||
65 | static DECLARE_WORK(suspend_work, try_to_suspend); | ||
66 | |||
67 | void queue_up_suspend_work(void) | ||
68 | { | ||
69 | if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) | ||
70 | queue_work(autosleep_wq, &suspend_work); | ||
71 | } | ||
72 | |||
73 | suspend_state_t pm_autosleep_state(void) | ||
74 | { | ||
75 | return autosleep_state; | ||
76 | } | ||
77 | |||
78 | int pm_autosleep_lock(void) | ||
79 | { | ||
80 | return mutex_lock_interruptible(&autosleep_lock); | ||
81 | } | ||
82 | |||
83 | void pm_autosleep_unlock(void) | ||
84 | { | ||
85 | mutex_unlock(&autosleep_lock); | ||
86 | } | ||
87 | |||
88 | int pm_autosleep_set_state(suspend_state_t state) | ||
89 | { | ||
90 | |||
91 | #ifndef CONFIG_HIBERNATION | ||
92 | if (state >= PM_SUSPEND_MAX) | ||
93 | return -EINVAL; | ||
94 | #endif | ||
95 | |||
96 | __pm_stay_awake(autosleep_ws); | ||
97 | |||
98 | mutex_lock(&autosleep_lock); | ||
99 | |||
100 | autosleep_state = state; | ||
101 | |||
102 | __pm_relax(autosleep_ws); | ||
103 | |||
104 | if (state > PM_SUSPEND_ON) { | ||
105 | pm_wakep_autosleep_enabled(true); | ||
106 | queue_up_suspend_work(); | ||
107 | } else { | ||
108 | pm_wakep_autosleep_enabled(false); | ||
109 | } | ||
110 | |||
111 | mutex_unlock(&autosleep_lock); | ||
112 | return 0; | ||
113 | } | ||
114 | |||
115 | int __init pm_autosleep_init(void) | ||
116 | { | ||
117 | autosleep_ws = wakeup_source_register("autosleep"); | ||
118 | if (!autosleep_ws) | ||
119 | return -ENOMEM; | ||
120 | |||
121 | autosleep_wq = alloc_ordered_workqueue("autosleep", 0); | ||
122 | if (autosleep_wq) | ||
123 | return 0; | ||
124 | |||
125 | wakeup_source_unregister(autosleep_ws); | ||
126 | return -ENOMEM; | ||
127 | } | ||
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e09dfbfeecee..238025f5472e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -25,7 +25,8 @@ | |||
25 | #include <linux/freezer.h> | 25 | #include <linux/freezer.h> |
26 | #include <linux/gfp.h> | 26 | #include <linux/gfp.h> |
27 | #include <linux/syscore_ops.h> | 27 | #include <linux/syscore_ops.h> |
28 | #include <scsi/scsi_scan.h> | 28 | #include <linux/ctype.h> |
29 | #include <linux/genhd.h> | ||
29 | 30 | ||
30 | #include "power.h" | 31 | #include "power.h" |
31 | 32 | ||
@@ -722,6 +723,17 @@ static int software_resume(void) | |||
722 | 723 | ||
723 | /* Check if the device is there */ | 724 | /* Check if the device is there */ |
724 | swsusp_resume_device = name_to_dev_t(resume_file); | 725 | swsusp_resume_device = name_to_dev_t(resume_file); |
726 | |||
727 | /* | ||
728 | * name_to_dev_t() cannot verify the partition if resume_file is in | ||
729 | * integer format (e.g. major:minor). | ||
730 | */ | ||
731 | if (isdigit(resume_file[0]) && resume_wait) { | ||
732 | int partno; | ||
733 | while (!get_gendisk(swsusp_resume_device, &partno)) | ||
734 | msleep(10); | ||
735 | } | ||
736 | |||
725 | if (!swsusp_resume_device) { | 737 | if (!swsusp_resume_device) { |
726 | /* | 738 | /* |
727 | * Some device discovery might still be in progress; we need | 739 | * Some device discovery might still be in progress; we need |
@@ -735,13 +747,6 @@ static int software_resume(void) | |||
735 | async_synchronize_full(); | 747 | async_synchronize_full(); |
736 | } | 748 | } |
737 | 749 | ||
738 | /* | ||
739 | * We can't depend on SCSI devices being available after loading | ||
740 | * one of their modules until scsi_complete_async_scans() is | ||
741 | * called and the resume device usually is a SCSI one. | ||
742 | */ | ||
743 | scsi_complete_async_scans(); | ||
744 | |||
745 | swsusp_resume_device = name_to_dev_t(resume_file); | 750 | swsusp_resume_device = name_to_dev_t(resume_file); |
746 | if (!swsusp_resume_device) { | 751 | if (!swsusp_resume_device) { |
747 | error = -ENODEV; | 752 | error = -ENODEV; |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 1c12581f1c62..428f8a034e96 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -269,8 +269,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, | |||
269 | return (s - buf); | 269 | return (s - buf); |
270 | } | 270 | } |
271 | 271 | ||
272 | static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | 272 | static suspend_state_t decode_state(const char *buf, size_t n) |
273 | const char *buf, size_t n) | ||
274 | { | 273 | { |
275 | #ifdef CONFIG_SUSPEND | 274 | #ifdef CONFIG_SUSPEND |
276 | suspend_state_t state = PM_SUSPEND_STANDBY; | 275 | suspend_state_t state = PM_SUSPEND_STANDBY; |
@@ -278,27 +277,48 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
278 | #endif | 277 | #endif |
279 | char *p; | 278 | char *p; |
280 | int len; | 279 | int len; |
281 | int error = -EINVAL; | ||
282 | 280 | ||
283 | p = memchr(buf, '\n', n); | 281 | p = memchr(buf, '\n', n); |
284 | len = p ? p - buf : n; | 282 | len = p ? p - buf : n; |
285 | 283 | ||
286 | /* First, check if we are requested to hibernate */ | 284 | /* Check hibernation first. */ |
287 | if (len == 4 && !strncmp(buf, "disk", len)) { | 285 | if (len == 4 && !strncmp(buf, "disk", len)) |
288 | error = hibernate(); | 286 | return PM_SUSPEND_MAX; |
289 | goto Exit; | ||
290 | } | ||
291 | 287 | ||
292 | #ifdef CONFIG_SUSPEND | 288 | #ifdef CONFIG_SUSPEND |
293 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { | 289 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) |
294 | if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { | 290 | if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) |
295 | error = pm_suspend(state); | 291 | return state; |
296 | break; | ||
297 | } | ||
298 | } | ||
299 | #endif | 292 | #endif |
300 | 293 | ||
301 | Exit: | 294 | return PM_SUSPEND_ON; |
295 | } | ||
296 | |||
297 | static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | ||
298 | const char *buf, size_t n) | ||
299 | { | ||
300 | suspend_state_t state; | ||
301 | int error; | ||
302 | |||
303 | error = pm_autosleep_lock(); | ||
304 | if (error) | ||
305 | return error; | ||
306 | |||
307 | if (pm_autosleep_state() > PM_SUSPEND_ON) { | ||
308 | error = -EBUSY; | ||
309 | goto out; | ||
310 | } | ||
311 | |||
312 | state = decode_state(buf, n); | ||
313 | if (state < PM_SUSPEND_MAX) | ||
314 | error = pm_suspend(state); | ||
315 | else if (state == PM_SUSPEND_MAX) | ||
316 | error = hibernate(); | ||
317 | else | ||
318 | error = -EINVAL; | ||
319 | |||
320 | out: | ||
321 | pm_autosleep_unlock(); | ||
302 | return error ? error : n; | 322 | return error ? error : n; |
303 | } | 323 | } |
304 | 324 | ||
@@ -339,7 +359,8 @@ static ssize_t wakeup_count_show(struct kobject *kobj, | |||
339 | { | 359 | { |
340 | unsigned int val; | 360 | unsigned int val; |
341 | 361 | ||
342 | return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR; | 362 | return pm_get_wakeup_count(&val, true) ? |
363 | sprintf(buf, "%u\n", val) : -EINTR; | ||
343 | } | 364 | } |
344 | 365 | ||
345 | static ssize_t wakeup_count_store(struct kobject *kobj, | 366 | static ssize_t wakeup_count_store(struct kobject *kobj, |
@@ -347,15 +368,106 @@ static ssize_t wakeup_count_store(struct kobject *kobj, | |||
347 | const char *buf, size_t n) | 368 | const char *buf, size_t n) |
348 | { | 369 | { |
349 | unsigned int val; | 370 | unsigned int val; |
371 | int error; | ||
372 | |||
373 | error = pm_autosleep_lock(); | ||
374 | if (error) | ||
375 | return error; | ||
376 | |||
377 | if (pm_autosleep_state() > PM_SUSPEND_ON) { | ||
378 | error = -EBUSY; | ||
379 | goto out; | ||
380 | } | ||
350 | 381 | ||
382 | error = -EINVAL; | ||
351 | if (sscanf(buf, "%u", &val) == 1) { | 383 | if (sscanf(buf, "%u", &val) == 1) { |
352 | if (pm_save_wakeup_count(val)) | 384 | if (pm_save_wakeup_count(val)) |
353 | return n; | 385 | error = n; |
354 | } | 386 | } |
355 | return -EINVAL; | 387 | |
388 | out: | ||
389 | pm_autosleep_unlock(); | ||
390 | return error; | ||
356 | } | 391 | } |
357 | 392 | ||
358 | power_attr(wakeup_count); | 393 | power_attr(wakeup_count); |
394 | |||
395 | #ifdef CONFIG_PM_AUTOSLEEP | ||
396 | static ssize_t autosleep_show(struct kobject *kobj, | ||
397 | struct kobj_attribute *attr, | ||
398 | char *buf) | ||
399 | { | ||
400 | suspend_state_t state = pm_autosleep_state(); | ||
401 | |||
402 | if (state == PM_SUSPEND_ON) | ||
403 | return sprintf(buf, "off\n"); | ||
404 | |||
405 | #ifdef CONFIG_SUSPEND | ||
406 | if (state < PM_SUSPEND_MAX) | ||
407 | return sprintf(buf, "%s\n", valid_state(state) ? | ||
408 | pm_states[state] : "error"); | ||
409 | #endif | ||
410 | #ifdef CONFIG_HIBERNATION | ||
411 | return sprintf(buf, "disk\n"); | ||
412 | #else | ||
413 | return sprintf(buf, "error"); | ||
414 | #endif | ||
415 | } | ||
416 | |||
417 | static ssize_t autosleep_store(struct kobject *kobj, | ||
418 | struct kobj_attribute *attr, | ||
419 | const char *buf, size_t n) | ||
420 | { | ||
421 | suspend_state_t state = decode_state(buf, n); | ||
422 | int error; | ||
423 | |||
424 | if (state == PM_SUSPEND_ON | ||
425 | && strcmp(buf, "off") && strcmp(buf, "off\n")) | ||
426 | return -EINVAL; | ||
427 | |||
428 | error = pm_autosleep_set_state(state); | ||
429 | return error ? error : n; | ||
430 | } | ||
431 | |||
432 | power_attr(autosleep); | ||
433 | #endif /* CONFIG_PM_AUTOSLEEP */ | ||
434 | |||
435 | #ifdef CONFIG_PM_WAKELOCKS | ||
436 | static ssize_t wake_lock_show(struct kobject *kobj, | ||
437 | struct kobj_attribute *attr, | ||
438 | char *buf) | ||
439 | { | ||
440 | return pm_show_wakelocks(buf, true); | ||
441 | } | ||
442 | |||
443 | static ssize_t wake_lock_store(struct kobject *kobj, | ||
444 | struct kobj_attribute *attr, | ||
445 | const char *buf, size_t n) | ||
446 | { | ||
447 | int error = pm_wake_lock(buf); | ||
448 | return error ? error : n; | ||
449 | } | ||
450 | |||
451 | power_attr(wake_lock); | ||
452 | |||
453 | static ssize_t wake_unlock_show(struct kobject *kobj, | ||
454 | struct kobj_attribute *attr, | ||
455 | char *buf) | ||
456 | { | ||
457 | return pm_show_wakelocks(buf, false); | ||
458 | } | ||
459 | |||
460 | static ssize_t wake_unlock_store(struct kobject *kobj, | ||
461 | struct kobj_attribute *attr, | ||
462 | const char *buf, size_t n) | ||
463 | { | ||
464 | int error = pm_wake_unlock(buf); | ||
465 | return error ? error : n; | ||
466 | } | ||
467 | |||
468 | power_attr(wake_unlock); | ||
469 | |||
470 | #endif /* CONFIG_PM_WAKELOCKS */ | ||
359 | #endif /* CONFIG_PM_SLEEP */ | 471 | #endif /* CONFIG_PM_SLEEP */ |
360 | 472 | ||
361 | #ifdef CONFIG_PM_TRACE | 473 | #ifdef CONFIG_PM_TRACE |
@@ -409,6 +521,13 @@ static struct attribute * g[] = { | |||
409 | #ifdef CONFIG_PM_SLEEP | 521 | #ifdef CONFIG_PM_SLEEP |
410 | &pm_async_attr.attr, | 522 | &pm_async_attr.attr, |
411 | &wakeup_count_attr.attr, | 523 | &wakeup_count_attr.attr, |
524 | #ifdef CONFIG_PM_AUTOSLEEP | ||
525 | &autosleep_attr.attr, | ||
526 | #endif | ||
527 | #ifdef CONFIG_PM_WAKELOCKS | ||
528 | &wake_lock_attr.attr, | ||
529 | &wake_unlock_attr.attr, | ||
530 | #endif | ||
412 | #ifdef CONFIG_PM_DEBUG | 531 | #ifdef CONFIG_PM_DEBUG |
413 | &pm_test_attr.attr, | 532 | &pm_test_attr.attr, |
414 | #endif | 533 | #endif |
@@ -444,7 +563,10 @@ static int __init pm_init(void) | |||
444 | power_kobj = kobject_create_and_add("power", NULL); | 563 | power_kobj = kobject_create_and_add("power", NULL); |
445 | if (!power_kobj) | 564 | if (!power_kobj) |
446 | return -ENOMEM; | 565 | return -ENOMEM; |
447 | return sysfs_create_group(power_kobj, &attr_group); | 566 | error = sysfs_create_group(power_kobj, &attr_group); |
567 | if (error) | ||
568 | return error; | ||
569 | return pm_autosleep_init(); | ||
448 | } | 570 | } |
449 | 571 | ||
450 | core_initcall(pm_init); | 572 | core_initcall(pm_init); |
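[Editor's sketch, not part of the series] The reworked wakeup_count and state attributes support a read-then-write handshake that a user-space power manager performs before suspending, so that wakeup events arriving in between abort the attempt. A hedged user-space sketch of that flow, using the standard /sys/power paths and minimal error handling:

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	/* Returns 0 if suspend was requested, -1 if the attempt was aborted. */
	static int try_suspend_once(void)
	{
		char buf[32];
		ssize_t n;
		int fd, ret = -1;

		/* 1. Read the wakeup count; this blocks while events are in flight. */
		fd = open("/sys/power/wakeup_count", O_RDWR);
		if (fd < 0)
			return -1;
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			/* 2. Write it back; the kernel rejects it if new events arrived. */
			if (write(fd, buf, strlen(buf)) >= 0)
				ret = 0;
		}
		close(fd);
		if (ret)
			return -1;

		/* 3. Request suspend; wakeup events since step 2 make the suspend abort. */
		fd = open("/sys/power/state", O_WRONLY);
		if (fd < 0)
			return -1;
		ret = write(fd, "mem", 3) < 0 ? -1 : 0;
		close(fd);
		return ret;
	}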
diff --git a/kernel/power/power.h b/kernel/power/power.h index 98f3622d7407..b0bd4beaebfe 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -264,3 +264,30 @@ static inline void suspend_thaw_processes(void) | |||
264 | { | 264 | { |
265 | } | 265 | } |
266 | #endif | 266 | #endif |
267 | |||
268 | #ifdef CONFIG_PM_AUTOSLEEP | ||
269 | |||
270 | /* kernel/power/autosleep.c */ | ||
271 | extern int pm_autosleep_init(void); | ||
272 | extern int pm_autosleep_lock(void); | ||
273 | extern void pm_autosleep_unlock(void); | ||
274 | extern suspend_state_t pm_autosleep_state(void); | ||
275 | extern int pm_autosleep_set_state(suspend_state_t state); | ||
276 | |||
277 | #else /* !CONFIG_PM_AUTOSLEEP */ | ||
278 | |||
279 | static inline int pm_autosleep_init(void) { return 0; } | ||
280 | static inline int pm_autosleep_lock(void) { return 0; } | ||
281 | static inline void pm_autosleep_unlock(void) {} | ||
282 | static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; } | ||
283 | |||
284 | #endif /* !CONFIG_PM_AUTOSLEEP */ | ||
285 | |||
286 | #ifdef CONFIG_PM_WAKELOCKS | ||
287 | |||
288 | /* kernel/power/wakelock.c */ | ||
289 | extern ssize_t pm_show_wakelocks(char *buf, bool show_active); | ||
290 | extern int pm_wake_lock(const char *buf); | ||
291 | extern int pm_wake_unlock(const char *buf); | ||
292 | |||
293 | #endif /* !CONFIG_PM_WAKELOCKS */ | ||
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index eef311a58a64..11e22c068e8b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -6,7 +6,7 @@ | |||
6 | * | 6 | * |
7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> | 7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> |
8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> | 8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> |
9 | * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com> | 9 | * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> |
10 | * | 10 | * |
11 | * This file is released under the GPLv2. | 11 | * This file is released under the GPLv2. |
12 | * | 12 | * |
@@ -282,14 +282,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) | |||
282 | return -ENOSPC; | 282 | return -ENOSPC; |
283 | 283 | ||
284 | if (bio_chain) { | 284 | if (bio_chain) { |
285 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 285 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | |
286 | __GFP_NORETRY); | ||
286 | if (src) { | 287 | if (src) { |
287 | copy_page(src, buf); | 288 | copy_page(src, buf); |
288 | } else { | 289 | } else { |
289 | ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ | 290 | ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ |
290 | if (ret) | 291 | if (ret) |
291 | return ret; | 292 | return ret; |
292 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 293 | src = (void *)__get_free_page(__GFP_WAIT | |
294 | __GFP_NOWARN | | ||
295 | __GFP_NORETRY); | ||
293 | if (src) { | 296 | if (src) { |
294 | copy_page(src, buf); | 297 | copy_page(src, buf); |
295 | } else { | 298 | } else { |
@@ -367,12 +370,17 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, | |||
367 | clear_page(handle->cur); | 370 | clear_page(handle->cur); |
368 | handle->cur_swap = offset; | 371 | handle->cur_swap = offset; |
369 | handle->k = 0; | 372 | handle->k = 0; |
370 | } | 373 | |
371 | if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { | 374 | if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { |
372 | error = hib_wait_on_bio_chain(bio_chain); | 375 | error = hib_wait_on_bio_chain(bio_chain); |
373 | if (error) | 376 | if (error) |
374 | goto out; | 377 | goto out; |
375 | handle->reqd_free_pages = reqd_free_pages(); | 378 | /* |
379 | * Recalculate the number of required free pages, to | ||
380 | * make sure we never take more than half. | ||
381 | */ | ||
382 | handle->reqd_free_pages = reqd_free_pages(); | ||
383 | } | ||
376 | } | 384 | } |
377 | out: | 385 | out: |
378 | return error; | 386 | return error; |
@@ -419,8 +427,9 @@ static int swap_writer_finish(struct swap_map_handle *handle, | |||
419 | /* Maximum number of threads for compression/decompression. */ | 427 | /* Maximum number of threads for compression/decompression. */ |
420 | #define LZO_THREADS 3 | 428 | #define LZO_THREADS 3 |
421 | 429 | ||
422 | /* Maximum number of pages for read buffering. */ | 430 | /* Minimum/maximum number of pages for read buffering. */ |
423 | #define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8) | 431 | #define LZO_MIN_RD_PAGES 1024 |
432 | #define LZO_MAX_RD_PAGES 8192 | ||
424 | 433 | ||
425 | 434 | ||
426 | /** | 435 | /** |
@@ -631,12 +640,6 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
631 | } | 640 | } |
632 | 641 | ||
633 | /* | 642 | /* |
634 | * Adjust number of free pages after all allocations have been done. | ||
635 | * We don't want to run out of pages when writing. | ||
636 | */ | ||
637 | handle->reqd_free_pages = reqd_free_pages(); | ||
638 | |||
639 | /* | ||
640 | * Start the CRC32 thread. | 643 | * Start the CRC32 thread. |
641 | */ | 644 | */ |
642 | init_waitqueue_head(&crc->go); | 645 | init_waitqueue_head(&crc->go); |
@@ -657,6 +660,12 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
657 | goto out_clean; | 660 | goto out_clean; |
658 | } | 661 | } |
659 | 662 | ||
663 | /* | ||
664 | * Adjust the number of required free pages after all allocations have | ||
665 | * been done. We don't want to run out of pages when writing. | ||
666 | */ | ||
667 | handle->reqd_free_pages = reqd_free_pages(); | ||
668 | |||
660 | printk(KERN_INFO | 669 | printk(KERN_INFO |
661 | "PM: Using %u thread(s) for compression.\n" | 670 | "PM: Using %u thread(s) for compression.\n" |
662 | "PM: Compressing and saving image data (%u pages) ... ", | 671 | "PM: Compressing and saving image data (%u pages) ... ", |
@@ -1067,7 +1076,7 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
1067 | unsigned i, thr, run_threads, nr_threads; | 1076 | unsigned i, thr, run_threads, nr_threads; |
1068 | unsigned ring = 0, pg = 0, ring_size = 0, | 1077 | unsigned ring = 0, pg = 0, ring_size = 0, |
1069 | have = 0, want, need, asked = 0; | 1078 | have = 0, want, need, asked = 0; |
1070 | unsigned long read_pages; | 1079 | unsigned long read_pages = 0; |
1071 | unsigned char **page = NULL; | 1080 | unsigned char **page = NULL; |
1072 | struct dec_data *data = NULL; | 1081 | struct dec_data *data = NULL; |
1073 | struct crc_data *crc = NULL; | 1082 | struct crc_data *crc = NULL; |
@@ -1079,7 +1088,7 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
1079 | nr_threads = num_online_cpus() - 1; | 1088 | nr_threads = num_online_cpus() - 1; |
1080 | nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); | 1089 | nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); |
1081 | 1090 | ||
1082 | page = vmalloc(sizeof(*page) * LZO_READ_PAGES); | 1091 | page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES); |
1083 | if (!page) { | 1092 | if (!page) { |
1084 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | 1093 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); |
1085 | ret = -ENOMEM; | 1094 | ret = -ENOMEM; |
@@ -1144,15 +1153,22 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
1144 | } | 1153 | } |
1145 | 1154 | ||
1146 | /* | 1155 | /* |
1147 | * Adjust number of pages for read buffering, in case we are short. | 1156 | * Set the number of pages for read buffering. |
1157 | * This is complete guesswork, because we'll only know the real | ||
1158 | * picture once prepare_image() is called, which is much later on | ||
1159 | * during the image load phase. We'll assume the worst case and | ||
1160 | * say that none of the image pages are from high memory. | ||
1148 | */ | 1161 | */ |
1149 | read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1; | 1162 | if (low_free_pages() > snapshot_get_image_size()) |
1150 | read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES); | 1163 | read_pages = (low_free_pages() - snapshot_get_image_size()) / 2; |
1164 | read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES); | ||
1151 | 1165 | ||
1152 | for (i = 0; i < read_pages; i++) { | 1166 | for (i = 0; i < read_pages; i++) { |
1153 | page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? | 1167 | page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? |
1154 | __GFP_WAIT | __GFP_HIGH : | 1168 | __GFP_WAIT | __GFP_HIGH : |
1155 | __GFP_WAIT); | 1169 | __GFP_WAIT | __GFP_NOWARN | |
1170 | __GFP_NORETRY); | ||
1171 | |||
1156 | if (!page[i]) { | 1172 | if (!page[i]) { |
1157 | if (i < LZO_CMP_PAGES) { | 1173 | if (i < LZO_CMP_PAGES) { |
1158 | ring_size = i; | 1174 | ring_size = i; |
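[Editor's note] As a worked example of the new read-buffer sizing (all numbers hypothetical): with 200000 free lowmem pages and a 150000-page image, (200000 - 150000) / 2 = 25000 is capped at LZO_MAX_RD_PAGES (8192); with lowmem at or below the image size, the value stays 0 and the clamp raises it to LZO_MIN_RD_PAGES (1024). The calculation in isolation, as a sketch:

	#include <linux/kernel.h>	/* clamp_val() */

	/* Illustrative only; LZO_MIN_RD_PAGES/LZO_MAX_RD_PAGES come from this patch. */
	static unsigned long lzo_read_buffer_pages(unsigned long low_free,
						   unsigned long image_pages)
	{
		unsigned long read_pages = 0;

		if (low_free > image_pages)
			read_pages = (low_free - image_pages) / 2;
		/* e.g. (200000, 150000) -> 8192, (120000, 150000) -> 1024 */
		return clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES);
	}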
diff --git a/kernel/power/user.c b/kernel/power/user.c index 91b0fd021a95..4ed81e74f86f 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -24,7 +24,6 @@ | |||
24 | #include <linux/console.h> | 24 | #include <linux/console.h> |
25 | #include <linux/cpu.h> | 25 | #include <linux/cpu.h> |
26 | #include <linux/freezer.h> | 26 | #include <linux/freezer.h> |
27 | #include <scsi/scsi_scan.h> | ||
28 | 27 | ||
29 | #include <asm/uaccess.h> | 28 | #include <asm/uaccess.h> |
30 | 29 | ||
@@ -84,7 +83,6 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
84 | * appear. | 83 | * appear. |
85 | */ | 84 | */ |
86 | wait_for_device_probe(); | 85 | wait_for_device_probe(); |
87 | scsi_complete_async_scans(); | ||
88 | 86 | ||
89 | data->swap = -1; | 87 | data->swap = -1; |
90 | data->mode = O_WRONLY; | 88 | data->mode = O_WRONLY; |
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c new file mode 100644 index 000000000000..c8fba3380076 --- /dev/null +++ b/kernel/power/wakelock.c | |||
@@ -0,0 +1,259 @@ | |||
1 | /* | ||
2 | * kernel/power/wakelock.c | ||
3 | * | ||
4 | * User space wakeup sources support. | ||
5 | * | ||
6 | * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> | ||
7 | * | ||
8 | * This code is based on the analogous interface allowing user space to | ||
9 | * manipulate wakelocks on Android. | ||
10 | */ | ||
11 | |||
12 | #include <linux/ctype.h> | ||
13 | #include <linux/device.h> | ||
14 | #include <linux/err.h> | ||
15 | #include <linux/hrtimer.h> | ||
16 | #include <linux/list.h> | ||
17 | #include <linux/rbtree.h> | ||
18 | #include <linux/slab.h> | ||
19 | |||
20 | static DEFINE_MUTEX(wakelocks_lock); | ||
21 | |||
22 | struct wakelock { | ||
23 | char *name; | ||
24 | struct rb_node node; | ||
25 | struct wakeup_source ws; | ||
26 | #ifdef CONFIG_PM_WAKELOCKS_GC | ||
27 | struct list_head lru; | ||
28 | #endif | ||
29 | }; | ||
30 | |||
31 | static struct rb_root wakelocks_tree = RB_ROOT; | ||
32 | |||
33 | ssize_t pm_show_wakelocks(char *buf, bool show_active) | ||
34 | { | ||
35 | struct rb_node *node; | ||
36 | struct wakelock *wl; | ||
37 | char *str = buf; | ||
38 | char *end = buf + PAGE_SIZE; | ||
39 | |||
40 | mutex_lock(&wakelocks_lock); | ||
41 | |||
42 | for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { | ||
43 | wl = rb_entry(node, struct wakelock, node); | ||
44 | if (wl->ws.active == show_active) | ||
45 | str += scnprintf(str, end - str, "%s ", wl->name); | ||
46 | } | ||
47 | if (str > buf) | ||
48 | str--; | ||
49 | |||
50 | str += scnprintf(str, end - str, "\n"); | ||
51 | |||
52 | mutex_unlock(&wakelocks_lock); | ||
53 | return (str - buf); | ||
54 | } | ||
55 | |||
56 | #if CONFIG_PM_WAKELOCKS_LIMIT > 0 | ||
57 | static unsigned int number_of_wakelocks; | ||
58 | |||
59 | static inline bool wakelocks_limit_exceeded(void) | ||
60 | { | ||
61 | return number_of_wakelocks > CONFIG_PM_WAKELOCKS_LIMIT; | ||
62 | } | ||
63 | |||
64 | static inline void increment_wakelocks_number(void) | ||
65 | { | ||
66 | number_of_wakelocks++; | ||
67 | } | ||
68 | |||
69 | static inline void decrement_wakelocks_number(void) | ||
70 | { | ||
71 | number_of_wakelocks--; | ||
72 | } | ||
73 | #else /* CONFIG_PM_WAKELOCKS_LIMIT = 0 */ | ||
74 | static inline bool wakelocks_limit_exceeded(void) { return false; } | ||
75 | static inline void increment_wakelocks_number(void) {} | ||
76 | static inline void decrement_wakelocks_number(void) {} | ||
77 | #endif /* CONFIG_PM_WAKELOCKS_LIMIT */ | ||
78 | |||
79 | #ifdef CONFIG_PM_WAKELOCKS_GC | ||
80 | #define WL_GC_COUNT_MAX 100 | ||
81 | #define WL_GC_TIME_SEC 300 | ||
82 | |||
83 | static LIST_HEAD(wakelocks_lru_list); | ||
84 | static unsigned int wakelocks_gc_count; | ||
85 | |||
86 | static inline void wakelocks_lru_add(struct wakelock *wl) | ||
87 | { | ||
88 | list_add(&wl->lru, &wakelocks_lru_list); | ||
89 | } | ||
90 | |||
91 | static inline void wakelocks_lru_most_recent(struct wakelock *wl) | ||
92 | { | ||
93 | list_move(&wl->lru, &wakelocks_lru_list); | ||
94 | } | ||
95 | |||
96 | static void wakelocks_gc(void) | ||
97 | { | ||
98 | struct wakelock *wl, *aux; | ||
99 | ktime_t now; | ||
100 | |||
101 | if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) | ||
102 | return; | ||
103 | |||
104 | now = ktime_get(); | ||
105 | list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) { | ||
106 | u64 idle_time_ns; | ||
107 | bool active; | ||
108 | |||
109 | spin_lock_irq(&wl->ws.lock); | ||
110 | idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time)); | ||
111 | active = wl->ws.active; | ||
112 | spin_unlock_irq(&wl->ws.lock); | ||
113 | |||
114 | if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC)) | ||
115 | break; | ||
116 | |||
117 | if (!active) { | ||
118 | wakeup_source_remove(&wl->ws); | ||
119 | rb_erase(&wl->node, &wakelocks_tree); | ||
120 | list_del(&wl->lru); | ||
121 | kfree(wl->name); | ||
122 | kfree(wl); | ||
123 | decrement_wakelocks_number(); | ||
124 | } | ||
125 | } | ||
126 | wakelocks_gc_count = 0; | ||
127 | } | ||
128 | #else /* !CONFIG_PM_WAKELOCKS_GC */ | ||
129 | static inline void wakelocks_lru_add(struct wakelock *wl) {} | ||
130 | static inline void wakelocks_lru_most_recent(struct wakelock *wl) {} | ||
131 | static inline void wakelocks_gc(void) {} | ||
132 | #endif /* !CONFIG_PM_WAKELOCKS_GC */ | ||
133 | |||
134 | static struct wakelock *wakelock_lookup_add(const char *name, size_t len, | ||
135 | bool add_if_not_found) | ||
136 | { | ||
137 | struct rb_node **node = &wakelocks_tree.rb_node; | ||
138 | struct rb_node *parent = *node; | ||
139 | struct wakelock *wl; | ||
140 | |||
141 | while (*node) { | ||
142 | int diff; | ||
143 | |||
144 | parent = *node; | ||
145 | wl = rb_entry(*node, struct wakelock, node); | ||
146 | diff = strncmp(name, wl->name, len); | ||
147 | if (diff == 0) { | ||
148 | if (wl->name[len]) | ||
149 | diff = -1; | ||
150 | else | ||
151 | return wl; | ||
152 | } | ||
153 | if (diff < 0) | ||
154 | node = &(*node)->rb_left; | ||
155 | else | ||
156 | node = &(*node)->rb_right; | ||
157 | } | ||
158 | if (!add_if_not_found) | ||
159 | return ERR_PTR(-EINVAL); | ||
160 | |||
161 | if (wakelocks_limit_exceeded()) | ||
162 | return ERR_PTR(-ENOSPC); | ||
163 | |||
164 | /* Not found, we have to add a new one. */ | ||
165 | wl = kzalloc(sizeof(*wl), GFP_KERNEL); | ||
166 | if (!wl) | ||
167 | return ERR_PTR(-ENOMEM); | ||
168 | |||
169 | wl->name = kstrndup(name, len, GFP_KERNEL); | ||
170 | if (!wl->name) { | ||
171 | kfree(wl); | ||
172 | return ERR_PTR(-ENOMEM); | ||
173 | } | ||
174 | wl->ws.name = wl->name; | ||
175 | wakeup_source_add(&wl->ws); | ||
176 | rb_link_node(&wl->node, parent, node); | ||
177 | rb_insert_color(&wl->node, &wakelocks_tree); | ||
178 | wakelocks_lru_add(wl); | ||
179 | increment_wakelocks_number(); | ||
180 | return wl; | ||
181 | } | ||
182 | |||
183 | int pm_wake_lock(const char *buf) | ||
184 | { | ||
185 | const char *str = buf; | ||
186 | struct wakelock *wl; | ||
187 | u64 timeout_ns = 0; | ||
188 | size_t len; | ||
189 | int ret = 0; | ||
190 | |||
191 | while (*str && !isspace(*str)) | ||
192 | str++; | ||
193 | |||
194 | len = str - buf; | ||
195 | if (!len) | ||
196 | return -EINVAL; | ||
197 | |||
198 | if (*str && *str != '\n') { | ||
199 | /* Find out if there's a valid timeout string appended. */ | ||
200 | ret = kstrtou64(skip_spaces(str), 10, &timeout_ns); | ||
201 | if (ret) | ||
202 | return -EINVAL; | ||
203 | } | ||
204 | |||
205 | mutex_lock(&wakelocks_lock); | ||
206 | |||
207 | wl = wakelock_lookup_add(buf, len, true); | ||
208 | if (IS_ERR(wl)) { | ||
209 | ret = PTR_ERR(wl); | ||
210 | goto out; | ||
211 | } | ||
212 | if (timeout_ns) { | ||
213 | u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1; | ||
214 | |||
215 | do_div(timeout_ms, NSEC_PER_MSEC); | ||
216 | __pm_wakeup_event(&wl->ws, timeout_ms); | ||
217 | } else { | ||
218 | __pm_stay_awake(&wl->ws); | ||
219 | } | ||
220 | |||
221 | wakelocks_lru_most_recent(wl); | ||
222 | |||
223 | out: | ||
224 | mutex_unlock(&wakelocks_lock); | ||
225 | return ret; | ||
226 | } | ||
227 | |||
228 | int pm_wake_unlock(const char *buf) | ||
229 | { | ||
230 | struct wakelock *wl; | ||
231 | size_t len; | ||
232 | int ret = 0; | ||
233 | |||
234 | len = strlen(buf); | ||
235 | if (!len) | ||
236 | return -EINVAL; | ||
237 | |||
238 | if (buf[len-1] == '\n') | ||
239 | len--; | ||
240 | |||
241 | if (!len) | ||
242 | return -EINVAL; | ||
243 | |||
244 | mutex_lock(&wakelocks_lock); | ||
245 | |||
246 | wl = wakelock_lookup_add(buf, len, false); | ||
247 | if (IS_ERR(wl)) { | ||
248 | ret = PTR_ERR(wl); | ||
249 | goto out; | ||
250 | } | ||
251 | __pm_relax(&wl->ws); | ||
252 | |||
253 | wakelocks_lru_most_recent(wl); | ||
254 | wakelocks_gc(); | ||
255 | |||
256 | out: | ||
257 | mutex_unlock(&wakelocks_lock); | ||
258 | return ret; | ||
259 | } | ||
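[Editor's sketch, not part of the series] The string format accepted above is "<name>[ <timeout in ns>]" for wake_lock and "<name>" for wake_unlock. A hedged user-space example; the lock name and timeout are arbitrary:

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	static int write_str(const char *path, const char *s)
	{
		int fd = open(path, O_WRONLY);
		ssize_t n;

		if (fd < 0)
			return -1;
		n = write(fd, s, strlen(s));
		close(fd);
		return n < 0 ? -1 : 0;
	}

	int main(void)
	{
		/* Hold a wakeup source named "myapp" for at most 5 s (timeout in ns). */
		write_str("/sys/power/wake_lock", "myapp 5000000000");

		/* ... work that must not race with autosleep ... */

		/* Release it early; only the name is written to wake_unlock. */
		return write_str("/sys/power/wake_unlock", "myapp");
	}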
diff --git a/kernel/printk.c b/kernel/printk.c index b663c2c95d39..ac4bc9e79465 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <linux/cpu.h> | 41 | #include <linux/cpu.h> |
42 | #include <linux/notifier.h> | 42 | #include <linux/notifier.h> |
43 | #include <linux/rculist.h> | 43 | #include <linux/rculist.h> |
44 | #include <linux/poll.h> | ||
44 | 45 | ||
45 | #include <asm/uaccess.h> | 46 | #include <asm/uaccess.h> |
46 | 47 | ||
@@ -54,8 +55,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) | |||
54 | { | 55 | { |
55 | } | 56 | } |
56 | 57 | ||
57 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | ||
58 | |||
59 | /* printk's without a loglevel use this.. */ | 58 | /* printk's without a loglevel use this.. */ |
60 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL | 59 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL |
61 | 60 | ||
@@ -99,24 +98,6 @@ EXPORT_SYMBOL_GPL(console_drivers); | |||
99 | static int console_locked, console_suspended; | 98 | static int console_locked, console_suspended; |
100 | 99 | ||
101 | /* | 100 | /* |
102 | * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars | ||
103 | * It is also used in interesting ways to provide interlocking in | ||
104 | * console_unlock();. | ||
105 | */ | ||
106 | static DEFINE_RAW_SPINLOCK(logbuf_lock); | ||
107 | |||
108 | #define LOG_BUF_MASK (log_buf_len-1) | ||
109 | #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) | ||
110 | |||
111 | /* | ||
112 | * The indices into log_buf are not constrained to log_buf_len - they | ||
113 | * must be masked before subscripting | ||
114 | */ | ||
115 | static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */ | ||
116 | static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */ | ||
117 | static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ | ||
118 | |||
119 | /* | ||
120 | * If exclusive_console is non-NULL then only this console is to be printed to. | 101 | * If exclusive_console is non-NULL then only this console is to be printed to. |
121 | */ | 102 | */ |
122 | static struct console *exclusive_console; | 103 | static struct console *exclusive_console; |
@@ -145,13 +126,510 @@ EXPORT_SYMBOL(console_set_on_cmdline); | |||
145 | /* Flag: console code may call schedule() */ | 126 | /* Flag: console code may call schedule() */ |
146 | static int console_may_schedule; | 127 | static int console_may_schedule; |
147 | 128 | ||
129 | /* | ||
130 | * The printk log buffer consists of a chain of concatenated variable | ||
131 | * length records. Every record starts with a record header, containing | ||
132 | * the overall length of the record. | ||
133 | * | ||
134 | * The heads of the first and last entry in the buffer, as well as the | ||
135 | * sequence numbers of both entries, are maintained when messages | ||
136 | * are stored. | ||
137 | * | ||
138 | * If the heads indicate available messages, the length in the header | ||
139 | * tells where the next message starts. A length == 0 for the next message | ||
140 | * indicates a wrap-around to the beginning of the buffer. | ||
141 | * | ||
142 | * Every record carries the monotonic timestamp in nanoseconds, as well as | ||
143 | * the standard userspace syslog level and syslog facility. The usual | ||
144 | * kernel messages use LOG_KERN; userspace-injected messages always carry | ||
145 | * a matching syslog facility, by default LOG_USER. The origin of every | ||
146 | * message can be reliably determined that way. | ||
147 | * | ||
148 | * The human-readable log message directly follows the message header. The | ||
149 | * length of the message text is stored in the header; the stored message | ||
150 | * is not terminated. | ||
151 | * | ||
152 | * Optionally, a message can carry a dictionary of properties (key/value pairs), | ||
153 | * to provide userspace with a machine-readable message context. | ||
154 | * | ||
155 | * Examples for well-defined, commonly used property names are: | ||
156 | * DEVICE=b12:8 device identifier | ||
157 | * b12:8 block dev_t | ||
158 | * c127:3 char dev_t | ||
159 | * n8 netdev ifindex | ||
160 | * +sound:card0 subsystem:devname | ||
161 | * SUBSYSTEM=pci driver-core subsystem name | ||
162 | * | ||
163 | * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value | ||
164 | * follows directly after a '=' character. Every property is terminated by | ||
165 | * a '\0' character. The last property is not terminated. | ||
166 | * | ||
167 | * Example of a message structure: | ||
168 | * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec | ||
169 | * 0008 34 00 record is 52 bytes long | ||
170 | * 000a 0b 00 text is 11 bytes long | ||
171 | * 000c 16 00 dictionary is 22 bytes long | ||
172 | * 000e 03 00 LOG_KERN (facility) LOG_ERR (level) | ||
173 | * 0010 69 74 27 73 20 61 20 6c "it's a l" | ||
174 | * 69 6e 65 "ine" | ||
175 | * 001b 44 45 56 49 43 "DEVIC" | ||
176 | * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D" | ||
177 | * 52 49 56 45 52 3d 62 75 "RIVER=bu" | ||
178 | * 67 "g" | ||
179 | * 0031 00 00 00 padding to next message header | ||
180 | * | ||
181 | * The 'struct log' buffer header must never be directly exported to | ||
182 | * userspace; it is a kernel-private implementation detail that might | ||
183 | * need to be changed in the future, when the requirements change. | ||
184 | * | ||
185 | * /dev/kmsg exports the structured data in the following line format: | ||
186 | * "level,seqnum,timestamp;<message text>\n" | ||
187 | * | ||
188 | * The optional key/value pairs are attached as continuation lines starting | ||
189 | * with a space character and terminated by a newline. All possible | ||
190 | * non-printable characters are escaped in the "\xff" notation. | ||
191 | * | ||
192 | * Users of the export format should ignore possible additional values | ||
193 | * separated by ',', and find the message after the ';' character. | ||
194 | */ | ||
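
As a concrete illustration of the export format described above, a minimal user-space reader might look like the sketch below. It assumes only what the comment states: each read() returns one record, any extra comma-separated values before the ';' are skipped, and key/value continuation lines start with a space. Buffer sizes and variable names are illustrative, not part of the interface; the loop runs until interrupted.

/*
 * Hedged sketch of a /dev/kmsg reader, based only on the export format
 * documented above. Names and sizes are illustrative assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char rec[8192];
	int fd = open("/dev/kmsg", O_RDONLY);
	ssize_t n;

	if (fd < 0)
		return 1;

	while ((n = read(fd, rec, sizeof(rec) - 1)) > 0) {
		unsigned int prefix;
		unsigned long long seq, ts_usec;
		char *msg;

		rec[n] = '\0';
		/* "level,seqnum,timestamp[,...];<message>"; ignore unknown fields */
		if (sscanf(rec, "%u,%llu,%llu", &prefix, &seq, &ts_usec) != 3)
			continue;
		msg = strchr(rec, ';');
		if (!msg)
			continue;
		printf("facility=%u level=%u seq=%llu usec=%llu text=%s",
		       prefix >> 3, prefix & 7, seq, ts_usec, msg + 1);
		/* continuation lines with key=value pairs, if any, begin with ' ' */
	}
	close(fd);
	return 0;
}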
195 | |||
196 | enum log_flags { | ||
197 | LOG_NOCONS = 1, /* already flushed, do not print to console */ | ||
198 | LOG_NEWLINE = 2, /* text ended with a newline */ | ||
199 | LOG_PREFIX = 4, /* text started with a prefix */ | ||
200 | LOG_CONT = 8, /* text is a fragment of a continuation line */ | ||
201 | }; | ||
202 | |||
203 | struct log { | ||
204 | u64 ts_nsec; /* timestamp in nanoseconds */ | ||
205 | u16 len; /* length of entire record */ | ||
206 | u16 text_len; /* length of text buffer */ | ||
207 | u16 dict_len; /* length of dictionary buffer */ | ||
208 | u8 facility; /* syslog facility */ | ||
209 | u8 flags:5; /* internal record flags */ | ||
210 | u8 level:3; /* syslog level */ | ||
211 | }; | ||
212 | |||
213 | /* | ||
214 | * The logbuf_lock protects kmsg buffer, indices, counters. It is also | ||
215 | * used in interesting ways to provide interlocking in console_unlock(); | ||
216 | */ | ||
217 | static DEFINE_RAW_SPINLOCK(logbuf_lock); | ||
218 | |||
219 | /* the next printk record to read by syslog(READ) or /proc/kmsg */ | ||
220 | static u64 syslog_seq; | ||
221 | static u32 syslog_idx; | ||
222 | static enum log_flags syslog_prev; | ||
223 | static size_t syslog_partial; | ||
224 | |||
225 | /* index and sequence number of the first record stored in the buffer */ | ||
226 | static u64 log_first_seq; | ||
227 | static u32 log_first_idx; | ||
228 | |||
229 | /* index and sequence number of the next record to store in the buffer */ | ||
230 | static u64 log_next_seq; | ||
148 | #ifdef CONFIG_PRINTK | 231 | #ifdef CONFIG_PRINTK |
232 | static u32 log_next_idx; | ||
149 | 233 | ||
150 | static char __log_buf[__LOG_BUF_LEN]; | 234 | /* the next printk record to read after the last 'clear' command */ |
235 | static u64 clear_seq; | ||
236 | static u32 clear_idx; | ||
237 | |||
238 | #define LOG_LINE_MAX 1024 | ||
239 | |||
240 | /* record buffer */ | ||
241 | #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) | ||
242 | #define LOG_ALIGN 4 | ||
243 | #else | ||
244 | #define LOG_ALIGN __alignof__(struct log) | ||
245 | #endif | ||
246 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | ||
247 | static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); | ||
151 | static char *log_buf = __log_buf; | 248 | static char *log_buf = __log_buf; |
152 | static int log_buf_len = __LOG_BUF_LEN; | 249 | static u32 log_buf_len = __LOG_BUF_LEN; |
153 | static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ | 250 | |
154 | static int saved_console_loglevel = -1; | 251 | /* cpu currently holding logbuf_lock */ |
252 | static volatile unsigned int logbuf_cpu = UINT_MAX; | ||
253 | |||
254 | /* human readable text of the record */ | ||
255 | static char *log_text(const struct log *msg) | ||
256 | { | ||
257 | return (char *)msg + sizeof(struct log); | ||
258 | } | ||
259 | |||
260 | /* optional key/value pair dictionary attached to the record */ | ||
261 | static char *log_dict(const struct log *msg) | ||
262 | { | ||
263 | return (char *)msg + sizeof(struct log) + msg->text_len; | ||
264 | } | ||
265 | |||
266 | /* get record by index; idx must point to valid msg */ | ||
267 | static struct log *log_from_idx(u32 idx) | ||
268 | { | ||
269 | struct log *msg = (struct log *)(log_buf + idx); | ||
270 | |||
271 | /* | ||
272 | * A length == 0 record is the end of buffer marker. Wrap around and | ||
273 | * read the message at the start of the buffer. | ||
274 | */ | ||
275 | if (!msg->len) | ||
276 | return (struct log *)log_buf; | ||
277 | return msg; | ||
278 | } | ||
279 | |||
280 | /* get next record; idx must point to valid msg */ | ||
281 | static u32 log_next(u32 idx) | ||
282 | { | ||
283 | struct log *msg = (struct log *)(log_buf + idx); | ||
284 | |||
285 | /* length == 0 indicates the end of the buffer; wrap */ | ||
286 | /* | ||
287 | * A length == 0 record is the end of buffer marker. Wrap around and | ||
288 | * read the message at the start of the buffer as *this* one, and | ||
289 | * return the one after that. | ||
290 | */ | ||
291 | if (!msg->len) { | ||
292 | msg = (struct log *)log_buf; | ||
293 | return msg->len; | ||
294 | } | ||
295 | return idx + msg->len; | ||
296 | } | ||
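
The two helpers above define the traversal pattern used throughout the rest of the file (syslog_print_all(), SYSLOG_ACTION_SIZE_UNREAD, console_unlock()). A hedged stand-alone sketch of that pattern follows; count_records() is a hypothetical helper, and like the real callers it must run with logbuf_lock held.

/* Hedged sketch of walking all stored records; caller holds logbuf_lock. */
static u32 count_records(void)
{
	u64 seq = log_first_seq;
	u32 idx = log_first_idx;
	u32 count = 0;

	while (seq < log_next_seq) {
		struct log *msg = log_from_idx(idx);

		(void)msg;		/* a real caller would format msg here */
		idx = log_next(idx);
		seq++;
		count++;
	}
	return count;
}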
297 | |||
298 | /* insert record into the buffer, discard old ones, update heads */ | ||
299 | static void log_store(int facility, int level, | ||
300 | enum log_flags flags, u64 ts_nsec, | ||
301 | const char *dict, u16 dict_len, | ||
302 | const char *text, u16 text_len) | ||
303 | { | ||
304 | struct log *msg; | ||
305 | u32 size, pad_len; | ||
306 | |||
307 | /* number of '\0' padding bytes to next message */ | ||
308 | size = sizeof(struct log) + text_len + dict_len; | ||
309 | pad_len = (-size) & (LOG_ALIGN - 1); | ||
310 | size += pad_len; | ||
311 | |||
312 | while (log_first_seq < log_next_seq) { | ||
313 | u32 free; | ||
314 | |||
315 | if (log_next_idx > log_first_idx) | ||
316 | free = max(log_buf_len - log_next_idx, log_first_idx); | ||
317 | else | ||
318 | free = log_first_idx - log_next_idx; | ||
319 | |||
320 | if (free > size + sizeof(struct log)) | ||
321 | break; | ||
322 | |||
323 | /* drop old messages until we have enough continuous space */ | ||
324 | log_first_idx = log_next(log_first_idx); | ||
325 | log_first_seq++; | ||
326 | } | ||
327 | |||
328 | if (log_next_idx + size + sizeof(struct log) >= log_buf_len) { | ||
329 | /* | ||
330 | * This message + an additional empty header does not fit | ||
331 | * at the end of the buffer. Add an empty header with len == 0 | ||
332 | * to signify a wrap around. | ||
333 | */ | ||
334 | memset(log_buf + log_next_idx, 0, sizeof(struct log)); | ||
335 | log_next_idx = 0; | ||
336 | } | ||
337 | |||
338 | /* fill message */ | ||
339 | msg = (struct log *)(log_buf + log_next_idx); | ||
340 | memcpy(log_text(msg), text, text_len); | ||
341 | msg->text_len = text_len; | ||
342 | memcpy(log_dict(msg), dict, dict_len); | ||
343 | msg->dict_len = dict_len; | ||
344 | msg->facility = facility; | ||
345 | msg->level = level & 7; | ||
346 | msg->flags = flags & 0x1f; | ||
347 | if (ts_nsec > 0) | ||
348 | msg->ts_nsec = ts_nsec; | ||
349 | else | ||
350 | msg->ts_nsec = local_clock(); | ||
351 | memset(log_dict(msg) + dict_len, 0, pad_len); | ||
352 | msg->len = sizeof(struct log) + text_len + dict_len + pad_len; | ||
353 | |||
354 | /* insert message */ | ||
355 | log_next_idx += msg->len; | ||
356 | log_next_seq++; | ||
357 | } | ||
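
The padding arithmetic in log_store() can be checked in isolation. The sketch below is a stand-alone illustration that assumes LOG_ALIGN == 4 and mirrors the (-size) & (LOG_ALIGN - 1) expression used above; it is not kernel code.

/* Stand-alone illustration of the record padding, assuming LOG_ALIGN == 4. */
#include <assert.h>

#define LOG_ALIGN 4

static unsigned int pad_to_align(unsigned int size)
{
	/* (-size) & (LOG_ALIGN - 1) == bytes needed to reach the next multiple */
	return (-size) & (LOG_ALIGN - 1);
}

int main(void)
{
	/* e.g. a 49-byte record body is padded by 3 bytes to 52 in total */
	assert(pad_to_align(49) == 3);
	assert(pad_to_align(52) == 0);	/* already aligned, no padding */
	return 0;
}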
358 | |||
359 | /* /dev/kmsg - userspace message inject/listen interface */ | ||
360 | struct devkmsg_user { | ||
361 | u64 seq; | ||
362 | u32 idx; | ||
363 | struct mutex lock; | ||
364 | char buf[8192]; | ||
365 | }; | ||
366 | |||
367 | static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, | ||
368 | unsigned long count, loff_t pos) | ||
369 | { | ||
370 | char *buf, *line; | ||
371 | int i; | ||
372 | int level = default_message_loglevel; | ||
373 | int facility = 1; /* LOG_USER */ | ||
374 | size_t len = iov_length(iv, count); | ||
375 | ssize_t ret = len; | ||
376 | |||
377 | if (len > LOG_LINE_MAX) | ||
378 | return -EINVAL; | ||
379 | buf = kmalloc(len+1, GFP_KERNEL); | ||
380 | if (buf == NULL) | ||
381 | return -ENOMEM; | ||
382 | |||
383 | line = buf; | ||
384 | for (i = 0; i < count; i++) { | ||
385 | if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) | ||
386 | goto out; | ||
387 | line += iv[i].iov_len; | ||
388 | } | ||
389 | |||
390 | /* | ||
391 | * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace | ||
392 | * the decimal value encodes a 32-bit quantity: the lower 3 bits are the | ||
393 | * log level, the rest is the log facility. | ||
394 | * | ||
395 | * If no prefix or no userspace facility is specified, we | ||
396 | * enforce LOG_USER, to be able to reliably distinguish | ||
397 | * kernel-generated messages from userspace-injected ones. | ||
398 | */ | ||
399 | line = buf; | ||
400 | if (line[0] == '<') { | ||
401 | char *endp = NULL; | ||
402 | |||
403 | i = simple_strtoul(line+1, &endp, 10); | ||
404 | if (endp && endp[0] == '>') { | ||
405 | level = i & 7; | ||
406 | if (i >> 3) | ||
407 | facility = i >> 3; | ||
408 | endp++; | ||
409 | len -= endp - line; | ||
410 | line = endp; | ||
411 | } | ||
412 | } | ||
413 | line[len] = '\0'; | ||
414 | |||
415 | printk_emit(facility, level, NULL, 0, "%s", line); | ||
416 | out: | ||
417 | kfree(buf); | ||
418 | return ret; | ||
419 | } | ||
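
As a usage illustration of the prefix handling in devkmsg_writev() above, a hedged user-space sketch follows. The value 13 decodes to facility 1 (LOG_USER) and level 5 via the i >> 3 / i & 7 split; with no prefix, the message gets the default loglevel and LOG_USER is enforced as the facility. The program itself is illustrative, not a supported tool.

/*
 * Hedged sketch of injecting a message through /dev/kmsg. "<13>" is
 * facility 1 (LOG_USER) << 3 | level 5, matching the parsing above.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char msg[] = "<13>hello from userspace\n";
	int fd = open("/dev/kmsg", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, msg, strlen(msg)) < 0) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}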
420 | |||
421 | static ssize_t devkmsg_read(struct file *file, char __user *buf, | ||
422 | size_t count, loff_t *ppos) | ||
423 | { | ||
424 | struct devkmsg_user *user = file->private_data; | ||
425 | struct log *msg; | ||
426 | u64 ts_usec; | ||
427 | size_t i; | ||
428 | size_t len; | ||
429 | ssize_t ret; | ||
430 | |||
431 | if (!user) | ||
432 | return -EBADF; | ||
433 | |||
434 | ret = mutex_lock_interruptible(&user->lock); | ||
435 | if (ret) | ||
436 | return ret; | ||
437 | raw_spin_lock_irq(&logbuf_lock); | ||
438 | while (user->seq == log_next_seq) { | ||
439 | if (file->f_flags & O_NONBLOCK) { | ||
440 | ret = -EAGAIN; | ||
441 | raw_spin_unlock_irq(&logbuf_lock); | ||
442 | goto out; | ||
443 | } | ||
444 | |||
445 | raw_spin_unlock_irq(&logbuf_lock); | ||
446 | ret = wait_event_interruptible(log_wait, | ||
447 | user->seq != log_next_seq); | ||
448 | if (ret) | ||
449 | goto out; | ||
450 | raw_spin_lock_irq(&logbuf_lock); | ||
451 | } | ||
452 | |||
453 | if (user->seq < log_first_seq) { | ||
454 | /* our last seen message is gone, return error and reset */ | ||
455 | user->idx = log_first_idx; | ||
456 | user->seq = log_first_seq; | ||
457 | ret = -EPIPE; | ||
458 | raw_spin_unlock_irq(&logbuf_lock); | ||
459 | goto out; | ||
460 | } | ||
461 | |||
462 | msg = log_from_idx(user->idx); | ||
463 | ts_usec = msg->ts_nsec; | ||
464 | do_div(ts_usec, 1000); | ||
465 | len = sprintf(user->buf, "%u,%llu,%llu;", | ||
466 | (msg->facility << 3) | msg->level, user->seq, ts_usec); | ||
467 | |||
468 | /* escape non-printable characters */ | ||
469 | for (i = 0; i < msg->text_len; i++) { | ||
470 | unsigned char c = log_text(msg)[i]; | ||
471 | |||
472 | if (c < ' ' || c >= 127 || c == '\\') | ||
473 | len += sprintf(user->buf + len, "\\x%02x", c); | ||
474 | else | ||
475 | user->buf[len++] = c; | ||
476 | } | ||
477 | user->buf[len++] = '\n'; | ||
478 | |||
479 | if (msg->dict_len) { | ||
480 | bool line = true; | ||
481 | |||
482 | for (i = 0; i < msg->dict_len; i++) { | ||
483 | unsigned char c = log_dict(msg)[i]; | ||
484 | |||
485 | if (line) { | ||
486 | user->buf[len++] = ' '; | ||
487 | line = false; | ||
488 | } | ||
489 | |||
490 | if (c == '\0') { | ||
491 | user->buf[len++] = '\n'; | ||
492 | line = true; | ||
493 | continue; | ||
494 | } | ||
495 | |||
496 | if (c < ' ' || c >= 127 || c == '\\') { | ||
497 | len += sprintf(user->buf + len, "\\x%02x", c); | ||
498 | continue; | ||
499 | } | ||
500 | |||
501 | user->buf[len++] = c; | ||
502 | } | ||
503 | user->buf[len++] = '\n'; | ||
504 | } | ||
505 | |||
506 | user->idx = log_next(user->idx); | ||
507 | user->seq++; | ||
508 | raw_spin_unlock_irq(&logbuf_lock); | ||
509 | |||
510 | if (len > count) { | ||
511 | ret = -EINVAL; | ||
512 | goto out; | ||
513 | } | ||
514 | |||
515 | if (copy_to_user(buf, user->buf, len)) { | ||
516 | ret = -EFAULT; | ||
517 | goto out; | ||
518 | } | ||
519 | ret = len; | ||
520 | out: | ||
521 | mutex_unlock(&user->lock); | ||
522 | return ret; | ||
523 | } | ||
524 | |||
525 | static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) | ||
526 | { | ||
527 | struct devkmsg_user *user = file->private_data; | ||
528 | loff_t ret = 0; | ||
529 | |||
530 | if (!user) | ||
531 | return -EBADF; | ||
532 | if (offset) | ||
533 | return -ESPIPE; | ||
534 | |||
535 | raw_spin_lock_irq(&logbuf_lock); | ||
536 | switch (whence) { | ||
537 | case SEEK_SET: | ||
538 | /* the first record */ | ||
539 | user->idx = log_first_idx; | ||
540 | user->seq = log_first_seq; | ||
541 | break; | ||
542 | case SEEK_DATA: | ||
543 | /* | ||
544 | * The first record after the last SYSLOG_ACTION_CLEAR, | ||
545 | * like issued by 'dmesg -c'. Reading /dev/kmsg itself | ||
546 | * changes no global state, and does not clear anything. | ||
547 | */ | ||
548 | user->idx = clear_idx; | ||
549 | user->seq = clear_seq; | ||
550 | break; | ||
551 | case SEEK_END: | ||
552 | /* after the last record */ | ||
553 | user->idx = log_next_idx; | ||
554 | user->seq = log_next_seq; | ||
555 | break; | ||
556 | default: | ||
557 | ret = -EINVAL; | ||
558 | } | ||
559 | raw_spin_unlock_irq(&logbuf_lock); | ||
560 | return ret; | ||
561 | } | ||
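
The seek semantics implemented above can be summarized from the caller's side: SEEK_SET rewinds to the oldest record, SEEK_DATA jumps to the first record after the last SYSLOG_ACTION_CLEAR, SEEK_END moves past the newest record, and any non-zero offset returns -ESPIPE. The sketch below is a hedged user-space illustration; replay_since_last_clear() is a hypothetical helper, and SEEK_DATA availability depends on the libc headers (_GNU_SOURCE with glibc).

/* Hedged sketch of using SEEK_DATA on an open /dev/kmsg descriptor. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static int replay_since_last_clear(int kmsg_fd)
{
	/* start reading where 'dmesg -c' left off; does not clear anything */
	return lseek(kmsg_fd, 0, SEEK_DATA) < 0 ? -1 : 0;
}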
562 | |||
563 | static unsigned int devkmsg_poll(struct file *file, poll_table *wait) | ||
564 | { | ||
565 | struct devkmsg_user *user = file->private_data; | ||
566 | int ret = 0; | ||
567 | |||
568 | if (!user) | ||
569 | return POLLERR|POLLNVAL; | ||
570 | |||
571 | poll_wait(file, &log_wait, wait); | ||
572 | |||
573 | raw_spin_lock_irq(&logbuf_lock); | ||
574 | if (user->seq < log_next_seq) { | ||
575 | /* return error when data has vanished underneath us */ | ||
576 | if (user->seq < log_first_seq) | ||
577 | ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; | ||
578 | else ret = POLLIN|POLLRDNORM; | ||
579 | } | ||
580 | raw_spin_unlock_irq(&logbuf_lock); | ||
581 | |||
582 | return ret; | ||
583 | } | ||
584 | |||
585 | static int devkmsg_open(struct inode *inode, struct file *file) | ||
586 | { | ||
587 | struct devkmsg_user *user; | ||
588 | int err; | ||
589 | |||
590 | /* write-only does not need any file context */ | ||
591 | if ((file->f_flags & O_ACCMODE) == O_WRONLY) | ||
592 | return 0; | ||
593 | |||
594 | err = security_syslog(SYSLOG_ACTION_READ_ALL); | ||
595 | if (err) | ||
596 | return err; | ||
597 | |||
598 | user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); | ||
599 | if (!user) | ||
600 | return -ENOMEM; | ||
601 | |||
602 | mutex_init(&user->lock); | ||
603 | |||
604 | raw_spin_lock_irq(&logbuf_lock); | ||
605 | user->idx = log_first_idx; | ||
606 | user->seq = log_first_seq; | ||
607 | raw_spin_unlock_irq(&logbuf_lock); | ||
608 | |||
609 | file->private_data = user; | ||
610 | return 0; | ||
611 | } | ||
612 | |||
613 | static int devkmsg_release(struct inode *inode, struct file *file) | ||
614 | { | ||
615 | struct devkmsg_user *user = file->private_data; | ||
616 | |||
617 | if (!user) | ||
618 | return 0; | ||
619 | |||
620 | mutex_destroy(&user->lock); | ||
621 | kfree(user); | ||
622 | return 0; | ||
623 | } | ||
624 | |||
625 | const struct file_operations kmsg_fops = { | ||
626 | .open = devkmsg_open, | ||
627 | .read = devkmsg_read, | ||
628 | .aio_write = devkmsg_writev, | ||
629 | .llseek = devkmsg_llseek, | ||
630 | .poll = devkmsg_poll, | ||
631 | .release = devkmsg_release, | ||
632 | }; | ||
155 | 633 | ||
156 | #ifdef CONFIG_KEXEC | 634 | #ifdef CONFIG_KEXEC |
157 | /* | 635 | /* |
@@ -165,9 +643,9 @@ static int saved_console_loglevel = -1; | |||
165 | void log_buf_kexec_setup(void) | 643 | void log_buf_kexec_setup(void) |
166 | { | 644 | { |
167 | VMCOREINFO_SYMBOL(log_buf); | 645 | VMCOREINFO_SYMBOL(log_buf); |
168 | VMCOREINFO_SYMBOL(log_end); | ||
169 | VMCOREINFO_SYMBOL(log_buf_len); | 646 | VMCOREINFO_SYMBOL(log_buf_len); |
170 | VMCOREINFO_SYMBOL(logged_chars); | 647 | VMCOREINFO_SYMBOL(log_first_idx); |
648 | VMCOREINFO_SYMBOL(log_next_idx); | ||
171 | } | 649 | } |
172 | #endif | 650 | #endif |
173 | 651 | ||
@@ -191,7 +669,6 @@ early_param("log_buf_len", log_buf_len_setup); | |||
191 | void __init setup_log_buf(int early) | 669 | void __init setup_log_buf(int early) |
192 | { | 670 | { |
193 | unsigned long flags; | 671 | unsigned long flags; |
194 | unsigned start, dest_idx, offset; | ||
195 | char *new_log_buf; | 672 | char *new_log_buf; |
196 | int free; | 673 | int free; |
197 | 674 | ||
@@ -219,20 +696,8 @@ void __init setup_log_buf(int early) | |||
219 | log_buf_len = new_log_buf_len; | 696 | log_buf_len = new_log_buf_len; |
220 | log_buf = new_log_buf; | 697 | log_buf = new_log_buf; |
221 | new_log_buf_len = 0; | 698 | new_log_buf_len = 0; |
222 | free = __LOG_BUF_LEN - log_end; | 699 | free = __LOG_BUF_LEN - log_next_idx; |
223 | 700 | memcpy(log_buf, __log_buf, __LOG_BUF_LEN); | |
224 | offset = start = min(con_start, log_start); | ||
225 | dest_idx = 0; | ||
226 | while (start != log_end) { | ||
227 | unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1); | ||
228 | |||
229 | log_buf[dest_idx] = __log_buf[log_idx_mask]; | ||
230 | start++; | ||
231 | dest_idx++; | ||
232 | } | ||
233 | log_start -= offset; | ||
234 | con_start -= offset; | ||
235 | log_end -= offset; | ||
236 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 701 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
237 | 702 | ||
238 | pr_info("log_buf_len: %d\n", log_buf_len); | 703 | pr_info("log_buf_len: %d\n", log_buf_len); |
@@ -332,11 +797,270 @@ static int check_syslog_permissions(int type, bool from_file) | |||
332 | return 0; | 797 | return 0; |
333 | } | 798 | } |
334 | 799 | ||
800 | #if defined(CONFIG_PRINTK_TIME) | ||
801 | static bool printk_time = 1; | ||
802 | #else | ||
803 | static bool printk_time; | ||
804 | #endif | ||
805 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); | ||
806 | |||
807 | static size_t print_time(u64 ts, char *buf) | ||
808 | { | ||
809 | unsigned long rem_nsec; | ||
810 | |||
811 | if (!printk_time) | ||
812 | return 0; | ||
813 | |||
814 | if (!buf) | ||
815 | return 15; | ||
816 | |||
817 | rem_nsec = do_div(ts, 1000000000); | ||
818 | return sprintf(buf, "[%5lu.%06lu] ", | ||
819 | (unsigned long)ts, rem_nsec / 1000); | ||
820 | } | ||
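
A quick stand-alone check of the "[seconds.microseconds] " formatting produced by print_time() above; the numbers are illustrative, and the user-space do_div() stand-in is an assumption.

/* Stand-alone check of the timestamp prefix format used above. */
#include <stdio.h>

int main(void)
{
	unsigned long long ts = 5034567890ULL;		/* nanoseconds */
	unsigned long rem_nsec = ts % 1000000000UL;	/* do_div() remainder */

	ts /= 1000000000UL;
	printf("[%5lu.%06lu] \n", (unsigned long)ts, rem_nsec / 1000);
	/* prints "[    5.034567] " */
	return 0;
}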
821 | |||
822 | static size_t print_prefix(const struct log *msg, bool syslog, char *buf) | ||
823 | { | ||
824 | size_t len = 0; | ||
825 | unsigned int prefix = (msg->facility << 3) | msg->level; | ||
826 | |||
827 | if (syslog) { | ||
828 | if (buf) { | ||
829 | len += sprintf(buf, "<%u>", prefix); | ||
830 | } else { | ||
831 | len += 3; | ||
832 | if (prefix > 999) | ||
833 | len += 3; | ||
834 | else if (prefix > 99) | ||
835 | len += 2; | ||
836 | else if (prefix > 9) | ||
837 | len++; | ||
838 | } | ||
839 | } | ||
840 | |||
841 | len += print_time(msg->ts_nsec, buf ? buf + len : NULL); | ||
842 | return len; | ||
843 | } | ||
844 | |||
845 | static size_t msg_print_text(const struct log *msg, enum log_flags prev, | ||
846 | bool syslog, char *buf, size_t size) | ||
847 | { | ||
848 | const char *text = log_text(msg); | ||
849 | size_t text_size = msg->text_len; | ||
850 | bool prefix = true; | ||
851 | bool newline = true; | ||
852 | size_t len = 0; | ||
853 | |||
854 | if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)) | ||
855 | prefix = false; | ||
856 | |||
857 | if (msg->flags & LOG_CONT) { | ||
858 | if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE)) | ||
859 | prefix = false; | ||
860 | |||
861 | if (!(msg->flags & LOG_NEWLINE)) | ||
862 | newline = false; | ||
863 | } | ||
864 | |||
865 | do { | ||
866 | const char *next = memchr(text, '\n', text_size); | ||
867 | size_t text_len; | ||
868 | |||
869 | if (next) { | ||
870 | text_len = next - text; | ||
871 | next++; | ||
872 | text_size -= next - text; | ||
873 | } else { | ||
874 | text_len = text_size; | ||
875 | } | ||
876 | |||
877 | if (buf) { | ||
878 | if (print_prefix(msg, syslog, NULL) + | ||
879 | text_len + 1 >= size - len) | ||
880 | break; | ||
881 | |||
882 | if (prefix) | ||
883 | len += print_prefix(msg, syslog, buf + len); | ||
884 | memcpy(buf + len, text, text_len); | ||
885 | len += text_len; | ||
886 | if (next || newline) | ||
887 | buf[len++] = '\n'; | ||
888 | } else { | ||
889 | /* SYSLOG_ACTION_* buffer size only calculation */ | ||
890 | if (prefix) | ||
891 | len += print_prefix(msg, syslog, NULL); | ||
892 | len += text_len; | ||
893 | if (next || newline) | ||
894 | len++; | ||
895 | } | ||
896 | |||
897 | prefix = true; | ||
898 | text = next; | ||
899 | } while (text); | ||
900 | |||
901 | return len; | ||
902 | } | ||
903 | |||
904 | static int syslog_print(char __user *buf, int size) | ||
905 | { | ||
906 | char *text; | ||
907 | struct log *msg; | ||
908 | int len = 0; | ||
909 | |||
910 | text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); | ||
911 | if (!text) | ||
912 | return -ENOMEM; | ||
913 | |||
914 | while (size > 0) { | ||
915 | size_t n; | ||
916 | size_t skip; | ||
917 | |||
918 | raw_spin_lock_irq(&logbuf_lock); | ||
919 | if (syslog_seq < log_first_seq) { | ||
920 | /* messages are gone, move to first one */ | ||
921 | syslog_seq = log_first_seq; | ||
922 | syslog_idx = log_first_idx; | ||
923 | syslog_prev = 0; | ||
924 | syslog_partial = 0; | ||
925 | } | ||
926 | if (syslog_seq == log_next_seq) { | ||
927 | raw_spin_unlock_irq(&logbuf_lock); | ||
928 | break; | ||
929 | } | ||
930 | |||
931 | skip = syslog_partial; | ||
932 | msg = log_from_idx(syslog_idx); | ||
933 | n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX); | ||
934 | if (n - syslog_partial <= size) { | ||
935 | /* message fits into buffer, move forward */ | ||
936 | syslog_idx = log_next(syslog_idx); | ||
937 | syslog_seq++; | ||
938 | syslog_prev = msg->flags; | ||
939 | n -= syslog_partial; | ||
940 | syslog_partial = 0; | ||
941 | } else if (!len) { | ||
942 | /* partial read(), remember position */ | ||
943 | n = size; | ||
944 | syslog_partial += n; | ||
945 | } else | ||
946 | n = 0; | ||
947 | raw_spin_unlock_irq(&logbuf_lock); | ||
948 | |||
949 | if (!n) | ||
950 | break; | ||
951 | |||
952 | if (copy_to_user(buf, text + skip, n)) { | ||
953 | if (!len) | ||
954 | len = -EFAULT; | ||
955 | break; | ||
956 | } | ||
957 | |||
958 | len += n; | ||
959 | size -= n; | ||
960 | buf += n; | ||
961 | } | ||
962 | |||
963 | kfree(text); | ||
964 | return len; | ||
965 | } | ||
966 | |||
967 | static int syslog_print_all(char __user *buf, int size, bool clear) | ||
968 | { | ||
969 | char *text; | ||
970 | int len = 0; | ||
971 | |||
972 | text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); | ||
973 | if (!text) | ||
974 | return -ENOMEM; | ||
975 | |||
976 | raw_spin_lock_irq(&logbuf_lock); | ||
977 | if (buf) { | ||
978 | u64 next_seq; | ||
979 | u64 seq; | ||
980 | u32 idx; | ||
981 | enum log_flags prev; | ||
982 | |||
983 | if (clear_seq < log_first_seq) { | ||
984 | /* messages are gone, move to first available one */ | ||
985 | clear_seq = log_first_seq; | ||
986 | clear_idx = log_first_idx; | ||
987 | } | ||
988 | |||
989 | /* | ||
990 | * Find first record that fits, including all following records, | ||
991 | * into the user-provided buffer for this dump. | ||
992 | */ | ||
993 | seq = clear_seq; | ||
994 | idx = clear_idx; | ||
995 | prev = 0; | ||
996 | while (seq < log_next_seq) { | ||
997 | struct log *msg = log_from_idx(idx); | ||
998 | |||
999 | len += msg_print_text(msg, prev, true, NULL, 0); | ||
1000 | idx = log_next(idx); | ||
1001 | seq++; | ||
1002 | } | ||
1003 | |||
1004 | /* move first record forward until length fits into the buffer */ | ||
1005 | seq = clear_seq; | ||
1006 | idx = clear_idx; | ||
1007 | prev = 0; | ||
1008 | while (len > size && seq < log_next_seq) { | ||
1009 | struct log *msg = log_from_idx(idx); | ||
1010 | |||
1011 | len -= msg_print_text(msg, prev, true, NULL, 0); | ||
1012 | idx = log_next(idx); | ||
1013 | seq++; | ||
1014 | } | ||
1015 | |||
1016 | /* last message fitting into this dump */ | ||
1017 | next_seq = log_next_seq; | ||
1018 | |||
1019 | len = 0; | ||
1020 | prev = 0; | ||
1021 | while (len >= 0 && seq < next_seq) { | ||
1022 | struct log *msg = log_from_idx(idx); | ||
1023 | int textlen; | ||
1024 | |||
1025 | textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX); | ||
1026 | if (textlen < 0) { | ||
1027 | len = textlen; | ||
1028 | break; | ||
1029 | } | ||
1030 | idx = log_next(idx); | ||
1031 | seq++; | ||
1032 | prev = msg->flags; | ||
1033 | |||
1034 | raw_spin_unlock_irq(&logbuf_lock); | ||
1035 | if (copy_to_user(buf + len, text, textlen)) | ||
1036 | len = -EFAULT; | ||
1037 | else | ||
1038 | len += textlen; | ||
1039 | raw_spin_lock_irq(&logbuf_lock); | ||
1040 | |||
1041 | if (seq < log_first_seq) { | ||
1042 | /* messages are gone, move to next one */ | ||
1043 | seq = log_first_seq; | ||
1044 | idx = log_first_idx; | ||
1045 | prev = 0; | ||
1046 | } | ||
1047 | } | ||
1048 | } | ||
1049 | |||
1050 | if (clear) { | ||
1051 | clear_seq = log_next_seq; | ||
1052 | clear_idx = log_next_idx; | ||
1053 | } | ||
1054 | raw_spin_unlock_irq(&logbuf_lock); | ||
1055 | |||
1056 | kfree(text); | ||
1057 | return len; | ||
1058 | } | ||
1059 | |||
335 | int do_syslog(int type, char __user *buf, int len, bool from_file) | 1060 | int do_syslog(int type, char __user *buf, int len, bool from_file) |
336 | { | 1061 | { |
337 | unsigned i, j, limit, count; | 1062 | bool clear = false; |
338 | int do_clear = 0; | 1063 | static int saved_console_loglevel = -1; |
339 | char c; | ||
340 | int error; | 1064 | int error; |
341 | 1065 | ||
342 | error = check_syslog_permissions(type, from_file); | 1066 | error = check_syslog_permissions(type, from_file); |
@@ -364,28 +1088,14 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
364 | goto out; | 1088 | goto out; |
365 | } | 1089 | } |
366 | error = wait_event_interruptible(log_wait, | 1090 | error = wait_event_interruptible(log_wait, |
367 | (log_start - log_end)); | 1091 | syslog_seq != log_next_seq); |
368 | if (error) | 1092 | if (error) |
369 | goto out; | 1093 | goto out; |
370 | i = 0; | 1094 | error = syslog_print(buf, len); |
371 | raw_spin_lock_irq(&logbuf_lock); | ||
372 | while (!error && (log_start != log_end) && i < len) { | ||
373 | c = LOG_BUF(log_start); | ||
374 | log_start++; | ||
375 | raw_spin_unlock_irq(&logbuf_lock); | ||
376 | error = __put_user(c,buf); | ||
377 | buf++; | ||
378 | i++; | ||
379 | cond_resched(); | ||
380 | raw_spin_lock_irq(&logbuf_lock); | ||
381 | } | ||
382 | raw_spin_unlock_irq(&logbuf_lock); | ||
383 | if (!error) | ||
384 | error = i; | ||
385 | break; | 1095 | break; |
386 | /* Read/clear last kernel messages */ | 1096 | /* Read/clear last kernel messages */ |
387 | case SYSLOG_ACTION_READ_CLEAR: | 1097 | case SYSLOG_ACTION_READ_CLEAR: |
388 | do_clear = 1; | 1098 | clear = true; |
389 | /* FALL THRU */ | 1099 | /* FALL THRU */ |
390 | /* Read last kernel messages */ | 1100 | /* Read last kernel messages */ |
391 | case SYSLOG_ACTION_READ_ALL: | 1101 | case SYSLOG_ACTION_READ_ALL: |
@@ -399,51 +1109,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
399 | error = -EFAULT; | 1109 | error = -EFAULT; |
400 | goto out; | 1110 | goto out; |
401 | } | 1111 | } |
402 | count = len; | 1112 | error = syslog_print_all(buf, len, clear); |
403 | if (count > log_buf_len) | ||
404 | count = log_buf_len; | ||
405 | raw_spin_lock_irq(&logbuf_lock); | ||
406 | if (count > logged_chars) | ||
407 | count = logged_chars; | ||
408 | if (do_clear) | ||
409 | logged_chars = 0; | ||
410 | limit = log_end; | ||
411 | /* | ||
412 | * __put_user() could sleep, and while we sleep | ||
413 | * printk() could overwrite the messages | ||
414 | * we try to copy to user space. Therefore | ||
415 | * the messages are copied in reverse. <manfreds> | ||
416 | */ | ||
417 | for (i = 0; i < count && !error; i++) { | ||
418 | j = limit-1-i; | ||
419 | if (j + log_buf_len < log_end) | ||
420 | break; | ||
421 | c = LOG_BUF(j); | ||
422 | raw_spin_unlock_irq(&logbuf_lock); | ||
423 | error = __put_user(c,&buf[count-1-i]); | ||
424 | cond_resched(); | ||
425 | raw_spin_lock_irq(&logbuf_lock); | ||
426 | } | ||
427 | raw_spin_unlock_irq(&logbuf_lock); | ||
428 | if (error) | ||
429 | break; | ||
430 | error = i; | ||
431 | if (i != count) { | ||
432 | int offset = count-error; | ||
433 | /* buffer overflow during copy, correct user buffer. */ | ||
434 | for (i = 0; i < error; i++) { | ||
435 | if (__get_user(c,&buf[i+offset]) || | ||
436 | __put_user(c,&buf[i])) { | ||
437 | error = -EFAULT; | ||
438 | break; | ||
439 | } | ||
440 | cond_resched(); | ||
441 | } | ||
442 | } | ||
443 | break; | 1113 | break; |
444 | /* Clear ring buffer */ | 1114 | /* Clear ring buffer */ |
445 | case SYSLOG_ACTION_CLEAR: | 1115 | case SYSLOG_ACTION_CLEAR: |
446 | logged_chars = 0; | 1116 | syslog_print_all(NULL, 0, true); |
447 | break; | 1117 | break; |
448 | /* Disable logging to console */ | 1118 | /* Disable logging to console */ |
449 | case SYSLOG_ACTION_CONSOLE_OFF: | 1119 | case SYSLOG_ACTION_CONSOLE_OFF: |
@@ -472,7 +1142,38 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
472 | break; | 1142 | break; |
473 | /* Number of chars in the log buffer */ | 1143 | /* Number of chars in the log buffer */ |
474 | case SYSLOG_ACTION_SIZE_UNREAD: | 1144 | case SYSLOG_ACTION_SIZE_UNREAD: |
475 | error = log_end - log_start; | 1145 | raw_spin_lock_irq(&logbuf_lock); |
1146 | if (syslog_seq < log_first_seq) { | ||
1147 | /* messages are gone, move to first one */ | ||
1148 | syslog_seq = log_first_seq; | ||
1149 | syslog_idx = log_first_idx; | ||
1150 | syslog_prev = 0; | ||
1151 | syslog_partial = 0; | ||
1152 | } | ||
1153 | if (from_file) { | ||
1154 | /* | ||
1155 | * Short-cut for poll(/proc/kmsg) which simply checks | ||
1156 | * for pending data, not the size; return the count of | ||
1157 | * records, not the length. | ||
1158 | */ | ||
1159 | error = log_next_idx - syslog_idx; | ||
1160 | } else { | ||
1161 | u64 seq = syslog_seq; | ||
1162 | u32 idx = syslog_idx; | ||
1163 | enum log_flags prev = syslog_prev; | ||
1164 | |||
1165 | error = 0; | ||
1166 | while (seq < log_next_seq) { | ||
1167 | struct log *msg = log_from_idx(idx); | ||
1168 | |||
1169 | error += msg_print_text(msg, prev, true, NULL, 0); | ||
1170 | idx = log_next(idx); | ||
1171 | seq++; | ||
1172 | prev = msg->flags; | ||
1173 | } | ||
1174 | error -= syslog_partial; | ||
1175 | } | ||
1176 | raw_spin_unlock_irq(&logbuf_lock); | ||
476 | break; | 1177 | break; |
477 | /* Size of the log buffer */ | 1178 | /* Size of the log buffer */ |
478 | case SYSLOG_ACTION_SIZE_BUFFER: | 1179 | case SYSLOG_ACTION_SIZE_BUFFER: |
@@ -491,39 +1192,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) | |||
491 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); | 1192 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); |
492 | } | 1193 | } |
493 | 1194 | ||
494 | #ifdef CONFIG_KGDB_KDB | ||
495 | /* kdb dmesg command needs access to the syslog buffer. do_syslog() | ||
496 | * uses locks so it cannot be used during debugging. Just tell kdb | ||
497 | * where the start and end of the physical and logical logs are. This | ||
498 | * is equivalent to do_syslog(3). | ||
499 | */ | ||
500 | void kdb_syslog_data(char *syslog_data[4]) | ||
501 | { | ||
502 | syslog_data[0] = log_buf; | ||
503 | syslog_data[1] = log_buf + log_buf_len; | ||
504 | syslog_data[2] = log_buf + log_end - | ||
505 | (logged_chars < log_buf_len ? logged_chars : log_buf_len); | ||
506 | syslog_data[3] = log_buf + log_end; | ||
507 | } | ||
508 | #endif /* CONFIG_KGDB_KDB */ | ||
509 | |||
510 | /* | ||
511 | * Call the console drivers on a range of log_buf | ||
512 | */ | ||
513 | static void __call_console_drivers(unsigned start, unsigned end) | ||
514 | { | ||
515 | struct console *con; | ||
516 | |||
517 | for_each_console(con) { | ||
518 | if (exclusive_console && con != exclusive_console) | ||
519 | continue; | ||
520 | if ((con->flags & CON_ENABLED) && con->write && | ||
521 | (cpu_online(smp_processor_id()) || | ||
522 | (con->flags & CON_ANYTIME))) | ||
523 | con->write(con, &LOG_BUF(start), end - start); | ||
524 | } | ||
525 | } | ||
526 | |||
527 | static bool __read_mostly ignore_loglevel; | 1195 | static bool __read_mostly ignore_loglevel; |
528 | 1196 | ||
529 | static int __init ignore_loglevel_setup(char *str) | 1197 | static int __init ignore_loglevel_setup(char *str) |
@@ -540,142 +1208,33 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" | |||
540 | "print all kernel messages to the console."); | 1208 | "print all kernel messages to the console."); |
541 | 1209 | ||
542 | /* | 1210 | /* |
543 | * Write out chars from start to end - 1 inclusive | ||
544 | */ | ||
545 | static void _call_console_drivers(unsigned start, | ||
546 | unsigned end, int msg_log_level) | ||
547 | { | ||
548 | trace_console(&LOG_BUF(0), start, end, log_buf_len); | ||
549 | |||
550 | if ((msg_log_level < console_loglevel || ignore_loglevel) && | ||
551 | console_drivers && start != end) { | ||
552 | if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { | ||
553 | /* wrapped write */ | ||
554 | __call_console_drivers(start & LOG_BUF_MASK, | ||
555 | log_buf_len); | ||
556 | __call_console_drivers(0, end & LOG_BUF_MASK); | ||
557 | } else { | ||
558 | __call_console_drivers(start, end); | ||
559 | } | ||
560 | } | ||
561 | } | ||
562 | |||
563 | /* | ||
564 | * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the | ||
565 | * lower 3 bit are the log level, the rest are the log facility. In case | ||
566 | * userspace passes usual userspace syslog messages to /dev/kmsg or | ||
567 | * /dev/ttyprintk, the log prefix might contain the facility. Printk needs | ||
568 | * to extract the correct log level for in-kernel processing, and not mangle | ||
569 | * the original value. | ||
570 | * | ||
571 | * If a prefix is found, the length of the prefix is returned. If 'level' is | ||
572 | * passed, it will be filled in with the log level without a possible facility | ||
573 | * value. If 'special' is passed, the special printk prefix chars are accepted | ||
574 | * and returned. If no valid header is found, 0 is returned and the passed | ||
575 | * variables are not touched. | ||
576 | */ | ||
577 | static size_t log_prefix(const char *p, unsigned int *level, char *special) | ||
578 | { | ||
579 | unsigned int lev = 0; | ||
580 | char sp = '\0'; | ||
581 | size_t len; | ||
582 | |||
583 | if (p[0] != '<' || !p[1]) | ||
584 | return 0; | ||
585 | if (p[2] == '>') { | ||
586 | /* usual single digit level number or special char */ | ||
587 | switch (p[1]) { | ||
588 | case '0' ... '7': | ||
589 | lev = p[1] - '0'; | ||
590 | break; | ||
591 | case 'c': /* KERN_CONT */ | ||
592 | case 'd': /* KERN_DEFAULT */ | ||
593 | sp = p[1]; | ||
594 | break; | ||
595 | default: | ||
596 | return 0; | ||
597 | } | ||
598 | len = 3; | ||
599 | } else { | ||
600 | /* multi digit including the level and facility number */ | ||
601 | char *endp = NULL; | ||
602 | |||
603 | lev = (simple_strtoul(&p[1], &endp, 10) & 7); | ||
604 | if (endp == NULL || endp[0] != '>') | ||
605 | return 0; | ||
606 | len = (endp + 1) - p; | ||
607 | } | ||
608 | |||
609 | /* do not accept special char if not asked for */ | ||
610 | if (sp && !special) | ||
611 | return 0; | ||
612 | |||
613 | if (special) { | ||
614 | *special = sp; | ||
615 | /* return special char, do not touch level */ | ||
616 | if (sp) | ||
617 | return len; | ||
618 | } | ||
619 | |||
620 | if (level) | ||
621 | *level = lev; | ||
622 | return len; | ||
623 | } | ||
624 | |||
625 | /* | ||
626 | * Call the console drivers, asking them to write out | 1211 | * Call the console drivers, asking them to write out |
627 | * log_buf[start] to log_buf[end - 1]. | 1212 | * log_buf[start] to log_buf[end - 1]. |
628 | * The console_lock must be held. | 1213 | * The console_lock must be held. |
629 | */ | 1214 | */ |
630 | static void call_console_drivers(unsigned start, unsigned end) | 1215 | static void call_console_drivers(int level, const char *text, size_t len) |
631 | { | 1216 | { |
632 | unsigned cur_index, start_print; | 1217 | struct console *con; |
633 | static int msg_level = -1; | ||
634 | 1218 | ||
635 | BUG_ON(((int)(start - end)) > 0); | 1219 | trace_console(text, 0, len, len); |
636 | 1220 | ||
637 | cur_index = start; | 1221 | if (level >= console_loglevel && !ignore_loglevel) |
638 | start_print = start; | 1222 | return; |
639 | while (cur_index != end) { | 1223 | if (!console_drivers) |
640 | if (msg_level < 0 && ((end - cur_index) > 2)) { | 1224 | return; |
641 | /* strip log prefix */ | ||
642 | cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL); | ||
643 | start_print = cur_index; | ||
644 | } | ||
645 | while (cur_index != end) { | ||
646 | char c = LOG_BUF(cur_index); | ||
647 | |||
648 | cur_index++; | ||
649 | if (c == '\n') { | ||
650 | if (msg_level < 0) { | ||
651 | /* | ||
652 | * printk() has already given us loglevel tags in | ||
653 | * the buffer. This code is here in case the | ||
654 | * log buffer has wrapped right round and scribbled | ||
655 | * on those tags | ||
656 | */ | ||
657 | msg_level = default_message_loglevel; | ||
658 | } | ||
659 | _call_console_drivers(start_print, cur_index, msg_level); | ||
660 | msg_level = -1; | ||
661 | start_print = cur_index; | ||
662 | break; | ||
663 | } | ||
664 | } | ||
665 | } | ||
666 | _call_console_drivers(start_print, end, msg_level); | ||
667 | } | ||
668 | 1225 | ||
669 | static void emit_log_char(char c) | 1226 | for_each_console(con) { |
670 | { | 1227 | if (exclusive_console && con != exclusive_console) |
671 | LOG_BUF(log_end) = c; | 1228 | continue; |
672 | log_end++; | 1229 | if (!(con->flags & CON_ENABLED)) |
673 | if (log_end - log_start > log_buf_len) | 1230 | continue; |
674 | log_start = log_end - log_buf_len; | 1231 | if (!con->write) |
675 | if (log_end - con_start > log_buf_len) | 1232 | continue; |
676 | con_start = log_end - log_buf_len; | 1233 | if (!cpu_online(smp_processor_id()) && |
677 | if (logged_chars < log_buf_len) | 1234 | !(con->flags & CON_ANYTIME)) |
678 | logged_chars++; | 1235 | continue; |
1236 | con->write(con, text, len); | ||
1237 | } | ||
679 | } | 1238 | } |
680 | 1239 | ||
681 | /* | 1240 | /* |
@@ -700,16 +1259,6 @@ static void zap_locks(void) | |||
700 | sema_init(&console_sem, 1); | 1259 | sema_init(&console_sem, 1); |
701 | } | 1260 | } |
702 | 1261 | ||
703 | #if defined(CONFIG_PRINTK_TIME) | ||
704 | static bool printk_time = 1; | ||
705 | #else | ||
706 | static bool printk_time = 0; | ||
707 | #endif | ||
708 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); | ||
709 | |||
710 | static bool always_kmsg_dump; | ||
711 | module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); | ||
712 | |||
713 | /* Check if we have any console registered that can be called early in boot. */ | 1262 | /* Check if we have any console registered that can be called early in boot. */ |
714 | static int have_callable_console(void) | 1263 | static int have_callable_console(void) |
715 | { | 1264 | { |
@@ -722,51 +1271,6 @@ static int have_callable_console(void) | |||
722 | return 0; | 1271 | return 0; |
723 | } | 1272 | } |
724 | 1273 | ||
725 | /** | ||
726 | * printk - print a kernel message | ||
727 | * @fmt: format string | ||
728 | * | ||
729 | * This is printk(). It can be called from any context. We want it to work. | ||
730 | * | ||
731 | * We try to grab the console_lock. If we succeed, it's easy - we log the output and | ||
732 | * call the console drivers. If we fail to get the semaphore we place the output | ||
733 | * into the log buffer and return. The current holder of the console_sem will | ||
734 | * notice the new output in console_unlock(); and will send it to the | ||
735 | * consoles before releasing the lock. | ||
736 | * | ||
737 | * One effect of this deferred printing is that code which calls printk() and | ||
738 | * then changes console_loglevel may break. This is because console_loglevel | ||
739 | * is inspected when the actual printing occurs. | ||
740 | * | ||
741 | * See also: | ||
742 | * printf(3) | ||
743 | * | ||
744 | * See the vsnprintf() documentation for format string extensions over C99. | ||
745 | */ | ||
746 | |||
747 | asmlinkage int printk(const char *fmt, ...) | ||
748 | { | ||
749 | va_list args; | ||
750 | int r; | ||
751 | |||
752 | #ifdef CONFIG_KGDB_KDB | ||
753 | if (unlikely(kdb_trap_printk)) { | ||
754 | va_start(args, fmt); | ||
755 | r = vkdb_printf(fmt, args); | ||
756 | va_end(args); | ||
757 | return r; | ||
758 | } | ||
759 | #endif | ||
760 | va_start(args, fmt); | ||
761 | r = vprintk(fmt, args); | ||
762 | va_end(args); | ||
763 | |||
764 | return r; | ||
765 | } | ||
766 | |||
767 | /* cpu currently holding logbuf_lock */ | ||
768 | static volatile unsigned int printk_cpu = UINT_MAX; | ||
769 | |||
770 | /* | 1274 | /* |
771 | * Can we actually use the console at this time on this cpu? | 1275 | * Can we actually use the console at this time on this cpu? |
772 | * | 1276 | * |
@@ -810,17 +1314,12 @@ static int console_trylock_for_printk(unsigned int cpu) | |||
810 | retval = 0; | 1314 | retval = 0; |
811 | } | 1315 | } |
812 | } | 1316 | } |
813 | printk_cpu = UINT_MAX; | 1317 | logbuf_cpu = UINT_MAX; |
814 | if (wake) | 1318 | if (wake) |
815 | up(&console_sem); | 1319 | up(&console_sem); |
816 | raw_spin_unlock(&logbuf_lock); | 1320 | raw_spin_unlock(&logbuf_lock); |
817 | return retval; | 1321 | return retval; |
818 | } | 1322 | } |
819 | static const char recursion_bug_msg [] = | ||
820 | KERN_CRIT "BUG: recent printk recursion!\n"; | ||
821 | static int recursion_bug; | ||
822 | static int new_text_line = 1; | ||
823 | static char printk_buf[1024]; | ||
824 | 1323 | ||
825 | int printk_delay_msec __read_mostly; | 1324 | int printk_delay_msec __read_mostly; |
826 | 1325 | ||
@@ -836,15 +1335,99 @@ static inline void printk_delay(void) | |||
836 | } | 1335 | } |
837 | } | 1336 | } |
838 | 1337 | ||
839 | asmlinkage int vprintk(const char *fmt, va_list args) | 1338 | /* |
1339 | * Continuation lines are buffered, and not committed to the record buffer | ||
1340 | * until the line is complete, or a race forces it. The line fragments | ||
1341 | * though, are printed immediately to the consoles to ensure everything has | ||
1342 | * reached the console in case of a kernel crash. | ||
1343 | */ | ||
1344 | static struct cont { | ||
1345 | char buf[LOG_LINE_MAX]; | ||
1346 | size_t len; /* length == 0 means unused buffer */ | ||
1347 | size_t cons; /* bytes written to console */ | ||
1348 | struct task_struct *owner; /* task of first print */ | ||
1349 | u64 ts_nsec; /* time of first print */ | ||
1350 | u8 level; /* log level of first message */ | ||
1351 | u8 facility; /* log facility of first message */ | ||
1352 | bool flushed:1; /* buffer sealed and committed */ | ||
1353 | } cont; | ||
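
The effect of the cont buffer above, seen from the caller's side, is that line fragments printed by the same task without a trailing newline are merged into a single record and committed when the newline arrives (or when a prefix or another task forces a flush). A hedged illustration follows; example_progress() is a hypothetical caller, not part of this patch.

/*
 * Hedged illustration of how the cont buffer merges fragments: three
 * printk() calls from the same task become one record once the
 * trailing newline arrives.
 */
static void example_progress(void)
{
	printk(KERN_INFO "stage one ..");	/* starts the cont buffer */
	printk(KERN_CONT " stage two ..");	/* appended, still buffered */
	printk(KERN_CONT " done\n");		/* newline: flushed as one record */
}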
1354 | |||
1355 | static void cont_flush(void) | ||
840 | { | 1356 | { |
841 | int printed_len = 0; | 1357 | if (cont.flushed) |
842 | int current_log_level = default_message_loglevel; | 1358 | return; |
1359 | if (cont.len == 0) | ||
1360 | return; | ||
1361 | |||
1362 | log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec, | ||
1363 | NULL, 0, cont.buf, cont.len); | ||
1364 | |||
1365 | cont.flushed = true; | ||
1366 | } | ||
1367 | |||
1368 | static bool cont_add(int facility, int level, const char *text, size_t len) | ||
1369 | { | ||
1370 | if (cont.len && cont.flushed) | ||
1371 | return false; | ||
1372 | |||
1373 | if (cont.len + len > sizeof(cont.buf)) { | ||
1374 | cont_flush(); | ||
1375 | return false; | ||
1376 | } | ||
1377 | |||
1378 | if (!cont.len) { | ||
1379 | cont.facility = facility; | ||
1380 | cont.level = level; | ||
1381 | cont.owner = current; | ||
1382 | cont.ts_nsec = local_clock(); | ||
1383 | cont.cons = 0; | ||
1384 | cont.flushed = false; | ||
1385 | } | ||
1386 | |||
1387 | memcpy(cont.buf + cont.len, text, len); | ||
1388 | cont.len += len; | ||
1389 | return true; | ||
1390 | } | ||
1391 | |||
1392 | static size_t cont_print_text(char *text, size_t size) | ||
1393 | { | ||
1394 | size_t textlen = 0; | ||
1395 | size_t len; | ||
1396 | |||
1397 | if (cont.cons == 0) { | ||
1398 | textlen += print_time(cont.ts_nsec, text); | ||
1399 | size -= textlen; | ||
1400 | } | ||
1401 | |||
1402 | len = cont.len - cont.cons; | ||
1403 | if (len > 0) { | ||
1404 | if (len+1 > size) | ||
1405 | len = size-1; | ||
1406 | memcpy(text + textlen, cont.buf + cont.cons, len); | ||
1407 | textlen += len; | ||
1408 | cont.cons = cont.len; | ||
1409 | } | ||
1410 | |||
1411 | if (cont.flushed) { | ||
1412 | text[textlen++] = '\n'; | ||
1413 | /* got everything, release buffer */ | ||
1414 | cont.len = 0; | ||
1415 | } | ||
1416 | return textlen; | ||
1417 | } | ||
1418 | |||
1419 | asmlinkage int vprintk_emit(int facility, int level, | ||
1420 | const char *dict, size_t dictlen, | ||
1421 | const char *fmt, va_list args) | ||
1422 | { | ||
1423 | static int recursion_bug; | ||
1424 | static char textbuf[LOG_LINE_MAX]; | ||
1425 | char *text = textbuf; | ||
1426 | size_t text_len; | ||
1427 | enum log_flags lflags = 0; | ||
843 | unsigned long flags; | 1428 | unsigned long flags; |
844 | int this_cpu; | 1429 | int this_cpu; |
845 | char *p; | 1430 | int printed_len = 0; |
846 | size_t plen; | ||
847 | char special; | ||
848 | 1431 | ||
849 | boot_delay_msec(); | 1432 | boot_delay_msec(); |
850 | printk_delay(); | 1433 | printk_delay(); |
@@ -856,7 +1439,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
856 | /* | 1439 | /* |
857 | * Ouch, printk recursed into itself! | 1440 | * Ouch, printk recursed into itself! |
858 | */ | 1441 | */ |
859 | if (unlikely(printk_cpu == this_cpu)) { | 1442 | if (unlikely(logbuf_cpu == this_cpu)) { |
860 | /* | 1443 | /* |
861 | * If a crash is occurring during printk() on this CPU, | 1444 | * If a crash is occurring during printk() on this CPU, |
862 | * then try to get the crash message out but make sure | 1445 | * then try to get the crash message out but make sure |
@@ -873,97 +1456,91 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
873 | 1456 | ||
874 | lockdep_off(); | 1457 | lockdep_off(); |
875 | raw_spin_lock(&logbuf_lock); | 1458 | raw_spin_lock(&logbuf_lock); |
876 | printk_cpu = this_cpu; | 1459 | logbuf_cpu = this_cpu; |
877 | 1460 | ||
878 | if (recursion_bug) { | 1461 | if (recursion_bug) { |
1462 | static const char recursion_msg[] = | ||
1463 | "BUG: recent printk recursion!"; | ||
1464 | |||
879 | recursion_bug = 0; | 1465 | recursion_bug = 0; |
880 | strcpy(printk_buf, recursion_bug_msg); | 1466 | printed_len += strlen(recursion_msg); |
881 | printed_len = strlen(recursion_bug_msg); | 1467 | /* emit KERN_CRIT message */ |
1468 | log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, | ||
1469 | NULL, 0, recursion_msg, printed_len); | ||
882 | } | 1470 | } |
883 | /* Emit the output into the temporary buffer */ | ||
884 | printed_len += vscnprintf(printk_buf + printed_len, | ||
885 | sizeof(printk_buf) - printed_len, fmt, args); | ||
886 | 1471 | ||
887 | p = printk_buf; | 1472 | /* |
1473 | * The printf needs to come first; we need the syslog | ||
1474 | * prefix which might be passed-in as a parameter. | ||
1475 | */ | ||
1476 | text_len = vscnprintf(text, sizeof(textbuf), fmt, args); | ||
888 | 1477 | ||
889 | /* Read log level and handle special printk prefix */ | 1478 | /* mark and strip a trailing newline */ |
890 | plen = log_prefix(p, ¤t_log_level, &special); | 1479 | if (text_len && text[text_len-1] == '\n') { |
891 | if (plen) { | 1480 | text_len--; |
892 | p += plen; | 1481 | lflags |= LOG_NEWLINE; |
1482 | } | ||
893 | 1483 | ||
894 | switch (special) { | 1484 | /* strip syslog prefix and extract log level or control flags */ |
895 | case 'c': /* Strip <c> KERN_CONT, continue line */ | 1485 | if (text[0] == '<' && text[1] && text[2] == '>') { |
896 | plen = 0; | 1486 | switch (text[1]) { |
897 | break; | 1487 | case '0' ... '7': |
898 | case 'd': /* Strip <d> KERN_DEFAULT, start new line */ | 1488 | if (level == -1) |
899 | plen = 0; | 1489 | level = text[1] - '0'; |
900 | default: | 1490 | case 'd': /* KERN_DEFAULT */ |
901 | if (!new_text_line) { | 1491 | lflags |= LOG_PREFIX; |
902 | emit_log_char('\n'); | 1492 | case 'c': /* KERN_CONT */ |
903 | new_text_line = 1; | 1493 | text += 3; |
904 | } | 1494 | text_len -= 3; |
905 | } | 1495 | } |
906 | } | 1496 | } |
907 | 1497 | ||
908 | /* | 1498 | if (level == -1) |
909 | * Copy the output into log_buf. If the caller didn't provide | 1499 | level = default_message_loglevel; |
910 | * the appropriate log prefix, we insert them here | ||
911 | */ | ||
912 | for (; *p; p++) { | ||
913 | if (new_text_line) { | ||
914 | new_text_line = 0; | ||
915 | |||
916 | if (plen) { | ||
917 | /* Copy original log prefix */ | ||
918 | int i; | ||
919 | |||
920 | for (i = 0; i < plen; i++) | ||
921 | emit_log_char(printk_buf[i]); | ||
922 | printed_len += plen; | ||
923 | } else { | ||
924 | /* Add log prefix */ | ||
925 | emit_log_char('<'); | ||
926 | emit_log_char(current_log_level + '0'); | ||
927 | emit_log_char('>'); | ||
928 | printed_len += 3; | ||
929 | } | ||
930 | 1500 | ||
931 | if (printk_time) { | 1501 | if (dict) |
932 | /* Add the current time stamp */ | 1502 | lflags |= LOG_PREFIX|LOG_NEWLINE; |
933 | char tbuf[50], *tp; | ||
934 | unsigned tlen; | ||
935 | unsigned long long t; | ||
936 | unsigned long nanosec_rem; | ||
937 | |||
938 | t = cpu_clock(printk_cpu); | ||
939 | nanosec_rem = do_div(t, 1000000000); | ||
940 | tlen = sprintf(tbuf, "[%5lu.%06lu] ", | ||
941 | (unsigned long) t, | ||
942 | nanosec_rem / 1000); | ||
943 | |||
944 | for (tp = tbuf; tp < tbuf + tlen; tp++) | ||
945 | emit_log_char(*tp); | ||
946 | printed_len += tlen; | ||
947 | } | ||
948 | 1503 | ||
949 | if (!*p) | 1504 | if (!(lflags & LOG_NEWLINE)) { |
950 | break; | 1505 | /* |
1506 | * Flush the conflicting buffer. An earlier newline was missing, | ||
1507 | * or another task also prints continuation lines. | ||
1508 | */ | ||
1509 | if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) | ||
1510 | cont_flush(); | ||
1511 | |||
1512 | /* buffer line if possible, otherwise store it right away */ | ||
1513 | if (!cont_add(facility, level, text, text_len)) | ||
1514 | log_store(facility, level, lflags | LOG_CONT, 0, | ||
1515 | dict, dictlen, text, text_len); | ||
1516 | } else { | ||
1517 | bool stored = false; | ||
1518 | |||
1519 | /* | ||
1520 | * If an earlier newline was missing and it was the same task, | ||
1521 | * either merge it with the current buffer and flush, or if | ||
1522 | * there was a race with interrupts (prefix == true) then just | ||
1523 | * flush it out and store this line separately. | ||
1524 | */ | ||
1525 | if (cont.len && cont.owner == current) { | ||
1526 | if (!(lflags & LOG_PREFIX)) | ||
1527 | stored = cont_add(facility, level, text, text_len); | ||
1528 | cont_flush(); | ||
951 | } | 1529 | } |
952 | 1530 | ||
953 | emit_log_char(*p); | 1531 | if (!stored) |
954 | if (*p == '\n') | 1532 | log_store(facility, level, lflags, 0, |
955 | new_text_line = 1; | 1533 | dict, dictlen, text, text_len); |
956 | } | 1534 | } |
1535 | printed_len += text_len; | ||
957 | 1536 | ||
958 | /* | 1537 | /* |
959 | * Try to acquire and then immediately release the | 1538 | * Try to acquire and then immediately release the console semaphore. |
960 | * console semaphore. The release will do all the | 1539 | * The release will print out buffers and wake up /dev/kmsg and syslog() |
961 | * actual magic (print out buffers, wake up klogd, | 1540 | * users. |
962 | * etc). | ||
963 | * | 1541 | * |
964 | * The console_trylock_for_printk() function | 1542 | * The console_trylock_for_printk() function will release 'logbuf_lock' |
965 | * will release 'logbuf_lock' regardless of whether it | 1543 | * regardless of whether it actually gets the console semaphore or not. |
966 | * actually gets the semaphore or not. | ||
967 | */ | 1544 | */ |
968 | if (console_trylock_for_printk(this_cpu)) | 1545 | if (console_trylock_for_printk(this_cpu)) |
969 | console_unlock(); | 1546 | console_unlock(); |
@@ -974,16 +1551,88 @@ out_restore_irqs: | |||
974 | 1551 | ||
975 | return printed_len; | 1552 | return printed_len; |
976 | } | 1553 | } |
977 | EXPORT_SYMBOL(printk); | 1554 | EXPORT_SYMBOL(vprintk_emit); |
978 | EXPORT_SYMBOL(vprintk); | ||
979 | 1555 | ||
980 | #else | 1556 | asmlinkage int vprintk(const char *fmt, va_list args) |
1557 | { | ||
1558 | return vprintk_emit(0, -1, NULL, 0, fmt, args); | ||
1559 | } | ||
1560 | EXPORT_SYMBOL(vprintk); | ||
981 | 1561 | ||
982 | static void call_console_drivers(unsigned start, unsigned end) | 1562 | asmlinkage int printk_emit(int facility, int level, |
1563 | const char *dict, size_t dictlen, | ||
1564 | const char *fmt, ...) | ||
983 | { | 1565 | { |
1566 | va_list args; | ||
1567 | int r; | ||
1568 | |||
1569 | va_start(args, fmt); | ||
1570 | r = vprintk_emit(facility, level, dict, dictlen, fmt, args); | ||
1571 | va_end(args); | ||
1572 | |||
1573 | return r; | ||
984 | } | 1574 | } |
1575 | EXPORT_SYMBOL(printk_emit); | ||
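
A hedged sketch of a printk_emit() caller attaching a dictionary follows. The helper and its key names are hypothetical; the dict layout (properties separated by '\0', the last one not terminated) follows the record format comment earlier in this file, and facility 0 keeps the message classified as kernel-generated.

/* Hedged sketch; example_report_device() and its keys are hypothetical. */
static void example_report_device(void)
{
	static const char dict[] = "SUBSYSTEM=block\0DEVICE=b8:2";

	/* level 3 == KERN_ERR; sizeof() - 1 leaves the last property unterminated */
	printk_emit(0, 3, dict, sizeof(dict) - 1,
		    "I/O error on device\n");
}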
1576 | |||
1577 | /** | ||
1578 | * printk - print a kernel message | ||
1579 | * @fmt: format string | ||
1580 | * | ||
1581 | * This is printk(). It can be called from any context. We want it to work. | ||
1582 | * | ||
1583 | * We try to grab the console_lock. If we succeed, it's easy - we log the | ||
1584 | * output and call the console drivers. If we fail to get the semaphore, we | ||
1585 | * place the output into the log buffer and return. The current holder of | ||
1586 | * the console_sem will notice the new output in console_unlock(); and will | ||
1587 | * send it to the consoles before releasing the lock. | ||
1588 | * | ||
1589 | * One effect of this deferred printing is that code which calls printk() and | ||
1590 | * then changes console_loglevel may break. This is because console_loglevel | ||
1591 | * is inspected when the actual printing occurs. | ||
1592 | * | ||
1593 | * See also: | ||
1594 | * printf(3) | ||
1595 | * | ||
1596 | * See the vsnprintf() documentation for format string extensions over C99. | ||
1597 | */ | ||
1598 | asmlinkage int printk(const char *fmt, ...) | ||
1599 | { | ||
1600 | va_list args; | ||
1601 | int r; | ||
985 | 1602 | ||
1603 | #ifdef CONFIG_KGDB_KDB | ||
1604 | if (unlikely(kdb_trap_printk)) { | ||
1605 | va_start(args, fmt); | ||
1606 | r = vkdb_printf(fmt, args); | ||
1607 | va_end(args); | ||
1608 | return r; | ||
1609 | } | ||
986 | #endif | 1610 | #endif |
1611 | va_start(args, fmt); | ||
1612 | r = vprintk_emit(0, -1, NULL, 0, fmt, args); | ||
1613 | va_end(args); | ||
1614 | |||
1615 | return r; | ||
1616 | } | ||
1617 | EXPORT_SYMBOL(printk); | ||
1618 | |||
1619 | #else | ||
1620 | |||
1621 | #define LOG_LINE_MAX 0 | ||
1622 | static struct cont { | ||
1623 | size_t len; | ||
1624 | size_t cons; | ||
1625 | u8 level; | ||
1626 | bool flushed:1; | ||
1627 | } cont; | ||
1628 | static struct log *log_from_idx(u32 idx) { return NULL; } | ||
1629 | static u32 log_next(u32 idx) { return 0; } | ||
1630 | static void call_console_drivers(int level, const char *text, size_t len) {} | ||
1631 | static size_t msg_print_text(const struct log *msg, enum log_flags prev, | ||
1632 | bool syslog, char *buf, size_t size) { return 0; } | ||
1633 | static size_t cont_print_text(char *text, size_t size) { return 0; } | ||
1634 | |||
1635 | #endif /* CONFIG_PRINTK */ | ||
987 | 1636 | ||
988 | static int __add_preferred_console(char *name, int idx, char *options, | 1637 | static int __add_preferred_console(char *name, int idx, char *options, |
989 | char *brl_options) | 1638 | char *brl_options) |
@@ -1217,7 +1866,7 @@ int is_console_locked(void) | |||
1217 | } | 1866 | } |
1218 | 1867 | ||
1219 | /* | 1868 | /* |
1220 | * Delayed printk facility, for scheduler-internal messages: | 1869 | * Delayed printk version, for scheduler-internal messages: |
1221 | */ | 1870 | */ |
1222 | #define PRINTK_BUF_SIZE 512 | 1871 | #define PRINTK_BUF_SIZE 512 |
1223 | 1872 | ||
@@ -1253,6 +1902,11 @@ void wake_up_klogd(void) | |||
1253 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); | 1902 | this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); |
1254 | } | 1903 | } |
1255 | 1904 | ||
1905 | /* the next printk record to write to the console */ | ||
1906 | static u64 console_seq; | ||
1907 | static u32 console_idx; | ||
1908 | static enum log_flags console_prev; | ||
1909 | |||
1256 | /** | 1910 | /** |
1257 | * console_unlock - unlock the console system | 1911 | * console_unlock - unlock the console system |
1258 | * | 1912 | * |
@@ -1263,15 +1917,17 @@ void wake_up_klogd(void) | |||
1263 | * by printk(). If this is the case, console_unlock(); emits | 1917 | * by printk(). If this is the case, console_unlock(); emits |
1264 | * the output prior to releasing the lock. | 1918 | * the output prior to releasing the lock. |
1265 | * | 1919 | * |
1266 | * If there is output waiting for klogd, we wake it up. | 1920 | * If there is output waiting, we wake /dev/kmsg and syslog() users. |
1267 | * | 1921 | * |
1268 | * console_unlock(); may be called from any context. | 1922 | * console_unlock(); may be called from any context. |
1269 | */ | 1923 | */ |
1270 | void console_unlock(void) | 1924 | void console_unlock(void) |
1271 | { | 1925 | { |
1926 | static char text[LOG_LINE_MAX]; | ||
1927 | static u64 seen_seq; | ||
1272 | unsigned long flags; | 1928 | unsigned long flags; |
1273 | unsigned _con_start, _log_end; | 1929 | bool wake_klogd = false; |
1274 | unsigned wake_klogd = 0, retry = 0; | 1930 | bool retry; |
1275 | 1931 | ||
1276 | if (console_suspended) { | 1932 | if (console_suspended) { |
1277 | up(&console_sem); | 1933 | up(&console_sem); |
@@ -1280,18 +1936,69 @@ void console_unlock(void) | |||
1280 | 1936 | ||
1281 | console_may_schedule = 0; | 1937 | console_may_schedule = 0; |
1282 | 1938 | ||
1939 | /* flush buffered message fragment immediately to console */ | ||
1940 | raw_spin_lock_irqsave(&logbuf_lock, flags); | ||
1941 | if (cont.len && (cont.cons < cont.len || cont.flushed)) { | ||
1942 | size_t len; | ||
1943 | |||
1944 | len = cont_print_text(text, sizeof(text)); | ||
1945 | raw_spin_unlock(&logbuf_lock); | ||
1946 | stop_critical_timings(); | ||
1947 | call_console_drivers(cont.level, text, len); | ||
1948 | start_critical_timings(); | ||
1949 | local_irq_restore(flags); | ||
1950 | } else | ||
1951 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
1952 | |||
1283 | again: | 1953 | again: |
1284 | for ( ; ; ) { | 1954 | for (;;) { |
1955 | struct log *msg; | ||
1956 | size_t len; | ||
1957 | int level; | ||
1958 | |||
1285 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 1959 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1286 | wake_klogd |= log_start - log_end; | 1960 | if (seen_seq != log_next_seq) { |
1287 | if (con_start == log_end) | 1961 | wake_klogd = true; |
1288 | break; /* Nothing to print */ | 1962 | seen_seq = log_next_seq; |
1289 | _con_start = con_start; | 1963 | } |
1290 | _log_end = log_end; | 1964 | |
1291 | con_start = log_end; /* Flush */ | 1965 | if (console_seq < log_first_seq) { |
1966 | /* messages are gone, move to first one */ | ||
1967 | console_seq = log_first_seq; | ||
1968 | console_idx = log_first_idx; | ||
1969 | console_prev = 0; | ||
1970 | } | ||
1971 | skip: | ||
1972 | if (console_seq == log_next_seq) | ||
1973 | break; | ||
1974 | |||
1975 | msg = log_from_idx(console_idx); | ||
1976 | if (msg->flags & LOG_NOCONS) { | ||
1977 | /* | ||
1978 | * Skip record we have buffered and already printed | ||
1979 | * directly to the console when we received it. | ||
1980 | */ | ||
1981 | console_idx = log_next(console_idx); | ||
1982 | console_seq++; | ||
1983 | /* | ||
1984 | * We will get here again when we register a new | ||
1985 | * CON_PRINTBUFFER console. Clear the flag so we | ||
1986 | * will properly dump everything later. | ||
1987 | */ | ||
1988 | msg->flags &= ~LOG_NOCONS; | ||
1989 | goto skip; | ||
1990 | } | ||
1991 | |||
1992 | level = msg->level; | ||
1993 | len = msg_print_text(msg, console_prev, false, | ||
1994 | text, sizeof(text)); | ||
1995 | console_idx = log_next(console_idx); | ||
1996 | console_seq++; | ||
1997 | console_prev = msg->flags; | ||
1292 | raw_spin_unlock(&logbuf_lock); | 1998 | raw_spin_unlock(&logbuf_lock); |
1999 | |||
1293 | stop_critical_timings(); /* don't trace print latency */ | 2000 | stop_critical_timings(); /* don't trace print latency */ |
1294 | call_console_drivers(_con_start, _log_end); | 2001 | call_console_drivers(level, text, len); |
1295 | start_critical_timings(); | 2002 | start_critical_timings(); |
1296 | local_irq_restore(flags); | 2003 | local_irq_restore(flags); |
1297 | } | 2004 | } |
@@ -1312,8 +2019,7 @@ again: | |||
1312 | * flush, no worries. | 2019 | * flush, no worries. |
1313 | */ | 2020 | */ |
1314 | raw_spin_lock(&logbuf_lock); | 2021 | raw_spin_lock(&logbuf_lock); |
1315 | if (con_start != log_end) | 2022 | retry = console_seq != log_next_seq; |
1316 | retry = 1; | ||
1317 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2023 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1318 | 2024 | ||
1319 | if (retry && console_trylock()) | 2025 | if (retry && console_trylock()) |
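
As a rough illustration of the locking pattern the rewritten console_unlock() serves (again, not part of the patch; the function name is hypothetical): code that needs the console subsystem quiesced brackets its work with the console lock, and the unlock path replays whatever records accumulated in the meantime.

#include <linux/console.h>

/* Illustrative only: hold off console output while reconfiguring. */
static void example_reconfigure_console(void)
{
	console_lock();
	/* ... modify console state while printing is held off ... */
	console_unlock();	/* flushes pending records to the consoles */
}
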
@@ -1549,7 +2255,9 @@ void register_console(struct console *newcon) | |||
1549 | * for us. | 2255 | * for us. |
1550 | */ | 2256 | */ |
1551 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 2257 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1552 | con_start = log_start; | 2258 | console_seq = syslog_seq; |
2259 | console_idx = syslog_idx; | ||
2260 | console_prev = syslog_prev; | ||
1553 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2261 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1554 | /* | 2262 | /* |
1555 | * We're about to replay the log buffer. Only do this to the | 2263 | * We're about to replay the log buffer. Only do this to the |
@@ -1758,50 +2466,263 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) | |||
1758 | } | 2466 | } |
1759 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); | 2467 | EXPORT_SYMBOL_GPL(kmsg_dump_unregister); |
1760 | 2468 | ||
2469 | static bool always_kmsg_dump; | ||
2470 | module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); | ||
2471 | |||
1761 | /** | 2472 | /** |
1762 | * kmsg_dump - dump kernel log to kernel message dumpers. | 2473 | * kmsg_dump - dump kernel log to kernel message dumpers. |
1763 | * @reason: the reason (oops, panic etc) for dumping | 2474 | * @reason: the reason (oops, panic etc) for dumping |
1764 | * | 2475 | * |
1765 | * Iterate through each of the dump devices and call the oops/panic | 2476 | * Call each of the registered dumper's dump() callback, which can |
1766 | * callbacks with the log buffer. | 2477 | * retrieve the kmsg records with kmsg_dump_get_line() or |
2478 | * kmsg_dump_get_buffer(). | ||
1767 | */ | 2479 | */ |
1768 | void kmsg_dump(enum kmsg_dump_reason reason) | 2480 | void kmsg_dump(enum kmsg_dump_reason reason) |
1769 | { | 2481 | { |
1770 | unsigned long end; | ||
1771 | unsigned chars; | ||
1772 | struct kmsg_dumper *dumper; | 2482 | struct kmsg_dumper *dumper; |
1773 | const char *s1, *s2; | ||
1774 | unsigned long l1, l2; | ||
1775 | unsigned long flags; | 2483 | unsigned long flags; |
1776 | 2484 | ||
1777 | if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) | 2485 | if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) |
1778 | return; | 2486 | return; |
1779 | 2487 | ||
1780 | /* Theoretically, the log could move on after we do this, but | 2488 | rcu_read_lock(); |
1781 | there's not a lot we can do about that. The new messages | 2489 | list_for_each_entry_rcu(dumper, &dump_list, list) { |
1782 | will overwrite the start of what we dump. */ | 2490 | if (dumper->max_reason && reason > dumper->max_reason) |
2491 | continue; | ||
2492 | |||
2493 | /* initialize iterator with data about the stored records */ | ||
2494 | dumper->active = true; | ||
2495 | |||
2496 | raw_spin_lock_irqsave(&logbuf_lock, flags); | ||
2497 | dumper->cur_seq = clear_seq; | ||
2498 | dumper->cur_idx = clear_idx; | ||
2499 | dumper->next_seq = log_next_seq; | ||
2500 | dumper->next_idx = log_next_idx; | ||
2501 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
2502 | |||
2503 | /* invoke dumper which will iterate over records */ | ||
2504 | dumper->dump(dumper, reason); | ||
2505 | |||
2506 | /* reset iterator */ | ||
2507 | dumper->active = false; | ||
2508 | } | ||
2509 | rcu_read_unlock(); | ||
2510 | } | ||
2511 | |||
2512 | /** | ||
2513 | * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version) | ||
2514 | * @dumper: registered kmsg dumper | ||
2515 | * @syslog: include the "<4>" prefixes | ||
2516 | * @line: buffer to copy the line to | ||
2517 | * @size: maximum size of the buffer | ||
2518 | * @len: length of line placed into buffer | ||
2519 | * | ||
2520 | * Start at the beginning of the kmsg buffer, with the oldest kmsg | ||
2521 | * record, and copy one record into the provided buffer. | ||
2522 | * | ||
2523 | * Consecutive calls will return the next available record moving | ||
2524 | * towards the end of the buffer with the youngest messages. | ||
2525 | * | ||
2526 | * A return value of FALSE indicates that there are no more records to | ||
2527 | * read. | ||
2528 | * | ||
2529 | * The function is similar to kmsg_dump_get_line(), but grabs no locks. | ||
2530 | */ | ||
2531 | bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, | ||
2532 | char *line, size_t size, size_t *len) | ||
2533 | { | ||
2534 | struct log *msg; | ||
2535 | size_t l = 0; | ||
2536 | bool ret = false; | ||
2537 | |||
2538 | if (!dumper->active) | ||
2539 | goto out; | ||
2540 | |||
2541 | if (dumper->cur_seq < log_first_seq) { | ||
2542 | /* messages are gone, move to first available one */ | ||
2543 | dumper->cur_seq = log_first_seq; | ||
2544 | dumper->cur_idx = log_first_idx; | ||
2545 | } | ||
2546 | |||
2547 | /* last entry */ | ||
2548 | if (dumper->cur_seq >= log_next_seq) | ||
2549 | goto out; | ||
2550 | |||
2551 | msg = log_from_idx(dumper->cur_idx); | ||
2552 | l = msg_print_text(msg, 0, syslog, line, size); | ||
2553 | |||
2554 | dumper->cur_idx = log_next(dumper->cur_idx); | ||
2555 | dumper->cur_seq++; | ||
2556 | ret = true; | ||
2557 | out: | ||
2558 | if (len) | ||
2559 | *len = l; | ||
2560 | return ret; | ||
2561 | } | ||
2562 | |||
2563 | /** | ||
2564 | * kmsg_dump_get_line - retrieve one kmsg log line | ||
2565 | * @dumper: registered kmsg dumper | ||
2566 | * @syslog: include the "<4>" prefixes | ||
2567 | * @line: buffer to copy the line to | ||
2568 | * @size: maximum size of the buffer | ||
2569 | * @len: length of line placed into buffer | ||
2570 | * | ||
2571 | * Start at the beginning of the kmsg buffer, with the oldest kmsg | ||
2572 | * record, and copy one record into the provided buffer. | ||
2573 | * | ||
2574 | * Consecutive calls will return the next available record moving | ||
2575 | * towards the end of the buffer with the youngest messages. | ||
2576 | * | ||
2577 | * A return value of FALSE indicates that there are no more records to | ||
2578 | * read. | ||
2579 | */ | ||
2580 | bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, | ||
2581 | char *line, size_t size, size_t *len) | ||
2582 | { | ||
2583 | unsigned long flags; | ||
2584 | bool ret; | ||
2585 | |||
1783 | raw_spin_lock_irqsave(&logbuf_lock, flags); | 2586 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1784 | end = log_end & LOG_BUF_MASK; | 2587 | ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); |
1785 | chars = logged_chars; | ||
1786 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | 2588 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1787 | 2589 | ||
1788 | if (chars > end) { | 2590 | return ret; |
1789 | s1 = log_buf + log_buf_len - chars + end; | 2591 | } |
1790 | l1 = chars - end; | 2592 | EXPORT_SYMBOL_GPL(kmsg_dump_get_line); |
1791 | 2593 | ||
1792 | s2 = log_buf; | 2594 | /** |
1793 | l2 = end; | 2595 | * kmsg_dump_get_buffer - copy kmsg log lines |
1794 | } else { | 2596 | * @dumper: registered kmsg dumper |
1795 | s1 = ""; | 2597 | * @syslog: include the "<4>" prefixes |
1796 | l1 = 0; | 2598 | * @buf: buffer to copy the line to |
2599 | * @size: maximum size of the buffer | ||
2600 | * @len: length of line placed into buffer | ||
2601 | * | ||
2602 | * Start at the end of the kmsg buffer and fill the provided buffer | ||
2603 | * with as many of the *youngest* kmsg records as fit into it. | ||
2604 | * If the buffer is large enough, all available kmsg records will be | ||
2605 | * copied with a single call. | ||
2606 | * | ||
2607 | * Consecutive calls will fill the buffer with the next block of | ||
2608 | * available older records, not including the earlier retrieved ones. | ||
2609 | * | ||
2610 | * A return value of FALSE indicates that there are no more records to | ||
2611 | * read. | ||
2612 | */ | ||
2613 | bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, | ||
2614 | char *buf, size_t size, size_t *len) | ||
2615 | { | ||
2616 | unsigned long flags; | ||
2617 | u64 seq; | ||
2618 | u32 idx; | ||
2619 | u64 next_seq; | ||
2620 | u32 next_idx; | ||
2621 | enum log_flags prev; | ||
2622 | size_t l = 0; | ||
2623 | bool ret = false; | ||
2624 | |||
2625 | if (!dumper->active) | ||
2626 | goto out; | ||
1797 | 2627 | ||
1798 | s2 = log_buf + end - chars; | 2628 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1799 | l2 = chars; | 2629 | if (dumper->cur_seq < log_first_seq) { |
2630 | /* messages are gone, move to first available one */ | ||
2631 | dumper->cur_seq = log_first_seq; | ||
2632 | dumper->cur_idx = log_first_idx; | ||
1800 | } | 2633 | } |
1801 | 2634 | ||
1802 | rcu_read_lock(); | 2635 | /* last entry */ |
1803 | list_for_each_entry_rcu(dumper, &dump_list, list) | 2636 | if (dumper->cur_seq >= dumper->next_seq) { |
1804 | dumper->dump(dumper, reason, s1, l1, s2, l2); | 2637 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1805 | rcu_read_unlock(); | 2638 | goto out; |
2639 | } | ||
2640 | |||
2641 | /* calculate length of entire buffer */ | ||
2642 | seq = dumper->cur_seq; | ||
2643 | idx = dumper->cur_idx; | ||
2644 | prev = 0; | ||
2645 | while (seq < dumper->next_seq) { | ||
2646 | struct log *msg = log_from_idx(idx); | ||
2647 | |||
2648 | l += msg_print_text(msg, prev, true, NULL, 0); | ||
2649 | idx = log_next(idx); | ||
2650 | seq++; | ||
2651 | prev = msg->flags; | ||
2652 | } | ||
2653 | |||
2654 | /* move first record forward until length fits into the buffer */ | ||
2655 | seq = dumper->cur_seq; | ||
2656 | idx = dumper->cur_idx; | ||
2657 | prev = 0; | ||
2658 | while (l > size && seq < dumper->next_seq) { | ||
2659 | struct log *msg = log_from_idx(idx); | ||
2660 | |||
2661 | l -= msg_print_text(msg, prev, true, NULL, 0); | ||
2662 | idx = log_next(idx); | ||
2663 | seq++; | ||
2664 | prev = msg->flags; | ||
2665 | } | ||
2666 | |||
2667 | /* last message in next iteration */ | ||
2668 | next_seq = seq; | ||
2669 | next_idx = idx; | ||
2670 | |||
2671 | l = 0; | ||
2672 | prev = 0; | ||
2673 | while (seq < dumper->next_seq) { | ||
2674 | struct log *msg = log_from_idx(idx); | ||
2675 | |||
2676 | l += msg_print_text(msg, prev, syslog, buf + l, size - l); | ||
2677 | idx = log_next(idx); | ||
2678 | seq++; | ||
2679 | prev = msg->flags; | ||
2680 | } | ||
2681 | |||
2682 | dumper->next_seq = next_seq; | ||
2683 | dumper->next_idx = next_idx; | ||
2684 | ret = true; | ||
2685 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
2686 | out: | ||
2687 | if (len) | ||
2688 | *len = l; | ||
2689 | return ret; | ||
2690 | } | ||
2691 | EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); | ||
2692 | |||
2693 | /** | ||
2694 | * kmsg_dump_rewind_nolock - reset the iterator (unlocked version) | ||
2695 | * @dumper: registered kmsg dumper | ||
2696 | * | ||
2697 | * Reset the dumper's iterator so that kmsg_dump_get_line() and | ||
2698 | * kmsg_dump_get_buffer() can be called again and used multiple | ||
2699 | * times within the same dumper.dump() callback. | ||
2700 | * | ||
2701 | * The function is similar to kmsg_dump_rewind(), but grabs no locks. | ||
2702 | */ | ||
2703 | void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) | ||
2704 | { | ||
2705 | dumper->cur_seq = clear_seq; | ||
2706 | dumper->cur_idx = clear_idx; | ||
2707 | dumper->next_seq = log_next_seq; | ||
2708 | dumper->next_idx = log_next_idx; | ||
2709 | } | ||
2710 | |||
2711 | /** | ||
2712 | * kmsg_dump_rewind - reset the iterator | ||
2713 | * @dumper: registered kmsg dumper | ||
2714 | * | ||
2715 | * Reset the dumper's iterator so that kmsg_dump_get_line() and | ||
2716 | * kmsg_dump_get_buffer() can be called again and used multiple | ||
2717 | * times within the same dumper.dump() callback. | ||
2718 | */ | ||
2719 | void kmsg_dump_rewind(struct kmsg_dumper *dumper) | ||
2720 | { | ||
2721 | unsigned long flags; | ||
2722 | |||
2723 | raw_spin_lock_irqsave(&logbuf_lock, flags); | ||
2724 | kmsg_dump_rewind_nolock(dumper); | ||
2725 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); | ||
1806 | } | 2726 | } |
2727 | EXPORT_SYMBOL_GPL(kmsg_dump_rewind); | ||
1807 | #endif | 2728 | #endif |
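
A sketch of a consumer of the new dumper iterator API, to show how the pieces above fit together. This is not part of the patch: the "example" functions are invented, and the .max_reason field is assumed from the check in kmsg_dump() above.

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kmsg_dump.h>

/* Assumed destination for dumped lines (e.g. persistent storage). */
static void example_write_line(const char *line, size_t len)
{
}

/* On an oops or panic, walk the stored records one line at a time
 * using the iterator API added above. */
static void example_dump(struct kmsg_dumper *dumper,
			 enum kmsg_dump_reason reason)
{
	static char line[1024];
	size_t len;

	while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
		example_write_line(line, len);
}

static struct kmsg_dumper example_dumper = {
	.dump		= example_dump,
	.max_reason	= KMSG_DUMP_OOPS,	/* field checked by kmsg_dump() */
};

static int __init example_dumper_init(void)
{
	return kmsg_dump_register(&example_dumper);
}
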
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ee8d49b9c309..a232bb59d93f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -198,15 +198,14 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) | |||
198 | return 0; | 198 | return 0; |
199 | rcu_read_lock(); | 199 | rcu_read_lock(); |
200 | tcred = __task_cred(task); | 200 | tcred = __task_cred(task); |
201 | if (cred->user->user_ns == tcred->user->user_ns && | 201 | if (uid_eq(cred->uid, tcred->euid) && |
202 | (cred->uid == tcred->euid && | 202 | uid_eq(cred->uid, tcred->suid) && |
203 | cred->uid == tcred->suid && | 203 | uid_eq(cred->uid, tcred->uid) && |
204 | cred->uid == tcred->uid && | 204 | gid_eq(cred->gid, tcred->egid) && |
205 | cred->gid == tcred->egid && | 205 | gid_eq(cred->gid, tcred->sgid) && |
206 | cred->gid == tcred->sgid && | 206 | gid_eq(cred->gid, tcred->gid)) |
207 | cred->gid == tcred->gid)) | ||
208 | goto ok; | 207 | goto ok; |
209 | if (ptrace_has_cap(tcred->user->user_ns, mode)) | 208 | if (ptrace_has_cap(tcred->user_ns, mode)) |
210 | goto ok; | 209 | goto ok; |
211 | rcu_read_unlock(); | 210 | rcu_read_unlock(); |
212 | return -EPERM; | 211 | return -EPERM; |
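
The ptrace hunk above switches raw uid/gid comparisons to the namespace-aware kuid_t/kgid_t helpers. A minimal sketch of that idiom, with an invented helper name, assuming only the uid_eq()/gid_eq() accessors used in the hunk:

#include <linux/cred.h>
#include <linux/types.h>
#include <linux/uidgid.h>

/* Sketch only: kuid_t/kgid_t are opaque, so credentials are compared
 * with uid_eq()/gid_eq() rather than with '=='. */
static bool example_same_owner(const struct cred *a, const struct cred *b)
{
	return uid_eq(a->uid, b->uid) && gid_eq(a->gid, b->gid);
}
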
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a86f1741cc27..95cba41ce1e9 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -51,6 +51,34 @@ | |||
51 | 51 | ||
52 | #include "rcu.h" | 52 | #include "rcu.h" |
53 | 53 | ||
54 | #ifdef CONFIG_PREEMPT_RCU | ||
55 | |||
56 | /* | ||
57 | * Check for a task exiting while in a preemptible-RCU read-side | ||
58 | * critical section, clean up if so. No need to issue warnings, | ||
59 | * as debug_check_no_locks_held() already does this if lockdep | ||
60 | * is enabled. | ||
61 | */ | ||
62 | void exit_rcu(void) | ||
63 | { | ||
64 | struct task_struct *t = current; | ||
65 | |||
66 | if (likely(list_empty(¤t->rcu_node_entry))) | ||
67 | return; | ||
68 | t->rcu_read_lock_nesting = 1; | ||
69 | barrier(); | ||
70 | t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; | ||
71 | __rcu_read_unlock(); | ||
72 | } | ||
73 | |||
74 | #else /* #ifdef CONFIG_PREEMPT_RCU */ | ||
75 | |||
76 | void exit_rcu(void) | ||
77 | { | ||
78 | } | ||
79 | |||
80 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | ||
81 | |||
54 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 82 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
55 | static struct lock_class_key rcu_lock_key; | 83 | static struct lock_class_key rcu_lock_key; |
56 | struct lockdep_map rcu_lock_map = | 84 | struct lockdep_map rcu_lock_map = |
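
For reference, the ordinary read-side pairing that exit_rcu() above backstops; this sketch is not from the patch, and the "example" structure and pointer are placeholders.

#include <linux/rcupdate.h>

struct example {
	int value;
};

static struct example __rcu *example_ptr;	/* assumed shared pointer */

/* Normal usage: rcu_read_lock()/rcu_read_unlock() must balance on all
 * paths; exit_rcu() only cleans up after a task that exits while still
 * inside such a critical section. */
static int example_read(void)
{
	struct example *p;
	int val = -1;

	rcu_read_lock();
	p = rcu_dereference(example_ptr);
	if (p)
		val = p->value;
	rcu_read_unlock();
	return val;
}
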
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 22ecea0dfb62..fc31a2d65100 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -851,22 +851,6 @@ int rcu_preempt_needs_cpu(void) | |||
851 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; | 851 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; |
852 | } | 852 | } |
853 | 853 | ||
854 | /* | ||
855 | * Check for a task exiting while in a preemptible -RCU read-side | ||
856 | * critical section, clean up if so. No need to issue warnings, | ||
857 | * as debug_check_no_locks_held() already does this if lockdep | ||
858 | * is enabled. | ||
859 | */ | ||
860 | void exit_rcu(void) | ||
861 | { | ||
862 | struct task_struct *t = current; | ||
863 | |||
864 | if (t->rcu_read_lock_nesting == 0) | ||
865 | return; | ||
866 | t->rcu_read_lock_nesting = 1; | ||
867 | __rcu_read_unlock(); | ||
868 | } | ||
869 | |||
870 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ | 854 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ |
871 | 855 | ||
872 | #ifdef CONFIG_RCU_TRACE | 856 | #ifdef CONFIG_RCU_TRACE |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index a89b381a8c6e..e66b34ab7555 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -64,6 +64,7 @@ static int irqreader = 1; /* RCU readers from irq (timers). */ | |||
64 | static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ | 64 | static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ |
65 | static int fqs_holdoff; /* Hold time within burst (us). */ | 65 | static int fqs_holdoff; /* Hold time within burst (us). */ |
66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | 66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ |
67 | static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */ | ||
67 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ | 68 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ |
68 | static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ | 69 | static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ |
69 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ | 70 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ |
@@ -96,6 +97,8 @@ module_param(fqs_holdoff, int, 0444); | |||
96 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | 97 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); |
97 | module_param(fqs_stutter, int, 0444); | 98 | module_param(fqs_stutter, int, 0444); |
98 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | 99 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); |
100 | module_param(n_barrier_cbs, int, 0444); | ||
101 | MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); | ||
99 | module_param(onoff_interval, int, 0444); | 102 | module_param(onoff_interval, int, 0444); |
100 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); | 103 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); |
101 | module_param(onoff_holdoff, int, 0444); | 104 | module_param(onoff_holdoff, int, 0444); |
@@ -139,6 +142,8 @@ static struct task_struct *shutdown_task; | |||
139 | static struct task_struct *onoff_task; | 142 | static struct task_struct *onoff_task; |
140 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 143 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
141 | static struct task_struct *stall_task; | 144 | static struct task_struct *stall_task; |
145 | static struct task_struct **barrier_cbs_tasks; | ||
146 | static struct task_struct *barrier_task; | ||
142 | 147 | ||
143 | #define RCU_TORTURE_PIPE_LEN 10 | 148 | #define RCU_TORTURE_PIPE_LEN 10 |
144 | 149 | ||
@@ -164,6 +169,7 @@ static atomic_t n_rcu_torture_alloc_fail; | |||
164 | static atomic_t n_rcu_torture_free; | 169 | static atomic_t n_rcu_torture_free; |
165 | static atomic_t n_rcu_torture_mberror; | 170 | static atomic_t n_rcu_torture_mberror; |
166 | static atomic_t n_rcu_torture_error; | 171 | static atomic_t n_rcu_torture_error; |
172 | static long n_rcu_torture_barrier_error; | ||
167 | static long n_rcu_torture_boost_ktrerror; | 173 | static long n_rcu_torture_boost_ktrerror; |
168 | static long n_rcu_torture_boost_rterror; | 174 | static long n_rcu_torture_boost_rterror; |
169 | static long n_rcu_torture_boost_failure; | 175 | static long n_rcu_torture_boost_failure; |
@@ -173,6 +179,8 @@ static long n_offline_attempts; | |||
173 | static long n_offline_successes; | 179 | static long n_offline_successes; |
174 | static long n_online_attempts; | 180 | static long n_online_attempts; |
175 | static long n_online_successes; | 181 | static long n_online_successes; |
182 | static long n_barrier_attempts; | ||
183 | static long n_barrier_successes; | ||
176 | static struct list_head rcu_torture_removed; | 184 | static struct list_head rcu_torture_removed; |
177 | static cpumask_var_t shuffle_tmp_mask; | 185 | static cpumask_var_t shuffle_tmp_mask; |
178 | 186 | ||
@@ -197,6 +205,10 @@ static unsigned long shutdown_time; /* jiffies to system shutdown. */ | |||
197 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | 205 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ |
198 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | 206 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ |
199 | /* and boost task create/destroy. */ | 207 | /* and boost task create/destroy. */ |
208 | static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ | ||
209 | static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ | ||
210 | static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ | ||
211 | static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); | ||
200 | 212 | ||
201 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ | 213 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ |
202 | 214 | ||
@@ -327,6 +339,7 @@ struct rcu_torture_ops { | |||
327 | int (*completed)(void); | 339 | int (*completed)(void); |
328 | void (*deferred_free)(struct rcu_torture *p); | 340 | void (*deferred_free)(struct rcu_torture *p); |
329 | void (*sync)(void); | 341 | void (*sync)(void); |
342 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | ||
330 | void (*cb_barrier)(void); | 343 | void (*cb_barrier)(void); |
331 | void (*fqs)(void); | 344 | void (*fqs)(void); |
332 | int (*stats)(char *page); | 345 | int (*stats)(char *page); |
@@ -417,6 +430,7 @@ static struct rcu_torture_ops rcu_ops = { | |||
417 | .completed = rcu_torture_completed, | 430 | .completed = rcu_torture_completed, |
418 | .deferred_free = rcu_torture_deferred_free, | 431 | .deferred_free = rcu_torture_deferred_free, |
419 | .sync = synchronize_rcu, | 432 | .sync = synchronize_rcu, |
433 | .call = call_rcu, | ||
420 | .cb_barrier = rcu_barrier, | 434 | .cb_barrier = rcu_barrier, |
421 | .fqs = rcu_force_quiescent_state, | 435 | .fqs = rcu_force_quiescent_state, |
422 | .stats = NULL, | 436 | .stats = NULL, |
@@ -460,6 +474,7 @@ static struct rcu_torture_ops rcu_sync_ops = { | |||
460 | .completed = rcu_torture_completed, | 474 | .completed = rcu_torture_completed, |
461 | .deferred_free = rcu_sync_torture_deferred_free, | 475 | .deferred_free = rcu_sync_torture_deferred_free, |
462 | .sync = synchronize_rcu, | 476 | .sync = synchronize_rcu, |
477 | .call = NULL, | ||
463 | .cb_barrier = NULL, | 478 | .cb_barrier = NULL, |
464 | .fqs = rcu_force_quiescent_state, | 479 | .fqs = rcu_force_quiescent_state, |
465 | .stats = NULL, | 480 | .stats = NULL, |
@@ -477,6 +492,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { | |||
477 | .completed = rcu_no_completed, | 492 | .completed = rcu_no_completed, |
478 | .deferred_free = rcu_sync_torture_deferred_free, | 493 | .deferred_free = rcu_sync_torture_deferred_free, |
479 | .sync = synchronize_rcu_expedited, | 494 | .sync = synchronize_rcu_expedited, |
495 | .call = NULL, | ||
480 | .cb_barrier = NULL, | 496 | .cb_barrier = NULL, |
481 | .fqs = rcu_force_quiescent_state, | 497 | .fqs = rcu_force_quiescent_state, |
482 | .stats = NULL, | 498 | .stats = NULL, |
@@ -519,6 +535,7 @@ static struct rcu_torture_ops rcu_bh_ops = { | |||
519 | .completed = rcu_bh_torture_completed, | 535 | .completed = rcu_bh_torture_completed, |
520 | .deferred_free = rcu_bh_torture_deferred_free, | 536 | .deferred_free = rcu_bh_torture_deferred_free, |
521 | .sync = synchronize_rcu_bh, | 537 | .sync = synchronize_rcu_bh, |
538 | .call = call_rcu_bh, | ||
522 | .cb_barrier = rcu_barrier_bh, | 539 | .cb_barrier = rcu_barrier_bh, |
523 | .fqs = rcu_bh_force_quiescent_state, | 540 | .fqs = rcu_bh_force_quiescent_state, |
524 | .stats = NULL, | 541 | .stats = NULL, |
@@ -535,6 +552,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { | |||
535 | .completed = rcu_bh_torture_completed, | 552 | .completed = rcu_bh_torture_completed, |
536 | .deferred_free = rcu_sync_torture_deferred_free, | 553 | .deferred_free = rcu_sync_torture_deferred_free, |
537 | .sync = synchronize_rcu_bh, | 554 | .sync = synchronize_rcu_bh, |
555 | .call = NULL, | ||
538 | .cb_barrier = NULL, | 556 | .cb_barrier = NULL, |
539 | .fqs = rcu_bh_force_quiescent_state, | 557 | .fqs = rcu_bh_force_quiescent_state, |
540 | .stats = NULL, | 558 | .stats = NULL, |
@@ -551,6 +569,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = { | |||
551 | .completed = rcu_bh_torture_completed, | 569 | .completed = rcu_bh_torture_completed, |
552 | .deferred_free = rcu_sync_torture_deferred_free, | 570 | .deferred_free = rcu_sync_torture_deferred_free, |
553 | .sync = synchronize_rcu_bh_expedited, | 571 | .sync = synchronize_rcu_bh_expedited, |
572 | .call = NULL, | ||
554 | .cb_barrier = NULL, | 573 | .cb_barrier = NULL, |
555 | .fqs = rcu_bh_force_quiescent_state, | 574 | .fqs = rcu_bh_force_quiescent_state, |
556 | .stats = NULL, | 575 | .stats = NULL, |
@@ -606,6 +625,11 @@ static int srcu_torture_completed(void) | |||
606 | return srcu_batches_completed(&srcu_ctl); | 625 | return srcu_batches_completed(&srcu_ctl); |
607 | } | 626 | } |
608 | 627 | ||
628 | static void srcu_torture_deferred_free(struct rcu_torture *rp) | ||
629 | { | ||
630 | call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb); | ||
631 | } | ||
632 | |||
609 | static void srcu_torture_synchronize(void) | 633 | static void srcu_torture_synchronize(void) |
610 | { | 634 | { |
611 | synchronize_srcu(&srcu_ctl); | 635 | synchronize_srcu(&srcu_ctl); |
@@ -620,7 +644,7 @@ static int srcu_torture_stats(char *page) | |||
620 | cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", | 644 | cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", |
621 | torture_type, TORTURE_FLAG, idx); | 645 | torture_type, TORTURE_FLAG, idx); |
622 | for_each_possible_cpu(cpu) { | 646 | for_each_possible_cpu(cpu) { |
623 | cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, | 647 | cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, |
624 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], | 648 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], |
625 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); | 649 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); |
626 | } | 650 | } |
@@ -635,13 +659,29 @@ static struct rcu_torture_ops srcu_ops = { | |||
635 | .read_delay = srcu_read_delay, | 659 | .read_delay = srcu_read_delay, |
636 | .readunlock = srcu_torture_read_unlock, | 660 | .readunlock = srcu_torture_read_unlock, |
637 | .completed = srcu_torture_completed, | 661 | .completed = srcu_torture_completed, |
638 | .deferred_free = rcu_sync_torture_deferred_free, | 662 | .deferred_free = srcu_torture_deferred_free, |
639 | .sync = srcu_torture_synchronize, | 663 | .sync = srcu_torture_synchronize, |
664 | .call = NULL, | ||
640 | .cb_barrier = NULL, | 665 | .cb_barrier = NULL, |
641 | .stats = srcu_torture_stats, | 666 | .stats = srcu_torture_stats, |
642 | .name = "srcu" | 667 | .name = "srcu" |
643 | }; | 668 | }; |
644 | 669 | ||
670 | static struct rcu_torture_ops srcu_sync_ops = { | ||
671 | .init = srcu_torture_init, | ||
672 | .cleanup = srcu_torture_cleanup, | ||
673 | .readlock = srcu_torture_read_lock, | ||
674 | .read_delay = srcu_read_delay, | ||
675 | .readunlock = srcu_torture_read_unlock, | ||
676 | .completed = srcu_torture_completed, | ||
677 | .deferred_free = rcu_sync_torture_deferred_free, | ||
678 | .sync = srcu_torture_synchronize, | ||
679 | .call = NULL, | ||
680 | .cb_barrier = NULL, | ||
681 | .stats = srcu_torture_stats, | ||
682 | .name = "srcu_sync" | ||
683 | }; | ||
684 | |||
645 | static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) | 685 | static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) |
646 | { | 686 | { |
647 | return srcu_read_lock_raw(&srcu_ctl); | 687 | return srcu_read_lock_raw(&srcu_ctl); |
@@ -659,13 +699,29 @@ static struct rcu_torture_ops srcu_raw_ops = { | |||
659 | .read_delay = srcu_read_delay, | 699 | .read_delay = srcu_read_delay, |
660 | .readunlock = srcu_torture_read_unlock_raw, | 700 | .readunlock = srcu_torture_read_unlock_raw, |
661 | .completed = srcu_torture_completed, | 701 | .completed = srcu_torture_completed, |
662 | .deferred_free = rcu_sync_torture_deferred_free, | 702 | .deferred_free = srcu_torture_deferred_free, |
663 | .sync = srcu_torture_synchronize, | 703 | .sync = srcu_torture_synchronize, |
704 | .call = NULL, | ||
664 | .cb_barrier = NULL, | 705 | .cb_barrier = NULL, |
665 | .stats = srcu_torture_stats, | 706 | .stats = srcu_torture_stats, |
666 | .name = "srcu_raw" | 707 | .name = "srcu_raw" |
667 | }; | 708 | }; |
668 | 709 | ||
710 | static struct rcu_torture_ops srcu_raw_sync_ops = { | ||
711 | .init = srcu_torture_init, | ||
712 | .cleanup = srcu_torture_cleanup, | ||
713 | .readlock = srcu_torture_read_lock_raw, | ||
714 | .read_delay = srcu_read_delay, | ||
715 | .readunlock = srcu_torture_read_unlock_raw, | ||
716 | .completed = srcu_torture_completed, | ||
717 | .deferred_free = rcu_sync_torture_deferred_free, | ||
718 | .sync = srcu_torture_synchronize, | ||
719 | .call = NULL, | ||
720 | .cb_barrier = NULL, | ||
721 | .stats = srcu_torture_stats, | ||
722 | .name = "srcu_raw_sync" | ||
723 | }; | ||
724 | |||
669 | static void srcu_torture_synchronize_expedited(void) | 725 | static void srcu_torture_synchronize_expedited(void) |
670 | { | 726 | { |
671 | synchronize_srcu_expedited(&srcu_ctl); | 727 | synchronize_srcu_expedited(&srcu_ctl); |
@@ -680,6 +736,7 @@ static struct rcu_torture_ops srcu_expedited_ops = { | |||
680 | .completed = srcu_torture_completed, | 736 | .completed = srcu_torture_completed, |
681 | .deferred_free = rcu_sync_torture_deferred_free, | 737 | .deferred_free = rcu_sync_torture_deferred_free, |
682 | .sync = srcu_torture_synchronize_expedited, | 738 | .sync = srcu_torture_synchronize_expedited, |
739 | .call = NULL, | ||
683 | .cb_barrier = NULL, | 740 | .cb_barrier = NULL, |
684 | .stats = srcu_torture_stats, | 741 | .stats = srcu_torture_stats, |
685 | .name = "srcu_expedited" | 742 | .name = "srcu_expedited" |
@@ -1129,7 +1186,8 @@ rcu_torture_printk(char *page) | |||
1129 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " | 1186 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " |
1130 | "rtmbe: %d rtbke: %ld rtbre: %ld " | 1187 | "rtmbe: %d rtbke: %ld rtbre: %ld " |
1131 | "rtbf: %ld rtb: %ld nt: %ld " | 1188 | "rtbf: %ld rtb: %ld nt: %ld " |
1132 | "onoff: %ld/%ld:%ld/%ld", | 1189 | "onoff: %ld/%ld:%ld/%ld " |
1190 | "barrier: %ld/%ld:%ld", | ||
1133 | rcu_torture_current, | 1191 | rcu_torture_current, |
1134 | rcu_torture_current_version, | 1192 | rcu_torture_current_version, |
1135 | list_empty(&rcu_torture_freelist), | 1193 | list_empty(&rcu_torture_freelist), |
@@ -1145,14 +1203,17 @@ rcu_torture_printk(char *page) | |||
1145 | n_online_successes, | 1203 | n_online_successes, |
1146 | n_online_attempts, | 1204 | n_online_attempts, |
1147 | n_offline_successes, | 1205 | n_offline_successes, |
1148 | n_offline_attempts); | 1206 | n_offline_attempts, |
1207 | n_barrier_successes, | ||
1208 | n_barrier_attempts, | ||
1209 | n_rcu_torture_barrier_error); | ||
1210 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | ||
1149 | if (atomic_read(&n_rcu_torture_mberror) != 0 || | 1211 | if (atomic_read(&n_rcu_torture_mberror) != 0 || |
1212 | n_rcu_torture_barrier_error != 0 || | ||
1150 | n_rcu_torture_boost_ktrerror != 0 || | 1213 | n_rcu_torture_boost_ktrerror != 0 || |
1151 | n_rcu_torture_boost_rterror != 0 || | 1214 | n_rcu_torture_boost_rterror != 0 || |
1152 | n_rcu_torture_boost_failure != 0) | 1215 | n_rcu_torture_boost_failure != 0 || |
1153 | cnt += sprintf(&page[cnt], " !!!"); | 1216 | i > 1) { |
1154 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | ||
1155 | if (i > 1) { | ||
1156 | cnt += sprintf(&page[cnt], "!!! "); | 1217 | cnt += sprintf(&page[cnt], "!!! "); |
1157 | atomic_inc(&n_rcu_torture_error); | 1218 | atomic_inc(&n_rcu_torture_error); |
1158 | WARN_ON_ONCE(1); | 1219 | WARN_ON_ONCE(1); |
@@ -1337,6 +1398,7 @@ static void rcutorture_booster_cleanup(int cpu) | |||
1337 | 1398 | ||
1338 | /* This must be outside of the mutex, otherwise deadlock! */ | 1399 | /* This must be outside of the mutex, otherwise deadlock! */ |
1339 | kthread_stop(t); | 1400 | kthread_stop(t); |
1401 | boost_tasks[cpu] = NULL; | ||
1340 | } | 1402 | } |
1341 | 1403 | ||
1342 | static int rcutorture_booster_init(int cpu) | 1404 | static int rcutorture_booster_init(int cpu) |
@@ -1484,13 +1546,15 @@ static void rcu_torture_onoff_cleanup(void) | |||
1484 | return; | 1546 | return; |
1485 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); | 1547 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); |
1486 | kthread_stop(onoff_task); | 1548 | kthread_stop(onoff_task); |
1549 | onoff_task = NULL; | ||
1487 | } | 1550 | } |
1488 | 1551 | ||
1489 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1552 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
1490 | 1553 | ||
1491 | static void | 1554 | static int |
1492 | rcu_torture_onoff_init(void) | 1555 | rcu_torture_onoff_init(void) |
1493 | { | 1556 | { |
1557 | return 0; | ||
1494 | } | 1558 | } |
1495 | 1559 | ||
1496 | static void rcu_torture_onoff_cleanup(void) | 1560 | static void rcu_torture_onoff_cleanup(void) |
@@ -1554,6 +1618,152 @@ static void rcu_torture_stall_cleanup(void) | |||
1554 | return; | 1618 | return; |
1555 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); | 1619 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); |
1556 | kthread_stop(stall_task); | 1620 | kthread_stop(stall_task); |
1621 | stall_task = NULL; | ||
1622 | } | ||
1623 | |||
1624 | /* Callback function for RCU barrier testing. */ | ||
1625 | void rcu_torture_barrier_cbf(struct rcu_head *rcu) | ||
1626 | { | ||
1627 | atomic_inc(&barrier_cbs_invoked); | ||
1628 | } | ||
1629 | |||
1630 | /* kthread function to register callbacks used to test RCU barriers. */ | ||
1631 | static int rcu_torture_barrier_cbs(void *arg) | ||
1632 | { | ||
1633 | long myid = (long)arg; | ||
1634 | struct rcu_head rcu; | ||
1635 | |||
1636 | init_rcu_head_on_stack(&rcu); | ||
1637 | VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); | ||
1638 | set_user_nice(current, 19); | ||
1639 | do { | ||
1640 | wait_event(barrier_cbs_wq[myid], | ||
1641 | atomic_read(&barrier_cbs_count) == n_barrier_cbs || | ||
1642 | kthread_should_stop() || | ||
1643 | fullstop != FULLSTOP_DONTSTOP); | ||
1644 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) | ||
1645 | break; | ||
1646 | cur_ops->call(&rcu, rcu_torture_barrier_cbf); | ||
1647 | if (atomic_dec_and_test(&barrier_cbs_count)) | ||
1648 | wake_up(&barrier_wq); | ||
1649 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
1650 | VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); | ||
1651 | rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); | ||
1652 | while (!kthread_should_stop()) | ||
1653 | schedule_timeout_interruptible(1); | ||
1654 | cur_ops->cb_barrier(); | ||
1655 | destroy_rcu_head_on_stack(&rcu); | ||
1656 | return 0; | ||
1657 | } | ||
1658 | |||
1659 | /* kthread function to drive and coordinate RCU barrier testing. */ | ||
1660 | static int rcu_torture_barrier(void *arg) | ||
1661 | { | ||
1662 | int i; | ||
1663 | |||
1664 | VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); | ||
1665 | do { | ||
1666 | atomic_set(&barrier_cbs_invoked, 0); | ||
1667 | atomic_set(&barrier_cbs_count, n_barrier_cbs); | ||
1668 | /* wake_up() path contains the required barriers. */ | ||
1669 | for (i = 0; i < n_barrier_cbs; i++) | ||
1670 | wake_up(&barrier_cbs_wq[i]); | ||
1671 | wait_event(barrier_wq, | ||
1672 | atomic_read(&barrier_cbs_count) == 0 || | ||
1673 | kthread_should_stop() || | ||
1674 | fullstop != FULLSTOP_DONTSTOP); | ||
1675 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) | ||
1676 | break; | ||
1677 | n_barrier_attempts++; | ||
1678 | cur_ops->cb_barrier(); | ||
1679 | if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { | ||
1680 | n_rcu_torture_barrier_error++; | ||
1681 | WARN_ON_ONCE(1); | ||
1682 | } | ||
1683 | n_barrier_successes++; | ||
1684 | schedule_timeout_interruptible(HZ / 10); | ||
1685 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
1686 | VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); | ||
1687 | rcutorture_shutdown_absorb("rcu_torture_barrier"); | ||
1688 | while (!kthread_should_stop()) | ||
1689 | schedule_timeout_interruptible(1); | ||
1690 | return 0; | ||
1691 | } | ||
1692 | |||
1693 | /* Initialize RCU barrier testing. */ | ||
1694 | static int rcu_torture_barrier_init(void) | ||
1695 | { | ||
1696 | int i; | ||
1697 | int ret; | ||
1698 | |||
1699 | if (n_barrier_cbs == 0) | ||
1700 | return 0; | ||
1701 | if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { | ||
1702 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1703 | " Call or barrier ops missing for %s,\n", | ||
1704 | torture_type, cur_ops->name); | ||
1705 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1706 | " RCU barrier testing omitted from run.\n", | ||
1707 | torture_type); | ||
1708 | return 0; | ||
1709 | } | ||
1710 | atomic_set(&barrier_cbs_count, 0); | ||
1711 | atomic_set(&barrier_cbs_invoked, 0); | ||
1712 | barrier_cbs_tasks = | ||
1713 | kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), | ||
1714 | GFP_KERNEL); | ||
1715 | barrier_cbs_wq = | ||
1716 | kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), | ||
1717 | GFP_KERNEL); | ||
1718 | if (barrier_cbs_tasks == NULL || barrier_cbs_wq == NULL) | ||
1719 | return -ENOMEM; | ||
1720 | for (i = 0; i < n_barrier_cbs; i++) { | ||
1721 | init_waitqueue_head(&barrier_cbs_wq[i]); | ||
1722 | barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, | ||
1723 | (void *)(long)i, | ||
1724 | "rcu_torture_barrier_cbs"); | ||
1725 | if (IS_ERR(barrier_cbs_tasks[i])) { | ||
1726 | ret = PTR_ERR(barrier_cbs_tasks[i]); | ||
1727 | VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); | ||
1728 | barrier_cbs_tasks[i] = NULL; | ||
1729 | return ret; | ||
1730 | } | ||
1731 | } | ||
1732 | barrier_task = kthread_run(rcu_torture_barrier, NULL, | ||
1733 | "rcu_torture_barrier"); | ||
1734 | if (IS_ERR(barrier_task)) { | ||
1735 | ret = PTR_ERR(barrier_task); | ||
1736 | VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); | ||
1737 | barrier_task = NULL; | ||
1738 | } | ||
1739 | return 0; | ||
1740 | } | ||
1741 | |||
1742 | /* Clean up after RCU barrier testing. */ | ||
1743 | static void rcu_torture_barrier_cleanup(void) | ||
1744 | { | ||
1745 | int i; | ||
1746 | |||
1747 | if (barrier_task != NULL) { | ||
1748 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); | ||
1749 | kthread_stop(barrier_task); | ||
1750 | barrier_task = NULL; | ||
1751 | } | ||
1752 | if (barrier_cbs_tasks != NULL) { | ||
1753 | for (i = 0; i < n_barrier_cbs; i++) { | ||
1754 | if (barrier_cbs_tasks[i] != NULL) { | ||
1755 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); | ||
1756 | kthread_stop(barrier_cbs_tasks[i]); | ||
1757 | barrier_cbs_tasks[i] = NULL; | ||
1758 | } | ||
1759 | } | ||
1760 | kfree(barrier_cbs_tasks); | ||
1761 | barrier_cbs_tasks = NULL; | ||
1762 | } | ||
1763 | if (barrier_cbs_wq != NULL) { | ||
1764 | kfree(barrier_cbs_wq); | ||
1765 | barrier_cbs_wq = NULL; | ||
1766 | } | ||
1557 | } | 1767 | } |
1558 | 1768 | ||
1559 | static int rcutorture_cpu_notify(struct notifier_block *self, | 1769 | static int rcutorture_cpu_notify(struct notifier_block *self, |
@@ -1598,6 +1808,7 @@ rcu_torture_cleanup(void) | |||
1598 | fullstop = FULLSTOP_RMMOD; | 1808 | fullstop = FULLSTOP_RMMOD; |
1599 | mutex_unlock(&fullstop_mutex); | 1809 | mutex_unlock(&fullstop_mutex); |
1600 | unregister_reboot_notifier(&rcutorture_shutdown_nb); | 1810 | unregister_reboot_notifier(&rcutorture_shutdown_nb); |
1811 | rcu_torture_barrier_cleanup(); | ||
1601 | rcu_torture_stall_cleanup(); | 1812 | rcu_torture_stall_cleanup(); |
1602 | if (stutter_task) { | 1813 | if (stutter_task) { |
1603 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); | 1814 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); |
@@ -1665,6 +1876,7 @@ rcu_torture_cleanup(void) | |||
1665 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); | 1876 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); |
1666 | kthread_stop(shutdown_task); | 1877 | kthread_stop(shutdown_task); |
1667 | } | 1878 | } |
1879 | shutdown_task = NULL; | ||
1668 | rcu_torture_onoff_cleanup(); | 1880 | rcu_torture_onoff_cleanup(); |
1669 | 1881 | ||
1670 | /* Wait for all RCU callbacks to fire. */ | 1882 | /* Wait for all RCU callbacks to fire. */ |
@@ -1676,7 +1888,7 @@ rcu_torture_cleanup(void) | |||
1676 | 1888 | ||
1677 | if (cur_ops->cleanup) | 1889 | if (cur_ops->cleanup) |
1678 | cur_ops->cleanup(); | 1890 | cur_ops->cleanup(); |
1679 | if (atomic_read(&n_rcu_torture_error)) | 1891 | if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) |
1680 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); | 1892 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); |
1681 | else if (n_online_successes != n_online_attempts || | 1893 | else if (n_online_successes != n_online_attempts || |
1682 | n_offline_successes != n_offline_attempts) | 1894 | n_offline_successes != n_offline_attempts) |
@@ -1692,10 +1904,12 @@ rcu_torture_init(void) | |||
1692 | int i; | 1904 | int i; |
1693 | int cpu; | 1905 | int cpu; |
1694 | int firsterr = 0; | 1906 | int firsterr = 0; |
1907 | int retval; | ||
1695 | static struct rcu_torture_ops *torture_ops[] = | 1908 | static struct rcu_torture_ops *torture_ops[] = |
1696 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, | 1909 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, |
1697 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, | 1910 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, |
1698 | &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, | 1911 | &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, |
1912 | &srcu_raw_sync_ops, &srcu_expedited_ops, | ||
1699 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; | 1913 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; |
1700 | 1914 | ||
1701 | mutex_lock(&fullstop_mutex); | 1915 | mutex_lock(&fullstop_mutex); |
@@ -1749,6 +1963,7 @@ rcu_torture_init(void) | |||
1749 | atomic_set(&n_rcu_torture_free, 0); | 1963 | atomic_set(&n_rcu_torture_free, 0); |
1750 | atomic_set(&n_rcu_torture_mberror, 0); | 1964 | atomic_set(&n_rcu_torture_mberror, 0); |
1751 | atomic_set(&n_rcu_torture_error, 0); | 1965 | atomic_set(&n_rcu_torture_error, 0); |
1966 | n_rcu_torture_barrier_error = 0; | ||
1752 | n_rcu_torture_boost_ktrerror = 0; | 1967 | n_rcu_torture_boost_ktrerror = 0; |
1753 | n_rcu_torture_boost_rterror = 0; | 1968 | n_rcu_torture_boost_rterror = 0; |
1754 | n_rcu_torture_boost_failure = 0; | 1969 | n_rcu_torture_boost_failure = 0; |
@@ -1872,7 +2087,6 @@ rcu_torture_init(void) | |||
1872 | test_boost_duration = 2; | 2087 | test_boost_duration = 2; |
1873 | if ((test_boost == 1 && cur_ops->can_boost) || | 2088 | if ((test_boost == 1 && cur_ops->can_boost) || |
1874 | test_boost == 2) { | 2089 | test_boost == 2) { |
1875 | int retval; | ||
1876 | 2090 | ||
1877 | boost_starttime = jiffies + test_boost_interval * HZ; | 2091 | boost_starttime = jiffies + test_boost_interval * HZ; |
1878 | register_cpu_notifier(&rcutorture_cpu_nb); | 2092 | register_cpu_notifier(&rcutorture_cpu_nb); |
@@ -1897,9 +2111,22 @@ rcu_torture_init(void) | |||
1897 | goto unwind; | 2111 | goto unwind; |
1898 | } | 2112 | } |
1899 | } | 2113 | } |
1900 | rcu_torture_onoff_init(); | 2114 | i = rcu_torture_onoff_init(); |
2115 | if (i != 0) { | ||
2116 | firsterr = i; | ||
2117 | goto unwind; | ||
2118 | } | ||
1901 | register_reboot_notifier(&rcutorture_shutdown_nb); | 2119 | register_reboot_notifier(&rcutorture_shutdown_nb); |
1902 | rcu_torture_stall_init(); | 2120 | i = rcu_torture_stall_init(); |
2121 | if (i != 0) { | ||
2122 | firsterr = i; | ||
2123 | goto unwind; | ||
2124 | } | ||
2125 | retval = rcu_torture_barrier_init(); | ||
2126 | if (retval != 0) { | ||
2127 | firsterr = retval; | ||
2128 | goto unwind; | ||
2129 | } | ||
1903 | rcutorture_record_test_transition(); | 2130 | rcutorture_record_test_transition(); |
1904 | mutex_unlock(&fullstop_mutex); | 2131 | mutex_unlock(&fullstop_mutex); |
1905 | return 0; | 2132 | return 0; |
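
The new barrier kthreads exercise the guarantee that rcu_barrier() waits for all previously posted call_rcu() callbacks. A short sketch of the pattern being verified (not part of the patch; the "example" names are invented):

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_item {
	struct rcu_head rcu;
	int data;
};

static void example_free_cb(struct rcu_head *head)
{
	kfree(container_of(head, struct example_item, rcu));
}

/* Retire an item: the callback runs after a grace period elapses. */
static void example_retire(struct example_item *item)
{
	call_rcu(&item->rcu, example_free_cb);
}

/* What the barrier test checks: once rcu_barrier() returns, every
 * callback posted by earlier call_rcu() invocations has run, so e.g.
 * the module providing example_free_cb() may safely be unloaded. */
static void example_teardown(void)
{
	rcu_barrier();
}
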
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d0c5baf1ab18..4b97bba7396e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -75,6 +75,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | |||
75 | .gpnum = -300, \ | 75 | .gpnum = -300, \ |
76 | .completed = -300, \ | 76 | .completed = -300, \ |
77 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ | 77 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ |
78 | .orphan_nxttail = &structname##_state.orphan_nxtlist, \ | ||
79 | .orphan_donetail = &structname##_state.orphan_donelist, \ | ||
78 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ | 80 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ |
79 | .n_force_qs = 0, \ | 81 | .n_force_qs = 0, \ |
80 | .n_force_qs_ngp = 0, \ | 82 | .n_force_qs_ngp = 0, \ |
@@ -145,6 +147,13 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | |||
145 | unsigned long rcutorture_testseq; | 147 | unsigned long rcutorture_testseq; |
146 | unsigned long rcutorture_vernum; | 148 | unsigned long rcutorture_vernum; |
147 | 149 | ||
150 | /* State information for rcu_barrier() and friends. */ | ||
151 | |||
152 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; | ||
153 | static atomic_t rcu_barrier_cpu_count; | ||
154 | static DEFINE_MUTEX(rcu_barrier_mutex); | ||
155 | static struct completion rcu_barrier_completion; | ||
156 | |||
148 | /* | 157 | /* |
149 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 158 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s |
150 | * permit this function to be invoked without holding the root rcu_node | 159 | * permit this function to be invoked without holding the root rcu_node |
@@ -1311,95 +1320,135 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1311 | #ifdef CONFIG_HOTPLUG_CPU | 1320 | #ifdef CONFIG_HOTPLUG_CPU |
1312 | 1321 | ||
1313 | /* | 1322 | /* |
1314 | * Move a dying CPU's RCU callbacks to online CPU's callback list. | 1323 | * Send the specified CPU's RCU callbacks to the orphanage. The |
1315 | * Also record a quiescent state for this CPU for the current grace period. | 1324 | * specified CPU must be offline, and the caller must hold the |
1316 | * Synchronization and interrupt disabling are not required because | 1325 | * ->onofflock. |
1317 | * this function executes in stop_machine() context. Therefore, cleanup | ||
1318 | * operations that might block must be done later from the CPU_DEAD | ||
1319 | * notifier. | ||
1320 | * | ||
1321 | * Note that the outgoing CPU's bit has already been cleared in the | ||
1322 | * cpu_online_mask. This allows us to randomly pick a callback | ||
1323 | * destination from the bits set in that mask. | ||
1324 | */ | 1326 | */ |
1325 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | 1327 | static void |
1328 | rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | ||
1329 | struct rcu_node *rnp, struct rcu_data *rdp) | ||
1326 | { | 1330 | { |
1327 | int i; | 1331 | int i; |
1328 | unsigned long mask; | ||
1329 | int receive_cpu = cpumask_any(cpu_online_mask); | ||
1330 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | ||
1331 | struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); | ||
1332 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */ | ||
1333 | 1332 | ||
1334 | /* First, adjust the counts. */ | 1333 | /* |
1334 | * Orphan the callbacks. First adjust the counts. This is safe | ||
1335 | * because ->onofflock excludes _rcu_barrier()'s adoption of | ||
1336 | * the callbacks, thus no memory barrier is required. | ||
1337 | */ | ||
1335 | if (rdp->nxtlist != NULL) { | 1338 | if (rdp->nxtlist != NULL) { |
1336 | receive_rdp->qlen_lazy += rdp->qlen_lazy; | 1339 | rsp->qlen_lazy += rdp->qlen_lazy; |
1337 | receive_rdp->qlen += rdp->qlen; | 1340 | rsp->qlen += rdp->qlen; |
1341 | rdp->n_cbs_orphaned += rdp->qlen; | ||
1338 | rdp->qlen_lazy = 0; | 1342 | rdp->qlen_lazy = 0; |
1339 | rdp->qlen = 0; | 1343 | rdp->qlen = 0; |
1340 | } | 1344 | } |
1341 | 1345 | ||
1342 | /* | 1346 | /* |
1343 | * Next, move ready-to-invoke callbacks to be invoked on some | 1347 | * Next, move those callbacks still needing a grace period to |
1344 | * other CPU. These will not be required to pass through another | 1348 | * the orphanage, where some other CPU will pick them up. |
1345 | * grace period: They are done, regardless of CPU. | 1349 | * Some of the callbacks might have gone partway through a grace |
1350 | * period, but that is too bad. They get to start over because we | ||
1351 | * cannot assume that grace periods are synchronized across CPUs. | ||
1352 | * We don't bother updating the ->nxttail[] array yet, instead | ||
1353 | * we just reset the whole thing later on. | ||
1346 | */ | 1354 | */ |
1347 | if (rdp->nxtlist != NULL && | 1355 | if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { |
1348 | rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { | 1356 | *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; |
1349 | struct rcu_head *oldhead; | 1357 | rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; |
1350 | struct rcu_head **oldtail; | 1358 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; |
1351 | struct rcu_head **newtail; | ||
1352 | |||
1353 | oldhead = rdp->nxtlist; | ||
1354 | oldtail = receive_rdp->nxttail[RCU_DONE_TAIL]; | ||
1355 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; | ||
1356 | *rdp->nxttail[RCU_DONE_TAIL] = *oldtail; | ||
1357 | *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead; | ||
1358 | newtail = rdp->nxttail[RCU_DONE_TAIL]; | ||
1359 | for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) { | ||
1360 | if (receive_rdp->nxttail[i] == oldtail) | ||
1361 | receive_rdp->nxttail[i] = newtail; | ||
1362 | if (rdp->nxttail[i] == newtail) | ||
1363 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1364 | } | ||
1365 | } | 1359 | } |
1366 | 1360 | ||
1367 | /* | 1361 | /* |
1368 | * Finally, put the rest of the callbacks at the end of the list. | 1362 | * Then move the ready-to-invoke callbacks to the orphanage, |
1369 | * The ones that made it partway through get to start over: We | 1363 | * where some other CPU will pick them up. These will not be |
1370 | * cannot assume that grace periods are synchronized across CPUs. | 1364 | * required to pass through another grace period: They are done. |
1371 | * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but | ||
1372 | * this does not seem compelling. Not yet, anyway.) | ||
1373 | */ | 1365 | */ |
1374 | if (rdp->nxtlist != NULL) { | 1366 | if (rdp->nxtlist != NULL) { |
1375 | *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; | 1367 | *rsp->orphan_donetail = rdp->nxtlist; |
1376 | receive_rdp->nxttail[RCU_NEXT_TAIL] = | 1368 | rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; |
1377 | rdp->nxttail[RCU_NEXT_TAIL]; | ||
1378 | receive_rdp->n_cbs_adopted += rdp->qlen; | ||
1379 | rdp->n_cbs_orphaned += rdp->qlen; | ||
1380 | |||
1381 | rdp->nxtlist = NULL; | ||
1382 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
1383 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1384 | } | 1369 | } |
1385 | 1370 | ||
1371 | /* Finally, initialize the rcu_data structure's list to empty. */ | ||
1372 | rdp->nxtlist = NULL; | ||
1373 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
1374 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1375 | } | ||
1376 | |||
1377 | /* | ||
1378 | * Adopt the RCU callbacks from the specified rcu_state structure's | ||
1379 | * orphanage. The caller must hold the ->onofflock. | ||
1380 | */ | ||
1381 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
1382 | { | ||
1383 | int i; | ||
1384 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | ||
1385 | |||
1386 | /* | 1386 | /* |
1387 | * Record a quiescent state for the dying CPU. This is safe | 1387 | * If there is an rcu_barrier() operation in progress, then |
1388 | * only because we have already cleared out the callbacks. | 1388 | * only the task doing that operation is permitted to adopt |
1389 | * (Otherwise, the RCU core might try to schedule the invocation | 1389 | * callbacks. To do otherwise breaks rcu_barrier() and friends |
1390 | * of callbacks on this now-offline CPU, which would be bad.) | 1390 | * by causing them to fail to wait for the callbacks in the |
1391 | * orphanage. | ||
1391 | */ | 1392 | */ |
1392 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 1393 | if (rsp->rcu_barrier_in_progress && |
1394 | rsp->rcu_barrier_in_progress != current) | ||
1395 | return; | ||
1396 | |||
1397 | /* Do the accounting first. */ | ||
1398 | rdp->qlen_lazy += rsp->qlen_lazy; | ||
1399 | rdp->qlen += rsp->qlen; | ||
1400 | rdp->n_cbs_adopted += rsp->qlen; | ||
1401 | if (rsp->qlen_lazy != rsp->qlen) | ||
1402 | rcu_idle_count_callbacks_posted(); | ||
1403 | rsp->qlen_lazy = 0; | ||
1404 | rsp->qlen = 0; | ||
1405 | |||
1406 | /* | ||
1407 | * We do not need a memory barrier here because the only way we | ||
1408 | * can get here if there is an rcu_barrier() in flight is if | ||
1409 | * we are the task doing the rcu_barrier(). | ||
1410 | */ | ||
1411 | |||
1412 | /* First adopt the ready-to-invoke callbacks. */ | ||
1413 | if (rsp->orphan_donelist != NULL) { | ||
1414 | *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; | ||
1415 | *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; | ||
1416 | for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) | ||
1417 | if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) | ||
1418 | rdp->nxttail[i] = rsp->orphan_donetail; | ||
1419 | rsp->orphan_donelist = NULL; | ||
1420 | rsp->orphan_donetail = &rsp->orphan_donelist; | ||
1421 | } | ||
1422 | |||
1423 | /* And then adopt the callbacks that still need a grace period. */ | ||
1424 | if (rsp->orphan_nxtlist != NULL) { | ||
1425 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; | ||
1426 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; | ||
1427 | rsp->orphan_nxtlist = NULL; | ||
1428 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | ||
1429 | } | ||
1430 | } | ||
1431 | |||
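Both rcu_send_cbs_to_orphanage() and rcu_adopt_orphan_cbs() above rely on the same head-plus-tail-pointer list representation, where the tail pointer addresses either the head itself (empty list) or the last element's ->next field. A minimal, self-contained sketch of that O(1) splice idiom, with purely illustrative names:

#include <stddef.h>

struct cb {
	struct cb *next;
};

struct cb_list {
	struct cb *head;	/* First element, or NULL if the list is empty. */
	struct cb **tail;	/* &head when empty, else &last_element->next. */
};

static void cb_list_init(struct cb_list *l)
{
	l->head = NULL;
	l->tail = &l->head;
}

/* Append everything on @from to @to in O(1), leaving @from empty. */
static void cb_list_splice_tail(struct cb_list *to, struct cb_list *from)
{
	if (from->head == NULL)
		return;
	*to->tail = from->head;	/* Hook the donor's head onto the receiver's tail. */
	to->tail = from->tail;	/* The receiver's tail is now the donor's tail. */
	cb_list_init(from);	/* The donor is empty again. */
}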
1432 | /* | ||
1433 | * Trace the fact that this CPU is going offline. | ||
1434 | */ | ||
1435 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | ||
1436 | { | ||
1437 | RCU_TRACE(unsigned long mask); | ||
1438 | RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); | ||
1439 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode); | ||
1440 | |||
1441 | RCU_TRACE(mask = rdp->grpmask); | ||
1393 | trace_rcu_grace_period(rsp->name, | 1442 | trace_rcu_grace_period(rsp->name, |
1394 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), | 1443 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), |
1395 | "cpuofl"); | 1444 | "cpuofl"); |
1396 | rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum); | ||
1397 | /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */ | ||
1398 | } | 1445 | } |
1399 | 1446 | ||
1400 | /* | 1447 | /* |
1401 | * The CPU has been completely removed, and some other CPU is reporting | 1448 | * The CPU has been completely removed, and some other CPU is reporting |
1402 | * this fact from process context. Do the remainder of the cleanup. | 1449 | * this fact from process context. Do the remainder of the cleanup, |
1450 | * including orphaning the outgoing CPU's RCU callbacks, and also | ||
1451 | * adopting them, if there is no _rcu_barrier() instance running. | ||
1403 | * There can only be one CPU hotplug operation at a time, so no other | 1452 | * There can only be one CPU hotplug operation at a time, so no other |
1404 | * CPU can be attempting to update rcu_cpu_kthread_task. | 1453 | * CPU can be attempting to update rcu_cpu_kthread_task. |
1405 | */ | 1454 | */ |
@@ -1409,17 +1458,21 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1409 | unsigned long mask; | 1458 | unsigned long mask; |
1410 | int need_report = 0; | 1459 | int need_report = 0; |
1411 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 1460 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
1412 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */ | 1461 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ |
1413 | 1462 | ||
1414 | /* Adjust any no-longer-needed kthreads. */ | 1463 | /* Adjust any no-longer-needed kthreads. */ |
1415 | rcu_stop_cpu_kthread(cpu); | 1464 | rcu_stop_cpu_kthread(cpu); |
1416 | rcu_node_kthread_setaffinity(rnp, -1); | 1465 | rcu_node_kthread_setaffinity(rnp, -1); |
1417 | 1466 | ||
1418 | /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ | 1467 | /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ |
1419 | 1468 | ||
1420 | /* Exclude any attempts to start a new grace period. */ | 1469 | /* Exclude any attempts to start a new grace period. */ |
1421 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 1470 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
1422 | 1471 | ||
1472 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ | ||
1473 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); | ||
1474 | rcu_adopt_orphan_cbs(rsp); | ||
1475 | |||
1423 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ | 1476 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ |
1424 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 1477 | mask = rdp->grpmask; /* rnp->grplo is constant. */ |
1425 | do { | 1478 | do { |
@@ -1456,6 +1509,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1456 | 1509 | ||
1457 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1510 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
1458 | 1511 | ||
1512 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
1513 | { | ||
1514 | } | ||
1515 | |||
1459 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | 1516 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) |
1460 | { | 1517 | { |
1461 | } | 1518 | } |
@@ -1474,7 +1531,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1474 | { | 1531 | { |
1475 | unsigned long flags; | 1532 | unsigned long flags; |
1476 | struct rcu_head *next, *list, **tail; | 1533 | struct rcu_head *next, *list, **tail; |
1477 | int bl, count, count_lazy; | 1534 | int bl, count, count_lazy, i; |
1478 | 1535 | ||
1479 | /* If no callbacks are ready, just return.*/ | 1536 | /* If no callbacks are ready, just return.*/ |
1480 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { | 1537 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { |
@@ -1497,9 +1554,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1497 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; | 1554 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; |
1498 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; | 1555 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; |
1499 | tail = rdp->nxttail[RCU_DONE_TAIL]; | 1556 | tail = rdp->nxttail[RCU_DONE_TAIL]; |
1500 | for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) | 1557 | for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) |
1501 | if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) | 1558 | if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) |
1502 | rdp->nxttail[count] = &rdp->nxtlist; | 1559 | rdp->nxttail[i] = &rdp->nxtlist; |
1503 | local_irq_restore(flags); | 1560 | local_irq_restore(flags); |
1504 | 1561 | ||
1505 | /* Invoke callbacks. */ | 1562 | /* Invoke callbacks. */ |
@@ -1524,18 +1581,19 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1524 | rcu_is_callbacks_kthread()); | 1581 | rcu_is_callbacks_kthread()); |
1525 | 1582 | ||
1526 | /* Update count, and requeue any remaining callbacks. */ | 1583 | /* Update count, and requeue any remaining callbacks. */ |
1527 | rdp->qlen_lazy -= count_lazy; | ||
1528 | rdp->qlen -= count; | ||
1529 | rdp->n_cbs_invoked += count; | ||
1530 | if (list != NULL) { | 1584 | if (list != NULL) { |
1531 | *tail = rdp->nxtlist; | 1585 | *tail = rdp->nxtlist; |
1532 | rdp->nxtlist = list; | 1586 | rdp->nxtlist = list; |
1533 | for (count = 0; count < RCU_NEXT_SIZE; count++) | 1587 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
1534 | if (&rdp->nxtlist == rdp->nxttail[count]) | 1588 | if (&rdp->nxtlist == rdp->nxttail[i]) |
1535 | rdp->nxttail[count] = tail; | 1589 | rdp->nxttail[i] = tail; |
1536 | else | 1590 | else |
1537 | break; | 1591 | break; |
1538 | } | 1592 | } |
1593 | smp_mb(); /* List handling before counting for rcu_barrier(). */ | ||
1594 | rdp->qlen_lazy -= count_lazy; | ||
1595 | rdp->qlen -= count; | ||
1596 | rdp->n_cbs_invoked += count; | ||
1539 | 1597 | ||
1540 | /* Reinstate batch limit if we have worked down the excess. */ | 1598 | /* Reinstate batch limit if we have worked down the excess. */ |
1541 | if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) | 1599 | if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) |
@@ -1823,11 +1881,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1823 | rdp = this_cpu_ptr(rsp->rda); | 1881 | rdp = this_cpu_ptr(rsp->rda); |
1824 | 1882 | ||
1825 | /* Add the callback to our list. */ | 1883 | /* Add the callback to our list. */ |
1826 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | ||
1827 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | ||
1828 | rdp->qlen++; | 1884 | rdp->qlen++; |
1829 | if (lazy) | 1885 | if (lazy) |
1830 | rdp->qlen_lazy++; | 1886 | rdp->qlen_lazy++; |
1887 | else | ||
1888 | rcu_idle_count_callbacks_posted(); | ||
1889 | smp_mb(); /* Count before adding callback for rcu_barrier(). */ | ||
1890 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | ||
1891 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | ||
1831 | 1892 | ||
1832 | if (__is_kfree_rcu_offset((unsigned long)func)) | 1893 | if (__is_kfree_rcu_offset((unsigned long)func)) |
1833 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, | 1894 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, |
@@ -1893,6 +1954,38 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
1893 | } | 1954 | } |
1894 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 1955 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
1895 | 1956 | ||
1957 | /* | ||
1958 | * Because a context switch is a grace period for RCU-sched and RCU-bh, | ||
1959 | * any blocking grace-period wait automatically implies a grace period | ||
1960 | * if there is only one CPU online at any point in time during execution | ||
1961 | * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to | ||
1962 | * occasionally incorrectly indicate that there are multiple CPUs online | ||
1963 | * when there was in fact only one the whole time, as this just adds | ||
1964 | * some overhead: RCU still operates correctly. | ||
1965 | * | ||
1966 | * Of course, sampling num_online_cpus() with preemption enabled can | ||
1967 | * give erroneous results if there are concurrent CPU-hotplug operations. | ||
1968 | * For example, given a demonic sequence of preemptions in num_online_cpus() | ||
1969 | * and CPU-hotplug operations, there could be two or more CPUs online at | ||
1970 | * all times, but num_online_cpus() might well return one (or even zero). | ||
1971 | * | ||
1972 | * However, all such demonic sequences require at least one CPU-offline | ||
1973 | * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer | ||
1974 | * is only a problem if there is an RCU read-side critical section executing | ||
1975 | * throughout. But RCU-sched and RCU-bh read-side critical sections | ||
1976 | * disable either preemption or bh, which prevents a CPU from going offline. | ||
1977 | * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return | ||
1978 | * that there is only one CPU when in fact there was more than one throughout | ||
1979 | * is when there were no RCU readers in the system. If there are no | ||
1980 | * RCU readers, the grace period by definition can be of zero length, | ||
1981 | * regardless of the number of online CPUs. | ||
1982 | */ | ||
1983 | static inline int rcu_blocking_is_gp(void) | ||
1984 | { | ||
1985 | might_sleep(); /* Check for RCU read-side critical section. */ | ||
1986 | return num_online_cpus() <= 1; | ||
1987 | } | ||
1988 | |||
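A rough sketch of the kind of short-circuit this check enables in a blocking grace-period primitive; wait_rcu_gp() and call_rcu_sched() are existing kernel interfaces, and the exact body of the real callers is not shown in this hunk, so treat this as illustrative:

void synchronize_sched_sketch(void)
{
	if (rcu_blocking_is_gp())
		return;			/* Only one CPU online: nothing to wait for. */
	wait_rcu_gp(call_rcu_sched);	/* Otherwise wait out a full grace period. */
}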
1896 | /** | 1989 | /** |
1897 | * synchronize_sched - wait until an rcu-sched grace period has elapsed. | 1990 | * synchronize_sched - wait until an rcu-sched grace period has elapsed. |
1898 | * | 1991 | * |
@@ -2166,11 +2259,10 @@ static int rcu_cpu_has_callbacks(int cpu) | |||
2166 | rcu_preempt_cpu_has_callbacks(cpu); | 2259 | rcu_preempt_cpu_has_callbacks(cpu); |
2167 | } | 2260 | } |
2168 | 2261 | ||
2169 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; | 2262 | /* |
2170 | static atomic_t rcu_barrier_cpu_count; | 2263 | * RCU callback function for _rcu_barrier(). If we are last, wake |
2171 | static DEFINE_MUTEX(rcu_barrier_mutex); | 2264 | * up the task executing _rcu_barrier(). |
2172 | static struct completion rcu_barrier_completion; | 2265 | */ |
2173 | |||
2174 | static void rcu_barrier_callback(struct rcu_head *notused) | 2266 | static void rcu_barrier_callback(struct rcu_head *notused) |
2175 | { | 2267 | { |
2176 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 2268 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
@@ -2200,27 +2292,94 @@ static void _rcu_barrier(struct rcu_state *rsp, | |||
2200 | void (*call_rcu_func)(struct rcu_head *head, | 2292 | void (*call_rcu_func)(struct rcu_head *head, |
2201 | void (*func)(struct rcu_head *head))) | 2293 | void (*func)(struct rcu_head *head))) |
2202 | { | 2294 | { |
2203 | BUG_ON(in_interrupt()); | 2295 | int cpu; |
2296 | unsigned long flags; | ||
2297 | struct rcu_data *rdp; | ||
2298 | struct rcu_head rh; | ||
2299 | |||
2300 | init_rcu_head_on_stack(&rh); | ||
2301 | |||
2204 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ | 2302 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ |
2205 | mutex_lock(&rcu_barrier_mutex); | 2303 | mutex_lock(&rcu_barrier_mutex); |
2206 | init_completion(&rcu_barrier_completion); | 2304 | |
2305 | smp_mb(); /* Prevent any prior operations from leaking in. */ | ||
2306 | |||
2207 | /* | 2307 | /* |
2208 | * Initialize rcu_barrier_cpu_count to 1, then invoke | 2308 | * Initialize the count to one rather than to zero in order to |
2209 | * rcu_barrier_func() on each CPU, so that each CPU also has | 2309 | * avoid a too-soon return to zero in case of a short grace period |
2210 | * incremented rcu_barrier_cpu_count. Only then is it safe to | 2310 | * (or preemption of this task). Also flag this task as doing |
2211 | * decrement rcu_barrier_cpu_count -- otherwise the first CPU | 2311 | * an rcu_barrier(). This will prevent anyone else from adopting |
2212 | * might complete its grace period before all of the other CPUs | 2312 | * orphaned callbacks, which could otherwise cause failure if a |
2213 | * did their increment, causing this function to return too | 2313 | * CPU went offline and quickly came back online. To see this, |
2214 | * early. Note that on_each_cpu() disables irqs, which prevents | 2314 | * consider the following sequence of events: |
2215 | * any CPUs from coming online or going offline until each online | 2315 | * |
2216 | * CPU has queued its RCU-barrier callback. | 2316 | * 1. We cause CPU 0 to post an rcu_barrier_callback() callback. |
2317 | * 2. CPU 1 goes offline, orphaning its callbacks. | ||
2318 | * 3. CPU 0 adopts CPU 1's orphaned callbacks. | ||
2319 | * 4. CPU 1 comes back online. | ||
2320 | * 5. We cause CPU 1 to post an rcu_barrier_callback() callback. | ||
2321 | * 6. Both rcu_barrier_callback() callbacks are invoked, awakening | ||
2322 | * us -- but before CPU 1's orphaned callbacks are invoked!!! | ||
2217 | */ | 2323 | */ |
2324 | init_completion(&rcu_barrier_completion); | ||
2218 | atomic_set(&rcu_barrier_cpu_count, 1); | 2325 | atomic_set(&rcu_barrier_cpu_count, 1); |
2219 | on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); | 2326 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
2327 | rsp->rcu_barrier_in_progress = current; | ||
2328 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
2329 | |||
2330 | /* | ||
2331 | * Force every CPU with callbacks to register a new callback | ||
2332 | * that will tell us when all the preceding callbacks have | ||
2333 | * been invoked. If an offline CPU has callbacks, wait for | ||
2334 | * it to either come back online or to finish orphaning those | ||
2335 | * callbacks. | ||
2336 | */ | ||
2337 | for_each_possible_cpu(cpu) { | ||
2338 | preempt_disable(); | ||
2339 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
2340 | if (cpu_is_offline(cpu)) { | ||
2341 | preempt_enable(); | ||
2342 | while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) | ||
2343 | schedule_timeout_interruptible(1); | ||
2344 | } else if (ACCESS_ONCE(rdp->qlen)) { | ||
2345 | smp_call_function_single(cpu, rcu_barrier_func, | ||
2346 | (void *)call_rcu_func, 1); | ||
2347 | preempt_enable(); | ||
2348 | } else { | ||
2349 | preempt_enable(); | ||
2350 | } | ||
2351 | } | ||
2352 | |||
2353 | /* | ||
2354 | * Now that all online CPUs have rcu_barrier_callback() callbacks | ||
2355 | * posted, we can adopt all of the orphaned callbacks and place | ||
2356 | * an rcu_barrier_callback() callback after them. When that is done, | ||
2357 | * we are guaranteed to have an rcu_barrier_callback() callback | ||
2358 | * following every callback that could possibly have been | ||
2359 | * registered before _rcu_barrier() was called. | ||
2360 | */ | ||
2361 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | ||
2362 | rcu_adopt_orphan_cbs(rsp); | ||
2363 | rsp->rcu_barrier_in_progress = NULL; | ||
2364 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
2365 | atomic_inc(&rcu_barrier_cpu_count); | ||
2366 | smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ | ||
2367 | call_rcu_func(&rh, rcu_barrier_callback); | ||
2368 | |||
2369 | /* | ||
2370 | * Now that we have an rcu_barrier_callback() callback on each | ||
2371 | * CPU, and thus each counted, remove the initial count. | ||
2372 | */ | ||
2220 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 2373 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
2221 | complete(&rcu_barrier_completion); | 2374 | complete(&rcu_barrier_completion); |
2375 | |||
2376 | /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ | ||
2222 | wait_for_completion(&rcu_barrier_completion); | 2377 | wait_for_completion(&rcu_barrier_completion); |
2378 | |||
2379 | /* Other rcu_barrier() invocations can now safely proceed. */ | ||
2223 | mutex_unlock(&rcu_barrier_mutex); | 2380 | mutex_unlock(&rcu_barrier_mutex); |
2381 | |||
2382 | destroy_rcu_head_on_stack(&rh); | ||
2224 | } | 2383 | } |
2225 | 2384 | ||
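The count-starts-at-one idiom above generalizes; a stripped-down sketch shows why the extra initial reference prevents a too-early completion. Here post_barrier_cb_on() is a made-up placeholder for the smp_call_function_single()/call_rcu_func() plumbing in the hunk above.

static atomic_t barrier_count;
static DECLARE_COMPLETION(barrier_done);

static void barrier_cb(struct rcu_head *unused)
{
	if (atomic_dec_and_test(&barrier_count))
		complete(&barrier_done);
}

static void barrier_wait_sketch(void)
{
	int cpu;

	init_completion(&barrier_done);
	atomic_set(&barrier_count, 1);		/* Hold our own reference while posting. */
	for_each_online_cpu(cpu) {
		atomic_inc(&barrier_count);	/* One reference per posted callback. */
		post_barrier_cb_on(cpu, barrier_cb);	/* Hypothetical helper. */
	}
	if (atomic_dec_and_test(&barrier_count))	/* Drop our own reference... */
		complete(&barrier_done);		/* ...last one out completes. */
	wait_for_completion(&barrier_done);
}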
2226 | /** | 2385 | /** |
@@ -2417,7 +2576,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
2417 | 2576 | ||
2418 | for (i = NUM_RCU_LVLS - 1; i > 0; i--) | 2577 | for (i = NUM_RCU_LVLS - 1; i > 0; i--) |
2419 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; | 2578 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; |
2420 | rsp->levelspread[0] = RCU_FANOUT_LEAF; | 2579 | rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; |
2421 | } | 2580 | } |
2422 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ | 2581 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ |
2423 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | 2582 | static void __init rcu_init_levelspread(struct rcu_state *rsp) |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index cdd1be0a4072..19b61ac1079f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -29,18 +29,14 @@ | |||
29 | #include <linux/seqlock.h> | 29 | #include <linux/seqlock.h> |
30 | 30 | ||
31 | /* | 31 | /* |
32 | * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. | 32 | * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and |
33 | * CONFIG_RCU_FANOUT_LEAF. | ||
33 | * In theory, it should be possible to add more levels straightforwardly. | 34 | * In theory, it should be possible to add more levels straightforwardly. |
34 | * In practice, this did work well going from three levels to four. | 35 | * In practice, this did work well going from three levels to four. |
35 | * Of course, your mileage may vary. | 36 | * Of course, your mileage may vary. |
36 | */ | 37 | */ |
37 | #define MAX_RCU_LVLS 4 | 38 | #define MAX_RCU_LVLS 4 |
38 | #if CONFIG_RCU_FANOUT > 16 | 39 | #define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF) |
39 | #define RCU_FANOUT_LEAF 16 | ||
40 | #else /* #if CONFIG_RCU_FANOUT > 16 */ | ||
41 | #define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT) | ||
42 | #endif /* #else #if CONFIG_RCU_FANOUT > 16 */ | ||
43 | #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) | ||
44 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) | 40 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) |
45 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) | 41 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) |
46 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) | 42 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) |
@@ -88,6 +84,20 @@ struct rcu_dynticks { | |||
88 | /* Process level is worth LLONG_MAX/2. */ | 84 | /* Process level is worth LLONG_MAX/2. */ |
89 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ | 85 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ |
90 | atomic_t dynticks; /* Even value for idle, else odd. */ | 86 | atomic_t dynticks; /* Even value for idle, else odd. */ |
87 | #ifdef CONFIG_RCU_FAST_NO_HZ | ||
88 | int dyntick_drain; /* Prepare-for-idle state variable. */ | ||
89 | unsigned long dyntick_holdoff; | ||
90 | /* No retries for the jiffy of failure. */ | ||
91 | struct timer_list idle_gp_timer; | ||
92 | /* Wake up CPU sleeping with callbacks. */ | ||
93 | unsigned long idle_gp_timer_expires; | ||
94 | /* When to wake up CPU (for repost). */ | ||
95 | bool idle_first_pass; /* First pass of attempt to go idle? */ | ||
96 | unsigned long nonlazy_posted; | ||
97 | /* # times non-lazy CBs posted to CPU. */ | ||
98 | unsigned long nonlazy_posted_snap; | ||
99 | /* idle-period nonlazy_posted snapshot. */ | ||
100 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | ||
91 | }; | 101 | }; |
92 | 102 | ||
93 | /* RCU's kthread states for tracing. */ | 103 | /* RCU's kthread states for tracing. */ |
@@ -371,6 +381,17 @@ struct rcu_state { | |||
371 | 381 | ||
372 | raw_spinlock_t onofflock; /* exclude on/offline and */ | 382 | raw_spinlock_t onofflock; /* exclude on/offline and */ |
373 | /* starting new GP. */ | 383 | /* starting new GP. */ |
384 | struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ | ||
385 | /* need a grace period. */ | ||
386 | struct rcu_head **orphan_nxttail; /* Tail of above. */ | ||
387 | struct rcu_head *orphan_donelist; /* Orphaned callbacks that */ | ||
388 | /* are ready to invoke. */ | ||
389 | struct rcu_head **orphan_donetail; /* Tail of above. */ | ||
390 | long qlen_lazy; /* Number of lazy callbacks. */ | ||
391 | long qlen; /* Total number of callbacks. */ | ||
392 | struct task_struct *rcu_barrier_in_progress; | ||
393 | /* Task doing rcu_barrier(), */ | ||
394 | /* or NULL if no barrier. */ | ||
374 | raw_spinlock_t fqslock; /* Only one task forcing */ | 395 | raw_spinlock_t fqslock; /* Only one task forcing */ |
375 | /* quiescent states. */ | 396 | /* quiescent states. */ |
376 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 397 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
@@ -471,6 +492,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu); | |||
471 | static void rcu_prepare_for_idle_init(int cpu); | 492 | static void rcu_prepare_for_idle_init(int cpu); |
472 | static void rcu_cleanup_after_idle(int cpu); | 493 | static void rcu_cleanup_after_idle(int cpu); |
473 | static void rcu_prepare_for_idle(int cpu); | 494 | static void rcu_prepare_for_idle(int cpu); |
495 | static void rcu_idle_count_callbacks_posted(void); | ||
474 | static void print_cpu_stall_info_begin(void); | 496 | static void print_cpu_stall_info_begin(void); |
475 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | 497 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); |
476 | static void print_cpu_stall_info_end(void); | 498 | static void print_cpu_stall_info_end(void); |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c023464816be..3e4899459f3d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -969,22 +969,6 @@ static void __init __rcu_init_preempt(void) | |||
969 | rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); | 969 | rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); |
970 | } | 970 | } |
971 | 971 | ||
972 | /* | ||
973 | * Check for a task exiting while in a preemptible-RCU read-side | ||
974 | * critical section, clean up if so. No need to issue warnings, | ||
975 | * as debug_check_no_locks_held() already does this if lockdep | ||
976 | * is enabled. | ||
977 | */ | ||
978 | void exit_rcu(void) | ||
979 | { | ||
980 | struct task_struct *t = current; | ||
981 | |||
982 | if (t->rcu_read_lock_nesting == 0) | ||
983 | return; | ||
984 | t->rcu_read_lock_nesting = 1; | ||
985 | __rcu_read_unlock(); | ||
986 | } | ||
987 | |||
988 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 972 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
989 | 973 | ||
990 | static struct rcu_state *rcu_state = &rcu_sched_state; | 974 | static struct rcu_state *rcu_state = &rcu_sched_state; |
@@ -1910,8 +1894,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) | |||
1910 | * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs | 1894 | * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs |
1911 | * any flavor of RCU. | 1895 | * any flavor of RCU. |
1912 | */ | 1896 | */ |
1913 | int rcu_needs_cpu(int cpu) | 1897 | int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) |
1914 | { | 1898 | { |
1899 | *delta_jiffies = ULONG_MAX; | ||
1915 | return rcu_cpu_has_callbacks(cpu); | 1900 | return rcu_cpu_has_callbacks(cpu); |
1916 | } | 1901 | } |
1917 | 1902 | ||
@@ -1938,6 +1923,14 @@ static void rcu_prepare_for_idle(int cpu) | |||
1938 | { | 1923 | { |
1939 | } | 1924 | } |
1940 | 1925 | ||
1926 | /* | ||
1927 | * Don't bother keeping a running count of the number of RCU callbacks | ||
1928 | * posted because CONFIG_RCU_FAST_NO_HZ=n. | ||
1929 | */ | ||
1930 | static void rcu_idle_count_callbacks_posted(void) | ||
1931 | { | ||
1932 | } | ||
1933 | |||
1941 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 1934 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
1942 | 1935 | ||
1943 | /* | 1936 | /* |
@@ -1978,30 +1971,6 @@ static void rcu_prepare_for_idle(int cpu) | |||
1978 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ | 1971 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ |
1979 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ | 1972 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ |
1980 | 1973 | ||
1981 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); | ||
1982 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); | ||
1983 | static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); | ||
1984 | static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */ | ||
1985 | static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ | ||
1986 | |||
1987 | /* | ||
1988 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no | ||
1989 | * callbacks on this CPU, (2) this CPU has not yet attempted to enter | ||
1990 | * dyntick-idle mode, or (3) this CPU is in the process of attempting to | ||
1991 | * enter dyntick-idle mode. Otherwise, if we have recently tried and failed | ||
1992 | * to enter dyntick-idle mode, we refuse to try to enter it. After all, | ||
1993 | * it is better to incur scheduling-clock interrupts than to spin | ||
1994 | * continuously for the same time duration! | ||
1995 | */ | ||
1996 | int rcu_needs_cpu(int cpu) | ||
1997 | { | ||
1998 | /* If no callbacks, RCU doesn't need the CPU. */ | ||
1999 | if (!rcu_cpu_has_callbacks(cpu)) | ||
2000 | return 0; | ||
2001 | /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ | ||
2002 | return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; | ||
2003 | } | ||
2004 | |||
2005 | /* | 1974 | /* |
2006 | * Does the specified flavor of RCU have non-lazy callbacks pending on | 1975 | * Does the specified flavor of RCU have non-lazy callbacks pending on |
2007 | * the specified CPU? Both RCU flavor and CPU are specified by the | 1976 | * the specified CPU? Both RCU flavor and CPU are specified by the |
@@ -2045,16 +2014,75 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) | |||
2045 | } | 2014 | } |
2046 | 2015 | ||
2047 | /* | 2016 | /* |
2017 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no | ||
2018 | * callbacks on this CPU, (2) this CPU has not yet attempted to enter | ||
2019 | * dyntick-idle mode, or (3) this CPU is in the process of attempting to | ||
2020 | * enter dyntick-idle mode. Otherwise, if we have recently tried and failed | ||
2021 | * to enter dyntick-idle mode, we refuse to try to enter it. After all, | ||
2022 | * it is better to incur scheduling-clock interrupts than to spin | ||
2023 | * continuously for the same time duration! | ||
2024 | * | ||
2025 | * The delta_jiffies argument is used to store the time when RCU is | ||
2026 | * going to need the CPU again if it still has callbacks. The reason | ||
2027 | * for this is that rcu_prepare_for_idle() might need to post a timer, | ||
2028 | * but if so, it will do so after tick_nohz_stop_sched_tick() has set | ||
2029 | * the wakeup time for this CPU. This means that RCU's timer can be | ||
2030 | * delayed until the wakeup time, which defeats the purpose of posting | ||
2031 | * a timer. | ||
2032 | */ | ||
2033 | int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) | ||
2034 | { | ||
2035 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
2036 | |||
2037 | /* Flag a new idle sojourn to the idle-entry state machine. */ | ||
2038 | rdtp->idle_first_pass = 1; | ||
2039 | /* If no callbacks, RCU doesn't need the CPU. */ | ||
2040 | if (!rcu_cpu_has_callbacks(cpu)) { | ||
2041 | *delta_jiffies = ULONG_MAX; | ||
2042 | return 0; | ||
2043 | } | ||
2044 | if (rdtp->dyntick_holdoff == jiffies) { | ||
2045 | /* RCU recently tried and failed, so don't try again. */ | ||
2046 | *delta_jiffies = 1; | ||
2047 | return 1; | ||
2048 | } | ||
2049 | /* Set up for the possibility that RCU will post a timer. */ | ||
2050 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) | ||
2051 | *delta_jiffies = RCU_IDLE_GP_DELAY; | ||
2052 | else | ||
2053 | *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; | ||
2054 | return 0; | ||
2055 | } | ||
2056 | |||
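For context, a hedged sketch of how an idle-tick caller might consume the new delta_jiffies out-parameter; the function and variable names below are illustrative and do not come from this patch.

static unsigned long next_idle_wakeup_delta(int cpu, unsigned long timer_delta)
{
	unsigned long rcu_delta;

	if (rcu_needs_cpu(cpu, &rcu_delta))
		return 1;			/* RCU needs the very next tick. */
	/*
	 * RCU can tolerate idling, but must be woken no later than
	 * rcu_delta jiffies from now so that any timer it reposts in
	 * rcu_prepare_for_idle() is not pushed past its expiry.
	 */
	return min(timer_delta, rcu_delta);
}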
2057 | /* | ||
2058 | * Handler for smp_call_function_single(). The only point of this | ||
2059 | * handler is to wake the CPU up, so the handler does only tracing. | ||
2060 | */ | ||
2061 | void rcu_idle_demigrate(void *unused) | ||
2062 | { | ||
2063 | trace_rcu_prep_idle("Demigrate"); | ||
2064 | } | ||
2065 | |||
2066 | /* | ||
2048 | * Timer handler used to force CPU to start pushing its remaining RCU | 2067 | * Timer handler used to force CPU to start pushing its remaining RCU |
2049 | * callbacks in the case where it entered dyntick-idle mode with callbacks | 2068 | * callbacks in the case where it entered dyntick-idle mode with callbacks |
2050 | * pending. The handler doesn't really need to do anything because the | 2069 | * pending. The handler doesn't really need to do anything because the |
2051 | * real work is done upon re-entry to idle, or by the next scheduling-clock | 2070 | * real work is done upon re-entry to idle, or by the next scheduling-clock |
2052 | * interrupt should idle not be re-entered. | 2071 | * interrupt should idle not be re-entered. |
2072 | * | ||
2073 | * One special case: the timer gets migrated without awakening the CPU | ||
2074 | * on which the timer was scheduled on. In this case, we must wake up | ||
2075 | * that CPU. We do so with smp_call_function_single(). | ||
2053 | */ | 2076 | */ |
2054 | static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) | 2077 | static void rcu_idle_gp_timer_func(unsigned long cpu_in) |
2055 | { | 2078 | { |
2079 | int cpu = (int)cpu_in; | ||
2080 | |||
2056 | trace_rcu_prep_idle("Timer"); | 2081 | trace_rcu_prep_idle("Timer"); |
2057 | return HRTIMER_NORESTART; | 2082 | if (cpu != smp_processor_id()) |
2083 | smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0); | ||
2084 | else | ||
2085 | WARN_ON_ONCE(1); /* Getting here can hang the system... */ | ||
2058 | } | 2086 | } |
2059 | 2087 | ||
2060 | /* | 2088 | /* |
@@ -2062,29 +2090,25 @@ static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) | |||
2062 | */ | 2090 | */ |
2063 | static void rcu_prepare_for_idle_init(int cpu) | 2091 | static void rcu_prepare_for_idle_init(int cpu) |
2064 | { | 2092 | { |
2065 | static int firsttime = 1; | 2093 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
2066 | struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); | ||
2067 | 2094 | ||
2068 | hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 2095 | rdtp->dyntick_holdoff = jiffies - 1; |
2069 | hrtp->function = rcu_idle_gp_timer_func; | 2096 | setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu); |
2070 | if (firsttime) { | 2097 | rdtp->idle_gp_timer_expires = jiffies - 1; |
2071 | unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); | 2098 | rdtp->idle_first_pass = 1; |
2072 | |||
2073 | rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); | ||
2074 | upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY); | ||
2075 | rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000); | ||
2076 | firsttime = 0; | ||
2077 | } | ||
2078 | } | 2099 | } |
2079 | 2100 | ||
2080 | /* | 2101 | /* |
2081 | * Clean up for exit from idle. Because we are exiting from idle, there | 2102 | * Clean up for exit from idle. Because we are exiting from idle, there |
2082 | * is no longer any point to rcu_idle_gp_timer, so cancel it. This will | 2103 | * is no longer any point to ->idle_gp_timer, so cancel it. This will |
2083 | * do nothing if this timer is not active, so just cancel it unconditionally. | 2104 | * do nothing if this timer is not active, so just cancel it unconditionally. |
2084 | */ | 2105 | */ |
2085 | static void rcu_cleanup_after_idle(int cpu) | 2106 | static void rcu_cleanup_after_idle(int cpu) |
2086 | { | 2107 | { |
2087 | hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); | 2108 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
2109 | |||
2110 | del_timer(&rdtp->idle_gp_timer); | ||
2111 | trace_rcu_prep_idle("Cleanup after idle"); | ||
2088 | } | 2112 | } |
2089 | 2113 | ||
2090 | /* | 2114 | /* |
@@ -2102,19 +2126,41 @@ static void rcu_cleanup_after_idle(int cpu) | |||
2102 | * Because it is not legal to invoke rcu_process_callbacks() with irqs | 2126 | * Because it is not legal to invoke rcu_process_callbacks() with irqs |
2103 | * disabled, we do one pass of force_quiescent_state(), then do an | 2127 | * disabled, we do one pass of force_quiescent_state(), then do an |
2104 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked | 2128 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked |
2105 | * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. | 2129 | * later. The ->dyntick_drain field controls the sequencing. |
2106 | * | 2130 | * |
2107 | * The caller must have disabled interrupts. | 2131 | * The caller must have disabled interrupts. |
2108 | */ | 2132 | */ |
2109 | static void rcu_prepare_for_idle(int cpu) | 2133 | static void rcu_prepare_for_idle(int cpu) |
2110 | { | 2134 | { |
2135 | struct timer_list *tp; | ||
2136 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
2137 | |||
2138 | /* | ||
2139 | * If this is an idle re-entry, for example, due to use of | ||
2140 | * RCU_NONIDLE() or the new idle-loop tracing API within the idle | ||
2141 | * loop, then don't take any state-machine actions, unless the | ||
2142 | * momentary exit from idle queued additional non-lazy callbacks. | ||
2143 | * Instead, repost the ->idle_gp_timer if this CPU has callbacks | ||
2144 | * pending. | ||
2145 | */ | ||
2146 | if (!rdtp->idle_first_pass && | ||
2147 | (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { | ||
2148 | if (rcu_cpu_has_callbacks(cpu)) { | ||
2149 | tp = &rdtp->idle_gp_timer; | ||
2150 | mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); | ||
2151 | } | ||
2152 | return; | ||
2153 | } | ||
2154 | rdtp->idle_first_pass = 0; | ||
2155 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1; | ||
2156 | |||
2111 | /* | 2157 | /* |
2112 | * If there are no callbacks on this CPU, enter dyntick-idle mode. | 2158 | * If there are no callbacks on this CPU, enter dyntick-idle mode. |
2113 | * Also reset state to avoid prejudicing later attempts. | 2159 | * Also reset state to avoid prejudicing later attempts. |
2114 | */ | 2160 | */ |
2115 | if (!rcu_cpu_has_callbacks(cpu)) { | 2161 | if (!rcu_cpu_has_callbacks(cpu)) { |
2116 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | 2162 | rdtp->dyntick_holdoff = jiffies - 1; |
2117 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2163 | rdtp->dyntick_drain = 0; |
2118 | trace_rcu_prep_idle("No callbacks"); | 2164 | trace_rcu_prep_idle("No callbacks"); |
2119 | return; | 2165 | return; |
2120 | } | 2166 | } |
@@ -2123,32 +2169,37 @@ static void rcu_prepare_for_idle(int cpu) | |||
2123 | * If in holdoff mode, just return. We will presumably have | 2169 | * If in holdoff mode, just return. We will presumably have |
2124 | * refrained from disabling the scheduling-clock tick. | 2170 | * refrained from disabling the scheduling-clock tick. |
2125 | */ | 2171 | */ |
2126 | if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { | 2172 | if (rdtp->dyntick_holdoff == jiffies) { |
2127 | trace_rcu_prep_idle("In holdoff"); | 2173 | trace_rcu_prep_idle("In holdoff"); |
2128 | return; | 2174 | return; |
2129 | } | 2175 | } |
2130 | 2176 | ||
2131 | /* Check and update the rcu_dyntick_drain sequencing. */ | 2177 | /* Check and update the ->dyntick_drain sequencing. */ |
2132 | if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2178 | if (rdtp->dyntick_drain <= 0) { |
2133 | /* First time through, initialize the counter. */ | 2179 | /* First time through, initialize the counter. */ |
2134 | per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; | 2180 | rdtp->dyntick_drain = RCU_IDLE_FLUSHES; |
2135 | } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && | 2181 | } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && |
2136 | !rcu_pending(cpu) && | 2182 | !rcu_pending(cpu) && |
2137 | !local_softirq_pending()) { | 2183 | !local_softirq_pending()) { |
2138 | /* Can we go dyntick-idle despite still having callbacks? */ | 2184 | /* Can we go dyntick-idle despite still having callbacks? */ |
2139 | trace_rcu_prep_idle("Dyntick with callbacks"); | 2185 | rdtp->dyntick_drain = 0; |
2140 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2186 | rdtp->dyntick_holdoff = jiffies; |
2141 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | 2187 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) { |
2142 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) | 2188 | trace_rcu_prep_idle("Dyntick with callbacks"); |
2143 | hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), | 2189 | rdtp->idle_gp_timer_expires = |
2144 | rcu_idle_gp_wait, HRTIMER_MODE_REL); | 2190 | jiffies + RCU_IDLE_GP_DELAY; |
2145 | else | 2191 | } else { |
2146 | hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), | 2192 | rdtp->idle_gp_timer_expires = |
2147 | rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); | 2193 | jiffies + RCU_IDLE_LAZY_GP_DELAY; |
2194 | trace_rcu_prep_idle("Dyntick with lazy callbacks"); | ||
2195 | } | ||
2196 | tp = &rdtp->idle_gp_timer; | ||
2197 | mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); | ||
2198 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; | ||
2148 | return; /* Nothing more to do immediately. */ | 2199 | return; /* Nothing more to do immediately. */ |
2149 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2200 | } else if (--(rdtp->dyntick_drain) <= 0) { |
2150 | /* We have hit the limit, so time to give up. */ | 2201 | /* We have hit the limit, so time to give up. */ |
2151 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | 2202 | rdtp->dyntick_holdoff = jiffies; |
2152 | trace_rcu_prep_idle("Begin holdoff"); | 2203 | trace_rcu_prep_idle("Begin holdoff"); |
2153 | invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ | 2204 | invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ |
2154 | return; | 2205 | return; |
@@ -2184,6 +2235,19 @@ static void rcu_prepare_for_idle(int cpu) | |||
2184 | trace_rcu_prep_idle("Callbacks drained"); | 2235 | trace_rcu_prep_idle("Callbacks drained"); |
2185 | } | 2236 | } |
2186 | 2237 | ||
2238 | /* | ||
2239 | * Keep a running count of the number of non-lazy callbacks posted | ||
2240 | * on this CPU. This running counter (which is never decremented) allows | ||
2241 | * rcu_prepare_for_idle() to detect when something out of the idle loop | ||
2242 | * posts a callback, even if an equal number of callbacks are invoked. | ||
2243 | * Of course, callbacks should only be posted from within a trace event | ||
2244 | * designed to be called from idle or from within RCU_NONIDLE(). | ||
2245 | */ | ||
2246 | static void rcu_idle_count_callbacks_posted(void) | ||
2247 | { | ||
2248 | __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); | ||
2249 | } | ||
2250 | |||
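The ->nonlazy_posted / ->nonlazy_posted_snap pair above implements a never-decremented counter plus snapshot; in generic, self-contained form (illustrative names only) the idiom looks like this:

struct post_detector {
	unsigned long posted;		/* Bumped on every post; never decremented. */
	unsigned long posted_snap;	/* Snapshot taken when entering idle. */
};

static void note_post(struct post_detector *pd)
{
	pd->posted++;
}

static void snapshot_on_idle_entry(struct post_detector *pd)
{
	pd->posted_snap = pd->posted;
}

/* True even if every item posted since the snapshot was also consumed. */
static int posted_since_idle_entry(const struct post_detector *pd)
{
	return pd->posted != pd->posted_snap;
}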
2187 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 2251 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
2188 | 2252 | ||
2189 | #ifdef CONFIG_RCU_CPU_STALL_INFO | 2253 | #ifdef CONFIG_RCU_CPU_STALL_INFO |
@@ -2192,14 +2256,13 @@ static void rcu_prepare_for_idle(int cpu) | |||
2192 | 2256 | ||
2193 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | 2257 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) |
2194 | { | 2258 | { |
2195 | struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); | 2259 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
2260 | struct timer_list *tltp = &rdtp->idle_gp_timer; | ||
2196 | 2261 | ||
2197 | sprintf(cp, "drain=%d %c timer=%lld", | 2262 | sprintf(cp, "drain=%d %c timer=%lu", |
2198 | per_cpu(rcu_dyntick_drain, cpu), | 2263 | rdtp->dyntick_drain, |
2199 | per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', | 2264 | rdtp->dyntick_holdoff == jiffies ? 'H' : '.', |
2200 | hrtimer_active(hrtp) | 2265 | timer_pending(tltp) ? tltp->expires - jiffies : -1); |
2201 | ? ktime_to_us(hrtimer_get_remaining(hrtp)) | ||
2202 | : -1); | ||
2203 | } | 2266 | } |
2204 | 2267 | ||
2205 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 2268 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index ed459edeff43..d4bc16ddd1d4 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -271,13 +271,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
271 | 271 | ||
272 | gpnum = rsp->gpnum; | 272 | gpnum = rsp->gpnum; |
273 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " | 273 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " |
274 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", | 274 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", |
275 | rsp->completed, gpnum, rsp->fqs_state, | 275 | rsp->completed, gpnum, rsp->fqs_state, |
276 | (long)(rsp->jiffies_force_qs - jiffies), | 276 | (long)(rsp->jiffies_force_qs - jiffies), |
277 | (int)(jiffies & 0xffff), | 277 | (int)(jiffies & 0xffff), |
278 | rsp->n_force_qs, rsp->n_force_qs_ngp, | 278 | rsp->n_force_qs, rsp->n_force_qs_ngp, |
279 | rsp->n_force_qs - rsp->n_force_qs_ngp, | 279 | rsp->n_force_qs - rsp->n_force_qs_ngp, |
280 | rsp->n_force_qs_lh); | 280 | rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); |
281 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { | 281 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { |
282 | if (rnp->level != level) { | 282 | if (rnp->level != level) { |
283 | seq_puts(m, "\n"); | 283 | seq_puts(m, "\n"); |
diff --git a/kernel/relay.c b/kernel/relay.c index ab56a1764d4d..e8cd2027abbd 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -1235,6 +1235,7 @@ static ssize_t subbuf_splice_actor(struct file *in, | |||
1235 | struct splice_pipe_desc spd = { | 1235 | struct splice_pipe_desc spd = { |
1236 | .pages = pages, | 1236 | .pages = pages, |
1237 | .nr_pages = 0, | 1237 | .nr_pages = 0, |
1238 | .nr_pages_max = PIPE_DEF_BUFFERS, | ||
1238 | .partial = partial, | 1239 | .partial = partial, |
1239 | .flags = flags, | 1240 | .flags = flags, |
1240 | .ops = &relay_pipe_buf_ops, | 1241 | .ops = &relay_pipe_buf_ops, |
@@ -1302,8 +1303,8 @@ static ssize_t subbuf_splice_actor(struct file *in, | |||
1302 | ret += padding; | 1303 | ret += padding; |
1303 | 1304 | ||
1304 | out: | 1305 | out: |
1305 | splice_shrink_spd(pipe, &spd); | 1306 | splice_shrink_spd(&spd); |
1306 | return ret; | 1307 | return ret; |
1307 | } | 1308 | } |
1308 | 1309 | ||
1309 | static ssize_t relay_file_splice_read(struct file *in, | 1310 | static ssize_t relay_file_splice_read(struct file *in, |
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index d508363858b3..ad581aa2369a 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
@@ -22,75 +22,70 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) | |||
22 | counter->parent = parent; | 22 | counter->parent = parent; |
23 | } | 23 | } |
24 | 24 | ||
25 | int res_counter_charge_locked(struct res_counter *counter, unsigned long val) | 25 | int res_counter_charge_locked(struct res_counter *counter, unsigned long val, |
26 | bool force) | ||
26 | { | 27 | { |
28 | int ret = 0; | ||
29 | |||
27 | if (counter->usage + val > counter->limit) { | 30 | if (counter->usage + val > counter->limit) { |
28 | counter->failcnt++; | 31 | counter->failcnt++; |
29 | return -ENOMEM; | 32 | ret = -ENOMEM; |
33 | if (!force) | ||
34 | return ret; | ||
30 | } | 35 | } |
31 | 36 | ||
32 | counter->usage += val; | 37 | counter->usage += val; |
33 | if (counter->usage > counter->max_usage) | 38 | if (counter->usage > counter->max_usage) |
34 | counter->max_usage = counter->usage; | 39 | counter->max_usage = counter->usage; |
35 | return 0; | 40 | return ret; |
36 | } | 41 | } |
37 | 42 | ||
38 | int res_counter_charge(struct res_counter *counter, unsigned long val, | 43 | static int __res_counter_charge(struct res_counter *counter, unsigned long val, |
39 | struct res_counter **limit_fail_at) | 44 | struct res_counter **limit_fail_at, bool force) |
40 | { | 45 | { |
41 | int ret; | 46 | int ret, r; |
42 | unsigned long flags; | 47 | unsigned long flags; |
43 | struct res_counter *c, *u; | 48 | struct res_counter *c, *u; |
44 | 49 | ||
50 | r = ret = 0; | ||
45 | *limit_fail_at = NULL; | 51 | *limit_fail_at = NULL; |
46 | local_irq_save(flags); | 52 | local_irq_save(flags); |
47 | for (c = counter; c != NULL; c = c->parent) { | 53 | for (c = counter; c != NULL; c = c->parent) { |
48 | spin_lock(&c->lock); | 54 | spin_lock(&c->lock); |
49 | ret = res_counter_charge_locked(c, val); | 55 | r = res_counter_charge_locked(c, val, force); |
50 | spin_unlock(&c->lock); | 56 | spin_unlock(&c->lock); |
51 | if (ret < 0) { | 57 | if (r < 0 && !ret) { |
58 | ret = r; | ||
52 | *limit_fail_at = c; | 59 | *limit_fail_at = c; |
53 | goto undo; | 60 | if (!force) |
61 | break; | ||
54 | } | 62 | } |
55 | } | 63 | } |
56 | ret = 0; | 64 | |
57 | goto done; | 65 | if (ret < 0 && !force) { |
58 | undo: | 66 | for (u = counter; u != c; u = u->parent) { |
59 | for (u = counter; u != c; u = u->parent) { | 67 | spin_lock(&u->lock); |
60 | spin_lock(&u->lock); | 68 | res_counter_uncharge_locked(u, val); |
61 | res_counter_uncharge_locked(u, val); | 69 | spin_unlock(&u->lock); |
62 | spin_unlock(&u->lock); | 70 | } |
63 | } | 71 | } |
64 | done: | ||
65 | local_irq_restore(flags); | 72 | local_irq_restore(flags); |
73 | |||
66 | return ret; | 74 | return ret; |
67 | } | 75 | } |
68 | 76 | ||
77 | int res_counter_charge(struct res_counter *counter, unsigned long val, | ||
78 | struct res_counter **limit_fail_at) | ||
79 | { | ||
80 | return __res_counter_charge(counter, val, limit_fail_at, false); | ||
81 | } | ||
82 | |||
69 | int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, | 83 | int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, |
70 | struct res_counter **limit_fail_at) | 84 | struct res_counter **limit_fail_at) |
71 | { | 85 | { |
72 | int ret, r; | 86 | return __res_counter_charge(counter, val, limit_fail_at, true); |
73 | unsigned long flags; | ||
74 | struct res_counter *c; | ||
75 | |||
76 | r = ret = 0; | ||
77 | *limit_fail_at = NULL; | ||
78 | local_irq_save(flags); | ||
79 | for (c = counter; c != NULL; c = c->parent) { | ||
80 | spin_lock(&c->lock); | ||
81 | r = res_counter_charge_locked(c, val); | ||
82 | if (r) | ||
83 | c->usage += val; | ||
84 | spin_unlock(&c->lock); | ||
85 | if (r < 0 && ret == 0) { | ||
86 | *limit_fail_at = c; | ||
87 | ret = r; | ||
88 | } | ||
89 | } | ||
90 | local_irq_restore(flags); | ||
91 | |||
92 | return ret; | ||
93 | } | 87 | } |
88 | |||
94 | void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) | 89 | void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) |
95 | { | 90 | { |
96 | if (WARN_ON(counter->usage < val)) | 91 | if (WARN_ON(counter->usage < val)) |
@@ -99,13 +94,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) | |||
99 | counter->usage -= val; | 94 | counter->usage -= val; |
100 | } | 95 | } |
101 | 96 | ||
102 | void res_counter_uncharge(struct res_counter *counter, unsigned long val) | 97 | void res_counter_uncharge_until(struct res_counter *counter, |
98 | struct res_counter *top, | ||
99 | unsigned long val) | ||
103 | { | 100 | { |
104 | unsigned long flags; | 101 | unsigned long flags; |
105 | struct res_counter *c; | 102 | struct res_counter *c; |
106 | 103 | ||
107 | local_irq_save(flags); | 104 | local_irq_save(flags); |
108 | for (c = counter; c != NULL; c = c->parent) { | 105 | for (c = counter; c != top; c = c->parent) { |
109 | spin_lock(&c->lock); | 106 | spin_lock(&c->lock); |
110 | res_counter_uncharge_locked(c, val); | 107 | res_counter_uncharge_locked(c, val); |
111 | spin_unlock(&c->lock); | 108 | spin_unlock(&c->lock); |
@@ -113,6 +110,10 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val) | |||
113 | local_irq_restore(flags); | 110 | local_irq_restore(flags); |
114 | } | 111 | } |
115 | 112 | ||
113 | void res_counter_uncharge(struct res_counter *counter, unsigned long val) | ||
114 | { | ||
115 | res_counter_uncharge_until(counter, NULL, val); | ||
116 | } | ||
116 | 117 | ||
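A hedged usage sketch of the reworked charge paths (the caller below is illustrative, not part of this patch): a plain charge either succeeds against every ancestor or is fully rolled back, whereas the _nofail variant always applies the charge and merely reports the first counter that exceeded its limit.

static int account_or_fail(struct res_counter *cnt, unsigned long bytes)
{
	struct res_counter *fail_at;

	if (res_counter_charge(cnt, bytes, &fail_at)) {
		/* -ENOMEM: nothing was left charged anywhere in the hierarchy. */
		pr_debug("res_counter %p over limit\n", fail_at);
		return -ENOMEM;
	}
	/* ... consume the resource; when done, undo the whole chain: */
	res_counter_uncharge(cnt, bytes);
	return 0;
}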
117 | static inline unsigned long long * | 118 | static inline unsigned long long * |
118 | res_counter_member(struct res_counter *counter, int member) | 119 | res_counter_member(struct res_counter *counter, int member) |
diff --git a/kernel/resource.c b/kernel/resource.c index 7e8ea66a8c01..e1d2b8ee76d5 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -515,8 +515,8 @@ out: | |||
515 | * @root: root resource descriptor | 515 | * @root: root resource descriptor |
516 | * @new: resource descriptor desired by caller | 516 | * @new: resource descriptor desired by caller |
517 | * @size: requested resource region size | 517 | * @size: requested resource region size |
518 | * @min: minimum size to allocate | 518 | * @min: minimum boundary to allocate |
519 | * @max: maximum size to allocate | 519 | * @max: maximum boundary to allocate |
520 | * @align: alignment requested, in bytes | 520 | * @align: alignment requested, in bytes |
521 | * @alignf: alignment function, optional, called if not NULL | 521 | * @alignf: alignment function, optional, called if not NULL |
522 | * @alignf_data: arbitrary data to pass to the @alignf function | 522 | * @alignf_data: arbitrary data to pass to the @alignf function |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 9a7dd35102a3..173ea52f3af0 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
@@ -16,5 +16,3 @@ obj-$(CONFIG_SMP) += cpupri.o | |||
16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | 18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o |
19 | |||
20 | |||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0533a688ce22..468bdd44c1ba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -83,6 +83,7 @@ | |||
83 | 83 | ||
84 | #include "sched.h" | 84 | #include "sched.h" |
85 | #include "../workqueue_sched.h" | 85 | #include "../workqueue_sched.h" |
86 | #include "../smpboot.h" | ||
86 | 87 | ||
87 | #define CREATE_TRACE_POINTS | 88 | #define CREATE_TRACE_POINTS |
88 | #include <trace/events/sched.h> | 89 | #include <trace/events/sched.h> |
@@ -141,9 +142,8 @@ const_debug unsigned int sysctl_sched_features = | |||
141 | #define SCHED_FEAT(name, enabled) \ | 142 | #define SCHED_FEAT(name, enabled) \ |
142 | #name , | 143 | #name , |
143 | 144 | ||
144 | static __read_mostly char *sched_feat_names[] = { | 145 | static const char * const sched_feat_names[] = { |
145 | #include "features.h" | 146 | #include "features.h" |
146 | NULL | ||
147 | }; | 147 | }; |
148 | 148 | ||
149 | #undef SCHED_FEAT | 149 | #undef SCHED_FEAT |
@@ -692,8 +692,6 @@ int tg_nop(struct task_group *tg, void *data) | |||
692 | } | 692 | } |
693 | #endif | 693 | #endif |
694 | 694 | ||
695 | void update_cpu_load(struct rq *this_rq); | ||
696 | |||
697 | static void set_load_weight(struct task_struct *p) | 695 | static void set_load_weight(struct task_struct *p) |
698 | { | 696 | { |
699 | int prio = p->static_prio - MAX_RT_PRIO; | 697 | int prio = p->static_prio - MAX_RT_PRIO; |
@@ -2162,11 +2160,73 @@ unsigned long this_cpu_load(void) | |||
2162 | } | 2160 | } |
2163 | 2161 | ||
2164 | 2162 | ||
2163 | /* | ||
2164 | * Global load-average calculations | ||
2165 | * | ||
2166 | * We take a distributed and async approach to calculating the global load-avg | ||
2167 | * in order to minimize overhead. | ||
2168 | * | ||
2169 | * The global load average is an exponentially decaying average of nr_running + | ||
2170 | * nr_uninterruptible. | ||
2171 | * | ||
2172 | * Once every LOAD_FREQ: | ||
2173 | * | ||
2174 | * nr_active = 0; | ||
2175 | * for_each_possible_cpu(cpu) | ||
2176 | * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; | ||
2177 | * | ||
2178 | * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) | ||
2179 | * | ||
2180 | * Due to a number of reasons the above turns in the mess below: | ||
2181 | * | ||
2182 | * - for_each_possible_cpu() is prohibitively expensive on machines with | ||
2183 | * a serious number of cpus; therefore we need to take a distributed approach | ||
2184 | * to calculating nr_active. | ||
2185 | * | ||
2186 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 | ||
2187 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } | ||
2188 | * | ||
2189 | * So assuming nr_active := 0 when we start out -- true per definition, we | ||
2190 | * can simply take per-cpu deltas and fold those into a global accumulate | ||
2191 | * to obtain the same result. See calc_load_fold_active(). | ||
2192 | * | ||
2193 | * Furthermore, in order to avoid synchronizing all per-cpu delta folding | ||
2194 | * across the machine, we assume 10 ticks is sufficient time for every | ||
2195 | * cpu to have completed this task. | ||
2196 | * | ||
2197 | * This places an upper bound on the IRQ-off latency of the machine. Then | ||
2198 | * again, being late doesn't lose the delta, just wrecks the sample. | ||
2199 | * | ||
2200 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because | ||
2201 | * this would add another cross-cpu cacheline miss and atomic operation | ||
2202 | * to the wakeup path. Instead we increment on whatever cpu the task ran | ||
2203 | * when it went into uninterruptible state and decrement on whatever cpu | ||
2204 | * did the wakeup. This means that only the sum of nr_uninterruptible over | ||
2205 | * all cpus yields the correct result. | ||
2206 | * | ||
2207 | * This covers the NO_HZ=n code, for extra head-aches, see the comment below. | ||
2208 | */ | ||
2209 | |||
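
The \Sum identity in the comment above is the whole trick: a cpu only has to remember the contribution it reported last time and push the difference into one global counter, so nothing ever walks all possible cpus. A toy, single-threaded model of calc_load_fold_active() (an array instead of per-cpu data, a plain long instead of atomic_long_t; all names are local to the sketch):

	#include <stdio.h>

	#define NR_CPUS 4

	struct toy_rq {
		long nr_running;
		long nr_uninterruptible;
		long calc_load_active;	/* contribution reported last time */
	};

	static struct toy_rq cpu_rq[NR_CPUS];
	static long calc_load_tasks;	/* global accumulator (atomic in the kernel) */

	/* report only the change since the last fold; repeated folds are harmless */
	static long fold_active(struct toy_rq *rq)
	{
		long nr_active = rq->nr_running + rq->nr_uninterruptible;
		long delta = nr_active - rq->calc_load_active;

		rq->calc_load_active = nr_active;
		return delta;
	}

	int main(void)
	{
		cpu_rq[0].nr_running = 2;
		cpu_rq[1].nr_uninterruptible = 1;

		for (int cpu = 0; cpu < NR_CPUS; cpu++)
			calc_load_tasks += fold_active(&cpu_rq[cpu]);
		printf("after first fold:  %ld\n", calc_load_tasks);	/* 3 */

		cpu_rq[0].nr_running = 1;	/* one task finished on cpu 0 */
		for (int cpu = 0; cpu < NR_CPUS; cpu++)
			calc_load_tasks += fold_active(&cpu_rq[cpu]);
		printf("after second fold: %ld\n", calc_load_tasks);	/* 2 */
		return 0;
	}
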
2165 | /* Variables and functions for calc_load */ | 2210 | /* Variables and functions for calc_load */ |
2166 | static atomic_long_t calc_load_tasks; | 2211 | static atomic_long_t calc_load_tasks; |
2167 | static unsigned long calc_load_update; | 2212 | static unsigned long calc_load_update; |
2168 | unsigned long avenrun[3]; | 2213 | unsigned long avenrun[3]; |
2169 | EXPORT_SYMBOL(avenrun); | 2214 | EXPORT_SYMBOL(avenrun); /* should be removed */ |
2215 | |||
2216 | /** | ||
2217 | * get_avenrun - get the load average array | ||
2218 | * @loads: pointer to dest load array | ||
2219 | * @offset: offset to add | ||
2220 | * @shift: shift count to shift the result left | ||
2221 | * | ||
2222 | * These values are estimates at best, so no need for locking. | ||
2223 | */ | ||
2224 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
2225 | { | ||
2226 | loads[0] = (avenrun[0] + offset) << shift; | ||
2227 | loads[1] = (avenrun[1] + offset) << shift; | ||
2228 | loads[2] = (avenrun[2] + offset) << shift; | ||
2229 | } | ||
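
get_avenrun() deliberately hands out raw fixed-point values; it is the reader's job to add the rounding offset and split them into integer and fractional parts. A hedged sketch of roughly how a /proc/loadavg-style consumer formats them (the FSHIFT/FIXED_1 values and the LOAD_INT/LOAD_FRAC helpers mirror include/linux/sched.h of this era; treat the exact numbers as assumptions):

	#include <stdio.h>

	#define FSHIFT   11			/* bits of fixed-point precision */
	#define FIXED_1  (1 << FSHIFT)		/* 1.0 in fixed point */

	#define LOAD_INT(x)  ((x) >> FSHIFT)
	#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

	int main(void)
	{
		/* pretend get_avenrun(avnrun, FIXED_1/200, 0) filled this in */
		unsigned long avnrun[3] = {
			860 + FIXED_1 / 200,	/* ~0.42 */
			2048 + FIXED_1 / 200,	/* ~1.00 */
			5120 + FIXED_1 / 200,	/* ~2.50 */
		};

		printf("%lu.%02lu %lu.%02lu %lu.%02lu\n",
		       LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
		       LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
		       LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));
		return 0;
	}
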
2170 | 2230 | ||
2171 | static long calc_load_fold_active(struct rq *this_rq) | 2231 | static long calc_load_fold_active(struct rq *this_rq) |
2172 | { | 2232 | { |
@@ -2183,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq) | |||
2183 | return delta; | 2243 | return delta; |
2184 | } | 2244 | } |
2185 | 2245 | ||
2246 | /* | ||
2247 | * a1 = a0 * e + a * (1 - e) | ||
2248 | */ | ||
2186 | static unsigned long | 2249 | static unsigned long |
2187 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | 2250 | calc_load(unsigned long load, unsigned long exp, unsigned long active) |
2188 | { | 2251 | { |
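
calc_load() evaluates a1 = a0*e + a*(1 - e) entirely in 11-bit fixed point, once per LOAD_FREQ interval and once per avenrun[] slot. A self-contained sketch of that step; the EXP_1/EXP_5/EXP_15 constants are the usual exp(-5s/1min), exp(-5s/5min), exp(-5s/15min) factors scaled by 2048 (values assumed from include/linux/sched.h, they are not visible in this hunk):

	#include <stdio.h>

	#define FSHIFT   11
	#define FIXED_1  (1UL << FSHIFT)
	#define EXP_1    1884	/* 1/exp(5s/1min)  in fixed point */
	#define EXP_5    2014	/* 1/exp(5s/5min)  */
	#define EXP_15   2037	/* 1/exp(5s/15min) */

	/* a1 = a0 * e + a * (1 - e), all in fixed point */
	static unsigned long
	calc_load(unsigned long load, unsigned long exp, unsigned long active)
	{
		load *= exp;
		load += active * (FIXED_1 - exp);
		return load >> FSHIFT;
	}

	int main(void)
	{
		unsigned long avenrun[3] = { 0, 0, 0 };
		unsigned long active = 2 * FIXED_1;	/* 2 runnable tasks, fixed point */

		/* feed ten 5-second samples; the averages creep towards 2.0 */
		for (int i = 0; i < 10; i++) {
			avenrun[0] = calc_load(avenrun[0], EXP_1, active);
			avenrun[1] = calc_load(avenrun[1], EXP_5, active);
			avenrun[2] = calc_load(avenrun[2], EXP_15, active);
		}
		printf("%.2f %.2f %.2f\n",
		       avenrun[0] / (double)FIXED_1,
		       avenrun[1] / (double)FIXED_1,
		       avenrun[2] / (double)FIXED_1);
		return 0;
	}
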
@@ -2194,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
2194 | 2257 | ||
2195 | #ifdef CONFIG_NO_HZ | 2258 | #ifdef CONFIG_NO_HZ |
2196 | /* | 2259 | /* |
2197 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. | 2260 | * Handle NO_HZ for the global load-average. |
2261 | * | ||
2262 | * Since the above described distributed algorithm to compute the global | ||
2263 | * load-average relies on per-cpu sampling from the tick, it is affected by | ||
2264 | * NO_HZ. | ||
2265 | * | ||
2266 | * The basic idea is to fold the nr_active delta into a global idle-delta upon | ||
2267 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | ||
2268 | * when we read the global state. | ||
2269 | * | ||
2270 | * Obviously reality has to ruin such a delightfully simple scheme: | ||
2271 | * | ||
2272 | * - When we go NO_HZ idle during the window, we can negate our sample | ||
2273 | * contribution, causing under-accounting. | ||
2274 | * | ||
2275 | * We avoid this by keeping two idle-delta counters and flipping them | ||
2276 | * when the window starts, thus separating old and new NO_HZ load. | ||
2277 | * | ||
2278 | * The only trick is the slight shift in index flip for read vs write. | ||
2279 | * | ||
2280 | * 0s 5s 10s 15s | ||
2281 | * +10 +10 +10 +10 | ||
2282 | * |-|-----------|-|-----------|-|-----------|-| | ||
2283 | * r:0 0 1 1 0 0 1 1 0 | ||
2284 | * w:0 1 1 0 0 1 1 0 0 | ||
2285 | * | ||
2286 | * This ensures we'll fold the old idle contribution in this window while | ||
2287 | * accumulating the new one. | ||
2288 | * | ||
2289 | * - When we wake up from NO_HZ idle during the window, we push up our | ||
2290 | * contribution, since we effectively move our sample point to a known | ||
2291 | * busy state. | ||
2292 | * | ||
2293 | * This is solved by pushing the window forward, and thus skipping the | ||
2294 | * sample, for this cpu (effectively using the idle-delta for this cpu which | ||
2295 | * was in effect at the time the window opened). This also solves the issue | ||
2296 | * of having to deal with a cpu having been in NOHZ idle for multiple | ||
2297 | * LOAD_FREQ intervals. | ||
2198 | * | 2298 | * |
2199 | * When making the ILB scale, we should try to pull this in as well. | 2299 | * When making the ILB scale, we should try to pull this in as well. |
2200 | */ | 2300 | */ |
2201 | static atomic_long_t calc_load_tasks_idle; | 2301 | static atomic_long_t calc_load_idle[2]; |
2302 | static int calc_load_idx; | ||
2202 | 2303 | ||
2203 | void calc_load_account_idle(struct rq *this_rq) | 2304 | static inline int calc_load_write_idx(void) |
2204 | { | 2305 | { |
2306 | int idx = calc_load_idx; | ||
2307 | |||
2308 | /* | ||
2309 | * See calc_global_nohz(), if we observe the new index, we also | ||
2310 | * need to observe the new update time. | ||
2311 | */ | ||
2312 | smp_rmb(); | ||
2313 | |||
2314 | /* | ||
2315 | * If the folding window started, make sure we start writing in the | ||
2316 | * next idle-delta. | ||
2317 | */ | ||
2318 | if (!time_before(jiffies, calc_load_update)) | ||
2319 | idx++; | ||
2320 | |||
2321 | return idx & 1; | ||
2322 | } | ||
2323 | |||
2324 | static inline int calc_load_read_idx(void) | ||
2325 | { | ||
2326 | return calc_load_idx & 1; | ||
2327 | } | ||
2328 | |||
2329 | void calc_load_enter_idle(void) | ||
2330 | { | ||
2331 | struct rq *this_rq = this_rq(); | ||
2205 | long delta; | 2332 | long delta; |
2206 | 2333 | ||
2334 | /* | ||
2335 | * We're going into NOHZ mode, if there's any pending delta, fold it | ||
2336 | * into the pending idle delta. | ||
2337 | */ | ||
2207 | delta = calc_load_fold_active(this_rq); | 2338 | delta = calc_load_fold_active(this_rq); |
2208 | if (delta) | 2339 | if (delta) { |
2209 | atomic_long_add(delta, &calc_load_tasks_idle); | 2340 | int idx = calc_load_write_idx(); |
2341 | atomic_long_add(delta, &calc_load_idle[idx]); | ||
2342 | } | ||
2210 | } | 2343 | } |
2211 | 2344 | ||
2212 | static long calc_load_fold_idle(void) | 2345 | void calc_load_exit_idle(void) |
2213 | { | 2346 | { |
2214 | long delta = 0; | 2347 | struct rq *this_rq = this_rq(); |
2215 | 2348 | ||
2216 | /* | 2349 | /* |
2217 | * Its got a race, we don't care... | 2350 | * If we're still before the sample window, we're done. |
2218 | */ | 2351 | */ |
2219 | if (atomic_long_read(&calc_load_tasks_idle)) | 2352 | if (time_before(jiffies, this_rq->calc_load_update)) |
2220 | delta = atomic_long_xchg(&calc_load_tasks_idle, 0); | 2353 | return; |
2354 | |||
2355 | /* | ||
2356 | * We woke inside or after the sample window, this means we're already | ||
2357 | * accounted through the nohz accounting, so skip the entire deal and | ||
2358 | * sync up for the next window. | ||
2359 | */ | ||
2360 | this_rq->calc_load_update = calc_load_update; | ||
2361 | if (time_before(jiffies, this_rq->calc_load_update + 10)) | ||
2362 | this_rq->calc_load_update += LOAD_FREQ; | ||
2363 | } | ||
2364 | |||
2365 | static long calc_load_fold_idle(void) | ||
2366 | { | ||
2367 | int idx = calc_load_read_idx(); | ||
2368 | long delta = 0; | ||
2369 | |||
2370 | if (atomic_long_read(&calc_load_idle[idx])) | ||
2371 | delta = atomic_long_xchg(&calc_load_idle[idx], 0); | ||
2221 | 2372 | ||
2222 | return delta; | 2373 | return delta; |
2223 | } | 2374 | } |
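
The calc_load_idle[0..1] pair plus the read/write index shift is what keeps a cpu that goes idle *inside* the 10-tick sample window from contaminating the sample currently being taken: writers that observe an open window spill into the next bucket, while the reader always drains the previous one. A toy single-threaded model of that flip (jiffies and LOAD_FREQ shrunk to small integers; memory barriers omitted; all names are sketch-local):

	#include <stdio.h>

	#define LOAD_FREQ 50			/* toy window length, in "ticks" */

	static long idle_delta[2];		/* models calc_load_idle[]  */
	static int  load_idx;			/* models calc_load_idx     */
	static unsigned long load_update = 50;	/* models calc_load_update  */
	static long tasks;			/* models calc_load_tasks   */

	static int write_idx(unsigned long now)
	{
		int idx = load_idx;

		/* once the fold window opened, new idle deltas go to the next bucket */
		if (now >= load_update)
			idx++;
		return idx & 1;
	}

	static void enter_idle(unsigned long now, long delta)
	{
		idle_delta[write_idx(now)] += delta;
	}

	static void global_sample(unsigned long now)
	{
		int idx = load_idx & 1;		/* the read side uses the old bucket */

		tasks += idle_delta[idx];
		idle_delta[idx] = 0;
		load_update += LOAD_FREQ;
		load_idx++;			/* flip for the next window */
		printf("t=%lu sample sees %ld tasks\n", now, tasks);
	}

	int main(void)
	{
		enter_idle(20, 3);	/* idle well before the window: old bucket */
		enter_idle(51, 2);	/* idle inside the window: next bucket      */
		global_sample(60);	/* folds only the '3', the '2' waits        */
		global_sample(110);	/* now the '2' is included                  */
		return 0;
	}
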
@@ -2303,66 +2454,39 @@ static void calc_global_nohz(void) | |||
2303 | { | 2454 | { |
2304 | long delta, active, n; | 2455 | long delta, active, n; |
2305 | 2456 | ||
2306 | /* | 2457 | if (!time_before(jiffies, calc_load_update + 10)) { |
2307 | * If we crossed a calc_load_update boundary, make sure to fold | 2458 | /* |
2308 | * any pending idle changes, the respective CPUs might have | 2459 | * Catch-up, fold however many we are behind still |
2309 | * missed the tick driven calc_load_account_active() update | 2460 | */ |
2310 | * due to NO_HZ. | 2461 | delta = jiffies - calc_load_update - 10; |
2311 | */ | 2462 | n = 1 + (delta / LOAD_FREQ); |
2312 | delta = calc_load_fold_idle(); | ||
2313 | if (delta) | ||
2314 | atomic_long_add(delta, &calc_load_tasks); | ||
2315 | 2463 | ||
2316 | /* | 2464 | active = atomic_long_read(&calc_load_tasks); |
2317 | * It could be the one fold was all it took, we done! | 2465 | active = active > 0 ? active * FIXED_1 : 0; |
2318 | */ | ||
2319 | if (time_before(jiffies, calc_load_update + 10)) | ||
2320 | return; | ||
2321 | 2466 | ||
2322 | /* | 2467 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); |
2323 | * Catch-up, fold however many we are behind still | 2468 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); |
2324 | */ | 2469 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); |
2325 | delta = jiffies - calc_load_update - 10; | ||
2326 | n = 1 + (delta / LOAD_FREQ); | ||
2327 | 2470 | ||
2328 | active = atomic_long_read(&calc_load_tasks); | 2471 | calc_load_update += n * LOAD_FREQ; |
2329 | active = active > 0 ? active * FIXED_1 : 0; | 2472 | } |
2330 | |||
2331 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
2332 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
2333 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
2334 | |||
2335 | calc_load_update += n * LOAD_FREQ; | ||
2336 | } | ||
2337 | #else | ||
2338 | void calc_load_account_idle(struct rq *this_rq) | ||
2339 | { | ||
2340 | } | ||
2341 | 2473 | ||
2342 | static inline long calc_load_fold_idle(void) | 2474 | /* |
2343 | { | 2475 | * Flip the idle index... |
2344 | return 0; | 2476 | * |
2477 | * Make sure we first write the new time then flip the index, so that | ||
2478 | * calc_load_write_idx() will see the new time when it reads the new | ||
2479 | * index, this avoids a double flip messing things up. | ||
2480 | */ | ||
2481 | smp_wmb(); | ||
2482 | calc_load_idx++; | ||
2345 | } | 2483 | } |
2484 | #else /* !CONFIG_NO_HZ */ | ||
2346 | 2485 | ||
2347 | static void calc_global_nohz(void) | 2486 | static inline long calc_load_fold_idle(void) { return 0; } |
2348 | { | 2487 | static inline void calc_global_nohz(void) { } |
2349 | } | ||
2350 | #endif | ||
2351 | 2488 | ||
2352 | /** | 2489 | #endif /* CONFIG_NO_HZ */ |
2353 | * get_avenrun - get the load average array | ||
2354 | * @loads: pointer to dest load array | ||
2355 | * @offset: offset to add | ||
2356 | * @shift: shift count to shift the result left | ||
2357 | * | ||
2358 | * These values are estimates at best, so no need for locking. | ||
2359 | */ | ||
2360 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
2361 | { | ||
2362 | loads[0] = (avenrun[0] + offset) << shift; | ||
2363 | loads[1] = (avenrun[1] + offset) << shift; | ||
2364 | loads[2] = (avenrun[2] + offset) << shift; | ||
2365 | } | ||
2366 | 2490 | ||
2367 | /* | 2491 | /* |
2368 | * calc_load - update the avenrun load estimates 10 ticks after the | 2492 | * calc_load - update the avenrun load estimates 10 ticks after the |
@@ -2370,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | |||
2370 | */ | 2494 | */ |
2371 | void calc_global_load(unsigned long ticks) | 2495 | void calc_global_load(unsigned long ticks) |
2372 | { | 2496 | { |
2373 | long active; | 2497 | long active, delta; |
2374 | 2498 | ||
2375 | if (time_before(jiffies, calc_load_update + 10)) | 2499 | if (time_before(jiffies, calc_load_update + 10)) |
2376 | return; | 2500 | return; |
2377 | 2501 | ||
2502 | /* | ||
2503 | * Fold the 'old' idle-delta to include all NO_HZ cpus. | ||
2504 | */ | ||
2505 | delta = calc_load_fold_idle(); | ||
2506 | if (delta) | ||
2507 | atomic_long_add(delta, &calc_load_tasks); | ||
2508 | |||
2378 | active = atomic_long_read(&calc_load_tasks); | 2509 | active = atomic_long_read(&calc_load_tasks); |
2379 | active = active > 0 ? active * FIXED_1 : 0; | 2510 | active = active > 0 ? active * FIXED_1 : 0; |
2380 | 2511 | ||
@@ -2385,12 +2516,7 @@ void calc_global_load(unsigned long ticks) | |||
2385 | calc_load_update += LOAD_FREQ; | 2516 | calc_load_update += LOAD_FREQ; |
2386 | 2517 | ||
2387 | /* | 2518 | /* |
2388 | * Account one period with whatever state we found before | 2519 | * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. |
2389 | * folding in the nohz state and ageing the entire idle period. | ||
2390 | * | ||
2391 | * This avoids loosing a sample when we go idle between | ||
2392 | * calc_load_account_active() (10 ticks ago) and now and thus | ||
2393 | * under-accounting. | ||
2394 | */ | 2520 | */ |
2395 | calc_global_nohz(); | 2521 | calc_global_nohz(); |
2396 | } | 2522 | } |
@@ -2407,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq) | |||
2407 | return; | 2533 | return; |
2408 | 2534 | ||
2409 | delta = calc_load_fold_active(this_rq); | 2535 | delta = calc_load_fold_active(this_rq); |
2410 | delta += calc_load_fold_idle(); | ||
2411 | if (delta) | 2536 | if (delta) |
2412 | atomic_long_add(delta, &calc_load_tasks); | 2537 | atomic_long_add(delta, &calc_load_tasks); |
2413 | 2538 | ||
@@ -2415,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq) | |||
2415 | } | 2540 | } |
2416 | 2541 | ||
2417 | /* | 2542 | /* |
2543 | * End of global load-average stuff | ||
2544 | */ | ||
2545 | |||
2546 | /* | ||
2418 | * The exact cpuload at various idx values, calculated at every tick would be | 2547 | * The exact cpuload at various idx values, calculated at every tick would be |
2419 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | 2548 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load |
2420 | * | 2549 | * |
@@ -2486,22 +2615,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | |||
2486 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | 2615 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called |
2487 | * every tick. We fix it up based on jiffies. | 2616 | * every tick. We fix it up based on jiffies. |
2488 | */ | 2617 | */ |
2489 | void update_cpu_load(struct rq *this_rq) | 2618 | static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, |
2619 | unsigned long pending_updates) | ||
2490 | { | 2620 | { |
2491 | unsigned long this_load = this_rq->load.weight; | ||
2492 | unsigned long curr_jiffies = jiffies; | ||
2493 | unsigned long pending_updates; | ||
2494 | int i, scale; | 2621 | int i, scale; |
2495 | 2622 | ||
2496 | this_rq->nr_load_updates++; | 2623 | this_rq->nr_load_updates++; |
2497 | 2624 | ||
2498 | /* Avoid repeated calls on same jiffy, when moving in and out of idle */ | ||
2499 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
2500 | return; | ||
2501 | |||
2502 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2503 | this_rq->last_load_update_tick = curr_jiffies; | ||
2504 | |||
2505 | /* Update our load: */ | 2625 | /* Update our load: */ |
2506 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ | 2626 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ |
2507 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | 2627 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { |
@@ -2526,9 +2646,78 @@ void update_cpu_load(struct rq *this_rq) | |||
2526 | sched_avg_update(this_rq); | 2646 | sched_avg_update(this_rq); |
2527 | } | 2647 | } |
2528 | 2648 | ||
2649 | #ifdef CONFIG_NO_HZ | ||
2650 | /* | ||
2651 | * There is no sane way to deal with nohz on smp when using jiffies because the | ||
2652 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | ||
2653 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | ||
2654 | * | ||
2655 | * Therefore we cannot use the delta approach from the regular tick since that | ||
2656 | * would seriously skew the load calculation. However we'll make do for those | ||
2657 | * updates happening while idle (nohz_idle_balance) or coming out of idle | ||
2658 | * (tick_nohz_idle_exit). | ||
2659 | * | ||
2660 | * This means we might still be one tick off for nohz periods. | ||
2661 | */ | ||
2662 | |||
2663 | /* | ||
2664 | * Called from nohz_idle_balance() to update the load ratings before doing the | ||
2665 | * idle balance. | ||
2666 | */ | ||
2667 | void update_idle_cpu_load(struct rq *this_rq) | ||
2668 | { | ||
2669 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
2670 | unsigned long load = this_rq->load.weight; | ||
2671 | unsigned long pending_updates; | ||
2672 | |||
2673 | /* | ||
2674 | * bail if there's load or we're actually up-to-date. | ||
2675 | */ | ||
2676 | if (load || curr_jiffies == this_rq->last_load_update_tick) | ||
2677 | return; | ||
2678 | |||
2679 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2680 | this_rq->last_load_update_tick = curr_jiffies; | ||
2681 | |||
2682 | __update_cpu_load(this_rq, load, pending_updates); | ||
2683 | } | ||
2684 | |||
2685 | /* | ||
2686 | * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. | ||
2687 | */ | ||
2688 | void update_cpu_load_nohz(void) | ||
2689 | { | ||
2690 | struct rq *this_rq = this_rq(); | ||
2691 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
2692 | unsigned long pending_updates; | ||
2693 | |||
2694 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
2695 | return; | ||
2696 | |||
2697 | raw_spin_lock(&this_rq->lock); | ||
2698 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2699 | if (pending_updates) { | ||
2700 | this_rq->last_load_update_tick = curr_jiffies; | ||
2701 | /* | ||
2702 | * We were idle, this means load 0, the current load might be | ||
2703 | * !0 due to remote wakeups and the like. | ||
2704 | */ | ||
2705 | __update_cpu_load(this_rq, 0, pending_updates); | ||
2706 | } | ||
2707 | raw_spin_unlock(&this_rq->lock); | ||
2708 | } | ||
2709 | #endif /* CONFIG_NO_HZ */ | ||
2710 | |||
2711 | /* | ||
2712 | * Called from scheduler_tick() | ||
2713 | */ | ||
2529 | static void update_cpu_load_active(struct rq *this_rq) | 2714 | static void update_cpu_load_active(struct rq *this_rq) |
2530 | { | 2715 | { |
2531 | update_cpu_load(this_rq); | 2716 | /* |
2717 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). | ||
2718 | */ | ||
2719 | this_rq->last_load_update_tick = jiffies; | ||
2720 | __update_cpu_load(this_rq, this_rq->load.weight, 1); | ||
2532 | 2721 | ||
2533 | calc_load_account_active(this_rq); | 2722 | calc_load_account_active(this_rq); |
2534 | } | 2723 | } |
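
__update_cpu_load() now receives a pending_updates count so the two NO_HZ paths above can decay cpu_load[] for every tick that passed while the cpu slept, instead of pretending only one tick went by. Following the "load = (2^idx - 1)/2^idx * load + 1/2^idx * cur_load" rule quoted earlier in this file, n missed ticks at zero load simply multiply the old value by ((2^idx - 1)/2^idx)^n; the kernel's decay_load_missed() uses precomputed tables, the sketch below does it the slow, obvious way (a model under those assumptions, not the kernel code):

	#include <stdio.h>

	#define CPU_LOAD_IDX_MAX 5

	/* decay 'load' as if it had seen 'missed' ticks of zero load at index 'idx' */
	static unsigned long
	decay_load_missed_slow(unsigned long load, unsigned long missed, int idx)
	{
		while (missed--)
			load -= load >> idx;	/* ~ load *= (2^idx - 1) / 2^idx */
		return load;
	}

	static void
	update_cpu_load_model(unsigned long cpu_load[], unsigned long this_load,
			      unsigned long pending_updates)
	{
		cpu_load[0] = this_load;	/* idx 0 tracks the current load directly */
		for (int i = 1; i < CPU_LOAD_IDX_MAX; i++) {
			unsigned long old = decay_load_missed_slow(cpu_load[i],
								   pending_updates - 1, i);
			/* one regular step with the current load */
			cpu_load[i] = (old * ((1UL << i) - 1) + this_load) >> i;
		}
	}

	int main(void)
	{
		unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 0, 1024, 1024, 1024, 1024 };

		/* cpu slept for 8 ticks, wakes up with a current load of 512 */
		update_cpu_load_model(cpu_load, 512, 8);

		for (int i = 0; i < CPU_LOAD_IDX_MAX; i++)
			printf("cpu_load[%d] = %lu\n", i, cpu_load[i]);
		return 0;
	}
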
@@ -3113,6 +3302,7 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
3113 | if (irqs_disabled()) | 3302 | if (irqs_disabled()) |
3114 | print_irqtrace_events(prev); | 3303 | print_irqtrace_events(prev); |
3115 | dump_stack(); | 3304 | dump_stack(); |
3305 | add_taint(TAINT_WARN); | ||
3116 | } | 3306 | } |
3117 | 3307 | ||
3118 | /* | 3308 | /* |
@@ -4042,11 +4232,8 @@ static bool check_same_owner(struct task_struct *p) | |||
4042 | 4232 | ||
4043 | rcu_read_lock(); | 4233 | rcu_read_lock(); |
4044 | pcred = __task_cred(p); | 4234 | pcred = __task_cred(p); |
4045 | if (cred->user->user_ns == pcred->user->user_ns) | 4235 | match = (uid_eq(cred->euid, pcred->euid) || |
4046 | match = (cred->euid == pcred->euid || | 4236 | uid_eq(cred->euid, pcred->uid)); |
4047 | cred->euid == pcred->uid); | ||
4048 | else | ||
4049 | match = false; | ||
4050 | rcu_read_unlock(); | 4237 | rcu_read_unlock(); |
4051 | return match; | 4238 | return match; |
4052 | } | 4239 | } |
@@ -4957,7 +5144,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
4957 | p->sched_class->set_cpus_allowed(p, new_mask); | 5144 | p->sched_class->set_cpus_allowed(p, new_mask); |
4958 | 5145 | ||
4959 | cpumask_copy(&p->cpus_allowed, new_mask); | 5146 | cpumask_copy(&p->cpus_allowed, new_mask); |
4960 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | 5147 | p->nr_cpus_allowed = cpumask_weight(new_mask); |
4961 | } | 5148 | } |
4962 | 5149 | ||
4963 | /* | 5150 | /* |
@@ -5499,15 +5686,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ | |||
5499 | 5686 | ||
5500 | #ifdef CONFIG_SCHED_DEBUG | 5687 | #ifdef CONFIG_SCHED_DEBUG |
5501 | 5688 | ||
5502 | static __read_mostly int sched_domain_debug_enabled; | 5689 | static __read_mostly int sched_debug_enabled; |
5503 | 5690 | ||
5504 | static int __init sched_domain_debug_setup(char *str) | 5691 | static int __init sched_debug_setup(char *str) |
5505 | { | 5692 | { |
5506 | sched_domain_debug_enabled = 1; | 5693 | sched_debug_enabled = 1; |
5507 | 5694 | ||
5508 | return 0; | 5695 | return 0; |
5509 | } | 5696 | } |
5510 | early_param("sched_debug", sched_domain_debug_setup); | 5697 | early_param("sched_debug", sched_debug_setup); |
5698 | |||
5699 | static inline bool sched_debug(void) | ||
5700 | { | ||
5701 | return sched_debug_enabled; | ||
5702 | } | ||
5511 | 5703 | ||
5512 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | 5704 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
5513 | struct cpumask *groupmask) | 5705 | struct cpumask *groupmask) |
@@ -5547,7 +5739,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5547 | break; | 5739 | break; |
5548 | } | 5740 | } |
5549 | 5741 | ||
5550 | if (!group->sgp->power) { | 5742 | /* |
5743 | * Even though we initialize ->power to something semi-sane, | ||
5744 | * we leave power_orig unset. This allows us to detect if | ||
5745 | * domain iteration is still funny without causing /0 traps. | ||
5746 | */ | ||
5747 | if (!group->sgp->power_orig) { | ||
5551 | printk(KERN_CONT "\n"); | 5748 | printk(KERN_CONT "\n"); |
5552 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 5749 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
5553 | "set\n"); | 5750 | "set\n"); |
@@ -5560,7 +5757,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5560 | break; | 5757 | break; |
5561 | } | 5758 | } |
5562 | 5759 | ||
5563 | if (cpumask_intersects(groupmask, sched_group_cpus(group))) { | 5760 | if (!(sd->flags & SD_OVERLAP) && |
5761 | cpumask_intersects(groupmask, sched_group_cpus(group))) { | ||
5564 | printk(KERN_CONT "\n"); | 5762 | printk(KERN_CONT "\n"); |
5565 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 5763 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
5566 | break; | 5764 | break; |
@@ -5594,7 +5792,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5594 | { | 5792 | { |
5595 | int level = 0; | 5793 | int level = 0; |
5596 | 5794 | ||
5597 | if (!sched_domain_debug_enabled) | 5795 | if (!sched_debug_enabled) |
5598 | return; | 5796 | return; |
5599 | 5797 | ||
5600 | if (!sd) { | 5798 | if (!sd) { |
@@ -5615,6 +5813,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5615 | } | 5813 | } |
5616 | #else /* !CONFIG_SCHED_DEBUG */ | 5814 | #else /* !CONFIG_SCHED_DEBUG */ |
5617 | # define sched_domain_debug(sd, cpu) do { } while (0) | 5815 | # define sched_domain_debug(sd, cpu) do { } while (0) |
5816 | static inline bool sched_debug(void) | ||
5817 | { | ||
5818 | return false; | ||
5819 | } | ||
5618 | #endif /* CONFIG_SCHED_DEBUG */ | 5820 | #endif /* CONFIG_SCHED_DEBUG */ |
5619 | 5821 | ||
5620 | static int sd_degenerate(struct sched_domain *sd) | 5822 | static int sd_degenerate(struct sched_domain *sd) |
@@ -5898,99 +6100,11 @@ static int __init isolated_cpu_setup(char *str) | |||
5898 | 6100 | ||
5899 | __setup("isolcpus=", isolated_cpu_setup); | 6101 | __setup("isolcpus=", isolated_cpu_setup); |
5900 | 6102 | ||
5901 | #ifdef CONFIG_NUMA | ||
5902 | |||
5903 | /** | ||
5904 | * find_next_best_node - find the next node to include in a sched_domain | ||
5905 | * @node: node whose sched_domain we're building | ||
5906 | * @used_nodes: nodes already in the sched_domain | ||
5907 | * | ||
5908 | * Find the next node to include in a given scheduling domain. Simply | ||
5909 | * finds the closest node not already in the @used_nodes map. | ||
5910 | * | ||
5911 | * Should use nodemask_t. | ||
5912 | */ | ||
5913 | static int find_next_best_node(int node, nodemask_t *used_nodes) | ||
5914 | { | ||
5915 | int i, n, val, min_val, best_node = -1; | ||
5916 | |||
5917 | min_val = INT_MAX; | ||
5918 | |||
5919 | for (i = 0; i < nr_node_ids; i++) { | ||
5920 | /* Start at @node */ | ||
5921 | n = (node + i) % nr_node_ids; | ||
5922 | |||
5923 | if (!nr_cpus_node(n)) | ||
5924 | continue; | ||
5925 | |||
5926 | /* Skip already used nodes */ | ||
5927 | if (node_isset(n, *used_nodes)) | ||
5928 | continue; | ||
5929 | |||
5930 | /* Simple min distance search */ | ||
5931 | val = node_distance(node, n); | ||
5932 | |||
5933 | if (val < min_val) { | ||
5934 | min_val = val; | ||
5935 | best_node = n; | ||
5936 | } | ||
5937 | } | ||
5938 | |||
5939 | if (best_node != -1) | ||
5940 | node_set(best_node, *used_nodes); | ||
5941 | return best_node; | ||
5942 | } | ||
5943 | |||
5944 | /** | ||
5945 | * sched_domain_node_span - get a cpumask for a node's sched_domain | ||
5946 | * @node: node whose cpumask we're constructing | ||
5947 | * @span: resulting cpumask | ||
5948 | * | ||
5949 | * Given a node, construct a good cpumask for its sched_domain to span. It | ||
5950 | * should be one that prevents unnecessary balancing, but also spreads tasks | ||
5951 | * out optimally. | ||
5952 | */ | ||
5953 | static void sched_domain_node_span(int node, struct cpumask *span) | ||
5954 | { | ||
5955 | nodemask_t used_nodes; | ||
5956 | int i; | ||
5957 | |||
5958 | cpumask_clear(span); | ||
5959 | nodes_clear(used_nodes); | ||
5960 | |||
5961 | cpumask_or(span, span, cpumask_of_node(node)); | ||
5962 | node_set(node, used_nodes); | ||
5963 | |||
5964 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | ||
5965 | int next_node = find_next_best_node(node, &used_nodes); | ||
5966 | if (next_node < 0) | ||
5967 | break; | ||
5968 | cpumask_or(span, span, cpumask_of_node(next_node)); | ||
5969 | } | ||
5970 | } | ||
5971 | |||
5972 | static const struct cpumask *cpu_node_mask(int cpu) | ||
5973 | { | ||
5974 | lockdep_assert_held(&sched_domains_mutex); | ||
5975 | |||
5976 | sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); | ||
5977 | |||
5978 | return sched_domains_tmpmask; | ||
5979 | } | ||
5980 | |||
5981 | static const struct cpumask *cpu_allnodes_mask(int cpu) | ||
5982 | { | ||
5983 | return cpu_possible_mask; | ||
5984 | } | ||
5985 | #endif /* CONFIG_NUMA */ | ||
5986 | |||
5987 | static const struct cpumask *cpu_cpu_mask(int cpu) | 6103 | static const struct cpumask *cpu_cpu_mask(int cpu) |
5988 | { | 6104 | { |
5989 | return cpumask_of_node(cpu_to_node(cpu)); | 6105 | return cpumask_of_node(cpu_to_node(cpu)); |
5990 | } | 6106 | } |
5991 | 6107 | ||
5992 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | ||
5993 | |||
5994 | struct sd_data { | 6108 | struct sd_data { |
5995 | struct sched_domain **__percpu sd; | 6109 | struct sched_domain **__percpu sd; |
5996 | struct sched_group **__percpu sg; | 6110 | struct sched_group **__percpu sg; |
@@ -6020,9 +6134,48 @@ struct sched_domain_topology_level { | |||
6020 | sched_domain_init_f init; | 6134 | sched_domain_init_f init; |
6021 | sched_domain_mask_f mask; | 6135 | sched_domain_mask_f mask; |
6022 | int flags; | 6136 | int flags; |
6137 | int numa_level; | ||
6023 | struct sd_data data; | 6138 | struct sd_data data; |
6024 | }; | 6139 | }; |
6025 | 6140 | ||
6141 | /* | ||
6142 | * Build an iteration mask that can exclude certain CPUs from the upwards | ||
6143 | * domain traversal. | ||
6144 | * | ||
6145 | * Asymmetric node setups can result in situations where the domain tree is of | ||
6146 | * unequal depth, make sure to skip domains that already cover the entire | ||
6147 | * range. | ||
6148 | * | ||
6149 | * In that case build_sched_domains() will have terminated the iteration early | ||
6150 | * and our sibling sd spans will be empty. Domains should always include the | ||
6151 | * cpu they're built on, so check that. | ||
6152 | * | ||
6153 | */ | ||
6154 | static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) | ||
6155 | { | ||
6156 | const struct cpumask *span = sched_domain_span(sd); | ||
6157 | struct sd_data *sdd = sd->private; | ||
6158 | struct sched_domain *sibling; | ||
6159 | int i; | ||
6160 | |||
6161 | for_each_cpu(i, span) { | ||
6162 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
6163 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
6164 | continue; | ||
6165 | |||
6166 | cpumask_set_cpu(i, sched_group_mask(sg)); | ||
6167 | } | ||
6168 | } | ||
6169 | |||
6170 | /* | ||
6171 | * Return the canonical balance cpu for this group, this is the first cpu | ||
6172 | * of this group that's also in the iteration mask. | ||
6173 | */ | ||
6174 | int group_balance_cpu(struct sched_group *sg) | ||
6175 | { | ||
6176 | return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); | ||
6177 | } | ||
6178 | |||
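
group_balance_cpu() picks the first cpu that sits in both the group's span and the newly introduced iteration mask, so exactly one cpu per (possibly overlapping) group ends up doing the balancing work. A toy model with cpumasks shrunk to a single 64-bit word (all names invented for the sketch; it leans on the GCC/clang __builtin_ctzl builtin):

	#include <stdio.h>

	/* toy "cpumask": one bit per cpu, cpu 0 is bit 0 */
	struct toy_group {
		unsigned long span;	/* models sched_group_cpus() */
		unsigned long mask;	/* models sched_group_mask() */
	};

	/* first cpu present in both the span and the iteration mask */
	static int toy_group_balance_cpu(const struct toy_group *sg)
	{
		unsigned long both = sg->span & sg->mask;

		return both ? __builtin_ctzl(both) : -1;
	}

	int main(void)
	{
		/* group spans cpus 4-7, but only 6 and 7 survive the iteration mask */
		struct toy_group sg = { .span = 0xf0, .mask = 0xc0 };

		printf("balance cpu = %d\n", toy_group_balance_cpu(&sg));	/* -> 6 */
		return 0;
	}
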
6026 | static int | 6179 | static int |
6027 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) | 6180 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) |
6028 | { | 6181 | { |
@@ -6041,6 +6194,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
6041 | if (cpumask_test_cpu(i, covered)) | 6194 | if (cpumask_test_cpu(i, covered)) |
6042 | continue; | 6195 | continue; |
6043 | 6196 | ||
6197 | child = *per_cpu_ptr(sdd->sd, i); | ||
6198 | |||
6199 | /* See the comment near build_group_mask(). */ | ||
6200 | if (!cpumask_test_cpu(i, sched_domain_span(child))) | ||
6201 | continue; | ||
6202 | |||
6044 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 6203 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
6045 | GFP_KERNEL, cpu_to_node(cpu)); | 6204 | GFP_KERNEL, cpu_to_node(cpu)); |
6046 | 6205 | ||
@@ -6048,8 +6207,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
6048 | goto fail; | 6207 | goto fail; |
6049 | 6208 | ||
6050 | sg_span = sched_group_cpus(sg); | 6209 | sg_span = sched_group_cpus(sg); |
6051 | |||
6052 | child = *per_cpu_ptr(sdd->sd, i); | ||
6053 | if (child->child) { | 6210 | if (child->child) { |
6054 | child = child->child; | 6211 | child = child->child; |
6055 | cpumask_copy(sg_span, sched_domain_span(child)); | 6212 | cpumask_copy(sg_span, sched_domain_span(child)); |
@@ -6058,10 +6215,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
6058 | 6215 | ||
6059 | cpumask_or(covered, covered, sg_span); | 6216 | cpumask_or(covered, covered, sg_span); |
6060 | 6217 | ||
6061 | sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); | 6218 | sg->sgp = *per_cpu_ptr(sdd->sgp, i); |
6062 | atomic_inc(&sg->sgp->ref); | 6219 | if (atomic_inc_return(&sg->sgp->ref) == 1) |
6220 | build_group_mask(sd, sg); | ||
6063 | 6221 | ||
6064 | if (cpumask_test_cpu(cpu, sg_span)) | 6222 | /* |
6223 | * Initialize sgp->power such that even if we mess up the | ||
6224 | * domains and no possible iteration will get us here, we won't | ||
6225 | * die on a /0 trap. | ||
6226 | */ | ||
6227 | sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); | ||
6228 | |||
6229 | /* | ||
6230 | * Make sure the first group of this domain contains the | ||
6231 | * canonical balance cpu. Otherwise the sched_domain iteration | ||
6232 | * breaks. See update_sg_lb_stats(). | ||
6233 | */ | ||
6234 | if ((!groups && cpumask_test_cpu(cpu, sg_span)) || | ||
6235 | group_balance_cpu(sg) == cpu) | ||
6065 | groups = sg; | 6236 | groups = sg; |
6066 | 6237 | ||
6067 | if (!first) | 6238 | if (!first) |
@@ -6135,6 +6306,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) | |||
6135 | 6306 | ||
6136 | cpumask_clear(sched_group_cpus(sg)); | 6307 | cpumask_clear(sched_group_cpus(sg)); |
6137 | sg->sgp->power = 0; | 6308 | sg->sgp->power = 0; |
6309 | cpumask_setall(sched_group_mask(sg)); | ||
6138 | 6310 | ||
6139 | for_each_cpu(j, span) { | 6311 | for_each_cpu(j, span) { |
6140 | if (get_group(j, sdd, NULL) != group) | 6312 | if (get_group(j, sdd, NULL) != group) |
@@ -6176,7 +6348,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6176 | sg = sg->next; | 6348 | sg = sg->next; |
6177 | } while (sg != sd->groups); | 6349 | } while (sg != sd->groups); |
6178 | 6350 | ||
6179 | if (cpu != group_first_cpu(sg)) | 6351 | if (cpu != group_balance_cpu(sg)) |
6180 | return; | 6352 | return; |
6181 | 6353 | ||
6182 | update_group_power(sd, cpu); | 6354 | update_group_power(sd, cpu); |
@@ -6211,10 +6383,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ | |||
6211 | } | 6383 | } |
6212 | 6384 | ||
6213 | SD_INIT_FUNC(CPU) | 6385 | SD_INIT_FUNC(CPU) |
6214 | #ifdef CONFIG_NUMA | ||
6215 | SD_INIT_FUNC(ALLNODES) | ||
6216 | SD_INIT_FUNC(NODE) | ||
6217 | #endif | ||
6218 | #ifdef CONFIG_SCHED_SMT | 6386 | #ifdef CONFIG_SCHED_SMT |
6219 | SD_INIT_FUNC(SIBLING) | 6387 | SD_INIT_FUNC(SIBLING) |
6220 | #endif | 6388 | #endif |
@@ -6230,11 +6398,8 @@ int sched_domain_level_max; | |||
6230 | 6398 | ||
6231 | static int __init setup_relax_domain_level(char *str) | 6399 | static int __init setup_relax_domain_level(char *str) |
6232 | { | 6400 | { |
6233 | unsigned long val; | 6401 | if (kstrtoint(str, 0, &default_relax_domain_level)) |
6234 | 6402 | pr_warn("Unable to set relax_domain_level\n"); | |
6235 | val = simple_strtoul(str, NULL, 0); | ||
6236 | if (val < sched_domain_level_max) | ||
6237 | default_relax_domain_level = val; | ||
6238 | 6403 | ||
6239 | return 1; | 6404 | return 1; |
6240 | } | 6405 | } |
@@ -6336,15 +6501,236 @@ static struct sched_domain_topology_level default_topology[] = { | |||
6336 | { sd_init_BOOK, cpu_book_mask, }, | 6501 | { sd_init_BOOK, cpu_book_mask, }, |
6337 | #endif | 6502 | #endif |
6338 | { sd_init_CPU, cpu_cpu_mask, }, | 6503 | { sd_init_CPU, cpu_cpu_mask, }, |
6339 | #ifdef CONFIG_NUMA | ||
6340 | { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, | ||
6341 | { sd_init_ALLNODES, cpu_allnodes_mask, }, | ||
6342 | #endif | ||
6343 | { NULL, }, | 6504 | { NULL, }, |
6344 | }; | 6505 | }; |
6345 | 6506 | ||
6346 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | 6507 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; |
6347 | 6508 | ||
6509 | #ifdef CONFIG_NUMA | ||
6510 | |||
6511 | static int sched_domains_numa_levels; | ||
6512 | static int *sched_domains_numa_distance; | ||
6513 | static struct cpumask ***sched_domains_numa_masks; | ||
6514 | static int sched_domains_curr_level; | ||
6515 | |||
6516 | static inline int sd_local_flags(int level) | ||
6517 | { | ||
6518 | if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) | ||
6519 | return 0; | ||
6520 | |||
6521 | return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; | ||
6522 | } | ||
6523 | |||
6524 | static struct sched_domain * | ||
6525 | sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | ||
6526 | { | ||
6527 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); | ||
6528 | int level = tl->numa_level; | ||
6529 | int sd_weight = cpumask_weight( | ||
6530 | sched_domains_numa_masks[level][cpu_to_node(cpu)]); | ||
6531 | |||
6532 | *sd = (struct sched_domain){ | ||
6533 | .min_interval = sd_weight, | ||
6534 | .max_interval = 2*sd_weight, | ||
6535 | .busy_factor = 32, | ||
6536 | .imbalance_pct = 125, | ||
6537 | .cache_nice_tries = 2, | ||
6538 | .busy_idx = 3, | ||
6539 | .idle_idx = 2, | ||
6540 | .newidle_idx = 0, | ||
6541 | .wake_idx = 0, | ||
6542 | .forkexec_idx = 0, | ||
6543 | |||
6544 | .flags = 1*SD_LOAD_BALANCE | ||
6545 | | 1*SD_BALANCE_NEWIDLE | ||
6546 | | 0*SD_BALANCE_EXEC | ||
6547 | | 0*SD_BALANCE_FORK | ||
6548 | | 0*SD_BALANCE_WAKE | ||
6549 | | 0*SD_WAKE_AFFINE | ||
6550 | | 0*SD_PREFER_LOCAL | ||
6551 | | 0*SD_SHARE_CPUPOWER | ||
6552 | | 0*SD_SHARE_PKG_RESOURCES | ||
6553 | | 1*SD_SERIALIZE | ||
6554 | | 0*SD_PREFER_SIBLING | ||
6555 | | sd_local_flags(level) | ||
6556 | , | ||
6557 | .last_balance = jiffies, | ||
6558 | .balance_interval = sd_weight, | ||
6559 | }; | ||
6560 | SD_INIT_NAME(sd, NUMA); | ||
6561 | sd->private = &tl->data; | ||
6562 | |||
6563 | /* | ||
6564 | * Ugly hack to pass state to sd_numa_mask()... | ||
6565 | */ | ||
6566 | sched_domains_curr_level = tl->numa_level; | ||
6567 | |||
6568 | return sd; | ||
6569 | } | ||
6570 | |||
6571 | static const struct cpumask *sd_numa_mask(int cpu) | ||
6572 | { | ||
6573 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; | ||
6574 | } | ||
6575 | |||
6576 | static void sched_numa_warn(const char *str) | ||
6577 | { | ||
6578 | static int done = false; | ||
6579 | int i,j; | ||
6580 | |||
6581 | if (done) | ||
6582 | return; | ||
6583 | |||
6584 | done = true; | ||
6585 | |||
6586 | printk(KERN_WARNING "ERROR: %s\n\n", str); | ||
6587 | |||
6588 | for (i = 0; i < nr_node_ids; i++) { | ||
6589 | printk(KERN_WARNING " "); | ||
6590 | for (j = 0; j < nr_node_ids; j++) | ||
6591 | printk(KERN_CONT "%02d ", node_distance(i,j)); | ||
6592 | printk(KERN_CONT "\n"); | ||
6593 | } | ||
6594 | printk(KERN_WARNING "\n"); | ||
6595 | } | ||
6596 | |||
6597 | static bool find_numa_distance(int distance) | ||
6598 | { | ||
6599 | int i; | ||
6600 | |||
6601 | if (distance == node_distance(0, 0)) | ||
6602 | return true; | ||
6603 | |||
6604 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
6605 | if (sched_domains_numa_distance[i] == distance) | ||
6606 | return true; | ||
6607 | } | ||
6608 | |||
6609 | return false; | ||
6610 | } | ||
6611 | |||
6612 | static void sched_init_numa(void) | ||
6613 | { | ||
6614 | int next_distance, curr_distance = node_distance(0, 0); | ||
6615 | struct sched_domain_topology_level *tl; | ||
6616 | int level = 0; | ||
6617 | int i, j, k; | ||
6618 | |||
6619 | sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); | ||
6620 | if (!sched_domains_numa_distance) | ||
6621 | return; | ||
6622 | |||
6623 | /* | ||
6624 | * O(nr_nodes^2) deduplicating selection sort -- in order to find the | ||
6625 | * unique distances in the node_distance() table. | ||
6626 | * | ||
6627 | * Assumes node_distance(0,j) includes all distances in | ||
6628 | * node_distance(i,j) in order to avoid cubic time. | ||
6629 | */ | ||
6630 | next_distance = curr_distance; | ||
6631 | for (i = 0; i < nr_node_ids; i++) { | ||
6632 | for (j = 0; j < nr_node_ids; j++) { | ||
6633 | for (k = 0; k < nr_node_ids; k++) { | ||
6634 | int distance = node_distance(i, k); | ||
6635 | |||
6636 | if (distance > curr_distance && | ||
6637 | (distance < next_distance || | ||
6638 | next_distance == curr_distance)) | ||
6639 | next_distance = distance; | ||
6640 | |||
6641 | /* | ||
6642 | * While not a strong assumption it would be nice to know | ||
6643 | * about cases where node A is connected to B but B is not | ||
6644 | * equally connected to A. | ||
6645 | */ | ||
6646 | if (sched_debug() && node_distance(k, i) != distance) | ||
6647 | sched_numa_warn("Node-distance not symmetric"); | ||
6648 | |||
6649 | if (sched_debug() && i && !find_numa_distance(distance)) | ||
6650 | sched_numa_warn("Node-0 not representative"); | ||
6651 | } | ||
6652 | if (next_distance != curr_distance) { | ||
6653 | sched_domains_numa_distance[level++] = next_distance; | ||
6654 | sched_domains_numa_levels = level; | ||
6655 | curr_distance = next_distance; | ||
6656 | } else break; | ||
6657 | } | ||
6658 | |||
6659 | /* | ||
6660 | * In case of sched_debug() we verify the above assumption. | ||
6661 | */ | ||
6662 | if (!sched_debug()) | ||
6663 | break; | ||
6664 | } | ||
6665 | /* | ||
6666 | * 'level' contains the number of unique distances, excluding the | ||
6667 | * identity distance node_distance(i,i). | ||
6668 | * | ||
6669 | * The sched_domains_numa_distance[] array includes the actual distance | ||
6670 | * numbers. | ||
6671 | */ | ||
6672 | |||
6673 | sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); | ||
6674 | if (!sched_domains_numa_masks) | ||
6675 | return; | ||
6676 | |||
6677 | /* | ||
6678 | * Now for each level, construct a mask per node which contains all | ||
6679 | * cpus of nodes that are that many hops away from us. | ||
6680 | */ | ||
6681 | for (i = 0; i < level; i++) { | ||
6682 | sched_domains_numa_masks[i] = | ||
6683 | kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); | ||
6684 | if (!sched_domains_numa_masks[i]) | ||
6685 | return; | ||
6686 | |||
6687 | for (j = 0; j < nr_node_ids; j++) { | ||
6688 | struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); | ||
6689 | if (!mask) | ||
6690 | return; | ||
6691 | |||
6692 | sched_domains_numa_masks[i][j] = mask; | ||
6693 | |||
6694 | for (k = 0; k < nr_node_ids; k++) { | ||
6695 | if (node_distance(j, k) > sched_domains_numa_distance[i]) | ||
6696 | continue; | ||
6697 | |||
6698 | cpumask_or(mask, mask, cpumask_of_node(k)); | ||
6699 | } | ||
6700 | } | ||
6701 | } | ||
6702 | |||
6703 | tl = kzalloc((ARRAY_SIZE(default_topology) + level) * | ||
6704 | sizeof(struct sched_domain_topology_level), GFP_KERNEL); | ||
6705 | if (!tl) | ||
6706 | return; | ||
6707 | |||
6708 | /* | ||
6709 | * Copy the default topology bits.. | ||
6710 | */ | ||
6711 | for (i = 0; default_topology[i].init; i++) | ||
6712 | tl[i] = default_topology[i]; | ||
6713 | |||
6714 | /* | ||
6715 | * .. and append 'j' levels of NUMA goodness. | ||
6716 | */ | ||
6717 | for (j = 0; j < level; i++, j++) { | ||
6718 | tl[i] = (struct sched_domain_topology_level){ | ||
6719 | .init = sd_numa_init, | ||
6720 | .mask = sd_numa_mask, | ||
6721 | .flags = SDTL_OVERLAP, | ||
6722 | .numa_level = j, | ||
6723 | }; | ||
6724 | } | ||
6725 | |||
6726 | sched_domain_topology = tl; | ||
6727 | } | ||
6728 | #else | ||
6729 | static inline void sched_init_numa(void) | ||
6730 | { | ||
6731 | } | ||
6732 | #endif /* CONFIG_NUMA */ | ||
6733 | |||
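
sched_init_numa() works in two passes: first it pulls the unique distances out of the node_distance() table (these become the NUMA topology levels), then for every level it gives each node a mask of all nodes no further away than that level's distance. The sketch below replays both passes on a hand-written 4-node distance table; the table and all helpers are invented for illustration, the kernel gets this data from node_distance() and cpumask_of_node():

	#include <stdio.h>

	#define NR_NODES 4

	/* made-up SLIT-style table: 10 = local, 20 = one hop, 30 = two hops */
	static const int dist[NR_NODES][NR_NODES] = {
		{ 10, 20, 20, 30 },
		{ 20, 10, 30, 20 },
		{ 20, 30, 10, 20 },
		{ 30, 20, 20, 10 },
	};

	int main(void)
	{
		int levels[NR_NODES * NR_NODES];
		int nr_levels = 0;
		int curr = dist[0][0], next;

		/* pass 1: pick out the unique remote distances, smallest first */
		for (;;) {
			next = curr;
			for (int i = 0; i < NR_NODES; i++)
				for (int k = 0; k < NR_NODES; k++)
					if (dist[i][k] > curr &&
					    (dist[i][k] < next || next == curr))
						next = dist[i][k];
			if (next == curr)
				break;
			levels[nr_levels++] = next;
			curr = next;
		}

		/* pass 2: per level, each node gets the set of nodes within reach */
		for (int l = 0; l < nr_levels; l++) {
			printf("level %d (distance <= %d):\n", l, levels[l]);
			for (int j = 0; j < NR_NODES; j++) {
				printf("  node %d: {", j);
				for (int k = 0; k < NR_NODES; k++)
					if (dist[j][k] <= levels[l])
						printf(" %d", k);
				printf(" }\n");
			}
		}
		return 0;
	}
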
6348 | static int __sdt_alloc(const struct cpumask *cpu_map) | 6734 | static int __sdt_alloc(const struct cpumask *cpu_map) |
6349 | { | 6735 | { |
6350 | struct sched_domain_topology_level *tl; | 6736 | struct sched_domain_topology_level *tl; |
@@ -6382,9 +6768,11 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
6382 | if (!sg) | 6768 | if (!sg) |
6383 | return -ENOMEM; | 6769 | return -ENOMEM; |
6384 | 6770 | ||
6771 | sg->next = sg; | ||
6772 | |||
6385 | *per_cpu_ptr(sdd->sg, j) = sg; | 6773 | *per_cpu_ptr(sdd->sg, j) = sg; |
6386 | 6774 | ||
6387 | sgp = kzalloc_node(sizeof(struct sched_group_power), | 6775 | sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), |
6388 | GFP_KERNEL, cpu_to_node(j)); | 6776 | GFP_KERNEL, cpu_to_node(j)); |
6389 | if (!sgp) | 6777 | if (!sgp) |
6390 | return -ENOMEM; | 6778 | return -ENOMEM; |
@@ -6437,7 +6825,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
6437 | if (!sd) | 6825 | if (!sd) |
6438 | return child; | 6826 | return child; |
6439 | 6827 | ||
6440 | set_domain_attribute(sd, attr); | ||
6441 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | 6828 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); |
6442 | if (child) { | 6829 | if (child) { |
6443 | sd->level = child->level + 1; | 6830 | sd->level = child->level + 1; |
@@ -6445,6 +6832,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
6445 | child->parent = sd; | 6832 | child->parent = sd; |
6446 | } | 6833 | } |
6447 | sd->child = child; | 6834 | sd->child = child; |
6835 | set_domain_attribute(sd, attr); | ||
6448 | 6836 | ||
6449 | return sd; | 6837 | return sd; |
6450 | } | 6838 | } |
@@ -6585,7 +6973,6 @@ static int init_sched_domains(const struct cpumask *cpu_map) | |||
6585 | if (!doms_cur) | 6973 | if (!doms_cur) |
6586 | doms_cur = &fallback_doms; | 6974 | doms_cur = &fallback_doms; |
6587 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | 6975 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); |
6588 | dattr_cur = NULL; | ||
6589 | err = build_sched_domains(doms_cur[0], NULL); | 6976 | err = build_sched_domains(doms_cur[0], NULL); |
6590 | register_sched_domain_sysctl(); | 6977 | register_sched_domain_sysctl(); |
6591 | 6978 | ||
@@ -6710,97 +7097,6 @@ match2: | |||
6710 | mutex_unlock(&sched_domains_mutex); | 7097 | mutex_unlock(&sched_domains_mutex); |
6711 | } | 7098 | } |
6712 | 7099 | ||
6713 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
6714 | static void reinit_sched_domains(void) | ||
6715 | { | ||
6716 | get_online_cpus(); | ||
6717 | |||
6718 | /* Destroy domains first to force the rebuild */ | ||
6719 | partition_sched_domains(0, NULL, NULL); | ||
6720 | |||
6721 | rebuild_sched_domains(); | ||
6722 | put_online_cpus(); | ||
6723 | } | ||
6724 | |||
6725 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | ||
6726 | { | ||
6727 | unsigned int level = 0; | ||
6728 | |||
6729 | if (sscanf(buf, "%u", &level) != 1) | ||
6730 | return -EINVAL; | ||
6731 | |||
6732 | /* | ||
6733 | * level is always be positive so don't check for | ||
6734 | * level < POWERSAVINGS_BALANCE_NONE which is 0 | ||
6735 | * What happens on 0 or 1 byte write, | ||
6736 | * need to check for count as well? | ||
6737 | */ | ||
6738 | |||
6739 | if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) | ||
6740 | return -EINVAL; | ||
6741 | |||
6742 | if (smt) | ||
6743 | sched_smt_power_savings = level; | ||
6744 | else | ||
6745 | sched_mc_power_savings = level; | ||
6746 | |||
6747 | reinit_sched_domains(); | ||
6748 | |||
6749 | return count; | ||
6750 | } | ||
6751 | |||
6752 | #ifdef CONFIG_SCHED_MC | ||
6753 | static ssize_t sched_mc_power_savings_show(struct device *dev, | ||
6754 | struct device_attribute *attr, | ||
6755 | char *buf) | ||
6756 | { | ||
6757 | return sprintf(buf, "%u\n", sched_mc_power_savings); | ||
6758 | } | ||
6759 | static ssize_t sched_mc_power_savings_store(struct device *dev, | ||
6760 | struct device_attribute *attr, | ||
6761 | const char *buf, size_t count) | ||
6762 | { | ||
6763 | return sched_power_savings_store(buf, count, 0); | ||
6764 | } | ||
6765 | static DEVICE_ATTR(sched_mc_power_savings, 0644, | ||
6766 | sched_mc_power_savings_show, | ||
6767 | sched_mc_power_savings_store); | ||
6768 | #endif | ||
6769 | |||
6770 | #ifdef CONFIG_SCHED_SMT | ||
6771 | static ssize_t sched_smt_power_savings_show(struct device *dev, | ||
6772 | struct device_attribute *attr, | ||
6773 | char *buf) | ||
6774 | { | ||
6775 | return sprintf(buf, "%u\n", sched_smt_power_savings); | ||
6776 | } | ||
6777 | static ssize_t sched_smt_power_savings_store(struct device *dev, | ||
6778 | struct device_attribute *attr, | ||
6779 | const char *buf, size_t count) | ||
6780 | { | ||
6781 | return sched_power_savings_store(buf, count, 1); | ||
6782 | } | ||
6783 | static DEVICE_ATTR(sched_smt_power_savings, 0644, | ||
6784 | sched_smt_power_savings_show, | ||
6785 | sched_smt_power_savings_store); | ||
6786 | #endif | ||
6787 | |||
6788 | int __init sched_create_sysfs_power_savings_entries(struct device *dev) | ||
6789 | { | ||
6790 | int err = 0; | ||
6791 | |||
6792 | #ifdef CONFIG_SCHED_SMT | ||
6793 | if (smt_capable()) | ||
6794 | err = device_create_file(dev, &dev_attr_sched_smt_power_savings); | ||
6795 | #endif | ||
6796 | #ifdef CONFIG_SCHED_MC | ||
6797 | if (!err && mc_capable()) | ||
6798 | err = device_create_file(dev, &dev_attr_sched_mc_power_savings); | ||
6799 | #endif | ||
6800 | return err; | ||
6801 | } | ||
6802 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
6803 | |||
6804 | /* | 7100 | /* |
6805 | * Update cpusets according to cpu_active mask. If cpusets are | 7101 | * Update cpusets according to cpu_active mask. If cpusets are |
6806 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper | 7102 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper |
@@ -6838,6 +7134,8 @@ void __init sched_init_smp(void) | |||
6838 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 7134 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
6839 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | 7135 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); |
6840 | 7136 | ||
7137 | sched_init_numa(); | ||
7138 | |||
6841 | get_online_cpus(); | 7139 | get_online_cpus(); |
6842 | mutex_lock(&sched_domains_mutex); | 7140 | mutex_lock(&sched_domains_mutex); |
6843 | init_sched_domains(cpu_active_mask); | 7141 | init_sched_domains(cpu_active_mask); |
@@ -7059,6 +7357,7 @@ void __init sched_init(void) | |||
7059 | /* May be allocated at isolcpus cmdline parse time */ | 7357 | /* May be allocated at isolcpus cmdline parse time */ |
7060 | if (cpu_isolated_map == NULL) | 7358 | if (cpu_isolated_map == NULL) |
7061 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 7359 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
7360 | idle_thread_set_boot_cpu(); | ||
7062 | #endif | 7361 | #endif |
7063 | init_sched_fair_class(); | 7362 | init_sched_fair_class(); |
7064 | 7363 | ||
@@ -7980,13 +8279,9 @@ static struct cftype cpu_files[] = { | |||
7980 | .write_u64 = cpu_rt_period_write_uint, | 8279 | .write_u64 = cpu_rt_period_write_uint, |
7981 | }, | 8280 | }, |
7982 | #endif | 8281 | #endif |
8282 | { } /* terminate */ | ||
7983 | }; | 8283 | }; |
7984 | 8284 | ||
7985 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) | ||
7986 | { | ||
7987 | return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); | ||
7988 | } | ||
7989 | |||
7990 | struct cgroup_subsys cpu_cgroup_subsys = { | 8285 | struct cgroup_subsys cpu_cgroup_subsys = { |
7991 | .name = "cpu", | 8286 | .name = "cpu", |
7992 | .create = cpu_cgroup_create, | 8287 | .create = cpu_cgroup_create, |
@@ -7994,8 +8289,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
7994 | .can_attach = cpu_cgroup_can_attach, | 8289 | .can_attach = cpu_cgroup_can_attach, |
7995 | .attach = cpu_cgroup_attach, | 8290 | .attach = cpu_cgroup_attach, |
7996 | .exit = cpu_cgroup_exit, | 8291 | .exit = cpu_cgroup_exit, |
7997 | .populate = cpu_cgroup_populate, | ||
7998 | .subsys_id = cpu_cgroup_subsys_id, | 8292 | .subsys_id = cpu_cgroup_subsys_id, |
8293 | .base_cftypes = cpu_files, | ||
7999 | .early_init = 1, | 8294 | .early_init = 1, |
8000 | }; | 8295 | }; |
8001 | 8296 | ||
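
The switch from a .populate callback to .base_cftypes works because the cftype array is now terminated by an empty entry rather than sized with ARRAY_SIZE() at a call site the cgroup core never sees. A tiny sketch of that sentinel-terminated-array idiom (the struct and names here are stand-ins, not the real cgroup API):

	#include <stdio.h>

	/* stand-in for struct cftype: a NULL .name marks the end of the array */
	struct toy_cftype {
		const char *name;
	};

	static const struct toy_cftype cpu_files[] = {
		{ .name = "shares" },
		{ .name = "cfs_quota_us" },
		{ .name = "rt_runtime_us" },
		{ NULL }	/* terminate; the kernel spells this "{ }" */
	};

	/* walk until the sentinel, no ARRAY_SIZE() needed at the call site */
	static void register_files(const struct toy_cftype *cft)
	{
		for (; cft->name; cft++)
			printf("registering %s\n", cft->name);
	}

	int main(void)
	{
		register_files(cpu_files);
		return 0;
	}
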
@@ -8180,13 +8475,9 @@ static struct cftype files[] = { | |||
8180 | .name = "stat", | 8475 | .name = "stat", |
8181 | .read_map = cpuacct_stats_show, | 8476 | .read_map = cpuacct_stats_show, |
8182 | }, | 8477 | }, |
8478 | { } /* terminate */ | ||
8183 | }; | 8479 | }; |
8184 | 8480 | ||
8185 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
8186 | { | ||
8187 | return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); | ||
8188 | } | ||
8189 | |||
8190 | /* | 8481 | /* |
8191 | * charge this task's execution time to its accounting group. | 8482 | * charge this task's execution time to its accounting group. |
8192 | * | 8483 | * |
@@ -8218,7 +8509,7 @@ struct cgroup_subsys cpuacct_subsys = { | |||
8218 | .name = "cpuacct", | 8509 | .name = "cpuacct", |
8219 | .create = cpuacct_create, | 8510 | .create = cpuacct_create, |
8220 | .destroy = cpuacct_destroy, | 8511 | .destroy = cpuacct_destroy, |
8221 | .populate = cpuacct_populate, | ||
8222 | .subsys_id = cpuacct_subsys_id, | 8512 | .subsys_id = cpuacct_subsys_id, |
8513 | .base_cftypes = files, | ||
8223 | }; | 8514 | }; |
8224 | #endif /* CONFIG_CGROUP_CPUACCT */ | 8515 | #endif /* CONFIG_CGROUP_CPUACCT */ |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 09acaa15161d..6f79596e0ea9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -202,7 +202,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
202 | SPLIT_NS(spread0)); | 202 | SPLIT_NS(spread0)); |
203 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", | 203 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", |
204 | cfs_rq->nr_spread_over); | 204 | cfs_rq->nr_spread_over); |
205 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | 205 | SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); |
206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | 206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); |
207 | #ifdef CONFIG_FAIR_GROUP_SCHED | 207 | #ifdef CONFIG_FAIR_GROUP_SCHED |
208 | #ifdef CONFIG_SMP | 208 | #ifdef CONFIG_SMP |
@@ -260,8 +260,14 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
260 | SEQ_printf(m, "\ncpu#%d\n", cpu); | 260 | SEQ_printf(m, "\ncpu#%d\n", cpu); |
261 | #endif | 261 | #endif |
262 | 262 | ||
263 | #define P(x) \ | 263 | #define P(x) \ |
264 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) | 264 | do { \ |
265 | if (sizeof(rq->x) == 4) \ | ||
266 | SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \ | ||
267 | else \ | ||
268 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\ | ||
269 | } while (0) | ||
270 | |||
265 | #define PN(x) \ | 271 | #define PN(x) \ |
266 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) | 272 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) |
267 | 273 | ||
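
The reworked P() macro in debug.c picks its printf conversion from sizeof the field, so 32-bit members such as the now-unsigned-int nr_running are no longer pushed through a 64-bit format. A standalone sketch of the same sizeof dispatch (struct and field names made up):

	#include <stdio.h>

	struct toy_rq {
		unsigned int nr_running;	/* 4 bytes */
		long long    clock;		/* 8 bytes */
	};

	#define P(rq, x)							\
	do {									\
		if (sizeof((rq)->x) == 4)					\
			printf("  .%-30s: %ld\n", #x, (long)(rq)->x);		\
		else								\
			printf("  .%-30s: %lld\n", #x, (long long)(rq)->x);	\
	} while (0)

	int main(void)
	{
		struct toy_rq rq = { .nr_running = 3, .clock = 123456789LL };

		P(&rq, nr_running);
		P(&rq, clock);
		return 0;
	}
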
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e9553640c1c3..c099cc6eebe3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2703 | int want_sd = 1; | 2703 | int want_sd = 1; |
2704 | int sync = wake_flags & WF_SYNC; | 2704 | int sync = wake_flags & WF_SYNC; |
2705 | 2705 | ||
2706 | if (p->rt.nr_cpus_allowed == 1) | 2706 | if (p->nr_cpus_allowed == 1) |
2707 | return prev_cpu; | 2707 | return prev_cpu; |
2708 | 2708 | ||
2709 | if (sd_flag & SD_BALANCE_WAKE) { | 2709 | if (sd_flag & SD_BALANCE_WAKE) { |
@@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2721 | * If power savings logic is enabled for a domain, see if we | 2721 | * If power savings logic is enabled for a domain, see if we |
2722 | * are not overloaded, if so, don't balance wider. | 2722 | * are not overloaded, if so, don't balance wider. |
2723 | */ | 2723 | */ |
2724 | if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { | 2724 | if (tmp->flags & (SD_PREFER_LOCAL)) { |
2725 | unsigned long power = 0; | 2725 | unsigned long power = 0; |
2726 | unsigned long nr_running = 0; | 2726 | unsigned long nr_running = 0; |
2727 | unsigned long capacity; | 2727 | unsigned long capacity; |
@@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2734 | 2734 | ||
2735 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); | 2735 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); |
2736 | 2736 | ||
2737 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | ||
2738 | nr_running /= 2; | ||
2739 | |||
2740 | if (nr_running < capacity) | 2737 | if (nr_running < capacity) |
2741 | want_sd = 0; | 2738 | want_sd = 0; |
2742 | } | 2739 | } |
@@ -3082,7 +3079,7 @@ struct lb_env { | |||
3082 | struct rq *dst_rq; | 3079 | struct rq *dst_rq; |
3083 | 3080 | ||
3084 | enum cpu_idle_type idle; | 3081 | enum cpu_idle_type idle; |
3085 | long load_move; | 3082 | long imbalance; |
3086 | unsigned int flags; | 3083 | unsigned int flags; |
3087 | 3084 | ||
3088 | unsigned int loop; | 3085 | unsigned int loop; |
@@ -3218,7 +3215,7 @@ static unsigned long task_h_load(struct task_struct *p); | |||
3218 | static const unsigned int sched_nr_migrate_break = 32; | 3215 | static const unsigned int sched_nr_migrate_break = 32; |
3219 | 3216 | ||
3220 | /* | 3217 | /* |
3221 | * move_tasks tries to move up to load_move weighted load from busiest to | 3218 | * move_tasks tries to move up to imbalance weighted load from busiest to |
3222 | * this_rq, as part of a balancing operation within domain "sd". | 3219 | * this_rq, as part of a balancing operation within domain "sd". |
3223 | * Returns 1 if successful and 0 otherwise. | 3220 | * Returns 1 if successful and 0 otherwise. |
3224 | * | 3221 | * |
@@ -3231,7 +3228,7 @@ static int move_tasks(struct lb_env *env) | |||
3231 | unsigned long load; | 3228 | unsigned long load; |
3232 | int pulled = 0; | 3229 | int pulled = 0; |
3233 | 3230 | ||
3234 | if (env->load_move <= 0) | 3231 | if (env->imbalance <= 0) |
3235 | return 0; | 3232 | return 0; |
3236 | 3233 | ||
3237 | while (!list_empty(tasks)) { | 3234 | while (!list_empty(tasks)) { |
@@ -3257,7 +3254,7 @@ static int move_tasks(struct lb_env *env) | |||
3257 | if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) | 3254 | if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) |
3258 | goto next; | 3255 | goto next; |
3259 | 3256 | ||
3260 | if ((load / 2) > env->load_move) | 3257 | if ((load / 2) > env->imbalance) |
3261 | goto next; | 3258 | goto next; |
3262 | 3259 | ||
3263 | if (!can_migrate_task(p, env)) | 3260 | if (!can_migrate_task(p, env)) |
@@ -3265,7 +3262,7 @@ static int move_tasks(struct lb_env *env) | |||
3265 | 3262 | ||
3266 | move_task(p, env); | 3263 | move_task(p, env); |
3267 | pulled++; | 3264 | pulled++; |
3268 | env->load_move -= load; | 3265 | env->imbalance -= load; |
3269 | 3266 | ||
3270 | #ifdef CONFIG_PREEMPT | 3267 | #ifdef CONFIG_PREEMPT |
3271 | /* | 3268 | /* |
@@ -3281,7 +3278,7 @@ static int move_tasks(struct lb_env *env) | |||
3281 | * We only want to steal up to the prescribed amount of | 3278 | * We only want to steal up to the prescribed amount of |
3282 | * weighted load. | 3279 | * weighted load. |
3283 | */ | 3280 | */ |
3284 | if (env->load_move <= 0) | 3281 | if (env->imbalance <= 0) |
3285 | break; | 3282 | break; |
3286 | 3283 | ||
3287 | continue; | 3284 | continue; |
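The fair.c hunks above rename struct lb_env's load_move field to imbalance, which move_tasks() treats as a budget of weighted load: each pulled task subtracts its load, oversized tasks are skipped, and the loop stops once the budget is spent. A hedged userspace toy of that control flow, with invented task weights and starting budget standing in for the real env->tasks list, can_migrate_task() and move_task():

#include <stdio.h>

/* Toy model of the imbalance budget: each "task" carries a weighted
 * load; tasks are pulled until the requested imbalance is covered,
 * skipping any task whose half-load already exceeds what is left
 * (the load / 2 > env->imbalance test above). */
struct toy_env {
	long imbalance;			/* weighted load still to move */
};

int main(void)
{
	long loads[] = { 300, 900, 200, 150 };	/* invented task weights */
	struct toy_env env = { .imbalance = 600 };
	int pulled = 0;

	for (unsigned int i = 0; i < sizeof(loads) / sizeof(loads[0]); i++) {
		if (env.imbalance <= 0)		/* budget exhausted, stop */
			break;
		if (loads[i] / 2 > env.imbalance)
			continue;		/* too big a bite for what is left */
		pulled++;
		env.imbalance -= loads[i];	/* spend part of the budget */
	}
	printf("pulled %d tasks, residual imbalance %ld\n",
	       pulled, env.imbalance);		/* pulled 3, residual -50 */
	return 0;
}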
@@ -3435,14 +3432,6 @@ struct sd_lb_stats { | |||
3435 | unsigned int busiest_group_weight; | 3432 | unsigned int busiest_group_weight; |
3436 | 3433 | ||
3437 | int group_imb; /* Is there imbalance in this sd */ | 3434 | int group_imb; /* Is there imbalance in this sd */ |
3438 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3439 | int power_savings_balance; /* Is powersave balance needed for this sd */ | ||
3440 | struct sched_group *group_min; /* Least loaded group in sd */ | ||
3441 | struct sched_group *group_leader; /* Group which relieves group_min */ | ||
3442 | unsigned long min_load_per_task; /* load_per_task in group_min */ | ||
3443 | unsigned long leader_nr_running; /* Nr running of group_leader */ | ||
3444 | unsigned long min_nr_running; /* Nr running of group_min */ | ||
3445 | #endif | ||
3446 | }; | 3435 | }; |
3447 | 3436 | ||
3448 | /* | 3437 | /* |
@@ -3486,148 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
3486 | return load_idx; | 3475 | return load_idx; |
3487 | } | 3476 | } |
3488 | 3477 | ||
3489 | |||
3490 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
3491 | /** | ||
3492 | * init_sd_power_savings_stats - Initialize power savings statistics for | ||
3493 | * the given sched_domain, during load balancing. | ||
3494 | * | ||
3495 | * @sd: Sched domain whose power-savings statistics are to be initialized. | ||
3496 | * @sds: Variable containing the statistics for sd. | ||
3497 | * @idle: Idle status of the CPU at which we're performing load-balancing. | ||
3498 | */ | ||
3499 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
3500 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
3501 | { | ||
3502 | /* | ||
3503 | * Busy processors will not participate in power savings | ||
3504 | * balance. | ||
3505 | */ | ||
3506 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
3507 | sds->power_savings_balance = 0; | ||
3508 | else { | ||
3509 | sds->power_savings_balance = 1; | ||
3510 | sds->min_nr_running = ULONG_MAX; | ||
3511 | sds->leader_nr_running = 0; | ||
3512 | } | ||
3513 | } | ||
3514 | |||
3515 | /** | ||
3516 | * update_sd_power_savings_stats - Update the power saving stats for a | ||
3517 | * sched_domain while performing load balancing. | ||
3518 | * | ||
3519 | * @group: sched_group belonging to the sched_domain under consideration. | ||
3520 | * @sds: Variable containing the statistics of the sched_domain | ||
3521 | * @local_group: Does group contain the CPU for which we're performing | ||
3522 | * load balancing ? | ||
3523 | * @sgs: Variable containing the statistics of the group. | ||
3524 | */ | ||
3525 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
3526 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
3527 | { | ||
3528 | |||
3529 | if (!sds->power_savings_balance) | ||
3530 | return; | ||
3531 | |||
3532 | /* | ||
3533 | * If the local group is idle or completely loaded | ||
3534 | * no need to do power savings balance at this domain | ||
3535 | */ | ||
3536 | if (local_group && (sds->this_nr_running >= sgs->group_capacity || | ||
3537 | !sds->this_nr_running)) | ||
3538 | sds->power_savings_balance = 0; | ||
3539 | |||
3540 | /* | ||
3541 | * If a group is already running at full capacity or idle, | ||
3542 | * don't include that group in power savings calculations | ||
3543 | */ | ||
3544 | if (!sds->power_savings_balance || | ||
3545 | sgs->sum_nr_running >= sgs->group_capacity || | ||
3546 | !sgs->sum_nr_running) | ||
3547 | return; | ||
3548 | |||
3549 | /* | ||
3550 | * Calculate the group which has the least non-idle load. | ||
3551 | * This is the group from where we need to pick up the load | ||
3552 | * for saving power | ||
3553 | */ | ||
3554 | if ((sgs->sum_nr_running < sds->min_nr_running) || | ||
3555 | (sgs->sum_nr_running == sds->min_nr_running && | ||
3556 | group_first_cpu(group) > group_first_cpu(sds->group_min))) { | ||
3557 | sds->group_min = group; | ||
3558 | sds->min_nr_running = sgs->sum_nr_running; | ||
3559 | sds->min_load_per_task = sgs->sum_weighted_load / | ||
3560 | sgs->sum_nr_running; | ||
3561 | } | ||
3562 | |||
3563 | /* | ||
3564 | * Calculate the group which is almost near its | ||
3565 | * capacity but still has some space to pick up some load | ||
3566 | * from other group and save more power | ||
3567 | */ | ||
3568 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) | ||
3569 | return; | ||
3570 | |||
3571 | if (sgs->sum_nr_running > sds->leader_nr_running || | ||
3572 | (sgs->sum_nr_running == sds->leader_nr_running && | ||
3573 | group_first_cpu(group) < group_first_cpu(sds->group_leader))) { | ||
3574 | sds->group_leader = group; | ||
3575 | sds->leader_nr_running = sgs->sum_nr_running; | ||
3576 | } | ||
3577 | } | ||
3578 | |||
3579 | /** | ||
3580 | * check_power_save_busiest_group - see if there is potential for some power-savings balance | ||
3581 | * @sds: Variable containing the statistics of the sched_domain | ||
3582 | * under consideration. | ||
3583 | * @this_cpu: Cpu at which we're currently performing load-balancing. | ||
3584 | * @imbalance: Variable to store the imbalance. | ||
3585 | * | ||
3586 | * Description: | ||
3587 | * Check if we have potential to perform some power-savings balance. | ||
3588 | * If yes, set the busiest group to be the least loaded group in the | ||
3589 | * sched_domain, so that it's CPUs can be put to idle. | ||
3590 | * | ||
3591 | * Returns 1 if there is potential to perform power-savings balance. | ||
3592 | * Else returns 0. | ||
3593 | */ | ||
3594 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
3595 | int this_cpu, unsigned long *imbalance) | ||
3596 | { | ||
3597 | if (!sds->power_savings_balance) | ||
3598 | return 0; | ||
3599 | |||
3600 | if (sds->this != sds->group_leader || | ||
3601 | sds->group_leader == sds->group_min) | ||
3602 | return 0; | ||
3603 | |||
3604 | *imbalance = sds->min_load_per_task; | ||
3605 | sds->busiest = sds->group_min; | ||
3606 | |||
3607 | return 1; | ||
3608 | |||
3609 | } | ||
3610 | #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
3611 | static inline void init_sd_power_savings_stats(struct sched_domain *sd, | ||
3612 | struct sd_lb_stats *sds, enum cpu_idle_type idle) | ||
3613 | { | ||
3614 | return; | ||
3615 | } | ||
3616 | |||
3617 | static inline void update_sd_power_savings_stats(struct sched_group *group, | ||
3618 | struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) | ||
3619 | { | ||
3620 | return; | ||
3621 | } | ||
3622 | |||
3623 | static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | ||
3624 | int this_cpu, unsigned long *imbalance) | ||
3625 | { | ||
3626 | return 0; | ||
3627 | } | ||
3628 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
3629 | |||
3630 | |||
3631 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | 3478 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) |
3632 | { | 3479 | { |
3633 | return SCHED_POWER_SCALE; | 3480 | return SCHED_POWER_SCALE; |
@@ -3656,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | |||
3656 | unsigned long scale_rt_power(int cpu) | 3503 | unsigned long scale_rt_power(int cpu) |
3657 | { | 3504 | { |
3658 | struct rq *rq = cpu_rq(cpu); | 3505 | struct rq *rq = cpu_rq(cpu); |
3659 | u64 total, available; | 3506 | u64 total, available, age_stamp, avg; |
3507 | |||
3508 | /* | ||
3509 | * Since we're reading these variables without serialization make sure | ||
3510 | * we read them once before doing sanity checks on them. | ||
3511 | */ | ||
3512 | age_stamp = ACCESS_ONCE(rq->age_stamp); | ||
3513 | avg = ACCESS_ONCE(rq->rt_avg); | ||
3660 | 3514 | ||
3661 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | 3515 | total = sched_avg_period() + (rq->clock - age_stamp); |
3662 | 3516 | ||
3663 | if (unlikely(total < rq->rt_avg)) { | 3517 | if (unlikely(total < avg)) { |
3664 | /* Ensures that power won't end up being negative */ | 3518 | /* Ensures that power won't end up being negative */ |
3665 | available = 0; | 3519 | available = 0; |
3666 | } else { | 3520 | } else { |
3667 | available = total - rq->rt_avg; | 3521 | available = total - avg; |
3668 | } | 3522 | } |
3669 | 3523 | ||
3670 | if (unlikely((s64)total < SCHED_POWER_SCALE)) | 3524 | if (unlikely((s64)total < SCHED_POWER_SCALE)) |
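The new comment in scale_rt_power() above explains the ACCESS_ONCE() copies: rq->age_stamp and rq->rt_avg can be updated by other CPUs without serialization, so each must be read exactly once before the "total < avg" sanity check and the subtraction, otherwise the two uses could observe different values. A hedged sketch of the pattern, defining ACCESS_ONCE locally the way <linux/compiler.h> of this era does (later kernels spell it READ_ONCE()); shared_total and shared_avg are stand-ins for the rq fields:

#include <stdio.h>

/* Hedged sketch of the read-once pattern; shared_total and shared_avg
 * stand in for the age_stamp-derived total and rq->rt_avg, which other
 * CPUs may update concurrently in the real code. */
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

static unsigned long long shared_total;
static unsigned long long shared_avg;

static unsigned long long available_sketch(void)
{
	/* Read each shared variable exactly once ... */
	unsigned long long total = ACCESS_ONCE(shared_total);
	unsigned long long avg   = ACCESS_ONCE(shared_avg);

	/* ... so the check and the subtraction see the same snapshot. */
	if (total < avg)
		return 0;
	return total - avg;
}

int main(void)
{
	shared_total = 1000;
	shared_avg = 250;
	printf("available = %llu\n", available_sketch());	/* 750 */
	return 0;
}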
@@ -3727,13 +3581,28 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
3727 | 3581 | ||
3728 | power = 0; | 3582 | power = 0; |
3729 | 3583 | ||
3730 | group = child->groups; | 3584 | if (child->flags & SD_OVERLAP) { |
3731 | do { | 3585 | /* |
3732 | power += group->sgp->power; | 3586 | * SD_OVERLAP domains cannot assume that child groups |
3733 | group = group->next; | 3587 | * span the current group. |
3734 | } while (group != child->groups); | 3588 | */ |
3735 | 3589 | ||
3736 | sdg->sgp->power = power; | 3590 | for_each_cpu(cpu, sched_group_cpus(sdg)) |
3591 | power += power_of(cpu); | ||
3592 | } else { | ||
3593 | /* | ||
3594 | * !SD_OVERLAP domains can assume that child groups | ||
3595 | * span the current group. | ||
3596 | */ | ||
3597 | |||
3598 | group = child->groups; | ||
3599 | do { | ||
3600 | power += group->sgp->power; | ||
3601 | group = group->next; | ||
3602 | } while (group != child->groups); | ||
3603 | } | ||
3604 | |||
3605 | sdg->sgp->power_orig = sdg->sgp->power = power; | ||
3737 | } | 3606 | } |
3738 | 3607 | ||
3739 | /* | 3608 | /* |
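The update_group_power() hunk above introduces two summation paths: SD_OVERLAP domains add up power_of(cpu) over the group's own CPUs, because overlapping child groups may not tile the parent, while ordinary domains keep walking the child domain's circular list of groups. A hedged userspace toy of that ring walk, with invented group power values:

#include <stdio.h>

/* Toy circular list of child groups, mirroring the !SD_OVERLAP branch
 * above: groups are linked in a ring and the walk stops when it comes
 * back to the first one.  Power values are invented. */
struct toy_group {
	unsigned long power;
	struct toy_group *next;
};

int main(void)
{
	struct toy_group c = { .power = 512 };
	struct toy_group b = { .power = 1024, .next = &c };
	struct toy_group a = { .power = 1024, .next = &b };
	struct toy_group *group = &a;
	unsigned long power = 0;

	c.next = &a;			/* close the ring */

	do {				/* same do/while shape as the kernel walk */
		power += group->power;
		group = group->next;
	} while (group != &a);

	printf("parent group power = %lu\n", power);	/* 2560 */
	return 0;
}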
@@ -3763,41 +3632,43 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
3763 | 3632 | ||
3764 | /** | 3633 | /** |
3765 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 3634 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
3766 | * @sd: The sched_domain whose statistics are to be updated. | 3635 | * @env: The load balancing environment. |
3767 | * @group: sched_group whose statistics are to be updated. | 3636 | * @group: sched_group whose statistics are to be updated. |
3768 | * @this_cpu: Cpu for which load balance is currently performed. | ||
3769 | * @idle: Idle status of this_cpu | ||
3770 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 3637 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
3771 | * @local_group: Does group contain this_cpu. | 3638 | * @local_group: Does group contain this_cpu. |
3772 | * @cpus: Set of cpus considered for load balancing. | 3639 | * @cpus: Set of cpus considered for load balancing. |
3773 | * @balance: Should we balance. | 3640 | * @balance: Should we balance. |
3774 | * @sgs: variable to hold the statistics for this group. | 3641 | * @sgs: variable to hold the statistics for this group. |
3775 | */ | 3642 | */ |
3776 | static inline void update_sg_lb_stats(struct sched_domain *sd, | 3643 | static inline void update_sg_lb_stats(struct lb_env *env, |
3777 | struct sched_group *group, int this_cpu, | 3644 | struct sched_group *group, int load_idx, |
3778 | enum cpu_idle_type idle, int load_idx, | ||
3779 | int local_group, const struct cpumask *cpus, | 3645 | int local_group, const struct cpumask *cpus, |
3780 | int *balance, struct sg_lb_stats *sgs) | 3646 | int *balance, struct sg_lb_stats *sgs) |
3781 | { | 3647 | { |
3782 | unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; | 3648 | unsigned long nr_running, max_nr_running, min_nr_running; |
3783 | int i; | 3649 | unsigned long load, max_cpu_load, min_cpu_load; |
3784 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 3650 | unsigned int balance_cpu = -1, first_idle_cpu = 0; |
3785 | unsigned long avg_load_per_task = 0; | 3651 | unsigned long avg_load_per_task = 0; |
3652 | int i; | ||
3786 | 3653 | ||
3787 | if (local_group) | 3654 | if (local_group) |
3788 | balance_cpu = group_first_cpu(group); | 3655 | balance_cpu = group_balance_cpu(group); |
3789 | 3656 | ||
3790 | /* Tally up the load of all CPUs in the group */ | 3657 | /* Tally up the load of all CPUs in the group */ |
3791 | max_cpu_load = 0; | 3658 | max_cpu_load = 0; |
3792 | min_cpu_load = ~0UL; | 3659 | min_cpu_load = ~0UL; |
3793 | max_nr_running = 0; | 3660 | max_nr_running = 0; |
3661 | min_nr_running = ~0UL; | ||
3794 | 3662 | ||
3795 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | 3663 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { |
3796 | struct rq *rq = cpu_rq(i); | 3664 | struct rq *rq = cpu_rq(i); |
3797 | 3665 | ||
3666 | nr_running = rq->nr_running; | ||
3667 | |||
3798 | /* Bias balancing toward cpus of our domain */ | 3668 | /* Bias balancing toward cpus of our domain */ |
3799 | if (local_group) { | 3669 | if (local_group) { |
3800 | if (idle_cpu(i) && !first_idle_cpu) { | 3670 | if (idle_cpu(i) && !first_idle_cpu && |
3671 | cpumask_test_cpu(i, sched_group_mask(group))) { | ||
3801 | first_idle_cpu = 1; | 3672 | first_idle_cpu = 1; |
3802 | balance_cpu = i; | 3673 | balance_cpu = i; |
3803 | } | 3674 | } |
@@ -3805,16 +3676,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3805 | load = target_load(i, load_idx); | 3676 | load = target_load(i, load_idx); |
3806 | } else { | 3677 | } else { |
3807 | load = source_load(i, load_idx); | 3678 | load = source_load(i, load_idx); |
3808 | if (load > max_cpu_load) { | 3679 | if (load > max_cpu_load) |
3809 | max_cpu_load = load; | 3680 | max_cpu_load = load; |
3810 | max_nr_running = rq->nr_running; | ||
3811 | } | ||
3812 | if (min_cpu_load > load) | 3681 | if (min_cpu_load > load) |
3813 | min_cpu_load = load; | 3682 | min_cpu_load = load; |
3683 | |||
3684 | if (nr_running > max_nr_running) | ||
3685 | max_nr_running = nr_running; | ||
3686 | if (min_nr_running > nr_running) | ||
3687 | min_nr_running = nr_running; | ||
3814 | } | 3688 | } |
3815 | 3689 | ||
3816 | sgs->group_load += load; | 3690 | sgs->group_load += load; |
3817 | sgs->sum_nr_running += rq->nr_running; | 3691 | sgs->sum_nr_running += nr_running; |
3818 | sgs->sum_weighted_load += weighted_cpuload(i); | 3692 | sgs->sum_weighted_load += weighted_cpuload(i); |
3819 | if (idle_cpu(i)) | 3693 | if (idle_cpu(i)) |
3820 | sgs->idle_cpus++; | 3694 | sgs->idle_cpus++; |
@@ -3827,14 +3701,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3827 | * to do the newly idle load balance. | 3701 | * to do the newly idle load balance. |
3828 | */ | 3702 | */ |
3829 | if (local_group) { | 3703 | if (local_group) { |
3830 | if (idle != CPU_NEWLY_IDLE) { | 3704 | if (env->idle != CPU_NEWLY_IDLE) { |
3831 | if (balance_cpu != this_cpu) { | 3705 | if (balance_cpu != env->dst_cpu) { |
3832 | *balance = 0; | 3706 | *balance = 0; |
3833 | return; | 3707 | return; |
3834 | } | 3708 | } |
3835 | update_group_power(sd, this_cpu); | 3709 | update_group_power(env->sd, env->dst_cpu); |
3836 | } else if (time_after_eq(jiffies, group->sgp->next_update)) | 3710 | } else if (time_after_eq(jiffies, group->sgp->next_update)) |
3837 | update_group_power(sd, this_cpu); | 3711 | update_group_power(env->sd, env->dst_cpu); |
3838 | } | 3712 | } |
3839 | 3713 | ||
3840 | /* Adjust by relative CPU power of the group */ | 3714 | /* Adjust by relative CPU power of the group */ |
@@ -3852,13 +3726,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3852 | if (sgs->sum_nr_running) | 3726 | if (sgs->sum_nr_running) |
3853 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 3727 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
3854 | 3728 | ||
3855 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) | 3729 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && |
3730 | (max_nr_running - min_nr_running) > 1) | ||
3856 | sgs->group_imb = 1; | 3731 | sgs->group_imb = 1; |
3857 | 3732 | ||
3858 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, | 3733 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, |
3859 | SCHED_POWER_SCALE); | 3734 | SCHED_POWER_SCALE); |
3860 | if (!sgs->group_capacity) | 3735 | if (!sgs->group_capacity) |
3861 | sgs->group_capacity = fix_small_capacity(sd, group); | 3736 | sgs->group_capacity = fix_small_capacity(env->sd, group); |
3862 | sgs->group_weight = group->group_weight; | 3737 | sgs->group_weight = group->group_weight; |
3863 | 3738 | ||
3864 | if (sgs->group_capacity > sgs->sum_nr_running) | 3739 | if (sgs->group_capacity > sgs->sum_nr_running) |
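The group_imb test above now requires both a load spread of at least one average task and a spread of more than one in per-CPU task counts, where the old code only looked at max_nr_running > 1 on the most loaded CPU. A worked example with invented numbers (two CPUs in the group, one running three tasks of weight 1024 and the other a single such task):

#include <stdio.h>

/* Worked example of the group_imb condition above, with made-up
 * per-CPU loads and task counts. */
int main(void)
{
	unsigned long max_cpu_load = 3072, min_cpu_load = 1024;
	unsigned long max_nr_running = 3, min_nr_running = 1;
	unsigned long sum_weighted_load = 4096, sum_nr_running = 4;
	unsigned long avg_load_per_task = sum_weighted_load / sum_nr_running;
	int group_imb;

	/* Spread in load is at least one average task AND the spread in
	 * task counts is more than one -> flag the group as imbalanced. */
	group_imb = (max_cpu_load - min_cpu_load) >= avg_load_per_task &&
		    (max_nr_running - min_nr_running) > 1;

	printf("avg_load_per_task=%lu group_imb=%d\n",
	       avg_load_per_task, group_imb);	/* 1024, 1 */
	return 0;
}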
@@ -3867,20 +3742,18 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
3867 | 3742 | ||
3868 | /** | 3743 | /** |
3869 | * update_sd_pick_busiest - return 1 on busiest group | 3744 | * update_sd_pick_busiest - return 1 on busiest group |
3870 | * @sd: sched_domain whose statistics are to be checked | 3745 | * @env: The load balancing environment. |
3871 | * @sds: sched_domain statistics | 3746 | * @sds: sched_domain statistics |
3872 | * @sg: sched_group candidate to be checked for being the busiest | 3747 | * @sg: sched_group candidate to be checked for being the busiest |
3873 | * @sgs: sched_group statistics | 3748 | * @sgs: sched_group statistics |
3874 | * @this_cpu: the current cpu | ||
3875 | * | 3749 | * |
3876 | * Determine if @sg is a busier group than the previously selected | 3750 | * Determine if @sg is a busier group than the previously selected |
3877 | * busiest group. | 3751 | * busiest group. |
3878 | */ | 3752 | */ |
3879 | static bool update_sd_pick_busiest(struct sched_domain *sd, | 3753 | static bool update_sd_pick_busiest(struct lb_env *env, |
3880 | struct sd_lb_stats *sds, | 3754 | struct sd_lb_stats *sds, |
3881 | struct sched_group *sg, | 3755 | struct sched_group *sg, |
3882 | struct sg_lb_stats *sgs, | 3756 | struct sg_lb_stats *sgs) |
3883 | int this_cpu) | ||
3884 | { | 3757 | { |
3885 | if (sgs->avg_load <= sds->max_load) | 3758 | if (sgs->avg_load <= sds->max_load) |
3886 | return false; | 3759 | return false; |
@@ -3896,8 +3769,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, | |||
3896 | * numbered CPUs in the group, therefore mark all groups | 3769 | * numbered CPUs in the group, therefore mark all groups |
3897 | * higher than ourself as busy. | 3770 | * higher than ourself as busy. |
3898 | */ | 3771 | */ |
3899 | if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && | 3772 | if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && |
3900 | this_cpu < group_first_cpu(sg)) { | 3773 | env->dst_cpu < group_first_cpu(sg)) { |
3901 | if (!sds->busiest) | 3774 | if (!sds->busiest) |
3902 | return true; | 3775 | return true; |
3903 | 3776 | ||
@@ -3910,35 +3783,32 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, | |||
3910 | 3783 | ||
3911 | /** | 3784 | /** |
3912 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. | 3785 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. |
3913 | * @sd: sched_domain whose statistics are to be updated. | 3786 | * @env: The load balancing environment. |
3914 | * @this_cpu: Cpu for which load balance is currently performed. | ||
3915 | * @idle: Idle status of this_cpu | ||
3916 | * @cpus: Set of cpus considered for load balancing. | 3787 | * @cpus: Set of cpus considered for load balancing. |
3917 | * @balance: Should we balance. | 3788 | * @balance: Should we balance. |
3918 | * @sds: variable to hold the statistics for this sched_domain. | 3789 | * @sds: variable to hold the statistics for this sched_domain. |
3919 | */ | 3790 | */ |
3920 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | 3791 | static inline void update_sd_lb_stats(struct lb_env *env, |
3921 | enum cpu_idle_type idle, const struct cpumask *cpus, | 3792 | const struct cpumask *cpus, |
3922 | int *balance, struct sd_lb_stats *sds) | 3793 | int *balance, struct sd_lb_stats *sds) |
3923 | { | 3794 | { |
3924 | struct sched_domain *child = sd->child; | 3795 | struct sched_domain *child = env->sd->child; |
3925 | struct sched_group *sg = sd->groups; | 3796 | struct sched_group *sg = env->sd->groups; |
3926 | struct sg_lb_stats sgs; | 3797 | struct sg_lb_stats sgs; |
3927 | int load_idx, prefer_sibling = 0; | 3798 | int load_idx, prefer_sibling = 0; |
3928 | 3799 | ||
3929 | if (child && child->flags & SD_PREFER_SIBLING) | 3800 | if (child && child->flags & SD_PREFER_SIBLING) |
3930 | prefer_sibling = 1; | 3801 | prefer_sibling = 1; |
3931 | 3802 | ||
3932 | init_sd_power_savings_stats(sd, sds, idle); | 3803 | load_idx = get_sd_load_idx(env->sd, env->idle); |
3933 | load_idx = get_sd_load_idx(sd, idle); | ||
3934 | 3804 | ||
3935 | do { | 3805 | do { |
3936 | int local_group; | 3806 | int local_group; |
3937 | 3807 | ||
3938 | local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); | 3808 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); |
3939 | memset(&sgs, 0, sizeof(sgs)); | 3809 | memset(&sgs, 0, sizeof(sgs)); |
3940 | update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, | 3810 | update_sg_lb_stats(env, sg, load_idx, local_group, |
3941 | local_group, cpus, balance, &sgs); | 3811 | cpus, balance, &sgs); |
3942 | 3812 | ||
3943 | if (local_group && !(*balance)) | 3813 | if (local_group && !(*balance)) |
3944 | return; | 3814 | return; |
@@ -3966,7 +3836,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3966 | sds->this_load_per_task = sgs.sum_weighted_load; | 3836 | sds->this_load_per_task = sgs.sum_weighted_load; |
3967 | sds->this_has_capacity = sgs.group_has_capacity; | 3837 | sds->this_has_capacity = sgs.group_has_capacity; |
3968 | sds->this_idle_cpus = sgs.idle_cpus; | 3838 | sds->this_idle_cpus = sgs.idle_cpus; |
3969 | } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { | 3839 | } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) { |
3970 | sds->max_load = sgs.avg_load; | 3840 | sds->max_load = sgs.avg_load; |
3971 | sds->busiest = sg; | 3841 | sds->busiest = sg; |
3972 | sds->busiest_nr_running = sgs.sum_nr_running; | 3842 | sds->busiest_nr_running = sgs.sum_nr_running; |
@@ -3978,9 +3848,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3978 | sds->group_imb = sgs.group_imb; | 3848 | sds->group_imb = sgs.group_imb; |
3979 | } | 3849 | } |
3980 | 3850 | ||
3981 | update_sd_power_savings_stats(sg, sds, local_group, &sgs); | ||
3982 | sg = sg->next; | 3851 | sg = sg->next; |
3983 | } while (sg != sd->groups); | 3852 | } while (sg != env->sd->groups); |
3984 | } | 3853 | } |
3985 | 3854 | ||
3986 | /** | 3855 | /** |
@@ -4003,29 +3872,26 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
4003 | * Returns 1 when packing is required and a task should be moved to | 3872 | * Returns 1 when packing is required and a task should be moved to |
4004 | * this CPU. The amount of the imbalance is returned in *imbalance. | 3873 | * this CPU. The amount of the imbalance is returned in *imbalance. |
4005 | * | 3874 | * |
4006 | * @sd: The sched_domain whose packing is to be checked. | 3875 | * @env: The load balancing environment. |
4007 | * @sds: Statistics of the sched_domain which is to be packed | 3876 | * @sds: Statistics of the sched_domain which is to be packed |
4008 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
4009 | * @imbalance: returns amount of imbalanced due to packing. | ||
4010 | */ | 3877 | */ |
4011 | static int check_asym_packing(struct sched_domain *sd, | 3878 | static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) |
4012 | struct sd_lb_stats *sds, | ||
4013 | int this_cpu, unsigned long *imbalance) | ||
4014 | { | 3879 | { |
4015 | int busiest_cpu; | 3880 | int busiest_cpu; |
4016 | 3881 | ||
4017 | if (!(sd->flags & SD_ASYM_PACKING)) | 3882 | if (!(env->sd->flags & SD_ASYM_PACKING)) |
4018 | return 0; | 3883 | return 0; |
4019 | 3884 | ||
4020 | if (!sds->busiest) | 3885 | if (!sds->busiest) |
4021 | return 0; | 3886 | return 0; |
4022 | 3887 | ||
4023 | busiest_cpu = group_first_cpu(sds->busiest); | 3888 | busiest_cpu = group_first_cpu(sds->busiest); |
4024 | if (this_cpu > busiest_cpu) | 3889 | if (env->dst_cpu > busiest_cpu) |
4025 | return 0; | 3890 | return 0; |
4026 | 3891 | ||
4027 | *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, | 3892 | env->imbalance = DIV_ROUND_CLOSEST( |
4028 | SCHED_POWER_SCALE); | 3893 | sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); |
3894 | |||
4029 | return 1; | 3895 | return 1; |
4030 | } | 3896 | } |
4031 | 3897 | ||
@@ -4033,12 +3899,11 @@ static int check_asym_packing(struct sched_domain *sd, | |||
4033 | * fix_small_imbalance - Calculate the minor imbalance that exists | 3899 | * fix_small_imbalance - Calculate the minor imbalance that exists |
4034 | * amongst the groups of a sched_domain, during | 3900 | * amongst the groups of a sched_domain, during |
4035 | * load balancing. | 3901 | * load balancing. |
3902 | * @env: The load balancing environment. | ||
4036 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. | 3903 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. |
4037 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
4038 | * @imbalance: Variable to store the imbalance. | ||
4039 | */ | 3904 | */ |
4040 | static inline void fix_small_imbalance(struct sd_lb_stats *sds, | 3905 | static inline |
4041 | int this_cpu, unsigned long *imbalance) | 3906 | void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
4042 | { | 3907 | { |
4043 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | 3908 | unsigned long tmp, pwr_now = 0, pwr_move = 0; |
4044 | unsigned int imbn = 2; | 3909 | unsigned int imbn = 2; |
@@ -4049,9 +3914,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
4049 | if (sds->busiest_load_per_task > | 3914 | if (sds->busiest_load_per_task > |
4050 | sds->this_load_per_task) | 3915 | sds->this_load_per_task) |
4051 | imbn = 1; | 3916 | imbn = 1; |
4052 | } else | 3917 | } else { |
4053 | sds->this_load_per_task = | 3918 | sds->this_load_per_task = |
4054 | cpu_avg_load_per_task(this_cpu); | 3919 | cpu_avg_load_per_task(env->dst_cpu); |
3920 | } | ||
4055 | 3921 | ||
4056 | scaled_busy_load_per_task = sds->busiest_load_per_task | 3922 | scaled_busy_load_per_task = sds->busiest_load_per_task |
4057 | * SCHED_POWER_SCALE; | 3923 | * SCHED_POWER_SCALE; |
@@ -4059,7 +3925,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
4059 | 3925 | ||
4060 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= | 3926 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= |
4061 | (scaled_busy_load_per_task * imbn)) { | 3927 | (scaled_busy_load_per_task * imbn)) { |
4062 | *imbalance = sds->busiest_load_per_task; | 3928 | env->imbalance = sds->busiest_load_per_task; |
4063 | return; | 3929 | return; |
4064 | } | 3930 | } |
4065 | 3931 | ||
@@ -4096,18 +3962,16 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
4096 | 3962 | ||
4097 | /* Move if we gain throughput */ | 3963 | /* Move if we gain throughput */ |
4098 | if (pwr_move > pwr_now) | 3964 | if (pwr_move > pwr_now) |
4099 | *imbalance = sds->busiest_load_per_task; | 3965 | env->imbalance = sds->busiest_load_per_task; |
4100 | } | 3966 | } |
4101 | 3967 | ||
4102 | /** | 3968 | /** |
4103 | * calculate_imbalance - Calculate the amount of imbalance present within the | 3969 | * calculate_imbalance - Calculate the amount of imbalance present within the |
4104 | * groups of a given sched_domain during load balance. | 3970 | * groups of a given sched_domain during load balance. |
3971 | * @env: load balance environment | ||
4105 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. | 3972 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. |
4106 | * @this_cpu: Cpu for which currently load balance is being performed. | ||
4107 | * @imbalance: The variable to store the imbalance. | ||
4108 | */ | 3973 | */ |
4109 | static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | 3974 | static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
4110 | unsigned long *imbalance) | ||
4111 | { | 3975 | { |
4112 | unsigned long max_pull, load_above_capacity = ~0UL; | 3976 | unsigned long max_pull, load_above_capacity = ~0UL; |
4113 | 3977 | ||
@@ -4123,8 +3987,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4123 | * its cpu_power, while calculating max_load..) | 3987 | * its cpu_power, while calculating max_load..) |
4124 | */ | 3988 | */ |
4125 | if (sds->max_load < sds->avg_load) { | 3989 | if (sds->max_load < sds->avg_load) { |
4126 | *imbalance = 0; | 3990 | env->imbalance = 0; |
4127 | return fix_small_imbalance(sds, this_cpu, imbalance); | 3991 | return fix_small_imbalance(env, sds); |
4128 | } | 3992 | } |
4129 | 3993 | ||
4130 | if (!sds->group_imb) { | 3994 | if (!sds->group_imb) { |
@@ -4152,7 +4016,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4152 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); | 4016 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); |
4153 | 4017 | ||
4154 | /* How much load to actually move to equalise the imbalance */ | 4018 | /* How much load to actually move to equalise the imbalance */ |
4155 | *imbalance = min(max_pull * sds->busiest->sgp->power, | 4019 | env->imbalance = min(max_pull * sds->busiest->sgp->power, |
4156 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) | 4020 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) |
4157 | / SCHED_POWER_SCALE; | 4021 | / SCHED_POWER_SCALE; |
4158 | 4022 | ||
@@ -4162,8 +4026,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4162 | * a think about bumping its value to force at least one task to be | 4026 | * a think about bumping its value to force at least one task to be |
4163 | * moved | 4027 | * moved |
4164 | */ | 4028 | */ |
4165 | if (*imbalance < sds->busiest_load_per_task) | 4029 | if (env->imbalance < sds->busiest_load_per_task) |
4166 | return fix_small_imbalance(sds, this_cpu, imbalance); | 4030 | return fix_small_imbalance(env, sds); |
4167 | 4031 | ||
4168 | } | 4032 | } |
4169 | 4033 | ||
@@ -4179,11 +4043,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4179 | * Also calculates the amount of weighted load which should be moved | 4043 | * Also calculates the amount of weighted load which should be moved |
4180 | * to restore balance. | 4044 | * to restore balance. |
4181 | * | 4045 | * |
4182 | * @sd: The sched_domain whose busiest group is to be returned. | 4046 | * @env: The load balancing environment. |
4183 | * @this_cpu: The cpu for which load balancing is currently being performed. | ||
4184 | * @imbalance: Variable which stores amount of weighted load which should | ||
4185 | * be moved to restore balance/put a group to idle. | ||
4186 | * @idle: The idle status of this_cpu. | ||
4187 | * @cpus: The set of CPUs under consideration for load-balancing. | 4047 | * @cpus: The set of CPUs under consideration for load-balancing. |
4188 | * @balance: Pointer to a variable indicating if this_cpu | 4048 | * @balance: Pointer to a variable indicating if this_cpu |
4189 | * is the appropriate cpu to perform load balancing at this_level. | 4049 | * is the appropriate cpu to perform load balancing at this_level. |
@@ -4194,9 +4054,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
4194 | * put to idle by rebalancing its tasks onto our group. | 4054 | * put to idle by rebalancing its tasks onto our group. |
4195 | */ | 4055 | */ |
4196 | static struct sched_group * | 4056 | static struct sched_group * |
4197 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 4057 | find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) |
4198 | unsigned long *imbalance, enum cpu_idle_type idle, | ||
4199 | const struct cpumask *cpus, int *balance) | ||
4200 | { | 4058 | { |
4201 | struct sd_lb_stats sds; | 4059 | struct sd_lb_stats sds; |
4202 | 4060 | ||
@@ -4206,7 +4064,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4206 | * Compute the various statistics relevant for load balancing at | 4064 | * Compute the various statistics relevant for load balancing at |

4207 | * this level. | 4065 | * this level. |
4208 | */ | 4066 | */ |
4209 | update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); | 4067 | update_sd_lb_stats(env, cpus, balance, &sds); |
4210 | 4068 | ||
4211 | /* | 4069 | /* |
4212 | * this_cpu is not the appropriate cpu to perform load balancing at | 4070 | * this_cpu is not the appropriate cpu to perform load balancing at |
@@ -4215,8 +4073,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4215 | if (!(*balance)) | 4073 | if (!(*balance)) |
4216 | goto ret; | 4074 | goto ret; |
4217 | 4075 | ||
4218 | if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && | 4076 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && |
4219 | check_asym_packing(sd, &sds, this_cpu, imbalance)) | 4077 | check_asym_packing(env, &sds)) |
4220 | return sds.busiest; | 4078 | return sds.busiest; |
4221 | 4079 | ||
4222 | /* There is no busy sibling group to pull tasks from */ | 4080 | /* There is no busy sibling group to pull tasks from */ |
@@ -4234,7 +4092,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4234 | goto force_balance; | 4092 | goto force_balance; |
4235 | 4093 | ||
4236 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 4094 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
4237 | if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && | 4095 | if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && |
4238 | !sds.busiest_has_capacity) | 4096 | !sds.busiest_has_capacity) |
4239 | goto force_balance; | 4097 | goto force_balance; |
4240 | 4098 | ||
@@ -4252,7 +4110,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4252 | if (sds.this_load >= sds.avg_load) | 4110 | if (sds.this_load >= sds.avg_load) |
4253 | goto out_balanced; | 4111 | goto out_balanced; |
4254 | 4112 | ||
4255 | if (idle == CPU_IDLE) { | 4113 | if (env->idle == CPU_IDLE) { |
4256 | /* | 4114 | /* |
4257 | * This cpu is idle. If the busiest group load doesn't | 4115 | * This cpu is idle. If the busiest group load doesn't |
4258 | * have more tasks than the number of available cpu's and | 4116 | * have more tasks than the number of available cpu's and |
@@ -4267,34 +4125,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
4267 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use | 4125 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use |
4268 | * imbalance_pct to be conservative. | 4126 | * imbalance_pct to be conservative. |
4269 | */ | 4127 | */ |
4270 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | 4128 | if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) |
4271 | goto out_balanced; | 4129 | goto out_balanced; |
4272 | } | 4130 | } |
4273 | 4131 | ||
4274 | force_balance: | 4132 | force_balance: |
4275 | /* Looks like there is an imbalance. Compute it */ | 4133 | /* Looks like there is an imbalance. Compute it */ |
4276 | calculate_imbalance(&sds, this_cpu, imbalance); | 4134 | calculate_imbalance(env, &sds); |
4277 | return sds.busiest; | 4135 | return sds.busiest; |
4278 | 4136 | ||
4279 | out_balanced: | 4137 | out_balanced: |
4280 | /* | ||
4281 | * There is no obvious imbalance. But check if we can do some balancing | ||
4282 | * to save power. | ||
4283 | */ | ||
4284 | if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) | ||
4285 | return sds.busiest; | ||
4286 | ret: | 4138 | ret: |
4287 | *imbalance = 0; | 4139 | env->imbalance = 0; |
4288 | return NULL; | 4140 | return NULL; |
4289 | } | 4141 | } |
4290 | 4142 | ||
4291 | /* | 4143 | /* |
4292 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 4144 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
4293 | */ | 4145 | */ |
4294 | static struct rq * | 4146 | static struct rq *find_busiest_queue(struct lb_env *env, |
4295 | find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | 4147 | struct sched_group *group, |
4296 | enum cpu_idle_type idle, unsigned long imbalance, | 4148 | const struct cpumask *cpus) |
4297 | const struct cpumask *cpus) | ||
4298 | { | 4149 | { |
4299 | struct rq *busiest = NULL, *rq; | 4150 | struct rq *busiest = NULL, *rq; |
4300 | unsigned long max_load = 0; | 4151 | unsigned long max_load = 0; |
@@ -4307,7 +4158,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
4307 | unsigned long wl; | 4158 | unsigned long wl; |
4308 | 4159 | ||
4309 | if (!capacity) | 4160 | if (!capacity) |
4310 | capacity = fix_small_capacity(sd, group); | 4161 | capacity = fix_small_capacity(env->sd, group); |
4311 | 4162 | ||
4312 | if (!cpumask_test_cpu(i, cpus)) | 4163 | if (!cpumask_test_cpu(i, cpus)) |
4313 | continue; | 4164 | continue; |
@@ -4319,7 +4170,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
4319 | * When comparing with imbalance, use weighted_cpuload() | 4170 | * When comparing with imbalance, use weighted_cpuload() |
4320 | * which is not scaled with the cpu power. | 4171 | * which is not scaled with the cpu power. |
4321 | */ | 4172 | */ |
4322 | if (capacity && rq->nr_running == 1 && wl > imbalance) | 4173 | if (capacity && rq->nr_running == 1 && wl > env->imbalance) |
4323 | continue; | 4174 | continue; |
4324 | 4175 | ||
4325 | /* | 4176 | /* |
@@ -4348,40 +4199,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
4348 | /* Working cpumask for load_balance and load_balance_newidle. */ | 4199 | /* Working cpumask for load_balance and load_balance_newidle. */ |
4349 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 4200 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
4350 | 4201 | ||
4351 | static int need_active_balance(struct sched_domain *sd, int idle, | 4202 | static int need_active_balance(struct lb_env *env) |
4352 | int busiest_cpu, int this_cpu) | ||
4353 | { | 4203 | { |
4354 | if (idle == CPU_NEWLY_IDLE) { | 4204 | struct sched_domain *sd = env->sd; |
4205 | |||
4206 | if (env->idle == CPU_NEWLY_IDLE) { | ||
4355 | 4207 | ||
4356 | /* | 4208 | /* |
4357 | * ASYM_PACKING needs to force migrate tasks from busy but | 4209 | * ASYM_PACKING needs to force migrate tasks from busy but |
4358 | * higher numbered CPUs in order to pack all tasks in the | 4210 | * higher numbered CPUs in order to pack all tasks in the |
4359 | * lowest numbered CPUs. | 4211 | * lowest numbered CPUs. |
4360 | */ | 4212 | */ |
4361 | if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) | 4213 | if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu) |
4362 | return 1; | 4214 | return 1; |
4363 | |||
4364 | /* | ||
4365 | * The only task running in a non-idle cpu can be moved to this | ||
4366 | * cpu in an attempt to completely freeup the other CPU | ||
4367 | * package. | ||
4368 | * | ||
4369 | * The package power saving logic comes from | ||
4370 | * find_busiest_group(). If there are no imbalance, then | ||
4371 | * f_b_g() will return NULL. However when sched_mc={1,2} then | ||
4372 | * f_b_g() will select a group from which a running task may be | ||
4373 | * pulled to this cpu in order to make the other package idle. | ||
4374 | * If there is no opportunity to make a package idle and if | ||
4375 | * there are no imbalance, then f_b_g() will return NULL and no | ||
4376 | * action will be taken in load_balance_newidle(). | ||
4377 | * | ||
4378 | * Under normal task pull operation due to imbalance, there | ||
4379 | * will be more than one task in the source run queue and | ||
4380 | * move_tasks() will succeed. ld_moved will be true and this | ||
4381 | * active balance code will not be triggered. | ||
4382 | */ | ||
4383 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) | ||
4384 | return 0; | ||
4385 | } | 4215 | } |
4386 | 4216 | ||
4387 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | 4217 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); |
@@ -4399,7 +4229,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4399 | { | 4229 | { |
4400 | int ld_moved, active_balance = 0; | 4230 | int ld_moved, active_balance = 0; |
4401 | struct sched_group *group; | 4231 | struct sched_group *group; |
4402 | unsigned long imbalance; | ||
4403 | struct rq *busiest; | 4232 | struct rq *busiest; |
4404 | unsigned long flags; | 4233 | unsigned long flags; |
4405 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | 4234 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); |
@@ -4417,8 +4246,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4417 | schedstat_inc(sd, lb_count[idle]); | 4246 | schedstat_inc(sd, lb_count[idle]); |
4418 | 4247 | ||
4419 | redo: | 4248 | redo: |
4420 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, | 4249 | group = find_busiest_group(&env, cpus, balance); |
4421 | cpus, balance); | ||
4422 | 4250 | ||
4423 | if (*balance == 0) | 4251 | if (*balance == 0) |
4424 | goto out_balanced; | 4252 | goto out_balanced; |
@@ -4428,7 +4256,7 @@ redo: | |||
4428 | goto out_balanced; | 4256 | goto out_balanced; |
4429 | } | 4257 | } |
4430 | 4258 | ||
4431 | busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); | 4259 | busiest = find_busiest_queue(&env, group, cpus); |
4432 | if (!busiest) { | 4260 | if (!busiest) { |
4433 | schedstat_inc(sd, lb_nobusyq[idle]); | 4261 | schedstat_inc(sd, lb_nobusyq[idle]); |
4434 | goto out_balanced; | 4262 | goto out_balanced; |
@@ -4436,7 +4264,7 @@ redo: | |||
4436 | 4264 | ||
4437 | BUG_ON(busiest == this_rq); | 4265 | BUG_ON(busiest == this_rq); |
4438 | 4266 | ||
4439 | schedstat_add(sd, lb_imbalance[idle], imbalance); | 4267 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
4440 | 4268 | ||
4441 | ld_moved = 0; | 4269 | ld_moved = 0; |
4442 | if (busiest->nr_running > 1) { | 4270 | if (busiest->nr_running > 1) { |
@@ -4447,10 +4275,9 @@ redo: | |||
4447 | * correctly treated as an imbalance. | 4275 | * correctly treated as an imbalance. |
4448 | */ | 4276 | */ |
4449 | env.flags |= LBF_ALL_PINNED; | 4277 | env.flags |= LBF_ALL_PINNED; |
4450 | env.load_move = imbalance; | 4278 | env.src_cpu = busiest->cpu; |
4451 | env.src_cpu = busiest->cpu; | 4279 | env.src_rq = busiest; |
4452 | env.src_rq = busiest; | 4280 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
4453 | env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running); | ||
4454 | 4281 | ||
4455 | more_balance: | 4282 | more_balance: |
4456 | local_irq_save(flags); | 4283 | local_irq_save(flags); |
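The load_balance() hunks above show the shape of the whole refactor in this file: instead of threading sd, this_cpu, idle and a separate imbalance variable through every helper, the caller fills one struct lb_env and the helpers read and update it, so adding a new parameter no longer touches every prototype. A hedged, self-contained miniature of that pattern with invented field values and helper bodies:

#include <stdio.h>

/* Invented miniature of the lb_env pattern: the caller fills one
 * context struct and the helpers read and update it, instead of each
 * taking sd/this_cpu/idle/imbalance as separate arguments. */
struct toy_env {
	int dst_cpu;
	int src_cpu;
	long imbalance;
};

static void toy_find_imbalance(struct toy_env *env)
{
	env->imbalance = 400;		/* pretend calculate_imbalance() result */
}

static long toy_move_tasks(struct toy_env *env)
{
	long moved = 250;		/* pretend this much weighted load was pulled */

	env->imbalance -= moved;
	return moved;
}

int main(void)
{
	struct toy_env env = { .dst_cpu = 0, .src_cpu = 3 };
	long moved;

	toy_find_imbalance(&env);
	moved = toy_move_tasks(&env);
	printf("moved %ld from cpu%d to cpu%d, imbalance left %ld\n",
	       moved, env.src_cpu, env.dst_cpu, env.imbalance);
	return 0;
}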
@@ -4492,7 +4319,7 @@ more_balance: | |||
4492 | if (idle != CPU_NEWLY_IDLE) | 4319 | if (idle != CPU_NEWLY_IDLE) |
4493 | sd->nr_balance_failed++; | 4320 | sd->nr_balance_failed++; |
4494 | 4321 | ||
4495 | if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { | 4322 | if (need_active_balance(&env)) { |
4496 | raw_spin_lock_irqsave(&busiest->lock, flags); | 4323 | raw_spin_lock_irqsave(&busiest->lock, flags); |
4497 | 4324 | ||
4498 | /* don't kick the active_load_balance_cpu_stop, | 4325 | /* don't kick the active_load_balance_cpu_stop, |
@@ -4519,10 +4346,11 @@ more_balance: | |||
4519 | } | 4346 | } |
4520 | raw_spin_unlock_irqrestore(&busiest->lock, flags); | 4347 | raw_spin_unlock_irqrestore(&busiest->lock, flags); |
4521 | 4348 | ||
4522 | if (active_balance) | 4349 | if (active_balance) { |
4523 | stop_one_cpu_nowait(cpu_of(busiest), | 4350 | stop_one_cpu_nowait(cpu_of(busiest), |
4524 | active_load_balance_cpu_stop, busiest, | 4351 | active_load_balance_cpu_stop, busiest, |
4525 | &busiest->active_balance_work); | 4352 | &busiest->active_balance_work); |
4353 | } | ||
4526 | 4354 | ||
4527 | /* | 4355 | /* |
4528 | * We've kicked active balancing, reset the failure | 4356 | * We've kicked active balancing, reset the failure |
@@ -4703,104 +4531,15 @@ static struct { | |||
4703 | unsigned long next_balance; /* in jiffy units */ | 4531 | unsigned long next_balance; /* in jiffy units */ |
4704 | } nohz ____cacheline_aligned; | 4532 | } nohz ____cacheline_aligned; |
4705 | 4533 | ||
4706 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 4534 | static inline int find_new_ilb(int call_cpu) |
4707 | /** | ||
4708 | * lowest_flag_domain - Return lowest sched_domain containing flag. | ||
4709 | * @cpu: The cpu whose lowest level of sched domain is to | ||
4710 | * be returned. | ||
4711 | * @flag: The flag to check for the lowest sched_domain | ||
4712 | * for the given cpu. | ||
4713 | * | ||
4714 | * Returns the lowest sched_domain of a cpu which contains the given flag. | ||
4715 | */ | ||
4716 | static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | ||
4717 | { | ||
4718 | struct sched_domain *sd; | ||
4719 | |||
4720 | for_each_domain(cpu, sd) | ||
4721 | if (sd->flags & flag) | ||
4722 | break; | ||
4723 | |||
4724 | return sd; | ||
4725 | } | ||
4726 | |||
4727 | /** | ||
4728 | * for_each_flag_domain - Iterates over sched_domains containing the flag. | ||
4729 | * @cpu: The cpu whose domains we're iterating over. | ||
4730 | * @sd: variable holding the value of the power_savings_sd | ||
4731 | * for cpu. | ||
4732 | * @flag: The flag to filter the sched_domains to be iterated. | ||
4733 | * | ||
4734 | * Iterates over all the scheduler domains for a given cpu that has the 'flag' | ||
4735 | * set, starting from the lowest sched_domain to the highest. | ||
4736 | */ | ||
4737 | #define for_each_flag_domain(cpu, sd, flag) \ | ||
4738 | for (sd = lowest_flag_domain(cpu, flag); \ | ||
4739 | (sd && (sd->flags & flag)); sd = sd->parent) | ||
4740 | |||
4741 | /** | ||
4742 | * find_new_ilb - Finds the optimum idle load balancer for nomination. | ||
4743 | * @cpu: The cpu which is nominating a new idle_load_balancer. | ||
4744 | * | ||
4745 | * Returns: Returns the id of the idle load balancer if it exists, | ||
4746 | * Else, returns >= nr_cpu_ids. | ||
4747 | * | ||
4748 | * This algorithm picks the idle load balancer such that it belongs to a | ||
4749 | * semi-idle powersavings sched_domain. The idea is to try and avoid | ||
4750 | * completely idle packages/cores just for the purpose of idle load balancing | ||
4751 | * when there are other idle cpu's which are better suited for that job. | ||
4752 | */ | ||
4753 | static int find_new_ilb(int cpu) | ||
4754 | { | 4535 | { |
4755 | int ilb = cpumask_first(nohz.idle_cpus_mask); | 4536 | int ilb = cpumask_first(nohz.idle_cpus_mask); |
4756 | struct sched_group *ilbg; | ||
4757 | struct sched_domain *sd; | ||
4758 | |||
4759 | /* | ||
4760 | * Have idle load balancer selection from semi-idle packages only | ||
4761 | * when power-aware load balancing is enabled | ||
4762 | */ | ||
4763 | if (!(sched_smt_power_savings || sched_mc_power_savings)) | ||
4764 | goto out_done; | ||
4765 | |||
4766 | /* | ||
4767 | * Optimize for the case when we have no idle CPUs or only one | ||
4768 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | ||
4769 | */ | ||
4770 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) | ||
4771 | goto out_done; | ||
4772 | 4537 | ||
4773 | rcu_read_lock(); | ||
4774 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | ||
4775 | ilbg = sd->groups; | ||
4776 | |||
4777 | do { | ||
4778 | if (ilbg->group_weight != | ||
4779 | atomic_read(&ilbg->sgp->nr_busy_cpus)) { | ||
4780 | ilb = cpumask_first_and(nohz.idle_cpus_mask, | ||
4781 | sched_group_cpus(ilbg)); | ||
4782 | goto unlock; | ||
4783 | } | ||
4784 | |||
4785 | ilbg = ilbg->next; | ||
4786 | |||
4787 | } while (ilbg != sd->groups); | ||
4788 | } | ||
4789 | unlock: | ||
4790 | rcu_read_unlock(); | ||
4791 | |||
4792 | out_done: | ||
4793 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) | 4538 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) |
4794 | return ilb; | 4539 | return ilb; |
4795 | 4540 | ||
4796 | return nr_cpu_ids; | 4541 | return nr_cpu_ids; |
4797 | } | 4542 | } |
4798 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | ||
4799 | static inline int find_new_ilb(int call_cpu) | ||
4800 | { | ||
4801 | return nr_cpu_ids; | ||
4802 | } | ||
4803 | #endif | ||
4804 | 4543 | ||
4805 | /* | 4544 | /* |
4806 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the | 4545 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the |
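With the power-savings domain walk removed, find_new_ilb() above reduces to taking the first CPU in nohz.idle_cpus_mask and confirming it is still idle. A hedged userspace analogue of that cpumask_first() step, on a small invented bitmask of idle CPUs:

#include <stdio.h>

#define TOY_NR_CPUS 8

/* Bit i set means cpu i is in the toy "nohz idle" mask (invented). */
static unsigned int idle_mask = 0x34;	/* cpus 2, 4 and 5 idle */

static int toy_find_new_ilb(void)
{
	for (int cpu = 0; cpu < TOY_NR_CPUS; cpu++)
		if (idle_mask & (1u << cpu))
			return cpu;	/* analogue of cpumask_first() */

	return TOY_NR_CPUS;		/* analogue of returning nr_cpu_ids */
}

int main(void)
{
	printf("idle load balancer = cpu%d\n", toy_find_new_ilb());	/* cpu2 */
	return 0;
}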
@@ -5023,7 +4762,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
5023 | 4762 | ||
5024 | raw_spin_lock_irq(&this_rq->lock); | 4763 | raw_spin_lock_irq(&this_rq->lock); |
5025 | update_rq_clock(this_rq); | 4764 | update_rq_clock(this_rq); |
5026 | update_cpu_load(this_rq); | 4765 | update_idle_cpu_load(this_rq); |
5027 | raw_spin_unlock_irq(&this_rq->lock); | 4766 | raw_spin_unlock_irq(&this_rq->lock); |
5028 | 4767 | ||
5029 | rebalance_domains(balance_cpu, CPU_IDLE); | 4768 | rebalance_domains(balance_cpu, CPU_IDLE); |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 91b4c957f289..b6baf370cae9 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * idle-task scheduling class. | 4 | * idle-task scheduling class. |
5 | * | 5 | * |
6 | * (NOTE: these are not related to SCHED_IDLE tasks which are | 6 | * (NOTE: these are not related to SCHED_IDLE tasks which are |
7 | * handled in sched_fair.c) | 7 | * handled in sched/fair.c) |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #ifdef CONFIG_SMP | 10 | #ifdef CONFIG_SMP |
@@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl | |||
25 | static struct task_struct *pick_next_task_idle(struct rq *rq) | 25 | static struct task_struct *pick_next_task_idle(struct rq *rq) |
26 | { | 26 | { |
27 | schedstat_inc(rq, sched_goidle); | 27 | schedstat_inc(rq, sched_goidle); |
28 | calc_load_account_idle(rq); | ||
29 | return rq->idle; | 28 | return rq->idle; |
30 | } | 29 | } |
31 | 30 | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 44af55e6d5d0..573e1ca01102 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq) | |||
274 | 274 | ||
275 | static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | 275 | static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) |
276 | { | 276 | { |
277 | struct task_struct *p; | ||
278 | |||
277 | if (!rt_entity_is_task(rt_se)) | 279 | if (!rt_entity_is_task(rt_se)) |
278 | return; | 280 | return; |
279 | 281 | ||
282 | p = rt_task_of(rt_se); | ||
280 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; | 283 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; |
281 | 284 | ||
282 | rt_rq->rt_nr_total++; | 285 | rt_rq->rt_nr_total++; |
283 | if (rt_se->nr_cpus_allowed > 1) | 286 | if (p->nr_cpus_allowed > 1) |
284 | rt_rq->rt_nr_migratory++; | 287 | rt_rq->rt_nr_migratory++; |
285 | 288 | ||
286 | update_rt_migration(rt_rq); | 289 | update_rt_migration(rt_rq); |
@@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
288 | 291 | ||
289 | static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | 292 | static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) |
290 | { | 293 | { |
294 | struct task_struct *p; | ||
295 | |||
291 | if (!rt_entity_is_task(rt_se)) | 296 | if (!rt_entity_is_task(rt_se)) |
292 | return; | 297 | return; |
293 | 298 | ||
299 | p = rt_task_of(rt_se); | ||
294 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; | 300 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; |
295 | 301 | ||
296 | rt_rq->rt_nr_total--; | 302 | rt_rq->rt_nr_total--; |
297 | if (rt_se->nr_cpus_allowed > 1) | 303 | if (p->nr_cpus_allowed > 1) |
298 | rt_rq->rt_nr_migratory--; | 304 | rt_rq->rt_nr_migratory--; |
299 | 305 | ||
300 | update_rt_migration(rt_rq); | 306 | update_rt_migration(rt_rq); |
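The rt.c hunks above follow from nr_cpus_allowed moving out of struct sched_rt_entity into struct task_struct, so the migration accounting must first recover the owning task from the embedded entity via rt_task_of(), which is the usual container_of() arithmetic. A hedged, self-contained sketch of that idiom with toy structures standing in for the kernel ones:

#include <stdio.h>
#include <stddef.h>

/* Toy versions of the embedded-entity layout; the field names mimic
 * the kernel ones but the structs are invented for illustration. */
struct toy_rt_entity {
	int on_list;
};

struct toy_task {
	int nr_cpus_allowed;		/* now lives on the task, not the entity */
	struct toy_rt_entity rt;	/* embedded scheduling entity */
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static struct toy_task *toy_rt_task_of(struct toy_rt_entity *rt_se)
{
	return container_of(rt_se, struct toy_task, rt);
}

int main(void)
{
	struct toy_task t = { .nr_cpus_allowed = 4 };
	struct toy_rt_entity *rt_se = &t.rt;

	/* Recover the task from the entity, then read the migrated field. */
	printf("nr_cpus_allowed = %d\n",
	       toy_rt_task_of(rt_se)->nr_cpus_allowed);
	return 0;
}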
@@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
1161 | 1167 | ||
1162 | enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); | 1168 | enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); |
1163 | 1169 | ||
1164 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) | 1170 | if (!task_current(rq, p) && p->nr_cpus_allowed > 1) |
1165 | enqueue_pushable_task(rq, p); | 1171 | enqueue_pushable_task(rq, p); |
1166 | 1172 | ||
1167 | inc_nr_running(rq); | 1173 | inc_nr_running(rq); |
@@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1225 | 1231 | ||
1226 | cpu = task_cpu(p); | 1232 | cpu = task_cpu(p); |
1227 | 1233 | ||
1228 | if (p->rt.nr_cpus_allowed == 1) | 1234 | if (p->nr_cpus_allowed == 1) |
1229 | goto out; | 1235 | goto out; |
1230 | 1236 | ||
1231 | /* For anything but wake ups, just return the task_cpu */ | 1237 | /* For anything but wake ups, just return the task_cpu */ |
@@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1260 | * will have to sort it out. | 1266 | * will have to sort it out. |
1261 | */ | 1267 | */ |
1262 | if (curr && unlikely(rt_task(curr)) && | 1268 | if (curr && unlikely(rt_task(curr)) && |
1263 | (curr->rt.nr_cpus_allowed < 2 || | 1269 | (curr->nr_cpus_allowed < 2 || |
1264 | curr->prio <= p->prio) && | 1270 | curr->prio <= p->prio) && |
1265 | (p->rt.nr_cpus_allowed > 1)) { | 1271 | (p->nr_cpus_allowed > 1)) { |
1266 | int target = find_lowest_rq(p); | 1272 | int target = find_lowest_rq(p); |
1267 | 1273 | ||
1268 | if (target != -1) | 1274 | if (target != -1) |
@@ -1276,10 +1282,10 @@ out: | |||
1276 | 1282 | ||
1277 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | 1283 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) |
1278 | { | 1284 | { |
1279 | if (rq->curr->rt.nr_cpus_allowed == 1) | 1285 | if (rq->curr->nr_cpus_allowed == 1) |
1280 | return; | 1286 | return; |
1281 | 1287 | ||
1282 | if (p->rt.nr_cpus_allowed != 1 | 1288 | if (p->nr_cpus_allowed != 1 |
1283 | && cpupri_find(&rq->rd->cpupri, p, NULL)) | 1289 | && cpupri_find(&rq->rd->cpupri, p, NULL)) |
1284 | return; | 1290 | return; |
1285 | 1291 | ||
@@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
1395 | * The previous task needs to be made eligible for pushing | 1401 | * The previous task needs to be made eligible for pushing |
1396 | * if it is still active | 1402 | * if it is still active |
1397 | */ | 1403 | */ |
1398 | if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) | 1404 | if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) |
1399 | enqueue_pushable_task(rq, p); | 1405 | enqueue_pushable_task(rq, p); |
1400 | } | 1406 | } |
1401 | 1407 | ||
@@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | |||
1408 | { | 1414 | { |
1409 | if (!task_running(rq, p) && | 1415 | if (!task_running(rq, p) && |
1410 | (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && | 1416 | (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && |
1411 | (p->rt.nr_cpus_allowed > 1)) | 1417 | (p->nr_cpus_allowed > 1)) |
1412 | return 1; | 1418 | return 1; |
1413 | return 0; | 1419 | return 0; |
1414 | } | 1420 | } |
@@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task) | |||
1464 | if (unlikely(!lowest_mask)) | 1470 | if (unlikely(!lowest_mask)) |
1465 | return -1; | 1471 | return -1; |
1466 | 1472 | ||
1467 | if (task->rt.nr_cpus_allowed == 1) | 1473 | if (task->nr_cpus_allowed == 1) |
1468 | return -1; /* No other targets possible */ | 1474 | return -1; /* No other targets possible */ |
1469 | 1475 | ||
1470 | if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) | 1476 | if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) |
@@ -1556,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
1556 | task_running(rq, task) || | 1562 | task_running(rq, task) || |
1557 | !task->on_rq)) { | 1563 | !task->on_rq)) { |
1558 | 1564 | ||
1559 | raw_spin_unlock(&lowest_rq->lock); | 1565 | double_unlock_balance(rq, lowest_rq); |
1560 | lowest_rq = NULL; | 1566 | lowest_rq = NULL; |
1561 | break; | 1567 | break; |
1562 | } | 1568 | } |
@@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) | |||
1586 | 1592 | ||
1587 | BUG_ON(rq->cpu != task_cpu(p)); | 1593 | BUG_ON(rq->cpu != task_cpu(p)); |
1588 | BUG_ON(task_current(rq, p)); | 1594 | BUG_ON(task_current(rq, p)); |
1589 | BUG_ON(p->rt.nr_cpus_allowed <= 1); | 1595 | BUG_ON(p->nr_cpus_allowed <= 1); |
1590 | 1596 | ||
1591 | BUG_ON(!p->on_rq); | 1597 | BUG_ON(!p->on_rq); |
1592 | BUG_ON(!rt_task(p)); | 1598 | BUG_ON(!rt_task(p)); |
@@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
1793 | if (!task_running(rq, p) && | 1799 | if (!task_running(rq, p) && |
1794 | !test_tsk_need_resched(rq->curr) && | 1800 | !test_tsk_need_resched(rq->curr) && |
1795 | has_pushable_tasks(rq) && | 1801 | has_pushable_tasks(rq) && |
1796 | p->rt.nr_cpus_allowed > 1 && | 1802 | p->nr_cpus_allowed > 1 && |
1797 | rt_task(rq->curr) && | 1803 | rt_task(rq->curr) && |
1798 | (rq->curr->rt.nr_cpus_allowed < 2 || | 1804 | (rq->curr->nr_cpus_allowed < 2 || |
1799 | rq->curr->prio <= p->prio)) | 1805 | rq->curr->prio <= p->prio)) |
1800 | push_rt_tasks(rq); | 1806 | push_rt_tasks(rq); |
1801 | } | 1807 | } |
@@ -1803,44 +1809,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
1803 | static void set_cpus_allowed_rt(struct task_struct *p, | 1809 | static void set_cpus_allowed_rt(struct task_struct *p, |
1804 | const struct cpumask *new_mask) | 1810 | const struct cpumask *new_mask) |
1805 | { | 1811 | { |
1806 | int weight = cpumask_weight(new_mask); | 1812 | struct rq *rq; |
1813 | int weight; | ||
1807 | 1814 | ||
1808 | BUG_ON(!rt_task(p)); | 1815 | BUG_ON(!rt_task(p)); |
1809 | 1816 | ||
1810 | /* | 1817 | if (!p->on_rq) |
1811 | * Update the migration status of the RQ if we have an RT task | 1818 | return; |
1812 | * which is running AND changing its weight value. | ||
1813 | */ | ||
1814 | if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { | ||
1815 | struct rq *rq = task_rq(p); | ||
1816 | 1819 | ||
1817 | if (!task_current(rq, p)) { | 1820 | weight = cpumask_weight(new_mask); |
1818 | /* | ||
1819 | * Make sure we dequeue this task from the pushable list | ||
1820 | * before going further. It will either remain off of | ||
1821 | * the list because we are no longer pushable, or it | ||
1822 | * will be requeued. | ||
1823 | */ | ||
1824 | if (p->rt.nr_cpus_allowed > 1) | ||
1825 | dequeue_pushable_task(rq, p); | ||
1826 | 1821 | ||
1827 | /* | 1822 | /* |
1828 | * Requeue if our weight is changing and still > 1 | 1823 | * Only update if the process changes its state from whether it |
1829 | */ | 1824 | * can migrate or not. |
1830 | if (weight > 1) | 1825 | */ |
1831 | enqueue_pushable_task(rq, p); | 1826 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) |
1832 | 1827 | return; | |
1833 | } | ||
1834 | 1828 | ||
1835 | if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { | 1829 | rq = task_rq(p); |
1836 | rq->rt.rt_nr_migratory++; | ||
1837 | } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { | ||
1838 | BUG_ON(!rq->rt.rt_nr_migratory); | ||
1839 | rq->rt.rt_nr_migratory--; | ||
1840 | } | ||
1841 | 1830 | ||
1842 | update_rt_migration(&rq->rt); | 1831 | /* |
1832 | * The process used to be able to migrate OR it can now migrate | ||
1833 | */ | ||
1834 | if (weight <= 1) { | ||
1835 | if (!task_current(rq, p)) | ||
1836 | dequeue_pushable_task(rq, p); | ||
1837 | BUG_ON(!rq->rt.rt_nr_migratory); | ||
1838 | rq->rt.rt_nr_migratory--; | ||
1839 | } else { | ||
1840 | if (!task_current(rq, p)) | ||
1841 | enqueue_pushable_task(rq, p); | ||
1842 | rq->rt.rt_nr_migratory++; | ||
1843 | } | 1843 | } |
1844 | |||
1845 | update_rt_migration(&rq->rt); | ||
1844 | } | 1846 | } |
1845 | 1847 | ||
1846 | /* Assumes rq->lock is held */ | 1848 | /* Assumes rq->lock is held */ |
@@ -1983,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p) | |||
1983 | 1985 | ||
1984 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | 1986 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) |
1985 | { | 1987 | { |
1988 | struct sched_rt_entity *rt_se = &p->rt; | ||
1989 | |||
1986 | update_curr_rt(rq); | 1990 | update_curr_rt(rq); |
1987 | 1991 | ||
1988 | watchdog(rq, p); | 1992 | watchdog(rq, p); |
@@ -2000,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | |||
2000 | p->rt.time_slice = RR_TIMESLICE; | 2004 | p->rt.time_slice = RR_TIMESLICE; |
2001 | 2005 | ||
2002 | /* | 2006 | /* |
2003 | * Requeue to the end of queue if we are not the only element | 2007 | * Requeue to the end of queue if we (and all of our ancestors) are the |
2004 | * on the queue: | 2008 | * only element on the queue |
2005 | */ | 2009 | */ |
2006 | if (p->rt.run_list.prev != p->rt.run_list.next) { | 2010 | for_each_sched_rt_entity(rt_se) { |
2007 | requeue_task_rt(rq, p, 0); | 2011 | if (rt_se->run_list.prev != rt_se->run_list.next) { |
2008 | set_tsk_need_resched(p); | 2012 | requeue_task_rt(rq, p, 0); |
2013 | set_tsk_need_resched(p); | ||
2014 | return; | ||
2015 | } | ||
2009 | } | 2016 | } |
2010 | } | 2017 | } |
2011 | 2018 | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fb3acba4d52e..55844f24435a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -201,7 +201,7 @@ struct cfs_bandwidth { }; | |||
201 | /* CFS-related fields in a runqueue */ | 201 | /* CFS-related fields in a runqueue */ |
202 | struct cfs_rq { | 202 | struct cfs_rq { |
203 | struct load_weight load; | 203 | struct load_weight load; |
204 | unsigned long nr_running, h_nr_running; | 204 | unsigned int nr_running, h_nr_running; |
205 | 205 | ||
206 | u64 exec_clock; | 206 | u64 exec_clock; |
207 | u64 min_vruntime; | 207 | u64 min_vruntime; |
@@ -279,7 +279,7 @@ static inline int rt_bandwidth_enabled(void) | |||
279 | /* Real-Time classes' related field in a runqueue: */ | 279 | /* Real-Time classes' related field in a runqueue: */ |
280 | struct rt_rq { | 280 | struct rt_rq { |
281 | struct rt_prio_array active; | 281 | struct rt_prio_array active; |
282 | unsigned long rt_nr_running; | 282 | unsigned int rt_nr_running; |
283 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | 283 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED |
284 | struct { | 284 | struct { |
285 | int curr; /* highest queued rt task prio */ | 285 | int curr; /* highest queued rt task prio */ |
@@ -353,7 +353,7 @@ struct rq { | |||
353 | * nr_running and cpu_load should be in the same cacheline because | 353 | * nr_running and cpu_load should be in the same cacheline because |
354 | * remote CPUs use both these fields when doing load calculation. | 354 | * remote CPUs use both these fields when doing load calculation. |
355 | */ | 355 | */ |
356 | unsigned long nr_running; | 356 | unsigned int nr_running; |
357 | #define CPU_LOAD_IDX_MAX 5 | 357 | #define CPU_LOAD_IDX_MAX 5 |
358 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 358 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
359 | unsigned long last_load_update_tick; | 359 | unsigned long last_load_update_tick; |
@@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | |||
526 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | 526 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); |
527 | DECLARE_PER_CPU(int, sd_llc_id); | 527 | DECLARE_PER_CPU(int, sd_llc_id); |
528 | 528 | ||
529 | extern int group_balance_cpu(struct sched_group *sg); | ||
530 | |||
529 | #endif /* CONFIG_SMP */ | 531 | #endif /* CONFIG_SMP */ |
530 | 532 | ||
531 | #include "stats.h" | 533 | #include "stats.h" |
@@ -876,7 +878,7 @@ extern void resched_cpu(int cpu); | |||
876 | extern struct rt_bandwidth def_rt_bandwidth; | 878 | extern struct rt_bandwidth def_rt_bandwidth; |
877 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); | 879 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); |
878 | 880 | ||
879 | extern void update_cpu_load(struct rq *this_rq); | 881 | extern void update_idle_cpu_load(struct rq *this_rq); |
880 | 882 | ||
881 | #ifdef CONFIG_CGROUP_CPUACCT | 883 | #ifdef CONFIG_CGROUP_CPUACCT |
882 | #include <linux/cgroup.h> | 884 | #include <linux/cgroup.h> |
@@ -940,8 +942,6 @@ static inline u64 sched_avg_period(void) | |||
940 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | 942 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; |
941 | } | 943 | } |
942 | 944 | ||
943 | void calc_load_account_idle(struct rq *this_rq); | ||
944 | |||
945 | #ifdef CONFIG_SCHED_HRTICK | 945 | #ifdef CONFIG_SCHED_HRTICK |
946 | 946 | ||
947 | /* | 947 | /* |
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e8d76c5895ea..ee376beedaf9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
@@ -3,16 +3,357 @@ | |||
3 | * | 3 | * |
4 | * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> | 4 | * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> |
5 | * | 5 | * |
6 | * This defines a simple but solid secure-computing mode. | 6 | * Copyright (C) 2012 Google, Inc. |
7 | * Will Drewry <wad@chromium.org> | ||
8 | * | ||
9 | * This defines a simple but solid secure-computing facility. | ||
10 | * | ||
11 | * Mode 1 uses a fixed list of allowed system calls. | ||
12 | * Mode 2 allows user-defined system call filters in the form | ||
13 | * of Berkeley Packet Filters/Linux Socket Filters. | ||
7 | */ | 14 | */ |
8 | 15 | ||
16 | #include <linux/atomic.h> | ||
9 | #include <linux/audit.h> | 17 | #include <linux/audit.h> |
10 | #include <linux/seccomp.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/compat.h> | 18 | #include <linux/compat.h> |
19 | #include <linux/sched.h> | ||
20 | #include <linux/seccomp.h> | ||
13 | 21 | ||
14 | /* #define SECCOMP_DEBUG 1 */ | 22 | /* #define SECCOMP_DEBUG 1 */ |
15 | #define NR_SECCOMP_MODES 1 | 23 | |
24 | #ifdef CONFIG_SECCOMP_FILTER | ||
25 | #include <asm/syscall.h> | ||
26 | #include <linux/filter.h> | ||
27 | #include <linux/ptrace.h> | ||
28 | #include <linux/security.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <linux/tracehook.h> | ||
31 | #include <linux/uaccess.h> | ||
32 | |||
33 | /** | ||
34 | * struct seccomp_filter - container for seccomp BPF programs | ||
35 | * | ||
36 | * @usage: reference count to manage the object lifetime. | ||
37 | * get/put helpers should be used when accessing an instance | ||
38 | * outside of a lifetime-guarded section. In general, this | ||
39 | * is only needed for handling filters shared across tasks. | ||
40 | * @prev: points to a previously installed, or inherited, filter | ||
41 | * @len: the number of instructions in the program | ||
42 | * @insns: the BPF program instructions to evaluate | ||
43 | * | ||
44 | * seccomp_filter objects are organized in a tree linked via the @prev | ||
45 | * pointer. For any task, it appears to be a singly-linked list starting | ||
46 | * with current->seccomp.filter, the most recently attached or inherited filter. | ||
47 | * However, multiple filters may share a @prev node, by way of fork(), which | ||
48 | * results in a unidirectional tree existing in memory. This is similar to | ||
49 | * how namespaces work. | ||
50 | * | ||
51 | * seccomp_filter objects should never be modified after being attached | ||
52 | * to a task_struct (other than @usage). | ||
53 | */ | ||
54 | struct seccomp_filter { | ||
55 | atomic_t usage; | ||
56 | struct seccomp_filter *prev; | ||
57 | unsigned short len; /* Instruction count */ | ||
58 | struct sock_filter insns[]; | ||
59 | }; | ||
60 | |||
61 | /* Limit any path through the tree to 256KB worth of instructions. */ | ||
62 | #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) | ||
63 | |||
64 | /** | ||
65 | * get_u32 - returns a u32 offset into data | ||
66 | * @data: an unsigned 64-bit value | ||
67 | * @index: 0 or 1 to return the first or second 32-bits | ||
68 | * | ||
69 | * This inline exists to hide the length of unsigned long. If a 32-bit | ||
70 | * unsigned long is passed in, it will be extended and the top 32-bits will be | ||
71 | * 0. If it is a 64-bit unsigned long, then whatever data is resident will be | ||
72 | * properly returned. | ||
73 | * | ||
74 | * Endianness is explicitly ignored and left for BPF program authors to manage | ||
75 | * as per the specific architecture. | ||
76 | */ | ||
77 | static inline u32 get_u32(u64 data, int index) | ||
78 | { | ||
79 | return ((u32 *)&data)[index]; | ||
80 | } | ||
81 | |||
82 | /* Helper for bpf_load below. */ | ||
83 | #define BPF_DATA(_name) offsetof(struct seccomp_data, _name) | ||
84 | /** | ||
85 | * bpf_load: checks and returns a pointer to the requested offset | ||
86 | * @off: offset into struct seccomp_data to load from | ||
87 | * | ||
88 | * Returns the requested 32-bits of data. | ||
89 | * seccomp_check_filter() should assure that @off is 32-bit aligned | ||
90 | * and not out of bounds. Failure to do so is a BUG. | ||
91 | */ | ||
92 | u32 seccomp_bpf_load(int off) | ||
93 | { | ||
94 | struct pt_regs *regs = task_pt_regs(current); | ||
95 | if (off == BPF_DATA(nr)) | ||
96 | return syscall_get_nr(current, regs); | ||
97 | if (off == BPF_DATA(arch)) | ||
98 | return syscall_get_arch(current, regs); | ||
99 | if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { | ||
100 | unsigned long value; | ||
101 | int arg = (off - BPF_DATA(args[0])) / sizeof(u64); | ||
102 | int index = !!(off % sizeof(u64)); | ||
103 | syscall_get_arguments(current, regs, arg, 1, &value); | ||
104 | return get_u32(value, index); | ||
105 | } | ||
106 | if (off == BPF_DATA(instruction_pointer)) | ||
107 | return get_u32(KSTK_EIP(current), 0); | ||
108 | if (off == BPF_DATA(instruction_pointer) + sizeof(u32)) | ||
109 | return get_u32(KSTK_EIP(current), 1); | ||
110 | /* seccomp_check_filter should make this impossible. */ | ||
111 | BUG(); | ||
112 | } | ||
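
For reference, the offsets seccomp_bpf_load() accepts are exactly the 32-bit slots of struct seccomp_data, so a 64-bit syscall argument is read as two separate BPF loads. A hedged illustration (on a little-endian architecture the first load is the low half; as the get_u32() comment notes, endianness is left to the filter author):

/* two filter instructions fetching both halves of syscall argument 1 */
static const struct sock_filter arg1_loads[] = {
	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[1])),
	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[1]) + 4),
};
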
113 | |||
114 | /** | ||
115 | * seccomp_check_filter - verify seccomp filter code | ||
116 | * @filter: filter to verify | ||
117 | * @flen: length of filter | ||
118 | * | ||
119 | * Takes a previously checked filter (by sk_chk_filter) and | ||
120 | * redirects all filter code that loads struct sk_buff data | ||
121 | * and related data through seccomp_bpf_load. It also | ||
122 | * enforces length and alignment checking of those loads. | ||
123 | * | ||
124 | * Returns 0 if the rule set is legal or -EINVAL if not. | ||
125 | */ | ||
126 | static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) | ||
127 | { | ||
128 | int pc; | ||
129 | for (pc = 0; pc < flen; pc++) { | ||
130 | struct sock_filter *ftest = &filter[pc]; | ||
131 | u16 code = ftest->code; | ||
132 | u32 k = ftest->k; | ||
133 | |||
134 | switch (code) { | ||
135 | case BPF_S_LD_W_ABS: | ||
136 | ftest->code = BPF_S_ANC_SECCOMP_LD_W; | ||
137 | /* 32-bit aligned and not out of bounds. */ | ||
138 | if (k >= sizeof(struct seccomp_data) || k & 3) | ||
139 | return -EINVAL; | ||
140 | continue; | ||
141 | case BPF_S_LD_W_LEN: | ||
142 | ftest->code = BPF_S_LD_IMM; | ||
143 | ftest->k = sizeof(struct seccomp_data); | ||
144 | continue; | ||
145 | case BPF_S_LDX_W_LEN: | ||
146 | ftest->code = BPF_S_LDX_IMM; | ||
147 | ftest->k = sizeof(struct seccomp_data); | ||
148 | continue; | ||
149 | /* Explicitly include allowed calls. */ | ||
150 | case BPF_S_RET_K: | ||
151 | case BPF_S_RET_A: | ||
152 | case BPF_S_ALU_ADD_K: | ||
153 | case BPF_S_ALU_ADD_X: | ||
154 | case BPF_S_ALU_SUB_K: | ||
155 | case BPF_S_ALU_SUB_X: | ||
156 | case BPF_S_ALU_MUL_K: | ||
157 | case BPF_S_ALU_MUL_X: | ||
158 | case BPF_S_ALU_DIV_X: | ||
159 | case BPF_S_ALU_AND_K: | ||
160 | case BPF_S_ALU_AND_X: | ||
161 | case BPF_S_ALU_OR_K: | ||
162 | case BPF_S_ALU_OR_X: | ||
163 | case BPF_S_ALU_LSH_K: | ||
164 | case BPF_S_ALU_LSH_X: | ||
165 | case BPF_S_ALU_RSH_K: | ||
166 | case BPF_S_ALU_RSH_X: | ||
167 | case BPF_S_ALU_NEG: | ||
168 | case BPF_S_LD_IMM: | ||
169 | case BPF_S_LDX_IMM: | ||
170 | case BPF_S_MISC_TAX: | ||
171 | case BPF_S_MISC_TXA: | ||
172 | case BPF_S_ALU_DIV_K: | ||
173 | case BPF_S_LD_MEM: | ||
174 | case BPF_S_LDX_MEM: | ||
175 | case BPF_S_ST: | ||
176 | case BPF_S_STX: | ||
177 | case BPF_S_JMP_JA: | ||
178 | case BPF_S_JMP_JEQ_K: | ||
179 | case BPF_S_JMP_JEQ_X: | ||
180 | case BPF_S_JMP_JGE_K: | ||
181 | case BPF_S_JMP_JGE_X: | ||
182 | case BPF_S_JMP_JGT_K: | ||
183 | case BPF_S_JMP_JGT_X: | ||
184 | case BPF_S_JMP_JSET_K: | ||
185 | case BPF_S_JMP_JSET_X: | ||
186 | continue; | ||
187 | default: | ||
188 | return -EINVAL; | ||
189 | } | ||
190 | } | ||
191 | return 0; | ||
192 | } | ||
193 | |||
194 | /** | ||
195 | * seccomp_run_filters - evaluates all seccomp filters against @syscall | ||
196 | * @syscall: number of the current system call | ||
197 | * | ||
198 | * Returns valid seccomp BPF response codes. | ||
199 | */ | ||
200 | static u32 seccomp_run_filters(int syscall) | ||
201 | { | ||
202 | struct seccomp_filter *f; | ||
203 | u32 ret = SECCOMP_RET_ALLOW; | ||
204 | |||
205 | /* Ensure unexpected behavior doesn't result in failing open. */ | ||
206 | if (WARN_ON(current->seccomp.filter == NULL)) | ||
207 | return SECCOMP_RET_KILL; | ||
208 | |||
209 | /* | ||
210 | * All filters in the list are evaluated and the lowest BPF return | ||
211 | * value always takes priority (ignoring the DATA). | ||
212 | */ | ||
213 | for (f = current->seccomp.filter; f; f = f->prev) { | ||
214 | u32 cur_ret = sk_run_filter(NULL, f->insns); | ||
215 | if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) | ||
216 | ret = cur_ret; | ||
217 | } | ||
218 | return ret; | ||
219 | } | ||
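
The "lowest return value takes priority" rule relies on the numeric layout of the return codes in the new uapi header: SECCOMP_RET_KILL (0x00000000) < SECCOMP_RET_TRAP (0x00030000) < SECCOMP_RET_ERRNO (0x00050000) < SECCOMP_RET_TRACE (0x7ff00000) < SECCOMP_RET_ALLOW (0x7fff0000), with the low 16 bits reserved for SECCOMP_RET_DATA. A sketch of how two verdicts compose, assuming those values:

/*
 * Illustration only: composing two filter results the way
 * seccomp_run_filters() does. ALLOW composed with (ERRNO | EPERM)
 * yields the ERRNO verdict, and the winner's DATA bits travel with it.
 */
static u32 seccomp_compose_two(u32 a, u32 b)
{
	return ((a & SECCOMP_RET_ACTION) < (b & SECCOMP_RET_ACTION)) ? a : b;
}
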
220 | |||
221 | /** | ||
222 | * seccomp_attach_filter: Attaches a seccomp filter to current. | ||
223 | * @fprog: BPF program to install | ||
224 | * | ||
225 | * Returns 0 on success or an errno on failure. | ||
226 | */ | ||
227 | static long seccomp_attach_filter(struct sock_fprog *fprog) | ||
228 | { | ||
229 | struct seccomp_filter *filter; | ||
230 | unsigned long fp_size = fprog->len * sizeof(struct sock_filter); | ||
231 | unsigned long total_insns = fprog->len; | ||
232 | long ret; | ||
233 | |||
234 | if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) | ||
235 | return -EINVAL; | ||
236 | |||
237 | for (filter = current->seccomp.filter; filter; filter = filter->prev) | ||
238 | total_insns += filter->len + 4; /* include a 4 instr penalty */ | ||
239 | if (total_insns > MAX_INSNS_PER_PATH) | ||
240 | return -ENOMEM; | ||
241 | |||
242 | /* | ||
243 | * Installing a seccomp filter requires that the task have | ||
244 | * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. | ||
245 | * This avoids scenarios where unprivileged tasks can affect the | ||
246 | * behavior of privileged children. | ||
247 | */ | ||
248 | if (!current->no_new_privs && | ||
249 | security_capable_noaudit(current_cred(), current_user_ns(), | ||
250 | CAP_SYS_ADMIN) != 0) | ||
251 | return -EACCES; | ||
252 | |||
253 | /* Allocate a new seccomp_filter */ | ||
254 | filter = kzalloc(sizeof(struct seccomp_filter) + fp_size, | ||
255 | GFP_KERNEL|__GFP_NOWARN); | ||
256 | if (!filter) | ||
257 | return -ENOMEM; | ||
258 | atomic_set(&filter->usage, 1); | ||
259 | filter->len = fprog->len; | ||
260 | |||
261 | /* Copy the instructions from fprog. */ | ||
262 | ret = -EFAULT; | ||
263 | if (copy_from_user(filter->insns, fprog->filter, fp_size)) | ||
264 | goto fail; | ||
265 | |||
266 | /* Check and rewrite the fprog via the skb checker */ | ||
267 | ret = sk_chk_filter(filter->insns, filter->len); | ||
268 | if (ret) | ||
269 | goto fail; | ||
270 | |||
271 | /* Check and rewrite the fprog for seccomp use */ | ||
272 | ret = seccomp_check_filter(filter->insns, filter->len); | ||
273 | if (ret) | ||
274 | goto fail; | ||
275 | |||
276 | /* | ||
277 | * If there is an existing filter, make it the prev and don't drop its | ||
278 | * task reference. | ||
279 | */ | ||
280 | filter->prev = current->seccomp.filter; | ||
281 | current->seccomp.filter = filter; | ||
282 | return 0; | ||
283 | fail: | ||
284 | kfree(filter); | ||
285 | return ret; | ||
286 | } | ||
287 | |||
288 | /** | ||
289 | * seccomp_attach_user_filter - attaches a user-supplied sock_fprog | ||
290 | * @user_filter: pointer to the user data containing a sock_fprog. | ||
291 | * | ||
292 | * Returns 0 on success and non-zero otherwise. | ||
293 | */ | ||
294 | long seccomp_attach_user_filter(char __user *user_filter) | ||
295 | { | ||
296 | struct sock_fprog fprog; | ||
297 | long ret = -EFAULT; | ||
298 | |||
299 | #ifdef CONFIG_COMPAT | ||
300 | if (is_compat_task()) { | ||
301 | struct compat_sock_fprog fprog32; | ||
302 | if (copy_from_user(&fprog32, user_filter, sizeof(fprog32))) | ||
303 | goto out; | ||
304 | fprog.len = fprog32.len; | ||
305 | fprog.filter = compat_ptr(fprog32.filter); | ||
306 | } else /* falls through to the if below. */ | ||
307 | #endif | ||
308 | if (copy_from_user(&fprog, user_filter, sizeof(fprog))) | ||
309 | goto out; | ||
310 | ret = seccomp_attach_filter(&fprog); | ||
311 | out: | ||
312 | return ret; | ||
313 | } | ||
314 | |||
315 | /* get_seccomp_filter - increments the reference count of the filter on @tsk */ | ||
316 | void get_seccomp_filter(struct task_struct *tsk) | ||
317 | { | ||
318 | struct seccomp_filter *orig = tsk->seccomp.filter; | ||
319 | if (!orig) | ||
320 | return; | ||
321 | /* Reference count is bounded by the number of total processes. */ | ||
322 | atomic_inc(&orig->usage); | ||
323 | } | ||
324 | |||
325 | /* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ | ||
326 | void put_seccomp_filter(struct task_struct *tsk) | ||
327 | { | ||
328 | struct seccomp_filter *orig = tsk->seccomp.filter; | ||
329 | /* Clean up single-reference branches iteratively. */ | ||
330 | while (orig && atomic_dec_and_test(&orig->usage)) { | ||
331 | struct seccomp_filter *freeme = orig; | ||
332 | orig = orig->prev; | ||
333 | kfree(freeme); | ||
334 | } | ||
335 | } | ||
336 | |||
337 | /** | ||
338 | * seccomp_send_sigsys - signals the task to allow in-process syscall emulation | ||
339 | * @syscall: syscall number to send to userland | ||
340 | * @reason: filter-supplied reason code to send to userland (via si_errno) | ||
341 | * | ||
342 | * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. | ||
343 | */ | ||
344 | static void seccomp_send_sigsys(int syscall, int reason) | ||
345 | { | ||
346 | struct siginfo info; | ||
347 | memset(&info, 0, sizeof(info)); | ||
348 | info.si_signo = SIGSYS; | ||
349 | info.si_code = SYS_SECCOMP; | ||
350 | info.si_call_addr = (void __user *)KSTK_EIP(current); | ||
351 | info.si_errno = reason; | ||
352 | info.si_arch = syscall_get_arch(current, task_pt_regs(current)); | ||
353 | info.si_syscall = syscall; | ||
354 | force_sig_info(SIGSYS, &info, current); | ||
355 | } | ||
356 | #endif /* CONFIG_SECCOMP_FILTER */ | ||
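
On the receiving side, a SECCOMP_RET_TRAP verdict arrives as a SIGSYS whose siginfo carries the fields filled in by seccomp_send_sigsys() above (and copied out by the __SI_SYS case added to copy_siginfo_to_user() later in this series). A userspace sketch, assuming a libc/uapi that already exposes SYS_SECCOMP and the new si_syscall field:

#include <signal.h>

static volatile sig_atomic_t last_denied_syscall = -1;

static void sigsys_handler(int sig, siginfo_t *info, void *ucontext)
{
	if (info->si_code == SYS_SECCOMP)		/* set by seccomp_send_sigsys() */
		last_denied_syscall = info->si_syscall;	/* si_errno holds SECCOMP_RET_DATA */
}

static int install_sigsys_handler(void)
{
	struct sigaction act = {
		.sa_sigaction	= sigsys_handler,
		.sa_flags	= SA_SIGINFO,
	};
	return sigaction(SIGSYS, &act, NULL);
}
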
16 | 357 | ||
17 | /* | 358 | /* |
18 | * Secure computing mode 1 allows only read/write/exit/sigreturn. | 359 | * Secure computing mode 1 allows only read/write/exit/sigreturn. |
@@ -31,13 +372,15 @@ static int mode1_syscalls_32[] = { | |||
31 | }; | 372 | }; |
32 | #endif | 373 | #endif |
33 | 374 | ||
34 | void __secure_computing(int this_syscall) | 375 | int __secure_computing(int this_syscall) |
35 | { | 376 | { |
36 | int mode = current->seccomp.mode; | 377 | int mode = current->seccomp.mode; |
37 | int * syscall; | 378 | int exit_sig = 0; |
379 | int *syscall; | ||
380 | u32 ret; | ||
38 | 381 | ||
39 | switch (mode) { | 382 | switch (mode) { |
40 | case 1: | 383 | case SECCOMP_MODE_STRICT: |
41 | syscall = mode1_syscalls; | 384 | syscall = mode1_syscalls; |
42 | #ifdef CONFIG_COMPAT | 385 | #ifdef CONFIG_COMPAT |
43 | if (is_compat_task()) | 386 | if (is_compat_task()) |
@@ -45,9 +388,54 @@ void __secure_computing(int this_syscall) | |||
45 | #endif | 388 | #endif |
46 | do { | 389 | do { |
47 | if (*syscall == this_syscall) | 390 | if (*syscall == this_syscall) |
48 | return; | 391 | return 0; |
49 | } while (*++syscall); | 392 | } while (*++syscall); |
393 | exit_sig = SIGKILL; | ||
394 | ret = SECCOMP_RET_KILL; | ||
395 | break; | ||
396 | #ifdef CONFIG_SECCOMP_FILTER | ||
397 | case SECCOMP_MODE_FILTER: { | ||
398 | int data; | ||
399 | ret = seccomp_run_filters(this_syscall); | ||
400 | data = ret & SECCOMP_RET_DATA; | ||
401 | ret &= SECCOMP_RET_ACTION; | ||
402 | switch (ret) { | ||
403 | case SECCOMP_RET_ERRNO: | ||
404 | /* Set the low-order 16 bits as an errno. */ | ||
405 | syscall_set_return_value(current, task_pt_regs(current), | ||
406 | -data, 0); | ||
407 | goto skip; | ||
408 | case SECCOMP_RET_TRAP: | ||
409 | /* Show the handler the original registers. */ | ||
410 | syscall_rollback(current, task_pt_regs(current)); | ||
411 | /* Let the filter pass back 16 bits of data. */ | ||
412 | seccomp_send_sigsys(this_syscall, data); | ||
413 | goto skip; | ||
414 | case SECCOMP_RET_TRACE: | ||
415 | /* Skip these calls if there is no tracer. */ | ||
416 | if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) | ||
417 | goto skip; | ||
418 | /* Allow the BPF to provide the event message */ | ||
419 | ptrace_event(PTRACE_EVENT_SECCOMP, data); | ||
420 | /* | ||
421 | * The delivery of a fatal signal during event | ||
422 | * notification may silently skip tracer notification. | ||
423 | * Terminating the task now avoids executing a system | ||
424 | * call that may not be intended. | ||
425 | */ | ||
426 | if (fatal_signal_pending(current)) | ||
427 | break; | ||
428 | return 0; | ||
429 | case SECCOMP_RET_ALLOW: | ||
430 | return 0; | ||
431 | case SECCOMP_RET_KILL: | ||
432 | default: | ||
433 | break; | ||
434 | } | ||
435 | exit_sig = SIGSYS; | ||
50 | break; | 436 | break; |
437 | } | ||
438 | #endif | ||
51 | default: | 439 | default: |
52 | BUG(); | 440 | BUG(); |
53 | } | 441 | } |
@@ -55,8 +443,13 @@ void __secure_computing(int this_syscall) | |||
55 | #ifdef SECCOMP_DEBUG | 443 | #ifdef SECCOMP_DEBUG |
56 | dump_stack(); | 444 | dump_stack(); |
57 | #endif | 445 | #endif |
58 | audit_seccomp(this_syscall); | 446 | audit_seccomp(this_syscall, exit_sig, ret); |
59 | do_exit(SIGKILL); | 447 | do_exit(exit_sig); |
448 | #ifdef CONFIG_SECCOMP_FILTER | ||
449 | skip: | ||
450 | audit_seccomp(this_syscall, exit_sig, ret); | ||
451 | #endif | ||
452 | return -1; | ||
60 | } | 453 | } |
61 | 454 | ||
62 | long prctl_get_seccomp(void) | 455 | long prctl_get_seccomp(void) |
@@ -64,25 +457,48 @@ long prctl_get_seccomp(void) | |||
64 | return current->seccomp.mode; | 457 | return current->seccomp.mode; |
65 | } | 458 | } |
66 | 459 | ||
67 | long prctl_set_seccomp(unsigned long seccomp_mode) | 460 | /** |
461 | * prctl_set_seccomp: configures current->seccomp.mode | ||
462 | * @seccomp_mode: requested mode to use | ||
463 | * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER | ||
464 | * | ||
465 | * This function may be called repeatedly with a @seccomp_mode of | ||
466 | * SECCOMP_MODE_FILTER to install additional filters. Every filter | ||
467 | * successfully installed will be evaluated (in reverse order) for each system | ||
468 | * call the task makes. | ||
469 | * | ||
470 | * Once current->seccomp.mode is non-zero, it may not be changed. | ||
471 | * | ||
472 | * Returns 0 on success or -EINVAL on failure. | ||
473 | */ | ||
474 | long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) | ||
68 | { | 475 | { |
69 | long ret; | 476 | long ret = -EINVAL; |
70 | 477 | ||
71 | /* can set it only once to be even more secure */ | 478 | if (current->seccomp.mode && |
72 | ret = -EPERM; | 479 | current->seccomp.mode != seccomp_mode) |
73 | if (unlikely(current->seccomp.mode)) | ||
74 | goto out; | 480 | goto out; |
75 | 481 | ||
76 | ret = -EINVAL; | 482 | switch (seccomp_mode) { |
77 | if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { | 483 | case SECCOMP_MODE_STRICT: |
78 | current->seccomp.mode = seccomp_mode; | 484 | ret = 0; |
79 | set_thread_flag(TIF_SECCOMP); | ||
80 | #ifdef TIF_NOTSC | 485 | #ifdef TIF_NOTSC |
81 | disable_TSC(); | 486 | disable_TSC(); |
82 | #endif | 487 | #endif |
83 | ret = 0; | 488 | break; |
489 | #ifdef CONFIG_SECCOMP_FILTER | ||
490 | case SECCOMP_MODE_FILTER: | ||
491 | ret = seccomp_attach_user_filter(filter); | ||
492 | if (ret) | ||
493 | goto out; | ||
494 | break; | ||
495 | #endif | ||
496 | default: | ||
497 | goto out; | ||
84 | } | 498 | } |
85 | 499 | ||
86 | out: | 500 | current->seccomp.mode = seccomp_mode; |
501 | set_thread_flag(TIF_SECCOMP); | ||
502 | out: | ||
87 | return ret; | 503 | return ret; |
88 | } | 504 | } |
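
For completeness, a userspace sketch of driving the extended prctl_set_seccomp() above: install a one-rule filter that fails a chosen syscall with EPERM and allows everything else. Constants are assumed to come from the new <linux/seccomp.h> plus the existing <linux/filter.h>; PR_SET_NO_NEW_PRIVS is the companion prctl that satisfies the no_new_privs check in seccomp_attach_filter() for unprivileged callers. A production filter would also validate seccomp_data.arch before trusting the syscall number.

#include <stddef.h>
#include <sys/prctl.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

static int deny_one_syscall(int nr)
{
	struct sock_filter insns[] = {
		/* A = seccomp_data.nr */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr)),
		/* if (A == nr) return ERRNO|EPERM; else return ALLOW */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, nr, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | 1 /* EPERM */),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len	= sizeof(insns) / sizeof(insns[0]),
		.filter	= insns,
	};

	/* lets an unprivileged task pass the CAP_SYS_ADMIN/no_new_privs check */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return -1;
	return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
}
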
diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 60636a4e25c3..4567fc020fe3 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c | |||
@@ -118,7 +118,7 @@ EXPORT_SYMBOL(down_killable); | |||
118 | * down_trylock - try to acquire the semaphore, without waiting | 118 | * down_trylock - try to acquire the semaphore, without waiting |
119 | * @sem: the semaphore to be acquired | 119 | * @sem: the semaphore to be acquired |
120 | * | 120 | * |
121 | * Try to acquire the semaphore atomically. Returns 0 if the mutex has | 121 | * Try to acquire the semaphore atomically. Returns 0 if the semaphore has |
122 | * been acquired successfully or 1 if it cannot be acquired. | 122 | * been acquired successfully or 1 if it cannot be acquired. |
123 | * | 123 | * |
124 | * NOTE: This return value is inverted from both spin_trylock and | 124 | * NOTE: This return value is inverted from both spin_trylock and |
diff --git a/kernel/signal.c b/kernel/signal.c index 17afcaf582d0..677102789cf2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/pid_namespace.h> | 29 | #include <linux/pid_namespace.h> |
30 | #include <linux/nsproxy.h> | 30 | #include <linux/nsproxy.h> |
31 | #include <linux/user_namespace.h> | 31 | #include <linux/user_namespace.h> |
32 | #include <linux/uprobes.h> | ||
32 | #define CREATE_TRACE_POINTS | 33 | #define CREATE_TRACE_POINTS |
33 | #include <trace/events/signal.h> | 34 | #include <trace/events/signal.h> |
34 | 35 | ||
@@ -160,7 +161,7 @@ void recalc_sigpending(void) | |||
160 | 161 | ||
161 | #define SYNCHRONOUS_MASK \ | 162 | #define SYNCHRONOUS_MASK \ |
162 | (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ | 163 | (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ |
163 | sigmask(SIGTRAP) | sigmask(SIGFPE)) | 164 | sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS)) |
164 | 165 | ||
165 | int next_signal(struct sigpending *pending, sigset_t *mask) | 166 | int next_signal(struct sigpending *pending, sigset_t *mask) |
166 | { | 167 | { |
@@ -767,14 +768,13 @@ static int kill_ok_by_cred(struct task_struct *t) | |||
767 | const struct cred *cred = current_cred(); | 768 | const struct cred *cred = current_cred(); |
768 | const struct cred *tcred = __task_cred(t); | 769 | const struct cred *tcred = __task_cred(t); |
769 | 770 | ||
770 | if (cred->user->user_ns == tcred->user->user_ns && | 771 | if (uid_eq(cred->euid, tcred->suid) || |
771 | (cred->euid == tcred->suid || | 772 | uid_eq(cred->euid, tcred->uid) || |
772 | cred->euid == tcred->uid || | 773 | uid_eq(cred->uid, tcred->suid) || |
773 | cred->uid == tcred->suid || | 774 | uid_eq(cred->uid, tcred->uid)) |
774 | cred->uid == tcred->uid)) | ||
775 | return 1; | 775 | return 1; |
776 | 776 | ||
777 | if (ns_capable(tcred->user->user_ns, CAP_KILL)) | 777 | if (ns_capable(tcred->user_ns, CAP_KILL)) |
778 | return 1; | 778 | return 1; |
779 | 779 | ||
780 | return 0; | 780 | return 0; |
@@ -1020,15 +1020,6 @@ static inline int legacy_queue(struct sigpending *signals, int sig) | |||
1020 | return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); | 1020 | return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); |
1021 | } | 1021 | } |
1022 | 1022 | ||
1023 | /* | ||
1024 | * map the uid in struct cred into user namespace *ns | ||
1025 | */ | ||
1026 | static inline uid_t map_cred_ns(const struct cred *cred, | ||
1027 | struct user_namespace *ns) | ||
1028 | { | ||
1029 | return user_ns_map_uid(ns, cred, cred->uid); | ||
1030 | } | ||
1031 | |||
1032 | #ifdef CONFIG_USER_NS | 1023 | #ifdef CONFIG_USER_NS |
1033 | static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) | 1024 | static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) |
1034 | { | 1025 | { |
@@ -1038,8 +1029,10 @@ static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_str | |||
1038 | if (SI_FROMKERNEL(info)) | 1029 | if (SI_FROMKERNEL(info)) |
1039 | return; | 1030 | return; |
1040 | 1031 | ||
1041 | info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns), | 1032 | rcu_read_lock(); |
1042 | current_cred(), info->si_uid); | 1033 | info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns), |
1034 | make_kuid(current_user_ns(), info->si_uid)); | ||
1035 | rcu_read_unlock(); | ||
1043 | } | 1036 | } |
1044 | #else | 1037 | #else |
1045 | static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) | 1038 | static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) |
@@ -1106,7 +1099,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
1106 | q->info.si_code = SI_USER; | 1099 | q->info.si_code = SI_USER; |
1107 | q->info.si_pid = task_tgid_nr_ns(current, | 1100 | q->info.si_pid = task_tgid_nr_ns(current, |
1108 | task_active_pid_ns(t)); | 1101 | task_active_pid_ns(t)); |
1109 | q->info.si_uid = current_uid(); | 1102 | q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); |
1110 | break; | 1103 | break; |
1111 | case (unsigned long) SEND_SIG_PRIV: | 1104 | case (unsigned long) SEND_SIG_PRIV: |
1112 | q->info.si_signo = sig; | 1105 | q->info.si_signo = sig; |
@@ -1387,10 +1380,8 @@ static int kill_as_cred_perm(const struct cred *cred, | |||
1387 | struct task_struct *target) | 1380 | struct task_struct *target) |
1388 | { | 1381 | { |
1389 | const struct cred *pcred = __task_cred(target); | 1382 | const struct cred *pcred = __task_cred(target); |
1390 | if (cred->user_ns != pcred->user_ns) | 1383 | if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) && |
1391 | return 0; | 1384 | !uid_eq(cred->uid, pcred->suid) && !uid_eq(cred->uid, pcred->uid)) |
1392 | if (cred->euid != pcred->suid && cred->euid != pcred->uid && | ||
1393 | cred->uid != pcred->suid && cred->uid != pcred->uid) | ||
1394 | return 0; | 1385 | return 0; |
1395 | return 1; | 1386 | return 1; |
1396 | } | 1387 | } |
@@ -1665,21 +1656,20 @@ bool do_notify_parent(struct task_struct *tsk, int sig) | |||
1665 | info.si_signo = sig; | 1656 | info.si_signo = sig; |
1666 | info.si_errno = 0; | 1657 | info.si_errno = 0; |
1667 | /* | 1658 | /* |
1668 | * we are under tasklist_lock here so our parent is tied to | 1659 | * We are under tasklist_lock here so our parent is tied to |
1669 | * us and cannot exit and release its namespace. | 1660 | * us and cannot change. |
1670 | * | 1661 | * |
1671 | * the only it can is to switch its nsproxy with sys_unshare, | 1662 | * task_active_pid_ns will always return the same pid namespace |
1672 | * bu uncharing pid namespaces is not allowed, so we'll always | 1663 | * until a task passes through release_task. |
1673 | * see relevant namespace | ||
1674 | * | 1664 | * |
1675 | * write_lock() currently calls preempt_disable() which is the | 1665 | * write_lock() currently calls preempt_disable() which is the |
1676 | * same as rcu_read_lock(), but according to Oleg, this is not | 1666 | * same as rcu_read_lock(), but according to Oleg, this is not |
1677 | * correct to rely on this | 1667 | * correct to rely on this |
1678 | */ | 1668 | */ |
1679 | rcu_read_lock(); | 1669 | rcu_read_lock(); |
1680 | info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); | 1670 | info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent)); |
1681 | info.si_uid = map_cred_ns(__task_cred(tsk), | 1671 | info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns), |
1682 | task_cred_xxx(tsk->parent, user_ns)); | 1672 | task_uid(tsk)); |
1683 | rcu_read_unlock(); | 1673 | rcu_read_unlock(); |
1684 | 1674 | ||
1685 | info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); | 1675 | info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); |
@@ -1762,8 +1752,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, | |||
1762 | */ | 1752 | */ |
1763 | rcu_read_lock(); | 1753 | rcu_read_lock(); |
1764 | info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); | 1754 | info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); |
1765 | info.si_uid = map_cred_ns(__task_cred(tsk), | 1755 | info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); |
1766 | task_cred_xxx(parent, user_ns)); | ||
1767 | rcu_read_unlock(); | 1756 | rcu_read_unlock(); |
1768 | 1757 | ||
1769 | info.si_utime = cputime_to_clock_t(tsk->utime); | 1758 | info.si_utime = cputime_to_clock_t(tsk->utime); |
@@ -1973,7 +1962,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why) | |||
1973 | info.si_signo = signr; | 1962 | info.si_signo = signr; |
1974 | info.si_code = exit_code; | 1963 | info.si_code = exit_code; |
1975 | info.si_pid = task_pid_vnr(current); | 1964 | info.si_pid = task_pid_vnr(current); |
1976 | info.si_uid = current_uid(); | 1965 | info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); |
1977 | 1966 | ||
1978 | /* Let the debugger run. */ | 1967 | /* Let the debugger run. */ |
1979 | ptrace_stop(exit_code, why, 1, &info); | 1968 | ptrace_stop(exit_code, why, 1, &info); |
@@ -2181,8 +2170,8 @@ static int ptrace_signal(int signr, siginfo_t *info, | |||
2181 | info->si_code = SI_USER; | 2170 | info->si_code = SI_USER; |
2182 | rcu_read_lock(); | 2171 | rcu_read_lock(); |
2183 | info->si_pid = task_pid_vnr(current->parent); | 2172 | info->si_pid = task_pid_vnr(current->parent); |
2184 | info->si_uid = map_cred_ns(__task_cred(current->parent), | 2173 | info->si_uid = from_kuid_munged(current_user_ns(), |
2185 | current_user_ns()); | 2174 | task_uid(current->parent)); |
2186 | rcu_read_unlock(); | 2175 | rcu_read_unlock(); |
2187 | } | 2176 | } |
2188 | 2177 | ||
@@ -2202,6 +2191,9 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, | |||
2202 | struct signal_struct *signal = current->signal; | 2191 | struct signal_struct *signal = current->signal; |
2203 | int signr; | 2192 | int signr; |
2204 | 2193 | ||
2194 | if (unlikely(uprobe_deny_signal())) | ||
2195 | return 0; | ||
2196 | |||
2205 | relock: | 2197 | relock: |
2206 | /* | 2198 | /* |
2207 | * We'll jump back here after any time we were stopped in TASK_STOPPED. | 2199 | * We'll jump back here after any time we were stopped in TASK_STOPPED. |
@@ -2376,24 +2368,34 @@ relock: | |||
2376 | } | 2368 | } |
2377 | 2369 | ||
2378 | /** | 2370 | /** |
2379 | * block_sigmask - add @ka's signal mask to current->blocked | 2371 | * signal_delivered - |
2380 | * @ka: action for @signr | 2372 | * @sig: number of signal being delivered |
2381 | * @signr: signal that has been successfully delivered | 2373 | * @info: siginfo_t of signal being delivered |
2374 | * @ka: sigaction setting that chose the handler | ||
2375 | * @regs: user register state | ||
2376 | * @stepping: nonzero if debugger single-step or block-step in use | ||
2382 | * | 2377 | * |
2383 | * This function should be called when a signal has succesfully been | 2378 | * This function should be called when a signal has succesfully been |
2384 | * delivered. It adds the mask of signals for @ka to current->blocked | 2379 | * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask |
2385 | * so that they are blocked during the execution of the signal | 2380 | * is always blocked, and the signal itself is blocked unless %SA_NODEFER |
2386 | * handler. In addition, @signr will be blocked unless %SA_NODEFER is | 2381 | * is set in @ka->sa.sa_flags. Tracing is notified. |
2387 | * set in @ka->sa.sa_flags. | ||
2388 | */ | 2382 | */ |
2389 | void block_sigmask(struct k_sigaction *ka, int signr) | 2383 | void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, |
2384 | struct pt_regs *regs, int stepping) | ||
2390 | { | 2385 | { |
2391 | sigset_t blocked; | 2386 | sigset_t blocked; |
2392 | 2387 | ||
2388 | /* A signal was successfully delivered, and the | ||
2389 | saved sigmask was stored on the signal frame, | ||
2390 | and will be restored by sigreturn. So we can | ||
2391 | simply clear the restore sigmask flag. */ | ||
2392 | clear_restore_sigmask(); | ||
2393 | |||
2393 | sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); | 2394 | sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); |
2394 | if (!(ka->sa.sa_flags & SA_NODEFER)) | 2395 | if (!(ka->sa.sa_flags & SA_NODEFER)) |
2395 | sigaddset(&blocked, signr); | 2396 | sigaddset(&blocked, sig); |
2396 | set_current_blocked(&blocked); | 2397 | set_current_blocked(&blocked); |
2398 | tracehook_signal_handler(sig, info, ka, regs, stepping); | ||
2397 | } | 2399 | } |
2398 | 2400 | ||
2399 | /* | 2401 | /* |
@@ -2526,7 +2528,16 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) | |||
2526 | * It is wrong to change ->blocked directly, this helper should be used | 2528 | * It is wrong to change ->blocked directly, this helper should be used |
2527 | * to ensure the process can't miss a shared signal we are going to block. | 2529 | * to ensure the process can't miss a shared signal we are going to block. |
2528 | */ | 2530 | */ |
2529 | void set_current_blocked(const sigset_t *newset) | 2531 | void set_current_blocked(sigset_t *newset) |
2532 | { | ||
2533 | struct task_struct *tsk = current; | ||
2534 | sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP)); | ||
2535 | spin_lock_irq(&tsk->sighand->siglock); | ||
2536 | __set_task_blocked(tsk, newset); | ||
2537 | spin_unlock_irq(&tsk->sighand->siglock); | ||
2538 | } | ||
2539 | |||
2540 | void __set_current_blocked(const sigset_t *newset) | ||
2530 | { | 2541 | { |
2531 | struct task_struct *tsk = current; | 2542 | struct task_struct *tsk = current; |
2532 | 2543 | ||
@@ -2566,7 +2577,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) | |||
2566 | return -EINVAL; | 2577 | return -EINVAL; |
2567 | } | 2578 | } |
2568 | 2579 | ||
2569 | set_current_blocked(&newset); | 2580 | __set_current_blocked(&newset); |
2570 | return 0; | 2581 | return 0; |
2571 | } | 2582 | } |
2572 | 2583 | ||
@@ -2706,6 +2717,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) | |||
2706 | err |= __put_user(from->si_uid, &to->si_uid); | 2717 | err |= __put_user(from->si_uid, &to->si_uid); |
2707 | err |= __put_user(from->si_ptr, &to->si_ptr); | 2718 | err |= __put_user(from->si_ptr, &to->si_ptr); |
2708 | break; | 2719 | break; |
2720 | #ifdef __ARCH_SIGSYS | ||
2721 | case __SI_SYS: | ||
2722 | err |= __put_user(from->si_call_addr, &to->si_call_addr); | ||
2723 | err |= __put_user(from->si_syscall, &to->si_syscall); | ||
2724 | err |= __put_user(from->si_arch, &to->si_arch); | ||
2725 | break; | ||
2726 | #endif | ||
2709 | default: /* this is just in case for now ... */ | 2727 | default: /* this is just in case for now ... */ |
2710 | err |= __put_user(from->si_pid, &to->si_pid); | 2728 | err |= __put_user(from->si_pid, &to->si_pid); |
2711 | err |= __put_user(from->si_uid, &to->si_uid); | 2729 | err |= __put_user(from->si_uid, &to->si_uid); |
@@ -2828,7 +2846,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) | |||
2828 | info.si_errno = 0; | 2846 | info.si_errno = 0; |
2829 | info.si_code = SI_USER; | 2847 | info.si_code = SI_USER; |
2830 | info.si_pid = task_tgid_vnr(current); | 2848 | info.si_pid = task_tgid_vnr(current); |
2831 | info.si_uid = current_uid(); | 2849 | info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); |
2832 | 2850 | ||
2833 | return kill_something_info(sig, &info, pid); | 2851 | return kill_something_info(sig, &info, pid); |
2834 | } | 2852 | } |
@@ -2871,7 +2889,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) | |||
2871 | info.si_errno = 0; | 2889 | info.si_errno = 0; |
2872 | info.si_code = SI_TKILL; | 2890 | info.si_code = SI_TKILL; |
2873 | info.si_pid = task_tgid_vnr(current); | 2891 | info.si_pid = task_tgid_vnr(current); |
2874 | info.si_uid = current_uid(); | 2892 | info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); |
2875 | 2893 | ||
2876 | return do_send_specific(tgid, pid, sig, &info); | 2894 | return do_send_specific(tgid, pid, sig, &info); |
2877 | } | 2895 | } |
@@ -3133,7 +3151,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, | |||
3133 | return -EINVAL; | 3151 | return -EINVAL; |
3134 | } | 3152 | } |
3135 | 3153 | ||
3136 | set_current_blocked(&new_blocked); | 3154 | __set_current_blocked(&new_blocked); |
3137 | } | 3155 | } |
3138 | 3156 | ||
3139 | if (oset) { | 3157 | if (oset) { |
@@ -3197,7 +3215,6 @@ SYSCALL_DEFINE1(ssetmask, int, newmask) | |||
3197 | int old = current->blocked.sig[0]; | 3215 | int old = current->blocked.sig[0]; |
3198 | sigset_t newset; | 3216 | sigset_t newset; |
3199 | 3217 | ||
3200 | siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP))); | ||
3201 | set_current_blocked(&newset); | 3218 | set_current_blocked(&newset); |
3202 | 3219 | ||
3203 | return old; | 3220 | return old; |
@@ -3236,6 +3253,17 @@ SYSCALL_DEFINE0(pause) | |||
3236 | 3253 | ||
3237 | #endif | 3254 | #endif |
3238 | 3255 | ||
3256 | int sigsuspend(sigset_t *set) | ||
3257 | { | ||
3258 | current->saved_sigmask = current->blocked; | ||
3259 | set_current_blocked(set); | ||
3260 | |||
3261 | current->state = TASK_INTERRUPTIBLE; | ||
3262 | schedule(); | ||
3263 | set_restore_sigmask(); | ||
3264 | return -ERESTARTNOHAND; | ||
3265 | } | ||
3266 | |||
3239 | #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND | 3267 | #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND |
3240 | /** | 3268 | /** |
3241 | * sys_rt_sigsuspend - replace the signal mask for a value with the | 3269 | * sys_rt_sigsuspend - replace the signal mask for a value with the |
@@ -3253,15 +3281,7 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) | |||
3253 | 3281 | ||
3254 | if (copy_from_user(&newset, unewset, sizeof(newset))) | 3282 | if (copy_from_user(&newset, unewset, sizeof(newset))) |
3255 | return -EFAULT; | 3283 | return -EFAULT; |
3256 | sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); | 3284 | return sigsuspend(&newset); |
3257 | |||
3258 | current->saved_sigmask = current->blocked; | ||
3259 | set_current_blocked(&newset); | ||
3260 | |||
3261 | current->state = TASK_INTERRUPTIBLE; | ||
3262 | schedule(); | ||
3263 | set_restore_sigmask(); | ||
3264 | return -ERESTARTNOHAND; | ||
3265 | } | 3285 | } |
3266 | #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ | 3286 | #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ |
3267 | 3287 | ||
diff --git a/kernel/smp.c b/kernel/smp.c index 2f8b10ecf759..d0ae5b24875e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -13,6 +13,8 @@ | |||
13 | #include <linux/smp.h> | 13 | #include <linux/smp.h> |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | 15 | ||
16 | #include "smpboot.h" | ||
17 | |||
16 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS | 18 | #ifdef CONFIG_USE_GENERIC_SMP_HELPERS |
17 | static struct { | 19 | static struct { |
18 | struct list_head queue; | 20 | struct list_head queue; |
@@ -669,6 +671,8 @@ void __init smp_init(void) | |||
669 | { | 671 | { |
670 | unsigned int cpu; | 672 | unsigned int cpu; |
671 | 673 | ||
674 | idle_threads_init(); | ||
675 | |||
672 | /* FIXME: This should be done in userspace --RR */ | 676 | /* FIXME: This should be done in userspace --RR */ |
673 | for_each_present_cpu(cpu) { | 677 | for_each_present_cpu(cpu) { |
674 | if (num_online_cpus() >= setup_max_cpus) | 678 | if (num_online_cpus() >= setup_max_cpus) |
@@ -791,3 +795,26 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | |||
791 | } | 795 | } |
792 | } | 796 | } |
793 | EXPORT_SYMBOL(on_each_cpu_cond); | 797 | EXPORT_SYMBOL(on_each_cpu_cond); |
798 | |||
799 | static void do_nothing(void *unused) | ||
800 | { | ||
801 | } | ||
802 | |||
803 | /** | ||
804 | * kick_all_cpus_sync - Force all cpus out of idle | ||
805 | * | ||
806 | * Used to synchronize the update of pm_idle function pointer. It's | ||
807 | * called after the pointer is updated and returns after the dummy | ||
808 | * callback function has been executed on all cpus. The execution of | ||
809 | * the function can only happen on the remote cpus after they have | ||
810 | * left the idle function which had been called via pm_idle function | ||
811 | * pointer. So it's guaranteed that nothing uses the previous pointer | ||
812 | * anymore. | ||
813 | */ | ||
814 | void kick_all_cpus_sync(void) | ||
815 | { | ||
816 | /* Make sure the change is visible before we kick the cpus */ | ||
817 | smp_mb(); | ||
818 | smp_call_function(do_nothing, NULL, 1); | ||
819 | } | ||
820 | EXPORT_SYMBOL_GPL(kick_all_cpus_sync); | ||
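
As the kerneldoc above describes, kick_all_cpus_sync() is a publish-then-drain pattern: update the pointer first, then make sure no CPU can still be executing through the old value. A minimal sketch, using pm_idle as the motivating example named in the comment (the declaration and the exact caller live in arch code):

extern void (*pm_idle)(void);	/* arch-provided idle hook, per the kerneldoc above */

void install_new_idle(void (*new_idle)(void))
{
	pm_idle = new_idle;	/* publish the new callback */
	kick_all_cpus_sync();	/* returns only after every CPU has left the old idle routine */
}
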
diff --git a/kernel/smpboot.c b/kernel/smpboot.c new file mode 100644 index 000000000000..98f60c5caa1b --- /dev/null +++ b/kernel/smpboot.c | |||
@@ -0,0 +1,67 @@ | |||
1 | /* | ||
2 | * Common SMP CPU bringup/teardown functions | ||
3 | */ | ||
4 | #include <linux/err.h> | ||
5 | #include <linux/smp.h> | ||
6 | #include <linux/init.h> | ||
7 | #include <linux/sched.h> | ||
8 | #include <linux/percpu.h> | ||
9 | |||
10 | #include "smpboot.h" | ||
11 | |||
12 | #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD | ||
13 | /* | ||
14 | * For the hotplug case we keep the task structs around and reuse | ||
15 | * them. | ||
16 | */ | ||
17 | static DEFINE_PER_CPU(struct task_struct *, idle_threads); | ||
18 | |||
19 | struct task_struct * __cpuinit idle_thread_get(unsigned int cpu) | ||
20 | { | ||
21 | struct task_struct *tsk = per_cpu(idle_threads, cpu); | ||
22 | |||
23 | if (!tsk) | ||
24 | return ERR_PTR(-ENOMEM); | ||
25 | init_idle(tsk, cpu); | ||
26 | return tsk; | ||
27 | } | ||
28 | |||
29 | void __init idle_thread_set_boot_cpu(void) | ||
30 | { | ||
31 | per_cpu(idle_threads, smp_processor_id()) = current; | ||
32 | } | ||
33 | |||
34 | /** | ||
35 | * idle_init - Initialize the idle thread for a cpu | ||
36 | * @cpu: The cpu for which the idle thread should be initialized | ||
37 | * | ||
38 | * Creates the thread if it does not exist. | ||
39 | */ | ||
40 | static inline void idle_init(unsigned int cpu) | ||
41 | { | ||
42 | struct task_struct *tsk = per_cpu(idle_threads, cpu); | ||
43 | |||
44 | if (!tsk) { | ||
45 | tsk = fork_idle(cpu); | ||
46 | if (IS_ERR(tsk)) | ||
47 | pr_err("SMP: fork_idle() failed for CPU %u\n", cpu); | ||
48 | else | ||
49 | per_cpu(idle_threads, cpu) = tsk; | ||
50 | } | ||
51 | } | ||
52 | |||
53 | /** | ||
54 | * idle_threads_init - Initialize idle threads for all cpus | ||
55 | */ | ||
56 | void __init idle_threads_init(void) | ||
57 | { | ||
58 | unsigned int cpu, boot_cpu; | ||
59 | |||
60 | boot_cpu = smp_processor_id(); | ||
61 | |||
62 | for_each_possible_cpu(cpu) { | ||
63 | if (cpu != boot_cpu) | ||
64 | idle_init(cpu); | ||
65 | } | ||
66 | } | ||
67 | #endif | ||
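
The intended consumer of idle_thread_get() is the generic hotplug path rather than this file: with CONFIG_GENERIC_SMP_IDLE_THREAD the idle task is forked once and handed back to the architecture on every subsequent online. A sketch, assuming the same series' change that adds the idle-task argument to __cpu_up():

static int bringup_cpu_sketch(unsigned int cpu)
{
	struct task_struct *idle = idle_thread_get(cpu);	/* cached across hotplug */

	if (IS_ERR(idle))
		return PTR_ERR(idle);
	return __cpu_up(cpu, idle);	/* arch starts the CPU on this idle task */
}
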
diff --git a/kernel/smpboot.h b/kernel/smpboot.h new file mode 100644 index 000000000000..80c0acfb8472 --- /dev/null +++ b/kernel/smpboot.h | |||
@@ -0,0 +1,18 @@ | |||
1 | #ifndef SMPBOOT_H | ||
2 | #define SMPBOOT_H | ||
3 | |||
4 | struct task_struct; | ||
5 | |||
6 | int smpboot_prepare(unsigned int cpu); | ||
7 | |||
8 | #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD | ||
9 | struct task_struct *idle_thread_get(unsigned int cpu); | ||
10 | void idle_thread_set_boot_cpu(void); | ||
11 | void idle_threads_init(void); | ||
12 | #else | ||
13 | static inline struct task_struct *idle_thread_get(unsigned int cpu) { return NULL; } | ||
14 | static inline void idle_thread_set_boot_cpu(void) { } | ||
15 | static inline void idle_threads_init(void) { } | ||
16 | #endif | ||
17 | |||
18 | #endif | ||
diff --git a/kernel/srcu.c b/kernel/srcu.c index ba35f3a4a1f4..2095be3318d5 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
@@ -34,10 +34,77 @@ | |||
34 | #include <linux/delay.h> | 34 | #include <linux/delay.h> |
35 | #include <linux/srcu.h> | 35 | #include <linux/srcu.h> |
36 | 36 | ||
37 | /* | ||
38 | * Initialize an rcu_batch structure to empty. | ||
39 | */ | ||
40 | static inline void rcu_batch_init(struct rcu_batch *b) | ||
41 | { | ||
42 | b->head = NULL; | ||
43 | b->tail = &b->head; | ||
44 | } | ||
45 | |||
46 | /* | ||
47 | * Enqueue a callback onto the tail of the specified rcu_batch structure. | ||
48 | */ | ||
49 | static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) | ||
50 | { | ||
51 | *b->tail = head; | ||
52 | b->tail = &head->next; | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | * Is the specified rcu_batch structure empty? | ||
57 | */ | ||
58 | static inline bool rcu_batch_empty(struct rcu_batch *b) | ||
59 | { | ||
60 | return b->tail == &b->head; | ||
61 | } | ||
62 | |||
63 | /* | ||
64 | * Remove the callback at the head of the specified rcu_batch structure | ||
65 | * and return a pointer to it, or return NULL if the structure is empty. | ||
66 | */ | ||
67 | static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) | ||
68 | { | ||
69 | struct rcu_head *head; | ||
70 | |||
71 | if (rcu_batch_empty(b)) | ||
72 | return NULL; | ||
73 | |||
74 | head = b->head; | ||
75 | b->head = head->next; | ||
76 | if (b->tail == &head->next) | ||
77 | rcu_batch_init(b); | ||
78 | |||
79 | return head; | ||
80 | } | ||
81 | |||
82 | /* | ||
83 | * Move all callbacks from the rcu_batch structure specified by "from" to | ||
84 | * the structure specified by "to". | ||
85 | */ | ||
86 | static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) | ||
87 | { | ||
88 | if (!rcu_batch_empty(from)) { | ||
89 | *to->tail = from->head; | ||
90 | to->tail = from->tail; | ||
91 | rcu_batch_init(from); | ||
92 | } | ||
93 | } | ||
94 | |||
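
The four helpers above implement a singly linked, tail-pointer queue; the per-srcu_struct batches initialized in init_srcu_struct_fields() earlier form a pipeline that callbacks move through as grace-period checks complete. A compressed illustration of that lifetime (the real driver is process_srcu(), declared just below; this collapses its stages into one function):

static void rcu_batch_pipeline_sketch(struct srcu_struct *sp, struct rcu_head *head)
{
	rcu_batch_queue(&sp->batch_queue, head);		/* newly queued callback */
	rcu_batch_move(&sp->batch_check0, &sp->batch_queue);	/* first readers check */
	rcu_batch_move(&sp->batch_check1, &sp->batch_check0);	/* second readers check */
	rcu_batch_move(&sp->batch_done, &sp->batch_check1);	/* grace period complete */
	while ((head = rcu_batch_dequeue(&sp->batch_done)) != NULL)
		head->func(head);				/* invoke the callback */
}
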
95 | /* single-thread state-machine */ | ||
96 | static void process_srcu(struct work_struct *work); | ||
97 | |||
37 | static int init_srcu_struct_fields(struct srcu_struct *sp) | 98 | static int init_srcu_struct_fields(struct srcu_struct *sp) |
38 | { | 99 | { |
39 | sp->completed = 0; | 100 | sp->completed = 0; |
40 | mutex_init(&sp->mutex); | 101 | spin_lock_init(&sp->queue_lock); |
102 | sp->running = false; | ||
103 | rcu_batch_init(&sp->batch_queue); | ||
104 | rcu_batch_init(&sp->batch_check0); | ||
105 | rcu_batch_init(&sp->batch_check1); | ||
106 | rcu_batch_init(&sp->batch_done); | ||
107 | INIT_DELAYED_WORK(&sp->work, process_srcu); | ||
41 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); | 108 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); |
42 | return sp->per_cpu_ref ? 0 : -ENOMEM; | 109 | return sp->per_cpu_ref ? 0 : -ENOMEM; |
43 | } | 110 | } |
@@ -73,21 +140,116 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); | |||
73 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 140 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
74 | 141 | ||
75 | /* | 142 | /* |
76 | * srcu_readers_active_idx -- returns approximate number of readers | 143 | * Returns approximate total of the readers' ->seq[] values for the |
77 | * active on the specified rank of per-CPU counters. | 144 | * rank of per-CPU counters specified by idx. |
78 | */ | 145 | */ |
146 | static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) | ||
147 | { | ||
148 | int cpu; | ||
149 | unsigned long sum = 0; | ||
150 | unsigned long t; | ||
79 | 151 | ||
80 | static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) | 152 | for_each_possible_cpu(cpu) { |
153 | t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); | ||
154 | sum += t; | ||
155 | } | ||
156 | return sum; | ||
157 | } | ||
158 | |||
159 | /* | ||
160 | * Returns approximate number of readers active on the specified rank | ||
161 | * of the per-CPU ->c[] counters. | ||
162 | */ | ||
163 | static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) | ||
81 | { | 164 | { |
82 | int cpu; | 165 | int cpu; |
83 | int sum; | 166 | unsigned long sum = 0; |
167 | unsigned long t; | ||
84 | 168 | ||
85 | sum = 0; | 169 | for_each_possible_cpu(cpu) { |
86 | for_each_possible_cpu(cpu) | 170 | t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); |
87 | sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; | 171 | sum += t; |
172 | } | ||
88 | return sum; | 173 | return sum; |
89 | } | 174 | } |
90 | 175 | ||
176 | /* | ||
177 | * Return true if the number of pre-existing readers is determined to | ||
178 | * be stably zero. An example unstable zero can occur if the call | ||
179 | * to srcu_readers_active_idx() misses an __srcu_read_lock() increment, | ||
180 | * but due to task migration, sees the corresponding __srcu_read_unlock() | ||
181 | * decrement. This can happen because srcu_readers_active_idx() takes | ||
182 | * time to sum the array, and might in fact be interrupted or preempted | ||
183 | * partway through the summation. | ||
184 | */ | ||
185 | static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) | ||
186 | { | ||
187 | unsigned long seq; | ||
188 | |||
189 | seq = srcu_readers_seq_idx(sp, idx); | ||
190 | |||
191 | /* | ||
192 | * The following smp_mb() A pairs with the smp_mb() B located in | ||
193 | * __srcu_read_lock(). This pairing ensures that if an | ||
194 | * __srcu_read_lock() increments its counter after the summation | ||
195 | * in srcu_readers_active_idx(), then the corresponding SRCU read-side | ||
196 | * critical section will see any changes made prior to the start | ||
197 | * of the current SRCU grace period. | ||
198 | * | ||
199 | * Also, if the above call to srcu_readers_seq_idx() saw the | ||
200 | * increment of ->seq[], then the call to srcu_readers_active_idx() | ||
201 | * must see the increment of ->c[]. | ||
202 | */ | ||
203 | smp_mb(); /* A */ | ||
204 | |||
205 | /* | ||
206 | * Note that srcu_readers_active_idx() can incorrectly return | ||
207 | * zero even though there is a pre-existing reader throughout. | ||
208 | * To see this, suppose that task A is in a very long SRCU | ||
209 | * read-side critical section that started on CPU 0, and that | ||
210 | * no other reader exists, so that the sum of the counters | ||
211 | * is equal to one. Then suppose that task B starts executing | ||
212 | * srcu_readers_active_idx(), summing up to CPU 1, and then that | ||
213 | * task C starts reading on CPU 0, so that its increment is not | ||
214 | * summed, but finishes reading on CPU 2, so that its decrement | ||
215 | * -is- summed. Then when task B completes its sum, it will | ||
216 | * incorrectly get zero, despite the fact that task A has been | ||
217 | * in its SRCU read-side critical section the whole time. | ||
218 | * | ||
219 | * We therefore do a validation step should srcu_readers_active_idx() | ||
220 | * return zero. | ||
221 | */ | ||
222 | if (srcu_readers_active_idx(sp, idx) != 0) | ||
223 | return false; | ||
224 | |||
225 | /* | ||
226 | * The remainder of this function is the validation step. | ||
227 | * The following smp_mb() D pairs with the smp_mb() C in | ||
228 | * __srcu_read_unlock(). If the __srcu_read_unlock() was seen | ||
229 | * by srcu_readers_active_idx() above, then any destructive | ||
230 | * operation performed after the grace period will happen after | ||
231 | * the corresponding SRCU read-side critical section. | ||
232 | * | ||
233 | * Note that there can be at most NR_CPUS worth of readers using | ||
234 | * the old index, which is not enough to overflow even a 32-bit | ||
235 | * integer. (Yes, this does mean that systems having more than | ||
236 | * a billion or so CPUs need to be 64-bit systems.) Therefore, | ||
237 | * the sum of the ->seq[] counters cannot possibly overflow. | ||
238 | * Therefore, the only way that the return values of the two | ||
239 | * calls to srcu_readers_seq_idx() can be equal is if there were | ||
240 | * no increments of the corresponding rank of ->seq[] counts | ||
241 | * in the interim. But the missed-increment scenario laid out | ||
242 | * above includes an increment of the ->seq[] counter by | ||
243 | * the corresponding __srcu_read_lock(). Therefore, if this | ||
244 | * scenario occurs, the return values from the two calls to | ||
245 | * srcu_readers_seq_idx() will differ, and thus the validation | ||
246 | * step below suffices. | ||
247 | */ | ||
248 | smp_mb(); /* D */ | ||
249 | |||
250 | return srcu_readers_seq_idx(sp, idx) == seq; | ||
251 | } | ||
252 | |||
91 | /** | 253 | /** |
92 | * srcu_readers_active - returns approximate number of readers. | 254 | * srcu_readers_active - returns approximate number of readers. |
93 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). | 255 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). |
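The hunk above replaces a single racy sum with a bracketed check: snapshot the monotonically increasing per-CPU entry counters (->seq[]), verify that the active counters (->c[]) sum to zero, then confirm the entry counters did not move. Stripped of the long commentary, the shape of the check is roughly:

    /* Condensed paraphrase of srcu_readers_active_idx_check(), sketch only. */
    static bool readers_stably_zero(struct srcu_struct *sp, int idx)
    {
        unsigned long seq_before = srcu_readers_seq_idx(sp, idx);

        smp_mb();                               /* pairs with smp_mb() B in __srcu_read_lock() */
        if (srcu_readers_active_idx(sp, idx))   /* someone is (or appears to be) inside */
            return false;
        smp_mb();                               /* pairs with smp_mb() C in __srcu_read_unlock() */

        /* No new entries on this idx while we looked, so the zero was stable. */
        return srcu_readers_seq_idx(sp, idx) == seq_before;
    }
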
@@ -98,7 +260,14 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) | |||
98 | */ | 260 | */ |
99 | static int srcu_readers_active(struct srcu_struct *sp) | 261 | static int srcu_readers_active(struct srcu_struct *sp) |
100 | { | 262 | { |
101 | return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); | 263 | int cpu; |
264 | unsigned long sum = 0; | ||
265 | |||
266 | for_each_possible_cpu(cpu) { | ||
267 | sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); | ||
268 | sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); | ||
269 | } | ||
270 | return sum; | ||
102 | } | 271 | } |
103 | 272 | ||
104 | /** | 273 | /** |
@@ -131,10 +300,11 @@ int __srcu_read_lock(struct srcu_struct *sp) | |||
131 | int idx; | 300 | int idx; |
132 | 301 | ||
133 | preempt_disable(); | 302 | preempt_disable(); |
134 | idx = sp->completed & 0x1; | 303 | idx = rcu_dereference_index_check(sp->completed, |
135 | barrier(); /* ensure compiler looks -once- at sp->completed. */ | 304 | rcu_read_lock_sched_held()) & 0x1; |
136 | per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; | 305 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; |
137 | srcu_barrier(); /* ensure compiler won't misorder critical section. */ | 306 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ |
307 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; | ||
138 | preempt_enable(); | 308 | preempt_enable(); |
139 | return idx; | 309 | return idx; |
140 | } | 310 | } |
@@ -149,8 +319,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); | |||
149 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) | 319 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) |
150 | { | 320 | { |
151 | preempt_disable(); | 321 | preempt_disable(); |
152 | srcu_barrier(); /* ensure compiler won't misorder critical section. */ | 322 | smp_mb(); /* C */ /* Avoid leaking the critical section. */ |
153 | per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; | 323 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1; |
154 | preempt_enable(); | 324 | preempt_enable(); |
155 | } | 325 | } |
156 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | 326 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); |
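For context, smp_mb() B and C order the counter updates against the reader's critical section; the reader pins one side of the counter flip for its whole duration, and unlike rcu_read_lock() it may sleep inside. A typical reader, assuming a struct foo protected by an srcu_struct named my_srcu (illustrative names):

    #include <linux/srcu.h>

    struct foo {
        int value;
    };

    static struct foo __rcu *shared_foo;
    static struct srcu_struct my_srcu;  /* assume init_srcu_struct(&my_srcu) was called */

    static int read_foo_value(void)
    {
        struct foo *p;
        int idx, val;

        idx = srcu_read_lock(&my_srcu);         /* readers may block, unlike rcu_read_lock() */
        p = srcu_dereference(shared_foo, &my_srcu);
        val = p ? p->value : -1;
        srcu_read_unlock(&my_srcu, idx);        /* must pass back the idx obtained above */
        return val;
    }
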
@@ -163,106 +333,119 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); | |||
163 | * we repeatedly block for 1-millisecond time periods. This approach | 333 | * we repeatedly block for 1-millisecond time periods. This approach |
164 | * has done well in testing, so there is no need for a config parameter. | 334 | * has done well in testing, so there is no need for a config parameter. |
165 | */ | 335 | */ |
166 | #define SYNCHRONIZE_SRCU_READER_DELAY 10 | 336 | #define SRCU_RETRY_CHECK_DELAY 5 |
337 | #define SYNCHRONIZE_SRCU_TRYCOUNT 2 | ||
338 | #define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 | ||
167 | 339 | ||
168 | /* | 340 | /* |
169 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). | 341 | * @@@ Wait until all pre-existing readers complete. Such readers |
342 | * will have used the index specified by "idx". | ||
343 | * The caller must ensure that ->completed is not changed while checking, | ||
344 | * and that idx == (->completed & 1) ^ 1. | ||
170 | */ | 345 | */ |
171 | static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | 346 | static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) |
172 | { | 347 | { |
173 | int idx; | 348 | for (;;) { |
174 | 349 | if (srcu_readers_active_idx_check(sp, idx)) | |
175 | rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && | 350 | return true; |
176 | !lock_is_held(&rcu_bh_lock_map) && | 351 | if (--trycount <= 0) |
177 | !lock_is_held(&rcu_lock_map) && | 352 | return false; |
178 | !lock_is_held(&rcu_sched_lock_map), | 353 | udelay(SRCU_RETRY_CHECK_DELAY); |
179 | "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); | 354 | } |
180 | 355 | } | |
181 | idx = sp->completed; | ||
182 | mutex_lock(&sp->mutex); | ||
183 | 356 | ||
184 | /* | 357 | /* |
185 | * Check to see if someone else did the work for us while we were | 358 | * Increment the ->completed counter so that future SRCU readers will |
186 | * waiting to acquire the lock. We need -two- advances of | 359 | * use the other rank of the ->c[] and ->seq[] arrays. This allows |
187 | * the counter, not just one. If there was but one, we might have | 360 | * us to wait for pre-existing readers in a starvation-free manner. |
188 | * shown up -after- our helper's first synchronize_sched(), thus | 361 | */ |
189 | * having failed to prevent CPU-reordering races with concurrent | 362 | static void srcu_flip(struct srcu_struct *sp) |
190 | * srcu_read_unlock()s on other CPUs (see comment below). So we | 363 | { |
191 | * either (1) wait for two or (2) supply the second ourselves. | 364 | sp->completed++; |
192 | */ | 365 | } |
193 | 366 | ||
194 | if ((sp->completed - idx) >= 2) { | 367 | /* |
195 | mutex_unlock(&sp->mutex); | 368 | * Enqueue an SRCU callback on the specified srcu_struct structure, |
196 | return; | 369 | * initiating grace-period processing if it is not already running. |
370 | */ | ||
371 | void call_srcu(struct srcu_struct *sp, struct rcu_head *head, | ||
372 | void (*func)(struct rcu_head *head)) | ||
373 | { | ||
374 | unsigned long flags; | ||
375 | |||
376 | head->next = NULL; | ||
377 | head->func = func; | ||
378 | spin_lock_irqsave(&sp->queue_lock, flags); | ||
379 | rcu_batch_queue(&sp->batch_queue, head); | ||
380 | if (!sp->running) { | ||
381 | sp->running = true; | ||
382 | queue_delayed_work(system_nrt_wq, &sp->work, 0); | ||
197 | } | 383 | } |
384 | spin_unlock_irqrestore(&sp->queue_lock, flags); | ||
385 | } | ||
386 | EXPORT_SYMBOL_GPL(call_srcu); | ||
198 | 387 | ||
199 | sync_func(); /* Force memory barrier on all CPUs. */ | 388 | struct rcu_synchronize { |
389 | struct rcu_head head; | ||
390 | struct completion completion; | ||
391 | }; | ||
200 | 392 | ||
201 | /* | 393 | /* |
202 | * The preceding synchronize_sched() ensures that any CPU that | 394 | * Awaken the corresponding synchronize_srcu() instance now that a |
203 | * sees the new value of sp->completed will also see any preceding | 395 | * grace period has elapsed. |
204 | * changes to data structures made by this CPU. This prevents | 396 | */ |
205 | * some other CPU from reordering the accesses in its SRCU | 397 | static void wakeme_after_rcu(struct rcu_head *head) |
206 | * read-side critical section to precede the corresponding | 398 | { |
207 | * srcu_read_lock() -- ensuring that such references will in | 399 | struct rcu_synchronize *rcu; |
208 | * fact be protected. | ||
209 | * | ||
210 | * So it is now safe to do the flip. | ||
211 | */ | ||
212 | 400 | ||
213 | idx = sp->completed & 0x1; | 401 | rcu = container_of(head, struct rcu_synchronize, head); |
214 | sp->completed++; | 402 | complete(&rcu->completion); |
403 | } | ||
215 | 404 | ||
216 | sync_func(); /* Force memory barrier on all CPUs. */ | 405 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount); |
406 | static void srcu_reschedule(struct srcu_struct *sp); | ||
217 | 407 | ||
218 | /* | 408 | /* |
219 | * At this point, because of the preceding synchronize_sched(), | 409 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). |
220 | * all srcu_read_lock() calls using the old counters have completed. | 410 | */ |
221 | * Their corresponding critical sections might well be still | 411 | static void __synchronize_srcu(struct srcu_struct *sp, int trycount) |
222 | * executing, but the srcu_read_lock() primitives themselves | 412 | { |
223 | * will have finished executing. We initially give readers | 413 | struct rcu_synchronize rcu; |
224 | * an arbitrarily chosen 10 microseconds to get out of their | 414 | struct rcu_head *head = &rcu.head; |
225 | * SRCU read-side critical sections, then loop waiting 1/HZ | 415 | bool done = false; |
226 | * seconds per iteration. The 10-microsecond value has done | ||
227 | * very well in testing. | ||
228 | */ | ||
229 | |||
230 | if (srcu_readers_active_idx(sp, idx)) | ||
231 | udelay(SYNCHRONIZE_SRCU_READER_DELAY); | ||
232 | while (srcu_readers_active_idx(sp, idx)) | ||
233 | schedule_timeout_interruptible(1); | ||
234 | 416 | ||
235 | sync_func(); /* Force memory barrier on all CPUs. */ | 417 | rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && |
418 | !lock_is_held(&rcu_bh_lock_map) && | ||
419 | !lock_is_held(&rcu_lock_map) && | ||
420 | !lock_is_held(&rcu_sched_lock_map), | ||
421 | "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); | ||
236 | 422 | ||
237 | /* | 423 | init_completion(&rcu.completion); |
238 | * The preceding synchronize_sched() forces all srcu_read_unlock() | 424 | |
239 | * primitives that were executing concurrently with the preceding | 425 | head->next = NULL; |
240 | * for_each_possible_cpu() loop to have completed by this point. | 426 | head->func = wakeme_after_rcu; |
241 | * More importantly, it also forces the corresponding SRCU read-side | 427 | spin_lock_irq(&sp->queue_lock); |
242 | * critical sections to have also completed, and the corresponding | 428 | if (!sp->running) { |
243 | * references to SRCU-protected data items to be dropped. | 429 | /* steal the processing owner */ |
244 | * | 430 | sp->running = true; |
245 | * Note: | 431 | rcu_batch_queue(&sp->batch_check0, head); |
246 | * | 432 | spin_unlock_irq(&sp->queue_lock); |
247 | * Despite what you might think at first glance, the | 433 | |
248 | * preceding synchronize_sched() -must- be within the | 434 | srcu_advance_batches(sp, trycount); |
249 | * critical section ended by the following mutex_unlock(). | 435 | if (!rcu_batch_empty(&sp->batch_done)) { |
250 | * Otherwise, a task taking the early exit can race | 436 | BUG_ON(sp->batch_done.head != head); |
251 | * with a srcu_read_unlock(), which might have executed | 437 | rcu_batch_dequeue(&sp->batch_done); |
252 | * just before the preceding srcu_readers_active() check, | 438 | done = true; |
253 | * and whose CPU might have reordered the srcu_read_unlock() | 439 | } |
254 | * with the preceding critical section. In this case, there | 440 | /* give the processing owner to work_struct */ |
255 | * is nothing preventing the synchronize_sched() task that is | 441 | srcu_reschedule(sp); |
256 | * taking the early exit from freeing a data structure that | 442 | } else { |
257 | * is still being referenced (out of order) by the task | 443 | rcu_batch_queue(&sp->batch_queue, head); |
258 | * doing the srcu_read_unlock(). | 444 | spin_unlock_irq(&sp->queue_lock); |
259 | * | 445 | } |
260 | * Alternatively, the comparison with "2" on the early exit | ||
261 | * could be changed to "3", but this increases synchronize_srcu() | ||
262 | * latency for bulk loads. So the current code is preferred. | ||
263 | */ | ||
264 | 446 | ||
265 | mutex_unlock(&sp->mutex); | 447 | if (!done) |
448 | wait_for_completion(&rcu.completion); | ||
266 | } | 449 | } |
267 | 450 | ||
268 | /** | 451 | /** |
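call_srcu() is the new asynchronous interface introduced by this hunk; synchronize_srcu() above is now just a queued wakeme_after_rcu callback plus a completion. An updater that previously had to block can instead free deferred state like this (sketch only; shared_foo and my_srcu as in the reader sketch earlier):

    #include <linux/slab.h>
    #include <linux/srcu.h>

    struct foo {
        int value;
        struct rcu_head rcu;
    };

    static void free_foo_cb(struct rcu_head *head)
    {
        /* Invoked from the SRCU workqueue once a grace period has elapsed. */
        kfree(container_of(head, struct foo, rcu));
    }

    static void replace_foo(struct foo *new_foo)
    {
        struct foo *old = rcu_dereference_protected(shared_foo, 1); /* update side, sketch */

        rcu_assign_pointer(shared_foo, new_foo);
        if (old)
            call_srcu(&my_srcu, &old->rcu, free_foo_cb);    /* does not block */
    }
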
@@ -281,7 +464,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | |||
281 | */ | 464 | */ |
282 | void synchronize_srcu(struct srcu_struct *sp) | 465 | void synchronize_srcu(struct srcu_struct *sp) |
283 | { | 466 | { |
284 | __synchronize_srcu(sp, synchronize_sched); | 467 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT); |
285 | } | 468 | } |
286 | EXPORT_SYMBOL_GPL(synchronize_srcu); | 469 | EXPORT_SYMBOL_GPL(synchronize_srcu); |
287 | 470 | ||
@@ -289,18 +472,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); | |||
289 | * synchronize_srcu_expedited - Brute-force SRCU grace period | 472 | * synchronize_srcu_expedited - Brute-force SRCU grace period |
290 | * @sp: srcu_struct with which to synchronize. | 473 | * @sp: srcu_struct with which to synchronize. |
291 | * | 474 | * |
292 | * Wait for an SRCU grace period to elapse, but use a "big hammer" | 475 | * Wait for an SRCU grace period to elapse, but be more aggressive about |
293 | * approach to force the grace period to end quickly. This consumes | 476 | * spinning rather than blocking when waiting. |
294 | * significant time on all CPUs and is unfriendly to real-time workloads, | ||
295 | * so is thus not recommended for any sort of common-case code. In fact, | ||
296 | * if you are using synchronize_srcu_expedited() in a loop, please | ||
297 | * restructure your code to batch your updates, and then use a single | ||
298 | * synchronize_srcu() instead. | ||
299 | * | 477 | * |
300 | * Note that it is illegal to call this function while holding any lock | 478 | * Note that it is illegal to call this function while holding any lock |
301 | * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal | 479 | * that is acquired by a CPU-hotplug notifier. It is also illegal to call |
302 | * to call this function from a CPU-hotplug notifier. Failing to observe | ||
303 | * these restriction will result in deadlock. It is also illegal to call | ||
304 | * synchronize_srcu_expedited() from the corresponding SRCU read-side | 480 | * synchronize_srcu_expedited() from the corresponding SRCU read-side |
305 | * critical section; doing so will result in deadlock. However, it is | 481 | * critical section; doing so will result in deadlock. However, it is |
306 | * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct | 482 | * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct |
@@ -309,20 +485,166 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); | |||
309 | */ | 485 | */ |
310 | void synchronize_srcu_expedited(struct srcu_struct *sp) | 486 | void synchronize_srcu_expedited(struct srcu_struct *sp) |
311 | { | 487 | { |
312 | __synchronize_srcu(sp, synchronize_sched_expedited); | 488 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); |
313 | } | 489 | } |
314 | EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); | 490 | EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); |
315 | 491 | ||
316 | /** | 492 | /** |
493 | * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. | ||
494 | */ | ||
495 | void srcu_barrier(struct srcu_struct *sp) | ||
496 | { | ||
497 | synchronize_srcu(sp); | ||
498 | } | ||
499 | EXPORT_SYMBOL_GPL(srcu_barrier); | ||
500 | |||
501 | /** | ||
317 | * srcu_batches_completed - return batches completed. | 502 | * srcu_batches_completed - return batches completed. |
318 | * @sp: srcu_struct on which to report batch completion. | 503 | * @sp: srcu_struct on which to report batch completion. |
319 | * | 504 | * |
320 | * Report the number of batches, correlated with, but not necessarily | 505 | * Report the number of batches, correlated with, but not necessarily |
321 | * precisely the same as, the number of grace periods that have elapsed. | 506 | * precisely the same as, the number of grace periods that have elapsed. |
322 | */ | 507 | */ |
323 | |||
324 | long srcu_batches_completed(struct srcu_struct *sp) | 508 | long srcu_batches_completed(struct srcu_struct *sp) |
325 | { | 509 | { |
326 | return sp->completed; | 510 | return sp->completed; |
327 | } | 511 | } |
328 | EXPORT_SYMBOL_GPL(srcu_batches_completed); | 512 | EXPORT_SYMBOL_GPL(srcu_batches_completed); |
513 | |||
514 | #define SRCU_CALLBACK_BATCH 10 | ||
515 | #define SRCU_INTERVAL 1 | ||
516 | |||
517 | /* | ||
518 | * Move any new SRCU callbacks to the first stage of the SRCU grace | ||
519 | * period pipeline. | ||
520 | */ | ||
521 | static void srcu_collect_new(struct srcu_struct *sp) | ||
522 | { | ||
523 | if (!rcu_batch_empty(&sp->batch_queue)) { | ||
524 | spin_lock_irq(&sp->queue_lock); | ||
525 | rcu_batch_move(&sp->batch_check0, &sp->batch_queue); | ||
526 | spin_unlock_irq(&sp->queue_lock); | ||
527 | } | ||
528 | } | ||
529 | |||
530 | /* | ||
531 | * Core SRCU state machine. Advance callbacks from ->batch_check0 to | ||
532 | * ->batch_check1 and then to ->batch_done as readers drain. | ||
533 | */ | ||
534 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount) | ||
535 | { | ||
536 | int idx = 1 ^ (sp->completed & 1); | ||
537 | |||
538 | /* | ||
539 | * Because readers might be delayed for an extended period after | ||
540 | * fetching ->completed for their index, at any point in time there | ||
541 | * might well be readers using both idx=0 and idx=1. We therefore | ||
542 | * need to wait for readers to clear from both index values before | ||
543 | * invoking a callback. | ||
544 | */ | ||
545 | |||
546 | if (rcu_batch_empty(&sp->batch_check0) && | ||
547 | rcu_batch_empty(&sp->batch_check1)) | ||
548 | return; /* no callbacks need to be advanced */ | ||
549 | |||
550 | if (!try_check_zero(sp, idx, trycount)) | ||
551 | return; /* failed to advance, will try after SRCU_INTERVAL */ | ||
552 | |||
553 | /* | ||
554 | * The callbacks in ->batch_check1 have already done their | ||
555 | * first zero check and flip, back when they were enqueued on | ||
556 | * ->batch_check0 in a previous invocation of srcu_advance_batches(). | ||
557 | * (Presumably try_check_zero() returned false during that | ||
558 | * invocation, leaving the callbacks stranded on ->batch_check1.) | ||
559 | * They are therefore ready to invoke, so move them to ->batch_done. | ||
560 | */ | ||
561 | rcu_batch_move(&sp->batch_done, &sp->batch_check1); | ||
562 | |||
563 | if (rcu_batch_empty(&sp->batch_check0)) | ||
564 | return; /* no callbacks need to be advanced */ | ||
565 | srcu_flip(sp); | ||
566 | |||
567 | /* | ||
568 | * The callbacks in ->batch_check0 just finished their | ||
569 | * first zero check and flip, so move them to ->batch_check1 | ||
570 | * for future checking on the other idx. | ||
571 | */ | ||
572 | rcu_batch_move(&sp->batch_check1, &sp->batch_check0); | ||
573 | |||
574 | /* | ||
575 | * SRCU read-side critical sections are normally short, so check | ||
576 | * at least twice in quick succession after a flip. | ||
577 | */ | ||
578 | trycount = trycount < 2 ? 2 : trycount; | ||
579 | if (!try_check_zero(sp, idx^1, trycount)) | ||
580 | return; /* failed to advance, will try after SRCU_INTERVAL */ | ||
581 | |||
582 | /* | ||
583 | * The callbacks in ->batch_check1 have now waited for all | ||
584 | * pre-existing readers using both idx values. They are therefore | ||
585 | * ready to invoke, so move them to ->batch_done. | ||
586 | */ | ||
587 | rcu_batch_move(&sp->batch_done, &sp->batch_check1); | ||
588 | } | ||
589 | |||
590 | /* | ||
591 | * Invoke a limited number of SRCU callbacks that have passed through | ||
592 | * their grace period. If there are more to do, SRCU will reschedule | ||
593 | * the workqueue. | ||
594 | */ | ||
595 | static void srcu_invoke_callbacks(struct srcu_struct *sp) | ||
596 | { | ||
597 | int i; | ||
598 | struct rcu_head *head; | ||
599 | |||
600 | for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { | ||
601 | head = rcu_batch_dequeue(&sp->batch_done); | ||
602 | if (!head) | ||
603 | break; | ||
604 | local_bh_disable(); | ||
605 | head->func(head); | ||
606 | local_bh_enable(); | ||
607 | } | ||
608 | } | ||
609 | |||
610 | /* | ||
611 | * Finished one round of SRCU grace period. Start another if there are | ||
612 | * more SRCU callbacks queued, otherwise put SRCU into not-running state. | ||
613 | */ | ||
614 | static void srcu_reschedule(struct srcu_struct *sp) | ||
615 | { | ||
616 | bool pending = true; | ||
617 | |||
618 | if (rcu_batch_empty(&sp->batch_done) && | ||
619 | rcu_batch_empty(&sp->batch_check1) && | ||
620 | rcu_batch_empty(&sp->batch_check0) && | ||
621 | rcu_batch_empty(&sp->batch_queue)) { | ||
622 | spin_lock_irq(&sp->queue_lock); | ||
623 | if (rcu_batch_empty(&sp->batch_done) && | ||
624 | rcu_batch_empty(&sp->batch_check1) && | ||
625 | rcu_batch_empty(&sp->batch_check0) && | ||
626 | rcu_batch_empty(&sp->batch_queue)) { | ||
627 | sp->running = false; | ||
628 | pending = false; | ||
629 | } | ||
630 | spin_unlock_irq(&sp->queue_lock); | ||
631 | } | ||
632 | |||
633 | if (pending) | ||
634 | queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL); | ||
635 | } | ||
636 | |||
637 | /* | ||
638 | * This is the work-queue function that handles SRCU grace periods. | ||
639 | */ | ||
640 | static void process_srcu(struct work_struct *work) | ||
641 | { | ||
642 | struct srcu_struct *sp; | ||
643 | |||
644 | sp = container_of(work, struct srcu_struct, work.work); | ||
645 | |||
646 | srcu_collect_new(sp); | ||
647 | srcu_advance_batches(sp, 1); | ||
648 | srcu_invoke_callbacks(sp); | ||
649 | srcu_reschedule(sp); | ||
650 | } | ||
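Taken together, the update side is now driven entirely by this workqueue state machine: collect new callbacks, check-zero on one index, flip, check-zero on the other, then invoke. One practical consequence is that an srcu_struct which has seen call_srcu() must not be torn down until its callbacks have run; a plausible setup/teardown pairing, assuming my_srcu from the sketches above:

    static int setup_example(void)
    {
        return init_srcu_struct(&my_srcu);      /* allocates the per-CPU counters */
    }

    static void teardown_example(void)
    {
        /* Wait for callbacks already passed to call_srcu() to be invoked... */
        srcu_barrier(&my_srcu);
        /* ...then release the per-CPU state. */
        cleanup_srcu_struct(&my_srcu);
    }
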
diff --git a/kernel/sys.c b/kernel/sys.c index e7006eb6c1e4..2d39a84cd857 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -36,6 +36,8 @@ | |||
36 | #include <linux/personality.h> | 36 | #include <linux/personality.h> |
37 | #include <linux/ptrace.h> | 37 | #include <linux/ptrace.h> |
38 | #include <linux/fs_struct.h> | 38 | #include <linux/fs_struct.h> |
39 | #include <linux/file.h> | ||
40 | #include <linux/mount.h> | ||
39 | #include <linux/gfp.h> | 41 | #include <linux/gfp.h> |
40 | #include <linux/syscore_ops.h> | 42 | #include <linux/syscore_ops.h> |
41 | #include <linux/version.h> | 43 | #include <linux/version.h> |
@@ -93,10 +95,8 @@ | |||
93 | int overflowuid = DEFAULT_OVERFLOWUID; | 95 | int overflowuid = DEFAULT_OVERFLOWUID; |
94 | int overflowgid = DEFAULT_OVERFLOWGID; | 96 | int overflowgid = DEFAULT_OVERFLOWGID; |
95 | 97 | ||
96 | #ifdef CONFIG_UID16 | ||
97 | EXPORT_SYMBOL(overflowuid); | 98 | EXPORT_SYMBOL(overflowuid); |
98 | EXPORT_SYMBOL(overflowgid); | 99 | EXPORT_SYMBOL(overflowgid); |
99 | #endif | ||
100 | 100 | ||
101 | /* | 101 | /* |
102 | * the same as above, but for filesystems which can only store a 16-bit | 102 | * the same as above, but for filesystems which can only store a 16-bit |
@@ -133,11 +133,10 @@ static bool set_one_prio_perm(struct task_struct *p) | |||
133 | { | 133 | { |
134 | const struct cred *cred = current_cred(), *pcred = __task_cred(p); | 134 | const struct cred *cred = current_cred(), *pcred = __task_cred(p); |
135 | 135 | ||
136 | if (pcred->user->user_ns == cred->user->user_ns && | 136 | if (uid_eq(pcred->uid, cred->euid) || |
137 | (pcred->uid == cred->euid || | 137 | uid_eq(pcred->euid, cred->euid)) |
138 | pcred->euid == cred->euid)) | ||
139 | return true; | 138 | return true; |
140 | if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) | 139 | if (ns_capable(pcred->user_ns, CAP_SYS_NICE)) |
141 | return true; | 140 | return true; |
142 | return false; | 141 | return false; |
143 | } | 142 | } |
@@ -177,6 +176,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) | |||
177 | const struct cred *cred = current_cred(); | 176 | const struct cred *cred = current_cred(); |
178 | int error = -EINVAL; | 177 | int error = -EINVAL; |
179 | struct pid *pgrp; | 178 | struct pid *pgrp; |
179 | kuid_t uid; | ||
180 | 180 | ||
181 | if (which > PRIO_USER || which < PRIO_PROCESS) | 181 | if (which > PRIO_USER || which < PRIO_PROCESS) |
182 | goto out; | 182 | goto out; |
@@ -209,18 +209,19 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) | |||
209 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | 209 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); |
210 | break; | 210 | break; |
211 | case PRIO_USER: | 211 | case PRIO_USER: |
212 | user = (struct user_struct *) cred->user; | 212 | uid = make_kuid(cred->user_ns, who); |
213 | user = cred->user; | ||
213 | if (!who) | 214 | if (!who) |
214 | who = cred->uid; | 215 | uid = cred->uid; |
215 | else if ((who != cred->uid) && | 216 | else if (!uid_eq(uid, cred->uid) && |
216 | !(user = find_user(who))) | 217 | !(user = find_user(uid))) |
217 | goto out_unlock; /* No processes for this user */ | 218 | goto out_unlock; /* No processes for this user */ |
218 | 219 | ||
219 | do_each_thread(g, p) { | 220 | do_each_thread(g, p) { |
220 | if (__task_cred(p)->uid == who) | 221 | if (uid_eq(task_uid(p), uid)) |
221 | error = set_one_prio(p, niceval, error); | 222 | error = set_one_prio(p, niceval, error); |
222 | } while_each_thread(g, p); | 223 | } while_each_thread(g, p); |
223 | if (who != cred->uid) | 224 | if (!uid_eq(uid, cred->uid)) |
224 | free_uid(user); /* For find_user() */ | 225 | free_uid(user); /* For find_user() */ |
225 | break; | 226 | break; |
226 | } | 227 | } |
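The pattern repeated throughout the kernel/sys.c changes is: convert the userspace-supplied uid_t/gid_t into a namespace-qualified kuid_t/kgid_t with make_kuid()/make_kgid(), reject unmappable values, and compare only with uid_eq()/gid_eq(); raw integer comparison no longer makes sense once user namespaces can remap ids. A condensed sketch of that idiom (illustrative helper, not from the patch):

    static int check_uid_matches_caller(uid_t uid)
    {
        const struct cred *cred = current_cred();
        kuid_t kuid = make_kuid(cred->user_ns, uid);    /* map into the caller's namespace */

        if (!uid_valid(kuid))           /* uid has no mapping in this namespace */
            return -EINVAL;
        if (!uid_eq(kuid, cred->uid) && !uid_eq(kuid, cred->euid))
            return -EPERM;              /* compare kuid_t values, never raw integers */
        return 0;
    }
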
@@ -244,6 +245,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
244 | const struct cred *cred = current_cred(); | 245 | const struct cred *cred = current_cred(); |
245 | long niceval, retval = -ESRCH; | 246 | long niceval, retval = -ESRCH; |
246 | struct pid *pgrp; | 247 | struct pid *pgrp; |
248 | kuid_t uid; | ||
247 | 249 | ||
248 | if (which > PRIO_USER || which < PRIO_PROCESS) | 250 | if (which > PRIO_USER || which < PRIO_PROCESS) |
249 | return -EINVAL; | 251 | return -EINVAL; |
@@ -274,21 +276,22 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
274 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | 276 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); |
275 | break; | 277 | break; |
276 | case PRIO_USER: | 278 | case PRIO_USER: |
277 | user = (struct user_struct *) cred->user; | 279 | uid = make_kuid(cred->user_ns, who); |
280 | user = cred->user; | ||
278 | if (!who) | 281 | if (!who) |
279 | who = cred->uid; | 282 | uid = cred->uid; |
280 | else if ((who != cred->uid) && | 283 | else if (!uid_eq(uid, cred->uid) && |
281 | !(user = find_user(who))) | 284 | !(user = find_user(uid))) |
282 | goto out_unlock; /* No processes for this user */ | 285 | goto out_unlock; /* No processes for this user */ |
283 | 286 | ||
284 | do_each_thread(g, p) { | 287 | do_each_thread(g, p) { |
285 | if (__task_cred(p)->uid == who) { | 288 | if (uid_eq(task_uid(p), uid)) { |
286 | niceval = 20 - task_nice(p); | 289 | niceval = 20 - task_nice(p); |
287 | if (niceval > retval) | 290 | if (niceval > retval) |
288 | retval = niceval; | 291 | retval = niceval; |
289 | } | 292 | } |
290 | } while_each_thread(g, p); | 293 | } while_each_thread(g, p); |
291 | if (who != cred->uid) | 294 | if (!uid_eq(uid, cred->uid)) |
292 | free_uid(user); /* for find_user() */ | 295 | free_uid(user); /* for find_user() */ |
293 | break; | 296 | break; |
294 | } | 297 | } |
@@ -553,9 +556,19 @@ void ctrl_alt_del(void) | |||
553 | */ | 556 | */ |
554 | SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | 557 | SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) |
555 | { | 558 | { |
559 | struct user_namespace *ns = current_user_ns(); | ||
556 | const struct cred *old; | 560 | const struct cred *old; |
557 | struct cred *new; | 561 | struct cred *new; |
558 | int retval; | 562 | int retval; |
563 | kgid_t krgid, kegid; | ||
564 | |||
565 | krgid = make_kgid(ns, rgid); | ||
566 | kegid = make_kgid(ns, egid); | ||
567 | |||
568 | if ((rgid != (gid_t) -1) && !gid_valid(krgid)) | ||
569 | return -EINVAL; | ||
570 | if ((egid != (gid_t) -1) && !gid_valid(kegid)) | ||
571 | return -EINVAL; | ||
559 | 572 | ||
560 | new = prepare_creds(); | 573 | new = prepare_creds(); |
561 | if (!new) | 574 | if (!new) |
@@ -564,25 +577,25 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | |||
564 | 577 | ||
565 | retval = -EPERM; | 578 | retval = -EPERM; |
566 | if (rgid != (gid_t) -1) { | 579 | if (rgid != (gid_t) -1) { |
567 | if (old->gid == rgid || | 580 | if (gid_eq(old->gid, krgid) || |
568 | old->egid == rgid || | 581 | gid_eq(old->egid, krgid) || |
569 | nsown_capable(CAP_SETGID)) | 582 | nsown_capable(CAP_SETGID)) |
570 | new->gid = rgid; | 583 | new->gid = krgid; |
571 | else | 584 | else |
572 | goto error; | 585 | goto error; |
573 | } | 586 | } |
574 | if (egid != (gid_t) -1) { | 587 | if (egid != (gid_t) -1) { |
575 | if (old->gid == egid || | 588 | if (gid_eq(old->gid, kegid) || |
576 | old->egid == egid || | 589 | gid_eq(old->egid, kegid) || |
577 | old->sgid == egid || | 590 | gid_eq(old->sgid, kegid) || |
578 | nsown_capable(CAP_SETGID)) | 591 | nsown_capable(CAP_SETGID)) |
579 | new->egid = egid; | 592 | new->egid = kegid; |
580 | else | 593 | else |
581 | goto error; | 594 | goto error; |
582 | } | 595 | } |
583 | 596 | ||
584 | if (rgid != (gid_t) -1 || | 597 | if (rgid != (gid_t) -1 || |
585 | (egid != (gid_t) -1 && egid != old->gid)) | 598 | (egid != (gid_t) -1 && !gid_eq(kegid, old->gid))) |
586 | new->sgid = new->egid; | 599 | new->sgid = new->egid; |
587 | new->fsgid = new->egid; | 600 | new->fsgid = new->egid; |
588 | 601 | ||
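setregid()/setresgid() keep the traditional "(gid_t)-1 means leave unchanged" convention, so the new prologue only enforces validity for arguments the caller actually supplied. The same check in isolation (hypothetical helper):

    static int validate_optional_gid(struct user_namespace *ns, gid_t gid, kgid_t *out)
    {
        *out = make_kgid(ns, gid);
        if (gid != (gid_t) -1 && !gid_valid(*out))
            return -EINVAL;     /* supplied but unmappable in this namespace */
        return 0;               /* either valid, or the "don't change" sentinel */
    }
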
@@ -600,9 +613,15 @@ error: | |||
600 | */ | 613 | */ |
601 | SYSCALL_DEFINE1(setgid, gid_t, gid) | 614 | SYSCALL_DEFINE1(setgid, gid_t, gid) |
602 | { | 615 | { |
616 | struct user_namespace *ns = current_user_ns(); | ||
603 | const struct cred *old; | 617 | const struct cred *old; |
604 | struct cred *new; | 618 | struct cred *new; |
605 | int retval; | 619 | int retval; |
620 | kgid_t kgid; | ||
621 | |||
622 | kgid = make_kgid(ns, gid); | ||
623 | if (!gid_valid(kgid)) | ||
624 | return -EINVAL; | ||
606 | 625 | ||
607 | new = prepare_creds(); | 626 | new = prepare_creds(); |
608 | if (!new) | 627 | if (!new) |
@@ -611,9 +630,9 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) | |||
611 | 630 | ||
612 | retval = -EPERM; | 631 | retval = -EPERM; |
613 | if (nsown_capable(CAP_SETGID)) | 632 | if (nsown_capable(CAP_SETGID)) |
614 | new->gid = new->egid = new->sgid = new->fsgid = gid; | 633 | new->gid = new->egid = new->sgid = new->fsgid = kgid; |
615 | else if (gid == old->gid || gid == old->sgid) | 634 | else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) |
616 | new->egid = new->fsgid = gid; | 635 | new->egid = new->fsgid = kgid; |
617 | else | 636 | else |
618 | goto error; | 637 | goto error; |
619 | 638 | ||
@@ -631,7 +650,7 @@ static int set_user(struct cred *new) | |||
631 | { | 650 | { |
632 | struct user_struct *new_user; | 651 | struct user_struct *new_user; |
633 | 652 | ||
634 | new_user = alloc_uid(current_user_ns(), new->uid); | 653 | new_user = alloc_uid(new->uid); |
635 | if (!new_user) | 654 | if (!new_user) |
636 | return -EAGAIN; | 655 | return -EAGAIN; |
637 | 656 | ||
@@ -670,9 +689,19 @@ static int set_user(struct cred *new) | |||
670 | */ | 689 | */ |
671 | SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | 690 | SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) |
672 | { | 691 | { |
692 | struct user_namespace *ns = current_user_ns(); | ||
673 | const struct cred *old; | 693 | const struct cred *old; |
674 | struct cred *new; | 694 | struct cred *new; |
675 | int retval; | 695 | int retval; |
696 | kuid_t kruid, keuid; | ||
697 | |||
698 | kruid = make_kuid(ns, ruid); | ||
699 | keuid = make_kuid(ns, euid); | ||
700 | |||
701 | if ((ruid != (uid_t) -1) && !uid_valid(kruid)) | ||
702 | return -EINVAL; | ||
703 | if ((euid != (uid_t) -1) && !uid_valid(keuid)) | ||
704 | return -EINVAL; | ||
676 | 705 | ||
677 | new = prepare_creds(); | 706 | new = prepare_creds(); |
678 | if (!new) | 707 | if (!new) |
@@ -681,29 +710,29 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | |||
681 | 710 | ||
682 | retval = -EPERM; | 711 | retval = -EPERM; |
683 | if (ruid != (uid_t) -1) { | 712 | if (ruid != (uid_t) -1) { |
684 | new->uid = ruid; | 713 | new->uid = kruid; |
685 | if (old->uid != ruid && | 714 | if (!uid_eq(old->uid, kruid) && |
686 | old->euid != ruid && | 715 | !uid_eq(old->euid, kruid) && |
687 | !nsown_capable(CAP_SETUID)) | 716 | !nsown_capable(CAP_SETUID)) |
688 | goto error; | 717 | goto error; |
689 | } | 718 | } |
690 | 719 | ||
691 | if (euid != (uid_t) -1) { | 720 | if (euid != (uid_t) -1) { |
692 | new->euid = euid; | 721 | new->euid = keuid; |
693 | if (old->uid != euid && | 722 | if (!uid_eq(old->uid, keuid) && |
694 | old->euid != euid && | 723 | !uid_eq(old->euid, keuid) && |
695 | old->suid != euid && | 724 | !uid_eq(old->suid, keuid) && |
696 | !nsown_capable(CAP_SETUID)) | 725 | !nsown_capable(CAP_SETUID)) |
697 | goto error; | 726 | goto error; |
698 | } | 727 | } |
699 | 728 | ||
700 | if (new->uid != old->uid) { | 729 | if (!uid_eq(new->uid, old->uid)) { |
701 | retval = set_user(new); | 730 | retval = set_user(new); |
702 | if (retval < 0) | 731 | if (retval < 0) |
703 | goto error; | 732 | goto error; |
704 | } | 733 | } |
705 | if (ruid != (uid_t) -1 || | 734 | if (ruid != (uid_t) -1 || |
706 | (euid != (uid_t) -1 && euid != old->uid)) | 735 | (euid != (uid_t) -1 && !uid_eq(keuid, old->uid))) |
707 | new->suid = new->euid; | 736 | new->suid = new->euid; |
708 | new->fsuid = new->euid; | 737 | new->fsuid = new->euid; |
709 | 738 | ||
@@ -731,9 +760,15 @@ error: | |||
731 | */ | 760 | */ |
732 | SYSCALL_DEFINE1(setuid, uid_t, uid) | 761 | SYSCALL_DEFINE1(setuid, uid_t, uid) |
733 | { | 762 | { |
763 | struct user_namespace *ns = current_user_ns(); | ||
734 | const struct cred *old; | 764 | const struct cred *old; |
735 | struct cred *new; | 765 | struct cred *new; |
736 | int retval; | 766 | int retval; |
767 | kuid_t kuid; | ||
768 | |||
769 | kuid = make_kuid(ns, uid); | ||
770 | if (!uid_valid(kuid)) | ||
771 | return -EINVAL; | ||
737 | 772 | ||
738 | new = prepare_creds(); | 773 | new = prepare_creds(); |
739 | if (!new) | 774 | if (!new) |
@@ -742,17 +777,17 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) | |||
742 | 777 | ||
743 | retval = -EPERM; | 778 | retval = -EPERM; |
744 | if (nsown_capable(CAP_SETUID)) { | 779 | if (nsown_capable(CAP_SETUID)) { |
745 | new->suid = new->uid = uid; | 780 | new->suid = new->uid = kuid; |
746 | if (uid != old->uid) { | 781 | if (!uid_eq(kuid, old->uid)) { |
747 | retval = set_user(new); | 782 | retval = set_user(new); |
748 | if (retval < 0) | 783 | if (retval < 0) |
749 | goto error; | 784 | goto error; |
750 | } | 785 | } |
751 | } else if (uid != old->uid && uid != new->suid) { | 786 | } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) { |
752 | goto error; | 787 | goto error; |
753 | } | 788 | } |
754 | 789 | ||
755 | new->fsuid = new->euid = uid; | 790 | new->fsuid = new->euid = kuid; |
756 | 791 | ||
757 | retval = security_task_fix_setuid(new, old, LSM_SETID_ID); | 792 | retval = security_task_fix_setuid(new, old, LSM_SETID_ID); |
758 | if (retval < 0) | 793 | if (retval < 0) |
@@ -772,9 +807,24 @@ error: | |||
772 | */ | 807 | */ |
773 | SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) | 808 | SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) |
774 | { | 809 | { |
810 | struct user_namespace *ns = current_user_ns(); | ||
775 | const struct cred *old; | 811 | const struct cred *old; |
776 | struct cred *new; | 812 | struct cred *new; |
777 | int retval; | 813 | int retval; |
814 | kuid_t kruid, keuid, ksuid; | ||
815 | |||
816 | kruid = make_kuid(ns, ruid); | ||
817 | keuid = make_kuid(ns, euid); | ||
818 | ksuid = make_kuid(ns, suid); | ||
819 | |||
820 | if ((ruid != (uid_t) -1) && !uid_valid(kruid)) | ||
821 | return -EINVAL; | ||
822 | |||
823 | if ((euid != (uid_t) -1) && !uid_valid(keuid)) | ||
824 | return -EINVAL; | ||
825 | |||
826 | if ((suid != (uid_t) -1) && !uid_valid(ksuid)) | ||
827 | return -EINVAL; | ||
778 | 828 | ||
779 | new = prepare_creds(); | 829 | new = prepare_creds(); |
780 | if (!new) | 830 | if (!new) |
@@ -784,29 +834,29 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) | |||
784 | 834 | ||
785 | retval = -EPERM; | 835 | retval = -EPERM; |
786 | if (!nsown_capable(CAP_SETUID)) { | 836 | if (!nsown_capable(CAP_SETUID)) { |
787 | if (ruid != (uid_t) -1 && ruid != old->uid && | 837 | if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && |
788 | ruid != old->euid && ruid != old->suid) | 838 | !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) |
789 | goto error; | 839 | goto error; |
790 | if (euid != (uid_t) -1 && euid != old->uid && | 840 | if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && |
791 | euid != old->euid && euid != old->suid) | 841 | !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid)) |
792 | goto error; | 842 | goto error; |
793 | if (suid != (uid_t) -1 && suid != old->uid && | 843 | if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && |
794 | suid != old->euid && suid != old->suid) | 844 | !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid)) |
795 | goto error; | 845 | goto error; |
796 | } | 846 | } |
797 | 847 | ||
798 | if (ruid != (uid_t) -1) { | 848 | if (ruid != (uid_t) -1) { |
799 | new->uid = ruid; | 849 | new->uid = kruid; |
800 | if (ruid != old->uid) { | 850 | if (!uid_eq(kruid, old->uid)) { |
801 | retval = set_user(new); | 851 | retval = set_user(new); |
802 | if (retval < 0) | 852 | if (retval < 0) |
803 | goto error; | 853 | goto error; |
804 | } | 854 | } |
805 | } | 855 | } |
806 | if (euid != (uid_t) -1) | 856 | if (euid != (uid_t) -1) |
807 | new->euid = euid; | 857 | new->euid = keuid; |
808 | if (suid != (uid_t) -1) | 858 | if (suid != (uid_t) -1) |
809 | new->suid = suid; | 859 | new->suid = ksuid; |
810 | new->fsuid = new->euid; | 860 | new->fsuid = new->euid; |
811 | 861 | ||
812 | retval = security_task_fix_setuid(new, old, LSM_SETID_RES); | 862 | retval = security_task_fix_setuid(new, old, LSM_SETID_RES); |
@@ -820,14 +870,19 @@ error: | |||
820 | return retval; | 870 | return retval; |
821 | } | 871 | } |
822 | 872 | ||
823 | SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid) | 873 | SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) |
824 | { | 874 | { |
825 | const struct cred *cred = current_cred(); | 875 | const struct cred *cred = current_cred(); |
826 | int retval; | 876 | int retval; |
877 | uid_t ruid, euid, suid; | ||
878 | |||
879 | ruid = from_kuid_munged(cred->user_ns, cred->uid); | ||
880 | euid = from_kuid_munged(cred->user_ns, cred->euid); | ||
881 | suid = from_kuid_munged(cred->user_ns, cred->suid); | ||
827 | 882 | ||
828 | if (!(retval = put_user(cred->uid, ruid)) && | 883 | if (!(retval = put_user(ruid, ruidp)) && |
829 | !(retval = put_user(cred->euid, euid))) | 884 | !(retval = put_user(euid, euidp))) |
830 | retval = put_user(cred->suid, suid); | 885 | retval = put_user(suid, suidp); |
831 | 886 | ||
832 | return retval; | 887 | return retval; |
833 | } | 888 | } |
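Going the other way, values handed back to userspace (as in getresuid()/getresgid() here) are converted with from_kuid_munged()/from_kgid_munged(), which substitute the overflow id when the kuid has no mapping in the caller's namespace rather than failing. For example (sketch):

    static uid_t current_euid_as_seen_by_caller(void)
    {
        const struct cred *cred = current_cred();

        /* Unmappable ids come back as the overflow uid, typically 65534. */
        return from_kuid_munged(cred->user_ns, cred->euid);
    }
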
@@ -837,9 +892,22 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __u | |||
837 | */ | 892 | */ |
838 | SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) | 893 | SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) |
839 | { | 894 | { |
895 | struct user_namespace *ns = current_user_ns(); | ||
840 | const struct cred *old; | 896 | const struct cred *old; |
841 | struct cred *new; | 897 | struct cred *new; |
842 | int retval; | 898 | int retval; |
899 | kgid_t krgid, kegid, ksgid; | ||
900 | |||
901 | krgid = make_kgid(ns, rgid); | ||
902 | kegid = make_kgid(ns, egid); | ||
903 | ksgid = make_kgid(ns, sgid); | ||
904 | |||
905 | if ((rgid != (gid_t) -1) && !gid_valid(krgid)) | ||
906 | return -EINVAL; | ||
907 | if ((egid != (gid_t) -1) && !gid_valid(kegid)) | ||
908 | return -EINVAL; | ||
909 | if ((sgid != (gid_t) -1) && !gid_valid(ksgid)) | ||
910 | return -EINVAL; | ||
843 | 911 | ||
844 | new = prepare_creds(); | 912 | new = prepare_creds(); |
845 | if (!new) | 913 | if (!new) |
@@ -848,23 +916,23 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) | |||
848 | 916 | ||
849 | retval = -EPERM; | 917 | retval = -EPERM; |
850 | if (!nsown_capable(CAP_SETGID)) { | 918 | if (!nsown_capable(CAP_SETGID)) { |
851 | if (rgid != (gid_t) -1 && rgid != old->gid && | 919 | if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && |
852 | rgid != old->egid && rgid != old->sgid) | 920 | !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) |
853 | goto error; | 921 | goto error; |
854 | if (egid != (gid_t) -1 && egid != old->gid && | 922 | if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && |
855 | egid != old->egid && egid != old->sgid) | 923 | !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid)) |
856 | goto error; | 924 | goto error; |
857 | if (sgid != (gid_t) -1 && sgid != old->gid && | 925 | if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && |
858 | sgid != old->egid && sgid != old->sgid) | 926 | !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid)) |
859 | goto error; | 927 | goto error; |
860 | } | 928 | } |
861 | 929 | ||
862 | if (rgid != (gid_t) -1) | 930 | if (rgid != (gid_t) -1) |
863 | new->gid = rgid; | 931 | new->gid = krgid; |
864 | if (egid != (gid_t) -1) | 932 | if (egid != (gid_t) -1) |
865 | new->egid = egid; | 933 | new->egid = kegid; |
866 | if (sgid != (gid_t) -1) | 934 | if (sgid != (gid_t) -1) |
867 | new->sgid = sgid; | 935 | new->sgid = ksgid; |
868 | new->fsgid = new->egid; | 936 | new->fsgid = new->egid; |
869 | 937 | ||
870 | return commit_creds(new); | 938 | return commit_creds(new); |
@@ -874,14 +942,19 @@ error: | |||
874 | return retval; | 942 | return retval; |
875 | } | 943 | } |
876 | 944 | ||
877 | SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid) | 945 | SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) |
878 | { | 946 | { |
879 | const struct cred *cred = current_cred(); | 947 | const struct cred *cred = current_cred(); |
880 | int retval; | 948 | int retval; |
949 | gid_t rgid, egid, sgid; | ||
950 | |||
951 | rgid = from_kgid_munged(cred->user_ns, cred->gid); | ||
952 | egid = from_kgid_munged(cred->user_ns, cred->egid); | ||
953 | sgid = from_kgid_munged(cred->user_ns, cred->sgid); | ||
881 | 954 | ||
882 | if (!(retval = put_user(cred->gid, rgid)) && | 955 | if (!(retval = put_user(rgid, rgidp)) && |
883 | !(retval = put_user(cred->egid, egid))) | 956 | !(retval = put_user(egid, egidp))) |
884 | retval = put_user(cred->sgid, sgid); | 957 | retval = put_user(sgid, sgidp); |
885 | 958 | ||
886 | return retval; | 959 | return retval; |
887 | } | 960 | } |
@@ -898,18 +971,24 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) | |||
898 | const struct cred *old; | 971 | const struct cred *old; |
899 | struct cred *new; | 972 | struct cred *new; |
900 | uid_t old_fsuid; | 973 | uid_t old_fsuid; |
974 | kuid_t kuid; | ||
975 | |||
976 | old = current_cred(); | ||
977 | old_fsuid = from_kuid_munged(old->user_ns, old->fsuid); | ||
978 | |||
979 | kuid = make_kuid(old->user_ns, uid); | ||
980 | if (!uid_valid(kuid)) | ||
981 | return old_fsuid; | ||
901 | 982 | ||
902 | new = prepare_creds(); | 983 | new = prepare_creds(); |
903 | if (!new) | 984 | if (!new) |
904 | return current_fsuid(); | 985 | return old_fsuid; |
905 | old = current_cred(); | ||
906 | old_fsuid = old->fsuid; | ||
907 | 986 | ||
908 | if (uid == old->uid || uid == old->euid || | 987 | if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || |
909 | uid == old->suid || uid == old->fsuid || | 988 | uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || |
910 | nsown_capable(CAP_SETUID)) { | 989 | nsown_capable(CAP_SETUID)) { |
911 | if (uid != old_fsuid) { | 990 | if (!uid_eq(kuid, old->fsuid)) { |
912 | new->fsuid = uid; | 991 | new->fsuid = kuid; |
913 | if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) | 992 | if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) |
914 | goto change_okay; | 993 | goto change_okay; |
915 | } | 994 | } |
@@ -931,18 +1010,24 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) | |||
931 | const struct cred *old; | 1010 | const struct cred *old; |
932 | struct cred *new; | 1011 | struct cred *new; |
933 | gid_t old_fsgid; | 1012 | gid_t old_fsgid; |
1013 | kgid_t kgid; | ||
1014 | |||
1015 | old = current_cred(); | ||
1016 | old_fsgid = from_kgid_munged(old->user_ns, old->fsgid); | ||
1017 | |||
1018 | kgid = make_kgid(old->user_ns, gid); | ||
1019 | if (!gid_valid(kgid)) | ||
1020 | return old_fsgid; | ||
934 | 1021 | ||
935 | new = prepare_creds(); | 1022 | new = prepare_creds(); |
936 | if (!new) | 1023 | if (!new) |
937 | return current_fsgid(); | 1024 | return old_fsgid; |
938 | old = current_cred(); | ||
939 | old_fsgid = old->fsgid; | ||
940 | 1025 | ||
941 | if (gid == old->gid || gid == old->egid || | 1026 | if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || |
942 | gid == old->sgid || gid == old->fsgid || | 1027 | gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || |
943 | nsown_capable(CAP_SETGID)) { | 1028 | nsown_capable(CAP_SETGID)) { |
944 | if (gid != old_fsgid) { | 1029 | if (!gid_eq(kgid, old->fsgid)) { |
945 | new->fsgid = gid; | 1030 | new->fsgid = kgid; |
946 | goto change_okay; | 1031 | goto change_okay; |
947 | } | 1032 | } |
948 | } | 1033 | } |
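setfsuid()/setfsgid() return the previous fsuid/fsgid rather than an error code, even when the request is rejected (now including the new "unmappable id" case), so userspace can only detect failure by probing again. A common userspace idiom (illustrative):

    #include <sys/types.h>
    #include <sys/fsuid.h>

    /* Returns 0 if the fsuid change took effect, -1 if the kernel refused it. */
    static int set_fsuid_checked(uid_t uid)
    {
        setfsuid(uid);
        if ((uid_t) setfsuid(-1) != uid)    /* -1 is invalid, so this only reads back fsuid */
            return -1;
        return 0;
    }
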
@@ -1295,8 +1380,8 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) | |||
1295 | memcpy(u->nodename, tmp, len); | 1380 | memcpy(u->nodename, tmp, len); |
1296 | memset(u->nodename + len, 0, sizeof(u->nodename) - len); | 1381 | memset(u->nodename + len, 0, sizeof(u->nodename) - len); |
1297 | errno = 0; | 1382 | errno = 0; |
1383 | uts_proc_notify(UTS_PROC_HOSTNAME); | ||
1298 | } | 1384 | } |
1299 | uts_proc_notify(UTS_PROC_HOSTNAME); | ||
1300 | up_write(&uts_sem); | 1385 | up_write(&uts_sem); |
1301 | return errno; | 1386 | return errno; |
1302 | } | 1387 | } |
@@ -1346,8 +1431,8 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) | |||
1346 | memcpy(u->domainname, tmp, len); | 1431 | memcpy(u->domainname, tmp, len); |
1347 | memset(u->domainname + len, 0, sizeof(u->domainname) - len); | 1432 | memset(u->domainname + len, 0, sizeof(u->domainname) - len); |
1348 | errno = 0; | 1433 | errno = 0; |
1434 | uts_proc_notify(UTS_PROC_DOMAINNAME); | ||
1349 | } | 1435 | } |
1350 | uts_proc_notify(UTS_PROC_DOMAINNAME); | ||
1351 | up_write(&uts_sem); | 1436 | up_write(&uts_sem); |
1352 | return errno; | 1437 | return errno; |
1353 | } | 1438 | } |
@@ -1498,15 +1583,14 @@ static int check_prlimit_permission(struct task_struct *task) | |||
1498 | return 0; | 1583 | return 0; |
1499 | 1584 | ||
1500 | tcred = __task_cred(task); | 1585 | tcred = __task_cred(task); |
1501 | if (cred->user->user_ns == tcred->user->user_ns && | 1586 | if (uid_eq(cred->uid, tcred->euid) && |
1502 | (cred->uid == tcred->euid && | 1587 | uid_eq(cred->uid, tcred->suid) && |
1503 | cred->uid == tcred->suid && | 1588 | uid_eq(cred->uid, tcred->uid) && |
1504 | cred->uid == tcred->uid && | 1589 | gid_eq(cred->gid, tcred->egid) && |
1505 | cred->gid == tcred->egid && | 1590 | gid_eq(cred->gid, tcred->sgid) && |
1506 | cred->gid == tcred->sgid && | 1591 | gid_eq(cred->gid, tcred->gid)) |
1507 | cred->gid == tcred->gid)) | ||
1508 | return 0; | 1592 | return 0; |
1509 | if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) | 1593 | if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) |
1510 | return 0; | 1594 | return 0; |
1511 | 1595 | ||
1512 | return -EPERM; | 1596 | return -EPERM; |
@@ -1702,77 +1786,105 @@ SYSCALL_DEFINE1(umask, int, mask) | |||
1702 | } | 1786 | } |
1703 | 1787 | ||
1704 | #ifdef CONFIG_CHECKPOINT_RESTORE | 1788 | #ifdef CONFIG_CHECKPOINT_RESTORE |
1789 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | ||
1790 | { | ||
1791 | struct file *exe_file; | ||
1792 | struct dentry *dentry; | ||
1793 | int err; | ||
1794 | |||
1795 | exe_file = fget(fd); | ||
1796 | if (!exe_file) | ||
1797 | return -EBADF; | ||
1798 | |||
1799 | dentry = exe_file->f_path.dentry; | ||
1800 | |||
1801 | /* | ||
1802 | * Because the original mm->exe_file points to executable file, make | ||
1803 | * sure that this one is executable as well, to avoid breaking an | ||
1804 | * overall picture. | ||
1805 | */ | ||
1806 | err = -EACCES; | ||
1807 | if (!S_ISREG(dentry->d_inode->i_mode) || | ||
1808 | exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC) | ||
1809 | goto exit; | ||
1810 | |||
1811 | err = inode_permission(dentry->d_inode, MAY_EXEC); | ||
1812 | if (err) | ||
1813 | goto exit; | ||
1814 | |||
1815 | down_write(&mm->mmap_sem); | ||
1816 | |||
1817 | /* | ||
1818 | * Forbid mm->exe_file change if old file still mapped. | ||
1819 | */ | ||
1820 | err = -EBUSY; | ||
1821 | if (mm->exe_file) { | ||
1822 | struct vm_area_struct *vma; | ||
1823 | |||
1824 | for (vma = mm->mmap; vma; vma = vma->vm_next) | ||
1825 | if (vma->vm_file && | ||
1826 | path_equal(&vma->vm_file->f_path, | ||
1827 | &mm->exe_file->f_path)) | ||
1828 | goto exit_unlock; | ||
1829 | } | ||
1830 | |||
1831 | /* | ||
1832 | * The symlink can be changed only once, just to disallow arbitrary | ||
1833 | * transitions malicious software might bring in. This means one | ||
1834 | * could take a snapshot of all running processes and monitor | ||
1835 | * /proc/pid/exe changes to notice unusual activity if needed. | ||
1836 | */ | ||
1837 | err = -EPERM; | ||
1838 | if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) | ||
1839 | goto exit_unlock; | ||
1840 | |||
1841 | err = 0; | ||
1842 | set_mm_exe_file(mm, exe_file); | ||
1843 | exit_unlock: | ||
1844 | up_write(&mm->mmap_sem); | ||
1845 | |||
1846 | exit: | ||
1847 | fput(exe_file); | ||
1848 | return err; | ||
1849 | } | ||
1850 | |||
1705 | static int prctl_set_mm(int opt, unsigned long addr, | 1851 | static int prctl_set_mm(int opt, unsigned long addr, |
1706 | unsigned long arg4, unsigned long arg5) | 1852 | unsigned long arg4, unsigned long arg5) |
1707 | { | 1853 | { |
1708 | unsigned long rlim = rlimit(RLIMIT_DATA); | 1854 | unsigned long rlim = rlimit(RLIMIT_DATA); |
1709 | unsigned long vm_req_flags; | ||
1710 | unsigned long vm_bad_flags; | ||
1711 | struct vm_area_struct *vma; | ||
1712 | int error = 0; | ||
1713 | struct mm_struct *mm = current->mm; | 1855 | struct mm_struct *mm = current->mm; |
1856 | struct vm_area_struct *vma; | ||
1857 | int error; | ||
1714 | 1858 | ||
1715 | if (arg4 | arg5) | 1859 | if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) |
1716 | return -EINVAL; | 1860 | return -EINVAL; |
1717 | 1861 | ||
1718 | if (!capable(CAP_SYS_RESOURCE)) | 1862 | if (!capable(CAP_SYS_RESOURCE)) |
1719 | return -EPERM; | 1863 | return -EPERM; |
1720 | 1864 | ||
1721 | if (addr >= TASK_SIZE) | 1865 | if (opt == PR_SET_MM_EXE_FILE) |
1866 | return prctl_set_mm_exe_file(mm, (unsigned int)addr); | ||
1867 | |||
1868 | if (addr >= TASK_SIZE || addr < mmap_min_addr) | ||
1722 | return -EINVAL; | 1869 | return -EINVAL; |
1723 | 1870 | ||
1871 | error = -EINVAL; | ||
1872 | |||
1724 | down_read(&mm->mmap_sem); | 1873 | down_read(&mm->mmap_sem); |
1725 | vma = find_vma(mm, addr); | 1874 | vma = find_vma(mm, addr); |
1726 | 1875 | ||
1727 | if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) { | ||
1728 | /* It must be existing VMA */ | ||
1729 | if (!vma || vma->vm_start > addr) | ||
1730 | goto out; | ||
1731 | } | ||
1732 | |||
1733 | error = -EINVAL; | ||
1734 | switch (opt) { | 1876 | switch (opt) { |
1735 | case PR_SET_MM_START_CODE: | 1877 | case PR_SET_MM_START_CODE: |
1878 | mm->start_code = addr; | ||
1879 | break; | ||
1736 | case PR_SET_MM_END_CODE: | 1880 | case PR_SET_MM_END_CODE: |
1737 | vm_req_flags = VM_READ | VM_EXEC; | 1881 | mm->end_code = addr; |
1738 | vm_bad_flags = VM_WRITE | VM_MAYSHARE; | ||
1739 | |||
1740 | if ((vma->vm_flags & vm_req_flags) != vm_req_flags || | ||
1741 | (vma->vm_flags & vm_bad_flags)) | ||
1742 | goto out; | ||
1743 | |||
1744 | if (opt == PR_SET_MM_START_CODE) | ||
1745 | mm->start_code = addr; | ||
1746 | else | ||
1747 | mm->end_code = addr; | ||
1748 | break; | 1882 | break; |
1749 | |||
1750 | case PR_SET_MM_START_DATA: | 1883 | case PR_SET_MM_START_DATA: |
1751 | case PR_SET_MM_END_DATA: | 1884 | mm->start_data = addr; |
1752 | vm_req_flags = VM_READ | VM_WRITE; | ||
1753 | vm_bad_flags = VM_EXEC | VM_MAYSHARE; | ||
1754 | |||
1755 | if ((vma->vm_flags & vm_req_flags) != vm_req_flags || | ||
1756 | (vma->vm_flags & vm_bad_flags)) | ||
1757 | goto out; | ||
1758 | |||
1759 | if (opt == PR_SET_MM_START_DATA) | ||
1760 | mm->start_data = addr; | ||
1761 | else | ||
1762 | mm->end_data = addr; | ||
1763 | break; | 1885 | break; |
1764 | 1886 | case PR_SET_MM_END_DATA: | |
1765 | case PR_SET_MM_START_STACK: | 1887 | mm->end_data = addr; |
1766 | |||
1767 | #ifdef CONFIG_STACK_GROWSUP | ||
1768 | vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP; | ||
1769 | #else | ||
1770 | vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN; | ||
1771 | #endif | ||
1772 | if ((vma->vm_flags & vm_req_flags) != vm_req_flags) | ||
1773 | goto out; | ||
1774 | |||
1775 | mm->start_stack = addr; | ||
1776 | break; | 1888 | break; |
1777 | 1889 | ||
1778 | case PR_SET_MM_START_BRK: | 1890 | case PR_SET_MM_START_BRK: |
@@ -1799,24 +1911,89 @@ static int prctl_set_mm(int opt, unsigned long addr, | |||
1799 | mm->brk = addr; | 1911 | mm->brk = addr; |
1800 | break; | 1912 | break; |
1801 | 1913 | ||
1914 | /* | ||
1915 | * If command line arguments and environment | ||
1916 | * are placed somewhere else on the stack, we can | ||
1917 | * set them up here: ARG_START/END to set up | ||
1918 | * command line arguments and ENV_START/END | ||
1919 | * for environment. | ||
1920 | */ | ||
1921 | case PR_SET_MM_START_STACK: | ||
1922 | case PR_SET_MM_ARG_START: | ||
1923 | case PR_SET_MM_ARG_END: | ||
1924 | case PR_SET_MM_ENV_START: | ||
1925 | case PR_SET_MM_ENV_END: | ||
1926 | if (!vma) { | ||
1927 | error = -EFAULT; | ||
1928 | goto out; | ||
1929 | } | ||
1930 | if (opt == PR_SET_MM_START_STACK) | ||
1931 | mm->start_stack = addr; | ||
1932 | else if (opt == PR_SET_MM_ARG_START) | ||
1933 | mm->arg_start = addr; | ||
1934 | else if (opt == PR_SET_MM_ARG_END) | ||
1935 | mm->arg_end = addr; | ||
1936 | else if (opt == PR_SET_MM_ENV_START) | ||
1937 | mm->env_start = addr; | ||
1938 | else if (opt == PR_SET_MM_ENV_END) | ||
1939 | mm->env_end = addr; | ||
1940 | break; | ||
1941 | |||
1942 | /* | ||
1943 | * This doesn't move the auxiliary vector itself | ||
1944 | * since it's pinned to mm_struct, but it allows | ||
1945 | * the vector to be filled with new values. It's | ||
1946 | * up to the caller to provide sane values here; | ||
1947 | * otherwise user space tools which use this | ||
1948 | * vector might be unhappy. | ||
1949 | */ | ||
1950 | case PR_SET_MM_AUXV: { | ||
1951 | unsigned long user_auxv[AT_VECTOR_SIZE]; | ||
1952 | |||
1953 | if (arg4 > sizeof(user_auxv)) | ||
1954 | goto out; | ||
1955 | up_read(&mm->mmap_sem); | ||
1956 | |||
1957 | if (copy_from_user(user_auxv, (const void __user *)addr, arg4)) | ||
1958 | return -EFAULT; | ||
1959 | |||
1960 | /* Make sure the last entry is always AT_NULL */ | ||
1961 | user_auxv[AT_VECTOR_SIZE - 2] = 0; | ||
1962 | user_auxv[AT_VECTOR_SIZE - 1] = 0; | ||
1963 | |||
1964 | BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); | ||
1965 | |||
1966 | task_lock(current); | ||
1967 | memcpy(mm->saved_auxv, user_auxv, arg4); | ||
1968 | task_unlock(current); | ||
1969 | |||
1970 | return 0; | ||
1971 | } | ||
1802 | default: | 1972 | default: |
1803 | error = -EINVAL; | ||
1804 | goto out; | 1973 | goto out; |
1805 | } | 1974 | } |
1806 | 1975 | ||
1807 | error = 0; | 1976 | error = 0; |
1808 | |||
1809 | out: | 1977 | out: |
1810 | up_read(&mm->mmap_sem); | 1978 | up_read(&mm->mmap_sem); |
1811 | |||
1812 | return error; | 1979 | return error; |
1813 | } | 1980 | } |
1981 | |||
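The remaining PR_SET_MM options above simply overwrite fields in mm_struct once the address passes the mmap_min_addr/TASK_SIZE window check and, for the stack/arg/env cases, still has a mapping above it. A hedged user-space sketch of how a restore tool might use them, assuming the PR_SET_MM_ARG_START...PR_SET_MM_AUXV constants from this series are exported in <linux/prctl.h>:

#include <sys/prctl.h>
#include <linux/prctl.h>

/* Illustrative sketch: record a relocated argv/envp area and refresh
 * the saved auxiliary vector.  Requires CAP_SYS_RESOURCE. */
static int fixup_mm_layout(unsigned long arg_start, unsigned long arg_end,
                           unsigned long env_start, unsigned long env_end,
                           unsigned long *auxv, unsigned long auxv_bytes)
{
        if (prctl(PR_SET_MM, PR_SET_MM_ARG_START, arg_start, 0, 0) ||
            prctl(PR_SET_MM, PR_SET_MM_ARG_END, arg_end, 0, 0) ||
            prctl(PR_SET_MM, PR_SET_MM_ENV_START, env_start, 0, 0) ||
            prctl(PR_SET_MM, PR_SET_MM_ENV_END, env_end, 0, 0))
                return -1;

        /* PR_SET_MM_AUXV is the only option that uses arg4 (a byte count,
         * at most sizeof(mm->saved_auxv)); the kernel copies the vector
         * and forces the trailing AT_NULL itself. */
        return prctl(PR_SET_MM, PR_SET_MM_AUXV, (unsigned long)auxv,
                     auxv_bytes, 0);
}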
1982 | static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) | ||
1983 | { | ||
1984 | return put_user(me->clear_child_tid, tid_addr); | ||
1985 | } | ||
1986 | |||
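prctl_get_tid_address() above reports the clear_child_tid pointer that was registered via set_tid_address(2) or CLONE_CHILD_CLEARTID, so a restore tool can save and re-create it. A hedged fragment, assuming PR_GET_TID_ADDRESS is exported by <linux/prctl.h> in this series and the kernel was built with CONFIG_CHECKPOINT_RESTORE:

#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

int main(void)
{
        int *tid_addr = NULL;

        if (prctl(PR_GET_TID_ADDRESS, (unsigned long)&tid_addr, 0, 0, 0))
                perror("PR_GET_TID_ADDRESS");
        else
                printf("clear_child_tid pointer: %p\n", (void *)tid_addr);
        return 0;
}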
1814 | #else /* CONFIG_CHECKPOINT_RESTORE */ | 1987 | #else /* CONFIG_CHECKPOINT_RESTORE */ |
1815 | static int prctl_set_mm(int opt, unsigned long addr, | 1988 | static int prctl_set_mm(int opt, unsigned long addr, |
1816 | unsigned long arg4, unsigned long arg5) | 1989 | unsigned long arg4, unsigned long arg5) |
1817 | { | 1990 | { |
1818 | return -EINVAL; | 1991 | return -EINVAL; |
1819 | } | 1992 | } |
1993 | static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) | ||
1994 | { | ||
1995 | return -EINVAL; | ||
1996 | } | ||
1820 | #endif | 1997 | #endif |
1821 | 1998 | ||
1822 | SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | 1999 | SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, |
@@ -1908,7 +2085,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1908 | error = prctl_get_seccomp(); | 2085 | error = prctl_get_seccomp(); |
1909 | break; | 2086 | break; |
1910 | case PR_SET_SECCOMP: | 2087 | case PR_SET_SECCOMP: |
1911 | error = prctl_set_seccomp(arg2); | 2088 | error = prctl_set_seccomp(arg2, (char __user *)arg3); |
1912 | break; | 2089 | break; |
1913 | case PR_GET_TSC: | 2090 | case PR_GET_TSC: |
1914 | error = GET_TSC_CTL(arg2); | 2091 | error = GET_TSC_CTL(arg2); |
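The widened prctl_set_seccomp() call above is what routes a BPF program pointer through arg3 for SECCOMP_MODE_FILTER. A hedged sketch of installing a trivial allow-everything filter, assuming <linux/seccomp.h> from this series exports struct seccomp_data, SECCOMP_MODE_FILTER and SECCOMP_RET_ALLOW:

#include <stddef.h>             /* offsetof */
#include <sys/prctl.h>
#include <linux/filter.h>       /* struct sock_filter, struct sock_fprog, BPF_* */
#include <linux/seccomp.h>

static int install_allow_all_filter(void)
{
        struct sock_filter insns[] = {
                /* load the syscall number (unused here, shown for shape) */
                BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
                         offsetof(struct seccomp_data, nr)),
                BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
        };
        struct sock_fprog prog = {
                .len    = sizeof(insns) / sizeof(insns[0]),
                .filter = insns,
        };

        /* Without CAP_SYS_ADMIN this needs no_new_privs set first,
         * see the PR_SET_NO_NEW_PRIVS example further down. */
        return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
}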
@@ -1971,6 +2148,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1971 | case PR_SET_MM: | 2148 | case PR_SET_MM: |
1972 | error = prctl_set_mm(arg2, arg3, arg4, arg5); | 2149 | error = prctl_set_mm(arg2, arg3, arg4, arg5); |
1973 | break; | 2150 | break; |
2151 | case PR_GET_TID_ADDRESS: | ||
2152 | error = prctl_get_tid_address(me, (int __user **)arg2); | ||
2153 | break; | ||
1974 | case PR_SET_CHILD_SUBREAPER: | 2154 | case PR_SET_CHILD_SUBREAPER: |
1975 | me->signal->is_child_subreaper = !!arg2; | 2155 | me->signal->is_child_subreaper = !!arg2; |
1976 | error = 0; | 2156 | error = 0; |
@@ -1979,6 +2159,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1979 | error = put_user(me->signal->is_child_subreaper, | 2159 | error = put_user(me->signal->is_child_subreaper, |
1980 | (int __user *) arg2); | 2160 | (int __user *) arg2); |
1981 | break; | 2161 | break; |
2162 | case PR_SET_NO_NEW_PRIVS: | ||
2163 | if (arg2 != 1 || arg3 || arg4 || arg5) | ||
2164 | return -EINVAL; | ||
2165 | |||
2166 | current->no_new_privs = 1; | ||
2167 | break; | ||
2168 | case PR_GET_NO_NEW_PRIVS: | ||
2169 | if (arg2 || arg3 || arg4 || arg5) | ||
2170 | return -EINVAL; | ||
2171 | return current->no_new_privs ? 1 : 0; | ||
1982 | default: | 2172 | default: |
1983 | error = -EINVAL; | 2173 | error = -EINVAL; |
1984 | break; | 2174 | break; |
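The PR_SET_NO_NEW_PRIVS / PR_GET_NO_NEW_PRIVS cases above are intentionally one-way: once set, the bit is inherited across fork and execve and can never be cleared, which is what makes the unprivileged seccomp-filter install in the earlier example safe. A hedged user-space sketch; the constants are assumed to be 38/39 as in this series and are defined locally in case the installed headers predate them:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_NO_NEW_PRIVS
#define PR_SET_NO_NEW_PRIVS 38
#define PR_GET_NO_NEW_PRIVS 39
#endif

int main(void)
{
        /* arg2 must be 1 and the remaining arguments 0, as checked above */
        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
                perror("PR_SET_NO_NEW_PRIVS");

        /* the getter returns 0 or 1 directly instead of filling a buffer */
        printf("no_new_privs = %d\n", prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
        return 0;
}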
@@ -2022,7 +2212,6 @@ int orderly_poweroff(bool force) | |||
2022 | NULL | 2212 | NULL |
2023 | }; | 2213 | }; |
2024 | int ret = -ENOMEM; | 2214 | int ret = -ENOMEM; |
2025 | struct subprocess_info *info; | ||
2026 | 2215 | ||
2027 | if (argv == NULL) { | 2216 | if (argv == NULL) { |
2028 | printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", | 2217 | printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", |
@@ -2030,18 +2219,16 @@ int orderly_poweroff(bool force) | |||
2030 | goto out; | 2219 | goto out; |
2031 | } | 2220 | } |
2032 | 2221 | ||
2033 | info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC); | 2222 | ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, |
2034 | if (info == NULL) { | 2223 | NULL, argv_cleanup, NULL); |
2035 | argv_free(argv); | 2224 | out: |
2036 | goto out; | 2225 | if (likely(!ret)) |
2037 | } | 2226 | return 0; |
2038 | |||
2039 | call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL); | ||
2040 | 2227 | ||
2041 | ret = call_usermodehelper_exec(info, UMH_NO_WAIT); | 2228 | if (ret == -ENOMEM) |
2229 | argv_free(argv); | ||
2042 | 2230 | ||
2043 | out: | 2231 | if (force) { |
2044 | if (ret && force) { | ||
2045 | printk(KERN_WARNING "Failed to start orderly shutdown: " | 2232 | printk(KERN_WARNING "Failed to start orderly shutdown: " |
2046 | "forcing the issue\n"); | 2233 | "forcing the issue\n"); |
2047 | 2234 | ||
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 47bfa16430d7..dbff751e4086 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -203,3 +203,6 @@ cond_syscall(sys_fanotify_mark); | |||
203 | cond_syscall(sys_name_to_handle_at); | 203 | cond_syscall(sys_name_to_handle_at); |
204 | cond_syscall(sys_open_by_handle_at); | 204 | cond_syscall(sys_open_by_handle_at); |
205 | cond_syscall(compat_sys_open_by_handle_at); | 205 | cond_syscall(compat_sys_open_by_handle_at); |
206 | |||
207 | /* compare kernel pointers */ | ||
208 | cond_syscall(sys_kcmp); | ||
diff --git a/kernel/task_work.c b/kernel/task_work.c new file mode 100644 index 000000000000..82d1c794066d --- /dev/null +++ b/kernel/task_work.c | |||
@@ -0,0 +1,84 @@ | |||
1 | #include <linux/spinlock.h> | ||
2 | #include <linux/task_work.h> | ||
3 | #include <linux/tracehook.h> | ||
4 | |||
5 | int | ||
6 | task_work_add(struct task_struct *task, struct task_work *twork, bool notify) | ||
7 | { | ||
8 | unsigned long flags; | ||
9 | int err = -ESRCH; | ||
10 | |||
11 | #ifndef TIF_NOTIFY_RESUME | ||
12 | if (notify) | ||
13 | return -ENOTSUPP; | ||
14 | #endif | ||
15 | /* | ||
16 | * We must not insert the new work if the task has already passed | ||
17 | * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait() | ||
18 | * and check PF_EXITING under pi_lock. | ||
19 | */ | ||
20 | raw_spin_lock_irqsave(&task->pi_lock, flags); | ||
21 | if (likely(!(task->flags & PF_EXITING))) { | ||
22 | hlist_add_head(&twork->hlist, &task->task_works); | ||
23 | err = 0; | ||
24 | } | ||
25 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | ||
26 | |||
27 | /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ | ||
28 | if (likely(!err) && notify) | ||
29 | set_notify_resume(task); | ||
30 | return err; | ||
31 | } | ||
32 | |||
33 | struct task_work * | ||
34 | task_work_cancel(struct task_struct *task, task_work_func_t func) | ||
35 | { | ||
36 | unsigned long flags; | ||
37 | struct task_work *twork; | ||
38 | struct hlist_node *pos; | ||
39 | |||
40 | raw_spin_lock_irqsave(&task->pi_lock, flags); | ||
41 | hlist_for_each_entry(twork, pos, &task->task_works, hlist) { | ||
42 | if (twork->func == func) { | ||
43 | hlist_del(&twork->hlist); | ||
44 | goto found; | ||
45 | } | ||
46 | } | ||
47 | twork = NULL; | ||
48 | found: | ||
49 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | ||
50 | |||
51 | return twork; | ||
52 | } | ||
53 | |||
54 | void task_work_run(void) | ||
55 | { | ||
56 | struct task_struct *task = current; | ||
57 | struct hlist_head task_works; | ||
58 | struct hlist_node *pos; | ||
59 | |||
60 | raw_spin_lock_irq(&task->pi_lock); | ||
61 | hlist_move_list(&task->task_works, &task_works); | ||
62 | raw_spin_unlock_irq(&task->pi_lock); | ||
63 | |||
64 | if (unlikely(hlist_empty(&task_works))) | ||
65 | return; | ||
66 | /* | ||
67 | * We use an hlist to save space in task_struct, but we want FIFO. | ||
68 | * Find the last entry (the list should be short), then process them | ||
69 | * in reverse order. | ||
70 | */ | ||
71 | for (pos = task_works.first; pos->next; pos = pos->next) | ||
72 | ; | ||
73 | |||
74 | for (;;) { | ||
75 | struct hlist_node **pprev = pos->pprev; | ||
76 | struct task_work *twork = container_of(pos, struct task_work, | ||
77 | hlist); | ||
78 | twork->func(twork); | ||
79 | |||
80 | if (pprev == &task_works.first) | ||
81 | break; | ||
82 | pos = container_of(pprev, struct hlist_node, next); | ||
83 | } | ||
84 | } | ||
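The new kernel/task_work.c above lets any subsystem queue a callback that runs in the context of a chosen task, either when that task next returns to user space (notify == true sets TIF_NOTIFY_RESUME) or at the latest from exit_task_work(). A hedged in-kernel sketch of a caller, assuming the accompanying <linux/task_work.h> declares struct task_work with the hlist/func members used by the code above:

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/task_work.h>

static struct task_work my_work;        /* hypothetical, one pending use at a time */

/* Runs in the target task's own context, never from interrupt context. */
static void my_work_func(struct task_work *twork)
{
        pr_info("deferred work ran in %s (pid %d)\n",
                current->comm, task_pid_nr(current));
}

static int queue_work_on_task(struct task_struct *task)
{
        my_work.func = my_work_func;

        /* Returns -ESRCH if @task has already passed exit_task_work();
         * task_work_cancel(task, my_work_func) undoes a pending add. */
        return task_work_add(task, &my_work, true);
}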
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index a20dc8a3c949..fd42bd452b75 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -2,6 +2,55 @@ | |||
2 | # Timer subsystem related configuration options | 2 | # Timer subsystem related configuration options |
3 | # | 3 | # |
4 | 4 | ||
5 | # Options selectable by arch Kconfig | ||
6 | |||
7 | # Watchdog function for clocksources to detect instabilities | ||
8 | config CLOCKSOURCE_WATCHDOG | ||
9 | bool | ||
10 | |||
11 | # Architecture has extra clocksource data | ||
12 | config ARCH_CLOCKSOURCE_DATA | ||
13 | bool | ||
14 | |||
15 | # Timekeeping vsyscall support | ||
16 | config GENERIC_TIME_VSYSCALL | ||
17 | bool | ||
18 | |||
19 | # ktime_t scalar 64bit nsec representation | ||
20 | config KTIME_SCALAR | ||
21 | bool | ||
22 | |||
23 | # Old style timekeeping | ||
24 | config ARCH_USES_GETTIMEOFFSET | ||
25 | bool | ||
26 | |||
27 | # The generic clock events infrastructure | ||
28 | config GENERIC_CLOCKEVENTS | ||
29 | bool | ||
30 | |||
31 | # Migration helper. Builds, but does not invoke | ||
32 | config GENERIC_CLOCKEVENTS_BUILD | ||
33 | bool | ||
34 | default y | ||
35 | depends on GENERIC_CLOCKEVENTS | ||
36 | |||
37 | # Clockevents broadcasting infrastructure | ||
38 | config GENERIC_CLOCKEVENTS_BROADCAST | ||
39 | bool | ||
40 | depends on GENERIC_CLOCKEVENTS | ||
41 | |||
42 | # Automatically adjust the min. reprogramming time for | ||
43 | # clock event device | ||
44 | config GENERIC_CLOCKEVENTS_MIN_ADJUST | ||
45 | bool | ||
46 | |||
47 | # Generic update of CMOS clock | ||
48 | config GENERIC_CMOS_UPDATE | ||
49 | bool | ||
50 | |||
51 | if GENERIC_CLOCKEVENTS | ||
52 | menu "Timers subsystem" | ||
53 | |||
5 | # Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is | 54 | # Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is |
6 | # only related to the tick functionality. Oneshot clockevent devices | 55 | # only related to the tick functionality. Oneshot clockevent devices |
7 | # are supported independent of this. | 56 | # are supported independent of this. |
@@ -26,10 +75,5 @@ config HIGH_RES_TIMERS | |||
26 | hardware is not capable then this option only increases | 75 | hardware is not capable then this option only increases |
27 | the size of the kernel image. | 76 | the size of the kernel image. |
28 | 77 | ||
29 | config GENERIC_CLOCKEVENTS_BUILD | 78 | endmenu |
30 | bool | 79 | endif |
31 | default y | ||
32 | depends on GENERIC_CLOCKEVENTS | ||
33 | |||
34 | config GENERIC_CLOCKEVENTS_MIN_ADJUST | ||
35 | bool | ||
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8a538c55fc7b..aa27d391bfc8 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(rtcdev_lock); | |||
59 | * If one has not already been chosen, it checks to see if a | 59 | * If one has not already been chosen, it checks to see if a |
60 | * functional rtc device is available. | 60 | * functional rtc device is available. |
61 | */ | 61 | */ |
62 | static struct rtc_device *alarmtimer_get_rtcdev(void) | 62 | struct rtc_device *alarmtimer_get_rtcdev(void) |
63 | { | 63 | { |
64 | unsigned long flags; | 64 | unsigned long flags; |
65 | struct rtc_device *ret; | 65 | struct rtc_device *ret; |
@@ -115,7 +115,7 @@ static void alarmtimer_rtc_interface_remove(void) | |||
115 | class_interface_unregister(&alarmtimer_rtc_interface); | 115 | class_interface_unregister(&alarmtimer_rtc_interface); |
116 | } | 116 | } |
117 | #else | 117 | #else |
118 | static inline struct rtc_device *alarmtimer_get_rtcdev(void) | 118 | struct rtc_device *alarmtimer_get_rtcdev(void) |
119 | { | 119 | { |
120 | return NULL; | 120 | return NULL; |
121 | } | 121 | } |
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9cd928f7a7c6..7e1ce012a851 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -297,8 +297,7 @@ void clockevents_register_device(struct clock_event_device *dev) | |||
297 | } | 297 | } |
298 | EXPORT_SYMBOL_GPL(clockevents_register_device); | 298 | EXPORT_SYMBOL_GPL(clockevents_register_device); |
299 | 299 | ||
300 | static void clockevents_config(struct clock_event_device *dev, | 300 | void clockevents_config(struct clock_event_device *dev, u32 freq) |
301 | u32 freq) | ||
302 | { | 301 | { |
303 | u64 sec; | 302 | u64 sec; |
304 | 303 | ||
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index f03fd83b170b..b7fbadc5c973 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -409,15 +409,20 @@ int second_overflow(unsigned long secs) | |||
409 | time_state = TIME_DEL; | 409 | time_state = TIME_DEL; |
410 | break; | 410 | break; |
411 | case TIME_INS: | 411 | case TIME_INS: |
412 | if (secs % 86400 == 0) { | 412 | if (!(time_status & STA_INS)) |
413 | time_state = TIME_OK; | ||
414 | else if (secs % 86400 == 0) { | ||
413 | leap = -1; | 415 | leap = -1; |
414 | time_state = TIME_OOP; | 416 | time_state = TIME_OOP; |
417 | time_tai++; | ||
415 | printk(KERN_NOTICE | 418 | printk(KERN_NOTICE |
416 | "Clock: inserting leap second 23:59:60 UTC\n"); | 419 | "Clock: inserting leap second 23:59:60 UTC\n"); |
417 | } | 420 | } |
418 | break; | 421 | break; |
419 | case TIME_DEL: | 422 | case TIME_DEL: |
420 | if ((secs + 1) % 86400 == 0) { | 423 | if (!(time_status & STA_DEL)) |
424 | time_state = TIME_OK; | ||
425 | else if ((secs + 1) % 86400 == 0) { | ||
421 | leap = 1; | 426 | leap = 1; |
422 | time_tai--; | 427 | time_tai--; |
423 | time_state = TIME_WAIT; | 428 | time_state = TIME_WAIT; |
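With the STA_INS/STA_DEL checks added above, a leap second stays armed only while user space keeps the corresponding status bit set; dropping the bit before the day boundary returns the state machine to TIME_OK instead of leaving a stale insertion pending. A hedged sketch of how an NTP daemon arms an insertion through adjtimex(2) (requires CAP_SYS_TIME):

#include <stdio.h>
#include <sys/timex.h>

static int arm_leap_second_insert(void)
{
        struct timex tx = {
                .modes  = ADJ_STATUS,   /* only the status bits are updated */
                .status = STA_INS,      /* insert 23:59:60 at the next UTC midnight */
        };
        int state = adjtimex(&tx);

        if (state < 0) {
                perror("adjtimex");
                return -1;
        }
        /* the return value reports the NTP clock state (TIME_OK, TIME_INS, ...) */
        printf("clock state: %d\n", state);
        return 0;
}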
@@ -426,7 +431,6 @@ int second_overflow(unsigned long secs) | |||
426 | } | 431 | } |
427 | break; | 432 | break; |
428 | case TIME_OOP: | 433 | case TIME_OOP: |
429 | time_tai++; | ||
430 | time_state = TIME_WAIT; | 434 | time_state = TIME_WAIT; |
431 | break; | 435 | break; |
432 | 436 | ||
@@ -473,8 +477,6 @@ int second_overflow(unsigned long secs) | |||
473 | << NTP_SCALE_SHIFT; | 477 | << NTP_SCALE_SHIFT; |
474 | time_adjust = 0; | 478 | time_adjust = 0; |
475 | 479 | ||
476 | |||
477 | |||
478 | out: | 480 | out: |
479 | spin_unlock_irqrestore(&ntp_lock, flags); | 481 | spin_unlock_irqrestore(&ntp_lock, flags); |
480 | 482 | ||
@@ -559,10 +561,10 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) | |||
559 | /* only set allowed bits */ | 561 | /* only set allowed bits */ |
560 | time_status &= STA_RONLY; | 562 | time_status &= STA_RONLY; |
561 | time_status |= txc->status & ~STA_RONLY; | 563 | time_status |= txc->status & ~STA_RONLY; |
562 | |||
563 | } | 564 | } |
565 | |||
564 | /* | 566 | /* |
565 | * Called with the xtime lock held, so we can access and modify | 567 | * Called with ntp_lock held, so we can access and modify |
566 | * all the global NTP state: | 568 | * all the global NTP state: |
567 | */ | 569 | */ |
568 | static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) | 570 | static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff561..4a08472c3ca7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -274,6 +274,7 @@ EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); | |||
274 | static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | 274 | static void tick_nohz_stop_sched_tick(struct tick_sched *ts) |
275 | { | 275 | { |
276 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; | 276 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; |
277 | unsigned long rcu_delta_jiffies; | ||
277 | ktime_t last_update, expires, now; | 278 | ktime_t last_update, expires, now; |
278 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 279 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
279 | u64 time_delta; | 280 | u64 time_delta; |
@@ -322,7 +323,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | |||
322 | time_delta = timekeeping_max_deferment(); | 323 | time_delta = timekeeping_max_deferment(); |
323 | } while (read_seqretry(&xtime_lock, seq)); | 324 | } while (read_seqretry(&xtime_lock, seq)); |
324 | 325 | ||
325 | if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || | 326 | if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || |
326 | arch_needs_cpu(cpu)) { | 327 | arch_needs_cpu(cpu)) { |
327 | next_jiffies = last_jiffies + 1; | 328 | next_jiffies = last_jiffies + 1; |
328 | delta_jiffies = 1; | 329 | delta_jiffies = 1; |
@@ -330,6 +331,10 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | |||
330 | /* Get the next timer wheel timer */ | 331 | /* Get the next timer wheel timer */ |
331 | next_jiffies = get_next_timer_interrupt(last_jiffies); | 332 | next_jiffies = get_next_timer_interrupt(last_jiffies); |
332 | delta_jiffies = next_jiffies - last_jiffies; | 333 | delta_jiffies = next_jiffies - last_jiffies; |
334 | if (rcu_delta_jiffies < delta_jiffies) { | ||
335 | next_jiffies = last_jiffies + rcu_delta_jiffies; | ||
336 | delta_jiffies = rcu_delta_jiffies; | ||
337 | } | ||
333 | } | 338 | } |
334 | /* | 339 | /* |
335 | * Do not stop the tick, if we are only one off | 340 | * Do not stop the tick, if we are only one off |
@@ -401,6 +406,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) | |||
401 | */ | 406 | */ |
402 | if (!ts->tick_stopped) { | 407 | if (!ts->tick_stopped) { |
403 | select_nohz_load_balancer(1); | 408 | select_nohz_load_balancer(1); |
409 | calc_load_enter_idle(); | ||
404 | 410 | ||
405 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); | 411 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); |
406 | ts->tick_stopped = 1; | 412 | ts->tick_stopped = 1; |
@@ -576,6 +582,7 @@ void tick_nohz_idle_exit(void) | |||
576 | /* Update jiffies first */ | 582 | /* Update jiffies first */ |
577 | select_nohz_load_balancer(0); | 583 | select_nohz_load_balancer(0); |
578 | tick_do_update_jiffies64(now); | 584 | tick_do_update_jiffies64(now); |
585 | update_cpu_load_nohz(); | ||
579 | 586 | ||
580 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 587 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
581 | /* | 588 | /* |
@@ -591,6 +598,7 @@ void tick_nohz_idle_exit(void) | |||
591 | account_idle_ticks(ticks); | 598 | account_idle_ticks(ticks); |
592 | #endif | 599 | #endif |
593 | 600 | ||
601 | calc_load_exit_idle(); | ||
594 | touch_softlockup_watchdog(); | 602 | touch_softlockup_watchdog(); |
595 | /* | 603 | /* |
596 | * Cancel the scheduled timer and restore the tick | 604 | * Cancel the scheduled timer and restore the tick |
@@ -814,6 +822,16 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
814 | return HRTIMER_RESTART; | 822 | return HRTIMER_RESTART; |
815 | } | 823 | } |
816 | 824 | ||
825 | static int sched_skew_tick; | ||
826 | |||
827 | static int __init skew_tick(char *str) | ||
828 | { | ||
829 | get_option(&str, &sched_skew_tick); | ||
830 | |||
831 | return 0; | ||
832 | } | ||
833 | early_param("skew_tick", skew_tick); | ||
834 | |||
817 | /** | 835 | /** |
818 | * tick_setup_sched_timer - setup the tick emulation timer | 836 | * tick_setup_sched_timer - setup the tick emulation timer |
819 | */ | 837 | */ |
@@ -831,6 +849,14 @@ void tick_setup_sched_timer(void) | |||
831 | /* Get the next period (per cpu) */ | 849 | /* Get the next period (per cpu) */ |
832 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); | 850 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); |
833 | 851 | ||
852 | /* Offset the tick to avert xtime_lock contention. */ | ||
853 | if (sched_skew_tick) { | ||
854 | u64 offset = ktime_to_ns(tick_period) >> 1; | ||
855 | do_div(offset, num_possible_cpus()); | ||
856 | offset *= smp_processor_id(); | ||
857 | hrtimer_add_expires_ns(&ts->sched_timer, offset); | ||
858 | } | ||
859 | |||
834 | for (;;) { | 860 | for (;;) { |
835 | hrtimer_forward(&ts->sched_timer, now, tick_period); | 861 | hrtimer_forward(&ts->sched_timer, now, tick_period); |
836 | hrtimer_start_expires(&ts->sched_timer, | 862 | hrtimer_start_expires(&ts->sched_timer, |
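For a sense of scale of the skew_tick= offset computed in tick_setup_sched_timer() above: each CPU's tick is shifted by (tick_period / 2 / num_possible_cpus()) * cpu. Worked through for HZ=1000 (tick_period = 1,000,000 ns) on a hypothetical 4-CPU box:

        offset(cpu) = (1000000 ns / 2) / 4 * cpu = 125000 ns * cpu
        cpu 0 -> +0 us,  cpu 1 -> +125 us,  cpu 2 -> +250 us,  cpu 3 -> +375 us

so the per-CPU ticks are spread across the first half of each 1 ms period instead of every CPU hitting xtime_lock at the same instant. The skew is off by default and enabled by booting with skew_tick=1.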
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d66b21308f7c..3447cfaf11e7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -70,6 +70,12 @@ struct timekeeper { | |||
70 | /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ | 70 | /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ |
71 | struct timespec raw_time; | 71 | struct timespec raw_time; |
72 | 72 | ||
73 | /* Offset clock monotonic -> clock realtime */ | ||
74 | ktime_t offs_real; | ||
75 | |||
76 | /* Offset clock monotonic -> clock boottime */ | ||
77 | ktime_t offs_boot; | ||
78 | |||
73 | /* Seqlock for all timekeeper values */ | 79 | /* Seqlock for all timekeeper values */ |
74 | seqlock_t lock; | 80 | seqlock_t lock; |
75 | }; | 81 | }; |
@@ -172,6 +178,14 @@ static inline s64 timekeeping_get_ns_raw(void) | |||
172 | return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); | 178 | return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); |
173 | } | 179 | } |
174 | 180 | ||
181 | static void update_rt_offset(void) | ||
182 | { | ||
183 | struct timespec tmp, *wtm = &timekeeper.wall_to_monotonic; | ||
184 | |||
185 | set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec); | ||
186 | timekeeper.offs_real = timespec_to_ktime(tmp); | ||
187 | } | ||
188 | |||
175 | /* must hold write on timekeeper.lock */ | 189 | /* must hold write on timekeeper.lock */ |
176 | static void timekeeping_update(bool clearntp) | 190 | static void timekeeping_update(bool clearntp) |
177 | { | 191 | { |
@@ -179,6 +193,7 @@ static void timekeeping_update(bool clearntp) | |||
179 | timekeeper.ntp_error = 0; | 193 | timekeeper.ntp_error = 0; |
180 | ntp_clear(); | 194 | ntp_clear(); |
181 | } | 195 | } |
196 | update_rt_offset(); | ||
182 | update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, | 197 | update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, |
183 | timekeeper.clock, timekeeper.mult); | 198 | timekeeper.clock, timekeeper.mult); |
184 | } | 199 | } |
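The two offsets cached in struct timekeeper above encode the usual clock relationships, refreshed by update_rt_offset() here and by update_sleep_time() further down so both can be read in a single seqlock pass:

        offs_real = -wall_to_monotonic   =>  CLOCK_REALTIME = CLOCK_MONOTONIC + offs_real
        offs_boot = total_sleep_time     =>  CLOCK_BOOTTIME = CLOCK_MONOTONIC + offs_boot

ktime_get_update_offsets(), added near the end of this file, hands the current monotonic time plus both offsets to the hrtimer code, which is what lets hrtimer_interrupt() and retrigger_next_event() update their realtime/boottime clock bases after a leap second or settimeofday() without another trip through the timekeeper lock.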
@@ -240,7 +255,6 @@ void getnstimeofday(struct timespec *ts) | |||
240 | 255 | ||
241 | timespec_add_ns(ts, nsecs); | 256 | timespec_add_ns(ts, nsecs); |
242 | } | 257 | } |
243 | |||
244 | EXPORT_SYMBOL(getnstimeofday); | 258 | EXPORT_SYMBOL(getnstimeofday); |
245 | 259 | ||
246 | ktime_t ktime_get(void) | 260 | ktime_t ktime_get(void) |
@@ -357,8 +371,8 @@ void do_gettimeofday(struct timeval *tv) | |||
357 | tv->tv_sec = now.tv_sec; | 371 | tv->tv_sec = now.tv_sec; |
358 | tv->tv_usec = now.tv_nsec/1000; | 372 | tv->tv_usec = now.tv_nsec/1000; |
359 | } | 373 | } |
360 | |||
361 | EXPORT_SYMBOL(do_gettimeofday); | 374 | EXPORT_SYMBOL(do_gettimeofday); |
375 | |||
362 | /** | 376 | /** |
363 | * do_settimeofday - Sets the time of day | 377 | * do_settimeofday - Sets the time of day |
364 | * @tv: pointer to the timespec variable containing the new time | 378 | * @tv: pointer to the timespec variable containing the new time |
@@ -392,7 +406,6 @@ int do_settimeofday(const struct timespec *tv) | |||
392 | 406 | ||
393 | return 0; | 407 | return 0; |
394 | } | 408 | } |
395 | |||
396 | EXPORT_SYMBOL(do_settimeofday); | 409 | EXPORT_SYMBOL(do_settimeofday); |
397 | 410 | ||
398 | 411 | ||
@@ -606,6 +619,7 @@ void __init timekeeping_init(void) | |||
606 | } | 619 | } |
607 | set_normalized_timespec(&timekeeper.wall_to_monotonic, | 620 | set_normalized_timespec(&timekeeper.wall_to_monotonic, |
608 | -boot.tv_sec, -boot.tv_nsec); | 621 | -boot.tv_sec, -boot.tv_nsec); |
622 | update_rt_offset(); | ||
609 | timekeeper.total_sleep_time.tv_sec = 0; | 623 | timekeeper.total_sleep_time.tv_sec = 0; |
610 | timekeeper.total_sleep_time.tv_nsec = 0; | 624 | timekeeper.total_sleep_time.tv_nsec = 0; |
611 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 625 | write_sequnlock_irqrestore(&timekeeper.lock, flags); |
@@ -614,6 +628,12 @@ void __init timekeeping_init(void) | |||
614 | /* time in seconds when suspend began */ | 628 | /* time in seconds when suspend began */ |
615 | static struct timespec timekeeping_suspend_time; | 629 | static struct timespec timekeeping_suspend_time; |
616 | 630 | ||
631 | static void update_sleep_time(struct timespec t) | ||
632 | { | ||
633 | timekeeper.total_sleep_time = t; | ||
634 | timekeeper.offs_boot = timespec_to_ktime(t); | ||
635 | } | ||
636 | |||
617 | /** | 637 | /** |
618 | * __timekeeping_inject_sleeptime - Internal function to add sleep interval | 638 | * __timekeeping_inject_sleeptime - Internal function to add sleep interval |
619 | * @delta: pointer to a timespec delta value | 639 | * @delta: pointer to a timespec delta value |
@@ -632,8 +652,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) | |||
632 | timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); | 652 | timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); |
633 | timekeeper.wall_to_monotonic = | 653 | timekeeper.wall_to_monotonic = |
634 | timespec_sub(timekeeper.wall_to_monotonic, *delta); | 654 | timespec_sub(timekeeper.wall_to_monotonic, *delta); |
635 | timekeeper.total_sleep_time = timespec_add( | 655 | update_sleep_time(timespec_add(timekeeper.total_sleep_time, *delta)); |
636 | timekeeper.total_sleep_time, *delta); | ||
637 | } | 656 | } |
638 | 657 | ||
639 | 658 | ||
@@ -698,6 +717,7 @@ static void timekeeping_resume(void) | |||
698 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); | 717 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); |
699 | timekeeper.ntp_error = 0; | 718 | timekeeper.ntp_error = 0; |
700 | timekeeping_suspended = 0; | 719 | timekeeping_suspended = 0; |
720 | timekeeping_update(false); | ||
701 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 721 | write_sequnlock_irqrestore(&timekeeper.lock, flags); |
702 | 722 | ||
703 | touch_softlockup_watchdog(); | 723 | touch_softlockup_watchdog(); |
@@ -964,6 +984,9 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | |||
964 | timekeeper.xtime.tv_sec++; | 984 | timekeeper.xtime.tv_sec++; |
965 | leap = second_overflow(timekeeper.xtime.tv_sec); | 985 | leap = second_overflow(timekeeper.xtime.tv_sec); |
966 | timekeeper.xtime.tv_sec += leap; | 986 | timekeeper.xtime.tv_sec += leap; |
987 | timekeeper.wall_to_monotonic.tv_sec -= leap; | ||
988 | if (leap) | ||
989 | clock_was_set_delayed(); | ||
967 | } | 990 | } |
968 | 991 | ||
969 | /* Accumulate raw time */ | 992 | /* Accumulate raw time */ |
@@ -1079,6 +1102,9 @@ static void update_wall_time(void) | |||
1079 | timekeeper.xtime.tv_sec++; | 1102 | timekeeper.xtime.tv_sec++; |
1080 | leap = second_overflow(timekeeper.xtime.tv_sec); | 1103 | leap = second_overflow(timekeeper.xtime.tv_sec); |
1081 | timekeeper.xtime.tv_sec += leap; | 1104 | timekeeper.xtime.tv_sec += leap; |
1105 | timekeeper.wall_to_monotonic.tv_sec -= leap; | ||
1106 | if (leap) | ||
1107 | clock_was_set_delayed(); | ||
1082 | } | 1108 | } |
1083 | 1109 | ||
1084 | timekeeping_update(false); | 1110 | timekeeping_update(false); |
@@ -1246,6 +1272,40 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | |||
1246 | } while (read_seqretry(&timekeeper.lock, seq)); | 1272 | } while (read_seqretry(&timekeeper.lock, seq)); |
1247 | } | 1273 | } |
1248 | 1274 | ||
1275 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
1276 | /** | ||
1277 | * ktime_get_update_offsets - hrtimer helper | ||
1278 | * @offs_real: pointer to storage for monotonic -> realtime offset | ||
1279 | * @offs_boot: pointer to storage for monotonic -> boottime offset | ||
1280 | * | ||
1281 | * Returns current monotonic time and updates the offsets | ||
1282 | * Called from hrtimer_interrupt() or retrigger_next_event() | ||
1283 | */ | ||
1284 | ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) | ||
1285 | { | ||
1286 | ktime_t now; | ||
1287 | unsigned int seq; | ||
1288 | u64 secs, nsecs; | ||
1289 | |||
1290 | do { | ||
1291 | seq = read_seqbegin(&timekeeper.lock); | ||
1292 | |||
1293 | secs = timekeeper.xtime.tv_sec; | ||
1294 | nsecs = timekeeper.xtime.tv_nsec; | ||
1295 | nsecs += timekeeping_get_ns(); | ||
1296 | /* If arch requires, add in gettimeoffset() */ | ||
1297 | nsecs += arch_gettimeoffset(); | ||
1298 | |||
1299 | *offs_real = timekeeper.offs_real; | ||
1300 | *offs_boot = timekeeper.offs_boot; | ||
1301 | } while (read_seqretry(&timekeeper.lock, seq)); | ||
1302 | |||
1303 | now = ktime_add_ns(ktime_set(secs, 0), nsecs); | ||
1304 | now = ktime_sub(now, *offs_real); | ||
1305 | return now; | ||
1306 | } | ||
1307 | #endif | ||
1308 | |||
1249 | /** | 1309 | /** |
1250 | * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format | 1310 | * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format |
1251 | */ | 1311 | */ |
diff --git a/kernel/timer.c b/kernel/timer.c index a297ffcf888e..6ec7e7e0db43 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -861,7 +861,13 @@ EXPORT_SYMBOL(mod_timer); | |||
861 | * | 861 | * |
862 | * mod_timer_pinned() is a way to update the expire field of an | 862 | * mod_timer_pinned() is a way to update the expire field of an |
863 | * active timer (if the timer is inactive it will be activated) | 863 | * active timer (if the timer is inactive it will be activated) |
864 | * and not allow the timer to be migrated to a different CPU. | 864 | * and to ensure that the timer is scheduled on the current CPU. |
865 | * | ||
866 | * Note that this does not prevent the timer from being migrated | ||
867 | * when the current CPU goes offline. If this is a problem for | ||
868 | * you, use CPU-hotplug notifiers to handle it correctly, for | ||
869 | * example, cancelling the timer when the corresponding CPU goes | ||
870 | * offline. | ||
865 | * | 871 | * |
866 | * mod_timer_pinned(timer, expires) is equivalent to: | 872 | * mod_timer_pinned(timer, expires) is equivalent to: |
867 | * | 873 | * |
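A hedged sketch of the hotplug handling that the new comment above recommends, using the CPU notifier API of this kernel generation; my_timer and my_timer_cpu are hypothetical placeholders for whatever the driver actually pins with mod_timer_pinned():

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/timer.h>

static struct timer_list my_timer;      /* hypothetical pinned timer */
static int my_timer_cpu;                /* CPU it was last pinned on */

static int my_timer_cpu_notify(struct notifier_block *nb,
                               unsigned long action, void *hcpu)
{
        long cpu = (long)hcpu;

        /* The CPU is about to go down: stop the pinned timer explicitly
         * rather than letting the core migrate it elsewhere. */
        if ((action & ~CPU_TASKS_FROZEN) == CPU_DOWN_PREPARE &&
            cpu == my_timer_cpu)
                del_timer_sync(&my_timer);

        return NOTIFY_OK;
}

static struct notifier_block my_timer_cpu_nb = {
        .notifier_call = my_timer_cpu_notify,
};

/* registered once during init: register_cpu_notifier(&my_timer_cpu_nb); */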
@@ -1102,7 +1108,9 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), | |||
1102 | * warnings as well as problems when looking into | 1108 | * warnings as well as problems when looking into |
1103 | * timer->lockdep_map, make a copy and use that here. | 1109 | * timer->lockdep_map, make a copy and use that here. |
1104 | */ | 1110 | */ |
1105 | struct lockdep_map lockdep_map = timer->lockdep_map; | 1111 | struct lockdep_map lockdep_map; |
1112 | |||
1113 | lockdep_copy_map(&lockdep_map, &timer->lockdep_map); | ||
1106 | #endif | 1114 | #endif |
1107 | /* | 1115 | /* |
1108 | * Couple the lock chain with the lock chain at | 1116 | * Couple the lock chain with the lock chain at |
@@ -1427,25 +1435,25 @@ SYSCALL_DEFINE0(getppid) | |||
1427 | SYSCALL_DEFINE0(getuid) | 1435 | SYSCALL_DEFINE0(getuid) |
1428 | { | 1436 | { |
1429 | /* Only we change this so SMP safe */ | 1437 | /* Only we change this so SMP safe */ |
1430 | return current_uid(); | 1438 | return from_kuid_munged(current_user_ns(), current_uid()); |
1431 | } | 1439 | } |
1432 | 1440 | ||
1433 | SYSCALL_DEFINE0(geteuid) | 1441 | SYSCALL_DEFINE0(geteuid) |
1434 | { | 1442 | { |
1435 | /* Only we change this so SMP safe */ | 1443 | /* Only we change this so SMP safe */ |
1436 | return current_euid(); | 1444 | return from_kuid_munged(current_user_ns(), current_euid()); |
1437 | } | 1445 | } |
1438 | 1446 | ||
1439 | SYSCALL_DEFINE0(getgid) | 1447 | SYSCALL_DEFINE0(getgid) |
1440 | { | 1448 | { |
1441 | /* Only we change this so SMP safe */ | 1449 | /* Only we change this so SMP safe */ |
1442 | return current_gid(); | 1450 | return from_kgid_munged(current_user_ns(), current_gid()); |
1443 | } | 1451 | } |
1444 | 1452 | ||
1445 | SYSCALL_DEFINE0(getegid) | 1453 | SYSCALL_DEFINE0(getegid) |
1446 | { | 1454 | { |
1447 | /* Only we change this so SMP safe */ | 1455 | /* Only we change this so SMP safe */ |
1448 | return current_egid(); | 1456 | return from_kgid_munged(current_user_ns(), current_egid()); |
1449 | } | 1457 | } |
1450 | 1458 | ||
1451 | #endif | 1459 | #endif |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a1d2849f2473..8c4c07071cc5 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -141,7 +141,6 @@ if FTRACE | |||
141 | config FUNCTION_TRACER | 141 | config FUNCTION_TRACER |
142 | bool "Kernel Function Tracer" | 142 | bool "Kernel Function Tracer" |
143 | depends on HAVE_FUNCTION_TRACER | 143 | depends on HAVE_FUNCTION_TRACER |
144 | select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE | ||
145 | select KALLSYMS | 144 | select KALLSYMS |
146 | select GENERIC_TRACER | 145 | select GENERIC_TRACER |
147 | select CONTEXT_SWITCH_TRACER | 146 | select CONTEXT_SWITCH_TRACER |
@@ -272,7 +271,7 @@ config PROFILE_ANNOTATED_BRANCHES | |||
272 | bool "Trace likely/unlikely profiler" | 271 | bool "Trace likely/unlikely profiler" |
273 | select TRACE_BRANCH_PROFILING | 272 | select TRACE_BRANCH_PROFILING |
274 | help | 273 | help |
275 | This tracer profiles all the the likely and unlikely macros | 274 | This tracer profiles all likely and unlikely macros |
276 | in the kernel. It will display the results in: | 275 | in the kernel. It will display the results in: |
277 | 276 | ||
278 | /sys/kernel/debug/tracing/trace_stat/branch_annotated | 277 | /sys/kernel/debug/tracing/trace_stat/branch_annotated |
@@ -373,6 +372,7 @@ config KPROBE_EVENT | |||
373 | depends on HAVE_REGS_AND_STACK_ACCESS_API | 372 | depends on HAVE_REGS_AND_STACK_ACCESS_API |
374 | bool "Enable kprobes-based dynamic events" | 373 | bool "Enable kprobes-based dynamic events" |
375 | select TRACING | 374 | select TRACING |
375 | select PROBE_EVENTS | ||
376 | default y | 376 | default y |
377 | help | 377 | help |
378 | This allows the user to add tracing events (similar to tracepoints) | 378 | This allows the user to add tracing events (similar to tracepoints) |
@@ -385,6 +385,25 @@ config KPROBE_EVENT | |||
385 | This option is also required by perf-probe subcommand of perf tools. | 385 | This option is also required by perf-probe subcommand of perf tools. |
386 | If you want to use perf tools, this option is strongly recommended. | 386 | If you want to use perf tools, this option is strongly recommended. |
387 | 387 | ||
388 | config UPROBE_EVENT | ||
389 | bool "Enable uprobes-based dynamic events" | ||
390 | depends on ARCH_SUPPORTS_UPROBES | ||
391 | depends on MMU | ||
392 | select UPROBES | ||
393 | select PROBE_EVENTS | ||
394 | select TRACING | ||
395 | default n | ||
396 | help | ||
397 | This allows the user to add tracing events on top of userspace | ||
398 | dynamic events (similar to tracepoints) on the fly via the trace | ||
399 | events interface. Those events can be inserted wherever uprobes | ||
400 | can probe, and record various registers. | ||
401 | This option is required if you plan to use perf-probe subcommand | ||
402 | of perf tools on user space applications. | ||
403 | |||
404 | config PROBE_EVENTS | ||
405 | def_bool n | ||
406 | |||
388 | config DYNAMIC_FTRACE | 407 | config DYNAMIC_FTRACE |
389 | bool "enable/disable ftrace tracepoints dynamically" | 408 | bool "enable/disable ftrace tracepoints dynamically" |
390 | depends on FUNCTION_TRACER | 409 | depends on FUNCTION_TRACER |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 5f39a07fe5ea..b831087c8200 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -41,7 +41,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o | |||
41 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o | 41 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o |
42 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o | 42 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o |
43 | obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o | 43 | obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o |
44 | obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o | ||
45 | obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o | 44 | obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o |
46 | ifeq ($(CONFIG_BLOCK),y) | 45 | ifeq ($(CONFIG_BLOCK),y) |
47 | obj-$(CONFIG_EVENT_TRACING) += blktrace.o | 46 | obj-$(CONFIG_EVENT_TRACING) += blktrace.o |
@@ -61,5 +60,7 @@ endif | |||
61 | ifeq ($(CONFIG_TRACING),y) | 60 | ifeq ($(CONFIG_TRACING),y) |
62 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o | 61 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o |
63 | endif | 62 | endif |
63 | obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o | ||
64 | obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o | ||
64 | 65 | ||
65 | libftrace-y := ftrace.o | 66 | libftrace-y := ftrace.o |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0fa92f677c92..a008663d86c8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -1383,44 +1383,73 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | |||
1383 | 1383 | ||
1384 | static int ftrace_cmp_recs(const void *a, const void *b) | 1384 | static int ftrace_cmp_recs(const void *a, const void *b) |
1385 | { | 1385 | { |
1386 | const struct dyn_ftrace *reca = a; | 1386 | const struct dyn_ftrace *key = a; |
1387 | const struct dyn_ftrace *recb = b; | 1387 | const struct dyn_ftrace *rec = b; |
1388 | 1388 | ||
1389 | if (reca->ip > recb->ip) | 1389 | if (key->flags < rec->ip) |
1390 | return 1; | ||
1391 | if (reca->ip < recb->ip) | ||
1392 | return -1; | 1390 | return -1; |
1391 | if (key->ip >= rec->ip + MCOUNT_INSN_SIZE) | ||
1392 | return 1; | ||
1393 | return 0; | 1393 | return 0; |
1394 | } | 1394 | } |
1395 | 1395 | ||
1396 | /** | 1396 | static unsigned long ftrace_location_range(unsigned long start, unsigned long end) |
1397 | * ftrace_location - return true if the ip giving is a traced location | ||
1398 | * @ip: the instruction pointer to check | ||
1399 | * | ||
1400 | * Returns 1 if @ip given is a pointer to a ftrace location. | ||
1401 | * That is, the instruction that is either a NOP or call to | ||
1402 | * the function tracer. It checks the ftrace internal tables to | ||
1403 | * determine if the address belongs or not. | ||
1404 | */ | ||
1405 | int ftrace_location(unsigned long ip) | ||
1406 | { | 1397 | { |
1407 | struct ftrace_page *pg; | 1398 | struct ftrace_page *pg; |
1408 | struct dyn_ftrace *rec; | 1399 | struct dyn_ftrace *rec; |
1409 | struct dyn_ftrace key; | 1400 | struct dyn_ftrace key; |
1410 | 1401 | ||
1411 | key.ip = ip; | 1402 | key.ip = start; |
1403 | key.flags = end; /* overload flags, as it is unsigned long */ | ||
1412 | 1404 | ||
1413 | for (pg = ftrace_pages_start; pg; pg = pg->next) { | 1405 | for (pg = ftrace_pages_start; pg; pg = pg->next) { |
1406 | if (end < pg->records[0].ip || | ||
1407 | start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) | ||
1408 | continue; | ||
1414 | rec = bsearch(&key, pg->records, pg->index, | 1409 | rec = bsearch(&key, pg->records, pg->index, |
1415 | sizeof(struct dyn_ftrace), | 1410 | sizeof(struct dyn_ftrace), |
1416 | ftrace_cmp_recs); | 1411 | ftrace_cmp_recs); |
1417 | if (rec) | 1412 | if (rec) |
1418 | return 1; | 1413 | return rec->ip; |
1419 | } | 1414 | } |
1420 | 1415 | ||
1421 | return 0; | 1416 | return 0; |
1422 | } | 1417 | } |
1423 | 1418 | ||
1419 | /** | ||
1420 | * ftrace_location - return true if the ip given is a traced location | ||
1421 | * @ip: the instruction pointer to check | ||
1422 | * | ||
1423 | * Returns rec->ip if @ip given is a pointer to a ftrace location. | ||
1424 | * That is, the instruction that is either a NOP or call to | ||
1425 | * the function tracer. It checks the ftrace internal tables to | ||
1426 | * determine if the address belongs or not. | ||
1427 | */ | ||
1428 | unsigned long ftrace_location(unsigned long ip) | ||
1429 | { | ||
1430 | return ftrace_location_range(ip, ip); | ||
1431 | } | ||
1432 | |||
1433 | /** | ||
1434 | * ftrace_text_reserved - return true if range contains an ftrace location | ||
1435 | * @start: start of range to search | ||
1436 | * @end: end of range to search (inclusive). @end points to the last byte to check. | ||
1437 | * | ||
1438 | * Returns 1 if the range from @start to @end contains an ftrace location. | ||
1439 | * That is, the instruction that is either a NOP or call to | ||
1440 | * the function tracer. It checks the ftrace internal tables to | ||
1441 | * determine if the address belongs or not. | ||
1442 | */ | ||
1443 | int ftrace_text_reserved(void *start, void *end) | ||
1444 | { | ||
1445 | unsigned long ret; | ||
1446 | |||
1447 | ret = ftrace_location_range((unsigned long)start, | ||
1448 | (unsigned long)end); | ||
1449 | |||
1450 | return (int)!!ret; | ||
1451 | } | ||
1452 | |||
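The rewritten ftrace_cmp_recs() above makes bsearch() treat the key as a byte range (the end address is smuggled through key->flags) and each record as covering [rec->ip, rec->ip + MCOUNT_INSN_SIZE). Worked through for a record at rec->ip == 0xc0001000 with MCOUNT_INSN_SIZE == 5 (the x86 call size):

        key 0xc0000fff..0xc0000fff:  key->flags < rec->ip                  -> -1, search lower
        key 0xc0001003..0xc0001003:  falls inside [0xc0001000, 0xc0001005) ->  0, match
        key 0xc0001005..0xc0001005:  key->ip >= rec->ip + 5                -> +1, search higher

which is exactly what the new ftrace_text_reserved() needs: any overlap between the queried range and a patched mcount call site counts as reserved.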
1424 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | 1453 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, |
1425 | int filter_hash, | 1454 | int filter_hash, |
1426 | bool inc) | 1455 | bool inc) |
@@ -1520,35 +1549,6 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, | |||
1520 | __ftrace_hash_rec_update(ops, filter_hash, 1); | 1549 | __ftrace_hash_rec_update(ops, filter_hash, 1); |
1521 | } | 1550 | } |
1522 | 1551 | ||
1523 | static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) | ||
1524 | { | ||
1525 | if (ftrace_pages->index == ftrace_pages->size) { | ||
1526 | /* We should have allocated enough */ | ||
1527 | if (WARN_ON(!ftrace_pages->next)) | ||
1528 | return NULL; | ||
1529 | ftrace_pages = ftrace_pages->next; | ||
1530 | } | ||
1531 | |||
1532 | return &ftrace_pages->records[ftrace_pages->index++]; | ||
1533 | } | ||
1534 | |||
1535 | static struct dyn_ftrace * | ||
1536 | ftrace_record_ip(unsigned long ip) | ||
1537 | { | ||
1538 | struct dyn_ftrace *rec; | ||
1539 | |||
1540 | if (ftrace_disabled) | ||
1541 | return NULL; | ||
1542 | |||
1543 | rec = ftrace_alloc_dyn_node(ip); | ||
1544 | if (!rec) | ||
1545 | return NULL; | ||
1546 | |||
1547 | rec->ip = ip; | ||
1548 | |||
1549 | return rec; | ||
1550 | } | ||
1551 | |||
1552 | static void print_ip_ins(const char *fmt, unsigned char *p) | 1552 | static void print_ip_ins(const char *fmt, unsigned char *p) |
1553 | { | 1553 | { |
1554 | int i; | 1554 | int i; |
@@ -1598,21 +1598,6 @@ void ftrace_bug(int failed, unsigned long ip) | |||
1598 | } | 1598 | } |
1599 | } | 1599 | } |
1600 | 1600 | ||
1601 | |||
1602 | /* Return 1 if the address range is reserved for ftrace */ | ||
1603 | int ftrace_text_reserved(void *start, void *end) | ||
1604 | { | ||
1605 | struct dyn_ftrace *rec; | ||
1606 | struct ftrace_page *pg; | ||
1607 | |||
1608 | do_for_each_ftrace_rec(pg, rec) { | ||
1609 | if (rec->ip <= (unsigned long)end && | ||
1610 | rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start) | ||
1611 | return 1; | ||
1612 | } while_for_each_ftrace_rec(); | ||
1613 | return 0; | ||
1614 | } | ||
1615 | |||
1616 | static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) | 1601 | static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) |
1617 | { | 1602 | { |
1618 | unsigned long flag = 0UL; | 1603 | unsigned long flag = 0UL; |
@@ -1698,7 +1683,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
1698 | return -1; /* unknown ftrace bug */ | 1683 | return -1; /* unknown ftrace bug */ |
1699 | } | 1684 | } |
1700 | 1685 | ||
1701 | static void ftrace_replace_code(int update) | 1686 | void __weak ftrace_replace_code(int enable) |
1702 | { | 1687 | { |
1703 | struct dyn_ftrace *rec; | 1688 | struct dyn_ftrace *rec; |
1704 | struct ftrace_page *pg; | 1689 | struct ftrace_page *pg; |
@@ -1708,7 +1693,7 @@ static void ftrace_replace_code(int update) | |||
1708 | return; | 1693 | return; |
1709 | 1694 | ||
1710 | do_for_each_ftrace_rec(pg, rec) { | 1695 | do_for_each_ftrace_rec(pg, rec) { |
1711 | failed = __ftrace_replace_code(rec, update); | 1696 | failed = __ftrace_replace_code(rec, enable); |
1712 | if (failed) { | 1697 | if (failed) { |
1713 | ftrace_bug(failed, rec->ip); | 1698 | ftrace_bug(failed, rec->ip); |
1714 | /* Stop processing */ | 1699 | /* Stop processing */ |
@@ -1826,22 +1811,27 @@ int __weak ftrace_arch_code_modify_post_process(void) | |||
1826 | return 0; | 1811 | return 0; |
1827 | } | 1812 | } |
1828 | 1813 | ||
1829 | static int __ftrace_modify_code(void *data) | 1814 | void ftrace_modify_all_code(int command) |
1830 | { | 1815 | { |
1831 | int *command = data; | 1816 | if (command & FTRACE_UPDATE_CALLS) |
1832 | |||
1833 | if (*command & FTRACE_UPDATE_CALLS) | ||
1834 | ftrace_replace_code(1); | 1817 | ftrace_replace_code(1); |
1835 | else if (*command & FTRACE_DISABLE_CALLS) | 1818 | else if (command & FTRACE_DISABLE_CALLS) |
1836 | ftrace_replace_code(0); | 1819 | ftrace_replace_code(0); |
1837 | 1820 | ||
1838 | if (*command & FTRACE_UPDATE_TRACE_FUNC) | 1821 | if (command & FTRACE_UPDATE_TRACE_FUNC) |
1839 | ftrace_update_ftrace_func(ftrace_trace_function); | 1822 | ftrace_update_ftrace_func(ftrace_trace_function); |
1840 | 1823 | ||
1841 | if (*command & FTRACE_START_FUNC_RET) | 1824 | if (command & FTRACE_START_FUNC_RET) |
1842 | ftrace_enable_ftrace_graph_caller(); | 1825 | ftrace_enable_ftrace_graph_caller(); |
1843 | else if (*command & FTRACE_STOP_FUNC_RET) | 1826 | else if (command & FTRACE_STOP_FUNC_RET) |
1844 | ftrace_disable_ftrace_graph_caller(); | 1827 | ftrace_disable_ftrace_graph_caller(); |
1828 | } | ||
1829 | |||
1830 | static int __ftrace_modify_code(void *data) | ||
1831 | { | ||
1832 | int *command = data; | ||
1833 | |||
1834 | ftrace_modify_all_code(*command); | ||
1845 | 1835 | ||
1846 | return 0; | 1836 | return 0; |
1847 | } | 1837 | } |
@@ -2469,57 +2459,35 @@ static int | |||
2469 | ftrace_avail_open(struct inode *inode, struct file *file) | 2459 | ftrace_avail_open(struct inode *inode, struct file *file) |
2470 | { | 2460 | { |
2471 | struct ftrace_iterator *iter; | 2461 | struct ftrace_iterator *iter; |
2472 | int ret; | ||
2473 | 2462 | ||
2474 | if (unlikely(ftrace_disabled)) | 2463 | if (unlikely(ftrace_disabled)) |
2475 | return -ENODEV; | 2464 | return -ENODEV; |
2476 | 2465 | ||
2477 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | 2466 | iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); |
2478 | if (!iter) | 2467 | if (iter) { |
2479 | return -ENOMEM; | 2468 | iter->pg = ftrace_pages_start; |
2480 | 2469 | iter->ops = &global_ops; | |
2481 | iter->pg = ftrace_pages_start; | ||
2482 | iter->ops = &global_ops; | ||
2483 | |||
2484 | ret = seq_open(file, &show_ftrace_seq_ops); | ||
2485 | if (!ret) { | ||
2486 | struct seq_file *m = file->private_data; | ||
2487 | |||
2488 | m->private = iter; | ||
2489 | } else { | ||
2490 | kfree(iter); | ||
2491 | } | 2470 | } |
2492 | 2471 | ||
2493 | return ret; | 2472 | return iter ? 0 : -ENOMEM; |
2494 | } | 2473 | } |
2495 | 2474 | ||
2496 | static int | 2475 | static int |
2497 | ftrace_enabled_open(struct inode *inode, struct file *file) | 2476 | ftrace_enabled_open(struct inode *inode, struct file *file) |
2498 | { | 2477 | { |
2499 | struct ftrace_iterator *iter; | 2478 | struct ftrace_iterator *iter; |
2500 | int ret; | ||
2501 | 2479 | ||
2502 | if (unlikely(ftrace_disabled)) | 2480 | if (unlikely(ftrace_disabled)) |
2503 | return -ENODEV; | 2481 | return -ENODEV; |
2504 | 2482 | ||
2505 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | 2483 | iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); |
2506 | if (!iter) | 2484 | if (iter) { |
2507 | return -ENOMEM; | 2485 | iter->pg = ftrace_pages_start; |
2508 | 2486 | iter->flags = FTRACE_ITER_ENABLED; | |
2509 | iter->pg = ftrace_pages_start; | 2487 | iter->ops = &global_ops; |
2510 | iter->flags = FTRACE_ITER_ENABLED; | ||
2511 | iter->ops = &global_ops; | ||
2512 | |||
2513 | ret = seq_open(file, &show_ftrace_seq_ops); | ||
2514 | if (!ret) { | ||
2515 | struct seq_file *m = file->private_data; | ||
2516 | |||
2517 | m->private = iter; | ||
2518 | } else { | ||
2519 | kfree(iter); | ||
2520 | } | 2488 | } |
2521 | 2489 | ||
2522 | return ret; | 2490 | return iter ? 0 : -ENOMEM; |
2523 | } | 2491 | } |
2524 | 2492 | ||
2525 | static void ftrace_filter_reset(struct ftrace_hash *hash) | 2493 | static void ftrace_filter_reset(struct ftrace_hash *hash) |
@@ -3688,22 +3656,36 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) | |||
3688 | return 0; | 3656 | return 0; |
3689 | } | 3657 | } |
3690 | 3658 | ||
3691 | static void ftrace_swap_recs(void *a, void *b, int size) | 3659 | static int ftrace_cmp_ips(const void *a, const void *b) |
3660 | { | ||
3661 | const unsigned long *ipa = a; | ||
3662 | const unsigned long *ipb = b; | ||
3663 | |||
3664 | if (*ipa > *ipb) | ||
3665 | return 1; | ||
3666 | if (*ipa < *ipb) | ||
3667 | return -1; | ||
3668 | return 0; | ||
3669 | } | ||
3670 | |||
3671 | static void ftrace_swap_ips(void *a, void *b, int size) | ||
3692 | { | 3672 | { |
3693 | struct dyn_ftrace *reca = a; | 3673 | unsigned long *ipa = a; |
3694 | struct dyn_ftrace *recb = b; | 3674 | unsigned long *ipb = b; |
3695 | struct dyn_ftrace t; | 3675 | unsigned long t; |
3696 | 3676 | ||
3697 | t = *reca; | 3677 | t = *ipa; |
3698 | *reca = *recb; | 3678 | *ipa = *ipb; |
3699 | *recb = t; | 3679 | *ipb = t; |
3700 | } | 3680 | } |
3701 | 3681 | ||
3702 | static int ftrace_process_locs(struct module *mod, | 3682 | static int ftrace_process_locs(struct module *mod, |
3703 | unsigned long *start, | 3683 | unsigned long *start, |
3704 | unsigned long *end) | 3684 | unsigned long *end) |
3705 | { | 3685 | { |
3686 | struct ftrace_page *start_pg; | ||
3706 | struct ftrace_page *pg; | 3687 | struct ftrace_page *pg; |
3688 | struct dyn_ftrace *rec; | ||
3707 | unsigned long count; | 3689 | unsigned long count; |
3708 | unsigned long *p; | 3690 | unsigned long *p; |
3709 | unsigned long addr; | 3691 | unsigned long addr; |
@@ -3715,8 +3697,11 @@ static int ftrace_process_locs(struct module *mod, | |||
3715 | if (!count) | 3697 | if (!count) |
3716 | return 0; | 3698 | return 0; |
3717 | 3699 | ||
3718 | pg = ftrace_allocate_pages(count); | 3700 | sort(start, count, sizeof(*start), |
3719 | if (!pg) | 3701 | ftrace_cmp_ips, ftrace_swap_ips); |
3702 | |||
3703 | start_pg = ftrace_allocate_pages(count); | ||
3704 | if (!start_pg) | ||
3720 | return -ENOMEM; | 3705 | return -ENOMEM; |
3721 | 3706 | ||
3722 | mutex_lock(&ftrace_lock); | 3707 | mutex_lock(&ftrace_lock); |
@@ -3729,7 +3714,7 @@ static int ftrace_process_locs(struct module *mod, | |||
3729 | if (!mod) { | 3714 | if (!mod) { |
3730 | WARN_ON(ftrace_pages || ftrace_pages_start); | 3715 | WARN_ON(ftrace_pages || ftrace_pages_start); |
3731 | /* First initialization */ | 3716 | /* First initialization */ |
3732 | ftrace_pages = ftrace_pages_start = pg; | 3717 | ftrace_pages = ftrace_pages_start = start_pg; |
3733 | } else { | 3718 | } else { |
3734 | if (!ftrace_pages) | 3719 | if (!ftrace_pages) |
3735 | goto out; | 3720 | goto out; |
@@ -3740,11 +3725,11 @@ static int ftrace_process_locs(struct module *mod, | |||
3740 | ftrace_pages = ftrace_pages->next; | 3725 | ftrace_pages = ftrace_pages->next; |
3741 | } | 3726 | } |
3742 | 3727 | ||
3743 | ftrace_pages->next = pg; | 3728 | ftrace_pages->next = start_pg; |
3744 | ftrace_pages = pg; | ||
3745 | } | 3729 | } |
3746 | 3730 | ||
3747 | p = start; | 3731 | p = start; |
3732 | pg = start_pg; | ||
3748 | while (p < end) { | 3733 | while (p < end) { |
3749 | addr = ftrace_call_adjust(*p++); | 3734 | addr = ftrace_call_adjust(*p++); |
3750 | /* | 3735 | /* |
@@ -3755,17 +3740,26 @@ static int ftrace_process_locs(struct module *mod, | |||
3755 | */ | 3740 | */ |
3756 | if (!addr) | 3741 | if (!addr) |
3757 | continue; | 3742 | continue; |
3758 | if (!ftrace_record_ip(addr)) | 3743 | |
3759 | break; | 3744 | if (pg->index == pg->size) { |
3745 | /* We should have allocated enough */ | ||
3746 | if (WARN_ON(!pg->next)) | ||
3747 | break; | ||
3748 | pg = pg->next; | ||
3749 | } | ||
3750 | |||
3751 | rec = &pg->records[pg->index++]; | ||
3752 | rec->ip = addr; | ||
3760 | } | 3753 | } |
3761 | 3754 | ||
3762 | /* These new locations need to be initialized */ | 3755 | /* We should have used all pages */ |
3763 | ftrace_new_pgs = pg; | 3756 | WARN_ON(pg->next); |
3757 | |||
3758 | /* Assign the last page to ftrace_pages */ | ||
3759 | ftrace_pages = pg; | ||
3764 | 3760 | ||
3765 | /* Make each individual set of pages sorted by ips */ | 3761 | /* These new locations need to be initialized */ |
3766 | for (; pg; pg = pg->next) | 3762 | ftrace_new_pgs = start_pg; |
3767 | sort(pg->records, pg->index, sizeof(struct dyn_ftrace), | ||
3768 | ftrace_cmp_recs, ftrace_swap_recs); | ||
3769 | 3763 | ||
3770 | /* | 3764 | /* |
3771 | * We only need to disable interrupts on start up | 3765 | * We only need to disable interrupts on start up |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cf8d11e91efd..f765465bffe4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -23,6 +23,8 @@ | |||
23 | #include <asm/local.h> | 23 | #include <asm/local.h> |
24 | #include "trace.h" | 24 | #include "trace.h" |
25 | 25 | ||
26 | static void update_pages_handler(struct work_struct *work); | ||
27 | |||
26 | /* | 28 | /* |
27 | * The ring buffer header is special. We must manually keep it up to date. | 29 | * The ring buffer header is special. We must manually keep it up to date. |
28 | */ | 30 | */ |
@@ -449,6 +451,7 @@ struct ring_buffer_per_cpu { | |||
449 | raw_spinlock_t reader_lock; /* serialize readers */ | 451 | raw_spinlock_t reader_lock; /* serialize readers */ |
450 | arch_spinlock_t lock; | 452 | arch_spinlock_t lock; |
451 | struct lock_class_key lock_key; | 453 | struct lock_class_key lock_key; |
454 | unsigned int nr_pages; | ||
452 | struct list_head *pages; | 455 | struct list_head *pages; |
453 | struct buffer_page *head_page; /* read from head */ | 456 | struct buffer_page *head_page; /* read from head */ |
454 | struct buffer_page *tail_page; /* write to tail */ | 457 | struct buffer_page *tail_page; /* write to tail */ |
@@ -466,13 +469,18 @@ struct ring_buffer_per_cpu { | |||
466 | unsigned long read_bytes; | 469 | unsigned long read_bytes; |
467 | u64 write_stamp; | 470 | u64 write_stamp; |
468 | u64 read_stamp; | 471 | u64 read_stamp; |
472 | /* ring buffer pages to update, > 0 to add, < 0 to remove */ | ||
473 | int nr_pages_to_update; | ||
474 | struct list_head new_pages; /* new pages to add */ | ||
475 | struct work_struct update_pages_work; | ||
476 | struct completion update_done; | ||
469 | }; | 477 | }; |
470 | 478 | ||
471 | struct ring_buffer { | 479 | struct ring_buffer { |
472 | unsigned pages; | ||
473 | unsigned flags; | 480 | unsigned flags; |
474 | int cpus; | 481 | int cpus; |
475 | atomic_t record_disabled; | 482 | atomic_t record_disabled; |
483 | atomic_t resize_disabled; | ||
476 | cpumask_var_t cpumask; | 484 | cpumask_var_t cpumask; |
477 | 485 | ||
478 | struct lock_class_key *reader_lock_key; | 486 | struct lock_class_key *reader_lock_key; |
@@ -937,6 +945,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) | |||
937 | struct list_head *head = cpu_buffer->pages; | 945 | struct list_head *head = cpu_buffer->pages; |
938 | struct buffer_page *bpage, *tmp; | 946 | struct buffer_page *bpage, *tmp; |
939 | 947 | ||
948 | /* Reset the head page if it exists */ | ||
949 | if (cpu_buffer->head_page) | ||
950 | rb_set_head_page(cpu_buffer); | ||
951 | |||
940 | rb_head_page_deactivate(cpu_buffer); | 952 | rb_head_page_deactivate(cpu_buffer); |
941 | 953 | ||
942 | if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) | 954 | if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) |
@@ -963,14 +975,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) | |||
963 | return 0; | 975 | return 0; |
964 | } | 976 | } |
965 | 977 | ||
966 | static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | 978 | static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu) |
967 | unsigned nr_pages) | ||
968 | { | 979 | { |
980 | int i; | ||
969 | struct buffer_page *bpage, *tmp; | 981 | struct buffer_page *bpage, *tmp; |
970 | LIST_HEAD(pages); | ||
971 | unsigned i; | ||
972 | |||
973 | WARN_ON(!nr_pages); | ||
974 | 982 | ||
975 | for (i = 0; i < nr_pages; i++) { | 983 | for (i = 0; i < nr_pages; i++) { |
976 | struct page *page; | 984 | struct page *page; |
@@ -981,15 +989,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
981 | */ | 989 | */ |
982 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), | 990 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), |
983 | GFP_KERNEL | __GFP_NORETRY, | 991 | GFP_KERNEL | __GFP_NORETRY, |
984 | cpu_to_node(cpu_buffer->cpu)); | 992 | cpu_to_node(cpu)); |
985 | if (!bpage) | 993 | if (!bpage) |
986 | goto free_pages; | 994 | goto free_pages; |
987 | 995 | ||
988 | rb_check_bpage(cpu_buffer, bpage); | 996 | list_add(&bpage->list, pages); |
989 | 997 | ||
990 | list_add(&bpage->list, &pages); | 998 | page = alloc_pages_node(cpu_to_node(cpu), |
991 | |||
992 | page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), | ||
993 | GFP_KERNEL | __GFP_NORETRY, 0); | 999 | GFP_KERNEL | __GFP_NORETRY, 0); |
994 | if (!page) | 1000 | if (!page) |
995 | goto free_pages; | 1001 | goto free_pages; |
@@ -997,6 +1003,27 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
997 | rb_init_page(bpage->page); | 1003 | rb_init_page(bpage->page); |
998 | } | 1004 | } |
999 | 1005 | ||
1006 | return 0; | ||
1007 | |||
1008 | free_pages: | ||
1009 | list_for_each_entry_safe(bpage, tmp, pages, list) { | ||
1010 | list_del_init(&bpage->list); | ||
1011 | free_buffer_page(bpage); | ||
1012 | } | ||
1013 | |||
1014 | return -ENOMEM; | ||
1015 | } | ||
1016 | |||
1017 | static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | ||
1018 | unsigned nr_pages) | ||
1019 | { | ||
1020 | LIST_HEAD(pages); | ||
1021 | |||
1022 | WARN_ON(!nr_pages); | ||
1023 | |||
1024 | if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu)) | ||
1025 | return -ENOMEM; | ||
1026 | |||
1000 | /* | 1027 | /* |
1001 | * The ring buffer page list is a circular list that does not | 1028 | * The ring buffer page list is a circular list that does not |
1002 | * start and end with a list head. All page list items point to | 1029 | * start and end with a list head. All page list items point to |
@@ -1005,20 +1032,15 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
1005 | cpu_buffer->pages = pages.next; | 1032 | cpu_buffer->pages = pages.next; |
1006 | list_del(&pages); | 1033 | list_del(&pages); |
1007 | 1034 | ||
1035 | cpu_buffer->nr_pages = nr_pages; | ||
1036 | |||
1008 | rb_check_pages(cpu_buffer); | 1037 | rb_check_pages(cpu_buffer); |
1009 | 1038 | ||
1010 | return 0; | 1039 | return 0; |
1011 | |||
1012 | free_pages: | ||
1013 | list_for_each_entry_safe(bpage, tmp, &pages, list) { | ||
1014 | list_del_init(&bpage->list); | ||
1015 | free_buffer_page(bpage); | ||
1016 | } | ||
1017 | return -ENOMEM; | ||
1018 | } | 1040 | } |
1019 | 1041 | ||
1020 | static struct ring_buffer_per_cpu * | 1042 | static struct ring_buffer_per_cpu * |
1021 | rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | 1043 | rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) |
1022 | { | 1044 | { |
1023 | struct ring_buffer_per_cpu *cpu_buffer; | 1045 | struct ring_buffer_per_cpu *cpu_buffer; |
1024 | struct buffer_page *bpage; | 1046 | struct buffer_page *bpage; |
@@ -1035,6 +1057,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
1035 | raw_spin_lock_init(&cpu_buffer->reader_lock); | 1057 | raw_spin_lock_init(&cpu_buffer->reader_lock); |
1036 | lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); | 1058 | lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); |
1037 | cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 1059 | cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
1060 | INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); | ||
1061 | init_completion(&cpu_buffer->update_done); | ||
1038 | 1062 | ||
1039 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), | 1063 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), |
1040 | GFP_KERNEL, cpu_to_node(cpu)); | 1064 | GFP_KERNEL, cpu_to_node(cpu)); |
@@ -1051,8 +1075,9 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
1051 | rb_init_page(bpage->page); | 1075 | rb_init_page(bpage->page); |
1052 | 1076 | ||
1053 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); | 1077 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); |
1078 | INIT_LIST_HEAD(&cpu_buffer->new_pages); | ||
1054 | 1079 | ||
1055 | ret = rb_allocate_pages(cpu_buffer, buffer->pages); | 1080 | ret = rb_allocate_pages(cpu_buffer, nr_pages); |
1056 | if (ret < 0) | 1081 | if (ret < 0) |
1057 | goto fail_free_reader; | 1082 | goto fail_free_reader; |
1058 | 1083 | ||
@@ -1113,7 +1138,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, | |||
1113 | { | 1138 | { |
1114 | struct ring_buffer *buffer; | 1139 | struct ring_buffer *buffer; |
1115 | int bsize; | 1140 | int bsize; |
1116 | int cpu; | 1141 | int cpu, nr_pages; |
1117 | 1142 | ||
1118 | /* keep it in its own cache line */ | 1143 | /* keep it in its own cache line */ |
1119 | buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), | 1144 | buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), |
@@ -1124,14 +1149,14 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, | |||
1124 | if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) | 1149 | if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) |
1125 | goto fail_free_buffer; | 1150 | goto fail_free_buffer; |
1126 | 1151 | ||
1127 | buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); | 1152 | nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); |
1128 | buffer->flags = flags; | 1153 | buffer->flags = flags; |
1129 | buffer->clock = trace_clock_local; | 1154 | buffer->clock = trace_clock_local; |
1130 | buffer->reader_lock_key = key; | 1155 | buffer->reader_lock_key = key; |
1131 | 1156 | ||
1132 | /* need at least two pages */ | 1157 | /* need at least two pages */ |
1133 | if (buffer->pages < 2) | 1158 | if (nr_pages < 2) |
1134 | buffer->pages = 2; | 1159 | nr_pages = 2; |
1135 | 1160 | ||
1136 | /* | 1161 | /* |
1137 | * In case of non-hotplug cpu, if the ring-buffer is allocated | 1162 | * In case of non-hotplug cpu, if the ring-buffer is allocated |
@@ -1154,7 +1179,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, | |||
1154 | 1179 | ||
1155 | for_each_buffer_cpu(buffer, cpu) { | 1180 | for_each_buffer_cpu(buffer, cpu) { |
1156 | buffer->buffers[cpu] = | 1181 | buffer->buffers[cpu] = |
1157 | rb_allocate_cpu_buffer(buffer, cpu); | 1182 | rb_allocate_cpu_buffer(buffer, nr_pages, cpu); |
1158 | if (!buffer->buffers[cpu]) | 1183 | if (!buffer->buffers[cpu]) |
1159 | goto fail_free_buffers; | 1184 | goto fail_free_buffers; |
1160 | } | 1185 | } |
@@ -1222,58 +1247,221 @@ void ring_buffer_set_clock(struct ring_buffer *buffer, | |||
1222 | 1247 | ||
1223 | static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); | 1248 | static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); |
1224 | 1249 | ||
1225 | static void | 1250 | static inline unsigned long rb_page_entries(struct buffer_page *bpage) |
1226 | rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) | ||
1227 | { | 1251 | { |
1228 | struct buffer_page *bpage; | 1252 | return local_read(&bpage->entries) & RB_WRITE_MASK; |
1229 | struct list_head *p; | 1253 | } |
1230 | unsigned i; | 1254 | |
1255 | static inline unsigned long rb_page_write(struct buffer_page *bpage) | ||
1256 | { | ||
1257 | return local_read(&bpage->write) & RB_WRITE_MASK; | ||
1258 | } | ||
1259 | |||
1260 | static int | ||
1261 | rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) | ||
1262 | { | ||
1263 | struct list_head *tail_page, *to_remove, *next_page; | ||
1264 | struct buffer_page *to_remove_page, *tmp_iter_page; | ||
1265 | struct buffer_page *last_page, *first_page; | ||
1266 | unsigned int nr_removed; | ||
1267 | unsigned long head_bit; | ||
1268 | int page_entries; | ||
1269 | |||
1270 | head_bit = 0; | ||
1231 | 1271 | ||
1232 | raw_spin_lock_irq(&cpu_buffer->reader_lock); | 1272 | raw_spin_lock_irq(&cpu_buffer->reader_lock); |
1233 | rb_head_page_deactivate(cpu_buffer); | 1273 | atomic_inc(&cpu_buffer->record_disabled); |
1274 | /* | ||
1275 | * We don't race with the readers since we have acquired the reader | ||
1276 | * lock. We also don't race with writers after disabling recording. | ||
1277 | * This makes it easy to figure out the first and the last page to be | ||
1278 | * removed from the list. We unlink all the pages in between including | ||
1279 | * the first and last pages. This is done in a busy loop so that we | ||
1280 | * lose the least number of traces. | ||
1281 | * The pages are freed after we restart recording and unlock readers. | ||
1282 | */ | ||
1283 | tail_page = &cpu_buffer->tail_page->list; | ||
1234 | 1284 | ||
1235 | for (i = 0; i < nr_pages; i++) { | 1285 | /* |
1236 | if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) | 1286 | * tail page might be on reader page, we remove the next page |
1237 | goto out; | 1287 | * from the ring buffer |
1238 | p = cpu_buffer->pages->next; | 1288 | */ |
1239 | bpage = list_entry(p, struct buffer_page, list); | 1289 | if (cpu_buffer->tail_page == cpu_buffer->reader_page) |
1240 | list_del_init(&bpage->list); | 1290 | tail_page = rb_list_head(tail_page->next); |
1241 | free_buffer_page(bpage); | 1291 | to_remove = tail_page; |
1292 | |||
1293 | /* start of pages to remove */ | ||
1294 | first_page = list_entry(rb_list_head(to_remove->next), | ||
1295 | struct buffer_page, list); | ||
1296 | |||
1297 | for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) { | ||
1298 | to_remove = rb_list_head(to_remove)->next; | ||
1299 | head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; | ||
1242 | } | 1300 | } |
1243 | if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) | ||
1244 | goto out; | ||
1245 | 1301 | ||
1246 | rb_reset_cpu(cpu_buffer); | 1302 | next_page = rb_list_head(to_remove)->next; |
1247 | rb_check_pages(cpu_buffer); | ||
1248 | 1303 | ||
1249 | out: | 1304 | /* |
1305 | * Now we remove all pages between tail_page and next_page. | ||
1306 | * Make sure that we have head_bit value preserved for the | ||
1307 | * next page | ||
1308 | */ | ||
1309 | tail_page->next = (struct list_head *)((unsigned long)next_page | | ||
1310 | head_bit); | ||
1311 | next_page = rb_list_head(next_page); | ||
1312 | next_page->prev = tail_page; | ||
1313 | |||
1314 | /* make sure pages points to a valid page in the ring buffer */ | ||
1315 | cpu_buffer->pages = next_page; | ||
1316 | |||
1317 | /* update head page */ | ||
1318 | if (head_bit) | ||
1319 | cpu_buffer->head_page = list_entry(next_page, | ||
1320 | struct buffer_page, list); | ||
1321 | |||
1322 | /* | ||
1323 | * change read pointer to make sure any read iterators reset | ||
1324 | * themselves | ||
1325 | */ | ||
1326 | cpu_buffer->read = 0; | ||
1327 | |||
1328 | /* pages are removed, resume tracing and then free the pages */ | ||
1329 | atomic_dec(&cpu_buffer->record_disabled); | ||
1250 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); | 1330 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); |
1331 | |||
1332 | RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)); | ||
1333 | |||
1334 | /* last buffer page to remove */ | ||
1335 | last_page = list_entry(rb_list_head(to_remove), struct buffer_page, | ||
1336 | list); | ||
1337 | tmp_iter_page = first_page; | ||
1338 | |||
1339 | do { | ||
1340 | to_remove_page = tmp_iter_page; | ||
1341 | rb_inc_page(cpu_buffer, &tmp_iter_page); | ||
1342 | |||
1343 | /* update the counters */ | ||
1344 | page_entries = rb_page_entries(to_remove_page); | ||
1345 | if (page_entries) { | ||
1346 | /* | ||
1347 | * If something was added to this page, it was full | ||
1348 | * since it is not the tail page. So we deduct the | ||
1349 | * bytes consumed in ring buffer from here. | ||
1350 | * Increment overrun to account for the lost events. | ||
1351 | */ | ||
1352 | local_add(page_entries, &cpu_buffer->overrun); | ||
1353 | local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); | ||
1354 | } | ||
1355 | |||
1356 | /* | ||
1357 | * We have already removed references to this list item, just | ||
1358 | * free up the buffer_page and its page | ||
1359 | */ | ||
1360 | free_buffer_page(to_remove_page); | ||
1361 | nr_removed--; | ||
1362 | |||
1363 | } while (to_remove_page != last_page); | ||
1364 | |||
1365 | RB_WARN_ON(cpu_buffer, nr_removed); | ||
1366 | |||
1367 | return nr_removed == 0; | ||
1251 | } | 1368 | } |
1252 | 1369 | ||
1253 | static void | 1370 | static int |
1254 | rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, | 1371 | rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) |
1255 | struct list_head *pages, unsigned nr_pages) | ||
1256 | { | 1372 | { |
1257 | struct buffer_page *bpage; | 1373 | struct list_head *pages = &cpu_buffer->new_pages; |
1258 | struct list_head *p; | 1374 | int retries, success; |
1259 | unsigned i; | ||
1260 | 1375 | ||
1261 | raw_spin_lock_irq(&cpu_buffer->reader_lock); | 1376 | raw_spin_lock_irq(&cpu_buffer->reader_lock); |
1262 | rb_head_page_deactivate(cpu_buffer); | 1377 | /* |
1378 | * We are holding the reader lock, so the reader page won't be swapped | ||
1379 | * in the ring buffer. Now we are racing with the writer trying to | ||
1380 | * move head page and the tail page. | ||
1381 | * We are going to adapt the reader page update process where: | ||
1382 | * 1. We first splice the start and end of list of new pages between | ||
1383 | * the head page and its previous page. | ||
1384 | * 2. We cmpxchg the prev_page->next to point from head page to the | ||
1385 | * start of new pages list. | ||
1386 | * 3. Finally, we update the head->prev to the end of new list. | ||
1387 | * | ||
1388 | * We will try this process 10 times, to make sure that we don't keep | ||
1389 | * spinning. | ||
1390 | */ | ||
1391 | retries = 10; | ||
1392 | success = 0; | ||
1393 | while (retries--) { | ||
1394 | struct list_head *head_page, *prev_page, *r; | ||
1395 | struct list_head *last_page, *first_page; | ||
1396 | struct list_head *head_page_with_bit; | ||
1263 | 1397 | ||
1264 | for (i = 0; i < nr_pages; i++) { | 1398 | head_page = &rb_set_head_page(cpu_buffer)->list; |
1265 | if (RB_WARN_ON(cpu_buffer, list_empty(pages))) | 1399 | prev_page = head_page->prev; |
1266 | goto out; | 1400 | |
1267 | p = pages->next; | 1401 | first_page = pages->next; |
1268 | bpage = list_entry(p, struct buffer_page, list); | 1402 | last_page = pages->prev; |
1269 | list_del_init(&bpage->list); | 1403 | |
1270 | list_add_tail(&bpage->list, cpu_buffer->pages); | 1404 | head_page_with_bit = (struct list_head *) |
1405 | ((unsigned long)head_page | RB_PAGE_HEAD); | ||
1406 | |||
1407 | last_page->next = head_page_with_bit; | ||
1408 | first_page->prev = prev_page; | ||
1409 | |||
1410 | r = cmpxchg(&prev_page->next, head_page_with_bit, first_page); | ||
1411 | |||
1412 | if (r == head_page_with_bit) { | ||
1413 | /* | ||
1414 | * yay, we replaced the page pointer to our new list, | ||
1415 | * now, we just have to update to head page's prev | ||
1416 | * pointer to point to end of list | ||
1417 | */ | ||
1418 | head_page->prev = last_page; | ||
1419 | success = 1; | ||
1420 | break; | ||
1421 | } | ||
1271 | } | 1422 | } |
1272 | rb_reset_cpu(cpu_buffer); | ||
1273 | rb_check_pages(cpu_buffer); | ||
1274 | 1423 | ||
1275 | out: | 1424 | if (success) |
1425 | INIT_LIST_HEAD(pages); | ||
1426 | /* | ||
1427 | * If we weren't successful in adding in new pages, warn and stop | ||
1428 | * tracing | ||
1429 | */ | ||
1430 | RB_WARN_ON(cpu_buffer, !success); | ||
1276 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); | 1431 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); |
1432 | |||
1433 | /* free pages if they weren't inserted */ | ||
1434 | if (!success) { | ||
1435 | struct buffer_page *bpage, *tmp; | ||
1436 | list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, | ||
1437 | list) { | ||
1438 | list_del_init(&bpage->list); | ||
1439 | free_buffer_page(bpage); | ||
1440 | } | ||
1441 | } | ||
1442 | return success; | ||
1443 | } | ||
1444 | |||
1445 | static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer) | ||
1446 | { | ||
1447 | int success; | ||
1448 | |||
1449 | if (cpu_buffer->nr_pages_to_update > 0) | ||
1450 | success = rb_insert_pages(cpu_buffer); | ||
1451 | else | ||
1452 | success = rb_remove_pages(cpu_buffer, | ||
1453 | -cpu_buffer->nr_pages_to_update); | ||
1454 | |||
1455 | if (success) | ||
1456 | cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update; | ||
1457 | } | ||
1458 | |||
1459 | static void update_pages_handler(struct work_struct *work) | ||
1460 | { | ||
1461 | struct ring_buffer_per_cpu *cpu_buffer = container_of(work, | ||
1462 | struct ring_buffer_per_cpu, update_pages_work); | ||
1463 | rb_update_pages(cpu_buffer); | ||
1464 | complete(&cpu_buffer->update_done); | ||
1277 | } | 1465 | } |
1278 | 1466 | ||
1279 | /** | 1467 | /** |
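The comment block in rb_insert_pages() above spells out the lock-free splice: link the new sublist to the flag-tagged head pointer, cmpxchg the previous page's next pointer over to the new first page, and only then fix head->prev, retrying a bounded number of times if a writer moved the head in the meantime. A standalone C model of that tagged-pointer cmpxchg splice (simplified stand-ins for struct buffer_page and RB_PAGE_HEAD, not the kernel code):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_HEAD 0x1UL                 /* stand-in for RB_PAGE_HEAD */

struct bpage {
        _Atomic uintptr_t next;         /* next page, low bit marks the head */
        struct bpage *prev;
        int id;
};

static struct bpage *ptr_of(uintptr_t v) { return (struct bpage *)(v & ~PAGE_HEAD); }
static uintptr_t tag(struct bpage *p)    { return (uintptr_t)p | PAGE_HEAD; }

int main(void)
{
        struct bpage ring[3], new_pages[2];
        struct bpage *head, *prev, *first, *last, *p;
        uintptr_t expected;
        int i;

        /* A three-page ring 0 -> 1 -> 2 -> 0, with page 1 as the head. */
        for (i = 0; i < 3; i++) {
                ring[i].id = i;
                ring[i].prev = &ring[(i + 2) % 3];
                atomic_init(&ring[i].next, (uintptr_t)&ring[(i + 1) % 3]);
        }
        atomic_store(&ring[0].next, tag(&ring[1]));

        /* Two new pages, already linked together: 3 -> 4. */
        for (i = 0; i < 2; i++) {
                new_pages[i].id = 3 + i;
                atomic_init(&new_pages[i].next, 0);
        }
        atomic_store(&new_pages[0].next, (uintptr_t)&new_pages[1]);
        new_pages[1].prev = &new_pages[0];

        head  = &ring[1];
        prev  = head->prev;
        first = &new_pages[0];
        last  = &new_pages[1];

        /* 1. Point the new sublist at the (still tagged) head page. */
        atomic_store(&last->next, tag(head));
        first->prev = prev;

        /* 2. cmpxchg prev->next from the tagged head to the new sublist.
         *    A writer moving the head concurrently makes this fail, and the
         *    kernel code simply retries a bounded number of times. */
        expected = tag(head);
        if (!atomic_compare_exchange_strong(&prev->next, &expected,
                                            (uintptr_t)first)) {
                fprintf(stderr, "head moved, would retry\n");
                return 1;
        }

        /* 3. Only after the cmpxchg succeeded, fix the backward link. */
        head->prev = last;

        /* Walk once around: prints 0 3 4 1 2. */
        p = &ring[0];
        for (i = 0; i < 5; i++) {
                printf("%d ", p->id);
                p = ptr_of(atomic_load(&p->next));
        }
        printf("\n");
        return 0;
}

If the cmpxchg fails, the new sublist is still private to the resizer, so retrying is safe; rb_insert_pages() gives up after ten attempts and then frees the pages it could not splice in.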
@@ -1283,16 +1471,14 @@ out: | |||
1283 | * | 1471 | * |
1284 | * Minimum size is 2 * BUF_PAGE_SIZE. | 1472 | * Minimum size is 2 * BUF_PAGE_SIZE. |
1285 | * | 1473 | * |
1286 | * Returns -1 on failure. | 1474 | * Returns 0 on success and < 0 on failure. |
1287 | */ | 1475 | */ |
1288 | int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) | 1476 | int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, |
1477 | int cpu_id) | ||
1289 | { | 1478 | { |
1290 | struct ring_buffer_per_cpu *cpu_buffer; | 1479 | struct ring_buffer_per_cpu *cpu_buffer; |
1291 | unsigned nr_pages, rm_pages, new_pages; | 1480 | unsigned nr_pages; |
1292 | struct buffer_page *bpage, *tmp; | 1481 | int cpu, err = 0; |
1293 | unsigned long buffer_size; | ||
1294 | LIST_HEAD(pages); | ||
1295 | int i, cpu; | ||
1296 | 1482 | ||
1297 | /* | 1483 | /* |
1298 | * Always succeed at resizing a non-existent buffer: | 1484 | * Always succeed at resizing a non-existent buffer: |
@@ -1300,115 +1486,161 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) | |||
1300 | if (!buffer) | 1486 | if (!buffer) |
1301 | return size; | 1487 | return size; |
1302 | 1488 | ||
1489 | /* Make sure the requested buffer exists */ | ||
1490 | if (cpu_id != RING_BUFFER_ALL_CPUS && | ||
1491 | !cpumask_test_cpu(cpu_id, buffer->cpumask)) | ||
1492 | return size; | ||
1493 | |||
1303 | size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); | 1494 | size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); |
1304 | size *= BUF_PAGE_SIZE; | 1495 | size *= BUF_PAGE_SIZE; |
1305 | buffer_size = buffer->pages * BUF_PAGE_SIZE; | ||
1306 | 1496 | ||
1307 | /* we need a minimum of two pages */ | 1497 | /* we need a minimum of two pages */ |
1308 | if (size < BUF_PAGE_SIZE * 2) | 1498 | if (size < BUF_PAGE_SIZE * 2) |
1309 | size = BUF_PAGE_SIZE * 2; | 1499 | size = BUF_PAGE_SIZE * 2; |
1310 | 1500 | ||
1311 | if (size == buffer_size) | 1501 | nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); |
1312 | return size; | ||
1313 | |||
1314 | atomic_inc(&buffer->record_disabled); | ||
1315 | 1502 | ||
1316 | /* Make sure all writers are done with this buffer. */ | 1503 | /* |
1317 | synchronize_sched(); | 1504 | * Don't succeed if resizing is disabled, as a reader might be |
1505 | * manipulating the ring buffer and is expecting a sane state while | ||
1506 | * this is true. | ||
1507 | */ | ||
1508 | if (atomic_read(&buffer->resize_disabled)) | ||
1509 | return -EBUSY; | ||
1318 | 1510 | ||
1511 | /* prevent another thread from changing buffer sizes */ | ||
1319 | mutex_lock(&buffer->mutex); | 1512 | mutex_lock(&buffer->mutex); |
1320 | get_online_cpus(); | ||
1321 | |||
1322 | nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); | ||
1323 | 1513 | ||
1324 | if (size < buffer_size) { | 1514 | if (cpu_id == RING_BUFFER_ALL_CPUS) { |
1515 | /* calculate the pages to update */ | ||
1516 | for_each_buffer_cpu(buffer, cpu) { | ||
1517 | cpu_buffer = buffer->buffers[cpu]; | ||
1325 | 1518 | ||
1326 | /* easy case, just free pages */ | 1519 | cpu_buffer->nr_pages_to_update = nr_pages - |
1327 | if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) | 1520 | cpu_buffer->nr_pages; |
1328 | goto out_fail; | 1521 | /* |
1522 | * nothing more to do for removing pages or no update | ||
1523 | */ | ||
1524 | if (cpu_buffer->nr_pages_to_update <= 0) | ||
1525 | continue; | ||
1526 | /* | ||
1527 | * to add pages, make sure all new pages can be | ||
1528 | * allocated without receiving ENOMEM | ||
1529 | */ | ||
1530 | INIT_LIST_HEAD(&cpu_buffer->new_pages); | ||
1531 | if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update, | ||
1532 | &cpu_buffer->new_pages, cpu)) { | ||
1533 | /* not enough memory for new pages */ | ||
1534 | err = -ENOMEM; | ||
1535 | goto out_err; | ||
1536 | } | ||
1537 | } | ||
1329 | 1538 | ||
1330 | rm_pages = buffer->pages - nr_pages; | 1539 | get_online_cpus(); |
1540 | /* | ||
1541 | * Fire off all the required work handlers | ||
1542 | * We can't schedule on offline CPUs, but it's not necessary | ||
1543 | * since we can change their buffer sizes without any race. | ||
1544 | */ | ||
1545 | for_each_buffer_cpu(buffer, cpu) { | ||
1546 | cpu_buffer = buffer->buffers[cpu]; | ||
1547 | if (!cpu_buffer->nr_pages_to_update) | ||
1548 | continue; | ||
1549 | |||
1550 | if (cpu_online(cpu)) | ||
1551 | schedule_work_on(cpu, | ||
1552 | &cpu_buffer->update_pages_work); | ||
1553 | else | ||
1554 | rb_update_pages(cpu_buffer); | ||
1555 | } | ||
1331 | 1556 | ||
1557 | /* wait for all the updates to complete */ | ||
1332 | for_each_buffer_cpu(buffer, cpu) { | 1558 | for_each_buffer_cpu(buffer, cpu) { |
1333 | cpu_buffer = buffer->buffers[cpu]; | 1559 | cpu_buffer = buffer->buffers[cpu]; |
1334 | rb_remove_pages(cpu_buffer, rm_pages); | 1560 | if (!cpu_buffer->nr_pages_to_update) |
1561 | continue; | ||
1562 | |||
1563 | if (cpu_online(cpu)) | ||
1564 | wait_for_completion(&cpu_buffer->update_done); | ||
1565 | cpu_buffer->nr_pages_to_update = 0; | ||
1335 | } | 1566 | } |
1336 | goto out; | ||
1337 | } | ||
1338 | 1567 | ||
1339 | /* | 1568 | put_online_cpus(); |
1340 | * This is a bit more difficult. We only want to add pages | 1569 | } else { |
1341 | * when we can allocate enough for all CPUs. We do this | 1570 | cpu_buffer = buffer->buffers[cpu_id]; |
1342 | * by allocating all the pages and storing them on a local | ||
1343 | * link list. If we succeed in our allocation, then we | ||
1344 | * add these pages to the cpu_buffers. Otherwise we just free | ||
1345 | * them all and return -ENOMEM; | ||
1346 | */ | ||
1347 | if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) | ||
1348 | goto out_fail; | ||
1349 | 1571 | ||
1350 | new_pages = nr_pages - buffer->pages; | 1572 | if (nr_pages == cpu_buffer->nr_pages) |
1573 | goto out; | ||
1351 | 1574 | ||
1352 | for_each_buffer_cpu(buffer, cpu) { | 1575 | cpu_buffer->nr_pages_to_update = nr_pages - |
1353 | for (i = 0; i < new_pages; i++) { | 1576 | cpu_buffer->nr_pages; |
1354 | struct page *page; | 1577 | |
1355 | /* | 1578 | INIT_LIST_HEAD(&cpu_buffer->new_pages); |
1356 | * __GFP_NORETRY flag makes sure that the allocation | 1579 | if (cpu_buffer->nr_pages_to_update > 0 && |
1357 | * fails gracefully without invoking oom-killer and | 1580 | __rb_allocate_pages(cpu_buffer->nr_pages_to_update, |
1358 | * the system is not destabilized. | 1581 | &cpu_buffer->new_pages, cpu_id)) { |
1359 | */ | 1582 | err = -ENOMEM; |
1360 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), | 1583 | goto out_err; |
1361 | cache_line_size()), | ||
1362 | GFP_KERNEL | __GFP_NORETRY, | ||
1363 | cpu_to_node(cpu)); | ||
1364 | if (!bpage) | ||
1365 | goto free_pages; | ||
1366 | list_add(&bpage->list, &pages); | ||
1367 | page = alloc_pages_node(cpu_to_node(cpu), | ||
1368 | GFP_KERNEL | __GFP_NORETRY, 0); | ||
1369 | if (!page) | ||
1370 | goto free_pages; | ||
1371 | bpage->page = page_address(page); | ||
1372 | rb_init_page(bpage->page); | ||
1373 | } | 1584 | } |
1374 | } | ||
1375 | 1585 | ||
1376 | for_each_buffer_cpu(buffer, cpu) { | 1586 | get_online_cpus(); |
1377 | cpu_buffer = buffer->buffers[cpu]; | ||
1378 | rb_insert_pages(cpu_buffer, &pages, new_pages); | ||
1379 | } | ||
1380 | 1587 | ||
1381 | if (RB_WARN_ON(buffer, !list_empty(&pages))) | 1588 | if (cpu_online(cpu_id)) { |
1382 | goto out_fail; | 1589 | schedule_work_on(cpu_id, |
1590 | &cpu_buffer->update_pages_work); | ||
1591 | wait_for_completion(&cpu_buffer->update_done); | ||
1592 | } else | ||
1593 | rb_update_pages(cpu_buffer); | ||
1594 | |||
1595 | cpu_buffer->nr_pages_to_update = 0; | ||
1596 | put_online_cpus(); | ||
1597 | } | ||
1383 | 1598 | ||
1384 | out: | 1599 | out: |
1385 | buffer->pages = nr_pages; | 1600 | /* |
1386 | put_online_cpus(); | 1601 | * The ring buffer resize can happen with the ring buffer |
1602 | * enabled, so that the update disturbs the tracing as little | ||
1603 | * as possible. But if the buffer is disabled, we do not need | ||
1604 | * to worry about that, and we can take the time to verify | ||
1605 | * that the buffer is not corrupt. | ||
1606 | */ | ||
1607 | if (atomic_read(&buffer->record_disabled)) { | ||
1608 | atomic_inc(&buffer->record_disabled); | ||
1609 | /* | ||
1610 | * Even though the buffer was disabled, we must make sure | ||
1611 | * that it is truly disabled before calling rb_check_pages. | ||
1612 | * There could have been a race between checking | ||
1613 | * record_disable and incrementing it. | ||
1614 | */ | ||
1615 | synchronize_sched(); | ||
1616 | for_each_buffer_cpu(buffer, cpu) { | ||
1617 | cpu_buffer = buffer->buffers[cpu]; | ||
1618 | rb_check_pages(cpu_buffer); | ||
1619 | } | ||
1620 | atomic_dec(&buffer->record_disabled); | ||
1621 | } | ||
1622 | |||
1387 | mutex_unlock(&buffer->mutex); | 1623 | mutex_unlock(&buffer->mutex); |
1624 | return size; | ||
1388 | 1625 | ||
1389 | atomic_dec(&buffer->record_disabled); | 1626 | out_err: |
1627 | for_each_buffer_cpu(buffer, cpu) { | ||
1628 | struct buffer_page *bpage, *tmp; | ||
1390 | 1629 | ||
1391 | return size; | 1630 | cpu_buffer = buffer->buffers[cpu]; |
1631 | cpu_buffer->nr_pages_to_update = 0; | ||
1392 | 1632 | ||
1393 | free_pages: | 1633 | if (list_empty(&cpu_buffer->new_pages)) |
1394 | list_for_each_entry_safe(bpage, tmp, &pages, list) { | 1634 | continue; |
1395 | list_del_init(&bpage->list); | ||
1396 | free_buffer_page(bpage); | ||
1397 | } | ||
1398 | put_online_cpus(); | ||
1399 | mutex_unlock(&buffer->mutex); | ||
1400 | atomic_dec(&buffer->record_disabled); | ||
1401 | return -ENOMEM; | ||
1402 | 1635 | ||
1403 | /* | 1636 | list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, |
1404 | * Something went totally wrong, and we are too paranoid | 1637 | list) { |
1405 | * to even clean up the mess. | 1638 | list_del_init(&bpage->list); |
1406 | */ | 1639 | free_buffer_page(bpage); |
1407 | out_fail: | 1640 | } |
1408 | put_online_cpus(); | 1641 | } |
1409 | mutex_unlock(&buffer->mutex); | 1642 | mutex_unlock(&buffer->mutex); |
1410 | atomic_dec(&buffer->record_disabled); | 1643 | return err; |
1411 | return -1; | ||
1412 | } | 1644 | } |
1413 | EXPORT_SYMBOL_GPL(ring_buffer_resize); | 1645 | EXPORT_SYMBOL_GPL(ring_buffer_resize); |
1414 | 1646 | ||
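The rewritten ring_buffer_resize() above works in two phases: compute each CPU's nr_pages_to_update and pre-allocate everything that can fail, then kick a per-CPU work item (or update directly for offline CPUs) and wait for every completion before releasing the mutex. A standalone model of that two-phase, per-CPU flow, using one thread per buffer purely as a stand-in for schedule_work_on()/wait_for_completion(); all names here are made up (build with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

struct cpu_buf {
        int nr_pages;
        int nr_pages_to_update;         /* > 0 add, < 0 remove, 0 nothing */
        void **new_pages;               /* pre-allocated pages for the add case */
        pthread_t worker;
};

static void *update_pages_worker(void *arg)
{
        struct cpu_buf *b = arg;

        /* The real handler splices b->new_pages into the ring (or unlinks
         * pages when the delta is negative); the model only keeps a count. */
        free(b->new_pages);
        b->new_pages = NULL;
        b->nr_pages += b->nr_pages_to_update;
        return NULL;
}

static int resize_all(struct cpu_buf *bufs, int new_nr_pages)
{
        int cpu;

        /* Phase 1: every allocation that can fail happens up front, so a
         * failure here leaves all buffers exactly as they were. */
        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                struct cpu_buf *b = &bufs[cpu];

                b->nr_pages_to_update = new_nr_pages - b->nr_pages;
                if (b->nr_pages_to_update <= 0)
                        continue;
                b->new_pages = calloc(b->nr_pages_to_update, sizeof(void *));
                if (!b->new_pages)
                        return -1;      /* caller would free what was allocated */
        }

        /* Phase 2: fire off one update per buffer, then wait for them all. */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                if (bufs[cpu].nr_pages_to_update)
                        pthread_create(&bufs[cpu].worker, NULL,
                                       update_pages_worker, &bufs[cpu]);

        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                if (!bufs[cpu].nr_pages_to_update)
                        continue;
                pthread_join(bufs[cpu].worker, NULL);
                bufs[cpu].nr_pages_to_update = 0;
        }
        return 0;
}

int main(void)
{
        struct cpu_buf bufs[NR_CPUS] = {
                { .nr_pages = 2 }, { .nr_pages = 2 },
                { .nr_pages = 8 }, { .nr_pages = 2 },
        };
        int cpu;

        if (resize_all(bufs, 4))
                return 1;
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu%d: %d pages\n", cpu, bufs[cpu].nr_pages);
        return 0;
}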
@@ -1447,21 +1679,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter) | |||
1447 | return __rb_page_index(iter->head_page, iter->head); | 1679 | return __rb_page_index(iter->head_page, iter->head); |
1448 | } | 1680 | } |
1449 | 1681 | ||
1450 | static inline unsigned long rb_page_write(struct buffer_page *bpage) | ||
1451 | { | ||
1452 | return local_read(&bpage->write) & RB_WRITE_MASK; | ||
1453 | } | ||
1454 | |||
1455 | static inline unsigned rb_page_commit(struct buffer_page *bpage) | 1682 | static inline unsigned rb_page_commit(struct buffer_page *bpage) |
1456 | { | 1683 | { |
1457 | return local_read(&bpage->page->commit); | 1684 | return local_read(&bpage->page->commit); |
1458 | } | 1685 | } |
1459 | 1686 | ||
1460 | static inline unsigned long rb_page_entries(struct buffer_page *bpage) | ||
1461 | { | ||
1462 | return local_read(&bpage->entries) & RB_WRITE_MASK; | ||
1463 | } | ||
1464 | |||
1465 | /* Size is determined by what has been committed */ | 1687 | /* Size is determined by what has been committed */ |
1466 | static inline unsigned rb_page_size(struct buffer_page *bpage) | 1688 | static inline unsigned rb_page_size(struct buffer_page *bpage) |
1467 | { | 1689 | { |
@@ -1510,7 +1732,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) | |||
1510 | * assign the commit to the tail. | 1732 | * assign the commit to the tail. |
1511 | */ | 1733 | */ |
1512 | again: | 1734 | again: |
1513 | max_count = cpu_buffer->buffer->pages * 100; | 1735 | max_count = cpu_buffer->nr_pages * 100; |
1514 | 1736 | ||
1515 | while (cpu_buffer->commit_page != cpu_buffer->tail_page) { | 1737 | while (cpu_buffer->commit_page != cpu_buffer->tail_page) { |
1516 | if (RB_WARN_ON(cpu_buffer, !(--max_count))) | 1738 | if (RB_WARN_ON(cpu_buffer, !(--max_count))) |
@@ -3486,6 +3708,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) | |||
3486 | 3708 | ||
3487 | iter->cpu_buffer = cpu_buffer; | 3709 | iter->cpu_buffer = cpu_buffer; |
3488 | 3710 | ||
3711 | atomic_inc(&buffer->resize_disabled); | ||
3489 | atomic_inc(&cpu_buffer->record_disabled); | 3712 | atomic_inc(&cpu_buffer->record_disabled); |
3490 | 3713 | ||
3491 | return iter; | 3714 | return iter; |
@@ -3548,7 +3771,14 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter) | |||
3548 | { | 3771 | { |
3549 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; | 3772 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; |
3550 | 3773 | ||
3774 | /* | ||
3775 | * Ring buffer is disabled from recording, here's a good place | ||
3776 | * to check the integrity of the ring buffer. | ||
3777 | */ | ||
3778 | rb_check_pages(cpu_buffer); | ||
3779 | |||
3551 | atomic_dec(&cpu_buffer->record_disabled); | 3780 | atomic_dec(&cpu_buffer->record_disabled); |
3781 | atomic_dec(&cpu_buffer->buffer->resize_disabled); | ||
3552 | kfree(iter); | 3782 | kfree(iter); |
3553 | } | 3783 | } |
3554 | EXPORT_SYMBOL_GPL(ring_buffer_read_finish); | 3784 | EXPORT_SYMBOL_GPL(ring_buffer_read_finish); |
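ring_buffer_read_prepare() and ring_buffer_read_finish() now bracket an iterator's lifetime with buffer->resize_disabled, and ring_buffer_resize() bails out with -EBUSY while that count is non-zero. A minimal model of the gate (illustrative names; as in the kernel, the check happens before the resize work starts, so it is a coarse guard rather than a lock):

#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int resize_disabled;

static void read_prepare(void)
{
        atomic_fetch_add(&resize_disabled, 1);
}

static void read_finish(void)
{
        atomic_fetch_sub(&resize_disabled, 1);
}

static int resize(int nr_pages)
{
        (void)nr_pages;                 /* actual resizing elided */

        if (atomic_load(&resize_disabled))
                return -EBUSY;          /* a reader iterator is live */
        /* ... perform the resize under the buffer mutex ... */
        return 0;
}

int main(void)
{
        read_prepare();
        printf("resize while reading: %d\n", resize(8));        /* -EBUSY */
        read_finish();
        printf("resize after finish:  %d\n", resize(8));        /* 0 */
        return 0;
}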
@@ -3588,9 +3818,18 @@ EXPORT_SYMBOL_GPL(ring_buffer_read); | |||
3588 | * ring_buffer_size - return the size of the ring buffer (in bytes) | 3818 | * ring_buffer_size - return the size of the ring buffer (in bytes) |
3589 | * @buffer: The ring buffer. | 3819 | * @buffer: The ring buffer. |
3590 | */ | 3820 | */ |
3591 | unsigned long ring_buffer_size(struct ring_buffer *buffer) | 3821 | unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu) |
3592 | { | 3822 | { |
3593 | return BUF_PAGE_SIZE * buffer->pages; | 3823 | /* |
3824 | * Earlier, this method returned | ||
3825 | * BUF_PAGE_SIZE * buffer->pages | ||
3826 | * Since that per-buffer field is now removed, we have converted this to | ||
3827 | * return the per cpu buffer value. | ||
3828 | */ | ||
3829 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | ||
3830 | return 0; | ||
3831 | |||
3832 | return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages; | ||
3594 | } | 3833 | } |
3595 | EXPORT_SYMBOL_GPL(ring_buffer_size); | 3834 | EXPORT_SYMBOL_GPL(ring_buffer_size); |
3596 | 3835 | ||
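Since ring_buffer_size() is now per-CPU, a caller that still wants the old whole-buffer figure has to sum over CPUs. A fragment showing the new calling convention (the helper name is made up and this is not meant to build on its own):

/* Fragment only: shows the per-CPU calling convention of the changed
 * ring_buffer_size(); total_ring_buffer_size() is a made-up helper. */
static unsigned long total_ring_buffer_size(struct ring_buffer *buffer)
{
        unsigned long total = 0;
        int cpu;

        for_each_online_cpu(cpu)
                total += ring_buffer_size(buffer, cpu);

        return total;
}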
@@ -3611,6 +3850,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) | |||
3611 | cpu_buffer->commit_page = cpu_buffer->head_page; | 3850 | cpu_buffer->commit_page = cpu_buffer->head_page; |
3612 | 3851 | ||
3613 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); | 3852 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); |
3853 | INIT_LIST_HEAD(&cpu_buffer->new_pages); | ||
3614 | local_set(&cpu_buffer->reader_page->write, 0); | 3854 | local_set(&cpu_buffer->reader_page->write, 0); |
3615 | local_set(&cpu_buffer->reader_page->entries, 0); | 3855 | local_set(&cpu_buffer->reader_page->entries, 0); |
3616 | local_set(&cpu_buffer->reader_page->page->commit, 0); | 3856 | local_set(&cpu_buffer->reader_page->page->commit, 0); |
@@ -3647,8 +3887,12 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) | |||
3647 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | 3887 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) |
3648 | return; | 3888 | return; |
3649 | 3889 | ||
3890 | atomic_inc(&buffer->resize_disabled); | ||
3650 | atomic_inc(&cpu_buffer->record_disabled); | 3891 | atomic_inc(&cpu_buffer->record_disabled); |
3651 | 3892 | ||
3893 | /* Make sure all commits have finished */ | ||
3894 | synchronize_sched(); | ||
3895 | |||
3652 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | 3896 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); |
3653 | 3897 | ||
3654 | if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) | 3898 | if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) |
@@ -3664,6 +3908,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) | |||
3664 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 3908 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
3665 | 3909 | ||
3666 | atomic_dec(&cpu_buffer->record_disabled); | 3910 | atomic_dec(&cpu_buffer->record_disabled); |
3911 | atomic_dec(&buffer->resize_disabled); | ||
3667 | } | 3912 | } |
3668 | EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); | 3913 | EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); |
3669 | 3914 | ||
@@ -3765,8 +4010,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, | |||
3765 | !cpumask_test_cpu(cpu, buffer_b->cpumask)) | 4010 | !cpumask_test_cpu(cpu, buffer_b->cpumask)) |
3766 | goto out; | 4011 | goto out; |
3767 | 4012 | ||
4013 | cpu_buffer_a = buffer_a->buffers[cpu]; | ||
4014 | cpu_buffer_b = buffer_b->buffers[cpu]; | ||
4015 | |||
3768 | /* At least make sure the two buffers are somewhat the same */ | 4016 | /* At least make sure the two buffers are somewhat the same */ |
3769 | if (buffer_a->pages != buffer_b->pages) | 4017 | if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) |
3770 | goto out; | 4018 | goto out; |
3771 | 4019 | ||
3772 | ret = -EAGAIN; | 4020 | ret = -EAGAIN; |
@@ -3780,9 +4028,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, | |||
3780 | if (atomic_read(&buffer_b->record_disabled)) | 4028 | if (atomic_read(&buffer_b->record_disabled)) |
3781 | goto out; | 4029 | goto out; |
3782 | 4030 | ||
3783 | cpu_buffer_a = buffer_a->buffers[cpu]; | ||
3784 | cpu_buffer_b = buffer_b->buffers[cpu]; | ||
3785 | |||
3786 | if (atomic_read(&cpu_buffer_a->record_disabled)) | 4031 | if (atomic_read(&cpu_buffer_a->record_disabled)) |
3787 | goto out; | 4032 | goto out; |
3788 | 4033 | ||
@@ -4071,6 +4316,8 @@ static int rb_cpu_notify(struct notifier_block *self, | |||
4071 | struct ring_buffer *buffer = | 4316 | struct ring_buffer *buffer = |
4072 | container_of(self, struct ring_buffer, cpu_notify); | 4317 | container_of(self, struct ring_buffer, cpu_notify); |
4073 | long cpu = (long)hcpu; | 4318 | long cpu = (long)hcpu; |
4319 | int cpu_i, nr_pages_same; | ||
4320 | unsigned int nr_pages; | ||
4074 | 4321 | ||
4075 | switch (action) { | 4322 | switch (action) { |
4076 | case CPU_UP_PREPARE: | 4323 | case CPU_UP_PREPARE: |
@@ -4078,8 +4325,23 @@ static int rb_cpu_notify(struct notifier_block *self, | |||
4078 | if (cpumask_test_cpu(cpu, buffer->cpumask)) | 4325 | if (cpumask_test_cpu(cpu, buffer->cpumask)) |
4079 | return NOTIFY_OK; | 4326 | return NOTIFY_OK; |
4080 | 4327 | ||
4328 | nr_pages = 0; | ||
4329 | nr_pages_same = 1; | ||
4330 | /* check if all cpu sizes are same */ | ||
4331 | for_each_buffer_cpu(buffer, cpu_i) { | ||
4332 | /* fill in the size from first enabled cpu */ | ||
4333 | if (nr_pages == 0) | ||
4334 | nr_pages = buffer->buffers[cpu_i]->nr_pages; | ||
4335 | if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { | ||
4336 | nr_pages_same = 0; | ||
4337 | break; | ||
4338 | } | ||
4339 | } | ||
4340 | /* allocate minimum pages, user can later expand it */ | ||
4341 | if (!nr_pages_same) | ||
4342 | nr_pages = 2; | ||
4081 | buffer->buffers[cpu] = | 4343 | buffer->buffers[cpu] = |
4082 | rb_allocate_cpu_buffer(buffer, cpu); | 4344 | rb_allocate_cpu_buffer(buffer, nr_pages, cpu); |
4083 | if (!buffer->buffers[cpu]) { | 4345 | if (!buffer->buffers[cpu]) { |
4084 | WARN(1, "failed to allocate ring buffer on CPU %ld\n", | 4346 | WARN(1, "failed to allocate ring buffer on CPU %ld\n", |
4085 | cpu); | 4347 | cpu); |
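The CPU_UP_PREPARE branch above sizes a hotplugged CPU's buffer from its peers: if every existing per-CPU buffer has the same nr_pages the new one matches it, otherwise it falls back to the two-page minimum and can be resized later. A standalone model of that decision:

#include <stdio.h>

static int pick_nr_pages(const int *nr_pages_per_cpu, int nr_cpus)
{
        int nr_pages = 0;
        int cpu;

        for (cpu = 0; cpu < nr_cpus; cpu++) {
                /* fill in the size from the first buffer we look at */
                if (nr_pages == 0)
                        nr_pages = nr_pages_per_cpu[cpu];
                /* any disagreement: fall back to the 2-page minimum */
                if (nr_pages != nr_pages_per_cpu[cpu])
                        return 2;
        }
        return nr_pages;
}

int main(void)
{
        int same[]  = { 8, 8, 8 };
        int mixed[] = { 8, 4, 8 };

        printf("%d\n", pick_nr_pages(same, 3));         /* 8 */
        printf("%d\n", pick_nr_pages(mixed, 3));        /* 2 */
        return 0;
}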
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2a22255c1010..a7fa0702be1c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -87,18 +87,6 @@ static int tracing_disabled = 1; | |||
87 | 87 | ||
88 | DEFINE_PER_CPU(int, ftrace_cpu_disabled); | 88 | DEFINE_PER_CPU(int, ftrace_cpu_disabled); |
89 | 89 | ||
90 | static inline void ftrace_disable_cpu(void) | ||
91 | { | ||
92 | preempt_disable(); | ||
93 | __this_cpu_inc(ftrace_cpu_disabled); | ||
94 | } | ||
95 | |||
96 | static inline void ftrace_enable_cpu(void) | ||
97 | { | ||
98 | __this_cpu_dec(ftrace_cpu_disabled); | ||
99 | preempt_enable(); | ||
100 | } | ||
101 | |||
102 | cpumask_var_t __read_mostly tracing_buffer_mask; | 90 | cpumask_var_t __read_mostly tracing_buffer_mask; |
103 | 91 | ||
104 | /* | 92 | /* |
@@ -383,7 +371,7 @@ EXPORT_SYMBOL_GPL(tracing_on); | |||
383 | void tracing_off(void) | 371 | void tracing_off(void) |
384 | { | 372 | { |
385 | if (global_trace.buffer) | 373 | if (global_trace.buffer) |
386 | ring_buffer_record_on(global_trace.buffer); | 374 | ring_buffer_record_off(global_trace.buffer); |
387 | /* | 375 | /* |
388 | * This flag is only looked at when buffers haven't been | 376 | * This flag is only looked at when buffers haven't been |
389 | * allocated yet. We don't really care about the race | 377 | * allocated yet. We don't really care about the race |
@@ -629,7 +617,6 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) | |||
629 | static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) | 617 | static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) |
630 | { | 618 | { |
631 | int len; | 619 | int len; |
632 | void *ret; | ||
633 | 620 | ||
634 | if (s->len <= s->readpos) | 621 | if (s->len <= s->readpos) |
635 | return -EBUSY; | 622 | return -EBUSY; |
@@ -637,9 +624,7 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) | |||
637 | len = s->len - s->readpos; | 624 | len = s->len - s->readpos; |
638 | if (cnt > len) | 625 | if (cnt > len) |
639 | cnt = len; | 626 | cnt = len; |
640 | ret = memcpy(buf, s->buffer + s->readpos, cnt); | 627 | memcpy(buf, s->buffer + s->readpos, cnt); |
641 | if (!ret) | ||
642 | return -EFAULT; | ||
643 | 628 | ||
644 | s->readpos += cnt; | 629 | s->readpos += cnt; |
645 | return cnt; | 630 | return cnt; |
@@ -751,8 +736,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
751 | 736 | ||
752 | arch_spin_lock(&ftrace_max_lock); | 737 | arch_spin_lock(&ftrace_max_lock); |
753 | 738 | ||
754 | ftrace_disable_cpu(); | ||
755 | |||
756 | ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); | 739 | ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); |
757 | 740 | ||
758 | if (ret == -EBUSY) { | 741 | if (ret == -EBUSY) { |
@@ -766,8 +749,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
766 | "Failed to swap buffers due to commit in progress\n"); | 749 | "Failed to swap buffers due to commit in progress\n"); |
767 | } | 750 | } |
768 | 751 | ||
769 | ftrace_enable_cpu(); | ||
770 | |||
771 | WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); | 752 | WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); |
772 | 753 | ||
773 | __update_max_tr(tr, tsk, cpu); | 754 | __update_max_tr(tr, tsk, cpu); |
@@ -782,8 +763,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
782 | * Register a new plugin tracer. | 763 | * Register a new plugin tracer. |
783 | */ | 764 | */ |
784 | int register_tracer(struct tracer *type) | 765 | int register_tracer(struct tracer *type) |
785 | __releases(kernel_lock) | ||
786 | __acquires(kernel_lock) | ||
787 | { | 766 | { |
788 | struct tracer *t; | 767 | struct tracer *t; |
789 | int ret = 0; | 768 | int ret = 0; |
@@ -841,7 +820,8 @@ __acquires(kernel_lock) | |||
841 | 820 | ||
842 | /* If we expanded the buffers, make sure the max is expanded too */ | 821 | /* If we expanded the buffers, make sure the max is expanded too */ |
843 | if (ring_buffer_expanded && type->use_max_tr) | 822 | if (ring_buffer_expanded && type->use_max_tr) |
844 | ring_buffer_resize(max_tr.buffer, trace_buf_size); | 823 | ring_buffer_resize(max_tr.buffer, trace_buf_size, |
824 | RING_BUFFER_ALL_CPUS); | ||
845 | 825 | ||
846 | /* the test is responsible for initializing and enabling */ | 826 | /* the test is responsible for initializing and enabling */ |
847 | pr_info("Testing tracer %s: ", type->name); | 827 | pr_info("Testing tracer %s: ", type->name); |
@@ -857,7 +837,8 @@ __acquires(kernel_lock) | |||
857 | 837 | ||
858 | /* Shrink the max buffer again */ | 838 | /* Shrink the max buffer again */ |
859 | if (ring_buffer_expanded && type->use_max_tr) | 839 | if (ring_buffer_expanded && type->use_max_tr) |
860 | ring_buffer_resize(max_tr.buffer, 1); | 840 | ring_buffer_resize(max_tr.buffer, 1, |
841 | RING_BUFFER_ALL_CPUS); | ||
861 | 842 | ||
862 | printk(KERN_CONT "PASSED\n"); | 843 | printk(KERN_CONT "PASSED\n"); |
863 | } | 844 | } |
@@ -917,13 +898,6 @@ out: | |||
917 | mutex_unlock(&trace_types_lock); | 898 | mutex_unlock(&trace_types_lock); |
918 | } | 899 | } |
919 | 900 | ||
920 | static void __tracing_reset(struct ring_buffer *buffer, int cpu) | ||
921 | { | ||
922 | ftrace_disable_cpu(); | ||
923 | ring_buffer_reset_cpu(buffer, cpu); | ||
924 | ftrace_enable_cpu(); | ||
925 | } | ||
926 | |||
927 | void tracing_reset(struct trace_array *tr, int cpu) | 901 | void tracing_reset(struct trace_array *tr, int cpu) |
928 | { | 902 | { |
929 | struct ring_buffer *buffer = tr->buffer; | 903 | struct ring_buffer *buffer = tr->buffer; |
@@ -932,7 +906,7 @@ void tracing_reset(struct trace_array *tr, int cpu) | |||
932 | 906 | ||
933 | /* Make sure all commits have finished */ | 907 | /* Make sure all commits have finished */ |
934 | synchronize_sched(); | 908 | synchronize_sched(); |
935 | __tracing_reset(buffer, cpu); | 909 | ring_buffer_reset_cpu(buffer, cpu); |
936 | 910 | ||
937 | ring_buffer_record_enable(buffer); | 911 | ring_buffer_record_enable(buffer); |
938 | } | 912 | } |
@@ -950,7 +924,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) | |||
950 | tr->time_start = ftrace_now(tr->cpu); | 924 | tr->time_start = ftrace_now(tr->cpu); |
951 | 925 | ||
952 | for_each_online_cpu(cpu) | 926 | for_each_online_cpu(cpu) |
953 | __tracing_reset(buffer, cpu); | 927 | ring_buffer_reset_cpu(buffer, cpu); |
954 | 928 | ||
955 | ring_buffer_record_enable(buffer); | 929 | ring_buffer_record_enable(buffer); |
956 | } | 930 | } |
@@ -1498,25 +1472,119 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) | |||
1498 | 1472 | ||
1499 | #endif /* CONFIG_STACKTRACE */ | 1473 | #endif /* CONFIG_STACKTRACE */ |
1500 | 1474 | ||
1475 | /* created for use with alloc_percpu */ | ||
1476 | struct trace_buffer_struct { | ||
1477 | char buffer[TRACE_BUF_SIZE]; | ||
1478 | }; | ||
1479 | |||
1480 | static struct trace_buffer_struct *trace_percpu_buffer; | ||
1481 | static struct trace_buffer_struct *trace_percpu_sirq_buffer; | ||
1482 | static struct trace_buffer_struct *trace_percpu_irq_buffer; | ||
1483 | static struct trace_buffer_struct *trace_percpu_nmi_buffer; | ||
1484 | |||
1485 | /* | ||
1486 | * The buffer used is dependent on the context. There is a per cpu | ||
1487 | * buffer for normal context, softirq context, hard irq context and | ||
1488 | * for NMI context. This allows for lockless recording. | ||
1489 | * | ||
1490 | * Note, if the buffers failed to be allocated, then this returns NULL | ||
1491 | */ | ||
1492 | static char *get_trace_buf(void) | ||
1493 | { | ||
1494 | struct trace_buffer_struct *percpu_buffer; | ||
1495 | struct trace_buffer_struct *buffer; | ||
1496 | |||
1497 | /* | ||
1498 | * If we have allocated per cpu buffers, then we do not | ||
1499 | * need to do any locking. | ||
1500 | */ | ||
1501 | if (in_nmi()) | ||
1502 | percpu_buffer = trace_percpu_nmi_buffer; | ||
1503 | else if (in_irq()) | ||
1504 | percpu_buffer = trace_percpu_irq_buffer; | ||
1505 | else if (in_softirq()) | ||
1506 | percpu_buffer = trace_percpu_sirq_buffer; | ||
1507 | else | ||
1508 | percpu_buffer = trace_percpu_buffer; | ||
1509 | |||
1510 | if (!percpu_buffer) | ||
1511 | return NULL; | ||
1512 | |||
1513 | buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); | ||
1514 | |||
1515 | return buffer->buffer; | ||
1516 | } | ||
1517 | |||
1518 | static int alloc_percpu_trace_buffer(void) | ||
1519 | { | ||
1520 | struct trace_buffer_struct *buffers; | ||
1521 | struct trace_buffer_struct *sirq_buffers; | ||
1522 | struct trace_buffer_struct *irq_buffers; | ||
1523 | struct trace_buffer_struct *nmi_buffers; | ||
1524 | |||
1525 | buffers = alloc_percpu(struct trace_buffer_struct); | ||
1526 | if (!buffers) | ||
1527 | goto err_warn; | ||
1528 | |||
1529 | sirq_buffers = alloc_percpu(struct trace_buffer_struct); | ||
1530 | if (!sirq_buffers) | ||
1531 | goto err_sirq; | ||
1532 | |||
1533 | irq_buffers = alloc_percpu(struct trace_buffer_struct); | ||
1534 | if (!irq_buffers) | ||
1535 | goto err_irq; | ||
1536 | |||
1537 | nmi_buffers = alloc_percpu(struct trace_buffer_struct); | ||
1538 | if (!nmi_buffers) | ||
1539 | goto err_nmi; | ||
1540 | |||
1541 | trace_percpu_buffer = buffers; | ||
1542 | trace_percpu_sirq_buffer = sirq_buffers; | ||
1543 | trace_percpu_irq_buffer = irq_buffers; | ||
1544 | trace_percpu_nmi_buffer = nmi_buffers; | ||
1545 | |||
1546 | return 0; | ||
1547 | |||
1548 | err_nmi: | ||
1549 | free_percpu(irq_buffers); | ||
1550 | err_irq: | ||
1551 | free_percpu(sirq_buffers); | ||
1552 | err_sirq: | ||
1553 | free_percpu(buffers); | ||
1554 | err_warn: | ||
1555 | WARN(1, "Could not allocate percpu trace_printk buffer"); | ||
1556 | return -ENOMEM; | ||
1557 | } | ||
1558 | |||
1559 | void trace_printk_init_buffers(void) | ||
1560 | { | ||
1561 | static int buffers_allocated; | ||
1562 | |||
1563 | if (buffers_allocated) | ||
1564 | return; | ||
1565 | |||
1566 | if (alloc_percpu_trace_buffer()) | ||
1567 | return; | ||
1568 | |||
1569 | pr_info("ftrace: Allocated trace_printk buffers\n"); | ||
1570 | |||
1571 | buffers_allocated = 1; | ||
1572 | } | ||
1573 | |||
1501 | /** | 1574 | /** |
1502 | * trace_vbprintk - write binary msg to tracing buffer | 1575 | * trace_vbprintk - write binary msg to tracing buffer |
1503 | * | 1576 | * |
1504 | */ | 1577 | */ |
1505 | int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | 1578 | int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) |
1506 | { | 1579 | { |
1507 | static arch_spinlock_t trace_buf_lock = | ||
1508 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | ||
1509 | static u32 trace_buf[TRACE_BUF_SIZE]; | ||
1510 | |||
1511 | struct ftrace_event_call *call = &event_bprint; | 1580 | struct ftrace_event_call *call = &event_bprint; |
1512 | struct ring_buffer_event *event; | 1581 | struct ring_buffer_event *event; |
1513 | struct ring_buffer *buffer; | 1582 | struct ring_buffer *buffer; |
1514 | struct trace_array *tr = &global_trace; | 1583 | struct trace_array *tr = &global_trace; |
1515 | struct trace_array_cpu *data; | ||
1516 | struct bprint_entry *entry; | 1584 | struct bprint_entry *entry; |
1517 | unsigned long flags; | 1585 | unsigned long flags; |
1518 | int disable; | 1586 | char *tbuffer; |
1519 | int cpu, len = 0, size, pc; | 1587 | int len = 0, size, pc; |
1520 | 1588 | ||
1521 | if (unlikely(tracing_selftest_running || tracing_disabled)) | 1589 | if (unlikely(tracing_selftest_running || tracing_disabled)) |
1522 | return 0; | 1590 | return 0; |
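get_trace_buf() above replaces the shared static buffer and trace_buf_lock with one percpu scratch buffer per context level (normal, softirq, irq, NMI), so a context that interrupts another never clobbers the half-written buffer underneath it and no lock or irq-disable is needed. A standalone model of that layout (context is passed in explicitly here; the kernel derives it from in_nmi()/in_irq()/in_softirq()):

#include <stdio.h>
#include <string.h>

#define NR_CPUS  2
#define BUF_SIZE 128

enum trace_ctx { CTX_NORMAL, CTX_SOFTIRQ, CTX_IRQ, CTX_NMI, NR_CTX };

static char trace_buf[NR_CTX][NR_CPUS][BUF_SIZE];

static char *get_trace_buf(enum trace_ctx ctx, int cpu)
{
        /* No locking: each (context, cpu) pair owns its own buffer. */
        return trace_buf[ctx][cpu];
}

int main(void)
{
        int cpu = 0;
        char *buf, *irq_buf;

        /* A normal-context message is being formatted... */
        buf = get_trace_buf(CTX_NORMAL, cpu);
        strcpy(buf, "half-written normal-context message");

        /* ...and an interrupt firing on the same cpu formats into its own
         * buffer, so the one below it is left intact. */
        irq_buf = get_trace_buf(CTX_IRQ, cpu);
        snprintf(irq_buf, BUF_SIZE, "irq-context message");

        printf("%s\n", buf);
        printf("%s\n", irq_buf);
        return 0;
}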
@@ -1526,43 +1594,36 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
1526 | 1594 | ||
1527 | pc = preempt_count(); | 1595 | pc = preempt_count(); |
1528 | preempt_disable_notrace(); | 1596 | preempt_disable_notrace(); |
1529 | cpu = raw_smp_processor_id(); | ||
1530 | data = tr->data[cpu]; | ||
1531 | 1597 | ||
1532 | disable = atomic_inc_return(&data->disabled); | 1598 | tbuffer = get_trace_buf(); |
1533 | if (unlikely(disable != 1)) | 1599 | if (!tbuffer) { |
1600 | len = 0; | ||
1534 | goto out; | 1601 | goto out; |
1602 | } | ||
1535 | 1603 | ||
1536 | /* Lockdep uses trace_printk for lock tracing */ | 1604 | len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); |
1537 | local_irq_save(flags); | ||
1538 | arch_spin_lock(&trace_buf_lock); | ||
1539 | len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); | ||
1540 | 1605 | ||
1541 | if (len > TRACE_BUF_SIZE || len < 0) | 1606 | if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) |
1542 | goto out_unlock; | 1607 | goto out; |
1543 | 1608 | ||
1609 | local_save_flags(flags); | ||
1544 | size = sizeof(*entry) + sizeof(u32) * len; | 1610 | size = sizeof(*entry) + sizeof(u32) * len; |
1545 | buffer = tr->buffer; | 1611 | buffer = tr->buffer; |
1546 | event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, | 1612 | event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, |
1547 | flags, pc); | 1613 | flags, pc); |
1548 | if (!event) | 1614 | if (!event) |
1549 | goto out_unlock; | 1615 | goto out; |
1550 | entry = ring_buffer_event_data(event); | 1616 | entry = ring_buffer_event_data(event); |
1551 | entry->ip = ip; | 1617 | entry->ip = ip; |
1552 | entry->fmt = fmt; | 1618 | entry->fmt = fmt; |
1553 | 1619 | ||
1554 | memcpy(entry->buf, trace_buf, sizeof(u32) * len); | 1620 | memcpy(entry->buf, tbuffer, sizeof(u32) * len); |
1555 | if (!filter_check_discard(call, entry, buffer, event)) { | 1621 | if (!filter_check_discard(call, entry, buffer, event)) { |
1556 | ring_buffer_unlock_commit(buffer, event); | 1622 | ring_buffer_unlock_commit(buffer, event); |
1557 | ftrace_trace_stack(buffer, flags, 6, pc); | 1623 | ftrace_trace_stack(buffer, flags, 6, pc); |
1558 | } | 1624 | } |
1559 | 1625 | ||
1560 | out_unlock: | ||
1561 | arch_spin_unlock(&trace_buf_lock); | ||
1562 | local_irq_restore(flags); | ||
1563 | |||
1564 | out: | 1626 | out: |
1565 | atomic_dec_return(&data->disabled); | ||
1566 | preempt_enable_notrace(); | 1627 | preempt_enable_notrace(); |
1567 | unpause_graph_tracing(); | 1628 | unpause_graph_tracing(); |
1568 | 1629 | ||
@@ -1588,58 +1649,53 @@ int trace_array_printk(struct trace_array *tr, | |||
1588 | int trace_array_vprintk(struct trace_array *tr, | 1649 | int trace_array_vprintk(struct trace_array *tr, |
1589 | unsigned long ip, const char *fmt, va_list args) | 1650 | unsigned long ip, const char *fmt, va_list args) |
1590 | { | 1651 | { |
1591 | static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED; | ||
1592 | static char trace_buf[TRACE_BUF_SIZE]; | ||
1593 | |||
1594 | struct ftrace_event_call *call = &event_print; | 1652 | struct ftrace_event_call *call = &event_print; |
1595 | struct ring_buffer_event *event; | 1653 | struct ring_buffer_event *event; |
1596 | struct ring_buffer *buffer; | 1654 | struct ring_buffer *buffer; |
1597 | struct trace_array_cpu *data; | 1655 | int len = 0, size, pc; |
1598 | int cpu, len = 0, size, pc; | ||
1599 | struct print_entry *entry; | 1656 | struct print_entry *entry; |
1600 | unsigned long irq_flags; | 1657 | unsigned long flags; |
1601 | int disable; | 1658 | char *tbuffer; |
1602 | 1659 | ||
1603 | if (tracing_disabled || tracing_selftest_running) | 1660 | if (tracing_disabled || tracing_selftest_running) |
1604 | return 0; | 1661 | return 0; |
1605 | 1662 | ||
1663 | /* Don't pollute graph traces with trace_vprintk internals */ | ||
1664 | pause_graph_tracing(); | ||
1665 | |||
1606 | pc = preempt_count(); | 1666 | pc = preempt_count(); |
1607 | preempt_disable_notrace(); | 1667 | preempt_disable_notrace(); |
1608 | cpu = raw_smp_processor_id(); | ||
1609 | data = tr->data[cpu]; | ||
1610 | 1668 | ||
1611 | disable = atomic_inc_return(&data->disabled); | 1669 | |
1612 | if (unlikely(disable != 1)) | 1670 | tbuffer = get_trace_buf(); |
1671 | if (!tbuffer) { | ||
1672 | len = 0; | ||
1613 | goto out; | 1673 | goto out; |
1674 | } | ||
1614 | 1675 | ||
1615 | pause_graph_tracing(); | 1676 | len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); |
1616 | raw_local_irq_save(irq_flags); | 1677 | if (len > TRACE_BUF_SIZE) |
1617 | arch_spin_lock(&trace_buf_lock); | 1678 | goto out; |
1618 | len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); | ||
1619 | 1679 | ||
1680 | local_save_flags(flags); | ||
1620 | size = sizeof(*entry) + len + 1; | 1681 | size = sizeof(*entry) + len + 1; |
1621 | buffer = tr->buffer; | 1682 | buffer = tr->buffer; |
1622 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, | 1683 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, |
1623 | irq_flags, pc); | 1684 | flags, pc); |
1624 | if (!event) | 1685 | if (!event) |
1625 | goto out_unlock; | 1686 | goto out; |
1626 | entry = ring_buffer_event_data(event); | 1687 | entry = ring_buffer_event_data(event); |
1627 | entry->ip = ip; | 1688 | entry->ip = ip; |
1628 | 1689 | ||
1629 | memcpy(&entry->buf, trace_buf, len); | 1690 | memcpy(&entry->buf, tbuffer, len); |
1630 | entry->buf[len] = '\0'; | 1691 | entry->buf[len] = '\0'; |
1631 | if (!filter_check_discard(call, entry, buffer, event)) { | 1692 | if (!filter_check_discard(call, entry, buffer, event)) { |
1632 | ring_buffer_unlock_commit(buffer, event); | 1693 | ring_buffer_unlock_commit(buffer, event); |
1633 | ftrace_trace_stack(buffer, irq_flags, 6, pc); | 1694 | ftrace_trace_stack(buffer, flags, 6, pc); |
1634 | } | 1695 | } |
1635 | |||
1636 | out_unlock: | ||
1637 | arch_spin_unlock(&trace_buf_lock); | ||
1638 | raw_local_irq_restore(irq_flags); | ||
1639 | unpause_graph_tracing(); | ||
1640 | out: | 1696 | out: |
1641 | atomic_dec_return(&data->disabled); | ||
1642 | preempt_enable_notrace(); | 1697 | preempt_enable_notrace(); |
1698 | unpause_graph_tracing(); | ||
1643 | 1699 | ||
1644 | return len; | 1700 | return len; |
1645 | } | 1701 | } |
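The rewritten trace_array_vprintk() above (like the __trace_printk path in the previous hunk) no longer formats into one static buffer under trace_buf_lock with interrupts off; it formats into whatever get_trace_buf() returns and only needs preemption disabled, so the per-cpu data->disabled bookkeeping disappears as well. The helper's body is not part of this section; the sketch below only assumes the usual per-CPU, context-separated layout, and the real buffers are presumably allocated lazily by trace_printk_init_buffers(), which shows up near the end of the trace.c hunks (the callers above also handle a NULL return, which this sketch does not model).

	/*
	 * Sketch only -- one buffer per CPU and per context level, so a
	 * trace_printk() that interrupts another one on the same CPU never
	 * shares a buffer and no lock or IRQ-off section is required.
	 * Caller must have preemption disabled, as the hunks above do.
	 */
	#include <linux/percpu.h>
	#include <linux/hardirq.h>

	#define TRACE_BUF_SIZE	1024	/* sketch value; trace.c has its own */

	struct trace_pk_bufs {
		char data[4][TRACE_BUF_SIZE];	/* task/softirq/hardirq/NMI */
	};
	static DEFINE_PER_CPU(struct trace_pk_bufs, trace_pk_bufs);

	static char *get_trace_buf(void)
	{
		int ctx = 0;			/* task context */

		if (in_nmi())
			ctx = 3;
		else if (in_irq())
			ctx = 2;
		else if (in_serving_softirq())
			ctx = 1;

		return this_cpu_ptr(&trace_pk_bufs)->data[ctx];
	}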
@@ -1652,14 +1708,9 @@ EXPORT_SYMBOL_GPL(trace_vprintk); | |||
1652 | 1708 | ||
1653 | static void trace_iterator_increment(struct trace_iterator *iter) | 1709 | static void trace_iterator_increment(struct trace_iterator *iter) |
1654 | { | 1710 | { |
1655 | /* Don't allow ftrace to trace into the ring buffers */ | ||
1656 | ftrace_disable_cpu(); | ||
1657 | |||
1658 | iter->idx++; | 1711 | iter->idx++; |
1659 | if (iter->buffer_iter[iter->cpu]) | 1712 | if (iter->buffer_iter[iter->cpu]) |
1660 | ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); | 1713 | ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); |
1661 | |||
1662 | ftrace_enable_cpu(); | ||
1663 | } | 1714 | } |
1664 | 1715 | ||
1665 | static struct trace_entry * | 1716 | static struct trace_entry * |
@@ -1669,17 +1720,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, | |||
1669 | struct ring_buffer_event *event; | 1720 | struct ring_buffer_event *event; |
1670 | struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; | 1721 | struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; |
1671 | 1722 | ||
1672 | /* Don't allow ftrace to trace into the ring buffers */ | ||
1673 | ftrace_disable_cpu(); | ||
1674 | |||
1675 | if (buf_iter) | 1723 | if (buf_iter) |
1676 | event = ring_buffer_iter_peek(buf_iter, ts); | 1724 | event = ring_buffer_iter_peek(buf_iter, ts); |
1677 | else | 1725 | else |
1678 | event = ring_buffer_peek(iter->tr->buffer, cpu, ts, | 1726 | event = ring_buffer_peek(iter->tr->buffer, cpu, ts, |
1679 | lost_events); | 1727 | lost_events); |
1680 | 1728 | ||
1681 | ftrace_enable_cpu(); | ||
1682 | |||
1683 | if (event) { | 1729 | if (event) { |
1684 | iter->ent_size = ring_buffer_event_length(event); | 1730 | iter->ent_size = ring_buffer_event_length(event); |
1685 | return ring_buffer_event_data(event); | 1731 | return ring_buffer_event_data(event); |
@@ -1769,11 +1815,8 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter) | |||
1769 | 1815 | ||
1770 | static void trace_consume(struct trace_iterator *iter) | 1816 | static void trace_consume(struct trace_iterator *iter) |
1771 | { | 1817 | { |
1772 | /* Don't allow ftrace to trace into the ring buffers */ | ||
1773 | ftrace_disable_cpu(); | ||
1774 | ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, | 1818 | ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, |
1775 | &iter->lost_events); | 1819 | &iter->lost_events); |
1776 | ftrace_enable_cpu(); | ||
1777 | } | 1820 | } |
1778 | 1821 | ||
1779 | static void *s_next(struct seq_file *m, void *v, loff_t *pos) | 1822 | static void *s_next(struct seq_file *m, void *v, loff_t *pos) |
@@ -1862,16 +1905,12 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
1862 | iter->cpu = 0; | 1905 | iter->cpu = 0; |
1863 | iter->idx = -1; | 1906 | iter->idx = -1; |
1864 | 1907 | ||
1865 | ftrace_disable_cpu(); | ||
1866 | |||
1867 | if (cpu_file == TRACE_PIPE_ALL_CPU) { | 1908 | if (cpu_file == TRACE_PIPE_ALL_CPU) { |
1868 | for_each_tracing_cpu(cpu) | 1909 | for_each_tracing_cpu(cpu) |
1869 | tracing_iter_reset(iter, cpu); | 1910 | tracing_iter_reset(iter, cpu); |
1870 | } else | 1911 | } else |
1871 | tracing_iter_reset(iter, cpu_file); | 1912 | tracing_iter_reset(iter, cpu_file); |
1872 | 1913 | ||
1873 | ftrace_enable_cpu(); | ||
1874 | |||
1875 | iter->leftover = 0; | 1914 | iter->leftover = 0; |
1876 | for (p = iter; p && l < *pos; p = s_next(m, p, &l)) | 1915 | for (p = iter; p && l < *pos; p = s_next(m, p, &l)) |
1877 | ; | 1916 | ; |
@@ -2332,15 +2371,13 @@ static struct trace_iterator * | |||
2332 | __tracing_open(struct inode *inode, struct file *file) | 2371 | __tracing_open(struct inode *inode, struct file *file) |
2333 | { | 2372 | { |
2334 | long cpu_file = (long) inode->i_private; | 2373 | long cpu_file = (long) inode->i_private; |
2335 | void *fail_ret = ERR_PTR(-ENOMEM); | ||
2336 | struct trace_iterator *iter; | 2374 | struct trace_iterator *iter; |
2337 | struct seq_file *m; | 2375 | int cpu; |
2338 | int cpu, ret; | ||
2339 | 2376 | ||
2340 | if (tracing_disabled) | 2377 | if (tracing_disabled) |
2341 | return ERR_PTR(-ENODEV); | 2378 | return ERR_PTR(-ENODEV); |
2342 | 2379 | ||
2343 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | 2380 | iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter)); |
2344 | if (!iter) | 2381 | if (!iter) |
2345 | return ERR_PTR(-ENOMEM); | 2382 | return ERR_PTR(-ENOMEM); |
2346 | 2383 | ||
@@ -2397,32 +2434,15 @@ __tracing_open(struct inode *inode, struct file *file) | |||
2397 | tracing_iter_reset(iter, cpu); | 2434 | tracing_iter_reset(iter, cpu); |
2398 | } | 2435 | } |
2399 | 2436 | ||
2400 | ret = seq_open(file, &tracer_seq_ops); | ||
2401 | if (ret < 0) { | ||
2402 | fail_ret = ERR_PTR(ret); | ||
2403 | goto fail_buffer; | ||
2404 | } | ||
2405 | |||
2406 | m = file->private_data; | ||
2407 | m->private = iter; | ||
2408 | |||
2409 | mutex_unlock(&trace_types_lock); | 2437 | mutex_unlock(&trace_types_lock); |
2410 | 2438 | ||
2411 | return iter; | 2439 | return iter; |
2412 | 2440 | ||
2413 | fail_buffer: | ||
2414 | for_each_tracing_cpu(cpu) { | ||
2415 | if (iter->buffer_iter[cpu]) | ||
2416 | ring_buffer_read_finish(iter->buffer_iter[cpu]); | ||
2417 | } | ||
2418 | free_cpumask_var(iter->started); | ||
2419 | tracing_start(); | ||
2420 | fail: | 2441 | fail: |
2421 | mutex_unlock(&trace_types_lock); | 2442 | mutex_unlock(&trace_types_lock); |
2422 | kfree(iter->trace); | 2443 | kfree(iter->trace); |
2423 | kfree(iter); | 2444 | seq_release_private(inode, file); |
2424 | 2445 | return ERR_PTR(-ENOMEM); | |
2425 | return fail_ret; | ||
2426 | } | 2446 | } |
2427 | 2447 | ||
2428 | int tracing_open_generic(struct inode *inode, struct file *filp) | 2448 | int tracing_open_generic(struct inode *inode, struct file *filp) |
@@ -2458,11 +2478,10 @@ static int tracing_release(struct inode *inode, struct file *file) | |||
2458 | tracing_start(); | 2478 | tracing_start(); |
2459 | mutex_unlock(&trace_types_lock); | 2479 | mutex_unlock(&trace_types_lock); |
2460 | 2480 | ||
2461 | seq_release(inode, file); | ||
2462 | mutex_destroy(&iter->mutex); | 2481 | mutex_destroy(&iter->mutex); |
2463 | free_cpumask_var(iter->started); | 2482 | free_cpumask_var(iter->started); |
2464 | kfree(iter->trace); | 2483 | kfree(iter->trace); |
2465 | kfree(iter); | 2484 | seq_release_private(inode, file); |
2466 | return 0; | 2485 | return 0; |
2467 | } | 2486 | } |
2468 | 2487 | ||
@@ -2648,10 +2667,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, | |||
2648 | if (cpumask_test_cpu(cpu, tracing_cpumask) && | 2667 | if (cpumask_test_cpu(cpu, tracing_cpumask) && |
2649 | !cpumask_test_cpu(cpu, tracing_cpumask_new)) { | 2668 | !cpumask_test_cpu(cpu, tracing_cpumask_new)) { |
2650 | atomic_inc(&global_trace.data[cpu]->disabled); | 2669 | atomic_inc(&global_trace.data[cpu]->disabled); |
2670 | ring_buffer_record_disable_cpu(global_trace.buffer, cpu); | ||
2651 | } | 2671 | } |
2652 | if (!cpumask_test_cpu(cpu, tracing_cpumask) && | 2672 | if (!cpumask_test_cpu(cpu, tracing_cpumask) && |
2653 | cpumask_test_cpu(cpu, tracing_cpumask_new)) { | 2673 | cpumask_test_cpu(cpu, tracing_cpumask_new)) { |
2654 | atomic_dec(&global_trace.data[cpu]->disabled); | 2674 | atomic_dec(&global_trace.data[cpu]->disabled); |
2675 | ring_buffer_record_enable_cpu(global_trace.buffer, cpu); | ||
2655 | } | 2676 | } |
2656 | } | 2677 | } |
2657 | arch_spin_unlock(&ftrace_max_lock); | 2678 | arch_spin_unlock(&ftrace_max_lock); |
@@ -2974,7 +2995,14 @@ int tracer_init(struct tracer *t, struct trace_array *tr) | |||
2974 | return t->init(tr); | 2995 | return t->init(tr); |
2975 | } | 2996 | } |
2976 | 2997 | ||
2977 | static int __tracing_resize_ring_buffer(unsigned long size) | 2998 | static void set_buffer_entries(struct trace_array *tr, unsigned long val) |
2999 | { | ||
3000 | int cpu; | ||
3001 | for_each_tracing_cpu(cpu) | ||
3002 | tr->data[cpu]->entries = val; | ||
3003 | } | ||
3004 | |||
3005 | static int __tracing_resize_ring_buffer(unsigned long size, int cpu) | ||
2978 | { | 3006 | { |
2979 | int ret; | 3007 | int ret; |
2980 | 3008 | ||
@@ -2985,19 +3013,32 @@ static int __tracing_resize_ring_buffer(unsigned long size) | |||
2985 | */ | 3013 | */ |
2986 | ring_buffer_expanded = 1; | 3014 | ring_buffer_expanded = 1; |
2987 | 3015 | ||
2988 | ret = ring_buffer_resize(global_trace.buffer, size); | 3016 | ret = ring_buffer_resize(global_trace.buffer, size, cpu); |
2989 | if (ret < 0) | 3017 | if (ret < 0) |
2990 | return ret; | 3018 | return ret; |
2991 | 3019 | ||
2992 | if (!current_trace->use_max_tr) | 3020 | if (!current_trace->use_max_tr) |
2993 | goto out; | 3021 | goto out; |
2994 | 3022 | ||
2995 | ret = ring_buffer_resize(max_tr.buffer, size); | 3023 | ret = ring_buffer_resize(max_tr.buffer, size, cpu); |
2996 | if (ret < 0) { | 3024 | if (ret < 0) { |
2997 | int r; | 3025 | int r = 0; |
3026 | |||
3027 | if (cpu == RING_BUFFER_ALL_CPUS) { | ||
3028 | int i; | ||
3029 | for_each_tracing_cpu(i) { | ||
3030 | r = ring_buffer_resize(global_trace.buffer, | ||
3031 | global_trace.data[i]->entries, | ||
3032 | i); | ||
3033 | if (r < 0) | ||
3034 | break; | ||
3035 | } | ||
3036 | } else { | ||
3037 | r = ring_buffer_resize(global_trace.buffer, | ||
3038 | global_trace.data[cpu]->entries, | ||
3039 | cpu); | ||
3040 | } | ||
2998 | 3041 | ||
2999 | r = ring_buffer_resize(global_trace.buffer, | ||
3000 | global_trace.entries); | ||
3001 | if (r < 0) { | 3042 | if (r < 0) { |
3002 | /* | 3043 | /* |
3003 | * AARGH! We are left with different | 3044 | * AARGH! We are left with different |
@@ -3019,43 +3060,39 @@ static int __tracing_resize_ring_buffer(unsigned long size) | |||
3019 | return ret; | 3060 | return ret; |
3020 | } | 3061 | } |
3021 | 3062 | ||
3022 | max_tr.entries = size; | 3063 | if (cpu == RING_BUFFER_ALL_CPUS) |
3064 | set_buffer_entries(&max_tr, size); | ||
3065 | else | ||
3066 | max_tr.data[cpu]->entries = size; | ||
3067 | |||
3023 | out: | 3068 | out: |
3024 | global_trace.entries = size; | 3069 | if (cpu == RING_BUFFER_ALL_CPUS) |
3070 | set_buffer_entries(&global_trace, size); | ||
3071 | else | ||
3072 | global_trace.data[cpu]->entries = size; | ||
3025 | 3073 | ||
3026 | return ret; | 3074 | return ret; |
3027 | } | 3075 | } |
3028 | 3076 | ||
3029 | static ssize_t tracing_resize_ring_buffer(unsigned long size) | 3077 | static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) |
3030 | { | 3078 | { |
3031 | int cpu, ret = size; | 3079 | int ret = size; |
3032 | 3080 | ||
3033 | mutex_lock(&trace_types_lock); | 3081 | mutex_lock(&trace_types_lock); |
3034 | 3082 | ||
3035 | tracing_stop(); | 3083 | if (cpu_id != RING_BUFFER_ALL_CPUS) { |
3036 | 3084 | /* make sure, this cpu is enabled in the mask */ | |
3037 | /* disable all cpu buffers */ | 3085 | if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { |
3038 | for_each_tracing_cpu(cpu) { | 3086 | ret = -EINVAL; |
3039 | if (global_trace.data[cpu]) | 3087 | goto out; |
3040 | atomic_inc(&global_trace.data[cpu]->disabled); | 3088 | } |
3041 | if (max_tr.data[cpu]) | ||
3042 | atomic_inc(&max_tr.data[cpu]->disabled); | ||
3043 | } | 3089 | } |
3044 | 3090 | ||
3045 | if (size != global_trace.entries) | 3091 | ret = __tracing_resize_ring_buffer(size, cpu_id); |
3046 | ret = __tracing_resize_ring_buffer(size); | ||
3047 | |||
3048 | if (ret < 0) | 3092 | if (ret < 0) |
3049 | ret = -ENOMEM; | 3093 | ret = -ENOMEM; |
3050 | 3094 | ||
3051 | for_each_tracing_cpu(cpu) { | 3095 | out: |
3052 | if (global_trace.data[cpu]) | ||
3053 | atomic_dec(&global_trace.data[cpu]->disabled); | ||
3054 | if (max_tr.data[cpu]) | ||
3055 | atomic_dec(&max_tr.data[cpu]->disabled); | ||
3056 | } | ||
3057 | |||
3058 | tracing_start(); | ||
3059 | mutex_unlock(&trace_types_lock); | 3096 | mutex_unlock(&trace_types_lock); |
3060 | 3097 | ||
3061 | return ret; | 3098 | return ret; |
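With the two hunks above, ring-buffer resizing becomes per-CPU: ring_buffer_resize() takes a third cpu argument, the size bookkeeping moves from a single trace_array.entries to trace_array_cpu->entries (see the trace.h hunk below), and the old stop-the-world tracing_stop()/disabled-counter dance is replaced by resizing either one CPU or RING_BUFFER_ALL_CPUS under trace_types_lock, with a failed max_tr resize rolled back per CPU from the recorded entries values. A minimal sketch of how code inside trace.c would drive the new interface -- the wrapper names are purely illustrative, and the KiB shift mirrors the buffer_size_kb write path further down:

	/* hypothetical wrappers, for illustration only */
	static int resize_one_cpu_kb(unsigned long kb, int cpu)
	{
		return tracing_resize_ring_buffer(kb << 10, cpu);
	}

	static int resize_all_cpus_kb(unsigned long kb)
	{
		return tracing_resize_ring_buffer(kb << 10, RING_BUFFER_ALL_CPUS);
	}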
@@ -3078,7 +3115,8 @@ int tracing_update_buffers(void) | |||
3078 | 3115 | ||
3079 | mutex_lock(&trace_types_lock); | 3116 | mutex_lock(&trace_types_lock); |
3080 | if (!ring_buffer_expanded) | 3117 | if (!ring_buffer_expanded) |
3081 | ret = __tracing_resize_ring_buffer(trace_buf_size); | 3118 | ret = __tracing_resize_ring_buffer(trace_buf_size, |
3119 | RING_BUFFER_ALL_CPUS); | ||
3082 | mutex_unlock(&trace_types_lock); | 3120 | mutex_unlock(&trace_types_lock); |
3083 | 3121 | ||
3084 | return ret; | 3122 | return ret; |
@@ -3102,7 +3140,8 @@ static int tracing_set_tracer(const char *buf) | |||
3102 | mutex_lock(&trace_types_lock); | 3140 | mutex_lock(&trace_types_lock); |
3103 | 3141 | ||
3104 | if (!ring_buffer_expanded) { | 3142 | if (!ring_buffer_expanded) { |
3105 | ret = __tracing_resize_ring_buffer(trace_buf_size); | 3143 | ret = __tracing_resize_ring_buffer(trace_buf_size, |
3144 | RING_BUFFER_ALL_CPUS); | ||
3106 | if (ret < 0) | 3145 | if (ret < 0) |
3107 | goto out; | 3146 | goto out; |
3108 | ret = 0; | 3147 | ret = 0; |
@@ -3128,8 +3167,8 @@ static int tracing_set_tracer(const char *buf) | |||
3128 | * The max_tr ring buffer has some state (e.g. ring->clock) and | 3167 | * The max_tr ring buffer has some state (e.g. ring->clock) and |
3129 | * we want preserve it. | 3168 | * we want preserve it. |
3130 | */ | 3169 | */ |
3131 | ring_buffer_resize(max_tr.buffer, 1); | 3170 | ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); |
3132 | max_tr.entries = 1; | 3171 | set_buffer_entries(&max_tr, 1); |
3133 | } | 3172 | } |
3134 | destroy_trace_option_files(topts); | 3173 | destroy_trace_option_files(topts); |
3135 | 3174 | ||
@@ -3137,10 +3176,17 @@ static int tracing_set_tracer(const char *buf) | |||
3137 | 3176 | ||
3138 | topts = create_trace_option_files(current_trace); | 3177 | topts = create_trace_option_files(current_trace); |
3139 | if (current_trace->use_max_tr) { | 3178 | if (current_trace->use_max_tr) { |
3140 | ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); | 3179 | int cpu; |
3141 | if (ret < 0) | 3180 | /* we need to make per cpu buffer sizes equivalent */ |
3142 | goto out; | 3181 | for_each_tracing_cpu(cpu) { |
3143 | max_tr.entries = global_trace.entries; | 3182 | ret = ring_buffer_resize(max_tr.buffer, |
3183 | global_trace.data[cpu]->entries, | ||
3184 | cpu); | ||
3185 | if (ret < 0) | ||
3186 | goto out; | ||
3187 | max_tr.data[cpu]->entries = | ||
3188 | global_trace.data[cpu]->entries; | ||
3189 | } | ||
3144 | } | 3190 | } |
3145 | 3191 | ||
3146 | if (t->init) { | 3192 | if (t->init) { |
@@ -3563,6 +3609,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
3563 | .pages = pages_def, | 3609 | .pages = pages_def, |
3564 | .partial = partial_def, | 3610 | .partial = partial_def, |
3565 | .nr_pages = 0, /* This gets updated below. */ | 3611 | .nr_pages = 0, /* This gets updated below. */ |
3612 | .nr_pages_max = PIPE_DEF_BUFFERS, | ||
3566 | .flags = flags, | 3613 | .flags = flags, |
3567 | .ops = &tracing_pipe_buf_ops, | 3614 | .ops = &tracing_pipe_buf_ops, |
3568 | .spd_release = tracing_spd_release_pipe, | 3615 | .spd_release = tracing_spd_release_pipe, |
@@ -3634,7 +3681,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
3634 | 3681 | ||
3635 | ret = splice_to_pipe(pipe, &spd); | 3682 | ret = splice_to_pipe(pipe, &spd); |
3636 | out: | 3683 | out: |
3637 | splice_shrink_spd(pipe, &spd); | 3684 | splice_shrink_spd(&spd); |
3638 | return ret; | 3685 | return ret; |
3639 | 3686 | ||
3640 | out_err: | 3687 | out_err: |
@@ -3642,30 +3689,82 @@ out_err: | |||
3642 | goto out; | 3689 | goto out; |
3643 | } | 3690 | } |
3644 | 3691 | ||
3692 | struct ftrace_entries_info { | ||
3693 | struct trace_array *tr; | ||
3694 | int cpu; | ||
3695 | }; | ||
3696 | |||
3697 | static int tracing_entries_open(struct inode *inode, struct file *filp) | ||
3698 | { | ||
3699 | struct ftrace_entries_info *info; | ||
3700 | |||
3701 | if (tracing_disabled) | ||
3702 | return -ENODEV; | ||
3703 | |||
3704 | info = kzalloc(sizeof(*info), GFP_KERNEL); | ||
3705 | if (!info) | ||
3706 | return -ENOMEM; | ||
3707 | |||
3708 | info->tr = &global_trace; | ||
3709 | info->cpu = (unsigned long)inode->i_private; | ||
3710 | |||
3711 | filp->private_data = info; | ||
3712 | |||
3713 | return 0; | ||
3714 | } | ||
3715 | |||
3645 | static ssize_t | 3716 | static ssize_t |
3646 | tracing_entries_read(struct file *filp, char __user *ubuf, | 3717 | tracing_entries_read(struct file *filp, char __user *ubuf, |
3647 | size_t cnt, loff_t *ppos) | 3718 | size_t cnt, loff_t *ppos) |
3648 | { | 3719 | { |
3649 | struct trace_array *tr = filp->private_data; | 3720 | struct ftrace_entries_info *info = filp->private_data; |
3650 | char buf[96]; | 3721 | struct trace_array *tr = info->tr; |
3651 | int r; | 3722 | char buf[64]; |
3723 | int r = 0; | ||
3724 | ssize_t ret; | ||
3652 | 3725 | ||
3653 | mutex_lock(&trace_types_lock); | 3726 | mutex_lock(&trace_types_lock); |
3654 | if (!ring_buffer_expanded) | 3727 | |
3655 | r = sprintf(buf, "%lu (expanded: %lu)\n", | 3728 | if (info->cpu == RING_BUFFER_ALL_CPUS) { |
3656 | tr->entries >> 10, | 3729 | int cpu, buf_size_same; |
3657 | trace_buf_size >> 10); | 3730 | unsigned long size; |
3658 | else | 3731 | |
3659 | r = sprintf(buf, "%lu\n", tr->entries >> 10); | 3732 | size = 0; |
3733 | buf_size_same = 1; | ||
3734 | /* check if all cpu sizes are same */ | ||
3735 | for_each_tracing_cpu(cpu) { | ||
3736 | /* fill in the size from first enabled cpu */ | ||
3737 | if (size == 0) | ||
3738 | size = tr->data[cpu]->entries; | ||
3739 | if (size != tr->data[cpu]->entries) { | ||
3740 | buf_size_same = 0; | ||
3741 | break; | ||
3742 | } | ||
3743 | } | ||
3744 | |||
3745 | if (buf_size_same) { | ||
3746 | if (!ring_buffer_expanded) | ||
3747 | r = sprintf(buf, "%lu (expanded: %lu)\n", | ||
3748 | size >> 10, | ||
3749 | trace_buf_size >> 10); | ||
3750 | else | ||
3751 | r = sprintf(buf, "%lu\n", size >> 10); | ||
3752 | } else | ||
3753 | r = sprintf(buf, "X\n"); | ||
3754 | } else | ||
3755 | r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10); | ||
3756 | |||
3660 | mutex_unlock(&trace_types_lock); | 3757 | mutex_unlock(&trace_types_lock); |
3661 | 3758 | ||
3662 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | 3759 | ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); |
3760 | return ret; | ||
3663 | } | 3761 | } |
3664 | 3762 | ||
3665 | static ssize_t | 3763 | static ssize_t |
3666 | tracing_entries_write(struct file *filp, const char __user *ubuf, | 3764 | tracing_entries_write(struct file *filp, const char __user *ubuf, |
3667 | size_t cnt, loff_t *ppos) | 3765 | size_t cnt, loff_t *ppos) |
3668 | { | 3766 | { |
3767 | struct ftrace_entries_info *info = filp->private_data; | ||
3669 | unsigned long val; | 3768 | unsigned long val; |
3670 | int ret; | 3769 | int ret; |
3671 | 3770 | ||
@@ -3680,7 +3779,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
3680 | /* value is in KB */ | 3779 | /* value is in KB */ |
3681 | val <<= 10; | 3780 | val <<= 10; |
3682 | 3781 | ||
3683 | ret = tracing_resize_ring_buffer(val); | 3782 | ret = tracing_resize_ring_buffer(val, info->cpu); |
3684 | if (ret < 0) | 3783 | if (ret < 0) |
3685 | return ret; | 3784 | return ret; |
3686 | 3785 | ||
@@ -3689,6 +3788,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
3689 | return cnt; | 3788 | return cnt; |
3690 | } | 3789 | } |
3691 | 3790 | ||
3791 | static int | ||
3792 | tracing_entries_release(struct inode *inode, struct file *filp) | ||
3793 | { | ||
3794 | struct ftrace_entries_info *info = filp->private_data; | ||
3795 | |||
3796 | kfree(info); | ||
3797 | |||
3798 | return 0; | ||
3799 | } | ||
3800 | |||
3692 | static ssize_t | 3801 | static ssize_t |
3693 | tracing_total_entries_read(struct file *filp, char __user *ubuf, | 3802 | tracing_total_entries_read(struct file *filp, char __user *ubuf, |
3694 | size_t cnt, loff_t *ppos) | 3803 | size_t cnt, loff_t *ppos) |
@@ -3700,7 +3809,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, | |||
3700 | 3809 | ||
3701 | mutex_lock(&trace_types_lock); | 3810 | mutex_lock(&trace_types_lock); |
3702 | for_each_tracing_cpu(cpu) { | 3811 | for_each_tracing_cpu(cpu) { |
3703 | size += tr->entries >> 10; | 3812 | size += tr->data[cpu]->entries >> 10; |
3704 | if (!ring_buffer_expanded) | 3813 | if (!ring_buffer_expanded) |
3705 | expanded_size += trace_buf_size >> 10; | 3814 | expanded_size += trace_buf_size >> 10; |
3706 | } | 3815 | } |
@@ -3734,7 +3843,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) | |||
3734 | if (trace_flags & TRACE_ITER_STOP_ON_FREE) | 3843 | if (trace_flags & TRACE_ITER_STOP_ON_FREE) |
3735 | tracing_off(); | 3844 | tracing_off(); |
3736 | /* resize the ring buffer to 0 */ | 3845 | /* resize the ring buffer to 0 */ |
3737 | tracing_resize_ring_buffer(0); | 3846 | tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); |
3738 | 3847 | ||
3739 | return 0; | 3848 | return 0; |
3740 | } | 3849 | } |
@@ -3749,14 +3858,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3749 | struct print_entry *entry; | 3858 | struct print_entry *entry; |
3750 | unsigned long irq_flags; | 3859 | unsigned long irq_flags; |
3751 | struct page *pages[2]; | 3860 | struct page *pages[2]; |
3861 | void *map_page[2]; | ||
3752 | int nr_pages = 1; | 3862 | int nr_pages = 1; |
3753 | ssize_t written; | 3863 | ssize_t written; |
3754 | void *page1; | ||
3755 | void *page2; | ||
3756 | int offset; | 3864 | int offset; |
3757 | int size; | 3865 | int size; |
3758 | int len; | 3866 | int len; |
3759 | int ret; | 3867 | int ret; |
3868 | int i; | ||
3760 | 3869 | ||
3761 | if (tracing_disabled) | 3870 | if (tracing_disabled) |
3762 | return -EINVAL; | 3871 | return -EINVAL; |
@@ -3795,9 +3904,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3795 | goto out; | 3904 | goto out; |
3796 | } | 3905 | } |
3797 | 3906 | ||
3798 | page1 = kmap_atomic(pages[0]); | 3907 | for (i = 0; i < nr_pages; i++) |
3799 | if (nr_pages == 2) | 3908 | map_page[i] = kmap_atomic(pages[i]); |
3800 | page2 = kmap_atomic(pages[1]); | ||
3801 | 3909 | ||
3802 | local_save_flags(irq_flags); | 3910 | local_save_flags(irq_flags); |
3803 | size = sizeof(*entry) + cnt + 2; /* possible \n added */ | 3911 | size = sizeof(*entry) + cnt + 2; /* possible \n added */ |
@@ -3815,10 +3923,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3815 | 3923 | ||
3816 | if (nr_pages == 2) { | 3924 | if (nr_pages == 2) { |
3817 | len = PAGE_SIZE - offset; | 3925 | len = PAGE_SIZE - offset; |
3818 | memcpy(&entry->buf, page1 + offset, len); | 3926 | memcpy(&entry->buf, map_page[0] + offset, len); |
3819 | memcpy(&entry->buf[len], page2, cnt - len); | 3927 | memcpy(&entry->buf[len], map_page[1], cnt - len); |
3820 | } else | 3928 | } else |
3821 | memcpy(&entry->buf, page1 + offset, cnt); | 3929 | memcpy(&entry->buf, map_page[0] + offset, cnt); |
3822 | 3930 | ||
3823 | if (entry->buf[cnt - 1] != '\n') { | 3931 | if (entry->buf[cnt - 1] != '\n') { |
3824 | entry->buf[cnt] = '\n'; | 3932 | entry->buf[cnt] = '\n'; |
@@ -3833,11 +3941,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3833 | *fpos += written; | 3941 | *fpos += written; |
3834 | 3942 | ||
3835 | out_unlock: | 3943 | out_unlock: |
3836 | if (nr_pages == 2) | 3944 | for (i = 0; i < nr_pages; i++){ |
3837 | kunmap_atomic(page2); | 3945 | kunmap_atomic(map_page[i]); |
3838 | kunmap_atomic(page1); | 3946 | put_page(pages[i]); |
3839 | while (nr_pages > 0) | 3947 | } |
3840 | put_page(pages[--nr_pages]); | ||
3841 | out: | 3948 | out: |
3842 | return written; | 3949 | return written; |
3843 | } | 3950 | } |
@@ -3933,9 +4040,10 @@ static const struct file_operations tracing_pipe_fops = { | |||
3933 | }; | 4040 | }; |
3934 | 4041 | ||
3935 | static const struct file_operations tracing_entries_fops = { | 4042 | static const struct file_operations tracing_entries_fops = { |
3936 | .open = tracing_open_generic, | 4043 | .open = tracing_entries_open, |
3937 | .read = tracing_entries_read, | 4044 | .read = tracing_entries_read, |
3938 | .write = tracing_entries_write, | 4045 | .write = tracing_entries_write, |
4046 | .release = tracing_entries_release, | ||
3939 | .llseek = generic_file_llseek, | 4047 | .llseek = generic_file_llseek, |
3940 | }; | 4048 | }; |
3941 | 4049 | ||
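tracing_entries_fops now routes through tracing_entries_open()/tracing_entries_release(), which carry a small {trace_array, cpu} pair in file->private_data: the global buffer_size_kb file is registered with RING_BUFFER_ALL_CPUS as its inode data and prints "X" when the per-CPU sizes diverge, while each per-CPU debugfs directory gains a read-only buffer_size_kb (both registrations appear in later hunks). A small userspace sketch of what the new layout exposes; the paths assume the usual debugfs mount point and per_cpu/cpuN directory naming:

	#include <stdio.h>

	int main(void)
	{
		char line[64];
		FILE *f;

		/* one value if every CPU agrees, "X" otherwise */
		f = fopen("/sys/kernel/debug/tracing/buffer_size_kb", "r");
		if (f && fgets(line, sizeof(line), f))
			printf("global : %s", line);
		if (f)
			fclose(f);

		/* per-CPU view of the same setting (read-only, mode 0444) */
		f = fopen("/sys/kernel/debug/tracing/per_cpu/cpu0/buffer_size_kb", "r");
		if (f && fgets(line, sizeof(line), f))
			printf("cpu0   : %s", line);
		if (f)
			fclose(f);
		return 0;
	}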
@@ -4124,6 +4232,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
4124 | struct splice_pipe_desc spd = { | 4232 | struct splice_pipe_desc spd = { |
4125 | .pages = pages_def, | 4233 | .pages = pages_def, |
4126 | .partial = partial_def, | 4234 | .partial = partial_def, |
4235 | .nr_pages_max = PIPE_DEF_BUFFERS, | ||
4127 | .flags = flags, | 4236 | .flags = flags, |
4128 | .ops = &buffer_pipe_buf_ops, | 4237 | .ops = &buffer_pipe_buf_ops, |
4129 | .spd_release = buffer_spd_release, | 4238 | .spd_release = buffer_spd_release, |
@@ -4211,7 +4320,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
4211 | } | 4320 | } |
4212 | 4321 | ||
4213 | ret = splice_to_pipe(pipe, &spd); | 4322 | ret = splice_to_pipe(pipe, &spd); |
4214 | splice_shrink_spd(pipe, &spd); | 4323 | splice_shrink_spd(&spd); |
4215 | out: | 4324 | out: |
4216 | return ret; | 4325 | return ret; |
4217 | } | 4326 | } |
@@ -4367,6 +4476,9 @@ static void tracing_init_debugfs_percpu(long cpu) | |||
4367 | struct dentry *d_cpu; | 4476 | struct dentry *d_cpu; |
4368 | char cpu_dir[30]; /* 30 characters should be more than enough */ | 4477 | char cpu_dir[30]; /* 30 characters should be more than enough */ |
4369 | 4478 | ||
4479 | if (!d_percpu) | ||
4480 | return; | ||
4481 | |||
4370 | snprintf(cpu_dir, 30, "cpu%ld", cpu); | 4482 | snprintf(cpu_dir, 30, "cpu%ld", cpu); |
4371 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); | 4483 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); |
4372 | if (!d_cpu) { | 4484 | if (!d_cpu) { |
@@ -4387,6 +4499,9 @@ static void tracing_init_debugfs_percpu(long cpu) | |||
4387 | 4499 | ||
4388 | trace_create_file("stats", 0444, d_cpu, | 4500 | trace_create_file("stats", 0444, d_cpu, |
4389 | (void *) cpu, &tracing_stats_fops); | 4501 | (void *) cpu, &tracing_stats_fops); |
4502 | |||
4503 | trace_create_file("buffer_size_kb", 0444, d_cpu, | ||
4504 | (void *) cpu, &tracing_entries_fops); | ||
4390 | } | 4505 | } |
4391 | 4506 | ||
4392 | #ifdef CONFIG_FTRACE_SELFTEST | 4507 | #ifdef CONFIG_FTRACE_SELFTEST |
@@ -4718,7 +4833,7 @@ static __init int tracer_init_debugfs(void) | |||
4718 | (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); | 4833 | (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); |
4719 | 4834 | ||
4720 | trace_create_file("buffer_size_kb", 0644, d_tracer, | 4835 | trace_create_file("buffer_size_kb", 0644, d_tracer, |
4721 | &global_trace, &tracing_entries_fops); | 4836 | (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops); |
4722 | 4837 | ||
4723 | trace_create_file("buffer_total_size_kb", 0444, d_tracer, | 4838 | trace_create_file("buffer_total_size_kb", 0444, d_tracer, |
4724 | &global_trace, &tracing_total_entries_fops); | 4839 | &global_trace, &tracing_total_entries_fops); |
@@ -4957,6 +5072,10 @@ __init static int tracer_alloc_buffers(void) | |||
4957 | if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) | 5072 | if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) |
4958 | goto out_free_buffer_mask; | 5073 | goto out_free_buffer_mask; |
4959 | 5074 | ||
5075 | /* Only allocate trace_printk buffers if a trace_printk exists */ | ||
5076 | if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) | ||
5077 | trace_printk_init_buffers(); | ||
5078 | |||
4960 | /* To save memory, keep the ring buffer size to its minimum */ | 5079 | /* To save memory, keep the ring buffer size to its minimum */ |
4961 | if (ring_buffer_expanded) | 5080 | if (ring_buffer_expanded) |
4962 | ring_buf_size = trace_buf_size; | 5081 | ring_buf_size = trace_buf_size; |
@@ -4975,7 +5094,6 @@ __init static int tracer_alloc_buffers(void) | |||
4975 | WARN_ON(1); | 5094 | WARN_ON(1); |
4976 | goto out_free_cpumask; | 5095 | goto out_free_cpumask; |
4977 | } | 5096 | } |
4978 | global_trace.entries = ring_buffer_size(global_trace.buffer); | ||
4979 | if (global_trace.buffer_disabled) | 5097 | if (global_trace.buffer_disabled) |
4980 | tracing_off(); | 5098 | tracing_off(); |
4981 | 5099 | ||
@@ -4988,7 +5106,6 @@ __init static int tracer_alloc_buffers(void) | |||
4988 | ring_buffer_free(global_trace.buffer); | 5106 | ring_buffer_free(global_trace.buffer); |
4989 | goto out_free_cpumask; | 5107 | goto out_free_cpumask; |
4990 | } | 5108 | } |
4991 | max_tr.entries = 1; | ||
4992 | #endif | 5109 | #endif |
4993 | 5110 | ||
4994 | /* Allocate the first page for all buffers */ | 5111 | /* Allocate the first page for all buffers */ |
@@ -4997,6 +5114,12 @@ __init static int tracer_alloc_buffers(void) | |||
4997 | max_tr.data[i] = &per_cpu(max_tr_data, i); | 5114 | max_tr.data[i] = &per_cpu(max_tr_data, i); |
4998 | } | 5115 | } |
4999 | 5116 | ||
5117 | set_buffer_entries(&global_trace, | ||
5118 | ring_buffer_size(global_trace.buffer, 0)); | ||
5119 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
5120 | set_buffer_entries(&max_tr, 1); | ||
5121 | #endif | ||
5122 | |||
5000 | trace_init_cmdlines(); | 5123 | trace_init_cmdlines(); |
5001 | 5124 | ||
5002 | register_tracer(&nop_trace); | 5125 | register_tracer(&nop_trace); |
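tracer_alloc_buffers() now calls trace_printk_init_buffers() only when the kernel image actually contains a trace_printk() call site, detected by comparing the linker-provided bounds of the format-string section (the matching extern arrays and the prototype are added to trace.h below). A minimal sketch of the check, using the symbols exactly as they appear in the hunk above; how each call site populates the section is assumed rather than shown here:

	#include <linux/types.h>

	/* Each trace_printk() call site drops a pointer to its format string
	 * into a dedicated linker section; the linker emits start/stop
	 * symbols around it, so an empty section means no call sites were
	 * compiled in. */
	extern const char *__start___trace_bprintk_fmt[];
	extern const char *__stop___trace_bprintk_fmt[];

	static bool kernel_has_trace_printk(void)	/* illustrative name */
	{
		return __stop___trace_bprintk_fmt != __start___trace_bprintk_fmt;
	}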
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f95d65da6db8..5aec220d2de0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -103,6 +103,11 @@ struct kretprobe_trace_entry_head { | |||
103 | unsigned long ret_ip; | 103 | unsigned long ret_ip; |
104 | }; | 104 | }; |
105 | 105 | ||
106 | struct uprobe_trace_entry_head { | ||
107 | struct trace_entry ent; | ||
108 | unsigned long ip; | ||
109 | }; | ||
110 | |||
106 | /* | 111 | /* |
107 | * trace_flag_type is an enumeration that holds different | 112 | * trace_flag_type is an enumeration that holds different |
108 | * states when a trace occurs. These are: | 113 | * states when a trace occurs. These are: |
@@ -131,6 +136,7 @@ struct trace_array_cpu { | |||
131 | atomic_t disabled; | 136 | atomic_t disabled; |
132 | void *buffer_page; /* ring buffer spare */ | 137 | void *buffer_page; /* ring buffer spare */ |
133 | 138 | ||
139 | unsigned long entries; | ||
134 | unsigned long saved_latency; | 140 | unsigned long saved_latency; |
135 | unsigned long critical_start; | 141 | unsigned long critical_start; |
136 | unsigned long critical_end; | 142 | unsigned long critical_end; |
@@ -152,7 +158,6 @@ struct trace_array_cpu { | |||
152 | */ | 158 | */ |
153 | struct trace_array { | 159 | struct trace_array { |
154 | struct ring_buffer *buffer; | 160 | struct ring_buffer *buffer; |
155 | unsigned long entries; | ||
156 | int cpu; | 161 | int cpu; |
157 | int buffer_disabled; | 162 | int buffer_disabled; |
158 | cycle_t time_start; | 163 | cycle_t time_start; |
@@ -826,6 +831,8 @@ extern struct list_head ftrace_events; | |||
826 | extern const char *__start___trace_bprintk_fmt[]; | 831 | extern const char *__start___trace_bprintk_fmt[]; |
827 | extern const char *__stop___trace_bprintk_fmt[]; | 832 | extern const char *__stop___trace_bprintk_fmt[]; |
828 | 833 | ||
834 | void trace_printk_init_buffers(void); | ||
835 | |||
829 | #undef FTRACE_ENTRY | 836 | #undef FTRACE_ENTRY |
830 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ | 837 | #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ |
831 | extern struct ftrace_event_call \ | 838 | extern struct ftrace_event_call \ |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 079a93ae8a9d..29111da1d100 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -294,6 +294,9 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, | |||
294 | if (!call->name || !call->class || !call->class->reg) | 294 | if (!call->name || !call->class || !call->class->reg) |
295 | continue; | 295 | continue; |
296 | 296 | ||
297 | if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) | ||
298 | continue; | ||
299 | |||
297 | if (match && | 300 | if (match && |
298 | strcmp(match, call->name) != 0 && | 301 | strcmp(match, call->name) != 0 && |
299 | strcmp(match, call->class->system) != 0) | 302 | strcmp(match, call->class->system) != 0) |
@@ -1164,7 +1167,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | |||
1164 | return -1; | 1167 | return -1; |
1165 | } | 1168 | } |
1166 | 1169 | ||
1167 | if (call->class->reg) | 1170 | if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) |
1168 | trace_create_file("enable", 0644, call->dir, call, | 1171 | trace_create_file("enable", 0644, call->dir, call, |
1169 | enable); | 1172 | enable); |
1170 | 1173 | ||
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 3dd15e8bc856..e039906b037d 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -180,6 +180,7 @@ struct ftrace_event_call __used event_##call = { \ | |||
180 | .event.type = etype, \ | 180 | .event.type = etype, \ |
181 | .class = &event_class_ftrace_##call, \ | 181 | .class = &event_class_ftrace_##call, \ |
182 | .print_fmt = print, \ | 182 | .print_fmt = print, \ |
183 | .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ | ||
183 | }; \ | 184 | }; \ |
184 | struct ftrace_event_call __used \ | 185 | struct ftrace_event_call __used \ |
185 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; | 186 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; |
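The trace_events.c and trace_export.c hunks above tag the ftrace-internal events (those defined via FTRACE_ENTRY in trace_export.c) with TRACE_EVENT_FL_IGNORE_ENABLE: __ftrace_set_clr_event() now skips them and event_create_dir() stops creating an "enable" file for them, so they remain visible but cannot be toggled from debugfs. Any other code that enumerates events and flips their state is presumably expected to honour the same flag; a hypothetical helper, only to spell out the condition the two hunks share:

	#include <linux/ftrace_event.h>

	/* illustrative only -- not a helper added by this series */
	static bool event_user_togglable(struct ftrace_event_call *call)
	{
		return call->name && call->class && call->class->reg &&
		       !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE);
	}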
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 580a05ec926b..b31d3d5699fe 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -19,547 +19,15 @@ | |||
19 | 19 | ||
20 | #include <linux/module.h> | 20 | #include <linux/module.h> |
21 | #include <linux/uaccess.h> | 21 | #include <linux/uaccess.h> |
22 | #include <linux/kprobes.h> | ||
23 | #include <linux/seq_file.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/smp.h> | ||
26 | #include <linux/debugfs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/string.h> | ||
29 | #include <linux/ctype.h> | ||
30 | #include <linux/ptrace.h> | ||
31 | #include <linux/perf_event.h> | ||
32 | #include <linux/stringify.h> | ||
33 | #include <linux/limits.h> | ||
34 | #include <asm/bitsperlong.h> | ||
35 | |||
36 | #include "trace.h" | ||
37 | #include "trace_output.h" | ||
38 | |||
39 | #define MAX_TRACE_ARGS 128 | ||
40 | #define MAX_ARGSTR_LEN 63 | ||
41 | #define MAX_EVENT_NAME_LEN 64 | ||
42 | #define MAX_STRING_SIZE PATH_MAX | ||
43 | #define KPROBE_EVENT_SYSTEM "kprobes" | ||
44 | |||
45 | /* Reserved field names */ | ||
46 | #define FIELD_STRING_IP "__probe_ip" | ||
47 | #define FIELD_STRING_RETIP "__probe_ret_ip" | ||
48 | #define FIELD_STRING_FUNC "__probe_func" | ||
49 | |||
50 | const char *reserved_field_names[] = { | ||
51 | "common_type", | ||
52 | "common_flags", | ||
53 | "common_preempt_count", | ||
54 | "common_pid", | ||
55 | "common_tgid", | ||
56 | FIELD_STRING_IP, | ||
57 | FIELD_STRING_RETIP, | ||
58 | FIELD_STRING_FUNC, | ||
59 | }; | ||
60 | |||
61 | /* Printing function type */ | ||
62 | typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, | ||
63 | void *); | ||
64 | #define PRINT_TYPE_FUNC_NAME(type) print_type_##type | ||
65 | #define PRINT_TYPE_FMT_NAME(type) print_type_format_##type | ||
66 | |||
67 | /* Printing in basic type function template */ | ||
68 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ | ||
69 | static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ | ||
70 | const char *name, \ | ||
71 | void *data, void *ent)\ | ||
72 | { \ | ||
73 | return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ | ||
74 | } \ | ||
75 | static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; | ||
76 | |||
77 | DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) | ||
78 | DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) | ||
79 | DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) | ||
80 | DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) | ||
81 | DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) | ||
82 | DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) | ||
83 | DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) | ||
84 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) | ||
85 | |||
86 | /* data_rloc: data relative location, compatible with u32 */ | ||
87 | #define make_data_rloc(len, roffs) \ | ||
88 | (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) | ||
89 | #define get_rloc_len(dl) ((u32)(dl) >> 16) | ||
90 | #define get_rloc_offs(dl) ((u32)(dl) & 0xffff) | ||
91 | |||
92 | static inline void *get_rloc_data(u32 *dl) | ||
93 | { | ||
94 | return (u8 *)dl + get_rloc_offs(*dl); | ||
95 | } | ||
96 | |||
97 | /* For data_loc conversion */ | ||
98 | static inline void *get_loc_data(u32 *dl, void *ent) | ||
99 | { | ||
100 | return (u8 *)ent + get_rloc_offs(*dl); | ||
101 | } | ||
102 | |||
103 | /* | ||
104 | * Convert data_rloc to data_loc: | ||
105 | * data_rloc stores the offset from data_rloc itself, but data_loc | ||
106 | * stores the offset from event entry. | ||
107 | */ | ||
108 | #define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) | ||
109 | |||
110 | /* For defining macros, define string/string_size types */ | ||
111 | typedef u32 string; | ||
112 | typedef u32 string_size; | ||
113 | |||
114 | /* Print type function for string type */ | ||
115 | static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, | ||
116 | const char *name, | ||
117 | void *data, void *ent) | ||
118 | { | ||
119 | int len = *(u32 *)data >> 16; | ||
120 | |||
121 | if (!len) | ||
122 | return trace_seq_printf(s, " %s=(fault)", name); | ||
123 | else | ||
124 | return trace_seq_printf(s, " %s=\"%s\"", name, | ||
125 | (const char *)get_loc_data(data, ent)); | ||
126 | } | ||
127 | static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; | ||
128 | |||
129 | /* Data fetch function type */ | ||
130 | typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); | ||
131 | |||
132 | struct fetch_param { | ||
133 | fetch_func_t fn; | ||
134 | void *data; | ||
135 | }; | ||
136 | |||
137 | static __kprobes void call_fetch(struct fetch_param *fprm, | ||
138 | struct pt_regs *regs, void *dest) | ||
139 | { | ||
140 | return fprm->fn(regs, fprm->data, dest); | ||
141 | } | ||
142 | |||
143 | #define FETCH_FUNC_NAME(method, type) fetch_##method##_##type | ||
144 | /* | ||
145 | * Define macro for basic types - we don't need to define s* types, because | ||
146 | * we have to care only about bitwidth at recording time. | ||
147 | */ | ||
148 | #define DEFINE_BASIC_FETCH_FUNCS(method) \ | ||
149 | DEFINE_FETCH_##method(u8) \ | ||
150 | DEFINE_FETCH_##method(u16) \ | ||
151 | DEFINE_FETCH_##method(u32) \ | ||
152 | DEFINE_FETCH_##method(u64) | ||
153 | |||
154 | #define CHECK_FETCH_FUNCS(method, fn) \ | ||
155 | (((FETCH_FUNC_NAME(method, u8) == fn) || \ | ||
156 | (FETCH_FUNC_NAME(method, u16) == fn) || \ | ||
157 | (FETCH_FUNC_NAME(method, u32) == fn) || \ | ||
158 | (FETCH_FUNC_NAME(method, u64) == fn) || \ | ||
159 | (FETCH_FUNC_NAME(method, string) == fn) || \ | ||
160 | (FETCH_FUNC_NAME(method, string_size) == fn)) \ | ||
161 | && (fn != NULL)) | ||
162 | |||
163 | /* Data fetch function templates */ | ||
164 | #define DEFINE_FETCH_reg(type) \ | ||
165 | static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ | ||
166 | void *offset, void *dest) \ | ||
167 | { \ | ||
168 | *(type *)dest = (type)regs_get_register(regs, \ | ||
169 | (unsigned int)((unsigned long)offset)); \ | ||
170 | } | ||
171 | DEFINE_BASIC_FETCH_FUNCS(reg) | ||
172 | /* No string on the register */ | ||
173 | #define fetch_reg_string NULL | ||
174 | #define fetch_reg_string_size NULL | ||
175 | |||
176 | #define DEFINE_FETCH_stack(type) \ | ||
177 | static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ | ||
178 | void *offset, void *dest) \ | ||
179 | { \ | ||
180 | *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ | ||
181 | (unsigned int)((unsigned long)offset)); \ | ||
182 | } | ||
183 | DEFINE_BASIC_FETCH_FUNCS(stack) | ||
184 | /* No string on the stack entry */ | ||
185 | #define fetch_stack_string NULL | ||
186 | #define fetch_stack_string_size NULL | ||
187 | |||
188 | #define DEFINE_FETCH_retval(type) \ | ||
189 | static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ | ||
190 | void *dummy, void *dest) \ | ||
191 | { \ | ||
192 | *(type *)dest = (type)regs_return_value(regs); \ | ||
193 | } | ||
194 | DEFINE_BASIC_FETCH_FUNCS(retval) | ||
195 | /* No string on the retval */ | ||
196 | #define fetch_retval_string NULL | ||
197 | #define fetch_retval_string_size NULL | ||
198 | |||
199 | #define DEFINE_FETCH_memory(type) \ | ||
200 | static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ | ||
201 | void *addr, void *dest) \ | ||
202 | { \ | ||
203 | type retval; \ | ||
204 | if (probe_kernel_address(addr, retval)) \ | ||
205 | *(type *)dest = 0; \ | ||
206 | else \ | ||
207 | *(type *)dest = retval; \ | ||
208 | } | ||
209 | DEFINE_BASIC_FETCH_FUNCS(memory) | ||
210 | /* | ||
211 | * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max | ||
212 | * length and relative data location. | ||
213 | */ | ||
214 | static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, | ||
215 | void *addr, void *dest) | ||
216 | { | ||
217 | long ret; | ||
218 | int maxlen = get_rloc_len(*(u32 *)dest); | ||
219 | u8 *dst = get_rloc_data(dest); | ||
220 | u8 *src = addr; | ||
221 | mm_segment_t old_fs = get_fs(); | ||
222 | if (!maxlen) | ||
223 | return; | ||
224 | /* | ||
225 | * Try to get string again, since the string can be changed while | ||
226 | * probing. | ||
227 | */ | ||
228 | set_fs(KERNEL_DS); | ||
229 | pagefault_disable(); | ||
230 | do | ||
231 | ret = __copy_from_user_inatomic(dst++, src++, 1); | ||
232 | while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); | ||
233 | dst[-1] = '\0'; | ||
234 | pagefault_enable(); | ||
235 | set_fs(old_fs); | ||
236 | |||
237 | if (ret < 0) { /* Failed to fetch string */ | ||
238 | ((u8 *)get_rloc_data(dest))[0] = '\0'; | ||
239 | *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); | ||
240 | } else | ||
241 | *(u32 *)dest = make_data_rloc(src - (u8 *)addr, | ||
242 | get_rloc_offs(*(u32 *)dest)); | ||
243 | } | ||
244 | /* Return the length of string -- including null terminal byte */ | ||
245 | static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, | ||
246 | void *addr, void *dest) | ||
247 | { | ||
248 | int ret, len = 0; | ||
249 | u8 c; | ||
250 | mm_segment_t old_fs = get_fs(); | ||
251 | |||
252 | set_fs(KERNEL_DS); | ||
253 | pagefault_disable(); | ||
254 | do { | ||
255 | ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); | ||
256 | len++; | ||
257 | } while (c && ret == 0 && len < MAX_STRING_SIZE); | ||
258 | pagefault_enable(); | ||
259 | set_fs(old_fs); | ||
260 | |||
261 | if (ret < 0) /* Failed to check the length */ | ||
262 | *(u32 *)dest = 0; | ||
263 | else | ||
264 | *(u32 *)dest = len; | ||
265 | } | ||
266 | |||
267 | /* Memory fetching by symbol */ | ||
268 | struct symbol_cache { | ||
269 | char *symbol; | ||
270 | long offset; | ||
271 | unsigned long addr; | ||
272 | }; | ||
273 | |||
274 | static unsigned long update_symbol_cache(struct symbol_cache *sc) | ||
275 | { | ||
276 | sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); | ||
277 | if (sc->addr) | ||
278 | sc->addr += sc->offset; | ||
279 | return sc->addr; | ||
280 | } | ||
281 | |||
282 | static void free_symbol_cache(struct symbol_cache *sc) | ||
283 | { | ||
284 | kfree(sc->symbol); | ||
285 | kfree(sc); | ||
286 | } | ||
287 | |||
288 | static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) | ||
289 | { | ||
290 | struct symbol_cache *sc; | ||
291 | |||
292 | if (!sym || strlen(sym) == 0) | ||
293 | return NULL; | ||
294 | sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); | ||
295 | if (!sc) | ||
296 | return NULL; | ||
297 | |||
298 | sc->symbol = kstrdup(sym, GFP_KERNEL); | ||
299 | if (!sc->symbol) { | ||
300 | kfree(sc); | ||
301 | return NULL; | ||
302 | } | ||
303 | sc->offset = offset; | ||
304 | 22 | ||
305 | update_symbol_cache(sc); | 23 | #include "trace_probe.h" |
306 | return sc; | ||
307 | } | ||
308 | |||
309 | #define DEFINE_FETCH_symbol(type) \ | ||
310 | static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ | ||
311 | void *data, void *dest) \ | ||
312 | { \ | ||
313 | struct symbol_cache *sc = data; \ | ||
314 | if (sc->addr) \ | ||
315 | fetch_memory_##type(regs, (void *)sc->addr, dest); \ | ||
316 | else \ | ||
317 | *(type *)dest = 0; \ | ||
318 | } | ||
319 | DEFINE_BASIC_FETCH_FUNCS(symbol) | ||
320 | DEFINE_FETCH_symbol(string) | ||
321 | DEFINE_FETCH_symbol(string_size) | ||
322 | |||
323 | /* Dereference memory access function */ | ||
324 | struct deref_fetch_param { | ||
325 | struct fetch_param orig; | ||
326 | long offset; | ||
327 | }; | ||
328 | |||
329 | #define DEFINE_FETCH_deref(type) \ | ||
330 | static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ | ||
331 | void *data, void *dest) \ | ||
332 | { \ | ||
333 | struct deref_fetch_param *dprm = data; \ | ||
334 | unsigned long addr; \ | ||
335 | call_fetch(&dprm->orig, regs, &addr); \ | ||
336 | if (addr) { \ | ||
337 | addr += dprm->offset; \ | ||
338 | fetch_memory_##type(regs, (void *)addr, dest); \ | ||
339 | } else \ | ||
340 | *(type *)dest = 0; \ | ||
341 | } | ||
342 | DEFINE_BASIC_FETCH_FUNCS(deref) | ||
343 | DEFINE_FETCH_deref(string) | ||
344 | DEFINE_FETCH_deref(string_size) | ||
345 | |||
346 | static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) | ||
347 | { | ||
348 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
349 | update_deref_fetch_param(data->orig.data); | ||
350 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
351 | update_symbol_cache(data->orig.data); | ||
352 | } | ||
353 | |||
354 | static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) | ||
355 | { | ||
356 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
357 | free_deref_fetch_param(data->orig.data); | ||
358 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
359 | free_symbol_cache(data->orig.data); | ||
360 | kfree(data); | ||
361 | } | ||
362 | |||
363 | /* Bitfield fetch function */ | ||
364 | struct bitfield_fetch_param { | ||
365 | struct fetch_param orig; | ||
366 | unsigned char hi_shift; | ||
367 | unsigned char low_shift; | ||
368 | }; | ||
369 | 24 | ||
370 | #define DEFINE_FETCH_bitfield(type) \ | 25 | #define KPROBE_EVENT_SYSTEM "kprobes" |
371 | static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ | ||
372 | void *data, void *dest) \ | ||
373 | { \ | ||
374 | struct bitfield_fetch_param *bprm = data; \ | ||
375 | type buf = 0; \ | ||
376 | call_fetch(&bprm->orig, regs, &buf); \ | ||
377 | if (buf) { \ | ||
378 | buf <<= bprm->hi_shift; \ | ||
379 | buf >>= bprm->low_shift; \ | ||
380 | } \ | ||
381 | *(type *)dest = buf; \ | ||
382 | } | ||
383 | DEFINE_BASIC_FETCH_FUNCS(bitfield) | ||
384 | #define fetch_bitfield_string NULL | ||
385 | #define fetch_bitfield_string_size NULL | ||
386 | |||
387 | static __kprobes void | ||
388 | update_bitfield_fetch_param(struct bitfield_fetch_param *data) | ||
389 | { | ||
390 | /* | ||
391 | * Don't check the bitfield itself, because this must be the | ||
392 | * last fetch function. | ||
393 | */ | ||
394 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
395 | update_deref_fetch_param(data->orig.data); | ||
396 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
397 | update_symbol_cache(data->orig.data); | ||
398 | } | ||
399 | |||
400 | static __kprobes void | ||
401 | free_bitfield_fetch_param(struct bitfield_fetch_param *data) | ||
402 | { | ||
403 | /* | ||
404 | * Don't check the bitfield itself, because this must be the | ||
405 | * last fetch function. | ||
406 | */ | ||
407 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
408 | free_deref_fetch_param(data->orig.data); | ||
409 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
410 | free_symbol_cache(data->orig.data); | ||
411 | kfree(data); | ||
412 | } | ||
413 | |||
414 | /* Default (unsigned long) fetch type */ | ||
415 | #define __DEFAULT_FETCH_TYPE(t) u##t | ||
416 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) | ||
417 | #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) | ||
418 | #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) | ||
419 | |||
420 | /* Fetch types */ | ||
421 | enum { | ||
422 | FETCH_MTD_reg = 0, | ||
423 | FETCH_MTD_stack, | ||
424 | FETCH_MTD_retval, | ||
425 | FETCH_MTD_memory, | ||
426 | FETCH_MTD_symbol, | ||
427 | FETCH_MTD_deref, | ||
428 | FETCH_MTD_bitfield, | ||
429 | FETCH_MTD_END, | ||
430 | }; | ||
431 | |||
432 | #define ASSIGN_FETCH_FUNC(method, type) \ | ||
433 | [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) | ||
434 | |||
435 | #define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ | ||
436 | {.name = _name, \ | ||
437 | .size = _size, \ | ||
438 | .is_signed = sign, \ | ||
439 | .print = PRINT_TYPE_FUNC_NAME(ptype), \ | ||
440 | .fmt = PRINT_TYPE_FMT_NAME(ptype), \ | ||
441 | .fmttype = _fmttype, \ | ||
442 | .fetch = { \ | ||
443 | ASSIGN_FETCH_FUNC(reg, ftype), \ | ||
444 | ASSIGN_FETCH_FUNC(stack, ftype), \ | ||
445 | ASSIGN_FETCH_FUNC(retval, ftype), \ | ||
446 | ASSIGN_FETCH_FUNC(memory, ftype), \ | ||
447 | ASSIGN_FETCH_FUNC(symbol, ftype), \ | ||
448 | ASSIGN_FETCH_FUNC(deref, ftype), \ | ||
449 | ASSIGN_FETCH_FUNC(bitfield, ftype), \ | ||
450 | } \ | ||
451 | } | ||
452 | |||
453 | #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ | ||
454 | __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) | ||
455 | |||
456 | #define FETCH_TYPE_STRING 0 | ||
457 | #define FETCH_TYPE_STRSIZE 1 | ||
458 | |||
459 | /* Fetch type information table */ | ||
460 | static const struct fetch_type { | ||
461 | const char *name; /* Name of type */ | ||
462 | size_t size; /* Byte size of type */ | ||
463 | int is_signed; /* Signed flag */ | ||
464 | print_type_func_t print; /* Print functions */ | ||
465 | const char *fmt; /* Fromat string */ | ||
466 | const char *fmttype; /* Name in format file */ | ||
467 | /* Fetch functions */ | ||
468 | fetch_func_t fetch[FETCH_MTD_END]; | ||
469 | } fetch_type_table[] = { | ||
470 | /* Special types */ | ||
471 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | ||
472 | sizeof(u32), 1, "__data_loc char[]"), | ||
473 | [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, | ||
474 | string_size, sizeof(u32), 0, "u32"), | ||
475 | /* Basic types */ | ||
476 | ASSIGN_FETCH_TYPE(u8, u8, 0), | ||
477 | ASSIGN_FETCH_TYPE(u16, u16, 0), | ||
478 | ASSIGN_FETCH_TYPE(u32, u32, 0), | ||
479 | ASSIGN_FETCH_TYPE(u64, u64, 0), | ||
480 | ASSIGN_FETCH_TYPE(s8, u8, 1), | ||
481 | ASSIGN_FETCH_TYPE(s16, u16, 1), | ||
482 | ASSIGN_FETCH_TYPE(s32, u32, 1), | ||
483 | ASSIGN_FETCH_TYPE(s64, u64, 1), | ||
484 | }; | ||
485 | |||
486 | static const struct fetch_type *find_fetch_type(const char *type) | ||
487 | { | ||
488 | int i; | ||
489 | |||
490 | if (!type) | ||
491 | type = DEFAULT_FETCH_TYPE_STR; | ||
492 | |||
493 | /* Special case: bitfield */ | ||
494 | if (*type == 'b') { | ||
495 | unsigned long bs; | ||
496 | type = strchr(type, '/'); | ||
497 | if (!type) | ||
498 | goto fail; | ||
499 | type++; | ||
500 | if (strict_strtoul(type, 0, &bs)) | ||
501 | goto fail; | ||
502 | switch (bs) { | ||
503 | case 8: | ||
504 | return find_fetch_type("u8"); | ||
505 | case 16: | ||
506 | return find_fetch_type("u16"); | ||
507 | case 32: | ||
508 | return find_fetch_type("u32"); | ||
509 | case 64: | ||
510 | return find_fetch_type("u64"); | ||
511 | default: | ||
512 | goto fail; | ||
513 | } | ||
514 | } | ||
515 | |||
516 | for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) | ||
517 | if (strcmp(type, fetch_type_table[i].name) == 0) | ||
518 | return &fetch_type_table[i]; | ||
519 | fail: | ||
520 | return NULL; | ||
521 | } | ||
522 | |||
523 | /* Special function : only accept unsigned long */ | ||
524 | static __kprobes void fetch_stack_address(struct pt_regs *regs, | ||
525 | void *dummy, void *dest) | ||
526 | { | ||
527 | *(unsigned long *)dest = kernel_stack_pointer(regs); | ||
528 | } | ||
529 | |||
530 | static fetch_func_t get_fetch_size_function(const struct fetch_type *type, | ||
531 | fetch_func_t orig_fn) | ||
532 | { | ||
533 | int i; | ||
534 | |||
535 | if (type != &fetch_type_table[FETCH_TYPE_STRING]) | ||
536 | return NULL; /* Only string type needs size function */ | ||
537 | for (i = 0; i < FETCH_MTD_END; i++) | ||
538 | if (type->fetch[i] == orig_fn) | ||
539 | return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; | ||
540 | |||
541 | WARN_ON(1); /* This should not happen */ | ||
542 | return NULL; | ||
543 | } | ||
544 | 26 | ||
545 | /** | 27 | /** |
546 | * Kprobe event core functions | 28 | * Kprobe event core functions |
547 | */ | 29 | */ |
548 | 30 | ||
549 | struct probe_arg { | ||
550 | struct fetch_param fetch; | ||
551 | struct fetch_param fetch_size; | ||
552 | unsigned int offset; /* Offset from argument entry */ | ||
553 | const char *name; /* Name of this argument */ | ||
554 | const char *comm; /* Command of this argument */ | ||
555 | const struct fetch_type *type; /* Type of this argument */ | ||
556 | }; | ||
557 | |||
558 | /* Flags for trace_probe */ | ||
559 | #define TP_FLAG_TRACE 1 | ||
560 | #define TP_FLAG_PROFILE 2 | ||
561 | #define TP_FLAG_REGISTERED 4 | ||
562 | |||
563 | struct trace_probe { | 31 | struct trace_probe { |
564 | struct list_head list; | 32 | struct list_head list; |
565 | struct kretprobe rp; /* Use rp.kp for kprobe use */ | 33 | struct kretprobe rp; /* Use rp.kp for kprobe use */ |
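The roughly 500 deleted lines above (and the further helper removals in the hunks below, such as is_good_name() and the symbol+offset parsing) do not drop the kprobe argument-fetching machinery; it moves out of trace_kprobe.c behind the new #include "trace_probe.h", and the surviving code calls it under traceprobe_* names (traceprobe_update_arg(), traceprobe_free_probe_arg()). Together with the uprobe_trace_entry_head added to trace.h earlier in this section, that points at the code being shared between kprobe- and uprobe-based events. The shared header itself is not part of this section; the sketch below is only the interface it is assumed to export, limited to the symbols the remaining trace_kprobe.c hunks reference, with real prototypes possibly differing:

	/* assumed shape of kernel/trace/trace_probe.h -- sketch only */
	#ifndef _TRACE_PROBE_H
	#define _TRACE_PROBE_H

	struct pt_regs;
	struct fetch_type;

	struct fetch_param {
		void	(*fn)(struct pt_regs *regs, void *data, void *dest);
		void	*data;
	};

	struct probe_arg {
		struct fetch_param	fetch;
		struct fetch_param	fetch_size;
		unsigned int		offset;		/* offset from argument entry */
		const char		*name;		/* name of this argument */
		const char		*comm;		/* command of this argument */
		const struct fetch_type	*type;		/* type of this argument */
	};

	void traceprobe_update_arg(struct probe_arg *arg);
	void traceprobe_free_probe_arg(struct probe_arg *arg);

	#endif /* _TRACE_PROBE_H */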
@@ -631,18 +99,6 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); | |||
631 | static int kretprobe_dispatcher(struct kretprobe_instance *ri, | 99 | static int kretprobe_dispatcher(struct kretprobe_instance *ri, |
632 | struct pt_regs *regs); | 100 | struct pt_regs *regs); |
633 | 101 | ||
634 | /* Check the name is good for event/group/fields */ | ||
635 | static int is_good_name(const char *name) | ||
636 | { | ||
637 | if (!isalpha(*name) && *name != '_') | ||
638 | return 0; | ||
639 | while (*++name != '\0') { | ||
640 | if (!isalpha(*name) && !isdigit(*name) && *name != '_') | ||
641 | return 0; | ||
642 | } | ||
643 | return 1; | ||
644 | } | ||
645 | |||
646 | /* | 102 | /* |
647 | * Allocate new trace_probe and initialize it (including kprobes). | 103 | * Allocate new trace_probe and initialize it (including kprobes). |
648 | */ | 104 | */ |
@@ -651,7 +107,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, | |||
651 | void *addr, | 107 | void *addr, |
652 | const char *symbol, | 108 | const char *symbol, |
653 | unsigned long offs, | 109 | unsigned long offs, |
654 | int nargs, int is_return) | 110 | int nargs, bool is_return) |
655 | { | 111 | { |
656 | struct trace_probe *tp; | 112 | struct trace_probe *tp; |
657 | int ret = -ENOMEM; | 113 | int ret = -ENOMEM; |
@@ -702,34 +158,12 @@ error: | |||
702 | return ERR_PTR(ret); | 158 | return ERR_PTR(ret); |
703 | } | 159 | } |
704 | 160 | ||
705 | static void update_probe_arg(struct probe_arg *arg) | ||
706 | { | ||
707 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) | ||
708 | update_bitfield_fetch_param(arg->fetch.data); | ||
709 | else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | ||
710 | update_deref_fetch_param(arg->fetch.data); | ||
711 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) | ||
712 | update_symbol_cache(arg->fetch.data); | ||
713 | } | ||
714 | |||
715 | static void free_probe_arg(struct probe_arg *arg) | ||
716 | { | ||
717 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) | ||
718 | free_bitfield_fetch_param(arg->fetch.data); | ||
719 | else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | ||
720 | free_deref_fetch_param(arg->fetch.data); | ||
721 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) | ||
722 | free_symbol_cache(arg->fetch.data); | ||
723 | kfree(arg->name); | ||
724 | kfree(arg->comm); | ||
725 | } | ||
726 | |||
727 | static void free_trace_probe(struct trace_probe *tp) | 161 | static void free_trace_probe(struct trace_probe *tp) |
728 | { | 162 | { |
729 | int i; | 163 | int i; |
730 | 164 | ||
731 | for (i = 0; i < tp->nr_args; i++) | 165 | for (i = 0; i < tp->nr_args; i++) |
732 | free_probe_arg(&tp->args[i]); | 166 | traceprobe_free_probe_arg(&tp->args[i]); |
733 | 167 | ||
734 | kfree(tp->call.class->system); | 168 | kfree(tp->call.class->system); |
735 | kfree(tp->call.name); | 169 | kfree(tp->call.name); |
@@ -787,7 +221,7 @@ static int __register_trace_probe(struct trace_probe *tp) | |||
787 | return -EINVAL; | 221 | return -EINVAL; |
788 | 222 | ||
789 | for (i = 0; i < tp->nr_args; i++) | 223 | for (i = 0; i < tp->nr_args; i++) |
790 | update_probe_arg(&tp->args[i]); | 224 | traceprobe_update_arg(&tp->args[i]); |
791 | 225 | ||
792 | /* Set/clear disabled flag according to tp->flag */ | 226 | /* Set/clear disabled flag according to tp->flag */ |
793 | if (trace_probe_is_enabled(tp)) | 227 | if (trace_probe_is_enabled(tp)) |
@@ -919,227 +353,6 @@ static struct notifier_block trace_probe_module_nb = { | |||
919 | .priority = 1 /* Invoked after kprobe module callback */ | 353 | .priority = 1 /* Invoked after kprobe module callback */ |
920 | }; | 354 | }; |
921 | 355 | ||
922 | /* Split symbol and offset. */ | ||
923 | static int split_symbol_offset(char *symbol, unsigned long *offset) | ||
924 | { | ||
925 | char *tmp; | ||
926 | int ret; | ||
927 | |||
928 | if (!offset) | ||
929 | return -EINVAL; | ||
930 | |||
931 | tmp = strchr(symbol, '+'); | ||
932 | if (tmp) { | ||
933 | /* skip sign because strict_strtol doesn't accept '+' */ | ||
934 | ret = strict_strtoul(tmp + 1, 0, offset); | ||
935 | if (ret) | ||
936 | return ret; | ||
937 | *tmp = '\0'; | ||
938 | } else | ||
939 | *offset = 0; | ||
940 | return 0; | ||
941 | } | ||
942 | |||
943 | #define PARAM_MAX_ARGS 16 | ||
944 | #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) | ||
945 | |||
946 | static int parse_probe_vars(char *arg, const struct fetch_type *t, | ||
947 | struct fetch_param *f, int is_return) | ||
948 | { | ||
949 | int ret = 0; | ||
950 | unsigned long param; | ||
951 | |||
952 | if (strcmp(arg, "retval") == 0) { | ||
953 | if (is_return) | ||
954 | f->fn = t->fetch[FETCH_MTD_retval]; | ||
955 | else | ||
956 | ret = -EINVAL; | ||
957 | } else if (strncmp(arg, "stack", 5) == 0) { | ||
958 | if (arg[5] == '\0') { | ||
959 | if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) | ||
960 | f->fn = fetch_stack_address; | ||
961 | else | ||
962 | ret = -EINVAL; | ||
963 | } else if (isdigit(arg[5])) { | ||
964 | ret = strict_strtoul(arg + 5, 10, ¶m); | ||
965 | if (ret || param > PARAM_MAX_STACK) | ||
966 | ret = -EINVAL; | ||
967 | else { | ||
968 | f->fn = t->fetch[FETCH_MTD_stack]; | ||
969 | f->data = (void *)param; | ||
970 | } | ||
971 | } else | ||
972 | ret = -EINVAL; | ||
973 | } else | ||
974 | ret = -EINVAL; | ||
975 | return ret; | ||
976 | } | ||
977 | |||
978 | /* Recursive argument parser */ | ||
979 | static int __parse_probe_arg(char *arg, const struct fetch_type *t, | ||
980 | struct fetch_param *f, int is_return) | ||
981 | { | ||
982 | int ret = 0; | ||
983 | unsigned long param; | ||
984 | long offset; | ||
985 | char *tmp; | ||
986 | |||
987 | switch (arg[0]) { | ||
988 | case '$': | ||
989 | ret = parse_probe_vars(arg + 1, t, f, is_return); | ||
990 | break; | ||
991 | case '%': /* named register */ | ||
992 | ret = regs_query_register_offset(arg + 1); | ||
993 | if (ret >= 0) { | ||
994 | f->fn = t->fetch[FETCH_MTD_reg]; | ||
995 | f->data = (void *)(unsigned long)ret; | ||
996 | ret = 0; | ||
997 | } | ||
998 | break; | ||
999 | case '@': /* memory or symbol */ | ||
1000 | if (isdigit(arg[1])) { | ||
1001 | ret = strict_strtoul(arg + 1, 0, ¶m); | ||
1002 | if (ret) | ||
1003 | break; | ||
1004 | f->fn = t->fetch[FETCH_MTD_memory]; | ||
1005 | f->data = (void *)param; | ||
1006 | } else { | ||
1007 | ret = split_symbol_offset(arg + 1, &offset); | ||
1008 | if (ret) | ||
1009 | break; | ||
1010 | f->data = alloc_symbol_cache(arg + 1, offset); | ||
1011 | if (f->data) | ||
1012 | f->fn = t->fetch[FETCH_MTD_symbol]; | ||
1013 | } | ||
1014 | break; | ||
1015 | case '+': /* deref memory */ | ||
1016 | arg++; /* Skip '+', because strict_strtol() rejects it. */ | ||
1017 | case '-': | ||
1018 | tmp = strchr(arg, '('); | ||
1019 | if (!tmp) | ||
1020 | break; | ||
1021 | *tmp = '\0'; | ||
1022 | ret = strict_strtol(arg, 0, &offset); | ||
1023 | if (ret) | ||
1024 | break; | ||
1025 | arg = tmp + 1; | ||
1026 | tmp = strrchr(arg, ')'); | ||
1027 | if (tmp) { | ||
1028 | struct deref_fetch_param *dprm; | ||
1029 | const struct fetch_type *t2 = find_fetch_type(NULL); | ||
1030 | *tmp = '\0'; | ||
1031 | dprm = kzalloc(sizeof(struct deref_fetch_param), | ||
1032 | GFP_KERNEL); | ||
1033 | if (!dprm) | ||
1034 | return -ENOMEM; | ||
1035 | dprm->offset = offset; | ||
1036 | ret = __parse_probe_arg(arg, t2, &dprm->orig, | ||
1037 | is_return); | ||
1038 | if (ret) | ||
1039 | kfree(dprm); | ||
1040 | else { | ||
1041 | f->fn = t->fetch[FETCH_MTD_deref]; | ||
1042 | f->data = (void *)dprm; | ||
1043 | } | ||
1044 | } | ||
1045 | break; | ||
1046 | } | ||
1047 | if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ | ||
1048 | pr_info("%s type has no corresponding fetch method.\n", | ||
1049 | t->name); | ||
1050 | ret = -EINVAL; | ||
1051 | } | ||
1052 | return ret; | ||
1053 | } | ||
1054 | |||
1055 | #define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) | ||
1056 | |||
1057 | /* Bitfield type needs to be parsed into a fetch function */ | ||
1058 | static int __parse_bitfield_probe_arg(const char *bf, | ||
1059 | const struct fetch_type *t, | ||
1060 | struct fetch_param *f) | ||
1061 | { | ||
1062 | struct bitfield_fetch_param *bprm; | ||
1063 | unsigned long bw, bo; | ||
1064 | char *tail; | ||
1065 | |||
1066 | if (*bf != 'b') | ||
1067 | return 0; | ||
1068 | |||
1069 | bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); | ||
1070 | if (!bprm) | ||
1071 | return -ENOMEM; | ||
1072 | bprm->orig = *f; | ||
1073 | f->fn = t->fetch[FETCH_MTD_bitfield]; | ||
1074 | f->data = (void *)bprm; | ||
1075 | |||
1076 | bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ | ||
1077 | if (bw == 0 || *tail != '@') | ||
1078 | return -EINVAL; | ||
1079 | |||
1080 | bf = tail + 1; | ||
1081 | bo = simple_strtoul(bf, &tail, 0); | ||
1082 | if (tail == bf || *tail != '/') | ||
1083 | return -EINVAL; | ||
1084 | |||
1085 | bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); | ||
1086 | bprm->low_shift = bprm->hi_shift + bo; | ||
1087 | return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; | ||
1088 | } | ||
1089 | |||
1090 | /* String length checking wrapper */ | ||
1091 | static int parse_probe_arg(char *arg, struct trace_probe *tp, | ||
1092 | struct probe_arg *parg, int is_return) | ||
1093 | { | ||
1094 | const char *t; | ||
1095 | int ret; | ||
1096 | |||
1097 | if (strlen(arg) > MAX_ARGSTR_LEN) { | ||
1098 | pr_info("Argument is too long.: %s\n", arg); | ||
1099 | return -ENOSPC; | ||
1100 | } | ||
1101 | parg->comm = kstrdup(arg, GFP_KERNEL); | ||
1102 | if (!parg->comm) { | ||
1103 | pr_info("Failed to allocate memory for command '%s'.\n", arg); | ||
1104 | return -ENOMEM; | ||
1105 | } | ||
1106 | t = strchr(parg->comm, ':'); | ||
1107 | if (t) { | ||
1108 | arg[t - parg->comm] = '\0'; | ||
1109 | t++; | ||
1110 | } | ||
1111 | parg->type = find_fetch_type(t); | ||
1112 | if (!parg->type) { | ||
1113 | pr_info("Unsupported type: %s\n", t); | ||
1114 | return -EINVAL; | ||
1115 | } | ||
1116 | parg->offset = tp->size; | ||
1117 | tp->size += parg->type->size; | ||
1118 | ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); | ||
1119 | if (ret >= 0 && t != NULL) | ||
1120 | ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); | ||
1121 | if (ret >= 0) { | ||
1122 | parg->fetch_size.fn = get_fetch_size_function(parg->type, | ||
1123 | parg->fetch.fn); | ||
1124 | parg->fetch_size.data = parg->fetch.data; | ||
1125 | } | ||
1126 | return ret; | ||
1127 | } | ||
1128 | |||
1129 | /* Return 1 if name is reserved or already used by another argument */ | ||
1130 | static int conflict_field_name(const char *name, | ||
1131 | struct probe_arg *args, int narg) | ||
1132 | { | ||
1133 | int i; | ||
1134 | for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) | ||
1135 | if (strcmp(reserved_field_names[i], name) == 0) | ||
1136 | return 1; | ||
1137 | for (i = 0; i < narg; i++) | ||
1138 | if (strcmp(args[i].name, name) == 0) | ||
1139 | return 1; | ||
1140 | return 0; | ||
1141 | } | ||
1142 | |||
1143 | static int create_trace_probe(int argc, char **argv) | 356 | static int create_trace_probe(int argc, char **argv) |
1144 | { | 357 | { |
1145 | /* | 358 | /* |
@@ -1162,7 +375,7 @@ static int create_trace_probe(int argc, char **argv) | |||
1162 | */ | 375 | */ |
1163 | struct trace_probe *tp; | 376 | struct trace_probe *tp; |
1164 | int i, ret = 0; | 377 | int i, ret = 0; |
1165 | int is_return = 0, is_delete = 0; | 378 | bool is_return = false, is_delete = false; |
1166 | char *symbol = NULL, *event = NULL, *group = NULL; | 379 | char *symbol = NULL, *event = NULL, *group = NULL; |
1167 | char *arg; | 380 | char *arg; |
1168 | unsigned long offset = 0; | 381 | unsigned long offset = 0; |
@@ -1171,11 +384,11 @@ static int create_trace_probe(int argc, char **argv) | |||
1171 | 384 | ||
1172 | /* argc must be >= 1 */ | 385 | /* argc must be >= 1 */ |
1173 | if (argv[0][0] == 'p') | 386 | if (argv[0][0] == 'p') |
1174 | is_return = 0; | 387 | is_return = false; |
1175 | else if (argv[0][0] == 'r') | 388 | else if (argv[0][0] == 'r') |
1176 | is_return = 1; | 389 | is_return = true; |
1177 | else if (argv[0][0] == '-') | 390 | else if (argv[0][0] == '-') |
1178 | is_delete = 1; | 391 | is_delete = true; |
1179 | else { | 392 | else { |
1180 | pr_info("Probe definition must be started with 'p', 'r' or" | 393 | pr_info("Probe definition must be started with 'p', 'r' or" |
1181 | " '-'.\n"); | 394 | " '-'.\n"); |
@@ -1240,7 +453,7 @@ static int create_trace_probe(int argc, char **argv) | |||
1240 | /* a symbol specified */ | 453 | /* a symbol specified */ |
1241 | symbol = argv[1]; | 454 | symbol = argv[1]; |
1242 | /* TODO: support .init module functions */ | 455 | /* TODO: support .init module functions */ |
1243 | ret = split_symbol_offset(symbol, &offset); | 456 | ret = traceprobe_split_symbol_offset(symbol, &offset); |
1244 | if (ret) { | 457 | if (ret) { |
1245 | pr_info("Failed to parse symbol.\n"); | 458 | pr_info("Failed to parse symbol.\n"); |
1246 | return ret; | 459 | return ret; |
@@ -1302,7 +515,8 @@ static int create_trace_probe(int argc, char **argv) | |||
1302 | goto error; | 515 | goto error; |
1303 | } | 516 | } |
1304 | 517 | ||
1305 | if (conflict_field_name(tp->args[i].name, tp->args, i)) { | 518 | if (traceprobe_conflict_field_name(tp->args[i].name, |
519 | tp->args, i)) { | ||
1306 | pr_info("Argument[%d] name '%s' conflicts with " | 520 | pr_info("Argument[%d] name '%s' conflicts with " |
1307 | "another field.\n", i, argv[i]); | 521 | "another field.\n", i, argv[i]); |
1308 | ret = -EINVAL; | 522 | ret = -EINVAL; |
@@ -1310,7 +524,8 @@ static int create_trace_probe(int argc, char **argv) | |||
1310 | } | 524 | } |
1311 | 525 | ||
1312 | /* Parse fetch argument */ | 526 | /* Parse fetch argument */ |
1313 | ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); | 527 | ret = traceprobe_parse_probe_arg(arg, &tp->size, &tp->args[i], |
528 | is_return, true); | ||
1314 | if (ret) { | 529 | if (ret) { |
1315 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | 530 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); |
1316 | goto error; | 531 | goto error; |
@@ -1412,70 +627,11 @@ static int probes_open(struct inode *inode, struct file *file) | |||
1412 | return seq_open(file, &probes_seq_op); | 627 | return seq_open(file, &probes_seq_op); |
1413 | } | 628 | } |
1414 | 629 | ||
1415 | static int command_trace_probe(const char *buf) | ||
1416 | { | ||
1417 | char **argv; | ||
1418 | int argc = 0, ret = 0; | ||
1419 | |||
1420 | argv = argv_split(GFP_KERNEL, buf, &argc); | ||
1421 | if (!argv) | ||
1422 | return -ENOMEM; | ||
1423 | |||
1424 | if (argc) | ||
1425 | ret = create_trace_probe(argc, argv); | ||
1426 | |||
1427 | argv_free(argv); | ||
1428 | return ret; | ||
1429 | } | ||
1430 | |||
1431 | #define WRITE_BUFSIZE 4096 | ||
1432 | |||
1433 | static ssize_t probes_write(struct file *file, const char __user *buffer, | 630 | static ssize_t probes_write(struct file *file, const char __user *buffer, |
1434 | size_t count, loff_t *ppos) | 631 | size_t count, loff_t *ppos) |
1435 | { | 632 | { |
1436 | char *kbuf, *tmp; | 633 | return traceprobe_probes_write(file, buffer, count, ppos, |
1437 | int ret; | 634 | create_trace_probe); |
1438 | size_t done; | ||
1439 | size_t size; | ||
1440 | |||
1441 | kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); | ||
1442 | if (!kbuf) | ||
1443 | return -ENOMEM; | ||
1444 | |||
1445 | ret = done = 0; | ||
1446 | while (done < count) { | ||
1447 | size = count - done; | ||
1448 | if (size >= WRITE_BUFSIZE) | ||
1449 | size = WRITE_BUFSIZE - 1; | ||
1450 | if (copy_from_user(kbuf, buffer + done, size)) { | ||
1451 | ret = -EFAULT; | ||
1452 | goto out; | ||
1453 | } | ||
1454 | kbuf[size] = '\0'; | ||
1455 | tmp = strchr(kbuf, '\n'); | ||
1456 | if (tmp) { | ||
1457 | *tmp = '\0'; | ||
1458 | size = tmp - kbuf + 1; | ||
1459 | } else if (done + size < count) { | ||
1460 | pr_warning("Line length is too long: " | ||
1461 | "Should be less than %d.", WRITE_BUFSIZE); | ||
1462 | ret = -EINVAL; | ||
1463 | goto out; | ||
1464 | } | ||
1465 | done += size; | ||
1466 | /* Remove comments */ | ||
1467 | tmp = strchr(kbuf, '#'); | ||
1468 | if (tmp) | ||
1469 | *tmp = '\0'; | ||
1470 | |||
1471 | ret = command_trace_probe(kbuf); | ||
1472 | if (ret) | ||
1473 | goto out; | ||
1474 | } | ||
1475 | ret = done; | ||
1476 | out: | ||
1477 | kfree(kbuf); | ||
1478 | return ret; | ||
1479 | } | 635 | } |
1480 | 636 | ||
1481 | static const struct file_operations kprobe_events_ops = { | 637 | static const struct file_operations kprobe_events_ops = { |
@@ -1711,16 +867,6 @@ partial: | |||
1711 | return TRACE_TYPE_PARTIAL_LINE; | 867 | return TRACE_TYPE_PARTIAL_LINE; |
1712 | } | 868 | } |
1713 | 869 | ||
1714 | #undef DEFINE_FIELD | ||
1715 | #define DEFINE_FIELD(type, item, name, is_signed) \ | ||
1716 | do { \ | ||
1717 | ret = trace_define_field(event_call, #type, name, \ | ||
1718 | offsetof(typeof(field), item), \ | ||
1719 | sizeof(field.item), is_signed, \ | ||
1720 | FILTER_OTHER); \ | ||
1721 | if (ret) \ | ||
1722 | return ret; \ | ||
1723 | } while (0) | ||
1724 | 870 | ||
1725 | static int kprobe_event_define_fields(struct ftrace_event_call *event_call) | 871 | static int kprobe_event_define_fields(struct ftrace_event_call *event_call) |
1726 | { | 872 | { |
@@ -2051,8 +1197,9 @@ static __init int kprobe_trace_self_tests_init(void) | |||
2051 | 1197 | ||
2052 | pr_info("Testing kprobe tracing: "); | 1198 | pr_info("Testing kprobe tracing: "); |
2053 | 1199 | ||
2054 | ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " | 1200 | ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target " |
2055 | "$stack $stack0 +0($stack)"); | 1201 | "$stack $stack0 +0($stack)", |
1202 | create_trace_probe); | ||
2056 | if (WARN_ON_ONCE(ret)) { | 1203 | if (WARN_ON_ONCE(ret)) { |
2057 | pr_warning("error on probing function entry.\n"); | 1204 | pr_warning("error on probing function entry.\n"); |
2058 | warn++; | 1205 | warn++; |
@@ -2066,8 +1213,8 @@ static __init int kprobe_trace_self_tests_init(void) | |||
2066 | enable_trace_probe(tp, TP_FLAG_TRACE); | 1213 | enable_trace_probe(tp, TP_FLAG_TRACE); |
2067 | } | 1214 | } |
2068 | 1215 | ||
2069 | ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " | 1216 | ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " |
2070 | "$retval"); | 1217 | "$retval", create_trace_probe); |
2071 | if (WARN_ON_ONCE(ret)) { | 1218 | if (WARN_ON_ONCE(ret)) { |
2072 | pr_warning("error on probing function return.\n"); | 1219 | pr_warning("error on probing function return.\n"); |
2073 | warn++; | 1220 | warn++; |
@@ -2101,13 +1248,13 @@ static __init int kprobe_trace_self_tests_init(void) | |||
2101 | } else | 1248 | } else |
2102 | disable_trace_probe(tp, TP_FLAG_TRACE); | 1249 | disable_trace_probe(tp, TP_FLAG_TRACE); |
2103 | 1250 | ||
2104 | ret = command_trace_probe("-:testprobe"); | 1251 | ret = traceprobe_command("-:testprobe", create_trace_probe); |
2105 | if (WARN_ON_ONCE(ret)) { | 1252 | if (WARN_ON_ONCE(ret)) { |
2106 | pr_warning("error on deleting a probe.\n"); | 1253 | pr_warning("error on deleting a probe.\n"); |
2107 | warn++; | 1254 | warn++; |
2108 | } | 1255 | } |
2109 | 1256 | ||
2110 | ret = command_trace_probe("-:testprobe2"); | 1257 | ret = traceprobe_command("-:testprobe2", create_trace_probe); |
2111 | if (WARN_ON_ONCE(ret)) { | 1258 | if (WARN_ON_ONCE(ret)) { |
2112 | pr_warning("error on deleting a probe.\n"); | 1259 | pr_warning("error on deleting a probe.\n"); |
2113 | warn++; | 1260 | warn++; |
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 6fd4ffd042f9..a9077c1b4ad3 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
@@ -51,6 +51,10 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) | |||
51 | const char **iter; | 51 | const char **iter; |
52 | char *fmt; | 52 | char *fmt; |
53 | 53 | ||
54 | /* allocate the trace_printk per cpu buffers */ | ||
55 | if (start != end) | ||
56 | trace_printk_init_buffers(); | ||
57 | |||
54 | mutex_lock(&btrace_mutex); | 58 | mutex_lock(&btrace_mutex); |
55 | for (iter = start; iter < end; iter++) { | 59 | for (iter = start; iter < end; iter++) { |
56 | struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); | 60 | struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); |
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c new file mode 100644 index 000000000000..daa9980153af --- /dev/null +++ b/kernel/trace/trace_probe.c | |||
@@ -0,0 +1,839 @@ | |||
1 | /* | ||
2 | * Common code for probe-based Dynamic events. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | * GNU General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program; if not, write to the Free Software | ||
15 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
16 | * | ||
17 | * This code was copied from kernel/trace/trace_kprobe.c written by | ||
18 | * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> | ||
19 | * | ||
20 | * Updates to make this generic: | ||
21 | * Copyright (C) IBM Corporation, 2010-2011 | ||
22 | * Author: Srikar Dronamraju | ||
23 | */ | ||
24 | |||
25 | #include "trace_probe.h" | ||
26 | |||
27 | const char *reserved_field_names[] = { | ||
28 | "common_type", | ||
29 | "common_flags", | ||
30 | "common_preempt_count", | ||
31 | "common_pid", | ||
32 | "common_tgid", | ||
33 | FIELD_STRING_IP, | ||
34 | FIELD_STRING_RETIP, | ||
35 | FIELD_STRING_FUNC, | ||
36 | }; | ||
37 | |||
38 | /* Printing function type */ | ||
39 | #define PRINT_TYPE_FUNC_NAME(type) print_type_##type | ||
40 | #define PRINT_TYPE_FMT_NAME(type) print_type_format_##type | ||
41 | |||
42 | /* Printing in basic type function template */ | ||
43 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ | ||
44 | static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ | ||
45 | const char *name, \ | ||
46 | void *data, void *ent)\ | ||
47 | { \ | ||
48 | return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ | ||
49 | } \ | ||
50 | static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; | ||
51 | |||
52 | DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) | ||
53 | DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) | ||
54 | DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) | ||
55 | DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) | ||
56 | DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) | ||
57 | DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) | ||
58 | DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) | ||
59 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) | ||
60 | |||
61 | static inline void *get_rloc_data(u32 *dl) | ||
62 | { | ||
63 | return (u8 *)dl + get_rloc_offs(*dl); | ||
64 | } | ||
65 | |||
66 | /* For data_loc conversion */ | ||
67 | static inline void *get_loc_data(u32 *dl, void *ent) | ||
68 | { | ||
69 | return (u8 *)ent + get_rloc_offs(*dl); | ||
70 | } | ||
71 | |||
72 | /* For defining macros, define string/string_size types */ | ||
73 | typedef u32 string; | ||
74 | typedef u32 string_size; | ||
75 | |||
76 | /* Print type function for string type */ | ||
77 | static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, | ||
78 | const char *name, | ||
79 | void *data, void *ent) | ||
80 | { | ||
81 | int len = *(u32 *)data >> 16; | ||
82 | |||
83 | if (!len) | ||
84 | return trace_seq_printf(s, " %s=(fault)", name); | ||
85 | else | ||
86 | return trace_seq_printf(s, " %s=\"%s\"", name, | ||
87 | (const char *)get_loc_data(data, ent)); | ||
88 | } | ||
89 | |||
90 | static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; | ||
91 | |||
92 | #define FETCH_FUNC_NAME(method, type) fetch_##method##_##type | ||
93 | /* | ||
94 | * Define macro for basic types - we don't need to define s* types, because | ||
95 | * only the bit width matters at recording time. | ||
96 | */ | ||
97 | #define DEFINE_BASIC_FETCH_FUNCS(method) \ | ||
98 | DEFINE_FETCH_##method(u8) \ | ||
99 | DEFINE_FETCH_##method(u16) \ | ||
100 | DEFINE_FETCH_##method(u32) \ | ||
101 | DEFINE_FETCH_##method(u64) | ||
102 | |||
103 | #define CHECK_FETCH_FUNCS(method, fn) \ | ||
104 | (((FETCH_FUNC_NAME(method, u8) == fn) || \ | ||
105 | (FETCH_FUNC_NAME(method, u16) == fn) || \ | ||
106 | (FETCH_FUNC_NAME(method, u32) == fn) || \ | ||
107 | (FETCH_FUNC_NAME(method, u64) == fn) || \ | ||
108 | (FETCH_FUNC_NAME(method, string) == fn) || \ | ||
109 | (FETCH_FUNC_NAME(method, string_size) == fn)) \ | ||
110 | && (fn != NULL)) | ||
111 | |||
112 | /* Data fetch function templates */ | ||
113 | #define DEFINE_FETCH_reg(type) \ | ||
114 | static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ | ||
115 | void *offset, void *dest) \ | ||
116 | { \ | ||
117 | *(type *)dest = (type)regs_get_register(regs, \ | ||
118 | (unsigned int)((unsigned long)offset)); \ | ||
119 | } | ||
120 | DEFINE_BASIC_FETCH_FUNCS(reg) | ||
121 | /* No string on the register */ | ||
122 | #define fetch_reg_string NULL | ||
123 | #define fetch_reg_string_size NULL | ||
124 | |||
125 | #define DEFINE_FETCH_stack(type) \ | ||
126 | static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ | ||
127 | void *offset, void *dest) \ | ||
128 | { \ | ||
129 | *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ | ||
130 | (unsigned int)((unsigned long)offset)); \ | ||
131 | } | ||
132 | DEFINE_BASIC_FETCH_FUNCS(stack) | ||
133 | /* No string on the stack entry */ | ||
134 | #define fetch_stack_string NULL | ||
135 | #define fetch_stack_string_size NULL | ||
136 | |||
137 | #define DEFINE_FETCH_retval(type) \ | ||
138 | static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ | ||
139 | void *dummy, void *dest) \ | ||
140 | { \ | ||
141 | *(type *)dest = (type)regs_return_value(regs); \ | ||
142 | } | ||
143 | DEFINE_BASIC_FETCH_FUNCS(retval) | ||
144 | /* No string on the retval */ | ||
145 | #define fetch_retval_string NULL | ||
146 | #define fetch_retval_string_size NULL | ||
147 | |||
148 | #define DEFINE_FETCH_memory(type) \ | ||
149 | static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ | ||
150 | void *addr, void *dest) \ | ||
151 | { \ | ||
152 | type retval; \ | ||
153 | if (probe_kernel_address(addr, retval)) \ | ||
154 | *(type *)dest = 0; \ | ||
155 | else \ | ||
156 | *(type *)dest = retval; \ | ||
157 | } | ||
158 | DEFINE_BASIC_FETCH_FUNCS(memory) | ||
159 | /* | ||
160 | * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max | ||
161 | * length and relative data location. | ||
162 | */ | ||
163 | static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, | ||
164 | void *addr, void *dest) | ||
165 | { | ||
166 | long ret; | ||
167 | int maxlen = get_rloc_len(*(u32 *)dest); | ||
168 | u8 *dst = get_rloc_data(dest); | ||
169 | u8 *src = addr; | ||
170 | mm_segment_t old_fs = get_fs(); | ||
171 | |||
172 | if (!maxlen) | ||
173 | return; | ||
174 | |||
175 | /* | ||
176 | * Try to get string again, since the string can be changed while | ||
177 | * probing. | ||
178 | */ | ||
179 | set_fs(KERNEL_DS); | ||
180 | pagefault_disable(); | ||
181 | |||
182 | do | ||
183 | ret = __copy_from_user_inatomic(dst++, src++, 1); | ||
184 | while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); | ||
185 | |||
186 | dst[-1] = '\0'; | ||
187 | pagefault_enable(); | ||
188 | set_fs(old_fs); | ||
189 | |||
190 | if (ret < 0) { /* Failed to fetch string */ | ||
191 | ((u8 *)get_rloc_data(dest))[0] = '\0'; | ||
192 | *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); | ||
193 | } else { | ||
194 | *(u32 *)dest = make_data_rloc(src - (u8 *)addr, | ||
195 | get_rloc_offs(*(u32 *)dest)); | ||
196 | } | ||
197 | } | ||
198 | |||
199 | /* Return the length of the string -- including the terminating null byte */ | ||
200 | static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, | ||
201 | void *addr, void *dest) | ||
202 | { | ||
203 | mm_segment_t old_fs; | ||
204 | int ret, len = 0; | ||
205 | u8 c; | ||
206 | |||
207 | old_fs = get_fs(); | ||
208 | set_fs(KERNEL_DS); | ||
209 | pagefault_disable(); | ||
210 | |||
211 | do { | ||
212 | ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); | ||
213 | len++; | ||
214 | } while (c && ret == 0 && len < MAX_STRING_SIZE); | ||
215 | |||
216 | pagefault_enable(); | ||
217 | set_fs(old_fs); | ||
218 | |||
219 | if (ret < 0) /* Failed to check the length */ | ||
220 | *(u32 *)dest = 0; | ||
221 | else | ||
222 | *(u32 *)dest = len; | ||
223 | } | ||
224 | |||
225 | /* Memory fetching by symbol */ | ||
226 | struct symbol_cache { | ||
227 | char *symbol; | ||
228 | long offset; | ||
229 | unsigned long addr; | ||
230 | }; | ||
231 | |||
232 | static unsigned long update_symbol_cache(struct symbol_cache *sc) | ||
233 | { | ||
234 | sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); | ||
235 | |||
236 | if (sc->addr) | ||
237 | sc->addr += sc->offset; | ||
238 | |||
239 | return sc->addr; | ||
240 | } | ||
241 | |||
242 | static void free_symbol_cache(struct symbol_cache *sc) | ||
243 | { | ||
244 | kfree(sc->symbol); | ||
245 | kfree(sc); | ||
246 | } | ||
247 | |||
248 | static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) | ||
249 | { | ||
250 | struct symbol_cache *sc; | ||
251 | |||
252 | if (!sym || strlen(sym) == 0) | ||
253 | return NULL; | ||
254 | |||
255 | sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); | ||
256 | if (!sc) | ||
257 | return NULL; | ||
258 | |||
259 | sc->symbol = kstrdup(sym, GFP_KERNEL); | ||
260 | if (!sc->symbol) { | ||
261 | kfree(sc); | ||
262 | return NULL; | ||
263 | } | ||
264 | sc->offset = offset; | ||
265 | update_symbol_cache(sc); | ||
266 | |||
267 | return sc; | ||
268 | } | ||
269 | |||
270 | #define DEFINE_FETCH_symbol(type) \ | ||
271 | static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ | ||
272 | void *data, void *dest) \ | ||
273 | { \ | ||
274 | struct symbol_cache *sc = data; \ | ||
275 | if (sc->addr) \ | ||
276 | fetch_memory_##type(regs, (void *)sc->addr, dest); \ | ||
277 | else \ | ||
278 | *(type *)dest = 0; \ | ||
279 | } | ||
280 | DEFINE_BASIC_FETCH_FUNCS(symbol) | ||
281 | DEFINE_FETCH_symbol(string) | ||
282 | DEFINE_FETCH_symbol(string_size) | ||
283 | |||
284 | /* Dereference memory access function */ | ||
285 | struct deref_fetch_param { | ||
286 | struct fetch_param orig; | ||
287 | long offset; | ||
288 | }; | ||
289 | |||
290 | #define DEFINE_FETCH_deref(type) \ | ||
291 | static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ | ||
292 | void *data, void *dest) \ | ||
293 | { \ | ||
294 | struct deref_fetch_param *dprm = data; \ | ||
295 | unsigned long addr; \ | ||
296 | call_fetch(&dprm->orig, regs, &addr); \ | ||
297 | if (addr) { \ | ||
298 | addr += dprm->offset; \ | ||
299 | fetch_memory_##type(regs, (void *)addr, dest); \ | ||
300 | } else \ | ||
301 | *(type *)dest = 0; \ | ||
302 | } | ||
303 | DEFINE_BASIC_FETCH_FUNCS(deref) | ||
304 | DEFINE_FETCH_deref(string) | ||
305 | DEFINE_FETCH_deref(string_size) | ||
306 | |||
307 | static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) | ||
308 | { | ||
309 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
310 | update_deref_fetch_param(data->orig.data); | ||
311 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
312 | update_symbol_cache(data->orig.data); | ||
313 | } | ||
314 | |||
315 | static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) | ||
316 | { | ||
317 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
318 | free_deref_fetch_param(data->orig.data); | ||
319 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
320 | free_symbol_cache(data->orig.data); | ||
321 | kfree(data); | ||
322 | } | ||
323 | |||
324 | /* Bitfield fetch function */ | ||
325 | struct bitfield_fetch_param { | ||
326 | struct fetch_param orig; | ||
327 | unsigned char hi_shift; | ||
328 | unsigned char low_shift; | ||
329 | }; | ||
330 | |||
331 | #define DEFINE_FETCH_bitfield(type) \ | ||
332 | static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ | ||
333 | void *data, void *dest) \ | ||
334 | { \ | ||
335 | struct bitfield_fetch_param *bprm = data; \ | ||
336 | type buf = 0; \ | ||
337 | call_fetch(&bprm->orig, regs, &buf); \ | ||
338 | if (buf) { \ | ||
339 | buf <<= bprm->hi_shift; \ | ||
340 | buf >>= bprm->low_shift; \ | ||
341 | } \ | ||
342 | *(type *)dest = buf; \ | ||
343 | } | ||
344 | |||
345 | DEFINE_BASIC_FETCH_FUNCS(bitfield) | ||
346 | #define fetch_bitfield_string NULL | ||
347 | #define fetch_bitfield_string_size NULL | ||
348 | |||
349 | static __kprobes void | ||
350 | update_bitfield_fetch_param(struct bitfield_fetch_param *data) | ||
351 | { | ||
352 | /* | ||
353 | * Don't check the bitfield itself, because this must be the | ||
354 | * last fetch function. | ||
355 | */ | ||
356 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
357 | update_deref_fetch_param(data->orig.data); | ||
358 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
359 | update_symbol_cache(data->orig.data); | ||
360 | } | ||
361 | |||
362 | static __kprobes void | ||
363 | free_bitfield_fetch_param(struct bitfield_fetch_param *data) | ||
364 | { | ||
365 | /* | ||
366 | * Don't check the bitfield itself, because this must be the | ||
367 | * last fetch function. | ||
368 | */ | ||
369 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
370 | free_deref_fetch_param(data->orig.data); | ||
371 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
372 | free_symbol_cache(data->orig.data); | ||
373 | |||
374 | kfree(data); | ||
375 | } | ||
376 | |||
377 | /* Default (unsigned long) fetch type */ | ||
378 | #define __DEFAULT_FETCH_TYPE(t) u##t | ||
379 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) | ||
380 | #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) | ||
381 | #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) | ||
382 | |||
383 | #define ASSIGN_FETCH_FUNC(method, type) \ | ||
384 | [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) | ||
385 | |||
386 | #define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ | ||
387 | {.name = _name, \ | ||
388 | .size = _size, \ | ||
389 | .is_signed = sign, \ | ||
390 | .print = PRINT_TYPE_FUNC_NAME(ptype), \ | ||
391 | .fmt = PRINT_TYPE_FMT_NAME(ptype), \ | ||
392 | .fmttype = _fmttype, \ | ||
393 | .fetch = { \ | ||
394 | ASSIGN_FETCH_FUNC(reg, ftype), \ | ||
395 | ASSIGN_FETCH_FUNC(stack, ftype), \ | ||
396 | ASSIGN_FETCH_FUNC(retval, ftype), \ | ||
397 | ASSIGN_FETCH_FUNC(memory, ftype), \ | ||
398 | ASSIGN_FETCH_FUNC(symbol, ftype), \ | ||
399 | ASSIGN_FETCH_FUNC(deref, ftype), \ | ||
400 | ASSIGN_FETCH_FUNC(bitfield, ftype), \ | ||
401 | } \ | ||
402 | } | ||
403 | |||
404 | #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ | ||
405 | __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) | ||
406 | |||
407 | #define FETCH_TYPE_STRING 0 | ||
408 | #define FETCH_TYPE_STRSIZE 1 | ||
409 | |||
410 | /* Fetch type information table */ | ||
411 | static const struct fetch_type fetch_type_table[] = { | ||
412 | /* Special types */ | ||
413 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | ||
414 | sizeof(u32), 1, "__data_loc char[]"), | ||
415 | [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, | ||
416 | string_size, sizeof(u32), 0, "u32"), | ||
417 | /* Basic types */ | ||
418 | ASSIGN_FETCH_TYPE(u8, u8, 0), | ||
419 | ASSIGN_FETCH_TYPE(u16, u16, 0), | ||
420 | ASSIGN_FETCH_TYPE(u32, u32, 0), | ||
421 | ASSIGN_FETCH_TYPE(u64, u64, 0), | ||
422 | ASSIGN_FETCH_TYPE(s8, u8, 1), | ||
423 | ASSIGN_FETCH_TYPE(s16, u16, 1), | ||
424 | ASSIGN_FETCH_TYPE(s32, u32, 1), | ||
425 | ASSIGN_FETCH_TYPE(s64, u64, 1), | ||
426 | }; | ||
427 | |||
428 | static const struct fetch_type *find_fetch_type(const char *type) | ||
429 | { | ||
430 | int i; | ||
431 | |||
432 | if (!type) | ||
433 | type = DEFAULT_FETCH_TYPE_STR; | ||
434 | |||
435 | /* Special case: bitfield */ | ||
436 | if (*type == 'b') { | ||
437 | unsigned long bs; | ||
438 | |||
439 | type = strchr(type, '/'); | ||
440 | if (!type) | ||
441 | goto fail; | ||
442 | |||
443 | type++; | ||
444 | if (strict_strtoul(type, 0, &bs)) | ||
445 | goto fail; | ||
446 | |||
447 | switch (bs) { | ||
448 | case 8: | ||
449 | return find_fetch_type("u8"); | ||
450 | case 16: | ||
451 | return find_fetch_type("u16"); | ||
452 | case 32: | ||
453 | return find_fetch_type("u32"); | ||
454 | case 64: | ||
455 | return find_fetch_type("u64"); | ||
456 | default: | ||
457 | goto fail; | ||
458 | } | ||
459 | } | ||
460 | |||
461 | for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) | ||
462 | if (strcmp(type, fetch_type_table[i].name) == 0) | ||
463 | return &fetch_type_table[i]; | ||
464 | |||
465 | fail: | ||
466 | return NULL; | ||
467 | } | ||
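
A concrete example of the bitfield special case above: a type string such as "b4@3/32" is resolved here purely by its container size -- the "/32" suffix maps it to the u32 entry of fetch_type_table -- while the width and bit offset parts are handled later by __parse_bitfield_probe_arg(). The spec "b4@3/32" is only an illustration, not taken from the patch.
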
468 | |||
469 | /* Special function : only accept unsigned long */ | ||
470 | static __kprobes void fetch_stack_address(struct pt_regs *regs, | ||
471 | void *dummy, void *dest) | ||
472 | { | ||
473 | *(unsigned long *)dest = kernel_stack_pointer(regs); | ||
474 | } | ||
475 | |||
476 | static fetch_func_t get_fetch_size_function(const struct fetch_type *type, | ||
477 | fetch_func_t orig_fn) | ||
478 | { | ||
479 | int i; | ||
480 | |||
481 | if (type != &fetch_type_table[FETCH_TYPE_STRING]) | ||
482 | return NULL; /* Only string type needs size function */ | ||
483 | |||
484 | for (i = 0; i < FETCH_MTD_END; i++) | ||
485 | if (type->fetch[i] == orig_fn) | ||
486 | return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; | ||
487 | |||
488 | WARN_ON(1); /* This should not happen */ | ||
489 | |||
490 | return NULL; | ||
491 | } | ||
492 | |||
493 | /* Split symbol and offset. */ | ||
494 | int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) | ||
495 | { | ||
496 | char *tmp; | ||
497 | int ret; | ||
498 | |||
499 | if (!offset) | ||
500 | return -EINVAL; | ||
501 | |||
502 | tmp = strchr(symbol, '+'); | ||
503 | if (tmp) { | ||
504 | /* skip sign because strict_strtol doesn't accept '+' */ | ||
505 | ret = strict_strtoul(tmp + 1, 0, offset); | ||
506 | if (ret) | ||
507 | return ret; | ||
508 | |||
509 | *tmp = '\0'; | ||
510 | } else | ||
511 | *offset = 0; | ||
512 | |||
513 | return 0; | ||
514 | } | ||
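
To illustrate what traceprobe_split_symbol_offset() does to its buffer, here is a minimal user-space sketch of the same in-place split. It uses plain strtoul() instead of the kernel's strict_strtoul(), drops the error handling, and the sample symbol "vfs_read+0x10" is made up for the example.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Illustrative only: split "SYMBOL+offs" in place, like the helper above. */
    static int split_symbol_offset(char *symbol, unsigned long *offset)
    {
        char *plus = strchr(symbol, '+');

        if (!offset)
            return -1;
        if (plus) {
            *offset = strtoul(plus + 1, NULL, 0);
            *plus = '\0';              /* terminate the symbol part */
        } else {
            *offset = 0;
        }
        return 0;
    }

    int main(void)
    {
        char sym[] = "vfs_read+0x10";  /* hypothetical probe target */
        unsigned long offs;

        split_symbol_offset(sym, &offs);
        printf("symbol=%s offset=%#lx\n", sym, offs);  /* vfs_read, 0x10 */
        return 0;
    }
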
515 | |||
516 | #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) | ||
517 | |||
518 | static int parse_probe_vars(char *arg, const struct fetch_type *t, | ||
519 | struct fetch_param *f, bool is_return) | ||
520 | { | ||
521 | int ret = 0; | ||
522 | unsigned long param; | ||
523 | |||
524 | if (strcmp(arg, "retval") == 0) { | ||
525 | if (is_return) | ||
526 | f->fn = t->fetch[FETCH_MTD_retval]; | ||
527 | else | ||
528 | ret = -EINVAL; | ||
529 | } else if (strncmp(arg, "stack", 5) == 0) { | ||
530 | if (arg[5] == '\0') { | ||
531 | if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) | ||
532 | f->fn = fetch_stack_address; | ||
533 | else | ||
534 | ret = -EINVAL; | ||
535 | } else if (isdigit(arg[5])) { | ||
536 | ret = strict_strtoul(arg + 5, 10, ¶m); | ||
537 | if (ret || param > PARAM_MAX_STACK) | ||
538 | ret = -EINVAL; | ||
539 | else { | ||
540 | f->fn = t->fetch[FETCH_MTD_stack]; | ||
541 | f->data = (void *)param; | ||
542 | } | ||
543 | } else | ||
544 | ret = -EINVAL; | ||
545 | } else | ||
546 | ret = -EINVAL; | ||
547 | |||
548 | return ret; | ||
549 | } | ||
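
As an example of the '$' variables handled above: "$retval" is accepted only for return probes, a bare "$stack" fetches the stack pointer itself (and only with the default unsigned-long type), and "$stackN" -- say "$stack2" -- fetches the Nth word on the kernel stack, with N bounded by PARAM_MAX_STACK. The "$stack2" spelling here is just an illustration.
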
550 | |||
551 | /* Recursive argument parser */ | ||
552 | static int parse_probe_arg(char *arg, const struct fetch_type *t, | ||
553 | struct fetch_param *f, bool is_return, bool is_kprobe) | ||
554 | { | ||
555 | unsigned long param; | ||
556 | long offset; | ||
557 | char *tmp; | ||
558 | int ret; | ||
559 | |||
560 | ret = 0; | ||
561 | |||
562 | /* For now, uprobe_events supports only register arguments */ | ||
563 | if (!is_kprobe && arg[0] != '%') | ||
564 | return -EINVAL; | ||
565 | |||
566 | switch (arg[0]) { | ||
567 | case '$': | ||
568 | ret = parse_probe_vars(arg + 1, t, f, is_return); | ||
569 | break; | ||
570 | |||
571 | case '%': /* named register */ | ||
572 | ret = regs_query_register_offset(arg + 1); | ||
573 | if (ret >= 0) { | ||
574 | f->fn = t->fetch[FETCH_MTD_reg]; | ||
575 | f->data = (void *)(unsigned long)ret; | ||
576 | ret = 0; | ||
577 | } | ||
578 | break; | ||
579 | |||
580 | case '@': /* memory or symbol */ | ||
581 | if (isdigit(arg[1])) { | ||
582 | ret = strict_strtoul(arg + 1, 0, ¶m); | ||
583 | if (ret) | ||
584 | break; | ||
585 | |||
586 | f->fn = t->fetch[FETCH_MTD_memory]; | ||
587 | f->data = (void *)param; | ||
588 | } else { | ||
589 | ret = traceprobe_split_symbol_offset(arg + 1, &offset); | ||
590 | if (ret) | ||
591 | break; | ||
592 | |||
593 | f->data = alloc_symbol_cache(arg + 1, offset); | ||
594 | if (f->data) | ||
595 | f->fn = t->fetch[FETCH_MTD_symbol]; | ||
596 | } | ||
597 | break; | ||
598 | |||
599 | case '+': /* deref memory */ | ||
600 | arg++; /* Skip '+', because strict_strtol() rejects it. */ | ||
601 | case '-': | ||
602 | tmp = strchr(arg, '('); | ||
603 | if (!tmp) | ||
604 | break; | ||
605 | |||
606 | *tmp = '\0'; | ||
607 | ret = strict_strtol(arg, 0, &offset); | ||
608 | |||
609 | if (ret) | ||
610 | break; | ||
611 | |||
612 | arg = tmp + 1; | ||
613 | tmp = strrchr(arg, ')'); | ||
614 | |||
615 | if (tmp) { | ||
616 | struct deref_fetch_param *dprm; | ||
617 | const struct fetch_type *t2; | ||
618 | |||
619 | t2 = find_fetch_type(NULL); | ||
620 | *tmp = '\0'; | ||
621 | dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL); | ||
622 | |||
623 | if (!dprm) | ||
624 | return -ENOMEM; | ||
625 | |||
626 | dprm->offset = offset; | ||
627 | ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, | ||
628 | is_kprobe); | ||
629 | if (ret) | ||
630 | kfree(dprm); | ||
631 | else { | ||
632 | f->fn = t->fetch[FETCH_MTD_deref]; | ||
633 | f->data = (void *)dprm; | ||
634 | } | ||
635 | } | ||
636 | break; | ||
637 | } | ||
638 | if (!ret && !f->fn) { /* Parsed, but no fetch method found */ | ||
639 | pr_info("%s type has no corresponding fetch method.\n", t->name); | ||
640 | ret = -EINVAL; | ||
641 | } | ||
642 | |||
643 | return ret; | ||
644 | } | ||
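
Putting the cases above together, a fetch argument can look like "%ax" (a named register), "@vmalloc_start+8" or "@0xdeadbeef" (a symbol or absolute address to read), "$retval", or a dereference such as the "+0($stack)" used by the kprobe selftest earlier in this patch, where the inner argument is parsed recursively and the offset is added to its value. Apart from "+0($stack)", these particular spellings are illustrative rather than taken from the patch.
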
645 | |||
646 | #define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) | ||
647 | |||
648 | /* Bitfield type needs to be parsed into a fetch function */ | ||
649 | static int __parse_bitfield_probe_arg(const char *bf, | ||
650 | const struct fetch_type *t, | ||
651 | struct fetch_param *f) | ||
652 | { | ||
653 | struct bitfield_fetch_param *bprm; | ||
654 | unsigned long bw, bo; | ||
655 | char *tail; | ||
656 | |||
657 | if (*bf != 'b') | ||
658 | return 0; | ||
659 | |||
660 | bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); | ||
661 | if (!bprm) | ||
662 | return -ENOMEM; | ||
663 | |||
664 | bprm->orig = *f; | ||
665 | f->fn = t->fetch[FETCH_MTD_bitfield]; | ||
666 | f->data = (void *)bprm; | ||
667 | bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ | ||
668 | |||
669 | if (bw == 0 || *tail != '@') | ||
670 | return -EINVAL; | ||
671 | |||
672 | bf = tail + 1; | ||
673 | bo = simple_strtoul(bf, &tail, 0); | ||
674 | |||
675 | if (tail == bf || *tail != '/') | ||
676 | return -EINVAL; | ||
677 | |||
678 | bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); | ||
679 | bprm->low_shift = bprm->hi_shift + bo; | ||
680 | |||
681 | return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; | ||
682 | } | ||
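
To make the shift arithmetic concrete, here is a small stand-alone sketch of the extraction that the bitfield fetch functions perform, assuming the b<bit-width>@<bit-offset>/<container-size> convention parsed above. The field spec "b3@5/32" and the raw value are invented for the example.

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* Hypothetical field spec "b3@5/32": width 3, bit offset 5, 32-bit container. */
        uint32_t raw = 0x000000e0;                   /* bits 5..7 set */
        unsigned int bw = 3, bo = 5, bits = 32;
        unsigned int hi_shift = bits - (bw + bo);    /* 24: drops bits above the field */
        unsigned int low_shift = hi_shift + bo;      /* 29: drops bits below the field */
        uint32_t val = (raw << hi_shift) >> low_shift;

        printf("extracted %u (hi_shift=%u, low_shift=%u)\n", val, hi_shift, low_shift);
        /* prints: extracted 7 (hi_shift=24, low_shift=29) */
        return 0;
    }
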
683 | |||
684 | /* String length checking wrapper */ | ||
685 | int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | ||
686 | struct probe_arg *parg, bool is_return, bool is_kprobe) | ||
687 | { | ||
688 | const char *t; | ||
689 | int ret; | ||
690 | |||
691 | if (strlen(arg) > MAX_ARGSTR_LEN) { | ||
692 | pr_info("Argument is too long.: %s\n", arg); | ||
693 | return -ENOSPC; | ||
694 | } | ||
695 | parg->comm = kstrdup(arg, GFP_KERNEL); | ||
696 | if (!parg->comm) { | ||
697 | pr_info("Failed to allocate memory for command '%s'.\n", arg); | ||
698 | return -ENOMEM; | ||
699 | } | ||
700 | t = strchr(parg->comm, ':'); | ||
701 | if (t) { | ||
702 | arg[t - parg->comm] = '\0'; | ||
703 | t++; | ||
704 | } | ||
705 | parg->type = find_fetch_type(t); | ||
706 | if (!parg->type) { | ||
707 | pr_info("Unsupported type: %s\n", t); | ||
708 | return -EINVAL; | ||
709 | } | ||
710 | parg->offset = *size; | ||
711 | *size += parg->type->size; | ||
712 | ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe); | ||
713 | |||
714 | if (ret >= 0 && t != NULL) | ||
715 | ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); | ||
716 | |||
717 | if (ret >= 0) { | ||
718 | parg->fetch_size.fn = get_fetch_size_function(parg->type, | ||
719 | parg->fetch.fn); | ||
720 | parg->fetch_size.data = parg->fetch.data; | ||
721 | } | ||
722 | |||
723 | return ret; | ||
724 | } | ||
725 | |||
726 | /* Return 1 if name is reserved or already used by another argument */ | ||
727 | int traceprobe_conflict_field_name(const char *name, | ||
728 | struct probe_arg *args, int narg) | ||
729 | { | ||
730 | int i; | ||
731 | |||
732 | for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) | ||
733 | if (strcmp(reserved_field_names[i], name) == 0) | ||
734 | return 1; | ||
735 | |||
736 | for (i = 0; i < narg; i++) | ||
737 | if (strcmp(args[i].name, name) == 0) | ||
738 | return 1; | ||
739 | |||
740 | return 0; | ||
741 | } | ||
742 | |||
743 | void traceprobe_update_arg(struct probe_arg *arg) | ||
744 | { | ||
745 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) | ||
746 | update_bitfield_fetch_param(arg->fetch.data); | ||
747 | else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | ||
748 | update_deref_fetch_param(arg->fetch.data); | ||
749 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) | ||
750 | update_symbol_cache(arg->fetch.data); | ||
751 | } | ||
752 | |||
753 | void traceprobe_free_probe_arg(struct probe_arg *arg) | ||
754 | { | ||
755 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) | ||
756 | free_bitfield_fetch_param(arg->fetch.data); | ||
757 | else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | ||
758 | free_deref_fetch_param(arg->fetch.data); | ||
759 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) | ||
760 | free_symbol_cache(arg->fetch.data); | ||
761 | |||
762 | kfree(arg->name); | ||
763 | kfree(arg->comm); | ||
764 | } | ||
765 | |||
766 | int traceprobe_command(const char *buf, int (*createfn)(int, char **)) | ||
767 | { | ||
768 | char **argv; | ||
769 | int argc, ret; | ||
770 | |||
771 | argc = 0; | ||
772 | ret = 0; | ||
773 | argv = argv_split(GFP_KERNEL, buf, &argc); | ||
774 | if (!argv) | ||
775 | return -ENOMEM; | ||
776 | |||
777 | if (argc) | ||
778 | ret = createfn(argc, argv); | ||
779 | |||
780 | argv_free(argv); | ||
781 | |||
782 | return ret; | ||
783 | } | ||
784 | |||
785 | #define WRITE_BUFSIZE 4096 | ||
786 | |||
787 | ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, | ||
788 | size_t count, loff_t *ppos, | ||
789 | int (*createfn)(int, char **)) | ||
790 | { | ||
791 | char *kbuf, *tmp; | ||
792 | int ret = 0; | ||
793 | size_t done = 0; | ||
794 | size_t size; | ||
795 | |||
796 | kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); | ||
797 | if (!kbuf) | ||
798 | return -ENOMEM; | ||
799 | |||
800 | while (done < count) { | ||
801 | size = count - done; | ||
802 | |||
803 | if (size >= WRITE_BUFSIZE) | ||
804 | size = WRITE_BUFSIZE - 1; | ||
805 | |||
806 | if (copy_from_user(kbuf, buffer + done, size)) { | ||
807 | ret = -EFAULT; | ||
808 | goto out; | ||
809 | } | ||
810 | kbuf[size] = '\0'; | ||
811 | tmp = strchr(kbuf, '\n'); | ||
812 | |||
813 | if (tmp) { | ||
814 | *tmp = '\0'; | ||
815 | size = tmp - kbuf + 1; | ||
816 | } else if (done + size < count) { | ||
817 | pr_warning("Line length is too long: " | ||
818 | "Should be less than %d.", WRITE_BUFSIZE); | ||
819 | ret = -EINVAL; | ||
820 | goto out; | ||
821 | } | ||
822 | done += size; | ||
823 | /* Remove comments */ | ||
824 | tmp = strchr(kbuf, '#'); | ||
825 | |||
826 | if (tmp) | ||
827 | *tmp = '\0'; | ||
828 | |||
829 | ret = traceprobe_command(kbuf, createfn); | ||
830 | if (ret) | ||
831 | goto out; | ||
832 | } | ||
833 | ret = done; | ||
834 | |||
835 | out: | ||
836 | kfree(kbuf); | ||
837 | |||
838 | return ret; | ||
839 | } | ||
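
The loop above chops the written buffer into newline-terminated commands and strips '#' comments before handing each line to createfn. A minimal user-space sketch of just that line handling (no copy_from_user, and with a trivial stand-in for create_trace_probe) might look like this; the sample commands are illustrative only.

    #include <stdio.h>
    #include <string.h>

    /* Illustrative stand-in for create_trace_probe(): just echo the command. */
    static int createfn(char *cmd)
    {
        if (*cmd)
            printf("command: %s\n", cmd);
        return 0;
    }

    int main(void)
    {
        char buf[] = "p:myprobe do_sys_open $stack\n# a comment\n-:myprobe\n";
        char *line = buf, *next;

        while (line && *line) {
            next = strchr(line, '\n');
            if (next)
                *next++ = '\0';        /* one command per line */
            char *hash = strchr(line, '#');
            if (hash)
                *hash = '\0';          /* strip trailing comments */
            createfn(line);
            line = next;
        }
        return 0;
    }
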
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h new file mode 100644 index 000000000000..933708677814 --- /dev/null +++ b/kernel/trace/trace_probe.h | |||
@@ -0,0 +1,161 @@ | |||
1 | /* | ||
2 | * Common header file for probe-based Dynamic events. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | * GNU General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program; if not, write to the Free Software | ||
15 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
16 | * | ||
17 | * This code was copied from kernel/trace/trace_kprobe.h written by | ||
18 | * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> | ||
19 | * | ||
20 | * Updates to make this generic: | ||
21 | * Copyright (C) IBM Corporation, 2010-2011 | ||
22 | * Author: Srikar Dronamraju | ||
23 | */ | ||
24 | |||
25 | #include <linux/seq_file.h> | ||
26 | #include <linux/slab.h> | ||
27 | #include <linux/smp.h> | ||
28 | #include <linux/debugfs.h> | ||
29 | #include <linux/types.h> | ||
30 | #include <linux/string.h> | ||
31 | #include <linux/ctype.h> | ||
32 | #include <linux/ptrace.h> | ||
33 | #include <linux/perf_event.h> | ||
34 | #include <linux/kprobes.h> | ||
35 | #include <linux/stringify.h> | ||
36 | #include <linux/limits.h> | ||
37 | #include <linux/uaccess.h> | ||
38 | #include <asm/bitsperlong.h> | ||
39 | |||
40 | #include "trace.h" | ||
41 | #include "trace_output.h" | ||
42 | |||
43 | #define MAX_TRACE_ARGS 128 | ||
44 | #define MAX_ARGSTR_LEN 63 | ||
45 | #define MAX_EVENT_NAME_LEN 64 | ||
46 | #define MAX_STRING_SIZE PATH_MAX | ||
47 | |||
48 | /* Reserved field names */ | ||
49 | #define FIELD_STRING_IP "__probe_ip" | ||
50 | #define FIELD_STRING_RETIP "__probe_ret_ip" | ||
51 | #define FIELD_STRING_FUNC "__probe_func" | ||
52 | |||
53 | #undef DEFINE_FIELD | ||
54 | #define DEFINE_FIELD(type, item, name, is_signed) \ | ||
55 | do { \ | ||
56 | ret = trace_define_field(event_call, #type, name, \ | ||
57 | offsetof(typeof(field), item), \ | ||
58 | sizeof(field.item), is_signed, \ | ||
59 | FILTER_OTHER); \ | ||
60 | if (ret) \ | ||
61 | return ret; \ | ||
62 | } while (0) | ||
63 | |||
64 | |||
65 | /* Flags for trace_probe */ | ||
66 | #define TP_FLAG_TRACE 1 | ||
67 | #define TP_FLAG_PROFILE 2 | ||
68 | #define TP_FLAG_REGISTERED 4 | ||
69 | #define TP_FLAG_UPROBE 8 | ||
70 | |||
71 | |||
72 | /* data_rloc: data relative location, compatible with u32 */ | ||
73 | #define make_data_rloc(len, roffs) \ | ||
74 | (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) | ||
75 | #define get_rloc_len(dl) ((u32)(dl) >> 16) | ||
76 | #define get_rloc_offs(dl) ((u32)(dl) & 0xffff) | ||
77 | |||
78 | /* | ||
79 | * Convert data_rloc to data_loc: | ||
80 | * data_rloc stores the offset from data_rloc itself, but data_loc | ||
81 | * stores the offset from event entry. | ||
82 | */ | ||
83 | #define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) | ||
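
A quick sketch of how the data_rloc packing behaves, using uint32_t in place of the kernel's u32; the length and offset values are made up for the example.

    #include <stdio.h>
    #include <stdint.h>

    #define make_data_rloc(len, roffs) \
        (((uint32_t)(len) << 16) | ((uint32_t)(roffs) & 0xffff))
    #define get_rloc_len(dl)   ((uint32_t)(dl) >> 16)
    #define get_rloc_offs(dl)  ((uint32_t)(dl) & 0xffff)

    int main(void)
    {
        /* Hypothetical string of 11 bytes stored 24 bytes after the rloc word. */
        uint32_t dl = make_data_rloc(11, 24);

        printf("len=%u offs=%u\n", get_rloc_len(dl), get_rloc_offs(dl));  /* 11 24 */
        return 0;
    }
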
84 | |||
85 | /* Data fetch function type */ | ||
86 | typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); | ||
87 | /* Printing function type */ | ||
88 | typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, void *); | ||
89 | |||
90 | /* Fetch types */ | ||
91 | enum { | ||
92 | FETCH_MTD_reg = 0, | ||
93 | FETCH_MTD_stack, | ||
94 | FETCH_MTD_retval, | ||
95 | FETCH_MTD_memory, | ||
96 | FETCH_MTD_symbol, | ||
97 | FETCH_MTD_deref, | ||
98 | FETCH_MTD_bitfield, | ||
99 | FETCH_MTD_END, | ||
100 | }; | ||
101 | |||
102 | /* Fetch type information table */ | ||
103 | struct fetch_type { | ||
104 | const char *name; /* Name of type */ | ||
105 | size_t size; /* Byte size of type */ | ||
106 | int is_signed; /* Signed flag */ | ||
107 | print_type_func_t print; /* Print functions */ | ||
108 | const char *fmt; /* Format string */ | ||
109 | const char *fmttype; /* Name in format file */ | ||
110 | /* Fetch functions */ | ||
111 | fetch_func_t fetch[FETCH_MTD_END]; | ||
112 | }; | ||
113 | |||
114 | struct fetch_param { | ||
115 | fetch_func_t fn; | ||
116 | void *data; | ||
117 | }; | ||
118 | |||
119 | struct probe_arg { | ||
120 | struct fetch_param fetch; | ||
121 | struct fetch_param fetch_size; | ||
122 | unsigned int offset; /* Offset from argument entry */ | ||
123 | const char *name; /* Name of this argument */ | ||
124 | const char *comm; /* Command of this argument */ | ||
125 | const struct fetch_type *type; /* Type of this argument */ | ||
126 | }; | ||
127 | |||
128 | static inline __kprobes void call_fetch(struct fetch_param *fprm, | ||
129 | struct pt_regs *regs, void *dest) | ||
130 | { | ||
131 | return fprm->fn(regs, fprm->data, dest); | ||
132 | } | ||
133 | |||
134 | /* Check the name is good for event/group/fields */ | ||
135 | static inline int is_good_name(const char *name) | ||
136 | { | ||
137 | if (!isalpha(*name) && *name != '_') | ||
138 | return 0; | ||
139 | while (*++name != '\0') { | ||
140 | if (!isalpha(*name) && !isdigit(*name) && *name != '_') | ||
141 | return 0; | ||
142 | } | ||
143 | return 1; | ||
144 | } | ||
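
For reference, the same check behaves as follows in a stand-alone build; the sample names are invented.

    #include <stdio.h>
    #include <ctype.h>

    /* Same check as is_good_name() above, in a stand-alone form. */
    static int is_good_name(const char *name)
    {
        if (!isalpha((unsigned char)*name) && *name != '_')
            return 0;
        while (*++name != '\0') {
            if (!isalpha((unsigned char)*name) &&
                !isdigit((unsigned char)*name) && *name != '_')
                return 0;
        }
        return 1;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               is_good_name("myprobe_1"),   /* 1 */
               is_good_name("1badname"),    /* 0: must not start with a digit */
               is_good_name("has-dash"));   /* 0: '-' is not allowed */
        return 0;
    }
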
145 | |||
146 | extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | ||
147 | struct probe_arg *parg, bool is_return, bool is_kprobe); | ||
148 | |||
149 | extern int traceprobe_conflict_field_name(const char *name, | ||
150 | struct probe_arg *args, int narg); | ||
151 | |||
152 | extern void traceprobe_update_arg(struct probe_arg *arg); | ||
153 | extern void traceprobe_free_probe_arg(struct probe_arg *arg); | ||
154 | |||
155 | extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset); | ||
156 | |||
157 | extern ssize_t traceprobe_probes_write(struct file *file, | ||
158 | const char __user *buffer, size_t count, loff_t *ppos, | ||
159 | int (*createfn)(int, char**)); | ||
160 | |||
161 | extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); | ||
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c new file mode 100644 index 000000000000..2b36ac68549e --- /dev/null +++ b/kernel/trace/trace_uprobe.c | |||
@@ -0,0 +1,788 @@ | |||
1 | /* | ||
2 | * uprobes-based tracing events | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | * GNU General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program; if not, write to the Free Software | ||
15 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
16 | * | ||
17 | * Copyright (C) IBM Corporation, 2010-2012 | ||
18 | * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com> | ||
19 | */ | ||
20 | |||
21 | #include <linux/module.h> | ||
22 | #include <linux/uaccess.h> | ||
23 | #include <linux/uprobes.h> | ||
24 | #include <linux/namei.h> | ||
25 | |||
26 | #include "trace_probe.h" | ||
27 | |||
28 | #define UPROBE_EVENT_SYSTEM "uprobes" | ||
29 | |||
30 | /* | ||
31 | * uprobe event core functions | ||
32 | */ | ||
33 | struct trace_uprobe; | ||
34 | struct uprobe_trace_consumer { | ||
35 | struct uprobe_consumer cons; | ||
36 | struct trace_uprobe *tu; | ||
37 | }; | ||
38 | |||
39 | struct trace_uprobe { | ||
40 | struct list_head list; | ||
41 | struct ftrace_event_class class; | ||
42 | struct ftrace_event_call call; | ||
43 | struct uprobe_trace_consumer *consumer; | ||
44 | struct inode *inode; | ||
45 | char *filename; | ||
46 | unsigned long offset; | ||
47 | unsigned long nhit; | ||
48 | unsigned int flags; /* For TP_FLAG_* */ | ||
49 | ssize_t size; /* trace entry size */ | ||
50 | unsigned int nr_args; | ||
51 | struct probe_arg args[]; | ||
52 | }; | ||
53 | |||
54 | #define SIZEOF_TRACE_UPROBE(n) \ | ||
55 | (offsetof(struct trace_uprobe, args) + \ | ||
56 | (sizeof(struct probe_arg) * (n))) | ||
57 | |||
58 | static int register_uprobe_event(struct trace_uprobe *tu); | ||
59 | static void unregister_uprobe_event(struct trace_uprobe *tu); | ||
60 | |||
61 | static DEFINE_MUTEX(uprobe_lock); | ||
62 | static LIST_HEAD(uprobe_list); | ||
63 | |||
64 | static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); | ||
65 | |||
66 | /* | ||
67 | * Allocate new trace_uprobe and initialize it (including uprobes). | ||
68 | */ | ||
69 | static struct trace_uprobe * | ||
70 | alloc_trace_uprobe(const char *group, const char *event, int nargs) | ||
71 | { | ||
72 | struct trace_uprobe *tu; | ||
73 | |||
74 | if (!event || !is_good_name(event)) | ||
75 | return ERR_PTR(-EINVAL); | ||
76 | |||
77 | if (!group || !is_good_name(group)) | ||
78 | return ERR_PTR(-EINVAL); | ||
79 | |||
80 | tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL); | ||
81 | if (!tu) | ||
82 | return ERR_PTR(-ENOMEM); | ||
83 | |||
84 | tu->call.class = &tu->class; | ||
85 | tu->call.name = kstrdup(event, GFP_KERNEL); | ||
86 | if (!tu->call.name) | ||
87 | goto error; | ||
88 | |||
89 | tu->class.system = kstrdup(group, GFP_KERNEL); | ||
90 | if (!tu->class.system) | ||
91 | goto error; | ||
92 | |||
93 | INIT_LIST_HEAD(&tu->list); | ||
94 | return tu; | ||
95 | |||
96 | error: | ||
97 | kfree(tu->call.name); | ||
98 | kfree(tu); | ||
99 | |||
100 | return ERR_PTR(-ENOMEM); | ||
101 | } | ||
102 | |||
103 | static void free_trace_uprobe(struct trace_uprobe *tu) | ||
104 | { | ||
105 | int i; | ||
106 | |||
107 | for (i = 0; i < tu->nr_args; i++) | ||
108 | traceprobe_free_probe_arg(&tu->args[i]); | ||
109 | |||
110 | iput(tu->inode); | ||
111 | kfree(tu->call.class->system); | ||
112 | kfree(tu->call.name); | ||
113 | kfree(tu->filename); | ||
114 | kfree(tu); | ||
115 | } | ||
116 | |||
117 | static struct trace_uprobe *find_probe_event(const char *event, const char *group) | ||
118 | { | ||
119 | struct trace_uprobe *tu; | ||
120 | |||
121 | list_for_each_entry(tu, &uprobe_list, list) | ||
122 | if (strcmp(tu->call.name, event) == 0 && | ||
123 | strcmp(tu->call.class->system, group) == 0) | ||
124 | return tu; | ||
125 | |||
126 | return NULL; | ||
127 | } | ||
128 | |||
129 | /* Unregister a trace_uprobe and probe_event: must be called with uprobe_lock held */ | ||
130 | static void unregister_trace_uprobe(struct trace_uprobe *tu) | ||
131 | { | ||
132 | list_del(&tu->list); | ||
133 | unregister_uprobe_event(tu); | ||
134 | free_trace_uprobe(tu); | ||
135 | } | ||
136 | |||
137 | /* Register a trace_uprobe and probe_event */ | ||
138 | static int register_trace_uprobe(struct trace_uprobe *tu) | ||
139 | { | ||
140 | struct trace_uprobe *old_tp; | ||
141 | int ret; | ||
142 | |||
143 | mutex_lock(&uprobe_lock); | ||
144 | |||
145 | /* register as an event */ | ||
146 | old_tp = find_probe_event(tu->call.name, tu->call.class->system); | ||
147 | if (old_tp) | ||
148 | /* delete old event */ | ||
149 | unregister_trace_uprobe(old_tp); | ||
150 | |||
151 | ret = register_uprobe_event(tu); | ||
152 | if (ret) { | ||
153 | pr_warning("Failed to register probe event(%d)\n", ret); | ||
154 | goto end; | ||
155 | } | ||
156 | |||
157 | list_add_tail(&tu->list, &uprobe_list); | ||
158 | |||
159 | end: | ||
160 | mutex_unlock(&uprobe_lock); | ||
161 | |||
162 | return ret; | ||
163 | } | ||
164 | |||
165 | /* | ||
166 | * Argument syntax: | ||
167 | * - Add uprobe: p[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] | ||
168 | * | ||
169 | * - Remove uprobe: -:[GRP/]EVENT | ||
170 | */ | ||
171 | static int create_trace_uprobe(int argc, char **argv) | ||
172 | { | ||
173 | struct trace_uprobe *tu; | ||
174 | struct inode *inode; | ||
175 | char *arg, *event, *group, *filename; | ||
176 | char buf[MAX_EVENT_NAME_LEN]; | ||
177 | struct path path; | ||
178 | unsigned long offset; | ||
179 | bool is_delete; | ||
180 | int i, ret; | ||
181 | |||
182 | inode = NULL; | ||
183 | ret = 0; | ||
184 | is_delete = false; | ||
185 | event = NULL; | ||
186 | group = NULL; | ||
187 | |||
188 | /* argc must be >= 1 */ | ||
189 | if (argv[0][0] == '-') | ||
190 | is_delete = true; | ||
191 | else if (argv[0][0] != 'p') { | ||
192 | pr_info("Probe definition must be started with 'p', 'r' or" " '-'.\n"); | ||
193 | return -EINVAL; | ||
194 | } | ||
195 | |||
196 | if (argv[0][1] == ':') { | ||
197 | event = &argv[0][2]; | ||
198 | arg = strchr(event, '/'); | ||
199 | |||
200 | if (arg) { | ||
201 | group = event; | ||
202 | event = arg + 1; | ||
203 | event[-1] = '\0'; | ||
204 | |||
205 | if (strlen(group) == 0) { | ||
206 | pr_info("Group name is not specified\n"); | ||
207 | return -EINVAL; | ||
208 | } | ||
209 | } | ||
210 | if (strlen(event) == 0) { | ||
211 | pr_info("Event name is not specified\n"); | ||
212 | return -EINVAL; | ||
213 | } | ||
214 | } | ||
215 | if (!group) | ||
216 | group = UPROBE_EVENT_SYSTEM; | ||
217 | |||
218 | if (is_delete) { | ||
219 | if (!event) { | ||
220 | pr_info("Delete command needs an event name.\n"); | ||
221 | return -EINVAL; | ||
222 | } | ||
223 | mutex_lock(&uprobe_lock); | ||
224 | tu = find_probe_event(event, group); | ||
225 | |||
226 | if (!tu) { | ||
227 | mutex_unlock(&uprobe_lock); | ||
228 | pr_info("Event %s/%s doesn't exist.\n", group, event); | ||
229 | return -ENOENT; | ||
230 | } | ||
231 | /* delete an event */ | ||
232 | unregister_trace_uprobe(tu); | ||
233 | mutex_unlock(&uprobe_lock); | ||
234 | return 0; | ||
235 | } | ||
236 | |||
237 | if (argc < 2) { | ||
238 | pr_info("Probe point is not specified.\n"); | ||
239 | return -EINVAL; | ||
240 | } | ||
241 | if (isdigit(argv[1][0])) { | ||
242 | pr_info("probe point must be have a filename.\n"); | ||
243 | return -EINVAL; | ||
244 | } | ||
245 | arg = strchr(argv[1], ':'); | ||
246 | if (!arg) | ||
247 | goto fail_address_parse; | ||
248 | |||
249 | *arg++ = '\0'; | ||
250 | filename = argv[1]; | ||
251 | ret = kern_path(filename, LOOKUP_FOLLOW, &path); | ||
252 | if (ret) | ||
253 | goto fail_address_parse; | ||
254 | |||
255 | ret = strict_strtoul(arg, 0, &offset); | ||
256 | if (ret) | ||
257 | goto fail_address_parse; | ||
258 | |||
259 | inode = igrab(path.dentry->d_inode); | ||
260 | |||
261 | argc -= 2; | ||
262 | argv += 2; | ||
263 | |||
264 | /* setup a probe */ | ||
265 | if (!event) { | ||
266 | char *tail = strrchr(filename, '/'); | ||
267 | char *ptr; | ||
268 | |||
269 | ptr = kstrdup((tail ? tail + 1 : filename), GFP_KERNEL); | ||
270 | if (!ptr) { | ||
271 | ret = -ENOMEM; | ||
272 | goto fail_address_parse; | ||
273 | } | ||
274 | |||
275 | tail = ptr; | ||
276 | ptr = strpbrk(tail, ".-_"); | ||
277 | if (ptr) | ||
278 | *ptr = '\0'; | ||
279 | |||
280 | snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset); | ||
281 | event = buf; | ||
282 | kfree(tail); | ||
283 | } | ||
284 | |||
285 | tu = alloc_trace_uprobe(group, event, argc); | ||
286 | if (IS_ERR(tu)) { | ||
287 | pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); | ||
288 | ret = PTR_ERR(tu); | ||
289 | goto fail_address_parse; | ||
290 | } | ||
291 | tu->offset = offset; | ||
292 | tu->inode = inode; | ||
293 | tu->filename = kstrdup(filename, GFP_KERNEL); | ||
294 | |||
295 | if (!tu->filename) { | ||
296 | pr_info("Failed to allocate filename.\n"); | ||
297 | ret = -ENOMEM; | ||
298 | goto error; | ||
299 | } | ||
300 | |||
301 | /* parse arguments */ | ||
302 | ret = 0; | ||
303 | for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { | ||
304 | /* Increment count for freeing args in error case */ | ||
305 | tu->nr_args++; | ||
306 | |||
307 | /* Parse argument name */ | ||
308 | arg = strchr(argv[i], '='); | ||
309 | if (arg) { | ||
310 | *arg++ = '\0'; | ||
311 | tu->args[i].name = kstrdup(argv[i], GFP_KERNEL); | ||
312 | } else { | ||
313 | arg = argv[i]; | ||
314 | /* If argument name is omitted, set "argN" */ | ||
315 | snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); | ||
316 | tu->args[i].name = kstrdup(buf, GFP_KERNEL); | ||
317 | } | ||
318 | |||
319 | if (!tu->args[i].name) { | ||
320 | pr_info("Failed to allocate argument[%d] name.\n", i); | ||
321 | ret = -ENOMEM; | ||
322 | goto error; | ||
323 | } | ||
324 | |||
325 | if (!is_good_name(tu->args[i].name)) { | ||
326 | pr_info("Invalid argument[%d] name: %s\n", i, tu->args[i].name); | ||
327 | ret = -EINVAL; | ||
328 | goto error; | ||
329 | } | ||
330 | |||
331 | if (traceprobe_conflict_field_name(tu->args[i].name, tu->args, i)) { | ||
332 | pr_info("Argument[%d] name '%s' conflicts with " | ||
333 | "another field.\n", i, argv[i]); | ||
334 | ret = -EINVAL; | ||
335 | goto error; | ||
336 | } | ||
337 | |||
338 | /* Parse fetch argument */ | ||
339 | ret = traceprobe_parse_probe_arg(arg, &tu->size, &tu->args[i], false, false); | ||
340 | if (ret) { | ||
341 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | ||
342 | goto error; | ||
343 | } | ||
344 | } | ||
345 | |||
346 | ret = register_trace_uprobe(tu); | ||
347 | if (ret) | ||
348 | goto error; | ||
349 | return 0; | ||
350 | |||
351 | error: | ||
352 | free_trace_uprobe(tu); | ||
353 | return ret; | ||
354 | |||
355 | fail_address_parse: | ||
356 | if (inode) | ||
357 | iput(inode); | ||
358 | |||
359 | pr_info("Failed to parse address.\n"); | ||
360 | |||
361 | return ret; | ||
362 | } | ||
363 | |||
364 | static void cleanup_all_probes(void) | ||
365 | { | ||
366 | struct trace_uprobe *tu; | ||
367 | |||
368 | mutex_lock(&uprobe_lock); | ||
369 | while (!list_empty(&uprobe_list)) { | ||
370 | tu = list_entry(uprobe_list.next, struct trace_uprobe, list); | ||
371 | unregister_trace_uprobe(tu); | ||
372 | } | ||
373 | mutex_unlock(&uprobe_lock); | ||
374 | } | ||
375 | |||
376 | /* Probes listing interfaces */ | ||
377 | static void *probes_seq_start(struct seq_file *m, loff_t *pos) | ||
378 | { | ||
379 | mutex_lock(&uprobe_lock); | ||
380 | return seq_list_start(&uprobe_list, *pos); | ||
381 | } | ||
382 | |||
383 | static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) | ||
384 | { | ||
385 | return seq_list_next(v, &uprobe_list, pos); | ||
386 | } | ||
387 | |||
388 | static void probes_seq_stop(struct seq_file *m, void *v) | ||
389 | { | ||
390 | mutex_unlock(&uprobe_lock); | ||
391 | } | ||
392 | |||
393 | static int probes_seq_show(struct seq_file *m, void *v) | ||
394 | { | ||
395 | struct trace_uprobe *tu = v; | ||
396 | int i; | ||
397 | |||
398 | seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name); | ||
399 | seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); | ||
400 | |||
401 | for (i = 0; i < tu->nr_args; i++) | ||
402 | seq_printf(m, " %s=%s", tu->args[i].name, tu->args[i].comm); | ||
403 | |||
404 | seq_printf(m, "\n"); | ||
405 | return 0; | ||
406 | } | ||
407 | |||
408 | static const struct seq_operations probes_seq_op = { | ||
409 | .start = probes_seq_start, | ||
410 | .next = probes_seq_next, | ||
411 | .stop = probes_seq_stop, | ||
412 | .show = probes_seq_show | ||
413 | }; | ||
414 | |||
415 | static int probes_open(struct inode *inode, struct file *file) | ||
416 | { | ||
417 | if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) | ||
418 | cleanup_all_probes(); | ||
419 | |||
420 | return seq_open(file, &probes_seq_op); | ||
421 | } | ||
422 | |||
423 | static ssize_t probes_write(struct file *file, const char __user *buffer, | ||
424 | size_t count, loff_t *ppos) | ||
425 | { | ||
426 | return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe); | ||
427 | } | ||
428 | |||
429 | static const struct file_operations uprobe_events_ops = { | ||
430 | .owner = THIS_MODULE, | ||
431 | .open = probes_open, | ||
432 | .read = seq_read, | ||
433 | .llseek = seq_lseek, | ||
434 | .release = seq_release, | ||
435 | .write = probes_write, | ||
436 | }; | ||
437 | |||
438 | /* Probes profiling interfaces */ | ||
439 | static int probes_profile_seq_show(struct seq_file *m, void *v) | ||
440 | { | ||
441 | struct trace_uprobe *tu = v; | ||
442 | |||
443 | seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->call.name, tu->nhit); | ||
444 | return 0; | ||
445 | } | ||
446 | |||
447 | static const struct seq_operations profile_seq_op = { | ||
448 | .start = probes_seq_start, | ||
449 | .next = probes_seq_next, | ||
450 | .stop = probes_seq_stop, | ||
451 | .show = probes_profile_seq_show | ||
452 | }; | ||
453 | |||
454 | static int profile_open(struct inode *inode, struct file *file) | ||
455 | { | ||
456 | return seq_open(file, &profile_seq_op); | ||
457 | } | ||
458 | |||
459 | static const struct file_operations uprobe_profile_ops = { | ||
460 | .owner = THIS_MODULE, | ||
461 | .open = profile_open, | ||
462 | .read = seq_read, | ||
463 | .llseek = seq_lseek, | ||
464 | .release = seq_release, | ||
465 | }; | ||
466 | |||
467 | /* uprobe handler */ | ||
468 | static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) | ||
469 | { | ||
470 | struct uprobe_trace_entry_head *entry; | ||
471 | struct ring_buffer_event *event; | ||
472 | struct ring_buffer *buffer; | ||
473 | u8 *data; | ||
474 | int size, i, pc; | ||
475 | unsigned long irq_flags; | ||
476 | struct ftrace_event_call *call = &tu->call; | ||
477 | |||
478 | tu->nhit++; | ||
479 | |||
480 | local_save_flags(irq_flags); | ||
481 | pc = preempt_count(); | ||
482 | |||
483 | size = sizeof(*entry) + tu->size; | ||
484 | |||
485 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, | ||
486 | size, irq_flags, pc); | ||
487 | if (!event) | ||
488 | return; | ||
489 | |||
490 | entry = ring_buffer_event_data(event); | ||
491 | entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); | ||
492 | data = (u8 *)&entry[1]; | ||
493 | for (i = 0; i < tu->nr_args; i++) | ||
494 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); | ||
495 | |||
496 | if (!filter_current_check_discard(buffer, call, entry, event)) | ||
497 | trace_buffer_unlock_commit(buffer, event, irq_flags, pc); | ||
498 | } | ||
499 | |||
500 | /* Event entry printers */ | ||
501 | static enum print_line_t | ||
502 | print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) | ||
503 | { | ||
504 | struct uprobe_trace_entry_head *field; | ||
505 | struct trace_seq *s = &iter->seq; | ||
506 | struct trace_uprobe *tu; | ||
507 | u8 *data; | ||
508 | int i; | ||
509 | |||
510 | field = (struct uprobe_trace_entry_head *)iter->ent; | ||
511 | tu = container_of(event, struct trace_uprobe, call.event); | ||
512 | |||
513 | if (!trace_seq_printf(s, "%s: (", tu->call.name)) | ||
514 | goto partial; | ||
515 | |||
516 | if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) | ||
517 | goto partial; | ||
518 | |||
519 | if (!trace_seq_puts(s, ")")) | ||
520 | goto partial; | ||
521 | |||
522 | data = (u8 *)&field[1]; | ||
523 | for (i = 0; i < tu->nr_args; i++) { | ||
524 | if (!tu->args[i].type->print(s, tu->args[i].name, | ||
525 | data + tu->args[i].offset, field)) | ||
526 | goto partial; | ||
527 | } | ||
528 | |||
529 | if (trace_seq_puts(s, "\n")) | ||
530 | return TRACE_TYPE_HANDLED; | ||
531 | |||
532 | partial: | ||
533 | return TRACE_TYPE_PARTIAL_LINE; | ||
534 | } | ||
535 | |||
536 | static int probe_event_enable(struct trace_uprobe *tu, int flag) | ||
537 | { | ||
538 | struct uprobe_trace_consumer *utc; | ||
539 | int ret = 0; | ||
540 | |||
541 | if (!tu->inode || tu->consumer) | ||
542 | return -EINTR; | ||
543 | |||
544 | utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); | ||
545 | if (!utc) | ||
546 | return -EINTR; | ||
547 | |||
548 | utc->cons.handler = uprobe_dispatcher; | ||
549 | utc->cons.filter = NULL; | ||
550 | ret = uprobe_register(tu->inode, tu->offset, &utc->cons); | ||
551 | if (ret) { | ||
552 | kfree(utc); | ||
553 | return ret; | ||
554 | } | ||
555 | |||
556 | tu->flags |= flag; | ||
557 | utc->tu = tu; | ||
558 | tu->consumer = utc; | ||
559 | |||
560 | return 0; | ||
561 | } | ||
562 | |||
563 | static void probe_event_disable(struct trace_uprobe *tu, int flag) | ||
564 | { | ||
565 | if (!tu->inode || !tu->consumer) | ||
566 | return; | ||
567 | |||
568 | uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); | ||
569 | tu->flags &= ~flag; | ||
570 | kfree(tu->consumer); | ||
571 | tu->consumer = NULL; | ||
572 | } | ||
573 | |||
574 | static int uprobe_event_define_fields(struct ftrace_event_call *event_call) | ||
575 | { | ||
576 | int ret, i; | ||
577 | struct uprobe_trace_entry_head field; | ||
578 | struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data; | ||
579 | |||
580 | DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); | ||
581 | /* Set argument names as fields */ | ||
582 | for (i = 0; i < tu->nr_args; i++) { | ||
583 | ret = trace_define_field(event_call, tu->args[i].type->fmttype, | ||
584 | tu->args[i].name, | ||
585 | sizeof(field) + tu->args[i].offset, | ||
586 | tu->args[i].type->size, | ||
587 | tu->args[i].type->is_signed, | ||
588 | FILTER_OTHER); | ||
589 | |||
590 | if (ret) | ||
591 | return ret; | ||
592 | } | ||
593 | return 0; | ||
594 | } | ||
595 | |||
596 | #define LEN_OR_ZERO (len ? len - pos : 0) | ||
597 | static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len) | ||
598 | { | ||
599 | const char *fmt, *arg; | ||
600 | int i; | ||
601 | int pos = 0; | ||
602 | |||
603 | fmt = "(%lx)"; | ||
604 | arg = "REC->" FIELD_STRING_IP; | ||
605 | |||
606 | /* When len=0, we just calculate the needed length */ | ||
607 | |||
608 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); | ||
609 | |||
610 | for (i = 0; i < tu->nr_args; i++) { | ||
611 | pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", | ||
612 | tu->args[i].name, tu->args[i].type->fmt); | ||
613 | } | ||
614 | |||
615 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); | ||
616 | |||
617 | for (i = 0; i < tu->nr_args; i++) { | ||
618 | pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", | ||
619 | tu->args[i].name); | ||
620 | } | ||
621 | |||
622 | return pos; /* return the length of print_fmt */ | ||
623 | } | ||
624 | #undef LEN_OR_ZERO | ||
625 | |||
626 | static int set_print_fmt(struct trace_uprobe *tu) | ||
627 | { | ||
628 | char *print_fmt; | ||
629 | int len; | ||
630 | |||
631 | /* First: called with 0 length to calculate the needed length */ | ||
632 | len = __set_print_fmt(tu, NULL, 0); | ||
633 | print_fmt = kmalloc(len + 1, GFP_KERNEL); | ||
634 | if (!print_fmt) | ||
635 | return -ENOMEM; | ||
636 | |||
637 | /* Second: actually write the @print_fmt */ | ||
638 | __set_print_fmt(tu, print_fmt, len + 1); | ||
639 | tu->call.print_fmt = print_fmt; | ||
640 | |||
641 | return 0; | ||
642 | } | ||
643 | |||
644 | #ifdef CONFIG_PERF_EVENTS | ||
645 | /* uprobe profile handler */ | ||
646 | static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) | ||
647 | { | ||
648 | struct ftrace_event_call *call = &tu->call; | ||
649 | struct uprobe_trace_entry_head *entry; | ||
650 | struct hlist_head *head; | ||
651 | u8 *data; | ||
652 | int size, __size, i; | ||
653 | int rctx; | ||
654 | |||
655 | __size = sizeof(*entry) + tu->size; | ||
656 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | ||
657 | size -= sizeof(u32); | ||
658 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) | ||
659 | return; | ||
660 | |||
661 | preempt_disable(); | ||
662 | |||
663 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); | ||
664 | if (!entry) | ||
665 | goto out; | ||
666 | |||
667 | entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); | ||
668 | data = (u8 *)&entry[1]; | ||
669 | for (i = 0; i < tu->nr_args; i++) | ||
670 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); | ||
671 | |||
672 | head = this_cpu_ptr(call->perf_events); | ||
673 | perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); | ||
674 | |||
675 | out: | ||
676 | preempt_enable(); | ||
677 | } | ||
678 | #endif /* CONFIG_PERF_EVENTS */ | ||
679 | |||
680 | static | ||
681 | int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) | ||
682 | { | ||
683 | struct trace_uprobe *tu = (struct trace_uprobe *)event->data; | ||
684 | |||
685 | switch (type) { | ||
686 | case TRACE_REG_REGISTER: | ||
687 | return probe_event_enable(tu, TP_FLAG_TRACE); | ||
688 | |||
689 | case TRACE_REG_UNREGISTER: | ||
690 | probe_event_disable(tu, TP_FLAG_TRACE); | ||
691 | return 0; | ||
692 | |||
693 | #ifdef CONFIG_PERF_EVENTS | ||
694 | case TRACE_REG_PERF_REGISTER: | ||
695 | return probe_event_enable(tu, TP_FLAG_PROFILE); | ||
696 | |||
697 | case TRACE_REG_PERF_UNREGISTER: | ||
698 | probe_event_disable(tu, TP_FLAG_PROFILE); | ||
699 | return 0; | ||
700 | #endif | ||
701 | default: | ||
702 | return 0; | ||
703 | } | ||
704 | return 0; | ||
705 | } | ||
706 | |||
707 | static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) | ||
708 | { | ||
709 | struct uprobe_trace_consumer *utc; | ||
710 | struct trace_uprobe *tu; | ||
711 | |||
712 | utc = container_of(con, struct uprobe_trace_consumer, cons); | ||
713 | tu = utc->tu; | ||
714 | if (!tu || tu->consumer != utc) | ||
715 | return 0; | ||
716 | |||
717 | if (tu->flags & TP_FLAG_TRACE) | ||
718 | uprobe_trace_func(tu, regs); | ||
719 | |||
720 | #ifdef CONFIG_PERF_EVENTS | ||
721 | if (tu->flags & TP_FLAG_PROFILE) | ||
722 | uprobe_perf_func(tu, regs); | ||
723 | #endif | ||
724 | return 0; | ||
725 | } | ||
726 | |||
727 | static struct trace_event_functions uprobe_funcs = { | ||
728 | .trace = print_uprobe_event | ||
729 | }; | ||
730 | |||
731 | static int register_uprobe_event(struct trace_uprobe *tu) | ||
732 | { | ||
733 | struct ftrace_event_call *call = &tu->call; | ||
734 | int ret; | ||
735 | |||
736 | /* Initialize ftrace_event_call */ | ||
737 | INIT_LIST_HEAD(&call->class->fields); | ||
738 | call->event.funcs = &uprobe_funcs; | ||
739 | call->class->define_fields = uprobe_event_define_fields; | ||
740 | |||
741 | if (set_print_fmt(tu) < 0) | ||
742 | return -ENOMEM; | ||
743 | |||
744 | ret = register_ftrace_event(&call->event); | ||
745 | if (!ret) { | ||
746 | kfree(call->print_fmt); | ||
747 | return -ENODEV; | ||
748 | } | ||
749 | call->flags = 0; | ||
750 | call->class->reg = trace_uprobe_register; | ||
751 | call->data = tu; | ||
752 | ret = trace_add_event_call(call); | ||
753 | |||
754 | if (ret) { | ||
755 | pr_info("Failed to register uprobe event: %s\n", call->name); | ||
756 | kfree(call->print_fmt); | ||
757 | unregister_ftrace_event(&call->event); | ||
758 | } | ||
759 | |||
760 | return ret; | ||
761 | } | ||
762 | |||
763 | static void unregister_uprobe_event(struct trace_uprobe *tu) | ||
764 | { | ||
765 | /* tu->event is unregistered in trace_remove_event_call() */ | ||
766 | trace_remove_event_call(&tu->call); | ||
767 | kfree(tu->call.print_fmt); | ||
768 | tu->call.print_fmt = NULL; | ||
769 | } | ||
770 | |||
771 | /* Make a trace interface for controlling probe points */ | ||
772 | static __init int init_uprobe_trace(void) | ||
773 | { | ||
774 | struct dentry *d_tracer; | ||
775 | |||
776 | d_tracer = tracing_init_dentry(); | ||
777 | if (!d_tracer) | ||
778 | return 0; | ||
779 | |||
780 | trace_create_file("uprobe_events", 0644, d_tracer, | ||
781 | NULL, &uprobe_events_ops); | ||
782 | /* Profile interface */ | ||
783 | trace_create_file("uprobe_profile", 0444, d_tracer, | ||
784 | NULL, &uprobe_profile_ops); | ||
785 | return 0; | ||
786 | } | ||
787 | |||
788 | fs_initcall(init_uprobe_trace); | ||
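The probe_event_enable()/probe_event_disable() paths above reduce to the inode-based uprobes API already visible in this file (kern_path(), igrab(), uprobe_register(), uprobe_unregister()). Below is a minimal sketch of a standalone consumer built from only those calls; the module, target path, offset and all demo_* names are illustrative assumptions, not part of this patch.

/*
 * Minimal sketch of a standalone uprobe consumer, mirroring the calls made
 * by probe_event_enable()/probe_event_disable() above. The target binary,
 * offset and all demo_* names are hypothetical, for illustration only.
 */
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/ptrace.h>
#include <linux/uprobes.h>

static struct inode *demo_inode;
static const loff_t demo_offset = 0x4710;	/* hypothetical offset into the binary */

static int demo_handler(struct uprobe_consumer *con, struct pt_regs *regs)
{
	/* Runs in the context of the task that hit the breakpoint. */
	pr_info("uprobe hit, ip=%lx\n", instruction_pointer(regs));
	return 0;
}

static struct uprobe_consumer demo_consumer = {
	.handler = demo_handler,
};

static int __init demo_init(void)
{
	struct path path;
	int ret;

	ret = kern_path("/bin/true", LOOKUP_FOLLOW, &path);	/* hypothetical target */
	if (ret)
		return ret;

	demo_inode = igrab(path.dentry->d_inode);
	path_put(&path);
	if (!demo_inode)
		return -EINVAL;

	ret = uprobe_register(demo_inode, demo_offset, &demo_consumer);
	if (ret)
		iput(demo_inode);
	return ret;
}

static void __exit demo_exit(void)
{
	uprobe_unregister(demo_inode, demo_offset, &demo_consumer);
	iput(demo_inode);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

With this patch applied, the same effect is available without a module by writing a definition such as "p:demo /bin/true:0x4710" (path and offset again hypothetical) into the new uprobe_events file and enabling the resulting event under events/uprobes/.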
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c deleted file mode 100644 index 209b379a4721..000000000000 --- a/kernel/trace/trace_workqueue.c +++ /dev/null | |||
@@ -1,300 +0,0 @@ | |||
1 | /* | ||
2 | * Workqueue statistical tracer. | ||
3 | * | ||
4 | * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> | ||
5 | * | ||
6 | */ | ||
7 | |||
8 | |||
9 | #include <trace/events/workqueue.h> | ||
10 | #include <linux/list.h> | ||
11 | #include <linux/percpu.h> | ||
12 | #include <linux/slab.h> | ||
13 | #include <linux/kref.h> | ||
14 | #include "trace_stat.h" | ||
15 | #include "trace.h" | ||
16 | |||
17 | |||
18 | /* A cpu workqueue thread */ | ||
19 | struct cpu_workqueue_stats { | ||
20 | struct list_head list; | ||
21 | struct kref kref; | ||
22 | int cpu; | ||
23 | pid_t pid; | ||
24 | /* Can be inserted from interrupt or user context, need to be atomic */ | ||
25 | atomic_t inserted; | ||
26 | /* | ||
27 | * Don't need to be atomic, works are serialized in a single workqueue thread | ||
28 | * on a single CPU. | ||
29 | */ | ||
30 | unsigned int executed; | ||
31 | }; | ||
32 | |||
33 | /* List of workqueue threads on one cpu */ | ||
34 | struct workqueue_global_stats { | ||
35 | struct list_head list; | ||
36 | spinlock_t lock; | ||
37 | }; | ||
38 | |||
39 | /* Don't need a global lock because allocated before the workqueues, and | ||
40 | * never freed. | ||
41 | */ | ||
42 | static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); | ||
43 | #define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) | ||
44 | |||
45 | static void cpu_workqueue_stat_free(struct kref *kref) | ||
46 | { | ||
47 | kfree(container_of(kref, struct cpu_workqueue_stats, kref)); | ||
48 | } | ||
49 | |||
50 | /* Insertion of a work */ | ||
51 | static void | ||
52 | probe_workqueue_insertion(void *ignore, | ||
53 | struct task_struct *wq_thread, | ||
54 | struct work_struct *work) | ||
55 | { | ||
56 | int cpu = cpumask_first(&wq_thread->cpus_allowed); | ||
57 | struct cpu_workqueue_stats *node; | ||
58 | unsigned long flags; | ||
59 | |||
60 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
61 | list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { | ||
62 | if (node->pid == wq_thread->pid) { | ||
63 | atomic_inc(&node->inserted); | ||
64 | goto found; | ||
65 | } | ||
66 | } | ||
67 | pr_debug("trace_workqueue: entry not found\n"); | ||
68 | found: | ||
69 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
70 | } | ||
71 | |||
72 | /* Execution of a work */ | ||
73 | static void | ||
74 | probe_workqueue_execution(void *ignore, | ||
75 | struct task_struct *wq_thread, | ||
76 | struct work_struct *work) | ||
77 | { | ||
78 | int cpu = cpumask_first(&wq_thread->cpus_allowed); | ||
79 | struct cpu_workqueue_stats *node; | ||
80 | unsigned long flags; | ||
81 | |||
82 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
83 | list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { | ||
84 | if (node->pid == wq_thread->pid) { | ||
85 | node->executed++; | ||
86 | goto found; | ||
87 | } | ||
88 | } | ||
89 | pr_debug("trace_workqueue: entry not found\n"); | ||
90 | found: | ||
91 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
92 | } | ||
93 | |||
94 | /* Creation of a cpu workqueue thread */ | ||
95 | static void probe_workqueue_creation(void *ignore, | ||
96 | struct task_struct *wq_thread, int cpu) | ||
97 | { | ||
98 | struct cpu_workqueue_stats *cws; | ||
99 | unsigned long flags; | ||
100 | |||
101 | WARN_ON(cpu < 0); | ||
102 | |||
103 | /* Workqueues are sometimes created in atomic context */ | ||
104 | cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC); | ||
105 | if (!cws) { | ||
106 | pr_warning("trace_workqueue: not enough memory\n"); | ||
107 | return; | ||
108 | } | ||
109 | INIT_LIST_HEAD(&cws->list); | ||
110 | kref_init(&cws->kref); | ||
111 | cws->cpu = cpu; | ||
112 | cws->pid = wq_thread->pid; | ||
113 | |||
114 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
115 | list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); | ||
116 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
117 | } | ||
118 | |||
119 | /* Destruction of a cpu workqueue thread */ | ||
120 | static void | ||
121 | probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread) | ||
122 | { | ||
123 | /* Workqueue only execute on one cpu */ | ||
124 | int cpu = cpumask_first(&wq_thread->cpus_allowed); | ||
125 | struct cpu_workqueue_stats *node, *next; | ||
126 | unsigned long flags; | ||
127 | |||
128 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
129 | list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, | ||
130 | list) { | ||
131 | if (node->pid == wq_thread->pid) { | ||
132 | list_del(&node->list); | ||
133 | kref_put(&node->kref, cpu_workqueue_stat_free); | ||
134 | goto found; | ||
135 | } | ||
136 | } | ||
137 | |||
138 | pr_debug("trace_workqueue: don't find workqueue to destroy\n"); | ||
139 | found: | ||
140 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
141 | |||
142 | } | ||
143 | |||
144 | static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) | ||
145 | { | ||
146 | unsigned long flags; | ||
147 | struct cpu_workqueue_stats *ret = NULL; | ||
148 | |||
149 | |||
150 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
151 | |||
152 | if (!list_empty(&workqueue_cpu_stat(cpu)->list)) { | ||
153 | ret = list_entry(workqueue_cpu_stat(cpu)->list.next, | ||
154 | struct cpu_workqueue_stats, list); | ||
155 | kref_get(&ret->kref); | ||
156 | } | ||
157 | |||
158 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
159 | |||
160 | return ret; | ||
161 | } | ||
162 | |||
163 | static void *workqueue_stat_start(struct tracer_stat *trace) | ||
164 | { | ||
165 | int cpu; | ||
166 | void *ret = NULL; | ||
167 | |||
168 | for_each_possible_cpu(cpu) { | ||
169 | ret = workqueue_stat_start_cpu(cpu); | ||
170 | if (ret) | ||
171 | return ret; | ||
172 | } | ||
173 | return NULL; | ||
174 | } | ||
175 | |||
176 | static void *workqueue_stat_next(void *prev, int idx) | ||
177 | { | ||
178 | struct cpu_workqueue_stats *prev_cws = prev; | ||
179 | struct cpu_workqueue_stats *ret; | ||
180 | int cpu = prev_cws->cpu; | ||
181 | unsigned long flags; | ||
182 | |||
183 | spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); | ||
184 | if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { | ||
185 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
186 | do { | ||
187 | cpu = cpumask_next(cpu, cpu_possible_mask); | ||
188 | if (cpu >= nr_cpu_ids) | ||
189 | return NULL; | ||
190 | } while (!(ret = workqueue_stat_start_cpu(cpu))); | ||
191 | return ret; | ||
192 | } else { | ||
193 | ret = list_entry(prev_cws->list.next, | ||
194 | struct cpu_workqueue_stats, list); | ||
195 | kref_get(&ret->kref); | ||
196 | } | ||
197 | spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); | ||
198 | |||
199 | return ret; | ||
200 | } | ||
201 | |||
202 | static int workqueue_stat_show(struct seq_file *s, void *p) | ||
203 | { | ||
204 | struct cpu_workqueue_stats *cws = p; | ||
205 | struct pid *pid; | ||
206 | struct task_struct *tsk; | ||
207 | |||
208 | pid = find_get_pid(cws->pid); | ||
209 | if (pid) { | ||
210 | tsk = get_pid_task(pid, PIDTYPE_PID); | ||
211 | if (tsk) { | ||
212 | seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, | ||
213 | atomic_read(&cws->inserted), cws->executed, | ||
214 | tsk->comm); | ||
215 | put_task_struct(tsk); | ||
216 | } | ||
217 | put_pid(pid); | ||
218 | } | ||
219 | |||
220 | return 0; | ||
221 | } | ||
222 | |||
223 | static void workqueue_stat_release(void *stat) | ||
224 | { | ||
225 | struct cpu_workqueue_stats *node = stat; | ||
226 | |||
227 | kref_put(&node->kref, cpu_workqueue_stat_free); | ||
228 | } | ||
229 | |||
230 | static int workqueue_stat_headers(struct seq_file *s) | ||
231 | { | ||
232 | seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); | ||
233 | seq_printf(s, "# | | | |\n"); | ||
234 | return 0; | ||
235 | } | ||
236 | |||
237 | struct tracer_stat workqueue_stats __read_mostly = { | ||
238 | .name = "workqueues", | ||
239 | .stat_start = workqueue_stat_start, | ||
240 | .stat_next = workqueue_stat_next, | ||
241 | .stat_show = workqueue_stat_show, | ||
242 | .stat_release = workqueue_stat_release, | ||
243 | .stat_headers = workqueue_stat_headers | ||
244 | }; | ||
245 | |||
246 | |||
247 | int __init stat_workqueue_init(void) | ||
248 | { | ||
249 | if (register_stat_tracer(&workqueue_stats)) { | ||
250 | pr_warning("Unable to register workqueue stat tracer\n"); | ||
251 | return 1; | ||
252 | } | ||
253 | |||
254 | return 0; | ||
255 | } | ||
256 | fs_initcall(stat_workqueue_init); | ||
257 | |||
258 | /* | ||
259 | * Workqueues are created very early, just after pre-smp initcalls. | ||
260 | * So we must register our tracepoints at this stage. | ||
261 | */ | ||
262 | int __init trace_workqueue_early_init(void) | ||
263 | { | ||
264 | int ret, cpu; | ||
265 | |||
266 | for_each_possible_cpu(cpu) { | ||
267 | spin_lock_init(&workqueue_cpu_stat(cpu)->lock); | ||
268 | INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); | ||
269 | } | ||
270 | |||
271 | ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); | ||
272 | if (ret) | ||
273 | goto out; | ||
274 | |||
275 | ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL); | ||
276 | if (ret) | ||
277 | goto no_insertion; | ||
278 | |||
279 | ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL); | ||
280 | if (ret) | ||
281 | goto no_execution; | ||
282 | |||
283 | ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL); | ||
284 | if (ret) | ||
285 | goto no_creation; | ||
286 | |||
287 | return 0; | ||
288 | |||
289 | no_creation: | ||
290 | unregister_trace_workqueue_creation(probe_workqueue_creation, NULL); | ||
291 | no_execution: | ||
292 | unregister_trace_workqueue_execution(probe_workqueue_execution, NULL); | ||
293 | no_insertion: | ||
294 | unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL); | ||
295 | out: | ||
296 | pr_warning("trace_workqueue: unable to trace workqueues\n"); | ||
297 | |||
298 | return 1; | ||
299 | } | ||
300 | early_initcall(trace_workqueue_early_init); | ||
diff --git a/kernel/uid16.c b/kernel/uid16.c index 51c6e89e8619..d7948eb10225 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
@@ -81,14 +81,19 @@ SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) | |||
81 | return ret; | 81 | return ret; |
82 | } | 82 | } |
83 | 83 | ||
84 | SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid) | 84 | SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp) |
85 | { | 85 | { |
86 | const struct cred *cred = current_cred(); | 86 | const struct cred *cred = current_cred(); |
87 | int retval; | 87 | int retval; |
88 | old_uid_t ruid, euid, suid; | ||
88 | 89 | ||
89 | if (!(retval = put_user(high2lowuid(cred->uid), ruid)) && | 90 | ruid = high2lowuid(from_kuid_munged(cred->user_ns, cred->uid)); |
90 | !(retval = put_user(high2lowuid(cred->euid), euid))) | 91 | euid = high2lowuid(from_kuid_munged(cred->user_ns, cred->euid)); |
91 | retval = put_user(high2lowuid(cred->suid), suid); | 92 | suid = high2lowuid(from_kuid_munged(cred->user_ns, cred->suid)); |
93 | |||
94 | if (!(retval = put_user(ruid, ruidp)) && | ||
95 | !(retval = put_user(euid, euidp))) | ||
96 | retval = put_user(suid, suidp); | ||
92 | 97 | ||
93 | return retval; | 98 | return retval; |
94 | } | 99 | } |
@@ -103,14 +108,19 @@ SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) | |||
103 | } | 108 | } |
104 | 109 | ||
105 | 110 | ||
106 | SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid) | 111 | SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp) |
107 | { | 112 | { |
108 | const struct cred *cred = current_cred(); | 113 | const struct cred *cred = current_cred(); |
109 | int retval; | 114 | int retval; |
115 | old_gid_t rgid, egid, sgid; | ||
116 | |||
117 | rgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->gid)); | ||
118 | egid = high2lowgid(from_kgid_munged(cred->user_ns, cred->egid)); | ||
119 | sgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->sgid)); | ||
110 | 120 | ||
111 | if (!(retval = put_user(high2lowgid(cred->gid), rgid)) && | 121 | if (!(retval = put_user(rgid, rgidp)) && |
112 | !(retval = put_user(high2lowgid(cred->egid), egid))) | 122 | !(retval = put_user(egid, egidp))) |
113 | retval = put_user(high2lowgid(cred->sgid), sgid); | 123 | retval = put_user(sgid, sgidp); |
114 | 124 | ||
115 | return retval; | 125 | return retval; |
116 | } | 126 | } |
@@ -134,11 +144,14 @@ SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) | |||
134 | static int groups16_to_user(old_gid_t __user *grouplist, | 144 | static int groups16_to_user(old_gid_t __user *grouplist, |
135 | struct group_info *group_info) | 145 | struct group_info *group_info) |
136 | { | 146 | { |
147 | struct user_namespace *user_ns = current_user_ns(); | ||
137 | int i; | 148 | int i; |
138 | old_gid_t group; | 149 | old_gid_t group; |
150 | kgid_t kgid; | ||
139 | 151 | ||
140 | for (i = 0; i < group_info->ngroups; i++) { | 152 | for (i = 0; i < group_info->ngroups; i++) { |
141 | group = high2lowgid(GROUP_AT(group_info, i)); | 153 | kgid = GROUP_AT(group_info, i); |
154 | group = high2lowgid(from_kgid_munged(user_ns, kgid)); | ||
142 | if (put_user(group, grouplist+i)) | 155 | if (put_user(group, grouplist+i)) |
143 | return -EFAULT; | 156 | return -EFAULT; |
144 | } | 157 | } |
@@ -149,13 +162,20 @@ static int groups16_to_user(old_gid_t __user *grouplist, | |||
149 | static int groups16_from_user(struct group_info *group_info, | 162 | static int groups16_from_user(struct group_info *group_info, |
150 | old_gid_t __user *grouplist) | 163 | old_gid_t __user *grouplist) |
151 | { | 164 | { |
165 | struct user_namespace *user_ns = current_user_ns(); | ||
152 | int i; | 166 | int i; |
153 | old_gid_t group; | 167 | old_gid_t group; |
168 | kgid_t kgid; | ||
154 | 169 | ||
155 | for (i = 0; i < group_info->ngroups; i++) { | 170 | for (i = 0; i < group_info->ngroups; i++) { |
156 | if (get_user(group, grouplist+i)) | 171 | if (get_user(group, grouplist+i)) |
157 | return -EFAULT; | 172 | return -EFAULT; |
158 | GROUP_AT(group_info, i) = low2highgid(group); | 173 | |
174 | kgid = make_kgid(user_ns, low2highgid(group)); | ||
175 | if (!gid_valid(kgid)) | ||
176 | return -EINVAL; | ||
177 | |||
178 | GROUP_AT(group_info, i) = kgid; | ||
159 | } | 179 | } |
160 | 180 | ||
161 | return 0; | 181 | return 0; |
@@ -211,20 +231,20 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) | |||
211 | 231 | ||
212 | SYSCALL_DEFINE0(getuid16) | 232 | SYSCALL_DEFINE0(getuid16) |
213 | { | 233 | { |
214 | return high2lowuid(current_uid()); | 234 | return high2lowuid(from_kuid_munged(current_user_ns(), current_uid())); |
215 | } | 235 | } |
216 | 236 | ||
217 | SYSCALL_DEFINE0(geteuid16) | 237 | SYSCALL_DEFINE0(geteuid16) |
218 | { | 238 | { |
219 | return high2lowuid(current_euid()); | 239 | return high2lowuid(from_kuid_munged(current_user_ns(), current_euid())); |
220 | } | 240 | } |
221 | 241 | ||
222 | SYSCALL_DEFINE0(getgid16) | 242 | SYSCALL_DEFINE0(getgid16) |
223 | { | 243 | { |
224 | return high2lowgid(current_gid()); | 244 | return high2lowgid(from_kgid_munged(current_user_ns(), current_gid())); |
225 | } | 245 | } |
226 | 246 | ||
227 | SYSCALL_DEFINE0(getegid16) | 247 | SYSCALL_DEFINE0(getegid16) |
228 | { | 248 | { |
229 | return high2lowgid(current_egid()); | 249 | return high2lowgid(from_kgid_munged(current_user_ns(), current_egid())); |
230 | } | 250 | } |
diff --git a/kernel/user.c b/kernel/user.c index 71dd2363ab0f..b815fefbe76f 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -22,10 +22,27 @@ | |||
22 | * and 1 for... ? | 22 | * and 1 for... ? |
23 | */ | 23 | */ |
24 | struct user_namespace init_user_ns = { | 24 | struct user_namespace init_user_ns = { |
25 | .uid_map = { | ||
26 | .nr_extents = 1, | ||
27 | .extent[0] = { | ||
28 | .first = 0, | ||
29 | .lower_first = 0, | ||
30 | .count = 4294967295U, | ||
31 | }, | ||
32 | }, | ||
33 | .gid_map = { | ||
34 | .nr_extents = 1, | ||
35 | .extent[0] = { | ||
36 | .first = 0, | ||
37 | .lower_first = 0, | ||
38 | .count = 4294967295U, | ||
39 | }, | ||
40 | }, | ||
25 | .kref = { | 41 | .kref = { |
26 | .refcount = ATOMIC_INIT(3), | 42 | .refcount = ATOMIC_INIT(3), |
27 | }, | 43 | }, |
28 | .creator = &root_user, | 44 | .owner = GLOBAL_ROOT_UID, |
45 | .group = GLOBAL_ROOT_GID, | ||
29 | }; | 46 | }; |
30 | EXPORT_SYMBOL_GPL(init_user_ns); | 47 | EXPORT_SYMBOL_GPL(init_user_ns); |
31 | 48 | ||
@@ -34,11 +51,14 @@ EXPORT_SYMBOL_GPL(init_user_ns); | |||
34 | * when changing user ID's (ie setuid() and friends). | 51 | * when changing user ID's (ie setuid() and friends). |
35 | */ | 52 | */ |
36 | 53 | ||
54 | #define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7) | ||
55 | #define UIDHASH_SZ (1 << UIDHASH_BITS) | ||
37 | #define UIDHASH_MASK (UIDHASH_SZ - 1) | 56 | #define UIDHASH_MASK (UIDHASH_SZ - 1) |
38 | #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) | 57 | #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) |
39 | #define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid))) | 58 | #define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) |
40 | 59 | ||
41 | static struct kmem_cache *uid_cachep; | 60 | static struct kmem_cache *uid_cachep; |
61 | struct hlist_head uidhash_table[UIDHASH_SZ]; | ||
42 | 62 | ||
43 | /* | 63 | /* |
44 | * The uidhash_lock is mostly taken from process context, but it is | 64 | * The uidhash_lock is mostly taken from process context, but it is |
@@ -51,14 +71,14 @@ static struct kmem_cache *uid_cachep; | |||
51 | */ | 71 | */ |
52 | static DEFINE_SPINLOCK(uidhash_lock); | 72 | static DEFINE_SPINLOCK(uidhash_lock); |
53 | 73 | ||
54 | /* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */ | 74 | /* root_user.__count is 1, for init task cred */ |
55 | struct user_struct root_user = { | 75 | struct user_struct root_user = { |
56 | .__count = ATOMIC_INIT(2), | 76 | .__count = ATOMIC_INIT(1), |
57 | .processes = ATOMIC_INIT(1), | 77 | .processes = ATOMIC_INIT(1), |
58 | .files = ATOMIC_INIT(0), | 78 | .files = ATOMIC_INIT(0), |
59 | .sigpending = ATOMIC_INIT(0), | 79 | .sigpending = ATOMIC_INIT(0), |
60 | .locked_shm = 0, | 80 | .locked_shm = 0, |
61 | .user_ns = &init_user_ns, | 81 | .uid = GLOBAL_ROOT_UID, |
62 | }; | 82 | }; |
63 | 83 | ||
64 | /* | 84 | /* |
@@ -72,16 +92,15 @@ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) | |||
72 | static void uid_hash_remove(struct user_struct *up) | 92 | static void uid_hash_remove(struct user_struct *up) |
73 | { | 93 | { |
74 | hlist_del_init(&up->uidhash_node); | 94 | hlist_del_init(&up->uidhash_node); |
75 | put_user_ns(up->user_ns); | ||
76 | } | 95 | } |
77 | 96 | ||
78 | static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) | 97 | static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) |
79 | { | 98 | { |
80 | struct user_struct *user; | 99 | struct user_struct *user; |
81 | struct hlist_node *h; | 100 | struct hlist_node *h; |
82 | 101 | ||
83 | hlist_for_each_entry(user, h, hashent, uidhash_node) { | 102 | hlist_for_each_entry(user, h, hashent, uidhash_node) { |
84 | if (user->uid == uid) { | 103 | if (uid_eq(user->uid, uid)) { |
85 | atomic_inc(&user->__count); | 104 | atomic_inc(&user->__count); |
86 | return user; | 105 | return user; |
87 | } | 106 | } |
@@ -110,14 +129,13 @@ static void free_user(struct user_struct *up, unsigned long flags) | |||
110 | * | 129 | * |
111 | * If the user_struct could not be found, return NULL. | 130 | * If the user_struct could not be found, return NULL. |
112 | */ | 131 | */ |
113 | struct user_struct *find_user(uid_t uid) | 132 | struct user_struct *find_user(kuid_t uid) |
114 | { | 133 | { |
115 | struct user_struct *ret; | 134 | struct user_struct *ret; |
116 | unsigned long flags; | 135 | unsigned long flags; |
117 | struct user_namespace *ns = current_user_ns(); | ||
118 | 136 | ||
119 | spin_lock_irqsave(&uidhash_lock, flags); | 137 | spin_lock_irqsave(&uidhash_lock, flags); |
120 | ret = uid_hash_find(uid, uidhashentry(ns, uid)); | 138 | ret = uid_hash_find(uid, uidhashentry(uid)); |
121 | spin_unlock_irqrestore(&uidhash_lock, flags); | 139 | spin_unlock_irqrestore(&uidhash_lock, flags); |
122 | return ret; | 140 | return ret; |
123 | } | 141 | } |
@@ -136,9 +154,9 @@ void free_uid(struct user_struct *up) | |||
136 | local_irq_restore(flags); | 154 | local_irq_restore(flags); |
137 | } | 155 | } |
138 | 156 | ||
139 | struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | 157 | struct user_struct *alloc_uid(kuid_t uid) |
140 | { | 158 | { |
141 | struct hlist_head *hashent = uidhashentry(ns, uid); | 159 | struct hlist_head *hashent = uidhashentry(uid); |
142 | struct user_struct *up, *new; | 160 | struct user_struct *up, *new; |
143 | 161 | ||
144 | spin_lock_irq(&uidhash_lock); | 162 | spin_lock_irq(&uidhash_lock); |
@@ -153,8 +171,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
153 | new->uid = uid; | 171 | new->uid = uid; |
154 | atomic_set(&new->__count, 1); | 172 | atomic_set(&new->__count, 1); |
155 | 173 | ||
156 | new->user_ns = get_user_ns(ns); | ||
157 | |||
158 | /* | 174 | /* |
159 | * Before adding this, check whether we raced | 175 | * Before adding this, check whether we raced |
160 | * on adding the same user already.. | 176 | * on adding the same user already.. |
@@ -162,7 +178,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) | |||
162 | spin_lock_irq(&uidhash_lock); | 178 | spin_lock_irq(&uidhash_lock); |
163 | up = uid_hash_find(uid, hashent); | 179 | up = uid_hash_find(uid, hashent); |
164 | if (up) { | 180 | if (up) { |
165 | put_user_ns(ns); | ||
166 | key_put(new->uid_keyring); | 181 | key_put(new->uid_keyring); |
167 | key_put(new->session_keyring); | 182 | key_put(new->session_keyring); |
168 | kmem_cache_free(uid_cachep, new); | 183 | kmem_cache_free(uid_cachep, new); |
@@ -187,11 +202,11 @@ static int __init uid_cache_init(void) | |||
187 | 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); | 202 | 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
188 | 203 | ||
189 | for(n = 0; n < UIDHASH_SZ; ++n) | 204 | for(n = 0; n < UIDHASH_SZ; ++n) |
190 | INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); | 205 | INIT_HLIST_HEAD(uidhash_table + n); |
191 | 206 | ||
192 | /* Insert the root user immediately (init already runs as root) */ | 207 | /* Insert the root user immediately (init already runs as root) */ |
193 | spin_lock_irq(&uidhash_lock); | 208 | spin_lock_irq(&uidhash_lock); |
194 | uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0)); | 209 | uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); |
195 | spin_unlock_irq(&uidhash_lock); | 210 | spin_unlock_irq(&uidhash_lock); |
196 | 211 | ||
197 | return 0; | 212 | return 0; |
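With user.c now keying the global uidhash_table on kuid_t, the raw value is only extracted for hashing (__kuid_val()) and every comparison goes through uid_eq(). A minimal sketch of that hash-and-compare pattern is shown below; all demo_* names are hypothetical.

/*
 * Sketch of the kuid_t hash-and-compare pattern used by uidhashentry()
 * and uid_hash_find() above. All demo_* names are illustrative only.
 */
#include <linux/list.h>
#include <linux/uidgid.h>

#define DEMO_HASH_BITS	7
#define DEMO_HASH_SZ	(1 << DEMO_HASH_BITS)
#define DEMO_HASH_MASK	(DEMO_HASH_SZ - 1)

struct demo_user {
	struct hlist_node node;
	kuid_t uid;
};

static struct hlist_head demo_table[DEMO_HASH_SZ];

static struct hlist_head *demo_bucket(kuid_t uid)
{
	uid_t val = __kuid_val(uid);	/* raw value is used for hashing only */

	return &demo_table[((val >> DEMO_HASH_BITS) + val) & DEMO_HASH_MASK];
}

static struct demo_user *demo_find(kuid_t uid)
{
	struct demo_user *user;
	struct hlist_node *h;

	/* Compare through uid_eq() so the code stays correct whether kuid_t
	 * is a bare integer or a type-safe struct wrapper. */
	hlist_for_each_entry(user, h, demo_bucket(uid), node)
		if (uid_eq(user->uid, uid))
			return user;

	return NULL;
}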
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 3b906e98b1db..86602316422d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -11,9 +11,20 @@ | |||
11 | #include <linux/user_namespace.h> | 11 | #include <linux/user_namespace.h> |
12 | #include <linux/highuid.h> | 12 | #include <linux/highuid.h> |
13 | #include <linux/cred.h> | 13 | #include <linux/cred.h> |
14 | #include <linux/securebits.h> | ||
15 | #include <linux/keyctl.h> | ||
16 | #include <linux/key-type.h> | ||
17 | #include <keys/user-type.h> | ||
18 | #include <linux/seq_file.h> | ||
19 | #include <linux/fs.h> | ||
20 | #include <linux/uaccess.h> | ||
21 | #include <linux/ctype.h> | ||
14 | 22 | ||
15 | static struct kmem_cache *user_ns_cachep __read_mostly; | 23 | static struct kmem_cache *user_ns_cachep __read_mostly; |
16 | 24 | ||
25 | static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, | ||
26 | struct uid_gid_map *map); | ||
27 | |||
17 | /* | 28 | /* |
18 | * Create a new user namespace, deriving the creator from the user in the | 29 | * Create a new user namespace, deriving the creator from the user in the |
19 | * passed credentials, and replacing that user with the new root user for the | 30 | * passed credentials, and replacing that user with the new root user for the |
@@ -24,109 +35,565 @@ static struct kmem_cache *user_ns_cachep __read_mostly; | |||
24 | */ | 35 | */ |
25 | int create_user_ns(struct cred *new) | 36 | int create_user_ns(struct cred *new) |
26 | { | 37 | { |
27 | struct user_namespace *ns; | 38 | struct user_namespace *ns, *parent_ns = new->user_ns; |
28 | struct user_struct *root_user; | 39 | kuid_t owner = new->euid; |
29 | int n; | 40 | kgid_t group = new->egid; |
41 | |||
42 | /* The creator needs a mapping in the parent user namespace | ||
43 | * or else we won't be able to reasonably tell userspace who | ||
44 | * created a user_namespace. | ||
45 | */ | ||
46 | if (!kuid_has_mapping(parent_ns, owner) || | ||
47 | !kgid_has_mapping(parent_ns, group)) | ||
48 | return -EPERM; | ||
30 | 49 | ||
31 | ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL); | 50 | ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); |
32 | if (!ns) | 51 | if (!ns) |
33 | return -ENOMEM; | 52 | return -ENOMEM; |
34 | 53 | ||
35 | kref_init(&ns->kref); | 54 | kref_init(&ns->kref); |
55 | ns->parent = parent_ns; | ||
56 | ns->owner = owner; | ||
57 | ns->group = group; | ||
36 | 58 | ||
37 | for (n = 0; n < UIDHASH_SZ; ++n) | 59 | /* Start with the same capabilities as init but useless for doing |
38 | INIT_HLIST_HEAD(ns->uidhash_table + n); | 60 | * anything as the capabilities are bound to the new user namespace. |
39 | 61 | */ | |
40 | /* Alloc new root user. */ | 62 | new->securebits = SECUREBITS_DEFAULT; |
41 | root_user = alloc_uid(ns, 0); | 63 | new->cap_inheritable = CAP_EMPTY_SET; |
42 | if (!root_user) { | 64 | new->cap_permitted = CAP_FULL_SET; |
43 | kmem_cache_free(user_ns_cachep, ns); | 65 | new->cap_effective = CAP_FULL_SET; |
44 | return -ENOMEM; | 66 | new->cap_bset = CAP_FULL_SET; |
45 | } | ||
46 | |||
47 | /* set the new root user in the credentials under preparation */ | ||
48 | ns->creator = new->user; | ||
49 | new->user = root_user; | ||
50 | new->uid = new->euid = new->suid = new->fsuid = 0; | ||
51 | new->gid = new->egid = new->sgid = new->fsgid = 0; | ||
52 | put_group_info(new->group_info); | ||
53 | new->group_info = get_group_info(&init_groups); | ||
54 | #ifdef CONFIG_KEYS | 67 | #ifdef CONFIG_KEYS |
55 | key_put(new->request_key_auth); | 68 | key_put(new->request_key_auth); |
56 | new->request_key_auth = NULL; | 69 | new->request_key_auth = NULL; |
57 | #endif | 70 | #endif |
58 | /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ | 71 | /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ |
59 | 72 | ||
60 | /* root_user holds a reference to ns, our reference can be dropped */ | 73 | /* Leave the new->user_ns reference with the new user namespace. */ |
61 | put_user_ns(ns); | 74 | /* Leave the reference to our user_ns with the new cred. */ |
75 | new->user_ns = ns; | ||
62 | 76 | ||
63 | return 0; | 77 | return 0; |
64 | } | 78 | } |
65 | 79 | ||
66 | /* | 80 | void free_user_ns(struct kref *kref) |
67 | * Deferred destructor for a user namespace. This is required because | ||
68 | * free_user_ns() may be called with uidhash_lock held, but we need to call | ||
69 | * back to free_uid() which will want to take the lock again. | ||
70 | */ | ||
71 | static void free_user_ns_work(struct work_struct *work) | ||
72 | { | 81 | { |
73 | struct user_namespace *ns = | 82 | struct user_namespace *parent, *ns = |
74 | container_of(work, struct user_namespace, destroyer); | 83 | container_of(kref, struct user_namespace, kref); |
75 | free_uid(ns->creator); | 84 | |
85 | parent = ns->parent; | ||
76 | kmem_cache_free(user_ns_cachep, ns); | 86 | kmem_cache_free(user_ns_cachep, ns); |
87 | put_user_ns(parent); | ||
77 | } | 88 | } |
89 | EXPORT_SYMBOL(free_user_ns); | ||
78 | 90 | ||
79 | void free_user_ns(struct kref *kref) | 91 | static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) |
80 | { | 92 | { |
81 | struct user_namespace *ns = | 93 | unsigned idx, extents; |
82 | container_of(kref, struct user_namespace, kref); | 94 | u32 first, last, id2; |
95 | |||
96 | id2 = id + count - 1; | ||
97 | |||
98 | /* Find the matching extent */ | ||
99 | extents = map->nr_extents; | ||
100 | smp_read_barrier_depends(); | ||
101 | for (idx = 0; idx < extents; idx++) { | ||
102 | first = map->extent[idx].first; | ||
103 | last = first + map->extent[idx].count - 1; | ||
104 | if (id >= first && id <= last && | ||
105 | (id2 >= first && id2 <= last)) | ||
106 | break; | ||
107 | } | ||
108 | /* Map the id or note failure */ | ||
109 | if (idx < extents) | ||
110 | id = (id - first) + map->extent[idx].lower_first; | ||
111 | else | ||
112 | id = (u32) -1; | ||
83 | 113 | ||
84 | INIT_WORK(&ns->destroyer, free_user_ns_work); | 114 | return id; |
85 | schedule_work(&ns->destroyer); | ||
86 | } | 115 | } |
87 | EXPORT_SYMBOL(free_user_ns); | ||
88 | 116 | ||
89 | uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid) | 117 | static u32 map_id_down(struct uid_gid_map *map, u32 id) |
90 | { | 118 | { |
91 | struct user_namespace *tmp; | 119 | unsigned idx, extents; |
120 | u32 first, last; | ||
92 | 121 | ||
93 | if (likely(to == cred->user->user_ns)) | 122 | /* Find the matching extent */ |
94 | return uid; | 123 | extents = map->nr_extents; |
124 | smp_read_barrier_depends(); | ||
125 | for (idx = 0; idx < extents; idx++) { | ||
126 | first = map->extent[idx].first; | ||
127 | last = first + map->extent[idx].count - 1; | ||
128 | if (id >= first && id <= last) | ||
129 | break; | ||
130 | } | ||
131 | /* Map the id or note failure */ | ||
132 | if (idx < extents) | ||
133 | id = (id - first) + map->extent[idx].lower_first; | ||
134 | else | ||
135 | id = (u32) -1; | ||
95 | 136 | ||
137 | return id; | ||
138 | } | ||
96 | 139 | ||
97 | /* Is cred->user the creator of the target user_ns | 140 | static u32 map_id_up(struct uid_gid_map *map, u32 id) |
98 | * or the creator of one of it's parents? | 141 | { |
99 | */ | 142 | unsigned idx, extents; |
100 | for ( tmp = to; tmp != &init_user_ns; | 143 | u32 first, last; |
101 | tmp = tmp->creator->user_ns ) { | 144 | |
102 | if (cred->user == tmp->creator) { | 145 | /* Find the matching extent */ |
103 | return (uid_t)0; | 146 | extents = map->nr_extents; |
104 | } | 147 | smp_read_barrier_depends(); |
148 | for (idx = 0; idx < extents; idx++) { | ||
149 | first = map->extent[idx].lower_first; | ||
150 | last = first + map->extent[idx].count - 1; | ||
151 | if (id >= first && id <= last) | ||
152 | break; | ||
105 | } | 153 | } |
154 | /* Map the id or note failure */ | ||
155 | if (idx < extents) | ||
156 | id = (id - first) + map->extent[idx].first; | ||
157 | else | ||
158 | id = (u32) -1; | ||
159 | |||
160 | return id; | ||
161 | } | ||
162 | |||
163 | /** | ||
164 | * make_kuid - Map a user-namespace uid pair into a kuid. | ||
165 | * @ns: User namespace that the uid is in | ||
166 | * @uid: User identifier | ||
167 | * | ||
168 | * Maps a user-namespace uid pair into a kernel internal kuid, | ||
169 | * and returns that kuid. | ||
170 | * | ||
171 | * When there is no mapping defined for the user-namespace uid | ||
172 | * pair INVALID_UID is returned. Callers are expected to test | ||
173 | * for and handle INVALID_UID being returned. INVALID_UID | ||
174 | * may be tested for using uid_valid(). | ||
175 | */ | ||
176 | kuid_t make_kuid(struct user_namespace *ns, uid_t uid) | ||
177 | { | ||
178 | /* Map the uid to a global kernel uid */ | ||
179 | return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); | ||
180 | } | ||
181 | EXPORT_SYMBOL(make_kuid); | ||
182 | |||
183 | /** | ||
184 | * from_kuid - Create a uid from a kuid user-namespace pair. | ||
185 | * @targ: The user namespace we want a uid in. | ||
186 | * @kuid: The kernel internal uid to start with. | ||
187 | * | ||
188 | * Map @kuid into the user-namespace specified by @targ and | ||
189 | * return the resulting uid. | ||
190 | * | ||
191 | * There is always a mapping into the initial user_namespace. | ||
192 | * | ||
193 | * If @kuid has no mapping in @targ (uid_t)-1 is returned. | ||
194 | */ | ||
195 | uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) | ||
196 | { | ||
197 | /* Map the uid from a global kernel uid */ | ||
198 | return map_id_up(&targ->uid_map, __kuid_val(kuid)); | ||
199 | } | ||
200 | EXPORT_SYMBOL(from_kuid); | ||
106 | 201 | ||
107 | /* No useful relationship so no mapping */ | 202 | /** |
108 | return overflowuid; | 203 | * from_kuid_munged - Create a uid from a kuid user-namespace pair. |
204 | * @targ: The user namespace we want a uid in. | ||
205 | * @kuid: The kernel internal uid to start with. | ||
206 | * | ||
207 | * Map @kuid into the user-namespace specified by @targ and | ||
208 | * return the resulting uid. | ||
209 | * | ||
210 | * There is always a mapping into the initial user_namespace. | ||
211 | * | ||
212 | * Unlike from_kuid, from_kuid_munged never fails and always | ||
213 | * returns a valid uid. This makes from_kuid_munged appropriate | ||
214 | * for use in syscalls like stat and getuid where failing the | ||
215 | * system call and failing to provide a valid uid are not | ||
216 | * options. | ||
217 | * | ||
218 | * If @kuid has no mapping in @targ overflowuid is returned. | ||
219 | */ | ||
220 | uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) | ||
221 | { | ||
222 | uid_t uid; | ||
223 | uid = from_kuid(targ, kuid); | ||
224 | |||
225 | if (uid == (uid_t) -1) | ||
226 | uid = overflowuid; | ||
227 | return uid; | ||
109 | } | 228 | } |
229 | EXPORT_SYMBOL(from_kuid_munged); | ||
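A hedged sketch of the calling pattern the kuid helpers above are meant to support: caller-facing uids go through make_kuid() and are validity-checked, while reporting paths use from_kuid_munged(). The demo_* functions and the chosen headers are assumptions made only for this illustration.

	/*
	 * Hypothetical caller, shown only to illustrate the intended use of
	 * the helpers exported above; not part of the patch.
	 */
	#include <linux/uidgid.h>
	#include <linux/user_namespace.h>
	#include <linux/cred.h>
	#include <linux/errno.h>

	static int demo_set_owner(uid_t uid, kuid_t *owner)
	{
		/* Convert the caller-relative uid into a kernel-internal kuid. */
		kuid_t kuid = make_kuid(current_user_ns(), uid);

		/* make_kuid() returns INVALID_UID when no mapping exists. */
		if (!uid_valid(kuid))
			return -EINVAL;

		*owner = kuid;
		return 0;
	}

	static uid_t demo_report_owner(kuid_t owner)
	{
		/*
		 * Reporting paths (stat-like interfaces) use the munged variant:
		 * it never fails, an unmapped kuid degrades to overflowuid.
		 */
		return from_kuid_munged(current_user_ns(), owner);
	}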
110 | 230 | ||
111 | gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid) | 231 | /** |
232 | * make_kgid - Map a user-namespace gid pair into a kgid. | ||
233 | * @ns: User namespace that the gid is in | ||
234 | * @gid: group identifier | ||
235 | * | ||
236 | * Maps a user-namespace gid pair into a kernel internal kgid, | ||
237 | * and returns that kgid. | ||
238 | * | ||
239 | * When there is no mapping defined for the user-namespace gid | ||
240 | * pair INVALID_GID is returned. Callers are expected to test | ||
241 | * for and handle INVALID_GID being returned. INVALID_GID may be | ||
242 | * tested for using gid_valid(). | ||
243 | */ | ||
244 | kgid_t make_kgid(struct user_namespace *ns, gid_t gid) | ||
112 | { | 245 | { |
113 | struct user_namespace *tmp; | 246 | /* Map the gid to a global kernel gid */ |
247 | return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); | ||
248 | } | ||
249 | EXPORT_SYMBOL(make_kgid); | ||
114 | 250 | ||
115 | if (likely(to == cred->user->user_ns)) | 251 | /** |
116 | return gid; | 252 | * from_kgid - Create a gid from a kgid user-namespace pair. |
253 | * @targ: The user namespace we want a gid in. | ||
254 | * @kgid: The kernel internal gid to start with. | ||
255 | * | ||
256 | * Map @kgid into the user-namespace specified by @targ and | ||
257 | * return the resulting gid. | ||
258 | * | ||
259 | * There is always a mapping into the initial user_namespace. | ||
260 | * | ||
261 | * If @kgid has no mapping in @targ (gid_t)-1 is returned. | ||
262 | */ | ||
263 | gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) | ||
264 | { | ||
265 | /* Map the gid from a global kernel gid */ | ||
266 | return map_id_up(&targ->gid_map, __kgid_val(kgid)); | ||
267 | } | ||
268 | EXPORT_SYMBOL(from_kgid); | ||
269 | |||
270 | /** | ||
271 | * from_kgid_munged - Create a gid from a kgid user-namespace pair. | ||
272 | * @targ: The user namespace we want a gid in. | ||
273 | * @kgid: The kernel internal gid to start with. | ||
274 | * | ||
275 | * Map @kgid into the user-namespace specified by @targ and | ||
276 | * return the resulting gid. | ||
277 | * | ||
278 | * There is always a mapping into the initial user_namespace. | ||
279 | * | ||
280 | * Unlike from_kgid, from_kgid_munged never fails and always | ||
281 | * returns a valid gid. This makes from_kgid_munged appropriate | ||
282 | * for use in syscalls like stat and getgid where failing the | ||
283 | * system call and failing to provide a valid gid are not options. | ||
284 | * | ||
285 | * If @kgid has no mapping in @targ overflowgid is returned. | ||
286 | */ | ||
287 | gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) | ||
288 | { | ||
289 | gid_t gid; | ||
290 | gid = from_kgid(targ, kgid); | ||
117 | 291 | ||
118 | /* Is cred->user the creator of the target user_ns | 292 | if (gid == (gid_t) -1) |
119 | * or the creator of one of it's parents? | 293 | gid = overflowgid; |
294 | return gid; | ||
295 | } | ||
296 | EXPORT_SYMBOL(from_kgid_munged); | ||
297 | |||
298 | static int uid_m_show(struct seq_file *seq, void *v) | ||
299 | { | ||
300 | struct user_namespace *ns = seq->private; | ||
301 | struct uid_gid_extent *extent = v; | ||
302 | struct user_namespace *lower_ns; | ||
303 | uid_t lower; | ||
304 | |||
305 | lower_ns = current_user_ns(); | ||
306 | if ((lower_ns == ns) && lower_ns->parent) | ||
307 | lower_ns = lower_ns->parent; | ||
308 | |||
309 | lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); | ||
310 | |||
311 | seq_printf(seq, "%10u %10u %10u\n", | ||
312 | extent->first, | ||
313 | lower, | ||
314 | extent->count); | ||
315 | |||
316 | return 0; | ||
317 | } | ||
318 | |||
319 | static int gid_m_show(struct seq_file *seq, void *v) | ||
320 | { | ||
321 | struct user_namespace *ns = seq->private; | ||
322 | struct uid_gid_extent *extent = v; | ||
323 | struct user_namespace *lower_ns; | ||
324 | gid_t lower; | ||
325 | |||
326 | lower_ns = current_user_ns(); | ||
327 | if ((lower_ns == ns) && lower_ns->parent) | ||
328 | lower_ns = lower_ns->parent; | ||
329 | |||
330 | lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); | ||
331 | |||
332 | seq_printf(seq, "%10u %10u %10u\n", | ||
333 | extent->first, | ||
334 | lower, | ||
335 | extent->count); | ||
336 | |||
337 | return 0; | ||
338 | } | ||
339 | |||
340 | static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) | ||
341 | { | ||
342 | struct uid_gid_extent *extent = NULL; | ||
343 | loff_t pos = *ppos; | ||
344 | |||
345 | if (pos < map->nr_extents) | ||
346 | extent = &map->extent[pos]; | ||
347 | |||
348 | return extent; | ||
349 | } | ||
350 | |||
351 | static void *uid_m_start(struct seq_file *seq, loff_t *ppos) | ||
352 | { | ||
353 | struct user_namespace *ns = seq->private; | ||
354 | |||
355 | return m_start(seq, ppos, &ns->uid_map); | ||
356 | } | ||
357 | |||
358 | static void *gid_m_start(struct seq_file *seq, loff_t *ppos) | ||
359 | { | ||
360 | struct user_namespace *ns = seq->private; | ||
361 | |||
362 | return m_start(seq, ppos, &ns->gid_map); | ||
363 | } | ||
364 | |||
365 | static void *m_next(struct seq_file *seq, void *v, loff_t *pos) | ||
366 | { | ||
367 | (*pos)++; | ||
368 | return seq->op->start(seq, pos); | ||
369 | } | ||
370 | |||
371 | static void m_stop(struct seq_file *seq, void *v) | ||
372 | { | ||
373 | return; | ||
374 | } | ||
375 | |||
376 | struct seq_operations proc_uid_seq_operations = { | ||
377 | .start = uid_m_start, | ||
378 | .stop = m_stop, | ||
379 | .next = m_next, | ||
380 | .show = uid_m_show, | ||
381 | }; | ||
382 | |||
383 | struct seq_operations proc_gid_seq_operations = { | ||
384 | .start = gid_m_start, | ||
385 | .stop = m_stop, | ||
386 | .next = m_next, | ||
387 | .show = gid_m_show, | ||
388 | }; | ||
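The proc hookup for these seq_operations lives outside this hunk; below is a hedged sketch of how an open routine could attach the namespace that uid_m_start()/uid_m_show() expect to find in seq->private. The function name and the way the namespace is chosen here are assumptions for illustration only.

	/*
	 * Hedged sketch of a proc open routine feeding these seq_operations;
	 * the real hookup is in fs/proc and may differ.
	 */
	#include <linux/seq_file.h>
	#include <linux/user_namespace.h>
	#include <linux/cred.h>
	#include <linux/fs.h>

	static int demo_uid_map_open(struct inode *inode, struct file *file)
	{
		struct user_namespace *ns = current_user_ns();	/* assumed target ns */
		int ret;

		ret = seq_open(file, &proc_uid_seq_operations);
		if (!ret) {
			struct seq_file *seq = file->private_data;

			/* uid_m_start()/uid_m_show() expect the namespace here. */
			seq->private = ns;
		}
		return ret;
	}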
389 | |||
390 | static DEFINE_MUTEX(id_map_mutex); | ||
391 | |||
392 | static ssize_t map_write(struct file *file, const char __user *buf, | ||
393 | size_t count, loff_t *ppos, | ||
394 | int cap_setid, | ||
395 | struct uid_gid_map *map, | ||
396 | struct uid_gid_map *parent_map) | ||
397 | { | ||
398 | struct seq_file *seq = file->private_data; | ||
399 | struct user_namespace *ns = seq->private; | ||
400 | struct uid_gid_map new_map; | ||
401 | unsigned idx; | ||
402 | struct uid_gid_extent *extent, *last = NULL; | ||
403 | unsigned long page = 0; | ||
404 | char *kbuf, *pos, *next_line; | ||
405 | ssize_t ret = -EINVAL; | ||
406 | |||
407 | /* | ||
408 | * The id_map_mutex serializes all writes to any given map. | ||
409 | * | ||
410 | * Any map is only ever written once. | ||
411 | * | ||
412 | * An id map fits within 1 cache line on most architectures. | ||
413 | * | ||
414 | * On read nothing needs to be done unless you are on an | ||
415 | * architecture with a crazy cache coherency model like alpha. | ||
416 | * | ||
417 | * There is a one time data dependency between reading the | ||
418 | * count of the extents and the values of the extents. The | ||
419 | * desired behavior is to see the values of the extents that | ||
420 | * were written before the count of the extents. | ||
421 | * | ||
422 | * To achieve this, smp_wmb() is used to guarantee the write | ||
423 | * order and smp_read_barrier_depends() ensures that we | ||
424 | * don't have crazy architectures returning stale data. | ||
425 | * | ||
120 | */ | 426 | */ |
121 | for ( tmp = to; tmp != &init_user_ns; | 427 | mutex_lock(&id_map_mutex); |
122 | tmp = tmp->creator->user_ns ) { | 428 | |
123 | if (cred->user == tmp->creator) { | 429 | ret = -EPERM; |
124 | return (gid_t)0; | 430 | /* Only allow one successful write to the map */ |
431 | if (map->nr_extents != 0) | ||
432 | goto out; | ||
433 | |||
434 | /* Require the appropriate privilege CAP_SETUID or CAP_SETGID | ||
435 | * over the user namespace in order to set the id mapping. | ||
436 | */ | ||
437 | if (!ns_capable(ns, cap_setid)) | ||
438 | goto out; | ||
439 | |||
440 | /* Get a buffer */ | ||
441 | ret = -ENOMEM; | ||
442 | page = __get_free_page(GFP_TEMPORARY); | ||
443 | kbuf = (char *) page; | ||
444 | if (!page) | ||
445 | goto out; | ||
446 | |||
447 | /* Only allow <= page size writes at the beginning of the file */ | ||
448 | ret = -EINVAL; | ||
449 | if ((*ppos != 0) || (count >= PAGE_SIZE)) | ||
450 | goto out; | ||
451 | |||
452 | /* Slurp in the user data */ | ||
453 | ret = -EFAULT; | ||
454 | if (copy_from_user(kbuf, buf, count)) | ||
455 | goto out; | ||
456 | kbuf[count] = '\0'; | ||
457 | |||
458 | /* Parse the user data */ | ||
459 | ret = -EINVAL; | ||
460 | pos = kbuf; | ||
461 | new_map.nr_extents = 0; | ||
462 | for (;pos; pos = next_line) { | ||
463 | extent = &new_map.extent[new_map.nr_extents]; | ||
464 | |||
465 | /* Find the end of line and ensure I don't look past it */ | ||
466 | next_line = strchr(pos, '\n'); | ||
467 | if (next_line) { | ||
468 | *next_line = '\0'; | ||
469 | next_line++; | ||
470 | if (*next_line == '\0') | ||
471 | next_line = NULL; | ||
125 | } | 472 | } |
473 | |||
474 | pos = skip_spaces(pos); | ||
475 | extent->first = simple_strtoul(pos, &pos, 10); | ||
476 | if (!isspace(*pos)) | ||
477 | goto out; | ||
478 | |||
479 | pos = skip_spaces(pos); | ||
480 | extent->lower_first = simple_strtoul(pos, &pos, 10); | ||
481 | if (!isspace(*pos)) | ||
482 | goto out; | ||
483 | |||
484 | pos = skip_spaces(pos); | ||
485 | extent->count = simple_strtoul(pos, &pos, 10); | ||
486 | if (*pos && !isspace(*pos)) | ||
487 | goto out; | ||
488 | |||
490 | /* Verify there is no trailing junk on the line */ | ||
490 | pos = skip_spaces(pos); | ||
491 | if (*pos != '\0') | ||
492 | goto out; | ||
493 | |||
494 | /* Verify we have been given valid starting values */ | ||
495 | if ((extent->first == (u32) -1) || | ||
496 | (extent->lower_first == (u32) -1 )) | ||
497 | goto out; | ||
498 | |||
499 | /* Verify count is not zero and does not cause the extent to wrap */ | ||
500 | if ((extent->first + extent->count) <= extent->first) | ||
501 | goto out; | ||
502 | if ((extent->lower_first + extent->count) <= extent->lower_first) | ||
503 | goto out; | ||
504 | |||
505 | /* For now only accept extents that are strictly in order */ | ||
506 | if (last && | ||
507 | (((last->first + last->count) > extent->first) || | ||
508 | ((last->lower_first + last->count) > extent->lower_first))) | ||
509 | goto out; | ||
510 | |||
511 | new_map.nr_extents++; | ||
512 | last = extent; | ||
513 | |||
514 | /* Fail if the file contains too many extents */ | ||
515 | if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && | ||
516 | (next_line != NULL)) | ||
517 | goto out; | ||
518 | } | ||
519 | /* Be very certain the new map actually exists */ | ||
520 | if (new_map.nr_extents == 0) | ||
521 | goto out; | ||
522 | |||
523 | ret = -EPERM; | ||
524 | /* Validate the user is allowed to use the user ids being mapped to. */ | ||
525 | if (!new_idmap_permitted(ns, cap_setid, &new_map)) | ||
526 | goto out; | ||
527 | |||
528 | /* Map the lower ids from the parent user namespace to the | ||
529 | * kernel global id space. | ||
530 | */ | ||
531 | for (idx = 0; idx < new_map.nr_extents; idx++) { | ||
532 | u32 lower_first; | ||
533 | extent = &new_map.extent[idx]; | ||
534 | |||
535 | lower_first = map_id_range_down(parent_map, | ||
536 | extent->lower_first, | ||
537 | extent->count); | ||
538 | |||
539 | /* Fail if we can not map the specified extent to | ||
540 | * the kernel global id space. | ||
541 | */ | ||
542 | if (lower_first == (u32) -1) | ||
543 | goto out; | ||
544 | |||
545 | extent->lower_first = lower_first; | ||
126 | } | 546 | } |
127 | 547 | ||
128 | /* No useful relationship so no mapping */ | 548 | /* Install the map */ |
129 | return overflowgid; | 549 | memcpy(map->extent, new_map.extent, |
550 | new_map.nr_extents*sizeof(new_map.extent[0])); | ||
551 | smp_wmb(); | ||
552 | map->nr_extents = new_map.nr_extents; | ||
553 | |||
554 | *ppos = count; | ||
555 | ret = count; | ||
556 | out: | ||
557 | mutex_unlock(&id_map_mutex); | ||
558 | if (page) | ||
559 | free_page(page); | ||
560 | return ret; | ||
561 | } | ||
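For reference, map_write() above parses one or more "first lower_first count" lines in a single write. The userspace sketch below feeds it one extent; the /proc/self/uid_map path is an assumption based on the proc_uid_map_write() handler name, and the write is rejected once a map has been set or without the required capability.

	/*
	 * Minimal userspace sketch of writing one extent in the
	 * "first lower_first count" format parsed by map_write() above.
	 */
	#include <stdio.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		/* Namespace uid 0 maps to parent uid 100000, for 65536 consecutive ids. */
		const char line[] = "0 100000 65536\n";
		int fd = open("/proc/self/uid_map", O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, line, strlen(line)) != (ssize_t) strlen(line))
			perror("write");	/* e.g. EPERM if the map was already set */
		close(fd);
		return 0;
	}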
562 | |||
563 | ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) | ||
564 | { | ||
565 | struct seq_file *seq = file->private_data; | ||
566 | struct user_namespace *ns = seq->private; | ||
567 | |||
568 | if (!ns->parent) | ||
569 | return -EPERM; | ||
570 | |||
571 | return map_write(file, buf, size, ppos, CAP_SETUID, | ||
572 | &ns->uid_map, &ns->parent->uid_map); | ||
573 | } | ||
574 | |||
575 | ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) | ||
576 | { | ||
577 | struct seq_file *seq = file->private_data; | ||
578 | struct user_namespace *ns = seq->private; | ||
579 | |||
580 | if (!ns->parent) | ||
581 | return -EPERM; | ||
582 | |||
583 | return map_write(file, buf, size, ppos, CAP_SETGID, | ||
584 | &ns->gid_map, &ns->parent->gid_map); | ||
585 | } | ||
586 | |||
587 | static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, | ||
588 | struct uid_gid_map *new_map) | ||
589 | { | ||
590 | /* Allow the specified ids if we have the appropriate capability | ||
591 | * (CAP_SETUID or CAP_SETGID) over the parent user namespace. | ||
592 | */ | ||
593 | if (ns_capable(ns->parent, cap_setid)) | ||
594 | return true; | ||
595 | |||
596 | return false; | ||
130 | } | 597 | } |
131 | 598 | ||
132 | static __init int user_namespaces_init(void) | 599 | static __init int user_namespaces_init(void) |
diff --git a/kernel/utsname.c b/kernel/utsname.c index 405caf91aad5..679d97a5d3fd 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
@@ -43,7 +43,7 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, | |||
43 | 43 | ||
44 | down_read(&uts_sem); | 44 | down_read(&uts_sem); |
45 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); | 45 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); |
46 | ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); | 46 | ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); |
47 | up_read(&uts_sem); | 47 | up_read(&uts_sem); |
48 | return ns; | 48 | return ns; |
49 | } | 49 | } |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index df30ee08bdd4..4b1dfba70f7c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <linux/sysctl.h> | 24 | #include <linux/sysctl.h> |
25 | 25 | ||
26 | #include <asm/irq_regs.h> | 26 | #include <asm/irq_regs.h> |
27 | #include <linux/kvm_para.h> | ||
27 | #include <linux/perf_event.h> | 28 | #include <linux/perf_event.h> |
28 | 29 | ||
29 | int watchdog_enabled = 1; | 30 | int watchdog_enabled = 1; |
@@ -280,6 +281,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
280 | __this_cpu_write(softlockup_touch_sync, false); | 281 | __this_cpu_write(softlockup_touch_sync, false); |
281 | sched_clock_tick(); | 282 | sched_clock_tick(); |
282 | } | 283 | } |
284 | |||
285 | /* Clear the guest paused flag on watchdog reset */ | ||
286 | kvm_check_and_clear_guest_paused(); | ||
283 | __touch_watchdog(); | 287 | __touch_watchdog(); |
284 | return HRTIMER_RESTART; | 288 | return HRTIMER_RESTART; |
285 | } | 289 | } |
@@ -292,6 +296,14 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
292 | */ | 296 | */ |
293 | duration = is_softlockup(touch_ts); | 297 | duration = is_softlockup(touch_ts); |
294 | if (unlikely(duration)) { | 298 | if (unlikely(duration)) { |
299 | /* | ||
300 | * If a virtual machine is stopped by the host it can look to | ||
301 | * the watchdog like a soft lockup, check to see if the host | ||
302 | * stopped the vm before we issue the warning | ||
303 | */ | ||
304 | if (kvm_check_and_clear_guest_paused()) | ||
305 | return HRTIMER_RESTART; | ||
306 | |||
295 | /* only warn once */ | 307 | /* only warn once */ |
296 | if (__this_cpu_read(soft_watchdog_warn) == true) | 308 | if (__this_cpu_read(soft_watchdog_warn) == true) |
297 | return HRTIMER_RESTART; | 309 | return HRTIMER_RESTART; |
@@ -360,6 +372,13 @@ static int watchdog(void *unused) | |||
360 | 372 | ||
361 | 373 | ||
362 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 374 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
375 | /* | ||
376 | * People like the simple clean cpu node info on boot. | ||
377 | * Reduce the watchdog noise by only printing messages | ||
378 | * that are different from what cpu0 displayed. | ||
379 | */ | ||
380 | static unsigned long cpu0_err; | ||
381 | |||
363 | static int watchdog_nmi_enable(int cpu) | 382 | static int watchdog_nmi_enable(int cpu) |
364 | { | 383 | { |
365 | struct perf_event_attr *wd_attr; | 384 | struct perf_event_attr *wd_attr; |
@@ -378,11 +397,21 @@ static int watchdog_nmi_enable(int cpu) | |||
378 | 397 | ||
379 | /* Try to register using hardware perf events */ | 398 | /* Try to register using hardware perf events */ |
380 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | 399 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); |
400 | |||
401 | /* save cpu0 error for future comparison */ | ||
402 | if (cpu == 0 && IS_ERR(event)) | ||
403 | cpu0_err = PTR_ERR(event); | ||
404 | |||
381 | if (!IS_ERR(event)) { | 405 | if (!IS_ERR(event)) { |
382 | pr_info("enabled, takes one hw-pmu counter.\n"); | 406 | /* only print for cpu0 or different than cpu0 */ |
407 | if (cpu == 0 || cpu0_err) | ||
408 | pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); | ||
383 | goto out_save; | 409 | goto out_save; |
384 | } | 410 | } |
385 | 411 | ||
412 | /* skip displaying the same error again */ | ||
413 | if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) | ||
414 | return PTR_ERR(event); | ||
386 | 415 | ||
387 | /* vary the KERN level based on the returned errno */ | 416 | /* vary the KERN level based on the returned errno */ |
388 | if (PTR_ERR(event) == -EOPNOTSUPP) | 417 | if (PTR_ERR(event) == -EOPNOTSUPP) |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5abf42f63c08..9a3128dc67df 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -1032,7 +1032,10 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
1032 | cwq = get_cwq(gcwq->cpu, wq); | 1032 | cwq = get_cwq(gcwq->cpu, wq); |
1033 | trace_workqueue_queue_work(cpu, cwq, work); | 1033 | trace_workqueue_queue_work(cpu, cwq, work); |
1034 | 1034 | ||
1035 | BUG_ON(!list_empty(&work->entry)); | 1035 | if (WARN_ON(!list_empty(&work->entry))) { |
1036 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
1037 | return; | ||
1038 | } | ||
1036 | 1039 | ||
1037 | cwq->nr_in_flight[cwq->work_color]++; | 1040 | cwq->nr_in_flight[cwq->work_color]++; |
1038 | work_flags = work_color_to_flags(cwq->work_color); | 1041 | work_flags = work_color_to_flags(cwq->work_color); |
@@ -1210,8 +1213,13 @@ static void worker_enter_idle(struct worker *worker) | |||
1210 | } else | 1213 | } else |
1211 | wake_up_all(&gcwq->trustee_wait); | 1214 | wake_up_all(&gcwq->trustee_wait); |
1212 | 1215 | ||
1213 | /* sanity check nr_running */ | 1216 | /* |
1214 | WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && | 1217 | * Sanity check nr_running. Because trustee releases gcwq->lock |
1218 | * between setting %WORKER_ROGUE and zapping nr_running, the | ||
1219 | * warning may trigger spuriously. Check iff trustee is idle. | ||
1220 | */ | ||
1221 | WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && | ||
1222 | gcwq->nr_workers == gcwq->nr_idle && | ||
1215 | atomic_read(get_gcwq_nr_running(gcwq->cpu))); | 1223 | atomic_read(get_gcwq_nr_running(gcwq->cpu))); |
1216 | } | 1224 | } |
1217 | 1225 | ||
@@ -1810,7 +1818,9 @@ __acquires(&gcwq->lock) | |||
1810 | * lock freed" warnings as well as problems when looking into | 1818 | * lock freed" warnings as well as problems when looking into |
1811 | * work->lockdep_map, make a copy and use that here. | 1819 | * work->lockdep_map, make a copy and use that here. |
1812 | */ | 1820 | */ |
1813 | struct lockdep_map lockdep_map = work->lockdep_map; | 1821 | struct lockdep_map lockdep_map; |
1822 | |||
1823 | lockdep_copy_map(&lockdep_map, &work->lockdep_map); | ||
1814 | #endif | 1824 | #endif |
1815 | /* | 1825 | /* |
1816 | * A single work shouldn't be executed concurrently by | 1826 | * A single work shouldn't be executed concurrently by |
@@ -2506,6 +2516,9 @@ bool flush_work(struct work_struct *work) | |||
2506 | { | 2516 | { |
2507 | struct wq_barrier barr; | 2517 | struct wq_barrier barr; |
2508 | 2518 | ||
2519 | lock_map_acquire(&work->lockdep_map); | ||
2520 | lock_map_release(&work->lockdep_map); | ||
2521 | |||
2509 | if (start_flush_work(work, &barr, true)) { | 2522 | if (start_flush_work(work, &barr, true)) { |
2510 | wait_for_completion(&barr.done); | 2523 | wait_for_completion(&barr.done); |
2511 | destroy_work_on_stack(&barr.work); | 2524 | destroy_work_on_stack(&barr.work); |