author		Tejun Heo <tj@kernel.org>	2012-04-01 15:30:01 -0400
committer	Tejun Heo <tj@kernel.org>	2012-04-01 15:55:00 -0400
commit		959d851caa48829eb85cb85aa949fd6b4c5d5bc6 (patch)
tree		3ba9c94ec346275fb44c4f0d1cd2537cdff8d811 /kernel
parent		a5567932fc926739e29e98487128080f40c61710 (diff)
parent		48ddbe194623ae089cc0576e60363f2d2e85662a (diff)
Merge branch 'for-3.5' of ../cgroup into block/for-3.5/core-merged
cgroup/for-3.5 contains the following changes which blk-cgroup needs to
proceed with the on-going cleanup.

* Dynamic addition and removal of cftypes to make config/stat file
  handling modular for policies.

* cgroup removal update to not wait for css references to drain to fix
  blkcg removal hang caused by cfq caching cfqgs.

Pull in cgroup/for-3.5 into block/for-3.5/core.  This causes the
following conflicts in block/blk-cgroup.c.

* 761b3ef50e "cgroup: remove cgroup_subsys argument from callbacks"
  conflicts with blkiocg_pre_destroy() addition and blkiocg_attach()
  removal.  Resolved by removing @subsys from all subsys methods.

* 676f7c8f84 "cgroup: relocate cftype and cgroup_subsys definitions in
  controllers" conflicts with ->pre_destroy() and ->attach() updates
  and removal of modular config.  Resolved by dropping forward
  declarations of the methods and applying updates to the relocated
  blkio_subsys.

* 4baf6e3325 "cgroup: convert all non-memcg controllers to the new
  cftype interface" builds upon the previous item.  Resolved by adding
  ->base_cftypes to the relocated blkio_subsys.

Signed-off-by: Tejun Heo <tj@kernel.org>
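As context for the first conflict item above: commit 761b3ef50e drops the
subsystem argument from every cgroup_subsys callback, which is why the
resolution removes @subsys from all methods (the cgroup.c hunks below show
ss->pre_destroy(ss, cgrp) becoming ss->pre_destroy(cgrp), for instance).
The following is only a minimal, self-contained C sketch of that signature
change; the struct and function names are simplified stand-ins, not the
kernel's actual definitions.

#include <stdio.h>

struct cgroup;	/* opaque stand-in for the kernel's struct cgroup */

/* Old callback style: every method also received the owning subsystem. */
struct old_subsys_ops {
	int (*pre_destroy)(struct old_subsys_ops *ss, struct cgroup *cgrp);
};

/* New callback style after "remove cgroup_subsys argument from callbacks":
 * the method only receives the cgroup it operates on. */
struct new_subsys_ops {
	int (*pre_destroy)(struct cgroup *cgrp);
};

static int example_pre_destroy(struct cgroup *cgrp)
{
	(void)cgrp;	/* illustrative only; nothing to tear down here */
	printf("pre_destroy invoked without a subsys argument\n");
	return 0;
}

int main(void)
{
	struct new_subsys_ops ops = { .pre_destroy = example_pre_destroy };

	/* Callers now pass just the cgroup, mirroring ss->pre_destroy(cgrp). */
	return ops.pre_destroy(NULL);
}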
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 4
-rw-r--r--  kernel/Kconfig.preempt | 1
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/audit.c | 2
-rw-r--r--  kernel/cgroup.c | 904
-rw-r--r--  kernel/cgroup_freezer.c | 22
-rw-r--r--  kernel/compat.c | 68
-rw-r--r--  kernel/cpuset.c | 111
-rw-r--r--  kernel/cred.c | 1
-rw-r--r--  kernel/debug/debug_core.c | 34
-rw-r--r--  kernel/debug/gdbstub.c | 10
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 7
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 1
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_keyboard.c | 95
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 3
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 7
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 4
-rw-r--r--  kernel/dma.c | 1
-rw-r--r--  kernel/events/core.c | 266
-rw-r--r--  kernel/events/hw_breakpoint.c | 17
-rw-r--r--  kernel/exit.c | 70
-rw-r--r--  kernel/fork.c | 101
-rw-r--r--  kernel/freezer.c | 6
-rw-r--r--  kernel/futex.c | 89
-rw-r--r--  kernel/futex_compat.c | 38
-rw-r--r--  kernel/hung_task.c | 11
-rw-r--r--  kernel/irq/Kconfig | 15
-rw-r--r--  kernel/irq/autoprobe.c | 4
-rw-r--r--  kernel/irq/chip.c | 47
-rw-r--r--  kernel/irq/handle.c | 28
-rw-r--r--  kernel/irq/internals.h | 4
-rw-r--r--  kernel/irq/irqdomain.c | 828
-rw-r--r--  kernel/irq/manage.c | 149
-rw-r--r--  kernel/irq/migration.c | 10
-rw-r--r--  kernel/jump_label.c | 135
-rw-r--r--  kernel/kexec.c | 15
-rw-r--r--  kernel/kmod.c | 84
-rw-r--r--  kernel/kprobes.c | 12
-rw-r--r--  kernel/lockdep.c | 8
-rw-r--r--  kernel/module.c | 37
-rw-r--r--  kernel/mutex.c | 4
-rw-r--r--  kernel/padata.c | 44
-rw-r--r--  kernel/params.c | 40
-rw-r--r--  kernel/pid.c | 4
-rw-r--r--  kernel/pid_namespace.c | 41
-rw-r--r--  kernel/power/Makefile | 3
-rw-r--r--  kernel/power/hibernate.c | 47
-rw-r--r--  kernel/power/main.c | 20
-rw-r--r--  kernel/power/power.h | 7
-rw-r--r--  kernel/power/process.c | 24
-rw-r--r--  kernel/power/qos.c | 23
-rw-r--r--  kernel/power/snapshot.c | 35
-rw-r--r--  kernel/power/suspend.c | 84
-rw-r--r--  kernel/power/user.c | 12
-rw-r--r--  kernel/printk.c | 51
-rw-r--r--  kernel/ptrace.c | 66
-rw-r--r--  kernel/rcu.h | 26
-rw-r--r--  kernel/rcupdate.c | 5
-rw-r--r--  kernel/rcutiny.c | 26
-rw-r--r--  kernel/rcutiny_plugin.h | 77
-rw-r--r--  kernel/rcutorture.c | 91
-rw-r--r--  kernel/rcutree.c | 507
-rw-r--r--  kernel/rcutree.h | 27
-rw-r--r--  kernel/rcutree_plugin.h | 450
-rw-r--r--  kernel/rcutree_trace.c | 12
-rw-r--r--  kernel/resource.c | 3
-rw-r--r--  kernel/rwsem.c | 1
-rw-r--r--  kernel/sched/auto_group.c | 12
-rw-r--r--  kernel/sched/core.c | 255
-rw-r--r--  kernel/sched/debug.c | 1
-rw-r--r--  kernel/sched/fair.c | 418
-rw-r--r--  kernel/sched/rt.c | 45
-rw-r--r--  kernel/sched/sched.h | 32
-rw-r--r--  kernel/sched/stats.c | 4
-rw-r--r--  kernel/signal.c | 56
-rw-r--r--  kernel/smp.c | 90
-rw-r--r--  kernel/softirq.c | 34
-rw-r--r--  kernel/spinlock.c | 2
-rw-r--r--  kernel/srcu.c | 33
-rw-r--r--  kernel/sys.c | 19
-rw-r--r--  kernel/sysctl.c | 514
-rw-r--r--  kernel/sysctl_check.c | 160
-rw-r--r--  kernel/time.c | 6
-rw-r--r--  kernel/time/alarmtimer.c | 8
-rw-r--r--  kernel/time/clocksource.c | 2
-rw-r--r--  kernel/time/ntp.c | 191
-rw-r--r--  kernel/time/tick-broadcast.c | 4
-rw-r--r--  kernel/time/tick-sched.c | 17
-rw-r--r--  kernel/time/timekeeping.c | 373
-rw-r--r--  kernel/trace/Kconfig | 2
-rw-r--r--  kernel/trace/ftrace.c | 137
-rw-r--r--  kernel/trace/ring_buffer.c | 157
-rw-r--r--  kernel/trace/trace.c | 119
-rw-r--r--  kernel/trace/trace.h | 37
-rw-r--r--  kernel/trace/trace_entries.h | 70
-rw-r--r--  kernel/trace/trace_event_perf.c | 208
-rw-r--r--  kernel/trace/trace_events.c | 12
-rw-r--r--  kernel/trace/trace_events_filter.c | 175
-rw-r--r--  kernel/trace/trace_export.c | 66
-rw-r--r--  kernel/trace/trace_kprobe.c | 8
-rw-r--r--  kernel/trace/trace_output.c | 14
-rw-r--r--  kernel/trace/trace_syscalls.c | 22
-rw-r--r--  kernel/tracepoint.c | 20
-rw-r--r--  kernel/watchdog.c | 51
-rw-r--r--  kernel/workqueue.c | 22
106 files changed, 5181 insertions, 3197 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 5068e2a4e75f..2251882daf53 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -124,8 +124,8 @@ config INLINE_SPIN_LOCK_IRQSAVE
 	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
 		 ARCH_INLINE_SPIN_LOCK_IRQSAVE
 
-config INLINE_SPIN_UNLOCK
-	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
+config UNINLINE_SPIN_UNLOCK
+	bool
 
 config INLINE_SPIN_UNLOCK_BH
 	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 24e7cb0ba26a..3f9c97419f02 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -36,6 +36,7 @@ config PREEMPT_VOLUNTARY
 config PREEMPT
 	bool "Preemptible Kernel (Low-Latency Desktop)"
 	select PREEMPT_COUNT
+	select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
 	help
 	  This option reduces the latency of the kernel by making
 	  all kernel code (that is not executing in a critical section)
diff --git a/kernel/Makefile b/kernel/Makefile
index 2d9de86b7e76..cb41b9547c9f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -27,7 +27,6 @@ obj-y += power/
 
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
-obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
diff --git a/kernel/audit.c b/kernel/audit.c
index bb0eb5bb9a0a..1c7f2c61416b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1418,7 +1418,7 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
 
 /* This is a helper-function to print the escaped d_path */
 void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
-		      struct path *path)
+		      const struct path *path)
 {
 	char *p, *pathname;
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a5d3b5325f77..2905977e0f33 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,6 +63,9 @@
63 63
64#include <linux/atomic.h> 64#include <linux/atomic.h>
65 65
66/* css deactivation bias, makes css->refcnt negative to deny new trygets */
67#define CSS_DEACT_BIAS INT_MIN
68
66/* 69/*
67 * cgroup_mutex is the master lock. Any modification to cgroup or its 70 * cgroup_mutex is the master lock. Any modification to cgroup or its
68 * hierarchy must be performed while holding it. 71 * hierarchy must be performed while holding it.
@@ -127,6 +130,9 @@ struct cgroupfs_root {
127 /* A list running through the active hierarchies */ 130 /* A list running through the active hierarchies */
128 struct list_head root_list; 131 struct list_head root_list;
129 132
133 /* All cgroups on this root, cgroup_mutex protected */
134 struct list_head allcg_list;
135
130 /* Hierarchy-specific flags */ 136 /* Hierarchy-specific flags */
131 unsigned long flags; 137 unsigned long flags;
132 138
@@ -145,6 +151,15 @@ struct cgroupfs_root {
145static struct cgroupfs_root rootnode; 151static struct cgroupfs_root rootnode;
146 152
147/* 153/*
154 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
155 */
156struct cfent {
157 struct list_head node;
158 struct dentry *dentry;
159 struct cftype *type;
160};
161
162/*
148 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when 163 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
149 * cgroup_subsys->use_id != 0. 164 * cgroup_subsys->use_id != 0.
150 */ 165 */
@@ -239,6 +254,14 @@ int cgroup_lock_is_held(void)
239 254
240EXPORT_SYMBOL_GPL(cgroup_lock_is_held); 255EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
241 256
257/* the current nr of refs, always >= 0 whether @css is deactivated or not */
258static int css_refcnt(struct cgroup_subsys_state *css)
259{
260 int v = atomic_read(&css->refcnt);
261
262 return v >= 0 ? v : v - CSS_DEACT_BIAS;
263}
264
242/* convenient tests for these bits */ 265/* convenient tests for these bits */
243inline int cgroup_is_removed(const struct cgroup *cgrp) 266inline int cgroup_is_removed(const struct cgroup *cgrp)
244{ 267{
@@ -279,6 +302,21 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
279#define for_each_active_root(_root) \ 302#define for_each_active_root(_root) \
280list_for_each_entry(_root, &roots, root_list) 303list_for_each_entry(_root, &roots, root_list)
281 304
305static inline struct cgroup *__d_cgrp(struct dentry *dentry)
306{
307 return dentry->d_fsdata;
308}
309
310static inline struct cfent *__d_cfe(struct dentry *dentry)
311{
312 return dentry->d_fsdata;
313}
314
315static inline struct cftype *__d_cft(struct dentry *dentry)
316{
317 return __d_cfe(dentry)->type;
318}
319
282/* the list of cgroups eligible for automatic release. Protected by 320/* the list of cgroups eligible for automatic release. Protected by
283 * release_list_lock */ 321 * release_list_lock */
284static LIST_HEAD(release_list); 322static LIST_HEAD(release_list);
@@ -816,12 +854,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
816 struct cgroup_subsys *ss; 854 struct cgroup_subsys *ss;
817 int ret = 0; 855 int ret = 0;
818 856
819 for_each_subsys(cgrp->root, ss) 857 for_each_subsys(cgrp->root, ss) {
820 if (ss->pre_destroy) { 858 if (!ss->pre_destroy)
821 ret = ss->pre_destroy(ss, cgrp); 859 continue;
822 if (ret) 860
823 break; 861 ret = ss->pre_destroy(cgrp);
862 if (ret) {
863 /* ->pre_destroy() failure is being deprecated */
864 WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
865 break;
824 } 866 }
867 }
825 868
826 return ret; 869 return ret;
827} 870}
@@ -846,7 +889,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
846 * Release the subsystem state objects. 889 * Release the subsystem state objects.
847 */ 890 */
848 for_each_subsys(cgrp->root, ss) 891 for_each_subsys(cgrp->root, ss)
849 ss->destroy(ss, cgrp); 892 ss->destroy(cgrp);
850 893
851 cgrp->root->number_of_cgroups--; 894 cgrp->root->number_of_cgroups--;
852 mutex_unlock(&cgroup_mutex); 895 mutex_unlock(&cgroup_mutex);
@@ -864,6 +907,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
864 BUG_ON(!list_empty(&cgrp->pidlists)); 907 BUG_ON(!list_empty(&cgrp->pidlists));
865 908
866 kfree_rcu(cgrp, rcu_head); 909 kfree_rcu(cgrp, rcu_head);
910 } else {
911 struct cfent *cfe = __d_cfe(dentry);
912 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
913
914 WARN_ONCE(!list_empty(&cfe->node) &&
915 cgrp != &cgrp->root->top_cgroup,
916 "cfe still linked for %s\n", cfe->type->name);
917 kfree(cfe);
867 } 918 }
868 iput(inode); 919 iput(inode);
869} 920}
@@ -882,34 +933,36 @@ static void remove_dir(struct dentry *d)
882 dput(parent); 933 dput(parent);
883} 934}
884 935
885static void cgroup_clear_directory(struct dentry *dentry) 936static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
886{ 937{
887 struct list_head *node; 938 struct cfent *cfe;
888 939
889 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); 940 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
890 spin_lock(&dentry->d_lock); 941 lockdep_assert_held(&cgroup_mutex);
891 node = dentry->d_subdirs.next; 942
892 while (node != &dentry->d_subdirs) { 943 list_for_each_entry(cfe, &cgrp->files, node) {
893 struct dentry *d = list_entry(node, struct dentry, d_u.d_child); 944 struct dentry *d = cfe->dentry;
894 945
895 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); 946 if (cft && cfe->type != cft)
896 list_del_init(node); 947 continue;
897 if (d->d_inode) { 948
898 /* This should never be called on a cgroup 949 dget(d);
899 * directory with child cgroups */ 950 d_delete(d);
900 BUG_ON(d->d_inode->i_mode & S_IFDIR); 951 simple_unlink(d->d_inode, d);
901 dget_dlock(d); 952 list_del_init(&cfe->node);
902 spin_unlock(&d->d_lock); 953 dput(d);
903 spin_unlock(&dentry->d_lock); 954
904 d_delete(d); 955 return 0;
905 simple_unlink(dentry->d_inode, d);
906 dput(d);
907 spin_lock(&dentry->d_lock);
908 } else
909 spin_unlock(&d->d_lock);
910 node = dentry->d_subdirs.next;
911 } 956 }
912 spin_unlock(&dentry->d_lock); 957 return -ENOENT;
958}
959
960static void cgroup_clear_directory(struct dentry *dir)
961{
962 struct cgroup *cgrp = __d_cgrp(dir);
963
964 while (!list_empty(&cgrp->files))
965 cgroup_rm_file(cgrp, NULL);
913} 966}
914 967
915/* 968/*
@@ -1015,7 +1068,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1015 list_move(&ss->sibling, &root->subsys_list); 1068 list_move(&ss->sibling, &root->subsys_list);
1016 ss->root = root; 1069 ss->root = root;
1017 if (ss->bind) 1070 if (ss->bind)
1018 ss->bind(ss, cgrp); 1071 ss->bind(cgrp);
1019 mutex_unlock(&ss->hierarchy_mutex); 1072 mutex_unlock(&ss->hierarchy_mutex);
1020 /* refcount was already taken, and we're keeping it */ 1073 /* refcount was already taken, and we're keeping it */
1021 } else if (bit & removed_bits) { 1074 } else if (bit & removed_bits) {
@@ -1025,7 +1078,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1025 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1078 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
1026 mutex_lock(&ss->hierarchy_mutex); 1079 mutex_lock(&ss->hierarchy_mutex);
1027 if (ss->bind) 1080 if (ss->bind)
1028 ss->bind(ss, dummytop); 1081 ss->bind(dummytop);
1029 dummytop->subsys[i]->cgroup = dummytop; 1082 dummytop->subsys[i]->cgroup = dummytop;
1030 cgrp->subsys[i] = NULL; 1083 cgrp->subsys[i] = NULL;
1031 subsys[i]->root = &rootnode; 1084 subsys[i]->root = &rootnode;
@@ -1294,6 +1347,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1294 if (ret) 1347 if (ret)
1295 goto out_unlock; 1348 goto out_unlock;
1296 1349
1350 /* See feature-removal-schedule.txt */
1351 if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent)
1352 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1353 task_tgid_nr(current), current->comm);
1354
1297 /* Don't allow flags or name to change at remount */ 1355 /* Don't allow flags or name to change at remount */
1298 if (opts.flags != root->flags || 1356 if (opts.flags != root->flags ||
1299 (opts.name && strcmp(opts.name, root->name))) { 1357 (opts.name && strcmp(opts.name, root->name))) {
@@ -1308,7 +1366,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1308 goto out_unlock; 1366 goto out_unlock;
1309 } 1367 }
1310 1368
1311 /* (re)populate subsystem files */ 1369 /* clear out any existing files and repopulate subsystem files */
1370 cgroup_clear_directory(cgrp->dentry);
1312 cgroup_populate_dir(cgrp); 1371 cgroup_populate_dir(cgrp);
1313 1372
1314 if (opts.release_agent) 1373 if (opts.release_agent)
@@ -1333,6 +1392,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1333{ 1392{
1334 INIT_LIST_HEAD(&cgrp->sibling); 1393 INIT_LIST_HEAD(&cgrp->sibling);
1335 INIT_LIST_HEAD(&cgrp->children); 1394 INIT_LIST_HEAD(&cgrp->children);
1395 INIT_LIST_HEAD(&cgrp->files);
1336 INIT_LIST_HEAD(&cgrp->css_sets); 1396 INIT_LIST_HEAD(&cgrp->css_sets);
1337 INIT_LIST_HEAD(&cgrp->release_list); 1397 INIT_LIST_HEAD(&cgrp->release_list);
1338 INIT_LIST_HEAD(&cgrp->pidlists); 1398 INIT_LIST_HEAD(&cgrp->pidlists);
@@ -1344,11 +1404,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1344static void init_cgroup_root(struct cgroupfs_root *root) 1404static void init_cgroup_root(struct cgroupfs_root *root)
1345{ 1405{
1346 struct cgroup *cgrp = &root->top_cgroup; 1406 struct cgroup *cgrp = &root->top_cgroup;
1407
1347 INIT_LIST_HEAD(&root->subsys_list); 1408 INIT_LIST_HEAD(&root->subsys_list);
1348 INIT_LIST_HEAD(&root->root_list); 1409 INIT_LIST_HEAD(&root->root_list);
1410 INIT_LIST_HEAD(&root->allcg_list);
1349 root->number_of_cgroups = 1; 1411 root->number_of_cgroups = 1;
1350 cgrp->root = root; 1412 cgrp->root = root;
1351 cgrp->top_cgroup = cgrp; 1413 cgrp->top_cgroup = cgrp;
1414 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1352 init_cgroup_housekeeping(cgrp); 1415 init_cgroup_housekeeping(cgrp);
1353} 1416}
1354 1417
@@ -1472,7 +1535,6 @@ static int cgroup_get_rootdir(struct super_block *sb)
1472 1535
1473 struct inode *inode = 1536 struct inode *inode =
1474 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1537 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
1475 struct dentry *dentry;
1476 1538
1477 if (!inode) 1539 if (!inode)
1478 return -ENOMEM; 1540 return -ENOMEM;
@@ -1481,12 +1543,9 @@ static int cgroup_get_rootdir(struct super_block *sb)
1481 inode->i_op = &cgroup_dir_inode_operations; 1543 inode->i_op = &cgroup_dir_inode_operations;
1482 /* directories start off with i_nlink == 2 (for "." entry) */ 1544 /* directories start off with i_nlink == 2 (for "." entry) */
1483 inc_nlink(inode); 1545 inc_nlink(inode);
1484 dentry = d_alloc_root(inode); 1546 sb->s_root = d_make_root(inode);
1485 if (!dentry) { 1547 if (!sb->s_root)
1486 iput(inode);
1487 return -ENOMEM; 1548 return -ENOMEM;
1488 }
1489 sb->s_root = dentry;
1490 /* for everything else we want ->d_op set */ 1549 /* for everything else we want ->d_op set */
1491 sb->s_d_op = &cgroup_dops; 1550 sb->s_d_op = &cgroup_dops;
1492 return 0; 1551 return 0;
@@ -1696,16 +1755,6 @@ static struct file_system_type cgroup_fs_type = {
1696 1755
1697static struct kobject *cgroup_kobj; 1756static struct kobject *cgroup_kobj;
1698 1757
1699static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1700{
1701 return dentry->d_fsdata;
1702}
1703
1704static inline struct cftype *__d_cft(struct dentry *dentry)
1705{
1706 return dentry->d_fsdata;
1707}
1708
1709/** 1758/**
1710 * cgroup_path - generate the path of a cgroup 1759 * cgroup_path - generate the path of a cgroup
1711 * @cgrp: the cgroup in question 1760 * @cgrp: the cgroup in question
@@ -1763,6 +1812,7 @@ EXPORT_SYMBOL_GPL(cgroup_path);
1763struct task_and_cgroup { 1812struct task_and_cgroup {
1764 struct task_struct *task; 1813 struct task_struct *task;
1765 struct cgroup *cgrp; 1814 struct cgroup *cgrp;
1815 struct css_set *cg;
1766}; 1816};
1767 1817
1768struct cgroup_taskset { 1818struct cgroup_taskset {
@@ -1843,11 +1893,10 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1843 * will already exist. If not set, this function might sleep, and can fail with 1893 * will already exist. If not set, this function might sleep, and can fail with
1844 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. 1894 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
1845 */ 1895 */
1846static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1896static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1847 struct task_struct *tsk, bool guarantee) 1897 struct task_struct *tsk, struct css_set *newcg)
1848{ 1898{
1849 struct css_set *oldcg; 1899 struct css_set *oldcg;
1850 struct css_set *newcg;
1851 1900
1852 /* 1901 /*
1853 * We are synchronized through threadgroup_lock() against PF_EXITING 1902 * We are synchronized through threadgroup_lock() against PF_EXITING
@@ -1857,23 +1906,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1857 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1906 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1858 oldcg = tsk->cgroups; 1907 oldcg = tsk->cgroups;
1859 1908
1860 /* locate or allocate a new css_set for this task. */
1861 if (guarantee) {
1862 /* we know the css_set we want already exists. */
1863 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1864 read_lock(&css_set_lock);
1865 newcg = find_existing_css_set(oldcg, cgrp, template);
1866 BUG_ON(!newcg);
1867 get_css_set(newcg);
1868 read_unlock(&css_set_lock);
1869 } else {
1870 might_sleep();
1871 /* find_css_set will give us newcg already referenced. */
1872 newcg = find_css_set(oldcg, cgrp);
1873 if (!newcg)
1874 return -ENOMEM;
1875 }
1876
1877 task_lock(tsk); 1909 task_lock(tsk);
1878 rcu_assign_pointer(tsk->cgroups, newcg); 1910 rcu_assign_pointer(tsk->cgroups, newcg);
1879 task_unlock(tsk); 1911 task_unlock(tsk);
@@ -1892,7 +1924,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1892 put_css_set(oldcg); 1924 put_css_set(oldcg);
1893 1925
1894 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1926 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1895 return 0;
1896} 1927}
1897 1928
1898/** 1929/**
@@ -1905,11 +1936,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1905 */ 1936 */
1906int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1937int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1907{ 1938{
1908 int retval; 1939 int retval = 0;
1909 struct cgroup_subsys *ss, *failed_ss = NULL; 1940 struct cgroup_subsys *ss, *failed_ss = NULL;
1910 struct cgroup *oldcgrp; 1941 struct cgroup *oldcgrp;
1911 struct cgroupfs_root *root = cgrp->root; 1942 struct cgroupfs_root *root = cgrp->root;
1912 struct cgroup_taskset tset = { }; 1943 struct cgroup_taskset tset = { };
1944 struct css_set *newcg;
1913 1945
1914 /* @tsk either already exited or can't exit until the end */ 1946 /* @tsk either already exited or can't exit until the end */
1915 if (tsk->flags & PF_EXITING) 1947 if (tsk->flags & PF_EXITING)
@@ -1925,7 +1957,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1925 1957
1926 for_each_subsys(root, ss) { 1958 for_each_subsys(root, ss) {
1927 if (ss->can_attach) { 1959 if (ss->can_attach) {
1928 retval = ss->can_attach(ss, cgrp, &tset); 1960 retval = ss->can_attach(cgrp, &tset);
1929 if (retval) { 1961 if (retval) {
1930 /* 1962 /*
1931 * Remember on which subsystem the can_attach() 1963 * Remember on which subsystem the can_attach()
@@ -1939,13 +1971,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1939 } 1971 }
1940 } 1972 }
1941 1973
1942 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); 1974 newcg = find_css_set(tsk->cgroups, cgrp);
1943 if (retval) 1975 if (!newcg) {
1976 retval = -ENOMEM;
1944 goto out; 1977 goto out;
1978 }
1979
1980 cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);
1945 1981
1946 for_each_subsys(root, ss) { 1982 for_each_subsys(root, ss) {
1947 if (ss->attach) 1983 if (ss->attach)
1948 ss->attach(ss, cgrp, &tset); 1984 ss->attach(cgrp, &tset);
1949 } 1985 }
1950 1986
1951 synchronize_rcu(); 1987 synchronize_rcu();
@@ -1967,7 +2003,7 @@ out:
1967 */ 2003 */
1968 break; 2004 break;
1969 if (ss->cancel_attach) 2005 if (ss->cancel_attach)
1970 ss->cancel_attach(ss, cgrp, &tset); 2006 ss->cancel_attach(cgrp, &tset);
1971 } 2007 }
1972 } 2008 }
1973 return retval; 2009 return retval;
@@ -1997,66 +2033,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
1997} 2033}
1998EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 2034EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
1999 2035
2000/*
2001 * cgroup_attach_proc works in two stages, the first of which prefetches all
2002 * new css_sets needed (to make sure we have enough memory before committing
2003 * to the move) and stores them in a list of entries of the following type.
2004 * TODO: possible optimization: use css_set->rcu_head for chaining instead
2005 */
2006struct cg_list_entry {
2007 struct css_set *cg;
2008 struct list_head links;
2009};
2010
2011static bool css_set_check_fetched(struct cgroup *cgrp,
2012 struct task_struct *tsk, struct css_set *cg,
2013 struct list_head *newcg_list)
2014{
2015 struct css_set *newcg;
2016 struct cg_list_entry *cg_entry;
2017 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
2018
2019 read_lock(&css_set_lock);
2020 newcg = find_existing_css_set(cg, cgrp, template);
2021 read_unlock(&css_set_lock);
2022
2023 /* doesn't exist at all? */
2024 if (!newcg)
2025 return false;
2026 /* see if it's already in the list */
2027 list_for_each_entry(cg_entry, newcg_list, links)
2028 if (cg_entry->cg == newcg)
2029 return true;
2030
2031 /* not found */
2032 return false;
2033}
2034
2035/*
2036 * Find the new css_set and store it in the list in preparation for moving the
2037 * given task to the given cgroup. Returns 0 or -ENOMEM.
2038 */
2039static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
2040 struct list_head *newcg_list)
2041{
2042 struct css_set *newcg;
2043 struct cg_list_entry *cg_entry;
2044
2045 /* ensure a new css_set will exist for this thread */
2046 newcg = find_css_set(cg, cgrp);
2047 if (!newcg)
2048 return -ENOMEM;
2049 /* add it to the list */
2050 cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
2051 if (!cg_entry) {
2052 put_css_set(newcg);
2053 return -ENOMEM;
2054 }
2055 cg_entry->cg = newcg;
2056 list_add(&cg_entry->links, newcg_list);
2057 return 0;
2058}
2059
2060/** 2036/**
2061 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup 2037 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
2062 * @cgrp: the cgroup to attach to 2038 * @cgrp: the cgroup to attach to
@@ -2070,20 +2046,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2070 int retval, i, group_size; 2046 int retval, i, group_size;
2071 struct cgroup_subsys *ss, *failed_ss = NULL; 2047 struct cgroup_subsys *ss, *failed_ss = NULL;
2072 /* guaranteed to be initialized later, but the compiler needs this */ 2048 /* guaranteed to be initialized later, but the compiler needs this */
2073 struct css_set *oldcg;
2074 struct cgroupfs_root *root = cgrp->root; 2049 struct cgroupfs_root *root = cgrp->root;
2075 /* threadgroup list cursor and array */ 2050 /* threadgroup list cursor and array */
2076 struct task_struct *tsk; 2051 struct task_struct *tsk;
2077 struct task_and_cgroup *tc; 2052 struct task_and_cgroup *tc;
2078 struct flex_array *group; 2053 struct flex_array *group;
2079 struct cgroup_taskset tset = { }; 2054 struct cgroup_taskset tset = { };
2080 /*
2081 * we need to make sure we have css_sets for all the tasks we're
2082 * going to move -before- we actually start moving them, so that in
2083 * case we get an ENOMEM we can bail out before making any changes.
2084 */
2085 struct list_head newcg_list;
2086 struct cg_list_entry *cg_entry, *temp_nobe;
2087 2055
2088 /* 2056 /*
2089 * step 0: in order to do expensive, possibly blocking operations for 2057 * step 0: in order to do expensive, possibly blocking operations for
@@ -2102,23 +2070,14 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2102 if (retval) 2070 if (retval)
2103 goto out_free_group_list; 2071 goto out_free_group_list;
2104 2072
2105 /* prevent changes to the threadgroup list while we take a snapshot. */
2106 read_lock(&tasklist_lock);
2107 if (!thread_group_leader(leader)) {
2108 /*
2109 * a race with de_thread from another thread's exec() may strip
2110 * us of our leadership, making while_each_thread unsafe to use
2111 * on this task. if this happens, there is no choice but to
2112 * throw this task away and try again (from cgroup_procs_write);
2113 * this is "double-double-toil-and-trouble-check locking".
2114 */
2115 read_unlock(&tasklist_lock);
2116 retval = -EAGAIN;
2117 goto out_free_group_list;
2118 }
2119
2120 tsk = leader; 2073 tsk = leader;
2121 i = 0; 2074 i = 0;
2075 /*
2076 * Prevent freeing of tasks while we take a snapshot. Tasks that are
2077 * already PF_EXITING could be freed from underneath us unless we
2078 * take an rcu_read_lock.
2079 */
2080 rcu_read_lock();
2122 do { 2081 do {
2123 struct task_and_cgroup ent; 2082 struct task_and_cgroup ent;
2124 2083
@@ -2128,24 +2087,24 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2128 2087
2129 /* as per above, nr_threads may decrease, but not increase. */ 2088 /* as per above, nr_threads may decrease, but not increase. */
2130 BUG_ON(i >= group_size); 2089 BUG_ON(i >= group_size);
2131 /*
2132 * saying GFP_ATOMIC has no effect here because we did prealloc
2133 * earlier, but it's good form to communicate our expectations.
2134 */
2135 ent.task = tsk; 2090 ent.task = tsk;
2136 ent.cgrp = task_cgroup_from_root(tsk, root); 2091 ent.cgrp = task_cgroup_from_root(tsk, root);
2137 /* nothing to do if this task is already in the cgroup */ 2092 /* nothing to do if this task is already in the cgroup */
2138 if (ent.cgrp == cgrp) 2093 if (ent.cgrp == cgrp)
2139 continue; 2094 continue;
2095 /*
2096 * saying GFP_ATOMIC has no effect here because we did prealloc
2097 * earlier, but it's good form to communicate our expectations.
2098 */
2140 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 2099 retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2141 BUG_ON(retval != 0); 2100 BUG_ON(retval != 0);
2142 i++; 2101 i++;
2143 } while_each_thread(leader, tsk); 2102 } while_each_thread(leader, tsk);
2103 rcu_read_unlock();
2144 /* remember the number of threads in the array for later. */ 2104 /* remember the number of threads in the array for later. */
2145 group_size = i; 2105 group_size = i;
2146 tset.tc_array = group; 2106 tset.tc_array = group;
2147 tset.tc_array_len = group_size; 2107 tset.tc_array_len = group_size;
2148 read_unlock(&tasklist_lock);
2149 2108
2150 /* methods shouldn't be called if no task is actually migrating */ 2109 /* methods shouldn't be called if no task is actually migrating */
2151 retval = 0; 2110 retval = 0;
@@ -2157,7 +2116,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2157 */ 2116 */
2158 for_each_subsys(root, ss) { 2117 for_each_subsys(root, ss) {
2159 if (ss->can_attach) { 2118 if (ss->can_attach) {
2160 retval = ss->can_attach(ss, cgrp, &tset); 2119 retval = ss->can_attach(cgrp, &tset);
2161 if (retval) { 2120 if (retval) {
2162 failed_ss = ss; 2121 failed_ss = ss;
2163 goto out_cancel_attach; 2122 goto out_cancel_attach;
@@ -2169,17 +2128,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2169 * step 2: make sure css_sets exist for all threads to be migrated. 2128 * step 2: make sure css_sets exist for all threads to be migrated.
2170 * we use find_css_set, which allocates a new one if necessary. 2129 * we use find_css_set, which allocates a new one if necessary.
2171 */ 2130 */
2172 INIT_LIST_HEAD(&newcg_list);
2173 for (i = 0; i < group_size; i++) { 2131 for (i = 0; i < group_size; i++) {
2174 tc = flex_array_get(group, i); 2132 tc = flex_array_get(group, i);
2175 oldcg = tc->task->cgroups; 2133 tc->cg = find_css_set(tc->task->cgroups, cgrp);
2176 2134 if (!tc->cg) {
2177 /* if we don't already have it in the list get a new one */ 2135 retval = -ENOMEM;
2178 if (!css_set_check_fetched(cgrp, tc->task, oldcg, 2136 goto out_put_css_set_refs;
2179 &newcg_list)) {
2180 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2181 if (retval)
2182 goto out_list_teardown;
2183 } 2137 }
2184 } 2138 }
2185 2139
@@ -2190,8 +2144,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2190 */ 2144 */
2191 for (i = 0; i < group_size; i++) { 2145 for (i = 0; i < group_size; i++) {
2192 tc = flex_array_get(group, i); 2146 tc = flex_array_get(group, i);
2193 retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); 2147 cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg);
2194 BUG_ON(retval);
2195 } 2148 }
2196 /* nothing is sensitive to fork() after this point. */ 2149 /* nothing is sensitive to fork() after this point. */
2197 2150
@@ -2200,7 +2153,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2200 */ 2153 */
2201 for_each_subsys(root, ss) { 2154 for_each_subsys(root, ss) {
2202 if (ss->attach) 2155 if (ss->attach)
2203 ss->attach(ss, cgrp, &tset); 2156 ss->attach(cgrp, &tset);
2204 } 2157 }
2205 2158
2206 /* 2159 /*
@@ -2209,21 +2162,22 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2209 synchronize_rcu(); 2162 synchronize_rcu();
2210 cgroup_wakeup_rmdir_waiter(cgrp); 2163 cgroup_wakeup_rmdir_waiter(cgrp);
2211 retval = 0; 2164 retval = 0;
2212out_list_teardown: 2165out_put_css_set_refs:
2213 /* clean up the list of prefetched css_sets. */ 2166 if (retval) {
2214 list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { 2167 for (i = 0; i < group_size; i++) {
2215 list_del(&cg_entry->links); 2168 tc = flex_array_get(group, i);
2216 put_css_set(cg_entry->cg); 2169 if (!tc->cg)
2217 kfree(cg_entry); 2170 break;
2171 put_css_set(tc->cg);
2172 }
2218 } 2173 }
2219out_cancel_attach: 2174out_cancel_attach:
2220 /* same deal as in cgroup_attach_task */
2221 if (retval) { 2175 if (retval) {
2222 for_each_subsys(root, ss) { 2176 for_each_subsys(root, ss) {
2223 if (ss == failed_ss) 2177 if (ss == failed_ss)
2224 break; 2178 break;
2225 if (ss->cancel_attach) 2179 if (ss->cancel_attach)
2226 ss->cancel_attach(ss, cgrp, &tset); 2180 ss->cancel_attach(cgrp, &tset);
2227 } 2181 }
2228 } 2182 }
2229out_free_group_list: 2183out_free_group_list:
@@ -2245,22 +2199,14 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2245 if (!cgroup_lock_live_group(cgrp)) 2199 if (!cgroup_lock_live_group(cgrp))
2246 return -ENODEV; 2200 return -ENODEV;
2247 2201
2202retry_find_task:
2203 rcu_read_lock();
2248 if (pid) { 2204 if (pid) {
2249 rcu_read_lock();
2250 tsk = find_task_by_vpid(pid); 2205 tsk = find_task_by_vpid(pid);
2251 if (!tsk) { 2206 if (!tsk) {
2252 rcu_read_unlock(); 2207 rcu_read_unlock();
2253 cgroup_unlock(); 2208 ret= -ESRCH;
2254 return -ESRCH; 2209 goto out_unlock_cgroup;
2255 }
2256 if (threadgroup) {
2257 /*
2258 * RCU protects this access, since tsk was found in the
2259 * tid map. a race with de_thread may cause group_leader
2260 * to stop being the leader, but cgroup_attach_proc will
2261 * detect it later.
2262 */
2263 tsk = tsk->group_leader;
2264 } 2210 }
2265 /* 2211 /*
2266 * even if we're attaching all tasks in the thread group, we 2212 * even if we're attaching all tasks in the thread group, we
@@ -2271,29 +2217,38 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2271 cred->euid != tcred->uid && 2217 cred->euid != tcred->uid &&
2272 cred->euid != tcred->suid) { 2218 cred->euid != tcred->suid) {
2273 rcu_read_unlock(); 2219 rcu_read_unlock();
2274 cgroup_unlock(); 2220 ret = -EACCES;
2275 return -EACCES; 2221 goto out_unlock_cgroup;
2276 } 2222 }
2277 get_task_struct(tsk); 2223 } else
2278 rcu_read_unlock(); 2224 tsk = current;
2279 } else {
2280 if (threadgroup)
2281 tsk = current->group_leader;
2282 else
2283 tsk = current;
2284 get_task_struct(tsk);
2285 }
2286
2287 threadgroup_lock(tsk);
2288 2225
2289 if (threadgroup) 2226 if (threadgroup)
2227 tsk = tsk->group_leader;
2228 get_task_struct(tsk);
2229 rcu_read_unlock();
2230
2231 threadgroup_lock(tsk);
2232 if (threadgroup) {
2233 if (!thread_group_leader(tsk)) {
2234 /*
2235 * a race with de_thread from another thread's exec()
2236 * may strip us of our leadership, if this happens,
2237 * there is no choice but to throw this task away and
2238 * try again; this is
2239 * "double-double-toil-and-trouble-check locking".
2240 */
2241 threadgroup_unlock(tsk);
2242 put_task_struct(tsk);
2243 goto retry_find_task;
2244 }
2290 ret = cgroup_attach_proc(cgrp, tsk); 2245 ret = cgroup_attach_proc(cgrp, tsk);
2291 else 2246 } else
2292 ret = cgroup_attach_task(cgrp, tsk); 2247 ret = cgroup_attach_task(cgrp, tsk);
2293
2294 threadgroup_unlock(tsk); 2248 threadgroup_unlock(tsk);
2295 2249
2296 put_task_struct(tsk); 2250 put_task_struct(tsk);
2251out_unlock_cgroup:
2297 cgroup_unlock(); 2252 cgroup_unlock();
2298 return ret; 2253 return ret;
2299} 2254}
@@ -2305,16 +2260,7 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
2305 2260
2306static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) 2261static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2307{ 2262{
2308 int ret; 2263 return attach_task_by_pid(cgrp, tgid, true);
2309 do {
2310 /*
2311 * attach_proc fails with -EAGAIN if threadgroup leadership
2312 * changes in the middle of the operation, in which case we need
2313 * to find the task_struct for the new leader and start over.
2314 */
2315 ret = attach_task_by_pid(cgrp, tgid, true);
2316 } while (ret == -EAGAIN);
2317 return ret;
2318} 2264}
2319 2265
2320/** 2266/**
@@ -2710,50 +2656,191 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2710 return mode; 2656 return mode;
2711} 2657}
2712 2658
2713int cgroup_add_file(struct cgroup *cgrp, 2659static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2714 struct cgroup_subsys *subsys, 2660 const struct cftype *cft)
2715 const struct cftype *cft)
2716{ 2661{
2717 struct dentry *dir = cgrp->dentry; 2662 struct dentry *dir = cgrp->dentry;
2663 struct cgroup *parent = __d_cgrp(dir);
2718 struct dentry *dentry; 2664 struct dentry *dentry;
2665 struct cfent *cfe;
2719 int error; 2666 int error;
2720 umode_t mode; 2667 umode_t mode;
2721
2722 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2668 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2669
2670 /* does @cft->flags tell us to skip creation on @cgrp? */
2671 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2672 return 0;
2673 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2674 return 0;
2675
2723 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2676 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2724 strcpy(name, subsys->name); 2677 strcpy(name, subsys->name);
2725 strcat(name, "."); 2678 strcat(name, ".");
2726 } 2679 }
2727 strcat(name, cft->name); 2680 strcat(name, cft->name);
2681
2728 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); 2682 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2683
2684 cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
2685 if (!cfe)
2686 return -ENOMEM;
2687
2729 dentry = lookup_one_len(name, dir, strlen(name)); 2688 dentry = lookup_one_len(name, dir, strlen(name));
2730 if (!IS_ERR(dentry)) { 2689 if (IS_ERR(dentry)) {
2731 mode = cgroup_file_mode(cft);
2732 error = cgroup_create_file(dentry, mode | S_IFREG,
2733 cgrp->root->sb);
2734 if (!error)
2735 dentry->d_fsdata = (void *)cft;
2736 dput(dentry);
2737 } else
2738 error = PTR_ERR(dentry); 2690 error = PTR_ERR(dentry);
2691 goto out;
2692 }
2693
2694 mode = cgroup_file_mode(cft);
2695 error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
2696 if (!error) {
2697 cfe->type = (void *)cft;
2698 cfe->dentry = dentry;
2699 dentry->d_fsdata = cfe;
2700 list_add_tail(&cfe->node, &parent->files);
2701 cfe = NULL;
2702 }
2703 dput(dentry);
2704out:
2705 kfree(cfe);
2739 return error; 2706 return error;
2740} 2707}
2741EXPORT_SYMBOL_GPL(cgroup_add_file);
2742 2708
2743int cgroup_add_files(struct cgroup *cgrp, 2709static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2744 struct cgroup_subsys *subsys, 2710 const struct cftype cfts[], bool is_add)
2745 const struct cftype cft[],
2746 int count)
2747{ 2711{
2748 int i, err; 2712 const struct cftype *cft;
2749 for (i = 0; i < count; i++) { 2713 int err, ret = 0;
2750 err = cgroup_add_file(cgrp, subsys, &cft[i]); 2714
2751 if (err) 2715 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2752 return err; 2716 if (is_add)
2717 err = cgroup_add_file(cgrp, subsys, cft);
2718 else
2719 err = cgroup_rm_file(cgrp, cft);
2720 if (err) {
2721 pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
2722 is_add ? "add" : "remove", cft->name, err);
2723 ret = err;
2724 }
2725 }
2726 return ret;
2727}
2728
2729static DEFINE_MUTEX(cgroup_cft_mutex);
2730
2731static void cgroup_cfts_prepare(void)
2732 __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
2733{
2734 /*
2735 * Thanks to the entanglement with vfs inode locking, we can't walk
2736 * the existing cgroups under cgroup_mutex and create files.
2737 * Instead, we increment reference on all cgroups and build list of
2738 * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure
2739 * exclusive access to the field.
2740 */
2741 mutex_lock(&cgroup_cft_mutex);
2742 mutex_lock(&cgroup_mutex);
2743}
2744
2745static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2746 const struct cftype *cfts, bool is_add)
2747 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
2748{
2749 LIST_HEAD(pending);
2750 struct cgroup *cgrp, *n;
2751
2752 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2753 if (cfts && ss->root != &rootnode) {
2754 list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
2755 dget(cgrp->dentry);
2756 list_add_tail(&cgrp->cft_q_node, &pending);
2757 }
2758 }
2759
2760 mutex_unlock(&cgroup_mutex);
2761
2762 /*
2763 * All new cgroups will see @cfts update on @ss->cftsets. Add/rm
2764 * files for all cgroups which were created before.
2765 */
2766 list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
2767 struct inode *inode = cgrp->dentry->d_inode;
2768
2769 mutex_lock(&inode->i_mutex);
2770 mutex_lock(&cgroup_mutex);
2771 if (!cgroup_is_removed(cgrp))
2772 cgroup_addrm_files(cgrp, ss, cfts, is_add);
2773 mutex_unlock(&cgroup_mutex);
2774 mutex_unlock(&inode->i_mutex);
2775
2776 list_del_init(&cgrp->cft_q_node);
2777 dput(cgrp->dentry);
2753 } 2778 }
2779
2780 mutex_unlock(&cgroup_cft_mutex);
2781}
2782
2783/**
2784 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2785 * @ss: target cgroup subsystem
2786 * @cfts: zero-length name terminated array of cftypes
2787 *
2788 * Register @cfts to @ss. Files described by @cfts are created for all
2789 * existing cgroups to which @ss is attached and all future cgroups will
2790 * have them too. This function can be called anytime whether @ss is
2791 * attached or not.
2792 *
2793 * Returns 0 on successful registration, -errno on failure. Note that this
2794 * function currently returns 0 as long as @cfts registration is successful
2795 * even if some file creation attempts on existing cgroups fail.
2796 */
2797int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
2798{
2799 struct cftype_set *set;
2800
2801 set = kzalloc(sizeof(*set), GFP_KERNEL);
2802 if (!set)
2803 return -ENOMEM;
2804
2805 cgroup_cfts_prepare();
2806 set->cfts = cfts;
2807 list_add_tail(&set->node, &ss->cftsets);
2808 cgroup_cfts_commit(ss, cfts, true);
2809
2754 return 0; 2810 return 0;
2755} 2811}
2756EXPORT_SYMBOL_GPL(cgroup_add_files); 2812EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2813
2814/**
2815 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2816 * @ss: target cgroup subsystem
2817 * @cfts: zero-length name terminated array of cftypes
2818 *
2819 * Unregister @cfts from @ss. Files described by @cfts are removed from
2820 * all existing cgroups to which @ss is attached and all future cgroups
2821 * won't have them either. This function can be called anytime whether @ss
2822 * is attached or not.
2823 *
2824 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2825 * registered with @ss.
2826 */
2827int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
2828{
2829 struct cftype_set *set;
2830
2831 cgroup_cfts_prepare();
2832
2833 list_for_each_entry(set, &ss->cftsets, node) {
2834 if (set->cfts == cfts) {
2835 list_del_init(&set->node);
2836 cgroup_cfts_commit(ss, cfts, false);
2837 return 0;
2838 }
2839 }
2840
2841 cgroup_cfts_commit(ss, NULL, false);
2842 return -ENOENT;
2843}
2757 2844
2758/** 2845/**
2759 * cgroup_task_count - count the number of tasks in a cgroup. 2846 * cgroup_task_count - count the number of tasks in a cgroup.
@@ -2804,15 +2891,20 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
2804 * using their cgroups capability, we don't maintain the lists running 2891 * using their cgroups capability, we don't maintain the lists running
2805 * through each css_set to its tasks until we see the list actually 2892 * through each css_set to its tasks until we see the list actually
2806 * used - in other words after the first call to cgroup_iter_start(). 2893 * used - in other words after the first call to cgroup_iter_start().
2807 *
2808 * The tasklist_lock is not held here, as do_each_thread() and
2809 * while_each_thread() are protected by RCU.
2810 */ 2894 */
2811static void cgroup_enable_task_cg_lists(void) 2895static void cgroup_enable_task_cg_lists(void)
2812{ 2896{
2813 struct task_struct *p, *g; 2897 struct task_struct *p, *g;
2814 write_lock(&css_set_lock); 2898 write_lock(&css_set_lock);
2815 use_task_css_set_links = 1; 2899 use_task_css_set_links = 1;
2900 /*
2901 * We need tasklist_lock because RCU is not safe against
2902 * while_each_thread(). Besides, a forking task that has passed
2903 * cgroup_post_fork() without seeing use_task_css_set_links = 1
2904 * is not guaranteed to have its child immediately visible in the
2905 * tasklist if we walk through it with RCU.
2906 */
2907 read_lock(&tasklist_lock);
2816 do_each_thread(g, p) { 2908 do_each_thread(g, p) {
2817 task_lock(p); 2909 task_lock(p);
2818 /* 2910 /*
@@ -2824,6 +2916,7 @@ static void cgroup_enable_task_cg_lists(void)
2824 list_add(&p->cg_list, &p->cgroups->tasks); 2916 list_add(&p->cg_list, &p->cgroups->tasks);
2825 task_unlock(p); 2917 task_unlock(p);
2826 } while_each_thread(g, p); 2918 } while_each_thread(g, p);
2919 read_unlock(&tasklist_lock);
2827 write_unlock(&css_set_lock); 2920 write_unlock(&css_set_lock);
2828} 2921}
2829 2922
@@ -3043,6 +3136,38 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3043 * 3136 *
3044 */ 3137 */
3045 3138
3139/* which pidlist file are we talking about? */
3140enum cgroup_filetype {
3141 CGROUP_FILE_PROCS,
3142 CGROUP_FILE_TASKS,
3143};
3144
3145/*
3146 * A pidlist is a list of pids that virtually represents the contents of one
3147 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
3148 * a pair (one each for procs, tasks) for each pid namespace that's relevant
3149 * to the cgroup.
3150 */
3151struct cgroup_pidlist {
3152 /*
3153 * used to find which pidlist is wanted. doesn't change as long as
3154 * this particular list stays in the list.
3155 */
3156 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
3157 /* array of xids */
3158 pid_t *list;
3159 /* how many elements the above list has */
3160 int length;
3161 /* how many files are using the current array */
3162 int use_count;
3163 /* each of these stored in a list by its cgroup */
3164 struct list_head links;
3165 /* pointer to the cgroup we belong to, for list removal purposes */
3166 struct cgroup *owner;
3167 /* protects the other fields */
3168 struct rw_semaphore mutex;
3169};
3170
3046/* 3171/*
3047 * The following two functions "fix" the issue where there are more pids 3172 * The following two functions "fix" the issue where there are more pids
3048 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. 3173 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
@@ -3694,13 +3819,14 @@ static struct cftype files[] = {
3694 .read_u64 = cgroup_clone_children_read, 3819 .read_u64 = cgroup_clone_children_read,
3695 .write_u64 = cgroup_clone_children_write, 3820 .write_u64 = cgroup_clone_children_write,
3696 }, 3821 },
3697}; 3822 {
3698 3823 .name = "release_agent",
3699static struct cftype cft_release_agent = { 3824 .flags = CFTYPE_ONLY_ON_ROOT,
3700 .name = "release_agent", 3825 .read_seq_string = cgroup_release_agent_show,
3701 .read_seq_string = cgroup_release_agent_show, 3826 .write_string = cgroup_release_agent_write,
3702 .write_string = cgroup_release_agent_write, 3827 .max_write_len = PATH_MAX,
3703 .max_write_len = PATH_MAX, 3828 },
3829 { } /* terminate */
3704}; 3830};
3705 3831
3706static int cgroup_populate_dir(struct cgroup *cgrp) 3832static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -3708,22 +3834,21 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
3708 int err; 3834 int err;
3709 struct cgroup_subsys *ss; 3835 struct cgroup_subsys *ss;
3710 3836
3711 /* First clear out any existing files */ 3837 err = cgroup_addrm_files(cgrp, NULL, files, true);
3712 cgroup_clear_directory(cgrp->dentry);
3713
3714 err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
3715 if (err < 0) 3838 if (err < 0)
3716 return err; 3839 return err;
3717 3840
3718 if (cgrp == cgrp->top_cgroup) { 3841 /* process cftsets of each subsystem */
3719 if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
3720 return err;
3721 }
3722
3723 for_each_subsys(cgrp->root, ss) { 3842 for_each_subsys(cgrp->root, ss) {
3843 struct cftype_set *set;
3844
3724 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) 3845 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
3725 return err; 3846 return err;
3847
3848 list_for_each_entry(set, &ss->cftsets, node)
3849 cgroup_addrm_files(cgrp, ss, set->cfts, true);
3726 } 3850 }
3851
3727 /* This cgroup is ready now */ 3852 /* This cgroup is ready now */
3728 for_each_subsys(cgrp->root, ss) { 3853 for_each_subsys(cgrp->root, ss) {
3729 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 3854 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
@@ -3739,6 +3864,14 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
3739 return 0; 3864 return 0;
3740} 3865}
3741 3866
3867static void css_dput_fn(struct work_struct *work)
3868{
3869 struct cgroup_subsys_state *css =
3870 container_of(work, struct cgroup_subsys_state, dput_work);
3871
3872 dput(css->cgroup->dentry);
3873}
3874
3742static void init_cgroup_css(struct cgroup_subsys_state *css, 3875static void init_cgroup_css(struct cgroup_subsys_state *css,
3743 struct cgroup_subsys *ss, 3876 struct cgroup_subsys *ss,
3744 struct cgroup *cgrp) 3877 struct cgroup *cgrp)
@@ -3751,6 +3884,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
3751 set_bit(CSS_ROOT, &css->flags); 3884 set_bit(CSS_ROOT, &css->flags);
3752 BUG_ON(cgrp->subsys[ss->subsys_id]); 3885 BUG_ON(cgrp->subsys[ss->subsys_id]);
3753 cgrp->subsys[ss->subsys_id] = css; 3886 cgrp->subsys[ss->subsys_id] = css;
3887
3888 /*
3889 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
3890 * which is put on the last css_put(). dput() requires process
3891 * context, which css_put() may be called without. @css->dput_work
3892 * will be used to invoke dput() asynchronously from css_put().
3893 */
3894 INIT_WORK(&css->dput_work, css_dput_fn);
3895 if (ss->__DEPRECATED_clear_css_refs)
3896 set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
3754} 3897}
3755 3898
3756static void cgroup_lock_hierarchy(struct cgroupfs_root *root) 3899static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
@@ -3827,7 +3970,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3827 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3970 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3828 3971
3829 for_each_subsys(root, ss) { 3972 for_each_subsys(root, ss) {
3830 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3973 struct cgroup_subsys_state *css = ss->create(cgrp);
3831 3974
3832 if (IS_ERR(css)) { 3975 if (IS_ERR(css)) {
3833 err = PTR_ERR(css); 3976 err = PTR_ERR(css);
@@ -3841,7 +3984,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3841 } 3984 }
3842 /* At error, ->destroy() callback has to free assigned ID. */ 3985 /* At error, ->destroy() callback has to free assigned ID. */
3843 if (clone_children(parent) && ss->post_clone) 3986 if (clone_children(parent) && ss->post_clone)
3844 ss->post_clone(ss, cgrp); 3987 ss->post_clone(cgrp);
3845 } 3988 }
3846 3989
3847 cgroup_lock_hierarchy(root); 3990 cgroup_lock_hierarchy(root);
@@ -3853,9 +3996,16 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3853 if (err < 0) 3996 if (err < 0)
3854 goto err_remove; 3997 goto err_remove;
3855 3998
3999 /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
4000 for_each_subsys(root, ss)
4001 if (!ss->__DEPRECATED_clear_css_refs)
4002 dget(dentry);
4003
3856 /* The cgroup directory was pre-locked for us */ 4004 /* The cgroup directory was pre-locked for us */
3857 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); 4005 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
3858 4006
4007 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4008
3859 err = cgroup_populate_dir(cgrp); 4009 err = cgroup_populate_dir(cgrp);
3860 /* If err < 0, we have a half-filled directory - oh well ;) */ 4010 /* If err < 0, we have a half-filled directory - oh well ;) */
3861 4011
@@ -3875,7 +4025,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3875 4025
3876 for_each_subsys(root, ss) { 4026 for_each_subsys(root, ss) {
3877 if (cgrp->subsys[ss->subsys_id]) 4027 if (cgrp->subsys[ss->subsys_id])
3878 ss->destroy(ss, cgrp); 4028 ss->destroy(cgrp);
3879 } 4029 }
3880 4030
3881 mutex_unlock(&cgroup_mutex); 4031 mutex_unlock(&cgroup_mutex);
@@ -3895,18 +4045,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
3895 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4045 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
3896} 4046}
3897 4047
4048/*
4049 * Check the reference count on each subsystem. Since we already
4050 * established that there are no tasks in the cgroup, if the css refcount
4051 * is also 1, then there should be no outstanding references, so the
4052 * subsystem is safe to destroy. We scan across all subsystems rather than
4053 * using the per-hierarchy linked list of mounted subsystems since we can
4054 * be called via check_for_release() with no synchronization other than
4055 * RCU, and the subsystem linked list isn't RCU-safe.
4056 */
3898static int cgroup_has_css_refs(struct cgroup *cgrp) 4057static int cgroup_has_css_refs(struct cgroup *cgrp)
3899{ 4058{
3900 /* Check the reference count on each subsystem. Since we
3901 * already established that there are no tasks in the
3902 * cgroup, if the css refcount is also 1, then there should
3903 * be no outstanding references, so the subsystem is safe to
3904 * destroy. We scan across all subsystems rather than using
3905 * the per-hierarchy linked list of mounted subsystems since
3906 * we can be called via check_for_release() with no
3907 * synchronization other than RCU, and the subsystem linked
3908 * list isn't RCU-safe */
3909 int i; 4059 int i;
4060
3910 /* 4061 /*
3911 * We won't need to lock the subsys array, because the subsystems 4062 * We won't need to lock the subsys array, because the subsystems
3912 * we're concerned about aren't going anywhere since our cgroup root 4063 * we're concerned about aren't going anywhere since our cgroup root
@@ -3915,17 +4066,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3915 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4066 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3916 struct cgroup_subsys *ss = subsys[i]; 4067 struct cgroup_subsys *ss = subsys[i];
3917 struct cgroup_subsys_state *css; 4068 struct cgroup_subsys_state *css;
4069
3918 /* Skip subsystems not present or not in this hierarchy */ 4070 /* Skip subsystems not present or not in this hierarchy */
3919 if (ss == NULL || ss->root != cgrp->root) 4071 if (ss == NULL || ss->root != cgrp->root)
3920 continue; 4072 continue;
4073
3921 css = cgrp->subsys[ss->subsys_id]; 4074 css = cgrp->subsys[ss->subsys_id];
3922 /* When called from check_for_release() it's possible 4075 /*
4076 * When called from check_for_release() it's possible
3923 * that by this point the cgroup has been removed 4077 * that by this point the cgroup has been removed
3924 * and the css deleted. But a false-positive doesn't 4078 * and the css deleted. But a false-positive doesn't
3925 * matter, since it can only happen if the cgroup 4079 * matter, since it can only happen if the cgroup
3926 * has been deleted and hence no longer needs the 4080 * has been deleted and hence no longer needs the
3927 * release agent to be called anyway. */ 4081 * release agent to be called anyway.
3928 if (css && (atomic_read(&css->refcnt) > 1)) 4082 */
4083 if (css && css_refcnt(css) > 1)
3929 return 1; 4084 return 1;
3930 } 4085 }
3931 return 0; 4086 return 0;
@@ -3935,51 +4090,63 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3935 * Atomically mark all (or else none) of the cgroup's CSS objects as 4090 * Atomically mark all (or else none) of the cgroup's CSS objects as
3936 * CSS_REMOVED. Return true on success, or false if the cgroup has 4091 * CSS_REMOVED. Return true on success, or false if the cgroup has
3937 * busy subsystems. Call with cgroup_mutex held 4092 * busy subsystems. Call with cgroup_mutex held
4093 *
4094 * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
4095 * not, cgroup removal behaves differently.
4096 *
4097 * If clear is set, css refcnt for the subsystem should be zero before
4098 * cgroup removal can be committed. This is implemented by
4099 * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
4100 * called multiple times until all css refcnts reach zero and is allowed to
4101 * veto removal on any invocation. This behavior is deprecated and will be
4102 * removed as soon as the existing user (memcg) is updated.
4103 *
4104 * If clear is not set, each css holds an extra reference to the cgroup's
4105 * dentry and cgroup removal proceeds regardless of css refs.
4106 * ->pre_destroy() will be called at least once and is not allowed to fail.
4107 * On the last put of each css, whenever that may be, the extra dentry ref
4108 * is put so that dentry destruction happens only after all css's are
4109 * released.
3938 */ 4110 */
3939
3940static int cgroup_clear_css_refs(struct cgroup *cgrp) 4111static int cgroup_clear_css_refs(struct cgroup *cgrp)
3941{ 4112{
3942 struct cgroup_subsys *ss; 4113 struct cgroup_subsys *ss;
3943 unsigned long flags; 4114 unsigned long flags;
3944 bool failed = false; 4115 bool failed = false;
4116
3945 local_irq_save(flags); 4117 local_irq_save(flags);
4118
4119 /*
4120 * Block new css_tryget() by deactivating refcnt. If all refcnts
4121 * for subsystems w/ clear_css_refs set were 1 at the moment of
4122 * deactivation, we succeeded.
4123 */
3946 for_each_subsys(cgrp->root, ss) { 4124 for_each_subsys(cgrp->root, ss) {
3947 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4125 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3948 int refcnt; 4126
3949 while (1) { 4127 WARN_ON(atomic_read(&css->refcnt) < 0);
3950 /* We can only remove a CSS with a refcnt==1 */ 4128 atomic_add(CSS_DEACT_BIAS, &css->refcnt);
3951 refcnt = atomic_read(&css->refcnt); 4129
3952 if (refcnt > 1) { 4130 if (ss->__DEPRECATED_clear_css_refs)
3953 failed = true; 4131 failed |= css_refcnt(css) != 1;
3954 goto done;
3955 }
3956 BUG_ON(!refcnt);
3957 /*
3958 * Drop the refcnt to 0 while we check other
3959 * subsystems. This will cause any racing
3960 * css_tryget() to spin until we set the
3961 * CSS_REMOVED bits or abort
3962 */
3963 if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
3964 break;
3965 cpu_relax();
3966 }
3967 } 4132 }
3968 done: 4133
4134 /*
4135 * If succeeded, set REMOVED and put all the base refs; otherwise,
4136 * restore refcnts to positive values. Either way, all in-progress
4137 * css_tryget() will be released.
4138 */
3969 for_each_subsys(cgrp->root, ss) { 4139 for_each_subsys(cgrp->root, ss) {
3970 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4140 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3971 if (failed) { 4141
3972 /* 4142 if (!failed) {
3973 * Restore old refcnt if we previously managed
3974 * to clear it from 1 to 0
3975 */
3976 if (!atomic_read(&css->refcnt))
3977 atomic_set(&css->refcnt, 1);
3978 } else {
3979 /* Commit the fact that the CSS is removed */
3980 set_bit(CSS_REMOVED, &css->flags); 4143 set_bit(CSS_REMOVED, &css->flags);
4144 css_put(css);
4145 } else {
4146 atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
3981 } 4147 }
3982 } 4148 }
4149
3983 local_irq_restore(flags); 4150 local_irq_restore(flags);
3984 return !failed; 4151 return !failed;
3985} 4152}
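
A rough userspace analogue of the deactivation-bias scheme introduced above (not the kernel code; DEACT_BIAS, obj_refcnt and try_deactivate are invented names): adding a large negative bias atomically blocks further gets while the pre-bias count can still be read back, and rolling back is a plain subtraction instead of the old cmpxchg-to-zero loop.

#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define DEACT_BIAS (INT_MIN / 2)    /* marks the counter as deactivated */

static atomic_int refcnt = 1;       /* base reference, as for a live css */

/* Effective count with any deactivation bias masked off. */
static int obj_refcnt(void)
{
    int v = atomic_load(&refcnt);

    return v >= 0 ? v : v - DEACT_BIAS;
}

/* Commit removal only if the base reference was the sole reference. */
static bool try_deactivate(void)
{
    atomic_fetch_add(&refcnt, DEACT_BIAS);    /* block new trygets */
    if (obj_refcnt() == 1)
        return true;                          /* caller marks REMOVED, drops base ref */
    atomic_fetch_sub(&refcnt, DEACT_BIAS);    /* roll the bias back */
    return false;
}

int main(void)
{
    printf("removable: %d\n", try_deactivate());
    return 0;
}
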
@@ -4064,6 +4231,8 @@ again:
4064 list_del_init(&cgrp->sibling); 4231 list_del_init(&cgrp->sibling);
4065 cgroup_unlock_hierarchy(cgrp->root); 4232 cgroup_unlock_hierarchy(cgrp->root);
4066 4233
4234 list_del_init(&cgrp->allcg_node);
4235
4067 d = dget(cgrp->dentry); 4236 d = dget(cgrp->dentry);
4068 4237
4069 cgroup_d_remove_dir(d); 4238 cgroup_d_remove_dir(d);
@@ -4090,16 +4259,33 @@ again:
4090 return 0; 4259 return 0;
4091} 4260}
4092 4261
4262static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4263{
4264 INIT_LIST_HEAD(&ss->cftsets);
4265
4266 /*
4267 * base_cftset is embedded in subsys itself, no need to worry about
4268 * deregistration.
4269 */
4270 if (ss->base_cftypes) {
4271 ss->base_cftset.cfts = ss->base_cftypes;
4272 list_add_tail(&ss->base_cftset.node, &ss->cftsets);
4273 }
4274}
4275
4093static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4276static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4094{ 4277{
4095 struct cgroup_subsys_state *css; 4278 struct cgroup_subsys_state *css;
4096 4279
4097 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4280 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4098 4281
4282 /* init base cftset */
4283 cgroup_init_cftsets(ss);
4284
4099 /* Create the top cgroup state for this subsystem */ 4285 /* Create the top cgroup state for this subsystem */
4100 list_add(&ss->sibling, &rootnode.subsys_list); 4286 list_add(&ss->sibling, &rootnode.subsys_list);
4101 ss->root = &rootnode; 4287 ss->root = &rootnode;
4102 css = ss->create(ss, dummytop); 4288 css = ss->create(dummytop);
4103 /* We don't handle early failures gracefully */ 4289 /* We don't handle early failures gracefully */
4104 BUG_ON(IS_ERR(css)); 4290 BUG_ON(IS_ERR(css));
4105 init_cgroup_css(css, ss, dummytop); 4291 init_cgroup_css(css, ss, dummytop);
@@ -4165,6 +4351,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4165 return 0; 4351 return 0;
4166 } 4352 }
4167 4353
4354 /* init base cftset */
4355 cgroup_init_cftsets(ss);
4356
4168 /* 4357 /*
4169 * need to register a subsys id before anything else - for example, 4358 * need to register a subsys id before anything else - for example,
4170 * init_cgroup_css needs it. 4359 * init_cgroup_css needs it.
@@ -4188,7 +4377,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4188 * no ss->create seems to need anything important in the ss struct, so 4377 * no ss->create seems to need anything important in the ss struct, so
4189 * this can happen first (i.e. before the rootnode attachment). 4378 * this can happen first (i.e. before the rootnode attachment).
4190 */ 4379 */
4191 css = ss->create(ss, dummytop); 4380 css = ss->create(dummytop);
4192 if (IS_ERR(css)) { 4381 if (IS_ERR(css)) {
4193 /* failure case - need to deassign the subsys[] slot. */ 4382 /* failure case - need to deassign the subsys[] slot. */
4194 subsys[i] = NULL; 4383 subsys[i] = NULL;
@@ -4206,7 +4395,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4206 int ret = cgroup_init_idr(ss, css); 4395 int ret = cgroup_init_idr(ss, css);
4207 if (ret) { 4396 if (ret) {
4208 dummytop->subsys[ss->subsys_id] = NULL; 4397 dummytop->subsys[ss->subsys_id] = NULL;
4209 ss->destroy(ss, dummytop); 4398 ss->destroy(dummytop);
4210 subsys[i] = NULL; 4399 subsys[i] = NULL;
4211 mutex_unlock(&cgroup_mutex); 4400 mutex_unlock(&cgroup_mutex);
4212 return ret; 4401 return ret;
@@ -4304,7 +4493,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4304 * pointer to find their state. note that this also takes care of 4493 * pointer to find their state. note that this also takes care of
4305 * freeing the css_id. 4494 * freeing the css_id.
4306 */ 4495 */
4307 ss->destroy(ss, dummytop); 4496 ss->destroy(dummytop);
4308 dummytop->subsys[ss->subsys_id] = NULL; 4497 dummytop->subsys[ss->subsys_id] = NULL;
4309 4498
4310 mutex_unlock(&cgroup_mutex); 4499 mutex_unlock(&cgroup_mutex);
@@ -4580,7 +4769,7 @@ void cgroup_fork_callbacks(struct task_struct *child)
4580 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 4769 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4581 struct cgroup_subsys *ss = subsys[i]; 4770 struct cgroup_subsys *ss = subsys[i];
4582 if (ss->fork) 4771 if (ss->fork)
4583 ss->fork(ss, child); 4772 ss->fork(child);
4584 } 4773 }
4585 } 4774 }
4586} 4775}
@@ -4596,6 +4785,17 @@ void cgroup_fork_callbacks(struct task_struct *child)
4596 */ 4785 */
4597void cgroup_post_fork(struct task_struct *child) 4786void cgroup_post_fork(struct task_struct *child)
4598{ 4787{
4788 /*
4789 * use_task_css_set_links is set to 1 before we walk the tasklist
4790 * under the tasklist_lock and we read it here after we added the child
4791 * to the tasklist under the tasklist_lock as well. If the child wasn't
4792 * yet in the tasklist when we walked through it from
4793 * cgroup_enable_task_cg_lists(), then use_task_css_set_links value
4794 * should be visible now due to the paired locking and barriers implied
4795 * by LOCK/UNLOCK: it is written before the tasklist_lock unlock
4796 * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock
4797 * lock on fork.
4798 */
4599 if (use_task_css_set_links) { 4799 if (use_task_css_set_links) {
4600 write_lock(&css_set_lock); 4800 write_lock(&css_set_lock);
4601 if (list_empty(&child->cg_list)) { 4801 if (list_empty(&child->cg_list)) {
@@ -4682,7 +4882,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4682 struct cgroup *old_cgrp = 4882 struct cgroup *old_cgrp =
4683 rcu_dereference_raw(cg->subsys[i])->cgroup; 4883 rcu_dereference_raw(cg->subsys[i])->cgroup;
4684 struct cgroup *cgrp = task_cgroup(tsk, i); 4884 struct cgroup *cgrp = task_cgroup(tsk, i);
4685 ss->exit(ss, cgrp, old_cgrp, tsk); 4885 ss->exit(cgrp, old_cgrp, tsk);
4686 } 4886 }
4687 } 4887 }
4688 } 4888 }
@@ -4743,21 +4943,41 @@ static void check_for_release(struct cgroup *cgrp)
4743} 4943}
4744 4944
4745/* Caller must verify that the css is not for root cgroup */ 4945/* Caller must verify that the css is not for root cgroup */
4746void __css_put(struct cgroup_subsys_state *css, int count) 4946bool __css_tryget(struct cgroup_subsys_state *css)
4947{
4948 do {
4949 int v = css_refcnt(css);
4950
4951 if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
4952 return true;
4953 cpu_relax();
4954 } while (!test_bit(CSS_REMOVED, &css->flags));
4955
4956 return false;
4957}
4958EXPORT_SYMBOL_GPL(__css_tryget);
4959
4960/* Caller must verify that the css is not for root cgroup */
4961void __css_put(struct cgroup_subsys_state *css)
4747{ 4962{
4748 struct cgroup *cgrp = css->cgroup; 4963 struct cgroup *cgrp = css->cgroup;
4749 int val; 4964
4750 rcu_read_lock(); 4965 rcu_read_lock();
4751 val = atomic_sub_return(count, &css->refcnt); 4966 atomic_dec(&css->refcnt);
4752 if (val == 1) { 4967 switch (css_refcnt(css)) {
4968 case 1:
4753 if (notify_on_release(cgrp)) { 4969 if (notify_on_release(cgrp)) {
4754 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4970 set_bit(CGRP_RELEASABLE, &cgrp->flags);
4755 check_for_release(cgrp); 4971 check_for_release(cgrp);
4756 } 4972 }
4757 cgroup_wakeup_rmdir_waiter(cgrp); 4973 cgroup_wakeup_rmdir_waiter(cgrp);
4974 break;
4975 case 0:
4976 if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
4977 schedule_work(&css->dput_work);
4978 break;
4758 } 4979 }
4759 rcu_read_unlock(); 4980 rcu_read_unlock();
4760 WARN_ON_ONCE(val < 1);
4761} 4981}
4762EXPORT_SYMBOL_GPL(__css_put); 4982EXPORT_SYMBOL_GPL(__css_put);
4763 4983
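
A small sketch of the tryget pattern that __css_tryget() above implements, again in userspace C with made-up names: the expected value is always the bias-masked count, so while the counter is deactivated the compare-and-swap can never succeed and the loop spins until the removal flag appears.

#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>

#define DEACT_BIAS (INT_MIN / 2)

static atomic_int refcnt = 1;
static atomic_bool removed;

static int effective(int raw)
{
    return raw >= 0 ? raw : raw - DEACT_BIAS;
}

/* Take a reference only while the object is still live. */
static bool obj_tryget(void)
{
    do {
        int v = effective(atomic_load(&refcnt));

        if (atomic_compare_exchange_strong(&refcnt, &v, v + 1))
            return true;
        /* lost a race, or the count is deactivated: retry */
    } while (!atomic_load(&removed));

    return false;
}

int main(void)
{
    return obj_tryget() ? 0 : 1;
}
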
@@ -4876,7 +5096,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
4876 * on this or this is under rcu_read_lock(). Once css->id is allocated, 5096 * on this or this is under rcu_read_lock(). Once css->id is allocated,
4877 * it's unchanged until freed. 5097 * it's unchanged until freed.
4878 */ 5098 */
4879 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); 5099 cssid = rcu_dereference_check(css->id, css_refcnt(css));
4880 5100
4881 if (cssid) 5101 if (cssid)
4882 return cssid->id; 5102 return cssid->id;
@@ -4888,7 +5108,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
4888{ 5108{
4889 struct css_id *cssid; 5109 struct css_id *cssid;
4890 5110
4891 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); 5111 cssid = rcu_dereference_check(css->id, css_refcnt(css));
4892 5112
4893 if (cssid) 5113 if (cssid)
4894 return cssid->depth; 5114 return cssid->depth;
@@ -4939,9 +5159,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4939 5159
4940 rcu_assign_pointer(id->css, NULL); 5160 rcu_assign_pointer(id->css, NULL);
4941 rcu_assign_pointer(css->id, NULL); 5161 rcu_assign_pointer(css->id, NULL);
4942 write_lock(&ss->id_lock); 5162 spin_lock(&ss->id_lock);
4943 idr_remove(&ss->idr, id->id); 5163 idr_remove(&ss->idr, id->id);
4944 write_unlock(&ss->id_lock); 5164 spin_unlock(&ss->id_lock);
4945 kfree_rcu(id, rcu_head); 5165 kfree_rcu(id, rcu_head);
4946} 5166}
4947EXPORT_SYMBOL_GPL(free_css_id); 5167EXPORT_SYMBOL_GPL(free_css_id);
@@ -4967,10 +5187,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
4967 error = -ENOMEM; 5187 error = -ENOMEM;
4968 goto err_out; 5188 goto err_out;
4969 } 5189 }
4970 write_lock(&ss->id_lock); 5190 spin_lock(&ss->id_lock);
4971 /* Don't use 0. allocates an ID of 1-65535 */ 5191 /* Don't use 0. allocates an ID of 1-65535 */
4972 error = idr_get_new_above(&ss->idr, newid, 1, &myid); 5192 error = idr_get_new_above(&ss->idr, newid, 1, &myid);
4973 write_unlock(&ss->id_lock); 5193 spin_unlock(&ss->id_lock);
4974 5194
4975 /* Returns error when there are no free spaces for new ID.*/ 5195 /* Returns error when there are no free spaces for new ID.*/
4976 if (error) { 5196 if (error) {
@@ -4985,9 +5205,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
4985 return newid; 5205 return newid;
4986remove_idr: 5206remove_idr:
4987 error = -ENOSPC; 5207 error = -ENOSPC;
4988 write_lock(&ss->id_lock); 5208 spin_lock(&ss->id_lock);
4989 idr_remove(&ss->idr, myid); 5209 idr_remove(&ss->idr, myid);
4990 write_unlock(&ss->id_lock); 5210 spin_unlock(&ss->id_lock);
4991err_out: 5211err_out:
4992 kfree(newid); 5212 kfree(newid);
4993 return ERR_PTR(error); 5213 return ERR_PTR(error);
@@ -4999,7 +5219,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4999{ 5219{
5000 struct css_id *newid; 5220 struct css_id *newid;
5001 5221
5002 rwlock_init(&ss->id_lock); 5222 spin_lock_init(&ss->id_lock);
5003 idr_init(&ss->idr); 5223 idr_init(&ss->idr);
5004 5224
5005 newid = get_new_cssid(ss, 0); 5225 newid = get_new_cssid(ss, 0);
@@ -5087,6 +5307,8 @@ css_get_next(struct cgroup_subsys *ss, int id,
5087 return NULL; 5307 return NULL;
5088 5308
5089 BUG_ON(!ss->use_id); 5309 BUG_ON(!ss->use_id);
5310 WARN_ON_ONCE(!rcu_read_lock_held());
5311
5090 /* fill start point for scan */ 5312 /* fill start point for scan */
5091 tmpid = id; 5313 tmpid = id;
5092 while (1) { 5314 while (1) {
@@ -5094,10 +5316,7 @@ css_get_next(struct cgroup_subsys *ss, int id,
5094 * scan next entry from bitmap(tree), tmpid is updated after 5316 * scan next entry from bitmap(tree), tmpid is updated after
5095 * idr_get_next(). 5317 * idr_get_next().
5096 */ 5318 */
5097 read_lock(&ss->id_lock);
5098 tmp = idr_get_next(&ss->idr, &tmpid); 5319 tmp = idr_get_next(&ss->idr, &tmpid);
5099 read_unlock(&ss->id_lock);
5100
5101 if (!tmp) 5320 if (!tmp)
5102 break; 5321 break;
5103 if (tmp->depth >= depth && tmp->stack[depth] == rootid) { 5322 if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
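
The id_lock changes above follow a common split: the lock now only serializes updaters, while lookups run under RCU with no lock at all. A loose userspace analogue (a pthread mutex and C11 atomics stand in for the spinlock and RCU; publish, lookup and slots are invented names, and object reclamation, i.e. the grace-period side of RCU, is ignored):

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

#define NSLOTS 64

static pthread_mutex_t update_lock = PTHREAD_MUTEX_INITIALIZER;
static _Atomic(void *) slots[NSLOTS];

/* Updaters serialize against each other and publish with a release store. */
static int publish(int id, void *obj)
{
    if (id < 0 || id >= NSLOTS)
        return -1;
    pthread_mutex_lock(&update_lock);
    atomic_store_explicit(&slots[id], obj, memory_order_release);
    pthread_mutex_unlock(&update_lock);
    return 0;
}

/* Lookups take no lock; the acquire load pairs with the release store. */
static void *lookup(int id)
{
    if (id < 0 || id >= NSLOTS)
        return NULL;
    return atomic_load_explicit(&slots[id], memory_order_acquire);
}

int main(void)
{
    static int answer = 42;

    publish(3, &answer);
    return lookup(3) == &answer ? 0 : 1;
}
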
@@ -5137,8 +5356,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5137} 5356}
5138 5357
5139#ifdef CONFIG_CGROUP_DEBUG 5358#ifdef CONFIG_CGROUP_DEBUG
5140static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, 5359static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
5141 struct cgroup *cont)
5142{ 5360{
5143 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5361 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5144 5362
@@ -5148,7 +5366,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
5148 return css; 5366 return css;
5149} 5367}
5150 5368
5151static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 5369static void debug_destroy(struct cgroup *cont)
5152{ 5370{
5153 kfree(cont->subsys[debug_subsys_id]); 5371 kfree(cont->subsys[debug_subsys_id]);
5154} 5372}
@@ -5271,19 +5489,15 @@ static struct cftype debug_files[] = {
5271 .name = "releasable", 5489 .name = "releasable",
5272 .read_u64 = releasable_read, 5490 .read_u64 = releasable_read,
5273 }, 5491 },
5274};
5275 5492
5276static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) 5493 { } /* terminate */
5277{ 5494};
5278 return cgroup_add_files(cont, ss, debug_files,
5279 ARRAY_SIZE(debug_files));
5280}
5281 5495
5282struct cgroup_subsys debug_subsys = { 5496struct cgroup_subsys debug_subsys = {
5283 .name = "debug", 5497 .name = "debug",
5284 .create = debug_create, 5498 .create = debug_create,
5285 .destroy = debug_destroy, 5499 .destroy = debug_destroy,
5286 .populate = debug_populate,
5287 .subsys_id = debug_subsys_id, 5500 .subsys_id = debug_subsys_id,
5501 .base_cftypes = debug_files,
5288}; 5502};
5289#endif /* CONFIG_CGROUP_DEBUG */ 5503#endif /* CONFIG_CGROUP_DEBUG */
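
The conversion above drops the ->populate() callback and the explicit ARRAY_SIZE() in favour of a sentinel-terminated cftype array hung off ->base_cftypes. A minimal userspace sketch of that registration idiom (struct ftype, register_files and the entries are illustrative only; the kernel detects the terminator by an empty name):

#include <stdio.h>

struct ftype {
    const char *name;    /* NULL name terminates the array */
    int flags;
};

static const struct ftype debug_like_files[] = {
    { .name = "taskcount" },
    { .name = "releasable" },
    { NULL }             /* terminator, like the { } cftype entry */
};

/* Walk until the sentinel instead of passing an element count around. */
static void register_files(const struct ftype *cft)
{
    for (; cft->name; cft++)
        printf("registering %s\n", cft->name);
}

int main(void)
{
    register_files(debug_like_files);
    return 0;
}
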
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fc0646b78a64..3649fc6b3eaa 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -128,8 +128,7 @@ struct cgroup_subsys freezer_subsys;
128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) 128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
129 * sighand->siglock 129 * sighand->siglock
130 */ 130 */
131static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, 131static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
132 struct cgroup *cgroup)
133{ 132{
134 struct freezer *freezer; 133 struct freezer *freezer;
135 134
@@ -142,8 +141,7 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
142 return &freezer->css; 141 return &freezer->css;
143} 142}
144 143
145static void freezer_destroy(struct cgroup_subsys *ss, 144static void freezer_destroy(struct cgroup *cgroup)
146 struct cgroup *cgroup)
147{ 145{
148 struct freezer *freezer = cgroup_freezer(cgroup); 146 struct freezer *freezer = cgroup_freezer(cgroup);
149 147
@@ -164,8 +162,7 @@ static bool is_task_frozen_enough(struct task_struct *task)
164 * a write to that file racing against an attach, and hence the 162 * a write to that file racing against an attach, and hence the
165 * can_attach() result will remain valid until the attach completes. 163 * can_attach() result will remain valid until the attach completes.
166 */ 164 */
167static int freezer_can_attach(struct cgroup_subsys *ss, 165static int freezer_can_attach(struct cgroup *new_cgroup,
168 struct cgroup *new_cgroup,
169 struct cgroup_taskset *tset) 166 struct cgroup_taskset *tset)
170{ 167{
171 struct freezer *freezer; 168 struct freezer *freezer;
@@ -185,7 +182,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
185 return 0; 182 return 0;
186} 183}
187 184
188static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) 185static void freezer_fork(struct task_struct *task)
189{ 186{
190 struct freezer *freezer; 187 struct freezer *freezer;
191 188
@@ -361,24 +358,19 @@ static int freezer_write(struct cgroup *cgroup,
361static struct cftype files[] = { 358static struct cftype files[] = {
362 { 359 {
363 .name = "state", 360 .name = "state",
361 .flags = CFTYPE_NOT_ON_ROOT,
364 .read_seq_string = freezer_read, 362 .read_seq_string = freezer_read,
365 .write_string = freezer_write, 363 .write_string = freezer_write,
366 }, 364 },
365 { } /* terminate */
367}; 366};
368 367
369static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
370{
371 if (!cgroup->parent)
372 return 0;
373 return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
374}
375
376struct cgroup_subsys freezer_subsys = { 368struct cgroup_subsys freezer_subsys = {
377 .name = "freezer", 369 .name = "freezer",
378 .create = freezer_create, 370 .create = freezer_create,
379 .destroy = freezer_destroy, 371 .destroy = freezer_destroy,
380 .populate = freezer_populate,
381 .subsys_id = freezer_subsys_id, 372 .subsys_id = freezer_subsys_id,
382 .can_attach = freezer_can_attach, 373 .can_attach = freezer_can_attach,
383 .fork = freezer_fork, 374 .fork = freezer_fork,
375 .base_cftypes = files,
384}; 376};
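
Across this merge every subsystem callback loses its struct cgroup_subsys * argument, since each callee can recover its own state from the cgroup it is given. A small sketch of that ops-table shape (struct box, box_ops and the callback names are invented for illustration):

#include <stdio.h>

struct box {                 /* stands in for struct cgroup */
    int id;
};

struct box_ops {             /* callbacks take only the object they act on */
    int  (*create)(struct box *b);
    void (*destroy)(struct box *b);
};

static int noisy_create(struct box *b)
{
    printf("create box %d\n", b->id);
    return 0;
}

static void noisy_destroy(struct box *b)
{
    printf("destroy box %d\n", b->id);
}

static const struct box_ops freezer_like_ops = {
    .create  = noisy_create,
    .destroy = noisy_destroy,
};

int main(void)
{
    struct box b = { .id = 1 };

    freezer_like_ops.create(&b);
    freezer_like_ops.destroy(&b);
    return 0;
}
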
diff --git a/kernel/compat.c b/kernel/compat.c
index f346cedfe24d..74ff8498809a 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -31,11 +31,10 @@
31#include <asm/uaccess.h> 31#include <asm/uaccess.h>
32 32
33/* 33/*
34 * Note that the native side is already converted to a timespec, because 34 * Get/set struct timeval with struct timespec on the native side
35 * that's what we want anyway.
36 */ 35 */
37static int compat_get_timeval(struct timespec *o, 36static int compat_get_timeval_convert(struct timespec *o,
38 struct compat_timeval __user *i) 37 struct compat_timeval __user *i)
39{ 38{
40 long usec; 39 long usec;
41 40
@@ -46,8 +45,8 @@ static int compat_get_timeval(struct timespec *o,
46 return 0; 45 return 0;
47} 46}
48 47
49static int compat_put_timeval(struct compat_timeval __user *o, 48static int compat_put_timeval_convert(struct compat_timeval __user *o,
50 struct timeval *i) 49 struct timeval *i)
51{ 50{
52 return (put_user(i->tv_sec, &o->tv_sec) || 51 return (put_user(i->tv_sec, &o->tv_sec) ||
53 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; 52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
@@ -117,7 +116,7 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
117 if (tv) { 116 if (tv) {
118 struct timeval ktv; 117 struct timeval ktv;
119 do_gettimeofday(&ktv); 118 do_gettimeofday(&ktv);
120 if (compat_put_timeval(tv, &ktv)) 119 if (compat_put_timeval_convert(tv, &ktv))
121 return -EFAULT; 120 return -EFAULT;
122 } 121 }
123 if (tz) { 122 if (tz) {
@@ -135,7 +134,7 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
135 struct timezone ktz; 134 struct timezone ktz;
136 135
137 if (tv) { 136 if (tv) {
138 if (compat_get_timeval(&kts, tv)) 137 if (compat_get_timeval_convert(&kts, tv))
139 return -EFAULT; 138 return -EFAULT;
140 } 139 }
141 if (tz) { 140 if (tz) {
@@ -146,12 +145,29 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
146 return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); 145 return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
147} 146}
148 147
148int get_compat_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
149{
150 return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) ||
151 __get_user(tv->tv_sec, &ctv->tv_sec) ||
152 __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
153}
154EXPORT_SYMBOL_GPL(get_compat_timeval);
155
156int put_compat_timeval(const struct timeval *tv, struct compat_timeval __user *ctv)
157{
158 return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) ||
159 __put_user(tv->tv_sec, &ctv->tv_sec) ||
160 __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
161}
162EXPORT_SYMBOL_GPL(put_compat_timeval);
163
149int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) 164int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
150{ 165{
151 return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || 166 return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
152 __get_user(ts->tv_sec, &cts->tv_sec) || 167 __get_user(ts->tv_sec, &cts->tv_sec) ||
153 __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; 168 __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
154} 169}
170EXPORT_SYMBOL_GPL(get_compat_timespec);
155 171
156int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) 172int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
157{ 173{
@@ -161,6 +177,42 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user
161} 177}
162EXPORT_SYMBOL_GPL(put_compat_timespec); 178EXPORT_SYMBOL_GPL(put_compat_timespec);
163 179
180int compat_get_timeval(struct timeval *tv, const void __user *utv)
181{
182 if (COMPAT_USE_64BIT_TIME)
183 return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0;
184 else
185 return get_compat_timeval(tv, utv);
186}
187EXPORT_SYMBOL_GPL(compat_get_timeval);
188
189int compat_put_timeval(const struct timeval *tv, void __user *utv)
190{
191 if (COMPAT_USE_64BIT_TIME)
192 return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0;
193 else
194 return put_compat_timeval(tv, utv);
195}
196EXPORT_SYMBOL_GPL(compat_put_timeval);
197
198int compat_get_timespec(struct timespec *ts, const void __user *uts)
199{
200 if (COMPAT_USE_64BIT_TIME)
201 return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0;
202 else
203 return get_compat_timespec(ts, uts);
204}
205EXPORT_SYMBOL_GPL(compat_get_timespec);
206
207int compat_put_timespec(const struct timespec *ts, void __user *uts)
208{
209 if (COMPAT_USE_64BIT_TIME)
210 return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0;
211 else
212 return put_compat_timespec(ts, uts);
213}
214EXPORT_SYMBOL_GPL(compat_put_timespec);
215
164static long compat_nanosleep_restart(struct restart_block *restart) 216static long compat_nanosleep_restart(struct restart_block *restart)
165{ 217{
166 struct compat_timespec __user *rmtp; 218 struct compat_timespec __user *rmtp;
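
The new compat_get/put_timeval() helpers above either copy the structure verbatim (when compat tasks already use 64-bit time) or convert field by field. A hedged userspace sketch of the same split, with struct timeval32, get_timeval_compat and use_64bit_time as stand-ins:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <sys/time.h>

struct timeval32 {               /* 32-bit layout used by compat callers */
    int32_t tv_sec;
    int32_t tv_usec;
};

/* Either the layouts match and we copy verbatim, or we widen each field. */
static int get_timeval_compat(struct timeval *tv, const void *user,
                              bool use_64bit_time)
{
    if (use_64bit_time) {
        memcpy(tv, user, sizeof(*tv));
    } else {
        const struct timeval32 *tv32 = user;

        tv->tv_sec  = tv32->tv_sec;
        tv->tv_usec = tv32->tv_usec;
    }
    return 0;
}

int main(void)
{
    struct timeval32 in = { .tv_sec = 1, .tv_usec = 500000 };
    struct timeval out;

    return get_timeval_compat(&out, &in, false) == 0 && out.tv_sec == 1 ? 0 : 1;
}
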
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a09ac2b9a661..2382683617a3 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
964{ 964{
965 bool need_loop; 965 bool need_loop;
966 966
967repeat:
968 /* 967 /*
969 * Allow tasks that have access to memory reserves because they have 968 * Allow tasks that have access to memory reserves because they have
970 * been OOM killed to get memory anywhere. 969 * been OOM killed to get memory anywhere.
@@ -983,45 +982,19 @@ repeat:
983 */ 982 */
984 need_loop = task_has_mempolicy(tsk) || 983 need_loop = task_has_mempolicy(tsk) ||
985 !nodes_intersects(*newmems, tsk->mems_allowed); 984 !nodes_intersects(*newmems, tsk->mems_allowed);
986 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
987 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
988 985
989 /* 986 if (need_loop)
990 * ensure checking ->mems_allowed_change_disable after setting all new 987 write_seqcount_begin(&tsk->mems_allowed_seq);
991 * allowed nodes.
992 *
993 * the read-side task can see an nodemask with new allowed nodes and
994 * old allowed nodes. and if it allocates page when cpuset clears newly
995 * disallowed ones continuous, it can see the new allowed bits.
996 *
997 * And if setting all new allowed nodes is after the checking, setting
998 * all new allowed nodes and clearing newly disallowed ones will be done
999 * continuous, and the read-side task may find no node to alloc page.
1000 */
1001 smp_mb();
1002
1003 /*
1004 * Allocation of memory is very fast, we needn't sleep when waiting
1005 * for the read-side.
1006 */
1007 while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
1008 task_unlock(tsk);
1009 if (!task_curr(tsk))
1010 yield();
1011 goto repeat;
1012 }
1013 988
1014 /* 989 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
1015 * ensure checking ->mems_allowed_change_disable before clearing all new 990 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
1016 * disallowed nodes.
1017 *
1018 * if clearing newly disallowed bits before the checking, the read-side
1019 * task may find no node to alloc page.
1020 */
1021 smp_mb();
1022 991
1023 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); 992 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
1024 tsk->mems_allowed = *newmems; 993 tsk->mems_allowed = *newmems;
994
995 if (need_loop)
996 write_seqcount_end(&tsk->mems_allowed_seq);
997
1025 task_unlock(tsk); 998 task_unlock(tsk);
1026} 999}
1027 1000
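
The cpuset hunk above replaces the old yield-and-retry handshake with a seqcount: the writer brackets the nodemask update with write_seqcount_begin/end, and readers retry whenever they observe an odd or changed sequence. A simplified userspace sketch of both sides (seq, mems, write_mems and read_mems are illustrative; a production seqlock also needs the memory barriers this sketch glosses over):

#include <stdatomic.h>

static atomic_uint seq;         /* even = stable, odd = writer active */
static unsigned long mems;      /* the data being published */

static void write_mems(unsigned long newmems)
{
    atomic_fetch_add(&seq, 1);  /* begin: sequence becomes odd */
    mems = newmems;
    atomic_fetch_add(&seq, 1);  /* end: sequence becomes even again */
}

/* Readers never block the writer; they simply retry on a torn read. */
static unsigned long read_mems(void)
{
    unsigned int s;
    unsigned long val;

    do {
        while ((s = atomic_load(&seq)) & 1)
            ;                   /* writer in progress, spin */
        val = mems;
    } while (atomic_load(&seq) != s);   /* raced with a writer: retry */

    return val;
}

int main(void)
{
    write_mems(0x3);
    return read_mems() == 0x3 ? 0 : 1;
}
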
@@ -1399,8 +1372,7 @@ static nodemask_t cpuset_attach_nodemask_from;
1399static nodemask_t cpuset_attach_nodemask_to; 1372static nodemask_t cpuset_attach_nodemask_to;
1400 1373
1401/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1374/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1402static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 1375static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1403 struct cgroup_taskset *tset)
1404{ 1376{
1405 struct cpuset *cs = cgroup_cs(cgrp); 1377 struct cpuset *cs = cgroup_cs(cgrp);
1406 struct task_struct *task; 1378 struct task_struct *task;
@@ -1436,8 +1408,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1436 return 0; 1408 return 0;
1437} 1409}
1438 1410
1439static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 1411static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1440 struct cgroup_taskset *tset)
1441{ 1412{
1442 struct mm_struct *mm; 1413 struct mm_struct *mm;
1443 struct task_struct *task; 1414 struct task_struct *task;
@@ -1794,28 +1765,17 @@ static struct cftype files[] = {
1794 .write_u64 = cpuset_write_u64, 1765 .write_u64 = cpuset_write_u64,
1795 .private = FILE_SPREAD_SLAB, 1766 .private = FILE_SPREAD_SLAB,
1796 }, 1767 },
1797};
1798 1768
1799static struct cftype cft_memory_pressure_enabled = { 1769 {
1800 .name = "memory_pressure_enabled", 1770 .name = "memory_pressure_enabled",
1801 .read_u64 = cpuset_read_u64, 1771 .flags = CFTYPE_ONLY_ON_ROOT,
1802 .write_u64 = cpuset_write_u64, 1772 .read_u64 = cpuset_read_u64,
1803 .private = FILE_MEMORY_PRESSURE_ENABLED, 1773 .write_u64 = cpuset_write_u64,
1804}; 1774 .private = FILE_MEMORY_PRESSURE_ENABLED,
1805 1775 },
1806static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1807{
1808 int err;
1809 1776
1810 err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); 1777 { } /* terminate */
1811 if (err) 1778};
1812 return err;
1813 /* memory_pressure_enabled is in root cpuset only */
1814 if (!cont->parent)
1815 err = cgroup_add_file(cont, ss,
1816 &cft_memory_pressure_enabled);
1817 return err;
1818}
1819 1779
1820/* 1780/*
1821 * post_clone() is called during cgroup_create() when the 1781 * post_clone() is called during cgroup_create() when the
@@ -1833,8 +1793,7 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1833 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex 1793 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
1834 * held. 1794 * held.
1835 */ 1795 */
1836static void cpuset_post_clone(struct cgroup_subsys *ss, 1796static void cpuset_post_clone(struct cgroup *cgroup)
1837 struct cgroup *cgroup)
1838{ 1797{
1839 struct cgroup *parent, *child; 1798 struct cgroup *parent, *child;
1840 struct cpuset *cs, *parent_cs; 1799 struct cpuset *cs, *parent_cs;
@@ -1857,13 +1816,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
1857 1816
1858/* 1817/*
1859 * cpuset_create - create a cpuset 1818 * cpuset_create - create a cpuset
1860 * ss: cpuset cgroup subsystem
1861 * cont: control group that the new cpuset will be part of 1819 * cont: control group that the new cpuset will be part of
1862 */ 1820 */
1863 1821
1864static struct cgroup_subsys_state *cpuset_create( 1822static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
1865 struct cgroup_subsys *ss,
1866 struct cgroup *cont)
1867{ 1823{
1868 struct cpuset *cs; 1824 struct cpuset *cs;
1869 struct cpuset *parent; 1825 struct cpuset *parent;
@@ -1902,7 +1858,7 @@ static struct cgroup_subsys_state *cpuset_create(
1902 * will call async_rebuild_sched_domains(). 1858 * will call async_rebuild_sched_domains().
1903 */ 1859 */
1904 1860
1905static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 1861static void cpuset_destroy(struct cgroup *cont)
1906{ 1862{
1907 struct cpuset *cs = cgroup_cs(cont); 1863 struct cpuset *cs = cgroup_cs(cont);
1908 1864
@@ -1920,9 +1876,9 @@ struct cgroup_subsys cpuset_subsys = {
1920 .destroy = cpuset_destroy, 1876 .destroy = cpuset_destroy,
1921 .can_attach = cpuset_can_attach, 1877 .can_attach = cpuset_can_attach,
1922 .attach = cpuset_attach, 1878 .attach = cpuset_attach,
1923 .populate = cpuset_populate,
1924 .post_clone = cpuset_post_clone, 1879 .post_clone = cpuset_post_clone,
1925 .subsys_id = cpuset_subsys_id, 1880 .subsys_id = cpuset_subsys_id,
1881 .base_cftypes = files,
1926 .early_init = 1, 1882 .early_init = 1,
1927}; 1883};
1928 1884
@@ -2195,10 +2151,9 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2195 mutex_unlock(&callback_mutex); 2151 mutex_unlock(&callback_mutex);
2196} 2152}
2197 2153
2198int cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2154void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2199{ 2155{
2200 const struct cpuset *cs; 2156 const struct cpuset *cs;
2201 int cpu;
2202 2157
2203 rcu_read_lock(); 2158 rcu_read_lock();
2204 cs = task_cs(tsk); 2159 cs = task_cs(tsk);
@@ -2219,22 +2174,10 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2219 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary 2174 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
2220 * set any mask even if it is not right from task_cs() pov, 2175 * set any mask even if it is not right from task_cs() pov,
2221 * the pending set_cpus_allowed_ptr() will fix things. 2176 * the pending set_cpus_allowed_ptr() will fix things.
2177 *
2178 * select_fallback_rq() will fix things up and set cpu_possible_mask
2179 * if required.
2222 */ 2180 */
2223
2224 cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
2225 if (cpu >= nr_cpu_ids) {
2226 /*
2227 * Either tsk->cpus_allowed is wrong (see above) or it
2228 * is actually empty. The latter case is only possible
2229 * if we are racing with remove_tasks_in_empty_cpuset().
2230 * Like above we can temporary set any mask and rely on
2231 * set_cpus_allowed_ptr() as synchronization point.
2232 */
2233 do_set_cpus_allowed(tsk, cpu_possible_mask);
2234 cpu = cpumask_any(cpu_active_mask);
2235 }
2236
2237 return cpu;
2238} 2181}
2239 2182
2240void cpuset_init_current_mems_allowed(void) 2183void cpuset_init_current_mems_allowed(void)
diff --git a/kernel/cred.c b/kernel/cred.c
index 5791612a4045..97b36eeca4c9 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -16,6 +16,7 @@
16#include <linux/keyctl.h> 16#include <linux/keyctl.h>
17#include <linux/init_task.h> 17#include <linux/init_task.h>
18#include <linux/security.h> 18#include <linux/security.h>
19#include <linux/binfmts.h>
19#include <linux/cn_proc.h> 20#include <linux/cn_proc.h>
20 21
21#if 0 22#if 0
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0d7c08784efb..1dc53bae56e1 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -41,6 +41,7 @@
41#include <linux/delay.h> 41#include <linux/delay.h>
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/sysrq.h> 43#include <linux/sysrq.h>
44#include <linux/reboot.h>
44#include <linux/init.h> 45#include <linux/init.h>
45#include <linux/kgdb.h> 46#include <linux/kgdb.h>
46#include <linux/kdb.h> 47#include <linux/kdb.h>
@@ -52,7 +53,6 @@
52#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
53#include <asm/byteorder.h> 54#include <asm/byteorder.h>
54#include <linux/atomic.h> 55#include <linux/atomic.h>
55#include <asm/system.h>
56 56
57#include "debug_core.h" 57#include "debug_core.h"
58 58
@@ -75,6 +75,8 @@ static int exception_level;
75struct kgdb_io *dbg_io_ops; 75struct kgdb_io *dbg_io_ops;
76static DEFINE_SPINLOCK(kgdb_registration_lock); 76static DEFINE_SPINLOCK(kgdb_registration_lock);
77 77
78/* Action for the reboot notifier, a global so kdb can change it */
79static int kgdbreboot;
78/* kgdb console driver is loaded */ 80/* kgdb console driver is loaded */
79static int kgdb_con_registered; 81static int kgdb_con_registered;
80/* determine if kgdb console output should be used */ 82/* determine if kgdb console output should be used */
@@ -96,6 +98,7 @@ static int __init opt_kgdb_con(char *str)
96early_param("kgdbcon", opt_kgdb_con); 98early_param("kgdbcon", opt_kgdb_con);
97 99
98module_param(kgdb_use_con, int, 0644); 100module_param(kgdb_use_con, int, 0644);
101module_param(kgdbreboot, int, 0644);
99 102
100/* 103/*
101 * Holds information about breakpoints in a kernel. These breakpoints are 104 * Holds information about breakpoints in a kernel. These breakpoints are
@@ -784,6 +787,33 @@ void __init dbg_late_init(void)
784 kdb_init(KDB_INIT_FULL); 787 kdb_init(KDB_INIT_FULL);
785} 788}
786 789
790static int
791dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
792{
793 /*
794 * Take the following action on reboot notify depending on value:
795 * 1 == Enter debugger
796 * 0 == [the default] detach the debug client
797 * -1 == Do nothing... and use this until the board resets
798 */
799 switch (kgdbreboot) {
800 case 1:
801 kgdb_breakpoint();
802 case -1:
803 goto done;
804 }
805 if (!dbg_kdb_mode)
806 gdbstub_exit(code);
807done:
808 return NOTIFY_DONE;
809}
810
811static struct notifier_block dbg_reboot_notifier = {
812 .notifier_call = dbg_notify_reboot,
813 .next = NULL,
814 .priority = INT_MAX,
815};
816
787static void kgdb_register_callbacks(void) 817static void kgdb_register_callbacks(void)
788{ 818{
789 if (!kgdb_io_module_registered) { 819 if (!kgdb_io_module_registered) {
@@ -791,6 +821,7 @@ static void kgdb_register_callbacks(void)
791 kgdb_arch_init(); 821 kgdb_arch_init();
792 if (!dbg_is_early) 822 if (!dbg_is_early)
793 kgdb_arch_late(); 823 kgdb_arch_late();
824 register_reboot_notifier(&dbg_reboot_notifier);
794 atomic_notifier_chain_register(&panic_notifier_list, 825 atomic_notifier_chain_register(&panic_notifier_list,
795 &kgdb_panic_event_nb); 826 &kgdb_panic_event_nb);
796#ifdef CONFIG_MAGIC_SYSRQ 827#ifdef CONFIG_MAGIC_SYSRQ
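
register_reboot_notifier() above just adds dbg_reboot_notifier to a priority-ordered callback chain that the reboot path walks. A stripped-down userspace model of that mechanism (struct notifier, chain_register and chain_call are invented names; the real chain also honours NOTIFY_* return codes and locking):

#include <stdio.h>

struct notifier {
    int (*call)(struct notifier *nb, unsigned long code);
    struct notifier *next;
    int priority;               /* higher priority runs first */
};

static struct notifier *chain;

static void chain_register(struct notifier *nb)
{
    struct notifier **pos = &chain;

    while (*pos && (*pos)->priority >= nb->priority)
        pos = &(*pos)->next;
    nb->next = *pos;
    *pos = nb;
}

static void chain_call(unsigned long code)
{
    struct notifier *nb;

    for (nb = chain; nb; nb = nb->next)
        nb->call(nb, code);
}

static int dbg_like_notify(struct notifier *nb, unsigned long code)
{
    printf("reboot notify, code %lu\n", code);
    return 0;
}

static struct notifier dbg_like_nb = {
    .call = dbg_like_notify,
    .priority = 100,            /* analogous to the INT_MAX priority above */
};

int main(void)
{
    chain_register(&dbg_like_nb);
    chain_call(1);
    return 0;
}
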
@@ -812,6 +843,7 @@ static void kgdb_unregister_callbacks(void)
812 */ 843 */
813 if (kgdb_io_module_registered) { 844 if (kgdb_io_module_registered) {
814 kgdb_io_module_registered = 0; 845 kgdb_io_module_registered = 0;
846 unregister_reboot_notifier(&dbg_reboot_notifier);
815 atomic_notifier_chain_unregister(&panic_notifier_list, 847 atomic_notifier_chain_unregister(&panic_notifier_list,
816 &kgdb_panic_event_nb); 848 &kgdb_panic_event_nb);
817 kgdb_arch_exit(); 849 kgdb_arch_exit();
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index c22d8c28ad84..ce615e064482 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1111,6 +1111,13 @@ void gdbstub_exit(int status)
1111 unsigned char checksum, ch, buffer[3]; 1111 unsigned char checksum, ch, buffer[3];
1112 int loop; 1112 int loop;
1113 1113
1114 if (!kgdb_connected)
1115 return;
1116 kgdb_connected = 0;
1117
1118 if (!dbg_io_ops || dbg_kdb_mode)
1119 return;
1120
1114 buffer[0] = 'W'; 1121 buffer[0] = 'W';
1115 buffer[1] = hex_asc_hi(status); 1122 buffer[1] = hex_asc_hi(status);
1116 buffer[2] = hex_asc_lo(status); 1123 buffer[2] = hex_asc_lo(status);
@@ -1129,5 +1136,6 @@ void gdbstub_exit(int status)
1129 dbg_io_ops->write_char(hex_asc_lo(checksum)); 1136 dbg_io_ops->write_char(hex_asc_lo(checksum));
1130 1137
1131 /* make sure the output is flushed, lest the bootloader clobber it */ 1138 /* make sure the output is flushed, lest the bootloader clobber it */
1132 dbg_io_ops->flush(); 1139 if (dbg_io_ops->flush)
1140 dbg_io_ops->flush();
1133} 1141}
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 20059ef4459a..8418c2f8ec5d 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -153,6 +153,13 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
153 } else { 153 } else {
154 kdb_printf("%s: failed to set breakpoint at 0x%lx\n", 154 kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
155 __func__, bp->bp_addr); 155 __func__, bp->bp_addr);
156#ifdef CONFIG_DEBUG_RODATA
157 if (!bp->bp_type) {
158 kdb_printf("Software breakpoints are unavailable.\n"
159 " Change the kernel CONFIG_DEBUG_RODATA=n\n"
160 " OR use hw breaks: help bph\n");
161 }
162#endif
156 return 1; 163 return 1;
157 } 164 }
158 return 0; 165 return 0;
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 7179eac7b41c..07c9bbb94a0b 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -15,7 +15,6 @@
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/kdb.h> 16#include <linux/kdb.h>
17#include <linux/nmi.h> 17#include <linux/nmi.h>
18#include <asm/system.h>
19#include "kdb_private.h" 18#include "kdb_private.h"
20 19
21 20
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 4802eb5840e1..9b5f17da1c56 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -689,7 +689,7 @@ kdb_printit:
689 if (!dbg_kdb_mode && kgdb_connected) { 689 if (!dbg_kdb_mode && kgdb_connected) {
690 gdbstub_msg_write(kdb_buffer, retlen); 690 gdbstub_msg_write(kdb_buffer, retlen);
691 } else { 691 } else {
692 if (!dbg_io_ops->is_console) { 692 if (dbg_io_ops && !dbg_io_ops->is_console) {
693 len = strlen(kdb_buffer); 693 len = strlen(kdb_buffer);
694 cp = kdb_buffer; 694 cp = kdb_buffer;
695 while (len--) { 695 while (len--) {
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index 4bca634975c0..118527aa60ea 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -25,6 +25,7 @@
25#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */ 25#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
26 26
27static int kbd_exists; 27static int kbd_exists;
28static int kbd_last_ret;
28 29
29/* 30/*
30 * Check if the keyboard controller has a keypress for us. 31 * Check if the keyboard controller has a keypress for us.
@@ -90,8 +91,11 @@ int kdb_get_kbd_char(void)
90 return -1; 91 return -1;
91 } 92 }
92 93
93 if ((scancode & 0x80) != 0) 94 if ((scancode & 0x80) != 0) {
95 if (scancode == 0x9c)
96 kbd_last_ret = 0;
94 return -1; 97 return -1;
98 }
95 99
96 scancode &= 0x7f; 100 scancode &= 0x7f;
97 101
@@ -178,35 +182,82 @@ int kdb_get_kbd_char(void)
178 return -1; /* ignore unprintables */ 182 return -1; /* ignore unprintables */
179 } 183 }
180 184
181 if ((scancode & 0x7f) == 0x1c) { 185 if (scancode == 0x1c) {
182 /* 186 kbd_last_ret = 1;
183 * enter key. All done. Absorb the release scancode. 187 return 13;
184 */ 188 }
189
190 return keychar & 0xff;
191}
192EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
193
194/*
195 * Best effort cleanup of ENTER break codes on leaving KDB. Called on
196 * exiting KDB, when we know we processed an ENTER or KP ENTER scan
197 * code.
198 */
199void kdb_kbd_cleanup_state(void)
200{
201 int scancode, scanstatus;
202
203 /*
204 * Nothing to clean up, since either
205 * ENTER was never pressed, or has already
206 * gotten cleaned up.
207 */
208 if (!kbd_last_ret)
209 return;
210
211 kbd_last_ret = 0;
212 /*
213 * Enter key. Need to absorb the break code here, lest it gets
214 * leaked out if we exit KDB as the result of processing 'g'.
215 *
216 * This has several interesting implications:
217 * + Need to handle KP ENTER, which has break code 0xe0 0x9c.
218 * + Need to handle repeat ENTER and repeat KP ENTER. Repeats
219 * only get a break code at the end of the repeated
220 * sequence. This means we can't propagate the repeated key
221 * press, and must swallow it away.
222 * + Need to handle possible PS/2 mouse input.
223 * + Need to handle mashed keys.
224 */
225
226 while (1) {
185 while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) 227 while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
186 ; 228 cpu_relax();
187 229
188 /* 230 /*
189 * Fetch the scancode 231 * Fetch the scancode.
190 */ 232 */
191 scancode = inb(KBD_DATA_REG); 233 scancode = inb(KBD_DATA_REG);
192 scanstatus = inb(KBD_STATUS_REG); 234 scanstatus = inb(KBD_STATUS_REG);
193 235
194 while (scanstatus & KBD_STAT_MOUSE_OBF) { 236 /*
195 scancode = inb(KBD_DATA_REG); 237 * Skip mouse input.
196 scanstatus = inb(KBD_STATUS_REG); 238 */
197 } 239 if (scanstatus & KBD_STAT_MOUSE_OBF)
240 continue;
198 241
199 if (scancode != 0x9c) { 242 /*
200 /* 243 * If we see 0xe0, this is either a break code for KP
201 * Wasn't an enter-release, why not? 244 * ENTER, or a repeat make for KP ENTER. Either way,
202 */ 245 * since the second byte is equivalent to an ENTER,
203 kdb_printf("kdb: expected enter got 0x%x status 0x%x\n", 246 * skip the 0xe0 and try again.
204 scancode, scanstatus); 247 *
205 } 248 * If we see 0x1c, this must be a repeat ENTER or KP
249 * ENTER (and we swallowed 0xe0 before). Try again.
250 *
251 * We can also see make and break codes for other keys
252 * mashed before or after pressing ENTER. Thus, if we
253 * see anything other than 0x9c, we have to try again.
254 *
255 * Note, if you held some key as ENTER was depressed,
256 * that break code would get leaked out.
257 */
258 if (scancode != 0x9c)
259 continue;
206 260
207 return 13; 261 return;
208 } 262 }
209
210 return keychar & 0xff;
211} 263}
212EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
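
Since kdb_kbd_cleanup_state() above needs port I/O, here is a hedged userspace model of the same drain logic, fed from an array instead of inb(): skip 0xe0 prefixes, ignore repeats and mashed keys, and stop at the ENTER break code 0x9c (the mouse-status check is omitted because there is no status register here).

#include <stddef.h>
#include <stdio.h>

/* A captured stream: repeat ENTER (0x1c), a KP ENTER prefix (0xe0),
 * a mashed key break (0x9e for 'a'), then the ENTER break code 0x9c. */
static const unsigned char stream[] = { 0x1c, 0xe0, 0x9e, 0x9c, 0x2a };

/* Drain input up to and including the ENTER break code. */
static size_t drain_until_enter_break(const unsigned char *s, size_t len)
{
    size_t i;

    for (i = 0; i < len; i++) {
        if (s[i] == 0xe0)
            continue;           /* KP ENTER prefix; the next byte decides */
        if (s[i] == 0x9c)
            return i + 1;       /* ENTER break code: done */
        /* repeat makes, mashed keys, anything else: keep draining */
    }
    return i;
}

int main(void)
{
    printf("consumed %zu bytes\n",
           drain_until_enter_break(stream, sizeof(stream)));
    return 0;
}
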
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index e2ae7349437f..67b847dfa2bb 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1400,6 +1400,9 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1400 if (KDB_STATE(DOING_SS)) 1400 if (KDB_STATE(DOING_SS))
1401 KDB_STATE_CLEAR(SSBPT); 1401 KDB_STATE_CLEAR(SSBPT);
1402 1402
1403 /* Clean up any keyboard devices before leaving */
1404 kdb_kbd_cleanup_state();
1405
1403 return result; 1406 return result;
1404} 1407}
1405 1408
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index e381d105b40b..47c4e56e513b 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -246,6 +246,13 @@ extern void debug_kusage(void);
246 246
247extern void kdb_set_current_task(struct task_struct *); 247extern void kdb_set_current_task(struct task_struct *);
248extern struct task_struct *kdb_current_task; 248extern struct task_struct *kdb_current_task;
249
250#ifdef CONFIG_KDB_KEYBOARD
251extern void kdb_kbd_cleanup_state(void);
252#else /* ! CONFIG_KDB_KEYBOARD */
253#define kdb_kbd_cleanup_state()
254#endif /* ! CONFIG_KDB_KEYBOARD */
255
249#ifdef CONFIG_MODULES 256#ifdef CONFIG_MODULES
250extern struct list_head *kdb_modules; 257extern struct list_head *kdb_modules;
251#endif /* CONFIG_MODULES */ 258#endif /* CONFIG_MODULES */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 7d6fb40d2188..d35cc2d3a4cc 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -384,9 +384,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size)
384 if (!pfn_valid(pfn)) 384 if (!pfn_valid(pfn))
385 return 1; 385 return 1;
386 page = pfn_to_page(pfn); 386 page = pfn_to_page(pfn);
387 vaddr = kmap_atomic(page, KM_KDB); 387 vaddr = kmap_atomic(page);
388 memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); 388 memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
389 kunmap_atomic(vaddr, KM_KDB); 389 kunmap_atomic(vaddr);
390 390
391 return 0; 391 return 0;
392} 392}
diff --git a/kernel/dma.c b/kernel/dma.c
index 68a2306522c8..6c6262f86c17 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -18,7 +18,6 @@
18#include <linux/proc_fs.h> 18#include <linux/proc_fs.h>
19#include <linux/init.h> 19#include <linux/init.h>
20#include <asm/dma.h> 20#include <asm/dma.h>
21#include <asm/system.h>
22 21
23 22
24 23
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1b5c081d8b9f..a6a9ec4cd8f5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -118,6 +118,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
118 PERF_FLAG_FD_OUTPUT |\ 118 PERF_FLAG_FD_OUTPUT |\
119 PERF_FLAG_PID_CGROUP) 119 PERF_FLAG_PID_CGROUP)
120 120
121/*
122 * branch priv levels that need permission checks
123 */
124#define PERF_SAMPLE_BRANCH_PERM_PLM \
125 (PERF_SAMPLE_BRANCH_KERNEL |\
126 PERF_SAMPLE_BRANCH_HV)
127
121enum event_type_t { 128enum event_type_t {
122 EVENT_FLEXIBLE = 0x1, 129 EVENT_FLEXIBLE = 0x1,
123 EVENT_PINNED = 0x2, 130 EVENT_PINNED = 0x2,
@@ -128,8 +135,9 @@ enum event_type_t {
128 * perf_sched_events : >0 events exist 135 * perf_sched_events : >0 events exist
129 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 136 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
130 */ 137 */
131struct jump_label_key_deferred perf_sched_events __read_mostly; 138struct static_key_deferred perf_sched_events __read_mostly;
132static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 139static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
140static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
133 141
134static atomic_t nr_mmap_events __read_mostly; 142static atomic_t nr_mmap_events __read_mostly;
135static atomic_t nr_comm_events __read_mostly; 143static atomic_t nr_comm_events __read_mostly;
@@ -881,6 +889,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
881 if (is_cgroup_event(event)) 889 if (is_cgroup_event(event))
882 ctx->nr_cgroups++; 890 ctx->nr_cgroups++;
883 891
892 if (has_branch_stack(event))
893 ctx->nr_branch_stack++;
894
884 list_add_rcu(&event->event_entry, &ctx->event_list); 895 list_add_rcu(&event->event_entry, &ctx->event_list);
885 if (!ctx->nr_events) 896 if (!ctx->nr_events)
886 perf_pmu_rotate_start(ctx->pmu); 897 perf_pmu_rotate_start(ctx->pmu);
@@ -1020,6 +1031,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1020 cpuctx->cgrp = NULL; 1031 cpuctx->cgrp = NULL;
1021 } 1032 }
1022 1033
1034 if (has_branch_stack(event))
1035 ctx->nr_branch_stack--;
1036
1023 ctx->nr_events--; 1037 ctx->nr_events--;
1024 if (event->attr.inherit_stat) 1038 if (event->attr.inherit_stat)
1025 ctx->nr_stat--; 1039 ctx->nr_stat--;
@@ -2195,6 +2209,66 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2195} 2209}
2196 2210
2197/* 2211/*
2212 * When sampling the branch stack in system-wide mode, it may be necessary
2213 * to flush the stack on context switch. This happens when the branch
2214 * stack does not tag its entries with the pid of the current task.
2215 * Otherwise it becomes impossible to associate a branch entry with a
2216 * task. This ambiguity is more likely to appear when the branch stack
2217 * supports priv level filtering and the user sets it to monitor only
2218 * at the user level (which could be a useful measurement in system-wide
2219 * mode). In that case, the risk is high of having a branch stack with
2220 * branches from multiple tasks. Flushing may mean dropping the existing
2221 * entries or stashing them somewhere in the PMU specific code layer.
2222 *
2223 * This function provides the context switch callback to the lower code
2224 * layer. It is invoked ONLY when there is at least one system-wide context
2225 * with at least one active event using taken branch sampling.
2226 */
2227static void perf_branch_stack_sched_in(struct task_struct *prev,
2228 struct task_struct *task)
2229{
2230 struct perf_cpu_context *cpuctx;
2231 struct pmu *pmu;
2232 unsigned long flags;
2233
2234 /* no need to flush branch stack if not changing task */
2235 if (prev == task)
2236 return;
2237
2238 local_irq_save(flags);
2239
2240 rcu_read_lock();
2241
2242 list_for_each_entry_rcu(pmu, &pmus, entry) {
2243 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2244
2245 /*
2246 * check if the context has at least one
2247 * event using PERF_SAMPLE_BRANCH_STACK
2248 */
2249 if (cpuctx->ctx.nr_branch_stack > 0
2250 && pmu->flush_branch_stack) {
2251
2252 pmu = cpuctx->ctx.pmu;
2253
2254 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2255
2256 perf_pmu_disable(pmu);
2257
2258 pmu->flush_branch_stack();
2259
2260 perf_pmu_enable(pmu);
2261
2262 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2263 }
2264 }
2265
2266 rcu_read_unlock();
2267
2268 local_irq_restore(flags);
2269}
2270
2271/*
2198 * Called from scheduler to add the events of the current task 2272 * Called from scheduler to add the events of the current task
2199 * with interrupts disabled. 2273 * with interrupts disabled.
2200 * 2274 *
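
A compact userspace model of the flush walk in perf_branch_stack_sched_in() above (struct pmu_desc, flush_all and the sample entries are invented; the real code additionally takes the context locks and disables the PMU around the callback): walk every registered PMU and invoke its optional flush hook only when that context has branch-stack users.

#include <stddef.h>
#include <stdio.h>

struct pmu_desc {
    const char *name;
    int nr_branch_stack;                 /* events using branch sampling */
    void (*flush_branch_stack)(void);    /* optional hook */
};

static void x86_like_flush(void)
{
    printf("flushing LBR-style branch buffer\n");
}

static struct pmu_desc pmus[] = {
    { "cpu",      1, x86_like_flush },
    { "software", 0, NULL },
};

/* Called on context switch when system-wide branch-stack events exist. */
static void flush_all(void)
{
    for (size_t i = 0; i < sizeof(pmus) / sizeof(pmus[0]); i++) {
        struct pmu_desc *p = &pmus[i];

        if (p->nr_branch_stack > 0 && p->flush_branch_stack)
            p->flush_branch_stack();
    }
}

int main(void)
{
    flush_all();
    return 0;
}
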
@@ -2225,6 +2299,10 @@ void __perf_event_task_sched_in(struct task_struct *prev,
2225 */ 2299 */
2226 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2300 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2227 perf_cgroup_sched_in(prev, task); 2301 perf_cgroup_sched_in(prev, task);
2302
2303 /* check for system-wide branch_stack events */
2304 if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
2305 perf_branch_stack_sched_in(prev, task);
2228} 2306}
2229 2307
2230static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2308static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -2778,7 +2856,7 @@ static void free_event(struct perf_event *event)
2778 2856
2779 if (!event->parent) { 2857 if (!event->parent) {
2780 if (event->attach_state & PERF_ATTACH_TASK) 2858 if (event->attach_state & PERF_ATTACH_TASK)
2781 jump_label_dec_deferred(&perf_sched_events); 2859 static_key_slow_dec_deferred(&perf_sched_events);
2782 if (event->attr.mmap || event->attr.mmap_data) 2860 if (event->attr.mmap || event->attr.mmap_data)
2783 atomic_dec(&nr_mmap_events); 2861 atomic_dec(&nr_mmap_events);
2784 if (event->attr.comm) 2862 if (event->attr.comm)
@@ -2789,7 +2867,15 @@ static void free_event(struct perf_event *event)
2789 put_callchain_buffers(); 2867 put_callchain_buffers();
2790 if (is_cgroup_event(event)) { 2868 if (is_cgroup_event(event)) {
2791 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); 2869 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2792 jump_label_dec_deferred(&perf_sched_events); 2870 static_key_slow_dec_deferred(&perf_sched_events);
2871 }
2872
2873 if (has_branch_stack(event)) {
2874 static_key_slow_dec_deferred(&perf_sched_events);
2875 /* is system-wide event */
2876 if (!(event->attach_state & PERF_ATTACH_TASK))
2877 atomic_dec(&per_cpu(perf_branch_stack_events,
2878 event->cpu));
2793 } 2879 }
2794 } 2880 }
2795 2881
@@ -3238,10 +3324,6 @@ int perf_event_task_disable(void)
3238 return 0; 3324 return 0;
3239} 3325}
3240 3326
3241#ifndef PERF_EVENT_INDEX_OFFSET
3242# define PERF_EVENT_INDEX_OFFSET 0
3243#endif
3244
3245static int perf_event_index(struct perf_event *event) 3327static int perf_event_index(struct perf_event *event)
3246{ 3328{
3247 if (event->hw.state & PERF_HES_STOPPED) 3329 if (event->hw.state & PERF_HES_STOPPED)
@@ -3250,21 +3332,26 @@ static int perf_event_index(struct perf_event *event)
3250 if (event->state != PERF_EVENT_STATE_ACTIVE) 3332 if (event->state != PERF_EVENT_STATE_ACTIVE)
3251 return 0; 3333 return 0;
3252 3334
3253 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; 3335 return event->pmu->event_idx(event);
3254} 3336}
3255 3337
3256static void calc_timer_values(struct perf_event *event, 3338static void calc_timer_values(struct perf_event *event,
3339 u64 *now,
3257 u64 *enabled, 3340 u64 *enabled,
3258 u64 *running) 3341 u64 *running)
3259{ 3342{
3260 u64 now, ctx_time; 3343 u64 ctx_time;
3261 3344
3262 now = perf_clock(); 3345 *now = perf_clock();
3263 ctx_time = event->shadow_ctx_time + now; 3346 ctx_time = event->shadow_ctx_time + *now;
3264 *enabled = ctx_time - event->tstamp_enabled; 3347 *enabled = ctx_time - event->tstamp_enabled;
3265 *running = ctx_time - event->tstamp_running; 3348 *running = ctx_time - event->tstamp_running;
3266} 3349}
3267 3350
3351void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
3352{
3353}
3354
3268/* 3355/*
3269 * Callers need to ensure there can be no nesting of this function, otherwise 3356 * Callers need to ensure there can be no nesting of this function, otherwise
3270 * the seqlock logic goes bad. We can not serialize this because the arch 3357 * the seqlock logic goes bad. We can not serialize this because the arch
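arch_perf_update_userpage() above is declared __weak with an empty body: architectures that can expose extra data in the mmap'ed page override it, everyone else silently gets the no-op. The same weak-symbol default pattern in standalone C (GCC/Clang attribute; the hook name is made up for illustration):

#include <stdio.h>

/* weak fallback: used whenever no strong definition is linked in */
void __attribute__((weak)) arch_hook(void)
{
	/* default: do nothing */
}

int main(void)
{
	arch_hook();		/* runs the override if one exists, else the no-op */
	puts("done");
	return 0;
}

/* in a separate, optionally linked file:
 *	void arch_hook(void) { puts("arch-specific userpage update"); }
 * linking this strong definition replaces the weak one above. */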
@@ -3274,7 +3361,7 @@ void perf_event_update_userpage(struct perf_event *event)
3274{ 3361{
3275 struct perf_event_mmap_page *userpg; 3362 struct perf_event_mmap_page *userpg;
3276 struct ring_buffer *rb; 3363 struct ring_buffer *rb;
3277 u64 enabled, running; 3364 u64 enabled, running, now;
3278 3365
3279 rcu_read_lock(); 3366 rcu_read_lock();
3280 /* 3367 /*
@@ -3286,7 +3373,7 @@ void perf_event_update_userpage(struct perf_event *event)
3286 * because of locking issue as we can be called in 3373 * because of locking issue as we can be called in
3287 * NMI context 3374 * NMI context
3288 */ 3375 */
3289 calc_timer_values(event, &enabled, &running); 3376 calc_timer_values(event, &now, &enabled, &running);
3290 rb = rcu_dereference(event->rb); 3377 rb = rcu_dereference(event->rb);
3291 if (!rb) 3378 if (!rb)
3292 goto unlock; 3379 goto unlock;
@@ -3302,7 +3389,7 @@ void perf_event_update_userpage(struct perf_event *event)
3302 barrier(); 3389 barrier();
3303 userpg->index = perf_event_index(event); 3390 userpg->index = perf_event_index(event);
3304 userpg->offset = perf_event_count(event); 3391 userpg->offset = perf_event_count(event);
3305 if (event->state == PERF_EVENT_STATE_ACTIVE) 3392 if (userpg->index)
3306 userpg->offset -= local64_read(&event->hw.prev_count); 3393 userpg->offset -= local64_read(&event->hw.prev_count);
3307 3394
3308 userpg->time_enabled = enabled + 3395 userpg->time_enabled = enabled +
@@ -3311,6 +3398,8 @@ void perf_event_update_userpage(struct perf_event *event)
3311 userpg->time_running = running + 3398 userpg->time_running = running +
3312 atomic64_read(&event->child_total_time_running); 3399 atomic64_read(&event->child_total_time_running);
3313 3400
3401 arch_perf_update_userpage(userpg, now);
3402
3314 barrier(); 3403 barrier();
3315 ++userpg->lock; 3404 ++userpg->lock;
3316 preempt_enable(); 3405 preempt_enable();
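The update above is bracketed by barrier()/++userpg->lock, so user space must treat userpg->lock as a sequence counter and retry if it changed during the read. A hedged reader sketch, assuming the monitoring process has already mmap'ed the event's first page as struct perf_event_mmap_page (linux/perf_event.h):

#include <linux/perf_event.h>
#include <stdint.h>

/* take a consistent snapshot of the self-monitoring fields */
static void read_userpage(volatile struct perf_event_mmap_page *pc,
			  uint64_t *enabled, uint64_t *running, uint32_t *idx)
{
	uint32_t seq;

	do {
		seq = pc->lock;
		__sync_synchronize();	/* full barrier; the kernel side uses barrier() */
		*idx     = pc->index;
		*enabled = pc->time_enabled;
		*running = pc->time_running;
		__sync_synchronize();
	} while (pc->lock != seq);	/* the writer was active: retry */
}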
@@ -3568,6 +3657,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3568 event->mmap_user = get_current_user(); 3657 event->mmap_user = get_current_user();
3569 vma->vm_mm->pinned_vm += event->mmap_locked; 3658 vma->vm_mm->pinned_vm += event->mmap_locked;
3570 3659
3660 perf_event_update_userpage(event);
3661
3571unlock: 3662unlock:
3572 if (!ret) 3663 if (!ret)
3573 atomic_inc(&event->mmap_count); 3664 atomic_inc(&event->mmap_count);
@@ -3799,7 +3890,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3799static void perf_output_read(struct perf_output_handle *handle, 3890static void perf_output_read(struct perf_output_handle *handle,
3800 struct perf_event *event) 3891 struct perf_event *event)
3801{ 3892{
3802 u64 enabled = 0, running = 0; 3893 u64 enabled = 0, running = 0, now;
3803 u64 read_format = event->attr.read_format; 3894 u64 read_format = event->attr.read_format;
3804 3895
3805 /* 3896 /*
@@ -3812,7 +3903,7 @@ static void perf_output_read(struct perf_output_handle *handle,
3812 * NMI context 3903 * NMI context
3813 */ 3904 */
3814 if (read_format & PERF_FORMAT_TOTAL_TIMES) 3905 if (read_format & PERF_FORMAT_TOTAL_TIMES)
3815 calc_timer_values(event, &enabled, &running); 3906 calc_timer_values(event, &now, &enabled, &running);
3816 3907
3817 if (event->attr.read_format & PERF_FORMAT_GROUP) 3908 if (event->attr.read_format & PERF_FORMAT_GROUP)
3818 perf_output_read_group(handle, event, enabled, running); 3909 perf_output_read_group(handle, event, enabled, running);
@@ -3902,6 +3993,24 @@ void perf_output_sample(struct perf_output_handle *handle,
3902 } 3993 }
3903 } 3994 }
3904 } 3995 }
3996
3997 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
3998 if (data->br_stack) {
3999 size_t size;
4000
4001 size = data->br_stack->nr
4002 * sizeof(struct perf_branch_entry);
4003
4004 perf_output_put(handle, data->br_stack->nr);
4005 perf_output_copy(handle, data->br_stack->entries, size);
4006 } else {
4007 /*
4008 * we always store at least the value of nr
4009 */
4010 u64 nr = 0;
4011 perf_output_put(handle, nr);
4012 }
4013 }
3905} 4014}
3906 4015
3907void perf_prepare_sample(struct perf_event_header *header, 4016void perf_prepare_sample(struct perf_event_header *header,
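With PERF_SAMPLE_BRANCH_STACK the sample payload is a u64 count followed by that many branch entries, and the count is emitted even when no stack was captured, so parsers never have to guess. A hedged consumer sketch; the entry layout below is a simplified stand-in for struct perf_branch_entry (three u64-sized words in the uapi header):

#include <stdint.h>
#include <stdio.h>

struct branch_entry {		/* simplified stand-in for perf_branch_entry */
	uint64_t from;
	uint64_t to;
	uint64_t flags;
};

/* walk one PERF_SAMPLE_BRANCH_STACK payload and return the advanced cursor */
static const uint64_t *parse_branch_stack(const uint64_t *p)
{
	uint64_t i, nr = *p++;				/* always present, may be 0 */
	const struct branch_entry *ent = (const void *)p;

	for (i = 0; i < nr; i++)
		printf("branch: %#llx -> %#llx\n",
		       (unsigned long long)ent[i].from,
		       (unsigned long long)ent[i].to);

	return p + nr * (sizeof(*ent) / sizeof(uint64_t));
}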
@@ -3944,6 +4053,15 @@ void perf_prepare_sample(struct perf_event_header *header,
3944 WARN_ON_ONCE(size & (sizeof(u64)-1)); 4053 WARN_ON_ONCE(size & (sizeof(u64)-1));
3945 header->size += size; 4054 header->size += size;
3946 } 4055 }
4056
4057 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4058 int size = sizeof(u64); /* nr */
4059 if (data->br_stack) {
4060 size += data->br_stack->nr
4061 * sizeof(struct perf_branch_entry);
4062 }
4063 header->size += size;
4064 }
3947} 4065}
3948 4066
3949static void perf_event_output(struct perf_event *event, 4067static void perf_event_output(struct perf_event *event,
@@ -4986,7 +5104,7 @@ fail:
4986 return err; 5104 return err;
4987} 5105}
4988 5106
4989struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; 5107struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
4990 5108
4991static void sw_perf_event_destroy(struct perf_event *event) 5109static void sw_perf_event_destroy(struct perf_event *event)
4992{ 5110{
@@ -4994,7 +5112,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
4994 5112
4995 WARN_ON(event->parent); 5113 WARN_ON(event->parent);
4996 5114
4997 jump_label_dec(&perf_swevent_enabled[event_id]); 5115 static_key_slow_dec(&perf_swevent_enabled[event_id]);
4998 swevent_hlist_put(event); 5116 swevent_hlist_put(event);
4999} 5117}
5000 5118
@@ -5005,6 +5123,12 @@ static int perf_swevent_init(struct perf_event *event)
5005 if (event->attr.type != PERF_TYPE_SOFTWARE) 5123 if (event->attr.type != PERF_TYPE_SOFTWARE)
5006 return -ENOENT; 5124 return -ENOENT;
5007 5125
5126 /*
5127 * no branch sampling for software events
5128 */
5129 if (has_branch_stack(event))
5130 return -EOPNOTSUPP;
5131
5008 switch (event_id) { 5132 switch (event_id) {
5009 case PERF_COUNT_SW_CPU_CLOCK: 5133 case PERF_COUNT_SW_CPU_CLOCK:
5010 case PERF_COUNT_SW_TASK_CLOCK: 5134 case PERF_COUNT_SW_TASK_CLOCK:
@@ -5024,13 +5148,18 @@ static int perf_swevent_init(struct perf_event *event)
5024 if (err) 5148 if (err)
5025 return err; 5149 return err;
5026 5150
5027 jump_label_inc(&perf_swevent_enabled[event_id]); 5151 static_key_slow_inc(&perf_swevent_enabled[event_id]);
5028 event->destroy = sw_perf_event_destroy; 5152 event->destroy = sw_perf_event_destroy;
5029 } 5153 }
5030 5154
5031 return 0; 5155 return 0;
5032} 5156}
5033 5157
5158static int perf_swevent_event_idx(struct perf_event *event)
5159{
5160 return 0;
5161}
5162
5034static struct pmu perf_swevent = { 5163static struct pmu perf_swevent = {
5035 .task_ctx_nr = perf_sw_context, 5164 .task_ctx_nr = perf_sw_context,
5036 5165
@@ -5040,6 +5169,8 @@ static struct pmu perf_swevent = {
5040 .start = perf_swevent_start, 5169 .start = perf_swevent_start,
5041 .stop = perf_swevent_stop, 5170 .stop = perf_swevent_stop,
5042 .read = perf_swevent_read, 5171 .read = perf_swevent_read,
5172
5173 .event_idx = perf_swevent_event_idx,
5043}; 5174};
5044 5175
5045#ifdef CONFIG_EVENT_TRACING 5176#ifdef CONFIG_EVENT_TRACING
@@ -5108,6 +5239,12 @@ static int perf_tp_event_init(struct perf_event *event)
5108 if (event->attr.type != PERF_TYPE_TRACEPOINT) 5239 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5109 return -ENOENT; 5240 return -ENOENT;
5110 5241
5242 /*
5243 * no branch sampling for tracepoint events
5244 */
5245 if (has_branch_stack(event))
5246 return -EOPNOTSUPP;
5247
5111 err = perf_trace_init(event); 5248 err = perf_trace_init(event);
5112 if (err) 5249 if (err)
5113 return err; 5250 return err;
@@ -5126,6 +5263,8 @@ static struct pmu perf_tracepoint = {
5126 .start = perf_swevent_start, 5263 .start = perf_swevent_start,
5127 .stop = perf_swevent_stop, 5264 .stop = perf_swevent_stop,
5128 .read = perf_swevent_read, 5265 .read = perf_swevent_read,
5266
5267 .event_idx = perf_swevent_event_idx,
5129}; 5268};
5130 5269
5131static inline void perf_tp_register(void) 5270static inline void perf_tp_register(void)
@@ -5331,6 +5470,12 @@ static int cpu_clock_event_init(struct perf_event *event)
5331 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) 5470 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5332 return -ENOENT; 5471 return -ENOENT;
5333 5472
5473 /*
5474 * no branch sampling for software events
5475 */
5476 if (has_branch_stack(event))
5477 return -EOPNOTSUPP;
5478
5334 perf_swevent_init_hrtimer(event); 5479 perf_swevent_init_hrtimer(event);
5335 5480
5336 return 0; 5481 return 0;
@@ -5345,6 +5490,8 @@ static struct pmu perf_cpu_clock = {
5345 .start = cpu_clock_event_start, 5490 .start = cpu_clock_event_start,
5346 .stop = cpu_clock_event_stop, 5491 .stop = cpu_clock_event_stop,
5347 .read = cpu_clock_event_read, 5492 .read = cpu_clock_event_read,
5493
5494 .event_idx = perf_swevent_event_idx,
5348}; 5495};
5349 5496
5350/* 5497/*
@@ -5403,6 +5550,12 @@ static int task_clock_event_init(struct perf_event *event)
5403 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) 5550 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5404 return -ENOENT; 5551 return -ENOENT;
5405 5552
5553 /*
5554 * no branch sampling for software events
5555 */
5556 if (has_branch_stack(event))
5557 return -EOPNOTSUPP;
5558
5406 perf_swevent_init_hrtimer(event); 5559 perf_swevent_init_hrtimer(event);
5407 5560
5408 return 0; 5561 return 0;
@@ -5417,6 +5570,8 @@ static struct pmu perf_task_clock = {
5417 .start = task_clock_event_start, 5570 .start = task_clock_event_start,
5418 .stop = task_clock_event_stop, 5571 .stop = task_clock_event_stop,
5419 .read = task_clock_event_read, 5572 .read = task_clock_event_read,
5573
5574 .event_idx = perf_swevent_event_idx,
5420}; 5575};
5421 5576
5422static void perf_pmu_nop_void(struct pmu *pmu) 5577static void perf_pmu_nop_void(struct pmu *pmu)
@@ -5444,6 +5599,11 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
5444 perf_pmu_enable(pmu); 5599 perf_pmu_enable(pmu);
5445} 5600}
5446 5601
5602static int perf_event_idx_default(struct perf_event *event)
5603{
5604 return event->hw.idx + 1;
5605}
5606
5447/* 5607/*
5448 * Ensures all contexts with the same task_ctx_nr have the same 5608 * Ensures all contexts with the same task_ctx_nr have the same
5449 * pmu_cpu_context too. 5609 * pmu_cpu_context too.
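perf_event_idx_default() is what lands in userpg->index for PMUs that don't override ->event_idx: hw.idx + 1, so that index 0 can mean "no directly readable counter" -- which is exactly what the software, tracepoint and breakpoint PMUs return elsewhere in this diff. User space only needs to test for zero before attempting a direct counter read (e.g. rdpmc with index - 1 on x86); a hedged check, reusing the mmap'ed page from the earlier sketch:

#include <linux/perf_event.h>

/* 1: the counter can be read directly from user space,
 * 0: fall back to read() on the event file descriptor */
static int counter_is_user_readable(const volatile struct perf_event_mmap_page *pc)
{
	return pc->index != 0;
}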
@@ -5530,6 +5690,7 @@ static int pmu_dev_alloc(struct pmu *pmu)
5530 if (!pmu->dev) 5690 if (!pmu->dev)
5531 goto out; 5691 goto out;
5532 5692
5693 pmu->dev->groups = pmu->attr_groups;
5533 device_initialize(pmu->dev); 5694 device_initialize(pmu->dev);
5534 ret = dev_set_name(pmu->dev, "%s", pmu->name); 5695 ret = dev_set_name(pmu->dev, "%s", pmu->name);
5535 if (ret) 5696 if (ret)
@@ -5633,6 +5794,9 @@ got_cpu_context:
5633 pmu->pmu_disable = perf_pmu_nop_void; 5794 pmu->pmu_disable = perf_pmu_nop_void;
5634 } 5795 }
5635 5796
5797 if (!pmu->event_idx)
5798 pmu->event_idx = perf_event_idx_default;
5799
5636 list_add_rcu(&pmu->entry, &pmus); 5800 list_add_rcu(&pmu->entry, &pmus);
5637 ret = 0; 5801 ret = 0;
5638unlock: 5802unlock:
@@ -5825,7 +5989,7 @@ done:
5825 5989
5826 if (!event->parent) { 5990 if (!event->parent) {
5827 if (event->attach_state & PERF_ATTACH_TASK) 5991 if (event->attach_state & PERF_ATTACH_TASK)
5828 jump_label_inc(&perf_sched_events.key); 5992 static_key_slow_inc(&perf_sched_events.key);
5829 if (event->attr.mmap || event->attr.mmap_data) 5993 if (event->attr.mmap || event->attr.mmap_data)
5830 atomic_inc(&nr_mmap_events); 5994 atomic_inc(&nr_mmap_events);
5831 if (event->attr.comm) 5995 if (event->attr.comm)
@@ -5839,6 +6003,12 @@ done:
5839 return ERR_PTR(err); 6003 return ERR_PTR(err);
5840 } 6004 }
5841 } 6005 }
6006 if (has_branch_stack(event)) {
6007 static_key_slow_inc(&perf_sched_events.key);
6008 if (!(event->attach_state & PERF_ATTACH_TASK))
6009 atomic_inc(&per_cpu(perf_branch_stack_events,
6010 event->cpu));
6011 }
5842 } 6012 }
5843 6013
5844 return event; 6014 return event;
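The accounting added here is the mirror image of the teardown in free_event() earlier in this diff: every branch-stack event takes a perf_sched_events reference, and system-wide ones additionally bump the per-CPU counter consulted on context switch. A simplified user-space model of that paired bookkeeping (plain atomics in place of static keys and per-CPU variables; names are illustrative):

#include <stdatomic.h>

static atomic_int sched_events_key;		/* models perf_sched_events */
static atomic_int branch_stack_events;		/* models the per-cpu counter */

struct fake_event { int per_task; };

static void account_branch_event(const struct fake_event *ev)
{
	atomic_fetch_add(&sched_events_key, 1);
	if (!ev->per_task)			/* system-wide event */
		atomic_fetch_add(&branch_stack_events, 1);
}

/* must undo exactly what account_branch_event() did */
static void unaccount_branch_event(const struct fake_event *ev)
{
	atomic_fetch_sub(&sched_events_key, 1);
	if (!ev->per_task)
		atomic_fetch_sub(&branch_stack_events, 1);
}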
@@ -5908,6 +6078,40 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
5908 if (attr->read_format & ~(PERF_FORMAT_MAX-1)) 6078 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
5909 return -EINVAL; 6079 return -EINVAL;
5910 6080
6081 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
6082 u64 mask = attr->branch_sample_type;
6083
6084 /* only using defined bits */
6085 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
6086 return -EINVAL;
6087
6088 /* at least one branch bit must be set */
6089 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
6090 return -EINVAL;
6091
6092 /* kernel level capture: check permissions */
6093 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
6094 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6095 return -EACCES;
6096
6097 /* propagate priv level, when not set for branch */
6098 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
6099
6100 /* exclude_kernel checked on syscall entry */
6101 if (!attr->exclude_kernel)
6102 mask |= PERF_SAMPLE_BRANCH_KERNEL;
6103
6104 if (!attr->exclude_user)
6105 mask |= PERF_SAMPLE_BRANCH_USER;
6106
6107 if (!attr->exclude_hv)
6108 mask |= PERF_SAMPLE_BRANCH_HV;
6109 /*
6110 * adjust user setting (for HW filter setup)
6111 */
6112 attr->branch_sample_type = mask;
6113 }
6114 }
5911out: 6115out:
5912 return ret; 6116 return ret;
5913 6117
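The validation above rejects undefined branch_sample_type bits, insists on at least one non-privilege bit, and, when the caller left the privilege bits clear, copies them from exclude_user/exclude_kernel/exclude_hv so the hardware branch filter matches the event's own scope. A hedged user-space sketch of opening such an event (constants come from the uapi perf_event.h added by this series; error handling trimmed):

#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int open_branch_sampling_event(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
	/* no privilege bits set: the kernel propagates them from exclude_* */
	attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY;
	attr.exclude_kernel = 1;	/* so PERF_SAMPLE_BRANCH_KERNEL is not added */

	/* pid 0 = this task, cpu -1 = any cpu, no group, no flags */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}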
@@ -6063,7 +6267,7 @@ SYSCALL_DEFINE5(perf_event_open,
6063 * - that may need work on context switch 6267 * - that may need work on context switch
6064 */ 6268 */
6065 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); 6269 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6066 jump_label_inc(&perf_sched_events.key); 6270 static_key_slow_inc(&perf_sched_events.key);
6067 } 6271 }
6068 6272
6069 /* 6273 /*
@@ -6912,6 +7116,13 @@ void __init perf_event_init(void)
6912 7116
6913 /* do not patch jump label more than once per second */ 7117 /* do not patch jump label more than once per second */
6914 jump_label_rate_limit(&perf_sched_events, HZ); 7118 jump_label_rate_limit(&perf_sched_events, HZ);
7119
7120 /*
7121 * Build time assertion that we keep the data_head at the intended
7122 * location. IOW, validation we got the __reserved[] size right.
7123 */
7124 BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
7125 != 1024);
6915} 7126}
6916 7127
6917static int __init perf_event_sysfs_init(void) 7128static int __init perf_event_sysfs_init(void)
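The BUILD_BUG_ON pins data_head at byte offset 1024 of struct perf_event_mmap_page, so a miscounted __reserved[] array breaks the build instead of silently shifting the ABI. The same idea in standalone C11, with a hypothetical layout and _Static_assert standing in for BUILD_BUG_ON:

#include <stddef.h>
#include <stdint.h>

struct mmap_page_model {			/* hypothetical layout, for illustration */
	uint32_t version;
	uint32_t compat_version;
	uint8_t  reserved[1024 - 8];		/* padding that keeps data_head at 1024 */
	uint64_t data_head;
};

/* compilation fails if anyone resizes reserved[] incorrectly */
_Static_assert(offsetof(struct mmap_page_model, data_head) == 1024,
	       "data_head must stay at byte offset 1024");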
@@ -6943,8 +7154,7 @@ unlock:
6943device_initcall(perf_event_sysfs_init); 7154device_initcall(perf_event_sysfs_init);
6944 7155
6945#ifdef CONFIG_CGROUP_PERF 7156#ifdef CONFIG_CGROUP_PERF
6946static struct cgroup_subsys_state *perf_cgroup_create( 7157static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
6947 struct cgroup_subsys *ss, struct cgroup *cont)
6948{ 7158{
6949 struct perf_cgroup *jc; 7159 struct perf_cgroup *jc;
6950 7160
@@ -6961,8 +7171,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(
6961 return &jc->css; 7171 return &jc->css;
6962} 7172}
6963 7173
6964static void perf_cgroup_destroy(struct cgroup_subsys *ss, 7174static void perf_cgroup_destroy(struct cgroup *cont)
6965 struct cgroup *cont)
6966{ 7175{
6967 struct perf_cgroup *jc; 7176 struct perf_cgroup *jc;
6968 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7177 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@ -6978,8 +7187,7 @@ static int __perf_cgroup_move(void *info)
6978 return 0; 7187 return 0;
6979} 7188}
6980 7189
6981static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7190static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
6982 struct cgroup_taskset *tset)
6983{ 7191{
6984 struct task_struct *task; 7192 struct task_struct *task;
6985 7193
@@ -6987,8 +7195,8 @@ static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
6987 task_function_call(task, __perf_cgroup_move, task); 7195 task_function_call(task, __perf_cgroup_move, task);
6988} 7196}
6989 7197
6990static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 7198static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
6991 struct cgroup *old_cgrp, struct task_struct *task) 7199 struct task_struct *task)
6992{ 7200{
6993 /* 7201 /*
6994 * cgroup_exit() is called in the copy_process() failure path. 7202 * cgroup_exit() is called in the copy_process() failure path.
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index b7971d6f38bf..bb38c4d3ee12 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -581,6 +581,12 @@ static int hw_breakpoint_event_init(struct perf_event *bp)
581 if (bp->attr.type != PERF_TYPE_BREAKPOINT) 581 if (bp->attr.type != PERF_TYPE_BREAKPOINT)
582 return -ENOENT; 582 return -ENOENT;
583 583
584 /*
585 * no branch sampling for breakpoint events
586 */
587 if (has_branch_stack(bp))
588 return -EOPNOTSUPP;
589
584 err = register_perf_hw_breakpoint(bp); 590 err = register_perf_hw_breakpoint(bp);
585 if (err) 591 if (err)
586 return err; 592 return err;
@@ -613,6 +619,11 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)
613 bp->hw.state = PERF_HES_STOPPED; 619 bp->hw.state = PERF_HES_STOPPED;
614} 620}
615 621
622static int hw_breakpoint_event_idx(struct perf_event *bp)
623{
624 return 0;
625}
626
616static struct pmu perf_breakpoint = { 627static struct pmu perf_breakpoint = {
617 .task_ctx_nr = perf_sw_context, /* could eventually get its own */ 628 .task_ctx_nr = perf_sw_context, /* could eventually get its own */
618 629
@@ -622,6 +633,8 @@ static struct pmu perf_breakpoint = {
622 .start = hw_breakpoint_start, 633 .start = hw_breakpoint_start,
623 .stop = hw_breakpoint_stop, 634 .stop = hw_breakpoint_stop,
624 .read = hw_breakpoint_pmu_read, 635 .read = hw_breakpoint_pmu_read,
636
637 .event_idx = hw_breakpoint_event_idx,
625}; 638};
626 639
627int __init init_hw_breakpoint(void) 640int __init init_hw_breakpoint(void)
@@ -651,10 +664,10 @@ int __init init_hw_breakpoint(void)
651 664
652 err_alloc: 665 err_alloc:
653 for_each_possible_cpu(err_cpu) { 666 for_each_possible_cpu(err_cpu) {
654 if (err_cpu == cpu)
655 break;
656 for (i = 0; i < TYPE_MAX; i++) 667 for (i = 0; i < TYPE_MAX; i++)
657 kfree(per_cpu(nr_task_bp_pinned[i], cpu)); 668 kfree(per_cpu(nr_task_bp_pinned[i], cpu));
669 if (err_cpu == cpu)
670 break;
658 } 671 }
659 672
660 return -ENOMEM; 673 return -ENOMEM;
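The err_alloc reordering above makes the cleanup loop run its kfree pass before testing for the CPU that failed, so allocations made before the failure point are released rather than leaked. The general rollback shape, as a self-contained sketch (slots that were never allocated are assumed to be NULL, so free() on them is a no-op):

#include <stdlib.h>

#define NTYPES 2

/* undo a partially completed per-cpu allocation: free everything up to and
 * including the cpu that failed, then stop */
static void rollback(void *bufs[][NTYPES], int ncpu, int failed_cpu)
{
	int cpu, i;

	for (cpu = 0; cpu < ncpu; cpu++) {
		for (i = 0; i < NTYPES; i++)
			free(bufs[cpu][i]);
		if (cpu == failed_cpu)		/* free first, then stop */
			break;
	}
}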
diff --git a/kernel/exit.c b/kernel/exit.c
index 4b4042f9bc6a..d8bd3b425fa7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -52,6 +52,7 @@
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h> 53#include <linux/oom.h>
54#include <linux/writeback.h> 54#include <linux/writeback.h>
55#include <linux/shm.h>
55 56
56#include <asm/uaccess.h> 57#include <asm/uaccess.h>
57#include <asm/unistd.h> 58#include <asm/unistd.h>
@@ -424,7 +425,7 @@ void daemonize(const char *name, ...)
424 */ 425 */
425 exit_mm(current); 426 exit_mm(current);
426 /* 427 /*
427 * We don't want to have TIF_FREEZE set if the system-wide hibernation 428 * We don't want to get frozen, in case system-wide hibernation
428 * or suspend transition begins right now. 429 * or suspend transition begins right now.
429 */ 430 */
430 current->flags |= (PF_NOFREEZE | PF_KTHREAD); 431 current->flags |= (PF_NOFREEZE | PF_KTHREAD);
@@ -473,7 +474,7 @@ static void close_files(struct files_struct * files)
473 i = j * __NFDBITS; 474 i = j * __NFDBITS;
474 if (i >= fdt->max_fds) 475 if (i >= fdt->max_fds)
475 break; 476 break;
476 set = fdt->open_fds->fds_bits[j++]; 477 set = fdt->open_fds[j++];
477 while (set) { 478 while (set) {
478 if (set & 1) { 479 if (set & 1) {
479 struct file * file = xchg(&fdt->fd[i], NULL); 480 struct file * file = xchg(&fdt->fd[i], NULL);
@@ -686,11 +687,11 @@ static void exit_mm(struct task_struct * tsk)
686} 687}
687 688
688/* 689/*
689 * When we die, we re-parent all our children. 690 * When we die, we re-parent all our children, and try to:
690 * Try to give them to another thread in our thread 691 * 1. give them to another thread in our thread group, if such a member exists
691 * group, and if no such member exists, give it to 692 * 2. give it to the first ancestor process which prctl'd itself as a
692 * the child reaper process (ie "init") in our pid 693 * child_subreaper for its children (like a service manager)
693 * space. 694 * 3. give it to the init process (PID 1) in our pid namespace
694 */ 695 */
695static struct task_struct *find_new_reaper(struct task_struct *father) 696static struct task_struct *find_new_reaper(struct task_struct *father)
696 __releases(&tasklist_lock) 697 __releases(&tasklist_lock)
@@ -710,8 +711,11 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
710 711
711 if (unlikely(pid_ns->child_reaper == father)) { 712 if (unlikely(pid_ns->child_reaper == father)) {
712 write_unlock_irq(&tasklist_lock); 713 write_unlock_irq(&tasklist_lock);
713 if (unlikely(pid_ns == &init_pid_ns)) 714 if (unlikely(pid_ns == &init_pid_ns)) {
714 panic("Attempted to kill init!"); 715 panic("Attempted to kill init! exitcode=0x%08x\n",
716 father->signal->group_exit_code ?:
717 father->exit_code);
718 }
715 719
716 zap_pid_ns_processes(pid_ns); 720 zap_pid_ns_processes(pid_ns);
717 write_lock_irq(&tasklist_lock); 721 write_lock_irq(&tasklist_lock);
@@ -721,6 +725,29 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
721 * forget_original_parent() must move them somewhere. 725 * forget_original_parent() must move them somewhere.
722 */ 726 */
723 pid_ns->child_reaper = init_pid_ns.child_reaper; 727 pid_ns->child_reaper = init_pid_ns.child_reaper;
728 } else if (father->signal->has_child_subreaper) {
729 struct task_struct *reaper;
730
731 /*
732 * Find the first ancestor marked as child_subreaper.
733 * Note that the code below checks same_thread_group(reaper,
734 * pid_ns->child_reaper). This is what we need to DTRT in a
735 * PID namespace. However we still need the check above, see
736 * http://marc.info/?l=linux-kernel&m=131385460420380
737 */
738 for (reaper = father->real_parent;
739 reaper != &init_task;
740 reaper = reaper->real_parent) {
741 if (same_thread_group(reaper, pid_ns->child_reaper))
742 break;
743 if (!reaper->signal->is_child_subreaper)
744 continue;
745 thread = reaper;
746 do {
747 if (!(thread->flags & PF_EXITING))
748 return reaper;
749 } while_each_thread(reaper, thread);
750 }
724 } 751 }
725 752
726 return pid_ns->child_reaper; 753 return pid_ns->child_reaper;
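The new branch walks up real_parent looking for the nearest live ancestor that marked itself as a child subreaper, so a service manager can adopt its double-forked descendants instead of handing them to init. A hedged user-space use of the new prctl (the constant comes from the linux/prctl.h change in this series, hence the fallback define):

#include <stdio.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef PR_SET_CHILD_SUBREAPER
#define PR_SET_CHILD_SUBREAPER	36	/* from this series' include/linux/prctl.h */
#endif

int main(void)
{
	int reaped = 0;

	prctl(PR_SET_CHILD_SUBREAPER, 1);	/* adopt orphaned descendants */

	if (fork() == 0) {			/* intermediate child */
		if (fork() == 0) {		/* grandchild */
			sleep(1);
			_exit(0);		/* by now reparented to us, not to init */
		}
		_exit(0);			/* orphans the grandchild */
	}

	while (wait(NULL) > 0)			/* collects the child, then the grandchild */
		reaped++;
	printf("reaped %d descendants\n", reaped);	/* 2 with the prctl, 1 without */
	return 0;
}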
@@ -818,25 +845,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
818 if (group_dead) 845 if (group_dead)
819 kill_orphaned_pgrp(tsk->group_leader, NULL); 846 kill_orphaned_pgrp(tsk->group_leader, NULL);
820 847
821 /* Let father know we died
822 *
823 * Thread signals are configurable, but you aren't going to use
824 * that to send signals to arbitrary processes.
825 * That stops right now.
826 *
827 * If the parent exec id doesn't match the exec id we saved
828 * when we started then we know the parent has changed security
829 * domain.
830 *
831 * If our self_exec id doesn't match our parent_exec_id then
832 * we have changed execution domain as these two values started
833 * the same after a fork.
834 */
835 if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
836 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
837 tsk->self_exec_id != tsk->parent_exec_id))
838 tsk->exit_signal = SIGCHLD;
839
840 if (unlikely(tsk->ptrace)) { 848 if (unlikely(tsk->ptrace)) {
841 int sig = thread_group_leader(tsk) && 849 int sig = thread_group_leader(tsk) &&
842 thread_group_empty(tsk) && 850 thread_group_empty(tsk) &&
@@ -935,8 +943,6 @@ void do_exit(long code)
935 schedule(); 943 schedule();
936 } 944 }
937 945
938 exit_irq_thread();
939
940 exit_signals(tsk); /* sets PF_EXITING */ 946 exit_signals(tsk); /* sets PF_EXITING */
941 /* 947 /*
942 * tsk->flags are checked in the futex code to protect against 948 * tsk->flags are checked in the futex code to protect against
@@ -945,6 +951,8 @@ void do_exit(long code)
945 smp_mb(); 951 smp_mb();
946 raw_spin_unlock_wait(&tsk->pi_lock); 952 raw_spin_unlock_wait(&tsk->pi_lock);
947 953
954 exit_irq_thread();
955
948 if (unlikely(in_atomic())) 956 if (unlikely(in_atomic()))
949 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 957 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
950 current->comm, task_pid_nr(current), 958 current->comm, task_pid_nr(current),
@@ -953,7 +961,7 @@ void do_exit(long code)
953 acct_update_integrals(tsk); 961 acct_update_integrals(tsk);
954 /* sync mm's RSS info before statistics gathering */ 962 /* sync mm's RSS info before statistics gathering */
955 if (tsk->mm) 963 if (tsk->mm)
956 sync_mm_rss(tsk, tsk->mm); 964 sync_mm_rss(tsk->mm);
957 group_dead = atomic_dec_and_test(&tsk->signal->live); 965 group_dead = atomic_dec_and_test(&tsk->signal->live);
958 if (group_dead) { 966 if (group_dead) {
959 hrtimer_cancel(&tsk->signal->real_timer); 967 hrtimer_cancel(&tsk->signal->real_timer);
diff --git a/kernel/fork.c b/kernel/fork.c
index a1b632713e43..08eb8584e2a8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
66#include <linux/user-return-notifier.h> 66#include <linux/user-return-notifier.h>
67#include <linux/oom.h> 67#include <linux/oom.h>
68#include <linux/khugepaged.h> 68#include <linux/khugepaged.h>
69#include <linux/signalfd.h>
69 70
70#include <asm/pgtable.h> 71#include <asm/pgtable.h>
71#include <asm/pgalloc.h> 72#include <asm/pgalloc.h>
@@ -192,6 +193,7 @@ void __put_task_struct(struct task_struct *tsk)
192 WARN_ON(atomic_read(&tsk->usage)); 193 WARN_ON(atomic_read(&tsk->usage));
193 WARN_ON(tsk == current); 194 WARN_ON(tsk == current);
194 195
196 security_task_free(tsk);
195 exit_creds(tsk); 197 exit_creds(tsk);
196 delayacct_tsk_free(tsk); 198 delayacct_tsk_free(tsk);
197 put_signal_struct(tsk->signal); 199 put_signal_struct(tsk->signal);
@@ -354,7 +356,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
354 charge = 0; 356 charge = 0;
355 if (mpnt->vm_flags & VM_ACCOUNT) { 357 if (mpnt->vm_flags & VM_ACCOUNT) {
356 unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; 358 unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
357 if (security_vm_enough_memory(len)) 359 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
358 goto fail_nomem; 360 goto fail_nomem;
359 charge = len; 361 charge = len;
360 } 362 }
@@ -510,6 +512,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
510 return NULL; 512 return NULL;
511} 513}
512 514
515static void check_mm(struct mm_struct *mm)
516{
517 int i;
518
519 for (i = 0; i < NR_MM_COUNTERS; i++) {
520 long x = atomic_long_read(&mm->rss_stat.count[i]);
521
522 if (unlikely(x))
523 printk(KERN_ALERT "BUG: Bad rss-counter state "
524 "mm:%p idx:%d val:%ld\n", mm, i, x);
525 }
526
527#ifdef CONFIG_TRANSPARENT_HUGEPAGE
528 VM_BUG_ON(mm->pmd_huge_pte);
529#endif
530}
531
513/* 532/*
514 * Allocate and initialize an mm_struct. 533 * Allocate and initialize an mm_struct.
515 */ 534 */
@@ -537,9 +556,7 @@ void __mmdrop(struct mm_struct *mm)
537 mm_free_pgd(mm); 556 mm_free_pgd(mm);
538 destroy_context(mm); 557 destroy_context(mm);
539 mmu_notifier_mm_destroy(mm); 558 mmu_notifier_mm_destroy(mm);
540#ifdef CONFIG_TRANSPARENT_HUGEPAGE 559 check_mm(mm);
541 VM_BUG_ON(mm->pmd_huge_pte);
542#endif
543 free_mm(mm); 560 free_mm(mm);
544} 561}
545EXPORT_SYMBOL_GPL(__mmdrop); 562EXPORT_SYMBOL_GPL(__mmdrop);
@@ -667,6 +684,38 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
667 return mm; 684 return mm;
668} 685}
669 686
687static void complete_vfork_done(struct task_struct *tsk)
688{
689 struct completion *vfork;
690
691 task_lock(tsk);
692 vfork = tsk->vfork_done;
693 if (likely(vfork)) {
694 tsk->vfork_done = NULL;
695 complete(vfork);
696 }
697 task_unlock(tsk);
698}
699
700static int wait_for_vfork_done(struct task_struct *child,
701 struct completion *vfork)
702{
703 int killed;
704
705 freezer_do_not_count();
706 killed = wait_for_completion_killable(vfork);
707 freezer_count();
708
709 if (killed) {
710 task_lock(child);
711 child->vfork_done = NULL;
712 task_unlock(child);
713 }
714
715 put_task_struct(child);
716 return killed;
717}
718
670/* Please note the differences between mmput and mm_release. 719/* Please note the differences between mmput and mm_release.
671 * mmput is called whenever we stop holding onto a mm_struct, 720 * mmput is called whenever we stop holding onto a mm_struct,
672 * error success whatever. 721 * error success whatever.
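complete_vfork_done()/wait_for_vfork_done() replace the open-coded completion so that a fatal signal can abort the parent's wait and detach vfork_done under task_lock() before the child ever completes it; the PTRACE_EVENT_VFORK_DONE notification is skipped in that case (see the do_fork() hunk below). The user-visible contract being preserved is the classic one (error handling trimmed):

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = vfork();

	if (pid == 0) {
		/* child: borrows the parent's mm, so it may only exec or _exit */
		execlp("true", "true", (char *)NULL);
		_exit(127);
	}

	/* parent: was blocked until the child exec'd or exited (vfork_done) */
	waitpid(pid, NULL, 0);
	puts("child released the address space");
	return 0;
}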
@@ -682,8 +731,6 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
682 */ 731 */
683void mm_release(struct task_struct *tsk, struct mm_struct *mm) 732void mm_release(struct task_struct *tsk, struct mm_struct *mm)
684{ 733{
685 struct completion *vfork_done = tsk->vfork_done;
686
687 /* Get rid of any futexes when releasing the mm */ 734 /* Get rid of any futexes when releasing the mm */
688#ifdef CONFIG_FUTEX 735#ifdef CONFIG_FUTEX
689 if (unlikely(tsk->robust_list)) { 736 if (unlikely(tsk->robust_list)) {
@@ -703,17 +750,15 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
703 /* Get rid of any cached register state */ 750 /* Get rid of any cached register state */
704 deactivate_mm(tsk, mm); 751 deactivate_mm(tsk, mm);
705 752
706 /* notify parent sleeping on vfork() */ 753 if (tsk->vfork_done)
707 if (vfork_done) { 754 complete_vfork_done(tsk);
708 tsk->vfork_done = NULL;
709 complete(vfork_done);
710 }
711 755
712 /* 756 /*
713 * If we're exiting normally, clear a user-space tid field if 757 * If we're exiting normally, clear a user-space tid field if
714 * requested. We leave this alone when dying by signal, to leave 758 * requested. We leave this alone when dying by signal, to leave
715 * the value intact in a core dump, and to save the unnecessary 759 * the value intact in a core dump, and to save the unnecessary
716 * trouble otherwise. Userland only wants this done for a sys_exit. 760 * trouble, say, a killed vfork parent shouldn't touch this mm.
761 * Userland only wants this done for a sys_exit.
717 */ 762 */
718 if (tsk->clear_child_tid) { 763 if (tsk->clear_child_tid) {
719 if (!(tsk->flags & PF_SIGNALED) && 764 if (!(tsk->flags & PF_SIGNALED) &&
@@ -934,8 +979,10 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
934 979
935void __cleanup_sighand(struct sighand_struct *sighand) 980void __cleanup_sighand(struct sighand_struct *sighand)
936{ 981{
937 if (atomic_dec_and_test(&sighand->count)) 982 if (atomic_dec_and_test(&sighand->count)) {
983 signalfd_cleanup(sighand);
938 kmem_cache_free(sighand_cachep, sighand); 984 kmem_cache_free(sighand_cachep, sighand);
985 }
939} 986}
940 987
941 988
@@ -1003,6 +1050,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1003 sig->oom_score_adj = current->signal->oom_score_adj; 1050 sig->oom_score_adj = current->signal->oom_score_adj;
1004 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 1051 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1005 1052
1053 sig->has_child_subreaper = current->signal->has_child_subreaper ||
1054 current->signal->is_child_subreaper;
1055
1006 mutex_init(&sig->cred_guard_mutex); 1056 mutex_init(&sig->cred_guard_mutex);
1007 1057
1008 return 0; 1058 return 0;
@@ -1014,7 +1064,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
1014 1064
1015 new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); 1065 new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
1016 new_flags |= PF_FORKNOEXEC; 1066 new_flags |= PF_FORKNOEXEC;
1017 new_flags |= PF_STARTING;
1018 p->flags = new_flags; 1067 p->flags = new_flags;
1019} 1068}
1020 1069
@@ -1191,6 +1240,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1191#ifdef CONFIG_CPUSETS 1240#ifdef CONFIG_CPUSETS
1192 p->cpuset_mem_spread_rotor = NUMA_NO_NODE; 1241 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
1193 p->cpuset_slab_spread_rotor = NUMA_NO_NODE; 1242 p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
1243 seqcount_init(&p->mems_allowed_seq);
1194#endif 1244#endif
1195#ifdef CONFIG_TRACE_IRQFLAGS 1245#ifdef CONFIG_TRACE_IRQFLAGS
1196 p->irq_events = 0; 1246 p->irq_events = 0;
@@ -1309,7 +1359,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1309 clear_all_latency_tracing(p); 1359 clear_all_latency_tracing(p);
1310 1360
1311 /* ok, now we should be set up.. */ 1361 /* ok, now we should be set up.. */
1312 p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); 1362 if (clone_flags & CLONE_THREAD)
1363 p->exit_signal = -1;
1364 else if (clone_flags & CLONE_PARENT)
1365 p->exit_signal = current->group_leader->exit_signal;
1366 else
1367 p->exit_signal = (clone_flags & CSIGNAL);
1368
1313 p->pdeath_signal = 0; 1369 p->pdeath_signal = 0;
1314 p->exit_state = 0; 1370 p->exit_state = 0;
1315 1371
@@ -1544,16 +1600,9 @@ long do_fork(unsigned long clone_flags,
1544 if (clone_flags & CLONE_VFORK) { 1600 if (clone_flags & CLONE_VFORK) {
1545 p->vfork_done = &vfork; 1601 p->vfork_done = &vfork;
1546 init_completion(&vfork); 1602 init_completion(&vfork);
1603 get_task_struct(p);
1547 } 1604 }
1548 1605
1549 /*
1550 * We set PF_STARTING at creation in case tracing wants to
1551 * use this to distinguish a fully live task from one that
1552 * hasn't finished SIGSTOP raising yet. Now we clear it
1553 * and set the child going.
1554 */
1555 p->flags &= ~PF_STARTING;
1556
1557 wake_up_new_task(p); 1606 wake_up_new_task(p);
1558 1607
1559 /* forking complete and child started to run, tell ptracer */ 1608 /* forking complete and child started to run, tell ptracer */
@@ -1561,10 +1610,8 @@ long do_fork(unsigned long clone_flags,
1561 ptrace_event(trace, nr); 1610 ptrace_event(trace, nr);
1562 1611
1563 if (clone_flags & CLONE_VFORK) { 1612 if (clone_flags & CLONE_VFORK) {
1564 freezer_do_not_count(); 1613 if (!wait_for_vfork_done(p, &vfork))
1565 wait_for_completion(&vfork); 1614 ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
1566 freezer_count();
1567 ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
1568 } 1615 }
1569 } else { 1616 } else {
1570 nr = PTR_ERR(p); 1617 nr = PTR_ERR(p);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 9815b8d1eed5..11f82a4d4eae 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -99,9 +99,9 @@ static void fake_signal_wake_up(struct task_struct *p)
99 * freeze_task - send a freeze request to given task 99 * freeze_task - send a freeze request to given task
100 * @p: task to send the request to 100 * @p: task to send the request to
101 * 101 *
102 * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE 102 * If @p is freezing, the freeze request is sent either by sending a fake
103 * flag and either sending a fake signal to it or waking it up, depending 103 * signal (if it's not a kernel thread) or waking it up (if it's a kernel
104 * on whether it has %PF_FREEZER_NOSIG set. 104 * thread).
105 * 105 *
106 * RETURNS: 106 * RETURNS:
107 * %false, if @p is not freezing or already frozen; %true, otherwise 107 * %false, if @p is not freezing or already frozen; %true, otherwise
diff --git a/kernel/futex.c b/kernel/futex.c
index 1614be20173d..e2b0fb9a0b3b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -59,6 +59,7 @@
59#include <linux/magic.h> 59#include <linux/magic.h>
60#include <linux/pid.h> 60#include <linux/pid.h>
61#include <linux/nsproxy.h> 61#include <linux/nsproxy.h>
62#include <linux/ptrace.h>
62 63
63#include <asm/futex.h> 64#include <asm/futex.h>
64 65
@@ -2443,40 +2444,31 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
2443{ 2444{
2444 struct robust_list_head __user *head; 2445 struct robust_list_head __user *head;
2445 unsigned long ret; 2446 unsigned long ret;
2446 const struct cred *cred = current_cred(), *pcred; 2447 struct task_struct *p;
2447 2448
2448 if (!futex_cmpxchg_enabled) 2449 if (!futex_cmpxchg_enabled)
2449 return -ENOSYS; 2450 return -ENOSYS;
2450 2451
2452 WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
2453
2454 rcu_read_lock();
2455
2456 ret = -ESRCH;
2451 if (!pid) 2457 if (!pid)
2452 head = current->robust_list; 2458 p = current;
2453 else { 2459 else {
2454 struct task_struct *p;
2455
2456 ret = -ESRCH;
2457 rcu_read_lock();
2458 p = find_task_by_vpid(pid); 2460 p = find_task_by_vpid(pid);
2459 if (!p) 2461 if (!p)
2460 goto err_unlock; 2462 goto err_unlock;
2461 ret = -EPERM;
2462 pcred = __task_cred(p);
2463 /* If victim is in different user_ns, then uids are not
2464 comparable, so we must have CAP_SYS_PTRACE */
2465 if (cred->user->user_ns != pcred->user->user_ns) {
2466 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2467 goto err_unlock;
2468 goto ok;
2469 }
2470 /* If victim is in same user_ns, then uids are comparable */
2471 if (cred->euid != pcred->euid &&
2472 cred->euid != pcred->uid &&
2473 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2474 goto err_unlock;
2475ok:
2476 head = p->robust_list;
2477 rcu_read_unlock();
2478 } 2463 }
2479 2464
2465 ret = -EPERM;
2466 if (!ptrace_may_access(p, PTRACE_MODE_READ))
2467 goto err_unlock;
2468
2469 head = p->robust_list;
2470 rcu_read_unlock();
2471
2480 if (put_user(sizeof(*head), len_ptr)) 2472 if (put_user(sizeof(*head), len_ptr))
2481 return -EFAULT; 2473 return -EFAULT;
2482 return put_user(head, head_ptr); 2474 return put_user(head, head_ptr);
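Replacing the open-coded euid/user_ns comparison with ptrace_may_access(p, PTRACE_MODE_READ) makes sys_get_robust_list apply the same policy as the other "read another task's state" interfaces, and the WARN_ONCE flags the call as deprecated. From user space the visible effect is simply EPERM when the target is not ptrace-accessible; a hedged probe, assuming the toolchain headers expose SYS_get_robust_list:

#include <errno.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* 0 if we may read @pid's robust futex list head, -1 otherwise */
static int probe_robust_list(pid_t pid)
{
	void *head;
	size_t len;

	if (syscall(SYS_get_robust_list, pid, &head, &len) == 0)
		return 0;
	if (errno == EPERM)
		fprintf(stderr, "pid %d: not ptrace-accessible\n", (int)pid);
	return -1;
}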
@@ -2628,7 +2620,7 @@ void exit_robust_list(struct task_struct *curr)
2628long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 2620long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2629 u32 __user *uaddr2, u32 val2, u32 val3) 2621 u32 __user *uaddr2, u32 val2, u32 val3)
2630{ 2622{
2631 int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK; 2623 int cmd = op & FUTEX_CMD_MASK;
2632 unsigned int flags = 0; 2624 unsigned int flags = 0;
2633 2625
2634 if (!(op & FUTEX_PRIVATE_FLAG)) 2626 if (!(op & FUTEX_PRIVATE_FLAG))
@@ -2641,49 +2633,44 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2641 } 2633 }
2642 2634
2643 switch (cmd) { 2635 switch (cmd) {
2636 case FUTEX_LOCK_PI:
2637 case FUTEX_UNLOCK_PI:
2638 case FUTEX_TRYLOCK_PI:
2639 case FUTEX_WAIT_REQUEUE_PI:
2640 case FUTEX_CMP_REQUEUE_PI:
2641 if (!futex_cmpxchg_enabled)
2642 return -ENOSYS;
2643 }
2644
2645 switch (cmd) {
2644 case FUTEX_WAIT: 2646 case FUTEX_WAIT:
2645 val3 = FUTEX_BITSET_MATCH_ANY; 2647 val3 = FUTEX_BITSET_MATCH_ANY;
2646 case FUTEX_WAIT_BITSET: 2648 case FUTEX_WAIT_BITSET:
2647 ret = futex_wait(uaddr, flags, val, timeout, val3); 2649 return futex_wait(uaddr, flags, val, timeout, val3);
2648 break;
2649 case FUTEX_WAKE: 2650 case FUTEX_WAKE:
2650 val3 = FUTEX_BITSET_MATCH_ANY; 2651 val3 = FUTEX_BITSET_MATCH_ANY;
2651 case FUTEX_WAKE_BITSET: 2652 case FUTEX_WAKE_BITSET:
2652 ret = futex_wake(uaddr, flags, val, val3); 2653 return futex_wake(uaddr, flags, val, val3);
2653 break;
2654 case FUTEX_REQUEUE: 2654 case FUTEX_REQUEUE:
2655 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); 2655 return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2656 break;
2657 case FUTEX_CMP_REQUEUE: 2656 case FUTEX_CMP_REQUEUE:
2658 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); 2657 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2659 break;
2660 case FUTEX_WAKE_OP: 2658 case FUTEX_WAKE_OP:
2661 ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); 2659 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2662 break;
2663 case FUTEX_LOCK_PI: 2660 case FUTEX_LOCK_PI:
2664 if (futex_cmpxchg_enabled) 2661 return futex_lock_pi(uaddr, flags, val, timeout, 0);
2665 ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
2666 break;
2667 case FUTEX_UNLOCK_PI: 2662 case FUTEX_UNLOCK_PI:
2668 if (futex_cmpxchg_enabled) 2663 return futex_unlock_pi(uaddr, flags);
2669 ret = futex_unlock_pi(uaddr, flags);
2670 break;
2671 case FUTEX_TRYLOCK_PI: 2664 case FUTEX_TRYLOCK_PI:
2672 if (futex_cmpxchg_enabled) 2665 return futex_lock_pi(uaddr, flags, 0, timeout, 1);
2673 ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
2674 break;
2675 case FUTEX_WAIT_REQUEUE_PI: 2666 case FUTEX_WAIT_REQUEUE_PI:
2676 val3 = FUTEX_BITSET_MATCH_ANY; 2667 val3 = FUTEX_BITSET_MATCH_ANY;
2677 ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, 2668 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2678 uaddr2); 2669 uaddr2);
2679 break;
2680 case FUTEX_CMP_REQUEUE_PI: 2670 case FUTEX_CMP_REQUEUE_PI:
2681 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); 2671 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2682 break;
2683 default:
2684 ret = -ENOSYS;
2685 } 2672 }
2686 return ret; 2673 return -ENOSYS;
2687} 2674}
2688 2675
2689 2676
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 5f9e689dc8f0..83e368b005fc 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -10,6 +10,7 @@
10#include <linux/compat.h> 10#include <linux/compat.h>
11#include <linux/nsproxy.h> 11#include <linux/nsproxy.h>
12#include <linux/futex.h> 12#include <linux/futex.h>
13#include <linux/ptrace.h>
13 14
14#include <asm/uaccess.h> 15#include <asm/uaccess.h>
15 16
@@ -136,40 +137,31 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
136{ 137{
137 struct compat_robust_list_head __user *head; 138 struct compat_robust_list_head __user *head;
138 unsigned long ret; 139 unsigned long ret;
139 const struct cred *cred = current_cred(), *pcred; 140 struct task_struct *p;
140 141
141 if (!futex_cmpxchg_enabled) 142 if (!futex_cmpxchg_enabled)
142 return -ENOSYS; 143 return -ENOSYS;
143 144
145 WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
146
147 rcu_read_lock();
148
149 ret = -ESRCH;
144 if (!pid) 150 if (!pid)
145 head = current->compat_robust_list; 151 p = current;
146 else { 152 else {
147 struct task_struct *p;
148
149 ret = -ESRCH;
150 rcu_read_lock();
151 p = find_task_by_vpid(pid); 153 p = find_task_by_vpid(pid);
152 if (!p) 154 if (!p)
153 goto err_unlock; 155 goto err_unlock;
154 ret = -EPERM;
155 pcred = __task_cred(p);
156 /* If victim is in different user_ns, then uids are not
157 comparable, so we must have CAP_SYS_PTRACE */
158 if (cred->user->user_ns != pcred->user->user_ns) {
159 if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
160 goto err_unlock;
161 goto ok;
162 }
163 /* If victim is in same user_ns, then uids are comparable */
164 if (cred->euid != pcred->euid &&
165 cred->euid != pcred->uid &&
166 !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
167 goto err_unlock;
168ok:
169 head = p->compat_robust_list;
170 rcu_read_unlock();
171 } 156 }
172 157
158 ret = -EPERM;
159 if (!ptrace_may_access(p, PTRACE_MODE_READ))
160 goto err_unlock;
161
162 head = p->compat_robust_list;
163 rcu_read_unlock();
164
173 if (put_user(sizeof(*head), len_ptr)) 165 if (put_user(sizeof(*head), len_ptr))
174 return -EFAULT; 166 return -EFAULT;
175 return put_user(ptr_to_compat(head), head_ptr); 167 return put_user(ptr_to_compat(head), head_ptr);
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 2e48ec0c2e91..c21449f85a2a 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -119,15 +119,20 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
119 * For preemptible RCU it is sufficient to call rcu_read_unlock in order 119 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
120 * to exit the grace period. For classic RCU, a reschedule is required. 120 * to exit the grace period. For classic RCU, a reschedule is required.
121 */ 121 */
122static void rcu_lock_break(struct task_struct *g, struct task_struct *t) 122static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)
123{ 123{
124 bool can_cont;
125
124 get_task_struct(g); 126 get_task_struct(g);
125 get_task_struct(t); 127 get_task_struct(t);
126 rcu_read_unlock(); 128 rcu_read_unlock();
127 cond_resched(); 129 cond_resched();
128 rcu_read_lock(); 130 rcu_read_lock();
131 can_cont = pid_alive(g) && pid_alive(t);
129 put_task_struct(t); 132 put_task_struct(t);
130 put_task_struct(g); 133 put_task_struct(g);
134
135 return can_cont;
131} 136}
132 137
133/* 138/*
@@ -154,9 +159,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
154 goto unlock; 159 goto unlock;
155 if (!--batch_count) { 160 if (!--batch_count) {
156 batch_count = HUNG_TASK_BATCHING; 161 batch_count = HUNG_TASK_BATCHING;
157 rcu_lock_break(g, t); 162 if (!rcu_lock_break(g, t))
158 /* Exit if t or g was unhashed during refresh. */
159 if (t->state == TASK_DEAD || g->state == TASK_DEAD)
160 goto unlock; 163 goto unlock;
161 } 164 }
162 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ 165 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
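rcu_lock_break() now tells its caller whether both tasks survived the rcu_read_unlock()/rcu_read_lock() window, using pid_alive() instead of the weaker TASK_DEAD state check at the call site. The general "drop the lock, revalidate before continuing" shape, sketched with a mutex standing in for RCU:

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>

/* briefly release @lock so others can make progress, then report whether the
 * objects the caller was iterating are still valid */
static bool lock_break(pthread_mutex_t *lock, bool (*still_valid)(void))
{
	pthread_mutex_unlock(lock);
	sched_yield();			/* rough analogue of cond_resched() */
	pthread_mutex_lock(lock);
	return still_valid();		/* caller bails out if this returns false */
}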
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 5a38bf4de641..cf1a4a68ce44 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -13,7 +13,7 @@ config GENERIC_HARDIRQS
13# Options selectable by the architecture code 13# Options selectable by the architecture code
14 14
15# Make sparse irq Kconfig switch below available 15# Make sparse irq Kconfig switch below available
16config HAVE_SPARSE_IRQ 16config MAY_HAVE_SPARSE_IRQ
17 bool 17 bool
18 18
19# Enable the generic irq autoprobe mechanism 19# Enable the generic irq autoprobe mechanism
@@ -56,13 +56,22 @@ config GENERIC_IRQ_CHIP
56config IRQ_DOMAIN 56config IRQ_DOMAIN
57 bool 57 bool
58 58
59config IRQ_DOMAIN_DEBUG
60 bool "Expose hardware/virtual IRQ mapping via debugfs"
61 depends on IRQ_DOMAIN && DEBUG_FS
62 help
63 This option will show the mapping relationship between hardware irq
64 numbers and Linux irq numbers. The mapping is exposed via debugfs
65 in the file "virq_mapping".
66
67 If you don't know what this means you don't need it.
68
59# Support forced irq threading 69# Support forced irq threading
60config IRQ_FORCED_THREADING 70config IRQ_FORCED_THREADING
61 bool 71 bool
62 72
63config SPARSE_IRQ 73config SPARSE_IRQ
64 bool "Support sparse irq numbering" 74 bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ
65 depends on HAVE_SPARSE_IRQ
66 ---help--- 75 ---help---
67 76
68 Sparse irq numbering is useful for distro kernels that want 77 Sparse irq numbering is useful for distro kernels that want
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 342d8f44e401..0119b9d467ae 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -53,7 +53,7 @@ unsigned long probe_irq_on(void)
53 if (desc->irq_data.chip->irq_set_type) 53 if (desc->irq_data.chip->irq_set_type)
54 desc->irq_data.chip->irq_set_type(&desc->irq_data, 54 desc->irq_data.chip->irq_set_type(&desc->irq_data,
55 IRQ_TYPE_PROBE); 55 IRQ_TYPE_PROBE);
56 irq_startup(desc); 56 irq_startup(desc, false);
57 } 57 }
58 raw_spin_unlock_irq(&desc->lock); 58 raw_spin_unlock_irq(&desc->lock);
59 } 59 }
@@ -70,7 +70,7 @@ unsigned long probe_irq_on(void)
70 raw_spin_lock_irq(&desc->lock); 70 raw_spin_lock_irq(&desc->lock);
71 if (!desc->action && irq_settings_can_probe(desc)) { 71 if (!desc->action && irq_settings_can_probe(desc)) {
72 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; 72 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
73 if (irq_startup(desc)) 73 if (irq_startup(desc, false))
74 desc->istate |= IRQS_PENDING; 74 desc->istate |= IRQS_PENDING;
75 } 75 }
76 raw_spin_unlock_irq(&desc->lock); 76 raw_spin_unlock_irq(&desc->lock);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f7c543a801d9..6080f6bc8c33 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -16,6 +16,8 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18 18
19#include <trace/events/irq.h>
20
19#include "internals.h" 21#include "internals.h"
20 22
21/** 23/**
@@ -61,8 +63,7 @@ int irq_set_irq_type(unsigned int irq, unsigned int type)
61 return -EINVAL; 63 return -EINVAL;
62 64
63 type &= IRQ_TYPE_SENSE_MASK; 65 type &= IRQ_TYPE_SENSE_MASK;
64 if (type != IRQ_TYPE_NONE) 66 ret = __irq_set_trigger(desc, irq, type);
65 ret = __irq_set_trigger(desc, irq, type);
66 irq_put_desc_busunlock(desc, flags); 67 irq_put_desc_busunlock(desc, flags);
67 return ret; 68 return ret;
68} 69}
@@ -157,19 +158,22 @@ static void irq_state_set_masked(struct irq_desc *desc)
157 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); 158 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
158} 159}
159 160
160int irq_startup(struct irq_desc *desc) 161int irq_startup(struct irq_desc *desc, bool resend)
161{ 162{
163 int ret = 0;
164
162 irq_state_clr_disabled(desc); 165 irq_state_clr_disabled(desc);
163 desc->depth = 0; 166 desc->depth = 0;
164 167
165 if (desc->irq_data.chip->irq_startup) { 168 if (desc->irq_data.chip->irq_startup) {
166 int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); 169 ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
167 irq_state_clr_masked(desc); 170 irq_state_clr_masked(desc);
168 return ret; 171 } else {
172 irq_enable(desc);
169 } 173 }
170 174 if (resend)
171 irq_enable(desc); 175 check_irq_resend(desc, desc->irq_data.irq);
172 return 0; 176 return ret;
173} 177}
174 178
175void irq_shutdown(struct irq_desc *desc) 179void irq_shutdown(struct irq_desc *desc)
@@ -330,6 +334,24 @@ out_unlock:
330} 334}
331EXPORT_SYMBOL_GPL(handle_simple_irq); 335EXPORT_SYMBOL_GPL(handle_simple_irq);
332 336
337/*
338 * Called unconditionally from handle_level_irq() and only for oneshot
339 * interrupts from handle_fasteoi_irq()
340 */
341static void cond_unmask_irq(struct irq_desc *desc)
342{
343 /*
344 * We need to unmask in the following cases:
345 * - Standard level irq (IRQF_ONESHOT is not set)
346 * - Oneshot irq which did not wake the thread (caused by a
347 * spurious interrupt or a primary handler handling it
348 * completely).
349 */
350 if (!irqd_irq_disabled(&desc->irq_data) &&
351 irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot)
352 unmask_irq(desc);
353}
354
333/** 355/**
334 * handle_level_irq - Level type irq handler 356 * handle_level_irq - Level type irq handler
335 * @irq: the interrupt number 357 * @irq: the interrupt number
@@ -362,8 +384,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
362 384
363 handle_irq_event(desc); 385 handle_irq_event(desc);
364 386
365 if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT)) 387 cond_unmask_irq(desc);
366 unmask_irq(desc); 388
367out_unlock: 389out_unlock:
368 raw_spin_unlock(&desc->lock); 390 raw_spin_unlock(&desc->lock);
369} 391}
@@ -417,6 +439,9 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
417 preflow_handler(desc); 439 preflow_handler(desc);
418 handle_irq_event(desc); 440 handle_irq_event(desc);
419 441
442 if (desc->istate & IRQS_ONESHOT)
443 cond_unmask_irq(desc);
444
420out_eoi: 445out_eoi:
421 desc->irq_data.chip->irq_eoi(&desc->irq_data); 446 desc->irq_data.chip->irq_eoi(&desc->irq_data);
422out_unlock: 447out_unlock:
@@ -625,7 +650,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
625 irq_settings_set_noprobe(desc); 650 irq_settings_set_noprobe(desc);
626 irq_settings_set_norequest(desc); 651 irq_settings_set_norequest(desc);
627 irq_settings_set_nothread(desc); 652 irq_settings_set_nothread(desc);
628 irq_startup(desc); 653 irq_startup(desc, true);
629 } 654 }
630out: 655out:
631 irq_put_desc_busunlock(desc, flags); 656 irq_put_desc_busunlock(desc, flags);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 470d08c82bbe..bdb180325551 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -54,14 +54,18 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
54static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) 54static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
55{ 55{
56 /* 56 /*
57 * Wake up the handler thread for this action. In case the 57 * In case the thread crashed and was killed we just pretend that
58 * thread crashed and was killed we just pretend that we 58 * we handled the interrupt. The hardirq handler has disabled the
59 * handled the interrupt. The hardirq handler has disabled the 59 * device interrupt, so no irq storm is lurking.
60 * device interrupt, so no irq storm is lurking. If the 60 */
61 if (action->thread->flags & PF_EXITING)
62 return;
63
64 /*
65 * Wake up the handler thread for this action. If the
61 * RUNTHREAD bit is already set, nothing to do. 66 * RUNTHREAD bit is already set, nothing to do.
62 */ 67 */
63 if (test_bit(IRQTF_DIED, &action->thread_flags) || 68 if (test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
64 test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
65 return; 69 return;
66 70
67 /* 71 /*
@@ -110,6 +114,18 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
110 * threads_oneshot untouched and runs the thread another time. 114 * threads_oneshot untouched and runs the thread another time.
111 */ 115 */
112 desc->threads_oneshot |= action->thread_mask; 116 desc->threads_oneshot |= action->thread_mask;
117
118 /*
119 * We increment the threads_active counter in case we wake up
120 * the irq thread. The irq thread decrements the counter when
121 * it returns from the handler or in the exit path and wakes
122 * up waiters which are stuck in synchronize_irq() when the
123 * active count becomes zero. synchronize_irq() is serialized
124 * against this code (hard irq handler) via IRQS_INPROGRESS
125 * like the finalize_oneshot() code. See comment above.
126 */
127 atomic_inc(&desc->threads_active);
128
113 wake_up_process(action->thread); 129 wake_up_process(action->thread);
114} 130}
115 131
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b7952316016a..8e5c56b3b7d9 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -20,14 +20,12 @@ extern bool noirqdebug;
20/* 20/*
21 * Bits used by threaded handlers: 21 * Bits used by threaded handlers:
22 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run 22 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
23 * IRQTF_DIED - handler thread died
24 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed 23 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
25 * IRQTF_AFFINITY - irq thread is requested to adjust affinity 24 * IRQTF_AFFINITY - irq thread is requested to adjust affinity
26 * IRQTF_FORCED_THREAD - irq action is force threaded 25 * IRQTF_FORCED_THREAD - irq action is force threaded
27 */ 26 */
28enum { 27enum {
29 IRQTF_RUNTHREAD, 28 IRQTF_RUNTHREAD,
30 IRQTF_DIED,
31 IRQTF_WARNED, 29 IRQTF_WARNED,
32 IRQTF_AFFINITY, 30 IRQTF_AFFINITY,
33 IRQTF_FORCED_THREAD, 31 IRQTF_FORCED_THREAD,
@@ -67,7 +65,7 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
67extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 65extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
68extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 66extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
69 67
70extern int irq_startup(struct irq_desc *desc); 68extern int irq_startup(struct irq_desc *desc, bool resend);
71extern void irq_shutdown(struct irq_desc *desc); 69extern void irq_shutdown(struct irq_desc *desc);
72extern void irq_enable(struct irq_desc *desc); 70extern void irq_enable(struct irq_desc *desc);
73extern void irq_disable(struct irq_desc *desc); 71extern void irq_disable(struct irq_desc *desc);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 1f9e26526b69..3601f3fbf67c 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1,189 +1,793 @@
1#include <linux/debugfs.h>
2#include <linux/hardirq.h>
3#include <linux/interrupt.h>
1#include <linux/irq.h> 4#include <linux/irq.h>
5#include <linux/irqdesc.h>
2#include <linux/irqdomain.h> 6#include <linux/irqdomain.h>
3#include <linux/module.h> 7#include <linux/module.h>
4#include <linux/mutex.h> 8#include <linux/mutex.h>
5#include <linux/of.h> 9#include <linux/of.h>
6#include <linux/of_address.h> 10#include <linux/of_address.h>
11#include <linux/seq_file.h>
7#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/smp.h>
14#include <linux/fs.h>
15
16#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs.
17 * ie. legacy 8259, gets irqs 1..15 */
18#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */
19#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */
20#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */
8 21
9static LIST_HEAD(irq_domain_list); 22static LIST_HEAD(irq_domain_list);
10static DEFINE_MUTEX(irq_domain_mutex); 23static DEFINE_MUTEX(irq_domain_mutex);
11 24
25static DEFINE_MUTEX(revmap_trees_mutex);
26static unsigned int irq_virq_count = NR_IRQS;
27static struct irq_domain *irq_default_domain;
28
12/** 29/**
13 * irq_domain_add() - Register an irq_domain 30 * irq_domain_alloc() - Allocate a new irq_domain data structure
14 * @domain: ptr to initialized irq_domain structure 31 * @of_node: optional device-tree node of the interrupt controller
32 * @revmap_type: type of reverse mapping to use
33 * @ops: map/unmap domain callbacks
34 * @host_data: Controller private data pointer
15 * 35 *
 16 * Registers an irq_domain structure. The irq_domain must at a minimum be 36 * Allocates and initializes an irq_domain structure. Caller is expected to
17 * initialized with an ops structure pointer, and either a ->to_irq hook or 37 * register allocated irq_domain with irq_domain_register(). Returns pointer
18 * a valid irq_base value. Everything else is optional. 38 * to IRQ domain, or NULL on failure.
19 */ 39 */
20void irq_domain_add(struct irq_domain *domain) 40static struct irq_domain *irq_domain_alloc(struct device_node *of_node,
41 unsigned int revmap_type,
42 const struct irq_domain_ops *ops,
43 void *host_data)
21{ 44{
22 struct irq_data *d; 45 struct irq_domain *domain;
23 int hwirq, irq;
24 46
25 /* 47 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
26 * This assumes that the irq_domain owner has already allocated 48 if (WARN_ON(!domain))
27 * the irq_descs. This block will be removed when support for dynamic 49 return NULL;
28 * allocation of irq_descs is added to irq_domain. 50
29 */ 51 /* Fill structure */
30 irq_domain_for_each_irq(domain, hwirq, irq) { 52 domain->revmap_type = revmap_type;
31 d = irq_get_irq_data(irq); 53 domain->ops = ops;
32 if (!d) { 54 domain->host_data = host_data;
33 WARN(1, "error: assigning domain to non existant irq_desc"); 55 domain->of_node = of_node_get(of_node);
34 return; 56
35 } 57 return domain;
36 if (d->domain) { 58}
37 /* things are broken; just report, don't clean up */ 59
38 WARN(1, "error: irq_desc already assigned to a domain"); 60static void irq_domain_add(struct irq_domain *domain)
39 return; 61{
62 mutex_lock(&irq_domain_mutex);
63 list_add(&domain->link, &irq_domain_list);
64 mutex_unlock(&irq_domain_mutex);
65 pr_debug("irq: Allocated domain of type %d @0x%p\n",
66 domain->revmap_type, domain);
67}
68
69static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
70 irq_hw_number_t hwirq)
71{
72 irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq;
73 int size = domain->revmap_data.legacy.size;
74
75 if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size))
76 return 0;
77 return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq;
78}
79
80/**
81 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
82 * @of_node: pointer to interrupt controller's device tree node.
83 * @size: total number of irqs in legacy mapping
 84 * @first_irq: first linux irq number of the block assigned to the domain
85 * @first_hwirq: first hwirq number to use for the translation. Should normally
86 * be '0', but a positive integer can be used if the effective
 87 * hwirq numbering does not begin at zero.
88 * @ops: map/unmap domain callbacks
89 * @host_data: Controller private data pointer
90 *
91 * Note: the map() callback will be called before this function returns
92 * for all legacy interrupts except 0 (which is always the invalid irq for
93 * a legacy controller).
94 */
95struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
96 unsigned int size,
97 unsigned int first_irq,
98 irq_hw_number_t first_hwirq,
99 const struct irq_domain_ops *ops,
100 void *host_data)
101{
102 struct irq_domain *domain;
103 unsigned int i;
104
105 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data);
106 if (!domain)
107 return NULL;
108
109 domain->revmap_data.legacy.first_irq = first_irq;
110 domain->revmap_data.legacy.first_hwirq = first_hwirq;
111 domain->revmap_data.legacy.size = size;
112
113 mutex_lock(&irq_domain_mutex);
114 /* Verify that all the irqs are available */
115 for (i = 0; i < size; i++) {
116 int irq = first_irq + i;
117 struct irq_data *irq_data = irq_get_irq_data(irq);
118
119 if (WARN_ON(!irq_data || irq_data->domain)) {
120 mutex_unlock(&irq_domain_mutex);
121 of_node_put(domain->of_node);
122 kfree(domain);
123 return NULL;
40 } 124 }
41 d->domain = domain;
42 d->hwirq = hwirq;
43 } 125 }
44 126
45 mutex_lock(&irq_domain_mutex); 127 /* Claim all of the irqs before registering a legacy domain */
46 list_add(&domain->list, &irq_domain_list); 128 for (i = 0; i < size; i++) {
129 struct irq_data *irq_data = irq_get_irq_data(first_irq + i);
130 irq_data->hwirq = first_hwirq + i;
131 irq_data->domain = domain;
132 }
47 mutex_unlock(&irq_domain_mutex); 133 mutex_unlock(&irq_domain_mutex);
134
135 for (i = 0; i < size; i++) {
136 int irq = first_irq + i;
137 int hwirq = first_hwirq + i;
138
139 /* IRQ0 gets ignored */
140 if (!irq)
141 continue;
142
143 /* Legacy flags are left to default at this point,
144 * one can then use irq_create_mapping() to
145 * explicitly change them
146 */
147 ops->map(domain, irq, hwirq);
148
149 /* Clear norequest flags */
150 irq_clear_status_flags(irq, IRQ_NOREQUEST);
151 }
152
153 irq_domain_add(domain);
154 return domain;
155}
156
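The legacy registration path above is new in this merge; a hypothetical i8259-style driver adopting it might look like the sketch below. All my_* identifiers are invented; only irq_domain_add_legacy(), irq_domain_xlate_onecell() and the ops layout come from this patch.

#include <linux/init.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static int my_pic_map(struct irq_domain *d, unsigned int virq,
                      irq_hw_number_t hwirq)
{
        /* a real driver would set its chip and flow handler for virq here */
        return 0;
}

static const struct irq_domain_ops my_pic_ops = {
        .map   = my_pic_map,
        .xlate = irq_domain_xlate_onecell,
};

static struct irq_domain *my_pic_domain;

static void __init my_pic_init(struct device_node *np)
{
        /* 16 pre-allocated descs, linux irqs 0..15 <-> hwirqs 0..15;
         * irq 0 is skipped by the core, as the kerneldoc above notes. */
        my_pic_domain = irq_domain_add_legacy(np, 16, 0, 0,
                                              &my_pic_ops, NULL);
        WARN_ON(!my_pic_domain);
}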
157/**
 158 * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
159 * @of_node: pointer to interrupt controller's device tree node.
160 * @ops: map/unmap domain callbacks
161 * @host_data: Controller private data pointer
162 */
163struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
164 unsigned int size,
165 const struct irq_domain_ops *ops,
166 void *host_data)
167{
168 struct irq_domain *domain;
169 unsigned int *revmap;
170
171 revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL);
172 if (WARN_ON(!revmap))
173 return NULL;
174
175 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data);
176 if (!domain) {
177 kfree(revmap);
178 return NULL;
179 }
180 domain->revmap_data.linear.size = size;
181 domain->revmap_data.linear.revmap = revmap;
182 irq_domain_add(domain);
183 return domain;
184}
185
186struct irq_domain *irq_domain_add_nomap(struct device_node *of_node,
187 const struct irq_domain_ops *ops,
188 void *host_data)
189{
190 struct irq_domain *domain = irq_domain_alloc(of_node,
191 IRQ_DOMAIN_MAP_NOMAP, ops, host_data);
192 if (domain)
193 irq_domain_add(domain);
194 return domain;
195}
196
197/**
 198 * irq_domain_add_tree() - Allocate and register a radix tree revmap irq_domain.
199 * @of_node: pointer to interrupt controller's device tree node.
200 * @ops: map/unmap domain callbacks
201 *
202 * Note: The radix tree will be allocated later during boot automatically
203 * (the reverse mapping will use the slow path until that happens).
204 */
205struct irq_domain *irq_domain_add_tree(struct device_node *of_node,
206 const struct irq_domain_ops *ops,
207 void *host_data)
208{
209 struct irq_domain *domain = irq_domain_alloc(of_node,
210 IRQ_DOMAIN_MAP_TREE, ops, host_data);
211 if (domain) {
212 INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL);
213 irq_domain_add(domain);
214 }
215 return domain;
48} 216}
49 217
50/** 218/**
51 * irq_domain_del() - Unregister an irq_domain 219 * irq_find_host() - Locates a domain for a given device node
52 * @domain: ptr to registered irq_domain. 220 * @node: device-tree node of the interrupt controller
53 */ 221 */
54void irq_domain_del(struct irq_domain *domain) 222struct irq_domain *irq_find_host(struct device_node *node)
55{ 223{
56 struct irq_data *d; 224 struct irq_domain *h, *found = NULL;
57 int hwirq, irq; 225 int rc;
58 226
227 /* We might want to match the legacy controller last since
228 * it might potentially be set to match all interrupts in
 229	 * the absence of a device node. This isn't a problem so far,
 230	 * though...
231 */
59 mutex_lock(&irq_domain_mutex); 232 mutex_lock(&irq_domain_mutex);
60 list_del(&domain->list); 233 list_for_each_entry(h, &irq_domain_list, link) {
234 if (h->ops->match)
235 rc = h->ops->match(h, node);
236 else
237 rc = (h->of_node != NULL) && (h->of_node == node);
238
239 if (rc) {
240 found = h;
241 break;
242 }
243 }
61 mutex_unlock(&irq_domain_mutex); 244 mutex_unlock(&irq_domain_mutex);
245 return found;
246}
247EXPORT_SYMBOL_GPL(irq_find_host);
248
249/**
250 * irq_set_default_host() - Set a "default" irq domain
251 * @domain: default domain pointer
252 *
253 * For convenience, it's possible to set a "default" domain that will be used
254 * whenever NULL is passed to irq_create_mapping(). It makes life easier for
255 * platforms that want to manipulate a few hard coded interrupt numbers that
256 * aren't properly represented in the device-tree.
257 */
258void irq_set_default_host(struct irq_domain *domain)
259{
260 pr_debug("irq: Default domain set to @0x%p\n", domain);
261
262 irq_default_domain = domain;
263}
264
265/**
266 * irq_set_virq_count() - Set the maximum number of linux irqs
267 * @count: number of linux irqs, capped with NR_IRQS
268 *
 269 * This is mainly for use by platforms like iSeries that want to program
270 * the virtual irq number in the controller to avoid the reverse mapping
271 */
272void irq_set_virq_count(unsigned int count)
273{
274 pr_debug("irq: Trying to set virq count to %d\n", count);
62 275
63 /* Clear the irq_domain assignments */ 276 BUG_ON(count < NUM_ISA_INTERRUPTS);
64 irq_domain_for_each_irq(domain, hwirq, irq) { 277 if (count < NR_IRQS)
65 d = irq_get_irq_data(irq); 278 irq_virq_count = count;
66 d->domain = NULL; 279}
280
281static int irq_setup_virq(struct irq_domain *domain, unsigned int virq,
282 irq_hw_number_t hwirq)
283{
284 struct irq_data *irq_data = irq_get_irq_data(virq);
285
286 irq_data->hwirq = hwirq;
287 irq_data->domain = domain;
288 if (domain->ops->map(domain, virq, hwirq)) {
289 pr_debug("irq: -> mapping failed, freeing\n");
290 irq_data->domain = NULL;
291 irq_data->hwirq = 0;
292 return -1;
67 } 293 }
294
295 irq_clear_status_flags(virq, IRQ_NOREQUEST);
296
297 return 0;
68} 298}
69 299
70#if defined(CONFIG_OF_IRQ)
71/** 300/**
72 * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec 301 * irq_create_direct_mapping() - Allocate an irq for direct mapping
302 * @domain: domain to allocate the irq for or NULL for default domain
73 * 303 *
74 * Used by the device tree interrupt mapping code to translate a device tree 304 * This routine is used for irq controllers which can choose the hardware
75 * interrupt specifier to a valid linux irq number. Returns either a valid 305 * interrupt numbers they generate. In such a case it's simplest to use
76 * linux IRQ number or 0. 306 * the linux irq as the hardware interrupt number.
307 */
308unsigned int irq_create_direct_mapping(struct irq_domain *domain)
309{
310 unsigned int virq;
311
312 if (domain == NULL)
313 domain = irq_default_domain;
314
315 BUG_ON(domain == NULL);
316 WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP);
317
318 virq = irq_alloc_desc_from(1, 0);
319 if (!virq) {
320 pr_debug("irq: create_direct virq allocation failed\n");
321 return 0;
322 }
323 if (virq >= irq_virq_count) {
324 pr_err("ERROR: no free irqs available below %i maximum\n",
325 irq_virq_count);
326 irq_free_desc(virq);
327 return 0;
328 }
329
330 pr_debug("irq: create_direct obtained virq %d\n", virq);
331
332 if (irq_setup_virq(domain, virq, virq)) {
333 irq_free_desc(virq);
334 return 0;
335 }
336
337 return virq;
338}
339
340/**
341 * irq_create_mapping() - Map a hardware interrupt into linux irq space
342 * @domain: domain owning this hardware interrupt or NULL for default domain
343 * @hwirq: hardware irq number in that domain space
77 * 344 *
78 * When the caller no longer need the irq number returned by this function it 345 * Only one mapping per hardware interrupt is permitted. Returns a linux
79 * should arrange to call irq_dispose_mapping(). 346 * irq number.
 347 * If the sense/trigger is to be specified, irq_set_irq_type() should be called
348 * on the number returned from that call.
80 */ 349 */
350unsigned int irq_create_mapping(struct irq_domain *domain,
351 irq_hw_number_t hwirq)
352{
353 unsigned int virq, hint;
354
355 pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
356
 357	/* Look for default domain if necessary */
358 if (domain == NULL)
359 domain = irq_default_domain;
360 if (domain == NULL) {
361 printk(KERN_WARNING "irq_create_mapping called for"
362 " NULL domain, hwirq=%lx\n", hwirq);
363 WARN_ON(1);
364 return 0;
365 }
366 pr_debug("irq: -> using domain @%p\n", domain);
367
368 /* Check if mapping already exists */
369 virq = irq_find_mapping(domain, hwirq);
370 if (virq) {
371 pr_debug("irq: -> existing mapping on virq %d\n", virq);
372 return virq;
373 }
374
375 /* Get a virtual interrupt number */
376 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
377 return irq_domain_legacy_revmap(domain, hwirq);
378
379 /* Allocate a virtual interrupt number */
380 hint = hwirq % irq_virq_count;
381 if (hint == 0)
382 hint++;
383 virq = irq_alloc_desc_from(hint, 0);
384 if (!virq)
385 virq = irq_alloc_desc_from(1, 0);
386 if (!virq) {
387 pr_debug("irq: -> virq allocation failed\n");
388 return 0;
389 }
390
391 if (irq_setup_virq(domain, virq, hwirq)) {
392 if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY)
393 irq_free_desc(virq);
394 return 0;
395 }
396
397 pr_debug("irq: irq %lu on domain %s mapped to virtual irq %u\n",
398 hwirq, domain->of_node ? domain->of_node->full_name : "null", virq);
399
400 return virq;
401}
402EXPORT_SYMBOL_GPL(irq_create_mapping);
403
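irq_create_mapping() is the consumer-facing half of the API above. A hedged sketch of a child device driver turning a hwirq from its controller's domain into a requestable linux irq; the my_* names are invented:

#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>

static irqreturn_t my_handler(int irq, void *dev_id)
{
        return IRQ_HANDLED;
}

static int my_dev_setup_irq(struct irq_domain *domain,
                            irq_hw_number_t hwirq, void *my_dev)
{
        unsigned int virq = irq_create_mapping(domain, hwirq);

        if (!virq)
                return -ENXIO;

        /* trigger type is set separately, as the kerneldoc above notes */
        irq_set_irq_type(virq, IRQ_TYPE_LEVEL_HIGH);

        return request_irq(virq, my_handler, 0, "my_dev", my_dev);
}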
81unsigned int irq_create_of_mapping(struct device_node *controller, 404unsigned int irq_create_of_mapping(struct device_node *controller,
82 const u32 *intspec, unsigned int intsize) 405 const u32 *intspec, unsigned int intsize)
83{ 406{
84 struct irq_domain *domain; 407 struct irq_domain *domain;
85 unsigned long hwirq; 408 irq_hw_number_t hwirq;
86 unsigned int irq, type; 409 unsigned int type = IRQ_TYPE_NONE;
87 int rc = -EINVAL; 410 unsigned int virq;
88 411
89 /* Find a domain which can translate the irq spec */ 412 domain = controller ? irq_find_host(controller) : irq_default_domain;
90 mutex_lock(&irq_domain_mutex); 413 if (!domain) {
91 list_for_each_entry(domain, &irq_domain_list, list) { 414#ifdef CONFIG_MIPS
92 if (!domain->ops->dt_translate) 415 /*
93 continue; 416 * Workaround to avoid breaking interrupt controller drivers
94 rc = domain->ops->dt_translate(domain, controller, 417 * that don't yet register an irq_domain. This is temporary
95 intspec, intsize, &hwirq, &type); 418 * code. ~~~gcl, Feb 24, 2012
96 if (rc == 0) 419 *
97 break; 420 * Scheduled for removal in Linux v3.6. That should be enough
421 * time.
422 */
423 if (intsize > 0)
424 return intspec[0];
425#endif
426 printk(KERN_WARNING "irq: no irq domain found for %s !\n",
427 controller->full_name);
428 return 0;
98 } 429 }
99 mutex_unlock(&irq_domain_mutex);
100 430
101 if (rc != 0) 431 /* If domain has no translation, then we assume interrupt line */
102 return 0; 432 if (domain->ops->xlate == NULL)
433 hwirq = intspec[0];
434 else {
435 if (domain->ops->xlate(domain, controller, intspec, intsize,
436 &hwirq, &type))
437 return 0;
438 }
439
440 /* Create mapping */
441 virq = irq_create_mapping(domain, hwirq);
442 if (!virq)
443 return virq;
103 444
104 irq = irq_domain_to_irq(domain, hwirq); 445 /* Set type if specified and different than the current one */
105 if (type != IRQ_TYPE_NONE) 446 if (type != IRQ_TYPE_NONE &&
106 irq_set_irq_type(irq, type); 447 type != (irqd_get_trigger_type(irq_get_irq_data(virq))))
107 pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n", 448 irq_set_irq_type(virq, type);
108 controller->full_name, (int)hwirq, irq, type); 449 return virq;
109 return irq;
110} 450}
111EXPORT_SYMBOL_GPL(irq_create_of_mapping); 451EXPORT_SYMBOL_GPL(irq_create_of_mapping);
112 452
113/** 453/**
114 * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping() 454 * irq_dispose_mapping() - Unmap an interrupt
115 * @irq: linux irq number to be discarded 455 * @virq: linux irq number of the interrupt to unmap
456 */
457void irq_dispose_mapping(unsigned int virq)
458{
459 struct irq_data *irq_data = irq_get_irq_data(virq);
460 struct irq_domain *domain;
461 irq_hw_number_t hwirq;
462
463 if (!virq || !irq_data)
464 return;
465
466 domain = irq_data->domain;
467 if (WARN_ON(domain == NULL))
468 return;
469
470 /* Never unmap legacy interrupts */
471 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
472 return;
473
474 irq_set_status_flags(virq, IRQ_NOREQUEST);
475
476 /* remove chip and handler */
477 irq_set_chip_and_handler(virq, NULL, NULL);
478
479 /* Make sure it's completed */
480 synchronize_irq(virq);
481
482 /* Tell the PIC about it */
483 if (domain->ops->unmap)
484 domain->ops->unmap(domain, virq);
485 smp_mb();
486
487 /* Clear reverse map */
488 hwirq = irq_data->hwirq;
489 switch(domain->revmap_type) {
490 case IRQ_DOMAIN_MAP_LINEAR:
491 if (hwirq < domain->revmap_data.linear.size)
492 domain->revmap_data.linear.revmap[hwirq] = 0;
493 break;
494 case IRQ_DOMAIN_MAP_TREE:
495 mutex_lock(&revmap_trees_mutex);
496 radix_tree_delete(&domain->revmap_data.tree, hwirq);
497 mutex_unlock(&revmap_trees_mutex);
498 break;
499 }
500
501 irq_free_desc(virq);
502}
503EXPORT_SYMBOL_GPL(irq_dispose_mapping);
504
505/**
506 * irq_find_mapping() - Find a linux irq from an hw irq number.
507 * @domain: domain owning this hardware interrupt
508 * @hwirq: hardware irq number in that domain space
509 *
510 * This is a slow path, for use by generic code. It's expected that an
511 * irq controller implementation directly calls the appropriate low level
512 * mapping function.
513 */
514unsigned int irq_find_mapping(struct irq_domain *domain,
515 irq_hw_number_t hwirq)
516{
517 unsigned int i;
518 unsigned int hint = hwirq % irq_virq_count;
519
 520	/* Look for default domain if necessary */
521 if (domain == NULL)
522 domain = irq_default_domain;
523 if (domain == NULL)
524 return 0;
525
526 /* legacy -> bail early */
527 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
528 return irq_domain_legacy_revmap(domain, hwirq);
529
530 /* Slow path does a linear search of the map */
531 if (hint == 0)
532 hint = 1;
533 i = hint;
534 do {
535 struct irq_data *data = irq_get_irq_data(i);
536 if (data && (data->domain == domain) && (data->hwirq == hwirq))
537 return i;
538 i++;
539 if (i >= irq_virq_count)
540 i = 1;
541 } while(i != hint);
542 return 0;
543}
544EXPORT_SYMBOL_GPL(irq_find_mapping);
545
546/**
547 * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number.
548 * @domain: domain owning this hardware interrupt
549 * @hwirq: hardware irq number in that domain space
116 * 550 *
117 * Calling this function indicates the caller no longer needs a reference to 551 * This is a fast path, for use by irq controller code that uses radix tree
118 * the linux irq number returned by a prior call to irq_create_of_mapping(). 552 * revmaps
119 */ 553 */
120void irq_dispose_mapping(unsigned int irq) 554unsigned int irq_radix_revmap_lookup(struct irq_domain *domain,
555 irq_hw_number_t hwirq)
121{ 556{
557 struct irq_data *irq_data;
558
559 if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
560 return irq_find_mapping(domain, hwirq);
561
562 /*
563 * Freeing an irq can delete nodes along the path to
564 * do the lookup via call_rcu.
565 */
566 rcu_read_lock();
567 irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
568 rcu_read_unlock();
569
122 /* 570 /*
123 * nothing yet; will be filled when support for dynamic allocation of 571 * If found in radix tree, then fine.
124 * irq_descs is added to irq_domain 572 * Else fallback to linear lookup - this should not happen in practice
573 * as it means that we failed to insert the node in the radix tree.
125 */ 574 */
575 return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq);
126} 576}
127EXPORT_SYMBOL_GPL(irq_dispose_mapping);
128 577
129int irq_domain_simple_dt_translate(struct irq_domain *d, 578/**
130 struct device_node *controller, 579 * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping.
131 const u32 *intspec, unsigned int intsize, 580 * @domain: domain owning this hardware interrupt
132 unsigned long *out_hwirq, unsigned int *out_type) 581 * @virq: linux irq number
582 * @hwirq: hardware irq number in that domain space
583 *
584 * This is for use by irq controllers that use a radix tree reverse
585 * mapping for fast lookup.
586 */
587void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq,
588 irq_hw_number_t hwirq)
133{ 589{
134 if (d->of_node != controller) 590 struct irq_data *irq_data = irq_get_irq_data(virq);
135 return -EINVAL; 591
136 if (intsize < 1) 592 if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
137 return -EINVAL; 593 return;
138 if (d->nr_irq && ((intspec[0] < d->hwirq_base) || 594
139 (intspec[0] >= d->hwirq_base + d->nr_irq))) 595 if (virq) {
140 return -EINVAL; 596 mutex_lock(&revmap_trees_mutex);
597 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
598 mutex_unlock(&revmap_trees_mutex);
599 }
600}
601
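The two radix-tree helpers above are meant to be paired by controller code: insert from the domain's ->map() callback, look up from the demultiplexing flow handler. A hypothetical sketch (my_* names are invented; irq_set_chip_and_handler(), handle_level_irq and generic_handle_irq() are existing core helpers):

#include <linux/irq.h>
#include <linux/irqdesc.h>
#include <linux/irqdomain.h>

static struct irq_chip my_chip;         /* placeholder chip */

static int my_tree_map(struct irq_domain *d, unsigned int virq,
                       irq_hw_number_t hwirq)
{
        irq_set_chip_and_handler(virq, &my_chip, handle_level_irq);
        irq_radix_revmap_insert(d, virq, hwirq);
        return 0;
}

/* called from the controller's demultiplexing flow handler */
static void my_demux_one(struct irq_domain *domain, irq_hw_number_t hwirq)
{
        unsigned int virq = irq_radix_revmap_lookup(domain, hwirq);

        if (virq)
                generic_handle_irq(virq);
}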
602/**
603 * irq_linear_revmap() - Find a linux irq from a hw irq number.
604 * @domain: domain owning this hardware interrupt
605 * @hwirq: hardware irq number in that domain space
606 *
607 * This is a fast path, for use by irq controller code that uses linear
 608 * revmaps. It falls back to the slow path if the revmap doesn't exist
609 * yet and will create the revmap entry with appropriate locking
610 */
611unsigned int irq_linear_revmap(struct irq_domain *domain,
612 irq_hw_number_t hwirq)
613{
614 unsigned int *revmap;
615
616 if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR))
617 return irq_find_mapping(domain, hwirq);
618
619 /* Check revmap bounds */
620 if (unlikely(hwirq >= domain->revmap_data.linear.size))
621 return irq_find_mapping(domain, hwirq);
622
623 /* Check if revmap was allocated */
624 revmap = domain->revmap_data.linear.revmap;
625 if (unlikely(revmap == NULL))
626 return irq_find_mapping(domain, hwirq);
627
628 /* Fill up revmap with slow path if no mapping found */
629 if (unlikely(!revmap[hwirq]))
630 revmap[hwirq] = irq_find_mapping(domain, hwirq);
631
632 return revmap[hwirq];
633}
634
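irq_linear_revmap() is the hot-path counterpart to irq_domain_add_linear() earlier in this file. A hedged sketch of the usual cascaded-controller pattern; the my_* names and the pending-register read are invented, and irq_domain_simple_ops is assumed to be declared in <linux/irqdomain.h>:

#include <linux/bitops.h>
#include <linux/irq.h>
#include <linux/irqdesc.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static struct irq_domain *my_linear_domain;

static void my_controller_init(struct device_node *np)
{
        my_linear_domain = irq_domain_add_linear(np, 32,
                                                 &irq_domain_simple_ops,
                                                 NULL);
}

static unsigned long my_read_pending(void)
{
        return 0;       /* a real driver would read its status register */
}

static void my_cascade(void)
{
        unsigned long pending = my_read_pending();
        int hwirq;

        for_each_set_bit(hwirq, &pending, 32) {
                unsigned int virq = irq_linear_revmap(my_linear_domain, hwirq);

                if (virq)
                        generic_handle_irq(virq);
        }
}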
635#ifdef CONFIG_IRQ_DOMAIN_DEBUG
636static int virq_debug_show(struct seq_file *m, void *private)
637{
638 unsigned long flags;
639 struct irq_desc *desc;
640 const char *p;
641 static const char none[] = "none";
642 void *data;
643 int i;
644
645 seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq",
646 "chip name", "chip data", "domain name");
647
648 for (i = 1; i < nr_irqs; i++) {
649 desc = irq_to_desc(i);
650 if (!desc)
651 continue;
652
653 raw_spin_lock_irqsave(&desc->lock, flags);
654
655 if (desc->action && desc->action->handler) {
656 struct irq_chip *chip;
657
658 seq_printf(m, "%5d ", i);
659 seq_printf(m, "0x%05lx ", desc->irq_data.hwirq);
660
661 chip = irq_desc_get_chip(desc);
662 if (chip && chip->name)
663 p = chip->name;
664 else
665 p = none;
666 seq_printf(m, "%-15s ", p);
667
668 data = irq_desc_get_chip_data(desc);
669 seq_printf(m, "0x%16p ", data);
670
671 if (desc->irq_data.domain && desc->irq_data.domain->of_node)
672 p = desc->irq_data.domain->of_node->full_name;
673 else
674 p = none;
675 seq_printf(m, "%s\n", p);
676 }
677
678 raw_spin_unlock_irqrestore(&desc->lock, flags);
679 }
680
681 return 0;
682}
141 683
684static int virq_debug_open(struct inode *inode, struct file *file)
685{
686 return single_open(file, virq_debug_show, inode->i_private);
687}
688
689static const struct file_operations virq_debug_fops = {
690 .open = virq_debug_open,
691 .read = seq_read,
692 .llseek = seq_lseek,
693 .release = single_release,
694};
695
696static int __init irq_debugfs_init(void)
697{
698 if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL,
699 NULL, &virq_debug_fops) == NULL)
700 return -ENOMEM;
701
702 return 0;
703}
704__initcall(irq_debugfs_init);
705#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
706
707int irq_domain_simple_map(struct irq_domain *d, unsigned int irq,
708 irq_hw_number_t hwirq)
709{
710 return 0;
711}
712
713/**
714 * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
715 *
716 * Device Tree IRQ specifier translation function which works with one cell
717 * bindings where the cell value maps directly to the hwirq number.
718 */
719int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr,
720 const u32 *intspec, unsigned int intsize,
721 unsigned long *out_hwirq, unsigned int *out_type)
722{
723 if (WARN_ON(intsize < 1))
724 return -EINVAL;
142 *out_hwirq = intspec[0]; 725 *out_hwirq = intspec[0];
143 *out_type = IRQ_TYPE_NONE; 726 *out_type = IRQ_TYPE_NONE;
144 if (intsize > 1)
145 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
146 return 0; 727 return 0;
147} 728}
729EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell);
148 730
149/** 731/**
150 * irq_domain_create_simple() - Set up a 'simple' translation range 732 * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings
733 *
734 * Device Tree IRQ specifier translation function which works with two cell
735 * bindings where the cell values map directly to the hwirq number
736 * and linux irq flags.
151 */ 737 */
152void irq_domain_add_simple(struct device_node *controller, int irq_base) 738int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr,
739 const u32 *intspec, unsigned int intsize,
740 irq_hw_number_t *out_hwirq, unsigned int *out_type)
153{ 741{
154 struct irq_domain *domain; 742 if (WARN_ON(intsize < 2))
155 743 return -EINVAL;
156 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 744 *out_hwirq = intspec[0];
157 if (!domain) { 745 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
158 WARN_ON(1); 746 return 0;
159 return; 747}
160 } 748EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);
161 749
162 domain->irq_base = irq_base; 750/**
163 domain->of_node = of_node_get(controller); 751 * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings
164 domain->ops = &irq_domain_simple_ops; 752 *
165 irq_domain_add(domain); 753 * Device Tree IRQ specifier translation function which works with either one
754 * or two cell bindings where the cell values map directly to the hwirq number
755 * and linux irq flags.
756 *
757 * Note: don't use this function unless your interrupt controller explicitly
758 * supports both one and two cell bindings. For the majority of controllers
759 * the _onecell() or _twocell() variants above should be used.
760 */
761int irq_domain_xlate_onetwocell(struct irq_domain *d,
762 struct device_node *ctrlr,
763 const u32 *intspec, unsigned int intsize,
764 unsigned long *out_hwirq, unsigned int *out_type)
765{
766 if (WARN_ON(intsize < 1))
767 return -EINVAL;
768 *out_hwirq = intspec[0];
769 *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE;
770 return 0;
166} 771}
167EXPORT_SYMBOL_GPL(irq_domain_add_simple); 772EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell);
168 773
774const struct irq_domain_ops irq_domain_simple_ops = {
775 .map = irq_domain_simple_map,
776 .xlate = irq_domain_xlate_onetwocell,
777};
778EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
779
780#ifdef CONFIG_OF_IRQ
169void irq_domain_generate_simple(const struct of_device_id *match, 781void irq_domain_generate_simple(const struct of_device_id *match,
170 u64 phys_base, unsigned int irq_start) 782 u64 phys_base, unsigned int irq_start)
171{ 783{
172 struct device_node *node; 784 struct device_node *node;
173 pr_info("looking for phys_base=%llx, irq_start=%i\n", 785 pr_debug("looking for phys_base=%llx, irq_start=%i\n",
174 (unsigned long long) phys_base, (int) irq_start); 786 (unsigned long long) phys_base, (int) irq_start);
175 node = of_find_matching_node_by_address(NULL, match, phys_base); 787 node = of_find_matching_node_by_address(NULL, match, phys_base);
176 if (node) 788 if (node)
177 irq_domain_add_simple(node, irq_start); 789 irq_domain_add_legacy(node, 32, irq_start, 0,
178 else 790 &irq_domain_simple_ops, NULL);
179 pr_info("no node found\n");
180} 791}
181EXPORT_SYMBOL_GPL(irq_domain_generate_simple); 792EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
182#endif /* CONFIG_OF_IRQ */ 793#endif
183
184struct irq_domain_ops irq_domain_simple_ops = {
185#ifdef CONFIG_OF_IRQ
186 .dt_translate = irq_domain_simple_dt_translate,
187#endif /* CONFIG_OF_IRQ */
188};
189EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
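For reference, a controller whose devicetree binding uses the conventional two cells (hwirq number plus trigger flags) can simply plug the generic translator added above into its ops; my_map() is a hypothetical per-irq setup callback:

#include <linux/irqdomain.h>

static int my_map(struct irq_domain *d, unsigned int virq,
                  irq_hw_number_t hwirq)
{
        return 0;
}

static const struct irq_domain_ops my_twocell_ops = {
        .map   = my_map,
        .xlate = irq_domain_xlate_twocell,
};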
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index a9a9dbe49fea..89a3ea82569b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -282,7 +282,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
282{ 282{
283 struct irq_chip *chip = irq_desc_get_chip(desc); 283 struct irq_chip *chip = irq_desc_get_chip(desc);
284 struct cpumask *set = irq_default_affinity; 284 struct cpumask *set = irq_default_affinity;
285 int ret; 285 int ret, node = desc->irq_data.node;
286 286
287 /* Excludes PER_CPU and NO_BALANCE interrupts */ 287 /* Excludes PER_CPU and NO_BALANCE interrupts */
288 if (!irq_can_set_affinity(irq)) 288 if (!irq_can_set_affinity(irq))
@@ -301,6 +301,13 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
301 } 301 }
302 302
303 cpumask_and(mask, cpu_online_mask, set); 303 cpumask_and(mask, cpu_online_mask, set);
304 if (node != NUMA_NO_NODE) {
305 const struct cpumask *nodemask = cpumask_of_node(node);
306
307 /* make sure at least one of the cpus in nodemask is online */
308 if (cpumask_intersects(mask, nodemask))
309 cpumask_and(mask, mask, nodemask);
310 }
304 ret = chip->irq_set_affinity(&desc->irq_data, mask, false); 311 ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
305 switch (ret) { 312 switch (ret) {
306 case IRQ_SET_MASK_OK: 313 case IRQ_SET_MASK_OK:
@@ -645,7 +652,7 @@ static int irq_wait_for_interrupt(struct irqaction *action)
645 * is marked MASKED. 652 * is marked MASKED.
646 */ 653 */
647static void irq_finalize_oneshot(struct irq_desc *desc, 654static void irq_finalize_oneshot(struct irq_desc *desc,
648 struct irqaction *action, bool force) 655 struct irqaction *action)
649{ 656{
650 if (!(desc->istate & IRQS_ONESHOT)) 657 if (!(desc->istate & IRQS_ONESHOT))
651 return; 658 return;
@@ -679,7 +686,7 @@ again:
679 * we would clear the threads_oneshot bit of this thread which 686 * we would clear the threads_oneshot bit of this thread which
680 * was just set. 687 * was just set.
681 */ 688 */
682 if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) 689 if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
683 goto out_unlock; 690 goto out_unlock;
684 691
685 desc->threads_oneshot &= ~action->thread_mask; 692 desc->threads_oneshot &= ~action->thread_mask;
@@ -739,7 +746,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
739 746
740 local_bh_disable(); 747 local_bh_disable();
741 ret = action->thread_fn(action->irq, action->dev_id); 748 ret = action->thread_fn(action->irq, action->dev_id);
742 irq_finalize_oneshot(desc, action, false); 749 irq_finalize_oneshot(desc, action);
743 local_bh_enable(); 750 local_bh_enable();
744 return ret; 751 return ret;
745} 752}
@@ -755,10 +762,17 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc,
755 irqreturn_t ret; 762 irqreturn_t ret;
756 763
757 ret = action->thread_fn(action->irq, action->dev_id); 764 ret = action->thread_fn(action->irq, action->dev_id);
758 irq_finalize_oneshot(desc, action, false); 765 irq_finalize_oneshot(desc, action);
759 return ret; 766 return ret;
760} 767}
761 768
769static void wake_threads_waitq(struct irq_desc *desc)
770{
771 if (atomic_dec_and_test(&desc->threads_active) &&
772 waitqueue_active(&desc->wait_for_threads))
773 wake_up(&desc->wait_for_threads);
774}
775
762/* 776/*
763 * Interrupt handler thread 777 * Interrupt handler thread
764 */ 778 */
@@ -771,57 +785,41 @@ static int irq_thread(void *data)
771 struct irq_desc *desc = irq_to_desc(action->irq); 785 struct irq_desc *desc = irq_to_desc(action->irq);
772 irqreturn_t (*handler_fn)(struct irq_desc *desc, 786 irqreturn_t (*handler_fn)(struct irq_desc *desc,
773 struct irqaction *action); 787 struct irqaction *action);
774 int wake;
775 788
776 if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, 789 if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
777 &action->thread_flags)) 790 &action->thread_flags))
778 handler_fn = irq_forced_thread_fn; 791 handler_fn = irq_forced_thread_fn;
779 else 792 else
780 handler_fn = irq_thread_fn; 793 handler_fn = irq_thread_fn;
781 794
782 sched_setscheduler(current, SCHED_FIFO, &param); 795 sched_setscheduler(current, SCHED_FIFO, &param);
783 current->irqaction = action; 796 current->irq_thread = 1;
784 797
785 while (!irq_wait_for_interrupt(action)) { 798 while (!irq_wait_for_interrupt(action)) {
799 irqreturn_t action_ret;
786 800
787 irq_thread_check_affinity(desc, action); 801 irq_thread_check_affinity(desc, action);
788 802
789 atomic_inc(&desc->threads_active); 803 action_ret = handler_fn(desc, action);
804 if (!noirqdebug)
805 note_interrupt(action->irq, desc, action_ret);
790 806
791 raw_spin_lock_irq(&desc->lock); 807 wake_threads_waitq(desc);
792 if (unlikely(irqd_irq_disabled(&desc->irq_data))) {
793 /*
794 * CHECKME: We might need a dedicated
795 * IRQ_THREAD_PENDING flag here, which
796 * retriggers the thread in check_irq_resend()
797 * but AFAICT IRQS_PENDING should be fine as it
798 * retriggers the interrupt itself --- tglx
799 */
800 desc->istate |= IRQS_PENDING;
801 raw_spin_unlock_irq(&desc->lock);
802 } else {
803 irqreturn_t action_ret;
804
805 raw_spin_unlock_irq(&desc->lock);
806 action_ret = handler_fn(desc, action);
807 if (!noirqdebug)
808 note_interrupt(action->irq, desc, action_ret);
809 }
810
811 wake = atomic_dec_and_test(&desc->threads_active);
812
813 if (wake && waitqueue_active(&desc->wait_for_threads))
814 wake_up(&desc->wait_for_threads);
815 } 808 }
816 809
817 /* Prevent a stale desc->threads_oneshot */
818 irq_finalize_oneshot(desc, action, true);
819
820 /* 810 /*
821 * Clear irqaction. Otherwise exit_irq_thread() would make 811 * This is the regular exit path. __free_irq() is stopping the
812 * thread via kthread_stop() after calling
813 * synchronize_irq(). So neither IRQTF_RUNTHREAD nor the
814 * oneshot mask bit can be set. We cannot verify that as we
815 * cannot touch the oneshot mask at this point anymore as
816 * __setup_irq() might have given out currents thread_mask
817 * again.
818 *
819 * Clear irq_thread. Otherwise exit_irq_thread() would make
822 * fuzz about an active irq thread going into nirvana. 820 * fuzz about an active irq thread going into nirvana.
823 */ 821 */
824 current->irqaction = NULL; 822 current->irq_thread = 0;
825 return 0; 823 return 0;
826} 824}
827 825
@@ -832,27 +830,28 @@ void exit_irq_thread(void)
832{ 830{
833 struct task_struct *tsk = current; 831 struct task_struct *tsk = current;
834 struct irq_desc *desc; 832 struct irq_desc *desc;
833 struct irqaction *action;
835 834
836 if (!tsk->irqaction) 835 if (!tsk->irq_thread)
837 return; 836 return;
838 837
838 action = kthread_data(tsk);
839
839 printk(KERN_ERR 840 printk(KERN_ERR
840 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", 841 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
841 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); 842 tsk->comm ? tsk->comm : "", tsk->pid, action->irq);
842 843
843 desc = irq_to_desc(tsk->irqaction->irq); 844 desc = irq_to_desc(action->irq);
844 845
845 /* 846 /*
846 * Prevent a stale desc->threads_oneshot. Must be called 847 * If IRQTF_RUNTHREAD is set, we need to decrement
847 * before setting the IRQTF_DIED flag. 848 * desc->threads_active and wake possible waiters.
848 */ 849 */
849 irq_finalize_oneshot(desc, tsk->irqaction, true); 850 if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags))
851 wake_threads_waitq(desc);
850 852
851 /* 853 /* Prevent a stale desc->threads_oneshot */
852 * Set the THREAD DIED flag to prevent further wakeups of the 854 irq_finalize_oneshot(desc, action);
853 * soon to be gone threaded handler.
854 */
855 set_bit(IRQTF_DIED, &tsk->irqaction->flags);
856} 855}
857 856
858static void irq_setup_forced_threading(struct irqaction *new) 857static void irq_setup_forced_threading(struct irqaction *new)
@@ -985,6 +984,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
985 984
986 /* add new interrupt at end of irq queue */ 985 /* add new interrupt at end of irq queue */
987 do { 986 do {
987 /*
988 * Or all existing action->thread_mask bits,
989 * so we can find the next zero bit for this
990 * new action.
991 */
988 thread_mask |= old->thread_mask; 992 thread_mask |= old->thread_mask;
989 old_ptr = &old->next; 993 old_ptr = &old->next;
990 old = *old_ptr; 994 old = *old_ptr;
@@ -993,14 +997,41 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
993 } 997 }
994 998
995 /* 999 /*
996 * Setup the thread mask for this irqaction. Unlikely to have 1000 * Setup the thread mask for this irqaction for ONESHOT. For
997 * 32 resp 64 irqs sharing one line, but who knows. 1001 * !ONESHOT irqs the thread mask is 0 so we can avoid a
1002 * conditional in irq_wake_thread().
998 */ 1003 */
999 if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) { 1004 if (new->flags & IRQF_ONESHOT) {
1000 ret = -EBUSY; 1005 /*
1001 goto out_mask; 1006 * Unlikely to have 32 resp 64 irqs sharing one line,
1007 * but who knows.
1008 */
1009 if (thread_mask == ~0UL) {
1010 ret = -EBUSY;
1011 goto out_mask;
1012 }
1013 /*
1014 * The thread_mask for the action is or'ed to
 1015		 * desc->threads_active to indicate that the
1016 * IRQF_ONESHOT thread handler has been woken, but not
1017 * yet finished. The bit is cleared when a thread
1018 * completes. When all threads of a shared interrupt
1019 * line have completed desc->threads_active becomes
1020 * zero and the interrupt line is unmasked. See
1021 * handle.c:irq_wake_thread() for further information.
1022 *
1023 * If no thread is woken by primary (hard irq context)
1024 * interrupt handlers, then desc->threads_active is
1025 * also checked for zero to unmask the irq line in the
1026 * affected hard irq flow handlers
1027 * (handle_[fasteoi|level]_irq).
1028 *
1029 * The new action gets the first zero bit of
1030 * thread_mask assigned. See the loop above which or's
1031 * all existing action->thread_mask bits.
1032 */
1033 new->thread_mask = 1 << ffz(thread_mask);
1002 } 1034 }
1003 new->thread_mask = 1 << ffz(thread_mask);
1004 1035
1005 if (!shared) { 1036 if (!shared) {
1006 init_waitqueue_head(&desc->wait_for_threads); 1037 init_waitqueue_head(&desc->wait_for_threads);
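The comment block above describes the ONESHOT thread_mask bookkeeping. The kind of request that exercises it is a shared, oneshot threaded handler, sketched below with invented my_* names:

#include <linux/interrupt.h>

static irqreturn_t my_quick_check(int irq, void *dev_id)
{
        /* hard irq context: acknowledge the device and defer the work */
        return IRQ_WAKE_THREAD;
}

static irqreturn_t my_thread_fn(int irq, void *dev_id)
{
        /* may sleep; the line stays masked until this returns */
        return IRQ_HANDLED;
}

static int my_request(unsigned int irq, void *dev_id)
{
        return request_threaded_irq(irq, my_quick_check, my_thread_fn,
                                    IRQF_SHARED | IRQF_ONESHOT,
                                    "my_dev", dev_id);
}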
@@ -1027,7 +1058,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1027 desc->istate |= IRQS_ONESHOT; 1058 desc->istate |= IRQS_ONESHOT;
1028 1059
1029 if (irq_settings_can_autoenable(desc)) 1060 if (irq_settings_can_autoenable(desc))
1030 irq_startup(desc); 1061 irq_startup(desc, true);
1031 else 1062 else
1032 /* Undo nested disables: */ 1063 /* Undo nested disables: */
1033 desc->depth = 1; 1064 desc->depth = 1;
@@ -1103,8 +1134,7 @@ out_thread:
1103 struct task_struct *t = new->thread; 1134 struct task_struct *t = new->thread;
1104 1135
1105 new->thread = NULL; 1136 new->thread = NULL;
1106 if (likely(!test_bit(IRQTF_DIED, &new->thread_flags))) 1137 kthread_stop(t);
1107 kthread_stop(t);
1108 put_task_struct(t); 1138 put_task_struct(t);
1109 } 1139 }
1110out_mput: 1140out_mput:
@@ -1214,8 +1244,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1214#endif 1244#endif
1215 1245
1216 if (action->thread) { 1246 if (action->thread) {
1217 if (!test_bit(IRQTF_DIED, &action->thread_flags)) 1247 kthread_stop(action->thread);
1218 kthread_stop(action->thread);
1219 put_task_struct(action->thread); 1248 put_task_struct(action->thread);
1220 } 1249 }
1221 1250
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 47420908fba0..c3c89751b327 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -43,12 +43,16 @@ void irq_move_masked_irq(struct irq_data *idata)
43 * masking the irqs. 43 * masking the irqs.
44 */ 44 */
45 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) 45 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
46 < nr_cpu_ids)) 46 < nr_cpu_ids)) {
47 if (!chip->irq_set_affinity(&desc->irq_data, 47 int ret = chip->irq_set_affinity(&desc->irq_data,
48 desc->pending_mask, false)) { 48 desc->pending_mask, false);
49 switch (ret) {
50 case IRQ_SET_MASK_OK:
49 cpumask_copy(desc->irq_data.affinity, desc->pending_mask); 51 cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
52 case IRQ_SET_MASK_OK_NOCOPY:
50 irq_set_thread_affinity(desc); 53 irq_set_thread_affinity(desc);
51 } 54 }
55 }
52 56
53 cpumask_clear(desc->pending_mask); 57 cpumask_clear(desc->pending_mask);
54} 58}
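The fixed switch above makes the chip's return value meaningful during a masked migration: IRQ_SET_MASK_OK_NOCOPY means the chip already updated irq_data->affinity itself, so the core skips its own copy but still propagates the thread affinity. A hypothetical chip callback of that kind (my_route() is invented):

#include <linux/cpumask.h>
#include <linux/irq.h>

static const struct cpumask *my_route(struct irq_data *data,
                                      const struct cpumask *mask)
{
        return mask;    /* hardware-specific narrowing in a real chip */
}

static int my_chip_set_affinity(struct irq_data *data,
                                const struct cpumask *mask, bool force)
{
        cpumask_copy(data->affinity, my_route(data, mask));
        return IRQ_SET_MASK_OK_NOCOPY;
}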
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 01d3b70fc98a..43049192b5ec 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -12,7 +12,7 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/sort.h> 13#include <linux/sort.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/jump_label.h> 15#include <linux/static_key.h>
16 16
17#ifdef HAVE_JUMP_LABEL 17#ifdef HAVE_JUMP_LABEL
18 18
@@ -29,11 +29,6 @@ void jump_label_unlock(void)
29 mutex_unlock(&jump_label_mutex); 29 mutex_unlock(&jump_label_mutex);
30} 30}
31 31
32bool jump_label_enabled(struct jump_label_key *key)
33{
34 return !!atomic_read(&key->enabled);
35}
36
37static int jump_label_cmp(const void *a, const void *b) 32static int jump_label_cmp(const void *a, const void *b)
38{ 33{
39 const struct jump_entry *jea = a; 34 const struct jump_entry *jea = a;
@@ -58,56 +53,66 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
58 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); 53 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
59} 54}
60 55
61static void jump_label_update(struct jump_label_key *key, int enable); 56static void jump_label_update(struct static_key *key, int enable);
62 57
63void jump_label_inc(struct jump_label_key *key) 58void static_key_slow_inc(struct static_key *key)
64{ 59{
65 if (atomic_inc_not_zero(&key->enabled)) 60 if (atomic_inc_not_zero(&key->enabled))
66 return; 61 return;
67 62
68 jump_label_lock(); 63 jump_label_lock();
69 if (atomic_read(&key->enabled) == 0) 64 if (atomic_read(&key->enabled) == 0) {
70 jump_label_update(key, JUMP_LABEL_ENABLE); 65 if (!jump_label_get_branch_default(key))
66 jump_label_update(key, JUMP_LABEL_ENABLE);
67 else
68 jump_label_update(key, JUMP_LABEL_DISABLE);
69 }
71 atomic_inc(&key->enabled); 70 atomic_inc(&key->enabled);
72 jump_label_unlock(); 71 jump_label_unlock();
73} 72}
74EXPORT_SYMBOL_GPL(jump_label_inc); 73EXPORT_SYMBOL_GPL(static_key_slow_inc);
75 74
76static void __jump_label_dec(struct jump_label_key *key, 75static void __static_key_slow_dec(struct static_key *key,
77 unsigned long rate_limit, struct delayed_work *work) 76 unsigned long rate_limit, struct delayed_work *work)
78{ 77{
79 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) 78 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
79 WARN(atomic_read(&key->enabled) < 0,
80 "jump label: negative count!\n");
80 return; 81 return;
82 }
81 83
82 if (rate_limit) { 84 if (rate_limit) {
83 atomic_inc(&key->enabled); 85 atomic_inc(&key->enabled);
84 schedule_delayed_work(work, rate_limit); 86 schedule_delayed_work(work, rate_limit);
85 } else 87 } else {
86 jump_label_update(key, JUMP_LABEL_DISABLE); 88 if (!jump_label_get_branch_default(key))
87 89 jump_label_update(key, JUMP_LABEL_DISABLE);
90 else
91 jump_label_update(key, JUMP_LABEL_ENABLE);
92 }
88 jump_label_unlock(); 93 jump_label_unlock();
89} 94}
90EXPORT_SYMBOL_GPL(jump_label_dec);
91 95
92static void jump_label_update_timeout(struct work_struct *work) 96static void jump_label_update_timeout(struct work_struct *work)
93{ 97{
94 struct jump_label_key_deferred *key = 98 struct static_key_deferred *key =
95 container_of(work, struct jump_label_key_deferred, work.work); 99 container_of(work, struct static_key_deferred, work.work);
96 __jump_label_dec(&key->key, 0, NULL); 100 __static_key_slow_dec(&key->key, 0, NULL);
97} 101}
98 102
99void jump_label_dec(struct jump_label_key *key) 103void static_key_slow_dec(struct static_key *key)
100{ 104{
101 __jump_label_dec(key, 0, NULL); 105 __static_key_slow_dec(key, 0, NULL);
102} 106}
107EXPORT_SYMBOL_GPL(static_key_slow_dec);
103 108
104void jump_label_dec_deferred(struct jump_label_key_deferred *key) 109void static_key_slow_dec_deferred(struct static_key_deferred *key)
105{ 110{
106 __jump_label_dec(&key->key, key->timeout, &key->work); 111 __static_key_slow_dec(&key->key, key->timeout, &key->work);
107} 112}
113EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
108 114
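For context, the renamed API above is used roughly as sketched below; STATIC_KEY_INIT_FALSE and static_key_false() are assumed from the <linux/static_key.h> header this series introduces, and the my_* names are invented:

#include <linux/static_key.h>

static struct static_key my_feature = STATIC_KEY_INIT_FALSE;

void my_feature_enable(void)
{
        static_key_slow_inc(&my_feature);       /* patches the branches in */
}

void my_feature_disable(void)
{
        static_key_slow_dec(&my_feature);
}

void my_hot_path(void)
{
        if (static_key_false(&my_feature)) {
                /* rarely-taken work, nop'ed out while the key is off */
        }
}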
109 115void jump_label_rate_limit(struct static_key_deferred *key,
110void jump_label_rate_limit(struct jump_label_key_deferred *key,
111 unsigned long rl) 116 unsigned long rl)
112{ 117{
113 key->timeout = rl; 118 key->timeout = rl;
@@ -150,7 +155,7 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry
150 arch_jump_label_transform(entry, type); 155 arch_jump_label_transform(entry, type);
151} 156}
152 157
153static void __jump_label_update(struct jump_label_key *key, 158static void __jump_label_update(struct static_key *key,
154 struct jump_entry *entry, 159 struct jump_entry *entry,
155 struct jump_entry *stop, int enable) 160 struct jump_entry *stop, int enable)
156{ 161{
@@ -167,27 +172,40 @@ static void __jump_label_update(struct jump_label_key *key,
167 } 172 }
168} 173}
169 174
175static enum jump_label_type jump_label_type(struct static_key *key)
176{
177 bool true_branch = jump_label_get_branch_default(key);
178 bool state = static_key_enabled(key);
179
180 if ((!true_branch && state) || (true_branch && !state))
181 return JUMP_LABEL_ENABLE;
182
183 return JUMP_LABEL_DISABLE;
184}
185
170void __init jump_label_init(void) 186void __init jump_label_init(void)
171{ 187{
172 struct jump_entry *iter_start = __start___jump_table; 188 struct jump_entry *iter_start = __start___jump_table;
173 struct jump_entry *iter_stop = __stop___jump_table; 189 struct jump_entry *iter_stop = __stop___jump_table;
174 struct jump_label_key *key = NULL; 190 struct static_key *key = NULL;
175 struct jump_entry *iter; 191 struct jump_entry *iter;
176 192
177 jump_label_lock(); 193 jump_label_lock();
178 jump_label_sort_entries(iter_start, iter_stop); 194 jump_label_sort_entries(iter_start, iter_stop);
179 195
180 for (iter = iter_start; iter < iter_stop; iter++) { 196 for (iter = iter_start; iter < iter_stop; iter++) {
181 struct jump_label_key *iterk; 197 struct static_key *iterk;
182 198
183 iterk = (struct jump_label_key *)(unsigned long)iter->key; 199 iterk = (struct static_key *)(unsigned long)iter->key;
184 arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? 200 arch_jump_label_transform_static(iter, jump_label_type(iterk));
185 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
186 if (iterk == key) 201 if (iterk == key)
187 continue; 202 continue;
188 203
189 key = iterk; 204 key = iterk;
190 key->entries = iter; 205 /*
206 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
207 */
208 *((unsigned long *)&key->entries) += (unsigned long)iter;
191#ifdef CONFIG_MODULES 209#ifdef CONFIG_MODULES
192 key->next = NULL; 210 key->next = NULL;
193#endif 211#endif
@@ -197,8 +215,8 @@ void __init jump_label_init(void)
197 215
198#ifdef CONFIG_MODULES 216#ifdef CONFIG_MODULES
199 217
200struct jump_label_mod { 218struct static_key_mod {
201 struct jump_label_mod *next; 219 struct static_key_mod *next;
202 struct jump_entry *entries; 220 struct jump_entry *entries;
203 struct module *mod; 221 struct module *mod;
204}; 222};
@@ -218,9 +236,9 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
218 start, end); 236 start, end);
219} 237}
220 238
221static void __jump_label_mod_update(struct jump_label_key *key, int enable) 239static void __jump_label_mod_update(struct static_key *key, int enable)
222{ 240{
223 struct jump_label_mod *mod = key->next; 241 struct static_key_mod *mod = key->next;
224 242
225 while (mod) { 243 while (mod) {
226 struct module *m = mod->mod; 244 struct module *m = mod->mod;
@@ -251,11 +269,7 @@ void jump_label_apply_nops(struct module *mod)
251 return; 269 return;
252 270
253 for (iter = iter_start; iter < iter_stop; iter++) { 271 for (iter = iter_start; iter < iter_stop; iter++) {
254 struct jump_label_key *iterk; 272 arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
255
256 iterk = (struct jump_label_key *)(unsigned long)iter->key;
257 arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
258 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
259 } 273 }
260} 274}
261 275
@@ -264,8 +278,8 @@ static int jump_label_add_module(struct module *mod)
264 struct jump_entry *iter_start = mod->jump_entries; 278 struct jump_entry *iter_start = mod->jump_entries;
265 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; 279 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
266 struct jump_entry *iter; 280 struct jump_entry *iter;
267 struct jump_label_key *key = NULL; 281 struct static_key *key = NULL;
268 struct jump_label_mod *jlm; 282 struct static_key_mod *jlm;
269 283
270 /* if the module doesn't have jump label entries, just return */ 284 /* if the module doesn't have jump label entries, just return */
271 if (iter_start == iter_stop) 285 if (iter_start == iter_stop)
@@ -274,28 +288,30 @@ static int jump_label_add_module(struct module *mod)
274 jump_label_sort_entries(iter_start, iter_stop); 288 jump_label_sort_entries(iter_start, iter_stop);
275 289
276 for (iter = iter_start; iter < iter_stop; iter++) { 290 for (iter = iter_start; iter < iter_stop; iter++) {
277 if (iter->key == (jump_label_t)(unsigned long)key) 291 struct static_key *iterk;
278 continue;
279 292
280 key = (struct jump_label_key *)(unsigned long)iter->key; 293 iterk = (struct static_key *)(unsigned long)iter->key;
294 if (iterk == key)
295 continue;
281 296
297 key = iterk;
282 if (__module_address(iter->key) == mod) { 298 if (__module_address(iter->key) == mod) {
283 atomic_set(&key->enabled, 0); 299 /*
284 key->entries = iter; 300 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
301 */
302 *((unsigned long *)&key->entries) += (unsigned long)iter;
285 key->next = NULL; 303 key->next = NULL;
286 continue; 304 continue;
287 } 305 }
288 306 jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);
289 jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL);
290 if (!jlm) 307 if (!jlm)
291 return -ENOMEM; 308 return -ENOMEM;
292
293 jlm->mod = mod; 309 jlm->mod = mod;
294 jlm->entries = iter; 310 jlm->entries = iter;
295 jlm->next = key->next; 311 jlm->next = key->next;
296 key->next = jlm; 312 key->next = jlm;
297 313
298 if (jump_label_enabled(key)) 314 if (jump_label_type(key) == JUMP_LABEL_ENABLE)
299 __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); 315 __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
300 } 316 }
301 317
@@ -307,14 +323,14 @@ static void jump_label_del_module(struct module *mod)
307 struct jump_entry *iter_start = mod->jump_entries; 323 struct jump_entry *iter_start = mod->jump_entries;
308 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; 324 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
309 struct jump_entry *iter; 325 struct jump_entry *iter;
310 struct jump_label_key *key = NULL; 326 struct static_key *key = NULL;
311 struct jump_label_mod *jlm, **prev; 327 struct static_key_mod *jlm, **prev;
312 328
313 for (iter = iter_start; iter < iter_stop; iter++) { 329 for (iter = iter_start; iter < iter_stop; iter++) {
314 if (iter->key == (jump_label_t)(unsigned long)key) 330 if (iter->key == (jump_label_t)(unsigned long)key)
315 continue; 331 continue;
316 332
317 key = (struct jump_label_key *)(unsigned long)iter->key; 333 key = (struct static_key *)(unsigned long)iter->key;
318 334
319 if (__module_address(iter->key) == mod) 335 if (__module_address(iter->key) == mod)
320 continue; 336 continue;
@@ -416,12 +432,13 @@ int jump_label_text_reserved(void *start, void *end)
416 return ret; 432 return ret;
417} 433}
418 434
419static void jump_label_update(struct jump_label_key *key, int enable) 435static void jump_label_update(struct static_key *key, int enable)
420{ 436{
421 struct jump_entry *entry = key->entries, *stop = __stop___jump_table; 437 struct jump_entry *stop = __stop___jump_table;
438 struct jump_entry *entry = jump_label_get_entries(key);
422 439
423#ifdef CONFIG_MODULES 440#ifdef CONFIG_MODULES
424 struct module *mod = __module_address((jump_label_t)key); 441 struct module *mod = __module_address((unsigned long)key);
425 442
426 __jump_label_mod_update(key, enable); 443 __jump_label_mod_update(key, enable);
427 444
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 7b0886786701..4e2e472f6aeb 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -37,7 +37,6 @@
37#include <asm/page.h> 37#include <asm/page.h>
38#include <asm/uaccess.h> 38#include <asm/uaccess.h>
39#include <asm/io.h> 39#include <asm/io.h>
40#include <asm/system.h>
41#include <asm/sections.h> 40#include <asm/sections.h>
42 41
43/* Per cpu memory for storing cpu states in case of system crash. */ 42/* Per cpu memory for storing cpu states in case of system crash. */
@@ -1359,6 +1358,10 @@ static int __init parse_crashkernel_simple(char *cmdline,
1359 1358
1360 if (*cur == '@') 1359 if (*cur == '@')
1361 *crash_base = memparse(cur+1, &cur); 1360 *crash_base = memparse(cur+1, &cur);
1361 else if (*cur != ' ' && *cur != '\0') {
1362 pr_warning("crashkernel: unrecognized char\n");
1363 return -EINVAL;
1364 }
1362 1365
1363 return 0; 1366 return 0;
1364} 1367}
@@ -1462,7 +1465,9 @@ static int __init crash_save_vmcoreinfo_init(void)
1462 1465
1463 VMCOREINFO_SYMBOL(init_uts_ns); 1466 VMCOREINFO_SYMBOL(init_uts_ns);
1464 VMCOREINFO_SYMBOL(node_online_map); 1467 VMCOREINFO_SYMBOL(node_online_map);
1468#ifdef CONFIG_MMU
1465 VMCOREINFO_SYMBOL(swapper_pg_dir); 1469 VMCOREINFO_SYMBOL(swapper_pg_dir);
1470#endif
1466 VMCOREINFO_SYMBOL(_stext); 1471 VMCOREINFO_SYMBOL(_stext);
1467 VMCOREINFO_SYMBOL(vmlist); 1472 VMCOREINFO_SYMBOL(vmlist);
1468 1473
@@ -1546,13 +1551,13 @@ int kernel_kexec(void)
1546 if (error) 1551 if (error)
1547 goto Resume_console; 1552 goto Resume_console;
1548 /* At this point, dpm_suspend_start() has been called, 1553 /* At this point, dpm_suspend_start() has been called,
1549 * but *not* dpm_suspend_noirq(). We *must* call 1554 * but *not* dpm_suspend_end(). We *must* call
1550 * dpm_suspend_noirq() now. Otherwise, drivers for 1555 * dpm_suspend_end() now. Otherwise, drivers for
1551 * some devices (e.g. interrupt controllers) become 1556 * some devices (e.g. interrupt controllers) become
1552 * desynchronized with the actual state of the 1557 * desynchronized with the actual state of the
1553 * hardware at resume time, and evil weirdness ensues. 1558 * hardware at resume time, and evil weirdness ensues.
1554 */ 1559 */
1555 error = dpm_suspend_noirq(PMSG_FREEZE); 1560 error = dpm_suspend_end(PMSG_FREEZE);
1556 if (error) 1561 if (error)
1557 goto Resume_devices; 1562 goto Resume_devices;
1558 error = disable_nonboot_cpus(); 1563 error = disable_nonboot_cpus();
@@ -1579,7 +1584,7 @@ int kernel_kexec(void)
1579 local_irq_enable(); 1584 local_irq_enable();
1580 Enable_cpus: 1585 Enable_cpus:
1581 enable_nonboot_cpus(); 1586 enable_nonboot_cpus();
1582 dpm_resume_noirq(PMSG_RESTORE); 1587 dpm_resume_start(PMSG_RESTORE);
1583 Resume_devices: 1588 Resume_devices:
1584 dpm_resume_end(PMSG_RESTORE); 1589 dpm_resume_end(PMSG_RESTORE);
1585 Resume_console: 1590 Resume_console:
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a0a88543934e..957a7aab8ebc 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -60,6 +60,43 @@ static DECLARE_RWSEM(umhelper_sem);
60*/ 60*/
61char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; 61char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
62 62
63static void free_modprobe_argv(struct subprocess_info *info)
64{
65 kfree(info->argv[3]); /* check call_modprobe() */
66 kfree(info->argv);
67}
68
69static int call_modprobe(char *module_name, int wait)
70{
71 static char *envp[] = {
72 "HOME=/",
73 "TERM=linux",
74 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
75 NULL
76 };
77
78 char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
79 if (!argv)
80 goto out;
81
82 module_name = kstrdup(module_name, GFP_KERNEL);
83 if (!module_name)
84 goto free_argv;
85
86 argv[0] = modprobe_path;
87 argv[1] = "-q";
88 argv[2] = "--";
89 argv[3] = module_name; /* check free_modprobe_argv() */
90 argv[4] = NULL;
91
92 return call_usermodehelper_fns(modprobe_path, argv, envp,
93 wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL);
94free_argv:
95 kfree(argv);
96out:
97 return -ENOMEM;
98}
99
63/** 100/**
64 * __request_module - try to load a kernel module 101 * __request_module - try to load a kernel module
65 * @wait: wait (or not) for the operation to complete 102 * @wait: wait (or not) for the operation to complete
@@ -81,11 +118,6 @@ int __request_module(bool wait, const char *fmt, ...)
81 char module_name[MODULE_NAME_LEN]; 118 char module_name[MODULE_NAME_LEN];
82 unsigned int max_modprobes; 119 unsigned int max_modprobes;
83 int ret; 120 int ret;
84 char *argv[] = { modprobe_path, "-q", "--", module_name, NULL };
85 static char *envp[] = { "HOME=/",
86 "TERM=linux",
87 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
88 NULL };
89 static atomic_t kmod_concurrent = ATOMIC_INIT(0); 121 static atomic_t kmod_concurrent = ATOMIC_INIT(0);
90#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 122#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
91 static int kmod_loop_msg; 123 static int kmod_loop_msg;
@@ -128,9 +160,7 @@ int __request_module(bool wait, const char *fmt, ...)
128 160
129 trace_module_request(module_name, wait, _RET_IP_); 161 trace_module_request(module_name, wait, _RET_IP_);
130 162
131 ret = call_usermodehelper_fns(modprobe_path, argv, envp, 163 ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
132 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC,
133 NULL, NULL, NULL);
134 164
135 atomic_dec(&kmod_concurrent); 165 atomic_dec(&kmod_concurrent);
136 return ret; 166 return ret;
@@ -188,7 +218,7 @@ static int ____call_usermodehelper(void *data)
188 /* Exec failed? */ 218 /* Exec failed? */
189fail: 219fail:
190 sub_info->retval = retval; 220 sub_info->retval = retval;
191 do_exit(0); 221 return 0;
192} 222}
193 223
194void call_usermodehelper_freeinfo(struct subprocess_info *info) 224void call_usermodehelper_freeinfo(struct subprocess_info *info)
@@ -199,6 +229,19 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info)
199} 229}
200EXPORT_SYMBOL(call_usermodehelper_freeinfo); 230EXPORT_SYMBOL(call_usermodehelper_freeinfo);
201 231
232static void umh_complete(struct subprocess_info *sub_info)
233{
234 struct completion *comp = xchg(&sub_info->complete, NULL);
235 /*
236 * See call_usermodehelper_exec(). If xchg() returns NULL
237 * we own sub_info, the UMH_KILLABLE caller has gone away.
238 */
239 if (comp)
240 complete(comp);
241 else
242 call_usermodehelper_freeinfo(sub_info);
243}
244
202/* Keventd can't block, but this (a child) can. */ 245/* Keventd can't block, but this (a child) can. */
203static int wait_for_helper(void *data) 246static int wait_for_helper(void *data)
204{ 247{
@@ -235,7 +278,7 @@ static int wait_for_helper(void *data)
235 sub_info->retval = ret; 278 sub_info->retval = ret;
236 } 279 }
237 280
238 complete(sub_info->complete); 281 umh_complete(sub_info);
239 return 0; 282 return 0;
240} 283}
241 284
@@ -244,7 +287,7 @@ static void __call_usermodehelper(struct work_struct *work)
244{ 287{
245 struct subprocess_info *sub_info = 288 struct subprocess_info *sub_info =
246 container_of(work, struct subprocess_info, work); 289 container_of(work, struct subprocess_info, work);
247 enum umh_wait wait = sub_info->wait; 290 int wait = sub_info->wait & ~UMH_KILLABLE;
248 pid_t pid; 291 pid_t pid;
249 292
250 /* CLONE_VFORK: wait until the usermode helper has execve'd 293 /* CLONE_VFORK: wait until the usermode helper has execve'd
@@ -269,7 +312,7 @@ static void __call_usermodehelper(struct work_struct *work)
269 case UMH_WAIT_EXEC: 312 case UMH_WAIT_EXEC:
270 if (pid < 0) 313 if (pid < 0)
271 sub_info->retval = pid; 314 sub_info->retval = pid;
272 complete(sub_info->complete); 315 umh_complete(sub_info);
273 } 316 }
274} 317}
275 318
@@ -435,8 +478,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns);
435 * asynchronously if wait is not set, and runs as a child of keventd. 478 * asynchronously if wait is not set, and runs as a child of keventd.
436 * (ie. it runs with full root capabilities). 479 * (ie. it runs with full root capabilities).
437 */ 480 */
438int call_usermodehelper_exec(struct subprocess_info *sub_info, 481int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
439 enum umh_wait wait)
440{ 482{
441 DECLARE_COMPLETION_ONSTACK(done); 483 DECLARE_COMPLETION_ONSTACK(done);
442 int retval = 0; 484 int retval = 0;
@@ -456,9 +498,21 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
456 queue_work(khelper_wq, &sub_info->work); 498 queue_work(khelper_wq, &sub_info->work);
457 if (wait == UMH_NO_WAIT) /* task has freed sub_info */ 499 if (wait == UMH_NO_WAIT) /* task has freed sub_info */
458 goto unlock; 500 goto unlock;
501
502 if (wait & UMH_KILLABLE) {
503 retval = wait_for_completion_killable(&done);
504 if (!retval)
505 goto wait_done;
506
507 /* umh_complete() will see NULL and free sub_info */
508 if (xchg(&sub_info->complete, NULL))
509 goto unlock;
510 /* fallthrough, umh_complete() was already called */
511 }
512
459 wait_for_completion(&done); 513 wait_for_completion(&done);
514wait_done:
460 retval = sub_info->retval; 515 retval = sub_info->retval;
461
462out: 516out:
463 call_usermodehelper_freeinfo(sub_info); 517 call_usermodehelper_freeinfo(sub_info);
464unlock: 518unlock:
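
The UMH_KILLABLE support above hinges on a single atomic exchange of sub_info->complete: whichever side (the helper or the killed waiter) still finds the pointer non-NULL owns the final step, so the request is either completed or freed, never leaked and never completed twice. A minimal user-space sketch of that handoff with C11 atomics; all names are illustrative and none of this is kernel API:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct request {
        _Atomic(int *) done_flag;       /* plays the role of sub_info->complete */
        char *storage;                  /* plays the role of sub_info itself    */
};

static void helper_side(struct request *req)
{
        int *flag = atomic_exchange(&req->done_flag, NULL);

        if (flag)
                *flag = 1;              /* waiter still there: "complete()" it  */
        else
                free(req->storage);     /* waiter was killed: we own cleanup    */
}

static void waiter_killed(struct request *req)
{
        if (atomic_exchange(&req->done_flag, NULL))
                return;                 /* helper not done yet; it will see
                                           NULL later and free the request     */
        /* else the helper already claimed the flag: fall through and collect
           the result, as the kernel code does after the xchg()                 */
}

int main(void)
{
        int done = 0;
        struct request req;

        req.storage = malloc(16);
        atomic_init(&req.done_flag, &done);

        waiter_killed(&req);            /* a fatal signal arrives first         */
        helper_side(&req);              /* helper then sees NULL and frees      */
        printf("done=%d\n", done);      /* 0: completion skipped, nothing leaked */
        return 0;
}
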
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9788c0ec6f43..c62b8546cc90 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1334,8 +1334,10 @@ int __kprobes register_kprobe(struct kprobe *p)
1334 if (!kernel_text_address((unsigned long) p->addr) || 1334 if (!kernel_text_address((unsigned long) p->addr) ||
1335 in_kprobes_functions((unsigned long) p->addr) || 1335 in_kprobes_functions((unsigned long) p->addr) ||
1336 ftrace_text_reserved(p->addr, p->addr) || 1336 ftrace_text_reserved(p->addr, p->addr) ||
1337 jump_label_text_reserved(p->addr, p->addr)) 1337 jump_label_text_reserved(p->addr, p->addr)) {
1338 goto fail_with_jump_label; 1338 ret = -EINVAL;
1339 goto cannot_probe;
1340 }
1339 1341
1340 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ 1342 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
1341 p->flags &= KPROBE_FLAG_DISABLED; 1343 p->flags &= KPROBE_FLAG_DISABLED;
@@ -1352,7 +1354,7 @@ int __kprobes register_kprobe(struct kprobe *p)
1352 * its code to prohibit unexpected unloading. 1354 * its code to prohibit unexpected unloading.
1353 */ 1355 */
1354 if (unlikely(!try_module_get(probed_mod))) 1356 if (unlikely(!try_module_get(probed_mod)))
1355 goto fail_with_jump_label; 1357 goto cannot_probe;
1356 1358
1357 /* 1359 /*
1358 * If the module freed .init.text, we couldn't insert 1360 * If the module freed .init.text, we couldn't insert
@@ -1361,7 +1363,7 @@ int __kprobes register_kprobe(struct kprobe *p)
1361 if (within_module_init((unsigned long)p->addr, probed_mod) && 1363 if (within_module_init((unsigned long)p->addr, probed_mod) &&
1362 probed_mod->state != MODULE_STATE_COMING) { 1364 probed_mod->state != MODULE_STATE_COMING) {
1363 module_put(probed_mod); 1365 module_put(probed_mod);
1364 goto fail_with_jump_label; 1366 goto cannot_probe;
1365 } 1367 }
1366 /* ret will be updated by following code */ 1368 /* ret will be updated by following code */
1367 } 1369 }
@@ -1409,7 +1411,7 @@ out:
1409 1411
1410 return ret; 1412 return ret;
1411 1413
1412fail_with_jump_label: 1414cannot_probe:
1413 preempt_enable(); 1415 preempt_enable();
1414 jump_label_unlock(); 1416 jump_label_unlock();
1415 return ret; 1417 return ret;
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8889f7dd7c46..ea9ee4518c35 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -4176,7 +4176,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4176 printk("-------------------------------\n"); 4176 printk("-------------------------------\n");
4177 printk("%s:%d %s!\n", file, line, s); 4177 printk("%s:%d %s!\n", file, line, s);
4178 printk("\nother info that might help us debug this:\n\n"); 4178 printk("\nother info that might help us debug this:\n\n");
4179 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); 4179 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
4180 !rcu_lockdep_current_cpu_online()
4181 ? "RCU used illegally from offline CPU!\n"
4182 : rcu_is_cpu_idle()
4183 ? "RCU used illegally from idle CPU!\n"
4184 : "",
4185 rcu_scheduler_active, debug_locks);
4180 4186
4181 /* 4187 /*
4182 * If a CPU is in the RCU-free window in idle (ie: in the section 4188 * If a CPU is in the RCU-free window in idle (ie: in the section
diff --git a/kernel/module.c b/kernel/module.c
index 2c932760fd33..78ac6ec1e425 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -105,6 +105,7 @@ struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
105 105
106/* Block module loading/unloading? */ 106/* Block module loading/unloading? */
107int modules_disabled = 0; 107int modules_disabled = 0;
108core_param(nomodule, modules_disabled, bint, 0);
108 109
109/* Waiting for a module to finish initializing? */ 110/* Waiting for a module to finish initializing? */
110static DECLARE_WAIT_QUEUE_HEAD(module_wq); 111static DECLARE_WAIT_QUEUE_HEAD(module_wq);
@@ -903,6 +904,36 @@ static ssize_t show_refcnt(struct module_attribute *mattr,
903static struct module_attribute modinfo_refcnt = 904static struct module_attribute modinfo_refcnt =
904 __ATTR(refcnt, 0444, show_refcnt, NULL); 905 __ATTR(refcnt, 0444, show_refcnt, NULL);
905 906
907void __module_get(struct module *module)
908{
909 if (module) {
910 preempt_disable();
911 __this_cpu_inc(module->refptr->incs);
912 trace_module_get(module, _RET_IP_);
913 preempt_enable();
914 }
915}
916EXPORT_SYMBOL(__module_get);
917
918bool try_module_get(struct module *module)
919{
920 bool ret = true;
921
922 if (module) {
923 preempt_disable();
924
925 if (likely(module_is_live(module))) {
926 __this_cpu_inc(module->refptr->incs);
927 trace_module_get(module, _RET_IP_);
928 } else
929 ret = false;
930
931 preempt_enable();
932 }
933 return ret;
934}
935EXPORT_SYMBOL(try_module_get);
936
906void module_put(struct module *module) 937void module_put(struct module *module)
907{ 938{
908 if (module) { 939 if (module) {
@@ -2380,8 +2411,7 @@ static int copy_and_check(struct load_info *info,
2380 return -ENOEXEC; 2411 return -ENOEXEC;
2381 2412
2382 /* Suck in entire file: we'll want most of it. */ 2413 /* Suck in entire file: we'll want most of it. */
2383 /* vmalloc barfs on "unusual" numbers. Check here */ 2414 if ((hdr = vmalloc(len)) == NULL)
2384 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
2385 return -ENOMEM; 2415 return -ENOMEM;
2386 2416
2387 if (copy_from_user(hdr, umod, len) != 0) { 2417 if (copy_from_user(hdr, umod, len) != 0) {
@@ -2922,7 +2952,8 @@ static struct module *load_module(void __user *umod,
2922 mutex_unlock(&module_mutex); 2952 mutex_unlock(&module_mutex);
2923 2953
2924 /* Module is ready to execute: parsing args may do that. */ 2954 /* Module is ready to execute: parsing args may do that. */
2925 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); 2955 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
2956 -32768, 32767, NULL);
2926 if (err < 0) 2957 if (err < 0)
2927 goto unlink; 2958 goto unlink;
2928 2959
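
try_module_get() as reworked above only grants a reference while the module is still live; the kernel closes the check/increment race with preempt_disable() and its per-CPU reference counters. A simplified user-space sketch of the liveness-gated grab, with plain atomics standing in for those mechanisms (illustrative names only, and without the kernel's race-closing guarantees):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct object {
        atomic_bool live;       /* module_is_live() analogue            */
        atomic_int  refs;       /* stands in for the per-CPU counters   */
};

static bool try_get(struct object *obj)
{
        if (!atomic_load(&obj->live))
                return false;                   /* teardown already started */
        atomic_fetch_add(&obj->refs, 1);
        return true;
}

static void put(struct object *obj)
{
        atomic_fetch_sub(&obj->refs, 1);
}

int main(void)
{
        struct object obj;

        atomic_init(&obj.live, true);
        atomic_init(&obj.refs, 0);

        if (try_get(&obj)) {
                printf("got ref, refs=%d\n", atomic_load(&obj.refs));
                put(&obj);
        }

        atomic_store(&obj.live, false);         /* "module unload" begins */
        printf("after teardown starts: %s\n",
               try_get(&obj) ? "granted" : "refused");
        return 0;
}
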
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 89096dd8786f..a307cc9c9526 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -240,9 +240,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
240 240
241 /* didn't get the lock, go to sleep: */ 241 /* didn't get the lock, go to sleep: */
242 spin_unlock_mutex(&lock->wait_lock, flags); 242 spin_unlock_mutex(&lock->wait_lock, flags);
243 preempt_enable_no_resched(); 243 schedule_preempt_disabled();
244 schedule();
245 preempt_disable();
246 spin_lock_mutex(&lock->wait_lock, flags); 244 spin_lock_mutex(&lock->wait_lock, flags);
247 } 245 }
248 246
diff --git a/kernel/padata.c b/kernel/padata.c
index b45259931512..6f10eb285ece 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -29,7 +29,6 @@
29#include <linux/sysfs.h> 29#include <linux/sysfs.h>
30#include <linux/rcupdate.h> 30#include <linux/rcupdate.h>
31 31
32#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
33#define MAX_OBJ_NUM 1000 32#define MAX_OBJ_NUM 1000
34 33
35static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) 34static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
@@ -43,18 +42,19 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
43 return target_cpu; 42 return target_cpu;
44} 43}
45 44
46static int padata_cpu_hash(struct padata_priv *padata) 45static int padata_cpu_hash(struct parallel_data *pd)
47{ 46{
48 int cpu_index; 47 int cpu_index;
49 struct parallel_data *pd;
50
51 pd = padata->pd;
52 48
53 /* 49 /*
54 * Hash the sequence numbers to the cpus by taking 50 * Hash the sequence numbers to the cpus by taking
55 * seq_nr mod. number of cpus in use. 51 * seq_nr mod. number of cpus in use.
56 */ 52 */
57 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); 53
54 spin_lock(&pd->seq_lock);
55 cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu);
56 pd->seq_nr++;
57 spin_unlock(&pd->seq_lock);
58 58
59 return padata_index_to_cpu(pd, cpu_index); 59 return padata_index_to_cpu(pd, cpu_index);
60} 60}
@@ -132,12 +132,7 @@ int padata_do_parallel(struct padata_instance *pinst,
132 padata->pd = pd; 132 padata->pd = pd;
133 padata->cb_cpu = cb_cpu; 133 padata->cb_cpu = cb_cpu;
134 134
135 if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) 135 target_cpu = padata_cpu_hash(pd);
136 atomic_set(&pd->seq_nr, -1);
137
138 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
139
140 target_cpu = padata_cpu_hash(padata);
141 queue = per_cpu_ptr(pd->pqueue, target_cpu); 136 queue = per_cpu_ptr(pd->pqueue, target_cpu);
142 137
143 spin_lock(&queue->parallel.lock); 138 spin_lock(&queue->parallel.lock);
@@ -173,7 +168,7 @@ EXPORT_SYMBOL(padata_do_parallel);
173static struct padata_priv *padata_get_next(struct parallel_data *pd) 168static struct padata_priv *padata_get_next(struct parallel_data *pd)
174{ 169{
175 int cpu, num_cpus; 170 int cpu, num_cpus;
176 int next_nr, next_index; 171 unsigned int next_nr, next_index;
177 struct padata_parallel_queue *queue, *next_queue; 172 struct padata_parallel_queue *queue, *next_queue;
178 struct padata_priv *padata; 173 struct padata_priv *padata;
179 struct padata_list *reorder; 174 struct padata_list *reorder;
@@ -189,14 +184,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
189 cpu = padata_index_to_cpu(pd, next_index); 184 cpu = padata_index_to_cpu(pd, next_index);
190 next_queue = per_cpu_ptr(pd->pqueue, cpu); 185 next_queue = per_cpu_ptr(pd->pqueue, cpu);
191 186
192 if (unlikely(next_nr > pd->max_seq_nr)) {
193 next_nr = next_nr - pd->max_seq_nr - 1;
194 next_index = next_nr % num_cpus;
195 cpu = padata_index_to_cpu(pd, next_index);
196 next_queue = per_cpu_ptr(pd->pqueue, cpu);
197 pd->processed = 0;
198 }
199
200 padata = NULL; 187 padata = NULL;
201 188
202 reorder = &next_queue->reorder; 189 reorder = &next_queue->reorder;
@@ -205,8 +192,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
205 padata = list_entry(reorder->list.next, 192 padata = list_entry(reorder->list.next,
206 struct padata_priv, list); 193 struct padata_priv, list);
207 194
208 BUG_ON(next_nr != padata->seq_nr);
209
210 spin_lock(&reorder->lock); 195 spin_lock(&reorder->lock);
211 list_del_init(&padata->list); 196 list_del_init(&padata->list);
212 atomic_dec(&pd->reorder_objects); 197 atomic_dec(&pd->reorder_objects);
@@ -230,6 +215,7 @@ out:
230 215
231static void padata_reorder(struct parallel_data *pd) 216static void padata_reorder(struct parallel_data *pd)
232{ 217{
218 int cb_cpu;
233 struct padata_priv *padata; 219 struct padata_priv *padata;
234 struct padata_serial_queue *squeue; 220 struct padata_serial_queue *squeue;
235 struct padata_instance *pinst = pd->pinst; 221 struct padata_instance *pinst = pd->pinst;
@@ -270,13 +256,14 @@ static void padata_reorder(struct parallel_data *pd)
270 return; 256 return;
271 } 257 }
272 258
273 squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu); 259 cb_cpu = padata->cb_cpu;
260 squeue = per_cpu_ptr(pd->squeue, cb_cpu);
274 261
275 spin_lock(&squeue->serial.lock); 262 spin_lock(&squeue->serial.lock);
276 list_add_tail(&padata->list, &squeue->serial.list); 263 list_add_tail(&padata->list, &squeue->serial.list);
277 spin_unlock(&squeue->serial.lock); 264 spin_unlock(&squeue->serial.lock);
278 265
279 queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work); 266 queue_work_on(cb_cpu, pinst->wq, &squeue->work);
280 } 267 }
281 268
282 spin_unlock_bh(&pd->lock); 269 spin_unlock_bh(&pd->lock);
@@ -400,7 +387,7 @@ static void padata_init_squeues(struct parallel_data *pd)
400/* Initialize all percpu queues used by parallel workers */ 387/* Initialize all percpu queues used by parallel workers */
401static void padata_init_pqueues(struct parallel_data *pd) 388static void padata_init_pqueues(struct parallel_data *pd)
402{ 389{
403 int cpu_index, num_cpus, cpu; 390 int cpu_index, cpu;
404 struct padata_parallel_queue *pqueue; 391 struct padata_parallel_queue *pqueue;
405 392
406 cpu_index = 0; 393 cpu_index = 0;
@@ -415,9 +402,6 @@ static void padata_init_pqueues(struct parallel_data *pd)
415 INIT_WORK(&pqueue->work, padata_parallel_worker); 402 INIT_WORK(&pqueue->work, padata_parallel_worker);
416 atomic_set(&pqueue->num_obj, 0); 403 atomic_set(&pqueue->num_obj, 0);
417 } 404 }
418
419 num_cpus = cpumask_weight(pd->cpumask.pcpu);
420 pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0;
421} 405}
422 406
423/* Allocate and initialize the internal cpumask dependend resources. */ 407/* Allocate and initialize the internal cpumask dependend resources. */
@@ -444,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
444 padata_init_pqueues(pd); 428 padata_init_pqueues(pd);
445 padata_init_squeues(pd); 429 padata_init_squeues(pd);
446 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); 430 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
447 atomic_set(&pd->seq_nr, -1); 431 pd->seq_nr = 0;
448 atomic_set(&pd->reorder_objects, 0); 432 atomic_set(&pd->reorder_objects, 0);
449 atomic_set(&pd->refcnt, 0); 433 atomic_set(&pd->refcnt, 0);
450 pd->pinst = pinst; 434 pd->pinst = pinst;
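
The padata rework above moves the sequence number into parallel_data and bumps it under pd->seq_lock, letting natural unsigned wrap-around replace the old MAX_SEQ_NR bookkeeping. A small pthread-based sketch of the same round-robin hashing (illustrative types, not the padata structures):

#include <pthread.h>
#include <stdio.h>

struct parallel_state {
        pthread_mutex_t seq_lock;
        unsigned int    seq_nr;
        unsigned int    nr_cpus;
};

static unsigned int cpu_hash(struct parallel_state *pd)
{
        unsigned int index;

        pthread_mutex_lock(&pd->seq_lock);
        index = pd->seq_nr % pd->nr_cpus;       /* seq_nr mod. cpus in use */
        pd->seq_nr++;                           /* unsigned: wraps safely  */
        pthread_mutex_unlock(&pd->seq_lock);

        return index;
}

int main(void)
{
        struct parallel_state pd = {
                .seq_lock = PTHREAD_MUTEX_INITIALIZER,
                .seq_nr   = 0,
                .nr_cpus  = 4,
        };

        for (int i = 0; i < 6; i++)
                printf("object %d -> cpu index %u\n", i, cpu_hash(&pd));
        return 0;
}
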
diff --git a/kernel/params.c b/kernel/params.c
index 4bc965d8a1fe..f37d82631347 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -15,7 +15,6 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18#include <linux/module.h>
19#include <linux/kernel.h> 18#include <linux/kernel.h>
20#include <linux/string.h> 19#include <linux/string.h>
21#include <linux/errno.h> 20#include <linux/errno.h>
@@ -88,6 +87,8 @@ static int parse_one(char *param,
88 char *val, 87 char *val,
89 const struct kernel_param *params, 88 const struct kernel_param *params,
90 unsigned num_params, 89 unsigned num_params,
90 s16 min_level,
91 s16 max_level,
91 int (*handle_unknown)(char *param, char *val)) 92 int (*handle_unknown)(char *param, char *val))
92{ 93{
93 unsigned int i; 94 unsigned int i;
@@ -96,6 +97,9 @@ static int parse_one(char *param,
96 /* Find parameter */ 97 /* Find parameter */
97 for (i = 0; i < num_params; i++) { 98 for (i = 0; i < num_params; i++) {
98 if (parameq(param, params[i].name)) { 99 if (parameq(param, params[i].name)) {
100 if (params[i].level < min_level
101 || params[i].level > max_level)
102 return 0;
99 /* No one handled NULL, so do it here. */ 103 /* No one handled NULL, so do it here. */
100 if (!val && params[i].ops->set != param_set_bool 104 if (!val && params[i].ops->set != param_set_bool
101 && params[i].ops->set != param_set_bint) 105 && params[i].ops->set != param_set_bint)
@@ -175,6 +179,8 @@ int parse_args(const char *name,
175 char *args, 179 char *args,
176 const struct kernel_param *params, 180 const struct kernel_param *params,
177 unsigned num, 181 unsigned num,
182 s16 min_level,
183 s16 max_level,
178 int (*unknown)(char *param, char *val)) 184 int (*unknown)(char *param, char *val))
179{ 185{
180 char *param, *val; 186 char *param, *val;
@@ -190,7 +196,8 @@ int parse_args(const char *name,
190 196
191 args = next_arg(args, &param, &val); 197 args = next_arg(args, &param, &val);
192 irq_was_disabled = irqs_disabled(); 198 irq_was_disabled = irqs_disabled();
193 ret = parse_one(param, val, params, num, unknown); 199 ret = parse_one(param, val, params, num,
200 min_level, max_level, unknown);
194 if (irq_was_disabled && !irqs_disabled()) { 201 if (irq_was_disabled && !irqs_disabled()) {
195 printk(KERN_WARNING "parse_args(): option '%s' enabled " 202 printk(KERN_WARNING "parse_args(): option '%s' enabled "
196 "irq's!\n", param); 203 "irq's!\n", param);
@@ -298,35 +305,18 @@ EXPORT_SYMBOL(param_ops_charp);
298/* Actually could be a bool or an int, for historical reasons. */ 305/* Actually could be a bool or an int, for historical reasons. */
299int param_set_bool(const char *val, const struct kernel_param *kp) 306int param_set_bool(const char *val, const struct kernel_param *kp)
300{ 307{
301 bool v;
302 int ret;
303
304 /* No equals means "set"... */ 308 /* No equals means "set"... */
305 if (!val) val = "1"; 309 if (!val) val = "1";
306 310
307 /* One of =[yYnN01] */ 311 /* One of =[yYnN01] */
308 ret = strtobool(val, &v); 312 return strtobool(val, kp->arg);
309 if (ret)
310 return ret;
311
312 if (kp->flags & KPARAM_ISBOOL)
313 *(bool *)kp->arg = v;
314 else
315 *(int *)kp->arg = v;
316 return 0;
317} 313}
318EXPORT_SYMBOL(param_set_bool); 314EXPORT_SYMBOL(param_set_bool);
319 315
320int param_get_bool(char *buffer, const struct kernel_param *kp) 316int param_get_bool(char *buffer, const struct kernel_param *kp)
321{ 317{
322 bool val;
323 if (kp->flags & KPARAM_ISBOOL)
324 val = *(bool *)kp->arg;
325 else
326 val = *(int *)kp->arg;
327
328 /* Y and N chosen as being relatively non-coder friendly */ 318 /* Y and N chosen as being relatively non-coder friendly */
329 return sprintf(buffer, "%c", val ? 'Y' : 'N'); 319 return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N');
330} 320}
331EXPORT_SYMBOL(param_get_bool); 321EXPORT_SYMBOL(param_get_bool);
332 322
@@ -344,7 +334,6 @@ int param_set_invbool(const char *val, const struct kernel_param *kp)
344 struct kernel_param dummy; 334 struct kernel_param dummy;
345 335
346 dummy.arg = &boolval; 336 dummy.arg = &boolval;
347 dummy.flags = KPARAM_ISBOOL;
348 ret = param_set_bool(val, &dummy); 337 ret = param_set_bool(val, &dummy);
349 if (ret == 0) 338 if (ret == 0)
350 *(bool *)kp->arg = !boolval; 339 *(bool *)kp->arg = !boolval;
@@ -373,7 +362,6 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
373 /* Match bool exactly, by re-using it. */ 362 /* Match bool exactly, by re-using it. */
374 boolkp = *kp; 363 boolkp = *kp;
375 boolkp.arg = &v; 364 boolkp.arg = &v;
376 boolkp.flags |= KPARAM_ISBOOL;
377 365
378 ret = param_set_bool(val, &boolkp); 366 ret = param_set_bool(val, &boolkp);
379 if (ret == 0) 367 if (ret == 0)
@@ -394,7 +382,7 @@ static int param_array(const char *name,
394 unsigned int min, unsigned int max, 382 unsigned int min, unsigned int max,
395 void *elem, int elemsize, 383 void *elem, int elemsize,
396 int (*set)(const char *, const struct kernel_param *kp), 384 int (*set)(const char *, const struct kernel_param *kp),
397 u16 flags, 385 s16 level,
398 unsigned int *num) 386 unsigned int *num)
399{ 387{
400 int ret; 388 int ret;
@@ -404,7 +392,7 @@ static int param_array(const char *name,
404 /* Get the name right for errors. */ 392 /* Get the name right for errors. */
405 kp.name = name; 393 kp.name = name;
406 kp.arg = elem; 394 kp.arg = elem;
407 kp.flags = flags; 395 kp.level = level;
408 396
409 *num = 0; 397 *num = 0;
410 /* We expect a comma-separated list of values. */ 398 /* We expect a comma-separated list of values. */
@@ -445,7 +433,7 @@ static int param_array_set(const char *val, const struct kernel_param *kp)
445 unsigned int temp_num; 433 unsigned int temp_num;
446 434
447 return param_array(kp->name, val, 1, arr->max, arr->elem, 435 return param_array(kp->name, val, 1, arr->max, arr->elem,
448 arr->elemsize, arr->ops->set, kp->flags, 436 arr->elemsize, arr->ops->set, kp->level,
449 arr->num ?: &temp_num); 437 arr->num ?: &temp_num);
450} 438}
451 439
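
parse_one() now skips any known parameter whose level falls outside the caller's [min_level, max_level] window, which is how different parameter-handling passes are kept apart. A compact user-space sketch of that filtering (hypothetical types, not struct kernel_param):

#include <stdio.h>
#include <string.h>

struct demo_param {
        const char *name;
        short       level;
        int        *value;
};

static int parse_one(const char *name, int val,
                     const struct demo_param *params, unsigned int n,
                     short min_level, short max_level)
{
        for (unsigned int i = 0; i < n; i++) {
                if (strcmp(name, params[i].name))
                        continue;
                if (params[i].level < min_level ||
                    params[i].level > max_level)
                        return 0;       /* known, but not for this pass */
                *params[i].value = val;
                return 0;
        }
        return -1;                      /* unknown parameter */
}

int main(void)
{
        int early = 0, late = 0;
        struct demo_param params[] = {
                { "early_opt", -1, &early },
                { "late_opt",   1, &late  },
        };

        /* An "early" pass that only applies parameters with level <= 0: */
        parse_one("early_opt", 7, params, 2, -32768, 0);
        parse_one("late_opt",  9, params, 2, -32768, 0);
        printf("early=%d late=%d\n", early, late);      /* early=7 late=0 */
        return 0;
}
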
diff --git a/kernel/pid.c b/kernel/pid.c
index ce8e00deaccb..9f08dfabaf13 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -543,12 +543,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
543 */ 543 */
544void __init pidhash_init(void) 544void __init pidhash_init(void)
545{ 545{
546 int i, pidhash_size; 546 unsigned int i, pidhash_size;
547 547
548 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, 548 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
549 HASH_EARLY | HASH_SMALL, 549 HASH_EARLY | HASH_SMALL,
550 &pidhash_shift, NULL, 4096); 550 &pidhash_shift, NULL, 4096);
551 pidhash_size = 1 << pidhash_shift; 551 pidhash_size = 1U << pidhash_shift;
552 552
553 for (i = 0; i < pidhash_size; i++) 553 for (i = 0; i < pidhash_size; i++)
554 INIT_HLIST_HEAD(&pid_hash[i]); 554 INIT_HLIST_HEAD(&pid_hash[i]);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a8968396046d..57bc1fd35b3c 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -15,6 +15,7 @@
15#include <linux/acct.h> 15#include <linux/acct.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/reboot.h>
18 19
19#define BITS_PER_PAGE (PAGE_SIZE*8) 20#define BITS_PER_PAGE (PAGE_SIZE*8)
20 21
@@ -168,13 +169,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
168 while (nr > 0) { 169 while (nr > 0) {
169 rcu_read_lock(); 170 rcu_read_lock();
170 171
171 /*
172 * Any nested-container's init processes won't ignore the
173 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
174 */
175 task = pid_task(find_vpid(nr), PIDTYPE_PID); 172 task = pid_task(find_vpid(nr), PIDTYPE_PID);
176 if (task) 173 if (task && !__fatal_signal_pending(task))
177 send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); 174 send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
178 175
179 rcu_read_unlock(); 176 rcu_read_unlock();
180 177
@@ -187,6 +184,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
187 rc = sys_wait4(-1, NULL, __WALL, NULL); 184 rc = sys_wait4(-1, NULL, __WALL, NULL);
188 } while (rc != -ECHILD); 185 } while (rc != -ECHILD);
189 186
187 if (pid_ns->reboot)
188 current->signal->group_exit_code = pid_ns->reboot;
189
190 acct_exit_ns(pid_ns); 190 acct_exit_ns(pid_ns);
191 return; 191 return;
192} 192}
@@ -221,6 +221,35 @@ static struct ctl_table pid_ns_ctl_table[] = {
221 221
222static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; 222static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
223 223
224int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
225{
226 if (pid_ns == &init_pid_ns)
227 return 0;
228
229 switch (cmd) {
230 case LINUX_REBOOT_CMD_RESTART2:
231 case LINUX_REBOOT_CMD_RESTART:
232 pid_ns->reboot = SIGHUP;
233 break;
234
235 case LINUX_REBOOT_CMD_POWER_OFF:
236 case LINUX_REBOOT_CMD_HALT:
237 pid_ns->reboot = SIGINT;
238 break;
239 default:
240 return -EINVAL;
241 }
242
243 read_lock(&tasklist_lock);
244 force_sig(SIGKILL, pid_ns->child_reaper);
245 read_unlock(&tasklist_lock);
246
247 do_exit(0);
248
249 /* Not reached */
250 return 0;
251}
252
224static __init int pid_namespaces_init(void) 253static __init int pid_namespaces_init(void)
225{ 254{
226 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 255 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
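
reboot_pid_ns() above records the request as SIGHUP (restart) or SIGINT (halt/poweroff) and kills the namespace init, and zap_pid_ns_processes() then propagates that value through group_exit_code. Assuming that convention, a container manager in the parent namespace could tell the two cases apart from the init's wait status; a hypothetical manager-side sketch, not part of the patch:

#include <signal.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static void handle_container_exit(pid_t init_pid)
{
        int status;

        if (waitpid(init_pid, &status, 0) < 0)
                return;

        if (WIFSIGNALED(status) && WTERMSIG(status) == SIGHUP)
                printf("container requested reboot -> restart it\n");
        else if (WIFSIGNALED(status) && WTERMSIG(status) == SIGINT)
                printf("container requested halt/poweroff -> leave it down\n");
        else
                printf("container init exited normally or crashed\n");
}

int main(void)
{
        pid_t pid = fork();

        if (pid == 0) {                 /* stand-in for the namespace init */
                raise(SIGHUP);          /* as if reboot had been requested */
                _exit(0);
        }
        handle_container_exit(pid);
        return 0;
}
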
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 07e0e28ffba7..66d808ec5252 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,7 +1,8 @@
1 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
3 3
4obj-$(CONFIG_PM) += main.o qos.o 4obj-y += qos.o
5obj-$(CONFIG_PM) += main.o
5obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o 6obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
6obj-$(CONFIG_FREEZER) += process.o 7obj-$(CONFIG_FREEZER) += process.o
7obj-$(CONFIG_SUSPEND) += suspend.o 8obj-$(CONFIG_SUSPEND) += suspend.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 6d6d28870335..0a186cfde788 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -245,8 +245,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
245 * create_image - Create a hibernation image. 245 * create_image - Create a hibernation image.
246 * @platform_mode: Whether or not to use the platform driver. 246 * @platform_mode: Whether or not to use the platform driver.
247 * 247 *
248 * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image 248 * Execute device drivers' "late" and "noirq" freeze callbacks, create a
249 * and execute the drivers' .thaw_noirq() callbacks. 249 * hibernation image and run the drivers' "noirq" and "early" thaw callbacks.
250 * 250 *
251 * Control reappears in this routine after the subsequent restore. 251 * Control reappears in this routine after the subsequent restore.
252 */ 252 */
@@ -254,7 +254,7 @@ static int create_image(int platform_mode)
254{ 254{
255 int error; 255 int error;
256 256
257 error = dpm_suspend_noirq(PMSG_FREEZE); 257 error = dpm_suspend_end(PMSG_FREEZE);
258 if (error) { 258 if (error) {
259 printk(KERN_ERR "PM: Some devices failed to power down, " 259 printk(KERN_ERR "PM: Some devices failed to power down, "
260 "aborting hibernation\n"); 260 "aborting hibernation\n");
@@ -306,7 +306,7 @@ static int create_image(int platform_mode)
306 Platform_finish: 306 Platform_finish:
307 platform_finish(platform_mode); 307 platform_finish(platform_mode);
308 308
309 dpm_resume_noirq(in_suspend ? 309 dpm_resume_start(in_suspend ?
310 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 310 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
311 311
312 return error; 312 return error;
@@ -343,13 +343,13 @@ int hibernation_snapshot(int platform_mode)
343 * successful freezer test. 343 * successful freezer test.
344 */ 344 */
345 freezer_test_done = true; 345 freezer_test_done = true;
346 goto Cleanup; 346 goto Thaw;
347 } 347 }
348 348
349 error = dpm_prepare(PMSG_FREEZE); 349 error = dpm_prepare(PMSG_FREEZE);
350 if (error) { 350 if (error) {
351 dpm_complete(PMSG_RECOVER); 351 dpm_complete(PMSG_RECOVER);
352 goto Cleanup; 352 goto Thaw;
353 } 353 }
354 354
355 suspend_console(); 355 suspend_console();
@@ -385,6 +385,8 @@ int hibernation_snapshot(int platform_mode)
385 platform_end(platform_mode); 385 platform_end(platform_mode);
386 return error; 386 return error;
387 387
388 Thaw:
389 thaw_kernel_threads();
388 Cleanup: 390 Cleanup:
389 swsusp_free(); 391 swsusp_free();
390 goto Close; 392 goto Close;
@@ -394,16 +396,16 @@ int hibernation_snapshot(int platform_mode)
394 * resume_target_kernel - Restore system state from a hibernation image. 396 * resume_target_kernel - Restore system state from a hibernation image.
395 * @platform_mode: Whether or not to use the platform driver. 397 * @platform_mode: Whether or not to use the platform driver.
396 * 398 *
397 * Execute device drivers' .freeze_noirq() callbacks, restore the contents of 399 * Execute device drivers' "noirq" and "late" freeze callbacks, restore the
398 * highmem that have not been restored yet from the image and run the low-level 400 * contents of highmem that have not been restored yet from the image and run
399 * code that will restore the remaining contents of memory and switch to the 401 * the low-level code that will restore the remaining contents of memory and
400 * just restored target kernel. 402 * switch to the just restored target kernel.
401 */ 403 */
402static int resume_target_kernel(bool platform_mode) 404static int resume_target_kernel(bool platform_mode)
403{ 405{
404 int error; 406 int error;
405 407
406 error = dpm_suspend_noirq(PMSG_QUIESCE); 408 error = dpm_suspend_end(PMSG_QUIESCE);
407 if (error) { 409 if (error) {
408 printk(KERN_ERR "PM: Some devices failed to power down, " 410 printk(KERN_ERR "PM: Some devices failed to power down, "
409 "aborting resume\n"); 411 "aborting resume\n");
@@ -460,7 +462,7 @@ static int resume_target_kernel(bool platform_mode)
460 Cleanup: 462 Cleanup:
461 platform_restore_cleanup(platform_mode); 463 platform_restore_cleanup(platform_mode);
462 464
463 dpm_resume_noirq(PMSG_RECOVER); 465 dpm_resume_start(PMSG_RECOVER);
464 466
465 return error; 467 return error;
466} 468}
@@ -518,7 +520,7 @@ int hibernation_platform_enter(void)
518 goto Resume_devices; 520 goto Resume_devices;
519 } 521 }
520 522
521 error = dpm_suspend_noirq(PMSG_HIBERNATE); 523 error = dpm_suspend_end(PMSG_HIBERNATE);
522 if (error) 524 if (error)
523 goto Resume_devices; 525 goto Resume_devices;
524 526
@@ -549,7 +551,7 @@ int hibernation_platform_enter(void)
549 Platform_finish: 551 Platform_finish:
550 hibernation_ops->finish(); 552 hibernation_ops->finish();
551 553
552 dpm_resume_noirq(PMSG_RESTORE); 554 dpm_resume_start(PMSG_RESTORE);
553 555
554 Resume_devices: 556 Resume_devices:
555 entering_platform_hibernation = false; 557 entering_platform_hibernation = false;
@@ -616,7 +618,7 @@ int hibernate(void)
616 /* Allocate memory management structures */ 618 /* Allocate memory management structures */
617 error = create_basic_memory_bitmaps(); 619 error = create_basic_memory_bitmaps();
618 if (error) 620 if (error)
619 goto Exit; 621 goto Enable_umh;
620 622
621 printk(KERN_INFO "PM: Syncing filesystems ... "); 623 printk(KERN_INFO "PM: Syncing filesystems ... ");
622 sys_sync(); 624 sys_sync();
@@ -624,15 +626,11 @@ int hibernate(void)
624 626
625 error = freeze_processes(); 627 error = freeze_processes();
626 if (error) 628 if (error)
627 goto Finish; 629 goto Free_bitmaps;
628 630
629 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 631 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
630 if (error) 632 if (error || freezer_test_done)
631 goto Thaw;
632 if (freezer_test_done) {
633 freezer_test_done = false;
634 goto Thaw; 633 goto Thaw;
635 }
636 634
637 if (in_suspend) { 635 if (in_suspend) {
638 unsigned int flags = 0; 636 unsigned int flags = 0;
@@ -657,8 +655,13 @@ int hibernate(void)
657 655
658 Thaw: 656 Thaw:
659 thaw_processes(); 657 thaw_processes();
660 Finish: 658
659 /* Don't bother checking whether freezer_test_done is true */
660 freezer_test_done = false;
661
662 Free_bitmaps:
661 free_basic_memory_bitmaps(); 663 free_basic_memory_bitmaps();
664 Enable_umh:
662 usermodehelper_enable(); 665 usermodehelper_enable();
663 Exit: 666 Exit:
664 pm_notifier_call_chain(PM_POST_HIBERNATION); 667 pm_notifier_call_chain(PM_POST_HIBERNATION);
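
The relabelled error paths above (Thaw, Free_bitmaps, Enable_umh) follow the usual goto-ladder shape: each failing step jumps to a label that undoes only what was already set up, in reverse order. A generic sketch of the pattern (not the hibernate code itself):

#include <stdio.h>
#include <stdlib.h>

static int do_transaction(void)
{
        int err = -1;
        char *bitmaps, *image;

        bitmaps = malloc(64);                   /* step 1 */
        if (!bitmaps)
                goto out;                       /* nothing to undo yet */

        image = malloc(256);                    /* step 2 */
        if (!image)
                goto free_bitmaps;              /* undo step 1 only */

        /* ... the real work would happen here ... */
        err = 0;

        free(image);
free_bitmaps:
        free(bitmaps);
out:
        return err;
}

int main(void)
{
        printf("do_transaction() = %d\n", do_transaction());
        return 0;
}
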
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 9824b41e5a18..1c12581f1c62 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -165,16 +165,20 @@ static int suspend_stats_show(struct seq_file *s, void *unused)
165 last_errno %= REC_FAILED_NUM; 165 last_errno %= REC_FAILED_NUM;
166 last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; 166 last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
167 last_step %= REC_FAILED_NUM; 167 last_step %= REC_FAILED_NUM;
168 seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" 168 seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
169 "%s: %d\n%s: %d\n%s: %d\n%s: %d\n", 169 "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
170 "success", suspend_stats.success, 170 "success", suspend_stats.success,
171 "fail", suspend_stats.fail, 171 "fail", suspend_stats.fail,
172 "failed_freeze", suspend_stats.failed_freeze, 172 "failed_freeze", suspend_stats.failed_freeze,
173 "failed_prepare", suspend_stats.failed_prepare, 173 "failed_prepare", suspend_stats.failed_prepare,
174 "failed_suspend", suspend_stats.failed_suspend, 174 "failed_suspend", suspend_stats.failed_suspend,
175 "failed_suspend_late",
176 suspend_stats.failed_suspend_late,
175 "failed_suspend_noirq", 177 "failed_suspend_noirq",
176 suspend_stats.failed_suspend_noirq, 178 suspend_stats.failed_suspend_noirq,
177 "failed_resume", suspend_stats.failed_resume, 179 "failed_resume", suspend_stats.failed_resume,
180 "failed_resume_early",
181 suspend_stats.failed_resume_early,
178 "failed_resume_noirq", 182 "failed_resume_noirq",
179 suspend_stats.failed_resume_noirq); 183 suspend_stats.failed_resume_noirq);
180 seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", 184 seq_printf(s, "failures:\n last_failed_dev:\t%-s\n",
@@ -287,16 +291,10 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
287 291
288#ifdef CONFIG_SUSPEND 292#ifdef CONFIG_SUSPEND
289 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { 293 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
290 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) 294 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) {
295 error = pm_suspend(state);
291 break; 296 break;
292 } 297 }
293 if (state < PM_SUSPEND_MAX && *s) {
294 error = enter_state(state);
295 if (error) {
296 suspend_stats.fail++;
297 dpm_save_failed_errno(error);
298 } else
299 suspend_stats.success++;
300 } 298 }
301#endif 299#endif
302 300
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 21724eee5206..98f3622d7407 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -177,13 +177,11 @@ extern const char *const pm_states[];
177 177
178extern bool valid_state(suspend_state_t state); 178extern bool valid_state(suspend_state_t state);
179extern int suspend_devices_and_enter(suspend_state_t state); 179extern int suspend_devices_and_enter(suspend_state_t state);
180extern int enter_state(suspend_state_t state);
181#else /* !CONFIG_SUSPEND */ 180#else /* !CONFIG_SUSPEND */
182static inline int suspend_devices_and_enter(suspend_state_t state) 181static inline int suspend_devices_and_enter(suspend_state_t state)
183{ 182{
184 return -ENOSYS; 183 return -ENOSYS;
185} 184}
186static inline int enter_state(suspend_state_t state) { return -ENOSYS; }
187static inline bool valid_state(suspend_state_t state) { return false; } 185static inline bool valid_state(suspend_state_t state) { return false; }
188#endif /* !CONFIG_SUSPEND */ 186#endif /* !CONFIG_SUSPEND */
189 187
@@ -234,16 +232,14 @@ static inline int suspend_freeze_processes(void)
234 int error; 232 int error;
235 233
236 error = freeze_processes(); 234 error = freeze_processes();
237
238 /* 235 /*
239 * freeze_processes() automatically thaws every task if freezing 236 * freeze_processes() automatically thaws every task if freezing
240 * fails. So we need not do anything extra upon error. 237 * fails. So we need not do anything extra upon error.
241 */ 238 */
242 if (error) 239 if (error)
243 goto Finish; 240 return error;
244 241
245 error = freeze_kernel_threads(); 242 error = freeze_kernel_threads();
246
247 /* 243 /*
248 * freeze_kernel_threads() thaws only kernel threads upon freezing 244 * freeze_kernel_threads() thaws only kernel threads upon freezing
249 * failure. So we have to thaw the userspace tasks ourselves. 245 * failure. So we have to thaw the userspace tasks ourselves.
@@ -251,7 +247,6 @@ static inline int suspend_freeze_processes(void)
251 if (error) 247 if (error)
252 thaw_processes(); 248 thaw_processes();
253 249
254 Finish:
255 return error; 250 return error;
256} 251}
257 252
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 7e426459e60a..0d2aeb226108 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -53,11 +53,9 @@ static int try_to_freeze_tasks(bool user_only)
53 * It is "frozen enough". If the task does wake 53 * It is "frozen enough". If the task does wake
54 * up, it will immediately call try_to_freeze. 54 * up, it will immediately call try_to_freeze.
55 * 55 *
56 * Because freeze_task() goes through p's 56 * Because freeze_task() goes through p's scheduler lock, it's
57 * scheduler lock after setting TIF_FREEZE, it's 57 * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING
58 * guaranteed that either we see TASK_RUNNING or 58 * transition can't race with task state testing here.
59 * try_to_stop() after schedule() in ptrace/signal
60 * stop sees TIF_FREEZE.
61 */ 59 */
62 if (!task_is_stopped_or_traced(p) && 60 if (!task_is_stopped_or_traced(p) &&
63 !freezer_should_skip(p)) 61 !freezer_should_skip(p))
@@ -98,13 +96,15 @@ static int try_to_freeze_tasks(bool user_only)
98 elapsed_csecs / 100, elapsed_csecs % 100, 96 elapsed_csecs / 100, elapsed_csecs % 100,
99 todo - wq_busy, wq_busy); 97 todo - wq_busy, wq_busy);
100 98
101 read_lock(&tasklist_lock); 99 if (!wakeup) {
102 do_each_thread(g, p) { 100 read_lock(&tasklist_lock);
103 if (!wakeup && !freezer_should_skip(p) && 101 do_each_thread(g, p) {
104 p != current && freezing(p) && !frozen(p)) 102 if (p != current && !freezer_should_skip(p)
105 sched_show_task(p); 103 && freezing(p) && !frozen(p))
106 } while_each_thread(g, p); 104 sched_show_task(p);
107 read_unlock(&tasklist_lock); 105 } while_each_thread(g, p);
106 read_unlock(&tasklist_lock);
107 }
108 } else { 108 } else {
109 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, 109 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100,
110 elapsed_csecs % 100); 110 elapsed_csecs % 100);
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 995e3bd3417b..d6d6dbd1ecc0 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -469,21 +469,18 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
469static int __init pm_qos_power_init(void) 469static int __init pm_qos_power_init(void)
470{ 470{
471 int ret = 0; 471 int ret = 0;
472 int i;
472 473
473 ret = register_pm_qos_misc(&cpu_dma_pm_qos); 474 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
474 if (ret < 0) { 475
475 printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n"); 476 for (i = 1; i < PM_QOS_NUM_CLASSES; i++) {
476 return ret; 477 ret = register_pm_qos_misc(pm_qos_array[i]);
477 } 478 if (ret < 0) {
478 ret = register_pm_qos_misc(&network_lat_pm_qos); 479 printk(KERN_ERR "pm_qos_param: %s setup failed\n",
479 if (ret < 0) { 480 pm_qos_array[i]->name);
480 printk(KERN_ERR "pm_qos_param: network_latency setup failed\n"); 481 return ret;
481 return ret; 482 }
482 } 483 }
483 ret = register_pm_qos_misc(&network_throughput_pm_qos);
484 if (ret < 0)
485 printk(KERN_ERR
486 "pm_qos_param: network_throughput setup failed\n");
487 484
488 return ret; 485 return ret;
489} 486}
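
pm_qos_power_init() now walks pm_qos_array[] instead of open-coding one register call per class, with BUILD_BUG_ON() keeping the table and the enum in sync. A user-space sketch of the same table-driven registration, with _Static_assert standing in for BUILD_BUG_ON (illustrative types; only the class names come from the original code):

#include <stdio.h>

enum qos_class {
        QOS_RESERVED,
        QOS_CPU_DMA,
        QOS_NET_LAT,
        QOS_NET_TPUT,
        QOS_NUM_CLASSES
};

struct qos_desc {
        const char *name;
};

static const struct qos_desc qos_table[] = {
        [QOS_RESERVED] = { NULL },
        [QOS_CPU_DMA]  = { "cpu_dma_latency" },
        [QOS_NET_LAT]  = { "network_latency" },
        [QOS_NET_TPUT] = { "network_throughput" },
};

static int register_one(const struct qos_desc *d)
{
        printf("registered %s\n", d->name);
        return 0;
}

int main(void)
{
        _Static_assert(sizeof(qos_table) / sizeof(qos_table[0]) ==
                       QOS_NUM_CLASSES, "table out of sync with enum");

        for (int i = 1; i < QOS_NUM_CLASSES; i++) {
                int ret = register_one(&qos_table[i]);

                if (ret < 0) {
                        fprintf(stderr, "%s setup failed\n",
                                qos_table[i].name);
                        return ret;
                }
        }
        return 0;
}
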
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 6a768e537001..0de28576807d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -711,9 +711,10 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
711 list_for_each_entry(region, &nosave_regions, list) { 711 list_for_each_entry(region, &nosave_regions, list) {
712 unsigned long pfn; 712 unsigned long pfn;
713 713
714 pr_debug("PM: Marking nosave pages: %016lx - %016lx\n", 714 pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n",
715 region->start_pfn << PAGE_SHIFT, 715 (unsigned long long) region->start_pfn << PAGE_SHIFT,
716 region->end_pfn << PAGE_SHIFT); 716 ((unsigned long long) region->end_pfn << PAGE_SHIFT)
717 - 1);
717 718
718 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) 719 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
719 if (pfn_valid(pfn)) { 720 if (pfn_valid(pfn)) {
@@ -1000,20 +1001,20 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
1000 s_page = pfn_to_page(src_pfn); 1001 s_page = pfn_to_page(src_pfn);
1001 d_page = pfn_to_page(dst_pfn); 1002 d_page = pfn_to_page(dst_pfn);
1002 if (PageHighMem(s_page)) { 1003 if (PageHighMem(s_page)) {
1003 src = kmap_atomic(s_page, KM_USER0); 1004 src = kmap_atomic(s_page);
1004 dst = kmap_atomic(d_page, KM_USER1); 1005 dst = kmap_atomic(d_page);
1005 do_copy_page(dst, src); 1006 do_copy_page(dst, src);
1006 kunmap_atomic(dst, KM_USER1); 1007 kunmap_atomic(dst);
1007 kunmap_atomic(src, KM_USER0); 1008 kunmap_atomic(src);
1008 } else { 1009 } else {
1009 if (PageHighMem(d_page)) { 1010 if (PageHighMem(d_page)) {
1010 /* Page pointed to by src may contain some kernel 1011 /* Page pointed to by src may contain some kernel
1011 * data modified by kmap_atomic() 1012 * data modified by kmap_atomic()
1012 */ 1013 */
1013 safe_copy_page(buffer, s_page); 1014 safe_copy_page(buffer, s_page);
1014 dst = kmap_atomic(d_page, KM_USER0); 1015 dst = kmap_atomic(d_page);
1015 copy_page(dst, buffer); 1016 copy_page(dst, buffer);
1016 kunmap_atomic(dst, KM_USER0); 1017 kunmap_atomic(dst);
1017 } else { 1018 } else {
1018 safe_copy_page(page_address(d_page), s_page); 1019 safe_copy_page(page_address(d_page), s_page);
1019 } 1020 }
@@ -1728,9 +1729,9 @@ int snapshot_read_next(struct snapshot_handle *handle)
1728 */ 1729 */
1729 void *kaddr; 1730 void *kaddr;
1730 1731
1731 kaddr = kmap_atomic(page, KM_USER0); 1732 kaddr = kmap_atomic(page);
1732 copy_page(buffer, kaddr); 1733 copy_page(buffer, kaddr);
1733 kunmap_atomic(kaddr, KM_USER0); 1734 kunmap_atomic(kaddr);
1734 handle->buffer = buffer; 1735 handle->buffer = buffer;
1735 } else { 1736 } else {
1736 handle->buffer = page_address(page); 1737 handle->buffer = page_address(page);
@@ -2014,9 +2015,9 @@ static void copy_last_highmem_page(void)
2014 if (last_highmem_page) { 2015 if (last_highmem_page) {
2015 void *dst; 2016 void *dst;
2016 2017
2017 dst = kmap_atomic(last_highmem_page, KM_USER0); 2018 dst = kmap_atomic(last_highmem_page);
2018 copy_page(dst, buffer); 2019 copy_page(dst, buffer);
2019 kunmap_atomic(dst, KM_USER0); 2020 kunmap_atomic(dst);
2020 last_highmem_page = NULL; 2021 last_highmem_page = NULL;
2021 } 2022 }
2022} 2023}
@@ -2309,13 +2310,13 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
2309{ 2310{
2310 void *kaddr1, *kaddr2; 2311 void *kaddr1, *kaddr2;
2311 2312
2312 kaddr1 = kmap_atomic(p1, KM_USER0); 2313 kaddr1 = kmap_atomic(p1);
2313 kaddr2 = kmap_atomic(p2, KM_USER1); 2314 kaddr2 = kmap_atomic(p2);
2314 copy_page(buf, kaddr1); 2315 copy_page(buf, kaddr1);
2315 copy_page(kaddr1, kaddr2); 2316 copy_page(kaddr1, kaddr2);
2316 copy_page(kaddr2, buf); 2317 copy_page(kaddr2, buf);
2317 kunmap_atomic(kaddr2, KM_USER1); 2318 kunmap_atomic(kaddr2);
2318 kunmap_atomic(kaddr1, KM_USER0); 2319 kunmap_atomic(kaddr1);
2319} 2320}
2320 2321
2321/** 2322/**
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4fd51beed879..88e5c967370d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -37,8 +37,8 @@ const char *const pm_states[PM_SUSPEND_MAX] = {
37static const struct platform_suspend_ops *suspend_ops; 37static const struct platform_suspend_ops *suspend_ops;
38 38
39/** 39/**
40 * suspend_set_ops - Set the global suspend method table. 40 * suspend_set_ops - Set the global suspend method table.
41 * @ops: Pointer to ops structure. 41 * @ops: Suspend operations to use.
42 */ 42 */
43void suspend_set_ops(const struct platform_suspend_ops *ops) 43void suspend_set_ops(const struct platform_suspend_ops *ops)
44{ 44{
@@ -58,11 +58,11 @@ bool valid_state(suspend_state_t state)
58} 58}
59 59
60/** 60/**
61 * suspend_valid_only_mem - generic memory-only valid callback 61 * suspend_valid_only_mem - Generic memory-only valid callback.
62 * 62 *
63 * Platform drivers that implement mem suspend only and only need 63 * Platform drivers that implement mem suspend only and only need to check for
64 * to check for that in their .valid callback can use this instead 64 * that in their .valid() callback can use this instead of rolling their own
65 * of rolling their own .valid callback. 65 * .valid() callback.
66 */ 66 */
67int suspend_valid_only_mem(suspend_state_t state) 67int suspend_valid_only_mem(suspend_state_t state)
68{ 68{
@@ -83,10 +83,11 @@ static int suspend_test(int level)
83} 83}
84 84
85/** 85/**
86 * suspend_prepare - Do prep work before entering low-power state. 86 * suspend_prepare - Prepare for entering system sleep state.
87 * 87 *
88 * This is common code that is called for each state that we're entering. 88 * Common code run for every system sleep state that can be entered (except for
89 * Run suspend notifiers, allocate a console and stop all processes. 89 * hibernation). Run suspend notifiers, allocate the "suspend" console and
90 * freeze processes.
90 */ 91 */
91static int suspend_prepare(void) 92static int suspend_prepare(void)
92{ 93{
@@ -131,9 +132,9 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
131} 132}
132 133
133/** 134/**
134 * suspend_enter - enter the desired system sleep state. 135 * suspend_enter - Make the system enter the given sleep state.
135 * @state: State to enter 136 * @state: System sleep state to enter.
136 * @wakeup: Returns information that suspend should not be entered again. 137 * @wakeup: Returns information that the sleep state should not be re-entered.
137 * 138 *
138 * This function should be called after devices have been suspended. 139 * This function should be called after devices have been suspended.
139 */ 140 */
@@ -147,7 +148,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
147 goto Platform_finish; 148 goto Platform_finish;
148 } 149 }
149 150
150 error = dpm_suspend_noirq(PMSG_SUSPEND); 151 error = dpm_suspend_end(PMSG_SUSPEND);
151 if (error) { 152 if (error) {
152 printk(KERN_ERR "PM: Some devices failed to power down\n"); 153 printk(KERN_ERR "PM: Some devices failed to power down\n");
153 goto Platform_finish; 154 goto Platform_finish;
@@ -189,7 +190,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
189 if (suspend_ops->wake) 190 if (suspend_ops->wake)
190 suspend_ops->wake(); 191 suspend_ops->wake();
191 192
192 dpm_resume_noirq(PMSG_RESUME); 193 dpm_resume_start(PMSG_RESUME);
193 194
194 Platform_finish: 195 Platform_finish:
195 if (suspend_ops->finish) 196 if (suspend_ops->finish)
@@ -199,9 +200,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
199} 200}
200 201
201/** 202/**
202 * suspend_devices_and_enter - suspend devices and enter the desired system 203 * suspend_devices_and_enter - Suspend devices and enter system sleep state.
203 * sleep state. 204 * @state: System sleep state to enter.
204 * @state: state to enter
205 */ 205 */
206int suspend_devices_and_enter(suspend_state_t state) 206int suspend_devices_and_enter(suspend_state_t state)
207{ 207{
@@ -251,10 +251,10 @@ int suspend_devices_and_enter(suspend_state_t state)
251} 251}
252 252
253/** 253/**
254 * suspend_finish - Do final work before exiting suspend sequence. 254 * suspend_finish - Clean up before finishing the suspend sequence.
255 * 255 *
256 * Call platform code to clean up, restart processes, and free the 256 * Call platform code to clean up, restart processes, and free the console that
257 * console that we've allocated. This is not called for suspend-to-disk. 257 * we've allocated. This routine is not called for hibernation.
258 */ 258 */
259static void suspend_finish(void) 259static void suspend_finish(void)
260{ 260{
@@ -265,16 +265,14 @@ static void suspend_finish(void)
265} 265}
266 266
267/** 267/**
268 * enter_state - Do common work of entering low-power state. 268 * enter_state - Do common work needed to enter system sleep state.
269 * @state: pm_state structure for state we're entering. 269 * @state: System sleep state to enter.
270 * 270 *
271 * Make sure we're the only ones trying to enter a sleep state. Fail 271 * Make sure that no one else is trying to put the system into a sleep state.
272 * if someone has beat us to it, since we don't want anything weird to 272 * Fail if that's not the case. Otherwise, prepare for system suspend, make the
273 * happen when we wake up. 273 * system enter the given sleep state and clean up after wakeup.
274 * Then, do the setup for suspend, enter the state, and cleaup (after
275 * we've woken up).
276 */ 274 */
277int enter_state(suspend_state_t state) 275static int enter_state(suspend_state_t state)
278{ 276{
279 int error; 277 int error;
280 278
@@ -310,24 +308,26 @@ int enter_state(suspend_state_t state)
310} 308}
311 309
312/** 310/**
313 * pm_suspend - Externally visible function for suspending system. 311 * pm_suspend - Externally visible function for suspending the system.
314 * @state: Enumerated value of state to enter. 312 * @state: System sleep state to enter.
315 * 313 *
316 * Determine whether or not value is within range, get state 314 * Check if the value of @state represents one of the supported states,
317 * structure, and enter (above). 315 * execute enter_state() and update system suspend statistics.
318 */ 316 */
319int pm_suspend(suspend_state_t state) 317int pm_suspend(suspend_state_t state)
320{ 318{
321 int ret; 319 int error;
322 if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { 320
323 ret = enter_state(state); 321 if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
324 if (ret) { 322 return -EINVAL;
325 suspend_stats.fail++; 323
326 dpm_save_failed_errno(ret); 324 error = enter_state(state);
327 } else 325 if (error) {
328 suspend_stats.success++; 326 suspend_stats.fail++;
329 return ret; 327 dpm_save_failed_errno(error);
328 } else {
329 suspend_stats.success++;
330 } 330 }
331 return -EINVAL; 331 return error;
332} 332}
333EXPORT_SYMBOL(pm_suspend); 333EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 3e100075b13c..33c4329205af 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -249,16 +249,10 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
249 } 249 }
250 pm_restore_gfp_mask(); 250 pm_restore_gfp_mask();
251 error = hibernation_snapshot(data->platform_support); 251 error = hibernation_snapshot(data->platform_support);
252 if (error) { 252 if (!error) {
253 thaw_kernel_threads();
254 } else {
255 error = put_user(in_suspend, (int __user *)arg); 253 error = put_user(in_suspend, (int __user *)arg);
256 if (!error && !freezer_test_done) 254 data->ready = !freezer_test_done && !error;
257 data->ready = 1; 255 freezer_test_done = false;
258 if (freezer_test_done) {
259 freezer_test_done = false;
260 thaw_kernel_threads();
261 }
262 } 256 }
263 break; 257 break;
264 258
diff --git a/kernel/printk.c b/kernel/printk.c
index 13c0a1143f49..b663c2c95d39 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -44,6 +44,9 @@
44 44
45#include <asm/uaccess.h> 45#include <asm/uaccess.h>
46 46
47#define CREATE_TRACE_POINTS
48#include <trace/events/printk.h>
49
47/* 50/*
48 * Architectures can override it: 51 * Architectures can override it:
49 */ 52 */
@@ -542,6 +545,8 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
542static void _call_console_drivers(unsigned start, 545static void _call_console_drivers(unsigned start,
543 unsigned end, int msg_log_level) 546 unsigned end, int msg_log_level)
544{ 547{
548 trace_console(&LOG_BUF(0), start, end, log_buf_len);
549
545 if ((msg_log_level < console_loglevel || ignore_loglevel) && 550 if ((msg_log_level < console_loglevel || ignore_loglevel) &&
546 console_drivers && start != end) { 551 console_drivers && start != end) {
547 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { 552 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
@@ -702,6 +707,9 @@ static bool printk_time = 0;
702#endif 707#endif
703module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 708module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
704 709
710static bool always_kmsg_dump;
711module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
712
705/* Check if we have any console registered that can be called early in boot. */ 713/* Check if we have any console registered that can be called early in boot. */
706static int have_callable_console(void) 714static int have_callable_console(void)
707{ 715{
@@ -1208,13 +1216,27 @@ int is_console_locked(void)
1208 return console_locked; 1216 return console_locked;
1209} 1217}
1210 1218
1219/*
1220 * Delayed printk facility, for scheduler-internal messages:
1221 */
1222#define PRINTK_BUF_SIZE 512
1223
1224#define PRINTK_PENDING_WAKEUP 0x01
1225#define PRINTK_PENDING_SCHED 0x02
1226
1211static DEFINE_PER_CPU(int, printk_pending); 1227static DEFINE_PER_CPU(int, printk_pending);
1228static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
1212 1229
1213void printk_tick(void) 1230void printk_tick(void)
1214{ 1231{
1215 if (__this_cpu_read(printk_pending)) { 1232 if (__this_cpu_read(printk_pending)) {
1216 __this_cpu_write(printk_pending, 0); 1233 int pending = __this_cpu_xchg(printk_pending, 0);
1217 wake_up_interruptible(&log_wait); 1234 if (pending & PRINTK_PENDING_SCHED) {
1235 char *buf = __get_cpu_var(printk_sched_buf);
1236 printk(KERN_WARNING "[sched_delayed] %s", buf);
1237 }
1238 if (pending & PRINTK_PENDING_WAKEUP)
1239 wake_up_interruptible(&log_wait);
1218 } 1240 }
1219} 1241}
1220 1242
@@ -1228,7 +1250,7 @@ int printk_needs_cpu(int cpu)
1228void wake_up_klogd(void) 1250void wake_up_klogd(void)
1229{ 1251{
1230 if (waitqueue_active(&log_wait)) 1252 if (waitqueue_active(&log_wait))
1231 this_cpu_write(printk_pending, 1); 1253 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
1232} 1254}
1233 1255
1234/** 1256/**
@@ -1621,6 +1643,26 @@ late_initcall(printk_late_init);
1621 1643
1622#if defined CONFIG_PRINTK 1644#if defined CONFIG_PRINTK
1623 1645
1646int printk_sched(const char *fmt, ...)
1647{
1648 unsigned long flags;
1649 va_list args;
1650 char *buf;
1651 int r;
1652
1653 local_irq_save(flags);
1654 buf = __get_cpu_var(printk_sched_buf);
1655
1656 va_start(args, fmt);
1657 r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args);
1658 va_end(args);
1659
1660 __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED);
1661 local_irq_restore(flags);
1662
1663 return r;
1664}
1665
1624/* 1666/*
1625 * printk rate limiting, lifted from the networking subsystem. 1667 * printk rate limiting, lifted from the networking subsystem.
1626 * 1668 *
@@ -1732,6 +1774,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1732 unsigned long l1, l2; 1774 unsigned long l1, l2;
1733 unsigned long flags; 1775 unsigned long flags;
1734 1776
1777 if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump)
1778 return;
1779
1735 /* Theoretically, the log could move on after we do this, but 1780 /* Theoretically, the log could move on after we do this, but
1736 there's not a lot we can do about that. The new messages 1781 there's not a lot we can do about that. The new messages
1737 will overwrite the start of what we dump. */ 1782 will overwrite the start of what we dump. */
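
The printk.c hunks add three independent pieces: a console tracepoint, the always_kmsg_dump parameter (without it, kmsg_dump() now returns early for anything milder than an oops), and a deferred-printk path for scheduler code. On that path, printk_sched() formats into the per-CPU printk_sched_buf and sets PRINTK_PENDING_SCHED; printk_tick() then emits the text on the next tick with a KERN_WARNING "[sched_delayed] " prefix. A hedged usage sketch; the caller and message below are illustrative, not part of this patch, and assume the matching printk_sched() declaration from the same series:

        /* In a path that must not wake the console directly (e.g. under rq lock): */
        static void report_rt_throttling(unsigned long long wasted_ns)
        {
                /*
                 * Copied into this CPU's 512-byte printk_sched_buf; only the
                 * most recent message per CPU survives until printk_tick().
                 */
                printk_sched("RT throttling activated (%llu ns)\n", wasted_ns);
        }
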
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 00ab2ca5ed11..ee8d49b9c309 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -231,26 +231,22 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
231} 231}
232 232
233static int ptrace_attach(struct task_struct *task, long request, 233static int ptrace_attach(struct task_struct *task, long request,
234 unsigned long addr,
234 unsigned long flags) 235 unsigned long flags)
235{ 236{
236 bool seize = (request == PTRACE_SEIZE); 237 bool seize = (request == PTRACE_SEIZE);
237 int retval; 238 int retval;
238 239
239 /*
240 * SEIZE will enable new ptrace behaviors which will be implemented
241 * gradually. SEIZE_DEVEL is used to prevent applications
242 * expecting full SEIZE behaviors trapping on kernel commits which
243 * are still in the process of implementing them.
244 *
245 * Only test programs for new ptrace behaviors being implemented
246 * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO.
247 *
248 * Once SEIZE behaviors are completely implemented, this flag and
249 * the following test will be removed.
250 */
251 retval = -EIO; 240 retval = -EIO;
252 if (seize && !(flags & PTRACE_SEIZE_DEVEL)) 241 if (seize) {
253 goto out; 242 if (addr != 0)
243 goto out;
244 if (flags & ~(unsigned long)PTRACE_O_MASK)
245 goto out;
246 flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
247 } else {
248 flags = PT_PTRACED;
249 }
254 250
255 audit_ptrace(task); 251 audit_ptrace(task);
256 252
@@ -262,7 +258,7 @@ static int ptrace_attach(struct task_struct *task, long request,
262 258
263 /* 259 /*
264 * Protect exec's credential calculations against our interference; 260 * Protect exec's credential calculations against our interference;
265 * interference; SUID, SGID and LSM creds get determined differently 261 * SUID, SGID and LSM creds get determined differently
266 * under ptrace. 262 * under ptrace.
267 */ 263 */
268 retval = -ERESTARTNOINTR; 264 retval = -ERESTARTNOINTR;
@@ -282,11 +278,11 @@ static int ptrace_attach(struct task_struct *task, long request,
282 if (task->ptrace) 278 if (task->ptrace)
283 goto unlock_tasklist; 279 goto unlock_tasklist;
284 280
285 task->ptrace = PT_PTRACED;
286 if (seize) 281 if (seize)
287 task->ptrace |= PT_SEIZED; 282 flags |= PT_SEIZED;
288 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) 283 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
289 task->ptrace |= PT_PTRACE_CAP; 284 flags |= PT_PTRACE_CAP;
285 task->ptrace = flags;
290 286
291 __ptrace_link(task, current); 287 __ptrace_link(task, current);
292 288
@@ -528,30 +524,18 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
528 524
529static int ptrace_setoptions(struct task_struct *child, unsigned long data) 525static int ptrace_setoptions(struct task_struct *child, unsigned long data)
530{ 526{
531 child->ptrace &= ~PT_TRACE_MASK; 527 unsigned flags;
532 528
533 if (data & PTRACE_O_TRACESYSGOOD) 529 if (data & ~(unsigned long)PTRACE_O_MASK)
534 child->ptrace |= PT_TRACESYSGOOD; 530 return -EINVAL;
535
536 if (data & PTRACE_O_TRACEFORK)
537 child->ptrace |= PT_TRACE_FORK;
538
539 if (data & PTRACE_O_TRACEVFORK)
540 child->ptrace |= PT_TRACE_VFORK;
541
542 if (data & PTRACE_O_TRACECLONE)
543 child->ptrace |= PT_TRACE_CLONE;
544
545 if (data & PTRACE_O_TRACEEXEC)
546 child->ptrace |= PT_TRACE_EXEC;
547
548 if (data & PTRACE_O_TRACEVFORKDONE)
549 child->ptrace |= PT_TRACE_VFORK_DONE;
550 531
551 if (data & PTRACE_O_TRACEEXIT) 532 /* Avoid intermediate state when all opts are cleared */
552 child->ptrace |= PT_TRACE_EXIT; 533 flags = child->ptrace;
534 flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
535 flags |= (data << PT_OPT_FLAG_SHIFT);
536 child->ptrace = flags;
553 537
554 return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; 538 return 0;
555} 539}
556 540
557static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) 541static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
@@ -891,7 +875,7 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
891 } 875 }
892 876
893 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { 877 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
894 ret = ptrace_attach(child, request, data); 878 ret = ptrace_attach(child, request, addr, data);
895 /* 879 /*
896 * Some architectures need to do book-keeping after 880 * Some architectures need to do book-keeping after
897 * a ptrace attach. 881 * a ptrace attach.
@@ -1034,7 +1018,7 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
1034 } 1018 }
1035 1019
1036 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { 1020 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
1037 ret = ptrace_attach(child, request, data); 1021 ret = ptrace_attach(child, request, addr, data);
1038 /* 1022 /*
1039 * Some architectures need to do book-keeping after 1023 * Some architectures need to do book-keeping after
1040 * a ptrace attach. 1024 * a ptrace attach.
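
The ptrace changes fold the per-option if chain into a single shift: both ptrace_attach() (for PTRACE_SEIZE) and ptrace_setoptions() validate the user-supplied word against PTRACE_O_MASK and then install it as (data << PT_OPT_FLAG_SHIFT), which works because each PT_* event flag in task->ptrace is the corresponding PTRACE_O_* bit shifted by PT_OPT_FLAG_SHIFT. A hypothetical helper, not part of the patch, just to spell the invariant out:

        /* Assumes the caller has already rejected bits outside PTRACE_O_MASK. */
        static inline unsigned long pt_flags_from_options(unsigned long data)
        {
                /* PT_TRACESYSGOOD == PTRACE_O_TRACESYSGOOD << PT_OPT_FLAG_SHIFT, etc. */
                return PT_PTRACED | (data << PT_OPT_FLAG_SHIFT);
        }

ptrace_setoptions() additionally rebuilds the word in a local variable and assigns it once, so a tracee never observes a transient state with all options cleared.
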
diff --git a/kernel/rcu.h b/kernel/rcu.h
index aa88baab5f78..8ba99cdc6515 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -33,8 +33,27 @@
33 * Process-level increment to ->dynticks_nesting field. This allows for 33 * Process-level increment to ->dynticks_nesting field. This allows for
34 * architectures that use half-interrupts and half-exceptions from 34 * architectures that use half-interrupts and half-exceptions from
35 * process context. 35 * process context.
36 *
37 * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH
38 * that counts the number of process-based reasons why RCU cannot
39 * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE
40 * is the value used to increment or decrement this field.
41 *
42 * The rest of the bits could in principle be used to count interrupts,
43 * but this would mean that a negative-one value in the interrupt
44 * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field.
45 * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK
46 * that is set to DYNTICK_TASK_FLAG upon initial exit from idle.
47 * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon
48 * initial exit from idle.
36 */ 49 */
37#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1) 50#define DYNTICK_TASK_NEST_WIDTH 7
51#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1)
52#define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1)
53#define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2)
54#define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3)
55#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \
56 DYNTICK_TASK_FLAG)
38 57
39/* 58/*
40 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally 59 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
@@ -50,7 +69,6 @@ extern struct debug_obj_descr rcuhead_debug_descr;
50 69
51static inline void debug_rcu_head_queue(struct rcu_head *head) 70static inline void debug_rcu_head_queue(struct rcu_head *head)
52{ 71{
53 WARN_ON_ONCE((unsigned long)head & 0x3);
54 debug_object_activate(head, &rcuhead_debug_descr); 72 debug_object_activate(head, &rcuhead_debug_descr);
55 debug_object_active_state(head, &rcuhead_debug_descr, 73 debug_object_active_state(head, &rcuhead_debug_descr,
56 STATE_RCU_HEAD_READY, 74 STATE_RCU_HEAD_READY,
@@ -76,16 +94,18 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
76 94
77extern void kfree(const void *); 95extern void kfree(const void *);
78 96
79static inline void __rcu_reclaim(char *rn, struct rcu_head *head) 97static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
80{ 98{
81 unsigned long offset = (unsigned long)head->func; 99 unsigned long offset = (unsigned long)head->func;
82 100
83 if (__is_kfree_rcu_offset(offset)) { 101 if (__is_kfree_rcu_offset(offset)) {
84 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); 102 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
85 kfree((void *)head - offset); 103 kfree((void *)head - offset);
104 return 1;
86 } else { 105 } else {
87 RCU_TRACE(trace_rcu_invoke_callback(rn, head)); 106 RCU_TRACE(trace_rcu_invoke_callback(rn, head));
88 head->func(head); 107 head->func(head);
108 return 0;
89 } 109 }
90} 110}
91 111
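
Worked out for the common case of a 64-bit long long (the values follow directly from the macro definitions above; only the word size is an assumption):

        /*
         * LLONG_MAX == 2^63 - 1 and DYNTICK_TASK_NEST_WIDTH == 7, so:
         *
         *   DYNTICK_TASK_NEST_VALUE == 1LL << 56               (increment unit)
         *   DYNTICK_TASK_NEST_MASK  == bits 62..56             (task-nesting count)
         *   DYNTICK_TASK_FLAG       == 1LL << 54
         *   DYNTICK_TASK_MASK       == bits 54..53             (two-bit guard field)
         *   DYNTICK_TASK_EXIT_IDLE  == (1LL << 56) + (1LL << 54)
         *
         * The process-level nesting count therefore occupies the top seven
         * value bits, the guard field sits two bits below it, and the
         * remaining low-order bits are left for the interrupt nesting count.
         */
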
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 2bc4e135ff23..a86f1741cc27 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -88,6 +88,9 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
88 * section. 88 * section.
89 * 89 *
90 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. 90 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
91 *
92 * Note that rcu_read_lock() is disallowed if the CPU is either idle or
93 * offline from an RCU perspective, so check for those as well.
91 */ 94 */
92int rcu_read_lock_bh_held(void) 95int rcu_read_lock_bh_held(void)
93{ 96{
@@ -95,6 +98,8 @@ int rcu_read_lock_bh_held(void)
95 return 1; 98 return 1;
96 if (rcu_is_cpu_idle()) 99 if (rcu_is_cpu_idle())
97 return 0; 100 return 0;
101 if (!rcu_lockdep_current_cpu_online())
102 return 0;
98 return in_softirq() || irqs_disabled(); 103 return in_softirq() || irqs_disabled();
99} 104}
100EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 105EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 977296dca0a4..37a5444204d2 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -53,7 +53,7 @@ static void __call_rcu(struct rcu_head *head,
53 53
54#include "rcutiny_plugin.h" 54#include "rcutiny_plugin.h"
55 55
56static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING; 56static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
57 57
58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long oldval) 59static void rcu_idle_enter_common(long long oldval)
@@ -88,10 +88,16 @@ void rcu_idle_enter(void)
88 88
89 local_irq_save(flags); 89 local_irq_save(flags);
90 oldval = rcu_dynticks_nesting; 90 oldval = rcu_dynticks_nesting;
91 rcu_dynticks_nesting = 0; 91 WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
92 if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
93 DYNTICK_TASK_NEST_VALUE)
94 rcu_dynticks_nesting = 0;
95 else
96 rcu_dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
92 rcu_idle_enter_common(oldval); 97 rcu_idle_enter_common(oldval);
93 local_irq_restore(flags); 98 local_irq_restore(flags);
94} 99}
100EXPORT_SYMBOL_GPL(rcu_idle_enter);
95 101
96/* 102/*
97 * Exit an interrupt handler towards idle. 103 * Exit an interrupt handler towards idle.
@@ -140,11 +146,15 @@ void rcu_idle_exit(void)
140 146
141 local_irq_save(flags); 147 local_irq_save(flags);
142 oldval = rcu_dynticks_nesting; 148 oldval = rcu_dynticks_nesting;
143 WARN_ON_ONCE(oldval != 0); 149 WARN_ON_ONCE(rcu_dynticks_nesting < 0);
144 rcu_dynticks_nesting = DYNTICK_TASK_NESTING; 150 if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK)
151 rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
152 else
153 rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
145 rcu_idle_exit_common(oldval); 154 rcu_idle_exit_common(oldval);
146 local_irq_restore(flags); 155 local_irq_restore(flags);
147} 156}
157EXPORT_SYMBOL_GPL(rcu_idle_exit);
148 158
149/* 159/*
150 * Enter an interrupt handler, moving away from idle. 160 * Enter an interrupt handler, moving away from idle.
@@ -258,7 +268,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
258 268
259 /* If no RCU callbacks ready to invoke, just return. */ 269 /* If no RCU callbacks ready to invoke, just return. */
260 if (&rcp->rcucblist == rcp->donetail) { 270 if (&rcp->rcucblist == rcp->donetail) {
261 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); 271 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
262 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, 272 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
263 ACCESS_ONCE(rcp->rcucblist), 273 ACCESS_ONCE(rcp->rcucblist),
264 need_resched(), 274 need_resched(),
@@ -269,7 +279,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
269 279
270 /* Move the ready-to-invoke callbacks to a local list. */ 280 /* Move the ready-to-invoke callbacks to a local list. */
271 local_irq_save(flags); 281 local_irq_save(flags);
272 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); 282 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
273 list = rcp->rcucblist; 283 list = rcp->rcucblist;
274 rcp->rcucblist = *rcp->donetail; 284 rcp->rcucblist = *rcp->donetail;
275 *rcp->donetail = NULL; 285 *rcp->donetail = NULL;
@@ -319,6 +329,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
319 */ 329 */
320void synchronize_sched(void) 330void synchronize_sched(void)
321{ 331{
332 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
333 !lock_is_held(&rcu_lock_map) &&
334 !lock_is_held(&rcu_sched_lock_map),
335 "Illegal synchronize_sched() in RCU read-side critical section");
322 cond_resched(); 336 cond_resched();
323} 337}
324EXPORT_SYMBOL_GPL(synchronize_sched); 338EXPORT_SYMBOL_GPL(synchronize_sched);
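
The rcutiny idle-entry/exit hunks use those fields to allow nested process-level "non-idle" reasons instead of insisting on a bare 0 <-> DYNTICK_TASK_NESTING transition. An illustrative sequence, again assuming a 64-bit long long so that DYNTICK_TASK_NEST_VALUE is 1LL << 56:

        /*
         *   boot/default            rcu_dynticks_nesting == DYNTICK_TASK_EXIT_IDLE
         *   rcu_idle_exit()   ->    += DYNTICK_TASK_NEST_VALUE   (second reason)
         *   rcu_idle_enter()  ->    -= DYNTICK_TASK_NEST_VALUE   (back to one reason)
         *   rcu_idle_enter()  ->    == 0                         (really idle to RCU)
         *
         * The WARN_ON_ONCE() in rcu_idle_enter() fires if there is no
         * process-level nesting left to remove; the one in rcu_idle_exit()
         * fires if the counter is negative on exit.
         */
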
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 9cb1ae4aabdd..22ecea0dfb62 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -132,6 +132,7 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
132 RCU_TRACE(.rcb.name = "rcu_preempt") 132 RCU_TRACE(.rcb.name = "rcu_preempt")
133}; 133};
134 134
135static void rcu_read_unlock_special(struct task_struct *t);
135static int rcu_preempted_readers_exp(void); 136static int rcu_preempted_readers_exp(void);
136static void rcu_report_exp_done(void); 137static void rcu_report_exp_done(void);
137 138
@@ -146,6 +147,16 @@ static int rcu_cpu_blocking_cur_gp(void)
146/* 147/*
147 * Check for a running RCU reader. Because there is only one CPU, 148 * Check for a running RCU reader. Because there is only one CPU,
148 * there can be but one running RCU reader at a time. ;-) 149 * there can be but one running RCU reader at a time. ;-)
150 *
151 * Returns zero if there are no running readers. Returns a positive
152 * number if there is at least one reader within its RCU read-side
153 * critical section. Returns a negative number if an outermost reader
 154 * is in the midst of exiting from its RCU read-side critical section.
 154 * is in the midst of exiting from its RCU read-side critical section.
149 */ 160 */
150static int rcu_preempt_running_reader(void) 161static int rcu_preempt_running_reader(void)
151{ 162{
@@ -307,7 +318,6 @@ static int rcu_boost(void)
307 t = container_of(tb, struct task_struct, rcu_node_entry); 318 t = container_of(tb, struct task_struct, rcu_node_entry);
308 rt_mutex_init_proxy_locked(&mtx, t); 319 rt_mutex_init_proxy_locked(&mtx, t);
309 t->rcu_boost_mutex = &mtx; 320 t->rcu_boost_mutex = &mtx;
310 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
311 raw_local_irq_restore(flags); 321 raw_local_irq_restore(flags);
312 rt_mutex_lock(&mtx); 322 rt_mutex_lock(&mtx);
313 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 323 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
@@ -475,7 +485,7 @@ void rcu_preempt_note_context_switch(void)
475 unsigned long flags; 485 unsigned long flags;
476 486
477 local_irq_save(flags); /* must exclude scheduler_tick(). */ 487 local_irq_save(flags); /* must exclude scheduler_tick(). */
478 if (rcu_preempt_running_reader() && 488 if (rcu_preempt_running_reader() > 0 &&
479 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 489 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
480 490
481 /* Possibly blocking in an RCU read-side critical section. */ 491 /* Possibly blocking in an RCU read-side critical section. */
@@ -494,6 +504,13 @@ void rcu_preempt_note_context_switch(void)
494 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); 504 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
495 if (rcu_cpu_blocking_cur_gp()) 505 if (rcu_cpu_blocking_cur_gp())
496 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; 506 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
507 } else if (rcu_preempt_running_reader() < 0 &&
508 t->rcu_read_unlock_special) {
509 /*
510 * Complete exit from RCU read-side critical section on
511 * behalf of preempted instance of __rcu_read_unlock().
512 */
513 rcu_read_unlock_special(t);
497 } 514 }
498 515
499 /* 516 /*
@@ -526,12 +543,15 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
526 * notify RCU core processing or task having blocked during the RCU 543 * notify RCU core processing or task having blocked during the RCU
527 * read-side critical section. 544 * read-side critical section.
528 */ 545 */
529static void rcu_read_unlock_special(struct task_struct *t) 546static noinline void rcu_read_unlock_special(struct task_struct *t)
530{ 547{
531 int empty; 548 int empty;
532 int empty_exp; 549 int empty_exp;
533 unsigned long flags; 550 unsigned long flags;
534 struct list_head *np; 551 struct list_head *np;
552#ifdef CONFIG_RCU_BOOST
553 struct rt_mutex *rbmp = NULL;
554#endif /* #ifdef CONFIG_RCU_BOOST */
535 int special; 555 int special;
536 556
537 /* 557 /*
@@ -552,7 +572,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
552 rcu_preempt_cpu_qs(); 572 rcu_preempt_cpu_qs();
553 573
554 /* Hardware IRQ handlers cannot block. */ 574 /* Hardware IRQ handlers cannot block. */
555 if (in_irq()) { 575 if (in_irq() || in_serving_softirq()) {
556 local_irq_restore(flags); 576 local_irq_restore(flags);
557 return; 577 return;
558 } 578 }
@@ -597,10 +617,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
597 } 617 }
598#ifdef CONFIG_RCU_BOOST 618#ifdef CONFIG_RCU_BOOST
599 /* Unboost self if was boosted. */ 619 /* Unboost self if was boosted. */
600 if (special & RCU_READ_UNLOCK_BOOSTED) { 620 if (t->rcu_boost_mutex != NULL) {
601 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; 621 rbmp = t->rcu_boost_mutex;
602 rt_mutex_unlock(t->rcu_boost_mutex);
603 t->rcu_boost_mutex = NULL; 622 t->rcu_boost_mutex = NULL;
623 rt_mutex_unlock(rbmp);
604 } 624 }
605#endif /* #ifdef CONFIG_RCU_BOOST */ 625#endif /* #ifdef CONFIG_RCU_BOOST */
606 local_irq_restore(flags); 626 local_irq_restore(flags);
@@ -618,13 +638,22 @@ void __rcu_read_unlock(void)
618 struct task_struct *t = current; 638 struct task_struct *t = current;
619 639
620 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ 640 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
621 --t->rcu_read_lock_nesting; 641 if (t->rcu_read_lock_nesting != 1)
622 barrier(); /* decrement before load of ->rcu_read_unlock_special */ 642 --t->rcu_read_lock_nesting;
623 if (t->rcu_read_lock_nesting == 0 && 643 else {
624 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 644 t->rcu_read_lock_nesting = INT_MIN;
625 rcu_read_unlock_special(t); 645 barrier(); /* assign before ->rcu_read_unlock_special load */
646 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
647 rcu_read_unlock_special(t);
648 barrier(); /* ->rcu_read_unlock_special load before assign */
649 t->rcu_read_lock_nesting = 0;
650 }
626#ifdef CONFIG_PROVE_LOCKING 651#ifdef CONFIG_PROVE_LOCKING
627 WARN_ON_ONCE(t->rcu_read_lock_nesting < 0); 652 {
653 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
654
655 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
656 }
628#endif /* #ifdef CONFIG_PROVE_LOCKING */ 657#endif /* #ifdef CONFIG_PROVE_LOCKING */
629} 658}
630EXPORT_SYMBOL_GPL(__rcu_read_unlock); 659EXPORT_SYMBOL_GPL(__rcu_read_unlock);
@@ -649,7 +678,7 @@ static void rcu_preempt_check_callbacks(void)
649 invoke_rcu_callbacks(); 678 invoke_rcu_callbacks();
650 if (rcu_preempt_gp_in_progress() && 679 if (rcu_preempt_gp_in_progress() &&
651 rcu_cpu_blocking_cur_gp() && 680 rcu_cpu_blocking_cur_gp() &&
652 rcu_preempt_running_reader()) 681 rcu_preempt_running_reader() > 0)
653 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 682 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
654} 683}
655 684
@@ -706,6 +735,11 @@ EXPORT_SYMBOL_GPL(call_rcu);
706 */ 735 */
707void synchronize_rcu(void) 736void synchronize_rcu(void)
708{ 737{
738 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
739 !lock_is_held(&rcu_lock_map) &&
740 !lock_is_held(&rcu_sched_lock_map),
741 "Illegal synchronize_rcu() in RCU read-side critical section");
742
709#ifdef CONFIG_DEBUG_LOCK_ALLOC 743#ifdef CONFIG_DEBUG_LOCK_ALLOC
710 if (!rcu_scheduler_active) 744 if (!rcu_scheduler_active)
711 return; 745 return;
@@ -882,7 +916,8 @@ static void rcu_preempt_process_callbacks(void)
882static void invoke_rcu_callbacks(void) 916static void invoke_rcu_callbacks(void)
883{ 917{
884 have_rcu_kthread_work = 1; 918 have_rcu_kthread_work = 1;
885 wake_up(&rcu_kthread_wq); 919 if (rcu_kthread_task != NULL)
920 wake_up(&rcu_kthread_wq);
886} 921}
887 922
888#ifdef CONFIG_RCU_TRACE 923#ifdef CONFIG_RCU_TRACE
@@ -943,12 +978,16 @@ early_initcall(rcu_spawn_kthreads);
943 978
944#else /* #ifdef CONFIG_RCU_BOOST */ 979#else /* #ifdef CONFIG_RCU_BOOST */
945 980
981/* Hold off callback invocation until early_initcall() time. */
982static int rcu_scheduler_fully_active __read_mostly;
983
946/* 984/*
947 * Start up softirq processing of callbacks. 985 * Start up softirq processing of callbacks.
948 */ 986 */
949void invoke_rcu_callbacks(void) 987void invoke_rcu_callbacks(void)
950{ 988{
951 raise_softirq(RCU_SOFTIRQ); 989 if (rcu_scheduler_fully_active)
990 raise_softirq(RCU_SOFTIRQ);
952} 991}
953 992
954#ifdef CONFIG_RCU_TRACE 993#ifdef CONFIG_RCU_TRACE
@@ -963,10 +1002,14 @@ static bool rcu_is_callbacks_kthread(void)
963 1002
964#endif /* #ifdef CONFIG_RCU_TRACE */ 1003#endif /* #ifdef CONFIG_RCU_TRACE */
965 1004
966void rcu_init(void) 1005static int __init rcu_scheduler_really_started(void)
967{ 1006{
1007 rcu_scheduler_fully_active = 1;
968 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1008 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1009 raise_softirq(RCU_SOFTIRQ); /* Invoke any callbacks from early boot. */
1010 return 0;
969} 1011}
1012early_initcall(rcu_scheduler_really_started);
970 1013
971#endif /* #else #ifdef CONFIG_RCU_BOOST */ 1014#endif /* #else #ifdef CONFIG_RCU_BOOST */
972 1015
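
The __rcu_read_unlock() change parks ->rcu_read_lock_nesting at INT_MIN while the outermost unlock handles ->rcu_read_unlock_special, so a handler that interrupts this window and does its own rcu_read_lock()/rcu_read_unlock() pair nests harmlessly (INT_MIN + 1 back to INT_MIN) instead of re-entering the special-case path; rcu_preempt_running_reader() reports that window as a negative value, and the deferred special-case work is picked up at the next context switch. Condensed from the right-hand column, the new outermost path reads:

        if (t->rcu_read_lock_nesting != 1) {
                --t->rcu_read_lock_nesting;            /* not the outermost unlock */
        } else {
                t->rcu_read_lock_nesting = INT_MIN;    /* park: negative => unlocking */
                barrier();
                if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
                        rcu_read_unlock_special(t);
                barrier();
                t->rcu_read_lock_nesting = 0;          /* fully outside the reader */
        }

The relaxed WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2) under CONFIG_PROVE_LOCKING still catches ordinary underflow while tolerating the parked near-INT_MIN values.
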
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index a58ac285fc69..a89b381a8c6e 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -65,7 +65,10 @@ static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff; /* Hold time within burst (us). */ 65static int fqs_holdoff; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ 67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
68static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */
68static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ 69static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
70static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */
71static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */
69static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ 72static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
70static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ 73static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
71static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ 74static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
@@ -95,8 +98,14 @@ module_param(fqs_stutter, int, 0444);
95MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 98MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
96module_param(onoff_interval, int, 0444); 99module_param(onoff_interval, int, 0444);
97MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); 100MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
101module_param(onoff_holdoff, int, 0444);
102MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)");
98module_param(shutdown_secs, int, 0444); 103module_param(shutdown_secs, int, 0444);
99MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); 104MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable.");
105module_param(stall_cpu, int, 0444);
106MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable.");
107module_param(stall_cpu_holdoff, int, 0444);
108MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s).");
100module_param(test_boost, int, 0444); 109module_param(test_boost, int, 0444);
101MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 110MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
102module_param(test_boost_interval, int, 0444); 111module_param(test_boost_interval, int, 0444);
@@ -129,6 +138,7 @@ static struct task_struct *shutdown_task;
129#ifdef CONFIG_HOTPLUG_CPU 138#ifdef CONFIG_HOTPLUG_CPU
130static struct task_struct *onoff_task; 139static struct task_struct *onoff_task;
131#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 140#endif /* #ifdef CONFIG_HOTPLUG_CPU */
141static struct task_struct *stall_task;
132 142
133#define RCU_TORTURE_PIPE_LEN 10 143#define RCU_TORTURE_PIPE_LEN 10
134 144
@@ -990,12 +1000,12 @@ static void rcu_torture_timer(unsigned long unused)
990 rcu_read_lock_bh_held() || 1000 rcu_read_lock_bh_held() ||
991 rcu_read_lock_sched_held() || 1001 rcu_read_lock_sched_held() ||
992 srcu_read_lock_held(&srcu_ctl)); 1002 srcu_read_lock_held(&srcu_ctl));
993 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
994 if (p == NULL) { 1003 if (p == NULL) {
995 /* Leave because rcu_torture_writer is not yet underway */ 1004 /* Leave because rcu_torture_writer is not yet underway */
996 cur_ops->readunlock(idx); 1005 cur_ops->readunlock(idx);
997 return; 1006 return;
998 } 1007 }
1008 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
999 if (p->rtort_mbtest == 0) 1009 if (p->rtort_mbtest == 0)
1000 atomic_inc(&n_rcu_torture_mberror); 1010 atomic_inc(&n_rcu_torture_mberror);
1001 spin_lock(&rand_lock); 1011 spin_lock(&rand_lock);
@@ -1053,13 +1063,13 @@ rcu_torture_reader(void *arg)
1053 rcu_read_lock_bh_held() || 1063 rcu_read_lock_bh_held() ||
1054 rcu_read_lock_sched_held() || 1064 rcu_read_lock_sched_held() ||
1055 srcu_read_lock_held(&srcu_ctl)); 1065 srcu_read_lock_held(&srcu_ctl));
1056 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1057 if (p == NULL) { 1066 if (p == NULL) {
1058 /* Wait for rcu_torture_writer to get underway */ 1067 /* Wait for rcu_torture_writer to get underway */
1059 cur_ops->readunlock(idx); 1068 cur_ops->readunlock(idx);
1060 schedule_timeout_interruptible(HZ); 1069 schedule_timeout_interruptible(HZ);
1061 continue; 1070 continue;
1062 } 1071 }
1072 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1063 if (p->rtort_mbtest == 0) 1073 if (p->rtort_mbtest == 0)
1064 atomic_inc(&n_rcu_torture_mberror); 1074 atomic_inc(&n_rcu_torture_mberror);
1065 cur_ops->read_delay(&rand); 1075 cur_ops->read_delay(&rand);
@@ -1300,13 +1310,13 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1300 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1310 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1301 "test_boost=%d/%d test_boost_interval=%d " 1311 "test_boost=%d/%d test_boost_interval=%d "
1302 "test_boost_duration=%d shutdown_secs=%d " 1312 "test_boost_duration=%d shutdown_secs=%d "
1303 "onoff_interval=%d\n", 1313 "onoff_interval=%d onoff_holdoff=%d\n",
1304 torture_type, tag, nrealreaders, nfakewriters, 1314 torture_type, tag, nrealreaders, nfakewriters,
1305 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1315 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1306 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1316 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1307 test_boost, cur_ops->can_boost, 1317 test_boost, cur_ops->can_boost,
1308 test_boost_interval, test_boost_duration, shutdown_secs, 1318 test_boost_interval, test_boost_duration, shutdown_secs,
1309 onoff_interval); 1319 onoff_interval, onoff_holdoff);
1310} 1320}
1311 1321
1312static struct notifier_block rcutorture_shutdown_nb = { 1322static struct notifier_block rcutorture_shutdown_nb = {
@@ -1410,6 +1420,11 @@ rcu_torture_onoff(void *arg)
1410 for_each_online_cpu(cpu) 1420 for_each_online_cpu(cpu)
1411 maxcpu = cpu; 1421 maxcpu = cpu;
1412 WARN_ON(maxcpu < 0); 1422 WARN_ON(maxcpu < 0);
1423 if (onoff_holdoff > 0) {
1424 VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff");
1425 schedule_timeout_interruptible(onoff_holdoff * HZ);
1426 VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff");
1427 }
1413 while (!kthread_should_stop()) { 1428 while (!kthread_should_stop()) {
1414 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); 1429 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
1415 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { 1430 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
@@ -1450,12 +1465,15 @@ rcu_torture_onoff(void *arg)
1450static int __cpuinit 1465static int __cpuinit
1451rcu_torture_onoff_init(void) 1466rcu_torture_onoff_init(void)
1452{ 1467{
1468 int ret;
1469
1453 if (onoff_interval <= 0) 1470 if (onoff_interval <= 0)
1454 return 0; 1471 return 0;
1455 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); 1472 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
1456 if (IS_ERR(onoff_task)) { 1473 if (IS_ERR(onoff_task)) {
1474 ret = PTR_ERR(onoff_task);
1457 onoff_task = NULL; 1475 onoff_task = NULL;
1458 return PTR_ERR(onoff_task); 1476 return ret;
1459 } 1477 }
1460 return 0; 1478 return 0;
1461} 1479}
@@ -1481,6 +1499,63 @@ static void rcu_torture_onoff_cleanup(void)
1481 1499
1482#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ 1500#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1483 1501
1502/*
1503 * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
1504 * induces a CPU stall for the time specified by stall_cpu.
1505 */
1506static int __cpuinit rcu_torture_stall(void *args)
1507{
1508 unsigned long stop_at;
1509
1510 VERBOSE_PRINTK_STRING("rcu_torture_stall task started");
1511 if (stall_cpu_holdoff > 0) {
1512 VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff");
1513 schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
1514 VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff");
1515 }
1516 if (!kthread_should_stop()) {
1517 stop_at = get_seconds() + stall_cpu;
1518 /* RCU CPU stall is expected behavior in following code. */
1519 printk(KERN_ALERT "rcu_torture_stall start.\n");
1520 rcu_read_lock();
1521 preempt_disable();
1522 while (ULONG_CMP_LT(get_seconds(), stop_at))
1523 continue; /* Induce RCU CPU stall warning. */
1524 preempt_enable();
1525 rcu_read_unlock();
1526 printk(KERN_ALERT "rcu_torture_stall end.\n");
1527 }
1528 rcutorture_shutdown_absorb("rcu_torture_stall");
1529 while (!kthread_should_stop())
1530 schedule_timeout_interruptible(10 * HZ);
1531 return 0;
1532}
1533
1534/* Spawn CPU-stall kthread, if stall_cpu specified. */
1535static int __init rcu_torture_stall_init(void)
1536{
1537 int ret;
1538
1539 if (stall_cpu <= 0)
1540 return 0;
1541 stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall");
1542 if (IS_ERR(stall_task)) {
1543 ret = PTR_ERR(stall_task);
1544 stall_task = NULL;
1545 return ret;
1546 }
1547 return 0;
1548}
1549
1550/* Clean up after the CPU-stall kthread, if one was spawned. */
1551static void rcu_torture_stall_cleanup(void)
1552{
1553 if (stall_task == NULL)
1554 return;
1555 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
1556 kthread_stop(stall_task);
1557}
1558
1484static int rcutorture_cpu_notify(struct notifier_block *self, 1559static int rcutorture_cpu_notify(struct notifier_block *self,
1485 unsigned long action, void *hcpu) 1560 unsigned long action, void *hcpu)
1486{ 1561{
@@ -1523,6 +1598,7 @@ rcu_torture_cleanup(void)
1523 fullstop = FULLSTOP_RMMOD; 1598 fullstop = FULLSTOP_RMMOD;
1524 mutex_unlock(&fullstop_mutex); 1599 mutex_unlock(&fullstop_mutex);
1525 unregister_reboot_notifier(&rcutorture_shutdown_nb); 1600 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1601 rcu_torture_stall_cleanup();
1526 if (stutter_task) { 1602 if (stutter_task) {
1527 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1603 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1528 kthread_stop(stutter_task); 1604 kthread_stop(stutter_task);
@@ -1602,6 +1678,10 @@ rcu_torture_cleanup(void)
1602 cur_ops->cleanup(); 1678 cur_ops->cleanup();
1603 if (atomic_read(&n_rcu_torture_error)) 1679 if (atomic_read(&n_rcu_torture_error))
1604 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1680 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1681 else if (n_online_successes != n_online_attempts ||
1682 n_offline_successes != n_offline_attempts)
1683 rcu_torture_print_module_parms(cur_ops,
1684 "End of test: RCU_HOTPLUG");
1605 else 1685 else
1606 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); 1686 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1607} 1687}
@@ -1819,6 +1899,7 @@ rcu_torture_init(void)
1819 } 1899 }
1820 rcu_torture_onoff_init(); 1900 rcu_torture_onoff_init();
1821 register_reboot_notifier(&rcutorture_shutdown_nb); 1901 register_reboot_notifier(&rcutorture_shutdown_nb);
1902 rcu_torture_stall_init();
1822 rcutorture_record_test_transition(); 1903 rcutorture_record_test_transition();
1823 mutex_unlock(&fullstop_mutex); 1904 mutex_unlock(&fullstop_mutex);
1824 return 0; 1905 return 0;
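
All of the new torture knobs are ordinary module parameters, so a stall test that waits out a holdoff after load and then spins inside an RCU read-side critical section can be combined freely with the hotplug options; one possible invocation (the values here are arbitrary, not defaults):

        modprobe rcutorture stall_cpu=30 stall_cpu_holdoff=60 onoff_interval=3 onoff_holdoff=30

With stall_cpu and onoff_interval left at 0, the new code paths stay disabled and the previous behaviour is unchanged.
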
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 6c4a6722abfd..1050d6d3922c 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -50,6 +50,8 @@
50#include <linux/wait.h> 50#include <linux/wait.h>
51#include <linux/kthread.h> 51#include <linux/kthread.h>
52#include <linux/prefetch.h> 52#include <linux/prefetch.h>
53#include <linux/delay.h>
54#include <linux/stop_machine.h>
53 55
54#include "rcutree.h" 56#include "rcutree.h"
55#include <trace/events/rcu.h> 57#include <trace/events/rcu.h>
@@ -196,7 +198,7 @@ void rcu_note_context_switch(int cpu)
196EXPORT_SYMBOL_GPL(rcu_note_context_switch); 198EXPORT_SYMBOL_GPL(rcu_note_context_switch);
197 199
198DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 200DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
199 .dynticks_nesting = DYNTICK_TASK_NESTING, 201 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
200 .dynticks = ATOMIC_INIT(1), 202 .dynticks = ATOMIC_INIT(1),
201}; 203};
202 204
@@ -208,8 +210,11 @@ module_param(blimit, int, 0);
208module_param(qhimark, int, 0); 210module_param(qhimark, int, 0);
209module_param(qlowmark, int, 0); 211module_param(qlowmark, int, 0);
210 212
211int rcu_cpu_stall_suppress __read_mostly; 213int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
214int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
215
212module_param(rcu_cpu_stall_suppress, int, 0644); 216module_param(rcu_cpu_stall_suppress, int, 0644);
217module_param(rcu_cpu_stall_timeout, int, 0644);
213 218
214static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 219static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
215static int rcu_pending(int cpu); 220static int rcu_pending(int cpu);
@@ -301,8 +306,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
301 return &rsp->node[0]; 306 return &rsp->node[0];
302} 307}
303 308
304#ifdef CONFIG_SMP
305
306/* 309/*
307 * If the specified CPU is offline, tell the caller that it is in 310 * If the specified CPU is offline, tell the caller that it is in
308 * a quiescent state. Otherwise, whack it with a reschedule IPI. 311 * a quiescent state. Otherwise, whack it with a reschedule IPI.
@@ -317,30 +320,21 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
317static int rcu_implicit_offline_qs(struct rcu_data *rdp) 320static int rcu_implicit_offline_qs(struct rcu_data *rdp)
318{ 321{
319 /* 322 /*
320 * If the CPU is offline, it is in a quiescent state. We can 323 * If the CPU is offline for more than a jiffy, it is in a quiescent
321 * trust its state not to change because interrupts are disabled. 324 * state. We can trust its state not to change because interrupts
325 * are disabled. The reason for the jiffy's worth of slack is to
326 * handle CPUs initializing on the way up and finding their way
327 * to the idle loop on the way down.
322 */ 328 */
323 if (cpu_is_offline(rdp->cpu)) { 329 if (cpu_is_offline(rdp->cpu) &&
330 ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) {
324 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); 331 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
325 rdp->offline_fqs++; 332 rdp->offline_fqs++;
326 return 1; 333 return 1;
327 } 334 }
328
329 /*
330 * The CPU is online, so send it a reschedule IPI. This forces
331 * it through the scheduler, and (inefficiently) also handles cases
332 * where idle loops fail to inform RCU about the CPU being idle.
333 */
334 if (rdp->cpu != smp_processor_id())
335 smp_send_reschedule(rdp->cpu);
336 else
337 set_need_resched();
338 rdp->resched_ipi++;
339 return 0; 335 return 0;
340} 336}
341 337
342#endif /* #ifdef CONFIG_SMP */
343
344/* 338/*
345 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle 339 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
346 * 340 *
@@ -366,6 +360,17 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
366 atomic_inc(&rdtp->dynticks); 360 atomic_inc(&rdtp->dynticks);
367 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ 361 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
368 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 362 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
363
364 /*
365 * The idle task is not permitted to enter the idle loop while
366 * in an RCU read-side critical section.
367 */
368 rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
369 "Illegal idle entry in RCU read-side critical section.");
370 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),
371 "Illegal idle entry in RCU-bh read-side critical section.");
372 rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),
373 "Illegal idle entry in RCU-sched read-side critical section.");
369} 374}
370 375
371/** 376/**
@@ -389,10 +394,15 @@ void rcu_idle_enter(void)
389 local_irq_save(flags); 394 local_irq_save(flags);
390 rdtp = &__get_cpu_var(rcu_dynticks); 395 rdtp = &__get_cpu_var(rcu_dynticks);
391 oldval = rdtp->dynticks_nesting; 396 oldval = rdtp->dynticks_nesting;
392 rdtp->dynticks_nesting = 0; 397 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
398 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
399 rdtp->dynticks_nesting = 0;
400 else
401 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
393 rcu_idle_enter_common(rdtp, oldval); 402 rcu_idle_enter_common(rdtp, oldval);
394 local_irq_restore(flags); 403 local_irq_restore(flags);
395} 404}
405EXPORT_SYMBOL_GPL(rcu_idle_enter);
396 406
397/** 407/**
398 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle 408 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
@@ -462,7 +472,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
462 * Exit idle mode, in other words, -enter- the mode in which RCU 472 * Exit idle mode, in other words, -enter- the mode in which RCU
463 * read-side critical sections can occur. 473 * read-side critical sections can occur.
464 * 474 *
465 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to 475 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to
466 * allow for the possibility of usermode upcalls messing up our count 476 * allow for the possibility of usermode upcalls messing up our count
467 * of interrupt nesting level during the busy period that is just 477 * of interrupt nesting level during the busy period that is just
468 * now starting. 478 * now starting.
@@ -476,11 +486,15 @@ void rcu_idle_exit(void)
476 local_irq_save(flags); 486 local_irq_save(flags);
477 rdtp = &__get_cpu_var(rcu_dynticks); 487 rdtp = &__get_cpu_var(rcu_dynticks);
478 oldval = rdtp->dynticks_nesting; 488 oldval = rdtp->dynticks_nesting;
479 WARN_ON_ONCE(oldval != 0); 489 WARN_ON_ONCE(oldval < 0);
480 rdtp->dynticks_nesting = DYNTICK_TASK_NESTING; 490 if (oldval & DYNTICK_TASK_NEST_MASK)
491 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
492 else
493 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
481 rcu_idle_exit_common(rdtp, oldval); 494 rcu_idle_exit_common(rdtp, oldval);
482 local_irq_restore(flags); 495 local_irq_restore(flags);
483} 496}
497EXPORT_SYMBOL_GPL(rcu_idle_exit);
484 498
485/** 499/**
486 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle 500 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
@@ -581,6 +595,49 @@ int rcu_is_cpu_idle(void)
581} 595}
582EXPORT_SYMBOL(rcu_is_cpu_idle); 596EXPORT_SYMBOL(rcu_is_cpu_idle);
583 597
598#ifdef CONFIG_HOTPLUG_CPU
599
600/*
601 * Is the current CPU online? Disable preemption to avoid false positives
602 * that could otherwise happen due to the current CPU number being sampled,
603 * this task being preempted, its old CPU being taken offline, resuming
604 * on some other CPU, then determining that its old CPU is now offline.
605 * It is OK to use RCU on an offline processor during initial boot, hence
606 * the check for rcu_scheduler_fully_active. Note also that it is OK
607 * for a CPU coming online to use RCU for one jiffy prior to marking itself
608 * online in the cpu_online_mask. Similarly, it is OK for a CPU going
609 * offline to continue to use RCU for one jiffy after marking itself
610 * offline in the cpu_online_mask. This leniency is necessary given the
611 * non-atomic nature of the online and offline processing, for example,
612 * the fact that a CPU enters the scheduler after completing the CPU_DYING
613 * notifiers.
614 *
615 * This is also why RCU internally marks CPUs online during the
616 * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase.
617 *
618 * Disable checking if in an NMI handler because we cannot safely report
619 * errors from NMI handlers anyway.
620 */
621bool rcu_lockdep_current_cpu_online(void)
622{
623 struct rcu_data *rdp;
624 struct rcu_node *rnp;
625 bool ret;
626
627 if (in_nmi())
628 return 1;
629 preempt_disable();
630 rdp = &__get_cpu_var(rcu_sched_data);
631 rnp = rdp->mynode;
632 ret = (rdp->grpmask & rnp->qsmaskinit) ||
633 !rcu_scheduler_fully_active;
634 preempt_enable();
635 return ret;
636}
637EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
638
639#endif /* #ifdef CONFIG_HOTPLUG_CPU */
640
584#endif /* #ifdef CONFIG_PROVE_RCU */ 641#endif /* #ifdef CONFIG_PROVE_RCU */
585 642
586/** 643/**
@@ -595,8 +652,6 @@ int rcu_is_cpu_rrupt_from_idle(void)
595 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; 652 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
596} 653}
597 654
598#ifdef CONFIG_SMP
599
600/* 655/*
601 * Snapshot the specified CPU's dynticks counter so that we can later 656 * Snapshot the specified CPU's dynticks counter so that we can later
602 * credit them with an implicit quiescent state. Return 1 if this CPU 657 * credit them with an implicit quiescent state. Return 1 if this CPU
@@ -640,12 +695,28 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
640 return rcu_implicit_offline_qs(rdp); 695 return rcu_implicit_offline_qs(rdp);
641} 696}
642 697
643#endif /* #ifdef CONFIG_SMP */ 698static int jiffies_till_stall_check(void)
699{
700 int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
701
702 /*
703 * Limit check must be consistent with the Kconfig limits
704 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
705 */
706 if (till_stall_check < 3) {
707 ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
708 till_stall_check = 3;
709 } else if (till_stall_check > 300) {
710 ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
711 till_stall_check = 300;
712 }
713 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
714}
644 715
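
As a quick check of jiffies_till_stall_check() just above (RCU_STALL_DELAY_DELTA is defined elsewhere in the RCU headers):

        /*
         *   rcu_cpu_stall_timeout = 21    ->  21 * HZ + RCU_STALL_DELAY_DELTA jiffies
         *   rcu_cpu_stall_timeout = 1     ->  clamped to 3, and 3 written back
         *   rcu_cpu_stall_timeout = 1000  ->  clamped to 300, and 300 written back
         *
         * The write-back goes through the same variable that the new
         * rcu_cpu_stall_timeout module parameter exposes, so an out-of-range
         * setting becomes visible as its clamped value.
         */
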
645static void record_gp_stall_check_time(struct rcu_state *rsp) 716static void record_gp_stall_check_time(struct rcu_state *rsp)
646{ 717{
647 rsp->gp_start = jiffies; 718 rsp->gp_start = jiffies;
648 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; 719 rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
649} 720}
650 721
651static void print_other_cpu_stall(struct rcu_state *rsp) 722static void print_other_cpu_stall(struct rcu_state *rsp)
@@ -664,13 +735,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
664 raw_spin_unlock_irqrestore(&rnp->lock, flags); 735 raw_spin_unlock_irqrestore(&rnp->lock, flags);
665 return; 736 return;
666 } 737 }
667 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 738 rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3;
668
669 /*
670 * Now rat on any tasks that got kicked up to the root rcu_node
671 * due to CPU offlining.
672 */
673 ndetected = rcu_print_task_stall(rnp);
674 raw_spin_unlock_irqrestore(&rnp->lock, flags); 739 raw_spin_unlock_irqrestore(&rnp->lock, flags);
675 740
676 /* 741 /*
@@ -678,8 +743,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
678 * See Documentation/RCU/stallwarn.txt for info on how to debug 743 * See Documentation/RCU/stallwarn.txt for info on how to debug
679 * RCU CPU stall warnings. 744 * RCU CPU stall warnings.
680 */ 745 */
681 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", 746 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:",
682 rsp->name); 747 rsp->name);
748 print_cpu_stall_info_begin();
683 rcu_for_each_leaf_node(rsp, rnp) { 749 rcu_for_each_leaf_node(rsp, rnp) {
684 raw_spin_lock_irqsave(&rnp->lock, flags); 750 raw_spin_lock_irqsave(&rnp->lock, flags);
685 ndetected += rcu_print_task_stall(rnp); 751 ndetected += rcu_print_task_stall(rnp);
@@ -688,11 +754,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
688 continue; 754 continue;
689 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 755 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
690 if (rnp->qsmask & (1UL << cpu)) { 756 if (rnp->qsmask & (1UL << cpu)) {
691 printk(" %d", rnp->grplo + cpu); 757 print_cpu_stall_info(rsp, rnp->grplo + cpu);
692 ndetected++; 758 ndetected++;
693 } 759 }
694 } 760 }
695 printk("} (detected by %d, t=%ld jiffies)\n", 761
762 /*
763 * Now rat on any tasks that got kicked up to the root rcu_node
764 * due to CPU offlining.
765 */
766 rnp = rcu_get_root(rsp);
767 raw_spin_lock_irqsave(&rnp->lock, flags);
768 ndetected = rcu_print_task_stall(rnp);
769 raw_spin_unlock_irqrestore(&rnp->lock, flags);
770
771 print_cpu_stall_info_end();
772 printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n",
696 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 773 smp_processor_id(), (long)(jiffies - rsp->gp_start));
697 if (ndetected == 0) 774 if (ndetected == 0)
698 printk(KERN_ERR "INFO: Stall ended before state dump start\n"); 775 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
@@ -716,15 +793,18 @@ static void print_cpu_stall(struct rcu_state *rsp)
716 * See Documentation/RCU/stallwarn.txt for info on how to debug 793 * See Documentation/RCU/stallwarn.txt for info on how to debug
717 * RCU CPU stall warnings. 794 * RCU CPU stall warnings.
718 */ 795 */
719 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", 796 printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name);
720 rsp->name, smp_processor_id(), jiffies - rsp->gp_start); 797 print_cpu_stall_info_begin();
798 print_cpu_stall_info(rsp, smp_processor_id());
799 print_cpu_stall_info_end();
800 printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start);
721 if (!trigger_all_cpu_backtrace()) 801 if (!trigger_all_cpu_backtrace())
722 dump_stack(); 802 dump_stack();
723 803
724 raw_spin_lock_irqsave(&rnp->lock, flags); 804 raw_spin_lock_irqsave(&rnp->lock, flags);
725 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 805 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
726 rsp->jiffies_stall = 806 rsp->jiffies_stall = jiffies +
727 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 807 3 * jiffies_till_stall_check() + 3;
728 raw_spin_unlock_irqrestore(&rnp->lock, flags); 808 raw_spin_unlock_irqrestore(&rnp->lock, flags);
729 809
730 set_need_resched(); /* kick ourselves to get things going. */ 810 set_need_resched(); /* kick ourselves to get things going. */
@@ -807,6 +887,7 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
807 rdp->passed_quiesce = 0; 887 rdp->passed_quiesce = 0;
808 } else 888 } else
809 rdp->qs_pending = 0; 889 rdp->qs_pending = 0;
890 zero_cpu_stall_ticks(rdp);
810 } 891 }
811} 892}
812 893
@@ -943,6 +1024,10 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
943 * in preparation for detecting the next grace period. The caller must hold 1024 * in preparation for detecting the next grace period. The caller must hold
944 * the root node's ->lock, which is released before return. Hard irqs must 1025 * the root node's ->lock, which is released before return. Hard irqs must
945 * be disabled. 1026 * be disabled.
1027 *
1028 * Note that it is legal for a dying CPU (which is marked as offline) to
1029 * invoke this function. This can happen when the dying CPU reports its
1030 * quiescent state.
946 */ 1031 */
947static void 1032static void
948rcu_start_gp(struct rcu_state *rsp, unsigned long flags) 1033rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
@@ -980,26 +1065,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
980 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 1065 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
981 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1066 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
982 record_gp_stall_check_time(rsp); 1067 record_gp_stall_check_time(rsp);
983
984 /* Special-case the common single-level case. */
985 if (NUM_RCU_NODES == 1) {
986 rcu_preempt_check_blocked_tasks(rnp);
987 rnp->qsmask = rnp->qsmaskinit;
988 rnp->gpnum = rsp->gpnum;
989 rnp->completed = rsp->completed;
990 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */
991 rcu_start_gp_per_cpu(rsp, rnp, rdp);
992 rcu_preempt_boost_start_gp(rnp);
993 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
994 rnp->level, rnp->grplo,
995 rnp->grphi, rnp->qsmask);
996 raw_spin_unlock_irqrestore(&rnp->lock, flags);
997 return;
998 }
999
1000 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ 1068 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */
1001 1069
1002
1003 /* Exclude any concurrent CPU-hotplug operations. */ 1070 /* Exclude any concurrent CPU-hotplug operations. */
1004 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1071 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1005 1072
@@ -1245,53 +1312,115 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1245 1312
1246/* 1313/*
1247 * Move a dying CPU's RCU callbacks to online CPU's callback list. 1314 * Move a dying CPU's RCU callbacks to online CPU's callback list.
1248 * Synchronization is not required because this function executes 1315 * Also record a quiescent state for this CPU for the current grace period.
1249 * in stop_machine() context. 1316 * Synchronization and interrupt disabling are not required because
1317 * this function executes in stop_machine() context. Therefore, cleanup
1318 * operations that might block must be done later from the CPU_DEAD
1319 * notifier.
1320 *
1321 * Note that the outgoing CPU's bit has already been cleared in the
1322 * cpu_online_mask. This allows us to randomly pick a callback
1323 * destination from the bits set in that mask.
1250 */ 1324 */
1251static void rcu_send_cbs_to_online(struct rcu_state *rsp) 1325static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1252{ 1326{
1253 int i; 1327 int i;
1254 /* current DYING CPU is cleared in the cpu_online_mask */ 1328 unsigned long mask;
1255 int receive_cpu = cpumask_any(cpu_online_mask); 1329 int receive_cpu = cpumask_any(cpu_online_mask);
1256 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1330 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1257 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); 1331 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
1332 RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */
1333
1334 /* First, adjust the counts. */
1335 if (rdp->nxtlist != NULL) {
1336 receive_rdp->qlen_lazy += rdp->qlen_lazy;
1337 receive_rdp->qlen += rdp->qlen;
1338 rdp->qlen_lazy = 0;
1339 rdp->qlen = 0;
1340 }
1258 1341
1259 if (rdp->nxtlist == NULL) 1342 /*
1260 return; /* irqs disabled, so comparison is stable. */ 1343 * Next, move ready-to-invoke callbacks to be invoked on some
1344 * other CPU. These will not be required to pass through another
1345 * grace period: They are done, regardless of CPU.
1346 */
1347 if (rdp->nxtlist != NULL &&
1348 rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) {
1349 struct rcu_head *oldhead;
1350 struct rcu_head **oldtail;
1351 struct rcu_head **newtail;
1352
1353 oldhead = rdp->nxtlist;
1354 oldtail = receive_rdp->nxttail[RCU_DONE_TAIL];
1355 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1356 *rdp->nxttail[RCU_DONE_TAIL] = *oldtail;
1357 *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead;
1358 newtail = rdp->nxttail[RCU_DONE_TAIL];
1359 for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) {
1360 if (receive_rdp->nxttail[i] == oldtail)
1361 receive_rdp->nxttail[i] = newtail;
1362 if (rdp->nxttail[i] == newtail)
1363 rdp->nxttail[i] = &rdp->nxtlist;
1364 }
1365 }
1261 1366
1262 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; 1367 /*
1263 receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 1368 * Finally, put the rest of the callbacks at the end of the list.
1264 receive_rdp->qlen += rdp->qlen; 1369 * The ones that made it partway through get to start over: We
1265 receive_rdp->n_cbs_adopted += rdp->qlen; 1370 * cannot assume that grace periods are synchronized across CPUs.
1266 rdp->n_cbs_orphaned += rdp->qlen; 1371 * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but
1372 * this does not seem compelling. Not yet, anyway.)
1373 */
1374 if (rdp->nxtlist != NULL) {
1375 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
1376 receive_rdp->nxttail[RCU_NEXT_TAIL] =
1377 rdp->nxttail[RCU_NEXT_TAIL];
1378 receive_rdp->n_cbs_adopted += rdp->qlen;
1379 rdp->n_cbs_orphaned += rdp->qlen;
1380
1381 rdp->nxtlist = NULL;
1382 for (i = 0; i < RCU_NEXT_SIZE; i++)
1383 rdp->nxttail[i] = &rdp->nxtlist;
1384 }
1267 1385
1268 rdp->nxtlist = NULL; 1386 /*
1269 for (i = 0; i < RCU_NEXT_SIZE; i++) 1387 * Record a quiescent state for the dying CPU. This is safe
1270 rdp->nxttail[i] = &rdp->nxtlist; 1388 * only because we have already cleared out the callbacks.
1271 rdp->qlen = 0; 1389 * (Otherwise, the RCU core might try to schedule the invocation
1390 * of callbacks on this now-offline CPU, which would be bad.)
1391 */
1392 mask = rdp->grpmask; /* rnp->grplo is constant. */
1393 trace_rcu_grace_period(rsp->name,
1394 rnp->gpnum + 1 - !!(rnp->qsmask & mask),
1395 "cpuofl");
1396 rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum);
1397 /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */
1272} 1398}
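The splice above works on RCU's segmented callback list: one singly linked nxtlist plus an array of tail pointers marking where each segment ends. A stand-alone, user-space sketch of that splice with only two segments — the names and layout are simplified stand-ins, not the kernel's:

#include <stdio.h>

/* Hypothetical two-segment callback list: [ DONE | NEXT ], a toy model
 * of RCU's nxtlist/nxttail[] layout. */
struct cb {
        struct cb *next;
        int id;
};

enum { SEG_DONE, SEG_NEXT, NSEG };

struct cblist {
        struct cb *head;
        struct cb **tail[NSEG];         /* tail[i] = &last->next of segment i */
};

static void cblist_init(struct cblist *l)
{
        int i;

        l->head = NULL;
        for (i = 0; i < NSEG; i++)
                l->tail[i] = &l->head;
}

static void cblist_enqueue(struct cblist *l, struct cb *c)
{
        c->next = NULL;
        *l->tail[SEG_NEXT] = c;
        l->tail[SEG_NEXT] = &c->next;
}

/* Move src's DONE segment into dst's DONE segment, fixing up both tails. */
static void cblist_splice_done(struct cblist *dst, struct cblist *src)
{
        struct cb *donehead = src->head;
        struct cb **donetail = src->tail[SEG_DONE];

        if (donetail == &src->head)
                return;                         /* nothing is done yet */

        /* Detach the done segment from src. */
        src->head = *donetail;
        if (src->tail[SEG_NEXT] == donetail)
                src->tail[SEG_NEXT] = &src->head;
        src->tail[SEG_DONE] = &src->head;

        /* Splice it in right after dst's existing done callbacks. */
        *donetail = *dst->tail[SEG_DONE];
        *dst->tail[SEG_DONE] = donehead;
        if (dst->tail[SEG_NEXT] == dst->tail[SEG_DONE])
                dst->tail[SEG_NEXT] = donetail;
        dst->tail[SEG_DONE] = donetail;
}

static void cblist_print(const char *name, const struct cblist *l)
{
        const struct cb *p;

        printf("%s:", name);
        for (p = l->head; p != NULL; p = p->next)
                printf(" %d", p->id);
        printf("\n");
}

int main(void)
{
        struct cblist src, dst;
        struct cb c[5];
        int i;

        cblist_init(&src);
        cblist_init(&dst);
        for (i = 0; i < 5; i++)
                c[i].id = i;

        for (i = 0; i < 3; i++)                 /* src: 0 1 2, with 0-1 done */
                cblist_enqueue(&src, &c[i]);
        src.tail[SEG_DONE] = &c[1].next;

        cblist_enqueue(&dst, &c[3]);            /* dst: 3 4, with 3 done */
        cblist_enqueue(&dst, &c[4]);
        dst.tail[SEG_DONE] = &c[3].next;

        cblist_splice_done(&dst, &src);
        cblist_print("dst", &dst);              /* dst: 3 0 1 4 */
        cblist_print("src", &src);              /* src: 2 */
        return 0;
}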
1273 1399
1274/* 1400/*
1275 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy 1401 * The CPU has been completely removed, and some other CPU is reporting
1276 * and move all callbacks from the outgoing CPU to the current one. 1402 * this fact from process context. Do the remainder of the cleanup.
1277 * There can only be one CPU hotplug operation at a time, so no other 1403 * There can only be one CPU hotplug operation at a time, so no other
1278 * CPU can be attempting to update rcu_cpu_kthread_task. 1404 * CPU can be attempting to update rcu_cpu_kthread_task.
1279 */ 1405 */
1280static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 1406static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1281{ 1407{
1282 unsigned long flags; 1408 unsigned long flags;
1283 unsigned long mask; 1409 unsigned long mask;
1284 int need_report = 0; 1410 int need_report = 0;
1285 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1411 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1286 struct rcu_node *rnp; 1412 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */
1287 1413
1414 /* Adjust any no-longer-needed kthreads. */
1288 rcu_stop_cpu_kthread(cpu); 1415 rcu_stop_cpu_kthread(cpu);
1416 rcu_node_kthread_setaffinity(rnp, -1);
1417
1418 /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */
1289 1419
1290 /* Exclude any attempts to start a new grace period. */ 1420 /* Exclude any attempts to start a new grace period. */
1291 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1421 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1292 1422
1293 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 1423 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
1294 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
1295 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1424 mask = rdp->grpmask; /* rnp->grplo is constant. */
1296 do { 1425 do {
1297 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1426 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
@@ -1299,20 +1428,11 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1299 if (rnp->qsmaskinit != 0) { 1428 if (rnp->qsmaskinit != 0) {
1300 if (rnp != rdp->mynode) 1429 if (rnp != rdp->mynode)
1301 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1430 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1302 else
1303 trace_rcu_grace_period(rsp->name,
1304 rnp->gpnum + 1 -
1305 !!(rnp->qsmask & mask),
1306 "cpuofl");
1307 break; 1431 break;
1308 } 1432 }
1309 if (rnp == rdp->mynode) { 1433 if (rnp == rdp->mynode)
1310 trace_rcu_grace_period(rsp->name,
1311 rnp->gpnum + 1 -
1312 !!(rnp->qsmask & mask),
1313 "cpuofl");
1314 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); 1434 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
1315 } else 1435 else
1316 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1436 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1317 mask = rnp->grpmask; 1437 mask = rnp->grpmask;
1318 rnp = rnp->parent; 1438 rnp = rnp->parent;
@@ -1332,29 +1452,15 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1332 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1452 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1333 if (need_report & RCU_OFL_TASKS_EXP_GP) 1453 if (need_report & RCU_OFL_TASKS_EXP_GP)
1334 rcu_report_exp_rnp(rsp, rnp, true); 1454 rcu_report_exp_rnp(rsp, rnp, true);
1335 rcu_node_kthread_setaffinity(rnp, -1);
1336}
1337
1338/*
1339 * Remove the specified CPU from the RCU hierarchy and move any pending
1340 * callbacks that it might have to the current CPU. This code assumes
1341 * that at least one CPU in the system will remain running at all times.
1342 * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
1343 */
1344static void rcu_offline_cpu(int cpu)
1345{
1346 __rcu_offline_cpu(cpu, &rcu_sched_state);
1347 __rcu_offline_cpu(cpu, &rcu_bh_state);
1348 rcu_preempt_offline_cpu(cpu);
1349} 1455}
1350 1456
1351#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1457#else /* #ifdef CONFIG_HOTPLUG_CPU */
1352 1458
1353static void rcu_send_cbs_to_online(struct rcu_state *rsp) 1459static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1354{ 1460{
1355} 1461}
1356 1462
1357static void rcu_offline_cpu(int cpu) 1463static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1358{ 1464{
1359} 1465}
1360 1466
@@ -1368,11 +1474,11 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1368{ 1474{
1369 unsigned long flags; 1475 unsigned long flags;
1370 struct rcu_head *next, *list, **tail; 1476 struct rcu_head *next, *list, **tail;
1371 int bl, count; 1477 int bl, count, count_lazy;
1372 1478
1373 /* If no callbacks are ready, just return.*/ 1479 /* If no callbacks are ready, just return.*/
1374 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1480 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1375 trace_rcu_batch_start(rsp->name, 0, 0); 1481 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
1376 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), 1482 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
1377 need_resched(), is_idle_task(current), 1483 need_resched(), is_idle_task(current),
1378 rcu_is_callbacks_kthread()); 1484 rcu_is_callbacks_kthread());
@@ -1384,8 +1490,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1384 * races with call_rcu() from interrupt handlers. 1490 * races with call_rcu() from interrupt handlers.
1385 */ 1491 */
1386 local_irq_save(flags); 1492 local_irq_save(flags);
1493 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
1387 bl = rdp->blimit; 1494 bl = rdp->blimit;
1388 trace_rcu_batch_start(rsp->name, rdp->qlen, bl); 1495 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl);
1389 list = rdp->nxtlist; 1496 list = rdp->nxtlist;
1390 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 1497 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1391 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 1498 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
@@ -1396,12 +1503,13 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1396 local_irq_restore(flags); 1503 local_irq_restore(flags);
1397 1504
1398 /* Invoke callbacks. */ 1505 /* Invoke callbacks. */
1399 count = 0; 1506 count = count_lazy = 0;
1400 while (list) { 1507 while (list) {
1401 next = list->next; 1508 next = list->next;
1402 prefetch(next); 1509 prefetch(next);
1403 debug_rcu_head_unqueue(list); 1510 debug_rcu_head_unqueue(list);
1404 __rcu_reclaim(rsp->name, list); 1511 if (__rcu_reclaim(rsp->name, list))
1512 count_lazy++;
1405 list = next; 1513 list = next;
1406 /* Stop only if limit reached and CPU has something to do. */ 1514 /* Stop only if limit reached and CPU has something to do. */
1407 if (++count >= bl && 1515 if (++count >= bl &&
@@ -1416,6 +1524,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1416 rcu_is_callbacks_kthread()); 1524 rcu_is_callbacks_kthread());
1417 1525
1418 /* Update count, and requeue any remaining callbacks. */ 1526 /* Update count, and requeue any remaining callbacks. */
1527 rdp->qlen_lazy -= count_lazy;
1419 rdp->qlen -= count; 1528 rdp->qlen -= count;
1420 rdp->n_cbs_invoked += count; 1529 rdp->n_cbs_invoked += count;
1421 if (list != NULL) { 1530 if (list != NULL) {
@@ -1458,6 +1567,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1458void rcu_check_callbacks(int cpu, int user) 1567void rcu_check_callbacks(int cpu, int user)
1459{ 1568{
1460 trace_rcu_utilization("Start scheduler-tick"); 1569 trace_rcu_utilization("Start scheduler-tick");
1570 increment_cpu_stall_ticks();
1461 if (user || rcu_is_cpu_rrupt_from_idle()) { 1571 if (user || rcu_is_cpu_rrupt_from_idle()) {
1462 1572
1463 /* 1573 /*
@@ -1492,8 +1602,6 @@ void rcu_check_callbacks(int cpu, int user)
1492 trace_rcu_utilization("End scheduler-tick"); 1602 trace_rcu_utilization("End scheduler-tick");
1493} 1603}
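The rcu_do_batch() hunks above thread a second counter (count_lazy / qlen_lazy) through the existing batch loop so that callbacks that merely free memory are accounted for separately, while the blimit batch limit and the requeue of leftovers work as before. A user-space sketch of that counting-plus-batching pattern, with made-up names:

#include <stdbool.h>
#include <stdio.h>

/* Toy callback: the lazy flag stands in for the kfree-offset encoding
 * that __rcu_reclaim() recognizes in the kernel. */
struct cb {
        struct cb *next;
        bool lazy;
};

struct queue {
        struct cb *head;
        struct cb **tail;
        long qlen;              /* all queued callbacks */
        long qlen_lazy;         /* lazy subset */
};

static void enqueue(struct queue *q, struct cb *c, bool lazy)
{
        c->next = NULL;
        c->lazy = lazy;
        *q->tail = c;
        q->tail = &c->next;
        q->qlen++;
        if (lazy)
                q->qlen_lazy++;
}

/* Invoke at most blimit callbacks, keep separate total/lazy counts,
 * and requeue whatever the limit left over. */
static void do_batch(struct queue *q, long blimit)
{
        struct cb *list = q->head, *next;
        long count = 0, count_lazy = 0;

        q->head = NULL;
        q->tail = &q->head;

        while (list != NULL) {
                next = list->next;
                if (list->lazy)         /* "invoking" a lazy cb just frees memory */
                        count_lazy++;
                list = next;
                if (++count >= blimit)
                        break;
        }

        if (list != NULL) {             /* requeue the remainder */
                q->head = list;
                for (next = list; next->next != NULL; next = next->next)
                        ;
                q->tail = &next->next;
        }

        q->qlen -= count;
        q->qlen_lazy -= count_lazy;
        printf("invoked %ld (%ld lazy), %ld left (%ld lazy)\n",
               count, count_lazy, q->qlen, q->qlen_lazy);
}

int main(void)
{
        struct queue q = { .head = NULL };
        struct cb c[5];
        int i;

        q.tail = &q.head;
        for (i = 0; i < 5; i++)
                enqueue(&q, &c[i], i == 1 || i == 4);   /* two lazy callbacks */

        do_batch(&q, 3);        /* invoked 3 (1 lazy), 2 left (1 lazy) */
        do_batch(&q, 3);        /* invoked 2 (1 lazy), 0 left (0 lazy) */
        return 0;
}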
1494 1604
1495#ifdef CONFIG_SMP
1496
1497/* 1605/*
1498 * Scan the leaf rcu_node structures, processing dyntick state for any that 1606 * Scan the leaf rcu_node structures, processing dyntick state for any that
1499 * have not yet encountered a quiescent state, using the function specified. 1607 * have not yet encountered a quiescent state, using the function specified.
@@ -1616,15 +1724,6 @@ unlock_fqs_ret:
1616 trace_rcu_utilization("End fqs"); 1724 trace_rcu_utilization("End fqs");
1617} 1725}
1618 1726
1619#else /* #ifdef CONFIG_SMP */
1620
1621static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1622{
1623 set_need_resched();
1624}
1625
1626#endif /* #else #ifdef CONFIG_SMP */
1627
1628/* 1727/*
1629 * This does the RCU core processing work for the specified rcu_state 1728 * This does the RCU core processing work for the specified rcu_state
1630 * and rcu_data structures. This may be called only from the CPU to 1729 * and rcu_data structures. This may be called only from the CPU to
@@ -1702,11 +1801,12 @@ static void invoke_rcu_core(void)
1702 1801
1703static void 1802static void
1704__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 1803__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1705 struct rcu_state *rsp) 1804 struct rcu_state *rsp, bool lazy)
1706{ 1805{
1707 unsigned long flags; 1806 unsigned long flags;
1708 struct rcu_data *rdp; 1807 struct rcu_data *rdp;
1709 1808
1809 WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
1710 debug_rcu_head_queue(head); 1810 debug_rcu_head_queue(head);
1711 head->func = func; 1811 head->func = func;
1712 head->next = NULL; 1812 head->next = NULL;
@@ -1720,18 +1820,21 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1720 * a quiescent state betweentimes. 1820 * a quiescent state betweentimes.
1721 */ 1821 */
1722 local_irq_save(flags); 1822 local_irq_save(flags);
1823 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
1723 rdp = this_cpu_ptr(rsp->rda); 1824 rdp = this_cpu_ptr(rsp->rda);
1724 1825
1725 /* Add the callback to our list. */ 1826 /* Add the callback to our list. */
1726 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1827 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1727 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1828 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1728 rdp->qlen++; 1829 rdp->qlen++;
1830 if (lazy)
1831 rdp->qlen_lazy++;
1729 1832
1730 if (__is_kfree_rcu_offset((unsigned long)func)) 1833 if (__is_kfree_rcu_offset((unsigned long)func))
1731 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, 1834 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
1732 rdp->qlen); 1835 rdp->qlen_lazy, rdp->qlen);
1733 else 1836 else
1734 trace_rcu_callback(rsp->name, head, rdp->qlen); 1837 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
1735 1838
1736 /* If interrupts were disabled, don't dive into RCU core. */ 1839 /* If interrupts were disabled, don't dive into RCU core. */
1737 if (irqs_disabled_flags(flags)) { 1840 if (irqs_disabled_flags(flags)) {
@@ -1778,16 +1881,16 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1778 */ 1881 */
1779void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1882void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1780{ 1883{
1781 __call_rcu(head, func, &rcu_sched_state); 1884 __call_rcu(head, func, &rcu_sched_state, 0);
1782} 1885}
1783EXPORT_SYMBOL_GPL(call_rcu_sched); 1886EXPORT_SYMBOL_GPL(call_rcu_sched);
1784 1887
1785/* 1888/*
1786 * Queue an RCU for invocation after a quicker grace period. 1889 * Queue an RCU callback for invocation after a quicker grace period.
1787 */ 1890 */
1788void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1891void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1789{ 1892{
1790 __call_rcu(head, func, &rcu_bh_state); 1893 __call_rcu(head, func, &rcu_bh_state, 0);
1791} 1894}
1792EXPORT_SYMBOL_GPL(call_rcu_bh); 1895EXPORT_SYMBOL_GPL(call_rcu_bh);
1793 1896
@@ -1816,6 +1919,10 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1816 */ 1919 */
1817void synchronize_sched(void) 1920void synchronize_sched(void)
1818{ 1921{
1922 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
1923 !lock_is_held(&rcu_lock_map) &&
1924 !lock_is_held(&rcu_sched_lock_map),
1925 "Illegal synchronize_sched() in RCU-sched read-side critical section");
1819 if (rcu_blocking_is_gp()) 1926 if (rcu_blocking_is_gp())
1820 return; 1927 return;
1821 wait_rcu_gp(call_rcu_sched); 1928 wait_rcu_gp(call_rcu_sched);
@@ -1833,12 +1940,137 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
1833 */ 1940 */
1834void synchronize_rcu_bh(void) 1941void synchronize_rcu_bh(void)
1835{ 1942{
1943 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
1944 !lock_is_held(&rcu_lock_map) &&
1945 !lock_is_held(&rcu_sched_lock_map),
1946 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
1836 if (rcu_blocking_is_gp()) 1947 if (rcu_blocking_is_gp())
1837 return; 1948 return;
1838 wait_rcu_gp(call_rcu_bh); 1949 wait_rcu_gp(call_rcu_bh);
1839} 1950}
1840EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 1951EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1841 1952
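The rcu_lockdep_assert() calls added to synchronize_sched() and synchronize_rcu_bh() use lockdep's lock maps to complain when a blocking grace-period wait is issued from inside the matching read-side critical section, which could never complete. A rough user-space analogue of that sanity check — a per-thread nesting counter instead of lockdep, purely illustrative:

#include <assert.h>
#include <stdio.h>

/* Per-thread read-side nesting depth (GCC __thread), standing in for
 * what lockdep's rcu_lock_map tracks in the kernel. */
static __thread int read_nesting;

static void toy_read_lock(void)   { read_nesting++; }
static void toy_read_unlock(void) { read_nesting--; }

/* A blocking grace-period wait inside a reader would wait on itself. */
static void toy_synchronize(void)
{
        assert(read_nesting == 0 &&
               "illegal toy_synchronize() in read-side critical section");
        /* ... wait here for all pre-existing readers to finish ... */
}

int main(void)
{
        toy_read_lock();
        /* toy_synchronize();     would trip the assertion: self-deadlock */
        toy_read_unlock();

        toy_synchronize();        /* legal outside any reader */
        printf("ok\n");
        return 0;
}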
1953static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1954static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1955
1956static int synchronize_sched_expedited_cpu_stop(void *data)
1957{
1958 /*
1959 * There must be a full memory barrier on each affected CPU
1960 * between the time that try_stop_cpus() is called and the
1961 * time that it returns.
1962 *
1963 * In the current initial implementation of cpu_stop, the
1964 * above condition is already met when the control reaches
1965 * this point and the following smp_mb() is not strictly
1966 * necessary. Do smp_mb() anyway for documentation and
1967 * robustness against future implementation changes.
1968 */
1969 smp_mb(); /* See above comment block. */
1970 return 0;
1971}
1972
1973/**
1974 * synchronize_sched_expedited - Brute-force RCU-sched grace period
1975 *
1976 * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
1977 * approach to force the grace period to end quickly. This consumes
1978 * significant time on all CPUs and is unfriendly to real-time workloads,
1979 * so is thus not recommended for any sort of common-case code. In fact,
1980 * if you are using synchronize_sched_expedited() in a loop, please
1981 * restructure your code to batch your updates, and then use a single
1982 * synchronize_sched() instead.
1983 *
1984 * Note that it is illegal to call this function while holding any lock
1985 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
1986 * to call this function from a CPU-hotplug notifier. Failing to observe
1987 * these restrictions will result in deadlock.
1988 *
1989 * This implementation can be thought of as an application of ticket
1990 * locking to RCU, with sync_sched_expedited_started and
1991 * sync_sched_expedited_done taking on the roles of the halves
1992 * of the ticket-lock word. Each task atomically increments
1993 * sync_sched_expedited_started upon entry, snapshotting the old value,
1994 * then attempts to stop all the CPUs. If this succeeds, then each
1995 * CPU will have executed a context switch, resulting in an RCU-sched
1996 * grace period. We are then done, so we use atomic_cmpxchg() to
1997 * update sync_sched_expedited_done to match our snapshot -- but
1998 * only if someone else has not already advanced past our snapshot.
1999 *
2000 * On the other hand, if try_stop_cpus() fails, we check the value
2001 * of sync_sched_expedited_done. If it has advanced past our
2002 * initial snapshot, then someone else must have forced a grace period
2003 * some time after we took our snapshot. In this case, our work is
2004 * done for us, and we can simply return. Otherwise, we try again,
2005 * but keep our initial snapshot for purposes of checking for someone
2006 * doing our work for us.
2007 *
2008 * If we fail too many times in a row, we fall back to synchronize_sched().
2009 */
2010void synchronize_sched_expedited(void)
2011{
2012 int firstsnap, s, snap, trycount = 0;
2013
2014 /* Note that atomic_inc_return() implies full memory barrier. */
2015 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
2016 get_online_cpus();
2017 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
2018
2019 /*
2020 * Each pass through the following loop attempts to force a
2021 * context switch on each CPU.
2022 */
2023 while (try_stop_cpus(cpu_online_mask,
2024 synchronize_sched_expedited_cpu_stop,
2025 NULL) == -EAGAIN) {
2026 put_online_cpus();
2027
2028 /* No joy, try again later. Or just synchronize_sched(). */
2029 if (trycount++ < 10)
2030 udelay(trycount * num_online_cpus());
2031 else {
2032 synchronize_sched();
2033 return;
2034 }
2035
2036 /* Check to see if someone else did our work for us. */
2037 s = atomic_read(&sync_sched_expedited_done);
2038 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
2039 smp_mb(); /* ensure test happens before caller kfree */
2040 return;
2041 }
2042
2043 /*
2044 * Refetching sync_sched_expedited_started allows later
2045 * callers to piggyback on our grace period. We subtract
2046 * 1 to get the same token that the last incrementer got.
2047 * We retry after they started, so our grace period works
2048 * for them, and they started after our first try, so their
2049 * grace period works for us.
2050 */
2051 get_online_cpus();
2052 snap = atomic_read(&sync_sched_expedited_started);
2053 smp_mb(); /* ensure read is before try_stop_cpus(). */
2054 }
2055
2056 /*
2057 * Everyone up to our most recent fetch is covered by our grace
2058 * period. Update the counter, but only if our work is still
2059 * relevant -- which it won't be if someone who started later
2060 * than we did beat us to the punch.
2061 */
2062 do {
2063 s = atomic_read(&sync_sched_expedited_done);
2064 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
2065 smp_mb(); /* ensure test happens before caller kfree */
2066 break;
2067 }
2068 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
2069
2070 put_online_cpus();
2071}
2072EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
2073
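The started/done atomics above act like the two halves of a ticket lock: each caller snapshots "started" on entry, and if "done" later reaches or passes that snapshot, a concurrent caller's grace period already covers it. A compressed, single-threaded sketch of that bookkeeping, including the wraparound-safe comparison that UINT_CMP_GE() provides in the kernel:

#include <stdbool.h>
#include <stdio.h>

static unsigned started;        /* toy sync_sched_expedited_started */
static unsigned done;           /* toy sync_sched_expedited_done */

/* Wraparound-safe "a >= b", in the spirit of UINT_CMP_GE(). */
static bool cmp_ge(unsigned a, unsigned b)
{
        return (int)(a - b) >= 0;
}

int main(void)
{
        /* A slow caller takes its ticket, but its try_stop_cpus() "fails". */
        unsigned firstsnap = ++started;

        /* Meanwhile a fast caller completes a full expedited grace period
         * and publishes it, never moving 'done' backwards. */
        unsigned fast_snap = ++started;
        if (!cmp_ge(done, fast_snap))
                done = fast_snap;

        /* On retry the slow caller notices its work was done for it. */
        if (cmp_ge(done, firstsnap))
                printf("piggybacked: done=%u >= firstsnap=%u\n",
                       done, firstsnap);
        return 0;
}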
1842/* 2074/*
1843 * Check to see if there is any immediate RCU-related work to be done 2075 * Check to see if there is any immediate RCU-related work to be done
1844 * by the current CPU, for the specified type of RCU, returning 1 if so. 2076 * by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -1932,7 +2164,7 @@ static int rcu_cpu_has_callbacks(int cpu)
1932 /* RCU callbacks either ready or pending? */ 2164 /* RCU callbacks either ready or pending? */
1933 return per_cpu(rcu_sched_data, cpu).nxtlist || 2165 return per_cpu(rcu_sched_data, cpu).nxtlist ||
1934 per_cpu(rcu_bh_data, cpu).nxtlist || 2166 per_cpu(rcu_bh_data, cpu).nxtlist ||
1935 rcu_preempt_needs_cpu(cpu); 2167 rcu_preempt_cpu_has_callbacks(cpu);
1936} 2168}
1937 2169
1938static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 2170static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
@@ -2027,9 +2259,10 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2027 rdp->nxtlist = NULL; 2259 rdp->nxtlist = NULL;
2028 for (i = 0; i < RCU_NEXT_SIZE; i++) 2260 for (i = 0; i < RCU_NEXT_SIZE; i++)
2029 rdp->nxttail[i] = &rdp->nxtlist; 2261 rdp->nxttail[i] = &rdp->nxtlist;
2262 rdp->qlen_lazy = 0;
2030 rdp->qlen = 0; 2263 rdp->qlen = 0;
2031 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2264 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
2032 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); 2265 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
2033 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 2266 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
2034 rdp->cpu = cpu; 2267 rdp->cpu = cpu;
2035 rdp->rsp = rsp; 2268 rdp->rsp = rsp;
@@ -2057,7 +2290,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2057 rdp->qlen_last_fqs_check = 0; 2290 rdp->qlen_last_fqs_check = 0;
2058 rdp->n_force_qs_snap = rsp->n_force_qs; 2291 rdp->n_force_qs_snap = rsp->n_force_qs;
2059 rdp->blimit = blimit; 2292 rdp->blimit = blimit;
2060 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; 2293 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
2061 atomic_set(&rdp->dynticks->dynticks, 2294 atomic_set(&rdp->dynticks->dynticks,
2062 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 2295 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2063 rcu_prepare_for_idle_init(cpu); 2296 rcu_prepare_for_idle_init(cpu);
@@ -2139,16 +2372,18 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2139 * touch any data without introducing corruption. We send the 2372 * touch any data without introducing corruption. We send the
2140 * dying CPU's callbacks to an arbitrarily chosen online CPU. 2373 * dying CPU's callbacks to an arbitrarily chosen online CPU.
2141 */ 2374 */
2142 rcu_send_cbs_to_online(&rcu_bh_state); 2375 rcu_cleanup_dying_cpu(&rcu_bh_state);
2143 rcu_send_cbs_to_online(&rcu_sched_state); 2376 rcu_cleanup_dying_cpu(&rcu_sched_state);
2144 rcu_preempt_send_cbs_to_online(); 2377 rcu_preempt_cleanup_dying_cpu();
2145 rcu_cleanup_after_idle(cpu); 2378 rcu_cleanup_after_idle(cpu);
2146 break; 2379 break;
2147 case CPU_DEAD: 2380 case CPU_DEAD:
2148 case CPU_DEAD_FROZEN: 2381 case CPU_DEAD_FROZEN:
2149 case CPU_UP_CANCELED: 2382 case CPU_UP_CANCELED:
2150 case CPU_UP_CANCELED_FROZEN: 2383 case CPU_UP_CANCELED_FROZEN:
2151 rcu_offline_cpu(cpu); 2384 rcu_cleanup_dead_cpu(cpu, &rcu_bh_state);
2385 rcu_cleanup_dead_cpu(cpu, &rcu_sched_state);
2386 rcu_preempt_cleanup_dead_cpu(cpu);
2152 break; 2387 break;
2153 default: 2388 default:
2154 break; 2389 break;
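The notifier now splits CPU-offline work in two: rcu_cleanup_dying_cpu() runs at CPU_DYING time, in stop_machine() context where blocking is forbidden, while rcu_cleanup_dead_cpu() runs at CPU_DEAD time from process context where the kthread and rcu_node cleanup may take locks and block. A toy dispatch showing that split (all names invented):

#include <stdio.h>

enum toy_action { TOY_CPU_DYING, TOY_CPU_DEAD };

/* stop_machine()-like context: quick, non-blocking bookkeeping only. */
static void toy_cleanup_dying_cpu(int cpu)
{
        printf("cpu%d dying: splice callbacks, record quiescent state\n", cpu);
}

/* Process context, possibly much later: blocking cleanup is fine here. */
static void toy_cleanup_dead_cpu(int cpu)
{
        printf("cpu%d dead: stop kthreads, clear rcu_node bits\n", cpu);
}

static void toy_cpu_notify(enum toy_action action, int cpu)
{
        switch (action) {
        case TOY_CPU_DYING:
                toy_cleanup_dying_cpu(cpu);
                break;
        case TOY_CPU_DEAD:
                toy_cleanup_dead_cpu(cpu);
                break;
        }
}

int main(void)
{
        toy_cpu_notify(TOY_CPU_DYING, 3);
        toy_cpu_notify(TOY_CPU_DEAD, 3);
        return 0;
}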
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index fddff92d6676..cdd1be0a4072 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -239,6 +239,12 @@ struct rcu_data {
239 bool preemptible; /* Preemptible RCU? */ 239 bool preemptible; /* Preemptible RCU? */
240 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 240 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
241 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 241 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
242#ifdef CONFIG_RCU_CPU_STALL_INFO
243 unsigned long ticks_this_gp; /* The number of scheduling-clock */
244 /* ticks this CPU has handled */
245 /* during and after the last grace */
246 /* period it is aware of. */
247#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
242 248
243 /* 2) batch handling */ 249 /* 2) batch handling */
244 /* 250 /*
@@ -265,7 +271,8 @@ struct rcu_data {
265 */ 271 */
266 struct rcu_head *nxtlist; 272 struct rcu_head *nxtlist;
267 struct rcu_head **nxttail[RCU_NEXT_SIZE]; 273 struct rcu_head **nxttail[RCU_NEXT_SIZE];
268 long qlen; /* # of queued callbacks */ 274 long qlen_lazy; /* # of lazy queued callbacks */
275 long qlen; /* # of queued callbacks, incl lazy */
269 long qlen_last_fqs_check; 276 long qlen_last_fqs_check;
270 /* qlen at last check for QS forcing */ 277 /* qlen at last check for QS forcing */
271 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 278 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
@@ -282,7 +289,6 @@ struct rcu_data {
282 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 289 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
283 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 290 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
284 unsigned long offline_fqs; /* Kicked due to being offline. */ 291 unsigned long offline_fqs; /* Kicked due to being offline. */
285 unsigned long resched_ipi; /* Sent a resched IPI. */
286 292
287 /* 5) __rcu_pending() statistics. */ 293 /* 5) __rcu_pending() statistics. */
288 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 294 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
@@ -313,12 +319,6 @@ struct rcu_data {
313#else 319#else
314#define RCU_STALL_DELAY_DELTA 0 320#define RCU_STALL_DELAY_DELTA 0
315#endif 321#endif
316
317#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
318 RCU_STALL_DELAY_DELTA)
319 /* for rsp->jiffies_stall */
320#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
321 /* for rsp->jiffies_stall */
322#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 322#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
323 /* to take at least one */ 323 /* to take at least one */
324 /* scheduling clock irq */ 324 /* scheduling clock irq */
@@ -438,8 +438,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
438static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 438static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
439 struct rcu_node *rnp, 439 struct rcu_node *rnp,
440 struct rcu_data *rdp); 440 struct rcu_data *rdp);
441static void rcu_preempt_offline_cpu(int cpu);
442#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 441#endif /* #ifdef CONFIG_HOTPLUG_CPU */
442static void rcu_preempt_cleanup_dead_cpu(int cpu);
443static void rcu_preempt_check_callbacks(int cpu); 443static void rcu_preempt_check_callbacks(int cpu);
444static void rcu_preempt_process_callbacks(void); 444static void rcu_preempt_process_callbacks(void);
445void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 445void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
@@ -448,9 +448,9 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
448 bool wake); 448 bool wake);
449#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 449#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
450static int rcu_preempt_pending(int cpu); 450static int rcu_preempt_pending(int cpu);
451static int rcu_preempt_needs_cpu(int cpu); 451static int rcu_preempt_cpu_has_callbacks(int cpu);
452static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 452static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
453static void rcu_preempt_send_cbs_to_online(void); 453static void rcu_preempt_cleanup_dying_cpu(void);
454static void __init __rcu_init_preempt(void); 454static void __init __rcu_init_preempt(void);
455static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 455static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
456static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 456static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -471,5 +471,10 @@ static void __cpuinit rcu_prepare_kthreads(int cpu);
471static void rcu_prepare_for_idle_init(int cpu); 471static void rcu_prepare_for_idle_init(int cpu);
472static void rcu_cleanup_after_idle(int cpu); 472static void rcu_cleanup_after_idle(int cpu);
473static void rcu_prepare_for_idle(int cpu); 473static void rcu_prepare_for_idle(int cpu);
474static void print_cpu_stall_info_begin(void);
475static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
476static void print_cpu_stall_info_end(void);
477static void zero_cpu_stall_ticks(struct rcu_data *rdp);
478static void increment_cpu_stall_ticks(void);
474 479
475#endif /* #ifndef RCU_TREE_NONCORE */ 480#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 8bb35d73e1f9..c023464816be 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,7 +25,6 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/stop_machine.h>
29 28
30#define RCU_KTHREAD_PRIO 1 29#define RCU_KTHREAD_PRIO 1
31 30
@@ -63,7 +62,10 @@ static void __init rcu_bootup_announce_oddness(void)
63 printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); 62 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
64#endif 63#endif
65#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) 64#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
66 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); 65 printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n");
66#endif
67#if defined(CONFIG_RCU_CPU_STALL_INFO)
68 printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n");
67#endif 69#endif
68#if NUM_RCU_LVL_4 != 0 70#if NUM_RCU_LVL_4 != 0
69 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); 71 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
@@ -490,6 +492,31 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
490 492
491#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ 493#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
492 494
495#ifdef CONFIG_RCU_CPU_STALL_INFO
496
497static void rcu_print_task_stall_begin(struct rcu_node *rnp)
498{
499 printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
500 rnp->level, rnp->grplo, rnp->grphi);
501}
502
503static void rcu_print_task_stall_end(void)
504{
505 printk(KERN_CONT "\n");
506}
507
508#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
509
510static void rcu_print_task_stall_begin(struct rcu_node *rnp)
511{
512}
513
514static void rcu_print_task_stall_end(void)
515{
516}
517
518#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
519
493/* 520/*
494 * Scan the current list of tasks blocked within RCU read-side critical 521 * Scan the current list of tasks blocked within RCU read-side critical
495 * sections, printing out the tid of each. 522 * sections, printing out the tid of each.
@@ -501,12 +528,14 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
501 528
502 if (!rcu_preempt_blocked_readers_cgp(rnp)) 529 if (!rcu_preempt_blocked_readers_cgp(rnp))
503 return 0; 530 return 0;
531 rcu_print_task_stall_begin(rnp);
504 t = list_entry(rnp->gp_tasks, 532 t = list_entry(rnp->gp_tasks,
505 struct task_struct, rcu_node_entry); 533 struct task_struct, rcu_node_entry);
506 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { 534 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
507 printk(" P%d", t->pid); 535 printk(KERN_CONT " P%d", t->pid);
508 ndetected++; 536 ndetected++;
509 } 537 }
538 rcu_print_task_stall_end();
510 return ndetected; 539 return ndetected;
511} 540}
512 541
@@ -581,7 +610,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
581 * absolutely necessary, but this is a good performance/complexity 610 * absolutely necessary, but this is a good performance/complexity
582 * tradeoff. 611 * tradeoff.
583 */ 612 */
584 if (rcu_preempt_blocked_readers_cgp(rnp)) 613 if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)
585 retval |= RCU_OFL_TASKS_NORM_GP; 614 retval |= RCU_OFL_TASKS_NORM_GP;
586 if (rcu_preempted_readers_exp(rnp)) 615 if (rcu_preempted_readers_exp(rnp))
587 retval |= RCU_OFL_TASKS_EXP_GP; 616 retval |= RCU_OFL_TASKS_EXP_GP;
@@ -618,16 +647,16 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
618 return retval; 647 return retval;
619} 648}
620 649
650#endif /* #ifdef CONFIG_HOTPLUG_CPU */
651
621/* 652/*
622 * Do CPU-offline processing for preemptible RCU. 653 * Do CPU-offline processing for preemptible RCU.
623 */ 654 */
624static void rcu_preempt_offline_cpu(int cpu) 655static void rcu_preempt_cleanup_dead_cpu(int cpu)
625{ 656{
626 __rcu_offline_cpu(cpu, &rcu_preempt_state); 657 rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state);
627} 658}
628 659
629#endif /* #ifdef CONFIG_HOTPLUG_CPU */
630
631/* 660/*
632 * Check for a quiescent state from the current CPU. When a task blocks, 661 * Check for a quiescent state from the current CPU. When a task blocks,
633 * the task is recorded in the corresponding CPU's rcu_node structure, 662 * the task is recorded in the corresponding CPU's rcu_node structure,
@@ -671,10 +700,24 @@ static void rcu_preempt_do_callbacks(void)
671 */ 700 */
672void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 701void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
673{ 702{
674 __call_rcu(head, func, &rcu_preempt_state); 703 __call_rcu(head, func, &rcu_preempt_state, 0);
675} 704}
676EXPORT_SYMBOL_GPL(call_rcu); 705EXPORT_SYMBOL_GPL(call_rcu);
677 706
707/*
708 * Queue an RCU callback for lazy invocation after a grace period.
709 * This will likely be later named something like "call_rcu_lazy()",
710 * but this change will require some way of tagging the lazy RCU
711 * callbacks in the list of pending callbacks. Until then, this
712 * function may only be called from __kfree_rcu().
713 */
714void kfree_call_rcu(struct rcu_head *head,
715 void (*func)(struct rcu_head *rcu))
716{
717 __call_rcu(head, func, &rcu_preempt_state, 1);
718}
719EXPORT_SYMBOL_GPL(kfree_call_rcu);
720
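kfree_call_rcu() passes lazy=1 because its callbacks do nothing but free the enclosing object: kfree_rcu() stores the offset of the rcu_head within the structure in place of a real function pointer, and __is_kfree_rcu_offset() (also checked in __call_rcu() above for tracing) recognizes such small values. A stand-alone sketch of that encoding trick — toy names, and the integer/function-pointer casts are the usual implementation-defined kernel idiom:

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct toy_rcu_head {
        struct toy_rcu_head *next;
        void (*func)(struct toy_rcu_head *);
};

/* An object whose only post-grace-period work is freeing itself. */
struct widget {
        int payload;
        struct toy_rcu_head rh;
};

/* Small values cannot be valid function pointers, so they can double as
 * "offset of the rcu_head inside the enclosing object" markers. */
static int is_offset(unsigned long f)
{
        return f < 4096;
}

static void toy_reclaim(struct toy_rcu_head *rh)
{
        unsigned long f = (unsigned long)rh->func;

        if (is_offset(f)) {
                free((char *)rh - f);   /* lazy: just free the object */
                printf("freed enclosing object (offset %lu)\n", f);
        } else {
                rh->func(rh);           /* ordinary callback */
        }
}

static void report(struct toy_rcu_head *rh)
{
        (void)rh;
        printf("ordinary callback ran\n");
}

int main(void)
{
        struct widget *w = malloc(sizeof(*w));
        struct toy_rcu_head ordinary;

        if (w == NULL)
                return 1;

        /* "kfree_rcu(w, rh)": record the offset of rh within *w. */
        w->rh.func = (void (*)(struct toy_rcu_head *))
                     (unsigned long)offsetof(struct widget, rh);
        toy_reclaim(&w->rh);

        ordinary.func = report;
        toy_reclaim(&ordinary);
        return 0;
}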
678/** 721/**
679 * synchronize_rcu - wait until a grace period has elapsed. 722 * synchronize_rcu - wait until a grace period has elapsed.
680 * 723 *
@@ -688,6 +731,10 @@ EXPORT_SYMBOL_GPL(call_rcu);
688 */ 731 */
689void synchronize_rcu(void) 732void synchronize_rcu(void)
690{ 733{
734 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
735 !lock_is_held(&rcu_lock_map) &&
736 !lock_is_held(&rcu_sched_lock_map),
737 "Illegal synchronize_rcu() in RCU read-side critical section");
691 if (!rcu_scheduler_active) 738 if (!rcu_scheduler_active)
692 return; 739 return;
693 wait_rcu_gp(call_rcu); 740 wait_rcu_gp(call_rcu);
@@ -788,10 +835,22 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
788 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ 835 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
789} 836}
790 837
791/* 838/**
792 * Wait for an rcu-preempt grace period, but expedite it. The basic idea 839 * synchronize_rcu_expedited - Brute-force RCU grace period
793 * is to invoke synchronize_sched_expedited() to push all the tasks to 840 *
794 * the ->blkd_tasks lists and wait for this list to drain. 841 * Wait for an RCU-preempt grace period, but expedite it. The basic
842 * idea is to invoke synchronize_sched_expedited() to push all the tasks to
843 * the ->blkd_tasks lists and wait for this list to drain. This consumes
844 * significant time on all CPUs and is unfriendly to real-time workloads,
845 * so is thus not recommended for any sort of common-case code.
846 * In fact, if you are using synchronize_rcu_expedited() in a loop,
847 * please restructure your code to batch your updates, and then use a
848 * single synchronize_rcu() instead.
849 *
850 * Note that it is illegal to call this function while holding any lock
851 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
852 * to call this function from a CPU-hotplug notifier. Failing to observe
853 * these restrictions will result in deadlock.
795 */ 854 */
796void synchronize_rcu_expedited(void) 855void synchronize_rcu_expedited(void)
797{ 856{
@@ -869,9 +928,9 @@ static int rcu_preempt_pending(int cpu)
869} 928}
870 929
871/* 930/*
872 * Does preemptible RCU need the CPU to stay out of dynticks mode? 931 * Does preemptible RCU have callbacks on this CPU?
873 */ 932 */
874static int rcu_preempt_needs_cpu(int cpu) 933static int rcu_preempt_cpu_has_callbacks(int cpu)
875{ 934{
876 return !!per_cpu(rcu_preempt_data, cpu).nxtlist; 935 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
877} 936}
@@ -894,11 +953,12 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
894} 953}
895 954
896/* 955/*
897 * Move preemptible RCU's callbacks from dying CPU to other online CPU. 956 * Move preemptible RCU's callbacks from dying CPU to other online CPU
957 * and record a quiescent state.
898 */ 958 */
899static void rcu_preempt_send_cbs_to_online(void) 959static void rcu_preempt_cleanup_dying_cpu(void)
900{ 960{
901 rcu_send_cbs_to_online(&rcu_preempt_state); 961 rcu_cleanup_dying_cpu(&rcu_preempt_state);
902} 962}
903 963
904/* 964/*
@@ -1034,16 +1094,16 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1034 return 0; 1094 return 0;
1035} 1095}
1036 1096
1097#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1098
1037/* 1099/*
1038 * Because preemptible RCU does not exist, it never needs CPU-offline 1100 * Because preemptible RCU does not exist, it never needs CPU-offline
1039 * processing. 1101 * processing.
1040 */ 1102 */
1041static void rcu_preempt_offline_cpu(int cpu) 1103static void rcu_preempt_cleanup_dead_cpu(int cpu)
1042{ 1104{
1043} 1105}
1044 1106
1045#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1046
1047/* 1107/*
1048 * Because preemptible RCU does not exist, it never has any callbacks 1108 * Because preemptible RCU does not exist, it never has any callbacks
1049 * to check. 1109 * to check.
@@ -1061,6 +1121,22 @@ static void rcu_preempt_process_callbacks(void)
1061} 1121}
1062 1122
1063/* 1123/*
1124 * Queue an RCU callback for lazy invocation after a grace period.
1125 * This will likely be later named something like "call_rcu_lazy()",
1126 * but this change will require some way of tagging the lazy RCU
1127 * callbacks in the list of pending callbacks. Until then, this
1128 * function may only be called from __kfree_rcu().
1129 *
1130 * Because there is no preemptible RCU, we use RCU-sched instead.
1131 */
1132void kfree_call_rcu(struct rcu_head *head,
1133 void (*func)(struct rcu_head *rcu))
1134{
1135 __call_rcu(head, func, &rcu_sched_state, 1);
1136}
1137EXPORT_SYMBOL_GPL(kfree_call_rcu);
1138
1139/*
1064 * Wait for an rcu-preempt grace period, but make it happen quickly. 1140 * Wait for an rcu-preempt grace period, but make it happen quickly.
1065 * But because preemptible RCU does not exist, map to rcu-sched. 1141 * But because preemptible RCU does not exist, map to rcu-sched.
1066 */ 1142 */
@@ -1093,9 +1169,9 @@ static int rcu_preempt_pending(int cpu)
1093} 1169}
1094 1170
1095/* 1171/*
1096 * Because preemptible RCU does not exist, it never needs any CPU. 1172 * Because preemptible RCU does not exist, it never has callbacks.
1097 */ 1173 */
1098static int rcu_preempt_needs_cpu(int cpu) 1174static int rcu_preempt_cpu_has_callbacks(int cpu)
1099{ 1175{
1100 return 0; 1176 return 0;
1101} 1177}
@@ -1119,9 +1195,9 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1119} 1195}
1120 1196
1121/* 1197/*
1122 * Because there is no preemptible RCU, there are no callbacks to move. 1198 * Because there is no preemptible RCU, there is no cleanup to do.
1123 */ 1199 */
1124static void rcu_preempt_send_cbs_to_online(void) 1200static void rcu_preempt_cleanup_dying_cpu(void)
1125{ 1201{
1126} 1202}
1127 1203
@@ -1823,132 +1899,6 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
1823 1899
1824#endif /* #else #ifdef CONFIG_RCU_BOOST */ 1900#endif /* #else #ifdef CONFIG_RCU_BOOST */
1825 1901
1826#ifndef CONFIG_SMP
1827
1828void synchronize_sched_expedited(void)
1829{
1830 cond_resched();
1831}
1832EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1833
1834#else /* #ifndef CONFIG_SMP */
1835
1836static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1837static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1838
1839static int synchronize_sched_expedited_cpu_stop(void *data)
1840{
1841 /*
1842 * There must be a full memory barrier on each affected CPU
1843 * between the time that try_stop_cpus() is called and the
1844 * time that it returns.
1845 *
1846 * In the current initial implementation of cpu_stop, the
1847 * above condition is already met when the control reaches
1848 * this point and the following smp_mb() is not strictly
1849 * necessary. Do smp_mb() anyway for documentation and
1850 * robustness against future implementation changes.
1851 */
1852 smp_mb(); /* See above comment block. */
1853 return 0;
1854}
1855
1856/*
1857 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
1858 * approach to force grace period to end quickly. This consumes
1859 * significant time on all CPUs, and is thus not recommended for
1860 * any sort of common-case code.
1861 *
1862 * Note that it is illegal to call this function while holding any
1863 * lock that is acquired by a CPU-hotplug notifier. Failing to
1864 * observe this restriction will result in deadlock.
1865 *
1866 * This implementation can be thought of as an application of ticket
1867 * locking to RCU, with sync_sched_expedited_started and
1868 * sync_sched_expedited_done taking on the roles of the halves
1869 * of the ticket-lock word. Each task atomically increments
1870 * sync_sched_expedited_started upon entry, snapshotting the old value,
1871 * then attempts to stop all the CPUs. If this succeeds, then each
1872 * CPU will have executed a context switch, resulting in an RCU-sched
1873 * grace period. We are then done, so we use atomic_cmpxchg() to
1874 * update sync_sched_expedited_done to match our snapshot -- but
1875 * only if someone else has not already advanced past our snapshot.
1876 *
1877 * On the other hand, if try_stop_cpus() fails, we check the value
1878 * of sync_sched_expedited_done. If it has advanced past our
1879 * initial snapshot, then someone else must have forced a grace period
1880 * some time after we took our snapshot. In this case, our work is
1881 * done for us, and we can simply return. Otherwise, we try again,
1882 * but keep our initial snapshot for purposes of checking for someone
1883 * doing our work for us.
1884 *
1885 * If we fail too many times in a row, we fall back to synchronize_sched().
1886 */
1887void synchronize_sched_expedited(void)
1888{
1889 int firstsnap, s, snap, trycount = 0;
1890
1891 /* Note that atomic_inc_return() implies full memory barrier. */
1892 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1893 get_online_cpus();
1894
1895 /*
1896 * Each pass through the following loop attempts to force a
1897 * context switch on each CPU.
1898 */
1899 while (try_stop_cpus(cpu_online_mask,
1900 synchronize_sched_expedited_cpu_stop,
1901 NULL) == -EAGAIN) {
1902 put_online_cpus();
1903
1904 /* No joy, try again later. Or just synchronize_sched(). */
1905 if (trycount++ < 10)
1906 udelay(trycount * num_online_cpus());
1907 else {
1908 synchronize_sched();
1909 return;
1910 }
1911
1912 /* Check to see if someone else did our work for us. */
1913 s = atomic_read(&sync_sched_expedited_done);
1914 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1915 smp_mb(); /* ensure test happens before caller kfree */
1916 return;
1917 }
1918
1919 /*
1920 * Refetching sync_sched_expedited_started allows later
1921 * callers to piggyback on our grace period. We subtract
1922 * 1 to get the same token that the last incrementer got.
1923 * We retry after they started, so our grace period works
1924 * for them, and they started after our first try, so their
1925 * grace period works for us.
1926 */
1927 get_online_cpus();
1928 snap = atomic_read(&sync_sched_expedited_started);
1929 smp_mb(); /* ensure read is before try_stop_cpus(). */
1930 }
1931
1932 /*
1933 * Everyone up to our most recent fetch is covered by our grace
1934 * period. Update the counter, but only if our work is still
1935 * relevant -- which it won't be if someone who started later
1936 * than we did beat us to the punch.
1937 */
1938 do {
1939 s = atomic_read(&sync_sched_expedited_done);
1940 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
1941 smp_mb(); /* ensure test happens before caller kfree */
1942 break;
1943 }
1944 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
1945
1946 put_online_cpus();
1947}
1948EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1949
1950#endif /* #else #ifndef CONFIG_SMP */
1951
1952#if !defined(CONFIG_RCU_FAST_NO_HZ) 1902#if !defined(CONFIG_RCU_FAST_NO_HZ)
1953 1903
1954/* 1904/*
@@ -1981,7 +1931,7 @@ static void rcu_cleanup_after_idle(int cpu)
1981} 1931}
1982 1932
1983/* 1933/*
1984 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=y, 1934 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
1985 * is nothing. 1935 * is nothing.
1986 */ 1936 */
1987static void rcu_prepare_for_idle(int cpu) 1937static void rcu_prepare_for_idle(int cpu)
@@ -2015,6 +1965,9 @@ static void rcu_prepare_for_idle(int cpu)
2015 * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your 1965 * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
2016 * system. And if you are -that- concerned about energy efficiency, 1966 * system. And if you are -that- concerned about energy efficiency,
2017 * just power the system down and be done with it! 1967 * just power the system down and be done with it!
1968 * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
1969 * permitted to sleep in dyntick-idle mode with only lazy RCU
1970 * callbacks pending. Setting this too high can OOM your system.
2018 * 1971 *
2019 * The values below work well in practice. If future workloads require 1972 * The values below work well in practice. If future workloads require
2020 * adjustment, they can be converted into kernel config parameters, though 1973 * adjustment, they can be converted into kernel config parameters, though
@@ -2023,11 +1976,13 @@ static void rcu_prepare_for_idle(int cpu)
2023#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ 1976#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
2024#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ 1977#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
2025#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ 1978#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
1979#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
2026 1980
2027static DEFINE_PER_CPU(int, rcu_dyntick_drain); 1981static DEFINE_PER_CPU(int, rcu_dyntick_drain);
2028static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); 1982static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
2029static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); 1983static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer);
2030static ktime_t rcu_idle_gp_wait; 1984static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */
1985static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */
2031 1986
2032/* 1987/*
2033 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no 1988 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
@@ -2048,6 +2003,48 @@ int rcu_needs_cpu(int cpu)
2048} 2003}
2049 2004
2050/* 2005/*
2006 * Does the specified flavor of RCU have non-lazy callbacks pending on
2007 * the specified CPU? Both RCU flavor and CPU are specified by the
2008 * rcu_data structure.
2009 */
2010static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp)
2011{
2012 return rdp->qlen != rdp->qlen_lazy;
2013}
2014
2015#ifdef CONFIG_TREE_PREEMPT_RCU
2016
2017/*
2018 * Are there non-lazy RCU-preempt callbacks? (There cannot be if there
2019 * is no RCU-preempt in the kernel.)
2020 */
2021static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
2022{
2023 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
2024
2025 return __rcu_cpu_has_nonlazy_callbacks(rdp);
2026}
2027
2028#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2029
2030static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
2031{
2032 return 0;
2033}
2034
2035#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */
2036
2037/*
2038 * Does any flavor of RCU have non-lazy callbacks on the specified CPU?
2039 */
2040static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
2041{
2042 return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) ||
2043 __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) ||
2044 rcu_preempt_cpu_has_nonlazy_callbacks(cpu);
2045}
2046
2047/*
2051 * Timer handler used to force CPU to start pushing its remaining RCU 2048 * Timer handler used to force CPU to start pushing its remaining RCU
2052 * callbacks in the case where it entered dyntick-idle mode with callbacks 2049 * callbacks in the case where it entered dyntick-idle mode with callbacks
2053 * pending. The handler doesn't really need to do anything because the 2050
@@ -2074,6 +2071,8 @@ static void rcu_prepare_for_idle_init(int cpu)
2074 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); 2071 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
2075 2072
2076 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); 2073 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
2074 upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY);
2075 rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000);
2077 firsttime = 0; 2076 firsttime = 0;
2078 } 2077 }
2079} 2078}
@@ -2109,10 +2108,6 @@ static void rcu_cleanup_after_idle(int cpu)
2109 */ 2108 */
2110static void rcu_prepare_for_idle(int cpu) 2109static void rcu_prepare_for_idle(int cpu)
2111{ 2110{
2112 unsigned long flags;
2113
2114 local_irq_save(flags);
2115
2116 /* 2111 /*
2117 * If there are no callbacks on this CPU, enter dyntick-idle mode. 2112 * If there are no callbacks on this CPU, enter dyntick-idle mode.
2118 * Also reset state to avoid prejudicing later attempts. 2113 * Also reset state to avoid prejudicing later attempts.
@@ -2120,7 +2115,6 @@ static void rcu_prepare_for_idle(int cpu)
2120 if (!rcu_cpu_has_callbacks(cpu)) { 2115 if (!rcu_cpu_has_callbacks(cpu)) {
2121 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2116 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
2122 per_cpu(rcu_dyntick_drain, cpu) = 0; 2117 per_cpu(rcu_dyntick_drain, cpu) = 0;
2123 local_irq_restore(flags);
2124 trace_rcu_prep_idle("No callbacks"); 2118 trace_rcu_prep_idle("No callbacks");
2125 return; 2119 return;
2126 } 2120 }
@@ -2130,7 +2124,6 @@ static void rcu_prepare_for_idle(int cpu)
2130 * refrained from disabling the scheduling-clock tick. 2124 * refrained from disabling the scheduling-clock tick.
2131 */ 2125 */
2132 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { 2126 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) {
2133 local_irq_restore(flags);
2134 trace_rcu_prep_idle("In holdoff"); 2127 trace_rcu_prep_idle("In holdoff");
2135 return; 2128 return;
2136 } 2129 }
@@ -2140,18 +2133,22 @@ static void rcu_prepare_for_idle(int cpu)
2140 /* First time through, initialize the counter. */ 2133 /* First time through, initialize the counter. */
2141 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; 2134 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES;
2142 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && 2135 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES &&
2143 !rcu_pending(cpu)) { 2136 !rcu_pending(cpu) &&
2137 !local_softirq_pending()) {
2144 /* Can we go dyntick-idle despite still having callbacks? */ 2138 /* Can we go dyntick-idle despite still having callbacks? */
2145 trace_rcu_prep_idle("Dyntick with callbacks"); 2139 trace_rcu_prep_idle("Dyntick with callbacks");
2146 per_cpu(rcu_dyntick_drain, cpu) = 0; 2140 per_cpu(rcu_dyntick_drain, cpu) = 0;
2147 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2141 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2148 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), 2142 if (rcu_cpu_has_nonlazy_callbacks(cpu))
2149 rcu_idle_gp_wait, HRTIMER_MODE_REL); 2143 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
2144 rcu_idle_gp_wait, HRTIMER_MODE_REL);
2145 else
2146 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
2147 rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL);
2150 return; /* Nothing more to do immediately. */ 2148 return; /* Nothing more to do immediately. */
2151 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2149 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2152 /* We have hit the limit, so time to give up. */ 2150 /* We have hit the limit, so time to give up. */
2153 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2151 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2154 local_irq_restore(flags);
2155 trace_rcu_prep_idle("Begin holdoff"); 2152 trace_rcu_prep_idle("Begin holdoff");
2156 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ 2153 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
2157 return; 2154 return;
@@ -2163,23 +2160,17 @@ static void rcu_prepare_for_idle(int cpu)
2163 */ 2160 */
2164#ifdef CONFIG_TREE_PREEMPT_RCU 2161#ifdef CONFIG_TREE_PREEMPT_RCU
2165 if (per_cpu(rcu_preempt_data, cpu).nxtlist) { 2162 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
2166 local_irq_restore(flags);
2167 rcu_preempt_qs(cpu); 2163 rcu_preempt_qs(cpu);
2168 force_quiescent_state(&rcu_preempt_state, 0); 2164 force_quiescent_state(&rcu_preempt_state, 0);
2169 local_irq_save(flags);
2170 } 2165 }
2171#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 2166#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2172 if (per_cpu(rcu_sched_data, cpu).nxtlist) { 2167 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
2173 local_irq_restore(flags);
2174 rcu_sched_qs(cpu); 2168 rcu_sched_qs(cpu);
2175 force_quiescent_state(&rcu_sched_state, 0); 2169 force_quiescent_state(&rcu_sched_state, 0);
2176 local_irq_save(flags);
2177 } 2170 }
2178 if (per_cpu(rcu_bh_data, cpu).nxtlist) { 2171 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
2179 local_irq_restore(flags);
2180 rcu_bh_qs(cpu); 2172 rcu_bh_qs(cpu);
2181 force_quiescent_state(&rcu_bh_state, 0); 2173 force_quiescent_state(&rcu_bh_state, 0);
2182 local_irq_save(flags);
2183 } 2174 }
2184 2175
2185 /* 2176 /*
@@ -2187,13 +2178,124 @@ static void rcu_prepare_for_idle(int cpu)
2187 * So try forcing the callbacks through the grace period. 2178 * So try forcing the callbacks through the grace period.
2188 */ 2179 */
2189 if (rcu_cpu_has_callbacks(cpu)) { 2180 if (rcu_cpu_has_callbacks(cpu)) {
2190 local_irq_restore(flags);
2191 trace_rcu_prep_idle("More callbacks"); 2181 trace_rcu_prep_idle("More callbacks");
2192 invoke_rcu_core(); 2182 invoke_rcu_core();
2193 } else { 2183 } else
2194 local_irq_restore(flags);
2195 trace_rcu_prep_idle("Callbacks drained"); 2184 trace_rcu_prep_idle("Callbacks drained");
2196 }
2197} 2185}
2198 2186
2199#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2187#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
2188
2189#ifdef CONFIG_RCU_CPU_STALL_INFO
2190
2191#ifdef CONFIG_RCU_FAST_NO_HZ
2192
2193static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2194{
2195 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
2196
2197 sprintf(cp, "drain=%d %c timer=%lld",
2198 per_cpu(rcu_dyntick_drain, cpu),
2199 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.',
2200 hrtimer_active(hrtp)
2201 ? ktime_to_us(hrtimer_get_remaining(hrtp))
2202 : -1);
2203}
2204
2205#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
2206
2207static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2208{
2209}
2210
2211#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
2212
2213/* Initiate the stall-info list. */
2214static void print_cpu_stall_info_begin(void)
2215{
2216 printk(KERN_CONT "\n");
2217}
2218
2219/*
2220 * Print out diagnostic information for the specified stalled CPU.
2221 *
2222 * If the specified CPU is aware of the current RCU grace period
2223 * (flavor specified by rsp), then print the number of scheduling
2224 * clock interrupts the CPU has taken during the time that it has
2225 * been aware. Otherwise, print the number of RCU grace periods
2226 * that this CPU is ignorant of, for example, "1" if the CPU was
2227 * aware of the previous grace period.
2228 *
2229 * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
2230 */
2231static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2232{
2233 char fast_no_hz[72];
2234 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2235 struct rcu_dynticks *rdtp = rdp->dynticks;
2236 char *ticks_title;
2237 unsigned long ticks_value;
2238
2239 if (rsp->gpnum == rdp->gpnum) {
2240 ticks_title = "ticks this GP";
2241 ticks_value = rdp->ticks_this_gp;
2242 } else {
2243 ticks_title = "GPs behind";
2244 ticks_value = rsp->gpnum - rdp->gpnum;
2245 }
2246 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
2247 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n",
2248 cpu, ticks_value, ticks_title,
2249 atomic_read(&rdtp->dynticks) & 0xfff,
2250 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
2251 fast_no_hz);
2252}
2253
2254/* Terminate the stall-info list. */
2255static void print_cpu_stall_info_end(void)
2256{
2257 printk(KERN_ERR "\t");
2258}
2259
2260/* Zero ->ticks_this_gp for all flavors of RCU. */
2261static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2262{
2263 rdp->ticks_this_gp = 0;
2264}
2265
2266/* Increment ->ticks_this_gp for all flavors of RCU. */
2267static void increment_cpu_stall_ticks(void)
2268{
2269 __get_cpu_var(rcu_sched_data).ticks_this_gp++;
2270 __get_cpu_var(rcu_bh_data).ticks_this_gp++;
2271#ifdef CONFIG_TREE_PREEMPT_RCU
2272 __get_cpu_var(rcu_preempt_data).ticks_this_gp++;
2273#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2274}
2275
2276#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
2277
2278static void print_cpu_stall_info_begin(void)
2279{
2280 printk(KERN_CONT " {");
2281}
2282
2283static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2284{
2285 printk(KERN_CONT " %d", cpu);
2286}
2287
2288static void print_cpu_stall_info_end(void)
2289{
2290 printk(KERN_CONT "} ");
2291}
2292
2293static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2294{
2295}
2296
2297static void increment_cpu_stall_ticks(void)
2298{
2299}
2300
2301#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
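
The CONFIG_RCU_CPU_STALL_INFO block added above changes what a stall report looks like: with the option enabled each stalled CPU gets a detailed per-CPU line, while the fallback versions only print the CPU numbers inside braces. A small standalone sketch that mimics the two output shapes (plain printf, made-up field values, printk level prefixes omitted; illustrative only, not part of this patch):

#include <stdio.h>

int main(void)
{
        int cpus[] = { 1, 3 };
        int i;

        /* CONFIG_RCU_CPU_STALL_INFO=y: one detailed line per stalled CPU */
        for (i = 0; i < 2; i++)
                printf("\t%d: (%lu %s) idle=%03x/%llx/%d %s\n",
                       cpus[i], 42UL, "ticks this GP",
                       0x144, 0x1ULL, 0, "drain=2 . timer=-1");

        /* CONFIG_RCU_CPU_STALL_INFO=n: just a brace-enclosed CPU list */
        printf(" {");
        for (i = 0; i < 2; i++)
                printf(" %d", cpus[i]);
        printf("} \n");
        return 0;
}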
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 654cfe67f0d1..ed459edeff43 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -72,9 +72,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
72 rdp->dynticks->dynticks_nesting, 72 rdp->dynticks->dynticks_nesting,
73 rdp->dynticks->dynticks_nmi_nesting, 73 rdp->dynticks->dynticks_nmi_nesting,
74 rdp->dynticks_fqs); 74 rdp->dynticks_fqs);
75 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 75 seq_printf(m, " of=%lu", rdp->offline_fqs);
76 seq_printf(m, " ql=%ld qs=%c%c%c%c", 76 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
77 rdp->qlen, 77 rdp->qlen_lazy, rdp->qlen,
78 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 78 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
79 rdp->nxttail[RCU_NEXT_TAIL]], 79 rdp->nxttail[RCU_NEXT_TAIL]],
80 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 80 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
@@ -144,8 +144,8 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
144 rdp->dynticks->dynticks_nesting, 144 rdp->dynticks->dynticks_nesting,
145 rdp->dynticks->dynticks_nmi_nesting, 145 rdp->dynticks->dynticks_nmi_nesting,
146 rdp->dynticks_fqs); 146 rdp->dynticks_fqs);
147 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 147 seq_printf(m, ",%lu", rdp->offline_fqs);
148 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, 148 seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen,
149 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 149 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
150 rdp->nxttail[RCU_NEXT_TAIL]], 150 rdp->nxttail[RCU_NEXT_TAIL]],
151 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 151 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
@@ -168,7 +168,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
168{ 168{
169 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); 169 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
171 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); 171 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\"");
172#ifdef CONFIG_RCU_BOOST 172#ifdef CONFIG_RCU_BOOST
173 seq_puts(m, "\"kt\",\"ktl\""); 173 seq_puts(m, "\"kt\",\"ktl\"");
174#endif /* #ifdef CONFIG_RCU_BOOST */ 174#endif /* #ifdef CONFIG_RCU_BOOST */
diff --git a/kernel/resource.c b/kernel/resource.c
index 7640b3a947d0..7e8ea66a8c01 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -749,6 +749,7 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
749 write_unlock(&resource_lock); 749 write_unlock(&resource_lock);
750 return result; 750 return result;
751} 751}
752EXPORT_SYMBOL(adjust_resource);
752 753
753static void __init __reserve_region_with_split(struct resource *root, 754static void __init __reserve_region_with_split(struct resource *root,
754 resource_size_t start, resource_size_t end, 755 resource_size_t start, resource_size_t end,
@@ -792,8 +793,6 @@ void __init reserve_region_with_split(struct resource *root,
792 write_unlock(&resource_lock); 793 write_unlock(&resource_lock);
793} 794}
794 795
795EXPORT_SYMBOL(adjust_resource);
796
797/** 796/**
798 * resource_alignment - calculate resource's alignment 797 * resource_alignment - calculate resource's alignment
799 * @res: resource pointer 798 * @res: resource pointer
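
The resource.c hunks above are purely positional: EXPORT_SYMBOL(adjust_resource) moves from its old spot near reserve_region_with_split() to directly below the function it exports, which is the usual kernel convention. A minimal kernel-style sketch of that layout (hypothetical function, not from this patch):

#include <linux/export.h>

int example_adjust(int delta)
{
        return delta + 1;
}
EXPORT_SYMBOL(example_adjust);  /* the export tag immediately follows the definition */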
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index b152f74f02de..6850f53e02d8 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -10,7 +10,6 @@
10#include <linux/export.h> 10#include <linux/export.h>
11#include <linux/rwsem.h> 11#include <linux/rwsem.h>
12 12
13#include <asm/system.h>
14#include <linux/atomic.h> 13#include <linux/atomic.h>
15 14
16/* 15/*
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index e8a1f83ee0e7..0984a21076a3 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -195,20 +195,20 @@ __setup("noautogroup", setup_autogroup);
195 195
196#ifdef CONFIG_PROC_FS 196#ifdef CONFIG_PROC_FS
197 197
198int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) 198int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
199{ 199{
200 static unsigned long next = INITIAL_JIFFIES; 200 static unsigned long next = INITIAL_JIFFIES;
201 struct autogroup *ag; 201 struct autogroup *ag;
202 int err; 202 int err;
203 203
204 if (*nice < -20 || *nice > 19) 204 if (nice < -20 || nice > 19)
205 return -EINVAL; 205 return -EINVAL;
206 206
207 err = security_task_setnice(current, *nice); 207 err = security_task_setnice(current, nice);
208 if (err) 208 if (err)
209 return err; 209 return err;
210 210
211 if (*nice < 0 && !can_nice(current, *nice)) 211 if (nice < 0 && !can_nice(current, nice))
212 return -EPERM; 212 return -EPERM;
213 213
214 /* this is a heavy operation taking global locks.. */ 214 /* this is a heavy operation taking global locks.. */
@@ -219,9 +219,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
219 ag = autogroup_task_get(p); 219 ag = autogroup_task_get(p);
220 220
221 down_write(&ag->lock); 221 down_write(&ag->lock);
222 err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); 222 err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);
223 if (!err) 223 if (!err)
224 ag->nice = *nice; 224 ag->nice = nice;
225 up_write(&ag->lock); 225 up_write(&ag->lock);
226 226
227 autogroup_kref_put(ag); 227 autogroup_kref_put(ag);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5255c9d2e053..afc6d7e71557 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -71,7 +71,9 @@
71#include <linux/ftrace.h> 71#include <linux/ftrace.h>
72#include <linux/slab.h> 72#include <linux/slab.h>
73#include <linux/init_task.h> 73#include <linux/init_task.h>
74#include <linux/binfmts.h>
74 75
76#include <asm/switch_to.h>
75#include <asm/tlb.h> 77#include <asm/tlb.h>
76#include <asm/irq_regs.h> 78#include <asm/irq_regs.h>
77#include <asm/mutex.h> 79#include <asm/mutex.h>
@@ -162,13 +164,13 @@ static int sched_feat_show(struct seq_file *m, void *v)
162 164
163#ifdef HAVE_JUMP_LABEL 165#ifdef HAVE_JUMP_LABEL
164 166
165#define jump_label_key__true jump_label_key_enabled 167#define jump_label_key__true STATIC_KEY_INIT_TRUE
166#define jump_label_key__false jump_label_key_disabled 168#define jump_label_key__false STATIC_KEY_INIT_FALSE
167 169
168#define SCHED_FEAT(name, enabled) \ 170#define SCHED_FEAT(name, enabled) \
169 jump_label_key__##enabled , 171 jump_label_key__##enabled ,
170 172
171struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { 173struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
172#include "features.h" 174#include "features.h"
173}; 175};
174 176
@@ -176,14 +178,14 @@ struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
176 178
177static void sched_feat_disable(int i) 179static void sched_feat_disable(int i)
178{ 180{
179 if (jump_label_enabled(&sched_feat_keys[i])) 181 if (static_key_enabled(&sched_feat_keys[i]))
180 jump_label_dec(&sched_feat_keys[i]); 182 static_key_slow_dec(&sched_feat_keys[i]);
181} 183}
182 184
183static void sched_feat_enable(int i) 185static void sched_feat_enable(int i)
184{ 186{
185 if (!jump_label_enabled(&sched_feat_keys[i])) 187 if (!static_key_enabled(&sched_feat_keys[i]))
186 jump_label_inc(&sched_feat_keys[i]); 188 static_key_slow_inc(&sched_feat_keys[i]);
187} 189}
188#else 190#else
189static void sched_feat_disable(int i) { }; 191static void sched_feat_disable(int i) { };
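
These kernel/sched/core.c hunks (here and in the hunks that follow) track the jump-label API rename: struct jump_label_key becomes struct static_key, jump_label_inc()/jump_label_dec() become static_key_slow_inc()/static_key_slow_dec(), and the branch test static_branch() becomes static_key_false() for keys that default to off. A minimal kernel-style sketch of the new spelling (hypothetical key name; assumes <linux/jump_label.h>; illustrative only):

#include <linux/jump_label.h>
#include <linux/types.h>

static struct static_key example_key = STATIC_KEY_INIT_FALSE;
static unsigned long example_hits;

static void example_fast_path(void)
{
        /* compiled as a patchable jump; the branch body is off by default */
        if (static_key_false(&example_key))
                example_hits++;
}

static void example_set(bool on)
{
        if (on && !static_key_enabled(&example_key))
                static_key_slow_inc(&example_key);
        else if (!on && static_key_enabled(&example_key))
                static_key_slow_dec(&example_key);
}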
@@ -894,7 +896,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
894 delta -= irq_delta; 896 delta -= irq_delta;
895#endif 897#endif
896#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 898#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
897 if (static_branch((&paravirt_steal_rq_enabled))) { 899 if (static_key_false((&paravirt_steal_rq_enabled))) {
898 u64 st; 900 u64 st;
899 901
900 steal = paravirt_steal_clock(cpu_of(rq)); 902 steal = paravirt_steal_clock(cpu_of(rq));
@@ -1263,29 +1265,59 @@ EXPORT_SYMBOL_GPL(kick_process);
1263 */ 1265 */
1264static int select_fallback_rq(int cpu, struct task_struct *p) 1266static int select_fallback_rq(int cpu, struct task_struct *p)
1265{ 1267{
1266 int dest_cpu;
1267 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 1268 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
1269 enum { cpuset, possible, fail } state = cpuset;
1270 int dest_cpu;
1268 1271
1269 /* Look for allowed, online CPU in same node. */ 1272 /* Look for allowed, online CPU in same node. */
1270 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 1273 for_each_cpu(dest_cpu, nodemask) {
1274 if (!cpu_online(dest_cpu))
1275 continue;
1276 if (!cpu_active(dest_cpu))
1277 continue;
1271 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1278 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1272 return dest_cpu; 1279 return dest_cpu;
1280 }
1273 1281
1274 /* Any allowed, online CPU? */ 1282 for (;;) {
1275 dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); 1283 /* Any allowed, online CPU? */
1276 if (dest_cpu < nr_cpu_ids) 1284 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1277 return dest_cpu; 1285 if (!cpu_online(dest_cpu))
1286 continue;
1287 if (!cpu_active(dest_cpu))
1288 continue;
1289 goto out;
1290 }
1278 1291
1279 /* No more Mr. Nice Guy. */ 1292 switch (state) {
1280 dest_cpu = cpuset_cpus_allowed_fallback(p); 1293 case cpuset:
1281 /* 1294 /* No more Mr. Nice Guy. */
1282 * Don't tell them about moving exiting tasks or 1295 cpuset_cpus_allowed_fallback(p);
1283 * kernel threads (both mm NULL), since they never 1296 state = possible;
1284 * leave kernel. 1297 break;
1285 */ 1298
1286 if (p->mm && printk_ratelimit()) { 1299 case possible:
1287 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", 1300 do_set_cpus_allowed(p, cpu_possible_mask);
1288 task_pid_nr(p), p->comm, cpu); 1301 state = fail;
1302 break;
1303
1304 case fail:
1305 BUG();
1306 break;
1307 }
1308 }
1309
1310out:
1311 if (state != cpuset) {
1312 /*
1313 * Don't tell them about moving exiting tasks or
1314 * kernel threads (both mm NULL), since they never
1315 * leave kernel.
1316 */
1317 if (p->mm && printk_ratelimit()) {
1318 printk_sched("process %d (%s) no longer affine to cpu%d\n",
1319 task_pid_nr(p), p->comm, cpu);
1320 }
1289 } 1321 }
1290 1322
1291 return dest_cpu; 1323 return dest_cpu;
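
The rewritten select_fallback_rq() above escalates through three states: first look for an allowed and online CPU under the current (cpuset-constrained) mask, then fall back to the cpuset's wider mask, then to every possible CPU, and finally BUG() if even that fails. A standalone model of that escalation (plain C, illustrative only; the real code distinguishes the cpuset fallback from cpu_possible_mask, while this sketch simply widens to all CPUs at each step):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define NCPU 4

enum fb_state { FB_CPUSET, FB_POSSIBLE, FB_FAIL };

static int pick_fallback(bool *allowed, const bool *online)
{
        enum fb_state state = FB_CPUSET;
        int cpu;

        for (;;) {
                /* any allowed, online CPU? */
                for (cpu = 0; cpu < NCPU; cpu++)
                        if (allowed[cpu] && online[cpu])
                                return cpu;

                switch (state) {
                case FB_CPUSET:         /* widen to the cpuset fallback ... */
                case FB_POSSIBLE:       /* ... then to every possible CPU   */
                        for (cpu = 0; cpu < NCPU; cpu++)
                                allowed[cpu] = true;
                        state++;
                        break;
                case FB_FAIL:
                        abort();        /* mirrors the BUG() in the kernel  */
                }
        }
}

int main(void)
{
        bool allowed[NCPU] = { false, true, false, false }; /* only CPU1 allowed   */
        bool online[NCPU]  = { true, false, true,  false }; /* but CPU1 is offline */

        printf("fallback CPU: %d\n", pick_fallback(allowed, online));
        return 0;
}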
@@ -1507,7 +1539,7 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
1507} 1539}
1508#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1540#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1509 1541
1510static inline int ttwu_share_cache(int this_cpu, int that_cpu) 1542bool cpus_share_cache(int this_cpu, int that_cpu)
1511{ 1543{
1512 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1544 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1513} 1545}
@@ -1518,7 +1550,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
1518 struct rq *rq = cpu_rq(cpu); 1550 struct rq *rq = cpu_rq(cpu);
1519 1551
1520#if defined(CONFIG_SMP) 1552#if defined(CONFIG_SMP)
1521 if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { 1553 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1522 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1554 sched_clock_cpu(cpu); /* sync clocks x-cpu */
1523 ttwu_queue_remote(p, cpu); 1555 ttwu_queue_remote(p, cpu);
1524 return; 1556 return;
@@ -1932,7 +1964,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1932 local_irq_enable(); 1964 local_irq_enable();
1933#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1965#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1934 finish_lock_switch(rq, prev); 1966 finish_lock_switch(rq, prev);
1935 trace_sched_stat_sleeptime(current, rq->clock); 1967 finish_arch_post_lock_switch();
1936 1968
1937 fire_sched_in_preempt_notifiers(current); 1969 fire_sched_in_preempt_notifiers(current);
1938 if (mm) 1970 if (mm)
@@ -2267,13 +2299,10 @@ calc_load_n(unsigned long load, unsigned long exp,
2267 * Once we've updated the global active value, we need to apply the exponential 2299 * Once we've updated the global active value, we need to apply the exponential
2268 * weights adjusted to the number of cycles missed. 2300 * weights adjusted to the number of cycles missed.
2269 */ 2301 */
2270static void calc_global_nohz(unsigned long ticks) 2302static void calc_global_nohz(void)
2271{ 2303{
2272 long delta, active, n; 2304 long delta, active, n;
2273 2305
2274 if (time_before(jiffies, calc_load_update))
2275 return;
2276
2277 /* 2306 /*
2278 * If we crossed a calc_load_update boundary, make sure to fold 2307 * If we crossed a calc_load_update boundary, make sure to fold
2279 * any pending idle changes, the respective CPUs might have 2308 * any pending idle changes, the respective CPUs might have
@@ -2285,31 +2314,25 @@ static void calc_global_nohz(unsigned long ticks)
2285 atomic_long_add(delta, &calc_load_tasks); 2314 atomic_long_add(delta, &calc_load_tasks);
2286 2315
2287 /* 2316 /*
2288 * If we were idle for multiple load cycles, apply them. 2317 * It could be the one fold was all it took, we're done!
2289 */ 2318 */
2290 if (ticks >= LOAD_FREQ) { 2319 if (time_before(jiffies, calc_load_update + 10))
2291 n = ticks / LOAD_FREQ; 2320 return;
2292 2321
2293 active = atomic_long_read(&calc_load_tasks); 2322 /*
2294 active = active > 0 ? active * FIXED_1 : 0; 2323 * Catch-up, fold however many we are behind still
2324 */
2325 delta = jiffies - calc_load_update - 10;
2326 n = 1 + (delta / LOAD_FREQ);
2295 2327
2296 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 2328 active = atomic_long_read(&calc_load_tasks);
2297 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 2329 active = active > 0 ? active * FIXED_1 : 0;
2298 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2299 2330
2300 calc_load_update += n * LOAD_FREQ; 2331 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2301 } 2332 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2333 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2302 2334
2303 /* 2335 calc_load_update += n * LOAD_FREQ;
2304 * Its possible the remainder of the above division also crosses
2305 * a LOAD_FREQ period, the regular check in calc_global_load()
2306 * which comes after this will take care of that.
2307 *
2308 * Consider us being 11 ticks before a cycle completion, and us
2309 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
2310 * age us 4 cycles, and the test in calc_global_load() will
2311 * pick up the final one.
2312 */
2313} 2336}
2314#else 2337#else
2315void calc_load_account_idle(struct rq *this_rq) 2338void calc_load_account_idle(struct rq *this_rq)
@@ -2321,7 +2344,7 @@ static inline long calc_load_fold_idle(void)
2321 return 0; 2344 return 0;
2322} 2345}
2323 2346
2324static void calc_global_nohz(unsigned long ticks) 2347static void calc_global_nohz(void)
2325{ 2348{
2326} 2349}
2327#endif 2350#endif
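
The reworked calc_global_nohz() above no longer receives a tick count; it derives how many LOAD_FREQ periods were missed from jiffies (n = 1 + delta/LOAD_FREQ) and ages the three load averages by that many periods in one go via calc_load_n(). A standalone sketch of the underlying fixed-point arithmetic (FIXED_1 = 2048 and EXP_1 = 1884 are the values the kernel uses for the 1-minute average, to the best of my reading; aging n periods is equivalent to applying the single-period decay n times):

#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)  /* 2048: represents 1.0 in fixed point    */
#define EXP_1   1884             /* ~exp(-5s/1min) in the same fixed point */

/* One LOAD_FREQ period of decay toward 'active' (mirrors calc_load()). */
static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long avenrun = 4 * FIXED_1;  /* 1-min load average of 4.00 */
        unsigned long n = 12;                 /* periods missed while idle  */
        unsigned long i;

        /* calc_load_n(load, exp, 0, n) boils down to n single steps */
        for (i = 0; i < n; i++)
                avenrun = calc_load(avenrun, EXP_1, 0);

        printf("1-min load after %lu idle periods: %lu.%02lu\n", n,
               avenrun >> FSHIFT,
               ((avenrun & (FIXED_1 - 1)) * 100) >> FSHIFT);
        return 0;
}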
@@ -2349,8 +2372,6 @@ void calc_global_load(unsigned long ticks)
2349{ 2372{
2350 long active; 2373 long active;
2351 2374
2352 calc_global_nohz(ticks);
2353
2354 if (time_before(jiffies, calc_load_update + 10)) 2375 if (time_before(jiffies, calc_load_update + 10))
2355 return; 2376 return;
2356 2377
@@ -2362,6 +2383,16 @@ void calc_global_load(unsigned long ticks)
2362 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 2383 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2363 2384
2364 calc_load_update += LOAD_FREQ; 2385 calc_load_update += LOAD_FREQ;
2386
2387 /*
2388 * Account one period with whatever state we found before
2389 * folding in the nohz state and ageing the entire idle period.
2390 *
 2391 * This avoids losing a sample when we go idle between
2392 * calc_load_account_active() (10 ticks ago) and now and thus
2393 * under-accounting.
2394 */
2395 calc_global_nohz();
2365} 2396}
2366 2397
2367/* 2398/*
@@ -2756,7 +2787,7 @@ void account_idle_time(cputime_t cputime)
2756static __always_inline bool steal_account_process_tick(void) 2787static __always_inline bool steal_account_process_tick(void)
2757{ 2788{
2758#ifdef CONFIG_PARAVIRT 2789#ifdef CONFIG_PARAVIRT
2759 if (static_branch(&paravirt_steal_enabled)) { 2790 if (static_key_false(&paravirt_steal_enabled)) {
2760 u64 steal, st = 0; 2791 u64 steal, st = 0;
2761 2792
2762 steal = paravirt_steal_clock(smp_processor_id()); 2793 steal = paravirt_steal_clock(smp_processor_id());
@@ -3071,8 +3102,6 @@ EXPORT_SYMBOL(sub_preempt_count);
3071 */ 3102 */
3072static noinline void __schedule_bug(struct task_struct *prev) 3103static noinline void __schedule_bug(struct task_struct *prev)
3073{ 3104{
3074 struct pt_regs *regs = get_irq_regs();
3075
3076 if (oops_in_progress) 3105 if (oops_in_progress)
3077 return; 3106 return;
3078 3107
@@ -3083,11 +3112,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
3083 print_modules(); 3112 print_modules();
3084 if (irqs_disabled()) 3113 if (irqs_disabled())
3085 print_irqtrace_events(prev); 3114 print_irqtrace_events(prev);
3086 3115 dump_stack();
3087 if (regs)
3088 show_regs(regs);
3089 else
3090 dump_stack();
3091} 3116}
3092 3117
3093/* 3118/*
@@ -3221,14 +3246,14 @@ need_resched:
3221 3246
3222 post_schedule(rq); 3247 post_schedule(rq);
3223 3248
3224 preempt_enable_no_resched(); 3249 sched_preempt_enable_no_resched();
3225 if (need_resched()) 3250 if (need_resched())
3226 goto need_resched; 3251 goto need_resched;
3227} 3252}
3228 3253
3229static inline void sched_submit_work(struct task_struct *tsk) 3254static inline void sched_submit_work(struct task_struct *tsk)
3230{ 3255{
3231 if (!tsk->state) 3256 if (!tsk->state || tsk_is_pi_blocked(tsk))
3232 return; 3257 return;
3233 /* 3258 /*
3234 * If we are going to sleep and we have plugged IO queued, 3259 * If we are going to sleep and we have plugged IO queued,
@@ -3247,6 +3272,18 @@ asmlinkage void __sched schedule(void)
3247} 3272}
3248EXPORT_SYMBOL(schedule); 3273EXPORT_SYMBOL(schedule);
3249 3274
3275/**
3276 * schedule_preempt_disabled - called with preemption disabled
3277 *
3278 * Returns with preemption disabled. Note: preempt_count must be 1
3279 */
3280void __sched schedule_preempt_disabled(void)
3281{
3282 sched_preempt_enable_no_resched();
3283 schedule();
3284 preempt_disable();
3285}
3286
3250#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 3287#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
3251 3288
3252static inline bool owner_running(struct mutex *lock, struct task_struct *owner) 3289static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
@@ -3407,9 +3444,9 @@ EXPORT_SYMBOL(__wake_up);
3407/* 3444/*
3408 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 3445 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3409 */ 3446 */
3410void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 3447void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
3411{ 3448{
3412 __wake_up_common(q, mode, 1, 0, NULL); 3449 __wake_up_common(q, mode, nr, 0, NULL);
3413} 3450}
3414EXPORT_SYMBOL_GPL(__wake_up_locked); 3451EXPORT_SYMBOL_GPL(__wake_up_locked);
3415 3452
@@ -3768,6 +3805,24 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3768 3805
3769 rq = __task_rq_lock(p); 3806 rq = __task_rq_lock(p);
3770 3807
3808 /*
 3809 * Idle task boosting is a no-no in general. There is one
3810 * exception, when PREEMPT_RT and NOHZ is active:
3811 *
3812 * The idle task calls get_next_timer_interrupt() and holds
3813 * the timer wheel base->lock on the CPU and another CPU wants
3814 * to access the timer (probably to cancel it). We can safely
3815 * ignore the boosting request, as the idle CPU runs this code
3816 * with interrupts disabled and will complete the lock
3817 * protected section without being interrupted. So there is no
3818 * real need to boost.
3819 */
3820 if (unlikely(p == rq->idle)) {
3821 WARN_ON(p != rq->curr);
3822 WARN_ON(p->pi_blocked_on);
3823 goto out_unlock;
3824 }
3825
3771 trace_sched_pi_setprio(p, prio); 3826 trace_sched_pi_setprio(p, prio);
3772 oldprio = p->prio; 3827 oldprio = p->prio;
3773 prev_class = p->sched_class; 3828 prev_class = p->sched_class;
@@ -3791,11 +3846,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3791 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 3846 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
3792 3847
3793 check_class_changed(rq, p, prev_class, oldprio); 3848 check_class_changed(rq, p, prev_class, oldprio);
3849out_unlock:
3794 __task_rq_unlock(rq); 3850 __task_rq_unlock(rq);
3795} 3851}
3796
3797#endif 3852#endif
3798
3799void set_user_nice(struct task_struct *p, long nice) 3853void set_user_nice(struct task_struct *p, long nice)
3800{ 3854{
3801 int old_prio, delta, on_rq; 3855 int old_prio, delta, on_rq;
@@ -4475,7 +4529,7 @@ SYSCALL_DEFINE0(sched_yield)
4475 __release(rq->lock); 4529 __release(rq->lock);
4476 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4530 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4477 do_raw_spin_unlock(&rq->lock); 4531 do_raw_spin_unlock(&rq->lock);
4478 preempt_enable_no_resched(); 4532 sched_preempt_enable_no_resched();
4479 4533
4480 schedule(); 4534 schedule();
4481 4535
@@ -4549,8 +4603,24 @@ EXPORT_SYMBOL(__cond_resched_softirq);
4549/** 4603/**
4550 * yield - yield the current processor to other threads. 4604 * yield - yield the current processor to other threads.
4551 * 4605 *
4552 * This is a shortcut for kernel-space yielding - it marks the 4606 * Do not ever use this function, there's a 99% chance you're doing it wrong.
4553 * thread runnable and calls sys_sched_yield(). 4607 *
4608 * The scheduler is at all times free to pick the calling task as the most
4609 * eligible task to run, if removing the yield() call from your code breaks
 4610 * it, it's already broken.
4611 *
4612 * Typical broken usage is:
4613 *
4614 * while (!event)
4615 * yield();
4616 *
4617 * where one assumes that yield() will let 'the other' process run that will
4618 * make event true. If the current task is a SCHED_FIFO task that will never
4619 * happen. Never use yield() as a progress guarantee!!
4620 *
4621 * If you want to use yield() to wait for something, use wait_event().
4622 * If you want to use yield() to be 'nice' for others, use cond_resched().
4623 * If you still want to use yield(), do not!
4554 */ 4624 */
4555void __sched yield(void) 4625void __sched yield(void)
4556{ 4626{
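
The yield() documentation added in the hunk above warns against busy-waiting on a condition with yield() and points at wait_event() and cond_resched() instead. A minimal kernel-style sketch of the waitqueue pairing the comment recommends (hypothetical names; assumes <linux/wait.h>; illustrative only, not part of this patch):

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static int example_event;

static void example_waiter(void)
{
        /* sleeps until the condition holds instead of spinning on yield() */
        wait_event(example_wq, example_event != 0);
}

static void example_producer(void)
{
        example_event = 1;
        wake_up(&example_wq);   /* wakes the sleeper, which re-checks the condition */
}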
@@ -5382,7 +5452,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5382 unsigned long action, void *hcpu) 5452 unsigned long action, void *hcpu)
5383{ 5453{
5384 switch (action & ~CPU_TASKS_FROZEN) { 5454 switch (action & ~CPU_TASKS_FROZEN) {
5385 case CPU_ONLINE: 5455 case CPU_STARTING:
5386 case CPU_DOWN_FAILED: 5456 case CPU_DOWN_FAILED:
5387 set_cpu_active((long)hcpu, true); 5457 set_cpu_active((long)hcpu, true);
5388 return NOTIFY_OK; 5458 return NOTIFY_OK;
@@ -5754,7 +5824,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5754 * 5824 *
5755 * Also keep a unique ID per domain (we use the first cpu number in 5825 * Also keep a unique ID per domain (we use the first cpu number in
5756 * the cpumask of the domain), this allows us to quickly tell if 5826 * the cpumask of the domain), this allows us to quickly tell if
5757 * two cpus are in the same cache domain, see ttwu_share_cache(). 5827 * two cpus are in the same cache domain, see cpus_share_cache().
5758 */ 5828 */
5759DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5829DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5760DEFINE_PER_CPU(int, sd_llc_id); 5830DEFINE_PER_CPU(int, sd_llc_id);
@@ -6931,6 +7001,9 @@ void __init sched_init(void)
6931 rq->online = 0; 7001 rq->online = 0;
6932 rq->idle_stamp = 0; 7002 rq->idle_stamp = 0;
6933 rq->avg_idle = 2*sysctl_sched_migration_cost; 7003 rq->avg_idle = 2*sysctl_sched_migration_cost;
7004
7005 INIT_LIST_HEAD(&rq->cfs_tasks);
7006
6934 rq_attach_root(rq, &def_root_domain); 7007 rq_attach_root(rq, &def_root_domain);
6935#ifdef CONFIG_NO_HZ 7008#ifdef CONFIG_NO_HZ
6936 rq->nohz_flags = 0; 7009 rq->nohz_flags = 0;
@@ -7525,8 +7598,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7525 struct task_group, css); 7598 struct task_group, css);
7526} 7599}
7527 7600
7528static struct cgroup_subsys_state * 7601static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
7529cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7530{ 7602{
7531 struct task_group *tg, *parent; 7603 struct task_group *tg, *parent;
7532 7604
@@ -7543,15 +7615,14 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7543 return &tg->css; 7615 return &tg->css;
7544} 7616}
7545 7617
7546static void 7618static void cpu_cgroup_destroy(struct cgroup *cgrp)
7547cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
7548{ 7619{
7549 struct task_group *tg = cgroup_tg(cgrp); 7620 struct task_group *tg = cgroup_tg(cgrp);
7550 7621
7551 sched_destroy_group(tg); 7622 sched_destroy_group(tg);
7552} 7623}
7553 7624
7554static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7625static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7555 struct cgroup_taskset *tset) 7626 struct cgroup_taskset *tset)
7556{ 7627{
7557 struct task_struct *task; 7628 struct task_struct *task;
@@ -7569,7 +7640,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7569 return 0; 7640 return 0;
7570} 7641}
7571 7642
7572static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7643static void cpu_cgroup_attach(struct cgroup *cgrp,
7573 struct cgroup_taskset *tset) 7644 struct cgroup_taskset *tset)
7574{ 7645{
7575 struct task_struct *task; 7646 struct task_struct *task;
@@ -7579,8 +7650,8 @@ static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7579} 7650}
7580 7651
7581static void 7652static void
7582cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 7653cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7583 struct cgroup *old_cgrp, struct task_struct *task) 7654 struct task_struct *task)
7584{ 7655{
7585 /* 7656 /*
7586 * cgroup_exit() is called in the copy_process() failure path. 7657 * cgroup_exit() is called in the copy_process() failure path.
@@ -7899,13 +7970,9 @@ static struct cftype cpu_files[] = {
7899 .write_u64 = cpu_rt_period_write_uint, 7970 .write_u64 = cpu_rt_period_write_uint,
7900 }, 7971 },
7901#endif 7972#endif
7973 { } /* terminate */
7902}; 7974};
7903 7975
7904static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
7905{
7906 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
7907}
7908
7909struct cgroup_subsys cpu_cgroup_subsys = { 7976struct cgroup_subsys cpu_cgroup_subsys = {
7910 .name = "cpu", 7977 .name = "cpu",
7911 .create = cpu_cgroup_create, 7978 .create = cpu_cgroup_create,
@@ -7913,8 +7980,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7913 .can_attach = cpu_cgroup_can_attach, 7980 .can_attach = cpu_cgroup_can_attach,
7914 .attach = cpu_cgroup_attach, 7981 .attach = cpu_cgroup_attach,
7915 .exit = cpu_cgroup_exit, 7982 .exit = cpu_cgroup_exit,
7916 .populate = cpu_cgroup_populate,
7917 .subsys_id = cpu_cgroup_subsys_id, 7983 .subsys_id = cpu_cgroup_subsys_id,
7984 .base_cftypes = cpu_files,
7918 .early_init = 1, 7985 .early_init = 1,
7919}; 7986};
7920 7987
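
The cpu hunks above and the cpuacct hunks below drop the ->populate() callbacks: the cftype array is now terminated by an empty sentinel entry and handed to the cgroup core through ->base_cftypes, which registers the control files itself. A minimal kernel-style sketch of the new pattern (hypothetical controller; assumes the cgroup interface as it appears in this merge; other mandatory fields such as .create/.destroy/.subsys_id are omitted):

#include <linux/cgroup.h>

static u64 example_value_read(struct cgroup *cgrp, struct cftype *cft)
{
        return 0;
}

static struct cftype example_files[] = {
        {
                .name = "value",
                .read_u64 = example_value_read,
        },
        { }     /* terminating entry replaces ARRAY_SIZE()-based ->populate() */
};

struct cgroup_subsys example_subsys = {
        .name = "example",
        .base_cftypes = example_files,  /* files registered by the cgroup core */
};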
@@ -7930,8 +7997,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7930 */ 7997 */
7931 7998
7932/* create a new cpu accounting group */ 7999/* create a new cpu accounting group */
7933static struct cgroup_subsys_state *cpuacct_create( 8000static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
7934 struct cgroup_subsys *ss, struct cgroup *cgrp)
7935{ 8001{
7936 struct cpuacct *ca; 8002 struct cpuacct *ca;
7937 8003
@@ -7961,8 +8027,7 @@ out:
7961} 8027}
7962 8028
7963/* destroy an existing cpu accounting group */ 8029/* destroy an existing cpu accounting group */
7964static void 8030static void cpuacct_destroy(struct cgroup *cgrp)
7965cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
7966{ 8031{
7967 struct cpuacct *ca = cgroup_ca(cgrp); 8032 struct cpuacct *ca = cgroup_ca(cgrp);
7968 8033
@@ -8101,13 +8166,9 @@ static struct cftype files[] = {
8101 .name = "stat", 8166 .name = "stat",
8102 .read_map = cpuacct_stats_show, 8167 .read_map = cpuacct_stats_show,
8103 }, 8168 },
8169 { } /* terminate */
8104}; 8170};
8105 8171
8106static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
8107{
8108 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
8109}
8110
8111/* 8172/*
8112 * charge this task's execution time to its accounting group. 8173 * charge this task's execution time to its accounting group.
8113 * 8174 *
@@ -8139,7 +8200,7 @@ struct cgroup_subsys cpuacct_subsys = {
8139 .name = "cpuacct", 8200 .name = "cpuacct",
8140 .create = cpuacct_create, 8201 .create = cpuacct_create,
8141 .destroy = cpuacct_destroy, 8202 .destroy = cpuacct_destroy,
8142 .populate = cpuacct_populate,
8143 .subsys_id = cpuacct_subsys_id, 8203 .subsys_id = cpuacct_subsys_id,
8204 .base_cftypes = files,
8144}; 8205};
8145#endif /* CONFIG_CGROUP_CPUACCT */ 8206#endif /* CONFIG_CGROUP_CPUACCT */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2a075e10004b..09acaa15161d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -288,7 +288,6 @@ static void print_cpu(struct seq_file *m, int cpu)
288 288
289 P(yld_count); 289 P(yld_count);
290 290
291 P(sched_switch);
292 P(sched_count); 291 P(sched_count);
293 P(sched_goidle); 292 P(sched_goidle);
294#ifdef CONFIG_SMP 293#ifdef CONFIG_SMP
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c6414fc669d..0d97ebdc58f0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -416,8 +416,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
416 416
417#endif /* CONFIG_FAIR_GROUP_SCHED */ 417#endif /* CONFIG_FAIR_GROUP_SCHED */
418 418
419static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 419static __always_inline
420 unsigned long delta_exec); 420void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
421 421
422/************************************************************** 422/**************************************************************
423 * Scheduling class tree data structure manipulation methods: 423 * Scheduling class tree data structure manipulation methods:
@@ -776,29 +776,16 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
776 * Scheduling class queueing methods: 776 * Scheduling class queueing methods:
777 */ 777 */
778 778
779#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
780static void
781add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
782{
783 cfs_rq->task_weight += weight;
784}
785#else
786static inline void
787add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
788{
789}
790#endif
791
792static void 779static void
793account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 780account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
794{ 781{
795 update_load_add(&cfs_rq->load, se->load.weight); 782 update_load_add(&cfs_rq->load, se->load.weight);
796 if (!parent_entity(se)) 783 if (!parent_entity(se))
797 update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 784 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
798 if (entity_is_task(se)) { 785#ifdef CONFIG_SMP
799 add_cfs_task_weight(cfs_rq, se->load.weight); 786 if (entity_is_task(se))
800 list_add(&se->group_node, &cfs_rq->tasks); 787 list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
801 } 788#endif
802 cfs_rq->nr_running++; 789 cfs_rq->nr_running++;
803} 790}
804 791
@@ -808,10 +795,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
808 update_load_sub(&cfs_rq->load, se->load.weight); 795 update_load_sub(&cfs_rq->load, se->load.weight);
809 if (!parent_entity(se)) 796 if (!parent_entity(se))
810 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); 797 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
811 if (entity_is_task(se)) { 798 if (entity_is_task(se))
812 add_cfs_task_weight(cfs_rq, -se->load.weight);
813 list_del_init(&se->group_node); 799 list_del_init(&se->group_node);
814 }
815 cfs_rq->nr_running--; 800 cfs_rq->nr_running--;
816} 801}
817 802
@@ -1003,6 +988,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1003 if (unlikely(delta > se->statistics.sleep_max)) 988 if (unlikely(delta > se->statistics.sleep_max))
1004 se->statistics.sleep_max = delta; 989 se->statistics.sleep_max = delta;
1005 990
991 se->statistics.sleep_start = 0;
1006 se->statistics.sum_sleep_runtime += delta; 992 se->statistics.sum_sleep_runtime += delta;
1007 993
1008 if (tsk) { 994 if (tsk) {
@@ -1019,6 +1005,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1019 if (unlikely(delta > se->statistics.block_max)) 1005 if (unlikely(delta > se->statistics.block_max))
1020 se->statistics.block_max = delta; 1006 se->statistics.block_max = delta;
1021 1007
1008 se->statistics.block_start = 0;
1022 se->statistics.sum_sleep_runtime += delta; 1009 se->statistics.sum_sleep_runtime += delta;
1023 1010
1024 if (tsk) { 1011 if (tsk) {
@@ -1175,7 +1162,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1175 __clear_buddies_skip(se); 1162 __clear_buddies_skip(se);
1176} 1163}
1177 1164
1178static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); 1165static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1179 1166
1180static void 1167static void
1181dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 1168dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -1399,20 +1386,20 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1399#ifdef CONFIG_CFS_BANDWIDTH 1386#ifdef CONFIG_CFS_BANDWIDTH
1400 1387
1401#ifdef HAVE_JUMP_LABEL 1388#ifdef HAVE_JUMP_LABEL
1402static struct jump_label_key __cfs_bandwidth_used; 1389static struct static_key __cfs_bandwidth_used;
1403 1390
1404static inline bool cfs_bandwidth_used(void) 1391static inline bool cfs_bandwidth_used(void)
1405{ 1392{
1406 return static_branch(&__cfs_bandwidth_used); 1393 return static_key_false(&__cfs_bandwidth_used);
1407} 1394}
1408 1395
1409void account_cfs_bandwidth_used(int enabled, int was_enabled) 1396void account_cfs_bandwidth_used(int enabled, int was_enabled)
1410{ 1397{
1411 /* only need to count groups transitioning between enabled/!enabled */ 1398 /* only need to count groups transitioning between enabled/!enabled */
1412 if (enabled && !was_enabled) 1399 if (enabled && !was_enabled)
1413 jump_label_inc(&__cfs_bandwidth_used); 1400 static_key_slow_inc(&__cfs_bandwidth_used);
1414 else if (!enabled && was_enabled) 1401 else if (!enabled && was_enabled)
1415 jump_label_dec(&__cfs_bandwidth_used); 1402 static_key_slow_dec(&__cfs_bandwidth_used);
1416} 1403}
1417#else /* HAVE_JUMP_LABEL */ 1404#else /* HAVE_JUMP_LABEL */
1418static bool cfs_bandwidth_used(void) 1405static bool cfs_bandwidth_used(void)
@@ -1559,8 +1546,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1559 resched_task(rq_of(cfs_rq)->curr); 1546 resched_task(rq_of(cfs_rq)->curr);
1560} 1547}
1561 1548
1562static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 1549static __always_inline
1563 unsigned long delta_exec) 1550void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
1564{ 1551{
1565 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) 1552 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
1566 return; 1553 return;
@@ -2086,11 +2073,11 @@ void unthrottle_offline_cfs_rqs(struct rq *rq)
2086} 2073}
2087 2074
2088#else /* CONFIG_CFS_BANDWIDTH */ 2075#else /* CONFIG_CFS_BANDWIDTH */
2089static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 2076static __always_inline
2090 unsigned long delta_exec) {} 2077void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
2091static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2078static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2092static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 2079static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2093static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2080static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2094 2081
2095static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 2082static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2096{ 2083{
@@ -2670,8 +2657,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
2670 /* 2657 /*
2671 * Otherwise, iterate the domains and find an eligible idle cpu. 2658 * Otherwise, iterate the domains and find an eligible idle cpu.
2672 */ 2659 */
2673 rcu_read_lock();
2674
2675 sd = rcu_dereference(per_cpu(sd_llc, target)); 2660 sd = rcu_dereference(per_cpu(sd_llc, target));
2676 for_each_lower_domain(sd) { 2661 for_each_lower_domain(sd) {
2677 sg = sd->groups; 2662 sg = sd->groups;
@@ -2693,8 +2678,6 @@ next:
2693 } while (sg != sd->groups); 2678 } while (sg != sd->groups);
2694 } 2679 }
2695done: 2680done:
2696 rcu_read_unlock();
2697
2698 return target; 2681 return target;
2699} 2682}
2700 2683
@@ -2920,7 +2903,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
2920 return; 2903 return;
2921 2904
2922 /* 2905 /*
2923 * This is possible from callers such as pull_task(), in which we 2906 * This is possible from callers such as move_task(), in which we
2924 * unconditionally check_preempt_curr() after an enqueue (which may have 2907 * unconditionally check_preempt_curr() after an enqueue (which may have
2925 * lead to a throttle). This both saves work and prevents false 2908 * lead to a throttle). This both saves work and prevents false
2926 * next-buddy nomination below. 2909 * next-buddy nomination below.
@@ -3084,17 +3067,39 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
3084 * Fair scheduling class load-balancing methods: 3067 * Fair scheduling class load-balancing methods:
3085 */ 3068 */
3086 3069
3070static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3071
3072#define LBF_ALL_PINNED 0x01
3073#define LBF_NEED_BREAK 0x02
3074
3075struct lb_env {
3076 struct sched_domain *sd;
3077
3078 int src_cpu;
3079 struct rq *src_rq;
3080
3081 int dst_cpu;
3082 struct rq *dst_rq;
3083
3084 enum cpu_idle_type idle;
3085 long load_move;
3086 unsigned int flags;
3087
3088 unsigned int loop;
3089 unsigned int loop_break;
3090 unsigned int loop_max;
3091};
3092
3087/* 3093/*
3088 * pull_task - move a task from a remote runqueue to the local runqueue. 3094 * move_task - move a task from one runqueue to another runqueue.
3089 * Both runqueues must be locked. 3095 * Both runqueues must be locked.
3090 */ 3096 */
3091static void pull_task(struct rq *src_rq, struct task_struct *p, 3097static void move_task(struct task_struct *p, struct lb_env *env)
3092 struct rq *this_rq, int this_cpu)
3093{ 3098{
3094 deactivate_task(src_rq, p, 0); 3099 deactivate_task(env->src_rq, p, 0);
3095 set_task_cpu(p, this_cpu); 3100 set_task_cpu(p, env->dst_cpu);
3096 activate_task(this_rq, p, 0); 3101 activate_task(env->dst_rq, p, 0);
3097 check_preempt_curr(this_rq, p, 0); 3102 check_preempt_curr(env->dst_rq, p, 0);
3098} 3103}
3099 3104
3100/* 3105/*
@@ -3129,19 +3134,11 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3129 return delta < (s64)sysctl_sched_migration_cost; 3134 return delta < (s64)sysctl_sched_migration_cost;
3130} 3135}
3131 3136
3132#define LBF_ALL_PINNED 0x01
3133#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */
3134#define LBF_HAD_BREAK 0x04
3135#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */
3136#define LBF_ABORT 0x10
3137
3138/* 3137/*
3139 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 3138 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3140 */ 3139 */
3141static 3140static
3142int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 3141int can_migrate_task(struct task_struct *p, struct lb_env *env)
3143 struct sched_domain *sd, enum cpu_idle_type idle,
3144 int *lb_flags)
3145{ 3142{
3146 int tsk_cache_hot = 0; 3143 int tsk_cache_hot = 0;
3147 /* 3144 /*
@@ -3150,13 +3147,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3150 * 2) cannot be migrated to this CPU due to cpus_allowed, or 3147 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3151 * 3) are cache-hot on their current CPU. 3148 * 3) are cache-hot on their current CPU.
3152 */ 3149 */
3153 if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { 3150 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3154 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3151 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3155 return 0; 3152 return 0;
3156 } 3153 }
3157 *lb_flags &= ~LBF_ALL_PINNED; 3154 env->flags &= ~LBF_ALL_PINNED;
3158 3155
3159 if (task_running(rq, p)) { 3156 if (task_running(env->src_rq, p)) {
3160 schedstat_inc(p, se.statistics.nr_failed_migrations_running); 3157 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
3161 return 0; 3158 return 0;
3162 } 3159 }
@@ -3167,12 +3164,12 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3167 * 2) too many balance attempts have failed. 3164 * 2) too many balance attempts have failed.
3168 */ 3165 */
3169 3166
3170 tsk_cache_hot = task_hot(p, rq->clock_task, sd); 3167 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3171 if (!tsk_cache_hot || 3168 if (!tsk_cache_hot ||
3172 sd->nr_balance_failed > sd->cache_nice_tries) { 3169 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3173#ifdef CONFIG_SCHEDSTATS 3170#ifdef CONFIG_SCHEDSTATS
3174 if (tsk_cache_hot) { 3171 if (tsk_cache_hot) {
3175 schedstat_inc(sd, lb_hot_gained[idle]); 3172 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3176 schedstat_inc(p, se.statistics.nr_forced_migrations); 3173 schedstat_inc(p, se.statistics.nr_forced_migrations);
3177 } 3174 }
3178#endif 3175#endif
@@ -3193,65 +3190,80 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3193 * 3190 *
3194 * Called with both runqueues locked. 3191 * Called with both runqueues locked.
3195 */ 3192 */
3196static int 3193static int move_one_task(struct lb_env *env)
3197move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3198 struct sched_domain *sd, enum cpu_idle_type idle)
3199{ 3194{
3200 struct task_struct *p, *n; 3195 struct task_struct *p, *n;
3201 struct cfs_rq *cfs_rq;
3202 int pinned = 0;
3203 3196
3204 for_each_leaf_cfs_rq(busiest, cfs_rq) { 3197 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3205 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { 3198 if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3206 if (throttled_lb_pair(task_group(p), 3199 continue;
3207 busiest->cpu, this_cpu))
3208 break;
3209 3200
3210 if (!can_migrate_task(p, busiest, this_cpu, 3201 if (!can_migrate_task(p, env))
3211 sd, idle, &pinned)) 3202 continue;
3212 continue;
3213 3203
3214 pull_task(busiest, p, this_rq, this_cpu); 3204 move_task(p, env);
3215 /* 3205 /*
3216 * Right now, this is only the second place pull_task() 3206 * Right now, this is only the second place move_task()
3217 * is called, so we can safely collect pull_task() 3207 * is called, so we can safely collect move_task()
3218 * stats here rather than inside pull_task(). 3208 * stats here rather than inside move_task().
3219 */ 3209 */
3220 schedstat_inc(sd, lb_gained[idle]); 3210 schedstat_inc(env->sd, lb_gained[env->idle]);
3221 return 1; 3211 return 1;
3222 }
3223 } 3212 }
3224
3225 return 0; 3213 return 0;
3226} 3214}
3227 3215
3228static unsigned long 3216static unsigned long task_h_load(struct task_struct *p);
3229balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3217
3230 unsigned long max_load_move, struct sched_domain *sd, 3218/*
3231 enum cpu_idle_type idle, int *lb_flags, 3219 * move_tasks tries to move up to load_move weighted load from busiest to
3232 struct cfs_rq *busiest_cfs_rq) 3220 * this_rq, as part of a balancing operation within domain "sd".
3221 * Returns 1 if successful and 0 otherwise.
3222 *
3223 * Called with both runqueues locked.
3224 */
3225static int move_tasks(struct lb_env *env)
3233{ 3226{
3234 int loops = 0, pulled = 0; 3227 struct list_head *tasks = &env->src_rq->cfs_tasks;
3235 long rem_load_move = max_load_move; 3228 struct task_struct *p;
3236 struct task_struct *p, *n; 3229 unsigned long load;
3230 int pulled = 0;
3231
3232 if (env->load_move <= 0)
3233 return 0;
3237 3234
3238 if (max_load_move == 0) 3235 while (!list_empty(tasks)) {
3239 goto out; 3236 p = list_first_entry(tasks, struct task_struct, se.group_node);
3240 3237
3241 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { 3238 env->loop++;
3242 if (loops++ > sysctl_sched_nr_migrate) { 3239 /* We've more or less seen every task there is, call it quits */
3243 *lb_flags |= LBF_NEED_BREAK; 3240 if (env->loop > env->loop_max)
3241 break;
3242
3243 /* take a breather every nr_migrate tasks */
3244 if (env->loop > env->loop_break) {
3245 env->loop_break += sysctl_sched_nr_migrate;
3246 env->flags |= LBF_NEED_BREAK;
3244 break; 3247 break;
3245 } 3248 }
3246 3249
3247 if ((p->se.load.weight >> 1) > rem_load_move || 3250 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
3248 !can_migrate_task(p, busiest, this_cpu, sd, idle, 3251 goto next;
3249 lb_flags)) 3252
3250 continue; 3253 load = task_h_load(p);
3254
3255 if (load < 16 && !env->sd->nr_balance_failed)
3256 goto next;
3257
3258 if ((load / 2) > env->load_move)
3259 goto next;
3251 3260
3252 pull_task(busiest, p, this_rq, this_cpu); 3261 if (!can_migrate_task(p, env))
3262 goto next;
3263
3264 move_task(p, env);
3253 pulled++; 3265 pulled++;
3254 rem_load_move -= p->se.load.weight; 3266 env->load_move -= load;
3255 3267
3256#ifdef CONFIG_PREEMPT 3268#ifdef CONFIG_PREEMPT
3257 /* 3269 /*
@@ -3259,28 +3271,30 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3259 * kernels will stop after the first task is pulled to minimize 3271 * kernels will stop after the first task is pulled to minimize
3260 * the critical section. 3272 * the critical section.
3261 */ 3273 */
3262 if (idle == CPU_NEWLY_IDLE) { 3274 if (env->idle == CPU_NEWLY_IDLE)
3263 *lb_flags |= LBF_ABORT;
3264 break; 3275 break;
3265 }
3266#endif 3276#endif
3267 3277
3268 /* 3278 /*
3269 * We only want to steal up to the prescribed amount of 3279 * We only want to steal up to the prescribed amount of
3270 * weighted load. 3280 * weighted load.
3271 */ 3281 */
3272 if (rem_load_move <= 0) 3282 if (env->load_move <= 0)
3273 break; 3283 break;
3284
3285 continue;
3286next:
3287 list_move_tail(&p->se.group_node, tasks);
3274 } 3288 }
3275out: 3289
3276 /* 3290 /*
3277 * Right now, this is one of only two places pull_task() is called, 3291 * Right now, this is one of only two places move_task() is called,
3278 * so we can safely collect pull_task() stats here rather than 3292 * so we can safely collect move_task() stats here rather than
3279 * inside pull_task(). 3293 * inside move_task().
3280 */ 3294 */
3281 schedstat_add(sd, lb_gained[idle], pulled); 3295 schedstat_add(env->sd, lb_gained[env->idle], pulled);
3282 3296
3283 return max_load_move - rem_load_move; 3297 return pulled;
3284} 3298}
3285 3299
3286#ifdef CONFIG_FAIR_GROUP_SCHED 3300#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3360,113 +3374,35 @@ static int tg_load_down(struct task_group *tg, void *data)
3360 3374
3361static void update_h_load(long cpu) 3375static void update_h_load(long cpu)
3362{ 3376{
3377 rcu_read_lock();
3363 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 3378 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
3379 rcu_read_unlock();
3364} 3380}
3365 3381
3366static unsigned long 3382static unsigned long task_h_load(struct task_struct *p)
3367load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3368 unsigned long max_load_move,
3369 struct sched_domain *sd, enum cpu_idle_type idle,
3370 int *lb_flags)
3371{ 3383{
3372 long rem_load_move = max_load_move; 3384 struct cfs_rq *cfs_rq = task_cfs_rq(p);
3373 struct cfs_rq *busiest_cfs_rq; 3385 unsigned long load;
3374
3375 rcu_read_lock();
3376 update_h_load(cpu_of(busiest));
3377
3378 for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
3379 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
3380 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
3381 u64 rem_load, moved_load;
3382
3383 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3384 break;
3385
3386 /*
3387 * empty group or part of a throttled hierarchy
3388 */
3389 if (!busiest_cfs_rq->task_weight ||
3390 throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
3391 continue;
3392
3393 rem_load = (u64)rem_load_move * busiest_weight;
3394 rem_load = div_u64(rem_load, busiest_h_load + 1);
3395
3396 moved_load = balance_tasks(this_rq, this_cpu, busiest,
3397 rem_load, sd, idle, lb_flags,
3398 busiest_cfs_rq);
3399
3400 if (!moved_load)
3401 continue;
3402 3386
3403 moved_load *= busiest_h_load; 3387 load = p->se.load.weight;
3404 moved_load = div_u64(moved_load, busiest_weight + 1); 3388 load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
3405 3389
3406 rem_load_move -= moved_load; 3390 return load;
3407 if (rem_load_move < 0)
3408 break;
3409 }
3410 rcu_read_unlock();
3411
3412 return max_load_move - rem_load_move;
3413} 3391}
3414#else 3392#else
3415static inline void update_shares(int cpu) 3393static inline void update_shares(int cpu)
3416{ 3394{
3417} 3395}
3418 3396
3419static unsigned long 3397static inline void update_h_load(long cpu)
3420load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3421 unsigned long max_load_move,
3422 struct sched_domain *sd, enum cpu_idle_type idle,
3423 int *lb_flags)
3424{ 3398{
3425 return balance_tasks(this_rq, this_cpu, busiest,
3426 max_load_move, sd, idle, lb_flags,
3427 &busiest->cfs);
3428} 3399}
3429#endif
3430 3400
3431/* 3401static unsigned long task_h_load(struct task_struct *p)
3432 * move_tasks tries to move up to max_load_move weighted load from busiest to
3433 * this_rq, as part of a balancing operation within domain "sd".
3434 * Returns 1 if successful and 0 otherwise.
3435 *
3436 * Called with both runqueues locked.
3437 */
3438static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3439 unsigned long max_load_move,
3440 struct sched_domain *sd, enum cpu_idle_type idle,
3441 int *lb_flags)
3442{ 3402{
3443 unsigned long total_load_moved = 0, load_moved; 3403 return p->se.load.weight;
3444
3445 do {
3446 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
3447 max_load_move - total_load_moved,
3448 sd, idle, lb_flags);
3449
3450 total_load_moved += load_moved;
3451
3452 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3453 break;
3454
3455#ifdef CONFIG_PREEMPT
3456 /*
3457 * NEWIDLE balancing is a source of latency, so preemptible
3458 * kernels will stop after the first task is pulled to minimize
3459 * the critical section.
3460 */
3461 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
3462 *lb_flags |= LBF_ABORT;
3463 break;
3464 }
3465#endif
3466 } while (load_moved && max_load_move > total_load_moved);
3467
3468 return total_load_moved > 0;
3469} 3404}
3405#endif
3470 3406
3471/********** Helpers for find_busiest_group ************************/ 3407/********** Helpers for find_busiest_group ************************/
3472/* 3408/*
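
With CONFIG_FAIR_GROUP_SCHED, the new task_h_load() above scales a task's own weight by its group's hierarchical load: load = se.load.weight * cfs_rq->h_load / (cfs_rq->load.weight + 1). A small worked example with made-up numbers (1024 is the nice-0 weight):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t task_weight   = 1024;  /* the task's own (nice-0) weight    */
        uint64_t cfs_rq_weight = 3072;  /* total weight queued on its cfs_rq */
        uint64_t h_load        = 2048;  /* the group's load seen at the root */

        uint64_t load = task_weight * h_load / (cfs_rq_weight + 1);

        printf("task_h_load = %llu\n", (unsigned long long)load);  /* 682 */
        return 0;
}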
@@ -3776,6 +3712,11 @@ void update_group_power(struct sched_domain *sd, int cpu)
3776 struct sched_domain *child = sd->child; 3712 struct sched_domain *child = sd->child;
3777 struct sched_group *group, *sdg = sd->groups; 3713 struct sched_group *group, *sdg = sd->groups;
3778 unsigned long power; 3714 unsigned long power;
3715 unsigned long interval;
3716
3717 interval = msecs_to_jiffies(sd->balance_interval);
3718 interval = clamp(interval, 1UL, max_load_balance_interval);
3719 sdg->sgp->next_update = jiffies + interval;
3779 3720
3780 if (!child) { 3721 if (!child) {
3781 update_cpu_power(sd, cpu); 3722 update_cpu_power(sd, cpu);
@@ -3883,12 +3824,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3883 * domains. In the newly idle case, we will allow all the cpu's 3824 * domains. In the newly idle case, we will allow all the cpu's
3884 * to do the newly idle load balance. 3825 * to do the newly idle load balance.
3885 */ 3826 */
3886 if (idle != CPU_NEWLY_IDLE && local_group) { 3827 if (local_group) {
3887 if (balance_cpu != this_cpu) { 3828 if (idle != CPU_NEWLY_IDLE) {
3888 *balance = 0; 3829 if (balance_cpu != this_cpu) {
3889 return; 3830 *balance = 0;
3890 } 3831 return;
3891 update_group_power(sd, this_cpu); 3832 }
3833 update_group_power(sd, this_cpu);
3834 } else if (time_after_eq(jiffies, group->sgp->next_update))
3835 update_group_power(sd, this_cpu);
3892 } 3836 }
3893 3837
3894 /* Adjust by relative CPU power of the group */ 3838 /* Adjust by relative CPU power of the group */
@@ -4451,13 +4395,21 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4451 struct sched_domain *sd, enum cpu_idle_type idle, 4395 struct sched_domain *sd, enum cpu_idle_type idle,
4452 int *balance) 4396 int *balance)
4453{ 4397{
4454 int ld_moved, lb_flags = 0, active_balance = 0; 4398 int ld_moved, active_balance = 0;
4455 struct sched_group *group; 4399 struct sched_group *group;
4456 unsigned long imbalance; 4400 unsigned long imbalance;
4457 struct rq *busiest; 4401 struct rq *busiest;
4458 unsigned long flags; 4402 unsigned long flags;
4459 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4403 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4460 4404
4405 struct lb_env env = {
4406 .sd = sd,
4407 .dst_cpu = this_cpu,
4408 .dst_rq = this_rq,
4409 .idle = idle,
4410 .loop_break = sysctl_sched_nr_migrate,
4411 };
4412
4461 cpumask_copy(cpus, cpu_active_mask); 4413 cpumask_copy(cpus, cpu_active_mask);
4462 4414
4463 schedstat_inc(sd, lb_count[idle]); 4415 schedstat_inc(sd, lb_count[idle]);
@@ -4492,32 +4444,34 @@ redo:
4492 * still unbalanced. ld_moved simply stays zero, so it is 4444 * still unbalanced. ld_moved simply stays zero, so it is
4493 * correctly treated as an imbalance. 4445 * correctly treated as an imbalance.
4494 */ 4446 */
4495 lb_flags |= LBF_ALL_PINNED; 4447 env.flags |= LBF_ALL_PINNED;
4448 env.load_move = imbalance;
4449 env.src_cpu = busiest->cpu;
4450 env.src_rq = busiest;
4451 env.loop_max = busiest->nr_running;
4452
4453more_balance:
4496 local_irq_save(flags); 4454 local_irq_save(flags);
4497 double_rq_lock(this_rq, busiest); 4455 double_rq_lock(this_rq, busiest);
4498 ld_moved = move_tasks(this_rq, this_cpu, busiest, 4456 if (!env.loop)
4499 imbalance, sd, idle, &lb_flags); 4457 update_h_load(env.src_cpu);
4458 ld_moved += move_tasks(&env);
4500 double_rq_unlock(this_rq, busiest); 4459 double_rq_unlock(this_rq, busiest);
4501 local_irq_restore(flags); 4460 local_irq_restore(flags);
4502 4461
4462 if (env.flags & LBF_NEED_BREAK) {
4463 env.flags &= ~LBF_NEED_BREAK;
4464 goto more_balance;
4465 }
4466
4503 /* 4467 /*
4504 * some other cpu did the load balance for us. 4468 * some other cpu did the load balance for us.
4505 */ 4469 */
4506 if (ld_moved && this_cpu != smp_processor_id()) 4470 if (ld_moved && this_cpu != smp_processor_id())
4507 resched_cpu(this_cpu); 4471 resched_cpu(this_cpu);
4508 4472
4509 if (lb_flags & LBF_ABORT)
4510 goto out_balanced;
4511
4512 if (lb_flags & LBF_NEED_BREAK) {
4513 lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
4514 if (lb_flags & LBF_ABORT)
4515 goto out_balanced;
4516 goto redo;
4517 }
4518
4519 /* All tasks on this runqueue were pinned by CPU affinity */ 4473 /* All tasks on this runqueue were pinned by CPU affinity */
4520 if (unlikely(lb_flags & LBF_ALL_PINNED)) { 4474 if (unlikely(env.flags & LBF_ALL_PINNED)) {
4521 cpumask_clear_cpu(cpu_of(busiest), cpus); 4475 cpumask_clear_cpu(cpu_of(busiest), cpus);
4522 if (!cpumask_empty(cpus)) 4476 if (!cpumask_empty(cpus))
4523 goto redo; 4477 goto redo;
@@ -4547,7 +4501,7 @@ redo:
4547 tsk_cpus_allowed(busiest->curr))) { 4501 tsk_cpus_allowed(busiest->curr))) {
4548 raw_spin_unlock_irqrestore(&busiest->lock, 4502 raw_spin_unlock_irqrestore(&busiest->lock,
4549 flags); 4503 flags);
4550 lb_flags |= LBF_ALL_PINNED; 4504 env.flags |= LBF_ALL_PINNED;
4551 goto out_one_pinned; 4505 goto out_one_pinned;
4552 } 4506 }
4553 4507
@@ -4600,7 +4554,7 @@ out_balanced:
4600 4554
4601out_one_pinned: 4555out_one_pinned:
4602 /* tune up the balancing interval */ 4556 /* tune up the balancing interval */
4603 if (((lb_flags & LBF_ALL_PINNED) && 4557 if (((env.flags & LBF_ALL_PINNED) &&
4604 sd->balance_interval < MAX_PINNED_INTERVAL) || 4558 sd->balance_interval < MAX_PINNED_INTERVAL) ||
4605 (sd->balance_interval < sd->max_interval)) 4559 (sd->balance_interval < sd->max_interval))
4606 sd->balance_interval *= 2; 4560 sd->balance_interval *= 2;
@@ -4710,10 +4664,18 @@ static int active_load_balance_cpu_stop(void *data)
4710 } 4664 }
4711 4665
4712 if (likely(sd)) { 4666 if (likely(sd)) {
4667 struct lb_env env = {
4668 .sd = sd,
4669 .dst_cpu = target_cpu,
4670 .dst_rq = target_rq,
4671 .src_cpu = busiest_rq->cpu,
4672 .src_rq = busiest_rq,
4673 .idle = CPU_IDLE,
4674 };
4675
4713 schedstat_inc(sd, alb_count); 4676 schedstat_inc(sd, alb_count);
4714 4677
4715 if (move_one_task(target_rq, target_cpu, busiest_rq, 4678 if (move_one_task(&env))
4716 sd, CPU_IDLE))
4717 schedstat_inc(sd, alb_pushed); 4679 schedstat_inc(sd, alb_pushed);
4718 else 4680 else
4719 schedstat_inc(sd, alb_failed); 4681 schedstat_inc(sd, alb_failed);
@@ -4945,8 +4907,6 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
4945 4907
4946static DEFINE_SPINLOCK(balancing); 4908static DEFINE_SPINLOCK(balancing);
4947 4909
4948static unsigned long __read_mostly max_load_balance_interval = HZ/10;
4949
4950/* 4910/*
4951 * Scale the max load_balance interval with the number of CPUs in the system. 4911 * Scale the max load_balance interval with the number of CPUs in the system.
4952 * This trades load-balance latency on larger machines for less cross talk. 4912 * This trades load-balance latency on larger machines for less cross talk.
@@ -5340,7 +5300,6 @@ static void set_curr_task_fair(struct rq *rq)
5340void init_cfs_rq(struct cfs_rq *cfs_rq) 5300void init_cfs_rq(struct cfs_rq *cfs_rq)
5341{ 5301{
5342 cfs_rq->tasks_timeline = RB_ROOT; 5302 cfs_rq->tasks_timeline = RB_ROOT;
5343 INIT_LIST_HEAD(&cfs_rq->tasks);
5344 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 5303 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5345#ifndef CONFIG_64BIT 5304#ifndef CONFIG_64BIT
5346 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 5305 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
@@ -5612,6 +5571,7 @@ __init void init_sched_fair_class(void)
5612 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 5571 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
5613 5572
5614#ifdef CONFIG_NO_HZ 5573#ifdef CONFIG_NO_HZ
5574 nohz.next_balance = jiffies;
5615 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 5575 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
5616 cpu_notifier(sched_ilb_notifier, 0); 5576 cpu_notifier(sched_ilb_notifier, 0);
5617#endif 5577#endif
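The fair.c hunks above fold the long move_tasks()/load_balance() argument lists into a single struct lb_env that load_balance() fills in once and hands down the call chain, retrying with the same env when LBF_NEED_BREAK is set. A minimal sketch of that pattern; the struct below is only an illustrative subset of the fields visible in the initializers above, and pull_imbalance() is a hypothetical caller, not a function in the patch:

	/* Illustrative subset of the balancing environment used above. */
	struct lb_env {
		struct sched_domain	*sd;		/* domain being balanced */
		struct rq		*src_rq, *dst_rq;
		int			src_cpu, dst_cpu;
		enum cpu_idle_type	idle;
		unsigned long		load_move;	/* imbalance left to pull */
		unsigned int		loop, loop_break, loop_max;
		unsigned int		flags;		/* LBF_* bits */
	};

	/* One env replaces the seven separate parameters of the old call. */
	static int pull_imbalance(struct rq *dst_rq, int dst_cpu, struct rq *src_rq,
				  struct sched_domain *sd, enum cpu_idle_type idle,
				  unsigned long imbalance)
	{
		struct lb_env env = {
			.sd		= sd,
			.dst_cpu	= dst_cpu,
			.dst_rq		= dst_rq,
			.src_cpu	= cpu_of(src_rq),
			.src_rq		= src_rq,
			.idle		= idle,
			.load_move	= imbalance,
			.loop_break	= sysctl_sched_nr_migrate,
			.loop_max	= src_rq->nr_running,
		};

		return move_tasks(&env);
	}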
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f42ae7fb5ec5..44af55e6d5d0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -778,12 +778,9 @@ static inline int balance_runtime(struct rt_rq *rt_rq)
778 778
779static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) 779static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
780{ 780{
781 int i, idle = 1; 781 int i, idle = 1, throttled = 0;
782 const struct cpumask *span; 782 const struct cpumask *span;
783 783
784 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
785 return 1;
786
787 span = sched_rt_period_mask(); 784 span = sched_rt_period_mask();
788 for_each_cpu(i, span) { 785 for_each_cpu(i, span) {
789 int enqueue = 0; 786 int enqueue = 0;
@@ -818,12 +815,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
818 if (!rt_rq_throttled(rt_rq)) 815 if (!rt_rq_throttled(rt_rq))
819 enqueue = 1; 816 enqueue = 1;
820 } 817 }
818 if (rt_rq->rt_throttled)
819 throttled = 1;
821 820
822 if (enqueue) 821 if (enqueue)
823 sched_rt_rq_enqueue(rt_rq); 822 sched_rt_rq_enqueue(rt_rq);
824 raw_spin_unlock(&rq->lock); 823 raw_spin_unlock(&rq->lock);
825 } 824 }
826 825
826 if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
827 return 1;
828
827 return idle; 829 return idle;
828} 830}
829 831
@@ -855,8 +857,30 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
855 return 0; 857 return 0;
856 858
857 if (rt_rq->rt_time > runtime) { 859 if (rt_rq->rt_time > runtime) {
858 rt_rq->rt_throttled = 1; 860 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
859 printk_once(KERN_WARNING "sched: RT throttling activated\n"); 861
862 /*
863 * Don't actually throttle groups that have no runtime assigned
864 * but accrue some time due to boosting.
865 */
866 if (likely(rt_b->rt_runtime)) {
867 static bool once = false;
868
869 rt_rq->rt_throttled = 1;
870
871 if (!once) {
872 once = true;
873 printk_sched("sched: RT throttling activated\n");
874 }
875 } else {
876 /*
877 * In case we did anyway, make it go away,
878 * replenishment is a joke, since it will replenish us
879 * with exactly 0 ns.
880 */
881 rt_rq->rt_time = 0;
882 }
883
860 if (rt_rq_throttled(rt_rq)) { 884 if (rt_rq_throttled(rt_rq)) {
861 sched_rt_rq_dequeue(rt_rq); 885 sched_rt_rq_dequeue(rt_rq);
862 return 1; 886 return 1;
@@ -884,7 +908,8 @@ static void update_curr_rt(struct rq *rq)
884 if (unlikely((s64)delta_exec < 0)) 908 if (unlikely((s64)delta_exec < 0))
885 delta_exec = 0; 909 delta_exec = 0;
886 910
887 schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); 911 schedstat_set(curr->se.statistics.exec_max,
912 max(curr->se.statistics.exec_max, delta_exec));
888 913
889 curr->se.sum_exec_runtime += delta_exec; 914 curr->se.sum_exec_runtime += delta_exec;
890 account_group_exec_runtime(curr, delta_exec); 915 account_group_exec_runtime(curr, delta_exec);
@@ -1403,7 +1428,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1403next_idx: 1428next_idx:
1404 if (idx >= MAX_RT_PRIO) 1429 if (idx >= MAX_RT_PRIO)
1405 continue; 1430 continue;
1406 if (next && next->prio < idx) 1431 if (next && next->prio <= idx)
1407 continue; 1432 continue;
1408 list_for_each_entry(rt_se, array->queue + idx, run_list) { 1433 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1409 struct task_struct *p; 1434 struct task_struct *p;
@@ -1972,7 +1997,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1972 if (--p->rt.time_slice) 1997 if (--p->rt.time_slice)
1973 return; 1998 return;
1974 1999
1975 p->rt.time_slice = DEF_TIMESLICE; 2000 p->rt.time_slice = RR_TIMESLICE;
1976 2001
1977 /* 2002 /*
1978 * Requeue to the end of queue if we are not the only element 2003 * Requeue to the end of queue if we are not the only element
@@ -2000,7 +2025,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2000 * Time slice is 0 for SCHED_FIFO tasks 2025 * Time slice is 0 for SCHED_FIFO tasks
2001 */ 2026 */
2002 if (task->policy == SCHED_RR) 2027 if (task->policy == SCHED_RR)
2003 return DEF_TIMESLICE; 2028 return RR_TIMESLICE;
2004 else 2029 else
2005 return 0; 2030 return 0;
2006} 2031}
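The DEF_TIMESLICE to RR_TIMESLICE switch above is visible from user space through sched_rr_get_interval(), which for SCHED_RR tasks reports the value returned by get_rr_interval_rt(). A small self-contained check; run it with enough privilege for sched_setscheduler(), and note that the priority of 1 is arbitrary:

	#include <sched.h>
	#include <stdio.h>
	#include <time.h>

	int main(void)
	{
		struct sched_param sp = { .sched_priority = 1 };
		struct timespec ts;

		/* Switch to SCHED_RR so the kernel reports the round-robin slice. */
		if (sched_setscheduler(0, SCHED_RR, &sp))
			perror("sched_setscheduler");

		if (sched_rr_get_interval(0, &ts))
			perror("sched_rr_get_interval");
		else
			printf("RR timeslice: %ld.%09ld s\n",
			       (long)ts.tv_sec, ts.tv_nsec);
		return 0;
	}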
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 98c0c2623db8..fb3acba4d52e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -36,11 +36,7 @@ extern __read_mostly int scheduler_running;
36 36
37/* 37/*
38 * These are the 'tuning knobs' of the scheduler: 38 * These are the 'tuning knobs' of the scheduler:
39 *
40 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
41 * Timeslices get refilled after they expire.
42 */ 39 */
43#define DEF_TIMESLICE (100 * HZ / 1000)
44 40
45/* 41/*
46 * single value that denotes runtime == period, ie unlimited time. 42 * single value that denotes runtime == period, ie unlimited time.
@@ -216,9 +212,6 @@ struct cfs_rq {
216 struct rb_root tasks_timeline; 212 struct rb_root tasks_timeline;
217 struct rb_node *rb_leftmost; 213 struct rb_node *rb_leftmost;
218 214
219 struct list_head tasks;
220 struct list_head *balance_iterator;
221
222 /* 215 /*
223 * 'curr' points to currently running entity on this cfs_rq. 216 * 'curr' points to currently running entity on this cfs_rq.
224 * It is set to NULL otherwise (i.e when none are currently running). 217 * It is set to NULL otherwise (i.e when none are currently running).
@@ -246,11 +239,6 @@ struct cfs_rq {
246 239
247#ifdef CONFIG_SMP 240#ifdef CONFIG_SMP
248 /* 241 /*
249 * the part of load.weight contributed by tasks
250 */
251 unsigned long task_weight;
252
253 /*
254 * h_load = weight * f(tg) 242 * h_load = weight * f(tg)
255 * 243 *
256 * Where f(tg) is the recursive weight fraction assigned to 244 * Where f(tg) is the recursive weight fraction assigned to
@@ -424,6 +412,8 @@ struct rq {
424 int cpu; 412 int cpu;
425 int online; 413 int online;
426 414
415 struct list_head cfs_tasks;
416
427 u64 rt_avg; 417 u64 rt_avg;
428 u64 age_stamp; 418 u64 age_stamp;
429 u64 idle_stamp; 419 u64 idle_stamp;
@@ -462,7 +452,6 @@ struct rq {
462 unsigned int yld_count; 452 unsigned int yld_count;
463 453
464 /* schedule() stats */ 454 /* schedule() stats */
465 unsigned int sched_switch;
466 unsigned int sched_count; 455 unsigned int sched_count;
467 unsigned int sched_goidle; 456 unsigned int sched_goidle;
468 457
@@ -611,7 +600,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
611 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 600 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
612 */ 601 */
613#ifdef CONFIG_SCHED_DEBUG 602#ifdef CONFIG_SCHED_DEBUG
614# include <linux/jump_label.h> 603# include <linux/static_key.h>
615# define const_debug __read_mostly 604# define const_debug __read_mostly
616#else 605#else
617# define const_debug const 606# define const_debug const
@@ -630,18 +619,18 @@ enum {
630#undef SCHED_FEAT 619#undef SCHED_FEAT
631 620
632#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) 621#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
633static __always_inline bool static_branch__true(struct jump_label_key *key) 622static __always_inline bool static_branch__true(struct static_key *key)
634{ 623{
635 return likely(static_branch(key)); /* Not out of line branch. */ 624 return static_key_true(key); /* Not out of line branch. */
636} 625}
637 626
638static __always_inline bool static_branch__false(struct jump_label_key *key) 627static __always_inline bool static_branch__false(struct static_key *key)
639{ 628{
640 return unlikely(static_branch(key)); /* Out of line branch. */ 629 return static_key_false(key); /* Out of line branch. */
641} 630}
642 631
643#define SCHED_FEAT(name, enabled) \ 632#define SCHED_FEAT(name, enabled) \
644static __always_inline bool static_branch_##name(struct jump_label_key *key) \ 633static __always_inline bool static_branch_##name(struct static_key *key) \
645{ \ 634{ \
646 return static_branch__##enabled(key); \ 635 return static_branch__##enabled(key); \
647} 636}
@@ -650,7 +639,7 @@ static __always_inline bool static_branch_##name(struct jump_label_key *key) \
650 639
651#undef SCHED_FEAT 640#undef SCHED_FEAT
652 641
653extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; 642extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
654#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) 643#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
655#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ 644#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
656#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 645#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
@@ -692,6 +681,9 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
692#ifndef finish_arch_switch 681#ifndef finish_arch_switch
693# define finish_arch_switch(prev) do { } while (0) 682# define finish_arch_switch(prev) do { } while (0)
694#endif 683#endif
684#ifndef finish_arch_post_lock_switch
685# define finish_arch_post_lock_switch() do { } while (0)
686#endif
695 687
696#ifndef __ARCH_WANT_UNLOCKED_CTXSW 688#ifndef __ARCH_WANT_UNLOCKED_CTXSW
697static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 689static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
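The sched.h hunks above are part of the jump_label to static_key rename: sched_feat() now tests a struct static_key via static_key_true()/static_key_false(). A module-style sketch of the same primitive, assuming a jump-label-capable build; the key, function, and module names are made up for illustration:

	#include <linux/module.h>
	#include <linux/static_key.h>

	/* Rarely-toggled flag compiled into the hot path as a patched branch. */
	static struct static_key my_feature = STATIC_KEY_INIT_FALSE;

	static void hot_path(void)
	{
		/* Out-of-line branch while the key is disabled, like the
		 * static_branch__false() wrapper above. */
		if (static_key_false(&my_feature))
			pr_info("my_feature enabled\n");
	}

	static int __init my_init(void)
	{
		hot_path();				/* branch not taken */
		static_key_slow_inc(&my_feature);	/* patch the branch in */
		hot_path();				/* branch taken */
		static_key_slow_dec(&my_feature);
		return 0;
	}

	static void __exit my_exit(void) { }

	module_init(my_init);
	module_exit(my_exit);
	MODULE_LICENSE("GPL");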
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 2a581ba8e190..903ffa9e8872 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -32,9 +32,9 @@ static int show_schedstat(struct seq_file *seq, void *v)
32 32
33 /* runqueue-specific stats */ 33 /* runqueue-specific stats */
34 seq_printf(seq, 34 seq_printf(seq,
35 "cpu%d %u %u %u %u %u %u %llu %llu %lu", 35 "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
36 cpu, rq->yld_count, 36 cpu, rq->yld_count,
37 rq->sched_switch, rq->sched_count, rq->sched_goidle, 37 rq->sched_count, rq->sched_goidle,
38 rq->ttwu_count, rq->ttwu_local, 38 rq->ttwu_count, rq->ttwu_local,
39 rq->rq_cpu_time, 39 rq->rq_cpu_time,
40 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); 40 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
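With rq->sched_switch gone, the per-cpu line in /proc/schedstat keeps its field count by printing a literal 0 in the second position, so existing parsers stay aligned. A small user-space reader that follows the format string shown above; the variable names only describe the field order:

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/schedstat", "r");
		char line[512];
		unsigned int cpu, yld, zero, sched, goidle, ttwu, ttwu_local;
		unsigned long long cpu_time, run_delay;
		unsigned long pcount;

		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f)) {
			/* matches "cpu%d %u 0 %u %u %u %u %llu %llu %lu" */
			if (sscanf(line, "cpu%u %u %u %u %u %u %u %llu %llu %lu",
				   &cpu, &yld, &zero, &sched, &goidle, &ttwu,
				   &ttwu_local, &cpu_time, &run_delay, &pcount) == 10)
				printf("cpu%u: %u schedules, %u wakeups\n",
				       cpu, sched, ttwu);
		}
		fclose(f);
		return 0;
	}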
diff --git a/kernel/signal.c b/kernel/signal.c
index c73c4284160e..17afcaf582d0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -36,6 +36,7 @@
36#include <asm/uaccess.h> 36#include <asm/uaccess.h>
37#include <asm/unistd.h> 37#include <asm/unistd.h>
38#include <asm/siginfo.h> 38#include <asm/siginfo.h>
39#include <asm/cacheflush.h>
39#include "audit.h" /* audit_signal_info() */ 40#include "audit.h" /* audit_signal_info() */
40 41
41/* 42/*
@@ -58,21 +59,20 @@ static int sig_handler_ignored(void __user *handler, int sig)
58 (handler == SIG_DFL && sig_kernel_ignore(sig)); 59 (handler == SIG_DFL && sig_kernel_ignore(sig));
59} 60}
60 61
61static int sig_task_ignored(struct task_struct *t, int sig, 62static int sig_task_ignored(struct task_struct *t, int sig, bool force)
62 int from_ancestor_ns)
63{ 63{
64 void __user *handler; 64 void __user *handler;
65 65
66 handler = sig_handler(t, sig); 66 handler = sig_handler(t, sig);
67 67
68 if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && 68 if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
69 handler == SIG_DFL && !from_ancestor_ns) 69 handler == SIG_DFL && !force)
70 return 1; 70 return 1;
71 71
72 return sig_handler_ignored(handler, sig); 72 return sig_handler_ignored(handler, sig);
73} 73}
74 74
75static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) 75static int sig_ignored(struct task_struct *t, int sig, bool force)
76{ 76{
77 /* 77 /*
78 * Blocked signals are never ignored, since the 78 * Blocked signals are never ignored, since the
@@ -82,7 +82,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
82 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) 82 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
83 return 0; 83 return 0;
84 84
85 if (!sig_task_ignored(t, sig, from_ancestor_ns)) 85 if (!sig_task_ignored(t, sig, force))
86 return 0; 86 return 0;
87 87
88 /* 88 /*
@@ -855,7 +855,7 @@ static void ptrace_trap_notify(struct task_struct *t)
855 * Returns true if the signal should be actually delivered, otherwise 855 * Returns true if the signal should be actually delivered, otherwise
856 * it should be dropped. 856 * it should be dropped.
857 */ 857 */
858static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) 858static int prepare_signal(int sig, struct task_struct *p, bool force)
859{ 859{
860 struct signal_struct *signal = p->signal; 860 struct signal_struct *signal = p->signal;
861 struct task_struct *t; 861 struct task_struct *t;
@@ -915,7 +915,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
915 } 915 }
916 } 916 }
917 917
918 return !sig_ignored(p, sig, from_ancestor_ns); 918 return !sig_ignored(p, sig, force);
919} 919}
920 920
921/* 921/*
@@ -1054,13 +1054,14 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1054 struct sigpending *pending; 1054 struct sigpending *pending;
1055 struct sigqueue *q; 1055 struct sigqueue *q;
1056 int override_rlimit; 1056 int override_rlimit;
1057 1057 int ret = 0, result;
1058 trace_signal_generate(sig, info, t);
1059 1058
1060 assert_spin_locked(&t->sighand->siglock); 1059 assert_spin_locked(&t->sighand->siglock);
1061 1060
1062 if (!prepare_signal(sig, t, from_ancestor_ns)) 1061 result = TRACE_SIGNAL_IGNORED;
1063 return 0; 1062 if (!prepare_signal(sig, t,
1063 from_ancestor_ns || (info == SEND_SIG_FORCED)))
1064 goto ret;
1064 1065
1065 pending = group ? &t->signal->shared_pending : &t->pending; 1066 pending = group ? &t->signal->shared_pending : &t->pending;
1066 /* 1067 /*
@@ -1068,8 +1069,11 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1068 * exactly one non-rt signal, so that we can get more 1069 * exactly one non-rt signal, so that we can get more
1069 * detailed information about the cause of the signal. 1070 * detailed information about the cause of the signal.
1070 */ 1071 */
1072 result = TRACE_SIGNAL_ALREADY_PENDING;
1071 if (legacy_queue(pending, sig)) 1073 if (legacy_queue(pending, sig))
1072 return 0; 1074 goto ret;
1075
1076 result = TRACE_SIGNAL_DELIVERED;
1073 /* 1077 /*
1074 * fast-pathed signals for kernel-internal things like SIGSTOP 1078 * fast-pathed signals for kernel-internal things like SIGSTOP
1075 * or SIGKILL. 1079 * or SIGKILL.
@@ -1127,14 +1131,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1127 * signal was rt and sent by user using something 1131 * signal was rt and sent by user using something
1128 * other than kill(). 1132 * other than kill().
1129 */ 1133 */
1130 trace_signal_overflow_fail(sig, group, info); 1134 result = TRACE_SIGNAL_OVERFLOW_FAIL;
1131 return -EAGAIN; 1135 ret = -EAGAIN;
1136 goto ret;
1132 } else { 1137 } else {
1133 /* 1138 /*
1134 * This is a silent loss of information. We still 1139 * This is a silent loss of information. We still
1135 * send the signal, but the *info bits are lost. 1140 * send the signal, but the *info bits are lost.
1136 */ 1141 */
1137 trace_signal_lose_info(sig, group, info); 1142 result = TRACE_SIGNAL_LOSE_INFO;
1138 } 1143 }
1139 } 1144 }
1140 1145
@@ -1142,7 +1147,9 @@ out_set:
1142 signalfd_notify(t, sig); 1147 signalfd_notify(t, sig);
1143 sigaddset(&pending->signal, sig); 1148 sigaddset(&pending->signal, sig);
1144 complete_signal(sig, t, group); 1149 complete_signal(sig, t, group);
1145 return 0; 1150ret:
1151 trace_signal_generate(sig, info, t, group, result);
1152 return ret;
1146} 1153}
1147 1154
1148static int send_signal(int sig, struct siginfo *info, struct task_struct *t, 1155static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
@@ -1585,7 +1592,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1585 int sig = q->info.si_signo; 1592 int sig = q->info.si_signo;
1586 struct sigpending *pending; 1593 struct sigpending *pending;
1587 unsigned long flags; 1594 unsigned long flags;
1588 int ret; 1595 int ret, result;
1589 1596
1590 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1597 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1591 1598
@@ -1594,7 +1601,8 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1594 goto ret; 1601 goto ret;
1595 1602
1596 ret = 1; /* the signal is ignored */ 1603 ret = 1; /* the signal is ignored */
1597 if (!prepare_signal(sig, t, 0)) 1604 result = TRACE_SIGNAL_IGNORED;
1605 if (!prepare_signal(sig, t, false))
1598 goto out; 1606 goto out;
1599 1607
1600 ret = 0; 1608 ret = 0;
@@ -1605,6 +1613,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1605 */ 1613 */
1606 BUG_ON(q->info.si_code != SI_TIMER); 1614 BUG_ON(q->info.si_code != SI_TIMER);
1607 q->info.si_overrun++; 1615 q->info.si_overrun++;
1616 result = TRACE_SIGNAL_ALREADY_PENDING;
1608 goto out; 1617 goto out;
1609 } 1618 }
1610 q->info.si_overrun = 0; 1619 q->info.si_overrun = 0;
@@ -1614,7 +1623,9 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1614 list_add_tail(&q->list, &pending->list); 1623 list_add_tail(&q->list, &pending->list);
1615 sigaddset(&pending->signal, sig); 1624 sigaddset(&pending->signal, sig);
1616 complete_signal(sig, t, group); 1625 complete_signal(sig, t, group);
1626 result = TRACE_SIGNAL_DELIVERED;
1617out: 1627out:
1628 trace_signal_generate(sig, &q->info, t, group, result);
1618 unlock_task_sighand(t, &flags); 1629 unlock_task_sighand(t, &flags);
1619ret: 1630ret:
1620 return ret; 1631 return ret;
@@ -1642,6 +1653,15 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1642 BUG_ON(!tsk->ptrace && 1653 BUG_ON(!tsk->ptrace &&
1643 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1654 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1644 1655
1656 if (sig != SIGCHLD) {
1657 /*
1658 * This is only possible if parent == real_parent.
1659 * Check if it has changed security domain.
1660 */
1661 if (tsk->parent_exec_id != tsk->parent->self_exec_id)
1662 sig = SIGCHLD;
1663 }
1664
1645 info.si_signo = sig; 1665 info.si_signo = sig;
1646 info.si_errno = 0; 1666 info.si_errno = 0;
1647 /* 1667 /*
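The do_notify_parent() hunk above only matters when a child was created with a non-SIGCHLD exit signal and the parent has since execve()d into a different security domain; in that case the custom signal is downgraded to plain SIGCHLD. For context, this is how such a child is created from user space (illustrative only: the 64 KiB stack, SIGUSR1, and a downward-growing stack are assumptions of the sketch):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <signal.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/wait.h>

	static int child_fn(void *arg)
	{
		(void)arg;
		return 0;
	}

	int main(void)
	{
		char *stack = malloc(64 * 1024);
		pid_t pid;

		if (!stack)
			return 1;
		/* Ignore the custom exit signal so it doesn't kill the parent. */
		signal(SIGUSR1, SIG_IGN);
		/* The low byte of the clone() flags is the signal sent to the
		 * parent on exit -- SIGUSR1 here instead of SIGCHLD.  The stack
		 * top is passed assuming the stack grows downward. */
		pid = clone(child_fn, stack + 64 * 1024, SIGUSR1, NULL);
		if (pid == -1)
			perror("clone");
		else
			waitpid(pid, NULL, __WALL);  /* __WALL: non-SIGCHLD exit signal */
		free(stack);
		return 0;
	}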
diff --git a/kernel/smp.c b/kernel/smp.c
index db197d60489b..2f8b10ecf759 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -701,3 +701,93 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait)
701 return ret; 701 return ret;
702} 702}
703EXPORT_SYMBOL(on_each_cpu); 703EXPORT_SYMBOL(on_each_cpu);
704
705/**
706 * on_each_cpu_mask(): Run a function on processors specified by
707 * cpumask, which may include the local processor.
708 * @mask: The set of cpus to run on (only runs on online subset).
709 * @func: The function to run. This must be fast and non-blocking.
710 * @info: An arbitrary pointer to pass to the function.
711 * @wait: If true, wait (atomically) until function has completed
712 * on other CPUs.
713 *
714 * If @wait is true, then returns once @func has returned.
715 *
716 * You must not call this function with disabled interrupts or
717 * from a hardware interrupt handler or from a bottom half handler.
718 */
719void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
720 void *info, bool wait)
721{
722 int cpu = get_cpu();
723
724 smp_call_function_many(mask, func, info, wait);
725 if (cpumask_test_cpu(cpu, mask)) {
726 local_irq_disable();
727 func(info);
728 local_irq_enable();
729 }
730 put_cpu();
731}
732EXPORT_SYMBOL(on_each_cpu_mask);
733
734/*
735 * on_each_cpu_cond(): Call a function on each processor for which
736 * the supplied function cond_func returns true, optionally waiting
737 * for all the required CPUs to finish. This may include the local
738 * processor.
739 * @cond_func: A callback function that is passed a cpu id and
740 * the info parameter. The function is called
741 * with preemption disabled. The function should
742 * return a boolean value indicating whether to IPI
743 * the specified CPU.
744 * @func: The function to run on all applicable CPUs.
745 * This must be fast and non-blocking.
746 * @info: An arbitrary pointer to pass to both functions.
747 * @wait: If true, wait (atomically) until function has
748 * completed on other CPUs.
749 * @gfp_flags: GFP flags to use when allocating the cpumask
750 * used internally by the function.
751 *
752 * The function might sleep if the GFP flags indicate a non-atomic
753 * allocation is allowed.
754 *
755 * Preemption is disabled to protect against CPUs going offline but not online.
756 * CPUs going online during the call will not be seen or sent an IPI.
757 *
758 * You must not call this function with disabled interrupts or
759 * from a hardware interrupt handler or from a bottom half handler.
760 */
761void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
762 smp_call_func_t func, void *info, bool wait,
763 gfp_t gfp_flags)
764{
765 cpumask_var_t cpus;
766 int cpu, ret;
767
768 might_sleep_if(gfp_flags & __GFP_WAIT);
769
770 if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
771 preempt_disable();
772 for_each_online_cpu(cpu)
773 if (cond_func(cpu, info))
774 cpumask_set_cpu(cpu, cpus);
775 on_each_cpu_mask(cpus, func, info, wait);
776 preempt_enable();
777 free_cpumask_var(cpus);
778 } else {
779 /*
780 * No free cpumask, bother. No matter, we'll
781 * just have to IPI them one by one.
782 */
783 preempt_disable();
784 for_each_online_cpu(cpu)
785 if (cond_func(cpu, info)) {
786 ret = smp_call_function_single(cpu, func,
787 info, wait);
788 WARN_ON_ONCE(!ret);
789 }
790 preempt_enable();
791 }
792}
793EXPORT_SYMBOL(on_each_cpu_cond);
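A minimal module-style sketch of the two helpers added above: on_each_cpu_mask() runs a function on an explicit CPU set, and on_each_cpu_cond() builds that set from a per-cpu predicate. The callback bodies and module names are placeholders; the callbacks must stay fast and non-blocking because they run in IPI context with interrupts disabled.

	#include <linux/module.h>
	#include <linux/smp.h>
	#include <linux/cpumask.h>
	#include <linux/slab.h>

	static void flush_local_state(void *info)
	{
		/* Runs on each selected CPU with interrupts disabled. */
		pr_info("flushing on cpu %d\n", smp_processor_id());
	}

	static bool cpu_needs_flush(int cpu, void *info)
	{
		/* Called with preemption disabled; decide whether to IPI @cpu. */
		return cpu_online(cpu);		/* placeholder predicate */
	}

	static int __init my_init(void)
	{
		/* Run on every online CPU and wait for completion. */
		on_each_cpu_mask(cpu_online_mask, flush_local_state, NULL, true);

		/* Run only where the predicate says so; may sleep for the
		 * internal cpumask allocation because of GFP_KERNEL. */
		on_each_cpu_cond(cpu_needs_flush, flush_local_state, NULL,
				 true, GFP_KERNEL);
		return 0;
	}

	static void __exit my_exit(void) { }

	module_init(my_init);
	module_exit(my_exit);
	MODULE_LICENSE("GPL");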
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 4eb3a0fa351e..671f9594e368 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -297,7 +297,7 @@ void irq_enter(void)
297 int cpu = smp_processor_id(); 297 int cpu = smp_processor_id();
298 298
299 rcu_irq_enter(); 299 rcu_irq_enter();
300 if (idle_cpu(cpu) && !in_interrupt()) { 300 if (is_idle_task(current) && !in_interrupt()) {
301 /* 301 /*
302 * Prevent raise_softirq from needlessly waking up ksoftirqd 302 * Prevent raise_softirq from needlessly waking up ksoftirqd
303 * here, as softirq will be serviced on return from interrupt. 303 * here, as softirq will be serviced on return from interrupt.
@@ -310,31 +310,21 @@ void irq_enter(void)
310 __irq_enter(); 310 __irq_enter();
311} 311}
312 312
313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
314static inline void invoke_softirq(void) 313static inline void invoke_softirq(void)
315{ 314{
316 if (!force_irqthreads) 315 if (!force_irqthreads) {
316#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
317 __do_softirq(); 317 __do_softirq();
318 else {
319 __local_bh_disable((unsigned long)__builtin_return_address(0),
320 SOFTIRQ_OFFSET);
321 wakeup_softirqd();
322 __local_bh_enable(SOFTIRQ_OFFSET);
323 }
324}
325#else 318#else
326static inline void invoke_softirq(void)
327{
328 if (!force_irqthreads)
329 do_softirq(); 319 do_softirq();
330 else { 320#endif
321 } else {
331 __local_bh_disable((unsigned long)__builtin_return_address(0), 322 __local_bh_disable((unsigned long)__builtin_return_address(0),
332 SOFTIRQ_OFFSET); 323 SOFTIRQ_OFFSET);
333 wakeup_softirqd(); 324 wakeup_softirqd();
334 __local_bh_enable(SOFTIRQ_OFFSET); 325 __local_bh_enable(SOFTIRQ_OFFSET);
335 } 326 }
336} 327}
337#endif
338 328
339/* 329/*
340 * Exit an interrupt context. Process softirqs if needed and possible: 330 * Exit an interrupt context. Process softirqs if needed and possible:
@@ -353,7 +343,7 @@ void irq_exit(void)
353 tick_nohz_irq_exit(); 343 tick_nohz_irq_exit();
354#endif 344#endif
355 rcu_irq_exit(); 345 rcu_irq_exit();
356 preempt_enable_no_resched(); 346 sched_preempt_enable_no_resched();
357} 347}
358 348
359/* 349/*
@@ -385,6 +375,12 @@ void raise_softirq(unsigned int nr)
385 local_irq_restore(flags); 375 local_irq_restore(flags);
386} 376}
387 377
378void __raise_softirq_irqoff(unsigned int nr)
379{
380 trace_softirq_raise(nr);
381 or_softirq_pending(1UL << nr);
382}
383
388void open_softirq(int nr, void (*action)(struct softirq_action *)) 384void open_softirq(int nr, void (*action)(struct softirq_action *))
389{ 385{
390 softirq_vec[nr].action = action; 386 softirq_vec[nr].action = action;
@@ -744,9 +740,7 @@ static int run_ksoftirqd(void * __bind_cpu)
744 while (!kthread_should_stop()) { 740 while (!kthread_should_stop()) {
745 preempt_disable(); 741 preempt_disable();
746 if (!local_softirq_pending()) { 742 if (!local_softirq_pending()) {
747 preempt_enable_no_resched(); 743 schedule_preempt_disabled();
748 schedule();
749 preempt_disable();
750 } 744 }
751 745
752 __set_current_state(TASK_RUNNING); 746 __set_current_state(TASK_RUNNING);
@@ -761,7 +755,7 @@ static int run_ksoftirqd(void * __bind_cpu)
761 if (local_softirq_pending()) 755 if (local_softirq_pending())
762 __do_softirq(); 756 __do_softirq();
763 local_irq_enable(); 757 local_irq_enable();
764 preempt_enable_no_resched(); 758 sched_preempt_enable_no_resched();
765 cond_resched(); 759 cond_resched();
766 preempt_disable(); 760 preempt_disable();
767 rcu_note_context_switch((long)__bind_cpu); 761 rcu_note_context_switch((long)__bind_cpu);
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 84c7d96918bf..5cdd8065a3ce 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -163,7 +163,7 @@ void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
163EXPORT_SYMBOL(_raw_spin_lock_bh); 163EXPORT_SYMBOL(_raw_spin_lock_bh);
164#endif 164#endif
165 165
166#ifndef CONFIG_INLINE_SPIN_UNLOCK 166#ifdef CONFIG_UNINLINE_SPIN_UNLOCK
167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) 167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
168{ 168{
169 __raw_spin_unlock(lock); 169 __raw_spin_unlock(lock);
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 0febf61e1aa3..ba35f3a4a1f4 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -172,6 +172,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
172{ 172{
173 int idx; 173 int idx;
174 174
175 rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
176 !lock_is_held(&rcu_bh_lock_map) &&
177 !lock_is_held(&rcu_lock_map) &&
178 !lock_is_held(&rcu_sched_lock_map),
179 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
180
175 idx = sp->completed; 181 idx = sp->completed;
176 mutex_lock(&sp->mutex); 182 mutex_lock(&sp->mutex);
177 183
@@ -280,19 +286,26 @@ void synchronize_srcu(struct srcu_struct *sp)
280EXPORT_SYMBOL_GPL(synchronize_srcu); 286EXPORT_SYMBOL_GPL(synchronize_srcu);
281 287
282/** 288/**
283 * synchronize_srcu_expedited - like synchronize_srcu, but less patient 289 * synchronize_srcu_expedited - Brute-force SRCU grace period
284 * @sp: srcu_struct with which to synchronize. 290 * @sp: srcu_struct with which to synchronize.
285 * 291 *
286 * Flip the completed counter, and wait for the old count to drain to zero. 292 * Wait for an SRCU grace period to elapse, but use a "big hammer"
287 * As with classic RCU, the updater must use some separate means of 293 * approach to force the grace period to end quickly. This consumes
288 * synchronizing concurrent updates. Can block; must be called from 294 * significant time on all CPUs and is unfriendly to real-time workloads,
289 * process context. 295 * so is thus not recommended for any sort of common-case code. In fact,
296 * if you are using synchronize_srcu_expedited() in a loop, please
297 * restructure your code to batch your updates, and then use a single
298 * synchronize_srcu() instead.
290 * 299 *
291 * Note that it is illegal to call synchronize_srcu_expedited() 300 * Note that it is illegal to call this function while holding any lock
292 * from the corresponding SRCU read-side critical section; doing so 301 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
293 * will result in deadlock. However, it is perfectly legal to call 302 * to call this function from a CPU-hotplug notifier. Failing to observe
294 * synchronize_srcu_expedited() on one srcu_struct from some other 303 * these restriction will result in deadlock. It is also illegal to call
295 * srcu_struct's read-side critical section. 304 * synchronize_srcu_expedited() from the corresponding SRCU read-side
305 * critical section; doing so will result in deadlock. However, it is
306 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
307 * from some other srcu_struct's read-side critical section, as long as
308 * the resulting graph of srcu_structs is acyclic.
296 */ 309 */
297void synchronize_srcu_expedited(struct srcu_struct *sp) 310void synchronize_srcu_expedited(struct srcu_struct *sp)
298{ 311{
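The new rcu_lockdep_assert() above makes the long-standing rule explicit: synchronize_srcu() must not be called from within a same-type SRCU (or any RCU) read-side critical section. A hedged sketch of the intended reader/updater pairing, with made-up names and data; init_srcu_struct()/kzalloc() error handling and module teardown are omitted:

	#include <linux/module.h>
	#include <linux/srcu.h>
	#include <linux/slab.h>
	#include <linux/rcupdate.h>

	struct config { int value; };

	static struct srcu_struct cfg_srcu;
	static struct config __rcu *cur_cfg;

	static int read_value(void)
	{
		struct config *c;
		int idx, val;

		idx = srcu_read_lock(&cfg_srcu);	/* SRCU readers may sleep */
		c = srcu_dereference(cur_cfg, &cfg_srcu);
		val = c ? c->value : -1;
		srcu_read_unlock(&cfg_srcu, idx);
		return val;
	}

	static void replace_config(struct config *new)
	{
		struct config *old;

		old = rcu_dereference_protected(cur_cfg, 1);	/* update side */
		rcu_assign_pointer(cur_cfg, new);
		/* Wait for pre-existing readers; with the assert above, calling
		 * this from inside read_value() now complains under lockdep. */
		synchronize_srcu(&cfg_srcu);
		kfree(old);
	}

	static int __init my_init(void)
	{
		init_srcu_struct(&cfg_srcu);
		replace_config(kzalloc(sizeof(struct config), GFP_KERNEL));
		return 0;
	}
	module_init(my_init);
	MODULE_LICENSE("GPL");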
diff --git a/kernel/sys.c b/kernel/sys.c
index 40701538fbd1..e7006eb6c1e4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -444,6 +444,15 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
444 magic2 != LINUX_REBOOT_MAGIC2C)) 444 magic2 != LINUX_REBOOT_MAGIC2C))
445 return -EINVAL; 445 return -EINVAL;
446 446
447 /*
448 * If pid namespaces are enabled and the current task is in a child
449 * pid_namespace, the command is handled by reboot_pid_ns() which will
450 * call do_exit().
451 */
452 ret = reboot_pid_ns(task_active_pid_ns(current), cmd);
453 if (ret)
454 return ret;
455
447 /* Instead of trying to make the power_off code look like 456 /* Instead of trying to make the power_off code look like
448 * halt when pm_power_off is not set do it the easy way. 457 * halt when pm_power_off is not set do it the easy way.
449 */ 458 */
@@ -1706,7 +1715,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
1706 if (arg4 | arg5) 1715 if (arg4 | arg5)
1707 return -EINVAL; 1716 return -EINVAL;
1708 1717
1709 if (!capable(CAP_SYS_ADMIN)) 1718 if (!capable(CAP_SYS_RESOURCE))
1710 return -EPERM; 1719 return -EPERM;
1711 1720
1712 if (addr >= TASK_SIZE) 1721 if (addr >= TASK_SIZE)
@@ -1962,6 +1971,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1962 case PR_SET_MM: 1971 case PR_SET_MM:
1963 error = prctl_set_mm(arg2, arg3, arg4, arg5); 1972 error = prctl_set_mm(arg2, arg3, arg4, arg5);
1964 break; 1973 break;
1974 case PR_SET_CHILD_SUBREAPER:
1975 me->signal->is_child_subreaper = !!arg2;
1976 error = 0;
1977 break;
1978 case PR_GET_CHILD_SUBREAPER:
1979 error = put_user(me->signal->is_child_subreaper,
1980 (int __user *) arg2);
1981 break;
1965 default: 1982 default:
1966 error = -EINVAL; 1983 error = -EINVAL;
1967 break; 1984 break;
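The two new prctl(2) options above let a service manager mark itself as a "child subreaper", so orphaned descendants are reparented to it instead of to init and it receives their exit notifications. A user-space sketch; the fallback constants mirror the values added to include/linux/prctl.h by this series and are only there in case the build headers predate it:

	#include <stdio.h>
	#include <sys/prctl.h>

	#ifndef PR_SET_CHILD_SUBREAPER
	#define PR_SET_CHILD_SUBREAPER	36
	#define PR_GET_CHILD_SUBREAPER	37
	#endif

	int main(void)
	{
		int flag = -1;

		if (prctl(PR_SET_CHILD_SUBREAPER, 1L, 0L, 0L, 0L))
			perror("PR_SET_CHILD_SUBREAPER");

		/* PR_GET_CHILD_SUBREAPER writes the current setting through arg2. */
		if (prctl(PR_GET_CHILD_SUBREAPER, (unsigned long)&flag, 0L, 0L, 0L))
			perror("PR_GET_CHILD_SUBREAPER");
		else
			printf("child subreaper: %d\n", flag);

		/* From here on, orphaned descendants are reparented to this
		 * process rather than to init. */
		return 0;
	}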
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f487f257e05e..52b3a06a02f8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,6 +23,7 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/bitmap.h>
26#include <linux/signal.h> 27#include <linux/signal.h>
27#include <linux/printk.h> 28#include <linux/printk.h>
28#include <linux/proc_fs.h> 29#include <linux/proc_fs.h>
@@ -58,6 +59,7 @@
58#include <linux/oom.h> 59#include <linux/oom.h>
59#include <linux/kmod.h> 60#include <linux/kmod.h>
60#include <linux/capability.h> 61#include <linux/capability.h>
62#include <linux/binfmts.h>
61 63
62#include <asm/uaccess.h> 64#include <asm/uaccess.h>
63#include <asm/processor.h> 65#include <asm/processor.h>
@@ -67,6 +69,9 @@
67#include <asm/stacktrace.h> 69#include <asm/stacktrace.h>
68#include <asm/io.h> 70#include <asm/io.h>
69#endif 71#endif
72#ifdef CONFIG_SPARC
73#include <asm/setup.h>
74#endif
70#ifdef CONFIG_BSD_PROCESS_ACCT 75#ifdef CONFIG_BSD_PROCESS_ACCT
71#include <linux/acct.h> 76#include <linux/acct.h>
72#endif 77#endif
@@ -141,7 +146,6 @@ static const int cap_last_cap = CAP_LAST_CAP;
141#include <linux/inotify.h> 146#include <linux/inotify.h>
142#endif 147#endif
143#ifdef CONFIG_SPARC 148#ifdef CONFIG_SPARC
144#include <asm/system.h>
145#endif 149#endif
146 150
147#ifdef CONFIG_SPARC64 151#ifdef CONFIG_SPARC64
@@ -192,20 +196,6 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,
192 196
193#endif 197#endif
194 198
195static struct ctl_table root_table[];
196static struct ctl_table_root sysctl_table_root;
197static struct ctl_table_header root_table_header = {
198 {{.count = 1,
199 .ctl_table = root_table,
200 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
201 .root = &sysctl_table_root,
202 .set = &sysctl_table_root.default_set,
203};
204static struct ctl_table_root sysctl_table_root = {
205 .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
206 .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
207};
208
209static struct ctl_table kern_table[]; 199static struct ctl_table kern_table[];
210static struct ctl_table vm_table[]; 200static struct ctl_table vm_table[];
211static struct ctl_table fs_table[]; 201static struct ctl_table fs_table[];
@@ -222,7 +212,7 @@ int sysctl_legacy_va_layout;
222 212
223/* The default sysctl tables: */ 213/* The default sysctl tables: */
224 214
225static struct ctl_table root_table[] = { 215static struct ctl_table sysctl_base_table[] = {
226 { 216 {
227 .procname = "kernel", 217 .procname = "kernel",
228 .mode = 0555, 218 .mode = 0555,
@@ -1559,490 +1549,12 @@ static struct ctl_table dev_table[] = {
1559 { } 1549 { }
1560}; 1550};
1561 1551
1562static DEFINE_SPINLOCK(sysctl_lock); 1552int __init sysctl_init(void)
1563
1564/* called under sysctl_lock */
1565static int use_table(struct ctl_table_header *p)
1566{ 1553{
1567 if (unlikely(p->unregistering)) 1554 register_sysctl_table(sysctl_base_table);
1568 return 0;
1569 p->used++;
1570 return 1;
1571}
1572
1573/* called under sysctl_lock */
1574static void unuse_table(struct ctl_table_header *p)
1575{
1576 if (!--p->used)
1577 if (unlikely(p->unregistering))
1578 complete(p->unregistering);
1579}
1580
1581/* called under sysctl_lock, will reacquire if has to wait */
1582static void start_unregistering(struct ctl_table_header *p)
1583{
1584 /*
1585 * if p->used is 0, nobody will ever touch that entry again;
1586 * we'll eliminate all paths to it before dropping sysctl_lock
1587 */
1588 if (unlikely(p->used)) {
1589 struct completion wait;
1590 init_completion(&wait);
1591 p->unregistering = &wait;
1592 spin_unlock(&sysctl_lock);
1593 wait_for_completion(&wait);
1594 spin_lock(&sysctl_lock);
1595 } else {
1596 /* anything non-NULL; we'll never dereference it */
1597 p->unregistering = ERR_PTR(-EINVAL);
1598 }
1599 /*
1600 * do not remove from the list until nobody holds it; walking the
1601 * list in do_sysctl() relies on that.
1602 */
1603 list_del_init(&p->ctl_entry);
1604}
1605
1606void sysctl_head_get(struct ctl_table_header *head)
1607{
1608 spin_lock(&sysctl_lock);
1609 head->count++;
1610 spin_unlock(&sysctl_lock);
1611}
1612
1613void sysctl_head_put(struct ctl_table_header *head)
1614{
1615 spin_lock(&sysctl_lock);
1616 if (!--head->count)
1617 kfree_rcu(head, rcu);
1618 spin_unlock(&sysctl_lock);
1619}
1620
1621struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
1622{
1623 if (!head)
1624 BUG();
1625 spin_lock(&sysctl_lock);
1626 if (!use_table(head))
1627 head = ERR_PTR(-ENOENT);
1628 spin_unlock(&sysctl_lock);
1629 return head;
1630}
1631
1632void sysctl_head_finish(struct ctl_table_header *head)
1633{
1634 if (!head)
1635 return;
1636 spin_lock(&sysctl_lock);
1637 unuse_table(head);
1638 spin_unlock(&sysctl_lock);
1639}
1640
1641static struct ctl_table_set *
1642lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
1643{
1644 struct ctl_table_set *set = &root->default_set;
1645 if (root->lookup)
1646 set = root->lookup(root, namespaces);
1647 return set;
1648}
1649
1650static struct list_head *
1651lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
1652{
1653 struct ctl_table_set *set = lookup_header_set(root, namespaces);
1654 return &set->list;
1655}
1656
1657struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
1658 struct ctl_table_header *prev)
1659{
1660 struct ctl_table_root *root;
1661 struct list_head *header_list;
1662 struct ctl_table_header *head;
1663 struct list_head *tmp;
1664
1665 spin_lock(&sysctl_lock);
1666 if (prev) {
1667 head = prev;
1668 tmp = &prev->ctl_entry;
1669 unuse_table(prev);
1670 goto next;
1671 }
1672 tmp = &root_table_header.ctl_entry;
1673 for (;;) {
1674 head = list_entry(tmp, struct ctl_table_header, ctl_entry);
1675
1676 if (!use_table(head))
1677 goto next;
1678 spin_unlock(&sysctl_lock);
1679 return head;
1680 next:
1681 root = head->root;
1682 tmp = tmp->next;
1683 header_list = lookup_header_list(root, namespaces);
1684 if (tmp != header_list)
1685 continue;
1686
1687 do {
1688 root = list_entry(root->root_list.next,
1689 struct ctl_table_root, root_list);
1690 if (root == &sysctl_table_root)
1691 goto out;
1692 header_list = lookup_header_list(root, namespaces);
1693 } while (list_empty(header_list));
1694 tmp = header_list->next;
1695 }
1696out:
1697 spin_unlock(&sysctl_lock);
1698 return NULL;
1699}
1700
1701struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
1702{
1703 return __sysctl_head_next(current->nsproxy, prev);
1704}
1705
1706void register_sysctl_root(struct ctl_table_root *root)
1707{
1708 spin_lock(&sysctl_lock);
1709 list_add_tail(&root->root_list, &sysctl_table_root.root_list);
1710 spin_unlock(&sysctl_lock);
1711}
1712
1713/*
1714 * sysctl_perm does NOT grant the superuser all rights automatically, because
1715 * some sysctl variables are readonly even to root.
1716 */
1717
1718static int test_perm(int mode, int op)
1719{
1720 if (!current_euid())
1721 mode >>= 6;
1722 else if (in_egroup_p(0))
1723 mode >>= 3;
1724 if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
1725 return 0;
1726 return -EACCES;
1727}
1728
1729int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1730{
1731 int mode;
1732
1733 if (root->permissions)
1734 mode = root->permissions(root, current->nsproxy, table);
1735 else
1736 mode = table->mode;
1737
1738 return test_perm(mode, op);
1739}
1740
1741static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1742{
1743 for (; table->procname; table++) {
1744 table->parent = parent;
1745 if (table->child)
1746 sysctl_set_parent(table, table->child);
1747 }
1748}
1749
1750static __init int sysctl_init(void)
1751{
1752 sysctl_set_parent(NULL, root_table);
1753#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1754 sysctl_check_table(current->nsproxy, root_table);
1755#endif
1756 return 0; 1555 return 0;
1757} 1556}
1758 1557
1759core_initcall(sysctl_init);
1760
1761static struct ctl_table *is_branch_in(struct ctl_table *branch,
1762 struct ctl_table *table)
1763{
1764 struct ctl_table *p;
1765 const char *s = branch->procname;
1766
1767 /* branch should have named subdirectory as its first element */
1768 if (!s || !branch->child)
1769 return NULL;
1770
1771 /* ... and nothing else */
1772 if (branch[1].procname)
1773 return NULL;
1774
1775 /* table should contain subdirectory with the same name */
1776 for (p = table; p->procname; p++) {
1777 if (!p->child)
1778 continue;
1779 if (p->procname && strcmp(p->procname, s) == 0)
1780 return p;
1781 }
1782 return NULL;
1783}
1784
1785/* see if attaching q to p would be an improvement */
1786static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
1787{
1788 struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
1789 struct ctl_table *next;
1790 int is_better = 0;
1791 int not_in_parent = !p->attached_by;
1792
1793 while ((next = is_branch_in(by, to)) != NULL) {
1794 if (by == q->attached_by)
1795 is_better = 1;
1796 if (to == p->attached_by)
1797 not_in_parent = 1;
1798 by = by->child;
1799 to = next->child;
1800 }
1801
1802 if (is_better && not_in_parent) {
1803 q->attached_by = by;
1804 q->attached_to = to;
1805 q->parent = p;
1806 }
1807}
1808
1809/**
1810 * __register_sysctl_paths - register a sysctl hierarchy
1811 * @root: List of sysctl headers to register on
1812 * @namespaces: Data to compute which lists of sysctl entries are visible
1813 * @path: The path to the directory the sysctl table is in.
1814 * @table: the top-level table structure
1815 *
1816 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1817 * array. A completely 0 filled entry terminates the table.
1818 *
1819 * The members of the &struct ctl_table structure are used as follows:
1820 *
1821 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
1822 * enter a sysctl file
1823 *
1824 * data - a pointer to data for use by proc_handler
1825 *
1826 * maxlen - the maximum size in bytes of the data
1827 *
1828 * mode - the file permissions for the /proc/sys file, and for sysctl(2)
1829 *
1830 * child - a pointer to the child sysctl table if this entry is a directory, or
1831 * %NULL.
1832 *
1833 * proc_handler - the text handler routine (described below)
1834 *
1835 * de - for internal use by the sysctl routines
1836 *
1837 * extra1, extra2 - extra pointers usable by the proc handler routines
1838 *
1839 * Leaf nodes in the sysctl tree will be represented by a single file
1840 * under /proc; non-leaf nodes will be represented by directories.
1841 *
1842 * sysctl(2) can automatically manage read and write requests through
1843 * the sysctl table. The data and maxlen fields of the ctl_table
1844 * struct enable minimal validation of the values being written to be
1845 * performed, and the mode field allows minimal authentication.
1846 *
1847 * There must be a proc_handler routine for any terminal nodes
1848 * mirrored under /proc/sys (non-terminals are handled by a built-in
1849 * directory handler). Several default handlers are available to
1850 * cover common cases -
1851 *
1852 * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
1853 * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
1854 * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
1855 *
1856 * It is the handler's job to read the input buffer from user memory
1857 * and process it. The handler should return 0 on success.
1858 *
1859 * This routine returns %NULL on a failure to register, and a pointer
1860 * to the table header on success.
1861 */
1862struct ctl_table_header *__register_sysctl_paths(
1863 struct ctl_table_root *root,
1864 struct nsproxy *namespaces,
1865 const struct ctl_path *path, struct ctl_table *table)
1866{
1867 struct ctl_table_header *header;
1868 struct ctl_table *new, **prevp;
1869 unsigned int n, npath;
1870 struct ctl_table_set *set;
1871
1872 /* Count the path components */
1873 for (npath = 0; path[npath].procname; ++npath)
1874 ;
1875
1876 /*
1877 * For each path component, allocate a 2-element ctl_table array.
1878 * The first array element will be filled with the sysctl entry
1879 * for this, the second will be the sentinel (procname == 0).
1880 *
1881 * We allocate everything in one go so that we don't have to
1882 * worry about freeing additional memory in unregister_sysctl_table.
1883 */
1884 header = kzalloc(sizeof(struct ctl_table_header) +
1885 (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
1886 if (!header)
1887 return NULL;
1888
1889 new = (struct ctl_table *) (header + 1);
1890
1891 /* Now connect the dots */
1892 prevp = &header->ctl_table;
1893 for (n = 0; n < npath; ++n, ++path) {
1894 /* Copy the procname */
1895 new->procname = path->procname;
1896 new->mode = 0555;
1897
1898 *prevp = new;
1899 prevp = &new->child;
1900
1901 new += 2;
1902 }
1903 *prevp = table;
1904 header->ctl_table_arg = table;
1905
1906 INIT_LIST_HEAD(&header->ctl_entry);
1907 header->used = 0;
1908 header->unregistering = NULL;
1909 header->root = root;
1910 sysctl_set_parent(NULL, header->ctl_table);
1911 header->count = 1;
1912#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1913 if (sysctl_check_table(namespaces, header->ctl_table)) {
1914 kfree(header);
1915 return NULL;
1916 }
1917#endif
1918 spin_lock(&sysctl_lock);
1919 header->set = lookup_header_set(root, namespaces);
1920 header->attached_by = header->ctl_table;
1921 header->attached_to = root_table;
1922 header->parent = &root_table_header;
1923 for (set = header->set; set; set = set->parent) {
1924 struct ctl_table_header *p;
1925 list_for_each_entry(p, &set->list, ctl_entry) {
1926 if (p->unregistering)
1927 continue;
1928 try_attach(p, header);
1929 }
1930 }
1931 header->parent->count++;
1932 list_add_tail(&header->ctl_entry, &header->set->list);
1933 spin_unlock(&sysctl_lock);
1934
1935 return header;
1936}
1937
1938/**
1939 * register_sysctl_table_path - register a sysctl table hierarchy
1940 * @path: The path to the directory the sysctl table is in.
1941 * @table: the top-level table structure
1942 *
1943 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1944 * array. A completely 0 filled entry terminates the table.
1945 *
1946 * See __register_sysctl_paths for more details.
1947 */
1948struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
1949 struct ctl_table *table)
1950{
1951 return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
1952 path, table);
1953}
1954
1955/**
1956 * register_sysctl_table - register a sysctl table hierarchy
1957 * @table: the top-level table structure
1958 *
1959 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1960 * array. A completely 0 filled entry terminates the table.
1961 *
1962 * See register_sysctl_paths for more details.
1963 */
1964struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
1965{
1966 static const struct ctl_path null_path[] = { {} };
1967
1968 return register_sysctl_paths(null_path, table);
1969}
1970
1971/**
1972 * unregister_sysctl_table - unregister a sysctl table hierarchy
1973 * @header: the header returned from register_sysctl_table
1974 *
1975 * Unregisters the sysctl table and all children. proc entries may not
1976 * actually be removed until they are no longer used by anyone.
1977 */
1978void unregister_sysctl_table(struct ctl_table_header * header)
1979{
1980 might_sleep();
1981
1982 if (header == NULL)
1983 return;
1984
1985 spin_lock(&sysctl_lock);
1986 start_unregistering(header);
1987 if (!--header->parent->count) {
1988 WARN_ON(1);
1989 kfree_rcu(header->parent, rcu);
1990 }
1991 if (!--header->count)
1992 kfree_rcu(header, rcu);
1993 spin_unlock(&sysctl_lock);
1994}
1995
1996int sysctl_is_seen(struct ctl_table_header *p)
1997{
1998 struct ctl_table_set *set = p->set;
1999 int res;
2000 spin_lock(&sysctl_lock);
2001 if (p->unregistering)
2002 res = 0;
2003 else if (!set->is_seen)
2004 res = 1;
2005 else
2006 res = set->is_seen(set);
2007 spin_unlock(&sysctl_lock);
2008 return res;
2009}
2010
2011void setup_sysctl_set(struct ctl_table_set *p,
2012 struct ctl_table_set *parent,
2013 int (*is_seen)(struct ctl_table_set *))
2014{
2015 INIT_LIST_HEAD(&p->list);
2016 p->parent = parent ? parent : &sysctl_table_root.default_set;
2017 p->is_seen = is_seen;
2018}
2019
2020#else /* !CONFIG_SYSCTL */
2021struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
2022{
2023 return NULL;
2024}
2025
2026struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
2027 struct ctl_table *table)
2028{
2029 return NULL;
2030}
2031
2032void unregister_sysctl_table(struct ctl_table_header * table)
2033{
2034}
2035
2036void setup_sysctl_set(struct ctl_table_set *p,
2037 struct ctl_table_set *parent,
2038 int (*is_seen)(struct ctl_table_set *))
2039{
2040}
2041
2042void sysctl_head_put(struct ctl_table_header *head)
2043{
2044}
2045
2046#endif /* CONFIG_SYSCTL */ 1558#endif /* CONFIG_SYSCTL */
2047 1559
2048/* 1560/*
@@ -2884,9 +2396,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2884 } 2396 }
2885 } 2397 }
2886 2398
2887 while (val_a <= val_b) 2399 bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
2888 set_bit(val_a++, tmp_bitmap);
2889
2890 first = 0; 2400 first = 0;
2891 proc_skip_char(&kbuf, &left, '\n'); 2401 proc_skip_char(&kbuf, &left, '\n');
2892 } 2402 }
@@ -2929,8 +2439,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2929 if (*ppos) 2439 if (*ppos)
2930 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); 2440 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
2931 else 2441 else
2932 memcpy(bitmap, tmp_bitmap, 2442 bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
2933 BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long));
2934 } 2443 }
2935 kfree(tmp_bitmap); 2444 kfree(tmp_bitmap);
2936 *lenp -= left; 2445 *lenp -= left;
@@ -3008,6 +2517,3 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
3008EXPORT_SYMBOL(proc_dostring); 2517EXPORT_SYMBOL(proc_dostring);
3009EXPORT_SYMBOL(proc_doulongvec_minmax); 2518EXPORT_SYMBOL(proc_doulongvec_minmax);
3010EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); 2519EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
3011EXPORT_SYMBOL(register_sysctl_table);
3012EXPORT_SYMBOL(register_sysctl_paths);
3013EXPORT_SYMBOL(unregister_sysctl_table);
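The registration machinery removed above moves out of kernel/sysctl.c (into fs/proc/proc_sysctl.c in this series); kernel/sysctl.c now only registers its base table from sysctl_init(). The calling convention for other users is unchanged: fill a zero-terminated ctl_table, optionally a ctl_path, and register it. A module-style sketch with hypothetical names ("my_example", "my_knob"):

	#include <linux/module.h>
	#include <linux/sysctl.h>

	static int my_knob;

	static struct ctl_table my_table[] = {
		{
			.procname	= "my_knob",	/* /proc/sys/kernel/my_example/my_knob */
			.data		= &my_knob,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec,
		},
		{ }		/* zero-filled sentinel terminates the table */
	};

	static const struct ctl_path my_path[] = {
		{ .procname = "kernel" },
		{ .procname = "my_example" },
		{ }
	};

	static struct ctl_table_header *my_header;

	static int __init my_init(void)
	{
		my_header = register_sysctl_paths(my_path, my_table);
		return my_header ? 0 : -ENOMEM;
	}

	static void __exit my_exit(void)
	{
		unregister_sysctl_table(my_header);
	}

	module_init(my_init);
	module_exit(my_exit);
	MODULE_LICENSE("GPL");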
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
deleted file mode 100644
index 362da653813d..000000000000
--- a/kernel/sysctl_check.c
+++ /dev/null
@@ -1,160 +0,0 @@
1#include <linux/stat.h>
2#include <linux/sysctl.h>
3#include "../fs/xfs/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h>
5#include <linux/string.h>
6#include <net/ip_vs.h>
7
8
9static int sysctl_depth(struct ctl_table *table)
10{
11 struct ctl_table *tmp;
12 int depth;
13
14 depth = 0;
15 for (tmp = table; tmp->parent; tmp = tmp->parent)
16 depth++;
17
18 return depth;
19}
20
21static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
22{
23 int i;
24
25 for (i = 0; table && i < n; i++)
26 table = table->parent;
27
28 return table;
29}
30
31
32static void sysctl_print_path(struct ctl_table *table)
33{
34 struct ctl_table *tmp;
35 int depth, i;
36 depth = sysctl_depth(table);
37 if (table->procname) {
38 for (i = depth; i >= 0; i--) {
39 tmp = sysctl_parent(table, i);
40 printk("/%s", tmp->procname?tmp->procname:"");
41 }
42 }
43 printk(" ");
44}
45
46static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
47 struct ctl_table *table)
48{
49 struct ctl_table_header *head;
50 struct ctl_table *ref, *test;
51 int depth, cur_depth;
52
53 depth = sysctl_depth(table);
54
55 for (head = __sysctl_head_next(namespaces, NULL); head;
56 head = __sysctl_head_next(namespaces, head)) {
57 cur_depth = depth;
58 ref = head->ctl_table;
59repeat:
60 test = sysctl_parent(table, cur_depth);
61 for (; ref->procname; ref++) {
62 int match = 0;
63 if (cur_depth && !ref->child)
64 continue;
65
66 if (test->procname && ref->procname &&
67 (strcmp(test->procname, ref->procname) == 0))
68 match++;
69
70 if (match) {
71 if (cur_depth != 0) {
72 cur_depth--;
73 ref = ref->child;
74 goto repeat;
75 }
76 goto out;
77 }
78 }
79 }
80 ref = NULL;
81out:
82 sysctl_head_finish(head);
83 return ref;
84}
85
86static void set_fail(const char **fail, struct ctl_table *table, const char *str)
87{
88 if (*fail) {
89 printk(KERN_ERR "sysctl table check failed: ");
90 sysctl_print_path(table);
91 printk(" %s\n", *fail);
92 dump_stack();
93 }
94 *fail = str;
95}
96
97static void sysctl_check_leaf(struct nsproxy *namespaces,
98 struct ctl_table *table, const char **fail)
99{
100 struct ctl_table *ref;
101
102 ref = sysctl_check_lookup(namespaces, table);
103 if (ref && (ref != table))
104 set_fail(fail, table, "Sysctl already exists");
105}
106
107int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
108{
109 int error = 0;
110 for (; table->procname; table++) {
111 const char *fail = NULL;
112
113 if (table->parent) {
114 if (!table->parent->procname)
115 set_fail(&fail, table, "Parent without procname");
116 }
117 if (table->child) {
118 if (table->data)
119 set_fail(&fail, table, "Directory with data?");
120 if (table->maxlen)
121 set_fail(&fail, table, "Directory with maxlen?");
122 if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode)
123 set_fail(&fail, table, "Writable sysctl directory");
124 if (table->proc_handler)
125 set_fail(&fail, table, "Directory with proc_handler");
126 if (table->extra1)
127 set_fail(&fail, table, "Directory with extra1");
128 if (table->extra2)
129 set_fail(&fail, table, "Directory with extra2");
130 } else {
131 if ((table->proc_handler == proc_dostring) ||
132 (table->proc_handler == proc_dointvec) ||
133 (table->proc_handler == proc_dointvec_minmax) ||
134 (table->proc_handler == proc_dointvec_jiffies) ||
135 (table->proc_handler == proc_dointvec_userhz_jiffies) ||
136 (table->proc_handler == proc_dointvec_ms_jiffies) ||
137 (table->proc_handler == proc_doulongvec_minmax) ||
138 (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
139 if (!table->data)
140 set_fail(&fail, table, "No data");
141 if (!table->maxlen)
142 set_fail(&fail, table, "No maxlen");
143 }
144#ifdef CONFIG_PROC_SYSCTL
145 if (!table->proc_handler)
146 set_fail(&fail, table, "No proc_handler");
147#endif
148 sysctl_check_leaf(namespaces, table, &fail);
149 }
150 if (table->mode > 0777)
151 set_fail(&fail, table, "bogus .mode");
152 if (fail) {
153 set_fail(&fail, table, NULL);
154 error = -EINVAL;
155 }
156 if (table->child)
157 error |= sysctl_check_table(namespaces, table->child);
158 }
159 return error;
160}
diff --git a/kernel/time.c b/kernel/time.c
index 73e416db0a1e..ba744cf80696 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -163,7 +163,6 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
163 return error; 163 return error;
164 164
165 if (tz) { 165 if (tz) {
166 /* SMP safe, global irq locking makes it work. */
167 sys_tz = *tz; 166 sys_tz = *tz;
168 update_vsyscall_tz(); 167 update_vsyscall_tz();
169 if (firsttime) { 168 if (firsttime) {
@@ -173,12 +172,7 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
173 } 172 }
174 } 173 }
175 if (tv) 174 if (tv)
176 {
177 /* SMP safe, again the code in arch/foo/time.c should
178 * globally block out interrupts when it runs.
179 */
180 return do_settimeofday(tv); 175 return do_settimeofday(tv);
181 }
182 return 0; 176 return 0;
183} 177}
184 178
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 8a46f5d64504..8a538c55fc7b 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -96,6 +96,11 @@ static int alarmtimer_rtc_add_device(struct device *dev,
96 return 0; 96 return 0;
97} 97}
98 98
99static inline void alarmtimer_rtc_timer_init(void)
100{
101 rtc_timer_init(&rtctimer, NULL, NULL);
102}
103
99static struct class_interface alarmtimer_rtc_interface = { 104static struct class_interface alarmtimer_rtc_interface = {
100 .add_dev = &alarmtimer_rtc_add_device, 105 .add_dev = &alarmtimer_rtc_add_device,
101}; 106};
@@ -117,6 +122,7 @@ static inline struct rtc_device *alarmtimer_get_rtcdev(void)
117#define rtcdev (NULL) 122#define rtcdev (NULL)
118static inline int alarmtimer_rtc_interface_setup(void) { return 0; } 123static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
119static inline void alarmtimer_rtc_interface_remove(void) { } 124static inline void alarmtimer_rtc_interface_remove(void) { }
125static inline void alarmtimer_rtc_timer_init(void) { }
120#endif 126#endif
121 127
122/** 128/**
@@ -783,6 +789,8 @@ static int __init alarmtimer_init(void)
783 .nsleep = alarm_timer_nsleep, 789 .nsleep = alarm_timer_nsleep,
784 }; 790 };
785 791
792 alarmtimer_rtc_timer_init();
793
786 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); 794 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
787 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); 795 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
788 796
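
The alarmtimer.c hunk adds alarmtimer_rtc_timer_init() as a real inline in the RTC-enabled branch and an empty inline stub in the #else branch, so alarmtimer_init() can call it unconditionally. Below is a minimal standalone sketch of that config-stub pattern; HAVE_FAKE_RTC and the fake_rtc_* names are invented for illustration and do not correspond to any kernel symbol.

/*
 * Config-stub pattern: a real inline helper when the feature is built in,
 * an empty inline otherwise, so callers need no #ifdef of their own.
 */
#include <stdio.h>

#define HAVE_FAKE_RTC 1

#if HAVE_FAKE_RTC
static inline void fake_rtc_timer_init(void)
{
        /* stands in for rtc_timer_init(&rtctimer, NULL, NULL) */
        puts("rtc timer prepared");
}
#else
static inline void fake_rtc_timer_init(void) { }        /* compiles away */
#endif

static int subsystem_init(void)
{
        /* Unconditional call; the empty stub keeps this path branch-free. */
        fake_rtc_timer_init();
        puts("subsystem initialised");
        return 0;
}

int main(void)
{
        return subsystem_init();
}
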
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index a45ca167ab24..c9583382141a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -500,7 +500,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
500{ 500{
501 u64 ret; 501 u64 ret;
502 /* 502 /*
503 * We won't try to correct for more then 11% adjustments (110,000 ppm), 503 * We won't try to correct for more than 11% adjustments (110,000 ppm),
504 */ 504 */
505 ret = (u64)cs->mult * 11; 505 ret = (u64)cs->mult * 11;
506 do_div(ret,100); 506 do_div(ret,100);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index f6117a4c7cb8..f03fd83b170b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -22,17 +22,18 @@
22 * NTP timekeeping variables: 22 * NTP timekeeping variables:
23 */ 23 */
24 24
25DEFINE_SPINLOCK(ntp_lock);
26
27
25/* USER_HZ period (usecs): */ 28/* USER_HZ period (usecs): */
26unsigned long tick_usec = TICK_USEC; 29unsigned long tick_usec = TICK_USEC;
27 30
28/* ACTHZ period (nsecs): */ 31/* ACTHZ period (nsecs): */
29unsigned long tick_nsec; 32unsigned long tick_nsec;
30 33
31u64 tick_length; 34static u64 tick_length;
32static u64 tick_length_base; 35static u64 tick_length_base;
33 36
34static struct hrtimer leap_timer;
35
36#define MAX_TICKADJ 500LL /* usecs */ 37#define MAX_TICKADJ 500LL /* usecs */
37#define MAX_TICKADJ_SCALED \ 38#define MAX_TICKADJ_SCALED \
38 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) 39 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
@@ -49,7 +50,7 @@ static struct hrtimer leap_timer;
49static int time_state = TIME_OK; 50static int time_state = TIME_OK;
50 51
51/* clock status bits: */ 52/* clock status bits: */
52int time_status = STA_UNSYNC; 53static int time_status = STA_UNSYNC;
53 54
54/* TAI offset (secs): */ 55/* TAI offset (secs): */
55static long time_tai; 56static long time_tai;
@@ -133,7 +134,7 @@ static inline void pps_reset_freq_interval(void)
133/** 134/**
134 * pps_clear - Clears the PPS state variables 135 * pps_clear - Clears the PPS state variables
135 * 136 *
136 * Must be called while holding a write on the xtime_lock 137 * Must be called while holding a write on the ntp_lock
137 */ 138 */
138static inline void pps_clear(void) 139static inline void pps_clear(void)
139{ 140{
@@ -149,7 +150,7 @@ static inline void pps_clear(void)
149 * the last PPS signal. When it reaches 0, indicate that PPS signal is 150 * the last PPS signal. When it reaches 0, indicate that PPS signal is
150 * missing. 151 * missing.
151 * 152 *
152 * Must be called while holding a write on the xtime_lock 153 * Must be called while holding a write on the ntp_lock
153 */ 154 */
154static inline void pps_dec_valid(void) 155static inline void pps_dec_valid(void)
155{ 156{
@@ -233,6 +234,17 @@ static inline void pps_fill_timex(struct timex *txc)
233 234
234#endif /* CONFIG_NTP_PPS */ 235#endif /* CONFIG_NTP_PPS */
235 236
237
238/**
239 * ntp_synced - Returns 1 if the NTP status is not UNSYNC
240 *
241 */
242static inline int ntp_synced(void)
243{
244 return !(time_status & STA_UNSYNC);
245}
246
247
236/* 248/*
237 * NTP methods: 249 * NTP methods:
238 */ 250 */
@@ -275,7 +287,7 @@ static inline s64 ntp_update_offset_fll(s64 offset64, long secs)
275 287
276 time_status |= STA_MODE; 288 time_status |= STA_MODE;
277 289
278 return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); 290 return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);
279} 291}
280 292
281static void ntp_update_offset(long offset) 293static void ntp_update_offset(long offset)
@@ -330,11 +342,13 @@ static void ntp_update_offset(long offset)
330 342
331/** 343/**
332 * ntp_clear - Clears the NTP state variables 344 * ntp_clear - Clears the NTP state variables
333 *
334 * Must be called while holding a write on the xtime_lock
335 */ 345 */
336void ntp_clear(void) 346void ntp_clear(void)
337{ 347{
348 unsigned long flags;
349
350 spin_lock_irqsave(&ntp_lock, flags);
351
338 time_adjust = 0; /* stop active adjtime() */ 352 time_adjust = 0; /* stop active adjtime() */
339 time_status |= STA_UNSYNC; 353 time_status |= STA_UNSYNC;
340 time_maxerror = NTP_PHASE_LIMIT; 354 time_maxerror = NTP_PHASE_LIMIT;
@@ -347,63 +361,81 @@ void ntp_clear(void)
347 361
348 /* Clear PPS state variables */ 362 /* Clear PPS state variables */
349 pps_clear(); 363 pps_clear();
364 spin_unlock_irqrestore(&ntp_lock, flags);
365
366}
367
368
369u64 ntp_tick_length(void)
370{
371 unsigned long flags;
372 s64 ret;
373
374 spin_lock_irqsave(&ntp_lock, flags);
375 ret = tick_length;
376 spin_unlock_irqrestore(&ntp_lock, flags);
377 return ret;
350} 378}
351 379
380
352/* 381/*
353 * Leap second processing. If in leap-insert state at the end of the 382 * this routine handles the overflow of the microsecond field
354 * day, the system clock is set back one second; if in leap-delete 383 *
355 * state, the system clock is set ahead one second. 384 * The tricky bits of code to handle the accurate clock support
385 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
386 * They were originally developed for SUN and DEC kernels.
387 * All the kudos should go to Dave for this stuff.
388 *
389 * Also handles leap second processing, and returns leap offset
356 */ 390 */
357static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) 391int second_overflow(unsigned long secs)
358{ 392{
359 enum hrtimer_restart res = HRTIMER_NORESTART; 393 s64 delta;
394 int leap = 0;
395 unsigned long flags;
360 396
361 write_seqlock(&xtime_lock); 397 spin_lock_irqsave(&ntp_lock, flags);
362 398
399 /*
400 * Leap second processing. If in leap-insert state at the end of the
401 * day, the system clock is set back one second; if in leap-delete
402 * state, the system clock is set ahead one second.
403 */
363 switch (time_state) { 404 switch (time_state) {
364 case TIME_OK: 405 case TIME_OK:
406 if (time_status & STA_INS)
407 time_state = TIME_INS;
408 else if (time_status & STA_DEL)
409 time_state = TIME_DEL;
365 break; 410 break;
366 case TIME_INS: 411 case TIME_INS:
367 timekeeping_leap_insert(-1); 412 if (secs % 86400 == 0) {
368 time_state = TIME_OOP; 413 leap = -1;
369 printk(KERN_NOTICE 414 time_state = TIME_OOP;
370 "Clock: inserting leap second 23:59:60 UTC\n"); 415 printk(KERN_NOTICE
371 hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); 416 "Clock: inserting leap second 23:59:60 UTC\n");
372 res = HRTIMER_RESTART; 417 }
373 break; 418 break;
374 case TIME_DEL: 419 case TIME_DEL:
375 timekeeping_leap_insert(1); 420 if ((secs + 1) % 86400 == 0) {
376 time_tai--; 421 leap = 1;
377 time_state = TIME_WAIT; 422 time_tai--;
378 printk(KERN_NOTICE 423 time_state = TIME_WAIT;
379 "Clock: deleting leap second 23:59:59 UTC\n"); 424 printk(KERN_NOTICE
425 "Clock: deleting leap second 23:59:59 UTC\n");
426 }
380 break; 427 break;
381 case TIME_OOP: 428 case TIME_OOP:
382 time_tai++; 429 time_tai++;
383 time_state = TIME_WAIT; 430 time_state = TIME_WAIT;
384 /* fall through */ 431 break;
432
385 case TIME_WAIT: 433 case TIME_WAIT:
386 if (!(time_status & (STA_INS | STA_DEL))) 434 if (!(time_status & (STA_INS | STA_DEL)))
387 time_state = TIME_OK; 435 time_state = TIME_OK;
388 break; 436 break;
389 } 437 }
390 438
391 write_sequnlock(&xtime_lock);
392
393 return res;
394}
395
396/*
397 * this routine handles the overflow of the microsecond field
398 *
399 * The tricky bits of code to handle the accurate clock support
400 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
401 * They were originally developed for SUN and DEC kernels.
402 * All the kudos should go to Dave for this stuff.
403 */
404void second_overflow(void)
405{
406 s64 delta;
407 439
408 /* Bump the maxerror field */ 440 /* Bump the maxerror field */
409 time_maxerror += MAXFREQ / NSEC_PER_USEC; 441 time_maxerror += MAXFREQ / NSEC_PER_USEC;
@@ -423,30 +455,34 @@ void second_overflow(void)
423 pps_dec_valid(); 455 pps_dec_valid();
424 456
425 if (!time_adjust) 457 if (!time_adjust)
426 return; 458 goto out;
427 459
428 if (time_adjust > MAX_TICKADJ) { 460 if (time_adjust > MAX_TICKADJ) {
429 time_adjust -= MAX_TICKADJ; 461 time_adjust -= MAX_TICKADJ;
430 tick_length += MAX_TICKADJ_SCALED; 462 tick_length += MAX_TICKADJ_SCALED;
431 return; 463 goto out;
432 } 464 }
433 465
434 if (time_adjust < -MAX_TICKADJ) { 466 if (time_adjust < -MAX_TICKADJ) {
435 time_adjust += MAX_TICKADJ; 467 time_adjust += MAX_TICKADJ;
436 tick_length -= MAX_TICKADJ_SCALED; 468 tick_length -= MAX_TICKADJ_SCALED;
437 return; 469 goto out;
438 } 470 }
439 471
440 tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) 472 tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
441 << NTP_SCALE_SHIFT; 473 << NTP_SCALE_SHIFT;
442 time_adjust = 0; 474 time_adjust = 0;
475
476
477
478out:
479 spin_unlock_irqrestore(&ntp_lock, flags);
480
481 return leap;
443} 482}
444 483
445#ifdef CONFIG_GENERIC_CMOS_UPDATE 484#ifdef CONFIG_GENERIC_CMOS_UPDATE
446 485
447/* Disable the cmos update - used by virtualization and embedded */
448int no_sync_cmos_clock __read_mostly;
449
450static void sync_cmos_clock(struct work_struct *work); 486static void sync_cmos_clock(struct work_struct *work);
451 487
452static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); 488static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
@@ -493,35 +529,13 @@ static void sync_cmos_clock(struct work_struct *work)
493 529
494static void notify_cmos_timer(void) 530static void notify_cmos_timer(void)
495{ 531{
496 if (!no_sync_cmos_clock) 532 schedule_delayed_work(&sync_cmos_work, 0);
497 schedule_delayed_work(&sync_cmos_work, 0);
498} 533}
499 534
500#else 535#else
501static inline void notify_cmos_timer(void) { } 536static inline void notify_cmos_timer(void) { }
502#endif 537#endif
503 538
504/*
505 * Start the leap seconds timer:
506 */
507static inline void ntp_start_leap_timer(struct timespec *ts)
508{
509 long now = ts->tv_sec;
510
511 if (time_status & STA_INS) {
512 time_state = TIME_INS;
513 now += 86400 - now % 86400;
514 hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
515
516 return;
517 }
518
519 if (time_status & STA_DEL) {
520 time_state = TIME_DEL;
521 now += 86400 - (now + 1) % 86400;
522 hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
523 }
524}
525 539
526/* 540/*
527 * Propagate a new txc->status value into the NTP state: 541 * Propagate a new txc->status value into the NTP state:
@@ -546,22 +560,6 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
546 time_status &= STA_RONLY; 560 time_status &= STA_RONLY;
547 time_status |= txc->status & ~STA_RONLY; 561 time_status |= txc->status & ~STA_RONLY;
548 562
549 switch (time_state) {
550 case TIME_OK:
551 ntp_start_leap_timer(ts);
552 break;
553 case TIME_INS:
554 case TIME_DEL:
555 time_state = TIME_OK;
556 ntp_start_leap_timer(ts);
557 case TIME_WAIT:
558 if (!(time_status & (STA_INS | STA_DEL)))
559 time_state = TIME_OK;
560 break;
561 case TIME_OOP:
562 hrtimer_restart(&leap_timer);
563 break;
564 }
565} 563}
566/* 564/*
567 * Called with the xtime lock held, so we can access and modify 565 * Called with the xtime lock held, so we can access and modify
@@ -643,9 +641,6 @@ int do_adjtimex(struct timex *txc)
643 (txc->tick < 900000/USER_HZ || 641 (txc->tick < 900000/USER_HZ ||
644 txc->tick > 1100000/USER_HZ)) 642 txc->tick > 1100000/USER_HZ))
645 return -EINVAL; 643 return -EINVAL;
646
647 if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
648 hrtimer_cancel(&leap_timer);
649 } 644 }
650 645
651 if (txc->modes & ADJ_SETOFFSET) { 646 if (txc->modes & ADJ_SETOFFSET) {
@@ -663,7 +658,7 @@ int do_adjtimex(struct timex *txc)
663 658
664 getnstimeofday(&ts); 659 getnstimeofday(&ts);
665 660
666 write_seqlock_irq(&xtime_lock); 661 spin_lock_irq(&ntp_lock);
667 662
668 if (txc->modes & ADJ_ADJTIME) { 663 if (txc->modes & ADJ_ADJTIME) {
669 long save_adjust = time_adjust; 664 long save_adjust = time_adjust;
@@ -705,7 +700,7 @@ int do_adjtimex(struct timex *txc)
705 /* fill PPS status fields */ 700 /* fill PPS status fields */
706 pps_fill_timex(txc); 701 pps_fill_timex(txc);
707 702
708 write_sequnlock_irq(&xtime_lock); 703 spin_unlock_irq(&ntp_lock);
709 704
710 txc->time.tv_sec = ts.tv_sec; 705 txc->time.tv_sec = ts.tv_sec;
711 txc->time.tv_usec = ts.tv_nsec; 706 txc->time.tv_usec = ts.tv_nsec;
@@ -903,7 +898,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
903 898
904 pts_norm = pps_normalize_ts(*phase_ts); 899 pts_norm = pps_normalize_ts(*phase_ts);
905 900
906 write_seqlock_irqsave(&xtime_lock, flags); 901 spin_lock_irqsave(&ntp_lock, flags);
907 902
908 /* clear the error bits, they will be set again if needed */ 903 /* clear the error bits, they will be set again if needed */
909 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); 904 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
@@ -916,7 +911,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
916 * just start the frequency interval */ 911 * just start the frequency interval */
917 if (unlikely(pps_fbase.tv_sec == 0)) { 912 if (unlikely(pps_fbase.tv_sec == 0)) {
918 pps_fbase = *raw_ts; 913 pps_fbase = *raw_ts;
919 write_sequnlock_irqrestore(&xtime_lock, flags); 914 spin_unlock_irqrestore(&ntp_lock, flags);
920 return; 915 return;
921 } 916 }
922 917
@@ -931,7 +926,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
931 time_status |= STA_PPSJITTER; 926 time_status |= STA_PPSJITTER;
932 /* restart the frequency calibration interval */ 927 /* restart the frequency calibration interval */
933 pps_fbase = *raw_ts; 928 pps_fbase = *raw_ts;
934 write_sequnlock_irqrestore(&xtime_lock, flags); 929 spin_unlock_irqrestore(&ntp_lock, flags);
935 pr_err("hardpps: PPSJITTER: bad pulse\n"); 930 pr_err("hardpps: PPSJITTER: bad pulse\n");
936 return; 931 return;
937 } 932 }
@@ -948,7 +943,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
948 943
949 hardpps_update_phase(pts_norm.nsec); 944 hardpps_update_phase(pts_norm.nsec);
950 945
951 write_sequnlock_irqrestore(&xtime_lock, flags); 946 spin_unlock_irqrestore(&ntp_lock, flags);
952} 947}
953EXPORT_SYMBOL(hardpps); 948EXPORT_SYMBOL(hardpps);
954 949
@@ -967,6 +962,4 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup);
967void __init ntp_init(void) 962void __init ntp_init(void)
968{ 963{
969 ntp_clear(); 964 ntp_clear();
970 hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
971 leap_timer.function = ntp_leap_second;
972} 965}
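
The ntp.c changes above retire the leap_timer hrtimer and fold leap-second handling into second_overflow(), which now takes the current wall-clock seconds, decides from secs % 86400 whether a leap second is due, and returns the offset (-1, 0 or +1) for the caller to apply under ntp_lock. The sketch below is a simplified standalone model of that state machine only: it mirrors the TIME_* states from the hunk but omits the locking, TAI bookkeeping and printk messages.

/*
 * Simplified model of the leap-second state machine that the new
 * second_overflow(secs) implements.  Illustration, not kernel code.
 */
#include <stdio.h>

enum { TIME_OK, TIME_INS, TIME_DEL, TIME_OOP, TIME_WAIT };

static int time_state = TIME_OK;
static int sta_ins = 1;         /* pretend STA_INS was requested via adjtimex */
static int sta_del;

/* Called once per accumulated second with the new wall-clock seconds. */
static int leap_second_overflow(unsigned long secs)
{
        int leap = 0;

        switch (time_state) {
        case TIME_OK:
                if (sta_ins)
                        time_state = TIME_INS;
                else if (sta_del)
                        time_state = TIME_DEL;
                break;
        case TIME_INS:
                if (secs % 86400 == 0) {        /* midnight UTC */
                        leap = -1;              /* insert 23:59:60 */
                        time_state = TIME_OOP;
                }
                break;
        case TIME_DEL:
                if ((secs + 1) % 86400 == 0) {
                        leap = 1;               /* skip 23:59:59 */
                        time_state = TIME_WAIT;
                }
                break;
        case TIME_OOP:
                time_state = TIME_WAIT;
                break;
        case TIME_WAIT:
                if (!sta_ins && !sta_del)
                        time_state = TIME_OK;
                break;
        }
        return leap;
}

int main(void)
{
        /* Walk across a (pretend) midnight boundary. */
        for (unsigned long secs = 86398; secs <= 86402; secs++) {
                int leap = leap_second_overflow(secs);
                if (leap)
                        printf("at %lu: apply leap offset %d\n", secs, leap);
        }
        return 0;
}

Because the decision is driven by the seconds value passed in from the timekeeping accumulation loop, no separate absolute hrtimer has to be armed at midnight, which is exactly what lets the hunk delete leap_timer and ntp_start_leap_timer().
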
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index fd4a7b1625a2..e883f57a3cd3 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -575,11 +575,15 @@ void tick_broadcast_switch_to_oneshot(void)
575 unsigned long flags; 575 unsigned long flags;
576 576
577 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 577 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
578 if (cpumask_empty(tick_get_broadcast_mask()))
579 goto end;
578 580
579 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; 581 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
580 bc = tick_broadcast_device.evtdev; 582 bc = tick_broadcast_device.evtdev;
581 if (bc) 583 if (bc)
582 tick_broadcast_setup_oneshot(bc); 584 tick_broadcast_setup_oneshot(bc);
585
586end:
583 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 587 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
584} 588}
585 589
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7656642e4b8e..3526038f2836 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -182,11 +182,7 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
182 182
183static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) 183static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
184{ 184{
185 ktime_t now; 185 ktime_t now = ktime_get();
186
187 now = ktime_get();
188
189 update_ts_time_stats(cpu, ts, now, NULL);
190 186
191 ts->idle_entrytime = now; 187 ts->idle_entrytime = now;
192 ts->idle_active = 1; 188 ts->idle_active = 1;
@@ -562,20 +558,21 @@ void tick_nohz_idle_exit(void)
562 558
563 local_irq_disable(); 559 local_irq_disable();
564 560
565 if (ts->idle_active || (ts->inidle && ts->tick_stopped)) 561 WARN_ON_ONCE(!ts->inidle);
562
563 ts->inidle = 0;
564
565 if (ts->idle_active || ts->tick_stopped)
566 now = ktime_get(); 566 now = ktime_get();
567 567
568 if (ts->idle_active) 568 if (ts->idle_active)
569 tick_nohz_stop_idle(cpu, now); 569 tick_nohz_stop_idle(cpu, now);
570 570
571 if (!ts->inidle || !ts->tick_stopped) { 571 if (!ts->tick_stopped) {
572 ts->inidle = 0;
573 local_irq_enable(); 572 local_irq_enable();
574 return; 573 return;
575 } 574 }
576 575
577 ts->inidle = 0;
578
579 /* Update jiffies first */ 576 /* Update jiffies first */
580 select_nohz_load_balancer(0); 577 select_nohz_load_balancer(0);
581 tick_do_update_jiffies64(now); 578 tick_do_update_jiffies64(now);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 0c6358186401..d66b21308f7c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -25,6 +25,8 @@
25struct timekeeper { 25struct timekeeper {
26 /* Current clocksource used for timekeeping. */ 26 /* Current clocksource used for timekeeping. */
27 struct clocksource *clock; 27 struct clocksource *clock;
28 /* NTP adjusted clock multiplier */
29 u32 mult;
28 /* The shift value of the current clocksource. */ 30 /* The shift value of the current clocksource. */
29 int shift; 31 int shift;
30 32
@@ -45,12 +47,47 @@ struct timekeeper {
45 /* Shift conversion between clock shifted nano seconds and 47 /* Shift conversion between clock shifted nano seconds and
46 * ntp shifted nano seconds. */ 48 * ntp shifted nano seconds. */
47 int ntp_error_shift; 49 int ntp_error_shift;
48 /* NTP adjusted clock multiplier */ 50
49 u32 mult; 51 /* The current time */
52 struct timespec xtime;
53 /*
54 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
55 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
56 * at zero at system boot time, so wall_to_monotonic will be negative,
57 * however, we will ALWAYS keep the tv_nsec part positive so we can use
58 * the usual normalization.
59 *
60 * wall_to_monotonic is moved after resume from suspend for the
61 * monotonic time not to jump. We need to add total_sleep_time to
62 * wall_to_monotonic to get the real boot based time offset.
63 *
64 * - wall_to_monotonic is no longer the boot time, getboottime must be
65 * used instead.
66 */
67 struct timespec wall_to_monotonic;
68 /* time spent in suspend */
69 struct timespec total_sleep_time;
70 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
71 struct timespec raw_time;
72
73 /* Seqlock for all timekeeper values */
74 seqlock_t lock;
50}; 75};
51 76
52static struct timekeeper timekeeper; 77static struct timekeeper timekeeper;
53 78
79/*
80 * This read-write spinlock protects us from races in SMP while
81 * playing with xtime.
82 */
83__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
84
85
86/* flag for if timekeeping is suspended */
87int __read_mostly timekeeping_suspended;
88
89
90
54/** 91/**
55 * timekeeper_setup_internals - Set up internals to use clocksource clock. 92 * timekeeper_setup_internals - Set up internals to use clocksource clock.
56 * 93 *
@@ -135,49 +172,18 @@ static inline s64 timekeeping_get_ns_raw(void)
135 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 172 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
136} 173}
137 174
138/* 175/* must hold write on timekeeper.lock */
139 * This read-write spinlock protects us from races in SMP while 176static void timekeeping_update(bool clearntp)
140 * playing with xtime.
141 */
142__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
143
144
145/*
146 * The current time
147 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
148 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
149 * at zero at system boot time, so wall_to_monotonic will be negative,
150 * however, we will ALWAYS keep the tv_nsec part positive so we can use
151 * the usual normalization.
152 *
153 * wall_to_monotonic is moved after resume from suspend for the monotonic
154 * time not to jump. We need to add total_sleep_time to wall_to_monotonic
155 * to get the real boot based time offset.
156 *
157 * - wall_to_monotonic is no longer the boot time, getboottime must be
158 * used instead.
159 */
160static struct timespec xtime __attribute__ ((aligned (16)));
161static struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
162static struct timespec total_sleep_time;
163
164/*
165 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
166 */
167static struct timespec raw_time;
168
169/* flag for if timekeeping is suspended */
170int __read_mostly timekeeping_suspended;
171
172/* must hold xtime_lock */
173void timekeeping_leap_insert(int leapsecond)
174{ 177{
175 xtime.tv_sec += leapsecond; 178 if (clearntp) {
176 wall_to_monotonic.tv_sec -= leapsecond; 179 timekeeper.ntp_error = 0;
177 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, 180 ntp_clear();
178 timekeeper.mult); 181 }
182 update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic,
183 timekeeper.clock, timekeeper.mult);
179} 184}
180 185
186
181/** 187/**
182 * timekeeping_forward_now - update clock to the current time 188 * timekeeping_forward_now - update clock to the current time
183 * 189 *
@@ -202,10 +208,10 @@ static void timekeeping_forward_now(void)
202 /* If arch requires, add in gettimeoffset() */ 208 /* If arch requires, add in gettimeoffset() */
203 nsec += arch_gettimeoffset(); 209 nsec += arch_gettimeoffset();
204 210
205 timespec_add_ns(&xtime, nsec); 211 timespec_add_ns(&timekeeper.xtime, nsec);
206 212
207 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 213 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
208 timespec_add_ns(&raw_time, nsec); 214 timespec_add_ns(&timekeeper.raw_time, nsec);
209} 215}
210 216
211/** 217/**
@@ -222,15 +228,15 @@ void getnstimeofday(struct timespec *ts)
222 WARN_ON(timekeeping_suspended); 228 WARN_ON(timekeeping_suspended);
223 229
224 do { 230 do {
225 seq = read_seqbegin(&xtime_lock); 231 seq = read_seqbegin(&timekeeper.lock);
226 232
227 *ts = xtime; 233 *ts = timekeeper.xtime;
228 nsecs = timekeeping_get_ns(); 234 nsecs = timekeeping_get_ns();
229 235
230 /* If arch requires, add in gettimeoffset() */ 236 /* If arch requires, add in gettimeoffset() */
231 nsecs += arch_gettimeoffset(); 237 nsecs += arch_gettimeoffset();
232 238
233 } while (read_seqretry(&xtime_lock, seq)); 239 } while (read_seqretry(&timekeeper.lock, seq));
234 240
235 timespec_add_ns(ts, nsecs); 241 timespec_add_ns(ts, nsecs);
236} 242}
@@ -245,14 +251,16 @@ ktime_t ktime_get(void)
245 WARN_ON(timekeeping_suspended); 251 WARN_ON(timekeeping_suspended);
246 252
247 do { 253 do {
248 seq = read_seqbegin(&xtime_lock); 254 seq = read_seqbegin(&timekeeper.lock);
249 secs = xtime.tv_sec + wall_to_monotonic.tv_sec; 255 secs = timekeeper.xtime.tv_sec +
250 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; 256 timekeeper.wall_to_monotonic.tv_sec;
257 nsecs = timekeeper.xtime.tv_nsec +
258 timekeeper.wall_to_monotonic.tv_nsec;
251 nsecs += timekeeping_get_ns(); 259 nsecs += timekeeping_get_ns();
252 /* If arch requires, add in gettimeoffset() */ 260 /* If arch requires, add in gettimeoffset() */
253 nsecs += arch_gettimeoffset(); 261 nsecs += arch_gettimeoffset();
254 262
255 } while (read_seqretry(&xtime_lock, seq)); 263 } while (read_seqretry(&timekeeper.lock, seq));
256 /* 264 /*
257 * Use ktime_set/ktime_add_ns to create a proper ktime on 265 * Use ktime_set/ktime_add_ns to create a proper ktime on
258 * 32-bit architectures without CONFIG_KTIME_SCALAR. 266 * 32-bit architectures without CONFIG_KTIME_SCALAR.
@@ -278,14 +286,14 @@ void ktime_get_ts(struct timespec *ts)
278 WARN_ON(timekeeping_suspended); 286 WARN_ON(timekeeping_suspended);
279 287
280 do { 288 do {
281 seq = read_seqbegin(&xtime_lock); 289 seq = read_seqbegin(&timekeeper.lock);
282 *ts = xtime; 290 *ts = timekeeper.xtime;
283 tomono = wall_to_monotonic; 291 tomono = timekeeper.wall_to_monotonic;
284 nsecs = timekeeping_get_ns(); 292 nsecs = timekeeping_get_ns();
285 /* If arch requires, add in gettimeoffset() */ 293 /* If arch requires, add in gettimeoffset() */
286 nsecs += arch_gettimeoffset(); 294 nsecs += arch_gettimeoffset();
287 295
288 } while (read_seqretry(&xtime_lock, seq)); 296 } while (read_seqretry(&timekeeper.lock, seq));
289 297
290 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, 298 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
291 ts->tv_nsec + tomono.tv_nsec + nsecs); 299 ts->tv_nsec + tomono.tv_nsec + nsecs);
@@ -313,10 +321,10 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
313 do { 321 do {
314 u32 arch_offset; 322 u32 arch_offset;
315 323
316 seq = read_seqbegin(&xtime_lock); 324 seq = read_seqbegin(&timekeeper.lock);
317 325
318 *ts_raw = raw_time; 326 *ts_raw = timekeeper.raw_time;
319 *ts_real = xtime; 327 *ts_real = timekeeper.xtime;
320 328
321 nsecs_raw = timekeeping_get_ns_raw(); 329 nsecs_raw = timekeeping_get_ns_raw();
322 nsecs_real = timekeeping_get_ns(); 330 nsecs_real = timekeeping_get_ns();
@@ -326,7 +334,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
326 nsecs_raw += arch_offset; 334 nsecs_raw += arch_offset;
327 nsecs_real += arch_offset; 335 nsecs_real += arch_offset;
328 336
329 } while (read_seqretry(&xtime_lock, seq)); 337 } while (read_seqretry(&timekeeper.lock, seq));
330 338
331 timespec_add_ns(ts_raw, nsecs_raw); 339 timespec_add_ns(ts_raw, nsecs_raw);
332 timespec_add_ns(ts_real, nsecs_real); 340 timespec_add_ns(ts_real, nsecs_real);
@@ -365,23 +373,19 @@ int do_settimeofday(const struct timespec *tv)
365 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) 373 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
366 return -EINVAL; 374 return -EINVAL;
367 375
368 write_seqlock_irqsave(&xtime_lock, flags); 376 write_seqlock_irqsave(&timekeeper.lock, flags);
369 377
370 timekeeping_forward_now(); 378 timekeeping_forward_now();
371 379
372 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; 380 ts_delta.tv_sec = tv->tv_sec - timekeeper.xtime.tv_sec;
373 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; 381 ts_delta.tv_nsec = tv->tv_nsec - timekeeper.xtime.tv_nsec;
374 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); 382 timekeeper.wall_to_monotonic =
383 timespec_sub(timekeeper.wall_to_monotonic, ts_delta);
375 384
376 xtime = *tv; 385 timekeeper.xtime = *tv;
377 386 timekeeping_update(true);
378 timekeeper.ntp_error = 0;
379 ntp_clear();
380 387
381 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, 388 write_sequnlock_irqrestore(&timekeeper.lock, flags);
382 timekeeper.mult);
383
384 write_sequnlock_irqrestore(&xtime_lock, flags);
385 389
386 /* signal hrtimers about time change */ 390 /* signal hrtimers about time change */
387 clock_was_set(); 391 clock_was_set();
@@ -405,20 +409,17 @@ int timekeeping_inject_offset(struct timespec *ts)
405 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) 409 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
406 return -EINVAL; 410 return -EINVAL;
407 411
408 write_seqlock_irqsave(&xtime_lock, flags); 412 write_seqlock_irqsave(&timekeeper.lock, flags);
409 413
410 timekeeping_forward_now(); 414 timekeeping_forward_now();
411 415
412 xtime = timespec_add(xtime, *ts); 416 timekeeper.xtime = timespec_add(timekeeper.xtime, *ts);
413 wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); 417 timekeeper.wall_to_monotonic =
414 418 timespec_sub(timekeeper.wall_to_monotonic, *ts);
415 timekeeper.ntp_error = 0;
416 ntp_clear();
417 419
418 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, 420 timekeeping_update(true);
419 timekeeper.mult);
420 421
421 write_sequnlock_irqrestore(&xtime_lock, flags); 422 write_sequnlock_irqrestore(&timekeeper.lock, flags);
422 423
423 /* signal hrtimers about time change */ 424 /* signal hrtimers about time change */
424 clock_was_set(); 425 clock_was_set();
@@ -435,9 +436,12 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
435static int change_clocksource(void *data) 436static int change_clocksource(void *data)
436{ 437{
437 struct clocksource *new, *old; 438 struct clocksource *new, *old;
439 unsigned long flags;
438 440
439 new = (struct clocksource *) data; 441 new = (struct clocksource *) data;
440 442
443 write_seqlock_irqsave(&timekeeper.lock, flags);
444
441 timekeeping_forward_now(); 445 timekeeping_forward_now();
442 if (!new->enable || new->enable(new) == 0) { 446 if (!new->enable || new->enable(new) == 0) {
443 old = timekeeper.clock; 447 old = timekeeper.clock;
@@ -445,6 +449,10 @@ static int change_clocksource(void *data)
445 if (old->disable) 449 if (old->disable)
446 old->disable(old); 450 old->disable(old);
447 } 451 }
452 timekeeping_update(true);
453
454 write_sequnlock_irqrestore(&timekeeper.lock, flags);
455
448 return 0; 456 return 0;
449} 457}
450 458
@@ -490,11 +498,11 @@ void getrawmonotonic(struct timespec *ts)
490 s64 nsecs; 498 s64 nsecs;
491 499
492 do { 500 do {
493 seq = read_seqbegin(&xtime_lock); 501 seq = read_seqbegin(&timekeeper.lock);
494 nsecs = timekeeping_get_ns_raw(); 502 nsecs = timekeeping_get_ns_raw();
495 *ts = raw_time; 503 *ts = timekeeper.raw_time;
496 504
497 } while (read_seqretry(&xtime_lock, seq)); 505 } while (read_seqretry(&timekeeper.lock, seq));
498 506
499 timespec_add_ns(ts, nsecs); 507 timespec_add_ns(ts, nsecs);
500} 508}
@@ -510,24 +518,30 @@ int timekeeping_valid_for_hres(void)
510 int ret; 518 int ret;
511 519
512 do { 520 do {
513 seq = read_seqbegin(&xtime_lock); 521 seq = read_seqbegin(&timekeeper.lock);
514 522
515 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 523 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
516 524
517 } while (read_seqretry(&xtime_lock, seq)); 525 } while (read_seqretry(&timekeeper.lock, seq));
518 526
519 return ret; 527 return ret;
520} 528}
521 529
522/** 530/**
523 * timekeeping_max_deferment - Returns max time the clocksource can be deferred 531 * timekeeping_max_deferment - Returns max time the clocksource can be deferred
524 *
525 * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
526 * ensure that the clocksource does not change!
527 */ 532 */
528u64 timekeeping_max_deferment(void) 533u64 timekeeping_max_deferment(void)
529{ 534{
530 return timekeeper.clock->max_idle_ns; 535 unsigned long seq;
536 u64 ret;
537 do {
538 seq = read_seqbegin(&timekeeper.lock);
539
540 ret = timekeeper.clock->max_idle_ns;
541
542 } while (read_seqretry(&timekeeper.lock, seq));
543
544 return ret;
531} 545}
532 546
533/** 547/**
@@ -572,28 +586,29 @@ void __init timekeeping_init(void)
572 read_persistent_clock(&now); 586 read_persistent_clock(&now);
573 read_boot_clock(&boot); 587 read_boot_clock(&boot);
574 588
575 write_seqlock_irqsave(&xtime_lock, flags); 589 seqlock_init(&timekeeper.lock);
576 590
577 ntp_init(); 591 ntp_init();
578 592
593 write_seqlock_irqsave(&timekeeper.lock, flags);
579 clock = clocksource_default_clock(); 594 clock = clocksource_default_clock();
580 if (clock->enable) 595 if (clock->enable)
581 clock->enable(clock); 596 clock->enable(clock);
582 timekeeper_setup_internals(clock); 597 timekeeper_setup_internals(clock);
583 598
584 xtime.tv_sec = now.tv_sec; 599 timekeeper.xtime.tv_sec = now.tv_sec;
585 xtime.tv_nsec = now.tv_nsec; 600 timekeeper.xtime.tv_nsec = now.tv_nsec;
586 raw_time.tv_sec = 0; 601 timekeeper.raw_time.tv_sec = 0;
587 raw_time.tv_nsec = 0; 602 timekeeper.raw_time.tv_nsec = 0;
588 if (boot.tv_sec == 0 && boot.tv_nsec == 0) { 603 if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
589 boot.tv_sec = xtime.tv_sec; 604 boot.tv_sec = timekeeper.xtime.tv_sec;
590 boot.tv_nsec = xtime.tv_nsec; 605 boot.tv_nsec = timekeeper.xtime.tv_nsec;
591 } 606 }
592 set_normalized_timespec(&wall_to_monotonic, 607 set_normalized_timespec(&timekeeper.wall_to_monotonic,
593 -boot.tv_sec, -boot.tv_nsec); 608 -boot.tv_sec, -boot.tv_nsec);
594 total_sleep_time.tv_sec = 0; 609 timekeeper.total_sleep_time.tv_sec = 0;
595 total_sleep_time.tv_nsec = 0; 610 timekeeper.total_sleep_time.tv_nsec = 0;
596 write_sequnlock_irqrestore(&xtime_lock, flags); 611 write_sequnlock_irqrestore(&timekeeper.lock, flags);
597} 612}
598 613
599/* time in seconds when suspend began */ 614/* time in seconds when suspend began */
@@ -614,9 +629,11 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta)
614 return; 629 return;
615 } 630 }
616 631
617 xtime = timespec_add(xtime, *delta); 632 timekeeper.xtime = timespec_add(timekeeper.xtime, *delta);
618 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); 633 timekeeper.wall_to_monotonic =
619 total_sleep_time = timespec_add(total_sleep_time, *delta); 634 timespec_sub(timekeeper.wall_to_monotonic, *delta);
635 timekeeper.total_sleep_time = timespec_add(
636 timekeeper.total_sleep_time, *delta);
620} 637}
621 638
622 639
@@ -640,17 +657,15 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
640 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) 657 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
641 return; 658 return;
642 659
643 write_seqlock_irqsave(&xtime_lock, flags); 660 write_seqlock_irqsave(&timekeeper.lock, flags);
661
644 timekeeping_forward_now(); 662 timekeeping_forward_now();
645 663
646 __timekeeping_inject_sleeptime(delta); 664 __timekeeping_inject_sleeptime(delta);
647 665
648 timekeeper.ntp_error = 0; 666 timekeeping_update(true);
649 ntp_clear();
650 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
651 timekeeper.mult);
652 667
653 write_sequnlock_irqrestore(&xtime_lock, flags); 668 write_sequnlock_irqrestore(&timekeeper.lock, flags);
654 669
655 /* signal hrtimers about time change */ 670 /* signal hrtimers about time change */
656 clock_was_set(); 671 clock_was_set();
@@ -673,7 +688,7 @@ static void timekeeping_resume(void)
673 688
674 clocksource_resume(); 689 clocksource_resume();
675 690
676 write_seqlock_irqsave(&xtime_lock, flags); 691 write_seqlock_irqsave(&timekeeper.lock, flags);
677 692
678 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 693 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
679 ts = timespec_sub(ts, timekeeping_suspend_time); 694 ts = timespec_sub(ts, timekeeping_suspend_time);
@@ -683,7 +698,7 @@ static void timekeeping_resume(void)
683 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 698 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
684 timekeeper.ntp_error = 0; 699 timekeeper.ntp_error = 0;
685 timekeeping_suspended = 0; 700 timekeeping_suspended = 0;
686 write_sequnlock_irqrestore(&xtime_lock, flags); 701 write_sequnlock_irqrestore(&timekeeper.lock, flags);
687 702
688 touch_softlockup_watchdog(); 703 touch_softlockup_watchdog();
689 704
@@ -701,7 +716,7 @@ static int timekeeping_suspend(void)
701 716
702 read_persistent_clock(&timekeeping_suspend_time); 717 read_persistent_clock(&timekeeping_suspend_time);
703 718
704 write_seqlock_irqsave(&xtime_lock, flags); 719 write_seqlock_irqsave(&timekeeper.lock, flags);
705 timekeeping_forward_now(); 720 timekeeping_forward_now();
706 timekeeping_suspended = 1; 721 timekeeping_suspended = 1;
707 722
@@ -711,7 +726,7 @@ static int timekeeping_suspend(void)
711 * try to compensate so the difference in system time 726 * try to compensate so the difference in system time
712 * and persistent_clock time stays close to constant. 727 * and persistent_clock time stays close to constant.
713 */ 728 */
714 delta = timespec_sub(xtime, timekeeping_suspend_time); 729 delta = timespec_sub(timekeeper.xtime, timekeeping_suspend_time);
715 delta_delta = timespec_sub(delta, old_delta); 730 delta_delta = timespec_sub(delta, old_delta);
716 if (abs(delta_delta.tv_sec) >= 2) { 731 if (abs(delta_delta.tv_sec) >= 2) {
717 /* 732 /*
@@ -724,7 +739,7 @@ static int timekeeping_suspend(void)
724 timekeeping_suspend_time = 739 timekeeping_suspend_time =
725 timespec_add(timekeeping_suspend_time, delta_delta); 740 timespec_add(timekeeping_suspend_time, delta_delta);
726 } 741 }
727 write_sequnlock_irqrestore(&xtime_lock, flags); 742 write_sequnlock_irqrestore(&timekeeper.lock, flags);
728 743
729 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 744 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
730 clocksource_suspend(); 745 clocksource_suspend();
@@ -775,7 +790,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
775 * Now calculate the error in (1 << look_ahead) ticks, but first 790 * Now calculate the error in (1 << look_ahead) ticks, but first
776 * remove the single look ahead already included in the error. 791 * remove the single look ahead already included in the error.
777 */ 792 */
778 tick_error = tick_length >> (timekeeper.ntp_error_shift + 1); 793 tick_error = ntp_tick_length() >> (timekeeper.ntp_error_shift + 1);
779 tick_error -= timekeeper.xtime_interval >> 1; 794 tick_error -= timekeeper.xtime_interval >> 1;
780 error = ((error - tick_error) >> look_ahead) + tick_error; 795 error = ((error - tick_error) >> look_ahead) + tick_error;
781 796
@@ -807,7 +822,7 @@ static void timekeeping_adjust(s64 offset)
807 int adj; 822 int adj;
808 823
809 /* 824 /*
810 * The point of this is to check if the error is greater then half 825 * The point of this is to check if the error is greater than half
811 * an interval. 826 * an interval.
812 * 827 *
813 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. 828 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
@@ -815,7 +830,7 @@ static void timekeeping_adjust(s64 offset)
815 * Note we subtract one in the shift, so that error is really error*2. 830 * Note we subtract one in the shift, so that error is really error*2.
816 * This "saves" dividing(shifting) interval twice, but keeps the 831 * This "saves" dividing(shifting) interval twice, but keeps the
817 * (error > interval) comparison as still measuring if error is 832 * (error > interval) comparison as still measuring if error is
818 * larger then half an interval. 833 * larger than half an interval.
819 * 834 *
820 * Note: It does not "save" on aggravation when reading the code. 835 * Note: It does not "save" on aggravation when reading the code.
821 */ 836 */
@@ -823,7 +838,7 @@ static void timekeeping_adjust(s64 offset)
823 if (error > interval) { 838 if (error > interval) {
824 /* 839 /*
825 * We now divide error by 4(via shift), which checks if 840 * We now divide error by 4(via shift), which checks if
826 * the error is greater then twice the interval. 841 * the error is greater than twice the interval.
827 * If it is greater, we need a bigadjust, if its smaller, 842 * If it is greater, we need a bigadjust, if its smaller,
828 * we can adjust by 1. 843 * we can adjust by 1.
829 */ 844 */
@@ -854,13 +869,15 @@ static void timekeeping_adjust(s64 offset)
854 } else /* No adjustment needed */ 869 } else /* No adjustment needed */
855 return; 870 return;
856 871
857 WARN_ONCE(timekeeper.clock->maxadj && 872 if (unlikely(timekeeper.clock->maxadj &&
858 (timekeeper.mult + adj > timekeeper.clock->mult + 873 (timekeeper.mult + adj >
859 timekeeper.clock->maxadj), 874 timekeeper.clock->mult + timekeeper.clock->maxadj))) {
860 "Adjusting %s more then 11%% (%ld vs %ld)\n", 875 printk_once(KERN_WARNING
876 "Adjusting %s more than 11%% (%ld vs %ld)\n",
861 timekeeper.clock->name, (long)timekeeper.mult + adj, 877 timekeeper.clock->name, (long)timekeeper.mult + adj,
862 (long)timekeeper.clock->mult + 878 (long)timekeeper.clock->mult +
863 timekeeper.clock->maxadj); 879 timekeeper.clock->maxadj);
880 }
864 /* 881 /*
865 * So the following can be confusing. 882 * So the following can be confusing.
866 * 883 *
@@ -932,7 +949,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
932 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; 949 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
933 u64 raw_nsecs; 950 u64 raw_nsecs;
934 951
935 /* If the offset is smaller then a shifted interval, do nothing */ 952 /* If the offset is smaller than a shifted interval, do nothing */
936 if (offset < timekeeper.cycle_interval<<shift) 953 if (offset < timekeeper.cycle_interval<<shift)
937 return offset; 954 return offset;
938 955
@@ -942,23 +959,25 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
942 959
943 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; 960 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
944 while (timekeeper.xtime_nsec >= nsecps) { 961 while (timekeeper.xtime_nsec >= nsecps) {
962 int leap;
945 timekeeper.xtime_nsec -= nsecps; 963 timekeeper.xtime_nsec -= nsecps;
946 xtime.tv_sec++; 964 timekeeper.xtime.tv_sec++;
947 second_overflow(); 965 leap = second_overflow(timekeeper.xtime.tv_sec);
966 timekeeper.xtime.tv_sec += leap;
948 } 967 }
949 968
950 /* Accumulate raw time */ 969 /* Accumulate raw time */
951 raw_nsecs = timekeeper.raw_interval << shift; 970 raw_nsecs = timekeeper.raw_interval << shift;
952 raw_nsecs += raw_time.tv_nsec; 971 raw_nsecs += timekeeper.raw_time.tv_nsec;
953 if (raw_nsecs >= NSEC_PER_SEC) { 972 if (raw_nsecs >= NSEC_PER_SEC) {
954 u64 raw_secs = raw_nsecs; 973 u64 raw_secs = raw_nsecs;
955 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); 974 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
956 raw_time.tv_sec += raw_secs; 975 timekeeper.raw_time.tv_sec += raw_secs;
957 } 976 }
958 raw_time.tv_nsec = raw_nsecs; 977 timekeeper.raw_time.tv_nsec = raw_nsecs;
959 978
960 /* Accumulate error between NTP and clock interval */ 979 /* Accumulate error between NTP and clock interval */
961 timekeeper.ntp_error += tick_length << shift; 980 timekeeper.ntp_error += ntp_tick_length() << shift;
962 timekeeper.ntp_error -= 981 timekeeper.ntp_error -=
963 (timekeeper.xtime_interval + timekeeper.xtime_remainder) << 982 (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
964 (timekeeper.ntp_error_shift + shift); 983 (timekeeper.ntp_error_shift + shift);
@@ -970,17 +989,19 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
970/** 989/**
971 * update_wall_time - Uses the current clocksource to increment the wall time 990 * update_wall_time - Uses the current clocksource to increment the wall time
972 * 991 *
973 * Called from the timer interrupt, must hold a write on xtime_lock.
974 */ 992 */
975static void update_wall_time(void) 993static void update_wall_time(void)
976{ 994{
977 struct clocksource *clock; 995 struct clocksource *clock;
978 cycle_t offset; 996 cycle_t offset;
979 int shift = 0, maxshift; 997 int shift = 0, maxshift;
998 unsigned long flags;
999
1000 write_seqlock_irqsave(&timekeeper.lock, flags);
980 1001
981 /* Make sure we're fully resumed: */ 1002 /* Make sure we're fully resumed: */
982 if (unlikely(timekeeping_suspended)) 1003 if (unlikely(timekeeping_suspended))
983 return; 1004 goto out;
984 1005
985 clock = timekeeper.clock; 1006 clock = timekeeper.clock;
986 1007
@@ -989,20 +1010,21 @@ static void update_wall_time(void)
989#else 1010#else
990 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 1011 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
991#endif 1012#endif
992 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; 1013 timekeeper.xtime_nsec = (s64)timekeeper.xtime.tv_nsec <<
1014 timekeeper.shift;
993 1015
994 /* 1016 /*
995 * With NO_HZ we may have to accumulate many cycle_intervals 1017 * With NO_HZ we may have to accumulate many cycle_intervals
996 * (think "ticks") worth of time at once. To do this efficiently, 1018 * (think "ticks") worth of time at once. To do this efficiently,
997 * we calculate the largest doubling multiple of cycle_intervals 1019 * we calculate the largest doubling multiple of cycle_intervals
998 * that is smaller then the offset. We then accumulate that 1020 * that is smaller than the offset. We then accumulate that
999 * chunk in one go, and then try to consume the next smaller 1021 * chunk in one go, and then try to consume the next smaller
1000 * doubled multiple. 1022 * doubled multiple.
1001 */ 1023 */
1002 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); 1024 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
1003 shift = max(0, shift); 1025 shift = max(0, shift);
1004 /* Bound shift to one less then what overflows tick_length */ 1026 /* Bound shift to one less than what overflows tick_length */
1005 maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1; 1027 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
1006 shift = min(shift, maxshift); 1028 shift = min(shift, maxshift);
1007 while (offset >= timekeeper.cycle_interval) { 1029 while (offset >= timekeeper.cycle_interval) {
1008 offset = logarithmic_accumulation(offset, shift); 1030 offset = logarithmic_accumulation(offset, shift);
@@ -1040,24 +1062,30 @@ static void update_wall_time(void)
1040 * Store full nanoseconds into xtime after rounding it up and 1062 * Store full nanoseconds into xtime after rounding it up and
1041 * add the remainder to the error difference. 1063 * add the remainder to the error difference.
1042 */ 1064 */
1043 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; 1065 timekeeper.xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >>
1044 timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift; 1066 timekeeper.shift) + 1;
1067 timekeeper.xtime_nsec -= (s64)timekeeper.xtime.tv_nsec <<
1068 timekeeper.shift;
1045 timekeeper.ntp_error += timekeeper.xtime_nsec << 1069 timekeeper.ntp_error += timekeeper.xtime_nsec <<
1046 timekeeper.ntp_error_shift; 1070 timekeeper.ntp_error_shift;
1047 1071
1048 /* 1072 /*
1049 * Finally, make sure that after the rounding 1073 * Finally, make sure that after the rounding
1050 * xtime.tv_nsec isn't larger then NSEC_PER_SEC 1074 * xtime.tv_nsec isn't larger than NSEC_PER_SEC
1051 */ 1075 */
1052 if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) { 1076 if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) {
1053 xtime.tv_nsec -= NSEC_PER_SEC; 1077 int leap;
1054 xtime.tv_sec++; 1078 timekeeper.xtime.tv_nsec -= NSEC_PER_SEC;
1055 second_overflow(); 1079 timekeeper.xtime.tv_sec++;
1080 leap = second_overflow(timekeeper.xtime.tv_sec);
1081 timekeeper.xtime.tv_sec += leap;
1056 } 1082 }
1057 1083
1058 /* check to see if there is a new clocksource to use */ 1084 timekeeping_update(false);
1059 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, 1085
1060 timekeeper.mult); 1086out:
1087 write_sequnlock_irqrestore(&timekeeper.lock, flags);
1088
1061} 1089}
1062 1090
1063/** 1091/**
@@ -1074,8 +1102,10 @@ static void update_wall_time(void)
1074void getboottime(struct timespec *ts) 1102void getboottime(struct timespec *ts)
1075{ 1103{
1076 struct timespec boottime = { 1104 struct timespec boottime = {
1077 .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec, 1105 .tv_sec = timekeeper.wall_to_monotonic.tv_sec +
1078 .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec 1106 timekeeper.total_sleep_time.tv_sec,
1107 .tv_nsec = timekeeper.wall_to_monotonic.tv_nsec +
1108 timekeeper.total_sleep_time.tv_nsec
1079 }; 1109 };
1080 1110
1081 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 1111 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
@@ -1101,13 +1131,13 @@ void get_monotonic_boottime(struct timespec *ts)
1101 WARN_ON(timekeeping_suspended); 1131 WARN_ON(timekeeping_suspended);
1102 1132
1103 do { 1133 do {
1104 seq = read_seqbegin(&xtime_lock); 1134 seq = read_seqbegin(&timekeeper.lock);
1105 *ts = xtime; 1135 *ts = timekeeper.xtime;
1106 tomono = wall_to_monotonic; 1136 tomono = timekeeper.wall_to_monotonic;
1107 sleep = total_sleep_time; 1137 sleep = timekeeper.total_sleep_time;
1108 nsecs = timekeeping_get_ns(); 1138 nsecs = timekeeping_get_ns();
1109 1139
1110 } while (read_seqretry(&xtime_lock, seq)); 1140 } while (read_seqretry(&timekeeper.lock, seq));
1111 1141
1112 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, 1142 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
1113 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); 1143 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
@@ -1137,19 +1167,19 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime);
1137 */ 1167 */
1138void monotonic_to_bootbased(struct timespec *ts) 1168void monotonic_to_bootbased(struct timespec *ts)
1139{ 1169{
1140 *ts = timespec_add(*ts, total_sleep_time); 1170 *ts = timespec_add(*ts, timekeeper.total_sleep_time);
1141} 1171}
1142EXPORT_SYMBOL_GPL(monotonic_to_bootbased); 1172EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
1143 1173
1144unsigned long get_seconds(void) 1174unsigned long get_seconds(void)
1145{ 1175{
1146 return xtime.tv_sec; 1176 return timekeeper.xtime.tv_sec;
1147} 1177}
1148EXPORT_SYMBOL(get_seconds); 1178EXPORT_SYMBOL(get_seconds);
1149 1179
1150struct timespec __current_kernel_time(void) 1180struct timespec __current_kernel_time(void)
1151{ 1181{
1152 return xtime; 1182 return timekeeper.xtime;
1153} 1183}
1154 1184
1155struct timespec current_kernel_time(void) 1185struct timespec current_kernel_time(void)
@@ -1158,10 +1188,10 @@ struct timespec current_kernel_time(void)
1158 unsigned long seq; 1188 unsigned long seq;
1159 1189
1160 do { 1190 do {
1161 seq = read_seqbegin(&xtime_lock); 1191 seq = read_seqbegin(&timekeeper.lock);
1162 1192
1163 now = xtime; 1193 now = timekeeper.xtime;
1164 } while (read_seqretry(&xtime_lock, seq)); 1194 } while (read_seqretry(&timekeeper.lock, seq));
1165 1195
1166 return now; 1196 return now;
1167} 1197}
@@ -1173,11 +1203,11 @@ struct timespec get_monotonic_coarse(void)
1173 unsigned long seq; 1203 unsigned long seq;
1174 1204
1175 do { 1205 do {
1176 seq = read_seqbegin(&xtime_lock); 1206 seq = read_seqbegin(&timekeeper.lock);
1177 1207
1178 now = xtime; 1208 now = timekeeper.xtime;
1179 mono = wall_to_monotonic; 1209 mono = timekeeper.wall_to_monotonic;
1180 } while (read_seqretry(&xtime_lock, seq)); 1210 } while (read_seqretry(&timekeeper.lock, seq));
1181 1211
1182 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, 1212 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
1183 now.tv_nsec + mono.tv_nsec); 1213 now.tv_nsec + mono.tv_nsec);
@@ -1209,11 +1239,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1209 unsigned long seq; 1239 unsigned long seq;
1210 1240
1211 do { 1241 do {
1212 seq = read_seqbegin(&xtime_lock); 1242 seq = read_seqbegin(&timekeeper.lock);
1213 *xtim = xtime; 1243 *xtim = timekeeper.xtime;
1214 *wtom = wall_to_monotonic; 1244 *wtom = timekeeper.wall_to_monotonic;
1215 *sleep = total_sleep_time; 1245 *sleep = timekeeper.total_sleep_time;
1216 } while (read_seqretry(&xtime_lock, seq)); 1246 } while (read_seqretry(&timekeeper.lock, seq));
1217} 1247}
1218 1248
1219/** 1249/**
@@ -1225,11 +1255,14 @@ ktime_t ktime_get_monotonic_offset(void)
1225 struct timespec wtom; 1255 struct timespec wtom;
1226 1256
1227 do { 1257 do {
1228 seq = read_seqbegin(&xtime_lock); 1258 seq = read_seqbegin(&timekeeper.lock);
1229 wtom = wall_to_monotonic; 1259 wtom = timekeeper.wall_to_monotonic;
1230 } while (read_seqretry(&xtime_lock, seq)); 1260 } while (read_seqretry(&timekeeper.lock, seq));
1261
1231 return timespec_to_ktime(wtom); 1262 return timespec_to_ktime(wtom);
1232} 1263}
1264EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1265
1233 1266
1234/** 1267/**
1235 * xtime_update() - advances the timekeeping infrastructure 1268 * xtime_update() - advances the timekeeping infrastructure
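
The timekeeping.c hunks move xtime, wall_to_monotonic, raw_time and total_sleep_time into struct timekeeper and protect them with timekeeper.lock: writers take write_seqlock_irqsave(), readers loop with read_seqbegin()/read_seqretry() until they get a snapshot no writer raced with. The sketch below is a userspace model of that reader retry pattern built from C11 atomics; it is only an illustration of the protocol (a real seqlock_t also embeds a spinlock to serialise writers, and the field names here are invented).

/* Userspace model of the read_seqbegin()/read_seqretry() pattern. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;                 /* even: stable, odd: write in progress */
static atomic_long fake_xtime_sec;      /* "protected" data ...              */
static atomic_long fake_wtom_sec;       /* ... kept equal to -fake_xtime_sec */

static unsigned int read_seqbegin_model(void)
{
        unsigned int s;

        while ((s = atomic_load(&seq)) & 1)     /* writer active: wait */
                ;
        return s;
}

static int read_seqretry_model(unsigned int start)
{
        return atomic_load(&seq) != start;      /* nonzero means retry */
}

static void *writer(void *arg)
{
        (void)arg;
        for (long i = 1; i <= 100000; i++) {
                atomic_fetch_add(&seq, 1);              /* write_seqlock()   */
                atomic_store(&fake_xtime_sec, i);
                atomic_store(&fake_wtom_sec, -i);
                atomic_fetch_add(&seq, 1);              /* write_sequnlock() */
        }
        return NULL;
}

int main(void)
{
        pthread_t tid;
        unsigned int start;
        long xt, wtom;

        pthread_create(&tid, NULL, writer, NULL);

        for (int i = 0; i < 100000; i++) {
                do {
                        start = read_seqbegin_model();
                        xt = atomic_load(&fake_xtime_sec);
                        wtom = atomic_load(&fake_wtom_sec);
                } while (read_seqretry_model(start));

                if (xt + wtom != 0) {           /* would mean a torn snapshot */
                        fprintf(stderr, "inconsistent snapshot!\n");
                        return 1;
                }
        }

        pthread_join(tid, NULL);
        printf("all reader snapshots were consistent\n");
        return 0;
}

Readers never block writers, which is why the hot paths above (getnstimeofday, ktime_get, current_kernel_time) can stay lock-free while the timekeeper gains its own lock independent of xtime_lock.
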
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index cd3134510f3d..a1d2849f2473 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -141,7 +141,7 @@ if FTRACE
141config FUNCTION_TRACER 141config FUNCTION_TRACER
142 bool "Kernel Function Tracer" 142 bool "Kernel Function Tracer"
143 depends on HAVE_FUNCTION_TRACER 143 depends on HAVE_FUNCTION_TRACER
144 select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE 144 select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE
145 select KALLSYMS 145 select KALLSYMS
146 select GENERIC_TRACER 146 select GENERIC_TRACER
147 select CONTEXT_SWITCH_TRACER 147 select CONTEXT_SWITCH_TRACER
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 683d559a0eef..0fa92f677c92 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -62,6 +62,8 @@
62#define FTRACE_HASH_DEFAULT_BITS 10 62#define FTRACE_HASH_DEFAULT_BITS 10
63#define FTRACE_HASH_MAX_BITS 12 63#define FTRACE_HASH_MAX_BITS 12
64 64
65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL)
66
65/* ftrace_enabled is a method to turn ftrace on or off */ 67/* ftrace_enabled is a method to turn ftrace on or off */
66int ftrace_enabled __read_mostly; 68int ftrace_enabled __read_mostly;
67static int last_ftrace_enabled; 69static int last_ftrace_enabled;
@@ -89,12 +91,14 @@ static struct ftrace_ops ftrace_list_end __read_mostly = {
89}; 91};
90 92
91static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; 93static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
94static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
92static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 95static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
93ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 96ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
94static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; 97static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub;
95ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 98ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
96ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 99ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
97static struct ftrace_ops global_ops; 100static struct ftrace_ops global_ops;
101static struct ftrace_ops control_ops;
98 102
99static void 103static void
100ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); 104ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
@@ -168,6 +172,32 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
168} 172}
169#endif 173#endif
170 174
175static void control_ops_disable_all(struct ftrace_ops *ops)
176{
177 int cpu;
178
179 for_each_possible_cpu(cpu)
180 *per_cpu_ptr(ops->disabled, cpu) = 1;
181}
182
183static int control_ops_alloc(struct ftrace_ops *ops)
184{
185 int __percpu *disabled;
186
187 disabled = alloc_percpu(int);
188 if (!disabled)
189 return -ENOMEM;
190
191 ops->disabled = disabled;
192 control_ops_disable_all(ops);
193 return 0;
194}
195
196static void control_ops_free(struct ftrace_ops *ops)
197{
198 free_percpu(ops->disabled);
199}
200
171static void update_global_ops(void) 201static void update_global_ops(void)
172{ 202{
173 ftrace_func_t func; 203 ftrace_func_t func;
@@ -219,7 +249,8 @@ static void update_ftrace_function(void)
219#else 249#else
220 __ftrace_trace_function = func; 250 __ftrace_trace_function = func;
221#endif 251#endif
222 ftrace_trace_function = ftrace_test_stop_func; 252 ftrace_trace_function =
253 (func == ftrace_stub) ? func : ftrace_test_stop_func;
223#endif 254#endif
224} 255}
225 256
@@ -259,6 +290,26 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
259 return 0; 290 return 0;
260} 291}
261 292
293static void add_ftrace_list_ops(struct ftrace_ops **list,
294 struct ftrace_ops *main_ops,
295 struct ftrace_ops *ops)
296{
297 int first = *list == &ftrace_list_end;
298 add_ftrace_ops(list, ops);
299 if (first)
300 add_ftrace_ops(&ftrace_ops_list, main_ops);
301}
302
303static int remove_ftrace_list_ops(struct ftrace_ops **list,
304 struct ftrace_ops *main_ops,
305 struct ftrace_ops *ops)
306{
307 int ret = remove_ftrace_ops(list, ops);
308 if (!ret && *list == &ftrace_list_end)
309 ret = remove_ftrace_ops(&ftrace_ops_list, main_ops);
310 return ret;
311}
312
262static int __register_ftrace_function(struct ftrace_ops *ops) 313static int __register_ftrace_function(struct ftrace_ops *ops)
263{ 314{
264 if (ftrace_disabled) 315 if (ftrace_disabled)
@@ -270,15 +321,20 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
270 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) 321 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
271 return -EBUSY; 322 return -EBUSY;
272 323
324 /* We don't support both control and global flags set. */
325 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
326 return -EINVAL;
327
273 if (!core_kernel_data((unsigned long)ops)) 328 if (!core_kernel_data((unsigned long)ops))
274 ops->flags |= FTRACE_OPS_FL_DYNAMIC; 329 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
275 330
276 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 331 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
277 int first = ftrace_global_list == &ftrace_list_end; 332 add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops);
278 add_ftrace_ops(&ftrace_global_list, ops);
279 ops->flags |= FTRACE_OPS_FL_ENABLED; 333 ops->flags |= FTRACE_OPS_FL_ENABLED;
280 if (first) 334 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
281 add_ftrace_ops(&ftrace_ops_list, &global_ops); 335 if (control_ops_alloc(ops))
336 return -ENOMEM;
337 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
282 } else 338 } else
283 add_ftrace_ops(&ftrace_ops_list, ops); 339 add_ftrace_ops(&ftrace_ops_list, ops);
284 340
@@ -302,11 +358,23 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
302 return -EINVAL; 358 return -EINVAL;
303 359
304 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 360 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
305 ret = remove_ftrace_ops(&ftrace_global_list, ops); 361 ret = remove_ftrace_list_ops(&ftrace_global_list,
306 if (!ret && ftrace_global_list == &ftrace_list_end) 362 &global_ops, ops);
307 ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops);
308 if (!ret) 363 if (!ret)
309 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 364 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
365 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
366 ret = remove_ftrace_list_ops(&ftrace_control_list,
367 &control_ops, ops);
368 if (!ret) {
369 /*
370 * The ftrace_ops is now removed from the list,
371 * so there'll be no new users. We must ensure
372 * all current users are done before we free
373 * the control data.
374 */
375 synchronize_sched();
376 control_ops_free(ops);
377 }
310 } else 378 } else
311 ret = remove_ftrace_ops(&ftrace_ops_list, ops); 379 ret = remove_ftrace_ops(&ftrace_ops_list, ops);
312 380
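
From a client's point of view, a control ops goes through the normal ftrace_ops flow; only the FTRACE_OPS_FL_CONTROL flag and the per-CPU enable/disable calls are new. A minimal sketch, assuming <linux/ftrace.h> and an illustrative callback my_trace_func():

	static void my_trace_func(unsigned long ip, unsigned long parent_ip)
	{
		/* runs only on CPUs where the ops is locally enabled */
	}

	static struct ftrace_ops my_ops = {
		.func	= my_trace_func,
		.flags	= FTRACE_OPS_FL_CONTROL,  /* must not be combined with _GLOBAL */
	};

	static int __init my_ops_init(void)
	{
		int err;

		err = register_ftrace_function(&my_ops);  /* allocates ops->disabled; every CPU starts disabled */
		if (err)
			return err;

		/* later, from a context pinned to a CPU (e.g. a perf sched-in hook): */
		ftrace_function_local_enable(&my_ops);
		ftrace_function_local_disable(&my_ops);

		unregister_ftrace_function(&my_ops);  /* control data freed after synchronize_sched() */
		return 0;
	}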
@@ -1119,6 +1187,12 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
1119 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); 1187 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
1120} 1188}
1121 1189
1190void ftrace_free_filter(struct ftrace_ops *ops)
1191{
1192 free_ftrace_hash(ops->filter_hash);
1193 free_ftrace_hash(ops->notrace_hash);
1194}
1195
1122static struct ftrace_hash *alloc_ftrace_hash(int size_bits) 1196static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1123{ 1197{
1124 struct ftrace_hash *hash; 1198 struct ftrace_hash *hash;
@@ -1129,7 +1203,7 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1129 return NULL; 1203 return NULL;
1130 1204
1131 size = 1 << size_bits; 1205 size = 1 << size_bits;
1132 hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL); 1206 hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL);
1133 1207
1134 if (!hash->buckets) { 1208 if (!hash->buckets) {
1135 kfree(hash); 1209 kfree(hash);
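
The kzalloc(size * n) to kcalloc(n, size) conversions scattered through this series (here and again in trace_events_filter.c below) are a small hardening and readability change: kcalloc() checks the element-count multiplication for overflow and returns NULL rather than quietly handing back a short buffer, even if the counts used here are too small for that to bite. The general shape is:

	/* before: the multiplication itself can overflow */
	buckets = kzalloc(sizeof(*buckets) * size, GFP_KERNEL);

	/* after: kcalloc() fails cleanly on overflow and still zeroes the memory */
	buckets = kcalloc(size, sizeof(*buckets), GFP_KERNEL);
	if (!buckets)
		return NULL;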
@@ -3146,8 +3220,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3146 mutex_lock(&ftrace_regex_lock); 3220 mutex_lock(&ftrace_regex_lock);
3147 if (reset) 3221 if (reset)
3148 ftrace_filter_reset(hash); 3222 ftrace_filter_reset(hash);
3149 if (buf) 3223 if (buf && !ftrace_match_records(hash, buf, len)) {
3150 ftrace_match_records(hash, buf, len); 3224 ret = -EINVAL;
3225 goto out_regex_unlock;
3226 }
3151 3227
3152 mutex_lock(&ftrace_lock); 3228 mutex_lock(&ftrace_lock);
3153 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 3229 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
@@ -3157,6 +3233,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3157 3233
3158 mutex_unlock(&ftrace_lock); 3234 mutex_unlock(&ftrace_lock);
3159 3235
3236 out_regex_unlock:
3160 mutex_unlock(&ftrace_regex_lock); 3237 mutex_unlock(&ftrace_regex_lock);
3161 3238
3162 free_ftrace_hash(hash); 3239 free_ftrace_hash(hash);
@@ -3173,10 +3250,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3173 * Filters denote which functions should be enabled when tracing is enabled. 3250 * Filters denote which functions should be enabled when tracing is enabled.
3174 * If @buf is NULL and reset is set, all functions will be enabled for tracing. 3251 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
3175 */ 3252 */
3176void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, 3253int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
3177 int len, int reset) 3254 int len, int reset)
3178{ 3255{
3179 ftrace_set_regex(ops, buf, len, reset, 1); 3256 return ftrace_set_regex(ops, buf, len, reset, 1);
3180} 3257}
3181EXPORT_SYMBOL_GPL(ftrace_set_filter); 3258EXPORT_SYMBOL_GPL(ftrace_set_filter);
3182 3259
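
Since ftrace_set_regex() can now fail (for example when a pattern matches no function), ftrace_set_filter() above and ftrace_set_notrace() just below return that error instead of void, and callers are expected to check it before registering. A hedged sketch with an illustrative ops and pattern:

	int err;

	err = ftrace_set_filter(&my_ops, "do_sys_open", strlen("do_sys_open"), 1);
	if (err)		/* e.g. -EINVAL: nothing matched the pattern */
		return err;

	return register_ftrace_function(&my_ops);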
@@ -3191,10 +3268,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);
3191 * is enabled. If @buf is NULL and reset is set, all functions will be enabled 3268 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
3192 * for tracing. 3269 * for tracing.
3193 */ 3270 */
3194void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, 3271int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
3195 int len, int reset) 3272 int len, int reset)
3196{ 3273{
3197 ftrace_set_regex(ops, buf, len, reset, 0); 3274 return ftrace_set_regex(ops, buf, len, reset, 0);
3198} 3275}
3199EXPORT_SYMBOL_GPL(ftrace_set_notrace); 3276EXPORT_SYMBOL_GPL(ftrace_set_notrace);
3200/** 3277/**
@@ -3871,6 +3948,36 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
3871#endif /* CONFIG_DYNAMIC_FTRACE */ 3948#endif /* CONFIG_DYNAMIC_FTRACE */
3872 3949
3873static void 3950static void
3951ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip)
3952{
3953 struct ftrace_ops *op;
3954
3955 if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT)))
3956 return;
3957
3958 /*
3959 * Some of the ops may be dynamically allocated,
3960 * they must be freed after a synchronize_sched().
3961 */
3962 preempt_disable_notrace();
3963 trace_recursion_set(TRACE_CONTROL_BIT);
3964 op = rcu_dereference_raw(ftrace_control_list);
3965 while (op != &ftrace_list_end) {
3966 if (!ftrace_function_local_disabled(op) &&
3967 ftrace_ops_test(op, ip))
3968 op->func(ip, parent_ip);
3969
3970 op = rcu_dereference_raw(op->next);
3971 };
3972 trace_recursion_clear(TRACE_CONTROL_BIT);
3973 preempt_enable_notrace();
3974}
3975
3976static struct ftrace_ops control_ops = {
3977 .func = ftrace_ops_control_func,
3978};
3979
3980static void
3874ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) 3981ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
3875{ 3982{
3876 struct ftrace_ops *op; 3983 struct ftrace_ops *op;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f5b7b5c1195b..cf8d11e91efd 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -154,33 +154,10 @@ enum {
154 154
155static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; 155static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
156 156
157#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) 157/* Used for individual buffers (after the counter) */
158 158#define RB_BUFFER_OFF (1 << 20)
159/**
160 * tracing_on - enable all tracing buffers
161 *
162 * This function enables all tracing buffers that may have been
163 * disabled with tracing_off.
164 */
165void tracing_on(void)
166{
167 set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
168}
169EXPORT_SYMBOL_GPL(tracing_on);
170 159
171/** 160#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
172 * tracing_off - turn off all tracing buffers
173 *
174 * This function stops all tracing buffers from recording data.
175 * It does not disable any overhead the tracers themselves may
176 * be causing. This function simply causes all recording to
177 * the ring buffers to fail.
178 */
179void tracing_off(void)
180{
181 clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
182}
183EXPORT_SYMBOL_GPL(tracing_off);
184 161
185/** 162/**
186 * tracing_off_permanent - permanently disable ring buffers 163 * tracing_off_permanent - permanently disable ring buffers
@@ -193,15 +170,6 @@ void tracing_off_permanent(void)
193 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); 170 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
194} 171}
195 172
196/**
197 * tracing_is_on - show state of ring buffers enabled
198 */
199int tracing_is_on(void)
200{
201 return ring_buffer_flags == RB_BUFFERS_ON;
202}
203EXPORT_SYMBOL_GPL(tracing_is_on);
204
205#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 173#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
206#define RB_ALIGNMENT 4U 174#define RB_ALIGNMENT 4U
207#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 175#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
@@ -2619,6 +2587,63 @@ void ring_buffer_record_enable(struct ring_buffer *buffer)
2619EXPORT_SYMBOL_GPL(ring_buffer_record_enable); 2587EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
2620 2588
2621/** 2589/**
2590 * ring_buffer_record_off - stop all writes into the buffer
2591 * @buffer: The ring buffer to stop writes to.
2592 *
2593 * This prevents all writes to the buffer. Any attempt to write
2594 * to the buffer after this will fail and return NULL.
2595 *
2596 * This is different than ring_buffer_record_disable() as
2597 * it works like an on/off switch, whereas the disable() version
2598 * must be paired with an enable().
2599 */
2600void ring_buffer_record_off(struct ring_buffer *buffer)
2601{
2602 unsigned int rd;
2603 unsigned int new_rd;
2604
2605 do {
2606 rd = atomic_read(&buffer->record_disabled);
2607 new_rd = rd | RB_BUFFER_OFF;
2608 } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
2609}
2610EXPORT_SYMBOL_GPL(ring_buffer_record_off);
2611
2612/**
2613 * ring_buffer_record_on - restart writes into the buffer
2614 * @buffer: The ring buffer to start writes to.
2615 *
2616 * This enables all writes to the buffer that was disabled by
2617 * ring_buffer_record_off().
2618 *
2619 * This is different than ring_buffer_record_enable() as
2620 * it works like an on/off switch, whereas the enable() version
2621 * must be paired with a disable().
2622 */
2623void ring_buffer_record_on(struct ring_buffer *buffer)
2624{
2625 unsigned int rd;
2626 unsigned int new_rd;
2627
2628 do {
2629 rd = atomic_read(&buffer->record_disabled);
2630 new_rd = rd & ~RB_BUFFER_OFF;
2631 } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
2632}
2633EXPORT_SYMBOL_GPL(ring_buffer_record_on);
2634
2635/**
2636 * ring_buffer_record_is_on - return true if the ring buffer can write
2637 * @buffer: The ring buffer to see if write is enabled
2638 *
2639 * Returns true if the ring buffer is in a state that it accepts writes.
2640 */
2641int ring_buffer_record_is_on(struct ring_buffer *buffer)
2642{
2643 return !atomic_read(&buffer->record_disabled);
2644}
2645
2646/**
2622 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer 2647 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
2623 * @buffer: The ring buffer to stop writes to. 2648 * @buffer: The ring buffer to stop writes to.
2624 * @cpu: The CPU buffer to stop 2649 * @cpu: The CPU buffer to stop
@@ -4039,68 +4064,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
4039} 4064}
4040EXPORT_SYMBOL_GPL(ring_buffer_read_page); 4065EXPORT_SYMBOL_GPL(ring_buffer_read_page);
4041 4066
4042#ifdef CONFIG_TRACING
4043static ssize_t
4044rb_simple_read(struct file *filp, char __user *ubuf,
4045 size_t cnt, loff_t *ppos)
4046{
4047 unsigned long *p = filp->private_data;
4048 char buf[64];
4049 int r;
4050
4051 if (test_bit(RB_BUFFERS_DISABLED_BIT, p))
4052 r = sprintf(buf, "permanently disabled\n");
4053 else
4054 r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p));
4055
4056 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
4057}
4058
4059static ssize_t
4060rb_simple_write(struct file *filp, const char __user *ubuf,
4061 size_t cnt, loff_t *ppos)
4062{
4063 unsigned long *p = filp->private_data;
4064 unsigned long val;
4065 int ret;
4066
4067 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4068 if (ret)
4069 return ret;
4070
4071 if (val)
4072 set_bit(RB_BUFFERS_ON_BIT, p);
4073 else
4074 clear_bit(RB_BUFFERS_ON_BIT, p);
4075
4076 (*ppos)++;
4077
4078 return cnt;
4079}
4080
4081static const struct file_operations rb_simple_fops = {
4082 .open = tracing_open_generic,
4083 .read = rb_simple_read,
4084 .write = rb_simple_write,
4085 .llseek = default_llseek,
4086};
4087
4088
4089static __init int rb_init_debugfs(void)
4090{
4091 struct dentry *d_tracer;
4092
4093 d_tracer = tracing_init_dentry();
4094
4095 trace_create_file("tracing_on", 0644, d_tracer,
4096 &ring_buffer_flags, &rb_simple_fops);
4097
4098 return 0;
4099}
4100
4101fs_initcall(rb_init_debugfs);
4102#endif
4103
4104#ifdef CONFIG_HOTPLUG_CPU 4067#ifdef CONFIG_HOTPLUG_CPU
4105static int rb_cpu_notify(struct notifier_block *self, 4068static int rb_cpu_notify(struct notifier_block *self,
4106 unsigned long action, void *hcpu) 4069 unsigned long action, void *hcpu)
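
ring_buffer_record_off()/_on() above deliberately avoid the counting semantics of record_disable()/_enable(): they set or clear one high bit (RB_BUFFER_OFF) in the same record_disabled word, using an atomic_cmpxchg() retry loop so concurrent counter updates are never lost. Outside the kernel the same read-modify-write idiom can be written with C11 atomics; a hedged, stand-alone sketch:

	#include <stdatomic.h>

	#define RB_BUFFER_OFF	(1u << 20)

	static _Atomic unsigned int record_disabled;

	static void record_off(void)
	{
		unsigned int old = atomic_load(&record_disabled);

		/* retry until old | RB_BUFFER_OFF is published; on failure
		 * 'old' is refreshed with the current value automatically */
		while (!atomic_compare_exchange_weak(&record_disabled, &old,
						     old | RB_BUFFER_OFF))
			;
	}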
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a3f1bc5d2a00..ed7b5d1e12f4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -36,6 +36,7 @@
36#include <linux/ctype.h> 36#include <linux/ctype.h>
37#include <linux/init.h> 37#include <linux/init.h>
38#include <linux/poll.h> 38#include <linux/poll.h>
39#include <linux/nmi.h>
39#include <linux/fs.h> 40#include <linux/fs.h>
40 41
41#include "trace.h" 42#include "trace.h"
@@ -352,6 +353,59 @@ static void wakeup_work_handler(struct work_struct *work)
352static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); 353static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler);
353 354
354/** 355/**
356 * tracing_on - enable tracing buffers
357 *
358 * This function enables tracing buffers that may have been
359 * disabled with tracing_off.
360 */
361void tracing_on(void)
362{
363 if (global_trace.buffer)
364 ring_buffer_record_on(global_trace.buffer);
365 /*
366 * This flag is only looked at when buffers haven't been
367 * allocated yet. We don't really care about the race
368 * between setting this flag and actually turning
369 * on the buffer.
370 */
371 global_trace.buffer_disabled = 0;
372}
373EXPORT_SYMBOL_GPL(tracing_on);
374
375/**
376 * tracing_off - turn off tracing buffers
377 *
378 * This function stops the tracing buffers from recording data.
379 * It does not disable any overhead the tracers themselves may
380 * be causing. This function simply causes all recording to
381 * the ring buffers to fail.
382 */
383void tracing_off(void)
384{
385 if (global_trace.buffer)
386 ring_buffer_record_off(global_trace.buffer);
387 /*
388 * This flag is only looked at when buffers haven't been
389 * allocated yet. We don't really care about the race
390 * between setting this flag and actually turning
391 * on the buffer.
392 */
393 global_trace.buffer_disabled = 1;
394}
395EXPORT_SYMBOL_GPL(tracing_off);
396
397/**
398 * tracing_is_on - show state of ring buffers enabled
399 */
400int tracing_is_on(void)
401{
402 if (global_trace.buffer)
403 return ring_buffer_record_is_on(global_trace.buffer);
404 return !global_trace.buffer_disabled;
405}
406EXPORT_SYMBOL_GPL(tracing_is_on);
407
408/**
355 * trace_wake_up - wake up tasks waiting for trace input 409 * trace_wake_up - wake up tasks waiting for trace input
356 * 410 *
357 * Schedules a delayed work to wake up any task that is blocked on the 411 * Schedules a delayed work to wake up any task that is blocked on the
@@ -1644,6 +1698,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1644 int cpu_file = iter->cpu_file; 1698 int cpu_file = iter->cpu_file;
1645 u64 next_ts = 0, ts; 1699 u64 next_ts = 0, ts;
1646 int next_cpu = -1; 1700 int next_cpu = -1;
1701 int next_size = 0;
1647 int cpu; 1702 int cpu;
1648 1703
1649 /* 1704 /*
@@ -1675,9 +1730,12 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1675 next_cpu = cpu; 1730 next_cpu = cpu;
1676 next_ts = ts; 1731 next_ts = ts;
1677 next_lost = lost_events; 1732 next_lost = lost_events;
1733 next_size = iter->ent_size;
1678 } 1734 }
1679 } 1735 }
1680 1736
1737 iter->ent_size = next_size;
1738
1681 if (ent_cpu) 1739 if (ent_cpu)
1682 *ent_cpu = next_cpu; 1740 *ent_cpu = next_cpu;
1683 1741
@@ -2764,12 +2822,12 @@ static const char readme_msg[] =
2764 "tracing mini-HOWTO:\n\n" 2822 "tracing mini-HOWTO:\n\n"
2765 "# mount -t debugfs nodev /sys/kernel/debug\n\n" 2823 "# mount -t debugfs nodev /sys/kernel/debug\n\n"
2766 "# cat /sys/kernel/debug/tracing/available_tracers\n" 2824 "# cat /sys/kernel/debug/tracing/available_tracers\n"
2767 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" 2825 "wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n"
2768 "# cat /sys/kernel/debug/tracing/current_tracer\n" 2826 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2769 "nop\n" 2827 "nop\n"
2770 "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n" 2828 "# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n"
2771 "# cat /sys/kernel/debug/tracing/current_tracer\n" 2829 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2772 "sched_switch\n" 2830 "wakeup\n"
2773 "# cat /sys/kernel/debug/tracing/trace_options\n" 2831 "# cat /sys/kernel/debug/tracing/trace_options\n"
2774 "noprint-parent nosym-offset nosym-addr noverbose\n" 2832 "noprint-parent nosym-offset nosym-addr noverbose\n"
2775 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" 2833 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
@@ -4567,6 +4625,55 @@ static __init void create_trace_options_dir(void)
4567 create_trace_option_core_file(trace_options[i], i); 4625 create_trace_option_core_file(trace_options[i], i);
4568} 4626}
4569 4627
4628static ssize_t
4629rb_simple_read(struct file *filp, char __user *ubuf,
4630 size_t cnt, loff_t *ppos)
4631{
4632 struct ring_buffer *buffer = filp->private_data;
4633 char buf[64];
4634 int r;
4635
4636 if (buffer)
4637 r = ring_buffer_record_is_on(buffer);
4638 else
4639 r = 0;
4640
4641 r = sprintf(buf, "%d\n", r);
4642
4643 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
4644}
4645
4646static ssize_t
4647rb_simple_write(struct file *filp, const char __user *ubuf,
4648 size_t cnt, loff_t *ppos)
4649{
4650 struct ring_buffer *buffer = filp->private_data;
4651 unsigned long val;
4652 int ret;
4653
4654 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4655 if (ret)
4656 return ret;
4657
4658 if (buffer) {
4659 if (val)
4660 ring_buffer_record_on(buffer);
4661 else
4662 ring_buffer_record_off(buffer);
4663 }
4664
4665 (*ppos)++;
4666
4667 return cnt;
4668}
4669
4670static const struct file_operations rb_simple_fops = {
4671 .open = tracing_open_generic,
4672 .read = rb_simple_read,
4673 .write = rb_simple_write,
4674 .llseek = default_llseek,
4675};
4676
4570static __init int tracer_init_debugfs(void) 4677static __init int tracer_init_debugfs(void)
4571{ 4678{
4572 struct dentry *d_tracer; 4679 struct dentry *d_tracer;
@@ -4626,6 +4733,9 @@ static __init int tracer_init_debugfs(void)
4626 trace_create_file("trace_clock", 0644, d_tracer, NULL, 4733 trace_create_file("trace_clock", 0644, d_tracer, NULL,
4627 &trace_clock_fops); 4734 &trace_clock_fops);
4628 4735
4736 trace_create_file("tracing_on", 0644, d_tracer,
4737 global_trace.buffer, &rb_simple_fops);
4738
4629#ifdef CONFIG_DYNAMIC_FTRACE 4739#ifdef CONFIG_DYNAMIC_FTRACE
4630 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4740 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
4631 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 4741 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -4798,6 +4908,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4798 if (ret != TRACE_TYPE_NO_CONSUME) 4908 if (ret != TRACE_TYPE_NO_CONSUME)
4799 trace_consume(&iter); 4909 trace_consume(&iter);
4800 } 4910 }
4911 touch_nmi_watchdog();
4801 4912
4802 trace_printk_seq(&iter.seq); 4913 trace_printk_seq(&iter.seq);
4803 } 4914 }
@@ -4863,6 +4974,8 @@ __init static int tracer_alloc_buffers(void)
4863 goto out_free_cpumask; 4974 goto out_free_cpumask;
4864 } 4975 }
4865 global_trace.entries = ring_buffer_size(global_trace.buffer); 4976 global_trace.entries = ring_buffer_size(global_trace.buffer);
4977 if (global_trace.buffer_disabled)
4978 tracing_off();
4866 4979
4867 4980
4868#ifdef CONFIG_TRACER_MAX_TRACE 4981#ifdef CONFIG_TRACER_MAX_TRACE
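
Net effect of the trace.c side: tracing_on(), tracing_off() and tracing_is_on() now act on the global ring buffer itself, falling back to the new buffer_disabled flag only before the buffer has been allocated, and the debugfs "tracing_on" file is wired straight to ring_buffer_record_on()/_off() rather than the removed RB_BUFFERS_ON bit. In-kernel users keep the familiar pattern, sketched here around an illustrative region of interest:

	tracing_on();
	suspicious_operation();		/* placeholder for the code being debugged */
	tracing_off();			/* freeze the buffer for later inspection */

From userspace the switch is unchanged, echo 0 or 1 into /sys/kernel/debug/tracing/tracing_on; only the backing implementation moved.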
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b93ecbadad6d..95059f091a24 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -56,17 +56,23 @@ enum trace_type {
56#define F_STRUCT(args...) args 56#define F_STRUCT(args...) args
57 57
58#undef FTRACE_ENTRY 58#undef FTRACE_ENTRY
59#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 59#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
60 struct struct_name { \ 60 struct struct_name { \
61 struct trace_entry ent; \ 61 struct trace_entry ent; \
62 tstruct \ 62 tstruct \
63 } 63 }
64 64
65#undef TP_ARGS 65#undef TP_ARGS
66#define TP_ARGS(args...) args 66#define TP_ARGS(args...) args
67 67
68#undef FTRACE_ENTRY_DUP 68#undef FTRACE_ENTRY_DUP
69#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) 69#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter)
70
71#undef FTRACE_ENTRY_REG
72#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
73 filter, regfn) \
74 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
75 filter)
70 76
71#include "trace_entries.h" 77#include "trace_entries.h"
72 78
@@ -148,6 +154,7 @@ struct trace_array {
148 struct ring_buffer *buffer; 154 struct ring_buffer *buffer;
149 unsigned long entries; 155 unsigned long entries;
150 int cpu; 156 int cpu;
157 int buffer_disabled;
151 cycle_t time_start; 158 cycle_t time_start;
152 struct task_struct *waiter; 159 struct task_struct *waiter;
153 struct trace_array_cpu *data[NR_CPUS]; 160 struct trace_array_cpu *data[NR_CPUS];
@@ -288,6 +295,8 @@ struct tracer {
288/* for function tracing recursion */ 295/* for function tracing recursion */
289#define TRACE_INTERNAL_BIT (1<<11) 296#define TRACE_INTERNAL_BIT (1<<11)
290#define TRACE_GLOBAL_BIT (1<<12) 297#define TRACE_GLOBAL_BIT (1<<12)
298#define TRACE_CONTROL_BIT (1<<13)
299
291/* 300/*
292 * Abuse of the trace_recursion. 301 * Abuse of the trace_recursion.
293 * As we need a way to maintain state if we are tracing the function 302 * As we need a way to maintain state if we are tracing the function
@@ -589,6 +598,8 @@ static inline int ftrace_trace_task(struct task_struct *task)
589static inline int ftrace_is_dead(void) { return 0; } 598static inline int ftrace_is_dead(void) { return 0; }
590#endif 599#endif
591 600
601int ftrace_event_is_function(struct ftrace_event_call *call);
602
592/* 603/*
593 * struct trace_parser - servers for reading the user input separated by spaces 604 * struct trace_parser - servers for reading the user input separated by spaces
594 * @cont: set if the input is not complete - no final space char was found 605 * @cont: set if the input is not complete - no final space char was found
@@ -766,9 +777,7 @@ struct filter_pred {
766 u64 val; 777 u64 val;
767 struct regex regex; 778 struct regex regex;
768 unsigned short *ops; 779 unsigned short *ops;
769#ifdef CONFIG_FTRACE_STARTUP_TEST
770 struct ftrace_event_field *field; 780 struct ftrace_event_field *field;
771#endif
772 int offset; 781 int offset;
773 int not; 782 int not;
774 int op; 783 int op;
@@ -818,12 +827,20 @@ extern const char *__start___trace_bprintk_fmt[];
818extern const char *__stop___trace_bprintk_fmt[]; 827extern const char *__stop___trace_bprintk_fmt[];
819 828
820#undef FTRACE_ENTRY 829#undef FTRACE_ENTRY
821#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ 830#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
822 extern struct ftrace_event_call \ 831 extern struct ftrace_event_call \
823 __attribute__((__aligned__(4))) event_##call; 832 __attribute__((__aligned__(4))) event_##call;
824#undef FTRACE_ENTRY_DUP 833#undef FTRACE_ENTRY_DUP
825#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ 834#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
826 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 835 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
836 filter)
827#include "trace_entries.h" 837#include "trace_entries.h"
828 838
839#ifdef CONFIG_FUNCTION_TRACER
840int perf_ftrace_event_register(struct ftrace_event_call *call,
841 enum trace_reg type, void *data);
842#else
843#define perf_ftrace_event_register NULL
844#endif /* CONFIG_FUNCTION_TRACER */
845
829#endif /* _LINUX_KERNEL_TRACE_H */ 846#endif /* _LINUX_KERNEL_TRACE_H */
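
The extra filter and regfn macro parameters only matter to the consumers that want them; everybody else maps FTRACE_ENTRY_REG back onto FTRACE_ENTRY and drops the register function, exactly as the header does above. A stripped-down illustration of that multi-include pattern (toy definitions, not the real ones):

	/* a consumer interested only in struct layout */
	#undef  FTRACE_ENTRY
	#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter)	\
		struct struct_name {						\
			struct trace_entry ent;					\
			tstruct							\
		};

	#undef  FTRACE_ENTRY_REG
	#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print,	\
				 filter, regfn)					\
		FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct),		\
			     PARAMS(print), filter)

	#include "trace_entries.h"	/* every entry, REG or not, emits a struct */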
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 93365907f219..4108e1250ca2 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -55,7 +55,7 @@
55/* 55/*
56 * Function trace entry - function address and parent function address: 56 * Function trace entry - function address and parent function address:
57 */ 57 */
58FTRACE_ENTRY(function, ftrace_entry, 58FTRACE_ENTRY_REG(function, ftrace_entry,
59 59
60 TRACE_FN, 60 TRACE_FN,
61 61
@@ -64,7 +64,11 @@ FTRACE_ENTRY(function, ftrace_entry,
64 __field( unsigned long, parent_ip ) 64 __field( unsigned long, parent_ip )
65 ), 65 ),
66 66
67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) 67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip),
68
69 FILTER_TRACE_FN,
70
71 perf_ftrace_event_register
68); 72);
69 73
70/* Function call entry */ 74/* Function call entry */
@@ -78,7 +82,9 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
78 __field_desc( int, graph_ent, depth ) 82 __field_desc( int, graph_ent, depth )
79 ), 83 ),
80 84
81 F_printk("--> %lx (%d)", __entry->func, __entry->depth) 85 F_printk("--> %lx (%d)", __entry->func, __entry->depth),
86
87 FILTER_OTHER
82); 88);
83 89
84/* Function return entry */ 90/* Function return entry */
@@ -98,7 +104,9 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
98 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", 104 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
99 __entry->func, __entry->depth, 105 __entry->func, __entry->depth,
100 __entry->calltime, __entry->rettime, 106 __entry->calltime, __entry->rettime,
101 __entry->depth) 107 __entry->depth),
108
109 FILTER_OTHER
102); 110);
103 111
104/* 112/*
@@ -127,8 +135,9 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry,
127 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", 135 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
128 __entry->prev_pid, __entry->prev_prio, __entry->prev_state, 136 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
129 __entry->next_pid, __entry->next_prio, __entry->next_state, 137 __entry->next_pid, __entry->next_prio, __entry->next_state,
130 __entry->next_cpu 138 __entry->next_cpu),
131 ) 139
140 FILTER_OTHER
132); 141);
133 142
134/* 143/*
@@ -146,8 +155,9 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
146 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", 155 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
147 __entry->prev_pid, __entry->prev_prio, __entry->prev_state, 156 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
148 __entry->next_pid, __entry->next_prio, __entry->next_state, 157 __entry->next_pid, __entry->next_prio, __entry->next_state,
149 __entry->next_cpu 158 __entry->next_cpu),
150 ) 159
160 FILTER_OTHER
151); 161);
152 162
153/* 163/*
@@ -156,6 +166,12 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
156 166
157#define FTRACE_STACK_ENTRIES 8 167#define FTRACE_STACK_ENTRIES 8
158 168
169#ifndef CONFIG_64BIT
170# define IP_FMT "%08lx"
171#else
172# define IP_FMT "%016lx"
173#endif
174
159FTRACE_ENTRY(kernel_stack, stack_entry, 175FTRACE_ENTRY(kernel_stack, stack_entry,
160 176
161 TRACE_STACK, 177 TRACE_STACK,
@@ -165,11 +181,14 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
165 __dynamic_array(unsigned long, caller ) 181 __dynamic_array(unsigned long, caller )
166 ), 182 ),
167 183
168 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" 184 F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
169 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", 185 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
186 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",
170 __entry->caller[0], __entry->caller[1], __entry->caller[2], 187 __entry->caller[0], __entry->caller[1], __entry->caller[2],
171 __entry->caller[3], __entry->caller[4], __entry->caller[5], 188 __entry->caller[3], __entry->caller[4], __entry->caller[5],
172 __entry->caller[6], __entry->caller[7]) 189 __entry->caller[6], __entry->caller[7]),
190
191 FILTER_OTHER
173); 192);
174 193
175FTRACE_ENTRY(user_stack, userstack_entry, 194FTRACE_ENTRY(user_stack, userstack_entry,
@@ -181,11 +200,14 @@ FTRACE_ENTRY(user_stack, userstack_entry,
181 __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) 200 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
182 ), 201 ),
183 202
184 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" 203 F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
185 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", 204 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
205 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",
186 __entry->caller[0], __entry->caller[1], __entry->caller[2], 206 __entry->caller[0], __entry->caller[1], __entry->caller[2],
187 __entry->caller[3], __entry->caller[4], __entry->caller[5], 207 __entry->caller[3], __entry->caller[4], __entry->caller[5],
188 __entry->caller[6], __entry->caller[7]) 208 __entry->caller[6], __entry->caller[7]),
209
210 FILTER_OTHER
189); 211);
190 212
191/* 213/*
@@ -202,7 +224,9 @@ FTRACE_ENTRY(bprint, bprint_entry,
202 ), 224 ),
203 225
204 F_printk("%08lx fmt:%p", 226 F_printk("%08lx fmt:%p",
205 __entry->ip, __entry->fmt) 227 __entry->ip, __entry->fmt),
228
229 FILTER_OTHER
206); 230);
207 231
208FTRACE_ENTRY(print, print_entry, 232FTRACE_ENTRY(print, print_entry,
@@ -215,7 +239,9 @@ FTRACE_ENTRY(print, print_entry,
215 ), 239 ),
216 240
217 F_printk("%08lx %s", 241 F_printk("%08lx %s",
218 __entry->ip, __entry->buf) 242 __entry->ip, __entry->buf),
243
244 FILTER_OTHER
219); 245);
220 246
221FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, 247FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
@@ -234,7 +260,9 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
234 260
235 F_printk("%lx %lx %lx %d %x %x", 261 F_printk("%lx %lx %lx %d %x %x",
236 (unsigned long)__entry->phys, __entry->value, __entry->pc, 262 (unsigned long)__entry->phys, __entry->value, __entry->pc,
237 __entry->map_id, __entry->opcode, __entry->width) 263 __entry->map_id, __entry->opcode, __entry->width),
264
265 FILTER_OTHER
238); 266);
239 267
240FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, 268FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
@@ -252,7 +280,9 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
252 280
253 F_printk("%lx %lx %lx %d %x", 281 F_printk("%lx %lx %lx %d %x",
254 (unsigned long)__entry->phys, __entry->virt, __entry->len, 282 (unsigned long)__entry->phys, __entry->virt, __entry->len,
255 __entry->map_id, __entry->opcode) 283 __entry->map_id, __entry->opcode),
284
285 FILTER_OTHER
256); 286);
257 287
258 288
@@ -272,6 +302,8 @@ FTRACE_ENTRY(branch, trace_branch,
272 302
273 F_printk("%u:%s:%s (%u)", 303 F_printk("%u:%s:%s (%u)",
274 __entry->line, 304 __entry->line,
275 __entry->func, __entry->file, __entry->correct) 305 __entry->func, __entry->file, __entry->correct),
306
307 FILTER_OTHER
276); 308);
277 309
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 19a359d5e6d5..fee3752ae8f6 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -24,6 +24,11 @@ static int total_ref_count;
24static int perf_trace_event_perm(struct ftrace_event_call *tp_event, 24static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 25 struct perf_event *p_event)
26{ 26{
27 /* The ftrace function trace is allowed only for root. */
28 if (ftrace_event_is_function(tp_event) &&
29 perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
30 return -EPERM;
31
27 /* No tracing, just counting, so no obvious leak */ 32 /* No tracing, just counting, so no obvious leak */
28 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) 33 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
29 return 0; 34 return 0;
@@ -44,23 +49,17 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
44 return 0; 49 return 0;
45} 50}
46 51
47static int perf_trace_event_init(struct ftrace_event_call *tp_event, 52static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
48 struct perf_event *p_event) 53 struct perf_event *p_event)
49{ 54{
50 struct hlist_head __percpu *list; 55 struct hlist_head __percpu *list;
51 int ret; 56 int ret = -ENOMEM;
52 int cpu; 57 int cpu;
53 58
54 ret = perf_trace_event_perm(tp_event, p_event);
55 if (ret)
56 return ret;
57
58 p_event->tp_event = tp_event; 59 p_event->tp_event = tp_event;
59 if (tp_event->perf_refcount++ > 0) 60 if (tp_event->perf_refcount++ > 0)
60 return 0; 61 return 0;
61 62
62 ret = -ENOMEM;
63
64 list = alloc_percpu(struct hlist_head); 63 list = alloc_percpu(struct hlist_head);
65 if (!list) 64 if (!list)
66 goto fail; 65 goto fail;
@@ -83,7 +82,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
83 } 82 }
84 } 83 }
85 84
86 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); 85 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
87 if (ret) 86 if (ret)
88 goto fail; 87 goto fail;
89 88
@@ -108,6 +107,69 @@ fail:
108 return ret; 107 return ret;
109} 108}
110 109
110static void perf_trace_event_unreg(struct perf_event *p_event)
111{
112 struct ftrace_event_call *tp_event = p_event->tp_event;
113 int i;
114
115 if (--tp_event->perf_refcount > 0)
116 goto out;
117
118 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);
119
120 /*
121 * Ensure our callback won't be called anymore. The buffers
122 * will be freed after that.
123 */
124 tracepoint_synchronize_unregister();
125
126 free_percpu(tp_event->perf_events);
127 tp_event->perf_events = NULL;
128
129 if (!--total_ref_count) {
130 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
131 free_percpu(perf_trace_buf[i]);
132 perf_trace_buf[i] = NULL;
133 }
134 }
135out:
136 module_put(tp_event->mod);
137}
138
139static int perf_trace_event_open(struct perf_event *p_event)
140{
141 struct ftrace_event_call *tp_event = p_event->tp_event;
142 return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
143}
144
145static void perf_trace_event_close(struct perf_event *p_event)
146{
147 struct ftrace_event_call *tp_event = p_event->tp_event;
148 tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
149}
150
151static int perf_trace_event_init(struct ftrace_event_call *tp_event,
152 struct perf_event *p_event)
153{
154 int ret;
155
156 ret = perf_trace_event_perm(tp_event, p_event);
157 if (ret)
158 return ret;
159
160 ret = perf_trace_event_reg(tp_event, p_event);
161 if (ret)
162 return ret;
163
164 ret = perf_trace_event_open(p_event);
165 if (ret) {
166 perf_trace_event_unreg(p_event);
167 return ret;
168 }
169
170 return 0;
171}
172
111int perf_trace_init(struct perf_event *p_event) 173int perf_trace_init(struct perf_event *p_event)
112{ 174{
113 struct ftrace_event_call *tp_event; 175 struct ftrace_event_call *tp_event;
@@ -130,6 +192,14 @@ int perf_trace_init(struct perf_event *p_event)
130 return ret; 192 return ret;
131} 193}
132 194
195void perf_trace_destroy(struct perf_event *p_event)
196{
197 mutex_lock(&event_mutex);
198 perf_trace_event_close(p_event);
199 perf_trace_event_unreg(p_event);
200 mutex_unlock(&event_mutex);
201}
202
133int perf_trace_add(struct perf_event *p_event, int flags) 203int perf_trace_add(struct perf_event *p_event, int flags)
134{ 204{
135 struct ftrace_event_call *tp_event = p_event->tp_event; 205 struct ftrace_event_call *tp_event = p_event->tp_event;
@@ -146,43 +216,14 @@ int perf_trace_add(struct perf_event *p_event, int flags)
146 list = this_cpu_ptr(pcpu_list); 216 list = this_cpu_ptr(pcpu_list);
147 hlist_add_head_rcu(&p_event->hlist_entry, list); 217 hlist_add_head_rcu(&p_event->hlist_entry, list);
148 218
149 return 0; 219 return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
150} 220}
151 221
152void perf_trace_del(struct perf_event *p_event, int flags) 222void perf_trace_del(struct perf_event *p_event, int flags)
153{ 223{
154 hlist_del_rcu(&p_event->hlist_entry);
155}
156
157void perf_trace_destroy(struct perf_event *p_event)
158{
159 struct ftrace_event_call *tp_event = p_event->tp_event; 224 struct ftrace_event_call *tp_event = p_event->tp_event;
160 int i; 225 hlist_del_rcu(&p_event->hlist_entry);
161 226 tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
162 mutex_lock(&event_mutex);
163 if (--tp_event->perf_refcount > 0)
164 goto out;
165
166 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
167
168 /*
169 * Ensure our callback won't be called anymore. The buffers
170 * will be freed after that.
171 */
172 tracepoint_synchronize_unregister();
173
174 free_percpu(tp_event->perf_events);
175 tp_event->perf_events = NULL;
176
177 if (!--total_ref_count) {
178 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
179 free_percpu(perf_trace_buf[i]);
180 perf_trace_buf[i] = NULL;
181 }
182 }
183out:
184 module_put(tp_event->mod);
185 mutex_unlock(&event_mutex);
186} 227}
187 228
188__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, 229__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
@@ -214,3 +255,86 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
214 return raw_data; 255 return raw_data;
215} 256}
216EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); 257EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
258
259#ifdef CONFIG_FUNCTION_TRACER
260static void
261perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip)
262{
263 struct ftrace_entry *entry;
264 struct hlist_head *head;
265 struct pt_regs regs;
266 int rctx;
267
268#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
269 sizeof(u64)) - sizeof(u32))
270
271 BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);
272
273 perf_fetch_caller_regs(&regs);
274
275 entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
276 if (!entry)
277 return;
278
279 entry->ip = ip;
280 entry->parent_ip = parent_ip;
281
282 head = this_cpu_ptr(event_function.perf_events);
283 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
284 1, &regs, head);
285
286#undef ENTRY_SIZE
287}
288
289static int perf_ftrace_function_register(struct perf_event *event)
290{
291 struct ftrace_ops *ops = &event->ftrace_ops;
292
293 ops->flags |= FTRACE_OPS_FL_CONTROL;
294 ops->func = perf_ftrace_function_call;
295 return register_ftrace_function(ops);
296}
297
298static int perf_ftrace_function_unregister(struct perf_event *event)
299{
300 struct ftrace_ops *ops = &event->ftrace_ops;
301 int ret = unregister_ftrace_function(ops);
302 ftrace_free_filter(ops);
303 return ret;
304}
305
306static void perf_ftrace_function_enable(struct perf_event *event)
307{
308 ftrace_function_local_enable(&event->ftrace_ops);
309}
310
311static void perf_ftrace_function_disable(struct perf_event *event)
312{
313 ftrace_function_local_disable(&event->ftrace_ops);
314}
315
316int perf_ftrace_event_register(struct ftrace_event_call *call,
317 enum trace_reg type, void *data)
318{
319 switch (type) {
320 case TRACE_REG_REGISTER:
321 case TRACE_REG_UNREGISTER:
322 break;
323 case TRACE_REG_PERF_REGISTER:
324 case TRACE_REG_PERF_UNREGISTER:
325 return 0;
326 case TRACE_REG_PERF_OPEN:
327 return perf_ftrace_function_register(data);
328 case TRACE_REG_PERF_CLOSE:
329 return perf_ftrace_function_unregister(data);
330 case TRACE_REG_PERF_ADD:
331 perf_ftrace_function_enable(data);
332 return 0;
333 case TRACE_REG_PERF_DEL:
334 perf_ftrace_function_disable(data);
335 return 0;
336 }
337
338 return -EINVAL;
339}
340#endif /* CONFIG_FUNCTION_TRACER */
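
perf now drives the whole event lifecycle through ->reg(): perf_trace_init() ends in OPEN, perf_trace_add()/del() forward ADD/DEL on every sched-in and sched-out, and perf_trace_destroy() issues CLOSE before the final UNREGISTER. An event class with nothing special to do simply acknowledges the new types (as ftrace_event_reg() does in trace_events.c below), while the function event dispatches them to real work in perf_ftrace_event_register() above. A sketch of the minimal dispatch shape for a hypothetical class (my_event_reg() is illustrative only):

	static int my_event_reg(struct ftrace_event_call *call,
				enum trace_reg type, void *data)
	{
		switch (type) {
		case TRACE_REG_REGISTER:		/* ftrace attach/detach */
		case TRACE_REG_UNREGISTER:
			return 0;
	#ifdef CONFIG_PERF_EVENTS
		case TRACE_REG_PERF_REGISTER:		/* first perf user */
		case TRACE_REG_PERF_UNREGISTER:		/* last perf user gone */
		case TRACE_REG_PERF_OPEN:		/* data == the struct perf_event */
		case TRACE_REG_PERF_CLOSE:
		case TRACE_REG_PERF_ADD:		/* event scheduled in on this CPU */
		case TRACE_REG_PERF_DEL:
			return 0;
	#endif
		}
		return -EINVAL;
	}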
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c212a7f934ec..079a93ae8a9d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -147,7 +147,8 @@ int trace_event_raw_init(struct ftrace_event_call *call)
147} 147}
148EXPORT_SYMBOL_GPL(trace_event_raw_init); 148EXPORT_SYMBOL_GPL(trace_event_raw_init);
149 149
150int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) 150int ftrace_event_reg(struct ftrace_event_call *call,
151 enum trace_reg type, void *data)
151{ 152{
152 switch (type) { 153 switch (type) {
153 case TRACE_REG_REGISTER: 154 case TRACE_REG_REGISTER:
@@ -170,6 +171,11 @@ int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
170 call->class->perf_probe, 171 call->class->perf_probe,
171 call); 172 call);
172 return 0; 173 return 0;
174 case TRACE_REG_PERF_OPEN:
175 case TRACE_REG_PERF_CLOSE:
176 case TRACE_REG_PERF_ADD:
177 case TRACE_REG_PERF_DEL:
178 return 0;
173#endif 179#endif
174 } 180 }
175 return 0; 181 return 0;
@@ -209,7 +215,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
209 tracing_stop_cmdline_record(); 215 tracing_stop_cmdline_record();
210 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; 216 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
211 } 217 }
212 call->class->reg(call, TRACE_REG_UNREGISTER); 218 call->class->reg(call, TRACE_REG_UNREGISTER, NULL);
213 } 219 }
214 break; 220 break;
215 case 1: 221 case 1:
@@ -218,7 +224,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
218 tracing_start_cmdline_record(); 224 tracing_start_cmdline_record();
219 call->flags |= TRACE_EVENT_FL_RECORDED_CMD; 225 call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
220 } 226 }
221 ret = call->class->reg(call, TRACE_REG_REGISTER); 227 ret = call->class->reg(call, TRACE_REG_REGISTER, NULL);
222 if (ret) { 228 if (ret) {
223 tracing_stop_cmdline_record(); 229 tracing_stop_cmdline_record();
224 pr_info("event trace: Could not enable event " 230 pr_info("event trace: Could not enable event "
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 24aee7127451..431dba8b7542 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -81,6 +81,7 @@ enum {
81 FILT_ERR_TOO_MANY_PREDS, 81 FILT_ERR_TOO_MANY_PREDS,
82 FILT_ERR_MISSING_FIELD, 82 FILT_ERR_MISSING_FIELD,
83 FILT_ERR_INVALID_FILTER, 83 FILT_ERR_INVALID_FILTER,
84 FILT_ERR_IP_FIELD_ONLY,
84}; 85};
85 86
86static char *err_text[] = { 87static char *err_text[] = {
@@ -96,6 +97,7 @@ static char *err_text[] = {
96 "Too many terms in predicate expression", 97 "Too many terms in predicate expression",
97 "Missing field name and/or value", 98 "Missing field name and/or value",
98 "Meaningless filter expression", 99 "Meaningless filter expression",
100 "Only 'ip' field is supported for function trace",
99}; 101};
100 102
101struct opstack_op { 103struct opstack_op {
@@ -685,7 +687,7 @@ find_event_field(struct ftrace_event_call *call, char *name)
685 687
686static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) 688static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
687{ 689{
688 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); 690 stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL);
689 if (!stack->preds) 691 if (!stack->preds)
690 return -ENOMEM; 692 return -ENOMEM;
691 stack->index = n_preds; 693 stack->index = n_preds;
@@ -826,8 +828,7 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
826 if (filter->preds) 828 if (filter->preds)
827 __free_preds(filter); 829 __free_preds(filter);
828 830
829 filter->preds = 831 filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL);
830 kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
831 832
832 if (!filter->preds) 833 if (!filter->preds)
833 return -ENOMEM; 834 return -ENOMEM;
@@ -900,6 +901,11 @@ int filter_assign_type(const char *type)
900 return FILTER_OTHER; 901 return FILTER_OTHER;
901} 902}
902 903
904static bool is_function_field(struct ftrace_event_field *field)
905{
906 return field->filter_type == FILTER_TRACE_FN;
907}
908
903static bool is_string_field(struct ftrace_event_field *field) 909static bool is_string_field(struct ftrace_event_field *field)
904{ 910{
905 return field->filter_type == FILTER_DYN_STRING || 911 return field->filter_type == FILTER_DYN_STRING ||
@@ -987,6 +993,11 @@ static int init_pred(struct filter_parse_state *ps,
987 fn = filter_pred_strloc; 993 fn = filter_pred_strloc;
988 else 994 else
989 fn = filter_pred_pchar; 995 fn = filter_pred_pchar;
996 } else if (is_function_field(field)) {
997 if (strcmp(field->name, "ip")) {
998 parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0);
999 return -EINVAL;
1000 }
990 } else { 1001 } else {
991 if (field->is_signed) 1002 if (field->is_signed)
992 ret = strict_strtoll(pred->regex.pattern, 0, &val); 1003 ret = strict_strtoll(pred->regex.pattern, 0, &val);
@@ -1334,10 +1345,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,
1334 1345
1335 strcpy(pred.regex.pattern, operand2); 1346 strcpy(pred.regex.pattern, operand2);
1336 pred.regex.len = strlen(pred.regex.pattern); 1347 pred.regex.len = strlen(pred.regex.pattern);
1337
1338#ifdef CONFIG_FTRACE_STARTUP_TEST
1339 pred.field = field; 1348 pred.field = field;
1340#endif
1341 return init_pred(ps, field, &pred) ? NULL : &pred; 1349 return init_pred(ps, field, &pred) ? NULL : &pred;
1342} 1350}
1343 1351
@@ -1486,7 +1494,7 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1486 children = count_leafs(preds, &preds[root->left]); 1494 children = count_leafs(preds, &preds[root->left]);
1487 children += count_leafs(preds, &preds[root->right]); 1495 children += count_leafs(preds, &preds[root->right]);
1488 1496
1489 root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); 1497 root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL);
1490 if (!root->ops) 1498 if (!root->ops)
1491 return -ENOMEM; 1499 return -ENOMEM;
1492 1500
@@ -1950,6 +1958,148 @@ void ftrace_profile_free_filter(struct perf_event *event)
1950 __free_filter(filter); 1958 __free_filter(filter);
1951} 1959}
1952 1960
1961struct function_filter_data {
1962 struct ftrace_ops *ops;
1963 int first_filter;
1964 int first_notrace;
1965};
1966
1967#ifdef CONFIG_FUNCTION_TRACER
1968static char **
1969ftrace_function_filter_re(char *buf, int len, int *count)
1970{
1971 char *str, *sep, **re;
1972
1973 str = kstrndup(buf, len, GFP_KERNEL);
1974 if (!str)
1975 return NULL;
1976
1977 /*
1978 * The argv_split function takes white space
1979 * as a separator, so convert ',' into spaces.
1980 */
1981 while ((sep = strchr(str, ',')))
1982 *sep = ' ';
1983
1984 re = argv_split(GFP_KERNEL, str, count);
1985 kfree(str);
1986 return re;
1987}
1988
1989static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter,
1990 int reset, char *re, int len)
1991{
1992 int ret;
1993
1994 if (filter)
1995 ret = ftrace_set_filter(ops, re, len, reset);
1996 else
1997 ret = ftrace_set_notrace(ops, re, len, reset);
1998
1999 return ret;
2000}
2001
2002static int __ftrace_function_set_filter(int filter, char *buf, int len,
2003 struct function_filter_data *data)
2004{
2005 int i, re_cnt, ret;
2006 int *reset;
2007 char **re;
2008
2009 reset = filter ? &data->first_filter : &data->first_notrace;
2010
2011 /*
2012 * The 'ip' field could have multiple filters set, separated
2013 * either by space or comma. We first cut the filter and apply
2014 * all pieces separately.
2015 */
2016 re = ftrace_function_filter_re(buf, len, &re_cnt);
2017 if (!re)
2018 return -EINVAL;
2019
2020 for (i = 0; i < re_cnt; i++) {
2021 ret = ftrace_function_set_regexp(data->ops, filter, *reset,
2022 re[i], strlen(re[i]));
2023 if (ret)
2024 break;
2025
2026 if (*reset)
2027 *reset = 0;
2028 }
2029
2030 argv_free(re);
2031 return ret;
2032}
2033
2034static int ftrace_function_check_pred(struct filter_pred *pred, int leaf)
2035{
2036 struct ftrace_event_field *field = pred->field;
2037
2038 if (leaf) {
2039 /*
2040 * Check the leaf predicate for function trace, verify:
2041 * - only '==' and '!=' are used
2042 * - the 'ip' field is used
2043 */
2044 if ((pred->op != OP_EQ) && (pred->op != OP_NE))
2045 return -EINVAL;
2046
2047 if (strcmp(field->name, "ip"))
2048 return -EINVAL;
2049 } else {
2050 /*
2051 * Check the non leaf predicate for function trace, verify:
2052 * - only '||' is used
2053 */
2054 if (pred->op != OP_OR)
2055 return -EINVAL;
2056 }
2057
2058 return 0;
2059}
2060
2061static int ftrace_function_set_filter_cb(enum move_type move,
2062 struct filter_pred *pred,
2063 int *err, void *data)
2064{
2065 /* Checking the node is valid for function trace. */
2066 if ((move != MOVE_DOWN) ||
2067 (pred->left != FILTER_PRED_INVALID)) {
2068 *err = ftrace_function_check_pred(pred, 0);
2069 } else {
2070 *err = ftrace_function_check_pred(pred, 1);
2071 if (*err)
2072 return WALK_PRED_ABORT;
2073
2074 *err = __ftrace_function_set_filter(pred->op == OP_EQ,
2075 pred->regex.pattern,
2076 pred->regex.len,
2077 data);
2078 }
2079
2080 return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT;
2081}
2082
2083static int ftrace_function_set_filter(struct perf_event *event,
2084 struct event_filter *filter)
2085{
2086 struct function_filter_data data = {
2087 .first_filter = 1,
2088 .first_notrace = 1,
2089 .ops = &event->ftrace_ops,
2090 };
2091
2092 return walk_pred_tree(filter->preds, filter->root,
2093 ftrace_function_set_filter_cb, &data);
2094}
2095#else
2096static int ftrace_function_set_filter(struct perf_event *event,
2097 struct event_filter *filter)
2098{
2099 return -ENODEV;
2100}
2101#endif /* CONFIG_FUNCTION_TRACER */
2102
1953int ftrace_profile_set_filter(struct perf_event *event, int event_id, 2103int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1954 char *filter_str) 2104 char *filter_str)
1955{ 2105{
@@ -1970,9 +2120,16 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1970 goto out_unlock; 2120 goto out_unlock;
1971 2121
1972 err = create_filter(call, filter_str, false, &filter); 2122 err = create_filter(call, filter_str, false, &filter);
1973 if (!err) 2123 if (err)
1974 event->filter = filter; 2124 goto free_filter;
2125
2126 if (ftrace_event_is_function(call))
2127 err = ftrace_function_set_filter(event, filter);
1975 else 2128 else
2129 event->filter = filter;
2130
2131free_filter:
2132 if (err || ftrace_event_is_function(call))
1976 __free_filter(filter); 2133 __free_filter(filter);
1977 2134
1978out_unlock: 2135out_unlock:
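
Taken together, the function-filter code above translates a perf filter expression straight into ftrace hashes: only the ip field is accepted, leaf predicates must use == or !=, inner nodes must be ||, and comma- or space-separated patterns inside one value are split and applied piecewise. For a hypothetical expression such as ip == do_sys_open,vfs_read || ip != *spin*, the tree walk ends up issuing roughly:

	ftrace_set_filter(ops, "do_sys_open", strlen("do_sys_open"), 1);	/* reset once */
	ftrace_set_filter(ops, "vfs_read", strlen("vfs_read"), 0);
	ftrace_set_notrace(ops, "*spin*", strlen("*spin*"), 1);

where ops is the perf event's embedded ftrace_ops registered through the TRACE_REG_PERF_OPEN path in trace_event_perf.c.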
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index bbeec31e0ae3..3dd15e8bc856 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -18,6 +18,16 @@
18#undef TRACE_SYSTEM 18#undef TRACE_SYSTEM
19#define TRACE_SYSTEM ftrace 19#define TRACE_SYSTEM ftrace
20 20
21/*
22 * The FTRACE_ENTRY_REG macro allows an ftrace entry to define a register
23 * function and thus become accessible via perf.
24 */
25#undef FTRACE_ENTRY_REG
26#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
27 filter, regfn) \
28 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
29 filter)
30
21/* not needed for this file */ 31/* not needed for this file */
22#undef __field_struct 32#undef __field_struct
23#define __field_struct(type, item) 33#define __field_struct(type, item)
@@ -44,21 +54,22 @@
44#define F_printk(fmt, args...) fmt, args 54#define F_printk(fmt, args...) fmt, args
45 55
46#undef FTRACE_ENTRY 56#undef FTRACE_ENTRY
47#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 57#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
48struct ____ftrace_##name { \ 58struct ____ftrace_##name { \
49 tstruct \ 59 tstruct \
50}; \ 60}; \
51static void __always_unused ____ftrace_check_##name(void) \ 61static void __always_unused ____ftrace_check_##name(void) \
52{ \ 62{ \
53 struct ____ftrace_##name *__entry = NULL; \ 63 struct ____ftrace_##name *__entry = NULL; \
54 \ 64 \
55 /* force compile-time check on F_printk() */ \ 65 /* force compile-time check on F_printk() */ \
56 printk(print); \ 66 printk(print); \
57} 67}
58 68
59#undef FTRACE_ENTRY_DUP 69#undef FTRACE_ENTRY_DUP
60#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ 70#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \
61 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) 71 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
72 filter)
62 73
63#include "trace_entries.h" 74#include "trace_entries.h"
64 75
@@ -67,7 +78,7 @@ static void __always_unused ____ftrace_check_##name(void) \
67 ret = trace_define_field(event_call, #type, #item, \ 78 ret = trace_define_field(event_call, #type, #item, \
68 offsetof(typeof(field), item), \ 79 offsetof(typeof(field), item), \
69 sizeof(field.item), \ 80 sizeof(field.item), \
70 is_signed_type(type), FILTER_OTHER); \ 81 is_signed_type(type), filter_type); \
71 if (ret) \ 82 if (ret) \
72 return ret; 83 return ret;
73 84
@@ -77,7 +88,7 @@ static void __always_unused ____ftrace_check_##name(void) \
77 offsetof(typeof(field), \ 88 offsetof(typeof(field), \
78 container.item), \ 89 container.item), \
79 sizeof(field.container.item), \ 90 sizeof(field.container.item), \
80 is_signed_type(type), FILTER_OTHER); \ 91 is_signed_type(type), filter_type); \
81 if (ret) \ 92 if (ret) \
82 return ret; 93 return ret;
83 94
@@ -91,7 +102,7 @@ static void __always_unused ____ftrace_check_##name(void) \
91 ret = trace_define_field(event_call, event_storage, #item, \ 102 ret = trace_define_field(event_call, event_storage, #item, \
92 offsetof(typeof(field), item), \ 103 offsetof(typeof(field), item), \
93 sizeof(field.item), \ 104 sizeof(field.item), \
94 is_signed_type(type), FILTER_OTHER); \ 105 is_signed_type(type), filter_type); \
95 mutex_unlock(&event_storage_mutex); \ 106 mutex_unlock(&event_storage_mutex); \
96 if (ret) \ 107 if (ret) \
97 return ret; \ 108 return ret; \
@@ -104,7 +115,7 @@ static void __always_unused ____ftrace_check_##name(void) \
104 offsetof(typeof(field), \ 115 offsetof(typeof(field), \
105 container.item), \ 116 container.item), \
106 sizeof(field.container.item), \ 117 sizeof(field.container.item), \
107 is_signed_type(type), FILTER_OTHER); \ 118 is_signed_type(type), filter_type); \
108 if (ret) \ 119 if (ret) \
109 return ret; 120 return ret;
110 121
@@ -112,17 +123,18 @@ static void __always_unused ____ftrace_check_##name(void) \
112#define __dynamic_array(type, item) \ 123#define __dynamic_array(type, item) \
113 ret = trace_define_field(event_call, #type, #item, \ 124 ret = trace_define_field(event_call, #type, #item, \
114 offsetof(typeof(field), item), \ 125 offsetof(typeof(field), item), \
115 0, is_signed_type(type), FILTER_OTHER);\ 126 0, is_signed_type(type), filter_type);\
116 if (ret) \ 127 if (ret) \
117 return ret; 128 return ret;
118 129
119#undef FTRACE_ENTRY 130#undef FTRACE_ENTRY
120#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 131#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
121int \ 132int \
122ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ 133ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
123{ \ 134{ \
124 struct struct_name field; \ 135 struct struct_name field; \
125 int ret; \ 136 int ret; \
137 int filter_type = filter; \
126 \ 138 \
127 tstruct; \ 139 tstruct; \
128 \ 140 \
@@ -150,15 +162,17 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
150#define __dynamic_array(type, item) 162#define __dynamic_array(type, item)
151 163
152#undef F_printk 164#undef F_printk
153#define F_printk(fmt, args...) #fmt ", " __stringify(args) 165#define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args)
154 166
155#undef FTRACE_ENTRY 167#undef FTRACE_ENTRY_REG
156#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ 168#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
169 regfn) \
157 \ 170 \
158struct ftrace_event_class event_class_ftrace_##call = { \ 171struct ftrace_event_class event_class_ftrace_##call = { \
159 .system = __stringify(TRACE_SYSTEM), \ 172 .system = __stringify(TRACE_SYSTEM), \
160 .define_fields = ftrace_define_fields_##call, \ 173 .define_fields = ftrace_define_fields_##call, \
161 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ 174 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
175 .reg = regfn, \
162}; \ 176}; \
163 \ 177 \
164struct ftrace_event_call __used event_##call = { \ 178struct ftrace_event_call __used event_##call = { \
@@ -170,4 +184,14 @@ struct ftrace_event_call __used event_##call = { \
170struct ftrace_event_call __used \ 184struct ftrace_event_call __used \
171__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; 185__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
172 186
187#undef FTRACE_ENTRY
188#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \
189 FTRACE_ENTRY_REG(call, struct_name, etype, \
190 PARAMS(tstruct), PARAMS(print), filter, NULL)
191
192int ftrace_event_is_function(struct ftrace_event_call *call)
193{
194 return call == &event_function;
195}
196
173#include "trace_entries.h" 197#include "trace_entries.h"
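
trace_export.c now generates its events through two macro layers: FTRACE_ENTRY_REG() carries a filter type plus an optional registration callback, and FTRACE_ENTRY() forwards to it with a NULL callback, so plain entries are unchanged while the function event can supply a perf .reg handler and be recognised by ftrace_event_is_function(). The following stand-alone sketch shows only the macro-forwarding pattern; EVENT/EVENT_REG and struct event_desc are made-up names for illustration.

#include <stdio.h>
#include <stddef.h>

struct event_desc {
	const char *name;
	int filter_type;
	int (*reg)(void);		/* NULL when the event has no callback */
};

/* Full form: filter type plus registration callback. */
#define EVENT_REG(name, filter, regfn)					\
	static struct event_desc event_##name = {			\
		#name, (filter), (regfn)				\
	};

/* The plain form forwards to the full one with a NULL callback, just as
 * FTRACE_ENTRY() now expands to FTRACE_ENTRY_REG(..., NULL). */
#define EVENT(name, filter)	EVENT_REG(name, filter, NULL)

static int function_reg(void)
{
	puts("function event registered with perf");
	return 0;
}

EVENT(wakeup, 0)
EVENT_REG(function, 1, function_reg)	/* 1 stands for a FILTER_TRACE_FN-like type */

int main(void)
{
	struct event_desc *events[] = { &event_wakeup, &event_function };

	for (size_t i = 0; i < sizeof(events) / sizeof(events[0]); i++) {
		printf("%s: filter_type=%d reg=%s\n", events[i]->name,
		       events[i]->filter_type, events[i]->reg ? "yes" : "no");
		if (events[i]->reg)
			events[i]->reg();
	}
	return 0;
}
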
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 00d527c945a4..580a05ec926b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1892,7 +1892,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1892#endif /* CONFIG_PERF_EVENTS */ 1892#endif /* CONFIG_PERF_EVENTS */
1893 1893
1894static __kprobes 1894static __kprobes
1895int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) 1895int kprobe_register(struct ftrace_event_call *event,
1896 enum trace_reg type, void *data)
1896{ 1897{
1897 struct trace_probe *tp = (struct trace_probe *)event->data; 1898 struct trace_probe *tp = (struct trace_probe *)event->data;
1898 1899
@@ -1909,6 +1910,11 @@ int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1909 case TRACE_REG_PERF_UNREGISTER: 1910 case TRACE_REG_PERF_UNREGISTER:
1910 disable_trace_probe(tp, TP_FLAG_PROFILE); 1911 disable_trace_probe(tp, TP_FLAG_PROFILE);
1911 return 0; 1912 return 0;
1913 case TRACE_REG_PERF_OPEN:
1914 case TRACE_REG_PERF_CLOSE:
1915 case TRACE_REG_PERF_ADD:
1916 case TRACE_REG_PERF_DEL:
1917 return 0;
1912#endif 1918#endif
1913 } 1919 }
1914 return 0; 1920 return 0;
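
The .reg() callback prototype grows a third void *data argument, and implementations must now accept four extra commands, TRACE_REG_PERF_OPEN/CLOSE/ADD/DEL. kprobe and syscall events keep no per-perf-event state, so they simply return 0 for those. A sketch of that "explicit no-op" handling, assuming a simplified enum and event structure rather than the kernel definitions:

#include <stdio.h>

/* Simplified stand-in for enum trace_reg; the names mirror the kernel's. */
enum trace_reg {
	REG_REGISTER,
	REG_UNREGISTER,
	REG_PERF_REGISTER,
	REG_PERF_UNREGISTER,
	REG_PERF_OPEN,
	REG_PERF_CLOSE,
	REG_PERF_ADD,
	REG_PERF_DEL,
};

struct event { const char *name; int enabled; };

/* A .reg()-style callback: real work for (un)register, explicit no-ops
 * for the per-perf-event lifetime commands it does not track. */
static int probe_register(struct event *ev, enum trace_reg type, void *data)
{
	(void)data;	/* unused here, but part of the new prototype */

	switch (type) {
	case REG_REGISTER:
	case REG_PERF_REGISTER:
		ev->enabled = 1;
		return 0;
	case REG_UNREGISTER:
	case REG_PERF_UNREGISTER:
		ev->enabled = 0;
		return 0;
	case REG_PERF_OPEN:
	case REG_PERF_CLOSE:
	case REG_PERF_ADD:
	case REG_PERF_DEL:
		return 0;	/* nothing to do per perf event */
	}
	return 0;
}

int main(void)
{
	struct event ev = { "kprobe:do_sys_open", 0 };

	probe_register(&ev, REG_PERF_REGISTER, NULL);
	printf("%s enabled=%d\n", ev.name, ev.enabled);
	probe_register(&ev, REG_PERF_OPEN, NULL);	/* no-op */
	probe_register(&ev, REG_PERF_UNREGISTER, NULL);
	printf("%s enabled=%d\n", ev.name, ev.enabled);
	return 0;
}
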
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 0d6ff3555942..859fae6b1825 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -264,7 +264,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
264 return ret; 264 return ret;
265} 265}
266 266
267int trace_seq_path(struct trace_seq *s, struct path *path) 267int trace_seq_path(struct trace_seq *s, const struct path *path)
268{ 268{
269 unsigned char *p; 269 unsigned char *p;
270 270
@@ -300,7 +300,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
300 unsigned long mask; 300 unsigned long mask;
301 const char *str; 301 const char *str;
302 const char *ret = p->buffer + p->len; 302 const char *ret = p->buffer + p->len;
303 int i; 303 int i, first = 1;
304 304
305 for (i = 0; flag_array[i].name && flags; i++) { 305 for (i = 0; flag_array[i].name && flags; i++) {
306 306
@@ -310,14 +310,16 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
310 310
311 str = flag_array[i].name; 311 str = flag_array[i].name;
312 flags &= ~mask; 312 flags &= ~mask;
313 if (p->len && delim) 313 if (!first && delim)
314 trace_seq_puts(p, delim); 314 trace_seq_puts(p, delim);
315 else
316 first = 0;
315 trace_seq_puts(p, str); 317 trace_seq_puts(p, str);
316 } 318 }
317 319
318 /* check for left over flags */ 320 /* check for left over flags */
319 if (flags) { 321 if (flags) {
320 if (p->len && delim) 322 if (!first && delim)
321 trace_seq_puts(p, delim); 323 trace_seq_puts(p, delim);
322 trace_seq_printf(p, "0x%lx", flags); 324 trace_seq_printf(p, "0x%lx", flags);
323 } 325 }
@@ -344,7 +346,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
344 break; 346 break;
345 } 347 }
346 348
347 if (!p->len) 349 if (ret == (const char *)(p->buffer + p->len))
348 trace_seq_printf(p, "0x%lx", val); 350 trace_seq_printf(p, "0x%lx", val);
349 351
350 trace_seq_putc(p, 0); 352 trace_seq_putc(p, 0);
@@ -370,7 +372,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
370 break; 372 break;
371 } 373 }
372 374
373 if (!p->len) 375 if (ret == (const char *)(p->buffer + p->len))
374 trace_seq_printf(p, "0x%llx", val); 376 trace_seq_printf(p, "0x%llx", val);
375 377
376 trace_seq_putc(p, 0); 378 trace_seq_putc(p, 0);
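
The trace_output.c hunks fix a subtle assumption: the flag and symbol printers used p->len (an empty trace_seq buffer) to decide whether a delimiter or the raw-hex fallback was needed, which misfires once earlier output already sits in the same buffer. The fix tracks the first printed flag with a local variable and compares against the saved start position instead. A stand-alone sketch of the first-flag join pattern, assuming an illustrative flag table and snprintf() in place of trace_seq:

#include <stdio.h>
#include <string.h>

struct flag_name { unsigned long mask; const char *name; };

/* Join the names of all set flags with a delimiter, falling back to hex for
 * leftover bits -- the same "first" bookkeeping the fix introduces. */
static void print_flags(char *buf, size_t len, unsigned long flags,
			const char *delim, const struct flag_name *tbl)
{
	size_t pos = 0;
	int first = 1;

	for (int i = 0; tbl[i].name && flags; i++) {
		if ((flags & tbl[i].mask) != tbl[i].mask)
			continue;
		flags &= ~tbl[i].mask;
		if (!first)
			pos += snprintf(buf + pos, len - pos, "%s", delim);
		else
			first = 0;
		pos += snprintf(buf + pos, len - pos, "%s", tbl[i].name);
	}
	if (flags) {				/* leftover, unnamed bits */
		if (!first)
			pos += snprintf(buf + pos, len - pos, "%s", delim);
		snprintf(buf + pos, len - pos, "0x%lx", flags);
	}
}

int main(void)
{
	static const struct flag_name tbl[] = {
		{ 0x1, "FLAG_A" }, { 0x2, "FLAG_B" }, { 0, NULL },
	};
	char buf[128] = "earlier output: ";	/* the buffer is NOT empty */
	size_t off = strlen(buf);

	print_flags(buf + off, sizeof(buf) - off, 0x1 | 0x2 | 0x8000, "|", tbl);
	puts(buf);	/* earlier output: FLAG_A|FLAG_B|0x8000 */
	return 0;
}
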
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index cb654542c1a1..96fc73369099 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -17,9 +17,9 @@ static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
17static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 17static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
18 18
19static int syscall_enter_register(struct ftrace_event_call *event, 19static int syscall_enter_register(struct ftrace_event_call *event,
20 enum trace_reg type); 20 enum trace_reg type, void *data);
21static int syscall_exit_register(struct ftrace_event_call *event, 21static int syscall_exit_register(struct ftrace_event_call *event,
22 enum trace_reg type); 22 enum trace_reg type, void *data);
23 23
24static int syscall_enter_define_fields(struct ftrace_event_call *call); 24static int syscall_enter_define_fields(struct ftrace_event_call *call);
25static int syscall_exit_define_fields(struct ftrace_event_call *call); 25static int syscall_exit_define_fields(struct ftrace_event_call *call);
@@ -468,8 +468,8 @@ int __init init_ftrace_syscalls(void)
468 unsigned long addr; 468 unsigned long addr;
469 int i; 469 int i;
470 470
471 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * 471 syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata),
472 NR_syscalls, GFP_KERNEL); 472 GFP_KERNEL);
473 if (!syscalls_metadata) { 473 if (!syscalls_metadata) {
474 WARN_ON(1); 474 WARN_ON(1);
475 return -ENOMEM; 475 return -ENOMEM;
@@ -649,7 +649,7 @@ void perf_sysexit_disable(struct ftrace_event_call *call)
649#endif /* CONFIG_PERF_EVENTS */ 649#endif /* CONFIG_PERF_EVENTS */
650 650
651static int syscall_enter_register(struct ftrace_event_call *event, 651static int syscall_enter_register(struct ftrace_event_call *event,
652 enum trace_reg type) 652 enum trace_reg type, void *data)
653{ 653{
654 switch (type) { 654 switch (type) {
655 case TRACE_REG_REGISTER: 655 case TRACE_REG_REGISTER:
@@ -664,13 +664,18 @@ static int syscall_enter_register(struct ftrace_event_call *event,
664 case TRACE_REG_PERF_UNREGISTER: 664 case TRACE_REG_PERF_UNREGISTER:
665 perf_sysenter_disable(event); 665 perf_sysenter_disable(event);
666 return 0; 666 return 0;
667 case TRACE_REG_PERF_OPEN:
668 case TRACE_REG_PERF_CLOSE:
669 case TRACE_REG_PERF_ADD:
670 case TRACE_REG_PERF_DEL:
671 return 0;
667#endif 672#endif
668 } 673 }
669 return 0; 674 return 0;
670} 675}
671 676
672static int syscall_exit_register(struct ftrace_event_call *event, 677static int syscall_exit_register(struct ftrace_event_call *event,
673 enum trace_reg type) 678 enum trace_reg type, void *data)
674{ 679{
675 switch (type) { 680 switch (type) {
676 case TRACE_REG_REGISTER: 681 case TRACE_REG_REGISTER:
@@ -685,6 +690,11 @@ static int syscall_exit_register(struct ftrace_event_call *event,
685 case TRACE_REG_PERF_UNREGISTER: 690 case TRACE_REG_PERF_UNREGISTER:
686 perf_sysexit_disable(event); 691 perf_sysexit_disable(event);
687 return 0; 692 return 0;
693 case TRACE_REG_PERF_OPEN:
694 case TRACE_REG_PERF_CLOSE:
695 case TRACE_REG_PERF_ADD:
696 case TRACE_REG_PERF_DEL:
697 return 0;
688#endif 698#endif
689 } 699 }
690 return 0; 700 return 0;
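
Apart from the .reg() prototype update, init_ftrace_syscalls() switches from kzalloc(sizeof(*x) * NR_syscalls, ...) to kcalloc(NR_syscalls, sizeof(*x), ...), which performs the same zeroed allocation but refuses a count * size product that would overflow. A small userspace analogue of that overflow-checked pattern (alloc_array() and the struct below are illustrative; libc calloc() already performs the check itself):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Overflow-checked "allocate n zeroed elements" -- the property kcalloc()
 * adds over a hand-written kzalloc(n * size) in the kernel. */
static void *alloc_array(size_t n, size_t size)
{
	if (size && n > SIZE_MAX / size)	/* n * size would overflow */
		return NULL;
	return calloc(n, size);			/* calloc() also zeroes the memory */
}

struct syscall_meta { const char *name; int nb_args; };

int main(void)
{
	size_t nr_syscalls = 440;		/* illustrative count */
	struct syscall_meta *tbl = alloc_array(nr_syscalls, sizeof(*tbl));

	if (!tbl) {
		fprintf(stderr, "allocation failed\n");
		return 1;
	}
	printf("table of %zu entries, entry 0 zeroed: %s\n", nr_syscalls,
	       tbl[0].name == NULL && tbl[0].nb_args == 0 ? "yes" : "no");
	free(tbl);
	return 0;
}
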
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index f1539decd99d..d96ba22dabfa 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,7 +25,7 @@
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/jump_label.h> 28#include <linux/static_key.h>
29 29
30extern struct tracepoint * const __start___tracepoints_ptrs[]; 30extern struct tracepoint * const __start___tracepoints_ptrs[];
31extern struct tracepoint * const __stop___tracepoints_ptrs[]; 31extern struct tracepoint * const __stop___tracepoints_ptrs[];
@@ -256,9 +256,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
256{ 256{
257 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 257 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
258 258
259 if (elem->regfunc && !jump_label_enabled(&elem->key) && active) 259 if (elem->regfunc && !static_key_enabled(&elem->key) && active)
260 elem->regfunc(); 260 elem->regfunc();
261 else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) 261 else if (elem->unregfunc && static_key_enabled(&elem->key) && !active)
262 elem->unregfunc(); 262 elem->unregfunc();
263 263
264 /* 264 /*
@@ -269,10 +269,10 @@ static void set_tracepoint(struct tracepoint_entry **entry,
269 * is used. 269 * is used.
270 */ 270 */
271 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 271 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
272 if (active && !jump_label_enabled(&elem->key)) 272 if (active && !static_key_enabled(&elem->key))
273 jump_label_inc(&elem->key); 273 static_key_slow_inc(&elem->key);
274 else if (!active && jump_label_enabled(&elem->key)) 274 else if (!active && static_key_enabled(&elem->key))
275 jump_label_dec(&elem->key); 275 static_key_slow_dec(&elem->key);
276} 276}
277 277
278/* 278/*
@@ -283,11 +283,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
283 */ 283 */
284static void disable_tracepoint(struct tracepoint *elem) 284static void disable_tracepoint(struct tracepoint *elem)
285{ 285{
286 if (elem->unregfunc && jump_label_enabled(&elem->key)) 286 if (elem->unregfunc && static_key_enabled(&elem->key))
287 elem->unregfunc(); 287 elem->unregfunc();
288 288
289 if (jump_label_enabled(&elem->key)) 289 if (static_key_enabled(&elem->key))
290 jump_label_dec(&elem->key); 290 static_key_slow_dec(&elem->key);
291 rcu_assign_pointer(elem->funcs, NULL); 291 rcu_assign_pointer(elem->funcs, NULL);
292} 292}
293 293
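
tracepoint.c follows the jump_label -> static_key rename: the per-tracepoint key is bumped with static_key_slow_inc() when a tracepoint becomes active and dropped with static_key_slow_dec() when it goes idle, with static_key_enabled() replacing jump_label_enabled() in the checks. Conceptually the key acts as a reference-counted boolean; the sketch below models only that bookkeeping, not the kernel's code-patching implementation.

#include <stdio.h>

/* Toy model of a static key: a refcount whose zero/non-zero state stands in
 * for the patched branch.  The real static_key rewrites the call site; this
 * only models the enable/disable accounting. */
struct key { int refcount; };

static int key_enabled(const struct key *k)	{ return k->refcount > 0; }
static void key_slow_inc(struct key *k)		{ k->refcount++; }
static void key_slow_dec(struct key *k)		{ if (k->refcount) k->refcount--; }

struct tracepoint { const char *name; struct key key; };

static void tracepoint_update(struct tracepoint *tp, int active)
{
	/* Mirrors set_tracepoint(): bump the key when the tracepoint goes
	 * from inactive to active, drop it on the way back. */
	if (active && !key_enabled(&tp->key))
		key_slow_inc(&tp->key);
	else if (!active && key_enabled(&tp->key))
		key_slow_dec(&tp->key);
	printf("%s: %s\n", tp->name, key_enabled(&tp->key) ? "on" : "off");
}

int main(void)
{
	struct tracepoint tp = { "sched_switch", { 0 } };

	tracepoint_update(&tp, 1);	/* first probe attached */
	tracepoint_update(&tp, 1);	/* still active, no double inc */
	tracepoint_update(&tp, 0);	/* last probe removed */
	return 0;
}
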
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d117262deba3..df30ee08bdd4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -3,15 +3,14 @@
3 * 3 *
4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. 4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
5 * 5 *
6 * this code detects hard lockups: incidents in where on a CPU 6 * Note: Most of this code is borrowed heavily from the original softlockup
7 * the kernel does not respond to anything except NMI. 7 * detector, so thanks to Ingo for the initial implementation.
8 * 8 * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
9 * Note: Most of this code is borrowed heavily from softlockup.c,
10 * so thanks to Ingo for the initial implementation.
11 * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
12 * to those contributors as well. 9 * to those contributors as well.
13 */ 10 */
14 11
12#define pr_fmt(fmt) "NMI watchdog: " fmt
13
15#include <linux/mm.h> 14#include <linux/mm.h>
16#include <linux/cpu.h> 15#include <linux/cpu.h>
17#include <linux/nmi.h> 16#include <linux/nmi.h>
@@ -117,9 +116,10 @@ static unsigned long get_sample_period(void)
117{ 116{
118 /* 117 /*
119 * convert watchdog_thresh from seconds to ns 118 * convert watchdog_thresh from seconds to ns
120 * the divide by 5 is to give hrtimer 5 chances to 119 * the divide by 5 is to give hrtimer several chances (two
121 * increment before the hardlockup detector generates 120 * or three with the current relation between the soft
122 * a warning 121 * and hard thresholds) to increment before the
122 * hardlockup detector generates a warning
123 */ 123 */
124 return get_softlockup_thresh() * (NSEC_PER_SEC / 5); 124 return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
125} 125}
@@ -321,11 +321,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
321 */ 321 */
322static int watchdog(void *unused) 322static int watchdog(void *unused)
323{ 323{
324 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 324 struct sched_param param = { .sched_priority = 0 };
325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
326 326
327 sched_setscheduler(current, SCHED_FIFO, &param);
328
329 /* initialize timestamp */ 327 /* initialize timestamp */
330 __touch_watchdog(); 328 __touch_watchdog();
331 329
@@ -336,9 +334,11 @@ static int watchdog(void *unused)
336 334
337 set_current_state(TASK_INTERRUPTIBLE); 335 set_current_state(TASK_INTERRUPTIBLE);
338 /* 336 /*
339 * Run briefly once per second to reset the softlockup timestamp. 337 * Run briefly (kicked by the hrtimer callback function) once every
340 * If this gets delayed for more than 60 seconds then the 338 * get_sample_period() seconds (4 seconds by default) to reset the
341 * debug-printout triggers in watchdog_timer_fn(). 339 * softlockup timestamp. If this gets delayed for more than
340 * 2*watchdog_thresh seconds then the debug-printout triggers in
341 * watchdog_timer_fn().
342 */ 342 */
343 while (!kthread_should_stop()) { 343 while (!kthread_should_stop()) {
344 __touch_watchdog(); 344 __touch_watchdog();
@@ -349,8 +349,11 @@ static int watchdog(void *unused)
349 349
350 set_current_state(TASK_INTERRUPTIBLE); 350 set_current_state(TASK_INTERRUPTIBLE);
351 } 351 }
352 /*
353 * Drop the policy/priority elevation during thread exit to avoid a
354 * scheduling latency spike.
355 */
352 __set_current_state(TASK_RUNNING); 356 __set_current_state(TASK_RUNNING);
353 param.sched_priority = 0;
354 sched_setscheduler(current, SCHED_NORMAL, &param); 357 sched_setscheduler(current, SCHED_NORMAL, &param);
355 return 0; 358 return 0;
356} 359}
@@ -376,18 +379,20 @@ static int watchdog_nmi_enable(int cpu)
376 /* Try to register using hardware perf events */ 379 /* Try to register using hardware perf events */
377 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); 380 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
378 if (!IS_ERR(event)) { 381 if (!IS_ERR(event)) {
379 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 382 pr_info("enabled, takes one hw-pmu counter.\n");
380 goto out_save; 383 goto out_save;
381 } 384 }
382 385
383 386
384 /* vary the KERN level based on the returned errno */ 387 /* vary the KERN level based on the returned errno */
385 if (PTR_ERR(event) == -EOPNOTSUPP) 388 if (PTR_ERR(event) == -EOPNOTSUPP)
386 printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); 389 pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
387 else if (PTR_ERR(event) == -ENOENT) 390 else if (PTR_ERR(event) == -ENOENT)
388 printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); 391 pr_warning("disabled (cpu%i): hardware events not enabled\n",
392 cpu);
389 else 393 else
390 printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); 394 pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
395 cpu, PTR_ERR(event));
391 return PTR_ERR(event); 396 return PTR_ERR(event);
392 397
393 /* success path */ 398 /* success path */
@@ -439,9 +444,10 @@ static int watchdog_enable(int cpu)
439 444
440 /* create the watchdog thread */ 445 /* create the watchdog thread */
441 if (!p) { 446 if (!p) {
447 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
442 p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); 448 p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
443 if (IS_ERR(p)) { 449 if (IS_ERR(p)) {
444 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 450 pr_err("softlockup watchdog for %i failed\n", cpu);
445 if (!err) { 451 if (!err) {
446 /* if hardlockup hasn't already set this */ 452 /* if hardlockup hasn't already set this */
447 err = PTR_ERR(p); 453 err = PTR_ERR(p);
@@ -450,6 +456,7 @@ static int watchdog_enable(int cpu)
450 } 456 }
451 goto out; 457 goto out;
452 } 458 }
459 sched_setscheduler(p, SCHED_FIFO, &param);
453 kthread_bind(p, cpu); 460 kthread_bind(p, cpu);
454 per_cpu(watchdog_touch_ts, cpu) = 0; 461 per_cpu(watchdog_touch_ts, cpu) = 0;
455 per_cpu(softlockup_watchdog, cpu) = p; 462 per_cpu(softlockup_watchdog, cpu) = p;
@@ -496,7 +503,7 @@ static void watchdog_enable_all_cpus(void)
496 watchdog_enabled = 1; 503 watchdog_enabled = 1;
497 504
498 if (!watchdog_enabled) 505 if (!watchdog_enabled)
499 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); 506 pr_err("failed to be enabled on some cpus\n");
500 507
501} 508}
502 509
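
Two independent cleanups land in watchdog.c: all messages move to the pr_*() helpers under a pr_fmt() prefix, so every line automatically reads "NMI watchdog: ...", and the SCHED_FIFO elevation moves from the watchdog thread into watchdog_enable(), which sets the policy on the freshly created kthread before binding it. The sketch below only mocks the pr_fmt() prefixing in userspace; printk()/pr_info() are kernel-only, and the printf-based macros are stand-ins.

#include <stdio.h>

/* In the kernel, pr_info(fmt, ...) expands to printk(KERN_INFO pr_fmt(fmt), ...)
 * and pr_fmt() defaults to the bare format unless a file defines it first.
 * The mock below reproduces the compile-time string pasting with printf(). */
#define pr_fmt(fmt) "NMI watchdog: " fmt

#define pr_info(fmt, ...)	printf("[info] "  pr_fmt(fmt), ##__VA_ARGS__)
#define pr_err(fmt, ...)	printf("[error] " pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	int cpu = 3;

	/* Same call sites as the patched watchdog code; the prefix comes for free. */
	pr_info("enabled, takes one hw-pmu counter.\n");
	pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
	       cpu, -2L);
	return 0;
}
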
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f2c5638bb5ab..5abf42f63c08 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -476,13 +476,8 @@ static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
476 struct workqueue_struct *wq) 476 struct workqueue_struct *wq)
477{ 477{
478 if (!(wq->flags & WQ_UNBOUND)) { 478 if (!(wq->flags & WQ_UNBOUND)) {
479 if (likely(cpu < nr_cpu_ids)) { 479 if (likely(cpu < nr_cpu_ids))
480#ifdef CONFIG_SMP
481 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); 480 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
482#else
483 return wq->cpu_wq.single;
484#endif
485 }
486 } else if (likely(cpu == WORK_CPU_UNBOUND)) 481 } else if (likely(cpu == WORK_CPU_UNBOUND))
487 return wq->cpu_wq.single; 482 return wq->cpu_wq.single;
488 return NULL; 483 return NULL;
@@ -2899,13 +2894,8 @@ static int alloc_cwqs(struct workqueue_struct *wq)
2899 const size_t size = sizeof(struct cpu_workqueue_struct); 2894 const size_t size = sizeof(struct cpu_workqueue_struct);
2900 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, 2895 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
2901 __alignof__(unsigned long long)); 2896 __alignof__(unsigned long long));
2902#ifdef CONFIG_SMP
2903 bool percpu = !(wq->flags & WQ_UNBOUND);
2904#else
2905 bool percpu = false;
2906#endif
2907 2897
2908 if (percpu) 2898 if (!(wq->flags & WQ_UNBOUND))
2909 wq->cpu_wq.pcpu = __alloc_percpu(size, align); 2899 wq->cpu_wq.pcpu = __alloc_percpu(size, align);
2910 else { 2900 else {
2911 void *ptr; 2901 void *ptr;
@@ -2929,13 +2919,7 @@ static int alloc_cwqs(struct workqueue_struct *wq)
2929 2919
2930static void free_cwqs(struct workqueue_struct *wq) 2920static void free_cwqs(struct workqueue_struct *wq)
2931{ 2921{
2932#ifdef CONFIG_SMP 2922 if (!(wq->flags & WQ_UNBOUND))
2933 bool percpu = !(wq->flags & WQ_UNBOUND);
2934#else
2935 bool percpu = false;
2936#endif
2937
2938 if (percpu)
2939 free_percpu(wq->cpu_wq.pcpu); 2923 free_percpu(wq->cpu_wq.pcpu);
2940 else if (wq->cpu_wq.single) { 2924 else if (wq->cpu_wq.single) {
2941 /* the pointer to free is stored right after the cwq */ 2925 /* the pointer to free is stored right after the cwq */