author     Sage Weil <sage@inktank.com>    2013-08-15 14:11:45 -0400
committer  Sage Weil <sage@inktank.com>    2013-08-15 14:11:45 -0400
commit     ee3e542fec6e69bc9fb668698889a37d93950ddf (patch)
tree       e74ee766a4764769ef1d3d45d266b4dea64101d3 /kernel
parent     fe2a801b50c0bb8039d627e5ae1fec249d10ff39 (diff)
parent     f1d6e17f540af37bb1891480143669ba7636c4cf (diff)
Merge remote-tracking branch 'linus/master' into testing
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 6
-rw-r--r--  kernel/Makefile | 5
-rw-r--r--  kernel/audit.h | 1
-rw-r--r--  kernel/auditfilter.c | 8
-rw-r--r--  kernel/auditsc.c | 12
-rw-r--r--  kernel/cgroup.c | 1556
-rw-r--r--  kernel/cpu.c | 6
-rw-r--r--  kernel/cpuset.c | 482
-rw-r--r--  kernel/events/core.c | 308
-rw-r--r--  kernel/events/hw_breakpoint.c | 191
-rw-r--r--  kernel/exit.c | 15
-rw-r--r--  kernel/fork.c | 78
-rw-r--r--  kernel/freezer.c | 14
-rw-r--r--  kernel/futex.c | 6
-rw-r--r--  kernel/hrtimer.c | 41
-rw-r--r--  kernel/irq/chip.c | 13
-rw-r--r--  kernel/irq/generic-chip.c | 314
-rw-r--r--  kernel/irq/irqdomain.c | 599
-rw-r--r--  kernel/irq/manage.c | 17
-rw-r--r--  kernel/irq/proc.c | 2
-rw-r--r--  kernel/kmod.c | 11
-rw-r--r--  kernel/kprobes.c | 3
-rw-r--r--  kernel/lockdep.c | 17
-rw-r--r--  kernel/module.c | 77
-rw-r--r--  kernel/mutex.c | 385
-rw-r--r--  kernel/panic.c | 8
-rw-r--r--  kernel/params.c | 2
-rw-r--r--  kernel/pid.c | 14
-rw-r--r--  kernel/posix-cpu-timers.c | 395
-rw-r--r--  kernel/power/Kconfig | 21
-rw-r--r--  kernel/power/autosleep.c | 3
-rw-r--r--  kernel/power/main.c | 6
-rw-r--r--  kernel/power/process.c | 37
-rw-r--r--  kernel/power/qos.c | 14
-rw-r--r--  kernel/power/snapshot.c | 9
-rw-r--r--  kernel/power/suspend.c | 2
-rw-r--r--  kernel/printk/Makefile | 2
-rw-r--r--  kernel/printk/braille.c | 49
-rw-r--r--  kernel/printk/braille.h | 48
-rw-r--r--  kernel/printk/console_cmdline.h | 14
-rw-r--r--  kernel/printk/printk.c (renamed from kernel/printk.c) | 187
-rw-r--r--  kernel/profile.c | 2
-rw-r--r--  kernel/ptrace.c | 60
-rw-r--r--  kernel/rcupdate.c | 29
-rw-r--r--  kernel/rcutiny.c | 21
-rw-r--r--  kernel/rcutiny_plugin.h | 1009
-rw-r--r--  kernel/rcutorture.c | 45
-rw-r--r--  kernel/rcutree.c | 176
-rw-r--r--  kernel/rcutree.h | 19
-rw-r--r--  kernel/rcutree_plugin.h | 87
-rw-r--r--  kernel/reboot.c | 419
-rw-r--r--  kernel/relay.c | 2
-rw-r--r--  kernel/resource.c | 2
-rw-r--r--  kernel/rtmutex.c | 13
-rw-r--r--  kernel/sched/Makefile | 2
-rw-r--r--  kernel/sched/auto_group.c | 3
-rw-r--r--  kernel/sched/core.c | 765
-rw-r--r--  kernel/sched/cpupri.c | 4
-rw-r--r--  kernel/sched/cputime.c | 5
-rw-r--r--  kernel/sched/debug.c | 37
-rw-r--r--  kernel/sched/fair.c | 191
-rw-r--r--  kernel/sched/proc.c | 591
-rw-r--r--  kernel/sched/rt.c | 132
-rw-r--r--  kernel/sched/sched.h | 71
-rw-r--r--  kernel/sched/stats.h | 47
-rw-r--r--  kernel/sched/stop_task.c | 8
-rw-r--r--  kernel/signal.c | 2
-rw-r--r--  kernel/smp.c | 2
-rw-r--r--  kernel/smpboot.c | 2
-rw-r--r--  kernel/softirq.c | 18
-rw-r--r--  kernel/sys.c | 352
-rw-r--r--  kernel/sysctl.c | 29
-rw-r--r--  kernel/sysctl_binary.c | 1
-rw-r--r--  kernel/time.c | 2
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/alarmtimer.c | 47
-rw-r--r--  kernel/time/clockevents.c | 271
-rw-r--r--  kernel/time/clocksource.c | 266
-rw-r--r--  kernel/time/sched_clock.c | 212
-rw-r--r--  kernel/time/tick-broadcast.c | 129
-rw-r--r--  kernel/time/tick-common.c | 197
-rw-r--r--  kernel/time/tick-internal.h | 17
-rw-r--r--  kernel/time/tick-sched.c | 26
-rw-r--r--  kernel/time/timekeeping.c | 65
-rw-r--r--  kernel/time/timekeeping_debug.c | 72
-rw-r--r--  kernel/time/timekeeping_internal.h | 14
-rw-r--r--  kernel/timer.c | 18
-rw-r--r--  kernel/trace/ftrace.c | 143
-rw-r--r--  kernel/trace/ring_buffer.c | 26
-rw-r--r--  kernel/trace/trace.c | 496
-rw-r--r--  kernel/trace/trace.h | 35
-rw-r--r--  kernel/trace/trace_event_perf.c | 10
-rw-r--r--  kernel/trace/trace_events.c | 406
-rw-r--r--  kernel/trace/trace_events_filter.c | 27
-rw-r--r--  kernel/trace/trace_functions.c | 105
-rw-r--r--  kernel/trace/trace_functions_graph.c | 54
-rw-r--r--  kernel/trace/trace_irqsoff.c | 4
-rw-r--r--  kernel/trace/trace_kprobe.c | 230
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 8
-rw-r--r--  kernel/trace/trace_output.c | 14
-rw-r--r--  kernel/trace/trace_selftest.c | 18
-rw-r--r--  kernel/trace/trace_syscalls.c | 47
-rw-r--r--  kernel/trace/trace_uprobe.c | 57
-rw-r--r--  kernel/user_namespace.c | 17
-rw-r--r--  kernel/wait.c | 89
-rw-r--r--  kernel/watchdog.c | 113
-rw-r--r--  kernel/workqueue.c | 74
-rw-r--r--  kernel/workqueue_internal.h | 2
108 files changed, 6967 insertions(+), 5470 deletions(-)
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 44511d100eaa..d2b32ac27a39 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -138,7 +138,7 @@ config INLINE_SPIN_UNLOCK_BH
 
 config INLINE_SPIN_UNLOCK_IRQ
 	def_bool y
-	depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH
+	depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ
 
 config INLINE_SPIN_UNLOCK_IRQRESTORE
 	def_bool y
@@ -175,7 +175,7 @@ config INLINE_READ_UNLOCK_BH
 
 config INLINE_READ_UNLOCK_IRQ
 	def_bool y
-	depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_BH
+	depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ
 
 config INLINE_READ_UNLOCK_IRQRESTORE
 	def_bool y
@@ -212,7 +212,7 @@ config INLINE_WRITE_UNLOCK_BH
 
 config INLINE_WRITE_UNLOCK_IRQ
 	def_bool y
-	depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH
+	depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ
 
 config INLINE_WRITE_UNLOCK_IRQRESTORE
 	def_bool y
diff --git a/kernel/Makefile b/kernel/Makefile
index 271fd3119af9..35ef1185e359 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,14 +2,14 @@
 # Makefile for the linux kernel.
 #
 
-obj-y = fork.o exec_domain.o panic.o printk.o \
+obj-y = fork.o exec_domain.o panic.o \
 	    cpu.o exit.o itimer.o time.o softirq.o resource.o \
 	    sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
-	    notifier.o ksysfs.o cred.o \
+	    notifier.o ksysfs.o cred.o reboot.o \
 	    async.o range.o groups.o lglock.o smpboot.o
 
 ifdef CONFIG_FUNCTION_TRACER
@@ -24,6 +24,7 @@ endif
 
 obj-y += sched/
 obj-y += power/
+obj-y += printk/
 obj-y += cpu/
 
 obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
diff --git a/kernel/audit.h b/kernel/audit.h
index 1c95131ef760..123c9b7c3979 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -85,6 +85,7 @@ struct audit_names {
 
 	struct filename	*name;
 	int		name_len;	/* number of chars to log */
+	bool		hidden;		/* don't log this record */
 	bool		name_put;	/* call __putname()? */
 
 	unsigned long	ino;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 6bd4a90d1991..f7aee8be7fb2 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -423,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		f->lsm_rule = NULL;
 
 		/* Support legacy tests for a valid loginuid */
-		if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) {
+		if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) {
 			f->type = AUDIT_LOGINUID_SET;
 			f->val = 0;
 		}
@@ -865,6 +865,12 @@ static inline int audit_add_rule(struct audit_entry *entry)
 		err = audit_add_watch(&entry->rule, &list);
 		if (err) {
 			mutex_unlock(&audit_filter_mutex);
+			/*
+			 * normally audit_add_tree_rule() will free it
+			 * on failure
+			 */
+			if (tree)
+				audit_put_tree(tree);
 			goto error;
 		}
 	}
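
The legacy-loginuid hunk above swaps the literal 4294967295 for ~0U; both spell the same all-ones 32-bit value that old userspace passes for an unset loginuid. A stand-alone sketch (plain userspace C, not from this merge) confirming the equivalence on targets where unsigned int is 32 bits:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t legacy_val = 4294967295U;	/* what old userspace sends */

	/* ~0U is the same all-ones bit pattern, without the magic constant */
	assert(legacy_val == ~0U);
	assert(legacy_val == (uint32_t)-1);

	printf("0x%08x matches ~0U\n", legacy_val);
	return 0;
}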
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3c8a601324a2..9845cb32b60a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1399,8 +1399,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	}
 
 	i = 0;
-	list_for_each_entry(n, &context->names_list, list)
+	list_for_each_entry(n, &context->names_list, list) {
+		if (n->hidden)
+			continue;
 		audit_log_name(context, n, NULL, i++, &call_panic);
+	}
 
 	/* Send end of event record to help user space know we are finished */
 	ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1769,14 +1772,15 @@ void audit_putname(struct filename *name)
  * __audit_inode - store the inode and device from a lookup
  * @name: name being audited
  * @dentry: dentry being audited
- * @parent: does this dentry represent the parent?
+ * @flags: attributes for this particular entry
  */
 void __audit_inode(struct filename *name, const struct dentry *dentry,
-		   unsigned int parent)
+		   unsigned int flags)
 {
 	struct audit_context *context = current->audit_context;
 	const struct inode *inode = dentry->d_inode;
 	struct audit_names *n;
+	bool parent = flags & AUDIT_INODE_PARENT;
 
 	if (!context->in_syscall)
 		return;
@@ -1831,6 +1835,8 @@ out:
 	if (parent) {
 		n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
 		n->type = AUDIT_TYPE_PARENT;
+		if (flags & AUDIT_INODE_HIDDEN)
+			n->hidden = true;
 	} else {
 		n->name_len = AUDIT_NAME_FULL;
 		n->type = AUDIT_TYPE_NORMAL;
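
The auditsc.c hunks above replace __audit_inode()'s boolean parent argument with a flags word carrying AUDIT_INODE_PARENT and AUDIT_INODE_HIDDEN. A stand-alone sketch (userspace C; the flag values here are illustrative, the real definitions live in include/linux/audit.h) of the calling convention this establishes:

#include <stdbool.h>
#include <stdio.h>

#define AUDIT_INODE_PARENT  (1U << 0)	/* record describes the parent directory */
#define AUDIT_INODE_HIDDEN  (1U << 1)	/* parent record that should not be logged */

static void audit_inode_sketch(const char *name, unsigned int flags)
{
	/* same extraction the patch adds at the top of __audit_inode() */
	bool parent = flags & AUDIT_INODE_PARENT;
	bool hidden = parent && (flags & AUDIT_INODE_HIDDEN);

	printf("%-14s parent=%d hidden=%d\n", name, parent, hidden);
}

int main(void)
{
	audit_inode_sketch("normal", 0);
	audit_inode_sketch("parent", AUDIT_INODE_PARENT);
	audit_inode_sketch("hidden parent", AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN);
	return 0;
}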
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a7c9e6ddb979..781845a013ab 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,9 +63,6 @@
63 63
64#include <linux/atomic.h> 64#include <linux/atomic.h>
65 65
66/* css deactivation bias, makes css->refcnt negative to deny new trygets */
67#define CSS_DEACT_BIAS INT_MIN
68
69/* 66/*
70 * cgroup_mutex is the master lock. Any modification to cgroup or its 67 * cgroup_mutex is the master lock. Any modification to cgroup or its
71 * hierarchy must be performed while holding it. 68 * hierarchy must be performed while holding it.
@@ -99,16 +96,19 @@ static DEFINE_MUTEX(cgroup_root_mutex);
99 */ 96 */
100#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, 97#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
101#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) 98#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
102static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { 99static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = {
103#include <linux/cgroup_subsys.h> 100#include <linux/cgroup_subsys.h>
104}; 101};
105 102
106/* 103/*
107 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the 104 * The dummy hierarchy, reserved for the subsystems that are otherwise
108 * subsystems that are otherwise unattached - it never has more than a 105 * unattached - it never has more than a single cgroup, and all tasks are
109 * single cgroup, and all tasks are part of that cgroup. 106 * part of that cgroup.
110 */ 107 */
111static struct cgroupfs_root rootnode; 108static struct cgroupfs_root cgroup_dummy_root;
109
110/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
111static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
112 112
113/* 113/*
114 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. 114 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
@@ -186,18 +186,28 @@ struct cgroup_event {
186 186
187/* The list of hierarchy roots */ 187/* The list of hierarchy roots */
188 188
189static LIST_HEAD(roots); 189static LIST_HEAD(cgroup_roots);
190static int root_count; 190static int cgroup_root_count;
191 191
192static DEFINE_IDA(hierarchy_ida); 192/*
193static int next_hierarchy_id; 193 * Hierarchy ID allocation and mapping. It follows the same exclusion
194static DEFINE_SPINLOCK(hierarchy_id_lock); 194 * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
195 195 * writes, either for reads.
196/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 196 */
197#define dummytop (&rootnode.top_cgroup) 197static DEFINE_IDR(cgroup_hierarchy_idr);
198 198
199static struct cgroup_name root_cgroup_name = { .name = "/" }; 199static struct cgroup_name root_cgroup_name = { .name = "/" };
200 200
201/*
202 * Assign a monotonically increasing serial number to cgroups. It
203 * guarantees cgroups with bigger numbers are newer than those with smaller
204 * numbers. Also, as cgroups are always appended to the parent's
205 * ->children list, it guarantees that sibling cgroups are always sorted in
206 * the ascending serial number order on the list. Protected by
207 * cgroup_mutex.
208 */
209static u64 cgroup_serial_nr_next = 1;
210
201/* This flag indicates whether tasks in the fork and exit paths should 211/* This flag indicates whether tasks in the fork and exit paths should
202 * check for fork/exit handlers to call. This avoids us having to do 212 * check for fork/exit handlers to call. This avoids us having to do
203 * extra work in the fork/exit path if none of the subsystems need to 213 * extra work in the fork/exit path if none of the subsystems need to
@@ -205,27 +215,15 @@ static struct cgroup_name root_cgroup_name = { .name = "/" };
205 */ 215 */
206static int need_forkexit_callback __read_mostly; 216static int need_forkexit_callback __read_mostly;
207 217
218static void cgroup_offline_fn(struct work_struct *work);
208static int cgroup_destroy_locked(struct cgroup *cgrp); 219static int cgroup_destroy_locked(struct cgroup *cgrp);
209static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 220static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
210 struct cftype cfts[], bool is_add); 221 struct cftype cfts[], bool is_add);
211 222
212static int css_unbias_refcnt(int refcnt)
213{
214 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
215}
216
217/* the current nr of refs, always >= 0 whether @css is deactivated or not */
218static int css_refcnt(struct cgroup_subsys_state *css)
219{
220 int v = atomic_read(&css->refcnt);
221
222 return css_unbias_refcnt(v);
223}
224
225/* convenient tests for these bits */ 223/* convenient tests for these bits */
226inline int cgroup_is_removed(const struct cgroup *cgrp) 224static inline bool cgroup_is_dead(const struct cgroup *cgrp)
227{ 225{
228 return test_bit(CGRP_REMOVED, &cgrp->flags); 226 return test_bit(CGRP_DEAD, &cgrp->flags);
229} 227}
230 228
231/** 229/**
@@ -261,16 +259,38 @@ static int notify_on_release(const struct cgroup *cgrp)
261 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 259 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
262} 260}
263 261
264/* 262/**
265 * for_each_subsys() allows you to iterate on each subsystem attached to 263 * for_each_subsys - iterate all loaded cgroup subsystems
266 * an active hierarchy 264 * @ss: the iteration cursor
265 * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
266 *
267 * Should be called under cgroup_mutex.
267 */ 268 */
268#define for_each_subsys(_root, _ss) \ 269#define for_each_subsys(ss, i) \
269list_for_each_entry(_ss, &_root->subsys_list, sibling) 270 for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \
271 if (({ lockdep_assert_held(&cgroup_mutex); \
272 !((ss) = cgroup_subsys[i]); })) { } \
273 else
270 274
271/* for_each_active_root() allows you to iterate across the active hierarchies */ 275/**
272#define for_each_active_root(_root) \ 276 * for_each_builtin_subsys - iterate all built-in cgroup subsystems
273list_for_each_entry(_root, &roots, root_list) 277 * @ss: the iteration cursor
278 * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
279 *
280 * Bulit-in subsystems are always present and iteration itself doesn't
281 * require any synchronization.
282 */
283#define for_each_builtin_subsys(ss, i) \
284 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
285 (((ss) = cgroup_subsys[i]) || true); (i)++)
286
287/* iterate each subsystem attached to a hierarchy */
288#define for_each_root_subsys(root, ss) \
289 list_for_each_entry((ss), &(root)->subsys_list, sibling)
290
291/* iterate across the active hierarchies */
292#define for_each_active_root(root) \
293 list_for_each_entry((root), &cgroup_roots, root_list)
274 294
275static inline struct cgroup *__d_cgrp(struct dentry *dentry) 295static inline struct cgroup *__d_cgrp(struct dentry *dentry)
276{ 296{
@@ -297,7 +317,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
297static bool cgroup_lock_live_group(struct cgroup *cgrp) 317static bool cgroup_lock_live_group(struct cgroup *cgrp)
298{ 318{
299 mutex_lock(&cgroup_mutex); 319 mutex_lock(&cgroup_mutex);
300 if (cgroup_is_removed(cgrp)) { 320 if (cgroup_is_dead(cgrp)) {
301 mutex_unlock(&cgroup_mutex); 321 mutex_unlock(&cgroup_mutex);
302 return false; 322 return false;
303 } 323 }
@@ -312,20 +332,24 @@ static void cgroup_release_agent(struct work_struct *work);
312static DECLARE_WORK(release_agent_work, cgroup_release_agent); 332static DECLARE_WORK(release_agent_work, cgroup_release_agent);
313static void check_for_release(struct cgroup *cgrp); 333static void check_for_release(struct cgroup *cgrp);
314 334
315/* Link structure for associating css_set objects with cgroups */ 335/*
316struct cg_cgroup_link { 336 * A cgroup can be associated with multiple css_sets as different tasks may
317 /* 337 * belong to different cgroups on different hierarchies. In the other
318 * List running through cg_cgroup_links associated with a 338 * direction, a css_set is naturally associated with multiple cgroups.
319 * cgroup, anchored on cgroup->css_sets 339 * This M:N relationship is represented by the following link structure
320 */ 340 * which exists for each association and allows traversing the associations
321 struct list_head cgrp_link_list; 341 * from both sides.
322 struct cgroup *cgrp; 342 */
323 /* 343struct cgrp_cset_link {
324 * List running through cg_cgroup_links pointing at a 344 /* the cgroup and css_set this link associates */
325 * single css_set object, anchored on css_set->cg_links 345 struct cgroup *cgrp;
326 */ 346 struct css_set *cset;
327 struct list_head cg_link_list; 347
328 struct css_set *cg; 348 /* list of cgrp_cset_links anchored at cgrp->cset_links */
349 struct list_head cset_link;
350
351 /* list of cgrp_cset_links anchored at css_set->cgrp_links */
352 struct list_head cgrp_link;
329}; 353};
330 354
331/* The default css_set - used by init and its children prior to any 355/* The default css_set - used by init and its children prior to any
@@ -336,7 +360,7 @@ struct cg_cgroup_link {
336 */ 360 */
337 361
338static struct css_set init_css_set; 362static struct css_set init_css_set;
339static struct cg_cgroup_link init_css_set_link; 363static struct cgrp_cset_link init_cgrp_cset_link;
340 364
341static int cgroup_init_idr(struct cgroup_subsys *ss, 365static int cgroup_init_idr(struct cgroup_subsys *ss,
342 struct cgroup_subsys_state *css); 366 struct cgroup_subsys_state *css);
@@ -357,10 +381,11 @@ static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
357 381
358static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) 382static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
359{ 383{
360 int i;
361 unsigned long key = 0UL; 384 unsigned long key = 0UL;
385 struct cgroup_subsys *ss;
386 int i;
362 387
363 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) 388 for_each_subsys(ss, i)
364 key += (unsigned long)css[i]; 389 key += (unsigned long)css[i];
365 key = (key >> 16) ^ key; 390 key = (key >> 16) ^ key;
366 391
@@ -373,90 +398,83 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
373 * compiled into their kernel but not actually in use */ 398 * compiled into their kernel but not actually in use */
374static int use_task_css_set_links __read_mostly; 399static int use_task_css_set_links __read_mostly;
375 400
376static void __put_css_set(struct css_set *cg, int taskexit) 401static void __put_css_set(struct css_set *cset, int taskexit)
377{ 402{
378 struct cg_cgroup_link *link; 403 struct cgrp_cset_link *link, *tmp_link;
379 struct cg_cgroup_link *saved_link; 404
380 /* 405 /*
381 * Ensure that the refcount doesn't hit zero while any readers 406 * Ensure that the refcount doesn't hit zero while any readers
382 * can see it. Similar to atomic_dec_and_lock(), but for an 407 * can see it. Similar to atomic_dec_and_lock(), but for an
383 * rwlock 408 * rwlock
384 */ 409 */
385 if (atomic_add_unless(&cg->refcount, -1, 1)) 410 if (atomic_add_unless(&cset->refcount, -1, 1))
386 return; 411 return;
387 write_lock(&css_set_lock); 412 write_lock(&css_set_lock);
388 if (!atomic_dec_and_test(&cg->refcount)) { 413 if (!atomic_dec_and_test(&cset->refcount)) {
389 write_unlock(&css_set_lock); 414 write_unlock(&css_set_lock);
390 return; 415 return;
391 } 416 }
392 417
393 /* This css_set is dead. unlink it and release cgroup refcounts */ 418 /* This css_set is dead. unlink it and release cgroup refcounts */
394 hash_del(&cg->hlist); 419 hash_del(&cset->hlist);
395 css_set_count--; 420 css_set_count--;
396 421
397 list_for_each_entry_safe(link, saved_link, &cg->cg_links, 422 list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
398 cg_link_list) {
399 struct cgroup *cgrp = link->cgrp; 423 struct cgroup *cgrp = link->cgrp;
400 list_del(&link->cg_link_list);
401 list_del(&link->cgrp_link_list);
402 424
403 /* 425 list_del(&link->cset_link);
404 * We may not be holding cgroup_mutex, and if cgrp->count is 426 list_del(&link->cgrp_link);
405 * dropped to 0 the cgroup can be destroyed at any time, hence 427
406 * rcu_read_lock is used to keep it alive. 428 /* @cgrp can't go away while we're holding css_set_lock */
407 */ 429 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
408 rcu_read_lock();
409 if (atomic_dec_and_test(&cgrp->count) &&
410 notify_on_release(cgrp)) {
411 if (taskexit) 430 if (taskexit)
412 set_bit(CGRP_RELEASABLE, &cgrp->flags); 431 set_bit(CGRP_RELEASABLE, &cgrp->flags);
413 check_for_release(cgrp); 432 check_for_release(cgrp);
414 } 433 }
415 rcu_read_unlock();
416 434
417 kfree(link); 435 kfree(link);
418 } 436 }
419 437
420 write_unlock(&css_set_lock); 438 write_unlock(&css_set_lock);
421 kfree_rcu(cg, rcu_head); 439 kfree_rcu(cset, rcu_head);
422} 440}
423 441
424/* 442/*
425 * refcounted get/put for css_set objects 443 * refcounted get/put for css_set objects
426 */ 444 */
427static inline void get_css_set(struct css_set *cg) 445static inline void get_css_set(struct css_set *cset)
428{ 446{
429 atomic_inc(&cg->refcount); 447 atomic_inc(&cset->refcount);
430} 448}
431 449
432static inline void put_css_set(struct css_set *cg) 450static inline void put_css_set(struct css_set *cset)
433{ 451{
434 __put_css_set(cg, 0); 452 __put_css_set(cset, 0);
435} 453}
436 454
437static inline void put_css_set_taskexit(struct css_set *cg) 455static inline void put_css_set_taskexit(struct css_set *cset)
438{ 456{
439 __put_css_set(cg, 1); 457 __put_css_set(cset, 1);
440} 458}
441 459
442/* 460/**
443 * compare_css_sets - helper function for find_existing_css_set(). 461 * compare_css_sets - helper function for find_existing_css_set().
444 * @cg: candidate css_set being tested 462 * @cset: candidate css_set being tested
445 * @old_cg: existing css_set for a task 463 * @old_cset: existing css_set for a task
446 * @new_cgrp: cgroup that's being entered by the task 464 * @new_cgrp: cgroup that's being entered by the task
447 * @template: desired set of css pointers in css_set (pre-calculated) 465 * @template: desired set of css pointers in css_set (pre-calculated)
448 * 466 *
449 * Returns true if "cg" matches "old_cg" except for the hierarchy 467 * Returns true if "cg" matches "old_cg" except for the hierarchy
450 * which "new_cgrp" belongs to, for which it should match "new_cgrp". 468 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
451 */ 469 */
452static bool compare_css_sets(struct css_set *cg, 470static bool compare_css_sets(struct css_set *cset,
453 struct css_set *old_cg, 471 struct css_set *old_cset,
454 struct cgroup *new_cgrp, 472 struct cgroup *new_cgrp,
455 struct cgroup_subsys_state *template[]) 473 struct cgroup_subsys_state *template[])
456{ 474{
457 struct list_head *l1, *l2; 475 struct list_head *l1, *l2;
458 476
459 if (memcmp(template, cg->subsys, sizeof(cg->subsys))) { 477 if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
460 /* Not all subsystems matched */ 478 /* Not all subsystems matched */
461 return false; 479 return false;
462 } 480 }
@@ -470,28 +488,28 @@ static bool compare_css_sets(struct css_set *cg,
470 * candidates. 488 * candidates.
471 */ 489 */
472 490
473 l1 = &cg->cg_links; 491 l1 = &cset->cgrp_links;
474 l2 = &old_cg->cg_links; 492 l2 = &old_cset->cgrp_links;
475 while (1) { 493 while (1) {
476 struct cg_cgroup_link *cgl1, *cgl2; 494 struct cgrp_cset_link *link1, *link2;
477 struct cgroup *cg1, *cg2; 495 struct cgroup *cgrp1, *cgrp2;
478 496
479 l1 = l1->next; 497 l1 = l1->next;
480 l2 = l2->next; 498 l2 = l2->next;
481 /* See if we reached the end - both lists are equal length. */ 499 /* See if we reached the end - both lists are equal length. */
482 if (l1 == &cg->cg_links) { 500 if (l1 == &cset->cgrp_links) {
483 BUG_ON(l2 != &old_cg->cg_links); 501 BUG_ON(l2 != &old_cset->cgrp_links);
484 break; 502 break;
485 } else { 503 } else {
486 BUG_ON(l2 == &old_cg->cg_links); 504 BUG_ON(l2 == &old_cset->cgrp_links);
487 } 505 }
488 /* Locate the cgroups associated with these links. */ 506 /* Locate the cgroups associated with these links. */
489 cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); 507 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
490 cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); 508 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
491 cg1 = cgl1->cgrp; 509 cgrp1 = link1->cgrp;
492 cg2 = cgl2->cgrp; 510 cgrp2 = link2->cgrp;
493 /* Hierarchies should be linked in the same order. */ 511 /* Hierarchies should be linked in the same order. */
494 BUG_ON(cg1->root != cg2->root); 512 BUG_ON(cgrp1->root != cgrp2->root);
495 513
496 /* 514 /*
497 * If this hierarchy is the hierarchy of the cgroup 515 * If this hierarchy is the hierarchy of the cgroup
@@ -500,46 +518,39 @@ static bool compare_css_sets(struct css_set *cg,
500 * hierarchy, then this css_set should point to the 518 * hierarchy, then this css_set should point to the
501 * same cgroup as the old css_set. 519 * same cgroup as the old css_set.
502 */ 520 */
503 if (cg1->root == new_cgrp->root) { 521 if (cgrp1->root == new_cgrp->root) {
504 if (cg1 != new_cgrp) 522 if (cgrp1 != new_cgrp)
505 return false; 523 return false;
506 } else { 524 } else {
507 if (cg1 != cg2) 525 if (cgrp1 != cgrp2)
508 return false; 526 return false;
509 } 527 }
510 } 528 }
511 return true; 529 return true;
512} 530}
513 531
514/* 532/**
515 * find_existing_css_set() is a helper for 533 * find_existing_css_set - init css array and find the matching css_set
516 * find_css_set(), and checks to see whether an existing 534 * @old_cset: the css_set that we're using before the cgroup transition
517 * css_set is suitable. 535 * @cgrp: the cgroup that we're moving into
518 * 536 * @template: out param for the new set of csses, should be clear on entry
519 * oldcg: the cgroup group that we're using before the cgroup
520 * transition
521 *
522 * cgrp: the cgroup that we're moving into
523 *
524 * template: location in which to build the desired set of subsystem
525 * state objects for the new cgroup group
526 */ 537 */
527static struct css_set *find_existing_css_set( 538static struct css_set *find_existing_css_set(struct css_set *old_cset,
528 struct css_set *oldcg, 539 struct cgroup *cgrp,
529 struct cgroup *cgrp, 540 struct cgroup_subsys_state *template[])
530 struct cgroup_subsys_state *template[])
531{ 541{
532 int i;
533 struct cgroupfs_root *root = cgrp->root; 542 struct cgroupfs_root *root = cgrp->root;
534 struct css_set *cg; 543 struct cgroup_subsys *ss;
544 struct css_set *cset;
535 unsigned long key; 545 unsigned long key;
546 int i;
536 547
537 /* 548 /*
538 * Build the set of subsystem state objects that we want to see in the 549 * Build the set of subsystem state objects that we want to see in the
539 * new css_set. while subsystems can change globally, the entries here 550 * new css_set. while subsystems can change globally, the entries here
540 * won't change, so no need for locking. 551 * won't change, so no need for locking.
541 */ 552 */
542 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 553 for_each_subsys(ss, i) {
543 if (root->subsys_mask & (1UL << i)) { 554 if (root->subsys_mask & (1UL << i)) {
544 /* Subsystem is in this hierarchy. So we want 555 /* Subsystem is in this hierarchy. So we want
545 * the subsystem state from the new 556 * the subsystem state from the new
@@ -548,148 +559,152 @@ static struct css_set *find_existing_css_set(
548 } else { 559 } else {
549 /* Subsystem is not in this hierarchy, so we 560 /* Subsystem is not in this hierarchy, so we
550 * don't want to change the subsystem state */ 561 * don't want to change the subsystem state */
551 template[i] = oldcg->subsys[i]; 562 template[i] = old_cset->subsys[i];
552 } 563 }
553 } 564 }
554 565
555 key = css_set_hash(template); 566 key = css_set_hash(template);
556 hash_for_each_possible(css_set_table, cg, hlist, key) { 567 hash_for_each_possible(css_set_table, cset, hlist, key) {
557 if (!compare_css_sets(cg, oldcg, cgrp, template)) 568 if (!compare_css_sets(cset, old_cset, cgrp, template))
558 continue; 569 continue;
559 570
560 /* This css_set matches what we need */ 571 /* This css_set matches what we need */
561 return cg; 572 return cset;
562 } 573 }
563 574
564 /* No existing cgroup group matched */ 575 /* No existing cgroup group matched */
565 return NULL; 576 return NULL;
566} 577}
567 578
568static void free_cg_links(struct list_head *tmp) 579static void free_cgrp_cset_links(struct list_head *links_to_free)
569{ 580{
570 struct cg_cgroup_link *link; 581 struct cgrp_cset_link *link, *tmp_link;
571 struct cg_cgroup_link *saved_link;
572 582
573 list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { 583 list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
574 list_del(&link->cgrp_link_list); 584 list_del(&link->cset_link);
575 kfree(link); 585 kfree(link);
576 } 586 }
577} 587}
578 588
579/* 589/**
580 * allocate_cg_links() allocates "count" cg_cgroup_link structures 590 * allocate_cgrp_cset_links - allocate cgrp_cset_links
581 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on 591 * @count: the number of links to allocate
582 * success or a negative error 592 * @tmp_links: list_head the allocated links are put on
593 *
594 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
595 * through ->cset_link. Returns 0 on success or -errno.
583 */ 596 */
584static int allocate_cg_links(int count, struct list_head *tmp) 597static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
585{ 598{
586 struct cg_cgroup_link *link; 599 struct cgrp_cset_link *link;
587 int i; 600 int i;
588 INIT_LIST_HEAD(tmp); 601
602 INIT_LIST_HEAD(tmp_links);
603
589 for (i = 0; i < count; i++) { 604 for (i = 0; i < count; i++) {
590 link = kmalloc(sizeof(*link), GFP_KERNEL); 605 link = kzalloc(sizeof(*link), GFP_KERNEL);
591 if (!link) { 606 if (!link) {
592 free_cg_links(tmp); 607 free_cgrp_cset_links(tmp_links);
593 return -ENOMEM; 608 return -ENOMEM;
594 } 609 }
595 list_add(&link->cgrp_link_list, tmp); 610 list_add(&link->cset_link, tmp_links);
596 } 611 }
597 return 0; 612 return 0;
598} 613}
599 614
600/** 615/**
601 * link_css_set - a helper function to link a css_set to a cgroup 616 * link_css_set - a helper function to link a css_set to a cgroup
602 * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() 617 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
603 * @cg: the css_set to be linked 618 * @cset: the css_set to be linked
604 * @cgrp: the destination cgroup 619 * @cgrp: the destination cgroup
605 */ 620 */
606static void link_css_set(struct list_head *tmp_cg_links, 621static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
607 struct css_set *cg, struct cgroup *cgrp) 622 struct cgroup *cgrp)
608{ 623{
609 struct cg_cgroup_link *link; 624 struct cgrp_cset_link *link;
610 625
611 BUG_ON(list_empty(tmp_cg_links)); 626 BUG_ON(list_empty(tmp_links));
612 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, 627 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
613 cgrp_link_list); 628 link->cset = cset;
614 link->cg = cg;
615 link->cgrp = cgrp; 629 link->cgrp = cgrp;
616 atomic_inc(&cgrp->count); 630 list_move(&link->cset_link, &cgrp->cset_links);
617 list_move(&link->cgrp_link_list, &cgrp->css_sets);
618 /* 631 /*
619 * Always add links to the tail of the list so that the list 632 * Always add links to the tail of the list so that the list
620 * is sorted by order of hierarchy creation 633 * is sorted by order of hierarchy creation
621 */ 634 */
622 list_add_tail(&link->cg_link_list, &cg->cg_links); 635 list_add_tail(&link->cgrp_link, &cset->cgrp_links);
623} 636}
624 637
625/* 638/**
626 * find_css_set() takes an existing cgroup group and a 639 * find_css_set - return a new css_set with one cgroup updated
627 * cgroup object, and returns a css_set object that's 640 * @old_cset: the baseline css_set
628 * equivalent to the old group, but with the given cgroup 641 * @cgrp: the cgroup to be updated
629 * substituted into the appropriate hierarchy. Must be called with 642 *
630 * cgroup_mutex held 643 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
644 * substituted into the appropriate hierarchy.
631 */ 645 */
632static struct css_set *find_css_set( 646static struct css_set *find_css_set(struct css_set *old_cset,
633 struct css_set *oldcg, struct cgroup *cgrp) 647 struct cgroup *cgrp)
634{ 648{
635 struct css_set *res; 649 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
636 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 650 struct css_set *cset;
637 651 struct list_head tmp_links;
638 struct list_head tmp_cg_links; 652 struct cgrp_cset_link *link;
639
640 struct cg_cgroup_link *link;
641 unsigned long key; 653 unsigned long key;
642 654
655 lockdep_assert_held(&cgroup_mutex);
656
643 /* First see if we already have a cgroup group that matches 657 /* First see if we already have a cgroup group that matches
644 * the desired set */ 658 * the desired set */
645 read_lock(&css_set_lock); 659 read_lock(&css_set_lock);
646 res = find_existing_css_set(oldcg, cgrp, template); 660 cset = find_existing_css_set(old_cset, cgrp, template);
647 if (res) 661 if (cset)
648 get_css_set(res); 662 get_css_set(cset);
649 read_unlock(&css_set_lock); 663 read_unlock(&css_set_lock);
650 664
651 if (res) 665 if (cset)
652 return res; 666 return cset;
653 667
654 res = kmalloc(sizeof(*res), GFP_KERNEL); 668 cset = kzalloc(sizeof(*cset), GFP_KERNEL);
655 if (!res) 669 if (!cset)
656 return NULL; 670 return NULL;
657 671
658 /* Allocate all the cg_cgroup_link objects that we'll need */ 672 /* Allocate all the cgrp_cset_link objects that we'll need */
659 if (allocate_cg_links(root_count, &tmp_cg_links) < 0) { 673 if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
660 kfree(res); 674 kfree(cset);
661 return NULL; 675 return NULL;
662 } 676 }
663 677
664 atomic_set(&res->refcount, 1); 678 atomic_set(&cset->refcount, 1);
665 INIT_LIST_HEAD(&res->cg_links); 679 INIT_LIST_HEAD(&cset->cgrp_links);
666 INIT_LIST_HEAD(&res->tasks); 680 INIT_LIST_HEAD(&cset->tasks);
667 INIT_HLIST_NODE(&res->hlist); 681 INIT_HLIST_NODE(&cset->hlist);
668 682
669 /* Copy the set of subsystem state objects generated in 683 /* Copy the set of subsystem state objects generated in
670 * find_existing_css_set() */ 684 * find_existing_css_set() */
671 memcpy(res->subsys, template, sizeof(res->subsys)); 685 memcpy(cset->subsys, template, sizeof(cset->subsys));
672 686
673 write_lock(&css_set_lock); 687 write_lock(&css_set_lock);
674 /* Add reference counts and links from the new css_set. */ 688 /* Add reference counts and links from the new css_set. */
675 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { 689 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
676 struct cgroup *c = link->cgrp; 690 struct cgroup *c = link->cgrp;
691
677 if (c->root == cgrp->root) 692 if (c->root == cgrp->root)
678 c = cgrp; 693 c = cgrp;
679 link_css_set(&tmp_cg_links, res, c); 694 link_css_set(&tmp_links, cset, c);
680 } 695 }
681 696
682 BUG_ON(!list_empty(&tmp_cg_links)); 697 BUG_ON(!list_empty(&tmp_links));
683 698
684 css_set_count++; 699 css_set_count++;
685 700
686 /* Add this cgroup group to the hash table */ 701 /* Add this cgroup group to the hash table */
687 key = css_set_hash(res->subsys); 702 key = css_set_hash(cset->subsys);
688 hash_add(css_set_table, &res->hlist, key); 703 hash_add(css_set_table, &cset->hlist, key);
689 704
690 write_unlock(&css_set_lock); 705 write_unlock(&css_set_lock);
691 706
692 return res; 707 return cset;
693} 708}
694 709
695/* 710/*
@@ -699,7 +714,7 @@ static struct css_set *find_css_set(
699static struct cgroup *task_cgroup_from_root(struct task_struct *task, 714static struct cgroup *task_cgroup_from_root(struct task_struct *task,
700 struct cgroupfs_root *root) 715 struct cgroupfs_root *root)
701{ 716{
702 struct css_set *css; 717 struct css_set *cset;
703 struct cgroup *res = NULL; 718 struct cgroup *res = NULL;
704 719
705 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 720 BUG_ON(!mutex_is_locked(&cgroup_mutex));
@@ -709,13 +724,15 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
709 * task can't change groups, so the only thing that can happen 724 * task can't change groups, so the only thing that can happen
710 * is that it exits and its css is set back to init_css_set. 725 * is that it exits and its css is set back to init_css_set.
711 */ 726 */
712 css = task->cgroups; 727 cset = task_css_set(task);
713 if (css == &init_css_set) { 728 if (cset == &init_css_set) {
714 res = &root->top_cgroup; 729 res = &root->top_cgroup;
715 } else { 730 } else {
716 struct cg_cgroup_link *link; 731 struct cgrp_cset_link *link;
717 list_for_each_entry(link, &css->cg_links, cg_link_list) { 732
733 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
718 struct cgroup *c = link->cgrp; 734 struct cgroup *c = link->cgrp;
735
719 if (c->root == root) { 736 if (c->root == root) {
720 res = c; 737 res = c;
721 break; 738 break;
@@ -785,7 +802,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
785 */ 802 */
786 803
787static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); 804static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
788static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
789static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 805static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
790static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 806static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
791 unsigned long subsys_mask); 807 unsigned long subsys_mask);
@@ -828,14 +844,14 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
828 844
829static void cgroup_free_fn(struct work_struct *work) 845static void cgroup_free_fn(struct work_struct *work)
830{ 846{
831 struct cgroup *cgrp = container_of(work, struct cgroup, free_work); 847 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
832 struct cgroup_subsys *ss; 848 struct cgroup_subsys *ss;
833 849
834 mutex_lock(&cgroup_mutex); 850 mutex_lock(&cgroup_mutex);
835 /* 851 /*
836 * Release the subsystem state objects. 852 * Release the subsystem state objects.
837 */ 853 */
838 for_each_subsys(cgrp->root, ss) 854 for_each_root_subsys(cgrp->root, ss)
839 ss->css_free(cgrp); 855 ss->css_free(cgrp);
840 856
841 cgrp->root->number_of_cgroups--; 857 cgrp->root->number_of_cgroups--;
@@ -873,7 +889,8 @@ static void cgroup_free_rcu(struct rcu_head *head)
873{ 889{
874 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); 890 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
875 891
876 schedule_work(&cgrp->free_work); 892 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
893 schedule_work(&cgrp->destroy_work);
877} 894}
878 895
879static void cgroup_diput(struct dentry *dentry, struct inode *inode) 896static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -882,7 +899,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
882 if (S_ISDIR(inode->i_mode)) { 899 if (S_ISDIR(inode->i_mode)) {
883 struct cgroup *cgrp = dentry->d_fsdata; 900 struct cgroup *cgrp = dentry->d_fsdata;
884 901
885 BUG_ON(!(cgroup_is_removed(cgrp))); 902 BUG_ON(!(cgroup_is_dead(cgrp)));
886 call_rcu(&cgrp->rcu_head, cgroup_free_rcu); 903 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
887 } else { 904 } else {
888 struct cfent *cfe = __d_cfe(dentry); 905 struct cfent *cfe = __d_cfe(dentry);
@@ -950,7 +967,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
950 struct cgroup *cgrp = __d_cgrp(dir); 967 struct cgroup *cgrp = __d_cgrp(dir);
951 struct cgroup_subsys *ss; 968 struct cgroup_subsys *ss;
952 969
953 for_each_subsys(cgrp->root, ss) { 970 for_each_root_subsys(cgrp->root, ss) {
954 struct cftype_set *set; 971 struct cftype_set *set;
955 if (!test_bit(ss->subsys_id, &subsys_mask)) 972 if (!test_bit(ss->subsys_id, &subsys_mask))
956 continue; 973 continue;
@@ -988,30 +1005,23 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
988 * returns an error, no reference counts are touched. 1005 * returns an error, no reference counts are touched.
989 */ 1006 */
990static int rebind_subsystems(struct cgroupfs_root *root, 1007static int rebind_subsystems(struct cgroupfs_root *root,
991 unsigned long final_subsys_mask) 1008 unsigned long added_mask, unsigned removed_mask)
992{ 1009{
993 unsigned long added_mask, removed_mask;
994 struct cgroup *cgrp = &root->top_cgroup; 1010 struct cgroup *cgrp = &root->top_cgroup;
1011 struct cgroup_subsys *ss;
995 int i; 1012 int i;
996 1013
997 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1014 BUG_ON(!mutex_is_locked(&cgroup_mutex));
998 BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 1015 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
999 1016
1000 removed_mask = root->actual_subsys_mask & ~final_subsys_mask;
1001 added_mask = final_subsys_mask & ~root->actual_subsys_mask;
1002 /* Check that any added subsystems are currently free */ 1017 /* Check that any added subsystems are currently free */
1003 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1018 for_each_subsys(ss, i) {
1004 unsigned long bit = 1UL << i; 1019 unsigned long bit = 1UL << i;
1005 struct cgroup_subsys *ss = subsys[i]; 1020
1006 if (!(bit & added_mask)) 1021 if (!(bit & added_mask))
1007 continue; 1022 continue;
1008 /* 1023
1009 * Nobody should tell us to do a subsys that doesn't exist: 1024 if (ss->root != &cgroup_dummy_root) {
1010 * parse_cgroupfs_options should catch that case and refcounts
1011 * ensure that subsystems won't disappear once selected.
1012 */
1013 BUG_ON(ss == NULL);
1014 if (ss->root != &rootnode) {
1015 /* Subsystem isn't free */ 1025 /* Subsystem isn't free */
1016 return -EBUSY; 1026 return -EBUSY;
1017 } 1027 }
@@ -1025,38 +1035,41 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1025 return -EBUSY; 1035 return -EBUSY;
1026 1036
1027 /* Process each subsystem */ 1037 /* Process each subsystem */
1028 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1038 for_each_subsys(ss, i) {
1029 struct cgroup_subsys *ss = subsys[i];
1030 unsigned long bit = 1UL << i; 1039 unsigned long bit = 1UL << i;
1040
1031 if (bit & added_mask) { 1041 if (bit & added_mask) {
1032 /* We're binding this subsystem to this hierarchy */ 1042 /* We're binding this subsystem to this hierarchy */
1033 BUG_ON(ss == NULL);
1034 BUG_ON(cgrp->subsys[i]); 1043 BUG_ON(cgrp->subsys[i]);
1035 BUG_ON(!dummytop->subsys[i]); 1044 BUG_ON(!cgroup_dummy_top->subsys[i]);
1036 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 1045 BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top);
1037 cgrp->subsys[i] = dummytop->subsys[i]; 1046
1047 cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
1038 cgrp->subsys[i]->cgroup = cgrp; 1048 cgrp->subsys[i]->cgroup = cgrp;
1039 list_move(&ss->sibling, &root->subsys_list); 1049 list_move(&ss->sibling, &root->subsys_list);
1040 ss->root = root; 1050 ss->root = root;
1041 if (ss->bind) 1051 if (ss->bind)
1042 ss->bind(cgrp); 1052 ss->bind(cgrp);
1053
1043 /* refcount was already taken, and we're keeping it */ 1054 /* refcount was already taken, and we're keeping it */
1055 root->subsys_mask |= bit;
1044 } else if (bit & removed_mask) { 1056 } else if (bit & removed_mask) {
1045 /* We're removing this subsystem */ 1057 /* We're removing this subsystem */
1046 BUG_ON(ss == NULL); 1058 BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]);
1047 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
1048 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1059 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
1060
1049 if (ss->bind) 1061 if (ss->bind)
1050 ss->bind(dummytop); 1062 ss->bind(cgroup_dummy_top);
1051 dummytop->subsys[i]->cgroup = dummytop; 1063 cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top;
1052 cgrp->subsys[i] = NULL; 1064 cgrp->subsys[i] = NULL;
1053 subsys[i]->root = &rootnode; 1065 cgroup_subsys[i]->root = &cgroup_dummy_root;
1054 list_move(&ss->sibling, &rootnode.subsys_list); 1066 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
1067
1055 /* subsystem is now free - drop reference on module */ 1068 /* subsystem is now free - drop reference on module */
1056 module_put(ss->module); 1069 module_put(ss->module);
1057 } else if (bit & final_subsys_mask) { 1070 root->subsys_mask &= ~bit;
1071 } else if (bit & root->subsys_mask) {
1058 /* Subsystem state should already exist */ 1072 /* Subsystem state should already exist */
1059 BUG_ON(ss == NULL);
1060 BUG_ON(!cgrp->subsys[i]); 1073 BUG_ON(!cgrp->subsys[i]);
1061 /* 1074 /*
1062 * a refcount was taken, but we already had one, so 1075 * a refcount was taken, but we already had one, so
@@ -1071,7 +1084,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1071 BUG_ON(cgrp->subsys[i]); 1084 BUG_ON(cgrp->subsys[i]);
1072 } 1085 }
1073 } 1086 }
1074 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; 1087
1088 /*
1089 * Mark @root has finished binding subsystems. @root->subsys_mask
1090 * now matches the bound subsystems.
1091 */
1092 root->flags |= CGRP_ROOT_SUBSYS_BOUND;
1075 1093
1076 return 0; 1094 return 0;
1077} 1095}
@@ -1082,7 +1100,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1082 struct cgroup_subsys *ss; 1100 struct cgroup_subsys *ss;
1083 1101
1084 mutex_lock(&cgroup_root_mutex); 1102 mutex_lock(&cgroup_root_mutex);
1085 for_each_subsys(root, ss) 1103 for_each_root_subsys(root, ss)
1086 seq_printf(seq, ",%s", ss->name); 1104 seq_printf(seq, ",%s", ss->name);
1087 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1105 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1088 seq_puts(seq, ",sane_behavior"); 1106 seq_puts(seq, ",sane_behavior");
@@ -1114,18 +1132,19 @@ struct cgroup_sb_opts {
1114}; 1132};
1115 1133
1116/* 1134/*
1117 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call 1135 * Convert a hierarchy specifier into a bitmask of subsystems and
1118 * with cgroup_mutex held to protect the subsys[] array. This function takes 1136 * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
1119 * refcounts on subsystems to be used, unless it returns error, in which case 1137 * array. This function takes refcounts on subsystems to be used, unless it
1120 * no refcounts are taken. 1138 * returns error, in which case no refcounts are taken.
1121 */ 1139 */
1122static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1140static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1123{ 1141{
1124 char *token, *o = data; 1142 char *token, *o = data;
1125 bool all_ss = false, one_ss = false; 1143 bool all_ss = false, one_ss = false;
1126 unsigned long mask = (unsigned long)-1; 1144 unsigned long mask = (unsigned long)-1;
1127 int i;
1128 bool module_pin_failed = false; 1145 bool module_pin_failed = false;
1146 struct cgroup_subsys *ss;
1147 int i;
1129 1148
1130 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1149 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1131 1150
@@ -1202,10 +1221,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1202 continue; 1221 continue;
1203 } 1222 }
1204 1223
1205 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1224 for_each_subsys(ss, i) {
1206 struct cgroup_subsys *ss = subsys[i];
1207 if (ss == NULL)
1208 continue;
1209 if (strcmp(token, ss->name)) 1225 if (strcmp(token, ss->name))
1210 continue; 1226 continue;
1211 if (ss->disabled) 1227 if (ss->disabled)
@@ -1228,16 +1244,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1228 * otherwise if 'none', 'name=' and a subsystem name options 1244 * otherwise if 'none', 'name=' and a subsystem name options
1229 * were not specified, let's default to 'all' 1245 * were not specified, let's default to 'all'
1230 */ 1246 */
1231 if (all_ss || (!one_ss && !opts->none && !opts->name)) { 1247 if (all_ss || (!one_ss && !opts->none && !opts->name))
1232 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1248 for_each_subsys(ss, i)
1233 struct cgroup_subsys *ss = subsys[i]; 1249 if (!ss->disabled)
1234 if (ss == NULL) 1250 set_bit(i, &opts->subsys_mask);
1235 continue;
1236 if (ss->disabled)
1237 continue;
1238 set_bit(i, &opts->subsys_mask);
1239 }
1240 }
1241 1251
1242 /* Consistency checks */ 1252 /* Consistency checks */
1243 1253
@@ -1281,12 +1291,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1281 * take duplicate reference counts on a subsystem that's already used, 1291 * take duplicate reference counts on a subsystem that's already used,
1282 * but rebind_subsystems handles this case. 1292 * but rebind_subsystems handles this case.
1283 */ 1293 */
1284 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1294 for_each_subsys(ss, i) {
1285 unsigned long bit = 1UL << i; 1295 if (!(opts->subsys_mask & (1UL << i)))
1286
1287 if (!(bit & opts->subsys_mask))
1288 continue; 1296 continue;
1289 if (!try_module_get(subsys[i]->module)) { 1297 if (!try_module_get(cgroup_subsys[i]->module)) {
1290 module_pin_failed = true; 1298 module_pin_failed = true;
1291 break; 1299 break;
1292 } 1300 }
@@ -1303,7 +1311,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1303 1311
1304 if (!(bit & opts->subsys_mask)) 1312 if (!(bit & opts->subsys_mask))
1305 continue; 1313 continue;
1306 module_put(subsys[i]->module); 1314 module_put(cgroup_subsys[i]->module);
1307 } 1315 }
1308 return -ENOENT; 1316 return -ENOENT;
1309 } 1317 }
@@ -1313,14 +1321,14 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1313 1321
1314static void drop_parsed_module_refcounts(unsigned long subsys_mask) 1322static void drop_parsed_module_refcounts(unsigned long subsys_mask)
1315{ 1323{
1324 struct cgroup_subsys *ss;
1316 int i; 1325 int i;
1317 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1318 unsigned long bit = 1UL << i;
1319 1326
1320 if (!(bit & subsys_mask)) 1327 mutex_lock(&cgroup_mutex);
1321 continue; 1328 for_each_subsys(ss, i)
1322 module_put(subsys[i]->module); 1329 if (subsys_mask & (1UL << i))
1323 } 1330 module_put(cgroup_subsys[i]->module);
1331 mutex_unlock(&cgroup_mutex);
1324} 1332}
1325 1333
1326static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1334static int cgroup_remount(struct super_block *sb, int *flags, char *data)
@@ -1345,7 +1353,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1345 if (ret) 1353 if (ret)
1346 goto out_unlock; 1354 goto out_unlock;
1347 1355
1348 if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) 1356 if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1349 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1357 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1350 task_tgid_nr(current), current->comm); 1358 task_tgid_nr(current), current->comm);
1351 1359
@@ -1353,10 +1361,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1353 removed_mask = root->subsys_mask & ~opts.subsys_mask; 1361 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1354 1362
1355 /* Don't allow flags or name to change at remount */ 1363 /* Don't allow flags or name to change at remount */
1356 if (opts.flags != root->flags || 1364 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
1357 (opts.name && strcmp(opts.name, root->name))) { 1365 (opts.name && strcmp(opts.name, root->name))) {
1366 pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
1367 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
1368 root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1358 ret = -EINVAL; 1369 ret = -EINVAL;
1359 drop_parsed_module_refcounts(opts.subsys_mask);
1360 goto out_unlock; 1370 goto out_unlock;
1361 } 1371 }
1362 1372
@@ -1367,11 +1377,10 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1367 */ 1377 */
1368 cgroup_clear_directory(cgrp->dentry, false, removed_mask); 1378 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1369 1379
1370 ret = rebind_subsystems(root, opts.subsys_mask); 1380 ret = rebind_subsystems(root, added_mask, removed_mask);
1371 if (ret) { 1381 if (ret) {
1372 /* rebind_subsystems failed, re-populate the removed files */ 1382 /* rebind_subsystems failed, re-populate the removed files */
1373 cgroup_populate_dir(cgrp, false, removed_mask); 1383 cgroup_populate_dir(cgrp, false, removed_mask);
1374 drop_parsed_module_refcounts(opts.subsys_mask);
1375 goto out_unlock; 1384 goto out_unlock;
1376 } 1385 }
1377 1386
@@ -1386,6 +1395,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1386 mutex_unlock(&cgroup_root_mutex); 1395 mutex_unlock(&cgroup_root_mutex);
1387 mutex_unlock(&cgroup_mutex); 1396 mutex_unlock(&cgroup_mutex);
1388 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1397 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1398 if (ret)
1399 drop_parsed_module_refcounts(opts.subsys_mask);
1389 return ret; 1400 return ret;
1390} 1401}
1391 1402
@@ -1401,11 +1412,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1401 INIT_LIST_HEAD(&cgrp->sibling); 1412 INIT_LIST_HEAD(&cgrp->sibling);
1402 INIT_LIST_HEAD(&cgrp->children); 1413 INIT_LIST_HEAD(&cgrp->children);
1403 INIT_LIST_HEAD(&cgrp->files); 1414 INIT_LIST_HEAD(&cgrp->files);
1404 INIT_LIST_HEAD(&cgrp->css_sets); 1415 INIT_LIST_HEAD(&cgrp->cset_links);
1405 INIT_LIST_HEAD(&cgrp->allcg_node);
1406 INIT_LIST_HEAD(&cgrp->release_list); 1416 INIT_LIST_HEAD(&cgrp->release_list);
1407 INIT_LIST_HEAD(&cgrp->pidlists); 1417 INIT_LIST_HEAD(&cgrp->pidlists);
1408 INIT_WORK(&cgrp->free_work, cgroup_free_fn);
1409 mutex_init(&cgrp->pidlist_mutex); 1418 mutex_init(&cgrp->pidlist_mutex);
1410 INIT_LIST_HEAD(&cgrp->event_list); 1419 INIT_LIST_HEAD(&cgrp->event_list);
1411 spin_lock_init(&cgrp->event_list_lock); 1420 spin_lock_init(&cgrp->event_list_lock);
@@ -1418,37 +1427,37 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1418 1427
1419 INIT_LIST_HEAD(&root->subsys_list); 1428 INIT_LIST_HEAD(&root->subsys_list);
1420 INIT_LIST_HEAD(&root->root_list); 1429 INIT_LIST_HEAD(&root->root_list);
1421 INIT_LIST_HEAD(&root->allcg_list);
1422 root->number_of_cgroups = 1; 1430 root->number_of_cgroups = 1;
1423 cgrp->root = root; 1431 cgrp->root = root;
1424 cgrp->name = &root_cgroup_name; 1432 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
1425 init_cgroup_housekeeping(cgrp); 1433 init_cgroup_housekeeping(cgrp);
1426 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1427} 1434}
1428 1435
1429static bool init_root_id(struct cgroupfs_root *root) 1436static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
1430{ 1437{
1431 int ret = 0; 1438 int id;
1432 1439
1433 do { 1440 lockdep_assert_held(&cgroup_mutex);
1434 if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) 1441 lockdep_assert_held(&cgroup_root_mutex);
1435 return false; 1442
1436 spin_lock(&hierarchy_id_lock); 1443 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
1437 /* Try to allocate the next unused ID */ 1444 GFP_KERNEL);
1438 ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, 1445 if (id < 0)
1439 &root->hierarchy_id); 1446 return id;
1440 if (ret == -ENOSPC) 1447
1441 /* Try again starting from 0 */ 1448 root->hierarchy_id = id;
1442 ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); 1449 return 0;
1443 if (!ret) { 1450}
1444 next_hierarchy_id = root->hierarchy_id + 1; 1451
1445 } else if (ret != -EAGAIN) { 1452static void cgroup_exit_root_id(struct cgroupfs_root *root)
1446 /* Can only get here if the 31-bit IDR is full ... */ 1453{
1447 BUG_ON(ret); 1454 lockdep_assert_held(&cgroup_mutex);
1448 } 1455 lockdep_assert_held(&cgroup_root_mutex);
1449 spin_unlock(&hierarchy_id_lock); 1456
1450 } while (ret); 1457 if (root->hierarchy_id) {
1451 return true; 1458 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1459 root->hierarchy_id = 0;
1460 }
1452} 1461}
1453 1462
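cgroup_init_root_id() and cgroup_exit_root_id() above replace the old ida retry loop with the idr API. A stand-alone sketch of the same allocation pattern, with invented names (example_idr, example_alloc_id, example_free_id):

#include <linux/idr.h>
#include <linux/gfp.h>

static DEFINE_IDR(example_idr);

/*
 * Cyclically allocate the next free ID >= 2; end == 0 means no upper bound.
 * Returns the new ID on success or a negative errno on failure.
 */
static int example_alloc_id(void *ptr)
{
	return idr_alloc_cyclic(&example_idr, ptr, 2, 0, GFP_KERNEL);
}

static void example_free_id(int id)
{
	idr_remove(&example_idr, id);
}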
1454static int cgroup_test_super(struct super_block *sb, void *data) 1463static int cgroup_test_super(struct super_block *sb, void *data)
@@ -1482,12 +1491,16 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1482 if (!root) 1491 if (!root)
1483 return ERR_PTR(-ENOMEM); 1492 return ERR_PTR(-ENOMEM);
1484 1493
1485 if (!init_root_id(root)) {
1486 kfree(root);
1487 return ERR_PTR(-ENOMEM);
1488 }
1489 init_cgroup_root(root); 1494 init_cgroup_root(root);
1490 1495
1496 /*
1497 * We need to set @root->subsys_mask now so that @root can be
1498 * matched by cgroup_test_super() before it finishes
1499 * initialization; otherwise, competing mounts with the same
1500 * options may try to bind the same subsystems instead of waiting
1501 * for the first one leading to unexpected mount errors.
1502 * SUBSYS_BOUND will be set once actual binding is complete.
1503 */
1491 root->subsys_mask = opts->subsys_mask; 1504 root->subsys_mask = opts->subsys_mask;
1492 root->flags = opts->flags; 1505 root->flags = opts->flags;
1493 ida_init(&root->cgroup_ida); 1506 ida_init(&root->cgroup_ida);
@@ -1500,17 +1513,15 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1500 return root; 1513 return root;
1501} 1514}
1502 1515
1503static void cgroup_drop_root(struct cgroupfs_root *root) 1516static void cgroup_free_root(struct cgroupfs_root *root)
1504{ 1517{
1505 if (!root) 1518 if (root) {
1506 return; 1519 /* hierarchy ID should already have been released */
1520 WARN_ON_ONCE(root->hierarchy_id);
1507 1521
1508 BUG_ON(!root->hierarchy_id); 1522 ida_destroy(&root->cgroup_ida);
1509 spin_lock(&hierarchy_id_lock); 1523 kfree(root);
1510 ida_remove(&hierarchy_ida, root->hierarchy_id); 1524 }
1511 spin_unlock(&hierarchy_id_lock);
1512 ida_destroy(&root->cgroup_ida);
1513 kfree(root);
1514} 1525}
1515 1526
1516static int cgroup_set_super(struct super_block *sb, void *data) 1527static int cgroup_set_super(struct super_block *sb, void *data)
@@ -1597,7 +1608,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1597 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); 1608 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
1598 if (IS_ERR(sb)) { 1609 if (IS_ERR(sb)) {
1599 ret = PTR_ERR(sb); 1610 ret = PTR_ERR(sb);
1600 cgroup_drop_root(opts.new_root); 1611 cgroup_free_root(opts.new_root);
1601 goto drop_modules; 1612 goto drop_modules;
1602 } 1613 }
1603 1614
@@ -1605,12 +1616,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1605 BUG_ON(!root); 1616 BUG_ON(!root);
1606 if (root == opts.new_root) { 1617 if (root == opts.new_root) {
1607 /* We used the new root structure, so this is a new hierarchy */ 1618 /* We used the new root structure, so this is a new hierarchy */
1608 struct list_head tmp_cg_links; 1619 struct list_head tmp_links;
1609 struct cgroup *root_cgrp = &root->top_cgroup; 1620 struct cgroup *root_cgrp = &root->top_cgroup;
1610 struct cgroupfs_root *existing_root; 1621 struct cgroupfs_root *existing_root;
1611 const struct cred *cred; 1622 const struct cred *cred;
1612 int i; 1623 int i;
1613 struct css_set *cg; 1624 struct css_set *cset;
1614 1625
1615 BUG_ON(sb->s_root != NULL); 1626 BUG_ON(sb->s_root != NULL);
1616 1627
@@ -1637,13 +1648,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1637 * that's us. The worst that can happen is that we 1648 * that's us. The worst that can happen is that we
1638 * have some link structures left over 1649 * have some link structures left over
1639 */ 1650 */
1640 ret = allocate_cg_links(css_set_count, &tmp_cg_links); 1651 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1652 if (ret)
1653 goto unlock_drop;
1654
1655 /* ID 0 is reserved for dummy root, 1 for unified hierarchy */
1656 ret = cgroup_init_root_id(root, 2, 0);
1641 if (ret) 1657 if (ret)
1642 goto unlock_drop; 1658 goto unlock_drop;
1643 1659
1644 ret = rebind_subsystems(root, root->subsys_mask); 1660 ret = rebind_subsystems(root, root->subsys_mask, 0);
1645 if (ret == -EBUSY) { 1661 if (ret == -EBUSY) {
1646 free_cg_links(&tmp_cg_links); 1662 free_cgrp_cset_links(&tmp_links);
1647 goto unlock_drop; 1663 goto unlock_drop;
1648 } 1664 }
1649 /* 1665 /*
@@ -1655,8 +1671,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1655 /* EBUSY should be the only error here */ 1671 /* EBUSY should be the only error here */
1656 BUG_ON(ret); 1672 BUG_ON(ret);
1657 1673
1658 list_add(&root->root_list, &roots); 1674 list_add(&root->root_list, &cgroup_roots);
1659 root_count++; 1675 cgroup_root_count++;
1660 1676
1661 sb->s_root->d_fsdata = root_cgrp; 1677 sb->s_root->d_fsdata = root_cgrp;
1662 root->top_cgroup.dentry = sb->s_root; 1678 root->top_cgroup.dentry = sb->s_root;
@@ -1664,11 +1680,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1664 /* Link the top cgroup in this hierarchy into all 1680 /* Link the top cgroup in this hierarchy into all
1665 * the css_set objects */ 1681 * the css_set objects */
1666 write_lock(&css_set_lock); 1682 write_lock(&css_set_lock);
1667 hash_for_each(css_set_table, i, cg, hlist) 1683 hash_for_each(css_set_table, i, cset, hlist)
1668 link_css_set(&tmp_cg_links, cg, root_cgrp); 1684 link_css_set(&tmp_links, cset, root_cgrp);
1669 write_unlock(&css_set_lock); 1685 write_unlock(&css_set_lock);
1670 1686
1671 free_cg_links(&tmp_cg_links); 1687 free_cgrp_cset_links(&tmp_links);
1672 1688
1673 BUG_ON(!list_empty(&root_cgrp->children)); 1689 BUG_ON(!list_empty(&root_cgrp->children));
1674 BUG_ON(root->number_of_cgroups != 1); 1690 BUG_ON(root->number_of_cgroups != 1);
@@ -1684,9 +1700,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1684 * We re-used an existing hierarchy - the new root (if 1700 * We re-used an existing hierarchy - the new root (if
1685 * any) is not needed 1701 * any) is not needed
1686 */ 1702 */
1687 cgroup_drop_root(opts.new_root); 1703 cgroup_free_root(opts.new_root);
1688 1704
1689 if (root->flags != opts.flags) { 1705 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1690 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1706 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1691 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1707 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1692 ret = -EINVAL; 1708 ret = -EINVAL;
@@ -1705,6 +1721,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1705 return dget(sb->s_root); 1721 return dget(sb->s_root);
1706 1722
1707 unlock_drop: 1723 unlock_drop:
1724 cgroup_exit_root_id(root);
1708 mutex_unlock(&cgroup_root_mutex); 1725 mutex_unlock(&cgroup_root_mutex);
1709 mutex_unlock(&cgroup_mutex); 1726 mutex_unlock(&cgroup_mutex);
1710 mutex_unlock(&inode->i_mutex); 1727 mutex_unlock(&inode->i_mutex);
@@ -1721,9 +1738,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1721static void cgroup_kill_sb(struct super_block *sb) { 1738static void cgroup_kill_sb(struct super_block *sb) {
1722 struct cgroupfs_root *root = sb->s_fs_info; 1739 struct cgroupfs_root *root = sb->s_fs_info;
1723 struct cgroup *cgrp = &root->top_cgroup; 1740 struct cgroup *cgrp = &root->top_cgroup;
1741 struct cgrp_cset_link *link, *tmp_link;
1724 int ret; 1742 int ret;
1725 struct cg_cgroup_link *link;
1726 struct cg_cgroup_link *saved_link;
1727 1743
1728 BUG_ON(!root); 1744 BUG_ON(!root);
1729 1745
@@ -1734,36 +1750,39 @@ static void cgroup_kill_sb(struct super_block *sb) {
1734 mutex_lock(&cgroup_root_mutex); 1750 mutex_lock(&cgroup_root_mutex);
1735 1751
1736 /* Rebind all subsystems back to the default hierarchy */ 1752 /* Rebind all subsystems back to the default hierarchy */
1737 ret = rebind_subsystems(root, 0); 1753 if (root->flags & CGRP_ROOT_SUBSYS_BOUND) {
1738 /* Shouldn't be able to fail ... */ 1754 ret = rebind_subsystems(root, 0, root->subsys_mask);
1739 BUG_ON(ret); 1755 /* Shouldn't be able to fail ... */
1756 BUG_ON(ret);
1757 }
1740 1758
1741 /* 1759 /*
1742 * Release all the links from css_sets to this hierarchy's 1760 * Release all the links from cset_links to this hierarchy's
1743 * root cgroup 1761 * root cgroup
1744 */ 1762 */
1745 write_lock(&css_set_lock); 1763 write_lock(&css_set_lock);
1746 1764
1747 list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, 1765 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
1748 cgrp_link_list) { 1766 list_del(&link->cset_link);
1749 list_del(&link->cg_link_list); 1767 list_del(&link->cgrp_link);
1750 list_del(&link->cgrp_link_list);
1751 kfree(link); 1768 kfree(link);
1752 } 1769 }
1753 write_unlock(&css_set_lock); 1770 write_unlock(&css_set_lock);
1754 1771
1755 if (!list_empty(&root->root_list)) { 1772 if (!list_empty(&root->root_list)) {
1756 list_del(&root->root_list); 1773 list_del(&root->root_list);
1757 root_count--; 1774 cgroup_root_count--;
1758 } 1775 }
1759 1776
1777 cgroup_exit_root_id(root);
1778
1760 mutex_unlock(&cgroup_root_mutex); 1779 mutex_unlock(&cgroup_root_mutex);
1761 mutex_unlock(&cgroup_mutex); 1780 mutex_unlock(&cgroup_mutex);
1762 1781
1763 simple_xattrs_free(&cgrp->xattrs); 1782 simple_xattrs_free(&cgrp->xattrs);
1764 1783
1765 kill_litter_super(sb); 1784 kill_litter_super(sb);
1766 cgroup_drop_root(root); 1785 cgroup_free_root(root);
1767} 1786}
1768 1787
1769static struct file_system_type cgroup_fs_type = { 1788static struct file_system_type cgroup_fs_type = {
@@ -1825,6 +1844,45 @@ out:
1825} 1844}
1826EXPORT_SYMBOL_GPL(cgroup_path); 1845EXPORT_SYMBOL_GPL(cgroup_path);
1827 1846
1847/**
1848 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1849 * @task: target task
1850 * @buf: the buffer to write the path into
1851 * @buflen: the length of the buffer
1852 *
1853 * Determine @task's cgroup on the first (the one with the lowest non-zero
1854 * hierarchy_id) cgroup hierarchy and copy its path into @buf. This
1855 * function grabs cgroup_mutex and shouldn't be used inside locks used by
1856 * cgroup controller callbacks.
1857 *
1858 * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short.
1859 */
1860int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1861{
1862 struct cgroupfs_root *root;
1863 struct cgroup *cgrp;
1864 int hierarchy_id = 1, ret = 0;
1865
1866 if (buflen < 2)
1867 return -ENAMETOOLONG;
1868
1869 mutex_lock(&cgroup_mutex);
1870
1871 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1872
1873 if (root) {
1874 cgrp = task_cgroup_from_root(task, root);
1875 ret = cgroup_path(cgrp, buf, buflen);
1876 } else {
1877 /* if no hierarchy exists, everyone is in "/" */
1878 memcpy(buf, "/", 2);
1879 }
1880
1881 mutex_unlock(&cgroup_mutex);
1882 return ret;
1883}
1884EXPORT_SYMBOL_GPL(task_cgroup_path);
1885
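A hypothetical caller of the new task_cgroup_path() helper, shown only to illustrate its contract; the reporting function and buffer size are made up:

#include <linux/cgroup.h>
#include <linux/sched.h>
#include <linux/printk.h>

static void example_report_cgroup(struct task_struct *task)
{
	char buf[256];
	int ret;

	/* grabs cgroup_mutex internally, so don't call this under it */
	ret = task_cgroup_path(task, buf, sizeof(buf));
	if (!ret)
		pr_info("pid %d is in cgroup %s\n", task_pid_nr(task), buf);
	else	/* -ENAMETOOLONG when the path does not fit in @buf */
		pr_warn("cgroup path lookup failed: %d\n", ret);
}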
1828/* 1886/*
1829 * Control Group taskset 1887 * Control Group taskset
1830 */ 1888 */
@@ -1910,10 +1968,11 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1910 * 1968 *
1911 * Must be called with cgroup_mutex and threadgroup locked. 1969 * Must be called with cgroup_mutex and threadgroup locked.
1912 */ 1970 */
1913static void cgroup_task_migrate(struct cgroup *oldcgrp, 1971static void cgroup_task_migrate(struct cgroup *old_cgrp,
1914 struct task_struct *tsk, struct css_set *newcg) 1972 struct task_struct *tsk,
1973 struct css_set *new_cset)
1915{ 1974{
1916 struct css_set *oldcg; 1975 struct css_set *old_cset;
1917 1976
1918 /* 1977 /*
1919 * We are synchronized through threadgroup_lock() against PF_EXITING 1978 * We are synchronized through threadgroup_lock() against PF_EXITING
@@ -1921,25 +1980,25 @@ static void cgroup_task_migrate(struct cgroup *oldcgrp,
1921 * css_set to init_css_set and dropping the old one. 1980 * css_set to init_css_set and dropping the old one.
1922 */ 1981 */
1923 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1982 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1924 oldcg = tsk->cgroups; 1983 old_cset = task_css_set(tsk);
1925 1984
1926 task_lock(tsk); 1985 task_lock(tsk);
1927 rcu_assign_pointer(tsk->cgroups, newcg); 1986 rcu_assign_pointer(tsk->cgroups, new_cset);
1928 task_unlock(tsk); 1987 task_unlock(tsk);
1929 1988
1930 /* Update the css_set linked lists if we're using them */ 1989 /* Update the css_set linked lists if we're using them */
1931 write_lock(&css_set_lock); 1990 write_lock(&css_set_lock);
1932 if (!list_empty(&tsk->cg_list)) 1991 if (!list_empty(&tsk->cg_list))
1933 list_move(&tsk->cg_list, &newcg->tasks); 1992 list_move(&tsk->cg_list, &new_cset->tasks);
1934 write_unlock(&css_set_lock); 1993 write_unlock(&css_set_lock);
1935 1994
1936 /* 1995 /*
1937 * We just gained a reference on oldcg by taking it from the task. As 1996 * We just gained a reference on old_cset by taking it from the
1938 * trading it for newcg is protected by cgroup_mutex, we're safe to drop 1997 * task. As trading it for new_cset is protected by cgroup_mutex,
1939 * it here; it will be freed under RCU. 1998 * we're safe to drop it here; it will be freed under RCU.
1940 */ 1999 */
1941 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 2000 set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
1942 put_css_set(oldcg); 2001 put_css_set(old_cset);
1943} 2002}
1944 2003
1945/** 2004/**
@@ -2029,7 +2088,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2029 /* 2088 /*
2030 * step 1: check that we can legitimately attach to the cgroup. 2089 * step 1: check that we can legitimately attach to the cgroup.
2031 */ 2090 */
2032 for_each_subsys(root, ss) { 2091 for_each_root_subsys(root, ss) {
2033 if (ss->can_attach) { 2092 if (ss->can_attach) {
2034 retval = ss->can_attach(cgrp, &tset); 2093 retval = ss->can_attach(cgrp, &tset);
2035 if (retval) { 2094 if (retval) {
@@ -2044,8 +2103,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2044 * we use find_css_set, which allocates a new one if necessary. 2103 * we use find_css_set, which allocates a new one if necessary.
2045 */ 2104 */
2046 for (i = 0; i < group_size; i++) { 2105 for (i = 0; i < group_size; i++) {
2106 struct css_set *old_cset;
2107
2047 tc = flex_array_get(group, i); 2108 tc = flex_array_get(group, i);
2048 tc->cg = find_css_set(tc->task->cgroups, cgrp); 2109 old_cset = task_css_set(tc->task);
2110 tc->cg = find_css_set(old_cset, cgrp);
2049 if (!tc->cg) { 2111 if (!tc->cg) {
2050 retval = -ENOMEM; 2112 retval = -ENOMEM;
2051 goto out_put_css_set_refs; 2113 goto out_put_css_set_refs;
@@ -2066,7 +2128,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2066 /* 2128 /*
2067 * step 4: do subsystem attach callbacks. 2129 * step 4: do subsystem attach callbacks.
2068 */ 2130 */
2069 for_each_subsys(root, ss) { 2131 for_each_root_subsys(root, ss) {
2070 if (ss->attach) 2132 if (ss->attach)
2071 ss->attach(cgrp, &tset); 2133 ss->attach(cgrp, &tset);
2072 } 2134 }
@@ -2086,7 +2148,7 @@ out_put_css_set_refs:
2086 } 2148 }
2087out_cancel_attach: 2149out_cancel_attach:
2088 if (retval) { 2150 if (retval) {
2089 for_each_subsys(root, ss) { 2151 for_each_root_subsys(root, ss) {
2090 if (ss == failed_ss) 2152 if (ss == failed_ss)
2091 break; 2153 break;
2092 if (ss->cancel_attach) 2154 if (ss->cancel_attach)
@@ -2323,7 +2385,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
2323 struct cftype *cft = __d_cft(file->f_dentry); 2385 struct cftype *cft = __d_cft(file->f_dentry);
2324 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2386 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2325 2387
2326 if (cgroup_is_removed(cgrp)) 2388 if (cgroup_is_dead(cgrp))
2327 return -ENODEV; 2389 return -ENODEV;
2328 if (cft->write) 2390 if (cft->write)
2329 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 2391 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -2368,7 +2430,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2368 struct cftype *cft = __d_cft(file->f_dentry); 2430 struct cftype *cft = __d_cft(file->f_dentry);
2369 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2431 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2370 2432
2371 if (cgroup_is_removed(cgrp)) 2433 if (cgroup_is_dead(cgrp))
2372 return -ENODEV; 2434 return -ENODEV;
2373 2435
2374 if (cft->read) 2436 if (cft->read)
@@ -2435,10 +2497,12 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
2435 cft = __d_cft(file->f_dentry); 2497 cft = __d_cft(file->f_dentry);
2436 2498
2437 if (cft->read_map || cft->read_seq_string) { 2499 if (cft->read_map || cft->read_seq_string) {
2438 struct cgroup_seqfile_state *state = 2500 struct cgroup_seqfile_state *state;
2439 kzalloc(sizeof(*state), GFP_USER); 2501
2502 state = kzalloc(sizeof(*state), GFP_USER);
2440 if (!state) 2503 if (!state)
2441 return -ENOMEM; 2504 return -ENOMEM;
2505
2442 state->cft = cft; 2506 state->cft = cft;
2443 state->cgroup = __d_cgrp(file->f_dentry->d_parent); 2507 state->cgroup = __d_cgrp(file->f_dentry->d_parent);
2444 file->f_op = &cgroup_seqfile_operations; 2508 file->f_op = &cgroup_seqfile_operations;
@@ -2486,6 +2550,13 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2486 2550
2487 cgrp = __d_cgrp(old_dentry); 2551 cgrp = __d_cgrp(old_dentry);
2488 2552
2553 /*
2554 * This isn't a proper migration and its usefulness is very
2555 * limited. Disallow if sane_behavior.
2556 */
2557 if (cgroup_sane_behavior(cgrp))
2558 return -EPERM;
2559
2489 name = cgroup_alloc_name(new_dentry); 2560 name = cgroup_alloc_name(new_dentry);
2490 if (!name) 2561 if (!name)
2491 return -ENOMEM; 2562 return -ENOMEM;
@@ -2496,7 +2567,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2496 return ret; 2567 return ret;
2497 } 2568 }
2498 2569
2499 old_name = cgrp->name; 2570 old_name = rcu_dereference_protected(cgrp->name, true);
2500 rcu_assign_pointer(cgrp->name, name); 2571 rcu_assign_pointer(cgrp->name, name);
2501 2572
2502 kfree_rcu(old_name, rcu_head); 2573 kfree_rcu(old_name, rcu_head);
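The rename path above publishes the new name with rcu_assign_pointer() and defers freeing of the old one with kfree_rcu(). A self-contained sketch of that publish-then-free pattern; struct example_name and example_set_name() are invented for illustration:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_name {
	struct rcu_head rcu_head;
	char name[64];
};

/*
 * Caller holds the lock protecting @slot, hence the "true" condition below;
 * assumes @slot currently points at a valid old name.
 */
static void example_set_name(struct example_name __rcu **slot,
			     struct example_name *new_name)
{
	struct example_name *old;

	old = rcu_dereference_protected(*slot, true);
	rcu_assign_pointer(*slot, new_name);
	kfree_rcu(old, rcu_head);	/* freed only after a grace period */
}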
@@ -2577,7 +2648,7 @@ static const struct inode_operations cgroup_file_inode_operations = {
2577}; 2648};
2578 2649
2579static const struct inode_operations cgroup_dir_inode_operations = { 2650static const struct inode_operations cgroup_dir_inode_operations = {
2580 .lookup = cgroup_lookup, 2651 .lookup = simple_lookup,
2581 .mkdir = cgroup_mkdir, 2652 .mkdir = cgroup_mkdir,
2582 .rmdir = cgroup_rmdir, 2653 .rmdir = cgroup_rmdir,
2583 .rename = cgroup_rename, 2654 .rename = cgroup_rename,
@@ -2587,14 +2658,6 @@ static const struct inode_operations cgroup_dir_inode_operations = {
2587 .removexattr = cgroup_removexattr, 2658 .removexattr = cgroup_removexattr,
2588}; 2659};
2589 2660
2590static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
2591{
2592 if (dentry->d_name.len > NAME_MAX)
2593 return ERR_PTR(-ENAMETOOLONG);
2594 d_add(dentry, NULL);
2595 return NULL;
2596}
2597
2598/* 2661/*
2599 * Check if a file is a control file 2662 * Check if a file is a control file
2600 */ 2663 */
@@ -2747,58 +2810,78 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2747 return ret; 2810 return ret;
2748} 2811}
2749 2812
2750static DEFINE_MUTEX(cgroup_cft_mutex);
2751
2752static void cgroup_cfts_prepare(void) 2813static void cgroup_cfts_prepare(void)
2753 __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) 2814 __acquires(&cgroup_mutex)
2754{ 2815{
2755 /* 2816 /*
2756 * Thanks to the entanglement with vfs inode locking, we can't walk 2817 * Thanks to the entanglement with vfs inode locking, we can't walk
2757 * the existing cgroups under cgroup_mutex and create files. 2818 * the existing cgroups under cgroup_mutex and create files.
2758 * Instead, we increment reference on all cgroups and build list of 2819 * Instead, we use cgroup_for_each_descendant_pre() and drop RCU
2759 * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure 2820 * read lock before calling cgroup_addrm_files().
2760 * exclusive access to the field.
2761 */ 2821 */
2762 mutex_lock(&cgroup_cft_mutex);
2763 mutex_lock(&cgroup_mutex); 2822 mutex_lock(&cgroup_mutex);
2764} 2823}
2765 2824
2766static void cgroup_cfts_commit(struct cgroup_subsys *ss, 2825static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2767 struct cftype *cfts, bool is_add) 2826 struct cftype *cfts, bool is_add)
2768 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) 2827 __releases(&cgroup_mutex)
2769{ 2828{
2770 LIST_HEAD(pending); 2829 LIST_HEAD(pending);
2771 struct cgroup *cgrp, *n; 2830 struct cgroup *cgrp, *root = &ss->root->top_cgroup;
2831 struct super_block *sb = ss->root->sb;
2832 struct dentry *prev = NULL;
2833 struct inode *inode;
2834 u64 update_before;
2772 2835
2773 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2836 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2774 if (cfts && ss->root != &rootnode) { 2837 if (!cfts || ss->root == &cgroup_dummy_root ||
2775 list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { 2838 !atomic_inc_not_zero(&sb->s_active)) {
2776 dget(cgrp->dentry); 2839 mutex_unlock(&cgroup_mutex);
2777 list_add_tail(&cgrp->cft_q_node, &pending); 2840 return;
2778 }
2779 } 2841 }
2780 2842
2781 mutex_unlock(&cgroup_mutex);
2782
2783 /* 2843 /*
2784 * All new cgroups will see @cfts update on @ss->cftsets. Add/rm 2844 * All cgroups which are created after we drop cgroup_mutex will
2785 * files for all cgroups which were created before. 2845 * have the updated set of files, so we only need to update the
2846 * cgroups created before the current @cgroup_serial_nr_next.
2786 */ 2847 */
2787 list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { 2848 update_before = cgroup_serial_nr_next;
2788 struct inode *inode = cgrp->dentry->d_inode; 2849
2850 mutex_unlock(&cgroup_mutex);
2851
2852 /* @root always needs to be updated */
2853 inode = root->dentry->d_inode;
2854 mutex_lock(&inode->i_mutex);
2855 mutex_lock(&cgroup_mutex);
2856 cgroup_addrm_files(root, ss, cfts, is_add);
2857 mutex_unlock(&cgroup_mutex);
2858 mutex_unlock(&inode->i_mutex);
2859
2860 /* add/rm files for all cgroups created before */
2861 rcu_read_lock();
2862 cgroup_for_each_descendant_pre(cgrp, root) {
2863 if (cgroup_is_dead(cgrp))
2864 continue;
2865
2866 inode = cgrp->dentry->d_inode;
2867 dget(cgrp->dentry);
2868 rcu_read_unlock();
2869
2870 dput(prev);
2871 prev = cgrp->dentry;
2789 2872
2790 mutex_lock(&inode->i_mutex); 2873 mutex_lock(&inode->i_mutex);
2791 mutex_lock(&cgroup_mutex); 2874 mutex_lock(&cgroup_mutex);
2792 if (!cgroup_is_removed(cgrp)) 2875 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2793 cgroup_addrm_files(cgrp, ss, cfts, is_add); 2876 cgroup_addrm_files(cgrp, ss, cfts, is_add);
2794 mutex_unlock(&cgroup_mutex); 2877 mutex_unlock(&cgroup_mutex);
2795 mutex_unlock(&inode->i_mutex); 2878 mutex_unlock(&inode->i_mutex);
2796 2879
2797 list_del_init(&cgrp->cft_q_node); 2880 rcu_read_lock();
2798 dput(cgrp->dentry);
2799 } 2881 }
2800 2882 rcu_read_unlock();
2801 mutex_unlock(&cgroup_cft_mutex); 2883 dput(prev);
2884 deactivate_super(sb);
2802} 2885}
2803 2886
2804/** 2887/**
@@ -2853,7 +2936,8 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2853 2936
2854 list_for_each_entry(set, &ss->cftsets, node) { 2937 list_for_each_entry(set, &ss->cftsets, node) {
2855 if (set->cfts == cfts) { 2938 if (set->cfts == cfts) {
2856 list_del_init(&set->node); 2939 list_del(&set->node);
2940 kfree(set);
2857 cgroup_cfts_commit(ss, cfts, false); 2941 cgroup_cfts_commit(ss, cfts, false);
2858 return 0; 2942 return 0;
2859 } 2943 }
@@ -2872,12 +2956,11 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2872int cgroup_task_count(const struct cgroup *cgrp) 2956int cgroup_task_count(const struct cgroup *cgrp)
2873{ 2957{
2874 int count = 0; 2958 int count = 0;
2875 struct cg_cgroup_link *link; 2959 struct cgrp_cset_link *link;
2876 2960
2877 read_lock(&css_set_lock); 2961 read_lock(&css_set_lock);
2878 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { 2962 list_for_each_entry(link, &cgrp->cset_links, cset_link)
2879 count += atomic_read(&link->cg->refcount); 2963 count += atomic_read(&link->cset->refcount);
2880 }
2881 read_unlock(&css_set_lock); 2964 read_unlock(&css_set_lock);
2882 return count; 2965 return count;
2883} 2966}
@@ -2886,25 +2969,24 @@ int cgroup_task_count(const struct cgroup *cgrp)
2886 * Advance a list_head iterator. The iterator should be positioned at 2969 * Advance a list_head iterator. The iterator should be positioned at
2887 * the start of a css_set 2970 * the start of a css_set
2888 */ 2971 */
2889static void cgroup_advance_iter(struct cgroup *cgrp, 2972static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it)
2890 struct cgroup_iter *it)
2891{ 2973{
2892 struct list_head *l = it->cg_link; 2974 struct list_head *l = it->cset_link;
2893 struct cg_cgroup_link *link; 2975 struct cgrp_cset_link *link;
2894 struct css_set *cg; 2976 struct css_set *cset;
2895 2977
2896 /* Advance to the next non-empty css_set */ 2978 /* Advance to the next non-empty css_set */
2897 do { 2979 do {
2898 l = l->next; 2980 l = l->next;
2899 if (l == &cgrp->css_sets) { 2981 if (l == &cgrp->cset_links) {
2900 it->cg_link = NULL; 2982 it->cset_link = NULL;
2901 return; 2983 return;
2902 } 2984 }
2903 link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); 2985 link = list_entry(l, struct cgrp_cset_link, cset_link);
2904 cg = link->cg; 2986 cset = link->cset;
2905 } while (list_empty(&cg->tasks)); 2987 } while (list_empty(&cset->tasks));
2906 it->cg_link = l; 2988 it->cset_link = l;
2907 it->task = cg->tasks.next; 2989 it->task = cset->tasks.next;
2908} 2990}
2909 2991
2910/* 2992/*
@@ -2934,7 +3016,7 @@ static void cgroup_enable_task_cg_lists(void)
2934 * entry won't be deleted though the process has exited. 3016 * entry won't be deleted though the process has exited.
2935 */ 3017 */
2936 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) 3018 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2937 list_add(&p->cg_list, &p->cgroups->tasks); 3019 list_add(&p->cg_list, &task_css_set(p)->tasks);
2938 task_unlock(p); 3020 task_unlock(p);
2939 } while_each_thread(g, p); 3021 } while_each_thread(g, p);
2940 read_unlock(&tasklist_lock); 3022 read_unlock(&tasklist_lock);
@@ -2942,12 +3024,67 @@ static void cgroup_enable_task_cg_lists(void)
2942} 3024}
2943 3025
2944/** 3026/**
3027 * cgroup_next_sibling - find the next sibling of a given cgroup
3028 * @pos: the current cgroup
3029 *
3030 * This function returns the next sibling of @pos and should be called
3031 * under RCU read lock. The only requirement is that @pos is accessible.
3032 * The next sibling is guaranteed to be returned regardless of @pos's
3033 * state.
3034 */
3035struct cgroup *cgroup_next_sibling(struct cgroup *pos)
3036{
3037 struct cgroup *next;
3038
3039 WARN_ON_ONCE(!rcu_read_lock_held());
3040
3041 /*
3042 * @pos could already have been removed. Once a cgroup is removed,
3043 * its ->sibling.next is no longer updated when its next sibling
3044 * changes. As CGRP_DEAD assertion is serialized and happens
3045 * before the cgroup is taken off the ->sibling list, if we see it
3046 * unasserted, it's guaranteed that the next sibling hasn't
3047 * finished its grace period even if it's already removed, and thus
3048 * safe to dereference from this RCU critical section. If
3049 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
3050 * to be visible as %true here.
3051 */
3052 if (likely(!cgroup_is_dead(pos))) {
3053 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3054 if (&next->sibling != &pos->parent->children)
3055 return next;
3056 return NULL;
3057 }
3058
3059 /*
3060 * Can't dereference the next pointer. Each cgroup is given a
3061 * monotonically increasing unique serial number and always
3062 * appended to the sibling list, so the next one can be found by
3063 * walking the parent's children until we see a cgroup with higher
3064 * serial number than @pos's.
3065 *
3066 * While this path can be slow, it's taken only when either the
3067 * current cgroup is removed or iteration and removal race.
3068 */
3069 list_for_each_entry_rcu(next, &pos->parent->children, sibling)
3070 if (next->serial_nr > pos->serial_nr)
3071 return next;
3072 return NULL;
3073}
3074EXPORT_SYMBOL_GPL(cgroup_next_sibling);
3075
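A toy walker, not part of the patch, showing how cgroup_next_sibling() lets an RCU-protected iteration continue even if the current position is removed mid-walk:

#include <linux/cgroup.h>
#include <linux/rculist.h>

static void example_walk_children(struct cgroup *parent)
{
	struct cgroup *pos;

	rcu_read_lock();
	pos = list_entry_rcu(parent->children.next, struct cgroup, sibling);
	if (&pos->sibling == &parent->children)
		pos = NULL;		/* no children */
	while (pos) {
		/* ... inspect @pos; it may already be marked CGRP_DEAD ... */
		pos = cgroup_next_sibling(pos);
	}
	rcu_read_unlock();
}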
3076/**
2945 * cgroup_next_descendant_pre - find the next descendant for pre-order walk 3077 * cgroup_next_descendant_pre - find the next descendant for pre-order walk
2946 * @pos: the current position (%NULL to initiate traversal) 3078 * @pos: the current position (%NULL to initiate traversal)
2947 * @cgroup: cgroup whose descendants to walk 3079 * @cgroup: cgroup whose descendants to walk
2948 * 3080 *
2949 * To be used by cgroup_for_each_descendant_pre(). Find the next 3081 * To be used by cgroup_for_each_descendant_pre(). Find the next
2950 * descendant to visit for pre-order traversal of @cgroup's descendants. 3082 * descendant to visit for pre-order traversal of @cgroup's descendants.
3083 *
3084 * While this function requires RCU read locking, it doesn't require the
3085 * whole traversal to be contained in a single RCU critical section. This
3086 * function will return the correct next descendant as long as both @pos
3087 * and @cgroup are accessible and @pos is a descendant of @cgroup.
2951 */ 3088 */
2952struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, 3089struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2953 struct cgroup *cgroup) 3090 struct cgroup *cgroup)
@@ -2967,11 +3104,9 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2967 3104
2968 /* no child, visit my or the closest ancestor's next sibling */ 3105 /* no child, visit my or the closest ancestor's next sibling */
2969 while (pos != cgroup) { 3106 while (pos != cgroup) {
2970 next = list_entry_rcu(pos->sibling.next, struct cgroup, 3107 next = cgroup_next_sibling(pos);
2971 sibling); 3108 if (next)
2972 if (&next->sibling != &pos->parent->children)
2973 return next; 3109 return next;
2974
2975 pos = pos->parent; 3110 pos = pos->parent;
2976 } 3111 }
2977 3112
@@ -2986,6 +3121,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
2986 * Return the rightmost descendant of @pos. If there's no descendant, 3121 * Return the rightmost descendant of @pos. If there's no descendant,
2987 * @pos is returned. This can be used during pre-order traversal to skip 3122 * @pos is returned. This can be used during pre-order traversal to skip
2988 * subtree of @pos. 3123 * subtree of @pos.
3124 *
3125 * While this function requires RCU read locking, it doesn't require the
3126 * whole traversal to be contained in a single RCU critical section. This
3127 * function will return the correct rightmost descendant as long as @pos is
3128 * accessible.
2989 */ 3129 */
2990struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) 3130struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
2991{ 3131{
@@ -3025,6 +3165,11 @@ static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
3025 * 3165 *
3026 * To be used by cgroup_for_each_descendant_post(). Find the next 3166 * To be used by cgroup_for_each_descendant_post(). Find the next
3027 * descendant to visit for post-order traversal of @cgroup's descendants. 3167 * descendant to visit for post-order traversal of @cgroup's descendants.
3168 *
3169 * While this function requires RCU read locking, it doesn't require the
3170 * whole traversal to be contained in a single RCU critical section. This
3171 * function will return the correct next descendant as long as both @pos
3172 * and @cgroup are accessible and @pos is a descendant of @cgroup.
3028 */ 3173 */
3029struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, 3174struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3030 struct cgroup *cgroup) 3175 struct cgroup *cgroup)
@@ -3040,8 +3185,8 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3040 } 3185 }
3041 3186
3042 /* if there's an unvisited sibling, visit its leftmost descendant */ 3187 /* if there's an unvisited sibling, visit its leftmost descendant */
3043 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3188 next = cgroup_next_sibling(pos);
3044 if (&next->sibling != &pos->parent->children) 3189 if (next)
3045 return cgroup_leftmost_descendant(next); 3190 return cgroup_leftmost_descendant(next);
3046 3191
3047 /* no sibling left, visit parent */ 3192 /* no sibling left, visit parent */
@@ -3062,7 +3207,7 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
3062 cgroup_enable_task_cg_lists(); 3207 cgroup_enable_task_cg_lists();
3063 3208
3064 read_lock(&css_set_lock); 3209 read_lock(&css_set_lock);
3065 it->cg_link = &cgrp->css_sets; 3210 it->cset_link = &cgrp->cset_links;
3066 cgroup_advance_iter(cgrp, it); 3211 cgroup_advance_iter(cgrp, it);
3067} 3212}
3068 3213
@@ -3071,16 +3216,16 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
3071{ 3216{
3072 struct task_struct *res; 3217 struct task_struct *res;
3073 struct list_head *l = it->task; 3218 struct list_head *l = it->task;
3074 struct cg_cgroup_link *link; 3219 struct cgrp_cset_link *link;
3075 3220
3076 /* If the iterator cg is NULL, we have no tasks */ 3221 /* If the iterator cg is NULL, we have no tasks */
3077 if (!it->cg_link) 3222 if (!it->cset_link)
3078 return NULL; 3223 return NULL;
3079 res = list_entry(l, struct task_struct, cg_list); 3224 res = list_entry(l, struct task_struct, cg_list);
3080 /* Advance iterator to find next entry */ 3225 /* Advance iterator to find next entry */
3081 l = l->next; 3226 l = l->next;
3082 link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); 3227 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
3083 if (l == &link->cg->tasks) { 3228 if (l == &link->cset->tasks) {
3084 /* We reached the end of this task list - move on to 3229 /* We reached the end of this task list - move on to
3085 * the next cg_cgroup_link */ 3230 * the next cg_cgroup_link */
3086 cgroup_advance_iter(cgrp, it); 3231 cgroup_advance_iter(cgrp, it);
@@ -3411,7 +3556,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3411 } 3556 }
3412 } 3557 }
3413 /* entry not found; create a new one */ 3558 /* entry not found; create a new one */
3414 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 3559 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3415 if (!l) { 3560 if (!l) {
3416 mutex_unlock(&cgrp->pidlist_mutex); 3561 mutex_unlock(&cgrp->pidlist_mutex);
3417 return l; 3562 return l;
@@ -3420,8 +3565,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3420 down_write(&l->mutex); 3565 down_write(&l->mutex);
3421 l->key.type = type; 3566 l->key.type = type;
3422 l->key.ns = get_pid_ns(ns); 3567 l->key.ns = get_pid_ns(ns);
3423 l->use_count = 0; /* don't increment here */
3424 l->list = NULL;
3425 l->owner = cgrp; 3568 l->owner = cgrp;
3426 list_add(&l->links, &cgrp->pidlists); 3569 list_add(&l->links, &cgrp->pidlists);
3427 mutex_unlock(&cgrp->pidlist_mutex); 3570 mutex_unlock(&cgrp->pidlist_mutex);
@@ -3727,6 +3870,23 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
3727} 3870}
3728 3871
3729/* 3872/*
3873 * When dput() is called asynchronously, if umount has been done and
3874 * then deactivate_super() in cgroup_free_fn() kills the superblock,
3875 * there's a small window that vfs will see the root dentry with non-zero
3876 * refcnt and trigger BUG().
3877 *
3878 * That's why we hold a reference before dput() and drop it right after.
3879 */
3880static void cgroup_dput(struct cgroup *cgrp)
3881{
3882 struct super_block *sb = cgrp->root->sb;
3883
3884 atomic_inc(&sb->s_active);
3885 dput(cgrp->dentry);
3886 deactivate_super(sb);
3887}
3888
3889/*
3730 * Unregister event and free resources. 3890 * Unregister event and free resources.
3731 * 3891 *
3732 * Gets called from workqueue. 3892 * Gets called from workqueue.
@@ -3746,7 +3906,7 @@ static void cgroup_event_remove(struct work_struct *work)
3746 3906
3747 eventfd_ctx_put(event->eventfd); 3907 eventfd_ctx_put(event->eventfd);
3748 kfree(event); 3908 kfree(event);
3749 dput(cgrp->dentry); 3909 cgroup_dput(cgrp);
3750} 3910}
3751 3911
3752/* 3912/*
@@ -3933,33 +4093,16 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
3933 return 0; 4093 return 0;
3934} 4094}
3935 4095
3936/* 4096static struct cftype cgroup_base_files[] = {
3937 * for the common functions, 'private' gives the type of file
3938 */
3939/* for hysterical raisins, we can't put this on the older files */
3940#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
3941static struct cftype files[] = {
3942 { 4097 {
3943 .name = "tasks", 4098 .name = "cgroup.procs",
3944 .open = cgroup_tasks_open,
3945 .write_u64 = cgroup_tasks_write,
3946 .release = cgroup_pidlist_release,
3947 .mode = S_IRUGO | S_IWUSR,
3948 },
3949 {
3950 .name = CGROUP_FILE_GENERIC_PREFIX "procs",
3951 .open = cgroup_procs_open, 4099 .open = cgroup_procs_open,
3952 .write_u64 = cgroup_procs_write, 4100 .write_u64 = cgroup_procs_write,
3953 .release = cgroup_pidlist_release, 4101 .release = cgroup_pidlist_release,
3954 .mode = S_IRUGO | S_IWUSR, 4102 .mode = S_IRUGO | S_IWUSR,
3955 }, 4103 },
3956 { 4104 {
3957 .name = "notify_on_release", 4105 .name = "cgroup.event_control",
3958 .read_u64 = cgroup_read_notify_on_release,
3959 .write_u64 = cgroup_write_notify_on_release,
3960 },
3961 {
3962 .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3963 .write_string = cgroup_write_event_control, 4106 .write_string = cgroup_write_event_control,
3964 .mode = S_IWUGO, 4107 .mode = S_IWUGO,
3965 }, 4108 },
@@ -3974,9 +4117,29 @@ static struct cftype files[] = {
3974 .flags = CFTYPE_ONLY_ON_ROOT, 4117 .flags = CFTYPE_ONLY_ON_ROOT,
3975 .read_seq_string = cgroup_sane_behavior_show, 4118 .read_seq_string = cgroup_sane_behavior_show,
3976 }, 4119 },
4120
4121 /*
4122 * Historical crazy stuff. These don't have "cgroup." prefix and
4123 * don't exist if sane_behavior. If you're depending on these, be
4124 * prepared to be burned.
4125 */
4126 {
4127 .name = "tasks",
4128 .flags = CFTYPE_INSANE, /* use "procs" instead */
4129 .open = cgroup_tasks_open,
4130 .write_u64 = cgroup_tasks_write,
4131 .release = cgroup_pidlist_release,
4132 .mode = S_IRUGO | S_IWUSR,
4133 },
4134 {
4135 .name = "notify_on_release",
4136 .flags = CFTYPE_INSANE,
4137 .read_u64 = cgroup_read_notify_on_release,
4138 .write_u64 = cgroup_write_notify_on_release,
4139 },
3977 { 4140 {
3978 .name = "release_agent", 4141 .name = "release_agent",
3979 .flags = CFTYPE_ONLY_ON_ROOT, 4142 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3980 .read_seq_string = cgroup_release_agent_show, 4143 .read_seq_string = cgroup_release_agent_show,
3981 .write_string = cgroup_release_agent_write, 4144 .write_string = cgroup_release_agent_write,
3982 .max_write_len = PATH_MAX, 4145 .max_write_len = PATH_MAX,
@@ -3997,13 +4160,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
3997 struct cgroup_subsys *ss; 4160 struct cgroup_subsys *ss;
3998 4161
3999 if (base_files) { 4162 if (base_files) {
4000 err = cgroup_addrm_files(cgrp, NULL, files, true); 4163 err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
4001 if (err < 0) 4164 if (err < 0)
4002 return err; 4165 return err;
4003 } 4166 }
4004 4167
4005 /* process cftsets of each subsystem */ 4168 /* process cftsets of each subsystem */
4006 for_each_subsys(cgrp->root, ss) { 4169 for_each_root_subsys(cgrp->root, ss) {
4007 struct cftype_set *set; 4170 struct cftype_set *set;
4008 if (!test_bit(ss->subsys_id, &subsys_mask)) 4171 if (!test_bit(ss->subsys_id, &subsys_mask))
4009 continue; 4172 continue;
@@ -4013,15 +4176,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
4013 } 4176 }
4014 4177
4015 /* This cgroup is ready now */ 4178 /* This cgroup is ready now */
4016 for_each_subsys(cgrp->root, ss) { 4179 for_each_root_subsys(cgrp->root, ss) {
4017 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4180 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4181 struct css_id *id = rcu_dereference_protected(css->id, true);
4182
4018 /* 4183 /*
4019 * Update id->css pointer and make this css visible from 4184 * Update id->css pointer and make this css visible from
4020 * CSS ID functions. This pointer will be dereferenced 4185 * CSS ID functions. This pointer will be dereferenced
4021 * from RCU-read-side without locks. 4186 * from RCU-read-side without locks.
4022 */ 4187 */
4023 if (css->id) 4188 if (id)
4024 rcu_assign_pointer(css->id->css, css); 4189 rcu_assign_pointer(id->css, css);
4025 } 4190 }
4026 4191
4027 return 0; 4192 return 0;
@@ -4031,12 +4196,16 @@ static void css_dput_fn(struct work_struct *work)
4031{ 4196{
4032 struct cgroup_subsys_state *css = 4197 struct cgroup_subsys_state *css =
4033 container_of(work, struct cgroup_subsys_state, dput_work); 4198 container_of(work, struct cgroup_subsys_state, dput_work);
4034 struct dentry *dentry = css->cgroup->dentry;
4035 struct super_block *sb = dentry->d_sb;
4036 4199
4037 atomic_inc(&sb->s_active); 4200 cgroup_dput(css->cgroup);
4038 dput(dentry); 4201}
4039 deactivate_super(sb); 4202
4203static void css_release(struct percpu_ref *ref)
4204{
4205 struct cgroup_subsys_state *css =
4206 container_of(ref, struct cgroup_subsys_state, refcnt);
4207
4208 schedule_work(&css->dput_work);
4040} 4209}
4041 4210
4042static void init_cgroup_css(struct cgroup_subsys_state *css, 4211static void init_cgroup_css(struct cgroup_subsys_state *css,
@@ -4044,10 +4213,9 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
4044 struct cgroup *cgrp) 4213 struct cgroup *cgrp)
4045{ 4214{
4046 css->cgroup = cgrp; 4215 css->cgroup = cgrp;
4047 atomic_set(&css->refcnt, 1);
4048 css->flags = 0; 4216 css->flags = 0;
4049 css->id = NULL; 4217 css->id = NULL;
4050 if (cgrp == dummytop) 4218 if (cgrp == cgroup_dummy_top)
4051 css->flags |= CSS_ROOT; 4219 css->flags |= CSS_ROOT;
4052 BUG_ON(cgrp->subsys[ss->subsys_id]); 4220 BUG_ON(cgrp->subsys[ss->subsys_id]);
4053 cgrp->subsys[ss->subsys_id] = css; 4221 cgrp->subsys[ss->subsys_id] = css;
@@ -4157,7 +4325,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4157 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 4325 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4158 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4326 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4159 4327
4160 for_each_subsys(root, ss) { 4328 for_each_root_subsys(root, ss) {
4161 struct cgroup_subsys_state *css; 4329 struct cgroup_subsys_state *css;
4162 4330
4163 css = ss->css_alloc(cgrp); 4331 css = ss->css_alloc(cgrp);
@@ -4165,7 +4333,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4165 err = PTR_ERR(css); 4333 err = PTR_ERR(css);
4166 goto err_free_all; 4334 goto err_free_all;
4167 } 4335 }
4336
4337 err = percpu_ref_init(&css->refcnt, css_release);
4338 if (err) {
4339 ss->css_free(cgrp);
4340 goto err_free_all;
4341 }
4342
4168 init_cgroup_css(css, ss, cgrp); 4343 init_cgroup_css(css, ss, cgrp);
4344
4169 if (ss->use_id) { 4345 if (ss->use_id) {
4170 err = alloc_css_id(ss, parent, cgrp); 4346 err = alloc_css_id(ss, parent, cgrp);
4171 if (err) 4347 if (err)
@@ -4183,20 +4359,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4183 goto err_free_all; 4359 goto err_free_all;
4184 lockdep_assert_held(&dentry->d_inode->i_mutex); 4360 lockdep_assert_held(&dentry->d_inode->i_mutex);
4185 4361
4362 cgrp->serial_nr = cgroup_serial_nr_next++;
4363
4186 /* allocation complete, commit to creation */ 4364 /* allocation complete, commit to creation */
4187 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4188 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4365 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4189 root->number_of_cgroups++; 4366 root->number_of_cgroups++;
4190 4367
4191 /* each css holds a ref to the cgroup's dentry */ 4368 /* each css holds a ref to the cgroup's dentry */
4192 for_each_subsys(root, ss) 4369 for_each_root_subsys(root, ss)
4193 dget(dentry); 4370 dget(dentry);
4194 4371
4195 /* hold a ref to the parent's dentry */ 4372 /* hold a ref to the parent's dentry */
4196 dget(parent->dentry); 4373 dget(parent->dentry);
4197 4374
4198 /* creation succeeded, notify subsystems */ 4375 /* creation succeeded, notify subsystems */
4199 for_each_subsys(root, ss) { 4376 for_each_root_subsys(root, ss) {
4200 err = online_css(ss, cgrp); 4377 err = online_css(ss, cgrp);
4201 if (err) 4378 if (err)
4202 goto err_destroy; 4379 goto err_destroy;
@@ -4221,9 +4398,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4221 return 0; 4398 return 0;
4222 4399
4223err_free_all: 4400err_free_all:
4224 for_each_subsys(root, ss) { 4401 for_each_root_subsys(root, ss) {
4225 if (cgrp->subsys[ss->subsys_id]) 4402 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4403
4404 if (css) {
4405 percpu_ref_cancel_init(&css->refcnt);
4226 ss->css_free(cgrp); 4406 ss->css_free(cgrp);
4407 }
4227 } 4408 }
4228 mutex_unlock(&cgroup_mutex); 4409 mutex_unlock(&cgroup_mutex);
4229 /* Release the reference count that we took on the superblock */ 4410 /* Release the reference count that we took on the superblock */
@@ -4251,63 +4432,120 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4251 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4432 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4252} 4433}
4253 4434
4435static void cgroup_css_killed(struct cgroup *cgrp)
4436{
4437 if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
4438 return;
4439
4440 /* percpu ref's of all css's are killed, kick off the next step */
4441 INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
4442 schedule_work(&cgrp->destroy_work);
4443}
4444
4445static void css_ref_killed_fn(struct percpu_ref *ref)
4446{
4447 struct cgroup_subsys_state *css =
4448 container_of(ref, struct cgroup_subsys_state, refcnt);
4449
4450 cgroup_css_killed(css->cgroup);
4451}
4452
4453/**
4454 * cgroup_destroy_locked - the first stage of cgroup destruction
4455 * @cgrp: cgroup to be destroyed
4456 *
4457 * css's make use of percpu refcnts whose killing latency shouldn't be
4458 * exposed to userland and are RCU protected. Also, cgroup core needs to
4459 * guarantee that css_tryget() won't succeed by the time ->css_offline() is
4460 * invoked. To satisfy all the requirements, destruction is implemented in
4461 * the following two steps.
4462 *
4463 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
4464 * userland visible parts and start killing the percpu refcnts of
4465 * css's. Set up so that the next stage will be kicked off once all
4466 * the percpu refcnts are confirmed to be killed.
4467 *
4468 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
4469 * rest of destruction. Once all cgroup references are gone, the
4470 * cgroup is RCU-freed.
4471 *
4472 * This function implements s1. After this step, @cgrp is gone as far as
4473 * the userland is concerned and a new cgroup with the same name may be
4474 * created. As cgroup doesn't care about the names internally, this
4475 * doesn't cause any problem.
4476 */
4254static int cgroup_destroy_locked(struct cgroup *cgrp) 4477static int cgroup_destroy_locked(struct cgroup *cgrp)
4255 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4478 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4256{ 4479{
4257 struct dentry *d = cgrp->dentry; 4480 struct dentry *d = cgrp->dentry;
4258 struct cgroup *parent = cgrp->parent;
4259 struct cgroup_event *event, *tmp; 4481 struct cgroup_event *event, *tmp;
4260 struct cgroup_subsys *ss; 4482 struct cgroup_subsys *ss;
4483 bool empty;
4261 4484
4262 lockdep_assert_held(&d->d_inode->i_mutex); 4485 lockdep_assert_held(&d->d_inode->i_mutex);
4263 lockdep_assert_held(&cgroup_mutex); 4486 lockdep_assert_held(&cgroup_mutex);
4264 4487
4265 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) 4488 /*
4489 * css_set_lock synchronizes access to ->cset_links and prevents
4490 * @cgrp from being removed while __put_css_set() is in progress.
4491 */
4492 read_lock(&css_set_lock);
4493 empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children);
4494 read_unlock(&css_set_lock);
4495 if (!empty)
4266 return -EBUSY; 4496 return -EBUSY;
4267 4497
4268 /* 4498 /*
4269 * Block new css_tryget() by deactivating refcnt and mark @cgrp 4499 * Block new css_tryget() by killing css refcnts. cgroup core
4270 * removed. This makes future css_tryget() and child creation 4500 * guarantees that, by the time ->css_offline() is invoked, no new
4271 * attempts fail thus maintaining the removal conditions verified 4501 * css reference will be given out via css_tryget(). We can't
4272 * above. 4502 * simply call percpu_ref_kill() and proceed to offlining css's
4503 * because percpu_ref_kill() doesn't guarantee that the ref is seen
4504 * as killed on all CPUs on return.
4505 *
4506 * Use percpu_ref_kill_and_confirm() to get notifications as each
4507 * css is confirmed to be seen as killed on all CPUs. The
4508 * notification callback keeps track of the number of css's to be
4509 * killed and schedules cgroup_offline_fn() to perform the rest of
4510 * destruction once the percpu refs of all css's are confirmed to
4511 * be killed.
4273 */ 4512 */
4274 for_each_subsys(cgrp->root, ss) { 4513 atomic_set(&cgrp->css_kill_cnt, 1);
4514 for_each_root_subsys(cgrp->root, ss) {
4275 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4515 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4276 4516
4277 WARN_ON(atomic_read(&css->refcnt) < 0); 4517 /*
4278 atomic_add(CSS_DEACT_BIAS, &css->refcnt); 4518 * Killing would put the base ref, but we need to keep it
4279 } 4519 * alive until after ->css_offline.
4280 set_bit(CGRP_REMOVED, &cgrp->flags); 4520 */
4521 percpu_ref_get(&css->refcnt);
4281 4522
4282 /* tell subsystems to initate destruction */ 4523 atomic_inc(&cgrp->css_kill_cnt);
4283 for_each_subsys(cgrp->root, ss) 4524 percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
4284 offline_css(ss, cgrp); 4525 }
4526 cgroup_css_killed(cgrp);
4285 4527
4286 /* 4528 /*
4287 * Put all the base refs. Each css holds an extra reference to the 4529 * Mark @cgrp dead. This prevents further task migration and child
4288 * cgroup's dentry and cgroup removal proceeds regardless of css 4530 * creation by disabling cgroup_lock_live_group(). Note that
4289 * refs. On the last put of each css, whenever that may be, the 4531 * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to
4290 * extra dentry ref is put so that dentry destruction happens only 4532 * resume iteration after dropping RCU read lock. See
4291 * after all css's are released. 4533 * cgroup_next_sibling() for details.
4292 */ 4534 */
4293 for_each_subsys(cgrp->root, ss) 4535 set_bit(CGRP_DEAD, &cgrp->flags);
4294 css_put(cgrp->subsys[ss->subsys_id]);
4295 4536
4537 /* CGRP_DEAD is set, remove from ->release_list for the last time */
4296 raw_spin_lock(&release_list_lock); 4538 raw_spin_lock(&release_list_lock);
4297 if (!list_empty(&cgrp->release_list)) 4539 if (!list_empty(&cgrp->release_list))
4298 list_del_init(&cgrp->release_list); 4540 list_del_init(&cgrp->release_list);
4299 raw_spin_unlock(&release_list_lock); 4541 raw_spin_unlock(&release_list_lock);
4300 4542
4301 /* delete this cgroup from parent->children */ 4543 /*
4302 list_del_rcu(&cgrp->sibling); 4544 * Remove @cgrp directory. The removal puts the base ref but we
4303 list_del_init(&cgrp->allcg_node); 4545 * aren't quite done with @cgrp yet, so hold onto it.
4304 4546 */
4305 dget(d); 4547 dget(d);
4306 cgroup_d_remove_dir(d); 4548 cgroup_d_remove_dir(d);
4307 dput(d);
4308
4309 set_bit(CGRP_RELEASABLE, &parent->flags);
4310 check_for_release(parent);
4311 4549
4312 /* 4550 /*
4313 * Unregister events and notify userspace. 4551 * Unregister events and notify userspace.
@@ -4322,6 +4560,53 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4322 spin_unlock(&cgrp->event_list_lock); 4560 spin_unlock(&cgrp->event_list_lock);
4323 4561
4324 return 0; 4562 return 0;
4563};
4564
4565/**
4566 * cgroup_offline_fn - the second step of cgroup destruction
4567 * @work: cgroup->destroy_free_work
4568 *
4569 * This function is invoked from a work item for a cgroup which is being
4570 * destroyed after the percpu refcnts of all css's are guaranteed to be
4571 * seen as killed on all CPUs, and performs the rest of destruction. This
4572 * is the second step of destruction described in the comment above
4573 * cgroup_destroy_locked().
4574 */
4575static void cgroup_offline_fn(struct work_struct *work)
4576{
4577 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
4578 struct cgroup *parent = cgrp->parent;
4579 struct dentry *d = cgrp->dentry;
4580 struct cgroup_subsys *ss;
4581
4582 mutex_lock(&cgroup_mutex);
4583
4584 /*
4585 * css_tryget() is guaranteed to fail now. Tell subsystems to
4586 * initiate destruction.
4587 */
4588 for_each_root_subsys(cgrp->root, ss)
4589 offline_css(ss, cgrp);
4590
4591 /*
4592 * Put the css refs from cgroup_destroy_locked(). Each css holds
4593 * an extra reference to the cgroup's dentry and cgroup removal
4594 * proceeds regardless of css refs. On the last put of each css,
4595 * whenever that may be, the extra dentry ref is put so that dentry
4596 * destruction happens only after all css's are released.
4597 */
4598 for_each_root_subsys(cgrp->root, ss)
4599 css_put(cgrp->subsys[ss->subsys_id]);
4600
4601 /* delete this cgroup from parent->children */
4602 list_del_rcu(&cgrp->sibling);
4603
4604 dput(d);
4605
4606 set_bit(CGRP_RELEASABLE, &parent->flags);
4607 check_for_release(parent);
4608
4609 mutex_unlock(&cgroup_mutex);
4325} 4610}
4326 4611
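The two-step destruction above hinges on percpu_ref_kill_and_confirm(): the confirm callback fires only once every CPU is guaranteed to see the ref as dead, which is when ->css_offline() may safely run. A compact sketch of that handshake for a made-up object (struct example_obj2 with a teardown_work item):

#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/workqueue.h>

struct example_obj2 {
	struct percpu_ref refcnt;
	struct work_struct teardown_work;	/* performs the offline step */
};

/* invoked once the killed ref is seen as dead on all CPUs */
static void example_confirm_kill(struct percpu_ref *ref)
{
	struct example_obj2 *obj = container_of(ref, struct example_obj2, refcnt);

	schedule_work(&obj->teardown_work);
}

static void example_start_destroy(struct example_obj2 *obj)
{
	/* new tryget()s fail from here on; teardown continues from the work item */
	percpu_ref_kill_and_confirm(&obj->refcnt, example_confirm_kill);
}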
4327static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4612static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
@@ -4361,12 +4646,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4361 cgroup_init_cftsets(ss); 4646 cgroup_init_cftsets(ss);
4362 4647
4363 /* Create the top cgroup state for this subsystem */ 4648 /* Create the top cgroup state for this subsystem */
4364 list_add(&ss->sibling, &rootnode.subsys_list); 4649 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4365 ss->root = &rootnode; 4650 ss->root = &cgroup_dummy_root;
4366 css = ss->css_alloc(dummytop); 4651 css = ss->css_alloc(cgroup_dummy_top);
4367 /* We don't handle early failures gracefully */ 4652 /* We don't handle early failures gracefully */
4368 BUG_ON(IS_ERR(css)); 4653 BUG_ON(IS_ERR(css));
4369 init_cgroup_css(css, ss, dummytop); 4654 init_cgroup_css(css, ss, cgroup_dummy_top);
4370 4655
4371 /* Update the init_css_set to contain a subsys 4656 /* Update the init_css_set to contain a subsys
4372 * pointer to this state - since the subsystem is 4657 * pointer to this state - since the subsystem is
@@ -4381,7 +4666,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4381 * need to invoke fork callbacks here. */ 4666 * need to invoke fork callbacks here. */
4382 BUG_ON(!list_empty(&init_task.tasks)); 4667 BUG_ON(!list_empty(&init_task.tasks));
4383 4668
4384 BUG_ON(online_css(ss, dummytop)); 4669 BUG_ON(online_css(ss, cgroup_dummy_top));
4385 4670
4386 mutex_unlock(&cgroup_mutex); 4671 mutex_unlock(&cgroup_mutex);
4387 4672
@@ -4404,7 +4689,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4404 struct cgroup_subsys_state *css; 4689 struct cgroup_subsys_state *css;
4405 int i, ret; 4690 int i, ret;
4406 struct hlist_node *tmp; 4691 struct hlist_node *tmp;
4407 struct css_set *cg; 4692 struct css_set *cset;
4408 unsigned long key; 4693 unsigned long key;
4409 4694
4410 /* check name and function validity */ 4695 /* check name and function validity */
@@ -4427,7 +4712,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4427 */ 4712 */
4428 if (ss->module == NULL) { 4713 if (ss->module == NULL) {
4429 /* a sanity check */ 4714 /* a sanity check */
4430 BUG_ON(subsys[ss->subsys_id] != ss); 4715 BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
4431 return 0; 4716 return 0;
4432 } 4717 }
4433 4718
@@ -4435,26 +4720,26 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4435 cgroup_init_cftsets(ss); 4720 cgroup_init_cftsets(ss);
4436 4721
4437 mutex_lock(&cgroup_mutex); 4722 mutex_lock(&cgroup_mutex);
4438 subsys[ss->subsys_id] = ss; 4723 cgroup_subsys[ss->subsys_id] = ss;
4439 4724
4440 /* 4725 /*
4441 * no ss->css_alloc seems to need anything important in the ss 4726 * no ss->css_alloc seems to need anything important in the ss
4442 * struct, so this can happen first (i.e. before the rootnode 4727 * struct, so this can happen first (i.e. before the dummy root
4443 * attachment). 4728 * attachment).
4444 */ 4729 */
4445 css = ss->css_alloc(dummytop); 4730 css = ss->css_alloc(cgroup_dummy_top);
4446 if (IS_ERR(css)) { 4731 if (IS_ERR(css)) {
4447 /* failure case - need to deassign the subsys[] slot. */ 4732 /* failure case - need to deassign the cgroup_subsys[] slot. */
4448 subsys[ss->subsys_id] = NULL; 4733 cgroup_subsys[ss->subsys_id] = NULL;
4449 mutex_unlock(&cgroup_mutex); 4734 mutex_unlock(&cgroup_mutex);
4450 return PTR_ERR(css); 4735 return PTR_ERR(css);
4451 } 4736 }
4452 4737
4453 list_add(&ss->sibling, &rootnode.subsys_list); 4738 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4454 ss->root = &rootnode; 4739 ss->root = &cgroup_dummy_root;
4455 4740
4456 /* our new subsystem will be attached to the dummy hierarchy. */ 4741 /* our new subsystem will be attached to the dummy hierarchy. */
4457 init_cgroup_css(css, ss, dummytop); 4742 init_cgroup_css(css, ss, cgroup_dummy_top);
4458 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4743 /* init_idr must be after init_cgroup_css because it sets css->id. */
4459 if (ss->use_id) { 4744 if (ss->use_id) {
4460 ret = cgroup_init_idr(ss, css); 4745 ret = cgroup_init_idr(ss, css);
@@ -4471,21 +4756,21 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4471 * this is all done under the css_set_lock. 4756 * this is all done under the css_set_lock.
4472 */ 4757 */
4473 write_lock(&css_set_lock); 4758 write_lock(&css_set_lock);
4474 hash_for_each_safe(css_set_table, i, tmp, cg, hlist) { 4759 hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
4475 /* skip entries that we already rehashed */ 4760 /* skip entries that we already rehashed */
4476 if (cg->subsys[ss->subsys_id]) 4761 if (cset->subsys[ss->subsys_id])
4477 continue; 4762 continue;
4478 /* remove existing entry */ 4763 /* remove existing entry */
4479 hash_del(&cg->hlist); 4764 hash_del(&cset->hlist);
4480 /* set new value */ 4765 /* set new value */
4481 cg->subsys[ss->subsys_id] = css; 4766 cset->subsys[ss->subsys_id] = css;
4482 /* recompute hash and restore entry */ 4767 /* recompute hash and restore entry */
4483 key = css_set_hash(cg->subsys); 4768 key = css_set_hash(cset->subsys);
4484 hash_add(css_set_table, &cg->hlist, key); 4769 hash_add(css_set_table, &cset->hlist, key);
4485 } 4770 }
4486 write_unlock(&css_set_lock); 4771 write_unlock(&css_set_lock);
4487 4772
4488 ret = online_css(ss, dummytop); 4773 ret = online_css(ss, cgroup_dummy_top);
4489 if (ret) 4774 if (ret)
4490 goto err_unload; 4775 goto err_unload;
4491 4776
@@ -4511,7 +4796,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4511 */ 4796 */
4512void cgroup_unload_subsys(struct cgroup_subsys *ss) 4797void cgroup_unload_subsys(struct cgroup_subsys *ss)
4513{ 4798{
4514 struct cg_cgroup_link *link; 4799 struct cgrp_cset_link *link;
4515 4800
4516 BUG_ON(ss->module == NULL); 4801 BUG_ON(ss->module == NULL);
4517 4802
@@ -4520,45 +4805,46 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4520 * try_module_get in parse_cgroupfs_options should ensure that it 4805 * try_module_get in parse_cgroupfs_options should ensure that it
4521 * doesn't start being used while we're killing it off. 4806 * doesn't start being used while we're killing it off.
4522 */ 4807 */
4523 BUG_ON(ss->root != &rootnode); 4808 BUG_ON(ss->root != &cgroup_dummy_root);
4524 4809
4525 mutex_lock(&cgroup_mutex); 4810 mutex_lock(&cgroup_mutex);
4526 4811
4527 offline_css(ss, dummytop); 4812 offline_css(ss, cgroup_dummy_top);
4528 4813
4529 if (ss->use_id) 4814 if (ss->use_id)
4530 idr_destroy(&ss->idr); 4815 idr_destroy(&ss->idr);
4531 4816
4532 /* deassign the subsys_id */ 4817 /* deassign the subsys_id */
4533 subsys[ss->subsys_id] = NULL; 4818 cgroup_subsys[ss->subsys_id] = NULL;
4534 4819
4535 /* remove subsystem from rootnode's list of subsystems */ 4820 /* remove subsystem from the dummy root's list of subsystems */
4536 list_del_init(&ss->sibling); 4821 list_del_init(&ss->sibling);
4537 4822
4538 /* 4823 /*
4539 * disentangle the css from all css_sets attached to the dummytop. as 4824 * disentangle the css from all css_sets attached to the dummy
4540 * in loading, we need to pay our respects to the hashtable gods. 4825 * top. as in loading, we need to pay our respects to the hashtable
4826 * gods.
4541 */ 4827 */
4542 write_lock(&css_set_lock); 4828 write_lock(&css_set_lock);
4543 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { 4829 list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
4544 struct css_set *cg = link->cg; 4830 struct css_set *cset = link->cset;
4545 unsigned long key; 4831 unsigned long key;
4546 4832
4547 hash_del(&cg->hlist); 4833 hash_del(&cset->hlist);
4548 cg->subsys[ss->subsys_id] = NULL; 4834 cset->subsys[ss->subsys_id] = NULL;
4549 key = css_set_hash(cg->subsys); 4835 key = css_set_hash(cset->subsys);
4550 hash_add(css_set_table, &cg->hlist, key); 4836 hash_add(css_set_table, &cset->hlist, key);
4551 } 4837 }
4552 write_unlock(&css_set_lock); 4838 write_unlock(&css_set_lock);
4553 4839
4554 /* 4840 /*
4555 * remove subsystem's css from the dummytop and free it - need to 4841 * remove subsystem's css from the cgroup_dummy_top and free it -
4556 * free before marking as null because ss->css_free needs the 4842 * need to free before marking as null because ss->css_free needs
4557 * cgrp->subsys pointer to find their state. note that this also 4843 * the cgrp->subsys pointer to find their state. note that this
4558 * takes care of freeing the css_id. 4844 * also takes care of freeing the css_id.
4559 */ 4845 */
4560 ss->css_free(dummytop); 4846 ss->css_free(cgroup_dummy_top);
4561 dummytop->subsys[ss->subsys_id] = NULL; 4847 cgroup_dummy_top->subsys[ss->subsys_id] = NULL;
4562 4848
4563 mutex_unlock(&cgroup_mutex); 4849 mutex_unlock(&cgroup_mutex);
4564} 4850}
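Both the load and unload paths above have to pull each css_set out of css_set_table, rewrite the subsys[] slot, recompute the key and re-insert it, because that slot feeds the hash. A tiny self-contained version of that del / modify / re-add dance (chained table and types invented for the example, not the kernel's hashtable API):

/*
 * Re-hash-on-key-change sketch: when a field that feeds the hash changes
 * (here, one slot of subsys[]), the entry must be unhooked, modified and
 * re-inserted under its new key.  Minimal chained table, invented types.
 */
#include <stdio.h>
#include <stdlib.h>

#define NBUCKETS 8
#define NSUBSYS  4

struct cset {
	void	    *subsys[NSUBSYS];	/* participates in the hash key */
	struct cset *next;		/* hlist analogue */
};

static struct cset *table[NBUCKETS];

static unsigned int cset_hash(void **subsys)
{
	unsigned long h = 0;

	for (int i = 0; i < NSUBSYS; i++)
		h = h * 31 + (unsigned long)subsys[i];
	return h % NBUCKETS;
}

static void hash_add(struct cset *cset)
{
	unsigned int key = cset_hash(cset->subsys);

	cset->next = table[key];
	table[key] = cset;
}

static void hash_del(struct cset *cset)
{
	unsigned int key = cset_hash(cset->subsys);
	struct cset **p = &table[key];

	while (*p && *p != cset)
		p = &(*p)->next;
	if (*p)
		*p = cset->next;
}

int main(void)
{
	struct cset *cset = calloc(1, sizeof(*cset));
	void *new_css = (void *)0x1234;		/* stand-in for the new css */

	hash_add(cset);

	/* the subsys[] slot is part of the key, so: del, modify, re-add */
	hash_del(cset);
	cset->subsys[2] = new_css;
	hash_add(cset);

	printf("cset now lives in bucket %u\n", cset_hash(cset->subsys));
	free(cset);
	return 0;
}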
@@ -4572,30 +4858,25 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
4572 */ 4858 */
4573int __init cgroup_init_early(void) 4859int __init cgroup_init_early(void)
4574{ 4860{
4861 struct cgroup_subsys *ss;
4575 int i; 4862 int i;
4863
4576 atomic_set(&init_css_set.refcount, 1); 4864 atomic_set(&init_css_set.refcount, 1);
4577 INIT_LIST_HEAD(&init_css_set.cg_links); 4865 INIT_LIST_HEAD(&init_css_set.cgrp_links);
4578 INIT_LIST_HEAD(&init_css_set.tasks); 4866 INIT_LIST_HEAD(&init_css_set.tasks);
4579 INIT_HLIST_NODE(&init_css_set.hlist); 4867 INIT_HLIST_NODE(&init_css_set.hlist);
4580 css_set_count = 1; 4868 css_set_count = 1;
4581 init_cgroup_root(&rootnode); 4869 init_cgroup_root(&cgroup_dummy_root);
4582 root_count = 1; 4870 cgroup_root_count = 1;
4583 init_task.cgroups = &init_css_set; 4871 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4584
4585 init_css_set_link.cg = &init_css_set;
4586 init_css_set_link.cgrp = dummytop;
4587 list_add(&init_css_set_link.cgrp_link_list,
4588 &rootnode.top_cgroup.css_sets);
4589 list_add(&init_css_set_link.cg_link_list,
4590 &init_css_set.cg_links);
4591
4592 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4593 struct cgroup_subsys *ss = subsys[i];
4594
4595 /* at bootup time, we don't worry about modular subsystems */
4596 if (!ss || ss->module)
4597 continue;
4598 4872
4873 init_cgrp_cset_link.cset = &init_css_set;
4874 init_cgrp_cset_link.cgrp = cgroup_dummy_top;
4875 list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links);
4876 list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links);
4877
4878 /* at bootup time, we don't worry about modular subsystems */
4879 for_each_builtin_subsys(ss, i) {
4599 BUG_ON(!ss->name); 4880 BUG_ON(!ss->name);
4600 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4881 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4601 BUG_ON(!ss->css_alloc); 4882 BUG_ON(!ss->css_alloc);
@@ -4620,30 +4901,33 @@ int __init cgroup_init_early(void)
4620 */ 4901 */
4621int __init cgroup_init(void) 4902int __init cgroup_init(void)
4622{ 4903{
4623 int err; 4904 struct cgroup_subsys *ss;
4624 int i;
4625 unsigned long key; 4905 unsigned long key;
4906 int i, err;
4626 4907
4627 err = bdi_init(&cgroup_backing_dev_info); 4908 err = bdi_init(&cgroup_backing_dev_info);
4628 if (err) 4909 if (err)
4629 return err; 4910 return err;
4630 4911
4631 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4912 for_each_builtin_subsys(ss, i) {
4632 struct cgroup_subsys *ss = subsys[i];
4633
4634 /* at bootup time, we don't worry about modular subsystems */
4635 if (!ss || ss->module)
4636 continue;
4637 if (!ss->early_init) 4913 if (!ss->early_init)
4638 cgroup_init_subsys(ss); 4914 cgroup_init_subsys(ss);
4639 if (ss->use_id) 4915 if (ss->use_id)
4640 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); 4916 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
4641 } 4917 }
4642 4918
4919 /* allocate id for the dummy hierarchy */
4920 mutex_lock(&cgroup_mutex);
4921 mutex_lock(&cgroup_root_mutex);
4922
4643 /* Add init_css_set to the hash table */ 4923 /* Add init_css_set to the hash table */
4644 key = css_set_hash(init_css_set.subsys); 4924 key = css_set_hash(init_css_set.subsys);
4645 hash_add(css_set_table, &init_css_set.hlist, key); 4925 hash_add(css_set_table, &init_css_set.hlist, key);
4646 BUG_ON(!init_root_id(&rootnode)); 4926
4927 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
4928
4929 mutex_unlock(&cgroup_root_mutex);
4930 mutex_unlock(&cgroup_mutex);
4647 4931
4648 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4932 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4649 if (!cgroup_kobj) { 4933 if (!cgroup_kobj) {
@@ -4708,7 +4992,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4708 int count = 0; 4992 int count = 0;
4709 4993
4710 seq_printf(m, "%d:", root->hierarchy_id); 4994 seq_printf(m, "%d:", root->hierarchy_id);
4711 for_each_subsys(root, ss) 4995 for_each_root_subsys(root, ss)
4712 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4996 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4713 if (strlen(root->name)) 4997 if (strlen(root->name))
4714 seq_printf(m, "%sname=%s", count ? "," : "", 4998 seq_printf(m, "%sname=%s", count ? "," : "",
@@ -4734,6 +5018,7 @@ out:
4734/* Display information about each subsystem and each hierarchy */ 5018/* Display information about each subsystem and each hierarchy */
4735static int proc_cgroupstats_show(struct seq_file *m, void *v) 5019static int proc_cgroupstats_show(struct seq_file *m, void *v)
4736{ 5020{
5021 struct cgroup_subsys *ss;
4737 int i; 5022 int i;
4738 5023
4739 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 5024 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
@@ -4743,14 +5028,12 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
4743 * subsys/hierarchy state. 5028 * subsys/hierarchy state.
4744 */ 5029 */
4745 mutex_lock(&cgroup_mutex); 5030 mutex_lock(&cgroup_mutex);
4746 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 5031
4747 struct cgroup_subsys *ss = subsys[i]; 5032 for_each_subsys(ss, i)
4748 if (ss == NULL)
4749 continue;
4750 seq_printf(m, "%s\t%d\t%d\t%d\n", 5033 seq_printf(m, "%s\t%d\t%d\t%d\n",
4751 ss->name, ss->root->hierarchy_id, 5034 ss->name, ss->root->hierarchy_id,
4752 ss->root->number_of_cgroups, !ss->disabled); 5035 ss->root->number_of_cgroups, !ss->disabled);
4753 } 5036
4754 mutex_unlock(&cgroup_mutex); 5037 mutex_unlock(&cgroup_mutex);
4755 return 0; 5038 return 0;
4756} 5039}
@@ -4786,8 +5069,8 @@ static const struct file_operations proc_cgroupstats_operations = {
4786void cgroup_fork(struct task_struct *child) 5069void cgroup_fork(struct task_struct *child)
4787{ 5070{
4788 task_lock(current); 5071 task_lock(current);
5072 get_css_set(task_css_set(current));
4789 child->cgroups = current->cgroups; 5073 child->cgroups = current->cgroups;
4790 get_css_set(child->cgroups);
4791 task_unlock(current); 5074 task_unlock(current);
4792 INIT_LIST_HEAD(&child->cg_list); 5075 INIT_LIST_HEAD(&child->cg_list);
4793} 5076}
@@ -4804,6 +5087,7 @@ void cgroup_fork(struct task_struct *child)
4804 */ 5087 */
4805void cgroup_post_fork(struct task_struct *child) 5088void cgroup_post_fork(struct task_struct *child)
4806{ 5089{
5090 struct cgroup_subsys *ss;
4807 int i; 5091 int i;
4808 5092
4809 /* 5093 /*
@@ -4821,7 +5105,7 @@ void cgroup_post_fork(struct task_struct *child)
4821 write_lock(&css_set_lock); 5105 write_lock(&css_set_lock);
4822 task_lock(child); 5106 task_lock(child);
4823 if (list_empty(&child->cg_list)) 5107 if (list_empty(&child->cg_list))
4824 list_add(&child->cg_list, &child->cgroups->tasks); 5108 list_add(&child->cg_list, &task_css_set(child)->tasks);
4825 task_unlock(child); 5109 task_unlock(child);
4826 write_unlock(&css_set_lock); 5110 write_unlock(&css_set_lock);
4827 } 5111 }
@@ -4840,12 +5124,9 @@ void cgroup_post_fork(struct task_struct *child)
4840 * of the array can be freed at module unload, so we 5124 * of the array can be freed at module unload, so we
4841 * can't touch that. 5125 * can't touch that.
4842 */ 5126 */
4843 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 5127 for_each_builtin_subsys(ss, i)
4844 struct cgroup_subsys *ss = subsys[i];
4845
4846 if (ss->fork) 5128 if (ss->fork)
4847 ss->fork(child); 5129 ss->fork(child);
4848 }
4849 } 5130 }
4850} 5131}
4851 5132
@@ -4886,7 +5167,8 @@ void cgroup_post_fork(struct task_struct *child)
4886 */ 5167 */
4887void cgroup_exit(struct task_struct *tsk, int run_callbacks) 5168void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4888{ 5169{
4889 struct css_set *cg; 5170 struct cgroup_subsys *ss;
5171 struct css_set *cset;
4890 int i; 5172 int i;
4891 5173
4892 /* 5174 /*
@@ -4903,36 +5185,32 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4903 5185
4904 /* Reassign the task to the init_css_set. */ 5186 /* Reassign the task to the init_css_set. */
4905 task_lock(tsk); 5187 task_lock(tsk);
4906 cg = tsk->cgroups; 5188 cset = task_css_set(tsk);
4907 tsk->cgroups = &init_css_set; 5189 RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
4908 5190
4909 if (run_callbacks && need_forkexit_callback) { 5191 if (run_callbacks && need_forkexit_callback) {
4910 /* 5192 /*
4911 * fork/exit callbacks are supported only for builtin 5193 * fork/exit callbacks are supported only for builtin
4912 * subsystems, see cgroup_post_fork() for details. 5194 * subsystems, see cgroup_post_fork() for details.
4913 */ 5195 */
4914 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 5196 for_each_builtin_subsys(ss, i) {
4915 struct cgroup_subsys *ss = subsys[i];
4916
4917 if (ss->exit) { 5197 if (ss->exit) {
4918 struct cgroup *old_cgrp = 5198 struct cgroup *old_cgrp = cset->subsys[i]->cgroup;
4919 rcu_dereference_raw(cg->subsys[i])->cgroup;
4920 struct cgroup *cgrp = task_cgroup(tsk, i); 5199 struct cgroup *cgrp = task_cgroup(tsk, i);
5200
4921 ss->exit(cgrp, old_cgrp, tsk); 5201 ss->exit(cgrp, old_cgrp, tsk);
4922 } 5202 }
4923 } 5203 }
4924 } 5204 }
4925 task_unlock(tsk); 5205 task_unlock(tsk);
4926 5206
4927 put_css_set_taskexit(cg); 5207 put_css_set_taskexit(cset);
4928} 5208}
4929 5209
4930static void check_for_release(struct cgroup *cgrp) 5210static void check_for_release(struct cgroup *cgrp)
4931{ 5211{
4932 /* All of these checks rely on RCU to keep the cgroup
4933 * structure alive */
4934 if (cgroup_is_releasable(cgrp) && 5212 if (cgroup_is_releasable(cgrp) &&
4935 !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) { 5213 list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
4936 /* 5214 /*
4937 * Control Group is currently removable. If it's not 5215 * Control Group is currently removable. If it's not
4938 * already queued for a userspace notification, queue 5216 * already queued for a userspace notification, queue
@@ -4941,7 +5219,7 @@ static void check_for_release(struct cgroup *cgrp)
4941 int need_schedule_work = 0; 5219 int need_schedule_work = 0;
4942 5220
4943 raw_spin_lock(&release_list_lock); 5221 raw_spin_lock(&release_list_lock);
4944 if (!cgroup_is_removed(cgrp) && 5222 if (!cgroup_is_dead(cgrp) &&
4945 list_empty(&cgrp->release_list)) { 5223 list_empty(&cgrp->release_list)) {
4946 list_add(&cgrp->release_list, &release_list); 5224 list_add(&cgrp->release_list, &release_list);
4947 need_schedule_work = 1; 5225 need_schedule_work = 1;
@@ -4952,34 +5230,6 @@ static void check_for_release(struct cgroup *cgrp)
4952 } 5230 }
4953} 5231}
4954 5232
4955/* Caller must verify that the css is not for root cgroup */
4956bool __css_tryget(struct cgroup_subsys_state *css)
4957{
4958 while (true) {
4959 int t, v;
4960
4961 v = css_refcnt(css);
4962 t = atomic_cmpxchg(&css->refcnt, v, v + 1);
4963 if (likely(t == v))
4964 return true;
4965 else if (t < 0)
4966 return false;
4967 cpu_relax();
4968 }
4969}
4970EXPORT_SYMBOL_GPL(__css_tryget);
4971
4972/* Caller must verify that the css is not for root cgroup */
4973void __css_put(struct cgroup_subsys_state *css)
4974{
4975 int v;
4976
4977 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
4978 if (v == 0)
4979 schedule_work(&css->dput_work);
4980}
4981EXPORT_SYMBOL_GPL(__css_put);
4982
4983/* 5233/*
4984 * Notify userspace when a cgroup is released, by running the 5234 * Notify userspace when a cgroup is released, by running the
4985 * configured release agent with the name of the cgroup (path 5235 * configured release agent with the name of the cgroup (path
@@ -5054,23 +5304,19 @@ static void cgroup_release_agent(struct work_struct *work)
5054 5304
5055static int __init cgroup_disable(char *str) 5305static int __init cgroup_disable(char *str)
5056{ 5306{
5057 int i; 5307 struct cgroup_subsys *ss;
5058 char *token; 5308 char *token;
5309 int i;
5059 5310
5060 while ((token = strsep(&str, ",")) != NULL) { 5311 while ((token = strsep(&str, ",")) != NULL) {
5061 if (!*token) 5312 if (!*token)
5062 continue; 5313 continue;
5063 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
5064 struct cgroup_subsys *ss = subsys[i];
5065
5066 /*
5067 * cgroup_disable, being at boot time, can't
5068 * know about module subsystems, so we don't
5069 * worry about them.
5070 */
5071 if (!ss || ss->module)
5072 continue;
5073 5314
5315 /*
5316 * cgroup_disable, being at boot time, can't know about
5317 * module subsystems, so we don't worry about them.
5318 */
5319 for_each_builtin_subsys(ss, i) {
5074 if (!strcmp(token, ss->name)) { 5320 if (!strcmp(token, ss->name)) {
5075 ss->disabled = 1; 5321 ss->disabled = 1;
5076 printk(KERN_INFO "Disabling %s control group" 5322 printk(KERN_INFO "Disabling %s control group"
@@ -5087,9 +5333,7 @@ __setup("cgroup_disable=", cgroup_disable);
5087 * Functions for CSS ID. 5333 * Functions for CSS ID.
5088 */ 5334 */
5089 5335
5090/* 5336/* to get ID other than 0, this should be called when !cgroup_is_dead() */
5091 *To get ID other than 0, this should be called when !cgroup_is_removed().
5092 */
5093unsigned short css_id(struct cgroup_subsys_state *css) 5337unsigned short css_id(struct cgroup_subsys_state *css)
5094{ 5338{
5095 struct css_id *cssid; 5339 struct css_id *cssid;
@@ -5099,7 +5343,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
5099 * on this or this is under rcu_read_lock(). Once css->id is allocated, 5343 * on this or this is under rcu_read_lock(). Once css->id is allocated,
5100 * it's unchanged until freed. 5344 * it's unchanged until freed.
5101 */ 5345 */
5102 cssid = rcu_dereference_check(css->id, css_refcnt(css)); 5346 cssid = rcu_dereference_raw(css->id);
5103 5347
5104 if (cssid) 5348 if (cssid)
5105 return cssid->id; 5349 return cssid->id;
@@ -5107,18 +5351,6 @@ unsigned short css_id(struct cgroup_subsys_state *css)
5107} 5351}
5108EXPORT_SYMBOL_GPL(css_id); 5352EXPORT_SYMBOL_GPL(css_id);
5109 5353
5110unsigned short css_depth(struct cgroup_subsys_state *css)
5111{
5112 struct css_id *cssid;
5113
5114 cssid = rcu_dereference_check(css->id, css_refcnt(css));
5115
5116 if (cssid)
5117 return cssid->depth;
5118 return 0;
5119}
5120EXPORT_SYMBOL_GPL(css_depth);
5121
5122/** 5354/**
5123 * css_is_ancestor - test "root" css is an ancestor of "child" 5355 * css_is_ancestor - test "root" css is an ancestor of "child"
5124 * @child: the css to be tested. 5356 * @child: the css to be tested.
@@ -5153,7 +5385,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
5153 5385
5154void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) 5386void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
5155{ 5387{
5156 struct css_id *id = css->id; 5388 struct css_id *id = rcu_dereference_protected(css->id, true);
5389
5157 /* When this is called before css_id initialization, id can be NULL */ 5390 /* When this is called before css_id initialization, id can be NULL */
5158 if (!id) 5391 if (!id)
5159 return; 5392 return;
@@ -5219,8 +5452,8 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
5219 return PTR_ERR(newid); 5452 return PTR_ERR(newid);
5220 5453
5221 newid->stack[0] = newid->id; 5454 newid->stack[0] = newid->id;
5222 newid->css = rootcss; 5455 RCU_INIT_POINTER(newid->css, rootcss);
5223 rootcss->id = newid; 5456 RCU_INIT_POINTER(rootcss->id, newid);
5224 return 0; 5457 return 0;
5225} 5458}
5226 5459
@@ -5234,7 +5467,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
5234 subsys_id = ss->subsys_id; 5467 subsys_id = ss->subsys_id;
5235 parent_css = parent->subsys[subsys_id]; 5468 parent_css = parent->subsys[subsys_id];
5236 child_css = child->subsys[subsys_id]; 5469 child_css = child->subsys[subsys_id];
5237 parent_id = parent_css->id; 5470 parent_id = rcu_dereference_protected(parent_css->id, true);
5238 depth = parent_id->depth + 1; 5471 depth = parent_id->depth + 1;
5239 5472
5240 child_id = get_new_cssid(ss, depth); 5473 child_id = get_new_cssid(ss, depth);
@@ -5299,7 +5532,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5299} 5532}
5300 5533
5301#ifdef CONFIG_CGROUP_DEBUG 5534#ifdef CONFIG_CGROUP_DEBUG
5302static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) 5535static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
5303{ 5536{
5304 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5537 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5305 5538
@@ -5309,48 +5542,43 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
5309 return css; 5542 return css;
5310} 5543}
5311 5544
5312static void debug_css_free(struct cgroup *cont) 5545static void debug_css_free(struct cgroup *cgrp)
5313{
5314 kfree(cont->subsys[debug_subsys_id]);
5315}
5316
5317static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
5318{ 5546{
5319 return atomic_read(&cont->count); 5547 kfree(cgrp->subsys[debug_subsys_id]);
5320} 5548}
5321 5549
5322static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) 5550static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft)
5323{ 5551{
5324 return cgroup_task_count(cont); 5552 return cgroup_task_count(cgrp);
5325} 5553}
5326 5554
5327static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) 5555static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft)
5328{ 5556{
5329 return (u64)(unsigned long)current->cgroups; 5557 return (u64)(unsigned long)current->cgroups;
5330} 5558}
5331 5559
5332static u64 current_css_set_refcount_read(struct cgroup *cont, 5560static u64 current_css_set_refcount_read(struct cgroup *cgrp,
5333 struct cftype *cft) 5561 struct cftype *cft)
5334{ 5562{
5335 u64 count; 5563 u64 count;
5336 5564
5337 rcu_read_lock(); 5565 rcu_read_lock();
5338 count = atomic_read(&current->cgroups->refcount); 5566 count = atomic_read(&task_css_set(current)->refcount);
5339 rcu_read_unlock(); 5567 rcu_read_unlock();
5340 return count; 5568 return count;
5341} 5569}
5342 5570
5343static int current_css_set_cg_links_read(struct cgroup *cont, 5571static int current_css_set_cg_links_read(struct cgroup *cgrp,
5344 struct cftype *cft, 5572 struct cftype *cft,
5345 struct seq_file *seq) 5573 struct seq_file *seq)
5346{ 5574{
5347 struct cg_cgroup_link *link; 5575 struct cgrp_cset_link *link;
5348 struct css_set *cg; 5576 struct css_set *cset;
5349 5577
5350 read_lock(&css_set_lock); 5578 read_lock(&css_set_lock);
5351 rcu_read_lock(); 5579 rcu_read_lock();
5352 cg = rcu_dereference(current->cgroups); 5580 cset = rcu_dereference(current->cgroups);
5353 list_for_each_entry(link, &cg->cg_links, cg_link_list) { 5581 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
5354 struct cgroup *c = link->cgrp; 5582 struct cgroup *c = link->cgrp;
5355 const char *name; 5583 const char *name;
5356 5584
@@ -5367,19 +5595,19 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
5367} 5595}
5368 5596
5369#define MAX_TASKS_SHOWN_PER_CSS 25 5597#define MAX_TASKS_SHOWN_PER_CSS 25
5370static int cgroup_css_links_read(struct cgroup *cont, 5598static int cgroup_css_links_read(struct cgroup *cgrp,
5371 struct cftype *cft, 5599 struct cftype *cft,
5372 struct seq_file *seq) 5600 struct seq_file *seq)
5373{ 5601{
5374 struct cg_cgroup_link *link; 5602 struct cgrp_cset_link *link;
5375 5603
5376 read_lock(&css_set_lock); 5604 read_lock(&css_set_lock);
5377 list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { 5605 list_for_each_entry(link, &cgrp->cset_links, cset_link) {
5378 struct css_set *cg = link->cg; 5606 struct css_set *cset = link->cset;
5379 struct task_struct *task; 5607 struct task_struct *task;
5380 int count = 0; 5608 int count = 0;
5381 seq_printf(seq, "css_set %p\n", cg); 5609 seq_printf(seq, "css_set %p\n", cset);
5382 list_for_each_entry(task, &cg->tasks, cg_list) { 5610 list_for_each_entry(task, &cset->tasks, cg_list) {
5383 if (count++ > MAX_TASKS_SHOWN_PER_CSS) { 5611 if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
5384 seq_puts(seq, " ...\n"); 5612 seq_puts(seq, " ...\n");
5385 break; 5613 break;
@@ -5400,10 +5628,6 @@ static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
5400 5628
5401static struct cftype debug_files[] = { 5629static struct cftype debug_files[] = {
5402 { 5630 {
5403 .name = "cgroup_refcount",
5404 .read_u64 = cgroup_refcount_read,
5405 },
5406 {
5407 .name = "taskcount", 5631 .name = "taskcount",
5408 .read_u64 = debug_taskcount_read, 5632 .read_u64 = debug_taskcount_read,
5409 }, 5633 },
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 198a38883e64..b2b227b82123 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -366,7 +366,7 @@ EXPORT_SYMBOL(cpu_down);
366#endif /*CONFIG_HOTPLUG_CPU*/ 366#endif /*CONFIG_HOTPLUG_CPU*/
367 367
368/* Requires cpu_add_remove_lock to be held */ 368/* Requires cpu_add_remove_lock to be held */
369static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) 369static int _cpu_up(unsigned int cpu, int tasks_frozen)
370{ 370{
371 int ret, nr_calls = 0; 371 int ret, nr_calls = 0;
372 void *hcpu = (void *)(long)cpu; 372 void *hcpu = (void *)(long)cpu;
@@ -419,7 +419,7 @@ out:
419 return ret; 419 return ret;
420} 420}
421 421
422int __cpuinit cpu_up(unsigned int cpu) 422int cpu_up(unsigned int cpu)
423{ 423{
424 int err = 0; 424 int err = 0;
425 425
@@ -618,7 +618,7 @@ core_initcall(cpu_hotplug_pm_sync_init);
618 * It must be called by the arch code on the new cpu, before the new cpu 618 * It must be called by the arch code on the new cpu, before the new cpu
619 * enables interrupts and before the "boot" cpu returns from __cpu_up(). 619 * enables interrupts and before the "boot" cpu returns from __cpu_up().
620 */ 620 */
621void __cpuinit notify_cpu_starting(unsigned int cpu) 621void notify_cpu_starting(unsigned int cpu)
622{ 622{
623 unsigned long val = CPU_STARTING; 623 unsigned long val = CPU_STARTING;
624 624
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64b3f791bbe5..e5657788fedd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -59,6 +59,7 @@
59#include <linux/mutex.h> 59#include <linux/mutex.h>
60#include <linux/workqueue.h> 60#include <linux/workqueue.h>
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
62#include <linux/wait.h>
62 63
63/* 64/*
64 * Tracks how many cpusets are currently defined in system. 65 * Tracks how many cpusets are currently defined in system.
@@ -87,6 +88,18 @@ struct cpuset {
87 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 88 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
88 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 89 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
89 90
91 /*
92 * These are the old Memory Nodes that tasks took on.
93 *
94 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
95 * - A new cpuset's old_mems_allowed is initialized when some
96 * task is moved into it.
97 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
98 * cpuset.mems_allowed and have tasks' nodemask updated, and
99 * then old_mems_allowed is updated to mems_allowed.
100 */
101 nodemask_t old_mems_allowed;
102
90 struct fmeter fmeter; /* memory_pressure filter */ 103 struct fmeter fmeter; /* memory_pressure filter */
91 104
92 /* 105 /*
@@ -100,14 +113,12 @@ struct cpuset {
100 113
101 /* for custom sched domain */ 114 /* for custom sched domain */
102 int relax_domain_level; 115 int relax_domain_level;
103
104 struct work_struct hotplug_work;
105}; 116};
106 117
107/* Retrieve the cpuset for a cgroup */ 118/* Retrieve the cpuset for a cgroup */
108static inline struct cpuset *cgroup_cs(struct cgroup *cont) 119static inline struct cpuset *cgroup_cs(struct cgroup *cgrp)
109{ 120{
110 return container_of(cgroup_subsys_state(cont, cpuset_subsys_id), 121 return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id),
111 struct cpuset, css); 122 struct cpuset, css);
112} 123}
113 124
@@ -267,14 +278,11 @@ static DEFINE_MUTEX(callback_mutex);
267/* 278/*
268 * CPU / memory hotplug is handled asynchronously. 279 * CPU / memory hotplug is handled asynchronously.
269 */ 280 */
270static struct workqueue_struct *cpuset_propagate_hotplug_wq;
271
272static void cpuset_hotplug_workfn(struct work_struct *work); 281static void cpuset_hotplug_workfn(struct work_struct *work);
273static void cpuset_propagate_hotplug_workfn(struct work_struct *work);
274static void schedule_cpuset_propagate_hotplug(struct cpuset *cs);
275
276static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); 282static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
277 283
284static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
285
278/* 286/*
279 * This is ugly, but preserves the userspace API for existing cpuset 287 * This is ugly, but preserves the userspace API for existing cpuset
280 * users. If someone tries to mount the "cpuset" filesystem, we 288 * users. If someone tries to mount the "cpuset" filesystem, we
@@ -304,53 +312,38 @@ static struct file_system_type cpuset_fs_type = {
304/* 312/*
305 * Return in pmask the portion of a cpusets's cpus_allowed that 313 * Return in pmask the portion of a cpusets's cpus_allowed that
306 * are online. If none are online, walk up the cpuset hierarchy 314 * are online. If none are online, walk up the cpuset hierarchy
307 * until we find one that does have some online cpus. If we get 315 * until we find one that does have some online cpus. The top
308 * all the way to the top and still haven't found any online cpus, 316 * cpuset always has some cpus online.
309 * return cpu_online_mask. Or if passed a NULL cs from an exit'ing
310 * task, return cpu_online_mask.
311 * 317 *
312 * One way or another, we guarantee to return some non-empty subset 318 * One way or another, we guarantee to return some non-empty subset
313 * of cpu_online_mask. 319 * of cpu_online_mask.
314 * 320 *
315 * Call with callback_mutex held. 321 * Call with callback_mutex held.
316 */ 322 */
317
318static void guarantee_online_cpus(const struct cpuset *cs, 323static void guarantee_online_cpus(const struct cpuset *cs,
319 struct cpumask *pmask) 324 struct cpumask *pmask)
320{ 325{
321 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 326 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
322 cs = parent_cs(cs); 327 cs = parent_cs(cs);
323 if (cs) 328 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
324 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
325 else
326 cpumask_copy(pmask, cpu_online_mask);
327 BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
328} 329}
329 330
330/* 331/*
331 * Return in *pmask the portion of a cpusets's mems_allowed that 332 * Return in *pmask the portion of a cpusets's mems_allowed that
332 * are online, with memory. If none are online with memory, walk 333 * are online, with memory. If none are online with memory, walk
333 * up the cpuset hierarchy until we find one that does have some 334 * up the cpuset hierarchy until we find one that does have some
334 * online mems. If we get all the way to the top and still haven't 335 * online mems. The top cpuset always has some mems online.
335 * found any online mems, return node_states[N_MEMORY].
336 * 336 *
337 * One way or another, we guarantee to return some non-empty subset 337 * One way or another, we guarantee to return some non-empty subset
338 * of node_states[N_MEMORY]. 338 * of node_states[N_MEMORY].
339 * 339 *
340 * Call with callback_mutex held. 340 * Call with callback_mutex held.
341 */ 341 */
342
343static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 342static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
344{ 343{
345 while (cs && !nodes_intersects(cs->mems_allowed, 344 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
346 node_states[N_MEMORY]))
347 cs = parent_cs(cs); 345 cs = parent_cs(cs);
348 if (cs) 346 nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
349 nodes_and(*pmask, cs->mems_allowed,
350 node_states[N_MEMORY]);
351 else
352 *pmask = node_states[N_MEMORY];
353 BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY]));
354} 347}
355 348
356/* 349/*
@@ -440,7 +433,7 @@ static void free_trial_cpuset(struct cpuset *trial)
440 433
441static int validate_change(const struct cpuset *cur, const struct cpuset *trial) 434static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
442{ 435{
443 struct cgroup *cont; 436 struct cgroup *cgrp;
444 struct cpuset *c, *par; 437 struct cpuset *c, *par;
445 int ret; 438 int ret;
446 439
@@ -448,7 +441,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
448 441
449 /* Each of our child cpusets must be a subset of us */ 442 /* Each of our child cpusets must be a subset of us */
450 ret = -EBUSY; 443 ret = -EBUSY;
451 cpuset_for_each_child(c, cont, cur) 444 cpuset_for_each_child(c, cgrp, cur)
452 if (!is_cpuset_subset(c, trial)) 445 if (!is_cpuset_subset(c, trial))
453 goto out; 446 goto out;
454 447
@@ -469,7 +462,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
469 * overlap 462 * overlap
470 */ 463 */
471 ret = -EINVAL; 464 ret = -EINVAL;
472 cpuset_for_each_child(c, cont, par) { 465 cpuset_for_each_child(c, cgrp, par) {
473 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 466 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
474 c != cur && 467 c != cur &&
475 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) 468 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
@@ -486,7 +479,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
486 */ 479 */
487 ret = -ENOSPC; 480 ret = -ENOSPC;
488 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && 481 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
489 (cpumask_empty(trial->cpus_allowed) || 482 (cpumask_empty(trial->cpus_allowed) &&
490 nodes_empty(trial->mems_allowed))) 483 nodes_empty(trial->mems_allowed)))
491 goto out; 484 goto out;
492 485
@@ -540,7 +533,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
540 * This function builds a partial partition of the systems CPUs 533 * This function builds a partial partition of the systems CPUs
541 * A 'partial partition' is a set of non-overlapping subsets whose 534 * A 'partial partition' is a set of non-overlapping subsets whose
542 * union is a subset of that set. 535 * union is a subset of that set.
543 * The output of this function needs to be passed to kernel/sched.c 536 * The output of this function needs to be passed to kernel/sched/core.c
544 * partition_sched_domains() routine, which will rebuild the scheduler's 537 * partition_sched_domains() routine, which will rebuild the scheduler's
545 * load balancing domains (sched domains) as specified by that partial 538 * load balancing domains (sched domains) as specified by that partial
546 * partition. 539 * partition.
@@ -569,7 +562,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
569 * is a subset of one of these domains, while there are as 562 * is a subset of one of these domains, while there are as
570 * many such domains as possible, each as small as possible. 563 * many such domains as possible, each as small as possible.
571 * doms - Conversion of 'csa' to an array of cpumasks, for passing to 564 * doms - Conversion of 'csa' to an array of cpumasks, for passing to
572 * the kernel/sched.c routine partition_sched_domains() in a 565 * the kernel/sched/core.c routine partition_sched_domains() in a
573 * convenient format, that can be easily compared to the prior 566 * convenient format, that can be easily compared to the prior
574 * value to determine what partition elements (sched domains) 567 * value to determine what partition elements (sched domains)
575 * were changed (added or removed.) 568 * were changed (added or removed.)
@@ -798,21 +791,43 @@ void rebuild_sched_domains(void)
798 mutex_unlock(&cpuset_mutex); 791 mutex_unlock(&cpuset_mutex);
799} 792}
800 793
801/** 794/*
802 * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's 795 * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
803 * @tsk: task to test 796 * @cs: the cpuset of interest
804 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
805 * 797 *
806 * Call with cpuset_mutex held. May take callback_mutex during call. 798 * A cpuset's effective cpumask is the cpumask of the nearest ancestor
807 * Called for each task in a cgroup by cgroup_scan_tasks(). 799 * with non-empty cpus. We use effective cpumask whenever:
808 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other 800 * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
809 * words, if its mask is not equal to its cpuset's mask). 801 * if the cpuset they reside in has no cpus)
802 * - we want to retrieve task_cs(tsk)'s cpus_allowed.
803 *
804 * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
805 * exception. See comments there.
810 */ 806 */
811static int cpuset_test_cpumask(struct task_struct *tsk, 807static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
812 struct cgroup_scanner *scan)
813{ 808{
814 return !cpumask_equal(&tsk->cpus_allowed, 809 while (cpumask_empty(cs->cpus_allowed))
815 (cgroup_cs(scan->cg))->cpus_allowed); 810 cs = parent_cs(cs);
811 return cs;
812}
813
814/*
815 * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
816 * @cs: the cpuset of interest
817 *
818 * A cpuset's effective nodemask is the nodemask of the nearest ancestor
819 * with non-empty mems. We use effective nodemask whenever:
820 * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
821 * if the cpuset they reside in has no mems)
822 * - we want to retrieve task_cs(tsk)'s mems_allowed.
823 *
824 * Called with cpuset_mutex held.
825 */
826static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
827{
828 while (nodes_empty(cs->mems_allowed))
829 cs = parent_cs(cs);
830 return cs;
816} 831}
817 832
818/** 833/**
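effective_cpumask_cpuset() and effective_nodemask_cpuset() above simply climb the parent chain until they reach a cpuset whose mask is non-empty; the loop terminates because the top cpuset is never empty. The same fallback walk over a toy tree (struct and field names are made up for the illustration, not kernel types):

/*
 * Walk-up-to-a-usable-ancestor sketch: if a node's own mask is empty,
 * fall back to the nearest ancestor that still has something in it.
 */
#include <stdio.h>

struct node {
	const char   *name;
	unsigned long mask;	/* stand-in for cpus_allowed / mems_allowed */
	struct node  *parent;	/* NULL only at the root */
};

/* mirrors effective_*_cpuset(): the root is assumed non-empty,
 * which is what guarantees the loop stops */
static struct node *effective_node(struct node *n)
{
	while (n->mask == 0)
		n = n->parent;
	return n;
}

int main(void)
{
	struct node root = { "root", 0xf, NULL  };
	struct node mid  = { "mid",  0x0, &root };	/* emptied, e.g. by hotplug */
	struct node leaf = { "leaf", 0x0, &mid  };	/* also empty */

	printf("leaf falls back to: %s (mask 0x%lx)\n",
	       effective_node(&leaf)->name, effective_node(&leaf)->mask);
	return 0;
}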
@@ -829,7 +844,10 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
829static void cpuset_change_cpumask(struct task_struct *tsk, 844static void cpuset_change_cpumask(struct task_struct *tsk,
830 struct cgroup_scanner *scan) 845 struct cgroup_scanner *scan)
831{ 846{
832 set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed)); 847 struct cpuset *cpus_cs;
848
849 cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg));
850 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
833} 851}
834 852
835/** 853/**
@@ -850,12 +868,51 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
850 struct cgroup_scanner scan; 868 struct cgroup_scanner scan;
851 869
852 scan.cg = cs->css.cgroup; 870 scan.cg = cs->css.cgroup;
853 scan.test_task = cpuset_test_cpumask; 871 scan.test_task = NULL;
854 scan.process_task = cpuset_change_cpumask; 872 scan.process_task = cpuset_change_cpumask;
855 scan.heap = heap; 873 scan.heap = heap;
856 cgroup_scan_tasks(&scan); 874 cgroup_scan_tasks(&scan);
857} 875}
858 876
877/*
878 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
879 * @root_cs: the root cpuset of the hierarchy
880 * @update_root: update root cpuset or not?
881 * @heap: the heap used by cgroup_scan_tasks()
882 *
883 * This will update cpumasks of tasks in @root_cs and all other empty cpusets
884 * which take on cpumask of @root_cs.
885 *
886 * Called with cpuset_mutex held
887 */
888static void update_tasks_cpumask_hier(struct cpuset *root_cs,
889 bool update_root, struct ptr_heap *heap)
890{
891 struct cpuset *cp;
892 struct cgroup *pos_cgrp;
893
894 if (update_root)
895 update_tasks_cpumask(root_cs, heap);
896
897 rcu_read_lock();
898 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
899 /* skip the whole subtree if @cp has some CPUs */
900 if (!cpumask_empty(cp->cpus_allowed)) {
901 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
902 continue;
903 }
904 if (!css_tryget(&cp->css))
905 continue;
906 rcu_read_unlock();
907
908 update_tasks_cpumask(cp, heap);
909
910 rcu_read_lock();
911 css_put(&cp->css);
912 }
913 rcu_read_unlock();
914}
915
859/** 916/**
860 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 917 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
861 * @cs: the cpuset to consider 918 * @cs: the cpuset to consider
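update_tasks_cpumask_hier() above visits the descendants in pre-order and prunes any subtree whose root has CPUs of its own, since those cpusets (and everything under them) are not inheriting this cpuset's mask; the RCU and css_tryget() juggling exists only so the update can sleep mid-walk. The pruning itself, reduced to a recursive sketch over a toy tree (all names invented):

/*
 * Pruned pre-order walk sketch: update every descendant whose own mask
 * is empty; a descendant with a mask of its own "shadows" the root, so
 * its whole subtree is skipped.  Toy types only.
 */
#include <stdio.h>

#define MAX_CHILDREN 4

struct cs {
	const char   *name;
	unsigned long mask;
	struct cs    *child[MAX_CHILDREN];
	int           nr_children;
};

static void update_tasks(struct cs *c, unsigned long effective)
{
	printf("update %-4s -> effective mask 0x%lx\n", c->name, effective);
}

static void update_hier(struct cs *root, unsigned long effective)
{
	for (int i = 0; i < root->nr_children; i++) {
		struct cs *cp = root->child[i];

		/* skip the whole subtree if @cp has a mask of its own */
		if (cp->mask != 0)
			continue;

		update_tasks(cp, effective);
		update_hier(cp, effective);
	}
}

int main(void)
{
	struct cs d = { "d", 0x0 };
	struct cs c = { "c", 0x2, { &d }, 1 };	/* has CPUs: subtree pruned */
	struct cs b = { "b", 0x0 };		/* empty: inherits from a */
	struct cs a = { "a", 0xc, { &b, &c }, 2 };

	update_tasks(&a, a.mask);		/* the update_root == true case */
	update_hier(&a, a.mask);
	return 0;
}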
@@ -888,14 +945,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
888 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) 945 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
889 return -EINVAL; 946 return -EINVAL;
890 } 947 }
891 retval = validate_change(cs, trialcs);
892 if (retval < 0)
893 return retval;
894 948
895 /* Nothing to do if the cpus didn't change */ 949 /* Nothing to do if the cpus didn't change */
896 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) 950 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
897 return 0; 951 return 0;
898 952
953 retval = validate_change(cs, trialcs);
954 if (retval < 0)
955 return retval;
956
899 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); 957 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
900 if (retval) 958 if (retval)
901 return retval; 959 return retval;
@@ -906,11 +964,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
906 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 964 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
907 mutex_unlock(&callback_mutex); 965 mutex_unlock(&callback_mutex);
908 966
909 /* 967 update_tasks_cpumask_hier(cs, true, &heap);
910 * Scan tasks in the cpuset, and update the cpumasks of any
911 * that need an update.
912 */
913 update_tasks_cpumask(cs, &heap);
914 968
915 heap_free(&heap); 969 heap_free(&heap);
916 970
@@ -943,12 +997,14 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
943 const nodemask_t *to) 997 const nodemask_t *to)
944{ 998{
945 struct task_struct *tsk = current; 999 struct task_struct *tsk = current;
1000 struct cpuset *mems_cs;
946 1001
947 tsk->mems_allowed = *to; 1002 tsk->mems_allowed = *to;
948 1003
949 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 1004 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
950 1005
951 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); 1006 mems_cs = effective_nodemask_cpuset(task_cs(tsk));
1007 guarantee_online_mems(mems_cs, &tsk->mems_allowed);
952} 1008}
953 1009
954/* 1010/*
@@ -1007,16 +1063,12 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1007static void cpuset_change_nodemask(struct task_struct *p, 1063static void cpuset_change_nodemask(struct task_struct *p,
1008 struct cgroup_scanner *scan) 1064 struct cgroup_scanner *scan)
1009{ 1065{
1066 struct cpuset *cs = cgroup_cs(scan->cg);
1010 struct mm_struct *mm; 1067 struct mm_struct *mm;
1011 struct cpuset *cs;
1012 int migrate; 1068 int migrate;
1013 const nodemask_t *oldmem = scan->data; 1069 nodemask_t *newmems = scan->data;
1014 static nodemask_t newmems; /* protected by cpuset_mutex */
1015
1016 cs = cgroup_cs(scan->cg);
1017 guarantee_online_mems(cs, &newmems);
1018 1070
1019 cpuset_change_task_nodemask(p, &newmems); 1071 cpuset_change_task_nodemask(p, newmems);
1020 1072
1021 mm = get_task_mm(p); 1073 mm = get_task_mm(p);
1022 if (!mm) 1074 if (!mm)
@@ -1026,7 +1078,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
1026 1078
1027 mpol_rebind_mm(mm, &cs->mems_allowed); 1079 mpol_rebind_mm(mm, &cs->mems_allowed);
1028 if (migrate) 1080 if (migrate)
1029 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); 1081 cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems);
1030 mmput(mm); 1082 mmput(mm);
1031} 1083}
1032 1084
@@ -1035,25 +1087,27 @@ static void *cpuset_being_rebound;
1035/** 1087/**
1036 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1088 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1037 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1089 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1038 * @oldmem: old mems_allowed of cpuset cs
1039 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1090 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1040 * 1091 *
1041 * Called with cpuset_mutex held 1092 * Called with cpuset_mutex held
1042 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1093 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1043 * if @heap != NULL. 1094 * if @heap != NULL.
1044 */ 1095 */
1045static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, 1096static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1046 struct ptr_heap *heap)
1047{ 1097{
1098 static nodemask_t newmems; /* protected by cpuset_mutex */
1048 struct cgroup_scanner scan; 1099 struct cgroup_scanner scan;
1100 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1049 1101
1050 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1102 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1051 1103
1104 guarantee_online_mems(mems_cs, &newmems);
1105
1052 scan.cg = cs->css.cgroup; 1106 scan.cg = cs->css.cgroup;
1053 scan.test_task = NULL; 1107 scan.test_task = NULL;
1054 scan.process_task = cpuset_change_nodemask; 1108 scan.process_task = cpuset_change_nodemask;
1055 scan.heap = heap; 1109 scan.heap = heap;
1056 scan.data = (nodemask_t *)oldmem; 1110 scan.data = &newmems;
1057 1111
1058 /* 1112 /*
1059 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't 1113 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
@@ -1067,11 +1121,56 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1067 */ 1121 */
1068 cgroup_scan_tasks(&scan); 1122 cgroup_scan_tasks(&scan);
1069 1123
1124 /*
1125 * All the tasks' nodemasks have been updated, update
1126 * cs->old_mems_allowed.
1127 */
1128 cs->old_mems_allowed = newmems;
1129
1070 /* We're done rebinding vmas to this cpuset's new mems_allowed. */ 1130 /* We're done rebinding vmas to this cpuset's new mems_allowed. */
1071 cpuset_being_rebound = NULL; 1131 cpuset_being_rebound = NULL;
1072} 1132}
1073 1133
1074/* 1134/*
1135 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
1136 * @root_cs: the root cpuset of the hierarchy
1137 * @update_root: update the root cpuset or not?
1138 * @heap: the heap used by cgroup_scan_tasks()
1139 *
1140 * This will update nodemasks of tasks in @root_cs and all other empty cpusets
1141 * which take on nodemask of @root_cs.
1142 *
1143 * Called with cpuset_mutex held
1144 */
1145static void update_tasks_nodemask_hier(struct cpuset *root_cs,
1146 bool update_root, struct ptr_heap *heap)
1147{
1148 struct cpuset *cp;
1149 struct cgroup *pos_cgrp;
1150
1151 if (update_root)
1152 update_tasks_nodemask(root_cs, heap);
1153
1154 rcu_read_lock();
1155 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
1156 /* skip the whole subtree if @cp has some mems */
1157 if (!nodes_empty(cp->mems_allowed)) {
1158 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
1159 continue;
1160 }
1161 if (!css_tryget(&cp->css))
1162 continue;
1163 rcu_read_unlock();
1164
1165 update_tasks_nodemask(cp, heap);
1166
1167 rcu_read_lock();
1168 css_put(&cp->css);
1169 }
1170 rcu_read_unlock();
1171}
1172
1173/*
1075 * Handle user request to change the 'mems' memory placement 1174 * Handle user request to change the 'mems' memory placement
1076 * of a cpuset. Needs to validate the request, update the 1175 * of a cpuset. Needs to validate the request, update the
1077 * cpusets mems_allowed, and for each task in the cpuset, 1176 * cpusets mems_allowed, and for each task in the cpuset,
@@ -1087,13 +1186,9 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1087static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, 1186static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1088 const char *buf) 1187 const char *buf)
1089{ 1188{
1090 NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
1091 int retval; 1189 int retval;
1092 struct ptr_heap heap; 1190 struct ptr_heap heap;
1093 1191
1094 if (!oldmem)
1095 return -ENOMEM;
1096
1097 /* 1192 /*
1098 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; 1193 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
1099 * it's read-only 1194 * it's read-only
@@ -1122,8 +1217,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1122 goto done; 1217 goto done;
1123 } 1218 }
1124 } 1219 }
1125 *oldmem = cs->mems_allowed; 1220
1126 if (nodes_equal(*oldmem, trialcs->mems_allowed)) { 1221 if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
1127 retval = 0; /* Too easy - nothing to do */ 1222 retval = 0; /* Too easy - nothing to do */
1128 goto done; 1223 goto done;
1129 } 1224 }
@@ -1139,11 +1234,10 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1139 cs->mems_allowed = trialcs->mems_allowed; 1234 cs->mems_allowed = trialcs->mems_allowed;
1140 mutex_unlock(&callback_mutex); 1235 mutex_unlock(&callback_mutex);
1141 1236
1142 update_tasks_nodemask(cs, oldmem, &heap); 1237 update_tasks_nodemask_hier(cs, true, &heap);
1143 1238
1144 heap_free(&heap); 1239 heap_free(&heap);
1145done: 1240done:
1146 NODEMASK_FREE(oldmem);
1147 return retval; 1241 return retval;
1148} 1242}
1149 1243
@@ -1372,8 +1466,13 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1372 1466
1373 mutex_lock(&cpuset_mutex); 1467 mutex_lock(&cpuset_mutex);
1374 1468
1469 /*
1470 * We allow moving tasks into an empty cpuset if the sane_behavior
1471 * flag is set.
1472 */
1375 ret = -ENOSPC; 1473 ret = -ENOSPC;
1376 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1474 if (!cgroup_sane_behavior(cgrp) &&
1475 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1377 goto out_unlock; 1476 goto out_unlock;
1378 1477
1379 cgroup_taskset_for_each(task, cgrp, tset) { 1478 cgroup_taskset_for_each(task, cgrp, tset) {
@@ -1422,8 +1521,7 @@ static cpumask_var_t cpus_attach;
1422 1521
1423static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1522static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1424{ 1523{
1425 /* static bufs protected by cpuset_mutex */ 1524 /* static buf protected by cpuset_mutex */
1426 static nodemask_t cpuset_attach_nodemask_from;
1427 static nodemask_t cpuset_attach_nodemask_to; 1525 static nodemask_t cpuset_attach_nodemask_to;
1428 struct mm_struct *mm; 1526 struct mm_struct *mm;
1429 struct task_struct *task; 1527 struct task_struct *task;
@@ -1431,6 +1529,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1431 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); 1529 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
1432 struct cpuset *cs = cgroup_cs(cgrp); 1530 struct cpuset *cs = cgroup_cs(cgrp);
1433 struct cpuset *oldcs = cgroup_cs(oldcgrp); 1531 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1532 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1533 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1434 1534
1435 mutex_lock(&cpuset_mutex); 1535 mutex_lock(&cpuset_mutex);
1436 1536
@@ -1438,9 +1538,9 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1438 if (cs == &top_cpuset) 1538 if (cs == &top_cpuset)
1439 cpumask_copy(cpus_attach, cpu_possible_mask); 1539 cpumask_copy(cpus_attach, cpu_possible_mask);
1440 else 1540 else
1441 guarantee_online_cpus(cs, cpus_attach); 1541 guarantee_online_cpus(cpus_cs, cpus_attach);
1442 1542
1443 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1543 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
1444 1544
1445 cgroup_taskset_for_each(task, cgrp, tset) { 1545 cgroup_taskset_for_each(task, cgrp, tset) {
1446 /* 1546 /*
@@ -1457,26 +1557,32 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1457 * Change mm, possibly for multiple threads in a threadgroup. This is 1557 * Change mm, possibly for multiple threads in a threadgroup. This is
1458 * expensive and may sleep. 1558 * expensive and may sleep.
1459 */ 1559 */
1460 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1461 cpuset_attach_nodemask_to = cs->mems_allowed; 1560 cpuset_attach_nodemask_to = cs->mems_allowed;
1462 mm = get_task_mm(leader); 1561 mm = get_task_mm(leader);
1463 if (mm) { 1562 if (mm) {
1563 struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
1564
1464 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1565 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1465 if (is_memory_migrate(cs)) 1566
1466 cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from, 1567 /*
 1568 * old_mems_allowed is the same as mems_allowed here, except
1569 * if this task is being moved automatically due to hotplug.
1570 * In that case @mems_allowed has been updated and is empty,
 1571 * so @old_mems_allowed is the right nodemask that we migrate
1572 * mm from.
1573 */
1574 if (is_memory_migrate(cs)) {
1575 cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
1467 &cpuset_attach_nodemask_to); 1576 &cpuset_attach_nodemask_to);
1577 }
1468 mmput(mm); 1578 mmput(mm);
1469 } 1579 }
1470 1580
1471 cs->attach_in_progress--; 1581 cs->old_mems_allowed = cpuset_attach_nodemask_to;
1472 1582
1473 /* 1583 cs->attach_in_progress--;
1474 * We may have raced with CPU/memory hotunplug. Trigger hotplug 1584 if (!cs->attach_in_progress)
1475 * propagation if @cs doesn't have any CPU or memory. It will move 1585 wake_up(&cpuset_attach_wq);
1476 * the newly added tasks to the nearest parent which can execute.
1477 */
1478 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1479 schedule_cpuset_propagate_hotplug(cs);
1480 1586
1481 mutex_unlock(&cpuset_mutex); 1587 mutex_unlock(&cpuset_mutex);
1482} 1588}
@@ -1588,13 +1694,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1588 * resources, wait for the previously scheduled operations before 1694 * resources, wait for the previously scheduled operations before
1589 * proceeding, so that we don't end up keep removing tasks added 1695 * proceeding, so that we don't end up keep removing tasks added
1590 * after execution capability is restored. 1696 * after execution capability is restored.
1591 *
1592 * Flushing cpuset_hotplug_work is enough to synchronize against
 1593 * hotplug handling; however, cpuset_attach() may schedule
1594 * propagation work directly. Flush the workqueue too.
1595 */ 1697 */
1596 flush_work(&cpuset_hotplug_work); 1698 flush_work(&cpuset_hotplug_work);
1597 flush_workqueue(cpuset_propagate_hotplug_wq);
1598 1699
1599 mutex_lock(&cpuset_mutex); 1700 mutex_lock(&cpuset_mutex);
1600 if (!is_cpuset_online(cs)) 1701 if (!is_cpuset_online(cs))
@@ -1658,13 +1759,13 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1658 return count; 1759 return count;
1659} 1760}
1660 1761
1661static ssize_t cpuset_common_file_read(struct cgroup *cont, 1762static ssize_t cpuset_common_file_read(struct cgroup *cgrp,
1662 struct cftype *cft, 1763 struct cftype *cft,
1663 struct file *file, 1764 struct file *file,
1664 char __user *buf, 1765 char __user *buf,
1665 size_t nbytes, loff_t *ppos) 1766 size_t nbytes, loff_t *ppos)
1666{ 1767{
1667 struct cpuset *cs = cgroup_cs(cont); 1768 struct cpuset *cs = cgroup_cs(cgrp);
1668 cpuset_filetype_t type = cft->private; 1769 cpuset_filetype_t type = cft->private;
1669 char *page; 1770 char *page;
1670 ssize_t retval = 0; 1771 ssize_t retval = 0;
@@ -1694,9 +1795,9 @@ out:
1694 return retval; 1795 return retval;
1695} 1796}
1696 1797
1697static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) 1798static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft)
1698{ 1799{
1699 struct cpuset *cs = cgroup_cs(cont); 1800 struct cpuset *cs = cgroup_cs(cgrp);
1700 cpuset_filetype_t type = cft->private; 1801 cpuset_filetype_t type = cft->private;
1701 switch (type) { 1802 switch (type) {
1702 case FILE_CPU_EXCLUSIVE: 1803 case FILE_CPU_EXCLUSIVE:
@@ -1725,9 +1826,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1725 return 0; 1826 return 0;
1726} 1827}
1727 1828
1728static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) 1829static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft)
1729{ 1830{
1730 struct cpuset *cs = cgroup_cs(cont); 1831 struct cpuset *cs = cgroup_cs(cgrp);
1731 cpuset_filetype_t type = cft->private; 1832 cpuset_filetype_t type = cft->private;
1732 switch (type) { 1833 switch (type) {
1733 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1834 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1839,14 +1940,14 @@ static struct cftype files[] = {
1839 1940
1840/* 1941/*
1841 * cpuset_css_alloc - allocate a cpuset css 1942 * cpuset_css_alloc - allocate a cpuset css
1842 * cont: control group that the new cpuset will be part of 1943 * cgrp: control group that the new cpuset will be part of
1843 */ 1944 */
1844 1945
1845static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) 1946static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp)
1846{ 1947{
1847 struct cpuset *cs; 1948 struct cpuset *cs;
1848 1949
1849 if (!cont->parent) 1950 if (!cgrp->parent)
1850 return &top_cpuset.css; 1951 return &top_cpuset.css;
1851 1952
1852 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 1953 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
@@ -1861,7 +1962,6 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1861 cpumask_clear(cs->cpus_allowed); 1962 cpumask_clear(cs->cpus_allowed);
1862 nodes_clear(cs->mems_allowed); 1963 nodes_clear(cs->mems_allowed);
1863 fmeter_init(&cs->fmeter); 1964 fmeter_init(&cs->fmeter);
1864 INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
1865 cs->relax_domain_level = -1; 1965 cs->relax_domain_level = -1;
1866 1966
1867 return &cs->css; 1967 return &cs->css;
@@ -1942,9 +2042,9 @@ static void cpuset_css_offline(struct cgroup *cgrp)
1942 * will call rebuild_sched_domains_locked(). 2042 * will call rebuild_sched_domains_locked().
1943 */ 2043 */
1944 2044
1945static void cpuset_css_free(struct cgroup *cont) 2045static void cpuset_css_free(struct cgroup *cgrp)
1946{ 2046{
1947 struct cpuset *cs = cgroup_cs(cont); 2047 struct cpuset *cs = cgroup_cs(cgrp);
1948 2048
1949 free_cpumask_var(cs->cpus_allowed); 2049 free_cpumask_var(cs->cpus_allowed);
1950 kfree(cs); 2050 kfree(cs);
@@ -2024,41 +2124,64 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2024} 2124}
2025 2125
2026/** 2126/**
2027 * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset 2127 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
2028 * @cs: cpuset in interest 2128 * @cs: cpuset in interest
2029 * 2129 *
2030 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone 2130 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
2031 * offline, update @cs accordingly. If @cs ends up with no CPU or memory, 2131 * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
2032 * all its tasks are moved to the nearest ancestor with both resources. 2132 * all its tasks are moved to the nearest ancestor with both resources.
2033 */ 2133 */
2034static void cpuset_propagate_hotplug_workfn(struct work_struct *work) 2134static void cpuset_hotplug_update_tasks(struct cpuset *cs)
2035{ 2135{
2036 static cpumask_t off_cpus; 2136 static cpumask_t off_cpus;
2037 static nodemask_t off_mems, tmp_mems; 2137 static nodemask_t off_mems;
2038 struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
2039 bool is_empty; 2138 bool is_empty;
2139 bool sane = cgroup_sane_behavior(cs->css.cgroup);
2140
2141retry:
2142 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
2040 2143
2041 mutex_lock(&cpuset_mutex); 2144 mutex_lock(&cpuset_mutex);
2042 2145
2146 /*
2147 * We have raced with task attaching. We wait until attaching
2148 * is finished, so we won't attach a task to an empty cpuset.
2149 */
2150 if (cs->attach_in_progress) {
2151 mutex_unlock(&cpuset_mutex);
2152 goto retry;
2153 }
2154
2043 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); 2155 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
2044 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); 2156 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
2045 2157
2046 /* remove offline cpus from @cs */ 2158 mutex_lock(&callback_mutex);
2047 if (!cpumask_empty(&off_cpus)) { 2159 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
2048 mutex_lock(&callback_mutex); 2160 mutex_unlock(&callback_mutex);
2049 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); 2161
2050 mutex_unlock(&callback_mutex); 2162 /*
2163 * If sane_behavior flag is set, we need to update tasks' cpumask
2164 * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
2165 * call update_tasks_cpumask() if the cpuset becomes empty, as
2166 * the tasks in it will be migrated to an ancestor.
2167 */
2168 if ((sane && cpumask_empty(cs->cpus_allowed)) ||
2169 (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
2051 update_tasks_cpumask(cs, NULL); 2170 update_tasks_cpumask(cs, NULL);
2052 }
2053 2171
2054 /* remove offline mems from @cs */ 2172 mutex_lock(&callback_mutex);
2055 if (!nodes_empty(off_mems)) { 2173 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
2056 tmp_mems = cs->mems_allowed; 2174 mutex_unlock(&callback_mutex);
2057 mutex_lock(&callback_mutex); 2175
2058 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); 2176 /*
2059 mutex_unlock(&callback_mutex); 2177 * If sane_behavior flag is set, we need to update tasks' nodemask
2060 update_tasks_nodemask(cs, &tmp_mems, NULL); 2178 * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
2061 } 2179 * call update_tasks_nodemask() if the cpuset becomes empty, as
 2180 * the tasks in it will be migrated to an ancestor.
2181 */
2182 if ((sane && nodes_empty(cs->mems_allowed)) ||
2183 (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
2184 update_tasks_nodemask(cs, NULL);
2062 2185
2063 is_empty = cpumask_empty(cs->cpus_allowed) || 2186 is_empty = cpumask_empty(cs->cpus_allowed) ||
2064 nodes_empty(cs->mems_allowed); 2187 nodes_empty(cs->mems_allowed);
@@ -2066,40 +2189,14 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
2066 mutex_unlock(&cpuset_mutex); 2189 mutex_unlock(&cpuset_mutex);
2067 2190
2068 /* 2191 /*
2069 * If @cs became empty, move tasks to the nearest ancestor with 2192 * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
2070 * execution resources. This is full cgroup operation which will 2193 *
2194 * Otherwise move tasks to the nearest ancestor with execution
2195 * resources. This is full cgroup operation which will
2071 * also call back into cpuset. Should be done outside any lock. 2196 * also call back into cpuset. Should be done outside any lock.
2072 */ 2197 */
2073 if (is_empty) 2198 if (!sane && is_empty)
2074 remove_tasks_in_empty_cpuset(cs); 2199 remove_tasks_in_empty_cpuset(cs);
2075
2076 /* the following may free @cs, should be the last operation */
2077 css_put(&cs->css);
2078}
2079
2080/**
2081 * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset
2082 * @cs: cpuset of interest
2083 *
2084 * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and
2085 * memory masks according to top_cpuset.
2086 */
2087static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
2088{
2089 /*
2090 * Pin @cs. The refcnt will be released when the work item
2091 * finishes executing.
2092 */
2093 if (!css_tryget(&cs->css))
2094 return;
2095
2096 /*
2097 * Queue @cs->hotplug_work. If already pending, lose the css ref.
2098 * cpuset_propagate_hotplug_wq is ordered and propagation will
2099 * happen in the order this function is called.
2100 */
2101 if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
2102 css_put(&cs->css);
2103} 2200}
2104 2201
2105/** 2202/**
@@ -2112,18 +2209,17 @@ static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
2112 * actively using CPU hotplug but making no active use of cpusets. 2209 * actively using CPU hotplug but making no active use of cpusets.
2113 * 2210 *
2114 * Non-root cpusets are only affected by offlining. If any CPUs or memory 2211 * Non-root cpusets are only affected by offlining. If any CPUs or memory
2115 * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all 2212 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
2116 * descendants. 2213 * all descendants.
2117 * 2214 *
2118 * Note that CPU offlining during suspend is ignored. We don't modify 2215 * Note that CPU offlining during suspend is ignored. We don't modify
2119 * cpusets across suspend/resume cycles at all. 2216 * cpusets across suspend/resume cycles at all.
2120 */ 2217 */
2121static void cpuset_hotplug_workfn(struct work_struct *work) 2218static void cpuset_hotplug_workfn(struct work_struct *work)
2122{ 2219{
2123 static cpumask_t new_cpus, tmp_cpus; 2220 static cpumask_t new_cpus;
2124 static nodemask_t new_mems, tmp_mems; 2221 static nodemask_t new_mems;
2125 bool cpus_updated, mems_updated; 2222 bool cpus_updated, mems_updated;
2126 bool cpus_offlined, mems_offlined;
2127 2223
2128 mutex_lock(&cpuset_mutex); 2224 mutex_lock(&cpuset_mutex);
2129 2225
@@ -2132,12 +2228,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2132 new_mems = node_states[N_MEMORY]; 2228 new_mems = node_states[N_MEMORY];
2133 2229
2134 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); 2230 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
2135 cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
2136 &new_cpus);
2137
2138 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); 2231 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
2139 nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
2140 mems_offlined = !nodes_empty(tmp_mems);
2141 2232
2142 /* synchronize cpus_allowed to cpu_active_mask */ 2233 /* synchronize cpus_allowed to cpu_active_mask */
2143 if (cpus_updated) { 2234 if (cpus_updated) {
@@ -2149,28 +2240,32 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2149 2240
2150 /* synchronize mems_allowed to N_MEMORY */ 2241 /* synchronize mems_allowed to N_MEMORY */
2151 if (mems_updated) { 2242 if (mems_updated) {
2152 tmp_mems = top_cpuset.mems_allowed;
2153 mutex_lock(&callback_mutex); 2243 mutex_lock(&callback_mutex);
2154 top_cpuset.mems_allowed = new_mems; 2244 top_cpuset.mems_allowed = new_mems;
2155 mutex_unlock(&callback_mutex); 2245 mutex_unlock(&callback_mutex);
2156 update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); 2246 update_tasks_nodemask(&top_cpuset, NULL);
2157 } 2247 }
2158 2248
2159 /* if cpus or mems went down, we need to propagate to descendants */ 2249 mutex_unlock(&cpuset_mutex);
2160 if (cpus_offlined || mems_offlined) { 2250
2251 /* if cpus or mems changed, we need to propagate to descendants */
2252 if (cpus_updated || mems_updated) {
2161 struct cpuset *cs; 2253 struct cpuset *cs;
2162 struct cgroup *pos_cgrp; 2254 struct cgroup *pos_cgrp;
2163 2255
2164 rcu_read_lock(); 2256 rcu_read_lock();
2165 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) 2257 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) {
2166 schedule_cpuset_propagate_hotplug(cs); 2258 if (!css_tryget(&cs->css))
2167 rcu_read_unlock(); 2259 continue;
2168 } 2260 rcu_read_unlock();
2169 2261
2170 mutex_unlock(&cpuset_mutex); 2262 cpuset_hotplug_update_tasks(cs);
2171 2263
2172 /* wait for propagations to finish */ 2264 rcu_read_lock();
2173 flush_workqueue(cpuset_propagate_hotplug_wq); 2265 css_put(&cs->css);
2266 }
2267 rcu_read_unlock();
2268 }
2174 2269
2175 /* rebuild sched domains if cpus_allowed has changed */ 2270 /* rebuild sched domains if cpus_allowed has changed */
2176 if (cpus_updated) 2271 if (cpus_updated)
@@ -2219,12 +2314,9 @@ void __init cpuset_init_smp(void)
2219{ 2314{
2220 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2315 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2221 top_cpuset.mems_allowed = node_states[N_MEMORY]; 2316 top_cpuset.mems_allowed = node_states[N_MEMORY];
2317 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
2222 2318
2223 register_hotmemory_notifier(&cpuset_track_online_nodes_nb); 2319 register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
2224
2225 cpuset_propagate_hotplug_wq =
2226 alloc_ordered_workqueue("cpuset_hotplug", 0);
2227 BUG_ON(!cpuset_propagate_hotplug_wq);
2228} 2320}
2229 2321
2230/** 2322/**
@@ -2240,21 +2332,23 @@ void __init cpuset_init_smp(void)
2240 2332
2241void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2333void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2242{ 2334{
2335 struct cpuset *cpus_cs;
2336
2243 mutex_lock(&callback_mutex); 2337 mutex_lock(&callback_mutex);
2244 task_lock(tsk); 2338 task_lock(tsk);
2245 guarantee_online_cpus(task_cs(tsk), pmask); 2339 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
2340 guarantee_online_cpus(cpus_cs, pmask);
2246 task_unlock(tsk); 2341 task_unlock(tsk);
2247 mutex_unlock(&callback_mutex); 2342 mutex_unlock(&callback_mutex);
2248} 2343}
2249 2344
2250void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2345void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2251{ 2346{
2252 const struct cpuset *cs; 2347 const struct cpuset *cpus_cs;
2253 2348
2254 rcu_read_lock(); 2349 rcu_read_lock();
2255 cs = task_cs(tsk); 2350 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
2256 if (cs) 2351 do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
2257 do_set_cpus_allowed(tsk, cs->cpus_allowed);
2258 rcu_read_unlock(); 2352 rcu_read_unlock();
2259 2353
2260 /* 2354 /*
@@ -2293,11 +2387,13 @@ void cpuset_init_current_mems_allowed(void)
2293 2387
2294nodemask_t cpuset_mems_allowed(struct task_struct *tsk) 2388nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2295{ 2389{
2390 struct cpuset *mems_cs;
2296 nodemask_t mask; 2391 nodemask_t mask;
2297 2392
2298 mutex_lock(&callback_mutex); 2393 mutex_lock(&callback_mutex);
2299 task_lock(tsk); 2394 task_lock(tsk);
2300 guarantee_online_mems(task_cs(tsk), &mask); 2395 mems_cs = effective_nodemask_cpuset(task_cs(tsk));
2396 guarantee_online_mems(mems_cs, &mask);
2301 task_unlock(tsk); 2397 task_unlock(tsk);
2302 mutex_unlock(&callback_mutex); 2398 mutex_unlock(&callback_mutex);
2303 2399
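
[Editorial note] The cpuset hotplug path above trims cs->cpus_allowed and cs->mems_allowed by and-not'ing them against top_cpuset before deciding whether the cpuset has gone empty. The following is a userspace-flavoured sketch of that arithmetic using plain word-sized bitmasks rather than the kernel's cpumask/nodemask helpers; the helper name, mask width and the example CPU numbers are illustrative assumptions, not kernel code.

    #include <stdio.h>

    /* Toy stand-in for cpumask_andnot()/nodes_andnot(): dst = src & ~mask.
     * The real helpers operate on arbitrarily wide bitmaps; this only mirrors
     * the arithmetic the hotplug update performs under callback_mutex. */
    static unsigned long andnot(unsigned long src, unsigned long mask)
    {
            return src & ~mask;
    }

    int main(void)
    {
            unsigned long cs_cpus  = 0xF0; /* cpuset allows CPUs 4-7          */
            unsigned long top_cpus = 0x3F; /* CPUs 0-5 online: 6-7 went away  */

            /* off_cpus = cs->cpus_allowed & ~top_cpuset.cpus_allowed */
            unsigned long off_cpus = andnot(cs_cpus, top_cpus);
            /* cs->cpus_allowed &= ~off_cpus */
            cs_cpus = andnot(cs_cpus, off_cpus);

            printf("went offline: %#lx, still allowed: %#lx\n", off_cpus, cs_cpus);
            return 0;
    }

With sane_behavior the cpuset is allowed to end up with an empty mask (tasks fall back to an ancestor's effective mask); without it, an empty result triggers remove_tasks_in_empty_cpuset() as shown in the hunk above.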
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b391907d5352..f86599e8c123 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -165,10 +165,28 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'
165/* 165/*
166 * max perf event sample rate 166 * max perf event sample rate
167 */ 167 */
168#define DEFAULT_MAX_SAMPLE_RATE 100000 168#define DEFAULT_MAX_SAMPLE_RATE 100000
169int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; 169#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
170static int max_samples_per_tick __read_mostly = 170#define DEFAULT_CPU_TIME_MAX_PERCENT 25
171 DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); 171
172int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
173
174static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
175static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
176
177static atomic_t perf_sample_allowed_ns __read_mostly =
178 ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
179
180void update_perf_cpu_limits(void)
181{
182 u64 tmp = perf_sample_period_ns;
183
184 tmp *= sysctl_perf_cpu_time_max_percent;
185 do_div(tmp, 100);
186 atomic_set(&perf_sample_allowed_ns, tmp);
187}
188
189static int perf_rotate_context(struct perf_cpu_context *cpuctx);
172 190
173int perf_proc_update_handler(struct ctl_table *table, int write, 191int perf_proc_update_handler(struct ctl_table *table, int write,
174 void __user *buffer, size_t *lenp, 192 void __user *buffer, size_t *lenp,
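
[Editorial note] update_perf_cpu_limits() above derives a per-sample NMI time budget from the sample period and the cpu_time_max_percent knob; with the defaults introduced in this hunk (100000 samples/sec, 25%) that works out to a 10,000 ns period and a 2,500 ns budget. A minimal userspace check of that arithmetic follows; NSEC_PER_SEC is spelled out by hand since it is a kernel constant, and the macro names simply echo the defines above.

    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC         1000000000ULL
    #define MAX_SAMPLE_RATE      100000   /* DEFAULT_MAX_SAMPLE_RATE      */
    #define CPU_TIME_MAX_PERCENT 25       /* DEFAULT_CPU_TIME_MAX_PERCENT */

    int main(void)
    {
            uint64_t period_ns  = NSEC_PER_SEC / MAX_SAMPLE_RATE;          /* 10000 ns */
            uint64_t allowed_ns = period_ns * CPU_TIME_MAX_PERCENT / 100;  /*  2500 ns */

            printf("sample period %llu ns, per-sample budget %llu ns\n",
                   (unsigned long long)period_ns, (unsigned long long)allowed_ns);
            return 0;
    }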
@@ -180,10 +198,78 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
180 return ret; 198 return ret;
181 199
182 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); 200 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
201 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
202 update_perf_cpu_limits();
203
204 return 0;
205}
206
207int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
208
209int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
210 void __user *buffer, size_t *lenp,
211 loff_t *ppos)
212{
213 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
214
215 if (ret || !write)
216 return ret;
217
218 update_perf_cpu_limits();
183 219
184 return 0; 220 return 0;
185} 221}
186 222
223/*
224 * perf samples are done in some very critical code paths (NMIs).
225 * If they take too much CPU time, the system can lock up and not
226 * get any real work done. This will drop the sample rate when
227 * we detect that events are taking too long.
228 */
229#define NR_ACCUMULATED_SAMPLES 128
230DEFINE_PER_CPU(u64, running_sample_length);
231
232void perf_sample_event_took(u64 sample_len_ns)
233{
234 u64 avg_local_sample_len;
235 u64 local_samples_len;
236
237 if (atomic_read(&perf_sample_allowed_ns) == 0)
238 return;
239
240 /* decay the counter by 1 average sample */
241 local_samples_len = __get_cpu_var(running_sample_length);
242 local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
243 local_samples_len += sample_len_ns;
244 __get_cpu_var(running_sample_length) = local_samples_len;
245
246 /*
 247 * note: this will be biased artificially low until we have
248 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
249 * from having to maintain a count.
250 */
251 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
252
253 if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
254 return;
255
256 if (max_samples_per_tick <= 1)
257 return;
258
259 max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
260 sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
261 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
262
263 printk_ratelimited(KERN_WARNING
264 "perf samples too long (%lld > %d), lowering "
265 "kernel.perf_event_max_sample_rate to %d\n",
266 avg_local_sample_len,
267 atomic_read(&perf_sample_allowed_ns),
268 sysctl_perf_event_sample_rate);
269
270 update_perf_cpu_limits();
271}
272
187static atomic64_t perf_event_id; 273static atomic64_t perf_event_id;
188 274
189static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, 275static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
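
[Editorial note] perf_sample_event_took() above keeps a per-CPU running total that is decayed by one Nth on every update, which approximates an average over the last NR_ACCUMULATED_SAMPLES samples without keeping a separate count. The sketch below reproduces just that estimator in standalone C with a synthetic, fixed-length sample stream instead of real NMI timings.

    #include <stdio.h>
    #include <stdint.h>

    #define NR_ACCUMULATED_SAMPLES 128

    /* Same update rule as perf_sample_event_took(): decay the accumulator by
     * one average sample, add the new sample, read the average as total/N. */
    static uint64_t running;

    static uint64_t record_sample(uint64_t len_ns)
    {
            running -= running / NR_ACCUMULATED_SAMPLES;
            running += len_ns;
            return running / NR_ACCUMULATED_SAMPLES;  /* avg_local_sample_len */
    }

    int main(void)
    {
            uint64_t avg = 0;

            /* Feed 1000 samples of 3000 ns each; the estimate converges
             * towards 3000 ns from below (the "biased artificially low"
             * note in the hunk above). */
            for (int i = 0; i < 1000; i++)
                    avg = record_sample(3000);

            printf("estimated average sample length: %llu ns\n",
                   (unsigned long long)avg);
            return 0;
    }

Once the estimate exceeds perf_sample_allowed_ns, the handler above halves max_samples_per_tick and recomputes the sysctl-visible sample rate.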
@@ -655,6 +741,106 @@ perf_cgroup_mark_enabled(struct perf_event *event,
655} 741}
656#endif 742#endif
657 743
744/*
745 * set default to be dependent on timer tick just
746 * like original code
747 */
748#define PERF_CPU_HRTIMER (1000 / HZ)
749/*
 750 * function must be called with interrupts disabled
751 */
752static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
753{
754 struct perf_cpu_context *cpuctx;
755 enum hrtimer_restart ret = HRTIMER_NORESTART;
756 int rotations = 0;
757
758 WARN_ON(!irqs_disabled());
759
760 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
761
762 rotations = perf_rotate_context(cpuctx);
763
764 /*
765 * arm timer if needed
766 */
767 if (rotations) {
768 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
769 ret = HRTIMER_RESTART;
770 }
771
772 return ret;
773}
774
775/* CPU is going down */
776void perf_cpu_hrtimer_cancel(int cpu)
777{
778 struct perf_cpu_context *cpuctx;
779 struct pmu *pmu;
780 unsigned long flags;
781
782 if (WARN_ON(cpu != smp_processor_id()))
783 return;
784
785 local_irq_save(flags);
786
787 rcu_read_lock();
788
789 list_for_each_entry_rcu(pmu, &pmus, entry) {
790 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
791
792 if (pmu->task_ctx_nr == perf_sw_context)
793 continue;
794
795 hrtimer_cancel(&cpuctx->hrtimer);
796 }
797
798 rcu_read_unlock();
799
800 local_irq_restore(flags);
801}
802
803static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
804{
805 struct hrtimer *hr = &cpuctx->hrtimer;
806 struct pmu *pmu = cpuctx->ctx.pmu;
807 int timer;
808
809 /* no multiplexing needed for SW PMU */
810 if (pmu->task_ctx_nr == perf_sw_context)
811 return;
812
813 /*
814 * check default is sane, if not set then force to
815 * default interval (1/tick)
816 */
817 timer = pmu->hrtimer_interval_ms;
818 if (timer < 1)
819 timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
820
821 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
822
823 hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
824 hr->function = perf_cpu_hrtimer_handler;
825}
826
827static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
828{
829 struct hrtimer *hr = &cpuctx->hrtimer;
830 struct pmu *pmu = cpuctx->ctx.pmu;
831
832 /* not for SW PMU */
833 if (pmu->task_ctx_nr == perf_sw_context)
834 return;
835
836 if (hrtimer_active(hr))
837 return;
838
839 if (!hrtimer_callback_running(hr))
840 __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
841 0, HRTIMER_MODE_REL_PINNED, 0);
842}
843
658void perf_pmu_disable(struct pmu *pmu) 844void perf_pmu_disable(struct pmu *pmu)
659{ 845{
660 int *count = this_cpu_ptr(pmu->pmu_disable_count); 846 int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -761,8 +947,18 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
761{ 947{
762 struct perf_event_context *ctx; 948 struct perf_event_context *ctx;
763 949
764 rcu_read_lock();
765retry: 950retry:
951 /*
952 * One of the few rules of preemptible RCU is that one cannot do
953 * rcu_read_unlock() while holding a scheduler (or nested) lock when
954 * part of the read side critical section was preemptible -- see
955 * rcu_read_unlock_special().
956 *
957 * Since ctx->lock nests under rq->lock we must ensure the entire read
958 * side critical section is non-preemptible.
959 */
960 preempt_disable();
961 rcu_read_lock();
766 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); 962 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
767 if (ctx) { 963 if (ctx) {
768 /* 964 /*
@@ -778,6 +974,8 @@ retry:
778 raw_spin_lock_irqsave(&ctx->lock, *flags); 974 raw_spin_lock_irqsave(&ctx->lock, *flags);
779 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { 975 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
780 raw_spin_unlock_irqrestore(&ctx->lock, *flags); 976 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
977 rcu_read_unlock();
978 preempt_enable();
781 goto retry; 979 goto retry;
782 } 980 }
783 981
@@ -787,6 +985,7 @@ retry:
787 } 985 }
788 } 986 }
789 rcu_read_unlock(); 987 rcu_read_unlock();
988 preempt_enable();
790 return ctx; 989 return ctx;
791} 990}
792 991
@@ -1503,6 +1702,7 @@ group_sched_in(struct perf_event *group_event,
1503 1702
1504 if (event_sched_in(group_event, cpuctx, ctx)) { 1703 if (event_sched_in(group_event, cpuctx, ctx)) {
1505 pmu->cancel_txn(pmu); 1704 pmu->cancel_txn(pmu);
1705 perf_cpu_hrtimer_restart(cpuctx);
1506 return -EAGAIN; 1706 return -EAGAIN;
1507 } 1707 }
1508 1708
@@ -1549,6 +1749,8 @@ group_error:
1549 1749
1550 pmu->cancel_txn(pmu); 1750 pmu->cancel_txn(pmu);
1551 1751
1752 perf_cpu_hrtimer_restart(cpuctx);
1753
1552 return -EAGAIN; 1754 return -EAGAIN;
1553} 1755}
1554 1756
@@ -1761,7 +1963,16 @@ static int __perf_event_enable(void *info)
1761 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1963 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1762 int err; 1964 int err;
1763 1965
1764 if (WARN_ON_ONCE(!ctx->is_active)) 1966 /*
1967 * There's a time window between 'ctx->is_active' check
1968 * in perf_event_enable function and this place having:
1969 * - IRQs on
1970 * - ctx->lock unlocked
1971 *
1972 * where the task could be killed and 'ctx' deactivated
1973 * by perf_event_exit_task.
1974 */
1975 if (!ctx->is_active)
1765 return -EINVAL; 1976 return -EINVAL;
1766 1977
1767 raw_spin_lock(&ctx->lock); 1978 raw_spin_lock(&ctx->lock);
@@ -1804,8 +2015,10 @@ static int __perf_event_enable(void *info)
1804 * If this event can't go on and it's part of a 2015 * If this event can't go on and it's part of a
1805 * group, then the whole group has to come off. 2016 * group, then the whole group has to come off.
1806 */ 2017 */
1807 if (leader != event) 2018 if (leader != event) {
1808 group_sched_out(leader, cpuctx, ctx); 2019 group_sched_out(leader, cpuctx, ctx);
2020 perf_cpu_hrtimer_restart(cpuctx);
2021 }
1809 if (leader->attr.pinned) { 2022 if (leader->attr.pinned) {
1810 update_group_times(leader); 2023 update_group_times(leader);
1811 leader->state = PERF_EVENT_STATE_ERROR; 2024 leader->state = PERF_EVENT_STATE_ERROR;
@@ -2552,7 +2765,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
2552 * because they're strictly cpu affine and rotate_start is called with IRQs 2765 * because they're strictly cpu affine and rotate_start is called with IRQs
2553 * disabled, while rotate_context is called from IRQ context. 2766 * disabled, while rotate_context is called from IRQ context.
2554 */ 2767 */
2555static void perf_rotate_context(struct perf_cpu_context *cpuctx) 2768static int perf_rotate_context(struct perf_cpu_context *cpuctx)
2556{ 2769{
2557 struct perf_event_context *ctx = NULL; 2770 struct perf_event_context *ctx = NULL;
2558 int rotate = 0, remove = 1; 2771 int rotate = 0, remove = 1;
@@ -2591,6 +2804,8 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2591done: 2804done:
2592 if (remove) 2805 if (remove)
2593 list_del_init(&cpuctx->rotation_list); 2806 list_del_init(&cpuctx->rotation_list);
2807
2808 return rotate;
2594} 2809}
2595 2810
2596#ifdef CONFIG_NO_HZ_FULL 2811#ifdef CONFIG_NO_HZ_FULL
@@ -2622,10 +2837,6 @@ void perf_event_task_tick(void)
2622 ctx = cpuctx->task_ctx; 2837 ctx = cpuctx->task_ctx;
2623 if (ctx) 2838 if (ctx)
2624 perf_adjust_freq_unthr_context(ctx, throttled); 2839 perf_adjust_freq_unthr_context(ctx, throttled);
2625
2626 if (cpuctx->jiffies_interval == 1 ||
2627 !(jiffies % cpuctx->jiffies_interval))
2628 perf_rotate_context(cpuctx);
2629 } 2840 }
2630} 2841}
2631 2842
@@ -5036,7 +5247,7 @@ static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
5036 * sign as trigger. 5247 * sign as trigger.
5037 */ 5248 */
5038 5249
5039static u64 perf_swevent_set_period(struct perf_event *event) 5250u64 perf_swevent_set_period(struct perf_event *event)
5040{ 5251{
5041 struct hw_perf_event *hwc = &event->hw; 5252 struct hw_perf_event *hwc = &event->hw;
5042 u64 period = hwc->last_period; 5253 u64 period = hwc->last_period;
@@ -5979,9 +6190,54 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
5979 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); 6190 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5980} 6191}
5981 6192
6193static ssize_t
6194perf_event_mux_interval_ms_show(struct device *dev,
6195 struct device_attribute *attr,
6196 char *page)
6197{
6198 struct pmu *pmu = dev_get_drvdata(dev);
6199
6200 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
6201}
6202
6203static ssize_t
6204perf_event_mux_interval_ms_store(struct device *dev,
6205 struct device_attribute *attr,
6206 const char *buf, size_t count)
6207{
6208 struct pmu *pmu = dev_get_drvdata(dev);
6209 int timer, cpu, ret;
6210
6211 ret = kstrtoint(buf, 0, &timer);
6212 if (ret)
6213 return ret;
6214
6215 if (timer < 1)
6216 return -EINVAL;
6217
 6218 /* same value, nothing to do */
6219 if (timer == pmu->hrtimer_interval_ms)
6220 return count;
6221
6222 pmu->hrtimer_interval_ms = timer;
6223
6224 /* update all cpuctx for this PMU */
6225 for_each_possible_cpu(cpu) {
6226 struct perf_cpu_context *cpuctx;
6227 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6228 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
6229
6230 if (hrtimer_active(&cpuctx->hrtimer))
6231 hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
6232 }
6233
6234 return count;
6235}
6236
5982static struct device_attribute pmu_dev_attrs[] = { 6237static struct device_attribute pmu_dev_attrs[] = {
5983 __ATTR_RO(type), 6238 __ATTR_RO(type),
5984 __ATTR_NULL, 6239 __ATTR_RW(perf_event_mux_interval_ms),
6240 __ATTR_NULL,
5985}; 6241};
5986 6242
5987static int pmu_bus_running; 6243static int pmu_bus_running;
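
[Editorial note] perf_event_mux_interval_ms_store() above makes the multiplexing hrtimer interval tunable per PMU from sysfs. Assuming the attribute appears under the usual event_source bus directory (the exact path below is an assumption of this note, not something stated in the patch), a tiny C helper to change it could look like the following sketch; "cpu" stands in for whichever PMU device is of interest.

    #include <stdio.h>

    /* Hypothetical path: PMU device attributes are normally exposed under
     * /sys/bus/event_source/devices/<pmu>/. */
    #define MUX_ATTR "/sys/bus/event_source/devices/cpu/perf_event_mux_interval_ms"

    static int set_mux_interval_ms(int ms)
    {
            FILE *f = fopen(MUX_ATTR, "w");

            if (!f)
                    return -1;
            /* The store handler above rejects values < 1 with -EINVAL. */
            fprintf(f, "%d\n", ms);
            return fclose(f);
    }

    int main(void)
    {
            if (set_mux_interval_ms(4))
                    perror("set_mux_interval_ms");
            return 0;
    }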
@@ -6027,7 +6283,7 @@ free_dev:
6027static struct lock_class_key cpuctx_mutex; 6283static struct lock_class_key cpuctx_mutex;
6028static struct lock_class_key cpuctx_lock; 6284static struct lock_class_key cpuctx_lock;
6029 6285
6030int perf_pmu_register(struct pmu *pmu, char *name, int type) 6286int perf_pmu_register(struct pmu *pmu, const char *name, int type)
6031{ 6287{
6032 int cpu, ret; 6288 int cpu, ret;
6033 6289
@@ -6076,7 +6332,9 @@ skip_type:
6076 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); 6332 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
6077 cpuctx->ctx.type = cpu_context; 6333 cpuctx->ctx.type = cpu_context;
6078 cpuctx->ctx.pmu = pmu; 6334 cpuctx->ctx.pmu = pmu;
6079 cpuctx->jiffies_interval = 1; 6335
6336 __perf_cpu_hrtimer_init(cpuctx, cpu);
6337
6080 INIT_LIST_HEAD(&cpuctx->rotation_list); 6338 INIT_LIST_HEAD(&cpuctx->rotation_list);
6081 cpuctx->unique_pmu = pmu; 6339 cpuctx->unique_pmu = pmu;
6082 } 6340 }
@@ -6402,11 +6660,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6402 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) 6660 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
6403 return -EINVAL; 6661 return -EINVAL;
6404 6662
6405 /* kernel level capture: check permissions */
6406 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
6407 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6408 return -EACCES;
6409
6410 /* propagate priv level, when not set for branch */ 6663 /* propagate priv level, when not set for branch */
6411 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { 6664 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
6412 6665
@@ -6424,6 +6677,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6424 */ 6677 */
6425 attr->branch_sample_type = mask; 6678 attr->branch_sample_type = mask;
6426 } 6679 }
6680 /* privileged levels capture (kernel, hv): check permissions */
6681 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
6682 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6683 return -EACCES;
6427 } 6684 }
6428 6685
6429 if (attr->sample_type & PERF_SAMPLE_REGS_USER) { 6686 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
@@ -7228,7 +7485,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
7228 * child. 7485 * child.
7229 */ 7486 */
7230 7487
7231 child_ctx = alloc_perf_context(event->pmu, child); 7488 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
7232 if (!child_ctx) 7489 if (!child_ctx)
7233 return -ENOMEM; 7490 return -ENOMEM;
7234 7491
@@ -7371,7 +7628,7 @@ static void __init perf_event_init_all_cpus(void)
7371 } 7628 }
7372} 7629}
7373 7630
7374static void __cpuinit perf_event_init_cpu(int cpu) 7631static void perf_event_init_cpu(int cpu)
7375{ 7632{
7376 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 7633 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7377 7634
@@ -7460,7 +7717,7 @@ static struct notifier_block perf_reboot_notifier = {
7460 .priority = INT_MIN, 7717 .priority = INT_MIN,
7461}; 7718};
7462 7719
7463static int __cpuinit 7720static int
7464perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) 7721perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
7465{ 7722{
7466 unsigned int cpu = (long)hcpu; 7723 unsigned int cpu = (long)hcpu;
@@ -7476,7 +7733,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
7476 case CPU_DOWN_PREPARE: 7733 case CPU_DOWN_PREPARE:
7477 perf_event_exit_cpu(cpu); 7734 perf_event_exit_cpu(cpu);
7478 break; 7735 break;
7479
7480 default: 7736 default:
7481 break; 7737 break;
7482 } 7738 }
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 20185ea64aa6..1559fb0b9296 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -46,23 +46,26 @@
46#include <linux/smp.h> 46#include <linux/smp.h>
47 47
48#include <linux/hw_breakpoint.h> 48#include <linux/hw_breakpoint.h>
49
50
51/* 49/*
52 * Constraints data 50 * Constraints data
53 */ 51 */
52struct bp_cpuinfo {
53 /* Number of pinned cpu breakpoints in a cpu */
54 unsigned int cpu_pinned;
55 /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
56 unsigned int *tsk_pinned;
57 /* Number of non-pinned cpu/task breakpoints in a cpu */
58 unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */
59};
54 60
55/* Number of pinned cpu breakpoints in a cpu */ 61static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
56static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
57
58/* Number of pinned task breakpoints in a cpu */
59static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]);
60
61/* Number of non-pinned cpu/task breakpoints in a cpu */
62static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
63
64static int nr_slots[TYPE_MAX]; 62static int nr_slots[TYPE_MAX];
65 63
64static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
65{
66 return per_cpu_ptr(bp_cpuinfo + type, cpu);
67}
68
66/* Keep track of the breakpoints attached to tasks */ 69/* Keep track of the breakpoints attached to tasks */
67static LIST_HEAD(bp_task_head); 70static LIST_HEAD(bp_task_head);
68 71
@@ -96,8 +99,8 @@ static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
96 */ 99 */
97static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) 100static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
98{ 101{
102 unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
99 int i; 103 int i;
100 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
101 104
102 for (i = nr_slots[type] - 1; i >= 0; i--) { 105 for (i = nr_slots[type] - 1; i >= 0; i--) {
103 if (tsk_pinned[i] > 0) 106 if (tsk_pinned[i] > 0)
@@ -127,6 +130,13 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
127 return count; 130 return count;
128} 131}
129 132
133static const struct cpumask *cpumask_of_bp(struct perf_event *bp)
134{
135 if (bp->cpu >= 0)
136 return cpumask_of(bp->cpu);
137 return cpu_possible_mask;
138}
139
130/* 140/*
131 * Report the number of pinned/un-pinned breakpoints we have in 141 * Report the number of pinned/un-pinned breakpoints we have in
132 * a given cpu (cpu > -1) or in all of them (cpu = -1). 142 * a given cpu (cpu > -1) or in all of them (cpu = -1).
@@ -135,25 +145,15 @@ static void
135fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, 145fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
136 enum bp_type_idx type) 146 enum bp_type_idx type)
137{ 147{
138 int cpu = bp->cpu; 148 const struct cpumask *cpumask = cpumask_of_bp(bp);
139 struct task_struct *tsk = bp->hw.bp_target; 149 int cpu;
140
141 if (cpu >= 0) {
142 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
143 if (!tsk)
144 slots->pinned += max_task_bp_pinned(cpu, type);
145 else
146 slots->pinned += task_bp_pinned(cpu, bp, type);
147 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
148
149 return;
150 }
151 150
152 for_each_possible_cpu(cpu) { 151 for_each_cpu(cpu, cpumask) {
153 unsigned int nr; 152 struct bp_cpuinfo *info = get_bp_info(cpu, type);
153 int nr;
154 154
155 nr = per_cpu(nr_cpu_bp_pinned[type], cpu); 155 nr = info->cpu_pinned;
156 if (!tsk) 156 if (!bp->hw.bp_target)
157 nr += max_task_bp_pinned(cpu, type); 157 nr += max_task_bp_pinned(cpu, type);
158 else 158 else
159 nr += task_bp_pinned(cpu, bp, type); 159 nr += task_bp_pinned(cpu, bp, type);
@@ -161,8 +161,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
161 if (nr > slots->pinned) 161 if (nr > slots->pinned)
162 slots->pinned = nr; 162 slots->pinned = nr;
163 163
164 nr = per_cpu(nr_bp_flexible[type], cpu); 164 nr = info->flexible;
165
166 if (nr > slots->flexible) 165 if (nr > slots->flexible)
167 slots->flexible = nr; 166 slots->flexible = nr;
168 } 167 }
@@ -182,29 +181,19 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight)
182/* 181/*
183 * Add a pinned breakpoint for the given task in our constraint table 182 * Add a pinned breakpoint for the given task in our constraint table
184 */ 183 */
185static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, 184static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
186 enum bp_type_idx type, int weight) 185 enum bp_type_idx type, int weight)
187{ 186{
188 unsigned int *tsk_pinned; 187 unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
189 int old_count = 0; 188 int old_idx, new_idx;
190 int old_idx = 0; 189
191 int idx = 0; 190 old_idx = task_bp_pinned(cpu, bp, type) - 1;
192 191 new_idx = old_idx + weight;
193 old_count = task_bp_pinned(cpu, bp, type); 192
194 old_idx = old_count - 1; 193 if (old_idx >= 0)
195 idx = old_idx + weight; 194 tsk_pinned[old_idx]--;
196 195 if (new_idx >= 0)
197 /* tsk_pinned[n] is the number of tasks having n breakpoints */ 196 tsk_pinned[new_idx]++;
198 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
199 if (enable) {
200 tsk_pinned[idx]++;
201 if (old_count > 0)
202 tsk_pinned[old_idx]--;
203 } else {
204 tsk_pinned[idx]--;
205 if (old_count > 0)
206 tsk_pinned[old_idx]++;
207 }
208} 197}
209 198
210/* 199/*
@@ -214,33 +203,26 @@ static void
214toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, 203toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
215 int weight) 204 int weight)
216{ 205{
217 int cpu = bp->cpu; 206 const struct cpumask *cpumask = cpumask_of_bp(bp);
218 struct task_struct *tsk = bp->hw.bp_target; 207 int cpu;
219 208
220 /* Pinned counter cpu profiling */ 209 if (!enable)
221 if (!tsk) { 210 weight = -weight;
222 211
223 if (enable) 212 /* Pinned counter cpu profiling */
224 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; 213 if (!bp->hw.bp_target) {
225 else 214 get_bp_info(bp->cpu, type)->cpu_pinned += weight;
226 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
227 return; 215 return;
228 } 216 }
229 217
230 /* Pinned counter task profiling */ 218 /* Pinned counter task profiling */
231 219 for_each_cpu(cpu, cpumask)
232 if (!enable) 220 toggle_bp_task_slot(bp, cpu, type, weight);
233 list_del(&bp->hw.bp_list);
234
235 if (cpu >= 0) {
236 toggle_bp_task_slot(bp, cpu, enable, type, weight);
237 } else {
238 for_each_possible_cpu(cpu)
239 toggle_bp_task_slot(bp, cpu, enable, type, weight);
240 }
241 221
242 if (enable) 222 if (enable)
243 list_add_tail(&bp->hw.bp_list, &bp_task_head); 223 list_add_tail(&bp->hw.bp_list, &bp_task_head);
224 else
225 list_del(&bp->hw.bp_list);
244} 226}
245 227
246/* 228/*
@@ -261,8 +243,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
261 * 243 *
262 * - If attached to a single cpu, check: 244 * - If attached to a single cpu, check:
263 * 245 *
264 * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) 246 * (per_cpu(info->flexible, cpu) || (per_cpu(info->cpu_pinned, cpu)
265 * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM 247 * + max(per_cpu(info->tsk_pinned, cpu)))) < HBP_NUM
266 * 248 *
267 * -> If there are already non-pinned counters in this cpu, it means 249 * -> If there are already non-pinned counters in this cpu, it means
268 * there is already a free slot for them. 250 * there is already a free slot for them.
@@ -272,8 +254,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
272 * 254 *
273 * - If attached to every cpus, check: 255 * - If attached to every cpus, check:
274 * 256 *
275 * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) 257 * (per_cpu(info->flexible, *) || (max(per_cpu(info->cpu_pinned, *))
276 * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM 258 * + max(per_cpu(info->tsk_pinned, *)))) < HBP_NUM
277 * 259 *
278 * -> This is roughly the same, except we check the number of per cpu 260 * -> This is roughly the same, except we check the number of per cpu
279 * bp for every cpu and we keep the max one. Same for the per tasks 261 * bp for every cpu and we keep the max one. Same for the per tasks
@@ -284,16 +266,16 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
284 * 266 *
285 * - If attached to a single cpu, check: 267 * - If attached to a single cpu, check:
286 * 268 *
287 * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) 269 * ((per_cpu(info->flexible, cpu) > 1) + per_cpu(info->cpu_pinned, cpu)
288 * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM 270 * + max(per_cpu(info->tsk_pinned, cpu))) < HBP_NUM
289 * 271 *
290 * -> Same checks as before. But now the nr_bp_flexible, if any, must keep 272 * -> Same checks as before. But now the info->flexible, if any, must keep
291 * one register at least (or they will never be fed). 273 * one register at least (or they will never be fed).
292 * 274 *
293 * - If attached to every cpus, check: 275 * - If attached to every cpus, check:
294 * 276 *
295 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) 277 * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *))
296 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM 278 * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM
297 */ 279 */
298static int __reserve_bp_slot(struct perf_event *bp) 280static int __reserve_bp_slot(struct perf_event *bp)
299{ 281{
@@ -518,8 +500,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
518 perf_overflow_handler_t triggered, 500 perf_overflow_handler_t triggered,
519 void *context) 501 void *context)
520{ 502{
521 struct perf_event * __percpu *cpu_events, **pevent, *bp; 503 struct perf_event * __percpu *cpu_events, *bp;
522 long err; 504 long err = 0;
523 int cpu; 505 int cpu;
524 506
525 cpu_events = alloc_percpu(typeof(*cpu_events)); 507 cpu_events = alloc_percpu(typeof(*cpu_events));
@@ -528,31 +510,21 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
528 510
529 get_online_cpus(); 511 get_online_cpus();
530 for_each_online_cpu(cpu) { 512 for_each_online_cpu(cpu) {
531 pevent = per_cpu_ptr(cpu_events, cpu);
532 bp = perf_event_create_kernel_counter(attr, cpu, NULL, 513 bp = perf_event_create_kernel_counter(attr, cpu, NULL,
533 triggered, context); 514 triggered, context);
534
535 *pevent = bp;
536
537 if (IS_ERR(bp)) { 515 if (IS_ERR(bp)) {
538 err = PTR_ERR(bp); 516 err = PTR_ERR(bp);
539 goto fail; 517 break;
540 } 518 }
541 }
542 put_online_cpus();
543 519
544 return cpu_events; 520 per_cpu(*cpu_events, cpu) = bp;
545
546fail:
547 for_each_online_cpu(cpu) {
548 pevent = per_cpu_ptr(cpu_events, cpu);
549 if (IS_ERR(*pevent))
550 break;
551 unregister_hw_breakpoint(*pevent);
552 } 521 }
553 put_online_cpus(); 522 put_online_cpus();
554 523
555 free_percpu(cpu_events); 524 if (likely(!err))
525 return cpu_events;
526
527 unregister_wide_hw_breakpoint(cpu_events);
556 return (void __percpu __force *)ERR_PTR(err); 528 return (void __percpu __force *)ERR_PTR(err);
557} 529}
558EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); 530EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
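
[Editorial note] For context on the error path simplified in the hunk above, a register_wide_hw_breakpoint() caller looks roughly like the sketch below. It is loosely modeled on the in-tree samples/hw_breakpoint module; the watched symbol, handler body and module boilerplate are illustrative assumptions, and this is kernel-module code rather than something buildable on its own.

    #include <linux/module.h>
    #include <linux/kallsyms.h>
    #include <linux/perf_event.h>
    #include <linux/hw_breakpoint.h>

    static struct perf_event * __percpu *wp;

    /* Invoked whenever the watched word is written on any CPU. */
    static void wp_handler(struct perf_event *bp, struct perf_sample_data *data,
                           struct pt_regs *regs)
    {
            pr_info("write to watched symbol detected\n");
    }

    static int __init wp_init(void)
    {
            struct perf_event_attr attr;

            hw_breakpoint_init(&attr);
            attr.bp_addr = kallsyms_lookup_name("jiffies"); /* illustrative target */
            attr.bp_len  = HW_BREAKPOINT_LEN_4;
            attr.bp_type = HW_BREAKPOINT_W;

            wp = register_wide_hw_breakpoint(&attr, wp_handler, NULL);
            if (IS_ERR((void __force *)wp))
                    return PTR_ERR((void __force *)wp);
            return 0;
    }

    static void __exit wp_exit(void)
    {
            unregister_wide_hw_breakpoint(wp);
    }

    module_init(wp_init);
    module_exit(wp_exit);
    MODULE_LICENSE("GPL");

If perf_event_create_kernel_counter() fails for any online CPU, the rewritten register path above now hands the partially populated per-cpu array straight to unregister_wide_hw_breakpoint(), which tolerates NULL entries for the CPUs that were never set up.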
@@ -564,12 +536,10 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
564void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) 536void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
565{ 537{
566 int cpu; 538 int cpu;
567 struct perf_event **pevent;
568 539
569 for_each_possible_cpu(cpu) { 540 for_each_possible_cpu(cpu)
570 pevent = per_cpu_ptr(cpu_events, cpu); 541 unregister_hw_breakpoint(per_cpu(*cpu_events, cpu));
571 unregister_hw_breakpoint(*pevent); 542
572 }
573 free_percpu(cpu_events); 543 free_percpu(cpu_events);
574} 544}
575EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); 545EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
@@ -612,6 +582,11 @@ static int hw_breakpoint_add(struct perf_event *bp, int flags)
612 if (!(flags & PERF_EF_START)) 582 if (!(flags & PERF_EF_START))
613 bp->hw.state = PERF_HES_STOPPED; 583 bp->hw.state = PERF_HES_STOPPED;
614 584
585 if (is_sampling_event(bp)) {
586 bp->hw.last_period = bp->hw.sample_period;
587 perf_swevent_set_period(bp);
588 }
589
615 return arch_install_hw_breakpoint(bp); 590 return arch_install_hw_breakpoint(bp);
616} 591}
617 592
@@ -650,7 +625,6 @@ static struct pmu perf_breakpoint = {
650 625
651int __init init_hw_breakpoint(void) 626int __init init_hw_breakpoint(void)
652{ 627{
653 unsigned int **task_bp_pinned;
654 int cpu, err_cpu; 628 int cpu, err_cpu;
655 int i; 629 int i;
656 630
@@ -659,10 +633,11 @@ int __init init_hw_breakpoint(void)
659 633
660 for_each_possible_cpu(cpu) { 634 for_each_possible_cpu(cpu) {
661 for (i = 0; i < TYPE_MAX; i++) { 635 for (i = 0; i < TYPE_MAX; i++) {
662 task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu); 636 struct bp_cpuinfo *info = get_bp_info(cpu, i);
663 *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i], 637
664 GFP_KERNEL); 638 info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
665 if (!*task_bp_pinned) 639 GFP_KERNEL);
640 if (!info->tsk_pinned)
666 goto err_alloc; 641 goto err_alloc;
667 } 642 }
668 } 643 }
@@ -676,7 +651,7 @@ int __init init_hw_breakpoint(void)
676 err_alloc: 651 err_alloc:
677 for_each_possible_cpu(err_cpu) { 652 for_each_possible_cpu(err_cpu) {
678 for (i = 0; i < TYPE_MAX; i++) 653 for (i = 0; i < TYPE_MAX; i++)
679 kfree(per_cpu(nr_task_bp_pinned[i], err_cpu)); 654 kfree(get_bp_info(err_cpu, i)->tsk_pinned);
680 if (err_cpu == cpu) 655 if (err_cpu == cpu)
681 break; 656 break;
682 } 657 }
diff --git a/kernel/exit.c b/kernel/exit.c
index 7bb73f9d09db..a949819055d5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -312,17 +312,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
312 } 312 }
313} 313}
314 314
315void __set_special_pids(struct pid *pid)
316{
317 struct task_struct *curr = current->group_leader;
318
319 if (task_session(curr) != pid)
320 change_pid(curr, PIDTYPE_SID, pid);
321
322 if (task_pgrp(curr) != pid)
323 change_pid(curr, PIDTYPE_PGID, pid);
324}
325
326/* 315/*
327 * Let kernel threads use this to say that they allow a certain signal. 316 * Let kernel threads use this to say that they allow a certain signal.
328 * Must not be used if kthread was cloned with CLONE_SIGHAND. 317 * Must not be used if kthread was cloned with CLONE_SIGHAND.
@@ -819,7 +808,7 @@ void do_exit(long code)
819 /* 808 /*
820 * FIXME: do that only when needed, using sched_exit tracepoint 809 * FIXME: do that only when needed, using sched_exit tracepoint
821 */ 810 */
822 ptrace_put_breakpoints(tsk); 811 flush_ptrace_hw_breakpoint(tsk);
823 812
824 exit_notify(tsk, group_dead); 813 exit_notify(tsk, group_dead);
825#ifdef CONFIG_NUMA 814#ifdef CONFIG_NUMA
@@ -835,7 +824,7 @@ void do_exit(long code)
835 /* 824 /*
836 * Make sure we are holding no locks: 825 * Make sure we are holding no locks:
837 */ 826 */
838 debug_check_no_locks_held(tsk); 827 debug_check_no_locks_held();
839 /* 828 /*
840 * We can do this unlocked here. The futex code uses this flag 829 * We can do this unlocked here. The futex code uses this flag
841 * just to verify whether the pi state cleanup has been done 830 * just to verify whether the pi state cleanup has been done
diff --git a/kernel/fork.c b/kernel/fork.c
index 987b28a1f01b..e23bb19e2a3e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -365,8 +365,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
365 mm->locked_vm = 0; 365 mm->locked_vm = 0;
366 mm->mmap = NULL; 366 mm->mmap = NULL;
367 mm->mmap_cache = NULL; 367 mm->mmap_cache = NULL;
368 mm->free_area_cache = oldmm->mmap_base;
369 mm->cached_hole_size = ~0UL;
370 mm->map_count = 0; 368 mm->map_count = 0;
371 cpumask_clear(mm_cpumask(mm)); 369 cpumask_clear(mm_cpumask(mm));
372 mm->mm_rb = RB_ROOT; 370 mm->mm_rb = RB_ROOT;
@@ -540,8 +538,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
540 mm->nr_ptes = 0; 538 mm->nr_ptes = 0;
541 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); 539 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
542 spin_lock_init(&mm->page_table_lock); 540 spin_lock_init(&mm->page_table_lock);
543 mm->free_area_cache = TASK_UNMAPPED_BASE;
544 mm->cached_hole_size = ~0UL;
545 mm_init_aio(mm); 541 mm_init_aio(mm);
546 mm_init_owner(mm, p); 542 mm_init_owner(mm, p);
547 543
@@ -1121,6 +1117,12 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
1121 INIT_LIST_HEAD(&tsk->cpu_timers[2]); 1117 INIT_LIST_HEAD(&tsk->cpu_timers[2]);
1122} 1118}
1123 1119
1120static inline void
1121init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
1122{
1123 task->pids[type].pid = pid;
1124}
1125
1124/* 1126/*
1125 * This creates a new process as a copy of the old one, 1127 * This creates a new process as a copy of the old one,
1126 * but does not actually start it yet. 1128 * but does not actually start it yet.
@@ -1199,8 +1201,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1199 retval = -EAGAIN; 1201 retval = -EAGAIN;
1200 if (atomic_read(&p->real_cred->user->processes) >= 1202 if (atomic_read(&p->real_cred->user->processes) >=
1201 task_rlimit(p, RLIMIT_NPROC)) { 1203 task_rlimit(p, RLIMIT_NPROC)) {
1202 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 1204 if (p->real_cred->user != INIT_USER &&
1203 p->real_cred->user != INIT_USER) 1205 !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
1204 goto bad_fork_free; 1206 goto bad_fork_free;
1205 } 1207 }
1206 current->flags &= ~PF_NPROC_EXCEEDED; 1208 current->flags &= ~PF_NPROC_EXCEEDED;
@@ -1354,11 +1356,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1354 goto bad_fork_cleanup_io; 1356 goto bad_fork_cleanup_io;
1355 } 1357 }
1356 1358
1357 p->pid = pid_nr(pid);
1358 p->tgid = p->pid;
1359 if (clone_flags & CLONE_THREAD)
1360 p->tgid = current->tgid;
1361
1362 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1359 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1363 /* 1360 /*
1364 * Clear TID on mm_release()? 1361 * Clear TID on mm_release()?
@@ -1394,12 +1391,19 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1394 clear_all_latency_tracing(p); 1391 clear_all_latency_tracing(p);
1395 1392
1396 /* ok, now we should be set up.. */ 1393 /* ok, now we should be set up.. */
1397 if (clone_flags & CLONE_THREAD) 1394 p->pid = pid_nr(pid);
1395 if (clone_flags & CLONE_THREAD) {
1398 p->exit_signal = -1; 1396 p->exit_signal = -1;
1399 else if (clone_flags & CLONE_PARENT) 1397 p->group_leader = current->group_leader;
1400 p->exit_signal = current->group_leader->exit_signal; 1398 p->tgid = current->tgid;
1401 else 1399 } else {
1402 p->exit_signal = (clone_flags & CSIGNAL); 1400 if (clone_flags & CLONE_PARENT)
1401 p->exit_signal = current->group_leader->exit_signal;
1402 else
1403 p->exit_signal = (clone_flags & CSIGNAL);
1404 p->group_leader = p;
1405 p->tgid = p->pid;
1406 }
1403 1407
1404 p->pdeath_signal = 0; 1408 p->pdeath_signal = 0;
1405 p->exit_state = 0; 1409 p->exit_state = 0;
@@ -1408,15 +1412,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1408 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); 1412 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
1409 p->dirty_paused_when = 0; 1413 p->dirty_paused_when = 0;
1410 1414
1411 /*
1412 * Ok, make it visible to the rest of the system.
1413 * We dont wake it up yet.
1414 */
1415 p->group_leader = p;
1416 INIT_LIST_HEAD(&p->thread_group); 1415 INIT_LIST_HEAD(&p->thread_group);
1417 p->task_works = NULL; 1416 p->task_works = NULL;
1418 1417
1419 /* Need tasklist lock for parent etc handling! */ 1418 /*
1419 * Make it visible to the rest of the system, but dont wake it up yet.
1420 * Need tasklist lock for parent etc handling!
1421 */
1420 write_lock_irq(&tasklist_lock); 1422 write_lock_irq(&tasklist_lock);
1421 1423
1422 /* CLONE_PARENT re-uses the old parent */ 1424 /* CLONE_PARENT re-uses the old parent */
@@ -1446,18 +1448,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1446 goto bad_fork_free_pid; 1448 goto bad_fork_free_pid;
1447 } 1449 }
1448 1450
1449 if (clone_flags & CLONE_THREAD) {
1450 current->signal->nr_threads++;
1451 atomic_inc(&current->signal->live);
1452 atomic_inc(&current->signal->sigcnt);
1453 p->group_leader = current->group_leader;
1454 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1455 }
1456
1457 if (likely(p->pid)) { 1451 if (likely(p->pid)) {
1458 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); 1452 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
1459 1453
1454 init_task_pid(p, PIDTYPE_PID, pid);
1460 if (thread_group_leader(p)) { 1455 if (thread_group_leader(p)) {
1456 init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
1457 init_task_pid(p, PIDTYPE_SID, task_session(current));
1458
1461 if (is_child_reaper(pid)) { 1459 if (is_child_reaper(pid)) {
1462 ns_of_pid(pid)->child_reaper = p; 1460 ns_of_pid(pid)->child_reaper = p;
1463 p->signal->flags |= SIGNAL_UNKILLABLE; 1461 p->signal->flags |= SIGNAL_UNKILLABLE;
@@ -1465,13 +1463,19 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1465 1463
1466 p->signal->leader_pid = pid; 1464 p->signal->leader_pid = pid;
1467 p->signal->tty = tty_kref_get(current->signal->tty); 1465 p->signal->tty = tty_kref_get(current->signal->tty);
1468 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1469 attach_pid(p, PIDTYPE_SID, task_session(current));
1470 list_add_tail(&p->sibling, &p->real_parent->children); 1466 list_add_tail(&p->sibling, &p->real_parent->children);
1471 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1467 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1468 attach_pid(p, PIDTYPE_PGID);
1469 attach_pid(p, PIDTYPE_SID);
1472 __this_cpu_inc(process_counts); 1470 __this_cpu_inc(process_counts);
1471 } else {
1472 current->signal->nr_threads++;
1473 atomic_inc(&current->signal->live);
1474 atomic_inc(&current->signal->sigcnt);
1475 list_add_tail_rcu(&p->thread_group,
1476 &p->group_leader->thread_group);
1473 } 1477 }
1474 attach_pid(p, PIDTYPE_PID, pid); 1478 attach_pid(p, PIDTYPE_PID);
1475 nr_threads++; 1479 nr_threads++;
1476 } 1480 }
1477 1481
@@ -1542,7 +1546,7 @@ static inline void init_idle_pids(struct pid_link *links)
1542 } 1546 }
1543} 1547}
1544 1548
1545struct task_struct * __cpuinit fork_idle(int cpu) 1549struct task_struct *fork_idle(int cpu)
1546{ 1550{
1547 struct task_struct *task; 1551 struct task_struct *task;
1548 task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); 1552 task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
@@ -1675,6 +1679,12 @@ SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
1675 int __user *, parent_tidptr, 1679 int __user *, parent_tidptr,
1676 int __user *, child_tidptr, 1680 int __user *, child_tidptr,
1677 int, tls_val) 1681 int, tls_val)
1682#elif defined(CONFIG_CLONE_BACKWARDS3)
1683SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
1684 int, stack_size,
1685 int __user *, parent_tidptr,
1686 int __user *, child_tidptr,
1687 int, tls_val)
1678#else 1688#else
1679SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, 1689SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1680 int __user *, parent_tidptr, 1690 int __user *, parent_tidptr,
diff --git a/kernel/freezer.c b/kernel/freezer.c
index c38893b0efba..b462fa197517 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -33,7 +33,7 @@ static DEFINE_SPINLOCK(freezer_lock);
33 */ 33 */
34bool freezing_slow_path(struct task_struct *p) 34bool freezing_slow_path(struct task_struct *p)
35{ 35{
36 if (p->flags & PF_NOFREEZE) 36 if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
37 return false; 37 return false;
38 38
39 if (pm_nosig_freezing || cgroup_freezing(p)) 39 if (pm_nosig_freezing || cgroup_freezing(p))
@@ -110,6 +110,18 @@ bool freeze_task(struct task_struct *p)
110{ 110{
111 unsigned long flags; 111 unsigned long flags;
112 112
113 /*
114 * This check can race with freezer_do_not_count, but worst case that
115 * will result in an extra wakeup being sent to the task. It does not
116 * race with freezer_count(), the barriers in freezer_count() and
117 * freezer_should_skip() ensure that either freezer_count() sees
118 * freezing == true in try_to_freeze() and freezes, or
 119 * freezer_should_skip() sees !PF_FREEZER_SKIP and freezes the task
120 * normally.
121 */
122 if (freezer_should_skip(p))
123 return false;
124
113 spin_lock_irqsave(&freezer_lock, flags); 125 spin_lock_irqsave(&freezer_lock, flags);
114 if (!freezing(p) || frozen(p)) { 126 if (!freezing(p) || frozen(p)) {
115 spin_unlock_irqrestore(&freezer_lock, flags); 127 spin_unlock_irqrestore(&freezer_lock, flags);
diff --git a/kernel/futex.c b/kernel/futex.c
index b26dcfc02c94..c3a1a55a5214 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -61,6 +61,8 @@
61#include <linux/nsproxy.h> 61#include <linux/nsproxy.h>
62#include <linux/ptrace.h> 62#include <linux/ptrace.h>
63#include <linux/sched/rt.h> 63#include <linux/sched/rt.h>
64#include <linux/hugetlb.h>
65#include <linux/freezer.h>
64 66
65#include <asm/futex.h> 67#include <asm/futex.h>
66 68
@@ -365,7 +367,7 @@ again:
365 } else { 367 } else {
366 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 368 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
367 key->shared.inode = page_head->mapping->host; 369 key->shared.inode = page_head->mapping->host;
368 key->shared.pgoff = page_head->index; 370 key->shared.pgoff = basepage_index(page);
369 } 371 }
370 372
371 get_futex_key_refs(key); 373 get_futex_key_refs(key);
@@ -1807,7 +1809,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1807 * is no timeout, or if it has yet to expire. 1809 * is no timeout, or if it has yet to expire.
1808 */ 1810 */
1809 if (!timeout || timeout->task) 1811 if (!timeout || timeout->task)
1810 schedule(); 1812 freezable_schedule();
1811 } 1813 }
1812 __set_current_state(TASK_RUNNING); 1814 __set_current_state(TASK_RUNNING);
1813} 1815}
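With freezable_schedule() a task parked in FUTEX_WAIT can now be frozen instead of blocking a suspend. A hedged userspace sketch of such a sleeper, waiting on a private futex word with a two-second relative timeout (the futex word and timeout are illustrative):

#include <errno.h>
#include <linux/futex.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

static int futex_word;			/* futexes operate on a 32-bit word */

int main(void)
{
	struct timespec timeout = { .tv_sec = 2 };	/* relative timeout */
	long ret;

	/* Sleeps in futex_wait_queue_me() until woken, signalled or timed
	 * out; the sleep itself is now freezable_schedule() in the kernel.
	 */
	ret = syscall(SYS_futex, &futex_word, FUTEX_WAIT,
		      0 /* expected value */, &timeout, NULL, 0);
	printf("FUTEX_WAIT returned %ld (%s)\n", ret,
	       ret ? strerror(errno) : "woken");
	return 0;
}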
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index fd4b13b131f8..383319bae3f7 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -47,6 +47,7 @@
47#include <linux/sched/sysctl.h> 47#include <linux/sched/sysctl.h>
48#include <linux/sched/rt.h> 48#include <linux/sched/rt.h>
49#include <linux/timer.h> 49#include <linux/timer.h>
50#include <linux/freezer.h>
50 51
51#include <asm/uaccess.h> 52#include <asm/uaccess.h>
52 53
@@ -721,17 +722,20 @@ static int hrtimer_switch_to_hres(void)
721 return 1; 722 return 1;
722} 723}
723 724
725static void clock_was_set_work(struct work_struct *work)
726{
727 clock_was_set();
728}
729
730static DECLARE_WORK(hrtimer_work, clock_was_set_work);
731
724/* 732/*
 725 * Called from timekeeping code to reprogram the hrtimer interrupt 733 * Called from timekeeping and resume code to reprogram the hrtimer
726 * device. If called from the timer interrupt context we defer it to 734 * interrupt device on all cpus.
727 * softirq context.
728 */ 735 */
729void clock_was_set_delayed(void) 736void clock_was_set_delayed(void)
730{ 737{
731 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 738 schedule_work(&hrtimer_work);
732
733 cpu_base->clock_was_set = 1;
734 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
735} 739}
736 740
737#else 741#else
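Replacing the per-cpu softirq flag with a work item is the usual way to defer work that may sleep out of interrupt context. A hedged generic sketch of the DECLARE_WORK()/schedule_work() pattern used above; the demo_* names are placeholders:

#include <linux/workqueue.h>

static void demo_deferred_fn(struct work_struct *work)
{
	/* Runs later in a kworker thread (process context), may sleep. */
}

static DECLARE_WORK(demo_work, demo_deferred_fn);

/* Safe from hard interrupt context: queuing is cheap and non-blocking, and
 * repeated calls while the work is still pending collapse into one run.
 */
static void demo_event_from_irq(void)
{
	schedule_work(&demo_work);
}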
@@ -773,15 +777,19 @@ void clock_was_set(void)
773 777
774/* 778/*
775 * During resume we might have to reprogram the high resolution timer 779 * During resume we might have to reprogram the high resolution timer
776 * interrupt (on the local CPU): 780 * interrupt on all online CPUs. However, all other CPUs will be
 781 * stopped with interrupts disabled so the clock_was_set() call
782 * must be deferred.
777 */ 783 */
778void hrtimers_resume(void) 784void hrtimers_resume(void)
779{ 785{
780 WARN_ONCE(!irqs_disabled(), 786 WARN_ONCE(!irqs_disabled(),
781 KERN_INFO "hrtimers_resume() called with IRQs enabled!"); 787 KERN_INFO "hrtimers_resume() called with IRQs enabled!");
782 788
789 /* Retrigger on the local CPU */
783 retrigger_next_event(NULL); 790 retrigger_next_event(NULL);
784 timerfd_clock_was_set(); 791 /* And schedule a retrigger for all others */
792 clock_was_set_delayed();
785} 793}
786 794
787static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) 795static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
@@ -1432,13 +1440,6 @@ void hrtimer_peek_ahead_timers(void)
1432 1440
1433static void run_hrtimer_softirq(struct softirq_action *h) 1441static void run_hrtimer_softirq(struct softirq_action *h)
1434{ 1442{
1435 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1436
1437 if (cpu_base->clock_was_set) {
1438 cpu_base->clock_was_set = 0;
1439 clock_was_set();
1440 }
1441
1442 hrtimer_peek_ahead_timers(); 1443 hrtimer_peek_ahead_timers();
1443} 1444}
1444 1445
@@ -1545,7 +1546,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
1545 t->task = NULL; 1546 t->task = NULL;
1546 1547
1547 if (likely(t->task)) 1548 if (likely(t->task))
1548 schedule(); 1549 freezable_schedule();
1549 1550
1550 hrtimer_cancel(&t->timer); 1551 hrtimer_cancel(&t->timer);
1551 mode = HRTIMER_MODE_ABS; 1552 mode = HRTIMER_MODE_ABS;
@@ -1658,7 +1659,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
1658/* 1659/*
1659 * Functions related to boot-time initialization: 1660 * Functions related to boot-time initialization:
1660 */ 1661 */
1661static void __cpuinit init_hrtimers_cpu(int cpu) 1662static void init_hrtimers_cpu(int cpu)
1662{ 1663{
1663 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 1664 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1664 int i; 1665 int i;
@@ -1739,7 +1740,7 @@ static void migrate_hrtimers(int scpu)
1739 1740
1740#endif /* CONFIG_HOTPLUG_CPU */ 1741#endif /* CONFIG_HOTPLUG_CPU */
1741 1742
1742static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, 1743static int hrtimer_cpu_notify(struct notifier_block *self,
1743 unsigned long action, void *hcpu) 1744 unsigned long action, void *hcpu)
1744{ 1745{
1745 int scpu = (long)hcpu; 1746 int scpu = (long)hcpu;
@@ -1772,7 +1773,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1772 return NOTIFY_OK; 1773 return NOTIFY_OK;
1773} 1774}
1774 1775
1775static struct notifier_block __cpuinitdata hrtimers_nb = { 1776static struct notifier_block hrtimers_nb = {
1776 .notifier_call = hrtimer_cpu_notify, 1777 .notifier_call = hrtimer_cpu_notify,
1777}; 1778};
1778 1779
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index cbd97ce0b000..a3bb14fbe5c6 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -213,6 +213,19 @@ void irq_enable(struct irq_desc *desc)
213 irq_state_clr_masked(desc); 213 irq_state_clr_masked(desc);
214} 214}
215 215
216/**
 217 * irq_disable - Mark interrupt disabled 218 * irq_disable - Mark interrupt disabled
218 * @desc: irq descriptor which should be disabled
219 *
220 * If the chip does not implement the irq_disable callback, we
221 * use a lazy disable approach. That means we mark the interrupt
222 * disabled, but leave the hardware unmasked. That's an
223 * optimization because we avoid the hardware access for the
224 * common case where no interrupt happens after we marked it
225 * disabled. If an interrupt happens, then the interrupt flow
226 * handler masks the line at the hardware level and marks it
227 * pending.
228 */
216void irq_disable(struct irq_desc *desc) 229void irq_disable(struct irq_desc *desc)
217{ 230{
218 irq_state_set_disabled(desc); 231 irq_state_set_disabled(desc);
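For a chip driver, the lazy behaviour documented above simply falls out of leaving .irq_disable unset. A hedged sketch contrasting the two cases; the register base, offsets and the assumption of fewer than 32 lines are invented for illustration:

#include <linux/bitops.h>
#include <linux/io.h>
#include <linux/irq.h>

static void __iomem *demo_regs;		/* assumed controller register base */

static void demo_mask(struct irq_data *d)
{
	writel(BIT(d->hwirq), demo_regs + 0x10);	/* hypothetical MASK_SET, hwirq < 32 */
}

static void demo_unmask(struct irq_data *d)
{
	writel(BIT(d->hwirq), demo_regs + 0x14);	/* hypothetical MASK_CLR */
}

/* Lazy: without .irq_disable, disable_irq() only marks the descriptor
 * disabled; the flow handler masks the line if it fires anyway.
 */
static struct irq_chip demo_lazy_chip = {
	.name		= "demo-lazy",
	.irq_mask	= demo_mask,
	.irq_unmask	= demo_unmask,
};

/* Eager: .irq_disable makes disable_irq() mask the line immediately. */
static struct irq_chip demo_eager_chip = {
	.name		= "demo-eager",
	.irq_mask	= demo_mask,
	.irq_unmask	= demo_unmask,
	.irq_disable	= demo_mask,
};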
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c89295a8f668..452d6f2ba21d 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -7,6 +7,7 @@
7#include <linux/irq.h> 7#include <linux/irq.h>
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/export.h> 9#include <linux/export.h>
10#include <linux/irqdomain.h>
10#include <linux/interrupt.h> 11#include <linux/interrupt.h>
11#include <linux/kernel_stat.h> 12#include <linux/kernel_stat.h>
12#include <linux/syscore_ops.h> 13#include <linux/syscore_ops.h>
@@ -16,11 +17,6 @@
16static LIST_HEAD(gc_list); 17static LIST_HEAD(gc_list);
17static DEFINE_RAW_SPINLOCK(gc_lock); 18static DEFINE_RAW_SPINLOCK(gc_lock);
18 19
19static inline struct irq_chip_regs *cur_regs(struct irq_data *d)
20{
21 return &container_of(d->chip, struct irq_chip_type, chip)->regs;
22}
23
24/** 20/**
25 * irq_gc_noop - NOOP function 21 * irq_gc_noop - NOOP function
26 * @d: irq_data 22 * @d: irq_data
@@ -39,16 +35,17 @@ void irq_gc_noop(struct irq_data *d)
39void irq_gc_mask_disable_reg(struct irq_data *d) 35void irq_gc_mask_disable_reg(struct irq_data *d)
40{ 36{
41 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 37 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
42 u32 mask = 1 << (d->irq - gc->irq_base); 38 struct irq_chip_type *ct = irq_data_get_chip_type(d);
39 u32 mask = d->mask;
43 40
44 irq_gc_lock(gc); 41 irq_gc_lock(gc);
45 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable); 42 irq_reg_writel(mask, gc->reg_base + ct->regs.disable);
46 gc->mask_cache &= ~mask; 43 *ct->mask_cache &= ~mask;
47 irq_gc_unlock(gc); 44 irq_gc_unlock(gc);
48} 45}
49 46
50/** 47/**
51 * irq_gc_mask_set_mask_bit - Mask chip via setting bit in mask register 48 * irq_gc_mask_set_bit - Mask chip via setting bit in mask register
52 * @d: irq_data 49 * @d: irq_data
53 * 50 *
54 * Chip has a single mask register. Values of this register are cached 51 * Chip has a single mask register. Values of this register are cached
@@ -57,16 +54,18 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
57void irq_gc_mask_set_bit(struct irq_data *d) 54void irq_gc_mask_set_bit(struct irq_data *d)
58{ 55{
59 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 56 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
60 u32 mask = 1 << (d->irq - gc->irq_base); 57 struct irq_chip_type *ct = irq_data_get_chip_type(d);
58 u32 mask = d->mask;
61 59
62 irq_gc_lock(gc); 60 irq_gc_lock(gc);
63 gc->mask_cache |= mask; 61 *ct->mask_cache |= mask;
64 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); 62 irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
65 irq_gc_unlock(gc); 63 irq_gc_unlock(gc);
66} 64}
65EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit);
67 66
68/** 67/**
69 * irq_gc_mask_set_mask_bit - Mask chip via clearing bit in mask register 68 * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register
70 * @d: irq_data 69 * @d: irq_data
71 * 70 *
72 * Chip has a single mask register. Values of this register are cached 71 * Chip has a single mask register. Values of this register are cached
@@ -75,13 +74,15 @@ void irq_gc_mask_set_bit(struct irq_data *d)
75void irq_gc_mask_clr_bit(struct irq_data *d) 74void irq_gc_mask_clr_bit(struct irq_data *d)
76{ 75{
77 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 76 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
78 u32 mask = 1 << (d->irq - gc->irq_base); 77 struct irq_chip_type *ct = irq_data_get_chip_type(d);
78 u32 mask = d->mask;
79 79
80 irq_gc_lock(gc); 80 irq_gc_lock(gc);
81 gc->mask_cache &= ~mask; 81 *ct->mask_cache &= ~mask;
82 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); 82 irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
83 irq_gc_unlock(gc); 83 irq_gc_unlock(gc);
84} 84}
85EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit);
85 86
86/** 87/**
87 * irq_gc_unmask_enable_reg - Unmask chip via enable register 88 * irq_gc_unmask_enable_reg - Unmask chip via enable register
@@ -93,11 +94,12 @@ void irq_gc_mask_clr_bit(struct irq_data *d)
93void irq_gc_unmask_enable_reg(struct irq_data *d) 94void irq_gc_unmask_enable_reg(struct irq_data *d)
94{ 95{
95 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 96 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
96 u32 mask = 1 << (d->irq - gc->irq_base); 97 struct irq_chip_type *ct = irq_data_get_chip_type(d);
98 u32 mask = d->mask;
97 99
98 irq_gc_lock(gc); 100 irq_gc_lock(gc);
99 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable); 101 irq_reg_writel(mask, gc->reg_base + ct->regs.enable);
100 gc->mask_cache |= mask; 102 *ct->mask_cache |= mask;
101 irq_gc_unlock(gc); 103 irq_gc_unlock(gc);
102} 104}
103 105
@@ -108,12 +110,14 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
108void irq_gc_ack_set_bit(struct irq_data *d) 110void irq_gc_ack_set_bit(struct irq_data *d)
109{ 111{
110 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 112 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
111 u32 mask = 1 << (d->irq - gc->irq_base); 113 struct irq_chip_type *ct = irq_data_get_chip_type(d);
114 u32 mask = d->mask;
112 115
113 irq_gc_lock(gc); 116 irq_gc_lock(gc);
114 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); 117 irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
115 irq_gc_unlock(gc); 118 irq_gc_unlock(gc);
116} 119}
120EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit);
117 121
118/** 122/**
119 * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit 123 * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit
@@ -122,25 +126,27 @@ void irq_gc_ack_set_bit(struct irq_data *d)
122void irq_gc_ack_clr_bit(struct irq_data *d) 126void irq_gc_ack_clr_bit(struct irq_data *d)
123{ 127{
124 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 128 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
125 u32 mask = ~(1 << (d->irq - gc->irq_base)); 129 struct irq_chip_type *ct = irq_data_get_chip_type(d);
130 u32 mask = ~d->mask;
126 131
127 irq_gc_lock(gc); 132 irq_gc_lock(gc);
128 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); 133 irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
129 irq_gc_unlock(gc); 134 irq_gc_unlock(gc);
130} 135}
131 136
132/** 137/**
133 * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt 138 * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt
134 * @d: irq_data 139 * @d: irq_data
135 */ 140 */
136void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) 141void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
137{ 142{
138 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 143 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
139 u32 mask = 1 << (d->irq - gc->irq_base); 144 struct irq_chip_type *ct = irq_data_get_chip_type(d);
145 u32 mask = d->mask;
140 146
141 irq_gc_lock(gc); 147 irq_gc_lock(gc);
142 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask); 148 irq_reg_writel(mask, gc->reg_base + ct->regs.mask);
143 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); 149 irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
144 irq_gc_unlock(gc); 150 irq_gc_unlock(gc);
145} 151}
146 152
@@ -151,16 +157,18 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
151void irq_gc_eoi(struct irq_data *d) 157void irq_gc_eoi(struct irq_data *d)
152{ 158{
153 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 159 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
154 u32 mask = 1 << (d->irq - gc->irq_base); 160 struct irq_chip_type *ct = irq_data_get_chip_type(d);
161 u32 mask = d->mask;
155 162
156 irq_gc_lock(gc); 163 irq_gc_lock(gc);
157 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi); 164 irq_reg_writel(mask, gc->reg_base + ct->regs.eoi);
158 irq_gc_unlock(gc); 165 irq_gc_unlock(gc);
159} 166}
160 167
161/** 168/**
162 * irq_gc_set_wake - Set/clr wake bit for an interrupt 169 * irq_gc_set_wake - Set/clr wake bit for an interrupt
163 * @d: irq_data 170 * @d: irq_data
171 * @on: Indicates whether the wake bit should be set or cleared
164 * 172 *
165 * For chips where the wake from suspend functionality is not 173 * For chips where the wake from suspend functionality is not
166 * configured in a separate register and the wakeup active state is 174 * configured in a separate register and the wakeup active state is
@@ -169,7 +177,7 @@ void irq_gc_eoi(struct irq_data *d)
169int irq_gc_set_wake(struct irq_data *d, unsigned int on) 177int irq_gc_set_wake(struct irq_data *d, unsigned int on)
170{ 178{
171 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 179 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
172 u32 mask = 1 << (d->irq - gc->irq_base); 180 u32 mask = d->mask;
173 181
174 if (!(mask & gc->wake_enabled)) 182 if (!(mask & gc->wake_enabled))
175 return -EINVAL; 183 return -EINVAL;
@@ -183,6 +191,19 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
183 return 0; 191 return 0;
184} 192}
185 193
194static void
195irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
196 int num_ct, unsigned int irq_base,
197 void __iomem *reg_base, irq_flow_handler_t handler)
198{
199 raw_spin_lock_init(&gc->lock);
200 gc->num_ct = num_ct;
201 gc->irq_base = irq_base;
202 gc->reg_base = reg_base;
203 gc->chip_types->chip.name = name;
204 gc->chip_types->handler = handler;
205}
206
186/** 207/**
187 * irq_alloc_generic_chip - Allocate a generic chip and initialize it 208 * irq_alloc_generic_chip - Allocate a generic chip and initialize it
188 * @name: Name of the irq chip 209 * @name: Name of the irq chip
@@ -203,23 +224,183 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
203 224
204 gc = kzalloc(sz, GFP_KERNEL); 225 gc = kzalloc(sz, GFP_KERNEL);
205 if (gc) { 226 if (gc) {
206 raw_spin_lock_init(&gc->lock); 227 irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base,
207 gc->num_ct = num_ct; 228 handler);
208 gc->irq_base = irq_base;
209 gc->reg_base = reg_base;
210 gc->chip_types->chip.name = name;
211 gc->chip_types->handler = handler;
212 } 229 }
213 return gc; 230 return gc;
214} 231}
215EXPORT_SYMBOL_GPL(irq_alloc_generic_chip); 232EXPORT_SYMBOL_GPL(irq_alloc_generic_chip);
216 233
234static void
235irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
236{
237 struct irq_chip_type *ct = gc->chip_types;
238 u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask;
239 int i;
240
241 for (i = 0; i < gc->num_ct; i++) {
242 if (flags & IRQ_GC_MASK_CACHE_PER_TYPE) {
243 mskptr = &ct[i].mask_cache_priv;
244 mskreg = ct[i].regs.mask;
245 }
246 ct[i].mask_cache = mskptr;
247 if (flags & IRQ_GC_INIT_MASK_CACHE)
248 *mskptr = irq_reg_readl(gc->reg_base + mskreg);
249 }
250}
251
252/**
 253 * irq_alloc_domain_generic_chips - Allocate generic chips for an irq domain
254 * @d: irq domain for which to allocate chips
255 * @irqs_per_chip: Number of interrupts each chip handles
 256 * @num_ct: Number of irq_chip_type instances associated with each chip
257 * @name: Name of the irq chip
258 * @handler: Default flow handler associated with these chips
259 * @clr: IRQ_* bits to clear in the mapping function
260 * @set: IRQ_* bits to set in the mapping function
261 * @gcflags: Generic chip specific setup flags
262 */
263int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
264 int num_ct, const char *name,
265 irq_flow_handler_t handler,
266 unsigned int clr, unsigned int set,
267 enum irq_gc_flags gcflags)
268{
269 struct irq_domain_chip_generic *dgc;
270 struct irq_chip_generic *gc;
271 int numchips, sz, i;
272 unsigned long flags;
273 void *tmp;
274
275 if (d->gc)
276 return -EBUSY;
277
278 numchips = DIV_ROUND_UP(d->revmap_size, irqs_per_chip);
279 if (!numchips)
280 return -EINVAL;
281
282 /* Allocate a pointer, generic chip and chiptypes for each chip */
283 sz = sizeof(*dgc) + numchips * sizeof(gc);
284 sz += numchips * (sizeof(*gc) + num_ct * sizeof(struct irq_chip_type));
285
286 tmp = dgc = kzalloc(sz, GFP_KERNEL);
287 if (!dgc)
288 return -ENOMEM;
289 dgc->irqs_per_chip = irqs_per_chip;
290 dgc->num_chips = numchips;
291 dgc->irq_flags_to_set = set;
292 dgc->irq_flags_to_clear = clr;
293 dgc->gc_flags = gcflags;
294 d->gc = dgc;
295
296 /* Calc pointer to the first generic chip */
297 tmp += sizeof(*dgc) + numchips * sizeof(gc);
298 for (i = 0; i < numchips; i++) {
299 /* Store the pointer to the generic chip */
300 dgc->gc[i] = gc = tmp;
301 irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip,
302 NULL, handler);
303 gc->domain = d;
304 raw_spin_lock_irqsave(&gc_lock, flags);
305 list_add_tail(&gc->list, &gc_list);
306 raw_spin_unlock_irqrestore(&gc_lock, flags);
307 /* Calc pointer to the next generic chip */
308 tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
309 }
310 d->name = name;
311 return 0;
312}
313EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips);
314
315/**
316 * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq
317 * @d: irq domain pointer
318 * @hw_irq: Hardware interrupt number
319 */
320struct irq_chip_generic *
321irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
322{
323 struct irq_domain_chip_generic *dgc = d->gc;
324 int idx;
325
326 if (!dgc)
327 return NULL;
328 idx = hw_irq / dgc->irqs_per_chip;
329 if (idx >= dgc->num_chips)
330 return NULL;
331 return dgc->gc[idx];
332}
333EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
334
217/* 335/*
218 * Separate lockdep class for interrupt chip which can nest irq_desc 336 * Separate lockdep class for interrupt chip which can nest irq_desc
219 * lock. 337 * lock.
220 */ 338 */
221static struct lock_class_key irq_nested_lock_class; 339static struct lock_class_key irq_nested_lock_class;
222 340
341/*
342 * irq_map_generic_chip - Map a generic chip for an irq domain
343 */
344static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
345 irq_hw_number_t hw_irq)
346{
347 struct irq_data *data = irq_get_irq_data(virq);
348 struct irq_domain_chip_generic *dgc = d->gc;
349 struct irq_chip_generic *gc;
350 struct irq_chip_type *ct;
351 struct irq_chip *chip;
352 unsigned long flags;
353 int idx;
354
355 if (!d->gc)
356 return -ENODEV;
357
358 idx = hw_irq / dgc->irqs_per_chip;
359 if (idx >= dgc->num_chips)
360 return -EINVAL;
361 gc = dgc->gc[idx];
362
363 idx = hw_irq % dgc->irqs_per_chip;
364
365 if (test_bit(idx, &gc->unused))
366 return -ENOTSUPP;
367
368 if (test_bit(idx, &gc->installed))
369 return -EBUSY;
370
371 ct = gc->chip_types;
372 chip = &ct->chip;
373
374 /* We only init the cache for the first mapping of a generic chip */
375 if (!gc->installed) {
376 raw_spin_lock_irqsave(&gc->lock, flags);
377 irq_gc_init_mask_cache(gc, dgc->gc_flags);
378 raw_spin_unlock_irqrestore(&gc->lock, flags);
379 }
380
381 /* Mark the interrupt as installed */
382 set_bit(idx, &gc->installed);
383
384 if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK)
385 irq_set_lockdep_class(virq, &irq_nested_lock_class);
386
387 if (chip->irq_calc_mask)
388 chip->irq_calc_mask(data);
389 else
390 data->mask = 1 << idx;
391
392 irq_set_chip_and_handler(virq, chip, ct->handler);
393 irq_set_chip_data(virq, gc);
394 irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
395 return 0;
396}
397
398struct irq_domain_ops irq_generic_chip_ops = {
399 .map = irq_map_generic_chip,
400 .xlate = irq_domain_xlate_onetwocell,
401};
402EXPORT_SYMBOL_GPL(irq_generic_chip_ops);
403
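A hedged driver-side sketch of the API added above: register a linear domain backed by irq_generic_chip_ops, allocate one generic chip per 32 hardware interrupts, then fill in register offsets and callbacks through irq_get_domain_generic_chip(). The device-tree node, register base, offsets and per-chip stride are assumptions for illustration:

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

#define DEMO_NR_IRQS	64
#define DEMO_REG_MASK	0x04
#define DEMO_REG_ACK	0x08

static int demo_intc_init(struct device_node *np, void __iomem *base)
{
	struct irq_domain *domain;
	struct irq_chip_generic *gc;
	int i, ret;

	domain = irq_domain_add_linear(np, DEMO_NR_IRQS,
				       &irq_generic_chip_ops, NULL);
	if (!domain)
		return -ENOMEM;

	ret = irq_alloc_domain_generic_chips(domain, 32, 1, "demo-intc",
					     handle_level_irq, 0, 0,
					     IRQ_GC_INIT_MASK_CACHE);
	if (ret)
		return ret;

	for (i = 0; i < DEMO_NR_IRQS; i += 32) {
		gc = irq_get_domain_generic_chip(domain, i);
		gc->reg_base = base + (i / 32) * 0x40;	/* assumed bank stride */
		gc->chip_types[0].regs.mask = DEMO_REG_MASK;
		gc->chip_types[0].regs.ack = DEMO_REG_ACK;
		gc->chip_types[0].chip.irq_mask = irq_gc_mask_set_bit;
		gc->chip_types[0].chip.irq_unmask = irq_gc_mask_clr_bit;
		gc->chip_types[0].chip.irq_ack = irq_gc_ack_set_bit;
	}
	return 0;
}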
223/** 404/**
224 * irq_setup_generic_chip - Setup a range of interrupts with a generic chip 405 * irq_setup_generic_chip - Setup a range of interrupts with a generic chip
225 * @gc: Generic irq chip holding all data 406 * @gc: Generic irq chip holding all data
@@ -237,15 +418,14 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
237 unsigned int set) 418 unsigned int set)
238{ 419{
239 struct irq_chip_type *ct = gc->chip_types; 420 struct irq_chip_type *ct = gc->chip_types;
421 struct irq_chip *chip = &ct->chip;
240 unsigned int i; 422 unsigned int i;
241 423
242 raw_spin_lock(&gc_lock); 424 raw_spin_lock(&gc_lock);
243 list_add_tail(&gc->list, &gc_list); 425 list_add_tail(&gc->list, &gc_list);
244 raw_spin_unlock(&gc_lock); 426 raw_spin_unlock(&gc_lock);
245 427
246 /* Init mask cache ? */ 428 irq_gc_init_mask_cache(gc, flags);
247 if (flags & IRQ_GC_INIT_MASK_CACHE)
248 gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
249 429
250 for (i = gc->irq_base; msk; msk >>= 1, i++) { 430 for (i = gc->irq_base; msk; msk >>= 1, i++) {
251 if (!(msk & 0x01)) 431 if (!(msk & 0x01))
@@ -254,7 +434,15 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
254 if (flags & IRQ_GC_INIT_NESTED_LOCK) 434 if (flags & IRQ_GC_INIT_NESTED_LOCK)
255 irq_set_lockdep_class(i, &irq_nested_lock_class); 435 irq_set_lockdep_class(i, &irq_nested_lock_class);
256 436
257 irq_set_chip_and_handler(i, &ct->chip, ct->handler); 437 if (!(flags & IRQ_GC_NO_MASK)) {
438 struct irq_data *d = irq_get_irq_data(i);
439
440 if (chip->irq_calc_mask)
441 chip->irq_calc_mask(d);
442 else
443 d->mask = 1 << (i - gc->irq_base);
444 }
445 irq_set_chip_and_handler(i, chip, ct->handler);
258 irq_set_chip_data(i, gc); 446 irq_set_chip_data(i, gc);
259 irq_modify_status(i, clr, set); 447 irq_modify_status(i, clr, set);
260 } 448 }
@@ -265,7 +453,7 @@ EXPORT_SYMBOL_GPL(irq_setup_generic_chip);
265/** 453/**
266 * irq_setup_alt_chip - Switch to alternative chip 454 * irq_setup_alt_chip - Switch to alternative chip
267 * @d: irq_data for this interrupt 455 * @d: irq_data for this interrupt
268 * @type Flow type to be initialized 456 * @type: Flow type to be initialized
269 * 457 *
270 * Only to be called from chip->irq_set_type() callbacks. 458 * Only to be called from chip->irq_set_type() callbacks.
271 */ 459 */
@@ -317,6 +505,24 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
317} 505}
318EXPORT_SYMBOL_GPL(irq_remove_generic_chip); 506EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
319 507
508static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc)
509{
510 unsigned int virq;
511
512 if (!gc->domain)
513 return irq_get_irq_data(gc->irq_base);
514
515 /*
 516 * We don't know which of the irqs has actually been
517 * installed. Use the first one.
518 */
519 if (!gc->installed)
520 return NULL;
521
522 virq = irq_find_mapping(gc->domain, gc->irq_base + __ffs(gc->installed));
523 return virq ? irq_get_irq_data(virq) : NULL;
524}
525
320#ifdef CONFIG_PM 526#ifdef CONFIG_PM
321static int irq_gc_suspend(void) 527static int irq_gc_suspend(void)
322{ 528{
@@ -325,8 +531,12 @@ static int irq_gc_suspend(void)
325 list_for_each_entry(gc, &gc_list, list) { 531 list_for_each_entry(gc, &gc_list, list) {
326 struct irq_chip_type *ct = gc->chip_types; 532 struct irq_chip_type *ct = gc->chip_types;
327 533
328 if (ct->chip.irq_suspend) 534 if (ct->chip.irq_suspend) {
329 ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base)); 535 struct irq_data *data = irq_gc_get_irq_data(gc);
536
537 if (data)
538 ct->chip.irq_suspend(data);
539 }
330 } 540 }
331 return 0; 541 return 0;
332} 542}
@@ -338,8 +548,12 @@ static void irq_gc_resume(void)
338 list_for_each_entry(gc, &gc_list, list) { 548 list_for_each_entry(gc, &gc_list, list) {
339 struct irq_chip_type *ct = gc->chip_types; 549 struct irq_chip_type *ct = gc->chip_types;
340 550
341 if (ct->chip.irq_resume) 551 if (ct->chip.irq_resume) {
342 ct->chip.irq_resume(irq_get_irq_data(gc->irq_base)); 552 struct irq_data *data = irq_gc_get_irq_data(gc);
553
554 if (data)
555 ct->chip.irq_resume(data);
556 }
343 } 557 }
344} 558}
345#else 559#else
@@ -354,8 +568,12 @@ static void irq_gc_shutdown(void)
354 list_for_each_entry(gc, &gc_list, list) { 568 list_for_each_entry(gc, &gc_list, list) {
355 struct irq_chip_type *ct = gc->chip_types; 569 struct irq_chip_type *ct = gc->chip_types;
356 570
357 if (ct->chip.irq_pm_shutdown) 571 if (ct->chip.irq_pm_shutdown) {
358 ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base)); 572 struct irq_data *data = irq_gc_get_irq_data(gc);
573
574 if (data)
575 ct->chip.irq_pm_shutdown(data);
576 }
359 } 577 }
360} 578}
361 579
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 54a4d5223238..706724e9835d 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -16,12 +16,6 @@
16#include <linux/smp.h> 16#include <linux/smp.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18 18
19#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs.
20 * ie. legacy 8259, gets irqs 1..15 */
21#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */
22#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */
23#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */
24
25static LIST_HEAD(irq_domain_list); 19static LIST_HEAD(irq_domain_list);
26static DEFINE_MUTEX(irq_domain_mutex); 20static DEFINE_MUTEX(irq_domain_mutex);
27 21
@@ -29,9 +23,11 @@ static DEFINE_MUTEX(revmap_trees_mutex);
29static struct irq_domain *irq_default_domain; 23static struct irq_domain *irq_default_domain;
30 24
31/** 25/**
32 * irq_domain_alloc() - Allocate a new irq_domain data structure 26 * __irq_domain_add() - Allocate a new irq_domain data structure
33 * @of_node: optional device-tree node of the interrupt controller 27 * @of_node: optional device-tree node of the interrupt controller
34 * @revmap_type: type of reverse mapping to use 28 * @size: Size of linear map; 0 for radix mapping only
29 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
30 * direct mapping
35 * @ops: map/unmap domain callbacks 31 * @ops: map/unmap domain callbacks
36 * @host_data: Controller private data pointer 32 * @host_data: Controller private data pointer
37 * 33 *
@@ -39,41 +35,35 @@ static struct irq_domain *irq_default_domain;
39 * register allocated irq_domain with irq_domain_register(). Returns pointer 35 * register allocated irq_domain with irq_domain_register(). Returns pointer
40 * to IRQ domain, or NULL on failure. 36 * to IRQ domain, or NULL on failure.
41 */ 37 */
42static struct irq_domain *irq_domain_alloc(struct device_node *of_node, 38struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
43 unsigned int revmap_type, 39 irq_hw_number_t hwirq_max, int direct_max,
44 const struct irq_domain_ops *ops, 40 const struct irq_domain_ops *ops,
45 void *host_data) 41 void *host_data)
46{ 42{
47 struct irq_domain *domain; 43 struct irq_domain *domain;
48 44
49 domain = kzalloc_node(sizeof(*domain), GFP_KERNEL, 45 domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
50 of_node_to_nid(of_node)); 46 GFP_KERNEL, of_node_to_nid(of_node));
51 if (WARN_ON(!domain)) 47 if (WARN_ON(!domain))
52 return NULL; 48 return NULL;
53 49
54 /* Fill structure */ 50 /* Fill structure */
55 domain->revmap_type = revmap_type; 51 INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
56 domain->ops = ops; 52 domain->ops = ops;
57 domain->host_data = host_data; 53 domain->host_data = host_data;
58 domain->of_node = of_node_get(of_node); 54 domain->of_node = of_node_get(of_node);
55 domain->hwirq_max = hwirq_max;
56 domain->revmap_size = size;
57 domain->revmap_direct_max_irq = direct_max;
59 58
60 return domain;
61}
62
63static void irq_domain_free(struct irq_domain *domain)
64{
65 of_node_put(domain->of_node);
66 kfree(domain);
67}
68
69static void irq_domain_add(struct irq_domain *domain)
70{
71 mutex_lock(&irq_domain_mutex); 59 mutex_lock(&irq_domain_mutex);
72 list_add(&domain->link, &irq_domain_list); 60 list_add(&domain->link, &irq_domain_list);
73 mutex_unlock(&irq_domain_mutex); 61 mutex_unlock(&irq_domain_mutex);
74 pr_debug("Allocated domain of type %d @0x%p\n", 62
75 domain->revmap_type, domain); 63 pr_debug("Added domain %s\n", domain->name);
64 return domain;
76} 65}
66EXPORT_SYMBOL_GPL(__irq_domain_add);
77 67
78/** 68/**
79 * irq_domain_remove() - Remove an irq domain. 69 * irq_domain_remove() - Remove an irq domain.
@@ -87,29 +77,12 @@ void irq_domain_remove(struct irq_domain *domain)
87{ 77{
88 mutex_lock(&irq_domain_mutex); 78 mutex_lock(&irq_domain_mutex);
89 79
90 switch (domain->revmap_type) { 80 /*
91 case IRQ_DOMAIN_MAP_LEGACY: 81 * radix_tree_delete() takes care of destroying the root
92 /* 82 * node when all entries are removed. Shout if there are
93 * Legacy domains don't manage their own irq_desc 83 * any mappings left.
94 * allocations, we expect the caller to handle irq_desc 84 */
95 * freeing on their own. 85 WARN_ON(domain->revmap_tree.height);
96 */
97 break;
98 case IRQ_DOMAIN_MAP_TREE:
99 /*
100 * radix_tree_delete() takes care of destroying the root
101 * node when all entries are removed. Shout if there are
102 * any mappings left.
103 */
104 WARN_ON(domain->revmap_data.tree.height);
105 break;
106 case IRQ_DOMAIN_MAP_LINEAR:
107 kfree(domain->revmap_data.linear.revmap);
108 domain->revmap_data.linear.size = 0;
109 break;
110 case IRQ_DOMAIN_MAP_NOMAP:
111 break;
112 }
113 86
114 list_del(&domain->link); 87 list_del(&domain->link);
115 88
@@ -121,44 +94,30 @@ void irq_domain_remove(struct irq_domain *domain)
121 94
122 mutex_unlock(&irq_domain_mutex); 95 mutex_unlock(&irq_domain_mutex);
123 96
124 pr_debug("Removed domain of type %d @0x%p\n", 97 pr_debug("Removed domain %s\n", domain->name);
125 domain->revmap_type, domain);
126 98
127 irq_domain_free(domain); 99 of_node_put(domain->of_node);
100 kfree(domain);
128} 101}
129EXPORT_SYMBOL_GPL(irq_domain_remove); 102EXPORT_SYMBOL_GPL(irq_domain_remove);
130 103
131static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
132 irq_hw_number_t hwirq)
133{
134 irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq;
135 int size = domain->revmap_data.legacy.size;
136
137 if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size))
138 return 0;
139 return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq;
140}
141
142/** 104/**
143 * irq_domain_add_simple() - Allocate and register a simple irq_domain. 105 * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs
144 * @of_node: pointer to interrupt controller's device tree node. 106 * @of_node: pointer to interrupt controller's device tree node.
145 * @size: total number of irqs in mapping 107 * @size: total number of irqs in mapping
146 * @first_irq: first number of irq block assigned to the domain, 108 * @first_irq: first number of irq block assigned to the domain,
147 * pass zero to assign irqs on-the-fly. This will result in a 109 * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then
148 * linear IRQ domain so it is important to use irq_create_mapping() 110 * pre-map all of the irqs in the domain to virqs starting at first_irq.
149 * for each used IRQ, especially when SPARSE_IRQ is enabled.
150 * @ops: map/unmap domain callbacks 111 * @ops: map/unmap domain callbacks
151 * @host_data: Controller private data pointer 112 * @host_data: Controller private data pointer
152 * 113 *
153 * Allocates a legacy irq_domain if irq_base is positive or a linear 114 * Allocates an irq_domain, and optionally if first_irq is positive then also
154 * domain otherwise. For the legacy domain, IRQ descriptors will also 115 * allocate irq_descs and map all of the hwirqs to virqs starting at first_irq.
155 * be allocated.
156 * 116 *
157 * This is intended to implement the expected behaviour for most 117 * This is intended to implement the expected behaviour for most
158 * interrupt controllers which is that a linear mapping should 118 * interrupt controllers. If device tree is used, then first_irq will be 0 and
159 * normally be used unless the system requires a legacy mapping in 119 * irqs get mapped dynamically on the fly. However, if the controller requires
160 * order to support supplying interrupt numbers during non-DT 120 * static virq assignments (non-DT boot) then it will set that up correctly.
161 * registration of devices.
162 */ 121 */
163struct irq_domain *irq_domain_add_simple(struct device_node *of_node, 122struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
164 unsigned int size, 123 unsigned int size,
@@ -166,33 +125,25 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
166 const struct irq_domain_ops *ops, 125 const struct irq_domain_ops *ops,
167 void *host_data) 126 void *host_data)
168{ 127{
169 if (first_irq > 0) { 128 struct irq_domain *domain;
170 int irq_base;
171 129
130 domain = __irq_domain_add(of_node, size, size, 0, ops, host_data);
131 if (!domain)
132 return NULL;
133
134 if (first_irq > 0) {
172 if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { 135 if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
173 /* 136 /* attempt to allocated irq_descs */
174 * Set the descriptor allocator to search for a 137 int rc = irq_alloc_descs(first_irq, first_irq, size,
175 * 1-to-1 mapping, such as irq_alloc_desc_at(). 138 of_node_to_nid(of_node));
176 * Use of_node_to_nid() which is defined to 139 if (rc < 0)
177 * numa_node_id() on platforms that have no custom
178 * implementation.
179 */
180 irq_base = irq_alloc_descs(first_irq, first_irq, size,
181 of_node_to_nid(of_node));
182 if (irq_base < 0) {
183 pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", 140 pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
184 first_irq); 141 first_irq);
185 irq_base = first_irq; 142 }
186 } 143 irq_domain_associate_many(domain, first_irq, 0, size);
187 } else
188 irq_base = first_irq;
189
190 return irq_domain_add_legacy(of_node, size, irq_base, 0,
191 ops, host_data);
192 } 144 }
193 145
194 /* A linear domain is the default */ 146 return domain;
195 return irq_domain_add_linear(of_node, size, ops, host_data);
196} 147}
197EXPORT_SYMBOL_GPL(irq_domain_add_simple); 148EXPORT_SYMBOL_GPL(irq_domain_add_simple);
198 149
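A hedged sketch of the typical caller of irq_domain_add_simple(): a device-tree driver passes first_irq == 0 so descriptors are allocated on demand, supplies a ->map() callback to wire up each virq, and maps individual hwirqs lazily with irq_create_mapping(). The chip, ops and probe names are placeholders:

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static struct irq_chip demo_chip;	/* assumed to be filled in elsewhere */

static int demo_domain_map(struct irq_domain *d, unsigned int virq,
			   irq_hw_number_t hwirq)
{
	irq_set_chip_and_handler(virq, &demo_chip, handle_level_irq);
	irq_set_chip_data(virq, d->host_data);
	return 0;
}

static const struct irq_domain_ops demo_domain_ops = {
	.map	= demo_domain_map,
	.xlate	= irq_domain_xlate_onecell,
};

static struct irq_domain *demo_probe(struct device_node *np, void *priv)
{
	/* 32 hwirqs, dynamic virqs; hwirq 5 is mapped lazily as an example */
	struct irq_domain *d = irq_domain_add_simple(np, 32, 0,
						     &demo_domain_ops, priv);
	if (d)
		irq_create_mapping(d, 5);
	return d;
}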
@@ -219,131 +170,19 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
219 void *host_data) 170 void *host_data)
220{ 171{
221 struct irq_domain *domain; 172 struct irq_domain *domain;
222 unsigned int i;
223 173
224 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); 174 domain = __irq_domain_add(of_node, first_hwirq + size,
175 first_hwirq + size, 0, ops, host_data);
225 if (!domain) 176 if (!domain)
226 return NULL; 177 return NULL;
227 178
228 domain->revmap_data.legacy.first_irq = first_irq; 179 irq_domain_associate_many(domain, first_irq, first_hwirq, size);
229 domain->revmap_data.legacy.first_hwirq = first_hwirq;
230 domain->revmap_data.legacy.size = size;
231
232 mutex_lock(&irq_domain_mutex);
233 /* Verify that all the irqs are available */
234 for (i = 0; i < size; i++) {
235 int irq = first_irq + i;
236 struct irq_data *irq_data = irq_get_irq_data(irq);
237
238 if (WARN_ON(!irq_data || irq_data->domain)) {
239 mutex_unlock(&irq_domain_mutex);
240 irq_domain_free(domain);
241 return NULL;
242 }
243 }
244
245 /* Claim all of the irqs before registering a legacy domain */
246 for (i = 0; i < size; i++) {
247 struct irq_data *irq_data = irq_get_irq_data(first_irq + i);
248 irq_data->hwirq = first_hwirq + i;
249 irq_data->domain = domain;
250 }
251 mutex_unlock(&irq_domain_mutex);
252
253 for (i = 0; i < size; i++) {
254 int irq = first_irq + i;
255 int hwirq = first_hwirq + i;
256
257 /* IRQ0 gets ignored */
258 if (!irq)
259 continue;
260 180
261 /* Legacy flags are left to default at this point,
262 * one can then use irq_create_mapping() to
263 * explicitly change them
264 */
265 if (ops->map)
266 ops->map(domain, irq, hwirq);
267
268 /* Clear norequest flags */
269 irq_clear_status_flags(irq, IRQ_NOREQUEST);
270 }
271
272 irq_domain_add(domain);
273 return domain; 181 return domain;
274} 182}
275EXPORT_SYMBOL_GPL(irq_domain_add_legacy); 183EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
276 184
277/** 185/**
278 * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
279 * @of_node: pointer to interrupt controller's device tree node.
280 * @size: Number of interrupts in the domain.
281 * @ops: map/unmap domain callbacks
282 * @host_data: Controller private data pointer
283 */
284struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
285 unsigned int size,
286 const struct irq_domain_ops *ops,
287 void *host_data)
288{
289 struct irq_domain *domain;
290 unsigned int *revmap;
291
292 revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL,
293 of_node_to_nid(of_node));
294 if (WARN_ON(!revmap))
295 return NULL;
296
297 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data);
298 if (!domain) {
299 kfree(revmap);
300 return NULL;
301 }
302 domain->revmap_data.linear.size = size;
303 domain->revmap_data.linear.revmap = revmap;
304 irq_domain_add(domain);
305 return domain;
306}
307EXPORT_SYMBOL_GPL(irq_domain_add_linear);
308
309struct irq_domain *irq_domain_add_nomap(struct device_node *of_node,
310 unsigned int max_irq,
311 const struct irq_domain_ops *ops,
312 void *host_data)
313{
314 struct irq_domain *domain = irq_domain_alloc(of_node,
315 IRQ_DOMAIN_MAP_NOMAP, ops, host_data);
316 if (domain) {
317 domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0;
318 irq_domain_add(domain);
319 }
320 return domain;
321}
322EXPORT_SYMBOL_GPL(irq_domain_add_nomap);
323
324/**
325 * irq_domain_add_tree()
326 * @of_node: pointer to interrupt controller's device tree node.
327 * @ops: map/unmap domain callbacks
328 *
329 * Note: The radix tree will be allocated later during boot automatically
330 * (the reverse mapping will use the slow path until that happens).
331 */
332struct irq_domain *irq_domain_add_tree(struct device_node *of_node,
333 const struct irq_domain_ops *ops,
334 void *host_data)
335{
336 struct irq_domain *domain = irq_domain_alloc(of_node,
337 IRQ_DOMAIN_MAP_TREE, ops, host_data);
338 if (domain) {
339 INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL);
340 irq_domain_add(domain);
341 }
342 return domain;
343}
344EXPORT_SYMBOL_GPL(irq_domain_add_tree);
345
346/**
347 * irq_find_host() - Locates a domain for a given device node 186 * irq_find_host() - Locates a domain for a given device node
348 * @node: device-tree node of the interrupt controller 187 * @node: device-tree node of the interrupt controller
349 */ 188 */
@@ -391,125 +230,108 @@ void irq_set_default_host(struct irq_domain *domain)
391} 230}
392EXPORT_SYMBOL_GPL(irq_set_default_host); 231EXPORT_SYMBOL_GPL(irq_set_default_host);
393 232
394static void irq_domain_disassociate_many(struct irq_domain *domain, 233static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
395 unsigned int irq_base, int count)
396{ 234{
397 /* 235 struct irq_data *irq_data = irq_get_irq_data(irq);
398 * disassociate in reverse order; 236 irq_hw_number_t hwirq;
399 * not strictly necessary, but nice for unwinding
400 */
401 while (count--) {
402 int irq = irq_base + count;
403 struct irq_data *irq_data = irq_get_irq_data(irq);
404 irq_hw_number_t hwirq;
405 237
406 if (WARN_ON(!irq_data || irq_data->domain != domain)) 238 if (WARN(!irq_data || irq_data->domain != domain,
407 continue; 239 "virq%i doesn't exist; cannot disassociate\n", irq))
240 return;
408 241
409 hwirq = irq_data->hwirq; 242 hwirq = irq_data->hwirq;
410 irq_set_status_flags(irq, IRQ_NOREQUEST); 243 irq_set_status_flags(irq, IRQ_NOREQUEST);
411 244
412 /* remove chip and handler */ 245 /* remove chip and handler */
413 irq_set_chip_and_handler(irq, NULL, NULL); 246 irq_set_chip_and_handler(irq, NULL, NULL);
414 247
415 /* Make sure it's completed */ 248 /* Make sure it's completed */
416 synchronize_irq(irq); 249 synchronize_irq(irq);
417 250
418 /* Tell the PIC about it */ 251 /* Tell the PIC about it */
419 if (domain->ops->unmap) 252 if (domain->ops->unmap)
420 domain->ops->unmap(domain, irq); 253 domain->ops->unmap(domain, irq);
421 smp_mb(); 254 smp_mb();
422 255
423 irq_data->domain = NULL; 256 irq_data->domain = NULL;
424 irq_data->hwirq = 0; 257 irq_data->hwirq = 0;
425 258
426 /* Clear reverse map */ 259 /* Clear reverse map for this hwirq */
427 switch(domain->revmap_type) { 260 if (hwirq < domain->revmap_size) {
428 case IRQ_DOMAIN_MAP_LINEAR: 261 domain->linear_revmap[hwirq] = 0;
429 if (hwirq < domain->revmap_data.linear.size) 262 } else {
430 domain->revmap_data.linear.revmap[hwirq] = 0; 263 mutex_lock(&revmap_trees_mutex);
431 break; 264 radix_tree_delete(&domain->revmap_tree, hwirq);
432 case IRQ_DOMAIN_MAP_TREE: 265 mutex_unlock(&revmap_trees_mutex);
433 mutex_lock(&revmap_trees_mutex);
434 radix_tree_delete(&domain->revmap_data.tree, hwirq);
435 mutex_unlock(&revmap_trees_mutex);
436 break;
437 }
438 } 266 }
439} 267}
440 268
441int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, 269int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
442 irq_hw_number_t hwirq_base, int count) 270 irq_hw_number_t hwirq)
443{ 271{
444 unsigned int virq = irq_base; 272 struct irq_data *irq_data = irq_get_irq_data(virq);
445 irq_hw_number_t hwirq = hwirq_base; 273 int ret;
446 int i, ret;
447 274
448 pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, 275 if (WARN(hwirq >= domain->hwirq_max,
449 of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); 276 "error: hwirq 0x%x is too large for %s\n", (int)hwirq, domain->name))
277 return -EINVAL;
278 if (WARN(!irq_data, "error: virq%i is not allocated", virq))
279 return -EINVAL;
280 if (WARN(irq_data->domain, "error: virq%i is already associated", virq))
281 return -EINVAL;
450 282
451 for (i = 0; i < count; i++) { 283 mutex_lock(&irq_domain_mutex);
452 struct irq_data *irq_data = irq_get_irq_data(virq + i); 284 irq_data->hwirq = hwirq;
453 285 irq_data->domain = domain;
454 if (WARN(!irq_data, "error: irq_desc not allocated; " 286 if (domain->ops->map) {
455 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) 287 ret = domain->ops->map(domain, virq, hwirq);
456 return -EINVAL; 288 if (ret != 0) {
457 if (WARN(irq_data->domain, "error: irq_desc already associated; " 289 /*
458 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) 290 * If map() returns -EPERM, this interrupt is protected
459 return -EINVAL; 291 * by the firmware or some other service and shall not
460 }; 292 * be mapped. Don't bother telling the user about it.
461 293 */
462 for (i = 0; i < count; i++, virq++, hwirq++) { 294 if (ret != -EPERM) {
463 struct irq_data *irq_data = irq_get_irq_data(virq); 295 pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n",
464 296 domain->name, hwirq, virq, ret);
465 irq_data->hwirq = hwirq;
466 irq_data->domain = domain;
467 if (domain->ops->map) {
468 ret = domain->ops->map(domain, virq, hwirq);
469 if (ret != 0) {
470 /*
471 * If map() returns -EPERM, this interrupt is protected
472 * by the firmware or some other service and shall not
473 * be mapped.
474 *
475 * Since on some platforms we blindly try to map everything
476 * we end up with a log full of backtraces.
477 *
478 * So instead, we silently fail on -EPERM, it is the
479 * responsibility of the PIC driver to display a relevant
480 * message if needed.
481 */
482 if (ret != -EPERM) {
483 pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
484 virq, hwirq, ret);
485 WARN_ON(1);
486 }
487 irq_data->domain = NULL;
488 irq_data->hwirq = 0;
489 goto err_unmap;
490 } 297 }
298 irq_data->domain = NULL;
299 irq_data->hwirq = 0;
300 mutex_unlock(&irq_domain_mutex);
301 return ret;
491 } 302 }
492 303
493 switch (domain->revmap_type) { 304 /* If not already assigned, give the domain the chip's name */
494 case IRQ_DOMAIN_MAP_LINEAR: 305 if (!domain->name && irq_data->chip)
495 if (hwirq < domain->revmap_data.linear.size) 306 domain->name = irq_data->chip->name;
496 domain->revmap_data.linear.revmap[hwirq] = virq; 307 }
497 break;
498 case IRQ_DOMAIN_MAP_TREE:
499 mutex_lock(&revmap_trees_mutex);
500 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
501 mutex_unlock(&revmap_trees_mutex);
502 break;
503 }
504 308
505 irq_clear_status_flags(virq, IRQ_NOREQUEST); 309 if (hwirq < domain->revmap_size) {
310 domain->linear_revmap[hwirq] = virq;
311 } else {
312 mutex_lock(&revmap_trees_mutex);
313 radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
314 mutex_unlock(&revmap_trees_mutex);
506 } 315 }
316 mutex_unlock(&irq_domain_mutex);
317
318 irq_clear_status_flags(virq, IRQ_NOREQUEST);
507 319
508 return 0; 320 return 0;
321}
322EXPORT_SYMBOL_GPL(irq_domain_associate);
509 323
510 err_unmap: 324void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
511 irq_domain_disassociate_many(domain, irq_base, i); 325 irq_hw_number_t hwirq_base, int count)
512 return -EINVAL; 326{
327 int i;
328
329 pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
330 of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count);
331
332 for (i = 0; i < count; i++) {
333 irq_domain_associate(domain, irq_base + i, hwirq_base + i);
334 }
513} 335}
514EXPORT_SYMBOL_GPL(irq_domain_associate_many); 336EXPORT_SYMBOL_GPL(irq_domain_associate_many);
515 337
@@ -519,7 +341,9 @@ EXPORT_SYMBOL_GPL(irq_domain_associate_many);
519 * 341 *
520 * This routine is used for irq controllers which can choose the hardware 342 * This routine is used for irq controllers which can choose the hardware
521 * interrupt numbers they generate. In such a case it's simplest to use 343 * interrupt numbers they generate. In such a case it's simplest to use
522 * the linux irq as the hardware interrupt number. 344 * the linux irq as the hardware interrupt number. It still uses the linear
345 * or radix tree to store the mapping, but the irq controller can optimize
346 * the revmap path by using the hwirq directly.
523 */ 347 */
524unsigned int irq_create_direct_mapping(struct irq_domain *domain) 348unsigned int irq_create_direct_mapping(struct irq_domain *domain)
525{ 349{
@@ -528,17 +352,14 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
528 if (domain == NULL) 352 if (domain == NULL)
529 domain = irq_default_domain; 353 domain = irq_default_domain;
530 354
531 if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP))
532 return 0;
533
534 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); 355 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
535 if (!virq) { 356 if (!virq) {
536 pr_debug("create_direct virq allocation failed\n"); 357 pr_debug("create_direct virq allocation failed\n");
537 return 0; 358 return 0;
538 } 359 }
539 if (virq >= domain->revmap_data.nomap.max_irq) { 360 if (virq >= domain->revmap_direct_max_irq) {
540 pr_err("ERROR: no free irqs available below %i maximum\n", 361 pr_err("ERROR: no free irqs available below %i maximum\n",
541 domain->revmap_data.nomap.max_irq); 362 domain->revmap_direct_max_irq);
542 irq_free_desc(virq); 363 irq_free_desc(virq);
543 return 0; 364 return 0;
544 } 365 }
@@ -575,9 +396,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
575 if (domain == NULL) 396 if (domain == NULL)
576 domain = irq_default_domain; 397 domain = irq_default_domain;
577 if (domain == NULL) { 398 if (domain == NULL) {
578 pr_warning("irq_create_mapping called for" 399 WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq);
579 " NULL domain, hwirq=%lx\n", hwirq);
580 WARN_ON(1);
581 return 0; 400 return 0;
582 } 401 }
583 pr_debug("-> using domain @%p\n", domain); 402 pr_debug("-> using domain @%p\n", domain);
@@ -589,10 +408,6 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
589 return virq; 408 return virq;
590 } 409 }
591 410
592 /* Get a virtual interrupt number */
593 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
594 return irq_domain_legacy_revmap(domain, hwirq);
595
596 /* Allocate a virtual interrupt number */ 411 /* Allocate a virtual interrupt number */
597 hint = hwirq % nr_irqs; 412 hint = hwirq % nr_irqs;
598 if (hint == 0) 413 if (hint == 0)
@@ -645,12 +460,7 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
645 if (unlikely(ret < 0)) 460 if (unlikely(ret < 0))
646 return ret; 461 return ret;
647 462
648 ret = irq_domain_associate_many(domain, irq_base, hwirq_base, count); 463 irq_domain_associate_many(domain, irq_base, hwirq_base, count);
649 if (unlikely(ret < 0)) {
650 irq_free_descs(irq_base, count);
651 return ret;
652 }
653
654 return 0; 464 return 0;
655} 465}
656EXPORT_SYMBOL_GPL(irq_create_strict_mappings); 466EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
@@ -665,20 +475,8 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
665 475
666 domain = controller ? irq_find_host(controller) : irq_default_domain; 476 domain = controller ? irq_find_host(controller) : irq_default_domain;
667 if (!domain) { 477 if (!domain) {
668#ifdef CONFIG_MIPS 478 pr_warn("no irq domain found for %s !\n",
669 /* 479 of_node_full_name(controller));
670 * Workaround to avoid breaking interrupt controller drivers
671 * that don't yet register an irq_domain. This is temporary
672 * code. ~~~gcl, Feb 24, 2012
673 *
674 * Scheduled for removal in Linux v3.6. That should be enough
675 * time.
676 */
677 if (intsize > 0)
678 return intspec[0];
679#endif
680 pr_warning("no irq domain found for %s !\n",
681 of_node_full_name(controller));
682 return 0; 480 return 0;
683 } 481 }
684 482
@@ -698,7 +496,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
698 496
699 /* Set type if specified and different than the current one */ 497 /* Set type if specified and different than the current one */
700 if (type != IRQ_TYPE_NONE && 498 if (type != IRQ_TYPE_NONE &&
701 type != (irqd_get_trigger_type(irq_get_irq_data(virq)))) 499 type != irq_get_trigger_type(virq))
702 irq_set_irq_type(virq, type); 500 irq_set_irq_type(virq, type);
703 return virq; 501 return virq;
704} 502}
@@ -720,11 +518,7 @@ void irq_dispose_mapping(unsigned int virq)
720 if (WARN_ON(domain == NULL)) 518 if (WARN_ON(domain == NULL))
721 return; 519 return;
722 520
723 /* Never unmap legacy interrupts */ 521 irq_domain_disassociate(domain, virq);
724 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
725 return;
726
727 irq_domain_disassociate_many(domain, virq, 1);
728 irq_free_desc(virq); 522 irq_free_desc(virq);
729} 523}
730EXPORT_SYMBOL_GPL(irq_dispose_mapping); 524EXPORT_SYMBOL_GPL(irq_dispose_mapping);
@@ -745,63 +539,51 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
745 if (domain == NULL) 539 if (domain == NULL)
746 return 0; 540 return 0;
747 541
748 switch (domain->revmap_type) { 542 if (hwirq < domain->revmap_direct_max_irq) {
749 case IRQ_DOMAIN_MAP_LEGACY:
750 return irq_domain_legacy_revmap(domain, hwirq);
751 case IRQ_DOMAIN_MAP_LINEAR:
752 return irq_linear_revmap(domain, hwirq);
753 case IRQ_DOMAIN_MAP_TREE:
754 rcu_read_lock();
755 data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
756 rcu_read_unlock();
757 if (data)
758 return data->irq;
759 break;
760 case IRQ_DOMAIN_MAP_NOMAP:
761 data = irq_get_irq_data(hwirq); 543 data = irq_get_irq_data(hwirq);
762 if (data && (data->domain == domain) && (data->hwirq == hwirq)) 544 if (data && (data->domain == domain) && (data->hwirq == hwirq))
763 return hwirq; 545 return hwirq;
764 break;
765 } 546 }
766 547
767 return 0; 548 /* Check if the hwirq is in the linear revmap. */
768} 549 if (hwirq < domain->revmap_size)
769EXPORT_SYMBOL_GPL(irq_find_mapping); 550 return domain->linear_revmap[hwirq];
770
771/**
772 * irq_linear_revmap() - Find a linux irq from a hw irq number.
773 * @domain: domain owning this hardware interrupt
774 * @hwirq: hardware irq number in that domain space
775 *
776 * This is a fast path that can be called directly by irq controller code to
777 * save a handful of instructions.
778 */
779unsigned int irq_linear_revmap(struct irq_domain *domain,
780 irq_hw_number_t hwirq)
781{
782 BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR);
783
784 /* Check revmap bounds; complain if exceeded */
785 if (WARN_ON(hwirq >= domain->revmap_data.linear.size))
786 return 0;
787 551
788 return domain->revmap_data.linear.revmap[hwirq]; 552 rcu_read_lock();
553 data = radix_tree_lookup(&domain->revmap_tree, hwirq);
554 rcu_read_unlock();
555 return data ? data->irq : 0;
789} 556}
790EXPORT_SYMBOL_GPL(irq_linear_revmap); 557EXPORT_SYMBOL_GPL(irq_find_mapping);
791 558
792#ifdef CONFIG_IRQ_DOMAIN_DEBUG 559#ifdef CONFIG_IRQ_DOMAIN_DEBUG
793static int virq_debug_show(struct seq_file *m, void *private) 560static int virq_debug_show(struct seq_file *m, void *private)
794{ 561{
795 unsigned long flags; 562 unsigned long flags;
796 struct irq_desc *desc; 563 struct irq_desc *desc;
797 const char *p; 564 struct irq_domain *domain;
798 static const char none[] = "none"; 565 struct radix_tree_iter iter;
799 void *data; 566 void *data, **slot;
800 int i; 567 int i;
801 568
802 seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq", 569 seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
570 "name", "mapped", "linear-max", "direct-max", "devtree-node");
571 mutex_lock(&irq_domain_mutex);
572 list_for_each_entry(domain, &irq_domain_list, link) {
573 int count = 0;
574 radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0)
575 count++;
576 seq_printf(m, "%c%-16s %6u %10u %10u %s\n",
577 domain == irq_default_domain ? '*' : ' ', domain->name,
578 domain->revmap_size + count, domain->revmap_size,
579 domain->revmap_direct_max_irq,
580 domain->of_node ? of_node_full_name(domain->of_node) : "");
581 }
582 mutex_unlock(&irq_domain_mutex);
583
584 seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq",
803 "chip name", (int)(2 * sizeof(void *) + 2), "chip data", 585 "chip name", (int)(2 * sizeof(void *) + 2), "chip data",
804 "domain name"); 586 "active", "type", "domain");
805 587
806 for (i = 1; i < nr_irqs; i++) { 588 for (i = 1; i < nr_irqs; i++) {
807 desc = irq_to_desc(i); 589 desc = irq_to_desc(i);
@@ -809,28 +591,28 @@ static int virq_debug_show(struct seq_file *m, void *private)
809 continue; 591 continue;
810 592
811 raw_spin_lock_irqsave(&desc->lock, flags); 593 raw_spin_lock_irqsave(&desc->lock, flags);
594 domain = desc->irq_data.domain;
812 595
813 if (desc->action && desc->action->handler) { 596 if (domain) {
814 struct irq_chip *chip; 597 struct irq_chip *chip;
598 int hwirq = desc->irq_data.hwirq;
599 bool direct;
815 600
816 seq_printf(m, "%5d ", i); 601 seq_printf(m, "%5d ", i);
817 seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); 602 seq_printf(m, "0x%05x ", hwirq);
818 603
819 chip = irq_desc_get_chip(desc); 604 chip = irq_desc_get_chip(desc);
820 if (chip && chip->name) 605 seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
821 p = chip->name;
822 else
823 p = none;
824 seq_printf(m, "%-15s ", p);
825 606
826 data = irq_desc_get_chip_data(desc); 607 data = irq_desc_get_chip_data(desc);
827 seq_printf(m, data ? "0x%p " : " %p ", data); 608 seq_printf(m, data ? "0x%p " : " %p ", data);
828 609
829 if (desc->irq_data.domain) 610 seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
830 p = of_node_full_name(desc->irq_data.domain->of_node); 611 direct = (i == hwirq) && (i < domain->revmap_direct_max_irq);
831 else 612 seq_printf(m, "%6s%-8s ",
832 p = none; 613 (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX",
833 seq_printf(m, "%s\n", p); 614 direct ? "(DIRECT)" : "");
615 seq_printf(m, "%s\n", desc->irq_data.domain->name);
834 } 616 }
835 617
836 raw_spin_unlock_irqrestore(&desc->lock, flags); 618 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -927,18 +709,3 @@ const struct irq_domain_ops irq_domain_simple_ops = {
927 .xlate = irq_domain_xlate_onetwocell, 709 .xlate = irq_domain_xlate_onetwocell,
928}; 710};
929EXPORT_SYMBOL_GPL(irq_domain_simple_ops); 711EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
930
931#ifdef CONFIG_OF_IRQ
932void irq_domain_generate_simple(const struct of_device_id *match,
933 u64 phys_base, unsigned int irq_start)
934{
935 struct device_node *node;
936 pr_debug("looking for phys_base=%llx, irq_start=%i\n",
937 (unsigned long long) phys_base, (int) irq_start);
938 node = of_find_matching_node_by_address(NULL, match, phys_base);
939 if (node)
940 irq_domain_add_legacy(node, 32, irq_start, 0,
941 &irq_domain_simple_ops, NULL);
942}
943EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
944#endif
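
The consolidated irq_find_mapping() above replaces the old switch on revmap_type with a single path: an identity check for direct-mapped (nomap) domains, a linear array for hwirqs below revmap_size, and the radix tree for sparse hwirqs. The userspace sketch below models just the linear-plus-sparse part; struct demo_domain, the linked-list stand-in for the radix tree and the sample numbers are illustrative, not kernel API.

/*
 * Userspace sketch of the hybrid reverse map that irq_find_mapping()
 * now implements: a flat array for small hwirq numbers and a sparse
 * lookup (here a trivial linked list standing in for the radix tree)
 * for everything else.
 */
#include <stdio.h>

struct sparse_entry {
	unsigned long hwirq;
	unsigned int virq;
	struct sparse_entry *next;
};

struct demo_domain {
	unsigned int revmap_size;	/* size of the linear array */
	unsigned int *linear_revmap;	/* hwirq -> virq for small hwirqs */
	struct sparse_entry *tree;	/* fallback for large hwirqs */
};

static unsigned int demo_find_mapping(struct demo_domain *d, unsigned long hwirq)
{
	struct sparse_entry *e;

	/* Fast path: small hwirqs live in the linear array. */
	if (hwirq < d->revmap_size)
		return d->linear_revmap[hwirq];

	/* Slow path: walk the sparse map (radix tree in the kernel). */
	for (e = d->tree; e; e = e->next)
		if (e->hwirq == hwirq)
			return e->virq;
	return 0;	/* no mapping */
}

int main(void)
{
	unsigned int linear[4] = { 0, 16, 17, 18 };
	struct sparse_entry big = { .hwirq = 1000, .virq = 99, .next = NULL };
	struct demo_domain d = { .revmap_size = 4, .linear_revmap = linear, .tree = &big };

	printf("hwirq 2    -> virq %u\n", demo_find_mapping(&d, 2));
	printf("hwirq 1000 -> virq %u\n", demo_find_mapping(&d, 1000));
	printf("hwirq 7    -> virq %u\n", demo_find_mapping(&d, 7));
	return 0;
}

Keeping the common, densely numbered hwirqs in a flat array makes the hot lookup a single bounds check and load, while the radix tree only pays its cost for the rare large hwirq values.
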
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index fa17855ca65a..514bcfd855a8 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -555,9 +555,9 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
555 return 0; 555 return 0;
556 556
557 if (irq_settings_can_request(desc)) { 557 if (irq_settings_can_request(desc)) {
558 if (desc->action) 558 if (!desc->action ||
559 if (irqflags & desc->action->flags & IRQF_SHARED) 559 irqflags & desc->action->flags & IRQF_SHARED)
560 canrequest =1; 560 canrequest = 1;
561 } 561 }
562 irq_put_desc_unlock(desc, flags); 562 irq_put_desc_unlock(desc, flags);
563 return canrequest; 563 return canrequest;
@@ -840,9 +840,6 @@ static void irq_thread_dtor(struct callback_head *unused)
840static int irq_thread(void *data) 840static int irq_thread(void *data)
841{ 841{
842 struct callback_head on_exit_work; 842 struct callback_head on_exit_work;
843 static const struct sched_param param = {
844 .sched_priority = MAX_USER_RT_PRIO/2,
845 };
846 struct irqaction *action = data; 843 struct irqaction *action = data;
847 struct irq_desc *desc = irq_to_desc(action->irq); 844 struct irq_desc *desc = irq_to_desc(action->irq);
848 irqreturn_t (*handler_fn)(struct irq_desc *desc, 845 irqreturn_t (*handler_fn)(struct irq_desc *desc,
@@ -854,8 +851,6 @@ static int irq_thread(void *data)
854 else 851 else
855 handler_fn = irq_thread_fn; 852 handler_fn = irq_thread_fn;
856 853
857 sched_setscheduler(current, SCHED_FIFO, &param);
858
859 init_task_work(&on_exit_work, irq_thread_dtor); 854 init_task_work(&on_exit_work, irq_thread_dtor);
860 task_work_add(current, &on_exit_work, false); 855 task_work_add(current, &on_exit_work, false);
861 856
@@ -950,6 +945,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
950 */ 945 */
951 if (new->thread_fn && !nested) { 946 if (new->thread_fn && !nested) {
952 struct task_struct *t; 947 struct task_struct *t;
948 static const struct sched_param param = {
949 .sched_priority = MAX_USER_RT_PRIO/2,
950 };
953 951
954 t = kthread_create(irq_thread, new, "irq/%d-%s", irq, 952 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
955 new->name); 953 new->name);
@@ -957,6 +955,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
957 ret = PTR_ERR(t); 955 ret = PTR_ERR(t);
958 goto out_mput; 956 goto out_mput;
959 } 957 }
958
959 sched_setscheduler(t, SCHED_FIFO, &param);
960
960 /* 961 /*
961 * We keep the reference to the task struct even if 962 * We keep the reference to the task struct even if
962 * the thread dies to avoid that the interrupt code 963 * the thread dies to avoid that the interrupt code
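
The manage.c change moves the SCHED_FIFO promotion out of irq_thread() and into __setup_irq(), so the creator applies the real-time policy to the freshly created kthread before it ever runs. A rough userspace analogue of that ordering, using POSIX threads (SCHED_FIFO normally needs CAP_SYS_NICE, and the priority value 50 is just a demo choice):

#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

static void *worker(void *arg)
{
	(void)arg;
	/* The worker no longer promotes itself; policy is already set. */
	return NULL;
}

int main(void)
{
	pthread_t t;
	struct sched_param param = { .sched_priority = 50 };	/* demo value */
	int err;

	if (pthread_create(&t, NULL, worker, NULL) != 0)
		return 1;

	/* Creator sets SCHED_FIFO on the new thread, mirroring __setup_irq(). */
	err = pthread_setschedparam(t, SCHED_FIFO, &param);
	if (err)
		fprintf(stderr, "pthread_setschedparam: %s (needs privileges)\n",
			strerror(err));

	pthread_join(t, NULL);
	return 0;
}
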
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 19ed5c425c3b..36f6ee181b0c 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -462,6 +462,8 @@ int show_interrupts(struct seq_file *p, void *v)
462 } else { 462 } else {
463 seq_printf(p, " %8s", "None"); 463 seq_printf(p, " %8s", "None");
464 } 464 }
465 if (desc->irq_data.domain)
466 seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq);
465#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL 467#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
466 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); 468 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
467#endif 469#endif
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8241906c4b61..fb326365b694 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -147,6 +147,9 @@ int __request_module(bool wait, const char *fmt, ...)
147 */ 147 */
148 WARN_ON_ONCE(wait && current_is_async()); 148 WARN_ON_ONCE(wait && current_is_async());
149 149
150 if (!modprobe_path[0])
151 return 0;
152
150 va_start(args, fmt); 153 va_start(args, fmt);
151 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 154 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
152 va_end(args); 155 va_end(args);
@@ -569,14 +572,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
569 int retval = 0; 572 int retval = 0;
570 573
571 helper_lock(); 574 helper_lock();
572 if (!sub_info->path) {
573 retval = -EINVAL;
574 goto out;
575 }
576
577 if (sub_info->path[0] == '\0')
578 goto out;
579
580 if (!khelper_wq || usermodehelper_disabled) { 575 if (!khelper_wq || usermodehelper_disabled) {
581 retval = -EBUSY; 576 retval = -EBUSY;
582 goto out; 577 goto out;
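
The kmod.c hunks fold the empty-path checks into __request_module() itself: if modprobe_path has been cleared via sysctl, the request now returns before formatting the module name or spawning a helper. A minimal sketch of that guard pattern; request_helper(), helper_path and build_and_run() are made-up names for illustration:

#include <stdio.h>

static char helper_path[256] = "";	/* imagine this is set via sysctl */

static int build_and_run(const char *name)
{
	/* stands in for the expensive argv/env setup and exec */
	printf("running %s for %s\n", helper_path, name);
	return 0;
}

static int request_helper(const char *name)
{
	/* Bail out early: an empty path means "feature disabled", not an error. */
	if (!helper_path[0])
		return 0;

	return build_and_run(name);
}

int main(void)
{
	request_helper("demo-module");	/* silently does nothing: path is empty */
	return 0;
}
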
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index bddf3b201a48..6e33498d665c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2332,6 +2332,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
2332 if (copy_from_user(buf, user_buf, buf_size)) 2332 if (copy_from_user(buf, user_buf, buf_size))
2333 return -EFAULT; 2333 return -EFAULT;
2334 2334
2335 buf[buf_size] = '\0';
2335 switch (buf[0]) { 2336 switch (buf[0]) {
2336 case 'y': 2337 case 'y':
2337 case 'Y': 2338 case 'Y':
@@ -2343,6 +2344,8 @@ static ssize_t write_enabled_file_bool(struct file *file,
2343 case '0': 2344 case '0':
2344 disarm_all_kprobes(); 2345 disarm_all_kprobes();
2345 break; 2346 break;
2347 default:
2348 return -EINVAL;
2346 } 2349 }
2347 2350
2348 return count; 2351 return count;
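
The kprobes.c fix NUL-terminates the copied buffer and rejects anything other than y/Y/1 or n/N/0 instead of silently returning success. The same parse-then-default pattern in standalone form; parse_bool_buf() is an illustrative helper, not kernel code:

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Returns 1 for enable, 0 for disable, -EINVAL for anything else. */
static int parse_bool_buf(const char *user_buf, size_t count)
{
	char buf[32];
	size_t buf_size = count < sizeof(buf) - 1 ? count : sizeof(buf) - 1;

	memcpy(buf, user_buf, buf_size);
	buf[buf_size] = '\0';		/* never trust the input to be terminated */

	switch (buf[0]) {
	case 'y': case 'Y': case '1':
		return 1;
	case 'n': case 'N': case '0':
		return 0;
	default:
		return -EINVAL;		/* reject garbage instead of ignoring it */
	}
}

int main(void)
{
	printf("%d %d %d\n",
	       parse_bool_buf("y\n", 2),
	       parse_bool_buf("0", 1),
	       parse_bool_buf("maybe", 5));
	return 0;
}
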
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 1f3186b37fd5..e16c45b9ee77 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -4090,7 +4090,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
4090} 4090}
4091EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); 4091EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
4092 4092
4093static void print_held_locks_bug(struct task_struct *curr) 4093static void print_held_locks_bug(void)
4094{ 4094{
4095 if (!debug_locks_off()) 4095 if (!debug_locks_off())
4096 return; 4096 return;
@@ -4099,22 +4099,21 @@ static void print_held_locks_bug(struct task_struct *curr)
4099 4099
4100 printk("\n"); 4100 printk("\n");
4101 printk("=====================================\n"); 4101 printk("=====================================\n");
4102 printk("[ BUG: lock held at task exit time! ]\n"); 4102 printk("[ BUG: %s/%d still has locks held! ]\n",
4103 current->comm, task_pid_nr(current));
4103 print_kernel_ident(); 4104 print_kernel_ident();
4104 printk("-------------------------------------\n"); 4105 printk("-------------------------------------\n");
4105 printk("%s/%d is exiting with locks still held!\n", 4106 lockdep_print_held_locks(current);
4106 curr->comm, task_pid_nr(curr));
4107 lockdep_print_held_locks(curr);
4108
4109 printk("\nstack backtrace:\n"); 4107 printk("\nstack backtrace:\n");
4110 dump_stack(); 4108 dump_stack();
4111} 4109}
4112 4110
4113void debug_check_no_locks_held(struct task_struct *task) 4111void debug_check_no_locks_held(void)
4114{ 4112{
4115 if (unlikely(task->lockdep_depth > 0)) 4113 if (unlikely(current->lockdep_depth > 0))
4116 print_held_locks_bug(task); 4114 print_held_locks_bug();
4117} 4115}
4116EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
4118 4117
4119void debug_show_all_locks(void) 4118void debug_show_all_locks(void)
4120{ 4119{
diff --git a/kernel/module.c b/kernel/module.c
index cab4bce49c23..206915830d29 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -455,7 +455,7 @@ const struct kernel_symbol *find_symbol(const char *name,
455EXPORT_SYMBOL_GPL(find_symbol); 455EXPORT_SYMBOL_GPL(find_symbol);
456 456
457/* Search for module by name: must hold module_mutex. */ 457/* Search for module by name: must hold module_mutex. */
458static struct module *find_module_all(const char *name, 458static struct module *find_module_all(const char *name, size_t len,
459 bool even_unformed) 459 bool even_unformed)
460{ 460{
461 struct module *mod; 461 struct module *mod;
@@ -463,7 +463,7 @@ static struct module *find_module_all(const char *name,
463 list_for_each_entry(mod, &modules, list) { 463 list_for_each_entry(mod, &modules, list) {
464 if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) 464 if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
465 continue; 465 continue;
466 if (strcmp(mod->name, name) == 0) 466 if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
467 return mod; 467 return mod;
468 } 468 }
469 return NULL; 469 return NULL;
@@ -471,7 +471,7 @@ static struct module *find_module_all(const char *name,
471 471
472struct module *find_module(const char *name) 472struct module *find_module(const char *name)
473{ 473{
474 return find_module_all(name, false); 474 return find_module_all(name, strlen(name), false);
475} 475}
476EXPORT_SYMBOL_GPL(find_module); 476EXPORT_SYMBOL_GPL(find_module);
477 477
@@ -482,23 +482,28 @@ static inline void __percpu *mod_percpu(struct module *mod)
482 return mod->percpu; 482 return mod->percpu;
483} 483}
484 484
485static int percpu_modalloc(struct module *mod, 485static int percpu_modalloc(struct module *mod, struct load_info *info)
486 unsigned long size, unsigned long align)
487{ 486{
487 Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu];
488 unsigned long align = pcpusec->sh_addralign;
489
490 if (!pcpusec->sh_size)
491 return 0;
492
488 if (align > PAGE_SIZE) { 493 if (align > PAGE_SIZE) {
489 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", 494 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
490 mod->name, align, PAGE_SIZE); 495 mod->name, align, PAGE_SIZE);
491 align = PAGE_SIZE; 496 align = PAGE_SIZE;
492 } 497 }
493 498
494 mod->percpu = __alloc_reserved_percpu(size, align); 499 mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
495 if (!mod->percpu) { 500 if (!mod->percpu) {
496 printk(KERN_WARNING 501 printk(KERN_WARNING
497 "%s: Could not allocate %lu bytes percpu data\n", 502 "%s: Could not allocate %lu bytes percpu data\n",
498 mod->name, size); 503 mod->name, (unsigned long)pcpusec->sh_size);
499 return -ENOMEM; 504 return -ENOMEM;
500 } 505 }
501 mod->percpu_size = size; 506 mod->percpu_size = pcpusec->sh_size;
502 return 0; 507 return 0;
503} 508}
504 509
@@ -563,10 +568,12 @@ static inline void __percpu *mod_percpu(struct module *mod)
563{ 568{
564 return NULL; 569 return NULL;
565} 570}
566static inline int percpu_modalloc(struct module *mod, 571static int percpu_modalloc(struct module *mod, struct load_info *info)
567 unsigned long size, unsigned long align)
568{ 572{
569 return -ENOMEM; 573 /* UP modules shouldn't have this section: ENOMEM isn't quite right */
574 if (info->sechdrs[info->index.pcpu].sh_size != 0)
575 return -ENOMEM;
576 return 0;
570} 577}
571static inline void percpu_modfree(struct module *mod) 578static inline void percpu_modfree(struct module *mod)
572{ 579{
@@ -2927,7 +2934,6 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
2927{ 2934{
2928 /* Module within temporary copy. */ 2935 /* Module within temporary copy. */
2929 struct module *mod; 2936 struct module *mod;
2930 Elf_Shdr *pcpusec;
2931 int err; 2937 int err;
2932 2938
2933 mod = setup_load_info(info, flags); 2939 mod = setup_load_info(info, flags);
@@ -2942,17 +2948,10 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
2942 err = module_frob_arch_sections(info->hdr, info->sechdrs, 2948 err = module_frob_arch_sections(info->hdr, info->sechdrs,
2943 info->secstrings, mod); 2949 info->secstrings, mod);
2944 if (err < 0) 2950 if (err < 0)
2945 goto out; 2951 return ERR_PTR(err);
2946 2952
2947 pcpusec = &info->sechdrs[info->index.pcpu]; 2953 /* We will do a special allocation for per-cpu sections later. */
2948 if (pcpusec->sh_size) { 2954 info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
2949 /* We have a special allocation for this section. */
2950 err = percpu_modalloc(mod,
2951 pcpusec->sh_size, pcpusec->sh_addralign);
2952 if (err)
2953 goto out;
2954 pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC;
2955 }
2956 2955
2957 /* Determine total sizes, and put offsets in sh_entsize. For now 2956 /* Determine total sizes, and put offsets in sh_entsize. For now
2958 this is done generically; there doesn't appear to be any 2957 this is done generically; there doesn't appear to be any
@@ -2963,17 +2962,12 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
2963 /* Allocate and move to the final place */ 2962 /* Allocate and move to the final place */
2964 err = move_module(mod, info); 2963 err = move_module(mod, info);
2965 if (err) 2964 if (err)
2966 goto free_percpu; 2965 return ERR_PTR(err);
2967 2966
2968 /* Module has been copied to its final place now: return it. */ 2967 /* Module has been copied to its final place now: return it. */
2969 mod = (void *)info->sechdrs[info->index.mod].sh_addr; 2968 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2970 kmemleak_load_module(mod, info); 2969 kmemleak_load_module(mod, info);
2971 return mod; 2970 return mod;
2972
2973free_percpu:
2974 percpu_modfree(mod);
2975out:
2976 return ERR_PTR(err);
2977} 2971}
2978 2972
2979/* mod is no longer valid after this! */ 2973/* mod is no longer valid after this! */
@@ -3014,7 +3008,7 @@ static bool finished_loading(const char *name)
3014 bool ret; 3008 bool ret;
3015 3009
3016 mutex_lock(&module_mutex); 3010 mutex_lock(&module_mutex);
3017 mod = find_module_all(name, true); 3011 mod = find_module_all(name, strlen(name), true);
3018 ret = !mod || mod->state == MODULE_STATE_LIVE 3012 ret = !mod || mod->state == MODULE_STATE_LIVE
3019 || mod->state == MODULE_STATE_GOING; 3013 || mod->state == MODULE_STATE_GOING;
3020 mutex_unlock(&module_mutex); 3014 mutex_unlock(&module_mutex);
@@ -3152,7 +3146,8 @@ static int add_unformed_module(struct module *mod)
3152 3146
3153again: 3147again:
3154 mutex_lock(&module_mutex); 3148 mutex_lock(&module_mutex);
3155 if ((old = find_module_all(mod->name, true)) != NULL) { 3149 old = find_module_all(mod->name, strlen(mod->name), true);
3150 if (old != NULL) {
3156 if (old->state == MODULE_STATE_COMING 3151 if (old->state == MODULE_STATE_COMING
3157 || old->state == MODULE_STATE_UNFORMED) { 3152 || old->state == MODULE_STATE_UNFORMED) {
3158 /* Wait in case it fails to load. */ 3153 /* Wait in case it fails to load. */
@@ -3198,6 +3193,17 @@ out:
3198 return err; 3193 return err;
3199} 3194}
3200 3195
3196static int unknown_module_param_cb(char *param, char *val, const char *modname)
3197{
3198 /* Check for magic 'dyndbg' arg */
3199 int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
3200 if (ret != 0) {
3201 printk(KERN_WARNING "%s: unknown parameter '%s' ignored\n",
3202 modname, param);
3203 }
3204 return 0;
3205}
3206
3201/* Allocate and load the module: note that size of section 0 is always 3207/* Allocate and load the module: note that size of section 0 is always
3202 zero, and we rely on this for optional sections. */ 3208 zero, and we rely on this for optional sections. */
3203static int load_module(struct load_info *info, const char __user *uargs, 3209static int load_module(struct load_info *info, const char __user *uargs,
@@ -3237,6 +3243,11 @@ static int load_module(struct load_info *info, const char __user *uargs,
3237 } 3243 }
3238#endif 3244#endif
3239 3245
3246 /* To avoid stressing percpu allocator, do this once we're unique. */
3247 err = percpu_modalloc(mod, info);
3248 if (err)
3249 goto unlink_mod;
3250
3240 /* Now module is in final location, initialize linked lists, etc. */ 3251 /* Now module is in final location, initialize linked lists, etc. */
3241 err = module_unload_init(mod); 3252 err = module_unload_init(mod);
3242 if (err) 3253 if (err)
@@ -3284,7 +3295,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
3284 3295
3285 /* Module is ready to execute: parsing args may do that. */ 3296 /* Module is ready to execute: parsing args may do that. */
3286 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, 3297 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
3287 -32768, 32767, &ddebug_dyndbg_module_param_cb); 3298 -32768, 32767, unknown_module_param_cb);
3288 if (err < 0) 3299 if (err < 0)
3289 goto bug_cleanup; 3300 goto bug_cleanup;
3290 3301
@@ -3563,10 +3574,8 @@ unsigned long module_kallsyms_lookup_name(const char *name)
3563 /* Don't lock: we're in enough trouble already. */ 3574 /* Don't lock: we're in enough trouble already. */
3564 preempt_disable(); 3575 preempt_disable();
3565 if ((colon = strchr(name, ':')) != NULL) { 3576 if ((colon = strchr(name, ':')) != NULL) {
3566 *colon = '\0'; 3577 if ((mod = find_module_all(name, colon - name, false)) != NULL)
3567 if ((mod = find_module(name)) != NULL)
3568 ret = mod_find_symname(mod, colon+1); 3578 ret = mod_find_symname(mod, colon+1);
3569 *colon = ':';
3570 } else { 3579 } else {
3571 list_for_each_entry_rcu(mod, &modules, list) { 3580 list_for_each_entry_rcu(mod, &modules, list) {
3572 if (mod->state == MODULE_STATE_UNFORMED) 3581 if (mod->state == MODULE_STATE_UNFORMED)
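
Passing a length into find_module_all() lets module_kallsyms_lookup_name() match the "module:symbol" prefix without temporarily writing a NUL into the (possibly const) string. A small standalone sketch of the same length-based matching; lookup_by_prefix() and the table are illustrative only:

#include <stdio.h>
#include <string.h>

static const char *modules[] = { "ext4", "kvm", "kvm_intel" };

/* Match name[0..len) against the table without modifying 'name'. */
static const char *lookup_by_prefix(const char *name, size_t len)
{
	size_t i;

	for (i = 0; i < sizeof(modules) / sizeof(modules[0]); i++)
		if (strlen(modules[i]) == len && !memcmp(modules[i], name, len))
			return modules[i];
	return NULL;
}

int main(void)
{
	const char *query = "kvm:kvm_apic_write";	/* const: must not be patched in place */
	const char *colon = strchr(query, ':');
	const char *mod = lookup_by_prefix(query, colon - query);

	printf("module: %s, symbol: %s\n", mod ? mod : "none", colon + 1);
	return 0;
}
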
diff --git a/kernel/mutex.c b/kernel/mutex.c
index ad53a664f113..a52ee7bb830d 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -18,6 +18,7 @@
18 * Also see Documentation/mutex-design.txt. 18 * Also see Documentation/mutex-design.txt.
19 */ 19 */
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/ww_mutex.h>
21#include <linux/sched.h> 22#include <linux/sched.h>
22#include <linux/sched/rt.h> 23#include <linux/sched/rt.h>
23#include <linux/export.h> 24#include <linux/export.h>
@@ -254,16 +255,165 @@ void __sched mutex_unlock(struct mutex *lock)
254 255
255EXPORT_SYMBOL(mutex_unlock); 256EXPORT_SYMBOL(mutex_unlock);
256 257
258/**
259 * ww_mutex_unlock - release the w/w mutex
260 * @lock: the mutex to be released
261 *
262 * Unlock a mutex that has been locked by this task previously with any of the
263 * ww_mutex_lock* functions (with or without an acquire context). It is
264 * forbidden to release the locks after releasing the acquire context.
265 *
266 * This function must not be used in interrupt context. Unlocking
 267 * of an unlocked mutex is not allowed.
268 */
269void __sched ww_mutex_unlock(struct ww_mutex *lock)
270{
271 /*
272 * The unlocking fastpath is the 0->1 transition from 'locked'
273 * into 'unlocked' state:
274 */
275 if (lock->ctx) {
276#ifdef CONFIG_DEBUG_MUTEXES
277 DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
278#endif
279 if (lock->ctx->acquired > 0)
280 lock->ctx->acquired--;
281 lock->ctx = NULL;
282 }
283
284#ifndef CONFIG_DEBUG_MUTEXES
285 /*
286 * When debugging is enabled we must not clear the owner before time,
287 * the slow path will always be taken, and that clears the owner field
288 * after verifying that it was indeed current.
289 */
290 mutex_clear_owner(&lock->base);
291#endif
292 __mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath);
293}
294EXPORT_SYMBOL(ww_mutex_unlock);
295
296static inline int __sched
297__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
298{
299 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
300 struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
301
302 if (!hold_ctx)
303 return 0;
304
305 if (unlikely(ctx == hold_ctx))
306 return -EALREADY;
307
308 if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
309 (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
310#ifdef CONFIG_DEBUG_MUTEXES
311 DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
312 ctx->contending_lock = ww;
313#endif
314 return -EDEADLK;
315 }
316
317 return 0;
318}
319
320static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
321 struct ww_acquire_ctx *ww_ctx)
322{
323#ifdef CONFIG_DEBUG_MUTEXES
324 /*
325 * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
326 * but released with a normal mutex_unlock in this call.
327 *
328 * This should never happen, always use ww_mutex_unlock.
329 */
330 DEBUG_LOCKS_WARN_ON(ww->ctx);
331
332 /*
333 * Not quite done after calling ww_acquire_done() ?
334 */
335 DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
336
337 if (ww_ctx->contending_lock) {
338 /*
339 * After -EDEADLK you tried to
340 * acquire a different ww_mutex? Bad!
341 */
342 DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
343
344 /*
345 * You called ww_mutex_lock after receiving -EDEADLK,
346 * but 'forgot' to unlock everything else first?
347 */
348 DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
349 ww_ctx->contending_lock = NULL;
350 }
351
352 /*
353 * Naughty, using a different class will lead to undefined behavior!
354 */
355 DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
356#endif
357 ww_ctx->acquired++;
358}
359
360/*
361 * after acquiring lock with fastpath or when we lost out in contested
362 * slowpath, set ctx and wake up any waiters so they can recheck.
363 *
364 * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
365 * as the fastpath and opportunistic spinning are disabled in that case.
366 */
367static __always_inline void
368ww_mutex_set_context_fastpath(struct ww_mutex *lock,
369 struct ww_acquire_ctx *ctx)
370{
371 unsigned long flags;
372 struct mutex_waiter *cur;
373
374 ww_mutex_lock_acquired(lock, ctx);
375
376 lock->ctx = ctx;
377
378 /*
379 * The lock->ctx update should be visible on all cores before
380 * the atomic read is done, otherwise contended waiters might be
381 * missed. The contended waiters will either see ww_ctx == NULL
382 * and keep spinning, or it will acquire wait_lock, add itself
383 * to waiter list and sleep.
384 */
385 smp_mb(); /* ^^^ */
386
387 /*
388 * Check if lock is contended, if not there is nobody to wake up
389 */
390 if (likely(atomic_read(&lock->base.count) == 0))
391 return;
392
393 /*
394 * Uh oh, we raced in fastpath, wake up everyone in this case,
395 * so they can see the new lock->ctx.
396 */
397 spin_lock_mutex(&lock->base.wait_lock, flags);
398 list_for_each_entry(cur, &lock->base.wait_list, list) {
399 debug_mutex_wake_waiter(&lock->base, cur);
400 wake_up_process(cur->task);
401 }
402 spin_unlock_mutex(&lock->base.wait_lock, flags);
403}
404
257/* 405/*
258 * Lock a mutex (possibly interruptible), slowpath: 406 * Lock a mutex (possibly interruptible), slowpath:
259 */ 407 */
260static inline int __sched 408static __always_inline int __sched
261__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, 409__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
262 struct lockdep_map *nest_lock, unsigned long ip) 410 struct lockdep_map *nest_lock, unsigned long ip,
411 struct ww_acquire_ctx *ww_ctx)
263{ 412{
264 struct task_struct *task = current; 413 struct task_struct *task = current;
265 struct mutex_waiter waiter; 414 struct mutex_waiter waiter;
266 unsigned long flags; 415 unsigned long flags;
416 int ret;
267 417
268 preempt_disable(); 418 preempt_disable();
269 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); 419 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
@@ -298,6 +448,22 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
298 struct task_struct *owner; 448 struct task_struct *owner;
299 struct mspin_node node; 449 struct mspin_node node;
300 450
451 if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) {
452 struct ww_mutex *ww;
453
454 ww = container_of(lock, struct ww_mutex, base);
455 /*
456 * If ww->ctx is set the contents are undefined, only
457 * by acquiring wait_lock there is a guarantee that
458 * they are not invalid when reading.
459 *
460 * As such, when deadlock detection needs to be
461 * performed the optimistic spinning cannot be done.
462 */
463 if (ACCESS_ONCE(ww->ctx))
464 break;
465 }
466
301 /* 467 /*
302 * If there's an owner, wait for it to either 468 * If there's an owner, wait for it to either
303 * release the lock or go to sleep. 469 * release the lock or go to sleep.
@@ -312,6 +478,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
312 if ((atomic_read(&lock->count) == 1) && 478 if ((atomic_read(&lock->count) == 1) &&
313 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { 479 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
314 lock_acquired(&lock->dep_map, ip); 480 lock_acquired(&lock->dep_map, ip);
481 if (!__builtin_constant_p(ww_ctx == NULL)) {
482 struct ww_mutex *ww;
483 ww = container_of(lock, struct ww_mutex, base);
484
485 ww_mutex_set_context_fastpath(ww, ww_ctx);
486 }
487
315 mutex_set_owner(lock); 488 mutex_set_owner(lock);
316 mspin_unlock(MLOCK(lock), &node); 489 mspin_unlock(MLOCK(lock), &node);
317 preempt_enable(); 490 preempt_enable();
@@ -371,15 +544,16 @@ slowpath:
371 * TASK_UNINTERRUPTIBLE case.) 544 * TASK_UNINTERRUPTIBLE case.)
372 */ 545 */
373 if (unlikely(signal_pending_state(state, task))) { 546 if (unlikely(signal_pending_state(state, task))) {
374 mutex_remove_waiter(lock, &waiter, 547 ret = -EINTR;
375 task_thread_info(task)); 548 goto err;
376 mutex_release(&lock->dep_map, 1, ip); 549 }
377 spin_unlock_mutex(&lock->wait_lock, flags);
378 550
379 debug_mutex_free_waiter(&waiter); 551 if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) {
380 preempt_enable(); 552 ret = __mutex_lock_check_stamp(lock, ww_ctx);
381 return -EINTR; 553 if (ret)
554 goto err;
382 } 555 }
556
383 __set_task_state(task, state); 557 __set_task_state(task, state);
384 558
385 /* didn't get the lock, go to sleep: */ 559 /* didn't get the lock, go to sleep: */
@@ -394,6 +568,30 @@ done:
394 mutex_remove_waiter(lock, &waiter, current_thread_info()); 568 mutex_remove_waiter(lock, &waiter, current_thread_info());
395 mutex_set_owner(lock); 569 mutex_set_owner(lock);
396 570
571 if (!__builtin_constant_p(ww_ctx == NULL)) {
572 struct ww_mutex *ww = container_of(lock,
573 struct ww_mutex,
574 base);
575 struct mutex_waiter *cur;
576
577 /*
578 * This branch gets optimized out for the common case,
579 * and is only important for ww_mutex_lock.
580 */
581
582 ww_mutex_lock_acquired(ww, ww_ctx);
583 ww->ctx = ww_ctx;
584
585 /*
586 * Give any possible sleeping processes the chance to wake up,
587 * so they can recheck if they have to back off.
588 */
589 list_for_each_entry(cur, &lock->wait_list, list) {
590 debug_mutex_wake_waiter(lock, cur);
591 wake_up_process(cur->task);
592 }
593 }
594
397 /* set it to 0 if there are no waiters left: */ 595 /* set it to 0 if there are no waiters left: */
398 if (likely(list_empty(&lock->wait_list))) 596 if (likely(list_empty(&lock->wait_list)))
399 atomic_set(&lock->count, 0); 597 atomic_set(&lock->count, 0);
@@ -404,6 +602,14 @@ done:
404 preempt_enable(); 602 preempt_enable();
405 603
406 return 0; 604 return 0;
605
606err:
607 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
608 spin_unlock_mutex(&lock->wait_lock, flags);
609 debug_mutex_free_waiter(&waiter);
610 mutex_release(&lock->dep_map, 1, ip);
611 preempt_enable();
612 return ret;
407} 613}
408 614
409#ifdef CONFIG_DEBUG_LOCK_ALLOC 615#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -411,7 +617,8 @@ void __sched
411mutex_lock_nested(struct mutex *lock, unsigned int subclass) 617mutex_lock_nested(struct mutex *lock, unsigned int subclass)
412{ 618{
413 might_sleep(); 619 might_sleep();
414 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); 620 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
621 subclass, NULL, _RET_IP_, NULL);
415} 622}
416 623
417EXPORT_SYMBOL_GPL(mutex_lock_nested); 624EXPORT_SYMBOL_GPL(mutex_lock_nested);
@@ -420,7 +627,8 @@ void __sched
420_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) 627_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
421{ 628{
422 might_sleep(); 629 might_sleep();
423 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); 630 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
631 0, nest, _RET_IP_, NULL);
424} 632}
425 633
426EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); 634EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
@@ -429,7 +637,8 @@ int __sched
429mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) 637mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
430{ 638{
431 might_sleep(); 639 might_sleep();
432 return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); 640 return __mutex_lock_common(lock, TASK_KILLABLE,
641 subclass, NULL, _RET_IP_, NULL);
433} 642}
434EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); 643EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
435 644
@@ -438,10 +647,68 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
438{ 647{
439 might_sleep(); 648 might_sleep();
440 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 649 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
441 subclass, NULL, _RET_IP_); 650 subclass, NULL, _RET_IP_, NULL);
442} 651}
443 652
444EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); 653EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
654
655static inline int
656ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
657{
658#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
659 unsigned tmp;
660
661 if (ctx->deadlock_inject_countdown-- == 0) {
662 tmp = ctx->deadlock_inject_interval;
663 if (tmp > UINT_MAX/4)
664 tmp = UINT_MAX;
665 else
666 tmp = tmp*2 + tmp + tmp/2;
667
668 ctx->deadlock_inject_interval = tmp;
669 ctx->deadlock_inject_countdown = tmp;
670 ctx->contending_lock = lock;
671
672 ww_mutex_unlock(lock);
673
674 return -EDEADLK;
675 }
676#endif
677
678 return 0;
679}
680
681int __sched
682__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
683{
684 int ret;
685
686 might_sleep();
687 ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
688 0, &ctx->dep_map, _RET_IP_, ctx);
689 if (!ret && ctx->acquired > 1)
690 return ww_mutex_deadlock_injection(lock, ctx);
691
692 return ret;
693}
694EXPORT_SYMBOL_GPL(__ww_mutex_lock);
695
696int __sched
697__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
698{
699 int ret;
700
701 might_sleep();
702 ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
703 0, &ctx->dep_map, _RET_IP_, ctx);
704
705 if (!ret && ctx->acquired > 1)
706 return ww_mutex_deadlock_injection(lock, ctx);
707
708 return ret;
709}
710EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
711
445#endif 712#endif
446 713
447/* 714/*
@@ -494,10 +761,10 @@ __mutex_unlock_slowpath(atomic_t *lock_count)
494 * mutex_lock_interruptible() and mutex_trylock(). 761 * mutex_lock_interruptible() and mutex_trylock().
495 */ 762 */
496static noinline int __sched 763static noinline int __sched
497__mutex_lock_killable_slowpath(atomic_t *lock_count); 764__mutex_lock_killable_slowpath(struct mutex *lock);
498 765
499static noinline int __sched 766static noinline int __sched
500__mutex_lock_interruptible_slowpath(atomic_t *lock_count); 767__mutex_lock_interruptible_slowpath(struct mutex *lock);
501 768
502/** 769/**
503 * mutex_lock_interruptible - acquire the mutex, interruptible 770 * mutex_lock_interruptible - acquire the mutex, interruptible
@@ -515,12 +782,12 @@ int __sched mutex_lock_interruptible(struct mutex *lock)
515 int ret; 782 int ret;
516 783
517 might_sleep(); 784 might_sleep();
518 ret = __mutex_fastpath_lock_retval 785 ret = __mutex_fastpath_lock_retval(&lock->count);
519 (&lock->count, __mutex_lock_interruptible_slowpath); 786 if (likely(!ret)) {
520 if (!ret)
521 mutex_set_owner(lock); 787 mutex_set_owner(lock);
522 788 return 0;
523 return ret; 789 } else
790 return __mutex_lock_interruptible_slowpath(lock);
524} 791}
525 792
526EXPORT_SYMBOL(mutex_lock_interruptible); 793EXPORT_SYMBOL(mutex_lock_interruptible);
@@ -530,12 +797,12 @@ int __sched mutex_lock_killable(struct mutex *lock)
530 int ret; 797 int ret;
531 798
532 might_sleep(); 799 might_sleep();
533 ret = __mutex_fastpath_lock_retval 800 ret = __mutex_fastpath_lock_retval(&lock->count);
534 (&lock->count, __mutex_lock_killable_slowpath); 801 if (likely(!ret)) {
535 if (!ret)
536 mutex_set_owner(lock); 802 mutex_set_owner(lock);
537 803 return 0;
538 return ret; 804 } else
805 return __mutex_lock_killable_slowpath(lock);
539} 806}
540EXPORT_SYMBOL(mutex_lock_killable); 807EXPORT_SYMBOL(mutex_lock_killable);
541 808
@@ -544,24 +811,39 @@ __mutex_lock_slowpath(atomic_t *lock_count)
544{ 811{
545 struct mutex *lock = container_of(lock_count, struct mutex, count); 812 struct mutex *lock = container_of(lock_count, struct mutex, count);
546 813
547 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); 814 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,
815 NULL, _RET_IP_, NULL);
548} 816}
549 817
550static noinline int __sched 818static noinline int __sched
551__mutex_lock_killable_slowpath(atomic_t *lock_count) 819__mutex_lock_killable_slowpath(struct mutex *lock)
552{ 820{
553 struct mutex *lock = container_of(lock_count, struct mutex, count); 821 return __mutex_lock_common(lock, TASK_KILLABLE, 0,
822 NULL, _RET_IP_, NULL);
823}
554 824
555 return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); 825static noinline int __sched
826__mutex_lock_interruptible_slowpath(struct mutex *lock)
827{
828 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0,
829 NULL, _RET_IP_, NULL);
556} 830}
557 831
558static noinline int __sched 832static noinline int __sched
559__mutex_lock_interruptible_slowpath(atomic_t *lock_count) 833__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
560{ 834{
561 struct mutex *lock = container_of(lock_count, struct mutex, count); 835 return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0,
836 NULL, _RET_IP_, ctx);
837}
562 838
563 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); 839static noinline int __sched
840__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
841 struct ww_acquire_ctx *ctx)
842{
843 return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0,
844 NULL, _RET_IP_, ctx);
564} 845}
846
565#endif 847#endif
566 848
567/* 849/*
@@ -617,6 +899,45 @@ int __sched mutex_trylock(struct mutex *lock)
617} 899}
618EXPORT_SYMBOL(mutex_trylock); 900EXPORT_SYMBOL(mutex_trylock);
619 901
902#ifndef CONFIG_DEBUG_LOCK_ALLOC
903int __sched
904__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
905{
906 int ret;
907
908 might_sleep();
909
910 ret = __mutex_fastpath_lock_retval(&lock->base.count);
911
912 if (likely(!ret)) {
913 ww_mutex_set_context_fastpath(lock, ctx);
914 mutex_set_owner(&lock->base);
915 } else
916 ret = __ww_mutex_lock_slowpath(lock, ctx);
917 return ret;
918}
919EXPORT_SYMBOL(__ww_mutex_lock);
920
921int __sched
922__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
923{
924 int ret;
925
926 might_sleep();
927
928 ret = __mutex_fastpath_lock_retval(&lock->base.count);
929
930 if (likely(!ret)) {
931 ww_mutex_set_context_fastpath(lock, ctx);
932 mutex_set_owner(&lock->base);
933 } else
934 ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx);
935 return ret;
936}
937EXPORT_SYMBOL(__ww_mutex_lock_interruptible);
938
939#endif
940
620/** 941/**
621 * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 942 * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
622 * @cnt: the atomic which we are to dec 943 * @cnt: the atomic which we are to dec
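
The heart of the new wait/wound support is __mutex_lock_check_stamp(): when a waiter with an acquire context hits a lock already held under another context, the younger context (larger stamp) backs off with -EDEADLK while the older one is allowed to wait. A userspace model of just that decision; struct demo_ctx and check_stamp() are simplified stand-ins for ww_acquire_ctx and the kernel helper:

#include <errno.h>
#include <stdio.h>

struct demo_ctx {
	unsigned long stamp;	/* monotonically increasing acquisition ticket */
};

/* 0: requester may wait, -EALREADY: same context, -EDEADLK: requester backs off. */
static int check_stamp(const struct demo_ctx *req, const struct demo_ctx *holder)
{
	if (!holder)
		return 0;
	if (req == holder)
		return -EALREADY;
	/*
	 * Younger (larger stamp) backs off so the older context can make
	 * progress; the kernel additionally breaks stamp ties by pointer.
	 */
	if ((long)(req->stamp - holder->stamp) > 0)
		return -EDEADLK;
	return 0;
}

int main(void)
{
	struct demo_ctx old_ctx = { .stamp = 10 };
	struct demo_ctx young_ctx = { .stamp = 42 };

	printf("young vs old : %d\n", check_stamp(&young_ctx, &old_ctx));	/* -EDEADLK */
	printf("old vs young : %d\n", check_stamp(&old_ctx, &young_ctx));	/* 0: wait */
	printf("uncontended  : %d\n", check_stamp(&old_ctx, NULL));		/* 0 */
	return 0;
}

On -EDEADLK the caller is expected to release every lock held under its context and retry from the start, which is exactly the path the CONFIG_DEBUG_WW_MUTEX_SLOWPATH injection helper above forces at intervals.
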
diff --git a/kernel/panic.c b/kernel/panic.c
index 167ec097ce8b..801864600514 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -15,6 +15,7 @@
15#include <linux/notifier.h> 15#include <linux/notifier.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/random.h> 17#include <linux/random.h>
18#include <linux/ftrace.h>
18#include <linux/reboot.h> 19#include <linux/reboot.h>
19#include <linux/delay.h> 20#include <linux/delay.h>
20#include <linux/kexec.h> 21#include <linux/kexec.h>
@@ -399,8 +400,11 @@ struct slowpath_args {
399static void warn_slowpath_common(const char *file, int line, void *caller, 400static void warn_slowpath_common(const char *file, int line, void *caller,
400 unsigned taint, struct slowpath_args *args) 401 unsigned taint, struct slowpath_args *args)
401{ 402{
402 printk(KERN_WARNING "------------[ cut here ]------------\n"); 403 disable_trace_on_warning();
403 printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); 404
405 pr_warn("------------[ cut here ]------------\n");
406 pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n",
407 raw_smp_processor_id(), current->pid, file, line, caller);
404 408
405 if (args) 409 if (args)
406 vprintk(args->fmt, args->args); 410 vprintk(args->fmt, args->args);
diff --git a/kernel/params.c b/kernel/params.c
index 53b958fcd639..440e65d1a544 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -787,7 +787,7 @@ static void __init kernel_add_sysfs_param(const char *name,
787} 787}
788 788
789/* 789/*
790 * param_sysfs_builtin - add contents in /sys/parameters for built-in modules 790 * param_sysfs_builtin - add sysfs parameters for built-in modules
791 * 791 *
792 * Add module_parameters to sysfs for "modules" built into the kernel. 792 * Add module_parameters to sysfs for "modules" built into the kernel.
793 * 793 *
diff --git a/kernel/pid.c b/kernel/pid.c
index 0db3e791a06d..66505c1dfc51 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -75,6 +75,7 @@ struct pid_namespace init_pid_ns = {
75 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } 75 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
76 }, 76 },
77 .last_pid = 0, 77 .last_pid = 0,
78 .nr_hashed = PIDNS_HASH_ADDING,
78 .level = 0, 79 .level = 0,
79 .child_reaper = &init_task, 80 .child_reaper = &init_task,
80 .user_ns = &init_user_ns, 81 .user_ns = &init_user_ns,
@@ -373,14 +374,10 @@ EXPORT_SYMBOL_GPL(find_vpid);
373/* 374/*
374 * attach_pid() must be called with the tasklist_lock write-held. 375 * attach_pid() must be called with the tasklist_lock write-held.
375 */ 376 */
376void attach_pid(struct task_struct *task, enum pid_type type, 377void attach_pid(struct task_struct *task, enum pid_type type)
377 struct pid *pid)
378{ 378{
379 struct pid_link *link; 379 struct pid_link *link = &task->pids[type];
380 380 hlist_add_head_rcu(&link->node, &link->pid->tasks[type]);
381 link = &task->pids[type];
382 link->pid = pid;
383 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
384} 381}
385 382
386static void __change_pid(struct task_struct *task, enum pid_type type, 383static void __change_pid(struct task_struct *task, enum pid_type type,
@@ -412,7 +409,7 @@ void change_pid(struct task_struct *task, enum pid_type type,
412 struct pid *pid) 409 struct pid *pid)
413{ 410{
414 __change_pid(task, type, pid); 411 __change_pid(task, type, pid);
415 attach_pid(task, type, pid); 412 attach_pid(task, type);
416} 413}
417 414
418/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ 415/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
@@ -594,7 +591,6 @@ void __init pidmap_init(void)
594 /* Reserve PID 0. We never call free_pidmap(0) */ 591 /* Reserve PID 0. We never call free_pidmap(0) */
595 set_bit(0, init_pid_ns.pidmap[0].page); 592 set_bit(0, init_pid_ns.pidmap[0].page);
596 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 593 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
597 init_pid_ns.nr_hashed = PIDNS_HASH_ADDING;
598 594
599 init_pid_ns.pid_cachep = KMEM_CACHE(pid, 595 init_pid_ns.pid_cachep = KMEM_CACHE(pid,
600 SLAB_HWCACHE_ALIGN | SLAB_PANIC); 596 SLAB_HWCACHE_ALIGN | SLAB_PANIC);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 42670e9b44e0..c7f31aa272f7 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -51,59 +51,28 @@ static int check_clock(const clockid_t which_clock)
51 return error; 51 return error;
52} 52}
53 53
54static inline union cpu_time_count 54static inline unsigned long long
55timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) 55timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
56{ 56{
57 union cpu_time_count ret; 57 unsigned long long ret;
58 ret.sched = 0; /* high half always zero when .cpu used */ 58
59 ret = 0; /* high half always zero when .cpu used */
59 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 60 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
60 ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; 61 ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
61 } else { 62 } else {
62 ret.cpu = timespec_to_cputime(tp); 63 ret = cputime_to_expires(timespec_to_cputime(tp));
63 } 64 }
64 return ret; 65 return ret;
65} 66}
66 67
67static void sample_to_timespec(const clockid_t which_clock, 68static void sample_to_timespec(const clockid_t which_clock,
68 union cpu_time_count cpu, 69 unsigned long long expires,
69 struct timespec *tp) 70 struct timespec *tp)
70{ 71{
71 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) 72 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
72 *tp = ns_to_timespec(cpu.sched); 73 *tp = ns_to_timespec(expires);
73 else 74 else
74 cputime_to_timespec(cpu.cpu, tp); 75 cputime_to_timespec((__force cputime_t)expires, tp);
75}
76
77static inline int cpu_time_before(const clockid_t which_clock,
78 union cpu_time_count now,
79 union cpu_time_count then)
80{
81 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
82 return now.sched < then.sched;
83 } else {
84 return now.cpu < then.cpu;
85 }
86}
87static inline void cpu_time_add(const clockid_t which_clock,
88 union cpu_time_count *acc,
89 union cpu_time_count val)
90{
91 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
92 acc->sched += val.sched;
93 } else {
94 acc->cpu += val.cpu;
95 }
96}
97static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
98 union cpu_time_count a,
99 union cpu_time_count b)
100{
101 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
102 a.sched -= b.sched;
103 } else {
104 a.cpu -= b.cpu;
105 }
106 return a;
107} 76}
108 77
109/* 78/*
@@ -111,47 +80,31 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
111 * given the current clock sample. 80 * given the current clock sample.
112 */ 81 */
113static void bump_cpu_timer(struct k_itimer *timer, 82static void bump_cpu_timer(struct k_itimer *timer,
114 union cpu_time_count now) 83 unsigned long long now)
115{ 84{
116 int i; 85 int i;
86 unsigned long long delta, incr;
117 87
118 if (timer->it.cpu.incr.sched == 0) 88 if (timer->it.cpu.incr == 0)
119 return; 89 return;
120 90
121 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { 91 if (now < timer->it.cpu.expires)
122 unsigned long long delta, incr; 92 return;
123 93
124 if (now.sched < timer->it.cpu.expires.sched) 94 incr = timer->it.cpu.incr;
125 return; 95 delta = now + incr - timer->it.cpu.expires;
126 incr = timer->it.cpu.incr.sched;
127 delta = now.sched + incr - timer->it.cpu.expires.sched;
128 /* Don't use (incr*2 < delta), incr*2 might overflow. */
129 for (i = 0; incr < delta - incr; i++)
130 incr = incr << 1;
131 for (; i >= 0; incr >>= 1, i--) {
132 if (delta < incr)
133 continue;
134 timer->it.cpu.expires.sched += incr;
135 timer->it_overrun += 1 << i;
136 delta -= incr;
137 }
138 } else {
139 cputime_t delta, incr;
140 96
141 if (now.cpu < timer->it.cpu.expires.cpu) 97 /* Don't use (incr*2 < delta), incr*2 might overflow. */
142 return; 98 for (i = 0; incr < delta - incr; i++)
143 incr = timer->it.cpu.incr.cpu; 99 incr = incr << 1;
144 delta = now.cpu + incr - timer->it.cpu.expires.cpu; 100
145 /* Don't use (incr*2 < delta), incr*2 might overflow. */ 101 for (; i >= 0; incr >>= 1, i--) {
146 for (i = 0; incr < delta - incr; i++) 102 if (delta < incr)
147 incr += incr; 103 continue;
148 for (; i >= 0; incr = incr >> 1, i--) { 104
149 if (delta < incr) 105 timer->it.cpu.expires += incr;
150 continue; 106 timer->it_overrun += 1 << i;
151 timer->it.cpu.expires.cpu += incr; 107 delta -= incr;
152 timer->it_overrun += 1 << i;
153 delta -= incr;
154 }
155 } 108 }
156} 109}
157 110
@@ -170,21 +123,21 @@ static inline int task_cputime_zero(const struct task_cputime *cputime)
170 return 0; 123 return 0;
171} 124}
172 125
173static inline cputime_t prof_ticks(struct task_struct *p) 126static inline unsigned long long prof_ticks(struct task_struct *p)
174{ 127{
175 cputime_t utime, stime; 128 cputime_t utime, stime;
176 129
177 task_cputime(p, &utime, &stime); 130 task_cputime(p, &utime, &stime);
178 131
179 return utime + stime; 132 return cputime_to_expires(utime + stime);
180} 133}
181static inline cputime_t virt_ticks(struct task_struct *p) 134static inline unsigned long long virt_ticks(struct task_struct *p)
182{ 135{
183 cputime_t utime; 136 cputime_t utime;
184 137
185 task_cputime(p, &utime, NULL); 138 task_cputime(p, &utime, NULL);
186 139
187 return utime; 140 return cputime_to_expires(utime);
188} 141}
189 142
190static int 143static int
@@ -225,19 +178,19 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
225 * Sample a per-thread clock for the given task. 178 * Sample a per-thread clock for the given task.
226 */ 179 */
227static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, 180static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
228 union cpu_time_count *cpu) 181 unsigned long long *sample)
229{ 182{
230 switch (CPUCLOCK_WHICH(which_clock)) { 183 switch (CPUCLOCK_WHICH(which_clock)) {
231 default: 184 default:
232 return -EINVAL; 185 return -EINVAL;
233 case CPUCLOCK_PROF: 186 case CPUCLOCK_PROF:
234 cpu->cpu = prof_ticks(p); 187 *sample = prof_ticks(p);
235 break; 188 break;
236 case CPUCLOCK_VIRT: 189 case CPUCLOCK_VIRT:
237 cpu->cpu = virt_ticks(p); 190 *sample = virt_ticks(p);
238 break; 191 break;
239 case CPUCLOCK_SCHED: 192 case CPUCLOCK_SCHED:
240 cpu->sched = task_sched_runtime(p); 193 *sample = task_sched_runtime(p);
241 break; 194 break;
242 } 195 }
243 return 0; 196 return 0;
@@ -284,7 +237,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
284 */ 237 */
285static int cpu_clock_sample_group(const clockid_t which_clock, 238static int cpu_clock_sample_group(const clockid_t which_clock,
286 struct task_struct *p, 239 struct task_struct *p,
287 union cpu_time_count *cpu) 240 unsigned long long *sample)
288{ 241{
289 struct task_cputime cputime; 242 struct task_cputime cputime;
290 243
@@ -293,15 +246,15 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
293 return -EINVAL; 246 return -EINVAL;
294 case CPUCLOCK_PROF: 247 case CPUCLOCK_PROF:
295 thread_group_cputime(p, &cputime); 248 thread_group_cputime(p, &cputime);
296 cpu->cpu = cputime.utime + cputime.stime; 249 *sample = cputime_to_expires(cputime.utime + cputime.stime);
297 break; 250 break;
298 case CPUCLOCK_VIRT: 251 case CPUCLOCK_VIRT:
299 thread_group_cputime(p, &cputime); 252 thread_group_cputime(p, &cputime);
300 cpu->cpu = cputime.utime; 253 *sample = cputime_to_expires(cputime.utime);
301 break; 254 break;
302 case CPUCLOCK_SCHED: 255 case CPUCLOCK_SCHED:
303 thread_group_cputime(p, &cputime); 256 thread_group_cputime(p, &cputime);
304 cpu->sched = cputime.sum_exec_runtime; 257 *sample = cputime.sum_exec_runtime;
305 break; 258 break;
306 } 259 }
307 return 0; 260 return 0;
@@ -312,7 +265,7 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
312{ 265{
313 const pid_t pid = CPUCLOCK_PID(which_clock); 266 const pid_t pid = CPUCLOCK_PID(which_clock);
314 int error = -EINVAL; 267 int error = -EINVAL;
315 union cpu_time_count rtn; 268 unsigned long long rtn;
316 269
317 if (pid == 0) { 270 if (pid == 0) {
318 /* 271 /*
@@ -446,6 +399,15 @@ static int posix_cpu_timer_del(struct k_itimer *timer)
446 return ret; 399 return ret;
447} 400}
448 401
402static void cleanup_timers_list(struct list_head *head,
403 unsigned long long curr)
404{
405 struct cpu_timer_list *timer, *next;
406
407 list_for_each_entry_safe(timer, next, head, entry)
408 list_del_init(&timer->entry);
409}
410
449/* 411/*
450 * Clean out CPU timers still ticking when a thread exited. The task 412 * Clean out CPU timers still ticking when a thread exited. The task
451 * pointer is cleared, and the expiry time is replaced with the residual 413 * pointer is cleared, and the expiry time is replaced with the residual
@@ -456,37 +418,12 @@ static void cleanup_timers(struct list_head *head,
456 cputime_t utime, cputime_t stime, 418 cputime_t utime, cputime_t stime,
457 unsigned long long sum_exec_runtime) 419 unsigned long long sum_exec_runtime)
458{ 420{
459 struct cpu_timer_list *timer, *next;
460 cputime_t ptime = utime + stime;
461
462 list_for_each_entry_safe(timer, next, head, entry) {
463 list_del_init(&timer->entry);
464 if (timer->expires.cpu < ptime) {
465 timer->expires.cpu = 0;
466 } else {
467 timer->expires.cpu -= ptime;
468 }
469 }
470 421
471 ++head; 422 cputime_t ptime = utime + stime;
472 list_for_each_entry_safe(timer, next, head, entry) {
473 list_del_init(&timer->entry);
474 if (timer->expires.cpu < utime) {
475 timer->expires.cpu = 0;
476 } else {
477 timer->expires.cpu -= utime;
478 }
479 }
480 423
481 ++head; 424 cleanup_timers_list(head, cputime_to_expires(ptime));
482 list_for_each_entry_safe(timer, next, head, entry) { 425 cleanup_timers_list(++head, cputime_to_expires(utime));
483 list_del_init(&timer->entry); 426 cleanup_timers_list(++head, sum_exec_runtime);
484 if (timer->expires.sched < sum_exec_runtime) {
485 timer->expires.sched = 0;
486 } else {
487 timer->expires.sched -= sum_exec_runtime;
488 }
489 }
490} 427}
491 428
492/* 429/*
@@ -516,17 +453,21 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
516 tsk->se.sum_exec_runtime + sig->sum_sched_runtime); 453 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
517} 454}
518 455
519static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) 456static void clear_dead_task(struct k_itimer *itimer, unsigned long long now)
520{ 457{
458 struct cpu_timer_list *timer = &itimer->it.cpu;
459
521 /* 460 /*
522 * That's all for this thread or process. 461 * That's all for this thread or process.
523 * We leave our residual in expires to be reported. 462 * We leave our residual in expires to be reported.
524 */ 463 */
525 put_task_struct(timer->it.cpu.task); 464 put_task_struct(timer->task);
526 timer->it.cpu.task = NULL; 465 timer->task = NULL;
527 timer->it.cpu.expires = cpu_time_sub(timer->it_clock, 466 if (timer->expires < now) {
528 timer->it.cpu.expires, 467 timer->expires = 0;
529 now); 468 } else {
469 timer->expires -= now;
470 }
530} 471}
531 472
532static inline int expires_gt(cputime_t expires, cputime_t new_exp) 473static inline int expires_gt(cputime_t expires, cputime_t new_exp)
@@ -558,14 +499,14 @@ static void arm_timer(struct k_itimer *timer)
558 499
559 listpos = head; 500 listpos = head;
560 list_for_each_entry(next, head, entry) { 501 list_for_each_entry(next, head, entry) {
561 if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) 502 if (nt->expires < next->expires)
562 break; 503 break;
563 listpos = &next->entry; 504 listpos = &next->entry;
564 } 505 }
565 list_add(&nt->entry, listpos); 506 list_add(&nt->entry, listpos);
566 507
567 if (listpos == head) { 508 if (listpos == head) {
568 union cpu_time_count *exp = &nt->expires; 509 unsigned long long exp = nt->expires;
569 510
570 /* 511 /*
571 * We are the new earliest-expiring POSIX 1.b timer, hence 512 * We are the new earliest-expiring POSIX 1.b timer, hence
@@ -576,17 +517,17 @@ static void arm_timer(struct k_itimer *timer)
576 517
577 switch (CPUCLOCK_WHICH(timer->it_clock)) { 518 switch (CPUCLOCK_WHICH(timer->it_clock)) {
578 case CPUCLOCK_PROF: 519 case CPUCLOCK_PROF:
579 if (expires_gt(cputime_expires->prof_exp, exp->cpu)) 520 if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp)))
580 cputime_expires->prof_exp = exp->cpu; 521 cputime_expires->prof_exp = expires_to_cputime(exp);
581 break; 522 break;
582 case CPUCLOCK_VIRT: 523 case CPUCLOCK_VIRT:
583 if (expires_gt(cputime_expires->virt_exp, exp->cpu)) 524 if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp)))
584 cputime_expires->virt_exp = exp->cpu; 525 cputime_expires->virt_exp = expires_to_cputime(exp);
585 break; 526 break;
586 case CPUCLOCK_SCHED: 527 case CPUCLOCK_SCHED:
587 if (cputime_expires->sched_exp == 0 || 528 if (cputime_expires->sched_exp == 0 ||
588 cputime_expires->sched_exp > exp->sched) 529 cputime_expires->sched_exp > exp)
589 cputime_expires->sched_exp = exp->sched; 530 cputime_expires->sched_exp = exp;
590 break; 531 break;
591 } 532 }
592 } 533 }
@@ -601,20 +542,20 @@ static void cpu_timer_fire(struct k_itimer *timer)
601 /* 542 /*
602 * User don't want any signal. 543 * User don't want any signal.
603 */ 544 */
604 timer->it.cpu.expires.sched = 0; 545 timer->it.cpu.expires = 0;
605 } else if (unlikely(timer->sigq == NULL)) { 546 } else if (unlikely(timer->sigq == NULL)) {
606 /* 547 /*
607 * This a special case for clock_nanosleep, 548 * This a special case for clock_nanosleep,
608 * not a normal timer from sys_timer_create. 549 * not a normal timer from sys_timer_create.
609 */ 550 */
610 wake_up_process(timer->it_process); 551 wake_up_process(timer->it_process);
611 timer->it.cpu.expires.sched = 0; 552 timer->it.cpu.expires = 0;
612 } else if (timer->it.cpu.incr.sched == 0) { 553 } else if (timer->it.cpu.incr == 0) {
613 /* 554 /*
614 * One-shot timer. Clear it as soon as it's fired. 555 * One-shot timer. Clear it as soon as it's fired.
615 */ 556 */
616 posix_timer_event(timer, 0); 557 posix_timer_event(timer, 0);
617 timer->it.cpu.expires.sched = 0; 558 timer->it.cpu.expires = 0;
618 } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { 559 } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
619 /* 560 /*
620 * The signal did not get queued because the signal 561 * The signal did not get queued because the signal
@@ -632,7 +573,7 @@ static void cpu_timer_fire(struct k_itimer *timer)
632 */ 573 */
633static int cpu_timer_sample_group(const clockid_t which_clock, 574static int cpu_timer_sample_group(const clockid_t which_clock,
634 struct task_struct *p, 575 struct task_struct *p,
635 union cpu_time_count *cpu) 576 unsigned long long *sample)
636{ 577{
637 struct task_cputime cputime; 578 struct task_cputime cputime;
638 579
@@ -641,13 +582,13 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
641 default: 582 default:
642 return -EINVAL; 583 return -EINVAL;
643 case CPUCLOCK_PROF: 584 case CPUCLOCK_PROF:
644 cpu->cpu = cputime.utime + cputime.stime; 585 *sample = cputime_to_expires(cputime.utime + cputime.stime);
645 break; 586 break;
646 case CPUCLOCK_VIRT: 587 case CPUCLOCK_VIRT:
647 cpu->cpu = cputime.utime; 588 *sample = cputime_to_expires(cputime.utime);
648 break; 589 break;
649 case CPUCLOCK_SCHED: 590 case CPUCLOCK_SCHED:
650 cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); 591 *sample = cputime.sum_exec_runtime + task_delta_exec(p);
651 break; 592 break;
652 } 593 }
653 return 0; 594 return 0;
@@ -694,7 +635,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
694 struct itimerspec *new, struct itimerspec *old) 635 struct itimerspec *new, struct itimerspec *old)
695{ 636{
696 struct task_struct *p = timer->it.cpu.task; 637 struct task_struct *p = timer->it.cpu.task;
697 union cpu_time_count old_expires, new_expires, old_incr, val; 638 unsigned long long old_expires, new_expires, old_incr, val;
698 int ret; 639 int ret;
699 640
700 if (unlikely(p == NULL)) { 641 if (unlikely(p == NULL)) {
@@ -749,7 +690,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
749 } 690 }
750 691
751 if (old) { 692 if (old) {
752 if (old_expires.sched == 0) { 693 if (old_expires == 0) {
753 old->it_value.tv_sec = 0; 694 old->it_value.tv_sec = 0;
754 old->it_value.tv_nsec = 0; 695 old->it_value.tv_nsec = 0;
755 } else { 696 } else {
@@ -764,11 +705,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
764 * new setting. 705 * new setting.
765 */ 706 */
766 bump_cpu_timer(timer, val); 707 bump_cpu_timer(timer, val);
767 if (cpu_time_before(timer->it_clock, val, 708 if (val < timer->it.cpu.expires) {
768 timer->it.cpu.expires)) { 709 old_expires = timer->it.cpu.expires - val;
769 old_expires = cpu_time_sub(
770 timer->it_clock,
771 timer->it.cpu.expires, val);
772 sample_to_timespec(timer->it_clock, 710 sample_to_timespec(timer->it_clock,
773 old_expires, 711 old_expires,
774 &old->it_value); 712 &old->it_value);
@@ -791,8 +729,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
791 goto out; 729 goto out;
792 } 730 }
793 731
794 if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) { 732 if (new_expires != 0 && !(flags & TIMER_ABSTIME)) {
795 cpu_time_add(timer->it_clock, &new_expires, val); 733 new_expires += val;
796 } 734 }
797 735
798 /* 736 /*
@@ -801,8 +739,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
801 * arm the timer (we'll just fake it for timer_gettime). 739 * arm the timer (we'll just fake it for timer_gettime).
802 */ 740 */
803 timer->it.cpu.expires = new_expires; 741 timer->it.cpu.expires = new_expires;
804 if (new_expires.sched != 0 && 742 if (new_expires != 0 && val < new_expires) {
805 cpu_time_before(timer->it_clock, val, new_expires)) {
806 arm_timer(timer); 743 arm_timer(timer);
807 } 744 }
808 745
@@ -826,8 +763,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
826 timer->it_overrun_last = 0; 763 timer->it_overrun_last = 0;
827 timer->it_overrun = -1; 764 timer->it_overrun = -1;
828 765
829 if (new_expires.sched != 0 && 766 if (new_expires != 0 && !(val < new_expires)) {
830 !cpu_time_before(timer->it_clock, val, new_expires)) {
831 /* 767 /*
832 * The designated time already passed, so we notify 768 * The designated time already passed, so we notify
833 * immediately, even if the thread never runs to 769 * immediately, even if the thread never runs to
@@ -849,7 +785,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
849 785
850static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) 786static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
851{ 787{
852 union cpu_time_count now; 788 unsigned long long now;
853 struct task_struct *p = timer->it.cpu.task; 789 struct task_struct *p = timer->it.cpu.task;
854 int clear_dead; 790 int clear_dead;
855 791
@@ -859,7 +795,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
859 sample_to_timespec(timer->it_clock, 795 sample_to_timespec(timer->it_clock,
860 timer->it.cpu.incr, &itp->it_interval); 796 timer->it.cpu.incr, &itp->it_interval);
861 797
862 if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */ 798 if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */
863 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; 799 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
864 return; 800 return;
865 } 801 }
@@ -891,7 +827,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
891 */ 827 */
892 put_task_struct(p); 828 put_task_struct(p);
893 timer->it.cpu.task = NULL; 829 timer->it.cpu.task = NULL;
894 timer->it.cpu.expires.sched = 0; 830 timer->it.cpu.expires = 0;
895 read_unlock(&tasklist_lock); 831 read_unlock(&tasklist_lock);
896 goto dead; 832 goto dead;
897 } else { 833 } else {
@@ -912,10 +848,9 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
912 goto dead; 848 goto dead;
913 } 849 }
914 850
915 if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) { 851 if (now < timer->it.cpu.expires) {
916 sample_to_timespec(timer->it_clock, 852 sample_to_timespec(timer->it_clock,
917 cpu_time_sub(timer->it_clock, 853 timer->it.cpu.expires - now,
918 timer->it.cpu.expires, now),
919 &itp->it_value); 854 &itp->it_value);
920 } else { 855 } else {
921 /* 856 /*
@@ -927,6 +862,28 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
927 } 862 }
928} 863}
929 864
865static unsigned long long
866check_timers_list(struct list_head *timers,
867 struct list_head *firing,
868 unsigned long long curr)
869{
870 int maxfire = 20;
871
872 while (!list_empty(timers)) {
873 struct cpu_timer_list *t;
874
875 t = list_first_entry(timers, struct cpu_timer_list, entry);
876
877 if (!--maxfire || curr < t->expires)
878 return t->expires;
879
880 t->firing = 1;
881 list_move_tail(&t->entry, firing);
882 }
883
884 return 0;
885}
886
930/* 887/*
931 * Check for any per-thread CPU timers that have fired and move them off 888 * Check for any per-thread CPU timers that have fired and move them off
932 * the tsk->cpu_timers[N] list onto the firing list. Here we update the 889 * the tsk->cpu_timers[N] list onto the firing list. Here we update the
@@ -935,54 +892,20 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
935static void check_thread_timers(struct task_struct *tsk, 892static void check_thread_timers(struct task_struct *tsk,
936 struct list_head *firing) 893 struct list_head *firing)
937{ 894{
938 int maxfire;
939 struct list_head *timers = tsk->cpu_timers; 895 struct list_head *timers = tsk->cpu_timers;
940 struct signal_struct *const sig = tsk->signal; 896 struct signal_struct *const sig = tsk->signal;
897 struct task_cputime *tsk_expires = &tsk->cputime_expires;
898 unsigned long long expires;
941 unsigned long soft; 899 unsigned long soft;
942 900
943 maxfire = 20; 901 expires = check_timers_list(timers, firing, prof_ticks(tsk));
944 tsk->cputime_expires.prof_exp = 0; 902 tsk_expires->prof_exp = expires_to_cputime(expires);
945 while (!list_empty(timers)) {
946 struct cpu_timer_list *t = list_first_entry(timers,
947 struct cpu_timer_list,
948 entry);
949 if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) {
950 tsk->cputime_expires.prof_exp = t->expires.cpu;
951 break;
952 }
953 t->firing = 1;
954 list_move_tail(&t->entry, firing);
955 }
956 903
957 ++timers; 904 expires = check_timers_list(++timers, firing, virt_ticks(tsk));
958 maxfire = 20; 905 tsk_expires->virt_exp = expires_to_cputime(expires);
959 tsk->cputime_expires.virt_exp = 0;
960 while (!list_empty(timers)) {
961 struct cpu_timer_list *t = list_first_entry(timers,
962 struct cpu_timer_list,
963 entry);
964 if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) {
965 tsk->cputime_expires.virt_exp = t->expires.cpu;
966 break;
967 }
968 t->firing = 1;
969 list_move_tail(&t->entry, firing);
970 }
971 906
972 ++timers; 907 tsk_expires->sched_exp = check_timers_list(++timers, firing,
973 maxfire = 20; 908 tsk->se.sum_exec_runtime);
974 tsk->cputime_expires.sched_exp = 0;
975 while (!list_empty(timers)) {
976 struct cpu_timer_list *t = list_first_entry(timers,
977 struct cpu_timer_list,
978 entry);
979 if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
980 tsk->cputime_expires.sched_exp = t->expires.sched;
981 break;
982 }
983 t->firing = 1;
984 list_move_tail(&t->entry, firing);
985 }
986 909
987 /* 910 /*
988 * Check for the special case thread timers. 911 * Check for the special case thread timers.
@@ -1030,7 +953,8 @@ static void stop_process_timers(struct signal_struct *sig)
1030static u32 onecputick; 953static u32 onecputick;
1031 954
1032static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, 955static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1033 cputime_t *expires, cputime_t cur_time, int signo) 956 unsigned long long *expires,
957 unsigned long long cur_time, int signo)
1034{ 958{
1035 if (!it->expires) 959 if (!it->expires)
1036 return; 960 return;
@@ -1066,9 +990,8 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1066static void check_process_timers(struct task_struct *tsk, 990static void check_process_timers(struct task_struct *tsk,
1067 struct list_head *firing) 991 struct list_head *firing)
1068{ 992{
1069 int maxfire;
1070 struct signal_struct *const sig = tsk->signal; 993 struct signal_struct *const sig = tsk->signal;
1071 cputime_t utime, ptime, virt_expires, prof_expires; 994 unsigned long long utime, ptime, virt_expires, prof_expires;
1072 unsigned long long sum_sched_runtime, sched_expires; 995 unsigned long long sum_sched_runtime, sched_expires;
1073 struct list_head *timers = sig->cpu_timers; 996 struct list_head *timers = sig->cpu_timers;
1074 struct task_cputime cputime; 997 struct task_cputime cputime;
@@ -1078,52 +1001,13 @@ static void check_process_timers(struct task_struct *tsk,
1078 * Collect the current process totals. 1001 * Collect the current process totals.
1079 */ 1002 */
1080 thread_group_cputimer(tsk, &cputime); 1003 thread_group_cputimer(tsk, &cputime);
1081 utime = cputime.utime; 1004 utime = cputime_to_expires(cputime.utime);
1082 ptime = utime + cputime.stime; 1005 ptime = utime + cputime_to_expires(cputime.stime);
1083 sum_sched_runtime = cputime.sum_exec_runtime; 1006 sum_sched_runtime = cputime.sum_exec_runtime;
1084 maxfire = 20;
1085 prof_expires = 0;
1086 while (!list_empty(timers)) {
1087 struct cpu_timer_list *tl = list_first_entry(timers,
1088 struct cpu_timer_list,
1089 entry);
1090 if (!--maxfire || ptime < tl->expires.cpu) {
1091 prof_expires = tl->expires.cpu;
1092 break;
1093 }
1094 tl->firing = 1;
1095 list_move_tail(&tl->entry, firing);
1096 }
1097 1007
1098 ++timers; 1008 prof_expires = check_timers_list(timers, firing, ptime);
1099 maxfire = 20; 1009 virt_expires = check_timers_list(++timers, firing, utime);
1100 virt_expires = 0; 1010 sched_expires = check_timers_list(++timers, firing, sum_sched_runtime);
1101 while (!list_empty(timers)) {
1102 struct cpu_timer_list *tl = list_first_entry(timers,
1103 struct cpu_timer_list,
1104 entry);
1105 if (!--maxfire || utime < tl->expires.cpu) {
1106 virt_expires = tl->expires.cpu;
1107 break;
1108 }
1109 tl->firing = 1;
1110 list_move_tail(&tl->entry, firing);
1111 }
1112
1113 ++timers;
1114 maxfire = 20;
1115 sched_expires = 0;
1116 while (!list_empty(timers)) {
1117 struct cpu_timer_list *tl = list_first_entry(timers,
1118 struct cpu_timer_list,
1119 entry);
1120 if (!--maxfire || sum_sched_runtime < tl->expires.sched) {
1121 sched_expires = tl->expires.sched;
1122 break;
1123 }
1124 tl->firing = 1;
1125 list_move_tail(&tl->entry, firing);
1126 }
1127 1011
1128 /* 1012 /*
1129 * Check for the special case process timers. 1013 * Check for the special case process timers.
@@ -1162,8 +1046,8 @@ static void check_process_timers(struct task_struct *tsk,
1162 } 1046 }
1163 } 1047 }
1164 1048
1165 sig->cputime_expires.prof_exp = prof_expires; 1049 sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires);
1166 sig->cputime_expires.virt_exp = virt_expires; 1050 sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires);
1167 sig->cputime_expires.sched_exp = sched_expires; 1051 sig->cputime_expires.sched_exp = sched_expires;
1168 if (task_cputime_zero(&sig->cputime_expires)) 1052 if (task_cputime_zero(&sig->cputime_expires))
1169 stop_process_timers(sig); 1053 stop_process_timers(sig);
@@ -1176,7 +1060,7 @@ static void check_process_timers(struct task_struct *tsk,
1176void posix_cpu_timer_schedule(struct k_itimer *timer) 1060void posix_cpu_timer_schedule(struct k_itimer *timer)
1177{ 1061{
1178 struct task_struct *p = timer->it.cpu.task; 1062 struct task_struct *p = timer->it.cpu.task;
1179 union cpu_time_count now; 1063 unsigned long long now;
1180 1064
1181 if (unlikely(p == NULL)) 1065 if (unlikely(p == NULL))
1182 /* 1066 /*
@@ -1205,7 +1089,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1205 */ 1089 */
1206 put_task_struct(p); 1090 put_task_struct(p);
1207 timer->it.cpu.task = p = NULL; 1091 timer->it.cpu.task = p = NULL;
1208 timer->it.cpu.expires.sched = 0; 1092 timer->it.cpu.expires = 0;
1209 goto out_unlock; 1093 goto out_unlock;
1210 } else if (unlikely(p->exit_state) && thread_group_empty(p)) { 1094 } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
1211 /* 1095 /*
@@ -1213,6 +1097,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1213 * not yet reaped. Take this opportunity to 1097 * not yet reaped. Take this opportunity to
1214 * drop our task ref. 1098 * drop our task ref.
1215 */ 1099 */
1100 cpu_timer_sample_group(timer->it_clock, p, &now);
1216 clear_dead_task(timer, now); 1101 clear_dead_task(timer, now);
1217 goto out_unlock; 1102 goto out_unlock;
1218 } 1103 }
@@ -1387,7 +1272,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1387void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, 1272void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1388 cputime_t *newval, cputime_t *oldval) 1273 cputime_t *newval, cputime_t *oldval)
1389{ 1274{
1390 union cpu_time_count now; 1275 unsigned long long now;
1391 1276
1392 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1277 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1393 cpu_timer_sample_group(clock_idx, tsk, &now); 1278 cpu_timer_sample_group(clock_idx, tsk, &now);
@@ -1399,17 +1284,17 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1399 * it to be absolute. 1284 * it to be absolute.
1400 */ 1285 */
1401 if (*oldval) { 1286 if (*oldval) {
1402 if (*oldval <= now.cpu) { 1287 if (*oldval <= now) {
1403 /* Just about to fire. */ 1288 /* Just about to fire. */
1404 *oldval = cputime_one_jiffy; 1289 *oldval = cputime_one_jiffy;
1405 } else { 1290 } else {
1406 *oldval -= now.cpu; 1291 *oldval -= now;
1407 } 1292 }
1408 } 1293 }
1409 1294
1410 if (!*newval) 1295 if (!*newval)
1411 goto out; 1296 goto out;
1412 *newval += now.cpu; 1297 *newval += now;
1413 } 1298 }
1414 1299
1415 /* 1300 /*
@@ -1459,7 +1344,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1459 } 1344 }
1460 1345
1461 while (!signal_pending(current)) { 1346 while (!signal_pending(current)) {
1462 if (timer.it.cpu.expires.sched == 0) { 1347 if (timer.it.cpu.expires == 0) {
1463 /* 1348 /*
1464 * Our timer fired and was reset, below 1349 * Our timer fired and was reset, below
1465 * deletion can not fail. 1350 * deletion can not fail.
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 5dfdc9ea180b..d444c4e834f4 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -100,7 +100,6 @@ config PM_SLEEP_SMP
100 depends on SMP 100 depends on SMP
101 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE 101 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
102 depends on PM_SLEEP 102 depends on PM_SLEEP
103 select HOTPLUG
104 select HOTPLUG_CPU 103 select HOTPLUG_CPU
105 104
106config PM_AUTOSLEEP 105config PM_AUTOSLEEP
@@ -263,6 +262,26 @@ config PM_GENERIC_DOMAINS
263 bool 262 bool
264 depends on PM 263 depends on PM
265 264
265config WQ_POWER_EFFICIENT_DEFAULT
266 bool "Enable workqueue power-efficient mode by default"
267 depends on PM
268 default n
269 help
270 Per-cpu workqueues are generally preferred because they show
271 better performance thanks to cache locality; unfortunately,
272 per-cpu workqueues tend to be more power hungry than unbound
273 workqueues.
274
275 Enabling the workqueue.power_efficient kernel parameter makes the
276 per-cpu workqueues which were observed to contribute
277 significantly to power consumption unbound, leading to measurably
278 lower power usage at the cost of small performance overhead.
279
280 This config option determines whether workqueue.power_efficient
281 is enabled by default.
282
283 If in doubt, say N.
284
266config PM_GENERIC_DOMAINS_SLEEP 285config PM_GENERIC_DOMAINS_SLEEP
267 def_bool y 286 def_bool y
268 depends on PM_SLEEP && PM_GENERIC_DOMAINS 287 depends on PM_SLEEP && PM_GENERIC_DOMAINS
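
For context on the new option: WQ_POWER_EFFICIENT_DEFAULT only flips the default of the existing workqueue.power_efficient boot parameter, it does not add a new mechanism. A rough userspace analogue of that "compile-time default, runtime override" wiring is sketched below; POWER_EFFICIENT_DEFAULT and the power_efficient= argument are names invented for the sketch, not the kernel's actual symbols.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#ifdef POWER_EFFICIENT_DEFAULT
static bool power_efficient = true;     /* analogous to "default y" */
#else
static bool power_efficient = false;    /* analogous to "default n" */
#endif

int main(int argc, char **argv)
{
        /* analogous to workqueue.power_efficient=... on the kernel command line */
        for (int i = 1; i < argc; i++) {
                if (!strcmp(argv[i], "power_efficient=1"))
                        power_efficient = true;
                else if (!strcmp(argv[i], "power_efficient=0"))
                        power_efficient = false;
        }

        printf("power-efficient workqueues: %s\n",
               power_efficient ? "enabled" : "disabled");
        return 0;
}
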
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index c6422ffeda9a..9012ecf7b814 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -32,7 +32,8 @@ static void try_to_suspend(struct work_struct *work)
32 32
33 mutex_lock(&autosleep_lock); 33 mutex_lock(&autosleep_lock);
34 34
35 if (!pm_save_wakeup_count(initial_count)) { 35 if (!pm_save_wakeup_count(initial_count) ||
36 system_state != SYSTEM_RUNNING) {
36 mutex_unlock(&autosleep_lock); 37 mutex_unlock(&autosleep_lock);
37 goto out; 38 goto out;
38 } 39 }
diff --git a/kernel/power/main.c b/kernel/power/main.c
index d77663bfedeb..1d1bf630e6e9 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -424,6 +424,8 @@ static ssize_t wakeup_count_store(struct kobject *kobj,
424 if (sscanf(buf, "%u", &val) == 1) { 424 if (sscanf(buf, "%u", &val) == 1) {
425 if (pm_save_wakeup_count(val)) 425 if (pm_save_wakeup_count(val))
426 error = n; 426 error = n;
427 else
428 pm_print_active_wakeup_sources();
427 } 429 }
428 430
429 out: 431 out:
@@ -528,6 +530,10 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
528 530
529 if (sscanf(buf, "%d", &val) == 1) { 531 if (sscanf(buf, "%d", &val) == 1) {
530 pm_trace_enabled = !!val; 532 pm_trace_enabled = !!val;
533 if (pm_trace_enabled) {
534 pr_warn("PM: Enabling pm_trace changes system date and time during resume.\n"
535 "PM: Correct system time has to be restored manually after resume.\n");
536 }
531 return n; 537 return n;
532 } 538 }
533 return -EINVAL; 539 return -EINVAL;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 98088e0e71e8..06ec8869dbf1 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -30,9 +30,10 @@ static int try_to_freeze_tasks(bool user_only)
30 unsigned int todo; 30 unsigned int todo;
31 bool wq_busy = false; 31 bool wq_busy = false;
32 struct timeval start, end; 32 struct timeval start, end;
33 u64 elapsed_csecs64; 33 u64 elapsed_msecs64;
34 unsigned int elapsed_csecs; 34 unsigned int elapsed_msecs;
35 bool wakeup = false; 35 bool wakeup = false;
36 int sleep_usecs = USEC_PER_MSEC;
36 37
37 do_gettimeofday(&start); 38 do_gettimeofday(&start);
38 39
@@ -68,22 +69,25 @@ static int try_to_freeze_tasks(bool user_only)
68 69
69 /* 70 /*
70 * We need to retry, but first give the freezing tasks some 71 * We need to retry, but first give the freezing tasks some
71 * time to enter the refrigerator. 72 * time to enter the refrigerator. Start with an initial
73 * 1 ms sleep followed by exponential backoff until 8 ms.
72 */ 74 */
73 msleep(10); 75 usleep_range(sleep_usecs / 2, sleep_usecs);
76 if (sleep_usecs < 8 * USEC_PER_MSEC)
77 sleep_usecs *= 2;
74 } 78 }
75 79
76 do_gettimeofday(&end); 80 do_gettimeofday(&end);
77 elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); 81 elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
78 do_div(elapsed_csecs64, NSEC_PER_SEC / 100); 82 do_div(elapsed_msecs64, NSEC_PER_MSEC);
79 elapsed_csecs = elapsed_csecs64; 83 elapsed_msecs = elapsed_msecs64;
80 84
81 if (todo) { 85 if (todo) {
82 printk("\n"); 86 printk("\n");
83 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " 87 printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds "
84 "(%d tasks refusing to freeze, wq_busy=%d):\n", 88 "(%d tasks refusing to freeze, wq_busy=%d):\n",
85 wakeup ? "aborted" : "failed", 89 wakeup ? "aborted" : "failed",
86 elapsed_csecs / 100, elapsed_csecs % 100, 90 elapsed_msecs / 1000, elapsed_msecs % 1000,
87 todo - wq_busy, wq_busy); 91 todo - wq_busy, wq_busy);
88 92
89 if (!wakeup) { 93 if (!wakeup) {
@@ -96,8 +100,8 @@ static int try_to_freeze_tasks(bool user_only)
96 read_unlock(&tasklist_lock); 100 read_unlock(&tasklist_lock);
97 } 101 }
98 } else { 102 } else {
99 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, 103 printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
100 elapsed_csecs % 100); 104 elapsed_msecs % 1000);
101 } 105 }
102 106
103 return todo ? -EBUSY : 0; 107 return todo ? -EBUSY : 0;
@@ -105,6 +109,8 @@ static int try_to_freeze_tasks(bool user_only)
105 109
106/** 110/**
107 * freeze_processes - Signal user space processes to enter the refrigerator. 111 * freeze_processes - Signal user space processes to enter the refrigerator.
112 * The current thread will not be frozen. The same process that calls
113 * freeze_processes must later call thaw_processes.
108 * 114 *
109 * On success, returns 0. On failure, -errno and system is fully thawed. 115 * On success, returns 0. On failure, -errno and system is fully thawed.
110 */ 116 */
@@ -116,6 +122,9 @@ int freeze_processes(void)
116 if (error) 122 if (error)
117 return error; 123 return error;
118 124
125 /* Make sure this task doesn't get frozen */
126 current->flags |= PF_SUSPEND_TASK;
127
119 if (!pm_freezing) 128 if (!pm_freezing)
120 atomic_inc(&system_freezing_cnt); 129 atomic_inc(&system_freezing_cnt);
121 130
@@ -164,6 +173,7 @@ int freeze_kernel_threads(void)
164void thaw_processes(void) 173void thaw_processes(void)
165{ 174{
166 struct task_struct *g, *p; 175 struct task_struct *g, *p;
176 struct task_struct *curr = current;
167 177
168 if (pm_freezing) 178 if (pm_freezing)
169 atomic_dec(&system_freezing_cnt); 179 atomic_dec(&system_freezing_cnt);
@@ -178,10 +188,15 @@ void thaw_processes(void)
178 188
179 read_lock(&tasklist_lock); 189 read_lock(&tasklist_lock);
180 do_each_thread(g, p) { 190 do_each_thread(g, p) {
191 /* No other threads should have PF_SUSPEND_TASK set */
192 WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK));
181 __thaw_task(p); 193 __thaw_task(p);
182 } while_each_thread(g, p); 194 } while_each_thread(g, p);
183 read_unlock(&tasklist_lock); 195 read_unlock(&tasklist_lock);
184 196
197 WARN_ON(!(curr->flags & PF_SUSPEND_TASK));
198 curr->flags &= ~PF_SUSPEND_TASK;
199
185 usermodehelper_enable(); 200 usermodehelper_enable();
186 201
187 schedule(); 202 schedule();
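
The try_to_freeze_tasks() hunks above replace the fixed msleep(10) between retries with usleep_range() plus exponential backoff (roughly 1 ms, doubling up to an 8 ms ceiling) and switch the elapsed-time reporting from centiseconds to milliseconds. A small userspace sketch of the same backoff shape, assuming a fake todo() predicate in place of the count of not-yet-frozen tasks and plain usleep() since usleep_range() is kernel-only:

#include <stdio.h>
#include <unistd.h>

#define USEC_PER_MSEC 1000

static int todo(void)
{
        static int remaining = 5;       /* pretend 5 tasks still need to freeze */
        return remaining-- > 0;
}

int main(void)
{
        int sleep_usecs = USEC_PER_MSEC;

        while (todo()) {
                /* give the stragglers some time; back off 1 ms -> 2 -> 4 -> 8 ms */
                printf("not done yet, sleeping %d us\n", sleep_usecs);
                usleep(sleep_usecs);
                if (sleep_usecs < 8 * USEC_PER_MSEC)
                        sleep_usecs *= 2;
        }
        printf("all tasks frozen\n");
        return 0;
}
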
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 587dddeebf15..06fe28589e9c 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -44,6 +44,7 @@
44 44
45#include <linux/uaccess.h> 45#include <linux/uaccess.h>
46#include <linux/export.h> 46#include <linux/export.h>
47#include <trace/events/power.h>
47 48
48/* 49/*
49 * locking rule: all changes to constraints or notifiers lists 50 * locking rule: all changes to constraints or notifiers lists
@@ -202,6 +203,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
202 203
203 spin_unlock_irqrestore(&pm_qos_lock, flags); 204 spin_unlock_irqrestore(&pm_qos_lock, flags);
204 205
206 trace_pm_qos_update_target(action, prev_value, curr_value);
205 if (prev_value != curr_value) { 207 if (prev_value != curr_value) {
206 blocking_notifier_call_chain(c->notifiers, 208 blocking_notifier_call_chain(c->notifiers,
207 (unsigned long)curr_value, 209 (unsigned long)curr_value,
@@ -272,6 +274,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf,
272 274
273 spin_unlock_irqrestore(&pm_qos_lock, irqflags); 275 spin_unlock_irqrestore(&pm_qos_lock, irqflags);
274 276
277 trace_pm_qos_update_flags(action, prev_value, curr_value);
275 return prev_value != curr_value; 278 return prev_value != curr_value;
276} 279}
277 280
@@ -333,6 +336,7 @@ void pm_qos_add_request(struct pm_qos_request *req,
333 } 336 }
334 req->pm_qos_class = pm_qos_class; 337 req->pm_qos_class = pm_qos_class;
335 INIT_DELAYED_WORK(&req->work, pm_qos_work_fn); 338 INIT_DELAYED_WORK(&req->work, pm_qos_work_fn);
339 trace_pm_qos_add_request(pm_qos_class, value);
336 pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, 340 pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
337 &req->node, PM_QOS_ADD_REQ, value); 341 &req->node, PM_QOS_ADD_REQ, value);
338} 342}
@@ -361,6 +365,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
361 365
362 cancel_delayed_work_sync(&req->work); 366 cancel_delayed_work_sync(&req->work);
363 367
368 trace_pm_qos_update_request(req->pm_qos_class, new_value);
364 if (new_value != req->node.prio) 369 if (new_value != req->node.prio)
365 pm_qos_update_target( 370 pm_qos_update_target(
366 pm_qos_array[req->pm_qos_class]->constraints, 371 pm_qos_array[req->pm_qos_class]->constraints,
@@ -387,6 +392,8 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
387 392
388 cancel_delayed_work_sync(&req->work); 393 cancel_delayed_work_sync(&req->work);
389 394
395 trace_pm_qos_update_request_timeout(req->pm_qos_class,
396 new_value, timeout_us);
390 if (new_value != req->node.prio) 397 if (new_value != req->node.prio)
391 pm_qos_update_target( 398 pm_qos_update_target(
392 pm_qos_array[req->pm_qos_class]->constraints, 399 pm_qos_array[req->pm_qos_class]->constraints,
@@ -416,6 +423,7 @@ void pm_qos_remove_request(struct pm_qos_request *req)
416 423
417 cancel_delayed_work_sync(&req->work); 424 cancel_delayed_work_sync(&req->work);
418 425
426 trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE);
419 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, 427 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
420 &req->node, PM_QOS_REMOVE_REQ, 428 &req->node, PM_QOS_REMOVE_REQ,
421 PM_QOS_DEFAULT_VALUE); 429 PM_QOS_DEFAULT_VALUE);
@@ -477,7 +485,7 @@ static int find_pm_qos_object_by_minor(int minor)
477{ 485{
478 int pm_qos_class; 486 int pm_qos_class;
479 487
480 for (pm_qos_class = 0; 488 for (pm_qos_class = PM_QOS_CPU_DMA_LATENCY;
481 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { 489 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
482 if (minor == 490 if (minor ==
483 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) 491 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
@@ -491,7 +499,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
491 long pm_qos_class; 499 long pm_qos_class;
492 500
493 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 501 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
494 if (pm_qos_class >= 0) { 502 if (pm_qos_class >= PM_QOS_CPU_DMA_LATENCY) {
495 struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL); 503 struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL);
496 if (!req) 504 if (!req)
497 return -ENOMEM; 505 return -ENOMEM;
@@ -584,7 +592,7 @@ static int __init pm_qos_power_init(void)
584 592
585 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); 593 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
586 594
587 for (i = 1; i < PM_QOS_NUM_CLASSES; i++) { 595 for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
588 ret = register_pm_qos_misc(pm_qos_array[i]); 596 ret = register_pm_qos_misc(pm_qos_array[i]);
589 if (ret < 0) { 597 if (ret < 0) {
590 printk(KERN_ERR "pm_qos_param: %s setup failed\n", 598 printk(KERN_ERR "pm_qos_param: %s setup failed\n",
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0de28576807d..349587bb03e1 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -642,8 +642,9 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
642 region->end_pfn = end_pfn; 642 region->end_pfn = end_pfn;
643 list_add_tail(&region->list, &nosave_regions); 643 list_add_tail(&region->list, &nosave_regions);
644 Report: 644 Report:
645 printk(KERN_INFO "PM: Registered nosave memory: %016lx - %016lx\n", 645 printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n",
646 start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); 646 (unsigned long long) start_pfn << PAGE_SHIFT,
647 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
647} 648}
648 649
649/* 650/*
@@ -1651,7 +1652,7 @@ unsigned long snapshot_get_image_size(void)
1651static int init_header(struct swsusp_info *info) 1652static int init_header(struct swsusp_info *info)
1652{ 1653{
1653 memset(info, 0, sizeof(struct swsusp_info)); 1654 memset(info, 0, sizeof(struct swsusp_info));
1654 info->num_physpages = num_physpages; 1655 info->num_physpages = get_num_physpages();
1655 info->image_pages = nr_copy_pages; 1656 info->image_pages = nr_copy_pages;
1656 info->pages = snapshot_get_image_size(); 1657 info->pages = snapshot_get_image_size();
1657 info->size = info->pages; 1658 info->size = info->pages;
@@ -1795,7 +1796,7 @@ static int check_header(struct swsusp_info *info)
1795 char *reason; 1796 char *reason;
1796 1797
1797 reason = check_image_kernel(info); 1798 reason = check_image_kernel(info);
1798 if (!reason && info->num_physpages != num_physpages) 1799 if (!reason && info->num_physpages != get_num_physpages())
1799 reason = "memory size"; 1800 reason = "memory size";
1800 if (reason) { 1801 if (reason) {
1801 printk(KERN_ERR "PM: Image mismatch: %s\n", reason); 1802 printk(KERN_ERR "PM: Image mismatch: %s\n", reason);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index bef86d121eb2..ece04223bb1e 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -269,7 +269,7 @@ int suspend_devices_and_enter(suspend_state_t state)
269 suspend_test_start(); 269 suspend_test_start();
270 error = dpm_suspend_start(PMSG_SUSPEND); 270 error = dpm_suspend_start(PMSG_SUSPEND);
271 if (error) { 271 if (error) {
272 printk(KERN_ERR "PM: Some devices failed to suspend\n"); 272 pr_err("PM: Some devices failed to suspend, or early wake event detected\n");
273 goto Recover_platform; 273 goto Recover_platform;
274 } 274 }
275 suspend_test_finish("suspend devices"); 275 suspend_test_finish("suspend devices");
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
new file mode 100644
index 000000000000..85405bdcf2b3
--- /dev/null
+++ b/kernel/printk/Makefile
@@ -0,0 +1,2 @@
1obj-y = printk.o
2obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
new file mode 100644
index 000000000000..276762f3a460
--- /dev/null
+++ b/kernel/printk/braille.c
@@ -0,0 +1,49 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
3#include <linux/kernel.h>
4#include <linux/console.h>
5#include <linux/string.h>
6
7#include "console_cmdline.h"
8#include "braille.h"
9
10char *_braille_console_setup(char **str, char **brl_options)
11{
12 if (!memcmp(*str, "brl,", 4)) {
13 *brl_options = "";
14 *str += 4;
15 } else if (!memcmp(*str, "brl=", 4)) {
16 *brl_options = *str + 4;
17 *str = strchr(*brl_options, ',');
18 if (!*str)
19 pr_err("need port name after brl=\n");
20 else
21 *((*str)++) = 0;
22 } else
23 return NULL;
24
25 return *str;
26}
27
28int
29_braille_register_console(struct console *console, struct console_cmdline *c)
30{
31 int rtn = 0;
32
33 if (c->brl_options) {
34 console->flags |= CON_BRL;
35 rtn = braille_register_console(console, c->index, c->options,
36 c->brl_options);
37 }
38
39 return rtn;
40}
41
42int
43_braille_unregister_console(struct console *console)
44{
45 if (console->flags & CON_BRL)
46 return braille_unregister_console(console);
47
48 return 0;
49}
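
Note that _braille_console_setup() receives a char **, so both prefix checks have to look at *str. The standalone harness below exercises the same parsing on the two accepted forms, "brl,<console options>" and "brl=<braille options>,<console options>"; parse_brl() is a re-typed userspace copy for illustration, not a kernel function.

#include <stdio.h>
#include <string.h>

static char *parse_brl(char **str, char **brl_options)
{
        if (!memcmp(*str, "brl,", 4)) {
                *brl_options = "";
                *str += 4;
        } else if (!memcmp(*str, "brl=", 4)) {
                *brl_options = *str + 4;
                *str = strchr(*brl_options, ',');
                if (!*str)
                        fprintf(stderr, "need port name after brl=\n");
                else
                        *((*str)++) = 0;        /* split options from the port name */
        } else {
                return NULL;
        }
        return *str;
}

int main(void)
{
        char a[] = "brl,ttyS0,115200";
        char b[] = "brl=usb,ttyS1";
        char *s, *opts;

        s = a; opts = NULL;
        parse_brl(&s, &opts);
        printf("rest=\"%s\" brl_options=\"%s\"\n", s, opts);    /* ttyS0,115200 and "" */

        s = b; opts = NULL;
        parse_brl(&s, &opts);
        printf("rest=\"%s\" brl_options=\"%s\"\n", s, opts);    /* ttyS1 and usb */
        return 0;
}
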
diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h
new file mode 100644
index 000000000000..769d771145c8
--- /dev/null
+++ b/kernel/printk/braille.h
@@ -0,0 +1,48 @@
1#ifndef _PRINTK_BRAILLE_H
2#define _PRINTK_BRAILLE_H
3
4#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
5
6static inline void
7braille_set_options(struct console_cmdline *c, char *brl_options)
8{
9 c->brl_options = brl_options;
10}
11
12char *
13_braille_console_setup(char **str, char **brl_options);
14
15int
16_braille_register_console(struct console *console, struct console_cmdline *c);
17
18int
19_braille_unregister_console(struct console *console);
20
21#else
22
23static inline void
24braille_set_options(struct console_cmdline *c, char *brl_options)
25{
26}
27
28static inline char *
29_braille_console_setup(char **str, char **brl_options)
30{
31 return NULL;
32}
33
34static inline int
35_braille_register_console(struct console *console, struct console_cmdline *c)
36{
37 return 0;
38}
39
40static inline int
41_braille_unregister_console(struct console *console)
42{
43 return 0;
44}
45
46#endif
47
48#endif
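
braille.h follows the usual kernel header pattern: real prototypes when CONFIG_A11Y_BRAILLE_CONSOLE is set, static inline no-op stubs otherwise, so printk.c can call the helpers unconditionally and the #ifdef lives in exactly one place. A toy single-file version of that pattern, with HAVE_FEATURE_X and feature_x_register() as made-up names:

#include <stdio.h>

/* --- what would live in the header --- */
#ifdef HAVE_FEATURE_X
/* "real" implementation; in the kernel this would live in its own .c file */
int feature_x_register(const char *name)
{
        printf("registering %s with feature X\n", name);
        return 1;
}
#else
static inline int feature_x_register(const char *name)
{
        (void)name;
        return 0;               /* harmless no-op when the feature is compiled out */
}
#endif

/* --- what would live in the caller --- */
int main(void)
{
        /* no #ifdef needed at the call site; it compiles either way */
        int ret = feature_x_register("console0");

        printf("feature_x_register() returned %d\n", ret);
        return 0;
}
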
diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h
new file mode 100644
index 000000000000..cbd69d842341
--- /dev/null
+++ b/kernel/printk/console_cmdline.h
@@ -0,0 +1,14 @@
1#ifndef _CONSOLE_CMDLINE_H
2#define _CONSOLE_CMDLINE_H
3
4struct console_cmdline
5{
6 char name[8]; /* Name of the driver */
7 int index; /* Minor dev. to use */
8 char *options; /* Options for the driver */
9#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
10 char *brl_options; /* Options for braille driver */
11#endif
12};
13
14#endif
diff --git a/kernel/printk.c b/kernel/printk/printk.c
index 8212c1aef125..5b5a7080e2a5 100644
--- a/kernel/printk.c
+++ b/kernel/printk/printk.c
@@ -51,6 +51,9 @@
51#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
52#include <trace/events/printk.h> 52#include <trace/events/printk.h>
53 53
54#include "console_cmdline.h"
55#include "braille.h"
56
54/* printk's without a loglevel use this.. */ 57/* printk's without a loglevel use this.. */
55#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL 58#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
56 59
@@ -105,19 +108,11 @@ static struct console *exclusive_console;
105/* 108/*
106 * Array of consoles built from command line options (console=) 109 * Array of consoles built from command line options (console=)
107 */ 110 */
108struct console_cmdline
109{
110 char name[8]; /* Name of the driver */
111 int index; /* Minor dev. to use */
112 char *options; /* Options for the driver */
113#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
114 char *brl_options; /* Options for braille driver */
115#endif
116};
117 111
118#define MAX_CMDLINECONSOLES 8 112#define MAX_CMDLINECONSOLES 8
119 113
120static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; 114static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
115
121static int selected_console = -1; 116static int selected_console = -1;
122static int preferred_console = -1; 117static int preferred_console = -1;
123int console_set_on_cmdline; 118int console_set_on_cmdline;
@@ -178,7 +173,7 @@ static int console_may_schedule;
178 * 67 "g" 173 * 67 "g"
179 * 0032 00 00 00 padding to next message header 174 * 0032 00 00 00 padding to next message header
180 * 175 *
181 * The 'struct log' buffer header must never be directly exported to 176 * The 'struct printk_log' buffer header must never be directly exported to
182 * userspace, it is a kernel-private implementation detail that might 177 * userspace, it is a kernel-private implementation detail that might
183 * need to be changed in the future, when the requirements change. 178 * need to be changed in the future, when the requirements change.
184 * 179 *
@@ -200,7 +195,7 @@ enum log_flags {
200 LOG_CONT = 8, /* text is a fragment of a continuation line */ 195 LOG_CONT = 8, /* text is a fragment of a continuation line */
201}; 196};
202 197
203struct log { 198struct printk_log {
204 u64 ts_nsec; /* timestamp in nanoseconds */ 199 u64 ts_nsec; /* timestamp in nanoseconds */
205 u16 len; /* length of entire record */ 200 u16 len; /* length of entire record */
206 u16 text_len; /* length of text buffer */ 201 u16 text_len; /* length of text buffer */
@@ -248,7 +243,7 @@ static u32 clear_idx;
248#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 243#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
249#define LOG_ALIGN 4 244#define LOG_ALIGN 4
250#else 245#else
251#define LOG_ALIGN __alignof__(struct log) 246#define LOG_ALIGN __alignof__(struct printk_log)
252#endif 247#endif
253#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 248#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
254static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); 249static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
@@ -259,35 +254,35 @@ static u32 log_buf_len = __LOG_BUF_LEN;
259static volatile unsigned int logbuf_cpu = UINT_MAX; 254static volatile unsigned int logbuf_cpu = UINT_MAX;
260 255
261/* human readable text of the record */ 256/* human readable text of the record */
262static char *log_text(const struct log *msg) 257static char *log_text(const struct printk_log *msg)
263{ 258{
264 return (char *)msg + sizeof(struct log); 259 return (char *)msg + sizeof(struct printk_log);
265} 260}
266 261
267/* optional key/value pair dictionary attached to the record */ 262/* optional key/value pair dictionary attached to the record */
268static char *log_dict(const struct log *msg) 263static char *log_dict(const struct printk_log *msg)
269{ 264{
270 return (char *)msg + sizeof(struct log) + msg->text_len; 265 return (char *)msg + sizeof(struct printk_log) + msg->text_len;
271} 266}
272 267
273/* get record by index; idx must point to valid msg */ 268/* get record by index; idx must point to valid msg */
274static struct log *log_from_idx(u32 idx) 269static struct printk_log *log_from_idx(u32 idx)
275{ 270{
276 struct log *msg = (struct log *)(log_buf + idx); 271 struct printk_log *msg = (struct printk_log *)(log_buf + idx);
277 272
278 /* 273 /*
279 * A length == 0 record is the end of buffer marker. Wrap around and 274 * A length == 0 record is the end of buffer marker. Wrap around and
280 * read the message at the start of the buffer. 275 * read the message at the start of the buffer.
281 */ 276 */
282 if (!msg->len) 277 if (!msg->len)
283 return (struct log *)log_buf; 278 return (struct printk_log *)log_buf;
284 return msg; 279 return msg;
285} 280}
286 281
287/* get next record; idx must point to valid msg */ 282/* get next record; idx must point to valid msg */
288static u32 log_next(u32 idx) 283static u32 log_next(u32 idx)
289{ 284{
290 struct log *msg = (struct log *)(log_buf + idx); 285 struct printk_log *msg = (struct printk_log *)(log_buf + idx);
291 286
292 /* length == 0 indicates the end of the buffer; wrap */ 287 /* length == 0 indicates the end of the buffer; wrap */
293 /* 288 /*
@@ -296,7 +291,7 @@ static u32 log_next(u32 idx)
296 * return the one after that. 291 * return the one after that.
297 */ 292 */
298 if (!msg->len) { 293 if (!msg->len) {
299 msg = (struct log *)log_buf; 294 msg = (struct printk_log *)log_buf;
300 return msg->len; 295 return msg->len;
301 } 296 }
302 return idx + msg->len; 297 return idx + msg->len;
@@ -308,11 +303,11 @@ static void log_store(int facility, int level,
308 const char *dict, u16 dict_len, 303 const char *dict, u16 dict_len,
309 const char *text, u16 text_len) 304 const char *text, u16 text_len)
310{ 305{
311 struct log *msg; 306 struct printk_log *msg;
312 u32 size, pad_len; 307 u32 size, pad_len;
313 308
314 /* number of '\0' padding bytes to next message */ 309 /* number of '\0' padding bytes to next message */
315 size = sizeof(struct log) + text_len + dict_len; 310 size = sizeof(struct printk_log) + text_len + dict_len;
316 pad_len = (-size) & (LOG_ALIGN - 1); 311 pad_len = (-size) & (LOG_ALIGN - 1);
317 size += pad_len; 312 size += pad_len;
318 313
@@ -324,7 +319,7 @@ static void log_store(int facility, int level,
324 else 319 else
325 free = log_first_idx - log_next_idx; 320 free = log_first_idx - log_next_idx;
326 321
327 if (free > size + sizeof(struct log)) 322 if (free > size + sizeof(struct printk_log))
328 break; 323 break;
329 324
330 /* drop old messages until we have enough contiuous space */ 325 /* drop old messages until we have enough contiuous space */
@@ -332,18 +327,18 @@ static void log_store(int facility, int level,
332 log_first_seq++; 327 log_first_seq++;
333 } 328 }
334 329
335 if (log_next_idx + size + sizeof(struct log) >= log_buf_len) { 330 if (log_next_idx + size + sizeof(struct printk_log) >= log_buf_len) {
336 /* 331 /*
337 * This message + an additional empty header does not fit 332 * This message + an additional empty header does not fit
338 * at the end of the buffer. Add an empty header with len == 0 333 * at the end of the buffer. Add an empty header with len == 0
339 * to signify a wrap around. 334 * to signify a wrap around.
340 */ 335 */
341 memset(log_buf + log_next_idx, 0, sizeof(struct log)); 336 memset(log_buf + log_next_idx, 0, sizeof(struct printk_log));
342 log_next_idx = 0; 337 log_next_idx = 0;
343 } 338 }
344 339
345 /* fill message */ 340 /* fill message */
346 msg = (struct log *)(log_buf + log_next_idx); 341 msg = (struct printk_log *)(log_buf + log_next_idx);
347 memcpy(log_text(msg), text, text_len); 342 memcpy(log_text(msg), text, text_len);
348 msg->text_len = text_len; 343 msg->text_len = text_len;
349 memcpy(log_dict(msg), dict, dict_len); 344 memcpy(log_dict(msg), dict, dict_len);
@@ -356,7 +351,7 @@ static void log_store(int facility, int level,
356 else 351 else
357 msg->ts_nsec = local_clock(); 352 msg->ts_nsec = local_clock();
358 memset(log_dict(msg) + dict_len, 0, pad_len); 353 memset(log_dict(msg) + dict_len, 0, pad_len);
359 msg->len = sizeof(struct log) + text_len + dict_len + pad_len; 354 msg->len = sizeof(struct printk_log) + text_len + dict_len + pad_len;
360 355
361 /* insert message */ 356 /* insert message */
362 log_next_idx += msg->len; 357 log_next_idx += msg->len;
@@ -479,7 +474,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
479 size_t count, loff_t *ppos) 474 size_t count, loff_t *ppos)
480{ 475{
481 struct devkmsg_user *user = file->private_data; 476 struct devkmsg_user *user = file->private_data;
482 struct log *msg; 477 struct printk_log *msg;
483 u64 ts_usec; 478 u64 ts_usec;
484 size_t i; 479 size_t i;
485 char cont = '-'; 480 char cont = '-';
@@ -724,14 +719,14 @@ void log_buf_kexec_setup(void)
724 VMCOREINFO_SYMBOL(log_first_idx); 719 VMCOREINFO_SYMBOL(log_first_idx);
725 VMCOREINFO_SYMBOL(log_next_idx); 720 VMCOREINFO_SYMBOL(log_next_idx);
726 /* 721 /*
727 * Export struct log size and field offsets. User space tools can 722 * Export struct printk_log size and field offsets. User space tools can
728 * parse it and detect any changes to structure down the line. 723 * parse it and detect any changes to structure down the line.
729 */ 724 */
730 VMCOREINFO_STRUCT_SIZE(log); 725 VMCOREINFO_STRUCT_SIZE(printk_log);
731 VMCOREINFO_OFFSET(log, ts_nsec); 726 VMCOREINFO_OFFSET(printk_log, ts_nsec);
732 VMCOREINFO_OFFSET(log, len); 727 VMCOREINFO_OFFSET(printk_log, len);
733 VMCOREINFO_OFFSET(log, text_len); 728 VMCOREINFO_OFFSET(printk_log, text_len);
734 VMCOREINFO_OFFSET(log, dict_len); 729 VMCOREINFO_OFFSET(printk_log, dict_len);
735} 730}
736#endif 731#endif
737 732
@@ -884,7 +879,7 @@ static size_t print_time(u64 ts, char *buf)
884 (unsigned long)ts, rem_nsec / 1000); 879 (unsigned long)ts, rem_nsec / 1000);
885} 880}
886 881
887static size_t print_prefix(const struct log *msg, bool syslog, char *buf) 882static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf)
888{ 883{
889 size_t len = 0; 884 size_t len = 0;
890 unsigned int prefix = (msg->facility << 3) | msg->level; 885 unsigned int prefix = (msg->facility << 3) | msg->level;
@@ -907,7 +902,7 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
907 return len; 902 return len;
908} 903}
909 904
910static size_t msg_print_text(const struct log *msg, enum log_flags prev, 905static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
911 bool syslog, char *buf, size_t size) 906 bool syslog, char *buf, size_t size)
912{ 907{
913 const char *text = log_text(msg); 908 const char *text = log_text(msg);
@@ -969,7 +964,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev,
969static int syslog_print(char __user *buf, int size) 964static int syslog_print(char __user *buf, int size)
970{ 965{
971 char *text; 966 char *text;
972 struct log *msg; 967 struct printk_log *msg;
973 int len = 0; 968 int len = 0;
974 969
975 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); 970 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
@@ -1060,7 +1055,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1060 idx = clear_idx; 1055 idx = clear_idx;
1061 prev = 0; 1056 prev = 0;
1062 while (seq < log_next_seq) { 1057 while (seq < log_next_seq) {
1063 struct log *msg = log_from_idx(idx); 1058 struct printk_log *msg = log_from_idx(idx);
1064 1059
1065 len += msg_print_text(msg, prev, true, NULL, 0); 1060 len += msg_print_text(msg, prev, true, NULL, 0);
1066 prev = msg->flags; 1061 prev = msg->flags;
@@ -1073,7 +1068,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1073 idx = clear_idx; 1068 idx = clear_idx;
1074 prev = 0; 1069 prev = 0;
1075 while (len > size && seq < log_next_seq) { 1070 while (len > size && seq < log_next_seq) {
1076 struct log *msg = log_from_idx(idx); 1071 struct printk_log *msg = log_from_idx(idx);
1077 1072
1078 len -= msg_print_text(msg, prev, true, NULL, 0); 1073 len -= msg_print_text(msg, prev, true, NULL, 0);
1079 prev = msg->flags; 1074 prev = msg->flags;
@@ -1087,7 +1082,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1087 len = 0; 1082 len = 0;
1088 prev = 0; 1083 prev = 0;
1089 while (len >= 0 && seq < next_seq) { 1084 while (len >= 0 && seq < next_seq) {
1090 struct log *msg = log_from_idx(idx); 1085 struct printk_log *msg = log_from_idx(idx);
1091 int textlen; 1086 int textlen;
1092 1087
1093 textlen = msg_print_text(msg, prev, true, text, 1088 textlen = msg_print_text(msg, prev, true, text,
@@ -1233,7 +1228,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1233 1228
1234 error = 0; 1229 error = 0;
1235 while (seq < log_next_seq) { 1230 while (seq < log_next_seq) {
1236 struct log *msg = log_from_idx(idx); 1231 struct printk_log *msg = log_from_idx(idx);
1237 1232
1238 error += msg_print_text(msg, prev, true, NULL, 0); 1233 error += msg_print_text(msg, prev, true, NULL, 0);
1239 idx = log_next(idx); 1234 idx = log_next(idx);
@@ -1369,9 +1364,9 @@ static int console_trylock_for_printk(unsigned int cpu)
1369 } 1364 }
1370 } 1365 }
1371 logbuf_cpu = UINT_MAX; 1366 logbuf_cpu = UINT_MAX;
1367 raw_spin_unlock(&logbuf_lock);
1372 if (wake) 1368 if (wake)
1373 up(&console_sem); 1369 up(&console_sem);
1374 raw_spin_unlock(&logbuf_lock);
1375 return retval; 1370 return retval;
1376} 1371}
1377 1372
@@ -1719,10 +1714,10 @@ static struct cont {
1719 u8 level; 1714 u8 level;
1720 bool flushed:1; 1715 bool flushed:1;
1721} cont; 1716} cont;
1722static struct log *log_from_idx(u32 idx) { return NULL; } 1717static struct printk_log *log_from_idx(u32 idx) { return NULL; }
1723static u32 log_next(u32 idx) { return 0; } 1718static u32 log_next(u32 idx) { return 0; }
1724static void call_console_drivers(int level, const char *text, size_t len) {} 1719static void call_console_drivers(int level, const char *text, size_t len) {}
1725static size_t msg_print_text(const struct log *msg, enum log_flags prev, 1720static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
1726 bool syslog, char *buf, size_t size) { return 0; } 1721 bool syslog, char *buf, size_t size) { return 0; }
1727static size_t cont_print_text(char *text, size_t size) { return 0; } 1722static size_t cont_print_text(char *text, size_t size) { return 0; }
1728 1723
@@ -1761,23 +1756,23 @@ static int __add_preferred_console(char *name, int idx, char *options,
1761 * See if this tty is not yet registered, and 1756 * See if this tty is not yet registered, and
1762 * if we have a slot free. 1757 * if we have a slot free.
1763 */ 1758 */
1764 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) 1759 for (i = 0, c = console_cmdline;
1765 if (strcmp(console_cmdline[i].name, name) == 0 && 1760 i < MAX_CMDLINECONSOLES && c->name[0];
1766 console_cmdline[i].index == idx) { 1761 i++, c++) {
1767 if (!brl_options) 1762 if (strcmp(c->name, name) == 0 && c->index == idx) {
1768 selected_console = i; 1763 if (!brl_options)
1769 return 0; 1764 selected_console = i;
1765 return 0;
1770 } 1766 }
1767 }
1771 if (i == MAX_CMDLINECONSOLES) 1768 if (i == MAX_CMDLINECONSOLES)
1772 return -E2BIG; 1769 return -E2BIG;
1773 if (!brl_options) 1770 if (!brl_options)
1774 selected_console = i; 1771 selected_console = i;
1775 c = &console_cmdline[i];
1776 strlcpy(c->name, name, sizeof(c->name)); 1772 strlcpy(c->name, name, sizeof(c->name));
1777 c->options = options; 1773 c->options = options;
1778#ifdef CONFIG_A11Y_BRAILLE_CONSOLE 1774 braille_set_options(c, brl_options);
1779 c->brl_options = brl_options; 1775
1780#endif
1781 c->index = idx; 1776 c->index = idx;
1782 return 0; 1777 return 0;
1783} 1778}
@@ -1790,20 +1785,8 @@ static int __init console_setup(char *str)
1790 char *s, *options, *brl_options = NULL; 1785 char *s, *options, *brl_options = NULL;
1791 int idx; 1786 int idx;
1792 1787
1793#ifdef CONFIG_A11Y_BRAILLE_CONSOLE 1788 if (_braille_console_setup(&str, &brl_options))
1794 if (!memcmp(str, "brl,", 4)) { 1789 return 1;
1795 brl_options = "";
1796 str += 4;
1797 } else if (!memcmp(str, "brl=", 4)) {
1798 brl_options = str + 4;
1799 str = strchr(brl_options, ',');
1800 if (!str) {
1801 printk(KERN_ERR "need port name after brl=\n");
1802 return 1;
1803 }
1804 *(str++) = 0;
1805 }
1806#endif
1807 1790
1808 /* 1791 /*
1809 * Decode str into name, index, options. 1792 * Decode str into name, index, options.
@@ -1858,15 +1841,15 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
1858 struct console_cmdline *c; 1841 struct console_cmdline *c;
1859 int i; 1842 int i;
1860 1843
1861 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) 1844 for (i = 0, c = console_cmdline;
1862 if (strcmp(console_cmdline[i].name, name) == 0 && 1845 i < MAX_CMDLINECONSOLES && c->name[0];
1863 console_cmdline[i].index == idx) { 1846 i++, c++)
1864 c = &console_cmdline[i]; 1847 if (strcmp(c->name, name) == 0 && c->index == idx) {
1865 strlcpy(c->name, name_new, sizeof(c->name)); 1848 strlcpy(c->name, name_new, sizeof(c->name));
1866 c->name[sizeof(c->name) - 1] = 0; 1849 c->name[sizeof(c->name) - 1] = 0;
1867 c->options = options; 1850 c->options = options;
1868 c->index = idx_new; 1851 c->index = idx_new;
1869 return i; 1852 return i;
1870 } 1853 }
1871 /* not found */ 1854 /* not found */
1872 return -1; 1855 return -1;
@@ -1921,7 +1904,7 @@ void resume_console(void)
1921 * called when a new CPU comes online (or fails to come up), and ensures 1904 * called when a new CPU comes online (or fails to come up), and ensures
1922 * that any such output gets printed. 1905 * that any such output gets printed.
1923 */ 1906 */
1924static int __cpuinit console_cpu_notify(struct notifier_block *self, 1907static int console_cpu_notify(struct notifier_block *self,
1925 unsigned long action, void *hcpu) 1908 unsigned long action, void *hcpu)
1926{ 1909{
1927 switch (action) { 1910 switch (action) {
@@ -2046,7 +2029,7 @@ void console_unlock(void)
2046 console_cont_flush(text, sizeof(text)); 2029 console_cont_flush(text, sizeof(text));
2047again: 2030again:
2048 for (;;) { 2031 for (;;) {
2049 struct log *msg; 2032 struct printk_log *msg;
2050 size_t len; 2033 size_t len;
2051 int level; 2034 int level;
2052 2035
@@ -2241,6 +2224,7 @@ void register_console(struct console *newcon)
2241 int i; 2224 int i;
2242 unsigned long flags; 2225 unsigned long flags;
2243 struct console *bcon = NULL; 2226 struct console *bcon = NULL;
2227 struct console_cmdline *c;
2244 2228
2245 /* 2229 /*
2246 * before we register a new CON_BOOT console, make sure we don't 2230 * before we register a new CON_BOOT console, make sure we don't
@@ -2288,30 +2272,25 @@ void register_console(struct console *newcon)
2288 * See if this console matches one we selected on 2272 * See if this console matches one we selected on
2289 * the command line. 2273 * the command line.
2290 */ 2274 */
2291 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; 2275 for (i = 0, c = console_cmdline;
2292 i++) { 2276 i < MAX_CMDLINECONSOLES && c->name[0];
2293 if (strcmp(console_cmdline[i].name, newcon->name) != 0) 2277 i++, c++) {
2278 if (strcmp(c->name, newcon->name) != 0)
2294 continue; 2279 continue;
2295 if (newcon->index >= 0 && 2280 if (newcon->index >= 0 &&
2296 newcon->index != console_cmdline[i].index) 2281 newcon->index != c->index)
2297 continue; 2282 continue;
2298 if (newcon->index < 0) 2283 if (newcon->index < 0)
2299 newcon->index = console_cmdline[i].index; 2284 newcon->index = c->index;
2300#ifdef CONFIG_A11Y_BRAILLE_CONSOLE 2285
2301 if (console_cmdline[i].brl_options) { 2286 if (_braille_register_console(newcon, c))
2302 newcon->flags |= CON_BRL;
2303 braille_register_console(newcon,
2304 console_cmdline[i].index,
2305 console_cmdline[i].options,
2306 console_cmdline[i].brl_options);
2307 return; 2287 return;
2308 } 2288
2309#endif
2310 if (newcon->setup && 2289 if (newcon->setup &&
2311 newcon->setup(newcon, console_cmdline[i].options) != 0) 2290 newcon->setup(newcon, console_cmdline[i].options) != 0)
2312 break; 2291 break;
2313 newcon->flags |= CON_ENABLED; 2292 newcon->flags |= CON_ENABLED;
2314 newcon->index = console_cmdline[i].index; 2293 newcon->index = c->index;
2315 if (i == selected_console) { 2294 if (i == selected_console) {
2316 newcon->flags |= CON_CONSDEV; 2295 newcon->flags |= CON_CONSDEV;
2317 preferred_console = selected_console; 2296 preferred_console = selected_console;
@@ -2394,13 +2373,13 @@ EXPORT_SYMBOL(register_console);
2394int unregister_console(struct console *console) 2373int unregister_console(struct console *console)
2395{ 2374{
2396 struct console *a, *b; 2375 struct console *a, *b;
2397 int res = 1; 2376 int res;
2398 2377
2399#ifdef CONFIG_A11Y_BRAILLE_CONSOLE 2378 res = _braille_unregister_console(console);
2400 if (console->flags & CON_BRL) 2379 if (res)
2401 return braille_unregister_console(console); 2380 return res;
2402#endif
2403 2381
2382 res = 1;
2404 console_lock(); 2383 console_lock();
2405 if (console_drivers == console) { 2384 if (console_drivers == console) {
2406 console_drivers=console->next; 2385 console_drivers=console->next;
@@ -2666,7 +2645,7 @@ void kmsg_dump(enum kmsg_dump_reason reason)
2666bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, 2645bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
2667 char *line, size_t size, size_t *len) 2646 char *line, size_t size, size_t *len)
2668{ 2647{
2669 struct log *msg; 2648 struct printk_log *msg;
2670 size_t l = 0; 2649 size_t l = 0;
2671 bool ret = false; 2650 bool ret = false;
2672 2651
@@ -2778,7 +2757,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2778 idx = dumper->cur_idx; 2757 idx = dumper->cur_idx;
2779 prev = 0; 2758 prev = 0;
2780 while (seq < dumper->next_seq) { 2759 while (seq < dumper->next_seq) {
2781 struct log *msg = log_from_idx(idx); 2760 struct printk_log *msg = log_from_idx(idx);
2782 2761
2783 l += msg_print_text(msg, prev, true, NULL, 0); 2762 l += msg_print_text(msg, prev, true, NULL, 0);
2784 idx = log_next(idx); 2763 idx = log_next(idx);
@@ -2791,7 +2770,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2791 idx = dumper->cur_idx; 2770 idx = dumper->cur_idx;
2792 prev = 0; 2771 prev = 0;
2793 while (l > size && seq < dumper->next_seq) { 2772 while (l > size && seq < dumper->next_seq) {
2794 struct log *msg = log_from_idx(idx); 2773 struct printk_log *msg = log_from_idx(idx);
2795 2774
2796 l -= msg_print_text(msg, prev, true, NULL, 0); 2775 l -= msg_print_text(msg, prev, true, NULL, 0);
2797 idx = log_next(idx); 2776 idx = log_next(idx);
@@ -2806,7 +2785,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2806 l = 0; 2785 l = 0;
2807 prev = 0; 2786 prev = 0;
2808 while (seq < dumper->next_seq) { 2787 while (seq < dumper->next_seq) {
2809 struct log *msg = log_from_idx(idx); 2788 struct printk_log *msg = log_from_idx(idx);
2810 2789
2811 l += msg_print_text(msg, prev, syslog, buf + l, size - l); 2790 l += msg_print_text(msg, prev, syslog, buf + l, size - l);
2812 idx = log_next(idx); 2791 idx = log_next(idx);
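For context on the printk.c hunks above: the CONFIG_A11Y_BRAILLE_CONSOLE handling moves out of printk.c and behind a few helpers (per the diffstat, into the new kernel/printk/braille.[ch] files). A rough sketch of the wrapper pattern those call sites assume is shown below; the function names come from the diff itself, but the bodies, comments and exact signatures are inferred from the call sites rather than quoted from the new header.

/* Sketch only: shape of the braille wrappers assumed by the call sites above. */
#ifdef CONFIG_A11Y_BRAILLE_CONSOLE

static inline void
braille_set_options(struct console_cmdline *c, char *brl_options)
{
        c->brl_options = brl_options;
}

/* Strips a leading "brl," / "brl=port," prefix from *str; returns
 * non-zero only on a parse error, so console_setup() can bail out. */
int _braille_console_setup(char **str, char **brl_options);

/* Returns non-zero if it registered @console as a braille console,
 * in which case register_console() is done with it. */
int _braille_register_console(struct console *console,
                              struct console_cmdline *c);

/* Returns non-zero if @console was a braille console and has now
 * been unregistered. */
int _braille_unregister_console(struct console *console);

#else /* !CONFIG_A11Y_BRAILLE_CONSOLE */

static inline void
braille_set_options(struct console_cmdline *c, char *brl_options) { }
static inline int
_braille_console_setup(char **str, char **brl_options) { return 0; }
static inline int
_braille_register_console(struct console *console,
                          struct console_cmdline *c) { return 0; }
static inline int
_braille_unregister_console(struct console *console) { return 0; }

#endif /* CONFIG_A11Y_BRAILLE_CONSOLE */

The point of the split is that the #ifdef lives in one place, so console_setup(), register_console() and unregister_console() above each collapse to a single unconditional call.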
diff --git a/kernel/profile.c b/kernel/profile.c
index 0bf400737660..6631e1ef55ab 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -331,7 +331,7 @@ out:
331 put_cpu(); 331 put_cpu();
332} 332}
333 333
334static int __cpuinit profile_cpu_callback(struct notifier_block *info, 334static int profile_cpu_callback(struct notifier_block *info,
335 unsigned long action, void *__cpu) 335 unsigned long action, void *__cpu)
336{ 336{
337 int node, cpu = (unsigned long)__cpu; 337 int node, cpu = (unsigned long)__cpu;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 335a7ae697f5..a146ee327f6a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -844,6 +844,47 @@ int ptrace_request(struct task_struct *child, long request,
844 ret = ptrace_setsiginfo(child, &siginfo); 844 ret = ptrace_setsiginfo(child, &siginfo);
845 break; 845 break;
846 846
847 case PTRACE_GETSIGMASK:
848 if (addr != sizeof(sigset_t)) {
849 ret = -EINVAL;
850 break;
851 }
852
853 if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t)))
854 ret = -EFAULT;
855 else
856 ret = 0;
857
858 break;
859
860 case PTRACE_SETSIGMASK: {
861 sigset_t new_set;
862
863 if (addr != sizeof(sigset_t)) {
864 ret = -EINVAL;
865 break;
866 }
867
868 if (copy_from_user(&new_set, datavp, sizeof(sigset_t))) {
869 ret = -EFAULT;
870 break;
871 }
872
873 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
874
875 /*
876 * Every thread does recalc_sigpending() after resume, so
877 * retarget_shared_pending() and recalc_sigpending() are not
878 * called here.
879 */
880 spin_lock_irq(&child->sighand->siglock);
881 child->blocked = new_set;
882 spin_unlock_irq(&child->sighand->siglock);
883
884 ret = 0;
885 break;
886 }
887
847 case PTRACE_INTERRUPT: 888 case PTRACE_INTERRUPT:
848 /* 889 /*
849 * Stop tracee without any side-effect on signal or job 890 * Stop tracee without any side-effect on signal or job
@@ -948,8 +989,7 @@ int ptrace_request(struct task_struct *child, long request,
948 989
949#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 990#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
950 case PTRACE_GETREGSET: 991 case PTRACE_GETREGSET:
951 case PTRACE_SETREGSET: 992 case PTRACE_SETREGSET: {
952 {
953 struct iovec kiov; 993 struct iovec kiov;
954 struct iovec __user *uiov = datavp; 994 struct iovec __user *uiov = datavp;
955 995
@@ -1181,19 +1221,3 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
1181 return ret; 1221 return ret;
1182} 1222}
1183#endif /* CONFIG_COMPAT */ 1223#endif /* CONFIG_COMPAT */
1184
1185#ifdef CONFIG_HAVE_HW_BREAKPOINT
1186int ptrace_get_breakpoints(struct task_struct *tsk)
1187{
1188 if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt))
1189 return 0;
1190
1191 return -1;
1192}
1193
1194void ptrace_put_breakpoints(struct task_struct *tsk)
1195{
1196 if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt))
1197 flush_ptrace_hw_breakpoint(tsk);
1198}
1199#endif /* CONFIG_HAVE_HW_BREAKPOINT */
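The PTRACE_GETSIGMASK/PTRACE_SETSIGMASK hunk above gives a tracer a way to read and replace a tracee's blocked-signal mask. Below is a minimal tracer-side sketch, assuming a kernel that carries this patch and a tracee that is already attached and stopped; the request values mirror the uapi additions of this series, and the 8-byte size reflects the kernel's sigset_t (64 signals) on the common ABIs, not glibc's much larger sigset_t.

#include <stdint.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>

#ifndef PTRACE_GETSIGMASK               /* added to uapi by this series */
#define PTRACE_GETSIGMASK 0x420a
#define PTRACE_SETSIGMASK 0x420b
#endif

/* Caller must already have attached to and stopped the tracee. */
static int dump_and_tweak_sigmask(pid_t pid)
{
        uint64_t mask;                  /* kernel sigset_t: 64 signals -> 8 bytes */

        if (ptrace(PTRACE_GETSIGMASK, pid, (void *)sizeof(mask), &mask) < 0) {
                perror("PTRACE_GETSIGMASK");
                return -1;
        }
        printf("pid %d blocked mask: %#llx\n", (int)pid, (unsigned long long)mask);

        mask &= ~(1ULL << (10 - 1));    /* unblock signal 10 (SIGUSR1 on x86) */
        if (ptrace(PTRACE_SETSIGMASK, pid, (void *)sizeof(mask), &mask) < 0) {
                perror("PTRACE_SETSIGMASK");
                return -1;
        }
        return 0;
}

Any other addr size is rejected with -EINVAL, and SIGKILL/SIGSTOP can never be blocked, as the sigdelsetmask() call in the hunk shows.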
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 48ab70384a4c..cce6ba8bbace 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -104,31 +104,7 @@ void __rcu_read_unlock(void)
104} 104}
105EXPORT_SYMBOL_GPL(__rcu_read_unlock); 105EXPORT_SYMBOL_GPL(__rcu_read_unlock);
106 106
107/* 107#endif /* #ifdef CONFIG_PREEMPT_RCU */
108 * Check for a task exiting while in a preemptible-RCU read-side
109 * critical section, clean up if so. No need to issue warnings,
110 * as debug_check_no_locks_held() already does this if lockdep
111 * is enabled.
112 */
113void exit_rcu(void)
114{
115 struct task_struct *t = current;
116
117 if (likely(list_empty(&current->rcu_node_entry)))
118 return;
119 t->rcu_read_lock_nesting = 1;
120 barrier();
121 t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
122 __rcu_read_unlock();
123}
124
125#else /* #ifdef CONFIG_PREEMPT_RCU */
126
127void exit_rcu(void)
128{
129}
130
131#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
132 108
133#ifdef CONFIG_DEBUG_LOCK_ALLOC 109#ifdef CONFIG_DEBUG_LOCK_ALLOC
134static struct lock_class_key rcu_lock_key; 110static struct lock_class_key rcu_lock_key;
@@ -145,9 +121,6 @@ static struct lock_class_key rcu_sched_lock_key;
145struct lockdep_map rcu_sched_lock_map = 121struct lockdep_map rcu_sched_lock_map =
146 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); 122 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
147EXPORT_SYMBOL_GPL(rcu_sched_lock_map); 123EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
148#endif
149
150#ifdef CONFIG_DEBUG_LOCK_ALLOC
151 124
152int debug_lockdep_rcu_enabled(void) 125int debug_lockdep_rcu_enabled(void)
153{ 126{
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index a0714a51b6d7..aa344111de3e 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -44,7 +44,6 @@
44 44
45/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
46struct rcu_ctrlblk; 46struct rcu_ctrlblk;
47static void invoke_rcu_callbacks(void);
48static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 47static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
49static void rcu_process_callbacks(struct softirq_action *unused); 48static void rcu_process_callbacks(struct softirq_action *unused);
50static void __call_rcu(struct rcu_head *head, 49static void __call_rcu(struct rcu_head *head,
@@ -205,7 +204,7 @@ static int rcu_is_cpu_rrupt_from_idle(void)
205 */ 204 */
206static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 205static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
207{ 206{
208 reset_cpu_stall_ticks(rcp); 207 RCU_TRACE(reset_cpu_stall_ticks(rcp));
209 if (rcp->rcucblist != NULL && 208 if (rcp->rcucblist != NULL &&
210 rcp->donetail != rcp->curtail) { 209 rcp->donetail != rcp->curtail) {
211 rcp->donetail = rcp->curtail; 210 rcp->donetail = rcp->curtail;
@@ -227,7 +226,7 @@ void rcu_sched_qs(int cpu)
227 local_irq_save(flags); 226 local_irq_save(flags);
228 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 227 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
229 rcu_qsctr_help(&rcu_bh_ctrlblk)) 228 rcu_qsctr_help(&rcu_bh_ctrlblk))
230 invoke_rcu_callbacks(); 229 raise_softirq(RCU_SOFTIRQ);
231 local_irq_restore(flags); 230 local_irq_restore(flags);
232} 231}
233 232
@@ -240,7 +239,7 @@ void rcu_bh_qs(int cpu)
240 239
241 local_irq_save(flags); 240 local_irq_save(flags);
242 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 241 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
243 invoke_rcu_callbacks(); 242 raise_softirq(RCU_SOFTIRQ);
244 local_irq_restore(flags); 243 local_irq_restore(flags);
245} 244}
246 245
@@ -252,12 +251,11 @@ void rcu_bh_qs(int cpu)
252 */ 251 */
253void rcu_check_callbacks(int cpu, int user) 252void rcu_check_callbacks(int cpu, int user)
254{ 253{
255 check_cpu_stalls(); 254 RCU_TRACE(check_cpu_stalls());
256 if (user || rcu_is_cpu_rrupt_from_idle()) 255 if (user || rcu_is_cpu_rrupt_from_idle())
257 rcu_sched_qs(cpu); 256 rcu_sched_qs(cpu);
258 else if (!in_softirq()) 257 else if (!in_softirq())
259 rcu_bh_qs(cpu); 258 rcu_bh_qs(cpu);
260 rcu_preempt_check_callbacks();
261} 259}
262 260
263/* 261/*
@@ -278,7 +276,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
278 ACCESS_ONCE(rcp->rcucblist), 276 ACCESS_ONCE(rcp->rcucblist),
279 need_resched(), 277 need_resched(),
280 is_idle_task(current), 278 is_idle_task(current),
281 rcu_is_callbacks_kthread())); 279 false));
282 return; 280 return;
283 } 281 }
284 282
@@ -290,7 +288,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
290 *rcp->donetail = NULL; 288 *rcp->donetail = NULL;
291 if (rcp->curtail == rcp->donetail) 289 if (rcp->curtail == rcp->donetail)
292 rcp->curtail = &rcp->rcucblist; 290 rcp->curtail = &rcp->rcucblist;
293 rcu_preempt_remove_callbacks(rcp);
294 rcp->donetail = &rcp->rcucblist; 291 rcp->donetail = &rcp->rcucblist;
295 local_irq_restore(flags); 292 local_irq_restore(flags);
296 293
@@ -309,14 +306,13 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
309 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 306 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
310 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), 307 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
311 is_idle_task(current), 308 is_idle_task(current),
312 rcu_is_callbacks_kthread())); 309 false));
313} 310}
314 311
315static void rcu_process_callbacks(struct softirq_action *unused) 312static void rcu_process_callbacks(struct softirq_action *unused)
316{ 313{
317 __rcu_process_callbacks(&rcu_sched_ctrlblk); 314 __rcu_process_callbacks(&rcu_sched_ctrlblk);
318 __rcu_process_callbacks(&rcu_bh_ctrlblk); 315 __rcu_process_callbacks(&rcu_bh_ctrlblk);
319 rcu_preempt_process_callbacks();
320} 316}
321 317
322/* 318/*
@@ -382,3 +378,8 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
382 __call_rcu(head, func, &rcu_bh_ctrlblk); 378 __call_rcu(head, func, &rcu_bh_ctrlblk);
383} 379}
384EXPORT_SYMBOL_GPL(call_rcu_bh); 380EXPORT_SYMBOL_GPL(call_rcu_bh);
381
382void rcu_init(void)
383{
384 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
385}
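Several rcutiny.c hunks above wrap the stall-warning calls in RCU_TRACE(), so that the bookkeeping vanishes entirely when tracing is not configured. For reference, the helper is roughly the following (its actual home is kernel/rcu.h in this tree; shown here only to make the hunks self-explanatory):

/* Sketch of the RCU_TRACE() helper assumed by the hunks above. */
#ifdef CONFIG_RCU_TRACE
#define RCU_TRACE(stmt) stmt            /* emit the statement as written */
#else
#define RCU_TRACE(stmt)                 /* compile the statement away */
#endif

/* e.g. RCU_TRACE(reset_cpu_stall_ticks(rcp)); costs nothing without tracing. */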
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 8a233002faeb..0cd385acccfa 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -53,958 +53,10 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53}; 53};
54 54
55#ifdef CONFIG_DEBUG_LOCK_ALLOC 55#ifdef CONFIG_DEBUG_LOCK_ALLOC
56#include <linux/kernel_stat.h>
57
56int rcu_scheduler_active __read_mostly; 58int rcu_scheduler_active __read_mostly;
57EXPORT_SYMBOL_GPL(rcu_scheduler_active); 59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
58#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
59
60#ifdef CONFIG_RCU_TRACE
61
62static void check_cpu_stall(struct rcu_ctrlblk *rcp)
63{
64 unsigned long j;
65 unsigned long js;
66
67 if (rcu_cpu_stall_suppress)
68 return;
69 rcp->ticks_this_gp++;
70 j = jiffies;
71 js = rcp->jiffies_stall;
72 if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
73 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
74 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
75 jiffies - rcp->gp_start, rcp->qlen);
76 dump_stack();
77 }
78 if (*rcp->curtail && ULONG_CMP_GE(j, js))
79 rcp->jiffies_stall = jiffies +
80 3 * rcu_jiffies_till_stall_check() + 3;
81 else if (ULONG_CMP_GE(j, js))
82 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
83}
84
85static void check_cpu_stall_preempt(void);
86
87#endif /* #ifdef CONFIG_RCU_TRACE */
88
89static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
90{
91#ifdef CONFIG_RCU_TRACE
92 rcp->ticks_this_gp = 0;
93 rcp->gp_start = jiffies;
94 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
95#endif /* #ifdef CONFIG_RCU_TRACE */
96}
97
98static void check_cpu_stalls(void)
99{
100 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
101 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
102 RCU_TRACE(check_cpu_stall_preempt());
103}
104
105#ifdef CONFIG_TINY_PREEMPT_RCU
106
107#include <linux/delay.h>
108
109/* Global control variables for preemptible RCU. */
110struct rcu_preempt_ctrlblk {
111 struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */
112 struct rcu_head **nexttail;
113 /* Tasks blocked in a preemptible RCU */
114 /* read-side critical section while an */
115 /* preemptible-RCU grace period is in */
116 /* progress must wait for a later grace */
117 /* period. This pointer points to the */
118 /* ->next pointer of the last task that */
119 /* must wait for a later grace period, or */
120 /* to &->rcb.rcucblist if there is no */
121 /* such task. */
122 struct list_head blkd_tasks;
123 /* Tasks blocked in RCU read-side critical */
124 /* section. Tasks are placed at the head */
125 /* of this list and age towards the tail. */
126 struct list_head *gp_tasks;
127 /* Pointer to the first task blocking the */
128 /* current grace period, or NULL if there */
129 /* is no such task. */
130 struct list_head *exp_tasks;
131 /* Pointer to first task blocking the */
132 /* current expedited grace period, or NULL */
133 /* if there is no such task. If there */
134 /* is no current expedited grace period, */
135 /* then there cannot be any such task. */
136#ifdef CONFIG_RCU_BOOST
137 struct list_head *boost_tasks;
138 /* Pointer to first task that needs to be */
139 /* priority-boosted, or NULL if no priority */
140 /* boosting is needed. If there is no */
141 /* current or expedited grace period, there */
142 /* can be no such task. */
143#endif /* #ifdef CONFIG_RCU_BOOST */
144 u8 gpnum; /* Current grace period. */
145 u8 gpcpu; /* Last grace period blocked by the CPU. */
146 u8 completed; /* Last grace period completed. */
147 /* If all three are equal, RCU is idle. */
148#ifdef CONFIG_RCU_BOOST
149 unsigned long boost_time; /* When to start boosting (jiffies) */
150#endif /* #ifdef CONFIG_RCU_BOOST */
151#ifdef CONFIG_RCU_TRACE
152 unsigned long n_grace_periods;
153#ifdef CONFIG_RCU_BOOST
154 unsigned long n_tasks_boosted;
155 /* Total number of tasks boosted. */
156 unsigned long n_exp_boosts;
157 /* Number of tasks boosted for expedited GP. */
158 unsigned long n_normal_boosts;
159 /* Number of tasks boosted for normal GP. */
160 unsigned long n_balk_blkd_tasks;
161 /* Refused to boost: no blocked tasks. */
162 unsigned long n_balk_exp_gp_tasks;
163 /* Refused to boost: nothing blocking GP. */
164 unsigned long n_balk_boost_tasks;
165 /* Refused to boost: already boosting. */
166 unsigned long n_balk_notyet;
167 /* Refused to boost: not yet time. */
168 unsigned long n_balk_nos;
169 /* Refused to boost: not sure why, though. */
170 /* This can happen due to race conditions. */
171#endif /* #ifdef CONFIG_RCU_BOOST */
172#endif /* #ifdef CONFIG_RCU_TRACE */
173};
174
175static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
176 .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist,
177 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
178 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
179 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
180 RCU_TRACE(.rcb.name = "rcu_preempt")
181};
182
183static int rcu_preempted_readers_exp(void);
184static void rcu_report_exp_done(void);
185
186/*
187 * Return true if the CPU has not yet responded to the current grace period.
188 */
189static int rcu_cpu_blocking_cur_gp(void)
190{
191 return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum;
192}
193
194/*
195 * Check for a running RCU reader. Because there is only one CPU,
196 * there can be but one running RCU reader at a time. ;-)
197 *
198 * Returns zero if there are no running readers. Returns a positive
199 * number if there is at least one reader within its RCU read-side
200 * critical section. Returns a negative number if an outermost reader
 201 * is in the midst of exiting from its RCU read-side critical section.
207 */
208static int rcu_preempt_running_reader(void)
209{
210 return current->rcu_read_lock_nesting;
211}
212
213/*
214 * Check for preempted RCU readers blocking any grace period.
215 * If the caller needs a reliable answer, it must disable hard irqs.
216 */
217static int rcu_preempt_blocked_readers_any(void)
218{
219 return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks);
220}
221
222/*
223 * Check for preempted RCU readers blocking the current grace period.
224 * If the caller needs a reliable answer, it must disable hard irqs.
225 */
226static int rcu_preempt_blocked_readers_cgp(void)
227{
228 return rcu_preempt_ctrlblk.gp_tasks != NULL;
229}
230
231/*
232 * Return true if another preemptible-RCU grace period is needed.
233 */
234static int rcu_preempt_needs_another_gp(void)
235{
236 return *rcu_preempt_ctrlblk.rcb.curtail != NULL;
237}
238
239/*
240 * Return true if a preemptible-RCU grace period is in progress.
241 * The caller must disable hardirqs.
242 */
243static int rcu_preempt_gp_in_progress(void)
244{
245 return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
246}
247
248/*
249 * Advance a ->blkd_tasks-list pointer to the next entry, instead
250 * returning NULL if at the end of the list.
251 */
252static struct list_head *rcu_next_node_entry(struct task_struct *t)
253{
254 struct list_head *np;
255
256 np = t->rcu_node_entry.next;
257 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
258 np = NULL;
259 return np;
260}
261
262#ifdef CONFIG_RCU_TRACE
263
264#ifdef CONFIG_RCU_BOOST
265static void rcu_initiate_boost_trace(void);
266#endif /* #ifdef CONFIG_RCU_BOOST */
267
268/*
 269 * Dump additional statistics for TINY_PREEMPT_RCU.
270 */
271static void show_tiny_preempt_stats(struct seq_file *m)
272{
273 seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
274 rcu_preempt_ctrlblk.rcb.qlen,
275 rcu_preempt_ctrlblk.n_grace_periods,
276 rcu_preempt_ctrlblk.gpnum,
277 rcu_preempt_ctrlblk.gpcpu,
278 rcu_preempt_ctrlblk.completed,
279 "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
280 "N."[!rcu_preempt_ctrlblk.gp_tasks],
281 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
282#ifdef CONFIG_RCU_BOOST
283 seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
284 " ",
285 "B."[!rcu_preempt_ctrlblk.boost_tasks],
286 rcu_preempt_ctrlblk.n_tasks_boosted,
287 rcu_preempt_ctrlblk.n_exp_boosts,
288 rcu_preempt_ctrlblk.n_normal_boosts,
289 (int)(jiffies & 0xffff),
290 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
291 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n",
292 " balk",
293 rcu_preempt_ctrlblk.n_balk_blkd_tasks,
294 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks,
295 rcu_preempt_ctrlblk.n_balk_boost_tasks,
296 rcu_preempt_ctrlblk.n_balk_notyet,
297 rcu_preempt_ctrlblk.n_balk_nos);
298#endif /* #ifdef CONFIG_RCU_BOOST */
299}
300
301#endif /* #ifdef CONFIG_RCU_TRACE */
302
303#ifdef CONFIG_RCU_BOOST
304
305#include "rtmutex_common.h"
306
307#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
308
309/* Controls for rcu_kthread() kthread. */
310static struct task_struct *rcu_kthread_task;
311static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
312static unsigned long have_rcu_kthread_work;
313
314/*
315 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
316 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
317 */
318static int rcu_boost(void)
319{
320 unsigned long flags;
321 struct rt_mutex mtx;
322 struct task_struct *t;
323 struct list_head *tb;
324
325 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
326 rcu_preempt_ctrlblk.exp_tasks == NULL)
327 return 0; /* Nothing to boost. */
328
329 local_irq_save(flags);
330
331 /*
332 * Recheck with irqs disabled: all tasks in need of boosting
333 * might exit their RCU read-side critical sections on their own
334 * if we are preempted just before disabling irqs.
335 */
336 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
337 rcu_preempt_ctrlblk.exp_tasks == NULL) {
338 local_irq_restore(flags);
339 return 0;
340 }
341
342 /*
343 * Preferentially boost tasks blocking expedited grace periods.
344 * This cannot starve the normal grace periods because a second
345 * expedited grace period must boost all blocked tasks, including
346 * those blocking the pre-existing normal grace period.
347 */
348 if (rcu_preempt_ctrlblk.exp_tasks != NULL) {
349 tb = rcu_preempt_ctrlblk.exp_tasks;
350 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
351 } else {
352 tb = rcu_preempt_ctrlblk.boost_tasks;
353 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
354 }
355 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
356
357 /*
358 * We boost task t by manufacturing an rt_mutex that appears to
359 * be held by task t. We leave a pointer to that rt_mutex where
360 * task t can find it, and task t will release the mutex when it
361 * exits its outermost RCU read-side critical section. Then
362 * simply acquiring this artificial rt_mutex will boost task
363 * t's priority. (Thanks to tglx for suggesting this approach!)
364 */
365 t = container_of(tb, struct task_struct, rcu_node_entry);
366 rt_mutex_init_proxy_locked(&mtx, t);
367 t->rcu_boost_mutex = &mtx;
368 local_irq_restore(flags);
369 rt_mutex_lock(&mtx);
370 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
371
372 return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL ||
373 ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL;
374}
375
376/*
377 * Check to see if it is now time to start boosting RCU readers blocking
378 * the current grace period, and, if so, tell the rcu_kthread_task to
379 * start boosting them. If there is an expedited boost in progress,
380 * we wait for it to complete.
381 *
382 * If there are no blocked readers blocking the current grace period,
383 * return 0 to let the caller know, otherwise return 1. Note that this
384 * return value is independent of whether or not boosting was done.
385 */
386static int rcu_initiate_boost(void)
387{
388 if (!rcu_preempt_blocked_readers_cgp() &&
389 rcu_preempt_ctrlblk.exp_tasks == NULL) {
390 RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++);
391 return 0;
392 }
393 if (rcu_preempt_ctrlblk.exp_tasks != NULL ||
394 (rcu_preempt_ctrlblk.gp_tasks != NULL &&
395 rcu_preempt_ctrlblk.boost_tasks == NULL &&
396 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) {
397 if (rcu_preempt_ctrlblk.exp_tasks == NULL)
398 rcu_preempt_ctrlblk.boost_tasks =
399 rcu_preempt_ctrlblk.gp_tasks;
400 invoke_rcu_callbacks();
401 } else {
402 RCU_TRACE(rcu_initiate_boost_trace());
403 }
404 return 1;
405}
406
407#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
408
409/*
410 * Do priority-boost accounting for the start of a new grace period.
411 */
412static void rcu_preempt_boost_start_gp(void)
413{
414 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
415}
416
417#else /* #ifdef CONFIG_RCU_BOOST */
418
419/*
420 * If there is no RCU priority boosting, we don't initiate boosting,
421 * but we do indicate whether there are blocked readers blocking the
422 * current grace period.
423 */
424static int rcu_initiate_boost(void)
425{
426 return rcu_preempt_blocked_readers_cgp();
427}
428
429/*
430 * If there is no RCU priority boosting, nothing to do at grace-period start.
431 */
432static void rcu_preempt_boost_start_gp(void)
433{
434}
435
436#endif /* else #ifdef CONFIG_RCU_BOOST */
437
438/*
439 * Record a preemptible-RCU quiescent state for the specified CPU. Note
440 * that this just means that the task currently running on the CPU is
441 * in a quiescent state. There might be any number of tasks blocked
442 * while in an RCU read-side critical section.
443 *
444 * Unlike the other rcu_*_qs() functions, callers to this function
445 * must disable irqs in order to protect the assignment to
446 * ->rcu_read_unlock_special.
447 *
448 * Because this is a single-CPU implementation, the only way a grace
449 * period can end is if the CPU is in a quiescent state. The reason is
450 * that a blocked preemptible-RCU reader can exit its critical section
451 * only if the CPU is running it at the time. Therefore, when the
452 * last task blocking the current grace period exits its RCU read-side
453 * critical section, neither the CPU nor blocked tasks will be stopping
454 * the current grace period. (In contrast, SMP implementations
455 * might have CPUs running in RCU read-side critical sections that
456 * block later grace periods -- but this is not possible given only
457 * one CPU.)
458 */
459static void rcu_preempt_cpu_qs(void)
460{
461 /* Record both CPU and task as having responded to current GP. */
462 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
463 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
464
465 /* If there is no GP then there is nothing more to do. */
466 if (!rcu_preempt_gp_in_progress())
467 return;
468 /*
469 * Check up on boosting. If there are readers blocking the
470 * current grace period, leave.
471 */
472 if (rcu_initiate_boost())
473 return;
474
475 /* Advance callbacks. */
476 rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
477 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail;
478 rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail;
479
480 /* If there are no blocked readers, next GP is done instantly. */
481 if (!rcu_preempt_blocked_readers_any())
482 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
483
484 /* If there are done callbacks, cause them to be invoked. */
485 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
486 invoke_rcu_callbacks();
487}
488
489/*
490 * Start a new RCU grace period if warranted. Hard irqs must be disabled.
491 */
492static void rcu_preempt_start_gp(void)
493{
494 if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) {
495
496 /* Official start of GP. */
497 rcu_preempt_ctrlblk.gpnum++;
498 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
499 reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb);
500
501 /* Any blocked RCU readers block new GP. */
502 if (rcu_preempt_blocked_readers_any())
503 rcu_preempt_ctrlblk.gp_tasks =
504 rcu_preempt_ctrlblk.blkd_tasks.next;
505
506 /* Set up for RCU priority boosting. */
507 rcu_preempt_boost_start_gp();
508
509 /* If there is no running reader, CPU is done with GP. */
510 if (!rcu_preempt_running_reader())
511 rcu_preempt_cpu_qs();
512 }
513}
514
515/*
516 * We have entered the scheduler, and the current task might soon be
517 * context-switched away from. If this task is in an RCU read-side
518 * critical section, we will no longer be able to rely on the CPU to
519 * record that fact, so we enqueue the task on the blkd_tasks list.
520 * If the task started after the current grace period began, as recorded
 521 * by ->gpcpu, we enqueue at the beginning of the list. Otherwise we
 522 * enqueue before the element referenced by ->gp_tasks (or at the tail if
 523 * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element.
524 * The task will dequeue itself when it exits the outermost enclosing
525 * RCU read-side critical section. Therefore, the current grace period
526 * cannot be permitted to complete until the ->gp_tasks pointer becomes
527 * NULL.
528 *
529 * Caller must disable preemption.
530 */
531void rcu_preempt_note_context_switch(void)
532{
533 struct task_struct *t = current;
534 unsigned long flags;
535
536 local_irq_save(flags); /* must exclude scheduler_tick(). */
537 if (rcu_preempt_running_reader() > 0 &&
538 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
539
540 /* Possibly blocking in an RCU read-side critical section. */
541 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
542
543 /*
544 * If this CPU has already checked in, then this task
545 * will hold up the next grace period rather than the
546 * current grace period. Queue the task accordingly.
547 * If the task is queued for the current grace period
548 * (i.e., this CPU has not yet passed through a quiescent
549 * state for the current grace period), then as long
550 * as that task remains queued, the current grace period
551 * cannot end.
552 */
553 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
554 if (rcu_cpu_blocking_cur_gp())
555 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
556 } else if (rcu_preempt_running_reader() < 0 &&
557 t->rcu_read_unlock_special) {
558 /*
559 * Complete exit from RCU read-side critical section on
560 * behalf of preempted instance of __rcu_read_unlock().
561 */
562 rcu_read_unlock_special(t);
563 }
564
565 /*
566 * Either we were not in an RCU read-side critical section to
567 * begin with, or we have now recorded that critical section
568 * globally. Either way, we can now note a quiescent state
569 * for this CPU. Again, if we were in an RCU read-side critical
570 * section, and if that critical section was blocking the current
571 * grace period, then the fact that the task has been enqueued
 572 * means that the current grace period continues to be blocked.
573 */
574 rcu_preempt_cpu_qs();
575 local_irq_restore(flags);
576}
577
578/*
579 * Handle special cases during rcu_read_unlock(), such as needing to
580 * notify RCU core processing or task having blocked during the RCU
581 * read-side critical section.
582 */
583void rcu_read_unlock_special(struct task_struct *t)
584{
585 int empty;
586 int empty_exp;
587 unsigned long flags;
588 struct list_head *np;
589#ifdef CONFIG_RCU_BOOST
590 struct rt_mutex *rbmp = NULL;
591#endif /* #ifdef CONFIG_RCU_BOOST */
592 int special;
593
594 /*
595 * NMI handlers cannot block and cannot safely manipulate state.
596 * They therefore cannot possibly be special, so just leave.
597 */
598 if (in_nmi())
599 return;
600
601 local_irq_save(flags);
602
603 /*
604 * If RCU core is waiting for this CPU to exit critical section,
605 * let it know that we have done so.
606 */
607 special = t->rcu_read_unlock_special;
608 if (special & RCU_READ_UNLOCK_NEED_QS)
609 rcu_preempt_cpu_qs();
610
611 /* Hardware IRQ handlers cannot block. */
612 if (in_irq() || in_serving_softirq()) {
613 local_irq_restore(flags);
614 return;
615 }
616
617 /* Clean up if blocked during RCU read-side critical section. */
618 if (special & RCU_READ_UNLOCK_BLOCKED) {
619 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
620
621 /*
622 * Remove this task from the ->blkd_tasks list and adjust
623 * any pointers that might have been referencing it.
624 */
625 empty = !rcu_preempt_blocked_readers_cgp();
626 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
627 np = rcu_next_node_entry(t);
628 list_del_init(&t->rcu_node_entry);
629 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
630 rcu_preempt_ctrlblk.gp_tasks = np;
631 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
632 rcu_preempt_ctrlblk.exp_tasks = np;
633#ifdef CONFIG_RCU_BOOST
634 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
635 rcu_preempt_ctrlblk.boost_tasks = np;
636#endif /* #ifdef CONFIG_RCU_BOOST */
637
638 /*
639 * If this was the last task on the current list, and if
640 * we aren't waiting on the CPU, report the quiescent state
641 * and start a new grace period if needed.
642 */
643 if (!empty && !rcu_preempt_blocked_readers_cgp()) {
644 rcu_preempt_cpu_qs();
645 rcu_preempt_start_gp();
646 }
647
648 /*
649 * If this was the last task on the expedited lists,
 650 * then we need to wake up the waiting task.
651 */
652 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
653 rcu_report_exp_done();
654 }
655#ifdef CONFIG_RCU_BOOST
656 /* Unboost self if was boosted. */
657 if (t->rcu_boost_mutex != NULL) {
658 rbmp = t->rcu_boost_mutex;
659 t->rcu_boost_mutex = NULL;
660 rt_mutex_unlock(rbmp);
661 }
662#endif /* #ifdef CONFIG_RCU_BOOST */
663 local_irq_restore(flags);
664}
665
666/*
667 * Check for a quiescent state from the current CPU. When a task blocks,
668 * the task is recorded in the rcu_preempt_ctrlblk structure, which is
669 * checked elsewhere. This is called from the scheduling-clock interrupt.
670 *
671 * Caller must disable hard irqs.
672 */
673static void rcu_preempt_check_callbacks(void)
674{
675 struct task_struct *t = current;
676
677 if (rcu_preempt_gp_in_progress() &&
678 (!rcu_preempt_running_reader() ||
679 !rcu_cpu_blocking_cur_gp()))
680 rcu_preempt_cpu_qs();
681 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
682 rcu_preempt_ctrlblk.rcb.donetail)
683 invoke_rcu_callbacks();
684 if (rcu_preempt_gp_in_progress() &&
685 rcu_cpu_blocking_cur_gp() &&
686 rcu_preempt_running_reader() > 0)
687 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
688}
689
690/*
691 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
692 * update, so this is invoked from rcu_process_callbacks() to
693 * handle that case. Of course, it is invoked for all flavors of
694 * RCU, but RCU callbacks can appear only on one of the lists, and
695 * neither ->nexttail nor ->donetail can possibly be NULL, so there
696 * is no need for an explicit check.
697 */
698static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
699{
700 if (rcu_preempt_ctrlblk.nexttail == rcp->donetail)
701 rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist;
702}
703
704/*
705 * Process callbacks for preemptible RCU.
706 */
707static void rcu_preempt_process_callbacks(void)
708{
709 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
710}
711
712/*
 713 * Queue a preemptible-RCU callback for invocation after a grace period.
714 */
715void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
716{
717 unsigned long flags;
718
719 debug_rcu_head_queue(head);
720 head->func = func;
721 head->next = NULL;
722
723 local_irq_save(flags);
724 *rcu_preempt_ctrlblk.nexttail = head;
725 rcu_preempt_ctrlblk.nexttail = &head->next;
726 RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
727 rcu_preempt_start_gp(); /* checks to see if GP needed. */
728 local_irq_restore(flags);
729}
730EXPORT_SYMBOL_GPL(call_rcu);
731
732/*
733 * synchronize_rcu - wait until a grace period has elapsed.
734 *
735 * Control will return to the caller some time after a full grace
736 * period has elapsed, in other words after all currently executing RCU
737 * read-side critical sections have completed. RCU read-side critical
738 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
739 * and may be nested.
740 */
741void synchronize_rcu(void)
742{
743 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
744 !lock_is_held(&rcu_lock_map) &&
745 !lock_is_held(&rcu_sched_lock_map),
746 "Illegal synchronize_rcu() in RCU read-side critical section");
747
748#ifdef CONFIG_DEBUG_LOCK_ALLOC
749 if (!rcu_scheduler_active)
750 return;
751#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
752
753 WARN_ON_ONCE(rcu_preempt_running_reader());
754 if (!rcu_preempt_blocked_readers_any())
755 return;
756
757 /* Once we get past the fastpath checks, same code as rcu_barrier(). */
758 if (rcu_expedited)
759 synchronize_rcu_expedited();
760 else
761 rcu_barrier();
762}
763EXPORT_SYMBOL_GPL(synchronize_rcu);
764
765static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
766static unsigned long sync_rcu_preempt_exp_count;
767static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
768
769/*
770 * Return non-zero if there are any tasks in RCU read-side critical
771 * sections blocking the current preemptible-RCU expedited grace period.
772 * If there is no preemptible-RCU expedited grace period currently in
773 * progress, returns zero unconditionally.
774 */
775static int rcu_preempted_readers_exp(void)
776{
777 return rcu_preempt_ctrlblk.exp_tasks != NULL;
778}
779
780/*
781 * Report the exit from RCU read-side critical section for the last task
782 * that queued itself during or before the current expedited preemptible-RCU
783 * grace period.
784 */
785static void rcu_report_exp_done(void)
786{
787 wake_up(&sync_rcu_preempt_exp_wq);
788}
789
790/*
791 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
 792 * is to rely on the fact that there is but one CPU, and that it is
793 * illegal for a task to invoke synchronize_rcu_expedited() while in a
794 * preemptible-RCU read-side critical section. Therefore, any such
795 * critical sections must correspond to blocked tasks, which must therefore
796 * be on the ->blkd_tasks list. So just record the current head of the
797 * list in the ->exp_tasks pointer, and wait for all tasks including and
798 * after the task pointed to by ->exp_tasks to drain.
799 */
800void synchronize_rcu_expedited(void)
801{
802 unsigned long flags;
803 struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk;
804 unsigned long snap;
805
806 barrier(); /* ensure prior action seen before grace period. */
807
808 WARN_ON_ONCE(rcu_preempt_running_reader());
809
810 /*
811 * Acquire lock so that there is only one preemptible RCU grace
812 * period in flight. Of course, if someone does the expedited
813 * grace period for us while we are acquiring the lock, just leave.
814 */
815 snap = sync_rcu_preempt_exp_count + 1;
816 mutex_lock(&sync_rcu_preempt_exp_mutex);
817 if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count))
818 goto unlock_mb_ret; /* Others did our work for us. */
819
820 local_irq_save(flags);
821
822 /*
823 * All RCU readers have to already be on blkd_tasks because
824 * we cannot legally be executing in an RCU read-side critical
825 * section.
826 */
827
828 /* Snapshot current head of ->blkd_tasks list. */
829 rpcp->exp_tasks = rpcp->blkd_tasks.next;
830 if (rpcp->exp_tasks == &rpcp->blkd_tasks)
831 rpcp->exp_tasks = NULL;
832
833 /* Wait for tail of ->blkd_tasks list to drain. */
834 if (!rcu_preempted_readers_exp()) {
835 local_irq_restore(flags);
836 } else {
837 rcu_initiate_boost();
838 local_irq_restore(flags);
839 wait_event(sync_rcu_preempt_exp_wq,
840 !rcu_preempted_readers_exp());
841 }
842
843 /* Clean up and exit. */
844 barrier(); /* ensure expedited GP seen before counter increment. */
845 sync_rcu_preempt_exp_count++;
846unlock_mb_ret:
847 mutex_unlock(&sync_rcu_preempt_exp_mutex);
848 barrier(); /* ensure subsequent action seen after grace period. */
849}
850EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
851
852/*
853 * Does preemptible RCU need the CPU to stay out of dynticks mode?
854 */
855int rcu_preempt_needs_cpu(void)
856{
857 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
858}
859
860#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
861
862#ifdef CONFIG_RCU_TRACE
863
864/*
865 * Because preemptible RCU does not exist, it is not necessary to
866 * dump out its statistics.
867 */
868static void show_tiny_preempt_stats(struct seq_file *m)
869{
870}
871
872#endif /* #ifdef CONFIG_RCU_TRACE */
873
874/*
875 * Because preemptible RCU does not exist, it never has any callbacks
876 * to check.
877 */
878static void rcu_preempt_check_callbacks(void)
879{
880}
881
882/*
883 * Because preemptible RCU does not exist, it never has any callbacks
884 * to remove.
885 */
886static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
887{
888}
889
890/*
891 * Because preemptible RCU does not exist, it never has any callbacks
892 * to process.
893 */
894static void rcu_preempt_process_callbacks(void)
895{
896}
897
898#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
899
900#ifdef CONFIG_RCU_BOOST
901
902/*
903 * Wake up rcu_kthread() to process callbacks now eligible for invocation
904 * or to boost readers.
905 */
906static void invoke_rcu_callbacks(void)
907{
908 have_rcu_kthread_work = 1;
909 if (rcu_kthread_task != NULL)
910 wake_up(&rcu_kthread_wq);
911}
912
913#ifdef CONFIG_RCU_TRACE
914
915/*
916 * Is the current CPU running the RCU-callbacks kthread?
917 * Caller must have preemption disabled.
918 */
919static bool rcu_is_callbacks_kthread(void)
920{
921 return rcu_kthread_task == current;
922}
923
924#endif /* #ifdef CONFIG_RCU_TRACE */
925
926/*
927 * This kthread invokes RCU callbacks whose grace periods have
928 * elapsed. It is awakened as needed, and takes the place of the
929 * RCU_SOFTIRQ that is used for this purpose when boosting is disabled.
930 * This is a kthread, but it is never stopped, at least not until
931 * the system goes down.
932 */
933static int rcu_kthread(void *arg)
934{
935 unsigned long work;
936 unsigned long morework;
937 unsigned long flags;
938
939 for (;;) {
940 wait_event_interruptible(rcu_kthread_wq,
941 have_rcu_kthread_work != 0);
942 morework = rcu_boost();
943 local_irq_save(flags);
944 work = have_rcu_kthread_work;
945 have_rcu_kthread_work = morework;
946 local_irq_restore(flags);
947 if (work)
948 rcu_process_callbacks(NULL);
949 schedule_timeout_interruptible(1); /* Leave CPU for others. */
950 }
951
952 return 0; /* Not reached, but needed to shut gcc up. */
953}
954
955/*
956 * Spawn the kthread that invokes RCU callbacks.
957 */
958static int __init rcu_spawn_kthreads(void)
959{
960 struct sched_param sp;
961
962 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
963 sp.sched_priority = RCU_BOOST_PRIO;
964 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
965 return 0;
966}
967early_initcall(rcu_spawn_kthreads);
968
969#else /* #ifdef CONFIG_RCU_BOOST */
970
971/* Hold off callback invocation until early_initcall() time. */
972static int rcu_scheduler_fully_active __read_mostly;
973
974/*
975 * Start up softirq processing of callbacks.
976 */
977void invoke_rcu_callbacks(void)
978{
979 if (rcu_scheduler_fully_active)
980 raise_softirq(RCU_SOFTIRQ);
981}
982
983#ifdef CONFIG_RCU_TRACE
984
985/*
986 * There is no callback kthread, so this thread is never it.
987 */
988static bool rcu_is_callbacks_kthread(void)
989{
990 return false;
991}
992
993#endif /* #ifdef CONFIG_RCU_TRACE */
994
995static int __init rcu_scheduler_really_started(void)
996{
997 rcu_scheduler_fully_active = 1;
998 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
999 raise_softirq(RCU_SOFTIRQ); /* Invoke any callbacks from early boot. */
1000 return 0;
1001}
1002early_initcall(rcu_scheduler_really_started);
1003
1004#endif /* #else #ifdef CONFIG_RCU_BOOST */
1005
1006#ifdef CONFIG_DEBUG_LOCK_ALLOC
1007#include <linux/kernel_stat.h>
1008 60
1009/* 61/*
1010 * During boot, we forgive RCU lockdep issues. After this function is 62 * During boot, we forgive RCU lockdep issues. After this function is
@@ -1020,25 +72,6 @@ void __init rcu_scheduler_starting(void)
1020 72
1021#ifdef CONFIG_RCU_TRACE 73#ifdef CONFIG_RCU_TRACE
1022 74
1023#ifdef CONFIG_RCU_BOOST
1024
1025static void rcu_initiate_boost_trace(void)
1026{
1027 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
1028 rcu_preempt_ctrlblk.n_balk_blkd_tasks++;
1029 else if (rcu_preempt_ctrlblk.gp_tasks == NULL &&
1030 rcu_preempt_ctrlblk.exp_tasks == NULL)
1031 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++;
1032 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
1033 rcu_preempt_ctrlblk.n_balk_boost_tasks++;
1034 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
1035 rcu_preempt_ctrlblk.n_balk_notyet++;
1036 else
1037 rcu_preempt_ctrlblk.n_balk_nos++;
1038}
1039
1040#endif /* #ifdef CONFIG_RCU_BOOST */
1041
1042static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) 75static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
1043{ 76{
1044 unsigned long flags; 77 unsigned long flags;
@@ -1053,7 +86,6 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
1053 */ 86 */
1054static int show_tiny_stats(struct seq_file *m, void *unused) 87static int show_tiny_stats(struct seq_file *m, void *unused)
1055{ 88{
1056 show_tiny_preempt_stats(m);
1057 seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); 89 seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
1058 seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); 90 seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
1059 return 0; 91 return 0;
@@ -1103,11 +135,40 @@ MODULE_AUTHOR("Paul E. McKenney");
1103MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); 135MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
1104MODULE_LICENSE("GPL"); 136MODULE_LICENSE("GPL");
1105 137
1106static void check_cpu_stall_preempt(void) 138static void check_cpu_stall(struct rcu_ctrlblk *rcp)
1107{ 139{
1108#ifdef CONFIG_TINY_PREEMPT_RCU 140 unsigned long j;
1109 check_cpu_stall(&rcu_preempt_ctrlblk.rcb); 141 unsigned long js;
1110#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 142
143 if (rcu_cpu_stall_suppress)
144 return;
145 rcp->ticks_this_gp++;
146 j = jiffies;
147 js = rcp->jiffies_stall;
148 if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
149 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
150 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
151 jiffies - rcp->gp_start, rcp->qlen);
152 dump_stack();
153 }
154 if (*rcp->curtail && ULONG_CMP_GE(j, js))
155 rcp->jiffies_stall = jiffies +
156 3 * rcu_jiffies_till_stall_check() + 3;
157 else if (ULONG_CMP_GE(j, js))
158 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
159}
160
161static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
162{
163 rcp->ticks_this_gp = 0;
164 rcp->gp_start = jiffies;
165 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
166}
167
168static void check_cpu_stalls(void)
169{
170 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
171 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
1111} 172}
1112 173
1113#endif /* #ifdef CONFIG_RCU_TRACE */ 174#endif /* #ifdef CONFIG_RCU_TRACE */
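The stall-check code that moves around in this file compares jiffies values with ULONG_CMP_GE()/ULONG_CMP_LT() rather than plain >= so the test stays correct when the counter wraps. The macros below show the usual modular-arithmetic form of those comparisons (they live in rcupdate.h; treat this as an illustrative sketch rather than a quotation):

#include <limits.h>

/* Wrap-safe "a >= b" and "a < b" for free-running unsigned long counters. */
#define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))

/* Example: has the deadline stored in ->jiffies_stall passed? */
static int stall_deadline_passed(unsigned long now, unsigned long deadline)
{
        return ULONG_CMP_GE(now, deadline);     /* true even across a wrap */
}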
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e1f3a8c96724..f4871e52c546 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -695,44 +695,6 @@ static struct rcu_torture_ops srcu_sync_ops = {
695 .name = "srcu_sync" 695 .name = "srcu_sync"
696}; 696};
697 697
698static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
699{
700 return srcu_read_lock_raw(&srcu_ctl);
701}
702
703static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
704{
705 srcu_read_unlock_raw(&srcu_ctl, idx);
706}
707
708static struct rcu_torture_ops srcu_raw_ops = {
709 .init = rcu_sync_torture_init,
710 .readlock = srcu_torture_read_lock_raw,
711 .read_delay = srcu_read_delay,
712 .readunlock = srcu_torture_read_unlock_raw,
713 .completed = srcu_torture_completed,
714 .deferred_free = srcu_torture_deferred_free,
715 .sync = srcu_torture_synchronize,
716 .call = NULL,
717 .cb_barrier = NULL,
718 .stats = srcu_torture_stats,
719 .name = "srcu_raw"
720};
721
722static struct rcu_torture_ops srcu_raw_sync_ops = {
723 .init = rcu_sync_torture_init,
724 .readlock = srcu_torture_read_lock_raw,
725 .read_delay = srcu_read_delay,
726 .readunlock = srcu_torture_read_unlock_raw,
727 .completed = srcu_torture_completed,
728 .deferred_free = rcu_sync_torture_deferred_free,
729 .sync = srcu_torture_synchronize,
730 .call = NULL,
731 .cb_barrier = NULL,
732 .stats = srcu_torture_stats,
733 .name = "srcu_raw_sync"
734};
735
736static void srcu_torture_synchronize_expedited(void) 698static void srcu_torture_synchronize_expedited(void)
737{ 699{
738 synchronize_srcu_expedited(&srcu_ctl); 700 synchronize_srcu_expedited(&srcu_ctl);
@@ -1514,7 +1476,7 @@ rcu_torture_shutdown(void *arg)
1514 * Execute random CPU-hotplug operations at the interval specified 1476 * Execute random CPU-hotplug operations at the interval specified
1515 * by the onoff_interval. 1477 * by the onoff_interval.
1516 */ 1478 */
1517static int __cpuinit 1479static int
1518rcu_torture_onoff(void *arg) 1480rcu_torture_onoff(void *arg)
1519{ 1481{
1520 int cpu; 1482 int cpu;
@@ -1596,7 +1558,7 @@ rcu_torture_onoff(void *arg)
1596 return 0; 1558 return 0;
1597} 1559}
1598 1560
1599static int __cpuinit 1561static int
1600rcu_torture_onoff_init(void) 1562rcu_torture_onoff_init(void)
1601{ 1563{
1602 int ret; 1564 int ret;
@@ -1639,7 +1601,7 @@ static void rcu_torture_onoff_cleanup(void)
1639 * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then 1601 * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
1640 * induces a CPU stall for the time specified by stall_cpu. 1602 * induces a CPU stall for the time specified by stall_cpu.
1641 */ 1603 */
1642static int __cpuinit rcu_torture_stall(void *args) 1604static int rcu_torture_stall(void *args)
1643{ 1605{
1644 unsigned long stop_at; 1606 unsigned long stop_at;
1645 1607
@@ -1983,7 +1945,6 @@ rcu_torture_init(void)
1983 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1945 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1984 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1946 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1985 &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, 1947 &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops,
1986 &srcu_raw_ops, &srcu_raw_sync_ops,
1987 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1948 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1988 1949
1989 mutex_lock(&fullstop_mutex); 1950 mutex_lock(&fullstop_mutex);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 35380019f0fc..068de3a93606 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -218,8 +218,8 @@ module_param(blimit, long, 0444);
218module_param(qhimark, long, 0444); 218module_param(qhimark, long, 0444);
219module_param(qlowmark, long, 0444); 219module_param(qlowmark, long, 0444);
220 220
221static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; 221static ulong jiffies_till_first_fqs = ULONG_MAX;
222static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; 222static ulong jiffies_till_next_fqs = ULONG_MAX;
223 223
224module_param(jiffies_till_first_fqs, ulong, 0644); 224module_param(jiffies_till_first_fqs, ulong, 0644);
225module_param(jiffies_till_next_fqs, ulong, 0644); 225module_param(jiffies_till_next_fqs, ulong, 0644);
@@ -866,7 +866,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
866 * See Documentation/RCU/stallwarn.txt for info on how to debug 866 * See Documentation/RCU/stallwarn.txt for info on how to debug
867 * RCU CPU stall warnings. 867 * RCU CPU stall warnings.
868 */ 868 */
869 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:", 869 pr_err("INFO: %s detected stalls on CPUs/tasks:",
870 rsp->name); 870 rsp->name);
871 print_cpu_stall_info_begin(); 871 print_cpu_stall_info_begin();
872 rcu_for_each_leaf_node(rsp, rnp) { 872 rcu_for_each_leaf_node(rsp, rnp) {
@@ -899,7 +899,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
899 smp_processor_id(), (long)(jiffies - rsp->gp_start), 899 smp_processor_id(), (long)(jiffies - rsp->gp_start),
900 rsp->gpnum, rsp->completed, totqlen); 900 rsp->gpnum, rsp->completed, totqlen);
901 if (ndetected == 0) 901 if (ndetected == 0)
902 printk(KERN_ERR "INFO: Stall ended before state dump start\n"); 902 pr_err("INFO: Stall ended before state dump start\n");
903 else if (!trigger_all_cpu_backtrace()) 903 else if (!trigger_all_cpu_backtrace())
904 rcu_dump_cpu_stacks(rsp); 904 rcu_dump_cpu_stacks(rsp);
905 905
@@ -922,7 +922,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
922 * See Documentation/RCU/stallwarn.txt for info on how to debug 922 * See Documentation/RCU/stallwarn.txt for info on how to debug
923 * RCU CPU stall warnings. 923 * RCU CPU stall warnings.
924 */ 924 */
925 printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name); 925 pr_err("INFO: %s self-detected stall on CPU", rsp->name);
926 print_cpu_stall_info_begin(); 926 print_cpu_stall_info_begin();
927 print_cpu_stall_info(rsp, smp_processor_id()); 927 print_cpu_stall_info(rsp, smp_processor_id());
928 print_cpu_stall_info_end(); 928 print_cpu_stall_info_end();
@@ -985,65 +985,6 @@ void rcu_cpu_stall_reset(void)
985} 985}
986 986
987/* 987/*
988 * Update CPU-local rcu_data state to record the newly noticed grace period.
989 * This is used both when we started the grace period and when we notice
990 * that someone else started the grace period. The caller must hold the
991 * ->lock of the leaf rcu_node structure corresponding to the current CPU,
992 * and must have irqs disabled.
993 */
994static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
995{
996 if (rdp->gpnum != rnp->gpnum) {
997 /*
998 * If the current grace period is waiting for this CPU,
999 * set up to detect a quiescent state, otherwise don't
1000 * go looking for one.
1001 */
1002 rdp->gpnum = rnp->gpnum;
1003 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
1004 rdp->passed_quiesce = 0;
1005 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1006 zero_cpu_stall_ticks(rdp);
1007 }
1008}
1009
1010static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
1011{
1012 unsigned long flags;
1013 struct rcu_node *rnp;
1014
1015 local_irq_save(flags);
1016 rnp = rdp->mynode;
1017 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
1018 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
1019 local_irq_restore(flags);
1020 return;
1021 }
1022 __note_new_gpnum(rsp, rnp, rdp);
1023 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1024}
1025
1026/*
1027 * Did someone else start a new RCU grace period start since we last
1028 * checked? Update local state appropriately if so. Must be called
1029 * on the CPU corresponding to rdp.
1030 */
1031static int
1032check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
1033{
1034 unsigned long flags;
1035 int ret = 0;
1036
1037 local_irq_save(flags);
1038 if (rdp->gpnum != rsp->gpnum) {
1039 note_new_gpnum(rsp, rdp);
1040 ret = 1;
1041 }
1042 local_irq_restore(flags);
1043 return ret;
1044}
1045
1046/*
1047 * Initialize the specified rcu_data structure's callback list to empty. 988 * Initialize the specified rcu_data structure's callback list to empty.
1048 */ 989 */
1049static void init_callback_list(struct rcu_data *rdp) 990static void init_callback_list(struct rcu_data *rdp)
@@ -1313,18 +1254,16 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1313} 1254}
1314 1255
1315/* 1256/*
1316 * Advance this CPU's callbacks, but only if the current grace period 1257 * Update CPU-local rcu_data state to record the beginnings and ends of
1317 * has ended. This may be called only from the CPU to whom the rdp 1258 * grace periods. The caller must hold the ->lock of the leaf rcu_node
1318 * belongs. In addition, the corresponding leaf rcu_node structure's 1259 * structure corresponding to the current CPU, and must have irqs disabled.
1319 * ->lock must be held by the caller, with irqs disabled.
1320 */ 1260 */
1321static void 1261static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
1322__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
1323{ 1262{
1324 /* Did another grace period end? */ 1263 /* Handle the ends of any preceding grace periods first. */
1325 if (rdp->completed == rnp->completed) { 1264 if (rdp->completed == rnp->completed) {
1326 1265
1327 /* No, so just accelerate recent callbacks. */ 1266 /* No grace period end, so just accelerate recent callbacks. */
1328 rcu_accelerate_cbs(rsp, rnp, rdp); 1267 rcu_accelerate_cbs(rsp, rnp, rdp);
1329 1268
1330 } else { 1269 } else {
@@ -1335,68 +1274,40 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
1335 /* Remember that we saw this grace-period completion. */ 1274 /* Remember that we saw this grace-period completion. */
1336 rdp->completed = rnp->completed; 1275 rdp->completed = rnp->completed;
1337 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); 1276 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
1277 }
1338 1278
1279 if (rdp->gpnum != rnp->gpnum) {
1339 /* 1280 /*
1340 * If we were in an extended quiescent state, we may have 1281 * If the current grace period is waiting for this CPU,
1341 * missed some grace periods that others CPUs handled on 1282 * set up to detect a quiescent state, otherwise don't
1342 * our behalf. Catch up with this state to avoid noting 1283 * go looking for one.
1343 * spurious new grace periods. If another grace period
1344 * has started, then rnp->gpnum will have advanced, so
1345 * we will detect this later on. Of course, any quiescent
1346 * states we found for the old GP are now invalid.
1347 */
1348 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) {
1349 rdp->gpnum = rdp->completed;
1350 rdp->passed_quiesce = 0;
1351 }
1352
1353 /*
1354 * If RCU does not need a quiescent state from this CPU,
1355 * then make sure that this CPU doesn't go looking for one.
1356 */ 1284 */
1357 if ((rnp->qsmask & rdp->grpmask) == 0) 1285 rdp->gpnum = rnp->gpnum;
1358 rdp->qs_pending = 0; 1286 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
1287 rdp->passed_quiesce = 0;
1288 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1289 zero_cpu_stall_ticks(rdp);
1359 } 1290 }
1360} 1291}
1361 1292
1362/* 1293static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1363 * Advance this CPU's callbacks, but only if the current grace period
1364 * has ended. This may be called only from the CPU to whom the rdp
1365 * belongs.
1366 */
1367static void
1368rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
1369{ 1294{
1370 unsigned long flags; 1295 unsigned long flags;
1371 struct rcu_node *rnp; 1296 struct rcu_node *rnp;
1372 1297
1373 local_irq_save(flags); 1298 local_irq_save(flags);
1374 rnp = rdp->mynode; 1299 rnp = rdp->mynode;
1375 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ 1300 if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
1301 rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */
1376 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ 1302 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
1377 local_irq_restore(flags); 1303 local_irq_restore(flags);
1378 return; 1304 return;
1379 } 1305 }
1380 __rcu_process_gp_end(rsp, rnp, rdp); 1306 __note_gp_changes(rsp, rnp, rdp);
1381 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1307 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1382} 1308}
1383 1309
1384/* 1310/*
1385 * Do per-CPU grace-period initialization for running CPU. The caller
1386 * must hold the lock of the leaf rcu_node structure corresponding to
1387 * this CPU.
1388 */
1389static void
1390rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
1391{
1392 /* Prior grace period ended, so advance callbacks for current CPU. */
1393 __rcu_process_gp_end(rsp, rnp, rdp);
1394
1395 /* Set state so that this CPU will detect the next quiescent state. */
1396 __note_new_gpnum(rsp, rnp, rdp);
1397}
1398
1399/*
1400 * Initialize a new grace period. 1311 * Initialize a new grace period.
1401 */ 1312 */
1402static int rcu_gp_init(struct rcu_state *rsp) 1313static int rcu_gp_init(struct rcu_state *rsp)
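The hunk above folds the old gpnum/completed tracking into a single __note_gp_changes()/note_gp_changes() pair. A minimal userspace sketch of the bookkeeping being consolidated, with invented simplified types, no rcu_node locking, tracing, or wrap-safe counter comparisons; it only shows when a CPU starts looking for a quiescent state:

        /* Illustrative userspace model of the gpnum/completed bookkeeping
         * that __note_gp_changes() now handles in one place; not kernel code. */
        #include <stdbool.h>
        #include <stdio.h>

        struct node   { unsigned long gpnum, completed, qsmask; };
        struct percpu { unsigned long gpnum, completed, grpmask;
                        bool passed_quiesce, qs_pending; };

        static void note_gp_changes_model(struct node *rnp, struct percpu *rdp)
        {
                if (rdp->completed != rnp->completed) {
                        /* A grace period this CPU took part in has ended. */
                        rdp->completed = rnp->completed;
                }
                if (rdp->gpnum != rnp->gpnum) {
                        /* A new grace period has started; look for a quiescent
                         * state only if this CPU is actually being waited on. */
                        rdp->gpnum = rnp->gpnum;
                        rdp->passed_quiesce = false;
                        rdp->qs_pending = (rnp->qsmask & rdp->grpmask) != 0;
                }
        }

        int main(void)
        {
                struct node   rnp = { .gpnum = 5, .completed = 4, .qsmask = 0x1 };
                struct percpu rdp = { .gpnum = 4, .completed = 4, .grpmask = 0x1 };

                note_gp_changes_model(&rnp, &rdp);
                printf("gpnum=%lu qs_pending=%d\n", rdp.gpnum, rdp.qs_pending);
                return 0;
        }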
@@ -1444,7 +1355,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1444 WARN_ON_ONCE(rnp->completed != rsp->completed); 1355 WARN_ON_ONCE(rnp->completed != rsp->completed);
1445 ACCESS_ONCE(rnp->completed) = rsp->completed; 1356 ACCESS_ONCE(rnp->completed) = rsp->completed;
1446 if (rnp == rdp->mynode) 1357 if (rnp == rdp->mynode)
1447 rcu_start_gp_per_cpu(rsp, rnp, rdp); 1358 __note_gp_changes(rsp, rnp, rdp);
1448 rcu_preempt_boost_start_gp(rnp); 1359 rcu_preempt_boost_start_gp(rnp);
1449 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 1360 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
1450 rnp->level, rnp->grplo, 1361 rnp->level, rnp->grplo,
@@ -1527,7 +1438,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1527 ACCESS_ONCE(rnp->completed) = rsp->gpnum; 1438 ACCESS_ONCE(rnp->completed) = rsp->gpnum;
1528 rdp = this_cpu_ptr(rsp->rda); 1439 rdp = this_cpu_ptr(rsp->rda);
1529 if (rnp == rdp->mynode) 1440 if (rnp == rdp->mynode)
1530 __rcu_process_gp_end(rsp, rnp, rdp); 1441 __note_gp_changes(rsp, rnp, rdp);
1531 nocb += rcu_future_gp_cleanup(rsp, rnp); 1442 nocb += rcu_future_gp_cleanup(rsp, rnp);
1532 raw_spin_unlock_irq(&rnp->lock); 1443 raw_spin_unlock_irq(&rnp->lock);
1533 cond_resched(); 1444 cond_resched();
@@ -1805,9 +1716,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1805static void 1716static void
1806rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) 1717rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1807{ 1718{
1808 /* If there is now a new grace period, record and return. */ 1719 /* Check for grace-period ends and beginnings. */
1809 if (check_for_new_grace_period(rsp, rdp)) 1720 note_gp_changes(rsp, rdp);
1810 return;
1811 1721
1812 /* 1722 /*
1813 * Does this CPU still need to do its part for current grace period? 1723 * Does this CPU still need to do its part for current grace period?
@@ -2271,9 +2181,6 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2271 2181
2272 WARN_ON_ONCE(rdp->beenonline == 0); 2182 WARN_ON_ONCE(rdp->beenonline == 0);
2273 2183
2274 /* Handle the end of a grace period that some other CPU ended. */
2275 rcu_process_gp_end(rsp, rdp);
2276
2277 /* Update RCU state based on any recent quiescent states. */ 2184 /* Update RCU state based on any recent quiescent states. */
2278 rcu_check_quiescent_state(rsp, rdp); 2185 rcu_check_quiescent_state(rsp, rdp);
2279 2186
@@ -2358,8 +2265,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2358 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 2265 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
2359 2266
2360 /* Are we ignoring a completed grace period? */ 2267 /* Are we ignoring a completed grace period? */
2361 rcu_process_gp_end(rsp, rdp); 2268 note_gp_changes(rsp, rdp);
2362 check_for_new_grace_period(rsp, rdp);
2363 2269
2364 /* Start a new grace period if one not already started. */ 2270 /* Start a new grace period if one not already started. */
2365 if (!rcu_gp_in_progress(rsp)) { 2271 if (!rcu_gp_in_progress(rsp)) {
@@ -3004,7 +2910,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
3004 * can accept some slop in the rsp->completed access due to the fact 2910 * can accept some slop in the rsp->completed access due to the fact
3005 * that this CPU cannot possibly have any RCU callbacks in flight yet. 2911 * that this CPU cannot possibly have any RCU callbacks in flight yet.
3006 */ 2912 */
3007static void __cpuinit 2913static void
3008rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) 2914rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
3009{ 2915{
3010 unsigned long flags; 2916 unsigned long flags;
@@ -3056,7 +2962,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
3056 mutex_unlock(&rsp->onoff_mutex); 2962 mutex_unlock(&rsp->onoff_mutex);
3057} 2963}
3058 2964
3059static void __cpuinit rcu_prepare_cpu(int cpu) 2965static void rcu_prepare_cpu(int cpu)
3060{ 2966{
3061 struct rcu_state *rsp; 2967 struct rcu_state *rsp;
3062 2968
@@ -3068,7 +2974,7 @@ static void __cpuinit rcu_prepare_cpu(int cpu)
3068/* 2974/*
3069 * Handle CPU online/offline notification events. 2975 * Handle CPU online/offline notification events.
3070 */ 2976 */
3071static int __cpuinit rcu_cpu_notify(struct notifier_block *self, 2977static int rcu_cpu_notify(struct notifier_block *self,
3072 unsigned long action, void *hcpu) 2978 unsigned long action, void *hcpu)
3073{ 2979{
3074 long cpu = (long)hcpu; 2980 long cpu = (long)hcpu;
@@ -3120,7 +3026,7 @@ static int __init rcu_spawn_gp_kthread(void)
3120 struct task_struct *t; 3026 struct task_struct *t;
3121 3027
3122 for_each_rcu_flavor(rsp) { 3028 for_each_rcu_flavor(rsp) {
3123 t = kthread_run(rcu_gp_kthread, rsp, rsp->name); 3029 t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name);
3124 BUG_ON(IS_ERR(t)); 3030 BUG_ON(IS_ERR(t));
3125 rnp = rcu_get_root(rsp); 3031 rnp = rcu_get_root(rsp);
3126 raw_spin_lock_irqsave(&rnp->lock, flags); 3032 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -3265,11 +3171,25 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3265 */ 3171 */
3266static void __init rcu_init_geometry(void) 3172static void __init rcu_init_geometry(void)
3267{ 3173{
3174 ulong d;
3268 int i; 3175 int i;
3269 int j; 3176 int j;
3270 int n = nr_cpu_ids; 3177 int n = nr_cpu_ids;
3271 int rcu_capacity[MAX_RCU_LVLS + 1]; 3178 int rcu_capacity[MAX_RCU_LVLS + 1];
3272 3179
3180 /*
3181 * Initialize any unspecified boot parameters.
3182 * The default values of jiffies_till_first_fqs and
3183 * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS
3184 * value, which is a function of HZ, then adding one for each
3185 * RCU_JIFFIES_FQS_DIV CPUs that might be on the system.
3186 */
3187 d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
3188 if (jiffies_till_first_fqs == ULONG_MAX)
3189 jiffies_till_first_fqs = d;
3190 if (jiffies_till_next_fqs == ULONG_MAX)
3191 jiffies_till_next_fqs = d;
3192
3273 /* If the compile-time values are accurate, just leave. */ 3193 /* If the compile-time values are accurate, just leave. */
3274 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && 3194 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
3275 nr_cpu_ids == NR_CPUS) 3195 nr_cpu_ids == NR_CPUS)
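The new block in rcu_init_geometry() derives the default force-quiescent-state delay from HZ and the number of possible CPUs. A small standalone sketch of that arithmetic; the macro values mirror the rcutree.h hunk below, while HZ and nr_cpu_ids are example inputs, not taken from any real configuration:

        /* Standalone sketch of the default jiffies_till_*_fqs computation. */
        #include <stdio.h>

        #define HZ 1000
        #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
        #define RCU_JIFFIES_FQS_DIV 256

        int main(void)
        {
                unsigned long nr_cpu_ids = 1024;        /* example system size */
                unsigned long d;

                /* Base delay scales with HZ, plus one jiffy per 256 possible CPUs. */
                d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
                printf("default fqs delay: %lu jiffies\n", d);  /* 3 + 4 = 7 here */
                return 0;
        }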
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4df503470e42..b3832581043c 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -343,12 +343,17 @@ struct rcu_data {
343#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 343#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
344#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 344#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
345 345
346#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 346#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
347 /* For jiffies_till_first_fqs and */
348 /* and jiffies_till_next_fqs. */
347 349
348#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 350#define RCU_JIFFIES_FQS_DIV 256 /* Very large systems need more */
349 /* to take at least one */ 351 /* delay between bouts of */
350 /* scheduling clock irq */ 352 /* quiescent-state forcing. */
351 /* before ratting on them. */ 353
354#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time to take */
355 /* at least one scheduling clock */
356 /* irq before ratting on them. */
352 357
353#define rcu_wait(cond) \ 358#define rcu_wait(cond) \
354do { \ 359do { \
@@ -516,10 +521,10 @@ static void invoke_rcu_callbacks_kthread(void);
516static bool rcu_is_callbacks_kthread(void); 521static bool rcu_is_callbacks_kthread(void);
517#ifdef CONFIG_RCU_BOOST 522#ifdef CONFIG_RCU_BOOST
518static void rcu_preempt_do_callbacks(void); 523static void rcu_preempt_do_callbacks(void);
519static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 524static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
520 struct rcu_node *rnp); 525 struct rcu_node *rnp);
521#endif /* #ifdef CONFIG_RCU_BOOST */ 526#endif /* #ifdef CONFIG_RCU_BOOST */
522static void __cpuinit rcu_prepare_kthreads(int cpu); 527static void rcu_prepare_kthreads(int cpu);
523static void rcu_cleanup_after_idle(int cpu); 528static void rcu_cleanup_after_idle(int cpu);
524static void rcu_prepare_for_idle(int cpu); 529static void rcu_prepare_for_idle(int cpu);
525static void rcu_idle_count_callbacks_posted(void); 530static void rcu_idle_count_callbacks_posted(void);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 3db5a375d8dd..769e12e3151b 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -53,38 +53,37 @@ static char __initdata nocb_buf[NR_CPUS * 5];
53static void __init rcu_bootup_announce_oddness(void) 53static void __init rcu_bootup_announce_oddness(void)
54{ 54{
55#ifdef CONFIG_RCU_TRACE 55#ifdef CONFIG_RCU_TRACE
56 printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n"); 56 pr_info("\tRCU debugfs-based tracing is enabled.\n");
57#endif 57#endif
58#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) 58#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
59 printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n", 59 pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
60 CONFIG_RCU_FANOUT); 60 CONFIG_RCU_FANOUT);
61#endif 61#endif
62#ifdef CONFIG_RCU_FANOUT_EXACT 62#ifdef CONFIG_RCU_FANOUT_EXACT
63 printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n"); 63 pr_info("\tHierarchical RCU autobalancing is disabled.\n");
64#endif 64#endif
65#ifdef CONFIG_RCU_FAST_NO_HZ 65#ifdef CONFIG_RCU_FAST_NO_HZ
66 printk(KERN_INFO 66 pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
67 "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
68#endif 67#endif
69#ifdef CONFIG_PROVE_RCU 68#ifdef CONFIG_PROVE_RCU
70 printk(KERN_INFO "\tRCU lockdep checking is enabled.\n"); 69 pr_info("\tRCU lockdep checking is enabled.\n");
71#endif 70#endif
72#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 71#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
73 printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); 72 pr_info("\tRCU torture testing starts during boot.\n");
74#endif 73#endif
75#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) 74#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
76 printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n"); 75 pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n");
77#endif 76#endif
78#if defined(CONFIG_RCU_CPU_STALL_INFO) 77#if defined(CONFIG_RCU_CPU_STALL_INFO)
79 printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); 78 pr_info("\tAdditional per-CPU info printed with stalls.\n");
80#endif 79#endif
81#if NUM_RCU_LVL_4 != 0 80#if NUM_RCU_LVL_4 != 0
82 printk(KERN_INFO "\tFour-level hierarchy is enabled.\n"); 81 pr_info("\tFour-level hierarchy is enabled.\n");
83#endif 82#endif
84 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) 83 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
85 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); 84 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
86 if (nr_cpu_ids != NR_CPUS) 85 if (nr_cpu_ids != NR_CPUS)
87 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 86 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
88#ifdef CONFIG_RCU_NOCB_CPU 87#ifdef CONFIG_RCU_NOCB_CPU
89#ifndef CONFIG_RCU_NOCB_CPU_NONE 88#ifndef CONFIG_RCU_NOCB_CPU_NONE
90 if (!have_rcu_nocb_mask) { 89 if (!have_rcu_nocb_mask) {
@@ -92,19 +91,19 @@ static void __init rcu_bootup_announce_oddness(void)
92 have_rcu_nocb_mask = true; 91 have_rcu_nocb_mask = true;
93 } 92 }
94#ifdef CONFIG_RCU_NOCB_CPU_ZERO 93#ifdef CONFIG_RCU_NOCB_CPU_ZERO
95 pr_info("\tExperimental no-CBs CPU 0\n"); 94 pr_info("\tOffload RCU callbacks from CPU 0\n");
96 cpumask_set_cpu(0, rcu_nocb_mask); 95 cpumask_set_cpu(0, rcu_nocb_mask);
97#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ 96#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
98#ifdef CONFIG_RCU_NOCB_CPU_ALL 97#ifdef CONFIG_RCU_NOCB_CPU_ALL
99 pr_info("\tExperimental no-CBs for all CPUs\n"); 98 pr_info("\tOffload RCU callbacks from all CPUs\n");
100 cpumask_setall(rcu_nocb_mask); 99 cpumask_setall(rcu_nocb_mask);
101#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ 100#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
102#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ 101#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
103 if (have_rcu_nocb_mask) { 102 if (have_rcu_nocb_mask) {
104 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); 103 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
105 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); 104 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
106 if (rcu_nocb_poll) 105 if (rcu_nocb_poll)
107 pr_info("\tExperimental polled no-CBs CPUs.\n"); 106 pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
108 } 107 }
109#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 108#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
110} 109}
@@ -123,7 +122,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
123 */ 122 */
124static void __init rcu_bootup_announce(void) 123static void __init rcu_bootup_announce(void)
125{ 124{
126 printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n"); 125 pr_info("Preemptible hierarchical RCU implementation.\n");
127 rcu_bootup_announce_oddness(); 126 rcu_bootup_announce_oddness();
128} 127}
129 128
@@ -490,13 +489,13 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
490 489
491static void rcu_print_task_stall_begin(struct rcu_node *rnp) 490static void rcu_print_task_stall_begin(struct rcu_node *rnp)
492{ 491{
493 printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", 492 pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
494 rnp->level, rnp->grplo, rnp->grphi); 493 rnp->level, rnp->grplo, rnp->grphi);
495} 494}
496 495
497static void rcu_print_task_stall_end(void) 496static void rcu_print_task_stall_end(void)
498{ 497{
499 printk(KERN_CONT "\n"); 498 pr_cont("\n");
500} 499}
501 500
502#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ 501#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
@@ -526,7 +525,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
526 t = list_entry(rnp->gp_tasks, 525 t = list_entry(rnp->gp_tasks,
527 struct task_struct, rcu_node_entry); 526 struct task_struct, rcu_node_entry);
528 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { 527 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
529 printk(KERN_CONT " P%d", t->pid); 528 pr_cont(" P%d", t->pid);
530 ndetected++; 529 ndetected++;
531 } 530 }
532 rcu_print_task_stall_end(); 531 rcu_print_task_stall_end();
@@ -933,6 +932,24 @@ static void __init __rcu_init_preempt(void)
933 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); 932 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
934} 933}
935 934
935/*
936 * Check for a task exiting while in a preemptible-RCU read-side
937 * critical section, clean up if so. No need to issue warnings,
938 * as debug_check_no_locks_held() already does this if lockdep
939 * is enabled.
940 */
941void exit_rcu(void)
942{
943 struct task_struct *t = current;
944
945 if (likely(list_empty(&current->rcu_node_entry)))
946 return;
947 t->rcu_read_lock_nesting = 1;
948 barrier();
949 t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
950 __rcu_read_unlock();
951}
952
936#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 953#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
937 954
938static struct rcu_state *rcu_state = &rcu_sched_state; 955static struct rcu_state *rcu_state = &rcu_sched_state;
@@ -942,7 +959,7 @@ static struct rcu_state *rcu_state = &rcu_sched_state;
942 */ 959 */
943static void __init rcu_bootup_announce(void) 960static void __init rcu_bootup_announce(void)
944{ 961{
945 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 962 pr_info("Hierarchical RCU implementation.\n");
946 rcu_bootup_announce_oddness(); 963 rcu_bootup_announce_oddness();
947} 964}
948 965
@@ -1101,6 +1118,14 @@ static void __init __rcu_init_preempt(void)
1101{ 1118{
1102} 1119}
1103 1120
1121/*
1122 * Because preemptible RCU does not exist, tasks cannot possibly exit
1123 * while in preemptible RCU read-side critical sections.
1124 */
1125void exit_rcu(void)
1126{
1127}
1128
1104#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1129#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1105 1130
1106#ifdef CONFIG_RCU_BOOST 1131#ifdef CONFIG_RCU_BOOST
@@ -1327,7 +1352,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1327 * already exist. We only create this kthread for preemptible RCU. 1352 * already exist. We only create this kthread for preemptible RCU.
1328 * Returns zero if all is well, a negated errno otherwise. 1353 * Returns zero if all is well, a negated errno otherwise.
1329 */ 1354 */
1330static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 1355static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1331 struct rcu_node *rnp) 1356 struct rcu_node *rnp)
1332{ 1357{
1333 int rnp_index = rnp - &rsp->node[0]; 1358 int rnp_index = rnp - &rsp->node[0];
@@ -1482,7 +1507,7 @@ static int __init rcu_spawn_kthreads(void)
1482} 1507}
1483early_initcall(rcu_spawn_kthreads); 1508early_initcall(rcu_spawn_kthreads);
1484 1509
1485static void __cpuinit rcu_prepare_kthreads(int cpu) 1510static void rcu_prepare_kthreads(int cpu)
1486{ 1511{
1487 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 1512 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1488 struct rcu_node *rnp = rdp->mynode; 1513 struct rcu_node *rnp = rdp->mynode;
@@ -1524,7 +1549,7 @@ static int __init rcu_scheduler_really_started(void)
1524} 1549}
1525early_initcall(rcu_scheduler_really_started); 1550early_initcall(rcu_scheduler_really_started);
1526 1551
1527static void __cpuinit rcu_prepare_kthreads(int cpu) 1552static void rcu_prepare_kthreads(int cpu)
1528{ 1553{
1529} 1554}
1530 1555
@@ -1629,7 +1654,7 @@ static bool rcu_try_advance_all_cbs(void)
1629 */ 1654 */
1630 if (rdp->completed != rnp->completed && 1655 if (rdp->completed != rnp->completed &&
1631 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) 1656 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
1632 rcu_process_gp_end(rsp, rdp); 1657 note_gp_changes(rsp, rdp);
1633 1658
1634 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1659 if (cpu_has_callbacks_ready_to_invoke(rdp))
1635 cbs_ready = true; 1660 cbs_ready = true;
@@ -1883,7 +1908,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
1883/* Initiate the stall-info list. */ 1908/* Initiate the stall-info list. */
1884static void print_cpu_stall_info_begin(void) 1909static void print_cpu_stall_info_begin(void)
1885{ 1910{
1886 printk(KERN_CONT "\n"); 1911 pr_cont("\n");
1887} 1912}
1888 1913
1889/* 1914/*
@@ -1914,7 +1939,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1914 ticks_value = rsp->gpnum - rdp->gpnum; 1939 ticks_value = rsp->gpnum - rdp->gpnum;
1915 } 1940 }
1916 print_cpu_stall_fast_no_hz(fast_no_hz, cpu); 1941 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
1917 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", 1942 pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
1918 cpu, ticks_value, ticks_title, 1943 cpu, ticks_value, ticks_title,
1919 atomic_read(&rdtp->dynticks) & 0xfff, 1944 atomic_read(&rdtp->dynticks) & 0xfff,
1920 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, 1945 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
@@ -1925,7 +1950,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1925/* Terminate the stall-info list. */ 1950/* Terminate the stall-info list. */
1926static void print_cpu_stall_info_end(void) 1951static void print_cpu_stall_info_end(void)
1927{ 1952{
1928 printk(KERN_ERR "\t"); 1953 pr_err("\t");
1929} 1954}
1930 1955
1931/* Zero ->ticks_this_gp for all flavors of RCU. */ 1956/* Zero ->ticks_this_gp for all flavors of RCU. */
@@ -1948,17 +1973,17 @@ static void increment_cpu_stall_ticks(void)
1948 1973
1949static void print_cpu_stall_info_begin(void) 1974static void print_cpu_stall_info_begin(void)
1950{ 1975{
1951 printk(KERN_CONT " {"); 1976 pr_cont(" {");
1952} 1977}
1953 1978
1954static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) 1979static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1955{ 1980{
1956 printk(KERN_CONT " %d", cpu); 1981 pr_cont(" %d", cpu);
1957} 1982}
1958 1983
1959static void print_cpu_stall_info_end(void) 1984static void print_cpu_stall_info_end(void)
1960{ 1985{
1961 printk(KERN_CONT "} "); 1986 pr_cont("} ");
1962} 1987}
1963 1988
1964static void zero_cpu_stall_ticks(struct rcu_data *rdp) 1989static void zero_cpu_stall_ticks(struct rcu_data *rdp)
diff --git a/kernel/reboot.c b/kernel/reboot.c
new file mode 100644
index 000000000000..269ed9384cc4
--- /dev/null
+++ b/kernel/reboot.c
@@ -0,0 +1,419 @@
1/*
2 * linux/kernel/reboot.c
3 *
4 * Copyright (C) 2013 Linus Torvalds
5 */
6
7#define pr_fmt(fmt) "reboot: " fmt
8
9#include <linux/ctype.h>
10#include <linux/export.h>
11#include <linux/kexec.h>
12#include <linux/kmod.h>
13#include <linux/kmsg_dump.h>
14#include <linux/reboot.h>
15#include <linux/suspend.h>
16#include <linux/syscalls.h>
17#include <linux/syscore_ops.h>
18#include <linux/uaccess.h>
19
20/*
21 * this indicates whether you can reboot with ctrl-alt-del: the default is yes
22 */
23
24int C_A_D = 1;
25struct pid *cad_pid;
26EXPORT_SYMBOL(cad_pid);
27
28#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32)
29#define DEFAULT_REBOOT_MODE = REBOOT_HARD
30#else
31#define DEFAULT_REBOOT_MODE
32#endif
33enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
34
35int reboot_default;
36int reboot_cpu;
37enum reboot_type reboot_type = BOOT_ACPI;
38int reboot_force;
39
40/*
41 * If set, this is used for preparing the system to power off.
42 */
43
44void (*pm_power_off_prepare)(void);
45
46/**
47 * emergency_restart - reboot the system
48 *
49 * Without shutting down any hardware or taking any locks
50 * reboot the system. This is called when we know we are in
51 * trouble so this is our best effort to reboot. This is
52 * safe to call in interrupt context.
53 */
54void emergency_restart(void)
55{
56 kmsg_dump(KMSG_DUMP_EMERG);
57 machine_emergency_restart();
58}
59EXPORT_SYMBOL_GPL(emergency_restart);
60
61void kernel_restart_prepare(char *cmd)
62{
63 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
64 system_state = SYSTEM_RESTART;
65 usermodehelper_disable();
66 device_shutdown();
67}
68
69/**
70 * register_reboot_notifier - Register function to be called at reboot time
71 * @nb: Info about notifier function to be called
72 *
73 * Registers a function with the list of functions
74 * to be called at reboot time.
75 *
76 * Currently always returns zero, as blocking_notifier_chain_register()
77 * always returns zero.
78 */
79int register_reboot_notifier(struct notifier_block *nb)
80{
81 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
82}
83EXPORT_SYMBOL(register_reboot_notifier);
84
85/**
86 * unregister_reboot_notifier - Unregister previously registered reboot notifier
87 * @nb: Hook to be unregistered
88 *
89 * Unregisters a previously registered reboot
90 * notifier function.
91 *
92 * Returns zero on success, or %-ENOENT on failure.
93 */
94int unregister_reboot_notifier(struct notifier_block *nb)
95{
96 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
97}
98EXPORT_SYMBOL(unregister_reboot_notifier);
99
100static void migrate_to_reboot_cpu(void)
101{
102 /* The boot cpu is always logical cpu 0 */
103 int cpu = reboot_cpu;
104
105 cpu_hotplug_disable();
106
107 /* Make certain the cpu I'm about to reboot on is online */
108 if (!cpu_online(cpu))
109 cpu = cpumask_first(cpu_online_mask);
110
111 /* Prevent races with other tasks migrating this task */
112 current->flags |= PF_NO_SETAFFINITY;
113
114 /* Make certain I only run on the appropriate processor */
115 set_cpus_allowed_ptr(current, cpumask_of(cpu));
116}
117
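migrate_to_reboot_cpu() above pins the rebooting task to one online CPU before the machine_*() hooks run. A rough userspace analogue of that pinning step, using sched_setaffinity() on the calling thread; CPU 0 is just an example stand-in for reboot_cpu:

        /* Userspace analogue of restricting the current task to a single CPU. */
        #define _GNU_SOURCE
        #include <sched.h>
        #include <stdio.h>

        int main(void)
        {
                cpu_set_t set;
                int cpu = 0;                    /* "reboot_cpu" stand-in */

                CPU_ZERO(&set);
                CPU_SET(cpu, &set);
                if (sched_setaffinity(0, sizeof(set), &set))    /* 0 == this thread */
                        perror("sched_setaffinity");
                else
                        printf("now restricted to CPU %d\n", cpu);
                return 0;
        }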
118/**
119 * kernel_restart - reboot the system
120 * @cmd: pointer to buffer containing command to execute for restart
121 * or %NULL
122 *
123 * Shutdown everything and perform a clean reboot.
124 * This is not safe to call in interrupt context.
125 */
126void kernel_restart(char *cmd)
127{
128 kernel_restart_prepare(cmd);
129 migrate_to_reboot_cpu();
130 syscore_shutdown();
131 if (!cmd)
132 pr_emerg("Restarting system\n");
133 else
134 pr_emerg("Restarting system with command '%s'\n", cmd);
135 kmsg_dump(KMSG_DUMP_RESTART);
136 machine_restart(cmd);
137}
138EXPORT_SYMBOL_GPL(kernel_restart);
139
140static void kernel_shutdown_prepare(enum system_states state)
141{
142 blocking_notifier_call_chain(&reboot_notifier_list,
143 (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL);
144 system_state = state;
145 usermodehelper_disable();
146 device_shutdown();
147}
148/**
149 * kernel_halt - halt the system
150 *
151 * Shutdown everything and perform a clean system halt.
152 */
153void kernel_halt(void)
154{
155 kernel_shutdown_prepare(SYSTEM_HALT);
156 migrate_to_reboot_cpu();
157 syscore_shutdown();
158 pr_emerg("System halted\n");
159 kmsg_dump(KMSG_DUMP_HALT);
160 machine_halt();
161}
162EXPORT_SYMBOL_GPL(kernel_halt);
163
164/**
165 * kernel_power_off - power_off the system
166 *
167 * Shutdown everything and perform a clean system power_off.
168 */
169void kernel_power_off(void)
170{
171 kernel_shutdown_prepare(SYSTEM_POWER_OFF);
172 if (pm_power_off_prepare)
173 pm_power_off_prepare();
174 migrate_to_reboot_cpu();
175 syscore_shutdown();
176 pr_emerg("Power down\n");
177 kmsg_dump(KMSG_DUMP_POWEROFF);
178 machine_power_off();
179}
180EXPORT_SYMBOL_GPL(kernel_power_off);
181
182static DEFINE_MUTEX(reboot_mutex);
183
184/*
185 * Reboot system call: for obvious reasons only root may call it,
186 * and even root needs to set up some magic numbers in the registers
187 * so that some mistake won't make this reboot the whole machine.
188 * You can also set the meaning of the ctrl-alt-del-key here.
189 *
190 * reboot doesn't sync: do that yourself before calling this.
191 */
192SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
193 void __user *, arg)
194{
195 struct pid_namespace *pid_ns = task_active_pid_ns(current);
196 char buffer[256];
197 int ret = 0;
198
199 /* We only trust the superuser with rebooting the system. */
200 if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
201 return -EPERM;
202
203 /* For safety, we require "magic" arguments. */
204 if (magic1 != LINUX_REBOOT_MAGIC1 ||
205 (magic2 != LINUX_REBOOT_MAGIC2 &&
206 magic2 != LINUX_REBOOT_MAGIC2A &&
207 magic2 != LINUX_REBOOT_MAGIC2B &&
208 magic2 != LINUX_REBOOT_MAGIC2C))
209 return -EINVAL;
210
211 /*
212 * If pid namespaces are enabled and the current task is in a child
213 * pid_namespace, the command is handled by reboot_pid_ns() which will
214 * call do_exit().
215 */
216 ret = reboot_pid_ns(pid_ns, cmd);
217 if (ret)
218 return ret;
219
220 /* Instead of trying to make the power_off code look like
221 * halt when pm_power_off is not set do it the easy way.
222 */
223 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
224 cmd = LINUX_REBOOT_CMD_HALT;
225
226 mutex_lock(&reboot_mutex);
227 switch (cmd) {
228 case LINUX_REBOOT_CMD_RESTART:
229 kernel_restart(NULL);
230 break;
231
232 case LINUX_REBOOT_CMD_CAD_ON:
233 C_A_D = 1;
234 break;
235
236 case LINUX_REBOOT_CMD_CAD_OFF:
237 C_A_D = 0;
238 break;
239
240 case LINUX_REBOOT_CMD_HALT:
241 kernel_halt();
242 do_exit(0);
243 panic("cannot halt");
244
245 case LINUX_REBOOT_CMD_POWER_OFF:
246 kernel_power_off();
247 do_exit(0);
248 break;
249
250 case LINUX_REBOOT_CMD_RESTART2:
251 ret = strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1);
252 if (ret < 0) {
253 ret = -EFAULT;
254 break;
255 }
256 buffer[sizeof(buffer) - 1] = '\0';
257
258 kernel_restart(buffer);
259 break;
260
261#ifdef CONFIG_KEXEC
262 case LINUX_REBOOT_CMD_KEXEC:
263 ret = kernel_kexec();
264 break;
265#endif
266
267#ifdef CONFIG_HIBERNATION
268 case LINUX_REBOOT_CMD_SW_SUSPEND:
269 ret = hibernate();
270 break;
271#endif
272
273 default:
274 ret = -EINVAL;
275 break;
276 }
277 mutex_unlock(&reboot_mutex);
278 return ret;
279}
280
281static void deferred_cad(struct work_struct *dummy)
282{
283 kernel_restart(NULL);
284}
285
286/*
287 * This function gets called by ctrl-alt-del - ie the keyboard interrupt.
288 * As it's called within an interrupt, it may NOT sync: the only choice
289 * is whether to reboot at once, or just ignore the ctrl-alt-del.
290 */
291void ctrl_alt_del(void)
292{
293 static DECLARE_WORK(cad_work, deferred_cad);
294
295 if (C_A_D)
296 schedule_work(&cad_work);
297 else
298 kill_cad_pid(SIGINT, 1);
299}
300
301char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
302
303static int __orderly_poweroff(bool force)
304{
305 char **argv;
306 static char *envp[] = {
307 "HOME=/",
308 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
309 NULL
310 };
311 int ret;
312
313 argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
314 if (argv) {
315 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
316 argv_free(argv);
317 } else {
318 ret = -ENOMEM;
319 }
320
321 if (ret && force) {
322 pr_warn("Failed to start orderly shutdown: forcing the issue\n");
323 /*
324 * I guess this should try to kick off some daemon to sync and
325 * poweroff asap. Or not even bother syncing if we're doing an
326 * emergency shutdown?
327 */
328 emergency_sync();
329 kernel_power_off();
330 }
331
332 return ret;
333}
334
335static bool poweroff_force;
336
337static void poweroff_work_func(struct work_struct *work)
338{
339 __orderly_poweroff(poweroff_force);
340}
341
342static DECLARE_WORK(poweroff_work, poweroff_work_func);
343
344/**
345 * orderly_poweroff - Trigger an orderly system poweroff
346 * @force: force poweroff if command execution fails
347 *
348 * This may be called from any context to trigger a system shutdown.
349 * If the orderly shutdown fails, it will force an immediate shutdown.
350 */
351int orderly_poweroff(bool force)
352{
353 if (force) /* do not override the pending "true" */
354 poweroff_force = true;
355 schedule_work(&poweroff_work);
356 return 0;
357}
358EXPORT_SYMBOL_GPL(orderly_poweroff);
359
360static int __init reboot_setup(char *str)
361{
362 for (;;) {
363 /*
364 * Having anything passed on the command line via
365 * reboot= will cause us to disable DMI checking
366 * below.
367 */
368 reboot_default = 0;
369
370 switch (*str) {
371 case 'w':
372 reboot_mode = REBOOT_WARM;
373 break;
374
375 case 'c':
376 reboot_mode = REBOOT_COLD;
377 break;
378
379 case 'h':
380 reboot_mode = REBOOT_HARD;
381 break;
382
383 case 's':
384 if (isdigit(*(str+1)))
385 reboot_cpu = simple_strtoul(str+1, NULL, 0);
386 else if (str[1] == 'm' && str[2] == 'p' &&
387 isdigit(*(str+3)))
388 reboot_cpu = simple_strtoul(str+3, NULL, 0);
389 else
390 reboot_mode = REBOOT_SOFT;
391 break;
392
393 case 'g':
394 reboot_mode = REBOOT_GPIO;
395 break;
396
397 case 'b':
398 case 'a':
399 case 'k':
400 case 't':
401 case 'e':
402 case 'p':
403 reboot_type = *str;
404 break;
405
406 case 'f':
407 reboot_force = 1;
408 break;
409 }
410
411 str = strchr(str, ',');
412 if (str)
413 str++;
414 else
415 break;
416 }
417 return 1;
418}
419__setup("reboot=", reboot_setup);
diff --git a/kernel/relay.c b/kernel/relay.c
index b91488ba2e5a..5001c9887db1 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -516,7 +516,7 @@ static void setup_callbacks(struct rchan *chan,
516 * 516 *
517 * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) 517 * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
518 */ 518 */
519static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, 519static int relay_hotcpu_callback(struct notifier_block *nb,
520 unsigned long action, 520 unsigned long action,
521 void *hcpu) 521 void *hcpu)
522{ 522{
diff --git a/kernel/resource.c b/kernel/resource.c
index d7386986e10e..3f285dce9347 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -409,6 +409,7 @@ int __weak page_is_ram(unsigned long pfn)
409{ 409{
410 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; 410 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
411} 411}
412EXPORT_SYMBOL_GPL(page_is_ram);
412 413
413void __weak arch_remove_reservations(struct resource *avail) 414void __weak arch_remove_reservations(struct resource *avail)
414{ 415{
@@ -448,7 +449,6 @@ static int __find_resource(struct resource *root, struct resource *old,
448 struct resource *this = root->child; 449 struct resource *this = root->child;
449 struct resource tmp = *new, avail, alloc; 450 struct resource tmp = *new, avail, alloc;
450 451
451 tmp.flags = new->flags;
452 tmp.start = root->start; 452 tmp.start = root->start;
453 /* 453 /*
454 * Skip past an allocated resource that starts at 0, since the assignment 454 * Skip past an allocated resource that starts at 0, since the assignment
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 1e09308bf2a1..0dd6aec1cb6a 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -145,6 +145,19 @@ int max_lock_depth = 1024;
145/* 145/*
146 * Adjust the priority chain. Also used for deadlock detection. 146 * Adjust the priority chain. Also used for deadlock detection.
147 * Decreases task's usage by one - may thus free the task. 147 * Decreases task's usage by one - may thus free the task.
148 *
149 * @task: the task owning the mutex (owner) for which a chain walk is probably
150 * needed
151 * @deadlock_detect: do we have to carry out deadlock detection?
152 * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
153 * things for a task that has just got its priority adjusted, and
154 * is waiting on a mutex)
155 * @orig_waiter: rt_mutex_waiter struct for the task that has just donated
156 * its priority to the mutex owner (can be NULL in the case
157 * depicted above or if the top waiter is gone away and we are
158 * actually deboosting the owner)
159 * @top_task: the current top waiter
160 *
148 * Returns 0 or -EDEADLK. 161 * Returns 0 or -EDEADLK.
149 */ 162 */
150static int rt_mutex_adjust_prio_chain(struct task_struct *task, 163static int rt_mutex_adjust_prio_chain(struct task_struct *task,
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index deaf90e4a1de..54adcf35f495 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer 11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o 14obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
15obj-$(CONFIG_SMP) += cpupri.o 15obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 17obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 64de5f8b0c9e..4a073539c58e 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -77,8 +77,6 @@ static inline struct autogroup *autogroup_create(void)
77 if (IS_ERR(tg)) 77 if (IS_ERR(tg))
78 goto out_free; 78 goto out_free;
79 79
80 sched_online_group(tg, &root_task_group);
81
82 kref_init(&ag->kref); 80 kref_init(&ag->kref);
83 init_rwsem(&ag->lock); 81 init_rwsem(&ag->lock);
84 ag->id = atomic_inc_return(&autogroup_seq_nr); 82 ag->id = atomic_inc_return(&autogroup_seq_nr);
@@ -98,6 +96,7 @@ static inline struct autogroup *autogroup_create(void)
98#endif 96#endif
99 tg->autogroup = ag; 97 tg->autogroup = ag;
100 98
99 sched_online_group(tg, &root_task_group);
101 return ag; 100 return ag;
102 101
103out_free: 102out_free:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e8b335016c52..05c39f030314 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -370,13 +370,6 @@ static struct rq *this_rq_lock(void)
370#ifdef CONFIG_SCHED_HRTICK 370#ifdef CONFIG_SCHED_HRTICK
371/* 371/*
372 * Use HR-timers to deliver accurate preemption points. 372 * Use HR-timers to deliver accurate preemption points.
373 *
374 * Its all a bit involved since we cannot program an hrt while holding the
375 * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
376 * reschedule event.
377 *
378 * When we get rescheduled we reprogram the hrtick_timer outside of the
379 * rq->lock.
380 */ 373 */
381 374
382static void hrtick_clear(struct rq *rq) 375static void hrtick_clear(struct rq *rq)
@@ -404,6 +397,15 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
404} 397}
405 398
406#ifdef CONFIG_SMP 399#ifdef CONFIG_SMP
400
401static int __hrtick_restart(struct rq *rq)
402{
403 struct hrtimer *timer = &rq->hrtick_timer;
404 ktime_t time = hrtimer_get_softexpires(timer);
405
406 return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
407}
408
407/* 409/*
408 * called from hardirq (IPI) context 410 * called from hardirq (IPI) context
409 */ 411 */
@@ -412,7 +414,7 @@ static void __hrtick_start(void *arg)
412 struct rq *rq = arg; 414 struct rq *rq = arg;
413 415
414 raw_spin_lock(&rq->lock); 416 raw_spin_lock(&rq->lock);
415 hrtimer_restart(&rq->hrtick_timer); 417 __hrtick_restart(rq);
416 rq->hrtick_csd_pending = 0; 418 rq->hrtick_csd_pending = 0;
417 raw_spin_unlock(&rq->lock); 419 raw_spin_unlock(&rq->lock);
418} 420}
@@ -430,7 +432,7 @@ void hrtick_start(struct rq *rq, u64 delay)
430 hrtimer_set_expires(timer, time); 432 hrtimer_set_expires(timer, time);
431 433
432 if (rq == this_rq()) { 434 if (rq == this_rq()) {
433 hrtimer_restart(timer); 435 __hrtick_restart(rq);
434 } else if (!rq->hrtick_csd_pending) { 436 } else if (!rq->hrtick_csd_pending) {
435 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); 437 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
436 rq->hrtick_csd_pending = 1; 438 rq->hrtick_csd_pending = 1;
@@ -679,7 +681,7 @@ void sched_avg_update(struct rq *rq)
679{ 681{
680 s64 period = sched_avg_period(); 682 s64 period = sched_avg_period();
681 683
682 while ((s64)(rq->clock - rq->age_stamp) > period) { 684 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
683 /* 685 /*
684 * Inline assembly required to prevent the compiler 686 * Inline assembly required to prevent the compiler
685 * optimising this loop into a divmod call. 687 * optimising this loop into a divmod call.
@@ -931,6 +933,8 @@ static int effective_prio(struct task_struct *p)
931/** 933/**
932 * task_curr - is this task currently executing on a CPU? 934 * task_curr - is this task currently executing on a CPU?
933 * @p: the task in question. 935 * @p: the task in question.
936 *
937 * Return: 1 if the task is currently executing. 0 otherwise.
934 */ 938 */
935inline int task_curr(const struct task_struct *p) 939inline int task_curr(const struct task_struct *p)
936{ 940{
@@ -1340,7 +1344,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1340 p->sched_class->task_woken(rq, p); 1344 p->sched_class->task_woken(rq, p);
1341 1345
1342 if (rq->idle_stamp) { 1346 if (rq->idle_stamp) {
1343 u64 delta = rq->clock - rq->idle_stamp; 1347 u64 delta = rq_clock(rq) - rq->idle_stamp;
1344 u64 max = 2*sysctl_sched_migration_cost; 1348 u64 max = 2*sysctl_sched_migration_cost;
1345 1349
1346 if (delta > max) 1350 if (delta > max)
@@ -1377,6 +1381,8 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
1377 1381
1378 rq = __task_rq_lock(p); 1382 rq = __task_rq_lock(p);
1379 if (p->on_rq) { 1383 if (p->on_rq) {
1384 /* check_preempt_curr() may use rq clock */
1385 update_rq_clock(rq);
1380 ttwu_do_wakeup(rq, p, wake_flags); 1386 ttwu_do_wakeup(rq, p, wake_flags);
1381 ret = 1; 1387 ret = 1;
1382 } 1388 }
@@ -1478,7 +1484,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
1478 * the simpler "current->state = TASK_RUNNING" to mark yourself 1484 * the simpler "current->state = TASK_RUNNING" to mark yourself
1479 * runnable without the overhead of this. 1485 * runnable without the overhead of this.
1480 * 1486 *
1481 * Returns %true if @p was woken up, %false if it was already running 1487 * Return: %true if @p was woken up, %false if it was already running.
1482 * or @state didn't match @p's state. 1488 * or @state didn't match @p's state.
1483 */ 1489 */
1484static int 1490static int
@@ -1487,7 +1493,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1487 unsigned long flags; 1493 unsigned long flags;
1488 int cpu, success = 0; 1494 int cpu, success = 0;
1489 1495
1490 smp_wmb(); 1496 /*
1497 * If we are going to wake up a thread waiting for CONDITION we
1498 * need to ensure that CONDITION=1 done by the caller can not be
1499 * reordered with p->state check below. This pairs with mb() in
1500 * set_current_state() the waiting thread does.
1501 */
1502 smp_mb__before_spinlock();
1491 raw_spin_lock_irqsave(&p->pi_lock, flags); 1503 raw_spin_lock_irqsave(&p->pi_lock, flags);
1492 if (!(p->state & state)) 1504 if (!(p->state & state))
1493 goto out; 1505 goto out;
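The comment replacing the bare smp_wmb() describes the classic store-buffering pairing between waker and sleeper: each side stores, then issues a full barrier, then loads what the other side stored, so at least one of them sees the other's update and the wakeup cannot be lost. A userspace C11 sketch of that pairing; the names are invented, and seq_cst fences stand in for smp_mb__before_spinlock() and the mb() in set_current_state():

        /* C11 model of the waker/sleeper barrier pairing described above. */
        #include <pthread.h>
        #include <stdatomic.h>
        #include <stdio.h>

        static atomic_int condition;    /* CONDITION the sleeper waits for */
        static atomic_int task_state;   /* 1 == sleeping, 0 == running     */

        static void *waker(void *arg)
        {
                (void)arg;
                atomic_store_explicit(&condition, 1, memory_order_relaxed);
                atomic_thread_fence(memory_order_seq_cst);  /* smp_mb__before_spinlock() */
                if (atomic_load_explicit(&task_state, memory_order_relaxed))
                        puts("waker: sleeper still blocked, would wake it");
                return NULL;
        }

        static void *sleeper(void *arg)
        {
                (void)arg;
                atomic_store_explicit(&task_state, 1, memory_order_relaxed);
                atomic_thread_fence(memory_order_seq_cst);  /* set_current_state() mb() */
                if (!atomic_load_explicit(&condition, memory_order_relaxed))
                        puts("sleeper: condition not yet set, would block");
                return NULL;
        }

        int main(void)
        {
                pthread_t a, b;

                /* With both fences in place, at least one thread observes the
                 * other's store, so the wakeup is never silently dropped. */
                pthread_create(&a, NULL, waker, NULL);
                pthread_create(&b, NULL, sleeper, NULL);
                pthread_join(a, NULL);
                pthread_join(b, NULL);
                return 0;
        }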
@@ -1573,8 +1585,9 @@ out:
1573 * @p: The process to be woken up. 1585 * @p: The process to be woken up.
1574 * 1586 *
1575 * Attempt to wake up the nominated process and move it to the set of runnable 1587 * Attempt to wake up the nominated process and move it to the set of runnable
1576 * processes. Returns 1 if the process was woken up, 0 if it was already 1588 * processes.
1577 * running. 1589 *
1590 * Return: 1 if the process was woken up, 0 if it was already running.
1578 * 1591 *
1579 * It may be assumed that this function implies a write memory barrier before 1592 * It may be assumed that this function implies a write memory barrier before
1580 * changing the task state if and only if any tasks are woken up. 1593 * changing the task state if and only if any tasks are woken up.
@@ -1609,15 +1622,6 @@ static void __sched_fork(struct task_struct *p)
1609 p->se.vruntime = 0; 1622 p->se.vruntime = 0;
1610 INIT_LIST_HEAD(&p->se.group_node); 1623 INIT_LIST_HEAD(&p->se.group_node);
1611 1624
1612/*
1613 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
1614 * removed when useful for applications beyond shares distribution (e.g.
1615 * load-balance).
1616 */
1617#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1618 p->se.avg.runnable_avg_period = 0;
1619 p->se.avg.runnable_avg_sum = 0;
1620#endif
1621#ifdef CONFIG_SCHEDSTATS 1625#ifdef CONFIG_SCHEDSTATS
1622 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1626 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1623#endif 1627#endif
@@ -1761,6 +1765,8 @@ void wake_up_new_task(struct task_struct *p)
1761 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 1765 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
1762#endif 1766#endif
1763 1767
1768 /* Initialize new task's runnable average */
1769 init_task_runnable_average(p);
1764 rq = __task_rq_lock(p); 1770 rq = __task_rq_lock(p);
1765 activate_task(rq, p, 0); 1771 activate_task(rq, p, 0);
1766 p->on_rq = 1; 1772 p->on_rq = 1;
@@ -2069,575 +2075,6 @@ unsigned long nr_iowait_cpu(int cpu)
2069 return atomic_read(&this->nr_iowait); 2075 return atomic_read(&this->nr_iowait);
2070} 2076}
2071 2077
2072unsigned long this_cpu_load(void)
2073{
2074 struct rq *this = this_rq();
2075 return this->cpu_load[0];
2076}
2077
2078
2079/*
2080 * Global load-average calculations
2081 *
2082 * We take a distributed and async approach to calculating the global load-avg
2083 * in order to minimize overhead.
2084 *
2085 * The global load average is an exponentially decaying average of nr_running +
2086 * nr_uninterruptible.
2087 *
2088 * Once every LOAD_FREQ:
2089 *
2090 * nr_active = 0;
2091 * for_each_possible_cpu(cpu)
2092 * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
2093 *
2094 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
2095 *
2096 * Due to a number of reasons the above turns in the mess below:
2097 *
2098 * - for_each_possible_cpu() is prohibitively expensive on machines with
2099 * serious number of cpus, therefore we need to take a distributed approach
2100 * to calculating nr_active.
2101 *
2102 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
2103 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
2104 *
2105 * So assuming nr_active := 0 when we start out -- true per definition, we
2106 * can simply take per-cpu deltas and fold those into a global accumulate
2107 * to obtain the same result. See calc_load_fold_active().
2108 *
2109 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
2110 * across the machine, we assume 10 ticks is sufficient time for every
2111 * cpu to have completed this task.
2112 *
2113 * This places an upper-bound on the IRQ-off latency of the machine. Then
 2114 * again, being late doesn't lose the delta, just wrecks the sample.
2115 *
2116 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
2117 * this would add another cross-cpu cacheline miss and atomic operation
2118 * to the wakeup path. Instead we increment on whatever cpu the task ran
2119 * when it went into uninterruptible state and decrement on whatever cpu
2120 * did the wakeup. This means that only the sum of nr_uninterruptible over
2121 * all cpus yields the correct result.
2122 *
2123 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
2124 */
2125
2126/* Variables and functions for calc_load */
2127static atomic_long_t calc_load_tasks;
2128static unsigned long calc_load_update;
2129unsigned long avenrun[3];
2130EXPORT_SYMBOL(avenrun); /* should be removed */
2131
2132/**
2133 * get_avenrun - get the load average array
2134 * @loads: pointer to dest load array
2135 * @offset: offset to add
2136 * @shift: shift count to shift the result left
2137 *
2138 * These values are estimates at best, so no need for locking.
2139 */
2140void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2141{
2142 loads[0] = (avenrun[0] + offset) << shift;
2143 loads[1] = (avenrun[1] + offset) << shift;
2144 loads[2] = (avenrun[2] + offset) << shift;
2145}
2146
2147static long calc_load_fold_active(struct rq *this_rq)
2148{
2149 long nr_active, delta = 0;
2150
2151 nr_active = this_rq->nr_running;
2152 nr_active += (long) this_rq->nr_uninterruptible;
2153
2154 if (nr_active != this_rq->calc_load_active) {
2155 delta = nr_active - this_rq->calc_load_active;
2156 this_rq->calc_load_active = nr_active;
2157 }
2158
2159 return delta;
2160}
2161
2162/*
2163 * a1 = a0 * e + a * (1 - e)
2164 */
2165static unsigned long
2166calc_load(unsigned long load, unsigned long exp, unsigned long active)
2167{
2168 load *= exp;
2169 load += active * (FIXED_1 - exp);
2170 load += 1UL << (FSHIFT - 1);
2171 return load >> FSHIFT;
2172}
2173
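calc_load() above is plain fixed-point arithmetic for the documented recurrence a1 = a0*e + a*(1-e), rounded at FSHIFT fractional bits. A self-contained sketch that reuses the formula with the usual one-minute constants; FSHIFT, FIXED_1, and EXP_1 follow the common definitions, and the nr_active samples are made up:

        /* Fixed-point one-minute load average per a1 = a0*e + a*(1-e). */
        #include <stdio.h>

        #define FSHIFT  11
        #define FIXED_1 (1UL << FSHIFT)
        #define EXP_1   1884UL          /* ~ FIXED_1 * exp(-5s/60s) */

        static unsigned long calc_load(unsigned long load, unsigned long exp,
                                       unsigned long active)
        {
                load *= exp;
                load += active * (FIXED_1 - exp);
                load += 1UL << (FSHIFT - 1);    /* round to nearest */
                return load >> FSHIFT;
        }

        int main(void)
        {
                unsigned long avenrun = 0;
                int i;

                /* Feed ten 5-second samples of 4 runnable tasks (made-up input). */
                for (i = 0; i < 10; i++)
                        avenrun = calc_load(avenrun, EXP_1, 4 * FIXED_1);
                printf("loadavg ~ %lu.%02lu\n", avenrun >> FSHIFT,
                       ((avenrun & (FIXED_1 - 1)) * 100) >> FSHIFT);
                return 0;
        }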
2174#ifdef CONFIG_NO_HZ_COMMON
2175/*
2176 * Handle NO_HZ for the global load-average.
2177 *
2178 * Since the above described distributed algorithm to compute the global
2179 * load-average relies on per-cpu sampling from the tick, it is affected by
2180 * NO_HZ.
2181 *
2182 * The basic idea is to fold the nr_active delta into a global idle-delta upon
2183 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
2184 * when we read the global state.
2185 *
2186 * Obviously reality has to ruin such a delightfully simple scheme:
2187 *
2188 * - When we go NO_HZ idle during the window, we can negate our sample
2189 * contribution, causing under-accounting.
2190 *
2191 * We avoid this by keeping two idle-delta counters and flipping them
2192 * when the window starts, thus separating old and new NO_HZ load.
2193 *
2194 * The only trick is the slight shift in index flip for read vs write.
2195 *
2196 * 0s 5s 10s 15s
2197 * +10 +10 +10 +10
2198 * |-|-----------|-|-----------|-|-----------|-|
2199 * r:0 0 1 1 0 0 1 1 0
2200 * w:0 1 1 0 0 1 1 0 0
2201 *
2202 * This ensures we'll fold the old idle contribution in this window while
 2203 * accumulating the new one.
2204 *
2205 * - When we wake up from NO_HZ idle during the window, we push up our
2206 * contribution, since we effectively move our sample point to a known
2207 * busy state.
2208 *
2209 * This is solved by pushing the window forward, and thus skipping the
2210 * sample, for this cpu (effectively using the idle-delta for this cpu which
2211 * was in effect at the time the window opened). This also solves the issue
2212 * of having to deal with a cpu having been in NOHZ idle for multiple
2213 * LOAD_FREQ intervals.
2214 *
2215 * When making the ILB scale, we should try to pull this in as well.
2216 */
2217static atomic_long_t calc_load_idle[2];
2218static int calc_load_idx;
2219
2220static inline int calc_load_write_idx(void)
2221{
2222 int idx = calc_load_idx;
2223
2224 /*
2225 * See calc_global_nohz(), if we observe the new index, we also
2226 * need to observe the new update time.
2227 */
2228 smp_rmb();
2229
2230 /*
2231 * If the folding window started, make sure we start writing in the
2232 * next idle-delta.
2233 */
2234 if (!time_before(jiffies, calc_load_update))
2235 idx++;
2236
2237 return idx & 1;
2238}
2239
2240static inline int calc_load_read_idx(void)
2241{
2242 return calc_load_idx & 1;
2243}
2244
2245void calc_load_enter_idle(void)
2246{
2247 struct rq *this_rq = this_rq();
2248 long delta;
2249
2250 /*
2251 * We're going into NOHZ mode, if there's any pending delta, fold it
2252 * into the pending idle delta.
2253 */
2254 delta = calc_load_fold_active(this_rq);
2255 if (delta) {
2256 int idx = calc_load_write_idx();
2257 atomic_long_add(delta, &calc_load_idle[idx]);
2258 }
2259}
2260
2261void calc_load_exit_idle(void)
2262{
2263 struct rq *this_rq = this_rq();
2264
2265 /*
2266 * If we're still before the sample window, we're done.
2267 */
2268 if (time_before(jiffies, this_rq->calc_load_update))
2269 return;
2270
2271 /*
2272 * We woke inside or after the sample window, this means we're already
2273 * accounted through the nohz accounting, so skip the entire deal and
2274 * sync up for the next window.
2275 */
2276 this_rq->calc_load_update = calc_load_update;
2277 if (time_before(jiffies, this_rq->calc_load_update + 10))
2278 this_rq->calc_load_update += LOAD_FREQ;
2279}
2280
2281static long calc_load_fold_idle(void)
2282{
2283 int idx = calc_load_read_idx();
2284 long delta = 0;
2285
2286 if (atomic_long_read(&calc_load_idle[idx]))
2287 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2288
2289 return delta;
2290}
2291
2292/**
2293 * fixed_power_int - compute: x^n, in O(log n) time
2294 *
2295 * @x: base of the power
2296 * @frac_bits: fractional bits of @x
2297 * @n: power to raise @x to.
2298 *
2299 * By exploiting the relation between the definition of the natural power
2300 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
2301 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
2302 * (where: n_i \elem {0, 1}, the binary vector representing n),
2303 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
2304 * of course trivially computable in O(log_2 n), the length of our binary
2305 * vector.
2306 */
2307static unsigned long
2308fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
2309{
2310 unsigned long result = 1UL << frac_bits;
2311
2312 if (n) for (;;) {
2313 if (n & 1) {
2314 result *= x;
2315 result += 1UL << (frac_bits - 1);
2316 result >>= frac_bits;
2317 }
2318 n >>= 1;
2319 if (!n)
2320 break;
2321 x *= x;
2322 x += 1UL << (frac_bits - 1);
2323 x >>= frac_bits;
2324 }
2325
2326 return result;
2327}
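
fixed_power_int() is ordinary binary exponentiation with a round-to-nearest shift after every multiply. A stand-alone sketch of the same idea, where FRAC_BITS and ONE are stand-ins for FSHIFT and FIXED_1, and the sample value 1884 is only assumed to match the kernel's EXP_1 constant:

#include <stdio.h>

#define FRAC_BITS 11
#define ONE       (1UL << FRAC_BITS)

static unsigned long fp_pow(unsigned long x, unsigned int n)
{
        unsigned long result = ONE;             /* 1.0 in fixed point */

        while (n) {
                if (n & 1)                      /* fold in x^(2^i) when bit i of n is set */
                        result = (result * x + (ONE >> 1)) >> FRAC_BITS;
                x = (x * x + (ONE >> 1)) >> FRAC_BITS;  /* square for the next bit */
                n >>= 1;
        }
        return result;
}

int main(void)
{
        /* EXP_1-like decay factor e ~= 1884/2048, raised to the 5th power */
        unsigned long e = 1884;

        printf("e^5 = %lu/%lu (~%.4f)\n", fp_pow(e, 5), ONE,
               (double)fp_pow(e, 5) / ONE);
        return 0;
}
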
2328
2329/*
2330 * a1 = a0 * e + a * (1 - e)
2331 *
2332 * a2 = a1 * e + a * (1 - e)
2333 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
2334 * = a0 * e^2 + a * (1 - e) * (1 + e)
2335 *
2336 * a3 = a2 * e + a * (1 - e)
2337 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
2338 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
2339 *
2340 * ...
2341 *
2342 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
2343 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
2344 * = a0 * e^n + a * (1 - e^n)
2345 *
2346 * [1] application of the geometric series:
2347 *
2348 * n 1 - x^(n+1)
2349 * S_n := \Sum x^i = -------------
2350 * i=0 1 - x
2351 */
2352static unsigned long
2353calc_load_n(unsigned long load, unsigned long exp,
2354 unsigned long active, unsigned int n)
2355{
2356
2357 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
2358}
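
calc_load_n() relies on the closed form derived above: applying the one-step recurrence n times is the same as one step with e replaced by e^n. A user-space sketch that checks this, where calc_load_step() is a simplified stand-in for the kernel's calc_load() (same recurrence, plain round-to-nearest) and the constants are assumed to match the kernel's FSHIFT/FIXED_1/EXP_1:

#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)
#define EXP_1   1884            /* ~1/exp(5sec/1min) in fixed point */

static unsigned long calc_load_step(unsigned long load, unsigned long exp,
                                    unsigned long active)
{
        unsigned long newload = load * exp + active * (FIXED_1 - exp);

        return (newload + (FIXED_1 >> 1)) >> FSHIFT;
}

static unsigned long fp_pow(unsigned long x, unsigned int n)
{
        unsigned long r = FIXED_1;

        for (; n; n >>= 1, x = (x * x + (FIXED_1 >> 1)) >> FSHIFT)
                if (n & 1)
                        r = (r * x + (FIXED_1 >> 1)) >> FSHIFT;
        return r;
}

int main(void)
{
        unsigned long a0 = 3 * FIXED_1, active = 1 * FIXED_1;
        unsigned int n = 7;
        unsigned long iter = a0, en = fp_pow(EXP_1, n);

        for (unsigned int i = 0; i < n; i++)
                iter = calc_load_step(iter, EXP_1, active);

        /* a_n = a0 * e^n + active * (1 - e^n), evaluated in a single step */
        unsigned long closed = calc_load_step(a0, en, active);

        printf("iterated=%lu closed-form=%lu\n", iter, closed);
        return 0;
}

The two printed values agree up to small fixed-point rounding differences, which is what lets calc_global_nohz() catch up over many missed LOAD_FREQ intervals in O(log n) multiplies per average instead of n.
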
2359
2360/*
2361 * NO_HZ can leave us missing all per-cpu ticks calling
2362 * calc_load_account_active(), but since an idle CPU folds its delta into
2363 * calc_load_idle[] via calc_load_enter_idle(), all we need to do is fold
2364 * in the pending idle delta if our idle period crossed a load cycle boundary.
2365 *
2366 * Once we've updated the global active value, we need to apply the exponential
2367 * weights adjusted to the number of cycles missed.
2368 */
2369static void calc_global_nohz(void)
2370{
2371 long delta, active, n;
2372
2373 if (!time_before(jiffies, calc_load_update + 10)) {
2374 /*
2375 * Catch up, folding in however many intervals we are still behind
2376 */
2377 delta = jiffies - calc_load_update - 10;
2378 n = 1 + (delta / LOAD_FREQ);
2379
2380 active = atomic_long_read(&calc_load_tasks);
2381 active = active > 0 ? active * FIXED_1 : 0;
2382
2383 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2384 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2385 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2386
2387 calc_load_update += n * LOAD_FREQ;
2388 }
2389
2390 /*
2391 * Flip the idle index...
2392 *
2393 * Make sure we first write the new time then flip the index, so that
2394 * calc_load_write_idx() will see the new time when it reads the new
2395 * index, this avoids a double flip messing things up.
2396 */
2397 smp_wmb();
2398 calc_load_idx++;
2399}
2400#else /* !CONFIG_NO_HZ_COMMON */
2401
2402static inline long calc_load_fold_idle(void) { return 0; }
2403static inline void calc_global_nohz(void) { }
2404
2405#endif /* CONFIG_NO_HZ_COMMON */
2406
2407/*
2408 * calc_global_load - update the avenrun load estimates 10 ticks after the
2409 * CPUs have updated calc_load_tasks.
2410 */
2411void calc_global_load(unsigned long ticks)
2412{
2413 long active, delta;
2414
2415 if (time_before(jiffies, calc_load_update + 10))
2416 return;
2417
2418 /*
2419 * Fold the 'old' idle-delta to include all NO_HZ cpus.
2420 */
2421 delta = calc_load_fold_idle();
2422 if (delta)
2423 atomic_long_add(delta, &calc_load_tasks);
2424
2425 active = atomic_long_read(&calc_load_tasks);
2426 active = active > 0 ? active * FIXED_1 : 0;
2427
2428 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2429 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2430 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2431
2432 calc_load_update += LOAD_FREQ;
2433
2434 /*
2435 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
2436 */
2437 calc_global_nohz();
2438}
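
For reference, the avenrun[] values updated here are FIXED_1-scaled fixed-point numbers; user space turns them into the familiar decimals. A small sketch of that conversion, using a LOAD_INT/LOAD_FRAC split in the style of the /proc/loadavg formatting (treat the macros and the sample value as illustrative only):

#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1UL << FSHIFT)

#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
        unsigned long avenrun0 = 1259;  /* example fixed-point 1-minute average */

        printf("loadavg: %lu.%02lu\n", LOAD_INT(avenrun0), LOAD_FRAC(avenrun0));
        return 0;                       /* prints "loadavg: 0.61" */
}
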
2439
2440/*
2441 * Called from update_cpu_load() to periodically update this CPU's
2442 * active count.
2443 */
2444static void calc_load_account_active(struct rq *this_rq)
2445{
2446 long delta;
2447
2448 if (time_before(jiffies, this_rq->calc_load_update))
2449 return;
2450
2451 delta = calc_load_fold_active(this_rq);
2452 if (delta)
2453 atomic_long_add(delta, &calc_load_tasks);
2454
2455 this_rq->calc_load_update += LOAD_FREQ;
2456}
2457
2458/*
2459 * End of global load-average stuff
2460 */
2461
2462/*
2463 * The exact cpuload at various idx values, calculated at every tick would be
2464 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2465 *
2466 * If a cpu misses updates for n-1 ticks (as it was idle) and the update gets
2467 * called on the nth tick, when the cpu may be busy, then we have:
2468 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2469 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
2470 *
2471 * decay_load_missed() below does efficient calculation of
2472 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2473 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
2474 *
2475 * The calculation is approximated on a 128 point scale.
2476 * degrade_zero_ticks is the number of ticks after which load at any
2477 * particular idx is approximated to be zero.
2478 * degrade_factor is a precomputed table, a row for each load idx.
2479 * Each column corresponds to degradation factor for a power of two ticks,
2480 * based on 128 point scale.
2481 * Example:
2482 * row 2, col 3 (=12) says that the degradation at load idx 2 after
2483 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
2484 *
2485 * With this power of 2 load factors, we can degrade the load n times
2486 * by looking at 1 bits in n and doing as many mult/shift instead of
2487 * n mult/shifts needed by the exact degradation.
2488 */
2489#define DEGRADE_SHIFT 7
2490static const unsigned char
2491 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
2492static const unsigned char
2493 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
2494 {0, 0, 0, 0, 0, 0, 0, 0},
2495 {64, 32, 8, 0, 0, 0, 0, 0},
2496 {96, 72, 40, 12, 1, 0, 0},
2497 {112, 98, 75, 43, 15, 1, 0},
2498 {120, 112, 98, 76, 45, 16, 2} };
2499
2500/*
2501 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
2502 * only builds up while the CPU is idle, so we just decay the old load without
2503 * adding any new load.
2504 */
2505static unsigned long
2506decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2507{
2508 int j = 0;
2509
2510 if (!missed_updates)
2511 return load;
2512
2513 if (missed_updates >= degrade_zero_ticks[idx])
2514 return 0;
2515
2516 if (idx == 1)
2517 return load >> missed_updates;
2518
2519 while (missed_updates) {
2520 if (missed_updates % 2)
2521 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
2522
2523 missed_updates >>= 1;
2524 j++;
2525 }
2526 return load;
2527}
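
The loop above is the "one multiply per set bit" trick described in the comment block. A user-space check of it, with the degrade_factor rows copied from the table above (zero-padded to full width) and a comparison against the exact (3/4)^missed factor included only as a rough illustration of the 128-point approximation (build with -lm for pow()):

#include <stdio.h>
#include <math.h>

#define DEGRADE_SHIFT 7

static const unsigned char degrade_factor[5][DEGRADE_SHIFT + 1] = {
        {0, 0, 0, 0, 0, 0, 0, 0},
        {64, 32, 8, 0, 0, 0, 0, 0},
        {96, 72, 40, 12, 1, 0, 0, 0},
        {112, 98, 75, 43, 15, 1, 0, 0},
        {120, 112, 98, 76, 45, 16, 2, 0},
};

static unsigned long decay(unsigned long load, unsigned long missed, int idx)
{
        /* column j holds the factor for 2^j missed ticks */
        for (int j = 0; missed; missed >>= 1, j++)
                if (missed & 1)
                        load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
        return load;
}

int main(void)
{
        unsigned long load = 1000, missed = 11; /* binary 1011: 1 + 2 + 8 ticks */
        int idx = 2;                            /* per-tick factor 3/4 */

        double exact = load * pow(3.0 / 4.0, (double)missed);

        printf("table: %lu  exact: %.1f\n", decay(load, missed, idx), exact);
        return 0;
}
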
2528
2529/*
2530 * Update rq->cpu_load[] statistics. This function is usually called every
2531 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2532 * every tick. We fix it up based on jiffies.
2533 */
2534static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2535 unsigned long pending_updates)
2536{
2537 int i, scale;
2538
2539 this_rq->nr_load_updates++;
2540
2541 /* Update our load: */
2542 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2543 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2544 unsigned long old_load, new_load;
2545
2546 /* scale is effectively 1 << i now, and >> i divides by scale */
2547
2548 old_load = this_rq->cpu_load[i];
2549 old_load = decay_load_missed(old_load, pending_updates - 1, i);
2550 new_load = this_load;
2551 /*
2552 * Round up the averaging division if load is increasing. This
2553 * prevents us from getting stuck on 9 if the load is 10, for
2554 * example.
2555 */
2556 if (new_load > old_load)
2557 new_load += scale - 1;
2558
2559 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
2560 }
2561
2562 sched_avg_update(this_rq);
2563}
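
A tiny illustration of the round-up noted in the comment: averaging 9 and 10 with a plain shift truncates back to 9 on every tick, while adding scale - 1 first lets the average reach 10. The numbers are made up for the example:

#include <stdio.h>

int main(void)
{
        unsigned long old_load = 9, new_load = 10;
        int i = 1, scale = 1 << i;

        unsigned long trunc   = (old_load * (scale - 1) + new_load) >> i;
        unsigned long rounded = (old_load * (scale - 1) + new_load + scale - 1) >> i;

        /* (9 + 10) / 2 truncates to 9; rounding up reaches 10 */
        printf("truncated: %lu  rounded up: %lu\n", trunc, rounded);
        return 0;
}
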
2564
2565#ifdef CONFIG_NO_HZ_COMMON
2566/*
2567 * There is no sane way to deal with nohz on smp when using jiffies because the
2568 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
2569 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
2570 *
2571 * Therefore we cannot use the delta approach from the regular tick since that
2572 * would seriously skew the load calculation. However we'll make do for those
2573 * updates happening while idle (nohz_idle_balance) or coming out of idle
2574 * (tick_nohz_idle_exit).
2575 *
2576 * This means we might still be one tick off for nohz periods.
2577 */
2578
2579/*
2580 * Called from nohz_idle_balance() to update the load ratings before doing the
2581 * idle balance.
2582 */
2583void update_idle_cpu_load(struct rq *this_rq)
2584{
2585 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2586 unsigned long load = this_rq->load.weight;
2587 unsigned long pending_updates;
2588
2589 /*
2590 * bail if there's load or we're actually up-to-date.
2591 */
2592 if (load || curr_jiffies == this_rq->last_load_update_tick)
2593 return;
2594
2595 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2596 this_rq->last_load_update_tick = curr_jiffies;
2597
2598 __update_cpu_load(this_rq, load, pending_updates);
2599}
2600
2601/*
2602 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
2603 */
2604void update_cpu_load_nohz(void)
2605{
2606 struct rq *this_rq = this_rq();
2607 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2608 unsigned long pending_updates;
2609
2610 if (curr_jiffies == this_rq->last_load_update_tick)
2611 return;
2612
2613 raw_spin_lock(&this_rq->lock);
2614 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2615 if (pending_updates) {
2616 this_rq->last_load_update_tick = curr_jiffies;
2617 /*
2618 * We were idle, this means load 0, the current load might be
2619 * !0 due to remote wakeups and the sort.
2620 */
2621 __update_cpu_load(this_rq, 0, pending_updates);
2622 }
2623 raw_spin_unlock(&this_rq->lock);
2624}
2625#endif /* CONFIG_NO_HZ_COMMON */
2626
2627/*
2628 * Called from scheduler_tick()
2629 */
2630static void update_cpu_load_active(struct rq *this_rq)
2631{
2632 /*
2633 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
2634 */
2635 this_rq->last_load_update_tick = jiffies;
2636 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2637
2638 calc_load_account_active(this_rq);
2639}
2640
2641#ifdef CONFIG_SMP 2078#ifdef CONFIG_SMP
2642 2079
2643/* 2080/*
@@ -2686,7 +2123,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2686 2123
2687 if (task_current(rq, p)) { 2124 if (task_current(rq, p)) {
2688 update_rq_clock(rq); 2125 update_rq_clock(rq);
2689 ns = rq->clock_task - p->se.exec_start; 2126 ns = rq_clock_task(rq) - p->se.exec_start;
2690 if ((s64)ns < 0) 2127 if ((s64)ns < 0)
2691 ns = 0; 2128 ns = 0;
2692 } 2129 }
@@ -2739,8 +2176,8 @@ void scheduler_tick(void)
2739 2176
2740 raw_spin_lock(&rq->lock); 2177 raw_spin_lock(&rq->lock);
2741 update_rq_clock(rq); 2178 update_rq_clock(rq);
2742 update_cpu_load_active(rq);
2743 curr->sched_class->task_tick(rq, curr, 0); 2179 curr->sched_class->task_tick(rq, curr, 0);
2180 update_cpu_load_active(rq);
2744 raw_spin_unlock(&rq->lock); 2181 raw_spin_unlock(&rq->lock);
2745 2182
2746 perf_event_task_tick(); 2183 perf_event_task_tick();
@@ -2763,6 +2200,8 @@ void scheduler_tick(void)
2763 * This makes sure that uptime, CFS vruntime, load 2200 * This makes sure that uptime, CFS vruntime, load
2764 * balancing, etc... continue to move forward, even 2201 * balancing, etc... continue to move forward, even
2765 * with a very low granularity. 2202 * with a very low granularity.
2203 *
2204 * Return: Maximum deferment in nanoseconds.
2766 */ 2205 */
2767u64 scheduler_tick_max_deferment(void) 2206u64 scheduler_tick_max_deferment(void)
2768{ 2207{
@@ -2966,6 +2405,12 @@ need_resched:
2966 if (sched_feat(HRTICK)) 2405 if (sched_feat(HRTICK))
2967 hrtick_clear(rq); 2406 hrtick_clear(rq);
2968 2407
2408 /*
2409 * Make sure that signal_pending_state()->signal_pending() below
2410 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
2411 * done by the caller to avoid the race with signal_wake_up().
2412 */
2413 smp_mb__before_spinlock();
2969 raw_spin_lock_irq(&rq->lock); 2414 raw_spin_lock_irq(&rq->lock);
2970 2415
2971 switch_count = &prev->nivcsw; 2416 switch_count = &prev->nivcsw;
@@ -3368,8 +2813,8 @@ EXPORT_SYMBOL(wait_for_completion);
3368 * specified timeout to expire. The timeout is in jiffies. It is not 2813 * specified timeout to expire. The timeout is in jiffies. It is not
3369 * interruptible. 2814 * interruptible.
3370 * 2815 *
3371 * The return value is 0 if timed out, and positive (at least 1, or number of 2816 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
3372 * jiffies left till timeout) if completed. 2817 * till timeout) if completed.
3373 */ 2818 */
3374unsigned long __sched 2819unsigned long __sched
3375wait_for_completion_timeout(struct completion *x, unsigned long timeout) 2820wait_for_completion_timeout(struct completion *x, unsigned long timeout)
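
The reworded kernel-doc above is the whole contract of wait_for_completion_timeout(): 0 means the timeout expired, a positive value is the number of jiffies that were left. A minimal kernel-style sketch of how callers typically consume that (my_done, MY_TIMEOUT_MS and wait_for_device() are made-up names, and the fragment only builds inside a kernel tree):

#include <linux/completion.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

#define MY_TIMEOUT_MS 100

static DECLARE_COMPLETION(my_done);

static int wait_for_device(void)
{
        unsigned long left;

        left = wait_for_completion_timeout(&my_done,
                                           msecs_to_jiffies(MY_TIMEOUT_MS));
        if (!left)
                return -ETIMEDOUT;      /* 0: the timeout expired */

        /* > 0: completed, with 'left' jiffies of the timeout remaining */
        return 0;
}
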
@@ -3401,8 +2846,8 @@ EXPORT_SYMBOL(wait_for_completion_io);
3401 * specified timeout to expire. The timeout is in jiffies. It is not 2846 * specified timeout to expire. The timeout is in jiffies. It is not
3402 * interruptible. The caller is accounted as waiting for IO. 2847 * interruptible. The caller is accounted as waiting for IO.
3403 * 2848 *
3404 * The return value is 0 if timed out, and positive (at least 1, or number of 2849 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
3405 * jiffies left till timeout) if completed. 2850 * till timeout) if completed.
3406 */ 2851 */
3407unsigned long __sched 2852unsigned long __sched
3408wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) 2853wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
@@ -3418,7 +2863,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout);
3418 * This waits for completion of a specific task to be signaled. It is 2863 * This waits for completion of a specific task to be signaled. It is
3419 * interruptible. 2864 * interruptible.
3420 * 2865 *
3421 * The return value is -ERESTARTSYS if interrupted, 0 if completed. 2866 * Return: -ERESTARTSYS if interrupted, 0 if completed.
3422 */ 2867 */
3423int __sched wait_for_completion_interruptible(struct completion *x) 2868int __sched wait_for_completion_interruptible(struct completion *x)
3424{ 2869{
@@ -3437,8 +2882,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
3437 * This waits for either a completion of a specific task to be signaled or for a 2882 * This waits for either a completion of a specific task to be signaled or for a
3438 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 2883 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
3439 * 2884 *
3440 * The return value is -ERESTARTSYS if interrupted, 0 if timed out, 2885 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
3441 * positive (at least 1, or number of jiffies left till timeout) if completed. 2886 * or number of jiffies left till timeout) if completed.
3442 */ 2887 */
3443long __sched 2888long __sched
3444wait_for_completion_interruptible_timeout(struct completion *x, 2889wait_for_completion_interruptible_timeout(struct completion *x,
@@ -3455,7 +2900,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3455 * This waits to be signaled for completion of a specific task. It can be 2900 * This waits to be signaled for completion of a specific task. It can be
3456 * interrupted by a kill signal. 2901 * interrupted by a kill signal.
3457 * 2902 *
3458 * The return value is -ERESTARTSYS if interrupted, 0 if completed. 2903 * Return: -ERESTARTSYS if interrupted, 0 if completed.
3459 */ 2904 */
3460int __sched wait_for_completion_killable(struct completion *x) 2905int __sched wait_for_completion_killable(struct completion *x)
3461{ 2906{
@@ -3475,8 +2920,8 @@ EXPORT_SYMBOL(wait_for_completion_killable);
3475 * signaled or for a specified timeout to expire. It can be 2920 * signaled or for a specified timeout to expire. It can be
3476 * interrupted by a kill signal. The timeout is in jiffies. 2921 * interrupted by a kill signal. The timeout is in jiffies.
3477 * 2922 *
3478 * The return value is -ERESTARTSYS if interrupted, 0 if timed out, 2923 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
3479 * positive (at least 1, or number of jiffies left till timeout) if completed. 2924 * or number of jiffies left till timeout) if completed.
3480 */ 2925 */
3481long __sched 2926long __sched
3482wait_for_completion_killable_timeout(struct completion *x, 2927wait_for_completion_killable_timeout(struct completion *x,
@@ -3490,7 +2935,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout);
3490 * try_wait_for_completion - try to decrement a completion without blocking 2935 * try_wait_for_completion - try to decrement a completion without blocking
3491 * @x: completion structure 2936 * @x: completion structure
3492 * 2937 *
3493 * Returns: 0 if a decrement cannot be done without blocking 2938 * Return: 0 if a decrement cannot be done without blocking
3494 * 1 if a decrement succeeded. 2939 * 1 if a decrement succeeded.
3495 * 2940 *
3496 * If a completion is being used as a counting completion, 2941 * If a completion is being used as a counting completion,
@@ -3517,7 +2962,7 @@ EXPORT_SYMBOL(try_wait_for_completion);
3517 * completion_done - Test to see if a completion has any waiters 2962 * completion_done - Test to see if a completion has any waiters
3518 * @x: completion structure 2963 * @x: completion structure
3519 * 2964 *
3520 * Returns: 0 if there are waiters (wait_for_completion() in progress) 2965 * Return: 0 if there are waiters (wait_for_completion() in progress)
3521 * 1 if there are no waiters. 2966 * 1 if there are no waiters.
3522 * 2967 *
3523 */ 2968 */
@@ -3754,7 +3199,7 @@ SYSCALL_DEFINE1(nice, int, increment)
3754 * task_prio - return the priority value of a given task. 3199 * task_prio - return the priority value of a given task.
3755 * @p: the task in question. 3200 * @p: the task in question.
3756 * 3201 *
3757 * This is the priority value as seen by users in /proc. 3202 * Return: The priority value as seen by users in /proc.
3758 * RT tasks are offset by -200. Normal tasks are centered 3203 * RT tasks are offset by -200. Normal tasks are centered
3759 * around 0, value goes from -16 to +15. 3204 * around 0, value goes from -16 to +15.
3760 */ 3205 */
@@ -3766,6 +3211,8 @@ int task_prio(const struct task_struct *p)
3766/** 3211/**
3767 * task_nice - return the nice value of a given task. 3212 * task_nice - return the nice value of a given task.
3768 * @p: the task in question. 3213 * @p: the task in question.
3214 *
3215 * Return: The nice value [ -20 ... 0 ... 19 ].
3769 */ 3216 */
3770int task_nice(const struct task_struct *p) 3217int task_nice(const struct task_struct *p)
3771{ 3218{
@@ -3776,6 +3223,8 @@ EXPORT_SYMBOL(task_nice);
3776/** 3223/**
3777 * idle_cpu - is a given cpu idle currently? 3224 * idle_cpu - is a given cpu idle currently?
3778 * @cpu: the processor in question. 3225 * @cpu: the processor in question.
3226 *
3227 * Return: 1 if the CPU is currently idle. 0 otherwise.
3779 */ 3228 */
3780int idle_cpu(int cpu) 3229int idle_cpu(int cpu)
3781{ 3230{
@@ -3798,6 +3247,8 @@ int idle_cpu(int cpu)
3798/** 3247/**
3799 * idle_task - return the idle task for a given cpu. 3248 * idle_task - return the idle task for a given cpu.
3800 * @cpu: the processor in question. 3249 * @cpu: the processor in question.
3250 *
3251 * Return: The idle task for the cpu @cpu.
3801 */ 3252 */
3802struct task_struct *idle_task(int cpu) 3253struct task_struct *idle_task(int cpu)
3803{ 3254{
@@ -3807,6 +3258,8 @@ struct task_struct *idle_task(int cpu)
3807/** 3258/**
3808 * find_process_by_pid - find a process with a matching PID value. 3259 * find_process_by_pid - find a process with a matching PID value.
3809 * @pid: the pid in question. 3260 * @pid: the pid in question.
3261 *
3262 * The task of @pid, if found. %NULL otherwise.
3810 */ 3263 */
3811static struct task_struct *find_process_by_pid(pid_t pid) 3264static struct task_struct *find_process_by_pid(pid_t pid)
3812{ 3265{
@@ -4004,6 +3457,8 @@ recheck:
4004 * @policy: new policy. 3457 * @policy: new policy.
4005 * @param: structure containing the new RT priority. 3458 * @param: structure containing the new RT priority.
4006 * 3459 *
3460 * Return: 0 on success. An error code otherwise.
3461 *
4007 * NOTE that the task may be already dead. 3462 * NOTE that the task may be already dead.
4008 */ 3463 */
4009int sched_setscheduler(struct task_struct *p, int policy, 3464int sched_setscheduler(struct task_struct *p, int policy,
@@ -4023,6 +3478,8 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
4023 * current context has permission. For example, this is needed in 3478 * current context has permission. For example, this is needed in
4024 * stop_machine(): we create temporary high priority worker threads, 3479 * stop_machine(): we create temporary high priority worker threads,
4025 * but our caller might not have that capability. 3480 * but our caller might not have that capability.
3481 *
3482 * Return: 0 on success. An error code otherwise.
4026 */ 3483 */
4027int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3484int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4028 const struct sched_param *param) 3485 const struct sched_param *param)
@@ -4057,6 +3514,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4057 * @pid: the pid in question. 3514 * @pid: the pid in question.
4058 * @policy: new policy. 3515 * @policy: new policy.
4059 * @param: structure containing the new RT priority. 3516 * @param: structure containing the new RT priority.
3517 *
3518 * Return: 0 on success. An error code otherwise.
4060 */ 3519 */
4061SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 3520SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4062 struct sched_param __user *, param) 3521 struct sched_param __user *, param)
@@ -4072,6 +3531,8 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4072 * sys_sched_setparam - set/change the RT priority of a thread 3531 * sys_sched_setparam - set/change the RT priority of a thread
4073 * @pid: the pid in question. 3532 * @pid: the pid in question.
4074 * @param: structure containing the new RT priority. 3533 * @param: structure containing the new RT priority.
3534 *
3535 * Return: 0 on success. An error code otherwise.
4075 */ 3536 */
4076SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3537SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4077{ 3538{
@@ -4081,6 +3542,9 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4081/** 3542/**
4082 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3543 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4083 * @pid: the pid in question. 3544 * @pid: the pid in question.
3545 *
3546 * Return: On success, the policy of the thread. Otherwise, a negative error
3547 * code.
4084 */ 3548 */
4085SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 3549SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4086{ 3550{
@@ -4107,6 +3571,9 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4107 * sys_sched_getparam - get the RT priority of a thread 3571 * sys_sched_getparam - get the RT priority of a thread
4108 * @pid: the pid in question. 3572 * @pid: the pid in question.
4109 * @param: structure containing the RT priority. 3573 * @param: structure containing the RT priority.
3574 *
3575 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
3576 * code.
4110 */ 3577 */
4111SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 3578SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4112{ 3579{
@@ -4231,6 +3698,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4231 * @pid: pid of the process 3698 * @pid: pid of the process
4232 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3699 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4233 * @user_mask_ptr: user-space pointer to the new cpu mask 3700 * @user_mask_ptr: user-space pointer to the new cpu mask
3701 *
3702 * Return: 0 on success. An error code otherwise.
4234 */ 3703 */
4235SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 3704SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4236 unsigned long __user *, user_mask_ptr) 3705 unsigned long __user *, user_mask_ptr)
@@ -4282,6 +3751,8 @@ out_unlock:
4282 * @pid: pid of the process 3751 * @pid: pid of the process
4283 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3752 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4284 * @user_mask_ptr: user-space pointer to hold the current cpu mask 3753 * @user_mask_ptr: user-space pointer to hold the current cpu mask
3754 *
3755 * Return: 0 on success. An error code otherwise.
4285 */ 3756 */
4286SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 3757SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4287 unsigned long __user *, user_mask_ptr) 3758 unsigned long __user *, user_mask_ptr)
@@ -4316,6 +3787,8 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4316 * 3787 *
4317 * This function yields the current CPU to other tasks. If there are no 3788 * This function yields the current CPU to other tasks. If there are no
4318 * other threads running on this CPU then this function will return. 3789 * other threads running on this CPU then this function will return.
3790 *
3791 * Return: 0.
4319 */ 3792 */
4320SYSCALL_DEFINE0(sched_yield) 3793SYSCALL_DEFINE0(sched_yield)
4321{ 3794{
@@ -4441,7 +3914,7 @@ EXPORT_SYMBOL(yield);
4441 * It's the caller's job to ensure that the target task struct 3914 * It's the caller's job to ensure that the target task struct
4442 * can't go away on us before we can do any checks. 3915 * can't go away on us before we can do any checks.
4443 * 3916 *
4444 * Returns: 3917 * Return:
4445 * true (>0) if we indeed boosted the target task. 3918 * true (>0) if we indeed boosted the target task.
4446 * false (0) if we failed to boost the target. 3919 * false (0) if we failed to boost the target.
4447 * -ESRCH if there's no task to yield to. 3920 * -ESRCH if there's no task to yield to.
@@ -4544,8 +4017,9 @@ long __sched io_schedule_timeout(long timeout)
4544 * sys_sched_get_priority_max - return maximum RT priority. 4017 * sys_sched_get_priority_max - return maximum RT priority.
4545 * @policy: scheduling class. 4018 * @policy: scheduling class.
4546 * 4019 *
4547 * this syscall returns the maximum rt_priority that can be used 4020 * Return: On success, this syscall returns the maximum
4548 * by a given scheduling class. 4021 * rt_priority that can be used by a given scheduling class.
4022 * On failure, a negative error code is returned.
4549 */ 4023 */
4550SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 4024SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4551{ 4025{
@@ -4569,8 +4043,9 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4569 * sys_sched_get_priority_min - return minimum RT priority. 4043 * sys_sched_get_priority_min - return minimum RT priority.
4570 * @policy: scheduling class. 4044 * @policy: scheduling class.
4571 * 4045 *
4572 * this syscall returns the minimum rt_priority that can be used 4046 * Return: On success, this syscall returns the minimum
4573 * by a given scheduling class. 4047 * rt_priority that can be used by a given scheduling class.
4048 * On failure, a negative error code is returned.
4574 */ 4049 */
4575SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 4050SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4576{ 4051{
@@ -4596,6 +4071,9 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4596 * 4071 *
4597 * this syscall writes the default timeslice value of a given process 4072 * this syscall writes the default timeslice value of a given process
4598 * into the user-space timespec buffer. A value of '0' means infinity. 4073 * into the user-space timespec buffer. A value of '0' means infinity.
4074 *
4075 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
4076 * an error code.
4599 */ 4077 */
4600SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 4078SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4601 struct timespec __user *, interval) 4079 struct timespec __user *, interval)
@@ -4705,7 +4183,7 @@ void show_state_filter(unsigned long state_filter)
4705 debug_show_all_locks(); 4183 debug_show_all_locks();
4706} 4184}
4707 4185
4708void __cpuinit init_idle_bootup_task(struct task_struct *idle) 4186void init_idle_bootup_task(struct task_struct *idle)
4709{ 4187{
4710 idle->sched_class = &idle_sched_class; 4188 idle->sched_class = &idle_sched_class;
4711} 4189}
@@ -4718,7 +4196,7 @@ void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4718 * NOTE: this function does not set the idle thread's NEED_RESCHED 4196 * NOTE: this function does not set the idle thread's NEED_RESCHED
4719 * flag, to make booting more robust. 4197 * flag, to make booting more robust.
4720 */ 4198 */
4721void __cpuinit init_idle(struct task_struct *idle, int cpu) 4199void init_idle(struct task_struct *idle, int cpu)
4722{ 4200{
4723 struct rq *rq = cpu_rq(cpu); 4201 struct rq *rq = cpu_rq(cpu);
4724 unsigned long flags; 4202 unsigned long flags;
@@ -4960,6 +4438,13 @@ static void migrate_tasks(unsigned int dead_cpu)
4960 */ 4438 */
4961 rq->stop = NULL; 4439 rq->stop = NULL;
4962 4440
4441 /*
4442 * put_prev_task() and pick_next_task() sched
4443 * class method both need to have an up-to-date
4444 * value of rq->clock[_task]
4445 */
4446 update_rq_clock(rq);
4447
4963 for ( ; ; ) { 4448 for ( ; ; ) {
4964 /* 4449 /*
4965 * There's this thread running, bail when that's the only 4450 * There's this thread running, bail when that's the only
@@ -5093,7 +4578,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
5093 return table; 4578 return table;
5094} 4579}
5095 4580
5096static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 4581static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5097{ 4582{
5098 struct ctl_table *entry, *table; 4583 struct ctl_table *entry, *table;
5099 struct sched_domain *sd; 4584 struct sched_domain *sd;
@@ -5195,7 +4680,7 @@ static void set_rq_offline(struct rq *rq)
5195 * migration_call - callback that gets triggered when a CPU is added. 4680 * migration_call - callback that gets triggered when a CPU is added.
5196 * Here we can start up the necessary migration thread for the new CPU. 4681 * Here we can start up the necessary migration thread for the new CPU.
5197 */ 4682 */
5198static int __cpuinit 4683static int
5199migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 4684migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5200{ 4685{
5201 int cpu = (long)hcpu; 4686 int cpu = (long)hcpu;
@@ -5249,12 +4734,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5249 * happens before everything else. This has to be lower priority than 4734 * happens before everything else. This has to be lower priority than
5250 * the notifier in the perf_event subsystem, though. 4735 * the notifier in the perf_event subsystem, though.
5251 */ 4736 */
5252static struct notifier_block __cpuinitdata migration_notifier = { 4737static struct notifier_block migration_notifier = {
5253 .notifier_call = migration_call, 4738 .notifier_call = migration_call,
5254 .priority = CPU_PRI_MIGRATION, 4739 .priority = CPU_PRI_MIGRATION,
5255}; 4740};
5256 4741
5257static int __cpuinit sched_cpu_active(struct notifier_block *nfb, 4742static int sched_cpu_active(struct notifier_block *nfb,
5258 unsigned long action, void *hcpu) 4743 unsigned long action, void *hcpu)
5259{ 4744{
5260 switch (action & ~CPU_TASKS_FROZEN) { 4745 switch (action & ~CPU_TASKS_FROZEN) {
@@ -5267,7 +4752,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5267 } 4752 }
5268} 4753}
5269 4754
5270static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, 4755static int sched_cpu_inactive(struct notifier_block *nfb,
5271 unsigned long action, void *hcpu) 4756 unsigned long action, void *hcpu)
5272{ 4757{
5273 switch (action & ~CPU_TASKS_FROZEN) { 4758 switch (action & ~CPU_TASKS_FROZEN) {
@@ -5907,7 +5392,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
5907 get_group(cpu, sdd, &sd->groups); 5392 get_group(cpu, sdd, &sd->groups);
5908 atomic_inc(&sd->groups->ref); 5393 atomic_inc(&sd->groups->ref);
5909 5394
5910 if (cpu != cpumask_first(sched_domain_span(sd))) 5395 if (cpu != cpumask_first(span))
5911 return 0; 5396 return 0;
5912 5397
5913 lockdep_assert_held(&sched_domains_mutex); 5398 lockdep_assert_held(&sched_domains_mutex);
@@ -5917,12 +5402,12 @@ build_sched_groups(struct sched_domain *sd, int cpu)
5917 5402
5918 for_each_cpu(i, span) { 5403 for_each_cpu(i, span) {
5919 struct sched_group *sg; 5404 struct sched_group *sg;
5920 int group = get_group(i, sdd, &sg); 5405 int group, j;
5921 int j;
5922 5406
5923 if (cpumask_test_cpu(i, covered)) 5407 if (cpumask_test_cpu(i, covered))
5924 continue; 5408 continue;
5925 5409
5410 group = get_group(i, sdd, &sg);
5926 cpumask_clear(sched_group_cpus(sg)); 5411 cpumask_clear(sched_group_cpus(sg));
5927 sg->sgp->power = 0; 5412 sg->sgp->power = 0;
5928 cpumask_setall(sched_group_mask(sg)); 5413 cpumask_setall(sched_group_mask(sg));
@@ -5960,7 +5445,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5960{ 5445{
5961 struct sched_group *sg = sd->groups; 5446 struct sched_group *sg = sd->groups;
5962 5447
5963 WARN_ON(!sd || !sg); 5448 WARN_ON(!sg);
5964 5449
5965 do { 5450 do {
5966 sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 5451 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
@@ -6125,6 +5610,9 @@ static struct sched_domain_topology_level default_topology[] = {
6125 5610
6126static struct sched_domain_topology_level *sched_domain_topology = default_topology; 5611static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6127 5612
5613#define for_each_sd_topology(tl) \
5614 for (tl = sched_domain_topology; tl->init; tl++)
5615
6128#ifdef CONFIG_NUMA 5616#ifdef CONFIG_NUMA
6129 5617
6130static int sched_domains_numa_levels; 5618static int sched_domains_numa_levels;
@@ -6422,7 +5910,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6422 struct sched_domain_topology_level *tl; 5910 struct sched_domain_topology_level *tl;
6423 int j; 5911 int j;
6424 5912
6425 for (tl = sched_domain_topology; tl->init; tl++) { 5913 for_each_sd_topology(tl) {
6426 struct sd_data *sdd = &tl->data; 5914 struct sd_data *sdd = &tl->data;
6427 5915
6428 sdd->sd = alloc_percpu(struct sched_domain *); 5916 sdd->sd = alloc_percpu(struct sched_domain *);
@@ -6475,7 +5963,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
6475 struct sched_domain_topology_level *tl; 5963 struct sched_domain_topology_level *tl;
6476 int j; 5964 int j;
6477 5965
6478 for (tl = sched_domain_topology; tl->init; tl++) { 5966 for_each_sd_topology(tl) {
6479 struct sd_data *sdd = &tl->data; 5967 struct sd_data *sdd = &tl->data;
6480 5968
6481 for_each_cpu(j, cpu_map) { 5969 for_each_cpu(j, cpu_map) {
@@ -6503,9 +5991,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
6503} 5991}
6504 5992
6505struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 5993struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6506 struct s_data *d, const struct cpumask *cpu_map, 5994 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6507 struct sched_domain_attr *attr, struct sched_domain *child, 5995 struct sched_domain *child, int cpu)
6508 int cpu)
6509{ 5996{
6510 struct sched_domain *sd = tl->init(tl, cpu); 5997 struct sched_domain *sd = tl->init(tl, cpu);
6511 if (!sd) 5998 if (!sd)
@@ -6516,8 +6003,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6516 sd->level = child->level + 1; 6003 sd->level = child->level + 1;
6517 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6004 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6518 child->parent = sd; 6005 child->parent = sd;
6006 sd->child = child;
6519 } 6007 }
6520 sd->child = child;
6521 set_domain_attribute(sd, attr); 6008 set_domain_attribute(sd, attr);
6522 6009
6523 return sd; 6010 return sd;
@@ -6530,7 +6017,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6530static int build_sched_domains(const struct cpumask *cpu_map, 6017static int build_sched_domains(const struct cpumask *cpu_map,
6531 struct sched_domain_attr *attr) 6018 struct sched_domain_attr *attr)
6532{ 6019{
6533 enum s_alloc alloc_state = sa_none; 6020 enum s_alloc alloc_state;
6534 struct sched_domain *sd; 6021 struct sched_domain *sd;
6535 struct s_data d; 6022 struct s_data d;
6536 int i, ret = -ENOMEM; 6023 int i, ret = -ENOMEM;
@@ -6544,18 +6031,15 @@ static int build_sched_domains(const struct cpumask *cpu_map,
6544 struct sched_domain_topology_level *tl; 6031 struct sched_domain_topology_level *tl;
6545 6032
6546 sd = NULL; 6033 sd = NULL;
6547 for (tl = sched_domain_topology; tl->init; tl++) { 6034 for_each_sd_topology(tl) {
6548 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); 6035 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
6036 if (tl == sched_domain_topology)
6037 *per_cpu_ptr(d.sd, i) = sd;
6549 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) 6038 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6550 sd->flags |= SD_OVERLAP; 6039 sd->flags |= SD_OVERLAP;
6551 if (cpumask_equal(cpu_map, sched_domain_span(sd))) 6040 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6552 break; 6041 break;
6553 } 6042 }
6554
6555 while (sd->child)
6556 sd = sd->child;
6557
6558 *per_cpu_ptr(d.sd, i) = sd;
6559 } 6043 }
6560 6044
6561 /* Build the groups for the domains */ 6045 /* Build the groups for the domains */
@@ -6867,9 +6351,6 @@ void __init sched_init_smp(void)
6867 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6351 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6868 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 6352 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6869 6353
6870 /* RT runtime code needs to handle some hotplug events */
6871 hotcpu_notifier(update_runtime, 0);
6872
6873 init_hrtick(); 6354 init_hrtick();
6874 6355
6875 /* Move init over to a non-isolated CPU */ 6356 /* Move init over to a non-isolated CPU */
@@ -7201,6 +6682,8 @@ void normalize_rt_tasks(void)
7201 * @cpu: the processor in question. 6682 * @cpu: the processor in question.
7202 * 6683 *
7203 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6684 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6685 *
6686 * Return: The current task for @cpu.
7204 */ 6687 */
7205struct task_struct *curr_task(int cpu) 6688struct task_struct *curr_task(int cpu)
7206{ 6689{
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 1095e878a46f..8b836b376d91 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -62,7 +62,7 @@ static int convert_prio(int prio)
62 * any discrepancies created by racing against the uncertainty of the current 62 * any discrepancies created by racing against the uncertainty of the current
63 * priority configuration. 63 * priority configuration.
64 * 64 *
65 * Returns: (int)bool - CPUs were found 65 * Return: (int)bool - CPUs were found
66 */ 66 */
67int cpupri_find(struct cpupri *cp, struct task_struct *p, 67int cpupri_find(struct cpupri *cp, struct task_struct *p,
68 struct cpumask *lowest_mask) 68 struct cpumask *lowest_mask)
@@ -203,7 +203,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
203 * cpupri_init - initialize the cpupri structure 203 * cpupri_init - initialize the cpupri structure
204 * @cp: The cpupri context 204 * @cp: The cpupri context
205 * 205 *
206 * Returns: -ENOMEM if memory fails. 206 * Return: -ENOMEM on memory allocation failure.
207 */ 207 */
208int cpupri_init(struct cpupri *cp) 208int cpupri_init(struct cpupri *cp)
209{ 209{
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index b5ccba22603b..a7959e05a9d5 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -515,9 +515,8 @@ static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
515 515
516 for (;;) { 516 for (;;) {
517 /* Make sure "rtime" is the bigger of stime/rtime */ 517 /* Make sure "rtime" is the bigger of stime/rtime */
518 if (stime > rtime) { 518 if (stime > rtime)
519 u64 tmp = rtime; rtime = stime; stime = tmp; 519 swap(rtime, stime);
520 }
521 520
522 /* Make sure 'total' fits in 32 bits */ 521 /* Make sure 'total' fits in 32 bits */
523 if (total >> 32) 522 if (total >> 32)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 75024a673520..e076bddd4c66 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -209,22 +209,24 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
209 cfs_rq->nr_spread_over); 209 cfs_rq->nr_spread_over);
210 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); 210 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
211 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 211 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
212#ifdef CONFIG_FAIR_GROUP_SCHED
213#ifdef CONFIG_SMP 212#ifdef CONFIG_SMP
214 SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", 213 SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg",
215 cfs_rq->runnable_load_avg); 214 cfs_rq->runnable_load_avg);
216 SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", 215 SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
217 cfs_rq->blocked_load_avg); 216 cfs_rq->blocked_load_avg);
218 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg", 217#ifdef CONFIG_FAIR_GROUP_SCHED
219 (unsigned long long)atomic64_read(&cfs_rq->tg->load_avg)); 218 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib",
220 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
221 cfs_rq->tg_load_contrib); 219 cfs_rq->tg_load_contrib);
222 SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", 220 SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
223 cfs_rq->tg_runnable_contrib); 221 cfs_rq->tg_runnable_contrib);
222 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
223 atomic_long_read(&cfs_rq->tg->load_avg));
224 SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", 224 SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
225 atomic_read(&cfs_rq->tg->runnable_avg)); 225 atomic_read(&cfs_rq->tg->runnable_avg));
226#endif 226#endif
227#endif
227 228
229#ifdef CONFIG_FAIR_GROUP_SCHED
228 print_cfs_group_stats(m, cpu, cfs_rq->tg); 230 print_cfs_group_stats(m, cpu, cfs_rq->tg);
229#endif 231#endif
230} 232}
@@ -493,15 +495,16 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
493 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, 495 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
494 get_nr_threads(p)); 496 get_nr_threads(p));
495 SEQ_printf(m, 497 SEQ_printf(m,
496 "---------------------------------------------------------\n"); 498 "---------------------------------------------------------"
499 "----------\n");
497#define __P(F) \ 500#define __P(F) \
498 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) 501 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
499#define P(F) \ 502#define P(F) \
500 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) 503 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
501#define __PN(F) \ 504#define __PN(F) \
502 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) 505 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
503#define PN(F) \ 506#define PN(F) \
504 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) 507 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
505 508
506 PN(se.exec_start); 509 PN(se.exec_start);
507 PN(se.vruntime); 510 PN(se.vruntime);
@@ -560,12 +563,18 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
560 } 563 }
561#endif 564#endif
562 __P(nr_switches); 565 __P(nr_switches);
563 SEQ_printf(m, "%-35s:%21Ld\n", 566 SEQ_printf(m, "%-45s:%21Ld\n",
564 "nr_voluntary_switches", (long long)p->nvcsw); 567 "nr_voluntary_switches", (long long)p->nvcsw);
565 SEQ_printf(m, "%-35s:%21Ld\n", 568 SEQ_printf(m, "%-45s:%21Ld\n",
566 "nr_involuntary_switches", (long long)p->nivcsw); 569 "nr_involuntary_switches", (long long)p->nivcsw);
567 570
568 P(se.load.weight); 571 P(se.load.weight);
572#ifdef CONFIG_SMP
573 P(se.avg.runnable_avg_sum);
574 P(se.avg.runnable_avg_period);
575 P(se.avg.load_avg_contrib);
576 P(se.avg.decay_count);
577#endif
569 P(policy); 578 P(policy);
570 P(prio); 579 P(prio);
571#undef PN 580#undef PN
@@ -579,7 +588,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
579 588
580 t0 = cpu_clock(this_cpu); 589 t0 = cpu_clock(this_cpu);
581 t1 = cpu_clock(this_cpu); 590 t1 = cpu_clock(this_cpu);
582 SEQ_printf(m, "%-35s:%21Ld\n", 591 SEQ_printf(m, "%-45s:%21Ld\n",
583 "clock-delta", (long long)(t1-t0)); 592 "clock-delta", (long long)(t1-t0));
584 } 593 }
585} 594}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c61a614465c8..68f1609ca149 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -113,6 +113,24 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
113unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; 113unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
114#endif 114#endif
115 115
116static inline void update_load_add(struct load_weight *lw, unsigned long inc)
117{
118 lw->weight += inc;
119 lw->inv_weight = 0;
120}
121
122static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
123{
124 lw->weight -= dec;
125 lw->inv_weight = 0;
126}
127
128static inline void update_load_set(struct load_weight *lw, unsigned long w)
129{
130 lw->weight = w;
131 lw->inv_weight = 0;
132}
133
116/* 134/*
117 * Increase the granularity value when there are more CPUs, 135 * Increase the granularity value when there are more CPUs,
118 * because with more CPUs the 'effective latency' as visible 136 * because with more CPUs the 'effective latency' as visible
@@ -662,6 +680,26 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
662 return calc_delta_fair(sched_slice(cfs_rq, se), se); 680 return calc_delta_fair(sched_slice(cfs_rq, se), se);
663} 681}
664 682
683#ifdef CONFIG_SMP
684static inline void __update_task_entity_contrib(struct sched_entity *se);
685
686/* Give new task start runnable values to heavy its load in infant time */
687void init_task_runnable_average(struct task_struct *p)
688{
689 u32 slice;
690
691 p->se.avg.decay_count = 0;
692 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
693 p->se.avg.runnable_avg_sum = slice;
694 p->se.avg.runnable_avg_period = slice;
695 __update_task_entity_contrib(&p->se);
696}
697#else
698void init_task_runnable_average(struct task_struct *p)
699{
700}
701#endif
702
665/* 703/*
666 * Update the current task's runtime statistics. Skip current tasks that 704 * Update the current task's runtime statistics. Skip current tasks that
667 * are not in our scheduling class. 705 * are not in our scheduling class.
@@ -686,7 +724,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
686static void update_curr(struct cfs_rq *cfs_rq) 724static void update_curr(struct cfs_rq *cfs_rq)
687{ 725{
688 struct sched_entity *curr = cfs_rq->curr; 726 struct sched_entity *curr = cfs_rq->curr;
689 u64 now = rq_of(cfs_rq)->clock_task; 727 u64 now = rq_clock_task(rq_of(cfs_rq));
690 unsigned long delta_exec; 728 unsigned long delta_exec;
691 729
692 if (unlikely(!curr)) 730 if (unlikely(!curr))
@@ -718,7 +756,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
718static inline void 756static inline void
719update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 757update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
720{ 758{
721 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); 759 schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
722} 760}
723 761
724/* 762/*
@@ -738,14 +776,14 @@ static void
738update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 776update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
739{ 777{
740 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, 778 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
741 rq_of(cfs_rq)->clock - se->statistics.wait_start)); 779 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
742 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); 780 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
743 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + 781 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
744 rq_of(cfs_rq)->clock - se->statistics.wait_start); 782 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
745#ifdef CONFIG_SCHEDSTATS 783#ifdef CONFIG_SCHEDSTATS
746 if (entity_is_task(se)) { 784 if (entity_is_task(se)) {
747 trace_sched_stat_wait(task_of(se), 785 trace_sched_stat_wait(task_of(se),
748 rq_of(cfs_rq)->clock - se->statistics.wait_start); 786 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
749 } 787 }
750#endif 788#endif
751 schedstat_set(se->statistics.wait_start, 0); 789 schedstat_set(se->statistics.wait_start, 0);
@@ -771,7 +809,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
771 /* 809 /*
772 * We are starting a new run period: 810 * We are starting a new run period:
773 */ 811 */
774 se->exec_start = rq_of(cfs_rq)->clock_task; 812 se->exec_start = rq_clock_task(rq_of(cfs_rq));
775} 813}
776 814
777/************************************************** 815/**************************************************
@@ -813,7 +851,7 @@ void task_numa_fault(int node, int pages, bool migrated)
813{ 851{
814 struct task_struct *p = current; 852 struct task_struct *p = current;
815 853
816 if (!sched_feat_numa(NUMA)) 854 if (!numabalancing_enabled)
817 return; 855 return;
818 856
819 /* FIXME: Allocate task-specific structure for placement policy here */ 857 /* FIXME: Allocate task-specific structure for placement policy here */
@@ -1037,7 +1075,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
1037 * to gain a more accurate current total weight. See 1075 * to gain a more accurate current total weight. See
1038 * update_cfs_rq_load_contribution(). 1076 * update_cfs_rq_load_contribution().
1039 */ 1077 */
1040 tg_weight = atomic64_read(&tg->load_avg); 1078 tg_weight = atomic_long_read(&tg->load_avg);
1041 tg_weight -= cfs_rq->tg_load_contrib; 1079 tg_weight -= cfs_rq->tg_load_contrib;
1042 tg_weight += cfs_rq->load.weight; 1080 tg_weight += cfs_rq->load.weight;
1043 1081
@@ -1110,8 +1148,7 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
1110} 1148}
1111#endif /* CONFIG_FAIR_GROUP_SCHED */ 1149#endif /* CONFIG_FAIR_GROUP_SCHED */
1112 1150
1113/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ 1151#ifdef CONFIG_SMP
1114#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1115/* 1152/*
1116 * We choose a half-life close to 1 scheduling period. 1153 * We choose a half-life close to 1 scheduling period.
1117 * Note: The tables below are dependent on this value. 1154 * Note: The tables below are dependent on this value.
@@ -1319,13 +1356,13 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1319 int force_update) 1356 int force_update)
1320{ 1357{
1321 struct task_group *tg = cfs_rq->tg; 1358 struct task_group *tg = cfs_rq->tg;
1322 s64 tg_contrib; 1359 long tg_contrib;
1323 1360
1324 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; 1361 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1325 tg_contrib -= cfs_rq->tg_load_contrib; 1362 tg_contrib -= cfs_rq->tg_load_contrib;
1326 1363
1327 if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { 1364 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1328 atomic64_add(tg_contrib, &tg->load_avg); 1365 atomic_long_add(tg_contrib, &tg->load_avg);
1329 cfs_rq->tg_load_contrib += tg_contrib; 1366 cfs_rq->tg_load_contrib += tg_contrib;
1330 } 1367 }
1331} 1368}
@@ -1360,8 +1397,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
1360 u64 contrib; 1397 u64 contrib;
1361 1398
1362 contrib = cfs_rq->tg_load_contrib * tg->shares; 1399 contrib = cfs_rq->tg_load_contrib * tg->shares;
1363 se->avg.load_avg_contrib = div64_u64(contrib, 1400 se->avg.load_avg_contrib = div_u64(contrib,
1364 atomic64_read(&tg->load_avg) + 1); 1401 atomic_long_read(&tg->load_avg) + 1);
1365 1402
1366 /* 1403 /*
1367 * For group entities we need to compute a correction term in the case 1404 * For group entities we need to compute a correction term in the case
@@ -1480,8 +1517,9 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1480 if (!decays && !force_update) 1517 if (!decays && !force_update)
1481 return; 1518 return;
1482 1519
1483 if (atomic64_read(&cfs_rq->removed_load)) { 1520 if (atomic_long_read(&cfs_rq->removed_load)) {
1484 u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); 1521 unsigned long removed_load;
1522 removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
1485 subtract_blocked_load_contrib(cfs_rq, removed_load); 1523 subtract_blocked_load_contrib(cfs_rq, removed_load);
1486 } 1524 }
1487 1525
@@ -1497,7 +1535,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1497 1535
1498static inline void update_rq_runnable_avg(struct rq *rq, int runnable) 1536static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1499{ 1537{
1500 __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); 1538 __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
1501 __update_tg_runnable_avg(&rq->avg, &rq->cfs); 1539 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
1502} 1540}
1503 1541
@@ -1510,9 +1548,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1510 * We track migrations using entity decay_count <= 0, on a wake-up 1548 * We track migrations using entity decay_count <= 0, on a wake-up
1511 * migration we use a negative decay count to track the remote decays 1549 * migration we use a negative decay count to track the remote decays
1512 * accumulated while sleeping. 1550 * accumulated while sleeping.
1551 *
1552 * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
1553 * are seen by enqueue_entity_load_avg() as a migration with an already
1554 * constructed load_avg_contrib.
1513 */ 1555 */
1514 if (unlikely(se->avg.decay_count <= 0)) { 1556 if (unlikely(se->avg.decay_count <= 0)) {
1515 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; 1557 se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
1516 if (se->avg.decay_count) { 1558 if (se->avg.decay_count) {
1517 /* 1559 /*
1518 * In a wake-up migration we have to approximate the 1560 * In a wake-up migration we have to approximate the
@@ -1530,7 +1572,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1530 } 1572 }
1531 wakeup = 0; 1573 wakeup = 0;
1532 } else { 1574 } else {
1533 __synchronize_entity_decay(se); 1575 /*
1576 * Task re-woke on same cpu (or else migrate_task_rq_fair()
1577 * would have made count negative); we must be careful to avoid
1578 * double-accounting blocked time after synchronizing decays.
1579 */
1580 se->avg.last_runnable_update += __synchronize_entity_decay(se)
1581 << 20;
1534 } 1582 }
1535 1583
1536 /* migrated tasks did not contribute to our blocked load */ 1584 /* migrated tasks did not contribute to our blocked load */
@@ -1607,7 +1655,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1607 tsk = task_of(se); 1655 tsk = task_of(se);
1608 1656
1609 if (se->statistics.sleep_start) { 1657 if (se->statistics.sleep_start) {
1610 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; 1658 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
1611 1659
1612 if ((s64)delta < 0) 1660 if ((s64)delta < 0)
1613 delta = 0; 1661 delta = 0;
@@ -1624,7 +1672,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1624 } 1672 }
1625 } 1673 }
1626 if (se->statistics.block_start) { 1674 if (se->statistics.block_start) {
1627 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; 1675 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
1628 1676
1629 if ((s64)delta < 0) 1677 if ((s64)delta < 0)
1630 delta = 0; 1678 delta = 0;
@@ -1712,7 +1760,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1712{ 1760{
1713 /* 1761 /*
1714 * Update the normalized vruntime before updating min_vruntime 1762 * Update the normalized vruntime before updating min_vruntime
1715 * through callig update_curr(). 1763 * through calling update_curr().
1716 */ 1764 */
1717 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) 1765 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
1718 se->vruntime += cfs_rq->min_vruntime; 1766 se->vruntime += cfs_rq->min_vruntime;
@@ -1805,9 +1853,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1805 struct task_struct *tsk = task_of(se); 1853 struct task_struct *tsk = task_of(se);
1806 1854
1807 if (tsk->state & TASK_INTERRUPTIBLE) 1855 if (tsk->state & TASK_INTERRUPTIBLE)
1808 se->statistics.sleep_start = rq_of(cfs_rq)->clock; 1856 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
1809 if (tsk->state & TASK_UNINTERRUPTIBLE) 1857 if (tsk->state & TASK_UNINTERRUPTIBLE)
1810 se->statistics.block_start = rq_of(cfs_rq)->clock; 1858 se->statistics.block_start = rq_clock(rq_of(cfs_rq));
1811 } 1859 }
1812#endif 1860#endif
1813 } 1861 }
@@ -1984,6 +2032,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1984 */ 2032 */
1985 update_entity_load_avg(curr, 1); 2033 update_entity_load_avg(curr, 1);
1986 update_cfs_rq_blocked_load(cfs_rq, 1); 2034 update_cfs_rq_blocked_load(cfs_rq, 1);
2035 update_cfs_shares(cfs_rq);
1987 2036
1988#ifdef CONFIG_SCHED_HRTICK 2037#ifdef CONFIG_SCHED_HRTICK
1989 /* 2038 /*
@@ -2082,7 +2131,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2082 if (unlikely(cfs_rq->throttle_count)) 2131 if (unlikely(cfs_rq->throttle_count))
2083 return cfs_rq->throttled_clock_task; 2132 return cfs_rq->throttled_clock_task;
2084 2133
2085 return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; 2134 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
2086} 2135}
2087 2136
2088/* returns 0 on failure to allocate runtime */ 2137/* returns 0 on failure to allocate runtime */
@@ -2138,10 +2187,9 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2138static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) 2187static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2139{ 2188{
2140 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 2189 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2141 struct rq *rq = rq_of(cfs_rq);
2142 2190
2143 /* if the deadline is ahead of our clock, nothing to do */ 2191 /* if the deadline is ahead of our clock, nothing to do */
2144 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) 2192 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
2145 return; 2193 return;
2146 2194
2147 if (cfs_rq->runtime_remaining < 0) 2195 if (cfs_rq->runtime_remaining < 0)
@@ -2230,7 +2278,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
2230#ifdef CONFIG_SMP 2278#ifdef CONFIG_SMP
2231 if (!cfs_rq->throttle_count) { 2279 if (!cfs_rq->throttle_count) {
2232 /* adjust cfs_rq_clock_task() */ 2280 /* adjust cfs_rq_clock_task() */
2233 cfs_rq->throttled_clock_task_time += rq->clock_task - 2281 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
2234 cfs_rq->throttled_clock_task; 2282 cfs_rq->throttled_clock_task;
2235 } 2283 }
2236#endif 2284#endif
@@ -2245,7 +2293,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
2245 2293
2246 /* group is entering throttled state, stop time */ 2294 /* group is entering throttled state, stop time */
2247 if (!cfs_rq->throttle_count) 2295 if (!cfs_rq->throttle_count)
2248 cfs_rq->throttled_clock_task = rq->clock_task; 2296 cfs_rq->throttled_clock_task = rq_clock_task(rq);
2249 cfs_rq->throttle_count++; 2297 cfs_rq->throttle_count++;
2250 2298
2251 return 0; 2299 return 0;
@@ -2284,7 +2332,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2284 rq->nr_running -= task_delta; 2332 rq->nr_running -= task_delta;
2285 2333
2286 cfs_rq->throttled = 1; 2334 cfs_rq->throttled = 1;
2287 cfs_rq->throttled_clock = rq->clock; 2335 cfs_rq->throttled_clock = rq_clock(rq);
2288 raw_spin_lock(&cfs_b->lock); 2336 raw_spin_lock(&cfs_b->lock);
2289 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 2337 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
2290 raw_spin_unlock(&cfs_b->lock); 2338 raw_spin_unlock(&cfs_b->lock);
@@ -2298,15 +2346,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
2298 int enqueue = 1; 2346 int enqueue = 1;
2299 long task_delta; 2347 long task_delta;
2300 2348
2301 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; 2349 se = cfs_rq->tg->se[cpu_of(rq)];
2302 2350
2303 cfs_rq->throttled = 0; 2351 cfs_rq->throttled = 0;
2352
2353 update_rq_clock(rq);
2354
2304 raw_spin_lock(&cfs_b->lock); 2355 raw_spin_lock(&cfs_b->lock);
2305 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; 2356 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
2306 list_del_rcu(&cfs_rq->throttled_list); 2357 list_del_rcu(&cfs_rq->throttled_list);
2307 raw_spin_unlock(&cfs_b->lock); 2358 raw_spin_unlock(&cfs_b->lock);
2308 2359
2309 update_rq_clock(rq);
2310 /* update hierarchical throttle state */ 2360 /* update hierarchical throttle state */
2311 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); 2361 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
2312 2362
@@ -2599,10 +2649,6 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2599 throttle_cfs_rq(cfs_rq); 2649 throttle_cfs_rq(cfs_rq);
2600} 2650}
2601 2651
2602static inline u64 default_cfs_period(void);
2603static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
2604static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
2605
2606static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) 2652static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
2607{ 2653{
2608 struct cfs_bandwidth *cfs_b = 2654 struct cfs_bandwidth *cfs_b =
@@ -2706,7 +2752,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
2706#else /* CONFIG_CFS_BANDWIDTH */ 2752#else /* CONFIG_CFS_BANDWIDTH */
2707static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) 2753static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2708{ 2754{
2709 return rq_of(cfs_rq)->clock_task; 2755 return rq_clock_task(rq_of(cfs_rq));
2710} 2756}
2711 2757
2712static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 2758static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
@@ -2919,7 +2965,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2919/* Used instead of source_load when we know the type == 0 */ 2965/* Used instead of source_load when we know the type == 0 */
2920static unsigned long weighted_cpuload(const int cpu) 2966static unsigned long weighted_cpuload(const int cpu)
2921{ 2967{
2922 return cpu_rq(cpu)->load.weight; 2968 return cpu_rq(cpu)->cfs.runnable_load_avg;
2923} 2969}
2924 2970
2925/* 2971/*
@@ -2964,9 +3010,10 @@ static unsigned long cpu_avg_load_per_task(int cpu)
2964{ 3010{
2965 struct rq *rq = cpu_rq(cpu); 3011 struct rq *rq = cpu_rq(cpu);
2966 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 3012 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
3013 unsigned long load_avg = rq->cfs.runnable_load_avg;
2967 3014
2968 if (nr_running) 3015 if (nr_running)
2969 return rq->load.weight / nr_running; 3016 return load_avg / nr_running;
2970 3017
2971 return 0; 3018 return 0;
2972} 3019}
@@ -3416,12 +3463,6 @@ unlock:
3416} 3463}
3417 3464
3418/* 3465/*
3419 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
3420 * removed when useful for applications beyond shares distribution (e.g.
3421 * load-balance).
3422 */
3423#ifdef CONFIG_FAIR_GROUP_SCHED
3424/*
3425 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 3466 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3426 * cfs_rq_of(p) references at time of call are still valid and identify the 3467 * cfs_rq_of(p) references at time of call are still valid and identify the
3427 * previous cpu. However, the caller only guarantees p->pi_lock is held; no 3468 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
@@ -3441,10 +3482,10 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3441 */ 3482 */
3442 if (se->avg.decay_count) { 3483 if (se->avg.decay_count) {
3443 se->avg.decay_count = -__synchronize_entity_decay(se); 3484 se->avg.decay_count = -__synchronize_entity_decay(se);
3444 atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); 3485 atomic_long_add(se->avg.load_avg_contrib,
3486 &cfs_rq->removed_load);
3445 } 3487 }
3446} 3488}
3447#endif
3448#endif /* CONFIG_SMP */ 3489#endif /* CONFIG_SMP */
3449 3490
3450static unsigned long 3491static unsigned long
@@ -3946,7 +3987,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3946 * 2) too many balance attempts have failed. 3987 * 2) too many balance attempts have failed.
3947 */ 3988 */
3948 3989
3949 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); 3990 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
3950 if (!tsk_cache_hot || 3991 if (!tsk_cache_hot ||
3951 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 3992 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3952 3993
@@ -4141,11 +4182,11 @@ static int tg_load_down(struct task_group *tg, void *data)
4141 long cpu = (long)data; 4182 long cpu = (long)data;
4142 4183
4143 if (!tg->parent) { 4184 if (!tg->parent) {
4144 load = cpu_rq(cpu)->load.weight; 4185 load = cpu_rq(cpu)->avg.load_avg_contrib;
4145 } else { 4186 } else {
4146 load = tg->parent->cfs_rq[cpu]->h_load; 4187 load = tg->parent->cfs_rq[cpu]->h_load;
4147 load *= tg->se[cpu]->load.weight; 4188 load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
4148 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 4189 tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
4149 } 4190 }
4150 4191
4151 tg->cfs_rq[cpu]->h_load = load; 4192 tg->cfs_rq[cpu]->h_load = load;
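
For illustration (not part of the patch above): the new h_load computation scales the parent group's hierarchical load by this entity's share of the parent's runnable load average. A minimal userspace sketch of that arithmetic, with hypothetical numbers and a plain division standing in for the kernel's div64_ul() helper:

    #include <stdio.h>
    #include <stdint.h>

    /* plain division standing in for the kernel's div64_ul() helper */
    static uint64_t div64_ul(uint64_t dividend, uint64_t divisor)
    {
            return dividend / divisor;
    }

    int main(void)
    {
            /* hypothetical per-cpu numbers for a two-level hierarchy */
            uint64_t root_contrib = 2048;   /* cpu_rq(cpu)->avg.load_avg_contrib */
            uint64_t se_contrib   = 512;    /* tg->se[cpu]->avg.load_avg_contrib */
            uint64_t parent_rla   = 1024;   /* parent cfs_rq runnable_load_avg   */

            /* root group: h_load is the rq's own tracked contribution */
            uint64_t parent_h_load = root_contrib;

            /* child group: scale the parent's h_load by this entity's share */
            uint64_t child_h_load =
                    div64_ul(parent_h_load * se_contrib, parent_rla + 1);

            printf("child h_load = %llu\n", (unsigned long long)child_h_load);
            return 0;
    }
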
@@ -4171,12 +4212,9 @@ static void update_h_load(long cpu)
4171static unsigned long task_h_load(struct task_struct *p) 4212static unsigned long task_h_load(struct task_struct *p)
4172{ 4213{
4173 struct cfs_rq *cfs_rq = task_cfs_rq(p); 4214 struct cfs_rq *cfs_rq = task_cfs_rq(p);
4174 unsigned long load;
4175 4215
4176 load = p->se.load.weight; 4216 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
4177 load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1); 4217 cfs_rq->runnable_load_avg + 1);
4178
4179 return load;
4180} 4218}
4181#else 4219#else
4182static inline void update_blocked_averages(int cpu) 4220static inline void update_blocked_averages(int cpu)
@@ -4189,7 +4227,7 @@ static inline void update_h_load(long cpu)
4189 4227
4190static unsigned long task_h_load(struct task_struct *p) 4228static unsigned long task_h_load(struct task_struct *p)
4191{ 4229{
4192 return p->se.load.weight; 4230 return p->se.avg.load_avg_contrib;
4193} 4231}
4194#endif 4232#endif
4195 4233
@@ -4243,6 +4281,8 @@ struct sg_lb_stats {
4243 * get_sd_load_idx - Obtain the load index for a given sched domain. 4281 * get_sd_load_idx - Obtain the load index for a given sched domain.
4244 * @sd: The sched_domain whose load_idx is to be obtained. 4282 * @sd: The sched_domain whose load_idx is to be obtained.
4245 * @idle: The Idle status of the CPU for whose sd load_icx is obtained. 4283 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
4284 *
4285 * Return: The load index.
4246 */ 4286 */
4247static inline int get_sd_load_idx(struct sched_domain *sd, 4287static inline int get_sd_load_idx(struct sched_domain *sd,
4248 enum cpu_idle_type idle) 4288 enum cpu_idle_type idle)
@@ -4302,7 +4342,7 @@ static unsigned long scale_rt_power(int cpu)
4302 age_stamp = ACCESS_ONCE(rq->age_stamp); 4342 age_stamp = ACCESS_ONCE(rq->age_stamp);
4303 avg = ACCESS_ONCE(rq->rt_avg); 4343 avg = ACCESS_ONCE(rq->rt_avg);
4304 4344
4305 total = sched_avg_period() + (rq->clock - age_stamp); 4345 total = sched_avg_period() + (rq_clock(rq) - age_stamp);
4306 4346
4307 if (unlikely(total < avg)) { 4347 if (unlikely(total < avg)) {
4308 /* Ensures that power won't end up being negative */ 4348 /* Ensures that power won't end up being negative */
@@ -4537,6 +4577,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4537 * 4577 *
4538 * Determine if @sg is a busier group than the previously selected 4578 * Determine if @sg is a busier group than the previously selected
4539 * busiest group. 4579 * busiest group.
4580 *
4581 * Return: %true if @sg is a busier group than the previously selected
4582 * busiest group. %false otherwise.
4540 */ 4583 */
4541static bool update_sd_pick_busiest(struct lb_env *env, 4584static bool update_sd_pick_busiest(struct lb_env *env,
4542 struct sd_lb_stats *sds, 4585 struct sd_lb_stats *sds,
@@ -4654,7 +4697,7 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4654 * assuming lower CPU number will be equivalent to lower a SMT thread 4697 * assuming lower CPU number will be equivalent to lower a SMT thread
4655 * number. 4698 * number.
4656 * 4699 *
4657 * Returns 1 when packing is required and a task should be moved to 4700 * Return: 1 when packing is required and a task should be moved to
4658 * this CPU. The amount of the imbalance is returned in *imbalance. 4701 * this CPU. The amount of the imbalance is returned in *imbalance.
4659 * 4702 *
4660 * @env: The load balancing environment. 4703 * @env: The load balancing environment.
@@ -4832,7 +4875,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4832 * @balance: Pointer to a variable indicating if this_cpu 4875 * @balance: Pointer to a variable indicating if this_cpu
4833 * is the appropriate cpu to perform load balancing at this_level. 4876 * is the appropriate cpu to perform load balancing at this_level.
4834 * 4877 *
4835 * Returns: - the busiest group if imbalance exists. 4878 * Return: - The busiest group if imbalance exists.
4836 * - If no imbalance and user has opted for power-savings balance, 4879 * - If no imbalance and user has opted for power-savings balance,
4837 * return the least loaded group whose CPUs can be 4880 * return the least loaded group whose CPUs can be
4838 * put to idle by rebalancing its tasks onto our group. 4881 * put to idle by rebalancing its tasks onto our group.
@@ -5241,7 +5284,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5241 int pulled_task = 0; 5284 int pulled_task = 0;
5242 unsigned long next_balance = jiffies + HZ; 5285 unsigned long next_balance = jiffies + HZ;
5243 5286
5244 this_rq->idle_stamp = this_rq->clock; 5287 this_rq->idle_stamp = rq_clock(this_rq);
5245 5288
5246 if (this_rq->avg_idle < sysctl_sched_migration_cost) 5289 if (this_rq->avg_idle < sysctl_sched_migration_cost)
5247 return; 5290 return;
@@ -5418,10 +5461,9 @@ static inline void nohz_balance_exit_idle(int cpu)
5418static inline void set_cpu_sd_state_busy(void) 5461static inline void set_cpu_sd_state_busy(void)
5419{ 5462{
5420 struct sched_domain *sd; 5463 struct sched_domain *sd;
5421 int cpu = smp_processor_id();
5422 5464
5423 rcu_read_lock(); 5465 rcu_read_lock();
5424 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); 5466 sd = rcu_dereference_check_sched_domain(this_rq()->sd);
5425 5467
5426 if (!sd || !sd->nohz_idle) 5468 if (!sd || !sd->nohz_idle)
5427 goto unlock; 5469 goto unlock;
@@ -5436,10 +5478,9 @@ unlock:
5436void set_cpu_sd_state_idle(void) 5478void set_cpu_sd_state_idle(void)
5437{ 5479{
5438 struct sched_domain *sd; 5480 struct sched_domain *sd;
5439 int cpu = smp_processor_id();
5440 5481
5441 rcu_read_lock(); 5482 rcu_read_lock();
5442 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); 5483 sd = rcu_dereference_check_sched_domain(this_rq()->sd);
5443 5484
5444 if (!sd || sd->nohz_idle) 5485 if (!sd || sd->nohz_idle)
5445 goto unlock; 5486 goto unlock;
@@ -5471,7 +5512,7 @@ void nohz_balance_enter_idle(int cpu)
5471 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 5512 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5472} 5513}
5473 5514
5474static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, 5515static int sched_ilb_notifier(struct notifier_block *nfb,
5475 unsigned long action, void *hcpu) 5516 unsigned long action, void *hcpu)
5476{ 5517{
5477 switch (action & ~CPU_TASKS_FROZEN) { 5518 switch (action & ~CPU_TASKS_FROZEN) {
@@ -5751,7 +5792,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
5751 entity_tick(cfs_rq, se, queued); 5792 entity_tick(cfs_rq, se, queued);
5752 } 5793 }
5753 5794
5754 if (sched_feat_numa(NUMA)) 5795 if (numabalancing_enabled)
5755 task_tick_numa(rq, curr); 5796 task_tick_numa(rq, curr);
5756 5797
5757 update_rq_runnable_avg(rq, 1); 5798 update_rq_runnable_avg(rq, 1);
@@ -5848,7 +5889,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
5848 se->vruntime -= cfs_rq->min_vruntime; 5889 se->vruntime -= cfs_rq->min_vruntime;
5849 } 5890 }
5850 5891
5851#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) 5892#ifdef CONFIG_SMP
5852 /* 5893 /*
5853 * Remove our load from contribution when we leave sched_fair 5894 * Remove our load from contribution when we leave sched_fair
5854 * and ensure we don't carry in an old decay_count if we 5895 * and ensure we don't carry in an old decay_count if we
@@ -5907,9 +5948,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
5907#ifndef CONFIG_64BIT 5948#ifndef CONFIG_64BIT
5908 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 5949 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5909#endif 5950#endif
5910#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) 5951#ifdef CONFIG_SMP
5911 atomic64_set(&cfs_rq->decay_counter, 1); 5952 atomic64_set(&cfs_rq->decay_counter, 1);
5912 atomic64_set(&cfs_rq->removed_load, 0); 5953 atomic_long_set(&cfs_rq->removed_load, 0);
5913#endif 5954#endif
5914} 5955}
5915 5956
@@ -6091,6 +6132,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
6091 se = tg->se[i]; 6132 se = tg->se[i];
6092 /* Propagate contribution to hierarchy */ 6133 /* Propagate contribution to hierarchy */
6093 raw_spin_lock_irqsave(&rq->lock, flags); 6134 raw_spin_lock_irqsave(&rq->lock, flags);
6135
6136 /* Possible calls to update_curr() need rq clock */
6137 update_rq_clock(rq);
6094 for_each_sched_entity(se) 6138 for_each_sched_entity(se)
6095 update_cfs_shares(group_cfs_rq(se)); 6139 update_cfs_shares(group_cfs_rq(se));
6096 raw_spin_unlock_irqrestore(&rq->lock, flags); 6140 raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -6146,9 +6190,8 @@ const struct sched_class fair_sched_class = {
6146 6190
6147#ifdef CONFIG_SMP 6191#ifdef CONFIG_SMP
6148 .select_task_rq = select_task_rq_fair, 6192 .select_task_rq = select_task_rq_fair,
6149#ifdef CONFIG_FAIR_GROUP_SCHED
6150 .migrate_task_rq = migrate_task_rq_fair, 6193 .migrate_task_rq = migrate_task_rq_fair,
6151#endif 6194
6152 .rq_online = rq_online_fair, 6195 .rq_online = rq_online_fair,
6153 .rq_offline = rq_offline_fair, 6196 .rq_offline = rq_offline_fair,
6154 6197
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
new file mode 100644
index 000000000000..16f5a30f9c88
--- /dev/null
+++ b/kernel/sched/proc.c
@@ -0,0 +1,591 @@
1/*
2 * kernel/sched/proc.c
3 *
4 * Kernel load calculations, forked from sched/core.c
5 */
6
7#include <linux/export.h>
8
9#include "sched.h"
10
11unsigned long this_cpu_load(void)
12{
13 struct rq *this = this_rq();
14 return this->cpu_load[0];
15}
16
17
18/*
19 * Global load-average calculations
20 *
21 * We take a distributed and async approach to calculating the global load-avg
22 * in order to minimize overhead.
23 *
24 * The global load average is an exponentially decaying average of nr_running +
25 * nr_uninterruptible.
26 *
27 * Once every LOAD_FREQ:
28 *
29 * nr_active = 0;
30 * for_each_possible_cpu(cpu)
31 * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
32 *
33 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
34 *
35 * Due to a number of reasons the above turns in the mess below:
36 *
37 * - for_each_possible_cpu() is prohibitively expensive on machines with
38 * serious number of cpus, therefore we need to take a distributed approach
39 * to calculating nr_active.
40 *
41 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
42 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
43 *
44 * So assuming nr_active := 0 when we start out -- true per definition, we
45 * can simply take per-cpu deltas and fold those into a global accumulate
46 * to obtain the same result. See calc_load_fold_active().
47 *
48 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
49 * across the machine, we assume 10 ticks is sufficient time for every
50 * cpu to have completed this task.
51 *
52 * This places an upper-bound on the IRQ-off latency of the machine. Then
 53 * again, being late doesn't lose the delta, just wrecks the sample.
54 *
55 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
56 * this would add another cross-cpu cacheline miss and atomic operation
57 * to the wakeup path. Instead we increment on whatever cpu the task ran
58 * when it went into uninterruptible state and decrement on whatever cpu
59 * did the wakeup. This means that only the sum of nr_uninterruptible over
60 * all cpus yields the correct result.
61 *
62 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
63 */
64
65/* Variables and functions for calc_load */
66atomic_long_t calc_load_tasks;
67unsigned long calc_load_update;
68unsigned long avenrun[3];
69EXPORT_SYMBOL(avenrun); /* should be removed */
70
71/**
72 * get_avenrun - get the load average array
73 * @loads: pointer to dest load array
74 * @offset: offset to add
75 * @shift: shift count to shift the result left
76 *
77 * These values are estimates at best, so no need for locking.
78 */
79void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
80{
81 loads[0] = (avenrun[0] + offset) << shift;
82 loads[1] = (avenrun[1] + offset) << shift;
83 loads[2] = (avenrun[2] + offset) << shift;
84}
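
For illustration: the raw avenrun[] samples are fixed-point values, and readers such as /proc/loadavg split them into the familiar integer/fraction form. A minimal userspace sketch of that conversion, assuming FSHIFT/FIXED_1 as defined in include/linux/sched.h and a hypothetical raw sample (not part of the patch):

    #include <stdio.h>

    #define FSHIFT  11                      /* bits of fixed-point precision */
    #define FIXED_1 (1UL << FSHIFT)         /* 1.0 in fixed point == 2048 */

    /* split a fixed-point load sample into integer and two-digit fraction */
    #define LOAD_INT(x)  ((x) >> FSHIFT)
    #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

    int main(void)
    {
            unsigned long raw = 860;                /* hypothetical avenrun[0], i.e. ~0.42 */
            /* the FIXED_1/200 offset rounds to the nearest hundredth */
            unsigned long load = raw + FIXED_1 / 200;

            printf("loadavg: %lu.%02lu\n", LOAD_INT(load), LOAD_FRAC(load));
            return 0;
    }
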
85
86long calc_load_fold_active(struct rq *this_rq)
87{
88 long nr_active, delta = 0;
89
90 nr_active = this_rq->nr_running;
91 nr_active += (long) this_rq->nr_uninterruptible;
92
93 if (nr_active != this_rq->calc_load_active) {
94 delta = nr_active - this_rq->calc_load_active;
95 this_rq->calc_load_active = nr_active;
96 }
97
98 return delta;
99}
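
To make the delta-folding identity above concrete, here is a small userspace sketch (not part of the patch) with the rq state replaced by plain arrays; it shows that summing the folded per-cpu deltas reproduces the direct sum of the per-cpu active counts:

    #include <stdio.h>

    #define NR_CPUS 4

    /* per-cpu state: current active count and the last value we folded */
    static long nr_active[NR_CPUS];
    static long calc_load_active[NR_CPUS];
    static long calc_load_tasks;            /* the global accumulator */

    /* same idea as calc_load_fold_active(): fold only the delta */
    static long fold_active(int cpu)
    {
            long delta = nr_active[cpu] - calc_load_active[cpu];

            calc_load_active[cpu] = nr_active[cpu];
            return delta;
    }

    int main(void)
    {
            int cpu, round;
            long direct;

            for (round = 0; round < 3; round++) {
                    /* pretend runnable/uninterruptible counts changed */
                    for (cpu = 0; cpu < NR_CPUS; cpu++)
                            nr_active[cpu] = (cpu + round) % 3;

                    /* each cpu folds its own delta at its own time */
                    for (cpu = 0; cpu < NR_CPUS; cpu++)
                            calc_load_tasks += fold_active(cpu);

                    direct = 0;
                    for (cpu = 0; cpu < NR_CPUS; cpu++)
                            direct += nr_active[cpu];

                    printf("round %d: folded=%ld direct=%ld\n",
                           round, calc_load_tasks, direct);
            }
            return 0;
    }
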
100
101/*
102 * a1 = a0 * e + a * (1 - e)
103 */
104static unsigned long
105calc_load(unsigned long load, unsigned long exp, unsigned long active)
106{
107 load *= exp;
108 load += active * (FIXED_1 - exp);
109 load += 1UL << (FSHIFT - 1);
110 return load >> FSHIFT;
111}
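
A worked example of the exponential-average step above, as a small userspace sketch (not part of the patch); the EXP_1/EXP_5/EXP_15 fixed-point constants are assumed from include/linux/sched.h and the workload (two tasks runnable for ten LOAD_FREQ windows) is hypothetical:

    #include <stdio.h>

    #define FSHIFT  11
    #define FIXED_1 (1UL << FSHIFT)
    #define EXP_1   1884    /* 1/exp(5sec/1min) in fixed point */
    #define EXP_5   2014    /* 1/exp(5sec/5min) */
    #define EXP_15  2037    /* 1/exp(5sec/15min) */

    /* same arithmetic as calc_load() above */
    static unsigned long
    calc_load(unsigned long load, unsigned long exp, unsigned long active)
    {
            load *= exp;
            load += active * (FIXED_1 - exp);
            load += 1UL << (FSHIFT - 1);    /* round to nearest */
            return load >> FSHIFT;
    }

    int main(void)
    {
            unsigned long avenrun[3] = { 0, 0, 0 };
            unsigned long active = 2 * FIXED_1;     /* two runnable tasks, steady */
            int i;

            /* ten LOAD_FREQ (~5s) windows of constant load 2.0 */
            for (i = 0; i < 10; i++) {
                    avenrun[0] = calc_load(avenrun[0], EXP_1, active);
                    avenrun[1] = calc_load(avenrun[1], EXP_5, active);
                    avenrun[2] = calc_load(avenrun[2], EXP_15, active);
            }

            printf("1min=%lu.%02lu 5min=%lu.%02lu 15min=%lu.%02lu\n",
                   avenrun[0] >> FSHIFT, ((avenrun[0] & (FIXED_1 - 1)) * 100) >> FSHIFT,
                   avenrun[1] >> FSHIFT, ((avenrun[1] & (FIXED_1 - 1)) * 100) >> FSHIFT,
                   avenrun[2] >> FSHIFT, ((avenrun[2] & (FIXED_1 - 1)) * 100) >> FSHIFT);
            return 0;
    }
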
112
113#ifdef CONFIG_NO_HZ_COMMON
114/*
115 * Handle NO_HZ for the global load-average.
116 *
117 * Since the above described distributed algorithm to compute the global
118 * load-average relies on per-cpu sampling from the tick, it is affected by
119 * NO_HZ.
120 *
121 * The basic idea is to fold the nr_active delta into a global idle-delta upon
122 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
123 * when we read the global state.
124 *
125 * Obviously reality has to ruin such a delightfully simple scheme:
126 *
127 * - When we go NO_HZ idle during the window, we can negate our sample
128 * contribution, causing under-accounting.
129 *
130 * We avoid this by keeping two idle-delta counters and flipping them
131 * when the window starts, thus separating old and new NO_HZ load.
132 *
133 * The only trick is the slight shift in index flip for read vs write.
134 *
135 * 0s 5s 10s 15s
136 * +10 +10 +10 +10
137 * |-|-----------|-|-----------|-|-----------|-|
138 * r:0 0 1 1 0 0 1 1 0
139 * w:0 1 1 0 0 1 1 0 0
140 *
141 * This ensures we'll fold the old idle contribution in this window while
142 * accumulating the new one.
143 *
144 * - When we wake up from NO_HZ idle during the window, we push up our
145 * contribution, since we effectively move our sample point to a known
146 * busy state.
147 *
148 * This is solved by pushing the window forward, and thus skipping the
149 * sample, for this cpu (effectively using the idle-delta for this cpu which
150 * was in effect at the time the window opened). This also solves the issue
151 * of having to deal with a cpu having been in NOHZ idle for multiple
152 * LOAD_FREQ intervals.
153 *
154 * When making the ILB scale, we should try to pull this in as well.
155 */
156static atomic_long_t calc_load_idle[2];
157static int calc_load_idx;
158
159static inline int calc_load_write_idx(void)
160{
161 int idx = calc_load_idx;
162
163 /*
164 * See calc_global_nohz(), if we observe the new index, we also
165 * need to observe the new update time.
166 */
167 smp_rmb();
168
169 /*
170 * If the folding window started, make sure we start writing in the
171 * next idle-delta.
172 */
173 if (!time_before(jiffies, calc_load_update))
174 idx++;
175
176 return idx & 1;
177}
178
179static inline int calc_load_read_idx(void)
180{
181 return calc_load_idx & 1;
182}
183
184void calc_load_enter_idle(void)
185{
186 struct rq *this_rq = this_rq();
187 long delta;
188
189 /*
190 * We're going into NOHZ mode, if there's any pending delta, fold it
191 * into the pending idle delta.
192 */
193 delta = calc_load_fold_active(this_rq);
194 if (delta) {
195 int idx = calc_load_write_idx();
196 atomic_long_add(delta, &calc_load_idle[idx]);
197 }
198}
199
200void calc_load_exit_idle(void)
201{
202 struct rq *this_rq = this_rq();
203
204 /*
205 * If we're still before the sample window, we're done.
206 */
207 if (time_before(jiffies, this_rq->calc_load_update))
208 return;
209
210 /*
211 * We woke inside or after the sample window, this means we're already
212 * accounted through the nohz accounting, so skip the entire deal and
213 * sync up for the next window.
214 */
215 this_rq->calc_load_update = calc_load_update;
216 if (time_before(jiffies, this_rq->calc_load_update + 10))
217 this_rq->calc_load_update += LOAD_FREQ;
218}
219
220static long calc_load_fold_idle(void)
221{
222 int idx = calc_load_read_idx();
223 long delta = 0;
224
225 if (atomic_long_read(&calc_load_idle[idx]))
226 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
227
228 return delta;
229}
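
A deliberately simplified toy model of the two idle-delta buckets (not part of the patch): an integer stands in for jiffies, there is no locking, and it only shows why a delta written after the window opens is not folded until the next window:

    #include <stdio.h>

    /* toy model of the calc_load_idle[] double buffer */
    static long idle_bucket[2];
    static int  calc_load_idx;
    static long window_start;               /* stands in for calc_load_update */

    static int write_idx(long now)
    {
            int idx = calc_load_idx;

            /* once the fold window has opened, park new deltas in the
             * other bucket so the reader of this window won't see them */
            if (now >= window_start)
                    idx++;
            return idx & 1;
    }

    static long fold_idle(void)
    {
            int idx = calc_load_idx & 1;
            long delta = idle_bucket[idx];

            idle_bucket[idx] = 0;
            return delta;
    }

    int main(void)
    {
            window_start = 100;

            idle_bucket[write_idx(90)]  += 2;   /* went idle before the window */
            idle_bucket[write_idx(105)] += 5;   /* went idle after it opened   */

            /* global update for this window: only the old delta is folded */
            printf("folded now: %ld\n", fold_idle());

            calc_load_idx++;                    /* flip, as calc_global_nohz() does */
            window_start += 50;                 /* advance to the next window       */

            printf("folded next window: %ld\n", fold_idle());
            return 0;
    }
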
230
231/**
232 * fixed_power_int - compute: x^n, in O(log n) time
233 *
234 * @x: base of the power
235 * @frac_bits: fractional bits of @x
236 * @n: power to raise @x to.
237 *
238 * By exploiting the relation between the definition of the natural power
239 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
240 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
241 * (where: n_i \elem {0, 1}, the binary vector representing n),
242 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
243 * of course trivially computable in O(log_2 n), the length of our binary
244 * vector.
245 */
246static unsigned long
247fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
248{
249 unsigned long result = 1UL << frac_bits;
250
251 if (n) for (;;) {
252 if (n & 1) {
253 result *= x;
254 result += 1UL << (frac_bits - 1);
255 result >>= frac_bits;
256 }
257 n >>= 1;
258 if (!n)
259 break;
260 x *= x;
261 x += 1UL << (frac_bits - 1);
262 x >>= frac_bits;
263 }
264
265 return result;
266}
267
268/*
269 * a1 = a0 * e + a * (1 - e)
270 *
271 * a2 = a1 * e + a * (1 - e)
272 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
273 * = a0 * e^2 + a * (1 - e) * (1 + e)
274 *
275 * a3 = a2 * e + a * (1 - e)
276 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
277 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
278 *
279 * ...
280 *
281 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
282 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
283 * = a0 * e^n + a * (1 - e^n)
284 *
285 * [1] application of the geometric series:
286 *
287 * n 1 - x^(n+1)
288 * S_n := \Sum x^i = -------------
289 * i=0 1 - x
290 */
291static unsigned long
292calc_load_n(unsigned long load, unsigned long exp,
293 unsigned long active, unsigned int n)
294{
295
296 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
297}
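
A minimal userspace check (not part of the patch) that folding n missed windows in one step via fixed_power_int() tracks the result of iterating calc_load() n times, up to fixed-point rounding; the starting load and n are hypothetical:

    #include <stdio.h>

    #define FSHIFT  11
    #define FIXED_1 (1UL << FSHIFT)
    #define EXP_1   1884

    static unsigned long
    calc_load(unsigned long load, unsigned long exp, unsigned long active)
    {
            load *= exp;
            load += active * (FIXED_1 - exp);
            load += 1UL << (FSHIFT - 1);
            return load >> FSHIFT;
    }

    /* same O(log n) fixed-point exponentiation as fixed_power_int() above */
    static unsigned long
    fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
    {
            unsigned long result = 1UL << frac_bits;

            if (n) for (;;) {
                    if (n & 1) {
                            result *= x;
                            result += 1UL << (frac_bits - 1);
                            result >>= frac_bits;
                    }
                    n >>= 1;
                    if (!n)
                            break;
                    x *= x;
                    x += 1UL << (frac_bits - 1);
                    x >>= frac_bits;
            }
            return result;
    }

    static unsigned long
    calc_load_n(unsigned long load, unsigned long exp,
                unsigned long active, unsigned int n)
    {
            return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
    }

    int main(void)
    {
            unsigned long a_iter = 3 * FIXED_1, a_bulk = 3 * FIXED_1;
            unsigned long active = FIXED_1;         /* one task stays runnable */
            unsigned int n = 7, i;                  /* 7 missed LOAD_FREQ windows */

            for (i = 0; i < n; i++)
                    a_iter = calc_load(a_iter, EXP_1, active);
            a_bulk = calc_load_n(a_bulk, EXP_1, active, n);

            printf("iterated=%lu bulk=%lu (equal up to fixed-point rounding)\n",
                   a_iter, a_bulk);
            return 0;
    }
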
298
299/*
300 * NO_HZ can leave us missing all per-cpu ticks calling
301 * calc_load_account_active(), but since an idle CPU folds its delta into
302 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
303 * in the pending idle delta if our idle period crossed a load cycle boundary.
304 *
305 * Once we've updated the global active value, we need to apply the exponential
306 * weights adjusted to the number of cycles missed.
307 */
308static void calc_global_nohz(void)
309{
310 long delta, active, n;
311
312 if (!time_before(jiffies, calc_load_update + 10)) {
313 /*
314 * Catch-up, fold however many we are behind still
315 */
316 delta = jiffies - calc_load_update - 10;
317 n = 1 + (delta / LOAD_FREQ);
318
319 active = atomic_long_read(&calc_load_tasks);
320 active = active > 0 ? active * FIXED_1 : 0;
321
322 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
323 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
324 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
325
326 calc_load_update += n * LOAD_FREQ;
327 }
328
329 /*
330 * Flip the idle index...
331 *
332 * Make sure we first write the new time then flip the index, so that
333 * calc_load_write_idx() will see the new time when it reads the new
334 * index, this avoids a double flip messing things up.
335 */
336 smp_wmb();
337 calc_load_idx++;
338}
339#else /* !CONFIG_NO_HZ_COMMON */
340
341static inline long calc_load_fold_idle(void) { return 0; }
342static inline void calc_global_nohz(void) { }
343
344#endif /* CONFIG_NO_HZ_COMMON */
345
346/*
347 * calc_load - update the avenrun load estimates 10 ticks after the
348 * CPUs have updated calc_load_tasks.
349 */
350void calc_global_load(unsigned long ticks)
351{
352 long active, delta;
353
354 if (time_before(jiffies, calc_load_update + 10))
355 return;
356
357 /*
358 * Fold the 'old' idle-delta to include all NO_HZ cpus.
359 */
360 delta = calc_load_fold_idle();
361 if (delta)
362 atomic_long_add(delta, &calc_load_tasks);
363
364 active = atomic_long_read(&calc_load_tasks);
365 active = active > 0 ? active * FIXED_1 : 0;
366
367 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
368 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
369 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
370
371 calc_load_update += LOAD_FREQ;
372
373 /*
374 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
375 */
376 calc_global_nohz();
377}
378
379/*
380 * Called from update_cpu_load() to periodically update this CPU's
381 * active count.
382 */
383static void calc_load_account_active(struct rq *this_rq)
384{
385 long delta;
386
387 if (time_before(jiffies, this_rq->calc_load_update))
388 return;
389
390 delta = calc_load_fold_active(this_rq);
391 if (delta)
392 atomic_long_add(delta, &calc_load_tasks);
393
394 this_rq->calc_load_update += LOAD_FREQ;
395}
396
397/*
398 * End of global load-average stuff
399 */
400
401/*
402 * The exact cpuload at various idx values, calculated at every tick would be
403 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
404 *
405 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
406 * on nth tick when cpu may be busy, then we have:
407 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
408 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
409 *
410 * decay_load_missed() below does efficient calculation of
411 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
412 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
413 *
414 * The calculation is approximated on a 128 point scale.
415 * degrade_zero_ticks is the number of ticks after which load at any
416 * particular idx is approximated to be zero.
417 * degrade_factor is a precomputed table, a row for each load idx.
418 * Each column corresponds to degradation factor for a power of two ticks,
419 * based on 128 point scale.
420 * Example:
421 * row 2, col 3 (=12) says that the degradation at load idx 2 after
422 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
423 *
424 * With this power of 2 load factors, we can degrade the load n times
425 * by looking at 1 bits in n and doing as many mult/shift instead of
426 * n mult/shifts needed by the exact degradation.
427 */
428#define DEGRADE_SHIFT 7
429static const unsigned char
430 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
431static const unsigned char
432 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
433 {0, 0, 0, 0, 0, 0, 0, 0},
434 {64, 32, 8, 0, 0, 0, 0, 0},
435 {96, 72, 40, 12, 1, 0, 0},
436 {112, 98, 75, 43, 15, 1, 0},
437 {120, 112, 98, 76, 45, 16, 2} };
438
439/*
440 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
441 * would be when CPU is idle and so we just decay the old load without
442 * adding any new load.
443 */
444static unsigned long
445decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
446{
447 int j = 0;
448
449 if (!missed_updates)
450 return load;
451
452 if (missed_updates >= degrade_zero_ticks[idx])
453 return 0;
454
455 if (idx == 1)
456 return load >> missed_updates;
457
458 while (missed_updates) {
459 if (missed_updates % 2)
460 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
461
462 missed_updates >>= 1;
463 j++;
464 }
465 return load;
466}
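
To make the table-driven decay above concrete, a minimal userspace sketch (not part of the patch) compares it with the exact factor it approximates; the load value and missed-tick count are hypothetical, and pow() needs -lm:

    #include <stdio.h>
    #include <math.h>

    #define DEGRADE_SHIFT     7
    #define CPU_LOAD_IDX_MAX  5

    static const unsigned char degrade_zero_ticks[CPU_LOAD_IDX_MAX] =
            {0, 8, 32, 64, 128};
    static const unsigned char
    degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
            {0, 0, 0, 0, 0, 0, 0, 0},
            {64, 32, 8, 0, 0, 0, 0, 0},
            {96, 72, 40, 12, 1, 0, 0},
            {112, 98, 75, 43, 15, 1, 0},
            {120, 112, 98, 76, 45, 16, 2} };

    /* same bit-walk as decay_load_missed() above */
    static unsigned long
    decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
    {
            int j = 0;

            if (!missed_updates)
                    return load;
            if (missed_updates >= degrade_zero_ticks[idx])
                    return 0;
            if (idx == 1)
                    return load >> missed_updates;

            while (missed_updates) {
                    if (missed_updates % 2)
                            load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
                    missed_updates >>= 1;
                    j++;
            }
            return load;
    }

    int main(void)
    {
            unsigned long load = 1000;
            int idx = 2, missed = 8;

            /* exact factor is (3/4)^8; the table row approximates it as 12/128 */
            double exact = load * pow(3.0 / 4.0, missed);

            printf("approx=%lu exact=%.1f\n",
                   decay_load_missed(load, missed, idx), exact);
            return 0;
    }
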
467
468/*
469 * Update rq->cpu_load[] statistics. This function is usually called every
470 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
471 * every tick. We fix it up based on jiffies.
472 */
473static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
474 unsigned long pending_updates)
475{
476 int i, scale;
477
478 this_rq->nr_load_updates++;
479
480 /* Update our load: */
481 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
482 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
483 unsigned long old_load, new_load;
484
485 /* scale is effectively 1 << i now, and >> i divides by scale */
486
487 old_load = this_rq->cpu_load[i];
488 old_load = decay_load_missed(old_load, pending_updates - 1, i);
489 new_load = this_load;
490 /*
491 * Round up the averaging division if load is increasing. This
492 * prevents us from getting stuck on 9 if the load is 10, for
493 * example.
494 */
495 if (new_load > old_load)
496 new_load += scale - 1;
497
498 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
499 }
500
501 sched_avg_update(this_rq);
502}
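
A small userspace sketch of one cpu_load[] index (not part of the patch) shows why the "scale - 1" rounding matters: with a constant instantaneous load of 10 the average reaches 10 instead of sticking at 9. The index and load values are hypothetical:

    #include <stdio.h>

    /* one averaging step of __update_cpu_load() for a single index i */
    static unsigned long
    cpu_load_step(unsigned long old_load, unsigned long new_load, int i)
    {
            unsigned long scale = 1UL << i;

            /* round up when load is rising, as in the function above */
            if (new_load > old_load)
                    new_load += scale - 1;

            return (old_load * (scale - 1) + new_load) >> i;
    }

    int main(void)
    {
            unsigned long load = 0;
            int tick;

            /* constant instantaneous load of 10 at idx 2 (scale 4) */
            for (tick = 0; tick < 20; tick++)
                    load = cpu_load_step(load, 10, 2);

            /* without the "scale - 1" rounding this would converge to 9 */
            printf("cpu_load[2] after 20 ticks: %lu\n", load);
            return 0;
    }
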
503
504#ifdef CONFIG_SMP
505static inline unsigned long get_rq_runnable_load(struct rq *rq)
506{
507 return rq->cfs.runnable_load_avg;
508}
509#else
510static inline unsigned long get_rq_runnable_load(struct rq *rq)
511{
512 return rq->load.weight;
513}
514#endif
515
516#ifdef CONFIG_NO_HZ_COMMON
517/*
518 * There is no sane way to deal with nohz on smp when using jiffies because the
519 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
520 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
521 *
522 * Therefore we cannot use the delta approach from the regular tick since that
523 * would seriously skew the load calculation. However we'll make do for those
524 * updates happening while idle (nohz_idle_balance) or coming out of idle
525 * (tick_nohz_idle_exit).
526 *
527 * This means we might still be one tick off for nohz periods.
528 */
529
530/*
531 * Called from nohz_idle_balance() to update the load ratings before doing the
532 * idle balance.
533 */
534void update_idle_cpu_load(struct rq *this_rq)
535{
536 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
537 unsigned long load = get_rq_runnable_load(this_rq);
538 unsigned long pending_updates;
539
540 /*
541 * bail if there's load or we're actually up-to-date.
542 */
543 if (load || curr_jiffies == this_rq->last_load_update_tick)
544 return;
545
546 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
547 this_rq->last_load_update_tick = curr_jiffies;
548
549 __update_cpu_load(this_rq, load, pending_updates);
550}
551
552/*
553 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
554 */
555void update_cpu_load_nohz(void)
556{
557 struct rq *this_rq = this_rq();
558 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
559 unsigned long pending_updates;
560
561 if (curr_jiffies == this_rq->last_load_update_tick)
562 return;
563
564 raw_spin_lock(&this_rq->lock);
565 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
566 if (pending_updates) {
567 this_rq->last_load_update_tick = curr_jiffies;
568 /*
569 * We were idle, this means load 0, the current load might be
570 * !0 due to remote wakeups and the sort.
571 */
572 __update_cpu_load(this_rq, 0, pending_updates);
573 }
574 raw_spin_unlock(&this_rq->lock);
575}
576#endif /* CONFIG_NO_HZ */
577
578/*
579 * Called from scheduler_tick()
580 */
581void update_cpu_load_active(struct rq *this_rq)
582{
583 unsigned long load = get_rq_runnable_load(this_rq);
584 /*
585 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
586 */
587 this_rq->last_load_update_tick = jiffies;
588 __update_cpu_load(this_rq, load, 1);
589
590 calc_load_account_active(this_rq);
591}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 127a2c4cf4ab..01970c8e64df 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -399,20 +399,6 @@ static inline struct task_group *next_task_group(struct task_group *tg)
399 (iter = next_task_group(iter)) && \ 399 (iter = next_task_group(iter)) && \
400 (rt_rq = iter->rt_rq[cpu_of(rq)]);) 400 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
401 401
402static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
403{
404 list_add_rcu(&rt_rq->leaf_rt_rq_list,
405 &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
406}
407
408static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
409{
410 list_del_rcu(&rt_rq->leaf_rt_rq_list);
411}
412
413#define for_each_leaf_rt_rq(rt_rq, rq) \
414 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
415
416#define for_each_sched_rt_entity(rt_se) \ 402#define for_each_sched_rt_entity(rt_se) \
417 for (; rt_se; rt_se = rt_se->parent) 403 for (; rt_se; rt_se = rt_se->parent)
418 404
@@ -472,7 +458,7 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
472#ifdef CONFIG_SMP 458#ifdef CONFIG_SMP
473static inline const struct cpumask *sched_rt_period_mask(void) 459static inline const struct cpumask *sched_rt_period_mask(void)
474{ 460{
475 return cpu_rq(smp_processor_id())->rd->span; 461 return this_rq()->rd->span;
476} 462}
477#else 463#else
478static inline const struct cpumask *sched_rt_period_mask(void) 464static inline const struct cpumask *sched_rt_period_mask(void)
@@ -509,17 +495,6 @@ typedef struct rt_rq *rt_rq_iter_t;
509#define for_each_rt_rq(rt_rq, iter, rq) \ 495#define for_each_rt_rq(rt_rq, iter, rq) \
510 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 496 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
511 497
512static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
513{
514}
515
516static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
517{
518}
519
520#define for_each_leaf_rt_rq(rt_rq, rq) \
521 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
522
523#define for_each_sched_rt_entity(rt_se) \ 498#define for_each_sched_rt_entity(rt_se) \
524 for (; rt_se; rt_se = NULL) 499 for (; rt_se; rt_se = NULL)
525 500
@@ -699,15 +674,6 @@ balanced:
699 } 674 }
700} 675}
701 676
702static void disable_runtime(struct rq *rq)
703{
704 unsigned long flags;
705
706 raw_spin_lock_irqsave(&rq->lock, flags);
707 __disable_runtime(rq);
708 raw_spin_unlock_irqrestore(&rq->lock, flags);
709}
710
711static void __enable_runtime(struct rq *rq) 677static void __enable_runtime(struct rq *rq)
712{ 678{
713 rt_rq_iter_t iter; 679 rt_rq_iter_t iter;
@@ -732,37 +698,6 @@ static void __enable_runtime(struct rq *rq)
732 } 698 }
733} 699}
734 700
735static void enable_runtime(struct rq *rq)
736{
737 unsigned long flags;
738
739 raw_spin_lock_irqsave(&rq->lock, flags);
740 __enable_runtime(rq);
741 raw_spin_unlock_irqrestore(&rq->lock, flags);
742}
743
744int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
745{
746 int cpu = (int)(long)hcpu;
747
748 switch (action) {
749 case CPU_DOWN_PREPARE:
750 case CPU_DOWN_PREPARE_FROZEN:
751 disable_runtime(cpu_rq(cpu));
752 return NOTIFY_OK;
753
754 case CPU_DOWN_FAILED:
755 case CPU_DOWN_FAILED_FROZEN:
756 case CPU_ONLINE:
757 case CPU_ONLINE_FROZEN:
758 enable_runtime(cpu_rq(cpu));
759 return NOTIFY_OK;
760
761 default:
762 return NOTIFY_DONE;
763 }
764}
765
766static int balance_runtime(struct rt_rq *rt_rq) 701static int balance_runtime(struct rt_rq *rt_rq)
767{ 702{
768 int more = 0; 703 int more = 0;
@@ -926,7 +861,7 @@ static void update_curr_rt(struct rq *rq)
926 if (curr->sched_class != &rt_sched_class) 861 if (curr->sched_class != &rt_sched_class)
927 return; 862 return;
928 863
929 delta_exec = rq->clock_task - curr->se.exec_start; 864 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
930 if (unlikely((s64)delta_exec <= 0)) 865 if (unlikely((s64)delta_exec <= 0))
931 return; 866 return;
932 867
@@ -936,7 +871,7 @@ static void update_curr_rt(struct rq *rq)
936 curr->se.sum_exec_runtime += delta_exec; 871 curr->se.sum_exec_runtime += delta_exec;
937 account_group_exec_runtime(curr, delta_exec); 872 account_group_exec_runtime(curr, delta_exec);
938 873
939 curr->se.exec_start = rq->clock_task; 874 curr->se.exec_start = rq_clock_task(rq);
940 cpuacct_charge(curr, delta_exec); 875 cpuacct_charge(curr, delta_exec);
941 876
942 sched_rt_avg_update(rq, delta_exec); 877 sched_rt_avg_update(rq, delta_exec);
@@ -1106,9 +1041,6 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1106 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 1041 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
1107 return; 1042 return;
1108 1043
1109 if (!rt_rq->rt_nr_running)
1110 list_add_leaf_rt_rq(rt_rq);
1111
1112 if (head) 1044 if (head)
1113 list_add(&rt_se->run_list, queue); 1045 list_add(&rt_se->run_list, queue);
1114 else 1046 else
@@ -1128,8 +1060,6 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
1128 __clear_bit(rt_se_prio(rt_se), array->bitmap); 1060 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1129 1061
1130 dec_rt_tasks(rt_se, rt_rq); 1062 dec_rt_tasks(rt_se, rt_rq);
1131 if (!rt_rq->rt_nr_running)
1132 list_del_leaf_rt_rq(rt_rq);
1133} 1063}
1134 1064
1135/* 1065/*
@@ -1385,7 +1315,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1385 } while (rt_rq); 1315 } while (rt_rq);
1386 1316
1387 p = rt_task_of(rt_se); 1317 p = rt_task_of(rt_se);
1388 p->se.exec_start = rq->clock_task; 1318 p->se.exec_start = rq_clock_task(rq);
1389 1319
1390 return p; 1320 return p;
1391} 1321}
@@ -1434,42 +1364,24 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1434 return 0; 1364 return 0;
1435} 1365}
1436 1366
1437/* Return the second highest RT task, NULL otherwise */ 1367/*
1438static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) 1368 * Return the highest pushable rq's task, which is suitable to be executed
1369 * on the cpu, NULL otherwise
1370 */
1371static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1439{ 1372{
1440 struct task_struct *next = NULL; 1373 struct plist_head *head = &rq->rt.pushable_tasks;
1441 struct sched_rt_entity *rt_se; 1374 struct task_struct *p;
1442 struct rt_prio_array *array;
1443 struct rt_rq *rt_rq;
1444 int idx;
1445
1446 for_each_leaf_rt_rq(rt_rq, rq) {
1447 array = &rt_rq->active;
1448 idx = sched_find_first_bit(array->bitmap);
1449next_idx:
1450 if (idx >= MAX_RT_PRIO)
1451 continue;
1452 if (next && next->prio <= idx)
1453 continue;
1454 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1455 struct task_struct *p;
1456 1375
1457 if (!rt_entity_is_task(rt_se)) 1376 if (!has_pushable_tasks(rq))
1458 continue; 1377 return NULL;
1459 1378
1460 p = rt_task_of(rt_se); 1379 plist_for_each_entry(p, head, pushable_tasks) {
1461 if (pick_rt_task(rq, p, cpu)) { 1380 if (pick_rt_task(rq, p, cpu))
1462 next = p; 1381 return p;
1463 break;
1464 }
1465 }
1466 if (!next) {
1467 idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
1468 goto next_idx;
1469 }
1470 } 1382 }
1471 1383
1472 return next; 1384 return NULL;
1473} 1385}
1474 1386
1475static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); 1387static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
@@ -1743,12 +1655,10 @@ static int pull_rt_task(struct rq *this_rq)
1743 double_lock_balance(this_rq, src_rq); 1655 double_lock_balance(this_rq, src_rq);
1744 1656
1745 /* 1657 /*
1746 * Are there still pullable RT tasks? 1658 * We can pull only a task, which is pushable
1659 * on its rq, and no others.
1747 */ 1660 */
1748 if (src_rq->rt.rt_nr_running <= 1) 1661 p = pick_highest_pushable_task(src_rq, this_cpu);
1749 goto skip;
1750
1751 p = pick_next_highest_task_rt(src_rq, this_cpu);
1752 1662
1753 /* 1663 /*
1754 * Do we have an RT task that preempts 1664 * Do we have an RT task that preempts
@@ -2037,7 +1947,7 @@ static void set_curr_task_rt(struct rq *rq)
2037{ 1947{
2038 struct task_struct *p = rq->curr; 1948 struct task_struct *p = rq->curr;
2039 1949
2040 p->se.exec_start = rq->clock_task; 1950 p->se.exec_start = rq_clock_task(rq);
2041 1951
2042 /* The running task is never eligible for pushing */ 1952 /* The running task is never eligible for pushing */
2043 dequeue_pushable_task(rq, p); 1953 dequeue_pushable_task(rq, p);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ce39224d6155..ef0a7b2439dd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -10,8 +10,16 @@
10#include "cpupri.h" 10#include "cpupri.h"
11#include "cpuacct.h" 11#include "cpuacct.h"
12 12
13struct rq;
14
13extern __read_mostly int scheduler_running; 15extern __read_mostly int scheduler_running;
14 16
17extern unsigned long calc_load_update;
18extern atomic_long_t calc_load_tasks;
19
20extern long calc_load_fold_active(struct rq *this_rq);
21extern void update_cpu_load_active(struct rq *this_rq);
22
15/* 23/*
16 * Convert user-nice values [ -20 ... 0 ... 19 ] 24 * Convert user-nice values [ -20 ... 0 ... 19 ]
17 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 25 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -140,10 +148,11 @@ struct task_group {
140 struct cfs_rq **cfs_rq; 148 struct cfs_rq **cfs_rq;
141 unsigned long shares; 149 unsigned long shares;
142 150
143 atomic_t load_weight; 151#ifdef CONFIG_SMP
144 atomic64_t load_avg; 152 atomic_long_t load_avg;
145 atomic_t runnable_avg; 153 atomic_t runnable_avg;
146#endif 154#endif
155#endif
147 156
148#ifdef CONFIG_RT_GROUP_SCHED 157#ifdef CONFIG_RT_GROUP_SCHED
149 struct sched_rt_entity **rt_se; 158 struct sched_rt_entity **rt_se;
@@ -261,26 +270,21 @@ struct cfs_rq {
261#endif 270#endif
262 271
263#ifdef CONFIG_SMP 272#ifdef CONFIG_SMP
264/*
265 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
266 * removed when useful for applications beyond shares distribution (e.g.
267 * load-balance).
268 */
269#ifdef CONFIG_FAIR_GROUP_SCHED
270 /* 273 /*
271 * CFS Load tracking 274 * CFS Load tracking
272 * Under CFS, load is tracked on a per-entity basis and aggregated up. 275 * Under CFS, load is tracked on a per-entity basis and aggregated up.
273 * This allows for the description of both thread and group usage (in 276 * This allows for the description of both thread and group usage (in
274 * the FAIR_GROUP_SCHED case). 277 * the FAIR_GROUP_SCHED case).
275 */ 278 */
276 u64 runnable_load_avg, blocked_load_avg; 279 unsigned long runnable_load_avg, blocked_load_avg;
277 atomic64_t decay_counter, removed_load; 280 atomic64_t decay_counter;
278 u64 last_decay; 281 u64 last_decay;
279#endif /* CONFIG_FAIR_GROUP_SCHED */ 282 atomic_long_t removed_load;
280/* These always depend on CONFIG_FAIR_GROUP_SCHED */ 283
281#ifdef CONFIG_FAIR_GROUP_SCHED 284#ifdef CONFIG_FAIR_GROUP_SCHED
285 /* Required to track per-cpu representation of a task_group */
282 u32 tg_runnable_contrib; 286 u32 tg_runnable_contrib;
283 u64 tg_load_contrib; 287 unsigned long tg_load_contrib;
284#endif /* CONFIG_FAIR_GROUP_SCHED */ 288#endif /* CONFIG_FAIR_GROUP_SCHED */
285 289
286 /* 290 /*
@@ -353,7 +357,6 @@ struct rt_rq {
353 unsigned long rt_nr_boosted; 357 unsigned long rt_nr_boosted;
354 358
355 struct rq *rq; 359 struct rq *rq;
356 struct list_head leaf_rt_rq_list;
357 struct task_group *tg; 360 struct task_group *tg;
358#endif 361#endif
359}; 362};
@@ -540,6 +543,16 @@ DECLARE_PER_CPU(struct rq, runqueues);
540#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 543#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
541#define raw_rq() (&__raw_get_cpu_var(runqueues)) 544#define raw_rq() (&__raw_get_cpu_var(runqueues))
542 545
546static inline u64 rq_clock(struct rq *rq)
547{
548 return rq->clock;
549}
550
551static inline u64 rq_clock_task(struct rq *rq)
552{
553 return rq->clock_task;
554}
555
543#ifdef CONFIG_SMP 556#ifdef CONFIG_SMP
544 557
545#define rcu_dereference_check_sched_domain(p) \ 558#define rcu_dereference_check_sched_domain(p) \
@@ -884,24 +897,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
884#define WF_FORK 0x02 /* child wakeup after fork */ 897#define WF_FORK 0x02 /* child wakeup after fork */
885#define WF_MIGRATED 0x4 /* internal use, task got migrated */ 898#define WF_MIGRATED 0x4 /* internal use, task got migrated */
886 899
887static inline void update_load_add(struct load_weight *lw, unsigned long inc)
888{
889 lw->weight += inc;
890 lw->inv_weight = 0;
891}
892
893static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
894{
895 lw->weight -= dec;
896 lw->inv_weight = 0;
897}
898
899static inline void update_load_set(struct load_weight *lw, unsigned long w)
900{
901 lw->weight = w;
902 lw->inv_weight = 0;
903}
904
905/* 900/*
906 * To aid in avoiding the subversion of "niceness" due to uneven distribution 901 * To aid in avoiding the subversion of "niceness" due to uneven distribution
907 * of tasks with abnormal "nice" values across CPUs the contribution that 902 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1028,17 +1023,8 @@ extern void update_group_power(struct sched_domain *sd, int cpu);
1028extern void trigger_load_balance(struct rq *rq, int cpu); 1023extern void trigger_load_balance(struct rq *rq, int cpu);
1029extern void idle_balance(int this_cpu, struct rq *this_rq); 1024extern void idle_balance(int this_cpu, struct rq *this_rq);
1030 1025
1031/*
1032 * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
1033 * becomes useful in lb
1034 */
1035#if defined(CONFIG_FAIR_GROUP_SCHED)
1036extern void idle_enter_fair(struct rq *this_rq); 1026extern void idle_enter_fair(struct rq *this_rq);
1037extern void idle_exit_fair(struct rq *this_rq); 1027extern void idle_exit_fair(struct rq *this_rq);
1038#else
1039static inline void idle_enter_fair(struct rq *this_rq) {}
1040static inline void idle_exit_fair(struct rq *this_rq) {}
1041#endif
1042 1028
1043#else /* CONFIG_SMP */ 1029#else /* CONFIG_SMP */
1044 1030
@@ -1051,7 +1037,6 @@ static inline void idle_balance(int cpu, struct rq *rq)
1051extern void sysrq_sched_debug_show(void); 1037extern void sysrq_sched_debug_show(void);
1052extern void sched_init_granularity(void); 1038extern void sched_init_granularity(void);
1053extern void update_max_interval(void); 1039extern void update_max_interval(void);
1054extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
1055extern void init_sched_rt_class(void); 1040extern void init_sched_rt_class(void);
1056extern void init_sched_fair_class(void); 1041extern void init_sched_fair_class(void);
1057 1042
@@ -1063,6 +1048,8 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
1063 1048
1064extern void update_idle_cpu_load(struct rq *this_rq); 1049extern void update_idle_cpu_load(struct rq *this_rq);
1065 1050
1051extern void init_task_runnable_average(struct task_struct *p);
1052
1066#ifdef CONFIG_PARAVIRT 1053#ifdef CONFIG_PARAVIRT
1067static inline u64 steal_ticks(u64 steal) 1054static inline u64 steal_ticks(u64 steal)
1068{ 1055{
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 2ef90a51ec5e..5aef494fc8b4 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -61,7 +61,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
61 */ 61 */
62static inline void sched_info_dequeued(struct task_struct *t) 62static inline void sched_info_dequeued(struct task_struct *t)
63{ 63{
64 unsigned long long now = task_rq(t)->clock, delta = 0; 64 unsigned long long now = rq_clock(task_rq(t)), delta = 0;
65 65
66 if (unlikely(sched_info_on())) 66 if (unlikely(sched_info_on()))
67 if (t->sched_info.last_queued) 67 if (t->sched_info.last_queued)
@@ -79,7 +79,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
79 */ 79 */
80static void sched_info_arrive(struct task_struct *t) 80static void sched_info_arrive(struct task_struct *t)
81{ 81{
82 unsigned long long now = task_rq(t)->clock, delta = 0; 82 unsigned long long now = rq_clock(task_rq(t)), delta = 0;
83 83
84 if (t->sched_info.last_queued) 84 if (t->sched_info.last_queued)
85 delta = now - t->sched_info.last_queued; 85 delta = now - t->sched_info.last_queued;
@@ -100,7 +100,7 @@ static inline void sched_info_queued(struct task_struct *t)
100{ 100{
101 if (unlikely(sched_info_on())) 101 if (unlikely(sched_info_on()))
102 if (!t->sched_info.last_queued) 102 if (!t->sched_info.last_queued)
103 t->sched_info.last_queued = task_rq(t)->clock; 103 t->sched_info.last_queued = rq_clock(task_rq(t));
104} 104}
105 105
106/* 106/*
@@ -112,7 +112,7 @@ static inline void sched_info_queued(struct task_struct *t)
112 */ 112 */
113static inline void sched_info_depart(struct task_struct *t) 113static inline void sched_info_depart(struct task_struct *t)
114{ 114{
115 unsigned long long delta = task_rq(t)->clock - 115 unsigned long long delta = rq_clock(task_rq(t)) -
116 t->sched_info.last_arrival; 116 t->sched_info.last_arrival;
117 117
118 rq_sched_info_depart(task_rq(t), delta); 118 rq_sched_info_depart(task_rq(t), delta);
@@ -162,6 +162,39 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
162 */ 162 */
163 163
164/** 164/**
165 * cputimer_running - return true if cputimer is running
166 *
167 * @tsk: Pointer to target task.
168 */
169static inline bool cputimer_running(struct task_struct *tsk)
170
171{
172 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
173
174 if (!cputimer->running)
175 return false;
176
177 /*
178 * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
179 * in __exit_signal(), we won't account to the signal struct further
180 * cputime consumed by that task, even though the task can still be
181 * ticking after __exit_signal().
182 *
183 * In order to keep a consistent behaviour between thread group cputime
184 * and thread group cputimer accounting, lets also ignore the cputime
185 * elapsing after __exit_signal() in any thread group timer running.
186 *
187 * This makes sure that POSIX CPU clocks and timers are synchronized, so
188 * that a POSIX CPU timer won't expire while the corresponding POSIX CPU
189 * clock delta is behind the expiring timer value.
190 */
191 if (unlikely(!tsk->sighand))
192 return false;
193
194 return true;
195}
196
197/**
165 * account_group_user_time - Maintain utime for a thread group. 198 * account_group_user_time - Maintain utime for a thread group.
166 * 199 *
167 * @tsk: Pointer to task structure. 200 * @tsk: Pointer to task structure.
@@ -176,7 +209,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
176{ 209{
177 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 210 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
178 211
179 if (!cputimer->running) 212 if (!cputimer_running(tsk))
180 return; 213 return;
181 214
182 raw_spin_lock(&cputimer->lock); 215 raw_spin_lock(&cputimer->lock);
@@ -199,7 +232,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
199{ 232{
200 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 233 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
201 234
202 if (!cputimer->running) 235 if (!cputimer_running(tsk))
203 return; 236 return;
204 237
205 raw_spin_lock(&cputimer->lock); 238 raw_spin_lock(&cputimer->lock);
@@ -222,7 +255,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
222{ 255{
223 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 256 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
224 257
225 if (!cputimer->running) 258 if (!cputimer_running(tsk))
226 return; 259 return;
227 260
228 raw_spin_lock(&cputimer->lock); 261 raw_spin_lock(&cputimer->lock);
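The stats.h hunks do two things: reads of rq->clock and rq->clock_task are routed through the rq_clock()/rq_clock_task() accessors introduced elsewhere in this merge, and the repeated "if (!cputimer->running) return;" test in the three account_group_*() helpers is folded into cputimer_running(), which additionally bails out once the task's sighand has been torn down by __exit_signal(). Below is a minimal user-space sketch of that guard-helper refactor; the struct and function names are hypothetical stand-ins, not kernel APIs.

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for signal_struct / thread_group_cputimer. */
    struct group_cputimer {
        bool running;
        unsigned long long utime, stime, sum_exec;
    };

    struct task {
        struct group_cputimer *cputimer;  /* &tsk->signal->cputimer in the kernel */
        void *sighand;                    /* NULL once __exit_signal() has run */
    };

    /* One guard instead of three copies of the same test. */
    static bool cputimer_running(const struct task *t)
    {
        if (!t->cputimer->running)
            return false;
        /* Ignore ticks that arrive after the task left its thread group. */
        if (!t->sighand)
            return false;
        return true;
    }

    static void account_group_user_time(struct task *t, unsigned long long delta)
    {
        if (!cputimer_running(t))
            return;
        t->cputimer->utime += delta;      /* the kernel takes cputimer->lock here */
    }

    int main(void)
    {
        struct group_cputimer gc = { .running = true };
        struct task t = { .cputimer = &gc, .sighand = &gc };

        account_group_user_time(&t, 5);
        t.sighand = NULL;                 /* simulate __exit_signal() */
        account_group_user_time(&t, 7);   /* ignored by the guard */
        printf("accounted utime: %llu\n", gc.utime);   /* prints 5 */
        return 0;
    }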
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index da5eb5bed84a..e08fbeeb54b9 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -28,7 +28,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
28 struct task_struct *stop = rq->stop; 28 struct task_struct *stop = rq->stop;
29 29
30 if (stop && stop->on_rq) { 30 if (stop && stop->on_rq) {
31 stop->se.exec_start = rq->clock_task; 31 stop->se.exec_start = rq_clock_task(rq);
32 return stop; 32 return stop;
33 } 33 }
34 34
@@ -57,7 +57,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
57 struct task_struct *curr = rq->curr; 57 struct task_struct *curr = rq->curr;
58 u64 delta_exec; 58 u64 delta_exec;
59 59
60 delta_exec = rq->clock_task - curr->se.exec_start; 60 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
61 if (unlikely((s64)delta_exec < 0)) 61 if (unlikely((s64)delta_exec < 0))
62 delta_exec = 0; 62 delta_exec = 0;
63 63
@@ -67,7 +67,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
67 curr->se.sum_exec_runtime += delta_exec; 67 curr->se.sum_exec_runtime += delta_exec;
68 account_group_exec_runtime(curr, delta_exec); 68 account_group_exec_runtime(curr, delta_exec);
69 69
70 curr->se.exec_start = rq->clock_task; 70 curr->se.exec_start = rq_clock_task(rq);
71 cpuacct_charge(curr, delta_exec); 71 cpuacct_charge(curr, delta_exec);
72} 72}
73 73
@@ -79,7 +79,7 @@ static void set_curr_task_stop(struct rq *rq)
79{ 79{
80 struct task_struct *stop = rq->stop; 80 struct task_struct *stop = rq->stop;
81 81
82 stop->se.exec_start = rq->clock_task; 82 stop->se.exec_start = rq_clock_task(rq);
83} 83}
84 84
85static void switched_to_stop(struct rq *rq, struct task_struct *p) 85static void switched_to_stop(struct rq *rq, struct task_struct *p)
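The stop_task class changes are the same clock-accessor conversion: exec_start is stamped from rq_clock_task() when the stop task is picked, and put_prev_task_stop() charges the elapsed window against it, clamping a negative delta to zero before opening the next window. A user-space sketch of that snapshot/delta/clamp pattern, with CLOCK_MONOTONIC standing in for the runqueue clock:

    #include <stdio.h>
    #include <stdint.h>
    #include <time.h>

    /* Monotonic nanosecond timestamp; plays the role of rq_clock_task(). */
    static uint64_t now_ns(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
    }

    int main(void)
    {
        uint64_t exec_start = now_ns();   /* set when the task is picked */
        uint64_t sum_exec = 0;

        for (volatile int i = 0; i < 1000000; i++)
            ;                             /* ... the task runs ... */

        /* put_prev_task_stop()-style bookkeeping: charge the elapsed window. */
        int64_t delta = (int64_t)(now_ns() - exec_start);
        if (delta < 0)                    /* clock glitches are clamped, not charged */
            delta = 0;
        sum_exec += delta;
        exec_start = now_ns();            /* open the next accounting window */

        printf("charged %lld ns (total %llu)\n",
               (long long)delta, (unsigned long long)sum_exec);
        return 0;
    }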
diff --git a/kernel/signal.c b/kernel/signal.c
index 113411bfe8b1..50e41075ac77 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2848,7 +2848,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2848 recalc_sigpending(); 2848 recalc_sigpending();
2849 spin_unlock_irq(&tsk->sighand->siglock); 2849 spin_unlock_irq(&tsk->sighand->siglock);
2850 2850
2851 timeout = schedule_timeout_interruptible(timeout); 2851 timeout = freezable_schedule_timeout_interruptible(timeout);
2852 2852
2853 spin_lock_irq(&tsk->sighand->siglock); 2853 spin_lock_irq(&tsk->sighand->siglock);
2854 __set_task_blocked(tsk, &tsk->real_blocked); 2854 __set_task_blocked(tsk, &tsk->real_blocked);
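do_sigtimedwait() now sleeps via freezable_schedule_timeout_interruptible(), so a task parked in sigtimedwait() is treated as freezable during suspend instead of holding up the freezer. The sketch below only illustrates the wrapper idea of bracketing a sleep with "do not count me" / "count me again" hints; the counter and helper names are hypothetical and do not reflect the kernel's freezer internals.

    #include <stdio.h>
    #include <time.h>

    /* Hypothetical freezer bookkeeping: tasks the freezer must wait for. */
    static int tasks_blocking_suspend = 1;     /* this task, by default */

    static void freezer_do_not_count(void) { tasks_blocking_suspend--; }
    static void freezer_count(void)        { tasks_blocking_suspend++; }

    /* Sketch of the freezable_*() wrapper idea: the same sleep, bracketed by
     * hints telling the freezer it need not wait for this task. */
    static void freezable_sleep(unsigned int secs)
    {
        struct timespec ts = { .tv_sec = secs, .tv_nsec = 0 };

        freezer_do_not_count();
        printf("while sleeping, tasks blocking suspend: %d\n",
               tasks_blocking_suspend);
        nanosleep(&ts, NULL);   /* stands in for schedule_timeout_interruptible() */
        freezer_count();
    }

    int main(void)
    {
        freezable_sleep(1);
        printf("after waking,  tasks blocking suspend: %d\n",
               tasks_blocking_suspend);
        return 0;
    }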
diff --git a/kernel/smp.c b/kernel/smp.c
index 4dba0f7b72ad..fe9f773d7114 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -73,7 +73,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
73 return NOTIFY_OK; 73 return NOTIFY_OK;
74} 74}
75 75
76static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { 76static struct notifier_block hotplug_cfd_notifier = {
77 .notifier_call = hotplug_cfd, 77 .notifier_call = hotplug_cfd,
78}; 78};
79 79
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 02fc5c933673..eb89e1807408 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -24,7 +24,7 @@
24 */ 24 */
25static DEFINE_PER_CPU(struct task_struct *, idle_threads); 25static DEFINE_PER_CPU(struct task_struct *, idle_threads);
26 26
27struct task_struct * __cpuinit idle_thread_get(unsigned int cpu) 27struct task_struct *idle_thread_get(unsigned int cpu)
28{ 28{
29 struct task_struct *tsk = per_cpu(idle_threads, cpu); 29 struct task_struct *tsk = per_cpu(idle_threads, cpu);
30 30
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 3d6833f125d3..be3d3514c325 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -127,8 +127,7 @@ static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
127 127
128void local_bh_disable(void) 128void local_bh_disable(void)
129{ 129{
130 __local_bh_disable((unsigned long)__builtin_return_address(0), 130 __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET);
131 SOFTIRQ_DISABLE_OFFSET);
132} 131}
133 132
134EXPORT_SYMBOL(local_bh_disable); 133EXPORT_SYMBOL(local_bh_disable);
@@ -139,7 +138,7 @@ static void __local_bh_enable(unsigned int cnt)
139 WARN_ON_ONCE(!irqs_disabled()); 138 WARN_ON_ONCE(!irqs_disabled());
140 139
141 if (softirq_count() == cnt) 140 if (softirq_count() == cnt)
142 trace_softirqs_on((unsigned long)__builtin_return_address(0)); 141 trace_softirqs_on(_RET_IP_);
143 sub_preempt_count(cnt); 142 sub_preempt_count(cnt);
144} 143}
145 144
@@ -184,7 +183,7 @@ static inline void _local_bh_enable_ip(unsigned long ip)
184 183
185void local_bh_enable(void) 184void local_bh_enable(void)
186{ 185{
187 _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); 186 _local_bh_enable_ip(_RET_IP_);
188} 187}
189EXPORT_SYMBOL(local_bh_enable); 188EXPORT_SYMBOL(local_bh_enable);
190 189
@@ -229,8 +228,7 @@ asmlinkage void __do_softirq(void)
229 pending = local_softirq_pending(); 228 pending = local_softirq_pending();
230 account_irq_enter_time(current); 229 account_irq_enter_time(current);
231 230
232 __local_bh_disable((unsigned long)__builtin_return_address(0), 231 __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET);
233 SOFTIRQ_OFFSET);
234 lockdep_softirq_enter(); 232 lockdep_softirq_enter();
235 233
236 cpu = smp_processor_id(); 234 cpu = smp_processor_id();
@@ -701,7 +699,7 @@ void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
701} 699}
702EXPORT_SYMBOL(send_remote_softirq); 700EXPORT_SYMBOL(send_remote_softirq);
703 701
704static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, 702static int remote_softirq_cpu_notify(struct notifier_block *self,
705 unsigned long action, void *hcpu) 703 unsigned long action, void *hcpu)
706{ 704{
707 /* 705 /*
@@ -730,7 +728,7 @@ static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
730 return NOTIFY_OK; 728 return NOTIFY_OK;
731} 729}
732 730
733static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { 731static struct notifier_block remote_softirq_cpu_notifier = {
734 .notifier_call = remote_softirq_cpu_notify, 732 .notifier_call = remote_softirq_cpu_notify,
735}; 733};
736 734
@@ -832,7 +830,7 @@ static void takeover_tasklets(unsigned int cpu)
832} 830}
833#endif /* CONFIG_HOTPLUG_CPU */ 831#endif /* CONFIG_HOTPLUG_CPU */
834 832
835static int __cpuinit cpu_callback(struct notifier_block *nfb, 833static int cpu_callback(struct notifier_block *nfb,
836 unsigned long action, 834 unsigned long action,
837 void *hcpu) 835 void *hcpu)
838{ 836{
@@ -847,7 +845,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
847 return NOTIFY_OK; 845 return NOTIFY_OK;
848} 846}
849 847
850static struct notifier_block __cpuinitdata cpu_nfb = { 848static struct notifier_block cpu_nfb = {
851 .notifier_call = cpu_callback 849 .notifier_call = cpu_callback
852}; 850};
853 851
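Besides dropping the __cpuinit/__cpuinitdata annotations, the softirq.c hunks replace the open-coded (unsigned long)__builtin_return_address(0) with _RET_IP_, which is shorthand for exactly that expression. A standalone GCC/Clang example of the same idiom (RET_IP here is a local macro, not the kernel's):

    #include <stdio.h>

    /* Local equivalent of the kernel's _RET_IP_ shorthand. */
    #define RET_IP ((unsigned long)__builtin_return_address(0))

    /* noinline so the call site survives optimization and the address is real. */
    static __attribute__((noinline)) void trace_caller(const char *what)
    {
        printf("%s called from %#lx\n", what, RET_IP);
    }

    static void local_bh_disable_demo(void)
    {
        trace_caller("local_bh_disable_demo");
    }

    int main(void)
    {
        local_bh_disable_demo();
        trace_caller("main");
        return 0;
    }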
diff --git a/kernel/sys.c b/kernel/sys.c
index 2bbd9a73b54c..771129b299f8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -116,20 +116,6 @@ EXPORT_SYMBOL(fs_overflowuid);
116EXPORT_SYMBOL(fs_overflowgid); 116EXPORT_SYMBOL(fs_overflowgid);
117 117
118/* 118/*
119 * this indicates whether you can reboot with ctrl-alt-del: the default is yes
120 */
121
122int C_A_D = 1;
123struct pid *cad_pid;
124EXPORT_SYMBOL(cad_pid);
125
126/*
127 * If set, this is used for preparing the system to power off.
128 */
129
130void (*pm_power_off_prepare)(void);
131
132/*
133 * Returns true if current's euid is same as p's uid or euid, 119 * Returns true if current's euid is same as p's uid or euid,
134 * or has CAP_SYS_NICE to p's user_ns. 120 * or has CAP_SYS_NICE to p's user_ns.
135 * 121 *
@@ -308,266 +294,6 @@ out_unlock:
308 return retval; 294 return retval;
309} 295}
310 296
311/**
312 * emergency_restart - reboot the system
313 *
314 * Without shutting down any hardware or taking any locks
315 * reboot the system. This is called when we know we are in
316 * trouble so this is our best effort to reboot. This is
317 * safe to call in interrupt context.
318 */
319void emergency_restart(void)
320{
321 kmsg_dump(KMSG_DUMP_EMERG);
322 machine_emergency_restart();
323}
324EXPORT_SYMBOL_GPL(emergency_restart);
325
326void kernel_restart_prepare(char *cmd)
327{
328 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
329 system_state = SYSTEM_RESTART;
330 usermodehelper_disable();
331 device_shutdown();
332}
333
334/**
335 * register_reboot_notifier - Register function to be called at reboot time
336 * @nb: Info about notifier function to be called
337 *
338 * Registers a function with the list of functions
339 * to be called at reboot time.
340 *
341 * Currently always returns zero, as blocking_notifier_chain_register()
342 * always returns zero.
343 */
344int register_reboot_notifier(struct notifier_block *nb)
345{
346 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
347}
348EXPORT_SYMBOL(register_reboot_notifier);
349
350/**
351 * unregister_reboot_notifier - Unregister previously registered reboot notifier
352 * @nb: Hook to be unregistered
353 *
354 * Unregisters a previously registered reboot
355 * notifier function.
356 *
357 * Returns zero on success, or %-ENOENT on failure.
358 */
359int unregister_reboot_notifier(struct notifier_block *nb)
360{
361 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
362}
363EXPORT_SYMBOL(unregister_reboot_notifier);
364
365/* Add backwards compatibility for stable trees. */
366#ifndef PF_NO_SETAFFINITY
367#define PF_NO_SETAFFINITY PF_THREAD_BOUND
368#endif
369
370static void migrate_to_reboot_cpu(void)
371{
372 /* The boot cpu is always logical cpu 0 */
373 int cpu = 0;
374
375 cpu_hotplug_disable();
376
377 /* Make certain the cpu I'm about to reboot on is online */
378 if (!cpu_online(cpu))
379 cpu = cpumask_first(cpu_online_mask);
380
381 /* Prevent races with other tasks migrating this task */
382 current->flags |= PF_NO_SETAFFINITY;
383
384 /* Make certain I only run on the appropriate processor */
385 set_cpus_allowed_ptr(current, cpumask_of(cpu));
386}
387
388/**
389 * kernel_restart - reboot the system
390 * @cmd: pointer to buffer containing command to execute for restart
391 * or %NULL
392 *
393 * Shutdown everything and perform a clean reboot.
394 * This is not safe to call in interrupt context.
395 */
396void kernel_restart(char *cmd)
397{
398 kernel_restart_prepare(cmd);
399 migrate_to_reboot_cpu();
400 syscore_shutdown();
401 if (!cmd)
402 printk(KERN_EMERG "Restarting system.\n");
403 else
404 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
405 kmsg_dump(KMSG_DUMP_RESTART);
406 machine_restart(cmd);
407}
408EXPORT_SYMBOL_GPL(kernel_restart);
409
410static void kernel_shutdown_prepare(enum system_states state)
411{
412 blocking_notifier_call_chain(&reboot_notifier_list,
413 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
414 system_state = state;
415 usermodehelper_disable();
416 device_shutdown();
417}
418/**
419 * kernel_halt - halt the system
420 *
421 * Shutdown everything and perform a clean system halt.
422 */
423void kernel_halt(void)
424{
425 kernel_shutdown_prepare(SYSTEM_HALT);
426 migrate_to_reboot_cpu();
427 syscore_shutdown();
428 printk(KERN_EMERG "System halted.\n");
429 kmsg_dump(KMSG_DUMP_HALT);
430 machine_halt();
431}
432
433EXPORT_SYMBOL_GPL(kernel_halt);
434
435/**
436 * kernel_power_off - power_off the system
437 *
438 * Shutdown everything and perform a clean system power_off.
439 */
440void kernel_power_off(void)
441{
442 kernel_shutdown_prepare(SYSTEM_POWER_OFF);
443 if (pm_power_off_prepare)
444 pm_power_off_prepare();
445 migrate_to_reboot_cpu();
446 syscore_shutdown();
447 printk(KERN_EMERG "Power down.\n");
448 kmsg_dump(KMSG_DUMP_POWEROFF);
449 machine_power_off();
450}
451EXPORT_SYMBOL_GPL(kernel_power_off);
452
453static DEFINE_MUTEX(reboot_mutex);
454
455/*
456 * Reboot system call: for obvious reasons only root may call it,
457 * and even root needs to set up some magic numbers in the registers
458 * so that some mistake won't make this reboot the whole machine.
459 * You can also set the meaning of the ctrl-alt-del-key here.
460 *
461 * reboot doesn't sync: do that yourself before calling this.
462 */
463SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
464 void __user *, arg)
465{
466 struct pid_namespace *pid_ns = task_active_pid_ns(current);
467 char buffer[256];
468 int ret = 0;
469
470 /* We only trust the superuser with rebooting the system. */
471 if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
472 return -EPERM;
473
474 /* For safety, we require "magic" arguments. */
475 if (magic1 != LINUX_REBOOT_MAGIC1 ||
476 (magic2 != LINUX_REBOOT_MAGIC2 &&
477 magic2 != LINUX_REBOOT_MAGIC2A &&
478 magic2 != LINUX_REBOOT_MAGIC2B &&
479 magic2 != LINUX_REBOOT_MAGIC2C))
480 return -EINVAL;
481
482 /*
483 * If pid namespaces are enabled and the current task is in a child
484 * pid_namespace, the command is handled by reboot_pid_ns() which will
485 * call do_exit().
486 */
487 ret = reboot_pid_ns(pid_ns, cmd);
488 if (ret)
489 return ret;
490
491 /* Instead of trying to make the power_off code look like
492 * halt when pm_power_off is not set do it the easy way.
493 */
494 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
495 cmd = LINUX_REBOOT_CMD_HALT;
496
497 mutex_lock(&reboot_mutex);
498 switch (cmd) {
499 case LINUX_REBOOT_CMD_RESTART:
500 kernel_restart(NULL);
501 break;
502
503 case LINUX_REBOOT_CMD_CAD_ON:
504 C_A_D = 1;
505 break;
506
507 case LINUX_REBOOT_CMD_CAD_OFF:
508 C_A_D = 0;
509 break;
510
511 case LINUX_REBOOT_CMD_HALT:
512 kernel_halt();
513 do_exit(0);
514 panic("cannot halt");
515
516 case LINUX_REBOOT_CMD_POWER_OFF:
517 kernel_power_off();
518 do_exit(0);
519 break;
520
521 case LINUX_REBOOT_CMD_RESTART2:
522 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
523 ret = -EFAULT;
524 break;
525 }
526 buffer[sizeof(buffer) - 1] = '\0';
527
528 kernel_restart(buffer);
529 break;
530
531#ifdef CONFIG_KEXEC
532 case LINUX_REBOOT_CMD_KEXEC:
533 ret = kernel_kexec();
534 break;
535#endif
536
537#ifdef CONFIG_HIBERNATION
538 case LINUX_REBOOT_CMD_SW_SUSPEND:
539 ret = hibernate();
540 break;
541#endif
542
543 default:
544 ret = -EINVAL;
545 break;
546 }
547 mutex_unlock(&reboot_mutex);
548 return ret;
549}
550
551static void deferred_cad(struct work_struct *dummy)
552{
553 kernel_restart(NULL);
554}
555
556/*
557 * This function gets called by ctrl-alt-del - ie the keyboard interrupt.
558 * As it's called within an interrupt, it may NOT sync: the only choice
559 * is whether to reboot at once, or just ignore the ctrl-alt-del.
560 */
561void ctrl_alt_del(void)
562{
563 static DECLARE_WORK(cad_work, deferred_cad);
564
565 if (C_A_D)
566 schedule_work(&cad_work);
567 else
568 kill_cad_pid(SIGINT, 1);
569}
570
571/* 297/*
572 * Unprivileged users may change the real gid to the effective gid 298 * Unprivileged users may change the real gid to the effective gid
573 * or vice versa. (BSD-style) 299 * or vice versa. (BSD-style)
@@ -1309,6 +1035,17 @@ out:
1309 return retval; 1035 return retval;
1310} 1036}
1311 1037
1038static void set_special_pids(struct pid *pid)
1039{
1040 struct task_struct *curr = current->group_leader;
1041
1042 if (task_session(curr) != pid)
1043 change_pid(curr, PIDTYPE_SID, pid);
1044
1045 if (task_pgrp(curr) != pid)
1046 change_pid(curr, PIDTYPE_PGID, pid);
1047}
1048
1312SYSCALL_DEFINE0(setsid) 1049SYSCALL_DEFINE0(setsid)
1313{ 1050{
1314 struct task_struct *group_leader = current->group_leader; 1051 struct task_struct *group_leader = current->group_leader;
@@ -1328,7 +1065,7 @@ SYSCALL_DEFINE0(setsid)
1328 goto out; 1065 goto out;
1329 1066
1330 group_leader->signal->leader = 1; 1067 group_leader->signal->leader = 1;
1331 __set_special_pids(sid); 1068 set_special_pids(sid);
1332 1069
1333 proc_clear_tty(group_leader); 1070 proc_clear_tty(group_leader);
1334 1071
@@ -2281,68 +2018,6 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
2281 return err ? -EFAULT : 0; 2018 return err ? -EFAULT : 0;
2282} 2019}
2283 2020
2284char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
2285
2286static int __orderly_poweroff(bool force)
2287{
2288 char **argv;
2289 static char *envp[] = {
2290 "HOME=/",
2291 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
2292 NULL
2293 };
2294 int ret;
2295
2296 argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
2297 if (argv) {
2298 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
2299 argv_free(argv);
2300 } else {
2301 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2302 __func__, poweroff_cmd);
2303 ret = -ENOMEM;
2304 }
2305
2306 if (ret && force) {
2307 printk(KERN_WARNING "Failed to start orderly shutdown: "
2308 "forcing the issue\n");
2309 /*
2310 * I guess this should try to kick off some daemon to sync and
2311 * poweroff asap. Or not even bother syncing if we're doing an
2312 * emergency shutdown?
2313 */
2314 emergency_sync();
2315 kernel_power_off();
2316 }
2317
2318 return ret;
2319}
2320
2321static bool poweroff_force;
2322
2323static void poweroff_work_func(struct work_struct *work)
2324{
2325 __orderly_poweroff(poweroff_force);
2326}
2327
2328static DECLARE_WORK(poweroff_work, poweroff_work_func);
2329
2330/**
2331 * orderly_poweroff - Trigger an orderly system poweroff
2332 * @force: force poweroff if command execution fails
2333 *
2334 * This may be called from any context to trigger a system shutdown.
2335 * If the orderly shutdown fails, it will force an immediate shutdown.
2336 */
2337int orderly_poweroff(bool force)
2338{
2339 if (force) /* do not override the pending "true" */
2340 poweroff_force = true;
2341 schedule_work(&poweroff_work);
2342 return 0;
2343}
2344EXPORT_SYMBOL_GPL(orderly_poweroff);
2345
2346/** 2021/**
2347 * do_sysinfo - fill in sysinfo struct 2022 * do_sysinfo - fill in sysinfo struct
2348 * @info: pointer to buffer to fill 2023 * @info: pointer to buffer to fill
@@ -2355,8 +2030,7 @@ static int do_sysinfo(struct sysinfo *info)
2355 2030
2356 memset(info, 0, sizeof(struct sysinfo)); 2031 memset(info, 0, sizeof(struct sysinfo));
2357 2032
2358 ktime_get_ts(&tp); 2033 get_monotonic_boottime(&tp);
2359 monotonic_to_bootbased(&tp);
2360 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); 2034 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
2361 2035
2362 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); 2036 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
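The bulk of the sys.c diff removes the reboot, ctrl-alt-del and orderly_poweroff machinery, which this merge moves into the new kernel/reboot.c; set_special_pids() moves in as a static helper for setsid(); and do_sysinfo() now reads the boot-based clock via get_monotonic_boottime() instead of ktime_get_ts() plus monotonic_to_bootbased(), rounding any leftover nanoseconds up to the next full second. A small Linux user-space analogue of that uptime computation, using CLOCK_BOOTTIME as the boot-based clock:

    #include <stdio.h>
    #include <time.h>

    #ifndef CLOCK_BOOTTIME                 /* fallback for very old headers */
    #define CLOCK_BOOTTIME CLOCK_MONOTONIC
    #endif

    int main(void)
    {
        struct timespec tp;

        if (clock_gettime(CLOCK_BOOTTIME, &tp) != 0) {
            perror("clock_gettime");
            return 1;
        }

        /* Same rounding as the hunk above: leftover nanoseconds bump the
         * reported uptime to the next full second. */
        long long uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
        printf("uptime: %lld s (raw %lld.%09ld)\n",
               uptime, (long long)tp.tv_sec, tp.tv_nsec);
        return 0;
    }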
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9edcf456e0fc..07f6fc468e17 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -120,7 +120,6 @@ extern int blk_iopoll_enabled;
120/* Constants used for minimum and maximum */ 120/* Constants used for minimum and maximum */
121#ifdef CONFIG_LOCKUP_DETECTOR 121#ifdef CONFIG_LOCKUP_DETECTOR
122static int sixty = 60; 122static int sixty = 60;
123static int neg_one = -1;
124#endif 123#endif
125 124
126static int zero; 125static int zero;
@@ -600,6 +599,13 @@ static struct ctl_table kern_table[] = {
600 .mode = 0644, 599 .mode = 0644,
601 .proc_handler = proc_dointvec, 600 .proc_handler = proc_dointvec,
602 }, 601 },
602 {
603 .procname = "traceoff_on_warning",
604 .data = &__disable_trace_on_warning,
605 .maxlen = sizeof(__disable_trace_on_warning),
606 .mode = 0644,
607 .proc_handler = proc_dointvec,
608 },
603#endif 609#endif
604#ifdef CONFIG_MODULES 610#ifdef CONFIG_MODULES
605 { 611 {
@@ -801,7 +807,7 @@ static struct ctl_table kern_table[] = {
801#if defined(CONFIG_LOCKUP_DETECTOR) 807#if defined(CONFIG_LOCKUP_DETECTOR)
802 { 808 {
803 .procname = "watchdog", 809 .procname = "watchdog",
804 .data = &watchdog_enabled, 810 .data = &watchdog_user_enabled,
805 .maxlen = sizeof (int), 811 .maxlen = sizeof (int),
806 .mode = 0644, 812 .mode = 0644,
807 .proc_handler = proc_dowatchdog, 813 .proc_handler = proc_dowatchdog,
@@ -814,7 +820,7 @@ static struct ctl_table kern_table[] = {
814 .maxlen = sizeof(int), 820 .maxlen = sizeof(int),
815 .mode = 0644, 821 .mode = 0644,
816 .proc_handler = proc_dowatchdog, 822 .proc_handler = proc_dowatchdog,
817 .extra1 = &neg_one, 823 .extra1 = &zero,
818 .extra2 = &sixty, 824 .extra2 = &sixty,
819 }, 825 },
820 { 826 {
@@ -828,7 +834,7 @@ static struct ctl_table kern_table[] = {
828 }, 834 },
829 { 835 {
830 .procname = "nmi_watchdog", 836 .procname = "nmi_watchdog",
831 .data = &watchdog_enabled, 837 .data = &watchdog_user_enabled,
832 .maxlen = sizeof (int), 838 .maxlen = sizeof (int),
833 .mode = 0644, 839 .mode = 0644,
834 .proc_handler = proc_dowatchdog, 840 .proc_handler = proc_dowatchdog,
@@ -1044,6 +1050,15 @@ static struct ctl_table kern_table[] = {
1044 .mode = 0644, 1050 .mode = 0644,
1045 .proc_handler = perf_proc_update_handler, 1051 .proc_handler = perf_proc_update_handler,
1046 }, 1052 },
1053 {
1054 .procname = "perf_cpu_time_max_percent",
1055 .data = &sysctl_perf_cpu_time_max_percent,
1056 .maxlen = sizeof(sysctl_perf_cpu_time_max_percent),
1057 .mode = 0644,
1058 .proc_handler = perf_cpu_time_max_percent_handler,
1059 .extra1 = &zero,
1060 .extra2 = &one_hundred,
1061 },
1047#endif 1062#endif
1048#ifdef CONFIG_KMEMCHECK 1063#ifdef CONFIG_KMEMCHECK
1049 { 1064 {
@@ -2331,7 +2346,11 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
2331 int write, void *data) 2346 int write, void *data)
2332{ 2347{
2333 if (write) { 2348 if (write) {
2334 *valp = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); 2349 unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
2350
2351 if (jif > INT_MAX)
2352 return 1;
2353 *valp = (int)jif;
2335 } else { 2354 } else {
2336 int val = *valp; 2355 int val = *valp;
2337 unsigned long lval; 2356 unsigned long lval;
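The sysctl table gains traceoff_on_warning and perf_cpu_time_max_percent (range-clamped to 0..100 via extra1/extra2), the watchdog entries now point at watchdog_user_enabled with a 0..60 threshold range, and the millisecond-to-jiffies write path rejects values that would overflow the int-sized slot instead of silently truncating them. A sketch of that overflow guard; HZ and the simplified conversion are assumptions for the example, not the kernel's msecs_to_jiffies():

    #include <stdio.h>
    #include <limits.h>

    /* Hypothetical tick rate for the sketch; the kernel uses its CONFIG_HZ. */
    #define HZ 1000

    /* Mirror of the do_proc_dointvec_ms_jiffies_conv() write path: convert
     * milliseconds to jiffies, but refuse values that no longer fit in the
     * int-sized sysctl slot. Returns 0 on success, -1 on overflow. */
    static int ms_to_jiffies_checked(unsigned long msecs, int *out)
    {
        unsigned long jif = msecs / (1000UL / HZ);   /* simplified conversion */

        if (jif > INT_MAX)
            return -1;
        *out = (int)jif;
        return 0;
    }

    int main(void)
    {
        int val;

        if (ms_to_jiffies_checked(4000, &val) == 0)
            printf("4000 ms -> %d jiffies\n", val);

        if (ms_to_jiffies_checked(ULONG_MAX, &val) != 0)
            printf("huge value rejected instead of being truncated\n");
        return 0;
    }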
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index aea4a9ea6fc8..b609213ca9a2 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -3,7 +3,6 @@
3#include "../fs/xfs/xfs_sysctl.h" 3#include "../fs/xfs/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h> 4#include <linux/sunrpc/debug.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <net/ip_vs.h>
7#include <linux/syscalls.h> 6#include <linux/syscalls.h>
8#include <linux/namei.h> 7#include <linux/namei.h>
9#include <linux/mount.h> 8#include <linux/mount.h>
diff --git a/kernel/time.c b/kernel/time.c
index d3617dbd3dca..7c7964c33ae7 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -11,7 +11,7 @@
11 * Modification history kernel/time.c 11 * Modification history kernel/time.c
12 * 12 *
13 * 1993-09-02 Philip Gladstone 13 * 1993-09-02 Philip Gladstone
14 * Created file with time related functions from sched.c and adjtimex() 14 * Created file with time related functions from sched/core.c and adjtimex()
15 * 1993-10-08 Torsten Duwe 15 * 1993-10-08 Torsten Duwe
16 * adjtime interface update and CMOS clock write code 16 * adjtime interface update and CMOS clock write code
17 * 1995-08-13 Torsten Duwe 17 * 1995-08-13 Torsten Duwe
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ff7d9d2ab504..9250130646f5 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -4,6 +4,8 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
6obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o 6obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
7obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
7obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o 8obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
8obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o 9obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
9obj-$(CONFIG_TIMER_STATS) += timer_stats.o 10obj-$(CONFIG_TIMER_STATS) += timer_stats.o
11obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index f11d83b12949..eec50fcef9e4 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -199,6 +199,13 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
199 199
200} 200}
201 201
202ktime_t alarm_expires_remaining(const struct alarm *alarm)
203{
204 struct alarm_base *base = &alarm_bases[alarm->type];
205 return ktime_sub(alarm->node.expires, base->gettime());
206}
207EXPORT_SYMBOL_GPL(alarm_expires_remaining);
208
202#ifdef CONFIG_RTC_CLASS 209#ifdef CONFIG_RTC_CLASS
203/** 210/**
204 * alarmtimer_suspend - Suspend time callback 211 * alarmtimer_suspend - Suspend time callback
@@ -303,9 +310,10 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
303 alarm->type = type; 310 alarm->type = type;
304 alarm->state = ALARMTIMER_STATE_INACTIVE; 311 alarm->state = ALARMTIMER_STATE_INACTIVE;
305} 312}
313EXPORT_SYMBOL_GPL(alarm_init);
306 314
307/** 315/**
308 * alarm_start - Sets an alarm to fire 316 * alarm_start - Sets an absolute alarm to fire
309 * @alarm: ptr to alarm to set 317 * @alarm: ptr to alarm to set
310 * @start: time to run the alarm 318 * @start: time to run the alarm
311 */ 319 */
@@ -323,6 +331,34 @@ int alarm_start(struct alarm *alarm, ktime_t start)
323 spin_unlock_irqrestore(&base->lock, flags); 331 spin_unlock_irqrestore(&base->lock, flags);
324 return ret; 332 return ret;
325} 333}
334EXPORT_SYMBOL_GPL(alarm_start);
335
336/**
337 * alarm_start_relative - Sets a relative alarm to fire
338 * @alarm: ptr to alarm to set
339 * @start: time relative to now to run the alarm
340 */
341int alarm_start_relative(struct alarm *alarm, ktime_t start)
342{
343 struct alarm_base *base = &alarm_bases[alarm->type];
344
345 start = ktime_add(start, base->gettime());
346 return alarm_start(alarm, start);
347}
348EXPORT_SYMBOL_GPL(alarm_start_relative);
349
350void alarm_restart(struct alarm *alarm)
351{
352 struct alarm_base *base = &alarm_bases[alarm->type];
353 unsigned long flags;
354
355 spin_lock_irqsave(&base->lock, flags);
356 hrtimer_set_expires(&alarm->timer, alarm->node.expires);
357 hrtimer_restart(&alarm->timer);
358 alarmtimer_enqueue(base, alarm);
359 spin_unlock_irqrestore(&base->lock, flags);
360}
361EXPORT_SYMBOL_GPL(alarm_restart);
326 362
327/** 363/**
328 * alarm_try_to_cancel - Tries to cancel an alarm timer 364 * alarm_try_to_cancel - Tries to cancel an alarm timer
@@ -344,6 +380,7 @@ int alarm_try_to_cancel(struct alarm *alarm)
344 spin_unlock_irqrestore(&base->lock, flags); 380 spin_unlock_irqrestore(&base->lock, flags);
345 return ret; 381 return ret;
346} 382}
383EXPORT_SYMBOL_GPL(alarm_try_to_cancel);
347 384
348 385
349/** 386/**
@@ -361,6 +398,7 @@ int alarm_cancel(struct alarm *alarm)
361 cpu_relax(); 398 cpu_relax();
362 } 399 }
363} 400}
401EXPORT_SYMBOL_GPL(alarm_cancel);
364 402
365 403
366u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) 404u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
@@ -393,8 +431,15 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
393 alarm->node.expires = ktime_add(alarm->node.expires, interval); 431 alarm->node.expires = ktime_add(alarm->node.expires, interval);
394 return overrun; 432 return overrun;
395} 433}
434EXPORT_SYMBOL_GPL(alarm_forward);
396 435
436u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
437{
438 struct alarm_base *base = &alarm_bases[alarm->type];
397 439
440 return alarm_forward(alarm, base->gettime(), interval);
441}
442EXPORT_SYMBOL_GPL(alarm_forward_now);
398 443
399 444
400/** 445/**
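The alarmtimer changes export the existing alarm_* API for modular users and add relative-time conveniences: alarm_start_relative() turns a delta into an absolute expiry by adding base->gettime(), alarm_forward_now() forwards from the current base time, and alarm_expires_remaining() reports the time left. A user-space sketch of that relative/absolute bookkeeping; the ktime_ns type and the *_sketch helpers are illustrative only:

    #include <stdio.h>
    #include <stdint.h>
    #include <time.h>

    typedef int64_t ktime_ns;              /* ktime-style nanosecond timestamps */

    /* Plays the role of base->gettime() for an ALARM_REALTIME-like base. */
    static ktime_ns base_gettime(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_REALTIME, &ts);
        return (ktime_ns)ts.tv_sec * 1000000000ll + ts.tv_nsec;
    }

    struct alarm_sketch {
        ktime_ns expires;                   /* absolute expiry, like alarm->node.expires */
    };

    /* alarm_start_relative() pattern: relative delta -> absolute expiry. */
    static void alarm_start_relative_sketch(struct alarm_sketch *a, ktime_ns delta)
    {
        a->expires = base_gettime() + delta;
    }

    /* alarm_expires_remaining() pattern: absolute expiry -> time left. */
    static ktime_ns alarm_expires_remaining_sketch(const struct alarm_sketch *a)
    {
        return a->expires - base_gettime();
    }

    int main(void)
    {
        struct alarm_sketch a;

        alarm_start_relative_sketch(&a, 2 * 1000000000ll);   /* 2 s from now */
        printf("remaining: about %lld ns\n",
               (long long)alarm_expires_remaining_sketch(&a));
        return 0;
    }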
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index c6d6400ee137..38959c866789 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -15,20 +15,23 @@
15#include <linux/hrtimer.h> 15#include <linux/hrtimer.h>
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/notifier.h>
19#include <linux/smp.h> 18#include <linux/smp.h>
19#include <linux/device.h>
20 20
21#include "tick-internal.h" 21#include "tick-internal.h"
22 22
23/* The registered clock event devices */ 23/* The registered clock event devices */
24static LIST_HEAD(clockevent_devices); 24static LIST_HEAD(clockevent_devices);
25static LIST_HEAD(clockevents_released); 25static LIST_HEAD(clockevents_released);
26
27/* Notification for clock events */
28static RAW_NOTIFIER_HEAD(clockevents_chain);
29
30/* Protection for the above */ 26/* Protection for the above */
31static DEFINE_RAW_SPINLOCK(clockevents_lock); 27static DEFINE_RAW_SPINLOCK(clockevents_lock);
28/* Protection for unbind operations */
29static DEFINE_MUTEX(clockevents_mutex);
30
31struct ce_unbind {
32 struct clock_event_device *ce;
33 int res;
34};
32 35
33/** 36/**
34 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds 37 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
@@ -232,47 +235,107 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
232 return (rc && force) ? clockevents_program_min_delta(dev) : rc; 235 return (rc && force) ? clockevents_program_min_delta(dev) : rc;
233} 236}
234 237
235/** 238/*
236 * clockevents_register_notifier - register a clock events change listener 239 * Called after a notify add to make devices available which were
240 * released from the notifier call.
237 */ 241 */
238int clockevents_register_notifier(struct notifier_block *nb) 242static void clockevents_notify_released(void)
239{ 243{
240 unsigned long flags; 244 struct clock_event_device *dev;
241 int ret;
242 245
243 raw_spin_lock_irqsave(&clockevents_lock, flags); 246 while (!list_empty(&clockevents_released)) {
244 ret = raw_notifier_chain_register(&clockevents_chain, nb); 247 dev = list_entry(clockevents_released.next,
245 raw_spin_unlock_irqrestore(&clockevents_lock, flags); 248 struct clock_event_device, list);
249 list_del(&dev->list);
250 list_add(&dev->list, &clockevent_devices);
251 tick_check_new_device(dev);
252 }
253}
246 254
247 return ret; 255/*
256 * Try to install a replacement clock event device
257 */
258static int clockevents_replace(struct clock_event_device *ced)
259{
260 struct clock_event_device *dev, *newdev = NULL;
261
262 list_for_each_entry(dev, &clockevent_devices, list) {
263 if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED)
264 continue;
265
266 if (!tick_check_replacement(newdev, dev))
267 continue;
268
269 if (!try_module_get(dev->owner))
270 continue;
271
272 if (newdev)
273 module_put(newdev->owner);
274 newdev = dev;
275 }
276 if (newdev) {
277 tick_install_replacement(newdev);
278 list_del_init(&ced->list);
279 }
280 return newdev ? 0 : -EBUSY;
248} 281}
249 282
250/* 283/*
251 * Notify about a clock event change. Called with clockevents_lock 284 * Called with clockevents_mutex and clockevents_lock held
252 * held.
253 */ 285 */
254static void clockevents_do_notify(unsigned long reason, void *dev) 286static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu)
255{ 287{
256 raw_notifier_call_chain(&clockevents_chain, reason, dev); 288 /* Fast track. Device is unused */
289 if (ced->mode == CLOCK_EVT_MODE_UNUSED) {
290 list_del_init(&ced->list);
291 return 0;
292 }
293
294 return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY;
257} 295}
258 296
259/* 297/*
260 * Called after a notify add to make devices available which were 298 * SMP function call to unbind a device
261 * released from the notifier call.
262 */ 299 */
263static void clockevents_notify_released(void) 300static void __clockevents_unbind(void *arg)
264{ 301{
265 struct clock_event_device *dev; 302 struct ce_unbind *cu = arg;
303 int res;
304
305 raw_spin_lock(&clockevents_lock);
306 res = __clockevents_try_unbind(cu->ce, smp_processor_id());
307 if (res == -EAGAIN)
308 res = clockevents_replace(cu->ce);
309 cu->res = res;
310 raw_spin_unlock(&clockevents_lock);
311}
266 312
267 while (!list_empty(&clockevents_released)) { 313/*
268 dev = list_entry(clockevents_released.next, 314 * Issues smp function call to unbind a per cpu device. Called with
269 struct clock_event_device, list); 315 * clockevents_mutex held.
270 list_del(&dev->list); 316 */
271 list_add(&dev->list, &clockevent_devices); 317static int clockevents_unbind(struct clock_event_device *ced, int cpu)
272 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 318{
273 } 319 struct ce_unbind cu = { .ce = ced, .res = -ENODEV };
320
321 smp_call_function_single(cpu, __clockevents_unbind, &cu, 1);
322 return cu.res;
274} 323}
275 324
325/*
326 * Unbind a clockevents device.
327 */
328int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
329{
330 int ret;
331
332 mutex_lock(&clockevents_mutex);
333 ret = clockevents_unbind(ced, cpu);
334 mutex_unlock(&clockevents_mutex);
335 return ret;
336}
 337EXPORT_SYMBOL_GPL(clockevents_unbind_device);
338
276/** 339/**
277 * clockevents_register_device - register a clock event device 340 * clockevents_register_device - register a clock event device
278 * @dev: device to register 341 * @dev: device to register
@@ -290,7 +353,7 @@ void clockevents_register_device(struct clock_event_device *dev)
290 raw_spin_lock_irqsave(&clockevents_lock, flags); 353 raw_spin_lock_irqsave(&clockevents_lock, flags);
291 354
292 list_add(&dev->list, &clockevent_devices); 355 list_add(&dev->list, &clockevent_devices);
293 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 356 tick_check_new_device(dev);
294 clockevents_notify_released(); 357 clockevents_notify_released();
295 358
296 raw_spin_unlock_irqrestore(&clockevents_lock, flags); 359 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
@@ -386,6 +449,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
386 * released list and do a notify add later. 449 * released list and do a notify add later.
387 */ 450 */
388 if (old) { 451 if (old) {
452 module_put(old->owner);
389 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); 453 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
390 list_del(&old->list); 454 list_del(&old->list);
391 list_add(&old->list, &clockevents_released); 455 list_add(&old->list, &clockevents_released);
@@ -433,10 +497,36 @@ void clockevents_notify(unsigned long reason, void *arg)
433 int cpu; 497 int cpu;
434 498
435 raw_spin_lock_irqsave(&clockevents_lock, flags); 499 raw_spin_lock_irqsave(&clockevents_lock, flags);
436 clockevents_do_notify(reason, arg);
437 500
438 switch (reason) { 501 switch (reason) {
502 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
503 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
504 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
505 tick_broadcast_on_off(reason, arg);
506 break;
507
508 case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
509 case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
510 tick_broadcast_oneshot_control(reason);
511 break;
512
513 case CLOCK_EVT_NOTIFY_CPU_DYING:
514 tick_handover_do_timer(arg);
515 break;
516
517 case CLOCK_EVT_NOTIFY_SUSPEND:
518 tick_suspend();
519 tick_suspend_broadcast();
520 break;
521
522 case CLOCK_EVT_NOTIFY_RESUME:
523 tick_resume();
524 break;
525
439 case CLOCK_EVT_NOTIFY_CPU_DEAD: 526 case CLOCK_EVT_NOTIFY_CPU_DEAD:
527 tick_shutdown_broadcast_oneshot(arg);
528 tick_shutdown_broadcast(arg);
529 tick_shutdown(arg);
440 /* 530 /*
441 * Unregister the clock event devices which were 531 * Unregister the clock event devices which were
442 * released from the users in the notify chain. 532 * released from the users in the notify chain.
@@ -462,4 +552,123 @@ void clockevents_notify(unsigned long reason, void *arg)
462 raw_spin_unlock_irqrestore(&clockevents_lock, flags); 552 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
463} 553}
464EXPORT_SYMBOL_GPL(clockevents_notify); 554EXPORT_SYMBOL_GPL(clockevents_notify);
555
556#ifdef CONFIG_SYSFS
557struct bus_type clockevents_subsys = {
558 .name = "clockevents",
559 .dev_name = "clockevent",
560};
561
562static DEFINE_PER_CPU(struct device, tick_percpu_dev);
563static struct tick_device *tick_get_tick_dev(struct device *dev);
564
565static ssize_t sysfs_show_current_tick_dev(struct device *dev,
566 struct device_attribute *attr,
567 char *buf)
568{
569 struct tick_device *td;
570 ssize_t count = 0;
571
572 raw_spin_lock_irq(&clockevents_lock);
573 td = tick_get_tick_dev(dev);
574 if (td && td->evtdev)
575 count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name);
576 raw_spin_unlock_irq(&clockevents_lock);
577 return count;
578}
579static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL);
580
581/* We don't support the abomination of removable broadcast devices */
582static ssize_t sysfs_unbind_tick_dev(struct device *dev,
583 struct device_attribute *attr,
584 const char *buf, size_t count)
585{
586 char name[CS_NAME_LEN];
587 size_t ret = sysfs_get_uname(buf, name, count);
588 struct clock_event_device *ce;
589
590 if (ret < 0)
591 return ret;
592
593 ret = -ENODEV;
594 mutex_lock(&clockevents_mutex);
595 raw_spin_lock_irq(&clockevents_lock);
596 list_for_each_entry(ce, &clockevent_devices, list) {
597 if (!strcmp(ce->name, name)) {
598 ret = __clockevents_try_unbind(ce, dev->id);
599 break;
600 }
601 }
602 raw_spin_unlock_irq(&clockevents_lock);
603 /*
604 * We hold clockevents_mutex, so ce can't go away
605 */
606 if (ret == -EAGAIN)
607 ret = clockevents_unbind(ce, dev->id);
608 mutex_unlock(&clockevents_mutex);
609 return ret ? ret : count;
610}
611static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev);
612
613#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
614static struct device tick_bc_dev = {
615 .init_name = "broadcast",
616 .id = 0,
617 .bus = &clockevents_subsys,
618};
619
620static struct tick_device *tick_get_tick_dev(struct device *dev)
621{
622 return dev == &tick_bc_dev ? tick_get_broadcast_device() :
623 &per_cpu(tick_cpu_device, dev->id);
624}
625
626static __init int tick_broadcast_init_sysfs(void)
627{
628 int err = device_register(&tick_bc_dev);
629
630 if (!err)
631 err = device_create_file(&tick_bc_dev, &dev_attr_current_device);
632 return err;
633}
634#else
635static struct tick_device *tick_get_tick_dev(struct device *dev)
636{
637 return &per_cpu(tick_cpu_device, dev->id);
638}
639static inline int tick_broadcast_init_sysfs(void) { return 0; }
465#endif 640#endif
641
642static int __init tick_init_sysfs(void)
643{
644 int cpu;
645
646 for_each_possible_cpu(cpu) {
647 struct device *dev = &per_cpu(tick_percpu_dev, cpu);
648 int err;
649
650 dev->id = cpu;
651 dev->bus = &clockevents_subsys;
652 err = device_register(dev);
653 if (!err)
654 err = device_create_file(dev, &dev_attr_current_device);
655 if (!err)
656 err = device_create_file(dev, &dev_attr_unbind_device);
657 if (err)
658 return err;
659 }
660 return tick_broadcast_init_sysfs();
661}
662
663static int __init clockevents_init_sysfs(void)
664{
665 int err = subsys_system_register(&clockevents_subsys, NULL);
666
667 if (!err)
668 err = tick_init_sysfs();
669 return err;
670}
671device_initcall(clockevents_init_sysfs);
672#endif /* SYSFS */
673
674#endif /* GENERIC_CLOCK_EVENTS */
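This rewrite drops the clockevents notifier chain: clockevents_notify() now calls the tick_* handlers directly, and a new unbind path lets a clock event device be detached at runtime, either via clockevents_unbind_device() or the per-CPU unbind_device sysfs attribute. Unbinding succeeds immediately if the device is unused; if it is the active per-CPU device, clockevents_replace() first tries to install a spare one. The sketch below models that decision with a plain array and picks the replacement purely by rating; in the kernel the choice additionally goes through tick_check_replacement() and module refcounting, and it runs on the target CPU via smp_call_function_single().

    #include <stdio.h>

    /* Hypothetical miniature of the replacement logic: an "unbind" succeeds
     * only if the device is unused or another device can take over. */
    struct ce_dev {
        const char *name;
        int rating;
        int in_use;          /* stands in for mode != CLOCK_EVT_MODE_UNUSED */
    };

    static struct ce_dev devices[] = {
        { "lapic-deadline", 400, 1 },   /* currently active */
        { "hpet",           250, 0 },
        { "pit",            110, 0 },
    };
    static struct ce_dev *active = &devices[0];

    static int unbind(struct ce_dev *ced)
    {
        struct ce_dev *best = NULL;
        size_t i;

        if (!ced->in_use)                    /* fast path: device is unused */
            return 0;
        if (ced != active)                   /* busy for some other reason */
            return -1;

        /* clockevents_replace() pattern: pick the best-rated spare device. */
        for (i = 0; i < sizeof(devices) / sizeof(devices[0]); i++) {
            if (&devices[i] == ced || devices[i].in_use)
                continue;
            if (!best || devices[i].rating > best->rating)
                best = &devices[i];
        }
        if (!best)
            return -1;                       /* -EBUSY in the kernel */

        ced->in_use = 0;
        best->in_use = 1;
        active = best;
        return 0;
    }

    int main(void)
    {
        if (unbind(&devices[0]) == 0)
            printf("unbound lapic-deadline, now using %s\n", active->name);
        return 0;
    }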
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c9583382141a..50a8736757f3 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -31,6 +31,8 @@
31#include <linux/tick.h> 31#include <linux/tick.h>
32#include <linux/kthread.h> 32#include <linux/kthread.h>
33 33
34#include "tick-internal.h"
35
34void timecounter_init(struct timecounter *tc, 36void timecounter_init(struct timecounter *tc,
35 const struct cyclecounter *cc, 37 const struct cyclecounter *cc,
36 u64 start_tstamp) 38 u64 start_tstamp)
@@ -174,11 +176,12 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
174static struct clocksource *curr_clocksource; 176static struct clocksource *curr_clocksource;
175static LIST_HEAD(clocksource_list); 177static LIST_HEAD(clocksource_list);
176static DEFINE_MUTEX(clocksource_mutex); 178static DEFINE_MUTEX(clocksource_mutex);
177static char override_name[32]; 179static char override_name[CS_NAME_LEN];
178static int finished_booting; 180static int finished_booting;
179 181
180#ifdef CONFIG_CLOCKSOURCE_WATCHDOG 182#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
181static void clocksource_watchdog_work(struct work_struct *work); 183static void clocksource_watchdog_work(struct work_struct *work);
184static void clocksource_select(void);
182 185
183static LIST_HEAD(watchdog_list); 186static LIST_HEAD(watchdog_list);
184static struct clocksource *watchdog; 187static struct clocksource *watchdog;
@@ -299,13 +302,30 @@ static void clocksource_watchdog(unsigned long data)
299 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && 302 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
300 (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && 303 (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
301 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { 304 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
305 /* Mark it valid for high-res. */
302 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 306 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
307
308 /*
309 * clocksource_done_booting() will sort it if
310 * finished_booting is not set yet.
311 */
312 if (!finished_booting)
313 continue;
314
303 /* 315 /*
304 * We just marked the clocksource as highres-capable, 316 * If this is not the current clocksource let
305 * notify the rest of the system as well so that we 317 * the watchdog thread reselect it. Due to the
306 * transition into high-res mode: 318 * change to high res this clocksource might
319 * be preferred now. If it is the current
320 * clocksource let the tick code know about
321 * that change.
307 */ 322 */
308 tick_clock_notify(); 323 if (cs != curr_clocksource) {
324 cs->flags |= CLOCK_SOURCE_RESELECT;
325 schedule_work(&watchdog_work);
326 } else {
327 tick_clock_notify();
328 }
309 } 329 }
310 } 330 }
311 331
@@ -388,44 +408,39 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
388 408
389static void clocksource_dequeue_watchdog(struct clocksource *cs) 409static void clocksource_dequeue_watchdog(struct clocksource *cs)
390{ 410{
391 struct clocksource *tmp;
392 unsigned long flags; 411 unsigned long flags;
393 412
394 spin_lock_irqsave(&watchdog_lock, flags); 413 spin_lock_irqsave(&watchdog_lock, flags);
395 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { 414 if (cs != watchdog) {
396 /* cs is a watched clocksource. */ 415 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
397 list_del_init(&cs->wd_list); 416 /* cs is a watched clocksource. */
398 } else if (cs == watchdog) { 417 list_del_init(&cs->wd_list);
399 /* Reset watchdog cycles */ 418 /* Check if the watchdog timer needs to be stopped. */
400 clocksource_reset_watchdog(); 419 clocksource_stop_watchdog();
401 /* Current watchdog is removed. Find an alternative. */
402 watchdog = NULL;
403 list_for_each_entry(tmp, &clocksource_list, list) {
404 if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY)
405 continue;
406 if (!watchdog || tmp->rating > watchdog->rating)
407 watchdog = tmp;
408 } 420 }
409 } 421 }
410 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
411 /* Check if the watchdog timer needs to be stopped. */
412 clocksource_stop_watchdog();
413 spin_unlock_irqrestore(&watchdog_lock, flags); 422 spin_unlock_irqrestore(&watchdog_lock, flags);
414} 423}
415 424
416static int clocksource_watchdog_kthread(void *data) 425static int __clocksource_watchdog_kthread(void)
417{ 426{
418 struct clocksource *cs, *tmp; 427 struct clocksource *cs, *tmp;
419 unsigned long flags; 428 unsigned long flags;
420 LIST_HEAD(unstable); 429 LIST_HEAD(unstable);
430 int select = 0;
421 431
422 mutex_lock(&clocksource_mutex);
423 spin_lock_irqsave(&watchdog_lock, flags); 432 spin_lock_irqsave(&watchdog_lock, flags);
424 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) 433 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
425 if (cs->flags & CLOCK_SOURCE_UNSTABLE) { 434 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
426 list_del_init(&cs->wd_list); 435 list_del_init(&cs->wd_list);
427 list_add(&cs->wd_list, &unstable); 436 list_add(&cs->wd_list, &unstable);
437 select = 1;
428 } 438 }
439 if (cs->flags & CLOCK_SOURCE_RESELECT) {
440 cs->flags &= ~CLOCK_SOURCE_RESELECT;
441 select = 1;
442 }
443 }
429 /* Check if the watchdog timer needs to be stopped. */ 444 /* Check if the watchdog timer needs to be stopped. */
430 clocksource_stop_watchdog(); 445 clocksource_stop_watchdog();
431 spin_unlock_irqrestore(&watchdog_lock, flags); 446 spin_unlock_irqrestore(&watchdog_lock, flags);
@@ -435,10 +450,23 @@ static int clocksource_watchdog_kthread(void *data)
435 list_del_init(&cs->wd_list); 450 list_del_init(&cs->wd_list);
436 __clocksource_change_rating(cs, 0); 451 __clocksource_change_rating(cs, 0);
437 } 452 }
453 return select;
454}
455
456static int clocksource_watchdog_kthread(void *data)
457{
458 mutex_lock(&clocksource_mutex);
459 if (__clocksource_watchdog_kthread())
460 clocksource_select();
438 mutex_unlock(&clocksource_mutex); 461 mutex_unlock(&clocksource_mutex);
439 return 0; 462 return 0;
440} 463}
441 464
465static bool clocksource_is_watchdog(struct clocksource *cs)
466{
467 return cs == watchdog;
468}
469
442#else /* CONFIG_CLOCKSOURCE_WATCHDOG */ 470#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
443 471
444static void clocksource_enqueue_watchdog(struct clocksource *cs) 472static void clocksource_enqueue_watchdog(struct clocksource *cs)
@@ -449,7 +477,8 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
449 477
450static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } 478static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
451static inline void clocksource_resume_watchdog(void) { } 479static inline void clocksource_resume_watchdog(void) { }
452static inline int clocksource_watchdog_kthread(void *data) { return 0; } 480static inline int __clocksource_watchdog_kthread(void) { return 0; }
481static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
453 482
454#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ 483#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
455 484
@@ -553,24 +582,42 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
553 582
554#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET 583#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
555 584
556/** 585static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
557 * clocksource_select - Select the best clocksource available
558 *
559 * Private function. Must hold clocksource_mutex when called.
560 *
561 * Select the clocksource with the best rating, or the clocksource,
562 * which is selected by userspace override.
563 */
564static void clocksource_select(void)
565{ 586{
566 struct clocksource *best, *cs; 587 struct clocksource *cs;
567 588
568 if (!finished_booting || list_empty(&clocksource_list)) 589 if (!finished_booting || list_empty(&clocksource_list))
590 return NULL;
591
592 /*
593 * We pick the clocksource with the highest rating. If oneshot
594 * mode is active, we pick the highres valid clocksource with
595 * the best rating.
596 */
597 list_for_each_entry(cs, &clocksource_list, list) {
598 if (skipcur && cs == curr_clocksource)
599 continue;
600 if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES))
601 continue;
602 return cs;
603 }
604 return NULL;
605}
606
607static void __clocksource_select(bool skipcur)
608{
609 bool oneshot = tick_oneshot_mode_active();
610 struct clocksource *best, *cs;
611
612 /* Find the best suitable clocksource */
613 best = clocksource_find_best(oneshot, skipcur);
614 if (!best)
569 return; 615 return;
570 /* First clocksource on the list has the best rating. */ 616
571 best = list_first_entry(&clocksource_list, struct clocksource, list);
572 /* Check for the override clocksource. */ 617 /* Check for the override clocksource. */
573 list_for_each_entry(cs, &clocksource_list, list) { 618 list_for_each_entry(cs, &clocksource_list, list) {
619 if (skipcur && cs == curr_clocksource)
620 continue;
574 if (strcmp(cs->name, override_name) != 0) 621 if (strcmp(cs->name, override_name) != 0)
575 continue; 622 continue;
576 /* 623 /*
@@ -578,8 +625,7 @@ static void clocksource_select(void)
578 * capable clocksource if the tick code is in oneshot 625 * capable clocksource if the tick code is in oneshot
579 * mode (highres or nohz) 626 * mode (highres or nohz)
580 */ 627 */
581 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && 628 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
582 tick_oneshot_mode_active()) {
583 /* Override clocksource cannot be used. */ 629 /* Override clocksource cannot be used. */
584 printk(KERN_WARNING "Override clocksource %s is not " 630 printk(KERN_WARNING "Override clocksource %s is not "
585 "HRT compatible. Cannot switch while in " 631 "HRT compatible. Cannot switch while in "
@@ -590,16 +636,35 @@ static void clocksource_select(void)
590 best = cs; 636 best = cs;
591 break; 637 break;
592 } 638 }
593 if (curr_clocksource != best) { 639
594 printk(KERN_INFO "Switching to clocksource %s\n", best->name); 640 if (curr_clocksource != best && !timekeeping_notify(best)) {
641 pr_info("Switched to clocksource %s\n", best->name);
595 curr_clocksource = best; 642 curr_clocksource = best;
596 timekeeping_notify(curr_clocksource);
597 } 643 }
598} 644}
599 645
646/**
647 * clocksource_select - Select the best clocksource available
648 *
649 * Private function. Must hold clocksource_mutex when called.
650 *
651 * Select the clocksource with the best rating, or the clocksource,
652 * which is selected by userspace override.
653 */
654static void clocksource_select(void)
655{
656 return __clocksource_select(false);
657}
658
659static void clocksource_select_fallback(void)
660{
661 return __clocksource_select(true);
662}
663
600#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ 664#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
601 665
602static inline void clocksource_select(void) { } 666static inline void clocksource_select(void) { }
667static inline void clocksource_select_fallback(void) { }
603 668
604#endif 669#endif
605 670
@@ -614,16 +679,11 @@ static int __init clocksource_done_booting(void)
614{ 679{
615 mutex_lock(&clocksource_mutex); 680 mutex_lock(&clocksource_mutex);
616 curr_clocksource = clocksource_default_clock(); 681 curr_clocksource = clocksource_default_clock();
617 mutex_unlock(&clocksource_mutex);
618
619 finished_booting = 1; 682 finished_booting = 1;
620
621 /* 683 /*
622 * Run the watchdog first to eliminate unstable clock sources 684 * Run the watchdog first to eliminate unstable clock sources
623 */ 685 */
624 clocksource_watchdog_kthread(NULL); 686 __clocksource_watchdog_kthread();
625
626 mutex_lock(&clocksource_mutex);
627 clocksource_select(); 687 clocksource_select();
628 mutex_unlock(&clocksource_mutex); 688 mutex_unlock(&clocksource_mutex);
629 return 0; 689 return 0;
@@ -756,7 +816,6 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating)
756 list_del(&cs->list); 816 list_del(&cs->list);
757 cs->rating = rating; 817 cs->rating = rating;
758 clocksource_enqueue(cs); 818 clocksource_enqueue(cs);
759 clocksource_select();
760} 819}
761 820
762/** 821/**
@@ -768,21 +827,47 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
768{ 827{
769 mutex_lock(&clocksource_mutex); 828 mutex_lock(&clocksource_mutex);
770 __clocksource_change_rating(cs, rating); 829 __clocksource_change_rating(cs, rating);
830 clocksource_select();
771 mutex_unlock(&clocksource_mutex); 831 mutex_unlock(&clocksource_mutex);
772} 832}
773EXPORT_SYMBOL(clocksource_change_rating); 833EXPORT_SYMBOL(clocksource_change_rating);
774 834
835/*
836 * Unbind clocksource @cs. Called with clocksource_mutex held
837 */
838static int clocksource_unbind(struct clocksource *cs)
839{
840 /*
841 * I really can't convince myself to support this on hardware
842 * designed by lobotomized monkeys.
843 */
844 if (clocksource_is_watchdog(cs))
845 return -EBUSY;
846
847 if (cs == curr_clocksource) {
848 /* Select and try to install a replacement clock source */
849 clocksource_select_fallback();
850 if (curr_clocksource == cs)
851 return -EBUSY;
852 }
853 clocksource_dequeue_watchdog(cs);
854 list_del_init(&cs->list);
855 return 0;
856}
857
775/** 858/**
776 * clocksource_unregister - remove a registered clocksource 859 * clocksource_unregister - remove a registered clocksource
777 * @cs: clocksource to be unregistered 860 * @cs: clocksource to be unregistered
778 */ 861 */
779void clocksource_unregister(struct clocksource *cs) 862int clocksource_unregister(struct clocksource *cs)
780{ 863{
864 int ret = 0;
865
781 mutex_lock(&clocksource_mutex); 866 mutex_lock(&clocksource_mutex);
782 clocksource_dequeue_watchdog(cs); 867 if (!list_empty(&cs->list))
783 list_del(&cs->list); 868 ret = clocksource_unbind(cs);
784 clocksource_select();
785 mutex_unlock(&clocksource_mutex); 869 mutex_unlock(&clocksource_mutex);
870 return ret;
786} 871}
787EXPORT_SYMBOL(clocksource_unregister); 872EXPORT_SYMBOL(clocksource_unregister);
788 873
@@ -808,6 +893,23 @@ sysfs_show_current_clocksources(struct device *dev,
808 return count; 893 return count;
809} 894}
810 895
896size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
897{
898 size_t ret = cnt;
899
900 /* strings from sysfs write are not 0 terminated! */
901 if (!cnt || cnt >= CS_NAME_LEN)
902 return -EINVAL;
903
 904 /* strip off \n: */
905 if (buf[cnt-1] == '\n')
906 cnt--;
907 if (cnt > 0)
908 memcpy(dst, buf, cnt);
909 dst[cnt] = 0;
910 return ret;
911}
912
811/** 913/**
812 * sysfs_override_clocksource - interface for manually overriding clocksource 914 * sysfs_override_clocksource - interface for manually overriding clocksource
813 * @dev: unused 915 * @dev: unused
@@ -822,22 +924,13 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
822 struct device_attribute *attr, 924 struct device_attribute *attr,
823 const char *buf, size_t count) 925 const char *buf, size_t count)
824{ 926{
825 size_t ret = count; 927 size_t ret;
826
827 /* strings from sysfs write are not 0 terminated! */
828 if (count >= sizeof(override_name))
829 return -EINVAL;
830
831 /* strip of \n: */
832 if (buf[count-1] == '\n')
833 count--;
834 928
835 mutex_lock(&clocksource_mutex); 929 mutex_lock(&clocksource_mutex);
836 930
837 if (count > 0) 931 ret = sysfs_get_uname(buf, override_name, count);
838 memcpy(override_name, buf, count); 932 if (ret >= 0)
839 override_name[count] = 0; 933 clocksource_select();
840 clocksource_select();
841 934
842 mutex_unlock(&clocksource_mutex); 935 mutex_unlock(&clocksource_mutex);
843 936
@@ -845,6 +938,40 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
845} 938}
846 939
847/** 940/**
941 * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource
942 * @dev: unused
943 * @attr: unused
944 * @buf: unused
945 * @count: length of buffer
946 *
947 * Takes input from sysfs interface for manually unbinding a clocksource.
948 */
949static ssize_t sysfs_unbind_clocksource(struct device *dev,
950 struct device_attribute *attr,
951 const char *buf, size_t count)
952{
953 struct clocksource *cs;
954 char name[CS_NAME_LEN];
955 size_t ret;
956
957 ret = sysfs_get_uname(buf, name, count);
958 if (ret < 0)
959 return ret;
960
961 ret = -ENODEV;
962 mutex_lock(&clocksource_mutex);
963 list_for_each_entry(cs, &clocksource_list, list) {
964 if (strcmp(cs->name, name))
965 continue;
966 ret = clocksource_unbind(cs);
967 break;
968 }
969 mutex_unlock(&clocksource_mutex);
970
971 return ret ? ret : count;
972}
973
974/**
848 * sysfs_show_available_clocksources - sysfs interface for listing clocksource 975 * sysfs_show_available_clocksources - sysfs interface for listing clocksource
849 * @dev: unused 976 * @dev: unused
850 * @attr: unused 977 * @attr: unused
@@ -886,6 +1013,8 @@ sysfs_show_available_clocksources(struct device *dev,
886static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, 1013static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
887 sysfs_override_clocksource); 1014 sysfs_override_clocksource);
888 1015
1016static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource);
1017
889static DEVICE_ATTR(available_clocksource, 0444, 1018static DEVICE_ATTR(available_clocksource, 0444,
890 sysfs_show_available_clocksources, NULL); 1019 sysfs_show_available_clocksources, NULL);
891 1020
@@ -910,6 +1039,9 @@ static int __init init_clocksource_sysfs(void)
910 &device_clocksource, 1039 &device_clocksource,
911 &dev_attr_current_clocksource); 1040 &dev_attr_current_clocksource);
912 if (!error) 1041 if (!error)
1042 error = device_create_file(&device_clocksource,
1043 &dev_attr_unbind_clocksource);
1044 if (!error)
913 error = device_create_file( 1045 error = device_create_file(
914 &device_clocksource, 1046 &device_clocksource,
915 &dev_attr_available_clocksource); 1047 &dev_attr_available_clocksource);
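
A note on the new interface: unbind_clocksource takes a clocksource name, runs it through sysfs_get_uname() (which rejects empty or over-long input and strips a trailing newline) and calls clocksource_unbind() on the matching entry. A minimal userspace sketch of driving it follows; the sysfs path and the clocksource name are illustrative assumptions, not part of the patch.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* assumed location: the attribute hangs off the clocksource device */
        const char *path =
                "/sys/devices/system/clocksource/clocksource0/unbind_clocksource";
        const char *name = "hpet\n";    /* trailing \n is stripped by sysfs_get_uname() */
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* on failure write() returns -1 with errno set, e.g. ENODEV for an unknown name */
        if (write(fd, name, strlen(name)) < 0)
                perror("write");
        close(fd);
        return 0;
}
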
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
new file mode 100644
index 000000000000..a326f27d7f09
--- /dev/null
+++ b/kernel/time/sched_clock.c
@@ -0,0 +1,212 @@
1/*
2 * sched_clock.c: support for extending counters to full 64-bit ns counter
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8#include <linux/clocksource.h>
9#include <linux/init.h>
10#include <linux/jiffies.h>
11#include <linux/kernel.h>
12#include <linux/moduleparam.h>
13#include <linux/sched.h>
14#include <linux/syscore_ops.h>
15#include <linux/timer.h>
16#include <linux/sched_clock.h>
17
18struct clock_data {
19 u64 epoch_ns;
20 u32 epoch_cyc;
21 u32 epoch_cyc_copy;
22 unsigned long rate;
23 u32 mult;
24 u32 shift;
25 bool suspended;
26};
27
28static void sched_clock_poll(unsigned long wrap_ticks);
29static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0);
30static int irqtime = -1;
31
32core_param(irqtime, irqtime, int, 0400);
33
34static struct clock_data cd = {
35 .mult = NSEC_PER_SEC / HZ,
36};
37
38static u32 __read_mostly sched_clock_mask = 0xffffffff;
39
40static u32 notrace jiffy_sched_clock_read(void)
41{
42 return (u32)(jiffies - INITIAL_JIFFIES);
43}
44
45static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
46
47static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
48{
49 return (cyc * mult) >> shift;
50}
51
52static unsigned long long notrace sched_clock_32(void)
53{
54 u64 epoch_ns;
55 u32 epoch_cyc;
56 u32 cyc;
57
58 if (cd.suspended)
59 return cd.epoch_ns;
60
61 /*
62 * Load the epoch_cyc and epoch_ns atomically. We do this by
63 * ensuring that we always write epoch_cyc, epoch_ns and
64 * epoch_cyc_copy in strict order, and read them in strict order.
65 * If epoch_cyc and epoch_cyc_copy are not equal, then we're in
66 * the middle of an update, and we should repeat the load.
67 */
68 do {
69 epoch_cyc = cd.epoch_cyc;
70 smp_rmb();
71 epoch_ns = cd.epoch_ns;
72 smp_rmb();
73 } while (epoch_cyc != cd.epoch_cyc_copy);
74
75 cyc = read_sched_clock();
76 cyc = (cyc - epoch_cyc) & sched_clock_mask;
77 return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);
78}
79
80/*
81 * Atomically update the sched_clock epoch.
82 */
83static void notrace update_sched_clock(void)
84{
85 unsigned long flags;
86 u32 cyc;
87 u64 ns;
88
89 cyc = read_sched_clock();
90 ns = cd.epoch_ns +
91 cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
92 cd.mult, cd.shift);
93 /*
94 * Write epoch_cyc and epoch_ns in a way that the update is
95 * detectable in sched_clock_32().
96 */
97 raw_local_irq_save(flags);
98 cd.epoch_cyc_copy = cyc;
99 smp_wmb();
100 cd.epoch_ns = ns;
101 smp_wmb();
102 cd.epoch_cyc = cyc;
103 raw_local_irq_restore(flags);
104}
105
106static void sched_clock_poll(unsigned long wrap_ticks)
107{
108 mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks));
109 update_sched_clock();
110}
111
112void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
113{
114 unsigned long r, w;
115 u64 res, wrap;
116 char r_unit;
117
118 if (cd.rate > rate)
119 return;
120
121 BUG_ON(bits > 32);
122 WARN_ON(!irqs_disabled());
123 read_sched_clock = read;
124 sched_clock_mask = (1 << bits) - 1;
125 cd.rate = rate;
126
127 /* calculate the mult/shift to convert counter ticks to ns. */
128 clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0);
129
130 r = rate;
131 if (r >= 4000000) {
132 r /= 1000000;
133 r_unit = 'M';
134 } else if (r >= 1000) {
135 r /= 1000;
136 r_unit = 'k';
137 } else
138 r_unit = ' ';
139
140 /* calculate how many ns until we wrap */
141 wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift);
142 do_div(wrap, NSEC_PER_MSEC);
143 w = wrap;
144
145 /* calculate the ns resolution of this counter */
146 res = cyc_to_ns(1ULL, cd.mult, cd.shift);
147 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n",
148 bits, r, r_unit, res, w);
149
150 /*
151 * Start the timer to keep sched_clock() properly updated and
152 * set the initial epoch.
153 */
154 sched_clock_timer.data = msecs_to_jiffies(w - (w / 10));
155 update_sched_clock();
156
157 /*
158 * Ensure that sched_clock() starts off at 0ns
159 */
160 cd.epoch_ns = 0;
161
162 /* Enable IRQ time accounting if we have a fast enough sched_clock */
163 if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
164 enable_sched_clock_irqtime();
165
166 pr_debug("Registered %pF as sched_clock source\n", read);
167}
168
169unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32;
170
171unsigned long long notrace sched_clock(void)
172{
173 return sched_clock_func();
174}
175
176void __init sched_clock_postinit(void)
177{
178 /*
179 * If no sched_clock function has been provided at that point,
180 * make it the final one.
181 */
182 if (read_sched_clock == jiffy_sched_clock_read)
183 setup_sched_clock(jiffy_sched_clock_read, 32, HZ);
184
185 sched_clock_poll(sched_clock_timer.data);
186}
187
188static int sched_clock_suspend(void)
189{
190 sched_clock_poll(sched_clock_timer.data);
191 cd.suspended = true;
192 return 0;
193}
194
195static void sched_clock_resume(void)
196{
197 cd.epoch_cyc = read_sched_clock();
198 cd.epoch_cyc_copy = cd.epoch_cyc;
199 cd.suspended = false;
200}
201
202static struct syscore_ops sched_clock_ops = {
203 .suspend = sched_clock_suspend,
204 .resume = sched_clock_resume,
205};
206
207static int __init sched_clock_syscore_init(void)
208{
209 register_syscore_ops(&sched_clock_ops);
210 return 0;
211}
212device_initcall(sched_clock_syscore_init);
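
setup_sched_clock() above boils down to choosing a mult/shift pair so that cyc_to_ns(cyc) = (cyc * mult) >> shift approximates cyc * NSEC_PER_SEC / rate, then reporting the resulting resolution and wrap interval. A standalone worked example for a hypothetical 32-bit counter at 24 MHz; the constants are hand-picked assumptions rather than output of clocks_calc_mult_shift().

#include <stdint.h>
#include <stdio.h>

static inline uint64_t cyc_to_ns(uint64_t cyc, uint32_t mult, uint32_t shift)
{
        return (cyc * mult) >> shift;
}

int main(void)
{
        /* 1e9 / 24e6 = 41.666... ns per cycle, scaled by 2^26 */
        const uint32_t mult = 2796202667u;
        const uint32_t shift = 26;
        uint64_t wrap_ns = cyc_to_ns((1ULL << 32) - 1, mult, shift);

        /* prints: resolution 41 ns, wraps every 178956 ms (~179 s) */
        printf("resolution %llu ns, wraps every %llu ms\n",
               (unsigned long long)cyc_to_ns(1, mult, shift),
               (unsigned long long)(wrap_ns / 1000000));
        return 0;
}
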
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 20d6fba70652..218bcb565fed 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -19,6 +19,7 @@
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/smp.h> 21#include <linux/smp.h>
22#include <linux/module.h>
22 23
23#include "tick-internal.h" 24#include "tick-internal.h"
24 25
@@ -29,6 +30,7 @@
29 30
30static struct tick_device tick_broadcast_device; 31static struct tick_device tick_broadcast_device;
31static cpumask_var_t tick_broadcast_mask; 32static cpumask_var_t tick_broadcast_mask;
33static cpumask_var_t tick_broadcast_on;
32static cpumask_var_t tmpmask; 34static cpumask_var_t tmpmask;
33static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); 35static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
34static int tick_broadcast_force; 36static int tick_broadcast_force;
@@ -64,17 +66,34 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
64/* 66/*
65 * Check, if the device can be utilized as broadcast device: 67 * Check, if the device can be utilized as broadcast device:
66 */ 68 */
67int tick_check_broadcast_device(struct clock_event_device *dev) 69static bool tick_check_broadcast_device(struct clock_event_device *curdev,
70 struct clock_event_device *newdev)
71{
72 if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
73 (newdev->features & CLOCK_EVT_FEAT_C3STOP))
74 return false;
75
76 if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT &&
77 !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
78 return false;
79
80 return !curdev || newdev->rating > curdev->rating;
81}
82
83/*
84 * Conditionally install/replace broadcast device
85 */
86void tick_install_broadcast_device(struct clock_event_device *dev)
68{ 87{
69 struct clock_event_device *cur = tick_broadcast_device.evtdev; 88 struct clock_event_device *cur = tick_broadcast_device.evtdev;
70 89
71 if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || 90 if (!tick_check_broadcast_device(cur, dev))
72 (tick_broadcast_device.evtdev && 91 return;
73 tick_broadcast_device.evtdev->rating >= dev->rating) || 92
74 (dev->features & CLOCK_EVT_FEAT_C3STOP)) 93 if (!try_module_get(dev->owner))
75 return 0; 94 return;
76 95
77 clockevents_exchange_device(tick_broadcast_device.evtdev, dev); 96 clockevents_exchange_device(cur, dev);
78 if (cur) 97 if (cur)
79 cur->event_handler = clockevents_handle_noop; 98 cur->event_handler = clockevents_handle_noop;
80 tick_broadcast_device.evtdev = dev; 99 tick_broadcast_device.evtdev = dev;
@@ -90,7 +109,6 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
90 */ 109 */
91 if (dev->features & CLOCK_EVT_FEAT_ONESHOT) 110 if (dev->features & CLOCK_EVT_FEAT_ONESHOT)
92 tick_clock_notify(); 111 tick_clock_notify();
93 return 1;
94} 112}
95 113
96/* 114/*
@@ -123,8 +141,9 @@ static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
123 */ 141 */
124int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) 142int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
125{ 143{
144 struct clock_event_device *bc = tick_broadcast_device.evtdev;
126 unsigned long flags; 145 unsigned long flags;
127 int ret = 0; 146 int ret;
128 147
129 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 148 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
130 149
@@ -138,20 +157,62 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
138 dev->event_handler = tick_handle_periodic; 157 dev->event_handler = tick_handle_periodic;
139 tick_device_setup_broadcast_func(dev); 158 tick_device_setup_broadcast_func(dev);
140 cpumask_set_cpu(cpu, tick_broadcast_mask); 159 cpumask_set_cpu(cpu, tick_broadcast_mask);
141 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 160 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
161 tick_broadcast_start_periodic(bc);
162 else
163 tick_broadcast_setup_oneshot(bc);
142 ret = 1; 164 ret = 1;
143 } else { 165 } else {
144 /* 166 /*
145 * When the new device is not affected by the stop 167 * Clear the broadcast bit for this cpu if the
146 * feature and the cpu is marked in the broadcast mask 168 * device is not power state affected.
147 * then clear the broadcast bit.
148 */ 169 */
149 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { 170 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
150 int cpu = smp_processor_id();
151 cpumask_clear_cpu(cpu, tick_broadcast_mask); 171 cpumask_clear_cpu(cpu, tick_broadcast_mask);
152 tick_broadcast_clear_oneshot(cpu); 172 else
153 } else {
154 tick_device_setup_broadcast_func(dev); 173 tick_device_setup_broadcast_func(dev);
174
175 /*
176 * Clear the broadcast bit if the CPU is not in
177 * periodic broadcast on state.
178 */
179 if (!cpumask_test_cpu(cpu, tick_broadcast_on))
180 cpumask_clear_cpu(cpu, tick_broadcast_mask);
181
182 switch (tick_broadcast_device.mode) {
183 case TICKDEV_MODE_ONESHOT:
184 /*
185 * If the system is in oneshot mode we can
186 * unconditionally clear the oneshot mask bit,
187 * because the CPU is running and therefore
188 * not in an idle state which causes the power
189 * state affected device to stop. Let the
190 * caller initialize the device.
191 */
192 tick_broadcast_clear_oneshot(cpu);
193 ret = 0;
194 break;
195
196 case TICKDEV_MODE_PERIODIC:
197 /*
198 * If the system is in periodic mode, check
199 * whether the broadcast device can be
200 * switched off now.
201 */
202 if (cpumask_empty(tick_broadcast_mask) && bc)
203 clockevents_shutdown(bc);
204 /*
205 * If we kept the cpu in the broadcast mask,
206 * tell the caller to leave the per cpu device
207 * in shutdown state. The periodic interrupt
208 * is delivered by the broadcast device.
209 */
210 ret = cpumask_test_cpu(cpu, tick_broadcast_mask);
211 break;
212 default:
213 /* Nothing to do */
214 ret = 0;
215 break;
155 } 216 }
156 } 217 }
157 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 218 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -281,6 +342,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
281 switch (*reason) { 342 switch (*reason) {
282 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 343 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
283 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 344 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
345 cpumask_set_cpu(cpu, tick_broadcast_on);
284 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { 346 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
285 if (tick_broadcast_device.mode == 347 if (tick_broadcast_device.mode ==
286 TICKDEV_MODE_PERIODIC) 348 TICKDEV_MODE_PERIODIC)
@@ -290,8 +352,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
290 tick_broadcast_force = 1; 352 tick_broadcast_force = 1;
291 break; 353 break;
292 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 354 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
293 if (!tick_broadcast_force && 355 if (tick_broadcast_force)
294 cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { 356 break;
357 cpumask_clear_cpu(cpu, tick_broadcast_on);
358 if (!tick_device_is_functional(dev))
359 break;
360 if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {
295 if (tick_broadcast_device.mode == 361 if (tick_broadcast_device.mode ==
296 TICKDEV_MODE_PERIODIC) 362 TICKDEV_MODE_PERIODIC)
297 tick_setup_periodic(dev, 0); 363 tick_setup_periodic(dev, 0);
@@ -349,6 +415,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
349 415
350 bc = tick_broadcast_device.evtdev; 416 bc = tick_broadcast_device.evtdev;
351 cpumask_clear_cpu(cpu, tick_broadcast_mask); 417 cpumask_clear_cpu(cpu, tick_broadcast_mask);
418 cpumask_clear_cpu(cpu, tick_broadcast_on);
352 419
353 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 420 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
354 if (bc && cpumask_empty(tick_broadcast_mask)) 421 if (bc && cpumask_empty(tick_broadcast_mask))
@@ -475,7 +542,15 @@ void tick_check_oneshot_broadcast(int cpu)
475 if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { 542 if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) {
476 struct tick_device *td = &per_cpu(tick_cpu_device, cpu); 543 struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
477 544
478 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); 545 /*
546 * We might be in the middle of switching over from
547 * periodic to oneshot. If the CPU has not yet
548 * switched over, leave the device alone.
549 */
550 if (td->mode == TICKDEV_MODE_ONESHOT) {
551 clockevents_set_mode(td->evtdev,
552 CLOCK_EVT_MODE_ONESHOT);
553 }
479 } 554 }
480} 555}
481 556
@@ -522,6 +597,13 @@ again:
522 cpumask_clear(tick_broadcast_force_mask); 597 cpumask_clear(tick_broadcast_force_mask);
523 598
524 /* 599 /*
600 * Sanity check. Catch the case where we try to broadcast to
601 * offline cpus.
602 */
603 if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask)))
604 cpumask_and(tmpmask, tmpmask, cpu_online_mask);
605
606 /*
525 * Wakeup the cpus which have an expired event. 607 * Wakeup the cpus which have an expired event.
526 */ 608 */
527 tick_do_broadcast(tmpmask); 609 tick_do_broadcast(tmpmask);
@@ -761,10 +843,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
761 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 843 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
762 844
763 /* 845 /*
764 * Clear the broadcast mask flag for the dead cpu, but do not 846 * Clear the broadcast masks for the dead cpu, but do not stop
765 * stop the broadcast device! 847 * the broadcast device!
766 */ 848 */
767 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); 849 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
850 cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
851 cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
768 852
769 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 853 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
770} 854}
@@ -792,6 +876,7 @@ bool tick_broadcast_oneshot_available(void)
792void __init tick_broadcast_init(void) 876void __init tick_broadcast_init(void)
793{ 877{
794 zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); 878 zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
879 zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT);
795 zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); 880 zalloc_cpumask_var(&tmpmask, GFP_NOWAIT);
796#ifdef CONFIG_TICK_ONESHOT 881#ifdef CONFIG_TICK_ONESHOT
797 zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); 882 zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 5d3fb100bc06..64522ecdfe0e 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -18,6 +18,7 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/module.h>
21 22
22#include <asm/irq_regs.h> 23#include <asm/irq_regs.h>
23 24
@@ -33,7 +34,6 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
33ktime_t tick_next_period; 34ktime_t tick_next_period;
34ktime_t tick_period; 35ktime_t tick_period;
35int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; 36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
36static DEFINE_RAW_SPINLOCK(tick_device_lock);
37 37
38/* 38/*
39 * Debugging: see timer_list.c 39 * Debugging: see timer_list.c
@@ -194,7 +194,8 @@ static void tick_setup_device(struct tick_device *td,
194 * When global broadcasting is active, check if the current 194 * When global broadcasting is active, check if the current
195 * device is registered as a placeholder for broadcast mode. 195 * device is registered as a placeholder for broadcast mode.
196 * This allows us to handle this x86 misfeature in a generic 196 * This allows us to handle this x86 misfeature in a generic
197 * way. 197 * way. This function also returns !=0 when we keep the
198 * current active broadcast state for this CPU.
198 */ 199 */
199 if (tick_device_uses_broadcast(newdev, cpu)) 200 if (tick_device_uses_broadcast(newdev, cpu))
200 return; 201 return;
@@ -205,17 +206,75 @@ static void tick_setup_device(struct tick_device *td,
205 tick_setup_oneshot(newdev, handler, next_event); 206 tick_setup_oneshot(newdev, handler, next_event);
206} 207}
207 208
209void tick_install_replacement(struct clock_event_device *newdev)
210{
211 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
212 int cpu = smp_processor_id();
213
214 clockevents_exchange_device(td->evtdev, newdev);
215 tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
216 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
217 tick_oneshot_notify();
218}
219
220static bool tick_check_percpu(struct clock_event_device *curdev,
221 struct clock_event_device *newdev, int cpu)
222{
223 if (!cpumask_test_cpu(cpu, newdev->cpumask))
224 return false;
225 if (cpumask_equal(newdev->cpumask, cpumask_of(cpu)))
226 return true;
227 /* Check if irq affinity can be set */
228 if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq))
229 return false;
230 /* Prefer an existing cpu local device */
231 if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
232 return false;
233 return true;
234}
235
236static bool tick_check_preferred(struct clock_event_device *curdev,
237 struct clock_event_device *newdev)
238{
239 /* Prefer oneshot capable device */
240 if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) {
241 if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT))
242 return false;
243 if (tick_oneshot_mode_active())
244 return false;
245 }
246
247 /*
248 * Use the higher rated one, but prefer a CPU local device with a lower
249 * rating than a non-CPU local device
250 */
251 return !curdev ||
252 newdev->rating > curdev->rating ||
253 !cpumask_equal(curdev->cpumask, newdev->cpumask);
254}
255
256/*
257 * Check whether the new device is a better fit than curdev. curdev
258 * can be NULL !
259 */
260bool tick_check_replacement(struct clock_event_device *curdev,
261 struct clock_event_device *newdev)
262{
263 if (tick_check_percpu(curdev, newdev, smp_processor_id()))
264 return false;
265
266 return tick_check_preferred(curdev, newdev);
267}
268
208/* 269/*
209 * Check, if the new registered device should be used. 270 * Check, if the new registered device should be used. Called with
271 * clockevents_lock held and interrupts disabled.
210 */ 272 */
211static int tick_check_new_device(struct clock_event_device *newdev) 273void tick_check_new_device(struct clock_event_device *newdev)
212{ 274{
213 struct clock_event_device *curdev; 275 struct clock_event_device *curdev;
214 struct tick_device *td; 276 struct tick_device *td;
215 int cpu, ret = NOTIFY_OK; 277 int cpu;
216 unsigned long flags;
217
218 raw_spin_lock_irqsave(&tick_device_lock, flags);
219 278
220 cpu = smp_processor_id(); 279 cpu = smp_processor_id();
221 if (!cpumask_test_cpu(cpu, newdev->cpumask)) 280 if (!cpumask_test_cpu(cpu, newdev->cpumask))
@@ -225,40 +284,15 @@ static int tick_check_new_device(struct clock_event_device *newdev)
225 curdev = td->evtdev; 284 curdev = td->evtdev;
226 285
227 /* cpu local device ? */ 286 /* cpu local device ? */
228 if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { 287 if (!tick_check_percpu(curdev, newdev, cpu))
229 288 goto out_bc;
230 /*
231 * If the cpu affinity of the device interrupt can not
232 * be set, ignore it.
233 */
234 if (!irq_can_set_affinity(newdev->irq))
235 goto out_bc;
236 289
237 /* 290 /* Preference decision */
238 * If we have a cpu local device already, do not replace it 291 if (!tick_check_preferred(curdev, newdev))
239 * by a non cpu local device 292 goto out_bc;
240 */
241 if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
242 goto out_bc;
243 }
244 293
245 /* 294 if (!try_module_get(newdev->owner))
246 * If we have an active device, then check the rating and the oneshot 295 return;
247 * feature.
248 */
249 if (curdev) {
250 /*
251 * Prefer one shot capable devices !
252 */
253 if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) &&
254 !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
255 goto out_bc;
256 /*
257 * Check the rating
258 */
259 if (curdev->rating >= newdev->rating)
260 goto out_bc;
261 }
262 296
263 /* 297 /*
264 * Replace the possibly existing device by the new 298 * Replace the possibly existing device by the new
@@ -273,20 +307,13 @@ static int tick_check_new_device(struct clock_event_device *newdev)
273 tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); 307 tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
274 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) 308 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
275 tick_oneshot_notify(); 309 tick_oneshot_notify();
276 310 return;
277 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
278 return NOTIFY_STOP;
279 311
280out_bc: 312out_bc:
281 /* 313 /*
282 * Can the new device be used as a broadcast device ? 314 * Can the new device be used as a broadcast device ?
283 */ 315 */
284 if (tick_check_broadcast_device(newdev)) 316 tick_install_broadcast_device(newdev);
285 ret = NOTIFY_STOP;
286
287 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
288
289 return ret;
290} 317}
291 318
292/* 319/*
@@ -294,7 +321,7 @@ out_bc:
294 * 321 *
295 * Called with interrupts disabled. 322 * Called with interrupts disabled.
296 */ 323 */
297static void tick_handover_do_timer(int *cpup) 324void tick_handover_do_timer(int *cpup)
298{ 325{
299 if (*cpup == tick_do_timer_cpu) { 326 if (*cpup == tick_do_timer_cpu) {
300 int cpu = cpumask_first(cpu_online_mask); 327 int cpu = cpumask_first(cpu_online_mask);
@@ -311,13 +338,11 @@ static void tick_handover_do_timer(int *cpup)
311 * access the hardware device itself. 338 * access the hardware device itself.
312 * We just set the mode and remove it from the lists. 339 * We just set the mode and remove it from the lists.
313 */ 340 */
314static void tick_shutdown(unsigned int *cpup) 341void tick_shutdown(unsigned int *cpup)
315{ 342{
316 struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); 343 struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);
317 struct clock_event_device *dev = td->evtdev; 344 struct clock_event_device *dev = td->evtdev;
318 unsigned long flags;
319 345
320 raw_spin_lock_irqsave(&tick_device_lock, flags);
321 td->mode = TICKDEV_MODE_PERIODIC; 346 td->mode = TICKDEV_MODE_PERIODIC;
322 if (dev) { 347 if (dev) {
323 /* 348 /*
@@ -329,26 +354,20 @@ static void tick_shutdown(unsigned int *cpup)
329 dev->event_handler = clockevents_handle_noop; 354 dev->event_handler = clockevents_handle_noop;
330 td->evtdev = NULL; 355 td->evtdev = NULL;
331 } 356 }
332 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
333} 357}
334 358
335static void tick_suspend(void) 359void tick_suspend(void)
336{ 360{
337 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 361 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
338 unsigned long flags;
339 362
340 raw_spin_lock_irqsave(&tick_device_lock, flags);
341 clockevents_shutdown(td->evtdev); 363 clockevents_shutdown(td->evtdev);
342 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
343} 364}
344 365
345static void tick_resume(void) 366void tick_resume(void)
346{ 367{
347 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 368 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
348 unsigned long flags;
349 int broadcast = tick_resume_broadcast(); 369 int broadcast = tick_resume_broadcast();
350 370
351 raw_spin_lock_irqsave(&tick_device_lock, flags);
352 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); 371 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
353 372
354 if (!broadcast) { 373 if (!broadcast) {
@@ -357,68 +376,12 @@ static void tick_resume(void)
357 else 376 else
358 tick_resume_oneshot(); 377 tick_resume_oneshot();
359 } 378 }
360 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
361} 379}
362 380
363/*
364 * Notification about clock event devices
365 */
366static int tick_notify(struct notifier_block *nb, unsigned long reason,
367 void *dev)
368{
369 switch (reason) {
370
371 case CLOCK_EVT_NOTIFY_ADD:
372 return tick_check_new_device(dev);
373
374 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
375 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
376 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
377 tick_broadcast_on_off(reason, dev);
378 break;
379
380 case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
381 case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
382 tick_broadcast_oneshot_control(reason);
383 break;
384
385 case CLOCK_EVT_NOTIFY_CPU_DYING:
386 tick_handover_do_timer(dev);
387 break;
388
389 case CLOCK_EVT_NOTIFY_CPU_DEAD:
390 tick_shutdown_broadcast_oneshot(dev);
391 tick_shutdown_broadcast(dev);
392 tick_shutdown(dev);
393 break;
394
395 case CLOCK_EVT_NOTIFY_SUSPEND:
396 tick_suspend();
397 tick_suspend_broadcast();
398 break;
399
400 case CLOCK_EVT_NOTIFY_RESUME:
401 tick_resume();
402 break;
403
404 default:
405 break;
406 }
407
408 return NOTIFY_OK;
409}
410
411static struct notifier_block tick_notifier = {
412 .notifier_call = tick_notify,
413};
414
415/** 381/**
416 * tick_init - initialize the tick control 382 * tick_init - initialize the tick control
417 *
418 * Register the notifier with the clockevents framework
419 */ 383 */
420void __init tick_init(void) 384void __init tick_init(void)
421{ 385{
422 clockevents_register_notifier(&tick_notifier);
423 tick_broadcast_init(); 386 tick_broadcast_init();
424} 387}
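
The per-cpu tick device selection now reduces to two small predicates: tick_check_percpu() decides whether the new device is usable on this CPU at all, and tick_check_preferred() decides whether it should replace the current one (oneshot capability first, then rating, with a CPU-local device allowed to win against a non-local one). A simplified standalone mock of the preference rule; the struct, the cpu_local flag and the sample ratings are illustrative stand-ins, not the real clock_event_device layout.

#include <stdbool.h>
#include <stdio.h>

#define FEAT_ONESHOT 0x01

struct mock_evtdev {
        const char *name;
        int rating;
        unsigned int features;
        bool cpu_local; /* stand-in for cpumask_equal(cpumask, cpumask_of(cpu)) */
};

/* mirrors the shape of tick_check_preferred(), not its exact cpumask test */
static bool prefer_new(const struct mock_evtdev *cur,
                       const struct mock_evtdev *newdev)
{
        /* a non-oneshot device never replaces an oneshot capable one */
        if (!(newdev->features & FEAT_ONESHOT) &&
            cur && (cur->features & FEAT_ONESHOT))
                return false;
        /* higher rating wins; a CPU-local newdev may beat a non-local cur */
        return !cur || newdev->rating > cur->rating ||
               (newdev->cpu_local && !cur->cpu_local);
}

int main(void)
{
        struct mock_evtdev hpet  = { .name = "hpet",  .rating = 50,
                                     .features = FEAT_ONESHOT, .cpu_local = false };
        struct mock_evtdev lapic = { .name = "lapic", .rating = 100,
                                     .features = FEAT_ONESHOT, .cpu_local = true };
        struct mock_evtdev pit   = { .name = "pit",   .rating = 110,
                                     .features = 0,            .cpu_local = false };

        printf("hpet  -> lapic: %d\n", prefer_new(&hpet, &lapic)); /* 1: higher rating */
        printf("lapic -> pit:   %d\n", prefer_new(&lapic, &pit));  /* 0: pit lacks oneshot */
        return 0;
}
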
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f0299eae4602..bc906cad709b 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -6,6 +6,8 @@
6 6
7extern seqlock_t jiffies_lock; 7extern seqlock_t jiffies_lock;
8 8
9#define CS_NAME_LEN 32
10
9#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD 11#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
10 12
11#define TICK_DO_TIMER_NONE -1 13#define TICK_DO_TIMER_NONE -1
@@ -18,9 +20,19 @@ extern int tick_do_timer_cpu __read_mostly;
18 20
19extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); 21extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
20extern void tick_handle_periodic(struct clock_event_device *dev); 22extern void tick_handle_periodic(struct clock_event_device *dev);
23extern void tick_check_new_device(struct clock_event_device *dev);
24extern void tick_handover_do_timer(int *cpup);
25extern void tick_shutdown(unsigned int *cpup);
26extern void tick_suspend(void);
27extern void tick_resume(void);
28extern bool tick_check_replacement(struct clock_event_device *curdev,
29 struct clock_event_device *newdev);
30extern void tick_install_replacement(struct clock_event_device *dev);
21 31
22extern void clockevents_shutdown(struct clock_event_device *dev); 32extern void clockevents_shutdown(struct clock_event_device *dev);
23 33
34extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
35
24/* 36/*
25 * NO_HZ / high resolution timer shared code 37 * NO_HZ / high resolution timer shared code
26 */ 38 */
@@ -90,7 +102,7 @@ static inline bool tick_broadcast_oneshot_available(void) { return false; }
90 */ 102 */
91#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 103#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
92extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); 104extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
93extern int tick_check_broadcast_device(struct clock_event_device *dev); 105extern void tick_install_broadcast_device(struct clock_event_device *dev);
94extern int tick_is_broadcast_device(struct clock_event_device *dev); 106extern int tick_is_broadcast_device(struct clock_event_device *dev);
95extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); 107extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
96extern void tick_shutdown_broadcast(unsigned int *cpup); 108extern void tick_shutdown_broadcast(unsigned int *cpup);
@@ -102,9 +114,8 @@ tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
102 114
103#else /* !BROADCAST */ 115#else /* !BROADCAST */
104 116
105static inline int tick_check_broadcast_device(struct clock_event_device *dev) 117static inline void tick_install_broadcast_device(struct clock_event_device *dev)
106{ 118{
107 return 0;
108} 119}
109 120
110static inline int tick_is_broadcast_device(struct clock_event_device *dev) 121static inline int tick_is_broadcast_device(struct clock_event_device *dev)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 0cf1c1453181..e77edc97e036 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -178,6 +178,11 @@ static bool can_stop_full_tick(void)
178 */ 178 */
179 if (!sched_clock_stable) { 179 if (!sched_clock_stable) {
180 trace_tick_stop(0, "unstable sched clock\n"); 180 trace_tick_stop(0, "unstable sched clock\n");
181 /*
182 * Don't allow the user to think they can get
183 * full NO_HZ with this machine.
184 */
185 WARN_ONCE(1, "NO_HZ FULL will not work with unstable sched clock");
181 return false; 186 return false;
182 } 187 }
183#endif 188#endif
@@ -293,7 +298,7 @@ static int __init tick_nohz_full_setup(char *str)
293} 298}
294__setup("nohz_full=", tick_nohz_full_setup); 299__setup("nohz_full=", tick_nohz_full_setup);
295 300
296static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, 301static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
297 unsigned long action, 302 unsigned long action,
298 void *hcpu) 303 void *hcpu)
299{ 304{
@@ -346,16 +351,6 @@ void __init tick_nohz_init(void)
346 } 351 }
347 352
348 cpu_notifier(tick_nohz_cpu_down_callback, 0); 353 cpu_notifier(tick_nohz_cpu_down_callback, 0);
349
350 /* Make sure full dynticks CPU are also RCU nocbs */
351 for_each_cpu(cpu, nohz_full_mask) {
352 if (!rcu_is_nocb_cpu(cpu)) {
353 pr_warning("NO_HZ: CPU %d is not RCU nocb: "
354 "cleared from nohz_full range", cpu);
355 cpumask_clear_cpu(cpu, nohz_full_mask);
356 }
357 }
358
359 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); 354 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask);
360 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); 355 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
361} 356}
@@ -832,13 +827,10 @@ void tick_nohz_irq_exit(void)
832{ 827{
833 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 828 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
834 829
835 if (ts->inidle) { 830 if (ts->inidle)
836 /* Cancel the timer because CPU already waken up from the C-states*/
837 menu_hrtimer_cancel();
838 __tick_nohz_idle_enter(ts); 831 __tick_nohz_idle_enter(ts);
839 } else { 832 else
840 tick_nohz_full_stop_tick(ts); 833 tick_nohz_full_stop_tick(ts);
841 }
842} 834}
843 835
844/** 836/**
@@ -936,8 +928,6 @@ void tick_nohz_idle_exit(void)
936 928
937 ts->inidle = 0; 929 ts->inidle = 0;
938 930
939 /* Cancel the timer because CPU already waken up from the C-states*/
940 menu_hrtimer_cancel();
941 if (ts->idle_active || ts->tick_stopped) 931 if (ts->idle_active || ts->tick_stopped)
942 now = ktime_get(); 932 now = ktime_get();
943 933
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index baeeb5c87cf1..48b9fffabdc2 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -25,6 +25,11 @@
25 25
26#include "tick-internal.h" 26#include "tick-internal.h"
27#include "ntp_internal.h" 27#include "ntp_internal.h"
28#include "timekeeping_internal.h"
29
30#define TK_CLEAR_NTP (1 << 0)
31#define TK_MIRROR (1 << 1)
32#define TK_CLOCK_WAS_SET (1 << 2)
28 33
29static struct timekeeper timekeeper; 34static struct timekeeper timekeeper;
30static DEFINE_RAW_SPINLOCK(timekeeper_lock); 35static DEFINE_RAW_SPINLOCK(timekeeper_lock);
@@ -200,9 +205,9 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
200 205
201static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); 206static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
202 207
203static void update_pvclock_gtod(struct timekeeper *tk) 208static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
204{ 209{
205 raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk); 210 raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk);
206} 211}
207 212
208/** 213/**
@@ -216,7 +221,7 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb)
216 221
217 raw_spin_lock_irqsave(&timekeeper_lock, flags); 222 raw_spin_lock_irqsave(&timekeeper_lock, flags);
218 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); 223 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
219 update_pvclock_gtod(tk); 224 update_pvclock_gtod(tk, true);
220 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 225 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
221 226
222 return ret; 227 return ret;
@@ -241,16 +246,16 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
241EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); 246EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
242 247
243/* must hold timekeeper_lock */ 248/* must hold timekeeper_lock */
244static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror) 249static void timekeeping_update(struct timekeeper *tk, unsigned int action)
245{ 250{
246 if (clearntp) { 251 if (action & TK_CLEAR_NTP) {
247 tk->ntp_error = 0; 252 tk->ntp_error = 0;
248 ntp_clear(); 253 ntp_clear();
249 } 254 }
250 update_vsyscall(tk); 255 update_vsyscall(tk);
251 update_pvclock_gtod(tk); 256 update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
252 257
253 if (mirror) 258 if (action & TK_MIRROR)
254 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); 259 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
255} 260}
256 261
@@ -508,7 +513,7 @@ int do_settimeofday(const struct timespec *tv)
508 513
509 tk_set_xtime(tk, tv); 514 tk_set_xtime(tk, tv);
510 515
511 timekeeping_update(tk, true, true); 516 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
512 517
513 write_seqcount_end(&timekeeper_seq); 518 write_seqcount_end(&timekeeper_seq);
514 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 519 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -552,7 +557,7 @@ int timekeeping_inject_offset(struct timespec *ts)
552 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); 557 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));
553 558
554error: /* even if we error out, we forwarded the time, so call update */ 559error: /* even if we error out, we forwarded the time, so call update */
555 timekeeping_update(tk, true, true); 560 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
556 561
557 write_seqcount_end(&timekeeper_seq); 562 write_seqcount_end(&timekeeper_seq);
558 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 563 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -627,13 +632,22 @@ static int change_clocksource(void *data)
627 write_seqcount_begin(&timekeeper_seq); 632 write_seqcount_begin(&timekeeper_seq);
628 633
629 timekeeping_forward_now(tk); 634 timekeeping_forward_now(tk);
630 if (!new->enable || new->enable(new) == 0) { 635 /*
631 old = tk->clock; 636 * If the cs is in module, get a module reference. Succeeds
632 tk_setup_internals(tk, new); 637 * for built-in code (owner == NULL) as well.
633 if (old->disable) 638 */
634 old->disable(old); 639 if (try_module_get(new->owner)) {
640 if (!new->enable || new->enable(new) == 0) {
641 old = tk->clock;
642 tk_setup_internals(tk, new);
643 if (old->disable)
644 old->disable(old);
645 module_put(old->owner);
646 } else {
647 module_put(new->owner);
648 }
635 } 649 }
636 timekeeping_update(tk, true, true); 650 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
637 651
638 write_seqcount_end(&timekeeper_seq); 652 write_seqcount_end(&timekeeper_seq);
639 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 653 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -648,14 +662,15 @@ static int change_clocksource(void *data)
648 * This function is called from clocksource.c after a new, better clock 662 * This function is called from clocksource.c after a new, better clock
649 * source has been registered. The caller holds the clocksource_mutex. 663 * source has been registered. The caller holds the clocksource_mutex.
650 */ 664 */
651void timekeeping_notify(struct clocksource *clock) 665int timekeeping_notify(struct clocksource *clock)
652{ 666{
653 struct timekeeper *tk = &timekeeper; 667 struct timekeeper *tk = &timekeeper;
654 668
655 if (tk->clock == clock) 669 if (tk->clock == clock)
656 return; 670 return 0;
657 stop_machine(change_clocksource, clock, NULL); 671 stop_machine(change_clocksource, clock, NULL);
658 tick_clock_notify(); 672 tick_clock_notify();
673 return tk->clock == clock ? 0 : -1;
659} 674}
660 675
661/** 676/**
@@ -841,6 +856,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
841 tk_xtime_add(tk, delta); 856 tk_xtime_add(tk, delta);
842 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); 857 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta));
843 tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); 858 tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta));
859 tk_debug_account_sleep_time(delta);
844} 860}
845 861
846/** 862/**
@@ -872,7 +888,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
872 888
873 __timekeeping_inject_sleeptime(tk, delta); 889 __timekeeping_inject_sleeptime(tk, delta);
874 890
875 timekeeping_update(tk, true, true); 891 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
876 892
877 write_seqcount_end(&timekeeper_seq); 893 write_seqcount_end(&timekeeper_seq);
878 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 894 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -954,7 +970,7 @@ static void timekeeping_resume(void)
954 tk->cycle_last = clock->cycle_last = cycle_now; 970 tk->cycle_last = clock->cycle_last = cycle_now;
955 tk->ntp_error = 0; 971 tk->ntp_error = 0;
956 timekeeping_suspended = 0; 972 timekeeping_suspended = 0;
957 timekeeping_update(tk, false, true); 973 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
958 write_seqcount_end(&timekeeper_seq); 974 write_seqcount_end(&timekeeper_seq);
959 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 975 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
960 976
@@ -1236,9 +1252,10 @@ out_adjust:
1236 * It also calls into the NTP code to handle leapsecond processing. 1252 * It also calls into the NTP code to handle leapsecond processing.
1237 * 1253 *
1238 */ 1254 */
1239static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) 1255static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
1240{ 1256{
1241 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; 1257 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift;
1258 unsigned int action = 0;
1242 1259
1243 while (tk->xtime_nsec >= nsecps) { 1260 while (tk->xtime_nsec >= nsecps) {
1244 int leap; 1261 int leap;
@@ -1261,8 +1278,10 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1261 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); 1278 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
1262 1279
1263 clock_was_set_delayed(); 1280 clock_was_set_delayed();
1281 action = TK_CLOCK_WAS_SET;
1264 } 1282 }
1265 } 1283 }
1284 return action;
1266} 1285}
1267 1286
1268/** 1287/**
@@ -1347,6 +1366,7 @@ static void update_wall_time(void)
1347 struct timekeeper *tk = &shadow_timekeeper; 1366 struct timekeeper *tk = &shadow_timekeeper;
1348 cycle_t offset; 1367 cycle_t offset;
1349 int shift = 0, maxshift; 1368 int shift = 0, maxshift;
1369 unsigned int action;
1350 unsigned long flags; 1370 unsigned long flags;
1351 1371
1352 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1372 raw_spin_lock_irqsave(&timekeeper_lock, flags);
@@ -1399,7 +1419,7 @@ static void update_wall_time(void)
1399 * Finally, make sure that after the rounding 1419 * Finally, make sure that after the rounding
1400 * xtime_nsec isn't larger than NSEC_PER_SEC 1420 * xtime_nsec isn't larger than NSEC_PER_SEC
1401 */ 1421 */
1402 accumulate_nsecs_to_secs(tk); 1422 action = accumulate_nsecs_to_secs(tk);
1403 1423
1404 write_seqcount_begin(&timekeeper_seq); 1424 write_seqcount_begin(&timekeeper_seq);
1405 /* Update clock->cycle_last with the new value */ 1425 /* Update clock->cycle_last with the new value */
@@ -1415,7 +1435,7 @@ static void update_wall_time(void)
1415 * updating. 1435 * updating.
1416 */ 1436 */
1417 memcpy(real_tk, tk, sizeof(*tk)); 1437 memcpy(real_tk, tk, sizeof(*tk));
1418 timekeeping_update(real_tk, false, false); 1438 timekeeping_update(real_tk, action);
1419 write_seqcount_end(&timekeeper_seq); 1439 write_seqcount_end(&timekeeper_seq);
1420out: 1440out:
1421 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1441 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -1677,6 +1697,7 @@ int do_adjtimex(struct timex *txc)
1677 1697
1678 if (tai != orig_tai) { 1698 if (tai != orig_tai) {
1679 __timekeeping_set_tai_offset(tk, tai); 1699 __timekeeping_set_tai_offset(tk, tai);
1700 update_pvclock_gtod(tk, true);
1680 clock_was_set_delayed(); 1701 clock_was_set_delayed();
1681 } 1702 }
1682 write_seqcount_end(&timekeeper_seq); 1703 write_seqcount_end(&timekeeper_seq);
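
timekeeping_update() previously took two booleans (clearntp, mirror); the patch folds them, plus the new "clock was set" notification, into a single action bitmask so that accumulate_nsecs_to_secs() can report a leap-second step back up to update_wall_time(). A trivial standalone sketch of how the bits compose; the demo harness is an assumption, the flag values mirror the #defines above.

#include <stdio.h>

#define TK_CLEAR_NTP            (1 << 0)
#define TK_MIRROR               (1 << 1)
#define TK_CLOCK_WAS_SET        (1 << 2)

static void fake_timekeeping_update(unsigned int action)
{
        if (action & TK_CLEAR_NTP)
                printf("  clear NTP state\n");
        if (action & TK_CLOCK_WAS_SET)
                printf("  pass 'clock was set' to the pvclock_gtod notifier chain\n");
        if (action & TK_MIRROR)
                printf("  mirror into shadow_timekeeper\n");
}

int main(void)
{
        printf("do_settimeofday() path:\n");
        fake_timekeeping_update(TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);

        printf("update_wall_time(), no leap second this tick:\n");
        fake_timekeeping_update(0);

        printf("update_wall_time(), leap second accumulated:\n");
        fake_timekeeping_update(TK_CLOCK_WAS_SET);
        return 0;
}
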
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
new file mode 100644
index 000000000000..802433a4f5eb
--- /dev/null
+++ b/kernel/time/timekeeping_debug.c
@@ -0,0 +1,72 @@
1/*
2 * debugfs file to track time spent in suspend
3 *
4 * Copyright (c) 2011, Google, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 */
16
17#include <linux/debugfs.h>
18#include <linux/err.h>
19#include <linux/init.h>
20#include <linux/kernel.h>
21#include <linux/seq_file.h>
22#include <linux/time.h>
23
24static unsigned int sleep_time_bin[32] = {0};
25
26static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
27{
28 unsigned int bin;
29 seq_puts(s, " time (secs) count\n");
30 seq_puts(s, "------------------------------\n");
31 for (bin = 0; bin < 32; bin++) {
32 if (sleep_time_bin[bin] == 0)
33 continue;
34 seq_printf(s, "%10u - %-10u %4u\n",
35 bin ? 1 << (bin - 1) : 0, 1 << bin,
36 sleep_time_bin[bin]);
37 }
38 return 0;
39}
40
41static int tk_debug_sleep_time_open(struct inode *inode, struct file *file)
42{
43 return single_open(file, tk_debug_show_sleep_time, NULL);
44}
45
46static const struct file_operations tk_debug_sleep_time_fops = {
47 .open = tk_debug_sleep_time_open,
48 .read = seq_read,
49 .llseek = seq_lseek,
50 .release = single_release,
51};
52
53static int __init tk_debug_sleep_time_init(void)
54{
55 struct dentry *d;
56
57 d = debugfs_create_file("sleep_time", 0444, NULL, NULL,
58 &tk_debug_sleep_time_fops);
59 if (!d) {
60 pr_err("Failed to create sleep_time debug file\n");
61 return -ENOMEM;
62 }
63
64 return 0;
65}
66late_initcall(tk_debug_sleep_time_init);
67
68void tk_debug_account_sleep_time(struct timespec *t)
69{
70 sleep_time_bin[fls(t->tv_sec)]++;
71}
72
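
tk_debug_account_sleep_time() buckets every suspend into a power-of-two histogram: a sleep of t seconds is counted in bin fls(t), i.e. the range [2^(bin-1), 2^bin) seconds, which is exactly what tk_debug_show_sleep_time() prints. A small userspace illustration of the binning; the local fls() is a stand-in for the kernel helper.

#include <stdio.h>

static int fls(unsigned int x)
{
        return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
        unsigned int secs[] = { 0, 1, 3, 75, 600 };

        for (unsigned int i = 0; i < sizeof(secs) / sizeof(secs[0]); i++) {
                int bin = fls(secs[i]);

                /* e.g. a 75 s suspend lands in bin 7, the 64 - 128 s bucket */
                printf("%4u s -> bin %2d (%u - %u s)\n", secs[i], bin,
                       bin ? 1u << (bin - 1) : 0, 1u << bin);
        }
        return 0;
}
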
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
new file mode 100644
index 000000000000..13323ea08ffa
--- /dev/null
+++ b/kernel/time/timekeeping_internal.h
@@ -0,0 +1,14 @@
1#ifndef _TIMEKEEPING_INTERNAL_H
2#define _TIMEKEEPING_INTERNAL_H
3/*
4 * timekeeping debug functions
5 */
6#include <linux/time.h>
7
8#ifdef CONFIG_DEBUG_FS
9extern void tk_debug_account_sleep_time(struct timespec *t);
10#else
11#define tk_debug_account_sleep_time(x)
12#endif
13
14#endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/timer.c b/kernel/timer.c
index 15ffdb3f1948..4296d13db3d1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -149,9 +149,11 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu,
149 /* now that we have rounded, subtract the extra skew again */ 149 /* now that we have rounded, subtract the extra skew again */
150 j -= cpu * 3; 150 j -= cpu * 3;
151 151
152 if (j <= jiffies) /* rounding ate our timeout entirely; */ 152 /*
153 return original; 153 * Make sure j is still in the future. Otherwise return the
154 return j; 154 * unmodified value.
155 */
156 return time_is_after_jiffies(j) ? j : original;
155} 157}
156 158
157/** 159/**
@@ -1503,11 +1505,11 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1503} 1505}
1504EXPORT_SYMBOL(schedule_timeout_uninterruptible); 1506EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1505 1507
1506static int __cpuinit init_timers_cpu(int cpu) 1508static int init_timers_cpu(int cpu)
1507{ 1509{
1508 int j; 1510 int j;
1509 struct tvec_base *base; 1511 struct tvec_base *base;
1510 static char __cpuinitdata tvec_base_done[NR_CPUS]; 1512 static char tvec_base_done[NR_CPUS];
1511 1513
1512 if (!tvec_base_done[cpu]) { 1514 if (!tvec_base_done[cpu]) {
1513 static char boot_done; 1515 static char boot_done;
@@ -1575,7 +1577,7 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
1575 } 1577 }
1576} 1578}
1577 1579
1578static void __cpuinit migrate_timers(int cpu) 1580static void migrate_timers(int cpu)
1579{ 1581{
1580 struct tvec_base *old_base; 1582 struct tvec_base *old_base;
1581 struct tvec_base *new_base; 1583 struct tvec_base *new_base;
@@ -1608,7 +1610,7 @@ static void __cpuinit migrate_timers(int cpu)
1608} 1610}
1609#endif /* CONFIG_HOTPLUG_CPU */ 1611#endif /* CONFIG_HOTPLUG_CPU */
1610 1612
1611static int __cpuinit timer_cpu_notify(struct notifier_block *self, 1613static int timer_cpu_notify(struct notifier_block *self,
1612 unsigned long action, void *hcpu) 1614 unsigned long action, void *hcpu)
1613{ 1615{
1614 long cpu = (long)hcpu; 1616 long cpu = (long)hcpu;
@@ -1633,7 +1635,7 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
1633 return NOTIFY_OK; 1635 return NOTIFY_OK;
1634} 1636}
1635 1637
1636static struct notifier_block __cpuinitdata timers_nb = { 1638static struct notifier_block timers_nb = {
1637 .notifier_call = timer_cpu_notify, 1639 .notifier_call = timer_cpu_notify,
1638}; 1640};
1639 1641
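
The round_jiffies_common() change replaces the unsigned "j <= jiffies" test with time_is_after_jiffies(), which uses the signed-difference idiom and therefore keeps giving the right answer when jiffies wraps around. A minimal illustration of the idiom with a simulated 32-bit counter; the macro below is a userspace stand-in for the kernel's time_after().

#include <stdint.h>
#include <stdio.h>

/* signed-difference comparison (32-bit stand-in for the kernel macro) */
#define time_after(a, b)        ((int32_t)((b) - (a)) < 0)

int main(void)
{
        uint32_t jiffies = 0xfffffff0u;         /* counter about to wrap */
        uint32_t rounded = jiffies + 0x20;      /* wraps to 0x10 */

        printf("naive 'rounded > jiffies':    %d\n", rounded > jiffies);            /* 0: wrong */
        printf("time_after(rounded, jiffies): %d\n", time_after(rounded, jiffies)); /* 1: right */
        return 0;
}
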
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6c508ff33c62..a6d098c6df3f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -413,6 +413,17 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
413 return 0; 413 return 0;
414} 414}
415 415
416static void ftrace_sync(struct work_struct *work)
417{
418 /*
419 * This function is just a stub to implement a hard force
420 * of synchronize_sched(). This requires synchronizing
421 * tasks even in userspace and idle.
422 *
423 * Yes, function tracing is rude.
424 */
425}
426
416static int __unregister_ftrace_function(struct ftrace_ops *ops) 427static int __unregister_ftrace_function(struct ftrace_ops *ops)
417{ 428{
418 int ret; 429 int ret;
@@ -440,8 +451,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
440 * so there'll be no new users. We must ensure 451 * so there'll be no new users. We must ensure
441 * all current users are done before we free 452 * all current users are done before we free
442 * the control data. 453 * the control data.
454 * Note synchronize_sched() is not enough, as we
455 * use preempt_disable() to do RCU, but the function
456 * tracer can be called where RCU is not active
457 * (before user_exit()).
443 */ 458 */
444 synchronize_sched(); 459 schedule_on_each_cpu(ftrace_sync);
445 control_ops_free(ops); 460 control_ops_free(ops);
446 } 461 }
447 } else 462 } else
@@ -456,9 +471,13 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
456 /* 471 /*
457 * Dynamic ops may be freed, we must make sure that all 472 * Dynamic ops may be freed, we must make sure that all
458 * callers are done before leaving this function. 473 * callers are done before leaving this function.
474 *
475 * Again, normal synchronize_sched() is not good enough.
476 * We need to do a hard force of sched synchronization.
459 */ 477 */
460 if (ops->flags & FTRACE_OPS_FL_DYNAMIC) 478 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
461 synchronize_sched(); 479 schedule_on_each_cpu(ftrace_sync);
480
462 481
463 return 0; 482 return 0;
464} 483}
@@ -622,12 +641,18 @@ static int function_stat_show(struct seq_file *m, void *v)
622 if (rec->counter <= 1) 641 if (rec->counter <= 1)
623 stddev = 0; 642 stddev = 0;
624 else { 643 else {
625 stddev = rec->time_squared - rec->counter * avg * avg; 644 /*
645 * Apply Welford's method:
646 * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
647 */
648 stddev = rec->counter * rec->time_squared -
649 rec->time * rec->time;
650
626 /* 651 /*
627 * Divide only 1000 for ns^2 -> us^2 conversion. 652 * Divide only 1000 for ns^2 -> us^2 conversion.
628 * trace_print_graph_duration will divide 1000 again. 653 * trace_print_graph_duration will divide 1000 again.
629 */ 654 */
630 do_div(stddev, (rec->counter - 1) * 1000); 655 do_div(stddev, rec->counter * (rec->counter - 1) * 1000);
631 } 656 }
632 657
633 trace_seq_init(&s); 658 trace_seq_init(&s);
@@ -1416,12 +1441,22 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1416 * the hashes are freed with call_rcu_sched(). 1441 * the hashes are freed with call_rcu_sched().
1417 */ 1442 */
1418static int 1443static int
1419ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) 1444ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
1420{ 1445{
1421 struct ftrace_hash *filter_hash; 1446 struct ftrace_hash *filter_hash;
1422 struct ftrace_hash *notrace_hash; 1447 struct ftrace_hash *notrace_hash;
1423 int ret; 1448 int ret;
1424 1449
1450#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
1451 /*
1452 * There's a small race when adding ops that the ftrace handler
1453 * that wants regs, may be called without them. We can not
1454 * allow that handler to be called if regs is NULL.
1455 */
1456 if (regs == NULL && (ops->flags & FTRACE_OPS_FL_SAVE_REGS))
1457 return 0;
1458#endif
1459
1425 filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); 1460 filter_hash = rcu_dereference_raw_notrace(ops->filter_hash);
1426 notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); 1461 notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash);
1427 1462
@@ -2134,12 +2169,57 @@ static cycle_t ftrace_update_time;
2134static unsigned long ftrace_update_cnt; 2169static unsigned long ftrace_update_cnt;
2135unsigned long ftrace_update_tot_cnt; 2170unsigned long ftrace_update_tot_cnt;
2136 2171
2137static int ops_traces_mod(struct ftrace_ops *ops) 2172static inline int ops_traces_mod(struct ftrace_ops *ops)
2138{ 2173{
2139 struct ftrace_hash *hash; 2174 /*
2175 * Filter_hash being empty will default to trace module.
2176 * But notrace hash requires a test of individual module functions.
2177 */
2178 return ftrace_hash_empty(ops->filter_hash) &&
2179 ftrace_hash_empty(ops->notrace_hash);
2180}
2181
2182/*
2183 * Check if the current ops references the record.
2184 *
2185 * If the ops traces all functions, then it was already accounted for.
2186 * If the ops does not trace the current record function, skip it.
2187 * If the ops ignores the function via notrace filter, skip it.
2188 */
2189static inline bool
2190ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
2191{
2192 /* If ops isn't enabled, ignore it */
2193 if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
2194 return 0;
2140 2195
2141 hash = ops->filter_hash; 2196 /* If ops traces all mods, we already accounted for it */
2142 return ftrace_hash_empty(hash); 2197 if (ops_traces_mod(ops))
2198 return 0;
2199
2200 /* The function must be in the filter */
2201 if (!ftrace_hash_empty(ops->filter_hash) &&
2202 !ftrace_lookup_ip(ops->filter_hash, rec->ip))
2203 return 0;
2204
2205 /* If in notrace hash, we ignore it too */
2206 if (ftrace_lookup_ip(ops->notrace_hash, rec->ip))
2207 return 0;
2208
2209 return 1;
2210}
2211
2212static int referenced_filters(struct dyn_ftrace *rec)
2213{
2214 struct ftrace_ops *ops;
2215 int cnt = 0;
2216
2217 for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
2218 if (ops_references_rec(ops, rec))
2219 cnt++;
2220 }
2221
2222 return cnt;
2143} 2223}
2144 2224
2145static int ftrace_update_code(struct module *mod) 2225static int ftrace_update_code(struct module *mod)
@@ -2148,6 +2228,7 @@ static int ftrace_update_code(struct module *mod)
2148 struct dyn_ftrace *p; 2228 struct dyn_ftrace *p;
2149 cycle_t start, stop; 2229 cycle_t start, stop;
2150 unsigned long ref = 0; 2230 unsigned long ref = 0;
2231 bool test = false;
2151 int i; 2232 int i;
2152 2233
2153 /* 2234 /*
@@ -2161,9 +2242,12 @@ static int ftrace_update_code(struct module *mod)
2161 2242
2162 for (ops = ftrace_ops_list; 2243 for (ops = ftrace_ops_list;
2163 ops != &ftrace_list_end; ops = ops->next) { 2244 ops != &ftrace_list_end; ops = ops->next) {
2164 if (ops->flags & FTRACE_OPS_FL_ENABLED && 2245 if (ops->flags & FTRACE_OPS_FL_ENABLED) {
2165 ops_traces_mod(ops)) 2246 if (ops_traces_mod(ops))
2166 ref++; 2247 ref++;
2248 else
2249 test = true;
2250 }
2167 } 2251 }
2168 } 2252 }
2169 2253
@@ -2173,12 +2257,16 @@ static int ftrace_update_code(struct module *mod)
2173 for (pg = ftrace_new_pgs; pg; pg = pg->next) { 2257 for (pg = ftrace_new_pgs; pg; pg = pg->next) {
2174 2258
2175 for (i = 0; i < pg->index; i++) { 2259 for (i = 0; i < pg->index; i++) {
2260 int cnt = ref;
2261
2176 /* If something went wrong, bail without enabling anything */ 2262 /* If something went wrong, bail without enabling anything */
2177 if (unlikely(ftrace_disabled)) 2263 if (unlikely(ftrace_disabled))
2178 return -1; 2264 return -1;
2179 2265
2180 p = &pg->records[i]; 2266 p = &pg->records[i];
2181 p->flags = ref; 2267 if (test)
2268 cnt += referenced_filters(p);
2269 p->flags = cnt;
2182 2270
2183 /* 2271 /*
2184 * Do the initial record conversion from mcount jump 2272 * Do the initial record conversion from mcount jump
@@ -2198,7 +2286,7 @@ static int ftrace_update_code(struct module *mod)
2198 * conversion puts the module to the correct state, thus 2286 * conversion puts the module to the correct state, thus
2199 * passing the ftrace_make_call check. 2287 * passing the ftrace_make_call check.
2200 */ 2288 */
2201 if (ftrace_start_up && ref) { 2289 if (ftrace_start_up && cnt) {
2202 int failed = __ftrace_replace_code(p, 1); 2290 int failed = __ftrace_replace_code(p, 1);
2203 if (failed) 2291 if (failed)
2204 ftrace_bug(failed, p->ip); 2292 ftrace_bug(failed, p->ip);
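
The loop above decides the initial ->flags count for every new module record: enabled ops that trace all module functions contribute once to the shared ref base, while any enabled ops carrying a filter or notrace hash sets test and forces a per-record referenced_filters() walk. Below is a minimal user-space model of that counting, using toy arrays in place of struct ftrace_hash; only the logic is borrowed from the hunks above, all names and types here are illustrative.

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for ftrace_hash: an empty set means "no restriction". */
struct toy_hash { const unsigned long *ips; int nr; };

struct toy_ops {
	bool enabled;
	struct toy_hash filter;    /* empty => trace everything */
	struct toy_hash notrace;   /* empty => exclude nothing  */
	struct toy_ops *next;
};

static bool hash_empty(const struct toy_hash *h) { return h->nr == 0; }

static bool hash_lookup(const struct toy_hash *h, unsigned long ip)
{
	for (int i = 0; i < h->nr; i++)
		if (h->ips[i] == ip)
			return true;
	return false;
}

/* Mirrors ops_traces_mod(): no filter and no notrace => traces all module code. */
static bool traces_all_mods(const struct toy_ops *ops)
{
	return hash_empty(&ops->filter) && hash_empty(&ops->notrace);
}

/* Mirrors ops_references_rec(): does this ops care about this one address? */
static bool references_rec(const struct toy_ops *ops, unsigned long ip)
{
	if (!ops->enabled)
		return false;
	if (traces_all_mods(ops))       /* already counted in the shared ref */
		return false;
	if (!hash_empty(&ops->filter) && !hash_lookup(&ops->filter, ip))
		return false;
	if (hash_lookup(&ops->notrace, ip))
		return false;
	return true;
}

int main(void)
{
	const unsigned long wanted[] = { 0x1000 };
	struct toy_ops filtered  = { .enabled = true, .filter = { wanted, 1 } };
	struct toy_ops trace_all = { .enabled = true, .next = &filtered };
	struct toy_ops *list = &trace_all;

	/* First pass, as in ftrace_update_code(): shared base + "need per-record test?" */
	unsigned long ref = 0;
	bool test = false;
	for (struct toy_ops *ops = list; ops; ops = ops->next) {
		if (!ops->enabled)
			continue;
		if (traces_all_mods(ops))
			ref++;
		else
			test = true;
	}

	/* Second pass: each record's flags = base count + ops that reference it. */
	for (unsigned long ip = 0x1000; ip <= 0x2000; ip += 0x1000) {
		unsigned long cnt = ref;
		if (test)
			for (struct toy_ops *ops = list; ops; ops = ops->next)
				cnt += references_rec(ops, ip);
		printf("record %#lx -> flags = %lu\n", ip, cnt);
	}
	return 0;
}

With that count in place, a record is only patched in when at least one ops actually wants it, which is what the ftrace_start_up && cnt check above relies on.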
@@ -3349,6 +3437,12 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
3349 return add_hash_entry(hash, ip); 3437 return add_hash_entry(hash, ip);
3350} 3438}
3351 3439
3440static void ftrace_ops_update_code(struct ftrace_ops *ops)
3441{
3442 if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled)
3443 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
3444}
3445
3352static int 3446static int
3353ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, 3447ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3354 unsigned long ip, int remove, int reset, int enable) 3448 unsigned long ip, int remove, int reset, int enable)
@@ -3391,9 +3485,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3391 3485
3392 mutex_lock(&ftrace_lock); 3486 mutex_lock(&ftrace_lock);
3393 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 3487 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
3394 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED 3488 if (!ret)
3395 && ftrace_enabled) 3489 ftrace_ops_update_code(ops);
3396 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
3397 3490
3398 mutex_unlock(&ftrace_lock); 3491 mutex_unlock(&ftrace_lock);
3399 3492
@@ -3512,8 +3605,12 @@ EXPORT_SYMBOL_GPL(ftrace_set_global_notrace);
3512static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; 3605static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
3513static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; 3606static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
3514 3607
3608/* Used by the function selftest to skip testing when a boot-time filter is set */
3609bool ftrace_filter_param __initdata;
3610
3515static int __init set_ftrace_notrace(char *str) 3611static int __init set_ftrace_notrace(char *str)
3516{ 3612{
3613 ftrace_filter_param = true;
3517 strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); 3614 strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
3518 return 1; 3615 return 1;
3519} 3616}
@@ -3521,6 +3618,7 @@ __setup("ftrace_notrace=", set_ftrace_notrace);
3521 3618
3522static int __init set_ftrace_filter(char *str) 3619static int __init set_ftrace_filter(char *str)
3523{ 3620{
3621 ftrace_filter_param = true;
3524 strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); 3622 strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
3525 return 1; 3623 return 1;
3526} 3624}
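
ftrace_filter_param only records that one of the two filter parameters appeared on the kernel command line, so the boot-time function selftest can skip tests that such a filter would break. The same __setup() shape, written with hypothetical my_* names rather than code from this patch, looks like this:

#include <linux/init.h>
#include <linux/string.h>
#include <linux/types.h>

#define MY_FILTER_SIZE 256

static char my_filter_buf[MY_FILTER_SIZE] __initdata;  /* only needed during boot */
bool my_filter_param __initdata;                        /* checked by an __init selftest */

static int __init set_my_filter(char *str)
{
	my_filter_param = true;                        /* remember that a filter was given */
	strlcpy(my_filter_buf, str, MY_FILTER_SIZE);   /* keep the value for later parsing */
	return 1;                                      /* non-zero: parameter consumed */
}
__setup("my_filter=", set_my_filter);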
@@ -3615,9 +3713,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
3615 mutex_lock(&ftrace_lock); 3713 mutex_lock(&ftrace_lock);
3616 ret = ftrace_hash_move(iter->ops, filter_hash, 3714 ret = ftrace_hash_move(iter->ops, filter_hash,
3617 orig_hash, iter->hash); 3715 orig_hash, iter->hash);
3618 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) 3716 if (!ret)
3619 && ftrace_enabled) 3717 ftrace_ops_update_code(iter->ops);
3620 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
3621 3718
3622 mutex_unlock(&ftrace_lock); 3719 mutex_unlock(&ftrace_lock);
3623 } 3720 }
@@ -4188,7 +4285,7 @@ static inline void ftrace_startup_enable(int command) { }
4188# define ftrace_shutdown_sysctl() do { } while (0) 4285# define ftrace_shutdown_sysctl() do { } while (0)
4189 4286
4190static inline int 4287static inline int
4191ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) 4288ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
4192{ 4289{
4193 return 1; 4290 return 1;
4194} 4291}
@@ -4211,7 +4308,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4211 do_for_each_ftrace_op(op, ftrace_control_list) { 4308 do_for_each_ftrace_op(op, ftrace_control_list) {
4212 if (!(op->flags & FTRACE_OPS_FL_STUB) && 4309 if (!(op->flags & FTRACE_OPS_FL_STUB) &&
4213 !ftrace_function_local_disabled(op) && 4310 !ftrace_function_local_disabled(op) &&
4214 ftrace_ops_test(op, ip)) 4311 ftrace_ops_test(op, ip, regs))
4215 op->func(ip, parent_ip, op, regs); 4312 op->func(ip, parent_ip, op, regs);
4216 } while_for_each_ftrace_op(op); 4313 } while_for_each_ftrace_op(op);
4217 trace_recursion_clear(TRACE_CONTROL_BIT); 4314 trace_recursion_clear(TRACE_CONTROL_BIT);
@@ -4244,7 +4341,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4244 */ 4341 */
4245 preempt_disable_notrace(); 4342 preempt_disable_notrace();
4246 do_for_each_ftrace_op(op, ftrace_ops_list) { 4343 do_for_each_ftrace_op(op, ftrace_ops_list) {
4247 if (ftrace_ops_test(op, ip)) 4344 if (ftrace_ops_test(op, ip, regs))
4248 op->func(ip, parent_ip, op, regs); 4345 op->func(ip, parent_ip, op, regs);
4249 } while_for_each_ftrace_op(op); 4346 } while_for_each_ftrace_op(op);
4250 preempt_enable_notrace(); 4347 preempt_enable_notrace();
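
Both call sites above now hand regs through to ftrace_ops_test(), so a per-ops test can look at more than the instruction pointer. Stripped of the kernel's RCU and recursion protection, the dispatch pattern is simply "walk the ops list and call each callback whose own test accepts (ip, regs)"; a compilable toy version, using none of the kernel types:

#include <stdbool.h>
#include <stdio.h>

struct regs;                          /* opaque here, like pt_regs */

struct op {
	bool (*test)(struct op *op, unsigned long ip, struct regs *regs);
	void (*func)(unsigned long ip, unsigned long parent_ip,
		     struct op *op, struct regs *regs);
	struct op *next;
};

/* Same shape as __ftrace_ops_list_func(): only ops that accept (ip, regs) run. */
static void call_ops(struct op *list, unsigned long ip,
		     unsigned long parent_ip, struct regs *regs)
{
	for (struct op *op = list; op; op = op->next)
		if (op->test(op, ip, regs))
			op->func(ip, parent_ip, op, regs);
}

static bool accept_nonzero(struct op *op, unsigned long ip, struct regs *regs)
{
	(void)op; (void)regs;
	return ip != 0;               /* stand-in for a filter-hash lookup */
}

static void print_hit(unsigned long ip, unsigned long parent_ip,
		      struct op *op, struct regs *regs)
{
	(void)op; (void)regs;
	printf("traced %#lx (called from %#lx)\n", ip, parent_ip);
}

int main(void)
{
	struct op one = { accept_nonzero, print_hit, NULL };
	call_ops(&one, 0x1234, 0x5678, NULL);
	return 0;
}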
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e444ff88f0a4..cc2f66f68dc5 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -36,11 +36,11 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
36{ 36{
37 int ret; 37 int ret;
38 38
39 ret = trace_seq_printf(s, "# compressed entry header\n"); 39 ret = trace_seq_puts(s, "# compressed entry header\n");
40 ret = trace_seq_printf(s, "\ttype_len : 5 bits\n"); 40 ret = trace_seq_puts(s, "\ttype_len : 5 bits\n");
41 ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); 41 ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n");
42 ret = trace_seq_printf(s, "\tarray : 32 bits\n"); 42 ret = trace_seq_puts(s, "\tarray : 32 bits\n");
43 ret = trace_seq_printf(s, "\n"); 43 ret = trace_seq_putc(s, '\n');
44 ret = trace_seq_printf(s, "\tpadding : type == %d\n", 44 ret = trace_seq_printf(s, "\tpadding : type == %d\n",
45 RINGBUF_TYPE_PADDING); 45 RINGBUF_TYPE_PADDING);
46 ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", 46 ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
@@ -1066,7 +1066,7 @@ static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
1066} 1066}
1067 1067
1068/** 1068/**
1069 * check_pages - integrity check of buffer pages 1069 * rb_check_pages - integrity check of buffer pages
1070 * @cpu_buffer: CPU buffer with pages to test 1070 * @cpu_buffer: CPU buffer with pages to test
1071 * 1071 *
1072 * As a safety measure we check to make sure the data pages have not 1072 * As a safety measure we check to make sure the data pages have not
@@ -1258,7 +1258,7 @@ static int rb_cpu_notify(struct notifier_block *self,
1258#endif 1258#endif
1259 1259
1260/** 1260/**
1261 * ring_buffer_alloc - allocate a new ring_buffer 1261 * __ring_buffer_alloc - allocate a new ring_buffer
1262 * @size: the size in bytes per cpu that is needed. 1262 * @size: the size in bytes per cpu that is needed.
1263 * @flags: attributes to set for the ring buffer. 1263 * @flags: attributes to set for the ring buffer.
1264 * 1264 *
@@ -1607,6 +1607,7 @@ static void update_pages_handler(struct work_struct *work)
1607 * ring_buffer_resize - resize the ring buffer 1607 * ring_buffer_resize - resize the ring buffer
1608 * @buffer: the buffer to resize. 1608 * @buffer: the buffer to resize.
1609 * @size: the new size. 1609 * @size: the new size.
1610 * @cpu_id: the cpu buffer to resize
1610 * 1611 *
1611 * Minimum size is 2 * BUF_PAGE_SIZE. 1612 * Minimum size is 2 * BUF_PAGE_SIZE.
1612 * 1613 *
@@ -3956,11 +3957,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
3956 * expected. 3957 * expected.
3957 * 3958 *
3958 * After a sequence of ring_buffer_read_prepare calls, the user is 3959 * After a sequence of ring_buffer_read_prepare calls, the user is
3959 * expected to make at least one call to ring_buffer_prepare_sync. 3960 * expected to make at least one call to ring_buffer_read_prepare_sync.
3960 * Afterwards, ring_buffer_read_start is invoked to get things going 3961 * Afterwards, ring_buffer_read_start is invoked to get things going
3961 * for real. 3962 * for real.
3962 * 3963 *
3963 * This overall must be paired with ring_buffer_finish. 3964 * This overall must be paired with ring_buffer_read_finish.
3964 */ 3965 */
3965struct ring_buffer_iter * 3966struct ring_buffer_iter *
3966ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) 3967ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
@@ -4009,7 +4010,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
4009 * an intervening ring_buffer_read_prepare_sync must have been 4010 * an intervening ring_buffer_read_prepare_sync must have been
4010 * performed. 4011 * performed.
4011 * 4012 *
4012 * Must be paired with ring_buffer_finish. 4013 * Must be paired with ring_buffer_read_finish.
4013 */ 4014 */
4014void 4015void
4015ring_buffer_read_start(struct ring_buffer_iter *iter) 4016ring_buffer_read_start(struct ring_buffer_iter *iter)
@@ -4031,7 +4032,7 @@ ring_buffer_read_start(struct ring_buffer_iter *iter)
4031EXPORT_SYMBOL_GPL(ring_buffer_read_start); 4032EXPORT_SYMBOL_GPL(ring_buffer_read_start);
4032 4033
4033/** 4034/**
4034 * ring_buffer_finish - finish reading the iterator of the buffer 4035 * ring_buffer_read_finish - finish reading the iterator of the buffer
4035 * @iter: The iterator retrieved by ring_buffer_start 4036 * @iter: The iterator retrieved by ring_buffer_start
4036 * 4037 *
4037 * This re-enables the recording to the buffer, and frees the 4038 * This re-enables the recording to the buffer, and frees the
@@ -4346,6 +4347,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
4346/** 4347/**
4347 * ring_buffer_alloc_read_page - allocate a page to read from buffer 4348 * ring_buffer_alloc_read_page - allocate a page to read from buffer
4348 * @buffer: the buffer to allocate for. 4349 * @buffer: the buffer to allocate for.
4350 * @cpu: the cpu buffer to allocate.
4349 * 4351 *
4350 * This function is used in conjunction with ring_buffer_read_page. 4352 * This function is used in conjunction with ring_buffer_read_page.
4351 * When reading a full page from the ring buffer, these functions 4353 * When reading a full page from the ring buffer, these functions
@@ -4403,7 +4405,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
4403 * to swap with a page in the ring buffer. 4405 * to swap with a page in the ring buffer.
4404 * 4406 *
4405 * for example: 4407 * for example:
4406 * rpage = ring_buffer_alloc_read_page(buffer); 4408 * rpage = ring_buffer_alloc_read_page(buffer, cpu);
4407 * if (!rpage) 4409 * if (!rpage)
4408 * return error; 4410 * return error;
4409 * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); 4411 * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
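
The kernel-doc corrections above are easiest to read as a usage contract: ring_buffer_alloc_read_page()/ring_buffer_free_read_page() come in a pair, as do ring_buffer_read_prepare(), ..._read_prepare_sync(), ..._read_start() and ..._read_finish(). A sketch of a page reader, assuming the 3.11-era prototypes visible in these hunks and skipping most error handling:

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/ring_buffer.h>

/* Copy one page worth of events for @cpu out of @buffer. */
static int drain_one_page(struct ring_buffer *buffer, int cpu)
{
	void *rpage;
	int ret;

	rpage = ring_buffer_alloc_read_page(buffer, cpu);    /* note the @cpu argument */
	if (!rpage)
		return -ENOMEM;

	/* last argument 0: partial pages are fine, see ring_buffer_read_page() */
	ret = ring_buffer_read_page(buffer, &rpage, PAGE_SIZE, cpu, 0);

	ring_buffer_free_read_page(buffer, rpage);           /* always balance the alloc */
	return ret < 0 ? ret : 0;
}

The iterator side follows the same discipline: each chain of ring_buffer_read_prepare() calls needs one ring_buffer_read_prepare_sync(), then ring_buffer_read_start(), and finally one ring_buffer_read_finish() per iterator, which is exactly what the renamed references now say.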
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e71a8be4a6ee..496f94d57698 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -115,6 +115,9 @@ cpumask_var_t __read_mostly tracing_buffer_mask;
115 115
116enum ftrace_dump_mode ftrace_dump_on_oops; 116enum ftrace_dump_mode ftrace_dump_on_oops;
117 117
118/* When set, tracing will stop when a WARN*() is hit */
119int __disable_trace_on_warning;
120
118static int tracing_set_tracer(const char *buf); 121static int tracing_set_tracer(const char *buf);
119 122
120#define MAX_TRACER_SIZE 100 123#define MAX_TRACER_SIZE 100
@@ -149,6 +152,13 @@ static int __init set_ftrace_dump_on_oops(char *str)
149} 152}
150__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 153__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
151 154
155static int __init stop_trace_on_warning(char *str)
156{
157 __disable_trace_on_warning = 1;
158 return 1;
159}
160__setup("traceoff_on_warning=", stop_trace_on_warning);
161
152static int __init boot_alloc_snapshot(char *str) 162static int __init boot_alloc_snapshot(char *str)
153{ 163{
154 allocate_snapshot = true; 164 allocate_snapshot = true;
@@ -170,6 +180,7 @@ static int __init set_trace_boot_options(char *str)
170} 180}
171__setup("trace_options=", set_trace_boot_options); 181__setup("trace_options=", set_trace_boot_options);
172 182
183
173unsigned long long ns2usecs(cycle_t nsec) 184unsigned long long ns2usecs(cycle_t nsec)
174{ 185{
175 nsec += 500; 186 nsec += 500;
@@ -193,6 +204,37 @@ static struct trace_array global_trace;
193 204
194LIST_HEAD(ftrace_trace_arrays); 205LIST_HEAD(ftrace_trace_arrays);
195 206
207int trace_array_get(struct trace_array *this_tr)
208{
209 struct trace_array *tr;
210 int ret = -ENODEV;
211
212 mutex_lock(&trace_types_lock);
213 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
214 if (tr == this_tr) {
215 tr->ref++;
216 ret = 0;
217 break;
218 }
219 }
220 mutex_unlock(&trace_types_lock);
221
222 return ret;
223}
224
225static void __trace_array_put(struct trace_array *this_tr)
226{
227 WARN_ON(!this_tr->ref);
228 this_tr->ref--;
229}
230
231void trace_array_put(struct trace_array *this_tr)
232{
233 mutex_lock(&trace_types_lock);
234 __trace_array_put(this_tr);
235 mutex_unlock(&trace_types_lock);
236}
237
196int filter_current_check_discard(struct ring_buffer *buffer, 238int filter_current_check_discard(struct ring_buffer *buffer,
197 struct ftrace_event_call *call, void *rec, 239 struct ftrace_event_call *call, void *rec,
198 struct ring_buffer_event *event) 240 struct ring_buffer_event *event)
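
trace_array_get() only takes a reference if the array is still on ftrace_trace_arrays, which is what lets instance removal race safely against open(): a caller either finds the array on the list and pins it, or gets -ENODEV. The "validate membership under the lock, then count" pattern, reduced to a standalone pthread model with toy types:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct toy_array { struct toy_array *next; int ref; const char *name; };

static pthread_mutex_t types_lock = PTHREAD_MUTEX_INITIALIZER;
static struct toy_array *array_list;      /* plays the role of ftrace_trace_arrays */

/* Succeeds only while the array is still on the list, like trace_array_get(). */
static int toy_array_get(struct toy_array *this_tr)
{
	int ret = -ENODEV;

	pthread_mutex_lock(&types_lock);
	for (struct toy_array *tr = array_list; tr; tr = tr->next) {
		if (tr == this_tr) {
			tr->ref++;
			ret = 0;
			break;
		}
	}
	pthread_mutex_unlock(&types_lock);
	return ret;
}

static void toy_array_put(struct toy_array *this_tr)
{
	pthread_mutex_lock(&types_lock);
	this_tr->ref--;                       /* caller must already hold a reference */
	pthread_mutex_unlock(&types_lock);
}

int main(void)
{
	struct toy_array global = { .name = "global" };
	struct toy_array gone   = { .name = "removed" };   /* never put on the list */

	array_list = &global;

	printf("get(global)  -> %d (ref=%d)\n", toy_array_get(&global), global.ref);
	printf("get(removed) -> %d (ref=%d)\n", toy_array_get(&gone), gone.ref);
	toy_array_put(&global);
	return 0;
}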
@@ -201,23 +243,43 @@ int filter_current_check_discard(struct ring_buffer *buffer,
201} 243}
202EXPORT_SYMBOL_GPL(filter_current_check_discard); 244EXPORT_SYMBOL_GPL(filter_current_check_discard);
203 245
204cycle_t ftrace_now(int cpu) 246cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
205{ 247{
206 u64 ts; 248 u64 ts;
207 249
208 /* Early boot up does not have a buffer yet */ 250 /* Early boot up does not have a buffer yet */
209 if (!global_trace.trace_buffer.buffer) 251 if (!buf->buffer)
210 return trace_clock_local(); 252 return trace_clock_local();
211 253
212 ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu); 254 ts = ring_buffer_time_stamp(buf->buffer, cpu);
213 ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts); 255 ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts);
214 256
215 return ts; 257 return ts;
216} 258}
217 259
260cycle_t ftrace_now(int cpu)
261{
262 return buffer_ftrace_now(&global_trace.trace_buffer, cpu);
263}
264
265/**
266 * tracing_is_enabled - Show if global_trace has been enabled
267 *
268 * Shows if the global trace has been enabled or not. It uses the
269 * mirror flag "buffer_disabled" so that fast paths such as the
270 * irqsoff tracer can check it cheaply. But it may be inaccurate due
271 * to races. If you need to know the accurate state, use
272 * tracing_is_on(), which is a little slower but accurate.
273 */
218int tracing_is_enabled(void) 274int tracing_is_enabled(void)
219{ 275{
220 return tracing_is_on(); 276 /*
277 * For quick access (irqsoff uses this in fast path), just
278 * return the mirror variable of the state of the ring buffer.
279 * It's a little racy, but we don't really care.
280 */
281 smp_rmb();
282 return !global_trace.buffer_disabled;
221} 283}
222 284
223/* 285/*
@@ -240,7 +302,7 @@ static struct tracer *trace_types __read_mostly;
240/* 302/*
241 * trace_types_lock is used to protect the trace_types list. 303 * trace_types_lock is used to protect the trace_types list.
242 */ 304 */
243static DEFINE_MUTEX(trace_types_lock); 305DEFINE_MUTEX(trace_types_lock);
244 306
245/* 307/*
246 * serialize the access of the ring buffer 308 * serialize the access of the ring buffer
@@ -330,6 +392,23 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
330 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | 392 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
331 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; 393 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION;
332 394
395static void tracer_tracing_on(struct trace_array *tr)
396{
397 if (tr->trace_buffer.buffer)
398 ring_buffer_record_on(tr->trace_buffer.buffer);
399 /*
400 * This flag is looked at when buffers haven't been allocated
401 * yet, or by some tracers (like irqsoff), that just want to
402 * know if the ring buffer has been disabled, but it can handle
403 * races where it gets disabled while a record is still being made.
404 * As the check is in the fast path of the tracers, it is more
405 * important to be fast than accurate.
406 */
407 tr->buffer_disabled = 0;
408 /* Make the flag seen by readers */
409 smp_wmb();
410}
411
333/** 412/**
334 * tracing_on - enable tracing buffers 413 * tracing_on - enable tracing buffers
335 * 414 *
@@ -338,15 +417,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
338 */ 417 */
339void tracing_on(void) 418void tracing_on(void)
340{ 419{
341 if (global_trace.trace_buffer.buffer) 420 tracer_tracing_on(&global_trace);
342 ring_buffer_record_on(global_trace.trace_buffer.buffer);
343 /*
344 * This flag is only looked at when buffers haven't been
345 * allocated yet. We don't really care about the race
346 * between setting this flag and actually turning
347 * on the buffer.
348 */
349 global_trace.buffer_disabled = 0;
350} 421}
351EXPORT_SYMBOL_GPL(tracing_on); 422EXPORT_SYMBOL_GPL(tracing_on);
352 423
@@ -540,6 +611,23 @@ void tracing_snapshot_alloc(void)
540EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); 611EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
541#endif /* CONFIG_TRACER_SNAPSHOT */ 612#endif /* CONFIG_TRACER_SNAPSHOT */
542 613
614static void tracer_tracing_off(struct trace_array *tr)
615{
616 if (tr->trace_buffer.buffer)
617 ring_buffer_record_off(tr->trace_buffer.buffer);
618 /*
619 * This flag is looked at when buffers haven't been allocated
620 * yet, or by some tracers (like irqsoff), that just want to
621 * know if the ring buffer has been disabled, but it can handle
622 * races where it gets disabled while a record is still being made.
623 * As the check is in the fast path of the tracers, it is more
624 * important to be fast than accurate.
625 */
626 tr->buffer_disabled = 1;
627 /* Make the flag seen by readers */
628 smp_wmb();
629}
630
543/** 631/**
544 * tracing_off - turn off tracing buffers 632 * tracing_off - turn off tracing buffers
545 * 633 *
@@ -550,26 +638,35 @@ EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
550 */ 638 */
551void tracing_off(void) 639void tracing_off(void)
552{ 640{
553 if (global_trace.trace_buffer.buffer) 641 tracer_tracing_off(&global_trace);
554 ring_buffer_record_off(global_trace.trace_buffer.buffer);
555 /*
556 * This flag is only looked at when buffers haven't been
557 * allocated yet. We don't really care about the race
558 * between setting this flag and actually turning
559 * on the buffer.
560 */
561 global_trace.buffer_disabled = 1;
562} 642}
563EXPORT_SYMBOL_GPL(tracing_off); 643EXPORT_SYMBOL_GPL(tracing_off);
564 644
645void disable_trace_on_warning(void)
646{
647 if (__disable_trace_on_warning)
648 tracing_off();
649}
650
651/**
652 * tracer_tracing_is_on - show real state of ring buffer enabled
653 * @tr : the trace array to know if ring buffer is enabled
654 *
655 * Shows real state of the ring buffer if it is enabled or not.
656 */
657static int tracer_tracing_is_on(struct trace_array *tr)
658{
659 if (tr->trace_buffer.buffer)
660 return ring_buffer_record_is_on(tr->trace_buffer.buffer);
661 return !tr->buffer_disabled;
662}
663
565/** 664/**
566 * tracing_is_on - show state of ring buffers enabled 665 * tracing_is_on - show state of ring buffers enabled
567 */ 666 */
568int tracing_is_on(void) 667int tracing_is_on(void)
569{ 668{
570 if (global_trace.trace_buffer.buffer) 669 return tracer_tracing_is_on(&global_trace);
571 return ring_buffer_record_is_on(global_trace.trace_buffer.buffer);
572 return !global_trace.buffer_disabled;
573} 670}
574EXPORT_SYMBOL_GPL(tracing_is_on); 671EXPORT_SYMBOL_GPL(tracing_is_on);
575 672
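
tracer_tracing_on()/tracer_tracing_off() keep tr->buffer_disabled as a cheap mirror of the ring buffer state, and tracing_is_enabled() now reads only that mirror behind an smp_rmb(), so fast paths like the irqsoff tracer never have to touch the buffer. The write barrier in the updaters pairs with the read barrier in the reader; the skeleton of that idiom, with a hypothetical flag name:

#include <asm/barrier.h>

static int fast_path_disabled;        /* mirror of a state that is slow to query */

static void mark_disabled(void)
{
	/* ... the real, expensive state change happens here ... */
	fast_path_disabled = 1;
	smp_wmb();                    /* pairs with smp_rmb() in fast_path_is_enabled() */
}

static void mark_enabled(void)
{
	/* ... the real, expensive state change happens here ... */
	fast_path_disabled = 0;
	smp_wmb();
}

static int fast_path_is_enabled(void)
{
	smp_rmb();                    /* pairs with the smp_wmb() in the updaters */
	return !fast_path_disabled;   /* may be momentarily stale; callers tolerate that */
}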
@@ -1119,7 +1216,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf)
1119 /* Make sure all commits have finished */ 1216 /* Make sure all commits have finished */
1120 synchronize_sched(); 1217 synchronize_sched();
1121 1218
1122 buf->time_start = ftrace_now(buf->cpu); 1219 buf->time_start = buffer_ftrace_now(buf, buf->cpu);
1123 1220
1124 for_each_online_cpu(cpu) 1221 for_each_online_cpu(cpu)
1125 ring_buffer_reset_cpu(buffer, cpu); 1222 ring_buffer_reset_cpu(buffer, cpu);
@@ -1127,23 +1224,17 @@ void tracing_reset_online_cpus(struct trace_buffer *buf)
1127 ring_buffer_record_enable(buffer); 1224 ring_buffer_record_enable(buffer);
1128} 1225}
1129 1226
1130void tracing_reset_current(int cpu) 1227/* Must have trace_types_lock held */
1131{
1132 tracing_reset(&global_trace.trace_buffer, cpu);
1133}
1134
1135void tracing_reset_all_online_cpus(void) 1228void tracing_reset_all_online_cpus(void)
1136{ 1229{
1137 struct trace_array *tr; 1230 struct trace_array *tr;
1138 1231
1139 mutex_lock(&trace_types_lock);
1140 list_for_each_entry(tr, &ftrace_trace_arrays, list) { 1232 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
1141 tracing_reset_online_cpus(&tr->trace_buffer); 1233 tracing_reset_online_cpus(&tr->trace_buffer);
1142#ifdef CONFIG_TRACER_MAX_TRACE 1234#ifdef CONFIG_TRACER_MAX_TRACE
1143 tracing_reset_online_cpus(&tr->max_buffer); 1235 tracing_reset_online_cpus(&tr->max_buffer);
1144#endif 1236#endif
1145 } 1237 }
1146 mutex_unlock(&trace_types_lock);
1147} 1238}
1148 1239
1149#define SAVED_CMDLINES 128 1240#define SAVED_CMDLINES 128
@@ -1543,15 +1634,6 @@ trace_function(struct trace_array *tr,
1543 __buffer_unlock_commit(buffer, event); 1634 __buffer_unlock_commit(buffer, event);
1544} 1635}
1545 1636
1546void
1547ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1548 unsigned long ip, unsigned long parent_ip, unsigned long flags,
1549 int pc)
1550{
1551 if (likely(!atomic_read(&data->disabled)))
1552 trace_function(tr, ip, parent_ip, flags, pc);
1553}
1554
1555#ifdef CONFIG_STACKTRACE 1637#ifdef CONFIG_STACKTRACE
1556 1638
1557#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) 1639#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long))
@@ -2760,6 +2842,17 @@ static int s_show(struct seq_file *m, void *v)
2760 return 0; 2842 return 0;
2761} 2843}
2762 2844
2845/*
2846 * Should be used after trace_array_get(), trace_types_lock
2847 * ensures that i_cdev was already initialized.
2848 */
2849static inline int tracing_get_cpu(struct inode *inode)
2850{
2851 if (inode->i_cdev) /* See trace_create_cpu_file() */
2852 return (long)inode->i_cdev - 1;
2853 return RING_BUFFER_ALL_CPUS;
2854}
2855
2763static const struct seq_operations tracer_seq_ops = { 2856static const struct seq_operations tracer_seq_ops = {
2764 .start = s_start, 2857 .start = s_start,
2765 .next = s_next, 2858 .next = s_next,
@@ -2770,8 +2863,7 @@ static const struct seq_operations tracer_seq_ops = {
2770static struct trace_iterator * 2863static struct trace_iterator *
2771__tracing_open(struct inode *inode, struct file *file, bool snapshot) 2864__tracing_open(struct inode *inode, struct file *file, bool snapshot)
2772{ 2865{
2773 struct trace_cpu *tc = inode->i_private; 2866 struct trace_array *tr = inode->i_private;
2774 struct trace_array *tr = tc->tr;
2775 struct trace_iterator *iter; 2867 struct trace_iterator *iter;
2776 int cpu; 2868 int cpu;
2777 2869
@@ -2812,8 +2904,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2812 iter->trace_buffer = &tr->trace_buffer; 2904 iter->trace_buffer = &tr->trace_buffer;
2813 iter->snapshot = snapshot; 2905 iter->snapshot = snapshot;
2814 iter->pos = -1; 2906 iter->pos = -1;
2907 iter->cpu_file = tracing_get_cpu(inode);
2815 mutex_init(&iter->mutex); 2908 mutex_init(&iter->mutex);
2816 iter->cpu_file = tc->cpu;
2817 2909
2818 /* Notify the tracer early; before we stop tracing. */ 2910 /* Notify the tracer early; before we stop tracing. */
2819 if (iter->trace && iter->trace->open) 2911 if (iter->trace && iter->trace->open)
@@ -2850,8 +2942,6 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2850 tracing_iter_reset(iter, cpu); 2942 tracing_iter_reset(iter, cpu);
2851 } 2943 }
2852 2944
2853 tr->ref++;
2854
2855 mutex_unlock(&trace_types_lock); 2945 mutex_unlock(&trace_types_lock);
2856 2946
2857 return iter; 2947 return iter;
@@ -2874,24 +2964,41 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
2874 return 0; 2964 return 0;
2875} 2965}
2876 2966
2967/*
2968 * Open and update trace_array ref count.
2969 * Must have the current trace_array passed to it.
2970 */
2971static int tracing_open_generic_tr(struct inode *inode, struct file *filp)
2972{
2973 struct trace_array *tr = inode->i_private;
2974
2975 if (tracing_disabled)
2976 return -ENODEV;
2977
2978 if (trace_array_get(tr) < 0)
2979 return -ENODEV;
2980
2981 filp->private_data = inode->i_private;
2982
2983 return 0;
2984}
2985
2877static int tracing_release(struct inode *inode, struct file *file) 2986static int tracing_release(struct inode *inode, struct file *file)
2878{ 2987{
2988 struct trace_array *tr = inode->i_private;
2879 struct seq_file *m = file->private_data; 2989 struct seq_file *m = file->private_data;
2880 struct trace_iterator *iter; 2990 struct trace_iterator *iter;
2881 struct trace_array *tr;
2882 int cpu; 2991 int cpu;
2883 2992
2884 if (!(file->f_mode & FMODE_READ)) 2993 if (!(file->f_mode & FMODE_READ)) {
2994 trace_array_put(tr);
2885 return 0; 2995 return 0;
2996 }
2886 2997
2998 /* Writes do not use seq_file */
2887 iter = m->private; 2999 iter = m->private;
2888 tr = iter->tr;
2889
2890 mutex_lock(&trace_types_lock); 3000 mutex_lock(&trace_types_lock);
2891 3001
2892 WARN_ON(!tr->ref);
2893 tr->ref--;
2894
2895 for_each_tracing_cpu(cpu) { 3002 for_each_tracing_cpu(cpu) {
2896 if (iter->buffer_iter[cpu]) 3003 if (iter->buffer_iter[cpu])
2897 ring_buffer_read_finish(iter->buffer_iter[cpu]); 3004 ring_buffer_read_finish(iter->buffer_iter[cpu]);
@@ -2903,6 +3010,9 @@ static int tracing_release(struct inode *inode, struct file *file)
2903 if (!iter->snapshot) 3010 if (!iter->snapshot)
2904 /* reenable tracing if it was previously enabled */ 3011 /* reenable tracing if it was previously enabled */
2905 tracing_start_tr(tr); 3012 tracing_start_tr(tr);
3013
3014 __trace_array_put(tr);
3015
2906 mutex_unlock(&trace_types_lock); 3016 mutex_unlock(&trace_types_lock);
2907 3017
2908 mutex_destroy(&iter->mutex); 3018 mutex_destroy(&iter->mutex);
@@ -2910,24 +3020,44 @@ static int tracing_release(struct inode *inode, struct file *file)
2910 kfree(iter->trace); 3020 kfree(iter->trace);
2911 kfree(iter->buffer_iter); 3021 kfree(iter->buffer_iter);
2912 seq_release_private(inode, file); 3022 seq_release_private(inode, file);
3023
3024 return 0;
3025}
3026
3027static int tracing_release_generic_tr(struct inode *inode, struct file *file)
3028{
3029 struct trace_array *tr = inode->i_private;
3030
3031 trace_array_put(tr);
2913 return 0; 3032 return 0;
2914} 3033}
2915 3034
3035static int tracing_single_release_tr(struct inode *inode, struct file *file)
3036{
3037 struct trace_array *tr = inode->i_private;
3038
3039 trace_array_put(tr);
3040
3041 return single_release(inode, file);
3042}
3043
2916static int tracing_open(struct inode *inode, struct file *file) 3044static int tracing_open(struct inode *inode, struct file *file)
2917{ 3045{
3046 struct trace_array *tr = inode->i_private;
2918 struct trace_iterator *iter; 3047 struct trace_iterator *iter;
2919 int ret = 0; 3048 int ret = 0;
2920 3049
3050 if (trace_array_get(tr) < 0)
3051 return -ENODEV;
3052
2921 /* If this file was open for write, then erase contents */ 3053 /* If this file was open for write, then erase contents */
2922 if ((file->f_mode & FMODE_WRITE) && 3054 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
2923 (file->f_flags & O_TRUNC)) { 3055 int cpu = tracing_get_cpu(inode);
2924 struct trace_cpu *tc = inode->i_private;
2925 struct trace_array *tr = tc->tr;
2926 3056
2927 if (tc->cpu == RING_BUFFER_ALL_CPUS) 3057 if (cpu == RING_BUFFER_ALL_CPUS)
2928 tracing_reset_online_cpus(&tr->trace_buffer); 3058 tracing_reset_online_cpus(&tr->trace_buffer);
2929 else 3059 else
2930 tracing_reset(&tr->trace_buffer, tc->cpu); 3060 tracing_reset(&tr->trace_buffer, cpu);
2931 } 3061 }
2932 3062
2933 if (file->f_mode & FMODE_READ) { 3063 if (file->f_mode & FMODE_READ) {
@@ -2937,6 +3067,10 @@ static int tracing_open(struct inode *inode, struct file *file)
2937 else if (trace_flags & TRACE_ITER_LATENCY_FMT) 3067 else if (trace_flags & TRACE_ITER_LATENCY_FMT)
2938 iter->iter_flags |= TRACE_FILE_LAT_FMT; 3068 iter->iter_flags |= TRACE_FILE_LAT_FMT;
2939 } 3069 }
3070
3071 if (ret < 0)
3072 trace_array_put(tr);
3073
2940 return ret; 3074 return ret;
2941} 3075}
2942 3076
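
A recurring rule in these hunks: every ->open that succeeds after trace_array_get() must be balanced by a trace_array_put() on every later path, including the error branch just added to tracing_open() and the new tracing_release_generic_tr()/tracing_single_release_tr() helpers. The pattern in isolation, with hypothetical my_* names and a made-up per-file setup step:

#include <linux/errno.h>
#include <linux/fs.h>

struct trace_array;

/* Provided by trace.c (see the trace_array_get()/put() hunks above). */
int trace_array_get(struct trace_array *tr);
void trace_array_put(struct trace_array *tr);

static int my_setup(struct file *filp)
{
	filp->private_data = filp->f_inode->i_private;   /* made-up setup step */
	return 0;
}

static int my_open(struct inode *inode, struct file *filp)
{
	struct trace_array *tr = inode->i_private;
	int ret;

	if (trace_array_get(tr) < 0)           /* pin the instance, or fail */
		return -ENODEV;

	ret = my_setup(filp);
	if (ret < 0)
		trace_array_put(tr);           /* drop the pin on every error path */
	return ret;
}

static int my_release(struct inode *inode, struct file *filp)
{
	trace_array_put(inode->i_private);     /* balances the successful open */
	return 0;
}

static const struct file_operations my_fops = {
	.open    = my_open,
	.release = my_release,
};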
@@ -3293,17 +3427,27 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
3293 3427
3294static int tracing_trace_options_open(struct inode *inode, struct file *file) 3428static int tracing_trace_options_open(struct inode *inode, struct file *file)
3295{ 3429{
3430 struct trace_array *tr = inode->i_private;
3431 int ret;
3432
3296 if (tracing_disabled) 3433 if (tracing_disabled)
3297 return -ENODEV; 3434 return -ENODEV;
3298 3435
3299 return single_open(file, tracing_trace_options_show, inode->i_private); 3436 if (trace_array_get(tr) < 0)
3437 return -ENODEV;
3438
3439 ret = single_open(file, tracing_trace_options_show, inode->i_private);
3440 if (ret < 0)
3441 trace_array_put(tr);
3442
3443 return ret;
3300} 3444}
3301 3445
3302static const struct file_operations tracing_iter_fops = { 3446static const struct file_operations tracing_iter_fops = {
3303 .open = tracing_trace_options_open, 3447 .open = tracing_trace_options_open,
3304 .read = seq_read, 3448 .read = seq_read,
3305 .llseek = seq_lseek, 3449 .llseek = seq_lseek,
3306 .release = single_release, 3450 .release = tracing_single_release_tr,
3307 .write = tracing_trace_options_write, 3451 .write = tracing_trace_options_write,
3308}; 3452};
3309 3453
@@ -3379,14 +3523,14 @@ static const char readme_msg[] =
3379 "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n" 3523 "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n"
3380 "\t\t\t Read the contents for more information\n" 3524 "\t\t\t Read the contents for more information\n"
3381#endif 3525#endif
3382#ifdef CONFIG_STACKTRACE 3526#ifdef CONFIG_STACK_TRACER
3383 " stack_trace\t\t- Shows the max stack trace when active\n" 3527 " stack_trace\t\t- Shows the max stack trace when active\n"
3384 " stack_max_size\t- Shows current max stack size that was traced\n" 3528 " stack_max_size\t- Shows current max stack size that was traced\n"
3385 "\t\t\t Write into this file to reset the max size (trigger a new trace)\n" 3529 "\t\t\t Write into this file to reset the max size (trigger a new trace)\n"
3386#ifdef CONFIG_DYNAMIC_FTRACE 3530#ifdef CONFIG_DYNAMIC_FTRACE
3387 " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n" 3531 " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n"
3388#endif 3532#endif
3389#endif /* CONFIG_STACKTRACE */ 3533#endif /* CONFIG_STACK_TRACER */
3390; 3534;
3391 3535
3392static ssize_t 3536static ssize_t
@@ -3783,20 +3927,23 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
3783 3927
3784static int tracing_open_pipe(struct inode *inode, struct file *filp) 3928static int tracing_open_pipe(struct inode *inode, struct file *filp)
3785{ 3929{
3786 struct trace_cpu *tc = inode->i_private; 3930 struct trace_array *tr = inode->i_private;
3787 struct trace_array *tr = tc->tr;
3788 struct trace_iterator *iter; 3931 struct trace_iterator *iter;
3789 int ret = 0; 3932 int ret = 0;
3790 3933
3791 if (tracing_disabled) 3934 if (tracing_disabled)
3792 return -ENODEV; 3935 return -ENODEV;
3793 3936
3937 if (trace_array_get(tr) < 0)
3938 return -ENODEV;
3939
3794 mutex_lock(&trace_types_lock); 3940 mutex_lock(&trace_types_lock);
3795 3941
3796 /* create a buffer to store the information to pass to userspace */ 3942 /* create a buffer to store the information to pass to userspace */
3797 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 3943 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
3798 if (!iter) { 3944 if (!iter) {
3799 ret = -ENOMEM; 3945 ret = -ENOMEM;
3946 __trace_array_put(tr);
3800 goto out; 3947 goto out;
3801 } 3948 }
3802 3949
@@ -3826,9 +3973,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3826 if (trace_clocks[tr->clock_id].in_ns) 3973 if (trace_clocks[tr->clock_id].in_ns)
3827 iter->iter_flags |= TRACE_FILE_TIME_IN_NS; 3974 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
3828 3975
3829 iter->cpu_file = tc->cpu; 3976 iter->tr = tr;
3830 iter->tr = tc->tr; 3977 iter->trace_buffer = &tr->trace_buffer;
3831 iter->trace_buffer = &tc->tr->trace_buffer; 3978 iter->cpu_file = tracing_get_cpu(inode);
3832 mutex_init(&iter->mutex); 3979 mutex_init(&iter->mutex);
3833 filp->private_data = iter; 3980 filp->private_data = iter;
3834 3981
@@ -3843,6 +3990,7 @@ out:
3843fail: 3990fail:
3844 kfree(iter->trace); 3991 kfree(iter->trace);
3845 kfree(iter); 3992 kfree(iter);
3993 __trace_array_put(tr);
3846 mutex_unlock(&trace_types_lock); 3994 mutex_unlock(&trace_types_lock);
3847 return ret; 3995 return ret;
3848} 3996}
@@ -3850,6 +3998,7 @@ fail:
3850static int tracing_release_pipe(struct inode *inode, struct file *file) 3998static int tracing_release_pipe(struct inode *inode, struct file *file)
3851{ 3999{
3852 struct trace_iterator *iter = file->private_data; 4000 struct trace_iterator *iter = file->private_data;
4001 struct trace_array *tr = inode->i_private;
3853 4002
3854 mutex_lock(&trace_types_lock); 4003 mutex_lock(&trace_types_lock);
3855 4004
@@ -3863,6 +4012,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
3863 kfree(iter->trace); 4012 kfree(iter->trace);
3864 kfree(iter); 4013 kfree(iter);
3865 4014
4015 trace_array_put(tr);
4016
3866 return 0; 4017 return 0;
3867} 4018}
3868 4019
@@ -3939,7 +4090,7 @@ static int tracing_wait_pipe(struct file *filp)
3939 * 4090 *
3940 * iter->pos will be 0 if we haven't read anything. 4091 * iter->pos will be 0 if we haven't read anything.
3941 */ 4092 */
3942 if (!tracing_is_enabled() && iter->pos) 4093 if (!tracing_is_on() && iter->pos)
3943 break; 4094 break;
3944 } 4095 }
3945 4096
@@ -4000,6 +4151,7 @@ waitagain:
4000 memset(&iter->seq, 0, 4151 memset(&iter->seq, 0,
4001 sizeof(struct trace_iterator) - 4152 sizeof(struct trace_iterator) -
4002 offsetof(struct trace_iterator, seq)); 4153 offsetof(struct trace_iterator, seq));
4154 cpumask_clear(iter->started);
4003 iter->pos = -1; 4155 iter->pos = -1;
4004 4156
4005 trace_event_read_lock(); 4157 trace_event_read_lock();
@@ -4200,15 +4352,16 @@ static ssize_t
4200tracing_entries_read(struct file *filp, char __user *ubuf, 4352tracing_entries_read(struct file *filp, char __user *ubuf,
4201 size_t cnt, loff_t *ppos) 4353 size_t cnt, loff_t *ppos)
4202{ 4354{
4203 struct trace_cpu *tc = filp->private_data; 4355 struct inode *inode = file_inode(filp);
4204 struct trace_array *tr = tc->tr; 4356 struct trace_array *tr = inode->i_private;
4357 int cpu = tracing_get_cpu(inode);
4205 char buf[64]; 4358 char buf[64];
4206 int r = 0; 4359 int r = 0;
4207 ssize_t ret; 4360 ssize_t ret;
4208 4361
4209 mutex_lock(&trace_types_lock); 4362 mutex_lock(&trace_types_lock);
4210 4363
4211 if (tc->cpu == RING_BUFFER_ALL_CPUS) { 4364 if (cpu == RING_BUFFER_ALL_CPUS) {
4212 int cpu, buf_size_same; 4365 int cpu, buf_size_same;
4213 unsigned long size; 4366 unsigned long size;
4214 4367
@@ -4235,7 +4388,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
4235 } else 4388 } else
4236 r = sprintf(buf, "X\n"); 4389 r = sprintf(buf, "X\n");
4237 } else 4390 } else
4238 r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10); 4391 r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10);
4239 4392
4240 mutex_unlock(&trace_types_lock); 4393 mutex_unlock(&trace_types_lock);
4241 4394
@@ -4247,7 +4400,8 @@ static ssize_t
4247tracing_entries_write(struct file *filp, const char __user *ubuf, 4400tracing_entries_write(struct file *filp, const char __user *ubuf,
4248 size_t cnt, loff_t *ppos) 4401 size_t cnt, loff_t *ppos)
4249{ 4402{
4250 struct trace_cpu *tc = filp->private_data; 4403 struct inode *inode = file_inode(filp);
4404 struct trace_array *tr = inode->i_private;
4251 unsigned long val; 4405 unsigned long val;
4252 int ret; 4406 int ret;
4253 4407
@@ -4261,8 +4415,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
4261 4415
4262 /* value is in KB */ 4416 /* value is in KB */
4263 val <<= 10; 4417 val <<= 10;
4264 4418 ret = tracing_resize_ring_buffer(tr, val, tracing_get_cpu(inode));
4265 ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu);
4266 if (ret < 0) 4419 if (ret < 0)
4267 return ret; 4420 return ret;
4268 4421
@@ -4316,10 +4469,12 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
4316 4469
4317 /* disable tracing ? */ 4470 /* disable tracing ? */
4318 if (trace_flags & TRACE_ITER_STOP_ON_FREE) 4471 if (trace_flags & TRACE_ITER_STOP_ON_FREE)
4319 tracing_off(); 4472 tracer_tracing_off(tr);
4320 /* resize the ring buffer to 0 */ 4473 /* resize the ring buffer to 0 */
4321 tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); 4474 tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);
4322 4475
4476 trace_array_put(tr);
4477
4323 return 0; 4478 return 0;
4324} 4479}
4325 4480
@@ -4328,6 +4483,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
4328 size_t cnt, loff_t *fpos) 4483 size_t cnt, loff_t *fpos)
4329{ 4484{
4330 unsigned long addr = (unsigned long)ubuf; 4485 unsigned long addr = (unsigned long)ubuf;
4486 struct trace_array *tr = filp->private_data;
4331 struct ring_buffer_event *event; 4487 struct ring_buffer_event *event;
4332 struct ring_buffer *buffer; 4488 struct ring_buffer *buffer;
4333 struct print_entry *entry; 4489 struct print_entry *entry;
@@ -4387,7 +4543,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
4387 4543
4388 local_save_flags(irq_flags); 4544 local_save_flags(irq_flags);
4389 size = sizeof(*entry) + cnt + 2; /* possible \n added */ 4545 size = sizeof(*entry) + cnt + 2; /* possible \n added */
4390 buffer = global_trace.trace_buffer.buffer; 4546 buffer = tr->trace_buffer.buffer;
4391 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 4547 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
4392 irq_flags, preempt_count()); 4548 irq_flags, preempt_count());
4393 if (!event) { 4549 if (!event) {
@@ -4478,12 +4634,12 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4478 * New clock may not be consistent with the previous clock. 4634 * New clock may not be consistent with the previous clock.
4479 * Reset the buffer so that it doesn't have incomparable timestamps. 4635 * Reset the buffer so that it doesn't have incomparable timestamps.
4480 */ 4636 */
4481 tracing_reset_online_cpus(&global_trace.trace_buffer); 4637 tracing_reset_online_cpus(&tr->trace_buffer);
4482 4638
4483#ifdef CONFIG_TRACER_MAX_TRACE 4639#ifdef CONFIG_TRACER_MAX_TRACE
4484 if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) 4640 if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)
4485 ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); 4641 ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
4486 tracing_reset_online_cpus(&global_trace.max_buffer); 4642 tracing_reset_online_cpus(&tr->max_buffer);
4487#endif 4643#endif
4488 4644
4489 mutex_unlock(&trace_types_lock); 4645 mutex_unlock(&trace_types_lock);
@@ -4495,10 +4651,20 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4495 4651
4496static int tracing_clock_open(struct inode *inode, struct file *file) 4652static int tracing_clock_open(struct inode *inode, struct file *file)
4497{ 4653{
4654 struct trace_array *tr = inode->i_private;
4655 int ret;
4656
4498 if (tracing_disabled) 4657 if (tracing_disabled)
4499 return -ENODEV; 4658 return -ENODEV;
4500 4659
4501 return single_open(file, tracing_clock_show, inode->i_private); 4660 if (trace_array_get(tr))
4661 return -ENODEV;
4662
4663 ret = single_open(file, tracing_clock_show, inode->i_private);
4664 if (ret < 0)
4665 trace_array_put(tr);
4666
4667 return ret;
4502} 4668}
4503 4669
4504struct ftrace_buffer_info { 4670struct ftrace_buffer_info {
@@ -4510,31 +4676,40 @@ struct ftrace_buffer_info {
4510#ifdef CONFIG_TRACER_SNAPSHOT 4676#ifdef CONFIG_TRACER_SNAPSHOT
4511static int tracing_snapshot_open(struct inode *inode, struct file *file) 4677static int tracing_snapshot_open(struct inode *inode, struct file *file)
4512{ 4678{
4513 struct trace_cpu *tc = inode->i_private; 4679 struct trace_array *tr = inode->i_private;
4514 struct trace_iterator *iter; 4680 struct trace_iterator *iter;
4515 struct seq_file *m; 4681 struct seq_file *m;
4516 int ret = 0; 4682 int ret = 0;
4517 4683
4684 if (trace_array_get(tr) < 0)
4685 return -ENODEV;
4686
4518 if (file->f_mode & FMODE_READ) { 4687 if (file->f_mode & FMODE_READ) {
4519 iter = __tracing_open(inode, file, true); 4688 iter = __tracing_open(inode, file, true);
4520 if (IS_ERR(iter)) 4689 if (IS_ERR(iter))
4521 ret = PTR_ERR(iter); 4690 ret = PTR_ERR(iter);
4522 } else { 4691 } else {
4523 /* Writes still need the seq_file to hold the private data */ 4692 /* Writes still need the seq_file to hold the private data */
4693 ret = -ENOMEM;
4524 m = kzalloc(sizeof(*m), GFP_KERNEL); 4694 m = kzalloc(sizeof(*m), GFP_KERNEL);
4525 if (!m) 4695 if (!m)
4526 return -ENOMEM; 4696 goto out;
4527 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 4697 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
4528 if (!iter) { 4698 if (!iter) {
4529 kfree(m); 4699 kfree(m);
4530 return -ENOMEM; 4700 goto out;
4531 } 4701 }
4532 iter->tr = tc->tr; 4702 ret = 0;
4533 iter->trace_buffer = &tc->tr->max_buffer; 4703
4534 iter->cpu_file = tc->cpu; 4704 iter->tr = tr;
4705 iter->trace_buffer = &tr->max_buffer;
4706 iter->cpu_file = tracing_get_cpu(inode);
4535 m->private = iter; 4707 m->private = iter;
4536 file->private_data = m; 4708 file->private_data = m;
4537 } 4709 }
4710out:
4711 if (ret < 0)
4712 trace_array_put(tr);
4538 4713
4539 return ret; 4714 return ret;
4540} 4715}
@@ -4616,9 +4791,12 @@ out:
4616static int tracing_snapshot_release(struct inode *inode, struct file *file) 4791static int tracing_snapshot_release(struct inode *inode, struct file *file)
4617{ 4792{
4618 struct seq_file *m = file->private_data; 4793 struct seq_file *m = file->private_data;
4794 int ret;
4795
4796 ret = tracing_release(inode, file);
4619 4797
4620 if (file->f_mode & FMODE_READ) 4798 if (file->f_mode & FMODE_READ)
4621 return tracing_release(inode, file); 4799 return ret;
4622 4800
4623 /* If write only, the seq_file is just a stub */ 4801 /* If write only, the seq_file is just a stub */
4624 if (m) 4802 if (m)
@@ -4684,34 +4862,38 @@ static const struct file_operations tracing_pipe_fops = {
4684}; 4862};
4685 4863
4686static const struct file_operations tracing_entries_fops = { 4864static const struct file_operations tracing_entries_fops = {
4687 .open = tracing_open_generic, 4865 .open = tracing_open_generic_tr,
4688 .read = tracing_entries_read, 4866 .read = tracing_entries_read,
4689 .write = tracing_entries_write, 4867 .write = tracing_entries_write,
4690 .llseek = generic_file_llseek, 4868 .llseek = generic_file_llseek,
4869 .release = tracing_release_generic_tr,
4691}; 4870};
4692 4871
4693static const struct file_operations tracing_total_entries_fops = { 4872static const struct file_operations tracing_total_entries_fops = {
4694 .open = tracing_open_generic, 4873 .open = tracing_open_generic_tr,
4695 .read = tracing_total_entries_read, 4874 .read = tracing_total_entries_read,
4696 .llseek = generic_file_llseek, 4875 .llseek = generic_file_llseek,
4876 .release = tracing_release_generic_tr,
4697}; 4877};
4698 4878
4699static const struct file_operations tracing_free_buffer_fops = { 4879static const struct file_operations tracing_free_buffer_fops = {
4880 .open = tracing_open_generic_tr,
4700 .write = tracing_free_buffer_write, 4881 .write = tracing_free_buffer_write,
4701 .release = tracing_free_buffer_release, 4882 .release = tracing_free_buffer_release,
4702}; 4883};
4703 4884
4704static const struct file_operations tracing_mark_fops = { 4885static const struct file_operations tracing_mark_fops = {
4705 .open = tracing_open_generic, 4886 .open = tracing_open_generic_tr,
4706 .write = tracing_mark_write, 4887 .write = tracing_mark_write,
4707 .llseek = generic_file_llseek, 4888 .llseek = generic_file_llseek,
4889 .release = tracing_release_generic_tr,
4708}; 4890};
4709 4891
4710static const struct file_operations trace_clock_fops = { 4892static const struct file_operations trace_clock_fops = {
4711 .open = tracing_clock_open, 4893 .open = tracing_clock_open,
4712 .read = seq_read, 4894 .read = seq_read,
4713 .llseek = seq_lseek, 4895 .llseek = seq_lseek,
4714 .release = single_release, 4896 .release = tracing_single_release_tr,
4715 .write = tracing_clock_write, 4897 .write = tracing_clock_write,
4716}; 4898};
4717 4899
@@ -4736,23 +4918,26 @@ static const struct file_operations snapshot_raw_fops = {
4736 4918
4737static int tracing_buffers_open(struct inode *inode, struct file *filp) 4919static int tracing_buffers_open(struct inode *inode, struct file *filp)
4738{ 4920{
4739 struct trace_cpu *tc = inode->i_private; 4921 struct trace_array *tr = inode->i_private;
4740 struct trace_array *tr = tc->tr;
4741 struct ftrace_buffer_info *info; 4922 struct ftrace_buffer_info *info;
4923 int ret;
4742 4924
4743 if (tracing_disabled) 4925 if (tracing_disabled)
4744 return -ENODEV; 4926 return -ENODEV;
4745 4927
4928 if (trace_array_get(tr) < 0)
4929 return -ENODEV;
4930
4746 info = kzalloc(sizeof(*info), GFP_KERNEL); 4931 info = kzalloc(sizeof(*info), GFP_KERNEL);
4747 if (!info) 4932 if (!info) {
4933 trace_array_put(tr);
4748 return -ENOMEM; 4934 return -ENOMEM;
4935 }
4749 4936
4750 mutex_lock(&trace_types_lock); 4937 mutex_lock(&trace_types_lock);
4751 4938
4752 tr->ref++;
4753
4754 info->iter.tr = tr; 4939 info->iter.tr = tr;
4755 info->iter.cpu_file = tc->cpu; 4940 info->iter.cpu_file = tracing_get_cpu(inode);
4756 info->iter.trace = tr->current_trace; 4941 info->iter.trace = tr->current_trace;
4757 info->iter.trace_buffer = &tr->trace_buffer; 4942 info->iter.trace_buffer = &tr->trace_buffer;
4758 info->spare = NULL; 4943 info->spare = NULL;
@@ -4763,7 +4948,11 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
4763 4948
4764 mutex_unlock(&trace_types_lock); 4949 mutex_unlock(&trace_types_lock);
4765 4950
4766 return nonseekable_open(inode, filp); 4951 ret = nonseekable_open(inode, filp);
4952 if (ret < 0)
4953 trace_array_put(tr);
4954
4955 return ret;
4767} 4956}
4768 4957
4769static unsigned int 4958static unsigned int
@@ -4863,8 +5052,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
4863 5052
4864 mutex_lock(&trace_types_lock); 5053 mutex_lock(&trace_types_lock);
4865 5054
4866 WARN_ON(!iter->tr->ref); 5055 __trace_array_put(iter->tr);
4867 iter->tr->ref--;
4868 5056
4869 if (info->spare) 5057 if (info->spare)
4870 ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); 5058 ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);
@@ -5066,14 +5254,14 @@ static ssize_t
5066tracing_stats_read(struct file *filp, char __user *ubuf, 5254tracing_stats_read(struct file *filp, char __user *ubuf,
5067 size_t count, loff_t *ppos) 5255 size_t count, loff_t *ppos)
5068{ 5256{
5069 struct trace_cpu *tc = filp->private_data; 5257 struct inode *inode = file_inode(filp);
5070 struct trace_array *tr = tc->tr; 5258 struct trace_array *tr = inode->i_private;
5071 struct trace_buffer *trace_buf = &tr->trace_buffer; 5259 struct trace_buffer *trace_buf = &tr->trace_buffer;
5260 int cpu = tracing_get_cpu(inode);
5072 struct trace_seq *s; 5261 struct trace_seq *s;
5073 unsigned long cnt; 5262 unsigned long cnt;
5074 unsigned long long t; 5263 unsigned long long t;
5075 unsigned long usec_rem; 5264 unsigned long usec_rem;
5076 int cpu = tc->cpu;
5077 5265
5078 s = kmalloc(sizeof(*s), GFP_KERNEL); 5266 s = kmalloc(sizeof(*s), GFP_KERNEL);
5079 if (!s) 5267 if (!s)
@@ -5126,9 +5314,10 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
5126} 5314}
5127 5315
5128static const struct file_operations tracing_stats_fops = { 5316static const struct file_operations tracing_stats_fops = {
5129 .open = tracing_open_generic, 5317 .open = tracing_open_generic_tr,
5130 .read = tracing_stats_read, 5318 .read = tracing_stats_read,
5131 .llseek = generic_file_llseek, 5319 .llseek = generic_file_llseek,
5320 .release = tracing_release_generic_tr,
5132}; 5321};
5133 5322
5134#ifdef CONFIG_DYNAMIC_FTRACE 5323#ifdef CONFIG_DYNAMIC_FTRACE
@@ -5317,10 +5506,20 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
5317 return tr->percpu_dir; 5506 return tr->percpu_dir;
5318} 5507}
5319 5508
5509static struct dentry *
5510trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
5511 void *data, long cpu, const struct file_operations *fops)
5512{
5513 struct dentry *ret = trace_create_file(name, mode, parent, data, fops);
5514
5515 if (ret) /* See tracing_get_cpu() */
5516 ret->d_inode->i_cdev = (void *)(cpu + 1);
5517 return ret;
5518}
5519
5320static void 5520static void
5321tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) 5521tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
5322{ 5522{
5323 struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
5324 struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); 5523 struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
5325 struct dentry *d_cpu; 5524 struct dentry *d_cpu;
5326 char cpu_dir[30]; /* 30 characters should be more than enough */ 5525 char cpu_dir[30]; /* 30 characters should be more than enough */
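
The per-cpu files no longer carry a struct trace_cpu; instead trace_create_cpu_file() stores cpu + 1 in the dentry's inode as i_cdev, and tracing_get_cpu() decodes it, with a NULL i_cdev (the top-level, non-per-cpu files) mapping to RING_BUFFER_ALL_CPUS. The encode/decode pair in isolation, as a small stand-alone sketch of the trick rather than new API:

#include <stdio.h>

#define ALL_CPUS -1                     /* stands in for RING_BUFFER_ALL_CPUS */

/* Encode: 0 must stay "unset", so store cpu + 1 (as trace_create_cpu_file() does). */
static void *encode_cpu(long cpu)
{
	return (void *)(cpu + 1);
}

/* Decode: NULL means "no specific cpu" (as tracing_get_cpu() does). */
static int decode_cpu(void *cookie)
{
	if (cookie)
		return (long)cookie - 1;
	return ALL_CPUS;
}

int main(void)
{
	printf("cpu 0 -> cookie %p -> %d\n", encode_cpu(0), decode_cpu(encode_cpu(0)));
	printf("cpu 3 -> cookie %p -> %d\n", encode_cpu(3), decode_cpu(encode_cpu(3)));
	printf("unset -> cookie %p -> %d\n", (void *)0, decode_cpu(NULL));
	return 0;
}

The +1 shift is what keeps a genuinely unset i_cdev distinguishable from cpu 0.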
@@ -5336,28 +5535,28 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
5336 } 5535 }
5337 5536
5338 /* per cpu trace_pipe */ 5537 /* per cpu trace_pipe */
5339 trace_create_file("trace_pipe", 0444, d_cpu, 5538 trace_create_cpu_file("trace_pipe", 0444, d_cpu,
5340 (void *)&data->trace_cpu, &tracing_pipe_fops); 5539 tr, cpu, &tracing_pipe_fops);
5341 5540
5342 /* per cpu trace */ 5541 /* per cpu trace */
5343 trace_create_file("trace", 0644, d_cpu, 5542 trace_create_cpu_file("trace", 0644, d_cpu,
5344 (void *)&data->trace_cpu, &tracing_fops); 5543 tr, cpu, &tracing_fops);
5345 5544
5346 trace_create_file("trace_pipe_raw", 0444, d_cpu, 5545 trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu,
5347 (void *)&data->trace_cpu, &tracing_buffers_fops); 5546 tr, cpu, &tracing_buffers_fops);
5348 5547
5349 trace_create_file("stats", 0444, d_cpu, 5548 trace_create_cpu_file("stats", 0444, d_cpu,
5350 (void *)&data->trace_cpu, &tracing_stats_fops); 5549 tr, cpu, &tracing_stats_fops);
5351 5550
5352 trace_create_file("buffer_size_kb", 0444, d_cpu, 5551 trace_create_cpu_file("buffer_size_kb", 0444, d_cpu,
5353 (void *)&data->trace_cpu, &tracing_entries_fops); 5552 tr, cpu, &tracing_entries_fops);
5354 5553
5355#ifdef CONFIG_TRACER_SNAPSHOT 5554#ifdef CONFIG_TRACER_SNAPSHOT
5356 trace_create_file("snapshot", 0644, d_cpu, 5555 trace_create_cpu_file("snapshot", 0644, d_cpu,
5357 (void *)&data->trace_cpu, &snapshot_fops); 5556 tr, cpu, &snapshot_fops);
5358 5557
5359 trace_create_file("snapshot_raw", 0444, d_cpu, 5558 trace_create_cpu_file("snapshot_raw", 0444, d_cpu,
5360 (void *)&data->trace_cpu, &snapshot_raw_fops); 5559 tr, cpu, &snapshot_raw_fops);
5361#endif 5560#endif
5362} 5561}
5363 5562
@@ -5612,15 +5811,10 @@ rb_simple_read(struct file *filp, char __user *ubuf,
5612 size_t cnt, loff_t *ppos) 5811 size_t cnt, loff_t *ppos)
5613{ 5812{
5614 struct trace_array *tr = filp->private_data; 5813 struct trace_array *tr = filp->private_data;
5615 struct ring_buffer *buffer = tr->trace_buffer.buffer;
5616 char buf[64]; 5814 char buf[64];
5617 int r; 5815 int r;
5618 5816
5619 if (buffer) 5817 r = tracer_tracing_is_on(tr);
5620 r = ring_buffer_record_is_on(buffer);
5621 else
5622 r = 0;
5623
5624 r = sprintf(buf, "%d\n", r); 5818 r = sprintf(buf, "%d\n", r);
5625 5819
5626 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 5820 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -5642,11 +5836,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
5642 if (buffer) { 5836 if (buffer) {
5643 mutex_lock(&trace_types_lock); 5837 mutex_lock(&trace_types_lock);
5644 if (val) { 5838 if (val) {
5645 ring_buffer_record_on(buffer); 5839 tracer_tracing_on(tr);
5646 if (tr->current_trace->start) 5840 if (tr->current_trace->start)
5647 tr->current_trace->start(tr); 5841 tr->current_trace->start(tr);
5648 } else { 5842 } else {
5649 ring_buffer_record_off(buffer); 5843 tracer_tracing_off(tr);
5650 if (tr->current_trace->stop) 5844 if (tr->current_trace->stop)
5651 tr->current_trace->stop(tr); 5845 tr->current_trace->stop(tr);
5652 } 5846 }
@@ -5659,9 +5853,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
5659} 5853}
5660 5854
5661static const struct file_operations rb_simple_fops = { 5855static const struct file_operations rb_simple_fops = {
5662 .open = tracing_open_generic, 5856 .open = tracing_open_generic_tr,
5663 .read = rb_simple_read, 5857 .read = rb_simple_read,
5664 .write = rb_simple_write, 5858 .write = rb_simple_write,
5859 .release = tracing_release_generic_tr,
5665 .llseek = default_llseek, 5860 .llseek = default_llseek,
5666}; 5861};
5667 5862
@@ -5670,17 +5865,6 @@ struct dentry *trace_instance_dir;
5670static void 5865static void
5671init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); 5866init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer);
5672 5867
5673static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf)
5674{
5675 int cpu;
5676
5677 for_each_tracing_cpu(cpu) {
5678 memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu));
5679 per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu;
5680 per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr;
5681 }
5682}
5683
5684static int 5868static int
5685allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) 5869allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)
5686{ 5870{
@@ -5698,8 +5882,6 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
5698 return -ENOMEM; 5882 return -ENOMEM;
5699 } 5883 }
5700 5884
5701 init_trace_buffers(tr, buf);
5702
5703 /* Allocate the first page for all buffers */ 5885 /* Allocate the first page for all buffers */
5704 set_buffer_entries(&tr->trace_buffer, 5886 set_buffer_entries(&tr->trace_buffer,
5705 ring_buffer_size(tr->trace_buffer.buffer, 0)); 5887 ring_buffer_size(tr->trace_buffer.buffer, 0));
@@ -5766,17 +5948,15 @@ static int new_instance_create(const char *name)
5766 if (allocate_trace_buffers(tr, trace_buf_size) < 0) 5948 if (allocate_trace_buffers(tr, trace_buf_size) < 0)
5767 goto out_free_tr; 5949 goto out_free_tr;
5768 5950
5769 /* Holder for file callbacks */
5770 tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
5771 tr->trace_cpu.tr = tr;
5772
5773 tr->dir = debugfs_create_dir(name, trace_instance_dir); 5951 tr->dir = debugfs_create_dir(name, trace_instance_dir);
5774 if (!tr->dir) 5952 if (!tr->dir)
5775 goto out_free_tr; 5953 goto out_free_tr;
5776 5954
5777 ret = event_trace_add_tracer(tr->dir, tr); 5955 ret = event_trace_add_tracer(tr->dir, tr);
5778 if (ret) 5956 if (ret) {
5957 debugfs_remove_recursive(tr->dir);
5779 goto out_free_tr; 5958 goto out_free_tr;
5959 }
5780 5960
5781 init_tracer_debugfs(tr, tr->dir); 5961 init_tracer_debugfs(tr, tr->dir);
5782 5962
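
The one-line fix in new_instance_create() is about unwinding in reverse order: the instance directory already exists by the time event_trace_add_tracer() can still fail, so the error path must remove that directory itself before jumping to out_free_tr, which only frees the buffers and the trace_array. The same goto-based unwind shape as a small, self-contained model with hypothetical stand-ins for the kernel helpers:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int create_instance(const char *name, int fail_late_step)
{
	char *buffers, *dir;
	int ret = -ENOMEM;

	buffers = malloc(64);                     /* allocate_trace_buffers() analogue */
	if (!buffers)
		return ret;

	dir = strdup(name);                       /* debugfs_create_dir() analogue */
	if (!dir)
		goto out_free_buffers;

	if (fail_late_step) {                     /* event_trace_add_tracer() analogue */
		ret = -EINVAL;
		free(dir);                        /* the fix: undo this step here ... */
		goto out_free_buffers;            /* ... before the common cleanup */
	}

	printf("instance '%s' ready\n", dir);
	free(dir);                                /* demo only: tear it down again */
	free(buffers);
	return 0;

out_free_buffers:
	free(buffers);
	return ret;
}

int main(void)
{
	create_instance("demo", 0);
	printf("late failure -> %d\n", create_instance("demo", 1));
	return 0;
}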
@@ -5922,18 +6102,18 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
5922 tr, &tracing_iter_fops); 6102 tr, &tracing_iter_fops);
5923 6103
5924 trace_create_file("trace", 0644, d_tracer, 6104 trace_create_file("trace", 0644, d_tracer,
5925 (void *)&tr->trace_cpu, &tracing_fops); 6105 tr, &tracing_fops);
5926 6106
5927 trace_create_file("trace_pipe", 0444, d_tracer, 6107 trace_create_file("trace_pipe", 0444, d_tracer,
5928 (void *)&tr->trace_cpu, &tracing_pipe_fops); 6108 tr, &tracing_pipe_fops);
5929 6109
5930 trace_create_file("buffer_size_kb", 0644, d_tracer, 6110 trace_create_file("buffer_size_kb", 0644, d_tracer,
5931 (void *)&tr->trace_cpu, &tracing_entries_fops); 6111 tr, &tracing_entries_fops);
5932 6112
5933 trace_create_file("buffer_total_size_kb", 0444, d_tracer, 6113 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
5934 tr, &tracing_total_entries_fops); 6114 tr, &tracing_total_entries_fops);
5935 6115
5936 trace_create_file("free_buffer", 0644, d_tracer, 6116 trace_create_file("free_buffer", 0200, d_tracer,
5937 tr, &tracing_free_buffer_fops); 6117 tr, &tracing_free_buffer_fops);
5938 6118
5939 trace_create_file("trace_marker", 0220, d_tracer, 6119 trace_create_file("trace_marker", 0220, d_tracer,
@@ -5943,11 +6123,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
5943 &trace_clock_fops); 6123 &trace_clock_fops);
5944 6124
5945 trace_create_file("tracing_on", 0644, d_tracer, 6125 trace_create_file("tracing_on", 0644, d_tracer,
5946 tr, &rb_simple_fops); 6126 tr, &rb_simple_fops);
5947 6127
5948#ifdef CONFIG_TRACER_SNAPSHOT 6128#ifdef CONFIG_TRACER_SNAPSHOT
5949 trace_create_file("snapshot", 0644, d_tracer, 6129 trace_create_file("snapshot", 0644, d_tracer,
5950 (void *)&tr->trace_cpu, &snapshot_fops); 6130 tr, &snapshot_fops);
5951#endif 6131#endif
5952 6132
5953 for_each_tracing_cpu(cpu) 6133 for_each_tracing_cpu(cpu)
@@ -6241,10 +6421,6 @@ __init static int tracer_alloc_buffers(void)
6241 6421
6242 global_trace.flags = TRACE_ARRAY_FL_GLOBAL; 6422 global_trace.flags = TRACE_ARRAY_FL_GLOBAL;
6243 6423
6244 /* Holder for file callbacks */
6245 global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
6246 global_trace.trace_cpu.tr = &global_trace;
6247
6248 INIT_LIST_HEAD(&global_trace.systems); 6424 INIT_LIST_HEAD(&global_trace.systems);
6249 INIT_LIST_HEAD(&global_trace.events); 6425 INIT_LIST_HEAD(&global_trace.events);
6250 list_add(&global_trace.list, &ftrace_trace_arrays); 6426 list_add(&global_trace.list, &ftrace_trace_arrays);
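The trace.c hunks above drop the per-file trace_cpu holder and instead hand the trace_array itself to debugfs via i_private, pairing it with the _tr open/release helpers wired into rb_simple_fops. A minimal userspace sketch of that reference-counted open/release pattern follows; the struct layout, the "dying" field and the helper bodies are illustrative only — the real trace_array_get()/trace_array_put() live in kernel/trace/trace.c and are declared in the trace.h hunk further down.

/* Sketch only: model of "every open pins the trace_array, every release
 * unpins it", so instance teardown cannot free tr under an open file. */
#include <stdio.h>
#include <errno.h>

struct trace_array { int ref; int dying; };

static int trace_array_get(struct trace_array *tr)
{
	if (tr->dying)			/* instance is being removed */
		return -ENODEV;
	tr->ref++;
	return 0;
}

static void trace_array_put(struct trace_array *tr)
{
	tr->ref--;
}

static int open_generic_tr(struct trace_array *tr)
{
	return trace_array_get(tr);	/* fail the open once tr is going away */
}

static void release_generic_tr(struct trace_array *tr)
{
	trace_array_put(tr);		/* drop the reference taken at open */
}

int main(void)
{
	struct trace_array tr = { .ref = 1 };

	if (!open_generic_tr(&tr))
		printf("opened, ref=%d\n", tr.ref);	/* ref=2 */
	release_generic_tr(&tr);
	printf("released, ref=%d\n", tr.ref);		/* ref=1 */
	return 0;
}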
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 20572ed88c5c..afaae41b0a02 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -130,19 +130,12 @@ enum trace_flag_type {
130 130
131struct trace_array; 131struct trace_array;
132 132
133struct trace_cpu {
134 struct trace_array *tr;
135 struct dentry *dir;
136 int cpu;
137};
138
139/* 133/*
140 * The CPU trace array - it consists of thousands of trace entries 134 * The CPU trace array - it consists of thousands of trace entries
141 * plus some other descriptor data: (for example which task started 135 * plus some other descriptor data: (for example which task started
142 * the trace, etc.) 136 * the trace, etc.)
143 */ 137 */
144struct trace_array_cpu { 138struct trace_array_cpu {
145 struct trace_cpu trace_cpu;
146 atomic_t disabled; 139 atomic_t disabled;
147 void *buffer_page; /* ring buffer spare */ 140 void *buffer_page; /* ring buffer spare */
148 141
@@ -196,7 +189,6 @@ struct trace_array {
196 bool allocated_snapshot; 189 bool allocated_snapshot;
197#endif 190#endif
198 int buffer_disabled; 191 int buffer_disabled;
199 struct trace_cpu trace_cpu; /* place holder */
200#ifdef CONFIG_FTRACE_SYSCALLS 192#ifdef CONFIG_FTRACE_SYSCALLS
201 int sys_refcount_enter; 193 int sys_refcount_enter;
202 int sys_refcount_exit; 194 int sys_refcount_exit;
@@ -214,7 +206,6 @@ struct trace_array {
214 struct dentry *event_dir; 206 struct dentry *event_dir;
215 struct list_head systems; 207 struct list_head systems;
216 struct list_head events; 208 struct list_head events;
217 struct task_struct *waiter;
218 int ref; 209 int ref;
219}; 210};
220 211
@@ -224,6 +215,11 @@ enum {
224 215
225extern struct list_head ftrace_trace_arrays; 216extern struct list_head ftrace_trace_arrays;
226 217
218extern struct mutex trace_types_lock;
219
220extern int trace_array_get(struct trace_array *tr);
221extern void trace_array_put(struct trace_array *tr);
222
227/* 223/*
228 * The global tracer (top) should be the first trace array added, 224 * The global tracer (top) should be the first trace array added,
229 * but we check the flag anyway. 225 * but we check the flag anyway.
@@ -554,11 +550,6 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu);
554 550
555void poll_wait_pipe(struct trace_iterator *iter); 551void poll_wait_pipe(struct trace_iterator *iter);
556 552
557void ftrace(struct trace_array *tr,
558 struct trace_array_cpu *data,
559 unsigned long ip,
560 unsigned long parent_ip,
561 unsigned long flags, int pc);
562void tracing_sched_switch_trace(struct trace_array *tr, 553void tracing_sched_switch_trace(struct trace_array *tr,
563 struct task_struct *prev, 554 struct task_struct *prev,
564 struct task_struct *next, 555 struct task_struct *next,
@@ -680,6 +671,15 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,
680 struct trace_array *tr); 671 struct trace_array *tr);
681extern int trace_selftest_startup_branch(struct tracer *trace, 672extern int trace_selftest_startup_branch(struct tracer *trace,
682 struct trace_array *tr); 673 struct trace_array *tr);
674/*
675 * Tracer data references selftest functions that only occur
676 * on boot up. These can be __init functions. Thus, when selftests
677 * are enabled, then the tracers need to reference __init functions.
678 */
679#define __tracer_data __refdata
680#else
681/* Tracers are seldom changed. Optimize when selftests are disabled. */
682#define __tracer_data __read_mostly
683#endif /* CONFIG_FTRACE_STARTUP_TEST */ 683#endif /* CONFIG_FTRACE_STARTUP_TEST */
684 684
685extern void *head_page(struct trace_array_cpu *data); 685extern void *head_page(struct trace_array_cpu *data);
@@ -774,6 +774,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
774extern struct list_head ftrace_pids; 774extern struct list_head ftrace_pids;
775 775
776#ifdef CONFIG_FUNCTION_TRACER 776#ifdef CONFIG_FUNCTION_TRACER
777extern bool ftrace_filter_param __initdata;
777static inline int ftrace_trace_task(struct task_struct *task) 778static inline int ftrace_trace_task(struct task_struct *task)
778{ 779{
779 if (list_empty(&ftrace_pids)) 780 if (list_empty(&ftrace_pids))
@@ -899,12 +900,6 @@ static inline void trace_branch_disable(void)
899/* set ring buffers to default size if not already done so */ 900/* set ring buffers to default size if not already done so */
900int tracing_update_buffers(void); 901int tracing_update_buffers(void);
901 902
902/* trace event type bit fields, not numeric */
903enum {
904 TRACE_EVENT_TYPE_PRINTF = 1,
905 TRACE_EVENT_TYPE_RAW = 2,
906};
907
908struct ftrace_event_field { 903struct ftrace_event_field {
909 struct list_head link; 904 struct list_head link;
910 const char *name; 905 const char *name;
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 84b1e045faba..80c36bcf66e8 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -236,6 +236,10 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
236 236
237 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); 237 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
238 238
239 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
240 "perf buffer not large enough"))
241 return NULL;
242
239 pc = preempt_count(); 243 pc = preempt_count();
240 244
241 *rctxp = perf_swevent_get_recursion_context(); 245 *rctxp = perf_swevent_get_recursion_context();
@@ -266,6 +270,10 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
266 struct pt_regs regs; 270 struct pt_regs regs;
267 int rctx; 271 int rctx;
268 272
273 head = this_cpu_ptr(event_function.perf_events);
274 if (hlist_empty(head))
275 return;
276
269#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ 277#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
270 sizeof(u64)) - sizeof(u32)) 278 sizeof(u64)) - sizeof(u32))
271 279
@@ -279,8 +287,6 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
279 287
280 entry->ip = ip; 288 entry->ip = ip;
281 entry->parent_ip = parent_ip; 289 entry->parent_ip = parent_ip;
282
283 head = this_cpu_ptr(event_function.perf_events);
284 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, 290 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
285 1, &regs, head, NULL); 291 1, &regs, head, NULL);
286 292
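A small sketch of the size guard added to perf_trace_buf_prepare() above; the companion hunk also bails out of perf_ftrace_function_call() early when no perf event is registered on this CPU. The buffer, constant value and stderr message here are simplified stand-ins — the kernel uses WARN_ONCE and its own per-CPU buffers.

#include <stdio.h>
#include <stddef.h>

#define PERF_MAX_TRACE_SIZE 8192	/* illustrative value */

static void *perf_trace_buf_prepare(size_t size)
{
	static char buf[PERF_MAX_TRACE_SIZE];

	if (size > PERF_MAX_TRACE_SIZE) {	/* WARN_ONCE(...) in the kernel */
		fprintf(stderr, "perf buffer not large enough\n");
		return NULL;
	}
	return buf;
}

int main(void)
{
	printf("%p\n", (void *)perf_trace_buf_prepare(64));	 /* valid buffer */
	printf("%p\n", (void *)perf_trace_buf_prepare(1 << 20)); /* NULL */
	return 0;
}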
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 27963e2bf4bf..29a7ebcfb426 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -41,6 +41,23 @@ static LIST_HEAD(ftrace_common_fields);
41static struct kmem_cache *field_cachep; 41static struct kmem_cache *field_cachep;
42static struct kmem_cache *file_cachep; 42static struct kmem_cache *file_cachep;
43 43
44#define SYSTEM_FL_FREE_NAME (1 << 31)
45
46static inline int system_refcount(struct event_subsystem *system)
47{
48 return system->ref_count & ~SYSTEM_FL_FREE_NAME;
49}
50
51static int system_refcount_inc(struct event_subsystem *system)
52{
53 return (system->ref_count++) & ~SYSTEM_FL_FREE_NAME;
54}
55
56static int system_refcount_dec(struct event_subsystem *system)
57{
58 return (--system->ref_count) & ~SYSTEM_FL_FREE_NAME;
59}
60
44/* Double loops, do not use break, only goto's work */ 61/* Double loops, do not use break, only goto's work */
45#define do_for_each_event_file(tr, file) \ 62#define do_for_each_event_file(tr, file) \
46 list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ 63 list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
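The system_refcount*() helpers introduced above pack a "name was dynamically allocated" flag into the top bit of ref_count while the low bits keep counting; __put_system() later frees the name only when the masked count reaches zero and the flag is set. A standalone sketch of that trick, with illustrative names:

#include <stdio.h>

#define FL_FREE_NAME (1u << 31)

static unsigned int ref = 1 | FL_FREE_NAME;	/* one user, dynamic name */

static unsigned int refcount(void)     { return ref & ~FL_FREE_NAME; }
static unsigned int refcount_dec(void) { return (--ref) & ~FL_FREE_NAME; }

int main(void)
{
	printf("count=%u\n", refcount());		/* 1 */
	if (!refcount_dec() && (ref & FL_FREE_NAME))
		printf("last put: would kfree(system->name)\n");
	return 0;
}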
@@ -97,7 +114,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
97 114
98 field = kmem_cache_alloc(field_cachep, GFP_TRACE); 115 field = kmem_cache_alloc(field_cachep, GFP_TRACE);
99 if (!field) 116 if (!field)
100 goto err; 117 return -ENOMEM;
101 118
102 field->name = name; 119 field->name = name;
103 field->type = type; 120 field->type = type;
@@ -114,11 +131,6 @@ static int __trace_define_field(struct list_head *head, const char *type,
114 list_add(&field->link, head); 131 list_add(&field->link, head);
115 132
116 return 0; 133 return 0;
117
118err:
119 kmem_cache_free(field_cachep, field);
120
121 return -ENOMEM;
122} 134}
123 135
124int trace_define_field(struct ftrace_event_call *call, const char *type, 136int trace_define_field(struct ftrace_event_call *call, const char *type,
@@ -279,9 +291,11 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
279 } 291 }
280 call->class->reg(call, TRACE_REG_UNREGISTER, file); 292 call->class->reg(call, TRACE_REG_UNREGISTER, file);
281 } 293 }
282 /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */ 294 /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
283 if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) 295 if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
284 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); 296 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
297 else
298 clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
285 break; 299 break;
286 case 1: 300 case 1:
287 /* 301 /*
@@ -349,8 +363,8 @@ static void __put_system(struct event_subsystem *system)
349{ 363{
350 struct event_filter *filter = system->filter; 364 struct event_filter *filter = system->filter;
351 365
352 WARN_ON_ONCE(system->ref_count == 0); 366 WARN_ON_ONCE(system_refcount(system) == 0);
353 if (--system->ref_count) 367 if (system_refcount_dec(system))
354 return; 368 return;
355 369
356 list_del(&system->list); 370 list_del(&system->list);
@@ -359,13 +373,15 @@ static void __put_system(struct event_subsystem *system)
359 kfree(filter->filter_string); 373 kfree(filter->filter_string);
360 kfree(filter); 374 kfree(filter);
361 } 375 }
376 if (system->ref_count & SYSTEM_FL_FREE_NAME)
377 kfree(system->name);
362 kfree(system); 378 kfree(system);
363} 379}
364 380
365static void __get_system(struct event_subsystem *system) 381static void __get_system(struct event_subsystem *system)
366{ 382{
367 WARN_ON_ONCE(system->ref_count == 0); 383 WARN_ON_ONCE(system_refcount(system) == 0);
368 system->ref_count++; 384 system_refcount_inc(system);
369} 385}
370 386
371static void __get_system_dir(struct ftrace_subsystem_dir *dir) 387static void __get_system_dir(struct ftrace_subsystem_dir *dir)
@@ -379,7 +395,7 @@ static void __put_system_dir(struct ftrace_subsystem_dir *dir)
379{ 395{
380 WARN_ON_ONCE(dir->ref_count == 0); 396 WARN_ON_ONCE(dir->ref_count == 0);
381 /* If the subsystem is about to be freed, the dir must be too */ 397 /* If the subsystem is about to be freed, the dir must be too */
382 WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1); 398 WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1);
383 399
384 __put_system(dir->subsystem); 400 __put_system(dir->subsystem);
385 if (!--dir->ref_count) 401 if (!--dir->ref_count)
@@ -393,17 +409,55 @@ static void put_system(struct ftrace_subsystem_dir *dir)
393 mutex_unlock(&event_mutex); 409 mutex_unlock(&event_mutex);
394} 410}
395 411
412static void remove_subsystem(struct ftrace_subsystem_dir *dir)
413{
414 if (!dir)
415 return;
416
417 if (!--dir->nr_events) {
418 debugfs_remove_recursive(dir->entry);
419 list_del(&dir->list);
420 __put_system_dir(dir);
421 }
422}
423
424static void *event_file_data(struct file *filp)
425{
426 return ACCESS_ONCE(file_inode(filp)->i_private);
427}
428
429static void remove_event_file_dir(struct ftrace_event_file *file)
430{
431 struct dentry *dir = file->dir;
432 struct dentry *child;
433
434 if (dir) {
435 spin_lock(&dir->d_lock); /* probably unneeded */
436 list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) {
437 if (child->d_inode) /* probably unneeded */
438 child->d_inode->i_private = NULL;
439 }
440 spin_unlock(&dir->d_lock);
441
442 debugfs_remove_recursive(dir);
443 }
444
445 list_del(&file->list);
446 remove_subsystem(file->system);
447 kmem_cache_free(file_cachep, file);
448}
449
396/* 450/*
397 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 451 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
398 */ 452 */
399static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, 453static int
400 const char *sub, const char *event, int set) 454__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
455 const char *sub, const char *event, int set)
401{ 456{
402 struct ftrace_event_file *file; 457 struct ftrace_event_file *file;
403 struct ftrace_event_call *call; 458 struct ftrace_event_call *call;
404 int ret = -EINVAL; 459 int ret = -EINVAL;
405 460
406 mutex_lock(&event_mutex);
407 list_for_each_entry(file, &tr->events, list) { 461 list_for_each_entry(file, &tr->events, list) {
408 462
409 call = file->event_call; 463 call = file->event_call;
@@ -429,6 +483,17 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
429 483
430 ret = 0; 484 ret = 0;
431 } 485 }
486
487 return ret;
488}
489
490static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
491 const char *sub, const char *event, int set)
492{
493 int ret;
494
495 mutex_lock(&event_mutex);
496 ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set);
432 mutex_unlock(&event_mutex); 497 mutex_unlock(&event_mutex);
433 498
434 return ret; 499 return ret;
@@ -623,18 +688,28 @@ static ssize_t
623event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, 688event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
624 loff_t *ppos) 689 loff_t *ppos)
625{ 690{
626 struct ftrace_event_file *file = filp->private_data; 691 struct ftrace_event_file *file;
627 char *buf; 692 unsigned long flags;
693 char buf[4] = "0";
628 694
629 if (file->flags & FTRACE_EVENT_FL_ENABLED) { 695 mutex_lock(&event_mutex);
630 if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED) 696 file = event_file_data(filp);
631 buf = "0*\n"; 697 if (likely(file))
632 else if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) 698 flags = file->flags;
633 buf = "1*\n"; 699 mutex_unlock(&event_mutex);
634 else 700
635 buf = "1\n"; 701 if (!file)
636 } else 702 return -ENODEV;
637 buf = "0\n"; 703
704 if (flags & FTRACE_EVENT_FL_ENABLED &&
705 !(flags & FTRACE_EVENT_FL_SOFT_DISABLED))
706 strcpy(buf, "1");
707
708 if (flags & FTRACE_EVENT_FL_SOFT_DISABLED ||
709 flags & FTRACE_EVENT_FL_SOFT_MODE)
710 strcat(buf, "*");
711
712 strcat(buf, "\n");
638 713
639 return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf)); 714 return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf));
640} 715}
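The reworked event_enable_read() above is the reader side of the new event_file_data() scheme: teardown NULLs the i_private back-pointer, and readers re-fetch it under event_mutex, returning -ENODEV once the file is gone. A minimal userspace model of that pattern follows; the struct, the global standing in for inode->i_private and the pthread mutex are all illustrative, not kernel API (build with -pthread).

#include <stdio.h>
#include <errno.h>
#include <pthread.h>

struct event_file { int enabled; };

static pthread_mutex_t event_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct event_file *i_private;	/* stands in for inode->i_private */

static int event_enable_read(char *buf)
{
	struct event_file *file;
	int ret = -ENODEV;

	pthread_mutex_lock(&event_mutex);
	file = i_private;		/* event_file_data() */
	if (file) {
		buf[0] = file->enabled ? '1' : '0';
		buf[1] = '\0';
		ret = 0;
	}
	pthread_mutex_unlock(&event_mutex);
	return ret;
}

int main(void)
{
	static struct event_file f = { .enabled = 1 };
	char buf[4];

	i_private = &f;
	printf("%d %s\n", event_enable_read(buf), buf);	/* 0 1 */
	i_private = NULL;		/* what remove_event_file_dir() does */
	printf("%d\n", event_enable_read(buf));		/* -ENODEV */
	return 0;
}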
@@ -643,13 +718,10 @@ static ssize_t
643event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, 718event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
644 loff_t *ppos) 719 loff_t *ppos)
645{ 720{
646 struct ftrace_event_file *file = filp->private_data; 721 struct ftrace_event_file *file;
647 unsigned long val; 722 unsigned long val;
648 int ret; 723 int ret;
649 724
650 if (!file)
651 return -EINVAL;
652
653 ret = kstrtoul_from_user(ubuf, cnt, 10, &val); 725 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
654 if (ret) 726 if (ret)
655 return ret; 727 return ret;
@@ -661,8 +733,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
661 switch (val) { 733 switch (val) {
662 case 0: 734 case 0:
663 case 1: 735 case 1:
736 ret = -ENODEV;
664 mutex_lock(&event_mutex); 737 mutex_lock(&event_mutex);
665 ret = ftrace_event_enable_disable(file, val); 738 file = event_file_data(filp);
739 if (likely(file))
740 ret = ftrace_event_enable_disable(file, val);
666 mutex_unlock(&event_mutex); 741 mutex_unlock(&event_mutex);
667 break; 742 break;
668 743
@@ -769,65 +844,39 @@ enum {
769 844
770static void *f_next(struct seq_file *m, void *v, loff_t *pos) 845static void *f_next(struct seq_file *m, void *v, loff_t *pos)
771{ 846{
772 struct ftrace_event_call *call = m->private; 847 struct ftrace_event_call *call = event_file_data(m->private);
773 struct ftrace_event_field *field;
774 struct list_head *common_head = &ftrace_common_fields; 848 struct list_head *common_head = &ftrace_common_fields;
775 struct list_head *head = trace_get_fields(call); 849 struct list_head *head = trace_get_fields(call);
850 struct list_head *node = v;
776 851
777 (*pos)++; 852 (*pos)++;
778 853
779 switch ((unsigned long)v) { 854 switch ((unsigned long)v) {
780 case FORMAT_HEADER: 855 case FORMAT_HEADER:
781 if (unlikely(list_empty(common_head))) 856 node = common_head;
782 return NULL; 857 break;
783
784 field = list_entry(common_head->prev,
785 struct ftrace_event_field, link);
786 return field;
787 858
788 case FORMAT_FIELD_SEPERATOR: 859 case FORMAT_FIELD_SEPERATOR:
789 if (unlikely(list_empty(head))) 860 node = head;
790 return NULL; 861 break;
791
792 field = list_entry(head->prev, struct ftrace_event_field, link);
793 return field;
794 862
795 case FORMAT_PRINTFMT: 863 case FORMAT_PRINTFMT:
796 /* all done */ 864 /* all done */
797 return NULL; 865 return NULL;
798 } 866 }
799 867
800 field = v; 868 node = node->prev;
801 if (field->link.prev == common_head) 869 if (node == common_head)
802 return (void *)FORMAT_FIELD_SEPERATOR; 870 return (void *)FORMAT_FIELD_SEPERATOR;
803 else if (field->link.prev == head) 871 else if (node == head)
804 return (void *)FORMAT_PRINTFMT; 872 return (void *)FORMAT_PRINTFMT;
805 873 else
806 field = list_entry(field->link.prev, struct ftrace_event_field, link); 874 return node;
807
808 return field;
809}
810
811static void *f_start(struct seq_file *m, loff_t *pos)
812{
813 loff_t l = 0;
814 void *p;
815
816 /* Start by showing the header */
817 if (!*pos)
818 return (void *)FORMAT_HEADER;
819
820 p = (void *)FORMAT_HEADER;
821 do {
822 p = f_next(m, p, &l);
823 } while (p && l < *pos);
824
825 return p;
826} 875}
827 876
828static int f_show(struct seq_file *m, void *v) 877static int f_show(struct seq_file *m, void *v)
829{ 878{
830 struct ftrace_event_call *call = m->private; 879 struct ftrace_event_call *call = event_file_data(m->private);
831 struct ftrace_event_field *field; 880 struct ftrace_event_field *field;
832 const char *array_descriptor; 881 const char *array_descriptor;
833 882
@@ -848,8 +897,7 @@ static int f_show(struct seq_file *m, void *v)
848 return 0; 897 return 0;
849 } 898 }
850 899
851 field = v; 900 field = list_entry(v, struct ftrace_event_field, link);
852
853 /* 901 /*
854 * Smartly shows the array type(except dynamic array). 902 * Smartly shows the array type(except dynamic array).
855 * Normal: 903 * Normal:
@@ -876,8 +924,25 @@ static int f_show(struct seq_file *m, void *v)
876 return 0; 924 return 0;
877} 925}
878 926
927static void *f_start(struct seq_file *m, loff_t *pos)
928{
929 void *p = (void *)FORMAT_HEADER;
930 loff_t l = 0;
931
932 /* ->stop() is called even if ->start() fails */
933 mutex_lock(&event_mutex);
934 if (!event_file_data(m->private))
935 return ERR_PTR(-ENODEV);
936
937 while (l < *pos && p)
938 p = f_next(m, p, &l);
939
940 return p;
941}
942
879static void f_stop(struct seq_file *m, void *p) 943static void f_stop(struct seq_file *m, void *p)
880{ 944{
945 mutex_unlock(&event_mutex);
881} 946}
882 947
883static const struct seq_operations trace_format_seq_ops = { 948static const struct seq_operations trace_format_seq_ops = {
@@ -889,7 +954,6 @@ static const struct seq_operations trace_format_seq_ops = {
889 954
890static int trace_format_open(struct inode *inode, struct file *file) 955static int trace_format_open(struct inode *inode, struct file *file)
891{ 956{
892 struct ftrace_event_call *call = inode->i_private;
893 struct seq_file *m; 957 struct seq_file *m;
894 int ret; 958 int ret;
895 959
@@ -898,7 +962,7 @@ static int trace_format_open(struct inode *inode, struct file *file)
898 return ret; 962 return ret;
899 963
900 m = file->private_data; 964 m = file->private_data;
901 m->private = call; 965 m->private = file;
902 966
903 return 0; 967 return 0;
904} 968}
@@ -906,45 +970,47 @@ static int trace_format_open(struct inode *inode, struct file *file)
906static ssize_t 970static ssize_t
907event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) 971event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
908{ 972{
909 struct ftrace_event_call *call = filp->private_data; 973 int id = (long)event_file_data(filp);
910 struct trace_seq *s; 974 char buf[32];
911 int r; 975 int len;
912 976
913 if (*ppos) 977 if (*ppos)
914 return 0; 978 return 0;
915 979
916 s = kmalloc(sizeof(*s), GFP_KERNEL); 980 if (unlikely(!id))
917 if (!s) 981 return -ENODEV;
918 return -ENOMEM;
919 982
920 trace_seq_init(s); 983 len = sprintf(buf, "%d\n", id);
921 trace_seq_printf(s, "%d\n", call->event.type);
922 984
923 r = simple_read_from_buffer(ubuf, cnt, ppos, 985 return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
924 s->buffer, s->len);
925 kfree(s);
926 return r;
927} 986}
928 987
929static ssize_t 988static ssize_t
930event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, 989event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
931 loff_t *ppos) 990 loff_t *ppos)
932{ 991{
933 struct ftrace_event_call *call = filp->private_data; 992 struct ftrace_event_call *call;
934 struct trace_seq *s; 993 struct trace_seq *s;
935 int r; 994 int r = -ENODEV;
936 995
937 if (*ppos) 996 if (*ppos)
938 return 0; 997 return 0;
939 998
940 s = kmalloc(sizeof(*s), GFP_KERNEL); 999 s = kmalloc(sizeof(*s), GFP_KERNEL);
1000
941 if (!s) 1001 if (!s)
942 return -ENOMEM; 1002 return -ENOMEM;
943 1003
944 trace_seq_init(s); 1004 trace_seq_init(s);
945 1005
946 print_event_filter(call, s); 1006 mutex_lock(&event_mutex);
947 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1007 call = event_file_data(filp);
1008 if (call)
1009 print_event_filter(call, s);
1010 mutex_unlock(&event_mutex);
1011
1012 if (call)
1013 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
948 1014
949 kfree(s); 1015 kfree(s);
950 1016
@@ -955,9 +1021,9 @@ static ssize_t
955event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, 1021event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
956 loff_t *ppos) 1022 loff_t *ppos)
957{ 1023{
958 struct ftrace_event_call *call = filp->private_data; 1024 struct ftrace_event_call *call;
959 char *buf; 1025 char *buf;
960 int err; 1026 int err = -ENODEV;
961 1027
962 if (cnt >= PAGE_SIZE) 1028 if (cnt >= PAGE_SIZE)
963 return -EINVAL; 1029 return -EINVAL;
@@ -972,7 +1038,12 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
972 } 1038 }
973 buf[cnt] = '\0'; 1039 buf[cnt] = '\0';
974 1040
975 err = apply_event_filter(call, buf); 1041 mutex_lock(&event_mutex);
1042 call = event_file_data(filp);
1043 if (call)
1044 err = apply_event_filter(call, buf);
1045 mutex_unlock(&event_mutex);
1046
976 free_page((unsigned long) buf); 1047 free_page((unsigned long) buf);
977 if (err < 0) 1048 if (err < 0)
978 return err; 1049 return err;
@@ -992,6 +1063,7 @@ static int subsystem_open(struct inode *inode, struct file *filp)
992 int ret; 1063 int ret;
993 1064
994 /* Make sure the system still exists */ 1065 /* Make sure the system still exists */
1066 mutex_lock(&trace_types_lock);
995 mutex_lock(&event_mutex); 1067 mutex_lock(&event_mutex);
996 list_for_each_entry(tr, &ftrace_trace_arrays, list) { 1068 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
997 list_for_each_entry(dir, &tr->systems, list) { 1069 list_for_each_entry(dir, &tr->systems, list) {
@@ -1007,6 +1079,7 @@ static int subsystem_open(struct inode *inode, struct file *filp)
1007 } 1079 }
1008 exit_loop: 1080 exit_loop:
1009 mutex_unlock(&event_mutex); 1081 mutex_unlock(&event_mutex);
1082 mutex_unlock(&trace_types_lock);
1010 1083
1011 if (!system) 1084 if (!system)
1012 return -ENODEV; 1085 return -ENODEV;
@@ -1014,9 +1087,17 @@ static int subsystem_open(struct inode *inode, struct file *filp)
1014 /* Some versions of gcc think dir can be uninitialized here */ 1087 /* Some versions of gcc think dir can be uninitialized here */
1015 WARN_ON(!dir); 1088 WARN_ON(!dir);
1016 1089
1090 /* Still need to increment the ref count of the system */
1091 if (trace_array_get(tr) < 0) {
1092 put_system(dir);
1093 return -ENODEV;
1094 }
1095
1017 ret = tracing_open_generic(inode, filp); 1096 ret = tracing_open_generic(inode, filp);
1018 if (ret < 0) 1097 if (ret < 0) {
1098 trace_array_put(tr);
1019 put_system(dir); 1099 put_system(dir);
1100 }
1020 1101
1021 return ret; 1102 return ret;
1022} 1103}
@@ -1027,16 +1108,23 @@ static int system_tr_open(struct inode *inode, struct file *filp)
1027 struct trace_array *tr = inode->i_private; 1108 struct trace_array *tr = inode->i_private;
1028 int ret; 1109 int ret;
1029 1110
1111 if (trace_array_get(tr) < 0)
1112 return -ENODEV;
1113
1030 /* Make a temporary dir that has no system but points to tr */ 1114 /* Make a temporary dir that has no system but points to tr */
1031 dir = kzalloc(sizeof(*dir), GFP_KERNEL); 1115 dir = kzalloc(sizeof(*dir), GFP_KERNEL);
1032 if (!dir) 1116 if (!dir) {
1117 trace_array_put(tr);
1033 return -ENOMEM; 1118 return -ENOMEM;
1119 }
1034 1120
1035 dir->tr = tr; 1121 dir->tr = tr;
1036 1122
1037 ret = tracing_open_generic(inode, filp); 1123 ret = tracing_open_generic(inode, filp);
1038 if (ret < 0) 1124 if (ret < 0) {
1125 trace_array_put(tr);
1039 kfree(dir); 1126 kfree(dir);
1127 }
1040 1128
1041 filp->private_data = dir; 1129 filp->private_data = dir;
1042 1130
@@ -1047,6 +1135,8 @@ static int subsystem_release(struct inode *inode, struct file *file)
1047{ 1135{
1048 struct ftrace_subsystem_dir *dir = file->private_data; 1136 struct ftrace_subsystem_dir *dir = file->private_data;
1049 1137
1138 trace_array_put(dir->tr);
1139
1050 /* 1140 /*
1051 * If dir->subsystem is NULL, then this is a temporary 1141 * If dir->subsystem is NULL, then this is a temporary
1052 * descriptor that was made for a trace_array to enable 1142 * descriptor that was made for a trace_array to enable
@@ -1143,6 +1233,7 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
1143 1233
1144static int ftrace_event_avail_open(struct inode *inode, struct file *file); 1234static int ftrace_event_avail_open(struct inode *inode, struct file *file);
1145static int ftrace_event_set_open(struct inode *inode, struct file *file); 1235static int ftrace_event_set_open(struct inode *inode, struct file *file);
1236static int ftrace_event_release(struct inode *inode, struct file *file);
1146 1237
1147static const struct seq_operations show_event_seq_ops = { 1238static const struct seq_operations show_event_seq_ops = {
1148 .start = t_start, 1239 .start = t_start,
@@ -1170,7 +1261,7 @@ static const struct file_operations ftrace_set_event_fops = {
1170 .read = seq_read, 1261 .read = seq_read,
1171 .write = ftrace_event_write, 1262 .write = ftrace_event_write,
1172 .llseek = seq_lseek, 1263 .llseek = seq_lseek,
1173 .release = seq_release, 1264 .release = ftrace_event_release,
1174}; 1265};
1175 1266
1176static const struct file_operations ftrace_enable_fops = { 1267static const struct file_operations ftrace_enable_fops = {
@@ -1188,7 +1279,6 @@ static const struct file_operations ftrace_event_format_fops = {
1188}; 1279};
1189 1280
1190static const struct file_operations ftrace_event_id_fops = { 1281static const struct file_operations ftrace_event_id_fops = {
1191 .open = tracing_open_generic,
1192 .read = event_id_read, 1282 .read = event_id_read,
1193 .llseek = default_llseek, 1283 .llseek = default_llseek,
1194}; 1284};
@@ -1247,6 +1337,15 @@ ftrace_event_open(struct inode *inode, struct file *file,
1247 return ret; 1337 return ret;
1248} 1338}
1249 1339
1340static int ftrace_event_release(struct inode *inode, struct file *file)
1341{
1342 struct trace_array *tr = inode->i_private;
1343
1344 trace_array_put(tr);
1345
1346 return seq_release(inode, file);
1347}
1348
1250static int 1349static int
1251ftrace_event_avail_open(struct inode *inode, struct file *file) 1350ftrace_event_avail_open(struct inode *inode, struct file *file)
1252{ 1351{
@@ -1260,12 +1359,19 @@ ftrace_event_set_open(struct inode *inode, struct file *file)
1260{ 1359{
1261 const struct seq_operations *seq_ops = &show_set_event_seq_ops; 1360 const struct seq_operations *seq_ops = &show_set_event_seq_ops;
1262 struct trace_array *tr = inode->i_private; 1361 struct trace_array *tr = inode->i_private;
1362 int ret;
1363
1364 if (trace_array_get(tr) < 0)
1365 return -ENODEV;
1263 1366
1264 if ((file->f_mode & FMODE_WRITE) && 1367 if ((file->f_mode & FMODE_WRITE) &&
1265 (file->f_flags & O_TRUNC)) 1368 (file->f_flags & O_TRUNC))
1266 ftrace_clear_events(tr); 1369 ftrace_clear_events(tr);
1267 1370
1268 return ftrace_event_open(inode, file, seq_ops); 1371 ret = ftrace_event_open(inode, file, seq_ops);
1372 if (ret < 0)
1373 trace_array_put(tr);
1374 return ret;
1269} 1375}
1270 1376
1271static struct event_subsystem * 1377static struct event_subsystem *
@@ -1279,7 +1385,15 @@ create_new_subsystem(const char *name)
1279 return NULL; 1385 return NULL;
1280 1386
1281 system->ref_count = 1; 1387 system->ref_count = 1;
1282 system->name = name; 1388
1389 /* Only allocate if dynamic (kprobes and modules) */
1390 if (!core_kernel_data((unsigned long)name)) {
1391 system->ref_count |= SYSTEM_FL_FREE_NAME;
1392 system->name = kstrdup(name, GFP_KERNEL);
1393 if (!system->name)
1394 goto out_free;
1395 } else
1396 system->name = name;
1283 1397
1284 system->filter = NULL; 1398 system->filter = NULL;
1285 1399
@@ -1292,6 +1406,8 @@ create_new_subsystem(const char *name)
1292 return system; 1406 return system;
1293 1407
1294 out_free: 1408 out_free:
1409 if (system->ref_count & SYSTEM_FL_FREE_NAME)
1410 kfree(system->name);
1295 kfree(system); 1411 kfree(system);
1296 return NULL; 1412 return NULL;
1297} 1413}
@@ -1410,8 +1526,8 @@ event_create_dir(struct dentry *parent,
1410 1526
1411#ifdef CONFIG_PERF_EVENTS 1527#ifdef CONFIG_PERF_EVENTS
1412 if (call->event.type && call->class->reg) 1528 if (call->event.type && call->class->reg)
1413 trace_create_file("id", 0444, file->dir, call, 1529 trace_create_file("id", 0444, file->dir,
1414 id); 1530 (void *)(long)call->event.type, id);
1415#endif 1531#endif
1416 1532
1417 /* 1533 /*
@@ -1436,33 +1552,16 @@ event_create_dir(struct dentry *parent,
1436 return 0; 1552 return 0;
1437} 1553}
1438 1554
1439static void remove_subsystem(struct ftrace_subsystem_dir *dir)
1440{
1441 if (!dir)
1442 return;
1443
1444 if (!--dir->nr_events) {
1445 debugfs_remove_recursive(dir->entry);
1446 list_del(&dir->list);
1447 __put_system_dir(dir);
1448 }
1449}
1450
1451static void remove_event_from_tracers(struct ftrace_event_call *call) 1555static void remove_event_from_tracers(struct ftrace_event_call *call)
1452{ 1556{
1453 struct ftrace_event_file *file; 1557 struct ftrace_event_file *file;
1454 struct trace_array *tr; 1558 struct trace_array *tr;
1455 1559
1456 do_for_each_event_file_safe(tr, file) { 1560 do_for_each_event_file_safe(tr, file) {
1457
1458 if (file->event_call != call) 1561 if (file->event_call != call)
1459 continue; 1562 continue;
1460 1563
1461 list_del(&file->list); 1564 remove_event_file_dir(file);
1462 debugfs_remove_recursive(file->dir);
1463 remove_subsystem(file->system);
1464 kmem_cache_free(file_cachep, file);
1465
1466 /* 1565 /*
1467 * The do_for_each_event_file_safe() is 1566 * The do_for_each_event_file_safe() is
1468 * a double loop. After finding the call for this 1567 * a double loop. After finding the call for this
@@ -1591,6 +1690,7 @@ static void __add_event_to_tracers(struct ftrace_event_call *call,
1591int trace_add_event_call(struct ftrace_event_call *call) 1690int trace_add_event_call(struct ftrace_event_call *call)
1592{ 1691{
1593 int ret; 1692 int ret;
1693 mutex_lock(&trace_types_lock);
1594 mutex_lock(&event_mutex); 1694 mutex_lock(&event_mutex);
1595 1695
1596 ret = __register_event(call, NULL); 1696 ret = __register_event(call, NULL);
@@ -1598,11 +1698,13 @@ int trace_add_event_call(struct ftrace_event_call *call)
1598 __add_event_to_tracers(call, NULL); 1698 __add_event_to_tracers(call, NULL);
1599 1699
1600 mutex_unlock(&event_mutex); 1700 mutex_unlock(&event_mutex);
1701 mutex_unlock(&trace_types_lock);
1601 return ret; 1702 return ret;
1602} 1703}
1603 1704
1604/* 1705/*
1605 * Must be called under locking both of event_mutex and trace_event_sem. 1706 * Must be called under locking of trace_types_lock, event_mutex and
1707 * trace_event_sem.
1606 */ 1708 */
1607static void __trace_remove_event_call(struct ftrace_event_call *call) 1709static void __trace_remove_event_call(struct ftrace_event_call *call)
1608{ 1710{
@@ -1611,14 +1713,53 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
1611 destroy_preds(call); 1713 destroy_preds(call);
1612} 1714}
1613 1715
1716static int probe_remove_event_call(struct ftrace_event_call *call)
1717{
1718 struct trace_array *tr;
1719 struct ftrace_event_file *file;
1720
1721#ifdef CONFIG_PERF_EVENTS
1722 if (call->perf_refcount)
1723 return -EBUSY;
1724#endif
1725 do_for_each_event_file(tr, file) {
1726 if (file->event_call != call)
1727 continue;
1728 /*
1729 * We can't rely on ftrace_event_enable_disable(enable => 0)
1730 * we are going to do, FTRACE_EVENT_FL_SOFT_MODE can suppress
1731 * TRACE_REG_UNREGISTER.
1732 */
1733 if (file->flags & FTRACE_EVENT_FL_ENABLED)
1734 return -EBUSY;
1735 /*
1736 * The do_for_each_event_file_safe() is
1737 * a double loop. After finding the call for this
1738 * trace_array, we use break to jump to the next
1739 * trace_array.
1740 */
1741 break;
1742 } while_for_each_event_file();
1743
1744 __trace_remove_event_call(call);
1745
1746 return 0;
1747}
1748
1614/* Remove an event_call */ 1749/* Remove an event_call */
1615void trace_remove_event_call(struct ftrace_event_call *call) 1750int trace_remove_event_call(struct ftrace_event_call *call)
1616{ 1751{
1752 int ret;
1753
1754 mutex_lock(&trace_types_lock);
1617 mutex_lock(&event_mutex); 1755 mutex_lock(&event_mutex);
1618 down_write(&trace_event_sem); 1756 down_write(&trace_event_sem);
1619 __trace_remove_event_call(call); 1757 ret = probe_remove_event_call(call);
1620 up_write(&trace_event_sem); 1758 up_write(&trace_event_sem);
1621 mutex_unlock(&event_mutex); 1759 mutex_unlock(&event_mutex);
1760 mutex_unlock(&trace_types_lock);
1761
1762 return ret;
1622} 1763}
1623 1764
1624#define for_each_event(event, start, end) \ 1765#define for_each_event(event, start, end) \
@@ -1762,6 +1903,7 @@ static int trace_module_notify(struct notifier_block *self,
1762{ 1903{
1763 struct module *mod = data; 1904 struct module *mod = data;
1764 1905
1906 mutex_lock(&trace_types_lock);
1765 mutex_lock(&event_mutex); 1907 mutex_lock(&event_mutex);
1766 switch (val) { 1908 switch (val) {
1767 case MODULE_STATE_COMING: 1909 case MODULE_STATE_COMING:
@@ -1772,6 +1914,7 @@ static int trace_module_notify(struct notifier_block *self,
1772 break; 1914 break;
1773 } 1915 }
1774 mutex_unlock(&event_mutex); 1916 mutex_unlock(&event_mutex);
1917 mutex_unlock(&trace_types_lock);
1775 1918
1776 return 0; 1919 return 0;
1777} 1920}
@@ -2011,10 +2154,7 @@ event_enable_func(struct ftrace_hash *hash,
2011 int ret; 2154 int ret;
2012 2155
2013 /* hash funcs only work with set_ftrace_filter */ 2156 /* hash funcs only work with set_ftrace_filter */
2014 if (!enabled) 2157 if (!enabled || !param)
2015 return -EINVAL;
2016
2017 if (!param)
2018 return -EINVAL; 2158 return -EINVAL;
2019 2159
2020 system = strsep(&param, ":"); 2160 system = strsep(&param, ":");
@@ -2188,12 +2328,8 @@ __trace_remove_event_dirs(struct trace_array *tr)
2188{ 2328{
2189 struct ftrace_event_file *file, *next; 2329 struct ftrace_event_file *file, *next;
2190 2330
2191 list_for_each_entry_safe(file, next, &tr->events, list) { 2331 list_for_each_entry_safe(file, next, &tr->events, list)
2192 list_del(&file->list); 2332 remove_event_file_dir(file);
2193 debugfs_remove_recursive(file->dir);
2194 remove_subsystem(file->system);
2195 kmem_cache_free(file_cachep, file);
2196 }
2197} 2333}
2198 2334
2199static void 2335static void
@@ -2329,11 +2465,11 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
2329 2465
2330int event_trace_del_tracer(struct trace_array *tr) 2466int event_trace_del_tracer(struct trace_array *tr)
2331{ 2467{
2332 /* Disable any running events */
2333 __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
2334
2335 mutex_lock(&event_mutex); 2468 mutex_lock(&event_mutex);
2336 2469
2470 /* Disable any running events */
2471 __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
2472
2337 down_write(&trace_event_sem); 2473 down_write(&trace_event_sem);
2338 __trace_remove_event_dirs(tr); 2474 __trace_remove_event_dirs(tr);
2339 debugfs_remove_recursive(tr->event_dir); 2475 debugfs_remove_recursive(tr->event_dir);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e1b653f7e1ca..97daa8cf958d 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -44,6 +44,7 @@ enum filter_op_ids
44 OP_LE, 44 OP_LE,
45 OP_GT, 45 OP_GT,
46 OP_GE, 46 OP_GE,
47 OP_BAND,
47 OP_NONE, 48 OP_NONE,
48 OP_OPEN_PAREN, 49 OP_OPEN_PAREN,
49}; 50};
@@ -54,6 +55,7 @@ struct filter_op {
54 int precedence; 55 int precedence;
55}; 56};
56 57
58/* Order must be the same as enum filter_op_ids above */
57static struct filter_op filter_ops[] = { 59static struct filter_op filter_ops[] = {
58 { OP_OR, "||", 1 }, 60 { OP_OR, "||", 1 },
59 { OP_AND, "&&", 2 }, 61 { OP_AND, "&&", 2 },
@@ -64,6 +66,7 @@ static struct filter_op filter_ops[] = {
64 { OP_LE, "<=", 5 }, 66 { OP_LE, "<=", 5 },
65 { OP_GT, ">", 5 }, 67 { OP_GT, ">", 5 },
66 { OP_GE, ">=", 5 }, 68 { OP_GE, ">=", 5 },
69 { OP_BAND, "&", 6 },
67 { OP_NONE, "OP_NONE", 0 }, 70 { OP_NONE, "OP_NONE", 0 },
68 { OP_OPEN_PAREN, "(", 0 }, 71 { OP_OPEN_PAREN, "(", 0 },
69}; 72};
@@ -156,6 +159,9 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \
156 case OP_GE: \ 159 case OP_GE: \
157 match = (*addr >= val); \ 160 match = (*addr >= val); \
158 break; \ 161 break; \
162 case OP_BAND: \
163 match = (*addr & val); \
164 break; \
159 default: \ 165 default: \
160 break; \ 166 break; \
161 } \ 167 } \
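The OP_BAND case added to the generated filter_pred_<type>() comparisons above is a plain bitwise AND that matches when any requested bit is set. A standalone sketch of the evaluation, with made-up field and value names:

#include <stdio.h>

enum { OP_GT, OP_GE, OP_BAND };

static int filter_pred_u32(int op, unsigned int field, unsigned int val)
{
	switch (op) {
	case OP_GE:   return field >= val;
	case OP_BAND: return (field & val) != 0;	/* new operator */
	default:      return 0;
	}
}

int main(void)
{
	printf("%d\n", filter_pred_u32(OP_BAND, 0x6, 0x2));	/* 1 */
	return 0;
}

With the "&" entry in filter_ops, a filter string such as "flags & 2" becomes expressible alongside the existing ==, !=, <, <=, > and >= comparisons.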
@@ -631,17 +637,15 @@ static void append_filter_err(struct filter_parse_state *ps,
631 free_page((unsigned long) buf); 637 free_page((unsigned long) buf);
632} 638}
633 639
640/* caller must hold event_mutex */
634void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) 641void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
635{ 642{
636 struct event_filter *filter; 643 struct event_filter *filter = call->filter;
637 644
638 mutex_lock(&event_mutex);
639 filter = call->filter;
640 if (filter && filter->filter_string) 645 if (filter && filter->filter_string)
641 trace_seq_printf(s, "%s\n", filter->filter_string); 646 trace_seq_printf(s, "%s\n", filter->filter_string);
642 else 647 else
643 trace_seq_printf(s, "none\n"); 648 trace_seq_puts(s, "none\n");
644 mutex_unlock(&event_mutex);
645} 649}
646 650
647void print_subsystem_event_filter(struct event_subsystem *system, 651void print_subsystem_event_filter(struct event_subsystem *system,
@@ -654,7 +658,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
654 if (filter && filter->filter_string) 658 if (filter && filter->filter_string)
655 trace_seq_printf(s, "%s\n", filter->filter_string); 659 trace_seq_printf(s, "%s\n", filter->filter_string);
656 else 660 else
657 trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); 661 trace_seq_puts(s, DEFAULT_SYS_FILTER_MESSAGE "\n");
658 mutex_unlock(&event_mutex); 662 mutex_unlock(&event_mutex);
659} 663}
660 664
@@ -1835,23 +1839,22 @@ static int create_system_filter(struct event_subsystem *system,
1835 return err; 1839 return err;
1836} 1840}
1837 1841
1842/* caller must hold event_mutex */
1838int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1843int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1839{ 1844{
1840 struct event_filter *filter; 1845 struct event_filter *filter;
1841 int err = 0; 1846 int err;
1842
1843 mutex_lock(&event_mutex);
1844 1847
1845 if (!strcmp(strstrip(filter_string), "0")) { 1848 if (!strcmp(strstrip(filter_string), "0")) {
1846 filter_disable(call); 1849 filter_disable(call);
1847 filter = call->filter; 1850 filter = call->filter;
1848 if (!filter) 1851 if (!filter)
1849 goto out_unlock; 1852 return 0;
1850 RCU_INIT_POINTER(call->filter, NULL); 1853 RCU_INIT_POINTER(call->filter, NULL);
1851 /* Make sure the filter is not being used */ 1854 /* Make sure the filter is not being used */
1852 synchronize_sched(); 1855 synchronize_sched();
1853 __free_filter(filter); 1856 __free_filter(filter);
1854 goto out_unlock; 1857 return 0;
1855 } 1858 }
1856 1859
1857 err = create_filter(call, filter_string, true, &filter); 1860 err = create_filter(call, filter_string, true, &filter);
@@ -1878,8 +1881,6 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1878 __free_filter(tmp); 1881 __free_filter(tmp);
1879 } 1882 }
1880 } 1883 }
1881out_unlock:
1882 mutex_unlock(&event_mutex);
1883 1884
1884 return err; 1885 return err;
1885} 1886}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c4d6d7191988..38fe1483c508 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -199,7 +199,7 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
199 return 0; 199 return 0;
200} 200}
201 201
202static struct tracer function_trace __read_mostly = 202static struct tracer function_trace __tracer_data =
203{ 203{
204 .name = "function", 204 .name = "function",
205 .init = function_trace_init, 205 .init = function_trace_init,
@@ -290,6 +290,21 @@ ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
290 trace_dump_stack(STACK_SKIP); 290 trace_dump_stack(STACK_SKIP);
291} 291}
292 292
293static void
294ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data)
295{
296 if (update_count(data))
297 ftrace_dump(DUMP_ALL);
298}
299
300/* Only dump the current CPU buffer. */
301static void
302ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data)
303{
304 if (update_count(data))
305 ftrace_dump(DUMP_ORIG);
306}
307
293static int 308static int
294ftrace_probe_print(const char *name, struct seq_file *m, 309ftrace_probe_print(const char *name, struct seq_file *m,
295 unsigned long ip, void *data) 310 unsigned long ip, void *data)
@@ -327,6 +342,20 @@ ftrace_stacktrace_print(struct seq_file *m, unsigned long ip,
327 return ftrace_probe_print("stacktrace", m, ip, data); 342 return ftrace_probe_print("stacktrace", m, ip, data);
328} 343}
329 344
345static int
346ftrace_dump_print(struct seq_file *m, unsigned long ip,
347 struct ftrace_probe_ops *ops, void *data)
348{
349 return ftrace_probe_print("dump", m, ip, data);
350}
351
352static int
353ftrace_cpudump_print(struct seq_file *m, unsigned long ip,
354 struct ftrace_probe_ops *ops, void *data)
355{
356 return ftrace_probe_print("cpudump", m, ip, data);
357}
358
330static struct ftrace_probe_ops traceon_count_probe_ops = { 359static struct ftrace_probe_ops traceon_count_probe_ops = {
331 .func = ftrace_traceon_count, 360 .func = ftrace_traceon_count,
332 .print = ftrace_traceon_print, 361 .print = ftrace_traceon_print,
@@ -342,6 +371,16 @@ static struct ftrace_probe_ops stacktrace_count_probe_ops = {
342 .print = ftrace_stacktrace_print, 371 .print = ftrace_stacktrace_print,
343}; 372};
344 373
374static struct ftrace_probe_ops dump_probe_ops = {
375 .func = ftrace_dump_probe,
376 .print = ftrace_dump_print,
377};
378
379static struct ftrace_probe_ops cpudump_probe_ops = {
380 .func = ftrace_cpudump_probe,
381 .print = ftrace_cpudump_print,
382};
383
345static struct ftrace_probe_ops traceon_probe_ops = { 384static struct ftrace_probe_ops traceon_probe_ops = {
346 .func = ftrace_traceon, 385 .func = ftrace_traceon,
347 .print = ftrace_traceon_print, 386 .print = ftrace_traceon_print,
@@ -425,6 +464,32 @@ ftrace_stacktrace_callback(struct ftrace_hash *hash,
425 param, enable); 464 param, enable);
426} 465}
427 466
467static int
468ftrace_dump_callback(struct ftrace_hash *hash,
469 char *glob, char *cmd, char *param, int enable)
470{
471 struct ftrace_probe_ops *ops;
472
473 ops = &dump_probe_ops;
474
475 /* Only dump once. */
476 return ftrace_trace_probe_callback(ops, hash, glob, cmd,
477 "1", enable);
478}
479
480static int
481ftrace_cpudump_callback(struct ftrace_hash *hash,
482 char *glob, char *cmd, char *param, int enable)
483{
484 struct ftrace_probe_ops *ops;
485
486 ops = &cpudump_probe_ops;
487
488 /* Only dump once. */
489 return ftrace_trace_probe_callback(ops, hash, glob, cmd,
490 "1", enable);
491}
492
428static struct ftrace_func_command ftrace_traceon_cmd = { 493static struct ftrace_func_command ftrace_traceon_cmd = {
429 .name = "traceon", 494 .name = "traceon",
430 .func = ftrace_trace_onoff_callback, 495 .func = ftrace_trace_onoff_callback,
@@ -440,6 +505,16 @@ static struct ftrace_func_command ftrace_stacktrace_cmd = {
440 .func = ftrace_stacktrace_callback, 505 .func = ftrace_stacktrace_callback,
441}; 506};
442 507
508static struct ftrace_func_command ftrace_dump_cmd = {
509 .name = "dump",
510 .func = ftrace_dump_callback,
511};
512
513static struct ftrace_func_command ftrace_cpudump_cmd = {
514 .name = "cpudump",
515 .func = ftrace_cpudump_callback,
516};
517
443static int __init init_func_cmd_traceon(void) 518static int __init init_func_cmd_traceon(void)
444{ 519{
445 int ret; 520 int ret;
@@ -450,13 +525,31 @@ static int __init init_func_cmd_traceon(void)
450 525
451 ret = register_ftrace_command(&ftrace_traceon_cmd); 526 ret = register_ftrace_command(&ftrace_traceon_cmd);
452 if (ret) 527 if (ret)
453 unregister_ftrace_command(&ftrace_traceoff_cmd); 528 goto out_free_traceoff;
454 529
455 ret = register_ftrace_command(&ftrace_stacktrace_cmd); 530 ret = register_ftrace_command(&ftrace_stacktrace_cmd);
456 if (ret) { 531 if (ret)
457 unregister_ftrace_command(&ftrace_traceoff_cmd); 532 goto out_free_traceon;
458 unregister_ftrace_command(&ftrace_traceon_cmd); 533
459 } 534 ret = register_ftrace_command(&ftrace_dump_cmd);
535 if (ret)
536 goto out_free_stacktrace;
537
538 ret = register_ftrace_command(&ftrace_cpudump_cmd);
539 if (ret)
540 goto out_free_dump;
541
542 return 0;
543
544 out_free_dump:
545 unregister_ftrace_command(&ftrace_dump_cmd);
546 out_free_stacktrace:
547 unregister_ftrace_command(&ftrace_stacktrace_cmd);
548 out_free_traceon:
549 unregister_ftrace_command(&ftrace_traceon_cmd);
550 out_free_traceoff:
551 unregister_ftrace_command(&ftrace_traceoff_cmd);
552
460 return ret; 553 return ret;
461} 554}
462#else 555#else
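init_func_cmd_traceon() above moves from ad-hoc cleanup to a goto-unwind ladder: each successful registration gets a matching label, so a later failure unregisters everything already done, in reverse order. A standalone sketch of that structure; register_cmd()/unregister_cmd() are stand-ins, not kernel API.

#include <stdio.h>

static int register_cmd(const char *name, int fail)
{
	if (fail)
		return -1;
	printf("registered %s\n", name);
	return 0;
}

static void unregister_cmd(const char *name)
{
	printf("unregistered %s\n", name);
}

static int init_cmds(int fail_at_dump)
{
	int ret;

	if ((ret = register_cmd("traceoff", 0)))
		return ret;
	if ((ret = register_cmd("traceon", 0)))
		goto out_free_traceoff;
	if ((ret = register_cmd("stacktrace", 0)))
		goto out_free_traceon;
	if ((ret = register_cmd("dump", fail_at_dump)))
		goto out_free_stacktrace;
	if ((ret = register_cmd("cpudump", 0)))
		goto out_free_dump;
	return 0;

 out_free_dump:
	unregister_cmd("dump");
 out_free_stacktrace:
	unregister_cmd("stacktrace");
 out_free_traceon:
	unregister_cmd("traceon");
 out_free_traceoff:
	unregister_cmd("traceoff");
	return ret;
}

int main(void)
{
	return init_cmds(1) ? 1 : 0;	/* exercise the unwind path */
}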
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 8388bc99f2ee..b5c09242683d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -446,7 +446,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
446 446
447 /* First spaces to align center */ 447 /* First spaces to align center */
448 for (i = 0; i < spaces / 2; i++) { 448 for (i = 0; i < spaces / 2; i++) {
449 ret = trace_seq_printf(s, " "); 449 ret = trace_seq_putc(s, ' ');
450 if (!ret) 450 if (!ret)
451 return TRACE_TYPE_PARTIAL_LINE; 451 return TRACE_TYPE_PARTIAL_LINE;
452 } 452 }
@@ -457,7 +457,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
457 457
458 /* Last spaces to align center */ 458 /* Last spaces to align center */
459 for (i = 0; i < spaces - (spaces / 2); i++) { 459 for (i = 0; i < spaces - (spaces / 2); i++) {
460 ret = trace_seq_printf(s, " "); 460 ret = trace_seq_putc(s, ' ');
461 if (!ret) 461 if (!ret)
462 return TRACE_TYPE_PARTIAL_LINE; 462 return TRACE_TYPE_PARTIAL_LINE;
463 } 463 }
@@ -503,7 +503,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
503 ------------------------------------------ 503 ------------------------------------------
504 504
505 */ 505 */
506 ret = trace_seq_printf(s, 506 ret = trace_seq_puts(s,
507 " ------------------------------------------\n"); 507 " ------------------------------------------\n");
508 if (!ret) 508 if (!ret)
509 return TRACE_TYPE_PARTIAL_LINE; 509 return TRACE_TYPE_PARTIAL_LINE;
@@ -516,7 +516,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
516 if (ret == TRACE_TYPE_PARTIAL_LINE) 516 if (ret == TRACE_TYPE_PARTIAL_LINE)
517 return TRACE_TYPE_PARTIAL_LINE; 517 return TRACE_TYPE_PARTIAL_LINE;
518 518
519 ret = trace_seq_printf(s, " => "); 519 ret = trace_seq_puts(s, " => ");
520 if (!ret) 520 if (!ret)
521 return TRACE_TYPE_PARTIAL_LINE; 521 return TRACE_TYPE_PARTIAL_LINE;
522 522
@@ -524,7 +524,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
524 if (ret == TRACE_TYPE_PARTIAL_LINE) 524 if (ret == TRACE_TYPE_PARTIAL_LINE)
525 return TRACE_TYPE_PARTIAL_LINE; 525 return TRACE_TYPE_PARTIAL_LINE;
526 526
527 ret = trace_seq_printf(s, 527 ret = trace_seq_puts(s,
528 "\n ------------------------------------------\n\n"); 528 "\n ------------------------------------------\n\n");
529 if (!ret) 529 if (!ret)
530 return TRACE_TYPE_PARTIAL_LINE; 530 return TRACE_TYPE_PARTIAL_LINE;
@@ -645,7 +645,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
645 ret = print_graph_proc(s, pid); 645 ret = print_graph_proc(s, pid);
646 if (ret == TRACE_TYPE_PARTIAL_LINE) 646 if (ret == TRACE_TYPE_PARTIAL_LINE)
647 return TRACE_TYPE_PARTIAL_LINE; 647 return TRACE_TYPE_PARTIAL_LINE;
648 ret = trace_seq_printf(s, " | "); 648 ret = trace_seq_puts(s, " | ");
649 if (!ret) 649 if (!ret)
650 return TRACE_TYPE_PARTIAL_LINE; 650 return TRACE_TYPE_PARTIAL_LINE;
651 } 651 }
@@ -657,9 +657,9 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
657 return ret; 657 return ret;
658 658
659 if (type == TRACE_GRAPH_ENT) 659 if (type == TRACE_GRAPH_ENT)
660 ret = trace_seq_printf(s, "==========>"); 660 ret = trace_seq_puts(s, "==========>");
661 else 661 else
662 ret = trace_seq_printf(s, "<=========="); 662 ret = trace_seq_puts(s, "<==========");
663 663
664 if (!ret) 664 if (!ret)
665 return TRACE_TYPE_PARTIAL_LINE; 665 return TRACE_TYPE_PARTIAL_LINE;
@@ -668,7 +668,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
668 if (ret != TRACE_TYPE_HANDLED) 668 if (ret != TRACE_TYPE_HANDLED)
669 return ret; 669 return ret;
670 670
671 ret = trace_seq_printf(s, "\n"); 671 ret = trace_seq_putc(s, '\n');
672 672
673 if (!ret) 673 if (!ret)
674 return TRACE_TYPE_PARTIAL_LINE; 674 return TRACE_TYPE_PARTIAL_LINE;
@@ -705,13 +705,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
705 len += strlen(nsecs_str); 705 len += strlen(nsecs_str);
706 } 706 }
707 707
708 ret = trace_seq_printf(s, " us "); 708 ret = trace_seq_puts(s, " us ");
709 if (!ret) 709 if (!ret)
710 return TRACE_TYPE_PARTIAL_LINE; 710 return TRACE_TYPE_PARTIAL_LINE;
711 711
712 /* Print remaining spaces to fit the row's width */ 712 /* Print remaining spaces to fit the row's width */
713 for (i = len; i < 7; i++) { 713 for (i = len; i < 7; i++) {
714 ret = trace_seq_printf(s, " "); 714 ret = trace_seq_putc(s, ' ');
715 if (!ret) 715 if (!ret)
716 return TRACE_TYPE_PARTIAL_LINE; 716 return TRACE_TYPE_PARTIAL_LINE;
717 } 717 }
@@ -731,13 +731,13 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
731 /* No real adata, just filling the column with spaces */ 731 /* No real adata, just filling the column with spaces */
732 switch (duration) { 732 switch (duration) {
733 case DURATION_FILL_FULL: 733 case DURATION_FILL_FULL:
734 ret = trace_seq_printf(s, " | "); 734 ret = trace_seq_puts(s, " | ");
735 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 735 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
736 case DURATION_FILL_START: 736 case DURATION_FILL_START:
737 ret = trace_seq_printf(s, " "); 737 ret = trace_seq_puts(s, " ");
738 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 738 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
739 case DURATION_FILL_END: 739 case DURATION_FILL_END:
740 ret = trace_seq_printf(s, " |"); 740 ret = trace_seq_puts(s, " |");
741 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 741 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
742 } 742 }
743 743
@@ -745,10 +745,10 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
745 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { 745 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
746 /* Duration exceeded 100 msecs */ 746 /* Duration exceeded 100 msecs */
747 if (duration > 100000ULL) 747 if (duration > 100000ULL)
748 ret = trace_seq_printf(s, "! "); 748 ret = trace_seq_puts(s, "! ");
749 /* Duration exceeded 10 msecs */ 749 /* Duration exceeded 10 msecs */
750 else if (duration > 10000ULL) 750 else if (duration > 10000ULL)
751 ret = trace_seq_printf(s, "+ "); 751 ret = trace_seq_puts(s, "+ ");
752 } 752 }
753 753
754 /* 754 /*
@@ -757,7 +757,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
757 * to fill out the space. 757 * to fill out the space.
758 */ 758 */
759 if (ret == -1) 759 if (ret == -1)
760 ret = trace_seq_printf(s, " "); 760 ret = trace_seq_puts(s, " ");
761 761
762 /* Catching here any failure happenned above */ 762 /* Catching here any failure happenned above */
763 if (!ret) 763 if (!ret)
@@ -767,7 +767,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
767 if (ret != TRACE_TYPE_HANDLED) 767 if (ret != TRACE_TYPE_HANDLED)
768 return ret; 768 return ret;
769 769
770 ret = trace_seq_printf(s, "| "); 770 ret = trace_seq_puts(s, "| ");
771 if (!ret) 771 if (!ret)
772 return TRACE_TYPE_PARTIAL_LINE; 772 return TRACE_TYPE_PARTIAL_LINE;
773 773
@@ -817,7 +817,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
817 817
818 /* Function */ 818 /* Function */
819 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 819 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
820 ret = trace_seq_printf(s, " "); 820 ret = trace_seq_putc(s, ' ');
821 if (!ret) 821 if (!ret)
822 return TRACE_TYPE_PARTIAL_LINE; 822 return TRACE_TYPE_PARTIAL_LINE;
823 } 823 }
@@ -858,7 +858,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
858 858
859 /* Function */ 859 /* Function */
860 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 860 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
861 ret = trace_seq_printf(s, " "); 861 ret = trace_seq_putc(s, ' ');
862 if (!ret) 862 if (!ret)
863 return TRACE_TYPE_PARTIAL_LINE; 863 return TRACE_TYPE_PARTIAL_LINE;
864 } 864 }
@@ -917,7 +917,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
917 if (ret == TRACE_TYPE_PARTIAL_LINE) 917 if (ret == TRACE_TYPE_PARTIAL_LINE)
918 return TRACE_TYPE_PARTIAL_LINE; 918 return TRACE_TYPE_PARTIAL_LINE;
919 919
920 ret = trace_seq_printf(s, " | "); 920 ret = trace_seq_puts(s, " | ");
921 if (!ret) 921 if (!ret)
922 return TRACE_TYPE_PARTIAL_LINE; 922 return TRACE_TYPE_PARTIAL_LINE;
923 } 923 }
@@ -1117,7 +1117,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1117 1117
1118 /* Closing brace */ 1118 /* Closing brace */
1119 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { 1119 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
1120 ret = trace_seq_printf(s, " "); 1120 ret = trace_seq_putc(s, ' ');
1121 if (!ret) 1121 if (!ret)
1122 return TRACE_TYPE_PARTIAL_LINE; 1122 return TRACE_TYPE_PARTIAL_LINE;
1123 } 1123 }
@@ -1129,7 +1129,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1129 * belongs to, write out the function name. 1129 * belongs to, write out the function name.
1130 */ 1130 */
1131 if (func_match) { 1131 if (func_match) {
1132 ret = trace_seq_printf(s, "}\n"); 1132 ret = trace_seq_puts(s, "}\n");
1133 if (!ret) 1133 if (!ret)
1134 return TRACE_TYPE_PARTIAL_LINE; 1134 return TRACE_TYPE_PARTIAL_LINE;
1135 } else { 1135 } else {
@@ -1179,13 +1179,13 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1179 /* Indentation */ 1179 /* Indentation */
1180 if (depth > 0) 1180 if (depth > 0)
1181 for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { 1181 for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) {
1182 ret = trace_seq_printf(s, " "); 1182 ret = trace_seq_putc(s, ' ');
1183 if (!ret) 1183 if (!ret)
1184 return TRACE_TYPE_PARTIAL_LINE; 1184 return TRACE_TYPE_PARTIAL_LINE;
1185 } 1185 }
1186 1186
1187 /* The comment */ 1187 /* The comment */
1188 ret = trace_seq_printf(s, "/* "); 1188 ret = trace_seq_puts(s, "/* ");
1189 if (!ret) 1189 if (!ret)
1190 return TRACE_TYPE_PARTIAL_LINE; 1190 return TRACE_TYPE_PARTIAL_LINE;
1191 1191
@@ -1216,7 +1216,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1216 s->len--; 1216 s->len--;
1217 } 1217 }
1218 1218
1219 ret = trace_seq_printf(s, " */\n"); 1219 ret = trace_seq_puts(s, " */\n");
1220 if (!ret) 1220 if (!ret)
1221 return TRACE_TYPE_PARTIAL_LINE; 1221 return TRACE_TYPE_PARTIAL_LINE;
1222 1222
@@ -1448,7 +1448,7 @@ static struct trace_event graph_trace_ret_event = {
1448 .funcs = &graph_functions 1448 .funcs = &graph_functions
1449}; 1449};
1450 1450
1451static struct tracer graph_trace __read_mostly = { 1451static struct tracer graph_trace __tracer_data = {
1452 .name = "function_graph", 1452 .name = "function_graph",
1453 .open = graph_trace_open, 1453 .open = graph_trace_open,
1454 .pipe_open = graph_trace_open, 1454 .pipe_open = graph_trace_open,
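The trace_functions_graph.c hunks above swap trace_seq_printf() for trace_seq_puts()/trace_seq_putc() wherever the output is a constant string or a single character, avoiding the format-string parser on a hot print path. A minimal sketch of the same pattern in an output callback, assuming the 3.x trace_seq API in which these helpers return non-zero on success and 0 when the seq buffer is full; my_print_header() is illustrative and not part of the patch:

#include <linux/ftrace_event.h>
#include <linux/trace_seq.h>

static enum print_line_t my_print_header(struct trace_seq *s)
{
        /* Constant string: no need for the printf() machinery. */
        if (!trace_seq_puts(s, " | "))
                return TRACE_TYPE_PARTIAL_LINE;

        /* A single character is cheaper still via putc(). */
        if (!trace_seq_putc(s, '\n'))
                return TRACE_TYPE_PARTIAL_LINE;

        return TRACE_TYPE_HANDLED;
}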
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index b19d065a28cb..2aefbee93a6d 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -373,7 +373,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
373 struct trace_array_cpu *data; 373 struct trace_array_cpu *data;
374 unsigned long flags; 374 unsigned long flags;
375 375
376 if (likely(!tracer_enabled)) 376 if (!tracer_enabled || !tracing_is_enabled())
377 return; 377 return;
378 378
379 cpu = raw_smp_processor_id(); 379 cpu = raw_smp_processor_id();
@@ -416,7 +416,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
416 else 416 else
417 return; 417 return;
418 418
419 if (!tracer_enabled) 419 if (!tracer_enabled || !tracing_is_enabled())
420 return; 420 return;
421 421
422 data = per_cpu_ptr(tr->trace_buffer.data, cpu); 422 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 9f46e98ba8f2..243f6834d026 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -35,12 +35,17 @@ struct trace_probe {
35 const char *symbol; /* symbol name */ 35 const char *symbol; /* symbol name */
36 struct ftrace_event_class class; 36 struct ftrace_event_class class;
37 struct ftrace_event_call call; 37 struct ftrace_event_call call;
38 struct ftrace_event_file * __rcu *files; 38 struct list_head files;
39 ssize_t size; /* trace entry size */ 39 ssize_t size; /* trace entry size */
40 unsigned int nr_args; 40 unsigned int nr_args;
41 struct probe_arg args[]; 41 struct probe_arg args[];
42}; 42};
43 43
44struct event_file_link {
45 struct ftrace_event_file *file;
46 struct list_head list;
47};
48
44#define SIZEOF_TRACE_PROBE(n) \ 49#define SIZEOF_TRACE_PROBE(n) \
45 (offsetof(struct trace_probe, args) + \ 50 (offsetof(struct trace_probe, args) + \
46 (sizeof(struct probe_arg) * (n))) 51 (sizeof(struct probe_arg) * (n)))
@@ -90,7 +95,7 @@ static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp)
90} 95}
91 96
92static int register_probe_event(struct trace_probe *tp); 97static int register_probe_event(struct trace_probe *tp);
93static void unregister_probe_event(struct trace_probe *tp); 98static int unregister_probe_event(struct trace_probe *tp);
94 99
95static DEFINE_MUTEX(probe_lock); 100static DEFINE_MUTEX(probe_lock);
96static LIST_HEAD(probe_list); 101static LIST_HEAD(probe_list);
@@ -150,6 +155,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
150 goto error; 155 goto error;
151 156
152 INIT_LIST_HEAD(&tp->list); 157 INIT_LIST_HEAD(&tp->list);
158 INIT_LIST_HEAD(&tp->files);
153 return tp; 159 return tp;
154error: 160error:
155 kfree(tp->call.name); 161 kfree(tp->call.name);
@@ -183,25 +189,6 @@ static struct trace_probe *find_trace_probe(const char *event,
183 return NULL; 189 return NULL;
184} 190}
185 191
186static int trace_probe_nr_files(struct trace_probe *tp)
187{
188 struct ftrace_event_file **file;
189 int ret = 0;
190
191 /*
192 * Since all tp->files updater is protected by probe_enable_lock,
193 * we don't need to lock an rcu_read_lock.
194 */
195 file = rcu_dereference_raw(tp->files);
196 if (file)
197 while (*(file++))
198 ret++;
199
200 return ret;
201}
202
203static DEFINE_MUTEX(probe_enable_lock);
204
205/* 192/*
206 * Enable trace_probe 193 * Enable trace_probe
207 * if the file is NULL, enable "perf" handler, or enable "trace" handler. 194 * if the file is NULL, enable "perf" handler, or enable "trace" handler.
@@ -211,67 +198,42 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
211{ 198{
212 int ret = 0; 199 int ret = 0;
213 200
214 mutex_lock(&probe_enable_lock);
215
216 if (file) { 201 if (file) {
217 struct ftrace_event_file **new, **old; 202 struct event_file_link *link;
218 int n = trace_probe_nr_files(tp); 203
219 204 link = kmalloc(sizeof(*link), GFP_KERNEL);
220 old = rcu_dereference_raw(tp->files); 205 if (!link) {
221 /* 1 is for new one and 1 is for stopper */
222 new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *),
223 GFP_KERNEL);
224 if (!new) {
225 ret = -ENOMEM; 206 ret = -ENOMEM;
226 goto out_unlock; 207 goto out;
227 } 208 }
228 memcpy(new, old, n * sizeof(struct ftrace_event_file *));
229 new[n] = file;
230 /* The last one keeps a NULL */
231 209
232 rcu_assign_pointer(tp->files, new); 210 link->file = file;
233 tp->flags |= TP_FLAG_TRACE; 211 list_add_tail_rcu(&link->list, &tp->files);
234 212
235 if (old) { 213 tp->flags |= TP_FLAG_TRACE;
236 /* Make sure the probe is done with old files */
237 synchronize_sched();
238 kfree(old);
239 }
240 } else 214 } else
241 tp->flags |= TP_FLAG_PROFILE; 215 tp->flags |= TP_FLAG_PROFILE;
242 216
243 if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && 217 if (trace_probe_is_registered(tp) && !trace_probe_has_gone(tp)) {
244 !trace_probe_has_gone(tp)) {
245 if (trace_probe_is_return(tp)) 218 if (trace_probe_is_return(tp))
246 ret = enable_kretprobe(&tp->rp); 219 ret = enable_kretprobe(&tp->rp);
247 else 220 else
248 ret = enable_kprobe(&tp->rp.kp); 221 ret = enable_kprobe(&tp->rp.kp);
249 } 222 }
250 223 out:
251 out_unlock:
252 mutex_unlock(&probe_enable_lock);
253
254 return ret; 224 return ret;
255} 225}
256 226
257static int 227static struct event_file_link *
258trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file) 228find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
259{ 229{
260 struct ftrace_event_file **files; 230 struct event_file_link *link;
261 int i;
262 231
263 /* 232 list_for_each_entry(link, &tp->files, list)
264 * Since all tp->files updater is protected by probe_enable_lock, 233 if (link->file == file)
265 * we don't need to lock an rcu_read_lock. 234 return link;
266 */
267 files = rcu_dereference_raw(tp->files);
268 if (files) {
269 for (i = 0; files[i]; i++)
270 if (files[i] == file)
271 return i;
272 }
273 235
274 return -1; 236 return NULL;
275} 237}
276 238
277/* 239/*
@@ -281,43 +243,23 @@ trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file)
281static int 243static int
282disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) 244disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
283{ 245{
246 struct event_file_link *link = NULL;
247 int wait = 0;
284 int ret = 0; 248 int ret = 0;
285 249
286 mutex_lock(&probe_enable_lock);
287
288 if (file) { 250 if (file) {
289 struct ftrace_event_file **new, **old; 251 link = find_event_file_link(tp, file);
290 int n = trace_probe_nr_files(tp); 252 if (!link) {
291 int i, j;
292
293 old = rcu_dereference_raw(tp->files);
294 if (n == 0 || trace_probe_file_index(tp, file) < 0) {
295 ret = -EINVAL; 253 ret = -EINVAL;
296 goto out_unlock; 254 goto out;
297 } 255 }
298 256
299 if (n == 1) { /* Remove the last file */ 257 list_del_rcu(&link->list);
300 tp->flags &= ~TP_FLAG_TRACE; 258 wait = 1;
301 new = NULL; 259 if (!list_empty(&tp->files))
302 } else { 260 goto out;
303 new = kzalloc(n * sizeof(struct ftrace_event_file *),
304 GFP_KERNEL);
305 if (!new) {
306 ret = -ENOMEM;
307 goto out_unlock;
308 }
309
310 /* This copy & check loop copies the NULL stopper too */
311 for (i = 0, j = 0; j < n && i < n + 1; i++)
312 if (old[i] != file)
313 new[j++] = old[i];
314 }
315
316 rcu_assign_pointer(tp->files, new);
317 261
318 /* Make sure the probe is done with old files */ 262 tp->flags &= ~TP_FLAG_TRACE;
319 synchronize_sched();
320 kfree(old);
321 } else 263 } else
322 tp->flags &= ~TP_FLAG_PROFILE; 264 tp->flags &= ~TP_FLAG_PROFILE;
323 265
@@ -326,10 +268,21 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
326 disable_kretprobe(&tp->rp); 268 disable_kretprobe(&tp->rp);
327 else 269 else
328 disable_kprobe(&tp->rp.kp); 270 disable_kprobe(&tp->rp.kp);
271 wait = 1;
272 }
273 out:
274 if (wait) {
275 /*
276 * Synchronize with kprobe_trace_func/kretprobe_trace_func
277 * to ensure disabled (all running handlers are finished).
278 * This is not only for kfree(), but also the caller,
279 * trace_remove_event_call() supposes it for releasing
280 * event_call related objects, which will be accessed in
281 * the kprobe_trace_func/kretprobe_trace_func.
282 */
283 synchronize_sched();
284 kfree(link); /* Ignored if link == NULL */
329 } 285 }
330
331 out_unlock:
332 mutex_unlock(&probe_enable_lock);
333 286
334 return ret; 287 return ret;
335} 288}
@@ -398,9 +351,12 @@ static int unregister_trace_probe(struct trace_probe *tp)
398 if (trace_probe_is_enabled(tp)) 351 if (trace_probe_is_enabled(tp))
399 return -EBUSY; 352 return -EBUSY;
400 353
354 /* Will fail if probe is being used by ftrace or perf */
355 if (unregister_probe_event(tp))
356 return -EBUSY;
357
401 __unregister_trace_probe(tp); 358 __unregister_trace_probe(tp);
402 list_del(&tp->list); 359 list_del(&tp->list);
403 unregister_probe_event(tp);
404 360
405 return 0; 361 return 0;
406} 362}
@@ -679,7 +635,9 @@ static int release_all_trace_probes(void)
679 /* TODO: Use batch unregistration */ 635 /* TODO: Use batch unregistration */
680 while (!list_empty(&probe_list)) { 636 while (!list_empty(&probe_list)) {
681 tp = list_entry(probe_list.next, struct trace_probe, list); 637 tp = list_entry(probe_list.next, struct trace_probe, list);
682 unregister_trace_probe(tp); 638 ret = unregister_trace_probe(tp);
639 if (ret)
640 goto end;
683 free_trace_probe(tp); 641 free_trace_probe(tp);
684 } 642 }
685 643
@@ -885,20 +843,10 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
885static __kprobes void 843static __kprobes void
886kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) 844kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs)
887{ 845{
888 /* 846 struct event_file_link *link;
889 * Note: preempt is already disabled around the kprobe handler.
890 * However, we still need an smp_read_barrier_depends() corresponding
891 * to smp_wmb() in rcu_assign_pointer() to access the pointer.
892 */
893 struct ftrace_event_file **file = rcu_dereference_raw(tp->files);
894
895 if (unlikely(!file))
896 return;
897 847
898 while (*file) { 848 list_for_each_entry_rcu(link, &tp->files, list)
899 __kprobe_trace_func(tp, regs, *file); 849 __kprobe_trace_func(tp, regs, link->file);
900 file++;
901 }
902} 850}
903 851
904/* Kretprobe handler */ 852/* Kretprobe handler */
@@ -945,20 +893,10 @@ static __kprobes void
945kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, 893kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
946 struct pt_regs *regs) 894 struct pt_regs *regs)
947{ 895{
948 /* 896 struct event_file_link *link;
949 * Note: preempt is already disabled around the kprobe handler.
950 * However, we still need an smp_read_barrier_depends() corresponding
951 * to smp_wmb() in rcu_assign_pointer() to access the pointer.
952 */
953 struct ftrace_event_file **file = rcu_dereference_raw(tp->files);
954
955 if (unlikely(!file))
956 return;
957 897
958 while (*file) { 898 list_for_each_entry_rcu(link, &tp->files, list)
959 __kretprobe_trace_func(tp, ri, regs, *file); 899 __kretprobe_trace_func(tp, ri, regs, link->file);
960 file++;
961 }
962} 900}
963 901
964/* Event entry printers */ 902/* Event entry printers */
@@ -1157,13 +1095,14 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)
1157 int size, __size, dsize; 1095 int size, __size, dsize;
1158 int rctx; 1096 int rctx;
1159 1097
1098 head = this_cpu_ptr(call->perf_events);
1099 if (hlist_empty(head))
1100 return;
1101
1160 dsize = __get_data_size(tp, regs); 1102 dsize = __get_data_size(tp, regs);
1161 __size = sizeof(*entry) + tp->size + dsize; 1103 __size = sizeof(*entry) + tp->size + dsize;
1162 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1104 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1163 size -= sizeof(u32); 1105 size -= sizeof(u32);
1164 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1165 "profile buffer not large enough"))
1166 return;
1167 1106
1168 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1107 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1169 if (!entry) 1108 if (!entry)
@@ -1172,10 +1111,7 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)
1172 entry->ip = (unsigned long)tp->rp.kp.addr; 1111 entry->ip = (unsigned long)tp->rp.kp.addr;
1173 memset(&entry[1], 0, dsize); 1112 memset(&entry[1], 0, dsize);
1174 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1113 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1175 1114 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1176 head = this_cpu_ptr(call->perf_events);
1177 perf_trace_buf_submit(entry, size, rctx,
1178 entry->ip, 1, regs, head, NULL);
1179} 1115}
1180 1116
1181/* Kretprobe profile handler */ 1117/* Kretprobe profile handler */
@@ -1189,13 +1125,14 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri,
1189 int size, __size, dsize; 1125 int size, __size, dsize;
1190 int rctx; 1126 int rctx;
1191 1127
1128 head = this_cpu_ptr(call->perf_events);
1129 if (hlist_empty(head))
1130 return;
1131
1192 dsize = __get_data_size(tp, regs); 1132 dsize = __get_data_size(tp, regs);
1193 __size = sizeof(*entry) + tp->size + dsize; 1133 __size = sizeof(*entry) + tp->size + dsize;
1194 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1134 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1195 size -= sizeof(u32); 1135 size -= sizeof(u32);
1196 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1197 "profile buffer not large enough"))
1198 return;
1199 1136
1200 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1137 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1201 if (!entry) 1138 if (!entry)
@@ -1204,13 +1141,16 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri,
1204 entry->func = (unsigned long)tp->rp.kp.addr; 1141 entry->func = (unsigned long)tp->rp.kp.addr;
1205 entry->ret_ip = (unsigned long)ri->ret_addr; 1142 entry->ret_ip = (unsigned long)ri->ret_addr;
1206 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1143 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1207 1144 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1208 head = this_cpu_ptr(call->perf_events);
1209 perf_trace_buf_submit(entry, size, rctx,
1210 entry->ret_ip, 1, regs, head, NULL);
1211} 1145}
1212#endif /* CONFIG_PERF_EVENTS */ 1146#endif /* CONFIG_PERF_EVENTS */
1213 1147
1148/*
1149 * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex.
1150 *
1151 * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe
1152 * lockless, but we can't race with this __init function.
1153 */
1214static __kprobes 1154static __kprobes
1215int kprobe_register(struct ftrace_event_call *event, 1155int kprobe_register(struct ftrace_event_call *event,
1216 enum trace_reg type, void *data) 1156 enum trace_reg type, void *data)
@@ -1312,11 +1252,15 @@ static int register_probe_event(struct trace_probe *tp)
1312 return ret; 1252 return ret;
1313} 1253}
1314 1254
1315static void unregister_probe_event(struct trace_probe *tp) 1255static int unregister_probe_event(struct trace_probe *tp)
1316{ 1256{
1257 int ret;
1258
1317 /* tp->event is unregistered in trace_remove_event_call() */ 1259 /* tp->event is unregistered in trace_remove_event_call() */
1318 trace_remove_event_call(&tp->call); 1260 ret = trace_remove_event_call(&tp->call);
1319 kfree(tp->call.print_fmt); 1261 if (!ret)
1262 kfree(tp->call.print_fmt);
1263 return ret;
1320} 1264}
1321 1265
1322/* Make a debugfs interface for controlling probe points */ 1266/* Make a debugfs interface for controlling probe points */
@@ -1376,6 +1320,10 @@ find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr)
1376 return NULL; 1320 return NULL;
1377} 1321}
1378 1322
1323/*
1324 * Nobody but us can call enable_trace_probe/disable_trace_probe at this
1325 * stage, we can do this lockless.
1326 */
1379static __init int kprobe_trace_self_tests_init(void) 1327static __init int kprobe_trace_self_tests_init(void)
1380{ 1328{
1381 int ret, warn = 0; 1329 int ret, warn = 0;
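The trace_kprobe.c rework above drops the hand-rolled, reallocated array of ftrace_event_file pointers (and its private probe_enable_lock) in favour of a plain RCU-protected list of event_file_link nodes. A sketch of the underlying pattern, assuming updates are serialized by the callers (event_mutex in the patch) and readers run with preemption disabled as kprobe handlers do; all my_* names are illustrative:

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_file_link {
        void                    *file;
        struct list_head        list;
};

static LIST_HEAD(my_files);             /* updates serialized by the caller */

static int my_attach_file(void *file)
{
        struct my_file_link *link = kmalloc(sizeof(*link), GFP_KERNEL);

        if (!link)
                return -ENOMEM;
        link->file = file;
        list_add_tail_rcu(&link->list, &my_files);
        return 0;
}

static void my_detach_file(struct my_file_link *link)
{
        list_del_rcu(&link->list);
        synchronize_sched();    /* wait for preempt-disabled handlers to finish */
        kfree(link);
}

/* Called from a kprobe handler, i.e. with preemption already disabled. */
static void my_handler(void)
{
        struct my_file_link *link;

        list_for_each_entry_rcu(link, &my_files, list)
                pr_debug("dispatching to file %p\n", link->file);
}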
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index a5e8f4878bfa..b3dcfb2f0fef 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -90,7 +90,7 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
90 if (drv) 90 if (drv)
91 ret += trace_seq_printf(s, " %s\n", drv->name); 91 ret += trace_seq_printf(s, " %s\n", drv->name);
92 else 92 else
93 ret += trace_seq_printf(s, " \n"); 93 ret += trace_seq_puts(s, " \n");
94 return ret; 94 return ret;
95} 95}
96 96
@@ -107,7 +107,7 @@ static void mmio_pipe_open(struct trace_iterator *iter)
107 struct header_iter *hiter; 107 struct header_iter *hiter;
108 struct trace_seq *s = &iter->seq; 108 struct trace_seq *s = &iter->seq;
109 109
110 trace_seq_printf(s, "VERSION 20070824\n"); 110 trace_seq_puts(s, "VERSION 20070824\n");
111 111
112 hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); 112 hiter = kzalloc(sizeof(*hiter), GFP_KERNEL);
113 if (!hiter) 113 if (!hiter)
@@ -209,7 +209,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
209 (rw->value >> 0) & 0xff, rw->pc, 0); 209 (rw->value >> 0) & 0xff, rw->pc, 0);
210 break; 210 break;
211 default: 211 default:
212 ret = trace_seq_printf(s, "rw what?\n"); 212 ret = trace_seq_puts(s, "rw what?\n");
213 break; 213 break;
214 } 214 }
215 if (ret) 215 if (ret)
@@ -245,7 +245,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
245 secs, usec_rem, m->map_id, 0UL, 0); 245 secs, usec_rem, m->map_id, 0UL, 0);
246 break; 246 break;
247 default: 247 default:
248 ret = trace_seq_printf(s, "map what?\n"); 248 ret = trace_seq_puts(s, "map what?\n");
249 break; 249 break;
250 } 250 }
251 if (ret) 251 if (ret)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index bb922d9ee51b..34e7cbac0c9c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -78,7 +78,7 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
78 78
79 trace_assign_type(field, entry); 79 trace_assign_type(field, entry);
80 80
81 ret = trace_seq_printf(s, "%s", field->buf); 81 ret = trace_seq_puts(s, field->buf);
82 if (!ret) 82 if (!ret)
83 return TRACE_TYPE_PARTIAL_LINE; 83 return TRACE_TYPE_PARTIAL_LINE;
84 84
@@ -558,14 +558,14 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
558 if (ret) 558 if (ret)
559 ret = trace_seq_puts(s, "??"); 559 ret = trace_seq_puts(s, "??");
560 if (ret) 560 if (ret)
561 ret = trace_seq_puts(s, "\n"); 561 ret = trace_seq_putc(s, '\n');
562 continue; 562 continue;
563 } 563 }
564 if (!ret) 564 if (!ret)
565 break; 565 break;
566 if (ret) 566 if (ret)
567 ret = seq_print_user_ip(s, mm, ip, sym_flags); 567 ret = seq_print_user_ip(s, mm, ip, sym_flags);
568 ret = trace_seq_puts(s, "\n"); 568 ret = trace_seq_putc(s, '\n');
569 } 569 }
570 570
571 if (mm) 571 if (mm)
@@ -579,7 +579,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
579 int ret; 579 int ret;
580 580
581 if (!ip) 581 if (!ip)
582 return trace_seq_printf(s, "0"); 582 return trace_seq_putc(s, '0');
583 583
584 if (sym_flags & TRACE_ITER_SYM_OFFSET) 584 if (sym_flags & TRACE_ITER_SYM_OFFSET)
585 ret = seq_print_sym_offset(s, "%s", ip); 585 ret = seq_print_sym_offset(s, "%s", ip);
@@ -964,14 +964,14 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
964 goto partial; 964 goto partial;
965 965
966 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { 966 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
967 if (!trace_seq_printf(s, " <-")) 967 if (!trace_seq_puts(s, " <-"))
968 goto partial; 968 goto partial;
969 if (!seq_print_ip_sym(s, 969 if (!seq_print_ip_sym(s,
970 field->parent_ip, 970 field->parent_ip,
971 flags)) 971 flags))
972 goto partial; 972 goto partial;
973 } 973 }
974 if (!trace_seq_printf(s, "\n")) 974 if (!trace_seq_putc(s, '\n'))
975 goto partial; 975 goto partial;
976 976
977 return TRACE_TYPE_HANDLED; 977 return TRACE_TYPE_HANDLED;
@@ -1210,7 +1210,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1210 1210
1211 if (!seq_print_ip_sym(s, *p, flags)) 1211 if (!seq_print_ip_sym(s, *p, flags))
1212 goto partial; 1212 goto partial;
1213 if (!trace_seq_puts(s, "\n")) 1213 if (!trace_seq_putc(s, '\n'))
1214 goto partial; 1214 goto partial;
1215 } 1215 }
1216 1216
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 2901e3b88590..a7329b7902f8 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -640,13 +640,20 @@ out:
640 * Enable ftrace, sleep 1/10 second, and then read the trace 640 * Enable ftrace, sleep 1/10 second, and then read the trace
641 * buffer to see if all is in order. 641 * buffer to see if all is in order.
642 */ 642 */
643int 643__init int
644trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) 644trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
645{ 645{
646 int save_ftrace_enabled = ftrace_enabled; 646 int save_ftrace_enabled = ftrace_enabled;
647 unsigned long count; 647 unsigned long count;
648 int ret; 648 int ret;
649 649
650#ifdef CONFIG_DYNAMIC_FTRACE
651 if (ftrace_filter_param) {
652 printk(KERN_CONT " ... kernel command line filter set: force PASS ... ");
653 return 0;
654 }
655#endif
656
650 /* make sure msleep has been recorded */ 657 /* make sure msleep has been recorded */
651 msleep(1); 658 msleep(1);
652 659
@@ -727,13 +734,20 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
727 * Pretty much the same as for the function tracer from which the selftest 734 * Pretty much the same as for the function tracer from which the selftest
728 * has been borrowed. 735 * has been borrowed.
729 */ 736 */
730int 737__init int
731trace_selftest_startup_function_graph(struct tracer *trace, 738trace_selftest_startup_function_graph(struct tracer *trace,
732 struct trace_array *tr) 739 struct trace_array *tr)
733{ 740{
734 int ret; 741 int ret;
735 unsigned long count; 742 unsigned long count;
736 743
744#ifdef CONFIG_DYNAMIC_FTRACE
745 if (ftrace_filter_param) {
746 printk(KERN_CONT " ... kernel command line filter set: force PASS ... ");
747 return 0;
748 }
749#endif
750
737 /* 751 /*
738 * Simulate the init() callback but we attach a watchdog callback 752 * Simulate the init() callback but we attach a watchdog callback
739 * to detect and recover from possible hangs 753 * to detect and recover from possible hangs
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8f2ac73c7a5f..8fd03657bc7d 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -175,7 +175,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
175 entry = syscall_nr_to_meta(syscall); 175 entry = syscall_nr_to_meta(syscall);
176 176
177 if (!entry) { 177 if (!entry) {
178 trace_seq_printf(s, "\n"); 178 trace_seq_putc(s, '\n');
179 return TRACE_TYPE_HANDLED; 179 return TRACE_TYPE_HANDLED;
180 } 180 }
181 181
@@ -306,6 +306,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
306 struct syscall_metadata *sys_data; 306 struct syscall_metadata *sys_data;
307 struct ring_buffer_event *event; 307 struct ring_buffer_event *event;
308 struct ring_buffer *buffer; 308 struct ring_buffer *buffer;
309 unsigned long irq_flags;
310 int pc;
309 int syscall_nr; 311 int syscall_nr;
310 int size; 312 int size;
311 313
@@ -321,9 +323,12 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
321 323
322 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 324 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
323 325
326 local_save_flags(irq_flags);
327 pc = preempt_count();
328
324 buffer = tr->trace_buffer.buffer; 329 buffer = tr->trace_buffer.buffer;
325 event = trace_buffer_lock_reserve(buffer, 330 event = trace_buffer_lock_reserve(buffer,
326 sys_data->enter_event->event.type, size, 0, 0); 331 sys_data->enter_event->event.type, size, irq_flags, pc);
327 if (!event) 332 if (!event)
328 return; 333 return;
329 334
@@ -333,7 +338,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
333 338
334 if (!filter_current_check_discard(buffer, sys_data->enter_event, 339 if (!filter_current_check_discard(buffer, sys_data->enter_event,
335 entry, event)) 340 entry, event))
336 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 341 trace_current_buffer_unlock_commit(buffer, event,
342 irq_flags, pc);
337} 343}
338 344
339static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) 345static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
@@ -343,6 +349,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
343 struct syscall_metadata *sys_data; 349 struct syscall_metadata *sys_data;
344 struct ring_buffer_event *event; 350 struct ring_buffer_event *event;
345 struct ring_buffer *buffer; 351 struct ring_buffer *buffer;
352 unsigned long irq_flags;
353 int pc;
346 int syscall_nr; 354 int syscall_nr;
347 355
348 syscall_nr = trace_get_syscall_nr(current, regs); 356 syscall_nr = trace_get_syscall_nr(current, regs);
@@ -355,9 +363,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
355 if (!sys_data) 363 if (!sys_data)
356 return; 364 return;
357 365
366 local_save_flags(irq_flags);
367 pc = preempt_count();
368
358 buffer = tr->trace_buffer.buffer; 369 buffer = tr->trace_buffer.buffer;
359 event = trace_buffer_lock_reserve(buffer, 370 event = trace_buffer_lock_reserve(buffer,
360 sys_data->exit_event->event.type, sizeof(*entry), 0, 0); 371 sys_data->exit_event->event.type, sizeof(*entry),
372 irq_flags, pc);
361 if (!event) 373 if (!event)
362 return; 374 return;
363 375
@@ -367,7 +379,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
367 379
368 if (!filter_current_check_discard(buffer, sys_data->exit_event, 380 if (!filter_current_check_discard(buffer, sys_data->exit_event,
369 entry, event)) 381 entry, event))
370 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 382 trace_current_buffer_unlock_commit(buffer, event,
383 irq_flags, pc);
371} 384}
372 385
373static int reg_event_syscall_enter(struct ftrace_event_file *file, 386static int reg_event_syscall_enter(struct ftrace_event_file *file,
@@ -553,15 +566,15 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
553 if (!sys_data) 566 if (!sys_data)
554 return; 567 return;
555 568
569 head = this_cpu_ptr(sys_data->enter_event->perf_events);
570 if (hlist_empty(head))
571 return;
572
556 /* get the size after alignment with the u32 buffer size field */ 573 /* get the size after alignment with the u32 buffer size field */
557 size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); 574 size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
558 size = ALIGN(size + sizeof(u32), sizeof(u64)); 575 size = ALIGN(size + sizeof(u32), sizeof(u64));
559 size -= sizeof(u32); 576 size -= sizeof(u32);
560 577
561 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
562 "perf buffer not large enough"))
563 return;
564
565 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, 578 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
566 sys_data->enter_event->event.type, regs, &rctx); 579 sys_data->enter_event->event.type, regs, &rctx);
567 if (!rec) 580 if (!rec)
@@ -570,8 +583,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
570 rec->nr = syscall_nr; 583 rec->nr = syscall_nr;
571 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 584 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
572 (unsigned long *)&rec->args); 585 (unsigned long *)&rec->args);
573
574 head = this_cpu_ptr(sys_data->enter_event->perf_events);
575 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 586 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
576} 587}
577 588
@@ -629,18 +640,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
629 if (!sys_data) 640 if (!sys_data)
630 return; 641 return;
631 642
643 head = this_cpu_ptr(sys_data->exit_event->perf_events);
644 if (hlist_empty(head))
645 return;
646
632 /* We can probably do that at build time */ 647 /* We can probably do that at build time */
633 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); 648 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
634 size -= sizeof(u32); 649 size -= sizeof(u32);
635 650
636 /*
637 * Impossible, but be paranoid with the future
638 * How to put this check outside runtime?
639 */
640 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
641 "exit event has grown above perf buffer size"))
642 return;
643
644 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, 651 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
645 sys_data->exit_event->event.type, regs, &rctx); 652 sys_data->exit_event->event.type, regs, &rctx);
646 if (!rec) 653 if (!rec)
@@ -648,8 +655,6 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
648 655
649 rec->nr = syscall_nr; 656 rec->nr = syscall_nr;
650 rec->ret = syscall_get_return_value(current, regs); 657 rec->ret = syscall_get_return_value(current, regs);
651
652 head = this_cpu_ptr(sys_data->exit_event->perf_events);
653 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 658 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
654} 659}
655 660
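Both the kprobe and syscall perf handlers above gain the same early bail-out: check the per-CPU perf_events hlist before doing any sizing or buffer preparation, and drop the runtime PERF_MAX_TRACE_SIZE warning that the known-bounded records no longer need. A sketch of the check in isolation, assuming a caller that already runs with preemption disabled; my_perf_handler() is illustrative, and ftrace_event_call::perf_events only exists with CONFIG_PERF_EVENTS:

#include <linux/ftrace_event.h>
#include <linux/list.h>
#include <linux/percpu.h>

static void my_perf_handler(struct ftrace_event_call *call)
{
        struct hlist_head *head;

        head = this_cpu_ptr(call->perf_events);
        if (hlist_empty(head))
                return;         /* no perf event attached on this CPU */

        /* ...only now compute the record size, prepare and submit it... */
}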
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 32494fb0ee64..272261b5f94f 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -70,7 +70,7 @@ struct trace_uprobe {
70 (sizeof(struct probe_arg) * (n))) 70 (sizeof(struct probe_arg) * (n)))
71 71
72static int register_uprobe_event(struct trace_uprobe *tu); 72static int register_uprobe_event(struct trace_uprobe *tu);
73static void unregister_uprobe_event(struct trace_uprobe *tu); 73static int unregister_uprobe_event(struct trace_uprobe *tu);
74 74
75static DEFINE_MUTEX(uprobe_lock); 75static DEFINE_MUTEX(uprobe_lock);
76static LIST_HEAD(uprobe_list); 76static LIST_HEAD(uprobe_list);
@@ -164,11 +164,17 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou
164} 164}
165 165
166/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */ 166/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */
167static void unregister_trace_uprobe(struct trace_uprobe *tu) 167static int unregister_trace_uprobe(struct trace_uprobe *tu)
168{ 168{
169 int ret;
170
171 ret = unregister_uprobe_event(tu);
172 if (ret)
173 return ret;
174
169 list_del(&tu->list); 175 list_del(&tu->list);
170 unregister_uprobe_event(tu);
171 free_trace_uprobe(tu); 176 free_trace_uprobe(tu);
177 return 0;
172} 178}
173 179
174/* Register a trace_uprobe and probe_event */ 180/* Register a trace_uprobe and probe_event */
@@ -181,9 +187,12 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
181 187
182 /* register as an event */ 188 /* register as an event */
183 old_tp = find_probe_event(tu->call.name, tu->call.class->system); 189 old_tp = find_probe_event(tu->call.name, tu->call.class->system);
184 if (old_tp) 190 if (old_tp) {
185 /* delete old event */ 191 /* delete old event */
186 unregister_trace_uprobe(old_tp); 192 ret = unregister_trace_uprobe(old_tp);
193 if (ret)
194 goto end;
195 }
187 196
188 ret = register_uprobe_event(tu); 197 ret = register_uprobe_event(tu);
189 if (ret) { 198 if (ret) {
@@ -256,6 +265,8 @@ static int create_trace_uprobe(int argc, char **argv)
256 group = UPROBE_EVENT_SYSTEM; 265 group = UPROBE_EVENT_SYSTEM;
257 266
258 if (is_delete) { 267 if (is_delete) {
268 int ret;
269
259 if (!event) { 270 if (!event) {
260 pr_info("Delete command needs an event name.\n"); 271 pr_info("Delete command needs an event name.\n");
261 return -EINVAL; 272 return -EINVAL;
@@ -269,9 +280,9 @@ static int create_trace_uprobe(int argc, char **argv)
269 return -ENOENT; 280 return -ENOENT;
270 } 281 }
271 /* delete an event */ 282 /* delete an event */
272 unregister_trace_uprobe(tu); 283 ret = unregister_trace_uprobe(tu);
273 mutex_unlock(&uprobe_lock); 284 mutex_unlock(&uprobe_lock);
274 return 0; 285 return ret;
275 } 286 }
276 287
277 if (argc < 2) { 288 if (argc < 2) {
@@ -283,8 +294,10 @@ static int create_trace_uprobe(int argc, char **argv)
283 return -EINVAL; 294 return -EINVAL;
284 } 295 }
285 arg = strchr(argv[1], ':'); 296 arg = strchr(argv[1], ':');
286 if (!arg) 297 if (!arg) {
298 ret = -EINVAL;
287 goto fail_address_parse; 299 goto fail_address_parse;
300 }
288 301
289 *arg++ = '\0'; 302 *arg++ = '\0';
290 filename = argv[1]; 303 filename = argv[1];
@@ -406,16 +419,20 @@ fail_address_parse:
406 return ret; 419 return ret;
407} 420}
408 421
409static void cleanup_all_probes(void) 422static int cleanup_all_probes(void)
410{ 423{
411 struct trace_uprobe *tu; 424 struct trace_uprobe *tu;
425 int ret = 0;
412 426
413 mutex_lock(&uprobe_lock); 427 mutex_lock(&uprobe_lock);
414 while (!list_empty(&uprobe_list)) { 428 while (!list_empty(&uprobe_list)) {
415 tu = list_entry(uprobe_list.next, struct trace_uprobe, list); 429 tu = list_entry(uprobe_list.next, struct trace_uprobe, list);
416 unregister_trace_uprobe(tu); 430 ret = unregister_trace_uprobe(tu);
431 if (ret)
432 break;
417 } 433 }
418 mutex_unlock(&uprobe_lock); 434 mutex_unlock(&uprobe_lock);
435 return ret;
419} 436}
420 437
421/* Probes listing interfaces */ 438/* Probes listing interfaces */
@@ -460,8 +477,13 @@ static const struct seq_operations probes_seq_op = {
460 477
461static int probes_open(struct inode *inode, struct file *file) 478static int probes_open(struct inode *inode, struct file *file)
462{ 479{
463 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) 480 int ret;
464 cleanup_all_probes(); 481
482 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
483 ret = cleanup_all_probes();
484 if (ret)
485 return ret;
486 }
465 487
466 return seq_open(file, &probes_seq_op); 488 return seq_open(file, &probes_seq_op);
467} 489}
@@ -816,8 +838,6 @@ static void uprobe_perf_print(struct trace_uprobe *tu,
816 838
817 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); 839 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
818 size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); 840 size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32);
819 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
820 return;
821 841
822 preempt_disable(); 842 preempt_disable();
823 head = this_cpu_ptr(call->perf_events); 843 head = this_cpu_ptr(call->perf_events);
@@ -968,12 +988,17 @@ static int register_uprobe_event(struct trace_uprobe *tu)
968 return ret; 988 return ret;
969} 989}
970 990
971static void unregister_uprobe_event(struct trace_uprobe *tu) 991static int unregister_uprobe_event(struct trace_uprobe *tu)
972{ 992{
993 int ret;
994
973 /* tu->event is unregistered in trace_remove_event_call() */ 995 /* tu->event is unregistered in trace_remove_event_call() */
974 trace_remove_event_call(&tu->call); 996 ret = trace_remove_event_call(&tu->call);
997 if (ret)
998 return ret;
975 kfree(tu->call.print_fmt); 999 kfree(tu->call.print_fmt);
976 tu->call.print_fmt = NULL; 1000 tu->call.print_fmt = NULL;
1001 return 0;
977} 1002}
978 1003
979/* Make a trace interface for controlling probe points */ 1004/* Make a trace interface for controlling probe points */
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index d8c30db06c5b..9064b919a406 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -62,6 +62,9 @@ int create_user_ns(struct cred *new)
62 kgid_t group = new->egid; 62 kgid_t group = new->egid;
63 int ret; 63 int ret;
64 64
65 if (parent_ns->level > 32)
66 return -EUSERS;
67
65 /* 68 /*
66 * Verify that we can not violate the policy of which files 69 * Verify that we can not violate the policy of which files
67 * may be accessed that is specified by the root directory, 70 * may be accessed that is specified by the root directory,
@@ -92,6 +95,7 @@ int create_user_ns(struct cred *new)
92 atomic_set(&ns->count, 1); 95 atomic_set(&ns->count, 1);
93 /* Leave the new->user_ns reference with the new user namespace. */ 96 /* Leave the new->user_ns reference with the new user namespace. */
94 ns->parent = parent_ns; 97 ns->parent = parent_ns;
98 ns->level = parent_ns->level + 1;
95 ns->owner = owner; 99 ns->owner = owner;
96 ns->group = group; 100 ns->group = group;
97 101
@@ -105,16 +109,21 @@ int create_user_ns(struct cred *new)
105int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) 109int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
106{ 110{
107 struct cred *cred; 111 struct cred *cred;
112 int err = -ENOMEM;
108 113
109 if (!(unshare_flags & CLONE_NEWUSER)) 114 if (!(unshare_flags & CLONE_NEWUSER))
110 return 0; 115 return 0;
111 116
112 cred = prepare_creds(); 117 cred = prepare_creds();
113 if (!cred) 118 if (cred) {
114 return -ENOMEM; 119 err = create_user_ns(cred);
120 if (err)
121 put_cred(cred);
122 else
123 *new_cred = cred;
124 }
115 125
116 *new_cred = cred; 126 return err;
117 return create_user_ns(cred);
118} 127}
119 128
120void free_user_ns(struct user_namespace *ns) 129void free_user_ns(struct user_namespace *ns)
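The user_namespace.c hunks above cap namespace nesting at 32 levels (returning -EUSERS beyond that) and rework unshare_userns() so the freshly prepared cred is dropped when create_user_ns() fails rather than being handed back half-initialized. A condensed sketch of that error path, assuming the same prepare_creds()/put_cred() reference rules; my_unshare() is illustrative:

#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/user_namespace.h>

static int my_unshare(struct cred **new_cred)
{
        struct cred *cred = prepare_creds();
        int err = -ENOMEM;

        if (cred) {
                err = create_user_ns(cred);     /* -EUSERS past the depth limit */
                if (err)
                        put_cred(cred);         /* don't leak the new cred */
                else
                        *new_cred = cred;
        }
        return err;
}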
diff --git a/kernel/wait.c b/kernel/wait.c
index 6698e0c04ead..dec68bd4e9d8 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -287,3 +287,92 @@ wait_queue_head_t *bit_waitqueue(void *word, int bit)
287 return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; 287 return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
288} 288}
289EXPORT_SYMBOL(bit_waitqueue); 289EXPORT_SYMBOL(bit_waitqueue);
290
291/*
292 * Manipulate the atomic_t address to produce a better bit waitqueue table hash
293 * index (we're keying off bit -1, but that would produce a horrible hash
294 * value).
295 */
296static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
297{
298 if (BITS_PER_LONG == 64) {
299 unsigned long q = (unsigned long)p;
300 return bit_waitqueue((void *)(q & ~1), q & 1);
301 }
302 return bit_waitqueue(p, 0);
303}
304
305static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync,
306 void *arg)
307{
308 struct wait_bit_key *key = arg;
309 struct wait_bit_queue *wait_bit
310 = container_of(wait, struct wait_bit_queue, wait);
311 atomic_t *val = key->flags;
312
313 if (wait_bit->key.flags != key->flags ||
314 wait_bit->key.bit_nr != key->bit_nr ||
315 atomic_read(val) != 0)
316 return 0;
317 return autoremove_wake_function(wait, mode, sync, key);
318}
319
320/*
 321 * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
 322 * the action routine passed to __wait_on_atomic_t() may return a code; a
 323 * nonzero return code halts the wait and is passed back to the caller.
324 */
325static __sched
326int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q,
327 int (*action)(atomic_t *), unsigned mode)
328{
329 atomic_t *val;
330 int ret = 0;
331
332 do {
333 prepare_to_wait(wq, &q->wait, mode);
334 val = q->key.flags;
335 if (atomic_read(val) == 0)
336 break;
337 ret = (*action)(val);
338 } while (!ret && atomic_read(val) != 0);
339 finish_wait(wq, &q->wait);
340 return ret;
341}
342
343#define DEFINE_WAIT_ATOMIC_T(name, p) \
344 struct wait_bit_queue name = { \
345 .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \
346 .wait = { \
347 .private = current, \
348 .func = wake_atomic_t_function, \
349 .task_list = \
350 LIST_HEAD_INIT((name).wait.task_list), \
351 }, \
352 }
353
354__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
355 unsigned mode)
356{
357 wait_queue_head_t *wq = atomic_t_waitqueue(p);
358 DEFINE_WAIT_ATOMIC_T(wait, p);
359
360 return __wait_on_atomic_t(wq, &wait, action, mode);
361}
362EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
363
364/**
 365 * wake_up_atomic_t - Wake up a waiter on an atomic_t
366 * @word: The word being waited on, a kernel virtual address
367 * @bit: The bit of the word being waited on
368 *
369 * Wake up anyone waiting for the atomic_t to go to zero.
370 *
371 * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
 372 * check is done by the waiter's wake function, not by the waker itself).
373 */
374void wake_up_atomic_t(atomic_t *p)
375{
376 __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
377}
378EXPORT_SYMBOL(wake_up_atomic_t);
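kernel/wait.c grows out_of_line_wait_on_atomic_t() and wake_up_atomic_t(), giving atomic_t counters the same hashed-waitqueue treatment as bit waits. A usage sketch for waiting on a reference count to reach zero, assuming the wait_on_atomic_t() wrapper in <linux/wait.h> that pairs with these out-of-line helpers; struct my_obj and my_action() are illustrative:

#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/wait.h>

struct my_obj {
        atomic_t        users;
};

/* Runs each time the waiter is about to sleep; a non-zero return aborts
 * the wait and is handed back to the caller. */
static int my_action(atomic_t *val)
{
        schedule();
        return 0;
}

static void my_wait_until_idle(struct my_obj *obj)
{
        wait_on_atomic_t(&obj->users, my_action, TASK_UNINTERRUPTIBLE);
}

static void my_put(struct my_obj *obj)
{
        if (atomic_dec_and_test(&obj->users))
                wake_up_atomic_t(&obj->users);
}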
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 05039e348f07..1241d8c91d5e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -29,9 +29,9 @@
29#include <linux/kvm_para.h> 29#include <linux/kvm_para.h>
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31 31
32int watchdog_enabled = 1; 32int watchdog_user_enabled = 1;
33int __read_mostly watchdog_thresh = 10; 33int __read_mostly watchdog_thresh = 10;
34static int __read_mostly watchdog_disabled; 34static int __read_mostly watchdog_running;
35static u64 __read_mostly sample_period; 35static u64 __read_mostly sample_period;
36 36
37static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 37static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -63,7 +63,7 @@ static int __init hardlockup_panic_setup(char *str)
63 else if (!strncmp(str, "nopanic", 7)) 63 else if (!strncmp(str, "nopanic", 7))
64 hardlockup_panic = 0; 64 hardlockup_panic = 0;
65 else if (!strncmp(str, "0", 1)) 65 else if (!strncmp(str, "0", 1))
66 watchdog_enabled = 0; 66 watchdog_user_enabled = 0;
67 return 1; 67 return 1;
68} 68}
69__setup("nmi_watchdog=", hardlockup_panic_setup); 69__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -82,7 +82,7 @@ __setup("softlockup_panic=", softlockup_panic_setup);
82 82
83static int __init nowatchdog_setup(char *str) 83static int __init nowatchdog_setup(char *str)
84{ 84{
85 watchdog_enabled = 0; 85 watchdog_user_enabled = 0;
86 return 1; 86 return 1;
87} 87}
88__setup("nowatchdog", nowatchdog_setup); 88__setup("nowatchdog", nowatchdog_setup);
@@ -90,7 +90,7 @@ __setup("nowatchdog", nowatchdog_setup);
90/* deprecated */ 90/* deprecated */
91static int __init nosoftlockup_setup(char *str) 91static int __init nosoftlockup_setup(char *str)
92{ 92{
93 watchdog_enabled = 0; 93 watchdog_user_enabled = 0;
94 return 1; 94 return 1;
95} 95}
96__setup("nosoftlockup", nosoftlockup_setup); 96__setup("nosoftlockup", nosoftlockup_setup);
@@ -158,7 +158,7 @@ void touch_all_softlockup_watchdogs(void)
158#ifdef CONFIG_HARDLOCKUP_DETECTOR 158#ifdef CONFIG_HARDLOCKUP_DETECTOR
159void touch_nmi_watchdog(void) 159void touch_nmi_watchdog(void)
160{ 160{
161 if (watchdog_enabled) { 161 if (watchdog_user_enabled) {
162 unsigned cpu; 162 unsigned cpu;
163 163
164 for_each_present_cpu(cpu) { 164 for_each_present_cpu(cpu) {
@@ -347,11 +347,6 @@ static void watchdog_enable(unsigned int cpu)
347 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 347 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
348 hrtimer->function = watchdog_timer_fn; 348 hrtimer->function = watchdog_timer_fn;
349 349
350 if (!watchdog_enabled) {
351 kthread_park(current);
352 return;
353 }
354
355 /* Enable the perf event */ 350 /* Enable the perf event */
356 watchdog_nmi_enable(cpu); 351 watchdog_nmi_enable(cpu);
357 352
@@ -374,6 +369,11 @@ static void watchdog_disable(unsigned int cpu)
374 watchdog_nmi_disable(cpu); 369 watchdog_nmi_disable(cpu);
375} 370}
376 371
372static void watchdog_cleanup(unsigned int cpu, bool online)
373{
374 watchdog_disable(cpu);
375}
376
377static int watchdog_should_run(unsigned int cpu) 377static int watchdog_should_run(unsigned int cpu)
378{ 378{
379 return __this_cpu_read(hrtimer_interrupts) != 379 return __this_cpu_read(hrtimer_interrupts) !=
@@ -475,28 +475,40 @@ static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
475static void watchdog_nmi_disable(unsigned int cpu) { return; } 475static void watchdog_nmi_disable(unsigned int cpu) { return; }
476#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 476#endif /* CONFIG_HARDLOCKUP_DETECTOR */
477 477
478/* prepare/enable/disable routines */ 478static struct smp_hotplug_thread watchdog_threads = {
479/* sysctl functions */ 479 .store = &softlockup_watchdog,
480#ifdef CONFIG_SYSCTL 480 .thread_should_run = watchdog_should_run,
481static void watchdog_enable_all_cpus(void) 481 .thread_fn = watchdog,
482 .thread_comm = "watchdog/%u",
483 .setup = watchdog_enable,
484 .cleanup = watchdog_cleanup,
485 .park = watchdog_disable,
486 .unpark = watchdog_enable,
487};
488
489static int watchdog_enable_all_cpus(void)
482{ 490{
483 unsigned int cpu; 491 int err = 0;
484 492
485 if (watchdog_disabled) { 493 if (!watchdog_running) {
486 watchdog_disabled = 0; 494 err = smpboot_register_percpu_thread(&watchdog_threads);
487 for_each_online_cpu(cpu) 495 if (err)
488 kthread_unpark(per_cpu(softlockup_watchdog, cpu)); 496 pr_err("Failed to create watchdog threads, disabled\n");
497 else
498 watchdog_running = 1;
489 } 499 }
500
501 return err;
490} 502}
491 503
504/* prepare/enable/disable routines */
505/* sysctl functions */
506#ifdef CONFIG_SYSCTL
492static void watchdog_disable_all_cpus(void) 507static void watchdog_disable_all_cpus(void)
493{ 508{
494 unsigned int cpu; 509 if (watchdog_running) {
495 510 watchdog_running = 0;
496 if (!watchdog_disabled) { 511 smpboot_unregister_percpu_thread(&watchdog_threads);
497 watchdog_disabled = 1;
498 for_each_online_cpu(cpu)
499 kthread_park(per_cpu(softlockup_watchdog, cpu));
500 } 512 }
501} 513}
502 514
@@ -507,45 +519,48 @@ static void watchdog_disable_all_cpus(void)
507int proc_dowatchdog(struct ctl_table *table, int write, 519int proc_dowatchdog(struct ctl_table *table, int write,
508 void __user *buffer, size_t *lenp, loff_t *ppos) 520 void __user *buffer, size_t *lenp, loff_t *ppos)
509{ 521{
510 int ret; 522 int err, old_thresh, old_enabled;
511 523
512 if (watchdog_disabled < 0) 524 old_thresh = ACCESS_ONCE(watchdog_thresh);
513 return -ENODEV; 525 old_enabled = ACCESS_ONCE(watchdog_user_enabled);
514 526
515 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 527 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
516 if (ret || !write) 528 if (err || !write)
517 return ret; 529 return err;
518 530
519 set_sample_period(); 531 set_sample_period();
520 /* 532 /*
521 * Watchdog threads shouldn't be enabled if they are 533 * Watchdog threads shouldn't be enabled if they are
522 * disabled. The 'watchdog_disabled' variable check in 534 * disabled. The 'watchdog_running' variable check in
523 * watchdog_*_all_cpus() function takes care of this. 535 * watchdog_*_all_cpus() function takes care of this.
524 */ 536 */
525 if (watchdog_enabled && watchdog_thresh) 537 if (watchdog_user_enabled && watchdog_thresh)
526 watchdog_enable_all_cpus(); 538 err = watchdog_enable_all_cpus();
527 else 539 else
528 watchdog_disable_all_cpus(); 540 watchdog_disable_all_cpus();
529 541
530 return ret; 542 /* Restore old values on failure */
543 if (err) {
544 watchdog_thresh = old_thresh;
545 watchdog_user_enabled = old_enabled;
546 }
547
548 return err;
531} 549}
532#endif /* CONFIG_SYSCTL */ 550#endif /* CONFIG_SYSCTL */
533 551
534static struct smp_hotplug_thread watchdog_threads = {
535 .store = &softlockup_watchdog,
536 .thread_should_run = watchdog_should_run,
537 .thread_fn = watchdog,
538 .thread_comm = "watchdog/%u",
539 .setup = watchdog_enable,
540 .park = watchdog_disable,
541 .unpark = watchdog_enable,
542};
543
544void __init lockup_detector_init(void) 552void __init lockup_detector_init(void)
545{ 553{
546 set_sample_period(); 554 set_sample_period();
547 if (smpboot_register_percpu_thread(&watchdog_threads)) { 555
548 pr_err("Failed to create watchdog threads, disabled\n"); 556#ifdef CONFIG_NO_HZ_FULL
549 watchdog_disabled = -ENODEV; 557 if (watchdog_user_enabled) {
558 watchdog_user_enabled = 0;
559 pr_warning("Disabled lockup detectors by default for full dynticks\n");
560 pr_warning("You can reactivate it with 'sysctl -w kernel.watchdog=1'\n");
550 } 561 }
562#endif
563
564 if (watchdog_user_enabled)
565 watchdog_enable_all_cpus();
551} 566}
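The watchdog rework above stops pre-creating and parking per-CPU threads: the smp_hotplug_thread descriptor is now registered and unregistered on demand, watchdog_running tracks that state, and the sysctl handler rolls the knobs back if registration fails. A generic sketch of the smpboot per-CPU thread pattern it now relies on (all my_* names are illustrative, not the watchdog's own):

#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, my_task);

static int my_should_run(unsigned int cpu)
{
        return 0;               /* nothing to do in this sketch */
}

static void my_fn(unsigned int cpu)
{
}

static void my_enable(unsigned int cpu)
{
        /* per-CPU setup; also runs on unpark after CPU hotplug */
}

static void my_disable(unsigned int cpu)
{
        /* per-CPU teardown; runs on park and, via cleanup, on hot-unplug */
}

static void my_cleanup(unsigned int cpu, bool online)
{
        my_disable(cpu);
}

static struct smp_hotplug_thread my_threads = {
        .store                  = &my_task,
        .thread_should_run      = my_should_run,
        .thread_fn              = my_fn,
        .thread_comm            = "my_thread/%u",
        .setup                  = my_enable,
        .cleanup                = my_cleanup,
        .park                   = my_disable,
        .unpark                 = my_enable,
};

static int __init my_init(void)
{
        return smpboot_register_percpu_thread(&my_threads);
}
early_initcall(my_init);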
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ee8e29a2320c..7f5d4be22034 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -272,6 +272,15 @@ static cpumask_var_t *wq_numa_possible_cpumask;
272static bool wq_disable_numa; 272static bool wq_disable_numa;
273module_param_named(disable_numa, wq_disable_numa, bool, 0444); 273module_param_named(disable_numa, wq_disable_numa, bool, 0444);
274 274
275/* see the comment above the definition of WQ_POWER_EFFICIENT */
276#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT
277static bool wq_power_efficient = true;
278#else
279static bool wq_power_efficient;
280#endif
281
282module_param_named(power_efficient, wq_power_efficient, bool, 0444);
283
275static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ 284static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
276 285
277/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ 286/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
@@ -305,6 +314,10 @@ struct workqueue_struct *system_unbound_wq __read_mostly;
305EXPORT_SYMBOL_GPL(system_unbound_wq); 314EXPORT_SYMBOL_GPL(system_unbound_wq);
306struct workqueue_struct *system_freezable_wq __read_mostly; 315struct workqueue_struct *system_freezable_wq __read_mostly;
307EXPORT_SYMBOL_GPL(system_freezable_wq); 316EXPORT_SYMBOL_GPL(system_freezable_wq);
317struct workqueue_struct *system_power_efficient_wq __read_mostly;
318EXPORT_SYMBOL_GPL(system_power_efficient_wq);
319struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
320EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
308 321
309static int worker_thread(void *__worker); 322static int worker_thread(void *__worker);
310static void copy_workqueue_attrs(struct workqueue_attrs *to, 323static void copy_workqueue_attrs(struct workqueue_attrs *to,
@@ -2804,6 +2817,19 @@ already_gone:
2804 return false; 2817 return false;
2805} 2818}
2806 2819
2820static bool __flush_work(struct work_struct *work)
2821{
2822 struct wq_barrier barr;
2823
2824 if (start_flush_work(work, &barr)) {
2825 wait_for_completion(&barr.done);
2826 destroy_work_on_stack(&barr.work);
2827 return true;
2828 } else {
2829 return false;
2830 }
2831}
2832
2807/** 2833/**
2808 * flush_work - wait for a work to finish executing the last queueing instance 2834 * flush_work - wait for a work to finish executing the last queueing instance
2809 * @work: the work to flush 2835 * @work: the work to flush
@@ -2817,18 +2843,10 @@ already_gone:
2817 */ 2843 */
2818bool flush_work(struct work_struct *work) 2844bool flush_work(struct work_struct *work)
2819{ 2845{
2820 struct wq_barrier barr;
2821
2822 lock_map_acquire(&work->lockdep_map); 2846 lock_map_acquire(&work->lockdep_map);
2823 lock_map_release(&work->lockdep_map); 2847 lock_map_release(&work->lockdep_map);
2824 2848
2825 if (start_flush_work(work, &barr)) { 2849 return __flush_work(work);
2826 wait_for_completion(&barr.done);
2827 destroy_work_on_stack(&barr.work);
2828 return true;
2829 } else {
2830 return false;
2831 }
2832} 2850}
2833EXPORT_SYMBOL_GPL(flush_work); 2851EXPORT_SYMBOL_GPL(flush_work);
2834 2852
@@ -3398,6 +3416,12 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
3398{ 3416{
3399 to->nice = from->nice; 3417 to->nice = from->nice;
3400 cpumask_copy(to->cpumask, from->cpumask); 3418 cpumask_copy(to->cpumask, from->cpumask);
3419 /*
3420 * Unlike hash and equality test, this function doesn't ignore
3421 * ->no_numa as it is used for both pool and wq attrs. Instead,
3422 * get_unbound_pool() explicitly clears ->no_numa after copying.
3423 */
3424 to->no_numa = from->no_numa;
3401} 3425}
3402 3426
3403/* hash value of the content of @attr */ 3427/* hash value of the content of @attr */
@@ -3565,6 +3589,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3565 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ 3589 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
3566 copy_workqueue_attrs(pool->attrs, attrs); 3590 copy_workqueue_attrs(pool->attrs, attrs);
3567 3591
3592 /*
3593 * no_numa isn't a worker_pool attribute, always clear it. See
3594 * 'struct workqueue_attrs' comments for detail.
3595 */
3596 pool->attrs->no_numa = false;
3597
3568 /* if cpumask is contained inside a NUMA node, we belong to that node */ 3598 /* if cpumask is contained inside a NUMA node, we belong to that node */
3569 if (wq_numa_enabled) { 3599 if (wq_numa_enabled) {
3570 for_each_node(node) { 3600 for_each_node(node) {
@@ -4086,6 +4116,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
4086 struct workqueue_struct *wq; 4116 struct workqueue_struct *wq;
4087 struct pool_workqueue *pwq; 4117 struct pool_workqueue *pwq;
4088 4118
4119 /* see the comment above the definition of WQ_POWER_EFFICIENT */
4120 if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
4121 flags |= WQ_UNBOUND;
4122
4089 /* allocate wq and format name */ 4123 /* allocate wq and format name */
4090 if (flags & WQ_UNBOUND) 4124 if (flags & WQ_UNBOUND)
4091 tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); 4125 tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
@@ -4627,7 +4661,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
4627 * Workqueues should be brought up before normal priority CPU notifiers. 4661 * Workqueues should be brought up before normal priority CPU notifiers.
4628 * This will be registered high priority CPU notifier. 4662 * This will be registered high priority CPU notifier.
4629 */ 4663 */
4630static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, 4664static int workqueue_cpu_up_callback(struct notifier_block *nfb,
4631 unsigned long action, 4665 unsigned long action,
4632 void *hcpu) 4666 void *hcpu)
4633{ 4667{
@@ -4680,7 +4714,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
4680 * Workqueues should be brought down after normal priority CPU notifiers. 4714 * Workqueues should be brought down after normal priority CPU notifiers.
4681 * This will be registered as low priority CPU notifier. 4715 * This will be registered as low priority CPU notifier.
4682 */ 4716 */
4683static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, 4717static int workqueue_cpu_down_callback(struct notifier_block *nfb,
4684 unsigned long action, 4718 unsigned long action,
4685 void *hcpu) 4719 void *hcpu)
4686{ 4720{
@@ -4739,7 +4773,14 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
4739 4773
4740 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); 4774 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
4741 schedule_work_on(cpu, &wfc.work); 4775 schedule_work_on(cpu, &wfc.work);
4742 flush_work(&wfc.work); 4776
4777 /*
4778 * The work item is on-stack and can't lead to deadlock through
4779 * flushing. Use __flush_work() to avoid spurious lockdep warnings
4780 * when work_on_cpu()s are nested.
4781 */
4782 __flush_work(&wfc.work);
4783
4743 return wfc.ret; 4784 return wfc.ret;
4744} 4785}
4745EXPORT_SYMBOL_GPL(work_on_cpu); 4786EXPORT_SYMBOL_GPL(work_on_cpu);
@@ -4985,8 +5026,15 @@ static int __init init_workqueues(void)
4985 WQ_UNBOUND_MAX_ACTIVE); 5026 WQ_UNBOUND_MAX_ACTIVE);
4986 system_freezable_wq = alloc_workqueue("events_freezable", 5027 system_freezable_wq = alloc_workqueue("events_freezable",
4987 WQ_FREEZABLE, 0); 5028 WQ_FREEZABLE, 0);
5029 system_power_efficient_wq = alloc_workqueue("events_power_efficient",
5030 WQ_POWER_EFFICIENT, 0);
5031 system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
5032 WQ_FREEZABLE | WQ_POWER_EFFICIENT,
5033 0);
4988 BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || 5034 BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
4989 !system_unbound_wq || !system_freezable_wq); 5035 !system_unbound_wq || !system_freezable_wq ||
5036 !system_power_efficient_wq ||
5037 !system_freezable_power_efficient_wq);
4990 return 0; 5038 return 0;
4991} 5039}
4992early_initcall(init_workqueues); 5040early_initcall(init_workqueues);
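The workqueue.c changes above add the power_efficient module parameter, the system_power_efficient_wq and system_freezable_power_efficient_wq pools, and an internal __flush_work() so work_on_cpu() can flush its on-stack item without spurious lockdep reports when calls nest. A usage sketch for the new power-efficient queues, assuming the matching WQ_POWER_EFFICIENT flag and workqueue declarations in <linux/workqueue.h>; all my_* names are illustrative:

#include <linux/errno.h>
#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work)
{
        /* background housekeeping with no strict CPU affinity */
}
static DECLARE_WORK(my_work, my_work_fn);

static struct workqueue_struct *my_wq;

static int my_setup(void)
{
        /* Becomes WQ_UNBOUND when workqueue.power_efficient is set, letting
         * the scheduler pick an already-awake CPU instead of the local one. */
        my_wq = alloc_workqueue("my_wq", WQ_POWER_EFFICIENT, 0);
        if (!my_wq)
                return -ENOMEM;

        /* Or simply reuse the shared power-efficient system workqueue. */
        queue_work(system_power_efficient_wq, &my_work);
        return 0;
}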
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index ad83c96b2ece..7e2204db0b1a 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -64,7 +64,7 @@ static inline struct worker *current_wq_worker(void)
64 64
65/* 65/*
66 * Scheduler hooks for concurrency managed workqueue. Only to be used from 66 * Scheduler hooks for concurrency managed workqueue. Only to be used from
67 * sched.c and workqueue.c. 67 * sched/core.c and workqueue.c.
68 */ 68 */
69void wq_worker_waking_up(struct task_struct *task, int cpu); 69void wq_worker_waking_up(struct task_struct *task, int cpu);
70struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); 70struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);