Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 2
-rw-r--r--  kernel/Kconfig.preempt | 3
-rw-r--r--  kernel/Makefile | 15
-rw-r--r--  kernel/async.c | 12
-rw-r--r--  kernel/audit.c | 31
-rw-r--r--  kernel/audit_tree.c | 8
-rw-r--r--  kernel/auditsc.c | 29
-rw-r--r--  kernel/capability.c | 16
-rw-r--r--  kernel/cgroup.c | 632
-rw-r--r--  kernel/cgroup_freezer.c | 26
-rw-r--r--  kernel/compat.c | 62
-rw-r--r--  kernel/configs.c | 4
-rw-r--r--  kernel/cpuset.c | 119
-rw-r--r--  kernel/cred.c | 26
-rw-r--r--  kernel/debug/debug_core.c | 2
-rw-r--r--  kernel/debug/gdbstub.c | 22
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 5
-rw-r--r--  kernel/debug/kdb/kdb_cmds | 4
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 21
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 36
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 3
-rw-r--r--  kernel/delayacct.c | 2
-rw-r--r--  kernel/events/Makefile | 6
-rw-r--r--  kernel/events/core.c (renamed from kernel/perf_event.c) | 1020
-rw-r--r--  kernel/events/hw_breakpoint.c (renamed from kernel/hw_breakpoint.c) | 10
-rw-r--r--  kernel/events/internal.h | 96
-rw-r--r--  kernel/events/ring_buffer.c | 380
-rw-r--r--  kernel/exit.c | 229
-rw-r--r--  kernel/extable.c | 18
-rw-r--r--  kernel/fork.c | 220
-rw-r--r--  kernel/freezer.c | 4
-rw-r--r--  kernel/futex.c | 60
-rw-r--r--  kernel/gcov/Kconfig | 3
-rw-r--r--  kernel/hrtimer.c | 164
-rw-r--r--  kernel/hung_task.c | 2
-rw-r--r--  kernel/irq/Kconfig | 8
-rw-r--r--  kernel/irq/Makefile | 2
-rw-r--r--  kernel/irq/chip.c | 3
-rw-r--r--  kernel/irq/debug.h | 1
-rw-r--r--  kernel/irq/devres.c | 2
-rw-r--r--  kernel/irq/generic-chip.c | 368
-rw-r--r--  kernel/irq/handle.c | 6
-rw-r--r--  kernel/irq/irqdesc.c | 36
-rw-r--r--  kernel/irq/irqdomain.c | 180
-rw-r--r--  kernel/irq/manage.c | 30
-rw-r--r--  kernel/irq/proc.c | 55
-rw-r--r--  kernel/irq/settings.h | 17
-rw-r--r--  kernel/irq/spurious.c | 31
-rw-r--r--  kernel/jump_label.c | 551
-rw-r--r--  kernel/kexec.c | 11
-rw-r--r--  kernel/kmod.c | 126
-rw-r--r--  kernel/kprobes.c | 33
-rw-r--r--  kernel/ksysfs.c | 10
-rw-r--r--  kernel/kthread.c | 4
-rw-r--r--  kernel/lockdep.c | 276
-rw-r--r--  kernel/module.c | 189
-rw-r--r--  kernel/mutex-debug.c | 2
-rw-r--r--  kernel/mutex-debug.h | 2
-rw-r--r--  kernel/mutex.c | 34
-rw-r--r--  kernel/mutex.h | 2
-rw-r--r--  kernel/notifier.c | 31
-rw-r--r--  kernel/ns_cgroup.c | 118
-rw-r--r--  kernel/nsproxy.c | 50
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/params.c | 41
-rw-r--r--  kernel/pid.c | 1
-rw-r--r--  kernel/pm_qos_params.c | 78
-rw-r--r--  kernel/posix-cpu-timers.c | 4
-rw-r--r--  kernel/posix-timers.c | 27
-rw-r--r--  kernel/power/Kconfig | 18
-rw-r--r--  kernel/power/hibernate.c | 278
-rw-r--r--  kernel/power/main.c | 6
-rw-r--r--  kernel/power/power.h | 4
-rw-r--r--  kernel/power/snapshot.c | 39
-rw-r--r--  kernel/power/suspend.c | 32
-rw-r--r--  kernel/power/user.c | 9
-rw-r--r--  kernel/printk.c | 117
-rw-r--r--  kernel/profile.c | 22
-rw-r--r--  kernel/ptrace.c | 291
-rw-r--r--  kernel/rcupdate.c | 34
-rw-r--r--  kernel/rcutiny.c | 46
-rw-r--r--  kernel/rcutiny_plugin.h | 203
-rw-r--r--  kernel/rcutorture.c | 30
-rw-r--r--  kernel/rcutree.c | 353
-rw-r--r--  kernel/rcutree.h | 128
-rw-r--r--  kernel/rcutree_plugin.h | 1022
-rw-r--r--  kernel/rcutree_trace.c | 210
-rw-r--r--  kernel/resource.c | 137
-rw-r--r--  kernel/rtmutex.c | 2
-rw-r--r--  kernel/rwsem.c | 18
-rw-r--r--  kernel/sched.c | 2117
-rw-r--r--  kernel/sched_autogroup.h | 1
-rw-r--r--  kernel/sched_debug.c | 6
-rw-r--r--  kernel/sched_fair.c | 295
-rw-r--r--  kernel/sched_features.h | 12
-rw-r--r--  kernel/sched_idletask.c | 2
-rw-r--r--  kernel/sched_rt.c | 115
-rw-r--r--  kernel/sched_stats.h | 4
-rw-r--r--  kernel/sched_stoptask.c | 5
-rw-r--r--  kernel/signal.c | 955
-rw-r--r--  kernel/smp.c | 5
-rw-r--r--  kernel/softirq.c | 14
-rw-r--r--  kernel/stacktrace.c | 12
-rw-r--r--  kernel/stop_machine.c | 80
-rw-r--r--  kernel/sys.c | 52
-rw-r--r--  kernel/sys_ni.c | 9
-rw-r--r--  kernel/sysctl.c | 37
-rw-r--r--  kernel/taskstats.c | 17
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/alarmtimer.c | 720
-rw-r--r--  kernel/time/clockevents.c | 69
-rw-r--r--  kernel/time/clocksource.c | 66
-rw-r--r--  kernel/time/tick-broadcast.c | 28
-rw-r--r--  kernel/time/timekeeping.c | 101
-rw-r--r--  kernel/timer.c | 15
-rw-r--r--  kernel/trace/Kconfig | 2
-rw-r--r--  kernel/trace/ftrace.c | 1386
-rw-r--r--  kernel/trace/ring_buffer.c | 76
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 2
-rw-r--r--  kernel/trace/trace.c | 341
-rw-r--r--  kernel/trace/trace.h | 50
-rw-r--r--  kernel/trace/trace_entries.h | 3
-rw-r--r--  kernel/trace/trace_events.c | 146
-rw-r--r--  kernel/trace/trace_events_filter.c | 6
-rw-r--r--  kernel/trace/trace_functions.c | 5
-rw-r--r--  kernel/trace/trace_functions_graph.c | 225
-rw-r--r--  kernel/trace/trace_irqsoff.c | 5
-rw-r--r--  kernel/trace/trace_kprobe.c | 333
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 2
-rw-r--r--  kernel/trace/trace_output.c | 41
-rw-r--r--  kernel/trace/trace_printk.c | 117
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 5
-rw-r--r--  kernel/trace/trace_selftest.c | 214
-rw-r--r--  kernel/trace/trace_selftest_dynamic.c | 6
-rw-r--r--  kernel/trace/trace_stack.c | 14
-rw-r--r--  kernel/tracepoint.c | 23
-rw-r--r--  kernel/utsname.c | 39
-rw-r--r--  kernel/watchdog.c | 69
-rw-r--r--  kernel/workqueue.c | 85
140 files changed, 11146 insertions, 5414 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 88c92fb44618..5068e2a4e75f 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -199,4 +199,4 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
199 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE 199 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
200 200
201config MUTEX_SPIN_ON_OWNER 201config MUTEX_SPIN_ON_OWNER
202 def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES 202 def_bool SMP && !DEBUG_MUTEXES
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf987b95b356..24e7cb0ba26a 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY
35 35
36config PREEMPT 36config PREEMPT
37 bool "Preemptible Kernel (Low-Latency Desktop)" 37 bool "Preemptible Kernel (Low-Latency Desktop)"
38 select PREEMPT_COUNT
38 help 39 help
39 This option reduces the latency of the kernel by making 40 This option reduces the latency of the kernel by making
40 all kernel code (that is not executing in a critical section) 41 all kernel code (that is not executing in a critical section)
@@ -52,3 +53,5 @@ config PREEMPT
52 53
53endchoice 54endchoice
54 55
56config PREEMPT_COUNT
57 bool \ No newline at end of file
diff --git a/kernel/Makefile b/kernel/Makefile
index 85cbfb31e73e..eca595e2fd52 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o range.o jump_label.o 13 async.o range.o
14obj-y += groups.o 14obj-y += groups.o
15 15
16ifdef CONFIG_FUNCTION_TRACER 16ifdef CONFIG_FUNCTION_TRACER
@@ -21,7 +21,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 21CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 22CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg 23CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_perf_event.o = -pg
25CFLAGS_REMOVE_irq_work.o = -pg 24CFLAGS_REMOVE_irq_work.o = -pg
26endif 25endif
27 26
@@ -62,7 +61,6 @@ obj-$(CONFIG_COMPAT) += compat.o
62obj-$(CONFIG_CGROUPS) += cgroup.o 61obj-$(CONFIG_CGROUPS) += cgroup.o
63obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o 62obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
64obj-$(CONFIG_CPUSETS) += cpuset.o 63obj-$(CONFIG_CPUSETS) += cpuset.o
65obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
66obj-$(CONFIG_UTS_NS) += utsname.o 64obj-$(CONFIG_UTS_NS) += utsname.o
67obj-$(CONFIG_USER_NS) += user_namespace.o 65obj-$(CONFIG_USER_NS) += user_namespace.o
68obj-$(CONFIG_PID_NS) += pid_namespace.o 66obj-$(CONFIG_PID_NS) += pid_namespace.o
@@ -103,11 +101,13 @@ obj-$(CONFIG_RING_BUFFER) += trace/
103obj-$(CONFIG_TRACEPOINTS) += trace/ 101obj-$(CONFIG_TRACEPOINTS) += trace/
104obj-$(CONFIG_SMP) += sched_cpupri.o 102obj-$(CONFIG_SMP) += sched_cpupri.o
105obj-$(CONFIG_IRQ_WORK) += irq_work.o 103obj-$(CONFIG_IRQ_WORK) += irq_work.o
106obj-$(CONFIG_PERF_EVENTS) += perf_event.o 104
107obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 105obj-$(CONFIG_PERF_EVENTS) += events/
106
108obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 107obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
109obj-$(CONFIG_PADATA) += padata.o 108obj-$(CONFIG_PADATA) += padata.o
110obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
110obj-$(CONFIG_JUMP_LABEL) += jump_label.o
111 111
112ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 112ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
113# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 113# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
@@ -126,11 +126,10 @@ targets += config_data.gz
126$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE 126$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
127 $(call if_changed,gzip) 127 $(call if_changed,gzip)
128 128
129quiet_cmd_ikconfiggz = IKCFG $@ 129 filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;")
130 cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
131targets += config_data.h 130targets += config_data.h
132$(obj)/config_data.h: $(obj)/config_data.gz FORCE 131$(obj)/config_data.h: $(obj)/config_data.gz FORCE
133 $(call if_changed,ikconfiggz) 132 $(call filechk,ikconfiggz)
134 133
135$(obj)/time.o: $(obj)/timeconst.h 134$(obj)/time.o: $(obj)/timeconst.h
136 135
diff --git a/kernel/async.c b/kernel/async.c
index cd9dbb913c77..d5fe7af0de2e 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,12 +49,13 @@ asynchronous and synchronous parts of the kernel.
49*/ 49*/
50 50
51#include <linux/async.h> 51#include <linux/async.h>
52#include <linux/atomic.h>
53#include <linux/ktime.h>
52#include <linux/module.h> 54#include <linux/module.h>
53#include <linux/wait.h> 55#include <linux/wait.h>
54#include <linux/sched.h> 56#include <linux/sched.h>
55#include <linux/slab.h> 57#include <linux/slab.h>
56#include <linux/workqueue.h> 58#include <linux/workqueue.h>
57#include <asm/atomic.h>
58 59
59static async_cookie_t next_cookie = 1; 60static async_cookie_t next_cookie = 1;
60 61
@@ -128,7 +129,8 @@ static void async_run_entry_fn(struct work_struct *work)
128 129
129 /* 2) run (and print duration) */ 130 /* 2) run (and print duration) */
130 if (initcall_debug && system_state == SYSTEM_BOOTING) { 131 if (initcall_debug && system_state == SYSTEM_BOOTING) {
131 printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, 132 printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
133 (long long)entry->cookie,
132 entry->func, task_pid_nr(current)); 134 entry->func, task_pid_nr(current));
133 calltime = ktime_get(); 135 calltime = ktime_get();
134 } 136 }
@@ -136,7 +138,7 @@ static void async_run_entry_fn(struct work_struct *work)
136 if (initcall_debug && system_state == SYSTEM_BOOTING) { 138 if (initcall_debug && system_state == SYSTEM_BOOTING) {
137 rettime = ktime_get(); 139 rettime = ktime_get();
138 delta = ktime_sub(rettime, calltime); 140 delta = ktime_sub(rettime, calltime);
139 printk("initcall %lli_%pF returned 0 after %lld usecs\n", 141 printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n",
140 (long long)entry->cookie, 142 (long long)entry->cookie,
141 entry->func, 143 entry->func,
142 (long long)ktime_to_ns(delta) >> 10); 144 (long long)ktime_to_ns(delta) >> 10);
@@ -270,7 +272,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie,
270 ktime_t starttime, delta, endtime; 272 ktime_t starttime, delta, endtime;
271 273
272 if (initcall_debug && system_state == SYSTEM_BOOTING) { 274 if (initcall_debug && system_state == SYSTEM_BOOTING) {
273 printk("async_waiting @ %i\n", task_pid_nr(current)); 275 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
274 starttime = ktime_get(); 276 starttime = ktime_get();
275 } 277 }
276 278
@@ -280,7 +282,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie,
280 endtime = ktime_get(); 282 endtime = ktime_get();
281 delta = ktime_sub(endtime, starttime); 283 delta = ktime_sub(endtime, starttime);
282 284
283 printk("async_continuing @ %i after %lli usec\n", 285 printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n",
284 task_pid_nr(current), 286 task_pid_nr(current),
285 (long long)ktime_to_ns(delta) >> 10); 287 (long long)ktime_to_ns(delta) >> 10);
286 } 288 }
diff --git a/kernel/audit.c b/kernel/audit.c
index 939500317066..0a1355ca3d79 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -43,7 +43,7 @@
43 43
44#include <linux/init.h> 44#include <linux/init.h>
45#include <asm/types.h> 45#include <asm/types.h>
46#include <asm/atomic.h> 46#include <linux/atomic.h>
47#include <linux/mm.h> 47#include <linux/mm.h>
48#include <linux/module.h> 48#include <linux/module.h>
49#include <linux/slab.h> 49#include <linux/slab.h>
@@ -55,6 +55,9 @@
55#include <net/sock.h> 55#include <net/sock.h>
56#include <net/netlink.h> 56#include <net/netlink.h>
57#include <linux/skbuff.h> 57#include <linux/skbuff.h>
58#ifdef CONFIG_SECURITY
59#include <linux/security.h>
60#endif
58#include <linux/netlink.h> 61#include <linux/netlink.h>
59#include <linux/freezer.h> 62#include <linux/freezer.h>
60#include <linux/tty.h> 63#include <linux/tty.h>
@@ -1502,6 +1505,32 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
1502 } 1505 }
1503} 1506}
1504 1507
1508#ifdef CONFIG_SECURITY
1509/**
1510 * audit_log_secctx - Converts and logs SELinux context
1511 * @ab: audit_buffer
1512 * @secid: security number
1513 *
1514 * This is a helper function that calls security_secid_to_secctx to convert
1515 * secid to secctx and then adds the (converted) SELinux context to the audit
1516 * log by calling audit_log_format, thus also preventing leak of internal secid
1517 * to userspace. If secid cannot be converted audit_panic is called.
1518 */
1519void audit_log_secctx(struct audit_buffer *ab, u32 secid)
1520{
1521 u32 len;
1522 char *secctx;
1523
1524 if (security_secid_to_secctx(secid, &secctx, &len)) {
1525 audit_panic("Cannot convert secid to context");
1526 } else {
1527 audit_log_format(ab, " obj=%s", secctx);
1528 security_release_secctx(secctx, len);
1529 }
1530}
1531EXPORT_SYMBOL(audit_log_secctx);
1532#endif
1533
1505EXPORT_SYMBOL(audit_log_start); 1534EXPORT_SYMBOL(audit_log_start);
1506EXPORT_SYMBOL(audit_log_end); 1535EXPORT_SYMBOL(audit_log_end);
1507EXPORT_SYMBOL(audit_log_format); 1536EXPORT_SYMBOL(audit_log_format);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e99dda04b126..5bf0790497e7 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -93,16 +93,10 @@ static inline void get_tree(struct audit_tree *tree)
93 atomic_inc(&tree->count); 93 atomic_inc(&tree->count);
94} 94}
95 95
96static void __put_tree(struct rcu_head *rcu)
97{
98 struct audit_tree *tree = container_of(rcu, struct audit_tree, head);
99 kfree(tree);
100}
101
102static inline void put_tree(struct audit_tree *tree) 96static inline void put_tree(struct audit_tree *tree)
103{ 97{
104 if (atomic_dec_and_test(&tree->count)) 98 if (atomic_dec_and_test(&tree->count))
105 call_rcu(&tree->head, __put_tree); 99 kfree_rcu(tree, head);
106} 100}
107 101
108/* to avoid bringing the entire thing in audit.h */ 102/* to avoid bringing the entire thing in audit.h */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b33513a08beb..ce4b054acee5 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -44,7 +44,7 @@
44 44
45#include <linux/init.h> 45#include <linux/init.h>
46#include <asm/types.h> 46#include <asm/types.h>
47#include <asm/atomic.h> 47#include <linux/atomic.h>
48#include <linux/fs.h> 48#include <linux/fs.h>
49#include <linux/namei.h> 49#include <linux/namei.h>
50#include <linux/mm.h> 50#include <linux/mm.h>
@@ -443,17 +443,25 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
443 443
444/* Determine if any context name data matches a rule's watch data */ 444/* Determine if any context name data matches a rule's watch data */
445/* Compare a task_struct with an audit_rule. Return 1 on match, 0 445/* Compare a task_struct with an audit_rule. Return 1 on match, 0
446 * otherwise. */ 446 * otherwise.
447 *
448 * If task_creation is true, this is an explicit indication that we are
449 * filtering a task rule at task creation time. This and tsk == current are
450 * the only situations where tsk->cred may be accessed without an rcu read lock.
451 */
447static int audit_filter_rules(struct task_struct *tsk, 452static int audit_filter_rules(struct task_struct *tsk,
448 struct audit_krule *rule, 453 struct audit_krule *rule,
449 struct audit_context *ctx, 454 struct audit_context *ctx,
450 struct audit_names *name, 455 struct audit_names *name,
451 enum audit_state *state) 456 enum audit_state *state,
457 bool task_creation)
452{ 458{
453 const struct cred *cred = get_task_cred(tsk); 459 const struct cred *cred;
454 int i, j, need_sid = 1; 460 int i, j, need_sid = 1;
455 u32 sid; 461 u32 sid;
456 462
463 cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
464
457 for (i = 0; i < rule->field_count; i++) { 465 for (i = 0; i < rule->field_count; i++) {
458 struct audit_field *f = &rule->fields[i]; 466 struct audit_field *f = &rule->fields[i];
459 int result = 0; 467 int result = 0;
@@ -637,10 +645,8 @@ static int audit_filter_rules(struct task_struct *tsk,
637 break; 645 break;
638 } 646 }
639 647
640 if (!result) { 648 if (!result)
641 put_cred(cred);
642 return 0; 649 return 0;
643 }
644 } 650 }
645 651
646 if (ctx) { 652 if (ctx) {
@@ -656,7 +662,6 @@ static int audit_filter_rules(struct task_struct *tsk,
656 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 662 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
657 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 663 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
658 } 664 }
659 put_cred(cred);
660 return 1; 665 return 1;
661} 666}
662 667
@@ -671,7 +676,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
671 676
672 rcu_read_lock(); 677 rcu_read_lock();
673 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { 678 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
674 if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { 679 if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
680 &state, true)) {
675 if (state == AUDIT_RECORD_CONTEXT) 681 if (state == AUDIT_RECORD_CONTEXT)
676 *key = kstrdup(e->rule.filterkey, GFP_ATOMIC); 682 *key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
677 rcu_read_unlock(); 683 rcu_read_unlock();
@@ -705,7 +711,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
705 list_for_each_entry_rcu(e, list, list) { 711 list_for_each_entry_rcu(e, list, list) {
706 if ((e->rule.mask[word] & bit) == bit && 712 if ((e->rule.mask[word] & bit) == bit &&
707 audit_filter_rules(tsk, &e->rule, ctx, NULL, 713 audit_filter_rules(tsk, &e->rule, ctx, NULL,
708 &state)) { 714 &state, false)) {
709 rcu_read_unlock(); 715 rcu_read_unlock();
710 ctx->current_state = state; 716 ctx->current_state = state;
711 return state; 717 return state;
@@ -743,7 +749,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
743 749
744 list_for_each_entry_rcu(e, list, list) { 750 list_for_each_entry_rcu(e, list, list) {
745 if ((e->rule.mask[word] & bit) == bit && 751 if ((e->rule.mask[word] & bit) == bit &&
746 audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { 752 audit_filter_rules(tsk, &e->rule, ctx, n,
753 &state, false)) {
747 rcu_read_unlock(); 754 rcu_read_unlock();
748 ctx->current_state = state; 755 ctx->current_state = state;
749 return; 756 return;
diff --git a/kernel/capability.c b/kernel/capability.c
index bf0c734d0c12..283c529f8b1c 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -22,12 +22,8 @@
22 */ 22 */
23 23
24const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; 24const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
25const kernel_cap_t __cap_full_set = CAP_FULL_SET;
26const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET;
27 25
28EXPORT_SYMBOL(__cap_empty_set); 26EXPORT_SYMBOL(__cap_empty_set);
29EXPORT_SYMBOL(__cap_full_set);
30EXPORT_SYMBOL(__cap_init_eff_set);
31 27
32int file_caps_enabled = 1; 28int file_caps_enabled = 1;
33 29
@@ -399,3 +395,15 @@ bool task_ns_capable(struct task_struct *t, int cap)
399 return ns_capable(task_cred_xxx(t, user)->user_ns, cap); 395 return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
400} 396}
401EXPORT_SYMBOL(task_ns_capable); 397EXPORT_SYMBOL(task_ns_capable);
398
399/**
400 * nsown_capable - Check superior capability to one's own user_ns
401 * @cap: The capability in question
402 *
403 * Return true if the current task has the given superior capability
404 * targeted at its own user namespace.
405 */
406bool nsown_capable(int cap)
407{
408 return ns_capable(current_user_ns(), cap);
409}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 25c7eb52de1a..1d2b6ceea95d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -27,9 +27,11 @@
27 */ 27 */
28 28
29#include <linux/cgroup.h> 29#include <linux/cgroup.h>
30#include <linux/cred.h>
30#include <linux/ctype.h> 31#include <linux/ctype.h>
31#include <linux/errno.h> 32#include <linux/errno.h>
32#include <linux/fs.h> 33#include <linux/fs.h>
34#include <linux/init_task.h>
33#include <linux/kernel.h> 35#include <linux/kernel.h>
34#include <linux/list.h> 36#include <linux/list.h>
35#include <linux/mm.h> 37#include <linux/mm.h>
@@ -57,8 +59,9 @@
57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 59#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
58#include <linux/eventfd.h> 60#include <linux/eventfd.h>
59#include <linux/poll.h> 61#include <linux/poll.h>
62#include <linux/flex_array.h> /* used in cgroup_attach_proc */
60 63
61#include <asm/atomic.h> 64#include <linux/atomic.h>
62 65
63static DEFINE_MUTEX(cgroup_mutex); 66static DEFINE_MUTEX(cgroup_mutex);
64 67
@@ -326,12 +329,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
326 return &css_set_table[index]; 329 return &css_set_table[index];
327} 330}
328 331
329static void free_css_set_rcu(struct rcu_head *obj)
330{
331 struct css_set *cg = container_of(obj, struct css_set, rcu_head);
332 kfree(cg);
333}
334
335/* We don't maintain the lists running through each css_set to its 332/* We don't maintain the lists running through each css_set to its
336 * task until after the first call to cgroup_iter_start(). This 333 * task until after the first call to cgroup_iter_start(). This
337 * reduces the fork()/exit() overhead for people who have cgroups 334 * reduces the fork()/exit() overhead for people who have cgroups
@@ -375,7 +372,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
375 } 372 }
376 373
377 write_unlock(&css_set_lock); 374 write_unlock(&css_set_lock);
378 call_rcu(&cg->rcu_head, free_css_set_rcu); 375 kfree_rcu(cg, rcu_head);
379} 376}
380 377
381/* 378/*
@@ -812,13 +809,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
812 return ret; 809 return ret;
813} 810}
814 811
815static void free_cgroup_rcu(struct rcu_head *obj)
816{
817 struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
818
819 kfree(cgrp);
820}
821
822static void cgroup_diput(struct dentry *dentry, struct inode *inode) 812static void cgroup_diput(struct dentry *dentry, struct inode *inode)
823{ 813{
824 /* is dentry a directory ? if so, kfree() associated cgroup */ 814 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -856,7 +846,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
856 */ 846 */
857 BUG_ON(!list_empty(&cgrp->pidlists)); 847 BUG_ON(!list_empty(&cgrp->pidlists));
858 848
859 call_rcu(&cgrp->rcu_head, free_cgroup_rcu); 849 kfree_rcu(cgrp, rcu_head);
860 } 850 }
861 iput(inode); 851 iput(inode);
862} 852}
@@ -1526,6 +1516,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1526 struct cgroup *root_cgrp = &root->top_cgroup; 1516 struct cgroup *root_cgrp = &root->top_cgroup;
1527 struct inode *inode; 1517 struct inode *inode;
1528 struct cgroupfs_root *existing_root; 1518 struct cgroupfs_root *existing_root;
1519 const struct cred *cred;
1529 int i; 1520 int i;
1530 1521
1531 BUG_ON(sb->s_root != NULL); 1522 BUG_ON(sb->s_root != NULL);
@@ -1605,7 +1596,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1605 BUG_ON(!list_empty(&root_cgrp->children)); 1596 BUG_ON(!list_empty(&root_cgrp->children));
1606 BUG_ON(root->number_of_cgroups != 1); 1597 BUG_ON(root->number_of_cgroups != 1);
1607 1598
1599 cred = override_creds(&init_cred);
1608 cgroup_populate_dir(root_cgrp); 1600 cgroup_populate_dir(root_cgrp);
1601 revert_creds(cred);
1609 mutex_unlock(&cgroup_mutex); 1602 mutex_unlock(&cgroup_mutex);
1610 mutex_unlock(&inode->i_mutex); 1603 mutex_unlock(&inode->i_mutex);
1611 } else { 1604 } else {
@@ -1709,7 +1702,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1709{ 1702{
1710 char *start; 1703 char *start;
1711 struct dentry *dentry = rcu_dereference_check(cgrp->dentry, 1704 struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1712 rcu_read_lock_held() ||
1713 cgroup_lock_is_held()); 1705 cgroup_lock_is_held());
1714 1706
1715 if (!dentry || cgrp == dummytop) { 1707 if (!dentry || cgrp == dummytop) {
@@ -1735,7 +1727,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1735 break; 1727 break;
1736 1728
1737 dentry = rcu_dereference_check(cgrp->dentry, 1729 dentry = rcu_dereference_check(cgrp->dentry,
1738 rcu_read_lock_held() ||
1739 cgroup_lock_is_held()); 1730 cgroup_lock_is_held());
1740 if (!cgrp->parent) 1731 if (!cgrp->parent)
1741 continue; 1732 continue;
@@ -1748,6 +1739,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1748} 1739}
1749EXPORT_SYMBOL_GPL(cgroup_path); 1740EXPORT_SYMBOL_GPL(cgroup_path);
1750 1741
1742/*
1743 * cgroup_task_migrate - move a task from one cgroup to another.
1744 *
1745 * 'guarantee' is set if the caller promises that a new css_set for the task
1746 * will already exist. If not set, this function might sleep, and can fail with
1747 * -ENOMEM. Otherwise, it can only fail with -ESRCH.
1748 */
1749static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1750 struct task_struct *tsk, bool guarantee)
1751{
1752 struct css_set *oldcg;
1753 struct css_set *newcg;
1754
1755 /*
1756 * get old css_set. we need to take task_lock and refcount it, because
1757 * an exiting task can change its css_set to init_css_set and drop its
1758 * old one without taking cgroup_mutex.
1759 */
1760 task_lock(tsk);
1761 oldcg = tsk->cgroups;
1762 get_css_set(oldcg);
1763 task_unlock(tsk);
1764
1765 /* locate or allocate a new css_set for this task. */
1766 if (guarantee) {
1767 /* we know the css_set we want already exists. */
1768 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1769 read_lock(&css_set_lock);
1770 newcg = find_existing_css_set(oldcg, cgrp, template);
1771 BUG_ON(!newcg);
1772 get_css_set(newcg);
1773 read_unlock(&css_set_lock);
1774 } else {
1775 might_sleep();
1776 /* find_css_set will give us newcg already referenced. */
1777 newcg = find_css_set(oldcg, cgrp);
1778 if (!newcg) {
1779 put_css_set(oldcg);
1780 return -ENOMEM;
1781 }
1782 }
1783 put_css_set(oldcg);
1784
1785 /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
1786 task_lock(tsk);
1787 if (tsk->flags & PF_EXITING) {
1788 task_unlock(tsk);
1789 put_css_set(newcg);
1790 return -ESRCH;
1791 }
1792 rcu_assign_pointer(tsk->cgroups, newcg);
1793 task_unlock(tsk);
1794
1795 /* Update the css_set linked lists if we're using them */
1796 write_lock(&css_set_lock);
1797 if (!list_empty(&tsk->cg_list))
1798 list_move(&tsk->cg_list, &newcg->tasks);
1799 write_unlock(&css_set_lock);
1800
1801 /*
1802 * We just gained a reference on oldcg by taking it from the task. As
1803 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
1804 * it here; it will be freed under RCU.
1805 */
1806 put_css_set(oldcg);
1807
1808 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1809 return 0;
1810}
1811
1751/** 1812/**
1752 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1813 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
1753 * @cgrp: the cgroup the task is attaching to 1814 * @cgrp: the cgroup the task is attaching to
@@ -1758,11 +1819,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
1758 */ 1819 */
1759int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1820int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1760{ 1821{
1761 int retval = 0; 1822 int retval;
1762 struct cgroup_subsys *ss, *failed_ss = NULL; 1823 struct cgroup_subsys *ss, *failed_ss = NULL;
1763 struct cgroup *oldcgrp; 1824 struct cgroup *oldcgrp;
1764 struct css_set *cg;
1765 struct css_set *newcg;
1766 struct cgroupfs_root *root = cgrp->root; 1825 struct cgroupfs_root *root = cgrp->root;
1767 1826
1768 /* Nothing to do if the task is already in that cgroup */ 1827 /* Nothing to do if the task is already in that cgroup */
@@ -1772,7 +1831,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1772 1831
1773 for_each_subsys(root, ss) { 1832 for_each_subsys(root, ss) {
1774 if (ss->can_attach) { 1833 if (ss->can_attach) {
1775 retval = ss->can_attach(ss, cgrp, tsk, false); 1834 retval = ss->can_attach(ss, cgrp, tsk);
1776 if (retval) { 1835 if (retval) {
1777 /* 1836 /*
1778 * Remember on which subsystem the can_attach() 1837 * Remember on which subsystem the can_attach()
@@ -1784,46 +1843,29 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1784 goto out; 1843 goto out;
1785 } 1844 }
1786 } 1845 }
1846 if (ss->can_attach_task) {
1847 retval = ss->can_attach_task(cgrp, tsk);
1848 if (retval) {
1849 failed_ss = ss;
1850 goto out;
1851 }
1852 }
1787 } 1853 }
1788 1854
1789 task_lock(tsk); 1855 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
1790 cg = tsk->cgroups; 1856 if (retval)
1791 get_css_set(cg);
1792 task_unlock(tsk);
1793 /*
1794 * Locate or allocate a new css_set for this task,
1795 * based on its final set of cgroups
1796 */
1797 newcg = find_css_set(cg, cgrp);
1798 put_css_set(cg);
1799 if (!newcg) {
1800 retval = -ENOMEM;
1801 goto out; 1857 goto out;
1802 }
1803
1804 task_lock(tsk);
1805 if (tsk->flags & PF_EXITING) {
1806 task_unlock(tsk);
1807 put_css_set(newcg);
1808 retval = -ESRCH;
1809 goto out;
1810 }
1811 rcu_assign_pointer(tsk->cgroups, newcg);
1812 task_unlock(tsk);
1813
1814 /* Update the css_set linked lists if we're using them */
1815 write_lock(&css_set_lock);
1816 if (!list_empty(&tsk->cg_list))
1817 list_move(&tsk->cg_list, &newcg->tasks);
1818 write_unlock(&css_set_lock);
1819 1858
1820 for_each_subsys(root, ss) { 1859 for_each_subsys(root, ss) {
1860 if (ss->pre_attach)
1861 ss->pre_attach(cgrp);
1862 if (ss->attach_task)
1863 ss->attach_task(cgrp, tsk);
1821 if (ss->attach) 1864 if (ss->attach)
1822 ss->attach(ss, cgrp, oldcgrp, tsk, false); 1865 ss->attach(ss, cgrp, oldcgrp, tsk);
1823 } 1866 }
1824 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1867
1825 synchronize_rcu(); 1868 synchronize_rcu();
1826 put_css_set(cg);
1827 1869
1828 /* 1870 /*
1829 * wake up rmdir() waiter. the rmdir should fail since the cgroup 1871 * wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1842,7 +1884,7 @@ out:
1842 */ 1884 */
1843 break; 1885 break;
1844 if (ss->cancel_attach) 1886 if (ss->cancel_attach)
1845 ss->cancel_attach(ss, cgrp, tsk, false); 1887 ss->cancel_attach(ss, cgrp, tsk);
1846 } 1888 }
1847 } 1889 }
1848 return retval; 1890 return retval;
@@ -1873,49 +1915,370 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
1873EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 1915EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
1874 1916
1875/* 1917/*
1876 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex 1918 * cgroup_attach_proc works in two stages, the first of which prefetches all
1877 * held. May take task_lock of task 1919 * new css_sets needed (to make sure we have enough memory before committing
1920 * to the move) and stores them in a list of entries of the following type.
1921 * TODO: possible optimization: use css_set->rcu_head for chaining instead
1922 */
1923struct cg_list_entry {
1924 struct css_set *cg;
1925 struct list_head links;
1926};
1927
1928static bool css_set_check_fetched(struct cgroup *cgrp,
1929 struct task_struct *tsk, struct css_set *cg,
1930 struct list_head *newcg_list)
1931{
1932 struct css_set *newcg;
1933 struct cg_list_entry *cg_entry;
1934 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1935
1936 read_lock(&css_set_lock);
1937 newcg = find_existing_css_set(cg, cgrp, template);
1938 if (newcg)
1939 get_css_set(newcg);
1940 read_unlock(&css_set_lock);
1941
1942 /* doesn't exist at all? */
1943 if (!newcg)
1944 return false;
1945 /* see if it's already in the list */
1946 list_for_each_entry(cg_entry, newcg_list, links) {
1947 if (cg_entry->cg == newcg) {
1948 put_css_set(newcg);
1949 return true;
1950 }
1951 }
1952
1953 /* not found */
1954 put_css_set(newcg);
1955 return false;
1956}
1957
1958/*
1959 * Find the new css_set and store it in the list in preparation for moving the
1960 * given task to the given cgroup. Returns 0 or -ENOMEM.
1961 */
1962static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
1963 struct list_head *newcg_list)
1964{
1965 struct css_set *newcg;
1966 struct cg_list_entry *cg_entry;
1967
1968 /* ensure a new css_set will exist for this thread */
1969 newcg = find_css_set(cg, cgrp);
1970 if (!newcg)
1971 return -ENOMEM;
1972 /* add it to the list */
1973 cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
1974 if (!cg_entry) {
1975 put_css_set(newcg);
1976 return -ENOMEM;
1977 }
1978 cg_entry->cg = newcg;
1979 list_add(&cg_entry->links, newcg_list);
1980 return 0;
1981}
1982
1983/**
1984 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
1985 * @cgrp: the cgroup to attach to
1986 * @leader: the threadgroup leader task_struct of the group to be attached
1987 *
1988 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
1989 * take task_lock of each thread in leader's threadgroup individually in turn.
1990 */
1991int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
1992{
1993 int retval, i, group_size;
1994 struct cgroup_subsys *ss, *failed_ss = NULL;
1995 bool cancel_failed_ss = false;
1996 /* guaranteed to be initialized later, but the compiler needs this */
1997 struct cgroup *oldcgrp = NULL;
1998 struct css_set *oldcg;
1999 struct cgroupfs_root *root = cgrp->root;
2000 /* threadgroup list cursor and array */
2001 struct task_struct *tsk;
2002 struct flex_array *group;
2003 /*
2004 * we need to make sure we have css_sets for all the tasks we're
2005 * going to move -before- we actually start moving them, so that in
2006 * case we get an ENOMEM we can bail out before making any changes.
2007 */
2008 struct list_head newcg_list;
2009 struct cg_list_entry *cg_entry, *temp_nobe;
2010
2011 /*
2012 * step 0: in order to do expensive, possibly blocking operations for
2013 * every thread, we cannot iterate the thread group list, since it needs
2014 * rcu or tasklist locked. instead, build an array of all threads in the
2015 * group - threadgroup_fork_lock prevents new threads from appearing,
2016 * and if threads exit, this will just be an over-estimate.
2017 */
2018 group_size = get_nr_threads(leader);
2019 /* flex_array supports very large thread-groups better than kmalloc. */
2020 group = flex_array_alloc(sizeof(struct task_struct *), group_size,
2021 GFP_KERNEL);
2022 if (!group)
2023 return -ENOMEM;
2024 /* pre-allocate to guarantee space while iterating in rcu read-side. */
2025 retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
2026 if (retval)
2027 goto out_free_group_list;
2028
2029 /* prevent changes to the threadgroup list while we take a snapshot. */
2030 rcu_read_lock();
2031 if (!thread_group_leader(leader)) {
2032 /*
2033 * a race with de_thread from another thread's exec() may strip
2034 * us of our leadership, making while_each_thread unsafe to use
2035 * on this task. if this happens, there is no choice but to
2036 * throw this task away and try again (from cgroup_procs_write);
2037 * this is "double-double-toil-and-trouble-check locking".
2038 */
2039 rcu_read_unlock();
2040 retval = -EAGAIN;
2041 goto out_free_group_list;
2042 }
2043 /* take a reference on each task in the group to go in the array. */
2044 tsk = leader;
2045 i = 0;
2046 do {
2047 /* as per above, nr_threads may decrease, but not increase. */
2048 BUG_ON(i >= group_size);
2049 get_task_struct(tsk);
2050 /*
2051 * saying GFP_ATOMIC has no effect here because we did prealloc
2052 * earlier, but it's good form to communicate our expectations.
2053 */
2054 retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
2055 BUG_ON(retval != 0);
2056 i++;
2057 } while_each_thread(leader, tsk);
2058 /* remember the number of threads in the array for later. */
2059 group_size = i;
2060 rcu_read_unlock();
2061
2062 /*
2063 * step 1: check that we can legitimately attach to the cgroup.
2064 */
2065 for_each_subsys(root, ss) {
2066 if (ss->can_attach) {
2067 retval = ss->can_attach(ss, cgrp, leader);
2068 if (retval) {
2069 failed_ss = ss;
2070 goto out_cancel_attach;
2071 }
2072 }
2073 /* a callback to be run on every thread in the threadgroup. */
2074 if (ss->can_attach_task) {
2075 /* run on each task in the threadgroup. */
2076 for (i = 0; i < group_size; i++) {
2077 tsk = flex_array_get_ptr(group, i);
2078 retval = ss->can_attach_task(cgrp, tsk);
2079 if (retval) {
2080 failed_ss = ss;
2081 cancel_failed_ss = true;
2082 goto out_cancel_attach;
2083 }
2084 }
2085 }
2086 }
2087
2088 /*
2089 * step 2: make sure css_sets exist for all threads to be migrated.
2090 * we use find_css_set, which allocates a new one if necessary.
2091 */
2092 INIT_LIST_HEAD(&newcg_list);
2093 for (i = 0; i < group_size; i++) {
2094 tsk = flex_array_get_ptr(group, i);
2095 /* nothing to do if this task is already in the cgroup */
2096 oldcgrp = task_cgroup_from_root(tsk, root);
2097 if (cgrp == oldcgrp)
2098 continue;
2099 /* get old css_set pointer */
2100 task_lock(tsk);
2101 if (tsk->flags & PF_EXITING) {
2102 /* ignore this task if it's going away */
2103 task_unlock(tsk);
2104 continue;
2105 }
2106 oldcg = tsk->cgroups;
2107 get_css_set(oldcg);
2108 task_unlock(tsk);
2109 /* see if the new one for us is already in the list? */
2110 if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
2111 /* was already there, nothing to do. */
2112 put_css_set(oldcg);
2113 } else {
2114 /* we don't already have it. get new one. */
2115 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2116 put_css_set(oldcg);
2117 if (retval)
2118 goto out_list_teardown;
2119 }
2120 }
2121
2122 /*
2123 * step 3: now that we're guaranteed success wrt the css_sets, proceed
2124 * to move all tasks to the new cgroup, calling ss->attach_task for each
2125 * one along the way. there are no failure cases after here, so this is
2126 * the commit point.
2127 */
2128 for_each_subsys(root, ss) {
2129 if (ss->pre_attach)
2130 ss->pre_attach(cgrp);
2131 }
2132 for (i = 0; i < group_size; i++) {
2133 tsk = flex_array_get_ptr(group, i);
2134 /* leave current thread as it is if it's already there */
2135 oldcgrp = task_cgroup_from_root(tsk, root);
2136 if (cgrp == oldcgrp)
2137 continue;
2138 /* attach each task to each subsystem */
2139 for_each_subsys(root, ss) {
2140 if (ss->attach_task)
2141 ss->attach_task(cgrp, tsk);
2142 }
2143 /* if the thread is PF_EXITING, it can just get skipped. */
2144 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2145 BUG_ON(retval != 0 && retval != -ESRCH);
2146 }
2147 /* nothing is sensitive to fork() after this point. */
2148
2149 /*
2150 * step 4: do expensive, non-thread-specific subsystem callbacks.
2151 * TODO: if ever a subsystem needs to know the oldcgrp for each task
2152 * being moved, this call will need to be reworked to communicate that.
2153 */
2154 for_each_subsys(root, ss) {
2155 if (ss->attach)
2156 ss->attach(ss, cgrp, oldcgrp, leader);
2157 }
2158
2159 /*
2160 * step 5: success! and cleanup
2161 */
2162 synchronize_rcu();
2163 cgroup_wakeup_rmdir_waiter(cgrp);
2164 retval = 0;
2165out_list_teardown:
2166 /* clean up the list of prefetched css_sets. */
2167 list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
2168 list_del(&cg_entry->links);
2169 put_css_set(cg_entry->cg);
2170 kfree(cg_entry);
2171 }
2172out_cancel_attach:
2173 /* same deal as in cgroup_attach_task */
2174 if (retval) {
2175 for_each_subsys(root, ss) {
2176 if (ss == failed_ss) {
2177 if (cancel_failed_ss && ss->cancel_attach)
2178 ss->cancel_attach(ss, cgrp, leader);
2179 break;
2180 }
2181 if (ss->cancel_attach)
2182 ss->cancel_attach(ss, cgrp, leader);
2183 }
2184 }
2185 /* clean up the array of referenced threads in the group. */
2186 for (i = 0; i < group_size; i++) {
2187 tsk = flex_array_get_ptr(group, i);
2188 put_task_struct(tsk);
2189 }
2190out_free_group_list:
2191 flex_array_free(group);
2192 return retval;
2193}
2194
2195/*
2196 * Find the task_struct of the task to attach by vpid and pass it along to the
2197 * function to attach either it or all tasks in its threadgroup. Will take
2198 * cgroup_mutex; may take task_lock of task.
1878 */ 2199 */
1879static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) 2200static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
1880{ 2201{
1881 struct task_struct *tsk; 2202 struct task_struct *tsk;
1882 const struct cred *cred = current_cred(), *tcred; 2203 const struct cred *cred = current_cred(), *tcred;
1883 int ret; 2204 int ret;
1884 2205
2206 if (!cgroup_lock_live_group(cgrp))
2207 return -ENODEV;
2208
1885 if (pid) { 2209 if (pid) {
1886 rcu_read_lock(); 2210 rcu_read_lock();
1887 tsk = find_task_by_vpid(pid); 2211 tsk = find_task_by_vpid(pid);
1888 if (!tsk || tsk->flags & PF_EXITING) { 2212 if (!tsk) {
1889 rcu_read_unlock(); 2213 rcu_read_unlock();
2214 cgroup_unlock();
2215 return -ESRCH;
2216 }
2217 if (threadgroup) {
2218 /*
2219 * RCU protects this access, since tsk was found in the
2220 * tid map. a race with de_thread may cause group_leader
2221 * to stop being the leader, but cgroup_attach_proc will
2222 * detect it later.
2223 */
2224 tsk = tsk->group_leader;
2225 } else if (tsk->flags & PF_EXITING) {
2226 /* optimization for the single-task-only case */
2227 rcu_read_unlock();
2228 cgroup_unlock();
1890 return -ESRCH; 2229 return -ESRCH;
1891 } 2230 }
1892 2231
2232 /*
2233 * even if we're attaching all tasks in the thread group, we
2234 * only need to check permissions on one of them.
2235 */
1893 tcred = __task_cred(tsk); 2236 tcred = __task_cred(tsk);
1894 if (cred->euid && 2237 if (cred->euid &&
1895 cred->euid != tcred->uid && 2238 cred->euid != tcred->uid &&
1896 cred->euid != tcred->suid) { 2239 cred->euid != tcred->suid) {
1897 rcu_read_unlock(); 2240 rcu_read_unlock();
2241 cgroup_unlock();
1898 return -EACCES; 2242 return -EACCES;
1899 } 2243 }
1900 get_task_struct(tsk); 2244 get_task_struct(tsk);
1901 rcu_read_unlock(); 2245 rcu_read_unlock();
1902 } else { 2246 } else {
1903 tsk = current; 2247 if (threadgroup)
2248 tsk = current->group_leader;
2249 else
2250 tsk = current;
1904 get_task_struct(tsk); 2251 get_task_struct(tsk);
1905 } 2252 }
1906 2253
1907 ret = cgroup_attach_task(cgrp, tsk); 2254 if (threadgroup) {
2255 threadgroup_fork_write_lock(tsk);
2256 ret = cgroup_attach_proc(cgrp, tsk);
2257 threadgroup_fork_write_unlock(tsk);
2258 } else {
2259 ret = cgroup_attach_task(cgrp, tsk);
2260 }
1908 put_task_struct(tsk); 2261 put_task_struct(tsk);
2262 cgroup_unlock();
1909 return ret; 2263 return ret;
1910} 2264}
1911 2265
1912static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 2266static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
1913{ 2267{
2268 return attach_task_by_pid(cgrp, pid, false);
2269}
2270
2271static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2272{
1914 int ret; 2273 int ret;
1915 if (!cgroup_lock_live_group(cgrp)) 2274 do {
1916 return -ENODEV; 2275 /*
1917 ret = attach_task_by_pid(cgrp, pid); 2276 * attach_proc fails with -EAGAIN if threadgroup leadership
1918 cgroup_unlock(); 2277 * changes in the middle of the operation, in which case we need
2278 * to find the task_struct for the new leader and start over.
2279 */
2280 ret = attach_task_by_pid(cgrp, tgid, true);
2281 } while (ret == -EAGAIN);
1919 return ret; 2282 return ret;
1920} 2283}
1921 2284
@@ -3182,7 +3545,8 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3182 } 3545 }
3183 3546
3184 /* the process need read permission on control file */ 3547 /* the process need read permission on control file */
3185 ret = file_permission(cfile, MAY_READ); 3548 /* AV: shouldn't we check that it's been opened for read instead? */
3549 ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
3186 if (ret < 0) 3550 if (ret < 0)
3187 goto fail; 3551 goto fail;
3188 3552
@@ -3272,9 +3636,9 @@ static struct cftype files[] = {
3272 { 3636 {
3273 .name = CGROUP_FILE_GENERIC_PREFIX "procs", 3637 .name = CGROUP_FILE_GENERIC_PREFIX "procs",
3274 .open = cgroup_procs_open, 3638 .open = cgroup_procs_open,
3275 /* .write_u64 = cgroup_procs_write, TODO */ 3639 .write_u64 = cgroup_procs_write,
3276 .release = cgroup_pidlist_release, 3640 .release = cgroup_pidlist_release,
3277 .mode = S_IRUGO, 3641 .mode = S_IRUGO | S_IWUSR,
3278 }, 3642 },
3279 { 3643 {
3280 .name = "notify_on_release", 3644 .name = "notify_on_release",
@@ -4270,122 +4634,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4270} 4634}
4271 4635
4272/** 4636/**
4273 * cgroup_clone - clone the cgroup the given subsystem is attached to
4274 * @tsk: the task to be moved
4275 * @subsys: the given subsystem
4276 * @nodename: the name for the new cgroup
4277 *
4278 * Duplicate the current cgroup in the hierarchy that the given
4279 * subsystem is attached to, and move this task into the new
4280 * child.
4281 */
4282int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
4283 char *nodename)
4284{
4285 struct dentry *dentry;
4286 int ret = 0;
4287 struct cgroup *parent, *child;
4288 struct inode *inode;
4289 struct css_set *cg;
4290 struct cgroupfs_root *root;
4291 struct cgroup_subsys *ss;
4292
4293 /* We shouldn't be called by an unregistered subsystem */
4294 BUG_ON(!subsys->active);
4295
4296 /* First figure out what hierarchy and cgroup we're dealing
4297 * with, and pin them so we can drop cgroup_mutex */
4298 mutex_lock(&cgroup_mutex);
4299 again:
4300 root = subsys->root;
4301 if (root == &rootnode) {
4302 mutex_unlock(&cgroup_mutex);
4303 return 0;
4304 }
4305
4306 /* Pin the hierarchy */
4307 if (!atomic_inc_not_zero(&root->sb->s_active)) {
4308 /* We race with the final deactivate_super() */
4309 mutex_unlock(&cgroup_mutex);
4310 return 0;
4311 }
4312
4313 /* Keep the cgroup alive */
4314 task_lock(tsk);
4315 parent = task_cgroup(tsk, subsys->subsys_id);
4316 cg = tsk->cgroups;
4317 get_css_set(cg);
4318 task_unlock(tsk);
4319
4320 mutex_unlock(&cgroup_mutex);
4321
4322 /* Now do the VFS work to create a cgroup */
4323 inode = parent->dentry->d_inode;
4324
4325 /* Hold the parent directory mutex across this operation to
4326 * stop anyone else deleting the new cgroup */
4327 mutex_lock(&inode->i_mutex);
4328 dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
4329 if (IS_ERR(dentry)) {
4330 printk(KERN_INFO
4331 "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
4332 PTR_ERR(dentry));
4333 ret = PTR_ERR(dentry);
4334 goto out_release;
4335 }
4336
4337 /* Create the cgroup directory, which also creates the cgroup */
4338 ret = vfs_mkdir(inode, dentry, 0755);
4339 child = __d_cgrp(dentry);
4340 dput(dentry);
4341 if (ret) {
4342 printk(KERN_INFO
4343 "Failed to create cgroup %s: %d\n", nodename,
4344 ret);
4345 goto out_release;
4346 }
4347
4348 /* The cgroup now exists. Retake cgroup_mutex and check
4349 * that we're still in the same state that we thought we
4350 * were. */
4351 mutex_lock(&cgroup_mutex);
4352 if ((root != subsys->root) ||
4353 (parent != task_cgroup(tsk, subsys->subsys_id))) {
4354 /* Aargh, we raced ... */
4355 mutex_unlock(&inode->i_mutex);
4356 put_css_set(cg);
4357
4358 deactivate_super(root->sb);
4359 /* The cgroup is still accessible in the VFS, but
4360 * we're not going to try to rmdir() it at this
4361 * point. */
4362 printk(KERN_INFO
4363 "Race in cgroup_clone() - leaking cgroup %s\n",
4364 nodename);
4365 goto again;
4366 }
4367
4368 /* do any required auto-setup */
4369 for_each_subsys(root, ss) {
4370 if (ss->post_clone)
4371 ss->post_clone(ss, child);
4372 }
4373
4374 /* All seems fine. Finish by moving the task into the new cgroup */
4375 ret = cgroup_attach_task(child, tsk);
4376 mutex_unlock(&cgroup_mutex);
4377
4378 out_release:
4379 mutex_unlock(&inode->i_mutex);
4380
4381 mutex_lock(&cgroup_mutex);
4382 put_css_set(cg);
4383 mutex_unlock(&cgroup_mutex);
4384 deactivate_super(root->sb);
4385 return ret;
4386}
4387
4388/**
4389 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp 4637 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
4390 * @cgrp: the cgroup in question 4638 * @cgrp: the cgroup in question
4391 * @task: the task in question 4639 * @task: the task in question
@@ -4569,8 +4817,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
4569 * on this or this is under rcu_read_lock(). Once css->id is allocated, 4817 * on this or this is under rcu_read_lock(). Once css->id is allocated,
4570 * it's unchanged until freed. 4818 * it's unchanged until freed.
4571 */ 4819 */
4572 cssid = rcu_dereference_check(css->id, 4820 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
4573 rcu_read_lock_held() || atomic_read(&css->refcnt));
4574 4821
4575 if (cssid) 4822 if (cssid)
4576 return cssid->id; 4823 return cssid->id;
@@ -4582,8 +4829,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
4582{ 4829{
4583 struct css_id *cssid; 4830 struct css_id *cssid;
4584 4831
4585 cssid = rcu_dereference_check(css->id, 4832 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
4586 rcu_read_lock_held() || atomic_read(&css->refcnt));
4587 4833
4588 if (cssid) 4834 if (cssid)
4589 return cssid->depth; 4835 return cssid->depth;
@@ -4623,14 +4869,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
4623 return ret; 4869 return ret;
4624} 4870}
4625 4871
4626static void __free_css_id_cb(struct rcu_head *head)
4627{
4628 struct css_id *id;
4629
4630 id = container_of(head, struct css_id, rcu_head);
4631 kfree(id);
4632}
4633
4634void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) 4872void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4635{ 4873{
4636 struct css_id *id = css->id; 4874 struct css_id *id = css->id;
@@ -4645,7 +4883,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4645 spin_lock(&ss->id_lock); 4883 spin_lock(&ss->id_lock);
4646 idr_remove(&ss->idr, id->id); 4884 idr_remove(&ss->idr, id->id);
4647 spin_unlock(&ss->id_lock); 4885 spin_unlock(&ss->id_lock);
4648 call_rcu(&id->rcu_head, __free_css_id_cb); 4886 kfree_rcu(id, rcu_head);
4649} 4887}
4650EXPORT_SYMBOL_GPL(free_css_id); 4888EXPORT_SYMBOL_GPL(free_css_id);
4651 4889
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e7bebb7c6c38..e691818d7e45 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -160,7 +160,7 @@ static void freezer_destroy(struct cgroup_subsys *ss,
160 */ 160 */
161static int freezer_can_attach(struct cgroup_subsys *ss, 161static int freezer_can_attach(struct cgroup_subsys *ss,
162 struct cgroup *new_cgroup, 162 struct cgroup *new_cgroup,
163 struct task_struct *task, bool threadgroup) 163 struct task_struct *task)
164{ 164{
165 struct freezer *freezer; 165 struct freezer *freezer;
166 166
@@ -172,26 +172,17 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
172 if (freezer->state != CGROUP_THAWED) 172 if (freezer->state != CGROUP_THAWED)
173 return -EBUSY; 173 return -EBUSY;
174 174
175 return 0;
176}
177
178static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
179{
175 rcu_read_lock(); 180 rcu_read_lock();
176 if (__cgroup_freezing_or_frozen(task)) { 181 if (__cgroup_freezing_or_frozen(tsk)) {
177 rcu_read_unlock(); 182 rcu_read_unlock();
178 return -EBUSY; 183 return -EBUSY;
179 } 184 }
180 rcu_read_unlock(); 185 rcu_read_unlock();
181
182 if (threadgroup) {
183 struct task_struct *c;
184
185 rcu_read_lock();
186 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
187 if (__cgroup_freezing_or_frozen(c)) {
188 rcu_read_unlock();
189 return -EBUSY;
190 }
191 }
192 rcu_read_unlock();
193 }
194
195 return 0; 186 return 0;
196} 187}
197 188
@@ -390,6 +381,9 @@ struct cgroup_subsys freezer_subsys = {
390 .populate = freezer_populate, 381 .populate = freezer_populate,
391 .subsys_id = freezer_subsys_id, 382 .subsys_id = freezer_subsys_id,
392 .can_attach = freezer_can_attach, 383 .can_attach = freezer_can_attach,
384 .can_attach_task = freezer_can_attach_task,
385 .pre_attach = NULL,
386 .attach_task = NULL,
393 .attach = NULL, 387 .attach = NULL,
394 .fork = freezer_fork, 388 .fork = freezer_fork,
395 .exit = NULL, 389 .exit = NULL,
diff --git a/kernel/compat.c b/kernel/compat.c
index 38b1d2c1cbe8..e2435ee9993a 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -158,6 +158,7 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user
158 __put_user(ts->tv_sec, &cts->tv_sec) || 158 __put_user(ts->tv_sec, &cts->tv_sec) ||
159 __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; 159 __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
160} 160}
161EXPORT_SYMBOL_GPL(put_compat_timespec);
161 162
162static long compat_nanosleep_restart(struct restart_block *restart) 163static long compat_nanosleep_restart(struct restart_block *restart)
163{ 164{
@@ -293,6 +294,8 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
293 return compat_jiffies_to_clock_t(jiffies); 294 return compat_jiffies_to_clock_t(jiffies);
294} 295}
295 296
297#ifdef __ARCH_WANT_SYS_SIGPENDING
298
296/* 299/*
297 * Assumption: old_sigset_t and compat_old_sigset_t are both 300 * Assumption: old_sigset_t and compat_old_sigset_t are both
298 * types that can be passed to put_user()/get_user(). 301 * types that can be passed to put_user()/get_user().
@@ -312,6 +315,10 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
312 return ret; 315 return ret;
313} 316}
314 317
318#endif
319
320#ifdef __ARCH_WANT_SYS_SIGPROCMASK
321
315asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, 322asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
316 compat_old_sigset_t __user *oset) 323 compat_old_sigset_t __user *oset)
317{ 324{
@@ -333,6 +340,8 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
333 return ret; 340 return ret;
334} 341}
335 342
343#endif
344
336asmlinkage long compat_sys_setrlimit(unsigned int resource, 345asmlinkage long compat_sys_setrlimit(unsigned int resource,
337 struct compat_rlimit __user *rlim) 346 struct compat_rlimit __user *rlim)
338{ 347{
@@ -882,6 +891,7 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
882 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); 891 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
883 } 892 }
884} 893}
894EXPORT_SYMBOL_GPL(sigset_from_compat);
885 895
886asmlinkage long 896asmlinkage long
887compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, 897compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
@@ -890,10 +900,9 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
890{ 900{
891 compat_sigset_t s32; 901 compat_sigset_t s32;
892 sigset_t s; 902 sigset_t s;
893 int sig;
894 struct timespec t; 903 struct timespec t;
895 siginfo_t info; 904 siginfo_t info;
896 long ret, timeout = 0; 905 long ret;
897 906
898 if (sigsetsize != sizeof(sigset_t)) 907 if (sigsetsize != sizeof(sigset_t))
899 return -EINVAL; 908 return -EINVAL;
@@ -901,51 +910,19 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
901 if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) 910 if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
902 return -EFAULT; 911 return -EFAULT;
903 sigset_from_compat(&s, &s32); 912 sigset_from_compat(&s, &s32);
904 sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP));
905 signotset(&s);
906 913
907 if (uts) { 914 if (uts) {
908 if (get_compat_timespec (&t, uts)) 915 if (get_compat_timespec(&t, uts))
909 return -EFAULT; 916 return -EFAULT;
910 if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0
911 || t.tv_sec < 0)
912 return -EINVAL;
913 } 917 }
914 918
915 spin_lock_irq(&current->sighand->siglock); 919 ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
916 sig = dequeue_signal(current, &s, &info);
917 if (!sig) {
918 timeout = MAX_SCHEDULE_TIMEOUT;
919 if (uts)
920 timeout = timespec_to_jiffies(&t)
921 +(t.tv_sec || t.tv_nsec);
922 if (timeout) {
923 current->real_blocked = current->blocked;
924 sigandsets(&current->blocked, &current->blocked, &s);
925
926 recalc_sigpending();
927 spin_unlock_irq(&current->sighand->siglock);
928
929 timeout = schedule_timeout_interruptible(timeout);
930
931 spin_lock_irq(&current->sighand->siglock);
932 sig = dequeue_signal(current, &s, &info);
933 current->blocked = current->real_blocked;
934 siginitset(&current->real_blocked, 0);
935 recalc_sigpending();
936 }
937 }
938 spin_unlock_irq(&current->sighand->siglock);
939 920
940 if (sig) { 921 if (ret > 0 && uinfo) {
941 ret = sig; 922 if (copy_siginfo_to_user32(uinfo, &info))
942 if (uinfo) { 923 ret = -EFAULT;
943 if (copy_siginfo_to_user32(uinfo, &info))
944 ret = -EFAULT;
945 }
946 }else {
947 ret = timeout?-EINTR:-EAGAIN;
948 } 924 }
925
949 return ret; 926 return ret;
950 927
951} 928}
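
The hunk above strips the open-coded wait loop (siglock, dequeue_signal(), schedule_timeout_interruptible(), the mask juggling) out of compat_sys_rt_sigtimedwait() and leaves only the 32-bit conversions around a single do_sigtimedwait() call. For reference, a small userspace program showing the semantics that path provides to callers, written against the POSIX sigtimedwait() interface (illustrative only, not kernel code):

#define _POSIX_C_SOURCE 200809L
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	sigset_t set;
	siginfo_t info;
	struct timespec timeout = { .tv_sec = 2, .tv_nsec = 0 };

	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	/* Block SIGUSR1 so it is consumed by sigtimedwait() rather than a handler. */
	sigprocmask(SIG_BLOCK, &set, NULL);

	int sig = sigtimedwait(&set, &info, &timeout);
	if (sig == SIGUSR1)
		printf("got SIGUSR1 from pid %d\n", (int)info.si_pid);
	else if (sig == -1 && errno == EAGAIN)
		printf("timed out after 2s\n");	/* the -EAGAIN case of the old loop */
	else
		perror("sigtimedwait");
	return 0;
}

A positive return value is the delivered signal (copied out via copy_siginfo_to_user32() in the compat path); a timeout surfaces as EAGAIN, matching what the removed open-coded loop used to compute by hand.
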
@@ -1016,11 +993,8 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
1016 sigset_from_compat(&newset, &newset32); 993 sigset_from_compat(&newset, &newset32);
1017 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 994 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
1018 995
1019 spin_lock_irq(&current->sighand->siglock);
1020 current->saved_sigmask = current->blocked; 996 current->saved_sigmask = current->blocked;
1021 current->blocked = newset; 997 set_current_blocked(&newset);
1022 recalc_sigpending();
1023 spin_unlock_irq(&current->sighand->siglock);
1024 998
1025 current->state = TASK_INTERRUPTIBLE; 999 current->state = TASK_INTERRUPTIBLE;
1026 schedule(); 1000 schedule();
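
Likewise, compat_sys_rt_sigsuspend() above now installs the new mask through set_current_blocked() instead of taking siglock and calling recalc_sigpending() itself; SIGKILL and SIGSTOP are still stripped from the mask first. A loose userspace analogue of what the syscall gives its caller, namely atomically swapping the signal mask and sleeping until a signal arrives, using plain POSIX calls (not kernel code):

#define _POSIX_C_SOURCE 200809L
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void handler(int sig) { (void)sig; /* just interrupt sigsuspend() */ }

int main(void)
{
	sigset_t block, waitmask;
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = handler;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGUSR1, &sa, NULL);

	/* Normally keep SIGUSR1 blocked... */
	sigemptyset(&block);
	sigaddset(&block, SIGUSR1);
	sigprocmask(SIG_BLOCK, &block, NULL);

	/* ...then atomically install an empty mask and sleep, as rt_sigsuspend
	 * does; the original mask is restored when sigsuspend() returns. */
	sigemptyset(&waitmask);
	printf("pid %d waiting, send SIGUSR1\n", (int)getpid());
	sigsuspend(&waitmask);
	puts("woken by a signal; old mask restored");
	return 0;
}
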
diff --git a/kernel/configs.c b/kernel/configs.c
index b4066b44a99d..42e8fa075eed 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -92,8 +92,8 @@ static void __exit ikconfig_cleanup(void)
92module_init(ikconfig_init); 92module_init(ikconfig_init);
93module_exit(ikconfig_cleanup); 93module_exit(ikconfig_cleanup);
94 94
95#endif /* CONFIG_IKCONFIG_PROC */
96
95MODULE_LICENSE("GPL"); 97MODULE_LICENSE("GPL");
96MODULE_AUTHOR("Randy Dunlap"); 98MODULE_AUTHOR("Randy Dunlap");
97MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel"); 99MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel");
98
99#endif /* CONFIG_IKCONFIG_PROC */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 33eee16addb8..10131fdaff70 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -55,7 +55,7 @@
55#include <linux/sort.h> 55#include <linux/sort.h>
56 56
57#include <asm/uaccess.h> 57#include <asm/uaccess.h>
58#include <asm/atomic.h> 58#include <linux/atomic.h>
59#include <linux/mutex.h> 59#include <linux/mutex.h>
60#include <linux/workqueue.h> 60#include <linux/workqueue.h>
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
@@ -1159,7 +1159,7 @@ int current_cpuset_is_being_rebound(void)
1159static int update_relax_domain_level(struct cpuset *cs, s64 val) 1159static int update_relax_domain_level(struct cpuset *cs, s64 val)
1160{ 1160{
1161#ifdef CONFIG_SMP 1161#ifdef CONFIG_SMP
1162 if (val < -1 || val >= SD_LV_MAX) 1162 if (val < -1 || val >= sched_domain_level_max)
1163 return -EINVAL; 1163 return -EINVAL;
1164#endif 1164#endif
1165 1165
@@ -1367,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1367 return val; 1367 return val;
1368} 1368}
1369 1369
1370/* Protected by cgroup_lock */
1371static cpumask_var_t cpus_attach;
1372
1373/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1370/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1374static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, 1371static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1375 struct task_struct *tsk, bool threadgroup) 1372 struct task_struct *tsk)
1376{ 1373{
1377 int ret;
1378 struct cpuset *cs = cgroup_cs(cont); 1374 struct cpuset *cs = cgroup_cs(cont);
1379 1375
1380 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1376 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1391,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1391 if (tsk->flags & PF_THREAD_BOUND) 1387 if (tsk->flags & PF_THREAD_BOUND)
1392 return -EINVAL; 1388 return -EINVAL;
1393 1389
1394 ret = security_task_setscheduler(tsk);
1395 if (ret)
1396 return ret;
1397 if (threadgroup) {
1398 struct task_struct *c;
1399
1400 rcu_read_lock();
1401 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1402 ret = security_task_setscheduler(c);
1403 if (ret) {
1404 rcu_read_unlock();
1405 return ret;
1406 }
1407 }
1408 rcu_read_unlock();
1409 }
1410 return 0; 1390 return 0;
1411} 1391}
1412 1392
1413static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, 1393static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
1414 struct cpuset *cs) 1394{
1395 return security_task_setscheduler(task);
1396}
1397
1398/*
1399 * Protected by cgroup_lock. The nodemasks must be stored globally because
1400 * dynamically allocating them is not allowed in pre_attach, and they must
1401 * persist among pre_attach, attach_task, and attach.
1402 */
1403static cpumask_var_t cpus_attach;
1404static nodemask_t cpuset_attach_nodemask_from;
1405static nodemask_t cpuset_attach_nodemask_to;
1406
1407/* Set-up work for before attaching each task. */
1408static void cpuset_pre_attach(struct cgroup *cont)
1409{
1410 struct cpuset *cs = cgroup_cs(cont);
1411
1412 if (cs == &top_cpuset)
1413 cpumask_copy(cpus_attach, cpu_possible_mask);
1414 else
1415 guarantee_online_cpus(cs, cpus_attach);
1416
1417 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1418}
1419
1420/* Per-thread attachment work. */
1421static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
1415{ 1422{
1416 int err; 1423 int err;
1424 struct cpuset *cs = cgroup_cs(cont);
1425
1417 /* 1426 /*
1418 * can_attach beforehand should guarantee that this doesn't fail. 1427 * can_attach beforehand should guarantee that this doesn't fail.
1419 * TODO: have a better way to handle failure here 1428 * TODO: have a better way to handle failure here
@@ -1421,45 +1430,29 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
1421 err = set_cpus_allowed_ptr(tsk, cpus_attach); 1430 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1422 WARN_ON_ONCE(err); 1431 WARN_ON_ONCE(err);
1423 1432
1424 cpuset_change_task_nodemask(tsk, to); 1433 cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
1425 cpuset_update_task_spread_flag(cs, tsk); 1434 cpuset_update_task_spread_flag(cs, tsk);
1426
1427} 1435}
1428 1436
1429static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, 1437static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1430 struct cgroup *oldcont, struct task_struct *tsk, 1438 struct cgroup *oldcont, struct task_struct *tsk)
1431 bool threadgroup)
1432{ 1439{
1433 struct mm_struct *mm; 1440 struct mm_struct *mm;
1434 struct cpuset *cs = cgroup_cs(cont); 1441 struct cpuset *cs = cgroup_cs(cont);
1435 struct cpuset *oldcs = cgroup_cs(oldcont); 1442 struct cpuset *oldcs = cgroup_cs(oldcont);
1436 static nodemask_t to; /* protected by cgroup_mutex */
1437
1438 if (cs == &top_cpuset) {
1439 cpumask_copy(cpus_attach, cpu_possible_mask);
1440 } else {
1441 guarantee_online_cpus(cs, cpus_attach);
1442 }
1443 guarantee_online_mems(cs, &to);
1444
1445 /* do per-task migration stuff possibly for each in the threadgroup */
1446 cpuset_attach_task(tsk, &to, cs);
1447 if (threadgroup) {
1448 struct task_struct *c;
1449 rcu_read_lock();
1450 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1451 cpuset_attach_task(c, &to, cs);
1452 }
1453 rcu_read_unlock();
1454 }
1455 1443
1456 /* change mm; only needs to be done once even if threadgroup */ 1444 /*
1457 to = cs->mems_allowed; 1445 * Change mm, possibly for multiple threads in a threadgroup. This is
1446 * expensive and may sleep.
1447 */
1448 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1449 cpuset_attach_nodemask_to = cs->mems_allowed;
1458 mm = get_task_mm(tsk); 1450 mm = get_task_mm(tsk);
1459 if (mm) { 1451 if (mm) {
1460 mpol_rebind_mm(mm, &to); 1452 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1461 if (is_memory_migrate(cs)) 1453 if (is_memory_migrate(cs))
1462 cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to); 1454 cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
1455 &cpuset_attach_nodemask_to);
1463 mmput(mm); 1456 mmput(mm);
1464 } 1457 }
1465} 1458}
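
The cpuset hunks above split attachment into pre_attach() (resolve cpus_attach and the target nodemask once per operation, using pre-allocated file-scope storage because pre_attach may not allocate), attach_task() (cheap per-thread work), and attach() (the one-off, possibly sleeping mm rebind/migration). A standalone sketch of that call order, with every name below invented purely for illustration:

#include <stdio.h>

struct thread { int tid; };
struct group  { const char *name; struct thread threads[3]; int nr; };

/* Stand-ins for cpus_attach / cpuset_attach_nodemask_to: file-scope, mirroring
 * the "no allocation in pre_attach, must persist until attach" rule above. */
static unsigned attach_cpus;
static unsigned attach_mems;

static void pre_attach(struct group *g)
{
	attach_cpus = 0xf;	/* pretend guarantee_online_cpus() result */
	attach_mems = 0x3;	/* pretend guarantee_online_mems() result */
	printf("pre_attach(%s)\n", g->name);
}

static void attach_task(struct thread *t)
{
	printf("  attach_task(tid=%d): cpus=%#x mems=%#x\n",
	       t->tid, attach_cpus, attach_mems);
}

static void attach(struct group *g)
{
	printf("attach(%s): rebind/migrate the mm once\n", g->name);
}

int main(void)
{
	struct group g = { "demo", { { 101 }, { 102 }, { 103 } }, 3 };

	pre_attach(&g);				/* once per attach operation */
	for (int i = 0; i < g.nr; i++)
		attach_task(&g.threads[i]);	/* once per thread */
	attach(&g);				/* once, after all threads */
	return 0;
}
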
@@ -1809,10 +1802,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1809} 1802}
1810 1803
1811/* 1804/*
1812 * post_clone() is called at the end of cgroup_clone(). 1805 * post_clone() is called during cgroup_create() when the
1813 * 'cgroup' was just created automatically as a result of 1806 * clone_children mount argument was specified. The cgroup
1814 * a cgroup_clone(), and the current task is about to 1807 * can not yet have any tasks.
1815 * be moved into 'cgroup'.
1816 * 1808 *
1817 * Currently we refuse to set up the cgroup - thereby 1809 * Currently we refuse to set up the cgroup - thereby
1818 * refusing the task to be entered, and as a result refusing 1810 * refusing the task to be entered, and as a result refusing
@@ -1911,6 +1903,9 @@ struct cgroup_subsys cpuset_subsys = {
1911 .create = cpuset_create, 1903 .create = cpuset_create,
1912 .destroy = cpuset_destroy, 1904 .destroy = cpuset_destroy,
1913 .can_attach = cpuset_can_attach, 1905 .can_attach = cpuset_can_attach,
1906 .can_attach_task = cpuset_can_attach_task,
1907 .pre_attach = cpuset_pre_attach,
1908 .attach_task = cpuset_attach_task,
1914 .attach = cpuset_attach, 1909 .attach = cpuset_attach,
1915 .populate = cpuset_populate, 1910 .populate = cpuset_populate,
1916 .post_clone = cpuset_post_clone, 1911 .post_clone = cpuset_post_clone,
@@ -2195,7 +2190,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2195 rcu_read_lock(); 2190 rcu_read_lock();
2196 cs = task_cs(tsk); 2191 cs = task_cs(tsk);
2197 if (cs) 2192 if (cs)
2198 cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed); 2193 do_set_cpus_allowed(tsk, cs->cpus_allowed);
2199 rcu_read_unlock(); 2194 rcu_read_unlock();
2200 2195
2201 /* 2196 /*
@@ -2222,7 +2217,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2222 * Like above we can temporary set any mask and rely on 2217 * Like above we can temporary set any mask and rely on
2223 * set_cpus_allowed_ptr() as synchronization point. 2218 * set_cpus_allowed_ptr() as synchronization point.
2224 */ 2219 */
2225 cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask); 2220 do_set_cpus_allowed(tsk, cpu_possible_mask);
2226 cpu = cpumask_any(cpu_active_mask); 2221 cpu = cpumask_any(cpu_active_mask);
2227 } 2222 }
2228 2223
@@ -2465,11 +2460,19 @@ static int cpuset_spread_node(int *rotor)
2465 2460
2466int cpuset_mem_spread_node(void) 2461int cpuset_mem_spread_node(void)
2467{ 2462{
2463 if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
2464 current->cpuset_mem_spread_rotor =
2465 node_random(&current->mems_allowed);
2466
2468 return cpuset_spread_node(&current->cpuset_mem_spread_rotor); 2467 return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
2469} 2468}
2470 2469
2471int cpuset_slab_spread_node(void) 2470int cpuset_slab_spread_node(void)
2472{ 2471{
2472 if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
2473 current->cpuset_slab_spread_rotor =
2474 node_random(&current->mems_allowed);
2475
2473 return cpuset_spread_node(&current->cpuset_slab_spread_rotor); 2476 return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
2474} 2477}
2475 2478
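
cpuset_mem_spread_node() and cpuset_slab_spread_node() above now seed an uninitialised rotor (NUMA_NO_NODE) with node_random(&current->mems_allowed), a random node taken from the allowed mask, so different tasks do not all start spreading from the same node. A toy userspace sketch of picking a random set bit from a mask (illustrative only; node_random() itself is a nodemask helper, not this code):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Pick a random set bit from 'mask', or -1 if the mask is empty
 * (a toy stand-in for node_random(&mems_allowed)). */
static int mask_random(unsigned long mask)
{
	int set[sizeof(mask) * 8];
	int n = 0;

	for (int bit = 0; bit < (int)(sizeof(mask) * 8); bit++)
		if (mask & (1UL << bit))
			set[n++] = bit;
	return n ? set[rand() % n] : -1;
}

int main(void)
{
	unsigned long mems_allowed = 0x2c;	/* nodes 2, 3 and 5 */

	srand((unsigned)time(NULL));
	for (int i = 0; i < 4; i++)
		printf("rotor seeded at node %d\n", mask_random(mems_allowed));
	return 0;
}
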
diff --git a/kernel/cred.c b/kernel/cred.c
index 5557b55048df..8ef31f53c44c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,4 +1,4 @@
1/* Task credentials management - see Documentation/credentials.txt 1/* Task credentials management - see Documentation/security/credentials.txt
2 * 2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
@@ -49,11 +49,12 @@ struct cred init_cred = {
49 .magic = CRED_MAGIC, 49 .magic = CRED_MAGIC,
50#endif 50#endif
51 .securebits = SECUREBITS_DEFAULT, 51 .securebits = SECUREBITS_DEFAULT,
52 .cap_inheritable = CAP_INIT_INH_SET, 52 .cap_inheritable = CAP_EMPTY_SET,
53 .cap_permitted = CAP_FULL_SET, 53 .cap_permitted = CAP_FULL_SET,
54 .cap_effective = CAP_INIT_EFF_SET, 54 .cap_effective = CAP_FULL_SET,
55 .cap_bset = CAP_INIT_BSET, 55 .cap_bset = CAP_FULL_SET,
56 .user = INIT_USER, 56 .user = INIT_USER,
57 .user_ns = &init_user_ns,
57 .group_info = &init_groups, 58 .group_info = &init_groups,
58#ifdef CONFIG_KEYS 59#ifdef CONFIG_KEYS
59 .tgcred = &init_tgcred, 60 .tgcred = &init_tgcred,
@@ -410,6 +411,11 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
410 goto error_put; 411 goto error_put;
411 } 412 }
412 413
414 /* cache user_ns in cred. Doesn't need a refcount because it will
415 * stay pinned by cred->user
416 */
417 new->user_ns = new->user->user_ns;
418
413#ifdef CONFIG_KEYS 419#ifdef CONFIG_KEYS
414 /* new threads get their own thread keyrings if their parent already 420 /* new threads get their own thread keyrings if their parent already
415 * had one */ 421 * had one */
@@ -502,10 +508,8 @@ int commit_creds(struct cred *new)
502 key_fsgid_changed(task); 508 key_fsgid_changed(task);
503 509
504 /* do it 510 /* do it
505 * - What if a process setreuid()'s and this brings the 511 * RLIMIT_NPROC limits on user->processes have already been checked
506 * new uid over his NPROC rlimit? We can check this now 512 * in set_user().
507 * cheaply with the new uid cache, so if it matters
508 * we should be checking for it. -DaveM
509 */ 513 */
510 alter_cred_subscribers(new, 2); 514 alter_cred_subscribers(new, 2);
511 if (new->user != old->user) 515 if (new->user != old->user)
@@ -741,12 +745,6 @@ int set_create_files_as(struct cred *new, struct inode *inode)
741} 745}
742EXPORT_SYMBOL(set_create_files_as); 746EXPORT_SYMBOL(set_create_files_as);
743 747
744struct user_namespace *current_user_ns(void)
745{
746 return _current_user_ns();
747}
748EXPORT_SYMBOL(current_user_ns);
749
750#ifdef CONFIG_DEBUG_CREDENTIALS 748#ifdef CONFIG_DEBUG_CREDENTIALS
751 749
752bool creds_are_invalid(const struct cred *cred) 750bool creds_are_invalid(const struct cred *cred)
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index bad6786dee88..0d7c08784efb 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -51,7 +51,7 @@
51 51
52#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
53#include <asm/byteorder.h> 53#include <asm/byteorder.h>
54#include <asm/atomic.h> 54#include <linux/atomic.h>
55#include <asm/system.h> 55#include <asm/system.h>
56 56
57#include "debug_core.h" 57#include "debug_core.h"
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index a11db956dd62..34872482315e 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -42,6 +42,8 @@
42/* Our I/O buffers. */ 42/* Our I/O buffers. */
43static char remcom_in_buffer[BUFMAX]; 43static char remcom_in_buffer[BUFMAX];
44static char remcom_out_buffer[BUFMAX]; 44static char remcom_out_buffer[BUFMAX];
45static int gdbstub_use_prev_in_buf;
46static int gdbstub_prev_in_buf_pos;
45 47
46/* Storage for the registers, in GDB format. */ 48/* Storage for the registers, in GDB format. */
47static unsigned long gdb_regs[(NUMREGBYTES + 49static unsigned long gdb_regs[(NUMREGBYTES +
@@ -58,6 +60,13 @@ static int gdbstub_read_wait(void)
58 int ret = -1; 60 int ret = -1;
59 int i; 61 int i;
60 62
63 if (unlikely(gdbstub_use_prev_in_buf)) {
64 if (gdbstub_prev_in_buf_pos < gdbstub_use_prev_in_buf)
65 return remcom_in_buffer[gdbstub_prev_in_buf_pos++];
66 else
67 gdbstub_use_prev_in_buf = 0;
68 }
69
61 /* poll any additional I/O interfaces that are defined */ 70 /* poll any additional I/O interfaces that are defined */
62 while (ret < 0) 71 while (ret < 0)
63 for (i = 0; kdb_poll_funcs[i] != NULL; i++) { 72 for (i = 0; kdb_poll_funcs[i] != NULL; i++) {
@@ -109,7 +118,6 @@ static void get_packet(char *buffer)
109 buffer[count] = ch; 118 buffer[count] = ch;
110 count = count + 1; 119 count = count + 1;
111 } 120 }
112 buffer[count] = 0;
113 121
114 if (ch == '#') { 122 if (ch == '#') {
115 xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; 123 xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4;
@@ -124,6 +132,7 @@ static void get_packet(char *buffer)
124 if (dbg_io_ops->flush) 132 if (dbg_io_ops->flush)
125 dbg_io_ops->flush(); 133 dbg_io_ops->flush();
126 } 134 }
135 buffer[count] = 0;
127 } while (checksum != xmitcsum); 136 } while (checksum != xmitcsum);
128} 137}
129 138
@@ -1082,12 +1091,11 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
1082 case 'c': 1091 case 'c':
1083 strcpy(remcom_in_buffer, cmd); 1092 strcpy(remcom_in_buffer, cmd);
1084 return 0; 1093 return 0;
1085 case '?': 1094 case '$':
1086 gdb_cmd_status(ks); 1095 strcpy(remcom_in_buffer, cmd);
1087 break; 1096 gdbstub_use_prev_in_buf = strlen(remcom_in_buffer);
1088 case '\0': 1097 gdbstub_prev_in_buf_pos = 0;
1089 strcpy(remcom_out_buffer, ""); 1098 return 0;
1090 break;
1091 } 1099 }
1092 dbg_io_ops->write_char('+'); 1100 dbg_io_ops->write_char('+');
1093 put_packet(remcom_out_buffer); 1101 put_packet(remcom_out_buffer);
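
gdbstub_read_wait() can now replay a packet that kdb already consumed: the new '$' case in gdbstub_state() copies it into remcom_in_buffer and sets gdbstub_use_prev_in_buf to its length, so subsequent reads are served from that buffer before any real I/O is polled. A minimal userspace model of that replay path, all names invented:

#include <stdio.h>
#include <string.h>

static char in_buf[64];
static int use_prev;	/* length of a previously captured packet, 0 = none */
static int prev_pos;

static int read_wait(void)
{
	if (use_prev) {
		if (prev_pos < use_prev)
			return in_buf[prev_pos++];
		use_prev = 0;	/* buffer drained, fall back to real input */
	}
	return getchar();	/* stand-in for polling dbg_io_ops */
}

int main(void)
{
	strcpy(in_buf, "$?#3f");	/* packet kdb captured before handing over */
	use_prev = (int)strlen(in_buf);
	prev_pos = 0;

	/* The first strlen("$?#3f") reads come from the replay buffer. */
	for (int i = 0; i < 5; i++)
		putchar(read_wait());
	putchar('\n');
	return 0;
}
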
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 2f62fe85f16a..7179eac7b41c 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -112,9 +112,8 @@ kdb_bt(int argc, const char **argv)
112 unsigned long addr; 112 unsigned long addr;
113 long offset; 113 long offset;
114 114
115 kdbgetintenv("BTARGS", &argcount); /* Arguments to print */ 115 /* Prompt after each proc in bta */
116 kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each 116 kdbgetintenv("BTAPROMPT", &btaprompt);
117 * proc in bta */
118 117
119 if (strcmp(argv[0], "bta") == 0) { 118 if (strcmp(argv[0], "bta") == 0) {
120 struct task_struct *g, *p; 119 struct task_struct *g, *p;
diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds
index 56c88e4db309..9834ad303ab6 100644
--- a/kernel/debug/kdb/kdb_cmds
+++ b/kernel/debug/kdb/kdb_cmds
@@ -18,16 +18,12 @@ defcmd dumpcommon "" "Common kdb debugging"
18endefcmd 18endefcmd
19 19
20defcmd dumpall "" "First line debugging" 20defcmd dumpall "" "First line debugging"
21 set BTSYMARG 1
22 set BTARGS 9
23 pid R 21 pid R
24 -dumpcommon 22 -dumpcommon
25 -bta 23 -bta
26endefcmd 24endefcmd
27 25
28defcmd dumpcpu "" "Same as dumpall but only tasks on cpus" 26defcmd dumpcpu "" "Same as dumpall but only tasks on cpus"
29 set BTSYMARG 1
30 set BTARGS 9
31 pid R 27 pid R
32 -dumpcommon 28 -dumpcommon
33 -btc 29 -btc
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index dd0b1b7dd02c..d9ca9aa481ec 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -30,6 +30,8 @@ EXPORT_SYMBOL_GPL(kdb_poll_funcs);
30int kdb_poll_idx = 1; 30int kdb_poll_idx = 1;
31EXPORT_SYMBOL_GPL(kdb_poll_idx); 31EXPORT_SYMBOL_GPL(kdb_poll_idx);
32 32
33static struct kgdb_state *kdb_ks;
34
33int kdb_stub(struct kgdb_state *ks) 35int kdb_stub(struct kgdb_state *ks)
34{ 36{
35 int error = 0; 37 int error = 0;
@@ -39,6 +41,7 @@ int kdb_stub(struct kgdb_state *ks)
39 kdb_dbtrap_t db_result = KDB_DB_NOBPT; 41 kdb_dbtrap_t db_result = KDB_DB_NOBPT;
40 int i; 42 int i;
41 43
44 kdb_ks = ks;
42 if (KDB_STATE(REENTRY)) { 45 if (KDB_STATE(REENTRY)) {
43 reason = KDB_REASON_SWITCH; 46 reason = KDB_REASON_SWITCH;
44 KDB_STATE_CLEAR(REENTRY); 47 KDB_STATE_CLEAR(REENTRY);
@@ -123,20 +126,8 @@ int kdb_stub(struct kgdb_state *ks)
123 KDB_STATE_CLEAR(PAGER); 126 KDB_STATE_CLEAR(PAGER);
124 kdbnearsym_cleanup(); 127 kdbnearsym_cleanup();
125 if (error == KDB_CMD_KGDB) { 128 if (error == KDB_CMD_KGDB) {
126 if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) { 129 if (KDB_STATE(DOING_KGDB))
127 /*
128 * This inteface glue which allows kdb to transition in into
129 * the gdb stub. In order to do this the '?' or '' gdb serial
130 * packet response is processed here. And then control is
131 * passed to the gdbstub.
132 */
133 if (KDB_STATE(DOING_KGDB))
134 gdbstub_state(ks, "?");
135 else
136 gdbstub_state(ks, "");
137 KDB_STATE_CLEAR(DOING_KGDB); 130 KDB_STATE_CLEAR(DOING_KGDB);
138 KDB_STATE_CLEAR(DOING_KGDB2);
139 }
140 return DBG_PASS_EVENT; 131 return DBG_PASS_EVENT;
141 } 132 }
142 kdb_bp_install(ks->linux_regs); 133 kdb_bp_install(ks->linux_regs);
@@ -166,3 +157,7 @@ int kdb_stub(struct kgdb_state *ks)
166 return kgdb_info[ks->cpu].ret_state; 157 return kgdb_info[ks->cpu].ret_state;
167} 158}
168 159
160void kdb_gdb_state_pass(char *buf)
161{
162 gdbstub_state(kdb_ks, buf);
163}
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 96fdaac46a80..4802eb5840e1 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -31,15 +31,21 @@ char kdb_prompt_str[CMD_BUFLEN];
31 31
32int kdb_trap_printk; 32int kdb_trap_printk;
33 33
34static void kgdb_transition_check(char *buffer) 34static int kgdb_transition_check(char *buffer)
35{ 35{
36 int slen = strlen(buffer); 36 if (buffer[0] != '+' && buffer[0] != '$') {
37 if (strncmp(buffer, "$?#3f", slen) != 0 &&
38 strncmp(buffer, "$qSupported#37", slen) != 0 &&
39 strncmp(buffer, "+$qSupported#37", slen) != 0) {
40 KDB_STATE_SET(KGDB_TRANS); 37 KDB_STATE_SET(KGDB_TRANS);
41 kdb_printf("%s", buffer); 38 kdb_printf("%s", buffer);
39 } else {
40 int slen = strlen(buffer);
41 if (slen > 3 && buffer[slen - 3] == '#') {
42 kdb_gdb_state_pass(buffer);
43 strcpy(buffer, "kgdb");
44 KDB_STATE_SET(DOING_KGDB);
45 return 1;
46 }
42 } 47 }
48 return 0;
43} 49}
44 50
45static int kdb_read_get_key(char *buffer, size_t bufsize) 51static int kdb_read_get_key(char *buffer, size_t bufsize)
@@ -251,6 +257,10 @@ poll_again:
251 case 13: /* enter */ 257 case 13: /* enter */
252 *lastchar++ = '\n'; 258 *lastchar++ = '\n';
253 *lastchar++ = '\0'; 259 *lastchar++ = '\0';
260 if (!KDB_STATE(KGDB_TRANS)) {
261 KDB_STATE_SET(KGDB_TRANS);
262 kdb_printf("%s", buffer);
263 }
254 kdb_printf("\n"); 264 kdb_printf("\n");
255 return buffer; 265 return buffer;
256 case 4: /* Del */ 266 case 4: /* Del */
@@ -382,22 +392,26 @@ poll_again:
382 * printed characters if we think that 392 * printed characters if we think that
383 * kgdb is connecting, until the check 393 * kgdb is connecting, until the check
384 * fails */ 394 * fails */
385 if (!KDB_STATE(KGDB_TRANS)) 395 if (!KDB_STATE(KGDB_TRANS)) {
386 kgdb_transition_check(buffer); 396 if (kgdb_transition_check(buffer))
387 else 397 return buffer;
398 } else {
388 kdb_printf("%c", key); 399 kdb_printf("%c", key);
400 }
389 } 401 }
390 /* Special escape to kgdb */ 402 /* Special escape to kgdb */
391 if (lastchar - buffer >= 5 && 403 if (lastchar - buffer >= 5 &&
392 strcmp(lastchar - 5, "$?#3f") == 0) { 404 strcmp(lastchar - 5, "$?#3f") == 0) {
405 kdb_gdb_state_pass(lastchar - 5);
393 strcpy(buffer, "kgdb"); 406 strcpy(buffer, "kgdb");
394 KDB_STATE_SET(DOING_KGDB); 407 KDB_STATE_SET(DOING_KGDB);
395 return buffer; 408 return buffer;
396 } 409 }
397 if (lastchar - buffer >= 14 && 410 if (lastchar - buffer >= 11 &&
398 strcmp(lastchar - 14, "$qSupported#37") == 0) { 411 strcmp(lastchar - 11, "$qSupported") == 0) {
412 kdb_gdb_state_pass(lastchar - 11);
399 strcpy(buffer, "kgdb"); 413 strcpy(buffer, "kgdb");
400 KDB_STATE_SET(DOING_KGDB2); 414 KDB_STATE_SET(DOING_KGDB);
401 return buffer; 415 return buffer;
402 } 416 }
403 } 417 }
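
kgdb_transition_check() above no longer matches a fixed list of packets; any buffer starting with '+' or '$' and ending in '#' plus a two-digit checksum is treated as a gdb remote-protocol packet and handed to kdb_gdb_state_pass(). The framing it keys on is "$<payload>#<two hex checksum digits>", the checksum being the payload bytes summed modulo 256, which is where the literal strings "$?#3f" and "$qSupported#37" in the code come from. A small standalone check of that framing (illustrative only):

#include <stdio.h>
#include <string.h>

/* Build a gdb remote-protocol packet: $<payload>#<checksum>, where the
 * checksum is the sum of the payload bytes modulo 256. */
static void make_packet(const char *payload, char *out, size_t outsz)
{
	unsigned sum = 0;

	for (const char *p = payload; *p; p++)
		sum += (unsigned char)*p;
	snprintf(out, outsz, "$%s#%02x", payload, sum & 0xff);
}

int main(void)
{
	char pkt[128];

	make_packet("?", pkt, sizeof(pkt));
	printf("%s\n", pkt);		/* prints "$?#3f" */
	make_packet("qSupported", pkt, sizeof(pkt));
	printf("%s\n", pkt);		/* prints "$qSupported#37" */
	return 0;
}
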
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index be14779bcef6..63786e71a3cd 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -145,7 +145,6 @@ static char *__env[] = {
145#endif 145#endif
146 "RADIX=16", 146 "RADIX=16",
147 "MDCOUNT=8", /* lines of md output */ 147 "MDCOUNT=8", /* lines of md output */
148 "BTARGS=9", /* 9 possible args in bt */
149 KDB_PLATFORM_ENV, 148 KDB_PLATFORM_ENV,
150 "DTABCOUNT=30", 149 "DTABCOUNT=30",
151 "NOSECT=1", 150 "NOSECT=1",
@@ -172,6 +171,7 @@ static char *__env[] = {
172 (char *)0, 171 (char *)0,
173 (char *)0, 172 (char *)0,
174 (char *)0, 173 (char *)0,
174 (char *)0,
175}; 175};
176 176
177static const int __nenv = (sizeof(__env) / sizeof(char *)); 177static const int __nenv = (sizeof(__env) / sizeof(char *));
@@ -1386,7 +1386,7 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1386 } 1386 }
1387 1387
1388 if (result == KDB_CMD_KGDB) { 1388 if (result == KDB_CMD_KGDB) {
1389 if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2))) 1389 if (!KDB_STATE(DOING_KGDB))
1390 kdb_printf("Entering please attach debugger " 1390 kdb_printf("Entering please attach debugger "
1391 "or use $D#44+ or $3#33\n"); 1391 "or use $D#44+ or $3#33\n");
1392 break; 1392 break;
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 35d69ed1dfb5..e381d105b40b 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -21,7 +21,6 @@
21#define KDB_CMD_SS (-1003) 21#define KDB_CMD_SS (-1003)
22#define KDB_CMD_SSB (-1004) 22#define KDB_CMD_SSB (-1004)
23#define KDB_CMD_KGDB (-1005) 23#define KDB_CMD_KGDB (-1005)
24#define KDB_CMD_KGDB2 (-1006)
25 24
26/* Internal debug flags */ 25/* Internal debug flags */
27#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */ 26#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */
@@ -146,7 +145,6 @@ extern int kdb_state;
146 * keyboard on this cpu */ 145 * keyboard on this cpu */
147#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */ 146#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */
148#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */ 147#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */
149#define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */
150#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */ 148#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */
151#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch 149#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch
152 * specific use */ 150 * specific use */
@@ -218,6 +216,7 @@ extern void kdb_print_nameval(const char *name, unsigned long val);
218extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); 216extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
219extern void kdb_meminfo_proc_show(void); 217extern void kdb_meminfo_proc_show(void);
220extern char *kdb_getstr(char *, size_t, char *); 218extern char *kdb_getstr(char *, size_t, char *);
219extern void kdb_gdb_state_pass(char *buf);
221 220
222/* Defines for kdb_symbol_print */ 221/* Defines for kdb_symbol_print */
223#define KDB_SP_SPACEB 0x0001 /* Space before string */ 222#define KDB_SP_SPACEB 0x0001 /* Space before string */
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ead9b610aa71..418b3f7053aa 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -19,8 +19,10 @@
19#include <linux/time.h> 19#include <linux/time.h>
20#include <linux/sysctl.h> 20#include <linux/sysctl.h>
21#include <linux/delayacct.h> 21#include <linux/delayacct.h>
22#include <linux/module.h>
22 23
23int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ 24int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
25EXPORT_SYMBOL_GPL(delayacct_on);
24struct kmem_cache *delayacct_cache; 26struct kmem_cache *delayacct_cache;
25 27
26static int __init delayacct_setup_disable(char *str) 28static int __init delayacct_setup_disable(char *str)
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
new file mode 100644
index 000000000000..89e5e8aa4c36
--- /dev/null
+++ b/kernel/events/Makefile
@@ -0,0 +1,6 @@
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg
3endif
4
5obj-y := core.o ring_buffer.o
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/perf_event.c b/kernel/events/core.c
index 8e81a9860a0d..b8785e26ee1c 100644
--- a/kernel/perf_event.c
+++ b/kernel/events/core.c
@@ -2,8 +2,8 @@
2 * Performance events core code: 2 * Performance events core code:
3 * 3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
@@ -36,13 +36,15 @@
36#include <linux/ftrace_event.h> 36#include <linux/ftrace_event.h>
37#include <linux/hw_breakpoint.h> 37#include <linux/hw_breakpoint.h>
38 38
39#include "internal.h"
40
39#include <asm/irq_regs.h> 41#include <asm/irq_regs.h>
40 42
41struct remote_function_call { 43struct remote_function_call {
42 struct task_struct *p; 44 struct task_struct *p;
43 int (*func)(void *info); 45 int (*func)(void *info);
44 void *info; 46 void *info;
45 int ret; 47 int ret;
46}; 48};
47 49
48static void remote_function(void *data) 50static void remote_function(void *data)
@@ -76,10 +78,10 @@ static int
76task_function_call(struct task_struct *p, int (*func) (void *info), void *info) 78task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
77{ 79{
78 struct remote_function_call data = { 80 struct remote_function_call data = {
79 .p = p, 81 .p = p,
80 .func = func, 82 .func = func,
81 .info = info, 83 .info = info,
82 .ret = -ESRCH, /* No such (running) process */ 84 .ret = -ESRCH, /* No such (running) process */
83 }; 85 };
84 86
85 if (task_curr(p)) 87 if (task_curr(p))
@@ -100,10 +102,10 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
100static int cpu_function_call(int cpu, int (*func) (void *info), void *info) 102static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
101{ 103{
102 struct remote_function_call data = { 104 struct remote_function_call data = {
103 .p = NULL, 105 .p = NULL,
104 .func = func, 106 .func = func,
105 .info = info, 107 .info = info,
106 .ret = -ENXIO, /* No such CPU */ 108 .ret = -ENXIO, /* No such CPU */
107 }; 109 };
108 110
109 smp_call_function_single(cpu, remote_function, &data, 1); 111 smp_call_function_single(cpu, remote_function, &data, 1);
@@ -125,7 +127,7 @@ enum event_type_t {
125 * perf_sched_events : >0 events exist 127 * perf_sched_events : >0 events exist
126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 128 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
127 */ 129 */
128atomic_t perf_sched_events __read_mostly; 130struct jump_label_key perf_sched_events __read_mostly;
129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 131static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
130 132
131static atomic_t nr_mmap_events __read_mostly; 133static atomic_t nr_mmap_events __read_mostly;
@@ -200,6 +202,22 @@ __get_cpu_context(struct perf_event_context *ctx)
200 return this_cpu_ptr(ctx->pmu->pmu_cpu_context); 202 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
201} 203}
202 204
205static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
206 struct perf_event_context *ctx)
207{
208 raw_spin_lock(&cpuctx->ctx.lock);
209 if (ctx)
210 raw_spin_lock(&ctx->lock);
211}
212
213static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
214 struct perf_event_context *ctx)
215{
216 if (ctx)
217 raw_spin_unlock(&ctx->lock);
218 raw_spin_unlock(&cpuctx->ctx.lock);
219}
220
203#ifdef CONFIG_CGROUP_PERF 221#ifdef CONFIG_CGROUP_PERF
204 222
205/* 223/*
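
perf_ctx_lock()/perf_ctx_unlock() added above fix one lock order for the rest of the file: the per-CPU context lock is always taken first, the (optional) task context lock is nested inside it, and they are dropped in the reverse order. A generic userspace sketch of the same helper pattern with pthread mutexes (purely illustrative, nothing perf-specific):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cpuctx_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t taskctx_lock = PTHREAD_MUTEX_INITIALIZER;

/* Outer lock first, inner lock (if any) nested inside it... */
static void ctx_lock(pthread_mutex_t *outer, pthread_mutex_t *inner)
{
	pthread_mutex_lock(outer);
	if (inner)
		pthread_mutex_lock(inner);
}

/* ...and released in the opposite order, so every path nests the same way. */
static void ctx_unlock(pthread_mutex_t *outer, pthread_mutex_t *inner)
{
	if (inner)
		pthread_mutex_unlock(inner);
	pthread_mutex_unlock(outer);
}

int main(void)
{
	ctx_lock(&cpuctx_lock, &taskctx_lock);
	puts("both contexts locked, safe to reschedule events");
	ctx_unlock(&cpuctx_lock, &taskctx_lock);

	ctx_lock(&cpuctx_lock, NULL);	/* path with no task context */
	puts("cpu context only");
	ctx_unlock(&cpuctx_lock, NULL);
	return 0;
}
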
@@ -340,11 +358,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
340 rcu_read_lock(); 358 rcu_read_lock();
341 359
342 list_for_each_entry_rcu(pmu, &pmus, entry) { 360 list_for_each_entry_rcu(pmu, &pmus, entry) {
343
344 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 361 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
345 362
346 perf_pmu_disable(cpuctx->ctx.pmu);
347
348 /* 363 /*
349 * perf_cgroup_events says at least one 364 * perf_cgroup_events says at least one
350 * context on this CPU has cgroup events. 365 * context on this CPU has cgroup events.
@@ -353,6 +368,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
353 * events for a context. 368 * events for a context.
354 */ 369 */
355 if (cpuctx->ctx.nr_cgroups > 0) { 370 if (cpuctx->ctx.nr_cgroups > 0) {
371 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
372 perf_pmu_disable(cpuctx->ctx.pmu);
356 373
357 if (mode & PERF_CGROUP_SWOUT) { 374 if (mode & PERF_CGROUP_SWOUT) {
358 cpu_ctx_sched_out(cpuctx, EVENT_ALL); 375 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
@@ -372,9 +389,9 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
372 cpuctx->cgrp = perf_cgroup_from_task(task); 389 cpuctx->cgrp = perf_cgroup_from_task(task);
373 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); 390 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
374 } 391 }
392 perf_pmu_enable(cpuctx->ctx.pmu);
393 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
375 } 394 }
376
377 perf_pmu_enable(cpuctx->ctx.pmu);
378 } 395 }
379 396
380 rcu_read_unlock(); 397 rcu_read_unlock();
@@ -586,14 +603,6 @@ static void get_ctx(struct perf_event_context *ctx)
586 WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); 603 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
587} 604}
588 605
589static void free_ctx(struct rcu_head *head)
590{
591 struct perf_event_context *ctx;
592
593 ctx = container_of(head, struct perf_event_context, rcu_head);
594 kfree(ctx);
595}
596
597static void put_ctx(struct perf_event_context *ctx) 606static void put_ctx(struct perf_event_context *ctx)
598{ 607{
599 if (atomic_dec_and_test(&ctx->refcount)) { 608 if (atomic_dec_and_test(&ctx->refcount)) {
@@ -601,7 +610,7 @@ static void put_ctx(struct perf_event_context *ctx)
601 put_ctx(ctx->parent_ctx); 610 put_ctx(ctx->parent_ctx);
602 if (ctx->task) 611 if (ctx->task)
603 put_task_struct(ctx->task); 612 put_task_struct(ctx->task);
604 call_rcu(&ctx->rcu_head, free_ctx); 613 kfree_rcu(ctx, rcu_head);
605 } 614 }
606} 615}
607 616
@@ -739,6 +748,7 @@ static u64 perf_event_time(struct perf_event *event)
739 748
740/* 749/*
741 * Update the total_time_enabled and total_time_running fields for a event. 750 * Update the total_time_enabled and total_time_running fields for a event.
751 * The caller of this function needs to hold the ctx->lock.
742 */ 752 */
743static void update_event_times(struct perf_event *event) 753static void update_event_times(struct perf_event *event)
744{ 754{
@@ -1113,6 +1123,10 @@ static int __perf_remove_from_context(void *info)
1113 raw_spin_lock(&ctx->lock); 1123 raw_spin_lock(&ctx->lock);
1114 event_sched_out(event, cpuctx, ctx); 1124 event_sched_out(event, cpuctx, ctx);
1115 list_del_event(event, ctx); 1125 list_del_event(event, ctx);
1126 if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1127 ctx->is_active = 0;
1128 cpuctx->task_ctx = NULL;
1129 }
1116 raw_spin_unlock(&ctx->lock); 1130 raw_spin_unlock(&ctx->lock);
1117 1131
1118 return 0; 1132 return 0;
@@ -1462,8 +1476,24 @@ static void add_event_to_ctx(struct perf_event *event,
1462 event->tstamp_stopped = tstamp; 1476 event->tstamp_stopped = tstamp;
1463} 1477}
1464 1478
1465static void perf_event_context_sched_in(struct perf_event_context *ctx, 1479static void task_ctx_sched_out(struct perf_event_context *ctx);
1466 struct task_struct *tsk); 1480static void
1481ctx_sched_in(struct perf_event_context *ctx,
1482 struct perf_cpu_context *cpuctx,
1483 enum event_type_t event_type,
1484 struct task_struct *task);
1485
1486static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
1487 struct perf_event_context *ctx,
1488 struct task_struct *task)
1489{
1490 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
1491 if (ctx)
1492 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1493 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1494 if (ctx)
1495 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1496}
1467 1497
1468/* 1498/*
1469 * Cross CPU call to install and enable a performance event 1499 * Cross CPU call to install and enable a performance event
@@ -1474,20 +1504,37 @@ static int __perf_install_in_context(void *info)
1474{ 1504{
1475 struct perf_event *event = info; 1505 struct perf_event *event = info;
1476 struct perf_event_context *ctx = event->ctx; 1506 struct perf_event_context *ctx = event->ctx;
1477 struct perf_event *leader = event->group_leader;
1478 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1507 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1479 int err; 1508 struct perf_event_context *task_ctx = cpuctx->task_ctx;
1509 struct task_struct *task = current;
1510
1511 perf_ctx_lock(cpuctx, task_ctx);
1512 perf_pmu_disable(cpuctx->ctx.pmu);
1480 1513
1481 /* 1514 /*
1482 * In case we're installing a new context to an already running task, 1515 * If there was an active task_ctx schedule it out.
1483 * could also happen before perf_event_task_sched_in() on architectures
1484 * which do context switches with IRQs enabled.
1485 */ 1516 */
1486 if (ctx->task && !cpuctx->task_ctx) 1517 if (task_ctx)
1487 perf_event_context_sched_in(ctx, ctx->task); 1518 task_ctx_sched_out(task_ctx);
1519
1520 /*
1521 * If the context we're installing events in is not the
1522 * active task_ctx, flip them.
1523 */
1524 if (ctx->task && task_ctx != ctx) {
1525 if (task_ctx)
1526 raw_spin_unlock(&task_ctx->lock);
1527 raw_spin_lock(&ctx->lock);
1528 task_ctx = ctx;
1529 }
1530
1531 if (task_ctx) {
1532 cpuctx->task_ctx = task_ctx;
1533 task = task_ctx->task;
1534 }
1535
1536 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
1488 1537
1489 raw_spin_lock(&ctx->lock);
1490 ctx->is_active = 1;
1491 update_context_time(ctx); 1538 update_context_time(ctx);
1492 /* 1539 /*
1493 * update cgrp time only if current cgrp 1540 * update cgrp time only if current cgrp
@@ -1498,43 +1545,13 @@ static int __perf_install_in_context(void *info)
1498 1545
1499 add_event_to_ctx(event, ctx); 1546 add_event_to_ctx(event, ctx);
1500 1547
1501 if (!event_filter_match(event))
1502 goto unlock;
1503
1504 /* 1548 /*
1505 * Don't put the event on if it is disabled or if 1549 * Schedule everything back in
1506 * it is in a group and the group isn't on.
1507 */ 1550 */
1508 if (event->state != PERF_EVENT_STATE_INACTIVE || 1551 perf_event_sched_in(cpuctx, task_ctx, task);
1509 (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
1510 goto unlock;
1511
1512 /*
1513 * An exclusive event can't go on if there are already active
1514 * hardware events, and no hardware event can go on if there
1515 * is already an exclusive event on.
1516 */
1517 if (!group_can_go_on(event, cpuctx, 1))
1518 err = -EEXIST;
1519 else
1520 err = event_sched_in(event, cpuctx, ctx);
1521
1522 if (err) {
1523 /*
1524 * This event couldn't go on. If it is in a group
1525 * then we have to pull the whole group off.
1526 * If the event group is pinned then put it in error state.
1527 */
1528 if (leader != event)
1529 group_sched_out(leader, cpuctx, ctx);
1530 if (leader->attr.pinned) {
1531 update_group_times(leader);
1532 leader->state = PERF_EVENT_STATE_ERROR;
1533 }
1534 }
1535 1552
1536unlock: 1553 perf_pmu_enable(cpuctx->ctx.pmu);
1537 raw_spin_unlock(&ctx->lock); 1554 perf_ctx_unlock(cpuctx, task_ctx);
1538 1555
1539 return 0; 1556 return 0;
1540} 1557}
@@ -1747,7 +1764,7 @@ out:
1747 raw_spin_unlock_irq(&ctx->lock); 1764 raw_spin_unlock_irq(&ctx->lock);
1748} 1765}
1749 1766
1750static int perf_event_refresh(struct perf_event *event, int refresh) 1767int perf_event_refresh(struct perf_event *event, int refresh)
1751{ 1768{
1752 /* 1769 /*
1753 * not supported on inherited events 1770 * not supported on inherited events
@@ -1760,36 +1777,35 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1760 1777
1761 return 0; 1778 return 0;
1762} 1779}
1780EXPORT_SYMBOL_GPL(perf_event_refresh);
1763 1781
1764static void ctx_sched_out(struct perf_event_context *ctx, 1782static void ctx_sched_out(struct perf_event_context *ctx,
1765 struct perf_cpu_context *cpuctx, 1783 struct perf_cpu_context *cpuctx,
1766 enum event_type_t event_type) 1784 enum event_type_t event_type)
1767{ 1785{
1768 struct perf_event *event; 1786 struct perf_event *event;
1787 int is_active = ctx->is_active;
1769 1788
1770 raw_spin_lock(&ctx->lock); 1789 ctx->is_active &= ~event_type;
1771 perf_pmu_disable(ctx->pmu);
1772 ctx->is_active = 0;
1773 if (likely(!ctx->nr_events)) 1790 if (likely(!ctx->nr_events))
1774 goto out; 1791 return;
1792
1775 update_context_time(ctx); 1793 update_context_time(ctx);
1776 update_cgrp_time_from_cpuctx(cpuctx); 1794 update_cgrp_time_from_cpuctx(cpuctx);
1777
1778 if (!ctx->nr_active) 1795 if (!ctx->nr_active)
1779 goto out; 1796 return;
1780 1797
1781 if (event_type & EVENT_PINNED) { 1798 perf_pmu_disable(ctx->pmu);
1799 if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
1782 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 1800 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1783 group_sched_out(event, cpuctx, ctx); 1801 group_sched_out(event, cpuctx, ctx);
1784 } 1802 }
1785 1803
1786 if (event_type & EVENT_FLEXIBLE) { 1804 if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
1787 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 1805 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1788 group_sched_out(event, cpuctx, ctx); 1806 group_sched_out(event, cpuctx, ctx);
1789 } 1807 }
1790out:
1791 perf_pmu_enable(ctx->pmu); 1808 perf_pmu_enable(ctx->pmu);
1792 raw_spin_unlock(&ctx->lock);
1793} 1809}
1794 1810
1795/* 1811/*
@@ -1937,8 +1953,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1937 rcu_read_unlock(); 1953 rcu_read_unlock();
1938 1954
1939 if (do_switch) { 1955 if (do_switch) {
1956 raw_spin_lock(&ctx->lock);
1940 ctx_sched_out(ctx, cpuctx, EVENT_ALL); 1957 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1941 cpuctx->task_ctx = NULL; 1958 cpuctx->task_ctx = NULL;
1959 raw_spin_unlock(&ctx->lock);
1942 } 1960 }
1943} 1961}
1944 1962
@@ -1973,8 +1991,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
1973 perf_cgroup_sched_out(task); 1991 perf_cgroup_sched_out(task);
1974} 1992}
1975 1993
1976static void task_ctx_sched_out(struct perf_event_context *ctx, 1994static void task_ctx_sched_out(struct perf_event_context *ctx)
1977 enum event_type_t event_type)
1978{ 1995{
1979 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1996 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1980 1997
@@ -1984,7 +2001,7 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
1984 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 2001 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1985 return; 2002 return;
1986 2003
1987 ctx_sched_out(ctx, cpuctx, event_type); 2004 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1988 cpuctx->task_ctx = NULL; 2005 cpuctx->task_ctx = NULL;
1989} 2006}
1990 2007
@@ -2063,11 +2080,11 @@ ctx_sched_in(struct perf_event_context *ctx,
2063 struct task_struct *task) 2080 struct task_struct *task)
2064{ 2081{
2065 u64 now; 2082 u64 now;
2083 int is_active = ctx->is_active;
2066 2084
2067 raw_spin_lock(&ctx->lock); 2085 ctx->is_active |= event_type;
2068 ctx->is_active = 1;
2069 if (likely(!ctx->nr_events)) 2086 if (likely(!ctx->nr_events))
2070 goto out; 2087 return;
2071 2088
2072 now = perf_clock(); 2089 now = perf_clock();
2073 ctx->timestamp = now; 2090 ctx->timestamp = now;
@@ -2076,15 +2093,12 @@ ctx_sched_in(struct perf_event_context *ctx,
2076 * First go through the list and put on any pinned groups 2093 * First go through the list and put on any pinned groups
2077 * in order to give them the best chance of going on. 2094 * in order to give them the best chance of going on.
2078 */ 2095 */
2079 if (event_type & EVENT_PINNED) 2096 if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2080 ctx_pinned_sched_in(ctx, cpuctx); 2097 ctx_pinned_sched_in(ctx, cpuctx);
2081 2098
2082 /* Then walk through the lower prio flexible groups */ 2099 /* Then walk through the lower prio flexible groups */
2083 if (event_type & EVENT_FLEXIBLE) 2100 if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2084 ctx_flexible_sched_in(ctx, cpuctx); 2101 ctx_flexible_sched_in(ctx, cpuctx);
2085
2086out:
2087 raw_spin_unlock(&ctx->lock);
2088} 2102}
2089 2103
2090static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 2104static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
@@ -2096,19 +2110,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2096 ctx_sched_in(ctx, cpuctx, event_type, task); 2110 ctx_sched_in(ctx, cpuctx, event_type, task);
2097} 2111}
2098 2112
2099static void task_ctx_sched_in(struct perf_event_context *ctx,
2100 enum event_type_t event_type)
2101{
2102 struct perf_cpu_context *cpuctx;
2103
2104 cpuctx = __get_cpu_context(ctx);
2105 if (cpuctx->task_ctx == ctx)
2106 return;
2107
2108 ctx_sched_in(ctx, cpuctx, event_type, NULL);
2109 cpuctx->task_ctx = ctx;
2110}
2111
2112static void perf_event_context_sched_in(struct perf_event_context *ctx, 2113static void perf_event_context_sched_in(struct perf_event_context *ctx,
2113 struct task_struct *task) 2114 struct task_struct *task)
2114{ 2115{
@@ -2118,6 +2119,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2118 if (cpuctx->task_ctx == ctx) 2119 if (cpuctx->task_ctx == ctx)
2119 return; 2120 return;
2120 2121
2122 perf_ctx_lock(cpuctx, ctx);
2121 perf_pmu_disable(ctx->pmu); 2123 perf_pmu_disable(ctx->pmu);
2122 /* 2124 /*
2123 * We want to keep the following priority order: 2125 * We want to keep the following priority order:
@@ -2126,18 +2128,18 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2126 */ 2128 */
2127 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2129 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2128 2130
2129 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); 2131 perf_event_sched_in(cpuctx, ctx, task);
2130 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2131 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2132 2132
2133 cpuctx->task_ctx = ctx; 2133 cpuctx->task_ctx = ctx;
2134 2134
2135 perf_pmu_enable(ctx->pmu);
2136 perf_ctx_unlock(cpuctx, ctx);
2137
2135 /* 2138 /*
2136 * Since these rotations are per-cpu, we need to ensure the 2139 * Since these rotations are per-cpu, we need to ensure the
2137 * cpu-context we got scheduled on is actually rotating. 2140 * cpu-context we got scheduled on is actually rotating.
2138 */ 2141 */
2139 perf_pmu_rotate_start(ctx->pmu); 2142 perf_pmu_rotate_start(ctx->pmu);
2140 perf_pmu_enable(ctx->pmu);
2141} 2143}
2142 2144
2143/* 2145/*
@@ -2277,7 +2279,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2277 u64 interrupts, now; 2279 u64 interrupts, now;
2278 s64 delta; 2280 s64 delta;
2279 2281
2280 raw_spin_lock(&ctx->lock);
2281 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 2282 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2282 if (event->state != PERF_EVENT_STATE_ACTIVE) 2283 if (event->state != PERF_EVENT_STATE_ACTIVE)
2283 continue; 2284 continue;
@@ -2309,7 +2310,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2309 if (delta > 0) 2310 if (delta > 0)
2310 perf_adjust_period(event, period, delta); 2311 perf_adjust_period(event, period, delta);
2311 } 2312 }
2312 raw_spin_unlock(&ctx->lock);
2313} 2313}
2314 2314
2315/* 2315/*
@@ -2317,16 +2317,12 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2317 */ 2317 */
2318static void rotate_ctx(struct perf_event_context *ctx) 2318static void rotate_ctx(struct perf_event_context *ctx)
2319{ 2319{
2320 raw_spin_lock(&ctx->lock);
2321
2322 /* 2320 /*
2323 * Rotate the first entry last of non-pinned groups. Rotation might be 2321 * Rotate the first entry last of non-pinned groups. Rotation might be
2324 * disabled by the inheritance code. 2322 * disabled by the inheritance code.
2325 */ 2323 */
2326 if (!ctx->rotate_disable) 2324 if (!ctx->rotate_disable)
2327 list_rotate_left(&ctx->flexible_groups); 2325 list_rotate_left(&ctx->flexible_groups);
2328
2329 raw_spin_unlock(&ctx->lock);
2330} 2326}
2331 2327
2332/* 2328/*
@@ -2353,6 +2349,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2353 rotate = 1; 2349 rotate = 1;
2354 } 2350 }
2355 2351
2352 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2356 perf_pmu_disable(cpuctx->ctx.pmu); 2353 perf_pmu_disable(cpuctx->ctx.pmu);
2357 perf_ctx_adjust_freq(&cpuctx->ctx, interval); 2354 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2358 if (ctx) 2355 if (ctx)
@@ -2363,21 +2360,20 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2363 2360
2364 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2361 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2365 if (ctx) 2362 if (ctx)
2366 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 2363 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
2367 2364
2368 rotate_ctx(&cpuctx->ctx); 2365 rotate_ctx(&cpuctx->ctx);
2369 if (ctx) 2366 if (ctx)
2370 rotate_ctx(ctx); 2367 rotate_ctx(ctx);
2371 2368
2372 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); 2369 perf_event_sched_in(cpuctx, ctx, current);
2373 if (ctx)
2374 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
2375 2370
2376done: 2371done:
2377 if (remove) 2372 if (remove)
2378 list_del_init(&cpuctx->rotation_list); 2373 list_del_init(&cpuctx->rotation_list);
2379 2374
2380 perf_pmu_enable(cpuctx->ctx.pmu); 2375 perf_pmu_enable(cpuctx->ctx.pmu);
2376 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2381} 2377}
2382 2378
2383void perf_event_task_tick(void) 2379void perf_event_task_tick(void)
@@ -2432,9 +2428,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2432 * in. 2428 * in.
2433 */ 2429 */
2434 perf_cgroup_sched_out(current); 2430 perf_cgroup_sched_out(current);
2435 task_ctx_sched_out(ctx, EVENT_ALL);
2436 2431
2437 raw_spin_lock(&ctx->lock); 2432 raw_spin_lock(&ctx->lock);
2433 task_ctx_sched_out(ctx);
2438 2434
2439 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 2435 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2440 ret = event_enable_on_exec(event, ctx); 2436 ret = event_enable_on_exec(event, ctx);
@@ -2843,16 +2839,12 @@ retry:
2843 unclone_ctx(ctx); 2839 unclone_ctx(ctx);
2844 ++ctx->pin_count; 2840 ++ctx->pin_count;
2845 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2841 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2846 } 2842 } else {
2847
2848 if (!ctx) {
2849 ctx = alloc_perf_context(pmu, task); 2843 ctx = alloc_perf_context(pmu, task);
2850 err = -ENOMEM; 2844 err = -ENOMEM;
2851 if (!ctx) 2845 if (!ctx)
2852 goto errout; 2846 goto errout;
2853 2847
2854 get_ctx(ctx);
2855
2856 err = 0; 2848 err = 0;
2857 mutex_lock(&task->perf_event_mutex); 2849 mutex_lock(&task->perf_event_mutex);
2858 /* 2850 /*
@@ -2864,14 +2856,14 @@ retry:
2864 else if (task->perf_event_ctxp[ctxn]) 2856 else if (task->perf_event_ctxp[ctxn])
2865 err = -EAGAIN; 2857 err = -EAGAIN;
2866 else { 2858 else {
2859 get_ctx(ctx);
2867 ++ctx->pin_count; 2860 ++ctx->pin_count;
2868 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); 2861 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2869 } 2862 }
2870 mutex_unlock(&task->perf_event_mutex); 2863 mutex_unlock(&task->perf_event_mutex);
2871 2864
2872 if (unlikely(err)) { 2865 if (unlikely(err)) {
2873 put_task_struct(task); 2866 put_ctx(ctx);
2874 kfree(ctx);
2875 2867
2876 if (err == -EAGAIN) 2868 if (err == -EAGAIN)
2877 goto retry; 2869 goto retry;
@@ -2898,7 +2890,7 @@ static void free_event_rcu(struct rcu_head *head)
2898 kfree(event); 2890 kfree(event);
2899} 2891}
2900 2892
2901static void perf_buffer_put(struct perf_buffer *buffer); 2893static void ring_buffer_put(struct ring_buffer *rb);
2902 2894
2903static void free_event(struct perf_event *event) 2895static void free_event(struct perf_event *event)
2904{ 2896{
@@ -2921,9 +2913,9 @@ static void free_event(struct perf_event *event)
2921 } 2913 }
2922 } 2914 }
2923 2915
2924 if (event->buffer) { 2916 if (event->rb) {
2925 perf_buffer_put(event->buffer); 2917 ring_buffer_put(event->rb);
2926 event->buffer = NULL; 2918 event->rb = NULL;
2927 } 2919 }
2928 2920
2929 if (is_cgroup_event(event)) 2921 if (is_cgroup_event(event))
@@ -2942,12 +2934,6 @@ int perf_event_release_kernel(struct perf_event *event)
2942{ 2934{
2943 struct perf_event_context *ctx = event->ctx; 2935 struct perf_event_context *ctx = event->ctx;
2944 2936
2945 /*
2946 * Remove from the PMU, can't get re-enabled since we got
2947 * here because the last ref went.
2948 */
2949 perf_event_disable(event);
2950
2951 WARN_ON_ONCE(ctx->parent_ctx); 2937 WARN_ON_ONCE(ctx->parent_ctx);
2952 /* 2938 /*
2953 * There are two ways this annotation is useful: 2939 * There are two ways this annotation is useful:
@@ -2964,8 +2950,8 @@ int perf_event_release_kernel(struct perf_event *event)
2964 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); 2950 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
2965 raw_spin_lock_irq(&ctx->lock); 2951 raw_spin_lock_irq(&ctx->lock);
2966 perf_group_detach(event); 2952 perf_group_detach(event);
2967 list_del_event(event, ctx);
2968 raw_spin_unlock_irq(&ctx->lock); 2953 raw_spin_unlock_irq(&ctx->lock);
2954 perf_remove_from_context(event);
2969 mutex_unlock(&ctx->mutex); 2955 mutex_unlock(&ctx->mutex);
2970 2956
2971 free_event(event); 2957 free_event(event);
@@ -3157,13 +3143,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3157static unsigned int perf_poll(struct file *file, poll_table *wait) 3143static unsigned int perf_poll(struct file *file, poll_table *wait)
3158{ 3144{
3159 struct perf_event *event = file->private_data; 3145 struct perf_event *event = file->private_data;
3160 struct perf_buffer *buffer; 3146 struct ring_buffer *rb;
3161 unsigned int events = POLL_HUP; 3147 unsigned int events = POLL_HUP;
3162 3148
3163 rcu_read_lock(); 3149 rcu_read_lock();
3164 buffer = rcu_dereference(event->buffer); 3150 rb = rcu_dereference(event->rb);
3165 if (buffer) 3151 if (rb)
3166 events = atomic_xchg(&buffer->poll, 0); 3152 events = atomic_xchg(&rb->poll, 0);
3167 rcu_read_unlock(); 3153 rcu_read_unlock();
3168 3154
3169 poll_wait(file, &event->waitq, wait); 3155 poll_wait(file, &event->waitq, wait);
@@ -3366,6 +3352,18 @@ static int perf_event_index(struct perf_event *event)
3366 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; 3352 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
3367} 3353}
3368 3354
3355static void calc_timer_values(struct perf_event *event,
3356 u64 *running,
3357 u64 *enabled)
3358{
3359 u64 now, ctx_time;
3360
3361 now = perf_clock();
3362 ctx_time = event->shadow_ctx_time + now;
3363 *enabled = ctx_time - event->tstamp_enabled;
3364 *running = ctx_time - event->tstamp_running;
3365}
3366
3369/* 3367/*
3370 * Callers need to ensure there can be no nesting of this function, otherwise 3368 * Callers need to ensure there can be no nesting of this function, otherwise
3371 * the seqlock logic goes bad. We can not serialize this because the arch 3369 * the seqlock logic goes bad. We can not serialize this because the arch
@@ -3374,14 +3372,25 @@ static int perf_event_index(struct perf_event *event)
3374void perf_event_update_userpage(struct perf_event *event) 3372void perf_event_update_userpage(struct perf_event *event)
3375{ 3373{
3376 struct perf_event_mmap_page *userpg; 3374 struct perf_event_mmap_page *userpg;
3377 struct perf_buffer *buffer; 3375 struct ring_buffer *rb;
3376 u64 enabled, running;
3378 3377
3379 rcu_read_lock(); 3378 rcu_read_lock();
3380 buffer = rcu_dereference(event->buffer); 3379 /*
3381 if (!buffer) 3380 * compute total_time_enabled, total_time_running
3381 * based on snapshot values taken when the event
3382 * was last scheduled in.
3383 *
3384 * we cannot simply call update_context_time()
3385 * because of locking issues, as we can be called in
3386 * NMI context
3387 */
3388 calc_timer_values(event, &enabled, &running);
3389 rb = rcu_dereference(event->rb);
3390 if (!rb)
3382 goto unlock; 3391 goto unlock;
3383 3392
3384 userpg = buffer->user_page; 3393 userpg = rb->user_page;
3385 3394
3386 /* 3395 /*
3387 * Disable preemption so as to not let the corresponding user-space 3396 * Disable preemption so as to not let the corresponding user-space
@@ -3395,10 +3404,10 @@ void perf_event_update_userpage(struct perf_event *event)
3395 if (event->state == PERF_EVENT_STATE_ACTIVE) 3404 if (event->state == PERF_EVENT_STATE_ACTIVE)
3396 userpg->offset -= local64_read(&event->hw.prev_count); 3405 userpg->offset -= local64_read(&event->hw.prev_count);
3397 3406
3398 userpg->time_enabled = event->total_time_enabled + 3407 userpg->time_enabled = enabled +
3399 atomic64_read(&event->child_total_time_enabled); 3408 atomic64_read(&event->child_total_time_enabled);
3400 3409
3401 userpg->time_running = event->total_time_running + 3410 userpg->time_running = running +
3402 atomic64_read(&event->child_total_time_running); 3411 atomic64_read(&event->child_total_time_running);
3403 3412
3404 barrier(); 3413 barrier();
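
The user page updated above is read locklessly from user space: the kernel bumps a sequence counter around the update and the reader retries until it sees a stable snapshot. A minimal user-space sketch of that reader, assuming a perf_event_mmap_page obtained by mmap()ing the event fd (field names from the uapi header; the barrier choice is illustrative and not part of this patch):

    #include <stdint.h>
    #include <linux/perf_event.h>

    /* Snapshot the timing fields published by perf_event_update_userpage(). */
    static void read_times(volatile struct perf_event_mmap_page *pc,
                           uint64_t *enabled, uint64_t *running)
    {
            uint32_t seq;

            do {
                    seq = pc->lock;                 /* sequence count, bumped around updates */
                    __sync_synchronize();
                    *enabled = pc->time_enabled;
                    *running = pc->time_running;
                    __sync_synchronize();
            } while (pc->lock != seq);              /* retry if an update raced with us */
    }
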
@@ -3408,220 +3417,10 @@ unlock:
3408 rcu_read_unlock(); 3417 rcu_read_unlock();
3409} 3418}
3410 3419
3411static unsigned long perf_data_size(struct perf_buffer *buffer);
3412
3413static void
3414perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
3415{
3416 long max_size = perf_data_size(buffer);
3417
3418 if (watermark)
3419 buffer->watermark = min(max_size, watermark);
3420
3421 if (!buffer->watermark)
3422 buffer->watermark = max_size / 2;
3423
3424 if (flags & PERF_BUFFER_WRITABLE)
3425 buffer->writable = 1;
3426
3427 atomic_set(&buffer->refcount, 1);
3428}
3429
3430#ifndef CONFIG_PERF_USE_VMALLOC
3431
3432/*
3433 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
3434 */
3435
3436static struct page *
3437perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
3438{
3439 if (pgoff > buffer->nr_pages)
3440 return NULL;
3441
3442 if (pgoff == 0)
3443 return virt_to_page(buffer->user_page);
3444
3445 return virt_to_page(buffer->data_pages[pgoff - 1]);
3446}
3447
3448static void *perf_mmap_alloc_page(int cpu)
3449{
3450 struct page *page;
3451 int node;
3452
3453 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
3454 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
3455 if (!page)
3456 return NULL;
3457
3458 return page_address(page);
3459}
3460
3461static struct perf_buffer *
3462perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
3463{
3464 struct perf_buffer *buffer;
3465 unsigned long size;
3466 int i;
3467
3468 size = sizeof(struct perf_buffer);
3469 size += nr_pages * sizeof(void *);
3470
3471 buffer = kzalloc(size, GFP_KERNEL);
3472 if (!buffer)
3473 goto fail;
3474
3475 buffer->user_page = perf_mmap_alloc_page(cpu);
3476 if (!buffer->user_page)
3477 goto fail_user_page;
3478
3479 for (i = 0; i < nr_pages; i++) {
3480 buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
3481 if (!buffer->data_pages[i])
3482 goto fail_data_pages;
3483 }
3484
3485 buffer->nr_pages = nr_pages;
3486
3487 perf_buffer_init(buffer, watermark, flags);
3488
3489 return buffer;
3490
3491fail_data_pages:
3492 for (i--; i >= 0; i--)
3493 free_page((unsigned long)buffer->data_pages[i]);
3494
3495 free_page((unsigned long)buffer->user_page);
3496
3497fail_user_page:
3498 kfree(buffer);
3499
3500fail:
3501 return NULL;
3502}
3503
3504static void perf_mmap_free_page(unsigned long addr)
3505{
3506 struct page *page = virt_to_page((void *)addr);
3507
3508 page->mapping = NULL;
3509 __free_page(page);
3510}
3511
3512static void perf_buffer_free(struct perf_buffer *buffer)
3513{
3514 int i;
3515
3516 perf_mmap_free_page((unsigned long)buffer->user_page);
3517 for (i = 0; i < buffer->nr_pages; i++)
3518 perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
3519 kfree(buffer);
3520}
3521
3522static inline int page_order(struct perf_buffer *buffer)
3523{
3524 return 0;
3525}
3526
3527#else
3528
3529/*
3530 * Back perf_mmap() with vmalloc memory.
3531 *
3532 * Required for architectures that have d-cache aliasing issues.
3533 */
3534
3535static inline int page_order(struct perf_buffer *buffer)
3536{
3537 return buffer->page_order;
3538}
3539
3540static struct page *
3541perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
3542{
3543 if (pgoff > (1UL << page_order(buffer)))
3544 return NULL;
3545
3546 return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
3547}
3548
3549static void perf_mmap_unmark_page(void *addr)
3550{
3551 struct page *page = vmalloc_to_page(addr);
3552
3553 page->mapping = NULL;
3554}
3555
3556static void perf_buffer_free_work(struct work_struct *work)
3557{
3558 struct perf_buffer *buffer;
3559 void *base;
3560 int i, nr;
3561
3562 buffer = container_of(work, struct perf_buffer, work);
3563 nr = 1 << page_order(buffer);
3564
3565 base = buffer->user_page;
3566 for (i = 0; i < nr + 1; i++)
3567 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
3568
3569 vfree(base);
3570 kfree(buffer);
3571}
3572
3573static void perf_buffer_free(struct perf_buffer *buffer)
3574{
3575 schedule_work(&buffer->work);
3576}
3577
3578static struct perf_buffer *
3579perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
3580{
3581 struct perf_buffer *buffer;
3582 unsigned long size;
3583 void *all_buf;
3584
3585 size = sizeof(struct perf_buffer);
3586 size += sizeof(void *);
3587
3588 buffer = kzalloc(size, GFP_KERNEL);
3589 if (!buffer)
3590 goto fail;
3591
3592 INIT_WORK(&buffer->work, perf_buffer_free_work);
3593
3594 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
3595 if (!all_buf)
3596 goto fail_all_buf;
3597
3598 buffer->user_page = all_buf;
3599 buffer->data_pages[0] = all_buf + PAGE_SIZE;
3600 buffer->page_order = ilog2(nr_pages);
3601 buffer->nr_pages = 1;
3602
3603 perf_buffer_init(buffer, watermark, flags);
3604
3605 return buffer;
3606
3607fail_all_buf:
3608 kfree(buffer);
3609
3610fail:
3611 return NULL;
3612}
3613
3614#endif
3615
3616static unsigned long perf_data_size(struct perf_buffer *buffer)
3617{
3618 return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
3619}
3620
3621static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 3420static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3622{ 3421{
3623 struct perf_event *event = vma->vm_file->private_data; 3422 struct perf_event *event = vma->vm_file->private_data;
3624 struct perf_buffer *buffer; 3423 struct ring_buffer *rb;
3625 int ret = VM_FAULT_SIGBUS; 3424 int ret = VM_FAULT_SIGBUS;
3626 3425
3627 if (vmf->flags & FAULT_FLAG_MKWRITE) { 3426 if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -3631,14 +3430,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3631 } 3430 }
3632 3431
3633 rcu_read_lock(); 3432 rcu_read_lock();
3634 buffer = rcu_dereference(event->buffer); 3433 rb = rcu_dereference(event->rb);
3635 if (!buffer) 3434 if (!rb)
3636 goto unlock; 3435 goto unlock;
3637 3436
3638 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) 3437 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
3639 goto unlock; 3438 goto unlock;
3640 3439
3641 vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); 3440 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
3642 if (!vmf->page) 3441 if (!vmf->page)
3643 goto unlock; 3442 goto unlock;
3644 3443
@@ -3653,35 +3452,35 @@ unlock:
3653 return ret; 3452 return ret;
3654} 3453}
3655 3454
3656static void perf_buffer_free_rcu(struct rcu_head *rcu_head) 3455static void rb_free_rcu(struct rcu_head *rcu_head)
3657{ 3456{
3658 struct perf_buffer *buffer; 3457 struct ring_buffer *rb;
3659 3458
3660 buffer = container_of(rcu_head, struct perf_buffer, rcu_head); 3459 rb = container_of(rcu_head, struct ring_buffer, rcu_head);
3661 perf_buffer_free(buffer); 3460 rb_free(rb);
3662} 3461}
3663 3462
3664static struct perf_buffer *perf_buffer_get(struct perf_event *event) 3463static struct ring_buffer *ring_buffer_get(struct perf_event *event)
3665{ 3464{
3666 struct perf_buffer *buffer; 3465 struct ring_buffer *rb;
3667 3466
3668 rcu_read_lock(); 3467 rcu_read_lock();
3669 buffer = rcu_dereference(event->buffer); 3468 rb = rcu_dereference(event->rb);
3670 if (buffer) { 3469 if (rb) {
3671 if (!atomic_inc_not_zero(&buffer->refcount)) 3470 if (!atomic_inc_not_zero(&rb->refcount))
3672 buffer = NULL; 3471 rb = NULL;
3673 } 3472 }
3674 rcu_read_unlock(); 3473 rcu_read_unlock();
3675 3474
3676 return buffer; 3475 return rb;
3677} 3476}
3678 3477
3679static void perf_buffer_put(struct perf_buffer *buffer) 3478static void ring_buffer_put(struct ring_buffer *rb)
3680{ 3479{
3681 if (!atomic_dec_and_test(&buffer->refcount)) 3480 if (!atomic_dec_and_test(&rb->refcount))
3682 return; 3481 return;
3683 3482
3684 call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); 3483 call_rcu(&rb->rcu_head, rb_free_rcu);
3685} 3484}
3686 3485
3687static void perf_mmap_open(struct vm_area_struct *vma) 3486static void perf_mmap_open(struct vm_area_struct *vma)
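
ring_buffer_get()/ring_buffer_put() are the usual RCU-plus-refcount pair: readers take a temporary reference under rcu_read_lock() with atomic_inc_not_zero() so a buffer whose last reference is already gone can never be revived, and the final put defers the actual free to an RCU callback. A hypothetical helper inside this file (not part of the patch) would use them like this:

    /* Sketch only: peek at the current write position of an event's buffer. */
    static u64 peek_data_head(struct perf_event *event)
    {
            struct ring_buffer *rb;
            u64 head = 0;

            rb = ring_buffer_get(event);    /* NULL if there is no buffer left */
            if (rb) {
                    head = ACCESS_ONCE(rb->user_page->data_head);
                    ring_buffer_put(rb);    /* last put frees via rb_free_rcu() */
            }
            return head;
    }
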
@@ -3696,16 +3495,16 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3696 struct perf_event *event = vma->vm_file->private_data; 3495 struct perf_event *event = vma->vm_file->private_data;
3697 3496
3698 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 3497 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
3699 unsigned long size = perf_data_size(event->buffer); 3498 unsigned long size = perf_data_size(event->rb);
3700 struct user_struct *user = event->mmap_user; 3499 struct user_struct *user = event->mmap_user;
3701 struct perf_buffer *buffer = event->buffer; 3500 struct ring_buffer *rb = event->rb;
3702 3501
3703 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 3502 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
3704 vma->vm_mm->locked_vm -= event->mmap_locked; 3503 vma->vm_mm->locked_vm -= event->mmap_locked;
3705 rcu_assign_pointer(event->buffer, NULL); 3504 rcu_assign_pointer(event->rb, NULL);
3706 mutex_unlock(&event->mmap_mutex); 3505 mutex_unlock(&event->mmap_mutex);
3707 3506
3708 perf_buffer_put(buffer); 3507 ring_buffer_put(rb);
3709 free_uid(user); 3508 free_uid(user);
3710 } 3509 }
3711} 3510}
@@ -3723,7 +3522,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3723 unsigned long user_locked, user_lock_limit; 3522 unsigned long user_locked, user_lock_limit;
3724 struct user_struct *user = current_user(); 3523 struct user_struct *user = current_user();
3725 unsigned long locked, lock_limit; 3524 unsigned long locked, lock_limit;
3726 struct perf_buffer *buffer; 3525 struct ring_buffer *rb;
3727 unsigned long vma_size; 3526 unsigned long vma_size;
3728 unsigned long nr_pages; 3527 unsigned long nr_pages;
3729 long user_extra, extra; 3528 long user_extra, extra;
@@ -3732,7 +3531,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3732 /* 3531 /*
3733 * Don't allow mmap() of inherited per-task counters. This would 3532 * Don't allow mmap() of inherited per-task counters. This would
3734 * create a performance issue due to all children writing to the 3533 * create a performance issue due to all children writing to the
3735 * same buffer. 3534 * same rb.
3736 */ 3535 */
3737 if (event->cpu == -1 && event->attr.inherit) 3536 if (event->cpu == -1 && event->attr.inherit)
3738 return -EINVAL; 3537 return -EINVAL;
@@ -3744,7 +3543,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3744 nr_pages = (vma_size / PAGE_SIZE) - 1; 3543 nr_pages = (vma_size / PAGE_SIZE) - 1;
3745 3544
3746 /* 3545 /*
3747 * If we have buffer pages ensure they're a power-of-two number, so we 3546 * If we have rb pages ensure they're a power-of-two number, so we
3748 * can do bitmasks instead of modulo. 3547 * can do bitmasks instead of modulo.
3749 */ 3548 */
3750 if (nr_pages != 0 && !is_power_of_2(nr_pages)) 3549 if (nr_pages != 0 && !is_power_of_2(nr_pages))
@@ -3758,9 +3557,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3758 3557
3759 WARN_ON_ONCE(event->ctx->parent_ctx); 3558 WARN_ON_ONCE(event->ctx->parent_ctx);
3760 mutex_lock(&event->mmap_mutex); 3559 mutex_lock(&event->mmap_mutex);
3761 if (event->buffer) { 3560 if (event->rb) {
3762 if (event->buffer->nr_pages == nr_pages) 3561 if (event->rb->nr_pages == nr_pages)
3763 atomic_inc(&event->buffer->refcount); 3562 atomic_inc(&event->rb->refcount);
3764 else 3563 else
3765 ret = -EINVAL; 3564 ret = -EINVAL;
3766 goto unlock; 3565 goto unlock;
@@ -3790,18 +3589,20 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3790 goto unlock; 3589 goto unlock;
3791 } 3590 }
3792 3591
3793 WARN_ON(event->buffer); 3592 WARN_ON(event->rb);
3794 3593
3795 if (vma->vm_flags & VM_WRITE) 3594 if (vma->vm_flags & VM_WRITE)
3796 flags |= PERF_BUFFER_WRITABLE; 3595 flags |= RING_BUFFER_WRITABLE;
3596
3597 rb = rb_alloc(nr_pages,
3598 event->attr.watermark ? event->attr.wakeup_watermark : 0,
3599 event->cpu, flags);
3797 3600
3798 buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, 3601 if (!rb) {
3799 event->cpu, flags);
3800 if (!buffer) {
3801 ret = -ENOMEM; 3602 ret = -ENOMEM;
3802 goto unlock; 3603 goto unlock;
3803 } 3604 }
3804 rcu_assign_pointer(event->buffer, buffer); 3605 rcu_assign_pointer(event->rb, rb);
3805 3606
3806 atomic_long_add(user_extra, &user->locked_vm); 3607 atomic_long_add(user_extra, &user->locked_vm);
3807 event->mmap_locked = extra; 3608 event->mmap_locked = extra;
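
The sizing rules enforced above are: one metadata page plus a power-of-two number of data pages, with VM_WRITE deciding whether the buffer is marked writable. A user-space sketch of the matching open/mmap sequence (raw syscall because glibc has no perf_event_open() wrapper; error handling trimmed):

    #include <string.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/perf_event.h>

    int main(void)
    {
            struct perf_event_attr attr;
            long page = sysconf(_SC_PAGESIZE);
            int nr_pages = 8;                       /* data pages: must be a power of two */
            void *base;
            int fd;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_CPU_CYCLES;
            attr.sample_period = 100000;
            attr.sample_type = PERF_SAMPLE_IP;
            attr.wakeup_events = 16;                /* see the wakeup accounting further down */

            fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
            /* 1 control page + 2^n data pages; PROT_WRITE maps the buffer writable
             * so user space can update data_tail and the kernel will honour it. */
            base = mmap(NULL, (nr_pages + 1) * page, PROT_READ | PROT_WRITE,
                        MAP_SHARED, fd, 0);

            return (fd < 0 || base == MAP_FAILED);
    }
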
@@ -3900,117 +3701,6 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3900} 3701}
3901EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); 3702EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
3902 3703
3903/*
3904 * Output
3905 */
3906static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
3907 unsigned long offset, unsigned long head)
3908{
3909 unsigned long mask;
3910
3911 if (!buffer->writable)
3912 return true;
3913
3914 mask = perf_data_size(buffer) - 1;
3915
3916 offset = (offset - tail) & mask;
3917 head = (head - tail) & mask;
3918
3919 if ((int)(head - offset) < 0)
3920 return false;
3921
3922 return true;
3923}
3924
3925static void perf_output_wakeup(struct perf_output_handle *handle)
3926{
3927 atomic_set(&handle->buffer->poll, POLL_IN);
3928
3929 if (handle->nmi) {
3930 handle->event->pending_wakeup = 1;
3931 irq_work_queue(&handle->event->pending);
3932 } else
3933 perf_event_wakeup(handle->event);
3934}
3935
3936/*
3937 * We need to ensure a later event_id doesn't publish a head when a former
3938 * event isn't done writing. However since we need to deal with NMIs we
3939 * cannot fully serialize things.
3940 *
3941 * We only publish the head (and generate a wakeup) when the outer-most
3942 * event completes.
3943 */
3944static void perf_output_get_handle(struct perf_output_handle *handle)
3945{
3946 struct perf_buffer *buffer = handle->buffer;
3947
3948 preempt_disable();
3949 local_inc(&buffer->nest);
3950 handle->wakeup = local_read(&buffer->wakeup);
3951}
3952
3953static void perf_output_put_handle(struct perf_output_handle *handle)
3954{
3955 struct perf_buffer *buffer = handle->buffer;
3956 unsigned long head;
3957
3958again:
3959 head = local_read(&buffer->head);
3960
3961 /*
3962 * IRQ/NMI can happen here, which means we can miss a head update.
3963 */
3964
3965 if (!local_dec_and_test(&buffer->nest))
3966 goto out;
3967
3968 /*
3969 * Publish the known good head. Rely on the full barrier implied
3970 * by atomic_dec_and_test() order the buffer->head read and this
3971 * write.
3972 */
3973 buffer->user_page->data_head = head;
3974
3975 /*
3976 * Now check if we missed an update, rely on the (compiler)
3977 * barrier in atomic_dec_and_test() to re-read buffer->head.
3978 */
3979 if (unlikely(head != local_read(&buffer->head))) {
3980 local_inc(&buffer->nest);
3981 goto again;
3982 }
3983
3984 if (handle->wakeup != local_read(&buffer->wakeup))
3985 perf_output_wakeup(handle);
3986
3987out:
3988 preempt_enable();
3989}
3990
3991__always_inline void perf_output_copy(struct perf_output_handle *handle,
3992 const void *buf, unsigned int len)
3993{
3994 do {
3995 unsigned long size = min_t(unsigned long, handle->size, len);
3996
3997 memcpy(handle->addr, buf, size);
3998
3999 len -= size;
4000 handle->addr += size;
4001 buf += size;
4002 handle->size -= size;
4003 if (!handle->size) {
4004 struct perf_buffer *buffer = handle->buffer;
4005
4006 handle->page++;
4007 handle->page &= buffer->nr_pages - 1;
4008 handle->addr = buffer->data_pages[handle->page];
4009 handle->size = PAGE_SIZE << page_order(buffer);
4010 }
4011 } while (len);
4012}
4013
4014static void __perf_event_header__init_id(struct perf_event_header *header, 3704static void __perf_event_header__init_id(struct perf_event_header *header,
4015 struct perf_sample_data *data, 3705 struct perf_sample_data *data,
4016 struct perf_event *event) 3706 struct perf_event *event)
@@ -4041,9 +3731,9 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
4041 } 3731 }
4042} 3732}
4043 3733
4044static void perf_event_header__init_id(struct perf_event_header *header, 3734void perf_event_header__init_id(struct perf_event_header *header,
4045 struct perf_sample_data *data, 3735 struct perf_sample_data *data,
4046 struct perf_event *event) 3736 struct perf_event *event)
4047{ 3737{
4048 if (event->attr.sample_id_all) 3738 if (event->attr.sample_id_all)
4049 __perf_event_header__init_id(header, data, event); 3739 __perf_event_header__init_id(header, data, event);
@@ -4070,121 +3760,14 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4070 perf_output_put(handle, data->cpu_entry); 3760 perf_output_put(handle, data->cpu_entry);
4071} 3761}
4072 3762
4073static void perf_event__output_id_sample(struct perf_event *event, 3763void perf_event__output_id_sample(struct perf_event *event,
4074 struct perf_output_handle *handle, 3764 struct perf_output_handle *handle,
4075 struct perf_sample_data *sample) 3765 struct perf_sample_data *sample)
4076{ 3766{
4077 if (event->attr.sample_id_all) 3767 if (event->attr.sample_id_all)
4078 __perf_event__output_id_sample(handle, sample); 3768 __perf_event__output_id_sample(handle, sample);
4079} 3769}
4080 3770
4081int perf_output_begin(struct perf_output_handle *handle,
4082 struct perf_event *event, unsigned int size,
4083 int nmi, int sample)
4084{
4085 struct perf_buffer *buffer;
4086 unsigned long tail, offset, head;
4087 int have_lost;
4088 struct perf_sample_data sample_data;
4089 struct {
4090 struct perf_event_header header;
4091 u64 id;
4092 u64 lost;
4093 } lost_event;
4094
4095 rcu_read_lock();
4096 /*
4097 * For inherited events we send all the output towards the parent.
4098 */
4099 if (event->parent)
4100 event = event->parent;
4101
4102 buffer = rcu_dereference(event->buffer);
4103 if (!buffer)
4104 goto out;
4105
4106 handle->buffer = buffer;
4107 handle->event = event;
4108 handle->nmi = nmi;
4109 handle->sample = sample;
4110
4111 if (!buffer->nr_pages)
4112 goto out;
4113
4114 have_lost = local_read(&buffer->lost);
4115 if (have_lost) {
4116 lost_event.header.size = sizeof(lost_event);
4117 perf_event_header__init_id(&lost_event.header, &sample_data,
4118 event);
4119 size += lost_event.header.size;
4120 }
4121
4122 perf_output_get_handle(handle);
4123
4124 do {
4125 /*
4126 * Userspace could choose to issue a mb() before updating the
4127 * tail pointer. So that all reads will be completed before the
4128 * write is issued.
4129 */
4130 tail = ACCESS_ONCE(buffer->user_page->data_tail);
4131 smp_rmb();
4132 offset = head = local_read(&buffer->head);
4133 head += size;
4134 if (unlikely(!perf_output_space(buffer, tail, offset, head)))
4135 goto fail;
4136 } while (local_cmpxchg(&buffer->head, offset, head) != offset);
4137
4138 if (head - local_read(&buffer->wakeup) > buffer->watermark)
4139 local_add(buffer->watermark, &buffer->wakeup);
4140
4141 handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
4142 handle->page &= buffer->nr_pages - 1;
4143 handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
4144 handle->addr = buffer->data_pages[handle->page];
4145 handle->addr += handle->size;
4146 handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
4147
4148 if (have_lost) {
4149 lost_event.header.type = PERF_RECORD_LOST;
4150 lost_event.header.misc = 0;
4151 lost_event.id = event->id;
4152 lost_event.lost = local_xchg(&buffer->lost, 0);
4153
4154 perf_output_put(handle, lost_event);
4155 perf_event__output_id_sample(event, handle, &sample_data);
4156 }
4157
4158 return 0;
4159
4160fail:
4161 local_inc(&buffer->lost);
4162 perf_output_put_handle(handle);
4163out:
4164 rcu_read_unlock();
4165
4166 return -ENOSPC;
4167}
4168
4169void perf_output_end(struct perf_output_handle *handle)
4170{
4171 struct perf_event *event = handle->event;
4172 struct perf_buffer *buffer = handle->buffer;
4173
4174 int wakeup_events = event->attr.wakeup_events;
4175
4176 if (handle->sample && wakeup_events) {
4177 int events = local_inc_return(&buffer->events);
4178 if (events >= wakeup_events) {
4179 local_sub(wakeup_events, &buffer->events);
4180 local_inc(&buffer->wakeup);
4181 }
4182 }
4183
4184 perf_output_put_handle(handle);
4185 rcu_read_unlock();
4186}
4187
4188static void perf_output_read_one(struct perf_output_handle *handle, 3771static void perf_output_read_one(struct perf_output_handle *handle,
4189 struct perf_event *event, 3772 struct perf_event *event,
4190 u64 enabled, u64 running) 3773 u64 enabled, u64 running)
@@ -4205,7 +3788,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
4205 if (read_format & PERF_FORMAT_ID) 3788 if (read_format & PERF_FORMAT_ID)
4206 values[n++] = primary_event_id(event); 3789 values[n++] = primary_event_id(event);
4207 3790
4208 perf_output_copy(handle, values, n * sizeof(u64)); 3791 __output_copy(handle, values, n * sizeof(u64));
4209} 3792}
4210 3793
4211/* 3794/*
@@ -4235,7 +3818,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4235 if (read_format & PERF_FORMAT_ID) 3818 if (read_format & PERF_FORMAT_ID)
4236 values[n++] = primary_event_id(leader); 3819 values[n++] = primary_event_id(leader);
4237 3820
4238 perf_output_copy(handle, values, n * sizeof(u64)); 3821 __output_copy(handle, values, n * sizeof(u64));
4239 3822
4240 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 3823 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4241 n = 0; 3824 n = 0;
@@ -4247,7 +3830,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4247 if (read_format & PERF_FORMAT_ID) 3830 if (read_format & PERF_FORMAT_ID)
4248 values[n++] = primary_event_id(sub); 3831 values[n++] = primary_event_id(sub);
4249 3832
4250 perf_output_copy(handle, values, n * sizeof(u64)); 3833 __output_copy(handle, values, n * sizeof(u64));
4251 } 3834 }
4252} 3835}
4253 3836
@@ -4257,7 +3840,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4257static void perf_output_read(struct perf_output_handle *handle, 3840static void perf_output_read(struct perf_output_handle *handle,
4258 struct perf_event *event) 3841 struct perf_event *event)
4259{ 3842{
4260 u64 enabled = 0, running = 0, now, ctx_time; 3843 u64 enabled = 0, running = 0;
4261 u64 read_format = event->attr.read_format; 3844 u64 read_format = event->attr.read_format;
4262 3845
4263 /* 3846 /*
@@ -4269,12 +3852,8 @@ static void perf_output_read(struct perf_output_handle *handle,
4269 * because of locking issues, as we are called in 3852
4270 * NMI context 3853 * NMI context
4271 */ 3854 */
4272 if (read_format & PERF_FORMAT_TOTAL_TIMES) { 3855 if (read_format & PERF_FORMAT_TOTAL_TIMES)
4273 now = perf_clock(); 3856 calc_timer_values(event, &enabled, &running);
4274 ctx_time = event->shadow_ctx_time + now;
4275 enabled = ctx_time - event->tstamp_enabled;
4276 running = ctx_time - event->tstamp_running;
4277 }
4278 3857
4279 if (event->attr.read_format & PERF_FORMAT_GROUP) 3858 if (event->attr.read_format & PERF_FORMAT_GROUP)
4280 perf_output_read_group(handle, event, enabled, running); 3859 perf_output_read_group(handle, event, enabled, running);
@@ -4327,7 +3906,7 @@ void perf_output_sample(struct perf_output_handle *handle,
4327 3906
4328 size *= sizeof(u64); 3907 size *= sizeof(u64);
4329 3908
4330 perf_output_copy(handle, data->callchain, size); 3909 __output_copy(handle, data->callchain, size);
4331 } else { 3910 } else {
4332 u64 nr = 0; 3911 u64 nr = 0;
4333 perf_output_put(handle, nr); 3912 perf_output_put(handle, nr);
@@ -4337,8 +3916,8 @@ void perf_output_sample(struct perf_output_handle *handle,
4337 if (sample_type & PERF_SAMPLE_RAW) { 3916 if (sample_type & PERF_SAMPLE_RAW) {
4338 if (data->raw) { 3917 if (data->raw) {
4339 perf_output_put(handle, data->raw->size); 3918 perf_output_put(handle, data->raw->size);
4340 perf_output_copy(handle, data->raw->data, 3919 __output_copy(handle, data->raw->data,
4341 data->raw->size); 3920 data->raw->size);
4342 } else { 3921 } else {
4343 struct { 3922 struct {
4344 u32 size; 3923 u32 size;
@@ -4350,6 +3929,20 @@ void perf_output_sample(struct perf_output_handle *handle,
4350 perf_output_put(handle, raw); 3929 perf_output_put(handle, raw);
4351 } 3930 }
4352 } 3931 }
3932
3933 if (!event->attr.watermark) {
3934 int wakeup_events = event->attr.wakeup_events;
3935
3936 if (wakeup_events) {
3937 struct ring_buffer *rb = handle->rb;
3938 int events = local_inc_return(&rb->events);
3939
3940 if (events >= wakeup_events) {
3941 local_sub(wakeup_events, &rb->events);
3942 local_inc(&rb->wakeup);
3943 }
3944 }
3945 }
4353} 3946}
4354 3947
4355void perf_prepare_sample(struct perf_event_header *header, 3948void perf_prepare_sample(struct perf_event_header *header,
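
The block added above moves the wakeup_events accounting from perf_output_end() into the sample path: when the event is not using a wakeup watermark, every wakeup_events samples bump rb->wakeup, and the next perf_output_put_handle() turns that into a poll wakeup. On the user-space side this pairs with an ordinary poll() on the event fd; a sketch, assuming a file descriptor set up as in the earlier mmap example:

    #include <poll.h>

    /* Block until roughly attr.wakeup_events new samples have been written. */
    static int wait_for_samples(int perf_fd)
    {
            struct pollfd pfd = { .fd = perf_fd, .events = POLLIN };

            return poll(&pfd, 1, -1);       /* perf_poll() reports readiness via rb->poll */
    }
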
@@ -4394,7 +3987,7 @@ void perf_prepare_sample(struct perf_event_header *header,
4394 } 3987 }
4395} 3988}
4396 3989
4397static void perf_event_output(struct perf_event *event, int nmi, 3990static void perf_event_output(struct perf_event *event,
4398 struct perf_sample_data *data, 3991 struct perf_sample_data *data,
4399 struct pt_regs *regs) 3992 struct pt_regs *regs)
4400{ 3993{
@@ -4406,7 +3999,7 @@ static void perf_event_output(struct perf_event *event, int nmi,
4406 3999
4407 perf_prepare_sample(&header, data, event, regs); 4000 perf_prepare_sample(&header, data, event, regs);
4408 4001
4409 if (perf_output_begin(&handle, event, header.size, nmi, 1)) 4002 if (perf_output_begin(&handle, event, header.size))
4410 goto exit; 4003 goto exit;
4411 4004
4412 perf_output_sample(&handle, &header, data, event); 4005 perf_output_sample(&handle, &header, data, event);
@@ -4446,7 +4039,7 @@ perf_event_read_event(struct perf_event *event,
4446 int ret; 4039 int ret;
4447 4040
4448 perf_event_header__init_id(&read_event.header, &sample, event); 4041 perf_event_header__init_id(&read_event.header, &sample, event);
4449 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); 4042 ret = perf_output_begin(&handle, event, read_event.header.size);
4450 if (ret) 4043 if (ret)
4451 return; 4044 return;
4452 4045
@@ -4489,7 +4082,7 @@ static void perf_event_task_output(struct perf_event *event,
4489 perf_event_header__init_id(&task_event->event_id.header, &sample, event); 4082 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
4490 4083
4491 ret = perf_output_begin(&handle, event, 4084 ret = perf_output_begin(&handle, event,
4492 task_event->event_id.header.size, 0, 0); 4085 task_event->event_id.header.size);
4493 if (ret) 4086 if (ret)
4494 goto out; 4087 goto out;
4495 4088
@@ -4626,7 +4219,7 @@ static void perf_event_comm_output(struct perf_event *event,
4626 4219
4627 perf_event_header__init_id(&comm_event->event_id.header, &sample, event); 4220 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4628 ret = perf_output_begin(&handle, event, 4221 ret = perf_output_begin(&handle, event,
4629 comm_event->event_id.header.size, 0, 0); 4222 comm_event->event_id.header.size);
4630 4223
4631 if (ret) 4224 if (ret)
4632 goto out; 4225 goto out;
@@ -4635,7 +4228,7 @@ static void perf_event_comm_output(struct perf_event *event,
4635 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 4228 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
4636 4229
4637 perf_output_put(&handle, comm_event->event_id); 4230 perf_output_put(&handle, comm_event->event_id);
4638 perf_output_copy(&handle, comm_event->comm, 4231 __output_copy(&handle, comm_event->comm,
4639 comm_event->comm_size); 4232 comm_event->comm_size);
4640 4233
4641 perf_event__output_id_sample(event, &handle, &sample); 4234 perf_event__output_id_sample(event, &handle, &sample);
@@ -4773,7 +4366,7 @@ static void perf_event_mmap_output(struct perf_event *event,
4773 4366
4774 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 4367 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4775 ret = perf_output_begin(&handle, event, 4368 ret = perf_output_begin(&handle, event,
4776 mmap_event->event_id.header.size, 0, 0); 4369 mmap_event->event_id.header.size);
4777 if (ret) 4370 if (ret)
4778 goto out; 4371 goto out;
4779 4372
@@ -4781,7 +4374,7 @@ static void perf_event_mmap_output(struct perf_event *event,
4781 mmap_event->event_id.tid = perf_event_tid(event, current); 4374 mmap_event->event_id.tid = perf_event_tid(event, current);
4782 4375
4783 perf_output_put(&handle, mmap_event->event_id); 4376 perf_output_put(&handle, mmap_event->event_id);
4784 perf_output_copy(&handle, mmap_event->file_name, 4377 __output_copy(&handle, mmap_event->file_name,
4785 mmap_event->file_size); 4378 mmap_event->file_size);
4786 4379
4787 perf_event__output_id_sample(event, &handle, &sample); 4380 perf_event__output_id_sample(event, &handle, &sample);
@@ -4837,7 +4430,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4837 4430
4838 if (file) { 4431 if (file) {
4839 /* 4432 /*
4840 * d_path works from the end of the buffer backwards, so we 4433 * d_path works from the end of the rb backwards, so we
4841 * need to add enough zero bytes after the string to handle 4434 * need to add enough zero bytes after the string to handle
4842 * the 64bit alignment we do later. 4435 * the 64bit alignment we do later.
4843 */ 4436 */
@@ -4968,7 +4561,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4968 perf_event_header__init_id(&throttle_event.header, &sample, event); 4561 perf_event_header__init_id(&throttle_event.header, &sample, event);
4969 4562
4970 ret = perf_output_begin(&handle, event, 4563 ret = perf_output_begin(&handle, event,
4971 throttle_event.header.size, 1, 0); 4564 throttle_event.header.size);
4972 if (ret) 4565 if (ret)
4973 return; 4566 return;
4974 4567
@@ -4981,7 +4574,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4981 * Generic event overflow handling, sampling. 4574 * Generic event overflow handling, sampling.
4982 */ 4575 */
4983 4576
4984static int __perf_event_overflow(struct perf_event *event, int nmi, 4577static int __perf_event_overflow(struct perf_event *event,
4985 int throttle, struct perf_sample_data *data, 4578 int throttle, struct perf_sample_data *data,
4986 struct pt_regs *regs) 4579 struct pt_regs *regs)
4987{ 4580{
@@ -5024,26 +4617,28 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
5024 if (events && atomic_dec_and_test(&event->event_limit)) { 4617 if (events && atomic_dec_and_test(&event->event_limit)) {
5025 ret = 1; 4618 ret = 1;
5026 event->pending_kill = POLL_HUP; 4619 event->pending_kill = POLL_HUP;
5027 if (nmi) { 4620 event->pending_disable = 1;
5028 event->pending_disable = 1; 4621 irq_work_queue(&event->pending);
5029 irq_work_queue(&event->pending);
5030 } else
5031 perf_event_disable(event);
5032 } 4622 }
5033 4623
5034 if (event->overflow_handler) 4624 if (event->overflow_handler)
5035 event->overflow_handler(event, nmi, data, regs); 4625 event->overflow_handler(event, data, regs);
5036 else 4626 else
5037 perf_event_output(event, nmi, data, regs); 4627 perf_event_output(event, data, regs);
4628
4629 if (event->fasync && event->pending_kill) {
4630 event->pending_wakeup = 1;
4631 irq_work_queue(&event->pending);
4632 }
5038 4633
5039 return ret; 4634 return ret;
5040} 4635}
5041 4636
5042int perf_event_overflow(struct perf_event *event, int nmi, 4637int perf_event_overflow(struct perf_event *event,
5043 struct perf_sample_data *data, 4638 struct perf_sample_data *data,
5044 struct pt_regs *regs) 4639 struct pt_regs *regs)
5045{ 4640{
5046 return __perf_event_overflow(event, nmi, 1, data, regs); 4641 return __perf_event_overflow(event, 1, data, regs);
5047} 4642}
5048 4643
5049/* 4644/*
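
With the nmi argument gone, __perf_event_overflow() no longer chooses between calling perf_event_disable() directly and deferring it: it always sets event->pending_disable and queues event->pending, and the irq_work runs the heavy part from a safe context. The same deferral pattern in isolation, as a generic sketch (names invented, not tied to this file):

    #include <linux/init.h>
    #include <linux/irq_work.h>

    static struct irq_work my_work;

    static void my_work_func(struct irq_work *work)
    {
            /* Runs in IRQ context shortly after being queued; real work goes here. */
    }

    /* Safe to call from NMI context: only marks the work pending and raises an IPI. */
    static void from_nmi_path(void)
    {
            irq_work_queue(&my_work);
    }

    static int __init my_setup(void)
    {
            init_irq_work(&my_work, my_work_func);
            return 0;
    }
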
@@ -5092,7 +4687,7 @@ again:
5092} 4687}
5093 4688
5094static void perf_swevent_overflow(struct perf_event *event, u64 overflow, 4689static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5095 int nmi, struct perf_sample_data *data, 4690 struct perf_sample_data *data,
5096 struct pt_regs *regs) 4691 struct pt_regs *regs)
5097{ 4692{
5098 struct hw_perf_event *hwc = &event->hw; 4693 struct hw_perf_event *hwc = &event->hw;
@@ -5106,7 +4701,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5106 return; 4701 return;
5107 4702
5108 for (; overflow; overflow--) { 4703 for (; overflow; overflow--) {
5109 if (__perf_event_overflow(event, nmi, throttle, 4704 if (__perf_event_overflow(event, throttle,
5110 data, regs)) { 4705 data, regs)) {
5111 /* 4706 /*
5112 * We inhibit the overflow from happening when 4707 * We inhibit the overflow from happening when
@@ -5119,7 +4714,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5119} 4714}
5120 4715
5121static void perf_swevent_event(struct perf_event *event, u64 nr, 4716static void perf_swevent_event(struct perf_event *event, u64 nr,
5122 int nmi, struct perf_sample_data *data, 4717 struct perf_sample_data *data,
5123 struct pt_regs *regs) 4718 struct pt_regs *regs)
5124{ 4719{
5125 struct hw_perf_event *hwc = &event->hw; 4720 struct hw_perf_event *hwc = &event->hw;
@@ -5133,12 +4728,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
5133 return; 4728 return;
5134 4729
5135 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4730 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
5136 return perf_swevent_overflow(event, 1, nmi, data, regs); 4731 return perf_swevent_overflow(event, 1, data, regs);
5137 4732
5138 if (local64_add_negative(nr, &hwc->period_left)) 4733 if (local64_add_negative(nr, &hwc->period_left))
5139 return; 4734 return;
5140 4735
5141 perf_swevent_overflow(event, 0, nmi, data, regs); 4736 perf_swevent_overflow(event, 0, data, regs);
5142} 4737}
5143 4738
5144static int perf_exclude_event(struct perf_event *event, 4739static int perf_exclude_event(struct perf_event *event,
@@ -5226,7 +4821,7 @@ find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
5226} 4821}
5227 4822
5228static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 4823static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5229 u64 nr, int nmi, 4824 u64 nr,
5230 struct perf_sample_data *data, 4825 struct perf_sample_data *data,
5231 struct pt_regs *regs) 4826 struct pt_regs *regs)
5232{ 4827{
@@ -5242,7 +4837,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5242 4837
5243 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4838 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5244 if (perf_swevent_match(event, type, event_id, data, regs)) 4839 if (perf_swevent_match(event, type, event_id, data, regs))
5245 perf_swevent_event(event, nr, nmi, data, regs); 4840 perf_swevent_event(event, nr, data, regs);
5246 } 4841 }
5247end: 4842end:
5248 rcu_read_unlock(); 4843 rcu_read_unlock();
@@ -5263,8 +4858,7 @@ inline void perf_swevent_put_recursion_context(int rctx)
5263 put_recursion_context(swhash->recursion, rctx); 4858 put_recursion_context(swhash->recursion, rctx);
5264} 4859}
5265 4860
5266void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4861void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
5267 struct pt_regs *regs, u64 addr)
5268{ 4862{
5269 struct perf_sample_data data; 4863 struct perf_sample_data data;
5270 int rctx; 4864 int rctx;
@@ -5276,7 +4870,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
5276 4870
5277 perf_sample_data_init(&data, addr); 4871 perf_sample_data_init(&data, addr);
5278 4872
5279 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4873 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
5280 4874
5281 perf_swevent_put_recursion_context(rctx); 4875 perf_swevent_put_recursion_context(rctx);
5282 preempt_enable_notrace(); 4876 preempt_enable_notrace();
@@ -5331,14 +4925,6 @@ swevent_hlist_deref(struct swevent_htable *swhash)
5331 lockdep_is_held(&swhash->hlist_mutex)); 4925 lockdep_is_held(&swhash->hlist_mutex));
5332} 4926}
5333 4927
5334static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
5335{
5336 struct swevent_hlist *hlist;
5337
5338 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
5339 kfree(hlist);
5340}
5341
5342static void swevent_hlist_release(struct swevent_htable *swhash) 4928static void swevent_hlist_release(struct swevent_htable *swhash)
5343{ 4929{
5344 struct swevent_hlist *hlist = swevent_hlist_deref(swhash); 4930 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
@@ -5347,7 +4933,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash)
5347 return; 4933 return;
5348 4934
5349 rcu_assign_pointer(swhash->swevent_hlist, NULL); 4935 rcu_assign_pointer(swhash->swevent_hlist, NULL);
5350 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); 4936 kfree_rcu(hlist, rcu_head);
5351} 4937}
5352 4938
5353static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) 4939static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
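
The dedicated RCU callback whose only job was to kfree() the hlist is gone; kfree_rcu() only needs the name of the rcu_head member embedded in the structure being freed. The shape of the conversion, shown with a made-up struct:

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct foo {
            int data;
            struct rcu_head rcu_head;
    };

    /* Before: a callback that exists only to free the container. */
    static void foo_free_rcu(struct rcu_head *head)
    {
            kfree(container_of(head, struct foo, rcu_head));
    }

    static void foo_release_old(struct foo *p)
    {
            call_rcu(&p->rcu_head, foo_free_rcu);
    }

    /* After: one line, the member name is all kfree_rcu() needs. */
    static void foo_release_new(struct foo *p)
    {
            kfree_rcu(p, rcu_head);
    }
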
@@ -5429,7 +5015,7 @@ fail:
5429 return err; 5015 return err;
5430} 5016}
5431 5017
5432atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 5018struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
5433 5019
5434static void sw_perf_event_destroy(struct perf_event *event) 5020static void sw_perf_event_destroy(struct perf_event *event)
5435{ 5021{
@@ -5532,7 +5118,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5532 5118
5533 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5119 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5534 if (perf_tp_event_match(event, &data, regs)) 5120 if (perf_tp_event_match(event, &data, regs))
5535 perf_swevent_event(event, count, 1, &data, regs); 5121 perf_swevent_event(event, count, &data, regs);
5536 } 5122 }
5537 5123
5538 perf_swevent_put_recursion_context(rctx); 5124 perf_swevent_put_recursion_context(rctx);
@@ -5625,7 +5211,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
5625 perf_sample_data_init(&sample, bp->attr.bp_addr); 5211 perf_sample_data_init(&sample, bp->attr.bp_addr);
5626 5212
5627 if (!bp->hw.state && !perf_exclude_event(bp, regs)) 5213 if (!bp->hw.state && !perf_exclude_event(bp, regs))
5628 perf_swevent_event(bp, 1, 1, &sample, regs); 5214 perf_swevent_event(bp, 1, &sample, regs);
5629} 5215}
5630#endif 5216#endif
5631 5217
@@ -5654,7 +5240,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5654 5240
5655 if (regs && !perf_exclude_event(event, regs)) { 5241 if (regs && !perf_exclude_event(event, regs)) {
5656 if (!(event->attr.exclude_idle && current->pid == 0)) 5242 if (!(event->attr.exclude_idle && current->pid == 0))
5657 if (perf_event_overflow(event, 0, &data, regs)) 5243 if (perf_event_overflow(event, &data, regs))
5658 ret = HRTIMER_NORESTART; 5244 ret = HRTIMER_NORESTART;
5659 } 5245 }
5660 5246
@@ -5994,6 +5580,7 @@ free_dev:
5994} 5580}
5995 5581
5996static struct lock_class_key cpuctx_mutex; 5582static struct lock_class_key cpuctx_mutex;
5583static struct lock_class_key cpuctx_lock;
5997 5584
5998int perf_pmu_register(struct pmu *pmu, char *name, int type) 5585int perf_pmu_register(struct pmu *pmu, char *name, int type)
5999{ 5586{
@@ -6044,6 +5631,7 @@ skip_type:
6044 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 5631 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6045 __perf_event_init_context(&cpuctx->ctx); 5632 __perf_event_init_context(&cpuctx->ctx);
6046 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); 5633 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
5634 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
6047 cpuctx->ctx.type = cpu_context; 5635 cpuctx->ctx.type = cpu_context;
6048 cpuctx->ctx.pmu = pmu; 5636 cpuctx->ctx.pmu = pmu;
6049 cpuctx->jiffies_interval = 1; 5637 cpuctx->jiffies_interval = 1;
@@ -6158,7 +5746,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6158 struct task_struct *task, 5746 struct task_struct *task,
6159 struct perf_event *group_leader, 5747 struct perf_event *group_leader,
6160 struct perf_event *parent_event, 5748 struct perf_event *parent_event,
6161 perf_overflow_handler_t overflow_handler) 5749 perf_overflow_handler_t overflow_handler,
5750 void *context)
6162{ 5751{
6163 struct pmu *pmu; 5752 struct pmu *pmu;
6164 struct perf_event *event; 5753 struct perf_event *event;
@@ -6216,10 +5805,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6216#endif 5805#endif
6217 } 5806 }
6218 5807
6219 if (!overflow_handler && parent_event) 5808 if (!overflow_handler && parent_event) {
6220 overflow_handler = parent_event->overflow_handler; 5809 overflow_handler = parent_event->overflow_handler;
5810 context = parent_event->overflow_handler_context;
5811 }
6221 5812
6222 event->overflow_handler = overflow_handler; 5813 event->overflow_handler = overflow_handler;
5814 event->overflow_handler_context = context;
6223 5815
6224 if (attr->disabled) 5816 if (attr->disabled)
6225 event->state = PERF_EVENT_STATE_OFF; 5817 event->state = PERF_EVENT_STATE_OFF;
@@ -6334,13 +5926,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6334 if (ret) 5926 if (ret)
6335 return -EFAULT; 5927 return -EFAULT;
6336 5928
6337 /*
6338 * If the type exists, the corresponding creation will verify
6339 * the attr->config.
6340 */
6341 if (attr->type >= PERF_TYPE_MAX)
6342 return -EINVAL;
6343
6344 if (attr->__reserved_1) 5929 if (attr->__reserved_1)
6345 return -EINVAL; 5930 return -EINVAL;
6346 5931
@@ -6362,7 +5947,7 @@ err_size:
6362static int 5947static int
6363perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 5948perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6364{ 5949{
6365 struct perf_buffer *buffer = NULL, *old_buffer = NULL; 5950 struct ring_buffer *rb = NULL, *old_rb = NULL;
6366 int ret = -EINVAL; 5951 int ret = -EINVAL;
6367 5952
6368 if (!output_event) 5953 if (!output_event)
@@ -6379,7 +5964,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6379 goto out; 5964 goto out;
6380 5965
6381 /* 5966 /*
6382 * If it's not a per-cpu buffer, it must be the same task. 5967 * If it's not a per-cpu rb, it must be the same task.
6383 */ 5968 */
6384 if (output_event->cpu == -1 && output_event->ctx != event->ctx) 5969 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
6385 goto out; 5970 goto out;
@@ -6391,20 +5976,20 @@ set:
6391 goto unlock; 5976 goto unlock;
6392 5977
6393 if (output_event) { 5978 if (output_event) {
6394 /* get the buffer we want to redirect to */ 5979 /* get the rb we want to redirect to */
6395 buffer = perf_buffer_get(output_event); 5980 rb = ring_buffer_get(output_event);
6396 if (!buffer) 5981 if (!rb)
6397 goto unlock; 5982 goto unlock;
6398 } 5983 }
6399 5984
6400 old_buffer = event->buffer; 5985 old_rb = event->rb;
6401 rcu_assign_pointer(event->buffer, buffer); 5986 rcu_assign_pointer(event->rb, rb);
6402 ret = 0; 5987 ret = 0;
6403unlock: 5988unlock:
6404 mutex_unlock(&event->mmap_mutex); 5989 mutex_unlock(&event->mmap_mutex);
6405 5990
6406 if (old_buffer) 5991 if (old_rb)
6407 perf_buffer_put(old_buffer); 5992 ring_buffer_put(old_rb);
6408out: 5993out:
6409 return ret; 5994 return ret;
6410} 5995}
@@ -6486,7 +6071,8 @@ SYSCALL_DEFINE5(perf_event_open,
6486 } 6071 }
6487 } 6072 }
6488 6073
6489 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); 6074 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
6075 NULL, NULL);
6490 if (IS_ERR(event)) { 6076 if (IS_ERR(event)) {
6491 err = PTR_ERR(event); 6077 err = PTR_ERR(event);
6492 goto err_task; 6078 goto err_task;
@@ -6671,7 +6257,8 @@ err_fd:
6671struct perf_event * 6257struct perf_event *
6672perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 6258perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6673 struct task_struct *task, 6259 struct task_struct *task,
6674 perf_overflow_handler_t overflow_handler) 6260 perf_overflow_handler_t overflow_handler,
6261 void *context)
6675{ 6262{
6676 struct perf_event_context *ctx; 6263 struct perf_event_context *ctx;
6677 struct perf_event *event; 6264 struct perf_event *event;
@@ -6681,7 +6268,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6681 * Get the target context (task or percpu): 6268 * Get the target context (task or percpu):
6682 */ 6269 */
6683 6270
6684 event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); 6271 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
6272 overflow_handler, context);
6685 if (IS_ERR(event)) { 6273 if (IS_ERR(event)) {
6686 err = PTR_ERR(event); 6274 err = PTR_ERR(event);
6687 goto err; 6275 goto err;
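
perf_event_create_kernel_counter() now takes the overflow handler and an opaque context pointer together, and the handler itself has lost its nmi argument. A hypothetical in-kernel caller (struct and names invented) would look like:

    #include <linux/perf_event.h>

    struct my_state {                       /* whatever the caller wants handed back */
            u64 hits;
    };

    static void my_overflow(struct perf_event *event,
                            struct perf_sample_data *data, struct pt_regs *regs)
    {
            struct my_state *st = event->overflow_handler_context;

            st->hits++;                     /* can run from NMI: bookkeeping only */
    }

    static struct perf_event *start_counter(struct perf_event_attr *attr, int cpu,
                                            struct my_state *st)
    {
            /* task == NULL: count on this CPU for all tasks; tear down later
             * with perf_event_release_kernel(). */
            return perf_event_create_kernel_counter(attr, cpu, NULL, my_overflow, st);
    }
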
@@ -6788,7 +6376,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6788 * our context. 6376 * our context.
6789 */ 6377 */
6790 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); 6378 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
6791 task_ctx_sched_out(child_ctx, EVENT_ALL);
6792 6379
6793 /* 6380 /*
6794 * Take the context lock here so that if find_get_context is 6381 * Take the context lock here so that if find_get_context is
@@ -6796,6 +6383,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6796 * incremented the context's refcount before we do put_ctx below. 6383 * incremented the context's refcount before we do put_ctx below.
6797 */ 6384 */
6798 raw_spin_lock(&child_ctx->lock); 6385 raw_spin_lock(&child_ctx->lock);
6386 task_ctx_sched_out(child_ctx);
6799 child->perf_event_ctxp[ctxn] = NULL; 6387 child->perf_event_ctxp[ctxn] = NULL;
6800 /* 6388 /*
6801 * If this context is a clone; unclone it so it can't get 6389 * If this context is a clone; unclone it so it can't get
@@ -6965,7 +6553,7 @@ inherit_event(struct perf_event *parent_event,
6965 parent_event->cpu, 6553 parent_event->cpu,
6966 child, 6554 child,
6967 group_leader, parent_event, 6555 group_leader, parent_event,
6968 NULL); 6556 NULL, NULL);
6969 if (IS_ERR(child_event)) 6557 if (IS_ERR(child_event))
6970 return child_event; 6558 return child_event;
6971 get_ctx(child_ctx); 6559 get_ctx(child_ctx);
@@ -6992,6 +6580,8 @@ inherit_event(struct perf_event *parent_event,
6992 6580
6993 child_event->ctx = child_ctx; 6581 child_event->ctx = child_ctx;
6994 child_event->overflow_handler = parent_event->overflow_handler; 6582 child_event->overflow_handler = parent_event->overflow_handler;
6583 child_event->overflow_handler_context
6584 = parent_event->overflow_handler_context;
6995 6585
6996 /* 6586 /*
6997 * Precalculate sample_data sizes 6587 * Precalculate sample_data sizes
@@ -7410,26 +7000,12 @@ static int __perf_cgroup_move(void *info)
7410 return 0; 7000 return 0;
7411} 7001}
7412 7002
7413static void perf_cgroup_move(struct task_struct *task) 7003static void
7004perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task)
7414{ 7005{
7415 task_function_call(task, __perf_cgroup_move, task); 7006 task_function_call(task, __perf_cgroup_move, task);
7416} 7007}
7417 7008
7418static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7419 struct cgroup *old_cgrp, struct task_struct *task,
7420 bool threadgroup)
7421{
7422 perf_cgroup_move(task);
7423 if (threadgroup) {
7424 struct task_struct *c;
7425 rcu_read_lock();
7426 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
7427 perf_cgroup_move(c);
7428 }
7429 rcu_read_unlock();
7430 }
7431}
7432
7433static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 7009static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7434 struct cgroup *old_cgrp, struct task_struct *task) 7010 struct cgroup *old_cgrp, struct task_struct *task)
7435{ 7011{
@@ -7441,15 +7017,15 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7441 if (!(task->flags & PF_EXITING)) 7017 if (!(task->flags & PF_EXITING))
7442 return; 7018 return;
7443 7019
7444 perf_cgroup_move(task); 7020 perf_cgroup_attach_task(cgrp, task);
7445} 7021}
7446 7022
7447struct cgroup_subsys perf_subsys = { 7023struct cgroup_subsys perf_subsys = {
7448 .name = "perf_event", 7024 .name = "perf_event",
7449 .subsys_id = perf_subsys_id, 7025 .subsys_id = perf_subsys_id,
7450 .create = perf_cgroup_create, 7026 .create = perf_cgroup_create,
7451 .destroy = perf_cgroup_destroy, 7027 .destroy = perf_cgroup_destroy,
7452 .exit = perf_cgroup_exit, 7028 .exit = perf_cgroup_exit,
7453 .attach = perf_cgroup_attach, 7029 .attach_task = perf_cgroup_attach_task,
7454}; 7030};
7455#endif /* CONFIG_CGROUP_PERF */ 7031#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 086adf25a55e..b7971d6f38bf 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -431,9 +431,11 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
431struct perf_event * 431struct perf_event *
432register_user_hw_breakpoint(struct perf_event_attr *attr, 432register_user_hw_breakpoint(struct perf_event_attr *attr,
433 perf_overflow_handler_t triggered, 433 perf_overflow_handler_t triggered,
434 void *context,
434 struct task_struct *tsk) 435 struct task_struct *tsk)
435{ 436{
436 return perf_event_create_kernel_counter(attr, -1, tsk, triggered); 437 return perf_event_create_kernel_counter(attr, -1, tsk, triggered,
438 context);
437} 439}
438EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); 440EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
439 441
@@ -502,7 +504,8 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
502 */ 504 */
503struct perf_event * __percpu * 505struct perf_event * __percpu *
504register_wide_hw_breakpoint(struct perf_event_attr *attr, 506register_wide_hw_breakpoint(struct perf_event_attr *attr,
505 perf_overflow_handler_t triggered) 507 perf_overflow_handler_t triggered,
508 void *context)
506{ 509{
507 struct perf_event * __percpu *cpu_events, **pevent, *bp; 510 struct perf_event * __percpu *cpu_events, **pevent, *bp;
508 long err; 511 long err;
@@ -515,7 +518,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
515 get_online_cpus(); 518 get_online_cpus();
516 for_each_online_cpu(cpu) { 519 for_each_online_cpu(cpu) {
517 pevent = per_cpu_ptr(cpu_events, cpu); 520 pevent = per_cpu_ptr(cpu_events, cpu);
518 bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); 521 bp = perf_event_create_kernel_counter(attr, cpu, NULL,
522 triggered, context);
519 523
520 *pevent = bp; 524 *pevent = bp;
521 525
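
Both breakpoint helpers grow the same context parameter and simply forward it to perf_event_create_kernel_counter(). A module-style sketch of the wide variant, loosely following samples/hw_breakpoint (the watched symbol, handler and cookie are illustrative):

    #include <linux/err.h>
    #include <linux/hw_breakpoint.h>
    #include <linux/init.h>
    #include <linux/kallsyms.h>
    #include <linux/kernel.h>
    #include <linux/perf_event.h>

    static struct perf_event * __percpu *wp;
    static int cookie;                              /* demo context payload */

    static void wp_handler(struct perf_event *bp,
                           struct perf_sample_data *data, struct pt_regs *regs)
    {
            pr_info("write hit, context=%p\n", bp->overflow_handler_context);
    }

    static int __init wp_init(void)
    {
            struct perf_event_attr attr;

            hw_breakpoint_init(&attr);
            attr.bp_addr = kallsyms_lookup_name("jiffies");
            attr.bp_len = HW_BREAKPOINT_LEN_4;
            attr.bp_type = HW_BREAKPOINT_W;

            wp = register_wide_hw_breakpoint(&attr, wp_handler, &cookie);
            return IS_ERR((void __force *)wp) ? PTR_ERR((void __force *)wp) : 0;
    }
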
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
new file mode 100644
index 000000000000..09097dd8116c
--- /dev/null
+++ b/kernel/events/internal.h
@@ -0,0 +1,96 @@
1#ifndef _KERNEL_EVENTS_INTERNAL_H
2#define _KERNEL_EVENTS_INTERNAL_H
3
4#define RING_BUFFER_WRITABLE 0x01
5
6struct ring_buffer {
7 atomic_t refcount;
8 struct rcu_head rcu_head;
9#ifdef CONFIG_PERF_USE_VMALLOC
10 struct work_struct work;
11 int page_order; /* allocation order */
12#endif
13 int nr_pages; /* nr of data pages */
14 int writable; /* are we writable */
15
16 atomic_t poll; /* POLL_ for wakeups */
17
18 local_t head; /* write position */
19 local_t nest; /* nested writers */
20 local_t events; /* event limit */
21 local_t wakeup; /* wakeup stamp */
22 local_t lost; /* nr records lost */
23
24 long watermark; /* wakeup watermark */
25
26 struct perf_event_mmap_page *user_page;
27 void *data_pages[0];
28};
29
30extern void rb_free(struct ring_buffer *rb);
31extern struct ring_buffer *
32rb_alloc(int nr_pages, long watermark, int cpu, int flags);
33extern void perf_event_wakeup(struct perf_event *event);
34
35extern void
36perf_event_header__init_id(struct perf_event_header *header,
37 struct perf_sample_data *data,
38 struct perf_event *event);
39extern void
40perf_event__output_id_sample(struct perf_event *event,
41 struct perf_output_handle *handle,
42 struct perf_sample_data *sample);
43
44extern struct page *
45perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
46
47#ifdef CONFIG_PERF_USE_VMALLOC
48/*
49 * Back perf_mmap() with vmalloc memory.
50 *
51 * Required for architectures that have d-cache aliasing issues.
52 */
53
54static inline int page_order(struct ring_buffer *rb)
55{
56 return rb->page_order;
57}
58
59#else
60
61static inline int page_order(struct ring_buffer *rb)
62{
63 return 0;
64}
65#endif
66
67static unsigned long perf_data_size(struct ring_buffer *rb)
68{
69 return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
70}
71
72static inline void
73__output_copy(struct perf_output_handle *handle,
74 const void *buf, unsigned int len)
75{
76 do {
77 unsigned long size = min_t(unsigned long, handle->size, len);
78
79 memcpy(handle->addr, buf, size);
80
81 len -= size;
82 handle->addr += size;
83 buf += size;
84 handle->size -= size;
85 if (!handle->size) {
86 struct ring_buffer *rb = handle->rb;
87
88 handle->page++;
89 handle->page &= rb->nr_pages - 1;
90 handle->addr = rb->data_pages[handle->page];
91 handle->size = PAGE_SIZE << page_order(rb);
92 }
93 } while (len);
94}
95
96#endif /* _KERNEL_EVENTS_INTERNAL_H */
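
The two mmap backends agree on the arithmetic in perf_data_size(): the regular allocator keeps one entry per data page, the vmalloc backend records a single virtually contiguous block and encodes its size in page_order, and __output_copy() masking handle->page with nr_pages - 1 works for both. For example (4 KiB pages):

    /* Same 32 KiB of data, two representations:
     *   page backend:    nr_pages = 8, page_order = 0  ->  8 << (12 + 0) = 32768
     *   vmalloc backend: nr_pages = 1, page_order = 3  ->  1 << (12 + 3) = 32768
     */
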
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
new file mode 100644
index 000000000000..a2a29205cc0f
--- /dev/null
+++ b/kernel/events/ring_buffer.c
@@ -0,0 +1,380 @@
1/*
2 * Performance events ring-buffer code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/perf_event.h>
13#include <linux/vmalloc.h>
14#include <linux/slab.h>
15
16#include "internal.h"
17
18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
19 unsigned long offset, unsigned long head)
20{
21 unsigned long mask;
22
23 if (!rb->writable)
24 return true;
25
26 mask = perf_data_size(rb) - 1;
27
28 offset = (offset - tail) & mask;
29 head = (head - tail) & mask;
30
31 if ((int)(head - offset) < 0)
32 return false;
33
34 return true;
35}
36
37static void perf_output_wakeup(struct perf_output_handle *handle)
38{
39 atomic_set(&handle->rb->poll, POLL_IN);
40
41 handle->event->pending_wakeup = 1;
42 irq_work_queue(&handle->event->pending);
43}
44
45/*
46 * We need to ensure a later event_id doesn't publish a head when a former
47 * event isn't done writing. However since we need to deal with NMIs we
48 * cannot fully serialize things.
49 *
50 * We only publish the head (and generate a wakeup) when the outer-most
51 * event completes.
52 */
53static void perf_output_get_handle(struct perf_output_handle *handle)
54{
55 struct ring_buffer *rb = handle->rb;
56
57 preempt_disable();
58 local_inc(&rb->nest);
59 handle->wakeup = local_read(&rb->wakeup);
60}
61
62static void perf_output_put_handle(struct perf_output_handle *handle)
63{
64 struct ring_buffer *rb = handle->rb;
65 unsigned long head;
66
67again:
68 head = local_read(&rb->head);
69
70 /*
71 * IRQ/NMI can happen here, which means we can miss a head update.
72 */
73
74 if (!local_dec_and_test(&rb->nest))
75 goto out;
76
77 /*
78 * Publish the known good head. Rely on the full barrier implied
 79 * by atomic_dec_and_test() to order the rb->head read and this
80 * write.
81 */
82 rb->user_page->data_head = head;
83
84 /*
 85 * Now check if we missed an update; rely on the (compiler)
86 * barrier in atomic_dec_and_test() to re-read rb->head.
87 */
88 if (unlikely(head != local_read(&rb->head))) {
89 local_inc(&rb->nest);
90 goto again;
91 }
92
93 if (handle->wakeup != local_read(&rb->wakeup))
94 perf_output_wakeup(handle);
95
96out:
97 preempt_enable();
98}
99
100int perf_output_begin(struct perf_output_handle *handle,
101 struct perf_event *event, unsigned int size)
102{
103 struct ring_buffer *rb;
104 unsigned long tail, offset, head;
105 int have_lost;
106 struct perf_sample_data sample_data;
107 struct {
108 struct perf_event_header header;
109 u64 id;
110 u64 lost;
111 } lost_event;
112
113 rcu_read_lock();
114 /*
115 * For inherited events we send all the output towards the parent.
116 */
117 if (event->parent)
118 event = event->parent;
119
120 rb = rcu_dereference(event->rb);
121 if (!rb)
122 goto out;
123
124 handle->rb = rb;
125 handle->event = event;
126
127 if (!rb->nr_pages)
128 goto out;
129
130 have_lost = local_read(&rb->lost);
131 if (have_lost) {
132 lost_event.header.size = sizeof(lost_event);
133 perf_event_header__init_id(&lost_event.header, &sample_data,
134 event);
135 size += lost_event.header.size;
136 }
137
138 perf_output_get_handle(handle);
139
140 do {
141 /*
142 * Userspace could choose to issue a mb() before updating the
 143 * tail pointer, so that all reads will be completed before the
144 * write is issued.
145 */
146 tail = ACCESS_ONCE(rb->user_page->data_tail);
147 smp_rmb();
148 offset = head = local_read(&rb->head);
149 head += size;
150 if (unlikely(!perf_output_space(rb, tail, offset, head)))
151 goto fail;
152 } while (local_cmpxchg(&rb->head, offset, head) != offset);
153
154 if (head - local_read(&rb->wakeup) > rb->watermark)
155 local_add(rb->watermark, &rb->wakeup);
156
157 handle->page = offset >> (PAGE_SHIFT + page_order(rb));
158 handle->page &= rb->nr_pages - 1;
159 handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
160 handle->addr = rb->data_pages[handle->page];
161 handle->addr += handle->size;
162 handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
163
164 if (have_lost) {
165 lost_event.header.type = PERF_RECORD_LOST;
166 lost_event.header.misc = 0;
167 lost_event.id = event->id;
168 lost_event.lost = local_xchg(&rb->lost, 0);
169
170 perf_output_put(handle, lost_event);
171 perf_event__output_id_sample(event, handle, &sample_data);
172 }
173
174 return 0;
175
176fail:
177 local_inc(&rb->lost);
178 perf_output_put_handle(handle);
179out:
180 rcu_read_unlock();
181
182 return -ENOSPC;
183}
184
185void perf_output_copy(struct perf_output_handle *handle,
186 const void *buf, unsigned int len)
187{
188 __output_copy(handle, buf, len);
189}
190
191void perf_output_end(struct perf_output_handle *handle)
192{
193 perf_output_put_handle(handle);
194 rcu_read_unlock();
195}
196
197static void
198ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
199{
200 long max_size = perf_data_size(rb);
201
202 if (watermark)
203 rb->watermark = min(max_size, watermark);
204
205 if (!rb->watermark)
206 rb->watermark = max_size / 2;
207
208 if (flags & RING_BUFFER_WRITABLE)
209 rb->writable = 1;
210
211 atomic_set(&rb->refcount, 1);
212}
213
214#ifndef CONFIG_PERF_USE_VMALLOC
215
216/*
217 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
218 */
219
220struct page *
221perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
222{
223 if (pgoff > rb->nr_pages)
224 return NULL;
225
226 if (pgoff == 0)
227 return virt_to_page(rb->user_page);
228
229 return virt_to_page(rb->data_pages[pgoff - 1]);
230}
231
232static void *perf_mmap_alloc_page(int cpu)
233{
234 struct page *page;
235 int node;
236
237 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
238 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
239 if (!page)
240 return NULL;
241
242 return page_address(page);
243}
244
245struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
246{
247 struct ring_buffer *rb;
248 unsigned long size;
249 int i;
250
251 size = sizeof(struct ring_buffer);
252 size += nr_pages * sizeof(void *);
253
254 rb = kzalloc(size, GFP_KERNEL);
255 if (!rb)
256 goto fail;
257
258 rb->user_page = perf_mmap_alloc_page(cpu);
259 if (!rb->user_page)
260 goto fail_user_page;
261
262 for (i = 0; i < nr_pages; i++) {
263 rb->data_pages[i] = perf_mmap_alloc_page(cpu);
264 if (!rb->data_pages[i])
265 goto fail_data_pages;
266 }
267
268 rb->nr_pages = nr_pages;
269
270 ring_buffer_init(rb, watermark, flags);
271
272 return rb;
273
274fail_data_pages:
275 for (i--; i >= 0; i--)
276 free_page((unsigned long)rb->data_pages[i]);
277
278 free_page((unsigned long)rb->user_page);
279
280fail_user_page:
281 kfree(rb);
282
283fail:
284 return NULL;
285}
286
287static void perf_mmap_free_page(unsigned long addr)
288{
289 struct page *page = virt_to_page((void *)addr);
290
291 page->mapping = NULL;
292 __free_page(page);
293}
294
295void rb_free(struct ring_buffer *rb)
296{
297 int i;
298
299 perf_mmap_free_page((unsigned long)rb->user_page);
300 for (i = 0; i < rb->nr_pages; i++)
301 perf_mmap_free_page((unsigned long)rb->data_pages[i]);
302 kfree(rb);
303}
304
305#else
306
307struct page *
308perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
309{
310 if (pgoff > (1UL << page_order(rb)))
311 return NULL;
312
313 return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
314}
315
316static void perf_mmap_unmark_page(void *addr)
317{
318 struct page *page = vmalloc_to_page(addr);
319
320 page->mapping = NULL;
321}
322
323static void rb_free_work(struct work_struct *work)
324{
325 struct ring_buffer *rb;
326 void *base;
327 int i, nr;
328
329 rb = container_of(work, struct ring_buffer, work);
330 nr = 1 << page_order(rb);
331
332 base = rb->user_page;
333 for (i = 0; i < nr + 1; i++)
334 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
335
336 vfree(base);
337 kfree(rb);
338}
339
340void rb_free(struct ring_buffer *rb)
341{
342 schedule_work(&rb->work);
343}
344
345struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
346{
347 struct ring_buffer *rb;
348 unsigned long size;
349 void *all_buf;
350
351 size = sizeof(struct ring_buffer);
352 size += sizeof(void *);
353
354 rb = kzalloc(size, GFP_KERNEL);
355 if (!rb)
356 goto fail;
357
358 INIT_WORK(&rb->work, rb_free_work);
359
360 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
361 if (!all_buf)
362 goto fail_all_buf;
363
364 rb->user_page = all_buf;
365 rb->data_pages[0] = all_buf + PAGE_SIZE;
366 rb->page_order = ilog2(nr_pages);
367 rb->nr_pages = 1;
368
369 ring_buffer_init(rb, watermark, flags);
370
371 return rb;
372
373fail_all_buf:
374 kfree(rb);
375
376fail:
377 return NULL;
378}
379
380#endif
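Taken together, the new file gives record writers a three-step API: reserve space with perf_output_begin(), copy the payload with perf_output_put()/perf_output_copy(), and publish it with perf_output_end(). A minimal sketch of a hypothetical caller follows; the function name and the record layout are assumptions made for illustration, not something added by this patch.

static int my_event_output(struct perf_event *event, u64 value)
{
	struct perf_output_handle handle;
	struct {
		struct perf_event_header header;
		u64 value;
	} rec;
	int ret;

	rec.header.type = PERF_RECORD_SAMPLE;	/* illustrative record type */
	rec.header.misc = 0;
	rec.header.size = sizeof(rec);
	rec.value = value;

	/* Reserves rec.header.size bytes; returns -ENOSPC when the buffer is full. */
	ret = perf_output_begin(&handle, event, rec.header.size);
	if (ret)
		return ret;

	perf_output_put(&handle, rec);	/* __output_copy() of sizeof(rec) bytes */
	perf_output_end(&handle);	/* publishes data_head, may wake readers */
	return 0;
}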
diff --git a/kernel/exit.c b/kernel/exit.c
index 8dd874181542..2913b3509d42 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -85,7 +85,6 @@ static void __exit_signal(struct task_struct *tsk)
85 struct tty_struct *uninitialized_var(tty); 85 struct tty_struct *uninitialized_var(tty);
86 86
87 sighand = rcu_dereference_check(tsk->sighand, 87 sighand = rcu_dereference_check(tsk->sighand,
88 rcu_read_lock_held() ||
89 lockdep_tasklist_lock_is_held()); 88 lockdep_tasklist_lock_is_held());
90 spin_lock(&sighand->siglock); 89 spin_lock(&sighand->siglock);
91 90
@@ -169,7 +168,6 @@ void release_task(struct task_struct * p)
169 struct task_struct *leader; 168 struct task_struct *leader;
170 int zap_leader; 169 int zap_leader;
171repeat: 170repeat:
172 tracehook_prepare_release_task(p);
173 /* don't need to get the RCU readlock here - the process is dead and 171 /* don't need to get the RCU readlock here - the process is dead and
174 * can't be modifying its own credentials. But shut RCU-lockdep up */ 172 * can't be modifying its own credentials. But shut RCU-lockdep up */
175 rcu_read_lock(); 173 rcu_read_lock();
@@ -179,7 +177,7 @@ repeat:
179 proc_flush_task(p); 177 proc_flush_task(p);
180 178
181 write_lock_irq(&tasklist_lock); 179 write_lock_irq(&tasklist_lock);
182 tracehook_finish_release_task(p); 180 ptrace_release_task(p);
183 __exit_signal(p); 181 __exit_signal(p);
184 182
185 /* 183 /*
@@ -190,22 +188,12 @@ repeat:
190 zap_leader = 0; 188 zap_leader = 0;
191 leader = p->group_leader; 189 leader = p->group_leader;
192 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { 190 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
193 BUG_ON(task_detached(leader));
194 do_notify_parent(leader, leader->exit_signal);
195 /* 191 /*
196 * If we were the last child thread and the leader has 192 * If we were the last child thread and the leader has
197 * exited already, and the leader's parent ignores SIGCHLD, 193 * exited already, and the leader's parent ignores SIGCHLD,
198 * then we are the one who should release the leader. 194 * then we are the one who should release the leader.
199 *
200 * do_notify_parent() will have marked it self-reaping in
201 * that case.
202 */
203 zap_leader = task_detached(leader);
204
205 /*
206 * This maintains the invariant that release_task()
207 * only runs on a task in EXIT_DEAD, just for sanity.
208 */ 195 */
196 zap_leader = do_notify_parent(leader, leader->exit_signal);
209 if (zap_leader) 197 if (zap_leader)
210 leader->exit_state = EXIT_DEAD; 198 leader->exit_state = EXIT_DEAD;
211 } 199 }
@@ -277,18 +265,16 @@ int is_current_pgrp_orphaned(void)
277 return retval; 265 return retval;
278} 266}
279 267
280static int has_stopped_jobs(struct pid *pgrp) 268static bool has_stopped_jobs(struct pid *pgrp)
281{ 269{
282 int retval = 0;
283 struct task_struct *p; 270 struct task_struct *p;
284 271
285 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 272 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
286 if (!task_is_stopped(p)) 273 if (p->signal->flags & SIGNAL_STOP_STOPPED)
287 continue; 274 return true;
288 retval = 1;
289 break;
290 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 275 } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
291 return retval; 276
277 return false;
292} 278}
293 279
294/* 280/*
@@ -561,29 +547,28 @@ void exit_files(struct task_struct *tsk)
561 547
562#ifdef CONFIG_MM_OWNER 548#ifdef CONFIG_MM_OWNER
563/* 549/*
564 * Task p is exiting and it owned mm, lets find a new owner for it 550 * A task is exiting. If it owned this mm, find a new owner for the mm.
565 */ 551 */
566static inline int
567mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
568{
569 /*
570 * If there are other users of the mm and the owner (us) is exiting
571 * we need to find a new owner to take on the responsibility.
572 */
573 if (atomic_read(&mm->mm_users) <= 1)
574 return 0;
575 if (mm->owner != p)
576 return 0;
577 return 1;
578}
579
580void mm_update_next_owner(struct mm_struct *mm) 552void mm_update_next_owner(struct mm_struct *mm)
581{ 553{
582 struct task_struct *c, *g, *p = current; 554 struct task_struct *c, *g, *p = current;
583 555
584retry: 556retry:
585 if (!mm_need_new_owner(mm, p)) 557 /*
558 * If the exiting or execing task is not the owner, it's
559 * someone else's problem.
560 */
561 if (mm->owner != p)
586 return; 562 return;
563 /*
564 * The current owner is exiting/execing and there are no other
565 * candidates. Do not leave the mm pointing to a possibly
566 * freed task structure.
567 */
568 if (atomic_read(&mm->mm_users) <= 1) {
569 mm->owner = NULL;
570 return;
571 }
587 572
588 read_lock(&tasklist_lock); 573 read_lock(&tasklist_lock);
589 /* 574 /*
@@ -752,7 +737,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
752{ 737{
753 list_move_tail(&p->sibling, &p->real_parent->children); 738 list_move_tail(&p->sibling, &p->real_parent->children);
754 739
755 if (task_detached(p)) 740 if (p->exit_state == EXIT_DEAD)
756 return; 741 return;
757 /* 742 /*
758 * If this is a threaded reparent there is no need to 743 * If this is a threaded reparent there is no need to
@@ -765,10 +750,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
765 p->exit_signal = SIGCHLD; 750 p->exit_signal = SIGCHLD;
766 751
767 /* If it has exited notify the new parent about this child's death. */ 752 /* If it has exited notify the new parent about this child's death. */
768 if (!task_ptrace(p) && 753 if (!p->ptrace &&
769 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 754 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
770 do_notify_parent(p, p->exit_signal); 755 if (do_notify_parent(p, p->exit_signal)) {
771 if (task_detached(p)) {
772 p->exit_state = EXIT_DEAD; 756 p->exit_state = EXIT_DEAD;
773 list_move_tail(&p->sibling, dead); 757 list_move_tail(&p->sibling, dead);
774 } 758 }
@@ -795,7 +779,7 @@ static void forget_original_parent(struct task_struct *father)
795 do { 779 do {
796 t->real_parent = reaper; 780 t->real_parent = reaper;
797 if (t->parent == father) { 781 if (t->parent == father) {
798 BUG_ON(task_ptrace(t)); 782 BUG_ON(t->ptrace);
799 t->parent = t->real_parent; 783 t->parent = t->real_parent;
800 } 784 }
801 if (t->pdeath_signal) 785 if (t->pdeath_signal)
@@ -820,8 +804,7 @@ static void forget_original_parent(struct task_struct *father)
820 */ 804 */
821static void exit_notify(struct task_struct *tsk, int group_dead) 805static void exit_notify(struct task_struct *tsk, int group_dead)
822{ 806{
823 int signal; 807 bool autoreap;
824 void *cookie;
825 808
826 /* 809 /*
827 * This does two things: 810 * This does two things:
@@ -852,26 +835,33 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
852 * we have changed execution domain as these two values started 835 * we have changed execution domain as these two values started
853 * the same after a fork. 836 * the same after a fork.
854 */ 837 */
855 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && 838 if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
856 (tsk->parent_exec_id != tsk->real_parent->self_exec_id || 839 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
857 tsk->self_exec_id != tsk->parent_exec_id)) 840 tsk->self_exec_id != tsk->parent_exec_id))
858 tsk->exit_signal = SIGCHLD; 841 tsk->exit_signal = SIGCHLD;
859 842
860 signal = tracehook_notify_death(tsk, &cookie, group_dead); 843 if (unlikely(tsk->ptrace)) {
861 if (signal >= 0) 844 int sig = thread_group_leader(tsk) &&
862 signal = do_notify_parent(tsk, signal); 845 thread_group_empty(tsk) &&
846 !ptrace_reparented(tsk) ?
847 tsk->exit_signal : SIGCHLD;
848 autoreap = do_notify_parent(tsk, sig);
849 } else if (thread_group_leader(tsk)) {
850 autoreap = thread_group_empty(tsk) &&
851 do_notify_parent(tsk, tsk->exit_signal);
852 } else {
853 autoreap = true;
854 }
863 855
864 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; 856 tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
865 857
866 /* mt-exec, de_thread() is waiting for group leader */ 858 /* mt-exec, de_thread() is waiting for group leader */
867 if (unlikely(tsk->signal->notify_count < 0)) 859 if (unlikely(tsk->signal->notify_count < 0))
868 wake_up_process(tsk->signal->group_exit_task); 860 wake_up_process(tsk->signal->group_exit_task);
869 write_unlock_irq(&tasklist_lock); 861 write_unlock_irq(&tasklist_lock);
870 862
871 tracehook_report_death(tsk, signal, cookie, group_dead);
872
873 /* If the process is dead, release it - nobody will wait for it */ 863 /* If the process is dead, release it - nobody will wait for it */
874 if (signal == DEATH_REAP) 864 if (autoreap)
875 release_task(tsk); 865 release_task(tsk);
876} 866}
877 867
@@ -907,7 +897,6 @@ NORET_TYPE void do_exit(long code)
907 897
908 profile_task_exit(tsk); 898 profile_task_exit(tsk);
909 899
910 WARN_ON(atomic_read(&tsk->fs_excl));
911 WARN_ON(blk_needs_flush_plug(tsk)); 900 WARN_ON(blk_needs_flush_plug(tsk));
912 901
913 if (unlikely(in_interrupt())) 902 if (unlikely(in_interrupt()))
@@ -924,7 +913,7 @@ NORET_TYPE void do_exit(long code)
924 */ 913 */
925 set_fs(USER_DS); 914 set_fs(USER_DS);
926 915
927 tracehook_report_exit(&code); 916 ptrace_event(PTRACE_EVENT_EXIT, code);
928 917
929 validate_creds_for_do_exit(tsk); 918 validate_creds_for_do_exit(tsk);
930 919
@@ -991,6 +980,7 @@ NORET_TYPE void do_exit(long code)
991 trace_sched_process_exit(tsk); 980 trace_sched_process_exit(tsk);
992 981
993 exit_sem(tsk); 982 exit_sem(tsk);
983 exit_shm(tsk);
994 exit_files(tsk); 984 exit_files(tsk);
995 exit_fs(tsk); 985 exit_fs(tsk);
996 check_stack_usage(); 986 check_stack_usage();
@@ -1236,9 +1226,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1236 traced = ptrace_reparented(p); 1226 traced = ptrace_reparented(p);
1237 /* 1227 /*
1238 * It can be ptraced but not reparented, check 1228 * It can be ptraced but not reparented, check
1239 * !task_detached() to filter out sub-threads. 1229 * thread_group_leader() to filter out sub-threads.
1240 */ 1230 */
1241 if (likely(!traced) && likely(!task_detached(p))) { 1231 if (likely(!traced) && thread_group_leader(p)) {
1242 struct signal_struct *psig; 1232 struct signal_struct *psig;
1243 struct signal_struct *sig; 1233 struct signal_struct *sig;
1244 unsigned long maxrss; 1234 unsigned long maxrss;
@@ -1346,16 +1336,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1346 /* We dropped tasklist, ptracer could die and untrace */ 1336 /* We dropped tasklist, ptracer could die and untrace */
1347 ptrace_unlink(p); 1337 ptrace_unlink(p);
1348 /* 1338 /*
1349 * If this is not a detached task, notify the parent. 1339 * If this is not a sub-thread, notify the parent.
1350 * If it's still not detached after that, don't release 1340 * If parent wants a zombie, don't release it now.
1351 * it now.
1352 */ 1341 */
1353 if (!task_detached(p)) { 1342 if (thread_group_leader(p) &&
1354 do_notify_parent(p, p->exit_signal); 1343 !do_notify_parent(p, p->exit_signal)) {
1355 if (!task_detached(p)) { 1344 p->exit_state = EXIT_ZOMBIE;
1356 p->exit_state = EXIT_ZOMBIE; 1345 p = NULL;
1357 p = NULL;
1358 }
1359 } 1346 }
1360 write_unlock_irq(&tasklist_lock); 1347 write_unlock_irq(&tasklist_lock);
1361 } 1348 }
@@ -1368,7 +1355,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1368static int *task_stopped_code(struct task_struct *p, bool ptrace) 1355static int *task_stopped_code(struct task_struct *p, bool ptrace)
1369{ 1356{
1370 if (ptrace) { 1357 if (ptrace) {
1371 if (task_is_stopped_or_traced(p)) 1358 if (task_is_stopped_or_traced(p) &&
1359 !(p->jobctl & JOBCTL_LISTENING))
1372 return &p->exit_code; 1360 return &p->exit_code;
1373 } else { 1361 } else {
1374 if (p->signal->flags & SIGNAL_STOP_STOPPED) 1362 if (p->signal->flags & SIGNAL_STOP_STOPPED)
@@ -1377,11 +1365,23 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
1377 return NULL; 1365 return NULL;
1378} 1366}
1379 1367
1380/* 1368/**
1381 * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold 1369 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1382 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold 1370 * @wo: wait options
1383 * the lock and this task is uninteresting. If we return nonzero, we have 1371 * @ptrace: is the wait for ptrace
1384 * released the lock and the system call should return. 1372 * @p: task to wait for
1373 *
1374 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1375 *
1376 * CONTEXT:
1377 * read_lock(&tasklist_lock), which is released if return value is
1378 * non-zero. Also, grabs and releases @p->sighand->siglock.
1379 *
1380 * RETURNS:
1381 * 0 if wait condition didn't exist and search for other wait conditions
1382 * should continue. Non-zero return, -errno on failure and @p's pid on
1383 * success, implies that tasklist_lock is released and wait condition
1384 * search should terminate.
1385 */ 1385 */
1386static int wait_task_stopped(struct wait_opts *wo, 1386static int wait_task_stopped(struct wait_opts *wo,
1387 int ptrace, struct task_struct *p) 1387 int ptrace, struct task_struct *p)
@@ -1397,6 +1397,9 @@ static int wait_task_stopped(struct wait_opts *wo,
1397 if (!ptrace && !(wo->wo_flags & WUNTRACED)) 1397 if (!ptrace && !(wo->wo_flags & WUNTRACED))
1398 return 0; 1398 return 0;
1399 1399
1400 if (!task_stopped_code(p, ptrace))
1401 return 0;
1402
1400 exit_code = 0; 1403 exit_code = 0;
1401 spin_lock_irq(&p->sighand->siglock); 1404 spin_lock_irq(&p->sighand->siglock);
1402 1405
@@ -1538,33 +1541,83 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1538 return 0; 1541 return 0;
1539 } 1542 }
1540 1543
1541 if (likely(!ptrace) && unlikely(task_ptrace(p))) { 1544 /* dead body doesn't have much to contribute */
1545 if (p->exit_state == EXIT_DEAD)
1546 return 0;
1547
1548 /* slay zombie? */
1549 if (p->exit_state == EXIT_ZOMBIE) {
1550 /*
1551 * A zombie ptracee is only visible to its ptracer.
1552 * Notification and reaping will be cascaded to the real
1553 * parent when the ptracer detaches.
1554 */
1555 if (likely(!ptrace) && unlikely(p->ptrace)) {
1556 /* it will become visible, clear notask_error */
1557 wo->notask_error = 0;
1558 return 0;
1559 }
1560
1561 /* we don't reap group leaders with subthreads */
1562 if (!delay_group_leader(p))
1563 return wait_task_zombie(wo, p);
1564
1565 /*
1566 * Allow access to stopped/continued state via zombie by
1567 * falling through. Clearing of notask_error is complex.
1568 *
1569 * When !@ptrace:
1570 *
1571 * If WEXITED is set, notask_error should naturally be
1572 * cleared. If not, subset of WSTOPPED|WCONTINUED is set,
1573 * so, if there are live subthreads, there are events to
1574 * wait for. If all subthreads are dead, it's still safe
 1575 * to clear - this function will be called again in a finite
 1576 * amount of time once all the subthreads are released and
1577 * will then return without clearing.
1578 *
1579 * When @ptrace:
1580 *
1581 * Stopped state is per-task and thus can't change once the
1582 * target task dies. Only continued and exited can happen.
1583 * Clear notask_error if WCONTINUED | WEXITED.
1584 */
1585 if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
1586 wo->notask_error = 0;
1587 } else {
1588 /*
1589 * If @p is ptraced by a task in its real parent's group,
1590 * hide group stop/continued state when looking at @p as
1591 * the real parent; otherwise, a single stop can be
1592 * reported twice as group and ptrace stops.
1593 *
1594 * If a ptracer wants to distinguish the two events for its
1595 * own children, it should create a separate process which
1596 * takes the role of real parent.
1597 */
1598 if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p))
1599 return 0;
1600
1542 /* 1601 /*
1543 * This child is hidden by ptrace. 1602 * @p is alive and it's gonna stop, continue or exit, so
1544 * We aren't allowed to see it now, but eventually we will. 1603 * there always is something to wait for.
1545 */ 1604 */
1546 wo->notask_error = 0; 1605 wo->notask_error = 0;
1547 return 0;
1548 } 1606 }
1549 1607
1550 if (p->exit_state == EXIT_DEAD)
1551 return 0;
1552
1553 /* 1608 /*
1554 * We don't reap group leaders with subthreads. 1609 * Wait for stopped. Depending on @ptrace, different stopped state
1610 * is used and the two don't interact with each other.
1555 */ 1611 */
1556 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) 1612 ret = wait_task_stopped(wo, ptrace, p);
1557 return wait_task_zombie(wo, p); 1613 if (ret)
1614 return ret;
1558 1615
1559 /* 1616 /*
1560 * It's stopped or running now, so it might 1617 * Wait for continued. There's only one continued state and the
1561 * later continue, exit, or stop again. 1618 * ptracer can consume it which can confuse the real parent. Don't
1619 * use WCONTINUED from ptracer. You don't need or want it.
1562 */ 1620 */
1563 wo->notask_error = 0;
1564
1565 if (task_stopped_code(p, ptrace))
1566 return wait_task_stopped(wo, ptrace, p);
1567
1568 return wait_task_continued(wo, p); 1621 return wait_task_continued(wo, p);
1569} 1622}
1570 1623
diff --git a/kernel/extable.c b/kernel/extable.c
index 7f8f263f8524..5339705b8241 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -72,6 +72,24 @@ int core_kernel_text(unsigned long addr)
72 return 0; 72 return 0;
73} 73}
74 74
75/**
76 * core_kernel_data - tell if addr points to kernel data
77 * @addr: address to test
78 *
79 * Returns true if @addr passed in is from the core kernel data
80 * section.
81 *
82 * Note: On some archs it may return true for core RODATA, and false
83 * for others. But will always be true for core RW data.
84 */
85int core_kernel_data(unsigned long addr)
86{
87 if (addr >= (unsigned long)_sdata &&
88 addr < (unsigned long)_edata)
89 return 1;
90 return 0;
91}
92
75int __kernel_text_address(unsigned long addr) 93int __kernel_text_address(unsigned long addr)
76{ 94{
77 if (core_kernel_text(addr)) 95 if (core_kernel_text(addr))
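The new core_kernel_data() is only a bounds check against the _sdata.._edata section markers. One plausible use, shown purely as an assumption (the caller below is not part of this patch), is telling objects built into the kernel image apart from heap-allocated ones:

static bool obj_is_static(const void *obj)
{
	/*
	 * Anything inside the core .data section was compiled into the
	 * image and must not be handed to kfree().
	 */
	return core_kernel_data((unsigned long)obj);
}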
diff --git a/kernel/fork.c b/kernel/fork.c
index e7548dee636b..8e6b6f4fb272 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,7 +37,6 @@
37#include <linux/swap.h> 37#include <linux/swap.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/jiffies.h> 39#include <linux/jiffies.h>
40#include <linux/tracehook.h>
41#include <linux/futex.h> 40#include <linux/futex.h>
42#include <linux/compat.h> 41#include <linux/compat.h>
43#include <linux/kthread.h> 42#include <linux/kthread.h>
@@ -59,7 +58,6 @@
59#include <linux/taskstats_kern.h> 58#include <linux/taskstats_kern.h>
60#include <linux/random.h> 59#include <linux/random.h>
61#include <linux/tty.h> 60#include <linux/tty.h>
62#include <linux/proc_fs.h>
63#include <linux/blkdev.h> 61#include <linux/blkdev.h>
64#include <linux/fs_struct.h> 62#include <linux/fs_struct.h>
65#include <linux/magic.h> 63#include <linux/magic.h>
@@ -82,7 +80,7 @@
82 * Protected counters by write_lock_irq(&tasklist_lock) 80 * Protected counters by write_lock_irq(&tasklist_lock)
83 */ 81 */
84unsigned long total_forks; /* Handle normal Linux uptimes. */ 82unsigned long total_forks; /* Handle normal Linux uptimes. */
85int nr_threads; /* The idle threads do not count.. */ 83int nr_threads; /* The idle threads do not count.. */
86 84
87int max_threads; /* tunable limit on nr_threads */ 85int max_threads; /* tunable limit on nr_threads */
88 86
@@ -234,7 +232,7 @@ void __init fork_init(unsigned long mempages)
234 /* 232 /*
235 * we need to allow at least 20 threads to boot a system 233 * we need to allow at least 20 threads to boot a system
236 */ 234 */
237 if(max_threads < 20) 235 if (max_threads < 20)
238 max_threads = 20; 236 max_threads = 20;
239 237
240 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; 238 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
@@ -270,7 +268,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
270 return NULL; 268 return NULL;
271 } 269 }
272 270
273 err = arch_dup_task_struct(tsk, orig); 271 err = arch_dup_task_struct(tsk, orig);
274 if (err) 272 if (err)
275 goto out; 273 goto out;
276 274
@@ -290,9 +288,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
290 tsk->stack_canary = get_random_int(); 288 tsk->stack_canary = get_random_int();
291#endif 289#endif
292 290
293 /* One for us, one for whoever does the "release_task()" (usually parent) */ 291 /*
294 atomic_set(&tsk->usage,2); 292 * One for us, one for whoever does the "release_task()" (usually
295 atomic_set(&tsk->fs_excl, 0); 293 * parent)
294 */
295 atomic_set(&tsk->usage, 2);
296#ifdef CONFIG_BLK_DEV_IO_TRACE 296#ifdef CONFIG_BLK_DEV_IO_TRACE
297 tsk->btrace_seq = 0; 297 tsk->btrace_seq = 0;
298#endif 298#endif
@@ -383,15 +383,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
383 get_file(file); 383 get_file(file);
384 if (tmp->vm_flags & VM_DENYWRITE) 384 if (tmp->vm_flags & VM_DENYWRITE)
385 atomic_dec(&inode->i_writecount); 385 atomic_dec(&inode->i_writecount);
386 spin_lock(&mapping->i_mmap_lock); 386 mutex_lock(&mapping->i_mmap_mutex);
387 if (tmp->vm_flags & VM_SHARED) 387 if (tmp->vm_flags & VM_SHARED)
388 mapping->i_mmap_writable++; 388 mapping->i_mmap_writable++;
389 tmp->vm_truncate_count = mpnt->vm_truncate_count;
390 flush_dcache_mmap_lock(mapping); 389 flush_dcache_mmap_lock(mapping);
391 /* insert tmp into the share list, just after mpnt */ 390 /* insert tmp into the share list, just after mpnt */
392 vma_prio_tree_add(tmp, mpnt); 391 vma_prio_tree_add(tmp, mpnt);
393 flush_dcache_mmap_unlock(mapping); 392 flush_dcache_mmap_unlock(mapping);
394 spin_unlock(&mapping->i_mmap_lock); 393 mutex_unlock(&mapping->i_mmap_mutex);
395 } 394 }
396 395
397 /* 396 /*
@@ -441,7 +440,7 @@ fail_nomem:
441 goto out; 440 goto out;
442} 441}
443 442
444static inline int mm_alloc_pgd(struct mm_struct * mm) 443static inline int mm_alloc_pgd(struct mm_struct *mm)
445{ 444{
446 mm->pgd = pgd_alloc(mm); 445 mm->pgd = pgd_alloc(mm);
447 if (unlikely(!mm->pgd)) 446 if (unlikely(!mm->pgd))
@@ -449,7 +448,7 @@ static inline int mm_alloc_pgd(struct mm_struct * mm)
449 return 0; 448 return 0;
450} 449}
451 450
452static inline void mm_free_pgd(struct mm_struct * mm) 451static inline void mm_free_pgd(struct mm_struct *mm)
453{ 452{
454 pgd_free(mm, mm->pgd); 453 pgd_free(mm, mm->pgd);
455} 454}
@@ -486,7 +485,7 @@ static void mm_init_aio(struct mm_struct *mm)
486#endif 485#endif
487} 486}
488 487
489static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) 488static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
490{ 489{
491 atomic_set(&mm->mm_users, 1); 490 atomic_set(&mm->mm_users, 1);
492 atomic_set(&mm->mm_count, 1); 491 atomic_set(&mm->mm_count, 1);
@@ -517,16 +516,17 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
517/* 516/*
518 * Allocate and initialize an mm_struct. 517 * Allocate and initialize an mm_struct.
519 */ 518 */
520struct mm_struct * mm_alloc(void) 519struct mm_struct *mm_alloc(void)
521{ 520{
522 struct mm_struct * mm; 521 struct mm_struct *mm;
523 522
524 mm = allocate_mm(); 523 mm = allocate_mm();
525 if (mm) { 524 if (!mm)
526 memset(mm, 0, sizeof(*mm)); 525 return NULL;
527 mm = mm_init(mm, current); 526
528 } 527 memset(mm, 0, sizeof(*mm));
529 return mm; 528 mm_init_cpumask(mm);
529 return mm_init(mm, current);
530} 530}
531 531
532/* 532/*
@@ -573,6 +573,57 @@ void mmput(struct mm_struct *mm)
573} 573}
574EXPORT_SYMBOL_GPL(mmput); 574EXPORT_SYMBOL_GPL(mmput);
575 575
576/*
577 * We added or removed a vma mapping the executable. The vmas are only mapped
578 * during exec and are not mapped with the mmap system call.
579 * Callers must hold down_write() on the mm's mmap_sem for these
580 */
581void added_exe_file_vma(struct mm_struct *mm)
582{
583 mm->num_exe_file_vmas++;
584}
585
586void removed_exe_file_vma(struct mm_struct *mm)
587{
588 mm->num_exe_file_vmas--;
589 if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
590 fput(mm->exe_file);
591 mm->exe_file = NULL;
592 }
593
594}
595
596void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
597{
598 if (new_exe_file)
599 get_file(new_exe_file);
600 if (mm->exe_file)
601 fput(mm->exe_file);
602 mm->exe_file = new_exe_file;
603 mm->num_exe_file_vmas = 0;
604}
605
606struct file *get_mm_exe_file(struct mm_struct *mm)
607{
608 struct file *exe_file;
609
610 /* We need mmap_sem to protect against races with removal of
611 * VM_EXECUTABLE vmas */
612 down_read(&mm->mmap_sem);
613 exe_file = mm->exe_file;
614 if (exe_file)
615 get_file(exe_file);
616 up_read(&mm->mmap_sem);
617 return exe_file;
618}
619
620static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
621{
622 /* It's safe to write the exe_file pointer without exe_file_lock because
623 * this is called during fork when the task is not yet in /proc */
624 newmm->exe_file = get_mm_exe_file(oldmm);
625}
626
576/** 627/**
577 * get_task_mm - acquire a reference to the task's mm 628 * get_task_mm - acquire a reference to the task's mm
578 * 629 *
@@ -679,6 +730,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
679 goto fail_nomem; 730 goto fail_nomem;
680 731
681 memcpy(mm, oldmm, sizeof(*mm)); 732 memcpy(mm, oldmm, sizeof(*mm));
733 mm_init_cpumask(mm);
682 734
683 /* Initializing for Swap token stuff */ 735 /* Initializing for Swap token stuff */
684 mm->token_priority = 0; 736 mm->token_priority = 0;
@@ -726,9 +778,9 @@ fail_nocontext:
726 return NULL; 778 return NULL;
727} 779}
728 780
729static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) 781static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
730{ 782{
731 struct mm_struct * mm, *oldmm; 783 struct mm_struct *mm, *oldmm;
732 int retval; 784 int retval;
733 785
734 tsk->min_flt = tsk->maj_flt = 0; 786 tsk->min_flt = tsk->maj_flt = 0;
@@ -795,7 +847,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
795 return 0; 847 return 0;
796} 848}
797 849
798static int copy_files(unsigned long clone_flags, struct task_struct * tsk) 850static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
799{ 851{
800 struct files_struct *oldf, *newf; 852 struct files_struct *oldf, *newf;
801 int error = 0; 853 int error = 0;
@@ -927,6 +979,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
927 tty_audit_fork(sig); 979 tty_audit_fork(sig);
928 sched_autogroup_fork(sig); 980 sched_autogroup_fork(sig);
929 981
982#ifdef CONFIG_CGROUPS
983 init_rwsem(&sig->threadgroup_fork_lock);
984#endif
985
930 sig->oom_adj = current->signal->oom_adj; 986 sig->oom_adj = current->signal->oom_adj;
931 sig->oom_score_adj = current->signal->oom_score_adj; 987 sig->oom_score_adj = current->signal->oom_score_adj;
932 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 988 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -958,7 +1014,7 @@ static void rt_mutex_init_task(struct task_struct *p)
958{ 1014{
959 raw_spin_lock_init(&p->pi_lock); 1015 raw_spin_lock_init(&p->pi_lock);
960#ifdef CONFIG_RT_MUTEXES 1016#ifdef CONFIG_RT_MUTEXES
961 plist_head_init_raw(&p->pi_waiters, &p->pi_lock); 1017 plist_head_init(&p->pi_waiters);
962 p->pi_blocked_on = NULL; 1018 p->pi_blocked_on = NULL;
963#endif 1019#endif
964} 1020}
@@ -1055,6 +1111,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1055 p->real_cred->user != INIT_USER) 1111 p->real_cred->user != INIT_USER)
1056 goto bad_fork_free; 1112 goto bad_fork_free;
1057 } 1113 }
1114 current->flags &= ~PF_NPROC_EXCEEDED;
1058 1115
1059 retval = copy_creds(p, clone_flags); 1116 retval = copy_creds(p, clone_flags);
1060 if (retval < 0) 1117 if (retval < 0)
@@ -1103,22 +1160,27 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1103 1160
1104 posix_cpu_timers_init(p); 1161 posix_cpu_timers_init(p);
1105 1162
1106 p->lock_depth = -1; /* -1 = no lock */
1107 do_posix_clock_monotonic_gettime(&p->start_time); 1163 do_posix_clock_monotonic_gettime(&p->start_time);
1108 p->real_start_time = p->start_time; 1164 p->real_start_time = p->start_time;
1109 monotonic_to_bootbased(&p->real_start_time); 1165 monotonic_to_bootbased(&p->real_start_time);
1110 p->io_context = NULL; 1166 p->io_context = NULL;
1111 p->audit_context = NULL; 1167 p->audit_context = NULL;
1168 if (clone_flags & CLONE_THREAD)
1169 threadgroup_fork_read_lock(current);
1112 cgroup_fork(p); 1170 cgroup_fork(p);
1113#ifdef CONFIG_NUMA 1171#ifdef CONFIG_NUMA
1114 p->mempolicy = mpol_dup(p->mempolicy); 1172 p->mempolicy = mpol_dup(p->mempolicy);
1115 if (IS_ERR(p->mempolicy)) { 1173 if (IS_ERR(p->mempolicy)) {
1116 retval = PTR_ERR(p->mempolicy); 1174 retval = PTR_ERR(p->mempolicy);
1117 p->mempolicy = NULL; 1175 p->mempolicy = NULL;
1118 goto bad_fork_cleanup_cgroup; 1176 goto bad_fork_cleanup_cgroup;
1119 } 1177 }
1120 mpol_fix_fork_child_flag(p); 1178 mpol_fix_fork_child_flag(p);
1121#endif 1179#endif
1180#ifdef CONFIG_CPUSETS
1181 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
1182 p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
1183#endif
1122#ifdef CONFIG_TRACE_IRQFLAGS 1184#ifdef CONFIG_TRACE_IRQFLAGS
1123 p->irq_events = 0; 1185 p->irq_events = 0;
1124#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 1186#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
@@ -1153,30 +1215,38 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1153#endif 1215#endif
1154 1216
1155 /* Perform scheduler related setup. Assign this task to a CPU. */ 1217 /* Perform scheduler related setup. Assign this task to a CPU. */
1156 sched_fork(p, clone_flags); 1218 sched_fork(p);
1157 1219
1158 retval = perf_event_init_task(p); 1220 retval = perf_event_init_task(p);
1159 if (retval) 1221 if (retval)
1160 goto bad_fork_cleanup_policy; 1222 goto bad_fork_cleanup_policy;
1161 1223 retval = audit_alloc(p);
1162 if ((retval = audit_alloc(p))) 1224 if (retval)
1163 goto bad_fork_cleanup_policy; 1225 goto bad_fork_cleanup_policy;
1164 /* copy all the process information */ 1226 /* copy all the process information */
1165 if ((retval = copy_semundo(clone_flags, p))) 1227 retval = copy_semundo(clone_flags, p);
1228 if (retval)
1166 goto bad_fork_cleanup_audit; 1229 goto bad_fork_cleanup_audit;
1167 if ((retval = copy_files(clone_flags, p))) 1230 retval = copy_files(clone_flags, p);
1231 if (retval)
1168 goto bad_fork_cleanup_semundo; 1232 goto bad_fork_cleanup_semundo;
1169 if ((retval = copy_fs(clone_flags, p))) 1233 retval = copy_fs(clone_flags, p);
1234 if (retval)
1170 goto bad_fork_cleanup_files; 1235 goto bad_fork_cleanup_files;
1171 if ((retval = copy_sighand(clone_flags, p))) 1236 retval = copy_sighand(clone_flags, p);
1237 if (retval)
1172 goto bad_fork_cleanup_fs; 1238 goto bad_fork_cleanup_fs;
1173 if ((retval = copy_signal(clone_flags, p))) 1239 retval = copy_signal(clone_flags, p);
1240 if (retval)
1174 goto bad_fork_cleanup_sighand; 1241 goto bad_fork_cleanup_sighand;
1175 if ((retval = copy_mm(clone_flags, p))) 1242 retval = copy_mm(clone_flags, p);
1243 if (retval)
1176 goto bad_fork_cleanup_signal; 1244 goto bad_fork_cleanup_signal;
1177 if ((retval = copy_namespaces(clone_flags, p))) 1245 retval = copy_namespaces(clone_flags, p);
1246 if (retval)
1178 goto bad_fork_cleanup_mm; 1247 goto bad_fork_cleanup_mm;
1179 if ((retval = copy_io(clone_flags, p))) 1248 retval = copy_io(clone_flags, p);
1249 if (retval)
1180 goto bad_fork_cleanup_namespaces; 1250 goto bad_fork_cleanup_namespaces;
1181 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); 1251 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
1182 if (retval) 1252 if (retval)
@@ -1194,17 +1264,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1194 if (clone_flags & CLONE_THREAD) 1264 if (clone_flags & CLONE_THREAD)
1195 p->tgid = current->tgid; 1265 p->tgid = current->tgid;
1196 1266
1197 if (current->nsproxy != p->nsproxy) {
1198 retval = ns_cgroup_clone(p, pid);
1199 if (retval)
1200 goto bad_fork_free_pid;
1201 }
1202
1203 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1267 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1204 /* 1268 /*
1205 * Clear TID on mm_release()? 1269 * Clear TID on mm_release()?
1206 */ 1270 */
1207 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; 1271 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
1208#ifdef CONFIG_BLOCK 1272#ifdef CONFIG_BLOCK
1209 p->plug = NULL; 1273 p->plug = NULL;
1210#endif 1274#endif
@@ -1272,7 +1336,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1272 * it's process group. 1336 * it's process group.
1273 * A fatal signal pending means that current will exit, so the new 1337 * A fatal signal pending means that current will exit, so the new
1274 * thread can't slip out of an OOM kill (or normal SIGKILL). 1338 * thread can't slip out of an OOM kill (or normal SIGKILL).
1275 */ 1339 */
1276 recalc_sigpending(); 1340 recalc_sigpending();
1277 if (signal_pending(current)) { 1341 if (signal_pending(current)) {
1278 spin_unlock(&current->sighand->siglock); 1342 spin_unlock(&current->sighand->siglock);
@@ -1290,7 +1354,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1290 } 1354 }
1291 1355
1292 if (likely(p->pid)) { 1356 if (likely(p->pid)) {
1293 tracehook_finish_clone(p, clone_flags, trace); 1357 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
1294 1358
1295 if (thread_group_leader(p)) { 1359 if (thread_group_leader(p)) {
1296 if (is_child_reaper(pid)) 1360 if (is_child_reaper(pid))
@@ -1313,6 +1377,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1313 write_unlock_irq(&tasklist_lock); 1377 write_unlock_irq(&tasklist_lock);
1314 proc_fork_connector(p); 1378 proc_fork_connector(p);
1315 cgroup_post_fork(p); 1379 cgroup_post_fork(p);
1380 if (clone_flags & CLONE_THREAD)
1381 threadgroup_fork_read_unlock(current);
1316 perf_event_fork(p); 1382 perf_event_fork(p);
1317 return p; 1383 return p;
1318 1384
@@ -1351,6 +1417,8 @@ bad_fork_cleanup_policy:
1351 mpol_put(p->mempolicy); 1417 mpol_put(p->mempolicy);
1352bad_fork_cleanup_cgroup: 1418bad_fork_cleanup_cgroup:
1353#endif 1419#endif
1420 if (clone_flags & CLONE_THREAD)
1421 threadgroup_fork_read_unlock(current);
1354 cgroup_exit(p, cgroup_callbacks_done); 1422 cgroup_exit(p, cgroup_callbacks_done);
1355 delayacct_tsk_free(p); 1423 delayacct_tsk_free(p);
1356 module_put(task_thread_info(p)->exec_domain->module); 1424 module_put(task_thread_info(p)->exec_domain->module);
@@ -1427,10 +1495,22 @@ long do_fork(unsigned long clone_flags,
1427 } 1495 }
1428 1496
1429 /* 1497 /*
1430 * When called from kernel_thread, don't do user tracing stuff. 1498 * Determine whether and which event to report to ptracer. When
1499 * called from kernel_thread or CLONE_UNTRACED is explicitly
1500 * requested, no event is reported; otherwise, report if the event
1501 * for the type of forking is enabled.
1431 */ 1502 */
1432 if (likely(user_mode(regs))) 1503 if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
1433 trace = tracehook_prepare_clone(clone_flags); 1504 if (clone_flags & CLONE_VFORK)
1505 trace = PTRACE_EVENT_VFORK;
1506 else if ((clone_flags & CSIGNAL) != SIGCHLD)
1507 trace = PTRACE_EVENT_CLONE;
1508 else
1509 trace = PTRACE_EVENT_FORK;
1510
1511 if (likely(!ptrace_event_enabled(current, trace)))
1512 trace = 0;
1513 }
1434 1514
1435 p = copy_process(clone_flags, stack_start, regs, stack_size, 1515 p = copy_process(clone_flags, stack_start, regs, stack_size,
1436 child_tidptr, NULL, trace); 1516 child_tidptr, NULL, trace);
@@ -1454,26 +1534,26 @@ long do_fork(unsigned long clone_flags,
1454 } 1534 }
1455 1535
1456 audit_finish_fork(p); 1536 audit_finish_fork(p);
1457 tracehook_report_clone(regs, clone_flags, nr, p);
1458 1537
1459 /* 1538 /*
1460 * We set PF_STARTING at creation in case tracing wants to 1539 * We set PF_STARTING at creation in case tracing wants to
1461 * use this to distinguish a fully live task from one that 1540 * use this to distinguish a fully live task from one that
1462 * hasn't gotten to tracehook_report_clone() yet. Now we 1541 * hasn't finished SIGSTOP raising yet. Now we clear it
1463 * clear it and set the child going. 1542 * and set the child going.
1464 */ 1543 */
1465 p->flags &= ~PF_STARTING; 1544 p->flags &= ~PF_STARTING;
1466 1545
1467 wake_up_new_task(p, clone_flags); 1546 wake_up_new_task(p);
1468 1547
1469 tracehook_report_clone_complete(trace, regs, 1548 /* forking complete and child started to run, tell ptracer */
1470 clone_flags, nr, p); 1549 if (unlikely(trace))
1550 ptrace_event(trace, nr);
1471 1551
1472 if (clone_flags & CLONE_VFORK) { 1552 if (clone_flags & CLONE_VFORK) {
1473 freezer_do_not_count(); 1553 freezer_do_not_count();
1474 wait_for_completion(&vfork); 1554 wait_for_completion(&vfork);
1475 freezer_count(); 1555 freezer_count();
1476 tracehook_report_vfork_done(p, nr); 1556 ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
1477 } 1557 }
1478 } else { 1558 } else {
1479 nr = PTR_ERR(p); 1559 nr = PTR_ERR(p);
@@ -1508,11 +1588,19 @@ void __init proc_caches_init(void)
1508 fs_cachep = kmem_cache_create("fs_cache", 1588 fs_cachep = kmem_cache_create("fs_cache",
1509 sizeof(struct fs_struct), 0, 1589 sizeof(struct fs_struct), 0,
1510 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); 1590 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1591 /*
1592 * FIXME! The "sizeof(struct mm_struct)" currently includes the
1593 * whole struct cpumask for the OFFSTACK case. We could change
1594 * this to *only* allocate as much of it as required by the
1595 * maximum number of CPU's we can ever have. The cpumask_allocation
1596 * is at the end of the structure, exactly for that reason.
1597 */
1511 mm_cachep = kmem_cache_create("mm_struct", 1598 mm_cachep = kmem_cache_create("mm_struct",
1512 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1599 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1513 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); 1600 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1514 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); 1601 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
1515 mmap_init(); 1602 mmap_init();
1603 nsproxy_cache_init();
1516} 1604}
1517 1605
1518/* 1606/*
@@ -1609,12 +1697,14 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1609 */ 1697 */
1610 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) 1698 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
1611 do_sysvsem = 1; 1699 do_sysvsem = 1;
1612 if ((err = unshare_fs(unshare_flags, &new_fs))) 1700 err = unshare_fs(unshare_flags, &new_fs);
1701 if (err)
1613 goto bad_unshare_out; 1702 goto bad_unshare_out;
1614 if ((err = unshare_fd(unshare_flags, &new_fd))) 1703 err = unshare_fd(unshare_flags, &new_fd);
1704 if (err)
1615 goto bad_unshare_cleanup_fs; 1705 goto bad_unshare_cleanup_fs;
1616 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, 1706 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs);
1617 new_fs))) 1707 if (err)
1618 goto bad_unshare_cleanup_fd; 1708 goto bad_unshare_cleanup_fd;
1619 1709
1620 if (new_fs || new_fd || do_sysvsem || new_nsproxy) { 1710 if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
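The exe_file helpers consolidated into fork.c above follow the usual get/put convention: get_mm_exe_file() takes mmap_sem for reading, grabs a reference on mm->exe_file and returns it, and the caller is responsible for the matching fput(). A hedged sketch of a consumer; the function name and the d_path() formatting are illustrative assumptions, not code from the patch.

static void report_exe_path(struct mm_struct *mm)
{
	struct file *exe_file = get_mm_exe_file(mm);	/* takes a reference, or NULL */
	char buf[256];
	char *path;

	if (!exe_file)
		return;

	path = d_path(&exe_file->f_path, buf, sizeof(buf));
	if (!IS_ERR(path))
		printk(KERN_DEBUG "exe: %s\n", path);

	fput(exe_file);					/* drop the reference */
}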
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 66ecd2ead215..7b01de98bb6a 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -17,7 +17,7 @@ static inline void frozen_process(void)
17{ 17{
18 if (!unlikely(current->flags & PF_NOFREEZE)) { 18 if (!unlikely(current->flags & PF_NOFREEZE)) {
19 current->flags |= PF_FROZEN; 19 current->flags |= PF_FROZEN;
20 wmb(); 20 smp_wmb();
21 } 21 }
22 clear_freeze_flag(current); 22 clear_freeze_flag(current);
23} 23}
@@ -93,7 +93,7 @@ bool freeze_task(struct task_struct *p, bool sig_only)
93 * the task as frozen and next clears its TIF_FREEZE. 93 * the task as frozen and next clears its TIF_FREEZE.
94 */ 94 */
95 if (!freezing(p)) { 95 if (!freezing(p)) {
96 rmb(); 96 smp_rmb();
97 if (frozen(p)) 97 if (frozen(p))
98 return false; 98 return false;
99 99
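The freezer hunks only swap mandatory barriers for SMP-conditional ones; the store/load pairing they rely on is unchanged. Below is a generic sketch of that pairing, with flag_a and flag_b standing in for PF_FROZEN and TIF_FREEZE (the names are illustrative, not taken from the freezer code).

static int flag_a, flag_b;

static void writer(void)
{
	flag_a = 1;
	smp_wmb();		/* order the flag_a store before the flag_b store */
	flag_b = 1;
}

static int reader(void)
{
	if (ACCESS_ONCE(flag_b)) {
		smp_rmb();	/* pairs with the smp_wmb() in writer() */
		return flag_a;	/* guaranteed to observe flag_a == 1 */
	}
	return 0;
}

On UP kernels both smp_wmb() and smp_rmb() reduce to a compiler barrier, which is all the single-CPU case needs; avoiding the heavier mandatory wmb()/rmb() is the point of the change.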
diff --git a/kernel/futex.c b/kernel/futex.c
index fe28dc282eae..11cbe052b2e8 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -218,6 +218,8 @@ static void drop_futex_key_refs(union futex_key *key)
218 * @uaddr: virtual address of the futex 218 * @uaddr: virtual address of the futex
219 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 219 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
220 * @key: address where result is stored. 220 * @key: address where result is stored.
221 * @rw: mapping needs to be read/write (values: VERIFY_READ,
222 * VERIFY_WRITE)
221 * 223 *
222 * Returns a negative error code or 0 224 * Returns a negative error code or 0
223 * The key words are stored in *key on success. 225 * The key words are stored in *key on success.
@@ -229,12 +231,12 @@ static void drop_futex_key_refs(union futex_key *key)
229 * lock_page() might sleep, the caller should not hold a spinlock. 231 * lock_page() might sleep, the caller should not hold a spinlock.
230 */ 232 */
231static int 233static int
232get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) 234get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
233{ 235{
234 unsigned long address = (unsigned long)uaddr; 236 unsigned long address = (unsigned long)uaddr;
235 struct mm_struct *mm = current->mm; 237 struct mm_struct *mm = current->mm;
236 struct page *page, *page_head; 238 struct page *page, *page_head;
237 int err; 239 int err, ro = 0;
238 240
239 /* 241 /*
240 * The futex address must be "naturally" aligned. 242 * The futex address must be "naturally" aligned.
@@ -262,8 +264,18 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
262 264
263again: 265again:
264 err = get_user_pages_fast(address, 1, 1, &page); 266 err = get_user_pages_fast(address, 1, 1, &page);
267 /*
 268 * If write access is not required (e.g. FUTEX_WAIT), try
 269 * to get read-only access.
270 */
271 if (err == -EFAULT && rw == VERIFY_READ) {
272 err = get_user_pages_fast(address, 1, 0, &page);
273 ro = 1;
274 }
265 if (err < 0) 275 if (err < 0)
266 return err; 276 return err;
277 else
278 err = 0;
267 279
268#ifdef CONFIG_TRANSPARENT_HUGEPAGE 280#ifdef CONFIG_TRANSPARENT_HUGEPAGE
269 page_head = page; 281 page_head = page;
@@ -305,6 +317,13 @@ again:
305 if (!page_head->mapping) { 317 if (!page_head->mapping) {
306 unlock_page(page_head); 318 unlock_page(page_head);
307 put_page(page_head); 319 put_page(page_head);
320 /*
321 * ZERO_PAGE pages don't have a mapping. Avoid a busy loop
322 * trying to find one. RW mapping would have COW'd (and thus
323 * have a mapping) so this page is RO and won't ever change.
324 */
325 if ((page_head == ZERO_PAGE(address)))
326 return -EFAULT;
308 goto again; 327 goto again;
309 } 328 }
310 329
@@ -316,6 +335,15 @@ again:
316 * the object not the particular process. 335 * the object not the particular process.
317 */ 336 */
318 if (PageAnon(page_head)) { 337 if (PageAnon(page_head)) {
338 /*
339 * A RO anonymous page will never change and thus doesn't make
340 * sense for futex operations.
341 */
342 if (ro) {
343 err = -EFAULT;
344 goto out;
345 }
346
319 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ 347 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
320 key->private.mm = mm; 348 key->private.mm = mm;
321 key->private.address = address; 349 key->private.address = address;
@@ -327,9 +355,10 @@ again:
327 355
328 get_futex_key_refs(key); 356 get_futex_key_refs(key);
329 357
358out:
330 unlock_page(page_head); 359 unlock_page(page_head);
331 put_page(page_head); 360 put_page(page_head);
332 return 0; 361 return err;
333} 362}
334 363
335static inline void put_futex_key(union futex_key *key) 364static inline void put_futex_key(union futex_key *key)
@@ -355,8 +384,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
355 int ret; 384 int ret;
356 385
357 down_read(&mm->mmap_sem); 386 down_read(&mm->mmap_sem);
358 ret = get_user_pages(current, mm, (unsigned long)uaddr, 387 ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
359 1, 1, 0, NULL, NULL); 388 FAULT_FLAG_WRITE);
360 up_read(&mm->mmap_sem); 389 up_read(&mm->mmap_sem);
361 390
362 return ret < 0 ? ret : 0; 391 return ret < 0 ? ret : 0;
@@ -940,7 +969,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
940 if (!bitset) 969 if (!bitset)
941 return -EINVAL; 970 return -EINVAL;
942 971
943 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); 972 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
944 if (unlikely(ret != 0)) 973 if (unlikely(ret != 0))
945 goto out; 974 goto out;
946 975
@@ -986,10 +1015,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
986 int ret, op_ret; 1015 int ret, op_ret;
987 1016
988retry: 1017retry:
989 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); 1018 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
990 if (unlikely(ret != 0)) 1019 if (unlikely(ret != 0))
991 goto out; 1020 goto out;
992 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); 1021 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
993 if (unlikely(ret != 0)) 1022 if (unlikely(ret != 0))
994 goto out_put_key1; 1023 goto out_put_key1;
995 1024
@@ -1243,10 +1272,11 @@ retry:
1243 pi_state = NULL; 1272 pi_state = NULL;
1244 } 1273 }
1245 1274
1246 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); 1275 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1247 if (unlikely(ret != 0)) 1276 if (unlikely(ret != 0))
1248 goto out; 1277 goto out;
1249 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); 1278 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
1279 requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1250 if (unlikely(ret != 0)) 1280 if (unlikely(ret != 0))
1251 goto out_put_key1; 1281 goto out_put_key1;
1252 1282
@@ -1790,7 +1820,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1790 * while the syscall executes. 1820 * while the syscall executes.
1791 */ 1821 */
1792retry: 1822retry:
1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); 1823 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
1794 if (unlikely(ret != 0)) 1824 if (unlikely(ret != 0))
1795 return ret; 1825 return ret;
1796 1826
@@ -1941,7 +1971,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
1941 } 1971 }
1942 1972
1943retry: 1973retry:
1944 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key); 1974 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
1945 if (unlikely(ret != 0)) 1975 if (unlikely(ret != 0))
1946 goto out; 1976 goto out;
1947 1977
@@ -2060,7 +2090,7 @@ retry:
2060 if ((uval & FUTEX_TID_MASK) != vpid) 2090 if ((uval & FUTEX_TID_MASK) != vpid)
2061 return -EPERM; 2091 return -EPERM;
2062 2092
2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); 2093 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
2064 if (unlikely(ret != 0)) 2094 if (unlikely(ret != 0))
2065 goto out; 2095 goto out;
2066 2096
@@ -2249,7 +2279,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2249 debug_rt_mutex_init_waiter(&rt_waiter); 2279 debug_rt_mutex_init_waiter(&rt_waiter);
2250 rt_waiter.task = NULL; 2280 rt_waiter.task = NULL;
2251 2281
2252 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); 2282 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
2253 if (unlikely(ret != 0)) 2283 if (unlikely(ret != 0))
2254 goto out; 2284 goto out;
2255 2285
@@ -2697,7 +2727,7 @@ static int __init futex_init(void)
2697 futex_cmpxchg_enabled = 1; 2727 futex_cmpxchg_enabled = 1;
2698 2728
2699 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2729 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
2700 plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); 2730 plist_head_init(&futex_queues[i].chain);
2701 spin_lock_init(&futex_queues[i].lock); 2731 spin_lock_init(&futex_queues[i].lock);
2702 } 2732 }
2703 2733
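From userspace, the visible effect of the new rw argument is that operations which only need to read the futex word, such as FUTEX_WAIT, can be used on mappings the process cannot write, per the read-only fallback added to get_futex_key(). A hypothetical demonstration; the file name, helper name, and error handling are illustrative assumptions.

#include <fcntl.h>
#include <linux/futex.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sleep on a futex word inside a read-only, file-backed shared mapping. */
static int wait_on_ro_futex(const char *path)
{
	uint32_t *futex_word;
	int fd;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;

	futex_word = mmap(NULL, sizeof(*futex_word), PROT_READ, MAP_SHARED, fd, 0);
	close(fd);
	if (futex_word == MAP_FAILED)
		return -1;

	/* Blocks while *futex_word == 0; a task with a writable mapping wakes us. */
	return syscall(SYS_futex, futex_word, FUTEX_WAIT, 0, NULL, NULL, 0);
}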
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index b8cadf70b1fb..a92028196cc1 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -2,7 +2,8 @@ menu "GCOV-based kernel profiling"
2 2
3config GCOV_KERNEL 3config GCOV_KERNEL
4 bool "Enable gcov-based kernel profiling" 4 bool "Enable gcov-based kernel profiling"
5 depends on DEBUG_FS && CONSTRUCTORS 5 depends on DEBUG_FS
6 select CONSTRUCTORS if !UML
6 default n 7 default n
7 ---help--- 8 ---help---
8 This option enables gcov-based code profiling (e.g. for code coverage 9 This option enables gcov-based code profiling (e.g. for code coverage
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 87fdb3f8db14..a9205e32a059 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -64,24 +64,27 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
64 .clock_base = 64 .clock_base =
65 { 65 {
66 { 66 {
67 .index = CLOCK_REALTIME, 67 .index = HRTIMER_BASE_MONOTONIC,
68 .get_time = &ktime_get_real, 68 .clockid = CLOCK_MONOTONIC,
69 .get_time = &ktime_get,
69 .resolution = KTIME_LOW_RES, 70 .resolution = KTIME_LOW_RES,
70 }, 71 },
71 { 72 {
72 .index = CLOCK_MONOTONIC, 73 .index = HRTIMER_BASE_REALTIME,
73 .get_time = &ktime_get, 74 .clockid = CLOCK_REALTIME,
75 .get_time = &ktime_get_real,
74 .resolution = KTIME_LOW_RES, 76 .resolution = KTIME_LOW_RES,
75 }, 77 },
76 { 78 {
77 .index = CLOCK_BOOTTIME, 79 .index = HRTIMER_BASE_BOOTTIME,
80 .clockid = CLOCK_BOOTTIME,
78 .get_time = &ktime_get_boottime, 81 .get_time = &ktime_get_boottime,
79 .resolution = KTIME_LOW_RES, 82 .resolution = KTIME_LOW_RES,
80 }, 83 },
81 } 84 }
82}; 85};
83 86
84static int hrtimer_clock_to_base_table[MAX_CLOCKS] = { 87static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
85 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, 88 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
86 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, 89 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
87 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, 90 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
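
After this hunk each clock base carries both its fixed array slot (HRTIMER_BASE_*, stored in ->index) and the POSIX clock it serves (->clockid), and the clockid-to-base table becomes const. The conversion helper this implies is presumably a plain table lookup along these lines (sketch matching the table above):

	static inline int hrtimer_clockid_to_base(clockid_t clock_id)
	{
		return hrtimer_clock_to_base_table[clock_id];
	}
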
@@ -196,7 +199,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
196 struct hrtimer_cpu_base *new_cpu_base; 199 struct hrtimer_cpu_base *new_cpu_base;
197 int this_cpu = smp_processor_id(); 200 int this_cpu = smp_processor_id();
198 int cpu = hrtimer_get_target(this_cpu, pinned); 201 int cpu = hrtimer_get_target(this_cpu, pinned);
199 int basenum = hrtimer_clockid_to_base(base->index); 202 int basenum = base->index;
200 203
201again: 204again:
202 new_cpu_base = &per_cpu(hrtimer_bases, cpu); 205 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
@@ -621,66 +624,6 @@ static int hrtimer_reprogram(struct hrtimer *timer,
621 return res; 624 return res;
622} 625}
623 626
624
625/*
626 * Retrigger next event is called after clock was set
627 *
628 * Called with interrupts disabled via on_each_cpu()
629 */
630static void retrigger_next_event(void *arg)
631{
632 struct hrtimer_cpu_base *base;
633 struct timespec realtime_offset, wtm, sleep;
634
635 if (!hrtimer_hres_active())
636 return;
637
638 get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm,
639 &sleep);
640 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
641
642 base = &__get_cpu_var(hrtimer_bases);
643
644 /* Adjust CLOCK_REALTIME offset */
645 raw_spin_lock(&base->lock);
646 base->clock_base[HRTIMER_BASE_REALTIME].offset =
647 timespec_to_ktime(realtime_offset);
648 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
649 timespec_to_ktime(sleep);
650
651 hrtimer_force_reprogram(base, 0);
652 raw_spin_unlock(&base->lock);
653}
654
655/*
656 * Clock realtime was set
657 *
658 * Change the offset of the realtime clock vs. the monotonic
659 * clock.
660 *
661 * We might have to reprogram the high resolution timer interrupt. On
662 * SMP we call the architecture specific code to retrigger _all_ high
663 * resolution timer interrupts. On UP we just disable interrupts and
664 * call the high resolution interrupt code.
665 */
666void clock_was_set(void)
667{
668 /* Retrigger the CPU local events everywhere */
669 on_each_cpu(retrigger_next_event, NULL, 1);
670}
671
672/*
673 * During resume we might have to reprogram the high resolution timer
674 * interrupt (on the local CPU):
675 */
676void hres_timers_resume(void)
677{
678 WARN_ONCE(!irqs_disabled(),
679 KERN_INFO "hres_timers_resume() called with IRQs enabled!");
680
681 retrigger_next_event(NULL);
682}
683
684/* 627/*
685 * Initialize the high resolution related parts of cpu_base 628 * Initialize the high resolution related parts of cpu_base
686 */ 629 */
@@ -715,11 +658,39 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
715} 658}
716 659
717/* 660/*
661 * Retrigger next event is called after clock was set
662 *
663 * Called with interrupts disabled via on_each_cpu()
664 */
665static void retrigger_next_event(void *arg)
666{
667 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
668 struct timespec realtime_offset, xtim, wtm, sleep;
669
670 if (!hrtimer_hres_active())
671 return;
672
673 /* Optimized out for !HIGH_RES */
674 get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
675 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
676
677 /* Adjust CLOCK_REALTIME offset */
678 raw_spin_lock(&base->lock);
679 base->clock_base[HRTIMER_BASE_REALTIME].offset =
680 timespec_to_ktime(realtime_offset);
681 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
682 timespec_to_ktime(sleep);
683
684 hrtimer_force_reprogram(base, 0);
685 raw_spin_unlock(&base->lock);
686}
687
688/*
718 * Switch to high resolution mode 689 * Switch to high resolution mode
719 */ 690 */
720static int hrtimer_switch_to_hres(void) 691static int hrtimer_switch_to_hres(void)
721{ 692{
722 int cpu = smp_processor_id(); 693 int i, cpu = smp_processor_id();
723 struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); 694 struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
724 unsigned long flags; 695 unsigned long flags;
725 696
@@ -735,9 +706,8 @@ static int hrtimer_switch_to_hres(void)
735 return 0; 706 return 0;
736 } 707 }
737 base->hres_active = 1; 708 base->hres_active = 1;
738 base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES; 709 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
739 base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES; 710 base->clock_base[i].resolution = KTIME_HIGH_RES;
740 base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES;
741 711
742 tick_setup_sched_timer(); 712 tick_setup_sched_timer();
743 713
@@ -761,9 +731,43 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
761 return 0; 731 return 0;
762} 732}
763static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 733static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
734static inline void retrigger_next_event(void *arg) { }
764 735
765#endif /* CONFIG_HIGH_RES_TIMERS */ 736#endif /* CONFIG_HIGH_RES_TIMERS */
766 737
738/*
739 * Clock realtime was set
740 *
741 * Change the offset of the realtime clock vs. the monotonic
742 * clock.
743 *
744 * We might have to reprogram the high resolution timer interrupt. On
745 * SMP we call the architecture specific code to retrigger _all_ high
746 * resolution timer interrupts. On UP we just disable interrupts and
747 * call the high resolution interrupt code.
748 */
749void clock_was_set(void)
750{
751#ifdef CONFIG_HIGH_RES_TIMERS
752 /* Retrigger the CPU local events everywhere */
753 on_each_cpu(retrigger_next_event, NULL, 1);
754#endif
755 timerfd_clock_was_set();
756}
757
758/*
759 * During resume we might have to reprogram the high resolution timer
760 * interrupt (on the local CPU):
761 */
762void hrtimers_resume(void)
763{
764 WARN_ONCE(!irqs_disabled(),
765 KERN_INFO "hrtimers_resume() called with IRQs enabled!");
766
767 retrigger_next_event(NULL);
768 timerfd_clock_was_set();
769}
770
767static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) 771static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
768{ 772{
769#ifdef CONFIG_TIMER_STATS 773#ifdef CONFIG_TIMER_STATS
@@ -856,6 +860,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
856 debug_activate(timer); 860 debug_activate(timer);
857 861
858 timerqueue_add(&base->active, &timer->node); 862 timerqueue_add(&base->active, &timer->node);
863 base->cpu_base->active_bases |= 1 << base->index;
859 864
860 /* 865 /*
861 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the 866 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
@@ -897,6 +902,8 @@ static void __remove_hrtimer(struct hrtimer *timer,
897#endif 902#endif
898 } 903 }
899 timerqueue_del(&base->active, &timer->node); 904 timerqueue_del(&base->active, &timer->node);
905 if (!timerqueue_getnext(&base->active))
906 base->cpu_base->active_bases &= ~(1 << base->index);
900out: 907out:
901 timer->state = newstate; 908 timer->state = newstate;
902} 909}
@@ -1234,7 +1241,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1234void hrtimer_interrupt(struct clock_event_device *dev) 1241void hrtimer_interrupt(struct clock_event_device *dev)
1235{ 1242{
1236 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1243 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1237 struct hrtimer_clock_base *base;
1238 ktime_t expires_next, now, entry_time, delta; 1244 ktime_t expires_next, now, entry_time, delta;
1239 int i, retries = 0; 1245 int i, retries = 0;
1240 1246
@@ -1256,12 +1262,15 @@ retry:
1256 */ 1262 */
1257 cpu_base->expires_next.tv64 = KTIME_MAX; 1263 cpu_base->expires_next.tv64 = KTIME_MAX;
1258 1264
1259 base = cpu_base->clock_base;
1260
1261 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1265 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1262 ktime_t basenow; 1266 struct hrtimer_clock_base *base;
1263 struct timerqueue_node *node; 1267 struct timerqueue_node *node;
1268 ktime_t basenow;
1269
1270 if (!(cpu_base->active_bases & (1 << i)))
1271 continue;
1264 1272
1273 base = cpu_base->clock_base + i;
1265 basenow = ktime_add(now, base->offset); 1274 basenow = ktime_add(now, base->offset);
1266 1275
1267 while ((node = timerqueue_getnext(&base->active))) { 1276 while ((node = timerqueue_getnext(&base->active))) {
@@ -1294,7 +1303,6 @@ retry:
1294 1303
1295 __run_hrtimer(timer, &basenow); 1304 __run_hrtimer(timer, &basenow);
1296 } 1305 }
1297 base++;
1298 } 1306 }
1299 1307
1300 /* 1308 /*
@@ -1525,7 +1533,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1525 struct timespec __user *rmtp; 1533 struct timespec __user *rmtp;
1526 int ret = 0; 1534 int ret = 0;
1527 1535
1528 hrtimer_init_on_stack(&t.timer, restart->nanosleep.index, 1536 hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
1529 HRTIMER_MODE_ABS); 1537 HRTIMER_MODE_ABS);
1530 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); 1538 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
1531 1539
@@ -1577,7 +1585,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1577 1585
1578 restart = &current_thread_info()->restart_block; 1586 restart = &current_thread_info()->restart_block;
1579 restart->fn = hrtimer_nanosleep_restart; 1587 restart->fn = hrtimer_nanosleep_restart;
1580 restart->nanosleep.index = t.timer.base->index; 1588 restart->nanosleep.clockid = t.timer.base->clockid;
1581 restart->nanosleep.rmtp = rmtp; 1589 restart->nanosleep.rmtp = rmtp;
1582 restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); 1590 restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
1583 1591
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 53ead174da2f..ea640120ab86 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -33,7 +33,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
33/* 33/*
34 * Zero means infinite timeout - no checking done: 34 * Zero means infinite timeout - no checking done:
35 */ 35 */
36unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; 36unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
37 37
38unsigned long __read_mostly sysctl_hung_task_warnings = 10; 38unsigned long __read_mostly sysctl_hung_task_warnings = 10;
39 39
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index c574f9a12c48..5a38bf4de641 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -48,6 +48,14 @@ config IRQ_PREFLOW_FASTEOI
48config IRQ_EDGE_EOI_HANDLER 48config IRQ_EDGE_EOI_HANDLER
49 bool 49 bool
50 50
51# Generic configurable interrupt chip implementation
52config GENERIC_IRQ_CHIP
53 bool
54
55# Generic irq_domain hw <--> linux irq number translation
56config IRQ_DOMAIN
57 bool
58
51# Support forced irq threading 59# Support forced irq threading
52config IRQ_FORCED_THREADING 60config IRQ_FORCED_THREADING
53 bool 61 bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 54329cd7b3ee..fff17381f0af 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,6 +1,8 @@
1 1
2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o 2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 4obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
5obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
4obj-$(CONFIG_PROC_FS) += proc.o 6obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 7obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_PM_SLEEP) += pm.o 8obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 4af1e2b244cb..d5a3009da71a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -310,6 +310,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
310out_unlock: 310out_unlock:
311 raw_spin_unlock(&desc->lock); 311 raw_spin_unlock(&desc->lock);
312} 312}
313EXPORT_SYMBOL_GPL(handle_simple_irq);
313 314
314/** 315/**
315 * handle_level_irq - Level type irq handler 316 * handle_level_irq - Level type irq handler
@@ -573,6 +574,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
573 if (handle != handle_bad_irq && is_chained) { 574 if (handle != handle_bad_irq && is_chained) {
574 irq_settings_set_noprobe(desc); 575 irq_settings_set_noprobe(desc);
575 irq_settings_set_norequest(desc); 576 irq_settings_set_norequest(desc);
577 irq_settings_set_nothread(desc);
576 irq_startup(desc); 578 irq_startup(desc);
577 } 579 }
578out: 580out:
@@ -612,6 +614,7 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
612 614
613 irq_put_desc_unlock(desc, flags); 615 irq_put_desc_unlock(desc, flags);
614} 616}
617EXPORT_SYMBOL_GPL(irq_modify_status);
615 618
616/** 619/**
617 * irq_cpu_online - Invoke all irq_cpu_online functions. 620 * irq_cpu_online - Invoke all irq_cpu_online functions.
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index 306cba37e9a5..97a8bfadc88a 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -27,6 +27,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
27 P(IRQ_PER_CPU); 27 P(IRQ_PER_CPU);
28 P(IRQ_NOPROBE); 28 P(IRQ_NOPROBE);
29 P(IRQ_NOREQUEST); 29 P(IRQ_NOREQUEST);
30 P(IRQ_NOTHREAD);
30 P(IRQ_NOAUTOEN); 31 P(IRQ_NOAUTOEN);
31 32
32 PS(IRQS_AUTODETECT); 33 PS(IRQS_AUTODETECT);
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 1ef4ffcdfa55..bd8e788d71e0 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -87,8 +87,8 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
87{ 87{
88 struct irq_devres match_data = { irq, dev_id }; 88 struct irq_devres match_data = { irq, dev_id };
89 89
90 free_irq(irq, dev_id);
91 WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, 90 WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match,
92 &match_data)); 91 &match_data));
92 free_irq(irq, dev_id);
93} 93}
94EXPORT_SYMBOL(devm_free_irq); 94EXPORT_SYMBOL(devm_free_irq);
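
devm_free_irq() now removes the devres entry before calling free_irq(), but the driver-visible API is unchanged. A minimal usage sketch, assuming a made-up device and handler (the common case never calls devm_free_irq() at all; the IRQ is released automatically on driver detach):

	#include <linux/device.h>
	#include <linux/interrupt.h>

	static irqreturn_t demo_isr(int irq, void *dev_id)
	{
		return IRQ_HANDLED;
	}

	static int demo_setup_irq(struct device *dev, unsigned int irq)
	{
		/* Managed request: freed automatically when the driver detaches. */
		return devm_request_irq(dev, irq, demo_isr, 0, "demo", dev);
	}
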
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
new file mode 100644
index 000000000000..3a2cab407b93
--- /dev/null
+++ b/kernel/irq/generic-chip.c
@@ -0,0 +1,368 @@
1/*
2 * Library implementing the most common irq chip callback functions
3 *
4 * Copyright (C) 2011, Thomas Gleixner
5 */
6#include <linux/io.h>
7#include <linux/irq.h>
8#include <linux/slab.h>
9#include <linux/interrupt.h>
10#include <linux/kernel_stat.h>
11#include <linux/syscore_ops.h>
12
13#include "internals.h"
14
15static LIST_HEAD(gc_list);
16static DEFINE_RAW_SPINLOCK(gc_lock);
17
18static inline struct irq_chip_regs *cur_regs(struct irq_data *d)
19{
20 return &container_of(d->chip, struct irq_chip_type, chip)->regs;
21}
22
23/**
24 * irq_gc_noop - NOOP function
25 * @d: irq_data
26 */
27void irq_gc_noop(struct irq_data *d)
28{
29}
30
31/**
32 * irq_gc_mask_disable_reg - Mask chip via disable register
33 * @d: irq_data
34 *
35 * Chip has separate enable/disable registers instead of a single mask
36 * register.
37 */
38void irq_gc_mask_disable_reg(struct irq_data *d)
39{
40 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
41 u32 mask = 1 << (d->irq - gc->irq_base);
42
43 irq_gc_lock(gc);
44 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable);
45 gc->mask_cache &= ~mask;
46 irq_gc_unlock(gc);
47}
48
49/**
 50 * irq_gc_mask_set_bit - Mask chip via setting bit in mask register
51 * @d: irq_data
52 *
53 * Chip has a single mask register. Values of this register are cached
54 * and protected by gc->lock
55 */
56void irq_gc_mask_set_bit(struct irq_data *d)
57{
58 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
59 u32 mask = 1 << (d->irq - gc->irq_base);
60
61 irq_gc_lock(gc);
62 gc->mask_cache |= mask;
63 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask);
64 irq_gc_unlock(gc);
65}
66
67/**
 68 * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register
69 * @d: irq_data
70 *
71 * Chip has a single mask register. Values of this register are cached
72 * and protected by gc->lock
73 */
74void irq_gc_mask_clr_bit(struct irq_data *d)
75{
76 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
77 u32 mask = 1 << (d->irq - gc->irq_base);
78
79 irq_gc_lock(gc);
80 gc->mask_cache &= ~mask;
81 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask);
82 irq_gc_unlock(gc);
83}
84
85/**
86 * irq_gc_unmask_enable_reg - Unmask chip via enable register
87 * @d: irq_data
88 *
89 * Chip has separate enable/disable registers instead of a single mask
90 * register.
91 */
92void irq_gc_unmask_enable_reg(struct irq_data *d)
93{
94 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
95 u32 mask = 1 << (d->irq - gc->irq_base);
96
97 irq_gc_lock(gc);
98 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable);
99 gc->mask_cache |= mask;
100 irq_gc_unlock(gc);
101}
102
103/**
104 * irq_gc_ack_set_bit - Ack pending interrupt via setting bit
105 * @d: irq_data
106 */
107void irq_gc_ack_set_bit(struct irq_data *d)
108{
109 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
110 u32 mask = 1 << (d->irq - gc->irq_base);
111
112 irq_gc_lock(gc);
113 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
114 irq_gc_unlock(gc);
115}
116
117/**
118 * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit
119 * @d: irq_data
120 */
121void irq_gc_ack_clr_bit(struct irq_data *d)
122{
123 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
124 u32 mask = ~(1 << (d->irq - gc->irq_base));
125
126 irq_gc_lock(gc);
127 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
128 irq_gc_unlock(gc);
129}
130
131/**
132 * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt
133 * @d: irq_data
134 */
135void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
136{
137 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
138 u32 mask = 1 << (d->irq - gc->irq_base);
139
140 irq_gc_lock(gc);
141 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask);
142 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
143 irq_gc_unlock(gc);
144}
145
146/**
147 * irq_gc_eoi - EOI interrupt
148 * @d: irq_data
149 */
150void irq_gc_eoi(struct irq_data *d)
151{
152 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
153 u32 mask = 1 << (d->irq - gc->irq_base);
154
155 irq_gc_lock(gc);
156 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi);
157 irq_gc_unlock(gc);
158}
159
160/**
161 * irq_gc_set_wake - Set/clr wake bit for an interrupt
162 * @d: irq_data
163 *
164 * For chips where the wake from suspend functionality is not
165 * configured in a separate register and the wakeup active state is
166 * just stored in a bitmask.
167 */
168int irq_gc_set_wake(struct irq_data *d, unsigned int on)
169{
170 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
171 u32 mask = 1 << (d->irq - gc->irq_base);
172
173 if (!(mask & gc->wake_enabled))
174 return -EINVAL;
175
176 irq_gc_lock(gc);
177 if (on)
178 gc->wake_active |= mask;
179 else
180 gc->wake_active &= ~mask;
181 irq_gc_unlock(gc);
182 return 0;
183}
184
185/**
186 * irq_alloc_generic_chip - Allocate a generic chip and initialize it
187 * @name: Name of the irq chip
188 * @num_ct: Number of irq_chip_type instances associated with this chip
189 * @irq_base: Interrupt base nr for this chip
190 * @reg_base: Register base address (virtual)
191 * @handler: Default flow handler associated with this chip
192 *
193 * Returns an initialized irq_chip_generic structure. The chip defaults
194 * to the primary (index 0) irq_chip_type and @handler
195 */
196struct irq_chip_generic *
197irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
198 void __iomem *reg_base, irq_flow_handler_t handler)
199{
200 struct irq_chip_generic *gc;
201 unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
202
203 gc = kzalloc(sz, GFP_KERNEL);
204 if (gc) {
205 raw_spin_lock_init(&gc->lock);
206 gc->num_ct = num_ct;
207 gc->irq_base = irq_base;
208 gc->reg_base = reg_base;
209 gc->chip_types->chip.name = name;
210 gc->chip_types->handler = handler;
211 }
212 return gc;
213}
214
215/*
216 * Separate lockdep class for interrupt chip which can nest irq_desc
217 * lock.
218 */
219static struct lock_class_key irq_nested_lock_class;
220
221/**
222 * irq_setup_generic_chip - Setup a range of interrupts with a generic chip
223 * @gc: Generic irq chip holding all data
224 * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base
225 * @flags: Flags for initialization
226 * @clr: IRQ_* bits to clear
227 * @set: IRQ_* bits to set
228 *
229 * Set up max. 32 interrupts starting from gc->irq_base. Note, this
230 * initializes all interrupts to the primary irq_chip_type and its
231 * associated handler.
232 */
233void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
234 enum irq_gc_flags flags, unsigned int clr,
235 unsigned int set)
236{
237 struct irq_chip_type *ct = gc->chip_types;
238 unsigned int i;
239
240 raw_spin_lock(&gc_lock);
241 list_add_tail(&gc->list, &gc_list);
242 raw_spin_unlock(&gc_lock);
243
244 /* Init mask cache ? */
245 if (flags & IRQ_GC_INIT_MASK_CACHE)
246 gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
247
248 for (i = gc->irq_base; msk; msk >>= 1, i++) {
249		if (!(msk & 0x01))
250 continue;
251
252 if (flags & IRQ_GC_INIT_NESTED_LOCK)
253 irq_set_lockdep_class(i, &irq_nested_lock_class);
254
255 irq_set_chip_and_handler(i, &ct->chip, ct->handler);
256 irq_set_chip_data(i, gc);
257 irq_modify_status(i, clr, set);
258 }
259 gc->irq_cnt = i - gc->irq_base;
260}
261
262/**
263 * irq_setup_alt_chip - Switch to alternative chip
264 * @d: irq_data for this interrupt
265 * @type: Flow type to be initialized
266 *
267 * Only to be called from chip->irq_set_type() callbacks.
268 */
269int irq_setup_alt_chip(struct irq_data *d, unsigned int type)
270{
271 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
272 struct irq_chip_type *ct = gc->chip_types;
273 unsigned int i;
274
275 for (i = 0; i < gc->num_ct; i++, ct++) {
276 if (ct->type & type) {
277 d->chip = &ct->chip;
278 irq_data_to_desc(d)->handle_irq = ct->handler;
279 return 0;
280 }
281 }
282 return -EINVAL;
283}
284
285/**
286 * irq_remove_generic_chip - Remove a chip
287 * @gc: Generic irq chip holding all data
288 * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base
289 * @clr: IRQ_* bits to clear
290 * @set: IRQ_* bits to set
291 *
292 * Remove up to 32 interrupts starting from gc->irq_base.
293 */
294void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
295 unsigned int clr, unsigned int set)
296{
297 unsigned int i = gc->irq_base;
298
299 raw_spin_lock(&gc_lock);
300 list_del(&gc->list);
301 raw_spin_unlock(&gc_lock);
302
303 for (; msk; msk >>= 1, i++) {
304		if (!(msk & 0x01))
305 continue;
306
307 /* Remove handler first. That will mask the irq line */
308 irq_set_handler(i, NULL);
309 irq_set_chip(i, &no_irq_chip);
310 irq_set_chip_data(i, NULL);
311 irq_modify_status(i, clr, set);
312 }
313}
314
315#ifdef CONFIG_PM
316static int irq_gc_suspend(void)
317{
318 struct irq_chip_generic *gc;
319
320 list_for_each_entry(gc, &gc_list, list) {
321 struct irq_chip_type *ct = gc->chip_types;
322
323 if (ct->chip.irq_suspend)
324 ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base));
325 }
326 return 0;
327}
328
329static void irq_gc_resume(void)
330{
331 struct irq_chip_generic *gc;
332
333 list_for_each_entry(gc, &gc_list, list) {
334 struct irq_chip_type *ct = gc->chip_types;
335
336 if (ct->chip.irq_resume)
337 ct->chip.irq_resume(irq_get_irq_data(gc->irq_base));
338 }
339}
340#else
341#define irq_gc_suspend NULL
342#define irq_gc_resume NULL
343#endif
344
345static void irq_gc_shutdown(void)
346{
347 struct irq_chip_generic *gc;
348
349 list_for_each_entry(gc, &gc_list, list) {
350 struct irq_chip_type *ct = gc->chip_types;
351
352 if (ct->chip.irq_pm_shutdown)
353 ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base));
354 }
355}
356
357static struct syscore_ops irq_gc_syscore_ops = {
358 .suspend = irq_gc_suspend,
359 .resume = irq_gc_resume,
360 .shutdown = irq_gc_shutdown,
361};
362
363static int __init irq_gc_init_ops(void)
364{
365 register_syscore_ops(&irq_gc_syscore_ops);
366 return 0;
367}
368device_initcall(irq_gc_init_ops);
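
The new file above supplies the common mask/ack/eoi/wake callbacks plus allocation and setup helpers for a generic chip covering a linear block of interrupts. A hedged sketch of how an interrupt-controller driver might use it; the register offsets, names and irq numbers are invented for illustration:

	#include <linux/irq.h>

	static void __init demo_init_irq_chip(void __iomem *reg_base,
					      unsigned int irq_base)
	{
		struct irq_chip_generic *gc;
		struct irq_chip_type *ct;

		/* One irq_chip_type, 32 interrupts starting at irq_base. */
		gc = irq_alloc_generic_chip("DEMO", 1, irq_base, reg_base,
					    handle_level_irq);
		if (!gc)
			return;

		ct = gc->chip_types;
		ct->regs.mask = 0x10;		/* hypothetical mask register offset */
		ct->regs.ack  = 0x14;		/* hypothetical ack register offset */
		ct->chip.irq_mask   = irq_gc_mask_set_bit;
		ct->chip.irq_unmask = irq_gc_mask_clr_bit;
		ct->chip.irq_ack    = irq_gc_ack_set_bit;

		/* Install the chip on all 32 irqs and prime the mask cache. */
		irq_setup_generic_chip(gc, 0xffffffff, IRQ_GC_INIT_MASK_CACHE,
				       IRQ_NOREQUEST, 0);
	}
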
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 90cb55f6d7eb..470d08c82bbe 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -133,12 +133,6 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
133 switch (res) { 133 switch (res) {
134 case IRQ_WAKE_THREAD: 134 case IRQ_WAKE_THREAD:
135 /* 135 /*
136 * Set result to handled so the spurious check
137 * does not trigger.
138 */
139 res = IRQ_HANDLED;
140
141 /*
142 * Catch drivers which return WAKE_THREAD but 136 * Catch drivers which return WAKE_THREAD but
143 * did not set up a thread function 137 * did not set up a thread function
144 */ 138 */
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 2c039c9b9383..4c60a50e66b2 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -22,7 +22,7 @@
22 */ 22 */
23static struct lock_class_key irq_desc_lock_class; 23static struct lock_class_key irq_desc_lock_class;
24 24
25#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) 25#if defined(CONFIG_SMP)
26static void __init init_irq_default_affinity(void) 26static void __init init_irq_default_affinity(void)
27{ 27{
28 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); 28 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
@@ -257,13 +257,11 @@ int __init early_irq_init(void)
257 count = ARRAY_SIZE(irq_desc); 257 count = ARRAY_SIZE(irq_desc);
258 258
259 for (i = 0; i < count; i++) { 259 for (i = 0; i < count; i++) {
260 desc[i].irq_data.irq = i;
261 desc[i].irq_data.chip = &no_irq_chip;
262 desc[i].kstat_irqs = alloc_percpu(unsigned int); 260 desc[i].kstat_irqs = alloc_percpu(unsigned int);
263 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); 261 alloc_masks(&desc[i], GFP_KERNEL, node);
264 alloc_masks(desc + i, GFP_KERNEL, node); 262 raw_spin_lock_init(&desc[i].lock);
265 desc_smp_init(desc + i, node);
266 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 263 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
264 desc_set_defaults(i, &desc[i], node);
267 } 265 }
268 return arch_early_irq_init(); 266 return arch_early_irq_init();
269} 267}
@@ -290,6 +288,22 @@ static int irq_expand_nr_irqs(unsigned int nr)
290 288
291#endif /* !CONFIG_SPARSE_IRQ */ 289#endif /* !CONFIG_SPARSE_IRQ */
292 290
291/**
292 * generic_handle_irq - Invoke the handler for a particular irq
293 * @irq: The irq number to handle
294 *
295 */
296int generic_handle_irq(unsigned int irq)
297{
298 struct irq_desc *desc = irq_to_desc(irq);
299
300 if (!desc)
301 return -EINVAL;
302 generic_handle_irq_desc(irq, desc);
303 return 0;
304}
305EXPORT_SYMBOL_GPL(generic_handle_irq);
306
293/* Dynamic interrupt handling */ 307/* Dynamic interrupt handling */
294 308
295/** 309/**
@@ -311,6 +325,7 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
311 bitmap_clear(allocated_irqs, from, cnt); 325 bitmap_clear(allocated_irqs, from, cnt);
312 mutex_unlock(&sparse_irq_lock); 326 mutex_unlock(&sparse_irq_lock);
313} 327}
328EXPORT_SYMBOL_GPL(irq_free_descs);
314 329
315/** 330/**
316 * irq_alloc_descs - allocate and initialize a range of irq descriptors 331 * irq_alloc_descs - allocate and initialize a range of irq descriptors
@@ -329,6 +344,12 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
329 if (!cnt) 344 if (!cnt)
330 return -EINVAL; 345 return -EINVAL;
331 346
347 if (irq >= 0) {
348 if (from > irq)
349 return -EINVAL;
350 from = irq;
351 }
352
332 mutex_lock(&sparse_irq_lock); 353 mutex_lock(&sparse_irq_lock);
333 354
334 start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, 355 start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
@@ -351,6 +372,7 @@ err:
351 mutex_unlock(&sparse_irq_lock); 372 mutex_unlock(&sparse_irq_lock);
352 return ret; 373 return ret;
353} 374}
375EXPORT_SYMBOL_GPL(irq_alloc_descs);
354 376
355/** 377/**
356 * irq_reserve_irqs - mark irqs allocated 378 * irq_reserve_irqs - mark irqs allocated
@@ -430,7 +452,6 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
430 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; 452 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
431} 453}
432 454
433#ifdef CONFIG_GENERIC_HARDIRQS
434unsigned int kstat_irqs(unsigned int irq) 455unsigned int kstat_irqs(unsigned int irq)
435{ 456{
436 struct irq_desc *desc = irq_to_desc(irq); 457 struct irq_desc *desc = irq_to_desc(irq);
@@ -443,4 +464,3 @@ unsigned int kstat_irqs(unsigned int irq)
443 sum += *per_cpu_ptr(desc->kstat_irqs, cpu); 464 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
444 return sum; 465 return sum;
445} 466}
446#endif /* CONFIG_GENERIC_HARDIRQS */
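
generic_handle_irq() is now a real exported function (and irq_alloc_descs()/irq_free_descs() are exported alongside it, with handle_simple_irq() and irq_modify_status() exported earlier in this series), so a demultiplexing driver, even a modular one, can dispatch its sub-interrupts from a chained handler. A sketch with invented driver state and register layout:

	#include <linux/io.h>
	#include <linux/irq.h>

	struct demo_demux {			/* hypothetical driver state */
		void __iomem	*regs;
		unsigned int	irq_base;	/* first linux irq of the sub-range */
	};

	static void demo_demux_handler(unsigned int irq, struct irq_desc *desc)
	{
		struct demo_demux *d = irq_desc_get_handler_data(desc);
		u32 pending = readl(d->regs + 0x08);	/* hypothetical pending register */

		while (pending) {
			unsigned int bit = __ffs(pending);

			generic_handle_irq(d->irq_base + bit);
			pending &= ~(1U << bit);
		}
	}

	/*
	 * Setup (elsewhere in the driver):
	 *   irq_set_handler_data(parent_irq, d);
	 *   irq_set_chained_handler(parent_irq, demo_demux_handler);
	 * with the sub-irqs registered via irq_set_chip_and_handler(...,
	 * handle_simple_irq).
	 */
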
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
new file mode 100644
index 000000000000..d5828da3fd38
--- /dev/null
+++ b/kernel/irq/irqdomain.c
@@ -0,0 +1,180 @@
1#include <linux/irq.h>
2#include <linux/irqdomain.h>
3#include <linux/module.h>
4#include <linux/mutex.h>
5#include <linux/of.h>
6#include <linux/of_address.h>
7#include <linux/slab.h>
8
9static LIST_HEAD(irq_domain_list);
10static DEFINE_MUTEX(irq_domain_mutex);
11
12/**
13 * irq_domain_add() - Register an irq_domain
14 * @domain: ptr to initialized irq_domain structure
15 *
16 * Registers an irq_domain structure. The irq_domain must at a minimum be
17 * initialized with an ops structure pointer, and either a ->to_irq hook or
18 * a valid irq_base value. Everything else is optional.
19 */
20void irq_domain_add(struct irq_domain *domain)
21{
22 struct irq_data *d;
23 int hwirq;
24
25 /*
26 * This assumes that the irq_domain owner has already allocated
27 * the irq_descs. This block will be removed when support for dynamic
28 * allocation of irq_descs is added to irq_domain.
29 */
30 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) {
31 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq));
 32		if (!d || d->domain) {
33 /* things are broken; just report, don't clean up */
34 WARN(1, "error: irq_desc already assigned to a domain");
35 return;
36 }
37 d->domain = domain;
38 d->hwirq = hwirq;
39 }
40
41 mutex_lock(&irq_domain_mutex);
42 list_add(&domain->list, &irq_domain_list);
43 mutex_unlock(&irq_domain_mutex);
44}
45
46/**
47 * irq_domain_del() - Unregister an irq_domain
48 * @domain: ptr to registered irq_domain.
49 */
50void irq_domain_del(struct irq_domain *domain)
51{
52 struct irq_data *d;
53 int hwirq;
54
55 mutex_lock(&irq_domain_mutex);
56 list_del(&domain->list);
57 mutex_unlock(&irq_domain_mutex);
58
59 /* Clear the irq_domain assignments */
60 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) {
61 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq));
62 d->domain = NULL;
63 }
64}
65
66#if defined(CONFIG_OF_IRQ)
67/**
68 * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec
69 *
70 * Used by the device tree interrupt mapping code to translate a device tree
71 * interrupt specifier to a valid linux irq number. Returns either a valid
72 * linux IRQ number or 0.
73 *
 74 * When the caller no longer needs the irq number returned by this function, it
75 * should arrange to call irq_dispose_mapping().
76 */
77unsigned int irq_create_of_mapping(struct device_node *controller,
78 const u32 *intspec, unsigned int intsize)
79{
80 struct irq_domain *domain;
81 unsigned long hwirq;
82 unsigned int irq, type;
83 int rc = -EINVAL;
84
85 /* Find a domain which can translate the irq spec */
86 mutex_lock(&irq_domain_mutex);
87 list_for_each_entry(domain, &irq_domain_list, list) {
88 if (!domain->ops->dt_translate)
89 continue;
90 rc = domain->ops->dt_translate(domain, controller,
91 intspec, intsize, &hwirq, &type);
92 if (rc == 0)
93 break;
94 }
95 mutex_unlock(&irq_domain_mutex);
96
97 if (rc != 0)
98 return 0;
99
100 irq = irq_domain_to_irq(domain, hwirq);
101 if (type != IRQ_TYPE_NONE)
102 irq_set_irq_type(irq, type);
103 pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n",
104 controller->full_name, (int)hwirq, irq, type);
105 return irq;
106}
107EXPORT_SYMBOL_GPL(irq_create_of_mapping);
108
109/**
110 * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping()
111 * @irq: linux irq number to be discarded
112 *
113 * Calling this function indicates the caller no longer needs a reference to
114 * the linux irq number returned by a prior call to irq_create_of_mapping().
115 */
116void irq_dispose_mapping(unsigned int irq)
117{
118 /*
119 * nothing yet; will be filled when support for dynamic allocation of
120 * irq_descs is added to irq_domain
121 */
122}
123EXPORT_SYMBOL_GPL(irq_dispose_mapping);
124
125int irq_domain_simple_dt_translate(struct irq_domain *d,
126 struct device_node *controller,
127 const u32 *intspec, unsigned int intsize,
128 unsigned long *out_hwirq, unsigned int *out_type)
129{
130 if (d->of_node != controller)
131 return -EINVAL;
132 if (intsize < 1)
133 return -EINVAL;
134
135 *out_hwirq = intspec[0];
136 *out_type = IRQ_TYPE_NONE;
137 if (intsize > 1)
138 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
139 return 0;
140}
141
142struct irq_domain_ops irq_domain_simple_ops = {
143 .dt_translate = irq_domain_simple_dt_translate,
144};
145EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
146
147/**
148 * irq_domain_add_simple() - Set up a 'simple' translation range
149 */
150void irq_domain_add_simple(struct device_node *controller, int irq_base)
151{
152 struct irq_domain *domain;
153
154 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
155 if (!domain) {
156 WARN_ON(1);
157 return;
158 }
159
160 domain->irq_base = irq_base;
161 domain->of_node = of_node_get(controller);
162 domain->ops = &irq_domain_simple_ops;
163 irq_domain_add(domain);
164}
165EXPORT_SYMBOL_GPL(irq_domain_add_simple);
166
167void irq_domain_generate_simple(const struct of_device_id *match,
168 u64 phys_base, unsigned int irq_start)
169{
170 struct device_node *node;
171 pr_info("looking for phys_base=%llx, irq_start=%i\n",
172 (unsigned long long) phys_base, (int) irq_start);
173 node = of_find_matching_node_by_address(NULL, match, phys_base);
174 if (node)
175 irq_domain_add_simple(node, irq_start);
176 else
177 pr_info("no node found\n");
178}
179EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
180#endif /* CONFIG_OF_IRQ */
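
At this stage irq_domain only translates between hardware irq numbers and pre-allocated linux irq descriptors (dynamic allocation is explicitly left for later in the comments above). A sketch of registering a linear range for a device-tree controller, using only fields and ops shown above; the names are illustrative:

	static struct irq_domain demo_domain;	/* hypothetical controller domain */

	static void __init demo_add_domain(struct device_node *np,
					   unsigned int irq_base,
					   unsigned int nr_irq)
	{
		/* The irq_descs for irq_base .. irq_base + nr_irq - 1 must already exist. */
		demo_domain.irq_base = irq_base;
		demo_domain.nr_irq   = nr_irq;
		demo_domain.of_node  = of_node_get(np);
		demo_domain.ops      = &irq_domain_simple_ops;
		irq_domain_add(&demo_domain);
	}
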
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 07c1611f3899..0a7840aeb0fb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -491,6 +491,9 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)
491 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 491 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
492 int ret = 0; 492 int ret = 0;
493 493
494 if (!desc)
495 return -EINVAL;
496
494 /* wakeup-capable irqs can be shared between drivers that 497 /* wakeup-capable irqs can be shared between drivers that
495 * don't need to have the same sleep mode behaviors. 498 * don't need to have the same sleep mode behaviors.
496 */ 499 */
@@ -723,13 +726,16 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
723 * context. So we need to disable bh here to avoid deadlocks and other 726 * context. So we need to disable bh here to avoid deadlocks and other
724 * side effects. 727 * side effects.
725 */ 728 */
726static void 729static irqreturn_t
727irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) 730irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
728{ 731{
732 irqreturn_t ret;
733
729 local_bh_disable(); 734 local_bh_disable();
730 action->thread_fn(action->irq, action->dev_id); 735 ret = action->thread_fn(action->irq, action->dev_id);
731 irq_finalize_oneshot(desc, action, false); 736 irq_finalize_oneshot(desc, action, false);
732 local_bh_enable(); 737 local_bh_enable();
738 return ret;
733} 739}
734 740
735/* 741/*
@@ -737,10 +743,14 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
737 * preemtible - many of them need to sleep and wait for slow busses to 743 * preemtible - many of them need to sleep and wait for slow busses to
738 * complete. 744 * complete.
739 */ 745 */
740static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action) 746static irqreturn_t irq_thread_fn(struct irq_desc *desc,
747 struct irqaction *action)
741{ 748{
742 action->thread_fn(action->irq, action->dev_id); 749 irqreturn_t ret;
750
751 ret = action->thread_fn(action->irq, action->dev_id);
743 irq_finalize_oneshot(desc, action, false); 752 irq_finalize_oneshot(desc, action, false);
753 return ret;
744} 754}
745 755
746/* 756/*
@@ -753,7 +763,8 @@ static int irq_thread(void *data)
753 }; 763 };
754 struct irqaction *action = data; 764 struct irqaction *action = data;
755 struct irq_desc *desc = irq_to_desc(action->irq); 765 struct irq_desc *desc = irq_to_desc(action->irq);
756 void (*handler_fn)(struct irq_desc *desc, struct irqaction *action); 766 irqreturn_t (*handler_fn)(struct irq_desc *desc,
767 struct irqaction *action);
757 int wake; 768 int wake;
758 769
759 if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, 770 if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD,
@@ -783,8 +794,12 @@ static int irq_thread(void *data)
783 desc->istate |= IRQS_PENDING; 794 desc->istate |= IRQS_PENDING;
784 raw_spin_unlock_irq(&desc->lock); 795 raw_spin_unlock_irq(&desc->lock);
785 } else { 796 } else {
797 irqreturn_t action_ret;
798
786 raw_spin_unlock_irq(&desc->lock); 799 raw_spin_unlock_irq(&desc->lock);
787 handler_fn(desc, action); 800 action_ret = handler_fn(desc, action);
801 if (!noirqdebug)
802 note_interrupt(action->irq, desc, action_ret);
788 } 803 }
789 804
790 wake = atomic_dec_and_test(&desc->threads_active); 805 wake = atomic_dec_and_test(&desc->threads_active);
@@ -900,7 +915,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
900 */ 915 */
901 new->handler = irq_nested_primary_handler; 916 new->handler = irq_nested_primary_handler;
902 } else { 917 } else {
903 irq_setup_forced_threading(new); 918 if (irq_settings_can_thread(desc))
919 irq_setup_forced_threading(new);
904 } 920 }
905 921
906 /* 922 /*
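
With the manage.c changes above, the return value of a threaded handler now reaches note_interrupt(), so a thread_fn should report IRQ_HANDLED or IRQ_NONE just like a primary handler; spurious-interrupt detection covers the threaded case too. A minimal sketch with made-up names:

	#include <linux/interrupt.h>

	static bool demo_device_has_work(void *dev_id)	/* hypothetical check */
	{
		return true;
	}

	static irqreturn_t demo_hardirq(int irq, void *dev_id)
	{
		/* Quick, non-sleeping check; defer the real work to the thread. */
		return IRQ_WAKE_THREAD;
	}

	static irqreturn_t demo_thread_fn(int irq, void *dev_id)
	{
		if (!demo_device_has_work(dev_id))
			return IRQ_NONE;	/* feeds the spurious detector */

		/* ... sleepable processing ... */
		return IRQ_HANDLED;
	}

	/* request_threaded_irq(irq, demo_hardirq, demo_thread_fn, 0, "demo", dev); */
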
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 834899f2500f..4bd4faa6323a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,7 +19,7 @@ static struct proc_dir_entry *root_irq_dir;
19 19
20#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
21 21
22static int irq_affinity_proc_show(struct seq_file *m, void *v) 22static int show_irq_affinity(int type, struct seq_file *m, void *v)
23{ 23{
24 struct irq_desc *desc = irq_to_desc((long)m->private); 24 struct irq_desc *desc = irq_to_desc((long)m->private);
25 const struct cpumask *mask = desc->irq_data.affinity; 25 const struct cpumask *mask = desc->irq_data.affinity;
@@ -28,7 +28,10 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
28 if (irqd_is_setaffinity_pending(&desc->irq_data)) 28 if (irqd_is_setaffinity_pending(&desc->irq_data))
29 mask = desc->pending_mask; 29 mask = desc->pending_mask;
30#endif 30#endif
31 seq_cpumask(m, mask); 31 if (type)
32 seq_cpumask_list(m, mask);
33 else
34 seq_cpumask(m, mask);
32 seq_putc(m, '\n'); 35 seq_putc(m, '\n');
33 return 0; 36 return 0;
34} 37}
@@ -59,7 +62,18 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
59#endif 62#endif
60 63
61int no_irq_affinity; 64int no_irq_affinity;
62static ssize_t irq_affinity_proc_write(struct file *file, 65static int irq_affinity_proc_show(struct seq_file *m, void *v)
66{
67 return show_irq_affinity(0, m, v);
68}
69
70static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
71{
72 return show_irq_affinity(1, m, v);
73}
74
75
76static ssize_t write_irq_affinity(int type, struct file *file,
63 const char __user *buffer, size_t count, loff_t *pos) 77 const char __user *buffer, size_t count, loff_t *pos)
64{ 78{
65 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; 79 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
@@ -72,7 +86,10 @@ static ssize_t irq_affinity_proc_write(struct file *file,
72 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 86 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
73 return -ENOMEM; 87 return -ENOMEM;
74 88
75 err = cpumask_parse_user(buffer, count, new_value); 89 if (type)
90 err = cpumask_parselist_user(buffer, count, new_value);
91 else
92 err = cpumask_parse_user(buffer, count, new_value);
76 if (err) 93 if (err)
77 goto free_cpumask; 94 goto free_cpumask;
78 95
@@ -100,11 +117,28 @@ free_cpumask:
100 return err; 117 return err;
101} 118}
102 119
120static ssize_t irq_affinity_proc_write(struct file *file,
121 const char __user *buffer, size_t count, loff_t *pos)
122{
123 return write_irq_affinity(0, file, buffer, count, pos);
124}
125
126static ssize_t irq_affinity_list_proc_write(struct file *file,
127 const char __user *buffer, size_t count, loff_t *pos)
128{
129 return write_irq_affinity(1, file, buffer, count, pos);
130}
131
103static int irq_affinity_proc_open(struct inode *inode, struct file *file) 132static int irq_affinity_proc_open(struct inode *inode, struct file *file)
104{ 133{
105 return single_open(file, irq_affinity_proc_show, PDE(inode)->data); 134 return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
106} 135}
107 136
137static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
138{
139 return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data);
140}
141
108static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) 142static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
109{ 143{
110 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); 144 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
@@ -125,6 +159,14 @@ static const struct file_operations irq_affinity_hint_proc_fops = {
125 .release = single_release, 159 .release = single_release,
126}; 160};
127 161
162static const struct file_operations irq_affinity_list_proc_fops = {
163 .open = irq_affinity_list_proc_open,
164 .read = seq_read,
165 .llseek = seq_lseek,
166 .release = single_release,
167 .write = irq_affinity_list_proc_write,
168};
169
128static int default_affinity_show(struct seq_file *m, void *v) 170static int default_affinity_show(struct seq_file *m, void *v)
129{ 171{
130 seq_cpumask(m, irq_default_affinity); 172 seq_cpumask(m, irq_default_affinity);
@@ -289,6 +331,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
289 proc_create_data("affinity_hint", 0400, desc->dir, 331 proc_create_data("affinity_hint", 0400, desc->dir,
290 &irq_affinity_hint_proc_fops, (void *)(long)irq); 332 &irq_affinity_hint_proc_fops, (void *)(long)irq);
291 333
334 /* create /proc/irq/<irq>/smp_affinity_list */
335 proc_create_data("smp_affinity_list", 0600, desc->dir,
336 &irq_affinity_list_proc_fops, (void *)(long)irq);
337
292 proc_create_data("node", 0444, desc->dir, 338 proc_create_data("node", 0444, desc->dir,
293 &irq_node_proc_fops, (void *)(long)irq); 339 &irq_node_proc_fops, (void *)(long)irq);
294#endif 340#endif
@@ -306,6 +352,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
306#ifdef CONFIG_SMP 352#ifdef CONFIG_SMP
307 remove_proc_entry("smp_affinity", desc->dir); 353 remove_proc_entry("smp_affinity", desc->dir);
308 remove_proc_entry("affinity_hint", desc->dir); 354 remove_proc_entry("affinity_hint", desc->dir);
355 remove_proc_entry("smp_affinity_list", desc->dir);
309 remove_proc_entry("node", desc->dir); 356 remove_proc_entry("node", desc->dir);
310#endif 357#endif
311 remove_proc_entry("spurious", desc->dir); 358 remove_proc_entry("spurious", desc->dir);
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 0d91730b6330..f1667833d444 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -8,6 +8,7 @@ enum {
8 _IRQ_LEVEL = IRQ_LEVEL, 8 _IRQ_LEVEL = IRQ_LEVEL,
9 _IRQ_NOPROBE = IRQ_NOPROBE, 9 _IRQ_NOPROBE = IRQ_NOPROBE,
10 _IRQ_NOREQUEST = IRQ_NOREQUEST, 10 _IRQ_NOREQUEST = IRQ_NOREQUEST,
11 _IRQ_NOTHREAD = IRQ_NOTHREAD,
11 _IRQ_NOAUTOEN = IRQ_NOAUTOEN, 12 _IRQ_NOAUTOEN = IRQ_NOAUTOEN,
12 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, 13 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
13 _IRQ_NO_BALANCING = IRQ_NO_BALANCING, 14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
@@ -20,6 +21,7 @@ enum {
20#define IRQ_LEVEL GOT_YOU_MORON 21#define IRQ_LEVEL GOT_YOU_MORON
21#define IRQ_NOPROBE GOT_YOU_MORON 22#define IRQ_NOPROBE GOT_YOU_MORON
22#define IRQ_NOREQUEST GOT_YOU_MORON 23#define IRQ_NOREQUEST GOT_YOU_MORON
24#define IRQ_NOTHREAD GOT_YOU_MORON
23#define IRQ_NOAUTOEN GOT_YOU_MORON 25#define IRQ_NOAUTOEN GOT_YOU_MORON
24#define IRQ_NESTED_THREAD GOT_YOU_MORON 26#define IRQ_NESTED_THREAD GOT_YOU_MORON
25#undef IRQF_MODIFY_MASK 27#undef IRQF_MODIFY_MASK
@@ -94,6 +96,21 @@ static inline void irq_settings_set_norequest(struct irq_desc *desc)
94 desc->status_use_accessors |= _IRQ_NOREQUEST; 96 desc->status_use_accessors |= _IRQ_NOREQUEST;
95} 97}
96 98
99static inline bool irq_settings_can_thread(struct irq_desc *desc)
100{
101 return !(desc->status_use_accessors & _IRQ_NOTHREAD);
102}
103
104static inline void irq_settings_clr_nothread(struct irq_desc *desc)
105{
106 desc->status_use_accessors &= ~_IRQ_NOTHREAD;
107}
108
109static inline void irq_settings_set_nothread(struct irq_desc *desc)
110{
111 desc->status_use_accessors |= _IRQ_NOTHREAD;
112}
113
97static inline bool irq_settings_can_probe(struct irq_desc *desc) 114static inline bool irq_settings_can_probe(struct irq_desc *desc)
98{ 115{
99 return !(desc->status_use_accessors & _IRQ_NOPROBE); 116 return !(desc->status_use_accessors & _IRQ_NOPROBE);
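
The new IRQ_NOTHREAD setting lets an interrupt opt out of forced irq threading: __irq_set_handler() applies it to chained interrupts above, and __setup_irq() only forces a thread while irq_settings_can_thread() is still true. Assuming IRQ_NOTHREAD is accepted by irq_modify_status()'s modify mask (IRQF_MODIFY_MASK), other code could request the same through the status interface; a one-line sketch:

	/* Keep this line un-threaded even under forced irq threading. */
	irq_modify_status(irq, 0, IRQ_NOTHREAD);
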
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dfbd550401b2..aa57d5da18c1 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -167,6 +167,13 @@ out:
167 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 167 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
168} 168}
169 169
170static inline int bad_action_ret(irqreturn_t action_ret)
171{
172 if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD)))
173 return 0;
174 return 1;
175}
176
170/* 177/*
171 * If 99,900 of the previous 100,000 interrupts have not been handled 178 * If 99,900 of the previous 100,000 interrupts have not been handled
172 * then assume that the IRQ is stuck in some manner. Drop a diagnostic 179 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -182,7 +189,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
182 struct irqaction *action; 189 struct irqaction *action;
183 unsigned long flags; 190 unsigned long flags;
184 191
185 if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { 192 if (bad_action_ret(action_ret)) {
186 printk(KERN_ERR "irq event %d: bogus return value %x\n", 193 printk(KERN_ERR "irq event %d: bogus return value %x\n",
187 irq, action_ret); 194 irq, action_ret);
188 } else { 195 } else {
@@ -201,10 +208,11 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
201 raw_spin_lock_irqsave(&desc->lock, flags); 208 raw_spin_lock_irqsave(&desc->lock, flags);
202 action = desc->action; 209 action = desc->action;
203 while (action) { 210 while (action) {
204 printk(KERN_ERR "[<%p>]", action->handler); 211 printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
205 print_symbol(" (%s)", 212 if (action->thread_fn)
206 (unsigned long)action->handler); 213 printk(KERN_CONT " threaded [<%p>] %pf",
207 printk("\n"); 214 action->thread_fn, action->thread_fn);
215 printk(KERN_CONT "\n");
208 action = action->next; 216 action = action->next;
209 } 217 }
210 raw_spin_unlock_irqrestore(&desc->lock, flags); 218 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -262,7 +270,16 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
262 if (desc->istate & IRQS_POLL_INPROGRESS) 270 if (desc->istate & IRQS_POLL_INPROGRESS)
263 return; 271 return;
264 272
265 if (unlikely(action_ret != IRQ_HANDLED)) { 273 /* we get here again via the threaded handler */
274 if (action_ret == IRQ_WAKE_THREAD)
275 return;
276
277 if (bad_action_ret(action_ret)) {
278 report_bad_irq(irq, desc, action_ret);
279 return;
280 }
281
282 if (unlikely(action_ret == IRQ_NONE)) {
266 /* 283 /*
267 * If we are seeing only the odd spurious IRQ caused by 284 * If we are seeing only the odd spurious IRQ caused by
268 * bus asynchronicity then don't eventually trigger an error, 285 * bus asynchronicity then don't eventually trigger an error,
@@ -274,8 +291,6 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
274 else 291 else
275 desc->irqs_unhandled++; 292 desc->irqs_unhandled++;
276 desc->last_unhandled = jiffies; 293 desc->last_unhandled = jiffies;
277 if (unlikely(action_ret != IRQ_NONE))
278 report_bad_irq(irq, desc, action_ret);
279 } 294 }
280 295
281 if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { 296 if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 3b79bd938330..a8ce45097f3d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -2,43 +2,23 @@
2 * jump label support 2 * jump label support
3 * 3 *
4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> 4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
5 * Copyright (C) 2011 Peter Zijlstra <pzijlstr@redhat.com>
5 * 6 *
6 */ 7 */
7#include <linux/jump_label.h>
8#include <linux/memory.h> 8#include <linux/memory.h>
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/list.h> 11#include <linux/list.h>
12#include <linux/jhash.h>
13#include <linux/slab.h> 12#include <linux/slab.h>
14#include <linux/sort.h> 13#include <linux/sort.h>
15#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/jump_label.h>
16 16
17#ifdef HAVE_JUMP_LABEL 17#ifdef HAVE_JUMP_LABEL
18 18
19#define JUMP_LABEL_HASH_BITS 6
20#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS)
21static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE];
22
23/* mutex to protect coming/going of the jump_label table */ 19/* mutex to protect coming/going of the jump_label table */
24static DEFINE_MUTEX(jump_label_mutex); 20static DEFINE_MUTEX(jump_label_mutex);
25 21
26struct jump_label_entry {
27 struct hlist_node hlist;
28 struct jump_entry *table;
29 int nr_entries;
30 /* hang modules off here */
31 struct hlist_head modules;
32 unsigned long key;
33};
34
35struct jump_label_module_entry {
36 struct hlist_node hlist;
37 struct jump_entry *table;
38 int nr_entries;
39 struct module *mod;
40};
41
42void jump_label_lock(void) 22void jump_label_lock(void)
43{ 23{
44 mutex_lock(&jump_label_mutex); 24 mutex_lock(&jump_label_mutex);
@@ -49,6 +29,11 @@ void jump_label_unlock(void)
49 mutex_unlock(&jump_label_mutex); 29 mutex_unlock(&jump_label_mutex);
50} 30}
51 31
32bool jump_label_enabled(struct jump_label_key *key)
33{
34 return !!atomic_read(&key->enabled);
35}
36
52static int jump_label_cmp(const void *a, const void *b) 37static int jump_label_cmp(const void *a, const void *b)
53{ 38{
54 const struct jump_entry *jea = a; 39 const struct jump_entry *jea = a;
@@ -64,7 +49,7 @@ static int jump_label_cmp(const void *a, const void *b)
64} 49}
65 50
66static void 51static void
67sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) 52jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
68{ 53{
69 unsigned long size; 54 unsigned long size;
70 55
@@ -73,118 +58,25 @@ sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop)
73 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); 58 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
74} 59}
75 60
76static struct jump_label_entry *get_jump_label_entry(jump_label_t key) 61static void jump_label_update(struct jump_label_key *key, int enable);
77{
78 struct hlist_head *head;
79 struct hlist_node *node;
80 struct jump_label_entry *e;
81 u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
82
83 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
84 hlist_for_each_entry(e, node, head, hlist) {
85 if (key == e->key)
86 return e;
87 }
88 return NULL;
89}
90 62
91static struct jump_label_entry * 63void jump_label_inc(struct jump_label_key *key)
92add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table)
93{ 64{
94 struct hlist_head *head; 65 if (atomic_inc_not_zero(&key->enabled))
95 struct jump_label_entry *e; 66 return;
96 u32 hash;
97
98 e = get_jump_label_entry(key);
99 if (e)
100 return ERR_PTR(-EEXIST);
101
102 e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL);
103 if (!e)
104 return ERR_PTR(-ENOMEM);
105
106 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
107 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
108 e->key = key;
109 e->table = table;
110 e->nr_entries = nr_entries;
111 INIT_HLIST_HEAD(&(e->modules));
112 hlist_add_head(&e->hlist, head);
113 return e;
114}
115 67
116static int 68 jump_label_lock();
117build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop) 69 if (atomic_add_return(1, &key->enabled) == 1)
118{ 70 jump_label_update(key, JUMP_LABEL_ENABLE);
119 struct jump_entry *iter, *iter_begin; 71 jump_label_unlock();
120 struct jump_label_entry *entry;
121 int count;
122
123 sort_jump_label_entries(start, stop);
124 iter = start;
125 while (iter < stop) {
126 entry = get_jump_label_entry(iter->key);
127 if (!entry) {
128 iter_begin = iter;
129 count = 0;
130 while ((iter < stop) &&
131 (iter->key == iter_begin->key)) {
132 iter++;
133 count++;
134 }
135 entry = add_jump_label_entry(iter_begin->key,
136 count, iter_begin);
137 if (IS_ERR(entry))
138 return PTR_ERR(entry);
139 } else {
140 WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n");
141 return -1;
142 }
143 }
144 return 0;
145} 72}
146 73
147/*** 74void jump_label_dec(struct jump_label_key *key)
148 * jump_label_update - update jump label text
149 * @key - key value associated with a a jump label
150 * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE
151 *
152 * Will enable/disable the jump for jump label @key, depending on the
153 * value of @type.
154 *
155 */
156
157void jump_label_update(unsigned long key, enum jump_label_type type)
158{ 75{
159 struct jump_entry *iter; 76 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
160 struct jump_label_entry *entry; 77 return;
161 struct hlist_node *module_node;
162 struct jump_label_module_entry *e_module;
163 int count;
164 78
165 jump_label_lock(); 79 jump_label_update(key, JUMP_LABEL_DISABLE);
166 entry = get_jump_label_entry((jump_label_t)key);
167 if (entry) {
168 count = entry->nr_entries;
169 iter = entry->table;
170 while (count--) {
171 if (kernel_text_address(iter->code))
172 arch_jump_label_transform(iter, type);
173 iter++;
174 }
175 /* eanble/disable jump labels in modules */
176 hlist_for_each_entry(e_module, module_node, &(entry->modules),
177 hlist) {
178 count = e_module->nr_entries;
179 iter = e_module->table;
180 while (count--) {
181 if (iter->key &&
182 kernel_text_address(iter->code))
183 arch_jump_label_transform(iter, type);
184 iter++;
185 }
186 }
187 }
188 jump_label_unlock(); 80 jump_label_unlock();
189} 81}
190 82
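
The rework above replaces the hash table with a reference count embedded in each jump_label_key: jump_label_inc() patches the jump sites to the enabled state on the 0 -> 1 transition, jump_label_dec() patches them back on the final decrement, and nested enables just bump the counter. A usage sketch with a hypothetical key (the read side would typically test the key with static_branch() from linux/jump_label.h):

	#include <linux/jump_label.h>

	static struct jump_label_key demo_key;	/* hypothetical key, starts disabled */

	static void demo_feature_enable(void)
	{
		jump_label_inc(&demo_key);	/* first enable patches the sites */
	}

	static void demo_feature_disable(void)
	{
		jump_label_dec(&demo_key);	/* last disable patches them back */
	}
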
@@ -197,77 +89,36 @@ static int addr_conflict(struct jump_entry *entry, void *start, void *end)
197 return 0; 89 return 0;
198} 90}
199 91
200#ifdef CONFIG_MODULES 92static int __jump_label_text_reserved(struct jump_entry *iter_start,
201 93 struct jump_entry *iter_stop, void *start, void *end)
202static int module_conflict(void *start, void *end)
203{
204 struct hlist_head *head;
205 struct hlist_node *node, *node_next, *module_node, *module_node_next;
206 struct jump_label_entry *e;
207 struct jump_label_module_entry *e_module;
208 struct jump_entry *iter;
209 int i, count;
210 int conflict = 0;
211
212 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
213 head = &jump_label_table[i];
214 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
215 hlist_for_each_entry_safe(e_module, module_node,
216 module_node_next,
217 &(e->modules), hlist) {
218 count = e_module->nr_entries;
219 iter = e_module->table;
220 while (count--) {
221 if (addr_conflict(iter, start, end)) {
222 conflict = 1;
223 goto out;
224 }
225 iter++;
226 }
227 }
228 }
229 }
230out:
231 return conflict;
232}
233
234#endif
235
236/***
237 * jump_label_text_reserved - check if addr range is reserved
238 * @start: start text addr
239 * @end: end text addr
240 *
241 * checks if the text addr located between @start and @end
242 * overlaps with any of the jump label patch addresses. Code
243 * that wants to modify kernel text should first verify that
244 * it does not overlap with any of the jump label addresses.
245 * Caller must hold jump_label_mutex.
246 *
247 * returns 1 if there is an overlap, 0 otherwise
248 */
249int jump_label_text_reserved(void *start, void *end)
250{ 94{
251 struct jump_entry *iter; 95 struct jump_entry *iter;
252 struct jump_entry *iter_start = __start___jump_table;
253 struct jump_entry *iter_stop = __start___jump_table;
254 int conflict = 0;
255 96
256 iter = iter_start; 97 iter = iter_start;
257 while (iter < iter_stop) { 98 while (iter < iter_stop) {
258 if (addr_conflict(iter, start, end)) { 99 if (addr_conflict(iter, start, end))
259 conflict = 1; 100 return 1;
260 goto out;
261 }
262 iter++; 101 iter++;
263 } 102 }
264 103
265 /* now check modules */ 104 return 0;
266#ifdef CONFIG_MODULES 105}
267 conflict = module_conflict(start, end); 106
268#endif 107static void __jump_label_update(struct jump_label_key *key,
269out: 108 struct jump_entry *entry,
270 return conflict; 109 struct jump_entry *stop, int enable)
110{
111 for (; (entry < stop) &&
112 (entry->key == (jump_label_t)(unsigned long)key);
113 entry++) {
114 /*
115 * entry->code set to 0 invalidates module init text sections
116 * kernel_text_address() verifies we are not in core kernel
117 * init code, see jump_label_invalidate_module_init().
118 */
119 if (entry->code && kernel_text_address(entry->code))
120 arch_jump_label_transform(entry, enable);
121 }
271} 122}
272 123
273/* 124/*
@@ -277,145 +128,181 @@ void __weak arch_jump_label_text_poke_early(jump_label_t addr)
277{ 128{
278} 129}
279 130
280static __init int init_jump_label(void) 131static __init int jump_label_init(void)
281{ 132{
282 int ret;
283 struct jump_entry *iter_start = __start___jump_table; 133 struct jump_entry *iter_start = __start___jump_table;
284 struct jump_entry *iter_stop = __stop___jump_table; 134 struct jump_entry *iter_stop = __stop___jump_table;
135 struct jump_label_key *key = NULL;
285 struct jump_entry *iter; 136 struct jump_entry *iter;
286 137
287 jump_label_lock(); 138 jump_label_lock();
288 ret = build_jump_label_hashtable(__start___jump_table, 139 jump_label_sort_entries(iter_start, iter_stop);
289 __stop___jump_table); 140
290 iter = iter_start; 141 for (iter = iter_start; iter < iter_stop; iter++) {
291 while (iter < iter_stop) {
292 arch_jump_label_text_poke_early(iter->code); 142 arch_jump_label_text_poke_early(iter->code);
293 iter++; 143 if (iter->key == (jump_label_t)(unsigned long)key)
144 continue;
145
146 key = (struct jump_label_key *)(unsigned long)iter->key;
147 atomic_set(&key->enabled, 0);
148 key->entries = iter;
149#ifdef CONFIG_MODULES
150 key->next = NULL;
151#endif
294 } 152 }
295 jump_label_unlock(); 153 jump_label_unlock();
296 return ret; 154
155 return 0;
297} 156}
298early_initcall(init_jump_label); 157early_initcall(jump_label_init);
299 158
300#ifdef CONFIG_MODULES 159#ifdef CONFIG_MODULES
301 160
302static struct jump_label_module_entry * 161struct jump_label_mod {
303add_jump_label_module_entry(struct jump_label_entry *entry, 162 struct jump_label_mod *next;
304 struct jump_entry *iter_begin, 163 struct jump_entry *entries;
305 int count, struct module *mod) 164 struct module *mod;
165};
166
167static int __jump_label_mod_text_reserved(void *start, void *end)
306{ 168{
307 struct jump_label_module_entry *e; 169 struct module *mod;
308 170
309 e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL); 171 mod = __module_text_address((unsigned long)start);
310 if (!e) 172 if (!mod)
311 return ERR_PTR(-ENOMEM); 173 return 0;
312 e->mod = mod; 174
313 e->nr_entries = count; 175 WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
314 e->table = iter_begin; 176
315 hlist_add_head(&e->hlist, &entry->modules); 177 return __jump_label_text_reserved(mod->jump_entries,
316 return e; 178 mod->jump_entries + mod->num_jump_entries,
179 start, end);
317} 180}
318 181
319static int add_jump_label_module(struct module *mod) 182static void __jump_label_mod_update(struct jump_label_key *key, int enable)
320{ 183{
321 struct jump_entry *iter, *iter_begin; 184 struct jump_label_mod *mod = key->next;
322 struct jump_label_entry *entry;
323 struct jump_label_module_entry *module_entry;
324 int count;
325 185
326 /* if the module doesn't have jump label entries, just return */ 186 while (mod) {
327 if (!mod->num_jump_entries) 187 struct module *m = mod->mod;
328 return 0;
329 188
330 sort_jump_label_entries(mod->jump_entries, 189 __jump_label_update(key, mod->entries,
331 mod->jump_entries + mod->num_jump_entries); 190 m->jump_entries + m->num_jump_entries,
332 iter = mod->jump_entries; 191 enable);
333 while (iter < mod->jump_entries + mod->num_jump_entries) { 192 mod = mod->next;
334 entry = get_jump_label_entry(iter->key);
335 iter_begin = iter;
336 count = 0;
337 while ((iter < mod->jump_entries + mod->num_jump_entries) &&
338 (iter->key == iter_begin->key)) {
339 iter++;
340 count++;
341 }
342 if (!entry) {
343 entry = add_jump_label_entry(iter_begin->key, 0, NULL);
344 if (IS_ERR(entry))
345 return PTR_ERR(entry);
346 }
347 module_entry = add_jump_label_module_entry(entry, iter_begin,
348 count, mod);
349 if (IS_ERR(module_entry))
350 return PTR_ERR(module_entry);
351 } 193 }
352 return 0;
353} 194}
354 195
355static void remove_jump_label_module(struct module *mod) 196/***
197 * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
198 * @mod: module to patch
199 *
200 * Allow for run-time selection of the optimal nops. Before the module
201 * loads patch these with arch_get_jump_label_nop(), which is specified by
202 * the arch specific jump label code.
203 */
204void jump_label_apply_nops(struct module *mod)
356{ 205{
357 struct hlist_head *head; 206 struct jump_entry *iter_start = mod->jump_entries;
358 struct hlist_node *node, *node_next, *module_node, *module_node_next; 207 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
359 struct jump_label_entry *e; 208 struct jump_entry *iter;
360 struct jump_label_module_entry *e_module;
361 int i;
362 209
363 /* if the module doesn't have jump label entries, just return */ 210 /* if the module doesn't have jump label entries, just return */
364 if (!mod->num_jump_entries) 211 if (iter_start == iter_stop)
365 return; 212 return;
366 213
367 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { 214 for (iter = iter_start; iter < iter_stop; iter++)
368 head = &jump_label_table[i]; 215 arch_jump_label_text_poke_early(iter->code);
369 hlist_for_each_entry_safe(e, node, node_next, head, hlist) { 216}
370 hlist_for_each_entry_safe(e_module, module_node, 217
371 module_node_next, 218static int jump_label_add_module(struct module *mod)
372 &(e->modules), hlist) { 219{
373 if (e_module->mod == mod) { 220 struct jump_entry *iter_start = mod->jump_entries;
374 hlist_del(&e_module->hlist); 221 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
375 kfree(e_module); 222 struct jump_entry *iter;
376 } 223 struct jump_label_key *key = NULL;
377 } 224 struct jump_label_mod *jlm;
378 if (hlist_empty(&e->modules) && (e->nr_entries == 0)) { 225
379 hlist_del(&e->hlist); 226 /* if the module doesn't have jump label entries, just return */
380 kfree(e); 227 if (iter_start == iter_stop)
381 } 228 return 0;
229
230 jump_label_sort_entries(iter_start, iter_stop);
231
232 for (iter = iter_start; iter < iter_stop; iter++) {
233 if (iter->key == (jump_label_t)(unsigned long)key)
234 continue;
235
236 key = (struct jump_label_key *)(unsigned long)iter->key;
237
238 if (__module_address(iter->key) == mod) {
239 atomic_set(&key->enabled, 0);
240 key->entries = iter;
241 key->next = NULL;
242 continue;
382 } 243 }
244
245 jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL);
246 if (!jlm)
247 return -ENOMEM;
248
249 jlm->mod = mod;
250 jlm->entries = iter;
251 jlm->next = key->next;
252 key->next = jlm;
253
254 if (jump_label_enabled(key))
255 __jump_label_update(key, iter, iter_stop,
256 JUMP_LABEL_ENABLE);
383 } 257 }
258
259 return 0;
384} 260}
385 261
386static void remove_jump_label_module_init(struct module *mod) 262static void jump_label_del_module(struct module *mod)
387{ 263{
388 struct hlist_head *head; 264 struct jump_entry *iter_start = mod->jump_entries;
389 struct hlist_node *node, *node_next, *module_node, *module_node_next; 265 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
390 struct jump_label_entry *e;
391 struct jump_label_module_entry *e_module;
392 struct jump_entry *iter; 266 struct jump_entry *iter;
393 int i, count; 267 struct jump_label_key *key = NULL;
268 struct jump_label_mod *jlm, **prev;
394 269
395 /* if the module doesn't have jump label entries, just return */ 270 for (iter = iter_start; iter < iter_stop; iter++) {
396 if (!mod->num_jump_entries) 271 if (iter->key == (jump_label_t)(unsigned long)key)
397 return; 272 continue;
273
274 key = (struct jump_label_key *)(unsigned long)iter->key;
275
276 if (__module_address(iter->key) == mod)
277 continue;
278
279 prev = &key->next;
280 jlm = key->next;
281
282 while (jlm && jlm->mod != mod) {
283 prev = &jlm->next;
284 jlm = jlm->next;
285 }
398 286
399 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { 287 if (jlm) {
400 head = &jump_label_table[i]; 288 *prev = jlm->next;
401 hlist_for_each_entry_safe(e, node, node_next, head, hlist) { 289 kfree(jlm);
402 hlist_for_each_entry_safe(e_module, module_node,
403 module_node_next,
404 &(e->modules), hlist) {
405 if (e_module->mod != mod)
406 continue;
407 count = e_module->nr_entries;
408 iter = e_module->table;
409 while (count--) {
410 if (within_module_init(iter->code, mod))
411 iter->key = 0;
412 iter++;
413 }
414 }
415 } 290 }
416 } 291 }
417} 292}
418 293
294static void jump_label_invalidate_module_init(struct module *mod)
295{
296 struct jump_entry *iter_start = mod->jump_entries;
297 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
298 struct jump_entry *iter;
299
300 for (iter = iter_start; iter < iter_stop; iter++) {
301 if (within_module_init(iter->code, mod))
302 iter->code = 0;
303 }
304}
305
419static int 306static int
420jump_label_module_notify(struct notifier_block *self, unsigned long val, 307jump_label_module_notify(struct notifier_block *self, unsigned long val,
421 void *data) 308 void *data)
@@ -426,59 +313,81 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
426 switch (val) { 313 switch (val) {
427 case MODULE_STATE_COMING: 314 case MODULE_STATE_COMING:
428 jump_label_lock(); 315 jump_label_lock();
429 ret = add_jump_label_module(mod); 316 ret = jump_label_add_module(mod);
430 if (ret) 317 if (ret)
431 remove_jump_label_module(mod); 318 jump_label_del_module(mod);
432 jump_label_unlock(); 319 jump_label_unlock();
433 break; 320 break;
434 case MODULE_STATE_GOING: 321 case MODULE_STATE_GOING:
435 jump_label_lock(); 322 jump_label_lock();
436 remove_jump_label_module(mod); 323 jump_label_del_module(mod);
437 jump_label_unlock(); 324 jump_label_unlock();
438 break; 325 break;
439 case MODULE_STATE_LIVE: 326 case MODULE_STATE_LIVE:
440 jump_label_lock(); 327 jump_label_lock();
441 remove_jump_label_module_init(mod); 328 jump_label_invalidate_module_init(mod);
442 jump_label_unlock(); 329 jump_label_unlock();
443 break; 330 break;
444 } 331 }
445 return ret;
446}
447
448/***
449 * jump_label_apply_nops - patch module jump labels with arch_get_jump_label_nop()
450 * @mod: module to patch
451 *
452 * Allow for run-time selection of the optimal nops. Before the module
453 * loads patch these with arch_get_jump_label_nop(), which is specified by
454 * the arch specific jump label code.
455 */
456void jump_label_apply_nops(struct module *mod)
457{
458 struct jump_entry *iter;
459
460 /* if the module doesn't have jump label entries, just return */
461 if (!mod->num_jump_entries)
462 return;
463 332
464 iter = mod->jump_entries; 333 return notifier_from_errno(ret);
465 while (iter < mod->jump_entries + mod->num_jump_entries) {
466 arch_jump_label_text_poke_early(iter->code);
467 iter++;
468 }
469} 334}
470 335
471struct notifier_block jump_label_module_nb = { 336struct notifier_block jump_label_module_nb = {
472 .notifier_call = jump_label_module_notify, 337 .notifier_call = jump_label_module_notify,
473 .priority = 0, 338 .priority = 1, /* higher than tracepoints */
474}; 339};
475 340
476static __init int init_jump_label_module(void) 341static __init int jump_label_init_module(void)
477{ 342{
478 return register_module_notifier(&jump_label_module_nb); 343 return register_module_notifier(&jump_label_module_nb);
479} 344}
480early_initcall(init_jump_label_module); 345early_initcall(jump_label_init_module);
481 346
482#endif /* CONFIG_MODULES */ 347#endif /* CONFIG_MODULES */
483 348
349/***
350 * jump_label_text_reserved - check if addr range is reserved
351 * @start: start text addr
352 * @end: end text addr
353 *
354 * checks if the text addr located between @start and @end
355 * overlaps with any of the jump label patch addresses. Code
356 * that wants to modify kernel text should first verify that
357 * it does not overlap with any of the jump label addresses.
358 * Caller must hold jump_label_mutex.
359 *
360 * returns 1 if there is an overlap, 0 otherwise
361 */
362int jump_label_text_reserved(void *start, void *end)
363{
364 int ret = __jump_label_text_reserved(__start___jump_table,
365 __stop___jump_table, start, end);
366
367 if (ret)
368 return ret;
369
370#ifdef CONFIG_MODULES
371 ret = __jump_label_mod_text_reserved(start, end);
372#endif
373 return ret;
374}
375
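As the kerneldoc above says, code that patches kernel text should verify the range first while holding jump_label_mutex. A hypothetical caller sketch; demo_can_patch() is illustrative, and jump_label_lock()/jump_label_unlock() provide the required mutex:

static int demo_can_patch(void *addr, size_t len)
{
	int reserved;

	jump_label_lock();
	reserved = jump_label_text_reserved(addr, addr + len);
	jump_label_unlock();

	return !reserved;	/* non-zero means the range is safe to patch */
}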
376static void jump_label_update(struct jump_label_key *key, int enable)
377{
378 struct jump_entry *entry = key->entries, *stop = __stop___jump_table;
379
380#ifdef CONFIG_MODULES
381 struct module *mod = __module_address((jump_label_t)key);
382
383 __jump_label_mod_update(key, enable);
384
385 if (mod)
386 stop = mod->jump_entries + mod->num_jump_entries;
387#endif
388 /* if there are no users, entry can be NULL */
389 if (entry)
390 __jump_label_update(key, entry, stop, enable);
391}
392
484#endif 393#endif
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 87b77de03dd3..296fbc84d659 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1095,7 +1095,7 @@ size_t crash_get_memory_size(void)
1095 size_t size = 0; 1095 size_t size = 0;
1096 mutex_lock(&kexec_mutex); 1096 mutex_lock(&kexec_mutex);
1097 if (crashk_res.end != crashk_res.start) 1097 if (crashk_res.end != crashk_res.start)
1098 size = crashk_res.end - crashk_res.start + 1; 1098 size = resource_size(&crashk_res);
1099 mutex_unlock(&kexec_mutex); 1099 mutex_unlock(&kexec_mutex);
1100 return size; 1100 return size;
1101} 1101}
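The resource_size() conversion above is equivalent to the open-coded arithmetic it replaces; roughly, the helper amounts to the following inclusive-range calculation (sketch only, see include/linux/ioport.h for the real definition):

static inline resource_size_t demo_resource_size(const struct resource *res)
{
	return res->end - res->start + 1;	/* inclusive [start, end] range */
}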
@@ -1531,13 +1531,7 @@ int kernel_kexec(void)
1531 if (error) 1531 if (error)
1532 goto Enable_cpus; 1532 goto Enable_cpus;
1533 local_irq_disable(); 1533 local_irq_disable();
1534 /* Suspend system devices */ 1534 error = syscore_suspend();
1535 error = sysdev_suspend(PMSG_FREEZE);
1536 if (!error) {
1537 error = syscore_suspend();
1538 if (error)
1539 sysdev_resume();
1540 }
1541 if (error) 1535 if (error)
1542 goto Enable_irqs; 1536 goto Enable_irqs;
1543 } else 1537 } else
@@ -1553,7 +1547,6 @@ int kernel_kexec(void)
1553#ifdef CONFIG_KEXEC_JUMP 1547#ifdef CONFIG_KEXEC_JUMP
1554 if (kexec_image->preserve_context) { 1548 if (kexec_image->preserve_context) {
1555 syscore_resume(); 1549 syscore_resume();
1556 sysdev_resume();
1557 Enable_irqs: 1550 Enable_irqs:
1558 local_irq_enable(); 1551 local_irq_enable();
1559 Enable_cpus: 1552 Enable_cpus:
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9cd0591c96a2..ddc7644c1305 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,6 +25,7 @@
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/completion.h> 27#include <linux/completion.h>
28#include <linux/cred.h>
28#include <linux/file.h> 29#include <linux/file.h>
29#include <linux/fdtable.h> 30#include <linux/fdtable.h>
30#include <linux/workqueue.h> 31#include <linux/workqueue.h>
@@ -43,6 +44,13 @@ extern int max_threads;
43 44
44static struct workqueue_struct *khelper_wq; 45static struct workqueue_struct *khelper_wq;
45 46
47#define CAP_BSET (void *)1
48#define CAP_PI (void *)2
49
50static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
51static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
52static DEFINE_SPINLOCK(umh_sysctl_lock);
53
46#ifdef CONFIG_MODULES 54#ifdef CONFIG_MODULES
47 55
48/* 56/*
@@ -132,6 +140,7 @@ EXPORT_SYMBOL(__request_module);
132static int ____call_usermodehelper(void *data) 140static int ____call_usermodehelper(void *data)
133{ 141{
134 struct subprocess_info *sub_info = data; 142 struct subprocess_info *sub_info = data;
143 struct cred *new;
135 int retval; 144 int retval;
136 145
137 spin_lock_irq(&current->sighand->siglock); 146 spin_lock_irq(&current->sighand->siglock);
@@ -147,12 +156,27 @@ static int ____call_usermodehelper(void *data)
147 */ 156 */
148 set_user_nice(current, 0); 157 set_user_nice(current, 0);
149 158
159 retval = -ENOMEM;
160 new = prepare_kernel_cred(current);
161 if (!new)
162 goto fail;
163
164 spin_lock(&umh_sysctl_lock);
165 new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
166 new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
167 new->cap_inheritable);
168 spin_unlock(&umh_sysctl_lock);
169
150 if (sub_info->init) { 170 if (sub_info->init) {
151 retval = sub_info->init(sub_info); 171 retval = sub_info->init(sub_info, new);
152 if (retval) 172 if (retval) {
173 abort_creds(new);
153 goto fail; 174 goto fail;
175 }
154 } 176 }
155 177
178 commit_creds(new);
179
156 retval = kernel_execve(sub_info->path, 180 retval = kernel_execve(sub_info->path,
157 (const char *const *)sub_info->argv, 181 (const char *const *)sub_info->argv,
158 (const char *const *)sub_info->envp); 182 (const char *const *)sub_info->envp);
@@ -245,13 +269,12 @@ static void __call_usermodehelper(struct work_struct *work)
245 } 269 }
246} 270}
247 271
248#ifdef CONFIG_PM_SLEEP
249/* 272/*
250 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY 273 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
251 * (used for preventing user land processes from being created after the user 274 * (used for preventing user land processes from being created after the user
252 * land has been frozen during a system-wide hibernation or suspend operation). 275 * land has been frozen during a system-wide hibernation or suspend operation).
253 */ 276 */
254static int usermodehelper_disabled; 277static int usermodehelper_disabled = 1;
255 278
256/* Number of helpers running */ 279/* Number of helpers running */
257static atomic_t running_helpers = ATOMIC_INIT(0); 280static atomic_t running_helpers = ATOMIC_INIT(0);
@@ -301,6 +324,15 @@ void usermodehelper_enable(void)
301 usermodehelper_disabled = 0; 324 usermodehelper_disabled = 0;
302} 325}
303 326
327/**
328 * usermodehelper_is_disabled - check if new helpers are allowed to be started
329 */
330bool usermodehelper_is_disabled(void)
331{
332 return usermodehelper_disabled;
333}
334EXPORT_SYMBOL_GPL(usermodehelper_is_disabled);
335
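A hypothetical caller sketch for the new predicate: code that would otherwise spawn a helper while userspace is frozen can bail out early. The demo_* name and the UMH_WAIT_EXEC choice are illustrative:

static int demo_start_helper(char *path, char **argv, char **envp)
{
	if (usermodehelper_is_disabled())
		return -EBUSY;	/* userspace is frozen, don't even try */

	return call_usermodehelper(path, argv, envp, UMH_WAIT_EXEC);
}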
304static void helper_lock(void) 336static void helper_lock(void)
305{ 337{
306 atomic_inc(&running_helpers); 338 atomic_inc(&running_helpers);
@@ -312,12 +344,6 @@ static void helper_unlock(void)
312 if (atomic_dec_and_test(&running_helpers)) 344 if (atomic_dec_and_test(&running_helpers))
313 wake_up(&running_helpers_waitq); 345 wake_up(&running_helpers_waitq);
314} 346}
315#else /* CONFIG_PM_SLEEP */
316#define usermodehelper_disabled 0
317
318static inline void helper_lock(void) {}
319static inline void helper_unlock(void) {}
320#endif /* CONFIG_PM_SLEEP */
321 347
322/** 348/**
323 * call_usermodehelper_setup - prepare to call a usermode helper 349 * call_usermodehelper_setup - prepare to call a usermode helper
@@ -364,7 +390,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
364 * context in which call_usermodehelper_exec is called. 390 * context in which call_usermodehelper_exec is called.
365 */ 391 */
366void call_usermodehelper_setfns(struct subprocess_info *info, 392void call_usermodehelper_setfns(struct subprocess_info *info,
367 int (*init)(struct subprocess_info *info), 393 int (*init)(struct subprocess_info *info, struct cred *new),
368 void (*cleanup)(struct subprocess_info *info), 394 void (*cleanup)(struct subprocess_info *info),
369 void *data) 395 void *data)
370{ 396{
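With the init() callback now also receiving the helper's prepared credentials, a caller can adjust them before the exec. A hypothetical sketch against the new signature; the demo_* names and the GFP/wait flags are illustrative:

static int demo_umh_init(struct subprocess_info *info, struct cred *new)
{
	/* adjust @new here (groups, keyrings, ...) if needed; 0 keeps it as is */
	return 0;
}

static int demo_run_helper(char *path, char **argv, char **envp)
{
	struct subprocess_info *info;

	info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
	if (!info)
		return -ENOMEM;

	call_usermodehelper_setfns(info, demo_umh_init, NULL, NULL);
	return call_usermodehelper_exec(info, UMH_WAIT_EXEC);
}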
@@ -418,6 +444,84 @@ unlock:
418} 444}
419EXPORT_SYMBOL(call_usermodehelper_exec); 445EXPORT_SYMBOL(call_usermodehelper_exec);
420 446
447static int proc_cap_handler(struct ctl_table *table, int write,
448 void __user *buffer, size_t *lenp, loff_t *ppos)
449{
450 struct ctl_table t;
451 unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
452 kernel_cap_t new_cap;
453 int err, i;
454
455 if (write && (!capable(CAP_SETPCAP) ||
456 !capable(CAP_SYS_MODULE)))
457 return -EPERM;
458
459 /*
460 * convert from the global kernel_cap_t to the ulong array to print to
461 * userspace if this is a read.
462 */
463 spin_lock(&umh_sysctl_lock);
464 for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
465 if (table->data == CAP_BSET)
466 cap_array[i] = usermodehelper_bset.cap[i];
467 else if (table->data == CAP_PI)
468 cap_array[i] = usermodehelper_inheritable.cap[i];
469 else
470 BUG();
471 }
472 spin_unlock(&umh_sysctl_lock);
473
474 t = *table;
475 t.data = &cap_array;
476
477 /*
478 * actually read or write an array of ulongs from userspace. Remember
479 * these are least significant 32 bits first
480 */
481 err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
482 if (err < 0)
483 return err;
484
485 /*
486 * convert from the sysctl array of ulongs to the kernel_cap_t
487 * internal representation
488 */
489 for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
490 new_cap.cap[i] = cap_array[i];
491
492 /*
493 * Drop everything not in the new_cap (but don't add things)
494 */
495 spin_lock(&umh_sysctl_lock);
496 if (write) {
497 if (table->data == CAP_BSET)
498 usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
499 if (table->data == CAP_PI)
500 usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
501 }
502 spin_unlock(&umh_sysctl_lock);
503
504 return 0;
505}
506
507struct ctl_table usermodehelper_table[] = {
508 {
509 .procname = "bset",
510 .data = CAP_BSET,
511 .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
512 .mode = 0600,
513 .proc_handler = proc_cap_handler,
514 },
515 {
516 .procname = "inheritable",
517 .data = CAP_PI,
518 .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
519 .mode = 0600,
520 .proc_handler = proc_cap_handler,
521 },
522 { }
523};
524
421void __init usermodehelper_init(void) 525void __init usermodehelper_init(void)
422{ 526{
423 khelper_wq = create_singlethread_workqueue("khelper"); 527 khelper_wq = create_singlethread_workqueue("khelper");
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 77981813a1e7..b30fd54eb985 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1255,19 +1255,29 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
1255/* 1255/*
1256 * If we have a symbol_name argument, look it up and add the offset field 1256 * If we have a symbol_name argument, look it up and add the offset field
1257 * to it. This way, we can specify a relative address to a symbol. 1257 * to it. This way, we can specify a relative address to a symbol.
1258 * This returns encoded errors if it fails to look up the symbol or if the
1259 * combination of parameters is invalid.
1258 */ 1260 */
1259static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) 1261static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
1260{ 1262{
1261 kprobe_opcode_t *addr = p->addr; 1263 kprobe_opcode_t *addr = p->addr;
1264
1265 if ((p->symbol_name && p->addr) ||
1266 (!p->symbol_name && !p->addr))
1267 goto invalid;
1268
1262 if (p->symbol_name) { 1269 if (p->symbol_name) {
1263 if (addr)
1264 return NULL;
1265 kprobe_lookup_name(p->symbol_name, addr); 1270 kprobe_lookup_name(p->symbol_name, addr);
1271 if (!addr)
1272 return ERR_PTR(-ENOENT);
1266 } 1273 }
1267 1274
1268 if (!addr) 1275 addr = (kprobe_opcode_t *)(((char *)addr) + p->offset);
1269 return NULL; 1276 if (addr)
1270 return (kprobe_opcode_t *)(((char *)addr) + p->offset); 1277 return addr;
1278
1279invalid:
1280 return ERR_PTR(-EINVAL);
1271} 1281}
1272 1282
1273/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1283/* Check passed kprobe is valid and return kprobe in kprobe_table. */
@@ -1311,8 +1321,8 @@ int __kprobes register_kprobe(struct kprobe *p)
1311 kprobe_opcode_t *addr; 1321 kprobe_opcode_t *addr;
1312 1322
1313 addr = kprobe_addr(p); 1323 addr = kprobe_addr(p);
1314 if (!addr) 1324 if (IS_ERR(addr))
1315 return -EINVAL; 1325 return PTR_ERR(addr);
1316 p->addr = addr; 1326 p->addr = addr;
1317 1327
1318 ret = check_kprobe_rereg(p); 1328 ret = check_kprobe_rereg(p);
@@ -1335,6 +1345,8 @@ int __kprobes register_kprobe(struct kprobe *p)
1335 */ 1345 */
1336 probed_mod = __module_text_address((unsigned long) p->addr); 1346 probed_mod = __module_text_address((unsigned long) p->addr);
1337 if (probed_mod) { 1347 if (probed_mod) {
1348 /* Return -ENOENT if fail. */
1349 ret = -ENOENT;
1338 /* 1350 /*
1339 * We must hold a refcount of the probed module while updating 1351 * We must hold a refcount of the probed module while updating
1340 * its code to prohibit unexpected unloading. 1352 * its code to prohibit unexpected unloading.
@@ -1351,6 +1363,7 @@ int __kprobes register_kprobe(struct kprobe *p)
1351 module_put(probed_mod); 1363 module_put(probed_mod);
1352 goto fail_with_jump_label; 1364 goto fail_with_jump_label;
1353 } 1365 }
1366 /* ret will be updated by following code */
1354 } 1367 }
1355 preempt_enable(); 1368 preempt_enable();
1356 jump_label_unlock(); 1369 jump_label_unlock();
@@ -1399,7 +1412,7 @@ out:
1399fail_with_jump_label: 1412fail_with_jump_label:
1400 preempt_enable(); 1413 preempt_enable();
1401 jump_label_unlock(); 1414 jump_label_unlock();
1402 return -EINVAL; 1415 return ret;
1403} 1416}
1404EXPORT_SYMBOL_GPL(register_kprobe); 1417EXPORT_SYMBOL_GPL(register_kprobe);
1405 1418
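With the stricter kprobe_addr() above, exactly one of .symbol_name and .addr may be set, and failures come back as distinct codes (-EINVAL for a bad combination, -ENOENT for a failed lookup). A hypothetical registration sketch; the probed symbol and demo_* names are only examples:

static int demo_pre(struct kprobe *p, struct pt_regs *regs)
{
	return 0;	/* observe only, let execution continue */
}

static struct kprobe demo_kp = {
	.symbol_name	= "do_fork",	/* .addr deliberately left NULL */
	.pre_handler	= demo_pre,
};

static int __init demo_kprobe_init(void)
{
	/* would now fail with -EINVAL if both .symbol_name and .addr were set */
	return register_kprobe(&demo_kp);
}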
@@ -1686,8 +1699,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1686 1699
1687 if (kretprobe_blacklist_size) { 1700 if (kretprobe_blacklist_size) {
1688 addr = kprobe_addr(&rp->kp); 1701 addr = kprobe_addr(&rp->kp);
1689 if (!addr) 1702 if (IS_ERR(addr))
1690 return -EINVAL; 1703 return PTR_ERR(addr);
1691 1704
1692 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { 1705 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
1693 if (kretprobe_blacklist[i].addr == addr) 1706 if (kretprobe_blacklist[i].addr == addr)
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 0b624e791805..3b053c04dd86 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -16,6 +16,7 @@
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/profile.h> 17#include <linux/profile.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/capability.h>
19 20
20#define KERNEL_ATTR_RO(_name) \ 21#define KERNEL_ATTR_RO(_name) \
21static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 22static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
@@ -131,6 +132,14 @@ KERNEL_ATTR_RO(vmcoreinfo);
131 132
132#endif /* CONFIG_KEXEC */ 133#endif /* CONFIG_KEXEC */
133 134
135/* whether file capabilities are enabled */
136static ssize_t fscaps_show(struct kobject *kobj,
137 struct kobj_attribute *attr, char *buf)
138{
139 return sprintf(buf, "%d\n", file_caps_enabled);
140}
141KERNEL_ATTR_RO(fscaps);
142
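Assuming the attribute appears under the kernel kobject as /sys/kernel/fscaps (the path is inferred from kernel_kobj, not stated in this hunk), a small userspace sketch to read it:

#include <stdio.h>

int main(void)
{
	char buf[8] = "";
	FILE *f = fopen("/sys/kernel/fscaps", "r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("file capabilities enabled: %s", buf);
	fclose(f);
	return 0;
}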
134/* 143/*
135 * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 144 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
136 */ 145 */
@@ -158,6 +167,7 @@ struct kobject *kernel_kobj;
158EXPORT_SYMBOL_GPL(kernel_kobj); 167EXPORT_SYMBOL_GPL(kernel_kobj);
159 168
160static struct attribute * kernel_attrs[] = { 169static struct attribute * kernel_attrs[] = {
170 &fscaps_attr.attr,
161#if defined(CONFIG_HOTPLUG) 171#if defined(CONFIG_HOTPLUG)
162 &uevent_seqnum_attr.attr, 172 &uevent_seqnum_attr.attr,
163 &uevent_helper_attr.attr, 173 &uevent_helper_attr.attr,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3b34d2732bce..4ba7cccb4994 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -202,8 +202,8 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
202 return; 202 return;
203 } 203 }
204 204
205 p->cpus_allowed = cpumask_of_cpu(cpu); 205 /* It's safe because the task is inactive. */
206 p->rt.nr_cpus_allowed = 1; 206 do_set_cpus_allowed(p, cpumask_of(cpu));
207 p->flags |= PF_THREAD_BOUND; 207 p->flags |= PF_THREAD_BOUND;
208} 208}
209EXPORT_SYMBOL(kthread_bind); 209EXPORT_SYMBOL(kthread_bind);
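For context, the pattern the new comment relies on: create the kthread, bind it while it is still inactive, then wake it. The demo_* names are illustrative:

static int demo_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
	return 0;
}

static struct task_struct *demo_start_on(unsigned int cpu)
{
	struct task_struct *p;

	p = kthread_create(demo_thread_fn, NULL, "demo/%u", cpu);
	if (!IS_ERR(p)) {
		kthread_bind(p, cpu);	/* safe: the task has not run yet */
		wake_up_process(p);
	}
	return p;
}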
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 53a68956f131..8c24294e477f 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -490,6 +490,18 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
490 usage[i] = '\0'; 490 usage[i] = '\0';
491} 491}
492 492
493static int __print_lock_name(struct lock_class *class)
494{
495 char str[KSYM_NAME_LEN];
496 const char *name;
497
498 name = class->name;
499 if (!name)
500 name = __get_key_name(class->key, str);
501
502 return printk("%s", name);
503}
504
493static void print_lock_name(struct lock_class *class) 505static void print_lock_name(struct lock_class *class)
494{ 506{
495 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; 507 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
@@ -1053,6 +1065,56 @@ print_circular_bug_entry(struct lock_list *target, int depth)
1053 return 0; 1065 return 0;
1054} 1066}
1055 1067
1068static void
1069print_circular_lock_scenario(struct held_lock *src,
1070 struct held_lock *tgt,
1071 struct lock_list *prt)
1072{
1073 struct lock_class *source = hlock_class(src);
1074 struct lock_class *target = hlock_class(tgt);
1075 struct lock_class *parent = prt->class;
1076
1077 /*
1078 * A direct locking problem where unsafe_class lock is taken
1079 * directly by safe_class lock, then all we need to show
1080 * is the deadlock scenario, as it is obvious that the
1081 * unsafe lock is taken under the safe lock.
1082 *
1083 * But if there is a chain instead, where the safe lock takes
1084 * an intermediate lock (middle_class) where this lock is
1085 * not the same as the safe lock, then the lock chain is
1086 * used to describe the problem. Otherwise we would need
1087 * to show a different CPU case for each link in the chain
1088 * from the safe_class lock to the unsafe_class lock.
1089 */
1090 if (parent != source) {
1091 printk("Chain exists of:\n ");
1092 __print_lock_name(source);
1093 printk(" --> ");
1094 __print_lock_name(parent);
1095 printk(" --> ");
1096 __print_lock_name(target);
1097 printk("\n\n");
1098 }
1099
1100 printk(" Possible unsafe locking scenario:\n\n");
1101 printk(" CPU0 CPU1\n");
1102 printk(" ---- ----\n");
1103 printk(" lock(");
1104 __print_lock_name(target);
1105 printk(");\n");
1106 printk(" lock(");
1107 __print_lock_name(parent);
1108 printk(");\n");
1109 printk(" lock(");
1110 __print_lock_name(target);
1111 printk(");\n");
1112 printk(" lock(");
1113 __print_lock_name(source);
1114 printk(");\n");
1115 printk("\n *** DEADLOCK ***\n\n");
1116}
1117
1056/* 1118/*
1057 * When a circular dependency is detected, print the 1119 * When a circular dependency is detected, print the
1058 * header first: 1120 * header first:
@@ -1096,6 +1158,7 @@ static noinline int print_circular_bug(struct lock_list *this,
1096{ 1158{
1097 struct task_struct *curr = current; 1159 struct task_struct *curr = current;
1098 struct lock_list *parent; 1160 struct lock_list *parent;
1161 struct lock_list *first_parent;
1099 int depth; 1162 int depth;
1100 1163
1101 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1164 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
@@ -1109,6 +1172,7 @@ static noinline int print_circular_bug(struct lock_list *this,
1109 print_circular_bug_header(target, depth, check_src, check_tgt); 1172 print_circular_bug_header(target, depth, check_src, check_tgt);
1110 1173
1111 parent = get_lock_parent(target); 1174 parent = get_lock_parent(target);
1175 first_parent = parent;
1112 1176
1113 while (parent) { 1177 while (parent) {
1114 print_circular_bug_entry(parent, --depth); 1178 print_circular_bug_entry(parent, --depth);
@@ -1116,6 +1180,9 @@ static noinline int print_circular_bug(struct lock_list *this,
1116 } 1180 }
1117 1181
1118 printk("\nother info that might help us debug this:\n\n"); 1182 printk("\nother info that might help us debug this:\n\n");
1183 print_circular_lock_scenario(check_src, check_tgt,
1184 first_parent);
1185
1119 lockdep_print_held_locks(curr); 1186 lockdep_print_held_locks(curr);
1120 1187
1121 printk("\nstack backtrace:\n"); 1188 printk("\nstack backtrace:\n");
@@ -1314,7 +1381,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
1314 printk("\n"); 1381 printk("\n");
1315 1382
1316 if (depth == 0 && (entry != root)) { 1383 if (depth == 0 && (entry != root)) {
1317 printk("lockdep:%s bad BFS generated tree\n", __func__); 1384 printk("lockdep:%s bad path found in chain graph\n", __func__);
1318 break; 1385 break;
1319 } 1386 }
1320 1387
@@ -1325,6 +1392,62 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
1325 return; 1392 return;
1326} 1393}
1327 1394
1395static void
1396print_irq_lock_scenario(struct lock_list *safe_entry,
1397 struct lock_list *unsafe_entry,
1398 struct lock_class *prev_class,
1399 struct lock_class *next_class)
1400{
1401 struct lock_class *safe_class = safe_entry->class;
1402 struct lock_class *unsafe_class = unsafe_entry->class;
1403 struct lock_class *middle_class = prev_class;
1404
1405 if (middle_class == safe_class)
1406 middle_class = next_class;
1407
1408 /*
1409 * A direct locking problem where unsafe_class lock is taken
1410 * directly by safe_class lock, then all we need to show
1411 * is the deadlock scenario, as it is obvious that the
1412 * unsafe lock is taken under the safe lock.
1413 *
1414 * But if there is a chain instead, where the safe lock takes
1415 * an intermediate lock (middle_class) where this lock is
1416 * not the same as the safe lock, then the lock chain is
1417 * used to describe the problem. Otherwise we would need
1418 * to show a different CPU case for each link in the chain
1419 * from the safe_class lock to the unsafe_class lock.
1420 */
1421 if (middle_class != unsafe_class) {
1422 printk("Chain exists of:\n ");
1423 __print_lock_name(safe_class);
1424 printk(" --> ");
1425 __print_lock_name(middle_class);
1426 printk(" --> ");
1427 __print_lock_name(unsafe_class);
1428 printk("\n\n");
1429 }
1430
1431 printk(" Possible interrupt unsafe locking scenario:\n\n");
1432 printk(" CPU0 CPU1\n");
1433 printk(" ---- ----\n");
1434 printk(" lock(");
1435 __print_lock_name(unsafe_class);
1436 printk(");\n");
1437 printk(" local_irq_disable();\n");
1438 printk(" lock(");
1439 __print_lock_name(safe_class);
1440 printk(");\n");
1441 printk(" lock(");
1442 __print_lock_name(middle_class);
1443 printk(");\n");
1444 printk(" <Interrupt>\n");
1445 printk(" lock(");
1446 __print_lock_name(safe_class);
1447 printk(");\n");
1448 printk("\n *** DEADLOCK ***\n\n");
1449}
1450
1328static int 1451static int
1329print_bad_irq_dependency(struct task_struct *curr, 1452print_bad_irq_dependency(struct task_struct *curr,
1330 struct lock_list *prev_root, 1453 struct lock_list *prev_root,
@@ -1376,6 +1499,9 @@ print_bad_irq_dependency(struct task_struct *curr,
1376 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); 1499 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
1377 1500
1378 printk("\nother info that might help us debug this:\n\n"); 1501 printk("\nother info that might help us debug this:\n\n");
1502 print_irq_lock_scenario(backwards_entry, forwards_entry,
1503 hlock_class(prev), hlock_class(next));
1504
1379 lockdep_print_held_locks(curr); 1505 lockdep_print_held_locks(curr);
1380 1506
1381 printk("\nthe dependencies between %s-irq-safe lock", irqclass); 1507 printk("\nthe dependencies between %s-irq-safe lock", irqclass);
@@ -1539,6 +1665,26 @@ static inline void inc_chains(void)
1539 1665
1540#endif 1666#endif
1541 1667
1668static void
1669print_deadlock_scenario(struct held_lock *nxt,
1670 struct held_lock *prv)
1671{
1672 struct lock_class *next = hlock_class(nxt);
1673 struct lock_class *prev = hlock_class(prv);
1674
1675 printk(" Possible unsafe locking scenario:\n\n");
1676 printk(" CPU0\n");
1677 printk(" ----\n");
1678 printk(" lock(");
1679 __print_lock_name(prev);
1680 printk(");\n");
1681 printk(" lock(");
1682 __print_lock_name(next);
1683 printk(");\n");
1684 printk("\n *** DEADLOCK ***\n\n");
1685 printk(" May be due to missing lock nesting notation\n\n");
1686}
1687
1542static int 1688static int
1543print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, 1689print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1544 struct held_lock *next) 1690 struct held_lock *next)
@@ -1557,6 +1703,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1557 print_lock(prev); 1703 print_lock(prev);
1558 1704
1559 printk("\nother info that might help us debug this:\n"); 1705 printk("\nother info that might help us debug this:\n");
1706 print_deadlock_scenario(next, prev);
1560 lockdep_print_held_locks(curr); 1707 lockdep_print_held_locks(curr);
1561 1708
1562 printk("\nstack backtrace:\n"); 1709 printk("\nstack backtrace:\n");
@@ -1826,7 +1973,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1826 struct list_head *hash_head = chainhashentry(chain_key); 1973 struct list_head *hash_head = chainhashentry(chain_key);
1827 struct lock_chain *chain; 1974 struct lock_chain *chain;
1828 struct held_lock *hlock_curr, *hlock_next; 1975 struct held_lock *hlock_curr, *hlock_next;
1829 int i, j, n, cn; 1976 int i, j;
1830 1977
1831 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 1978 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1832 return 0; 1979 return 0;
@@ -1886,15 +2033,9 @@ cache_hit:
1886 } 2033 }
1887 i++; 2034 i++;
1888 chain->depth = curr->lockdep_depth + 1 - i; 2035 chain->depth = curr->lockdep_depth + 1 - i;
1889 cn = nr_chain_hlocks; 2036 if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
1890 while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) { 2037 chain->base = nr_chain_hlocks;
1891 n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth); 2038 nr_chain_hlocks += chain->depth;
1892 if (n == cn)
1893 break;
1894 cn = n;
1895 }
1896 if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
1897 chain->base = cn;
1898 for (j = 0; j < chain->depth - 1; j++, i++) { 2039 for (j = 0; j < chain->depth - 1; j++, i++) {
1899 int lock_id = curr->held_locks[i].class_idx - 1; 2040 int lock_id = curr->held_locks[i].class_idx - 1;
1900 chain_hlocks[chain->base + j] = lock_id; 2041 chain_hlocks[chain->base + j] = lock_id;
@@ -2011,6 +2152,24 @@ static void check_chain_key(struct task_struct *curr)
2011#endif 2152#endif
2012} 2153}
2013 2154
2155static void
2156print_usage_bug_scenario(struct held_lock *lock)
2157{
2158 struct lock_class *class = hlock_class(lock);
2159
2160 printk(" Possible unsafe locking scenario:\n\n");
2161 printk(" CPU0\n");
2162 printk(" ----\n");
2163 printk(" lock(");
2164 __print_lock_name(class);
2165 printk(");\n");
2166 printk(" <Interrupt>\n");
2167 printk(" lock(");
2168 __print_lock_name(class);
2169 printk(");\n");
2170 printk("\n *** DEADLOCK ***\n\n");
2171}
2172
2014static int 2173static int
2015print_usage_bug(struct task_struct *curr, struct held_lock *this, 2174print_usage_bug(struct task_struct *curr, struct held_lock *this,
2016 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) 2175 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
@@ -2039,6 +2198,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2039 2198
2040 print_irqtrace_events(curr); 2199 print_irqtrace_events(curr);
2041 printk("\nother info that might help us debug this:\n"); 2200 printk("\nother info that might help us debug this:\n");
2201 print_usage_bug_scenario(this);
2202
2042 lockdep_print_held_locks(curr); 2203 lockdep_print_held_locks(curr);
2043 2204
2044 printk("\nstack backtrace:\n"); 2205 printk("\nstack backtrace:\n");
@@ -2073,6 +2234,10 @@ print_irq_inversion_bug(struct task_struct *curr,
2073 struct held_lock *this, int forwards, 2234 struct held_lock *this, int forwards,
2074 const char *irqclass) 2235 const char *irqclass)
2075{ 2236{
2237 struct lock_list *entry = other;
2238 struct lock_list *middle = NULL;
2239 int depth;
2240
2076 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2241 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2077 return 0; 2242 return 0;
2078 2243
@@ -2091,6 +2256,25 @@ print_irq_inversion_bug(struct task_struct *curr,
2091 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); 2256 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
2092 2257
2093 printk("\nother info that might help us debug this:\n"); 2258 printk("\nother info that might help us debug this:\n");
2259
2260 /* Find a middle lock (if one exists) */
2261 depth = get_lock_depth(other);
2262 do {
2263 if (depth == 0 && (entry != root)) {
2264 printk("lockdep:%s bad path found in chain graph\n", __func__);
2265 break;
2266 }
2267 middle = entry;
2268 entry = get_lock_parent(entry);
2269 depth--;
2270 } while (entry && entry != root && (depth >= 0));
2271 if (forwards)
2272 print_irq_lock_scenario(root, other,
2273 middle ? middle->class : root->class, other->class);
2274 else
2275 print_irq_lock_scenario(other, root,
2276 middle ? middle->class : other->class, root->class);
2277
2094 lockdep_print_held_locks(curr); 2278 lockdep_print_held_locks(curr);
2095 2279
2096 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); 2280 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
@@ -2284,6 +2468,9 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2284 2468
2285 BUG_ON(usage_bit >= LOCK_USAGE_STATES); 2469 BUG_ON(usage_bit >= LOCK_USAGE_STATES);
2286 2470
2471 if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys)
2472 continue;
2473
2287 if (!mark_lock(curr, hlock, usage_bit)) 2474 if (!mark_lock(curr, hlock, usage_bit))
2288 return 0; 2475 return 0;
2289 } 2476 }
@@ -2294,34 +2481,13 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2294/* 2481/*
2295 * Hardirqs will be enabled: 2482 * Hardirqs will be enabled:
2296 */ 2483 */
2297void trace_hardirqs_on_caller(unsigned long ip) 2484static void __trace_hardirqs_on_caller(unsigned long ip)
2298{ 2485{
2299 struct task_struct *curr = current; 2486 struct task_struct *curr = current;
2300 2487
2301 time_hardirqs_on(CALLER_ADDR0, ip);
2302
2303 if (unlikely(!debug_locks || current->lockdep_recursion))
2304 return;
2305
2306 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2307 return;
2308
2309 if (unlikely(curr->hardirqs_enabled)) {
2310 /*
2311 * Neither irq nor preemption are disabled here
2312 * so this is racy by nature but losing one hit
2313 * in a stat is not a big deal.
2314 */
2315 __debug_atomic_inc(redundant_hardirqs_on);
2316 return;
2317 }
2318 /* we'll do an OFF -> ON transition: */ 2488 /* we'll do an OFF -> ON transition: */
2319 curr->hardirqs_enabled = 1; 2489 curr->hardirqs_enabled = 1;
2320 2490
2321 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2322 return;
2323 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
2324 return;
2325 /* 2491 /*
2326 * We are going to turn hardirqs on, so set the 2492 * We are going to turn hardirqs on, so set the
2327 * usage bit for all held locks: 2493 * usage bit for all held locks:
@@ -2341,6 +2507,37 @@ void trace_hardirqs_on_caller(unsigned long ip)
2341 curr->hardirq_enable_event = ++curr->irq_events; 2507 curr->hardirq_enable_event = ++curr->irq_events;
2342 debug_atomic_inc(hardirqs_on_events); 2508 debug_atomic_inc(hardirqs_on_events);
2343} 2509}
2510
2511void trace_hardirqs_on_caller(unsigned long ip)
2512{
2513 time_hardirqs_on(CALLER_ADDR0, ip);
2514
2515 if (unlikely(!debug_locks || current->lockdep_recursion))
2516 return;
2517
2518 if (unlikely(current->hardirqs_enabled)) {
2519 /*
2520 * Neither irq nor preemption are disabled here
2521 * so this is racy by nature but losing one hit
2522 * in a stat is not a big deal.
2523 */
2524 __debug_atomic_inc(redundant_hardirqs_on);
2525 return;
2526 }
2527
2528 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2529 return;
2530
2531 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2532 return;
2533
2534 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
2535 return;
2536
2537 current->lockdep_recursion = 1;
2538 __trace_hardirqs_on_caller(ip);
2539 current->lockdep_recursion = 0;
2540}
2344EXPORT_SYMBOL(trace_hardirqs_on_caller); 2541EXPORT_SYMBOL(trace_hardirqs_on_caller);
2345 2542
2346void trace_hardirqs_on(void) 2543void trace_hardirqs_on(void)
@@ -2390,7 +2587,7 @@ void trace_softirqs_on(unsigned long ip)
2390{ 2587{
2391 struct task_struct *curr = current; 2588 struct task_struct *curr = current;
2392 2589
2393 if (unlikely(!debug_locks)) 2590 if (unlikely(!debug_locks || current->lockdep_recursion))
2394 return; 2591 return;
2395 2592
2396 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2593 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
@@ -2401,6 +2598,7 @@ void trace_softirqs_on(unsigned long ip)
2401 return; 2598 return;
2402 } 2599 }
2403 2600
2601 current->lockdep_recursion = 1;
2404 /* 2602 /*
2405 * We'll do an OFF -> ON transition: 2603 * We'll do an OFF -> ON transition:
2406 */ 2604 */
@@ -2415,6 +2613,7 @@ void trace_softirqs_on(unsigned long ip)
2415 */ 2613 */
2416 if (curr->hardirqs_enabled) 2614 if (curr->hardirqs_enabled)
2417 mark_held_locks(curr, SOFTIRQ); 2615 mark_held_locks(curr, SOFTIRQ);
2616 current->lockdep_recursion = 0;
2418} 2617}
2419 2618
2420/* 2619/*
@@ -2424,7 +2623,7 @@ void trace_softirqs_off(unsigned long ip)
2424{ 2623{
2425 struct task_struct *curr = current; 2624 struct task_struct *curr = current;
2426 2625
2427 if (unlikely(!debug_locks)) 2626 if (unlikely(!debug_locks || current->lockdep_recursion))
2428 return; 2627 return;
2429 2628
2430 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2629 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
@@ -2675,10 +2874,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2675void lockdep_init_map(struct lockdep_map *lock, const char *name, 2874void lockdep_init_map(struct lockdep_map *lock, const char *name,
2676 struct lock_class_key *key, int subclass) 2875 struct lock_class_key *key, int subclass)
2677{ 2876{
2678 int i; 2877 memset(lock, 0, sizeof(*lock));
2679
2680 for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
2681 lock->class_cache[i] = NULL;
2682 2878
2683#ifdef CONFIG_LOCK_STAT 2879#ifdef CONFIG_LOCK_STAT
2684 lock->cpu = raw_smp_processor_id(); 2880 lock->cpu = raw_smp_processor_id();
@@ -3242,7 +3438,7 @@ int lock_is_held(struct lockdep_map *lock)
3242 int ret = 0; 3438 int ret = 0;
3243 3439
3244 if (unlikely(current->lockdep_recursion)) 3440 if (unlikely(current->lockdep_recursion))
3245 return ret; 3441 return 1; /* avoid false negative lockdep_assert_held() */
3246 3442
3247 raw_local_irq_save(flags); 3443 raw_local_irq_save(flags);
3248 check_flags(flags); 3444 check_flags(flags);
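The changed return value matters for assertions built on lock_is_held(): while lockdep is recursing it now reports the lock as held instead of tripping them. A sketch, assuming the usual lockdep_assert_held() wrapper around lockdep_is_held()/lock_is_held(); the demo_* names are illustrative:

static DEFINE_MUTEX(demo_mutex);

static void demo_needs_mutex(void)
{
	/*
	 * Would warn spuriously if lock_is_held() returned 0 during
	 * lockdep's own recursion; with the change above it stays quiet.
	 */
	lockdep_assert_held(&demo_mutex);

	/* ... work that must run under demo_mutex ... */
}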
diff --git a/kernel/module.c b/kernel/module.c
index d5938a5c19c4..04379f92f843 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -57,6 +57,7 @@
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h> 58#include <linux/jump_label.h>
59#include <linux/pfn.h> 59#include <linux/pfn.h>
60#include <linux/bsearch.h>
60 61
61#define CREATE_TRACE_POINTS 62#define CREATE_TRACE_POINTS
62#include <trace/events/module.h> 63#include <trace/events/module.h>
@@ -240,23 +241,24 @@ static bool each_symbol_in_section(const struct symsearch *arr,
240 struct module *owner, 241 struct module *owner,
241 bool (*fn)(const struct symsearch *syms, 242 bool (*fn)(const struct symsearch *syms,
242 struct module *owner, 243 struct module *owner,
243 unsigned int symnum, void *data), 244 void *data),
244 void *data) 245 void *data)
245{ 246{
246 unsigned int i, j; 247 unsigned int j;
247 248
248 for (j = 0; j < arrsize; j++) { 249 for (j = 0; j < arrsize; j++) {
249 for (i = 0; i < arr[j].stop - arr[j].start; i++) 250 if (fn(&arr[j], owner, data))
250 if (fn(&arr[j], owner, i, data)) 251 return true;
251 return true;
252 } 252 }
253 253
254 return false; 254 return false;
255} 255}
256 256
257/* Returns true as soon as fn returns true, otherwise false. */ 257/* Returns true as soon as fn returns true, otherwise false. */
258bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, 258bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
259 unsigned int symnum, void *data), void *data) 259 struct module *owner,
260 void *data),
261 void *data)
260{ 262{
261 struct module *mod; 263 struct module *mod;
262 static const struct symsearch arr[] = { 264 static const struct symsearch arr[] = {
@@ -309,7 +311,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
309 } 311 }
310 return false; 312 return false;
311} 313}
312EXPORT_SYMBOL_GPL(each_symbol); 314EXPORT_SYMBOL_GPL(each_symbol_section);
313 315
314struct find_symbol_arg { 316struct find_symbol_arg {
315 /* Input */ 317 /* Input */
@@ -323,15 +325,12 @@ struct find_symbol_arg {
323 const struct kernel_symbol *sym; 325 const struct kernel_symbol *sym;
324}; 326};
325 327
326static bool find_symbol_in_section(const struct symsearch *syms, 328static bool check_symbol(const struct symsearch *syms,
327 struct module *owner, 329 struct module *owner,
328 unsigned int symnum, void *data) 330 unsigned int symnum, void *data)
329{ 331{
330 struct find_symbol_arg *fsa = data; 332 struct find_symbol_arg *fsa = data;
331 333
332 if (strcmp(syms->start[symnum].name, fsa->name) != 0)
333 return false;
334
335 if (!fsa->gplok) { 334 if (!fsa->gplok) {
336 if (syms->licence == GPL_ONLY) 335 if (syms->licence == GPL_ONLY)
337 return false; 336 return false;
@@ -365,6 +364,30 @@ static bool find_symbol_in_section(const struct symsearch *syms,
365 return true; 364 return true;
366} 365}
367 366
367static int cmp_name(const void *va, const void *vb)
368{
369 const char *a;
370 const struct kernel_symbol *b;
371 a = va; b = vb;
372 return strcmp(a, b->name);
373}
374
375static bool find_symbol_in_section(const struct symsearch *syms,
376 struct module *owner,
377 void *data)
378{
379 struct find_symbol_arg *fsa = data;
380 struct kernel_symbol *sym;
381
382 sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
383 sizeof(struct kernel_symbol), cmp_name);
384
385 if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data))
386 return true;
387
388 return false;
389}
390
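The lookup above relies on each exported-symbol table being sorted by name so that cmp_name() can drive bsearch(). A tiny self-contained illustration of the same pattern; the table below is hypothetical:

static const struct kernel_symbol demo_syms[] = {
	{ .value = 0x1000, .name = "demo_bar" },
	{ .value = 0x2000, .name = "demo_foo" },	/* kept sorted by name */
};

static const struct kernel_symbol *demo_lookup(const char *name)
{
	return bsearch(name, demo_syms, ARRAY_SIZE(demo_syms),
		       sizeof(demo_syms[0]), cmp_name);
}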
368/* Find a symbol and return it, along with, (optional) crc and 391/* Find a symbol and return it, along with, (optional) crc and
369 * (optional) module which owns it. Needs preempt disabled or module_mutex. */ 392 * (optional) module which owns it. Needs preempt disabled or module_mutex. */
370const struct kernel_symbol *find_symbol(const char *name, 393const struct kernel_symbol *find_symbol(const char *name,
@@ -379,7 +402,7 @@ const struct kernel_symbol *find_symbol(const char *name,
379 fsa.gplok = gplok; 402 fsa.gplok = gplok;
380 fsa.warn = warn; 403 fsa.warn = warn;
381 404
382 if (each_symbol(find_symbol_in_section, &fsa)) { 405 if (each_symbol_section(find_symbol_in_section, &fsa)) {
383 if (owner) 406 if (owner)
384 *owner = fsa.owner; 407 *owner = fsa.owner;
385 if (crc) 408 if (crc)
@@ -522,9 +545,9 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \
522 mod->field = kstrdup(s, GFP_KERNEL); \ 545 mod->field = kstrdup(s, GFP_KERNEL); \
523} \ 546} \
524static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ 547static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
525 struct module *mod, char *buffer) \ 548 struct module_kobject *mk, char *buffer) \
526{ \ 549{ \
527 return sprintf(buffer, "%s\n", mod->field); \ 550 return sprintf(buffer, "%s\n", mk->mod->field); \
528} \ 551} \
529static int modinfo_##field##_exists(struct module *mod) \ 552static int modinfo_##field##_exists(struct module *mod) \
530{ \ 553{ \
@@ -879,9 +902,9 @@ void symbol_put_addr(void *addr)
879EXPORT_SYMBOL_GPL(symbol_put_addr); 902EXPORT_SYMBOL_GPL(symbol_put_addr);
880 903
881static ssize_t show_refcnt(struct module_attribute *mattr, 904static ssize_t show_refcnt(struct module_attribute *mattr,
882 struct module *mod, char *buffer) 905 struct module_kobject *mk, char *buffer)
883{ 906{
884 return sprintf(buffer, "%u\n", module_refcount(mod)); 907 return sprintf(buffer, "%u\n", module_refcount(mk->mod));
885} 908}
886 909
887static struct module_attribute refcnt = { 910static struct module_attribute refcnt = {
@@ -929,11 +952,11 @@ static inline int module_unload_init(struct module *mod)
929#endif /* CONFIG_MODULE_UNLOAD */ 952#endif /* CONFIG_MODULE_UNLOAD */
930 953
931static ssize_t show_initstate(struct module_attribute *mattr, 954static ssize_t show_initstate(struct module_attribute *mattr,
932 struct module *mod, char *buffer) 955 struct module_kobject *mk, char *buffer)
933{ 956{
934 const char *state = "unknown"; 957 const char *state = "unknown";
935 958
936 switch (mod->state) { 959 switch (mk->mod->state) {
937 case MODULE_STATE_LIVE: 960 case MODULE_STATE_LIVE:
938 state = "live"; 961 state = "live";
939 break; 962 break;
@@ -952,10 +975,27 @@ static struct module_attribute initstate = {
952 .show = show_initstate, 975 .show = show_initstate,
953}; 976};
954 977
978static ssize_t store_uevent(struct module_attribute *mattr,
979 struct module_kobject *mk,
980 const char *buffer, size_t count)
981{
982 enum kobject_action action;
983
984 if (kobject_action_type(buffer, count, &action) == 0)
985 kobject_uevent(&mk->kobj, action);
986 return count;
987}
988
989struct module_attribute module_uevent = {
990 .attr = { .name = "uevent", .mode = 0200 },
991 .store = store_uevent,
992};
993
955static struct module_attribute *modinfo_attrs[] = { 994static struct module_attribute *modinfo_attrs[] = {
956 &modinfo_version, 995 &modinfo_version,
957 &modinfo_srcversion, 996 &modinfo_srcversion,
958 &initstate, 997 &initstate,
998 &module_uevent,
959#ifdef CONFIG_MODULE_UNLOAD 999#ifdef CONFIG_MODULE_UNLOAD
960 &refcnt, 1000 &refcnt,
961#endif 1001#endif
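Alongside the new uevent attribute, the show()/store() callbacks now take a struct module_kobject rather than a bare struct module. A hypothetical attribute written against the new signature; the demo names are illustrative:

static ssize_t demo_show(struct module_attribute *mattr,
			 struct module_kobject *mk, char *buffer)
{
	return sprintf(buffer, "%s\n", mk->mod->name);
}

static struct module_attribute demo_attr = {
	.attr = { .name = "demo", .mode = 0444 },
	.show = demo_show,
};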
@@ -1164,7 +1204,7 @@ struct module_sect_attrs
1164}; 1204};
1165 1205
1166static ssize_t module_sect_show(struct module_attribute *mattr, 1206static ssize_t module_sect_show(struct module_attribute *mattr,
1167 struct module *mod, char *buf) 1207 struct module_kobject *mk, char *buf)
1168{ 1208{
1169 struct module_sect_attr *sattr = 1209 struct module_sect_attr *sattr =
1170 container_of(mattr, struct module_sect_attr, mattr); 1210 container_of(mattr, struct module_sect_attr, mattr);
@@ -1607,27 +1647,28 @@ static void set_section_ro_nx(void *base,
1607 } 1647 }
1608} 1648}
1609 1649
1610/* Setting memory back to RW+NX before releasing it */ 1650static void unset_module_core_ro_nx(struct module *mod)
1611void unset_section_ro_nx(struct module *mod, void *module_region)
1612{ 1651{
1613 unsigned long total_pages; 1652 set_page_attributes(mod->module_core + mod->core_text_size,
1614 1653 mod->module_core + mod->core_size,
1615 if (mod->module_core == module_region) { 1654 set_memory_x);
1616 /* Set core as NX+RW */ 1655 set_page_attributes(mod->module_core,
1617 total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size); 1656 mod->module_core + mod->core_ro_size,
1618 set_memory_nx((unsigned long)mod->module_core, total_pages); 1657 set_memory_rw);
1619 set_memory_rw((unsigned long)mod->module_core, total_pages); 1658}
1620 1659
1621 } else if (mod->module_init == module_region) { 1660static void unset_module_init_ro_nx(struct module *mod)
1622 /* Set init as NX+RW */ 1661{
1623 total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size); 1662 set_page_attributes(mod->module_init + mod->init_text_size,
1624 set_memory_nx((unsigned long)mod->module_init, total_pages); 1663 mod->module_init + mod->init_size,
1625 set_memory_rw((unsigned long)mod->module_init, total_pages); 1664 set_memory_x);
1626 } 1665 set_page_attributes(mod->module_init,
1666 mod->module_init + mod->init_ro_size,
1667 set_memory_rw);
1627} 1668}
1628 1669
1629/* Iterate through all modules and set each module's text as RW */ 1670/* Iterate through all modules and set each module's text as RW */
1630void set_all_modules_text_rw() 1671void set_all_modules_text_rw(void)
1631{ 1672{
1632 struct module *mod; 1673 struct module *mod;
1633 1674
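
[Editor's note] unset_module_core_ro_nx()/unset_module_init_ro_nx() above undo the RO/NX protections over the text/ro/total sub-ranges of a module mapping before it is released. A rough userspace analogy, not the kernel implementation (which flips page attributes with set_memory_rw()/set_memory_x()), using mmap()/mprotect() over an invented core_text_size/core_ro_size/core_size split; the PROT_EXEC step may be refused on hardened systems:

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	size_t core_size      = 4 * page;  /* the whole "module core" */
	size_t core_text_size = 1 * page;  /* executable text         */
	size_t core_ro_size   = 2 * page;  /* text + rodata           */
	char *core;

	core = mmap(NULL, core_size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (core == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Roughly what set_section_ro_nx() arranges: text R+X, rodata
	 * read-only, and the trailing data pages left RW (and NX). */
	if (mprotect(core, core_text_size, PROT_READ | PROT_EXEC) ||
	    mprotect(core + core_text_size, core_ro_size - core_text_size,
		     PROT_READ))
		perror("mprotect (protect)");

	/* Roughly what unset_module_core_ro_nx() arranges just before
	 * module_free(): the whole range back to ordinary RW. */
	if (mprotect(core, core_size, PROT_READ | PROT_WRITE))
		perror("mprotect (unprotect)");

	munmap(core, core_size);
	return 0;
}
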
@@ -1648,7 +1689,7 @@ void set_all_modules_text_rw()
1648} 1689}
1649 1690
1650/* Iterate through all modules and set each module's text as RO */ 1691/* Iterate through all modules and set each module's text as RO */
1651void set_all_modules_text_ro() 1692void set_all_modules_text_ro(void)
1652{ 1693{
1653 struct module *mod; 1694 struct module *mod;
1654 1695
@@ -1669,9 +1710,19 @@ void set_all_modules_text_ro()
1669} 1710}
1670#else 1711#else
1671static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } 1712static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
1672static inline void unset_section_ro_nx(struct module *mod, void *module_region) { } 1713static void unset_module_core_ro_nx(struct module *mod) { }
1714static void unset_module_init_ro_nx(struct module *mod) { }
1673#endif 1715#endif
1674 1716
1717void __weak module_free(struct module *mod, void *module_region)
1718{
1719 vfree(module_region);
1720}
1721
1722void __weak module_arch_cleanup(struct module *mod)
1723{
1724}
1725
1675/* Free a module, remove from lists, etc. */ 1726/* Free a module, remove from lists, etc. */
1676static void free_module(struct module *mod) 1727static void free_module(struct module *mod)
1677{ 1728{
@@ -1696,7 +1747,7 @@ static void free_module(struct module *mod)
1696 destroy_params(mod->kp, mod->num_kp); 1747 destroy_params(mod->kp, mod->num_kp);
1697 1748
1698 /* This may be NULL, but that's OK */ 1749 /* This may be NULL, but that's OK */
1699 unset_section_ro_nx(mod, mod->module_init); 1750 unset_module_init_ro_nx(mod);
1700 module_free(mod, mod->module_init); 1751 module_free(mod, mod->module_init);
1701 kfree(mod->args); 1752 kfree(mod->args);
1702 percpu_modfree(mod); 1753 percpu_modfree(mod);
@@ -1705,7 +1756,7 @@ static void free_module(struct module *mod)
1705 lockdep_free_key_range(mod->module_core, mod->core_size); 1756 lockdep_free_key_range(mod->module_core, mod->core_size);
1706 1757
1707 /* Finally, free the core (containing the module structure) */ 1758 /* Finally, free the core (containing the module structure) */
1708 unset_section_ro_nx(mod, mod->module_core); 1759 unset_module_core_ro_nx(mod);
1709 module_free(mod, mod->module_core); 1760 module_free(mod, mod->module_core);
1710 1761
1711#ifdef CONFIG_MPU 1762#ifdef CONFIG_MPU
@@ -1826,6 +1877,26 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1826 return ret; 1877 return ret;
1827} 1878}
1828 1879
1880int __weak apply_relocate(Elf_Shdr *sechdrs,
1881 const char *strtab,
1882 unsigned int symindex,
1883 unsigned int relsec,
1884 struct module *me)
1885{
1886 pr_err("module %s: REL relocation unsupported\n", me->name);
1887 return -ENOEXEC;
1888}
1889
1890int __weak apply_relocate_add(Elf_Shdr *sechdrs,
1891 const char *strtab,
1892 unsigned int symindex,
1893 unsigned int relsec,
1894 struct module *me)
1895{
1896 pr_err("module %s: RELA relocation unsupported\n", me->name);
1897 return -ENOEXEC;
1898}
1899
1829static int apply_relocations(struct module *mod, const struct load_info *info) 1900static int apply_relocations(struct module *mod, const struct load_info *info)
1830{ 1901{
1831 unsigned int i; 1902 unsigned int i;
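
[Editor's note] Several arch hooks in the hunks above (module_free(), module_arch_cleanup(), and the apply_relocate()/apply_relocate_add() stubs here) are now generic __weak defaults; an architecture overrides one simply by providing a strong definition of the same symbol. A self-contained sketch of that linker behaviour, with a deliberately simplified hook signature used only for the demo:

#include <stdio.h>

/* Weak generic fallback, analogous to the apply_relocate() stub above
 * (signature simplified for this demo). */
__attribute__((weak)) int apply_relocate(const char *modname)
{
	fprintf(stderr, "module %s: REL relocation unsupported\n", modname);
	return -1;	/* the kernel stub returns -ENOEXEC */
}

int main(void)
{
	/* With no strong definition linked in, the weak default runs.
	 * Linking another object file that defines a non-weak
	 * apply_relocate() would silently replace it, which is how an
	 * architecture overrides these generic hooks. */
	printf("default hook returned %d\n", apply_relocate("demo"));
	return 0;
}
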
@@ -2030,11 +2101,8 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
2030 const struct kernel_symbol *start, 2101 const struct kernel_symbol *start,
2031 const struct kernel_symbol *stop) 2102 const struct kernel_symbol *stop)
2032{ 2103{
2033 const struct kernel_symbol *ks = start; 2104 return bsearch(name, start, stop - start,
2034 for (; ks < stop; ks++) 2105 sizeof(struct kernel_symbol), cmp_name);
2035 if (strcmp(ks->name, name) == 0)
2036 return ks;
2037 return NULL;
2038} 2106}
2039 2107
2040static int is_exported(const char *name, unsigned long value, 2108static int is_exported(const char *name, unsigned long value,
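
[Editor's note] lookup_symbol() above replaces the linear scan with bsearch() over the name-sorted export table, using module.c's cmp_name() comparator (defined outside this hunk). A standalone userspace sketch of the same lookup over an invented four-entry table:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct kernel_symbol {
	unsigned long value;
	const char *name;
};

/* The table must already be sorted by name for bsearch() to be valid. */
static const struct kernel_symbol symtab[] = {
	{ 0x1000, "add_timer" },
	{ 0x2000, "kfree"     },
	{ 0x3000, "printk"    },
	{ 0x4000, "schedule"  },
};

/* "key vs. element" comparator, as bsearch() expects. */
static int cmp_name(const void *name, const void *sym)
{
	return strcmp(name, ((const struct kernel_symbol *)sym)->name);
}

static const struct kernel_symbol *lookup_symbol(const char *name)
{
	return bsearch(name, symtab, sizeof(symtab) / sizeof(symtab[0]),
		       sizeof(symtab[0]), cmp_name);
}

int main(void)
{
	const struct kernel_symbol *ks = lookup_symbol("printk");

	printf("%s -> %#lx\n", ks ? ks->name : "?", ks ? ks->value : 0UL);
	return 0;
}
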
@@ -2213,6 +2281,11 @@ static void dynamic_debug_remove(struct _ddebug *debug)
2213 ddebug_remove_module(debug->modname); 2281 ddebug_remove_module(debug->modname);
2214} 2282}
2215 2283
2284void * __weak module_alloc(unsigned long size)
2285{
2286 return size == 0 ? NULL : vmalloc_exec(size);
2287}
2288
2216static void *module_alloc_update_bounds(unsigned long size) 2289static void *module_alloc_update_bounds(unsigned long size)
2217{ 2290{
2218 void *ret = module_alloc(size); 2291 void *ret = module_alloc(size);
@@ -2623,6 +2696,14 @@ static void flush_module_icache(const struct module *mod)
2623 set_fs(old_fs); 2696 set_fs(old_fs);
2624} 2697}
2625 2698
2699int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
2700 Elf_Shdr *sechdrs,
2701 char *secstrings,
2702 struct module *mod)
2703{
2704 return 0;
2705}
2706
2626static struct module *layout_and_allocate(struct load_info *info) 2707static struct module *layout_and_allocate(struct load_info *info)
2627{ 2708{
2628 /* Module within temporary copy. */ 2709 /* Module within temporary copy. */
@@ -2694,6 +2775,13 @@ static void module_deallocate(struct module *mod, struct load_info *info)
2694 module_free(mod, mod->module_core); 2775 module_free(mod, mod->module_core);
2695} 2776}
2696 2777
2778int __weak module_finalize(const Elf_Ehdr *hdr,
2779 const Elf_Shdr *sechdrs,
2780 struct module *me)
2781{
2782 return 0;
2783}
2784
2697static int post_relocation(struct module *mod, const struct load_info *info) 2785static int post_relocation(struct module *mod, const struct load_info *info)
2698{ 2786{
2699 /* Sort exception table now relocations are done. */ 2787 /* Sort exception table now relocations are done. */
@@ -2790,7 +2878,7 @@ static struct module *load_module(void __user *umod,
2790 } 2878 }
2791 2879
2792 /* This has to be done once we're sure module name is unique. */ 2880 /* This has to be done once we're sure module name is unique. */
2793 if (!mod->taints) 2881 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
2794 dynamic_debug_setup(info.debug, info.num_debug); 2882 dynamic_debug_setup(info.debug, info.num_debug);
2795 2883
2796 /* Find duplicate symbols */ 2884 /* Find duplicate symbols */
@@ -2827,7 +2915,7 @@ static struct module *load_module(void __user *umod,
2827 module_bug_cleanup(mod); 2915 module_bug_cleanup(mod);
2828 2916
2829 ddebug: 2917 ddebug:
2830 if (!mod->taints) 2918 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
2831 dynamic_debug_remove(info.debug); 2919 dynamic_debug_remove(info.debug);
2832 unlock: 2920 unlock:
2833 mutex_unlock(&module_mutex); 2921 mutex_unlock(&module_mutex);
@@ -2931,10 +3019,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2931 mod->symtab = mod->core_symtab; 3019 mod->symtab = mod->core_symtab;
2932 mod->strtab = mod->core_strtab; 3020 mod->strtab = mod->core_strtab;
2933#endif 3021#endif
2934 unset_section_ro_nx(mod, mod->module_init); 3022 unset_module_init_ro_nx(mod);
2935 module_free(mod, mod->module_init); 3023 module_free(mod, mod->module_init);
2936 mod->module_init = NULL; 3024 mod->module_init = NULL;
2937 mod->init_size = 0; 3025 mod->init_size = 0;
3026 mod->init_ro_size = 0;
2938 mod->init_text_size = 0; 3027 mod->init_text_size = 0;
2939 mutex_unlock(&module_mutex); 3028 mutex_unlock(&module_mutex);
2940 3029
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index ec815a960b5d..73da83aff418 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock)
75 return; 75 return;
76 76
77 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 77 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
78 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 78 DEBUG_LOCKS_WARN_ON(lock->owner != current);
79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
80 mutex_clear_owner(lock); 80 mutex_clear_owner(lock);
81} 81}
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index 57d527a16f9d..0799fd3e4cfa 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
29 29
30static inline void mutex_set_owner(struct mutex *lock) 30static inline void mutex_set_owner(struct mutex *lock)
31{ 31{
32 lock->owner = current_thread_info(); 32 lock->owner = current;
33} 33}
34 34
35static inline void mutex_clear_owner(struct mutex *lock) 35static inline void mutex_clear_owner(struct mutex *lock)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index c4195fa98900..d607ed5dd441 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -131,14 +131,14 @@ EXPORT_SYMBOL(mutex_unlock);
131 */ 131 */
132static inline int __sched 132static inline int __sched
133__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, 133__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
134 unsigned long ip) 134 struct lockdep_map *nest_lock, unsigned long ip)
135{ 135{
136 struct task_struct *task = current; 136 struct task_struct *task = current;
137 struct mutex_waiter waiter; 137 struct mutex_waiter waiter;
138 unsigned long flags; 138 unsigned long flags;
139 139
140 preempt_disable(); 140 preempt_disable();
141 mutex_acquire(&lock->dep_map, subclass, 0, ip); 141 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
142 142
143#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 143#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
144 /* 144 /*
@@ -160,14 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
160 */ 160 */
161 161
162 for (;;) { 162 for (;;) {
163 struct thread_info *owner; 163 struct task_struct *owner;
164
165 /*
166 * If we own the BKL, then don't spin. The owner of
167 * the mutex might be waiting on us to release the BKL.
168 */
169 if (unlikely(current->lock_depth >= 0))
170 break;
171 164
172 /* 165 /*
173 * If there's an owner, wait for it to either 166 * If there's an owner, wait for it to either
@@ -276,16 +269,25 @@ void __sched
276mutex_lock_nested(struct mutex *lock, unsigned int subclass) 269mutex_lock_nested(struct mutex *lock, unsigned int subclass)
277{ 270{
278 might_sleep(); 271 might_sleep();
279 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_); 272 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
280} 273}
281 274
282EXPORT_SYMBOL_GPL(mutex_lock_nested); 275EXPORT_SYMBOL_GPL(mutex_lock_nested);
283 276
277void __sched
278_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
279{
280 might_sleep();
281 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_);
282}
283
284EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
285
284int __sched 286int __sched
285mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) 287mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
286{ 288{
287 might_sleep(); 289 might_sleep();
288 return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_); 290 return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
289} 291}
290EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); 292EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
291 293
@@ -294,7 +296,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
294{ 296{
295 might_sleep(); 297 might_sleep();
296 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 298 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
297 subclass, _RET_IP_); 299 subclass, NULL, _RET_IP_);
298} 300}
299 301
300EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); 302EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -400,7 +402,7 @@ __mutex_lock_slowpath(atomic_t *lock_count)
400{ 402{
401 struct mutex *lock = container_of(lock_count, struct mutex, count); 403 struct mutex *lock = container_of(lock_count, struct mutex, count);
402 404
403 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_); 405 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
404} 406}
405 407
406static noinline int __sched 408static noinline int __sched
@@ -408,7 +410,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count)
408{ 410{
409 struct mutex *lock = container_of(lock_count, struct mutex, count); 411 struct mutex *lock = container_of(lock_count, struct mutex, count);
410 412
411 return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_); 413 return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
412} 414}
413 415
414static noinline int __sched 416static noinline int __sched
@@ -416,7 +418,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count)
416{ 418{
417 struct mutex *lock = container_of(lock_count, struct mutex, count); 419 struct mutex *lock = container_of(lock_count, struct mutex, count);
418 420
419 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_); 421 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
420} 422}
421#endif 423#endif
422 424
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 67578ca48f94..4115fbf83b12 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -19,7 +19,7 @@
19#ifdef CONFIG_SMP 19#ifdef CONFIG_SMP
20static inline void mutex_set_owner(struct mutex *lock) 20static inline void mutex_set_owner(struct mutex *lock)
21{ 21{
22 lock->owner = current_thread_info(); 22 lock->owner = current;
23} 23}
24 24
25static inline void mutex_clear_owner(struct mutex *lock) 25static inline void mutex_clear_owner(struct mutex *lock)
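
[Editor's note] The three mutex files above switch owner tracking from struct thread_info to struct task_struct, so debug_mutex_unlock() compares the recorded owner directly against current. A userspace analogy of that bookkeeping with pthreads (pthread_self() standing in for current); this is an illustration of the check, not the kernel's implementation. Build with cc -pthread:

#include <pthread.h>
#include <stdio.h>

struct dbg_mutex {
	pthread_mutex_t lock;
	pthread_t owner;	/* stands in for struct task_struct *owner */
	int owned;
};

static void dbg_mutex_lock(struct dbg_mutex *m)
{
	pthread_mutex_lock(&m->lock);
	m->owner = pthread_self();	/* mutex_set_owner() */
	m->owned = 1;
}

static void dbg_mutex_unlock(struct dbg_mutex *m)
{
	/* DEBUG_LOCKS_WARN_ON(lock->owner != current) analogue. */
	if (!m->owned || !pthread_equal(m->owner, pthread_self()))
		fprintf(stderr, "warning: unlock by non-owner thread\n");
	m->owned = 0;			/* mutex_clear_owner() */
	pthread_mutex_unlock(&m->lock);
}

int main(void)
{
	struct dbg_mutex m = { .lock = PTHREAD_MUTEX_INITIALIZER };

	dbg_mutex_lock(&m);
	dbg_mutex_unlock(&m);
	return 0;
}
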
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 2488ba7eb568..8d7b435806c9 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -525,37 +525,6 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)
525} 525}
526EXPORT_SYMBOL_GPL(srcu_init_notifier_head); 526EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
527 527
528/**
529 * register_reboot_notifier - Register function to be called at reboot time
530 * @nb: Info about notifier function to be called
531 *
532 * Registers a function with the list of functions
533 * to be called at reboot time.
534 *
535 * Currently always returns zero, as blocking_notifier_chain_register()
536 * always returns zero.
537 */
538int register_reboot_notifier(struct notifier_block *nb)
539{
540 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
541}
542EXPORT_SYMBOL(register_reboot_notifier);
543
544/**
545 * unregister_reboot_notifier - Unregister previously registered reboot notifier
546 * @nb: Hook to be unregistered
547 *
548 * Unregisters a previously registered reboot
549 * notifier function.
550 *
551 * Returns zero on success, or %-ENOENT on failure.
552 */
553int unregister_reboot_notifier(struct notifier_block *nb)
554{
555 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
556}
557EXPORT_SYMBOL(unregister_reboot_notifier);
558
559static ATOMIC_NOTIFIER_HEAD(die_chain); 528static ATOMIC_NOTIFIER_HEAD(die_chain);
560 529
561int notrace __kprobes notify_die(enum die_val val, const char *str, 530int notrace __kprobes notify_die(enum die_val val, const char *str,
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
deleted file mode 100644
index 2c98ad94ba0e..000000000000
--- a/kernel/ns_cgroup.c
+++ /dev/null
@@ -1,118 +0,0 @@
1/*
2 * ns_cgroup.c - namespace cgroup subsystem
3 *
4 * Copyright 2006, 2007 IBM Corp
5 */
6
7#include <linux/module.h>
8#include <linux/cgroup.h>
9#include <linux/fs.h>
10#include <linux/proc_fs.h>
11#include <linux/slab.h>
12#include <linux/nsproxy.h>
13
14struct ns_cgroup {
15 struct cgroup_subsys_state css;
16};
17
18struct cgroup_subsys ns_subsys;
19
20static inline struct ns_cgroup *cgroup_to_ns(
21 struct cgroup *cgroup)
22{
23 return container_of(cgroup_subsys_state(cgroup, ns_subsys_id),
24 struct ns_cgroup, css);
25}
26
27int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
28{
29 char name[PROC_NUMBUF];
30
31 snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
32 return cgroup_clone(task, &ns_subsys, name);
33}
34
35/*
36 * Rules:
37 * 1. you can only enter a cgroup which is a descendant of your current
38 * cgroup
39 * 2. you can only place another process into a cgroup if
40 * a. you have CAP_SYS_ADMIN
41 * b. your cgroup is an ancestor of task's destination cgroup
42 * (hence either you are in the same cgroup as task, or in an
43 * ancestor cgroup thereof)
44 */
45static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
46 struct task_struct *task, bool threadgroup)
47{
48 if (current != task) {
49 if (!capable(CAP_SYS_ADMIN))
50 return -EPERM;
51
52 if (!cgroup_is_descendant(new_cgroup, current))
53 return -EPERM;
54 }
55
56 if (!cgroup_is_descendant(new_cgroup, task))
57 return -EPERM;
58
59 if (threadgroup) {
60 struct task_struct *c;
61 rcu_read_lock();
62 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
63 if (!cgroup_is_descendant(new_cgroup, c)) {
64 rcu_read_unlock();
65 return -EPERM;
66 }
67 }
68 rcu_read_unlock();
69 }
70
71 return 0;
72}
73
74/*
75 * Rules: you can only create a cgroup if
76 * 1. you are capable(CAP_SYS_ADMIN)
77 * 2. the target cgroup is a descendant of your own cgroup
78 */
79static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
80 struct cgroup *cgroup)
81{
82 struct ns_cgroup *ns_cgroup;
83
84 if (!capable(CAP_SYS_ADMIN))
85 return ERR_PTR(-EPERM);
86 if (!cgroup_is_descendant(cgroup, current))
87 return ERR_PTR(-EPERM);
88 if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) {
89 printk("ns_cgroup can't be created with parent "
90 "'clone_children' set.\n");
91 return ERR_PTR(-EINVAL);
92 }
93
94 printk_once("ns_cgroup deprecated: consider using the "
95 "'clone_children' flag without the ns_cgroup.\n");
96
97 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
98 if (!ns_cgroup)
99 return ERR_PTR(-ENOMEM);
100 return &ns_cgroup->css;
101}
102
103static void ns_destroy(struct cgroup_subsys *ss,
104 struct cgroup *cgroup)
105{
106 struct ns_cgroup *ns_cgroup;
107
108 ns_cgroup = cgroup_to_ns(cgroup);
109 kfree(ns_cgroup);
110}
111
112struct cgroup_subsys ns_subsys = {
113 .name = "ns",
114 .can_attach = ns_can_attach,
115 .create = ns_create,
116 .destroy = ns_destroy,
117 .subsys_id = ns_subsys_id,
118};
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a05d191ffdd9..9aeab4b98c64 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,6 +22,9 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <net/net_namespace.h> 23#include <net/net_namespace.h>
24#include <linux/ipc_namespace.h> 24#include <linux/ipc_namespace.h>
25#include <linux/proc_fs.h>
26#include <linux/file.h>
27#include <linux/syscalls.h>
25 28
26static struct kmem_cache *nsproxy_cachep; 29static struct kmem_cache *nsproxy_cachep;
27 30
@@ -198,10 +201,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
198 goto out; 201 goto out;
199 } 202 }
200 203
201 err = ns_cgroup_clone(current, task_pid(current));
202 if (err)
203 put_nsproxy(*new_nsp);
204
205out: 204out:
206 return err; 205 return err;
207} 206}
@@ -233,10 +232,47 @@ void exit_task_namespaces(struct task_struct *p)
233 switch_task_namespaces(p, NULL); 232 switch_task_namespaces(p, NULL);
234} 233}
235 234
236static int __init nsproxy_cache_init(void) 235SYSCALL_DEFINE2(setns, int, fd, int, nstype)
236{
237 const struct proc_ns_operations *ops;
238 struct task_struct *tsk = current;
239 struct nsproxy *new_nsproxy;
240 struct proc_inode *ei;
241 struct file *file;
242 int err;
243
244 if (!capable(CAP_SYS_ADMIN))
245 return -EPERM;
246
247 file = proc_ns_fget(fd);
248 if (IS_ERR(file))
249 return PTR_ERR(file);
250
251 err = -EINVAL;
252 ei = PROC_I(file->f_dentry->d_inode);
253 ops = ei->ns_ops;
254 if (nstype && (ops->type != nstype))
255 goto out;
256
257 new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
258 if (IS_ERR(new_nsproxy)) {
259 err = PTR_ERR(new_nsproxy);
260 goto out;
261 }
262
263 err = ops->install(new_nsproxy, ei->ns);
264 if (err) {
265 free_nsproxy(new_nsproxy);
266 goto out;
267 }
268 switch_task_namespaces(tsk, new_nsproxy);
269out:
270 fput(file);
271 return err;
272}
273
274int __init nsproxy_cache_init(void)
237{ 275{
238 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); 276 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
239 return 0; 277 return 0;
240} 278}
241
242module_init(nsproxy_cache_init);
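
[Editor's note] The nsproxy.c hunk above adds the setns(2) system call: userspace opens one of the /proc/<pid>/ns/* files and hands the descriptor to setns() to join that namespace. A minimal sketch; it assumes a glibc new enough to wrap setns() (otherwise use syscall()), CAP_SYS_ADMIN as enforced above, and the iproute2 "ip" tool for the final exec:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[64];
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/ns/net", argv[1]);

	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror(path);
		return 1;
	}

	/* nstype 0 means "any namespace type", matching the kernel's
	 * `if (nstype && (ops->type != nstype))` check above. */
	if (setns(fd, 0) < 0) {
		perror("setns");
		return 1;
	}
	close(fd);

	/* Everything from here on runs in the target's network namespace. */
	execlp("ip", "ip", "link", "show", (char *)NULL);
	perror("execlp");
	return 1;
}
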
diff --git a/kernel/panic.c b/kernel/panic.c
index 69231670eb95..d7bb6974efb5 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -119,6 +119,8 @@ NORET_TYPE void panic(const char * fmt, ...)
119 } 119 }
120 mdelay(PANIC_TIMER_STEP); 120 mdelay(PANIC_TIMER_STEP);
121 } 121 }
122 }
123 if (panic_timeout != 0) {
122 /* 124 /*
123 * This will not be a clean reboot, with everything 125 * This will not be a clean reboot, with everything
124 * shutting down. But if there is a chance of 126 * shutting down. But if there is a chance of
diff --git a/kernel/params.c b/kernel/params.c
index 7ab388a48a2e..22df3e0d142a 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -225,8 +225,8 @@ int parse_args(const char *name,
225 int ret; \ 225 int ret; \
226 \ 226 \
227 ret = strtolfn(val, 0, &l); \ 227 ret = strtolfn(val, 0, &l); \
228 if (ret == -EINVAL || ((type)l != l)) \ 228 if (ret < 0 || ((type)l != l)) \
229 return -EINVAL; \ 229 return ret < 0 ? ret : -EINVAL; \
230 *((type *)kp->arg) = l; \ 230 *((type *)kp->arg) = l; \
231 return 0; \ 231 return 0; \
232 } \ 232 } \
@@ -297,21 +297,15 @@ EXPORT_SYMBOL(param_ops_charp);
297int param_set_bool(const char *val, const struct kernel_param *kp) 297int param_set_bool(const char *val, const struct kernel_param *kp)
298{ 298{
299 bool v; 299 bool v;
300 int ret;
300 301
301 /* No equals means "set"... */ 302 /* No equals means "set"... */
302 if (!val) val = "1"; 303 if (!val) val = "1";
303 304
304 /* One of =[yYnN01] */ 305 /* One of =[yYnN01] */
305 switch (val[0]) { 306 ret = strtobool(val, &v);
306 case 'y': case 'Y': case '1': 307 if (ret)
307 v = true; 308 return ret;
308 break;
309 case 'n': case 'N': case '0':
310 v = false;
311 break;
312 default:
313 return -EINVAL;
314 }
315 309
316 if (kp->flags & KPARAM_ISBOOL) 310 if (kp->flags & KPARAM_ISBOOL)
317 *(bool *)kp->arg = v; 311 *(bool *)kp->arg = v;
@@ -517,7 +511,7 @@ struct module_param_attrs
517#define to_param_attr(n) container_of(n, struct param_attribute, mattr) 511#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
518 512
519static ssize_t param_attr_show(struct module_attribute *mattr, 513static ssize_t param_attr_show(struct module_attribute *mattr,
520 struct module *mod, char *buf) 514 struct module_kobject *mk, char *buf)
521{ 515{
522 int count; 516 int count;
523 struct param_attribute *attribute = to_param_attr(mattr); 517 struct param_attribute *attribute = to_param_attr(mattr);
@@ -537,7 +531,7 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
537 531
538/* sysfs always hands a nul-terminated string in buf. We rely on that. */ 532/* sysfs always hands a nul-terminated string in buf. We rely on that. */
539static ssize_t param_attr_store(struct module_attribute *mattr, 533static ssize_t param_attr_store(struct module_attribute *mattr,
540 struct module *owner, 534 struct module_kobject *km,
541 const char *buf, size_t len) 535 const char *buf, size_t len)
542{ 536{
543 int err; 537 int err;
@@ -736,6 +730,10 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
736 mk->kobj.kset = module_kset; 730 mk->kobj.kset = module_kset;
737 err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, 731 err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
738 "%s", name); 732 "%s", name);
733#ifdef CONFIG_MODULES
734 if (!err)
735 err = sysfs_create_file(&mk->kobj, &module_uevent.attr);
736#endif
739 if (err) { 737 if (err) {
740 kobject_put(&mk->kobj); 738 kobject_put(&mk->kobj);
741 printk(KERN_ERR 739 printk(KERN_ERR
@@ -813,7 +811,7 @@ static void __init param_sysfs_builtin(void)
813} 811}
814 812
815ssize_t __modver_version_show(struct module_attribute *mattr, 813ssize_t __modver_version_show(struct module_attribute *mattr,
816 struct module *mod, char *buf) 814 struct module_kobject *mk, char *buf)
817{ 815{
818 struct module_version_attribute *vattr = 816 struct module_version_attribute *vattr =
819 container_of(mattr, struct module_version_attribute, mattr); 817 container_of(mattr, struct module_version_attribute, mattr);
@@ -821,15 +819,18 @@ ssize_t __modver_version_show(struct module_attribute *mattr,
821 return sprintf(buf, "%s\n", vattr->version); 819 return sprintf(buf, "%s\n", vattr->version);
822} 820}
823 821
824extern struct module_version_attribute __start___modver[], __stop___modver[]; 822extern const struct module_version_attribute *__start___modver[];
823extern const struct module_version_attribute *__stop___modver[];
825 824
826static void __init version_sysfs_builtin(void) 825static void __init version_sysfs_builtin(void)
827{ 826{
828 const struct module_version_attribute *vattr; 827 const struct module_version_attribute **p;
829 struct module_kobject *mk; 828 struct module_kobject *mk;
830 int err; 829 int err;
831 830
832 for (vattr = __start___modver; vattr < __stop___modver; vattr++) { 831 for (p = __start___modver; p < __stop___modver; p++) {
832 const struct module_version_attribute *vattr = *p;
833
833 mk = locate_module_kobject(vattr->module_name); 834 mk = locate_module_kobject(vattr->module_name);
834 if (mk) { 835 if (mk) {
835 err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); 836 err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
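
[Editor's note] version_sysfs_builtin() above now walks a section of pointers to module_version_attribute rather than the structures themselves, which avoids the struct padding/alignment that can make pointer arithmetic over the raw section stride incorrectly. A userspace sketch of the same section-of-pointers pattern; it relies on GNU ld's automatic __start_<section>/__stop_<section> symbols and uses invented names throughout:

#include <stdio.h>

struct version_attr {
	const char *module_name;
	const char *version;
};

/* Each "module" contributes a pointer to its attribute; the pointers
 * all land in the custom section "modver_ptrs". */
#define MODVER(mod, ver)						\
	static const struct version_attr __attr_##mod = { #mod, ver };	\
	static const struct version_attr *__ptr_##mod			\
	__attribute__((used, section("modver_ptrs"))) = &__attr_##mod

MODVER(foo, "1.0");
MODVER(bar, "2.3");

/* GNU ld defines these for any section whose name is a C identifier. */
extern const struct version_attr *__start_modver_ptrs[];
extern const struct version_attr *__stop_modver_ptrs[];

int main(void)
{
	const struct version_attr **p;

	for (p = __start_modver_ptrs; p < __stop_modver_ptrs; p++)
		printf("%s: version %s\n", (*p)->module_name, (*p)->version);
	return 0;
}
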
@@ -855,7 +856,7 @@ static ssize_t module_attr_show(struct kobject *kobj,
855 if (!attribute->show) 856 if (!attribute->show)
856 return -EIO; 857 return -EIO;
857 858
858 ret = attribute->show(attribute, mk->mod, buf); 859 ret = attribute->show(attribute, mk, buf);
859 860
860 return ret; 861 return ret;
861} 862}
@@ -874,7 +875,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
874 if (!attribute->store) 875 if (!attribute->store)
875 return -EIO; 876 return -EIO;
876 877
877 ret = attribute->store(attribute, mk->mod, buf, len); 878 ret = attribute->store(attribute, mk, buf, len);
878 879
879 return ret; 880 return ret;
880} 881}
diff --git a/kernel/pid.c b/kernel/pid.c
index 57a8346a270e..e432057f3b21 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -405,7 +405,6 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
405 if (pid) { 405 if (pid) {
406 struct hlist_node *first; 406 struct hlist_node *first;
407 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), 407 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
408 rcu_read_lock_held() ||
409 lockdep_tasklist_lock_is_held()); 408 lockdep_tasklist_lock_is_held());
410 if (first) 409 if (first)
411 result = hlist_entry(first, struct task_struct, pids[(type)].node); 410 result = hlist_entry(first, struct task_struct, pids[(type)].node);
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 0da058bff8eb..37f05d0f0793 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -40,6 +40,7 @@
40#include <linux/string.h> 40#include <linux/string.h>
41#include <linux/platform_device.h> 41#include <linux/platform_device.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/kernel.h>
43 44
44#include <linux/uaccess.h> 45#include <linux/uaccess.h>
45 46
@@ -53,11 +54,17 @@ enum pm_qos_type {
53 PM_QOS_MIN /* return the smallest value */ 54 PM_QOS_MIN /* return the smallest value */
54}; 55};
55 56
57/*
58 * Note: The lockless read path depends on the CPU accessing
59 * target_value atomically. Atomic access is only guaranteed on all CPU
 60 * types linux supports for 32 bit quantities
61 */
56struct pm_qos_object { 62struct pm_qos_object {
57 struct plist_head requests; 63 struct plist_head requests;
58 struct blocking_notifier_head *notifiers; 64 struct blocking_notifier_head *notifiers;
59 struct miscdevice pm_qos_power_miscdev; 65 struct miscdevice pm_qos_power_miscdev;
60 char *name; 66 char *name;
67 s32 target_value; /* Do not change to 64 bit */
61 s32 default_value; 68 s32 default_value;
62 enum pm_qos_type type; 69 enum pm_qos_type type;
63}; 70};
@@ -67,29 +74,32 @@ static DEFINE_SPINLOCK(pm_qos_lock);
67static struct pm_qos_object null_pm_qos; 74static struct pm_qos_object null_pm_qos;
68static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
69static struct pm_qos_object cpu_dma_pm_qos = { 76static struct pm_qos_object cpu_dma_pm_qos = {
70 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), 77 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests),
71 .notifiers = &cpu_dma_lat_notifier, 78 .notifiers = &cpu_dma_lat_notifier,
72 .name = "cpu_dma_latency", 79 .name = "cpu_dma_latency",
73 .default_value = 2000 * USEC_PER_SEC, 80 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
81 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
74 .type = PM_QOS_MIN, 82 .type = PM_QOS_MIN,
75}; 83};
76 84
77static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
78static struct pm_qos_object network_lat_pm_qos = { 86static struct pm_qos_object network_lat_pm_qos = {
79 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), 87 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests),
80 .notifiers = &network_lat_notifier, 88 .notifiers = &network_lat_notifier,
81 .name = "network_latency", 89 .name = "network_latency",
82 .default_value = 2000 * USEC_PER_SEC, 90 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
91 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
83 .type = PM_QOS_MIN 92 .type = PM_QOS_MIN
84}; 93};
85 94
86 95
87static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
88static struct pm_qos_object network_throughput_pm_qos = { 97static struct pm_qos_object network_throughput_pm_qos = {
89 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), 98 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests),
90 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
91 .name = "network_throughput", 100 .name = "network_throughput",
92 .default_value = 0, 101 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
102 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
93 .type = PM_QOS_MAX, 103 .type = PM_QOS_MAX,
94}; 104};
95 105
@@ -135,6 +145,16 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
135 } 145 }
136} 146}
137 147
148static inline s32 pm_qos_read_value(struct pm_qos_object *o)
149{
150 return o->target_value;
151}
152
153static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value)
154{
155 o->target_value = value;
156}
157
138static void update_target(struct pm_qos_object *o, struct plist_node *node, 158static void update_target(struct pm_qos_object *o, struct plist_node *node,
139 int del, int value) 159 int del, int value)
140{ 160{
@@ -159,6 +179,7 @@ static void update_target(struct pm_qos_object *o, struct plist_node *node,
159 plist_add(node, &o->requests); 179 plist_add(node, &o->requests);
160 } 180 }
161 curr_value = pm_qos_get_value(o); 181 curr_value = pm_qos_get_value(o);
182 pm_qos_set_value(o, curr_value);
162 spin_unlock_irqrestore(&pm_qos_lock, flags); 183 spin_unlock_irqrestore(&pm_qos_lock, flags);
163 184
164 if (prev_value != curr_value) 185 if (prev_value != curr_value)
@@ -193,18 +214,11 @@ static int find_pm_qos_object_by_minor(int minor)
193 * pm_qos_request - returns current system wide qos expectation 214 * pm_qos_request - returns current system wide qos expectation
194 * @pm_qos_class: identification of which qos value is requested 215 * @pm_qos_class: identification of which qos value is requested
195 * 216 *
196 * This function returns the current target value in an atomic manner. 217 * This function returns the current target value.
197 */ 218 */
198int pm_qos_request(int pm_qos_class) 219int pm_qos_request(int pm_qos_class)
199{ 220{
200 unsigned long flags; 221 return pm_qos_read_value(pm_qos_array[pm_qos_class]);
201 int value;
202
203 spin_lock_irqsave(&pm_qos_lock, flags);
204 value = pm_qos_get_value(pm_qos_array[pm_qos_class]);
205 spin_unlock_irqrestore(&pm_qos_lock, flags);
206
207 return value;
208} 222}
209EXPORT_SYMBOL_GPL(pm_qos_request); 223EXPORT_SYMBOL_GPL(pm_qos_request);
210 224
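
[Editor's note] pm_qos_request() above becomes a lock-free read of the target_value that update_target() publishes while still holding pm_qos_lock; the kernel relies on naturally atomic 32-bit loads and stores. A userspace rendering of the same "aggregate under a lock, snapshot for lock-free readers" idea, spelled with C11 relaxed atomics so the unlocked read is well defined (build with cc -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t requests_lock = PTHREAD_MUTEX_INITIALIZER;
static int requests[8];			/* protected by requests_lock    */
static int nr_requests;
static atomic_int target_value;		/* lock-free snapshot for readers */

static void add_request(int value)
{
	int i, min;

	pthread_mutex_lock(&requests_lock);
	requests[nr_requests++] = value;
	min = requests[0];
	for (i = 1; i < nr_requests; i++)	/* PM_QOS_MIN aggregation */
		if (requests[i] < min)
			min = requests[i];
	/* pm_qos_set_value(): publish the new target while still locked. */
	atomic_store_explicit(&target_value, min, memory_order_relaxed);
	pthread_mutex_unlock(&requests_lock);
}

static int read_target(void)
{
	/* pm_qos_read_value()/pm_qos_request(): no lock taken. */
	return atomic_load_explicit(&target_value, memory_order_relaxed);
}

int main(void)
{
	add_request(200);
	add_request(50);
	printf("target = %d\n", read_target());
	return 0;
}
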
@@ -385,7 +399,7 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
385 s32 value; 399 s32 value;
386 unsigned long flags; 400 unsigned long flags;
387 struct pm_qos_object *o; 401 struct pm_qos_object *o;
388 struct pm_qos_request_list *pm_qos_req = filp->private_data;; 402 struct pm_qos_request_list *pm_qos_req = filp->private_data;
389 403
390 if (!pm_qos_req) 404 if (!pm_qos_req)
391 return -EINVAL; 405 return -EINVAL;
@@ -404,24 +418,36 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
404 size_t count, loff_t *f_pos) 418 size_t count, loff_t *f_pos)
405{ 419{
406 s32 value; 420 s32 value;
407 int x;
408 char ascii_value[11];
409 struct pm_qos_request_list *pm_qos_req; 421 struct pm_qos_request_list *pm_qos_req;
410 422
411 if (count == sizeof(s32)) { 423 if (count == sizeof(s32)) {
412 if (copy_from_user(&value, buf, sizeof(s32))) 424 if (copy_from_user(&value, buf, sizeof(s32)))
413 return -EFAULT; 425 return -EFAULT;
414 } else if (count == 11) { /* len('0x12345678/0') */ 426 } else if (count <= 11) { /* ASCII perhaps? */
415 if (copy_from_user(ascii_value, buf, 11)) 427 char ascii_value[11];
428 unsigned long int ulval;
429 int ret;
430
431 if (copy_from_user(ascii_value, buf, count))
416 return -EFAULT; 432 return -EFAULT;
417 if (strlen(ascii_value) != 10) 433
418 return -EINVAL; 434 if (count > 10) {
419 x = sscanf(ascii_value, "%x", &value); 435 if (ascii_value[10] == '\n')
420 if (x != 1) 436 ascii_value[10] = '\0';
437 else
438 return -EINVAL;
439 } else {
440 ascii_value[count] = '\0';
441 }
442 ret = strict_strtoul(ascii_value, 16, &ulval);
443 if (ret) {
444 pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
421 return -EINVAL; 445 return -EINVAL;
422 pr_debug("%s, %d, 0x%x\n", ascii_value, x, value); 446 }
423 } else 447 value = (s32)lower_32_bits(ulval);
448 } else {
424 return -EINVAL; 449 return -EINVAL;
450 }
425 451
426 pm_qos_req = filp->private_data; 452 pm_qos_req = filp->private_data;
427 pm_qos_update_request(pm_qos_req, value); 453 pm_qos_update_request(pm_qos_req, value);
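
[Editor's note] For context on the interface the pm_qos_power_write() hunk above relaxes: a typical consumer opens the misc device, writes a value (a raw s32, or now an ASCII hex string of up to 11 bytes), and keeps the descriptor open for as long as the constraint should hold; closing the file drops the request. A minimal sketch using the raw form (assumes root and that /dev/cpu_dma_latency exists):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int32_t max_latency_us = 10;	/* request <= 10us wakeup latency */
	int fd = open("/dev/cpu_dma_latency", O_WRONLY);

	if (fd < 0) {
		perror("/dev/cpu_dma_latency");
		return 1;
	}
	if (write(fd, &max_latency_us, sizeof(max_latency_us)) < 0) {
		perror("write");
		return 1;
	}
	puts("holding cpu_dma_latency request; press Enter to release");
	getchar();			/* request is dropped when fd closes */
	close(fd);
	return 0;
}
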
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 0791b13df7bf..58f405b581e7 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1514,7 +1514,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1514 return -EFAULT; 1514 return -EFAULT;
1515 1515
1516 restart_block->fn = posix_cpu_nsleep_restart; 1516 restart_block->fn = posix_cpu_nsleep_restart;
1517 restart_block->nanosleep.index = which_clock; 1517 restart_block->nanosleep.clockid = which_clock;
1518 restart_block->nanosleep.rmtp = rmtp; 1518 restart_block->nanosleep.rmtp = rmtp;
1519 restart_block->nanosleep.expires = timespec_to_ns(rqtp); 1519 restart_block->nanosleep.expires = timespec_to_ns(rqtp);
1520 } 1520 }
@@ -1523,7 +1523,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1523 1523
1524static long posix_cpu_nsleep_restart(struct restart_block *restart_block) 1524static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1525{ 1525{
1526 clockid_t which_clock = restart_block->nanosleep.index; 1526 clockid_t which_clock = restart_block->nanosleep.clockid;
1527 struct timespec t; 1527 struct timespec t;
1528 struct itimerspec it; 1528 struct itimerspec it;
1529 int error; 1529 int error;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index e5498d7405c3..4556182527f3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -491,6 +491,13 @@ static struct k_itimer * alloc_posix_timer(void)
491 return tmr; 491 return tmr;
492} 492}
493 493
494static void k_itimer_rcu_free(struct rcu_head *head)
495{
496 struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
497
498 kmem_cache_free(posix_timers_cache, tmr);
499}
500
494#define IT_ID_SET 1 501#define IT_ID_SET 1
495#define IT_ID_NOT_SET 0 502#define IT_ID_NOT_SET 0
496static void release_posix_timer(struct k_itimer *tmr, int it_id_set) 503static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
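
[Editor's note] k_itimer_rcu_free() above recovers the timer from its embedded rcu_head with container_of(), so the RCU callback can free the right object after the grace period. A standalone illustration of that recovery, using a simplified container_of() (the kernel's version in <linux/kernel.h> also type-checks via typeof) and invented structure names:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct rcu_head_demo {
	struct rcu_head_demo *next;
	void (*func)(struct rcu_head_demo *);
};

struct k_itimer_demo {
	int it_id;
	struct rcu_head_demo rcu;  /* the callback only sees this member */
};

static void timer_rcu_free(struct rcu_head_demo *head)
{
	struct k_itimer_demo *tmr =
		container_of(head, struct k_itimer_demo, rcu);

	printf("freeing timer id %d\n", tmr->it_id);
}

int main(void)
{
	struct k_itimer_demo t = { .it_id = 42 };

	/* In the kernel this pointer would arrive via call_rcu() after a
	 * grace period; here it is invoked directly. */
	timer_rcu_free(&t.rcu);
	return 0;
}
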
@@ -503,7 +510,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
503 } 510 }
504 put_pid(tmr->it_pid); 511 put_pid(tmr->it_pid);
505 sigqueue_free(tmr->sigq); 512 sigqueue_free(tmr->sigq);
506 kmem_cache_free(posix_timers_cache, tmr); 513 call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
507} 514}
508 515
509static struct k_clock *clockid_to_kclock(const clockid_t id) 516static struct k_clock *clockid_to_kclock(const clockid_t id)
@@ -631,22 +638,18 @@ out:
631static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) 638static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
632{ 639{
633 struct k_itimer *timr; 640 struct k_itimer *timr;
634 /* 641
635 * Watch out here. We do a irqsave on the idr_lock and pass the 642 rcu_read_lock();
636 * flags part over to the timer lock. Must not let interrupts in
637 * while we are moving the lock.
638 */
639 spin_lock_irqsave(&idr_lock, *flags);
640 timr = idr_find(&posix_timers_id, (int)timer_id); 643 timr = idr_find(&posix_timers_id, (int)timer_id);
641 if (timr) { 644 if (timr) {
642 spin_lock(&timr->it_lock); 645 spin_lock_irqsave(&timr->it_lock, *flags);
643 if (timr->it_signal == current->signal) { 646 if (timr->it_signal == current->signal) {
644 spin_unlock(&idr_lock); 647 rcu_read_unlock();
645 return timr; 648 return timr;
646 } 649 }
647 spin_unlock(&timr->it_lock); 650 spin_unlock_irqrestore(&timr->it_lock, *flags);
648 } 651 }
649 spin_unlock_irqrestore(&idr_lock, *flags); 652 rcu_read_unlock();
650 653
651 return NULL; 654 return NULL;
652} 655}
@@ -1056,7 +1059,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1056 */ 1059 */
1057long clock_nanosleep_restart(struct restart_block *restart_block) 1060long clock_nanosleep_restart(struct restart_block *restart_block)
1058{ 1061{
1059 clockid_t which_clock = restart_block->nanosleep.index; 1062 clockid_t which_clock = restart_block->nanosleep.clockid;
1060 struct k_clock *kc = clockid_to_kclock(which_clock); 1063 struct k_clock *kc = clockid_to_kclock(which_clock);
1061 1064
1062 if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) 1065 if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 6de9a8fc3417..b1914cb9095c 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -125,12 +125,6 @@ config PM_DEBUG
125 code. This is helpful when debugging and reporting PM bugs, like 125 code. This is helpful when debugging and reporting PM bugs, like
126 suspend support. 126 suspend support.
127 127
128config PM_VERBOSE
129 bool "Verbose Power Management debugging"
130 depends on PM_DEBUG
131 ---help---
132 This option enables verbose messages from the Power Management code.
133
134config PM_ADVANCED_DEBUG 128config PM_ADVANCED_DEBUG
135 bool "Extra PM attributes in sysfs for low-level debugging/testing" 129 bool "Extra PM attributes in sysfs for low-level debugging/testing"
136 depends on PM_DEBUG 130 depends on PM_DEBUG
@@ -199,8 +193,8 @@ config APM_EMULATION
199 notification of APM "events" (e.g. battery status change). 193 notification of APM "events" (e.g. battery status change).
200 194
201 In order to use APM, you will need supporting software. For location 195 In order to use APM, you will need supporting software. For location
202 and more information, read <file:Documentation/power/pm.txt> and the 196 and more information, read <file:Documentation/power/apm-acpi.txt>
203 Battery Powered Linux mini-HOWTO, available from 197 and the Battery Powered Linux mini-HOWTO, available from
204 <http://www.tldp.org/docs.html#howto>. 198 <http://www.tldp.org/docs.html#howto>.
205 199
206 This driver does not spin down disk drives (see the hdparm(8) 200 This driver does not spin down disk drives (see the hdparm(8)
@@ -229,3 +223,11 @@ config PM_OPP
229 representing individual voltage domains and provides SOC 223 representing individual voltage domains and provides SOC
230 implementations a ready to use framework to manage OPPs. 224 implementations a ready to use framework to manage OPPs.
231 For more information, read <file:Documentation/power/opp.txt> 225 For more information, read <file:Documentation/power/opp.txt>
226
227config PM_CLK
228 def_bool y
229 depends on PM && HAVE_CLK
230
231config PM_GENERIC_DOMAINS
232 bool
233 depends on PM
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 50aae660174d..8f7b1db1ece1 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -25,7 +25,6 @@
25#include <linux/gfp.h> 25#include <linux/gfp.h>
26#include <linux/syscore_ops.h> 26#include <linux/syscore_ops.h>
27#include <scsi/scsi_scan.h> 27#include <scsi/scsi_scan.h>
28#include <asm/suspend.h>
29 28
30#include "power.h" 29#include "power.h"
31 30
@@ -55,10 +54,9 @@ static int hibernation_mode = HIBERNATION_SHUTDOWN;
55static const struct platform_hibernation_ops *hibernation_ops; 54static const struct platform_hibernation_ops *hibernation_ops;
56 55
57/** 56/**
58 * hibernation_set_ops - set the global hibernate operations 57 * hibernation_set_ops - Set the global hibernate operations.
59 * @ops: the hibernation operations to use in subsequent hibernation transitions 58 * @ops: Hibernation operations to use in subsequent hibernation transitions.
60 */ 59 */
61
62void hibernation_set_ops(const struct platform_hibernation_ops *ops) 60void hibernation_set_ops(const struct platform_hibernation_ops *ops)
63{ 61{
64 if (ops && !(ops->begin && ops->end && ops->pre_snapshot 62 if (ops && !(ops->begin && ops->end && ops->pre_snapshot
@@ -115,10 +113,9 @@ static int hibernation_test(int level) { return 0; }
115#endif /* !CONFIG_PM_DEBUG */ 113#endif /* !CONFIG_PM_DEBUG */
116 114
117/** 115/**
118 * platform_begin - tell the platform driver that we're starting 116 * platform_begin - Call platform to start hibernation.
119 * hibernation 117 * @platform_mode: Whether or not to use the platform driver.
120 */ 118 */
121
122static int platform_begin(int platform_mode) 119static int platform_begin(int platform_mode)
123{ 120{
124 return (platform_mode && hibernation_ops) ? 121 return (platform_mode && hibernation_ops) ?
@@ -126,10 +123,9 @@ static int platform_begin(int platform_mode)
126} 123}
127 124
128/** 125/**
129 * platform_end - tell the platform driver that we've entered the 126 * platform_end - Call platform to finish transition to the working state.
130 * working state 127 * @platform_mode: Whether or not to use the platform driver.
131 */ 128 */
132
133static void platform_end(int platform_mode) 129static void platform_end(int platform_mode)
134{ 130{
135 if (platform_mode && hibernation_ops) 131 if (platform_mode && hibernation_ops)
@@ -137,8 +133,11 @@ static void platform_end(int platform_mode)
137} 133}
138 134
139/** 135/**
140 * platform_pre_snapshot - prepare the machine for hibernation using the 136 * platform_pre_snapshot - Call platform to prepare the machine for hibernation.
141 * platform driver if so configured and return an error code if it fails 137 * @platform_mode: Whether or not to use the platform driver.
138 *
139 * Use the platform driver to prepare the system for creating a hibernate image,
140 * if so configured, and return an error code if that fails.
142 */ 141 */
143 142
144static int platform_pre_snapshot(int platform_mode) 143static int platform_pre_snapshot(int platform_mode)
@@ -148,10 +147,14 @@ static int platform_pre_snapshot(int platform_mode)
148} 147}
149 148
150/** 149/**
151 * platform_leave - prepare the machine for switching to the normal mode 150 * platform_leave - Call platform to prepare a transition to the working state.
152 * of operation using the platform driver (called with interrupts disabled) 151 * @platform_mode: Whether or not to use the platform driver.
152 *
 153 * Use the platform driver to prepare the machine for switching to the
154 * normal mode of operation.
155 *
156 * This routine is called on one CPU with interrupts disabled.
153 */ 157 */
154
155static void platform_leave(int platform_mode) 158static void platform_leave(int platform_mode)
156{ 159{
157 if (platform_mode && hibernation_ops) 160 if (platform_mode && hibernation_ops)
@@ -159,10 +162,14 @@ static void platform_leave(int platform_mode)
159} 162}
160 163
161/** 164/**
162 * platform_finish - switch the machine to the normal mode of operation 165 * platform_finish - Call platform to switch the system to the working state.
163 * using the platform driver (must be called after platform_prepare()) 166 * @platform_mode: Whether or not to use the platform driver.
167 *
168 * Use the platform driver to switch the machine to the normal mode of
169 * operation.
170 *
171 * This routine must be called after platform_prepare().
164 */ 172 */
165
166static void platform_finish(int platform_mode) 173static void platform_finish(int platform_mode)
167{ 174{
168 if (platform_mode && hibernation_ops) 175 if (platform_mode && hibernation_ops)
@@ -170,11 +177,15 @@ static void platform_finish(int platform_mode)
170} 177}
171 178
172/** 179/**
173 * platform_pre_restore - prepare the platform for the restoration from a 180 * platform_pre_restore - Prepare for hibernate image restoration.
174 * hibernation image. If the restore fails after this function has been 181 * @platform_mode: Whether or not to use the platform driver.
175 * called, platform_restore_cleanup() must be called. 182 *
183 * Use the platform driver to prepare the system for resume from a hibernation
184 * image.
185 *
186 * If the restore fails after this function has been called,
187 * platform_restore_cleanup() must be called.
176 */ 188 */
177
178static int platform_pre_restore(int platform_mode) 189static int platform_pre_restore(int platform_mode)
179{ 190{
180 return (platform_mode && hibernation_ops) ? 191 return (platform_mode && hibernation_ops) ?
@@ -182,12 +193,16 @@ static int platform_pre_restore(int platform_mode)
182} 193}
183 194
184/** 195/**
185 * platform_restore_cleanup - switch the platform to the normal mode of 196 * platform_restore_cleanup - Switch to the working state after failing restore.
186 * operation after a failing restore. If platform_pre_restore() has been 197 * @platform_mode: Whether or not to use the platform driver.
187 * called before the failing restore, this function must be called too, 198 *
188 * regardless of the result of platform_pre_restore(). 199 * Use the platform driver to switch the system to the normal mode of operation
200 * after a failing restore.
201 *
202 * If platform_pre_restore() has been called before the failing restore, this
203 * function must be called too, regardless of the result of
204 * platform_pre_restore().
189 */ 205 */
190
191static void platform_restore_cleanup(int platform_mode) 206static void platform_restore_cleanup(int platform_mode)
192{ 207{
193 if (platform_mode && hibernation_ops) 208 if (platform_mode && hibernation_ops)
@@ -195,10 +210,9 @@ static void platform_restore_cleanup(int platform_mode)
195} 210}
196 211
197/** 212/**
198 * platform_recover - recover the platform from a failure to suspend 213 * platform_recover - Recover from a failure to suspend devices.
199 * devices. 214 * @platform_mode: Whether or not to use the platform driver.
200 */ 215 */
201
202static void platform_recover(int platform_mode) 216static void platform_recover(int platform_mode)
203{ 217{
204 if (platform_mode && hibernation_ops && hibernation_ops->recover) 218 if (platform_mode && hibernation_ops && hibernation_ops->recover)
@@ -206,13 +220,12 @@ static void platform_recover(int platform_mode)
206} 220}
207 221
208/** 222/**
209 * swsusp_show_speed - print the time elapsed between two events. 223 * swsusp_show_speed - Print time elapsed between two events during hibernation.
210 * @start: Starting event. 224 * @start: Starting event.
211 * @stop: Final event. 225 * @stop: Final event.
212 * @nr_pages - number of pages processed between @start and @stop 226 * @nr_pages: Number of memory pages processed between @start and @stop.
213 * @msg - introductory message to print 227 * @msg: Additional diagnostic message to print.
214 */ 228 */
215
216void swsusp_show_speed(struct timeval *start, struct timeval *stop, 229void swsusp_show_speed(struct timeval *start, struct timeval *stop,
217 unsigned nr_pages, char *msg) 230 unsigned nr_pages, char *msg)
218{ 231{
@@ -235,25 +248,18 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
235} 248}
236 249
237/** 250/**
238 * create_image - freeze devices that need to be frozen with interrupts 251 * create_image - Create a hibernation image.
239 * off, create the hibernation image and thaw those devices. Control 252 * @platform_mode: Whether or not to use the platform driver.
240 * reappears in this routine after a restore. 253 *
254 * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image
255 * and execute the drivers' .thaw_noirq() callbacks.
256 *
257 * Control reappears in this routine after the subsequent restore.
241 */ 258 */
242
243static int create_image(int platform_mode) 259static int create_image(int platform_mode)
244{ 260{
245 int error; 261 int error;
246 262
247 error = arch_prepare_suspend();
248 if (error)
249 return error;
250
251 /* At this point, dpm_suspend_start() has been called, but *not*
252 * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.
253 * Otherwise, drivers for some devices (e.g. interrupt controllers)
254 * become desynchronized with the actual state of the hardware
255 * at resume time, and evil weirdness ensues.
256 */
257 error = dpm_suspend_noirq(PMSG_FREEZE); 263 error = dpm_suspend_noirq(PMSG_FREEZE);
258 if (error) { 264 if (error) {
259 printk(KERN_ERR "PM: Some devices failed to power down, " 265 printk(KERN_ERR "PM: Some devices failed to power down, "
@@ -272,12 +278,7 @@ static int create_image(int platform_mode)
272 278
273 local_irq_disable(); 279 local_irq_disable();
274 280
275 error = sysdev_suspend(PMSG_FREEZE); 281 error = syscore_suspend();
276 if (!error) {
277 error = syscore_suspend();
278 if (error)
279 sysdev_resume();
280 }
281 if (error) { 282 if (error) {
282 printk(KERN_ERR "PM: Some system devices failed to power down, " 283 printk(KERN_ERR "PM: Some system devices failed to power down, "
283 "aborting hibernation\n"); 284 "aborting hibernation\n");
@@ -302,10 +303,6 @@ static int create_image(int platform_mode)
302 303
303 Power_up: 304 Power_up:
304 syscore_resume(); 305 syscore_resume();
305 sysdev_resume();
306 /* NOTE: dpm_resume_noirq() is just a resume() for devices
307 * that suspended with irqs off ... no overall powerup.
308 */
309 306
310 Enable_irqs: 307 Enable_irqs:
311 local_irq_enable(); 308 local_irq_enable();
@@ -323,30 +320,32 @@ static int create_image(int platform_mode)
323} 320}
324 321
325/** 322/**
326 * hibernation_snapshot - quiesce devices and create the hibernation 323 * hibernation_snapshot - Quiesce devices and create a hibernation image.
327 * snapshot image. 324 * @platform_mode: If set, use platform driver to prepare for the transition.
328 * @platform_mode - if set, use the platform driver, if available, to
329 * prepare the platform firmware for the power transition.
330 * 325 *
331 * Must be called with pm_mutex held 326 * This routine must be called with pm_mutex held.
332 */ 327 */
333
334int hibernation_snapshot(int platform_mode) 328int hibernation_snapshot(int platform_mode)
335{ 329{
330 pm_message_t msg = PMSG_RECOVER;
336 int error; 331 int error;
337 332
338 error = platform_begin(platform_mode); 333 error = platform_begin(platform_mode);
339 if (error) 334 if (error)
340 goto Close; 335 goto Close;
341 336
337 error = dpm_prepare(PMSG_FREEZE);
338 if (error)
339 goto Complete_devices;
340
342 /* Preallocate image memory before shutting down devices. */ 341 /* Preallocate image memory before shutting down devices. */
343 error = hibernate_preallocate_memory(); 342 error = hibernate_preallocate_memory();
344 if (error) 343 if (error)
345 goto Close; 344 goto Complete_devices;
346 345
347 suspend_console(); 346 suspend_console();
348 pm_restrict_gfp_mask(); 347 pm_restrict_gfp_mask();
349 error = dpm_suspend_start(PMSG_FREEZE); 348 error = dpm_suspend(PMSG_FREEZE);
350 if (error) 349 if (error)
351 goto Recover_platform; 350 goto Recover_platform;
352 351
@@ -364,13 +363,17 @@ int hibernation_snapshot(int platform_mode)
364 if (error || !in_suspend) 363 if (error || !in_suspend)
365 swsusp_free(); 364 swsusp_free();
366 365
367 dpm_resume_end(in_suspend ? 366 msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE;
368 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 367 dpm_resume(msg);
369 368
370 if (error || !in_suspend) 369 if (error || !in_suspend)
371 pm_restore_gfp_mask(); 370 pm_restore_gfp_mask();
372 371
373 resume_console(); 372 resume_console();
373
374 Complete_devices:
375 dpm_complete(msg);
376
374 Close: 377 Close:
375 platform_end(platform_mode); 378 platform_end(platform_mode);
376 return error; 379 return error;
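
[Editor's note] The hibernation_snapshot() hunk above threads in a Complete_devices unwind label so dpm_complete() runs on every path once dpm_prepare() has been called. A toy sketch of that staged goto-unwind shape, with invented step names, only to show how each label pairs with the setup steps that precede it:

#include <stdio.h>

static int step(const char *name, int fail)
{
	printf("%s\n", name);
	return fail ? -1 : 0;
}

static int do_transition(int fail_at_suspend)
{
	int error;

	error = step("platform_begin", 0);
	if (error)
		goto Close;

	error = step("dpm_prepare", 0);
	if (error)
		goto Complete_devices;

	error = step("dpm_suspend", fail_at_suspend);
	if (error)
		goto Complete_devices;

	step("snapshot + dpm_resume", 0);

 Complete_devices:
	step("dpm_complete", 0);
 Close:
	step("platform_end", 0);
	return error;
}

int main(void)
{
	do_transition(1);	/* exercise the failure path */
	return 0;
}
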
@@ -381,13 +384,14 @@ int hibernation_snapshot(int platform_mode)
381} 384}
382 385
383/** 386/**
384 * resume_target_kernel - prepare devices that need to be suspended with 387 * resume_target_kernel - Restore system state from a hibernation image.
385 * interrupts off, restore the contents of highmem that have not been 388 * @platform_mode: Whether or not to use the platform driver.
386 * restored yet from the image and run the low level code that will restore 389 *
387 * the remaining contents of memory and switch to the just restored target 390 * Execute device drivers' .freeze_noirq() callbacks, restore the contents of
388 * kernel. 391 * highmem that have not been restored yet from the image and run the low-level
392 * code that will restore the remaining contents of memory and switch to the
393 * just restored target kernel.
389 */ 394 */
390
391static int resume_target_kernel(bool platform_mode) 395static int resume_target_kernel(bool platform_mode)
392{ 396{
393 int error; 397 int error;
@@ -409,40 +413,36 @@ static int resume_target_kernel(bool platform_mode)
409 413
410 local_irq_disable(); 414 local_irq_disable();
411 415
412 error = sysdev_suspend(PMSG_QUIESCE); 416 error = syscore_suspend();
413 if (!error) {
414 error = syscore_suspend();
415 if (error)
416 sysdev_resume();
417 }
418 if (error) 417 if (error)
419 goto Enable_irqs; 418 goto Enable_irqs;
420 419
421 /* We'll ignore saved state, but this gets preempt count (etc) right */
422 save_processor_state(); 420 save_processor_state();
423 error = restore_highmem(); 421 error = restore_highmem();
424 if (!error) { 422 if (!error) {
425 error = swsusp_arch_resume(); 423 error = swsusp_arch_resume();
426 /* 424 /*
427 * The code below is only ever reached in case of a failure. 425 * The code below is only ever reached in case of a failure.
428 * Otherwise execution continues at place where 426 * Otherwise, execution continues at the place where
429 * swsusp_arch_suspend() was called 427 * swsusp_arch_suspend() was called.
430 */ 428 */
431 BUG_ON(!error); 429 BUG_ON(!error);
432 /* This call to restore_highmem() undos the previous one */ 430 /*
431 * This call to restore_highmem() reverts the changes made by
432 * the previous one.
433 */
433 restore_highmem(); 434 restore_highmem();
434 } 435 }
435 /* 436 /*
436 * The only reason why swsusp_arch_resume() can fail is memory being 437 * The only reason why swsusp_arch_resume() can fail is memory being
437 * very tight, so we have to free it as soon as we can to avoid 438 * very tight, so we have to free it as soon as we can to avoid
438 * subsequent failures 439 * subsequent failures.
439 */ 440 */
440 swsusp_free(); 441 swsusp_free();
441 restore_processor_state(); 442 restore_processor_state();
442 touch_softlockup_watchdog(); 443 touch_softlockup_watchdog();
443 444
444 syscore_resume(); 445 syscore_resume();
445 sysdev_resume();
446 446
447 Enable_irqs: 447 Enable_irqs:
448 local_irq_enable(); 448 local_irq_enable();
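
The sysdev_suspend()/sysdev_resume() calls dropped above are gone because the work formerly done through sysdev PM callbacks is now handled by syscore_ops registered by the affected subsystems. A minimal sketch of that interface, with invented example_ names; struct syscore_ops and register_syscore_ops() come from <linux/syscore_ops.h>, and the callbacks run late, on one CPU, with interrupts disabled:

#include <linux/init.h>
#include <linux/syscore_ops.h>

static int example_syscore_suspend(void)
{
        /* save a small amount of core state; must not sleep */
        return 0;
}

static void example_syscore_resume(void)
{
        /* restore the state saved in example_syscore_suspend() */
}

static struct syscore_ops example_syscore_ops = {
        .suspend        = example_syscore_suspend,
        .resume         = example_syscore_resume,
};

static int __init example_syscore_init(void)
{
        register_syscore_ops(&example_syscore_ops);
        return 0;
}
core_initcall(example_syscore_init);
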
@@ -459,14 +459,12 @@ static int resume_target_kernel(bool platform_mode)
459} 459}
460 460
461/** 461/**
462 * hibernation_restore - quiesce devices and restore the hibernation 462 * hibernation_restore - Quiesce devices and restore from a hibernation image.
463 * snapshot image. If successful, control returns in hibernation_snaphot() 463 * @platform_mode: If set, use platform driver to prepare for the transition.
464 * @platform_mode - if set, use the platform driver, if available, to
465 * prepare the platform firmware for the transition.
466 * 464 *
467 * Must be called with pm_mutex held 465 * This routine must be called with pm_mutex held. If it is successful, control
 466 * reappears in the restored target kernel in hibernation_snapshot().
468 */ 467 */
469
470int hibernation_restore(int platform_mode) 468int hibernation_restore(int platform_mode)
471{ 469{
472 int error; 470 int error;
@@ -486,10 +484,8 @@ int hibernation_restore(int platform_mode)
486} 484}
487 485
488/** 486/**
489 * hibernation_platform_enter - enter the hibernation state using the 487 * hibernation_platform_enter - Power off the system using the platform driver.
490 * platform driver (if available)
491 */ 488 */
492
493int hibernation_platform_enter(void) 489int hibernation_platform_enter(void)
494{ 490{
495 int error; 491 int error;
@@ -528,7 +524,6 @@ int hibernation_platform_enter(void)
528 goto Platform_finish; 524 goto Platform_finish;
529 525
530 local_irq_disable(); 526 local_irq_disable();
531 sysdev_suspend(PMSG_HIBERNATE);
532 syscore_suspend(); 527 syscore_suspend();
533 if (pm_wakeup_pending()) { 528 if (pm_wakeup_pending()) {
534 error = -EAGAIN; 529 error = -EAGAIN;
@@ -541,7 +536,6 @@ int hibernation_platform_enter(void)
541 536
542 Power_up: 537 Power_up:
543 syscore_resume(); 538 syscore_resume();
544 sysdev_resume();
545 local_irq_enable(); 539 local_irq_enable();
546 enable_nonboot_cpus(); 540 enable_nonboot_cpus();
547 541
@@ -562,12 +556,12 @@ int hibernation_platform_enter(void)
562} 556}
563 557
564/** 558/**
565 * power_down - Shut the machine down for hibernation. 559 * power_down - Shut the machine down for hibernation.
566 * 560 *
567 * Use the platform driver, if configured so; otherwise try 561 * Use the platform driver, if configured, to put the system into the sleep
568 * to power off or reboot. 562 * state corresponding to hibernation, or try to power it off or reboot,
563 * depending on the value of hibernation_mode.
569 */ 564 */
570
571static void power_down(void) 565static void power_down(void)
572{ 566{
573 switch (hibernation_mode) { 567 switch (hibernation_mode) {
@@ -604,9 +598,8 @@ static int prepare_processes(void)
604} 598}
605 599
606/** 600/**
607 * hibernate - The granpappy of the built-in hibernation management 601 * hibernate - Carry out system hibernation, including saving the image.
608 */ 602 */
609
610int hibernate(void) 603int hibernate(void)
611{ 604{
612 int error; 605 int error;
@@ -684,17 +677,20 @@ int hibernate(void)
684 677
685 678
686/** 679/**
687 * software_resume - Resume from a saved image. 680 * software_resume - Resume from a saved hibernation image.
681 *
682 * This routine is called as a late initcall, when all devices have been
683 * discovered and initialized already.
688 * 684 *
689 * Called as a late_initcall (so all devices are discovered and 685 * The image reading code is called to see if there is a hibernation image
690 * initialized), we call swsusp to see if we have a saved image or not. 686 * available for reading. If that is the case, devices are quiesced and the
 691 * If so, we quiesce devices, the restore the saved image. We will 687 * contents of memory are restored from the saved image.
692 * return above (in hibernate() ) if everything goes well.
693 * Otherwise, we fail gracefully and return to the normally
694 * scheduled program.
695 * 688 *
689 * If this is successful, control reappears in the restored target kernel in
 690 * hibernation_snapshot(), which returns to hibernate(). Otherwise, the routine
691 * attempts to recover gracefully and make the kernel return to the normal mode
692 * of operation.
696 */ 693 */
697
698static int software_resume(void) 694static int software_resume(void)
699{ 695{
700 int error; 696 int error;
@@ -824,21 +820,17 @@ static const char * const hibernation_modes[] = {
824 [HIBERNATION_TESTPROC] = "testproc", 820 [HIBERNATION_TESTPROC] = "testproc",
825}; 821};
826 822
827/** 823/*
828 * disk - Control hibernation mode 824 * /sys/power/disk - Control hibernation mode.
829 *
830 * Suspend-to-disk can be handled in several ways. We have a few options
831 * for putting the system to sleep - using the platform driver (e.g. ACPI
832 * or other hibernation_ops), powering off the system or rebooting the
833 * system (for testing) as well as the two test modes.
834 * 825 *
835 * The system can support 'platform', and that is known a priori (and 826 * Hibernation can be handled in several ways. There are a few different ways
836 * encoded by the presence of hibernation_ops). However, the user may 827 * to put the system into the sleep state: using the platform driver (e.g. ACPI
837 * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the 828 * or other hibernation_ops), powering it off or rebooting it (for testing
838 * test modes, 'test' or 'testproc'. 829 * mostly), or using one of the two available test modes.
839 * 830 *
840 * show() will display what the mode is currently set to. 831 * The sysfs file /sys/power/disk provides an interface for selecting the
841 * store() will accept one of 832 * hibernation mode to use. Reading from this file causes the available modes
833 * to be printed. There are 5 modes that can be supported:
842 * 834 *
843 * 'platform' 835 * 'platform'
844 * 'shutdown' 836 * 'shutdown'
@@ -846,8 +838,14 @@ static const char * const hibernation_modes[] = {
846 * 'test' 838 * 'test'
847 * 'testproc' 839 * 'testproc'
848 * 840 *
849 * It will only change to 'platform' if the system 841 * If a platform hibernation driver is in use, 'platform' will be supported
850 * supports it (as determined by having hibernation_ops). 842 * and will be used by default. Otherwise, 'shutdown' will be used by default.
843 * The selected option (i.e. the one corresponding to the current value of
 844 * hibernation_mode) is enclosed in square brackets.
845 *
846 * To select a given hibernation mode it is necessary to write the mode's
847 * string representation (as returned by reading from /sys/power/disk) back
848 * into /sys/power/disk.
851 */ 849 */
852 850
853static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, 851static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
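
The /sys/power/disk documentation above boils down to plain sysfs traffic from userspace. A minimal sketch, assuming /sys/power/disk exists and the caller has permission to write to it; note that selecting a mode does not by itself start hibernation:

#include <stdio.h>

int main(void)
{
        char modes[128];
        FILE *f = fopen("/sys/power/disk", "r");

        if (f && fgets(modes, sizeof(modes), f))
                printf("available modes: %s", modes); /* current one in [] */
        if (f)
                fclose(f);

        f = fopen("/sys/power/disk", "w");
        if (!f) {
                perror("/sys/power/disk");
                return 1;
        }
        fputs("shutdown\n", f);         /* one of the strings listed above */
        return fclose(f) ? 1 : 0;
}
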
@@ -880,7 +878,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
880 return buf-start; 878 return buf-start;
881} 879}
882 880
883
884static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, 881static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
885 const char *buf, size_t n) 882 const char *buf, size_t n)
886{ 883{
@@ -982,10 +979,33 @@ static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *att
982 979
983power_attr(image_size); 980power_attr(image_size);
984 981
982static ssize_t reserved_size_show(struct kobject *kobj,
983 struct kobj_attribute *attr, char *buf)
984{
985 return sprintf(buf, "%lu\n", reserved_size);
986}
987
988static ssize_t reserved_size_store(struct kobject *kobj,
989 struct kobj_attribute *attr,
990 const char *buf, size_t n)
991{
992 unsigned long size;
993
994 if (sscanf(buf, "%lu", &size) == 1) {
995 reserved_size = size;
996 return n;
997 }
998
999 return -EINVAL;
1000}
1001
1002power_attr(reserved_size);
1003
985static struct attribute * g[] = { 1004static struct attribute * g[] = {
986 &disk_attr.attr, 1005 &disk_attr.attr,
987 &resume_attr.attr, 1006 &resume_attr.attr,
988 &image_size_attr.attr, 1007 &image_size_attr.attr,
1008 &reserved_size_attr.attr,
989 NULL, 1009 NULL,
990}; 1010};
991 1011
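
The power_attr(reserved_size) line at the end of the hunk is what actually creates /sys/power/reserved_size from the show/store pair added above. A sketch of roughly what it expands to, based on the power_attr() macro in kernel/power/power.h (part of its definition is visible in the power.h hunk further down); the 0644 mode is an assumption taken from that macro's usual definition:

static struct kobj_attribute reserved_size_attr = {
        .attr = {
                .name = "reserved_size",
                .mode = 0644,                   /* assumed from power_attr() */
        },
        .show   = reserved_size_show,
        .store  = reserved_size_store,
};
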
diff --git a/kernel/power/main.c b/kernel/power/main.c
index de9aef8742f4..6c601f871964 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -37,8 +37,9 @@ EXPORT_SYMBOL_GPL(unregister_pm_notifier);
37 37
38int pm_notifier_call_chain(unsigned long val) 38int pm_notifier_call_chain(unsigned long val)
39{ 39{
40 return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) 40 int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL);
41 == NOTIFY_BAD) ? -EINVAL : 0; 41
42 return notifier_to_errno(ret);
42} 43}
43 44
44/* If set, devices may be suspended and resumed asynchronously. */ 45/* If set, devices may be suspended and resumed asynchronously. */
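
With notifier_to_errno() in place, a PM notifier can veto a transition with a specific error code instead of everything collapsing to -EINVAL. A minimal sketch with invented example_ names; register_pm_notifier(), notifier_from_errno() and the PM_* events are the existing notifier API:

#include <linux/errno.h>
#include <linux/notifier.h>
#include <linux/suspend.h>
#include <linux/types.h>

/* Hypothetical driver-specific check, stubbed out for the sketch. */
static bool example_device_busy(void)
{
        return false;
}

static int example_pm_notifier(struct notifier_block *nb,
                               unsigned long event, void *unused)
{
        if (event == PM_HIBERNATION_PREPARE && example_device_busy())
                return notifier_from_errno(-EBUSY); /* caller sees -EBUSY */
        return NOTIFY_OK;
}

static struct notifier_block example_pm_nb = {
        .notifier_call = example_pm_notifier,
};

/* register_pm_notifier(&example_pm_nb) from the driver's init path */
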
@@ -337,6 +338,7 @@ static int __init pm_init(void)
337 if (error) 338 if (error)
338 return error; 339 return error;
339 hibernate_image_size_init(); 340 hibernate_image_size_init();
341 hibernate_reserved_size_init();
340 power_kobj = kobject_create_and_add("power", NULL); 342 power_kobj = kobject_create_and_add("power", NULL);
341 if (!power_kobj) 343 if (!power_kobj)
342 return -ENOMEM; 344 return -ENOMEM;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 03634be55f62..9a00a0a26280 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -15,6 +15,7 @@ struct swsusp_info {
15 15
16#ifdef CONFIG_HIBERNATION 16#ifdef CONFIG_HIBERNATION
17/* kernel/power/snapshot.c */ 17/* kernel/power/snapshot.c */
18extern void __init hibernate_reserved_size_init(void);
18extern void __init hibernate_image_size_init(void); 19extern void __init hibernate_image_size_init(void);
19 20
20#ifdef CONFIG_ARCH_HIBERNATION_HEADER 21#ifdef CONFIG_ARCH_HIBERNATION_HEADER
@@ -55,6 +56,7 @@ extern int hibernation_platform_enter(void);
55 56
56#else /* !CONFIG_HIBERNATION */ 57#else /* !CONFIG_HIBERNATION */
57 58
59static inline void hibernate_reserved_size_init(void) {}
58static inline void hibernate_image_size_init(void) {} 60static inline void hibernate_image_size_init(void) {}
59#endif /* !CONFIG_HIBERNATION */ 61#endif /* !CONFIG_HIBERNATION */
60 62
@@ -72,6 +74,8 @@ static struct kobj_attribute _name##_attr = { \
72 74
73/* Preferred image size in bytes (default 500 MB) */ 75/* Preferred image size in bytes (default 500 MB) */
74extern unsigned long image_size; 76extern unsigned long image_size;
77/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */
78extern unsigned long reserved_size;
75extern int in_suspend; 79extern int in_suspend;
76extern dev_t swsusp_resume_device; 80extern dev_t swsusp_resume_device;
77extern sector_t swsusp_resume_block; 81extern sector_t swsusp_resume_block;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index ca0aacc24874..06efa54f93d6 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -41,16 +41,28 @@ static void swsusp_set_page_forbidden(struct page *);
41static void swsusp_unset_page_forbidden(struct page *); 41static void swsusp_unset_page_forbidden(struct page *);
42 42
43/* 43/*
44 * Number of bytes to reserve for memory allocations made by device drivers
45 * from their ->freeze() and ->freeze_noirq() callbacks so that they don't
46 * cause image creation to fail (tunable via /sys/power/reserved_size).
47 */
48unsigned long reserved_size;
49
50void __init hibernate_reserved_size_init(void)
51{
52 reserved_size = SPARE_PAGES * PAGE_SIZE;
53}
54
55/*
44 * Preferred image size in bytes (tunable via /sys/power/image_size). 56 * Preferred image size in bytes (tunable via /sys/power/image_size).
45 * When it is set to N, the image creating code will do its best to 57 * When it is set to N, swsusp will do its best to ensure the image
46 * ensure the image size will not exceed N bytes, but if that is 58 * size will not exceed N bytes, but if that is impossible, it will
47 * impossible, it will try to create the smallest image possible. 59 * try to create the smallest image possible.
48 */ 60 */
49unsigned long image_size; 61unsigned long image_size;
50 62
51void __init hibernate_image_size_init(void) 63void __init hibernate_image_size_init(void)
52{ 64{
53 image_size = (totalram_pages / 3) * PAGE_SIZE; 65 image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
54} 66}
55 67
56/* List of PBEs needed for restoring the pages that were allocated before 68/* List of PBEs needed for restoring the pages that were allocated before
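
For a sense of scale: with 4 KiB pages on a 2 GiB machine (totalram_pages = 524288), the old default image_size of one third of RAM worked out to roughly 683 MiB, while the new two-fifths default is roughly 819 MiB. reserved_size starts at SPARE_PAGES * PAGE_SIZE, which the definition in kernel/power/power.h has historically put at about 1 MiB. These figures are illustrative arithmetic, not measurements from a particular system.
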
@@ -1199,7 +1211,11 @@ static void free_unnecessary_pages(void)
1199 to_free_highmem = alloc_highmem - save; 1211 to_free_highmem = alloc_highmem - save;
1200 } else { 1212 } else {
1201 to_free_highmem = 0; 1213 to_free_highmem = 0;
1202 to_free_normal -= save - alloc_highmem; 1214 save -= alloc_highmem;
1215 if (to_free_normal > save)
1216 to_free_normal -= save;
1217 else
1218 to_free_normal = 0;
1203 } 1219 }
1204 1220
1205 memory_bm_position_reset(&copy_bm); 1221 memory_bm_position_reset(&copy_bm);
@@ -1263,11 +1279,13 @@ static unsigned long minimum_image_size(unsigned long saveable)
1263 * frame in use. We also need a number of page frames to be free during 1279 * frame in use. We also need a number of page frames to be free during
1264 * hibernation for allocations made while saving the image and for device 1280 * hibernation for allocations made while saving the image and for device
1265 * drivers, in case they need to allocate memory from their hibernation 1281 * drivers, in case they need to allocate memory from their hibernation
1266 * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES, 1282 * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough
1267 * respectively, both of which are rough estimates). To make this happen, we 1283 * estimate) and reserved_size divided by PAGE_SIZE (which is tunable through
1268 * compute the total number of available page frames and allocate at least 1284 * /sys/power/reserved_size, respectively). To make this happen, we compute the
1285 * total number of available page frames and allocate at least
1269 * 1286 *
1270 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES 1287 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2
1288 * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
1271 * 1289 *
1272 * of them, which corresponds to the maximum size of a hibernation image. 1290 * of them, which corresponds to the maximum size of a hibernation image.
1273 * 1291 *
@@ -1322,7 +1340,8 @@ int hibernate_preallocate_memory(void)
1322 count -= totalreserve_pages; 1340 count -= totalreserve_pages;
1323 1341
1324 /* Compute the maximum number of saveable pages to leave in memory. */ 1342 /* Compute the maximum number of saveable pages to leave in memory. */
1325 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; 1343 max_size = (count - (size + PAGES_FOR_IO)) / 2
1344 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
1326 /* Compute the desired number of image pages specified by image_size. */ 1345 /* Compute the desired number of image pages specified by image_size. */
1327 size = DIV_ROUND_UP(image_size, PAGE_SIZE); 1346 size = DIV_ROUND_UP(image_size, PAGE_SIZE);
1328 if (size > max_size) 1347 if (size > max_size)
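
The max_size expression above is easier to read with concrete numbers plugged in. A small userspace sketch of the same arithmetic; the page counts and the PAGES_FOR_IO value are assumptions chosen for the illustration, not values read from a real machine:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long page_size = 4096;
        unsigned long count = 500000;           /* assumed usable page frames */
        unsigned long meta = 1000;              /* assumed metadata pages */
        unsigned long pages_for_io = 1024;      /* assumed PAGES_FOR_IO */
        unsigned long reserved_size = 1024 * 1024;      /* 1 MiB reserve */
        unsigned long max_size;

        max_size = (count - (meta + pages_for_io)) / 2
                   - 2 * DIV_ROUND_UP(reserved_size, page_size);
        printf("max image size: %lu pages (~%lu MiB)\n",
               max_size, max_size * page_size >> 20);
        return 0;
}

With these inputs it prints 248476 pages, about 970 MiB.
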
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8935369d503a..b6b71ad2208f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -44,6 +44,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops)
44 suspend_ops = ops; 44 suspend_ops = ops;
45 mutex_unlock(&pm_mutex); 45 mutex_unlock(&pm_mutex);
46} 46}
47EXPORT_SYMBOL_GPL(suspend_set_ops);
47 48
48bool valid_state(suspend_state_t state) 49bool valid_state(suspend_state_t state)
49{ 50{
@@ -65,6 +66,7 @@ int suspend_valid_only_mem(suspend_state_t state)
65{ 66{
66 return state == PM_SUSPEND_MEM; 67 return state == PM_SUSPEND_MEM;
67} 68}
69EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
68 70
69static int suspend_test(int level) 71static int suspend_test(int level)
70{ 72{
@@ -126,12 +128,13 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
126} 128}
127 129
128/** 130/**
129 * suspend_enter - enter the desired system sleep state. 131 * suspend_enter - enter the desired system sleep state.
130 * @state: state to enter 132 * @state: State to enter
133 * @wakeup: Returns information that suspend should not be entered again.
131 * 134 *
132 * This function should be called after devices have been suspended. 135 * This function should be called after devices have been suspended.
133 */ 136 */
134static int suspend_enter(suspend_state_t state) 137static int suspend_enter(suspend_state_t state, bool *wakeup)
135{ 138{
136 int error; 139 int error;
137 140
@@ -163,19 +166,14 @@ static int suspend_enter(suspend_state_t state)
163 arch_suspend_disable_irqs(); 166 arch_suspend_disable_irqs();
164 BUG_ON(!irqs_disabled()); 167 BUG_ON(!irqs_disabled());
165 168
166 error = sysdev_suspend(PMSG_SUSPEND); 169 error = syscore_suspend();
167 if (!error) { 170 if (!error) {
168 error = syscore_suspend(); 171 *wakeup = pm_wakeup_pending();
169 if (error) 172 if (!(suspend_test(TEST_CORE) || *wakeup)) {
170 sysdev_resume();
171 }
172 if (!error) {
173 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
174 error = suspend_ops->enter(state); 173 error = suspend_ops->enter(state);
175 events_check_enabled = false; 174 events_check_enabled = false;
176 } 175 }
177 syscore_resume(); 176 syscore_resume();
178 sysdev_resume();
179 } 177 }
180 178
181 arch_suspend_enable_irqs(); 179 arch_suspend_enable_irqs();
@@ -205,6 +203,7 @@ static int suspend_enter(suspend_state_t state)
205int suspend_devices_and_enter(suspend_state_t state) 203int suspend_devices_and_enter(suspend_state_t state)
206{ 204{
207 int error; 205 int error;
206 bool wakeup = false;
208 207
209 if (!suspend_ops) 208 if (!suspend_ops)
210 return -ENOSYS; 209 return -ENOSYS;
@@ -216,7 +215,6 @@ int suspend_devices_and_enter(suspend_state_t state)
216 goto Close; 215 goto Close;
217 } 216 }
218 suspend_console(); 217 suspend_console();
219 pm_restrict_gfp_mask();
220 suspend_test_start(); 218 suspend_test_start();
221 error = dpm_suspend_start(PMSG_SUSPEND); 219 error = dpm_suspend_start(PMSG_SUSPEND);
222 if (error) { 220 if (error) {
@@ -227,13 +225,15 @@ int suspend_devices_and_enter(suspend_state_t state)
227 if (suspend_test(TEST_DEVICES)) 225 if (suspend_test(TEST_DEVICES))
228 goto Recover_platform; 226 goto Recover_platform;
229 227
230 suspend_enter(state); 228 do {
229 error = suspend_enter(state, &wakeup);
230 } while (!error && !wakeup
231 && suspend_ops->suspend_again && suspend_ops->suspend_again());
231 232
232 Resume_devices: 233 Resume_devices:
233 suspend_test_start(); 234 suspend_test_start();
234 dpm_resume_end(PMSG_RESUME); 235 dpm_resume_end(PMSG_RESUME);
235 suspend_test_finish("resume devices"); 236 suspend_test_finish("resume devices");
236 pm_restore_gfp_mask();
237 resume_console(); 237 resume_console();
238 Close: 238 Close:
239 if (suspend_ops->end) 239 if (suspend_ops->end)
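
The do/while loop above is what makes the new suspend_again() hook useful: a platform can service a periodic wakeup and drop straight back into suspend without resuming every device. A minimal sketch of a platform_suspend_ops using it, with invented example_ names; the ops structure, suspend_set_ops() and suspend_valid_only_mem() (both exported in this patch) are the real interface:

#include <linux/suspend.h>

/* Hypothetical check: did only the housekeeping timer wake us? */
static bool example_wakeup_was_housekeeping_only(void)
{
        return false;
}

static int example_enter(suspend_state_t state)
{
        /* put the SoC into its low-power state for @state */
        return 0;
}

static bool example_suspend_again(void)
{
        /* true means suspend_enter() is called again right away */
        return example_wakeup_was_housekeeping_only();
}

static const struct platform_suspend_ops example_suspend_ops = {
        .valid          = suspend_valid_only_mem,
        .enter          = example_enter,
        .suspend_again  = example_suspend_again,
};

/* suspend_set_ops(&example_suspend_ops) from the platform's init code */
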
@@ -294,7 +294,9 @@ int enter_state(suspend_state_t state)
294 goto Finish; 294 goto Finish;
295 295
296 pr_debug("PM: Entering %s sleep\n", pm_states[state]); 296 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
297 pm_restrict_gfp_mask();
297 error = suspend_devices_and_enter(state); 298 error = suspend_devices_and_enter(state);
299 pm_restore_gfp_mask();
298 300
299 Finish: 301 Finish:
300 pr_debug("PM: Finishing wakeup.\n"); 302 pr_debug("PM: Finishing wakeup.\n");
diff --git a/kernel/power/user.c b/kernel/power/user.c
index c36c3b9e8a84..42ddbc6f0de6 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -113,8 +113,10 @@ static int snapshot_open(struct inode *inode, struct file *filp)
113 if (error) 113 if (error)
114 pm_notifier_call_chain(PM_POST_RESTORE); 114 pm_notifier_call_chain(PM_POST_RESTORE);
115 } 115 }
116 if (error) 116 if (error) {
117 free_basic_memory_bitmaps();
117 atomic_inc(&snapshot_device_available); 118 atomic_inc(&snapshot_device_available);
119 }
118 data->frozen = 0; 120 data->frozen = 0;
119 data->ready = 0; 121 data->ready = 0;
120 data->platform_support = 0; 122 data->platform_support = 0;
@@ -135,8 +137,10 @@ static int snapshot_release(struct inode *inode, struct file *filp)
135 free_basic_memory_bitmaps(); 137 free_basic_memory_bitmaps();
136 data = filp->private_data; 138 data = filp->private_data;
137 free_all_swap_pages(data->swap); 139 free_all_swap_pages(data->swap);
138 if (data->frozen) 140 if (data->frozen) {
141 pm_restore_gfp_mask();
139 thaw_processes(); 142 thaw_processes();
143 }
140 pm_notifier_call_chain(data->mode == O_RDONLY ? 144 pm_notifier_call_chain(data->mode == O_RDONLY ?
141 PM_POST_HIBERNATION : PM_POST_RESTORE); 145 PM_POST_HIBERNATION : PM_POST_RESTORE);
142 atomic_inc(&snapshot_device_available); 146 atomic_inc(&snapshot_device_available);
@@ -379,6 +383,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
379 * PM_HIBERNATION_PREPARE 383 * PM_HIBERNATION_PREPARE
380 */ 384 */
381 error = suspend_devices_and_enter(PM_SUSPEND_MEM); 385 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
386 data->ready = 0;
382 break; 387 break;
383 388
384 case SNAPSHOT_PLATFORM_SUPPORT: 389 case SNAPSHOT_PLATFORM_SUPPORT:
diff --git a/kernel/printk.c b/kernel/printk.c
index da8ca817eae3..836a2ae0ac31 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -31,6 +31,7 @@
31#include <linux/smp.h> 31#include <linux/smp.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/memblock.h>
34#include <linux/syscalls.h> 35#include <linux/syscalls.h>
35#include <linux/kexec.h> 36#include <linux/kexec.h>
36#include <linux/kdb.h> 37#include <linux/kdb.h>
@@ -167,46 +168,74 @@ void log_buf_kexec_setup(void)
167} 168}
168#endif 169#endif
169 170
171/* requested log_buf_len from kernel cmdline */
172static unsigned long __initdata new_log_buf_len;
173
174/* save requested log_buf_len since it's too early to process it */
170static int __init log_buf_len_setup(char *str) 175static int __init log_buf_len_setup(char *str)
171{ 176{
172 unsigned size = memparse(str, &str); 177 unsigned size = memparse(str, &str);
173 unsigned long flags;
174 178
175 if (size) 179 if (size)
176 size = roundup_pow_of_two(size); 180 size = roundup_pow_of_two(size);
177 if (size > log_buf_len) { 181 if (size > log_buf_len)
178 unsigned start, dest_idx, offset; 182 new_log_buf_len = size;
179 char *new_log_buf;
180 183
181 new_log_buf = alloc_bootmem(size); 184 return 0;
182 if (!new_log_buf) { 185}
183 printk(KERN_WARNING "log_buf_len: allocation failed\n"); 186early_param("log_buf_len", log_buf_len_setup);
184 goto out;
185 }
186 187
187 spin_lock_irqsave(&logbuf_lock, flags); 188void __init setup_log_buf(int early)
188 log_buf_len = size; 189{
189 log_buf = new_log_buf; 190 unsigned long flags;
190 191 unsigned start, dest_idx, offset;
191 offset = start = min(con_start, log_start); 192 char *new_log_buf;
192 dest_idx = 0; 193 int free;
193 while (start != log_end) { 194
194 log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)]; 195 if (!new_log_buf_len)
195 start++; 196 return;
196 dest_idx++; 197
197 } 198 if (early) {
198 log_start -= offset; 199 unsigned long mem;
199 con_start -= offset;
200 log_end -= offset;
201 spin_unlock_irqrestore(&logbuf_lock, flags);
202 200
203 printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); 201 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
202 if (mem == MEMBLOCK_ERROR)
203 return;
204 new_log_buf = __va(mem);
205 } else {
206 new_log_buf = alloc_bootmem_nopanic(new_log_buf_len);
204 } 207 }
205out:
206 return 1;
207}
208 208
209__setup("log_buf_len=", log_buf_len_setup); 209 if (unlikely(!new_log_buf)) {
210 pr_err("log_buf_len: %ld bytes not available\n",
211 new_log_buf_len);
212 return;
213 }
214
215 spin_lock_irqsave(&logbuf_lock, flags);
216 log_buf_len = new_log_buf_len;
217 log_buf = new_log_buf;
218 new_log_buf_len = 0;
219 free = __LOG_BUF_LEN - log_end;
220
221 offset = start = min(con_start, log_start);
222 dest_idx = 0;
223 while (start != log_end) {
224 unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1);
225
226 log_buf[dest_idx] = __log_buf[log_idx_mask];
227 start++;
228 dest_idx++;
229 }
230 log_start -= offset;
231 con_start -= offset;
232 log_end -= offset;
233 spin_unlock_irqrestore(&logbuf_lock, flags);
234
235 pr_info("log_buf_len: %d\n", log_buf_len);
236 pr_info("early log buf free: %d(%d%%)\n",
237 free, (free * 100) / __LOG_BUF_LEN);
238}
210 239
211#ifdef CONFIG_BOOT_PRINTK_DELAY 240#ifdef CONFIG_BOOT_PRINTK_DELAY
212 241
@@ -289,8 +318,10 @@ static int check_syslog_permissions(int type, bool from_file)
289 return 0; 318 return 0;
290 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ 319 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
291 if (capable(CAP_SYS_ADMIN)) { 320 if (capable(CAP_SYS_ADMIN)) {
292 WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN " 321 printk_once(KERN_WARNING "%s (%d): "
293 "but no CAP_SYSLOG (deprecated).\n"); 322 "Attempt to access syslog with CAP_SYS_ADMIN "
323 "but no CAP_SYSLOG (deprecated).\n",
324 current->comm, task_pid_nr(current));
294 return 0; 325 return 0;
295 } 326 }
296 return -EPERM; 327 return -EPERM;
@@ -753,7 +784,7 @@ static inline int can_use_console(unsigned int cpu)
753static int console_trylock_for_printk(unsigned int cpu) 784static int console_trylock_for_printk(unsigned int cpu)
754 __releases(&logbuf_lock) 785 __releases(&logbuf_lock)
755{ 786{
756 int retval = 0; 787 int retval = 0, wake = 0;
757 788
758 if (console_trylock()) { 789 if (console_trylock()) {
759 retval = 1; 790 retval = 1;
@@ -766,12 +797,14 @@ static int console_trylock_for_printk(unsigned int cpu)
766 */ 797 */
767 if (!can_use_console(cpu)) { 798 if (!can_use_console(cpu)) {
768 console_locked = 0; 799 console_locked = 0;
769 up(&console_sem); 800 wake = 1;
770 retval = 0; 801 retval = 0;
771 } 802 }
772 } 803 }
773 printk_cpu = UINT_MAX; 804 printk_cpu = UINT_MAX;
774 spin_unlock(&logbuf_lock); 805 spin_unlock(&logbuf_lock);
806 if (wake)
807 up(&console_sem);
775 return retval; 808 return retval;
776} 809}
777static const char recursion_bug_msg [] = 810static const char recursion_bug_msg [] =
@@ -1213,7 +1246,7 @@ void console_unlock(void)
1213{ 1246{
1214 unsigned long flags; 1247 unsigned long flags;
1215 unsigned _con_start, _log_end; 1248 unsigned _con_start, _log_end;
1216 unsigned wake_klogd = 0; 1249 unsigned wake_klogd = 0, retry = 0;
1217 1250
1218 if (console_suspended) { 1251 if (console_suspended) {
1219 up(&console_sem); 1252 up(&console_sem);
@@ -1222,6 +1255,7 @@ void console_unlock(void)
1222 1255
1223 console_may_schedule = 0; 1256 console_may_schedule = 0;
1224 1257
1258again:
1225 for ( ; ; ) { 1259 for ( ; ; ) {
1226 spin_lock_irqsave(&logbuf_lock, flags); 1260 spin_lock_irqsave(&logbuf_lock, flags);
1227 wake_klogd |= log_start - log_end; 1261 wake_klogd |= log_start - log_end;
@@ -1242,8 +1276,23 @@ void console_unlock(void)
1242 if (unlikely(exclusive_console)) 1276 if (unlikely(exclusive_console))
1243 exclusive_console = NULL; 1277 exclusive_console = NULL;
1244 1278
1279 spin_unlock(&logbuf_lock);
1280
1245 up(&console_sem); 1281 up(&console_sem);
1282
1283 /*
1284 * Someone could have filled up the buffer again, so re-check if there's
1285 * something to flush. In case we cannot trylock the console_sem again,
1286 * there's a new owner and the console_unlock() from them will do the
1287 * flush, no worries.
1288 */
1289 spin_lock(&logbuf_lock);
1290 if (con_start != log_end)
1291 retry = 1;
1246 spin_unlock_irqrestore(&logbuf_lock, flags); 1292 spin_unlock_irqrestore(&logbuf_lock, flags);
1293 if (retry && console_trylock())
1294 goto again;
1295
1247 if (wake_klogd) 1296 if (wake_klogd)
1248 wake_up_klogd(); 1297 wake_up_klogd();
1249} 1298}
diff --git a/kernel/profile.c b/kernel/profile.c
index 66f841b7fbd3..961b389fe52f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -126,11 +126,9 @@ int __ref profile_init(void)
126 if (prof_buffer) 126 if (prof_buffer)
127 return 0; 127 return 0;
128 128
129 prof_buffer = vmalloc(buffer_bytes); 129 prof_buffer = vzalloc(buffer_bytes);
130 if (prof_buffer) { 130 if (prof_buffer)
131 memset(prof_buffer, 0, buffer_bytes);
132 return 0; 131 return 0;
133 }
134 132
135 free_cpumask_var(prof_cpu_mask); 133 free_cpumask_var(prof_cpu_mask);
136 return -ENOMEM; 134 return -ENOMEM;
@@ -305,14 +303,12 @@ static void profile_discard_flip_buffers(void)
305 mutex_unlock(&profile_flip_mutex); 303 mutex_unlock(&profile_flip_mutex);
306} 304}
307 305
308void profile_hits(int type, void *__pc, unsigned int nr_hits) 306static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
309{ 307{
310 unsigned long primary, secondary, flags, pc = (unsigned long)__pc; 308 unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
311 int i, j, cpu; 309 int i, j, cpu;
312 struct profile_hit *hits; 310 struct profile_hit *hits;
313 311
314 if (prof_on != type || !prof_buffer)
315 return;
316 pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); 312 pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
317 i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; 313 i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
318 secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; 314 secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
@@ -419,16 +415,20 @@ out_free:
419#define profile_discard_flip_buffers() do { } while (0) 415#define profile_discard_flip_buffers() do { } while (0)
420#define profile_cpu_callback NULL 416#define profile_cpu_callback NULL
421 417
422void profile_hits(int type, void *__pc, unsigned int nr_hits) 418static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
423{ 419{
424 unsigned long pc; 420 unsigned long pc;
425
426 if (prof_on != type || !prof_buffer)
427 return;
428 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; 421 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
429 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); 422 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
430} 423}
431#endif /* !CONFIG_SMP */ 424#endif /* !CONFIG_SMP */
425
426void profile_hits(int type, void *__pc, unsigned int nr_hits)
427{
428 if (prof_on != type || !prof_buffer)
429 return;
430 do_profile_hits(type, __pc, nr_hits);
431}
432EXPORT_SYMBOL_GPL(profile_hits); 432EXPORT_SYMBOL_GPL(profile_hits);
433 433
434void profile_tick(int type) 434void profile_tick(int type)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index dc7ab65f3b36..9de3ecfd20f9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -23,8 +23,15 @@
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/regset.h> 24#include <linux/regset.h>
25#include <linux/hw_breakpoint.h> 25#include <linux/hw_breakpoint.h>
26#include <linux/cn_proc.h>
26 27
27 28
29static int ptrace_trapping_sleep_fn(void *flags)
30{
31 schedule();
32 return 0;
33}
34
28/* 35/*
29 * ptrace a task: make the debugger its new parent and 36 * ptrace a task: make the debugger its new parent and
30 * move it to the ptrace list. 37 * move it to the ptrace list.
@@ -38,35 +45,33 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
38 child->parent = new_parent; 45 child->parent = new_parent;
39} 46}
40 47
41/* 48/**
42 * Turn a tracing stop into a normal stop now, since with no tracer there 49 * __ptrace_unlink - unlink ptracee and restore its execution state
43 * would be no way to wake it up with SIGCONT or SIGKILL. If there was a 50 * @child: ptracee to be unlinked
44 * signal sent that would resume the child, but didn't because it was in
45 * TASK_TRACED, resume it now.
46 * Requires that irqs be disabled.
47 */
48static void ptrace_untrace(struct task_struct *child)
49{
50 spin_lock(&child->sighand->siglock);
51 if (task_is_traced(child)) {
52 /*
53 * If the group stop is completed or in progress,
54 * this thread was already counted as stopped.
55 */
56 if (child->signal->flags & SIGNAL_STOP_STOPPED ||
57 child->signal->group_stop_count)
58 __set_task_state(child, TASK_STOPPED);
59 else
60 signal_wake_up(child, 1);
61 }
62 spin_unlock(&child->sighand->siglock);
63}
64
65/*
66 * unptrace a task: move it back to its original parent and
67 * remove it from the ptrace list.
68 * 51 *
69 * Must be called with the tasklist lock write-held. 52 * Remove @child from the ptrace list, move it back to the original parent,
53 * and restore the execution state so that it conforms to the group stop
54 * state.
55 *
56 * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer
57 * exiting. For PTRACE_DETACH, unless the ptracee has been killed between
58 * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED.
59 * If the ptracer is exiting, the ptracee can be in any state.
60 *
61 * After detach, the ptracee should be in a state which conforms to the
62 * group stop. If the group is stopped or in the process of stopping, the
63 * ptracee should be put into TASK_STOPPED; otherwise, it should be woken
64 * up from TASK_TRACED.
65 *
66 * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED,
67 * it goes through TRACED -> RUNNING -> STOPPED transition which is similar
68 * to but in the opposite direction of what happens while attaching to a
69 * stopped task. However, in this direction, the intermediate RUNNING
70 * state is not hidden even from the current ptracer and if it immediately
71 * re-attaches and performs a WNOHANG wait(2), it may fail.
72 *
73 * CONTEXT:
74 * write_lock_irq(tasklist_lock)
70 */ 75 */
71void __ptrace_unlink(struct task_struct *child) 76void __ptrace_unlink(struct task_struct *child)
72{ 77{
@@ -76,14 +81,54 @@ void __ptrace_unlink(struct task_struct *child)
76 child->parent = child->real_parent; 81 child->parent = child->real_parent;
77 list_del_init(&child->ptrace_entry); 82 list_del_init(&child->ptrace_entry);
78 83
79 if (task_is_traced(child)) 84 spin_lock(&child->sighand->siglock);
80 ptrace_untrace(child); 85
86 /*
87 * Clear all pending traps and TRAPPING. TRAPPING should be
88 * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly.
89 */
90 task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK);
91 task_clear_jobctl_trapping(child);
92
93 /*
94 * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and
95 * @child isn't dead.
96 */
97 if (!(child->flags & PF_EXITING) &&
98 (child->signal->flags & SIGNAL_STOP_STOPPED ||
99 child->signal->group_stop_count))
100 child->jobctl |= JOBCTL_STOP_PENDING;
101
102 /*
103 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
104 * @child in the butt. Note that @resume should be used iff @child
105 * is in TASK_TRACED; otherwise, we might unduly disrupt
106 * TASK_KILLABLE sleeps.
107 */
108 if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child))
109 signal_wake_up(child, task_is_traced(child));
110
111 spin_unlock(&child->sighand->siglock);
81} 112}
82 113
83/* 114/**
84 * Check that we have indeed attached to the thing.. 115 * ptrace_check_attach - check whether ptracee is ready for ptrace operation
116 * @child: ptracee to check for
117 * @ignore_state: don't check whether @child is currently %TASK_TRACED
118 *
119 * Check whether @child is being ptraced by %current and ready for further
120 * ptrace operations. If @ignore_state is %false, @child also should be in
121 * %TASK_TRACED state and on return the child is guaranteed to be traced
122 * and not executing. If @ignore_state is %true, @child can be in any
123 * state.
124 *
125 * CONTEXT:
126 * Grabs and releases tasklist_lock and @child->sighand->siglock.
127 *
128 * RETURNS:
129 * 0 on success, -ESRCH if %child is not ready.
85 */ 130 */
86int ptrace_check_attach(struct task_struct *child, int kill) 131int ptrace_check_attach(struct task_struct *child, bool ignore_state)
87{ 132{
88 int ret = -ESRCH; 133 int ret = -ESRCH;
89 134
@@ -96,21 +141,20 @@ int ptrace_check_attach(struct task_struct *child, int kill)
96 */ 141 */
97 read_lock(&tasklist_lock); 142 read_lock(&tasklist_lock);
98 if ((child->ptrace & PT_PTRACED) && child->parent == current) { 143 if ((child->ptrace & PT_PTRACED) && child->parent == current) {
99 ret = 0;
100 /* 144 /*
101 * child->sighand can't be NULL, release_task() 145 * child->sighand can't be NULL, release_task()
102 * does ptrace_unlink() before __exit_signal(). 146 * does ptrace_unlink() before __exit_signal().
103 */ 147 */
104 spin_lock_irq(&child->sighand->siglock); 148 spin_lock_irq(&child->sighand->siglock);
105 if (task_is_stopped(child)) 149 WARN_ON_ONCE(task_is_stopped(child));
106 child->state = TASK_TRACED; 150 if (ignore_state || (task_is_traced(child) &&
107 else if (!task_is_traced(child) && !kill) 151 !(child->jobctl & JOBCTL_LISTENING)))
108 ret = -ESRCH; 152 ret = 0;
109 spin_unlock_irq(&child->sighand->siglock); 153 spin_unlock_irq(&child->sighand->siglock);
110 } 154 }
111 read_unlock(&tasklist_lock); 155 read_unlock(&tasklist_lock);
112 156
113 if (!ret && !kill) 157 if (!ret && !ignore_state)
114 ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; 158 ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
115 159
116 /* All systems go.. */ 160 /* All systems go.. */
@@ -167,10 +211,28 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
167 return !err; 211 return !err;
168} 212}
169 213
170static int ptrace_attach(struct task_struct *task) 214static int ptrace_attach(struct task_struct *task, long request,
215 unsigned long flags)
171{ 216{
217 bool seize = (request == PTRACE_SEIZE);
172 int retval; 218 int retval;
173 219
220 /*
221 * SEIZE will enable new ptrace behaviors which will be implemented
222 * gradually. SEIZE_DEVEL is used to prevent applications
223 * expecting full SEIZE behaviors trapping on kernel commits which
224 * are still in the process of implementing them.
225 *
226 * Only test programs for new ptrace behaviors being implemented
227 * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO.
228 *
229 * Once SEIZE behaviors are completely implemented, this flag and
230 * the following test will be removed.
231 */
232 retval = -EIO;
233 if (seize && !(flags & PTRACE_SEIZE_DEVEL))
234 goto out;
235
174 audit_ptrace(task); 236 audit_ptrace(task);
175 237
176 retval = -EPERM; 238 retval = -EPERM;
@@ -202,11 +264,41 @@ static int ptrace_attach(struct task_struct *task)
202 goto unlock_tasklist; 264 goto unlock_tasklist;
203 265
204 task->ptrace = PT_PTRACED; 266 task->ptrace = PT_PTRACED;
267 if (seize)
268 task->ptrace |= PT_SEIZED;
205 if (task_ns_capable(task, CAP_SYS_PTRACE)) 269 if (task_ns_capable(task, CAP_SYS_PTRACE))
206 task->ptrace |= PT_PTRACE_CAP; 270 task->ptrace |= PT_PTRACE_CAP;
207 271
208 __ptrace_link(task, current); 272 __ptrace_link(task, current);
209 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); 273
274 /* SEIZE doesn't trap tracee on attach */
275 if (!seize)
276 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
277
278 spin_lock(&task->sighand->siglock);
279
280 /*
281 * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
282 * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
283 * will be cleared if the child completes the transition or any
284 * event which clears the group stop states happens. We'll wait
285 * for the transition to complete before returning from this
286 * function.
287 *
288 * This hides STOPPED -> RUNNING -> TRACED transition from the
289 * attaching thread but a different thread in the same group can
290 * still observe the transient RUNNING state. IOW, if another
291 * thread's WNOHANG wait(2) on the stopped tracee races against
292 * ATTACH, the wait(2) may fail due to the transient RUNNING.
293 *
294 * The following task_is_stopped() test is safe as both transitions
295 * in and out of STOPPED are protected by siglock.
296 */
297 if (task_is_stopped(task) &&
298 task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
299 signal_wake_up(task, 1);
300
301 spin_unlock(&task->sighand->siglock);
210 302
211 retval = 0; 303 retval = 0;
212unlock_tasklist: 304unlock_tasklist:
@@ -214,6 +306,12 @@ unlock_tasklist:
214unlock_creds: 306unlock_creds:
215 mutex_unlock(&task->signal->cred_guard_mutex); 307 mutex_unlock(&task->signal->cred_guard_mutex);
216out: 308out:
309 if (!retval) {
310 wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
311 ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE);
312 proc_ptrace_connector(task, PTRACE_ATTACH);
313 }
314
217 return retval; 315 return retval;
218} 316}
219 317
@@ -276,25 +374,27 @@ static int ignoring_children(struct sighand_struct *sigh)
276 */ 374 */
277static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) 375static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
278{ 376{
377 bool dead;
378
279 __ptrace_unlink(p); 379 __ptrace_unlink(p);
280 380
281 if (p->exit_state == EXIT_ZOMBIE) { 381 if (p->exit_state != EXIT_ZOMBIE)
282 if (!task_detached(p) && thread_group_empty(p)) { 382 return false;
283 if (!same_thread_group(p->real_parent, tracer)) 383
284 do_notify_parent(p, p->exit_signal); 384 dead = !thread_group_leader(p);
285 else if (ignoring_children(tracer->sighand)) { 385
286 __wake_up_parent(p, tracer); 386 if (!dead && thread_group_empty(p)) {
287 p->exit_signal = -1; 387 if (!same_thread_group(p->real_parent, tracer))
288 } 388 dead = do_notify_parent(p, p->exit_signal);
289 } 389 else if (ignoring_children(tracer->sighand)) {
290 if (task_detached(p)) { 390 __wake_up_parent(p, tracer);
291 /* Mark it as in the process of being reaped. */ 391 dead = true;
292 p->exit_state = EXIT_DEAD;
293 return true;
294 } 392 }
295 } 393 }
296 394 /* Mark it as in the process of being reaped. */
297 return false; 395 if (dead)
396 p->exit_state = EXIT_DEAD;
397 return dead;
298} 398}
299 399
300static int ptrace_detach(struct task_struct *child, unsigned int data) 400static int ptrace_detach(struct task_struct *child, unsigned int data)
@@ -316,11 +416,10 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
316 if (child->ptrace) { 416 if (child->ptrace) {
317 child->exit_code = data; 417 child->exit_code = data;
318 dead = __ptrace_detach(current, child); 418 dead = __ptrace_detach(current, child);
319 if (!child->exit_state)
320 wake_up_state(child, TASK_TRACED | TASK_STOPPED);
321 } 419 }
322 write_unlock_irq(&tasklist_lock); 420 write_unlock_irq(&tasklist_lock);
323 421
422 proc_ptrace_connector(child, PTRACE_DETACH);
324 if (unlikely(dead)) 423 if (unlikely(dead))
325 release_task(child); 424 release_task(child);
326 425
@@ -518,7 +617,7 @@ static int ptrace_resume(struct task_struct *child, long request,
518 } 617 }
519 618
520 child->exit_code = data; 619 child->exit_code = data;
521 wake_up_process(child); 620 wake_up_state(child, __TASK_TRACED);
522 621
523 return 0; 622 return 0;
524} 623}
@@ -567,10 +666,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
567int ptrace_request(struct task_struct *child, long request, 666int ptrace_request(struct task_struct *child, long request,
568 unsigned long addr, unsigned long data) 667 unsigned long addr, unsigned long data)
569{ 668{
669 bool seized = child->ptrace & PT_SEIZED;
570 int ret = -EIO; 670 int ret = -EIO;
571 siginfo_t siginfo; 671 siginfo_t siginfo, *si;
572 void __user *datavp = (void __user *) data; 672 void __user *datavp = (void __user *) data;
573 unsigned long __user *datalp = datavp; 673 unsigned long __user *datalp = datavp;
674 unsigned long flags;
574 675
575 switch (request) { 676 switch (request) {
576 case PTRACE_PEEKTEXT: 677 case PTRACE_PEEKTEXT:
@@ -603,6 +704,62 @@ int ptrace_request(struct task_struct *child, long request,
603 ret = ptrace_setsiginfo(child, &siginfo); 704 ret = ptrace_setsiginfo(child, &siginfo);
604 break; 705 break;
605 706
707 case PTRACE_INTERRUPT:
708 /*
709 * Stop tracee without any side-effect on signal or job
710 * control. At least one trap is guaranteed to happen
711 * after this request. If @child is already trapped, the
712 * current trap is not disturbed and another trap will
713 * happen after the current trap is ended with PTRACE_CONT.
714 *
715 * The actual trap might not be PTRACE_EVENT_STOP trap but
716 * the pending condition is cleared regardless.
717 */
718 if (unlikely(!seized || !lock_task_sighand(child, &flags)))
719 break;
720
721 /*
722 * INTERRUPT doesn't disturb existing trap sans one
723 * exception. If ptracer issued LISTEN for the current
724 * STOP, this INTERRUPT should clear LISTEN and re-trap
725 * tracee into STOP.
726 */
727 if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
728 signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
729
730 unlock_task_sighand(child, &flags);
731 ret = 0;
732 break;
733
734 case PTRACE_LISTEN:
735 /*
736 * Listen for events. Tracee must be in STOP. It's not
 737 * resumed per se but is not considered to be in TRACED by
738 * wait(2) or ptrace(2). If an async event (e.g. group
739 * stop state change) happens, tracee will enter STOP trap
740 * again. Alternatively, ptracer can issue INTERRUPT to
741 * finish listening and re-trap tracee into STOP.
742 */
743 if (unlikely(!seized || !lock_task_sighand(child, &flags)))
744 break;
745
746 si = child->last_siginfo;
747 if (unlikely(!si || si->si_code >> 8 != PTRACE_EVENT_STOP))
748 break;
749
750 child->jobctl |= JOBCTL_LISTENING;
751
752 /*
753 * If NOTIFY is set, it means event happened between start
754 * of this trap and now. Trigger re-trap immediately.
755 */
756 if (child->jobctl & JOBCTL_TRAP_NOTIFY)
757 signal_wake_up(child, true);
758
759 unlock_task_sighand(child, &flags);
760 ret = 0;
761 break;
762
606 case PTRACE_DETACH: /* detach a process that was attached. */ 763 case PTRACE_DETACH: /* detach a process that was attached. */
607 ret = ptrace_detach(child, data); 764 ret = ptrace_detach(child, data);
608 break; 765 break;
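
A userspace sketch of how the new requests fit together. The request numbers below are the ones used by this series and should be checked against the matching <linux/ptrace.h>; PTRACE_SEIZE_DEVEL is the opt-in flag the attach path above insists on while SEIZE behaviour is still being developed, and the target pid is taken from the command line:

#include <stdio.h>
#include <stdlib.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

#ifndef PTRACE_SEIZE
#define PTRACE_SEIZE            0x4206
#define PTRACE_INTERRUPT        0x4207
#define PTRACE_LISTEN           0x4208
#endif
#ifndef PTRACE_SEIZE_DEVEL
#define PTRACE_SEIZE_DEVEL      0x80000000
#endif

int main(int argc, char **argv)
{
        pid_t pid;
        int status;

        if (argc < 2) {
                fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                return 1;
        }
        pid = (pid_t)atoi(argv[1]);

        /* Attach without trapping the tracee (contrast with PTRACE_ATTACH). */
        if (ptrace(PTRACE_SEIZE, pid, NULL,
                   (void *)(unsigned long)PTRACE_SEIZE_DEVEL) < 0) {
                perror("PTRACE_SEIZE");
                return 1;
        }

        /* Request a trap; the tracee stops with a PTRACE_EVENT_STOP trap. */
        ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
        if (waitpid(pid, &status, 0) < 0) {
                perror("waitpid");
                return 1;
        }

        /*
         * LISTEN leaves the tracee stopped for job-control purposes while
         * the tracer waits for more events; another PTRACE_INTERRUPT (or a
         * group-stop state change) traps it into STOP again.
         */
        ptrace(PTRACE_LISTEN, pid, NULL, NULL);
        return 0;
}
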
@@ -717,8 +874,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
717 goto out; 874 goto out;
718 } 875 }
719 876
720 if (request == PTRACE_ATTACH) { 877 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
721 ret = ptrace_attach(child); 878 ret = ptrace_attach(child, request, data);
722 /* 879 /*
723 * Some architectures need to do book-keeping after 880 * Some architectures need to do book-keeping after
724 * a ptrace attach. 881 * a ptrace attach.
@@ -728,7 +885,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
728 goto out_put_task_struct; 885 goto out_put_task_struct;
729 } 886 }
730 887
731 ret = ptrace_check_attach(child, request == PTRACE_KILL); 888 ret = ptrace_check_attach(child, request == PTRACE_KILL ||
889 request == PTRACE_INTERRUPT);
732 if (ret < 0) 890 if (ret < 0)
733 goto out_put_task_struct; 891 goto out_put_task_struct;
734 892
@@ -859,8 +1017,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
859 goto out; 1017 goto out;
860 } 1018 }
861 1019
862 if (request == PTRACE_ATTACH) { 1020 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
863 ret = ptrace_attach(child); 1021 ret = ptrace_attach(child, request, data);
864 /* 1022 /*
865 * Some architectures need to do book-keeping after 1023 * Some architectures need to do book-keeping after
866 * a ptrace attach. 1024 * a ptrace attach.
@@ -870,7 +1028,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
870 goto out_put_task_struct; 1028 goto out_put_task_struct;
871 } 1029 }
872 1030
873 ret = ptrace_check_attach(child, request == PTRACE_KILL); 1031 ret = ptrace_check_attach(child, request == PTRACE_KILL ||
1032 request == PTRACE_INTERRUPT);
874 if (!ret) 1033 if (!ret)
875 ret = compat_arch_ptrace(child, request, addr, data); 1034 ret = compat_arch_ptrace(child, request, addr, data);
876 1035
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f3240e987928..ddddb320be61 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -37,7 +37,7 @@
37#include <linux/smp.h> 37#include <linux/smp.h>
38#include <linux/interrupt.h> 38#include <linux/interrupt.h>
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <asm/atomic.h> 40#include <linux/atomic.h>
41#include <linux/bitops.h> 41#include <linux/bitops.h>
42#include <linux/percpu.h> 42#include <linux/percpu.h>
43#include <linux/notifier.h> 43#include <linux/notifier.h>
@@ -142,10 +142,17 @@ static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
142 * Ensure that queued callbacks are all executed. 142 * Ensure that queued callbacks are all executed.
143 * If we detect that we are nested in a RCU read-side critical 143 * If we detect that we are nested in a RCU read-side critical
144 * section, we should simply fail, otherwise we would deadlock. 144 * section, we should simply fail, otherwise we would deadlock.
145 * In !PREEMPT configurations, there is no way to tell if we are
146 * in a RCU read-side critical section or not, so we never
147 * attempt any fixup and just print a warning.
145 */ 148 */
149#ifndef CONFIG_PREEMPT
150 WARN_ON_ONCE(1);
151 return 0;
152#endif
146 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 153 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
147 irqs_disabled()) { 154 irqs_disabled()) {
148 WARN_ON(1); 155 WARN_ON_ONCE(1);
149 return 0; 156 return 0;
150 } 157 }
151 rcu_barrier(); 158 rcu_barrier();
@@ -184,10 +191,17 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
184 * Ensure that queued callbacks are all executed. 191 * Ensure that queued callbacks are all executed.
185 * If we detect that we are nested in a RCU read-side critical 192 * If we detect that we are nested in a RCU read-side critical
186 * section, we should simply fail, otherwise we would deadlock. 193 * section, we should simply fail, otherwise we would deadlock.
194 * In !PREEMPT configurations, there is no way to tell if we are
195 * in a RCU read-side critical section or not, so we never
196 * attempt any fixup and just print a warning.
187 */ 197 */
198#ifndef CONFIG_PREEMPT
199 WARN_ON_ONCE(1);
200 return 0;
201#endif
188 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 202 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
189 irqs_disabled()) { 203 irqs_disabled()) {
190 WARN_ON(1); 204 WARN_ON_ONCE(1);
191 return 0; 205 return 0;
192 } 206 }
193 rcu_barrier(); 207 rcu_barrier();
@@ -214,15 +228,17 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
214 * Ensure that queued callbacks are all executed. 228 * Ensure that queued callbacks are all executed.
215 * If we detect that we are nested in a RCU read-side critical 229 * If we detect that we are nested in a RCU read-side critical
216 * section, we should simply fail, otherwise we would deadlock. 230 * section, we should simply fail, otherwise we would deadlock.
217 * Note that the machinery to reliably determine whether 231 * In !PREEMPT configurations, there is no way to tell if we are
218 * or not we are in an RCU read-side critical section 232 * in a RCU read-side critical section or not, so we never
219 * exists only in the preemptible RCU implementations 233 * attempt any fixup and just print a warning.
220 * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why
221 * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT.
222 */ 234 */
235#ifndef CONFIG_PREEMPT
236 WARN_ON_ONCE(1);
237 return 0;
238#endif
223 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 239 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
224 irqs_disabled()) { 240 irqs_disabled()) {
225 WARN_ON(1); 241 WARN_ON_ONCE(1);
226 return 0; 242 return 0;
227 } 243 }
228 rcu_barrier(); 244 rcu_barrier();
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 0c343b9a46d5..7bbac7d0f5ab 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -35,15 +35,16 @@
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/time.h> 36#include <linux/time.h>
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38#include <linux/prefetch.h>
38 39
39/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ 40/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
40static struct task_struct *rcu_kthread_task; 41static struct task_struct *rcu_kthread_task;
41static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); 42static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
42static unsigned long have_rcu_kthread_work; 43static unsigned long have_rcu_kthread_work;
43static void invoke_rcu_kthread(void);
44 44
45/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
46struct rcu_ctrlblk; 46struct rcu_ctrlblk;
47static void invoke_rcu_kthread(void);
47static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); 48static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static int rcu_kthread(void *arg); 49static int rcu_kthread(void *arg);
49static void __call_rcu(struct rcu_head *head, 50static void __call_rcu(struct rcu_head *head,
@@ -79,36 +80,45 @@ void rcu_exit_nohz(void)
79#endif /* #ifdef CONFIG_NO_HZ */ 80#endif /* #ifdef CONFIG_NO_HZ */
80 81
81/* 82/*
82 * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc(). 83 * Helper function for rcu_sched_qs() and rcu_bh_qs().
83 * Also disable irqs to avoid confusion due to interrupt handlers 84 * Also irqs are disabled to avoid confusion due to interrupt handlers
84 * invoking call_rcu(). 85 * invoking call_rcu().
85 */ 86 */
86static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 87static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
87{ 88{
88 unsigned long flags;
89
90 local_irq_save(flags);
91 if (rcp->rcucblist != NULL && 89 if (rcp->rcucblist != NULL &&
92 rcp->donetail != rcp->curtail) { 90 rcp->donetail != rcp->curtail) {
93 rcp->donetail = rcp->curtail; 91 rcp->donetail = rcp->curtail;
94 local_irq_restore(flags);
95 return 1; 92 return 1;
96 } 93 }
97 local_irq_restore(flags);
98 94
99 return 0; 95 return 0;
100} 96}
101 97
102/* 98/*
99 * Wake up rcu_kthread() to process callbacks now eligible for invocation
100 * or to boost readers.
101 */
102static void invoke_rcu_kthread(void)
103{
104 have_rcu_kthread_work = 1;
105 wake_up(&rcu_kthread_wq);
106}
107
108/*
103 * Record an rcu quiescent state. And an rcu_bh quiescent state while we 109 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
104 * are at it, given that any rcu quiescent state is also an rcu_bh 110 * are at it, given that any rcu quiescent state is also an rcu_bh
105 * quiescent state. Use "+" instead of "||" to defeat short circuiting. 111 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
106 */ 112 */
107void rcu_sched_qs(int cpu) 113void rcu_sched_qs(int cpu)
108{ 114{
115 unsigned long flags;
116
117 local_irq_save(flags);
109 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 118 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
110 rcu_qsctr_help(&rcu_bh_ctrlblk)) 119 rcu_qsctr_help(&rcu_bh_ctrlblk))
111 invoke_rcu_kthread(); 120 invoke_rcu_kthread();
121 local_irq_restore(flags);
112} 122}
113 123
114/* 124/*
@@ -116,8 +126,12 @@ void rcu_sched_qs(int cpu)
116 */ 126 */
117void rcu_bh_qs(int cpu) 127void rcu_bh_qs(int cpu)
118{ 128{
129 unsigned long flags;
130
131 local_irq_save(flags);
119 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 132 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
120 invoke_rcu_kthread(); 133 invoke_rcu_kthread();
134 local_irq_restore(flags);
121} 135}
122 136
123/* 137/*
@@ -167,7 +181,7 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
167 prefetch(next); 181 prefetch(next);
168 debug_rcu_head_unqueue(list); 182 debug_rcu_head_unqueue(list);
169 local_bh_disable(); 183 local_bh_disable();
170 list->func(list); 184 __rcu_reclaim(list);
171 local_bh_enable(); 185 local_bh_enable();
172 list = next; 186 list = next;
173 RCU_TRACE(cb_count++); 187 RCU_TRACE(cb_count++);
@@ -208,20 +222,6 @@ static int rcu_kthread(void *arg)
208} 222}
209 223
210/* 224/*
211 * Wake up rcu_kthread() to process callbacks now eligible for invocation
212 * or to boost readers.
213 */
214static void invoke_rcu_kthread(void)
215{
216 unsigned long flags;
217
218 local_irq_save(flags);
219 have_rcu_kthread_work = 1;
220 wake_up(&rcu_kthread_wq);
221 local_irq_restore(flags);
222}
223
224/*
225 * Wait for a grace period to elapse. But it is illegal to invoke 225 * Wait for a grace period to elapse. But it is illegal to invoke
226 * synchronize_sched() from within an RCU read-side critical section. 226 * synchronize_sched() from within an RCU read-side critical section.
227 * Therefore, any legal call to synchronize_sched() is a quiescent 227 * Therefore, any legal call to synchronize_sched() is a quiescent
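
The rcutiny.c hunks above move callback handling out of softirq context and into a dedicated rcu_kthread(): rcu_sched_qs() and rcu_bh_qs() now do their own local_irq_save(), and when rcu_qsctr_help() reports newly ready callbacks they call invoke_rcu_kthread(), which just sets have_rcu_kthread_work and wakes the waitqueue rcu_kthread() sleeps on (the old helper disabled irqs itself, which is why it moves up in the file and loses its flags handling). A minimal sketch of that flag-plus-waitqueue kthread pattern, using only the stock kthread/wait APIs; the names (kick_worker, worker_fn and so on) are invented for the example, and module wiring plus rcutiny's careful irq-off rechecking of the flag are omitted:

#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/err.h>
#include <linux/init.h>

static DECLARE_WAIT_QUEUE_HEAD(work_wq);
static int have_work;			/* set by producers, consumed by the kthread */
static struct task_struct *worker;

/* Producer side: safe to call with irqs disabled, as invoke_rcu_kthread() is. */
static void kick_worker(void)
{
	have_work = 1;
	wake_up(&work_wq);
}

/* Consumer side: sleep until kicked, then drain whatever became ready. */
static int worker_fn(void *unused)
{
	while (!kthread_should_stop()) {
		wait_event_interruptible(work_wq,
					 have_work || kthread_should_stop());
		if (kthread_should_stop())
			break;
		have_work = 0;
		/* ... invoke ready callbacks, do boosting, etc. ... */
	}
	return 0;
}

static int __init worker_start(void)
{
	worker = kthread_run(worker_fn, NULL, "toy_worker");
	return IS_ERR(worker) ? PTR_ERR(worker) : 0;
}
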
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 3cb8e362e883..f259c676195f 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -100,23 +100,28 @@ struct rcu_preempt_ctrlblk {
100 u8 completed; /* Last grace period completed. */ 100 u8 completed; /* Last grace period completed. */
101 /* If all three are equal, RCU is idle. */ 101 /* If all three are equal, RCU is idle. */
102#ifdef CONFIG_RCU_BOOST 102#ifdef CONFIG_RCU_BOOST
103 s8 boosted_this_gp; /* Has boosting already happened? */
104 unsigned long boost_time; /* When to start boosting (jiffies) */ 103 unsigned long boost_time; /* When to start boosting (jiffies) */
105#endif /* #ifdef CONFIG_RCU_BOOST */ 104#endif /* #ifdef CONFIG_RCU_BOOST */
106#ifdef CONFIG_RCU_TRACE 105#ifdef CONFIG_RCU_TRACE
107 unsigned long n_grace_periods; 106 unsigned long n_grace_periods;
108#ifdef CONFIG_RCU_BOOST 107#ifdef CONFIG_RCU_BOOST
109 unsigned long n_tasks_boosted; 108 unsigned long n_tasks_boosted;
109 /* Total number of tasks boosted. */
110 unsigned long n_exp_boosts; 110 unsigned long n_exp_boosts;
111 /* Number of tasks boosted for expedited GP. */
111 unsigned long n_normal_boosts; 112 unsigned long n_normal_boosts;
112 unsigned long n_normal_balk_blkd_tasks; 113 /* Number of tasks boosted for normal GP. */
113 unsigned long n_normal_balk_gp_tasks; 114 unsigned long n_balk_blkd_tasks;
114 unsigned long n_normal_balk_boost_tasks; 115 /* Refused to boost: no blocked tasks. */
115 unsigned long n_normal_balk_boosted; 116 unsigned long n_balk_exp_gp_tasks;
116 unsigned long n_normal_balk_notyet; 117 /* Refused to boost: nothing blocking GP. */
117 unsigned long n_normal_balk_nos; 118 unsigned long n_balk_boost_tasks;
118 unsigned long n_exp_balk_blkd_tasks; 119 /* Refused to boost: already boosting. */
119 unsigned long n_exp_balk_nos; 120 unsigned long n_balk_notyet;
121 /* Refused to boost: not yet time. */
122 unsigned long n_balk_nos;
123 /* Refused to boost: not sure why, though. */
124 /* This can happen due to race conditions. */
120#endif /* #ifdef CONFIG_RCU_BOOST */ 125#endif /* #ifdef CONFIG_RCU_BOOST */
121#endif /* #ifdef CONFIG_RCU_TRACE */ 126#endif /* #ifdef CONFIG_RCU_TRACE */
122}; 127};
@@ -201,7 +206,6 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t)
201 206
202#ifdef CONFIG_RCU_BOOST 207#ifdef CONFIG_RCU_BOOST
203static void rcu_initiate_boost_trace(void); 208static void rcu_initiate_boost_trace(void);
204static void rcu_initiate_exp_boost_trace(void);
205#endif /* #ifdef CONFIG_RCU_BOOST */ 209#endif /* #ifdef CONFIG_RCU_BOOST */
206 210
207/* 211/*
@@ -219,41 +223,21 @@ static void show_tiny_preempt_stats(struct seq_file *m)
219 "N."[!rcu_preempt_ctrlblk.gp_tasks], 223 "N."[!rcu_preempt_ctrlblk.gp_tasks],
220 "E."[!rcu_preempt_ctrlblk.exp_tasks]); 224 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
221#ifdef CONFIG_RCU_BOOST 225#ifdef CONFIG_RCU_BOOST
222 seq_printf(m, " ttb=%c btg=", 226 seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
223 "B."[!rcu_preempt_ctrlblk.boost_tasks]); 227 " ",
224 switch (rcu_preempt_ctrlblk.boosted_this_gp) { 228 "B."[!rcu_preempt_ctrlblk.boost_tasks],
225 case -1:
226 seq_puts(m, "exp");
227 break;
228 case 0:
229 seq_puts(m, "no");
230 break;
231 case 1:
232 seq_puts(m, "begun");
233 break;
234 case 2:
235 seq_puts(m, "done");
236 break;
237 default:
238 seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
239 }
240 seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
241 rcu_preempt_ctrlblk.n_tasks_boosted, 229 rcu_preempt_ctrlblk.n_tasks_boosted,
242 rcu_preempt_ctrlblk.n_exp_boosts, 230 rcu_preempt_ctrlblk.n_exp_boosts,
243 rcu_preempt_ctrlblk.n_normal_boosts, 231 rcu_preempt_ctrlblk.n_normal_boosts,
244 (int)(jiffies & 0xffff), 232 (int)(jiffies & 0xffff),
245 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); 233 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
246 seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n", 234 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n",
247 "normal balk", 235 " balk",
248 rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks, 236 rcu_preempt_ctrlblk.n_balk_blkd_tasks,
249 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks, 237 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks,
250 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks, 238 rcu_preempt_ctrlblk.n_balk_boost_tasks,
251 rcu_preempt_ctrlblk.n_normal_balk_boosted, 239 rcu_preempt_ctrlblk.n_balk_notyet,
252 rcu_preempt_ctrlblk.n_normal_balk_notyet, 240 rcu_preempt_ctrlblk.n_balk_nos);
253 rcu_preempt_ctrlblk.n_normal_balk_nos);
254 seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
255 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
256 rcu_preempt_ctrlblk.n_exp_balk_nos);
257#endif /* #ifdef CONFIG_RCU_BOOST */ 241#endif /* #ifdef CONFIG_RCU_BOOST */
258} 242}
259 243
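
The consolidated seq_printf() above also leans on a compact idiom that is easy to misread: indexing a two-character string literal with a boolean, so that "B."[!p] yields 'B' when p is non-NULL and '.' when it is NULL. A standalone illustration (the variable names and values are made up, not the rcutiny trace fields):

#include <stdio.h>

int main(void)
{
	void *boost_tasks = NULL;
	int gp_blocked = 1;

	/* "B."[0] == 'B', "B."[1] == '.', and !x is always 0 or 1. */
	printf("ttb=%c N=%c\n",
	       "B."[!boost_tasks],	/* prints '.' because boost_tasks is NULL */
	       "N."[!gp_blocked]);	/* prints 'N' because gp_blocked is set   */
	return 0;
}
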
@@ -271,25 +255,59 @@ static int rcu_boost(void)
271{ 255{
272 unsigned long flags; 256 unsigned long flags;
273 struct rt_mutex mtx; 257 struct rt_mutex mtx;
274 struct list_head *np;
275 struct task_struct *t; 258 struct task_struct *t;
259 struct list_head *tb;
276 260
277 if (rcu_preempt_ctrlblk.boost_tasks == NULL) 261 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
262 rcu_preempt_ctrlblk.exp_tasks == NULL)
278 return 0; /* Nothing to boost. */ 263 return 0; /* Nothing to boost. */
264
279 raw_local_irq_save(flags); 265 raw_local_irq_save(flags);
280 rcu_preempt_ctrlblk.boosted_this_gp++; 266
281 t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, 267 /*
282 rcu_node_entry); 268 * Recheck with irqs disabled: all tasks in need of boosting
283 np = rcu_next_node_entry(t); 269 * might exit their RCU read-side critical sections on their own
270 * if we are preempted just before disabling irqs.
271 */
272 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
273 rcu_preempt_ctrlblk.exp_tasks == NULL) {
274 raw_local_irq_restore(flags);
275 return 0;
276 }
277
278 /*
279 * Preferentially boost tasks blocking expedited grace periods.
280 * This cannot starve the normal grace periods because a second
281 * expedited grace period must boost all blocked tasks, including
282 * those blocking the pre-existing normal grace period.
283 */
284 if (rcu_preempt_ctrlblk.exp_tasks != NULL) {
285 tb = rcu_preempt_ctrlblk.exp_tasks;
286 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
287 } else {
288 tb = rcu_preempt_ctrlblk.boost_tasks;
289 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
290 }
291 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
292
293 /*
294 * We boost task t by manufacturing an rt_mutex that appears to
295 * be held by task t. We leave a pointer to that rt_mutex where
296 * task t can find it, and task t will release the mutex when it
297 * exits its outermost RCU read-side critical section. Then
298 * simply acquiring this artificial rt_mutex will boost task
299 * t's priority. (Thanks to tglx for suggesting this approach!)
300 */
301 t = container_of(tb, struct task_struct, rcu_node_entry);
284 rt_mutex_init_proxy_locked(&mtx, t); 302 rt_mutex_init_proxy_locked(&mtx, t);
285 t->rcu_boost_mutex = &mtx; 303 t->rcu_boost_mutex = &mtx;
286 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; 304 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
287 raw_local_irq_restore(flags); 305 raw_local_irq_restore(flags);
288 rt_mutex_lock(&mtx); 306 rt_mutex_lock(&mtx);
289 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); 307 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
290 rcu_preempt_ctrlblk.boosted_this_gp++; 308
291 rt_mutex_unlock(&mtx); 309 return rcu_preempt_ctrlblk.boost_tasks != NULL ||
292 return rcu_preempt_ctrlblk.boost_tasks != NULL; 310 rcu_preempt_ctrlblk.exp_tasks != NULL;
293} 311}
294 312
295/* 313/*
@@ -304,42 +322,25 @@ static int rcu_boost(void)
304 */ 322 */
305static int rcu_initiate_boost(void) 323static int rcu_initiate_boost(void)
306{ 324{
307 if (!rcu_preempt_blocked_readers_cgp()) { 325 if (!rcu_preempt_blocked_readers_cgp() &&
308 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++); 326 rcu_preempt_ctrlblk.exp_tasks == NULL) {
327 RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++);
309 return 0; 328 return 0;
310 } 329 }
311 if (rcu_preempt_ctrlblk.gp_tasks != NULL && 330 if (rcu_preempt_ctrlblk.exp_tasks != NULL ||
312 rcu_preempt_ctrlblk.boost_tasks == NULL && 331 (rcu_preempt_ctrlblk.gp_tasks != NULL &&
313 rcu_preempt_ctrlblk.boosted_this_gp == 0 && 332 rcu_preempt_ctrlblk.boost_tasks == NULL &&
314 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { 333 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) {
315 rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; 334 if (rcu_preempt_ctrlblk.exp_tasks == NULL)
335 rcu_preempt_ctrlblk.boost_tasks =
336 rcu_preempt_ctrlblk.gp_tasks;
316 invoke_rcu_kthread(); 337 invoke_rcu_kthread();
317 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
318 } else 338 } else
319 RCU_TRACE(rcu_initiate_boost_trace()); 339 RCU_TRACE(rcu_initiate_boost_trace());
320 return 1; 340 return 1;
321} 341}
322 342
323/* 343#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
324 * Initiate boosting for an expedited grace period.
325 */
326static void rcu_initiate_expedited_boost(void)
327{
328 unsigned long flags;
329
330 raw_local_irq_save(flags);
331 if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
332 rcu_preempt_ctrlblk.boost_tasks =
333 rcu_preempt_ctrlblk.blkd_tasks.next;
334 rcu_preempt_ctrlblk.boosted_this_gp = -1;
335 invoke_rcu_kthread();
336 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
337 } else
338 RCU_TRACE(rcu_initiate_exp_boost_trace());
339 raw_local_irq_restore(flags);
340}
341
342#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000);
343 344
344/* 345/*
345 * Do priority-boost accounting for the start of a new grace period. 346 * Do priority-boost accounting for the start of a new grace period.
@@ -347,8 +348,6 @@ static void rcu_initiate_expedited_boost(void)
347static void rcu_preempt_boost_start_gp(void) 348static void rcu_preempt_boost_start_gp(void)
348{ 349{
349 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; 350 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
350 if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
351 rcu_preempt_ctrlblk.boosted_this_gp = 0;
352} 351}
353 352
354#else /* #ifdef CONFIG_RCU_BOOST */ 353#else /* #ifdef CONFIG_RCU_BOOST */
@@ -372,13 +371,6 @@ static int rcu_initiate_boost(void)
372} 371}
373 372
374/* 373/*
375 * If there is no RCU priority boosting, we don't initiate expedited boosting.
376 */
377static void rcu_initiate_expedited_boost(void)
378{
379}
380
381/*
382 * If there is no RCU priority boosting, nothing to do at grace-period start. 374 * If there is no RCU priority boosting, nothing to do at grace-period start.
383 */ 375 */
384static void rcu_preempt_boost_start_gp(void) 376static void rcu_preempt_boost_start_gp(void)
@@ -418,7 +410,7 @@ static void rcu_preempt_cpu_qs(void)
418 if (!rcu_preempt_gp_in_progress()) 410 if (!rcu_preempt_gp_in_progress())
419 return; 411 return;
420 /* 412 /*
421 * Check up on boosting. If there are no readers blocking the 413 * Check up on boosting. If there are readers blocking the
422 * current grace period, leave. 414 * current grace period, leave.
423 */ 415 */
424 if (rcu_initiate_boost()) 416 if (rcu_initiate_boost())
@@ -578,7 +570,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
578 empty = !rcu_preempt_blocked_readers_cgp(); 570 empty = !rcu_preempt_blocked_readers_cgp();
579 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; 571 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
580 np = rcu_next_node_entry(t); 572 np = rcu_next_node_entry(t);
581 list_del(&t->rcu_node_entry); 573 list_del_init(&t->rcu_node_entry);
582 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) 574 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
583 rcu_preempt_ctrlblk.gp_tasks = np; 575 rcu_preempt_ctrlblk.gp_tasks = np;
584 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) 576 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
@@ -587,7 +579,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
587 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) 579 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
588 rcu_preempt_ctrlblk.boost_tasks = np; 580 rcu_preempt_ctrlblk.boost_tasks = np;
589#endif /* #ifdef CONFIG_RCU_BOOST */ 581#endif /* #ifdef CONFIG_RCU_BOOST */
590 INIT_LIST_HEAD(&t->rcu_node_entry);
591 582
592 /* 583 /*
593 * If this was the last task on the current list, and if 584 * If this was the last task on the current list, and if
@@ -812,13 +803,16 @@ void synchronize_rcu_expedited(void)
812 rpcp->exp_tasks = rpcp->blkd_tasks.next; 803 rpcp->exp_tasks = rpcp->blkd_tasks.next;
813 if (rpcp->exp_tasks == &rpcp->blkd_tasks) 804 if (rpcp->exp_tasks == &rpcp->blkd_tasks)
814 rpcp->exp_tasks = NULL; 805 rpcp->exp_tasks = NULL;
815 local_irq_restore(flags);
816 806
817 /* Wait for tail of ->blkd_tasks list to drain. */ 807 /* Wait for tail of ->blkd_tasks list to drain. */
818 if (rcu_preempted_readers_exp()) 808 if (!rcu_preempted_readers_exp())
819 rcu_initiate_expedited_boost(); 809 local_irq_restore(flags);
810 else {
811 rcu_initiate_boost();
812 local_irq_restore(flags);
820 wait_event(sync_rcu_preempt_exp_wq, 813 wait_event(sync_rcu_preempt_exp_wq,
821 !rcu_preempted_readers_exp()); 814 !rcu_preempted_readers_exp());
815 }
822 816
823 /* Clean up and exit. */ 817 /* Clean up and exit. */
824 barrier(); /* ensure expedited GP seen before counter increment. */ 818 barrier(); /* ensure expedited GP seen before counter increment. */
@@ -931,24 +925,17 @@ void __init rcu_scheduler_starting(void)
931 925
932static void rcu_initiate_boost_trace(void) 926static void rcu_initiate_boost_trace(void)
933{ 927{
934 if (rcu_preempt_ctrlblk.gp_tasks == NULL) 928 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
935 rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; 929 rcu_preempt_ctrlblk.n_balk_blkd_tasks++;
930 else if (rcu_preempt_ctrlblk.gp_tasks == NULL &&
931 rcu_preempt_ctrlblk.exp_tasks == NULL)
932 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++;
936 else if (rcu_preempt_ctrlblk.boost_tasks != NULL) 933 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
937 rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; 934 rcu_preempt_ctrlblk.n_balk_boost_tasks++;
938 else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
939 rcu_preempt_ctrlblk.n_normal_balk_boosted++;
940 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) 935 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
941 rcu_preempt_ctrlblk.n_normal_balk_notyet++; 936 rcu_preempt_ctrlblk.n_balk_notyet++;
942 else
943 rcu_preempt_ctrlblk.n_normal_balk_nos++;
944}
945
946static void rcu_initiate_exp_boost_trace(void)
947{
948 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
949 rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
950 else 937 else
951 rcu_preempt_ctrlblk.n_exp_balk_nos++; 938 rcu_preempt_ctrlblk.n_balk_nos++;
952} 939}
953 940
954#endif /* #ifdef CONFIG_RCU_BOOST */ 941#endif /* #ifdef CONFIG_RCU_BOOST */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index c224da41890c..98f51b13bb7e 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -33,7 +33,7 @@
33#include <linux/rcupdate.h> 33#include <linux/rcupdate.h>
34#include <linux/interrupt.h> 34#include <linux/interrupt.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <asm/atomic.h> 36#include <linux/atomic.h>
37#include <linux/bitops.h> 37#include <linux/bitops.h>
38#include <linux/completion.h> 38#include <linux/completion.h>
39#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
@@ -131,7 +131,7 @@ struct rcu_torture {
131 131
132static LIST_HEAD(rcu_torture_freelist); 132static LIST_HEAD(rcu_torture_freelist);
133static struct rcu_torture __rcu *rcu_torture_current; 133static struct rcu_torture __rcu *rcu_torture_current;
134static long rcu_torture_current_version; 134static unsigned long rcu_torture_current_version;
135static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 135static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
136static DEFINE_SPINLOCK(rcu_torture_lock); 136static DEFINE_SPINLOCK(rcu_torture_lock);
137static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 137static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
@@ -146,8 +146,6 @@ static atomic_t n_rcu_torture_mberror;
146static atomic_t n_rcu_torture_error; 146static atomic_t n_rcu_torture_error;
147static long n_rcu_torture_boost_ktrerror; 147static long n_rcu_torture_boost_ktrerror;
148static long n_rcu_torture_boost_rterror; 148static long n_rcu_torture_boost_rterror;
149static long n_rcu_torture_boost_allocerror;
150static long n_rcu_torture_boost_afferror;
151static long n_rcu_torture_boost_failure; 149static long n_rcu_torture_boost_failure;
152static long n_rcu_torture_boosts; 150static long n_rcu_torture_boosts;
153static long n_rcu_torture_timers; 151static long n_rcu_torture_timers;
@@ -163,11 +161,11 @@ static int stutter_pause_test;
163#endif 161#endif
164int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 162int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
165 163
166#ifdef CONFIG_RCU_BOOST 164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
167#define rcu_can_boost() 1 165#define rcu_can_boost() 1
168#else /* #ifdef CONFIG_RCU_BOOST */ 166#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
169#define rcu_can_boost() 0 167#define rcu_can_boost() 0
170#endif /* #else #ifdef CONFIG_RCU_BOOST */ 168#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
171 169
172static unsigned long boost_starttime; /* jiffies of next boost test start. */ 170static unsigned long boost_starttime; /* jiffies of next boost test start. */
173DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
@@ -751,6 +749,7 @@ static int rcu_torture_boost(void *arg)
751 n_rcu_torture_boost_rterror++; 749 n_rcu_torture_boost_rterror++;
752 } 750 }
753 751
752 init_rcu_head_on_stack(&rbi.rcu);
754 /* Each pass through the following loop does one boost-test cycle. */ 753 /* Each pass through the following loop does one boost-test cycle. */
755 do { 754 do {
756 /* Wait for the next test interval. */ 755 /* Wait for the next test interval. */
@@ -810,6 +809,7 @@ checkwait: rcu_stutter_wait("rcu_torture_boost");
810 809
811 /* Clean up and exit. */ 810 /* Clean up and exit. */
812 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); 811 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
812 destroy_rcu_head_on_stack(&rbi.rcu);
813 rcutorture_shutdown_absorb("rcu_torture_boost"); 813 rcutorture_shutdown_absorb("rcu_torture_boost");
814 while (!kthread_should_stop() || rbi.inflight) 814 while (!kthread_should_stop() || rbi.inflight)
815 schedule_timeout_uninterruptible(1); 815 schedule_timeout_uninterruptible(1);
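
rcu_torture_boost() keeps its rcu_head on the kernel stack, and the two lines added here bracket its lifetime for CONFIG_DEBUG_OBJECTS_RCU_HEAD: init_rcu_head_on_stack() tells the debug-objects code that this rcu_head is a legitimate on-stack object, and destroy_rcu_head_on_stack() retires it before the stack frame can be reused. The same pairing applies to any on-stack rcu_head; a minimal sketch (struct stack_waiter and the helper name are invented, and the caller must not return before the callback has run):

#include <linux/rcupdate.h>
#include <linux/completion.h>
#include <linux/kernel.h>

struct stack_waiter {
	struct rcu_head rh;
	struct completion done;
};

static void stack_waiter_cb(struct rcu_head *rh)
{
	struct stack_waiter *w = container_of(rh, struct stack_waiter, rh);

	complete(&w->done);			/* a grace period has elapsed */
}

static void wait_one_grace_period(void)
{
	struct stack_waiter w;

	init_completion(&w.done);
	init_rcu_head_on_stack(&w.rh);		/* announce the on-stack rcu_head */
	call_rcu(&w.rh, stack_waiter_cb);
	wait_for_completion(&w.done);		/* callback has run, head is idle */
	destroy_rcu_head_on_stack(&w.rh);	/* retire it before returning     */
}
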
@@ -886,7 +886,7 @@ rcu_torture_writer(void *arg)
886 old_rp->rtort_pipe_count++; 886 old_rp->rtort_pipe_count++;
887 cur_ops->deferred_free(old_rp); 887 cur_ops->deferred_free(old_rp);
888 } 888 }
889 rcu_torture_current_version++; 889 rcutorture_record_progress(++rcu_torture_current_version);
890 oldbatch = cur_ops->completed(); 890 oldbatch = cur_ops->completed();
891 rcu_stutter_wait("rcu_torture_writer"); 891 rcu_stutter_wait("rcu_torture_writer");
892 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 892 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
@@ -941,7 +941,6 @@ static void rcu_torture_timer(unsigned long unused)
941 idx = cur_ops->readlock(); 941 idx = cur_ops->readlock();
942 completed = cur_ops->completed(); 942 completed = cur_ops->completed();
943 p = rcu_dereference_check(rcu_torture_current, 943 p = rcu_dereference_check(rcu_torture_current,
944 rcu_read_lock_held() ||
945 rcu_read_lock_bh_held() || 944 rcu_read_lock_bh_held() ||
946 rcu_read_lock_sched_held() || 945 rcu_read_lock_sched_held() ||
947 srcu_read_lock_held(&srcu_ctl)); 946 srcu_read_lock_held(&srcu_ctl));
@@ -1002,7 +1001,6 @@ rcu_torture_reader(void *arg)
1002 idx = cur_ops->readlock(); 1001 idx = cur_ops->readlock();
1003 completed = cur_ops->completed(); 1002 completed = cur_ops->completed();
1004 p = rcu_dereference_check(rcu_torture_current, 1003 p = rcu_dereference_check(rcu_torture_current,
1005 rcu_read_lock_held() ||
1006 rcu_read_lock_bh_held() || 1004 rcu_read_lock_bh_held() ||
1007 rcu_read_lock_sched_held() || 1005 rcu_read_lock_sched_held() ||
1008 srcu_read_lock_held(&srcu_ctl)); 1006 srcu_read_lock_held(&srcu_ctl));
@@ -1066,8 +1064,8 @@ rcu_torture_printk(char *page)
1066 } 1064 }
1067 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1065 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
1068 cnt += sprintf(&page[cnt], 1066 cnt += sprintf(&page[cnt],
1069 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 1067 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1070 "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld " 1068 "rtmbe: %d rtbke: %ld rtbre: %ld "
1071 "rtbf: %ld rtb: %ld nt: %ld", 1069 "rtbf: %ld rtb: %ld nt: %ld",
1072 rcu_torture_current, 1070 rcu_torture_current,
1073 rcu_torture_current_version, 1071 rcu_torture_current_version,
@@ -1078,16 +1076,12 @@ rcu_torture_printk(char *page)
1078 atomic_read(&n_rcu_torture_mberror), 1076 atomic_read(&n_rcu_torture_mberror),
1079 n_rcu_torture_boost_ktrerror, 1077 n_rcu_torture_boost_ktrerror,
1080 n_rcu_torture_boost_rterror, 1078 n_rcu_torture_boost_rterror,
1081 n_rcu_torture_boost_allocerror,
1082 n_rcu_torture_boost_afferror,
1083 n_rcu_torture_boost_failure, 1079 n_rcu_torture_boost_failure,
1084 n_rcu_torture_boosts, 1080 n_rcu_torture_boosts,
1085 n_rcu_torture_timers); 1081 n_rcu_torture_timers);
1086 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1082 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1087 n_rcu_torture_boost_ktrerror != 0 || 1083 n_rcu_torture_boost_ktrerror != 0 ||
1088 n_rcu_torture_boost_rterror != 0 || 1084 n_rcu_torture_boost_rterror != 0 ||
1089 n_rcu_torture_boost_allocerror != 0 ||
1090 n_rcu_torture_boost_afferror != 0 ||
1091 n_rcu_torture_boost_failure != 0) 1085 n_rcu_torture_boost_failure != 0)
1092 cnt += sprintf(&page[cnt], " !!!"); 1086 cnt += sprintf(&page[cnt], " !!!");
1093 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 1087 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
@@ -1331,6 +1325,7 @@ rcu_torture_cleanup(void)
1331 int i; 1325 int i;
1332 1326
1333 mutex_lock(&fullstop_mutex); 1327 mutex_lock(&fullstop_mutex);
1328 rcutorture_record_test_transition();
1334 if (fullstop == FULLSTOP_SHUTDOWN) { 1329 if (fullstop == FULLSTOP_SHUTDOWN) {
1335 printk(KERN_WARNING /* but going down anyway, so... */ 1330 printk(KERN_WARNING /* but going down anyway, so... */
1336 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); 1331 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
@@ -1486,8 +1481,6 @@ rcu_torture_init(void)
1486 atomic_set(&n_rcu_torture_error, 0); 1481 atomic_set(&n_rcu_torture_error, 0);
1487 n_rcu_torture_boost_ktrerror = 0; 1482 n_rcu_torture_boost_ktrerror = 0;
1488 n_rcu_torture_boost_rterror = 0; 1483 n_rcu_torture_boost_rterror = 0;
1489 n_rcu_torture_boost_allocerror = 0;
1490 n_rcu_torture_boost_afferror = 0;
1491 n_rcu_torture_boost_failure = 0; 1484 n_rcu_torture_boost_failure = 0;
1492 n_rcu_torture_boosts = 0; 1485 n_rcu_torture_boosts = 0;
1493 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1486 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
@@ -1624,6 +1617,7 @@ rcu_torture_init(void)
1624 } 1617 }
1625 } 1618 }
1626 register_reboot_notifier(&rcutorture_shutdown_nb); 1619 register_reboot_notifier(&rcutorture_shutdown_nb);
1620 rcutorture_record_test_transition();
1627 mutex_unlock(&fullstop_mutex); 1621 mutex_unlock(&fullstop_mutex);
1628 return 0; 1622 return 0;
1629 1623
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index dd4aea806f8e..ba06207b1dd3 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -36,7 +36,7 @@
36#include <linux/interrupt.h> 36#include <linux/interrupt.h>
37#include <linux/sched.h> 37#include <linux/sched.h>
38#include <linux/nmi.h> 38#include <linux/nmi.h>
39#include <asm/atomic.h> 39#include <linux/atomic.h>
40#include <linux/bitops.h> 40#include <linux/bitops.h>
41#include <linux/module.h> 41#include <linux/module.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
@@ -47,6 +47,9 @@
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h> 49#include <linux/kernel_stat.h>
50#include <linux/wait.h>
51#include <linux/kthread.h>
52#include <linux/prefetch.h>
50 53
51#include "rcutree.h" 54#include "rcutree.h"
52 55
@@ -79,10 +82,67 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
79struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
80DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
81 84
85static struct rcu_state *rcu_state;
86
87/*
88 * The rcu_scheduler_active variable transitions from zero to one just
89 * before the first task is spawned. So when this variable is zero, RCU
90 * can assume that there is but one task, allowing RCU to (for example)
91 * optimized synchronize_sched() to a simple barrier(). When this variable
92 * is one, RCU must actually do all the hard work required to detect real
93 * grace periods. This variable is also used to suppress boot-time false
94 * positives from lockdep-RCU error checking.
95 */
82int rcu_scheduler_active __read_mostly; 96int rcu_scheduler_active __read_mostly;
83EXPORT_SYMBOL_GPL(rcu_scheduler_active); 97EXPORT_SYMBOL_GPL(rcu_scheduler_active);
84 98
85/* 99/*
100 * The rcu_scheduler_fully_active variable transitions from zero to one
101 * during the early_initcall() processing, which is after the scheduler
102 * is capable of creating new tasks. So RCU processing (for example,
103 * creating tasks for RCU priority boosting) must be delayed until after
104 * rcu_scheduler_fully_active transitions from zero to one. We also
105 * currently delay invocation of any RCU callbacks until after this point.
106 *
107 * It might later prove better for people registering RCU callbacks during
108 * early boot to take responsibility for these callbacks, but one step at
109 * a time.
110 */
111static int rcu_scheduler_fully_active __read_mostly;
112
113#ifdef CONFIG_RCU_BOOST
114
115/*
116 * Control variables for per-CPU and per-rcu_node kthreads. These
117 * handle all flavors of RCU.
118 */
119static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
120DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
121DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
122DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
123DEFINE_PER_CPU(char, rcu_cpu_has_work);
124
125#endif /* #ifdef CONFIG_RCU_BOOST */
126
127static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
128static void invoke_rcu_core(void);
129static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
130
131#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
132
133/*
134 * Track the rcutorture test sequence number and the update version
135 * number within a given test. The rcutorture_testseq is incremented
136 * on every rcutorture module load and unload, so has an odd value
137 * when a test is running. The rcutorture_vernum is set to zero
138 * when rcutorture starts and is incremented on each rcutorture update.
139 * These variables enable correlating rcutorture output with the
140 * RCU tracing information.
141 */
142unsigned long rcutorture_testseq;
143unsigned long rcutorture_vernum;
144
145/*
86 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 146 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
87 * permit this function to be invoked without holding the root rcu_node 147 * permit this function to be invoked without holding the root rcu_node
88 * structure's ->lock, but of course results can be subject to change. 148 * structure's ->lock, but of course results can be subject to change.
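
This hunk also introduces the per-CPU bookkeeping the rest of the series builds on: one task pointer, status word, CPU binding and loop counter per CPU, declared with DEFINE_PER_CPU, while the new rcu_scheduler_fully_active flag keeps RCU from relying on kthreads before the scheduler is able to create them. For readers less familiar with the per-CPU variable API, a small sketch of the two access styles used here; the demo_* names are invented:

#include <linux/percpu.h>
#include <linux/sched.h>

/* One slot per CPU, in the style of the rcu_cpu_kthread_* variables above. */
static DEFINE_PER_CPU(struct task_struct *, demo_kthread_task);
static DEFINE_PER_CPU(unsigned int, demo_kthread_loops);

/* Remote access by CPU number, e.g. from setup or hotplug code. */
static void demo_note_task(int cpu, struct task_struct *t)
{
	per_cpu(demo_kthread_task, cpu) = t;
}

/* Local access from a kthread bound to this CPU, so the slot cannot change under us. */
static void demo_count_loop(void)
{
	__get_cpu_var(demo_kthread_loops)++;
}
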
@@ -124,11 +184,12 @@ void rcu_note_context_switch(int cpu)
124 rcu_sched_qs(cpu); 184 rcu_sched_qs(cpu);
125 rcu_preempt_note_context_switch(cpu); 185 rcu_preempt_note_context_switch(cpu);
126} 186}
187EXPORT_SYMBOL_GPL(rcu_note_context_switch);
127 188
128#ifdef CONFIG_NO_HZ 189#ifdef CONFIG_NO_HZ
129DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 190DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
130 .dynticks_nesting = 1, 191 .dynticks_nesting = 1,
131 .dynticks = 1, 192 .dynticks = ATOMIC_INIT(1),
132}; 193};
133#endif /* #ifdef CONFIG_NO_HZ */ 194#endif /* #ifdef CONFIG_NO_HZ */
134 195
@@ -140,10 +201,8 @@ module_param(blimit, int, 0);
140module_param(qhimark, int, 0); 201module_param(qhimark, int, 0);
141module_param(qlowmark, int, 0); 202module_param(qlowmark, int, 0);
142 203
143#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 204int rcu_cpu_stall_suppress __read_mostly;
144int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
145module_param(rcu_cpu_stall_suppress, int, 0644); 205module_param(rcu_cpu_stall_suppress, int, 0644);
146#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
147 206
148static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 207static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
149static int rcu_pending(int cpu); 208static int rcu_pending(int cpu);
@@ -176,6 +235,31 @@ void rcu_bh_force_quiescent_state(void)
176EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 235EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
177 236
178/* 237/*
238 * Record the number of times rcutorture tests have been initiated and
239 * terminated. This information allows the debugfs tracing stats to be
240 * correlated to the rcutorture messages, even when the rcutorture module
241 * is being repeatedly loaded and unloaded. In other words, we cannot
242 * store this state in rcutorture itself.
243 */
244void rcutorture_record_test_transition(void)
245{
246 rcutorture_testseq++;
247 rcutorture_vernum = 0;
248}
249EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
250
251/*
252 * Record the number of writer passes through the current rcutorture test.
253 * This is also used to correlate debugfs tracing stats with the rcutorture
254 * messages.
255 */
256void rcutorture_record_progress(unsigned long vernum)
257{
258 rcutorture_vernum++;
259}
260EXPORT_SYMBOL_GPL(rcutorture_record_progress);
261
262/*
179 * Force a quiescent state for RCU-sched. 263 * Force a quiescent state for RCU-sched.
180 */ 264 */
181void rcu_sched_force_quiescent_state(void) 265void rcu_sched_force_quiescent_state(void)
@@ -234,8 +318,8 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
234 return 1; 318 return 1;
235 } 319 }
236 320
237 /* If preemptable RCU, no point in sending reschedule IPI. */ 321 /* If preemptible RCU, no point in sending reschedule IPI. */
238 if (rdp->preemptable) 322 if (rdp->preemptible)
239 return 0; 323 return 0;
240 324
241 /* The CPU is online, so send it a reschedule IPI. */ 325 /* The CPU is online, so send it a reschedule IPI. */
@@ -264,13 +348,25 @@ void rcu_enter_nohz(void)
264 unsigned long flags; 348 unsigned long flags;
265 struct rcu_dynticks *rdtp; 349 struct rcu_dynticks *rdtp;
266 350
267 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
268 local_irq_save(flags); 351 local_irq_save(flags);
269 rdtp = &__get_cpu_var(rcu_dynticks); 352 rdtp = &__get_cpu_var(rcu_dynticks);
270 rdtp->dynticks++; 353 if (--rdtp->dynticks_nesting) {
271 rdtp->dynticks_nesting--; 354 local_irq_restore(flags);
272 WARN_ON_ONCE(rdtp->dynticks & 0x1); 355 return;
356 }
357 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
358 smp_mb__before_atomic_inc(); /* See above. */
359 atomic_inc(&rdtp->dynticks);
360 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
361 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
273 local_irq_restore(flags); 362 local_irq_restore(flags);
363
364 /* If the interrupt queued a callback, get out of dyntick mode. */
365 if (in_irq() &&
366 (__get_cpu_var(rcu_sched_data).nxtlist ||
367 __get_cpu_var(rcu_bh_data).nxtlist ||
368 rcu_preempt_needs_cpu(smp_processor_id())))
369 set_need_resched();
274} 370}
275 371
276/* 372/*
@@ -286,11 +382,16 @@ void rcu_exit_nohz(void)
286 382
287 local_irq_save(flags); 383 local_irq_save(flags);
288 rdtp = &__get_cpu_var(rcu_dynticks); 384 rdtp = &__get_cpu_var(rcu_dynticks);
289 rdtp->dynticks++; 385 if (rdtp->dynticks_nesting++) {
290 rdtp->dynticks_nesting++; 386 local_irq_restore(flags);
291 WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); 387 return;
388 }
389 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
390 atomic_inc(&rdtp->dynticks);
391 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
392 smp_mb__after_atomic_inc(); /* See above. */
393 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
292 local_irq_restore(flags); 394 local_irq_restore(flags);
293 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
294} 395}
295 396
296/** 397/**
@@ -304,11 +405,15 @@ void rcu_nmi_enter(void)
304{ 405{
305 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 406 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
306 407
307 if (rdtp->dynticks & 0x1) 408 if (rdtp->dynticks_nmi_nesting == 0 &&
409 (atomic_read(&rdtp->dynticks) & 0x1))
308 return; 410 return;
309 rdtp->dynticks_nmi++; 411 rdtp->dynticks_nmi_nesting++;
310 WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1)); 412 smp_mb__before_atomic_inc(); /* Force delay from prior write. */
311 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 413 atomic_inc(&rdtp->dynticks);
414 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
415 smp_mb__after_atomic_inc(); /* See above. */
416 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
312} 417}
313 418
314/** 419/**
@@ -322,11 +427,14 @@ void rcu_nmi_exit(void)
322{ 427{
323 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 428 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
324 429
325 if (rdtp->dynticks & 0x1) 430 if (rdtp->dynticks_nmi_nesting == 0 ||
431 --rdtp->dynticks_nmi_nesting != 0)
326 return; 432 return;
327 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 433 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
328 rdtp->dynticks_nmi++; 434 smp_mb__before_atomic_inc(); /* See above. */
329 WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1); 435 atomic_inc(&rdtp->dynticks);
436 smp_mb__after_atomic_inc(); /* Force delay to next write. */
437 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
330} 438}
331 439
332/** 440/**
@@ -337,13 +445,7 @@ void rcu_nmi_exit(void)
337 */ 445 */
338void rcu_irq_enter(void) 446void rcu_irq_enter(void)
339{ 447{
340 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 448 rcu_exit_nohz();
341
342 if (rdtp->dynticks_nesting++)
343 return;
344 rdtp->dynticks++;
345 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
346 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
347} 449}
348 450
349/** 451/**
@@ -355,18 +457,7 @@ void rcu_irq_enter(void)
355 */ 457 */
356void rcu_irq_exit(void) 458void rcu_irq_exit(void)
357{ 459{
358 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 460 rcu_enter_nohz();
359
360 if (--rdtp->dynticks_nesting)
361 return;
362 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
363 rdtp->dynticks++;
364 WARN_ON_ONCE(rdtp->dynticks & 0x1);
365
366 /* If the interrupt queued a callback, get out of dyntick mode. */
367 if (__this_cpu_read(rcu_sched_data.nxtlist) ||
368 __this_cpu_read(rcu_bh_data.nxtlist))
369 set_need_resched();
370} 461}
371 462
372#ifdef CONFIG_SMP 463#ifdef CONFIG_SMP
@@ -378,19 +469,8 @@ void rcu_irq_exit(void)
378 */ 469 */
379static int dyntick_save_progress_counter(struct rcu_data *rdp) 470static int dyntick_save_progress_counter(struct rcu_data *rdp)
380{ 471{
381 int ret; 472 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
382 int snap; 473 return 0;
383 int snap_nmi;
384
385 snap = rdp->dynticks->dynticks;
386 snap_nmi = rdp->dynticks->dynticks_nmi;
387 smp_mb(); /* Order sampling of snap with end of grace period. */
388 rdp->dynticks_snap = snap;
389 rdp->dynticks_nmi_snap = snap_nmi;
390 ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
391 if (ret)
392 rdp->dynticks_fqs++;
393 return ret;
394} 474}
395 475
396/* 476/*
@@ -401,16 +481,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
401 */ 481 */
402static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 482static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
403{ 483{
404 long curr; 484 unsigned long curr;
405 long curr_nmi; 485 unsigned long snap;
406 long snap;
407 long snap_nmi;
408 486
409 curr = rdp->dynticks->dynticks; 487 curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks);
410 snap = rdp->dynticks_snap; 488 snap = (unsigned long)rdp->dynticks_snap;
411 curr_nmi = rdp->dynticks->dynticks_nmi;
412 snap_nmi = rdp->dynticks_nmi_snap;
413 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
414 489
415 /* 490 /*
416 * If the CPU passed through or entered a dynticks idle phase with 491 * If the CPU passed through or entered a dynticks idle phase with
@@ -420,8 +495,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
420 * read-side critical section that started before the beginning 495 * read-side critical section that started before the beginning
421 * of the current RCU grace period. 496 * of the current RCU grace period.
422 */ 497 */
423 if ((curr != snap || (curr & 0x1) == 0) && 498 if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
424 (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
425 rdp->dynticks_fqs++; 499 rdp->dynticks_fqs++;
426 return 1; 500 return 1;
427 } 501 }
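
The two hunks above are the core of the dynticks rework: ->dynticks becomes a single atomic_t that is incremented on every transition into or out of dynticks-idle mode, so it is even while the CPU is idle and odd while it is not, and atomic_add_return(0, ...) serves as a fully ordered read of it. With that encoding, deciding whether a CPU has passed through a quiescent state since the snapshot reduces to the two-line test at the end of rcu_implicit_dynticks_qs(); restated on plain unsigned longs (the function name is invented):

#include <linux/rcupdate.h>	/* ULONG_CMP_GE() */

static int passed_quiescent_state(unsigned long snap, unsigned long curr)
{
	if ((curr & 0x1) == 0)			/* counter even: the CPU is in      */
		return 1;			/* dynticks-idle right now          */
	if (ULONG_CMP_GE(curr, snap + 2))	/* counter moved by at least two:   */
		return 1;			/* the CPU went through idle since  */
						/* the snapshot, even if busy now   */
	return 0;				/* no evidence of a quiescent state */
}
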
@@ -450,8 +524,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
450 524
451#endif /* #else #ifdef CONFIG_NO_HZ */ 525#endif /* #else #ifdef CONFIG_NO_HZ */
452 526
453#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
454
455int rcu_cpu_stall_suppress __read_mostly; 527int rcu_cpu_stall_suppress __read_mostly;
456 528
457static void record_gp_stall_check_time(struct rcu_state *rsp) 529static void record_gp_stall_check_time(struct rcu_state *rsp)
@@ -537,21 +609,24 @@ static void print_cpu_stall(struct rcu_state *rsp)
537 609
538static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 610static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
539{ 611{
540 long delta; 612 unsigned long j;
613 unsigned long js;
541 struct rcu_node *rnp; 614 struct rcu_node *rnp;
542 615
543 if (rcu_cpu_stall_suppress) 616 if (rcu_cpu_stall_suppress)
544 return; 617 return;
545 delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); 618 j = ACCESS_ONCE(jiffies);
619 js = ACCESS_ONCE(rsp->jiffies_stall);
546 rnp = rdp->mynode; 620 rnp = rdp->mynode;
547 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { 621 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
548 622
549 /* We haven't checked in, so go dump stack. */ 623 /* We haven't checked in, so go dump stack. */
550 print_cpu_stall(rsp); 624 print_cpu_stall(rsp);
551 625
552 } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { 626 } else if (rcu_gp_in_progress(rsp) &&
627 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
553 628
554 /* They had two time units to dump stack, so complain. */ 629 /* They had a few time units to dump stack, so complain. */
555 print_other_cpu_stall(rsp); 630 print_other_cpu_stall(rsp);
556 } 631 }
557} 632}
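
check_cpu_stall() switches from a signed "delta = jiffies - jiffies_stall" to unsigned snapshots compared with ULONG_CMP_GE(), which is the wraparound-safe way to ask "has this deadline passed?" on jiffies, in the same spirit as time_after_eq(). A sketch of that comparison in isolation; deadline_expired is an invented helper:

#include <linux/jiffies.h>
#include <linux/compiler.h>	/* ACCESS_ONCE()  */
#include <linux/rcupdate.h>	/* ULONG_CMP_GE() */

static int deadline_expired(unsigned long deadline)
{
	unsigned long j = ACCESS_ONCE(jiffies);	/* read jiffies exactly once */

	return ULONG_CMP_GE(j, deadline);	/* true even across a jiffies wrap */
}
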
@@ -587,26 +662,6 @@ static void __init check_cpu_stall_init(void)
587 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); 662 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
588} 663}
589 664
590#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
591
592static void record_gp_stall_check_time(struct rcu_state *rsp)
593{
594}
595
596static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
597{
598}
599
600void rcu_cpu_stall_reset(void)
601{
602}
603
604static void __init check_cpu_stall_init(void)
605{
606}
607
608#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
609
610/* 665/*
611 * Update CPU-local rcu_data state to record the newly noticed grace period. 666 * Update CPU-local rcu_data state to record the newly noticed grace period.
612 * This is used both when we started the grace period and when we notice 667 * This is used both when we started the grace period and when we notice
@@ -809,6 +864,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
809 rnp->completed = rsp->completed; 864 rnp->completed = rsp->completed;
810 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 865 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
811 rcu_start_gp_per_cpu(rsp, rnp, rdp); 866 rcu_start_gp_per_cpu(rsp, rnp, rdp);
867 rcu_preempt_boost_start_gp(rnp);
812 raw_spin_unlock_irqrestore(&rnp->lock, flags); 868 raw_spin_unlock_irqrestore(&rnp->lock, flags);
813 return; 869 return;
814 } 870 }
@@ -844,6 +900,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
844 rnp->completed = rsp->completed; 900 rnp->completed = rsp->completed;
845 if (rnp == rdp->mynode) 901 if (rnp == rdp->mynode)
846 rcu_start_gp_per_cpu(rsp, rnp, rdp); 902 rcu_start_gp_per_cpu(rsp, rnp, rdp);
903 rcu_preempt_boost_start_gp(rnp);
847 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 904 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
848 } 905 }
849 906
@@ -864,7 +921,18 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
864static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 921static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
865 __releases(rcu_get_root(rsp)->lock) 922 __releases(rcu_get_root(rsp)->lock)
866{ 923{
924 unsigned long gp_duration;
925
867 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 926 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
927
928 /*
929 * Ensure that all grace-period and pre-grace-period activity
930 * is seen before the assignment to rsp->completed.
931 */
932 smp_mb(); /* See above block comment. */
933 gp_duration = jiffies - rsp->gp_start;
934 if (gp_duration > rsp->gp_max)
935 rsp->gp_max = gp_duration;
868 rsp->completed = rsp->gpnum; 936 rsp->completed = rsp->gpnum;
869 rsp->signaled = RCU_GP_IDLE; 937 rsp->signaled = RCU_GP_IDLE;
870 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 938 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
@@ -894,7 +962,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
894 return; 962 return;
895 } 963 }
896 rnp->qsmask &= ~mask; 964 rnp->qsmask &= ~mask;
897 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 965 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
898 966
899 /* Other bits still set at this level, so done. */ 967 /* Other bits still set at this level, so done. */
900 raw_spin_unlock_irqrestore(&rnp->lock, flags); 968 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1037,6 +1105,8 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp)
1037/* 1105/*
1038 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy 1106 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
1039 * and move all callbacks from the outgoing CPU to the current one. 1107 * and move all callbacks from the outgoing CPU to the current one.
1108 * There can only be one CPU hotplug operation at a time, so no other
1109 * CPU can be attempting to update rcu_cpu_kthread_task.
1040 */ 1110 */
1041static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 1111static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1042{ 1112{
@@ -1046,6 +1116,8 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1046 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1116 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1047 struct rcu_node *rnp; 1117 struct rcu_node *rnp;
1048 1118
1119 rcu_stop_cpu_kthread(cpu);
1120
1049 /* Exclude any attempts to start a new grace period. */ 1121 /* Exclude any attempts to start a new grace period. */
1050 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1122 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1051 1123
@@ -1082,6 +1154,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1082 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1154 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1083 if (need_report & RCU_OFL_TASKS_EXP_GP) 1155 if (need_report & RCU_OFL_TASKS_EXP_GP)
1084 rcu_report_exp_rnp(rsp, rnp); 1156 rcu_report_exp_rnp(rsp, rnp);
1157 rcu_node_kthread_setaffinity(rnp, -1);
1085} 1158}
1086 1159
1087/* 1160/*
@@ -1143,7 +1216,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1143 next = list->next; 1216 next = list->next;
1144 prefetch(next); 1217 prefetch(next);
1145 debug_rcu_head_unqueue(list); 1218 debug_rcu_head_unqueue(list);
1146 list->func(list); 1219 __rcu_reclaim(list);
1147 list = next; 1220 list = next;
1148 if (++count >= rdp->blimit) 1221 if (++count >= rdp->blimit)
1149 break; 1222 break;
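
rcu_do_batch() now hands each callback to __rcu_reclaim() instead of calling ->func directly (the rcutiny version earlier in this diff gets the same change), but the contract seen by callers of call_rcu() is unchanged: embed an rcu_head in the object, queue it with a callback, and the callback runs here once a grace period has elapsed. A minimal example of the caller side; struct demo_node and the helper names are invented:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/kernel.h>

struct demo_node {
	int key;
	struct rcu_head rcu;
};

static void demo_node_free(struct rcu_head *rcu)
{
	/* Invoked from rcu_do_batch() after a full grace period. */
	kfree(container_of(rcu, struct demo_node, rcu));
}

static void demo_node_retire(struct demo_node *p)
{
	/* __call_rcu() appends this to the current CPU's nxtlist. */
	call_rcu(&p->rcu, demo_node_free);
}
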
@@ -1179,7 +1252,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1179 1252
1180 /* Re-raise the RCU softirq if there are callbacks remaining. */ 1253 /* Re-raise the RCU softirq if there are callbacks remaining. */
1181 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1254 if (cpu_has_callbacks_ready_to_invoke(rdp))
1182 raise_softirq(RCU_SOFTIRQ); 1255 invoke_rcu_core();
1183} 1256}
1184 1257
1185/* 1258/*
@@ -1225,7 +1298,7 @@ void rcu_check_callbacks(int cpu, int user)
1225 } 1298 }
1226 rcu_preempt_check_callbacks(cpu); 1299 rcu_preempt_check_callbacks(cpu);
1227 if (rcu_pending(cpu)) 1300 if (rcu_pending(cpu))
1228 raise_softirq(RCU_SOFTIRQ); 1301 invoke_rcu_core();
1229} 1302}
1230 1303
1231#ifdef CONFIG_SMP 1304#ifdef CONFIG_SMP
@@ -1233,6 +1306,8 @@ void rcu_check_callbacks(int cpu, int user)
1233/* 1306/*
1234 * Scan the leaf rcu_node structures, processing dyntick state for any that 1307 * Scan the leaf rcu_node structures, processing dyntick state for any that
1235 * have not yet encountered a quiescent state, using the function specified. 1308 * have not yet encountered a quiescent state, using the function specified.
1309 * Also initiate boosting for any threads blocked on the root rcu_node.
1310 *
1236 * The caller must have suppressed start of new grace periods. 1311 * The caller must have suppressed start of new grace periods.
1237 */ 1312 */
1238static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) 1313static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
@@ -1251,7 +1326,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1251 return; 1326 return;
1252 } 1327 }
1253 if (rnp->qsmask == 0) { 1328 if (rnp->qsmask == 0) {
1254 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1329 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
1255 continue; 1330 continue;
1256 } 1331 }
1257 cpu = rnp->grplo; 1332 cpu = rnp->grplo;
@@ -1269,6 +1344,11 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1269 } 1344 }
1270 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1345 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1271 } 1346 }
1347 rnp = rcu_get_root(rsp);
1348 if (rnp->qsmask == 0) {
1349 raw_spin_lock_irqsave(&rnp->lock, flags);
1350 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1351 }
1272} 1352}
1273 1353
1274/* 1354/*
@@ -1383,7 +1463,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1383 } 1463 }
1384 1464
1385 /* If there are callbacks ready, invoke them. */ 1465 /* If there are callbacks ready, invoke them. */
1386 rcu_do_batch(rsp, rdp); 1466 if (cpu_has_callbacks_ready_to_invoke(rdp))
1467 invoke_rcu_callbacks(rsp, rdp);
1387} 1468}
1388 1469
1389/* 1470/*
@@ -1391,29 +1472,37 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1391 */ 1472 */
1392static void rcu_process_callbacks(struct softirq_action *unused) 1473static void rcu_process_callbacks(struct softirq_action *unused)
1393{ 1474{
1394 /*
1395 * Memory references from any prior RCU read-side critical sections
1396 * executed by the interrupted code must be seen before any RCU
1397 * grace-period manipulations below.
1398 */
1399 smp_mb(); /* See above block comment. */
1400
1401 __rcu_process_callbacks(&rcu_sched_state, 1475 __rcu_process_callbacks(&rcu_sched_state,
1402 &__get_cpu_var(rcu_sched_data)); 1476 &__get_cpu_var(rcu_sched_data));
1403 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1477 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1404 rcu_preempt_process_callbacks(); 1478 rcu_preempt_process_callbacks();
1405 1479
1406 /*
1407 * Memory references from any later RCU read-side critical sections
1408 * executed by the interrupted code must be seen after any RCU
1409 * grace-period manipulations above.
1410 */
1411 smp_mb(); /* See above block comment. */
1412
1413 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ 1480 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1414 rcu_needs_cpu_flush(); 1481 rcu_needs_cpu_flush();
1415} 1482}
1416 1483
1484/*
1485 * Wake up the current CPU's kthread. This replaces raise_softirq()
1486 * in earlier versions of RCU. Note that because we are running on
1487 * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
1488 * cannot disappear out from under us.
1489 */
1490static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1491{
1492 if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
1493 return;
1494 if (likely(!rsp->boost)) {
1495 rcu_do_batch(rsp, rdp);
1496 return;
1497 }
1498 invoke_rcu_callbacks_kthread();
1499}
1500
1501static void invoke_rcu_core(void)
1502{
1503 raise_softirq(RCU_SOFTIRQ);
1504}
1505
1417static void 1506static void
1418__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 1507__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1419 struct rcu_state *rsp) 1508 struct rcu_state *rsp)
@@ -1439,6 +1528,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1439 /* Add the callback to our list. */ 1528 /* Add the callback to our list. */
1440 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1529 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1441 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1530 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1531 rdp->qlen++;
1532
1533 /* If interrupts were disabled, don't dive into RCU core. */
1534 if (irqs_disabled_flags(flags)) {
1535 local_irq_restore(flags);
1536 return;
1537 }
1442 1538
1443 /* 1539 /*
1444 * Force the grace period if too many callbacks or too long waiting. 1540 * Force the grace period if too many callbacks or too long waiting.
@@ -1447,7 +1543,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1447 * invoking force_quiescent_state() if the newly enqueued callback 1543 * invoking force_quiescent_state() if the newly enqueued callback
1448 * is the only one waiting for a grace period to complete. 1544 * is the only one waiting for a grace period to complete.
1449 */ 1545 */
1450 if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 1546 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1451 1547
1452 /* Are we ignoring a completed grace period? */ 1548 /* Are we ignoring a completed grace period? */
1453 rcu_process_gp_end(rsp, rdp); 1549 rcu_process_gp_end(rsp, rdp);
@@ -1583,7 +1679,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1583 * or RCU-bh, force a local reschedule. 1679 * or RCU-bh, force a local reschedule.
1584 */ 1680 */
1585 rdp->n_rp_qs_pending++; 1681 rdp->n_rp_qs_pending++;
1586 if (!rdp->preemptable && 1682 if (!rdp->preemptible &&
1587 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, 1683 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1588 jiffies)) 1684 jiffies))
1589 set_need_resched(); 1685 set_need_resched();
@@ -1760,7 +1856,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1760 * that this CPU cannot possibly have any RCU callbacks in flight yet. 1856 * that this CPU cannot possibly have any RCU callbacks in flight yet.
1761 */ 1857 */
1762static void __cpuinit 1858static void __cpuinit
1763rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) 1859rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1764{ 1860{
1765 unsigned long flags; 1861 unsigned long flags;
1766 unsigned long mask; 1862 unsigned long mask;
@@ -1772,7 +1868,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1772 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1868 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1773 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1869 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1774 rdp->beenonline = 1; /* We have now been online. */ 1870 rdp->beenonline = 1; /* We have now been online. */
1775 rdp->preemptable = preemptable; 1871 rdp->preemptible = preemptible;
1776 rdp->qlen_last_fqs_check = 0; 1872 rdp->qlen_last_fqs_check = 0;
1777 rdp->n_force_qs_snap = rsp->n_force_qs; 1873 rdp->n_force_qs_snap = rsp->n_force_qs;
1778 rdp->blimit = blimit; 1874 rdp->blimit = blimit;
@@ -1806,7 +1902,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1806 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 1902 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1807} 1903}
1808 1904
1809static void __cpuinit rcu_online_cpu(int cpu) 1905static void __cpuinit rcu_prepare_cpu(int cpu)
1810{ 1906{
1811 rcu_init_percpu_data(cpu, &rcu_sched_state, 0); 1907 rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
1812 rcu_init_percpu_data(cpu, &rcu_bh_state, 0); 1908 rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
@@ -1820,11 +1916,23 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1820 unsigned long action, void *hcpu) 1916 unsigned long action, void *hcpu)
1821{ 1917{
1822 long cpu = (long)hcpu; 1918 long cpu = (long)hcpu;
1919 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1920 struct rcu_node *rnp = rdp->mynode;
1823 1921
1824 switch (action) { 1922 switch (action) {
1825 case CPU_UP_PREPARE: 1923 case CPU_UP_PREPARE:
1826 case CPU_UP_PREPARE_FROZEN: 1924 case CPU_UP_PREPARE_FROZEN:
1827 rcu_online_cpu(cpu); 1925 rcu_prepare_cpu(cpu);
1926 rcu_prepare_kthreads(cpu);
1927 break;
1928 case CPU_ONLINE:
1929 case CPU_DOWN_FAILED:
1930 rcu_node_kthread_setaffinity(rnp, -1);
1931 rcu_cpu_kthread_setrt(cpu, 1);
1932 break;
1933 case CPU_DOWN_PREPARE:
1934 rcu_node_kthread_setaffinity(rnp, cpu);
1935 rcu_cpu_kthread_setrt(cpu, 0);
1828 break; 1936 break;
1829 case CPU_DYING: 1937 case CPU_DYING:
1830 case CPU_DYING_FROZEN: 1938 case CPU_DYING_FROZEN:
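
rcu_cpu_notify() grows three new cases so that the per-CPU and per-node kthreads follow CPU hotplug: kthreads are created at CPU_UP_PREPARE time, bound and switched to real-time priority once the CPU is actually online (or a failed offline is rolled back), and unbound and de-prioritized when an offline begins. For reference, the general shape of such a notifier using only the standard hotplug notifier API; the demo_* names are invented and the case bodies are placeholders:

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/init.h>

static int __cpuinit demo_cpu_notify(struct notifier_block *self,
				     unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;

	pr_debug("hotplug action %lu for cpu %ld\n", action, cpu);
	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		/* allocate per-CPU state, spawn per-CPU kthreads */
		break;
	case CPU_ONLINE:
	case CPU_DOWN_FAILED:
		/* CPU is (or remains) usable: bind kthreads, raise priority */
		break;
	case CPU_DOWN_PREPARE:
		/* CPU is going away: push kthreads elsewhere, drop priority */
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block demo_cpu_nb __cpuinitdata = {
	.notifier_call = demo_cpu_notify,
};

static int __init demo_hotplug_init(void)
{
	register_cpu_notifier(&demo_cpu_nb);
	return 0;
}
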
@@ -1943,10 +2051,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
1943 j / rsp->levelspread[i - 1]; 2051 j / rsp->levelspread[i - 1];
1944 } 2052 }
1945 rnp->level = i; 2053 rnp->level = i;
1946 INIT_LIST_HEAD(&rnp->blocked_tasks[0]); 2054 INIT_LIST_HEAD(&rnp->blkd_tasks);
1947 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1948 INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
1949 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1950 } 2055 }
1951 } 2056 }
1952 2057
@@ -1968,7 +2073,7 @@ void __init rcu_init(void)
1968 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 2073 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
1969 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 2074 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
1970 __rcu_init_preempt(); 2075 __rcu_init_preempt();
1971 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 2076 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1972 2077
1973 /* 2078 /*
1974 * We don't need protection against CPU-hotplug here because 2079 * We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index e8f057e44e3e..01b2ccda26fb 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,13 +84,19 @@
84 * Dynticks per-CPU state. 84 * Dynticks per-CPU state.
85 */ 85 */
86struct rcu_dynticks { 86struct rcu_dynticks {
87 int dynticks_nesting; /* Track nesting level, sort of. */ 87 int dynticks_nesting; /* Track irq/process nesting level. */
88 int dynticks; /* Even value for dynticks-idle, else odd. */ 88 int dynticks_nmi_nesting; /* Track NMI nesting level. */
89 int dynticks_nmi; /* Even value for either dynticks-idle or */ 89 atomic_t dynticks; /* Even value for dynticks-idle, else odd. */
90 /* not in nmi handler, else odd. So this */
91 /* remains even for nmi from irq handler. */
92}; 90};
93 91
92/* RCU's kthread states for tracing. */
93#define RCU_KTHREAD_STOPPED 0
94#define RCU_KTHREAD_RUNNING 1
95#define RCU_KTHREAD_WAITING 2
96#define RCU_KTHREAD_OFFCPU 3
97#define RCU_KTHREAD_YIELDING 4
98#define RCU_KTHREAD_MAX 4
99
94/* 100/*
95 * Definition for node within the RCU grace-period-detection hierarchy. 101 * Definition for node within the RCU grace-period-detection hierarchy.
96 */ 102 */
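With dynticks now a single atomic_t, parity alone encodes the idle state: the counter is even while the CPU is in dynticks-idle and odd otherwise, and every idle entry or exit increments it. The sketch below models that protocol with C11 atomics; the names are hypothetical and the memory-ordering subtleties of the real implementation are omitted.

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_int dynticks = 1;                 /* odd: not idle initially */

    static void idle_enter(void) { atomic_fetch_add(&dynticks, 1); } /* -> even */
    static void idle_exit(void)  { atomic_fetch_add(&dynticks, 1); } /* -> odd  */

    /* Snapshot taken by force_quiescent_state()-style sampling. */
    static int dynticks_snap(void) { return atomic_load(&dynticks); }

    /*
     * The sampled CPU counts as quiescent if it was idle at the snapshot
     * (even value) or has changed state since (counter moved).
     */
    static bool passed_quiescent_state(int snap)
    {
            return (snap & 0x1) == 0 || atomic_load(&dynticks) != snap;
    }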
@@ -109,10 +115,13 @@ struct rcu_node {
109 /* an rcu_data structure, otherwise, each */ 115 /* an rcu_data structure, otherwise, each */
110 /* bit corresponds to a child rcu_node */ 116 /* bit corresponds to a child rcu_node */
111 /* structure. */ 117 /* structure. */
112 unsigned long expmask; /* Groups that have ->blocked_tasks[] */ 118 unsigned long expmask; /* Groups that have ->blkd_tasks */
113 /* elements that need to drain to allow the */ 119 /* elements that need to drain to allow the */
114 /* current expedited grace period to */ 120 /* current expedited grace period to */
115 /* complete (only for TREE_PREEMPT_RCU). */ 121 /* complete (only for TREE_PREEMPT_RCU). */
122 atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
123 /* Since this has meaning only for leaf */
124 /* rcu_node structures, 32 bits suffices. */
116 unsigned long qsmaskinit; 125 unsigned long qsmaskinit;
117 /* Per-GP initial value for qsmask & expmask. */ 126 /* Per-GP initial value for qsmask & expmask. */
118 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 127 unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -122,11 +131,62 @@ struct rcu_node {
122 u8 grpnum; /* CPU/group number for next level up. */ 131 u8 grpnum; /* CPU/group number for next level up. */
123 u8 level; /* root is at level 0. */ 132 u8 level; /* root is at level 0. */
124 struct rcu_node *parent; 133 struct rcu_node *parent;
125 struct list_head blocked_tasks[4]; 134 struct list_head blkd_tasks;
126 /* Tasks blocked in RCU read-side critsect. */ 135 /* Tasks blocked in RCU read-side critical */
127 /* Grace period number (->gpnum) x blocked */ 136 /* section. Tasks are placed at the head */
128 /* by tasks on the (x & 0x1) element of the */ 137 /* of this list and age towards the tail. */
129 /* blocked_tasks[] array. */ 138 struct list_head *gp_tasks;
139 /* Pointer to the first task blocking the */
140 /* current grace period, or NULL if there */
141 /* is no such task. */
142 struct list_head *exp_tasks;
143 /* Pointer to the first task blocking the */
144 /* current expedited grace period, or NULL */
145 /* if there is no such task. If there */
146 /* is no current expedited grace period, */
147 /* then there cannot be any such task. */

148#ifdef CONFIG_RCU_BOOST
149 struct list_head *boost_tasks;
150 /* Pointer to first task that needs to be */
151 /* priority boosted, or NULL if no priority */
152 /* boosting is needed for this rcu_node */
153 /* structure. If there are no tasks */
154 /* queued on this rcu_node structure that */
155 /* are blocking the current grace period, */
156 /* there can be no such task. */
157 unsigned long boost_time;
158 /* When to start boosting (jiffies). */
159 struct task_struct *boost_kthread_task;
160 /* kthread that takes care of priority */
161 /* boosting for this rcu_node structure. */
162 unsigned int boost_kthread_status;
163 /* State of boost_kthread_task for tracing. */
164 unsigned long n_tasks_boosted;
165 /* Total number of tasks boosted. */
166 unsigned long n_exp_boosts;
167 /* Number of tasks boosted for expedited GP. */
168 unsigned long n_normal_boosts;
169 /* Number of tasks boosted for normal GP. */
170 unsigned long n_balk_blkd_tasks;
171 /* Refused to boost: no blocked tasks. */
172 unsigned long n_balk_exp_gp_tasks;
173 /* Refused to boost: nothing blocking GP. */
174 unsigned long n_balk_boost_tasks;
175 /* Refused to boost: already boosting. */
176 unsigned long n_balk_notblocked;
177 /* Refused to boost: RCU RS CS still running. */
178 unsigned long n_balk_notyet;
179 /* Refused to boost: not yet time. */
180 unsigned long n_balk_nos;
181 /* Refused to boost: not sure why, though. */
182 /* This can happen due to race conditions. */
183#endif /* #ifdef CONFIG_RCU_BOOST */
184 struct task_struct *node_kthread_task;
185 /* kthread that takes care of this rcu_node */
186 /* structure, for example, awakening the */
187 /* per-CPU kthreads as needed. */
188 unsigned int node_kthread_status;
189 /* State of node_kthread_task for tracing. */
130} ____cacheline_internodealigned_in_smp; 190} ____cacheline_internodealigned_in_smp;
131 191
132/* 192/*
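The four blocked_tasks[] lists are replaced by a single ->blkd_tasks list plus cursors into it: ->gp_tasks marks the first task blocking the current grace period, ->exp_tasks the first task blocking the current expedited grace period, and (with RCU_BOOST) ->boost_tasks the next boost candidate. New blockers are queued at the head, so every task from a cursor to the tail blocks the corresponding grace period. Below is a minimal user-space model of the list and of the enqueue rule used later by rcu_preempt_note_context_switch(); all types and names are hypothetical stand-ins for the kernel's.

    #include <stddef.h>

    struct list_head { struct list_head *next, *prev; };

    /* Insert @new immediately after @pos (kernel-style list_add()). */
    static void list_add(struct list_head *new, struct list_head *pos)
    {
            new->next = pos->next;
            new->prev = pos;
            pos->next->prev = new;
            pos->next = new;
    }

    struct node {
            struct list_head blkd_tasks;   /* head; tasks age toward the tail */
            struct list_head *gp_tasks;    /* first task blocking current GP  */
    };

    /*
     * Queue a task that just blocked in an RCU read-side critical section.
     * If its CPU still owes the current GP a quiescent state, the task must
     * land no later than *gp_tasks and becomes the new gp_tasks; otherwise
     * it blocks only the next GP and simply goes at the head.
     */
    static void enqueue_blocked(struct node *n, struct list_head *entry,
                                int blocks_current_gp)
    {
            if (blocks_current_gp && n->gp_tasks != NULL) {
                    list_add(entry, n->gp_tasks->prev);  /* just before gp_tasks */
                    n->gp_tasks = entry;
            } else {
                    list_add(entry, &n->blkd_tasks);     /* at the head */
                    if (blocks_current_gp)
                            n->gp_tasks = entry;
            }
    }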
@@ -175,7 +235,7 @@ struct rcu_data {
175 bool passed_quiesc; /* User-mode/idle loop etc. */ 235 bool passed_quiesc; /* User-mode/idle loop etc. */
176 bool qs_pending; /* Core waits for quiesc state. */ 236 bool qs_pending; /* Core waits for quiesc state. */
177 bool beenonline; /* CPU online at least once. */ 237 bool beenonline; /* CPU online at least once. */
178 bool preemptable; /* Preemptable RCU? */ 238 bool preemptible; /* Preemptible RCU? */
179 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 239 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
180 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 240 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
181 241
@@ -218,7 +278,6 @@ struct rcu_data {
218 /* 3) dynticks interface. */ 278 /* 3) dynticks interface. */
219 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ 279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
220 int dynticks_snap; /* Per-GP tracking for dynticks. */ 280 int dynticks_snap; /* Per-GP tracking for dynticks. */
221 int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
222#endif /* #ifdef CONFIG_NO_HZ */ 281#endif /* #ifdef CONFIG_NO_HZ */
223 282
224 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 283 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
@@ -254,7 +313,6 @@ struct rcu_data {
254#endif /* #else #ifdef CONFIG_NO_HZ */ 313#endif /* #else #ifdef CONFIG_NO_HZ */
255 314
256#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 315#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
257#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
258 316
259#ifdef CONFIG_PROVE_RCU 317#ifdef CONFIG_PROVE_RCU
260#define RCU_STALL_DELAY_DELTA (5 * HZ) 318#define RCU_STALL_DELAY_DELTA (5 * HZ)
@@ -272,13 +330,16 @@ struct rcu_data {
272 /* scheduling clock irq */ 330 /* scheduling clock irq */
273 /* before ratting on them. */ 331 /* before ratting on them. */
274 332
275#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE 333#define rcu_wait(cond) \
276#define RCU_CPU_STALL_SUPPRESS_INIT 0 334do { \
277#else 335 for (;;) { \
278#define RCU_CPU_STALL_SUPPRESS_INIT 1 336 set_current_state(TASK_INTERRUPTIBLE); \
279#endif 337 if (cond) \
280 338 break; \
281#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 339 schedule(); \
340 } \
341 __set_current_state(TASK_RUNNING); \
342} while (0)
282 343
283/* 344/*
284 * RCU global state, including node hierarchy. This hierarchy is 345 * RCU global state, including node hierarchy. This hierarchy is
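rcu_wait(cond) is the standard kthread wait loop: the task marks itself TASK_INTERRUPTIBLE before testing the condition, so a wakeup arriving between the test and schedule() simply leaves it runnable rather than being lost. The pthread sketch below follows the same publish-then-recheck discipline; it is only an analogue with hypothetical names, since user space has no set_current_state()/schedule().

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
    static bool have_work;

    static void wait_for_work(void)                 /* ~ rcu_wait(have_work) */
    {
            pthread_mutex_lock(&lock);
            while (!have_work)                      /* re-check under the lock */
                    pthread_cond_wait(&cond, &lock);
            pthread_mutex_unlock(&lock);
    }

    static void hand_off_work(void)                 /* ~ wake_up_process() */
    {
            pthread_mutex_lock(&lock);
            have_work = true;
            pthread_cond_signal(&cond);
            pthread_mutex_unlock(&lock);
    }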
@@ -308,6 +369,7 @@ struct rcu_state {
308 /* period because */ 369 /* period because */
309 /* force_quiescent_state() */ 370 /* force_quiescent_state() */
310 /* was running. */ 371 /* was running. */
372 u8 boost; /* Subject to priority boost. */
311 unsigned long gpnum; /* Current gp number. */ 373 unsigned long gpnum; /* Current gp number. */
312 unsigned long completed; /* # of last completed gp. */ 374 unsigned long completed; /* # of last completed gp. */
313 375
@@ -325,12 +387,12 @@ struct rcu_state {
325 /* due to lock unavailable. */ 387 /* due to lock unavailable. */
326 unsigned long n_force_qs_ngp; /* Number of calls leaving */ 388 unsigned long n_force_qs_ngp; /* Number of calls leaving */
327 /* due to no GP active. */ 389 /* due to no GP active. */
328#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
329 unsigned long gp_start; /* Time at which GP started, */ 390 unsigned long gp_start; /* Time at which GP started, */
330 /* but in jiffies. */ 391 /* but in jiffies. */
331 unsigned long jiffies_stall; /* Time at which to check */ 392 unsigned long jiffies_stall; /* Time at which to check */
332 /* for CPU stalls. */ 393 /* for CPU stalls. */
333#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 394 unsigned long gp_max; /* Maximum GP duration in */
395 /* jiffies. */
334 char *name; /* Name of structure. */ 396 char *name; /* Name of structure. */
335}; 397};
336 398
@@ -361,16 +423,15 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
361static void rcu_bootup_announce(void); 423static void rcu_bootup_announce(void);
362long rcu_batches_completed(void); 424long rcu_batches_completed(void);
363static void rcu_preempt_note_context_switch(int cpu); 425static void rcu_preempt_note_context_switch(int cpu);
364static int rcu_preempted_readers(struct rcu_node *rnp); 426static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
365#ifdef CONFIG_HOTPLUG_CPU 427#ifdef CONFIG_HOTPLUG_CPU
366static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 428static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
367 unsigned long flags); 429 unsigned long flags);
430static void rcu_stop_cpu_kthread(int cpu);
368#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 431#endif /* #ifdef CONFIG_HOTPLUG_CPU */
369#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
370static void rcu_print_detail_task_stall(struct rcu_state *rsp); 432static void rcu_print_detail_task_stall(struct rcu_state *rsp);
371static void rcu_print_task_stall(struct rcu_node *rnp); 433static void rcu_print_task_stall(struct rcu_node *rnp);
372static void rcu_preempt_stall_reset(void); 434static void rcu_preempt_stall_reset(void);
373#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
374static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 435static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
375#ifdef CONFIG_HOTPLUG_CPU 436#ifdef CONFIG_HOTPLUG_CPU
376static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 437static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
@@ -390,5 +451,20 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
390static void rcu_preempt_send_cbs_to_online(void); 451static void rcu_preempt_send_cbs_to_online(void);
391static void __init __rcu_init_preempt(void); 452static void __init __rcu_init_preempt(void);
392static void rcu_needs_cpu_flush(void); 453static void rcu_needs_cpu_flush(void);
454static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
455static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
456static void invoke_rcu_callbacks_kthread(void);
457#ifdef CONFIG_RCU_BOOST
458static void rcu_preempt_do_callbacks(void);
459static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
460 cpumask_var_t cm);
461static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
462 struct rcu_node *rnp,
463 int rnp_index);
464static void invoke_rcu_node_kthread(struct rcu_node *rnp);
465static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
466#endif /* #ifdef CONFIG_RCU_BOOST */
467static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
468static void __cpuinit rcu_prepare_kthreads(int cpu);
393 469
394#endif /* #ifndef RCU_TREE_NONCORE */ 470#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index a3638710dc67..8aafbb80b8b0 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version) 2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic 3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics. 4 * or preemptible semantics.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -54,10 +54,6 @@ static void __init rcu_bootup_announce_oddness(void)
54#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 54#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
55 printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); 55 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
56#endif 56#endif
57#ifndef CONFIG_RCU_CPU_STALL_DETECTOR
58 printk(KERN_INFO
59 "\tRCU-based detection of stalled CPUs is disabled.\n");
60#endif
61#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) 57#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
62 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); 58 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
63#endif 59#endif
@@ -70,7 +66,9 @@ static void __init rcu_bootup_announce_oddness(void)
70 66
71struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 67struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
72DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 68DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
69static struct rcu_state *rcu_state = &rcu_preempt_state;
73 70
71static void rcu_read_unlock_special(struct task_struct *t);
74static int rcu_preempted_readers_exp(struct rcu_node *rnp); 72static int rcu_preempted_readers_exp(struct rcu_node *rnp);
75 73
76/* 74/*
@@ -78,7 +76,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
78 */ 76 */
79static void __init rcu_bootup_announce(void) 77static void __init rcu_bootup_announce(void)
80{ 78{
81 printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n"); 79 printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n");
82 rcu_bootup_announce_oddness(); 80 rcu_bootup_announce_oddness();
83} 81}
84 82
@@ -111,7 +109,7 @@ void rcu_force_quiescent_state(void)
111EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 109EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
112 110
113/* 111/*
114 * Record a preemptable-RCU quiescent state for the specified CPU. Note 112 * Record a preemptible-RCU quiescent state for the specified CPU. Note
115 * that this just means that the task currently running on the CPU is 113 * that this just means that the task currently running on the CPU is
116 * not in a quiescent state. There might be any number of tasks blocked 114 * not in a quiescent state. There might be any number of tasks blocked
117 * while in an RCU read-side critical section. 115 * while in an RCU read-side critical section.
@@ -134,12 +132,12 @@ static void rcu_preempt_qs(int cpu)
134 * We have entered the scheduler, and the current task might soon be 132 * We have entered the scheduler, and the current task might soon be
135 * context-switched away from. If this task is in an RCU read-side 133 * context-switched away from. If this task is in an RCU read-side
136 * critical section, we will no longer be able to rely on the CPU to 134 * critical section, we will no longer be able to rely on the CPU to
137 * record that fact, so we enqueue the task on the appropriate entry 135 * record that fact, so we enqueue the task on the blkd_tasks list.
138 * of the blocked_tasks[] array. The task will dequeue itself when 136 * The task will dequeue itself when it exits the outermost enclosing
139 * it exits the outermost enclosing RCU read-side critical section. 137 * RCU read-side critical section. Therefore, the current grace period
140 * Therefore, the current grace period cannot be permitted to complete 138 * cannot be permitted to complete until the blkd_tasks list entries
141 * until the blocked_tasks[] entry indexed by the low-order bit of 139 * predating the current grace period drain, in other words, until
142 * rnp->gpnum empties. 140 * rnp->gp_tasks becomes NULL.
143 * 141 *
144 * Caller must disable preemption. 142 * Caller must disable preemption.
145 */ 143 */
@@ -147,11 +145,10 @@ static void rcu_preempt_note_context_switch(int cpu)
147{ 145{
148 struct task_struct *t = current; 146 struct task_struct *t = current;
149 unsigned long flags; 147 unsigned long flags;
150 int phase;
151 struct rcu_data *rdp; 148 struct rcu_data *rdp;
152 struct rcu_node *rnp; 149 struct rcu_node *rnp;
153 150
154 if (t->rcu_read_lock_nesting && 151 if (t->rcu_read_lock_nesting > 0 &&
155 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 152 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
156 153
157 /* Possibly blocking in an RCU read-side critical section. */ 154 /* Possibly blocking in an RCU read-side critical section. */
@@ -169,16 +166,39 @@ static void rcu_preempt_note_context_switch(int cpu)
169 * (i.e., this CPU has not yet passed through a quiescent 166 * (i.e., this CPU has not yet passed through a quiescent
170 * state for the current grace period), then as long 167 * state for the current grace period), then as long
171 * as that task remains queued, the current grace period 168 * as that task remains queued, the current grace period
172 * cannot end. 169 * cannot end. Note that there is some uncertainty as
170 * to exactly when the current grace period started.
171 * We take a conservative approach, which can result
172 * in unnecessarily waiting on tasks that started very
173 * slightly after the current grace period began. C'est
174 * la vie!!!
173 * 175 *
174 * But first, note that the current CPU must still be 176 * But first, note that the current CPU must still be
175 * on line! 177 * on line!
176 */ 178 */
177 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); 179 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
178 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 180 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
179 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; 181 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
180 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 182 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
183 rnp->gp_tasks = &t->rcu_node_entry;
184#ifdef CONFIG_RCU_BOOST
185 if (rnp->boost_tasks != NULL)
186 rnp->boost_tasks = rnp->gp_tasks;
187#endif /* #ifdef CONFIG_RCU_BOOST */
188 } else {
189 list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
190 if (rnp->qsmask & rdp->grpmask)
191 rnp->gp_tasks = &t->rcu_node_entry;
192 }
181 raw_spin_unlock_irqrestore(&rnp->lock, flags); 193 raw_spin_unlock_irqrestore(&rnp->lock, flags);
194 } else if (t->rcu_read_lock_nesting < 0 &&
195 t->rcu_read_unlock_special) {
196
197 /*
198 * Complete exit from RCU read-side critical section on
199 * behalf of preempted instance of __rcu_read_unlock().
200 */
201 rcu_read_unlock_special(t);
182 } 202 }
183 203
184 /* 204 /*
@@ -196,7 +216,7 @@ static void rcu_preempt_note_context_switch(int cpu)
196} 216}
197 217
198/* 218/*
199 * Tree-preemptable RCU implementation for rcu_read_lock(). 219 * Tree-preemptible RCU implementation for rcu_read_lock().
200 * Just increment ->rcu_read_lock_nesting, shared state will be updated 220 * Just increment ->rcu_read_lock_nesting, shared state will be updated
201 * if we block. 221 * if we block.
202 */ 222 */
@@ -212,12 +232,9 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
212 * for the specified rcu_node structure. If the caller needs a reliable 232 * for the specified rcu_node structure. If the caller needs a reliable
213 * answer, it must hold the rcu_node's ->lock. 233 * answer, it must hold the rcu_node's ->lock.
214 */ 234 */
215static int rcu_preempted_readers(struct rcu_node *rnp) 235static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
216{ 236{
217 int phase = rnp->gpnum & 0x1; 237 return rnp->gp_tasks != NULL;
218
219 return !list_empty(&rnp->blocked_tasks[phase]) ||
220 !list_empty(&rnp->blocked_tasks[phase + 2]);
221} 238}
222 239
223/* 240/*
@@ -233,7 +250,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
233 unsigned long mask; 250 unsigned long mask;
234 struct rcu_node *rnp_p; 251 struct rcu_node *rnp_p;
235 252
236 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 253 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
237 raw_spin_unlock_irqrestore(&rnp->lock, flags); 254 raw_spin_unlock_irqrestore(&rnp->lock, flags);
238 return; /* Still need more quiescent states! */ 255 return; /* Still need more quiescent states! */
239 } 256 }
@@ -257,15 +274,31 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
257} 274}
258 275
259/* 276/*
277 * Advance a ->blkd_tasks-list pointer to the next entry, returning
278 * NULL instead if at the end of the list.
279 */
280static struct list_head *rcu_next_node_entry(struct task_struct *t,
281 struct rcu_node *rnp)
282{
283 struct list_head *np;
284
285 np = t->rcu_node_entry.next;
286 if (np == &rnp->blkd_tasks)
287 np = NULL;
288 return np;
289}
290
291/*
260 * Handle special cases during rcu_read_unlock(), such as needing to 292 * Handle special cases during rcu_read_unlock(), such as needing to
261 * notify RCU core processing or task having blocked during the RCU 293 * notify RCU core processing or task having blocked during the RCU
262 * read-side critical section. 294 * read-side critical section.
263 */ 295 */
264static void rcu_read_unlock_special(struct task_struct *t) 296static noinline void rcu_read_unlock_special(struct task_struct *t)
265{ 297{
266 int empty; 298 int empty;
267 int empty_exp; 299 int empty_exp;
268 unsigned long flags; 300 unsigned long flags;
301 struct list_head *np;
269 struct rcu_node *rnp; 302 struct rcu_node *rnp;
270 int special; 303 int special;
271 304
@@ -285,7 +318,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
285 } 318 }
286 319
287 /* Hardware IRQ handlers cannot block. */ 320 /* Hardware IRQ handlers cannot block. */
288 if (in_irq()) { 321 if (in_irq() || in_serving_softirq()) {
289 local_irq_restore(flags); 322 local_irq_restore(flags);
290 return; 323 return;
291 } 324 }
@@ -306,10 +339,24 @@ static void rcu_read_unlock_special(struct task_struct *t)
306 break; 339 break;
307 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 340 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
308 } 341 }
309 empty = !rcu_preempted_readers(rnp); 342 empty = !rcu_preempt_blocked_readers_cgp(rnp);
310 empty_exp = !rcu_preempted_readers_exp(rnp); 343 empty_exp = !rcu_preempted_readers_exp(rnp);
311 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 344 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
345 np = rcu_next_node_entry(t, rnp);
312 list_del_init(&t->rcu_node_entry); 346 list_del_init(&t->rcu_node_entry);
347 if (&t->rcu_node_entry == rnp->gp_tasks)
348 rnp->gp_tasks = np;
349 if (&t->rcu_node_entry == rnp->exp_tasks)
350 rnp->exp_tasks = np;
351#ifdef CONFIG_RCU_BOOST
352 if (&t->rcu_node_entry == rnp->boost_tasks)
353 rnp->boost_tasks = np;
354 /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */
355 if (t->rcu_boosted) {
356 special |= RCU_READ_UNLOCK_BOOSTED;
357 t->rcu_boosted = 0;
358 }
359#endif /* #ifdef CONFIG_RCU_BOOST */
313 t->rcu_blocked_node = NULL; 360 t->rcu_blocked_node = NULL;
314 361
315 /* 362 /*
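On the removal side, rcu_read_unlock_special() first computes the departing entry's successor via rcu_next_node_entry() (mapping "end of list" to NULL), advances any cursor that pointed at the entry, and then unlinks it. A hedged sketch of that bookkeeping, using the same hypothetical list types as the earlier model:

    #include <stddef.h>

    struct list_head { struct list_head *next, *prev; };

    static void list_del(struct list_head *e)
    {
            e->prev->next = e->next;
            e->next->prev = e->prev;
            e->next = e->prev = e;          /* ~ list_del_init() */
    }

    /* ~ rcu_next_node_entry(): next entry, or NULL if @entry is the last one. */
    static struct list_head *next_or_null(struct list_head *entry,
                                          struct list_head *head)
    {
            return entry->next == head ? NULL : entry->next;
    }

    static void dequeue_blocked(struct list_head *blkd_tasks,
                                struct list_head **gp_tasks,
                                struct list_head **exp_tasks,
                                struct list_head *entry)
    {
            struct list_head *np = next_or_null(entry, blkd_tasks);

            if (*gp_tasks == entry)
                    *gp_tasks = np;         /* GP now waits on the next task */
            if (*exp_tasks == entry)
                    *exp_tasks = np;        /* likewise for the expedited GP */
            list_del(entry);
    }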
@@ -322,6 +369,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
322 else 369 else
323 rcu_report_unblock_qs_rnp(rnp, flags); 370 rcu_report_unblock_qs_rnp(rnp, flags);
324 371
372#ifdef CONFIG_RCU_BOOST
373 /* Unboost if we were boosted. */
374 if (special & RCU_READ_UNLOCK_BOOSTED) {
375 rt_mutex_unlock(t->rcu_boost_mutex);
376 t->rcu_boost_mutex = NULL;
377 }
378#endif /* #ifdef CONFIG_RCU_BOOST */
379
325 /* 380 /*
326 * If this was the last task on the expedited lists, 381 * If this was the last task on the expedited lists,
327 * then we need to report up the rcu_node hierarchy. 382 * then we need to report up the rcu_node hierarchy.
@@ -334,7 +389,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
334} 389}
335 390
336/* 391/*
337 * Tree-preemptable RCU implementation for rcu_read_unlock(). 392 * Tree-preemptible RCU implementation for rcu_read_unlock().
338 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost 393 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
339 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then 394 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
340 * invoke rcu_read_unlock_special() to clean up after a context switch 395 * invoke rcu_read_unlock_special() to clean up after a context switch
@@ -345,19 +400,26 @@ void __rcu_read_unlock(void)
345 struct task_struct *t = current; 400 struct task_struct *t = current;
346 401
347 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ 402 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
348 --t->rcu_read_lock_nesting; 403 if (t->rcu_read_lock_nesting != 1)
349 barrier(); /* decrement before load of ->rcu_read_unlock_special */ 404 --t->rcu_read_lock_nesting;
350 if (t->rcu_read_lock_nesting == 0 && 405 else {
351 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 406 t->rcu_read_lock_nesting = INT_MIN;
352 rcu_read_unlock_special(t); 407 barrier(); /* assign before ->rcu_read_unlock_special load */
408 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
409 rcu_read_unlock_special(t);
410 barrier(); /* ->rcu_read_unlock_special load before assign */
411 t->rcu_read_lock_nesting = 0;
412 }
353#ifdef CONFIG_PROVE_LOCKING 413#ifdef CONFIG_PROVE_LOCKING
354 WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0); 414 {
415 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
416
417 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
418 }
355#endif /* #ifdef CONFIG_PROVE_LOCKING */ 419#endif /* #ifdef CONFIG_PROVE_LOCKING */
356} 420}
357EXPORT_SYMBOL_GPL(__rcu_read_unlock); 421EXPORT_SYMBOL_GPL(__rcu_read_unlock);
358 422
359#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
360
361#ifdef CONFIG_RCU_CPU_STALL_VERBOSE 423#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
362 424
363/* 425/*
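__rcu_read_unlock() now parks ->rcu_read_lock_nesting at INT_MIN for the duration of the special-case processing. If the task is preempted in that window, rcu_preempt_note_context_switch() sees a negative nesting count with ->rcu_read_unlock_special still set and completes the cleanup on the preempted unlock's behalf instead of recursing into it. The sketch below models the sentinel idea with hypothetical types; the barriers and per-task fields of the real code are elided.

    #include <limits.h>
    #include <stdbool.h>

    struct reader {
            int  nesting;           /* ~ t->rcu_read_lock_nesting   */
            bool special;           /* ~ t->rcu_read_unlock_special */
    };

    static void unlock_special(struct reader *r)
    {
            /* Dequeue from the blocked-tasks list, report the QS, etc. */
            r->special = false;
    }

    static void read_unlock(struct reader *r)
    {
            if (r->nesting != 1) {
                    --r->nesting;                   /* not the outermost unlock */
            } else {
                    r->nesting = INT_MIN;           /* park: unlock in progress */
                    if (r->special)
                            unlock_special(r);
                    r->nesting = 0;                 /* fully outside the CS */
            }
    }

    /* Scheduler path: if it preempts read_unlock() after the sentinel was
     * set, finish the special-case work on the preempted unlock's behalf. */
    static void note_context_switch(struct reader *r)
    {
            if (r->nesting < 0 && r->special)
                    unlock_special(r);
    }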
@@ -367,18 +429,16 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock);
367static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) 429static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
368{ 430{
369 unsigned long flags; 431 unsigned long flags;
370 struct list_head *lp;
371 int phase;
372 struct task_struct *t; 432 struct task_struct *t;
373 433
374 if (rcu_preempted_readers(rnp)) { 434 if (!rcu_preempt_blocked_readers_cgp(rnp))
375 raw_spin_lock_irqsave(&rnp->lock, flags); 435 return;
376 phase = rnp->gpnum & 0x1; 436 raw_spin_lock_irqsave(&rnp->lock, flags);
377 lp = &rnp->blocked_tasks[phase]; 437 t = list_entry(rnp->gp_tasks,
378 list_for_each_entry(t, lp, rcu_node_entry) 438 struct task_struct, rcu_node_entry);
379 sched_show_task(t); 439 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
380 raw_spin_unlock_irqrestore(&rnp->lock, flags); 440 sched_show_task(t);
381 } 441 raw_spin_unlock_irqrestore(&rnp->lock, flags);
382} 442}
383 443
384/* 444/*
@@ -408,16 +468,14 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
408 */ 468 */
409static void rcu_print_task_stall(struct rcu_node *rnp) 469static void rcu_print_task_stall(struct rcu_node *rnp)
410{ 470{
411 struct list_head *lp;
412 int phase;
413 struct task_struct *t; 471 struct task_struct *t;
414 472
415 if (rcu_preempted_readers(rnp)) { 473 if (!rcu_preempt_blocked_readers_cgp(rnp))
416 phase = rnp->gpnum & 0x1; 474 return;
417 lp = &rnp->blocked_tasks[phase]; 475 t = list_entry(rnp->gp_tasks,
418 list_for_each_entry(t, lp, rcu_node_entry) 476 struct task_struct, rcu_node_entry);
419 printk(" P%d", t->pid); 477 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
420 } 478 printk(" P%d", t->pid);
421} 479}
422 480
423/* 481/*
@@ -430,18 +488,21 @@ static void rcu_preempt_stall_reset(void)
430 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; 488 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
431} 489}
432 490
433#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
434
435/* 491/*
436 * Check that the list of blocked tasks for the newly completed grace 492 * Check that the list of blocked tasks for the newly completed grace
437 * period is in fact empty. It is a serious bug to complete a grace 493 * period is in fact empty. It is a serious bug to complete a grace
438 * period that still has RCU readers blocked! This function must be 494 * period that still has RCU readers blocked! This function must be
439 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock 495 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
440 * must be held by the caller. 496 * must be held by the caller.
497 *
498 * Also, if there are blocked tasks on the list, they automatically
499 * block the newly created grace period, so set up ->gp_tasks accordingly.
441 */ 500 */
442static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 501static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
443{ 502{
444 WARN_ON_ONCE(rcu_preempted_readers(rnp)); 503 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
504 if (!list_empty(&rnp->blkd_tasks))
505 rnp->gp_tasks = rnp->blkd_tasks.next;
445 WARN_ON_ONCE(rnp->qsmask); 506 WARN_ON_ONCE(rnp->qsmask);
446} 507}
447 508
@@ -465,50 +526,68 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
465 struct rcu_node *rnp, 526 struct rcu_node *rnp,
466 struct rcu_data *rdp) 527 struct rcu_data *rdp)
467{ 528{
468 int i;
469 struct list_head *lp; 529 struct list_head *lp;
470 struct list_head *lp_root; 530 struct list_head *lp_root;
471 int retval = 0; 531 int retval = 0;
472 struct rcu_node *rnp_root = rcu_get_root(rsp); 532 struct rcu_node *rnp_root = rcu_get_root(rsp);
473 struct task_struct *tp; 533 struct task_struct *t;
474 534
475 if (rnp == rnp_root) { 535 if (rnp == rnp_root) {
476 WARN_ONCE(1, "Last CPU thought to be offlined?"); 536 WARN_ONCE(1, "Last CPU thought to be offlined?");
477 return 0; /* Shouldn't happen: at least one CPU online. */ 537 return 0; /* Shouldn't happen: at least one CPU online. */
478 } 538 }
479 WARN_ON_ONCE(rnp != rdp->mynode && 539
480 (!list_empty(&rnp->blocked_tasks[0]) || 540 /* If we are on an internal node, complain bitterly. */
481 !list_empty(&rnp->blocked_tasks[1]) || 541 WARN_ON_ONCE(rnp != rdp->mynode);
482 !list_empty(&rnp->blocked_tasks[2]) ||
483 !list_empty(&rnp->blocked_tasks[3])));
484 542
485 /* 543 /*
486 * Move tasks up to root rcu_node. Rely on the fact that the 544 * Move tasks up to root rcu_node. Don't try to get fancy for
487 * root rcu_node can be at most one ahead of the rest of the 545 * this corner-case operation -- just put this node's tasks
488 * rcu_nodes in terms of gp_num value. This fact allows us to 546 * at the head of the root node's list, and update the root node's
489 * move the blocked_tasks[] array directly, element by element. 547 * ->gp_tasks and ->exp_tasks pointers to those of this node,
548 * if non-NULL. This might result in waiting for more tasks than
549 * absolutely necessary, but this is a good performance/complexity
550 * tradeoff.
490 */ 551 */
491 if (rcu_preempted_readers(rnp)) 552 if (rcu_preempt_blocked_readers_cgp(rnp))
492 retval |= RCU_OFL_TASKS_NORM_GP; 553 retval |= RCU_OFL_TASKS_NORM_GP;
493 if (rcu_preempted_readers_exp(rnp)) 554 if (rcu_preempted_readers_exp(rnp))
494 retval |= RCU_OFL_TASKS_EXP_GP; 555 retval |= RCU_OFL_TASKS_EXP_GP;
495 for (i = 0; i < 4; i++) { 556 lp = &rnp->blkd_tasks;
496 lp = &rnp->blocked_tasks[i]; 557 lp_root = &rnp_root->blkd_tasks;
497 lp_root = &rnp_root->blocked_tasks[i]; 558 while (!list_empty(lp)) {
498 while (!list_empty(lp)) { 559 t = list_entry(lp->next, typeof(*t), rcu_node_entry);
499 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); 560 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
500 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 561 list_del(&t->rcu_node_entry);
501 list_del(&tp->rcu_node_entry); 562 t->rcu_blocked_node = rnp_root;
502 tp->rcu_blocked_node = rnp_root; 563 list_add(&t->rcu_node_entry, lp_root);
503 list_add(&tp->rcu_node_entry, lp_root); 564 if (&t->rcu_node_entry == rnp->gp_tasks)
504 raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 565 rnp_root->gp_tasks = rnp->gp_tasks;
505 } 566 if (&t->rcu_node_entry == rnp->exp_tasks)
567 rnp_root->exp_tasks = rnp->exp_tasks;
568#ifdef CONFIG_RCU_BOOST
569 if (&t->rcu_node_entry == rnp->boost_tasks)
570 rnp_root->boost_tasks = rnp->boost_tasks;
571#endif /* #ifdef CONFIG_RCU_BOOST */
572 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
506 } 573 }
574
575#ifdef CONFIG_RCU_BOOST
576 /* In case root is being boosted and leaf is not. */
577 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
578 if (rnp_root->boost_tasks != NULL &&
579 rnp_root->boost_tasks != rnp_root->gp_tasks)
580 rnp_root->boost_tasks = rnp_root->gp_tasks;
581 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
582#endif /* #ifdef CONFIG_RCU_BOOST */
583
584 rnp->gp_tasks = NULL;
585 rnp->exp_tasks = NULL;
507 return retval; 586 return retval;
508} 587}
509 588
510/* 589/*
511 * Do CPU-offline processing for preemptable RCU. 590 * Do CPU-offline processing for preemptible RCU.
512 */ 591 */
513static void rcu_preempt_offline_cpu(int cpu) 592static void rcu_preempt_offline_cpu(int cpu)
514{ 593{
@@ -532,12 +611,13 @@ static void rcu_preempt_check_callbacks(int cpu)
532 rcu_preempt_qs(cpu); 611 rcu_preempt_qs(cpu);
533 return; 612 return;
534 } 613 }
535 if (per_cpu(rcu_preempt_data, cpu).qs_pending) 614 if (t->rcu_read_lock_nesting > 0 &&
615 per_cpu(rcu_preempt_data, cpu).qs_pending)
536 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 616 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
537} 617}
538 618
539/* 619/*
540 * Process callbacks for preemptable RCU. 620 * Process callbacks for preemptible RCU.
541 */ 621 */
542static void rcu_preempt_process_callbacks(void) 622static void rcu_preempt_process_callbacks(void)
543{ 623{
@@ -545,8 +625,17 @@ static void rcu_preempt_process_callbacks(void)
545 &__get_cpu_var(rcu_preempt_data)); 625 &__get_cpu_var(rcu_preempt_data));
546} 626}
547 627
628#ifdef CONFIG_RCU_BOOST
629
630static void rcu_preempt_do_callbacks(void)
631{
632 rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data));
633}
634
635#endif /* #ifdef CONFIG_RCU_BOOST */
636
548/* 637/*
549 * Queue a preemptable-RCU callback for invocation after a grace period. 638 * Queue a preemptible-RCU callback for invocation after a grace period.
550 */ 639 */
551void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 640void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
552{ 641{
@@ -594,8 +683,7 @@ static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
594 */ 683 */
595static int rcu_preempted_readers_exp(struct rcu_node *rnp) 684static int rcu_preempted_readers_exp(struct rcu_node *rnp)
596{ 685{
597 return !list_empty(&rnp->blocked_tasks[2]) || 686 return rnp->exp_tasks != NULL;
598 !list_empty(&rnp->blocked_tasks[3]);
599} 687}
600 688
601/* 689/*
@@ -630,9 +718,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
630 718
631 raw_spin_lock_irqsave(&rnp->lock, flags); 719 raw_spin_lock_irqsave(&rnp->lock, flags);
632 for (;;) { 720 for (;;) {
633 if (!sync_rcu_preempt_exp_done(rnp)) 721 if (!sync_rcu_preempt_exp_done(rnp)) {
722 raw_spin_unlock_irqrestore(&rnp->lock, flags);
634 break; 723 break;
724 }
635 if (rnp->parent == NULL) { 725 if (rnp->parent == NULL) {
726 raw_spin_unlock_irqrestore(&rnp->lock, flags);
636 wake_up(&sync_rcu_preempt_exp_wq); 727 wake_up(&sync_rcu_preempt_exp_wq);
637 break; 728 break;
638 } 729 }
@@ -642,7 +733,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
642 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 733 raw_spin_lock(&rnp->lock); /* irqs already disabled */
643 rnp->expmask &= ~mask; 734 rnp->expmask &= ~mask;
644 } 735 }
645 raw_spin_unlock_irqrestore(&rnp->lock, flags);
646} 736}
647 737
648/* 738/*
@@ -655,13 +745,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
655static void 745static void
656sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 746sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
657{ 747{
658 int must_wait; 748 unsigned long flags;
749 int must_wait = 0;
659 750
660 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 751 raw_spin_lock_irqsave(&rnp->lock, flags);
661 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); 752 if (list_empty(&rnp->blkd_tasks))
662 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); 753 raw_spin_unlock_irqrestore(&rnp->lock, flags);
663 must_wait = rcu_preempted_readers_exp(rnp); 754 else {
664 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 755 rnp->exp_tasks = rnp->blkd_tasks.next;
756 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
757 must_wait = 1;
758 }
665 if (!must_wait) 759 if (!must_wait)
666 rcu_report_exp_rnp(rsp, rnp); 760 rcu_report_exp_rnp(rsp, rnp);
667} 761}
@@ -669,9 +763,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
669/* 763/*
670 * Wait for an rcu-preempt grace period, but expedite it. The basic idea 764 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
671 * is to invoke synchronize_sched_expedited() to push all the tasks to 765 * is to invoke synchronize_sched_expedited() to push all the tasks to
672 * the ->blocked_tasks[] lists, move all entries from the first set of 766 * the ->blkd_tasks lists and wait for this list to drain.
673 * ->blocked_tasks[] lists to the second set, and finally wait for this
674 * second set to drain.
675 */ 767 */
676void synchronize_rcu_expedited(void) 768void synchronize_rcu_expedited(void)
677{ 769{
@@ -703,7 +795,7 @@ void synchronize_rcu_expedited(void)
703 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) 795 if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
704 goto unlock_mb_ret; /* Others did our work for us. */ 796 goto unlock_mb_ret; /* Others did our work for us. */
705 797
706 /* force all RCU readers onto blocked_tasks[]. */ 798 /* force all RCU readers onto ->blkd_tasks lists. */
707 synchronize_sched_expedited(); 799 synchronize_sched_expedited();
708 800
709 raw_spin_lock_irqsave(&rsp->onofflock, flags); 801 raw_spin_lock_irqsave(&rsp->onofflock, flags);
@@ -715,7 +807,7 @@ void synchronize_rcu_expedited(void)
715 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 807 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
716 } 808 }
717 809
718 /* Snapshot current state of ->blocked_tasks[] lists. */ 810 /* Snapshot current state of ->blkd_tasks lists. */
719 rcu_for_each_leaf_node(rsp, rnp) 811 rcu_for_each_leaf_node(rsp, rnp)
720 sync_rcu_preempt_exp_init(rsp, rnp); 812 sync_rcu_preempt_exp_init(rsp, rnp);
721 if (NUM_RCU_NODES > 1) 813 if (NUM_RCU_NODES > 1)
@@ -723,7 +815,7 @@ void synchronize_rcu_expedited(void)
723 815
724 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 816 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
725 817
726 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ 818 /* Wait for snapshotted ->blkd_tasks lists to drain. */
727 rnp = rcu_get_root(rsp); 819 rnp = rcu_get_root(rsp);
728 wait_event(sync_rcu_preempt_exp_wq, 820 wait_event(sync_rcu_preempt_exp_wq,
729 sync_rcu_preempt_exp_done(rnp)); 821 sync_rcu_preempt_exp_done(rnp));
@@ -739,7 +831,7 @@ mb_ret:
739EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 831EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
740 832
741/* 833/*
742 * Check to see if there is any immediate preemptable-RCU-related work 834 * Check to see if there is any immediate preemptible-RCU-related work
743 * to be done. 835 * to be done.
744 */ 836 */
745static int rcu_preempt_pending(int cpu) 837static int rcu_preempt_pending(int cpu)
@@ -749,7 +841,7 @@ static int rcu_preempt_pending(int cpu)
749} 841}
750 842
751/* 843/*
752 * Does preemptable RCU need the CPU to stay out of dynticks mode? 844 * Does preemptible RCU need the CPU to stay out of dynticks mode?
753 */ 845 */
754static int rcu_preempt_needs_cpu(int cpu) 846static int rcu_preempt_needs_cpu(int cpu)
755{ 847{
@@ -766,7 +858,7 @@ void rcu_barrier(void)
766EXPORT_SYMBOL_GPL(rcu_barrier); 858EXPORT_SYMBOL_GPL(rcu_barrier);
767 859
768/* 860/*
769 * Initialize preemptable RCU's per-CPU data. 861 * Initialize preemptible RCU's per-CPU data.
770 */ 862 */
771static void __cpuinit rcu_preempt_init_percpu_data(int cpu) 863static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
772{ 864{
@@ -774,7 +866,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
774} 866}
775 867
776/* 868/*
777 * Move preemptable RCU's callbacks from dying CPU to other online CPU. 869 * Move preemptible RCU's callbacks from dying CPU to other online CPU.
778 */ 870 */
779static void rcu_preempt_send_cbs_to_online(void) 871static void rcu_preempt_send_cbs_to_online(void)
780{ 872{
@@ -782,7 +874,7 @@ static void rcu_preempt_send_cbs_to_online(void)
782} 874}
783 875
784/* 876/*
785 * Initialize preemptable RCU's state structures. 877 * Initialize preemptible RCU's state structures.
786 */ 878 */
787static void __init __rcu_init_preempt(void) 879static void __init __rcu_init_preempt(void)
788{ 880{
@@ -790,7 +882,7 @@ static void __init __rcu_init_preempt(void)
790} 882}
791 883
792/* 884/*
793 * Check for a task exiting while in a preemptable-RCU read-side 885 * Check for a task exiting while in a preemptible-RCU read-side
794 * critical section, clean up if so. No need to issue warnings, 886 * critical section, clean up if so. No need to issue warnings,
795 * as debug_check_no_locks_held() already does this if lockdep 887 * as debug_check_no_locks_held() already does this if lockdep
796 * is enabled. 888 * is enabled.
@@ -802,11 +894,13 @@ void exit_rcu(void)
802 if (t->rcu_read_lock_nesting == 0) 894 if (t->rcu_read_lock_nesting == 0)
803 return; 895 return;
804 t->rcu_read_lock_nesting = 1; 896 t->rcu_read_lock_nesting = 1;
805 rcu_read_unlock(); 897 __rcu_read_unlock();
806} 898}
807 899
808#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 900#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
809 901
902static struct rcu_state *rcu_state = &rcu_sched_state;
903
810/* 904/*
811 * Tell them what RCU they are running. 905 * Tell them what RCU they are running.
812 */ 906 */
@@ -836,7 +930,7 @@ void rcu_force_quiescent_state(void)
836EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 930EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
837 931
838/* 932/*
839 * Because preemptable RCU does not exist, we never have to check for 933 * Because preemptible RCU does not exist, we never have to check for
840 * CPUs being in quiescent states. 934 * CPUs being in quiescent states.
841 */ 935 */
842static void rcu_preempt_note_context_switch(int cpu) 936static void rcu_preempt_note_context_switch(int cpu)
@@ -844,10 +938,10 @@ static void rcu_preempt_note_context_switch(int cpu)
844} 938}
845 939
846/* 940/*
847 * Because preemptable RCU does not exist, there are never any preempted 941 * Because preemptible RCU does not exist, there are never any preempted
848 * RCU readers. 942 * RCU readers.
849 */ 943 */
850static int rcu_preempted_readers(struct rcu_node *rnp) 944static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
851{ 945{
852 return 0; 946 return 0;
853} 947}
@@ -862,10 +956,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
862 956
863#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 957#endif /* #ifdef CONFIG_HOTPLUG_CPU */
864 958
865#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
866
867/* 959/*
868 * Because preemptable RCU does not exist, we never have to check for 960 * Because preemptible RCU does not exist, we never have to check for
869 * tasks blocked within RCU read-side critical sections. 961 * tasks blocked within RCU read-side critical sections.
870 */ 962 */
871static void rcu_print_detail_task_stall(struct rcu_state *rsp) 963static void rcu_print_detail_task_stall(struct rcu_state *rsp)
@@ -873,7 +965,7 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
873} 965}
874 966
875/* 967/*
876 * Because preemptable RCU does not exist, we never have to check for 968 * Because preemptible RCU does not exist, we never have to check for
877 * tasks blocked within RCU read-side critical sections. 969 * tasks blocked within RCU read-side critical sections.
878 */ 970 */
879static void rcu_print_task_stall(struct rcu_node *rnp) 971static void rcu_print_task_stall(struct rcu_node *rnp)
@@ -888,10 +980,8 @@ static void rcu_preempt_stall_reset(void)
888{ 980{
889} 981}
890 982
891#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
892
893/* 983/*
894 * Because there is no preemptable RCU, there can be no readers blocked, 984 * Because there is no preemptible RCU, there can be no readers blocked,
895 * so there is no need to check for blocked tasks. So check only for 985 * so there is no need to check for blocked tasks. So check only for
896 * bogus qsmask values. 986 * bogus qsmask values.
897 */ 987 */
@@ -903,7 +993,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
903#ifdef CONFIG_HOTPLUG_CPU 993#ifdef CONFIG_HOTPLUG_CPU
904 994
905/* 995/*
906 * Because preemptable RCU does not exist, it never needs to migrate 996 * Because preemptible RCU does not exist, it never needs to migrate
907 * tasks that were blocked within RCU read-side critical sections, and 997 * tasks that were blocked within RCU read-side critical sections, and
908 * such non-existent tasks cannot possibly have been blocking the current 998 * such non-existent tasks cannot possibly have been blocking the current
909 * grace period. 999 * grace period.
@@ -916,7 +1006,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
916} 1006}
917 1007
918/* 1008/*
919 * Because preemptable RCU does not exist, it never needs CPU-offline 1009 * Because preemptible RCU does not exist, it never needs CPU-offline
920 * processing. 1010 * processing.
921 */ 1011 */
922static void rcu_preempt_offline_cpu(int cpu) 1012static void rcu_preempt_offline_cpu(int cpu)
@@ -926,7 +1016,7 @@ static void rcu_preempt_offline_cpu(int cpu)
926#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1016#endif /* #ifdef CONFIG_HOTPLUG_CPU */
927 1017
928/* 1018/*
929 * Because preemptable RCU does not exist, it never has any callbacks 1019 * Because preemptible RCU does not exist, it never has any callbacks
930 * to check. 1020 * to check.
931 */ 1021 */
932static void rcu_preempt_check_callbacks(int cpu) 1022static void rcu_preempt_check_callbacks(int cpu)
@@ -934,7 +1024,7 @@ static void rcu_preempt_check_callbacks(int cpu)
934} 1024}
935 1025
936/* 1026/*
937 * Because preemptable RCU does not exist, it never has any callbacks 1027 * Because preemptible RCU does not exist, it never has any callbacks
938 * to process. 1028 * to process.
939 */ 1029 */
940static void rcu_preempt_process_callbacks(void) 1030static void rcu_preempt_process_callbacks(void)
@@ -943,7 +1033,7 @@ static void rcu_preempt_process_callbacks(void)
943 1033
944/* 1034/*
945 * Wait for an rcu-preempt grace period, but make it happen quickly. 1035 * Wait for an rcu-preempt grace period, but make it happen quickly.
946 * But because preemptable RCU does not exist, map to rcu-sched. 1036 * But because preemptible RCU does not exist, map to rcu-sched.
947 */ 1037 */
948void synchronize_rcu_expedited(void) 1038void synchronize_rcu_expedited(void)
949{ 1039{
@@ -954,7 +1044,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
954#ifdef CONFIG_HOTPLUG_CPU 1044#ifdef CONFIG_HOTPLUG_CPU
955 1045
956/* 1046/*
957 * Because preemptable RCU does not exist, there is never any need to 1047 * Because preemptible RCU does not exist, there is never any need to
958 * report on tasks preempted in RCU read-side critical sections during 1048 * report on tasks preempted in RCU read-side critical sections during
959 * expedited RCU grace periods. 1049 * expedited RCU grace periods.
960 */ 1050 */
@@ -966,7 +1056,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
966#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1056#endif /* #ifdef CONFIG_HOTPLUG_CPU */
967 1057
968/* 1058/*
969 * Because preemptable RCU does not exist, it never has any work to do. 1059 * Because preemptible RCU does not exist, it never has any work to do.
970 */ 1060 */
971static int rcu_preempt_pending(int cpu) 1061static int rcu_preempt_pending(int cpu)
972{ 1062{
@@ -974,7 +1064,7 @@ static int rcu_preempt_pending(int cpu)
974} 1064}
975 1065
976/* 1066/*
977 * Because preemptable RCU does not exist, it never needs any CPU. 1067 * Because preemptible RCU does not exist, it never needs any CPU.
978 */ 1068 */
979static int rcu_preempt_needs_cpu(int cpu) 1069static int rcu_preempt_needs_cpu(int cpu)
980{ 1070{
@@ -982,7 +1072,7 @@ static int rcu_preempt_needs_cpu(int cpu)
982} 1072}
983 1073
984/* 1074/*
985 * Because preemptable RCU does not exist, rcu_barrier() is just 1075 * Because preemptible RCU does not exist, rcu_barrier() is just
986 * another name for rcu_barrier_sched(). 1076 * another name for rcu_barrier_sched().
987 */ 1077 */
988void rcu_barrier(void) 1078void rcu_barrier(void)
@@ -992,7 +1082,7 @@ void rcu_barrier(void)
992EXPORT_SYMBOL_GPL(rcu_barrier); 1082EXPORT_SYMBOL_GPL(rcu_barrier);
993 1083
994/* 1084/*
995 * Because preemptable RCU does not exist, there is no per-CPU 1085 * Because preemptible RCU does not exist, there is no per-CPU
996 * data to initialize. 1086 * data to initialize.
997 */ 1087 */
998static void __cpuinit rcu_preempt_init_percpu_data(int cpu) 1088static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
@@ -1000,14 +1090,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1000} 1090}
1001 1091
1002/* 1092/*
1003 * Because there is no preemptable RCU, there are no callbacks to move. 1093 * Because there is no preemptible RCU, there are no callbacks to move.
1004 */ 1094 */
1005static void rcu_preempt_send_cbs_to_online(void) 1095static void rcu_preempt_send_cbs_to_online(void)
1006{ 1096{
1007} 1097}
1008 1098
1009/* 1099/*
1010 * Because preemptable RCU does not exist, it need not be initialized. 1100 * Because preemptible RCU does not exist, it need not be initialized.
1011 */ 1101 */
1012static void __init __rcu_init_preempt(void) 1102static void __init __rcu_init_preempt(void)
1013{ 1103{
@@ -1015,6 +1105,665 @@ static void __init __rcu_init_preempt(void)
1015 1105
1016#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1106#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1017 1107
1108#ifdef CONFIG_RCU_BOOST
1109
1110#include "rtmutex_common.h"
1111
1112#ifdef CONFIG_RCU_TRACE
1113
1114static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1115{
1116 if (list_empty(&rnp->blkd_tasks))
1117 rnp->n_balk_blkd_tasks++;
1118 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
1119 rnp->n_balk_exp_gp_tasks++;
1120 else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
1121 rnp->n_balk_boost_tasks++;
1122 else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
1123 rnp->n_balk_notblocked++;
1124 else if (rnp->gp_tasks != NULL &&
1125 ULONG_CMP_LT(jiffies, rnp->boost_time))
1126 rnp->n_balk_notyet++;
1127 else
1128 rnp->n_balk_nos++;
1129}
1130
1131#else /* #ifdef CONFIG_RCU_TRACE */
1132
1133static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1134{
1135}
1136
1137#endif /* #else #ifdef CONFIG_RCU_TRACE */
1138
1139/*
1140 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1141 * or ->boost_tasks, advancing the pointer to the next task in the
1142 * ->blkd_tasks list.
1143 *
1144 * Note that irqs must be enabled: boosting the task can block.
1145 * Returns 1 if there are more tasks needing to be boosted.
1146 */
1147static int rcu_boost(struct rcu_node *rnp)
1148{
1149 unsigned long flags;
1150 struct rt_mutex mtx;
1151 struct task_struct *t;
1152 struct list_head *tb;
1153
1154 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
1155 return 0; /* Nothing left to boost. */
1156
1157 raw_spin_lock_irqsave(&rnp->lock, flags);
1158
1159 /*
1160 * Recheck under the lock: all tasks in need of boosting
1161 * might exit their RCU read-side critical sections on their own.
1162 */
1163 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
1164 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1165 return 0;
1166 }
1167
1168 /*
1169 * Preferentially boost tasks blocking expedited grace periods.
1170 * This cannot starve the normal grace periods because a second
1171 * expedited grace period must boost all blocked tasks, including
1172 * those blocking the pre-existing normal grace period.
1173 */
1174 if (rnp->exp_tasks != NULL) {
1175 tb = rnp->exp_tasks;
1176 rnp->n_exp_boosts++;
1177 } else {
1178 tb = rnp->boost_tasks;
1179 rnp->n_normal_boosts++;
1180 }
1181 rnp->n_tasks_boosted++;
1182
1183 /*
1184 * We boost task t by manufacturing an rt_mutex that appears to
1185 * be held by task t. We leave a pointer to that rt_mutex where
1186 * task t can find it, and task t will release the mutex when it
1187 * exits its outermost RCU read-side critical section. Then
1188 * simply acquiring this artificial rt_mutex will boost task
1189 * t's priority. (Thanks to tglx for suggesting this approach!)
1190 *
1191 * Note that task t must acquire rnp->lock to remove itself from
1192 * the ->blkd_tasks list, which it will do from exit() if from
1193 * nowhere else. We therefore are guaranteed that task t will
1194 * stay around at least until we drop rnp->lock. Note that
1195 * rnp->lock also resolves races between our priority boosting
1196 * and task t's exiting its outermost RCU read-side critical
1197 * section.
1198 */
1199 t = container_of(tb, struct task_struct, rcu_node_entry);
1200 rt_mutex_init_proxy_locked(&mtx, t);
1201 t->rcu_boost_mutex = &mtx;
1202 t->rcu_boosted = 1;
1203 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1204 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1205 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1206
1207 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL;
1208}
1209
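rcu_boost() leans on rt_mutex priority inheritance: it proxy-locks an on-stack rt_mutex on behalf of the blocked reader, then blocks on that mutex itself, so the reader inherits the booster kthread's priority until its outermost rcu_read_unlock() releases the mutex. User space has no proxy-locking API, but the inheritance effect itself can be seen with a POSIX PTHREAD_PRIO_INHERIT mutex, as in the hedged sketch below (thread bodies and names are illustrative only).

    #define _GNU_SOURCE
    #include <pthread.h>

    static pthread_mutex_t boost_mtx;

    static void boost_mtx_init(void)
    {
            pthread_mutexattr_t attr;

            pthread_mutexattr_init(&attr);
            pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
            pthread_mutex_init(&boost_mtx, &attr);
            pthread_mutexattr_destroy(&attr);
    }

    /* Low-priority "reader": holds the mutex for the duration of its critical
     * section, analogous to the task that still owes an rcu_read_unlock(). */
    static void *reader(void *arg)
    {
            pthread_mutex_lock(&boost_mtx);
            /* ... long read-side critical section ... */
            pthread_mutex_unlock(&boost_mtx);   /* "unboost" happens here */
            return NULL;
    }

    /* High-priority "booster": blocking here lends its priority to the holder,
     * much as rt_mutex_lock() in rcu_boost() lends the kthread's priority. */
    static void *booster(void *arg)
    {
            pthread_mutex_lock(&boost_mtx);
            pthread_mutex_unlock(&boost_mtx);
            return NULL;
    }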
1210/*
1211 * Timer handler to initiate waking up of boost kthreads that
1212 * have yielded the CPU due to excessive numbers of tasks to
1213 * boost. We wake up the per-rcu_node kthread, which in turn
1214 * will wake up the booster kthread.
1215 */
1216static void rcu_boost_kthread_timer(unsigned long arg)
1217{
1218 invoke_rcu_node_kthread((struct rcu_node *)arg);
1219}
1220
1221/*
1222 * Priority-boosting kthread. One per leaf rcu_node and one for the
1223 * root rcu_node.
1224 */
1225static int rcu_boost_kthread(void *arg)
1226{
1227 struct rcu_node *rnp = (struct rcu_node *)arg;
1228 int spincnt = 0;
1229 int more2boost;
1230
1231 for (;;) {
1232 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1233 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1234 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1235 more2boost = rcu_boost(rnp);
1236 if (more2boost)
1237 spincnt++;
1238 else
1239 spincnt = 0;
1240 if (spincnt > 10) {
1241 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
1242 spincnt = 0;
1243 }
1244 }
1245 /* NOTREACHED */
1246 return 0;
1247}
1248
1249/*
1250 * Check to see if it is time to start boosting RCU readers that are
1251 * blocking the current grace period, and, if so, tell the per-rcu_node
1252 * kthread to start boosting them. If there is an expedited grace
1253 * period in progress, it is always time to boost.
1254 *
1255 * The caller must hold rnp->lock, which this function releases,
1256 * but irqs remain disabled. The ->boost_kthread_task is immortal,
1257 * so we don't need to worry about it going away.
1258 */
1259static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1260{
1261 struct task_struct *t;
1262
1263 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1264 rnp->n_balk_exp_gp_tasks++;
1265 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1266 return;
1267 }
1268 if (rnp->exp_tasks != NULL ||
1269 (rnp->gp_tasks != NULL &&
1270 rnp->boost_tasks == NULL &&
1271 rnp->qsmask == 0 &&
1272 ULONG_CMP_GE(jiffies, rnp->boost_time))) {
1273 if (rnp->exp_tasks == NULL)
1274 rnp->boost_tasks = rnp->gp_tasks;
1275 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1276 t = rnp->boost_kthread_task;
1277 if (t != NULL)
1278 wake_up_process(t);
1279 } else {
1280 rcu_initiate_boost_trace(rnp);
1281 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1282 }
1283}
1284
1285/*
1286 * Wake up the per-CPU kthread to invoke RCU callbacks.
1287 */
1288static void invoke_rcu_callbacks_kthread(void)
1289{
1290 unsigned long flags;
1291
1292 local_irq_save(flags);
1293 __this_cpu_write(rcu_cpu_has_work, 1);
1294 if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
1295 local_irq_restore(flags);
1296 return;
1297 }
1298 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1299 local_irq_restore(flags);
1300}
1301
1302/*
1303 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1304 * held, so no one should be messing with the existence of the boost
1305 * kthread.
1306 */
1307static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
1308 cpumask_var_t cm)
1309{
1310 struct task_struct *t;
1311
1312 t = rnp->boost_kthread_task;
1313 if (t != NULL)
1314 set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
1315}
1316
1317#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
1318
1319/*
1320 * Do priority-boost accounting for the start of a new grace period.
1321 */
1322static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1323{
1324 rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
1325}
1326
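/*
 * [Editor's note: worked example, configuration values assumed.]
 * RCU_BOOST_DELAY_JIFFIES converts CONFIG_RCU_BOOST_DELAY (milliseconds)
 * into jiffies, rounding up.  With CONFIG_RCU_BOOST_DELAY=500 and HZ=250:
 *
 *	DIV_ROUND_UP(500 * 250, 1000) = DIV_ROUND_UP(125000, 1000) = 125
 *
 * so rcu_initiate_boost() will not start boosting until readers have
 * blocked the grace period for at least 125 jiffies (500 ms).
 */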
1327/*
1328 * Create an RCU-boost kthread for the specified node if one does not
1329 * already exist. We only create this kthread for preemptible RCU.
1330 * Returns zero if all is well, a negated errno otherwise.
1331 */
1332static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1333 struct rcu_node *rnp,
1334 int rnp_index)
1335{
1336 unsigned long flags;
1337 struct sched_param sp;
1338 struct task_struct *t;
1339
1340 if (&rcu_preempt_state != rsp)
1341 return 0;
1342 rsp->boost = 1;
1343 if (rnp->boost_kthread_task != NULL)
1344 return 0;
1345 t = kthread_create(rcu_boost_kthread, (void *)rnp,
1346 "rcub%d", rnp_index);
1347 if (IS_ERR(t))
1348 return PTR_ERR(t);
1349 raw_spin_lock_irqsave(&rnp->lock, flags);
1350 rnp->boost_kthread_task = t;
1351 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1352 sp.sched_priority = RCU_KTHREAD_PRIO;
1353 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1354 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1355 return 0;
1356}
1357
1358#ifdef CONFIG_HOTPLUG_CPU
1359
1360/*
 1361 * Stop RCU's per-CPU kthread when its CPU goes offline.
1362 */
1363static void rcu_stop_cpu_kthread(int cpu)
1364{
1365 struct task_struct *t;
1366
1367 /* Stop the CPU's kthread. */
1368 t = per_cpu(rcu_cpu_kthread_task, cpu);
1369 if (t != NULL) {
1370 per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
1371 kthread_stop(t);
1372 }
1373}
1374
1375#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1376
1377static void rcu_kthread_do_work(void)
1378{
1379 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
1380 rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1381 rcu_preempt_do_callbacks();
1382}
1383
1384/*
1385 * Wake up the specified per-rcu_node-structure kthread.
1386 * Because the per-rcu_node kthreads are immortal, we don't need
1387 * to do anything to keep them alive.
1388 */
1389static void invoke_rcu_node_kthread(struct rcu_node *rnp)
1390{
1391 struct task_struct *t;
1392
1393 t = rnp->node_kthread_task;
1394 if (t != NULL)
1395 wake_up_process(t);
1396}
1397
1398/*
1399 * Set the specified CPU's kthread to run RT or not, as specified by
1400 * the to_rt argument. The CPU-hotplug locks are held, so the task
1401 * is not going away.
1402 */
1403static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1404{
1405 int policy;
1406 struct sched_param sp;
1407 struct task_struct *t;
1408
1409 t = per_cpu(rcu_cpu_kthread_task, cpu);
1410 if (t == NULL)
1411 return;
1412 if (to_rt) {
1413 policy = SCHED_FIFO;
1414 sp.sched_priority = RCU_KTHREAD_PRIO;
1415 } else {
1416 policy = SCHED_NORMAL;
1417 sp.sched_priority = 0;
1418 }
1419 sched_setscheduler_nocheck(t, policy, &sp);
1420}
1421
1422/*
 1423 * Timer handler to initiate waking up of per-CPU kthreads that
 1424 * have yielded the CPU due to excessive numbers of RCU callbacks.
 1425 * We wake up the per-rcu_node kthread, which in turn restores the
 1426 * yielding per-CPU kthread's real-time priority and re-flags its work.
1427 */
1428static void rcu_cpu_kthread_timer(unsigned long arg)
1429{
1430 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
1431 struct rcu_node *rnp = rdp->mynode;
1432
1433 atomic_or(rdp->grpmask, &rnp->wakemask);
1434 invoke_rcu_node_kthread(rnp);
1435}
1436
1437/*
1438 * Drop to non-real-time priority and yield, but only after posting a
1439 * timer that will cause us to regain our real-time priority if we
1440 * remain preempted. Either way, we restore our real-time priority
1441 * before returning.
1442 */
1443static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1444{
1445 struct sched_param sp;
1446 struct timer_list yield_timer;
1447
1448 setup_timer_on_stack(&yield_timer, f, arg);
1449 mod_timer(&yield_timer, jiffies + 2);
1450 sp.sched_priority = 0;
1451 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1452 set_user_nice(current, 19);
1453 schedule();
1454 sp.sched_priority = RCU_KTHREAD_PRIO;
1455 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1456 del_timer(&yield_timer);
1457}
1458
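/*
 * [Editor's note: illustrative, HZ values assumed.]  The watchdog posted
 * above fires roughly two ticks after rcu_yield() is entered (about 8 ms at
 * HZ=250, or 2 ms at HZ=1000) if the yielding kthread has not been run
 * again by then.  Callers pass their own wakeup handler, exactly as
 * rcu_boost_kthread() above and rcu_cpu_kthread() below do:
 *
 *	rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
 *
 * If the kthread runs again before the timer expires, del_timer() simply
 * cancels the pending wakeup on the way out.
 */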
1459/*
1460 * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
1461 * This can happen while the corresponding CPU is either coming online
1462 * or going offline. We cannot wait until the CPU is fully online
1463 * before starting the kthread, because the various notifier functions
1464 * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
1465 * the corresponding CPU is online.
1466 *
1467 * Return 1 if the kthread needs to stop, 0 otherwise.
1468 *
1469 * Caller must disable bh. This function can momentarily enable it.
1470 */
1471static int rcu_cpu_kthread_should_stop(int cpu)
1472{
1473 while (cpu_is_offline(cpu) ||
1474 !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
1475 smp_processor_id() != cpu) {
1476 if (kthread_should_stop())
1477 return 1;
1478 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1479 per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
1480 local_bh_enable();
1481 schedule_timeout_uninterruptible(1);
1482 if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
1483 set_cpus_allowed_ptr(current, cpumask_of(cpu));
1484 local_bh_disable();
1485 }
1486 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1487 return 0;
1488}
1489
1490/*
1491 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
1492 * earlier RCU softirq.
1493 */
1494static int rcu_cpu_kthread(void *arg)
1495{
1496 int cpu = (int)(long)arg;
1497 unsigned long flags;
1498 int spincnt = 0;
1499 unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
1500 char work;
1501 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1502
1503 for (;;) {
1504 *statusp = RCU_KTHREAD_WAITING;
1505 rcu_wait(*workp != 0 || kthread_should_stop());
1506 local_bh_disable();
1507 if (rcu_cpu_kthread_should_stop(cpu)) {
1508 local_bh_enable();
1509 break;
1510 }
1511 *statusp = RCU_KTHREAD_RUNNING;
1512 per_cpu(rcu_cpu_kthread_loops, cpu)++;
1513 local_irq_save(flags);
1514 work = *workp;
1515 *workp = 0;
1516 local_irq_restore(flags);
1517 if (work)
1518 rcu_kthread_do_work();
1519 local_bh_enable();
1520 if (*workp != 0)
1521 spincnt++;
1522 else
1523 spincnt = 0;
1524 if (spincnt > 10) {
1525 *statusp = RCU_KTHREAD_YIELDING;
1526 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1527 spincnt = 0;
1528 }
1529 }
1530 *statusp = RCU_KTHREAD_STOPPED;
1531 return 0;
1532}
1533
1534/*
1535 * Spawn a per-CPU kthread, setting up affinity and priority.
1536 * Because the CPU hotplug lock is held, no other CPU will be attempting
1537 * to manipulate rcu_cpu_kthread_task. There might be another CPU
1538 * attempting to access it during boot, but the locking in kthread_bind()
1539 * will enforce sufficient ordering.
1540 *
1541 * Please note that we cannot simply refuse to wake up the per-CPU
1542 * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
1543 * which can result in softlockup complaints if the task ends up being
1544 * idle for more than a couple of minutes.
1545 *
1546 * However, please note also that we cannot bind the per-CPU kthread to its
1547 * CPU until that CPU is fully online. We also cannot wait until the
1548 * CPU is fully online before we create its per-CPU kthread, as this would
1549 * deadlock the system when CPU notifiers tried waiting for grace
1550 * periods. So we bind the per-CPU kthread to its CPU only if the CPU
1551 * is online. If its CPU is not yet fully online, then the code in
1552 * rcu_cpu_kthread() will wait until it is fully online, and then do
1553 * the binding.
1554 */
1555static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1556{
1557 struct sched_param sp;
1558 struct task_struct *t;
1559
1560 if (!rcu_scheduler_fully_active ||
1561 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1562 return 0;
1563 t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
1564 if (IS_ERR(t))
1565 return PTR_ERR(t);
1566 if (cpu_online(cpu))
1567 kthread_bind(t, cpu);
1568 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1569 WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
1570 sp.sched_priority = RCU_KTHREAD_PRIO;
1571 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1572 per_cpu(rcu_cpu_kthread_task, cpu) = t;
1573 wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
1574 return 0;
1575}
1576
1577/*
1578 * Per-rcu_node kthread, which is in charge of waking up the per-CPU
1579 * kthreads when needed. We ignore requests to wake up kthreads
1580 * for offline CPUs, which is OK because force_quiescent_state()
1581 * takes care of this case.
1582 */
1583static int rcu_node_kthread(void *arg)
1584{
1585 int cpu;
1586 unsigned long flags;
1587 unsigned long mask;
1588 struct rcu_node *rnp = (struct rcu_node *)arg;
1589 struct sched_param sp;
1590 struct task_struct *t;
1591
1592 for (;;) {
1593 rnp->node_kthread_status = RCU_KTHREAD_WAITING;
1594 rcu_wait(atomic_read(&rnp->wakemask) != 0);
1595 rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
1596 raw_spin_lock_irqsave(&rnp->lock, flags);
1597 mask = atomic_xchg(&rnp->wakemask, 0);
1598 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1599 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
1600 if ((mask & 0x1) == 0)
1601 continue;
1602 preempt_disable();
1603 t = per_cpu(rcu_cpu_kthread_task, cpu);
1604 if (!cpu_online(cpu) || t == NULL) {
1605 preempt_enable();
1606 continue;
1607 }
1608 per_cpu(rcu_cpu_has_work, cpu) = 1;
1609 sp.sched_priority = RCU_KTHREAD_PRIO;
1610 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1611 preempt_enable();
1612 }
1613 }
1614 /* NOTREACHED */
1615 rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
1616 return 0;
1617}
1618
1619/*
1620 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
1621 * served by the rcu_node in question. The CPU hotplug lock is still
1622 * held, so the value of rnp->qsmaskinit will be stable.
1623 *
 1624 * We don't include outgoingcpu in the affinity set; callers pass -1 if
 1625 * there is no outgoing CPU. If there are no CPUs left in the affinity set,
1626 * this function allows the kthread to execute on any CPU.
1627 */
1628static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1629{
1630 cpumask_var_t cm;
1631 int cpu;
1632 unsigned long mask = rnp->qsmaskinit;
1633
1634 if (rnp->node_kthread_task == NULL)
1635 return;
1636 if (!alloc_cpumask_var(&cm, GFP_KERNEL))
1637 return;
1638 cpumask_clear(cm);
1639 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
1640 if ((mask & 0x1) && cpu != outgoingcpu)
1641 cpumask_set_cpu(cpu, cm);
1642 if (cpumask_weight(cm) == 0) {
1643 cpumask_setall(cm);
1644 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
1645 cpumask_clear_cpu(cpu, cm);
1646 WARN_ON_ONCE(cpumask_weight(cm) == 0);
1647 }
1648 set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
1649 rcu_boost_kthread_setaffinity(rnp, cm);
1650 free_cpumask_var(cm);
1651}
1652
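/*
 * [Editor's note: worked example, values hypothetical.]  Suppose rnp covers
 * CPUs 0-3 (grplo=0, grphi=3), qsmaskinit=0xb (CPUs 0, 1 and 3 have been
 * online) and CPU 1 is going away: the loop above builds cm = { 0, 3 }, and
 * both the node kthread and the boost kthread are restricted to those CPUs.
 * Only if the mask would come up empty does the fallback widen it to every
 * CPU outside this node's range, so the kthreads always have somewhere to
 * run.
 */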
1653/*
1654 * Spawn a per-rcu_node kthread, setting priority and affinity.
 1655 * Called either during boot, before CPU online/offline can happen,
 1656 * or at runtime with the main CPU-hotplug locks held, so only
1657 * one of these can be executing at a time.
1658 */
1659static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1660 struct rcu_node *rnp)
1661{
1662 unsigned long flags;
1663 int rnp_index = rnp - &rsp->node[0];
1664 struct sched_param sp;
1665 struct task_struct *t;
1666
1667 if (!rcu_scheduler_fully_active ||
1668 rnp->qsmaskinit == 0)
1669 return 0;
1670 if (rnp->node_kthread_task == NULL) {
1671 t = kthread_create(rcu_node_kthread, (void *)rnp,
1672 "rcun%d", rnp_index);
1673 if (IS_ERR(t))
1674 return PTR_ERR(t);
1675 raw_spin_lock_irqsave(&rnp->lock, flags);
1676 rnp->node_kthread_task = t;
1677 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1678 sp.sched_priority = 99;
1679 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1680 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1681 }
1682 return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
1683}
1684
1685/*
1686 * Spawn all kthreads -- called as soon as the scheduler is running.
1687 */
1688static int __init rcu_spawn_kthreads(void)
1689{
1690 int cpu;
1691 struct rcu_node *rnp;
1692
1693 rcu_scheduler_fully_active = 1;
1694 for_each_possible_cpu(cpu) {
1695 per_cpu(rcu_cpu_has_work, cpu) = 0;
1696 if (cpu_online(cpu))
1697 (void)rcu_spawn_one_cpu_kthread(cpu);
1698 }
1699 rnp = rcu_get_root(rcu_state);
1700 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1701 if (NUM_RCU_NODES > 1) {
1702 rcu_for_each_leaf_node(rcu_state, rnp)
1703 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1704 }
1705 return 0;
1706}
1707early_initcall(rcu_spawn_kthreads);
1708
1709static void __cpuinit rcu_prepare_kthreads(int cpu)
1710{
1711 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1712 struct rcu_node *rnp = rdp->mynode;
1713
1714 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1715 if (rcu_scheduler_fully_active) {
1716 (void)rcu_spawn_one_cpu_kthread(cpu);
1717 if (rnp->node_kthread_task == NULL)
1718 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1719 }
1720}
1721
1722#else /* #ifdef CONFIG_RCU_BOOST */
1723
1724static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1725{
1726 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1727}
1728
1729static void invoke_rcu_callbacks_kthread(void)
1730{
1731 WARN_ON_ONCE(1);
1732}
1733
1734static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1735{
1736}
1737
1738#ifdef CONFIG_HOTPLUG_CPU
1739
1740static void rcu_stop_cpu_kthread(int cpu)
1741{
1742}
1743
1744#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1745
1746static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1747{
1748}
1749
1750static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1751{
1752}
1753
1754static int __init rcu_scheduler_really_started(void)
1755{
1756 rcu_scheduler_fully_active = 1;
1757 return 0;
1758}
1759early_initcall(rcu_scheduler_really_started);
1760
1761static void __cpuinit rcu_prepare_kthreads(int cpu)
1762{
1763}
1764
1765#endif /* #else #ifdef CONFIG_RCU_BOOST */
1766
1018#ifndef CONFIG_SMP 1767#ifndef CONFIG_SMP
1019 1768
1020void synchronize_sched_expedited(void) 1769void synchronize_sched_expedited(void)
@@ -1187,14 +1936,13 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1187 * 1936 *
1188 * Because it is not legal to invoke rcu_process_callbacks() with irqs 1937 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1189 * disabled, we do one pass of force_quiescent_state(), then do a 1938 * disabled, we do one pass of force_quiescent_state(), then do a
1190 * raise_softirq() to cause rcu_process_callbacks() to be invoked later. 1939 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1191 * The per-cpu rcu_dyntick_drain variable controls the sequencing. 1940 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
1192 */ 1941 */
1193int rcu_needs_cpu(int cpu) 1942int rcu_needs_cpu(int cpu)
1194{ 1943{
1195 int c = 0; 1944 int c = 0;
1196 int snap; 1945 int snap;
1197 int snap_nmi;
1198 int thatcpu; 1946 int thatcpu;
1199 1947
1200 /* Check for being in the holdoff period. */ 1948 /* Check for being in the holdoff period. */
@@ -1205,10 +1953,10 @@ int rcu_needs_cpu(int cpu)
1205 for_each_online_cpu(thatcpu) { 1953 for_each_online_cpu(thatcpu) {
1206 if (thatcpu == cpu) 1954 if (thatcpu == cpu)
1207 continue; 1955 continue;
1208 snap = per_cpu(rcu_dynticks, thatcpu).dynticks; 1956 snap = atomic_add_return(0, &per_cpu(rcu_dynticks,
1209 snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi; 1957 thatcpu).dynticks);
1210 smp_mb(); /* Order sampling of snap with end of grace period. */ 1958 smp_mb(); /* Order sampling of snap with end of grace period. */
1211 if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) { 1959 if ((snap & 0x1) != 0) {
1212 per_cpu(rcu_dyntick_drain, cpu) = 0; 1960 per_cpu(rcu_dyntick_drain, cpu) = 0;
1213 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 1961 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1214 return rcu_needs_cpu_quick_check(cpu); 1962 return rcu_needs_cpu_quick_check(cpu);
@@ -1239,7 +1987,7 @@ int rcu_needs_cpu(int cpu)
1239 1987
1240 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 1988 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1241 if (c) 1989 if (c)
1242 raise_softirq(RCU_SOFTIRQ); 1990 invoke_rcu_core();
1243 return c; 1991 return c;
1244} 1992}
1245 1993
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index c8e97853b970..3b0c0986afc0 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -31,7 +31,7 @@
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/interrupt.h> 32#include <linux/interrupt.h>
33#include <linux/sched.h> 33#include <linux/sched.h>
34#include <asm/atomic.h> 34#include <linux/atomic.h>
35#include <linux/bitops.h> 35#include <linux/bitops.h>
36#include <linux/module.h> 36#include <linux/module.h>
37#include <linux/completion.h> 37#include <linux/completion.h>
@@ -46,6 +46,22 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49#ifdef CONFIG_RCU_BOOST
50
51DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
52DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
53DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
54DECLARE_PER_CPU(char, rcu_cpu_has_work);
55
56static char convert_kthread_status(unsigned int kthread_status)
57{
58 if (kthread_status > RCU_KTHREAD_MAX)
59 return '?';
60 return "SRWOY"[kthread_status];
61}
62
63#endif /* #ifdef CONFIG_RCU_BOOST */
64
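/*
 * [Editor's note: illustrative, not part of the patch.]  The character
 * returned here appears in the kt= fields of the rcudata and rcuboost
 * debugfs output: 'S'topped, 'R'unning, 'W'aiting, 'O'ffcpu or 'Y'ielding,
 * with '?' for an out-of-range status.  A CPU whose kthread is blocked in
 * rcu_wait(), for instance, reports 'W'.
 */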
49static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 65static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
50{ 66{
51 if (!rdp->beenonline) 67 if (!rdp->beenonline)
@@ -57,14 +73,31 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
57 rdp->passed_quiesc, rdp->passed_quiesc_completed, 73 rdp->passed_quiesc, rdp->passed_quiesc_completed,
58 rdp->qs_pending); 74 rdp->qs_pending);
59#ifdef CONFIG_NO_HZ 75#ifdef CONFIG_NO_HZ
60 seq_printf(m, " dt=%d/%d dn=%d df=%lu", 76 seq_printf(m, " dt=%d/%d/%d df=%lu",
61 rdp->dynticks->dynticks, 77 atomic_read(&rdp->dynticks->dynticks),
62 rdp->dynticks->dynticks_nesting, 78 rdp->dynticks->dynticks_nesting,
63 rdp->dynticks->dynticks_nmi, 79 rdp->dynticks->dynticks_nmi_nesting,
64 rdp->dynticks_fqs); 80 rdp->dynticks_fqs);
65#endif /* #ifdef CONFIG_NO_HZ */ 81#endif /* #ifdef CONFIG_NO_HZ */
66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 82 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
67 seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); 83 seq_printf(m, " ql=%ld qs=%c%c%c%c",
84 rdp->qlen,
85 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
86 rdp->nxttail[RCU_NEXT_TAIL]],
87 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
88 rdp->nxttail[RCU_NEXT_READY_TAIL]],
89 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
90 rdp->nxttail[RCU_WAIT_TAIL]],
91 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
92#ifdef CONFIG_RCU_BOOST
93 seq_printf(m, " kt=%d/%c/%d ktl=%x",
94 per_cpu(rcu_cpu_has_work, rdp->cpu),
95 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
96 rdp->cpu)),
97 per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
98 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
99#endif /* #ifdef CONFIG_RCU_BOOST */
100 seq_printf(m, " b=%ld", rdp->blimit);
68 seq_printf(m, " ci=%lu co=%lu ca=%lu\n", 101 seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
69 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 102 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
70} 103}
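/*
 * [Editor's note: illustrative, not part of the patch.]  Each position in
 * the new qs= field stands for one segment of the CPU's callback list:
 * 'N'ext, next-'R'eady, 'W'aiting and 'D'one, printed as '.' when that
 * segment is empty.  A (hypothetical) "qs=...D" therefore means that only
 * callbacks whose grace period has already completed are still queued.
 */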
@@ -115,13 +148,27 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
115 rdp->qs_pending); 148 rdp->qs_pending);
116#ifdef CONFIG_NO_HZ 149#ifdef CONFIG_NO_HZ
117 seq_printf(m, ",%d,%d,%d,%lu", 150 seq_printf(m, ",%d,%d,%d,%lu",
118 rdp->dynticks->dynticks, 151 atomic_read(&rdp->dynticks->dynticks),
119 rdp->dynticks->dynticks_nesting, 152 rdp->dynticks->dynticks_nesting,
120 rdp->dynticks->dynticks_nmi, 153 rdp->dynticks->dynticks_nmi_nesting,
121 rdp->dynticks_fqs); 154 rdp->dynticks_fqs);
122#endif /* #ifdef CONFIG_NO_HZ */ 155#endif /* #ifdef CONFIG_NO_HZ */
123 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 156 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
124 seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); 157 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen,
158 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
159 rdp->nxttail[RCU_NEXT_TAIL]],
160 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
161 rdp->nxttail[RCU_NEXT_READY_TAIL]],
162 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
163 rdp->nxttail[RCU_WAIT_TAIL]],
164 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
165#ifdef CONFIG_RCU_BOOST
166 seq_printf(m, ",%d,\"%c\"",
167 per_cpu(rcu_cpu_has_work, rdp->cpu),
168 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
169 rdp->cpu)));
170#endif /* #ifdef CONFIG_RCU_BOOST */
171 seq_printf(m, ",%ld", rdp->blimit);
125 seq_printf(m, ",%lu,%lu,%lu\n", 172 seq_printf(m, ",%lu,%lu,%lu\n",
126 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 173 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
127} 174}
@@ -130,9 +177,13 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
130{ 177{
131 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); 178 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
132#ifdef CONFIG_NO_HZ 179#ifdef CONFIG_NO_HZ
133 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 180 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
134#endif /* #ifdef CONFIG_NO_HZ */ 181#endif /* #ifdef CONFIG_NO_HZ */
135 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); 182 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\"");
183#ifdef CONFIG_RCU_BOOST
184 seq_puts(m, "\"kt\",\"ktl\"");
185#endif /* #ifdef CONFIG_RCU_BOOST */
186 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n");
136#ifdef CONFIG_TREE_PREEMPT_RCU 187#ifdef CONFIG_TREE_PREEMPT_RCU
137 seq_puts(m, "\"rcu_preempt:\"\n"); 188 seq_puts(m, "\"rcu_preempt:\"\n");
138 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); 189 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
@@ -157,11 +208,76 @@ static const struct file_operations rcudata_csv_fops = {
157 .release = single_release, 208 .release = single_release,
158}; 209};
159 210
211#ifdef CONFIG_RCU_BOOST
212
213static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
214{
215 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu "
216 "j=%04x bt=%04x\n",
217 rnp->grplo, rnp->grphi,
218 "T."[list_empty(&rnp->blkd_tasks)],
219 "N."[!rnp->gp_tasks],
220 "E."[!rnp->exp_tasks],
221 "B."[!rnp->boost_tasks],
222 convert_kthread_status(rnp->boost_kthread_status),
223 rnp->n_tasks_boosted, rnp->n_exp_boosts,
224 rnp->n_normal_boosts,
225 (int)(jiffies & 0xffff),
226 (int)(rnp->boost_time & 0xffff));
227 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
228 " balk",
229 rnp->n_balk_blkd_tasks,
230 rnp->n_balk_exp_gp_tasks,
231 rnp->n_balk_boost_tasks,
232 rnp->n_balk_notblocked,
233 rnp->n_balk_notyet,
234 rnp->n_balk_nos);
235}
236
237static int show_rcu_node_boost(struct seq_file *m, void *unused)
238{
239 struct rcu_node *rnp;
240
241 rcu_for_each_leaf_node(&rcu_preempt_state, rnp)
242 print_one_rcu_node_boost(m, rnp);
243 return 0;
244}
245
246static int rcu_node_boost_open(struct inode *inode, struct file *file)
247{
248 return single_open(file, show_rcu_node_boost, NULL);
249}
250
251static const struct file_operations rcu_node_boost_fops = {
252 .owner = THIS_MODULE,
253 .open = rcu_node_boost_open,
254 .read = seq_read,
255 .llseek = seq_lseek,
256 .release = single_release,
257};
258
259/*
260 * Create the rcuboost debugfs entry. Standard error return.
261 */
262static int rcu_boost_trace_create_file(struct dentry *rcudir)
263{
264 return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
265 &rcu_node_boost_fops);
266}
267
268#else /* #ifdef CONFIG_RCU_BOOST */
269
270static int rcu_boost_trace_create_file(struct dentry *rcudir)
271{
272 return 0; /* There cannot be an error if we didn't create it! */
273}
274
275#endif /* #else #ifdef CONFIG_RCU_BOOST */
276
160static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 277static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
161{ 278{
162 unsigned long gpnum; 279 unsigned long gpnum;
163 int level = 0; 280 int level = 0;
164 int phase;
165 struct rcu_node *rnp; 281 struct rcu_node *rnp;
166 282
167 gpnum = rsp->gpnum; 283 gpnum = rsp->gpnum;
@@ -178,13 +294,11 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
178 seq_puts(m, "\n"); 294 seq_puts(m, "\n");
179 level = rnp->level; 295 level = rnp->level;
180 } 296 }
181 phase = gpnum & 0x1; 297 seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ",
182 seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ",
183 rnp->qsmask, rnp->qsmaskinit, 298 rnp->qsmask, rnp->qsmaskinit,
184 "T."[list_empty(&rnp->blocked_tasks[phase])], 299 ".G"[rnp->gp_tasks != NULL],
185 "E."[list_empty(&rnp->blocked_tasks[phase + 2])], 300 ".E"[rnp->exp_tasks != NULL],
186 "T."[list_empty(&rnp->blocked_tasks[!phase])], 301 ".T"[!list_empty(&rnp->blkd_tasks)],
187 "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
188 rnp->grplo, rnp->grphi, rnp->grpnum); 302 rnp->grplo, rnp->grphi, rnp->grpnum);
189 } 303 }
190 seq_puts(m, "\n"); 304 seq_puts(m, "\n");
@@ -216,16 +330,35 @@ static const struct file_operations rcuhier_fops = {
216 .release = single_release, 330 .release = single_release,
217}; 331};
218 332
333static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
334{
335 unsigned long flags;
336 unsigned long completed;
337 unsigned long gpnum;
338 unsigned long gpage;
339 unsigned long gpmax;
340 struct rcu_node *rnp = &rsp->node[0];
341
342 raw_spin_lock_irqsave(&rnp->lock, flags);
343 completed = rsp->completed;
344 gpnum = rsp->gpnum;
345 if (rsp->completed == rsp->gpnum)
346 gpage = 0;
347 else
348 gpage = jiffies - rsp->gp_start;
349 gpmax = rsp->gp_max;
350 raw_spin_unlock_irqrestore(&rnp->lock, flags);
351 seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n",
352 rsp->name, completed, gpnum, gpage, gpmax);
353}
354
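/*
 * [Editor's note: illustrative output, values hypothetical.]  Each RCU
 * flavor now also reports how long the current grace period has been
 * running and the longest grace period observed, both in jiffies, e.g.:
 *
 *	rcu_sched: completed=2750 gpnum=2751 age=3 max=42
 */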
219static int show_rcugp(struct seq_file *m, void *unused) 355static int show_rcugp(struct seq_file *m, void *unused)
220{ 356{
221#ifdef CONFIG_TREE_PREEMPT_RCU 357#ifdef CONFIG_TREE_PREEMPT_RCU
222 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", 358 show_one_rcugp(m, &rcu_preempt_state);
223 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
224#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 359#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
225 seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", 360 show_one_rcugp(m, &rcu_sched_state);
226 rcu_sched_state.completed, rcu_sched_state.gpnum); 361 show_one_rcugp(m, &rcu_bh_state);
227 seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
228 rcu_bh_state.completed, rcu_bh_state.gpnum);
229 return 0; 362 return 0;
230} 363}
231 364
@@ -298,6 +431,29 @@ static const struct file_operations rcu_pending_fops = {
298 .release = single_release, 431 .release = single_release,
299}; 432};
300 433
434static int show_rcutorture(struct seq_file *m, void *unused)
435{
436 seq_printf(m, "rcutorture test sequence: %lu %s\n",
437 rcutorture_testseq >> 1,
438 (rcutorture_testseq & 0x1) ? "(test in progress)" : "");
439 seq_printf(m, "rcutorture update version number: %lu\n",
440 rcutorture_vernum);
441 return 0;
442}
443
444static int rcutorture_open(struct inode *inode, struct file *file)
445{
446 return single_open(file, show_rcutorture, NULL);
447}
448
449static const struct file_operations rcutorture_fops = {
450 .owner = THIS_MODULE,
451 .open = rcutorture_open,
452 .read = seq_read,
453 .llseek = seq_lseek,
454 .release = single_release,
455};
456
301static struct dentry *rcudir; 457static struct dentry *rcudir;
302 458
303static int __init rcutree_trace_init(void) 459static int __init rcutree_trace_init(void)
@@ -318,6 +474,9 @@ static int __init rcutree_trace_init(void)
318 if (!retval) 474 if (!retval)
319 goto free_out; 475 goto free_out;
320 476
477 if (rcu_boost_trace_create_file(rcudir))
478 goto free_out;
479
321 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 480 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
322 if (!retval) 481 if (!retval)
323 goto free_out; 482 goto free_out;
@@ -331,6 +490,11 @@ static int __init rcutree_trace_init(void)
331 NULL, &rcu_pending_fops); 490 NULL, &rcu_pending_fops);
332 if (!retval) 491 if (!retval)
333 goto free_out; 492 goto free_out;
493
494 retval = debugfs_create_file("rcutorture", 0444, rcudir,
495 NULL, &rcutorture_fops);
496 if (!retval)
497 goto free_out;
334 return 0; 498 return 0;
335free_out: 499free_out:
336 debugfs_remove_recursive(rcudir); 500 debugfs_remove_recursive(rcudir);
diff --git a/kernel/resource.c b/kernel/resource.c
index 798e2fae2a06..3b3cedc52592 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -38,6 +38,14 @@ struct resource iomem_resource = {
38}; 38};
39EXPORT_SYMBOL(iomem_resource); 39EXPORT_SYMBOL(iomem_resource);
40 40
41/* constraints to be met while allocating resources */
42struct resource_constraint {
43 resource_size_t min, max, align;
44 resource_size_t (*alignf)(void *, const struct resource *,
45 resource_size_t, resource_size_t);
46 void *alignf_data;
47};
48
41static DEFINE_RWLOCK(resource_lock); 49static DEFINE_RWLOCK(resource_lock);
42 50
43static void *r_next(struct seq_file *m, void *v, loff_t *pos) 51static void *r_next(struct seq_file *m, void *v, loff_t *pos)
@@ -384,16 +392,13 @@ static bool resource_contains(struct resource *res1, struct resource *res2)
384} 392}
385 393
386/* 394/*
387 * Find empty slot in the resource tree given range and alignment. 395 * Find empty slot in the resource tree with the given range and
396 * alignment constraints
388 */ 397 */
389static int find_resource(struct resource *root, struct resource *new, 398static int __find_resource(struct resource *root, struct resource *old,
390 resource_size_t size, resource_size_t min, 399 struct resource *new,
391 resource_size_t max, resource_size_t align, 400 resource_size_t size,
392 resource_size_t (*alignf)(void *, 401 struct resource_constraint *constraint)
393 const struct resource *,
394 resource_size_t,
395 resource_size_t),
396 void *alignf_data)
397{ 402{
398 struct resource *this = root->child; 403 struct resource *this = root->child;
399 struct resource tmp = *new, avail, alloc; 404 struct resource tmp = *new, avail, alloc;
@@ -404,25 +409,26 @@ static int find_resource(struct resource *root, struct resource *new,
404 * Skip past an allocated resource that starts at 0, since the assignment 409 * Skip past an allocated resource that starts at 0, since the assignment
405 * of this->start - 1 to tmp->end below would cause an underflow. 410 * of this->start - 1 to tmp->end below would cause an underflow.
406 */ 411 */
407 if (this && this->start == 0) { 412 if (this && this->start == root->start) {
408 tmp.start = this->end + 1; 413 tmp.start = (this == old) ? old->start : this->end + 1;
409 this = this->sibling; 414 this = this->sibling;
410 } 415 }
411 for(;;) { 416 for(;;) {
412 if (this) 417 if (this)
413 tmp.end = this->start - 1; 418 tmp.end = (this == old) ? this->end : this->start - 1;
414 else 419 else
415 tmp.end = root->end; 420 tmp.end = root->end;
416 421
417 resource_clip(&tmp, min, max); 422 resource_clip(&tmp, constraint->min, constraint->max);
418 arch_remove_reservations(&tmp); 423 arch_remove_reservations(&tmp);
419 424
420 /* Check for overflow after ALIGN() */ 425 /* Check for overflow after ALIGN() */
421 avail = *new; 426 avail = *new;
422 avail.start = ALIGN(tmp.start, align); 427 avail.start = ALIGN(tmp.start, constraint->align);
423 avail.end = tmp.end; 428 avail.end = tmp.end;
424 if (avail.start >= tmp.start) { 429 if (avail.start >= tmp.start) {
425 alloc.start = alignf(alignf_data, &avail, size, align); 430 alloc.start = constraint->alignf(constraint->alignf_data, &avail,
431 size, constraint->align);
426 alloc.end = alloc.start + size - 1; 432 alloc.end = alloc.start + size - 1;
427 if (resource_contains(&avail, &alloc)) { 433 if (resource_contains(&avail, &alloc)) {
428 new->start = alloc.start; 434 new->start = alloc.start;
@@ -432,14 +438,75 @@ static int find_resource(struct resource *root, struct resource *new,
432 } 438 }
433 if (!this) 439 if (!this)
434 break; 440 break;
435 tmp.start = this->end + 1; 441 if (this != old)
442 tmp.start = this->end + 1;
436 this = this->sibling; 443 this = this->sibling;
437 } 444 }
438 return -EBUSY; 445 return -EBUSY;
439} 446}
440 447
448/*
449 * Find empty slot in the resource tree given range and alignment.
450 */
451static int find_resource(struct resource *root, struct resource *new,
452 resource_size_t size,
453 struct resource_constraint *constraint)
454{
455 return __find_resource(root, NULL, new, size, constraint);
456}
457
441/** 458/**
442 * allocate_resource - allocate empty slot in the resource tree given range & alignment 459 * reallocate_resource - allocate a slot in the resource tree given range & alignment.
460 * The resource will be relocated if the new size cannot be reallocated in the
461 * current location.
462 *
463 * @root: root resource descriptor
464 * @old: resource descriptor desired by caller
465 * @newsize: new size of the resource descriptor
466 * @constraint: the size and alignment constraints to be met.
467 */
468int reallocate_resource(struct resource *root, struct resource *old,
469 resource_size_t newsize,
470 struct resource_constraint *constraint)
471{
472 int err=0;
473 struct resource new = *old;
474 struct resource *conflict;
475
476 write_lock(&resource_lock);
477
478 if ((err = __find_resource(root, old, &new, newsize, constraint)))
479 goto out;
480
481 if (resource_contains(&new, old)) {
482 old->start = new.start;
483 old->end = new.end;
484 goto out;
485 }
486
487 if (old->child) {
488 err = -EBUSY;
489 goto out;
490 }
491
492 if (resource_contains(old, &new)) {
493 old->start = new.start;
494 old->end = new.end;
495 } else {
496 __release_resource(old);
497 *old = new;
498 conflict = __request_resource(root, old);
499 BUG_ON(conflict);
500 }
501out:
502 write_unlock(&resource_lock);
503 return err;
504}
505
506
507/**
508 * allocate_resource - allocate empty slot in the resource tree given range & alignment.
509 * The resource will be reallocated with a new size if it was already allocated
443 * @root: root resource descriptor 510 * @root: root resource descriptor
444 * @new: resource descriptor desired by caller 511 * @new: resource descriptor desired by caller
445 * @size: requested resource region size 512 * @size: requested resource region size
@@ -459,12 +526,25 @@ int allocate_resource(struct resource *root, struct resource *new,
459 void *alignf_data) 526 void *alignf_data)
460{ 527{
461 int err; 528 int err;
529 struct resource_constraint constraint;
462 530
463 if (!alignf) 531 if (!alignf)
464 alignf = simple_align_resource; 532 alignf = simple_align_resource;
465 533
534 constraint.min = min;
535 constraint.max = max;
536 constraint.align = align;
537 constraint.alignf = alignf;
538 constraint.alignf_data = alignf_data;
539
540 if ( new->parent ) {
541 /* resource is already allocated, try reallocating with
542 the new constraints */
543 return reallocate_resource(root, new, size, &constraint);
544 }
545
466 write_lock(&resource_lock); 546 write_lock(&resource_lock);
467 err = find_resource(root, new, size, min, max, align, alignf, alignf_data); 547 err = find_resource(root, new, size, &constraint);
468 if (err >= 0 && __request_resource(root, new)) 548 if (err >= 0 && __request_resource(root, new))
469 err = -EBUSY; 549 err = -EBUSY;
470 write_unlock(&resource_lock); 550 write_unlock(&resource_lock);
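/*
 * [Editor's sketch: illustrative use, not part of the patch; dev_res,
 * new_size, min, max and align are hypothetical caller variables.]  With
 * this change a caller can grow a region it already owns by calling
 * allocate_resource() again on the same descriptor; the reallocate path
 * keeps the region in place when the larger size still fits there and
 * relocates it otherwise, leaving dev_res untouched on failure:
 *
 *	err = allocate_resource(&iomem_resource, &dev_res, new_size,
 *				min, max, align, NULL, NULL);
 *	if (err)
 *		return err;
 */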
@@ -473,6 +553,27 @@ int allocate_resource(struct resource *root, struct resource *new,
473 553
474EXPORT_SYMBOL(allocate_resource); 554EXPORT_SYMBOL(allocate_resource);
475 555
556/**
557 * lookup_resource - find an existing resource by a resource start address
558 * @root: root resource descriptor
559 * @start: resource start address
560 *
561 * Returns a pointer to the resource if found, NULL otherwise
562 */
563struct resource *lookup_resource(struct resource *root, resource_size_t start)
564{
565 struct resource *res;
566
567 read_lock(&resource_lock);
568 for (res = root->child; res; res = res->sibling) {
569 if (res->start == start)
570 break;
571 }
572 read_unlock(&resource_lock);
573
574 return res;
575}
576
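/*
 * [Editor's sketch: illustrative use, not part of the patch; the address is
 * hypothetical.]  lookup_resource() matches only a direct child of @root,
 * and only by its exact start address:
 *
 *	struct resource *res;
 *
 *	res = lookup_resource(&iomem_resource, 0xfed40000);
 *	if (res)
 *		pr_info("found %pR\n", res);
 */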
476/* 577/*
477 * Insert a resource into the resource tree. If successful, return NULL, 578 * Insert a resource into the resource tree. If successful, return NULL,
478 * otherwise return the conflicting resource (compare to __request_resource()) 579 * otherwise return the conflicting resource (compare to __request_resource())
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index ab449117aaf2..255e1662acdb 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -890,7 +890,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name)
890{ 890{
891 lock->owner = NULL; 891 lock->owner = NULL;
892 raw_spin_lock_init(&lock->wait_lock); 892 raw_spin_lock_init(&lock->wait_lock);
893 plist_head_init_raw(&lock->wait_list, &lock->wait_lock); 893 plist_head_init(&lock->wait_list);
894 894
895 debug_rt_mutex_init(lock, name); 895 debug_rt_mutex_init(lock, name);
896} 896}
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index cae050b05f5e..9f48f3d82e9b 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -11,7 +11,7 @@
11#include <linux/rwsem.h> 11#include <linux/rwsem.h>
12 12
13#include <asm/system.h> 13#include <asm/system.h>
14#include <asm/atomic.h> 14#include <linux/atomic.h>
15 15
16/* 16/*
17 * lock for reading 17 * lock for reading
@@ -117,15 +117,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
117 117
118EXPORT_SYMBOL(down_read_nested); 118EXPORT_SYMBOL(down_read_nested);
119 119
120void down_read_non_owner(struct rw_semaphore *sem)
121{
122 might_sleep();
123
124 __down_read(sem);
125}
126
127EXPORT_SYMBOL(down_read_non_owner);
128
129void down_write_nested(struct rw_semaphore *sem, int subclass) 120void down_write_nested(struct rw_semaphore *sem, int subclass)
130{ 121{
131 might_sleep(); 122 might_sleep();
@@ -136,13 +127,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
136 127
137EXPORT_SYMBOL(down_write_nested); 128EXPORT_SYMBOL(down_write_nested);
138 129
139void up_read_non_owner(struct rw_semaphore *sem)
140{
141 __up_read(sem);
142}
143
144EXPORT_SYMBOL(up_read_non_owner);
145
146#endif 130#endif
147 131
148 132
diff --git a/kernel/sched.c b/kernel/sched.c
index 312f8b95c2d4..ccacdbdecf45 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,6 +75,9 @@
75#include <asm/tlb.h> 75#include <asm/tlb.h>
76#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
77#include <asm/mutex.h> 77#include <asm/mutex.h>
78#ifdef CONFIG_PARAVIRT
79#include <asm/paravirt.h>
80#endif
78 81
79#include "sched_cpupri.h" 82#include "sched_cpupri.h"
80#include "workqueue_sched.h" 83#include "workqueue_sched.h"
@@ -124,7 +127,7 @@
124 127
125static inline int rt_policy(int policy) 128static inline int rt_policy(int policy)
126{ 129{
127 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 130 if (policy == SCHED_FIFO || policy == SCHED_RR)
128 return 1; 131 return 1;
129 return 0; 132 return 0;
130} 133}
@@ -231,7 +234,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
231#endif 234#endif
232 235
233/* 236/*
234 * sched_domains_mutex serializes calls to arch_init_sched_domains, 237 * sched_domains_mutex serializes calls to init_sched_domains,
235 * detach_destroy_domains and partition_sched_domains. 238 * detach_destroy_domains and partition_sched_domains.
236 */ 239 */
237static DEFINE_MUTEX(sched_domains_mutex); 240static DEFINE_MUTEX(sched_domains_mutex);
@@ -292,7 +295,7 @@ static DEFINE_SPINLOCK(task_group_lock);
292 * (The default weight is 1024 - so there's no practical 295 * (The default weight is 1024 - so there's no practical
293 * limitation from this.) 296 * limitation from this.)
294 */ 297 */
295#define MIN_SHARES 2 298#define MIN_SHARES (1UL << 1)
296#define MAX_SHARES (1UL << 18) 299#define MAX_SHARES (1UL << 18)
297 300
298static int root_task_group_load = ROOT_TASK_GROUP_LOAD; 301static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
@@ -312,6 +315,9 @@ struct cfs_rq {
312 315
313 u64 exec_clock; 316 u64 exec_clock;
314 u64 min_vruntime; 317 u64 min_vruntime;
318#ifndef CONFIG_64BIT
319 u64 min_vruntime_copy;
320#endif
315 321
316 struct rb_root tasks_timeline; 322 struct rb_root tasks_timeline;
317 struct rb_node *rb_leftmost; 323 struct rb_node *rb_leftmost;
@@ -325,7 +331,9 @@ struct cfs_rq {
325 */ 331 */
326 struct sched_entity *curr, *next, *last, *skip; 332 struct sched_entity *curr, *next, *last, *skip;
327 333
334#ifdef CONFIG_SCHED_DEBUG
328 unsigned int nr_spread_over; 335 unsigned int nr_spread_over;
336#endif
329 337
330#ifdef CONFIG_FAIR_GROUP_SCHED 338#ifdef CONFIG_FAIR_GROUP_SCHED
331 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 339 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -417,6 +425,8 @@ struct rt_rq {
417 */ 425 */
418struct root_domain { 426struct root_domain {
419 atomic_t refcount; 427 atomic_t refcount;
428 atomic_t rto_count;
429 struct rcu_head rcu;
420 cpumask_var_t span; 430 cpumask_var_t span;
421 cpumask_var_t online; 431 cpumask_var_t online;
422 432
@@ -425,7 +435,6 @@ struct root_domain {
425 * one runnable RT task. 435 * one runnable RT task.
426 */ 436 */
427 cpumask_var_t rto_mask; 437 cpumask_var_t rto_mask;
428 atomic_t rto_count;
429 struct cpupri cpupri; 438 struct cpupri cpupri;
430}; 439};
431 440
@@ -460,7 +469,7 @@ struct rq {
460 u64 nohz_stamp; 469 u64 nohz_stamp;
461 unsigned char nohz_balance_kick; 470 unsigned char nohz_balance_kick;
462#endif 471#endif
463 unsigned int skip_clock_update; 472 int skip_clock_update;
464 473
465 /* capture load from *all* tasks on this cpu: */ 474 /* capture load from *all* tasks on this cpu: */
466 struct load_weight load; 475 struct load_weight load;
@@ -522,6 +531,12 @@ struct rq {
522#ifdef CONFIG_IRQ_TIME_ACCOUNTING 531#ifdef CONFIG_IRQ_TIME_ACCOUNTING
523 u64 prev_irq_time; 532 u64 prev_irq_time;
524#endif 533#endif
534#ifdef CONFIG_PARAVIRT
535 u64 prev_steal_time;
536#endif
537#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
538 u64 prev_steal_time_rq;
539#endif
525 540
526 /* calc_load related fields */ 541 /* calc_load related fields */
527 unsigned long calc_load_update; 542 unsigned long calc_load_update;
@@ -553,6 +568,10 @@ struct rq {
553 unsigned int ttwu_count; 568 unsigned int ttwu_count;
554 unsigned int ttwu_local; 569 unsigned int ttwu_local;
555#endif 570#endif
571
572#ifdef CONFIG_SMP
573 struct task_struct *wake_list;
574#endif
556}; 575};
557 576
558static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 577static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -571,7 +590,6 @@ static inline int cpu_of(struct rq *rq)
571 590
572#define rcu_dereference_check_sched_domain(p) \ 591#define rcu_dereference_check_sched_domain(p) \
573 rcu_dereference_check((p), \ 592 rcu_dereference_check((p), \
574 rcu_read_lock_sched_held() || \
575 lockdep_is_held(&sched_domains_mutex)) 593 lockdep_is_held(&sched_domains_mutex))
576 594
577/* 595/*
@@ -595,10 +613,10 @@ static inline int cpu_of(struct rq *rq)
595/* 613/*
596 * Return the group to which this tasks belongs. 614 * Return the group to which this tasks belongs.
597 * 615 *
598 * We use task_subsys_state_check() and extend the RCU verification 616 * We use task_subsys_state_check() and extend the RCU verification with
599 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() 617 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
600 * holds that lock for each task it moves into the cgroup. Therefore 618 * task it moves into the cgroup. Therefore by holding either of those locks,
601 * by holding that lock, we pin the task to the current cgroup. 619 * we pin the task to the current cgroup.
602 */ 620 */
603static inline struct task_group *task_group(struct task_struct *p) 621static inline struct task_group *task_group(struct task_struct *p)
604{ 622{
@@ -606,6 +624,7 @@ static inline struct task_group *task_group(struct task_struct *p)
606 struct cgroup_subsys_state *css; 624 struct cgroup_subsys_state *css;
607 625
608 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 626 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
627 lockdep_is_held(&p->pi_lock) ||
609 lockdep_is_held(&task_rq(p)->lock)); 628 lockdep_is_held(&task_rq(p)->lock));
610 tg = container_of(css, struct task_group, css); 629 tg = container_of(css, struct task_group, css);
611 630
@@ -642,7 +661,7 @@ static void update_rq_clock(struct rq *rq)
642{ 661{
643 s64 delta; 662 s64 delta;
644 663
645 if (rq->skip_clock_update) 664 if (rq->skip_clock_update > 0)
646 return; 665 return;
647 666
648 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 667 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -838,18 +857,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
838 return rq->curr == p; 857 return rq->curr == p;
839} 858}
840 859
841#ifndef __ARCH_WANT_UNLOCKED_CTXSW
842static inline int task_running(struct rq *rq, struct task_struct *p) 860static inline int task_running(struct rq *rq, struct task_struct *p)
843{ 861{
862#ifdef CONFIG_SMP
863 return p->on_cpu;
864#else
844 return task_current(rq, p); 865 return task_current(rq, p);
866#endif
845} 867}
846 868
869#ifndef __ARCH_WANT_UNLOCKED_CTXSW
847static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 870static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
848{ 871{
872#ifdef CONFIG_SMP
873 /*
874 * We can optimise this out completely for !SMP, because the
875 * SMP rebalancing from interrupt is the only thing that cares
876 * here.
877 */
878 next->on_cpu = 1;
879#endif
849} 880}
850 881
851static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 882static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
852{ 883{
884#ifdef CONFIG_SMP
885 /*
886 * After ->on_cpu is cleared, the task can be moved to a different CPU.
887 * We must ensure this doesn't happen until the switch is completely
888 * finished.
889 */
890 smp_wmb();
891 prev->on_cpu = 0;
892#endif
853#ifdef CONFIG_DEBUG_SPINLOCK 893#ifdef CONFIG_DEBUG_SPINLOCK
854 /* this is a valid case when another task releases the spinlock */ 894 /* this is a valid case when another task releases the spinlock */
855 rq->lock.owner = current; 895 rq->lock.owner = current;
@@ -865,15 +905,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
865} 905}
866 906
867#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 907#else /* __ARCH_WANT_UNLOCKED_CTXSW */
868static inline int task_running(struct rq *rq, struct task_struct *p)
869{
870#ifdef CONFIG_SMP
871 return p->oncpu;
872#else
873 return task_current(rq, p);
874#endif
875}
876
877static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 908static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
878{ 909{
879#ifdef CONFIG_SMP 910#ifdef CONFIG_SMP
@@ -882,7 +913,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
882 * SMP rebalancing from interrupt is the only thing that cares 913 * SMP rebalancing from interrupt is the only thing that cares
883 * here. 914 * here.
884 */ 915 */
885 next->oncpu = 1; 916 next->on_cpu = 1;
886#endif 917#endif
887#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 918#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
888 raw_spin_unlock_irq(&rq->lock); 919 raw_spin_unlock_irq(&rq->lock);
@@ -895,12 +926,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
895{ 926{
896#ifdef CONFIG_SMP 927#ifdef CONFIG_SMP
897 /* 928 /*
898 * After ->oncpu is cleared, the task can be moved to a different CPU. 929 * After ->on_cpu is cleared, the task can be moved to a different CPU.
899 * We must ensure this doesn't happen until the switch is completely 930 * We must ensure this doesn't happen until the switch is completely
900 * finished. 931 * finished.
901 */ 932 */
902 smp_wmb(); 933 smp_wmb();
903 prev->oncpu = 0; 934 prev->on_cpu = 0;
904#endif 935#endif
905#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 936#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
906 local_irq_enable(); 937 local_irq_enable();
@@ -909,23 +940,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
909#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 940#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
910 941
911/* 942/*
912 * Check whether the task is waking, we use this to synchronize ->cpus_allowed 943 * __task_rq_lock - lock the rq @p resides on.
913 * against ttwu().
914 */
915static inline int task_is_waking(struct task_struct *p)
916{
917 return unlikely(p->state == TASK_WAKING);
918}
919
920/*
921 * __task_rq_lock - lock the runqueue a given task resides on.
922 * Must be called interrupts disabled.
923 */ 944 */
924static inline struct rq *__task_rq_lock(struct task_struct *p) 945static inline struct rq *__task_rq_lock(struct task_struct *p)
925 __acquires(rq->lock) 946 __acquires(rq->lock)
926{ 947{
927 struct rq *rq; 948 struct rq *rq;
928 949
950 lockdep_assert_held(&p->pi_lock);
951
929 for (;;) { 952 for (;;) {
930 rq = task_rq(p); 953 rq = task_rq(p);
931 raw_spin_lock(&rq->lock); 954 raw_spin_lock(&rq->lock);
@@ -936,22 +959,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
936} 959}
937 960
938/* 961/*
939 * task_rq_lock - lock the runqueue a given task resides on and disable 962 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
940 * interrupts. Note the ordering: we can safely lookup the task_rq without
941 * explicitly disabling preemption.
942 */ 963 */
943static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 964static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
965 __acquires(p->pi_lock)
944 __acquires(rq->lock) 966 __acquires(rq->lock)
945{ 967{
946 struct rq *rq; 968 struct rq *rq;
947 969
948 for (;;) { 970 for (;;) {
949 local_irq_save(*flags); 971 raw_spin_lock_irqsave(&p->pi_lock, *flags);
950 rq = task_rq(p); 972 rq = task_rq(p);
951 raw_spin_lock(&rq->lock); 973 raw_spin_lock(&rq->lock);
952 if (likely(rq == task_rq(p))) 974 if (likely(rq == task_rq(p)))
953 return rq; 975 return rq;
954 raw_spin_unlock_irqrestore(&rq->lock, *flags); 976 raw_spin_unlock(&rq->lock);
977 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
955 } 978 }
956} 979}
957 980
@@ -961,10 +984,13 @@ static void __task_rq_unlock(struct rq *rq)
961 raw_spin_unlock(&rq->lock); 984 raw_spin_unlock(&rq->lock);
962} 985}
963 986
964static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 987static inline void
988task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
965 __releases(rq->lock) 989 __releases(rq->lock)
990 __releases(p->pi_lock)
966{ 991{
967 raw_spin_unlock_irqrestore(&rq->lock, *flags); 992 raw_spin_unlock(&rq->lock);
993 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
968} 994}
969 995
970/* 996/*
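/*
 * [Editor's sketch: illustrative use, not part of the patch.]  With this
 * change the unlock side also names the task, and p->pi_lock is taken
 * first and released last, which is what pins p to its runqueue for the
 * duration of the critical section:
 *
 *	unsigned long flags;
 *	struct rq *rq;
 *
 *	rq = task_rq_lock(p, &flags);	takes p->pi_lock, then rq->lock
 *	...operate on p's runqueue...
 *	task_rq_unlock(rq, p, &flags);	drops rq->lock, then p->pi_lock
 */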
@@ -1193,11 +1219,17 @@ int get_nohz_timer_target(void)
1193 int i; 1219 int i;
1194 struct sched_domain *sd; 1220 struct sched_domain *sd;
1195 1221
1222 rcu_read_lock();
1196 for_each_domain(cpu, sd) { 1223 for_each_domain(cpu, sd) {
1197 for_each_cpu(i, sched_domain_span(sd)) 1224 for_each_cpu(i, sched_domain_span(sd)) {
1198 if (!idle_cpu(i)) 1225 if (!idle_cpu(i)) {
1199 return i; 1226 cpu = i;
1227 goto unlock;
1228 }
1229 }
1200 } 1230 }
1231unlock:
1232 rcu_read_unlock();
1201 return cpu; 1233 return cpu;
1202} 1234}
1203/* 1235/*
@@ -1307,15 +1339,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1307{ 1339{
1308 u64 tmp; 1340 u64 tmp;
1309 1341
1342 /*
1343 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1344 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1345 * 2^SCHED_LOAD_RESOLUTION.
1346 */
1347 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1348 tmp = (u64)delta_exec * scale_load_down(weight);
1349 else
1350 tmp = (u64)delta_exec;
1351
1310 if (!lw->inv_weight) { 1352 if (!lw->inv_weight) {
1311 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1353 unsigned long w = scale_load_down(lw->weight);
1354
1355 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1312 lw->inv_weight = 1; 1356 lw->inv_weight = 1;
1357 else if (unlikely(!w))
1358 lw->inv_weight = WMULT_CONST;
1313 else 1359 else
1314 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1360 lw->inv_weight = WMULT_CONST / w;
1315 / (lw->weight+1);
1316 } 1361 }
1317 1362
1318 tmp = (u64)delta_exec * weight;
1319 /* 1363 /*
1320 * Check whether we'd overflow the 64-bit multiplication: 1364 * Check whether we'd overflow the 64-bit multiplication:
1321 */ 1365 */
@@ -1532,38 +1576,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1532 return rq->avg_load_per_task; 1576 return rq->avg_load_per_task;
1533} 1577}
1534 1578
1535#ifdef CONFIG_FAIR_GROUP_SCHED
1536
1537/*
1538 * Compute the cpu's hierarchical load factor for each task group.
1539 * This needs to be done in a top-down fashion because the load of a child
1540 * group is a fraction of its parents load.
1541 */
1542static int tg_load_down(struct task_group *tg, void *data)
1543{
1544 unsigned long load;
1545 long cpu = (long)data;
1546
1547 if (!tg->parent) {
1548 load = cpu_rq(cpu)->load.weight;
1549 } else {
1550 load = tg->parent->cfs_rq[cpu]->h_load;
1551 load *= tg->se[cpu]->load.weight;
1552 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1553 }
1554
1555 tg->cfs_rq[cpu]->h_load = load;
1556
1557 return 0;
1558}
1559
1560static void update_h_load(long cpu)
1561{
1562 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1563}
1564
1565#endif
1566
1567#ifdef CONFIG_PREEMPT 1579#ifdef CONFIG_PREEMPT
1568 1580
1569static void double_rq_lock(struct rq *rq1, struct rq *rq2); 1581static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -1755,17 +1767,20 @@ static void dec_nr_running(struct rq *rq)
1755 1767
1756static void set_load_weight(struct task_struct *p) 1768static void set_load_weight(struct task_struct *p)
1757{ 1769{
1770 int prio = p->static_prio - MAX_RT_PRIO;
1771 struct load_weight *load = &p->se.load;
1772
1758 /* 1773 /*
1759 * SCHED_IDLE tasks get minimal weight: 1774 * SCHED_IDLE tasks get minimal weight:
1760 */ 1775 */
1761 if (p->policy == SCHED_IDLE) { 1776 if (p->policy == SCHED_IDLE) {
1762 p->se.load.weight = WEIGHT_IDLEPRIO; 1777 load->weight = scale_load(WEIGHT_IDLEPRIO);
1763 p->se.load.inv_weight = WMULT_IDLEPRIO; 1778 load->inv_weight = WMULT_IDLEPRIO;
1764 return; 1779 return;
1765 } 1780 }
1766 1781
1767 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 1782 load->weight = scale_load(prio_to_weight[prio]);
1768 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1783 load->inv_weight = prio_to_wmult[prio];
1769} 1784}
1770 1785
1771static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 1786static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
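
set_load_weight() now indexes the weight tables once with prio = static_prio - MAX_RT_PRIO and passes the table value through scale_load(), which shifts the familiar nice-0 weight of 1024 up by SCHED_LOAD_RESOLUTION bits for extra fixed-point precision on 64-bit builds. A toy sketch of that scaling, approximating the table with the usual "each nice level is about 1.25x" rule; the resolution value and helper names are assumptions, not the exact kernel constants:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define SCHED_LOAD_RESOLUTION 10                       /* assumed extra bits */
#define scale_load(w)      ((uint64_t)(w) << SCHED_LOAD_RESOLUTION)
#define scale_load_down(w) ((uint64_t)(w) >> SCHED_LOAD_RESOLUTION)

/* Approximate nice->weight: 1024 at nice 0, ~1.25x per step towards -20. */
static uint64_t nice_to_weight(int nice)
{
    double w = 1024.0;

    while (nice < 0) { w *= 1.25; nice++; }
    while (nice > 0) { w /= 1.25; nice--; }
    return (uint64_t)(w + 0.5);
}

int main(void)
{
    for (int nice = -2; nice <= 2; nice++) {
        uint64_t w = nice_to_weight(nice);

        printf("nice %+d: weight %4" PRIu64 "  scaled %8" PRIu64 "\n",
               nice, w, scale_load(w));
    }
    return 0;
}
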
@@ -1773,7 +1788,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1773 update_rq_clock(rq); 1788 update_rq_clock(rq);
1774 sched_info_queued(p); 1789 sched_info_queued(p);
1775 p->sched_class->enqueue_task(rq, p, flags); 1790 p->sched_class->enqueue_task(rq, p, flags);
1776 p->se.on_rq = 1;
1777} 1791}
1778 1792
1779static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1793static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1781,7 +1795,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1781 update_rq_clock(rq); 1795 update_rq_clock(rq);
1782 sched_info_dequeued(p); 1796 sched_info_dequeued(p);
1783 p->sched_class->dequeue_task(rq, p, flags); 1797 p->sched_class->dequeue_task(rq, p, flags);
1784 p->se.on_rq = 0;
1785} 1798}
1786 1799
1787/* 1800/*
@@ -1916,10 +1929,28 @@ void account_system_vtime(struct task_struct *curr)
1916} 1929}
1917EXPORT_SYMBOL_GPL(account_system_vtime); 1930EXPORT_SYMBOL_GPL(account_system_vtime);
1918 1931
1919static void update_rq_clock_task(struct rq *rq, s64 delta) 1932#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1933
1934#ifdef CONFIG_PARAVIRT
1935static inline u64 steal_ticks(u64 steal)
1920{ 1936{
1921 s64 irq_delta; 1937 if (unlikely(steal > NSEC_PER_SEC))
1938 return div_u64(steal, TICK_NSEC);
1939
1940 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
1941}
1942#endif
1922 1943
1944static void update_rq_clock_task(struct rq *rq, s64 delta)
1945{
1946/*
1947 * In theory, the compile should just see 0 here, and optimize out the call
1948 * to sched_rt_avg_update. But I don't trust it...
1949 */
1950#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
1951 s64 steal = 0, irq_delta = 0;
1952#endif
1953#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1923 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; 1954 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1924 1955
1925 /* 1956 /*
@@ -1942,12 +1973,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
1942 1973
1943 rq->prev_irq_time += irq_delta; 1974 rq->prev_irq_time += irq_delta;
1944 delta -= irq_delta; 1975 delta -= irq_delta;
1976#endif
1977#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
1978 if (static_branch((&paravirt_steal_rq_enabled))) {
1979 u64 st;
1980
1981 steal = paravirt_steal_clock(cpu_of(rq));
1982 steal -= rq->prev_steal_time_rq;
1983
1984 if (unlikely(steal > delta))
1985 steal = delta;
1986
1987 st = steal_ticks(steal);
1988 steal = st * TICK_NSEC;
1989
1990 rq->prev_steal_time_rq += steal;
1991
1992 delta -= steal;
1993 }
1994#endif
1995
1945 rq->clock_task += delta; 1996 rq->clock_task += delta;
1946 1997
1947 if (irq_delta && sched_feat(NONIRQ_POWER)) 1998#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
1948 sched_rt_avg_update(rq, irq_delta); 1999 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
2000 sched_rt_avg_update(rq, irq_delta + steal);
2001#endif
1949} 2002}
1950 2003
2004#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1951static int irqtime_account_hi_update(void) 2005static int irqtime_account_hi_update(void)
1952{ 2006{
1953 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2007 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
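
steal_ticks() in the hunk above converts an accumulated steal-time delta in nanoseconds into whole ticks: for the common small case it relies on __iter_div_u64_rem(), which divides by repeated subtraction (cheap when the quotient is tiny), and only falls back to a real 64-bit division once more than a second has piled up. A userspace sketch of that split, with a simplified reimplementation of the iterative divide and a TICK_NSEC value that assumes HZ=1000:

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

#define NSEC_PER_SEC 1000000000ULL
#define TICK_NSEC    1000000ULL          /* assumes HZ = 1000 */

/* Repeated-subtraction division: fine when the quotient is small. */
static uint64_t iter_div_u64_rem(uint64_t dividend, uint64_t divisor, uint64_t *rem)
{
    uint64_t q = 0;

    while (dividend >= divisor) {
        dividend -= divisor;
        q++;
    }
    *rem = dividend;
    return q;
}

static uint64_t steal_ticks(uint64_t steal)
{
    uint64_t rem;

    if (steal > NSEC_PER_SEC)            /* big backlog: one real division */
        return steal / TICK_NSEC;

    return iter_div_u64_rem(steal, TICK_NSEC, &rem);
}

int main(void)
{
    printf("%" PRIu64 " ticks\n", steal_ticks(3500000ULL));   /* prints 3 */
    return 0;
}
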
@@ -1982,12 +2036,7 @@ static int irqtime_account_si_update(void)
1982 2036
1983#define sched_clock_irqtime (0) 2037#define sched_clock_irqtime (0)
1984 2038
1985static void update_rq_clock_task(struct rq *rq, s64 delta) 2039#endif
1986{
1987 rq->clock_task += delta;
1988}
1989
1990#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1991 2040
1992#include "sched_idletask.c" 2041#include "sched_idletask.c"
1993#include "sched_fair.c" 2042#include "sched_fair.c"
@@ -2116,7 +2165,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2116 * A queue event has occurred, and we're going to schedule. In 2165 * A queue event has occurred, and we're going to schedule. In
2117 * this case, we can save a useless back to back clock update. 2166 * this case, we can save a useless back to back clock update.
2118 */ 2167 */
2119 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) 2168 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2120 rq->skip_clock_update = 1; 2169 rq->skip_clock_update = 1;
2121} 2170}
2122 2171
@@ -2162,13 +2211,28 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2162 */ 2211 */
2163 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2212 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2164 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2213 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2214
2215#ifdef CONFIG_LOCKDEP
2216 /*
2217 * The caller should hold either p->pi_lock or rq->lock, when changing
2218 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
2219 *
2220 * sched_move_task() holds both and thus holding either pins the cgroup,
2221 * see set_task_rq().
2222 *
2223 * Furthermore, all task_rq users should acquire both locks, see
2224 * task_rq_lock().
2225 */
2226 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2227 lockdep_is_held(&task_rq(p)->lock)));
2228#endif
2165#endif 2229#endif
2166 2230
2167 trace_sched_migrate_task(p, new_cpu); 2231 trace_sched_migrate_task(p, new_cpu);
2168 2232
2169 if (task_cpu(p) != new_cpu) { 2233 if (task_cpu(p) != new_cpu) {
2170 p->se.nr_migrations++; 2234 p->se.nr_migrations++;
2171 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); 2235 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
2172 } 2236 }
2173 2237
2174 __set_task_cpu(p, new_cpu); 2238 __set_task_cpu(p, new_cpu);
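
The lockdep check added to set_task_cpu() encodes the new locking rule directly in code: whoever changes a task's CPU must hold either p->pi_lock (wakeup path) or the runqueue lock (runnable path). Outside the kernel the same "assert the caller holds one of these locks" idea can be sketched with owner-tracking mutexes; everything below is illustrative scaffolding, not the lockdep API:

#include <pthread.h>
#include <stdbool.h>
#include <assert.h>
#include <stdio.h>

struct dbg_mutex {
    pthread_mutex_t m;
    pthread_t owner;
    bool held;
};

static void dbg_lock(struct dbg_mutex *l)
{
    pthread_mutex_lock(&l->m);
    l->owner = pthread_self();
    l->held = true;
}

static void dbg_unlock(struct dbg_mutex *l)
{
    l->held = false;
    pthread_mutex_unlock(&l->m);
}

static bool dbg_is_held(struct dbg_mutex *l)
{
    return l->held && pthread_equal(l->owner, pthread_self());
}

struct task {
    struct dbg_mutex pi_lock;
    struct dbg_mutex rq_lock;   /* stands in for task_rq(p)->lock */
    int cpu;
};

/* Mirrors the WARN_ON_ONCE(): require pi_lock or rq_lock before migrating. */
static void move_task_to_cpu(struct task *p, int new_cpu)
{
    assert(dbg_is_held(&p->pi_lock) || dbg_is_held(&p->rq_lock));
    p->cpu = new_cpu;
}

int main(void)
{
    struct task t = { .cpu = 0 };

    pthread_mutex_init(&t.pi_lock.m, NULL);
    pthread_mutex_init(&t.rq_lock.m, NULL);

    dbg_lock(&t.pi_lock);
    move_task_to_cpu(&t, 1);        /* ok: pi_lock held */
    dbg_unlock(&t.pi_lock);
    printf("task now on cpu %d\n", t.cpu);
    return 0;
}
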
@@ -2182,19 +2246,6 @@ struct migration_arg {
2182static int migration_cpu_stop(void *data); 2246static int migration_cpu_stop(void *data);
2183 2247
2184/* 2248/*
2185 * The task's runqueue lock must be held.
2186 * Returns true if you have to wait for migration thread.
2187 */
2188static bool migrate_task(struct task_struct *p, struct rq *rq)
2189{
2190 /*
2191 * If the task is not on a runqueue (and not running), then
2192 * the next wake-up will properly place the task.
2193 */
2194 return p->se.on_rq || task_running(rq, p);
2195}
2196
2197/*
2198 * wait_task_inactive - wait for a thread to unschedule. 2249 * wait_task_inactive - wait for a thread to unschedule.
2199 * 2250 *
2200 * If @match_state is nonzero, it's the @p->state value just checked and 2251 * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2251,11 +2302,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2251 rq = task_rq_lock(p, &flags); 2302 rq = task_rq_lock(p, &flags);
2252 trace_sched_wait_task(p); 2303 trace_sched_wait_task(p);
2253 running = task_running(rq, p); 2304 running = task_running(rq, p);
2254 on_rq = p->se.on_rq; 2305 on_rq = p->on_rq;
2255 ncsw = 0; 2306 ncsw = 0;
2256 if (!match_state || p->state == match_state) 2307 if (!match_state || p->state == match_state)
2257 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2308 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2258 task_rq_unlock(rq, &flags); 2309 task_rq_unlock(rq, p, &flags);
2259 2310
2260 /* 2311 /*
2261 * If it changed from the expected state, bail out now. 2312 * If it changed from the expected state, bail out now.
@@ -2330,7 +2381,7 @@ EXPORT_SYMBOL_GPL(kick_process);
2330 2381
2331#ifdef CONFIG_SMP 2382#ifdef CONFIG_SMP
2332/* 2383/*
2333 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2384 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2334 */ 2385 */
2335static int select_fallback_rq(int cpu, struct task_struct *p) 2386static int select_fallback_rq(int cpu, struct task_struct *p)
2336{ 2387{
@@ -2363,12 +2414,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2363} 2414}
2364 2415
2365/* 2416/*
2366 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. 2417 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2367 */ 2418 */
2368static inline 2419static inline
2369int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) 2420int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2370{ 2421{
2371 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); 2422 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2372 2423
2373 /* 2424 /*
2374 * In order not to call set_task_cpu() on a blocking task we need 2425 * In order not to call set_task_cpu() on a blocking task we need
@@ -2394,27 +2445,63 @@ static void update_avg(u64 *avg, u64 sample)
2394} 2445}
2395#endif 2446#endif
2396 2447
2397static inline void ttwu_activate(struct task_struct *p, struct rq *rq, 2448static void
2398 bool is_sync, bool is_migrate, bool is_local, 2449ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2399 unsigned long en_flags)
2400{ 2450{
2401 schedstat_inc(p, se.statistics.nr_wakeups); 2451#ifdef CONFIG_SCHEDSTATS
2402 if (is_sync) 2452 struct rq *rq = this_rq();
2403 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2453
2404 if (is_migrate) 2454#ifdef CONFIG_SMP
2405 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2455 int this_cpu = smp_processor_id();
2406 if (is_local) 2456
2457 if (cpu == this_cpu) {
2458 schedstat_inc(rq, ttwu_local);
2407 schedstat_inc(p, se.statistics.nr_wakeups_local); 2459 schedstat_inc(p, se.statistics.nr_wakeups_local);
2408 else 2460 } else {
2461 struct sched_domain *sd;
2462
2409 schedstat_inc(p, se.statistics.nr_wakeups_remote); 2463 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2464 rcu_read_lock();
2465 for_each_domain(this_cpu, sd) {
2466 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2467 schedstat_inc(sd, ttwu_wake_remote);
2468 break;
2469 }
2470 }
2471 rcu_read_unlock();
2472 }
2473
2474 if (wake_flags & WF_MIGRATED)
2475 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2410 2476
2477#endif /* CONFIG_SMP */
2478
2479 schedstat_inc(rq, ttwu_count);
2480 schedstat_inc(p, se.statistics.nr_wakeups);
2481
2482 if (wake_flags & WF_SYNC)
2483 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2484
2485#endif /* CONFIG_SCHEDSTATS */
2486}
2487
2488static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2489{
2411 activate_task(rq, p, en_flags); 2490 activate_task(rq, p, en_flags);
2491 p->on_rq = 1;
2492
2493 /* if a worker is waking up, notify workqueue */
2494 if (p->flags & PF_WQ_WORKER)
2495 wq_worker_waking_up(p, cpu_of(rq));
2412} 2496}
2413 2497
2414static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, 2498/*
2415 int wake_flags, bool success) 2499 * Mark the task runnable and perform wakeup-preemption.
2500 */
2501static void
2502ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2416{ 2503{
2417 trace_sched_wakeup(p, success); 2504 trace_sched_wakeup(p, true);
2418 check_preempt_curr(rq, p, wake_flags); 2505 check_preempt_curr(rq, p, wake_flags);
2419 2506
2420 p->state = TASK_RUNNING; 2507 p->state = TASK_RUNNING;
@@ -2422,7 +2509,7 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2422 if (p->sched_class->task_woken) 2509 if (p->sched_class->task_woken)
2423 p->sched_class->task_woken(rq, p); 2510 p->sched_class->task_woken(rq, p);
2424 2511
2425 if (unlikely(rq->idle_stamp)) { 2512 if (rq->idle_stamp) {
2426 u64 delta = rq->clock - rq->idle_stamp; 2513 u64 delta = rq->clock - rq->idle_stamp;
2427 u64 max = 2*sysctl_sched_migration_cost; 2514 u64 max = 2*sysctl_sched_migration_cost;
2428 2515
@@ -2433,9 +2520,151 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2433 rq->idle_stamp = 0; 2520 rq->idle_stamp = 0;
2434 } 2521 }
2435#endif 2522#endif
2436 /* if a worker is waking up, notify workqueue */ 2523}
2437 if ((p->flags & PF_WQ_WORKER) && success) 2524
2438 wq_worker_waking_up(p, cpu_of(rq)); 2525static void
2526ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2527{
2528#ifdef CONFIG_SMP
2529 if (p->sched_contributes_to_load)
2530 rq->nr_uninterruptible--;
2531#endif
2532
2533 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2534 ttwu_do_wakeup(rq, p, wake_flags);
2535}
2536
2537/*
 2538 * Called in case the task @p isn't fully descheduled from its runqueue;
 2539 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
2540 * since all we need to do is flip p->state to TASK_RUNNING, since
2541 * the task is still ->on_rq.
2542 */
2543static int ttwu_remote(struct task_struct *p, int wake_flags)
2544{
2545 struct rq *rq;
2546 int ret = 0;
2547
2548 rq = __task_rq_lock(p);
2549 if (p->on_rq) {
2550 ttwu_do_wakeup(rq, p, wake_flags);
2551 ret = 1;
2552 }
2553 __task_rq_unlock(rq);
2554
2555 return ret;
2556}
2557
2558#ifdef CONFIG_SMP
2559static void sched_ttwu_do_pending(struct task_struct *list)
2560{
2561 struct rq *rq = this_rq();
2562
2563 raw_spin_lock(&rq->lock);
2564
2565 while (list) {
2566 struct task_struct *p = list;
2567 list = list->wake_entry;
2568 ttwu_do_activate(rq, p, 0);
2569 }
2570
2571 raw_spin_unlock(&rq->lock);
2572}
2573
2574#ifdef CONFIG_HOTPLUG_CPU
2575
2576static void sched_ttwu_pending(void)
2577{
2578 struct rq *rq = this_rq();
2579 struct task_struct *list = xchg(&rq->wake_list, NULL);
2580
2581 if (!list)
2582 return;
2583
2584 sched_ttwu_do_pending(list);
2585}
2586
2587#endif /* CONFIG_HOTPLUG_CPU */
2588
2589void scheduler_ipi(void)
2590{
2591 struct rq *rq = this_rq();
2592 struct task_struct *list = xchg(&rq->wake_list, NULL);
2593
2594 if (!list)
2595 return;
2596
2597 /*
2598 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
2599 * traditionally all their work was done from the interrupt return
2600 * path. Now that we actually do some work, we need to make sure
2601 * we do call them.
2602 *
2603 * Some archs already do call them, luckily irq_enter/exit nest
2604 * properly.
2605 *
2606 * Arguably we should visit all archs and update all handlers,
2607 * however a fair share of IPIs are still resched only so this would
2608 * somewhat pessimize the simple resched case.
2609 */
2610 irq_enter();
2611 sched_ttwu_do_pending(list);
2612 irq_exit();
2613}
2614
2615static void ttwu_queue_remote(struct task_struct *p, int cpu)
2616{
2617 struct rq *rq = cpu_rq(cpu);
2618 struct task_struct *next = rq->wake_list;
2619
2620 for (;;) {
2621 struct task_struct *old = next;
2622
2623 p->wake_entry = next;
2624 next = cmpxchg(&rq->wake_list, old, p);
2625 if (next == old)
2626 break;
2627 }
2628
2629 if (!next)
2630 smp_send_reschedule(cpu);
2631}
2632
2633#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2634static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2635{
2636 struct rq *rq;
2637 int ret = 0;
2638
2639 rq = __task_rq_lock(p);
2640 if (p->on_cpu) {
2641 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2642 ttwu_do_wakeup(rq, p, wake_flags);
2643 ret = 1;
2644 }
2645 __task_rq_unlock(rq);
2646
2647 return ret;
2648
2649}
2650#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2651#endif /* CONFIG_SMP */
2652
2653static void ttwu_queue(struct task_struct *p, int cpu)
2654{
2655 struct rq *rq = cpu_rq(cpu);
2656
2657#if defined(CONFIG_SMP)
2658 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2659 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2660 ttwu_queue_remote(p, cpu);
2661 return;
2662 }
2663#endif
2664
2665 raw_spin_lock(&rq->lock);
2666 ttwu_do_activate(rq, p, 0);
2667 raw_spin_unlock(&rq->lock);
2439} 2668}
2440 2669
2441/** 2670/**
@@ -2453,92 +2682,66 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2453 * Returns %true if @p was woken up, %false if it was already running 2682 * Returns %true if @p was woken up, %false if it was already running
2454 * or @state didn't match @p's state. 2683 * or @state didn't match @p's state.
2455 */ 2684 */
2456static int try_to_wake_up(struct task_struct *p, unsigned int state, 2685static int
2457 int wake_flags) 2686try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2458{ 2687{
2459 int cpu, orig_cpu, this_cpu, success = 0;
2460 unsigned long flags; 2688 unsigned long flags;
2461 unsigned long en_flags = ENQUEUE_WAKEUP; 2689 int cpu, success = 0;
2462 struct rq *rq;
2463
2464 this_cpu = get_cpu();
2465 2690
2466 smp_wmb(); 2691 smp_wmb();
2467 rq = task_rq_lock(p, &flags); 2692 raw_spin_lock_irqsave(&p->pi_lock, flags);
2468 if (!(p->state & state)) 2693 if (!(p->state & state))
2469 goto out; 2694 goto out;
2470 2695
2471 if (p->se.on_rq) 2696 success = 1; /* we're going to change ->state */
2472 goto out_running;
2473
2474 cpu = task_cpu(p); 2697 cpu = task_cpu(p);
2475 orig_cpu = cpu;
2476 2698
2477#ifdef CONFIG_SMP 2699 if (p->on_rq && ttwu_remote(p, wake_flags))
2478 if (unlikely(task_running(rq, p))) 2700 goto stat;
2479 goto out_activate;
2480 2701
2702#ifdef CONFIG_SMP
2481 /* 2703 /*
2482 * In order to handle concurrent wakeups and release the rq->lock 2704 * If the owning (remote) cpu is still in the middle of schedule() with
2483 * we put the task in TASK_WAKING state. 2705 * this task as prev, wait until its done referencing the task.
2484 *
2485 * First fix up the nr_uninterruptible count:
2486 */ 2706 */
2487 if (task_contributes_to_load(p)) { 2707 while (p->on_cpu) {
2488 if (likely(cpu_online(orig_cpu))) 2708#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2489 rq->nr_uninterruptible--; 2709 /*
2490 else 2710 * In case the architecture enables interrupts in
2491 this_rq()->nr_uninterruptible--; 2711 * context_switch(), we cannot busy wait, since that
2712 * would lead to deadlocks when an interrupt hits and
2713 * tries to wake up @prev. So bail and do a complete
2714 * remote wakeup.
2715 */
2716 if (ttwu_activate_remote(p, wake_flags))
2717 goto stat;
2718#else
2719 cpu_relax();
2720#endif
2492 } 2721 }
2722 /*
2723 * Pairs with the smp_wmb() in finish_lock_switch().
2724 */
2725 smp_rmb();
2726
2727 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2493 p->state = TASK_WAKING; 2728 p->state = TASK_WAKING;
2494 2729
2495 if (p->sched_class->task_waking) { 2730 if (p->sched_class->task_waking)
2496 p->sched_class->task_waking(rq, p); 2731 p->sched_class->task_waking(p);
2497 en_flags |= ENQUEUE_WAKING;
2498 }
2499 2732
2500 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); 2733 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2501 if (cpu != orig_cpu) 2734 if (task_cpu(p) != cpu) {
2735 wake_flags |= WF_MIGRATED;
2502 set_task_cpu(p, cpu); 2736 set_task_cpu(p, cpu);
2503 __task_rq_unlock(rq);
2504
2505 rq = cpu_rq(cpu);
2506 raw_spin_lock(&rq->lock);
2507
2508 /*
2509 * We migrated the task without holding either rq->lock, however
2510 * since the task is not on the task list itself, nobody else
2511 * will try and migrate the task, hence the rq should match the
2512 * cpu we just moved it to.
2513 */
2514 WARN_ON(task_cpu(p) != cpu);
2515 WARN_ON(p->state != TASK_WAKING);
2516
2517#ifdef CONFIG_SCHEDSTATS
2518 schedstat_inc(rq, ttwu_count);
2519 if (cpu == this_cpu)
2520 schedstat_inc(rq, ttwu_local);
2521 else {
2522 struct sched_domain *sd;
2523 for_each_domain(this_cpu, sd) {
2524 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2525 schedstat_inc(sd, ttwu_wake_remote);
2526 break;
2527 }
2528 }
2529 } 2737 }
2530#endif /* CONFIG_SCHEDSTATS */
2531
2532out_activate:
2533#endif /* CONFIG_SMP */ 2738#endif /* CONFIG_SMP */
2534 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, 2739
2535 cpu == this_cpu, en_flags); 2740 ttwu_queue(p, cpu);
2536 success = 1; 2741stat:
2537out_running: 2742 ttwu_stat(p, cpu, wake_flags);
2538 ttwu_post_activation(p, rq, wake_flags, success);
2539out: 2743out:
2540 task_rq_unlock(rq, &flags); 2744 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2541 put_cpu();
2542 2745
2543 return success; 2746 return success;
2544} 2747}
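
The rewritten try_to_wake_up() no longer takes the runqueue lock up front: it holds only p->pi_lock, spins while p->on_cpu until the previous CPU has finished switching the task out, and relies on the smp_rmb()/smp_wmb() pairing so that everything the scheduler wrote about the task before clearing ->on_cpu is visible to the waker. The ordering half of that can be sketched with C11 acquire/release atomics; the flag and field below are toys, not the scheduler's actual state:

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static int task_state;                 /* data written by the "scheduler" side */
static atomic_int on_cpu = 1;          /* 1 while the task is being switched out */

static void *context_switch_side(void *arg)
{
    (void)arg;
    task_state = 42;                   /* all updates happen before the flag */
    atomic_store_explicit(&on_cpu, 0, memory_order_release);  /* like smp_wmb()+store */
    return NULL;
}

static void waker_side(void)
{
    /* try_to_wake_up(): wait until the previous CPU is done with the task. */
    while (atomic_load_explicit(&on_cpu, memory_order_acquire))
        ;                              /* cpu_relax() in the kernel */

    printf("saw task_state = %d\n", task_state);   /* guaranteed to see 42 */
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, context_switch_side, NULL);
    waker_side();
    pthread_join(t, NULL);
    return 0;
}
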
@@ -2547,31 +2750,34 @@ out:
2547 * try_to_wake_up_local - try to wake up a local task with rq lock held 2750 * try_to_wake_up_local - try to wake up a local task with rq lock held
2548 * @p: the thread to be awakened 2751 * @p: the thread to be awakened
2549 * 2752 *
2550 * Put @p on the run-queue if it's not already there. The caller must 2753 * Put @p on the run-queue if it's not already there. The caller must
2551 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2754 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2552 * the current task. this_rq() stays locked over invocation. 2755 * the current task.
2553 */ 2756 */
2554static void try_to_wake_up_local(struct task_struct *p) 2757static void try_to_wake_up_local(struct task_struct *p)
2555{ 2758{
2556 struct rq *rq = task_rq(p); 2759 struct rq *rq = task_rq(p);
2557 bool success = false;
2558 2760
2559 BUG_ON(rq != this_rq()); 2761 BUG_ON(rq != this_rq());
2560 BUG_ON(p == current); 2762 BUG_ON(p == current);
2561 lockdep_assert_held(&rq->lock); 2763 lockdep_assert_held(&rq->lock);
2562 2764
2765 if (!raw_spin_trylock(&p->pi_lock)) {
2766 raw_spin_unlock(&rq->lock);
2767 raw_spin_lock(&p->pi_lock);
2768 raw_spin_lock(&rq->lock);
2769 }
2770
2563 if (!(p->state & TASK_NORMAL)) 2771 if (!(p->state & TASK_NORMAL))
2564 return; 2772 goto out;
2565 2773
2566 if (!p->se.on_rq) { 2774 if (!p->on_rq)
2567 if (likely(!task_running(rq, p))) { 2775 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2568 schedstat_inc(rq, ttwu_count); 2776
2569 schedstat_inc(rq, ttwu_local); 2777 ttwu_do_wakeup(rq, p, 0);
2570 } 2778 ttwu_stat(p, smp_processor_id(), 0);
2571 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); 2779out:
2572 success = true; 2780 raw_spin_unlock(&p->pi_lock);
2573 }
2574 ttwu_post_activation(p, rq, 0, success);
2575} 2781}
2576 2782
2577/** 2783/**
@@ -2604,19 +2810,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
2604 */ 2810 */
2605static void __sched_fork(struct task_struct *p) 2811static void __sched_fork(struct task_struct *p)
2606{ 2812{
2813 p->on_rq = 0;
2814
2815 p->se.on_rq = 0;
2607 p->se.exec_start = 0; 2816 p->se.exec_start = 0;
2608 p->se.sum_exec_runtime = 0; 2817 p->se.sum_exec_runtime = 0;
2609 p->se.prev_sum_exec_runtime = 0; 2818 p->se.prev_sum_exec_runtime = 0;
2610 p->se.nr_migrations = 0; 2819 p->se.nr_migrations = 0;
2611 p->se.vruntime = 0; 2820 p->se.vruntime = 0;
2821 INIT_LIST_HEAD(&p->se.group_node);
2612 2822
2613#ifdef CONFIG_SCHEDSTATS 2823#ifdef CONFIG_SCHEDSTATS
2614 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2824 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2615#endif 2825#endif
2616 2826
2617 INIT_LIST_HEAD(&p->rt.run_list); 2827 INIT_LIST_HEAD(&p->rt.run_list);
2618 p->se.on_rq = 0;
2619 INIT_LIST_HEAD(&p->se.group_node);
2620 2828
2621#ifdef CONFIG_PREEMPT_NOTIFIERS 2829#ifdef CONFIG_PREEMPT_NOTIFIERS
2622 INIT_HLIST_HEAD(&p->preempt_notifiers); 2830 INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2626,8 +2834,9 @@ static void __sched_fork(struct task_struct *p)
2626/* 2834/*
2627 * fork()/clone()-time setup: 2835 * fork()/clone()-time setup:
2628 */ 2836 */
2629void sched_fork(struct task_struct *p, int clone_flags) 2837void sched_fork(struct task_struct *p)
2630{ 2838{
2839 unsigned long flags;
2631 int cpu = get_cpu(); 2840 int cpu = get_cpu();
2632 2841
2633 __sched_fork(p); 2842 __sched_fork(p);
@@ -2678,18 +2887,18 @@ void sched_fork(struct task_struct *p, int clone_flags)
2678 * 2887 *
2679 * Silence PROVE_RCU. 2888 * Silence PROVE_RCU.
2680 */ 2889 */
2681 rcu_read_lock(); 2890 raw_spin_lock_irqsave(&p->pi_lock, flags);
2682 set_task_cpu(p, cpu); 2891 set_task_cpu(p, cpu);
2683 rcu_read_unlock(); 2892 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2684 2893
2685#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2894#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2686 if (likely(sched_info_on())) 2895 if (likely(sched_info_on()))
2687 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2896 memset(&p->sched_info, 0, sizeof(p->sched_info));
2688#endif 2897#endif
2689#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2898#if defined(CONFIG_SMP)
2690 p->oncpu = 0; 2899 p->on_cpu = 0;
2691#endif 2900#endif
2692#ifdef CONFIG_PREEMPT 2901#ifdef CONFIG_PREEMPT_COUNT
2693 /* Want to start with kernel preemption disabled. */ 2902 /* Want to start with kernel preemption disabled. */
2694 task_thread_info(p)->preempt_count = 1; 2903 task_thread_info(p)->preempt_count = 1;
2695#endif 2904#endif
@@ -2707,41 +2916,31 @@ void sched_fork(struct task_struct *p, int clone_flags)
2707 * that must be done for every newly created context, then puts the task 2916 * that must be done for every newly created context, then puts the task
2708 * on the runqueue and wakes it. 2917 * on the runqueue and wakes it.
2709 */ 2918 */
2710void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2919void wake_up_new_task(struct task_struct *p)
2711{ 2920{
2712 unsigned long flags; 2921 unsigned long flags;
2713 struct rq *rq; 2922 struct rq *rq;
2714 int cpu __maybe_unused = get_cpu();
2715 2923
2924 raw_spin_lock_irqsave(&p->pi_lock, flags);
2716#ifdef CONFIG_SMP 2925#ifdef CONFIG_SMP
2717 rq = task_rq_lock(p, &flags);
2718 p->state = TASK_WAKING;
2719
2720 /* 2926 /*
2721 * Fork balancing, do it here and not earlier because: 2927 * Fork balancing, do it here and not earlier because:
2722 * - cpus_allowed can change in the fork path 2928 * - cpus_allowed can change in the fork path
2723 * - any previously selected cpu might disappear through hotplug 2929 * - any previously selected cpu might disappear through hotplug
2724 *
2725 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2726 * without people poking at ->cpus_allowed.
2727 */ 2930 */
2728 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); 2931 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
2729 set_task_cpu(p, cpu);
2730
2731 p->state = TASK_RUNNING;
2732 task_rq_unlock(rq, &flags);
2733#endif 2932#endif
2734 2933
2735 rq = task_rq_lock(p, &flags); 2934 rq = __task_rq_lock(p);
2736 activate_task(rq, p, 0); 2935 activate_task(rq, p, 0);
2737 trace_sched_wakeup_new(p, 1); 2936 p->on_rq = 1;
2937 trace_sched_wakeup_new(p, true);
2738 check_preempt_curr(rq, p, WF_FORK); 2938 check_preempt_curr(rq, p, WF_FORK);
2739#ifdef CONFIG_SMP 2939#ifdef CONFIG_SMP
2740 if (p->sched_class->task_woken) 2940 if (p->sched_class->task_woken)
2741 p->sched_class->task_woken(rq, p); 2941 p->sched_class->task_woken(rq, p);
2742#endif 2942#endif
2743 task_rq_unlock(rq, &flags); 2943 task_rq_unlock(rq, p, &flags);
2744 put_cpu();
2745} 2944}
2746 2945
2747#ifdef CONFIG_PREEMPT_NOTIFIERS 2946#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -3450,27 +3649,22 @@ void sched_exec(void)
3450{ 3649{
3451 struct task_struct *p = current; 3650 struct task_struct *p = current;
3452 unsigned long flags; 3651 unsigned long flags;
3453 struct rq *rq;
3454 int dest_cpu; 3652 int dest_cpu;
3455 3653
3456 rq = task_rq_lock(p, &flags); 3654 raw_spin_lock_irqsave(&p->pi_lock, flags);
3457 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); 3655 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3458 if (dest_cpu == smp_processor_id()) 3656 if (dest_cpu == smp_processor_id())
3459 goto unlock; 3657 goto unlock;
3460 3658
3461 /* 3659 if (likely(cpu_active(dest_cpu))) {
3462 * select_task_rq() can race against ->cpus_allowed
3463 */
3464 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3465 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3466 struct migration_arg arg = { p, dest_cpu }; 3660 struct migration_arg arg = { p, dest_cpu };
3467 3661
3468 task_rq_unlock(rq, &flags); 3662 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3469 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 3663 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3470 return; 3664 return;
3471 } 3665 }
3472unlock: 3666unlock:
3473 task_rq_unlock(rq, &flags); 3667 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3474} 3668}
3475 3669
3476#endif 3670#endif
@@ -3507,7 +3701,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
3507 3701
3508 rq = task_rq_lock(p, &flags); 3702 rq = task_rq_lock(p, &flags);
3509 ns = do_task_delta_exec(p, rq); 3703 ns = do_task_delta_exec(p, rq);
3510 task_rq_unlock(rq, &flags); 3704 task_rq_unlock(rq, p, &flags);
3511 3705
3512 return ns; 3706 return ns;
3513} 3707}
@@ -3525,7 +3719,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3525 3719
3526 rq = task_rq_lock(p, &flags); 3720 rq = task_rq_lock(p, &flags);
3527 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3721 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3528 task_rq_unlock(rq, &flags); 3722 task_rq_unlock(rq, p, &flags);
3529 3723
3530 return ns; 3724 return ns;
3531} 3725}
@@ -3549,7 +3743,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
3549 rq = task_rq_lock(p, &flags); 3743 rq = task_rq_lock(p, &flags);
3550 thread_group_cputime(p, &totals); 3744 thread_group_cputime(p, &totals);
3551 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3745 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3552 task_rq_unlock(rq, &flags); 3746 task_rq_unlock(rq, p, &flags);
3553 3747
3554 return ns; 3748 return ns;
3555} 3749}
@@ -3695,6 +3889,25 @@ void account_idle_time(cputime_t cputime)
3695 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 3889 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
3696} 3890}
3697 3891
3892static __always_inline bool steal_account_process_tick(void)
3893{
3894#ifdef CONFIG_PARAVIRT
3895 if (static_branch(&paravirt_steal_enabled)) {
3896 u64 steal, st = 0;
3897
3898 steal = paravirt_steal_clock(smp_processor_id());
3899 steal -= this_rq()->prev_steal_time;
3900
3901 st = steal_ticks(steal);
3902 this_rq()->prev_steal_time += st * TICK_NSEC;
3903
3904 account_steal_time(st);
3905 return st;
3906 }
3907#endif
3908 return false;
3909}
3910
3698#ifndef CONFIG_VIRT_CPU_ACCOUNTING 3911#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3699 3912
3700#ifdef CONFIG_IRQ_TIME_ACCOUNTING 3913#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -3726,6 +3939,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3726 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); 3939 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3727 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3940 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3728 3941
3942 if (steal_account_process_tick())
3943 return;
3944
3729 if (irqtime_account_hi_update()) { 3945 if (irqtime_account_hi_update()) {
3730 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3946 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3731 } else if (irqtime_account_si_update()) { 3947 } else if (irqtime_account_si_update()) {
@@ -3779,6 +3995,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
3779 return; 3995 return;
3780 } 3996 }
3781 3997
3998 if (steal_account_process_tick())
3999 return;
4000
3782 if (user_tick) 4001 if (user_tick)
3783 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 4002 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3784 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 4003 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3903,9 +4122,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3903/* 4122/*
3904 * This function gets called by the timer code, with HZ frequency. 4123 * This function gets called by the timer code, with HZ frequency.
3905 * We call it with interrupts disabled. 4124 * We call it with interrupts disabled.
3906 *
3907 * It also gets called by the fork code, when changing the parent's
3908 * timeslices.
3909 */ 4125 */
3910void scheduler_tick(void) 4126void scheduler_tick(void)
3911{ 4127{
@@ -4025,17 +4241,11 @@ static inline void schedule_debug(struct task_struct *prev)
4025 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4241 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4026 4242
4027 schedstat_inc(this_rq(), sched_count); 4243 schedstat_inc(this_rq(), sched_count);
4028#ifdef CONFIG_SCHEDSTATS
4029 if (unlikely(prev->lock_depth >= 0)) {
4030 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
4031 schedstat_inc(prev, sched_info.bkl_count);
4032 }
4033#endif
4034} 4244}
4035 4245
4036static void put_prev_task(struct rq *rq, struct task_struct *prev) 4246static void put_prev_task(struct rq *rq, struct task_struct *prev)
4037{ 4247{
4038 if (prev->se.on_rq) 4248 if (prev->on_rq || rq->skip_clock_update < 0)
4039 update_rq_clock(rq); 4249 update_rq_clock(rq);
4040 prev->sched_class->put_prev_task(rq, prev); 4250 prev->sched_class->put_prev_task(rq, prev);
4041} 4251}
@@ -4097,11 +4307,13 @@ need_resched:
4097 if (unlikely(signal_pending_state(prev->state, prev))) { 4307 if (unlikely(signal_pending_state(prev->state, prev))) {
4098 prev->state = TASK_RUNNING; 4308 prev->state = TASK_RUNNING;
4099 } else { 4309 } else {
4310 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4311 prev->on_rq = 0;
4312
4100 /* 4313 /*
4101 * If a worker is going to sleep, notify and 4314 * If a worker went to sleep, notify and ask workqueue
4102 * ask workqueue whether it wants to wake up a 4315 * whether it wants to wake up a task to maintain
4103 * task to maintain concurrency. If so, wake 4316 * concurrency.
4104 * up the task.
4105 */ 4317 */
4106 if (prev->flags & PF_WQ_WORKER) { 4318 if (prev->flags & PF_WQ_WORKER) {
4107 struct task_struct *to_wakeup; 4319 struct task_struct *to_wakeup;
@@ -4110,11 +4322,10 @@ need_resched:
4110 if (to_wakeup) 4322 if (to_wakeup)
4111 try_to_wake_up_local(to_wakeup); 4323 try_to_wake_up_local(to_wakeup);
4112 } 4324 }
4113 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4114 4325
4115 /* 4326 /*
4116 * If we are going to sleep and we have plugged IO queued, make 4327 * If we are going to sleep and we have plugged IO
4117 * sure to submit it to avoid deadlocks. 4328 * queued, make sure to submit it to avoid deadlocks.
4118 */ 4329 */
4119 if (blk_needs_flush_plug(prev)) { 4330 if (blk_needs_flush_plug(prev)) {
4120 raw_spin_unlock(&rq->lock); 4331 raw_spin_unlock(&rq->lock);
@@ -4161,71 +4372,47 @@ need_resched:
4161EXPORT_SYMBOL(schedule); 4372EXPORT_SYMBOL(schedule);
4162 4373
4163#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4374#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4164/*
4165 * Look out! "owner" is an entirely speculative pointer
4166 * access and not reliable.
4167 */
4168int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4169{
4170 unsigned int cpu;
4171 struct rq *rq;
4172 4375
4173 if (!sched_feat(OWNER_SPIN)) 4376static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4174 return 0; 4377{
4378 if (lock->owner != owner)
4379 return false;
4175 4380
4176#ifdef CONFIG_DEBUG_PAGEALLOC
4177 /* 4381 /*
4178 * Need to access the cpu field knowing that 4382 * Ensure we emit the owner->on_cpu, dereference _after_ checking
4179 * DEBUG_PAGEALLOC could have unmapped it if 4383 * lock->owner still matches owner, if that fails, owner might
4180 * the mutex owner just released it and exited. 4384 * point to free()d memory, if it still matches, the rcu_read_lock()
4385 * ensures the memory stays valid.
4181 */ 4386 */
4182 if (probe_kernel_address(&owner->cpu, cpu)) 4387 barrier();
4183 return 0;
4184#else
4185 cpu = owner->cpu;
4186#endif
4187 4388
4188 /* 4389 return owner->on_cpu;
4189 * Even if the access succeeded (likely case), 4390}
4190 * the cpu field may no longer be valid.
4191 */
4192 if (cpu >= nr_cpumask_bits)
4193 return 0;
4194 4391
4195 /* 4392/*
4196 * We need to validate that we can do a 4393 * Look out! "owner" is an entirely speculative pointer
4197 * get_cpu() and that we have the percpu area. 4394 * access and not reliable.
4198 */ 4395 */
4199 if (!cpu_online(cpu)) 4396int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4397{
4398 if (!sched_feat(OWNER_SPIN))
4200 return 0; 4399 return 0;
4201 4400
4202 rq = cpu_rq(cpu); 4401 rcu_read_lock();
4203 4402 while (owner_running(lock, owner)) {
4204 for (;;) { 4403 if (need_resched())
4205 /*
4206 * Owner changed, break to re-assess state.
4207 */
4208 if (lock->owner != owner) {
4209 /*
4210 * If the lock has switched to a different owner,
4211 * we likely have heavy contention. Return 0 to quit
4212 * optimistic spinning and not contend further:
4213 */
4214 if (lock->owner)
4215 return 0;
4216 break; 4404 break;
4217 }
4218
4219 /*
4220 * Is that owner really running on that cpu?
4221 */
4222 if (task_thread_info(rq->curr) != owner || need_resched())
4223 return 0;
4224 4405
4225 arch_mutex_cpu_relax(); 4406 arch_mutex_cpu_relax();
4226 } 4407 }
4408 rcu_read_unlock();
4227 4409
4228 return 1; 4410 /*
4411 * We break out the loop above on need_resched() and when the
4412 * owner changed, which is a sign for heavy contention. Return
4413 * success only when lock->owner is NULL.
4414 */
4415 return lock->owner == NULL;
4229} 4416}
4230#endif 4417#endif
4231 4418
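
The rewritten mutex_spin_on_owner() spins only while two things hold: lock->owner is still the owner it started with, and that owner is currently on a CPU; the barrier() plus rcu_read_lock() make the ->on_cpu dereference safe against the owner exiting. A userspace caricature of that adaptive-spin loop with C11 atomics; the fields and the spin budget standing in for need_resched() are made up, and real RCU is out of scope:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct task {
    atomic_bool on_cpu;          /* "is this thread currently running?" */
};

struct mutex {
    _Atomic(struct task *) owner;
};

static bool owner_running(struct mutex *lock, struct task *owner)
{
    /* Re-check the owner before looking at on_cpu; if it changed,
     * spinning on the old task's flag would be meaningless. */
    if (atomic_load(&lock->owner) != owner)
        return false;

    return atomic_load(&owner->on_cpu);
}

/* Spin while the same owner keeps running; report whether the lock looks free. */
static bool spin_on_owner(struct mutex *lock, struct task *owner, int budget)
{
    while (owner_running(lock, owner)) {
        if (--budget <= 0)       /* stand-in for need_resched() */
            break;
    }
    /* Keep spinning on the mutex itself only if nobody owns it now. */
    return atomic_load(&lock->owner) == NULL;
}

int main(void)
{
    struct task t;
    struct mutex m;

    atomic_init(&t.on_cpu, false);
    atomic_init(&m.owner, &t);
    printf("lock looks free: %d\n", spin_on_owner(&m, &t, 1000));

    atomic_store(&m.owner, NULL);
    printf("lock looks free: %d\n", spin_on_owner(&m, &t, 1000));
    return 0;
}
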
@@ -4684,19 +4871,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
4684 */ 4871 */
4685void rt_mutex_setprio(struct task_struct *p, int prio) 4872void rt_mutex_setprio(struct task_struct *p, int prio)
4686{ 4873{
4687 unsigned long flags;
4688 int oldprio, on_rq, running; 4874 int oldprio, on_rq, running;
4689 struct rq *rq; 4875 struct rq *rq;
4690 const struct sched_class *prev_class; 4876 const struct sched_class *prev_class;
4691 4877
4692 BUG_ON(prio < 0 || prio > MAX_PRIO); 4878 BUG_ON(prio < 0 || prio > MAX_PRIO);
4693 4879
4694 rq = task_rq_lock(p, &flags); 4880 rq = __task_rq_lock(p);
4695 4881
4696 trace_sched_pi_setprio(p, prio); 4882 trace_sched_pi_setprio(p, prio);
4697 oldprio = p->prio; 4883 oldprio = p->prio;
4698 prev_class = p->sched_class; 4884 prev_class = p->sched_class;
4699 on_rq = p->se.on_rq; 4885 on_rq = p->on_rq;
4700 running = task_current(rq, p); 4886 running = task_current(rq, p);
4701 if (on_rq) 4887 if (on_rq)
4702 dequeue_task(rq, p, 0); 4888 dequeue_task(rq, p, 0);
@@ -4716,7 +4902,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4716 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4902 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4717 4903
4718 check_class_changed(rq, p, prev_class, oldprio); 4904 check_class_changed(rq, p, prev_class, oldprio);
4719 task_rq_unlock(rq, &flags); 4905 __task_rq_unlock(rq);
4720} 4906}
4721 4907
4722#endif 4908#endif
@@ -4744,7 +4930,7 @@ void set_user_nice(struct task_struct *p, long nice)
4744 p->static_prio = NICE_TO_PRIO(nice); 4930 p->static_prio = NICE_TO_PRIO(nice);
4745 goto out_unlock; 4931 goto out_unlock;
4746 } 4932 }
4747 on_rq = p->se.on_rq; 4933 on_rq = p->on_rq;
4748 if (on_rq) 4934 if (on_rq)
4749 dequeue_task(rq, p, 0); 4935 dequeue_task(rq, p, 0);
4750 4936
@@ -4764,7 +4950,7 @@ void set_user_nice(struct task_struct *p, long nice)
4764 resched_task(rq->curr); 4950 resched_task(rq->curr);
4765 } 4951 }
4766out_unlock: 4952out_unlock:
4767 task_rq_unlock(rq, &flags); 4953 task_rq_unlock(rq, p, &flags);
4768} 4954}
4769EXPORT_SYMBOL(set_user_nice); 4955EXPORT_SYMBOL(set_user_nice);
4770 4956
@@ -4878,8 +5064,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
4878static void 5064static void
4879__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 5065__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4880{ 5066{
4881 BUG_ON(p->se.on_rq);
4882
4883 p->policy = policy; 5067 p->policy = policy;
4884 p->rt_priority = prio; 5068 p->rt_priority = prio;
4885 p->normal_prio = normal_prio(p); 5069 p->normal_prio = normal_prio(p);
@@ -4994,20 +5178,17 @@ recheck:
4994 /* 5178 /*
4995 * make sure no PI-waiters arrive (or leave) while we are 5179 * make sure no PI-waiters arrive (or leave) while we are
4996 * changing the priority of the task: 5180 * changing the priority of the task:
4997 */ 5181 *
4998 raw_spin_lock_irqsave(&p->pi_lock, flags);
4999 /*
5000 * To be able to change p->policy safely, the appropriate 5182 * To be able to change p->policy safely, the appropriate
5001 * runqueue lock must be held. 5183 * runqueue lock must be held.
5002 */ 5184 */
5003 rq = __task_rq_lock(p); 5185 rq = task_rq_lock(p, &flags);
5004 5186
5005 /* 5187 /*
 5006 * Changing the policy of the stop threads is a very bad idea 5188
5007 */ 5189 */
5008 if (p == rq->stop) { 5190 if (p == rq->stop) {
5009 __task_rq_unlock(rq); 5191 task_rq_unlock(rq, p, &flags);
5010 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5011 return -EINVAL; 5192 return -EINVAL;
5012 } 5193 }
5013 5194
@@ -5031,8 +5212,7 @@ recheck:
5031 if (rt_bandwidth_enabled() && rt_policy(policy) && 5212 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5032 task_group(p)->rt_bandwidth.rt_runtime == 0 && 5213 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5033 !task_group_is_autogroup(task_group(p))) { 5214 !task_group_is_autogroup(task_group(p))) {
5034 __task_rq_unlock(rq); 5215 task_rq_unlock(rq, p, &flags);
5035 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5036 return -EPERM; 5216 return -EPERM;
5037 } 5217 }
5038 } 5218 }
@@ -5041,11 +5221,10 @@ recheck:
5041 /* recheck policy now with rq lock held */ 5221 /* recheck policy now with rq lock held */
5042 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5222 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5043 policy = oldpolicy = -1; 5223 policy = oldpolicy = -1;
5044 __task_rq_unlock(rq); 5224 task_rq_unlock(rq, p, &flags);
5045 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5046 goto recheck; 5225 goto recheck;
5047 } 5226 }
5048 on_rq = p->se.on_rq; 5227 on_rq = p->on_rq;
5049 running = task_current(rq, p); 5228 running = task_current(rq, p);
5050 if (on_rq) 5229 if (on_rq)
5051 deactivate_task(rq, p, 0); 5230 deactivate_task(rq, p, 0);
@@ -5064,8 +5243,7 @@ recheck:
5064 activate_task(rq, p, 0); 5243 activate_task(rq, p, 0);
5065 5244
5066 check_class_changed(rq, p, prev_class, oldprio); 5245 check_class_changed(rq, p, prev_class, oldprio);
5067 __task_rq_unlock(rq); 5246 task_rq_unlock(rq, p, &flags);
5068 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5069 5247
5070 rt_mutex_adjust_pi(p); 5248 rt_mutex_adjust_pi(p);
5071 5249
@@ -5316,7 +5494,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5316{ 5494{
5317 struct task_struct *p; 5495 struct task_struct *p;
5318 unsigned long flags; 5496 unsigned long flags;
5319 struct rq *rq;
5320 int retval; 5497 int retval;
5321 5498
5322 get_online_cpus(); 5499 get_online_cpus();
@@ -5331,9 +5508,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5331 if (retval) 5508 if (retval)
5332 goto out_unlock; 5509 goto out_unlock;
5333 5510
5334 rq = task_rq_lock(p, &flags); 5511 raw_spin_lock_irqsave(&p->pi_lock, flags);
5335 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5512 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5336 task_rq_unlock(rq, &flags); 5513 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5337 5514
5338out_unlock: 5515out_unlock:
5339 rcu_read_unlock(); 5516 rcu_read_unlock();
@@ -5658,7 +5835,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5658 5835
5659 rq = task_rq_lock(p, &flags); 5836 rq = task_rq_lock(p, &flags);
5660 time_slice = p->sched_class->get_rr_interval(rq, p); 5837 time_slice = p->sched_class->get_rr_interval(rq, p);
5661 task_rq_unlock(rq, &flags); 5838 task_rq_unlock(rq, p, &flags);
5662 5839
5663 rcu_read_unlock(); 5840 rcu_read_unlock();
5664 jiffies_to_timespec(time_slice, &t); 5841 jiffies_to_timespec(time_slice, &t);
@@ -5760,7 +5937,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5760 idle->state = TASK_RUNNING; 5937 idle->state = TASK_RUNNING;
5761 idle->se.exec_start = sched_clock(); 5938 idle->se.exec_start = sched_clock();
5762 5939
5763 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5940 do_set_cpus_allowed(idle, cpumask_of(cpu));
5764 /* 5941 /*
5765 * We're having a chicken and egg problem, even though we are 5942 * We're having a chicken and egg problem, even though we are
5766 * holding rq->lock, the cpu isn't yet set to this cpu so the 5943 * holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -5776,17 +5953,14 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5776 rcu_read_unlock(); 5953 rcu_read_unlock();
5777 5954
5778 rq->curr = rq->idle = idle; 5955 rq->curr = rq->idle = idle;
5779#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5956#if defined(CONFIG_SMP)
5780 idle->oncpu = 1; 5957 idle->on_cpu = 1;
5781#endif 5958#endif
5782 raw_spin_unlock_irqrestore(&rq->lock, flags); 5959 raw_spin_unlock_irqrestore(&rq->lock, flags);
5783 5960
5784 /* Set the preempt count _outside_ the spinlocks! */ 5961 /* Set the preempt count _outside_ the spinlocks! */
5785#if defined(CONFIG_PREEMPT)
5786 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5787#else
5788 task_thread_info(idle)->preempt_count = 0; 5962 task_thread_info(idle)->preempt_count = 0;
5789#endif 5963
5790 /* 5964 /*
5791 * The idle tasks have their own, simple scheduling class: 5965 * The idle tasks have their own, simple scheduling class:
5792 */ 5966 */
@@ -5851,6 +6025,16 @@ static inline void sched_init_granularity(void)
5851} 6025}
5852 6026
5853#ifdef CONFIG_SMP 6027#ifdef CONFIG_SMP
6028void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6029{
6030 if (p->sched_class && p->sched_class->set_cpus_allowed)
6031 p->sched_class->set_cpus_allowed(p, new_mask);
6032 else {
6033 cpumask_copy(&p->cpus_allowed, new_mask);
6034 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
6035 }
6036}
6037
5854/* 6038/*
5855 * This is how migration works: 6039 * This is how migration works:
5856 * 6040 *
@@ -5881,52 +6065,38 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5881 unsigned int dest_cpu; 6065 unsigned int dest_cpu;
5882 int ret = 0; 6066 int ret = 0;
5883 6067
5884 /*
5885 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5886 * drop the rq->lock and still rely on ->cpus_allowed.
5887 */
5888again:
5889 while (task_is_waking(p))
5890 cpu_relax();
5891 rq = task_rq_lock(p, &flags); 6068 rq = task_rq_lock(p, &flags);
5892 if (task_is_waking(p)) { 6069
5893 task_rq_unlock(rq, &flags); 6070 if (cpumask_equal(&p->cpus_allowed, new_mask))
5894 goto again; 6071 goto out;
5895 }
5896 6072
5897 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 6073 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5898 ret = -EINVAL; 6074 ret = -EINVAL;
5899 goto out; 6075 goto out;
5900 } 6076 }
5901 6077
5902 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 6078 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
5903 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5904 ret = -EINVAL; 6079 ret = -EINVAL;
5905 goto out; 6080 goto out;
5906 } 6081 }
5907 6082
5908 if (p->sched_class->set_cpus_allowed) 6083 do_set_cpus_allowed(p, new_mask);
5909 p->sched_class->set_cpus_allowed(p, new_mask);
5910 else {
5911 cpumask_copy(&p->cpus_allowed, new_mask);
5912 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5913 }
5914 6084
5915 /* Can the task run on the task's current CPU? If so, we're done */ 6085 /* Can the task run on the task's current CPU? If so, we're done */
5916 if (cpumask_test_cpu(task_cpu(p), new_mask)) 6086 if (cpumask_test_cpu(task_cpu(p), new_mask))
5917 goto out; 6087 goto out;
5918 6088
5919 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 6089 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5920 if (migrate_task(p, rq)) { 6090 if (p->on_rq) {
5921 struct migration_arg arg = { p, dest_cpu }; 6091 struct migration_arg arg = { p, dest_cpu };
5922 /* Need help from migration thread: drop lock and wait. */ 6092 /* Need help from migration thread: drop lock and wait. */
5923 task_rq_unlock(rq, &flags); 6093 task_rq_unlock(rq, p, &flags);
5924 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 6094 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5925 tlb_migrate_finish(p->mm); 6095 tlb_migrate_finish(p->mm);
5926 return 0; 6096 return 0;
5927 } 6097 }
5928out: 6098out:
5929 task_rq_unlock(rq, &flags); 6099 task_rq_unlock(rq, p, &flags);
5930 6100
5931 return ret; 6101 return ret;
5932} 6102}
@@ -5954,6 +6124,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5954 rq_src = cpu_rq(src_cpu); 6124 rq_src = cpu_rq(src_cpu);
5955 rq_dest = cpu_rq(dest_cpu); 6125 rq_dest = cpu_rq(dest_cpu);
5956 6126
6127 raw_spin_lock(&p->pi_lock);
5957 double_rq_lock(rq_src, rq_dest); 6128 double_rq_lock(rq_src, rq_dest);
5958 /* Already moved. */ 6129 /* Already moved. */
5959 if (task_cpu(p) != src_cpu) 6130 if (task_cpu(p) != src_cpu)
@@ -5966,7 +6137,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5966 * If we're not on a rq, the next wake-up will ensure we're 6137 * If we're not on a rq, the next wake-up will ensure we're
5967 * placed properly. 6138 * placed properly.
5968 */ 6139 */
5969 if (p->se.on_rq) { 6140 if (p->on_rq) {
5970 deactivate_task(rq_src, p, 0); 6141 deactivate_task(rq_src, p, 0);
5971 set_task_cpu(p, dest_cpu); 6142 set_task_cpu(p, dest_cpu);
5972 activate_task(rq_dest, p, 0); 6143 activate_task(rq_dest, p, 0);
@@ -5976,6 +6147,7 @@ done:
5976 ret = 1; 6147 ret = 1;
5977fail: 6148fail:
5978 double_rq_unlock(rq_src, rq_dest); 6149 double_rq_unlock(rq_src, rq_dest);
6150 raw_spin_unlock(&p->pi_lock);
5979 return ret; 6151 return ret;
5980} 6152}
5981 6153
@@ -6316,6 +6488,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6316 6488
6317#ifdef CONFIG_HOTPLUG_CPU 6489#ifdef CONFIG_HOTPLUG_CPU
6318 case CPU_DYING: 6490 case CPU_DYING:
6491 sched_ttwu_pending();
6319 /* Update our root-domain */ 6492 /* Update our root-domain */
6320 raw_spin_lock_irqsave(&rq->lock, flags); 6493 raw_spin_lock_irqsave(&rq->lock, flags);
6321 if (rq->rd) { 6494 if (rq->rd) {
@@ -6394,6 +6567,8 @@ early_initcall(migration_init);
6394 6567
6395#ifdef CONFIG_SMP 6568#ifdef CONFIG_SMP
6396 6569
6570static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6571
6397#ifdef CONFIG_SCHED_DEBUG 6572#ifdef CONFIG_SCHED_DEBUG
6398 6573
6399static __read_mostly int sched_domain_debug_enabled; 6574static __read_mostly int sched_domain_debug_enabled;
@@ -6444,7 +6619,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6444 break; 6619 break;
6445 } 6620 }
6446 6621
6447 if (!group->cpu_power) { 6622 if (!group->sgp->power) {
6448 printk(KERN_CONT "\n"); 6623 printk(KERN_CONT "\n");
6449 printk(KERN_ERR "ERROR: domain->cpu_power not " 6624 printk(KERN_ERR "ERROR: domain->cpu_power not "
6450 "set\n"); 6625 "set\n");
@@ -6468,9 +6643,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6468 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6643 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6469 6644
6470 printk(KERN_CONT " %s", str); 6645 printk(KERN_CONT " %s", str);
6471 if (group->cpu_power != SCHED_LOAD_SCALE) { 6646 if (group->sgp->power != SCHED_POWER_SCALE) {
6472 printk(KERN_CONT " (cpu_power = %d)", 6647 printk(KERN_CONT " (cpu_power = %d)",
6473 group->cpu_power); 6648 group->sgp->power);
6474 } 6649 }
6475 6650
6476 group = group->next; 6651 group = group->next;
@@ -6489,7 +6664,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6489 6664
6490static void sched_domain_debug(struct sched_domain *sd, int cpu) 6665static void sched_domain_debug(struct sched_domain *sd, int cpu)
6491{ 6666{
6492 cpumask_var_t groupmask;
6493 int level = 0; 6667 int level = 0;
6494 6668
6495 if (!sched_domain_debug_enabled) 6669 if (!sched_domain_debug_enabled)
@@ -6502,20 +6676,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6502 6676
6503 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6677 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6504 6678
6505 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6506 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6507 return;
6508 }
6509
6510 for (;;) { 6679 for (;;) {
6511 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6680 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6512 break; 6681 break;
6513 level++; 6682 level++;
6514 sd = sd->parent; 6683 sd = sd->parent;
6515 if (!sd) 6684 if (!sd)
6516 break; 6685 break;
6517 } 6686 }
6518 free_cpumask_var(groupmask);
6519} 6687}
6520#else /* !CONFIG_SCHED_DEBUG */ 6688#else /* !CONFIG_SCHED_DEBUG */
6521# define sched_domain_debug(sd, cpu) do { } while (0) 6689# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6572,12 +6740,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6572 return 1; 6740 return 1;
6573} 6741}
6574 6742
6575static void free_rootdomain(struct root_domain *rd) 6743static void free_rootdomain(struct rcu_head *rcu)
6576{ 6744{
6577 synchronize_sched(); 6745 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6578 6746
6579 cpupri_cleanup(&rd->cpupri); 6747 cpupri_cleanup(&rd->cpupri);
6580
6581 free_cpumask_var(rd->rto_mask); 6748 free_cpumask_var(rd->rto_mask);
6582 free_cpumask_var(rd->online); 6749 free_cpumask_var(rd->online);
6583 free_cpumask_var(rd->span); 6750 free_cpumask_var(rd->span);
@@ -6618,7 +6785,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6618 raw_spin_unlock_irqrestore(&rq->lock, flags); 6785 raw_spin_unlock_irqrestore(&rq->lock, flags);
6619 6786
6620 if (old_rd) 6787 if (old_rd)
6621 free_rootdomain(old_rd); 6788 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6622} 6789}
6623 6790
6624static int init_rootdomain(struct root_domain *rd) 6791static int init_rootdomain(struct root_domain *rd)
@@ -6669,6 +6836,53 @@ static struct root_domain *alloc_rootdomain(void)
6669 return rd; 6836 return rd;
6670} 6837}
6671 6838
6839static void free_sched_groups(struct sched_group *sg, int free_sgp)
6840{
6841 struct sched_group *tmp, *first;
6842
6843 if (!sg)
6844 return;
6845
6846 first = sg;
6847 do {
6848 tmp = sg->next;
6849
6850 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
6851 kfree(sg->sgp);
6852
6853 kfree(sg);
6854 sg = tmp;
6855 } while (sg != first);
6856}
6857
6858static void free_sched_domain(struct rcu_head *rcu)
6859{
6860 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6861
6862 /*
6863 * If its an overlapping domain it has private groups, iterate and
6864 * nuke them all.
6865 */
6866 if (sd->flags & SD_OVERLAP) {
6867 free_sched_groups(sd->groups, 1);
6868 } else if (atomic_dec_and_test(&sd->groups->ref)) {
6869 kfree(sd->groups->sgp);
6870 kfree(sd->groups);
6871 }
6872 kfree(sd);
6873}
6874
6875static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6876{
6877 call_rcu(&sd->rcu, free_sched_domain);
6878}
6879
6880static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6881{
6882 for (; sd; sd = sd->parent)
6883 destroy_sched_domain(sd, cpu);
6884}
6885
6672/* 6886/*
6673 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6887 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6674 * hold the hotplug lock. 6888 * hold the hotplug lock.
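
The new sched_group teardown above combines two lifetime techniques: the shared power structure is reference counted (atomic_dec_and_test() on sg->sgp->ref decides who does the kfree), and the domain itself is only freed after an RCU grace period via call_rcu(). The refcount half can be sketched in C11; the grace-period half is only indicated by a comment, since plain userspace has no call_rcu():

#include <stdatomic.h>
#include <stdlib.h>
#include <stdio.h>

struct group_power {
    atomic_int ref;              /* how many groups share this power struct */
    unsigned int power;
};

struct sched_group {
    struct sched_group *next;    /* circular list, as in the kernel */
    struct group_power *sgp;
};

/* Drop one reference; the last dropper frees the shared object. */
static void put_group_power(struct group_power *sgp)
{
    if (atomic_fetch_sub(&sgp->ref, 1) == 1)
        free(sgp);
}

static void free_sched_groups(struct sched_group *first)
{
    struct sched_group *sg = first;

    if (!sg)
        return;
    do {
        struct sched_group *tmp = sg->next;

        put_group_power(sg->sgp);   /* freed only when the last ref drops */
        free(sg);                   /* the kernel defers this behind RCU */
        sg = tmp;
    } while (sg != first);
}

int main(void)
{
    struct group_power *sgp = malloc(sizeof(*sgp));
    struct sched_group *a = malloc(sizeof(*a));
    struct sched_group *b = malloc(sizeof(*b));

    atomic_init(&sgp->ref, 2);      /* both groups share one power struct */
    sgp->power = 1024;
    a->sgp = b->sgp = sgp;
    a->next = b;
    b->next = a;                    /* close the circular list */

    free_sched_groups(a);
    puts("freed");
    return 0;
}
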
@@ -6679,9 +6893,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6679 struct rq *rq = cpu_rq(cpu); 6893 struct rq *rq = cpu_rq(cpu);
6680 struct sched_domain *tmp; 6894 struct sched_domain *tmp;
6681 6895
6682 for (tmp = sd; tmp; tmp = tmp->parent)
6683 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6684
6685 /* Remove the sched domains which do not contribute to scheduling. */ 6896 /* Remove the sched domains which do not contribute to scheduling. */
6686 for (tmp = sd; tmp; ) { 6897 for (tmp = sd; tmp; ) {
6687 struct sched_domain *parent = tmp->parent; 6898 struct sched_domain *parent = tmp->parent;
@@ -6692,12 +6903,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6692 tmp->parent = parent->parent; 6903 tmp->parent = parent->parent;
6693 if (parent->parent) 6904 if (parent->parent)
6694 parent->parent->child = tmp; 6905 parent->parent->child = tmp;
6906 destroy_sched_domain(parent, cpu);
6695 } else 6907 } else
6696 tmp = tmp->parent; 6908 tmp = tmp->parent;
6697 } 6909 }
6698 6910
6699 if (sd && sd_degenerate(sd)) { 6911 if (sd && sd_degenerate(sd)) {
6912 tmp = sd;
6700 sd = sd->parent; 6913 sd = sd->parent;
6914 destroy_sched_domain(tmp, cpu);
6701 if (sd) 6915 if (sd)
6702 sd->child = NULL; 6916 sd->child = NULL;
6703 } 6917 }
@@ -6705,7 +6919,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6705 sched_domain_debug(sd, cpu); 6919 sched_domain_debug(sd, cpu);
6706 6920
6707 rq_attach_root(rq, rd); 6921 rq_attach_root(rq, rd);
6922 tmp = rq->sd;
6708 rcu_assign_pointer(rq->sd, sd); 6923 rcu_assign_pointer(rq->sd, sd);
6924 destroy_sched_domains(tmp, cpu);
6709} 6925}
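The cpu_attach_domain() changes above pin down the new lifetime rules: degenerate parents are handed to destroy_sched_domain() as they are unlinked, the old rq->sd is saved, the replacement is published with rcu_assign_pointer(), and only then is the old hierarchy queued for destruction. Readers that walk rq->sd under rcu_read_lock() therefore see either the complete old tree or the complete new one. A C11 sketch of that publish-then-retire ordering (a model, not the kernel API; release ordering plays the role of rcu_assign_pointer()):

#include <stdatomic.h>
#include <stddef.h>

struct domain { struct domain *parent; };

static _Atomic(struct domain *) rq_sd;      /* models the RCU-protected rq->sd */

/* Publish the new tree first, hand the old one back for deferred freeing. */
static struct domain *attach_domain(struct domain *new_sd)
{
        struct domain *old = atomic_load_explicit(&rq_sd, memory_order_relaxed);

        /* the new tree must be fully initialized before readers can see it */
        atomic_store_explicit(&rq_sd, new_sd, memory_order_release);

        /* the kernel queues 'old' through destroy_sched_domains()/call_rcu();
         * here it is returned so the caller can retire it after a grace period */
        return old;
}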
6710 6926
6711/* cpus with isolated domains */ 6927/* cpus with isolated domains */
@@ -6721,56 +6937,6 @@ static int __init isolated_cpu_setup(char *str)
6721 6937
6722__setup("isolcpus=", isolated_cpu_setup); 6938__setup("isolcpus=", isolated_cpu_setup);
6723 6939
6724/*
6725 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6726 * to a function which identifies what group(along with sched group) a CPU
6727 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6728 * (due to the fact that we keep track of groups covered with a struct cpumask).
6729 *
6730 * init_sched_build_groups will build a circular linked list of the groups
6731 * covered by the given span, and will set each group's ->cpumask correctly,
6732 * and ->cpu_power to 0.
6733 */
6734static void
6735init_sched_build_groups(const struct cpumask *span,
6736 const struct cpumask *cpu_map,
6737 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6738 struct sched_group **sg,
6739 struct cpumask *tmpmask),
6740 struct cpumask *covered, struct cpumask *tmpmask)
6741{
6742 struct sched_group *first = NULL, *last = NULL;
6743 int i;
6744
6745 cpumask_clear(covered);
6746
6747 for_each_cpu(i, span) {
6748 struct sched_group *sg;
6749 int group = group_fn(i, cpu_map, &sg, tmpmask);
6750 int j;
6751
6752 if (cpumask_test_cpu(i, covered))
6753 continue;
6754
6755 cpumask_clear(sched_group_cpus(sg));
6756 sg->cpu_power = 0;
6757
6758 for_each_cpu(j, span) {
6759 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6760 continue;
6761
6762 cpumask_set_cpu(j, covered);
6763 cpumask_set_cpu(j, sched_group_cpus(sg));
6764 }
6765 if (!first)
6766 first = sg;
6767 if (last)
6768 last->next = sg;
6769 last = sg;
6770 }
6771 last->next = first;
6772}
6773
6774#define SD_NODES_PER_DOMAIN 16 6940#define SD_NODES_PER_DOMAIN 16
6775 6941
6776#ifdef CONFIG_NUMA 6942#ifdef CONFIG_NUMA
@@ -6787,7 +6953,7 @@ init_sched_build_groups(const struct cpumask *span,
6787 */ 6953 */
6788static int find_next_best_node(int node, nodemask_t *used_nodes) 6954static int find_next_best_node(int node, nodemask_t *used_nodes)
6789{ 6955{
6790 int i, n, val, min_val, best_node = 0; 6956 int i, n, val, min_val, best_node = -1;
6791 6957
6792 min_val = INT_MAX; 6958 min_val = INT_MAX;
6793 6959
@@ -6811,7 +6977,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6811 } 6977 }
6812 } 6978 }
6813 6979
6814 node_set(best_node, *used_nodes); 6980 if (best_node != -1)
6981 node_set(best_node, *used_nodes);
6815 return best_node; 6982 return best_node;
6816} 6983}
6817 6984
@@ -6837,315 +7004,197 @@ static void sched_domain_node_span(int node, struct cpumask *span)
6837 7004
6838 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 7005 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6839 int next_node = find_next_best_node(node, &used_nodes); 7006 int next_node = find_next_best_node(node, &used_nodes);
6840 7007 if (next_node < 0)
7008 break;
6841 cpumask_or(span, span, cpumask_of_node(next_node)); 7009 cpumask_or(span, span, cpumask_of_node(next_node));
6842 } 7010 }
6843} 7011}
7012
7013static const struct cpumask *cpu_node_mask(int cpu)
7014{
7015 lockdep_assert_held(&sched_domains_mutex);
7016
7017 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
7018
7019 return sched_domains_tmpmask;
7020}
7021
7022static const struct cpumask *cpu_allnodes_mask(int cpu)
7023{
7024 return cpu_possible_mask;
7025}
6844#endif /* CONFIG_NUMA */ 7026#endif /* CONFIG_NUMA */
6845 7027
6846int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 7028static const struct cpumask *cpu_cpu_mask(int cpu)
7029{
7030 return cpumask_of_node(cpu_to_node(cpu));
7031}
6847 7032
6848/* 7033int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6849 * The cpus mask in sched_group and sched_domain hangs off the end.
6850 *
6851 * ( See the the comments in include/linux/sched.h:struct sched_group
6852 * and struct sched_domain. )
6853 */
6854struct static_sched_group {
6855 struct sched_group sg;
6856 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6857};
6858 7034
6859struct static_sched_domain { 7035struct sd_data {
6860 struct sched_domain sd; 7036 struct sched_domain **__percpu sd;
6861 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 7037 struct sched_group **__percpu sg;
7038 struct sched_group_power **__percpu sgp;
6862}; 7039};
6863 7040
6864struct s_data { 7041struct s_data {
6865#ifdef CONFIG_NUMA 7042 struct sched_domain ** __percpu sd;
6866 int sd_allnodes;
6867 cpumask_var_t domainspan;
6868 cpumask_var_t covered;
6869 cpumask_var_t notcovered;
6870#endif
6871 cpumask_var_t nodemask;
6872 cpumask_var_t this_sibling_map;
6873 cpumask_var_t this_core_map;
6874 cpumask_var_t this_book_map;
6875 cpumask_var_t send_covered;
6876 cpumask_var_t tmpmask;
6877 struct sched_group **sched_group_nodes;
6878 struct root_domain *rd; 7043 struct root_domain *rd;
6879}; 7044};
6880 7045
6881enum s_alloc { 7046enum s_alloc {
6882 sa_sched_groups = 0,
6883 sa_rootdomain, 7047 sa_rootdomain,
6884 sa_tmpmask, 7048 sa_sd,
6885 sa_send_covered, 7049 sa_sd_storage,
6886 sa_this_book_map,
6887 sa_this_core_map,
6888 sa_this_sibling_map,
6889 sa_nodemask,
6890 sa_sched_group_nodes,
6891#ifdef CONFIG_NUMA
6892 sa_notcovered,
6893 sa_covered,
6894 sa_domainspan,
6895#endif
6896 sa_none, 7050 sa_none,
6897}; 7051};
6898 7052
6899/* 7053struct sched_domain_topology_level;
6900 * SMT sched-domains:
6901 */
6902#ifdef CONFIG_SCHED_SMT
6903static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6904static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6905 7054
6906static int 7055typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6907cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 7056typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6908 struct sched_group **sg, struct cpumask *unused)
6909{
6910 if (sg)
6911 *sg = &per_cpu(sched_groups, cpu).sg;
6912 return cpu;
6913}
6914#endif /* CONFIG_SCHED_SMT */
6915 7057
6916/* 7058#define SDTL_OVERLAP 0x01
6917 * multi-core sched-domains: 7059
6918 */ 7060struct sched_domain_topology_level {
6919#ifdef CONFIG_SCHED_MC 7061 sched_domain_init_f init;
6920static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 7062 sched_domain_mask_f mask;
6921static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 7063 int flags;
7064 struct sd_data data;
7065};
6922 7066
6923static int 7067static int
6924cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 7068build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6925 struct sched_group **sg, struct cpumask *mask)
6926{ 7069{
6927 int group; 7070 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
6928#ifdef CONFIG_SCHED_SMT 7071 const struct cpumask *span = sched_domain_span(sd);
6929 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 7072 struct cpumask *covered = sched_domains_tmpmask;
6930 group = cpumask_first(mask); 7073 struct sd_data *sdd = sd->private;
6931#else 7074 struct sched_domain *child;
6932 group = cpu; 7075 int i;
6933#endif
6934 if (sg)
6935 *sg = &per_cpu(sched_group_core, group).sg;
6936 return group;
6937}
6938#endif /* CONFIG_SCHED_MC */
6939 7076
6940/* 7077 cpumask_clear(covered);
6941 * book sched-domains:
6942 */
6943#ifdef CONFIG_SCHED_BOOK
6944static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6945static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6946 7078
6947static int 7079 for_each_cpu(i, span) {
6948cpu_to_book_group(int cpu, const struct cpumask *cpu_map, 7080 struct cpumask *sg_span;
6949 struct sched_group **sg, struct cpumask *mask)
6950{
6951 int group = cpu;
6952#ifdef CONFIG_SCHED_MC
6953 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6954 group = cpumask_first(mask);
6955#elif defined(CONFIG_SCHED_SMT)
6956 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6957 group = cpumask_first(mask);
6958#endif
6959 if (sg)
6960 *sg = &per_cpu(sched_group_book, group).sg;
6961 return group;
6962}
6963#endif /* CONFIG_SCHED_BOOK */
6964 7081
6965static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 7082 if (cpumask_test_cpu(i, covered))
6966static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 7083 continue;
6967 7084
6968static int 7085 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6969cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, 7086 GFP_KERNEL, cpu_to_node(i));
6970 struct sched_group **sg, struct cpumask *mask)
6971{
6972 int group;
6973#ifdef CONFIG_SCHED_BOOK
6974 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6975 group = cpumask_first(mask);
6976#elif defined(CONFIG_SCHED_MC)
6977 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6978 group = cpumask_first(mask);
6979#elif defined(CONFIG_SCHED_SMT)
6980 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6981 group = cpumask_first(mask);
6982#else
6983 group = cpu;
6984#endif
6985 if (sg)
6986 *sg = &per_cpu(sched_group_phys, group).sg;
6987 return group;
6988}
6989 7087
6990#ifdef CONFIG_NUMA 7088 if (!sg)
6991/* 7089 goto fail;
6992 * The init_sched_build_groups can't handle what we want to do with node
6993 * groups, so roll our own. Now each node has its own list of groups which
6994 * gets dynamically allocated.
6995 */
6996static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
6997static struct sched_group ***sched_group_nodes_bycpu;
6998 7090
6999static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); 7091 sg_span = sched_group_cpus(sg);
7000static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
7001 7092
7002static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, 7093 child = *per_cpu_ptr(sdd->sd, i);
7003 struct sched_group **sg, 7094 if (child->child) {
7004 struct cpumask *nodemask) 7095 child = child->child;
7005{ 7096 cpumask_copy(sg_span, sched_domain_span(child));
7006 int group; 7097 } else
7098 cpumask_set_cpu(i, sg_span);
7007 7099
7008 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); 7100 cpumask_or(covered, covered, sg_span);
7009 group = cpumask_first(nodemask);
7010 7101
7011 if (sg) 7102 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
7012 *sg = &per_cpu(sched_group_allnodes, group).sg; 7103 atomic_inc(&sg->sgp->ref);
7013 return group;
7014}
7015 7104
7016static void init_numa_sched_groups_power(struct sched_group *group_head) 7105 if (cpumask_test_cpu(cpu, sg_span))
7017{ 7106 groups = sg;
7018 struct sched_group *sg = group_head;
7019 int j;
7020 7107
7021 if (!sg) 7108 if (!first)
7022 return; 7109 first = sg;
7023 do { 7110 if (last)
7024 for_each_cpu(j, sched_group_cpus(sg)) { 7111 last->next = sg;
7025 struct sched_domain *sd; 7112 last = sg;
7113 last->next = first;
7114 }
7115 sd->groups = groups;
7026 7116
7027 sd = &per_cpu(phys_domains, j).sd; 7117 return 0;
7028 if (j != group_first_cpu(sd->groups)) {
7029 /*
7030 * Only add "power" once for each
7031 * physical package.
7032 */
7033 continue;
7034 }
7035 7118
7036 sg->cpu_power += sd->groups->cpu_power; 7119fail:
7037 } 7120 free_sched_groups(first, 0);
7038 sg = sg->next; 7121
7039 } while (sg != group_head); 7122 return -ENOMEM;
7040} 7123}
7041 7124
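build_overlap_sched_groups() above, like build_sched_groups() further down in this hunk, follows one shape: walk the span, skip CPUs that are already covered, allocate a group for each newly met owner, fold its member CPUs into a covered mask, and keep the group list circular at every step (last->next = first is re-established inside the loop). A user-space model of that loop, with a bitmask standing in for cpumask and owner_of[] standing in for get_group() or the child-domain span:

#include <stdlib.h>

#define NCPUS 8

struct group { struct group *next; unsigned mask; };

static struct group *build_groups(unsigned span, const int owner_of[NCPUS])
{
        struct group *first = NULL, *last = NULL, *sg;
        unsigned covered = 0;

        for (int i = 0; i < NCPUS; i++) {
                if (!(span & (1u << i)) || (covered & (1u << i)))
                        continue;

                sg = calloc(1, sizeof(*sg));
                if (!sg)
                        return NULL;            /* a real version would unwind */

                /* every CPU in the span with the same owner joins this group */
                for (int j = 0; j < NCPUS; j++)
                        if ((span & (1u << j)) && owner_of[j] == owner_of[i])
                                sg->mask |= 1u << j;
                covered |= sg->mask;

                if (!first)
                        first = sg;
                if (last)
                        last->next = sg;
                last = sg;
                last->next = first;             /* list stays circular throughout */
        }
        return first;
}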
7042static int build_numa_sched_groups(struct s_data *d, 7125static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
7043 const struct cpumask *cpu_map, int num)
7044{ 7126{
7045 struct sched_domain *sd; 7127 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
7046 struct sched_group *sg, *prev; 7128 struct sched_domain *child = sd->child;
7047 int n, j;
7048 7129
7049 cpumask_clear(d->covered); 7130 if (child)
7050 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); 7131 cpu = cpumask_first(sched_domain_span(child));
7051 if (cpumask_empty(d->nodemask)) { 7132
7052 d->sched_group_nodes[num] = NULL; 7133 if (sg) {
7053 goto out; 7134 *sg = *per_cpu_ptr(sdd->sg, cpu);
7135 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
7136 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
7054 } 7137 }
7055 7138
7056 sched_domain_node_span(num, d->domainspan); 7139 return cpu;
7057 cpumask_and(d->domainspan, d->domainspan, cpu_map); 7140}
7058 7141
7059 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 7142/*
7060 GFP_KERNEL, num); 7143 * build_sched_groups will build a circular linked list of the groups
7061 if (!sg) { 7144 * covered by the given span, and will set each group's ->cpumask correctly,
7062 printk(KERN_WARNING "Can not alloc domain group for node %d\n", 7145 * and ->cpu_power to 0.
7063 num); 7146 *
7064 return -ENOMEM; 7147 * Assumes the sched_domain tree is fully constructed
7065 } 7148 */
7066 d->sched_group_nodes[num] = sg; 7149static int
7150build_sched_groups(struct sched_domain *sd, int cpu)
7151{
7152 struct sched_group *first = NULL, *last = NULL;
7153 struct sd_data *sdd = sd->private;
7154 const struct cpumask *span = sched_domain_span(sd);
7155 struct cpumask *covered;
7156 int i;
7067 7157
7068 for_each_cpu(j, d->nodemask) { 7158 get_group(cpu, sdd, &sd->groups);
7069 sd = &per_cpu(node_domains, j).sd; 7159 atomic_inc(&sd->groups->ref);
7070 sd->groups = sg;
7071 }
7072 7160
7073 sg->cpu_power = 0; 7161 if (cpu != cpumask_first(sched_domain_span(sd)))
7074 cpumask_copy(sched_group_cpus(sg), d->nodemask); 7162 return 0;
7075 sg->next = sg;
7076 cpumask_or(d->covered, d->covered, d->nodemask);
7077 7163
7078 prev = sg; 7164 lockdep_assert_held(&sched_domains_mutex);
7079 for (j = 0; j < nr_node_ids; j++) { 7165 covered = sched_domains_tmpmask;
7080 n = (num + j) % nr_node_ids;
7081 cpumask_complement(d->notcovered, d->covered);
7082 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
7083 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
7084 if (cpumask_empty(d->tmpmask))
7085 break;
7086 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
7087 if (cpumask_empty(d->tmpmask))
7088 continue;
7089 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7090 GFP_KERNEL, num);
7091 if (!sg) {
7092 printk(KERN_WARNING
7093 "Can not alloc domain group for node %d\n", j);
7094 return -ENOMEM;
7095 }
7096 sg->cpu_power = 0;
7097 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
7098 sg->next = prev->next;
7099 cpumask_or(d->covered, d->covered, d->tmpmask);
7100 prev->next = sg;
7101 prev = sg;
7102 }
7103out:
7104 return 0;
7105}
7106#endif /* CONFIG_NUMA */
7107 7166
7108#ifdef CONFIG_NUMA 7167 cpumask_clear(covered);
7109/* Free memory allocated for various sched_group structures */
7110static void free_sched_groups(const struct cpumask *cpu_map,
7111 struct cpumask *nodemask)
7112{
7113 int cpu, i;
7114 7168
7115 for_each_cpu(cpu, cpu_map) { 7169 for_each_cpu(i, span) {
7116 struct sched_group **sched_group_nodes 7170 struct sched_group *sg;
7117 = sched_group_nodes_bycpu[cpu]; 7171 int group = get_group(i, sdd, &sg);
7172 int j;
7118 7173
7119 if (!sched_group_nodes) 7174 if (cpumask_test_cpu(i, covered))
7120 continue; 7175 continue;
7121 7176
7122 for (i = 0; i < nr_node_ids; i++) { 7177 cpumask_clear(sched_group_cpus(sg));
7123 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7178 sg->sgp->power = 0;
7124 7179
7125 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 7180 for_each_cpu(j, span) {
7126 if (cpumask_empty(nodemask)) 7181 if (get_group(j, sdd, NULL) != group)
7127 continue; 7182 continue;
7128 7183
7129 if (sg == NULL) 7184 cpumask_set_cpu(j, covered);
7130 continue; 7185 cpumask_set_cpu(j, sched_group_cpus(sg));
7131 sg = sg->next;
7132next_sg:
7133 oldsg = sg;
7134 sg = sg->next;
7135 kfree(oldsg);
7136 if (oldsg != sched_group_nodes[i])
7137 goto next_sg;
7138 } 7186 }
7139 kfree(sched_group_nodes); 7187
7140 sched_group_nodes_bycpu[cpu] = NULL; 7188 if (!first)
7189 first = sg;
7190 if (last)
7191 last->next = sg;
7192 last = sg;
7141 } 7193 }
7194 last->next = first;
7195
7196 return 0;
7142} 7197}
7143#else /* !CONFIG_NUMA */
7144static void free_sched_groups(const struct cpumask *cpu_map,
7145 struct cpumask *nodemask)
7146{
7147}
7148#endif /* CONFIG_NUMA */
7149 7198
7150/* 7199/*
7151 * Initialize sched groups cpu_power. 7200 * Initialize sched groups cpu_power.
@@ -7159,48 +7208,19 @@ static void free_sched_groups(const struct cpumask *cpu_map,
7159 */ 7208 */
7160static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7209static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7161{ 7210{
7162 struct sched_domain *child; 7211 struct sched_group *sg = sd->groups;
7163 struct sched_group *group;
7164 long power;
7165 int weight;
7166 7212
7167 WARN_ON(!sd || !sd->groups); 7213 WARN_ON(!sd || !sg);
7168
7169 if (cpu != group_first_cpu(sd->groups))
7170 return;
7171 7214
7172 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7215 do {
7173 7216 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
7174 child = sd->child; 7217 sg = sg->next;
7175 7218 } while (sg != sd->groups);
7176 sd->groups->cpu_power = 0;
7177 7219
7178 if (!child) { 7220 if (cpu != group_first_cpu(sg))
7179 power = SCHED_LOAD_SCALE;
7180 weight = cpumask_weight(sched_domain_span(sd));
7181 /*
7182 * SMT siblings share the power of a single core.
7183 * Usually multiple threads get a better yield out of
7184 * that one core than a single thread would have,
7185 * reflect that in sd->smt_gain.
7186 */
7187 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
7188 power *= sd->smt_gain;
7189 power /= weight;
7190 power >>= SCHED_LOAD_SHIFT;
7191 }
7192 sd->groups->cpu_power += power;
7193 return; 7221 return;
7194 }
7195 7222
7196 /* 7223 update_group_power(sd, cpu);
7197 * Add cpu_power of each child group to this groups cpu_power.
7198 */
7199 group = child->groups;
7200 do {
7201 sd->groups->cpu_power += group->cpu_power;
7202 group = group->next;
7203 } while (group != child->groups);
7204} 7224}
7205 7225
7206/* 7226/*
@@ -7214,15 +7234,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7214# define SD_INIT_NAME(sd, type) do { } while (0) 7234# define SD_INIT_NAME(sd, type) do { } while (0)
7215#endif 7235#endif
7216 7236
7217#define SD_INIT(sd, type) sd_init_##type(sd) 7237#define SD_INIT_FUNC(type) \
7218 7238static noinline struct sched_domain * \
7219#define SD_INIT_FUNC(type) \ 7239sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
7220static noinline void sd_init_##type(struct sched_domain *sd) \ 7240{ \
7221{ \ 7241 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
7222 memset(sd, 0, sizeof(*sd)); \ 7242 *sd = SD_##type##_INIT; \
7223 *sd = SD_##type##_INIT; \ 7243 SD_INIT_NAME(sd, type); \
7224 sd->level = SD_LV_##type; \ 7244 sd->private = &tl->data; \
7225 SD_INIT_NAME(sd, type); \ 7245 return sd; \
7226} 7246}
7227 7247
7228SD_INIT_FUNC(CPU) 7248SD_INIT_FUNC(CPU)
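SD_INIT_FUNC() now stamps out one initializer per topology level: each generated sd_init_*() looks up the pre-allocated per-cpu sched_domain from the level's sd_data, copies in the level's SD_*_INIT template and points sd->private back at that sd_data. A small user-space model of the same token-pasting pattern; the struct layout and flag values here are invented for illustration:

#include <stdio.h>
#include <string.h>

struct domain { char name[16]; int flags; };

#define LEVEL_INIT_FUNC(type, init_flags)                       \
static struct domain *init_##type(struct domain *slot)         \
{                                                               \
        slot->flags = (init_flags);                             \
        strncpy(slot->name, #type, sizeof(slot->name) - 1);     \
        return slot;                                            \
}

LEVEL_INIT_FUNC(SIBLING, 0x1)   /* models SD_INIT_FUNC(SIBLING) */
LEVEL_INIT_FUNC(CPU,     0x2)   /* models SD_INIT_FUNC(CPU)     */

int main(void)
{
        struct domain smt = { "", 0 }, cpu = { "", 0 };

        printf("%s flags=%#x\n", init_SIBLING(&smt)->name, smt.flags);
        printf("%s flags=%#x\n", init_CPU(&cpu)->name, cpu.flags);
        return 0;
}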
@@ -7241,13 +7261,14 @@ SD_INIT_FUNC(CPU)
7241#endif 7261#endif
7242 7262
7243static int default_relax_domain_level = -1; 7263static int default_relax_domain_level = -1;
7264int sched_domain_level_max;
7244 7265
7245static int __init setup_relax_domain_level(char *str) 7266static int __init setup_relax_domain_level(char *str)
7246{ 7267{
7247 unsigned long val; 7268 unsigned long val;
7248 7269
7249 val = simple_strtoul(str, NULL, 0); 7270 val = simple_strtoul(str, NULL, 0);
7250 if (val < SD_LV_MAX) 7271 if (val < sched_domain_level_max)
7251 default_relax_domain_level = val; 7272 default_relax_domain_level = val;
7252 7273
7253 return 1; 7274 return 1;
@@ -7275,37 +7296,20 @@ static void set_domain_attribute(struct sched_domain *sd,
7275 } 7296 }
7276} 7297}
7277 7298
7299static void __sdt_free(const struct cpumask *cpu_map);
7300static int __sdt_alloc(const struct cpumask *cpu_map);
7301
7278static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7302static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7279 const struct cpumask *cpu_map) 7303 const struct cpumask *cpu_map)
7280{ 7304{
7281 switch (what) { 7305 switch (what) {
7282 case sa_sched_groups:
7283 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
7284 d->sched_group_nodes = NULL;
7285 case sa_rootdomain: 7306 case sa_rootdomain:
7286 free_rootdomain(d->rd); /* fall through */ 7307 if (!atomic_read(&d->rd->refcount))
7287 case sa_tmpmask: 7308 free_rootdomain(&d->rd->rcu); /* fall through */
7288 free_cpumask_var(d->tmpmask); /* fall through */ 7309 case sa_sd:
7289 case sa_send_covered: 7310 free_percpu(d->sd); /* fall through */
7290 free_cpumask_var(d->send_covered); /* fall through */ 7311 case sa_sd_storage:
7291 case sa_this_book_map: 7312 __sdt_free(cpu_map); /* fall through */
7292 free_cpumask_var(d->this_book_map); /* fall through */
7293 case sa_this_core_map:
7294 free_cpumask_var(d->this_core_map); /* fall through */
7295 case sa_this_sibling_map:
7296 free_cpumask_var(d->this_sibling_map); /* fall through */
7297 case sa_nodemask:
7298 free_cpumask_var(d->nodemask); /* fall through */
7299 case sa_sched_group_nodes:
7300#ifdef CONFIG_NUMA
7301 kfree(d->sched_group_nodes); /* fall through */
7302 case sa_notcovered:
7303 free_cpumask_var(d->notcovered); /* fall through */
7304 case sa_covered:
7305 free_cpumask_var(d->covered); /* fall through */
7306 case sa_domainspan:
7307 free_cpumask_var(d->domainspan); /* fall through */
7308#endif
7309 case sa_none: 7313 case sa_none:
7310 break; 7314 break;
7311 } 7315 }
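__free_domain_allocs() keeps its staged-unwind shape, only with far fewer stages: the switch is entered at the last stage that was reached, and every case falls through, so each label undoes exactly one allocation step plus everything below it. A minimal model of the idiom; the stage names and members are illustrative:

#include <stdlib.h>

enum alloc_stage { st_none, st_storage, st_sd, st_rootdomain };

struct build_state { void *storage, *sd, *rd; };

static void unwind(struct build_state *s, enum alloc_stage reached)
{
        switch (reached) {
        case st_rootdomain:
                free(s->rd);            /* fall through */
        case st_sd:
                free(s->sd);            /* fall through */
        case st_storage:
                free(s->storage);       /* fall through */
        case st_none:
                break;
        }
}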
@@ -7314,308 +7318,233 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7314static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7318static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7315 const struct cpumask *cpu_map) 7319 const struct cpumask *cpu_map)
7316{ 7320{
7317#ifdef CONFIG_NUMA 7321 memset(d, 0, sizeof(*d));
7318 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 7322
7319 return sa_none; 7323 if (__sdt_alloc(cpu_map))
7320 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 7324 return sa_sd_storage;
7321 return sa_domainspan; 7325 d->sd = alloc_percpu(struct sched_domain *);
7322 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 7326 if (!d->sd)
7323 return sa_covered; 7327 return sa_sd_storage;
7324 /* Allocate the per-node list of sched groups */
7325 d->sched_group_nodes = kcalloc(nr_node_ids,
7326 sizeof(struct sched_group *), GFP_KERNEL);
7327 if (!d->sched_group_nodes) {
7328 printk(KERN_WARNING "Can not alloc sched group node list\n");
7329 return sa_notcovered;
7330 }
7331 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
7332#endif
7333 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
7334 return sa_sched_group_nodes;
7335 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
7336 return sa_nodemask;
7337 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
7338 return sa_this_sibling_map;
7339 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
7340 return sa_this_core_map;
7341 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7342 return sa_this_book_map;
7343 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
7344 return sa_send_covered;
7345 d->rd = alloc_rootdomain(); 7328 d->rd = alloc_rootdomain();
7346 if (!d->rd) { 7329 if (!d->rd)
7347 printk(KERN_WARNING "Cannot alloc root domain\n"); 7330 return sa_sd;
7348 return sa_tmpmask;
7349 }
7350 return sa_rootdomain; 7331 return sa_rootdomain;
7351} 7332}
7352 7333
7353static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 7334/*
7354 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 7335 * NULL the sd_data elements we've used to build the sched_domain and
7336 * sched_group structure so that the subsequent __free_domain_allocs()
7337 * will not free the data we're using.
7338 */
7339static void claim_allocations(int cpu, struct sched_domain *sd)
7355{ 7340{
7356 struct sched_domain *sd = NULL; 7341 struct sd_data *sdd = sd->private;
7357#ifdef CONFIG_NUMA
7358 struct sched_domain *parent;
7359
7360 d->sd_allnodes = 0;
7361 if (cpumask_weight(cpu_map) >
7362 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
7363 sd = &per_cpu(allnodes_domains, i).sd;
7364 SD_INIT(sd, ALLNODES);
7365 set_domain_attribute(sd, attr);
7366 cpumask_copy(sched_domain_span(sd), cpu_map);
7367 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
7368 d->sd_allnodes = 1;
7369 }
7370 parent = sd;
7371
7372 sd = &per_cpu(node_domains, i).sd;
7373 SD_INIT(sd, NODE);
7374 set_domain_attribute(sd, attr);
7375 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7376 sd->parent = parent;
7377 if (parent)
7378 parent->child = sd;
7379 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
7380#endif
7381 return sd;
7382}
7383 7342
7384static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 7343 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7385 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7344 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7386 struct sched_domain *parent, int i)
7387{
7388 struct sched_domain *sd;
7389 sd = &per_cpu(phys_domains, i).sd;
7390 SD_INIT(sd, CPU);
7391 set_domain_attribute(sd, attr);
7392 cpumask_copy(sched_domain_span(sd), d->nodemask);
7393 sd->parent = parent;
7394 if (parent)
7395 parent->child = sd;
7396 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
7397 return sd;
7398}
7399 7345
7400static struct sched_domain *__build_book_sched_domain(struct s_data *d, 7346 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
7401 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7347 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7402 struct sched_domain *parent, int i)
7403{
7404 struct sched_domain *sd = parent;
7405#ifdef CONFIG_SCHED_BOOK
7406 sd = &per_cpu(book_domains, i).sd;
7407 SD_INIT(sd, BOOK);
7408 set_domain_attribute(sd, attr);
7409 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7410 sd->parent = parent;
7411 parent->child = sd;
7412 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7413#endif
7414 return sd;
7415}
7416 7348
7417static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7349 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
7418 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7350 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
7419 struct sched_domain *parent, int i)
7420{
7421 struct sched_domain *sd = parent;
7422#ifdef CONFIG_SCHED_MC
7423 sd = &per_cpu(core_domains, i).sd;
7424 SD_INIT(sd, MC);
7425 set_domain_attribute(sd, attr);
7426 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
7427 sd->parent = parent;
7428 parent->child = sd;
7429 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
7430#endif
7431 return sd;
7432} 7351}
7433 7352
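claim_allocations() above is the other half of the pre-allocation scheme: once a per-cpu sched_domain, sched_group or sched_group_power has been linked into the live tree (for groups and group-power, a non-zero refcount shows a holder), its slot in sd_data is NULLed so the later __sdt_free() sweep only releases the leftovers. In user-space terms, roughly (the slot layout is illustrative):

#include <stdlib.h>

#define NSLOTS 4

/* The builder claims a slot by taking ownership and clearing it. */
static void *claim(void *slots[NSLOTS], int i)
{
        void *obj = slots[i];

        slots[i] = NULL;                /* now owned by the live tree */
        return obj;
}

/* Generic teardown frees whatever was never claimed. */
static void teardown(void *slots[NSLOTS])
{
        for (int i = 0; i < NSLOTS; i++)
                free(slots[i]);         /* free(NULL) is a no-op */
}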
7434static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
7435 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7436 struct sched_domain *parent, int i)
7437{
7438 struct sched_domain *sd = parent;
7439#ifdef CONFIG_SCHED_SMT 7353#ifdef CONFIG_SCHED_SMT
7440 sd = &per_cpu(cpu_domains, i).sd; 7354static const struct cpumask *cpu_smt_mask(int cpu)
7441 SD_INIT(sd, SIBLING); 7355{
7442 set_domain_attribute(sd, attr); 7356 return topology_thread_cpumask(cpu);
7443 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
7444 sd->parent = parent;
7445 parent->child = sd;
7446 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
7447#endif
7448 return sd;
7449} 7357}
7358#endif
7450 7359
7451static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 7360/*
7452 const struct cpumask *cpu_map, int cpu) 7361 * Topology list, bottom-up.
7453{ 7362 */
7454 switch (l) { 7363static struct sched_domain_topology_level default_topology[] = {
7455#ifdef CONFIG_SCHED_SMT 7364#ifdef CONFIG_SCHED_SMT
7456 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 7365 { sd_init_SIBLING, cpu_smt_mask, },
7457 cpumask_and(d->this_sibling_map, cpu_map,
7458 topology_thread_cpumask(cpu));
7459 if (cpu == cpumask_first(d->this_sibling_map))
7460 init_sched_build_groups(d->this_sibling_map, cpu_map,
7461 &cpu_to_cpu_group,
7462 d->send_covered, d->tmpmask);
7463 break;
7464#endif 7366#endif
7465#ifdef CONFIG_SCHED_MC 7367#ifdef CONFIG_SCHED_MC
7466 case SD_LV_MC: /* set up multi-core groups */ 7368 { sd_init_MC, cpu_coregroup_mask, },
7467 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
7468 if (cpu == cpumask_first(d->this_core_map))
7469 init_sched_build_groups(d->this_core_map, cpu_map,
7470 &cpu_to_core_group,
7471 d->send_covered, d->tmpmask);
7472 break;
7473#endif 7369#endif
7474#ifdef CONFIG_SCHED_BOOK 7370#ifdef CONFIG_SCHED_BOOK
7475 case SD_LV_BOOK: /* set up book groups */ 7371 { sd_init_BOOK, cpu_book_mask, },
7476 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7477 if (cpu == cpumask_first(d->this_book_map))
7478 init_sched_build_groups(d->this_book_map, cpu_map,
7479 &cpu_to_book_group,
7480 d->send_covered, d->tmpmask);
7481 break;
7482#endif 7372#endif
7483 case SD_LV_CPU: /* set up physical groups */ 7373 { sd_init_CPU, cpu_cpu_mask, },
7484 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7485 if (!cpumask_empty(d->nodemask))
7486 init_sched_build_groups(d->nodemask, cpu_map,
7487 &cpu_to_phys_group,
7488 d->send_covered, d->tmpmask);
7489 break;
7490#ifdef CONFIG_NUMA 7374#ifdef CONFIG_NUMA
7491 case SD_LV_ALLNODES: 7375 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
7492 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7376 { sd_init_ALLNODES, cpu_allnodes_mask, },
7493 d->send_covered, d->tmpmask);
7494 break;
7495#endif 7377#endif
7496 default: 7378 { NULL, },
7497 break; 7379};
7380
7381static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7382
7383static int __sdt_alloc(const struct cpumask *cpu_map)
7384{
7385 struct sched_domain_topology_level *tl;
7386 int j;
7387
7388 for (tl = sched_domain_topology; tl->init; tl++) {
7389 struct sd_data *sdd = &tl->data;
7390
7391 sdd->sd = alloc_percpu(struct sched_domain *);
7392 if (!sdd->sd)
7393 return -ENOMEM;
7394
7395 sdd->sg = alloc_percpu(struct sched_group *);
7396 if (!sdd->sg)
7397 return -ENOMEM;
7398
7399 sdd->sgp = alloc_percpu(struct sched_group_power *);
7400 if (!sdd->sgp)
7401 return -ENOMEM;
7402
7403 for_each_cpu(j, cpu_map) {
7404 struct sched_domain *sd;
7405 struct sched_group *sg;
7406 struct sched_group_power *sgp;
7407
7408 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7409 GFP_KERNEL, cpu_to_node(j));
7410 if (!sd)
7411 return -ENOMEM;
7412
7413 *per_cpu_ptr(sdd->sd, j) = sd;
7414
7415 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7416 GFP_KERNEL, cpu_to_node(j));
7417 if (!sg)
7418 return -ENOMEM;
7419
7420 *per_cpu_ptr(sdd->sg, j) = sg;
7421
7422 sgp = kzalloc_node(sizeof(struct sched_group_power),
7423 GFP_KERNEL, cpu_to_node(j));
7424 if (!sgp)
7425 return -ENOMEM;
7426
7427 *per_cpu_ptr(sdd->sgp, j) = sgp;
7428 }
7429 }
7430
7431 return 0;
7432}
7433
7434static void __sdt_free(const struct cpumask *cpu_map)
7435{
7436 struct sched_domain_topology_level *tl;
7437 int j;
7438
7439 for (tl = sched_domain_topology; tl->init; tl++) {
7440 struct sd_data *sdd = &tl->data;
7441
7442 for_each_cpu(j, cpu_map) {
7443 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
7444 if (sd && (sd->flags & SD_OVERLAP))
7445 free_sched_groups(sd->groups, 0);
7446 kfree(*per_cpu_ptr(sdd->sg, j));
7447 kfree(*per_cpu_ptr(sdd->sgp, j));
7448 }
7449 free_percpu(sdd->sd);
7450 free_percpu(sdd->sg);
7451 free_percpu(sdd->sgp);
7498 } 7452 }
7499} 7453}
7500 7454
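__sdt_alloc() and __sdt_free() pre-allocate one sched_domain, sched_group and sched_group_power per (topology level, CPU) pair, so the per-CPU build loop never allocates and teardown can sweep the same grid no matter how far construction got. A condensed user-space model with a fixed grid and a single slot kind instead of three:

#include <stdlib.h>

#define NLEVELS 3
#define NCPUS   8

static void *slot[NLEVELS][NCPUS];

static int storage_alloc(size_t size)
{
        for (int l = 0; l < NLEVELS; l++)
                for (int c = 0; c < NCPUS; c++)
                        if (!(slot[l][c] = calloc(1, size)))
                                return -1;      /* caller unwinds via storage_free() */
        return 0;
}

static void storage_free(void)
{
        for (int l = 0; l < NLEVELS; l++)
                for (int c = 0; c < NCPUS; c++) {
                        free(slot[l][c]);       /* safe after a partial alloc */
                        slot[l][c] = NULL;
                }
}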
7455struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7456 struct s_data *d, const struct cpumask *cpu_map,
7457 struct sched_domain_attr *attr, struct sched_domain *child,
7458 int cpu)
7459{
7460 struct sched_domain *sd = tl->init(tl, cpu);
7461 if (!sd)
7462 return child;
7463
7464 set_domain_attribute(sd, attr);
7465 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7466 if (child) {
7467 sd->level = child->level + 1;
7468 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7469 child->parent = sd;
7470 }
7471 sd->child = child;
7472
7473 return sd;
7474}
7475
7501/* 7476/*
7502 * Build sched domains for a given set of cpus and attach the sched domains 7477 * Build sched domains for a given set of cpus and attach the sched domains
7503 * to the individual cpus 7478 * to the individual cpus
7504 */ 7479 */
7505static int __build_sched_domains(const struct cpumask *cpu_map, 7480static int build_sched_domains(const struct cpumask *cpu_map,
7506 struct sched_domain_attr *attr) 7481 struct sched_domain_attr *attr)
7507{ 7482{
7508 enum s_alloc alloc_state = sa_none; 7483 enum s_alloc alloc_state = sa_none;
7509 struct s_data d;
7510 struct sched_domain *sd; 7484 struct sched_domain *sd;
7511 int i; 7485 struct s_data d;
7512#ifdef CONFIG_NUMA 7486 int i, ret = -ENOMEM;
7513 d.sd_allnodes = 0;
7514#endif
7515 7487
7516 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7488 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7517 if (alloc_state != sa_rootdomain) 7489 if (alloc_state != sa_rootdomain)
7518 goto error; 7490 goto error;
7519 alloc_state = sa_sched_groups;
7520 7491
7521 /* 7492 /* Set up domains for cpus specified by the cpu_map. */
7522 * Set up domains for cpus specified by the cpu_map.
7523 */
7524 for_each_cpu(i, cpu_map) { 7493 for_each_cpu(i, cpu_map) {
7525 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), 7494 struct sched_domain_topology_level *tl;
7526 cpu_map); 7495
7496 sd = NULL;
7497 for (tl = sched_domain_topology; tl->init; tl++) {
7498 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7499 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
7500 sd->flags |= SD_OVERLAP;
7501 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
7502 break;
7503 }
7527 7504
7528 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7505 while (sd->child)
7529 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7506 sd = sd->child;
7530 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
7531 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7532 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7533 }
7534 7507
7535 for_each_cpu(i, cpu_map) { 7508 *per_cpu_ptr(d.sd, i) = sd;
7536 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7537 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7538 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7539 } 7509 }
7540 7510
7541 /* Set up physical groups */ 7511 /* Build the groups for the domains */
7542 for (i = 0; i < nr_node_ids; i++)
7543 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
7544
7545#ifdef CONFIG_NUMA
7546 /* Set up node groups */
7547 if (d.sd_allnodes)
7548 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7549
7550 for (i = 0; i < nr_node_ids; i++)
7551 if (build_numa_sched_groups(&d, cpu_map, i))
7552 goto error;
7553#endif
7554
7555 /* Calculate CPU power for physical packages and nodes */
7556#ifdef CONFIG_SCHED_SMT
7557 for_each_cpu(i, cpu_map) { 7512 for_each_cpu(i, cpu_map) {
7558 sd = &per_cpu(cpu_domains, i).sd; 7513 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7559 init_sched_groups_power(i, sd); 7514 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7560 } 7515 if (sd->flags & SD_OVERLAP) {
7561#endif 7516 if (build_overlap_sched_groups(sd, i))
7562#ifdef CONFIG_SCHED_MC 7517 goto error;
7563 for_each_cpu(i, cpu_map) { 7518 } else {
7564 sd = &per_cpu(core_domains, i).sd; 7519 if (build_sched_groups(sd, i))
7565 init_sched_groups_power(i, sd); 7520 goto error;
7566 } 7521 }
7567#endif 7522 }
7568#ifdef CONFIG_SCHED_BOOK
7569 for_each_cpu(i, cpu_map) {
7570 sd = &per_cpu(book_domains, i).sd;
7571 init_sched_groups_power(i, sd);
7572 }
7573#endif
7574
7575 for_each_cpu(i, cpu_map) {
7576 sd = &per_cpu(phys_domains, i).sd;
7577 init_sched_groups_power(i, sd);
7578 } 7523 }
7579 7524
7580#ifdef CONFIG_NUMA 7525 /* Calculate CPU power for physical packages and nodes */
7581 for (i = 0; i < nr_node_ids; i++) 7526 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7582 init_numa_sched_groups_power(d.sched_group_nodes[i]); 7527 if (!cpumask_test_cpu(i, cpu_map))
7583 7528 continue;
7584 if (d.sd_allnodes) {
7585 struct sched_group *sg;
7586 7529
7587 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7530 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7588 d.tmpmask); 7531 claim_allocations(i, sd);
7589 init_numa_sched_groups_power(sg); 7532 init_sched_groups_power(i, sd);
7533 }
7590 } 7534 }
7591#endif
7592 7535
7593 /* Attach the domains */ 7536 /* Attach the domains */
7537 rcu_read_lock();
7594 for_each_cpu(i, cpu_map) { 7538 for_each_cpu(i, cpu_map) {
7595#ifdef CONFIG_SCHED_SMT 7539 sd = *per_cpu_ptr(d.sd, i);
7596 sd = &per_cpu(cpu_domains, i).sd;
7597#elif defined(CONFIG_SCHED_MC)
7598 sd = &per_cpu(core_domains, i).sd;
7599#elif defined(CONFIG_SCHED_BOOK)
7600 sd = &per_cpu(book_domains, i).sd;
7601#else
7602 sd = &per_cpu(phys_domains, i).sd;
7603#endif
7604 cpu_attach_domain(sd, d.rd, i); 7540 cpu_attach_domain(sd, d.rd, i);
7605 } 7541 }
7542 rcu_read_unlock();
7606 7543
7607 d.sched_group_nodes = NULL; /* don't free this we still need it */ 7544 ret = 0;
7608 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7609 return 0;
7610
7611error: 7545error:
7612 __free_domain_allocs(&d, alloc_state, cpu_map); 7546 __free_domain_allocs(&d, alloc_state, cpu_map);
7613 return -ENOMEM; 7547 return ret;
7614}
7615
7616static int build_sched_domains(const struct cpumask *cpu_map)
7617{
7618 return __build_sched_domains(cpu_map, NULL);
7619} 7548}
7620 7549
7621static cpumask_var_t *doms_cur; /* current sched domains */ 7550static cpumask_var_t *doms_cur; /* current sched domains */
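The rewritten build_sched_domains() also funnels success and failure through a single exit: ret starts out as -ENOMEM, is set to 0 only once every CPU has its domain attached, and the error label releases the per-build temporaries either way. A tiny model of that shape; the helpers and failure knob are invented:

#include <stdlib.h>

static void *scratch;                   /* models the struct s_data temporaries */

static int build(int fail_step)
{
        int ret = -1;                   /* -ENOMEM in the kernel */

        scratch = malloc(64);
        if (!scratch || fail_step == 1)
                goto error;
        if (fail_step == 2)             /* e.g. group construction failed */
                goto error;

        ret = 0;                        /* success still runs the cleanup below */
error:
        free(scratch);                  /* per-build scratch is always released */
        scratch = NULL;
        return ret;
}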
@@ -7670,7 +7599,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7670 * For now this just excludes isolated cpus, but could be used to 7599 * For now this just excludes isolated cpus, but could be used to
7671 * exclude other special cases in the future. 7600 * exclude other special cases in the future.
7672 */ 7601 */
7673static int arch_init_sched_domains(const struct cpumask *cpu_map) 7602static int init_sched_domains(const struct cpumask *cpu_map)
7674{ 7603{
7675 int err; 7604 int err;
7676 7605
@@ -7681,32 +7610,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
7681 doms_cur = &fallback_doms; 7610 doms_cur = &fallback_doms;
7682 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7611 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7683 dattr_cur = NULL; 7612 dattr_cur = NULL;
7684 err = build_sched_domains(doms_cur[0]); 7613 err = build_sched_domains(doms_cur[0], NULL);
7685 register_sched_domain_sysctl(); 7614 register_sched_domain_sysctl();
7686 7615
7687 return err; 7616 return err;
7688} 7617}
7689 7618
7690static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7691 struct cpumask *tmpmask)
7692{
7693 free_sched_groups(cpu_map, tmpmask);
7694}
7695
7696/* 7619/*
7697 * Detach sched domains from a group of cpus specified in cpu_map 7620 * Detach sched domains from a group of cpus specified in cpu_map
7698 * These cpus will now be attached to the NULL domain 7621 * These cpus will now be attached to the NULL domain
7699 */ 7622 */
7700static void detach_destroy_domains(const struct cpumask *cpu_map) 7623static void detach_destroy_domains(const struct cpumask *cpu_map)
7701{ 7624{
7702 /* Save because hotplug lock held. */
7703 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7704 int i; 7625 int i;
7705 7626
7627 rcu_read_lock();
7706 for_each_cpu(i, cpu_map) 7628 for_each_cpu(i, cpu_map)
7707 cpu_attach_domain(NULL, &def_root_domain, i); 7629 cpu_attach_domain(NULL, &def_root_domain, i);
7708 synchronize_sched(); 7630 rcu_read_unlock();
7709 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7710} 7631}
7711 7632
7712/* handle null as "default" */ 7633/* handle null as "default" */
@@ -7795,8 +7716,7 @@ match1:
7795 goto match2; 7716 goto match2;
7796 } 7717 }
7797 /* no match - add a new doms_new */ 7718 /* no match - add a new doms_new */
7798 __build_sched_domains(doms_new[i], 7719 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7799 dattr_new ? dattr_new + i : NULL);
7800match2: 7720match2:
7801 ; 7721 ;
7802 } 7722 }
@@ -7815,7 +7735,7 @@ match2:
7815} 7735}
7816 7736
7817#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7737#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7818static void arch_reinit_sched_domains(void) 7738static void reinit_sched_domains(void)
7819{ 7739{
7820 get_online_cpus(); 7740 get_online_cpus();
7821 7741
@@ -7848,7 +7768,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7848 else 7768 else
7849 sched_mc_power_savings = level; 7769 sched_mc_power_savings = level;
7850 7770
7851 arch_reinit_sched_domains(); 7771 reinit_sched_domains();
7852 7772
7853 return count; 7773 return count;
7854} 7774}
@@ -7967,14 +7887,9 @@ void __init sched_init_smp(void)
7967 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7887 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7968 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7888 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7969 7889
7970#if defined(CONFIG_NUMA)
7971 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7972 GFP_KERNEL);
7973 BUG_ON(sched_group_nodes_bycpu == NULL);
7974#endif
7975 get_online_cpus(); 7890 get_online_cpus();
7976 mutex_lock(&sched_domains_mutex); 7891 mutex_lock(&sched_domains_mutex);
7977 arch_init_sched_domains(cpu_active_mask); 7892 init_sched_domains(cpu_active_mask);
7978 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7893 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7979 if (cpumask_empty(non_isolated_cpus)) 7894 if (cpumask_empty(non_isolated_cpus))
7980 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7895 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -8013,18 +7928,14 @@ int in_sched_functions(unsigned long addr)
8013 && addr < (unsigned long)__sched_text_end); 7928 && addr < (unsigned long)__sched_text_end);
8014} 7929}
8015 7930
8016static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 7931static void init_cfs_rq(struct cfs_rq *cfs_rq)
8017{ 7932{
8018 cfs_rq->tasks_timeline = RB_ROOT; 7933 cfs_rq->tasks_timeline = RB_ROOT;
8019 INIT_LIST_HEAD(&cfs_rq->tasks); 7934 INIT_LIST_HEAD(&cfs_rq->tasks);
8020#ifdef CONFIG_FAIR_GROUP_SCHED
8021 cfs_rq->rq = rq;
8022 /* allow initial update_cfs_load() to truncate */
8023#ifdef CONFIG_SMP
8024 cfs_rq->load_stamp = 1;
8025#endif
8026#endif
8027 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7935 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7936#ifndef CONFIG_64BIT
7937 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
7938#endif
8028} 7939}
8029 7940
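init_cfs_rq() now mirrors min_vruntime into min_vruntime_copy on 32-bit builds. Elsewhere in this series (sched_fair.c) the writer updates min_vruntime and then, after a write barrier, the copy, while lockless readers load the copy first, then the value, and retry until the two agree, so a torn 64-bit read is never used. The shape of that protocol, modeled with C11 atomics (the kernel uses plain u64 fields plus smp_wmb()/smp_rmb(); only the ordering is the point here):

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t value;        /* models min_vruntime, written under the rq lock */
static _Atomic uint64_t value_copy;   /* models min_vruntime_copy, for lockless readers */

static void writer_update(uint64_t v)
{
        atomic_store_explicit(&value, v, memory_order_relaxed);
        /* smp_wmb() in the kernel: the value is visible before the copy */
        atomic_store_explicit(&value_copy, v, memory_order_release);
}

static uint64_t reader_read(void)
{
        uint64_t copy, v;

        do {
                copy = atomic_load_explicit(&value_copy, memory_order_acquire);
                /* smp_rmb() in the kernel sits between these two loads */
                v = atomic_load_explicit(&value, memory_order_relaxed);
        } while (v != copy);          /* a mismatch means the writer raced us */

        return v;
}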
8030static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 7941static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
@@ -8040,27 +7951,18 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8040 /* delimiter for bitsearch: */ 7951 /* delimiter for bitsearch: */
8041 __set_bit(MAX_RT_PRIO, array->bitmap); 7952 __set_bit(MAX_RT_PRIO, array->bitmap);
8042 7953
8043#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 7954#if defined CONFIG_SMP
8044 rt_rq->highest_prio.curr = MAX_RT_PRIO; 7955 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8045#ifdef CONFIG_SMP
8046 rt_rq->highest_prio.next = MAX_RT_PRIO; 7956 rt_rq->highest_prio.next = MAX_RT_PRIO;
8047#endif
8048#endif
8049#ifdef CONFIG_SMP
8050 rt_rq->rt_nr_migratory = 0; 7957 rt_rq->rt_nr_migratory = 0;
8051 rt_rq->overloaded = 0; 7958 rt_rq->overloaded = 0;
8052 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); 7959 plist_head_init(&rt_rq->pushable_tasks);
8053#endif 7960#endif
8054 7961
8055 rt_rq->rt_time = 0; 7962 rt_rq->rt_time = 0;
8056 rt_rq->rt_throttled = 0; 7963 rt_rq->rt_throttled = 0;
8057 rt_rq->rt_runtime = 0; 7964 rt_rq->rt_runtime = 0;
8058 raw_spin_lock_init(&rt_rq->rt_runtime_lock); 7965 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
8059
8060#ifdef CONFIG_RT_GROUP_SCHED
8061 rt_rq->rt_nr_boosted = 0;
8062 rt_rq->rq = rq;
8063#endif
8064} 7966}
8065 7967
8066#ifdef CONFIG_FAIR_GROUP_SCHED 7968#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8069,11 +7971,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8069 struct sched_entity *parent) 7971 struct sched_entity *parent)
8070{ 7972{
8071 struct rq *rq = cpu_rq(cpu); 7973 struct rq *rq = cpu_rq(cpu);
8072 tg->cfs_rq[cpu] = cfs_rq; 7974
8073 init_cfs_rq(cfs_rq, rq);
8074 cfs_rq->tg = tg; 7975 cfs_rq->tg = tg;
7976 cfs_rq->rq = rq;
7977#ifdef CONFIG_SMP
7978 /* allow initial update_cfs_load() to truncate */
7979 cfs_rq->load_stamp = 1;
7980#endif
8075 7981
7982 tg->cfs_rq[cpu] = cfs_rq;
8076 tg->se[cpu] = se; 7983 tg->se[cpu] = se;
7984
8077 /* se could be NULL for root_task_group */ 7985 /* se could be NULL for root_task_group */
8078 if (!se) 7986 if (!se)
8079 return; 7987 return;
@@ -8096,12 +8004,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8096{ 8004{
8097 struct rq *rq = cpu_rq(cpu); 8005 struct rq *rq = cpu_rq(cpu);
8098 8006
8099 tg->rt_rq[cpu] = rt_rq; 8007 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8100 init_rt_rq(rt_rq, rq); 8008 rt_rq->rt_nr_boosted = 0;
8009 rt_rq->rq = rq;
8101 rt_rq->tg = tg; 8010 rt_rq->tg = tg;
8102 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8103 8011
8012 tg->rt_rq[cpu] = rt_rq;
8104 tg->rt_se[cpu] = rt_se; 8013 tg->rt_se[cpu] = rt_se;
8014
8105 if (!rt_se) 8015 if (!rt_se)
8106 return; 8016 return;
8107 8017
@@ -8183,7 +8093,7 @@ void __init sched_init(void)
8183 rq->nr_running = 0; 8093 rq->nr_running = 0;
8184 rq->calc_load_active = 0; 8094 rq->calc_load_active = 0;
8185 rq->calc_load_update = jiffies + LOAD_FREQ; 8095 rq->calc_load_update = jiffies + LOAD_FREQ;
8186 init_cfs_rq(&rq->cfs, rq); 8096 init_cfs_rq(&rq->cfs);
8187 init_rt_rq(&rq->rt, rq); 8097 init_rt_rq(&rq->rt, rq);
8188#ifdef CONFIG_FAIR_GROUP_SCHED 8098#ifdef CONFIG_FAIR_GROUP_SCHED
8189 root_task_group.shares = root_task_group_load; 8099 root_task_group.shares = root_task_group_load;
@@ -8224,7 +8134,7 @@ void __init sched_init(void)
8224#ifdef CONFIG_SMP 8134#ifdef CONFIG_SMP
8225 rq->sd = NULL; 8135 rq->sd = NULL;
8226 rq->rd = NULL; 8136 rq->rd = NULL;
8227 rq->cpu_power = SCHED_LOAD_SCALE; 8137 rq->cpu_power = SCHED_POWER_SCALE;
8228 rq->post_schedule = 0; 8138 rq->post_schedule = 0;
8229 rq->active_balance = 0; 8139 rq->active_balance = 0;
8230 rq->next_balance = jiffies; 8140 rq->next_balance = jiffies;
@@ -8254,7 +8164,7 @@ void __init sched_init(void)
8254#endif 8164#endif
8255 8165
8256#ifdef CONFIG_RT_MUTEXES 8166#ifdef CONFIG_RT_MUTEXES
8257 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); 8167 plist_head_init(&init_task.pi_waiters);
8258#endif 8168#endif
8259 8169
8260 /* 8170 /*
@@ -8281,6 +8191,7 @@ void __init sched_init(void)
8281 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 8191 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8282 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 8192 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8283#ifdef CONFIG_SMP 8193#ifdef CONFIG_SMP
8194 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8284#ifdef CONFIG_NO_HZ 8195#ifdef CONFIG_NO_HZ
8285 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 8196 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8286 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 8197 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@ -8296,7 +8207,7 @@ void __init sched_init(void)
8296 scheduler_running = 1; 8207 scheduler_running = 1;
8297} 8208}
8298 8209
8299#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 8210#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
8300static inline int preempt_count_equals(int preempt_offset) 8211static inline int preempt_count_equals(int preempt_offset)
8301{ 8212{
8302 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 8213 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
@@ -8306,7 +8217,6 @@ static inline int preempt_count_equals(int preempt_offset)
8306 8217
8307void __might_sleep(const char *file, int line, int preempt_offset) 8218void __might_sleep(const char *file, int line, int preempt_offset)
8308{ 8219{
8309#ifdef in_atomic
8310 static unsigned long prev_jiffy; /* ratelimiting */ 8220 static unsigned long prev_jiffy; /* ratelimiting */
8311 8221
8312 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 8222 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
@@ -8328,7 +8238,6 @@ void __might_sleep(const char *file, int line, int preempt_offset)
8328 if (irqs_disabled()) 8238 if (irqs_disabled())
8329 print_irqtrace_events(current); 8239 print_irqtrace_events(current);
8330 dump_stack(); 8240 dump_stack();
8331#endif
8332} 8241}
8333EXPORT_SYMBOL(__might_sleep); 8242EXPORT_SYMBOL(__might_sleep);
8334#endif 8243#endif
@@ -8340,7 +8249,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8340 int old_prio = p->prio; 8249 int old_prio = p->prio;
8341 int on_rq; 8250 int on_rq;
8342 8251
8343 on_rq = p->se.on_rq; 8252 on_rq = p->on_rq;
8344 if (on_rq) 8253 if (on_rq)
8345 deactivate_task(rq, p, 0); 8254 deactivate_task(rq, p, 0);
8346 __setscheduler(rq, p, SCHED_NORMAL, 0); 8255 __setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8487,6 +8396,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8487 if (!se) 8396 if (!se)
8488 goto err_free_rq; 8397 goto err_free_rq;
8489 8398
8399 init_cfs_rq(cfs_rq);
8490 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); 8400 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8491 } 8401 }
8492 8402
@@ -8514,7 +8424,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8514 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); 8424 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8515 raw_spin_unlock_irqrestore(&rq->lock, flags); 8425 raw_spin_unlock_irqrestore(&rq->lock, flags);
8516} 8426}
8517#else /* !CONFG_FAIR_GROUP_SCHED */ 8427#else /* !CONFIG_FAIR_GROUP_SCHED */
8518static inline void free_fair_sched_group(struct task_group *tg) 8428static inline void free_fair_sched_group(struct task_group *tg)
8519{ 8429{
8520} 8430}
@@ -8535,7 +8445,8 @@ static void free_rt_sched_group(struct task_group *tg)
8535{ 8445{
8536 int i; 8446 int i;
8537 8447
8538 destroy_rt_bandwidth(&tg->rt_bandwidth); 8448 if (tg->rt_se)
8449 destroy_rt_bandwidth(&tg->rt_bandwidth);
8539 8450
8540 for_each_possible_cpu(i) { 8451 for_each_possible_cpu(i) {
8541 if (tg->rt_rq) 8452 if (tg->rt_rq)
@@ -8553,7 +8464,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8553{ 8464{
8554 struct rt_rq *rt_rq; 8465 struct rt_rq *rt_rq;
8555 struct sched_rt_entity *rt_se; 8466 struct sched_rt_entity *rt_se;
8556 struct rq *rq;
8557 int i; 8467 int i;
8558 8468
8559 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8469 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8567,8 +8477,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8567 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8477 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8568 8478
8569 for_each_possible_cpu(i) { 8479 for_each_possible_cpu(i) {
8570 rq = cpu_rq(i);
8571
8572 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8480 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8573 GFP_KERNEL, cpu_to_node(i)); 8481 GFP_KERNEL, cpu_to_node(i));
8574 if (!rt_rq) 8482 if (!rt_rq)
@@ -8579,6 +8487,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8579 if (!rt_se) 8487 if (!rt_se)
8580 goto err_free_rq; 8488 goto err_free_rq;
8581 8489
8490 init_rt_rq(rt_rq, cpu_rq(i));
8491 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8582 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); 8492 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8583 } 8493 }
8584 8494
@@ -8683,7 +8593,7 @@ void sched_move_task(struct task_struct *tsk)
8683 rq = task_rq_lock(tsk, &flags); 8593 rq = task_rq_lock(tsk, &flags);
8684 8594
8685 running = task_current(rq, tsk); 8595 running = task_current(rq, tsk);
8686 on_rq = tsk->se.on_rq; 8596 on_rq = tsk->on_rq;
8687 8597
8688 if (on_rq) 8598 if (on_rq)
8689 dequeue_task(rq, tsk, 0); 8599 dequeue_task(rq, tsk, 0);
@@ -8702,7 +8612,7 @@ void sched_move_task(struct task_struct *tsk)
8702 if (on_rq) 8612 if (on_rq)
8703 enqueue_task(rq, tsk, 0); 8613 enqueue_task(rq, tsk, 0);
8704 8614
8705 task_rq_unlock(rq, &flags); 8615 task_rq_unlock(rq, tsk, &flags);
8706} 8616}
8707#endif /* CONFIG_CGROUP_SCHED */ 8617#endif /* CONFIG_CGROUP_SCHED */
8708 8618
@@ -8720,10 +8630,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8720 if (!tg->se[0]) 8630 if (!tg->se[0])
8721 return -EINVAL; 8631 return -EINVAL;
8722 8632
8723 if (shares < MIN_SHARES) 8633 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8724 shares = MIN_SHARES;
8725 else if (shares > MAX_SHARES)
8726 shares = MAX_SHARES;
8727 8634
8728 mutex_lock(&shares_mutex); 8635 mutex_lock(&shares_mutex);
8729 if (tg->shares == shares) 8636 if (tg->shares == shares)
@@ -9073,42 +8980,10 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
9073 return 0; 8980 return 0;
9074} 8981}
9075 8982
9076static int
9077cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9078 struct task_struct *tsk, bool threadgroup)
9079{
9080 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
9081 if (retval)
9082 return retval;
9083 if (threadgroup) {
9084 struct task_struct *c;
9085 rcu_read_lock();
9086 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
9087 retval = cpu_cgroup_can_attach_task(cgrp, c);
9088 if (retval) {
9089 rcu_read_unlock();
9090 return retval;
9091 }
9092 }
9093 rcu_read_unlock();
9094 }
9095 return 0;
9096}
9097
9098static void 8983static void
9099cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 8984cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
9100 struct cgroup *old_cont, struct task_struct *tsk,
9101 bool threadgroup)
9102{ 8985{
9103 sched_move_task(tsk); 8986 sched_move_task(tsk);
9104 if (threadgroup) {
9105 struct task_struct *c;
9106 rcu_read_lock();
9107 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
9108 sched_move_task(c);
9109 }
9110 rcu_read_unlock();
9111 }
9112} 8987}
9113 8988
9114static void 8989static void
@@ -9130,14 +9005,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
9130static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 9005static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9131 u64 shareval) 9006 u64 shareval)
9132{ 9007{
9133 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 9008 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
9134} 9009}
9135 9010
9136static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 9011static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9137{ 9012{
9138 struct task_group *tg = cgroup_tg(cgrp); 9013 struct task_group *tg = cgroup_tg(cgrp);
9139 9014
9140 return (u64) tg->shares; 9015 return (u64) scale_load_down(tg->shares);
9141} 9016}
9142#endif /* CONFIG_FAIR_GROUP_SCHED */ 9017#endif /* CONFIG_FAIR_GROUP_SCHED */
9143 9018
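cpu_shares_write_u64() and cpu_shares_read_u64() above now convert the cgroup-visible share value with scale_load()/scale_load_down(), and sched_group_set_shares() earlier in this diff clamps the scaled weight in a single clamp() call; together they let the scheduler keep task-group weight at a higher fixed-point resolution than the user interface exposes. A sketch of that conversion; the shift width and limits below are made up for illustration:

#define RESOLUTION   10
#define MIN_SHARES   2UL
#define MAX_SHARES   (1UL << 18)

static inline unsigned long scale_up(unsigned long w)   { return w << RESOLUTION; }
static inline unsigned long scale_down(unsigned long w) { return w >> RESOLUTION; }

/* Write side: scale to internal units, then clamp to the valid range. */
static unsigned long set_shares(unsigned long user_val)
{
        unsigned long shares = scale_up(user_val);

        if (shares < scale_up(MIN_SHARES))
                shares = scale_up(MIN_SHARES);
        else if (shares > scale_up(MAX_SHARES))
                shares = scale_up(MAX_SHARES);
        return shares;                  /* stored, high-resolution weight */
}

/* Read side: report the weight back in user-visible units. */
static unsigned long get_shares(unsigned long stored)
{
        return scale_down(stored);
}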
@@ -9196,8 +9071,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9196 .name = "cpu", 9071 .name = "cpu",
9197 .create = cpu_cgroup_create, 9072 .create = cpu_cgroup_create,
9198 .destroy = cpu_cgroup_destroy, 9073 .destroy = cpu_cgroup_destroy,
9199 .can_attach = cpu_cgroup_can_attach, 9074 .can_attach_task = cpu_cgroup_can_attach_task,
9200 .attach = cpu_cgroup_attach, 9075 .attach_task = cpu_cgroup_attach_task,
9201 .exit = cpu_cgroup_exit, 9076 .exit = cpu_cgroup_exit,
9202 .populate = cpu_cgroup_populate, 9077 .populate = cpu_cgroup_populate,
9203 .subsys_id = cpu_cgroup_subsys_id, 9078 .subsys_id = cpu_cgroup_subsys_id,
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 05577055cfca..c2f0e7248dca 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -13,6 +13,7 @@ struct autogroup {
13 int nice; 13 int nice;
14}; 14};
15 15
16static inline bool task_group_is_autogroup(struct task_group *tg);
16static inline struct task_group * 17static inline struct task_group *
17autogroup_task_group(struct task_struct *p, struct task_group *tg); 18autogroup_task_group(struct task_struct *p, struct task_group *tg);
18 19
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 7bacd83a4158..a6710a112b4f 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
152 read_lock_irqsave(&tasklist_lock, flags); 152 read_lock_irqsave(&tasklist_lock, flags);
153 153
154 do_each_thread(g, p) { 154 do_each_thread(g, p) {
155 if (!p->se.on_rq || task_cpu(p) != rq_cpu) 155 if (!p->on_rq || task_cpu(p) != rq_cpu)
156 continue; 156 continue;
157 157
158 print_task(m, rq, p); 158 print_task(m, rq, p);
@@ -296,9 +296,6 @@ static void print_cpu(struct seq_file *m, int cpu)
296 P(ttwu_count); 296 P(ttwu_count);
297 P(ttwu_local); 297 P(ttwu_local);
298 298
299 SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
300 rq->rq_sched_info.bkl_count);
301
302#undef P 299#undef P
303#undef P64 300#undef P64
304#endif 301#endif
@@ -441,7 +438,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
441 P(se.statistics.wait_count); 438 P(se.statistics.wait_count);
442 PN(se.statistics.iowait_sum); 439 PN(se.statistics.iowait_sum);
443 P(se.statistics.iowait_count); 440 P(se.statistics.iowait_count);
444 P(sched_info.bkl_count);
445 P(se.nr_migrations); 441 P(se.nr_migrations);
446 P(se.statistics.nr_migrations_cold); 442 P(se.statistics.nr_migrations_cold);
447 P(se.statistics.nr_failed_migrations_affine); 443 P(se.statistics.nr_failed_migrations_affine);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6fa833ab2cb8..bc8ee9993814 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -135,14 +135,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
135 return grp->my_q; 135 return grp->my_q;
136} 136}
137 137
138/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
139 * another cpu ('this_cpu')
140 */
141static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
142{
143 return cfs_rq->tg->cfs_rq[this_cpu];
144}
145
146static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 138static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
147{ 139{
148 if (!cfs_rq->on_list) { 140 if (!cfs_rq->on_list) {
@@ -271,11 +263,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
271 return NULL; 263 return NULL;
272} 264}
273 265
274static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
275{
276 return &cpu_rq(this_cpu)->cfs;
277}
278
279static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 266static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
280{ 267{
281} 268}
@@ -334,11 +321,6 @@ static inline int entity_before(struct sched_entity *a,
334 return (s64)(a->vruntime - b->vruntime) < 0; 321 return (s64)(a->vruntime - b->vruntime) < 0;
335} 322}
336 323
337static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
338{
339 return se->vruntime - cfs_rq->min_vruntime;
340}
341
342static void update_min_vruntime(struct cfs_rq *cfs_rq) 324static void update_min_vruntime(struct cfs_rq *cfs_rq)
343{ 325{
344 u64 vruntime = cfs_rq->min_vruntime; 326 u64 vruntime = cfs_rq->min_vruntime;
@@ -358,6 +340,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
358 } 340 }
359 341
360 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); 342 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
343#ifndef CONFIG_64BIT
344 smp_wmb();
345 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
346#endif
361} 347}
362 348
363/* 349/*
@@ -368,7 +354,6 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
368 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; 354 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
369 struct rb_node *parent = NULL; 355 struct rb_node *parent = NULL;
370 struct sched_entity *entry; 356 struct sched_entity *entry;
371 s64 key = entity_key(cfs_rq, se);
372 int leftmost = 1; 357 int leftmost = 1;
373 358
374 /* 359 /*
@@ -381,7 +366,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
381 * We don't care about collisions. Nodes with 366 * We don't care about collisions. Nodes with
382 * the same key stay together. 367 * the same key stay together.
383 */ 368 */
384 if (key < entity_key(cfs_rq, entry)) { 369 if (entity_before(se, entry)) {
385 link = &parent->rb_left; 370 link = &parent->rb_left;
386 } else { 371 } else {
387 link = &parent->rb_right; 372 link = &parent->rb_right;
@@ -1072,8 +1057,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1072 se->on_rq = 0; 1057 se->on_rq = 0;
1073 update_cfs_load(cfs_rq, 0); 1058 update_cfs_load(cfs_rq, 0);
1074 account_entity_dequeue(cfs_rq, se); 1059 account_entity_dequeue(cfs_rq, se);
1075 update_min_vruntime(cfs_rq);
1076 update_cfs_shares(cfs_rq);
1077 1060
1078 /* 1061 /*
1079 * Normalize the entity after updating the min_vruntime because the 1062 * Normalize the entity after updating the min_vruntime because the
@@ -1082,6 +1065,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1082 */ 1065 */
1083 if (!(flags & DEQUEUE_SLEEP)) 1066 if (!(flags & DEQUEUE_SLEEP))
1084 se->vruntime -= cfs_rq->min_vruntime; 1067 se->vruntime -= cfs_rq->min_vruntime;
1068
1069 update_min_vruntime(cfs_rq);
1070 update_cfs_shares(cfs_rq);
1085} 1071}
1086 1072
1087/* 1073/*
@@ -1331,7 +1317,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1331 } 1317 }
1332 1318
1333 for_each_sched_entity(se) { 1319 for_each_sched_entity(se) {
1334 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1320 cfs_rq = cfs_rq_of(se);
1335 1321
1336 update_cfs_load(cfs_rq, 0); 1322 update_cfs_load(cfs_rq, 0);
1337 update_cfs_shares(cfs_rq); 1323 update_cfs_shares(cfs_rq);
@@ -1340,6 +1326,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1340 hrtick_update(rq); 1326 hrtick_update(rq);
1341} 1327}
1342 1328
1329static void set_next_buddy(struct sched_entity *se);
1330
1343/* 1331/*
1344 * The dequeue_task method is called before nr_running is 1332 * The dequeue_task method is called before nr_running is
1345 * decreased. We remove the task from the rbtree and 1333 * decreased. We remove the task from the rbtree and
@@ -1349,19 +1337,30 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1349{ 1337{
1350 struct cfs_rq *cfs_rq; 1338 struct cfs_rq *cfs_rq;
1351 struct sched_entity *se = &p->se; 1339 struct sched_entity *se = &p->se;
1340 int task_sleep = flags & DEQUEUE_SLEEP;
1352 1341
1353 for_each_sched_entity(se) { 1342 for_each_sched_entity(se) {
1354 cfs_rq = cfs_rq_of(se); 1343 cfs_rq = cfs_rq_of(se);
1355 dequeue_entity(cfs_rq, se, flags); 1344 dequeue_entity(cfs_rq, se, flags);
1356 1345
1357 /* Don't dequeue parent if it has other entities besides us */ 1346 /* Don't dequeue parent if it has other entities besides us */
1358 if (cfs_rq->load.weight) 1347 if (cfs_rq->load.weight) {
1348 /*
1349 * Bias pick_next to pick a task from this cfs_rq, as
1350 * p is sleeping when it is within its sched_slice.
1351 */
1352 if (task_sleep && parent_entity(se))
1353 set_next_buddy(parent_entity(se));
1354
1355 /* avoid re-evaluating load for this entity */
1356 se = parent_entity(se);
1359 break; 1357 break;
1358 }
1360 flags |= DEQUEUE_SLEEP; 1359 flags |= DEQUEUE_SLEEP;
1361 } 1360 }
1362 1361
1363 for_each_sched_entity(se) { 1362 for_each_sched_entity(se) {
1364 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1363 cfs_rq = cfs_rq_of(se);
1365 1364
1366 update_cfs_load(cfs_rq, 0); 1365 update_cfs_load(cfs_rq, 0);
1367 update_cfs_shares(cfs_rq); 1366 update_cfs_shares(cfs_rq);
@@ -1372,12 +1371,25 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1372 1371
1373#ifdef CONFIG_SMP 1372#ifdef CONFIG_SMP
1374 1373
1375static void task_waking_fair(struct rq *rq, struct task_struct *p) 1374static void task_waking_fair(struct task_struct *p)
1376{ 1375{
1377 struct sched_entity *se = &p->se; 1376 struct sched_entity *se = &p->se;
1378 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1377 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1378 u64 min_vruntime;
1379 1379
1380 se->vruntime -= cfs_rq->min_vruntime; 1380#ifndef CONFIG_64BIT
1381 u64 min_vruntime_copy;
1382
1383 do {
1384 min_vruntime_copy = cfs_rq->min_vruntime_copy;
1385 smp_rmb();
1386 min_vruntime = cfs_rq->min_vruntime;
1387 } while (min_vruntime != min_vruntime_copy);
1388#else
1389 min_vruntime = cfs_rq->min_vruntime;
1390#endif
1391
1392 se->vruntime -= min_vruntime;
1381} 1393}
1382 1394
1383#ifdef CONFIG_FAIR_GROUP_SCHED 1395#ifdef CONFIG_FAIR_GROUP_SCHED
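(Aside, not part of the patch: the hunks above pair the smp_wmb() added to update_min_vruntime() with the smp_rmb() retry loop in task_waking_fair(), so a 32-bit kernel never acts on a torn 64-bit min_vruntime. A minimal userspace sketch of the same copy-and-retry idea follows; C11 fences stand in for the kernel barriers, the variables are left plain rather than _Atomic for brevity, and every name is invented for illustration.)

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t min_vruntime;
static uint64_t min_vruntime_copy;

static void publish(uint64_t v)
{
	min_vruntime = v;
	atomic_thread_fence(memory_order_release);	/* kernel: smp_wmb() */
	min_vruntime_copy = v;
}

static uint64_t snapshot(void)
{
	uint64_t copy, val;

	do {	/* retry until value and copy agree, i.e. no torn read seen */
		copy = min_vruntime_copy;
		atomic_thread_fence(memory_order_acquire);	/* kernel: smp_rmb() */
		val = min_vruntime;
	} while (val != copy);

	return val;
}

int main(void)
{
	publish(123456789ULL);
	printf("snapshot: %llu\n", (unsigned long long)snapshot());
	return 0;
}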
@@ -1453,7 +1465,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1453 * effect of the currently running task from the load 1465 * effect of the currently running task from the load
1454 * of the current CPU: 1466 * of the current CPU:
1455 */ 1467 */
1456 rcu_read_lock();
1457 if (sync) { 1468 if (sync) {
1458 tg = task_group(current); 1469 tg = task_group(current);
1459 weight = current->se.load.weight; 1470 weight = current->se.load.weight;
@@ -1489,7 +1500,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1489 balanced = this_eff_load <= prev_eff_load; 1500 balanced = this_eff_load <= prev_eff_load;
1490 } else 1501 } else
1491 balanced = true; 1502 balanced = true;
1492 rcu_read_unlock();
1493 1503
1494 /* 1504 /*
1495 * If the currently running task will sleep within 1505 * If the currently running task will sleep within
@@ -1557,7 +1567,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1557 } 1567 }
1558 1568
1559 /* Adjust by relative CPU power of the group */ 1569 /* Adjust by relative CPU power of the group */
1560 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 1570 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
1561 1571
1562 if (local_group) { 1572 if (local_group) {
1563 this_load = avg_load; 1573 this_load = avg_load;
@@ -1622,6 +1632,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1622 /* 1632 /*
1623 * Otherwise, iterate the domains and find an eligible idle cpu. 1633 * Otherwise, iterate the domains and find an eligible idle cpu.
1624 */ 1634 */
1635 rcu_read_lock();
1625 for_each_domain(target, sd) { 1636 for_each_domain(target, sd) {
1626 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) 1637 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1627 break; 1638 break;
@@ -1641,6 +1652,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1641 cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 1652 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1642 break; 1653 break;
1643 } 1654 }
1655 rcu_read_unlock();
1644 1656
1645 return target; 1657 return target;
1646} 1658}
@@ -1657,7 +1669,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1657 * preempt must be disabled. 1669 * preempt must be disabled.
1658 */ 1670 */
1659static int 1671static int
1660select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) 1672select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1661{ 1673{
1662 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1674 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1663 int cpu = smp_processor_id(); 1675 int cpu = smp_processor_id();
@@ -1673,6 +1685,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1673 new_cpu = prev_cpu; 1685 new_cpu = prev_cpu;
1674 } 1686 }
1675 1687
1688 rcu_read_lock();
1676 for_each_domain(cpu, tmp) { 1689 for_each_domain(cpu, tmp) {
1677 if (!(tmp->flags & SD_LOAD_BALANCE)) 1690 if (!(tmp->flags & SD_LOAD_BALANCE))
1678 continue; 1691 continue;
@@ -1692,7 +1705,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1692 nr_running += cpu_rq(i)->cfs.nr_running; 1705 nr_running += cpu_rq(i)->cfs.nr_running;
1693 } 1706 }
1694 1707
1695 capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 1708 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
1696 1709
1697 if (tmp->flags & SD_POWERSAVINGS_BALANCE) 1710 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1698 nr_running /= 2; 1711 nr_running /= 2;
@@ -1723,9 +1736,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1723 1736
1724 if (affine_sd) { 1737 if (affine_sd) {
1725 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1738 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1726 return select_idle_sibling(p, cpu); 1739 prev_cpu = cpu;
1727 else 1740
1728 return select_idle_sibling(p, prev_cpu); 1741 new_cpu = select_idle_sibling(p, prev_cpu);
1742 goto unlock;
1729 } 1743 }
1730 1744
1731 while (sd) { 1745 while (sd) {
@@ -1766,6 +1780,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1766 } 1780 }
1767 /* while loop will break here if sd == NULL */ 1781 /* while loop will break here if sd == NULL */
1768 } 1782 }
1783unlock:
1784 rcu_read_unlock();
1769 1785
1770 return new_cpu; 1786 return new_cpu;
1771} 1787}
@@ -1789,10 +1805,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1789 * This is especially important for buddies when the leftmost 1805 * This is especially important for buddies when the leftmost
1790 * task is higher priority than the buddy. 1806 * task is higher priority than the buddy.
1791 */ 1807 */
1792 if (unlikely(se->load.weight != NICE_0_LOAD)) 1808 return calc_delta_fair(gran, se);
1793 gran = calc_delta_fair(gran, se);
1794
1795 return gran;
1796} 1809}
1797 1810
1798/* 1811/*
@@ -1826,26 +1839,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1826 1839
1827static void set_last_buddy(struct sched_entity *se) 1840static void set_last_buddy(struct sched_entity *se)
1828{ 1841{
1829 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1842 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1830 for_each_sched_entity(se) 1843 return;
1831 cfs_rq_of(se)->last = se; 1844
1832 } 1845 for_each_sched_entity(se)
1846 cfs_rq_of(se)->last = se;
1833} 1847}
1834 1848
1835static void set_next_buddy(struct sched_entity *se) 1849static void set_next_buddy(struct sched_entity *se)
1836{ 1850{
1837 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1851 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1838 for_each_sched_entity(se) 1852 return;
1839 cfs_rq_of(se)->next = se; 1853
1840 } 1854 for_each_sched_entity(se)
1855 cfs_rq_of(se)->next = se;
1841} 1856}
1842 1857
1843static void set_skip_buddy(struct sched_entity *se) 1858static void set_skip_buddy(struct sched_entity *se)
1844{ 1859{
1845 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1860 for_each_sched_entity(se)
1846 for_each_sched_entity(se) 1861 cfs_rq_of(se)->skip = se;
1847 cfs_rq_of(se)->skip = se;
1848 }
1849} 1862}
1850 1863
1851/* 1864/*
@@ -1857,12 +1870,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1857 struct sched_entity *se = &curr->se, *pse = &p->se; 1870 struct sched_entity *se = &curr->se, *pse = &p->se;
1858 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1871 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1859 int scale = cfs_rq->nr_running >= sched_nr_latency; 1872 int scale = cfs_rq->nr_running >= sched_nr_latency;
1873 int next_buddy_marked = 0;
1860 1874
1861 if (unlikely(se == pse)) 1875 if (unlikely(se == pse))
1862 return; 1876 return;
1863 1877
1864 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) 1878 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
1865 set_next_buddy(pse); 1879 set_next_buddy(pse);
1880 next_buddy_marked = 1;
1881 }
1866 1882
1867 /* 1883 /*
1868 * We can come here with TIF_NEED_RESCHED already set from new task 1884 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1887,11 +1903,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1887 if (!sched_feat(WAKEUP_PREEMPT)) 1903 if (!sched_feat(WAKEUP_PREEMPT))
1888 return; 1904 return;
1889 1905
1890 update_curr(cfs_rq);
1891 find_matching_se(&se, &pse); 1906 find_matching_se(&se, &pse);
1907 update_curr(cfs_rq_of(se));
1892 BUG_ON(!pse); 1908 BUG_ON(!pse);
1893 if (wakeup_preempt_entity(se, pse) == 1) 1909 if (wakeup_preempt_entity(se, pse) == 1) {
1910 /*
1911 * Bias pick_next to pick the sched entity that is
1912 * triggering this preemption.
1913 */
1914 if (!next_buddy_marked)
1915 set_next_buddy(pse);
1894 goto preempt; 1916 goto preempt;
1917 }
1895 1918
1896 return; 1919 return;
1897 1920
@@ -2102,7 +2125,7 @@ static unsigned long
2102balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2125balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2103 unsigned long max_load_move, struct sched_domain *sd, 2126 unsigned long max_load_move, struct sched_domain *sd,
2104 enum cpu_idle_type idle, int *all_pinned, 2127 enum cpu_idle_type idle, int *all_pinned,
2105 int *this_best_prio, struct cfs_rq *busiest_cfs_rq) 2128 struct cfs_rq *busiest_cfs_rq)
2106{ 2129{
2107 int loops = 0, pulled = 0; 2130 int loops = 0, pulled = 0;
2108 long rem_load_move = max_load_move; 2131 long rem_load_move = max_load_move;
@@ -2140,9 +2163,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2140 */ 2163 */
2141 if (rem_load_move <= 0) 2164 if (rem_load_move <= 0)
2142 break; 2165 break;
2143
2144 if (p->prio < *this_best_prio)
2145 *this_best_prio = p->prio;
2146 } 2166 }
2147out: 2167out:
2148 /* 2168 /*
@@ -2193,26 +2213,56 @@ static void update_shares(int cpu)
2193 struct rq *rq = cpu_rq(cpu); 2213 struct rq *rq = cpu_rq(cpu);
2194 2214
2195 rcu_read_lock(); 2215 rcu_read_lock();
2216 /*
2217 * Iterates the task_group tree in a bottom up fashion, see
2218 * list_add_leaf_cfs_rq() for details.
2219 */
2196 for_each_leaf_cfs_rq(rq, cfs_rq) 2220 for_each_leaf_cfs_rq(rq, cfs_rq)
2197 update_shares_cpu(cfs_rq->tg, cpu); 2221 update_shares_cpu(cfs_rq->tg, cpu);
2198 rcu_read_unlock(); 2222 rcu_read_unlock();
2199} 2223}
2200 2224
2225/*
2226 * Compute the cpu's hierarchical load factor for each task group.
2227 * This needs to be done in a top-down fashion because the load of a child
 2228 * group is a fraction of its parent's load.
2229 */
2230static int tg_load_down(struct task_group *tg, void *data)
2231{
2232 unsigned long load;
2233 long cpu = (long)data;
2234
2235 if (!tg->parent) {
2236 load = cpu_rq(cpu)->load.weight;
2237 } else {
2238 load = tg->parent->cfs_rq[cpu]->h_load;
2239 load *= tg->se[cpu]->load.weight;
2240 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
2241 }
2242
2243 tg->cfs_rq[cpu]->h_load = load;
2244
2245 return 0;
2246}
2247
2248static void update_h_load(long cpu)
2249{
2250 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
2251}
2252
2201static unsigned long 2253static unsigned long
2202load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2254load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2203 unsigned long max_load_move, 2255 unsigned long max_load_move,
2204 struct sched_domain *sd, enum cpu_idle_type idle, 2256 struct sched_domain *sd, enum cpu_idle_type idle,
2205 int *all_pinned, int *this_best_prio) 2257 int *all_pinned)
2206{ 2258{
2207 long rem_load_move = max_load_move; 2259 long rem_load_move = max_load_move;
2208 int busiest_cpu = cpu_of(busiest); 2260 struct cfs_rq *busiest_cfs_rq;
2209 struct task_group *tg;
2210 2261
2211 rcu_read_lock(); 2262 rcu_read_lock();
2212 update_h_load(busiest_cpu); 2263 update_h_load(cpu_of(busiest));
2213 2264
2214 list_for_each_entry_rcu(tg, &task_groups, list) { 2265 for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
2215 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
2216 unsigned long busiest_h_load = busiest_cfs_rq->h_load; 2266 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
2217 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 2267 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
2218 u64 rem_load, moved_load; 2268 u64 rem_load, moved_load;
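(Aside, not part of the patch: tg_load_down() in the hunk above computes each group's hierarchical load top-down as load = parent->h_load * se->load.weight / (parent_cfs_rq->load.weight + 1). With numbers invented for illustration: if the root cfs_rq on a CPU carries load.weight 3072 and a child group's entity accounts for 1024 of that, the child's h_load is 3072 * 1024 / (3072 + 1) = 1023, roughly a third of the CPU's load; the +1 only guards against dividing by zero when the parent queue is momentarily empty.)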
@@ -2227,7 +2277,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2227 rem_load = div_u64(rem_load, busiest_h_load + 1); 2277 rem_load = div_u64(rem_load, busiest_h_load + 1);
2228 2278
2229 moved_load = balance_tasks(this_rq, this_cpu, busiest, 2279 moved_load = balance_tasks(this_rq, this_cpu, busiest,
2230 rem_load, sd, idle, all_pinned, this_best_prio, 2280 rem_load, sd, idle, all_pinned,
2231 busiest_cfs_rq); 2281 busiest_cfs_rq);
2232 2282
2233 if (!moved_load) 2283 if (!moved_load)
@@ -2253,11 +2303,11 @@ static unsigned long
2253load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2303load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2254 unsigned long max_load_move, 2304 unsigned long max_load_move,
2255 struct sched_domain *sd, enum cpu_idle_type idle, 2305 struct sched_domain *sd, enum cpu_idle_type idle,
2256 int *all_pinned, int *this_best_prio) 2306 int *all_pinned)
2257{ 2307{
2258 return balance_tasks(this_rq, this_cpu, busiest, 2308 return balance_tasks(this_rq, this_cpu, busiest,
2259 max_load_move, sd, idle, all_pinned, 2309 max_load_move, sd, idle, all_pinned,
2260 this_best_prio, &busiest->cfs); 2310 &busiest->cfs);
2261} 2311}
2262#endif 2312#endif
2263 2313
@@ -2274,12 +2324,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2274 int *all_pinned) 2324 int *all_pinned)
2275{ 2325{
2276 unsigned long total_load_moved = 0, load_moved; 2326 unsigned long total_load_moved = 0, load_moved;
2277 int this_best_prio = this_rq->curr->prio;
2278 2327
2279 do { 2328 do {
2280 load_moved = load_balance_fair(this_rq, this_cpu, busiest, 2329 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2281 max_load_move - total_load_moved, 2330 max_load_move - total_load_moved,
2282 sd, idle, all_pinned, &this_best_prio); 2331 sd, idle, all_pinned);
2283 2332
2284 total_load_moved += load_moved; 2333 total_load_moved += load_moved;
2285 2334
@@ -2534,7 +2583,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2534 2583
2535unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 2584unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2536{ 2585{
2537 return SCHED_LOAD_SCALE; 2586 return SCHED_POWER_SCALE;
2538} 2587}
2539 2588
2540unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) 2589unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
@@ -2571,10 +2620,10 @@ unsigned long scale_rt_power(int cpu)
2571 available = total - rq->rt_avg; 2620 available = total - rq->rt_avg;
2572 } 2621 }
2573 2622
2574 if (unlikely((s64)total < SCHED_LOAD_SCALE)) 2623 if (unlikely((s64)total < SCHED_POWER_SCALE))
2575 total = SCHED_LOAD_SCALE; 2624 total = SCHED_POWER_SCALE;
2576 2625
2577 total >>= SCHED_LOAD_SHIFT; 2626 total >>= SCHED_POWER_SHIFT;
2578 2627
2579 return div_u64(available, total); 2628 return div_u64(available, total);
2580} 2629}
@@ -2582,7 +2631,7 @@ unsigned long scale_rt_power(int cpu)
2582static void update_cpu_power(struct sched_domain *sd, int cpu) 2631static void update_cpu_power(struct sched_domain *sd, int cpu)
2583{ 2632{
2584 unsigned long weight = sd->span_weight; 2633 unsigned long weight = sd->span_weight;
2585 unsigned long power = SCHED_LOAD_SCALE; 2634 unsigned long power = SCHED_POWER_SCALE;
2586 struct sched_group *sdg = sd->groups; 2635 struct sched_group *sdg = sd->groups;
2587 2636
2588 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 2637 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
@@ -2591,26 +2640,26 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2591 else 2640 else
2592 power *= default_scale_smt_power(sd, cpu); 2641 power *= default_scale_smt_power(sd, cpu);
2593 2642
2594 power >>= SCHED_LOAD_SHIFT; 2643 power >>= SCHED_POWER_SHIFT;
2595 } 2644 }
2596 2645
2597 sdg->cpu_power_orig = power; 2646 sdg->sgp->power_orig = power;
2598 2647
2599 if (sched_feat(ARCH_POWER)) 2648 if (sched_feat(ARCH_POWER))
2600 power *= arch_scale_freq_power(sd, cpu); 2649 power *= arch_scale_freq_power(sd, cpu);
2601 else 2650 else
2602 power *= default_scale_freq_power(sd, cpu); 2651 power *= default_scale_freq_power(sd, cpu);
2603 2652
2604 power >>= SCHED_LOAD_SHIFT; 2653 power >>= SCHED_POWER_SHIFT;
2605 2654
2606 power *= scale_rt_power(cpu); 2655 power *= scale_rt_power(cpu);
2607 power >>= SCHED_LOAD_SHIFT; 2656 power >>= SCHED_POWER_SHIFT;
2608 2657
2609 if (!power) 2658 if (!power)
2610 power = 1; 2659 power = 1;
2611 2660
2612 cpu_rq(cpu)->cpu_power = power; 2661 cpu_rq(cpu)->cpu_power = power;
2613 sdg->cpu_power = power; 2662 sdg->sgp->power = power;
2614} 2663}
2615 2664
2616static void update_group_power(struct sched_domain *sd, int cpu) 2665static void update_group_power(struct sched_domain *sd, int cpu)
@@ -2628,11 +2677,11 @@ static void update_group_power(struct sched_domain *sd, int cpu)
2628 2677
2629 group = child->groups; 2678 group = child->groups;
2630 do { 2679 do {
2631 power += group->cpu_power; 2680 power += group->sgp->power;
2632 group = group->next; 2681 group = group->next;
2633 } while (group != child->groups); 2682 } while (group != child->groups);
2634 2683
2635 sdg->cpu_power = power; 2684 sdg->sgp->power = power;
2636} 2685}
2637 2686
2638/* 2687/*
@@ -2646,15 +2695,15 @@ static inline int
2646fix_small_capacity(struct sched_domain *sd, struct sched_group *group) 2695fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2647{ 2696{
2648 /* 2697 /*
2649 * Only siblings can have significantly less than SCHED_LOAD_SCALE 2698 * Only siblings can have significantly less than SCHED_POWER_SCALE
2650 */ 2699 */
2651 if (sd->level != SD_LV_SIBLING) 2700 if (!(sd->flags & SD_SHARE_CPUPOWER))
2652 return 0; 2701 return 0;
2653 2702
2654 /* 2703 /*
2655 * If ~90% of the cpu_power is still there, we're good. 2704 * If ~90% of the cpu_power is still there, we're good.
2656 */ 2705 */
2657 if (group->cpu_power * 32 > group->cpu_power_orig * 29) 2706 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
2658 return 1; 2707 return 1;
2659 2708
2660 return 0; 2709 return 0;
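(Aside, not part of the patch: the 32/29 comparison above is the "~90%" test from the comment, since power * 32 > power_orig * 29 is the integer form of power > 0.90625 * power_orig (29/32 = 0.90625). With numbers invented for illustration, a sibling whose power dropped from 589 to 560 still passes, 560 * 32 = 17920 > 589 * 29 = 17081, and keeps a capacity of one task, while a drop to 500 fails, 500 * 32 = 16000 < 17081.)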
@@ -2734,7 +2783,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2734 } 2783 }
2735 2784
2736 /* Adjust by relative CPU power of the group */ 2785 /* Adjust by relative CPU power of the group */
2737 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; 2786 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
2738 2787
2739 /* 2788 /*
2740 * Consider the group unbalanced when the imbalance is larger 2789 * Consider the group unbalanced when the imbalance is larger
@@ -2751,7 +2800,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2751 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) 2800 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
2752 sgs->group_imb = 1; 2801 sgs->group_imb = 1;
2753 2802
2754 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2803 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
2804 SCHED_POWER_SCALE);
2755 if (!sgs->group_capacity) 2805 if (!sgs->group_capacity)
2756 sgs->group_capacity = fix_small_capacity(sd, group); 2806 sgs->group_capacity = fix_small_capacity(sd, group);
2757 sgs->group_weight = group->group_weight; 2807 sgs->group_weight = group->group_weight;
@@ -2839,7 +2889,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2839 return; 2889 return;
2840 2890
2841 sds->total_load += sgs.group_load; 2891 sds->total_load += sgs.group_load;
2842 sds->total_pwr += sg->cpu_power; 2892 sds->total_pwr += sg->sgp->power;
2843 2893
2844 /* 2894 /*
2845 * In case the child domain prefers tasks go to siblings 2895 * In case the child domain prefers tasks go to siblings
@@ -2924,8 +2974,8 @@ static int check_asym_packing(struct sched_domain *sd,
2924 if (this_cpu > busiest_cpu) 2974 if (this_cpu > busiest_cpu)
2925 return 0; 2975 return 0;
2926 2976
2927 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, 2977 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
2928 SCHED_LOAD_SCALE); 2978 SCHED_POWER_SCALE);
2929 return 1; 2979 return 1;
2930} 2980}
2931 2981
@@ -2954,8 +3004,8 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2954 cpu_avg_load_per_task(this_cpu); 3004 cpu_avg_load_per_task(this_cpu);
2955 3005
2956 scaled_busy_load_per_task = sds->busiest_load_per_task 3006 scaled_busy_load_per_task = sds->busiest_load_per_task
2957 * SCHED_LOAD_SCALE; 3007 * SCHED_POWER_SCALE;
2958 scaled_busy_load_per_task /= sds->busiest->cpu_power; 3008 scaled_busy_load_per_task /= sds->busiest->sgp->power;
2959 3009
2960 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 3010 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2961 (scaled_busy_load_per_task * imbn)) { 3011 (scaled_busy_load_per_task * imbn)) {
@@ -2969,30 +3019,30 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2969 * moving them. 3019 * moving them.
2970 */ 3020 */
2971 3021
2972 pwr_now += sds->busiest->cpu_power * 3022 pwr_now += sds->busiest->sgp->power *
2973 min(sds->busiest_load_per_task, sds->max_load); 3023 min(sds->busiest_load_per_task, sds->max_load);
2974 pwr_now += sds->this->cpu_power * 3024 pwr_now += sds->this->sgp->power *
2975 min(sds->this_load_per_task, sds->this_load); 3025 min(sds->this_load_per_task, sds->this_load);
2976 pwr_now /= SCHED_LOAD_SCALE; 3026 pwr_now /= SCHED_POWER_SCALE;
2977 3027
2978 /* Amount of load we'd subtract */ 3028 /* Amount of load we'd subtract */
2979 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / 3029 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
2980 sds->busiest->cpu_power; 3030 sds->busiest->sgp->power;
2981 if (sds->max_load > tmp) 3031 if (sds->max_load > tmp)
2982 pwr_move += sds->busiest->cpu_power * 3032 pwr_move += sds->busiest->sgp->power *
2983 min(sds->busiest_load_per_task, sds->max_load - tmp); 3033 min(sds->busiest_load_per_task, sds->max_load - tmp);
2984 3034
2985 /* Amount of load we'd add */ 3035 /* Amount of load we'd add */
2986 if (sds->max_load * sds->busiest->cpu_power < 3036 if (sds->max_load * sds->busiest->sgp->power <
2987 sds->busiest_load_per_task * SCHED_LOAD_SCALE) 3037 sds->busiest_load_per_task * SCHED_POWER_SCALE)
2988 tmp = (sds->max_load * sds->busiest->cpu_power) / 3038 tmp = (sds->max_load * sds->busiest->sgp->power) /
2989 sds->this->cpu_power; 3039 sds->this->sgp->power;
2990 else 3040 else
2991 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / 3041 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
2992 sds->this->cpu_power; 3042 sds->this->sgp->power;
2993 pwr_move += sds->this->cpu_power * 3043 pwr_move += sds->this->sgp->power *
2994 min(sds->this_load_per_task, sds->this_load + tmp); 3044 min(sds->this_load_per_task, sds->this_load + tmp);
2995 pwr_move /= SCHED_LOAD_SCALE; 3045 pwr_move /= SCHED_POWER_SCALE;
2996 3046
2997 /* Move if we gain throughput */ 3047 /* Move if we gain throughput */
2998 if (pwr_move > pwr_now) 3048 if (pwr_move > pwr_now)
@@ -3034,9 +3084,9 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3034 load_above_capacity = (sds->busiest_nr_running - 3084 load_above_capacity = (sds->busiest_nr_running -
3035 sds->busiest_group_capacity); 3085 sds->busiest_group_capacity);
3036 3086
3037 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); 3087 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
3038 3088
3039 load_above_capacity /= sds->busiest->cpu_power; 3089 load_above_capacity /= sds->busiest->sgp->power;
3040 } 3090 }
3041 3091
3042 /* 3092 /*
@@ -3052,9 +3102,9 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3052 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 3102 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
3053 3103
3054 /* How much load to actually move to equalise the imbalance */ 3104 /* How much load to actually move to equalise the imbalance */
3055 *imbalance = min(max_pull * sds->busiest->cpu_power, 3105 *imbalance = min(max_pull * sds->busiest->sgp->power,
3056 (sds->avg_load - sds->this_load) * sds->this->cpu_power) 3106 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
3057 / SCHED_LOAD_SCALE; 3107 / SCHED_POWER_SCALE;
3058 3108
3059 /* 3109 /*
3060 * if *imbalance is less than the average load per runnable task 3110 * if *imbalance is less than the average load per runnable task
@@ -3123,7 +3173,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3123 if (!sds.busiest || sds.busiest_nr_running == 0) 3173 if (!sds.busiest || sds.busiest_nr_running == 0)
3124 goto out_balanced; 3174 goto out_balanced;
3125 3175
3126 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; 3176 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
3127 3177
3128 /* 3178 /*
3129 * If the busiest group is imbalanced the below checks don't 3179 * If the busiest group is imbalanced the below checks don't
@@ -3202,7 +3252,8 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3202 3252
3203 for_each_cpu(i, sched_group_cpus(group)) { 3253 for_each_cpu(i, sched_group_cpus(group)) {
3204 unsigned long power = power_of(i); 3254 unsigned long power = power_of(i);
3205 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 3255 unsigned long capacity = DIV_ROUND_CLOSEST(power,
3256 SCHED_POWER_SCALE);
3206 unsigned long wl; 3257 unsigned long wl;
3207 3258
3208 if (!capacity) 3259 if (!capacity)
@@ -3227,7 +3278,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3227 * the load can be moved away from the cpu that is potentially 3278 * the load can be moved away from the cpu that is potentially
3228 * running at a lower capacity. 3279 * running at a lower capacity.
3229 */ 3280 */
3230 wl = (wl * SCHED_LOAD_SCALE) / power; 3281 wl = (wl * SCHED_POWER_SCALE) / power;
3231 3282
3232 if (wl > max_load) { 3283 if (wl > max_load) {
3233 max_load = wl; 3284 max_load = wl;
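(Aside, not part of the patch: the wl scaling above normalizes a runqueue's load by its cpu power so that queues of different capacity compare on equal terms. With SCHED_POWER_SCALE = 1024 and loads invented for illustration, a CPU of power 512 carrying load 2048 scores 2048 * 1024 / 512 = 4096, while a full-power CPU (1024) with the same load scores only 2048, so the lower-capacity CPU is correctly picked as the busier queue.)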
@@ -3465,6 +3516,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3465 raw_spin_unlock(&this_rq->lock); 3516 raw_spin_unlock(&this_rq->lock);
3466 3517
3467 update_shares(this_cpu); 3518 update_shares(this_cpu);
3519 rcu_read_lock();
3468 for_each_domain(this_cpu, sd) { 3520 for_each_domain(this_cpu, sd) {
3469 unsigned long interval; 3521 unsigned long interval;
3470 int balance = 1; 3522 int balance = 1;
@@ -3486,6 +3538,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3486 break; 3538 break;
3487 } 3539 }
3488 } 3540 }
3541 rcu_read_unlock();
3489 3542
3490 raw_spin_lock(&this_rq->lock); 3543 raw_spin_lock(&this_rq->lock);
3491 3544
@@ -3534,6 +3587,7 @@ static int active_load_balance_cpu_stop(void *data)
3534 double_lock_balance(busiest_rq, target_rq); 3587 double_lock_balance(busiest_rq, target_rq);
3535 3588
3536 /* Search for an sd spanning us and the target CPU. */ 3589 /* Search for an sd spanning us and the target CPU. */
3590 rcu_read_lock();
3537 for_each_domain(target_cpu, sd) { 3591 for_each_domain(target_cpu, sd) {
3538 if ((sd->flags & SD_LOAD_BALANCE) && 3592 if ((sd->flags & SD_LOAD_BALANCE) &&
3539 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) 3593 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
@@ -3549,6 +3603,7 @@ static int active_load_balance_cpu_stop(void *data)
3549 else 3603 else
3550 schedstat_inc(sd, alb_failed); 3604 schedstat_inc(sd, alb_failed);
3551 } 3605 }
3606 rcu_read_unlock();
3552 double_unlock_balance(busiest_rq, target_rq); 3607 double_unlock_balance(busiest_rq, target_rq);
3553out_unlock: 3608out_unlock:
3554 busiest_rq->active_balance = 0; 3609 busiest_rq->active_balance = 0;
@@ -3675,6 +3730,7 @@ static int find_new_ilb(int cpu)
3675{ 3730{
3676 struct sched_domain *sd; 3731 struct sched_domain *sd;
3677 struct sched_group *ilb_group; 3732 struct sched_group *ilb_group;
3733 int ilb = nr_cpu_ids;
3678 3734
3679 /* 3735 /*
3680 * Have idle load balancer selection from semi-idle packages only 3736 * Have idle load balancer selection from semi-idle packages only
@@ -3690,20 +3746,25 @@ static int find_new_ilb(int cpu)
3690 if (cpumask_weight(nohz.idle_cpus_mask) < 2) 3746 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3691 goto out_done; 3747 goto out_done;
3692 3748
3749 rcu_read_lock();
3693 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 3750 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3694 ilb_group = sd->groups; 3751 ilb_group = sd->groups;
3695 3752
3696 do { 3753 do {
3697 if (is_semi_idle_group(ilb_group)) 3754 if (is_semi_idle_group(ilb_group)) {
3698 return cpumask_first(nohz.grp_idle_mask); 3755 ilb = cpumask_first(nohz.grp_idle_mask);
3756 goto unlock;
3757 }
3699 3758
3700 ilb_group = ilb_group->next; 3759 ilb_group = ilb_group->next;
3701 3760
3702 } while (ilb_group != sd->groups); 3761 } while (ilb_group != sd->groups);
3703 } 3762 }
3763unlock:
3764 rcu_read_unlock();
3704 3765
3705out_done: 3766out_done:
3706 return nr_cpu_ids; 3767 return ilb;
3707} 3768}
3708#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 3769#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3709static inline int find_new_ilb(int call_cpu) 3770static inline int find_new_ilb(int call_cpu)
@@ -3848,6 +3909,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3848 3909
3849 update_shares(cpu); 3910 update_shares(cpu);
3850 3911
3912 rcu_read_lock();
3851 for_each_domain(cpu, sd) { 3913 for_each_domain(cpu, sd) {
3852 if (!(sd->flags & SD_LOAD_BALANCE)) 3914 if (!(sd->flags & SD_LOAD_BALANCE))
3853 continue; 3915 continue;
@@ -3893,6 +3955,7 @@ out:
3893 if (!balance) 3955 if (!balance)
3894 break; 3956 break;
3895 } 3957 }
3958 rcu_read_unlock();
3896 3959
3897 /* 3960 /*
3898 * next_balance will be updated only when there is a need. 3961 * next_balance will be updated only when there is a need.
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 68e69acc29b9..2e74677cb040 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,6 +61,14 @@ SCHED_FEAT(LB_BIAS, 1)
61SCHED_FEAT(OWNER_SPIN, 1) 61SCHED_FEAT(OWNER_SPIN, 1)
62 62
63/* 63/*
64 * Decrement CPU power based on irq activity 64 * Decrement CPU power based on time not spent running tasks
65 */ 65 */
66SCHED_FEAT(NONIRQ_POWER, 1) 66SCHED_FEAT(NONTASK_POWER, 1)
67
68/*
69 * Queue remote wakeups on the target CPU and process them
70 * using the scheduler IPI. Reduces rq->lock contention/bounces.
71 */
72SCHED_FEAT(TTWU_QUEUE, 1)
73
74SCHED_FEAT(FORCE_SD_OVERLAP, 0)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a776a6396427..0a51882534ea 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -7,7 +7,7 @@
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int 9static int
10select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 10select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
11{ 11{
12 return task_cpu(p); /* IDLE tasks as never migrated */ 12 return task_cpu(p); /* IDLE tasks as never migrated */
13} 13}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index e7cebdc65f82..97540f0c9e47 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,26 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
184} 184}
185 185
186typedef struct task_group *rt_rq_iter_t;
187
188static inline struct task_group *next_task_group(struct task_group *tg)
189{
190 do {
191 tg = list_entry_rcu(tg->list.next,
192 typeof(struct task_group), list);
193 } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
194
195 if (&tg->list == &task_groups)
196 tg = NULL;
197
198 return tg;
199}
200
201#define for_each_rt_rq(rt_rq, iter, rq) \
202 for (iter = container_of(&task_groups, typeof(*iter), list); \
203 (iter = next_task_group(iter)) && \
204 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
205
186static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) 206static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
187{ 207{
188 list_add_rcu(&rt_rq->leaf_rt_rq_list, 208 list_add_rcu(&rt_rq->leaf_rt_rq_list,
@@ -288,6 +308,11 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
288 return ktime_to_ns(def_rt_bandwidth.rt_period); 308 return ktime_to_ns(def_rt_bandwidth.rt_period);
289} 309}
290 310
311typedef struct rt_rq *rt_rq_iter_t;
312
313#define for_each_rt_rq(rt_rq, iter, rq) \
314 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
315
291static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) 316static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
292{ 317{
293} 318}
@@ -402,12 +427,13 @@ next:
402static void __disable_runtime(struct rq *rq) 427static void __disable_runtime(struct rq *rq)
403{ 428{
404 struct root_domain *rd = rq->rd; 429 struct root_domain *rd = rq->rd;
430 rt_rq_iter_t iter;
405 struct rt_rq *rt_rq; 431 struct rt_rq *rt_rq;
406 432
407 if (unlikely(!scheduler_running)) 433 if (unlikely(!scheduler_running))
408 return; 434 return;
409 435
410 for_each_leaf_rt_rq(rt_rq, rq) { 436 for_each_rt_rq(rt_rq, iter, rq) {
411 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 437 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
412 s64 want; 438 s64 want;
413 int i; 439 int i;
@@ -487,6 +513,7 @@ static void disable_runtime(struct rq *rq)
487 513
488static void __enable_runtime(struct rq *rq) 514static void __enable_runtime(struct rq *rq)
489{ 515{
516 rt_rq_iter_t iter;
490 struct rt_rq *rt_rq; 517 struct rt_rq *rt_rq;
491 518
492 if (unlikely(!scheduler_running)) 519 if (unlikely(!scheduler_running))
@@ -495,7 +522,7 @@ static void __enable_runtime(struct rq *rq)
495 /* 522 /*
496 * Reset each runqueue's bandwidth settings 523 * Reset each runqueue's bandwidth settings
497 */ 524 */
498 for_each_leaf_rt_rq(rt_rq, rq) { 525 for_each_rt_rq(rt_rq, iter, rq) {
499 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 526 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
500 527
501 raw_spin_lock(&rt_b->rt_runtime_lock); 528 raw_spin_lock(&rt_b->rt_runtime_lock);
@@ -562,6 +589,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
562 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { 589 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
563 rt_rq->rt_throttled = 0; 590 rt_rq->rt_throttled = 0;
564 enqueue = 1; 591 enqueue = 1;
592
593 /*
594 * Force a clock update if the CPU was idle,
595 * lest wakeup -> unthrottle time accumulate.
596 */
597 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
598 rq->skip_clock_update = -1;
565 } 599 }
566 if (rt_rq->rt_time || rt_rq->rt_nr_running) 600 if (rt_rq->rt_time || rt_rq->rt_nr_running)
567 idle = 0; 601 idle = 0;
@@ -977,13 +1011,23 @@ static void yield_task_rt(struct rq *rq)
977static int find_lowest_rq(struct task_struct *task); 1011static int find_lowest_rq(struct task_struct *task);
978 1012
979static int 1013static int
980select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 1014select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
981{ 1015{
1016 struct task_struct *curr;
1017 struct rq *rq;
1018 int cpu;
1019
982 if (sd_flag != SD_BALANCE_WAKE) 1020 if (sd_flag != SD_BALANCE_WAKE)
983 return smp_processor_id(); 1021 return smp_processor_id();
984 1022
1023 cpu = task_cpu(p);
1024 rq = cpu_rq(cpu);
1025
1026 rcu_read_lock();
1027 curr = ACCESS_ONCE(rq->curr); /* unlocked access */
1028
985 /* 1029 /*
986 * If the current task is an RT task, then 1030 * If the current task on @p's runqueue is an RT task, then
987 * try to see if we can wake this RT task up on another 1031 * try to see if we can wake this RT task up on another
988 * runqueue. Otherwise simply start this RT task 1032 * runqueue. Otherwise simply start this RT task
989 * on its current runqueue. 1033 * on its current runqueue.
@@ -997,21 +1041,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
997 * lock? 1041 * lock?
998 * 1042 *
999 * For equal prio tasks, we just let the scheduler sort it out. 1043 * For equal prio tasks, we just let the scheduler sort it out.
1044 *
1045 * Otherwise, just let it ride on the affined RQ and the
1046 * post-schedule router will push the preempted task away
1047 *
 1048 * This test is optimistic; if we get it wrong, the load-balancer
1049 * will have to sort it out.
1000 */ 1050 */
1001 if (unlikely(rt_task(rq->curr)) && 1051 if (curr && unlikely(rt_task(curr)) &&
1002 (rq->curr->rt.nr_cpus_allowed < 2 || 1052 (curr->rt.nr_cpus_allowed < 2 ||
1003 rq->curr->prio < p->prio) && 1053 curr->prio < p->prio) &&
1004 (p->rt.nr_cpus_allowed > 1)) { 1054 (p->rt.nr_cpus_allowed > 1)) {
1005 int cpu = find_lowest_rq(p); 1055 int target = find_lowest_rq(p);
1006 1056
1007 return (cpu == -1) ? task_cpu(p) : cpu; 1057 if (target != -1)
1058 cpu = target;
1008 } 1059 }
1060 rcu_read_unlock();
1009 1061
1010 /* 1062 return cpu;
1011 * Otherwise, just let it ride on the affined RQ and the
1012 * post-schedule router will push the preempted task away
1013 */
1014 return task_cpu(p);
1015} 1063}
1016 1064
1017static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1065static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
@@ -1060,7 +1108,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
1060 * to move current somewhere else, making room for our non-migratable 1108 * to move current somewhere else, making room for our non-migratable
1061 * task. 1109 * task.
1062 */ 1110 */
1063 if (p->prio == rq->curr->prio && !need_resched()) 1111 if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1064 check_preempt_equal_prio(rq, p); 1112 check_preempt_equal_prio(rq, p);
1065#endif 1113#endif
1066} 1114}
@@ -1090,7 +1138,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1090 1138
1091 rt_rq = &rq->rt; 1139 rt_rq = &rq->rt;
1092 1140
1093 if (unlikely(!rt_rq->rt_nr_running)) 1141 if (!rt_rq->rt_nr_running)
1094 return NULL; 1142 return NULL;
1095 1143
1096 if (rt_rq_throttled(rt_rq)) 1144 if (rt_rq_throttled(rt_rq))
@@ -1136,7 +1184,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1136 * The previous task needs to be made eligible for pushing 1184 * The previous task needs to be made eligible for pushing
1137 * if it is still active 1185 * if it is still active
1138 */ 1186 */
1139 if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) 1187 if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
1140 enqueue_pushable_task(rq, p); 1188 enqueue_pushable_task(rq, p);
1141} 1189}
1142 1190
@@ -1203,6 +1251,10 @@ static int find_lowest_rq(struct task_struct *task)
1203 int this_cpu = smp_processor_id(); 1251 int this_cpu = smp_processor_id();
1204 int cpu = task_cpu(task); 1252 int cpu = task_cpu(task);
1205 1253
1254 /* Make sure the mask is initialized first */
1255 if (unlikely(!lowest_mask))
1256 return -1;
1257
1206 if (task->rt.nr_cpus_allowed == 1) 1258 if (task->rt.nr_cpus_allowed == 1)
1207 return -1; /* No other targets possible */ 1259 return -1; /* No other targets possible */
1208 1260
@@ -1227,6 +1279,7 @@ static int find_lowest_rq(struct task_struct *task)
1227 if (!cpumask_test_cpu(this_cpu, lowest_mask)) 1279 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1228 this_cpu = -1; /* Skip this_cpu opt if not among lowest */ 1280 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1229 1281
1282 rcu_read_lock();
1230 for_each_domain(cpu, sd) { 1283 for_each_domain(cpu, sd) {
1231 if (sd->flags & SD_WAKE_AFFINE) { 1284 if (sd->flags & SD_WAKE_AFFINE) {
1232 int best_cpu; 1285 int best_cpu;
@@ -1236,15 +1289,20 @@ static int find_lowest_rq(struct task_struct *task)
1236 * remote processor. 1289 * remote processor.
1237 */ 1290 */
1238 if (this_cpu != -1 && 1291 if (this_cpu != -1 &&
1239 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) 1292 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1293 rcu_read_unlock();
1240 return this_cpu; 1294 return this_cpu;
1295 }
1241 1296
1242 best_cpu = cpumask_first_and(lowest_mask, 1297 best_cpu = cpumask_first_and(lowest_mask,
1243 sched_domain_span(sd)); 1298 sched_domain_span(sd));
1244 if (best_cpu < nr_cpu_ids) 1299 if (best_cpu < nr_cpu_ids) {
1300 rcu_read_unlock();
1245 return best_cpu; 1301 return best_cpu;
1302 }
1246 } 1303 }
1247 } 1304 }
1305 rcu_read_unlock();
1248 1306
1249 /* 1307 /*
1250 * And finally, if there were no matches within the domains 1308 * And finally, if there were no matches within the domains
@@ -1287,7 +1345,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1287 !cpumask_test_cpu(lowest_rq->cpu, 1345 !cpumask_test_cpu(lowest_rq->cpu,
1288 &task->cpus_allowed) || 1346 &task->cpus_allowed) ||
1289 task_running(rq, task) || 1347 task_running(rq, task) ||
1290 !task->se.on_rq)) { 1348 !task->on_rq)) {
1291 1349
1292 raw_spin_unlock(&lowest_rq->lock); 1350 raw_spin_unlock(&lowest_rq->lock);
1293 lowest_rq = NULL; 1351 lowest_rq = NULL;
@@ -1321,7 +1379,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1321 BUG_ON(task_current(rq, p)); 1379 BUG_ON(task_current(rq, p));
1322 BUG_ON(p->rt.nr_cpus_allowed <= 1); 1380 BUG_ON(p->rt.nr_cpus_allowed <= 1);
1323 1381
1324 BUG_ON(!p->se.on_rq); 1382 BUG_ON(!p->on_rq);
1325 BUG_ON(!rt_task(p)); 1383 BUG_ON(!rt_task(p));
1326 1384
1327 return p; 1385 return p;
@@ -1467,7 +1525,7 @@ static int pull_rt_task(struct rq *this_rq)
1467 */ 1525 */
1468 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 1526 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1469 WARN_ON(p == src_rq->curr); 1527 WARN_ON(p == src_rq->curr);
1470 WARN_ON(!p->se.on_rq); 1528 WARN_ON(!p->on_rq);
1471 1529
1472 /* 1530 /*
1473 * There's a chance that p is higher in priority 1531 * There's a chance that p is higher in priority
@@ -1502,7 +1560,7 @@ skip:
1502static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) 1560static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1503{ 1561{
1504 /* Try to pull RT tasks here if we lower this rq's prio */ 1562 /* Try to pull RT tasks here if we lower this rq's prio */
1505 if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) 1563 if (rq->rt.highest_prio.curr > prev->prio)
1506 pull_rt_task(rq); 1564 pull_rt_task(rq);
1507} 1565}
1508 1566
@@ -1538,7 +1596,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1538 * Update the migration status of the RQ if we have an RT task 1596 * Update the migration status of the RQ if we have an RT task
1539 * which is running AND changing its weight value. 1597 * which is running AND changing its weight value.
1540 */ 1598 */
1541 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { 1599 if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1542 struct rq *rq = task_rq(p); 1600 struct rq *rq = task_rq(p);
1543 1601
1544 if (!task_current(rq, p)) { 1602 if (!task_current(rq, p)) {
@@ -1608,7 +1666,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1608 * we may need to handle the pulling of RT tasks 1666 * we may need to handle the pulling of RT tasks
1609 * now. 1667 * now.
1610 */ 1668 */
1611 if (p->se.on_rq && !rq->rt.rt_nr_running) 1669 if (p->on_rq && !rq->rt.rt_nr_running)
1612 pull_rt_task(rq); 1670 pull_rt_task(rq);
1613} 1671}
1614 1672
@@ -1638,7 +1696,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1638 * If that current running task is also an RT task 1696 * If that current running task is also an RT task
1639 * then see if we can move to another run queue. 1697 * then see if we can move to another run queue.
1640 */ 1698 */
1641 if (p->se.on_rq && rq->curr != p) { 1699 if (p->on_rq && rq->curr != p) {
1642#ifdef CONFIG_SMP 1700#ifdef CONFIG_SMP
1643 if (rq->rt.overloaded && push_rt_task(rq) && 1701 if (rq->rt.overloaded && push_rt_task(rq) &&
1644 /* Don't resched if we changed runqueues */ 1702 /* Don't resched if we changed runqueues */
@@ -1657,7 +1715,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1657static void 1715static void
1658prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) 1716prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1659{ 1717{
1660 if (!p->se.on_rq) 1718 if (!p->on_rq)
1661 return; 1719 return;
1662 1720
1663 if (rq->curr == p) { 1721 if (rq->curr == p) {
@@ -1796,10 +1854,11 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1796 1854
1797static void print_rt_stats(struct seq_file *m, int cpu) 1855static void print_rt_stats(struct seq_file *m, int cpu)
1798{ 1856{
1857 rt_rq_iter_t iter;
1799 struct rt_rq *rt_rq; 1858 struct rt_rq *rt_rq;
1800 1859
1801 rcu_read_lock(); 1860 rcu_read_lock();
1802 for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) 1861 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
1803 print_rt_rq(m, cpu, rt_rq); 1862 print_rt_rq(m, cpu, rt_rq);
1804 rcu_read_unlock(); 1863 rcu_read_unlock();
1805} 1864}
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 48ddf431db0e..331e01bcd026 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -37,7 +37,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
37 37
38#ifdef CONFIG_SMP 38#ifdef CONFIG_SMP
39 /* domain-specific stats */ 39 /* domain-specific stats */
40 preempt_disable(); 40 rcu_read_lock();
41 for_each_domain(cpu, sd) { 41 for_each_domain(cpu, sd) {
42 enum cpu_idle_type itype; 42 enum cpu_idle_type itype;
43 43
@@ -64,7 +64,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
64 sd->ttwu_wake_remote, sd->ttwu_move_affine, 64 sd->ttwu_wake_remote, sd->ttwu_move_affine,
65 sd->ttwu_move_balance); 65 sd->ttwu_move_balance);
66 } 66 }
67 preempt_enable(); 67 rcu_read_unlock();
68#endif 68#endif
69 } 69 }
70 kfree(mask_str); 70 kfree(mask_str);
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 1ba2bd40fdac..6f437632afab 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -9,8 +9,7 @@
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
11static int 11static int
12select_task_rq_stop(struct rq *rq, struct task_struct *p, 12select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
13 int sd_flag, int flags)
14{ 13{
15 return task_cpu(p); /* stop tasks as never migrate */ 14 return task_cpu(p); /* stop tasks as never migrate */
16} 15}
@@ -26,7 +25,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
26{ 25{
27 struct task_struct *stop = rq->stop; 26 struct task_struct *stop = rq->stop;
28 27
29 if (stop && stop->se.on_rq) 28 if (stop && stop->on_rq)
30 return stop; 29 return stop;
31 30
32 return NULL; 31 return NULL;
diff --git a/kernel/signal.c b/kernel/signal.c
index 7165af5f1b11..291c9700be75 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -87,7 +87,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
87 /* 87 /*
88 * Tracers may want to know about even ignored signals. 88 * Tracers may want to know about even ignored signals.
89 */ 89 */
90 return !tracehook_consider_ignored_signal(t, sig); 90 return !t->ptrace;
91} 91}
92 92
93/* 93/*
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
124 124
125static int recalc_sigpending_tsk(struct task_struct *t) 125static int recalc_sigpending_tsk(struct task_struct *t)
126{ 126{
127 if (t->signal->group_stop_count > 0 || 127 if ((t->jobctl & JOBCTL_PENDING_MASK) ||
128 PENDING(&t->pending, &t->blocked) || 128 PENDING(&t->pending, &t->blocked) ||
129 PENDING(&t->signal->shared_pending, &t->blocked)) { 129 PENDING(&t->signal->shared_pending, &t->blocked)) {
130 set_tsk_thread_flag(t, TIF_SIGPENDING); 130 set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -150,9 +150,7 @@ void recalc_sigpending_and_wake(struct task_struct *t)
150 150
151void recalc_sigpending(void) 151void recalc_sigpending(void)
152{ 152{
153 if (unlikely(tracehook_force_sigpending())) 153 if (!recalc_sigpending_tsk(current) && !freezing(current))
154 set_thread_flag(TIF_SIGPENDING);
155 else if (!recalc_sigpending_tsk(current) && !freezing(current))
156 clear_thread_flag(TIF_SIGPENDING); 154 clear_thread_flag(TIF_SIGPENDING);
157 155
158} 156}
@@ -223,6 +221,129 @@ static inline void print_dropped_signal(int sig)
223 current->comm, current->pid, sig); 221 current->comm, current->pid, sig);
224} 222}
225 223
224/**
225 * task_set_jobctl_pending - set jobctl pending bits
226 * @task: target task
227 * @mask: pending bits to set
228 *
 229 * Set @mask in @task->jobctl. @mask must be a subset of
230 * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK |
231 * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is
232 * cleared. If @task is already being killed or exiting, this function
 233 * becomes a noop.
234 *
235 * CONTEXT:
236 * Must be called with @task->sighand->siglock held.
237 *
238 * RETURNS:
 239 * %true if @mask was set, %false if it became a noop because @task was dying.
240 */
241bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
242{
243 BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
244 JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
245 BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));
246
247 if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING)))
248 return false;
249
250 if (mask & JOBCTL_STOP_SIGMASK)
251 task->jobctl &= ~JOBCTL_STOP_SIGMASK;
252
253 task->jobctl |= mask;
254 return true;
255}
256
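A hedged sketch of how a caller is expected to use the helper above to queue a stop on another thread; it mirrors the do_signal_stop() hunk further below, and the wrapper function itself is illustrative:

/* Illustrative only: caller must hold t->sighand->siglock, as the
 * kernel-doc above requires.  JOBCTL_* flags are from this patch. */
static void example_queue_stop(struct task_struct *t, int signr)
{
	if (task_set_jobctl_pending(t, signr | JOBCTL_STOP_PENDING |
				       JOBCTL_STOP_CONSUME))
		signal_wake_up(t, 0);	/* make the task notice the pending stop */
}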
257/**
258 * task_clear_jobctl_trapping - clear jobctl trapping bit
259 * @task: target task
260 *
261 * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED.
262 * Clear it and wake up the ptracer. Note that we don't need any further
263 * locking. @task->siglock guarantees that @task->parent points to the
264 * ptracer.
265 *
266 * CONTEXT:
267 * Must be called with @task->sighand->siglock held.
268 */
269void task_clear_jobctl_trapping(struct task_struct *task)
270{
271 if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
272 task->jobctl &= ~JOBCTL_TRAPPING;
273 wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
274 }
275}
276
277/**
278 * task_clear_jobctl_pending - clear jobctl pending bits
279 * @task: target task
280 * @mask: pending bits to clear
281 *
282 * Clear @mask from @task->jobctl. @mask must be subset of
283 * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other
284 * STOP bits are cleared together.
285 *
286 * If clearing of @mask leaves no stop or trap pending, this function calls
287 * task_clear_jobctl_trapping().
288 *
289 * CONTEXT:
290 * Must be called with @task->sighand->siglock held.
291 */
292void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
293{
294 BUG_ON(mask & ~JOBCTL_PENDING_MASK);
295
296 if (mask & JOBCTL_STOP_PENDING)
297 mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED;
298
299 task->jobctl &= ~mask;
300
301 if (!(task->jobctl & JOBCTL_PENDING_MASK))
302 task_clear_jobctl_trapping(task);
303}
304
305/**
306 * task_participate_group_stop - participate in a group stop
307 * @task: task participating in a group stop
308 *
309 * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop.
310 * Group stop states are cleared and the group stop count is consumed if
311 * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group
312 * stop, the appropriate %SIGNAL_* flags are set.
313 *
314 * CONTEXT:
315 * Must be called with @task->sighand->siglock held.
316 *
317 * RETURNS:
318 * %true if group stop completion should be notified to the parent, %false
319 * otherwise.
320 */
321static bool task_participate_group_stop(struct task_struct *task)
322{
323 struct signal_struct *sig = task->signal;
324 bool consume = task->jobctl & JOBCTL_STOP_CONSUME;
325
326 WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING));
327
328 task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING);
329
330 if (!consume)
331 return false;
332
333 if (!WARN_ON_ONCE(sig->group_stop_count == 0))
334 sig->group_stop_count--;
335
336 /*
337 * Tell the caller to notify completion iff we are entering into a
338 * fresh group stop. Read comment in do_signal_stop() for details.
339 */
340 if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) {
341 sig->flags = SIGNAL_STOP_STOPPED;
342 return true;
343 }
344 return false;
345}
346
226/* 347/*
227 * allocate a new signal queue record 348 * allocate a new signal queue record
228 * - this may be called without locks if and only if t == current, otherwise an 349 * - this may be called without locks if and only if t == current, otherwise an
@@ -372,7 +493,8 @@ int unhandled_signal(struct task_struct *tsk, int sig)
372 return 1; 493 return 1;
373 if (handler != SIG_IGN && handler != SIG_DFL) 494 if (handler != SIG_IGN && handler != SIG_DFL)
374 return 0; 495 return 0;
375 return !tracehook_consider_fatal_signal(tsk, sig); 496 /* if ptraced, let the tracer determine */
497 return !tsk->ptrace;
376} 498}
377 499
378/* 500/*
@@ -527,7 +649,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
527 * is to alert stop-signal processing code when another 649 * is to alert stop-signal processing code when another
528 * processor has come along and cleared the flag. 650 * processor has come along and cleared the flag.
529 */ 651 */
530 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 652 current->jobctl |= JOBCTL_STOP_DEQUEUED;
531 } 653 }
532 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { 654 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
533 /* 655 /*
@@ -592,7 +714,7 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
592 if (sigisemptyset(&m)) 714 if (sigisemptyset(&m))
593 return 0; 715 return 0;
594 716
595 signandsets(&s->signal, &s->signal, mask); 717 sigandnsets(&s->signal, &s->signal, mask);
596 list_for_each_entry_safe(q, n, &s->list, list) { 718 list_for_each_entry_safe(q, n, &s->list, list) {
597 if (sigismember(mask, q->info.si_signo)) { 719 if (sigismember(mask, q->info.si_signo)) {
598 list_del_init(&q->list); 720 list_del_init(&q->list);
@@ -696,6 +818,32 @@ static int check_kill_permission(int sig, struct siginfo *info,
696 return security_task_kill(t, info, sig, 0); 818 return security_task_kill(t, info, sig, 0);
697} 819}
698 820
821/**
822 * ptrace_trap_notify - schedule trap to notify ptracer
823 * @t: tracee wanting to notify tracer
824 *
 825 * This function schedules a sticky ptrace trap which is cleared on the next
826 * TRAP_STOP to notify ptracer of an event. @t must have been seized by
827 * ptracer.
828 *
829 * If @t is running, STOP trap will be taken. If trapped for STOP and
830 * ptracer is listening for events, tracee is woken up so that it can
831 * re-trap for the new event. If trapped otherwise, STOP trap will be
832 * eventually taken without returning to userland after the existing traps
833 * are finished by PTRACE_CONT.
834 *
835 * CONTEXT:
836 * Must be called with @task->sighand->siglock held.
837 */
838static void ptrace_trap_notify(struct task_struct *t)
839{
840 WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
841 assert_spin_locked(&t->sighand->siglock);
842
843 task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
844 signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
845}
846
699/* 847/*
700 * Handle magic process-wide effects of stop/continue signals. Unlike 848 * Handle magic process-wide effects of stop/continue signals. Unlike
701 * the signal actions, these happen immediately at signal-generation 849 * the signal actions, these happen immediately at signal-generation
@@ -727,34 +875,17 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
727 } else if (sig == SIGCONT) { 875 } else if (sig == SIGCONT) {
728 unsigned int why; 876 unsigned int why;
729 /* 877 /*
730 * Remove all stop signals from all queues, 878 * Remove all stop signals from all queues, wake all threads.
731 * and wake all threads.
732 */ 879 */
733 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); 880 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
734 t = p; 881 t = p;
735 do { 882 do {
736 unsigned int state; 883 task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
737 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); 884 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
738 /* 885 if (likely(!(t->ptrace & PT_SEIZED)))
739 * If there is a handler for SIGCONT, we must make 886 wake_up_state(t, __TASK_STOPPED);
740 * sure that no thread returns to user mode before 887 else
741 * we post the signal, in case it was the only 888 ptrace_trap_notify(t);
742 * thread eligible to run the signal handler--then
743 * it must not do anything between resuming and
744 * running the handler. With the TIF_SIGPENDING
745 * flag set, the thread will pause and acquire the
746 * siglock that we hold now and until we've queued
747 * the pending signal.
748 *
749 * Wake up the stopped thread _after_ setting
750 * TIF_SIGPENDING
751 */
752 state = __TASK_STOPPED;
753 if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) {
754 set_tsk_thread_flag(t, TIF_SIGPENDING);
755 state |= TASK_INTERRUPTIBLE;
756 }
757 wake_up_state(t, state);
758 } while_each_thread(p, t); 889 } while_each_thread(p, t);
759 890
760 /* 891 /*
@@ -780,13 +911,6 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
780 signal->flags = why | SIGNAL_STOP_CONTINUED; 911 signal->flags = why | SIGNAL_STOP_CONTINUED;
781 signal->group_stop_count = 0; 912 signal->group_stop_count = 0;
782 signal->group_exit_code = 0; 913 signal->group_exit_code = 0;
783 } else {
784 /*
785 * We are not stopped, but there could be a stop
786 * signal in the middle of being processed after
787 * being removed from the queue. Clear that too.
788 */
789 signal->flags &= ~SIGNAL_STOP_DEQUEUED;
790 } 914 }
791 } 915 }
792 916
@@ -858,8 +982,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
858 if (sig_fatal(p, sig) && 982 if (sig_fatal(p, sig) &&
859 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && 983 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
860 !sigismember(&t->real_blocked, sig) && 984 !sigismember(&t->real_blocked, sig) &&
861 (sig == SIGKILL || 985 (sig == SIGKILL || !t->ptrace)) {
862 !tracehook_consider_fatal_signal(t, sig))) {
863 /* 986 /*
864 * This signal will be fatal to the whole group. 987 * This signal will be fatal to the whole group.
865 */ 988 */
@@ -875,6 +998,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
875 signal->group_stop_count = 0; 998 signal->group_stop_count = 0;
876 t = p; 999 t = p;
877 do { 1000 do {
1001 task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
878 sigaddset(&t->pending.signal, SIGKILL); 1002 sigaddset(&t->pending.signal, SIGKILL);
879 signal_wake_up(t, 1); 1003 signal_wake_up(t, 1);
880 } while_each_thread(p, t); 1004 } while_each_thread(p, t);
@@ -1109,6 +1233,7 @@ int zap_other_threads(struct task_struct *p)
1109 p->signal->group_stop_count = 0; 1233 p->signal->group_stop_count = 0;
1110 1234
1111 while_each_thread(p, t) { 1235 while_each_thread(p, t) {
1236 task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
1112 count++; 1237 count++;
1113 1238
1114 /* Don't bother with already dead threads */ 1239 /* Don't bother with already dead threads */
@@ -1126,18 +1251,25 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1126{ 1251{
1127 struct sighand_struct *sighand; 1252 struct sighand_struct *sighand;
1128 1253
1129 rcu_read_lock();
1130 for (;;) { 1254 for (;;) {
1255 local_irq_save(*flags);
1256 rcu_read_lock();
1131 sighand = rcu_dereference(tsk->sighand); 1257 sighand = rcu_dereference(tsk->sighand);
1132 if (unlikely(sighand == NULL)) 1258 if (unlikely(sighand == NULL)) {
1259 rcu_read_unlock();
1260 local_irq_restore(*flags);
1133 break; 1261 break;
1262 }
1134 1263
1135 spin_lock_irqsave(&sighand->siglock, *flags); 1264 spin_lock(&sighand->siglock);
1136 if (likely(sighand == tsk->sighand)) 1265 if (likely(sighand == tsk->sighand)) {
1266 rcu_read_unlock();
1137 break; 1267 break;
1138 spin_unlock_irqrestore(&sighand->siglock, *flags); 1268 }
1269 spin_unlock(&sighand->siglock);
1270 rcu_read_unlock();
1271 local_irq_restore(*flags);
1139 } 1272 }
1140 rcu_read_unlock();
1141 1273
1142 return sighand; 1274 return sighand;
1143} 1275}
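The loop above now disables interrupts before entering the RCU read-side section and takes the per-sighand lock raw. Most callers go through the lock_task_sighand()/unlock_task_sighand() wrappers; a sketch of that caller pattern, with an illustrative critical section:

#include <linux/errno.h>
#include <linux/sched.h>

static int example_read_signal_state(struct task_struct *tsk)
{
	struct sighand_struct *sighand;
	unsigned long flags;
	int pending;

	sighand = lock_task_sighand(tsk, &flags);	/* wraps __lock_task_sighand() */
	if (!sighand)
		return -ESRCH;		/* task is exiting, ->sighand already gone */

	pending = signal_pending(tsk);	/* siglock pins the signal state */
	unlock_task_sighand(tsk, &flags);

	return pending;
}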
@@ -1452,22 +1584,22 @@ ret:
1452 * Let a parent know about the death of a child. 1584 * Let a parent know about the death of a child.
1453 * For a stopped/continued status change, use do_notify_parent_cldstop instead. 1585 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
1454 * 1586 *
1455 * Returns -1 if our parent ignored us and so we've switched to 1587 * Returns true if our parent ignored us and so we've switched to
1456 * self-reaping, or else @sig. 1588 * self-reaping.
1457 */ 1589 */
1458int do_notify_parent(struct task_struct *tsk, int sig) 1590bool do_notify_parent(struct task_struct *tsk, int sig)
1459{ 1591{
1460 struct siginfo info; 1592 struct siginfo info;
1461 unsigned long flags; 1593 unsigned long flags;
1462 struct sighand_struct *psig; 1594 struct sighand_struct *psig;
1463 int ret = sig; 1595 bool autoreap = false;
1464 1596
1465 BUG_ON(sig == -1); 1597 BUG_ON(sig == -1);
1466 1598
1467 /* do_notify_parent_cldstop should have been called instead. */ 1599 /* do_notify_parent_cldstop should have been called instead. */
1468 BUG_ON(task_is_stopped_or_traced(tsk)); 1600 BUG_ON(task_is_stopped_or_traced(tsk));
1469 1601
1470 BUG_ON(!task_ptrace(tsk) && 1602 BUG_ON(!tsk->ptrace &&
1471 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1603 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1472 1604
1473 info.si_signo = sig; 1605 info.si_signo = sig;
@@ -1506,7 +1638,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1506 1638
1507 psig = tsk->parent->sighand; 1639 psig = tsk->parent->sighand;
1508 spin_lock_irqsave(&psig->siglock, flags); 1640 spin_lock_irqsave(&psig->siglock, flags);
1509 if (!task_ptrace(tsk) && sig == SIGCHLD && 1641 if (!tsk->ptrace && sig == SIGCHLD &&
1510 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || 1642 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
1511 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { 1643 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
1512 /* 1644 /*
@@ -1524,28 +1656,42 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1524 * is implementation-defined: we do (if you don't want 1656 * is implementation-defined: we do (if you don't want
1525 * it, just use SIG_IGN instead). 1657 * it, just use SIG_IGN instead).
1526 */ 1658 */
1527 ret = tsk->exit_signal = -1; 1659 autoreap = true;
1528 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) 1660 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
1529 sig = -1; 1661 sig = 0;
1530 } 1662 }
1531 if (valid_signal(sig) && sig > 0) 1663 if (valid_signal(sig) && sig)
1532 __group_send_sig_info(sig, &info, tsk->parent); 1664 __group_send_sig_info(sig, &info, tsk->parent);
1533 __wake_up_parent(tsk, tsk->parent); 1665 __wake_up_parent(tsk, tsk->parent);
1534 spin_unlock_irqrestore(&psig->siglock, flags); 1666 spin_unlock_irqrestore(&psig->siglock, flags);
1535 1667
1536 return ret; 1668 return autoreap;
1537} 1669}
1538 1670
1539static void do_notify_parent_cldstop(struct task_struct *tsk, int why) 1671/**
1672 * do_notify_parent_cldstop - notify parent of stopped/continued state change
1673 * @tsk: task reporting the state change
1674 * @for_ptracer: the notification is for ptracer
1675 * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report
1676 *
1677 * Notify @tsk's parent that the stopped/continued state has changed. If
 1678 * @for_ptracer is %false, @tsk's group leader notifies its real parent.
1679 * If %true, @tsk reports to @tsk->parent which should be the ptracer.
1680 *
1681 * CONTEXT:
1682 * Must be called with tasklist_lock at least read locked.
1683 */
1684static void do_notify_parent_cldstop(struct task_struct *tsk,
1685 bool for_ptracer, int why)
1540{ 1686{
1541 struct siginfo info; 1687 struct siginfo info;
1542 unsigned long flags; 1688 unsigned long flags;
1543 struct task_struct *parent; 1689 struct task_struct *parent;
1544 struct sighand_struct *sighand; 1690 struct sighand_struct *sighand;
1545 1691
1546 if (task_ptrace(tsk)) 1692 if (for_ptracer) {
1547 parent = tsk->parent; 1693 parent = tsk->parent;
1548 else { 1694 } else {
1549 tsk = tsk->group_leader; 1695 tsk = tsk->group_leader;
1550 parent = tsk->real_parent; 1696 parent = tsk->real_parent;
1551 } 1697 }
@@ -1592,7 +1738,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1592 1738
1593static inline int may_ptrace_stop(void) 1739static inline int may_ptrace_stop(void)
1594{ 1740{
1595 if (!likely(task_ptrace(current))) 1741 if (!likely(current->ptrace))
1596 return 0; 1742 return 0;
1597 /* 1743 /*
1598 * Are we in the middle of do_coredump? 1744 * Are we in the middle of do_coredump?
@@ -1631,10 +1777,12 @@ static int sigkill_pending(struct task_struct *tsk)
1631 * If we actually decide not to stop at all because the tracer 1777 * If we actually decide not to stop at all because the tracer
1632 * is gone, we keep current->exit_code unless clear_code. 1778 * is gone, we keep current->exit_code unless clear_code.
1633 */ 1779 */
1634static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) 1780static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1635 __releases(&current->sighand->siglock) 1781 __releases(&current->sighand->siglock)
1636 __acquires(&current->sighand->siglock) 1782 __acquires(&current->sighand->siglock)
1637{ 1783{
1784 bool gstop_done = false;
1785
1638 if (arch_ptrace_stop_needed(exit_code, info)) { 1786 if (arch_ptrace_stop_needed(exit_code, info)) {
1639 /* 1787 /*
1640 * The arch code has something special to do before a 1788 * The arch code has something special to do before a
@@ -1655,21 +1803,52 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1655 } 1803 }
1656 1804
1657 /* 1805 /*
1658 * If there is a group stop in progress, 1806 * We're committing to trapping. TRACED should be visible before
1659 * we must participate in the bookkeeping. 1807 * TRAPPING is cleared; otherwise, the tracer might fail do_wait().
1808 * Also, transition to TRACED and updates to ->jobctl should be
1809 * atomic with respect to siglock and should be done after the arch
1810 * hook as siglock is released and regrabbed across it.
1660 */ 1811 */
1661 if (current->signal->group_stop_count > 0) 1812 set_current_state(TASK_TRACED);
1662 --current->signal->group_stop_count;
1663 1813
1664 current->last_siginfo = info; 1814 current->last_siginfo = info;
1665 current->exit_code = exit_code; 1815 current->exit_code = exit_code;
1666 1816
1667 /* Let the debugger run. */ 1817 /*
1668 __set_current_state(TASK_TRACED); 1818 * If @why is CLD_STOPPED, we're trapping to participate in a group
 1819 * stop. Do the bookkeeping. Note that if SIGCONT was delivered
1820 * across siglock relocks since INTERRUPT was scheduled, PENDING
1821 * could be clear now. We act as if SIGCONT is received after
1822 * TASK_TRACED is entered - ignore it.
1823 */
1824 if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
1825 gstop_done = task_participate_group_stop(current);
1826
1827 /* any trap clears pending STOP trap, STOP trap clears NOTIFY */
1828 task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP);
1829 if (info && info->si_code >> 8 == PTRACE_EVENT_STOP)
1830 task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY);
1831
1832 /* entering a trap, clear TRAPPING */
1833 task_clear_jobctl_trapping(current);
1834
1669 spin_unlock_irq(&current->sighand->siglock); 1835 spin_unlock_irq(&current->sighand->siglock);
1670 read_lock(&tasklist_lock); 1836 read_lock(&tasklist_lock);
1671 if (may_ptrace_stop()) { 1837 if (may_ptrace_stop()) {
1672 do_notify_parent_cldstop(current, CLD_TRAPPED); 1838 /*
1839 * Notify parents of the stop.
1840 *
1841 * While ptraced, there are two parents - the ptracer and
1842 * the real_parent of the group_leader. The ptracer should
1843 * know about every stop while the real parent is only
1844 * interested in the completion of group stop. The states
1845 * for the two don't interact with each other. Notify
 1846 * separately unless they're going to be duplicates.
1847 */
1848 do_notify_parent_cldstop(current, true, why);
1849 if (gstop_done && ptrace_reparented(current))
1850 do_notify_parent_cldstop(current, false, why);
1851
1673 /* 1852 /*
1674 * Don't want to allow preemption here, because 1853 * Don't want to allow preemption here, because
1675 * sys_ptrace() needs this task to be inactive. 1854 * sys_ptrace() needs this task to be inactive.
@@ -1684,7 +1863,16 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1684 /* 1863 /*
1685 * By the time we got the lock, our tracer went away. 1864 * By the time we got the lock, our tracer went away.
1686 * Don't drop the lock yet, another tracer may come. 1865 * Don't drop the lock yet, another tracer may come.
1866 *
1867 * If @gstop_done, the ptracer went away between group stop
1868 * completion and here. During detach, it would have set
1869 * JOBCTL_STOP_PENDING on us and we'll re-enter
1870 * TASK_STOPPED in do_signal_stop() on return, so notifying
1871 * the real parent of the group stop completion is enough.
1687 */ 1872 */
1873 if (gstop_done)
1874 do_notify_parent_cldstop(current, false, why);
1875
1688 __set_current_state(TASK_RUNNING); 1876 __set_current_state(TASK_RUNNING);
1689 if (clear_code) 1877 if (clear_code)
1690 current->exit_code = 0; 1878 current->exit_code = 0;
@@ -1706,6 +1894,9 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1706 spin_lock_irq(&current->sighand->siglock); 1894 spin_lock_irq(&current->sighand->siglock);
1707 current->last_siginfo = NULL; 1895 current->last_siginfo = NULL;
1708 1896
1897 /* LISTENING can be set only during STOP traps, clear it */
1898 current->jobctl &= ~JOBCTL_LISTENING;
1899
1709 /* 1900 /*
1710 * Queued signals ignored us while we were stopped for tracing. 1901 * Queued signals ignored us while we were stopped for tracing.
1711 * So check for any that we should take before resuming user mode. 1902 * So check for any that we should take before resuming user mode.
@@ -1714,107 +1905,204 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1714 recalc_sigpending_tsk(current); 1905 recalc_sigpending_tsk(current);
1715} 1906}
1716 1907
1717void ptrace_notify(int exit_code) 1908static void ptrace_do_notify(int signr, int exit_code, int why)
1718{ 1909{
1719 siginfo_t info; 1910 siginfo_t info;
1720 1911
1721 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
1722
1723 memset(&info, 0, sizeof info); 1912 memset(&info, 0, sizeof info);
1724 info.si_signo = SIGTRAP; 1913 info.si_signo = signr;
1725 info.si_code = exit_code; 1914 info.si_code = exit_code;
1726 info.si_pid = task_pid_vnr(current); 1915 info.si_pid = task_pid_vnr(current);
1727 info.si_uid = current_uid(); 1916 info.si_uid = current_uid();
1728 1917
1729 /* Let the debugger run. */ 1918 /* Let the debugger run. */
1919 ptrace_stop(exit_code, why, 1, &info);
1920}
1921
1922void ptrace_notify(int exit_code)
1923{
1924 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
1925
1730 spin_lock_irq(&current->sighand->siglock); 1926 spin_lock_irq(&current->sighand->siglock);
1731 ptrace_stop(exit_code, 1, &info); 1927 ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
1732 spin_unlock_irq(&current->sighand->siglock); 1928 spin_unlock_irq(&current->sighand->siglock);
1733} 1929}
1734 1930
1735/* 1931/**
1736 * This performs the stopping for SIGSTOP and other stop signals. 1932 * do_signal_stop - handle group stop for SIGSTOP and other stop signals
1737 * We have to stop all threads in the thread group. 1933 * @signr: signr causing group stop if initiating
1738 * Returns non-zero if we've actually stopped and released the siglock. 1934 *
1739 * Returns zero if we didn't stop and still hold the siglock. 1935 * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr
1936 * and participate in it. If already set, participate in the existing
1937 * group stop. If participated in a group stop (and thus slept), %true is
1938 * returned with siglock released.
1939 *
1940 * If ptraced, this function doesn't handle stop itself. Instead,
1941 * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock
1942 * untouched. The caller must ensure that INTERRUPT trap handling takes
 1943 * place afterwards.
1944 *
1945 * CONTEXT:
1946 * Must be called with @current->sighand->siglock held, which is released
1947 * on %true return.
1948 *
1949 * RETURNS:
1950 * %false if group stop is already cancelled or ptrace trap is scheduled.
1951 * %true if participated in group stop.
1740 */ 1952 */
1741static int do_signal_stop(int signr) 1953static bool do_signal_stop(int signr)
1954 __releases(&current->sighand->siglock)
1742{ 1955{
1743 struct signal_struct *sig = current->signal; 1956 struct signal_struct *sig = current->signal;
1744 int notify;
1745 1957
1746 if (!sig->group_stop_count) { 1958 if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
1959 unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
1747 struct task_struct *t; 1960 struct task_struct *t;
1748 1961
1749 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || 1962 /* signr will be recorded in task->jobctl for retries */
1963 WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);
1964
1965 if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
1750 unlikely(signal_group_exit(sig))) 1966 unlikely(signal_group_exit(sig)))
1751 return 0; 1967 return false;
1752 /* 1968 /*
1753 * There is no group stop already in progress. 1969 * There is no group stop already in progress. We must
1754 * We must initiate one now. 1970 * initiate one now.
1971 *
1972 * While ptraced, a task may be resumed while group stop is
1973 * still in effect and then receive a stop signal and
1974 * initiate another group stop. This deviates from the
1975 * usual behavior as two consecutive stop signals can't
1976 * cause two group stops when !ptraced. That is why we
1977 * also check !task_is_stopped(t) below.
1978 *
1979 * The condition can be distinguished by testing whether
1980 * SIGNAL_STOP_STOPPED is already set. Don't generate
1981 * group_exit_code in such case.
1982 *
1983 * This is not necessary for SIGNAL_STOP_CONTINUED because
1984 * an intervening stop signal is required to cause two
1985 * continued events regardless of ptrace.
1755 */ 1986 */
1756 sig->group_exit_code = signr; 1987 if (!(sig->flags & SIGNAL_STOP_STOPPED))
1988 sig->group_exit_code = signr;
1989 else
1990 WARN_ON_ONCE(!current->ptrace);
1991
1992 sig->group_stop_count = 0;
1757 1993
1758 sig->group_stop_count = 1; 1994 if (task_set_jobctl_pending(current, signr | gstop))
1759 for (t = next_thread(current); t != current; t = next_thread(t)) 1995 sig->group_stop_count++;
1996
1997 for (t = next_thread(current); t != current;
1998 t = next_thread(t)) {
1760 /* 1999 /*
1761 * Setting state to TASK_STOPPED for a group 2000 * Setting state to TASK_STOPPED for a group
1762 * stop is always done with the siglock held, 2001 * stop is always done with the siglock held,
1763 * so this check has no races. 2002 * so this check has no races.
1764 */ 2003 */
1765 if (!(t->flags & PF_EXITING) && 2004 if (!task_is_stopped(t) &&
1766 !task_is_stopped_or_traced(t)) { 2005 task_set_jobctl_pending(t, signr | gstop)) {
1767 sig->group_stop_count++; 2006 sig->group_stop_count++;
1768 signal_wake_up(t, 0); 2007 if (likely(!(t->ptrace & PT_SEIZED)))
2008 signal_wake_up(t, 0);
2009 else
2010 ptrace_trap_notify(t);
1769 } 2011 }
2012 }
1770 } 2013 }
1771 /* 2014
1772 * If there are no other threads in the group, or if there is 2015 if (likely(!current->ptrace)) {
1773 * a group stop in progress and we are the last to stop, report 2016 int notify = 0;
1774 * to the parent. When ptraced, every thread reports itself. 2017
1775 */ 2018 /*
1776 notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0; 2019 * If there are no other threads in the group, or if there
1777 notify = tracehook_notify_jctl(notify, CLD_STOPPED); 2020 * is a group stop in progress and we are the last to stop,
1778 /* 2021 * report to the parent.
1779 * tracehook_notify_jctl() can drop and reacquire siglock, so 2022 */
1780 * we keep ->group_stop_count != 0 before the call. If SIGCONT 2023 if (task_participate_group_stop(current))
1781 * or SIGKILL comes in between ->group_stop_count == 0. 2024 notify = CLD_STOPPED;
1782 */ 2025
1783 if (sig->group_stop_count) {
1784 if (!--sig->group_stop_count)
1785 sig->flags = SIGNAL_STOP_STOPPED;
1786 current->exit_code = sig->group_exit_code;
1787 __set_current_state(TASK_STOPPED); 2026 __set_current_state(TASK_STOPPED);
1788 } 2027 spin_unlock_irq(&current->sighand->siglock);
1789 spin_unlock_irq(&current->sighand->siglock);
1790 2028
1791 if (notify) { 2029 /*
1792 read_lock(&tasklist_lock); 2030 * Notify the parent of the group stop completion. Because
1793 do_notify_parent_cldstop(current, notify); 2031 * we're not holding either the siglock or tasklist_lock
1794 read_unlock(&tasklist_lock); 2032 * here, ptracer may attach inbetween; however, this is for
1795 } 2033 * group stop and should always be delivered to the real
2034 * parent of the group leader. The new ptracer will get
2035 * its notification when this task transitions into
2036 * TASK_TRACED.
2037 */
2038 if (notify) {
2039 read_lock(&tasklist_lock);
2040 do_notify_parent_cldstop(current, false, notify);
2041 read_unlock(&tasklist_lock);
2042 }
1796 2043
1797 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 2044 /* Now we don't run again until woken by SIGCONT or SIGKILL */
1798 do {
1799 schedule(); 2045 schedule();
1800 } while (try_to_freeze()); 2046 return true;
1801 2047 } else {
1802 tracehook_finish_jctl(); 2048 /*
1803 current->exit_code = 0; 2049 * While ptraced, group stop is handled by STOP trap.
2050 * Schedule it and let the caller deal with it.
2051 */
2052 task_set_jobctl_pending(current, JOBCTL_TRAP_STOP);
2053 return false;
2054 }
2055}
1804 2056
1805 return 1; 2057/**
2058 * do_jobctl_trap - take care of ptrace jobctl traps
2059 *
2060 * When PT_SEIZED, it's used for both group stop and explicit
2061 * SEIZE/INTERRUPT traps. Both generate PTRACE_EVENT_STOP trap with
2062 * accompanying siginfo. If stopped, lower eight bits of exit_code contain
2063 * the stop signal; otherwise, %SIGTRAP.
2064 *
2065 * When !PT_SEIZED, it's used only for group stop trap with stop signal
2066 * number as exit_code and no siginfo.
2067 *
2068 * CONTEXT:
2069 * Must be called with @current->sighand->siglock held, which may be
2070 * released and re-acquired before returning with intervening sleep.
2071 */
2072static void do_jobctl_trap(void)
2073{
2074 struct signal_struct *signal = current->signal;
2075 int signr = current->jobctl & JOBCTL_STOP_SIGMASK;
2076
2077 if (current->ptrace & PT_SEIZED) {
2078 if (!signal->group_stop_count &&
2079 !(signal->flags & SIGNAL_STOP_STOPPED))
2080 signr = SIGTRAP;
2081 WARN_ON_ONCE(!signr);
2082 ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8),
2083 CLD_STOPPED);
2084 } else {
2085 WARN_ON_ONCE(!signr);
2086 ptrace_stop(signr, CLD_STOPPED, 0, NULL);
2087 current->exit_code = 0;
2088 }
1806} 2089}
1807 2090
1808static int ptrace_signal(int signr, siginfo_t *info, 2091static int ptrace_signal(int signr, siginfo_t *info,
1809 struct pt_regs *regs, void *cookie) 2092 struct pt_regs *regs, void *cookie)
1810{ 2093{
1811 if (!task_ptrace(current))
1812 return signr;
1813
1814 ptrace_signal_deliver(regs, cookie); 2094 ptrace_signal_deliver(regs, cookie);
1815 2095 /*
1816 /* Let the debugger run. */ 2096 * We do not check sig_kernel_stop(signr) but set this marker
1817 ptrace_stop(signr, 0, info); 2097 * unconditionally because we do not know whether debugger will
2098 * change signr. This flag has no meaning unless we are going
2099 * to stop after return from ptrace_stop(). In this case it will
2100 * be checked in do_signal_stop(), we should only stop if it was
2101 * not cleared by SIGCONT while we were sleeping. See also the
2102 * comment in dequeue_signal().
2103 */
2104 current->jobctl |= JOBCTL_STOP_DEQUEUED;
2105 ptrace_stop(signr, CLD_TRAPPED, 0, info);
1818 2106
1819 /* We're back. Did the debugger cancel the sig? */ 2107 /* We're back. Did the debugger cancel the sig? */
1820 signr = current->exit_code; 2108 signr = current->exit_code;
@@ -1869,54 +2157,63 @@ relock:
1869 * the CLD_ si_code into SIGNAL_CLD_MASK bits. 2157 * the CLD_ si_code into SIGNAL_CLD_MASK bits.
1870 */ 2158 */
1871 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { 2159 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
1872 int why = (signal->flags & SIGNAL_STOP_CONTINUED) 2160 int why;
1873 ? CLD_CONTINUED : CLD_STOPPED; 2161
2162 if (signal->flags & SIGNAL_CLD_CONTINUED)
2163 why = CLD_CONTINUED;
2164 else
2165 why = CLD_STOPPED;
2166
1874 signal->flags &= ~SIGNAL_CLD_MASK; 2167 signal->flags &= ~SIGNAL_CLD_MASK;
1875 2168
1876 why = tracehook_notify_jctl(why, CLD_CONTINUED);
1877 spin_unlock_irq(&sighand->siglock); 2169 spin_unlock_irq(&sighand->siglock);
1878 2170
1879 if (why) { 2171 /*
1880 read_lock(&tasklist_lock); 2172 * Notify the parent that we're continuing. This event is
1881 do_notify_parent_cldstop(current->group_leader, why); 2173 * always per-process and doesn't make a whole lot of sense
1882 read_unlock(&tasklist_lock); 2174 * for ptracers, who shouldn't consume the state via
1883 } 2175 * wait(2) either, but, for backward compatibility, notify
 2176 * the ptracer of the group leader too unless it's going to be
2177 * a duplicate.
2178 */
2179 read_lock(&tasklist_lock);
2180 do_notify_parent_cldstop(current, false, why);
2181
2182 if (ptrace_reparented(current->group_leader))
2183 do_notify_parent_cldstop(current->group_leader,
2184 true, why);
2185 read_unlock(&tasklist_lock);
2186
1884 goto relock; 2187 goto relock;
1885 } 2188 }
1886 2189
1887 for (;;) { 2190 for (;;) {
1888 struct k_sigaction *ka; 2191 struct k_sigaction *ka;
1889 /* 2192
1890 * Tracing can induce an artificial signal and choose sigaction. 2193 if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
1891 * The return value in @signr determines the default action, 2194 do_signal_stop(0))
1892 * but @info->si_signo is the signal number we will report.
1893 */
1894 signr = tracehook_get_signal(current, regs, info, return_ka);
1895 if (unlikely(signr < 0))
1896 goto relock; 2195 goto relock;
1897 if (unlikely(signr != 0))
1898 ka = return_ka;
1899 else {
1900 if (unlikely(signal->group_stop_count > 0) &&
1901 do_signal_stop(0))
1902 goto relock;
1903 2196
1904 signr = dequeue_signal(current, &current->blocked, 2197 if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) {
1905 info); 2198 do_jobctl_trap();
2199 spin_unlock_irq(&sighand->siglock);
2200 goto relock;
2201 }
1906 2202
1907 if (!signr) 2203 signr = dequeue_signal(current, &current->blocked, info);
1908 break; /* will return 0 */
1909 2204
1910 if (signr != SIGKILL) { 2205 if (!signr)
1911 signr = ptrace_signal(signr, info, 2206 break; /* will return 0 */
1912 regs, cookie);
1913 if (!signr)
1914 continue;
1915 }
1916 2207
1917 ka = &sighand->action[signr-1]; 2208 if (unlikely(current->ptrace) && signr != SIGKILL) {
2209 signr = ptrace_signal(signr, info,
2210 regs, cookie);
2211 if (!signr)
2212 continue;
1918 } 2213 }
1919 2214
2215 ka = &sighand->action[signr-1];
2216
1920 /* Trace actually delivered signals. */ 2217 /* Trace actually delivered signals. */
1921 trace_signal_deliver(signr, info, ka); 2218 trace_signal_deliver(signr, info, ka);
1922 2219
@@ -2017,10 +2314,42 @@ relock:
2017 return signr; 2314 return signr;
2018} 2315}
2019 2316
2317/*
2318 * It could be that complete_signal() picked us to notify about the
2319 * group-wide signal. Other threads should be notified now to take
2320 * the shared signals in @which since we will not.
2321 */
2322static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
2323{
2324 sigset_t retarget;
2325 struct task_struct *t;
2326
2327 sigandsets(&retarget, &tsk->signal->shared_pending.signal, which);
2328 if (sigisemptyset(&retarget))
2329 return;
2330
2331 t = tsk;
2332 while_each_thread(tsk, t) {
2333 if (t->flags & PF_EXITING)
2334 continue;
2335
2336 if (!has_pending_signals(&retarget, &t->blocked))
2337 continue;
2338 /* Remove the signals this thread can handle. */
2339 sigandsets(&retarget, &retarget, &t->blocked);
2340
2341 if (!signal_pending(t))
2342 signal_wake_up(t, 0);
2343
2344 if (sigisemptyset(&retarget))
2345 break;
2346 }
2347}
2348
2020void exit_signals(struct task_struct *tsk) 2349void exit_signals(struct task_struct *tsk)
2021{ 2350{
2022 int group_stop = 0; 2351 int group_stop = 0;
2023 struct task_struct *t; 2352 sigset_t unblocked;
2024 2353
2025 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { 2354 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
2026 tsk->flags |= PF_EXITING; 2355 tsk->flags |= PF_EXITING;
@@ -2036,26 +2365,23 @@ void exit_signals(struct task_struct *tsk)
2036 if (!signal_pending(tsk)) 2365 if (!signal_pending(tsk))
2037 goto out; 2366 goto out;
2038 2367
2039 /* 2368 unblocked = tsk->blocked;
2040 * It could be that __group_complete_signal() choose us to 2369 signotset(&unblocked);
2041 * notify about group-wide signal. Another thread should be 2370 retarget_shared_pending(tsk, &unblocked);
2042 * woken now to take the signal since we will not.
2043 */
2044 for (t = tsk; (t = next_thread(t)) != tsk; )
2045 if (!signal_pending(t) && !(t->flags & PF_EXITING))
2046 recalc_sigpending_and_wake(t);
2047 2371
2048 if (unlikely(tsk->signal->group_stop_count) && 2372 if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) &&
2049 !--tsk->signal->group_stop_count) { 2373 task_participate_group_stop(tsk))
2050 tsk->signal->flags = SIGNAL_STOP_STOPPED; 2374 group_stop = CLD_STOPPED;
2051 group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
2052 }
2053out: 2375out:
2054 spin_unlock_irq(&tsk->sighand->siglock); 2376 spin_unlock_irq(&tsk->sighand->siglock);
2055 2377
2378 /*
2379 * If group stop has completed, deliver the notification. This
2380 * should always go to the real parent of the group leader.
2381 */
2056 if (unlikely(group_stop)) { 2382 if (unlikely(group_stop)) {
2057 read_lock(&tasklist_lock); 2383 read_lock(&tasklist_lock);
2058 do_notify_parent_cldstop(tsk, group_stop); 2384 do_notify_parent_cldstop(tsk, false, group_stop);
2059 read_unlock(&tasklist_lock); 2385 read_unlock(&tasklist_lock);
2060 } 2386 }
2061} 2387}
@@ -2089,11 +2415,33 @@ long do_no_restart_syscall(struct restart_block *param)
2089 return -EINTR; 2415 return -EINTR;
2090} 2416}
2091 2417
2092/* 2418static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
2093 * We don't need to get the kernel lock - this is all local to this 2419{
2094 * particular thread.. (and that's good, because this is _heavily_ 2420 if (signal_pending(tsk) && !thread_group_empty(tsk)) {
2095 * used by various programs) 2421 sigset_t newblocked;
2422 /* A set of now blocked but previously unblocked signals. */
2423 sigandnsets(&newblocked, newset, &current->blocked);
2424 retarget_shared_pending(tsk, &newblocked);
2425 }
2426 tsk->blocked = *newset;
2427 recalc_sigpending();
2428}
2429
2430/**
2431 * set_current_blocked - change current->blocked mask
2432 * @newset: new mask
2433 *
 2434 * It is wrong to change ->blocked directly; this helper should be used
2435 * to ensure the process can't miss a shared signal we are going to block.
2096 */ 2436 */
2437void set_current_blocked(const sigset_t *newset)
2438{
2439 struct task_struct *tsk = current;
2440
2441 spin_lock_irq(&tsk->sighand->siglock);
2442 __set_task_blocked(tsk, newset);
2443 spin_unlock_irq(&tsk->sighand->siglock);
2444}
2097 2445
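A short sketch of the intended replacement for open-coded ->blocked updates; the wrapper function is illustrative, while sigaddset() and set_current_blocked() are the real interfaces:

/* Illustrative: block one more signal for the current task.  No
 * locking here; set_current_blocked() takes siglock itself and
 * retargets shared pending signals before the mask changes. */
static void example_block_signal(int sig)
{
	sigset_t newset = current->blocked;

	sigaddset(&newset, sig);
	set_current_blocked(&newset);
}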
2098/* 2446/*
2099 * This is also useful for kernel threads that want to temporarily 2447 * This is also useful for kernel threads that want to temporarily
@@ -2105,73 +2453,66 @@ long do_no_restart_syscall(struct restart_block *param)
2105 */ 2453 */
2106int sigprocmask(int how, sigset_t *set, sigset_t *oldset) 2454int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2107{ 2455{
2108 int error; 2456 struct task_struct *tsk = current;
2457 sigset_t newset;
2109 2458
2110 spin_lock_irq(&current->sighand->siglock); 2459 /* Lockless, only current can change ->blocked, never from irq */
2111 if (oldset) 2460 if (oldset)
2112 *oldset = current->blocked; 2461 *oldset = tsk->blocked;
2113 2462
2114 error = 0;
2115 switch (how) { 2463 switch (how) {
2116 case SIG_BLOCK: 2464 case SIG_BLOCK:
2117 sigorsets(&current->blocked, &current->blocked, set); 2465 sigorsets(&newset, &tsk->blocked, set);
2118 break; 2466 break;
2119 case SIG_UNBLOCK: 2467 case SIG_UNBLOCK:
2120 signandsets(&current->blocked, &current->blocked, set); 2468 sigandnsets(&newset, &tsk->blocked, set);
2121 break; 2469 break;
2122 case SIG_SETMASK: 2470 case SIG_SETMASK:
2123 current->blocked = *set; 2471 newset = *set;
2124 break; 2472 break;
2125 default: 2473 default:
2126 error = -EINVAL; 2474 return -EINVAL;
2127 } 2475 }
2128 recalc_sigpending();
2129 spin_unlock_irq(&current->sighand->siglock);
2130 2476
2131 return error; 2477 set_current_blocked(&newset);
2478 return 0;
2132} 2479}
2133 2480
2134/** 2481/**
2135 * sys_rt_sigprocmask - change the list of currently blocked signals 2482 * sys_rt_sigprocmask - change the list of currently blocked signals
2136 * @how: whether to add, remove, or set signals 2483 * @how: whether to add, remove, or set signals
2137 * @set: stores pending signals 2484 * @nset: stores pending signals
2138 * @oset: previous value of signal mask if non-null 2485 * @oset: previous value of signal mask if non-null
2139 * @sigsetsize: size of sigset_t type 2486 * @sigsetsize: size of sigset_t type
2140 */ 2487 */
2141SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, 2488SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
2142 sigset_t __user *, oset, size_t, sigsetsize) 2489 sigset_t __user *, oset, size_t, sigsetsize)
2143{ 2490{
2144 int error = -EINVAL;
2145 sigset_t old_set, new_set; 2491 sigset_t old_set, new_set;
2492 int error;
2146 2493
2147 /* XXX: Don't preclude handling different sized sigset_t's. */ 2494 /* XXX: Don't preclude handling different sized sigset_t's. */
2148 if (sigsetsize != sizeof(sigset_t)) 2495 if (sigsetsize != sizeof(sigset_t))
2149 goto out; 2496 return -EINVAL;
2150 2497
2151 if (set) { 2498 old_set = current->blocked;
2152 error = -EFAULT; 2499
2153 if (copy_from_user(&new_set, set, sizeof(*set))) 2500 if (nset) {
2154 goto out; 2501 if (copy_from_user(&new_set, nset, sizeof(sigset_t)))
2502 return -EFAULT;
2155 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); 2503 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
2156 2504
2157 error = sigprocmask(how, &new_set, &old_set); 2505 error = sigprocmask(how, &new_set, NULL);
2158 if (error) 2506 if (error)
2159 goto out; 2507 return error;
2160 if (oset) 2508 }
2161 goto set_old;
2162 } else if (oset) {
2163 spin_lock_irq(&current->sighand->siglock);
2164 old_set = current->blocked;
2165 spin_unlock_irq(&current->sighand->siglock);
2166 2509
2167 set_old: 2510 if (oset) {
2168 error = -EFAULT; 2511 if (copy_to_user(oset, &old_set, sizeof(sigset_t)))
2169 if (copy_to_user(oset, &old_set, sizeof(*oset))) 2512 return -EFAULT;
2170 goto out;
2171 } 2513 }
2172 error = 0; 2514
2173out: 2515 return 0;
2174 return error;
2175} 2516}
2176 2517
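From userspace the rework is invisible: sigprocmask(2) keeps its old semantics while funnelling through set_current_blocked(). A runnable example exercising the SIG_BLOCK and SIG_SETMASK paths:

#include <signal.h>
#include <stdio.h>

int main(void)
{
	sigset_t block, old;

	sigemptyset(&block);
	sigaddset(&block, SIGUSR1);

	if (sigprocmask(SIG_BLOCK, &block, &old) != 0) {
		perror("sigprocmask");
		return 1;
	}
	printf("SIGUSR1 was %sblocked before\n",
	       sigismember(&old, SIGUSR1) ? "" : "not ");

	sigprocmask(SIG_SETMASK, &old, NULL);	/* restore the original mask */
	return 0;
}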
2177long do_sigpending(void __user *set, unsigned long sigsetsize) 2518long do_sigpending(void __user *set, unsigned long sigsetsize)
@@ -2284,6 +2625,66 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2284#endif 2625#endif
2285 2626
2286/** 2627/**
2628 * do_sigtimedwait - wait for queued signals specified in @which
2629 * @which: queued signals to wait for
2630 * @info: if non-null, the signal's siginfo is returned here
2631 * @ts: upper bound on process time suspension
2632 */
2633int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2634 const struct timespec *ts)
2635{
2636 struct task_struct *tsk = current;
2637 long timeout = MAX_SCHEDULE_TIMEOUT;
2638 sigset_t mask = *which;
2639 int sig;
2640
2641 if (ts) {
2642 if (!timespec_valid(ts))
2643 return -EINVAL;
2644 timeout = timespec_to_jiffies(ts);
2645 /*
2646 * We can be close to the next tick, add another one
2647 * to ensure we will wait at least the time asked for.
2648 */
2649 if (ts->tv_sec || ts->tv_nsec)
2650 timeout++;
2651 }
2652
2653 /*
2654 * Invert the set of allowed signals to get those we want to block.
2655 */
2656 sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
2657 signotset(&mask);
2658
2659 spin_lock_irq(&tsk->sighand->siglock);
2660 sig = dequeue_signal(tsk, &mask, info);
2661 if (!sig && timeout) {
2662 /*
 2663 * None ready, temporarily unblock the signals we're interested
 2664 * in while we sleep, so that we'll be awakened when they
 2665 * arrive. Unblocking is always fine; we can avoid
2666 * set_current_blocked().
2667 */
2668 tsk->real_blocked = tsk->blocked;
2669 sigandsets(&tsk->blocked, &tsk->blocked, &mask);
2670 recalc_sigpending();
2671 spin_unlock_irq(&tsk->sighand->siglock);
2672
2673 timeout = schedule_timeout_interruptible(timeout);
2674
2675 spin_lock_irq(&tsk->sighand->siglock);
2676 __set_task_blocked(tsk, &tsk->real_blocked);
2677 siginitset(&tsk->real_blocked, 0);
2678 sig = dequeue_signal(tsk, &mask, info);
2679 }
2680 spin_unlock_irq(&tsk->sighand->siglock);
2681
2682 if (sig)
2683 return sig;
2684 return timeout ? -EINTR : -EAGAIN;
2685}
2686
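do_sigtimedwait() above backs sigtimedwait(2). A runnable userspace example matching its semantics: the signal must be blocked, then waited for with a timeout, and the -EAGAIN return shows up as errno EAGAIN on expiry:

#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	sigset_t set;
	siginfo_t info;
	struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
	int sig;

	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	sigprocmask(SIG_BLOCK, &set, NULL);	/* must be blocked, not handled */

	sig = sigtimedwait(&set, &info, &ts);	/* returns -1 with errno on timeout */
	if (sig < 0)
		perror("sigtimedwait");
	else
		printf("got signal %d from pid %d\n", sig, (int)info.si_pid);
	return 0;
}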
2687/**
2287 * sys_rt_sigtimedwait - synchronously wait for queued signals specified 2688 * sys_rt_sigtimedwait - synchronously wait for queued signals specified
2288 * in @uthese 2689 * in @uthese
2289 * @uthese: queued signals to wait for 2690 * @uthese: queued signals to wait for
@@ -2295,11 +2696,10 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2295 siginfo_t __user *, uinfo, const struct timespec __user *, uts, 2696 siginfo_t __user *, uinfo, const struct timespec __user *, uts,
2296 size_t, sigsetsize) 2697 size_t, sigsetsize)
2297{ 2698{
2298 int ret, sig;
2299 sigset_t these; 2699 sigset_t these;
2300 struct timespec ts; 2700 struct timespec ts;
2301 siginfo_t info; 2701 siginfo_t info;
2302 long timeout = 0; 2702 int ret;
2303 2703
2304 /* XXX: Don't preclude handling different sized sigset_t's. */ 2704 /* XXX: Don't preclude handling different sized sigset_t's. */
2305 if (sigsetsize != sizeof(sigset_t)) 2705 if (sigsetsize != sizeof(sigset_t))
@@ -2308,61 +2708,16 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2308 if (copy_from_user(&these, uthese, sizeof(these))) 2708 if (copy_from_user(&these, uthese, sizeof(these)))
2309 return -EFAULT; 2709 return -EFAULT;
2310 2710
2311 /*
2312 * Invert the set of allowed signals to get those we
2313 * want to block.
2314 */
2315 sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP));
2316 signotset(&these);
2317
2318 if (uts) { 2711 if (uts) {
2319 if (copy_from_user(&ts, uts, sizeof(ts))) 2712 if (copy_from_user(&ts, uts, sizeof(ts)))
2320 return -EFAULT; 2713 return -EFAULT;
2321 if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0
2322 || ts.tv_sec < 0)
2323 return -EINVAL;
2324 } 2714 }
2325 2715
2326 spin_lock_irq(&current->sighand->siglock); 2716 ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);
2327 sig = dequeue_signal(current, &these, &info);
2328 if (!sig) {
2329 timeout = MAX_SCHEDULE_TIMEOUT;
2330 if (uts)
2331 timeout = (timespec_to_jiffies(&ts)
2332 + (ts.tv_sec || ts.tv_nsec));
2333
2334 if (timeout) {
2335 /*
2336 * None ready -- temporarily unblock those we're
2337 * interested while we are sleeping in so that we'll
2338 * be awakened when they arrive.
2339 */
2340 current->real_blocked = current->blocked;
2341 sigandsets(&current->blocked, &current->blocked, &these);
2342 recalc_sigpending();
2343 spin_unlock_irq(&current->sighand->siglock);
2344
2345 timeout = schedule_timeout_interruptible(timeout);
2346
2347 spin_lock_irq(&current->sighand->siglock);
2348 sig = dequeue_signal(current, &these, &info);
2349 current->blocked = current->real_blocked;
2350 siginitset(&current->real_blocked, 0);
2351 recalc_sigpending();
2352 }
2353 }
2354 spin_unlock_irq(&current->sighand->siglock);
2355 2717
2356 if (sig) { 2718 if (ret > 0 && uinfo) {
2357 ret = sig; 2719 if (copy_siginfo_to_user(uinfo, &info))
2358 if (uinfo) { 2720 ret = -EFAULT;
2359 if (copy_siginfo_to_user(uinfo, &info))
2360 ret = -EFAULT;
2361 }
2362 } else {
2363 ret = -EAGAIN;
2364 if (timeout)
2365 ret = -EINTR;
2366 } 2721 }
2367 2722
2368 return ret; 2723 return ret;
@@ -2650,60 +3005,51 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
2650/** 3005/**
2651 * sys_sigprocmask - examine and change blocked signals 3006 * sys_sigprocmask - examine and change blocked signals
2652 * @how: whether to add, remove, or set signals 3007 * @how: whether to add, remove, or set signals
2653 * @set: signals to add or remove (if non-null) 3008 * @nset: signals to add or remove (if non-null)
2654 * @oset: previous value of signal mask if non-null 3009 * @oset: previous value of signal mask if non-null
2655 * 3010 *
2656 * Some platforms have their own version with special arguments; 3011 * Some platforms have their own version with special arguments;
2657 * others support only sys_rt_sigprocmask. 3012 * others support only sys_rt_sigprocmask.
2658 */ 3013 */
2659 3014
2660SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, 3015SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
2661 old_sigset_t __user *, oset) 3016 old_sigset_t __user *, oset)
2662{ 3017{
2663 int error;
2664 old_sigset_t old_set, new_set; 3018 old_sigset_t old_set, new_set;
3019 sigset_t new_blocked;
2665 3020
2666 if (set) { 3021 old_set = current->blocked.sig[0];
2667 error = -EFAULT; 3022
2668 if (copy_from_user(&new_set, set, sizeof(*set))) 3023 if (nset) {
2669 goto out; 3024 if (copy_from_user(&new_set, nset, sizeof(*nset)))
3025 return -EFAULT;
2670 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); 3026 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
2671 3027
2672 spin_lock_irq(&current->sighand->siglock); 3028 new_blocked = current->blocked;
2673 old_set = current->blocked.sig[0];
2674 3029
2675 error = 0;
2676 switch (how) { 3030 switch (how) {
2677 default:
2678 error = -EINVAL;
2679 break;
2680 case SIG_BLOCK: 3031 case SIG_BLOCK:
2681 sigaddsetmask(&current->blocked, new_set); 3032 sigaddsetmask(&new_blocked, new_set);
2682 break; 3033 break;
2683 case SIG_UNBLOCK: 3034 case SIG_UNBLOCK:
2684 sigdelsetmask(&current->blocked, new_set); 3035 sigdelsetmask(&new_blocked, new_set);
2685 break; 3036 break;
2686 case SIG_SETMASK: 3037 case SIG_SETMASK:
2687 current->blocked.sig[0] = new_set; 3038 new_blocked.sig[0] = new_set;
2688 break; 3039 break;
3040 default:
3041 return -EINVAL;
2689 } 3042 }
2690 3043
2691 recalc_sigpending(); 3044 set_current_blocked(&new_blocked);
2692 spin_unlock_irq(&current->sighand->siglock); 3045 }
2693 if (error) 3046
2694 goto out; 3047 if (oset) {
2695 if (oset)
2696 goto set_old;
2697 } else if (oset) {
2698 old_set = current->blocked.sig[0];
2699 set_old:
2700 error = -EFAULT;
2701 if (copy_to_user(oset, &old_set, sizeof(*oset))) 3048 if (copy_to_user(oset, &old_set, sizeof(*oset)))
2702 goto out; 3049 return -EFAULT;
2703 } 3050 }
2704 error = 0; 3051
2705out: 3052 return 0;
2706 return error;
2707} 3053}
2708#endif /* __ARCH_WANT_SYS_SIGPROCMASK */ 3054#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
2709 3055
@@ -2756,15 +3102,11 @@ SYSCALL_DEFINE0(sgetmask)
2756 3102
2757SYSCALL_DEFINE1(ssetmask, int, newmask) 3103SYSCALL_DEFINE1(ssetmask, int, newmask)
2758{ 3104{
2759 int old; 3105 int old = current->blocked.sig[0];
2760 3106 sigset_t newset;
2761 spin_lock_irq(&current->sighand->siglock);
2762 old = current->blocked.sig[0];
2763 3107
2764 siginitset(&current->blocked, newmask & ~(sigmask(SIGKILL)| 3108 siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP)));
2765 sigmask(SIGSTOP))); 3109 set_current_blocked(&newset);
2766 recalc_sigpending();
2767 spin_unlock_irq(&current->sighand->siglock);
2768 3110
2769 return old; 3111 return old;
2770} 3112}
@@ -2793,8 +3135,10 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
2793 3135
2794SYSCALL_DEFINE0(pause) 3136SYSCALL_DEFINE0(pause)
2795{ 3137{
2796 current->state = TASK_INTERRUPTIBLE; 3138 while (!signal_pending(current)) {
2797 schedule(); 3139 current->state = TASK_INTERRUPTIBLE;
3140 schedule();
3141 }
2798 return -ERESTARTNOHAND; 3142 return -ERESTARTNOHAND;
2799} 3143}
2800 3144
@@ -2819,11 +3163,8 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
2819 return -EFAULT; 3163 return -EFAULT;
2820 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 3164 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2821 3165
2822 spin_lock_irq(&current->sighand->siglock);
2823 current->saved_sigmask = current->blocked; 3166 current->saved_sigmask = current->blocked;
2824 current->blocked = newset; 3167 set_current_blocked(&newset);
2825 recalc_sigpending();
2826 spin_unlock_irq(&current->sighand->siglock);
2827 3168
2828 current->state = TASK_INTERRUPTIBLE; 3169 current->state = TASK_INTERRUPTIBLE;
2829 schedule(); 3170 schedule();
diff --git a/kernel/smp.c b/kernel/smp.c
index 73a195193558..fb67dfa8394e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -74,7 +74,7 @@ static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
74 .notifier_call = hotplug_cfd, 74 .notifier_call = hotplug_cfd,
75}; 75};
76 76
77static int __cpuinit init_call_single_data(void) 77void __init call_function_init(void)
78{ 78{
79 void *cpu = (void *)(long)smp_processor_id(); 79 void *cpu = (void *)(long)smp_processor_id();
80 int i; 80 int i;
@@ -88,10 +88,7 @@ static int __cpuinit init_call_single_data(void)
88 88
89 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); 89 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
90 register_cpu_notifier(&hotplug_cfd_notifier); 90 register_cpu_notifier(&hotplug_cfd_notifier);
91
92 return 0;
93} 91}
94early_initcall(init_call_single_data);
95 92
96/* 93/*
97 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources 94 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 174f976c2874..fca82c32042b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -58,7 +58,7 @@ DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER", "RCU"
62}; 62};
63 63
64/* 64/*
@@ -315,16 +315,24 @@ static inline void invoke_softirq(void)
315{ 315{
316 if (!force_irqthreads) 316 if (!force_irqthreads)
317 __do_softirq(); 317 __do_softirq();
318 else 318 else {
319 __local_bh_disable((unsigned long)__builtin_return_address(0),
320 SOFTIRQ_OFFSET);
319 wakeup_softirqd(); 321 wakeup_softirqd();
322 __local_bh_enable(SOFTIRQ_OFFSET);
323 }
320} 324}
321#else 325#else
322static inline void invoke_softirq(void) 326static inline void invoke_softirq(void)
323{ 327{
324 if (!force_irqthreads) 328 if (!force_irqthreads)
325 do_softirq(); 329 do_softirq();
326 else 330 else {
331 __local_bh_disable((unsigned long)__builtin_return_address(0),
332 SOFTIRQ_OFFSET);
327 wakeup_softirqd(); 333 wakeup_softirqd();
334 __local_bh_enable(SOFTIRQ_OFFSET);
335 }
328} 336}
329#endif 337#endif
330 338
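The hunks above wrap wakeup_softirqd() in the internal __local_bh_disable()/__local_bh_enable() pair. The public API expresses the same kind of bracket around code that must not race with softirq handlers; a hedged sketch of that general pattern with an illustrative per-CPU counter:

#include <linux/bottom_half.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, demo_counter);

/* Illustrative: keep softirqs off this CPU while touching per-CPU
 * state that a softirq handler could also touch. */
static void demo_update(void)
{
	local_bh_disable();
	__this_cpu_inc(demo_counter);
	local_bh_enable();	/* may run pending softirqs on the way out */
}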
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index eb212f8f8bc8..d20c6983aad9 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -26,12 +26,18 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
26EXPORT_SYMBOL_GPL(print_stack_trace); 26EXPORT_SYMBOL_GPL(print_stack_trace);
27 27
28/* 28/*
29 * Architectures that do not implement save_stack_trace_tsk get this 29 * Architectures that do not implement save_stack_trace_tsk or
30 * weak alias and a once-per-bootup warning (whenever this facility 30 * save_stack_trace_regs get this weak alias and a once-per-bootup warning
31 * is utilized - for example by procfs): 31 * (whenever this facility is utilized - for example by procfs):
32 */ 32 */
33__weak void 33__weak void
34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
35{ 35{
36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n"); 36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n");
37} 37}
38
39__weak void
40save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
41{
42 WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n");
43}
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index e3516b29076c..ba5070ce5765 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -19,7 +19,7 @@
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h> 20#include <linux/kallsyms.h>
21 21
22#include <asm/atomic.h> 22#include <linux/atomic.h>
23 23
24/* 24/*
25 * Structure to determine completion condition and record errors. May 25 * Structure to determine completion condition and record errors. May
@@ -136,10 +136,11 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
136static DEFINE_MUTEX(stop_cpus_mutex); 136static DEFINE_MUTEX(stop_cpus_mutex);
137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); 137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
138 138
139int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) 139static void queue_stop_cpus_work(const struct cpumask *cpumask,
140 cpu_stop_fn_t fn, void *arg,
141 struct cpu_stop_done *done)
140{ 142{
141 struct cpu_stop_work *work; 143 struct cpu_stop_work *work;
142 struct cpu_stop_done done;
143 unsigned int cpu; 144 unsigned int cpu;
144 145
145 /* initialize works and done */ 146 /* initialize works and done */
@@ -147,9 +148,8 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
147 work = &per_cpu(stop_cpus_work, cpu); 148 work = &per_cpu(stop_cpus_work, cpu);
148 work->fn = fn; 149 work->fn = fn;
149 work->arg = arg; 150 work->arg = arg;
150 work->done = &done; 151 work->done = done;
151 } 152 }
152 cpu_stop_init_done(&done, cpumask_weight(cpumask));
153 153
154 /* 154 /*
155 * Disable preemption while queueing to avoid getting 155 * Disable preemption while queueing to avoid getting
@@ -161,7 +161,15 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), 161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
162 &per_cpu(stop_cpus_work, cpu)); 162 &per_cpu(stop_cpus_work, cpu));
163 preempt_enable(); 163 preempt_enable();
164}
164 165
166static int __stop_cpus(const struct cpumask *cpumask,
167 cpu_stop_fn_t fn, void *arg)
168{
169 struct cpu_stop_done done;
170
171 cpu_stop_init_done(&done, cpumask_weight(cpumask));
172 queue_stop_cpus_work(cpumask, fn, arg, &done);
165 wait_for_completion(&done.completion); 173 wait_for_completion(&done.completion);
166 return done.executed ? done.ret : -ENOENT; 174 return done.executed ? done.ret : -ENOENT;
167} 175}
@@ -431,8 +439,15 @@ static int stop_machine_cpu_stop(void *data)
431 struct stop_machine_data *smdata = data; 439 struct stop_machine_data *smdata = data;
432 enum stopmachine_state curstate = STOPMACHINE_NONE; 440 enum stopmachine_state curstate = STOPMACHINE_NONE;
433 int cpu = smp_processor_id(), err = 0; 441 int cpu = smp_processor_id(), err = 0;
442 unsigned long flags;
434 bool is_active; 443 bool is_active;
435 444
445 /*
446 * When called from stop_machine_from_inactive_cpu(), irq might
447 * already be disabled. Save the state and restore it on exit.
448 */
449 local_save_flags(flags);
450
436 if (!smdata->active_cpus) 451 if (!smdata->active_cpus)
437 is_active = cpu == cpumask_first(cpu_online_mask); 452 is_active = cpu == cpumask_first(cpu_online_mask);
438 else 453 else
@@ -460,7 +475,7 @@ static int stop_machine_cpu_stop(void *data)
460 } 475 }
461 } while (curstate != STOPMACHINE_EXIT); 476 } while (curstate != STOPMACHINE_EXIT);
462 477
463 local_irq_enable(); 478 local_irq_restore(flags);
464 return err; 479 return err;
465} 480}
466 481
@@ -487,4 +502,57 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
487} 502}
488EXPORT_SYMBOL_GPL(stop_machine); 503EXPORT_SYMBOL_GPL(stop_machine);
489 504
505/**
506 * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU
507 * @fn: the function to run
508 * @data: the data ptr for the @fn()
509 * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
510 *
511 * This is identical to stop_machine() but can be called from a CPU which
512 * is not active. The local CPU is in the process of hotplug (so no other
513 * CPU hotplug can start) and not marked active and doesn't have enough
514 * context to sleep.
515 *
516 * This function provides stop_machine() functionality for such state by
517 * using busy-wait for synchronization and executing @fn directly for local
518 * CPU.
519 *
520 * CONTEXT:
521 * Local CPU is inactive. Temporarily stops all active CPUs.
522 *
523 * RETURNS:
524 * 0 if all executions of @fn returned 0, any non zero return value if any
525 * returned non zero.
526 */
527int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
528 const struct cpumask *cpus)
529{
530 struct stop_machine_data smdata = { .fn = fn, .data = data,
531 .active_cpus = cpus };
532 struct cpu_stop_done done;
533 int ret;
534
535 /* Local CPU must be inactive and CPU hotplug in progress. */
536 BUG_ON(cpu_active(raw_smp_processor_id()));
537 smdata.num_threads = num_active_cpus() + 1; /* +1 for local */
538
539 /* No proper task established and can't sleep - busy wait for lock. */
540 while (!mutex_trylock(&stop_cpus_mutex))
541 cpu_relax();
542
543 /* Schedule work on other CPUs and execute directly for local CPU */
544 set_state(&smdata, STOPMACHINE_PREPARE);
545 cpu_stop_init_done(&done, num_active_cpus());
546 queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata,
547 &done);
548 ret = stop_machine_cpu_stop(&smdata);
549
550 /* Busy wait for completion. */
551 while (!completion_done(&done.completion))
552 cpu_relax();
553
554 mutex_unlock(&stop_cpus_mutex);
555 return ret ?: done.ret;
556}
557
490#endif /* CONFIG_STOP_MACHINE */ 558#endif /* CONFIG_STOP_MACHINE */
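The kerneldoc above explains that stop_machine_from_inactive_cpu() busy-waits because the caller has no scheduling context. A minimal sketch of how a CPU-bringup path might use it; the function and callback names below are illustrative, not part of this diff:

static int example_sync_state(void *unused)
{
	/* Runs on the CPU(s) selected by the cpumask; every other CPU
	 * spins in the rendezvous until STOPMACHINE_EXIT. */
	return 0;
}

static int example_bringup_sync(void)
{
	/* NULL cpumask: run the callback on any one online CPU. */
	return stop_machine_from_inactive_cpu(example_sync_state, NULL, NULL);
}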
diff --git a/kernel/sys.c b/kernel/sys.c
index af468edf096a..dd948a1fca4c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,7 +8,6 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/utsname.h> 9#include <linux/utsname.h>
10#include <linux/mman.h> 10#include <linux/mman.h>
11#include <linux/notifier.h>
12#include <linux/reboot.h> 11#include <linux/reboot.h>
13#include <linux/prctl.h> 12#include <linux/prctl.h>
14#include <linux/highuid.h> 13#include <linux/highuid.h>
@@ -314,12 +313,43 @@ void kernel_restart_prepare(char *cmd)
314{ 313{
315 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 314 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
316 system_state = SYSTEM_RESTART; 315 system_state = SYSTEM_RESTART;
316 usermodehelper_disable();
317 device_shutdown(); 317 device_shutdown();
318 sysdev_shutdown();
319 syscore_shutdown(); 318 syscore_shutdown();
320} 319}
321 320
322/** 321/**
322 * register_reboot_notifier - Register function to be called at reboot time
323 * @nb: Info about notifier function to be called
324 *
325 * Registers a function with the list of functions
326 * to be called at reboot time.
327 *
328 * Currently always returns zero, as blocking_notifier_chain_register()
329 * always returns zero.
330 */
331int register_reboot_notifier(struct notifier_block *nb)
332{
333 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
334}
335EXPORT_SYMBOL(register_reboot_notifier);
336
337/**
338 * unregister_reboot_notifier - Unregister previously registered reboot notifier
339 * @nb: Hook to be unregistered
340 *
341 * Unregisters a previously registered reboot
342 * notifier function.
343 *
344 * Returns zero on success, or %-ENOENT on failure.
345 */
346int unregister_reboot_notifier(struct notifier_block *nb)
347{
348 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
349}
350EXPORT_SYMBOL(unregister_reboot_notifier);
351
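For context, a hedged sketch of how these two exports are typically consumed; the callback and variable names are illustrative:

static int example_reboot_cb(struct notifier_block *nb,
			     unsigned long action, void *data)
{
	/* action is SYS_RESTART, SYS_HALT or SYS_POWER_OFF; for
	 * SYS_RESTART, data carries the restart command string. */
	return NOTIFY_DONE;
}

static struct notifier_block example_reboot_nb = {
	.notifier_call	= example_reboot_cb,
};

/* register_reboot_notifier(&example_reboot_nb) during init,
 * unregister_reboot_notifier(&example_reboot_nb) on teardown. */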
352/**
323 * kernel_restart - reboot the system 353 * kernel_restart - reboot the system
324 * @cmd: pointer to buffer containing command to execute for restart 354 * @cmd: pointer to buffer containing command to execute for restart
325 * or %NULL 355 * or %NULL
@@ -344,6 +374,7 @@ static void kernel_shutdown_prepare(enum system_states state)
344 blocking_notifier_call_chain(&reboot_notifier_list, 374 blocking_notifier_call_chain(&reboot_notifier_list,
345 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); 375 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
346 system_state = state; 376 system_state = state;
377 usermodehelper_disable();
347 device_shutdown(); 378 device_shutdown();
348} 379}
349/** 380/**
@@ -354,7 +385,6 @@ static void kernel_shutdown_prepare(enum system_states state)
354void kernel_halt(void) 385void kernel_halt(void)
355{ 386{
356 kernel_shutdown_prepare(SYSTEM_HALT); 387 kernel_shutdown_prepare(SYSTEM_HALT);
357 sysdev_shutdown();
358 syscore_shutdown(); 388 syscore_shutdown();
359 printk(KERN_EMERG "System halted.\n"); 389 printk(KERN_EMERG "System halted.\n");
360 kmsg_dump(KMSG_DUMP_HALT); 390 kmsg_dump(KMSG_DUMP_HALT);
@@ -374,7 +404,6 @@ void kernel_power_off(void)
374 if (pm_power_off_prepare) 404 if (pm_power_off_prepare)
375 pm_power_off_prepare(); 405 pm_power_off_prepare();
376 disable_nonboot_cpus(); 406 disable_nonboot_cpus();
377 sysdev_shutdown();
378 syscore_shutdown(); 407 syscore_shutdown();
379 printk(KERN_EMERG "Power down.\n"); 408 printk(KERN_EMERG "Power down.\n");
380 kmsg_dump(KMSG_DUMP_POWEROFF); 409 kmsg_dump(KMSG_DUMP_POWEROFF);
@@ -592,11 +621,18 @@ static int set_user(struct cred *new)
592 if (!new_user) 621 if (!new_user)
593 return -EAGAIN; 622 return -EAGAIN;
594 623
624 /*
625 * We don't fail in case of NPROC limit excess here because too many
626 * poorly written programs don't check set*uid() return code, assuming
627 * it never fails if called by root. We may still enforce NPROC limit
628 * for programs doing set*uid()+execve() by harmlessly deferring the
629 * failure to the execve() stage.
630 */
595 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && 631 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
596 new_user != INIT_USER) { 632 new_user != INIT_USER)
597 free_uid(new_user); 633 current->flags |= PF_NPROC_EXCEEDED;
598 return -EAGAIN; 634 else
599 } 635 current->flags &= ~PF_NPROC_EXCEEDED;
600 636
601 free_uid(new->user); 637 free_uid(new->user);
602 new->user = new_user; 638 new->user = new_user;
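The set_user() comment above defers RLIMIT_NPROC enforcement to execve(); that execve-side hunk is not part of this section. A purely illustrative sketch of what such a deferred check could look like, reusing only the PF_NPROC_EXCEEDED flag introduced here (retval and the label belong to an assumed surrounding function):

	if ((current->flags & PF_NPROC_EXCEEDED) &&
	    atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
		retval = -EAGAIN;	/* fail the execve(), not set*uid() */
		goto out_ret;
	}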
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 25cc41cd8f33..62cbc8877fef 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -46,7 +46,9 @@ cond_syscall(sys_getsockopt);
46cond_syscall(compat_sys_getsockopt); 46cond_syscall(compat_sys_getsockopt);
47cond_syscall(sys_shutdown); 47cond_syscall(sys_shutdown);
48cond_syscall(sys_sendmsg); 48cond_syscall(sys_sendmsg);
49cond_syscall(sys_sendmmsg);
49cond_syscall(compat_sys_sendmsg); 50cond_syscall(compat_sys_sendmsg);
51cond_syscall(compat_sys_sendmmsg);
50cond_syscall(sys_recvmsg); 52cond_syscall(sys_recvmsg);
51cond_syscall(sys_recvmmsg); 53cond_syscall(sys_recvmmsg);
52cond_syscall(compat_sys_recvmsg); 54cond_syscall(compat_sys_recvmsg);
@@ -69,15 +71,22 @@ cond_syscall(compat_sys_epoll_pwait);
69cond_syscall(sys_semget); 71cond_syscall(sys_semget);
70cond_syscall(sys_semop); 72cond_syscall(sys_semop);
71cond_syscall(sys_semtimedop); 73cond_syscall(sys_semtimedop);
74cond_syscall(compat_sys_semtimedop);
72cond_syscall(sys_semctl); 75cond_syscall(sys_semctl);
76cond_syscall(compat_sys_semctl);
73cond_syscall(sys_msgget); 77cond_syscall(sys_msgget);
74cond_syscall(sys_msgsnd); 78cond_syscall(sys_msgsnd);
79cond_syscall(compat_sys_msgsnd);
75cond_syscall(sys_msgrcv); 80cond_syscall(sys_msgrcv);
81cond_syscall(compat_sys_msgrcv);
76cond_syscall(sys_msgctl); 82cond_syscall(sys_msgctl);
83cond_syscall(compat_sys_msgctl);
77cond_syscall(sys_shmget); 84cond_syscall(sys_shmget);
78cond_syscall(sys_shmat); 85cond_syscall(sys_shmat);
86cond_syscall(compat_sys_shmat);
79cond_syscall(sys_shmdt); 87cond_syscall(sys_shmdt);
80cond_syscall(sys_shmctl); 88cond_syscall(sys_shmctl);
89cond_syscall(compat_sys_shmctl);
81cond_syscall(sys_mq_open); 90cond_syscall(sys_mq_open);
82cond_syscall(sys_mq_unlink); 91cond_syscall(sys_mq_unlink);
83cond_syscall(sys_mq_timedsend); 92cond_syscall(sys_mq_timedsend);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c0bb32414b17..11d65b531e50 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -56,6 +56,7 @@
56#include <linux/kprobes.h> 56#include <linux/kprobes.h>
57#include <linux/pipe_fs_i.h> 57#include <linux/pipe_fs_i.h>
58#include <linux/oom.h> 58#include <linux/oom.h>
59#include <linux/kmod.h>
59 60
60#include <asm/uaccess.h> 61#include <asm/uaccess.h>
61#include <asm/processor.h> 62#include <asm/processor.h>
@@ -616,6 +617,11 @@ static struct ctl_table kern_table[] = {
616 .child = random_table, 617 .child = random_table,
617 }, 618 },
618 { 619 {
620 .procname = "usermodehelper",
621 .mode = 0555,
622 .child = usermodehelper_table,
623 },
624 {
619 .procname = "overflowuid", 625 .procname = "overflowuid",
620 .data = &overflowuid, 626 .data = &overflowuid,
621 .maxlen = sizeof(int), 627 .maxlen = sizeof(int),
@@ -730,14 +736,16 @@ static struct ctl_table kern_table[] = {
730 .data = &watchdog_enabled, 736 .data = &watchdog_enabled,
731 .maxlen = sizeof (int), 737 .maxlen = sizeof (int),
732 .mode = 0644, 738 .mode = 0644,
733 .proc_handler = proc_dowatchdog_enabled, 739 .proc_handler = proc_dowatchdog,
740 .extra1 = &zero,
741 .extra2 = &one,
734 }, 742 },
735 { 743 {
736 .procname = "watchdog_thresh", 744 .procname = "watchdog_thresh",
737 .data = &softlockup_thresh, 745 .data = &watchdog_thresh,
738 .maxlen = sizeof(int), 746 .maxlen = sizeof(int),
739 .mode = 0644, 747 .mode = 0644,
740 .proc_handler = proc_dowatchdog_thresh, 748 .proc_handler = proc_dowatchdog,
741 .extra1 = &neg_one, 749 .extra1 = &neg_one,
742 .extra2 = &sixty, 750 .extra2 = &sixty,
743 }, 751 },
@@ -755,7 +763,9 @@ static struct ctl_table kern_table[] = {
755 .data = &watchdog_enabled, 763 .data = &watchdog_enabled,
756 .maxlen = sizeof (int), 764 .maxlen = sizeof (int),
757 .mode = 0644, 765 .mode = 0644,
758 .proc_handler = proc_dowatchdog_enabled, 766 .proc_handler = proc_dowatchdog,
767 .extra1 = &zero,
768 .extra2 = &one,
759 }, 769 },
760#endif 770#endif
761#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 771#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
@@ -928,6 +938,12 @@ static struct ctl_table kern_table[] = {
928 }, 938 },
929#endif 939#endif
930#ifdef CONFIG_PERF_EVENTS 940#ifdef CONFIG_PERF_EVENTS
941 /*
942 * User-space scripts rely on the existence of this file
943 * as a feature check for perf_events being enabled.
944 *
945 * So it's an ABI, do not remove!
946 */
931 { 947 {
932 .procname = "perf_event_paranoid", 948 .procname = "perf_event_paranoid",
933 .data = &sysctl_perf_event_paranoid, 949 .data = &sysctl_perf_event_paranoid,
@@ -1496,7 +1512,7 @@ static struct ctl_table fs_table[] = {
1496 1512
1497static struct ctl_table debug_table[] = { 1513static struct ctl_table debug_table[] = {
1498#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ 1514#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
1499 defined(CONFIG_S390) 1515 defined(CONFIG_S390) || defined(CONFIG_TILE)
1500 { 1516 {
1501 .procname = "exception-trace", 1517 .procname = "exception-trace",
1502 .data = &show_unhandled_signals, 1518 .data = &show_unhandled_signals,
@@ -1574,16 +1590,11 @@ void sysctl_head_get(struct ctl_table_header *head)
1574 spin_unlock(&sysctl_lock); 1590 spin_unlock(&sysctl_lock);
1575} 1591}
1576 1592
1577static void free_head(struct rcu_head *rcu)
1578{
1579 kfree(container_of(rcu, struct ctl_table_header, rcu));
1580}
1581
1582void sysctl_head_put(struct ctl_table_header *head) 1593void sysctl_head_put(struct ctl_table_header *head)
1583{ 1594{
1584 spin_lock(&sysctl_lock); 1595 spin_lock(&sysctl_lock);
1585 if (!--head->count) 1596 if (!--head->count)
1586 call_rcu(&head->rcu, free_head); 1597 kfree_rcu(head, rcu);
1587 spin_unlock(&sysctl_lock); 1598 spin_unlock(&sysctl_lock);
1588} 1599}
1589 1600
@@ -1955,10 +1966,10 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1955 start_unregistering(header); 1966 start_unregistering(header);
1956 if (!--header->parent->count) { 1967 if (!--header->parent->count) {
1957 WARN_ON(1); 1968 WARN_ON(1);
1958 call_rcu(&header->parent->rcu, free_head); 1969 kfree_rcu(header->parent, rcu);
1959 } 1970 }
1960 if (!--header->count) 1971 if (!--header->count)
1961 call_rcu(&header->rcu, free_head); 1972 kfree_rcu(header, rcu);
1962 spin_unlock(&sysctl_lock); 1973 spin_unlock(&sysctl_lock);
1963} 1974}
1964 1975
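Both sysctl hunks above replace an open-coded call_rcu() callback whose only job was kfree() with kfree_rcu(). A minimal sketch of the pattern; the structure and helper are illustrative:

struct example_obj {
	int data;
	struct rcu_head rcu;
};

static void example_obj_release(struct example_obj *obj)
{
	/* Frees obj after an RCU grace period; no hand-written
	 * free_head()-style callback is needed anymore. */
	kfree_rcu(obj, rcu);
}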
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 9ffea360a778..e19ce1454ee1 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -28,7 +28,7 @@
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/file.h> 29#include <linux/file.h>
30#include <net/genetlink.h> 30#include <net/genetlink.h>
31#include <asm/atomic.h> 31#include <linux/atomic.h>
32 32
33/* 33/*
34 * Maximum length of a cpumask that can be specified in 34 * Maximum length of a cpumask that can be specified in
@@ -285,7 +285,7 @@ ret:
285static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) 285static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
286{ 286{
287 struct listener_list *listeners; 287 struct listener_list *listeners;
288 struct listener *s, *tmp; 288 struct listener *s, *tmp, *s2;
289 unsigned int cpu; 289 unsigned int cpu;
290 290
291 if (!cpumask_subset(mask, cpu_possible_mask)) 291 if (!cpumask_subset(mask, cpu_possible_mask))
@@ -293,18 +293,25 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
293 293
294 if (isadd == REGISTER) { 294 if (isadd == REGISTER) {
295 for_each_cpu(cpu, mask) { 295 for_each_cpu(cpu, mask) {
296 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, 296 s = kmalloc_node(sizeof(struct listener),
297 cpu_to_node(cpu)); 297 GFP_KERNEL, cpu_to_node(cpu));
298 if (!s) 298 if (!s)
299 goto cleanup; 299 goto cleanup;
300
300 s->pid = pid; 301 s->pid = pid;
301 INIT_LIST_HEAD(&s->list);
302 s->valid = 1; 302 s->valid = 1;
303 303
304 listeners = &per_cpu(listener_array, cpu); 304 listeners = &per_cpu(listener_array, cpu);
305 down_write(&listeners->sem); 305 down_write(&listeners->sem);
306 list_for_each_entry(s2, &listeners->list, list) {
307 if (s2->pid == pid && s2->valid)
308 goto exists;
309 }
306 list_add(&s->list, &listeners->list); 310 list_add(&s->list, &listeners->list);
311 s = NULL;
312exists:
307 up_write(&listeners->sem); 313 up_write(&listeners->sem);
314 kfree(s); /* nop if NULL */
308 } 315 }
309 return 0; 316 return 0;
310 } 317 }
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index b0425991e9ac..e2fd74b8e8c2 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,5 +1,5 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
2obj-y += timeconv.o posix-clock.o 2obj-y += timeconv.o posix-clock.o alarmtimer.o
3 3
4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
new file mode 100644
index 000000000000..59f369f98a04
--- /dev/null
+++ b/kernel/time/alarmtimer.c
@@ -0,0 +1,720 @@
1/*
2 * Alarmtimer interface
3 *
4 * This interface provides a timer which is similar to hrtimers,
5 * but triggers an RTC alarm if the box is suspended.
6 *
7 * This interface is influenced by the Android RTC Alarm timer
8 * interface.
9 *
10 * Copyright (C) 2010 IBM Corporation
11 *
12 * Author: John Stultz <john.stultz@linaro.org>
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License version 2 as
16 * published by the Free Software Foundation.
17 */
18#include <linux/time.h>
19#include <linux/hrtimer.h>
20#include <linux/timerqueue.h>
21#include <linux/rtc.h>
22#include <linux/alarmtimer.h>
23#include <linux/mutex.h>
24#include <linux/platform_device.h>
25#include <linux/posix-timers.h>
26#include <linux/workqueue.h>
27#include <linux/freezer.h>
28
29/**
30 * struct alarm_base - Alarm timer bases
31 * @lock: Lock for synchronized access to the base
32 * @timerqueue: Timerqueue head managing the list of events
33 * @timer: hrtimer used to schedule events while running
34 * @gettime: Function to read the time correlating to the base
35 * @base_clockid: clockid for the base
36 */
37static struct alarm_base {
38 spinlock_t lock;
39 struct timerqueue_head timerqueue;
40 struct hrtimer timer;
41 ktime_t (*gettime)(void);
42 clockid_t base_clockid;
43} alarm_bases[ALARM_NUMTYPE];
44
45/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */
46static ktime_t freezer_delta;
47static DEFINE_SPINLOCK(freezer_delta_lock);
48
49#ifdef CONFIG_RTC_CLASS
50/* rtc timer and device for setting alarm wakeups at suspend */
51static struct rtc_timer rtctimer;
52static struct rtc_device *rtcdev;
53static DEFINE_SPINLOCK(rtcdev_lock);
54
55/**
56 * has_wakealarm - check rtc device has wakealarm ability
57 * @dev: current device
58 * @name_ptr: name to be returned
59 *
60 * This helper function checks to see if the rtc device can wake
61 * from suspend.
62 */
63static int has_wakealarm(struct device *dev, void *name_ptr)
64{
65 struct rtc_device *candidate = to_rtc_device(dev);
66
67 if (!candidate->ops->set_alarm)
68 return 0;
69 if (!device_may_wakeup(candidate->dev.parent))
70 return 0;
71
72 *(const char **)name_ptr = dev_name(dev);
73 return 1;
74}
75
76/**
77 * alarmtimer_get_rtcdev - Return selected rtcdevice
78 *
79 * This function returns the rtc device to use for wakealarms.
80 * If one has not already been chosen, it checks to see if a
81 * functional rtc device is available.
82 */
83static struct rtc_device *alarmtimer_get_rtcdev(void)
84{
85 struct device *dev;
86 char *str;
87 unsigned long flags;
88 struct rtc_device *ret;
89
90 spin_lock_irqsave(&rtcdev_lock, flags);
91 if (!rtcdev) {
92 /* Find an rtc device and init the rtc_timer */
93 dev = class_find_device(rtc_class, NULL, &str, has_wakealarm);
94 /* If we have a device then str is valid. See has_wakealarm() */
95 if (dev) {
96 rtcdev = rtc_class_open(str);
97 /*
98 * Drop the reference we got in class_find_device,
99 * rtc_open takes its own.
100 */
101 put_device(dev);
102 rtc_timer_init(&rtctimer, NULL, NULL);
103 }
104 }
105 ret = rtcdev;
106 spin_unlock_irqrestore(&rtcdev_lock, flags);
107
108 return ret;
109}
110#else
111#define alarmtimer_get_rtcdev() (0)
112#define rtcdev (0)
113#endif
114
115
116/**
117 * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
118 * @base: pointer to the base where the timer is being run
119 * @alarm: pointer to alarm being enqueued.
120 *
121 * Adds the alarm to an alarm_base timerqueue and if necessary sets
122 * an hrtimer to run.
123 *
124 * Must hold base->lock when calling.
125 */
126static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
127{
128 timerqueue_add(&base->timerqueue, &alarm->node);
129 if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
130 hrtimer_try_to_cancel(&base->timer);
131 hrtimer_start(&base->timer, alarm->node.expires,
132 HRTIMER_MODE_ABS);
133 }
134}
135
136/**
137 * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue
138 * @base: pointer to the base where the timer is running
139 * @alarm: pointer to alarm being removed
140 *
141 * Removes the alarm from an alarm_base timerqueue and if necessary sets
142 * a new timer to run.
143 *
144 * Must hold base->lock when calling.
145 */
146static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
147{
148 struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
149
150 timerqueue_del(&base->timerqueue, &alarm->node);
151 if (next == &alarm->node) {
152 hrtimer_try_to_cancel(&base->timer);
153 next = timerqueue_getnext(&base->timerqueue);
154 if (!next)
155 return;
156 hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS);
157 }
158}
159
160
161/**
162 * alarmtimer_fired - Handles alarm hrtimer being fired.
163 * @timer: pointer to hrtimer being run
164 *
165 * When an alarm timer fires, this runs through the timerqueue to
166 * see which alarms expired, and runs those. If there are more alarm
167 * timers queued for the future, we set the hrtimer to fire when
168 * the next future alarm timer expires.
169 */
170static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
171{
172 struct alarm_base *base = container_of(timer, struct alarm_base, timer);
173 struct timerqueue_node *next;
174 unsigned long flags;
175 ktime_t now;
176 int ret = HRTIMER_NORESTART;
177
178 spin_lock_irqsave(&base->lock, flags);
179 now = base->gettime();
180 while ((next = timerqueue_getnext(&base->timerqueue))) {
181 struct alarm *alarm;
182 ktime_t expired = next->expires;
183
184 if (expired.tv64 >= now.tv64)
185 break;
186
187 alarm = container_of(next, struct alarm, node);
188
189 timerqueue_del(&base->timerqueue, &alarm->node);
190 alarm->enabled = 0;
191 /* Re-add periodic timers */
192 if (alarm->period.tv64) {
193 alarm->node.expires = ktime_add(expired, alarm->period);
194 timerqueue_add(&base->timerqueue, &alarm->node);
195 alarm->enabled = 1;
196 }
197 spin_unlock_irqrestore(&base->lock, flags);
198 if (alarm->function)
199 alarm->function(alarm);
200 spin_lock_irqsave(&base->lock, flags);
201 }
202
203 if (next) {
204 hrtimer_set_expires(&base->timer, next->expires);
205 ret = HRTIMER_RESTART;
206 }
207 spin_unlock_irqrestore(&base->lock, flags);
208
209 return ret;
210
211}
212
213#ifdef CONFIG_RTC_CLASS
214/**
215 * alarmtimer_suspend - Suspend time callback
216 * @dev: unused
217 * @state: unused
218 *
219 * When we are going into suspend, we look through the bases
220 * to see which is the soonest timer to expire. We then
221 * set an rtc timer to fire that far into the future, which
222 * will wake us from suspend.
223 */
224static int alarmtimer_suspend(struct device *dev)
225{
226 struct rtc_time tm;
227 ktime_t min, now;
228 unsigned long flags;
229 struct rtc_device *rtc;
230 int i;
231
232 spin_lock_irqsave(&freezer_delta_lock, flags);
233 min = freezer_delta;
234 freezer_delta = ktime_set(0, 0);
235 spin_unlock_irqrestore(&freezer_delta_lock, flags);
236
237 rtc = rtcdev;
238 /* If we have no rtcdev, just return */
239 if (!rtc)
240 return 0;
241
242 /* Find the soonest timer to expire */
243 for (i = 0; i < ALARM_NUMTYPE; i++) {
244 struct alarm_base *base = &alarm_bases[i];
245 struct timerqueue_node *next;
246 ktime_t delta;
247
248 spin_lock_irqsave(&base->lock, flags);
249 next = timerqueue_getnext(&base->timerqueue);
250 spin_unlock_irqrestore(&base->lock, flags);
251 if (!next)
252 continue;
253 delta = ktime_sub(next->expires, base->gettime());
254 if (!min.tv64 || (delta.tv64 < min.tv64))
255 min = delta;
256 }
257 if (min.tv64 == 0)
258 return 0;
259
260 /* XXX - Should we enforce a minimum sleep time? */
261 WARN_ON(min.tv64 < NSEC_PER_SEC);
262
263 /* Setup an rtc timer to fire that far in the future */
264 rtc_timer_cancel(rtc, &rtctimer);
265 rtc_read_time(rtc, &tm);
266 now = rtc_tm_to_ktime(tm);
267 now = ktime_add(now, min);
268
269 rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
270
271 return 0;
272}
273#else
274static int alarmtimer_suspend(struct device *dev)
275{
276 return 0;
277}
278#endif
279
280static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
281{
282 ktime_t delta;
283 unsigned long flags;
284 struct alarm_base *base = &alarm_bases[type];
285
286 delta = ktime_sub(absexp, base->gettime());
287
288 spin_lock_irqsave(&freezer_delta_lock, flags);
289 if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64))
290 freezer_delta = delta;
291 spin_unlock_irqrestore(&freezer_delta_lock, flags);
292}
293
294
295/**
296 * alarm_init - Initialize an alarm structure
297 * @alarm: ptr to alarm to be initialized
298 * @type: the type of the alarm
299 * @function: callback that is run when the alarm fires
300 */
301void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
302 void (*function)(struct alarm *))
303{
304 timerqueue_init(&alarm->node);
305 alarm->period = ktime_set(0, 0);
306 alarm->function = function;
307 alarm->type = type;
308 alarm->enabled = 0;
309}
310
311/**
312 * alarm_start - Sets an alarm to fire
313 * @alarm: ptr to alarm to set
314 * @start: time to run the alarm
315 * @period: period at which the alarm will recur
316 */
317void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period)
318{
319 struct alarm_base *base = &alarm_bases[alarm->type];
320 unsigned long flags;
321
322 spin_lock_irqsave(&base->lock, flags);
323 if (alarm->enabled)
324 alarmtimer_remove(base, alarm);
325 alarm->node.expires = start;
326 alarm->period = period;
327 alarmtimer_enqueue(base, alarm);
328 alarm->enabled = 1;
329 spin_unlock_irqrestore(&base->lock, flags);
330}
331
332/**
333 * alarm_cancel - Tries to cancel an alarm timer
334 * @alarm: ptr to alarm to be canceled
335 */
336void alarm_cancel(struct alarm *alarm)
337{
338 struct alarm_base *base = &alarm_bases[alarm->type];
339 unsigned long flags;
340
341 spin_lock_irqsave(&base->lock, flags);
342 if (alarm->enabled)
343 alarmtimer_remove(base, alarm);
344 alarm->enabled = 0;
345 spin_unlock_irqrestore(&base->lock, flags);
346}
347
348
349/**
350 * clock2alarm - helper that converts from clockid to alarmtypes
351 * @clockid: clockid.
352 */
353static enum alarmtimer_type clock2alarm(clockid_t clockid)
354{
355 if (clockid == CLOCK_REALTIME_ALARM)
356 return ALARM_REALTIME;
357 if (clockid == CLOCK_BOOTTIME_ALARM)
358 return ALARM_BOOTTIME;
359 return -1;
360}
361
362/**
363 * alarm_handle_timer - Callback for posix timers
364 * @alarm: alarm that fired
365 *
366 * Posix timer callback for expired alarm timers.
367 */
368static void alarm_handle_timer(struct alarm *alarm)
369{
370 struct k_itimer *ptr = container_of(alarm, struct k_itimer,
371 it.alarmtimer);
372 if (posix_timer_event(ptr, 0) != 0)
373 ptr->it_overrun++;
374}
375
376/**
377 * alarm_clock_getres - posix getres interface
378 * @which_clock: clockid
379 * @tp: timespec to fill
380 *
381 * Returns the granularity of underlying alarm base clock
382 */
383static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
384{
385 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
386
387 if (!alarmtimer_get_rtcdev())
388 return -ENOTSUPP;
389
390 return hrtimer_get_res(baseid, tp);
391}
392
393/**
394 * alarm_clock_get - posix clock_get interface
395 * @which_clock: clockid
396 * @tp: timespec to fill.
397 *
398 * Provides the underlying alarm base time.
399 */
400static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
401{
402 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
403
404 if (!alarmtimer_get_rtcdev())
405 return -ENOTSUPP;
406
407 *tp = ktime_to_timespec(base->gettime());
408 return 0;
409}
410
411/**
412 * alarm_timer_create - posix timer_create interface
413 * @new_timer: k_itimer pointer to manage
414 *
415 * Initializes the k_itimer structure.
416 */
417static int alarm_timer_create(struct k_itimer *new_timer)
418{
419 enum alarmtimer_type type;
420 struct alarm_base *base;
421
422 if (!alarmtimer_get_rtcdev())
423 return -ENOTSUPP;
424
425 if (!capable(CAP_WAKE_ALARM))
426 return -EPERM;
427
428 type = clock2alarm(new_timer->it_clock);
429 base = &alarm_bases[type];
430 alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer);
431 return 0;
432}
433
434/**
435 * alarm_timer_get - posix timer_get interface
436 * @timr: k_itimer pointer
437 * @cur_setting: itimerspec data to fill
438 *
439 * Copies the itimerspec data out from the k_itimer
440 */
441static void alarm_timer_get(struct k_itimer *timr,
442 struct itimerspec *cur_setting)
443{
444 cur_setting->it_interval =
445 ktime_to_timespec(timr->it.alarmtimer.period);
446 cur_setting->it_value =
447 ktime_to_timespec(timr->it.alarmtimer.node.expires);
448 return;
449}
450
451/**
452 * alarm_timer_del - posix timer_del interface
453 * @timr: k_itimer pointer to be deleted
454 *
455 * Cancels any programmed alarms for the given timer.
456 */
457static int alarm_timer_del(struct k_itimer *timr)
458{
459 if (!rtcdev)
460 return -ENOTSUPP;
461
462 alarm_cancel(&timr->it.alarmtimer);
463 return 0;
464}
465
466/**
467 * alarm_timer_set - posix timer_set interface
468 * @timr: k_itimer pointer to be set
469 * @flags: timer flags
470 * @new_setting: itimerspec to be used
471 * @old_setting: itimerspec being replaced
472 *
473 * Sets the timer to new_setting, and starts the timer.
474 */
475static int alarm_timer_set(struct k_itimer *timr, int flags,
476 struct itimerspec *new_setting,
477 struct itimerspec *old_setting)
478{
479 if (!rtcdev)
480 return -ENOTSUPP;
481
482 /* Save old values */
483 old_setting->it_interval =
484 ktime_to_timespec(timr->it.alarmtimer.period);
485 old_setting->it_value =
486 ktime_to_timespec(timr->it.alarmtimer.node.expires);
487
488 /* If the timer was already set, cancel it */
489 alarm_cancel(&timr->it.alarmtimer);
490
491 /* start the timer */
492 alarm_start(&timr->it.alarmtimer,
493 timespec_to_ktime(new_setting->it_value),
494 timespec_to_ktime(new_setting->it_interval));
495 return 0;
496}
497
498/**
499 * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep
500 * @alarm: ptr to alarm that fired
501 *
502 * Wakes up the task that set the alarmtimer
503 */
504static void alarmtimer_nsleep_wakeup(struct alarm *alarm)
505{
506 struct task_struct *task = (struct task_struct *)alarm->data;
507
508 alarm->data = NULL;
509 if (task)
510 wake_up_process(task);
511}
512
513/**
514 * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation
515 * @alarm: ptr to alarmtimer
516 * @absexp: absolute expiration time
517 *
518 * Sets the alarm timer and sleeps until it is fired or interrupted.
519 */
520static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp)
521{
522 alarm->data = (void *)current;
523 do {
524 set_current_state(TASK_INTERRUPTIBLE);
525 alarm_start(alarm, absexp, ktime_set(0, 0));
526 if (likely(alarm->data))
527 schedule();
528
529 alarm_cancel(alarm);
530 } while (alarm->data && !signal_pending(current));
531
532 __set_current_state(TASK_RUNNING);
533
534 return (alarm->data == NULL);
535}
536
537
538/**
539 * update_rmtp - Update remaining timespec value
540 * @exp: expiration time
541 * @type: timer type
542 * @rmtp: user pointer to remaining timespec value
543 *
544 * Helper function that fills in rmtp value with time between
545 * now and the exp value
546 */
547static int update_rmtp(ktime_t exp, enum alarmtimer_type type,
548 struct timespec __user *rmtp)
549{
550 struct timespec rmt;
551 ktime_t rem;
552
553 rem = ktime_sub(exp, alarm_bases[type].gettime());
554
555 if (rem.tv64 <= 0)
556 return 0;
557 rmt = ktime_to_timespec(rem);
558
559 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
560 return -EFAULT;
561
562 return 1;
563
564}
565
566/**
567 * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep
568 * @restart: ptr to restart block
569 *
570 * Handles restarted clock_nanosleep calls
571 */
572static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
573{
574 enum alarmtimer_type type = restart->nanosleep.clockid;
575 ktime_t exp;
576 struct timespec __user *rmtp;
577 struct alarm alarm;
578 int ret = 0;
579
580 exp.tv64 = restart->nanosleep.expires;
581 alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
582
583 if (alarmtimer_do_nsleep(&alarm, exp))
584 goto out;
585
586 if (freezing(current))
587 alarmtimer_freezerset(exp, type);
588
589 rmtp = restart->nanosleep.rmtp;
590 if (rmtp) {
591 ret = update_rmtp(exp, type, rmtp);
592 if (ret <= 0)
593 goto out;
594 }
595
596
597 /* The other values in restart are already filled in */
598 ret = -ERESTART_RESTARTBLOCK;
599out:
600 return ret;
601}
602
603/**
604 * alarm_timer_nsleep - alarmtimer nanosleep
605 * @which_clock: clockid
606 * @flags: determines abstime or relative
607 * @tsreq: requested sleep time (abs or rel)
608 * @rmtp: remaining sleep time saved
609 *
610 * Handles clock_nanosleep calls against _ALARM clockids
611 */
612static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
613 struct timespec *tsreq, struct timespec __user *rmtp)
614{
615 enum alarmtimer_type type = clock2alarm(which_clock);
616 struct alarm alarm;
617 ktime_t exp;
618 int ret = 0;
619 struct restart_block *restart;
620
621 if (!alarmtimer_get_rtcdev())
622 return -ENOTSUPP;
623
624 if (!capable(CAP_WAKE_ALARM))
625 return -EPERM;
626
627 alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
628
629 exp = timespec_to_ktime(*tsreq);
630 /* Convert (if necessary) to absolute time */
631 if (flags != TIMER_ABSTIME) {
632 ktime_t now = alarm_bases[type].gettime();
633 exp = ktime_add(now, exp);
634 }
635
636 if (alarmtimer_do_nsleep(&alarm, exp))
637 goto out;
638
639 if (freezing(current))
640 alarmtimer_freezerset(exp, type);
641
642 /* abs timers don't set remaining time or restart */
643 if (flags == TIMER_ABSTIME) {
644 ret = -ERESTARTNOHAND;
645 goto out;
646 }
647
648 if (rmtp) {
649 ret = update_rmtp(exp, type, rmtp);
650 if (ret <= 0)
651 goto out;
652 }
653
654 restart = &current_thread_info()->restart_block;
655 restart->fn = alarm_timer_nsleep_restart;
656 restart->nanosleep.clockid = type;
657 restart->nanosleep.expires = exp.tv64;
658 restart->nanosleep.rmtp = rmtp;
659 ret = -ERESTART_RESTARTBLOCK;
660
661out:
662 return ret;
663}
664
665
666/* Suspend hook structures */
667static const struct dev_pm_ops alarmtimer_pm_ops = {
668 .suspend = alarmtimer_suspend,
669};
670
671static struct platform_driver alarmtimer_driver = {
672 .driver = {
673 .name = "alarmtimer",
674 .pm = &alarmtimer_pm_ops,
675 }
676};
677
678/**
679 * alarmtimer_init - Initialize alarm timer code
680 *
681 * This function initializes the alarm bases and registers
682 * the posix clock ids.
683 */
684static int __init alarmtimer_init(void)
685{
686 int error = 0;
687 int i;
688 struct k_clock alarm_clock = {
689 .clock_getres = alarm_clock_getres,
690 .clock_get = alarm_clock_get,
691 .timer_create = alarm_timer_create,
692 .timer_set = alarm_timer_set,
693 .timer_del = alarm_timer_del,
694 .timer_get = alarm_timer_get,
695 .nsleep = alarm_timer_nsleep,
696 };
697
698 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
699 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
700
701 /* Initialize alarm bases */
702 alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
703 alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real;
704 alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
705 alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime;
706 for (i = 0; i < ALARM_NUMTYPE; i++) {
707 timerqueue_init_head(&alarm_bases[i].timerqueue);
708 spin_lock_init(&alarm_bases[i].lock);
709 hrtimer_init(&alarm_bases[i].timer,
710 alarm_bases[i].base_clockid,
711 HRTIMER_MODE_ABS);
712 alarm_bases[i].timer.function = alarmtimer_fired;
713 }
714 error = platform_driver_register(&alarmtimer_driver);
715 platform_device_register_simple("alarmtimer", -1, NULL, 0);
716
717 return error;
718}
719device_initcall(alarmtimer_init);
720
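Besides the posix-timer plumbing, the new file exposes alarm_init(), alarm_start() and alarm_cancel() as an in-kernel interface. A hedged usage sketch; the names and the 30s/60s values are made up for illustration:

static struct alarm example_alarm;

static void example_alarm_fired(struct alarm *a)
{
	pr_info("example alarmtimer fired\n");
}

static void example_alarm_arm(void)
{
	ktime_t now = ktime_get_real();

	alarm_init(&example_alarm, ALARM_REALTIME, example_alarm_fired);
	/* First expiry 30s from now, then every 60s; if the box suspends,
	 * alarmtimer_suspend() above programs the RTC to wake it up. */
	alarm_start(&example_alarm, ktime_add(now, ktime_set(30, 0)),
		    ktime_set(60, 0));
}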
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 0d74b9ba90c8..e4c699dfa4e8 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -182,7 +182,10 @@ void clockevents_register_device(struct clock_event_device *dev)
182 unsigned long flags; 182 unsigned long flags;
183 183
184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
185 BUG_ON(!dev->cpumask); 185 if (!dev->cpumask) {
186 WARN_ON(num_possible_cpus() > 1);
187 dev->cpumask = cpumask_of(smp_processor_id());
188 }
186 189
187 raw_spin_lock_irqsave(&clockevents_lock, flags); 190 raw_spin_lock_irqsave(&clockevents_lock, flags);
188 191
@@ -194,6 +197,70 @@ void clockevents_register_device(struct clock_event_device *dev)
194} 197}
195EXPORT_SYMBOL_GPL(clockevents_register_device); 198EXPORT_SYMBOL_GPL(clockevents_register_device);
196 199
200static void clockevents_config(struct clock_event_device *dev,
201 u32 freq)
202{
203 u64 sec;
204
205 if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
206 return;
207
208 /*
209 * Calculate the maximum number of seconds we can sleep. Limit
210 * to 10 minutes for hardware which can program more than
211 * 32bit ticks so we still get reasonable conversion values.
212 */
213 sec = dev->max_delta_ticks;
214 do_div(sec, freq);
215 if (!sec)
216 sec = 1;
217 else if (sec > 600 && dev->max_delta_ticks > UINT_MAX)
218 sec = 600;
219
220 clockevents_calc_mult_shift(dev, freq, sec);
221 dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev);
222 dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev);
223}
224
225/**
226 * clockevents_config_and_register - Configure and register a clock event device
227 * @dev: device to register
228 * @freq: The clock frequency
229 * @min_delta: The minimum clock ticks to program in oneshot mode
230 * @max_delta: The maximum clock ticks to program in oneshot mode
231 *
232 * min/max_delta can be 0 for devices which do not support oneshot mode.
233 */
234void clockevents_config_and_register(struct clock_event_device *dev,
235 u32 freq, unsigned long min_delta,
236 unsigned long max_delta)
237{
238 dev->min_delta_ticks = min_delta;
239 dev->max_delta_ticks = max_delta;
240 clockevents_config(dev, freq);
241 clockevents_register_device(dev);
242}
243
244/**
245 * clockevents_update_freq - Update frequency and reprogram a clock event device.
246 * @dev: device to modify
247 * @freq: new device frequency
248 *
249 * Reconfigure and reprogram a clock event device in oneshot
250 * mode. Must be called on the cpu for which the device delivers per
251 * cpu timer events with interrupts disabled! Returns 0 on success,
252 * -ETIME when the event is in the past.
253 */
254int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
255{
256 clockevents_config(dev, freq);
257
258 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
259 return 0;
260
261 return clockevents_program_event(dev, dev->next_event, ktime_get());
262}
263
197/* 264/*
198 * Noop handler when we shut down an event device 265 * Noop handler when we shut down an event device
199 */ 266 */
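clockevents_config_and_register() above folds the mult/shift configuration into registration. A hedged sketch of a timer driver using it; the device name, frequency and delta limits are assumptions, and the two callbacks stand in for the driver's own implementations:

static int example_set_next_event(unsigned long delta,
				  struct clock_event_device *evt)
{
	/* program the hardware comparator 'delta' ticks ahead */
	return 0;
}

static void example_set_mode(enum clock_event_mode mode,
			     struct clock_event_device *evt)
{
	/* switch the hardware between periodic/oneshot/shutdown */
}

static struct clock_event_device example_clockevent = {
	.name		= "example-timer",
	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.rating		= 300,
	.set_next_event	= example_set_next_event,
	.set_mode	= example_set_mode,
};

static void __init example_timer_init(void)
{
	/* 32.768 kHz input, programmable between 0xf and 0x7fffffff ticks.
	 * On SMP a real driver would also set .cpumask before this call. */
	clockevents_config_and_register(&example_clockevent, 32768,
					0xf, 0x7fffffff);
}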
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 6519cf62d9cd..e0980f0d9a0a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -185,7 +185,6 @@ static struct clocksource *watchdog;
185static struct timer_list watchdog_timer; 185static struct timer_list watchdog_timer;
186static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); 186static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
187static DEFINE_SPINLOCK(watchdog_lock); 187static DEFINE_SPINLOCK(watchdog_lock);
188static cycle_t watchdog_last;
189static int watchdog_running; 188static int watchdog_running;
190 189
191static int clocksource_watchdog_kthread(void *data); 190static int clocksource_watchdog_kthread(void *data);
@@ -254,11 +253,6 @@ static void clocksource_watchdog(unsigned long data)
254 if (!watchdog_running) 253 if (!watchdog_running)
255 goto out; 254 goto out;
256 255
257 wdnow = watchdog->read(watchdog);
258 wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
259 watchdog->mult, watchdog->shift);
260 watchdog_last = wdnow;
261
262 list_for_each_entry(cs, &watchdog_list, wd_list) { 256 list_for_each_entry(cs, &watchdog_list, wd_list) {
263 257
264 /* Clocksource already marked unstable? */ 258 /* Clocksource already marked unstable? */
@@ -268,19 +262,28 @@ static void clocksource_watchdog(unsigned long data)
268 continue; 262 continue;
269 } 263 }
270 264
265 local_irq_disable();
271 csnow = cs->read(cs); 266 csnow = cs->read(cs);
267 wdnow = watchdog->read(watchdog);
268 local_irq_enable();
272 269
273 /* Clocksource initialized ? */ 270 /* Clocksource initialized ? */
274 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 271 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
275 cs->flags |= CLOCK_SOURCE_WATCHDOG; 272 cs->flags |= CLOCK_SOURCE_WATCHDOG;
276 cs->wd_last = csnow; 273 cs->wd_last = wdnow;
274 cs->cs_last = csnow;
277 continue; 275 continue;
278 } 276 }
279 277
280 /* Check the deviation from the watchdog clocksource. */ 278 wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask,
281 cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) & 279 watchdog->mult, watchdog->shift);
280
281 cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) &
282 cs->mask, cs->mult, cs->shift); 282 cs->mask, cs->mult, cs->shift);
283 cs->wd_last = csnow; 283 cs->cs_last = csnow;
284 cs->wd_last = wdnow;
285
286 /* Check the deviation from the watchdog clocksource. */
284 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { 287 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
285 clocksource_unstable(cs, cs_nsec - wd_nsec); 288 clocksource_unstable(cs, cs_nsec - wd_nsec);
286 continue; 289 continue;
@@ -318,7 +321,6 @@ static inline void clocksource_start_watchdog(void)
318 return; 321 return;
319 init_timer(&watchdog_timer); 322 init_timer(&watchdog_timer);
320 watchdog_timer.function = clocksource_watchdog; 323 watchdog_timer.function = clocksource_watchdog;
321 watchdog_last = watchdog->read(watchdog);
322 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; 324 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
323 add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); 325 add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
324 watchdog_running = 1; 326 watchdog_running = 1;
@@ -626,19 +628,6 @@ static void clocksource_enqueue(struct clocksource *cs)
626 list_add(&cs->list, entry); 628 list_add(&cs->list, entry);
627} 629}
628 630
629
630/*
631 * Maximum time we expect to go between ticks. This includes idle
632 * tickless time. It provides the trade off between selecting a
633 * mult/shift pair that is very precise but can only handle a short
634 * period of time, vs. a mult/shift pair that can handle long periods
635 * of time but isn't as precise.
636 *
637 * This is a subsystem constant, and actual hardware limitations
638 * may override it (ie: clocksources that wrap every 3 seconds).
639 */
640#define MAX_UPDATE_LENGTH 5 /* Seconds */
641
642/** 631/**
643 * __clocksource_updatefreq_scale - Used update clocksource with new freq 632 * __clocksource_updatefreq_scale - Used update clocksource with new freq
644 * @t: clocksource to be registered 633 * @t: clocksource to be registered
@@ -652,15 +641,28 @@ static void clocksource_enqueue(struct clocksource *cs)
652 */ 641 */
653void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) 642void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
654{ 643{
644 u64 sec;
645
655 /* 646 /*
656 * Ideally we want to use some of the limits used in 647 * Calc the maximum number of seconds which we can run before
657 * clocksource_max_deferment, to provide a more informed 648 * wrapping around. For clocksources which have a mask > 32bit
658 * MAX_UPDATE_LENGTH. But for now this just gets the 649 * we need to limit the max sleep time to have a good
659 * register interface working properly. 650 * conversion precision. 10 minutes is still a reasonable
651 * amount. That results in a shift value of 24 for a
652 * clocksource with mask >= 40bit and f >= 4GHz. That maps to
653 * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
654 * margin as we do in clocksource_max_deferment()
660 */ 655 */
656 sec = (cs->mask - (cs->mask >> 5));
657 do_div(sec, freq);
658 do_div(sec, scale);
659 if (!sec)
660 sec = 1;
661 else if (sec > 600 && cs->mask > UINT_MAX)
662 sec = 600;
663
661 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, 664 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
662 NSEC_PER_SEC/scale, 665 NSEC_PER_SEC / scale, sec * scale);
663 MAX_UPDATE_LENGTH*scale);
664 cs->max_idle_ns = clocksource_max_deferment(cs); 666 cs->max_idle_ns = clocksource_max_deferment(cs);
665} 667}
666EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); 668EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
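To make the new limit calculation concrete, a worked example with assumed hardware numbers (not taken from the patch): a free-running 32-bit counter at 24 MHz registered with scale = 1.

	/* sec = (0xffffffff - 0xffffffff/32) / 24000000 / 1  ~= 173
	 * The mask fits in 32 bits, so the 600 second clamp does not
	 * apply; clocks_calc_mult_shift() is then asked to cover about
	 * 173 seconds (mask >> 5 safety margin already subtracted)
	 * without multiplication overflow. */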
@@ -685,8 +687,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
685 /* Add clocksource to the clocksource list */ 687 /* Add clocksource to the clocksource list */
686 mutex_lock(&clocksource_mutex); 688 mutex_lock(&clocksource_mutex);
687 clocksource_enqueue(cs); 689 clocksource_enqueue(cs);
688 clocksource_select();
689 clocksource_enqueue_watchdog(cs); 690 clocksource_enqueue_watchdog(cs);
691 clocksource_select();
690 mutex_unlock(&clocksource_mutex); 692 mutex_unlock(&clocksource_mutex);
691 return 0; 693 return 0;
692} 694}
@@ -706,8 +708,8 @@ int clocksource_register(struct clocksource *cs)
706 708
707 mutex_lock(&clocksource_mutex); 709 mutex_lock(&clocksource_mutex);
708 clocksource_enqueue(cs); 710 clocksource_enqueue(cs);
709 clocksource_select();
710 clocksource_enqueue_watchdog(cs); 711 clocksource_enqueue_watchdog(cs);
712 clocksource_select();
711 mutex_unlock(&clocksource_mutex); 713 mutex_unlock(&clocksource_mutex);
712 return 0; 714 return 0;
713} 715}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index da800ffa810c..c7218d132738 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -456,23 +456,27 @@ void tick_broadcast_oneshot_control(unsigned long reason)
456 unsigned long flags; 456 unsigned long flags;
457 int cpu; 457 int cpu;
458 458
459 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
460
461 /* 459 /*
462 * Periodic mode does not care about the enter/exit of power 460 * Periodic mode does not care about the enter/exit of power
463 * states 461 * states
464 */ 462 */
465 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) 463 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
466 goto out; 464 return;
467 465
468 bc = tick_broadcast_device.evtdev; 466 /*
467 * We are called with preemption disabled from the depth of the
468 * idle code, so we can't be moved away.
469 */
469 cpu = smp_processor_id(); 470 cpu = smp_processor_id();
470 td = &per_cpu(tick_cpu_device, cpu); 471 td = &per_cpu(tick_cpu_device, cpu);
471 dev = td->evtdev; 472 dev = td->evtdev;
472 473
473 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) 474 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
474 goto out; 475 return;
475 476
477 bc = tick_broadcast_device.evtdev;
478
479 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
476 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { 480 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
477 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { 481 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
478 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); 482 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
@@ -489,8 +493,6 @@ void tick_broadcast_oneshot_control(unsigned long reason)
489 tick_program_event(dev->next_event, 1); 493 tick_program_event(dev->next_event, 1);
490 } 494 }
491 } 495 }
492
493out:
494 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 496 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
495} 497}
496 498
@@ -522,10 +524,11 @@ static void tick_broadcast_init_next_event(struct cpumask *mask,
522 */ 524 */
523void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 525void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
524{ 526{
527 int cpu = smp_processor_id();
528
525 /* Set it up only once ! */ 529 /* Set it up only once ! */
526 if (bc->event_handler != tick_handle_oneshot_broadcast) { 530 if (bc->event_handler != tick_handle_oneshot_broadcast) {
527 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; 531 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
528 int cpu = smp_processor_id();
529 532
530 bc->event_handler = tick_handle_oneshot_broadcast; 533 bc->event_handler = tick_handle_oneshot_broadcast;
531 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 534 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
@@ -551,6 +554,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
551 tick_broadcast_set_event(tick_next_period, 1); 554 tick_broadcast_set_event(tick_next_period, 1);
552 } else 555 } else
553 bc->next_event.tv64 = KTIME_MAX; 556 bc->next_event.tv64 = KTIME_MAX;
557 } else {
558 /*
559 * The first cpu which switches to oneshot mode sets
560 * the bit for all other cpus which are in the general
561 * (periodic) broadcast mask. So the bit is set and
562 * would prevent the first broadcast enter after this
563 * to program the bc device.
564 */
565 tick_broadcast_clear_oneshot(cpu);
554 } 566 }
555} 567}
556 568
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 8ad5d576755e..2b021b0e8507 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -596,6 +596,64 @@ void __init timekeeping_init(void)
596static struct timespec timekeeping_suspend_time; 596static struct timespec timekeeping_suspend_time;
597 597
598/** 598/**
599 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
600 * @delta: pointer to a timespec delta value
601 *
602 * Takes a timespec offset measuring a suspend interval and properly
603 * adds the sleep offset to the timekeeping variables.
604 */
605static void __timekeeping_inject_sleeptime(struct timespec *delta)
606{
607 if (!timespec_valid(delta)) {
608 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
609 "sleep delta value!\n");
610 return;
611 }
612
613 xtime = timespec_add(xtime, *delta);
614 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta);
615 total_sleep_time = timespec_add(total_sleep_time, *delta);
616}
617
618
619/**
620 * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values
621 * @delta: pointer to a timespec delta value
622 *
623 * This hook is for architectures that cannot support read_persistent_clock
624 * because their RTC/persistent clock is only accessible when irqs are enabled.
625 *
626 * This function should only be called by rtc_resume(), and allows
627 * a suspend offset to be injected into the timekeeping values.
628 */
629void timekeeping_inject_sleeptime(struct timespec *delta)
630{
631 unsigned long flags;
632 struct timespec ts;
633
634 /* Make sure we don't set the clock twice */
635 read_persistent_clock(&ts);
636 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
637 return;
638
639 write_seqlock_irqsave(&xtime_lock, flags);
640 timekeeping_forward_now();
641
642 __timekeeping_inject_sleeptime(delta);
643
644 timekeeper.ntp_error = 0;
645 ntp_clear();
646 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
647 timekeeper.mult);
648
649 write_sequnlock_irqrestore(&xtime_lock, flags);
650
651 /* signal hrtimers about time change */
652 clock_was_set();
653}
654
655
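timekeeping_inject_sleeptime() is intended to be driven from rtc_resume(). A hedged sketch of that calling pattern; it is simplified, and example_old_rtc is assumed to have been sampled in the matching suspend hook:

static struct timespec example_old_rtc;	/* saved by the suspend hook */

static void example_rtc_resume_inject(struct timespec *new_rtc)
{
	struct timespec sleep = timespec_sub(*new_rtc, example_old_rtc);

	/* Skip if the RTC appears to have gone backwards. */
	if (sleep.tv_sec >= 0)
		timekeeping_inject_sleeptime(&sleep);
}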
656/**
599 * timekeeping_resume - Resumes the generic timekeeping subsystem. 657 * timekeeping_resume - Resumes the generic timekeeping subsystem.
600 * 658 *
601 * This is for the generic clocksource timekeeping. 659 * This is for the generic clocksource timekeeping.
@@ -615,9 +673,7 @@ static void timekeeping_resume(void)
615 673
616 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 674 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
617 ts = timespec_sub(ts, timekeeping_suspend_time); 675 ts = timespec_sub(ts, timekeeping_suspend_time);
618 xtime = timespec_add(xtime, ts); 676 __timekeeping_inject_sleeptime(&ts);
619 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
620 total_sleep_time = timespec_add(total_sleep_time, ts);
621 } 677 }
622 /* re-base the last cycle value */ 678 /* re-base the last cycle value */
623 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 679 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
@@ -630,18 +686,40 @@ static void timekeeping_resume(void)
630 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); 686 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
631 687
632 /* Resume hrtimers */ 688 /* Resume hrtimers */
633 hres_timers_resume(); 689 hrtimers_resume();
634} 690}
635 691
636static int timekeeping_suspend(void) 692static int timekeeping_suspend(void)
637{ 693{
638 unsigned long flags; 694 unsigned long flags;
695 struct timespec delta, delta_delta;
696 static struct timespec old_delta;
639 697
640 read_persistent_clock(&timekeeping_suspend_time); 698 read_persistent_clock(&timekeeping_suspend_time);
641 699
642 write_seqlock_irqsave(&xtime_lock, flags); 700 write_seqlock_irqsave(&xtime_lock, flags);
643 timekeeping_forward_now(); 701 timekeeping_forward_now();
644 timekeeping_suspended = 1; 702 timekeeping_suspended = 1;
703
704 /*
705 * To avoid drift caused by repeated suspend/resumes,
706 * each of which can add ~1 second of drift error,
707 * try to compensate so the difference in system time
708 * and persistent_clock time stays close to constant.
709 */
710 delta = timespec_sub(xtime, timekeeping_suspend_time);
711 delta_delta = timespec_sub(delta, old_delta);
712 if (abs(delta_delta.tv_sec) >= 2) {
713 /*
714 * if delta_delta is too large, assume time correction
715 * has occured and set old_delta to the current delta.
716 */
717 old_delta = delta;
718 } else {
719 /* Otherwise adjust timekeeping_suspend_time to compensate */
720 timekeeping_suspend_time =
721 timespec_add(timekeeping_suspend_time, delta_delta);
722 }
645 write_sequnlock_irqrestore(&xtime_lock, flags); 723 write_sequnlock_irqrestore(&xtime_lock, flags);
646 724
647 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 725 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
@@ -1049,6 +1127,21 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1049} 1127}
1050 1128
1051/** 1129/**
1130 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
1131 */
1132ktime_t ktime_get_monotonic_offset(void)
1133{
1134 unsigned long seq;
1135 struct timespec wtom;
1136
1137 do {
1138 seq = read_seqbegin(&xtime_lock);
1139 wtom = wall_to_monotonic;
1140 } while (read_seqretry(&xtime_lock, seq));
1141 return timespec_to_ktime(wtom);
1142}
1143
1144/**
1052 * xtime_update() - advances the timekeeping infrastructure 1145 * xtime_update() - advances the timekeeping infrastructure
1053 * @ticks: number of ticks, that have elapsed since the last call. 1146 * @ticks: number of ticks, that have elapsed since the last call.
1054 * 1147 *
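The timekeeping.c changes above do two related things: timekeeping_inject_sleeptime() lets rtc_resume() feed a suspend interval into xtime/wall_to_monotonic on machines whose persistent clock is unreadable with irqs disabled, and timekeeping_suspend() now nudges timekeeping_suspend_time by delta_delta so that the system-time-versus-persistent-clock offset stays roughly constant across repeated suspend cycles (each cycle can otherwise contribute about a second of error, per the comment in the hunk). The stand-alone sketch below models only that compensation step; it is plain user-space C with doubles standing in for struct timespec, and the names are illustrative, not kernel API.

#include <math.h>
#include <stdio.h>

static double old_delta;        /* persists across cycles, like the static in the patch */

/*
 * Model of the compensation in timekeeping_suspend(): returns the
 * suspend timestamp to remember.  On resume the sleep interval is
 * persistent_now - <returned value>, so folding a small delta_delta
 * into it cancels the jitter of the persistent clock.
 */
static double compensate_suspend(double system_time, double persistent_time)
{
        double delta = system_time - persistent_time;
        double delta_delta = delta - old_delta;

        if (fabs(delta_delta) >= 2.0) {
                /* big jump: assume the clock was deliberately stepped */
                old_delta = delta;
                return persistent_time;
        }
        /* small jitter: keep the system-vs-persistent offset constant */
        return persistent_time + delta_delta;
}

int main(void)
{
        printf("%.1f\n", compensate_suspend(1000.0, 990.0));   /* 990.0: first cycle, records old_delta */
        printf("%.1f\n", compensate_suspend(1060.0, 1049.6));  /* 1050.0: 0.4s of jitter absorbed */
        return 0;
}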
diff --git a/kernel/timer.c b/kernel/timer.c
index fd6198692b57..8cff36119e4d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -749,16 +749,15 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
749 unsigned long expires_limit, mask; 749 unsigned long expires_limit, mask;
750 int bit; 750 int bit;
751 751
752 expires_limit = expires;
753
754 if (timer->slack >= 0) { 752 if (timer->slack >= 0) {
755 expires_limit = expires + timer->slack; 753 expires_limit = expires + timer->slack;
756 } else { 754 } else {
757 unsigned long now = jiffies; 755 long delta = expires - jiffies;
756
757 if (delta < 256)
758 return expires;
758 759
759 /* No slack, if already expired else auto slack 0.4% */ 760 expires_limit = expires + delta / 256;
760 if (time_after(expires, now))
761 expires_limit = expires + (expires - now)/256;
762 } 761 }
763 mask = expires ^ expires_limit; 762 mask = expires ^ expires_limit;
764 if (mask == 0) 763 if (mask == 0)
@@ -795,6 +794,8 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
795 */ 794 */
796int mod_timer(struct timer_list *timer, unsigned long expires) 795int mod_timer(struct timer_list *timer, unsigned long expires)
797{ 796{
797 expires = apply_slack(timer, expires);
798
798 /* 799 /*
799 * This is a common optimization triggered by the 800 * This is a common optimization triggered by the
800 * networking code - if the timer is re-modified 801 * networking code - if the timer is re-modified
@@ -803,8 +804,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
803 if (timer_pending(timer) && timer->expires == expires) 804 if (timer_pending(timer) && timer->expires == expires)
804 return 1; 805 return 1;
805 806
806 expires = apply_slack(timer, expires);
807
808 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 807 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
809} 808}
810EXPORT_SYMBOL(mod_timer); 809EXPORT_SYMBOL(mod_timer);
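The timer.c hunk simplifies apply_slack(): instead of special-casing already-expired timers it returns early when the timer is less than 256 jiffies away, and otherwise grants roughly 0.4% of the remaining delta as slack before rounding the expiry onto a coarser power-of-two boundary. mod_timer() now applies the slack before its "same expiry, already pending" shortcut, so re-arming with a value that coalesces to the same boundary is skipped. The sketch below is a user-space approximation of the rounding; find_last_bit() is replaced by a portable loop and all names are illustrative.

#include <stdio.h>

/*
 * User-space model of apply_slack() in auto mode (timer->slack < 0):
 * grant delta/256 of slack, then round onto the coarsest power-of-two
 * boundary that falls inside [expires, expires + slack].
 */
static unsigned long apply_auto_slack(unsigned long expires, unsigned long now)
{
        long delta = (long)(expires - now);
        unsigned long expires_limit, mask;
        int bit;

        if (delta < 256)                        /* too close: no slack */
                return expires;

        expires_limit = expires + delta / 256;

        mask = expires ^ expires_limit;
        if (mask == 0)
                return expires;

        /* index of the highest differing bit (portable find_last_bit) */
        for (bit = 0; mask >> (bit + 1); bit++)
                ;

        mask = (1UL << bit) - 1;
        return expires_limit & ~mask;           /* clear the low bits */
}

int main(void)
{
        unsigned long now = 100000;

        /* ~10000 jiffies out: ~39 jiffies of slack, lands on a 64-jiffy boundary */
        printf("%lu\n", apply_auto_slack(now + 10000, now));    /* 110016 */
        /* a nearby timer lands on a 32-jiffy boundary */
        printf("%lu\n", apply_auto_slack(now + 10035, now));    /* 110048 */
        return 0;
}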
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 2ad39e556cb4..cd3134510f3d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -82,7 +82,7 @@ config EVENT_POWER_TRACING_DEPRECATED
82 power:power_frequency 82 power:power_frequency
83 This is for userspace compatibility 83 This is for userspace compatibility
84 and will vanish after 5 kernel iterations, 84 and will vanish after 5 kernel iterations,
85 namely 2.6.41. 85 namely 3.1.
86 86
87config CONTEXT_SWITCH_TRACER 87config CONTEXT_SWITCH_TRACER
88 bool 88 bool
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ee24fa1935ac..c3e4575e7829 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -32,27 +32,32 @@
32 32
33#include <trace/events/sched.h> 33#include <trace/events/sched.h>
34 34
35#include <asm/ftrace.h>
36#include <asm/setup.h> 35#include <asm/setup.h>
37 36
38#include "trace_output.h" 37#include "trace_output.h"
39#include "trace_stat.h" 38#include "trace_stat.h"
40 39
41#define FTRACE_WARN_ON(cond) \ 40#define FTRACE_WARN_ON(cond) \
42 do { \ 41 ({ \
43 if (WARN_ON(cond)) \ 42 int ___r = cond; \
43 if (WARN_ON(___r)) \
44 ftrace_kill(); \ 44 ftrace_kill(); \
45 } while (0) 45 ___r; \
46 })
46 47
47#define FTRACE_WARN_ON_ONCE(cond) \ 48#define FTRACE_WARN_ON_ONCE(cond) \
48 do { \ 49 ({ \
49 if (WARN_ON_ONCE(cond)) \ 50 int ___r = cond; \
51 if (WARN_ON_ONCE(___r)) \
50 ftrace_kill(); \ 52 ftrace_kill(); \
51 } while (0) 53 ___r; \
54 })
52 55
53/* hash bits for specific function selection */ 56/* hash bits for specific function selection */
54#define FTRACE_HASH_BITS 7 57#define FTRACE_HASH_BITS 7
55#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) 58#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
59#define FTRACE_HASH_DEFAULT_BITS 10
60#define FTRACE_HASH_MAX_BITS 12
56 61
57/* ftrace_enabled is a method to turn ftrace on or off */ 62/* ftrace_enabled is a method to turn ftrace on or off */
58int ftrace_enabled __read_mostly; 63int ftrace_enabled __read_mostly;
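The first ftrace.c hunk turns FTRACE_WARN_ON()/FTRACE_WARN_ON_ONCE() from do { } while (0) statements into statement expressions, so the macros now evaluate to the tested condition and can sit directly inside an if (), as later hunks (e.g. FTRACE_WARN_ON(ops == &global_ops)) rely on. Below is a minimal user-space illustration of that pattern using a hypothetical WARN_RET() macro; statement expressions are a GCC/Clang extension.

#include <stdio.h>

/* Returns the tested condition, like the reworked FTRACE_WARN_ON(). */
#define WARN_RET(cond)                                          \
        ({                                                      \
                int ___r = !!(cond);                            \
                if (___r)                                       \
                        fprintf(stderr, "warning: %s\n", #cond);\
                ___r;           /* value of the whole block */  \
        })

int main(void)
{
        int registered = 0;

        if (WARN_RET(registered != 0))  /* usable inside an expression */
                return 1;

        puts("ok");
        return 0;
}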
@@ -76,33 +81,45 @@ static int ftrace_disabled __read_mostly;
76 81
77static DEFINE_MUTEX(ftrace_lock); 82static DEFINE_MUTEX(ftrace_lock);
78 83
79static struct ftrace_ops ftrace_list_end __read_mostly = 84static struct ftrace_ops ftrace_list_end __read_mostly = {
80{
81 .func = ftrace_stub, 85 .func = ftrace_stub,
82}; 86};
83 87
84static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; 88static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
89static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
85ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 90ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
91static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub;
86ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 92ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
87ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 93ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
94static struct ftrace_ops global_ops;
95
96static void
97ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
88 98
89/* 99/*
90 * Traverse the ftrace_list, invoking all entries. The reason that we 100 * Traverse the ftrace_global_list, invoking all entries. The reason that we
91 * can use rcu_dereference_raw() is that elements removed from this list 101 * can use rcu_dereference_raw() is that elements removed from this list
92 * are simply leaked, so there is no need to interact with a grace-period 102 * are simply leaked, so there is no need to interact with a grace-period
93 * mechanism. The rcu_dereference_raw() calls are needed to handle 103 * mechanism. The rcu_dereference_raw() calls are needed to handle
94 * concurrent insertions into the ftrace_list. 104 * concurrent insertions into the ftrace_global_list.
95 * 105 *
96 * Silly Alpha and silly pointer-speculation compiler optimizations! 106 * Silly Alpha and silly pointer-speculation compiler optimizations!
97 */ 107 */
98static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 108static void ftrace_global_list_func(unsigned long ip,
109 unsigned long parent_ip)
99{ 110{
100 struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ 111 struct ftrace_ops *op;
112
113 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
114 return;
101 115
116 trace_recursion_set(TRACE_GLOBAL_BIT);
117 op = rcu_dereference_raw(ftrace_global_list); /*see above*/
102 while (op != &ftrace_list_end) { 118 while (op != &ftrace_list_end) {
103 op->func(ip, parent_ip); 119 op->func(ip, parent_ip);
104 op = rcu_dereference_raw(op->next); /*see above*/ 120 op = rcu_dereference_raw(op->next); /*see above*/
105 }; 121 };
122 trace_recursion_clear(TRACE_GLOBAL_BIT);
106} 123}
107 124
108static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) 125static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
@@ -130,9 +147,11 @@ void clear_ftrace_function(void)
130{ 147{
131 ftrace_trace_function = ftrace_stub; 148 ftrace_trace_function = ftrace_stub;
132 __ftrace_trace_function = ftrace_stub; 149 __ftrace_trace_function = ftrace_stub;
150 __ftrace_trace_function_delay = ftrace_stub;
133 ftrace_pid_function = ftrace_stub; 151 ftrace_pid_function = ftrace_stub;
134} 152}
135 153
154#undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
136#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 155#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
137/* 156/*
138 * For those archs that do not test ftrace_trace_stop in their 157 * For those archs that do not test ftrace_trace_stop in their
@@ -147,46 +166,74 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
147} 166}
148#endif 167#endif
149 168
150static int __register_ftrace_function(struct ftrace_ops *ops) 169static void update_global_ops(void)
151{ 170{
152 ops->next = ftrace_list; 171 ftrace_func_t func;
172
153 /* 173 /*
154 * We are entering ops into the ftrace_list but another 174 * If there's only one function registered, then call that
155 * CPU might be walking that list. We need to make sure 175 * function directly. Otherwise, we need to iterate over the
156 * the ops->next pointer is valid before another CPU sees 176 * registered callers.
157 * the ops pointer included into the ftrace_list.
158 */ 177 */
159 rcu_assign_pointer(ftrace_list, ops); 178 if (ftrace_global_list == &ftrace_list_end ||
179 ftrace_global_list->next == &ftrace_list_end)
180 func = ftrace_global_list->func;
181 else
182 func = ftrace_global_list_func;
160 183
161 if (ftrace_enabled) { 184 /* If we filter on pids, update to use the pid function */
162 ftrace_func_t func; 185 if (!list_empty(&ftrace_pids)) {
186 set_ftrace_pid_function(func);
187 func = ftrace_pid_func;
188 }
163 189
164 if (ops->next == &ftrace_list_end) 190 global_ops.func = func;
165 func = ops->func; 191}
166 else
167 func = ftrace_list_func;
168 192
169 if (!list_empty(&ftrace_pids)) { 193static void update_ftrace_function(void)
170 set_ftrace_pid_function(func); 194{
171 func = ftrace_pid_func; 195 ftrace_func_t func;
172 } 196
197 update_global_ops();
198
199 /*
200 * If we are at the end of the list and this ops is
201 * not dynamic, then have the mcount trampoline call
202 * the function directly
203 */
204 if (ftrace_ops_list == &ftrace_list_end ||
205 (ftrace_ops_list->next == &ftrace_list_end &&
206 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC)))
207 func = ftrace_ops_list->func;
208 else
209 func = ftrace_ops_list_func;
173 210
174 /*
175 * For one func, simply call it directly.
176 * For more than one func, call the chain.
177 */
178#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 211#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
179 ftrace_trace_function = func; 212 ftrace_trace_function = func;
180#else 213#else
181 __ftrace_trace_function = func; 214#ifdef CONFIG_DYNAMIC_FTRACE
182 ftrace_trace_function = ftrace_test_stop_func; 215 /* do not update till all functions have been modified */
216 __ftrace_trace_function_delay = func;
217#else
218 __ftrace_trace_function = func;
183#endif 219#endif
184 } 220 ftrace_trace_function = ftrace_test_stop_func;
221#endif
222}
185 223
186 return 0; 224static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
225{
226 ops->next = *list;
227 /*
228 * We are entering ops into the list but another
229 * CPU might be walking that list. We need to make sure
230 * the ops->next pointer is valid before another CPU sees
231 * the ops pointer included into the list.
232 */
233 rcu_assign_pointer(*list, ops);
187} 234}
188 235
189static int __unregister_ftrace_function(struct ftrace_ops *ops) 236static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
190{ 237{
191 struct ftrace_ops **p; 238 struct ftrace_ops **p;
192 239
@@ -194,13 +241,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
194 * If we are removing the last function, then simply point 241 * If we are removing the last function, then simply point
195 * to the ftrace_stub. 242 * to the ftrace_stub.
196 */ 243 */
197 if (ftrace_list == ops && ops->next == &ftrace_list_end) { 244 if (*list == ops && ops->next == &ftrace_list_end) {
198 ftrace_trace_function = ftrace_stub; 245 *list = &ftrace_list_end;
199 ftrace_list = &ftrace_list_end;
200 return 0; 246 return 0;
201 } 247 }
202 248
203 for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) 249 for (p = list; *p != &ftrace_list_end; p = &(*p)->next)
204 if (*p == ops) 250 if (*p == ops)
205 break; 251 break;
206 252
@@ -208,53 +254,83 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
208 return -1; 254 return -1;
209 255
210 *p = (*p)->next; 256 *p = (*p)->next;
257 return 0;
258}
211 259
212 if (ftrace_enabled) { 260static int __register_ftrace_function(struct ftrace_ops *ops)
213 /* If we only have one func left, then call that directly */ 261{
214 if (ftrace_list->next == &ftrace_list_end) { 262 if (ftrace_disabled)
215 ftrace_func_t func = ftrace_list->func; 263 return -ENODEV;
216 264
217 if (!list_empty(&ftrace_pids)) { 265 if (FTRACE_WARN_ON(ops == &global_ops))
218 set_ftrace_pid_function(func); 266 return -EINVAL;
219 func = ftrace_pid_func; 267
220 } 268 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
221#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 269 return -EBUSY;
222 ftrace_trace_function = func; 270
223#else 271 if (!core_kernel_data((unsigned long)ops))
224 __ftrace_trace_function = func; 272 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
225#endif 273
226 } 274 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
227 } 275 int first = ftrace_global_list == &ftrace_list_end;
276 add_ftrace_ops(&ftrace_global_list, ops);
277 ops->flags |= FTRACE_OPS_FL_ENABLED;
278 if (first)
279 add_ftrace_ops(&ftrace_ops_list, &global_ops);
280 } else
281 add_ftrace_ops(&ftrace_ops_list, ops);
282
283 if (ftrace_enabled)
284 update_ftrace_function();
228 285
229 return 0; 286 return 0;
230} 287}
231 288
232static void ftrace_update_pid_func(void) 289static int __unregister_ftrace_function(struct ftrace_ops *ops)
233{ 290{
234 ftrace_func_t func; 291 int ret;
235 292
236 if (ftrace_trace_function == ftrace_stub) 293 if (ftrace_disabled)
237 return; 294 return -ENODEV;
238 295
239#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 296 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
240 func = ftrace_trace_function; 297 return -EBUSY;
241#else
242 func = __ftrace_trace_function;
243#endif
244 298
245 if (!list_empty(&ftrace_pids)) { 299 if (FTRACE_WARN_ON(ops == &global_ops))
246 set_ftrace_pid_function(func); 300 return -EINVAL;
247 func = ftrace_pid_func;
248 } else {
249 if (func == ftrace_pid_func)
250 func = ftrace_pid_function;
251 }
252 301
253#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 302 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
254 ftrace_trace_function = func; 303 ret = remove_ftrace_ops(&ftrace_global_list, ops);
255#else 304 if (!ret && ftrace_global_list == &ftrace_list_end)
256 __ftrace_trace_function = func; 305 ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops);
257#endif 306 if (!ret)
307 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
308 } else
309 ret = remove_ftrace_ops(&ftrace_ops_list, ops);
310
311 if (ret < 0)
312 return ret;
313
314 if (ftrace_enabled)
315 update_ftrace_function();
316
317 /*
318 * Dynamic ops may be freed, we must make sure that all
319 * callers are done before leaving this function.
320 */
321 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
322 synchronize_sched();
323
324 return 0;
325}
326
327static void ftrace_update_pid_func(void)
328{
329 /* Only do something if we are tracing something */
330 if (ftrace_trace_function == ftrace_stub)
331 return;
332
333 update_ftrace_function();
258} 334}
259 335
260#ifdef CONFIG_FUNCTION_PROFILER 336#ifdef CONFIG_FUNCTION_PROFILER
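The hunks above generalize the old single ftrace_list into add_ftrace_ops()/remove_ftrace_ops() helpers that work on any sentinel-terminated list of ftrace_ops (ftrace_global_list and ftrace_ops_list), publishing the new head only after ops->next is valid. The following sketch models just that list discipline in plain user-space C; the rcu_assign_pointer() ordering is reduced to an ordinary store and the struct is illustrative.

#include <stdio.h>

struct ops {
        const char *name;
        struct ops *next;
};

static struct ops list_end = { "end", NULL };   /* sentinel, never removed */
static struct ops *ops_list = &list_end;

static void add_ops(struct ops *op)
{
        op->next = ops_list;    /* make op->next valid first ...      */
        ops_list = op;          /* ... then publish the new list head */
}

static int remove_ops(struct ops *op)
{
        struct ops **p;

        for (p = &ops_list; *p != &list_end; p = &(*p)->next)
                if (*p == op) {
                        *p = op->next;          /* unlink in place */
                        return 0;
                }
        return -1;                              /* not on the list */
}

int main(void)
{
        struct ops a = { "a", NULL }, b = { "b", NULL };
        struct ops *op;

        add_ops(&a);
        add_ops(&b);
        remove_ops(&a);
        for (op = ops_list; op != &list_end; op = op->next)
                printf("%s\n", op->name);       /* prints: b */
        return 0;
}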
@@ -715,8 +791,7 @@ static void unregister_ftrace_profiler(void)
715 unregister_ftrace_graph(); 791 unregister_ftrace_graph();
716} 792}
717#else 793#else
718static struct ftrace_ops ftrace_profile_ops __read_mostly = 794static struct ftrace_ops ftrace_profile_ops __read_mostly = {
719{
720 .func = function_profile_call, 795 .func = function_profile_call,
721}; 796};
722 797
@@ -736,19 +811,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
736 size_t cnt, loff_t *ppos) 811 size_t cnt, loff_t *ppos)
737{ 812{
738 unsigned long val; 813 unsigned long val;
739 char buf[64]; /* big enough to hold a number */
740 int ret; 814 int ret;
741 815
742 if (cnt >= sizeof(buf)) 816 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
743 return -EINVAL; 817 if (ret)
744
745 if (copy_from_user(&buf, ubuf, cnt))
746 return -EFAULT;
747
748 buf[cnt] = 0;
749
750 ret = strict_strtoul(buf, 10, &val);
751 if (ret < 0)
752 return ret; 818 return ret;
753 819
754 val = !!val; 820 val = !!val;
@@ -888,8 +954,35 @@ enum {
888 FTRACE_START_FUNC_RET = (1 << 3), 954 FTRACE_START_FUNC_RET = (1 << 3),
889 FTRACE_STOP_FUNC_RET = (1 << 4), 955 FTRACE_STOP_FUNC_RET = (1 << 4),
890}; 956};
957struct ftrace_func_entry {
958 struct hlist_node hlist;
959 unsigned long ip;
960};
891 961
892static int ftrace_filtered; 962struct ftrace_hash {
963 unsigned long size_bits;
964 struct hlist_head *buckets;
965 unsigned long count;
966 struct rcu_head rcu;
967};
968
969/*
970 * We make these constant because no one should touch them,
971 * but they are used as the default "empty hash", to avoid allocating
972 * it all the time. These are in a read only section such that if
973 * anyone does try to modify it, it will cause an exception.
974 */
975static const struct hlist_head empty_buckets[1];
976static const struct ftrace_hash empty_hash = {
977 .buckets = (struct hlist_head *)empty_buckets,
978};
979#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash)
980
981static struct ftrace_ops global_ops = {
982 .func = ftrace_stub,
983 .notrace_hash = EMPTY_HASH,
984 .filter_hash = EMPTY_HASH,
985};
893 986
894static struct dyn_ftrace *ftrace_new_addrs; 987static struct dyn_ftrace *ftrace_new_addrs;
895 988
@@ -912,6 +1005,292 @@ static struct ftrace_page *ftrace_pages;
912 1005
913static struct dyn_ftrace *ftrace_free_records; 1006static struct dyn_ftrace *ftrace_free_records;
914 1007
1008static struct ftrace_func_entry *
1009ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1010{
1011 unsigned long key;
1012 struct ftrace_func_entry *entry;
1013 struct hlist_head *hhd;
1014 struct hlist_node *n;
1015
1016 if (!hash->count)
1017 return NULL;
1018
1019 if (hash->size_bits > 0)
1020 key = hash_long(ip, hash->size_bits);
1021 else
1022 key = 0;
1023
1024 hhd = &hash->buckets[key];
1025
1026 hlist_for_each_entry_rcu(entry, n, hhd, hlist) {
1027 if (entry->ip == ip)
1028 return entry;
1029 }
1030 return NULL;
1031}
1032
1033static void __add_hash_entry(struct ftrace_hash *hash,
1034 struct ftrace_func_entry *entry)
1035{
1036 struct hlist_head *hhd;
1037 unsigned long key;
1038
1039 if (hash->size_bits)
1040 key = hash_long(entry->ip, hash->size_bits);
1041 else
1042 key = 0;
1043
1044 hhd = &hash->buckets[key];
1045 hlist_add_head(&entry->hlist, hhd);
1046 hash->count++;
1047}
1048
1049static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
1050{
1051 struct ftrace_func_entry *entry;
1052
1053 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
1054 if (!entry)
1055 return -ENOMEM;
1056
1057 entry->ip = ip;
1058 __add_hash_entry(hash, entry);
1059
1060 return 0;
1061}
1062
1063static void
1064free_hash_entry(struct ftrace_hash *hash,
1065 struct ftrace_func_entry *entry)
1066{
1067 hlist_del(&entry->hlist);
1068 kfree(entry);
1069 hash->count--;
1070}
1071
1072static void
1073remove_hash_entry(struct ftrace_hash *hash,
1074 struct ftrace_func_entry *entry)
1075{
1076 hlist_del(&entry->hlist);
1077 hash->count--;
1078}
1079
1080static void ftrace_hash_clear(struct ftrace_hash *hash)
1081{
1082 struct hlist_head *hhd;
1083 struct hlist_node *tp, *tn;
1084 struct ftrace_func_entry *entry;
1085 int size = 1 << hash->size_bits;
1086 int i;
1087
1088 if (!hash->count)
1089 return;
1090
1091 for (i = 0; i < size; i++) {
1092 hhd = &hash->buckets[i];
1093 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist)
1094 free_hash_entry(hash, entry);
1095 }
1096 FTRACE_WARN_ON(hash->count);
1097}
1098
1099static void free_ftrace_hash(struct ftrace_hash *hash)
1100{
1101 if (!hash || hash == EMPTY_HASH)
1102 return;
1103 ftrace_hash_clear(hash);
1104 kfree(hash->buckets);
1105 kfree(hash);
1106}
1107
1108static void __free_ftrace_hash_rcu(struct rcu_head *rcu)
1109{
1110 struct ftrace_hash *hash;
1111
1112 hash = container_of(rcu, struct ftrace_hash, rcu);
1113 free_ftrace_hash(hash);
1114}
1115
1116static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
1117{
1118 if (!hash || hash == EMPTY_HASH)
1119 return;
1120 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
1121}
1122
1123static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1124{
1125 struct ftrace_hash *hash;
1126 int size;
1127
1128 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
1129 if (!hash)
1130 return NULL;
1131
1132 size = 1 << size_bits;
1133 hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL);
1134
1135 if (!hash->buckets) {
1136 kfree(hash);
1137 return NULL;
1138 }
1139
1140 hash->size_bits = size_bits;
1141
1142 return hash;
1143}
1144
1145static struct ftrace_hash *
1146alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1147{
1148 struct ftrace_func_entry *entry;
1149 struct ftrace_hash *new_hash;
1150 struct hlist_node *tp;
1151 int size;
1152 int ret;
1153 int i;
1154
1155 new_hash = alloc_ftrace_hash(size_bits);
1156 if (!new_hash)
1157 return NULL;
1158
1159 /* Empty hash? */
1160 if (!hash || !hash->count)
1161 return new_hash;
1162
1163 size = 1 << hash->size_bits;
1164 for (i = 0; i < size; i++) {
1165 hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) {
1166 ret = add_hash_entry(new_hash, entry->ip);
1167 if (ret < 0)
1168 goto free_hash;
1169 }
1170 }
1171
1172 FTRACE_WARN_ON(new_hash->count != hash->count);
1173
1174 return new_hash;
1175
1176 free_hash:
1177 free_ftrace_hash(new_hash);
1178 return NULL;
1179}
1180
1181static void
1182ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash);
1183static void
1184ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash);
1185
1186static int
1187ftrace_hash_move(struct ftrace_ops *ops, int enable,
1188 struct ftrace_hash **dst, struct ftrace_hash *src)
1189{
1190 struct ftrace_func_entry *entry;
1191 struct hlist_node *tp, *tn;
1192 struct hlist_head *hhd;
1193 struct ftrace_hash *old_hash;
1194 struct ftrace_hash *new_hash;
1195 unsigned long key;
1196 int size = src->count;
1197 int bits = 0;
1198 int ret;
1199 int i;
1200
1201 /*
1202 * Remove the current set, update the hash and add
1203 * them back.
1204 */
1205 ftrace_hash_rec_disable(ops, enable);
1206
1207 /*
1208 * If the new source is empty, just free dst and assign it
1209 * the empty_hash.
1210 */
1211 if (!src->count) {
1212 free_ftrace_hash_rcu(*dst);
1213 rcu_assign_pointer(*dst, EMPTY_HASH);
1214 return 0;
1215 }
1216
1217 /*
1218 * Make the hash size about 1/2 the # found
1219 */
1220 for (size /= 2; size; size >>= 1)
1221 bits++;
1222
1223 /* Don't allocate too much */
1224 if (bits > FTRACE_HASH_MAX_BITS)
1225 bits = FTRACE_HASH_MAX_BITS;
1226
1227 ret = -ENOMEM;
1228 new_hash = alloc_ftrace_hash(bits);
1229 if (!new_hash)
1230 goto out;
1231
1232 size = 1 << src->size_bits;
1233 for (i = 0; i < size; i++) {
1234 hhd = &src->buckets[i];
1235 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) {
1236 if (bits > 0)
1237 key = hash_long(entry->ip, bits);
1238 else
1239 key = 0;
1240 remove_hash_entry(src, entry);
1241 __add_hash_entry(new_hash, entry);
1242 }
1243 }
1244
1245 old_hash = *dst;
1246 rcu_assign_pointer(*dst, new_hash);
1247 free_ftrace_hash_rcu(old_hash);
1248
1249 ret = 0;
1250 out:
1251 /*
1252 * Enable regardless of ret:
1253 * On success, we enable the new hash.
1254 * On failure, we re-enable the original hash.
1255 */
1256 ftrace_hash_rec_enable(ops, enable);
1257
1258 return ret;
1259}
1260
1261/*
1262 * Test the hashes for this ops to see if we want to call
1263 * the ops->func or not.
1264 *
1265 * It's a match if the ip is in the ops->filter_hash or
1266 * the filter_hash does not exist or is empty,
1267 * AND
1268 * the ip is not in the ops->notrace_hash.
1269 *
1270 * This needs to be called with preemption disabled as
1271 * the hashes are freed with call_rcu_sched().
1272 */
1273static int
1274ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1275{
1276 struct ftrace_hash *filter_hash;
1277 struct ftrace_hash *notrace_hash;
1278 int ret;
1279
1280 filter_hash = rcu_dereference_raw(ops->filter_hash);
1281 notrace_hash = rcu_dereference_raw(ops->notrace_hash);
1282
1283 if ((!filter_hash || !filter_hash->count ||
1284 ftrace_lookup_ip(filter_hash, ip)) &&
1285 (!notrace_hash || !notrace_hash->count ||
1286 !ftrace_lookup_ip(notrace_hash, ip)))
1287 ret = 1;
1288 else
1289 ret = 0;
1290
1291 return ret;
1292}
1293
915/* 1294/*
916 * This is a double for. Do not use 'break' to break out of the loop, 1295 * This is a double for. Do not use 'break' to break out of the loop,
917 * you must use a goto. 1296 * you must use a goto.
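The large hunk above introduces struct ftrace_hash, a hash set of instruction pointers, to replace the old per-record FTRACE_FL_FILTER/FTRACE_FL_NOTRACE flags: ftrace_ops_test() traces an ip when the filter hash is empty or contains it and the notrace hash does not, and ftrace_hash_move() sizes the new table at about half the entry count, capped at FTRACE_HASH_MAX_BITS. The stand-alone sketch below models those two rules with a flat array instead of hlist buckets; everything here is illustrative, not the kernel data structure.

#include <stdbool.h>
#include <stdio.h>

#define HASH_MAX_BITS 12        /* same cap as FTRACE_HASH_MAX_BITS above */

struct ip_set {
        const unsigned long *ips;
        int count;              /* 0 means "empty hash" */
};

static bool set_has(const struct ip_set *s, unsigned long ip)
{
        int i;

        for (i = 0; i < s->count; i++)
                if (s->ips[i] == ip)
                        return true;
        return false;
}

/* Mirrors ftrace_ops_test(): an empty filter means "trace everything". */
static bool ops_traces(const struct ip_set *filter,
                       const struct ip_set *notrace, unsigned long ip)
{
        bool in_filter = !filter->count || set_has(filter, ip);
        bool in_notrace = notrace->count && set_has(notrace, ip);

        return in_filter && !in_notrace;
}

/* Mirrors the sizing loop in ftrace_hash_move(): ~count/2 buckets. */
static int hash_bits_for(int count)
{
        int bits = 0;
        int size;

        for (size = count / 2; size; size >>= 1)
                bits++;
        return bits > HASH_MAX_BITS ? HASH_MAX_BITS : bits;
}

int main(void)
{
        const unsigned long filt[] = { 0x1000, 0x2000 };
        const unsigned long notr[] = { 0x2000 };
        struct ip_set filter = { filt, 2 }, notrace = { notr, 1 };

        printf("%d %d %d\n",
               ops_traces(&filter, &notrace, 0x1000),   /* 1: in filter     */
               ops_traces(&filter, &notrace, 0x2000),   /* 0: notrace wins  */
               ops_traces(&filter, &notrace, 0x3000));  /* 0: not in filter */
        printf("%d\n", hash_bits_for(300));             /* 8 -> 256 buckets */
        return 0;
}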
@@ -926,6 +1305,105 @@ static struct dyn_ftrace *ftrace_free_records;
926 } \ 1305 } \
927 } 1306 }
928 1307
1308static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1309 int filter_hash,
1310 bool inc)
1311{
1312 struct ftrace_hash *hash;
1313 struct ftrace_hash *other_hash;
1314 struct ftrace_page *pg;
1315 struct dyn_ftrace *rec;
1316 int count = 0;
1317 int all = 0;
1318
1319 /* Only update if the ops has been registered */
1320 if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
1321 return;
1322
1323 /*
1324 * In the filter_hash case:
1325 * If the count is zero, we update all records.
1326 * Otherwise we just update the items in the hash.
1327 *
1328 * In the notrace_hash case:
1329 * We enable the update in the hash.
1330 * As disabling notrace means enabling the tracing,
1331 * and enabling notrace means disabling, the inc variable
1332 * gets inverted.
1333 */
1334 if (filter_hash) {
1335 hash = ops->filter_hash;
1336 other_hash = ops->notrace_hash;
1337 if (!hash || !hash->count)
1338 all = 1;
1339 } else {
1340 inc = !inc;
1341 hash = ops->notrace_hash;
1342 other_hash = ops->filter_hash;
1343 /*
1344 * If the notrace hash has no items,
1345 * then there's nothing to do.
1346 */
1347 if (hash && !hash->count)
1348 return;
1349 }
1350
1351 do_for_each_ftrace_rec(pg, rec) {
1352 int in_other_hash = 0;
1353 int in_hash = 0;
1354 int match = 0;
1355
1356 if (all) {
1357 /*
1358 * Only the filter_hash affects all records.
1359 * Update if the record is not in the notrace hash.
1360 */
1361 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
1362 match = 1;
1363 } else {
1364 in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip);
1365 in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip);
1366
1367 /*
1368	 * Only update records that this hash actually affects for this ops.
1369 */
1370 if (filter_hash && in_hash && !in_other_hash)
1371 match = 1;
1372 else if (!filter_hash && in_hash &&
1373 (in_other_hash || !other_hash->count))
1374 match = 1;
1375 }
1376 if (!match)
1377 continue;
1378
1379 if (inc) {
1380 rec->flags++;
1381 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
1382 return;
1383 } else {
1384 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
1385 return;
1386 rec->flags--;
1387 }
1388 count++;
1389 /* Shortcut, if we handled all records, we are done. */
1390 if (!all && count == hash->count)
1391 return;
1392 } while_for_each_ftrace_rec();
1393}
1394
1395static void ftrace_hash_rec_disable(struct ftrace_ops *ops,
1396 int filter_hash)
1397{
1398 __ftrace_hash_rec_update(ops, filter_hash, 0);
1399}
1400
1401static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1402 int filter_hash)
1403{
1404 __ftrace_hash_rec_update(ops, filter_hash, 1);
1405}
1406
929static void ftrace_free_rec(struct dyn_ftrace *rec) 1407static void ftrace_free_rec(struct dyn_ftrace *rec)
930{ 1408{
931 rec->freelist = ftrace_free_records; 1409 rec->freelist = ftrace_free_records;
@@ -1047,18 +1525,18 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1047 ftrace_addr = (unsigned long)FTRACE_ADDR; 1525 ftrace_addr = (unsigned long)FTRACE_ADDR;
1048 1526
1049 /* 1527 /*
1050 * If this record is not to be traced or we want to disable it, 1528 * If we are enabling tracing:
1051 * then disable it. 1529 *
1530 * If the record has a ref count, then we need to enable it
1531 * because someone is using it.
1052 * 1532 *
1053 * If we want to enable it and filtering is off, then enable it. 1533 * Otherwise we make sure it's disabled.
1054 * 1534 *
1055 * If we want to enable it and filtering is on, enable it only if 1535 * If we are disabling tracing, then disable all records that
1056 * it's filtered 1536 * are enabled.
1057 */ 1537 */
1058 if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { 1538 if (enable && (rec->flags & ~FTRACE_FL_MASK))
1059 if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) 1539 flag = FTRACE_FL_ENABLED;
1060 flag = FTRACE_FL_ENABLED;
1061 }
1062 1540
1063 /* If the state of this record hasn't changed, then do nothing */ 1541 /* If the state of this record hasn't changed, then do nothing */
1064 if ((rec->flags & FTRACE_FL_ENABLED) == flag) 1542 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
@@ -1079,19 +1557,16 @@ static void ftrace_replace_code(int enable)
1079 struct ftrace_page *pg; 1557 struct ftrace_page *pg;
1080 int failed; 1558 int failed;
1081 1559
1560 if (unlikely(ftrace_disabled))
1561 return;
1562
1082 do_for_each_ftrace_rec(pg, rec) { 1563 do_for_each_ftrace_rec(pg, rec) {
1083 /* 1564 /* Skip over free records */
1084 * Skip over free records, records that have 1565 if (rec->flags & FTRACE_FL_FREE)
1085 * failed and not converted.
1086 */
1087 if (rec->flags & FTRACE_FL_FREE ||
1088 rec->flags & FTRACE_FL_FAILED ||
1089 !(rec->flags & FTRACE_FL_CONVERTED))
1090 continue; 1566 continue;
1091 1567
1092 failed = __ftrace_replace_code(rec, enable); 1568 failed = __ftrace_replace_code(rec, enable);
1093 if (failed) { 1569 if (failed) {
1094 rec->flags |= FTRACE_FL_FAILED;
1095 ftrace_bug(failed, rec->ip); 1570 ftrace_bug(failed, rec->ip);
1096 /* Stop processing */ 1571 /* Stop processing */
1097 return; 1572 return;
@@ -1107,10 +1582,12 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
1107 1582
1108 ip = rec->ip; 1583 ip = rec->ip;
1109 1584
1585 if (unlikely(ftrace_disabled))
1586 return 0;
1587
1110 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); 1588 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
1111 if (ret) { 1589 if (ret) {
1112 ftrace_bug(ret, ip); 1590 ftrace_bug(ret, ip);
1113 rec->flags |= FTRACE_FL_FAILED;
1114 return 0; 1591 return 0;
1115 } 1592 }
1116 return 1; 1593 return 1;
@@ -1138,6 +1615,12 @@ static int __ftrace_modify_code(void *data)
1138{ 1615{
1139 int *command = data; 1616 int *command = data;
1140 1617
1618 /*
1619 * Do not call function tracer while we update the code.
1620 * We are in stop_machine(), so there is no need to worry about races.
1621 */
1622 function_trace_stop++;
1623
1141 if (*command & FTRACE_ENABLE_CALLS) 1624 if (*command & FTRACE_ENABLE_CALLS)
1142 ftrace_replace_code(1); 1625 ftrace_replace_code(1);
1143 else if (*command & FTRACE_DISABLE_CALLS) 1626 else if (*command & FTRACE_DISABLE_CALLS)
@@ -1151,6 +1634,18 @@ static int __ftrace_modify_code(void *data)
1151 else if (*command & FTRACE_STOP_FUNC_RET) 1634 else if (*command & FTRACE_STOP_FUNC_RET)
1152 ftrace_disable_ftrace_graph_caller(); 1635 ftrace_disable_ftrace_graph_caller();
1153 1636
1637#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
1638 /*
1639 * For archs that call ftrace_test_stop_func(), we must
1640 * wait till after we update all the function callers
1641 * before we update the callback. This keeps different
1642 * ops that record different functions from corrupting
1643 * each other.
1644 */
1645 __ftrace_trace_function = __ftrace_trace_function_delay;
1646#endif
1647 function_trace_stop--;
1648
1154 return 0; 1649 return 0;
1155} 1650}
1156 1651
@@ -1171,6 +1666,7 @@ static void ftrace_run_update_code(int command)
1171 1666
1172static ftrace_func_t saved_ftrace_func; 1667static ftrace_func_t saved_ftrace_func;
1173static int ftrace_start_up; 1668static int ftrace_start_up;
1669static int global_start_up;
1174 1670
1175static void ftrace_startup_enable(int command) 1671static void ftrace_startup_enable(int command)
1176{ 1672{
@@ -1185,19 +1681,38 @@ static void ftrace_startup_enable(int command)
1185 ftrace_run_update_code(command); 1681 ftrace_run_update_code(command);
1186} 1682}
1187 1683
1188static void ftrace_startup(int command) 1684static int ftrace_startup(struct ftrace_ops *ops, int command)
1189{ 1685{
1686 bool hash_enable = true;
1687
1190 if (unlikely(ftrace_disabled)) 1688 if (unlikely(ftrace_disabled))
1191 return; 1689 return -ENODEV;
1192 1690
1193 ftrace_start_up++; 1691 ftrace_start_up++;
1194 command |= FTRACE_ENABLE_CALLS; 1692 command |= FTRACE_ENABLE_CALLS;
1195 1693
1694 /* ops marked global share the filter hashes */
1695 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
1696 ops = &global_ops;
1697 /* Don't update hash if global is already set */
1698 if (global_start_up)
1699 hash_enable = false;
1700 global_start_up++;
1701 }
1702
1703 ops->flags |= FTRACE_OPS_FL_ENABLED;
1704 if (hash_enable)
1705 ftrace_hash_rec_enable(ops, 1);
1706
1196 ftrace_startup_enable(command); 1707 ftrace_startup_enable(command);
1708
1709 return 0;
1197} 1710}
1198 1711
1199static void ftrace_shutdown(int command) 1712static void ftrace_shutdown(struct ftrace_ops *ops, int command)
1200{ 1713{
1714 bool hash_disable = true;
1715
1201 if (unlikely(ftrace_disabled)) 1716 if (unlikely(ftrace_disabled))
1202 return; 1717 return;
1203 1718
@@ -1209,6 +1724,23 @@ static void ftrace_shutdown(int command)
1209 */ 1724 */
1210 WARN_ON_ONCE(ftrace_start_up < 0); 1725 WARN_ON_ONCE(ftrace_start_up < 0);
1211 1726
1727 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
1728 ops = &global_ops;
1729 global_start_up--;
1730 WARN_ON_ONCE(global_start_up < 0);
1731 /* Don't update hash if global still has users */
1732 if (global_start_up) {
1733 WARN_ON_ONCE(!ftrace_start_up);
1734 hash_disable = false;
1735 }
1736 }
1737
1738 if (hash_disable)
1739 ftrace_hash_rec_disable(ops, 1);
1740
1741 if (ops != &global_ops || !global_start_up)
1742 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
1743
1212 if (!ftrace_start_up) 1744 if (!ftrace_start_up)
1213 command |= FTRACE_DISABLE_CALLS; 1745 command |= FTRACE_DISABLE_CALLS;
1214 1746
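ftrace_startup()/ftrace_shutdown() above now take the ops being enabled and, for FTRACE_OPS_FL_GLOBAL ops, fold them into the shared global_ops while counting users in global_start_up, so the hash record accounting is touched only on the first global registration and when the last one goes away. A trivial sketch of that first-user/last-user gating (plain C, illustrative names):

#include <stdio.h>

static int global_start_up;     /* counts registered GLOBAL ops */

static void startup_global(void)
{
        if (global_start_up++ == 0)
                puts("enable shared hash accounting");
}

static void shutdown_global(void)
{
        if (--global_start_up == 0)
                puts("disable shared hash accounting");
}

int main(void)
{
        startup_global();       /* first global user: enable */
        startup_global();       /* second user: no-op        */
        shutdown_global();      /* one user left: no-op      */
        shutdown_global();      /* last user gone: disable   */
        return 0;
}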
@@ -1249,10 +1781,36 @@ static cycle_t ftrace_update_time;
1249static unsigned long ftrace_update_cnt; 1781static unsigned long ftrace_update_cnt;
1250unsigned long ftrace_update_tot_cnt; 1782unsigned long ftrace_update_tot_cnt;
1251 1783
1784static int ops_traces_mod(struct ftrace_ops *ops)
1785{
1786 struct ftrace_hash *hash;
1787
1788 hash = ops->filter_hash;
1789 return !!(!hash || !hash->count);
1790}
1791
1252static int ftrace_update_code(struct module *mod) 1792static int ftrace_update_code(struct module *mod)
1253{ 1793{
1254 struct dyn_ftrace *p; 1794 struct dyn_ftrace *p;
1255 cycle_t start, stop; 1795 cycle_t start, stop;
1796 unsigned long ref = 0;
1797
1798 /*
1799 * When adding a module, we need to check if tracers are
1800 * currently enabled and if they are set to trace all functions.
1801 * If they are, we need to enable the module functions as well
1802 * as update the reference counts for those function records.
1803 */
1804 if (mod) {
1805 struct ftrace_ops *ops;
1806
1807 for (ops = ftrace_ops_list;
1808 ops != &ftrace_list_end; ops = ops->next) {
1809 if (ops->flags & FTRACE_OPS_FL_ENABLED &&
1810 ops_traces_mod(ops))
1811 ref++;
1812 }
1813 }
1256 1814
1257 start = ftrace_now(raw_smp_processor_id()); 1815 start = ftrace_now(raw_smp_processor_id());
1258 ftrace_update_cnt = 0; 1816 ftrace_update_cnt = 0;
@@ -1265,7 +1823,7 @@ static int ftrace_update_code(struct module *mod)
1265 1823
1266 p = ftrace_new_addrs; 1824 p = ftrace_new_addrs;
1267 ftrace_new_addrs = p->newlist; 1825 ftrace_new_addrs = p->newlist;
1268 p->flags = 0L; 1826 p->flags = ref;
1269 1827
1270 /* 1828 /*
1271 * Do the initial record conversion from mcount jump 1829 * Do the initial record conversion from mcount jump
@@ -1273,10 +1831,10 @@ static int ftrace_update_code(struct module *mod)
1273 */ 1831 */
1274 if (!ftrace_code_disable(mod, p)) { 1832 if (!ftrace_code_disable(mod, p)) {
1275 ftrace_free_rec(p); 1833 ftrace_free_rec(p);
1276 continue; 1834 /* Game over */
1835 break;
1277 } 1836 }
1278 1837
1279 p->flags |= FTRACE_FL_CONVERTED;
1280 ftrace_update_cnt++; 1838 ftrace_update_cnt++;
1281 1839
1282 /* 1840 /*
@@ -1288,7 +1846,7 @@ static int ftrace_update_code(struct module *mod)
1288 * conversion puts the module to the correct state, thus 1846 * conversion puts the module to the correct state, thus
1289 * passing the ftrace_make_call check. 1847 * passing the ftrace_make_call check.
1290 */ 1848 */
1291 if (ftrace_start_up) { 1849 if (ftrace_start_up && ref) {
1292 int failed = __ftrace_replace_code(p, 1); 1850 int failed = __ftrace_replace_code(p, 1);
1293 if (failed) { 1851 if (failed) {
1294 ftrace_bug(failed, p->ip); 1852 ftrace_bug(failed, p->ip);
@@ -1351,9 +1909,9 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1351enum { 1909enum {
1352 FTRACE_ITER_FILTER = (1 << 0), 1910 FTRACE_ITER_FILTER = (1 << 0),
1353 FTRACE_ITER_NOTRACE = (1 << 1), 1911 FTRACE_ITER_NOTRACE = (1 << 1),
1354 FTRACE_ITER_FAILURES = (1 << 2), 1912 FTRACE_ITER_PRINTALL = (1 << 2),
1355 FTRACE_ITER_PRINTALL = (1 << 3), 1913 FTRACE_ITER_HASH = (1 << 3),
1356 FTRACE_ITER_HASH = (1 << 4), 1914 FTRACE_ITER_ENABLED = (1 << 4),
1357}; 1915};
1358 1916
1359#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1917#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -1365,6 +1923,8 @@ struct ftrace_iterator {
1365 struct dyn_ftrace *func; 1923 struct dyn_ftrace *func;
1366 struct ftrace_func_probe *probe; 1924 struct ftrace_func_probe *probe;
1367 struct trace_parser parser; 1925 struct trace_parser parser;
1926 struct ftrace_hash *hash;
1927 struct ftrace_ops *ops;
1368 int hidx; 1928 int hidx;
1369 int idx; 1929 int idx;
1370 unsigned flags; 1930 unsigned flags;
@@ -1461,8 +2021,12 @@ static void *
1461t_next(struct seq_file *m, void *v, loff_t *pos) 2021t_next(struct seq_file *m, void *v, loff_t *pos)
1462{ 2022{
1463 struct ftrace_iterator *iter = m->private; 2023 struct ftrace_iterator *iter = m->private;
2024 struct ftrace_ops *ops = &global_ops;
1464 struct dyn_ftrace *rec = NULL; 2025 struct dyn_ftrace *rec = NULL;
1465 2026
2027 if (unlikely(ftrace_disabled))
2028 return NULL;
2029
1466 if (iter->flags & FTRACE_ITER_HASH) 2030 if (iter->flags & FTRACE_ITER_HASH)
1467 return t_hash_next(m, pos); 2031 return t_hash_next(m, pos);
1468 2032
@@ -1483,17 +2047,15 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1483 rec = &iter->pg->records[iter->idx++]; 2047 rec = &iter->pg->records[iter->idx++];
1484 if ((rec->flags & FTRACE_FL_FREE) || 2048 if ((rec->flags & FTRACE_FL_FREE) ||
1485 2049
1486 (!(iter->flags & FTRACE_ITER_FAILURES) &&
1487 (rec->flags & FTRACE_FL_FAILED)) ||
1488
1489 ((iter->flags & FTRACE_ITER_FAILURES) &&
1490 !(rec->flags & FTRACE_FL_FAILED)) ||
1491
1492 ((iter->flags & FTRACE_ITER_FILTER) && 2050 ((iter->flags & FTRACE_ITER_FILTER) &&
1493 !(rec->flags & FTRACE_FL_FILTER)) || 2051 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
1494 2052
1495 ((iter->flags & FTRACE_ITER_NOTRACE) && 2053 ((iter->flags & FTRACE_ITER_NOTRACE) &&
1496 !(rec->flags & FTRACE_FL_NOTRACE))) { 2054 !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) ||
2055
2056 ((iter->flags & FTRACE_ITER_ENABLED) &&
2057 !(rec->flags & ~FTRACE_FL_MASK))) {
2058
1497 rec = NULL; 2059 rec = NULL;
1498 goto retry; 2060 goto retry;
1499 } 2061 }
@@ -1517,10 +2079,15 @@ static void reset_iter_read(struct ftrace_iterator *iter)
1517static void *t_start(struct seq_file *m, loff_t *pos) 2079static void *t_start(struct seq_file *m, loff_t *pos)
1518{ 2080{
1519 struct ftrace_iterator *iter = m->private; 2081 struct ftrace_iterator *iter = m->private;
2082 struct ftrace_ops *ops = &global_ops;
1520 void *p = NULL; 2083 void *p = NULL;
1521 loff_t l; 2084 loff_t l;
1522 2085
1523 mutex_lock(&ftrace_lock); 2086 mutex_lock(&ftrace_lock);
2087
2088 if (unlikely(ftrace_disabled))
2089 return NULL;
2090
1524 /* 2091 /*
1525 * If an lseek was done, then reset and start from beginning. 2092 * If an lseek was done, then reset and start from beginning.
1526 */ 2093 */
@@ -1532,7 +2099,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1532 * off, we can short cut and just print out that all 2099 * off, we can short cut and just print out that all
1533 * functions are enabled. 2100 * functions are enabled.
1534 */ 2101 */
1535 if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { 2102 if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) {
1536 if (*pos > 0) 2103 if (*pos > 0)
1537 return t_hash_start(m, pos); 2104 return t_hash_start(m, pos);
1538 iter->flags |= FTRACE_ITER_PRINTALL; 2105 iter->flags |= FTRACE_ITER_PRINTALL;
@@ -1590,7 +2157,11 @@ static int t_show(struct seq_file *m, void *v)
1590 if (!rec) 2157 if (!rec)
1591 return 0; 2158 return 0;
1592 2159
1593 seq_printf(m, "%ps\n", (void *)rec->ip); 2160 seq_printf(m, "%ps", (void *)rec->ip);
2161 if (iter->flags & FTRACE_ITER_ENABLED)
2162 seq_printf(m, " (%ld)",
2163 rec->flags & ~FTRACE_FL_MASK);
2164 seq_printf(m, "\n");
1594 2165
1595 return 0; 2166 return 0;
1596} 2167}
@@ -1630,44 +2201,46 @@ ftrace_avail_open(struct inode *inode, struct file *file)
1630} 2201}
1631 2202
1632static int 2203static int
1633ftrace_failures_open(struct inode *inode, struct file *file) 2204ftrace_enabled_open(struct inode *inode, struct file *file)
1634{ 2205{
1635 int ret;
1636 struct seq_file *m;
1637 struct ftrace_iterator *iter; 2206 struct ftrace_iterator *iter;
2207 int ret;
1638 2208
1639 ret = ftrace_avail_open(inode, file); 2209 if (unlikely(ftrace_disabled))
2210 return -ENODEV;
2211
2212 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2213 if (!iter)
2214 return -ENOMEM;
2215
2216 iter->pg = ftrace_pages_start;
2217 iter->flags = FTRACE_ITER_ENABLED;
2218
2219 ret = seq_open(file, &show_ftrace_seq_ops);
1640 if (!ret) { 2220 if (!ret) {
1641 m = file->private_data; 2221 struct seq_file *m = file->private_data;
1642 iter = m->private; 2222
1643 iter->flags = FTRACE_ITER_FAILURES; 2223 m->private = iter;
2224 } else {
2225 kfree(iter);
1644 } 2226 }
1645 2227
1646 return ret; 2228 return ret;
1647} 2229}
1648 2230
1649 2231static void ftrace_filter_reset(struct ftrace_hash *hash)
1650static void ftrace_filter_reset(int enable)
1651{ 2232{
1652 struct ftrace_page *pg;
1653 struct dyn_ftrace *rec;
1654 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1655
1656 mutex_lock(&ftrace_lock); 2233 mutex_lock(&ftrace_lock);
1657 if (enable) 2234 ftrace_hash_clear(hash);
1658 ftrace_filtered = 0;
1659 do_for_each_ftrace_rec(pg, rec) {
1660 if (rec->flags & FTRACE_FL_FAILED)
1661 continue;
1662 rec->flags &= ~type;
1663 } while_for_each_ftrace_rec();
1664 mutex_unlock(&ftrace_lock); 2235 mutex_unlock(&ftrace_lock);
1665} 2236}
1666 2237
1667static int 2238static int
1668ftrace_regex_open(struct inode *inode, struct file *file, int enable) 2239ftrace_regex_open(struct ftrace_ops *ops, int flag,
2240 struct inode *inode, struct file *file)
1669{ 2241{
1670 struct ftrace_iterator *iter; 2242 struct ftrace_iterator *iter;
2243 struct ftrace_hash *hash;
1671 int ret = 0; 2244 int ret = 0;
1672 2245
1673 if (unlikely(ftrace_disabled)) 2246 if (unlikely(ftrace_disabled))
@@ -1682,21 +2255,42 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1682 return -ENOMEM; 2255 return -ENOMEM;
1683 } 2256 }
1684 2257
2258 if (flag & FTRACE_ITER_NOTRACE)
2259 hash = ops->notrace_hash;
2260 else
2261 hash = ops->filter_hash;
2262
2263 iter->ops = ops;
2264 iter->flags = flag;
2265
2266 if (file->f_mode & FMODE_WRITE) {
2267 mutex_lock(&ftrace_lock);
2268 iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash);
2269 mutex_unlock(&ftrace_lock);
2270
2271 if (!iter->hash) {
2272 trace_parser_put(&iter->parser);
2273 kfree(iter);
2274 return -ENOMEM;
2275 }
2276 }
2277
1685 mutex_lock(&ftrace_regex_lock); 2278 mutex_lock(&ftrace_regex_lock);
2279
1686 if ((file->f_mode & FMODE_WRITE) && 2280 if ((file->f_mode & FMODE_WRITE) &&
1687 (file->f_flags & O_TRUNC)) 2281 (file->f_flags & O_TRUNC))
1688 ftrace_filter_reset(enable); 2282 ftrace_filter_reset(iter->hash);
1689 2283
1690 if (file->f_mode & FMODE_READ) { 2284 if (file->f_mode & FMODE_READ) {
1691 iter->pg = ftrace_pages_start; 2285 iter->pg = ftrace_pages_start;
1692 iter->flags = enable ? FTRACE_ITER_FILTER :
1693 FTRACE_ITER_NOTRACE;
1694 2286
1695 ret = seq_open(file, &show_ftrace_seq_ops); 2287 ret = seq_open(file, &show_ftrace_seq_ops);
1696 if (!ret) { 2288 if (!ret) {
1697 struct seq_file *m = file->private_data; 2289 struct seq_file *m = file->private_data;
1698 m->private = iter; 2290 m->private = iter;
1699 } else { 2291 } else {
2292 /* Failed */
2293 free_ftrace_hash(iter->hash);
1700 trace_parser_put(&iter->parser); 2294 trace_parser_put(&iter->parser);
1701 kfree(iter); 2295 kfree(iter);
1702 } 2296 }
@@ -1710,13 +2304,15 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1710static int 2304static int
1711ftrace_filter_open(struct inode *inode, struct file *file) 2305ftrace_filter_open(struct inode *inode, struct file *file)
1712{ 2306{
1713 return ftrace_regex_open(inode, file, 1); 2307 return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER,
2308 inode, file);
1714} 2309}
1715 2310
1716static int 2311static int
1717ftrace_notrace_open(struct inode *inode, struct file *file) 2312ftrace_notrace_open(struct inode *inode, struct file *file)
1718{ 2313{
1719 return ftrace_regex_open(inode, file, 0); 2314 return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE,
2315 inode, file);
1720} 2316}
1721 2317
1722static loff_t 2318static loff_t
@@ -1761,86 +2357,99 @@ static int ftrace_match(char *str, char *regex, int len, int type)
1761} 2357}
1762 2358
1763static int 2359static int
1764ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) 2360enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not)
2361{
2362 struct ftrace_func_entry *entry;
2363 int ret = 0;
2364
2365 entry = ftrace_lookup_ip(hash, rec->ip);
2366 if (not) {
2367 /* Do nothing if it doesn't exist */
2368 if (!entry)
2369 return 0;
2370
2371 free_hash_entry(hash, entry);
2372 } else {
2373 /* Do nothing if it exists */
2374 if (entry)
2375 return 0;
2376
2377 ret = add_hash_entry(hash, rec->ip);
2378 }
2379 return ret;
2380}
2381
2382static int
2383ftrace_match_record(struct dyn_ftrace *rec, char *mod,
2384 char *regex, int len, int type)
1765{ 2385{
1766 char str[KSYM_SYMBOL_LEN]; 2386 char str[KSYM_SYMBOL_LEN];
2387 char *modname;
2388
2389 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
2390
2391 if (mod) {
2392 /* module lookup requires matching the module */
2393 if (!modname || strcmp(modname, mod))
2394 return 0;
2395
2396 /* blank search means to match all funcs in the mod */
2397 if (!len)
2398 return 1;
2399 }
1767 2400
1768 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1769 return ftrace_match(str, regex, len, type); 2401 return ftrace_match(str, regex, len, type);
1770} 2402}
1771 2403
1772static int ftrace_match_records(char *buff, int len, int enable) 2404static int
2405match_records(struct ftrace_hash *hash, char *buff,
2406 int len, char *mod, int not)
1773{ 2407{
1774 unsigned int search_len; 2408 unsigned search_len = 0;
1775 struct ftrace_page *pg; 2409 struct ftrace_page *pg;
1776 struct dyn_ftrace *rec; 2410 struct dyn_ftrace *rec;
1777 unsigned long flag; 2411 int type = MATCH_FULL;
1778 char *search; 2412 char *search = buff;
1779 int type;
1780 int not;
1781 int found = 0; 2413 int found = 0;
2414 int ret;
1782 2415
1783 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 2416 if (len) {
1784 type = filter_parse_regex(buff, len, &search, &not); 2417 type = filter_parse_regex(buff, len, &search, &not);
1785 2418 search_len = strlen(search);
1786 search_len = strlen(search); 2419 }
1787 2420
1788 mutex_lock(&ftrace_lock); 2421 mutex_lock(&ftrace_lock);
1789 do_for_each_ftrace_rec(pg, rec) {
1790 2422
1791 if (rec->flags & FTRACE_FL_FAILED) 2423 if (unlikely(ftrace_disabled))
1792 continue; 2424 goto out_unlock;
1793 2425
1794 if (ftrace_match_record(rec, search, search_len, type)) { 2426 do_for_each_ftrace_rec(pg, rec) {
1795 if (not) 2427
1796 rec->flags &= ~flag; 2428 if (ftrace_match_record(rec, mod, search, search_len, type)) {
1797 else 2429 ret = enter_record(hash, rec, not);
1798 rec->flags |= flag; 2430 if (ret < 0) {
2431 found = ret;
2432 goto out_unlock;
2433 }
1799 found = 1; 2434 found = 1;
1800 } 2435 }
1801 /*
1802 * Only enable filtering if we have a function that
1803 * is filtered on.
1804 */
1805 if (enable && (rec->flags & FTRACE_FL_FILTER))
1806 ftrace_filtered = 1;
1807 } while_for_each_ftrace_rec(); 2436 } while_for_each_ftrace_rec();
2437 out_unlock:
1808 mutex_unlock(&ftrace_lock); 2438 mutex_unlock(&ftrace_lock);
1809 2439
1810 return found; 2440 return found;
1811} 2441}
1812 2442
1813static int 2443static int
1814ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, 2444ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)
1815 char *regex, int len, int type)
1816{ 2445{
1817 char str[KSYM_SYMBOL_LEN]; 2446 return match_records(hash, buff, len, NULL, 0);
1818 char *modname;
1819
1820 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
1821
1822 if (!modname || strcmp(modname, mod))
1823 return 0;
1824
1825 /* blank search means to match all funcs in the mod */
1826 if (len)
1827 return ftrace_match(str, regex, len, type);
1828 else
1829 return 1;
1830} 2447}
1831 2448
1832static int ftrace_match_module_records(char *buff, char *mod, int enable) 2449static int
2450ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
1833{ 2451{
1834 unsigned search_len = 0;
1835 struct ftrace_page *pg;
1836 struct dyn_ftrace *rec;
1837 int type = MATCH_FULL;
1838 char *search = buff;
1839 unsigned long flag;
1840 int not = 0; 2452 int not = 0;
1841 int found = 0;
1842
1843 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1844 2453
1845 /* blank or '*' mean the same */ 2454 /* blank or '*' mean the same */
1846 if (strcmp(buff, "*") == 0) 2455 if (strcmp(buff, "*") == 0)
@@ -1852,32 +2461,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable)
1852 not = 1; 2461 not = 1;
1853 } 2462 }
1854 2463
1855 if (strlen(buff)) { 2464 return match_records(hash, buff, strlen(buff), mod, not);
1856 type = filter_parse_regex(buff, strlen(buff), &search, &not);
1857 search_len = strlen(search);
1858 }
1859
1860 mutex_lock(&ftrace_lock);
1861 do_for_each_ftrace_rec(pg, rec) {
1862
1863 if (rec->flags & FTRACE_FL_FAILED)
1864 continue;
1865
1866 if (ftrace_match_module_record(rec, mod,
1867 search, search_len, type)) {
1868 if (not)
1869 rec->flags &= ~flag;
1870 else
1871 rec->flags |= flag;
1872 found = 1;
1873 }
1874 if (enable && (rec->flags & FTRACE_FL_FILTER))
1875 ftrace_filtered = 1;
1876
1877 } while_for_each_ftrace_rec();
1878 mutex_unlock(&ftrace_lock);
1879
1880 return found;
1881} 2465}
1882 2466
1883/* 2467/*
@@ -1886,9 +2470,11 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable)
1886 */ 2470 */
1887 2471
1888static int 2472static int
1889ftrace_mod_callback(char *func, char *cmd, char *param, int enable) 2473ftrace_mod_callback(struct ftrace_hash *hash,
2474 char *func, char *cmd, char *param, int enable)
1890{ 2475{
1891 char *mod; 2476 char *mod;
2477 int ret = -EINVAL;
1892 2478
1893 /* 2479 /*
1894 * cmd == 'mod' because we only registered this func 2480 * cmd == 'mod' because we only registered this func
@@ -1900,15 +2486,19 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1900 2486
1901 /* we must have a module name */ 2487 /* we must have a module name */
1902 if (!param) 2488 if (!param)
1903 return -EINVAL; 2489 return ret;
1904 2490
1905 mod = strsep(&param, ":"); 2491 mod = strsep(&param, ":");
1906 if (!strlen(mod)) 2492 if (!strlen(mod))
1907 return -EINVAL; 2493 return ret;
1908 2494
1909 if (ftrace_match_module_records(func, mod, enable)) 2495 ret = ftrace_match_module_records(hash, func, mod);
1910 return 0; 2496 if (!ret)
1911 return -EINVAL; 2497 ret = -EINVAL;
2498 if (ret < 0)
2499 return ret;
2500
2501 return 0;
1912} 2502}
1913 2503
1914static struct ftrace_func_command ftrace_mod_cmd = { 2504static struct ftrace_func_command ftrace_mod_cmd = {
@@ -1959,6 +2549,7 @@ static int ftrace_probe_registered;
1959 2549
1960static void __enable_ftrace_function_probe(void) 2550static void __enable_ftrace_function_probe(void)
1961{ 2551{
2552 int ret;
1962 int i; 2553 int i;
1963 2554
1964 if (ftrace_probe_registered) 2555 if (ftrace_probe_registered)
@@ -1973,13 +2564,16 @@ static void __enable_ftrace_function_probe(void)
1973 if (i == FTRACE_FUNC_HASHSIZE) 2564 if (i == FTRACE_FUNC_HASHSIZE)
1974 return; 2565 return;
1975 2566
1976 __register_ftrace_function(&trace_probe_ops); 2567 ret = __register_ftrace_function(&trace_probe_ops);
1977 ftrace_startup(0); 2568 if (!ret)
2569 ret = ftrace_startup(&trace_probe_ops, 0);
2570
1978 ftrace_probe_registered = 1; 2571 ftrace_probe_registered = 1;
1979} 2572}
1980 2573
1981static void __disable_ftrace_function_probe(void) 2574static void __disable_ftrace_function_probe(void)
1982{ 2575{
2576 int ret;
1983 int i; 2577 int i;
1984 2578
1985 if (!ftrace_probe_registered) 2579 if (!ftrace_probe_registered)
@@ -1992,8 +2586,10 @@ static void __disable_ftrace_function_probe(void)
1992 } 2586 }
1993 2587
1994 /* no more funcs left */ 2588 /* no more funcs left */
1995 __unregister_ftrace_function(&trace_probe_ops); 2589 ret = __unregister_ftrace_function(&trace_probe_ops);
1996 ftrace_shutdown(0); 2590 if (!ret)
2591 ftrace_shutdown(&trace_probe_ops, 0);
2592
1997 ftrace_probe_registered = 0; 2593 ftrace_probe_registered = 0;
1998} 2594}
1999 2595
@@ -2029,12 +2625,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2029 return -EINVAL; 2625 return -EINVAL;
2030 2626
2031 mutex_lock(&ftrace_lock); 2627 mutex_lock(&ftrace_lock);
2032 do_for_each_ftrace_rec(pg, rec) {
2033 2628
2034 if (rec->flags & FTRACE_FL_FAILED) 2629 if (unlikely(ftrace_disabled))
2035 continue; 2630 goto out_unlock;
2036 2631
2037 if (!ftrace_match_record(rec, search, len, type)) 2632 do_for_each_ftrace_rec(pg, rec) {
2633
2634 if (!ftrace_match_record(rec, NULL, search, len, type))
2038 continue; 2635 continue;
2039 2636
2040 entry = kmalloc(sizeof(*entry), GFP_KERNEL); 2637 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
@@ -2195,7 +2792,8 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd)
2195 return ret; 2792 return ret;
2196} 2793}
2197 2794
2198static int ftrace_process_regex(char *buff, int len, int enable) 2795static int ftrace_process_regex(struct ftrace_hash *hash,
2796 char *buff, int len, int enable)
2199{ 2797{
2200 char *func, *command, *next = buff; 2798 char *func, *command, *next = buff;
2201 struct ftrace_func_command *p; 2799 struct ftrace_func_command *p;
@@ -2204,9 +2802,12 @@ static int ftrace_process_regex(char *buff, int len, int enable)
2204 func = strsep(&next, ":"); 2802 func = strsep(&next, ":");
2205 2803
2206 if (!next) { 2804 if (!next) {
2207 if (ftrace_match_records(func, len, enable)) 2805 ret = ftrace_match_records(hash, func, len);
2208 return 0; 2806 if (!ret)
2209 return ret; 2807 ret = -EINVAL;
2808 if (ret < 0)
2809 return ret;
2810 return 0;
2210 } 2811 }
2211 2812
2212 /* command found */ 2813 /* command found */
@@ -2216,7 +2817,7 @@ static int ftrace_process_regex(char *buff, int len, int enable)
2216 mutex_lock(&ftrace_cmd_mutex); 2817 mutex_lock(&ftrace_cmd_mutex);
2217 list_for_each_entry(p, &ftrace_commands, list) { 2818 list_for_each_entry(p, &ftrace_commands, list) {
2218 if (strcmp(p->name, command) == 0) { 2819 if (strcmp(p->name, command) == 0) {
2219 ret = p->func(func, command, next, enable); 2820 ret = p->func(hash, func, command, next, enable);
2220 goto out_unlock; 2821 goto out_unlock;
2221 } 2822 }
2222 } 2823 }
@@ -2239,6 +2840,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2239 2840
2240 mutex_lock(&ftrace_regex_lock); 2841 mutex_lock(&ftrace_regex_lock);
2241 2842
2843 ret = -ENODEV;
2844 if (unlikely(ftrace_disabled))
2845 goto out_unlock;
2846
2242 if (file->f_mode & FMODE_READ) { 2847 if (file->f_mode & FMODE_READ) {
2243 struct seq_file *m = file->private_data; 2848 struct seq_file *m = file->private_data;
2244 iter = m->private; 2849 iter = m->private;
@@ -2250,7 +2855,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2250 2855
2251 if (read >= 0 && trace_parser_loaded(parser) && 2856 if (read >= 0 && trace_parser_loaded(parser) &&
2252 !trace_parser_cont(parser)) { 2857 !trace_parser_cont(parser)) {
2253 ret = ftrace_process_regex(parser->buffer, 2858 ret = ftrace_process_regex(iter->hash, parser->buffer,
2254 parser->idx, enable); 2859 parser->idx, enable);
2255 trace_parser_clear(parser); 2860 trace_parser_clear(parser);
2256 if (ret) 2861 if (ret)
@@ -2278,22 +2883,53 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
2278 return ftrace_regex_write(file, ubuf, cnt, ppos, 0); 2883 return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
2279} 2884}
2280 2885
2281static void 2886static int
2282ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) 2887ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
2888 int reset, int enable)
2283{ 2889{
2890 struct ftrace_hash **orig_hash;
2891 struct ftrace_hash *hash;
2892 int ret;
2893
2894 /* All global ops use the global ops filters */
2895 if (ops->flags & FTRACE_OPS_FL_GLOBAL)
2896 ops = &global_ops;
2897
2284 if (unlikely(ftrace_disabled)) 2898 if (unlikely(ftrace_disabled))
2285 return; 2899 return -ENODEV;
2900
2901 if (enable)
2902 orig_hash = &ops->filter_hash;
2903 else
2904 orig_hash = &ops->notrace_hash;
2905
2906 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
2907 if (!hash)
2908 return -ENOMEM;
2286 2909
2287 mutex_lock(&ftrace_regex_lock); 2910 mutex_lock(&ftrace_regex_lock);
2288 if (reset) 2911 if (reset)
2289 ftrace_filter_reset(enable); 2912 ftrace_filter_reset(hash);
2290 if (buf) 2913 if (buf)
2291 ftrace_match_records(buf, len, enable); 2914 ftrace_match_records(hash, buf, len);
2915
2916 mutex_lock(&ftrace_lock);
2917 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
2918 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED
2919 && ftrace_enabled)
2920 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2921
2922 mutex_unlock(&ftrace_lock);
2923
2292 mutex_unlock(&ftrace_regex_lock); 2924 mutex_unlock(&ftrace_regex_lock);
2925
2926 free_ftrace_hash(hash);
2927 return ret;
2293} 2928}
2294 2929
2295/** 2930/**
2296 * ftrace_set_filter - set a function to filter on in ftrace 2931 * ftrace_set_filter - set a function to filter on in ftrace
2932 * @ops - the ops to set the filter with
2297 * @buf - the string that holds the function filter text. 2933 * @buf - the string that holds the function filter text.
2298 * @len - the length of the string. 2934 * @len - the length of the string.
2299 * @reset - non zero to reset all filters before applying this filter. 2935 * @reset - non zero to reset all filters before applying this filter.
@@ -2301,13 +2937,16 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
2301 * Filters denote which functions should be enabled when tracing is enabled. 2937 * Filters denote which functions should be enabled when tracing is enabled.
2302 * If @buf is NULL and reset is set, all functions will be enabled for tracing. 2938 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
2303 */ 2939 */
2304void ftrace_set_filter(unsigned char *buf, int len, int reset) 2940void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
2941 int len, int reset)
2305{ 2942{
2306 ftrace_set_regex(buf, len, reset, 1); 2943 ftrace_set_regex(ops, buf, len, reset, 1);
2307} 2944}
2945EXPORT_SYMBOL_GPL(ftrace_set_filter);
2308 2946
2309/** 2947/**
2310 * ftrace_set_notrace - set a function to not trace in ftrace 2948 * ftrace_set_notrace - set a function to not trace in ftrace
2949 * @ops - the ops to set the notrace filter with
2311 * @buf - the string that holds the function notrace text. 2950 * @buf - the string that holds the function notrace text.
2312 * @len - the length of the string. 2951 * @len - the length of the string.
2313 * @reset - non zero to reset all filters before applying this filter. 2952 * @reset - non zero to reset all filters before applying this filter.
@@ -2316,10 +2955,44 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset)
2316 * is enabled. If @buf is NULL and reset is set, all functions will be enabled 2955 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
2317 * for tracing. 2956 * for tracing.
2318 */ 2957 */
2319void ftrace_set_notrace(unsigned char *buf, int len, int reset) 2958void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
2959 int len, int reset)
2320{ 2960{
2321 ftrace_set_regex(buf, len, reset, 0); 2961 ftrace_set_regex(ops, buf, len, reset, 0);
2322} 2962}
2963EXPORT_SYMBOL_GPL(ftrace_set_notrace);
2964/**
2965 * ftrace_set_filter - set a function to filter on in ftrace
2966 * @ops - the ops to set the filter with
2967 * @buf - the string that holds the function filter text.
2968 * @len - the length of the string.
2969 * @reset - non zero to reset all filters before applying this filter.
2970 *
2971 * Filters denote which functions should be enabled when tracing is enabled.
2972 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
2973 */
2974void ftrace_set_global_filter(unsigned char *buf, int len, int reset)
2975{
2976 ftrace_set_regex(&global_ops, buf, len, reset, 1);
2977}
2978EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
2979
2980/**
2981 * ftrace_set_notrace - set a function to not trace in ftrace
2982 * @ops - the ops to set the notrace filter with
2983 * @buf - the string that holds the function notrace text.
2984 * @len - the length of the string.
2985 * @reset - non zero to reset all filters before applying this filter.
2986 *
2987 * Notrace Filters denote which functions should not be enabled when tracing
2988 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
2989 * for tracing.
2990 */
2991void ftrace_set_global_notrace(unsigned char *buf, int len, int reset)
2992{
2993 ftrace_set_regex(&global_ops, buf, len, reset, 0);
2994}
2995EXPORT_SYMBOL_GPL(ftrace_set_global_notrace);
2323 2996
2324/* 2997/*
2325 * command line interface to allow users to set filters on boot up. 2998 * command line interface to allow users to set filters on boot up.
@@ -2370,22 +3043,23 @@ static void __init set_ftrace_early_graph(char *buf)
2370} 3043}
2371#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3044#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2372 3045
2373static void __init set_ftrace_early_filter(char *buf, int enable) 3046static void __init
3047set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
2374{ 3048{
2375 char *func; 3049 char *func;
2376 3050
2377 while (buf) { 3051 while (buf) {
2378 func = strsep(&buf, ","); 3052 func = strsep(&buf, ",");
2379 ftrace_set_regex(func, strlen(func), 0, enable); 3053 ftrace_set_regex(ops, func, strlen(func), 0, enable);
2380 } 3054 }
2381} 3055}
2382 3056
2383static void __init set_ftrace_early_filters(void) 3057static void __init set_ftrace_early_filters(void)
2384{ 3058{
2385 if (ftrace_filter_buf[0]) 3059 if (ftrace_filter_buf[0])
2386 set_ftrace_early_filter(ftrace_filter_buf, 1); 3060 set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1);
2387 if (ftrace_notrace_buf[0]) 3061 if (ftrace_notrace_buf[0])
2388 set_ftrace_early_filter(ftrace_notrace_buf, 0); 3062 set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0);
2389#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3063#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2390 if (ftrace_graph_buf[0]) 3064 if (ftrace_graph_buf[0])
2391 set_ftrace_early_graph(ftrace_graph_buf); 3065 set_ftrace_early_graph(ftrace_graph_buf);
@@ -2393,11 +3067,14 @@ static void __init set_ftrace_early_filters(void)
2393} 3067}
2394 3068
2395static int 3069static int
2396ftrace_regex_release(struct inode *inode, struct file *file, int enable) 3070ftrace_regex_release(struct inode *inode, struct file *file)
2397{ 3071{
2398 struct seq_file *m = (struct seq_file *)file->private_data; 3072 struct seq_file *m = (struct seq_file *)file->private_data;
2399 struct ftrace_iterator *iter; 3073 struct ftrace_iterator *iter;
3074 struct ftrace_hash **orig_hash;
2400 struct trace_parser *parser; 3075 struct trace_parser *parser;
3076 int filter_hash;
3077 int ret;
2401 3078
2402 mutex_lock(&ftrace_regex_lock); 3079 mutex_lock(&ftrace_regex_lock);
2403 if (file->f_mode & FMODE_READ) { 3080 if (file->f_mode & FMODE_READ) {
@@ -2410,33 +3087,35 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2410 parser = &iter->parser; 3087 parser = &iter->parser;
2411 if (trace_parser_loaded(parser)) { 3088 if (trace_parser_loaded(parser)) {
2412 parser->buffer[parser->idx] = 0; 3089 parser->buffer[parser->idx] = 0;
2413 ftrace_match_records(parser->buffer, parser->idx, enable); 3090 ftrace_match_records(iter->hash, parser->buffer, parser->idx);
2414 } 3091 }
2415 3092
2416 mutex_lock(&ftrace_lock);
2417 if (ftrace_start_up && ftrace_enabled)
2418 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2419 mutex_unlock(&ftrace_lock);
2420
2421 trace_parser_put(parser); 3093 trace_parser_put(parser);
3094
3095 if (file->f_mode & FMODE_WRITE) {
3096 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
3097
3098 if (filter_hash)
3099 orig_hash = &iter->ops->filter_hash;
3100 else
3101 orig_hash = &iter->ops->notrace_hash;
3102
3103 mutex_lock(&ftrace_lock);
3104 ret = ftrace_hash_move(iter->ops, filter_hash,
3105 orig_hash, iter->hash);
3106 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED)
3107 && ftrace_enabled)
3108 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
3109
3110 mutex_unlock(&ftrace_lock);
3111 }
3112 free_ftrace_hash(iter->hash);
2422 kfree(iter); 3113 kfree(iter);
2423 3114
2424 mutex_unlock(&ftrace_regex_lock); 3115 mutex_unlock(&ftrace_regex_lock);
2425 return 0; 3116 return 0;
2426} 3117}
2427 3118
2428static int
2429ftrace_filter_release(struct inode *inode, struct file *file)
2430{
2431 return ftrace_regex_release(inode, file, 1);
2432}
2433
2434static int
2435ftrace_notrace_release(struct inode *inode, struct file *file)
2436{
2437 return ftrace_regex_release(inode, file, 0);
2438}
2439
2440static const struct file_operations ftrace_avail_fops = { 3119static const struct file_operations ftrace_avail_fops = {
2441 .open = ftrace_avail_open, 3120 .open = ftrace_avail_open,
2442 .read = seq_read, 3121 .read = seq_read,
@@ -2444,8 +3123,8 @@ static const struct file_operations ftrace_avail_fops = {
2444 .release = seq_release_private, 3123 .release = seq_release_private,
2445}; 3124};
2446 3125
2447static const struct file_operations ftrace_failures_fops = { 3126static const struct file_operations ftrace_enabled_fops = {
2448 .open = ftrace_failures_open, 3127 .open = ftrace_enabled_open,
2449 .read = seq_read, 3128 .read = seq_read,
2450 .llseek = seq_lseek, 3129 .llseek = seq_lseek,
2451 .release = seq_release_private, 3130 .release = seq_release_private,
@@ -2456,7 +3135,7 @@ static const struct file_operations ftrace_filter_fops = {
2456 .read = seq_read, 3135 .read = seq_read,
2457 .write = ftrace_filter_write, 3136 .write = ftrace_filter_write,
2458 .llseek = ftrace_regex_lseek, 3137 .llseek = ftrace_regex_lseek,
2459 .release = ftrace_filter_release, 3138 .release = ftrace_regex_release,
2460}; 3139};
2461 3140
2462static const struct file_operations ftrace_notrace_fops = { 3141static const struct file_operations ftrace_notrace_fops = {
@@ -2464,7 +3143,7 @@ static const struct file_operations ftrace_notrace_fops = {
2464 .read = seq_read, 3143 .read = seq_read,
2465 .write = ftrace_notrace_write, 3144 .write = ftrace_notrace_write,
2466 .llseek = ftrace_regex_lseek, 3145 .llseek = ftrace_regex_lseek,
2467 .release = ftrace_notrace_release, 3146 .release = ftrace_regex_release,
2468}; 3147};
2469 3148
2470#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3149#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -2573,9 +3252,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2573 bool exists; 3252 bool exists;
2574 int i; 3253 int i;
2575 3254
2576 if (ftrace_disabled)
2577 return -ENODEV;
2578
2579 /* decode regex */ 3255 /* decode regex */
2580 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 3256 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2581 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) 3257 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
@@ -2584,12 +3260,18 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2584 search_len = strlen(search); 3260 search_len = strlen(search);
2585 3261
2586 mutex_lock(&ftrace_lock); 3262 mutex_lock(&ftrace_lock);
3263
3264 if (unlikely(ftrace_disabled)) {
3265 mutex_unlock(&ftrace_lock);
3266 return -ENODEV;
3267 }
3268
2587 do_for_each_ftrace_rec(pg, rec) { 3269 do_for_each_ftrace_rec(pg, rec) {
2588 3270
2589 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 3271 if (rec->flags & FTRACE_FL_FREE)
2590 continue; 3272 continue;
2591 3273
2592 if (ftrace_match_record(rec, search, search_len, type)) { 3274 if (ftrace_match_record(rec, NULL, search, search_len, type)) {
2593 /* if it is in the array */ 3275 /* if it is in the array */
2594 exists = false; 3276 exists = false;
2595 for (i = 0; i < *idx; i++) { 3277 for (i = 0; i < *idx; i++) {
@@ -2679,8 +3361,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
2679 trace_create_file("available_filter_functions", 0444, 3361 trace_create_file("available_filter_functions", 0444,
2680 d_tracer, NULL, &ftrace_avail_fops); 3362 d_tracer, NULL, &ftrace_avail_fops);
2681 3363
2682 trace_create_file("failures", 0444, 3364 trace_create_file("enabled_functions", 0444,
2683 d_tracer, NULL, &ftrace_failures_fops); 3365 d_tracer, NULL, &ftrace_enabled_fops);
2684 3366
2685 trace_create_file("set_ftrace_filter", 0644, d_tracer, 3367 trace_create_file("set_ftrace_filter", 0644, d_tracer,
2686 NULL, &ftrace_filter_fops); 3368 NULL, &ftrace_filter_fops);
@@ -2703,7 +3385,7 @@ static int ftrace_process_locs(struct module *mod,
2703{ 3385{
2704 unsigned long *p; 3386 unsigned long *p;
2705 unsigned long addr; 3387 unsigned long addr;
2706 unsigned long flags; 3388 unsigned long flags = 0; /* Shut up gcc */
2707 3389
2708 mutex_lock(&ftrace_lock); 3390 mutex_lock(&ftrace_lock);
2709 p = start; 3391 p = start;
@@ -2720,10 +3402,19 @@ static int ftrace_process_locs(struct module *mod,
2720 ftrace_record_ip(addr); 3402 ftrace_record_ip(addr);
2721 } 3403 }
2722 3404
2723 /* disable interrupts to prevent kstop machine */ 3405 /*
2724 local_irq_save(flags); 3406 * We only need to disable interrupts on start up
3407 * because we are modifying code that an interrupt
3408 * may execute, and the modification is not atomic.
3409 * But for modules, nothing runs the code we modify
3410 * until we are finished with it, and there's no
3411 * reason to cause large interrupt latencies while we do it.
3412 */
3413 if (!mod)
3414 local_irq_save(flags);
2725 ftrace_update_code(mod); 3415 ftrace_update_code(mod);
2726 local_irq_restore(flags); 3416 if (!mod)
3417 local_irq_restore(flags);
2727 mutex_unlock(&ftrace_lock); 3418 mutex_unlock(&ftrace_lock);
2728 3419
2729 return 0; 3420 return 0;
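The hunk above takes the irq-save path only when the text being patched is already reachable; freshly loaded module code cannot run yet, so the interrupt-latency cost is skipped there. A minimal sketch of that conditional pattern, with made-up names (patch_text_example and text_is_live are illustrative, not kernel symbols):

#include <linux/irqflags.h>
#include <linux/types.h>

static void patch_text_example(bool text_is_live)
{
	unsigned long flags = 0;	/* initialized only to quiet gcc, as in the hunk */

	if (text_is_live)
		local_irq_save(flags);	/* the rewrite below is not atomic */

	/* ... non-atomic modification of kernel text goes here ... */

	if (text_is_live)
		local_irq_restore(flags);
}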
@@ -2735,10 +3426,11 @@ void ftrace_release_mod(struct module *mod)
2735 struct dyn_ftrace *rec; 3426 struct dyn_ftrace *rec;
2736 struct ftrace_page *pg; 3427 struct ftrace_page *pg;
2737 3428
3429 mutex_lock(&ftrace_lock);
3430
2738 if (ftrace_disabled) 3431 if (ftrace_disabled)
2739 return; 3432 goto out_unlock;
2740 3433
2741 mutex_lock(&ftrace_lock);
2742 do_for_each_ftrace_rec(pg, rec) { 3434 do_for_each_ftrace_rec(pg, rec) {
2743 if (within_module_core(rec->ip, mod)) { 3435 if (within_module_core(rec->ip, mod)) {
2744 /* 3436 /*
@@ -2749,6 +3441,7 @@ void ftrace_release_mod(struct module *mod)
2749 ftrace_free_rec(rec); 3441 ftrace_free_rec(rec);
2750 } 3442 }
2751 } while_for_each_ftrace_rec(); 3443 } while_for_each_ftrace_rec();
3444 out_unlock:
2752 mutex_unlock(&ftrace_lock); 3445 mutex_unlock(&ftrace_lock);
2753} 3446}
2754 3447
@@ -2835,6 +3528,10 @@ void __init ftrace_init(void)
2835 3528
2836#else 3529#else
2837 3530
3531static struct ftrace_ops global_ops = {
3532 .func = ftrace_stub,
3533};
3534
2838static int __init ftrace_nodyn_init(void) 3535static int __init ftrace_nodyn_init(void)
2839{ 3536{
2840 ftrace_enabled = 1; 3537 ftrace_enabled = 1;
@@ -2845,12 +3542,47 @@ device_initcall(ftrace_nodyn_init);
2845static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 3542static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
2846static inline void ftrace_startup_enable(int command) { } 3543static inline void ftrace_startup_enable(int command) { }
2847/* Keep as macros so we do not need to define the commands */ 3544/* Keep as macros so we do not need to define the commands */
2848# define ftrace_startup(command) do { } while (0) 3545# define ftrace_startup(ops, command) \
2849# define ftrace_shutdown(command) do { } while (0) 3546 ({ \
3547 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
3548 0; \
3549 })
3550# define ftrace_shutdown(ops, command) do { } while (0)
2850# define ftrace_startup_sysctl() do { } while (0) 3551# define ftrace_startup_sysctl() do { } while (0)
2851# define ftrace_shutdown_sysctl() do { } while (0) 3552# define ftrace_shutdown_sysctl() do { } while (0)
3553
3554static inline int
3555ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
3556{
3557 return 1;
3558}
3559
2852#endif /* CONFIG_DYNAMIC_FTRACE */ 3560#endif /* CONFIG_DYNAMIC_FTRACE */
2853 3561
3562static void
3563ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
3564{
3565 struct ftrace_ops *op;
3566
3567 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
3568 return;
3569
3570 trace_recursion_set(TRACE_INTERNAL_BIT);
3571 /*
3572 * Some of the ops may be dynamically allocated,
3573 * they must be freed after a synchronize_sched().
3574 */
3575 preempt_disable_notrace();
3576 op = rcu_dereference_raw(ftrace_ops_list);
3577 while (op != &ftrace_list_end) {
3578 if (ftrace_ops_test(op, ip))
3579 op->func(ip, parent_ip);
3580 op = rcu_dereference_raw(op->next);
3581 };
3582 preempt_enable_notrace();
3583 trace_recursion_clear(TRACE_INTERNAL_BIT);
3584}
3585
2854static void clear_ftrace_swapper(void) 3586static void clear_ftrace_swapper(void)
2855{ 3587{
2856 struct task_struct *p; 3588 struct task_struct *p;
@@ -3143,19 +3875,23 @@ void ftrace_kill(void)
3143 */ 3875 */
3144int register_ftrace_function(struct ftrace_ops *ops) 3876int register_ftrace_function(struct ftrace_ops *ops)
3145{ 3877{
3146 int ret; 3878 int ret = -1;
3147
3148 if (unlikely(ftrace_disabled))
3149 return -1;
3150 3879
3151 mutex_lock(&ftrace_lock); 3880 mutex_lock(&ftrace_lock);
3152 3881
3882 if (unlikely(ftrace_disabled))
3883 goto out_unlock;
3884
3153 ret = __register_ftrace_function(ops); 3885 ret = __register_ftrace_function(ops);
3154 ftrace_startup(0); 3886 if (!ret)
3887 ret = ftrace_startup(ops, 0);
3888
3155 3889
3890 out_unlock:
3156 mutex_unlock(&ftrace_lock); 3891 mutex_unlock(&ftrace_lock);
3157 return ret; 3892 return ret;
3158} 3893}
3894EXPORT_SYMBOL_GPL(register_ftrace_function);
3159 3895
3160/** 3896/**
3161 * unregister_ftrace_function - unregister a function for profiling. 3897 * unregister_ftrace_function - unregister a function for profiling.
@@ -3169,25 +3905,27 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
3169 3905
3170 mutex_lock(&ftrace_lock); 3906 mutex_lock(&ftrace_lock);
3171 ret = __unregister_ftrace_function(ops); 3907 ret = __unregister_ftrace_function(ops);
3172 ftrace_shutdown(0); 3908 if (!ret)
3909 ftrace_shutdown(ops, 0);
3173 mutex_unlock(&ftrace_lock); 3910 mutex_unlock(&ftrace_lock);
3174 3911
3175 return ret; 3912 return ret;
3176} 3913}
3914EXPORT_SYMBOL_GPL(unregister_ftrace_function);
3177 3915
3178int 3916int
3179ftrace_enable_sysctl(struct ctl_table *table, int write, 3917ftrace_enable_sysctl(struct ctl_table *table, int write,
3180 void __user *buffer, size_t *lenp, 3918 void __user *buffer, size_t *lenp,
3181 loff_t *ppos) 3919 loff_t *ppos)
3182{ 3920{
3183 int ret; 3921 int ret = -ENODEV;
3184
3185 if (unlikely(ftrace_disabled))
3186 return -ENODEV;
3187 3922
3188 mutex_lock(&ftrace_lock); 3923 mutex_lock(&ftrace_lock);
3189 3924
3190 ret = proc_dointvec(table, write, buffer, lenp, ppos); 3925 if (unlikely(ftrace_disabled))
3926 goto out;
3927
3928 ret = proc_dointvec(table, write, buffer, lenp, ppos);
3191 3929
3192 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) 3930 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3193 goto out; 3931 goto out;
@@ -3199,11 +3937,11 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3199 ftrace_startup_sysctl(); 3937 ftrace_startup_sysctl();
3200 3938
3201 /* we are starting ftrace again */ 3939 /* we are starting ftrace again */
3202 if (ftrace_list != &ftrace_list_end) { 3940 if (ftrace_ops_list != &ftrace_list_end) {
3203 if (ftrace_list->next == &ftrace_list_end) 3941 if (ftrace_ops_list->next == &ftrace_list_end)
3204 ftrace_trace_function = ftrace_list->func; 3942 ftrace_trace_function = ftrace_ops_list->func;
3205 else 3943 else
3206 ftrace_trace_function = ftrace_list_func; 3944 ftrace_trace_function = ftrace_ops_list_func;
3207 } 3945 }
3208 3946
3209 } else { 3947 } else {
@@ -3392,7 +4130,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
3392 ftrace_graph_return = retfunc; 4130 ftrace_graph_return = retfunc;
3393 ftrace_graph_entry = entryfunc; 4131 ftrace_graph_entry = entryfunc;
3394 4132
3395 ftrace_startup(FTRACE_START_FUNC_RET); 4133 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
3396 4134
3397out: 4135out:
3398 mutex_unlock(&ftrace_lock); 4136 mutex_unlock(&ftrace_lock);
@@ -3409,7 +4147,7 @@ void unregister_ftrace_graph(void)
3409 ftrace_graph_active--; 4147 ftrace_graph_active--;
3410 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 4148 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
3411 ftrace_graph_entry = ftrace_graph_entry_stub; 4149 ftrace_graph_entry = ftrace_graph_entry_stub;
3412 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 4150 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
3413 unregister_pm_notifier(&ftrace_suspend_notifier); 4151 unregister_pm_notifier(&ftrace_suspend_notifier);
3414 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 4152 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3415 4153
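Taken together, the ftrace.c changes above hang a filter_hash and notrace_hash off each struct ftrace_ops and export register_ftrace_function()/ftrace_set_filter(), so every ops carries its own filter instead of sharing one global set. A hedged sketch of how a client might use the per-ops interface (my_ops, my_callback and the module boilerplate are hypothetical, not part of this patch):

#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/string.h>

/* called for every function that matches this ops' filter hash */
static void my_callback(unsigned long ip, unsigned long parent_ip)
{
	/* must be notrace-safe: no sleeping, no recursing into the tracer */
}

static struct ftrace_ops my_ops = {
	.func = my_callback,
};

static int __init my_init(void)
{
	static char func[] = "schedule";

	/* this ops traces only "schedule"; other ops keep their own filters */
	ftrace_set_filter(&my_ops, (unsigned char *)func, strlen(func), 1);
	return register_ftrace_function(&my_ops);
}

static void __exit my_exit(void)
{
	unregister_ftrace_function(&my_ops);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");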
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 0ef7b4b2a1f7..731201bf4acc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -997,15 +997,21 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
997 unsigned nr_pages) 997 unsigned nr_pages)
998{ 998{
999 struct buffer_page *bpage, *tmp; 999 struct buffer_page *bpage, *tmp;
1000 unsigned long addr;
1001 LIST_HEAD(pages); 1000 LIST_HEAD(pages);
1002 unsigned i; 1001 unsigned i;
1003 1002
1004 WARN_ON(!nr_pages); 1003 WARN_ON(!nr_pages);
1005 1004
1006 for (i = 0; i < nr_pages; i++) { 1005 for (i = 0; i < nr_pages; i++) {
1006 struct page *page;
1007 /*
1008 * The __GFP_NORETRY flag makes sure that the allocation fails
1009 * gracefully without invoking the OOM killer, so the system is
1010 * not destabilized.
1011 */
1007 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1012 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1008 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 1013 GFP_KERNEL | __GFP_NORETRY,
1014 cpu_to_node(cpu_buffer->cpu));
1009 if (!bpage) 1015 if (!bpage)
1010 goto free_pages; 1016 goto free_pages;
1011 1017
@@ -1013,10 +1019,11 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
1013 1019
1014 list_add(&bpage->list, &pages); 1020 list_add(&bpage->list, &pages);
1015 1021
1016 addr = __get_free_page(GFP_KERNEL); 1022 page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
1017 if (!addr) 1023 GFP_KERNEL | __GFP_NORETRY, 0);
1024 if (!page)
1018 goto free_pages; 1025 goto free_pages;
1019 bpage->page = (void *)addr; 1026 bpage->page = page_address(page);
1020 rb_init_page(bpage->page); 1027 rb_init_page(bpage->page);
1021 } 1028 }
1022 1029
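The allocation above switches from __get_free_page() to alloc_pages_node() so each CPU's buffer pages come from that CPU's NUMA node, and adds __GFP_NORETRY so an oversized request fails instead of pushing the box into the OOM killer. The same pattern in isolation, as a hedged sketch (alloc_cpu_local_page and free_cpu_local_page are hypothetical helper names):

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/topology.h>

/* allocate one page on the NUMA node backing @cpu, failing gracefully */
static void *alloc_cpu_local_page(int cpu)
{
	struct page *page;

	page = alloc_pages_node(cpu_to_node(cpu),
				GFP_KERNEL | __GFP_NORETRY, 0);
	if (!page)
		return NULL;		/* caller must cope with the failure */

	return page_address(page);	/* kernel virtual address of the page */
}

static void free_cpu_local_page(void *addr)
{
	free_page((unsigned long)addr);
}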
@@ -1045,7 +1052,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1045{ 1052{
1046 struct ring_buffer_per_cpu *cpu_buffer; 1053 struct ring_buffer_per_cpu *cpu_buffer;
1047 struct buffer_page *bpage; 1054 struct buffer_page *bpage;
1048 unsigned long addr; 1055 struct page *page;
1049 int ret; 1056 int ret;
1050 1057
1051 cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), 1058 cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
@@ -1067,10 +1074,10 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1067 rb_check_bpage(cpu_buffer, bpage); 1074 rb_check_bpage(cpu_buffer, bpage);
1068 1075
1069 cpu_buffer->reader_page = bpage; 1076 cpu_buffer->reader_page = bpage;
1070 addr = __get_free_page(GFP_KERNEL); 1077 page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
1071 if (!addr) 1078 if (!page)
1072 goto fail_free_reader; 1079 goto fail_free_reader;
1073 bpage->page = (void *)addr; 1080 bpage->page = page_address(page);
1074 rb_init_page(bpage->page); 1081 rb_init_page(bpage->page);
1075 1082
1076 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 1083 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
@@ -1314,7 +1321,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1314 unsigned nr_pages, rm_pages, new_pages; 1321 unsigned nr_pages, rm_pages, new_pages;
1315 struct buffer_page *bpage, *tmp; 1322 struct buffer_page *bpage, *tmp;
1316 unsigned long buffer_size; 1323 unsigned long buffer_size;
1317 unsigned long addr;
1318 LIST_HEAD(pages); 1324 LIST_HEAD(pages);
1319 int i, cpu; 1325 int i, cpu;
1320 1326
@@ -1375,16 +1381,24 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1375 1381
1376 for_each_buffer_cpu(buffer, cpu) { 1382 for_each_buffer_cpu(buffer, cpu) {
1377 for (i = 0; i < new_pages; i++) { 1383 for (i = 0; i < new_pages; i++) {
1384 struct page *page;
1385 /*
1386 * The __GFP_NORETRY flag makes sure that the allocation
1387 * fails gracefully without invoking the OOM killer, so
1388 * the system is not destabilized.
1389 */
1378 bpage = kzalloc_node(ALIGN(sizeof(*bpage), 1390 bpage = kzalloc_node(ALIGN(sizeof(*bpage),
1379 cache_line_size()), 1391 cache_line_size()),
1380 GFP_KERNEL, cpu_to_node(cpu)); 1392 GFP_KERNEL | __GFP_NORETRY,
1393 cpu_to_node(cpu));
1381 if (!bpage) 1394 if (!bpage)
1382 goto free_pages; 1395 goto free_pages;
1383 list_add(&bpage->list, &pages); 1396 list_add(&bpage->list, &pages);
1384 addr = __get_free_page(GFP_KERNEL); 1397 page = alloc_pages_node(cpu_to_node(cpu),
1385 if (!addr) 1398 GFP_KERNEL | __GFP_NORETRY, 0);
1399 if (!page)
1386 goto free_pages; 1400 goto free_pages;
1387 bpage->page = (void *)addr; 1401 bpage->page = page_address(page);
1388 rb_init_page(bpage->page); 1402 rb_init_page(bpage->page);
1389 } 1403 }
1390 } 1404 }
@@ -2216,7 +2230,7 @@ static noinline void trace_recursive_fail(void)
2216 2230
2217 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" 2231 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
2218 "HC[%lu]:SC[%lu]:NMI[%lu]\n", 2232 "HC[%lu]:SC[%lu]:NMI[%lu]\n",
2219 current->trace_recursion, 2233 trace_recursion_buffer(),
2220 hardirq_count() >> HARDIRQ_SHIFT, 2234 hardirq_count() >> HARDIRQ_SHIFT,
2221 softirq_count() >> SOFTIRQ_SHIFT, 2235 softirq_count() >> SOFTIRQ_SHIFT,
2222 in_nmi()); 2236 in_nmi());
@@ -2226,9 +2240,9 @@ static noinline void trace_recursive_fail(void)
2226 2240
2227static inline int trace_recursive_lock(void) 2241static inline int trace_recursive_lock(void)
2228{ 2242{
2229 current->trace_recursion++; 2243 trace_recursion_inc();
2230 2244
2231 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) 2245 if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH))
2232 return 0; 2246 return 0;
2233 2247
2234 trace_recursive_fail(); 2248 trace_recursive_fail();
@@ -2238,9 +2252,9 @@ static inline int trace_recursive_lock(void)
2238 2252
2239static inline void trace_recursive_unlock(void) 2253static inline void trace_recursive_unlock(void)
2240{ 2254{
2241 WARN_ON_ONCE(!current->trace_recursion); 2255 WARN_ON_ONCE(!trace_recursion_buffer());
2242 2256
2243 current->trace_recursion--; 2257 trace_recursion_dec();
2244} 2258}
2245 2259
2246#else 2260#else
@@ -3730,16 +3744,17 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
3730 * Returns: 3744 * Returns:
3731 * The page allocated, or NULL on error. 3745 * The page allocated, or NULL on error.
3732 */ 3746 */
3733void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) 3747void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
3734{ 3748{
3735 struct buffer_data_page *bpage; 3749 struct buffer_data_page *bpage;
3736 unsigned long addr; 3750 struct page *page;
3737 3751
3738 addr = __get_free_page(GFP_KERNEL); 3752 page = alloc_pages_node(cpu_to_node(cpu),
3739 if (!addr) 3753 GFP_KERNEL | __GFP_NORETRY, 0);
3754 if (!page)
3740 return NULL; 3755 return NULL;
3741 3756
3742 bpage = (void *)addr; 3757 bpage = page_address(page);
3743 3758
3744 rb_init_page(bpage); 3759 rb_init_page(bpage);
3745 3760
@@ -3978,20 +3993,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
3978 size_t cnt, loff_t *ppos) 3993 size_t cnt, loff_t *ppos)
3979{ 3994{
3980 unsigned long *p = filp->private_data; 3995 unsigned long *p = filp->private_data;
3981 char buf[64];
3982 unsigned long val; 3996 unsigned long val;
3983 int ret; 3997 int ret;
3984 3998
3985 if (cnt >= sizeof(buf)) 3999 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
3986 return -EINVAL; 4000 if (ret)
3987
3988 if (copy_from_user(&buf, ubuf, cnt))
3989 return -EFAULT;
3990
3991 buf[cnt] = 0;
3992
3993 ret = strict_strtoul(buf, 10, &val);
3994 if (ret < 0)
3995 return ret; 4001 return ret;
3996 4002
3997 if (val) 4003 if (val)
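rb_simple_write() above is one of several write handlers in this series converted from the open-coded copy_from_user()/strict_strtoul() sequence to kstrtoul_from_user(), which copies, terminates and parses the user buffer in one call. A sketch of the resulting handler shape (example_write and example_value are illustrative names only):

#include <linux/fs.h>
#include <linux/kernel.h>

static unsigned long example_value;

static ssize_t example_write(struct file *filp, const char __user *ubuf,
			     size_t cnt, loff_t *ppos)
{
	unsigned long val;
	int ret;

	/* no local buffer, length check or NUL termination needed here */
	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
	if (ret)
		return ret;

	example_value = val;
	*ppos += cnt;
	return cnt;
}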
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 302f8a614635..a5457d577b98 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -106,7 +106,7 @@ static enum event_status read_page(int cpu)
106 int inc; 106 int inc;
107 int i; 107 int i;
108 108
109 bpage = ring_buffer_alloc_read_page(buffer); 109 bpage = ring_buffer_alloc_read_page(buffer, cpu);
110 if (!bpage) 110 if (!bpage)
111 return EVENT_DROPPED; 111 return EVENT_DROPPED;
112 112
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1cb49be7c7fb..e5df02c69b1d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -343,26 +343,27 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
343static int trace_stop_count; 343static int trace_stop_count;
344static DEFINE_SPINLOCK(tracing_start_lock); 344static DEFINE_SPINLOCK(tracing_start_lock);
345 345
346static void wakeup_work_handler(struct work_struct *work)
347{
348 wake_up(&trace_wait);
349}
350
351static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler);
352
346/** 353/**
347 * trace_wake_up - wake up tasks waiting for trace input 354 * trace_wake_up - wake up tasks waiting for trace input
348 * 355 *
349 * Simply wakes up any task that is blocked on the trace_wait 356 * Schedules a delayed work to wake up any task that is blocked on the
350 * queue. This is used with trace_poll for tasks polling the trace. 357 * trace_wait queue. This is used with trace_poll for tasks polling the
358 * trace.
351 */ 359 */
352void trace_wake_up(void) 360void trace_wake_up(void)
353{ 361{
354 int cpu; 362 const unsigned long delay = msecs_to_jiffies(2);
355 363
356 if (trace_flags & TRACE_ITER_BLOCK) 364 if (trace_flags & TRACE_ITER_BLOCK)
357 return; 365 return;
358 /* 366 schedule_delayed_work(&wakeup_work, delay);
359 * The runqueue_is_locked() can fail, but this is the best we
360 * have for now:
361 */
362 cpu = get_cpu();
363 if (!runqueue_is_locked(cpu))
364 wake_up(&trace_wait);
365 put_cpu();
366} 367}
367 368
368static int __init set_buf_size(char *str) 369static int __init set_buf_size(char *str)
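The trace_wake_up() rewrite above drops the runqueue_is_locked() heuristic entirely: the hot path merely queues a delayed work item, and the real wake_up() runs a couple of milliseconds later in process context, where taking the waitqueue lock is always safe. A minimal sketch of that deferred-wakeup pattern with made-up names:

#include <linux/jiffies.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wait);

static void example_wakeup_fn(struct work_struct *work)
{
	wake_up(&example_wait);		/* safe: runs in process context */
}
static DECLARE_DELAYED_WORK(example_wakeup_work, example_wakeup_fn);

/* cheap to call from most contexts; repeated calls coalesce into one wakeup */
static void example_poke_waiters(void)
{
	schedule_delayed_work(&example_wakeup_work, msecs_to_jiffies(2));
}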
@@ -424,6 +425,7 @@ static const char *trace_options[] = {
424 "graph-time", 425 "graph-time",
425 "record-cmd", 426 "record-cmd",
426 "overwrite", 427 "overwrite",
428 "disable_on_free",
427 NULL 429 NULL
428}; 430};
429 431
@@ -1191,6 +1193,18 @@ void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
1191} 1193}
1192EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); 1194EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
1193 1195
1196void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer,
1197 struct ring_buffer_event *event,
1198 unsigned long flags, int pc,
1199 struct pt_regs *regs)
1200{
1201 ring_buffer_unlock_commit(buffer, event);
1202
1203 ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
1204 ftrace_trace_userstack(buffer, flags, pc);
1205}
1206EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs);
1207
1194void trace_current_buffer_discard_commit(struct ring_buffer *buffer, 1208void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
1195 struct ring_buffer_event *event) 1209 struct ring_buffer_event *event)
1196{ 1210{
@@ -1234,30 +1248,103 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1234} 1248}
1235 1249
1236#ifdef CONFIG_STACKTRACE 1250#ifdef CONFIG_STACKTRACE
1251
1252#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long))
1253struct ftrace_stack {
1254 unsigned long calls[FTRACE_STACK_MAX_ENTRIES];
1255};
1256
1257static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack);
1258static DEFINE_PER_CPU(int, ftrace_stack_reserve);
1259
1237static void __ftrace_trace_stack(struct ring_buffer *buffer, 1260static void __ftrace_trace_stack(struct ring_buffer *buffer,
1238 unsigned long flags, 1261 unsigned long flags,
1239 int skip, int pc) 1262 int skip, int pc, struct pt_regs *regs)
1240{ 1263{
1241 struct ftrace_event_call *call = &event_kernel_stack; 1264 struct ftrace_event_call *call = &event_kernel_stack;
1242 struct ring_buffer_event *event; 1265 struct ring_buffer_event *event;
1243 struct stack_entry *entry; 1266 struct stack_entry *entry;
1244 struct stack_trace trace; 1267 struct stack_trace trace;
1268 int use_stack;
1269 int size = FTRACE_STACK_ENTRIES;
1270
1271 trace.nr_entries = 0;
1272 trace.skip = skip;
1273
1274 /*
1275 * Since events can happen in NMIs there's no safe way to
1276 * use the per cpu ftrace_stacks. We reserve it and if an interrupt
1277 * or NMI comes in, it will just have to use the default
1278 * FTRACE_STACK_SIZE.
1279 */
1280 preempt_disable_notrace();
1281
1282 use_stack = ++__get_cpu_var(ftrace_stack_reserve);
1283 /*
1284 * We don't need any atomic variables, just a barrier.
1285 * If an interrupt comes in, we don't care, because it would
1286 * have exited and put the counter back to what we want.
1287 * We just need a barrier to keep gcc from moving things
1288 * around.
1289 */
1290 barrier();
1291 if (use_stack == 1) {
1292 trace.entries = &__get_cpu_var(ftrace_stack).calls[0];
1293 trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
1294
1295 if (regs)
1296 save_stack_trace_regs(regs, &trace);
1297 else
1298 save_stack_trace(&trace);
1299
1300 if (trace.nr_entries > size)
1301 size = trace.nr_entries;
1302 } else
1303 /* From now on, use_stack is a boolean */
1304 use_stack = 0;
1305
1306 size *= sizeof(unsigned long);
1245 1307
1246 event = trace_buffer_lock_reserve(buffer, TRACE_STACK, 1308 event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
1247 sizeof(*entry), flags, pc); 1309 sizeof(*entry) + size, flags, pc);
1248 if (!event) 1310 if (!event)
1249 return; 1311 goto out;
1250 entry = ring_buffer_event_data(event); 1312 entry = ring_buffer_event_data(event);
1251 memset(&entry->caller, 0, sizeof(entry->caller));
1252 1313
1253 trace.nr_entries = 0; 1314 memset(&entry->caller, 0, size);
1254 trace.max_entries = FTRACE_STACK_ENTRIES; 1315
1255 trace.skip = skip; 1316 if (use_stack)
1256 trace.entries = entry->caller; 1317 memcpy(&entry->caller, trace.entries,
1318 trace.nr_entries * sizeof(unsigned long));
1319 else {
1320 trace.max_entries = FTRACE_STACK_ENTRIES;
1321 trace.entries = entry->caller;
1322 if (regs)
1323 save_stack_trace_regs(regs, &trace);
1324 else
1325 save_stack_trace(&trace);
1326 }
1327
1328 entry->size = trace.nr_entries;
1257 1329
1258 save_stack_trace(&trace);
1259 if (!filter_check_discard(call, entry, buffer, event)) 1330 if (!filter_check_discard(call, entry, buffer, event))
1260 ring_buffer_unlock_commit(buffer, event); 1331 ring_buffer_unlock_commit(buffer, event);
1332
1333 out:
1334 /* Again, don't let gcc optimize things here */
1335 barrier();
1336 __get_cpu_var(ftrace_stack_reserve)--;
1337 preempt_enable_notrace();
1338
1339}
1340
1341void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
1342 int skip, int pc, struct pt_regs *regs)
1343{
1344 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1345 return;
1346
1347 __ftrace_trace_stack(buffer, flags, skip, pc, regs);
1261} 1348}
1262 1349
1263void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, 1350void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
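The new __ftrace_trace_stack() above reserves its per-cpu scratch area with a plain counter and a compiler barrier instead of atomics: only code on the local CPU touches the counter, so a nested irq or NMI user simply observes a depth greater than one and takes the small fallback path. A hedged sketch of that reservation pattern (struct example_scratch and the function name are illustrative):

#include <linux/compiler.h>
#include <linux/percpu.h>
#include <linux/preempt.h>

struct example_scratch {
	unsigned long slot[64];
};

static DEFINE_PER_CPU(struct example_scratch, example_scratch);
static DEFINE_PER_CPU(int, example_scratch_depth);

static void example_use_scratch(void)
{
	int depth;

	preempt_disable_notrace();
	depth = ++__get_cpu_var(example_scratch_depth);
	barrier();		/* keep gcc from reordering the use below */

	if (depth == 1) {
		/* sole user on this CPU: the large per-cpu area is ours */
		struct example_scratch *s = &__get_cpu_var(example_scratch);

		s->slot[0] = 0;
		/* ... fill the scratch area ... */
	} else {
		/* nested irq/NMI user: fall back to a small local buffer */
	}

	barrier();
	__get_cpu_var(example_scratch_depth)--;
	preempt_enable_notrace();
}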
@@ -1266,13 +1353,13 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
1266 if (!(trace_flags & TRACE_ITER_STACKTRACE)) 1353 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1267 return; 1354 return;
1268 1355
1269 __ftrace_trace_stack(buffer, flags, skip, pc); 1356 __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
1270} 1357}
1271 1358
1272void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 1359void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1273 int pc) 1360 int pc)
1274{ 1361{
1275 __ftrace_trace_stack(tr->buffer, flags, skip, pc); 1362 __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL);
1276} 1363}
1277 1364
1278/** 1365/**
@@ -1288,7 +1375,7 @@ void trace_dump_stack(void)
1288 local_save_flags(flags); 1375 local_save_flags(flags);
1289 1376
1290 /* skipping 3 traces, seems to get us at the caller of this function */ 1377 /* skipping 3 traces, seems to get us at the caller of this function */
1291 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); 1378 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL);
1292} 1379}
1293 1380
1294static DEFINE_PER_CPU(int, user_stack_count); 1381static DEFINE_PER_CPU(int, user_stack_count);
@@ -1536,7 +1623,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1536 1623
1537 ftrace_enable_cpu(); 1624 ftrace_enable_cpu();
1538 1625
1539 return event ? ring_buffer_event_data(event) : NULL; 1626 if (event) {
1627 iter->ent_size = ring_buffer_event_length(event);
1628 return ring_buffer_event_data(event);
1629 }
1630 iter->ent_size = 0;
1631 return NULL;
1540} 1632}
1541 1633
1542static struct trace_entry * 1634static struct trace_entry *
@@ -2014,9 +2106,10 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2014{ 2106{
2015 enum print_line_t ret; 2107 enum print_line_t ret;
2016 2108
2017 if (iter->lost_events) 2109 if (iter->lost_events &&
2018 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", 2110 !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
2019 iter->cpu, iter->lost_events); 2111 iter->cpu, iter->lost_events))
2112 return TRACE_TYPE_PARTIAL_LINE;
2020 2113
2021 if (iter->trace && iter->trace->print_line) { 2114 if (iter->trace && iter->trace->print_line) {
2022 ret = iter->trace->print_line(iter); 2115 ret = iter->trace->print_line(iter);
@@ -2050,6 +2143,9 @@ void trace_default_header(struct seq_file *m)
2050{ 2143{
2051 struct trace_iterator *iter = m->private; 2144 struct trace_iterator *iter = m->private;
2052 2145
2146 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
2147 return;
2148
2053 if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2149 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
2054 /* print nothing if the buffers are empty */ 2150 /* print nothing if the buffers are empty */
2055 if (trace_empty(iter)) 2151 if (trace_empty(iter))
@@ -2700,20 +2796,11 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2700 size_t cnt, loff_t *ppos) 2796 size_t cnt, loff_t *ppos)
2701{ 2797{
2702 struct trace_array *tr = filp->private_data; 2798 struct trace_array *tr = filp->private_data;
2703 char buf[64];
2704 unsigned long val; 2799 unsigned long val;
2705 int ret; 2800 int ret;
2706 2801
2707 if (cnt >= sizeof(buf)) 2802 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2708 return -EINVAL; 2803 if (ret)
2709
2710 if (copy_from_user(&buf, ubuf, cnt))
2711 return -EFAULT;
2712
2713 buf[cnt] = 0;
2714
2715 ret = strict_strtoul(buf, 10, &val);
2716 if (ret < 0)
2717 return ret; 2804 return ret;
2718 2805
2719 val = !!val; 2806 val = !!val;
@@ -2766,7 +2853,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
2766 return t->init(tr); 2853 return t->init(tr);
2767} 2854}
2768 2855
2769static int tracing_resize_ring_buffer(unsigned long size) 2856static int __tracing_resize_ring_buffer(unsigned long size)
2770{ 2857{
2771 int ret; 2858 int ret;
2772 2859
@@ -2818,6 +2905,41 @@ static int tracing_resize_ring_buffer(unsigned long size)
2818 return ret; 2905 return ret;
2819} 2906}
2820 2907
2908static ssize_t tracing_resize_ring_buffer(unsigned long size)
2909{
2910 int cpu, ret = size;
2911
2912 mutex_lock(&trace_types_lock);
2913
2914 tracing_stop();
2915
2916 /* disable all cpu buffers */
2917 for_each_tracing_cpu(cpu) {
2918 if (global_trace.data[cpu])
2919 atomic_inc(&global_trace.data[cpu]->disabled);
2920 if (max_tr.data[cpu])
2921 atomic_inc(&max_tr.data[cpu]->disabled);
2922 }
2923
2924 if (size != global_trace.entries)
2925 ret = __tracing_resize_ring_buffer(size);
2926
2927 if (ret < 0)
2928 ret = -ENOMEM;
2929
2930 for_each_tracing_cpu(cpu) {
2931 if (global_trace.data[cpu])
2932 atomic_dec(&global_trace.data[cpu]->disabled);
2933 if (max_tr.data[cpu])
2934 atomic_dec(&max_tr.data[cpu]->disabled);
2935 }
2936
2937 tracing_start();
2938 mutex_unlock(&trace_types_lock);
2939
2940 return ret;
2941}
2942
2821 2943
2822/** 2944/**
2823 * tracing_update_buffers - used by tracing facility to expand ring buffers 2945 * tracing_update_buffers - used by tracing facility to expand ring buffers
@@ -2835,7 +2957,7 @@ int tracing_update_buffers(void)
2835 2957
2836 mutex_lock(&trace_types_lock); 2958 mutex_lock(&trace_types_lock);
2837 if (!ring_buffer_expanded) 2959 if (!ring_buffer_expanded)
2838 ret = tracing_resize_ring_buffer(trace_buf_size); 2960 ret = __tracing_resize_ring_buffer(trace_buf_size);
2839 mutex_unlock(&trace_types_lock); 2961 mutex_unlock(&trace_types_lock);
2840 2962
2841 return ret; 2963 return ret;
@@ -2859,7 +2981,7 @@ static int tracing_set_tracer(const char *buf)
2859 mutex_lock(&trace_types_lock); 2981 mutex_lock(&trace_types_lock);
2860 2982
2861 if (!ring_buffer_expanded) { 2983 if (!ring_buffer_expanded) {
2862 ret = tracing_resize_ring_buffer(trace_buf_size); 2984 ret = __tracing_resize_ring_buffer(trace_buf_size);
2863 if (ret < 0) 2985 if (ret < 0)
2864 goto out; 2986 goto out;
2865 ret = 0; 2987 ret = 0;
@@ -2965,20 +3087,11 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
2965 size_t cnt, loff_t *ppos) 3087 size_t cnt, loff_t *ppos)
2966{ 3088{
2967 unsigned long *ptr = filp->private_data; 3089 unsigned long *ptr = filp->private_data;
2968 char buf[64];
2969 unsigned long val; 3090 unsigned long val;
2970 int ret; 3091 int ret;
2971 3092
2972 if (cnt >= sizeof(buf)) 3093 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2973 return -EINVAL; 3094 if (ret)
2974
2975 if (copy_from_user(&buf, ubuf, cnt))
2976 return -EFAULT;
2977
2978 buf[cnt] = 0;
2979
2980 ret = strict_strtoul(buf, 10, &val);
2981 if (ret < 0)
2982 return ret; 3095 return ret;
2983 3096
2984 *ptr = val * 1000; 3097 *ptr = val * 1000;
@@ -3230,6 +3343,14 @@ waitagain:
3230 3343
3231 if (iter->seq.len >= cnt) 3344 if (iter->seq.len >= cnt)
3232 break; 3345 break;
3346
3347 /*
3348 * Setting the full flag means we reached the trace_seq buffer
3349 * size and we should have left via the partial-output condition above.
3350 * One of the trace_seq_* functions is not used properly.
3351 */
3352 WARN_ONCE(iter->seq.full, "full flag set for trace type %d",
3353 iter->ent->type);
3233 } 3354 }
3234 trace_access_unlock(iter->cpu_file); 3355 trace_access_unlock(iter->cpu_file);
3235 trace_event_read_unlock(); 3356 trace_event_read_unlock();
@@ -3425,67 +3546,54 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3425 size_t cnt, loff_t *ppos) 3546 size_t cnt, loff_t *ppos)
3426{ 3547{
3427 unsigned long val; 3548 unsigned long val;
3428 char buf[64]; 3549 int ret;
3429 int ret, cpu;
3430
3431 if (cnt >= sizeof(buf))
3432 return -EINVAL;
3433
3434 if (copy_from_user(&buf, ubuf, cnt))
3435 return -EFAULT;
3436
3437 buf[cnt] = 0;
3438 3550
3439 ret = strict_strtoul(buf, 10, &val); 3551 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
3440 if (ret < 0) 3552 if (ret)
3441 return ret; 3553 return ret;
3442 3554
3443 /* must have at least 1 entry */ 3555 /* must have at least 1 entry */
3444 if (!val) 3556 if (!val)
3445 return -EINVAL; 3557 return -EINVAL;
3446 3558
3447 mutex_lock(&trace_types_lock);
3448
3449 tracing_stop();
3450
3451 /* disable all cpu buffers */
3452 for_each_tracing_cpu(cpu) {
3453 if (global_trace.data[cpu])
3454 atomic_inc(&global_trace.data[cpu]->disabled);
3455 if (max_tr.data[cpu])
3456 atomic_inc(&max_tr.data[cpu]->disabled);
3457 }
3458
3459 /* value is in KB */ 3559 /* value is in KB */
3460 val <<= 10; 3560 val <<= 10;
3461 3561
3462 if (val != global_trace.entries) { 3562 ret = tracing_resize_ring_buffer(val);
3463 ret = tracing_resize_ring_buffer(val); 3563 if (ret < 0)
3464 if (ret < 0) { 3564 return ret;
3465 cnt = ret;
3466 goto out;
3467 }
3468 }
3469 3565
3470 *ppos += cnt; 3566 *ppos += cnt;
3471 3567
3472 /* If check pages failed, return ENOMEM */ 3568 return cnt;
3473 if (tracing_disabled) 3569}
3474 cnt = -ENOMEM;
3475 out:
3476 for_each_tracing_cpu(cpu) {
3477 if (global_trace.data[cpu])
3478 atomic_dec(&global_trace.data[cpu]->disabled);
3479 if (max_tr.data[cpu])
3480 atomic_dec(&max_tr.data[cpu]->disabled);
3481 }
3482 3570
3483 tracing_start(); 3571static ssize_t
3484 mutex_unlock(&trace_types_lock); 3572tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
3573 size_t cnt, loff_t *ppos)
3574{
3575 /*
3576 * There is no need to read what the user has written; this function
3577 * exists just so that writing here with "echo" does not report an error
3578 */
3579
3580 *ppos += cnt;
3485 3581
3486 return cnt; 3582 return cnt;
3487} 3583}
3488 3584
3585static int
3586tracing_free_buffer_release(struct inode *inode, struct file *filp)
3587{
3588 /* disable tracing ? */
3589 if (trace_flags & TRACE_ITER_STOP_ON_FREE)
3590 tracing_off();
3591 /* resize the ring buffer to 0 */
3592 tracing_resize_ring_buffer(0);
3593
3594 return 0;
3595}
3596
3489static int mark_printk(const char *fmt, ...) 3597static int mark_printk(const char *fmt, ...)
3490{ 3598{
3491 int ret; 3599 int ret;
@@ -3631,6 +3739,11 @@ static const struct file_operations tracing_entries_fops = {
3631 .llseek = generic_file_llseek, 3739 .llseek = generic_file_llseek,
3632}; 3740};
3633 3741
3742static const struct file_operations tracing_free_buffer_fops = {
3743 .write = tracing_free_buffer_write,
3744 .release = tracing_free_buffer_release,
3745};
3746
3634static const struct file_operations tracing_mark_fops = { 3747static const struct file_operations tracing_mark_fops = {
3635 .open = tracing_open_generic, 3748 .open = tracing_open_generic,
3636 .write = tracing_mark_write, 3749 .write = tracing_mark_write,
@@ -3687,7 +3800,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3687 return 0; 3800 return 0;
3688 3801
3689 if (!info->spare) 3802 if (!info->spare)
3690 info->spare = ring_buffer_alloc_read_page(info->tr->buffer); 3803 info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu);
3691 if (!info->spare) 3804 if (!info->spare)
3692 return -ENOMEM; 3805 return -ENOMEM;
3693 3806
@@ -3844,7 +3957,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3844 3957
3845 ref->ref = 1; 3958 ref->ref = 1;
3846 ref->buffer = info->tr->buffer; 3959 ref->buffer = info->tr->buffer;
3847 ref->page = ring_buffer_alloc_read_page(ref->buffer); 3960 ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu);
3848 if (!ref->page) { 3961 if (!ref->page) {
3849 kfree(ref); 3962 kfree(ref);
3850 break; 3963 break;
@@ -3853,8 +3966,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3853 r = ring_buffer_read_page(ref->buffer, &ref->page, 3966 r = ring_buffer_read_page(ref->buffer, &ref->page,
3854 len, info->cpu, 1); 3967 len, info->cpu, 1);
3855 if (r < 0) { 3968 if (r < 0) {
3856 ring_buffer_free_read_page(ref->buffer, 3969 ring_buffer_free_read_page(ref->buffer, ref->page);
3857 ref->page);
3858 kfree(ref); 3970 kfree(ref);
3859 break; 3971 break;
3860 } 3972 }
@@ -4090,19 +4202,10 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
4090{ 4202{
4091 struct trace_option_dentry *topt = filp->private_data; 4203 struct trace_option_dentry *topt = filp->private_data;
4092 unsigned long val; 4204 unsigned long val;
4093 char buf[64];
4094 int ret; 4205 int ret;
4095 4206
4096 if (cnt >= sizeof(buf)) 4207 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4097 return -EINVAL; 4208 if (ret)
4098
4099 if (copy_from_user(&buf, ubuf, cnt))
4100 return -EFAULT;
4101
4102 buf[cnt] = 0;
4103
4104 ret = strict_strtoul(buf, 10, &val);
4105 if (ret < 0)
4106 return ret; 4209 return ret;
4107 4210
4108 if (val != 0 && val != 1) 4211 if (val != 0 && val != 1)
@@ -4150,20 +4253,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
4150 loff_t *ppos) 4253 loff_t *ppos)
4151{ 4254{
4152 long index = (long)filp->private_data; 4255 long index = (long)filp->private_data;
4153 char buf[64];
4154 unsigned long val; 4256 unsigned long val;
4155 int ret; 4257 int ret;
4156 4258
4157 if (cnt >= sizeof(buf)) 4259 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4158 return -EINVAL; 4260 if (ret)
4159
4160 if (copy_from_user(&buf, ubuf, cnt))
4161 return -EFAULT;
4162
4163 buf[cnt] = 0;
4164
4165 ret = strict_strtoul(buf, 10, &val);
4166 if (ret < 0)
4167 return ret; 4261 return ret;
4168 4262
4169 if (val != 0 && val != 1) 4263 if (val != 0 && val != 1)
@@ -4356,6 +4450,9 @@ static __init int tracer_init_debugfs(void)
4356 trace_create_file("buffer_size_kb", 0644, d_tracer, 4450 trace_create_file("buffer_size_kb", 0644, d_tracer,
4357 &global_trace, &tracing_entries_fops); 4451 &global_trace, &tracing_entries_fops);
4358 4452
4453 trace_create_file("free_buffer", 0644, d_tracer,
4454 &global_trace, &tracing_free_buffer_fops);
4455
4359 trace_create_file("trace_marker", 0220, d_tracer, 4456 trace_create_file("trace_marker", 0220, d_tracer,
4360 NULL, &tracing_mark_fops); 4457 NULL, &tracing_mark_fops);
4361 4458
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5e9dfc6286dd..616846bcfee5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -2,7 +2,7 @@
2#define _LINUX_KERNEL_TRACE_H 2#define _LINUX_KERNEL_TRACE_H
3 3
4#include <linux/fs.h> 4#include <linux/fs.h>
5#include <asm/atomic.h> 5#include <linux/atomic.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/clocksource.h> 7#include <linux/clocksource.h>
8#include <linux/ring_buffer.h> 8#include <linux/ring_buffer.h>
@@ -278,6 +278,29 @@ struct tracer {
278}; 278};
279 279
280 280
281/* Only current can touch trace_recursion */
282#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
283#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
284
285/* Ring buffer has the 10 LSB bits to count */
286#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
287
288/* for function tracing recursion */
289#define TRACE_INTERNAL_BIT (1<<11)
290#define TRACE_GLOBAL_BIT (1<<12)
291/*
292 * Abuse of the trace_recursion.
293 * We need a way to maintain state when tracing the function graph
294 * from irq context: we may want to trace a particular function that
295 * was called in irq even though irq tracing is off. Since this field
296 * can only be modified by current, we can reuse trace_recursion.
297 */
298#define TRACE_IRQ_BIT (1<<13)
299
300#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
301#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
302#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
303
281#define TRACE_PIPE_ALL_CPU -1 304#define TRACE_PIPE_ALL_CPU -1
282 305
283int tracer_init(struct tracer *t, struct trace_array *tr); 306int tracer_init(struct tracer *t, struct trace_array *tr);
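These per-task recursion bits are what ftrace_ops_list_func() in the ftrace.c hunk earlier uses to avoid re-entering itself. A minimal sketch of the same guard around a hypothetical callback (example_callback is not a real kernel function):

static void example_callback(unsigned long ip, unsigned long parent_ip)
{
	/* already inside this callback on this task: refuse to recurse */
	if (trace_recursion_test(TRACE_INTERNAL_BIT))
		return;

	trace_recursion_set(TRACE_INTERNAL_BIT);

	/*
	 * ... real work here; anything that ends up back in this callback
	 * sees the bit set and returns immediately ...
	 */

	trace_recursion_clear(TRACE_INTERNAL_BIT);
}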
@@ -389,6 +412,9 @@ void update_max_tr_single(struct trace_array *tr,
389void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, 412void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
390 int skip, int pc); 413 int skip, int pc);
391 414
415void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
416 int skip, int pc, struct pt_regs *regs);
417
392void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, 418void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
393 int pc); 419 int pc);
394 420
@@ -400,6 +426,12 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer,
400{ 426{
401} 427}
402 428
429static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer,
430 unsigned long flags, int skip,
431 int pc, struct pt_regs *regs)
432{
433}
434
403static inline void ftrace_trace_userstack(struct ring_buffer *buffer, 435static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
404 unsigned long flags, int pc) 436 unsigned long flags, int pc)
405{ 437{
@@ -419,6 +451,8 @@ extern void trace_find_cmdline(int pid, char comm[]);
419extern unsigned long ftrace_update_tot_cnt; 451extern unsigned long ftrace_update_tot_cnt;
420#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func 452#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
421extern int DYN_FTRACE_TEST_NAME(void); 453extern int DYN_FTRACE_TEST_NAME(void);
454#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
455extern int DYN_FTRACE_TEST_NAME2(void);
422#endif 456#endif
423 457
424extern int ring_buffer_expanded; 458extern int ring_buffer_expanded;
@@ -505,8 +539,18 @@ static inline int ftrace_graph_addr(unsigned long addr)
505 return 1; 539 return 1;
506 540
507 for (i = 0; i < ftrace_graph_count; i++) { 541 for (i = 0; i < ftrace_graph_count; i++) {
508 if (addr == ftrace_graph_funcs[i]) 542 if (addr == ftrace_graph_funcs[i]) {
543 /*
544 * If no irqs are to be traced, but a set_graph_function
545 * is set, and called by an interrupt handler, we still
546 * want to trace it.
547 */
548 if (in_irq())
549 trace_recursion_set(TRACE_IRQ_BIT);
550 else
551 trace_recursion_clear(TRACE_IRQ_BIT);
509 return 1; 552 return 1;
553 }
510 } 554 }
511 555
512 return 0; 556 return 0;
@@ -607,6 +651,7 @@ enum trace_iterator_flags {
607 TRACE_ITER_GRAPH_TIME = 0x80000, 651 TRACE_ITER_GRAPH_TIME = 0x80000,
608 TRACE_ITER_RECORD_CMD = 0x100000, 652 TRACE_ITER_RECORD_CMD = 0x100000,
609 TRACE_ITER_OVERWRITE = 0x200000, 653 TRACE_ITER_OVERWRITE = 0x200000,
654 TRACE_ITER_STOP_ON_FREE = 0x400000,
610}; 655};
611 656
612/* 657/*
@@ -675,6 +720,7 @@ struct event_subsystem {
675 struct dentry *entry; 720 struct dentry *entry;
676 struct event_filter *filter; 721 struct event_filter *filter;
677 int nr_events; 722 int nr_events;
723 int ref_count;
678}; 724};
679 725
680#define FILTER_PRED_INVALID ((unsigned short)-1) 726#define FILTER_PRED_INVALID ((unsigned short)-1)
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e32744c84d94..93365907f219 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -161,7 +161,8 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
161 TRACE_STACK, 161 TRACE_STACK,
162 162
163 F_STRUCT( 163 F_STRUCT(
164 __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) 164 __field( int, size )
165 __dynamic_array(unsigned long, caller )
165 ), 166 ),
166 167
167 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" 168 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
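With the stack entry now carrying an explicit size field and a __dynamic_array of callers, readers can no longer assume FTRACE_STACK_ENTRIES slots. A sketch of the expected walk, assuming iter is the surrounding struct trace_iterator (this mirrors the trace_output.c hunk further down in this patch):

	struct stack_entry *field;
	unsigned long *p, *end;

	field = (struct stack_entry *)iter->ent;
	end   = (unsigned long *)((long)iter->ent + iter->ent_size);

	/* stop at the recorded end of the entry or at the ULONG_MAX terminator */
	for (p = field->caller; p < end && *p != ULONG_MAX; p++)
		printk(KERN_INFO " => %pS\n", (void *)*p);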
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 2fe110341359..581876f9f387 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -244,6 +244,35 @@ static void ftrace_clear_events(void)
244 mutex_unlock(&event_mutex); 244 mutex_unlock(&event_mutex);
245} 245}
246 246
247static void __put_system(struct event_subsystem *system)
248{
249 struct event_filter *filter = system->filter;
250
251 WARN_ON_ONCE(system->ref_count == 0);
252 if (--system->ref_count)
253 return;
254
255 if (filter) {
256 kfree(filter->filter_string);
257 kfree(filter);
258 }
259 kfree(system->name);
260 kfree(system);
261}
262
263static void __get_system(struct event_subsystem *system)
264{
265 WARN_ON_ONCE(system->ref_count == 0);
266 system->ref_count++;
267}
268
269static void put_system(struct event_subsystem *system)
270{
271 mutex_lock(&event_mutex);
272 __put_system(system);
273 mutex_unlock(&event_mutex);
274}
275
247/* 276/*
248 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 277 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
249 */ 278 */
@@ -486,20 +515,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
486 loff_t *ppos) 515 loff_t *ppos)
487{ 516{
488 struct ftrace_event_call *call = filp->private_data; 517 struct ftrace_event_call *call = filp->private_data;
489 char buf[64];
490 unsigned long val; 518 unsigned long val;
491 int ret; 519 int ret;
492 520
493 if (cnt >= sizeof(buf)) 521 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
494 return -EINVAL; 522 if (ret)
495
496 if (copy_from_user(&buf, ubuf, cnt))
497 return -EFAULT;
498
499 buf[cnt] = 0;
500
501 ret = strict_strtoul(buf, 10, &val);
502 if (ret < 0)
503 return ret; 523 return ret;
504 524
505 ret = tracing_update_buffers(); 525 ret = tracing_update_buffers();
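For context, a sketch of the pattern kstrtoul_from_user() replaces here: a single call copies the user buffer, NUL-terminates it and parses the number, so the local buf/copy_from_user/strict_strtoul dance goes away. example_write and the 0/1 range check are illustrative, not part of the patch:

#include <linux/fs.h>
#include <linux/kernel.h>	/* kstrtoul_from_user() */
#include <linux/uaccess.h>

static ssize_t example_write(struct file *filp, const char __user *ubuf,
			     size_t cnt, loff_t *ppos)
{
	unsigned long val;
	int ret;

	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);	/* copy + parse */
	if (ret)
		return ret;

	if (val > 1)		/* this file only accepts 0 or 1 */
		return -EINVAL;

	*ppos += cnt;
	return cnt;
}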
@@ -528,7 +548,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
528 loff_t *ppos) 548 loff_t *ppos)
529{ 549{
530 const char set_to_char[4] = { '?', '0', '1', 'X' }; 550 const char set_to_char[4] = { '?', '0', '1', 'X' };
531 const char *system = filp->private_data; 551 struct event_subsystem *system = filp->private_data;
532 struct ftrace_event_call *call; 552 struct ftrace_event_call *call;
533 char buf[2]; 553 char buf[2];
534 int set = 0; 554 int set = 0;
@@ -539,7 +559,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
539 if (!call->name || !call->class || !call->class->reg) 559 if (!call->name || !call->class || !call->class->reg)
540 continue; 560 continue;
541 561
542 if (system && strcmp(call->class->system, system) != 0) 562 if (system && strcmp(call->class->system, system->name) != 0)
543 continue; 563 continue;
544 564
545 /* 565 /*
@@ -569,21 +589,13 @@ static ssize_t
569system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, 589system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
570 loff_t *ppos) 590 loff_t *ppos)
571{ 591{
572 const char *system = filp->private_data; 592 struct event_subsystem *system = filp->private_data;
593 const char *name = NULL;
573 unsigned long val; 594 unsigned long val;
574 char buf[64];
575 ssize_t ret; 595 ssize_t ret;
576 596
577 if (cnt >= sizeof(buf)) 597 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
578 return -EINVAL; 598 if (ret)
579
580 if (copy_from_user(&buf, ubuf, cnt))
581 return -EFAULT;
582
583 buf[cnt] = 0;
584
585 ret = strict_strtoul(buf, 10, &val);
586 if (ret < 0)
587 return ret; 599 return ret;
588 600
589 ret = tracing_update_buffers(); 601 ret = tracing_update_buffers();
@@ -593,7 +605,14 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
593 if (val != 0 && val != 1) 605 if (val != 0 && val != 1)
594 return -EINVAL; 606 return -EINVAL;
595 607
596 ret = __ftrace_set_clr_event(NULL, system, NULL, val); 608 /*
609 * Opening of "enable" adds a ref count to system,
610 * so the name is safe to use.
611 */
612 if (system)
613 name = system->name;
614
615 ret = __ftrace_set_clr_event(NULL, name, NULL, val);
597 if (ret) 616 if (ret)
598 goto out; 617 goto out;
599 618
@@ -826,6 +845,52 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
826 return cnt; 845 return cnt;
827} 846}
828 847
848static LIST_HEAD(event_subsystems);
849
850static int subsystem_open(struct inode *inode, struct file *filp)
851{
852 struct event_subsystem *system = NULL;
853 int ret;
854
855 if (!inode->i_private)
856 goto skip_search;
857
858 /* Make sure the system still exists */
859 mutex_lock(&event_mutex);
860 list_for_each_entry(system, &event_subsystems, list) {
861 if (system == inode->i_private) {
862 /* Don't open systems with no events */
863 if (!system->nr_events) {
864 system = NULL;
865 break;
866 }
867 __get_system(system);
868 break;
869 }
870 }
871 mutex_unlock(&event_mutex);
872
873 if (system != inode->i_private)
874 return -ENODEV;
875
876 skip_search:
877 ret = tracing_open_generic(inode, filp);
878 if (ret < 0 && system)
879 put_system(system);
880
881 return ret;
882}
883
884static int subsystem_release(struct inode *inode, struct file *file)
885{
886 struct event_subsystem *system = inode->i_private;
887
888 if (system)
889 put_system(system);
890
891 return 0;
892}
893
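The intent of the new reference counting, as an informal lifecycle sketch (illustrative, not code from the patch): the subsystem directory itself holds one reference, and every open of its "enable" or "filter" file takes another, so the event_subsystem cannot be freed while a file handle is still live even if the directory goes away in the meantime.

static void example_subsystem_lifetime(struct event_subsystem *system)
{
	__get_system(system);	/* what subsystem_open() does per open file */

	/*
	 * ... the directory may be removed here; remove_subsystem_dir()
	 * only drops its own reference via __put_system(), so "system"
	 * stays valid for this reader/writer ...
	 */

	put_system(system);	/* what subsystem_release() does on close */
}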
829static ssize_t 894static ssize_t
830subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, 895subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
831 loff_t *ppos) 896 loff_t *ppos)
@@ -963,17 +1028,19 @@ static const struct file_operations ftrace_event_filter_fops = {
963}; 1028};
964 1029
965static const struct file_operations ftrace_subsystem_filter_fops = { 1030static const struct file_operations ftrace_subsystem_filter_fops = {
966 .open = tracing_open_generic, 1031 .open = subsystem_open,
967 .read = subsystem_filter_read, 1032 .read = subsystem_filter_read,
968 .write = subsystem_filter_write, 1033 .write = subsystem_filter_write,
969 .llseek = default_llseek, 1034 .llseek = default_llseek,
1035 .release = subsystem_release,
970}; 1036};
971 1037
972static const struct file_operations ftrace_system_enable_fops = { 1038static const struct file_operations ftrace_system_enable_fops = {
973 .open = tracing_open_generic, 1039 .open = subsystem_open,
974 .read = system_enable_read, 1040 .read = system_enable_read,
975 .write = system_enable_write, 1041 .write = system_enable_write,
976 .llseek = default_llseek, 1042 .llseek = default_llseek,
1043 .release = subsystem_release,
977}; 1044};
978 1045
979static const struct file_operations ftrace_show_header_fops = { 1046static const struct file_operations ftrace_show_header_fops = {
@@ -1002,8 +1069,6 @@ static struct dentry *event_trace_events_dir(void)
1002 return d_events; 1069 return d_events;
1003} 1070}
1004 1071
1005static LIST_HEAD(event_subsystems);
1006
1007static struct dentry * 1072static struct dentry *
1008event_subsystem_dir(const char *name, struct dentry *d_events) 1073event_subsystem_dir(const char *name, struct dentry *d_events)
1009{ 1074{
@@ -1013,6 +1078,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1013 /* First see if we did not already create this dir */ 1078 /* First see if we did not already create this dir */
1014 list_for_each_entry(system, &event_subsystems, list) { 1079 list_for_each_entry(system, &event_subsystems, list) {
1015 if (strcmp(system->name, name) == 0) { 1080 if (strcmp(system->name, name) == 0) {
1081 __get_system(system);
1016 system->nr_events++; 1082 system->nr_events++;
1017 return system->entry; 1083 return system->entry;
1018 } 1084 }
@@ -1035,6 +1101,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1035 } 1101 }
1036 1102
1037 system->nr_events = 1; 1103 system->nr_events = 1;
1104 system->ref_count = 1;
1038 system->name = kstrdup(name, GFP_KERNEL); 1105 system->name = kstrdup(name, GFP_KERNEL);
1039 if (!system->name) { 1106 if (!system->name) {
1040 debugfs_remove(system->entry); 1107 debugfs_remove(system->entry);
@@ -1062,8 +1129,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1062 "'%s/filter' entry\n", name); 1129 "'%s/filter' entry\n", name);
1063 } 1130 }
1064 1131
1065 trace_create_file("enable", 0644, system->entry, 1132 trace_create_file("enable", 0644, system->entry, system,
1066 (void *)system->name,
1067 &ftrace_system_enable_fops); 1133 &ftrace_system_enable_fops);
1068 1134
1069 return system->entry; 1135 return system->entry;
@@ -1184,16 +1250,9 @@ static void remove_subsystem_dir(const char *name)
1184 list_for_each_entry(system, &event_subsystems, list) { 1250 list_for_each_entry(system, &event_subsystems, list) {
1185 if (strcmp(system->name, name) == 0) { 1251 if (strcmp(system->name, name) == 0) {
1186 if (!--system->nr_events) { 1252 if (!--system->nr_events) {
1187 struct event_filter *filter = system->filter;
1188
1189 debugfs_remove_recursive(system->entry); 1253 debugfs_remove_recursive(system->entry);
1190 list_del(&system->list); 1254 list_del(&system->list);
1191 if (filter) { 1255 __put_system(system);
1192 kfree(filter->filter_string);
1193 kfree(filter);
1194 }
1195 kfree(system->name);
1196 kfree(system);
1197 } 1256 }
1198 break; 1257 break;
1199 } 1258 }
@@ -1657,7 +1716,12 @@ static struct ftrace_ops trace_ops __initdata =
1657 1716
1658static __init void event_trace_self_test_with_function(void) 1717static __init void event_trace_self_test_with_function(void)
1659{ 1718{
1660 register_ftrace_function(&trace_ops); 1719 int ret;
1720 ret = register_ftrace_function(&trace_ops);
1721 if (WARN_ON(ret < 0)) {
1722 pr_info("Failed to enable function tracer for event tests\n");
1723 return;
1724 }
1661 pr_info("Running tests again, along with the function tracer\n"); 1725 pr_info("Running tests again, along with the function tracer\n");
1662 event_trace_self_tests(); 1726 event_trace_self_tests();
1663 unregister_ftrace_function(&trace_ops); 1727 unregister_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8008ddcfbf20..256764ecccd6 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1886,6 +1886,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1886 1886
1887 mutex_lock(&event_mutex); 1887 mutex_lock(&event_mutex);
1888 1888
1889 /* Make sure the system still has events */
1890 if (!system->nr_events) {
1891 err = -ENODEV;
1892 goto out_unlock;
1893 }
1894
1889 if (!strcmp(strstrip(filter_string), "0")) { 1895 if (!strcmp(strstrip(filter_string), "0")) {
1890 filter_free_subsystem_preds(system); 1896 filter_free_subsystem_preds(system);
1891 remove_filter_string(system->filter); 1897 remove_filter_string(system->filter);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 16aee4d44e8f..c7b0c6a7db09 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -149,11 +149,13 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
149static struct ftrace_ops trace_ops __read_mostly = 149static struct ftrace_ops trace_ops __read_mostly =
150{ 150{
151 .func = function_trace_call, 151 .func = function_trace_call,
152 .flags = FTRACE_OPS_FL_GLOBAL,
152}; 153};
153 154
154static struct ftrace_ops trace_stack_ops __read_mostly = 155static struct ftrace_ops trace_stack_ops __read_mostly =
155{ 156{
156 .func = function_stack_trace_call, 157 .func = function_stack_trace_call,
158 .flags = FTRACE_OPS_FL_GLOBAL,
157}; 159};
158 160
159/* Our two options */ 161/* Our two options */
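Setting FTRACE_OPS_FL_GLOBAL here appears to keep these callbacks on the shared global filter (set_ftrace_filter) now that ftrace_ops can carry their own filter hash. A minimal sketch of an ops registration that opts into the same behaviour; my_func_probe and my_ops are hypothetical names:

static void my_func_probe(unsigned long ip, unsigned long parent_ip)
{
	/* runs for every function the global set_ftrace_filter allows */
}

static struct ftrace_ops my_ops __read_mostly = {
	.func	= my_func_probe,
	.flags	= FTRACE_OPS_FL_GLOBAL,
};

/* in init code:  register_ftrace_function(&my_ops);   */
/* in exit code:  unregister_ftrace_function(&my_ops); */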
@@ -322,7 +324,8 @@ ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
322} 324}
323 325
324static int 326static int
325ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) 327ftrace_trace_onoff_callback(struct ftrace_hash *hash,
328 char *glob, char *cmd, char *param, int enable)
326{ 329{
327 struct ftrace_probe_ops *ops; 330 struct ftrace_probe_ops *ops;
328 void *count = (void *)-1; 331 void *count = (void *)-1;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 962cdb24ed81..a7d2a4c653d8 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -74,6 +74,20 @@ static struct tracer_flags tracer_flags = {
74 74
75static struct trace_array *graph_array; 75static struct trace_array *graph_array;
76 76
77/*
78 * The DURATION column is also used to display IRQ signs;
79 * the following values are used by print_graph_irq and others
80 * to fill in space in the DURATION column.
81 */
82enum {
83 DURATION_FILL_FULL = -1,
84 DURATION_FILL_START = -2,
85 DURATION_FILL_END = -3,
86};
87
88static enum print_line_t
89print_graph_duration(unsigned long long duration, struct trace_seq *s,
90 u32 flags);
77 91
78/* Add a function return address to the trace stack on thread info.*/ 92/* Add a function return address to the trace stack on thread info.*/
79int 93int
@@ -213,7 +227,7 @@ int __trace_graph_entry(struct trace_array *tr,
213 227
214static inline int ftrace_graph_ignore_irqs(void) 228static inline int ftrace_graph_ignore_irqs(void)
215{ 229{
216 if (!ftrace_graph_skip_irqs) 230 if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT))
217 return 0; 231 return 0;
218 232
219 return in_irq(); 233 return in_irq();
@@ -577,32 +591,6 @@ get_return_for_leaf(struct trace_iterator *iter,
577 return next; 591 return next;
578} 592}
579 593
580/* Signal a overhead of time execution to the output */
581static int
582print_graph_overhead(unsigned long long duration, struct trace_seq *s,
583 u32 flags)
584{
585 /* If duration disappear, we don't need anything */
586 if (!(flags & TRACE_GRAPH_PRINT_DURATION))
587 return 1;
588
589 /* Non nested entry or return */
590 if (duration == -1)
591 return trace_seq_printf(s, " ");
592
593 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
594 /* Duration exceeded 100 msecs */
595 if (duration > 100000ULL)
596 return trace_seq_printf(s, "! ");
597
598 /* Duration exceeded 10 msecs */
599 if (duration > 10000ULL)
600 return trace_seq_printf(s, "+ ");
601 }
602
603 return trace_seq_printf(s, " ");
604}
605
606static int print_graph_abs_time(u64 t, struct trace_seq *s) 594static int print_graph_abs_time(u64 t, struct trace_seq *s)
607{ 595{
608 unsigned long usecs_rem; 596 unsigned long usecs_rem;
@@ -625,34 +613,36 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
625 addr >= (unsigned long)__irqentry_text_end) 613 addr >= (unsigned long)__irqentry_text_end)
626 return TRACE_TYPE_UNHANDLED; 614 return TRACE_TYPE_UNHANDLED;
627 615
628 /* Absolute time */ 616 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
629 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 617 /* Absolute time */
630 ret = print_graph_abs_time(iter->ts, s); 618 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
631 if (!ret) 619 ret = print_graph_abs_time(iter->ts, s);
632 return TRACE_TYPE_PARTIAL_LINE; 620 if (!ret)
633 } 621 return TRACE_TYPE_PARTIAL_LINE;
622 }
634 623
635 /* Cpu */ 624 /* Cpu */
636 if (flags & TRACE_GRAPH_PRINT_CPU) { 625 if (flags & TRACE_GRAPH_PRINT_CPU) {
637 ret = print_graph_cpu(s, cpu); 626 ret = print_graph_cpu(s, cpu);
638 if (ret == TRACE_TYPE_PARTIAL_LINE) 627 if (ret == TRACE_TYPE_PARTIAL_LINE)
639 return TRACE_TYPE_PARTIAL_LINE; 628 return TRACE_TYPE_PARTIAL_LINE;
640 } 629 }
641 630
642 /* Proc */ 631 /* Proc */
643 if (flags & TRACE_GRAPH_PRINT_PROC) { 632 if (flags & TRACE_GRAPH_PRINT_PROC) {
644 ret = print_graph_proc(s, pid); 633 ret = print_graph_proc(s, pid);
645 if (ret == TRACE_TYPE_PARTIAL_LINE) 634 if (ret == TRACE_TYPE_PARTIAL_LINE)
646 return TRACE_TYPE_PARTIAL_LINE; 635 return TRACE_TYPE_PARTIAL_LINE;
647 ret = trace_seq_printf(s, " | "); 636 ret = trace_seq_printf(s, " | ");
648 if (!ret) 637 if (!ret)
649 return TRACE_TYPE_PARTIAL_LINE; 638 return TRACE_TYPE_PARTIAL_LINE;
639 }
650 } 640 }
651 641
652 /* No overhead */ 642 /* No overhead */
653 ret = print_graph_overhead(-1, s, flags); 643 ret = print_graph_duration(DURATION_FILL_START, s, flags);
654 if (!ret) 644 if (ret != TRACE_TYPE_HANDLED)
655 return TRACE_TYPE_PARTIAL_LINE; 645 return ret;
656 646
657 if (type == TRACE_GRAPH_ENT) 647 if (type == TRACE_GRAPH_ENT)
658 ret = trace_seq_printf(s, "==========>"); 648 ret = trace_seq_printf(s, "==========>");
@@ -662,9 +652,10 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
662 if (!ret) 652 if (!ret)
663 return TRACE_TYPE_PARTIAL_LINE; 653 return TRACE_TYPE_PARTIAL_LINE;
664 654
665 /* Don't close the duration column if haven't one */ 655 ret = print_graph_duration(DURATION_FILL_END, s, flags);
666 if (flags & TRACE_GRAPH_PRINT_DURATION) 656 if (ret != TRACE_TYPE_HANDLED)
667 trace_seq_printf(s, " |"); 657 return ret;
658
668 ret = trace_seq_printf(s, "\n"); 659 ret = trace_seq_printf(s, "\n");
669 660
670 if (!ret) 661 if (!ret)
@@ -716,9 +707,49 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
716} 707}
717 708
718static enum print_line_t 709static enum print_line_t
719print_graph_duration(unsigned long long duration, struct trace_seq *s) 710print_graph_duration(unsigned long long duration, struct trace_seq *s,
711 u32 flags)
720{ 712{
721 int ret; 713 int ret = -1;
714
715 if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
716 !(trace_flags & TRACE_ITER_CONTEXT_INFO))
717 return TRACE_TYPE_HANDLED;
718
719 /* No real data, just filling the column with spaces */
720 switch (duration) {
721 case DURATION_FILL_FULL:
722 ret = trace_seq_printf(s, " | ");
723 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
724 case DURATION_FILL_START:
725 ret = trace_seq_printf(s, " ");
726 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
727 case DURATION_FILL_END:
728 ret = trace_seq_printf(s, " |");
729 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
730 }
731
732 /* Signal an execution time overhead to the output */
733 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
734 /* Duration exceeded 100 usecs */
735 if (duration > 100000ULL)
736 ret = trace_seq_printf(s, "! ");
737 /* Duration exceeded 10 usecs */
738 else if (duration > 10000ULL)
739 ret = trace_seq_printf(s, "+ ");
740 }
741
742 /*
743 * The -1 means we either did not exceed the duration thresholds
744 * or we don't want to print out the overhead. Either way we need
745 * to fill out the space.
746 */
747 if (ret == -1)
748 ret = trace_seq_printf(s, " ");
749
750 /* Catch any failure that happened above */
751 if (!ret)
752 return TRACE_TYPE_PARTIAL_LINE;
722 753
723 ret = trace_print_graph_duration(duration, s); 754 ret = trace_print_graph_duration(duration, s);
724 if (ret != TRACE_TYPE_HANDLED) 755 if (ret != TRACE_TYPE_HANDLED)
@@ -767,18 +798,11 @@ print_graph_entry_leaf(struct trace_iterator *iter,
767 cpu_data->enter_funcs[call->depth] = 0; 798 cpu_data->enter_funcs[call->depth] = 0;
768 } 799 }
769 800
770 /* Overhead */ 801 /* Overhead and duration */
771 ret = print_graph_overhead(duration, s, flags); 802 ret = print_graph_duration(duration, s, flags);
772 if (!ret) 803 if (ret == TRACE_TYPE_PARTIAL_LINE)
773 return TRACE_TYPE_PARTIAL_LINE; 804 return TRACE_TYPE_PARTIAL_LINE;
774 805
775 /* Duration */
776 if (flags & TRACE_GRAPH_PRINT_DURATION) {
777 ret = print_graph_duration(duration, s);
778 if (ret == TRACE_TYPE_PARTIAL_LINE)
779 return TRACE_TYPE_PARTIAL_LINE;
780 }
781
782 /* Function */ 806 /* Function */
783 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 807 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
784 ret = trace_seq_printf(s, " "); 808 ret = trace_seq_printf(s, " ");
@@ -815,17 +839,10 @@ print_graph_entry_nested(struct trace_iterator *iter,
815 cpu_data->enter_funcs[call->depth] = call->func; 839 cpu_data->enter_funcs[call->depth] = call->func;
816 } 840 }
817 841
818 /* No overhead */
819 ret = print_graph_overhead(-1, s, flags);
820 if (!ret)
821 return TRACE_TYPE_PARTIAL_LINE;
822
823 /* No time */ 842 /* No time */
824 if (flags & TRACE_GRAPH_PRINT_DURATION) { 843 ret = print_graph_duration(DURATION_FILL_FULL, s, flags);
825 ret = trace_seq_printf(s, " | "); 844 if (ret != TRACE_TYPE_HANDLED)
826 if (!ret) 845 return ret;
827 return TRACE_TYPE_PARTIAL_LINE;
828 }
829 846
830 /* Function */ 847 /* Function */
831 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 848 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
@@ -865,6 +882,9 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
865 return TRACE_TYPE_PARTIAL_LINE; 882 return TRACE_TYPE_PARTIAL_LINE;
866 } 883 }
867 884
885 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
886 return 0;
887
868 /* Absolute time */ 888 /* Absolute time */
869 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 889 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
870 ret = print_graph_abs_time(iter->ts, s); 890 ret = print_graph_abs_time(iter->ts, s);
@@ -1078,18 +1098,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1078 if (print_graph_prologue(iter, s, 0, 0, flags)) 1098 if (print_graph_prologue(iter, s, 0, 0, flags))
1079 return TRACE_TYPE_PARTIAL_LINE; 1099 return TRACE_TYPE_PARTIAL_LINE;
1080 1100
1081 /* Overhead */ 1101 /* Overhead and duration */
1082 ret = print_graph_overhead(duration, s, flags); 1102 ret = print_graph_duration(duration, s, flags);
1083 if (!ret) 1103 if (ret == TRACE_TYPE_PARTIAL_LINE)
1084 return TRACE_TYPE_PARTIAL_LINE; 1104 return TRACE_TYPE_PARTIAL_LINE;
1085 1105
1086 /* Duration */
1087 if (flags & TRACE_GRAPH_PRINT_DURATION) {
1088 ret = print_graph_duration(duration, s);
1089 if (ret == TRACE_TYPE_PARTIAL_LINE)
1090 return TRACE_TYPE_PARTIAL_LINE;
1091 }
1092
1093 /* Closing brace */ 1106 /* Closing brace */
1094 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { 1107 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
1095 ret = trace_seq_printf(s, " "); 1108 ret = trace_seq_printf(s, " ");
@@ -1146,17 +1159,10 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1146 if (print_graph_prologue(iter, s, 0, 0, flags)) 1159 if (print_graph_prologue(iter, s, 0, 0, flags))
1147 return TRACE_TYPE_PARTIAL_LINE; 1160 return TRACE_TYPE_PARTIAL_LINE;
1148 1161
1149 /* No overhead */
1150 ret = print_graph_overhead(-1, s, flags);
1151 if (!ret)
1152 return TRACE_TYPE_PARTIAL_LINE;
1153
1154 /* No time */ 1162 /* No time */
1155 if (flags & TRACE_GRAPH_PRINT_DURATION) { 1163 ret = print_graph_duration(DURATION_FILL_FULL, s, flags);
1156 ret = trace_seq_printf(s, " | "); 1164 if (ret != TRACE_TYPE_HANDLED)
1157 if (!ret) 1165 return ret;
1158 return TRACE_TYPE_PARTIAL_LINE;
1159 }
1160 1166
1161 /* Indentation */ 1167 /* Indentation */
1162 if (depth > 0) 1168 if (depth > 0)
@@ -1207,7 +1213,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1207 1213
1208 1214
1209enum print_line_t 1215enum print_line_t
1210__print_graph_function_flags(struct trace_iterator *iter, u32 flags) 1216print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1211{ 1217{
1212 struct ftrace_graph_ent_entry *field; 1218 struct ftrace_graph_ent_entry *field;
1213 struct fgraph_data *data = iter->private; 1219 struct fgraph_data *data = iter->private;
@@ -1270,18 +1276,7 @@ __print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1270static enum print_line_t 1276static enum print_line_t
1271print_graph_function(struct trace_iterator *iter) 1277print_graph_function(struct trace_iterator *iter)
1272{ 1278{
1273 return __print_graph_function_flags(iter, tracer_flags.val); 1279 return print_graph_function_flags(iter, tracer_flags.val);
1274}
1275
1276enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
1277 u32 flags)
1278{
1279 if (trace_flags & TRACE_ITER_LATENCY_FMT)
1280 flags |= TRACE_GRAPH_PRINT_DURATION;
1281 else
1282 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1283
1284 return __print_graph_function_flags(iter, flags);
1285} 1280}
1286 1281
1287static enum print_line_t 1282static enum print_line_t
@@ -1309,8 +1304,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
1309 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); 1304 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces);
1310 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); 1305 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
1311 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); 1306 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces);
1312 seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces); 1307 seq_printf(s, "#%.*s||| / \n", size, spaces);
1313 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1314} 1308}
1315 1309
1316static void __print_graph_headers_flags(struct seq_file *s, u32 flags) 1310static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
@@ -1329,7 +1323,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1329 if (flags & TRACE_GRAPH_PRINT_PROC) 1323 if (flags & TRACE_GRAPH_PRINT_PROC)
1330 seq_printf(s, " TASK/PID "); 1324 seq_printf(s, " TASK/PID ");
1331 if (lat) 1325 if (lat)
1332 seq_printf(s, "|||||"); 1326 seq_printf(s, "||||");
1333 if (flags & TRACE_GRAPH_PRINT_DURATION) 1327 if (flags & TRACE_GRAPH_PRINT_DURATION)
1334 seq_printf(s, " DURATION "); 1328 seq_printf(s, " DURATION ");
1335 seq_printf(s, " FUNCTION CALLS\n"); 1329 seq_printf(s, " FUNCTION CALLS\n");
@@ -1343,7 +1337,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1343 if (flags & TRACE_GRAPH_PRINT_PROC) 1337 if (flags & TRACE_GRAPH_PRINT_PROC)
1344 seq_printf(s, " | | "); 1338 seq_printf(s, " | | ");
1345 if (lat) 1339 if (lat)
1346 seq_printf(s, "|||||"); 1340 seq_printf(s, "||||");
1347 if (flags & TRACE_GRAPH_PRINT_DURATION) 1341 if (flags & TRACE_GRAPH_PRINT_DURATION)
1348 seq_printf(s, " | | "); 1342 seq_printf(s, " | | ");
1349 seq_printf(s, " | | | |\n"); 1343 seq_printf(s, " | | | |\n");
@@ -1358,15 +1352,16 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags)
1358{ 1352{
1359 struct trace_iterator *iter = s->private; 1353 struct trace_iterator *iter = s->private;
1360 1354
1355 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
1356 return;
1357
1361 if (trace_flags & TRACE_ITER_LATENCY_FMT) { 1358 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
1362 /* print nothing if the buffers are empty */ 1359 /* print nothing if the buffers are empty */
1363 if (trace_empty(iter)) 1360 if (trace_empty(iter))
1364 return; 1361 return;
1365 1362
1366 print_trace_header(s, iter); 1363 print_trace_header(s, iter);
1367 flags |= TRACE_GRAPH_PRINT_DURATION; 1364 }
1368 } else
1369 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1370 1365
1371 __print_graph_headers_flags(s, flags); 1366 __print_graph_headers_flags(s, flags);
1372} 1367}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index a4969b47afc1..667aa8cc0cfc 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -153,6 +153,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
153static struct ftrace_ops trace_ops __read_mostly = 153static struct ftrace_ops trace_ops __read_mostly =
154{ 154{
155 .func = irqsoff_tracer_call, 155 .func = irqsoff_tracer_call,
156 .flags = FTRACE_OPS_FL_GLOBAL,
156}; 157};
157#endif /* CONFIG_FUNCTION_TRACER */ 158#endif /* CONFIG_FUNCTION_TRACER */
158 159
@@ -225,7 +226,9 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
225} 226}
226 227
227#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ 228#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
228 TRACE_GRAPH_PRINT_PROC) 229 TRACE_GRAPH_PRINT_PROC | \
230 TRACE_GRAPH_PRINT_ABS_TIME | \
231 TRACE_GRAPH_PRINT_DURATION)
229 232
230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) 233static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
231{ 234{
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 35d55a386145..5fb3697bf0e5 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -53,7 +53,6 @@ const char *reserved_field_names[] = {
53 "common_preempt_count", 53 "common_preempt_count",
54 "common_pid", 54 "common_pid",
55 "common_tgid", 55 "common_tgid",
56 "common_lock_depth",
57 FIELD_STRING_IP, 56 FIELD_STRING_IP,
58 FIELD_STRING_RETIP, 57 FIELD_STRING_RETIP,
59 FIELD_STRING_FUNC, 58 FIELD_STRING_FUNC,
@@ -344,6 +343,14 @@ DEFINE_BASIC_FETCH_FUNCS(deref)
344DEFINE_FETCH_deref(string) 343DEFINE_FETCH_deref(string)
345DEFINE_FETCH_deref(string_size) 344DEFINE_FETCH_deref(string_size)
346 345
346static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
347{
348 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
349 update_deref_fetch_param(data->orig.data);
350 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
351 update_symbol_cache(data->orig.data);
352}
353
347static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) 354static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
348{ 355{
349 if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) 356 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
@@ -378,6 +385,19 @@ DEFINE_BASIC_FETCH_FUNCS(bitfield)
378#define fetch_bitfield_string_size NULL 385#define fetch_bitfield_string_size NULL
379 386
380static __kprobes void 387static __kprobes void
388update_bitfield_fetch_param(struct bitfield_fetch_param *data)
389{
390 /*
391 * Don't check the bitfield itself, because this must be the
392 * last fetch function.
393 */
394 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
395 update_deref_fetch_param(data->orig.data);
396 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
397 update_symbol_cache(data->orig.data);
398}
399
400static __kprobes void
381free_bitfield_fetch_param(struct bitfield_fetch_param *data) 401free_bitfield_fetch_param(struct bitfield_fetch_param *data)
382{ 402{
383 /* 403 /*
@@ -390,6 +410,7 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data)
390 free_symbol_cache(data->orig.data); 410 free_symbol_cache(data->orig.data);
391 kfree(data); 411 kfree(data);
392} 412}
413
393/* Default (unsigned long) fetch type */ 414/* Default (unsigned long) fetch type */
394#define __DEFAULT_FETCH_TYPE(t) u##t 415#define __DEFAULT_FETCH_TYPE(t) u##t
395#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) 416#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
@@ -537,6 +558,7 @@ struct probe_arg {
537/* Flags for trace_probe */ 558/* Flags for trace_probe */
538#define TP_FLAG_TRACE 1 559#define TP_FLAG_TRACE 1
539#define TP_FLAG_PROFILE 2 560#define TP_FLAG_PROFILE 2
561#define TP_FLAG_REGISTERED 4
540 562
541struct trace_probe { 563struct trace_probe {
542 struct list_head list; 564 struct list_head list;
@@ -556,16 +578,49 @@ struct trace_probe {
556 (sizeof(struct probe_arg) * (n))) 578 (sizeof(struct probe_arg) * (n)))
557 579
558 580
559static __kprobes int probe_is_return(struct trace_probe *tp) 581static __kprobes int trace_probe_is_return(struct trace_probe *tp)
560{ 582{
561 return tp->rp.handler != NULL; 583 return tp->rp.handler != NULL;
562} 584}
563 585
564static __kprobes const char *probe_symbol(struct trace_probe *tp) 586static __kprobes const char *trace_probe_symbol(struct trace_probe *tp)
565{ 587{
566 return tp->symbol ? tp->symbol : "unknown"; 588 return tp->symbol ? tp->symbol : "unknown";
567} 589}
568 590
591static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp)
592{
593 return tp->rp.kp.offset;
594}
595
596static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp)
597{
598 return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE));
599}
600
601static __kprobes bool trace_probe_is_registered(struct trace_probe *tp)
602{
603 return !!(tp->flags & TP_FLAG_REGISTERED);
604}
605
606static __kprobes bool trace_probe_has_gone(struct trace_probe *tp)
607{
608 return !!(kprobe_gone(&tp->rp.kp));
609}
610
611static __kprobes bool trace_probe_within_module(struct trace_probe *tp,
612 struct module *mod)
613{
614 int len = strlen(mod->name);
615 const char *name = trace_probe_symbol(tp);
616 return strncmp(mod->name, name, len) == 0 && name[len] == ':';
617}
618
619static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp)
620{
621 return !!strchr(trace_probe_symbol(tp), ':');
622}
623
569static int register_probe_event(struct trace_probe *tp); 624static int register_probe_event(struct trace_probe *tp);
570static void unregister_probe_event(struct trace_probe *tp); 625static void unregister_probe_event(struct trace_probe *tp);
571 626
@@ -647,6 +702,16 @@ error:
647 return ERR_PTR(ret); 702 return ERR_PTR(ret);
648} 703}
649 704
705static void update_probe_arg(struct probe_arg *arg)
706{
707 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
708 update_bitfield_fetch_param(arg->fetch.data);
709 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
710 update_deref_fetch_param(arg->fetch.data);
711 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
712 update_symbol_cache(arg->fetch.data);
713}
714
650static void free_probe_arg(struct probe_arg *arg) 715static void free_probe_arg(struct probe_arg *arg)
651{ 716{
652 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) 717 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
@@ -672,7 +737,7 @@ static void free_trace_probe(struct trace_probe *tp)
672 kfree(tp); 737 kfree(tp);
673} 738}
674 739
675static struct trace_probe *find_probe_event(const char *event, 740static struct trace_probe *find_trace_probe(const char *event,
676 const char *group) 741 const char *group)
677{ 742{
678 struct trace_probe *tp; 743 struct trace_probe *tp;
@@ -684,13 +749,96 @@ static struct trace_probe *find_probe_event(const char *event,
684 return NULL; 749 return NULL;
685} 750}
686 751
752/* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */
753static int enable_trace_probe(struct trace_probe *tp, int flag)
754{
755 int ret = 0;
756
757 tp->flags |= flag;
758 if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) &&
759 !trace_probe_has_gone(tp)) {
760 if (trace_probe_is_return(tp))
761 ret = enable_kretprobe(&tp->rp);
762 else
763 ret = enable_kprobe(&tp->rp.kp);
764 }
765
766 return ret;
767}
768
769/* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */
770static void disable_trace_probe(struct trace_probe *tp, int flag)
771{
772 tp->flags &= ~flag;
773 if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) {
774 if (trace_probe_is_return(tp))
775 disable_kretprobe(&tp->rp);
776 else
777 disable_kprobe(&tp->rp.kp);
778 }
779}
780
781/* Internal register function - just handle k*probes and flags */
782static int __register_trace_probe(struct trace_probe *tp)
783{
784 int i, ret;
785
786 if (trace_probe_is_registered(tp))
787 return -EINVAL;
788
789 for (i = 0; i < tp->nr_args; i++)
790 update_probe_arg(&tp->args[i]);
791
792 /* Set/clear disabled flag according to tp->flag */
793 if (trace_probe_is_enabled(tp))
794 tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED;
795 else
796 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
797
798 if (trace_probe_is_return(tp))
799 ret = register_kretprobe(&tp->rp);
800 else
801 ret = register_kprobe(&tp->rp.kp);
802
803 if (ret == 0)
804 tp->flags |= TP_FLAG_REGISTERED;
805 else {
806 pr_warning("Could not insert probe at %s+%lu: %d\n",
807 trace_probe_symbol(tp), trace_probe_offset(tp), ret);
808 if (ret == -ENOENT && trace_probe_is_on_module(tp)) {
809 pr_warning("This probe might be able to register after "
810 "target module is loaded. Continue.\n");
811 ret = 0;
812 } else if (ret == -EILSEQ) {
813 pr_warning("Probing address(0x%p) is not an "
814 "instruction boundary.\n",
815 tp->rp.kp.addr);
816 ret = -EINVAL;
817 }
818 }
819
820 return ret;
821}
822
823/* Internal unregister function - just handle k*probes and flags */
824static void __unregister_trace_probe(struct trace_probe *tp)
825{
826 if (trace_probe_is_registered(tp)) {
827 if (trace_probe_is_return(tp))
828 unregister_kretprobe(&tp->rp);
829 else
830 unregister_kprobe(&tp->rp.kp);
831 tp->flags &= ~TP_FLAG_REGISTERED;
832 /* Cleanup kprobe for reuse */
833 if (tp->rp.kp.symbol_name)
834 tp->rp.kp.addr = NULL;
835 }
836}
837
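How the two flag bits are meant to combine with the enable/disable helpers above (illustrative call sequence, not code from the patch): the underlying k*probe stays armed while either the ftrace user or the perf user is active, and is only disarmed once both are gone.

	enable_trace_probe(tp, TP_FLAG_TRACE);    /* ftrace "enable" written: arms the kprobe   */
	enable_trace_probe(tp, TP_FLAG_PROFILE);  /* perf event attached: already armed, no-op  */
	disable_trace_probe(tp, TP_FLAG_TRACE);   /* ftrace "enable" cleared: still armed       */
	disable_trace_probe(tp, TP_FLAG_PROFILE); /* perf event detached: kprobe disarmed now   */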
687/* Unregister a trace_probe and probe_event: call with locking probe_lock */ 838/* Unregister a trace_probe and probe_event: call with locking probe_lock */
688static void unregister_trace_probe(struct trace_probe *tp) 839static void unregister_trace_probe(struct trace_probe *tp)
689{ 840{
690 if (probe_is_return(tp)) 841 __unregister_trace_probe(tp);
691 unregister_kretprobe(&tp->rp);
692 else
693 unregister_kprobe(&tp->rp.kp);
694 list_del(&tp->list); 842 list_del(&tp->list);
695 unregister_probe_event(tp); 843 unregister_probe_event(tp);
696} 844}
@@ -703,41 +851,65 @@ static int register_trace_probe(struct trace_probe *tp)
703 851
704 mutex_lock(&probe_lock); 852 mutex_lock(&probe_lock);
705 853
706 /* register as an event */ 854 /* Delete old (same name) event if exist */
707 old_tp = find_probe_event(tp->call.name, tp->call.class->system); 855 old_tp = find_trace_probe(tp->call.name, tp->call.class->system);
708 if (old_tp) { 856 if (old_tp) {
709 /* delete old event */
710 unregister_trace_probe(old_tp); 857 unregister_trace_probe(old_tp);
711 free_trace_probe(old_tp); 858 free_trace_probe(old_tp);
712 } 859 }
860
861 /* Register new event */
713 ret = register_probe_event(tp); 862 ret = register_probe_event(tp);
714 if (ret) { 863 if (ret) {
715 pr_warning("Failed to register probe event(%d)\n", ret); 864 pr_warning("Failed to register probe event(%d)\n", ret);
716 goto end; 865 goto end;
717 } 866 }
718 867
719 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; 868 /* Register k*probe */
720 if (probe_is_return(tp)) 869 ret = __register_trace_probe(tp);
721 ret = register_kretprobe(&tp->rp); 870 if (ret < 0)
722 else
723 ret = register_kprobe(&tp->rp.kp);
724
725 if (ret) {
726 pr_warning("Could not insert probe(%d)\n", ret);
727 if (ret == -EILSEQ) {
728 pr_warning("Probing address(0x%p) is not an "
729 "instruction boundary.\n",
730 tp->rp.kp.addr);
731 ret = -EINVAL;
732 }
733 unregister_probe_event(tp); 871 unregister_probe_event(tp);
734 } else 872 else
735 list_add_tail(&tp->list, &probe_list); 873 list_add_tail(&tp->list, &probe_list);
874
736end: 875end:
737 mutex_unlock(&probe_lock); 876 mutex_unlock(&probe_lock);
738 return ret; 877 return ret;
739} 878}
740 879
880/* Module notifier call back, checking event on the module */
881static int trace_probe_module_callback(struct notifier_block *nb,
882 unsigned long val, void *data)
883{
884 struct module *mod = data;
885 struct trace_probe *tp;
886 int ret;
887
888 if (val != MODULE_STATE_COMING)
889 return NOTIFY_DONE;
890
891 /* Update probes on coming module */
892 mutex_lock(&probe_lock);
893 list_for_each_entry(tp, &probe_list, list) {
894 if (trace_probe_within_module(tp, mod)) {
895 __unregister_trace_probe(tp);
896 ret = __register_trace_probe(tp);
897 if (ret)
898 pr_warning("Failed to re-register probe %s on "
899 "%s: %d\n",
900 tp->call.name, mod->name, ret);
901 }
902 }
903 mutex_unlock(&probe_lock);
904
905 return NOTIFY_DONE;
906}
907
908static struct notifier_block trace_probe_module_nb = {
909 .notifier_call = trace_probe_module_callback,
910 .priority = 1 /* Invoked after kprobe module callback */
911};
912
741/* Split symbol and offset. */ 913/* Split symbol and offset. */
742static int split_symbol_offset(char *symbol, unsigned long *offset) 914static int split_symbol_offset(char *symbol, unsigned long *offset)
743{ 915{
@@ -963,8 +1135,8 @@ static int create_trace_probe(int argc, char **argv)
963{ 1135{
964 /* 1136 /*
965 * Argument syntax: 1137 * Argument syntax:
966 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] 1138 * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS]
967 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] 1139 * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS]
968 * Fetch args: 1140 * Fetch args:
969 * $retval : fetch return value 1141 * $retval : fetch return value
970 * $stack : fetch stack address 1142 * $stack : fetch stack address
@@ -1026,7 +1198,7 @@ static int create_trace_probe(int argc, char **argv)
1026 return -EINVAL; 1198 return -EINVAL;
1027 } 1199 }
1028 mutex_lock(&probe_lock); 1200 mutex_lock(&probe_lock);
1029 tp = find_probe_event(event, group); 1201 tp = find_trace_probe(event, group);
1030 if (!tp) { 1202 if (!tp) {
1031 mutex_unlock(&probe_lock); 1203 mutex_unlock(&probe_lock);
1032 pr_info("Event %s/%s doesn't exist.\n", group, event); 1204 pr_info("Event %s/%s doesn't exist.\n", group, event);
@@ -1145,7 +1317,7 @@ error:
1145 return ret; 1317 return ret;
1146} 1318}
1147 1319
1148static void cleanup_all_probes(void) 1320static void release_all_trace_probes(void)
1149{ 1321{
1150 struct trace_probe *tp; 1322 struct trace_probe *tp;
1151 1323
@@ -1159,7 +1331,6 @@ static void cleanup_all_probes(void)
1159 mutex_unlock(&probe_lock); 1331 mutex_unlock(&probe_lock);
1160} 1332}
1161 1333
1162
1163/* Probes listing interfaces */ 1334/* Probes listing interfaces */
1164static void *probes_seq_start(struct seq_file *m, loff_t *pos) 1335static void *probes_seq_start(struct seq_file *m, loff_t *pos)
1165{ 1336{
@@ -1182,15 +1353,16 @@ static int probes_seq_show(struct seq_file *m, void *v)
1182 struct trace_probe *tp = v; 1353 struct trace_probe *tp = v;
1183 int i; 1354 int i;
1184 1355
1185 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); 1356 seq_printf(m, "%c", trace_probe_is_return(tp) ? 'r' : 'p');
1186 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); 1357 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
1187 1358
1188 if (!tp->symbol) 1359 if (!tp->symbol)
1189 seq_printf(m, " 0x%p", tp->rp.kp.addr); 1360 seq_printf(m, " 0x%p", tp->rp.kp.addr);
1190 else if (tp->rp.kp.offset) 1361 else if (tp->rp.kp.offset)
1191 seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); 1362 seq_printf(m, " %s+%u", trace_probe_symbol(tp),
1363 tp->rp.kp.offset);
1192 else 1364 else
1193 seq_printf(m, " %s", probe_symbol(tp)); 1365 seq_printf(m, " %s", trace_probe_symbol(tp));
1194 1366
1195 for (i = 0; i < tp->nr_args; i++) 1367 for (i = 0; i < tp->nr_args; i++)
1196 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); 1368 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm);
@@ -1210,7 +1382,7 @@ static int probes_open(struct inode *inode, struct file *file)
1210{ 1382{
1211 if ((file->f_mode & FMODE_WRITE) && 1383 if ((file->f_mode & FMODE_WRITE) &&
1212 (file->f_flags & O_TRUNC)) 1384 (file->f_flags & O_TRUNC))
1213 cleanup_all_probes(); 1385 release_all_trace_probes();
1214 1386
1215 return seq_open(file, &probes_seq_op); 1387 return seq_open(file, &probes_seq_op);
1216} 1388}
@@ -1398,7 +1570,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1398 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1570 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1399 1571
1400 if (!filter_current_check_discard(buffer, call, entry, event)) 1572 if (!filter_current_check_discard(buffer, call, entry, event))
1401 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1573 trace_nowake_buffer_unlock_commit_regs(buffer, event,
1574 irq_flags, pc, regs);
1402} 1575}
1403 1576
1404/* Kretprobe handler */ 1577/* Kretprobe handler */
@@ -1430,7 +1603,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1430 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1603 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1431 1604
1432 if (!filter_current_check_discard(buffer, call, entry, event)) 1605 if (!filter_current_check_discard(buffer, call, entry, event))
1433 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1606 trace_nowake_buffer_unlock_commit_regs(buffer, event,
1607 irq_flags, pc, regs);
1434} 1608}
1435 1609
1436/* Event entry printers */ 1610/* Event entry printers */
@@ -1512,30 +1686,6 @@ partial:
1512 return TRACE_TYPE_PARTIAL_LINE; 1686 return TRACE_TYPE_PARTIAL_LINE;
1513} 1687}
1514 1688
1515static int probe_event_enable(struct ftrace_event_call *call)
1516{
1517 struct trace_probe *tp = (struct trace_probe *)call->data;
1518
1519 tp->flags |= TP_FLAG_TRACE;
1520 if (probe_is_return(tp))
1521 return enable_kretprobe(&tp->rp);
1522 else
1523 return enable_kprobe(&tp->rp.kp);
1524}
1525
1526static void probe_event_disable(struct ftrace_event_call *call)
1527{
1528 struct trace_probe *tp = (struct trace_probe *)call->data;
1529
1530 tp->flags &= ~TP_FLAG_TRACE;
1531 if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
1532 if (probe_is_return(tp))
1533 disable_kretprobe(&tp->rp);
1534 else
1535 disable_kprobe(&tp->rp.kp);
1536 }
1537}
1538
1539#undef DEFINE_FIELD 1689#undef DEFINE_FIELD
1540#define DEFINE_FIELD(type, item, name, is_signed) \ 1690#define DEFINE_FIELD(type, item, name, is_signed) \
1541 do { \ 1691 do { \
@@ -1597,7 +1747,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1597 1747
1598 const char *fmt, *arg; 1748 const char *fmt, *arg;
1599 1749
1600 if (!probe_is_return(tp)) { 1750 if (!trace_probe_is_return(tp)) {
1601 fmt = "(%lx)"; 1751 fmt = "(%lx)";
1602 arg = "REC->" FIELD_STRING_IP; 1752 arg = "REC->" FIELD_STRING_IP;
1603 } else { 1753 } else {
@@ -1714,49 +1864,25 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1714 head = this_cpu_ptr(call->perf_events); 1864 head = this_cpu_ptr(call->perf_events);
1715 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); 1865 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
1716} 1866}
1717
1718static int probe_perf_enable(struct ftrace_event_call *call)
1719{
1720 struct trace_probe *tp = (struct trace_probe *)call->data;
1721
1722 tp->flags |= TP_FLAG_PROFILE;
1723
1724 if (probe_is_return(tp))
1725 return enable_kretprobe(&tp->rp);
1726 else
1727 return enable_kprobe(&tp->rp.kp);
1728}
1729
1730static void probe_perf_disable(struct ftrace_event_call *call)
1731{
1732 struct trace_probe *tp = (struct trace_probe *)call->data;
1733
1734 tp->flags &= ~TP_FLAG_PROFILE;
1735
1736 if (!(tp->flags & TP_FLAG_TRACE)) {
1737 if (probe_is_return(tp))
1738 disable_kretprobe(&tp->rp);
1739 else
1740 disable_kprobe(&tp->rp.kp);
1741 }
1742}
1743#endif /* CONFIG_PERF_EVENTS */ 1867#endif /* CONFIG_PERF_EVENTS */
1744 1868
1745static __kprobes 1869static __kprobes
1746int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) 1870int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1747{ 1871{
1872 struct trace_probe *tp = (struct trace_probe *)event->data;
1873
1748 switch (type) { 1874 switch (type) {
1749 case TRACE_REG_REGISTER: 1875 case TRACE_REG_REGISTER:
1750 return probe_event_enable(event); 1876 return enable_trace_probe(tp, TP_FLAG_TRACE);
1751 case TRACE_REG_UNREGISTER: 1877 case TRACE_REG_UNREGISTER:
1752 probe_event_disable(event); 1878 disable_trace_probe(tp, TP_FLAG_TRACE);
1753 return 0; 1879 return 0;
1754 1880
1755#ifdef CONFIG_PERF_EVENTS 1881#ifdef CONFIG_PERF_EVENTS
1756 case TRACE_REG_PERF_REGISTER: 1882 case TRACE_REG_PERF_REGISTER:
1757 return probe_perf_enable(event); 1883 return enable_trace_probe(tp, TP_FLAG_PROFILE);
1758 case TRACE_REG_PERF_UNREGISTER: 1884 case TRACE_REG_PERF_UNREGISTER:
1759 probe_perf_disable(event); 1885 disable_trace_probe(tp, TP_FLAG_PROFILE);
1760 return 0; 1886 return 0;
1761#endif 1887#endif
1762 } 1888 }
@@ -1806,7 +1932,7 @@ static int register_probe_event(struct trace_probe *tp)
1806 1932
1807 /* Initialize ftrace_event_call */ 1933 /* Initialize ftrace_event_call */
1808 INIT_LIST_HEAD(&call->class->fields); 1934 INIT_LIST_HEAD(&call->class->fields);
1809 if (probe_is_return(tp)) { 1935 if (trace_probe_is_return(tp)) {
1810 call->event.funcs = &kretprobe_funcs; 1936 call->event.funcs = &kretprobe_funcs;
1811 call->class->define_fields = kretprobe_event_define_fields; 1937 call->class->define_fields = kretprobe_event_define_fields;
1812 } else { 1938 } else {
@@ -1845,6 +1971,9 @@ static __init int init_kprobe_trace(void)
1845 struct dentry *d_tracer; 1971 struct dentry *d_tracer;
1846 struct dentry *entry; 1972 struct dentry *entry;
1847 1973
1974 if (register_module_notifier(&trace_probe_module_nb))
1975 return -EINVAL;
1976
1848 d_tracer = tracing_init_dentry(); 1977 d_tracer = tracing_init_dentry();
1849 if (!d_tracer) 1978 if (!d_tracer)
1850 return 0; 1979 return 0;
@@ -1871,8 +2000,12 @@ fs_initcall(init_kprobe_trace);
1871 2000
1872#ifdef CONFIG_FTRACE_STARTUP_TEST 2001#ifdef CONFIG_FTRACE_STARTUP_TEST
1873 2002
1874static int kprobe_trace_selftest_target(int a1, int a2, int a3, 2003/*
1875 int a4, int a5, int a6) 2004 * The "__used" keeps gcc from removing the function symbol
2005 * from the kallsyms table.
2006 */
2007static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
2008 int a4, int a5, int a6)
1876{ 2009{
1877 return a1 + a2 + a3 + a4 + a5 + a6; 2010 return a1 + a2 + a3 + a4 + a5 + a6;
1878} 2011}
@@ -1894,12 +2027,12 @@ static __init int kprobe_trace_self_tests_init(void)
1894 warn++; 2027 warn++;
1895 } else { 2028 } else {
1896 /* Enable trace point */ 2029 /* Enable trace point */
1897 tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM); 2030 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
1898 if (WARN_ON_ONCE(tp == NULL)) { 2031 if (WARN_ON_ONCE(tp == NULL)) {
1899 pr_warning("error on getting new probe.\n"); 2032 pr_warning("error on getting new probe.\n");
1900 warn++; 2033 warn++;
1901 } else 2034 } else
1902 probe_event_enable(&tp->call); 2035 enable_trace_probe(tp, TP_FLAG_TRACE);
1903 } 2036 }
1904 2037
1905 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " 2038 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
@@ -1909,12 +2042,12 @@ static __init int kprobe_trace_self_tests_init(void)
1909 warn++; 2042 warn++;
1910 } else { 2043 } else {
1911 /* Enable trace point */ 2044 /* Enable trace point */
1912 tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM); 2045 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
1913 if (WARN_ON_ONCE(tp == NULL)) { 2046 if (WARN_ON_ONCE(tp == NULL)) {
1914 pr_warning("error on getting new probe.\n"); 2047 pr_warning("error on getting new probe.\n");
1915 warn++; 2048 warn++;
1916 } else 2049 } else
1917 probe_event_enable(&tp->call); 2050 enable_trace_probe(tp, TP_FLAG_TRACE);
1918 } 2051 }
1919 2052
1920 if (warn) 2053 if (warn)
@@ -1935,7 +2068,7 @@ static __init int kprobe_trace_self_tests_init(void)
1935 } 2068 }
1936 2069
1937end: 2070end:
1938 cleanup_all_probes(); 2071 release_all_trace_probes();
1939 if (warn) 2072 if (warn)
1940 pr_cont("NG: Some tests are failed. Please check them.\n"); 2073 pr_cont("NG: Some tests are failed. Please check them.\n");
1941 else 2074 else
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 017fa376505d..fd3c8aae55e5 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -12,7 +12,7 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/time.h> 13#include <linux/time.h>
14 14
15#include <asm/atomic.h> 15#include <linux/atomic.h>
16 16
17#include "trace.h" 17#include "trace.h"
18#include "trace_output.h" 18#include "trace_output.h"
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 456be9063c2d..51999309a6cf 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -353,6 +353,33 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
353} 353}
354EXPORT_SYMBOL(ftrace_print_symbols_seq); 354EXPORT_SYMBOL(ftrace_print_symbols_seq);
355 355
356#if BITS_PER_LONG == 32
357const char *
358ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
359 const struct trace_print_flags_u64 *symbol_array)
360{
361 int i;
362 const char *ret = p->buffer + p->len;
363
364 for (i = 0; symbol_array[i].name; i++) {
365
366 if (val != symbol_array[i].mask)
367 continue;
368
369 trace_seq_puts(p, symbol_array[i].name);
370 break;
371 }
372
373 if (!p->len)
374 trace_seq_printf(p, "0x%llx", val);
375
376 trace_seq_putc(p, 0);
377
378 return ret;
379}
380EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
381#endif
382
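A sketch of how a 32-bit build would use the new helper to print a symbolic name for a 64-bit value; the my_flags table is made up for illustration, and a 64-bit kernel can keep using ftrace_print_symbols_seq() directly since unsigned long is already 64 bits there:

static const struct trace_print_flags_u64 my_flags[] = {
	{ .mask = 0x0000000100000000ULL,	.name = "HIGH_BIT_SET" },
	{ .mask = 0x0000000200000000ULL,	.name = "OTHER_FLAG"   },
	{ .mask = -1ULL,			.name = NULL }	/* terminator */
};

/* inside an event's output callback: */
/*	ftrace_print_symbols_seq_u64(p, field->val64, my_flags); */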
356const char * 383const char *
357ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) 384ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
358{ 385{
@@ -830,6 +857,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
830enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, 857enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
831 struct trace_event *event) 858 struct trace_event *event)
832{ 859{
860 if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type))
861 return TRACE_TYPE_PARTIAL_LINE;
862
833 return TRACE_TYPE_HANDLED; 863 return TRACE_TYPE_HANDLED;
834} 864}
835 865
@@ -1077,19 +1107,20 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1077{ 1107{
1078 struct stack_entry *field; 1108 struct stack_entry *field;
1079 struct trace_seq *s = &iter->seq; 1109 struct trace_seq *s = &iter->seq;
1080 int i; 1110 unsigned long *p;
1111 unsigned long *end;
1081 1112
1082 trace_assign_type(field, iter->ent); 1113 trace_assign_type(field, iter->ent);
1114 end = (unsigned long *)((long)iter->ent + iter->ent_size);
1083 1115
1084 if (!trace_seq_puts(s, "<stack trace>\n")) 1116 if (!trace_seq_puts(s, "<stack trace>\n"))
1085 goto partial; 1117 goto partial;
1086 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 1118
1087 if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) 1119 for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
1088 break;
1089 if (!trace_seq_puts(s, " => ")) 1120 if (!trace_seq_puts(s, " => "))
1090 goto partial; 1121 goto partial;
1091 1122
1092 if (!seq_print_ip_sym(s, field->caller[i], flags)) 1123 if (!seq_print_ip_sym(s, *p, flags))
1093 goto partial; 1124 goto partial;
1094 if (!trace_seq_puts(s, "\n")) 1125 if (!trace_seq_puts(s, "\n"))
1095 goto partial; 1126 goto partial;
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2547d8813cf0..1f06468a10d7 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -32,7 +32,7 @@ static DEFINE_MUTEX(btrace_mutex);
32 32
33struct trace_bprintk_fmt { 33struct trace_bprintk_fmt {
34 struct list_head list; 34 struct list_head list;
35 char fmt[0]; 35 const char *fmt;
36}; 36};
37 37
38static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) 38static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
@@ -49,6 +49,7 @@ static
49void hold_module_trace_bprintk_format(const char **start, const char **end) 49void hold_module_trace_bprintk_format(const char **start, const char **end)
50{ 50{
51 const char **iter; 51 const char **iter;
52 char *fmt;
52 53
53 mutex_lock(&btrace_mutex); 54 mutex_lock(&btrace_mutex);
54 for (iter = start; iter < end; iter++) { 55 for (iter = start; iter < end; iter++) {
@@ -58,14 +59,18 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
58 continue; 59 continue;
59 } 60 }
60 61
61 tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) 62 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
62 + strlen(*iter) + 1, GFP_KERNEL); 63 if (tb_fmt)
63 if (tb_fmt) { 64 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
65 if (tb_fmt && fmt) {
64 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); 66 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
65 strcpy(tb_fmt->fmt, *iter); 67 strcpy(fmt, *iter);
68 tb_fmt->fmt = fmt;
66 *iter = tb_fmt->fmt; 69 *iter = tb_fmt->fmt;
67 } else 70 } else {
71 kfree(tb_fmt);
68 *iter = NULL; 72 *iter = NULL;
73 }
69 } 74 }
70 mutex_unlock(&btrace_mutex); 75 mutex_unlock(&btrace_mutex);
71} 76}
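For clarity, here is the node-then-string allocation pattern this hunk switches to, pulled out into a hypothetical helper (a sketch only; alloc_fmt_entry() does not exist in the kernel). Keeping the copied string in its own allocation gives every entry a stable const char *fmt member whose address can later be handed to the printk_formats iterator added below.

/*
 * Sketch only: the two-step allocation used above, factored into a
 * hypothetical helper. Assumes <linux/slab.h>, <linux/string.h> and the
 * trace_bprintk_fmt definition from this file.
 */
static struct trace_bprintk_fmt *alloc_fmt_entry(const char *src)
{
	struct trace_bprintk_fmt *tb_fmt;
	char *fmt;

	tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
	if (!tb_fmt)
		return NULL;

	fmt = kmalloc(strlen(src) + 1, GFP_KERNEL);
	if (!fmt) {
		kfree(tb_fmt);		/* don't leak the node on failure */
		return NULL;
	}

	strcpy(fmt, src);
	tb_fmt->fmt = fmt;
	return tb_fmt;
}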
@@ -84,6 +89,76 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self,
84 return 0; 89 return 0;
85} 90}
86 91
92/*
93 * The debugfs/tracing/printk_formats file maps the addresses to the
94 * ASCII formats that are used in the bprintk events in the ring
95 * buffer. For userspace tools to decode the events from the buffer,
96 * they need to be able to map each address to its format.
97 *
98 * The addresses of the bprintk formats live in their own section,
99 * __trace_printk_fmt. For modules we copy them into a linked list.
100 * The code that prints the formats and their addresses passes around
101 * the address of the fmt string. If the fmt address passed into the
102 * seq functions is within the kernel core __trace_printk_fmt section,
103 * the next format is simply the next entry in that array.
104 *
105 * When the fmt pointer is outside the kernel core __trace_printk_fmt
106 * section, we need to follow the linked-list pointers instead. The
107 * trick is that we pass the address of the string to the seq function
108 * just as we do for the kernel core formats. To get back the structure
109 * that holds the format, we simply use container_of() and then move to
110 * the next format in the list.
111 */
112static const char **
113find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
114{
115 struct trace_bprintk_fmt *mod_fmt;
116
117 if (list_empty(&trace_bprintk_fmt_list))
118 return NULL;
119
120	/*
121	 * v points to the address of the fmt record when we come from
122	 * t_next and is NULL when we come from t_start. If v is NULL,
123	 * or *pos has just crossed into the first module format, walk
124	 * the list from the beginning.
125	 */
126 if (!v || start_index == *pos) {
127 struct trace_bprintk_fmt *p;
128
129 /* search the module list */
130 list_for_each_entry(p, &trace_bprintk_fmt_list, list) {
131 if (start_index == *pos)
132 return &p->fmt;
133 start_index++;
134 }
135 /* pos > index */
136 return NULL;
137 }
138
139	/*
140	 * v points to the address of the fmt field of the list entry
141	 * that holds the module's print format.
142	 */
143 mod_fmt = container_of(v, typeof(*mod_fmt), fmt);
144 if (mod_fmt->list.next == &trace_bprintk_fmt_list)
145 return NULL;
146
147 mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list);
148
149 return &mod_fmt->fmt;
150}
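The container_of() step described in the comment above can be hard to picture, so here is a minimal standalone sketch (illustrative names, not code from the patch) of how the &fmt address handed to the seq_file iterator is turned back into its list node:

/*
 * Sketch: recover the list node from the &fmt pointer the iterator hands out.
 * Assumes <linux/kernel.h> for container_of() and the struct from this file.
 */
static struct trace_bprintk_fmt *fmt_to_entry(const char **v)
{
	/* v is the address of the fmt member, so container_of() yields the node */
	return container_of(v, struct trace_bprintk_fmt, fmt);
}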
151
152static void format_mod_start(void)
153{
154 mutex_lock(&btrace_mutex);
155}
156
157static void format_mod_stop(void)
158{
159 mutex_unlock(&btrace_mutex);
160}
161
87#else /* !CONFIG_MODULES */ 162#else /* !CONFIG_MODULES */
88__init static int 163__init static int
89module_trace_bprintk_format_notify(struct notifier_block *self, 164module_trace_bprintk_format_notify(struct notifier_block *self,
@@ -91,6 +166,13 @@ module_trace_bprintk_format_notify(struct notifier_block *self,
91{ 166{
92 return 0; 167 return 0;
93} 168}
169static inline const char **
170find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
171{
172 return NULL;
173}
174static inline void format_mod_start(void) { }
175static inline void format_mod_stop(void) { }
94#endif /* CONFIG_MODULES */ 176#endif /* CONFIG_MODULES */
95 177
96 178
@@ -153,20 +235,30 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
153} 235}
154EXPORT_SYMBOL_GPL(__ftrace_vprintk); 236EXPORT_SYMBOL_GPL(__ftrace_vprintk);
155 237
238static const char **find_next(void *v, loff_t *pos)
239{
240 const char **fmt = v;
241 int start_index;
242
243 start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt;
244
245 if (*pos < start_index)
246 return __start___trace_bprintk_fmt + *pos;
247
248 return find_next_mod_format(start_index, v, fmt, pos);
249}
250
156static void * 251static void *
157t_start(struct seq_file *m, loff_t *pos) 252t_start(struct seq_file *m, loff_t *pos)
158{ 253{
159 const char **fmt = __start___trace_bprintk_fmt + *pos; 254 format_mod_start();
160 255 return find_next(NULL, pos);
161 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
162 return NULL;
163 return fmt;
164} 256}
165 257
166static void *t_next(struct seq_file *m, void * v, loff_t *pos) 258static void *t_next(struct seq_file *m, void * v, loff_t *pos)
167{ 259{
168 (*pos)++; 260 (*pos)++;
169 return t_start(m, pos); 261 return find_next(v, pos);
170} 262}
171 263
172static int t_show(struct seq_file *m, void *v) 264static int t_show(struct seq_file *m, void *v)
@@ -205,6 +297,7 @@ static int t_show(struct seq_file *m, void *v)
205 297
206static void t_stop(struct seq_file *m, void *p) 298static void t_stop(struct seq_file *m, void *p)
207{ 299{
300 format_mod_stop();
208} 301}
209 302
210static const struct seq_operations show_format_seq_ops = { 303static const struct seq_operations show_format_seq_ops = {
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 7319559ed59f..e4a70c0c71b6 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -129,6 +129,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
129static struct ftrace_ops trace_ops __read_mostly = 129static struct ftrace_ops trace_ops __read_mostly =
130{ 130{
131 .func = wakeup_tracer_call, 131 .func = wakeup_tracer_call,
132 .flags = FTRACE_OPS_FL_GLOBAL,
132}; 133};
133#endif /* CONFIG_FUNCTION_TRACER */ 134#endif /* CONFIG_FUNCTION_TRACER */
134 135
@@ -226,7 +227,9 @@ static void wakeup_trace_close(struct trace_iterator *iter)
226 graph_trace_close(iter); 227 graph_trace_close(iter);
227} 228}
228 229
229#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) 230#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \
231 TRACE_GRAPH_PRINT_ABS_TIME | \
232 TRACE_GRAPH_PRINT_DURATION)
230 233
231static enum print_line_t wakeup_print_line(struct trace_iterator *iter) 234static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
232{ 235{
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 659732eba07c..288541f977fb 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -101,6 +101,206 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
101 101
102#ifdef CONFIG_DYNAMIC_FTRACE 102#ifdef CONFIG_DYNAMIC_FTRACE
103 103
104static int trace_selftest_test_probe1_cnt;
105static void trace_selftest_test_probe1_func(unsigned long ip,
106 unsigned long pip)
107{
108 trace_selftest_test_probe1_cnt++;
109}
110
111static int trace_selftest_test_probe2_cnt;
112static void trace_selftest_test_probe2_func(unsigned long ip,
113 unsigned long pip)
114{
115 trace_selftest_test_probe2_cnt++;
116}
117
118static int trace_selftest_test_probe3_cnt;
119static void trace_selftest_test_probe3_func(unsigned long ip,
120 unsigned long pip)
121{
122 trace_selftest_test_probe3_cnt++;
123}
124
125static int trace_selftest_test_global_cnt;
126static void trace_selftest_test_global_func(unsigned long ip,
127 unsigned long pip)
128{
129 trace_selftest_test_global_cnt++;
130}
131
132static int trace_selftest_test_dyn_cnt;
133static void trace_selftest_test_dyn_func(unsigned long ip,
134 unsigned long pip)
135{
136 trace_selftest_test_dyn_cnt++;
137}
138
139static struct ftrace_ops test_probe1 = {
140 .func = trace_selftest_test_probe1_func,
141};
142
143static struct ftrace_ops test_probe2 = {
144 .func = trace_selftest_test_probe2_func,
145};
146
147static struct ftrace_ops test_probe3 = {
148 .func = trace_selftest_test_probe3_func,
149};
150
151static struct ftrace_ops test_global = {
152 .func = trace_selftest_test_global_func,
153 .flags = FTRACE_OPS_FL_GLOBAL,
154};
155
156static void print_counts(void)
157{
158 printk("(%d %d %d %d %d) ",
159 trace_selftest_test_probe1_cnt,
160 trace_selftest_test_probe2_cnt,
161 trace_selftest_test_probe3_cnt,
162 trace_selftest_test_global_cnt,
163 trace_selftest_test_dyn_cnt);
164}
165
166static void reset_counts(void)
167{
168 trace_selftest_test_probe1_cnt = 0;
169 trace_selftest_test_probe2_cnt = 0;
170 trace_selftest_test_probe3_cnt = 0;
171 trace_selftest_test_global_cnt = 0;
172 trace_selftest_test_dyn_cnt = 0;
173}
174
175static int trace_selftest_ops(int cnt)
176{
177 int save_ftrace_enabled = ftrace_enabled;
178 struct ftrace_ops *dyn_ops;
179 char *func1_name;
180 char *func2_name;
181 int len1;
182 int len2;
183 int ret = -1;
184
185 printk(KERN_CONT "PASSED\n");
186 pr_info("Testing dynamic ftrace ops #%d: ", cnt);
187
188 ftrace_enabled = 1;
189 reset_counts();
190
191 /* Handle PPC64 '.' name */
192 func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
193 func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2);
194 len1 = strlen(func1_name);
195 len2 = strlen(func2_name);
196
197 /*
198 * Probe 1 will trace function 1.
199 * Probe 2 will trace function 2.
200 * Probe 3 will trace functions 1 and 2.
201 */
202 ftrace_set_filter(&test_probe1, func1_name, len1, 1);
203 ftrace_set_filter(&test_probe2, func2_name, len2, 1);
204 ftrace_set_filter(&test_probe3, func1_name, len1, 1);
205 ftrace_set_filter(&test_probe3, func2_name, len2, 0);
206
207 register_ftrace_function(&test_probe1);
208 register_ftrace_function(&test_probe2);
209 register_ftrace_function(&test_probe3);
210 register_ftrace_function(&test_global);
211
212 DYN_FTRACE_TEST_NAME();
213
214 print_counts();
215
216 if (trace_selftest_test_probe1_cnt != 1)
217 goto out;
218 if (trace_selftest_test_probe2_cnt != 0)
219 goto out;
220 if (trace_selftest_test_probe3_cnt != 1)
221 goto out;
222 if (trace_selftest_test_global_cnt == 0)
223 goto out;
224
225 DYN_FTRACE_TEST_NAME2();
226
227 print_counts();
228
229 if (trace_selftest_test_probe1_cnt != 1)
230 goto out;
231 if (trace_selftest_test_probe2_cnt != 1)
232 goto out;
233 if (trace_selftest_test_probe3_cnt != 2)
234 goto out;
235
236 /* Add a dynamic probe */
237 dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL);
238 if (!dyn_ops) {
239 printk("MEMORY ERROR ");
240 goto out;
241 }
242
243 dyn_ops->func = trace_selftest_test_dyn_func;
244
245 register_ftrace_function(dyn_ops);
246
247 trace_selftest_test_global_cnt = 0;
248
249 DYN_FTRACE_TEST_NAME();
250
251 print_counts();
252
253 if (trace_selftest_test_probe1_cnt != 2)
254 goto out_free;
255 if (trace_selftest_test_probe2_cnt != 1)
256 goto out_free;
257 if (trace_selftest_test_probe3_cnt != 3)
258 goto out_free;
259 if (trace_selftest_test_global_cnt == 0)
260 goto out;
261 if (trace_selftest_test_dyn_cnt == 0)
262 goto out_free;
263
264 DYN_FTRACE_TEST_NAME2();
265
266 print_counts();
267
268 if (trace_selftest_test_probe1_cnt != 2)
269 goto out_free;
270 if (trace_selftest_test_probe2_cnt != 2)
271 goto out_free;
272 if (trace_selftest_test_probe3_cnt != 4)
273 goto out_free;
274
275 ret = 0;
276 out_free:
277 unregister_ftrace_function(dyn_ops);
278 kfree(dyn_ops);
279
280 out:
281 /* Purposely unregister in the same order */
282 unregister_ftrace_function(&test_probe1);
283 unregister_ftrace_function(&test_probe2);
284 unregister_ftrace_function(&test_probe3);
285 unregister_ftrace_function(&test_global);
286
287 /* Make sure everything is off */
288 reset_counts();
289 DYN_FTRACE_TEST_NAME();
290 DYN_FTRACE_TEST_NAME();
291
292 if (trace_selftest_test_probe1_cnt ||
293 trace_selftest_test_probe2_cnt ||
294 trace_selftest_test_probe3_cnt ||
295 trace_selftest_test_global_cnt ||
296 trace_selftest_test_dyn_cnt)
297 ret = -1;
298
299 ftrace_enabled = save_ftrace_enabled;
300
301 return ret;
302}
303
104/* Test dynamic code modification and ftrace filters */ 304/* Test dynamic code modification and ftrace filters */
105int trace_selftest_startup_dynamic_tracing(struct tracer *trace, 305int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
106 struct trace_array *tr, 306 struct trace_array *tr,
@@ -131,7 +331,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
131 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); 331 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
132 332
133 /* filter only on our function */ 333 /* filter only on our function */
134 ftrace_set_filter(func_name, strlen(func_name), 1); 334 ftrace_set_global_filter(func_name, strlen(func_name), 1);
135 335
136 /* enable tracing */ 336 /* enable tracing */
137 ret = tracer_init(trace, tr); 337 ret = tracer_init(trace, tr);
@@ -166,22 +366,30 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
166 366
167 /* check the trace buffer */ 367 /* check the trace buffer */
168 ret = trace_test_buffer(tr, &count); 368 ret = trace_test_buffer(tr, &count);
169 trace->reset(tr);
170 tracing_start(); 369 tracing_start();
171 370
172 /* we should only have one item */ 371 /* we should only have one item */
173 if (!ret && count != 1) { 372 if (!ret && count != 1) {
373 trace->reset(tr);
174 printk(KERN_CONT ".. filter failed count=%ld ..", count); 374 printk(KERN_CONT ".. filter failed count=%ld ..", count);
175 ret = -1; 375 ret = -1;
176 goto out; 376 goto out;
177 } 377 }
178 378
379 /* Test the ops with global tracing running */
380 ret = trace_selftest_ops(1);
381 trace->reset(tr);
382
179 out: 383 out:
180 ftrace_enabled = save_ftrace_enabled; 384 ftrace_enabled = save_ftrace_enabled;
181 tracer_enabled = save_tracer_enabled; 385 tracer_enabled = save_tracer_enabled;
182 386
183 /* Enable tracing on all functions again */ 387 /* Enable tracing on all functions again */
184 ftrace_set_filter(NULL, 0, 1); 388 ftrace_set_global_filter(NULL, 0, 1);
389
390 /* Test the ops with global tracing off */
391 if (!ret)
392 ret = trace_selftest_ops(2);
185 393
186 return ret; 394 return ret;
187} 395}
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
index 54dd77cce5bf..b4c475a0a48b 100644
--- a/kernel/trace/trace_selftest_dynamic.c
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -5,3 +5,9 @@ int DYN_FTRACE_TEST_NAME(void)
5 /* used to call mcount */ 5 /* used to call mcount */
6 return 0; 6 return 0;
7} 7}
8
9int DYN_FTRACE_TEST_NAME2(void)
10{
11 /* used to call mcount */
12 return 0;
13}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 4c5dead0c239..77575b386d97 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -133,6 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
133static struct ftrace_ops trace_ops __read_mostly = 133static struct ftrace_ops trace_ops __read_mostly =
134{ 134{
135 .func = stack_trace_call, 135 .func = stack_trace_call,
136 .flags = FTRACE_OPS_FL_GLOBAL,
136}; 137};
137 138
138static ssize_t 139static ssize_t
@@ -155,20 +156,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
155{ 156{
156 long *ptr = filp->private_data; 157 long *ptr = filp->private_data;
157 unsigned long val, flags; 158 unsigned long val, flags;
158 char buf[64];
159 int ret; 159 int ret;
160 int cpu; 160 int cpu;
161 161
162 if (count >= sizeof(buf)) 162 ret = kstrtoul_from_user(ubuf, count, 10, &val);
163 return -EINVAL; 163 if (ret)
164
165 if (copy_from_user(&buf, ubuf, count))
166 return -EFAULT;
167
168 buf[count] = 0;
169
170 ret = strict_strtoul(buf, 10, &val);
171 if (ret < 0)
172 return ret; 164 return ret;
173 165
174 local_irq_save(flags); 166 local_irq_save(flags);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 68187af4889e..b219f1449c54 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -251,9 +251,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
251{ 251{
252 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 252 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
253 253
254 if (elem->regfunc && !elem->state && active) 254 if (elem->regfunc && !jump_label_enabled(&elem->key) && active)
255 elem->regfunc(); 255 elem->regfunc();
256 else if (elem->unregfunc && elem->state && !active) 256 else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active)
257 elem->unregfunc(); 257 elem->unregfunc();
258 258
259 /* 259 /*
@@ -264,13 +264,10 @@ static void set_tracepoint(struct tracepoint_entry **entry,
264 * is used. 264 * is used.
265 */ 265 */
266 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 266 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
267 if (!elem->state && active) { 267 if (active && !jump_label_enabled(&elem->key))
268 jump_label_enable(&elem->state); 268 jump_label_inc(&elem->key);
269 elem->state = active; 269 else if (!active && jump_label_enabled(&elem->key))
270 } else if (elem->state && !active) { 270 jump_label_dec(&elem->key);
271 jump_label_disable(&elem->state);
272 elem->state = active;
273 }
274} 271}
275 272
276/* 273/*
@@ -281,13 +278,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
281 */ 278 */
282static void disable_tracepoint(struct tracepoint *elem) 279static void disable_tracepoint(struct tracepoint *elem)
283{ 280{
284 if (elem->unregfunc && elem->state) 281 if (elem->unregfunc && jump_label_enabled(&elem->key))
285 elem->unregfunc(); 282 elem->unregfunc();
286 283
287 if (elem->state) { 284 if (jump_label_enabled(&elem->key))
288 jump_label_disable(&elem->state); 285 jump_label_dec(&elem->key);
289 elem->state = 0;
290 }
291 rcu_assign_pointer(elem->funcs, NULL); 286 rcu_assign_pointer(elem->funcs, NULL);
292} 287}
293 288
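The tracepoint hunks above drop the open-coded elem->state flag in favour of reference-counted jump labels. A rough sketch of the enable/disable pattern, using the jump_label API as it exists in this kernel series (sketch only, not code from the patch):

/*
 * Sketch: refcounted branch enable/disable with jump_label_inc()/dec().
 * Assumes <linux/jump_label.h> of this series (struct jump_label_key).
 */
static void example_set_active(struct jump_label_key *key, bool active)
{
	if (active && !jump_label_enabled(key))
		jump_label_inc(key);	/* 0 -> 1: patch the static branch in */
	else if (!active && jump_label_enabled(key))
		jump_label_dec(key);	/* 1 -> 0: patch the static branch out */
}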
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 44646179eaba..bff131b9510a 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,6 +15,7 @@
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/user_namespace.h> 17#include <linux/user_namespace.h>
18#include <linux/proc_fs.h>
18 19
19static struct uts_namespace *create_uts_ns(void) 20static struct uts_namespace *create_uts_ns(void)
20{ 21{
@@ -79,3 +80,41 @@ void free_uts_ns(struct kref *kref)
79 put_user_ns(ns->user_ns); 80 put_user_ns(ns->user_ns);
80 kfree(ns); 81 kfree(ns);
81} 82}
83
84static void *utsns_get(struct task_struct *task)
85{
86 struct uts_namespace *ns = NULL;
87 struct nsproxy *nsproxy;
88
89 rcu_read_lock();
90 nsproxy = task_nsproxy(task);
91 if (nsproxy) {
92 ns = nsproxy->uts_ns;
93 get_uts_ns(ns);
94 }
95 rcu_read_unlock();
96
97 return ns;
98}
99
100static void utsns_put(void *ns)
101{
102 put_uts_ns(ns);
103}
104
105static int utsns_install(struct nsproxy *nsproxy, void *ns)
106{
107 get_uts_ns(ns);
108 put_uts_ns(nsproxy->uts_ns);
109 nsproxy->uts_ns = ns;
110 return 0;
111}
112
113const struct proc_ns_operations utsns_operations = {
114 .name = "uts",
115 .type = CLONE_NEWUTS,
116 .get = utsns_get,
117 .put = utsns_put,
118 .install = utsns_install,
119};
120
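The utsns_operations above plug the UTS namespace into the generic /proc/<pid>/ns machinery: .get pins the target task's namespace, .put drops the reference, and .install swaps it into the caller's nsproxy for setns(2). A userspace-side illustration of what this enables (hypothetical helper, assuming a libc that exposes setns()):

/* Sketch: join another task's UTS namespace through /proc/<pid>/ns/uts. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

static int join_uts_ns(pid_t pid)
{
	char path[64];
	int fd, ret;

	snprintf(path, sizeof(path), "/proc/%d/ns/uts", (int)pid);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;

	ret = setns(fd, CLONE_NEWUTS);	/* ends up in utsns_install() above */
	close(fd);
	return ret;
}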
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 14733d4d156b..36491cd5b7d4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -28,7 +28,7 @@
28#include <linux/perf_event.h> 28#include <linux/perf_event.h>
29 29
30int watchdog_enabled = 1; 30int watchdog_enabled = 1;
31int __read_mostly softlockup_thresh = 60; 31int __read_mostly watchdog_thresh = 10;
32 32
33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 34static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -91,6 +91,17 @@ static int __init nosoftlockup_setup(char *str)
91__setup("nosoftlockup", nosoftlockup_setup); 91__setup("nosoftlockup", nosoftlockup_setup);
92/* */ 92/* */
93 93
94/*
95 * Hard-lockup warnings should be triggered after just a few seconds, while
96 * soft lockups can produce false positives under extreme conditions, so we
97 * generally want a higher threshold for soft lockups than for hard lockups.
98 * The two thresholds are therefore coupled by a fixed factor: the soft
99 * threshold is twice the hard threshold.
100 */
101static int get_softlockup_thresh(void)
102{
103 return watchdog_thresh * 2;
104}
94 105
95/* 106/*
96 * Returns seconds, approximately. We don't need nanosecond 107 * Returns seconds, approximately. We don't need nanosecond
@@ -105,12 +116,12 @@ static unsigned long get_timestamp(int this_cpu)
105static unsigned long get_sample_period(void) 116static unsigned long get_sample_period(void)
106{ 117{
107 /* 118 /*
108 * convert softlockup_thresh from seconds to ns 119 * convert watchdog_thresh from seconds to ns
109 * the divide by 5 is to give hrtimer 5 chances to 120 * the divide by 5 is to give hrtimer 5 chances to
110 * increment before the hardlockup detector generates 121 * increment before the hardlockup detector generates
111 * a warning 122 * a warning
112 */ 123 */
113 return softlockup_thresh / 5 * NSEC_PER_SEC; 124 return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
114} 125}
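As a worked example with the new default watchdog_thresh = 10: get_softlockup_thresh() returns 2 * 10 = 20, so a CPU is reported soft-locked after roughly 20 seconds without the watchdog task running, and get_sample_period() yields 20 * (NSEC_PER_SEC / 5) = 4 seconds, i.e. the per-CPU hrtimer fires five times within that window. The hard-lockup perf event is scaled separately from the raw 10-second threshold via hw_nmi_get_sample_period(watchdog_thresh).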
115 126
116/* Commands for resetting the watchdog */ 127/* Commands for resetting the watchdog */
@@ -182,13 +193,14 @@ static int is_softlockup(unsigned long touch_ts)
182 unsigned long now = get_timestamp(smp_processor_id()); 193 unsigned long now = get_timestamp(smp_processor_id());
183 194
184 /* Warn about unreasonable delays: */ 195 /* Warn about unreasonable delays: */
185 if (time_after(now, touch_ts + softlockup_thresh)) 196 if (time_after(now, touch_ts + get_softlockup_thresh()))
186 return now - touch_ts; 197 return now - touch_ts;
187 198
188 return 0; 199 return 0;
189} 200}
190 201
191#ifdef CONFIG_HARDLOCKUP_DETECTOR 202#ifdef CONFIG_HARDLOCKUP_DETECTOR
203
192static struct perf_event_attr wd_hw_attr = { 204static struct perf_event_attr wd_hw_attr = {
193 .type = PERF_TYPE_HARDWARE, 205 .type = PERF_TYPE_HARDWARE,
194 .config = PERF_COUNT_HW_CPU_CYCLES, 206 .config = PERF_COUNT_HW_CPU_CYCLES,
@@ -198,7 +210,7 @@ static struct perf_event_attr wd_hw_attr = {
198}; 210};
199 211
200/* Callback function for perf event subsystem */ 212/* Callback function for perf event subsystem */
201static void watchdog_overflow_callback(struct perf_event *event, int nmi, 213static void watchdog_overflow_callback(struct perf_event *event,
202 struct perf_sample_data *data, 214 struct perf_sample_data *data,
203 struct pt_regs *regs) 215 struct pt_regs *regs)
204{ 216{
@@ -357,10 +369,11 @@ static int watchdog_nmi_enable(int cpu)
357 if (event != NULL) 369 if (event != NULL)
358 goto out_enable; 370 goto out_enable;
359 371
360 /* Try to register using hardware perf events */
361 wd_attr = &wd_hw_attr; 372 wd_attr = &wd_hw_attr;
362 wd_attr->sample_period = hw_nmi_get_sample_period(); 373 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
363 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); 374
375 /* Try to register using hardware perf events */
376 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
364 if (!IS_ERR(event)) { 377 if (!IS_ERR(event)) {
365 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 378 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
366 goto out_save; 379 goto out_save;
@@ -404,15 +417,13 @@ static void watchdog_nmi_disable(int cpu) { return; }
404#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 417#endif /* CONFIG_HARDLOCKUP_DETECTOR */
405 418
406/* prepare/enable/disable routines */ 419/* prepare/enable/disable routines */
407static int watchdog_prepare_cpu(int cpu) 420static void watchdog_prepare_cpu(int cpu)
408{ 421{
409 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); 422 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
410 423
411 WARN_ON(per_cpu(softlockup_watchdog, cpu)); 424 WARN_ON(per_cpu(softlockup_watchdog, cpu));
412 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 425 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
413 hrtimer->function = watchdog_timer_fn; 426 hrtimer->function = watchdog_timer_fn;
414
415 return 0;
416} 427}
417 428
418static int watchdog_enable(int cpu) 429static int watchdog_enable(int cpu)
@@ -501,28 +512,25 @@ static void watchdog_disable_all_cpus(void)
501/* sysctl functions */ 512/* sysctl functions */
502#ifdef CONFIG_SYSCTL 513#ifdef CONFIG_SYSCTL
503/* 514/*
504 * proc handler for /proc/sys/kernel/nmi_watchdog	515 * proc handler for /proc/sys/kernel/nmi_watchdog and watchdog_thresh
505 */ 516 */
506 517
507int proc_dowatchdog_enabled(struct ctl_table *table, int write, 518int proc_dowatchdog(struct ctl_table *table, int write,
508 void __user *buffer, size_t *length, loff_t *ppos) 519 void __user *buffer, size_t *lenp, loff_t *ppos)
509{ 520{
510 proc_dointvec(table, write, buffer, length, ppos); 521 int ret;
511 522
512 if (write) { 523 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
513 if (watchdog_enabled) 524 if (ret || !write)
514 watchdog_enable_all_cpus(); 525 goto out;
515 else
516 watchdog_disable_all_cpus();
517 }
518 return 0;
519}
520 526
521int proc_dowatchdog_thresh(struct ctl_table *table, int write, 527 if (watchdog_enabled && watchdog_thresh)
522 void __user *buffer, 528 watchdog_enable_all_cpus();
523 size_t *lenp, loff_t *ppos) 529 else
524{ 530 watchdog_disable_all_cpus();
525 return proc_dointvec_minmax(table, write, buffer, lenp, ppos); 531
532out:
533 return ret;
526} 534}
527#endif /* CONFIG_SYSCTL */ 535#endif /* CONFIG_SYSCTL */
528 536
@@ -534,17 +542,16 @@ static int __cpuinit
534cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 542cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
535{ 543{
536 int hotcpu = (unsigned long)hcpu; 544 int hotcpu = (unsigned long)hcpu;
537 int err = 0;
538 545
539 switch (action) { 546 switch (action) {
540 case CPU_UP_PREPARE: 547 case CPU_UP_PREPARE:
541 case CPU_UP_PREPARE_FROZEN: 548 case CPU_UP_PREPARE_FROZEN:
542 err = watchdog_prepare_cpu(hotcpu); 549 watchdog_prepare_cpu(hotcpu);
543 break; 550 break;
544 case CPU_ONLINE: 551 case CPU_ONLINE:
545 case CPU_ONLINE_FROZEN: 552 case CPU_ONLINE_FROZEN:
546 if (watchdog_enabled) 553 if (watchdog_enabled)
547 err = watchdog_enable(hotcpu); 554 watchdog_enable(hotcpu);
548 break; 555 break;
549#ifdef CONFIG_HOTPLUG_CPU 556#ifdef CONFIG_HOTPLUG_CPU
550 case CPU_UP_CANCELED: 557 case CPU_UP_CANCELED:
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e3378e8d3a5c..25fb1b0e53fa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -221,7 +221,7 @@ typedef unsigned long mayday_mask_t;
221 * per-CPU workqueues: 221 * per-CPU workqueues:
222 */ 222 */
223struct workqueue_struct { 223struct workqueue_struct {
224 unsigned int flags; /* I: WQ_* flags */ 224 unsigned int flags; /* W: WQ_* flags */
225 union { 225 union {
226 struct cpu_workqueue_struct __percpu *pcpu; 226 struct cpu_workqueue_struct __percpu *pcpu;
227 struct cpu_workqueue_struct *single; 227 struct cpu_workqueue_struct *single;
@@ -240,6 +240,7 @@ struct workqueue_struct {
240 mayday_mask_t mayday_mask; /* cpus requesting rescue */ 240 mayday_mask_t mayday_mask; /* cpus requesting rescue */
241 struct worker *rescuer; /* I: rescue worker */ 241 struct worker *rescuer; /* I: rescue worker */
242 242
243 int nr_drainers; /* W: drain in progress */
243 int saved_max_active; /* W: saved cwq max_active */ 244 int saved_max_active; /* W: saved cwq max_active */
244 const char *name; /* I: workqueue name */ 245 const char *name; /* I: workqueue name */
245#ifdef CONFIG_LOCKDEP 246#ifdef CONFIG_LOCKDEP
@@ -990,7 +991,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
990 debug_work_activate(work); 991 debug_work_activate(work);
991 992
992 /* if dying, only works from the same workqueue are allowed */ 993 /* if dying, only works from the same workqueue are allowed */
993 if (unlikely(wq->flags & WQ_DYING) && 994 if (unlikely(wq->flags & WQ_DRAINING) &&
994 WARN_ON_ONCE(!is_chained_work(wq))) 995 WARN_ON_ONCE(!is_chained_work(wq)))
995 return; 996 return;
996 997
@@ -2381,6 +2382,54 @@ out_unlock:
2381} 2382}
2382EXPORT_SYMBOL_GPL(flush_workqueue); 2383EXPORT_SYMBOL_GPL(flush_workqueue);
2383 2384
2385/**
2386 * drain_workqueue - drain a workqueue
2387 * @wq: workqueue to drain
2388 *
2389 * Wait until the workqueue becomes empty. While draining is in progress,
2390 * only chain queueing is allowed; in other words, only work items that are
2391 * currently pending or running on @wq may queue further work items on it.
2392 * @wq is flushed repeatedly until it becomes empty. The number of flushes
2393 * is determined by the depth of chaining and should be relatively small.
2394 * Whine if it takes too long.
2395 */
2396void drain_workqueue(struct workqueue_struct *wq)
2397{
2398 unsigned int flush_cnt = 0;
2399 unsigned int cpu;
2400
2401 /*
2402	 * __queue_work() needs to test whether there are drainers. It is much
2403	 * hotter than drain_workqueue() and already looks at @wq->flags, so use
2404	 * a WQ_DRAINING flag rather than making it check nr_drainers as well.
2405 */
2406 spin_lock(&workqueue_lock);
2407 if (!wq->nr_drainers++)
2408 wq->flags |= WQ_DRAINING;
2409 spin_unlock(&workqueue_lock);
2410reflush:
2411 flush_workqueue(wq);
2412
2413 for_each_cwq_cpu(cpu, wq) {
2414 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2415
2416 if (!cwq->nr_active && list_empty(&cwq->delayed_works))
2417 continue;
2418
2419 if (++flush_cnt == 10 ||
2420 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2421	pr_warning("workqueue %s: drain isn't complete after %u tries\n",
2422 wq->name, flush_cnt);
2423 goto reflush;
2424 }
2425
2426 spin_lock(&workqueue_lock);
2427 if (!--wq->nr_drainers)
2428 wq->flags &= ~WQ_DRAINING;
2429 spin_unlock(&workqueue_lock);
2430}
2431EXPORT_SYMBOL_GPL(drain_workqueue);
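A minimal usage sketch of the new helper (illustrative names, not from the patch): a subsystem whose work items may requeue themselves can now quiesce its queue without open-coding the repeated-flush loop that destroy_workqueue() used to carry.

/* Sketch: quiescing a workqueue whose works may requeue themselves. */
static struct workqueue_struct *example_wq;

static void example_quiesce(void)
{
	/*
	 * Flushes repeatedly; while WQ_DRAINING is set, only chained
	 * queueing (work queued from work already on example_wq) gets in.
	 */
	drain_workqueue(example_wq);

	/* example_wq is now empty; safe to reconfigure or destroy. */
}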
2432
2384static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, 2433static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2385 bool wait_executing) 2434 bool wait_executing)
2386{ 2435{
@@ -2866,9 +2915,7 @@ static int alloc_cwqs(struct workqueue_struct *wq)
2866 } 2915 }
2867 } 2916 }
2868 2917
2869 /* just in case, make sure it's actually aligned 2918 /* just in case, make sure it's actually aligned */
2870 * - this is affected by PERCPU() alignment in vmlinux.lds.S
2871 */
2872 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); 2919 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
2873 return wq->cpu_wq.v ? 0 : -ENOMEM; 2920 return wq->cpu_wq.v ? 0 : -ENOMEM;
2874} 2921}
@@ -3011,34 +3058,10 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
3011 */ 3058 */
3012void destroy_workqueue(struct workqueue_struct *wq) 3059void destroy_workqueue(struct workqueue_struct *wq)
3013{ 3060{
3014 unsigned int flush_cnt = 0;
3015 unsigned int cpu; 3061 unsigned int cpu;
3016 3062
3017 /* 3063 /* drain it before proceeding with destruction */
3018 * Mark @wq dying and drain all pending works. Once WQ_DYING is 3064 drain_workqueue(wq);
3019 * set, only chain queueing is allowed. IOW, only currently
3020 * pending or running work items on @wq can queue further work
3021 * items on it. @wq is flushed repeatedly until it becomes empty.
3022 * The number of flushing is detemined by the depth of chaining and
3023 * should be relatively short. Whine if it takes too long.
3024 */
3025 wq->flags |= WQ_DYING;
3026reflush:
3027 flush_workqueue(wq);
3028
3029 for_each_cwq_cpu(cpu, wq) {
3030 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3031
3032 if (!cwq->nr_active && list_empty(&cwq->delayed_works))
3033 continue;
3034
3035 if (++flush_cnt == 10 ||
3036 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
3037 printk(KERN_WARNING "workqueue %s: flush on "
3038 "destruction isn't complete after %u tries\n",
3039 wq->name, flush_cnt);
3040 goto reflush;
3041 }
3042 3065
3043 /* 3066 /*
3044 * wq list is used to freeze wq, remove from list after 3067 * wq list is used to freeze wq, remove from list after