98 files changed, 13972 insertions, 2698 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 526128a2e622..382dd5a8b2d7 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ
        default 1000 if HZ_1000
 config SCHED_HRTICK
-        def_bool HIGH_RES_TIMERS && X86
+        def_bool HIGH_RES_TIMERS && USE_GENERIC_SMP_HELPERS
diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938addb9d..54f69837d35a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,8 +2,8 @@
 # Makefile for the linux kernel.
 #
-obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
+obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
-            exit.o itimer.o time.o softirq.o resource.o \
+            cpu.o exit.o itimer.o time.o softirq.o resource.o \
            sysctl.o capability.o ptrace.o timer.o user.o \
            signal.o sys.o kmod.o workqueue.o pid.o \
            rcupdate.o extable.o params.o posix-timers.o \
@@ -11,6 +11,20 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
            hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
            notifier.o ksysfs.o pm_qos_params.o sched_clock.o
+CFLAGS_REMOVE_sched.o = -mno-spe
+ifdef CONFIG_FTRACE
+# Do not trace debug files and internal ftrace files
+CFLAGS_REMOVE_lockdep.o = -pg
+CFLAGS_REMOVE_lockdep_proc.o = -pg
+CFLAGS_REMOVE_mutex-debug.o = -pg
+CFLAGS_REMOVE_rtmutex-debug.o = -pg
+CFLAGS_REMOVE_cgroup-debug.o = -pg
+CFLAGS_REMOVE_sched_clock.o = -pg
+CFLAGS_REMOVE_sched.o = -mno-spe -pg
+endif
+obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
@@ -27,7 +41,8 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_SMP) += cpu.o spinlock.o
+obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
+obj-$(CONFIG_SMP) += spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
@@ -69,6 +84,9 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_FTRACE) += trace/
+obj-$(CONFIG_TRACING) += trace/
+obj-$(CONFIG_SMP) += sched_cpupri.o
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 91e1cfd734d2..dd68b9059418 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -75,37 +75,39 @@ int acct_parm[3] = {4, 2, 30};
 /*
 * External references and all of the globals.
 */
-static void do_acct_process(struct pid_namespace *ns, struct file *);
+static void do_acct_process(struct bsd_acct_struct *acct,
+                struct pid_namespace *ns, struct file *);
 /*
 * This structure is used so that all the data protected by lock
 * can be placed in the same cache line as the lock.  This primes
 * the cache line to have the data after getting the lock.
 */
-struct acct_glbs {
+struct bsd_acct_struct {
-        spinlock_t              lock;
        volatile int            active;
        volatile int            needcheck;
        struct file             *file;
        struct pid_namespace    *ns;
        struct timer_list       timer;
+        struct list_head        list;
 };
-static struct acct_glbs acct_globals __cacheline_aligned =
+static DEFINE_SPINLOCK(acct_lock);
-        {__SPIN_LOCK_UNLOCKED(acct_globals.lock)};
+static LIST_HEAD(acct_list);
 /*
 * Called whenever the timer says to check the free space.
 */
-static void acct_timeout(unsigned long unused)
+static void acct_timeout(unsigned long x)
 {
-        acct_globals.needcheck = 1;
+        struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
+        acct->needcheck = 1;
 }
 /*
 * Check the amount of free space and suspend/resume accordingly.
 */
-static int check_free_space(struct file *file)
+static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
 {
        struct kstatfs sbuf;
        int res;
@@ -113,11 +115,11 @@ static int check_free_space(struct file *file)
        sector_t resume;
        sector_t suspend;
-        spin_lock(&acct_globals.lock);
+        spin_lock(&acct_lock);
-        res = acct_globals.active;
+        res = acct->active;
-        if (!file || !acct_globals.needcheck)
+        if (!file || !acct->needcheck)
                goto out;
-        spin_unlock(&acct_globals.lock);
+        spin_unlock(&acct_lock);
        /* May block */
        if (vfs_statfs(file->f_path.dentry, &sbuf))
@@ -136,35 +138,35 @@ static int check_free_space(struct file *file)
                act = 0;
        /*
-         * If some joker switched acct_globals.file under us we'ld better be
+         * If some joker switched acct->file under us we'd better be
         * silent and _not_ touch anything.
         */
-        spin_lock(&acct_globals.lock);
+        spin_lock(&acct_lock);
-        if (file != acct_globals.file) {
+        if (file != acct->file) {
                if (act)
                        res = act>0;
                goto out;
        }
-        if (acct_globals.active) {
+        if (acct->active) {
                if (act < 0) {
-                        acct_globals.active = 0;
+                        acct->active = 0;
                        printk(KERN_INFO "Process accounting paused\n");
                }
        } else {
                if (act > 0) {
-                        acct_globals.active = 1;
+                        acct->active = 1;
                        printk(KERN_INFO "Process accounting resumed\n");
                }
        }
-        del_timer(&acct_globals.timer);
+        del_timer(&acct->timer);
-        acct_globals.needcheck = 0;
+        acct->needcheck = 0;
-        acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
+        acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
-        add_timer(&acct_globals.timer);
+        add_timer(&acct->timer);
-        res = acct_globals.active;
+        res = acct->active;
 out:
-        spin_unlock(&acct_globals.lock);
+        spin_unlock(&acct_lock);
        return res;
 }
@@ -172,39 +174,41 @@ out:
 * Close the old accounting file (if currently open) and then replace
 * it with file (if non-NULL).
 *
- * NOTE: acct_globals.lock MUST be held on entry and exit.
+ * NOTE: acct_lock MUST be held on entry and exit.
 */
-static void acct_file_reopen(struct file *file)
+static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
+                struct pid_namespace *ns)
 {
        struct file *old_acct = NULL;
        struct pid_namespace *old_ns = NULL;
-        if (acct_globals.file) {
+        if (acct->file) {
-                old_acct = acct_globals.file;
+                old_acct = acct->file;
-                old_ns = acct_globals.ns;
+                old_ns = acct->ns;
-                del_timer(&acct_globals.timer);
+                del_timer(&acct->timer);
-                acct_globals.active = 0;
+                acct->active = 0;
-                acct_globals.needcheck = 0;
+                acct->needcheck = 0;
-                acct_globals.file = NULL;
+                acct->file = NULL;
+                acct->ns = NULL;
+                list_del(&acct->list);
        }
        if (file) {
-                acct_globals.file = file;
+                acct->file = file;
-                acct_globals.ns = get_pid_ns(task_active_pid_ns(current));
+                acct->ns = ns;
-                acct_globals.needcheck = 0;
+                acct->needcheck = 0;
-                acct_globals.active = 1;
+                acct->active = 1;
+                list_add(&acct->list, &acct_list);
                /* It's been deleted if it was used before so this is safe */
-                init_timer(&acct_globals.timer);
+                setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
-                acct_globals.timer.function = acct_timeout;
+                acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
-                acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
+                add_timer(&acct->timer);
-                add_timer(&acct_globals.timer);
        }
        if (old_acct) {
                mnt_unpin(old_acct->f_path.mnt);
-                spin_unlock(&acct_globals.lock);
+                spin_unlock(&acct_lock);
-                do_acct_process(old_ns, old_acct);
+                do_acct_process(acct, old_ns, old_acct);
                filp_close(old_acct, NULL);
-                put_pid_ns(old_ns);
+                spin_lock(&acct_lock);
-                spin_lock(&acct_globals.lock);
        }
 }
@@ -212,6 +216,8 @@ static int acct_on(char *name)
 {
        struct file *file;
        int error;
+        struct pid_namespace *ns;
+        struct bsd_acct_struct *acct = NULL;
        /* Difference from BSD - they don't do O_APPEND */
        file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
@@ -228,18 +234,34 @@ static int acct_on(char *name)
                return -EIO;
        }
+        ns = task_active_pid_ns(current);
+        if (ns->bacct == NULL) {
+                acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
+                if (acct == NULL) {
+                        filp_close(file, NULL);
+                        return -ENOMEM;
+                }
+        }
        error = security_acct(file);
        if (error) {
+                kfree(acct);
                filp_close(file, NULL);
                return error;
        }
-        spin_lock(&acct_globals.lock);
+        spin_lock(&acct_lock);
+        if (ns->bacct == NULL) {
+                ns->bacct = acct;
+                acct = NULL;
+        }
        mnt_pin(file->f_path.mnt);
-        acct_file_reopen(file);
+        acct_file_reopen(ns->bacct, file, ns);
-        spin_unlock(&acct_globals.lock);
+        spin_unlock(&acct_lock);
        mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
+        kfree(acct);
        return 0;
 }
@@ -269,11 +291,17 @@ asmlinkage long sys_acct(const char __user *name)
                error = acct_on(tmp);
                putname(tmp);
        } else {
+                struct bsd_acct_struct *acct;
+                acct = task_active_pid_ns(current)->bacct;
+                if (acct == NULL)
+                        return 0;
                error = security_acct(NULL);
                if (!error) {
-                        spin_lock(&acct_globals.lock);
+                        spin_lock(&acct_lock);
-                        acct_file_reopen(NULL);
+                        acct_file_reopen(acct, NULL, NULL);
-                        spin_unlock(&acct_globals.lock);
+                        spin_unlock(&acct_lock);
                }
        }
        return error;
@@ -288,10 +316,16 @@ asmlinkage long sys_acct(const char __user *name)
 */
 void acct_auto_close_mnt(struct vfsmount *m)
 {
-        spin_lock(&acct_globals.lock);
+        struct bsd_acct_struct *acct;
-        if (acct_globals.file && acct_globals.file->f_path.mnt == m)
-                acct_file_reopen(NULL);
+        spin_lock(&acct_lock);
-        spin_unlock(&acct_globals.lock);
+restart:
+        list_for_each_entry(acct, &acct_list, list)
+                if (acct->file && acct->file->f_path.mnt == m) {
+                        acct_file_reopen(acct, NULL, NULL);
+                        goto restart;
+                }
+        spin_unlock(&acct_lock);
 }
 /**
@@ -303,12 +337,31 @@ void acct_auto_close_mnt(struct vfsmount *m)
 */
 void acct_auto_close(struct super_block *sb)
 {
-        spin_lock(&acct_globals.lock);
+        struct bsd_acct_struct *acct;
-        if (acct_globals.file &&
-            acct_globals.file->f_path.mnt->mnt_sb == sb) {
+        spin_lock(&acct_lock);
-                acct_file_reopen(NULL);
+restart:
+        list_for_each_entry(acct, &acct_list, list)
+                if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) {
+                        acct_file_reopen(acct, NULL, NULL);
+                        goto restart;
+                }
+        spin_unlock(&acct_lock);
+}
+void acct_exit_ns(struct pid_namespace *ns)
+{
+        struct bsd_acct_struct *acct;
+        spin_lock(&acct_lock);
+        acct = ns->bacct;
+        if (acct != NULL) {
+                if (acct->file != NULL)
+                        acct_file_reopen(acct, NULL, NULL);
+                kfree(acct);
        }
-        spin_unlock(&acct_globals.lock);
+        spin_unlock(&acct_lock);
 }
 /*
@@ -425,7 +478,8 @@ static u32 encode_float(u64 value)
 /*
 *  do_acct_process does all actual work. Caller holds the reference to file.
 */
-static void do_acct_process(struct pid_namespace *ns, struct file *file)
+static void do_acct_process(struct bsd_acct_struct *acct,
+                struct pid_namespace *ns, struct file *file)
 {
        struct pacct_struct *pacct = &current->signal->pacct;
        acct_t ac;
@@ -440,7 +494,7 @@ static void do_acct_process(struct pid_namespace *ns, struct file *file)
         * First check to see if there is enough free_space to continue
         * the process accounting system.
         */
-        if (!check_free_space(file))
+        if (!check_free_space(acct, file))
                return;
        /*
@@ -577,34 +631,46 @@ void acct_collect(long exitcode, int group_dead)
        spin_unlock_irq(&current->sighand->siglock);
 }
-/**
+static void acct_process_in_ns(struct pid_namespace *ns)
- * acct_process - now just a wrapper around do_acct_process
- * @exitcode: task exit code
- *
- * handles process accounting for an exiting task
- */
-void acct_process(void)
 {
        struct file *file = NULL;
-        struct pid_namespace *ns;
+        struct bsd_acct_struct *acct;
+        acct = ns->bacct;
        /*
         * accelerate the common fastpath:
         */
-        if (!acct_globals.file)
+        if (!acct || !acct->file)
                return;
-        spin_lock(&acct_globals.lock);
+        spin_lock(&acct_lock);
-        file = acct_globals.file;
+        file = acct->file;
        if (unlikely(!file)) {
-                spin_unlock(&acct_globals.lock);
+                spin_unlock(&acct_lock);
                return;
        }
        get_file(file);
-        ns = get_pid_ns(acct_globals.ns);
+        spin_unlock(&acct_lock);
-        spin_unlock(&acct_globals.lock);
-        do_acct_process(ns, file);
+        do_acct_process(acct, ns, file);
        fput(file);
-        put_pid_ns(ns);
+}
+/**
+ * acct_process - now just a wrapper around acct_process_in_ns,
+ * which in turn is a wrapper around do_acct_process.
+ *
+ * Handles process accounting for an exiting task: acct_process_in_ns()
+ * is called for the task's active pid namespace and then for every
+ * ancestor namespace reached by following ->parent.
+ */
+void acct_process(void)
+{
+        struct pid_namespace *ns;
+        /*
+         * This loop is safe without locking, since current is still
+         * alive and holds its namespace, which in turn holds
+         * its parent.
+         */
+        for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
+                acct_process_in_ns(ns);
 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index c10e7aae04d7..4699950e65bd 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1476,7 +1476,8 @@ void audit_syscall_entry(int arch, int major,
        struct audit_context *context = tsk->audit_context;
        enum audit_state     state;
-        BUG_ON(!context);
+        if (unlikely(!context))
+                return;
        /*
         * This happens only on certain architectures that make system
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index d1a7605c5b8f..a5e026bc45c4 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -10,30 +10,73 @@
 * of the License.
 */
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/sched.h>
-#include <linux/delay.h>
+#include <linux/stacktrace.h>
+static void backtrace_test_normal(void)
+{
+        printk("Testing a backtrace from process context.\n");
+        printk("The following trace is a kernel self test and not a bug!\n");
-static struct timer_list backtrace_timer;
+        dump_stack();
+}
-static void backtrace_test_timer(unsigned long data)
+static DECLARE_COMPLETION(backtrace_work);
+static void backtrace_test_irq_callback(unsigned long data)
+{
+        dump_stack();
+        complete(&backtrace_work);
+}
+static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
+static void backtrace_test_irq(void)
 {
        printk("Testing a backtrace from irq context.\n");
        printk("The following trace is a kernel self test and not a bug!\n");
-        dump_stack();
+        init_completion(&backtrace_work);
+        tasklet_schedule(&backtrace_tasklet);
+        wait_for_completion(&backtrace_work);
+}
+#ifdef CONFIG_STACKTRACE
+static void backtrace_test_saved(void)
+{
+        struct stack_trace trace;
+        unsigned long entries[8];
+        printk("Testing a saved backtrace.\n");
+        printk("The following trace is a kernel self test and not a bug!\n");
+        trace.nr_entries = 0;
+        trace.max_entries = ARRAY_SIZE(entries);
+        trace.entries = entries;
+        trace.skip = 0;
+        save_stack_trace(&trace);
+        print_stack_trace(&trace, 0);
+}
+#else
+static void backtrace_test_saved(void)
+{
+        printk("Saved backtrace test skipped.\n");
 }
+#endif
 static int backtrace_regression_test(void)
 {
        printk("====[ backtrace testing ]===========\n");
-        printk("Testing a backtrace from process context.\n");
-        printk("The following trace is a kernel self test and not a bug!\n");
-        dump_stack();
-        init_timer(&backtrace_timer);
+        backtrace_test_normal();
-        backtrace_timer.function = backtrace_test_timer;
+        backtrace_test_irq();
-        mod_timer(&backtrace_timer, jiffies + 10);
+        backtrace_test_saved();
-        msleep(10);
        printk("====[ end of backtrace testing ]====\n");
        return 0;
 }
diff --git a/kernel/capability.c b/kernel/capability.c
index 901e0fdc3fff..0101e847603e 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -115,11 +115,208 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
        return 0;
 }
+#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
+/*
+ * Without filesystem capability support, we nominally support one process
+ * setting the capabilities of another.
+ *
+ * cap_get_target_pid - read the capability sets of the task named by @pid.
+ * A @pid of 0, or one equal to the caller's own vpid, selects current;
+ * any other value is looked up with find_task_by_vpid().  In this
+ * configuration task_capability_lock and tasklist_lock are held across
+ * both the lookup and the security_capget() call, whichever task is used.
+ *
+ * Returns -ESRCH when no task matches @pid, otherwise whatever
+ * security_capget() returns.
+ */
+static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
+                                     kernel_cap_t *pIp, kernel_cap_t *pPp)
+{
+        struct task_struct *target;
+        int ret;
+        spin_lock(&task_capability_lock);
+        read_lock(&tasklist_lock);
+        if (pid && pid != task_pid_vnr(current)) {
+                target = find_task_by_vpid(pid);
+                if (!target) {
+                        ret = -ESRCH;
+                        goto out;
+                }
+        } else
+                target = current;
+        ret = security_capget(target, pEp, pIp, pPp);
+out:
+        read_unlock(&tasklist_lock);
+        spin_unlock(&task_capability_lock);
+        return ret;
+}
+/*
+ * cap_set_pg - set capabilities for all processes in a given process
+ * group.  Takes task_capability_lock and tasklist_lock itself for the
+ * duration of the walk.
+ *
+ * Returns 0 when no task was visited or when at least one visited task
+ * passed security_capset_check() (and so had its sets updated), and
+ * -EPERM when tasks were visited but every one of them failed the check.
+ */
+static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
+                             kernel_cap_t *inheritable,
+                             kernel_cap_t *permitted)
+{
+        struct task_struct *g, *target;
+        int ret = -EPERM;
+        int found = 0;
+        struct pid *pgrp;
+        spin_lock(&task_capability_lock);
+        read_lock(&tasklist_lock);
+        pgrp = find_vpid(pgrp_nr);
+        do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
+                target = g;
+                while_each_thread(g, target) {
+                        if (!security_capset_check(target, effective,
+                                                   inheritable, permitted)) {
+                                security_capset_set(target, effective,
+                                                    inheritable, permitted);
+                                ret = 0;
+                        }
+                        found = 1;
+                }
+        } while_each_pid_task(pgrp, PIDTYPE_PGID, g);
+        read_unlock(&tasklist_lock);
+        spin_unlock(&task_capability_lock);
+        if (!found)
+                ret = 0;
+        return ret;
+}
 /*
- * For sys_getproccap() and sys_setproccap(), any of the three
+ * cap_set_all - set capabilities for all processes other than init
- * capability set pointers may be NULL -- indicating that that set is
+ * and self.  Takes task_capability_lock and tasklist_lock itself.
+ *
+ * Returns 0 when no eligible task was found or when at least one
+ * eligible task passed security_capset_check(), and -EPERM when eligible
+ * tasks were found but every one of them failed the check.
- * uninteresting and/or not to be changed.
 */
+static inline int cap_set_all(kernel_cap_t *effective,
+                              kernel_cap_t *inheritable,
+                              kernel_cap_t *permitted)
+{
+        struct task_struct *g, *target;
+        int ret = -EPERM;
+        int found = 0;
+        spin_lock(&task_capability_lock);
+        read_lock(&tasklist_lock);
+        do_each_thread(g, target) {
+                if (target == current
+                    || is_container_init(target->group_leader))
+                        continue;
+                found = 1;
+                if (security_capset_check(target, effective, inheritable,
+                                          permitted))
+                        continue;
+                ret = 0;
+                security_capset_set(target, effective, inheritable, permitted);
+        } while_each_thread(g, target);
+        read_unlock(&tasklist_lock);
+        spin_unlock(&task_capability_lock);
+        if (!found)
+                ret = 0;
+        return ret;
+}
+/*
+ * Given the target pid does not refer to the current process we
+ * need more elaborate support... (This support is not present when
+ * filesystem capabilities are configured.)
+ *
+ * The caller needs CAP_SETPCAP, otherwise -EPERM is returned.  A @pid
+ * of -1 is handed to cap_set_all(); any other negative @pid is handed
+ * to cap_set_pg() as process group -@pid.  Otherwise the single task
+ * found by find_task_by_vpid() is checked and, if the check passes,
+ * updated under task_capability_lock and tasklist_lock: -ESRCH is
+ * returned when no such task exists, else the result of
+ * security_capset_check().
+ */
+static inline int do_sys_capset_other_tasks(pid_t pid, kernel_cap_t *effective,
+                                            kernel_cap_t *inheritable,
+                                            kernel_cap_t *permitted)
+{
+        struct task_struct *target;
+        int ret;
+        if (!capable(CAP_SETPCAP))
+                return -EPERM;
+        if (pid == -1)            /* all procs other than current and init */
+                return cap_set_all(effective, inheritable, permitted);
+        else if (pid < 0)                    /* all procs in process group */
+                return cap_set_pg(-pid, effective, inheritable, permitted);
+        /* target != current */
+        spin_lock(&task_capability_lock);
+        read_lock(&tasklist_lock);
+        target = find_task_by_vpid(pid);
+        if (!target)
+                ret = -ESRCH;
+        else {
+                ret = security_capset_check(target, effective, inheritable,
+                                            permitted);
+                /* having verified that the proposed changes are legal,
+                   we now put them into effect. */
+                if (!ret)
+                        security_capset_set(target, effective, inheritable,
+                                            permitted);
+        }
+        read_unlock(&tasklist_lock);
+        spin_unlock(&task_capability_lock);
+        return ret;
+}
+#else /* ie., def CONFIG_SECURITY_FILE_CAPABILITIES */
+/*
+ * If we have configured with filesystem capability support, then the
+ * only thing that can change the capabilities of the current process
+ * is the current process. As such, we can't be in this code at the
+ * same time as we are in the process of setting capabilities in this
+ * process. The net result is that we can limit our use of locks to
+ * when we are reading the caps of another process.
+ *
+ * cap_get_target_pid - read the capability sets of the task named by
+ * @pid.  A @pid of 0, or the caller's own vpid, reads current without
+ * taking any lock here; any other @pid is looked up and read under
+ * task_capability_lock and tasklist_lock.
+ *
+ * Returns -ESRCH when no task matches @pid, otherwise whatever
+ * security_capget() returns.
+ */
+static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
+                                     kernel_cap_t *pIp, kernel_cap_t *pPp)
+{
+        int ret;
+        if (pid && (pid != task_pid_vnr(current))) {
+                struct task_struct *target;
+                spin_lock(&task_capability_lock);
+                read_lock(&tasklist_lock);
+                target = find_task_by_vpid(pid);
+                if (!target)
+                        ret = -ESRCH;
+                else
+                        ret = security_capget(target, pEp, pIp, pPp);
+                read_unlock(&tasklist_lock);
+                spin_unlock(&task_capability_lock);
+        } else
+                ret = security_capget(current, pEp, pIp, pPp);
+        return ret;
+}
+/*
+ * With filesystem capability support configured, the kernel does not
+ * permit the changing of capabilities in one process by another
+ * process. (CAP_SETPCAP has much less broad semantics when configured
+ * this way.)
+ *
+ * This variant therefore ignores all of its arguments and always
+ * returns -EPERM.
+ */
+static inline int do_sys_capset_other_tasks(pid_t pid,
+                                            kernel_cap_t *effective,
+                                            kernel_cap_t *inheritable,
+                                            kernel_cap_t *permitted)
+{
+        return -EPERM;
+}
+#endif /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */
 /*
 * Atomically modify the effective capabilities returning the original
@@ -155,7 +352,6 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
 {
        int ret = 0;
        pid_t pid;
-        struct task_struct *target;
        unsigned tocopy;
        kernel_cap_t pE, pI, pP;
@@ -169,23 +365,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
        if (pid < 0)
                return -EINVAL;
-        spin_lock(&task_capability_lock);
+        ret = cap_get_target_pid(pid, &pE, &pI, &pP);
-        read_lock(&tasklist_lock);
-        if (pid && pid != task_pid_vnr(current)) {
-                target = find_task_by_vpid(pid);
-                if (!target) {
-                        ret = -ESRCH;
-                        goto out;
-                }
-        } else
-                target = current;
-        ret = security_capget(target, &pE, &pI, &pP);
-out:
-        read_unlock(&tasklist_lock);
-        spin_unlock(&task_capability_lock);
        if (!ret) {
                struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
@@ -216,7 +396,6 @@ out:
                 * before modification is attempted and the application
                 * fails.
                 */
                if (copy_to_user(dataptr, kdata, tocopy
                                 * sizeof(struct __user_cap_data_struct))) {
                        return -EFAULT;
@@ -226,70 +405,8 @@ out:
        return ret;
 }
-/*
- * cap_set_pg - set capabilities for all processes in a given process
- * group.  We call this holding task_capability_lock and tasklist_lock.
- */
-static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
-                              kernel_cap_t *inheritable,
-                              kernel_cap_t *permitted)
-{
-        struct task_struct *g, *target;
-        int ret = -EPERM;
-        int found = 0;
-        struct pid *pgrp;
-        pgrp = find_vpid(pgrp_nr);
-        do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
-                target = g;
-                while_each_thread(g, target) {
-                        if (!security_capset_check(target, effective,
-                                                        inheritable,
-                                                        permitted)) {
-                                security_capset_set(target, effective,
-                                                        inheritable,
-                                                        permitted);
-                                ret = 0;
-                        }
-                        found = 1;
-                }
-        } while_each_pid_task(pgrp, PIDTYPE_PGID, g);
-        if (!found)
-                ret = 0;
-        return ret;
-}
-/*
- * cap_set_all - set capabilities for all processes other than init
- * and self.  We call this holding task_capability_lock and tasklist_lock.
- */
-static inline int cap_set_all(kernel_cap_t *effective,
-                               kernel_cap_t *inheritable,
-                               kernel_cap_t *permitted)
-{
-     struct task_struct *g, *target;
-     int ret = -EPERM;
-     int found = 0;
-     do_each_thread(g, target) {
-             if (target == current || is_container_init(target->group_leader))
-                     continue;
-             found = 1;
-             if (security_capset_check(target, effective, inheritable,
-                                                permitted))
-                     continue;
-             ret = 0;
-             security_capset_set(target, effective, inheritable, permitted);
-     } while_each_thread(g, target);
-     if (!found)
-             ret = 0;
-     return ret;
-}
 /**
- * sys_capset - set capabilities for a process or a group of processes
+ * sys_capset - set capabilities for a process or (*) a group of processes
 * @header: pointer to struct that contains capability version and
 *      target pid data
 * @data: pointer to struct that contains the effective, permitted,
@@ -313,7 +430,6 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
        struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
        unsigned i, tocopy;
        kernel_cap_t inheritable, permitted, effective;
-        struct task_struct *target;
        int ret;
        pid_t pid;
@@ -324,9 +440,6 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
        if (get_user(pid, &header->pid))
                return -EFAULT;
-        if (pid && pid != task_pid_vnr(current) && !capable(CAP_SETPCAP))
-                return -EPERM;
        if (copy_from_user(&kdata, data, tocopy
                           * sizeof(struct __user_cap_data_struct))) {
                return -EFAULT;
@@ -344,40 +457,31 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
                i++;
        }
-        spin_lock(&task_capability_lock);
+        if (pid && (pid != task_pid_vnr(current)))
-        read_lock(&tasklist_lock);
+                ret = do_sys_capset_other_tasks(pid, &effective, &inheritable,
+                                                &permitted);
-        if (pid > 0 && pid != task_pid_vnr(current)) {
+        else {
-                target = find_task_by_vpid(pid);
+                /*
-                if (!target) {
+                 * This lock is required even when filesystem
-                        ret = -ESRCH;
+                 * capability support is configured - it protects the
-                        goto out;
+                 * sys_capget() call from returning incorrect data in
-                }
+                 * the case that the targeted process is not the
-        } else
+                 * current one.
-                target = current;
+                 */
+                spin_lock(&task_capability_lock);
-        ret = 0;
-        /* having verified that the proposed changes are legal,
-           we now put them into effect. */
-        if (pid < 0) {
-                if (pid == -1)  /* all procs other than current and init */
-                        ret = cap_set_all(&effective, &inheritable, &permitted);
-                else            /* all procs in process group */
+                ret = security_capset_check(current, &effective, &inheritable,
-                        ret = cap_set_pg(-pid, &effective, &inheritable,
-                                         &permitted);
-        } else {
-                ret = security_capset_check(target, &effective, &inheritable,
                                            &permitted);
+                /*
+                 * Having verified that the proposed changes are
+                 * legal, we now put them into effect.
+                 */
                if (!ret)
-                        security_capset_set(target, &effective, &inheritable,
+                        security_capset_set(current, &effective, &inheritable,
                                            &permitted);
+                spin_unlock(&task_capability_lock);
        }
-out:
-        read_unlock(&tasklist_lock);
-        spin_unlock(&task_capability_lock);
        return ret;
 }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 15ac0e1e4f4d..657f8f8d93a5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -45,6 +45,7 @@
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
 #include <linux/hash.h>
+#include <linux/namei.h>
 #include <asm/atomic.h>
@@ -89,11 +90,7 @@ struct cgroupfs_root {
        /* Hierarchy-specific flags */
        unsigned long flags;
-        /* The path to use for release notifications. No locking
+        /* The path to use for release notifications. */
-         * between setting and use - so if userspace updates this
-         * while child cgroups exist, you could miss a
-         * notification. We ensure that it's always a valid
-         * NUL-terminated string */
        char release_agent_path[PATH_MAX];
 };
@@ -118,7 +115,7 @@ static int root_count;
 * extra work in the fork/exit path if none of the subsystems need to
 * be called.
 */
-static int need_forkexit_callback;
+static int need_forkexit_callback __read_mostly;
 static int need_mm_owner_callback __read_mostly;
 /* convenient tests for these bits */
@@ -220,7 +217,7 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
 * task until after the first call to cgroup_iter_start(). This
 * reduces the fork()/exit() overhead for people who have cgroups
 * compiled into their kernel but not actually in use */
-static int use_task_css_set_links;
+static int use_task_css_set_links __read_mostly;
 /* When we create or destroy a css_set, the operation simply
 * takes/releases a reference count on all the cgroups referenced
@@ -241,17 +238,20 @@ static int use_task_css_set_links;
 */
 static void unlink_css_set(struct css_set *cg)
 {
+        struct cg_cgroup_link *link;
+        struct cg_cgroup_link *saved_link;
        write_lock(&css_set_lock);
        hlist_del(&cg->hlist);
        css_set_count--;
-        while (!list_empty(&cg->cg_links)) {
-                struct cg_cgroup_link *link;
+        list_for_each_entry_safe(link, saved_link, &cg->cg_links,
-                link = list_entry(cg->cg_links.next,
+                                 cg_link_list) {
-                                  struct cg_cgroup_link, cg_link_list);
                list_del(&link->cg_link_list);
                list_del(&link->cgrp_link_list);
                kfree(link);
        }
        write_unlock(&css_set_lock);
 }
@@ -363,15 +363,14 @@ static struct css_set *find_existing_css_set(
 static int allocate_cg_links(int count, struct list_head *tmp)
 {
        struct cg_cgroup_link *link;
+        struct cg_cgroup_link *saved_link;
        int i;
        INIT_LIST_HEAD(tmp);
        for (i = 0; i < count; i++) {
                link = kmalloc(sizeof(*link), GFP_KERNEL);
                if (!link) {
-                        while (!list_empty(tmp)) {
+                        list_for_each_entry_safe(link, saved_link, tmp,
-                                link = list_entry(tmp->next,
+                                                 cgrp_link_list) {
-                                                  struct cg_cgroup_link,
-                                                  cgrp_link_list);
                                list_del(&link->cgrp_link_list);
                                kfree(link);
                        }
@@ -384,11 +383,10 @@ static int allocate_cg_links(int count, struct list_head *tmp)
 static void free_cg_links(struct list_head *tmp)
 {
-        while (!list_empty(tmp)) {
+        struct cg_cgroup_link *link;
-                struct cg_cgroup_link *link;
+        struct cg_cgroup_link *saved_link;
-                link = list_entry(tmp->next,
-                                  struct cg_cgroup_link,
+        list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
-                                  cgrp_link_list);
                list_del(&link->cgrp_link_list);
                kfree(link);
        }
@@ -415,11 +413,11 @@ static struct css_set *find_css_set(
        /* First see if we already have a cgroup group that matches
         * the desired set */
-        write_lock(&css_set_lock);
+        read_lock(&css_set_lock);
        res = find_existing_css_set(oldcg, cgrp, template);
        if (res)
                get_css_set(res);
-        write_unlock(&css_set_lock);
+        read_unlock(&css_set_lock);
        if (res)
                return res;
@@ -507,10 +505,6 @@ static struct css_set *find_css_set(
 * knows that the cgroup won't be removed, as cgroup_rmdir()
 * needs that mutex.
 *
- * The cgroup_common_file_write handler for operations that modify
- * the cgroup hierarchy holds cgroup_mutex across the entire operation,
- * single threading all such cgroup modifications across the system.
- *
 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
 * (usually) take cgroup_mutex.  These are the two most performance
 * critical pieces of code here.  The exception occurs on cgroup_exit(),
@@ -1093,6 +1087,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
        struct cgroupfs_root *root = sb->s_fs_info;
        struct cgroup *cgrp = &root->top_cgroup;
        int ret;
+        struct cg_cgroup_link *link;
+        struct cg_cgroup_link *saved_link;
        BUG_ON(!root);
@@ -1112,10 +1108,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
         * root cgroup
         */
        write_lock(&css_set_lock);
-        while (!list_empty(&cgrp->css_sets)) {
-                struct cg_cgroup_link *link;
+        list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
-                link = list_entry(cgrp->css_sets.next,
+                                 cgrp_link_list) {
-                                  struct cg_cgroup_link, cgrp_link_list);
                list_del(&link->cg_link_list);
                list_del(&link->cgrp_link_list);
                kfree(link);
@@ -1281,18 +1276,14 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 }
 /*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with
+ * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * cgroup_mutex, may take task_lock of task
+ * held. May take task_lock of task
 */
-static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
 {
-        pid_t pid;
        struct task_struct *tsk;
        int ret;
-        if (sscanf(pidbuf, "%d", &pid) != 1)
-                return -EIO;
        if (pid) {
                rcu_read_lock();
                tsk = find_task_by_vpid(pid);
@@ -1318,6 +1309,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
        return ret;
 }
+static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
+{
+        int ret;
+        if (!cgroup_lock_live_group(cgrp))
+                return -ENODEV;
+        ret = attach_task_by_pid(cgrp, pid);
+        cgroup_unlock();
+        return ret;
+}
 /* The various types of files and directories in a cgroup file system */
 enum cgroup_filetype {
        FILE_ROOT,
@@ -1327,12 +1328,54 @@ enum cgroup_filetype {
        FILE_RELEASE_AGENT,
 };
+/**
+ * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
+ * @cgrp: the cgroup to be checked for liveness
+ *
+ * On success, returns true; the lock should be later released with
+ * cgroup_unlock(). On failure returns false with no lock held.
+ */
+bool cgroup_lock_live_group(struct cgroup *cgrp)
+{
+        mutex_lock(&cgroup_mutex);
+        if (cgroup_is_removed(cgrp)) {
+                mutex_unlock(&cgroup_mutex);
+                return false;
+        }
+        return true;
+}
+static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
+                                      const char *buffer)
+{
+        BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+        if (!cgroup_lock_live_group(cgrp))
+                return -ENODEV;
+        strcpy(cgrp->root->release_agent_path, buffer);
+        cgroup_unlock();
+        return 0;
+}
+static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
+                                     struct seq_file *seq)
+{
+        if (!cgroup_lock_live_group(cgrp))
+                return -ENODEV;
+        seq_puts(seq, cgrp->root->release_agent_path);
+        seq_putc(seq, '\n');
+        cgroup_unlock();
+        return 0;
+}
+/* A buffer size big enough for numbers or short strings */
+#define CGROUP_LOCAL_BUFFER_SIZE 64
 static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
                                struct file *file,
                                const char __user *userbuf,
                                size_t nbytes, loff_t *unused_ppos)
 {
-        char buffer[64];
+        char buffer[CGROUP_LOCAL_BUFFER_SIZE];
        int retval = 0;
        char *end;
@@ -1361,68 +1404,36 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
        return retval;
 }
+/*
+ * cgroup_write_string - pass a user-supplied string to cft->write_string()
+ *
+ * Writes of cft->max_write_len bytes or more (or of sizeof(local_buffer) - 1
+ * or more when max_write_len is 0) are rejected with -E2BIG.  Short writes
+ * are staged in an on-stack buffer; longer ones in a kmalloc()ed buffer,
+ * which is freed on every path after the allocation.  The data is
+ * NUL-terminated and run through strstrip() before the handler is called.
+ *
+ * Returns nbytes if the handler returns 0, otherwise the handler's return
+ * value; -ENOMEM or -EFAULT if staging the user data fails.
+ */
-static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
+static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
-                                           struct cftype *cft,
+                                   struct file *file,
-                                           struct file *file,
+                                   const char __user *userbuf,
-                                           const char __user *userbuf,
+                                   size_t nbytes, loff_t *unused_ppos)
-                                           size_t nbytes, loff_t *unused_ppos)
 {
-        enum cgroup_filetype type = cft->private;
+        char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
-        char *buffer;
        int retval = 0;
+        size_t max_bytes = cft->max_write_len;
+        char *buffer = local_buffer;
-        if (nbytes >= PATH_MAX)
+        if (!max_bytes)
+                max_bytes = sizeof(local_buffer) - 1;
+        if (nbytes >= max_bytes)
                return -E2BIG;
+        /* Allocate a dynamic buffer if we need one */
-        /* +1 for nul-terminator */
+        if (nbytes >= sizeof(local_buffer)) {
-        buffer = kmalloc(nbytes + 1, GFP_KERNEL);
+                buffer = kmalloc(nbytes + 1, GFP_KERNEL);
-        if (buffer == NULL)
+                if (buffer == NULL)
-                return -ENOMEM;
+                        return -ENOMEM;
-        if (copy_from_user(buffer, userbuf, nbytes)) {
-                retval = -EFAULT;
-                goto out1;
        }
-        buffer[nbytes] = 0;     /* nul-terminate */
+        if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
-        strstrip(buffer);       /* strip -just- trailing whitespace */
+                /* Must not return directly: buffer may be kmalloc()ed */
+                retval = -EFAULT;
+                goto out;
+        }
-        mutex_lock(&cgroup_mutex);
-        /*
+        buffer[nbytes] = 0;     /* nul-terminate */
-         * This was already checked for in cgroup_file_write(), but
+        strstrip(buffer);
-         * check again now we're holding cgroup_mutex.
+        retval = cft->write_string(cgrp, cft, buffer);
-         */
+        if (!retval)
-        if (cgroup_is_removed(cgrp)) {
-                retval = -ENODEV;
-                goto out2;
-        }
-        switch (type) {
-        case FILE_TASKLIST:
-                retval = attach_task_by_pid(cgrp, buffer);
-                break;
-        case FILE_NOTIFY_ON_RELEASE:
-                clear_bit(CGRP_RELEASABLE, &cgrp->flags);
-                if (simple_strtoul(buffer, NULL, 10) != 0)
-                        set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
-                else
-                        clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
-                break;
-        case FILE_RELEASE_AGENT:
-                BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
-                strcpy(cgrp->root->release_agent_path, buffer);
-                break;
-        default:
-                retval = -EINVAL;
-                goto out2;
-        }
-        if (retval == 0)
                retval = nbytes;
-out2:
+out:
+        if (buffer != local_buffer)
-        mutex_unlock(&cgroup_mutex);
+                kfree(buffer);
-out1:
-        kfree(buffer);
        return retval;
 }
@@ -1438,6 +1449,8 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
                return cft->write(cgrp, cft, file, buf, nbytes, ppos);
        if (cft->write_u64 || cft->write_s64)
                return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
+        if (cft->write_string)
+                return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
        if (cft->trigger) {
                int ret = cft->trigger(cgrp, (unsigned int)cft->private);
                return ret ? ret : nbytes;
@@ -1450,7 +1463,7 @@ static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
                               char __user *buf, size_t nbytes,
                               loff_t *ppos)
 {
-        char tmp[64];
+        char tmp[CGROUP_LOCAL_BUFFER_SIZE];
        u64 val = cft->read_u64(cgrp, cft);
        int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
@@ -1462,56 +1475,13 @@ static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
                               char __user *buf, size_t nbytes,
                               loff_t *ppos)
 {
-        char tmp[64];
+        char tmp[CGROUP_LOCAL_BUFFER_SIZE];
        s64 val = cft->read_s64(cgrp, cft);
        int len = sprintf(tmp, "%lld\n", (long long) val);
        return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }
-static ssize_t cgroup_common_file_read(struct cgroup *cgrp,
-                                          struct cftype *cft,
-                                          struct file *file,
-                                          char __user *buf,
-                                          size_t nbytes, loff_t *ppos)
-{
-        enum cgroup_filetype type = cft->private;
-        char *page;
-        ssize_t retval = 0;
-        char *s;
-        if (!(page = (char *)__get_free_page(GFP_KERNEL)))
-                return -ENOMEM;
-        s = page;
-        switch (type) {
-        case FILE_RELEASE_AGENT:
-        {
-                struct cgroupfs_root *root;
-                size_t n;
-                mutex_lock(&cgroup_mutex);
-                root = cgrp->root;
-                n = strnlen(root->release_agent_path,
-                            sizeof(root->release_agent_path));
-                n = min(n, (size_t) PAGE_SIZE);
-                strncpy(s, root->release_agent_path, n);
-                mutex_unlock(&cgroup_mutex);
-                s += n;
-                break;
-        }
-        default:
-                retval = -EINVAL;
-                goto out;
-        }
-        *s++ = '\n';
-        retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
-out:
-        free_page((unsigned long)page);
-        return retval;
-}
 static ssize_t cgroup_file_read(struct file *file, char __user *buf,
                                   size_t nbytes, loff_t *ppos)
 {
@@ -1560,7 +1530,7 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
        return cft->read_seq_string(state->cgroup, cft, m);
 }
-int cgroup_seqfile_release(struct inode *inode, struct file *file)
+static int cgroup_seqfile_release(struct inode *inode, struct file *file)
 {
        struct seq_file *seq = file->private_data;
        kfree(seq->private);
@@ -1569,6 +1539,7 @@ int cgroup_seqfile_release(struct inode *inode, struct file *file)
 static struct file_operations cgroup_seqfile_operations = {
        .read = seq_read,
+        .write = cgroup_file_write,
        .llseek = seq_lseek,
        .release = cgroup_seqfile_release,
 };
@@ -1756,15 +1727,11 @@ int cgroup_add_files(struct cgroup *cgrp,
 int cgroup_task_count(const struct cgroup *cgrp)
 {
        int count = 0;
-        struct list_head *l;
+        struct cg_cgroup_link *link;
        read_lock(&css_set_lock);
-        l = cgrp->css_sets.next;
+        list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
-        while (l != &cgrp->css_sets) {
-                struct cg_cgroup_link *link =
-                        list_entry(l, struct cg_cgroup_link, cgrp_link_list);
                count += atomic_read(&link->cg->ref.refcount);
-                l = l->next;
        }
        read_unlock(&css_set_lock);
        return count;
@@ -2227,6 +2194,18 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
        return notify_on_release(cgrp);
 }
+static int cgroup_write_notify_on_release(struct cgroup *cgrp,
+                                          struct cftype *cft,
+                                          u64 val)
+{
+        clear_bit(CGRP_RELEASABLE, &cgrp->flags);
+        if (val)
+                set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+        else
+                clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+        return 0;
+}
 /*
 * for the common functions, 'private' gives the type of file
 */
@@ -2235,7 +2214,7 @@ static struct cftype files[] = {
                .name = "tasks",
                .open = cgroup_tasks_open,
                .read = cgroup_tasks_read,
-                .write = cgroup_common_file_write,
+                .write_u64 = cgroup_tasks_write,
                .release = cgroup_tasks_release,
                .private = FILE_TASKLIST,
        },
@@ -2243,15 +2222,16 @@ static struct cftype files[] = {
        {
                .name = "notify_on_release",
                .read_u64 = cgroup_read_notify_on_release,
-                .write = cgroup_common_file_write,
+                .write_u64 = cgroup_write_notify_on_release,
                .private = FILE_NOTIFY_ON_RELEASE,
        },
 };
 static struct cftype cft_release_agent = {
        .name = "release_agent",
-        .read = cgroup_common_file_read,
+        .read_seq_string = cgroup_release_agent_show,
-        .write = cgroup_common_file_write,
+        .write_string = cgroup_release_agent_write,
+        .max_write_len = PATH_MAX,
        .private = FILE_RELEASE_AGENT,
 };
@@ -2869,16 +2849,17 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 * cgroup_clone - clone the cgroup the given subsystem is attached to
 * @tsk: the task to be moved
 * @subsys: the given subsystem
+ * @nodename: the name for the new cgroup
 *
 * Duplicate the current cgroup in the hierarchy that the given
 * subsystem is attached to, and move this task into the new
 * child.
 */
-int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
+int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
+                                                        char *nodename)
 {
        struct dentry *dentry;
        int ret = 0;
-        char nodename[MAX_CGROUP_TYPE_NAMELEN];
        struct cgroup *parent, *child;
        struct inode *inode;
        struct css_set *cg;
@@ -2903,8 +2884,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
        cg = tsk->cgroups;
        parent = task_cgroup(tsk, subsys->subsys_id);
-        snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "%d", tsk->pid);
        /* Pin the hierarchy */
        atomic_inc(&parent->root->sb->s_active);
@@ -3078,27 +3057,24 @@ static void cgroup_release_agent(struct work_struct *work)
        while (!list_empty(&release_list)) {
                char *argv[3], *envp[3];
                int i;
-                char *pathbuf;
+                char *pathbuf = NULL, *agentbuf = NULL;
                struct cgroup *cgrp = list_entry(release_list.next,
                                                    struct cgroup,
                                                    release_list);
                list_del_init(&cgrp->release_list);
                spin_unlock(&release_list_lock);
                pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
-                if (!pathbuf) {
+                if (!pathbuf)
-                        spin_lock(&release_list_lock);
+                        goto continue_free;
-                        continue;
+                if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
-                }
+                        goto continue_free;
+                agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
-                if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) {
+                if (!agentbuf)
-                        kfree(pathbuf);
+                        goto continue_free;
-                        spin_lock(&release_list_lock);
-                        continue;
-                }
                i = 0;
-                argv[i++] = cgrp->root->release_agent_path;
+                argv[i++] = agentbuf;
-                argv[i++] = (char *)pathbuf;
+                argv[i++] = pathbuf;
                argv[i] = NULL;
                i = 0;
@@ -3112,8 +3088,10 @@ static void cgroup_release_agent(struct work_struct *work)
                 * be a slow process */
                mutex_unlock(&cgroup_mutex);
                call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
-                kfree(pathbuf);
                mutex_lock(&cgroup_mutex);
+ continue_free:
+                kfree(pathbuf);
+                kfree(agentbuf);
                spin_lock(&release_list_lock);
        }
        spin_unlock(&release_list_lock);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index c77bc3a1c722..10ba5f1004a5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,6 +15,28 @@
 #include <linux/stop_machine.h>
 #include <linux/mutex.h>
+/*
+ * Represents all cpu's present in the system.
+ * In systems capable of hotplug, this map could dynamically grow
+ * as new cpu's are detected in the system via any platform-specific
+ * method, such as ACPI.
+ */
+cpumask_t cpu_present_map __read_mostly;
+EXPORT_SYMBOL(cpu_present_map);
+#ifndef CONFIG_SMP
+/*
+ * Represents all cpu's that are currently online.
+ */
+cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_online_map);
+cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_possible_map);
+#else /* CONFIG_SMP */
 /* Serializes the updates to cpu_online_map, cpu_present_map */
 static DEFINE_MUTEX(cpu_add_remove_lock);
@@ -42,6 +64,8 @@ void __init cpu_hotplug_init(void)
        cpu_hotplug.refcount = 0;
 }
+cpumask_t cpu_active_map;
 #ifdef CONFIG_HOTPLUG_CPU
 void get_online_cpus(void)
@@ -261,6 +285,11 @@ out_allowed:
        set_cpus_allowed_ptr(current, &old_allowed);
 out_release:
        cpu_hotplug_done();
+        if (!err) {
+                if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod,
+                                            hcpu) == NOTIFY_BAD)
+                        BUG();
+        }
        return err;
 }
@@ -269,14 +298,34 @@ int __ref cpu_down(unsigned int cpu)
        int err = 0;
        cpu_maps_update_begin();
-        if (cpu_hotplug_disabled)
+        if (cpu_hotplug_disabled) {
                err = -EBUSY;
-        else
+                goto out;
-                err = _cpu_down(cpu, 0);
+        }
+        cpu_clear(cpu, cpu_active_map);
+        /*
+         * Make sure that all cpus did the reschedule and are not
+         * using a stale version of the cpu_active_map.
+         * This is not strictly necessary because stop_machine()
+         * that we run down the line already provides the required
+         * synchronization. But it's really a side effect and we do not
+         * want to depend on the innards of the stop_machine here.
+         */
+        synchronize_sched();
+        err = _cpu_down(cpu, 0);
+        if (cpu_online(cpu))
+                cpu_set(cpu, cpu_active_map);
+out:
        cpu_maps_update_done();
        return err;
 }
+EXPORT_SYMBOL(cpu_down);
 #endif /*CONFIG_HOTPLUG_CPU*/
 /* Requires cpu_add_remove_lock to be held */
@@ -332,11 +381,18 @@ int __cpuinit cpu_up(unsigned int cpu)
        }
        cpu_maps_update_begin();
-        if (cpu_hotplug_disabled)
+        if (cpu_hotplug_disabled) {
                err = -EBUSY;
-        else
+                goto out;
-                err = _cpu_up(cpu, 0);
+        }
+        err = _cpu_up(cpu, 0);
+        if (cpu_online(cpu))
+                cpu_set(cpu, cpu_active_map);
+out:
        cpu_maps_update_done();
        return err;
 }
@@ -390,7 +446,7 @@ void __ref enable_nonboot_cpus(void)
                goto out;
        printk("Enabling non-boot CPUs ...\n");
-        for_each_cpu_mask(cpu, frozen_cpus) {
+        for_each_cpu_mask_nr(cpu, frozen_cpus) {
                error = _cpu_up(cpu, 1);
                if (!error) {
                        printk("CPU%d is up\n", cpu);
@@ -403,3 +459,5 @@ out:
        cpu_maps_update_done();
 }
 #endif /* CONFIG_PM_SLEEP_SMP */
+#endif /* CONFIG_SMP */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 798b3ab054eb..91cf85b36dd5 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -227,10 +227,6 @@ static struct cpuset top_cpuset = {
 * The task_struct fields mems_allowed and mems_generation may only
 * be accessed in the context of that task, so require no locks.
 *
- * The cpuset_common_file_write handler for operations that modify
- * the cpuset hierarchy holds cgroup_mutex across the entire operation,
- * single threading all such cpuset modifications across the system.
- *
 * The cpuset_common_file_read() handlers only hold callback_mutex across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
@@ -369,7 +365,7 @@ void cpuset_update_task_memory_state(void)
                my_cpusets_mem_gen = top_cpuset.mems_generation;
        } else {
                rcu_read_lock();
-                my_cpusets_mem_gen = task_cs(current)->mems_generation;
+                my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
                rcu_read_unlock();
        }
@@ -500,11 +496,16 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
 /*
 * rebuild_sched_domains()
 *
- * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * This routine will be called to rebuild the scheduler's dynamic
- * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * sched domains:
- * which has that flag enabled, or if any cpuset with a non-empty
+ * - if the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' is removed, then call this routine to rebuild the
+ *   'cpus' changes,
- * scheduler's dynamic sched domains.
+ * - or if the 'cpus' allowed changes in any cpuset which has that
+ *   flag enabled,
+ * - or if the 'sched_relax_domain_level' of any cpuset which has
+ *   that flag enabled and with non-empty 'cpus' changes,
+ * - or if any cpuset with non-empty 'cpus' is removed,
+ * - or if a cpu gets offlined.
 *
 * This routine builds a partial partition of the systems CPUs
 * (the set of non-overlappping cpumask_t's in the array 'part'
@@ -564,7 +565,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
 *      partition_sched_domains().
 */
-static void rebuild_sched_domains(void)
+void rebuild_sched_domains(void)
 {
        struct kfifo *q;        /* queue of cpusets to be scanned */
        struct cpuset *cp;      /* scans q */
@@ -609,8 +610,13 @@ static void rebuild_sched_domains(void)
        while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
                struct cgroup *cont;
                struct cpuset *child;   /* scans child cpusets of cp */
+                if (cpus_empty(cp->cpus_allowed))
+                        continue;
                if (is_sched_load_balance(cp))
                        csa[csn++] = cp;
                list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
                        child = cgroup_cs(cont);
                        __kfifo_put(q, (void *)&child, sizeof(cp));
@@ -679,7 +685,9 @@ restart:
                                if (apn == b->pn) {
                                        cpus_or(*dp, *dp, b->cpus_allowed);
                                        b->pn = -1;
-                                        update_domain_attr(dattr, b);
+                                        if (dattr)
+                                                update_domain_attr(dattr
+                                                                   + nslot, b);
                                }
                        }
                        nslot++;
@@ -701,36 +709,6 @@ done:
        /* Don't kfree(dattr) -- partition_sched_domains() does that. */
 }
-static inline int started_after_time(struct task_struct *t1,
-                                     struct timespec *time,
-                                     struct task_struct *t2)
-{
-        int start_diff = timespec_compare(&t1->start_time, time);
-        if (start_diff > 0) {
-                return 1;
-        } else if (start_diff < 0) {
-                return 0;
-        } else {
-                /*
-                 * Arbitrarily, if two processes started at the same
-                 * time, we'll say that the lower pointer value
-                 * started first. Note that t2 may have exited by now
-                 * so this may not be a valid pointer any longer, but
-                 * that's fine - it still serves to distinguish
-                 * between two tasks started (effectively)
-                 * simultaneously.
-                 */
-                return t1 > t2;
-        }
-}
-static inline int started_after(void *p1, void *p2)
-{
-        struct task_struct *t1 = p1;
-        struct task_struct *t2 = p2;
-        return started_after_time(t1, &t2->start_time, t2);
-}
 /**
 * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
 * @tsk: task to test
@@ -766,15 +744,49 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
 }
 /**
+ * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ *
+ * Called with cgroup_mutex held
+ *
+ * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
+ * calling callback functions for each.
+ *
+ * Return 0 if successful, -errno if not.
+ */
+static int update_tasks_cpumask(struct cpuset *cs)
+{
+        struct cgroup_scanner scan;
+        struct ptr_heap heap;
+        int retval;
+        /*
+         * cgroup_scan_tasks() will initialize heap->gt for us.
+         * heap_init() is still needed here because we should not change
+         * cs->cpus_allowed when heap_init() fails.
+         */
+        retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+        if (retval)
+                return retval;
+        scan.cg = cs->css.cgroup;
+        scan.test_task = cpuset_test_cpumask;
+        scan.process_task = cpuset_change_cpumask;
+        scan.heap = &heap;
+        retval = cgroup_scan_tasks(&scan);
+        heap_free(&heap);
+        return retval;
+}
+/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @buf: buffer of cpu numbers written to this cpuset
 */
-static int update_cpumask(struct cpuset *cs, char *buf)
+static int update_cpumask(struct cpuset *cs, const char *buf)
 {
        struct cpuset trialcs;
-        struct cgroup_scanner scan;
-        struct ptr_heap heap;
        int retval;
        int is_load_balanced;
@@ -790,7 +802,6 @@ static int update_cpumask(struct cpuset *cs, char *buf)
         * that parsing.  The validate_change() call ensures that cpusets
         * with tasks have cpus.
         */
-        buf = strstrip(buf);
        if (!*buf) {
                cpus_clear(trialcs.cpus_allowed);
        } else {
@@ -809,10 +820,6 @@ static int update_cpumask(struct cpuset *cs, char *buf)
        if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
                return 0;
-        retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
-        if (retval)
-                return retval;
        is_load_balanced = is_sched_load_balance(&trialcs);
        mutex_lock(&callback_mutex);
@@ -823,12 +830,9 @@ static int update_cpumask(struct cpuset *cs, char *buf)
         * Scan tasks in the cpuset, and update the cpumasks of any
         * that need an update.
         */
-        scan.cg = cs->css.cgroup;
+        retval = update_tasks_cpumask(cs);
-        scan.test_task = cpuset_test_cpumask;
+        if (retval < 0)
-        scan.process_task = cpuset_change_cpumask;
+                return retval;
-        scan.heap = &heap;
-        cgroup_scan_tasks(&scan);
-        heap_free(&heap);
        if (is_load_balanced)
                rebuild_sched_domains();
@@ -884,74 +888,25 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
        mutex_unlock(&callback_mutex);
 }
-/*
- * Handle user request to change the 'mems' memory placement
- * of a cpuset.  Needs to validate the request, update the
- * cpusets mems_allowed and mems_generation, and for each
- * task in the cpuset, rebind any vma mempolicies and if
- * the cpuset is marked 'memory_migrate', migrate the tasks
- * pages to the new memory.
- *
- * Call with cgroup_mutex held.  May take callback_mutex during call.
- * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
- * lock each such tasks mm->mmap_sem, scan its vma's and rebind
- * their mempolicies to the cpusets new mems_allowed.
- */
 static void *cpuset_being_rebound;
-static int update_nodemask(struct cpuset *cs, char *buf)
+/**
+ * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
+ * @oldmem: old mems_allowed of cpuset cs
+ *
+ * Called with cgroup_mutex held
+ * Return 0 if successful, -errno if not.
+ */
+static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 {
-        struct cpuset trialcs;
-        nodemask_t oldmem;
        struct task_struct *p;
        struct mm_struct **mmarray;
        int i, n, ntasks;
        int migrate;
        int fudge;
-        int retval;
        struct cgroup_iter it;
+        int retval;
-        /*
-         * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
-         * it's read-only
-         */
-        if (cs == &top_cpuset)
-                return -EACCES;
-        trialcs = *cs;
-        /*
-         * An empty mems_allowed is ok iff there are no tasks in the cpuset.
-         * Since nodelist_parse() fails on an empty mask, we special case
-         * that parsing.  The validate_change() call ensures that cpusets
-         * with tasks have memory.
-         */
-        buf = strstrip(buf);
-        if (!*buf) {
-                nodes_clear(trialcs.mems_allowed);
-        } else {
-                retval = nodelist_parse(buf, trialcs.mems_allowed);
-                if (retval < 0)
-                        goto done;
-                if (!nodes_subset(trialcs.mems_allowed,
-                                node_states[N_HIGH_MEMORY]))
-                        return -EINVAL;
-        }
-        oldmem = cs->mems_allowed;
-        if (nodes_equal(oldmem, trialcs.mems_allowed)) {
-                retval = 0;             /* Too easy - nothing to do */
-                goto done;
-        }
-        retval = validate_change(cs, &trialcs);
-        if (retval < 0)
-                goto done;
-        mutex_lock(&callback_mutex);
-        cs->mems_allowed = trialcs.mems_allowed;
-        cs->mems_generation = cpuset_mems_generation++;
-        mutex_unlock(&callback_mutex);
        cpuset_being_rebound = cs;              /* causes mpol_dup() rebind */
@@ -1018,7 +973,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
                mpol_rebind_mm(mm, &cs->mems_allowed);
                if (migrate)
-                        cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
+                        cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
                mmput(mm);
        }
@@ -1030,6 +985,70 @@ done:
        return retval;
 }
+/*
+ * Handle user request to change the 'mems' memory placement
+ * of a cpuset.  Needs to validate the request, update the
+ * cpuset's mems_allowed and mems_generation, and for each
+ * task in the cpuset, rebind any vma mempolicies and if
+ * the cpuset is marked 'memory_migrate', migrate the task's
+ * pages to the new memory.
+ *
+ * Call with cgroup_mutex held.  May take callback_mutex during call.
+ * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
+ * lock each such task's mm->mmap_sem, scan its vmas and rebind
+ * their mempolicies to the cpuset's new mems_allowed.
+ */
+static int update_nodemask(struct cpuset *cs, const char *buf)
+{
+        struct cpuset trialcs;
+        nodemask_t oldmem;
+        int retval;
+        /*
+         * top_cpuset.mems_allowed tracks node_states[N_HIGH_MEMORY];
+         * it's read-only
+         */
+        if (cs == &top_cpuset)
+                return -EACCES;
+        trialcs = *cs;          /* scratch copy; cs is only modified after validation */
+        /*
+         * An empty mems_allowed is ok iff there are no tasks in the cpuset.
+         * Since nodelist_parse() fails on an empty mask, we special case
+         * that parsing.  The validate_change() call ensures that cpusets
+         * with tasks have memory.
+         */
+        if (!*buf) {
+                nodes_clear(trialcs.mems_allowed);
+        } else {
+                retval = nodelist_parse(buf, trialcs.mems_allowed);
+                if (retval < 0)
+                        goto done;
+                if (!nodes_subset(trialcs.mems_allowed,
+                                node_states[N_HIGH_MEMORY]))
+                        return -EINVAL;
+        }
+        oldmem = cs->mems_allowed;      /* snapshot handed to update_tasks_nodemask() below */
+        if (nodes_equal(oldmem, trialcs.mems_allowed)) {
+                retval = 0;             /* Too easy - nothing to do */
+                goto done;
+        }
+        retval = validate_change(cs, &trialcs);
+        if (retval < 0)
+                goto done;
+        mutex_lock(&callback_mutex);
+        cs->mems_allowed = trialcs.mems_allowed;
+        cs->mems_generation = cpuset_mems_generation++;
+        mutex_unlock(&callback_mutex);
+        retval = update_tasks_nodemask(cs, &oldmem);   /* runs after callback_mutex is dropped */
+done:
+        return retval;
+}
 int current_cpuset_is_being_rebound(void)
 {
        return task_cs(current) == cpuset_being_rebound;
@@ -1042,7 +1061,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
        if (val != cs->relax_domain_level) {
                cs->relax_domain_level = val;
-                rebuild_sched_domains();
+                if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
+                        rebuild_sched_domains();
        }
        return 0;
@@ -1194,6 +1214,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
        if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
                return -ENOSPC;
+        if (tsk->flags & PF_THREAD_BOUND) {
+                cpumask_t mask;
+                mutex_lock(&callback_mutex);
+                mask = cs->cpus_allowed;
+                mutex_unlock(&callback_mutex);
+                if (!cpus_equal(tsk->cpus_allowed, mask))
+                        return -EINVAL;
+        }
        return security_task_setscheduler(tsk, 0, NULL);
 }
@@ -1207,11 +1236,14 @@ static void cpuset_attach(struct cgroup_subsys *ss,
        struct mm_struct *mm;
        struct cpuset *cs = cgroup_cs(cont);
        struct cpuset *oldcs = cgroup_cs(oldcont);
+        int err;
        mutex_lock(&callback_mutex);
        guarantee_online_cpus(cs, &cpus);
-        set_cpus_allowed_ptr(tsk, &cpus);
+        err = set_cpus_allowed_ptr(tsk, &cpus);
        mutex_unlock(&callback_mutex);
+        if (err)
+                return;
        from = oldcs->mems_allowed;
        to = cs->mems_allowed;
@@ -1242,72 +1274,14 @@ typedef enum {
        FILE_SPREAD_SLAB,
 } cpuset_filetype_t;
-static ssize_t cpuset_common_file_write(struct cgroup *cont,
-                                        struct cftype *cft,
-                                        struct file *file,
-                                        const char __user *userbuf,
-                                        size_t nbytes, loff_t *unused_ppos)
-{
-        struct cpuset *cs = cgroup_cs(cont);
-        cpuset_filetype_t type = cft->private;
-        char *buffer;
-        int retval = 0;
-        /* Crude upper limit on largest legitimate cpulist user might write. */
-        if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES))
-                return -E2BIG;
-        /* +1 for nul-terminator */
-        buffer = kmalloc(nbytes + 1, GFP_KERNEL);
-        if (!buffer)
-                return -ENOMEM;
-        if (copy_from_user(buffer, userbuf, nbytes)) {
-                retval = -EFAULT;
-                goto out1;
-        }
-        buffer[nbytes] = 0;     /* nul-terminate */
-        cgroup_lock();
-        if (cgroup_is_removed(cont)) {
-                retval = -ENODEV;
-                goto out2;
-        }
-        switch (type) {
-        case FILE_CPULIST:
-                retval = update_cpumask(cs, buffer);
-                break;
-        case FILE_MEMLIST:
-                retval = update_nodemask(cs, buffer);
-                break;
-        default:
-                retval = -EINVAL;
-                goto out2;
-        }
-        if (retval == 0)
-                retval = nbytes;
-out2:
-        cgroup_unlock();
-out1:
-        kfree(buffer);
-        return retval;
-}
 static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
        int retval = 0;
        struct cpuset *cs = cgroup_cs(cgrp);
        cpuset_filetype_t type = cft->private;
-        cgroup_lock();
+        if (!cgroup_lock_live_group(cgrp))
-        if (cgroup_is_removed(cgrp)) {
-                cgroup_unlock();
                return -ENODEV;
-        }
        switch (type) {
        case FILE_CPU_EXCLUSIVE:
@@ -1353,12 +1327,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
        struct cpuset *cs = cgroup_cs(cgrp);
        cpuset_filetype_t type = cft->private;
-        cgroup_lock();
+        if (!cgroup_lock_live_group(cgrp))
-        if (cgroup_is_removed(cgrp)) {
-                cgroup_unlock();
                return -ENODEV;
-        }
        switch (type) {
        case FILE_SCHED_RELAX_DOMAIN_LEVEL:
                retval = update_relax_domain_level(cs, val);
@@ -1372,6 +1343,32 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
 }
 /*
+ * Common handling for a write to a "cpus" or "mems" file.
+ */
+static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
+                                const char *buf)
+{
+        int retval = 0;
+        if (!cgroup_lock_live_group(cgrp))      /* presumably takes the cgroup lock and fails for a removed cgroup - confirm */
+                return -ENODEV;
+        switch (cft->private) {         /* FILE_CPULIST or FILE_MEMLIST, as set in files[] */
+        case FILE_CPULIST:
+                retval = update_cpumask(cgroup_cs(cgrp), buf);
+                break;
+        case FILE_MEMLIST:
+                retval = update_nodemask(cgroup_cs(cgrp), buf);
+                break;
+        default:
+                retval = -EINVAL;
+                break;
+        }
+        cgroup_unlock();        /* every path past the successful lock above unlocks here */
+        return retval;          /* 0 or -errno from the update helper; never a byte count */
+}
+/*
 * These ascii lists should be read in a single call, by using a user
 * buffer large enough to hold the entire map.  If read in smaller
 * chunks, there is no guarantee of atomicity.  Since the display format
@@ -1490,14 +1487,16 @@ static struct cftype files[] = {
        {
                .name = "cpus",
                .read = cpuset_common_file_read,
-                .write = cpuset_common_file_write,
+                .write_string = cpuset_write_resmask,
+                .max_write_len = (100U + 6 * NR_CPUS),
                .private = FILE_CPULIST,
        },
        {
                .name = "mems",
                .read = cpuset_common_file_read,
-                .write = cpuset_common_file_write,
+                .write_string = cpuset_write_resmask,
+                .max_write_len = (100U + 6 * MAX_NUMNODES),
                .private = FILE_MEMLIST,
        },
@@ -1778,7 +1777,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
        scan.scan.heap = NULL;
        scan.to = to->css.cgroup;
-        if (cgroup_scan_tasks((struct cgroup_scanner *)&scan))
+        if (cgroup_scan_tasks(&scan.scan))
                printk(KERN_ERR "move_member_tasks_to_cpuset: "
                                "cgroup_scan_tasks failed\n");
 }
@@ -1838,6 +1837,7 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
        struct cpuset *child;   /* scans child cpusets of cp */
        struct list_head queue;
        struct cgroup *cont;
+        nodemask_t oldmems;
        INIT_LIST_HEAD(&queue);
@@ -1857,6 +1857,8 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
                    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
                        continue;
+                oldmems = cp->mems_allowed;
                /* Remove offline cpus and mems from this cpuset. */
                mutex_lock(&callback_mutex);
                cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
@@ -1868,6 +1870,10 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
                if (cpus_empty(cp->cpus_allowed) ||
                     nodes_empty(cp->mems_allowed))
                        remove_tasks_in_empty_cpuset(cp);
+                else {
+                        update_tasks_cpumask(cp);
+                        update_tasks_nodemask(cp, &oldmems);
+                }
        }
 }
@@ -1960,7 +1966,6 @@ void __init cpuset_init_smp(void)
 }
 /**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 10e43fd8b721..b3179dad71be 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -145,8 +145,11 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
        d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
        tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
        d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
+        tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
+        d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
        d->blkio_count += tsk->delays->blkio_count;
        d->swapin_count += tsk->delays->swapin_count;
+        d->freepages_count += tsk->delays->freepages_count;
        spin_unlock_irqrestore(&tsk->delays->lock, flags);
 done:
@@ -165,3 +168,16 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
        return ret;
 }
+void __delayacct_freepages_start(void)
+{
+        delayacct_start(&current->delays->freepages_start);    /* acts on the calling task only; presumably stamps the start time - confirm in delayacct_start() */
+}
+void __delayacct_freepages_end(void)
+{
+        delayacct_end(&current->delays->freepages_start,
+                        &current->delays->freepages_end,
+                        &current->delays->freepages_delay,
+                        &current->delays->freepages_count);     /* presumably adds (end - start) to freepages_delay and bumps freepages_count - confirm in delayacct_end() */
+}
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index a9e6bad9f706..0d407e886735 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -65,7 +65,7 @@ lookup_exec_domain(u_long personality)
                                goto out;
        }
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
        read_unlock(&exec_domains_lock);
        request_module("personality-%ld", pers);
        read_lock(&exec_domains_lock);
@@ -168,7 +168,6 @@ __set_personality(u_long personality)
        current->personality = personality;
        oep = current_thread_info()->exec_domain;
        current_thread_info()->exec_domain = ep;
-        set_fs_altroot();
        module_put(oep->module);
        return 0;
diff --git a/kernel/exit.c b/kernel/exit.c
index 8f6185e69b69..eb4d6470d1d0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -13,6 +13,7 @@
 #include <linux/personality.h>
 #include <linux/tty.h>
 #include <linux/mnt_namespace.h>
+#include <linux/iocontext.h>
 #include <linux/key.h>
 #include <linux/security.h>
 #include <linux/cpu.h>
@@ -45,6 +46,7 @@
 #include <linux/resource.h>
 #include <linux/blkdev.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/tracehook.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -70,7 +72,7 @@ static void __unhash_process(struct task_struct *p)
                __get_cpu_var(process_counts)--;
        }
        list_del_rcu(&p->thread_group);
-        remove_parent(p);
+        list_del_init(&p->sibling);
 }
 /*
@@ -84,7 +86,6 @@ static void __exit_signal(struct task_struct *tsk)
        BUG_ON(!sig);
        BUG_ON(!atomic_read(&sig->count));
-        rcu_read_lock();
        sighand = rcu_dereference(tsk->sighand);
        spin_lock(&sighand->siglock);
@@ -120,6 +121,7 @@ static void __exit_signal(struct task_struct *tsk)
                sig->nivcsw += tsk->nivcsw;
                sig->inblock += task_io_get_inblock(tsk);
                sig->oublock += task_io_get_oublock(tsk);
+                task_io_accounting_add(&sig->ioac, &tsk->ioac);
                sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
                sig = NULL; /* Marker for below. */
        }
@@ -135,7 +137,6 @@ static void __exit_signal(struct task_struct *tsk)
        tsk->signal = NULL;
        tsk->sighand = NULL;
        spin_unlock(&sighand->siglock);
-        rcu_read_unlock();
        __cleanup_sighand(sighand);
        clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
@@ -151,16 +152,17 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
        put_task_struct(container_of(rhp, struct task_struct, rcu));
 }
 void release_task(struct task_struct * p)
 {
        struct task_struct *leader;
        int zap_leader;
 repeat:
+        tracehook_prepare_release_task(p);
        atomic_dec(&p->user->processes);
        proc_flush_task(p);
        write_lock_irq(&tasklist_lock);
-        ptrace_unlink(p);
+        tracehook_finish_release_task(p);
-        BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
        __exit_signal(p);
        /*
@@ -182,6 +184,13 @@ repeat:
                 * that case.
                 */
                zap_leader = task_detached(leader);
+                /*
+                 * This maintains the invariant that release_task()
+                 * only runs on a task in EXIT_DEAD, just for sanity.
+                 */
+                if (zap_leader)
+                        leader->exit_state = EXIT_DEAD;
        }
        write_unlock_irq(&tasklist_lock);
@@ -314,9 +323,8 @@ static void reparent_to_kthreadd(void)
        ptrace_unlink(current);
        /* Reparent to init */
-        remove_parent(current);
        current->real_parent = current->parent = kthreadd_task;
-        add_parent(current);
+        list_move_tail(&current->sibling, &current->real_parent->children);
        /* Set the exit signal to SIGCHLD so we signal init on exit */
        current->exit_signal = SIGCHLD;
@@ -421,7 +429,7 @@ void daemonize(const char *name, ...)
         * We don't want to have TIF_FREEZE set if the system-wide hibernation
         * or suspend transition begins right now.
         */
-        current->flags |= PF_NOFREEZE;
+        current->flags |= (PF_NOFREEZE | PF_KTHREAD);
        if (current->nsproxy != &init_nsproxy) {
                get_nsproxy(&init_nsproxy);
@@ -546,8 +554,6 @@ void put_fs_struct(struct fs_struct *fs)
        if (atomic_dec_and_test(&fs->count)) {
                path_put(&fs->root);
                path_put(&fs->pwd);
-                if (fs->altroot.dentry)
-                        path_put(&fs->altroot);
                kmem_cache_free(fs_cachep, fs);
        }
 }
@@ -655,26 +661,40 @@ assign_new_owner:
 static void exit_mm(struct task_struct * tsk)
 {
        struct mm_struct *mm = tsk->mm;
+        struct core_state *core_state;
        mm_release(tsk, mm);
        if (!mm)
                return;
        /*
         * Serialize with any possible pending coredump.
-         * We must hold mmap_sem around checking core_waiters
+         * We must hold mmap_sem around checking core_state
         * and clearing tsk->mm.  The core-inducing thread
-         * will increment core_waiters for each thread in the
+         * will increment ->nr_threads for each thread in the
         * group with ->mm != NULL.
         */
        down_read(&mm->mmap_sem);
-        if (mm->core_waiters) {
+        core_state = mm->core_state;
+        if (core_state) {
+                struct core_thread self;
                up_read(&mm->mmap_sem);
-                down_write(&mm->mmap_sem);
-                if (!--mm->core_waiters)
-                        complete(mm->core_startup_done);
-                up_write(&mm->mmap_sem);
-                wait_for_completion(&mm->core_done);
+                self.task = tsk;
+                self.next = xchg(&core_state->dumper.next, &self);
+                /*
+                 * Implies mb(), the result of xchg() must be visible
+                 * to core_state->dumper.
+                 */
+                if (atomic_dec_and_test(&core_state->nr_threads))
+                        complete(&core_state->startup);
+                for (;;) {
+                        set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+                        if (!self.task) /* see coredump_finish() */
+                                break;
+                        schedule();
+                }
+                __set_task_state(tsk, TASK_RUNNING);
                down_read(&mm->mmap_sem);
        }
        atomic_inc(&mm->mm_count);
@@ -691,37 +711,97 @@ static void exit_mm(struct task_struct * tsk)
        mmput(mm);
 }
-static void
+/*
-reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
+ * Return nonzero if @parent's children should reap themselves.
+ *
+ * Called with write_lock_irq(&tasklist_lock) held.
+ */
+static int ignoring_children(struct task_struct *parent)
 {
-        if (p->pdeath_signal)
+        int ret;
-                /* We already hold the tasklist_lock here.  */
+        struct sighand_struct *psig = parent->sighand;
-                group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+        unsigned long flags;
+        spin_lock_irqsave(&psig->siglock, flags);
+        ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
+               (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT));
+        spin_unlock_irqrestore(&psig->siglock, flags);
+        return ret;
+}
-        /* Move the child from its dying parent to the new one.  */
+/*
-        if (unlikely(traced)) {
+ * Detach all tasks we were using ptrace on.
-                /* Preserve ptrace links if someone else is tracing this child.  */
+ * Any that need to be release_task'd are put on the @dead list.
-                list_del_init(&p->ptrace_list);
+ *
-                if (ptrace_reparented(p))
+ * Called with write_lock(&tasklist_lock) held.
-                        list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
+ */
-        } else {
+static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
-                /* If this child is being traced, then we're the one tracing it
+{
-                 * anyway, so let go of it.
+        struct task_struct *p, *n;
+        int ign = -1;
+        list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
+                __ptrace_unlink(p);
+                if (p->exit_state != EXIT_ZOMBIE)
+                        continue;
+                /*
+                 * If it's a zombie, our attachedness prevented normal
+                 * parent notification or self-reaping.  Do notification
+                 * now if it would have happened earlier.  If it should
+                 * reap itself, add it to the @dead list.  We can't call
+                 * release_task() here because we already hold tasklist_lock.
+                 *
+                 * If it's our own child, there is no notification to do.
+                 * But if our normal children self-reap, then this child
+                 * was prevented by ptrace and we must reap it now.
                 */
-                p->ptrace = 0;
+                if (!task_detached(p) && thread_group_empty(p)) {
-                remove_parent(p);
+                        if (!same_thread_group(p->real_parent, parent))
-                p->parent = p->real_parent;
+                                do_notify_parent(p, p->exit_signal);
-                add_parent(p);
+                        else {
+                                if (ign < 0)
+                                        ign = ignoring_children(parent);
+                                if (ign)
+                                        p->exit_signal = -1;
+                        }
+                }
-                if (task_is_traced(p)) {
+                if (task_detached(p)) {
                        /*
-                         * If it was at a trace stop, turn it into
+                         * Mark it as in the process of being reaped.
-                         * a normal stop since it's no longer being
-                         * traced.
                         */
-                        ptrace_untrace(p);
+                        p->exit_state = EXIT_DEAD;
+                        list_add(&p->ptrace_entry, dead);
                }
        }
+}
+/*
+ * Finish up exit-time ptrace cleanup: call release_task() on every
+ * task that was queued on the @dead list, unlinking each from the
+ * list first.  @parent->ptraced must already be empty by now.
+ *
+ * Called without locks: release_task() takes tasklist_lock itself,
+ * so it cannot be called while that lock is held.
+ */
+static void ptrace_exit_finish(struct task_struct *parent,
+                               struct list_head *dead)
+{
+        struct task_struct *p, *n;
+        BUG_ON(!list_empty(&parent->ptraced));
+        list_for_each_entry_safe(p, n, dead, ptrace_entry) {   /* _safe: p is unlinked inside the loop */
+                list_del_init(&p->ptrace_entry);
+                release_task(p);
+        }
+}
+static void reparent_thread(struct task_struct *p, struct task_struct *father)
+{
+        if (p->pdeath_signal)
+                /* We already hold the tasklist_lock here.  */
+                group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+        list_move_tail(&p->sibling, &p->real_parent->children);
        /* If this is a threaded reparent there is no need to
         * notify anyone anything has happened.
@@ -736,7 +816,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
        /* If we'd notified the old parent about this child's death,
         * also notify the new parent.
         */
-        if (!traced && p->exit_state == EXIT_ZOMBIE &&
+        if (!ptrace_reparented(p) &&
+            p->exit_state == EXIT_ZOMBIE &&
            !task_detached(p) && thread_group_empty(p))
                do_notify_parent(p, p->exit_signal);
@@ -753,12 +834,15 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 static void forget_original_parent(struct task_struct *father)
 {
        struct task_struct *p, *n, *reaper = father;
-        struct list_head ptrace_dead;
+        LIST_HEAD(ptrace_dead);
-        INIT_LIST_HEAD(&ptrace_dead);
        write_lock_irq(&tasklist_lock);
+        /*
+         * First clean up ptrace if we were using it.
+         */
+        ptrace_exit(father, &ptrace_dead);
        do {
                reaper = next_thread(reaper);
                if (reaper == father) {
@@ -767,58 +851,19 @@ static void forget_original_parent(struct task_struct *father)
                }
        } while (reaper->flags & PF_EXITING);
-        /*
-         * There are only two places where our children can be:
-         *
-         * - in our child list
-         * - in our ptraced child list
-         *
-         * Search them and reparent children.
-         */
        list_for_each_entry_safe(p, n, &father->children, sibling) {
-                int ptrace;
-                ptrace = p->ptrace;
-                /* if father isn't the real parent, then ptrace must be enabled */
-                BUG_ON(father != p->real_parent && !ptrace);
-                if (father == p->real_parent) {
-                        /* reparent with a reaper, real father it's us */
-                        p->real_parent = reaper;
-                        reparent_thread(p, father, 0);
-                } else {
-                        /* reparent ptraced task to its real parent */
-                        __ptrace_unlink (p);
-                        if (p->exit_state == EXIT_ZOMBIE && !task_detached(p) &&
-                            thread_group_empty(p))
-                                do_notify_parent(p, p->exit_signal);
-                }
-                /*
-                 * if the ptraced child is a detached zombie we must collect
-                 * it before we exit, or it will remain zombie forever since
-                 * we prevented it from self-reap itself while it was being
-                 * traced by us, to be able to see it in wait4.
-                 */
-                if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && task_detached(p)))
-                        list_add(&p->ptrace_list, &ptrace_dead);
-        }
-        list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) {
                p->real_parent = reaper;
-                reparent_thread(p, father, 1);
+                if (p->parent == father) {
+                        BUG_ON(p->ptrace);
+                        p->parent = p->real_parent;
+                }
+                reparent_thread(p, father);
        }
        write_unlock_irq(&tasklist_lock);
        BUG_ON(!list_empty(&father->children));
-        BUG_ON(!list_empty(&father->ptrace_children));
-        list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) {
-                list_del_init(&p->ptrace_list);
-                release_task(p);
-        }
+        ptrace_exit_finish(father, &ptrace_dead);
 }
 /*
@@ -827,7 +872,8 @@ static void forget_original_parent(struct task_struct *father)
 */
 static void exit_notify(struct task_struct *tsk, int group_dead)
 {
-        int state;
+        int signal;
+        void *cookie;
        /*
         * This does two things:
@@ -864,22 +910,11 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
            !capable(CAP_KILL))
                tsk->exit_signal = SIGCHLD;
-        /* If something other than our normal parent is ptracing us, then
+        signal = tracehook_notify_death(tsk, &cookie, group_dead);
-         * send it a SIGCHLD instead of honoring exit_signal.  exit_signal
+        if (signal > 0)
-         * only has special meaning to our real parent.
+                signal = do_notify_parent(tsk, signal);
-         */
-        if (!task_detached(tsk) && thread_group_empty(tsk)) {
-                int signal = ptrace_reparented(tsk) ?
-                                SIGCHLD : tsk->exit_signal;
-                do_notify_parent(tsk, signal);
-        } else if (tsk->ptrace) {
-                do_notify_parent(tsk, SIGCHLD);
-        }
-        state = EXIT_ZOMBIE;
+        tsk->exit_state = signal < 0 ? EXIT_DEAD : EXIT_ZOMBIE;
-        if (task_detached(tsk) && likely(!tsk->ptrace))
-                state = EXIT_DEAD;
-        tsk->exit_state = state;
        /* mt-exec, de_thread() is waiting for us */
        if (thread_group_leader(tsk) &&
@@ -889,8 +924,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
        write_unlock_irq(&tasklist_lock);
+        tracehook_report_death(tsk, signal, cookie, group_dead);
        /* If the process is dead, release it - nobody will wait for it */
-        if (state == EXIT_DEAD)
+        if (signal < 0)
                release_task(tsk);
 }
@@ -969,10 +1006,7 @@ NORET_TYPE void do_exit(long code)
        if (unlikely(!tsk->pid))
                panic("Attempted to kill the idle task!");
-        if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
+        tracehook_report_exit(&code);
-                current->ptrace_message = code;
-                ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
-        }
        /*
         * We're taking recursive faults here in do_exit. Safest is to just
@@ -1179,13 +1213,6 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
                        return 0;
        }
-        /*
-         * Do not consider detached threads that are
-         * not ptraced:
-         */
-        if (task_detached(p) && !p->ptrace)
-                return 0;
        /* Wait for all children (clone and not) if __WALL is set;
         * otherwise, wait for clone children *only* if __WCLONE is
         * set; otherwise, wait for non-clone children *only*.  (Note:
@@ -1196,14 +1223,10 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
                return 0;
        err = security_task_wait(p);
-        if (likely(!err))
+        if (err)
-                return 1;
+                return err;
-        if (type != PIDTYPE_PID)
+        return 1;
-                return 0;
-        /* This child was explicitly requested, abort */
-        read_unlock(&tasklist_lock);
-        return err;
 }
 static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
@@ -1237,7 +1260,7 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 */
-static int wait_task_zombie(struct task_struct *p, int noreap,
+static int wait_task_zombie(struct task_struct *p, int options,
                            struct siginfo __user *infop,
                            int __user *stat_addr, struct rusage __user *ru)
 {
@@ -1245,7 +1268,10 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
        int retval, status, traced;
        pid_t pid = task_pid_vnr(p);
-        if (unlikely(noreap)) {
+        if (!likely(options & WEXITED))
+                return 0;
+        if (unlikely(options & WNOWAIT)) {
                uid_t uid = p->uid;
                int exit_code = p->exit_code;
                int why, status;
@@ -1326,6 +1352,8 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
                psig->coublock +=
                        task_io_get_oublock(p) +
                        sig->oublock + sig->coublock;
+                task_io_accounting_add(&psig->ioac, &p->ioac);
+                task_io_accounting_add(&psig->ioac, &sig->ioac);
                spin_unlock_irq(&p->parent->sighand->siglock);
        }
@@ -1395,21 +1423,24 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 */
-static int wait_task_stopped(struct task_struct *p,
+static int wait_task_stopped(int ptrace, struct task_struct *p,
-                             int noreap, struct siginfo __user *infop,
+                             int options, struct siginfo __user *infop,
                             int __user *stat_addr, struct rusage __user *ru)
 {
        int retval, exit_code, why;
        uid_t uid = 0; /* unneeded, required by compiler */
        pid_t pid;
+        if (!(options & WUNTRACED))
+                return 0;
        exit_code = 0;
        spin_lock_irq(&p->sighand->siglock);
        if (unlikely(!task_is_stopped_or_traced(p)))
                goto unlock_sig;
-        if (!(p->ptrace & PT_PTRACED) && p->signal->group_stop_count > 0)
+        if (!ptrace && p->signal->group_stop_count > 0)
                /*
                 * A group stop is in progress and this is the group leader.
                 * We won't report until all threads have stopped.
@@ -1420,7 +1451,7 @@ static int wait_task_stopped(struct task_struct *p,
        if (!exit_code)
                goto unlock_sig;
-        if (!noreap)
+        if (!unlikely(options & WNOWAIT))
                p->exit_code = 0;
        uid = p->uid;
@@ -1438,10 +1469,10 @@ unlock_sig:
         */
        get_task_struct(p);
        pid = task_pid_vnr(p);
-        why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
+        why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
        read_unlock(&tasklist_lock);
-        if (unlikely(noreap))
+        if (unlikely(options & WNOWAIT))
                return wait_noreap_copyout(p, pid, uid,
                                           why, exit_code,
                                           infop, ru);
@@ -1475,7 +1506,7 @@ unlock_sig:
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 */
-static int wait_task_continued(struct task_struct *p, int noreap,
+static int wait_task_continued(struct task_struct *p, int options,
                               struct siginfo __user *infop,
                               int __user *stat_addr, struct rusage __user *ru)
 {
@@ -1483,6 +1514,9 @@ static int wait_task_continued(struct task_struct *p, int noreap,
        pid_t pid;
        uid_t uid;
+        if (!unlikely(options & WCONTINUED))
+                return 0;
        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
                return 0;
@@ -1492,7 +1526,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
                spin_unlock_irq(&p->sighand->siglock);
                return 0;
        }
-        if (!noreap)
+        if (!unlikely(options & WNOWAIT))
                p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
        spin_unlock_irq(&p->sighand->siglock);
@@ -1518,89 +1552,161 @@ static int wait_task_continued(struct task_struct *p, int noreap,
        return retval;
 }
+/*
+ * Consider @p for a wait by @parent.
+ *
+ * @ptrace is nonzero when @p was taken from a ptrace list rather than
+ * from the ordinary children list; it selects which stop reports apply.
+ * @type, @pid and @options are the wait criteria; @infop, @stat_addr and
+ * @ru are the user buffers handed on to the wait_task_*() helpers.
+ *
+ * -ECHILD should be in *@notask_error before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue;
+ * then *@notask_error is 0 if @p is an eligible child,
+ * or another error from security_task_wait(), or still -ECHILD.
+ */
+static int wait_consider_task(struct task_struct *parent, int ptrace,
+                              struct task_struct *p, int *notask_error,
+                              enum pid_type type, struct pid *pid, int options,
+                              struct siginfo __user *infop,
+                              int __user *stat_addr, struct rusage __user *ru)
+{
+        int ret = eligible_child(type, pid, options, p);
+        if (!ret)
+                return ret;
+        if (unlikely(ret < 0)) {
+                /*
+                 * If we have not yet seen any eligible child,
+                 * then let this error code replace -ECHILD.
+                 * A permission error will give the user a clue
+                 * to look for security policy problems, rather
+                 * than for mysterious wait bugs.
+                 */
+                if (*notask_error)
+                        *notask_error = ret;
+                /*
+                 * The security check refused this child, so it must be
+                 * neither reaped nor reported, and it must not reset
+                 * *@notask_error to 0 further down.  Skip it and let
+                 * the caller keep searching.
+                 */
+                return 0;
+        }
+        if (likely(!ptrace) && unlikely(p->ptrace)) {
+                /*
+                 * This child is hidden by ptrace.
+                 * We aren't allowed to see it now, but eventually we will.
+                 */
+                *notask_error = 0;
+                return 0;
+        }
+        if (p->exit_state == EXIT_DEAD)
+                return 0;
+        /*
+         * We don't reap group leaders with subthreads.
+         */
+        if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
+                return wait_task_zombie(p, options, infop, stat_addr, ru);
+        /*
+         * It's stopped or running now, so it might
+         * later continue, exit, or stop again.
+         */
+        *notask_error = 0;
+        if (task_is_stopped_or_traced(p))
+                return wait_task_stopped(ptrace, p, options,
+                                         infop, stat_addr, ru);
+        return wait_task_continued(p, options, infop, stat_addr, ru);
+}
+/*
+ * Do the work of do_wait() for one thread in the group, @tsk.
+ *
+ * Walks @tsk->children and offers each entry that is not detached to
+ * wait_consider_task() with its ptrace argument set to 0.  The wait
+ * criteria (@type, @pid, @options) and the user buffers (@infop,
+ * @stat_addr, @ru) are passed through unchanged.  The walk stops at the
+ * first nonzero result, which is returned as is.
+ *
+ * -ECHILD should be in *@notask_error before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue; then
+ * *@notask_error is 0 if there were any eligible children,
+ * or another error from security_task_wait(), or still -ECHILD.
+ */
+static int do_wait_thread(struct task_struct *tsk, int *notask_error,
+                          enum pid_type type, struct pid *pid, int options,
+                          struct siginfo __user *infop, int __user *stat_addr,
+                          struct rusage __user *ru)
+{
+        struct task_struct *p;
+        list_for_each_entry(p, &tsk->children, sibling) {
+                /*
+                 * Do not consider detached threads.
+                 * They are skipped before wait_consider_task() is
+                 * called, so they never touch *@notask_error.
+                 */
+                if (!task_detached(p)) {
+                        int ret = wait_consider_task(tsk, 0, p, notask_error,
+                                                     type, pid, options,
+                                                     infop, stat_addr, ru);
+                        if (ret)
+                                return ret;
+                }
+        }
+        return 0;
+}
+static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
+                          enum pid_type type, struct pid *pid, int options,
+                          struct siginfo __user *infop, int __user *stat_addr,
+                          struct rusage __user *ru)
+{
+        struct task_struct *p;
+        /*
+         * Traditionally we see ptrace'd stopped tasks regardless of options.
+         * WUNTRACED is forced into this function's own copy of @options so
+         * that wait_task_stopped() does not return early for these entries.
+         *
+         * The loop below offers every entry of @tsk->ptraced to
+         * wait_consider_task() with its ptrace argument set to 1 and
+         * returns the first nonzero result.  Zero is returned when no
+         * entry produced a final result, so the caller keeps searching.
+         */
+        options |= WUNTRACED;
+        list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
+                int ret = wait_consider_task(tsk, 1, p, notask_error,
+                                             type, pid, options,
+                                             infop, stat_addr, ru);
+                if (ret)
+                        return ret;
+        }
+        return 0;
+}
 static long do_wait(enum pid_type type, struct pid *pid, int options,
                    struct siginfo __user *infop, int __user *stat_addr,
                    struct rusage __user *ru)
 {
        DECLARE_WAITQUEUE(wait, current);
        struct task_struct *tsk;
-        int flag, retval;
+        int retval;
        add_wait_queue(&current->signal->wait_chldexit,&wait);
 repeat:
-        /* If there is nothing that can match our critier just get out */
+        /*
+         * If there is nothing that can match our criteria just get out.
+         * We will clear @retval to zero if we see any child that might later
+         * match our criteria, even if we are not able to reap it yet.
+         */
        retval = -ECHILD;
        if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type])))
                goto end;
-        /*
-         * We will set this flag if we see any child that might later
-         * match our criteria, even if we are not able to reap it yet.
-         */
-        flag = retval = 0;
        current->state = TASK_INTERRUPTIBLE;
        read_lock(&tasklist_lock);
        tsk = current;
        do {
-                struct task_struct *p;
+                int tsk_result = do_wait_thread(tsk, &retval,
+                                                type, pid, options,
-                list_for_each_entry(p, &tsk->children, sibling) {
+                                                infop, stat_addr, ru);
-                        int ret = eligible_child(type, pid, options, p);
+                if (!tsk_result)
-                        if (!ret)
+                        tsk_result = ptrace_do_wait(tsk, &retval,
-                                continue;
+                                                    type, pid, options,
+                                                    infop, stat_addr, ru);
-                        if (unlikely(ret < 0)) {
+                if (tsk_result) {
-                                retval = ret;
+                        /*
-                        } else if (task_is_stopped_or_traced(p)) {
+                         * tasklist_lock is unlocked and we have a final result.
-                                /*
+                         */
-                                 * It's stopped now, so it might later
+                        retval = tsk_result;
-                                 * continue, exit, or stop again.
+                        goto end;
-                                 */
-                                flag = 1;
-                                if (!(p->ptrace & PT_PTRACED) &&
-                                    !(options & WUNTRACED))
-                                        continue;
-                                retval = wait_task_stopped(p,
-                                                (options & WNOWAIT), infop,
-                                                stat_addr, ru);
-                        } else if (p->exit_state == EXIT_ZOMBIE &&
-                                        !delay_group_leader(p)) {
-                                /*
-                                 * We don't reap group leaders with subthreads.
-                                 */
-                                if (!likely(options & WEXITED))
-                                        continue;
-                                retval = wait_task_zombie(p,
-                                                (options & WNOWAIT), infop,
-                                                stat_addr, ru);
-                        } else if (p->exit_state != EXIT_DEAD) {
-                                /*
-                                 * It's running now, so it might later
-                                 * exit, stop, or stop and then continue.
-                                 */
-                                flag = 1;
-                                if (!unlikely(options & WCONTINUED))
-                                        continue;
-                                retval = wait_task_continued(p,
-                                                (options & WNOWAIT), infop,
-                                                stat_addr, ru);
-                        }
-                        if (retval != 0) /* tasklist_lock released */
-                                goto end;
-                }
-                if (!flag) {
-                        list_for_each_entry(p, &tsk->ptrace_children,
-                                                                ptrace_list) {
-                                flag = eligible_child(type, pid, options, p);
-                                if (!flag)
-                                        continue;
-                                if (likely(flag > 0))
-                                        break;
-                                retval = flag;
-                                goto end;
-                        }
                }
                if (options & __WNOTHREAD)
                        break;
                tsk = next_thread(tsk);
@@ -1608,16 +1714,14 @@ repeat:
        } while (tsk != current);
        read_unlock(&tasklist_lock);
-        if (flag) {
+        if (!retval && !(options & WNOHANG)) {
-                if (options & WNOHANG)
-                        goto end;
                retval = -ERESTARTSYS;
-                if (signal_pending(current))
+                if (!signal_pending(current)) {
-                        goto end;
+                        schedule();
-                schedule();
+                        goto repeat;
-                goto repeat;
+                }
        }
-        retval = -ECHILD;
 end:
        current->state = TASK_RUNNING;
        remove_wait_queue(&current->signal->wait_chldexit,&wait);
diff --git a/kernel/fork.c b/kernel/fork.c
index 19908b26cf80..8214ba7c8bb1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -23,6 +23,7 @@
 #include <linux/sem.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
+#include <linux/iocontext.h>
 #include <linux/key.h>
 #include <linux/binfmts.h>
 #include <linux/mman.h>
@@ -32,9 +33,11 @@
 #include <linux/cpu.h>
 #include <linux/cgroup.h>
 #include <linux/security.h>
+#include <linux/hugetlb.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
+#include <linux/tracehook.h>
 #include <linux/futex.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/rcupdate.h>
@@ -91,6 +94,23 @@ int nr_processes(void)
 static struct kmem_cache *task_struct_cachep;
 #endif
+#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR /* generic versions, compiled only when this macro is not defined */
+static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
+{
+#ifdef CONFIG_DEBUG_STACK_USAGE
+        gfp_t mask = GFP_KERNEL | __GFP_ZERO; /* stack-usage debugging additionally requests __GFP_ZERO */
+#else
+        gfp_t mask = GFP_KERNEL;
+#endif
+        return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); /* @tsk is not used by this generic version */
+}
+static inline void free_thread_info(struct thread_info *ti)
+{
+        free_pages((unsigned long)ti, THREAD_SIZE_ORDER); /* same order as passed to __get_free_pages() in alloc_thread_info() */
+}
+#endif /* !__HAVE_ARCH_THREAD_INFO_ALLOCATOR */
 /* SLAB cache for signal_struct structures (tsk->signal) */
 static struct kmem_cache *signal_cachep;
@@ -306,6 +326,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                }
                /*
+                 * Clear hugetlb-related page reserves for children. This only
+                 * affects MAP_PRIVATE mappings. Faults generated by the child
+                 * are not guaranteed to succeed, even if read-only
+                 */
+                if (is_vm_hugetlb_page(tmp))
+                        reset_vma_resv_huge_pages(tmp);
+                /*
                 * Link in the new vma and copy the page table entries.
                 */
                *pprev = tmp;
@@ -373,7 +401,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
        INIT_LIST_HEAD(&mm->mmlist);
        mm->flags = (current->mm) ? current->mm->flags
                                  : MMF_DUMP_FILTER_DEFAULT;
-        mm->core_waiters = 0;
+        mm->core_state = NULL;
        mm->nr_ptes = 0;
        set_mm_counter(mm, file_rss, 0);
        set_mm_counter(mm, anon_rss, 0);
@@ -447,7 +475,7 @@ EXPORT_SYMBOL_GPL(mmput);
 /**
 * get_task_mm - acquire a reference to the task's mm
 *
- * Returns %NULL if the task has no mm.  Checks PF_BORROWED_MM (meaning
+ * Returns %NULL if the task has no mm.  Checks PF_KTHREAD (meaning
 * this kernel workthread has transiently adopted a user mm with use_mm,
 * to do its AIO) is not set and if so returns a reference to it, after
 * bumping up the use count.  User must release the mm via mmput()
@@ -460,7 +488,7 @@ struct mm_struct *get_task_mm(struct task_struct *task)
        task_lock(task);
        mm = task->mm;
        if (mm) {
-                if (task->flags & PF_BORROWED_MM)
+                if (task->flags & PF_KTHREAD)
                        mm = NULL;
                else
                        atomic_inc(&mm->mm_users);
@@ -629,13 +657,6 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
                path_get(&old->root);
                fs->pwd = old->pwd;
                path_get(&old->pwd);
-                if (old->altroot.dentry) {
-                        fs->altroot = old->altroot;
-                        path_get(&old->altroot);
-                } else {
-                        fs->altroot.mnt = NULL;
-                        fs->altroot.dentry = NULL;
-                }
                read_unlock(&old->lock);
        }
        return fs;
@@ -785,6 +806,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
        sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
        sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
        sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
+        task_io_accounting_init(&sig->ioac);
        sig->sum_sched_runtime = 0;
        INIT_LIST_HEAD(&sig->cpu_timers[0]);
        INIT_LIST_HEAD(&sig->cpu_timers[1]);
@@ -832,8 +854,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
        new_flags &= ~PF_SUPERPRIV;
        new_flags |= PF_FORKNOEXEC;
-        if (!(clone_flags & CLONE_PTRACE))
+        new_flags |= PF_STARTING;
-                p->ptrace = 0;
        p->flags = new_flags;
        clear_freeze_flag(p);
 }
@@ -874,7 +895,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                                        struct pt_regs *regs,
                                        unsigned long stack_size,
                                        int __user *child_tidptr,
-                                        struct pid *pid)
+                                        struct pid *pid,
+                                        int trace)
 {
        int retval;
        struct task_struct *p;
@@ -909,7 +931,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        rt_mutex_init_task(p);
-#ifdef CONFIG_TRACE_IRQFLAGS
+#ifdef CONFIG_PROVE_LOCKING
        DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
        DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
@@ -967,13 +989,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        p->last_switch_timestamp = 0;
 #endif
-#ifdef CONFIG_TASK_XACCT
+        task_io_accounting_init(&p->ioac);
-        p->rchar = 0;           /* I/O counter: bytes read */
-        p->wchar = 0;           /* I/O counter: bytes written */
-        p->syscr = 0;           /* I/O counter: read syscalls */
-        p->syscw = 0;           /* I/O counter: write syscalls */
-#endif
-        task_io_accounting_init(p);
        acct_clear_integrals(p);
        p->it_virt_expires = cputime_zero;
@@ -1080,6 +1096,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        if (clone_flags & CLONE_THREAD)
                p->tgid = current->tgid;
+        if (current->nsproxy != p->nsproxy) {
+                retval = ns_cgroup_clone(p, pid);
+                if (retval)
+                        goto bad_fork_free_pid;
+        }
        p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
        /*
         * Clear TID on mm_release()?
@@ -1124,8 +1146,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         */
        p->group_leader = p;
        INIT_LIST_HEAD(&p->thread_group);
-        INIT_LIST_HEAD(&p->ptrace_children);
-        INIT_LIST_HEAD(&p->ptrace_list);
        /* Now that the task is set up, run cgroup callbacks if
         * necessary. We need to run them before the task is visible
@@ -1156,7 +1176,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                p->real_parent = current->real_parent;
        else
                p->real_parent = current;
-        p->parent = p->real_parent;
        spin_lock(&current->sighand->siglock);
@@ -1197,9 +1216,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        }
        if (likely(p->pid)) {
-                add_parent(p);
+                list_add_tail(&p->sibling, &p->real_parent->children);
-                if (unlikely(p->ptrace & PT_PTRACED))
+                tracehook_finish_clone(p, clone_flags, trace);
-                        __ptrace_link(p, current->parent);
                if (thread_group_leader(p)) {
                        if (clone_flags & CLONE_NEWPID)
@@ -1284,29 +1302,13 @@ struct task_struct * __cpuinit fork_idle(int cpu)
        struct pt_regs regs;
        task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
-                                &init_struct_pid);
+                            &init_struct_pid, 0);
        if (!IS_ERR(task))
                init_idle(task, cpu);
        return task;
 }
-static int fork_traceflag(unsigned clone_flags)
-{
-        if (clone_flags & CLONE_UNTRACED)
-                return 0;
-        else if (clone_flags & CLONE_VFORK) {
-                if (current->ptrace & PT_TRACE_VFORK)
-                        return PTRACE_EVENT_VFORK;
-        } else if ((clone_flags & CSIGNAL) != SIGCHLD) {
-                if (current->ptrace & PT_TRACE_CLONE)
-                        return PTRACE_EVENT_CLONE;
-        } else if (current->ptrace & PT_TRACE_FORK)
-                return PTRACE_EVENT_FORK;
-        return 0;
-}
 /*
 *  Ok, this is the main fork-routine.
 *
@@ -1341,14 +1343,14 @@ long do_fork(unsigned long clone_flags,
                }
        }
-        if (unlikely(current->ptrace)) {
+        /*
-                trace = fork_traceflag (clone_flags);
+         * When called from kernel_thread, don't do user tracing stuff.
-                if (trace)
+         */
-                        clone_flags |= CLONE_PTRACE;
+        if (likely(user_mode(regs)))
-        }
+                trace = tracehook_prepare_clone(clone_flags);
        p = copy_process(clone_flags, stack_start, regs, stack_size,
-                        child_tidptr, NULL);
+                         child_tidptr, NULL, trace);
        /*
         * Do this prior waking up the new thread - the thread pointer
         * might get invalid after that point, if the thread exits quickly.
@@ -1366,32 +1368,35 @@ long do_fork(unsigned long clone_flags,
                        init_completion(&vfork);
                }
-                if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
+                tracehook_report_clone(trace, regs, clone_flags, nr, p);
+                /*
+                 * We set PF_STARTING at creation in case tracing wants to
+                 * use this to distinguish a fully live task from one that
+                 * hasn't gotten to tracehook_report_clone() yet.  Now we
+                 * clear it and set the child going.
+                 */
+                p->flags &= ~PF_STARTING;
+                if (unlikely(clone_flags & CLONE_STOPPED)) {
                        /*
                         * We'll start up with an immediate SIGSTOP.
                         */
                        sigaddset(&p->pending.signal, SIGSTOP);
                        set_tsk_thread_flag(p, TIF_SIGPENDING);
-                }
-                if (!(clone_flags & CLONE_STOPPED))
-                        wake_up_new_task(p, clone_flags);
-                else
                        __set_task_state(p, TASK_STOPPED);
+                } else {
-                if (unlikely (trace)) {
+                        wake_up_new_task(p, clone_flags);
-                        current->ptrace_message = nr;
-                        ptrace_notify ((trace << 8) | SIGTRAP);
                }
+                tracehook_report_clone_complete(trace, regs,
+                                                clone_flags, nr, p);
                if (clone_flags & CLONE_VFORK) {
                        freezer_do_not_count();
                        wait_for_completion(&vfork);
                        freezer_count();
-                        if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
+                        tracehook_report_vfork_done(p, nr);
-                                current->ptrace_message = nr;
-                                ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
-                        }
                }
        } else {
                nr = PTR_ERR(p);
@@ -1403,7 +1408,7 @@ long do_fork(unsigned long clone_flags,
 #define ARCH_MIN_MMSTRUCT_ALIGN 0
 #endif
-static void sighand_ctor(struct kmem_cache *cachep, void *data)
+static void sighand_ctor(void *data)
 {
        struct sighand_struct *sighand = data;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ab80515008f4..b8e4dce80a74 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -300,11 +300,10 @@ EXPORT_SYMBOL_GPL(ktime_sub_ns);
 */
 u64 ktime_divns(const ktime_t kt, s64 div)
 {
-        u64 dclc, inc, dns;
+        u64 dclc;
        int sft = 0;
-        dclc = dns = ktime_to_ns(kt);
+        dclc = ktime_to_ns(kt);
-        inc = div;
        /* Make sure the divisor is less than 2^32: */
        while (div >> 32) {
                sft++;
@@ -623,7 +622,7 @@ static void retrigger_next_event(void *arg)
 void clock_was_set(void)
 {
        /* Retrigger the CPU local events everywhere */
-        on_each_cpu(retrigger_next_event, NULL, 0, 1);
+        on_each_cpu(retrigger_next_event, NULL, 1);
 }
 /*
@@ -632,8 +631,6 @@ void clock_was_set(void)
 */
 void hres_timers_resume(void)
 {
-        WARN_ON_ONCE(num_online_cpus() > 1);
        /* Retrigger the CPU local events: */
        retrigger_next_event(NULL);
 }
@@ -1086,7 +1083,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
 }
 EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
-#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ)
+#ifdef CONFIG_NO_HZ
 /**
 * hrtimer_get_next_event - get the time until next expiry event
 *
@@ -1677,7 +1674,7 @@ void __init hrtimers_init(void)
                          (void *)(long)smp_processor_id());
        register_cpu_notifier(&hrtimers_nb);
 #ifdef CONFIG_HIGH_RES_TIMERS
-        open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL);
+        open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
 #endif
 }
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 964964baefa2..3cd441ebf5d2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -28,8 +28,7 @@ void dynamic_irq_init(unsigned int irq)
        unsigned long flags;
        if (irq >= NR_IRQS) {
-                printk(KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
+                WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
-                WARN_ON(1);
                return;
        }
@@ -62,8 +61,7 @@ void dynamic_irq_cleanup(unsigned int irq)
        unsigned long flags;
        if (irq >= NR_IRQS) {
-                printk(KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
+                WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
-                WARN_ON(1);
                return;
        }
@@ -71,9 +69,8 @@ void dynamic_irq_cleanup(unsigned int irq)
        spin_lock_irqsave(&desc->lock, flags);
        if (desc->action) {
                spin_unlock_irqrestore(&desc->lock, flags);
-                printk(KERN_ERR "Destroying IRQ%d without calling free_irq\n",
+                WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
                        irq);
-                WARN_ON(1);
                return;
        }
        desc->msi_desc = NULL;
@@ -96,8 +93,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
        unsigned long flags;
        if (irq >= NR_IRQS) {
-                printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq);
+                WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
-                WARN_ON(1);
                return -EINVAL;
        }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 46d6611a33bb..152abfd3589f 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -17,6 +17,8 @@
 #ifdef CONFIG_SMP
+cpumask_t irq_default_affinity = CPU_MASK_ALL;
 /**
 *      synchronize_irq - wait for pending IRQ handlers (on other CPUs)
 *      @irq: interrupt number to wait for
@@ -95,6 +97,27 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
        return 0;
 }
+#ifndef CONFIG_AUTO_IRQ_AFFINITY
+/*
+ * Generic version of the affinity autoselector.
+ *
+ * Does nothing when irq_can_set_affinity() rejects @irq.  Otherwise the
+ * mask is built by cpus_and() from cpu_online_map and
+ * irq_default_affinity, stored in irq_desc[irq].affinity, handed to the
+ * chip's set_affinity() hook and then to set_balance_irq_affinity().
+ * Returns 0 on every path.
+ *
+ * NOTE(review): chip->set_affinity is called without a NULL check here;
+ * presumably irq_can_set_affinity() already guarantees the hook exists -
+ * confirm against its definition.
+ * NOTE(review): the computed mask is not tested for being empty - confirm
+ * that an empty intersection is acceptable to the chip hook.
+ */
+int irq_select_affinity(unsigned int irq)
+{
+        cpumask_t mask;
+        if (!irq_can_set_affinity(irq))
+                return 0;
+        cpus_and(mask, cpu_online_map, irq_default_affinity);
+        irq_desc[irq].affinity = mask;
+        irq_desc[irq].chip->set_affinity(irq, mask);
+        set_balance_irq_affinity(irq, mask);
+        return 0;
+}
+#endif
 #endif
 /**
@@ -154,8 +177,7 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq)
 {
        switch (desc->depth) {
        case 0:
-                printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
+                WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
-                WARN_ON(1);
                break;
        case 1: {
                unsigned int status = desc->status & ~IRQ_DISABLED;
@@ -194,6 +216,17 @@ void enable_irq(unsigned int irq)
 }
 EXPORT_SYMBOL(enable_irq);
+/*
+ * Forward a wakeup enable/disable request for @irq to its irq chip.
+ * Returns the chip's set_wake() result, or -ENXIO when the chip has no
+ * set_wake() callback.
+ */
+int set_irq_wake_real(unsigned int irq, unsigned int on)
+{
+        struct irq_desc *desc = irq_desc + irq;
+        if (!desc->chip->set_wake)
+                return -ENXIO;
+        return desc->chip->set_wake(irq, on);
+}
 /**
 *      set_irq_wake - control irq power management wakeup
 *      @irq:   interrupt to control
@@ -210,30 +243,32 @@ int set_irq_wake(unsigned int irq, unsigned int on)
 {
        struct irq_desc *desc = irq_desc + irq;
        unsigned long flags;
-        int ret = -ENXIO;
+        int ret = 0;
-        int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake;
        /* wakeup-capable irqs can be shared between drivers that
         * don't need to have the same sleep mode behaviors.
         */
        spin_lock_irqsave(&desc->lock, flags);
        if (on) {
-                if (desc->wake_depth++ == 0)
+                if (desc->wake_depth++ == 0) {
-                        desc->status |= IRQ_WAKEUP;
+                        ret = set_irq_wake_real(irq, on);
-                else
+                        if (ret)
-                        set_wake = NULL;
+                                desc->wake_depth = 0;
+                        else
+                                desc->status |= IRQ_WAKEUP;
+                }
        } else {
                if (desc->wake_depth == 0) {
-                        printk(KERN_WARNING "Unbalanced IRQ %d "
+                        WARN(1, "Unbalanced IRQ %d wake disable\n", irq);
-                                        "wake disable\n", irq);
+                } else if (--desc->wake_depth == 0) {
-                        WARN_ON(1);
+                        ret = set_irq_wake_real(irq, on);
-                } else if (--desc->wake_depth == 0)
+                        if (ret)
-                        desc->status &= ~IRQ_WAKEUP;
+                                desc->wake_depth = 1;
-                else
+                        else
-                        set_wake = NULL;
+                                desc->status &= ~IRQ_WAKEUP;
+                }
        }
-        if (set_wake)
-                ret = desc->chip->set_wake(irq, on);
        spin_unlock_irqrestore(&desc->lock, flags);
        return ret;
 }
@@ -270,6 +305,30 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc)
                desc->handle_irq = NULL;
 }
+/*
+ * Program the flow type requested by the IRQF_TRIGGER_* bits of @flags
+ * into @chip.  A missing chip or set_type() callback is only warned about
+ * and reported as success (0); otherwise the callback's return value is
+ * passed back, with an error logged when it is non-zero.
+ */
+static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq,
+                unsigned long flags)
+{
+        const char *chip_name = "unknown";
+        int err;
+        if (chip && chip->set_type) {
+                err = chip->set_type(irq, flags & IRQF_TRIGGER_MASK);
+                if (err)
+                        pr_err("setting flow type for irq %u failed (%pF)\n",
+                                        irq, chip->set_type);
+                return err;
+        }
+        /*
+         * IRQF_TRIGGER_* but the PIC does not support multiple
+         * flow-types?
+         */
+        if (chip && chip->name)
+                chip_name = chip->name;
+        pr_warning("No set_type function for IRQ %d (%s)\n", irq, chip_name);
+        return 0;
+}
 /*
 * Internal function to register an irqaction - typically used to
 * allocate special interrupts that are part of the architecture.
@@ -281,6 +340,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
        const char *old_name = NULL;
        unsigned long flags;
        int shared = 0;
+        int ret;
        if (irq >= NR_IRQS)
                return -EINVAL;
@@ -338,36 +398,23 @@ int setup_irq(unsigned int irq, struct irqaction *new)
                shared = 1;
        }
-        *p = new;
-        /* Exclude IRQ from balancing */
-        if (new->flags & IRQF_NOBALANCING)
-                desc->status |= IRQ_NO_BALANCING;
        if (!shared) {
                irq_chip_set_defaults(desc->chip);
-#if defined(CONFIG_IRQ_PER_CPU)
-                if (new->flags & IRQF_PERCPU)
-                        desc->status |= IRQ_PER_CPU;
-#endif
                /* Setup the type (level, edge polarity) if configured: */
                if (new->flags & IRQF_TRIGGER_MASK) {
-                        if (desc->chip && desc->chip->set_type)
+                        ret = __irq_set_trigger(desc->chip, irq, new->flags);
-                                desc->chip->set_type(irq,
-                                                new->flags & IRQF_TRIGGER_MASK);
+                        if (ret) {
-                        else
+                                spin_unlock_irqrestore(&desc->lock, flags);
-                                /*
+                                return ret;
-                                 * IRQF_TRIGGER_* but the PIC does not support
+                        }
-                                 * multiple flow-types?
-                                 */
-                                printk(KERN_WARNING "No IRQF_TRIGGER set_type "
-                                       "function for IRQ %d (%s)\n", irq,
-                                       desc->chip ? desc->chip->name :
-                                       "unknown");
                } else
                        compat_irq_chip_set_default_handler(desc);
+#if defined(CONFIG_IRQ_PER_CPU)
+                if (new->flags & IRQF_PERCPU)
+                        desc->status |= IRQ_PER_CPU;
+#endif
                desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
                                  IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
@@ -382,7 +429,17 @@ int setup_irq(unsigned int irq, struct irqaction *new)
                } else
                        /* Undo nested disables: */
                        desc->depth = 1;
+                /* Set default affinity mask once everything is setup */
+                irq_select_affinity(irq);
        }
+        *p = new;
+        /* Exclude IRQ from balancing */
+        if (new->flags & IRQF_NOBALANCING)
+                desc->status |= IRQ_NO_BALANCING;
        /* Reset broken irq detection when installing new handler */
        desc->irq_count = 0;
        desc->irqs_unhandled = 0;
@@ -571,8 +628,6 @@ int request_irq(unsigned int irq, irq_handler_t handler,
        action->next = NULL;
        action->dev_id = dev_id;
-        select_smp_affinity(irq);
 #ifdef CONFIG_DEBUG_SHIRQ
        if (irqflags & IRQF_SHARED) {
                /*
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index c2f2ccb0549a..6c6d35d68ee9 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -44,7 +44,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
                                   unsigned long count, void *data)
 {
        unsigned int irq = (int)(long)data, full_count = count, err;
-        cpumask_t new_value, tmp;
+        cpumask_t new_value;
        if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
            irq_balancing_disabled(irq))
@@ -62,17 +62,51 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
         * way to make the system unusable accidentally :-) At least
         * one online CPU still has to be targeted.
         */
-        cpus_and(tmp, new_value, cpu_online_map);
+        if (!cpus_intersects(new_value, cpu_online_map))
-        if (cpus_empty(tmp))
                /* Special case for empty set - allow the architecture
                   code to set default SMP affinity. */
-                return select_smp_affinity(irq) ? -EINVAL : full_count;
+                return irq_select_affinity(irq) ? -EINVAL : full_count;
        irq_set_affinity(irq, new_value);
        return full_count;
 }
+/*
+ * read_proc handler: format irq_default_affinity into @page, followed by
+ * a newline.  Returns the number of bytes written, or -EINVAL when fewer
+ * than two bytes of @count are left after the mask.
+ */
+static int default_affinity_read(char *page, char **start, off_t off,
+                                  int count, int *eof, void *data)
+{
+        int used = cpumask_scnprintf(page, count, irq_default_affinity);
+        if (count - used >= 2)
+                return used + sprintf(page + used, "\n");
+        return -EINVAL;
+}
+/*
+ * write_proc handler: parse a cpumask from user space and install it as
+ * irq_default_affinity.  Returns @count on success, the parser's error if
+ * the input cannot be parsed, or -EINVAL if the mask is rejected.
+ */
+static int default_affinity_write(struct file *file, const char __user *buffer,
+                                   unsigned long count, void *data)
+{
+        unsigned int written = count, rc;
+        cpumask_t requested;
+        rc = cpumask_parse_user(buffer, count, requested);
+        if (rc)
+                return rc;
+        /*
+         * Do not allow disabling IRQs completely - it is too easy a
+         * way to make the system unusable accidentally :-) At least
+         * one online CPU still has to be targeted.
+         */
+        if (!is_affinity_mask_valid(requested) ||
+            !cpus_intersects(requested, cpu_online_map))
+                return -EINVAL;
+        irq_default_affinity = requested;
+        return written;
+}
 #endif
 static int irq_spurious_read(char *page, char **start, off_t off,
@@ -171,6 +205,21 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action)
                remove_proc_entry(action->dir->name, irq_desc[irq].dir);
 }
+/*
+ * Create /proc/irq/default_smp_affinity (mode 0600) and hook up its read
+ * and write handlers.  Does nothing without CONFIG_SMP, or when the proc
+ * entry cannot be created.
+ */
+void register_default_affinity_proc(void)
+{
+#ifdef CONFIG_SMP
+        struct proc_dir_entry *pde;
+        pde = create_proc_entry("default_smp_affinity", 0600, root_irq_dir);
+        if (!pde)
+                return;
+        pde->data = NULL;
+        pde->read_proc  = default_affinity_read;
+        pde->write_proc = default_affinity_write;
+#endif
+}
 void init_irq_proc(void)
 {
        int i;
@@ -180,6 +229,8 @@ void init_irq_proc(void)
        if (!root_irq_dir)
                return;
+        register_default_affinity_proc();
        /*
         * Create entries for all existing IRQs.
         */
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6fc0040f3e3a..38fc10ac7541 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -176,7 +176,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
        high = kallsyms_num_syms;
        while (high - low > 1) {
-                mid = (low + high) / 2;
+                mid = low + (high - low) / 2;
                if (kallsyms_addresses[mid] <= addr)
                        low = mid;
                else
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 1c5fcacbcf33..c8a4370e2a34 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -24,6 +24,12 @@
 #include <linux/utsrelease.h>
 #include <linux/utsname.h>
 #include <linux/numa.h>
+#include <linux/suspend.h>
+#include <linux/device.h>
+#include <linux/freezer.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+#include <linux/console.h>
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -242,6 +248,12 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
                goto out;
        }
+        image->swap_page = kimage_alloc_control_pages(image, 0);
+        if (!image->swap_page) {
+                printk(KERN_ERR "Could not allocate swap buffer\n");
+                goto out;
+        }
        result = 0;
 out:
        if (result == 0)
@@ -589,14 +601,12 @@ static void kimage_free_extra_pages(struct kimage *image)
        kimage_free_page_list(&image->unuseable_pages);
 }
-static int kimage_terminate(struct kimage *image)
+static void kimage_terminate(struct kimage *image)
 {
        if (*image->entry != 0)
                image->entry++;
        *image->entry = IND_DONE;
-        return 0;
 }
 #define for_each_kimage_entry(image, ptr, entry) \
@@ -988,6 +998,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
                if (result)
                        goto out;
+                if (flags & KEXEC_PRESERVE_CONTEXT)
+                        image->preserve_context = 1;
                result = machine_kexec_prepare(image);
                if (result)
                        goto out;
@@ -997,9 +1009,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
                        if (result)
                                goto out;
                }
-                result = kimage_terminate(image);
+                kimage_terminate(image);
-                if (result)
-                        goto out;
        }
        /* Install the new kernel, and  Uninstall the old */
        image = xchg(dest_image, image);
@@ -1415,3 +1425,85 @@ static int __init crash_save_vmcoreinfo_init(void)
 }
 module_init(crash_save_vmcoreinfo_init)
+/**
+ *      kernel_kexec - reboot the system
+ *
+ *      Move into place and start executing a preloaded standalone
+ *      executable.  If nothing was preloaded return an error.
+ *
+ *      Returns -EBUSY if kexec_lock is already held or processes cannot
+ *      be frozen, -EINVAL if no image is loaded, otherwise the error of
+ *      the first failing suspend step (0 if none failed).
+ *
+ *      With preserve_context set (and CONFIG_KEXEC_JUMP) the system is
+ *      quiesced step by step before machine_kexec() and resumed in the
+ *      reverse order afterwards; the labels below are the unwind points
+ *      for a failure at the matching step.
+ */
+int kernel_kexec(void)
+{
+        int error = 0;
+        /* kexec_lock is a flag taken with xchg(); non-zero means busy. */
+        if (xchg(&kexec_lock, 1))
+                return -EBUSY;
+        if (!kexec_image) {
+                error = -EINVAL;
+                goto Unlock;
+        }
+        if (kexec_image->preserve_context) {
+#ifdef CONFIG_KEXEC_JUMP
+                mutex_lock(&pm_mutex);
+                pm_prepare_console();
+                error = freeze_processes();
+                if (error) {
+                        /* Any freeze failure is reported as -EBUSY. */
+                        error = -EBUSY;
+                        goto Restore_console;
+                }
+                suspend_console();
+                error = device_suspend(PMSG_FREEZE);
+                if (error)
+                        goto Resume_console;
+                error = disable_nonboot_cpus();
+                if (error)
+                        goto Resume_devices;
+                local_irq_disable();
+                /* At this point, device_suspend() has been called,
+                 * but *not* device_power_down(). We *must*
+                 * device_power_down() now.  Otherwise, drivers for
+                 * some devices (e.g. interrupt controllers) become
+                 * desynchronized with the actual state of the
+                 * hardware at resume time, and evil weirdness ensues.
+                 */
+                error = device_power_down(PMSG_FREEZE);
+                if (error)
+                        goto Enable_irqs;
+                save_processor_state();
+#endif
+        } else {
+                /* Plain kexec: run the normal restart shutdown sequence. */
+                blocking_notifier_call_chain(&reboot_notifier_list,
+                                             SYS_RESTART, NULL);
+                system_state = SYSTEM_RESTART;
+                device_shutdown();
+                sysdev_shutdown();
+                printk(KERN_EMERG "Starting new kernel\n");
+                machine_shutdown();
+        }
+        /*
+         * NOTE(review): presumably only returns here in the
+         * preserve_context case - confirm against the arch code.
+         */
+        machine_kexec(kexec_image);
+        if (kexec_image->preserve_context) {
+#ifdef CONFIG_KEXEC_JUMP
+                /* Undo the suspend steps above, in reverse order. */
+                restore_processor_state();
+                device_power_up(PMSG_RESTORE);
+ Enable_irqs:
+                local_irq_enable();
+                enable_nonboot_cpus();
+ Resume_devices:
+                device_resume(PMSG_RESTORE);
+ Resume_console:
+                resume_console();
+                thaw_processes();
+ Restore_console:
+                pm_restore_console();
+                mutex_unlock(&pm_mutex);
+#endif
+        }
+ Unlock:
+        xchg(&kexec_lock, 0);
+        return error;
+}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8df97d3dfda8..2456d1a0befb 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -42,7 +42,7 @@ extern int max_threads;
 static struct workqueue_struct *khelper_wq;
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 /*
        modprobe_path is set via /proc/sys.
@@ -352,16 +352,17 @@ static inline void register_pm_notifier_callback(void) {}
 * @path: path to usermode executable
 * @argv: arg vector for process
 * @envp: environment for process
+ * @gfp_mask: gfp mask for memory allocation
 *
 * Returns either %NULL on allocation failure, or a subprocess_info
 * structure.  This should be passed to call_usermodehelper_exec to
 * exec the process and free the structure.
 */
-struct subprocess_info *call_usermodehelper_setup(char *path,
+struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
-                                                  char **argv, char **envp)
+                                                  char **envp, gfp_t gfp_mask)
 {
        struct subprocess_info *sub_info;
-        sub_info = kzalloc(sizeof(struct subprocess_info),  GFP_ATOMIC);
+        sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
        if (!sub_info)
                goto out;
@@ -417,12 +418,12 @@ int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
 {
        struct file *f;
-        f = create_write_pipe();
+        f = create_write_pipe(0);
        if (IS_ERR(f))
                return PTR_ERR(f);
        *filp = f;
-        f = create_read_pipe(f);
+        f = create_read_pipe(f, 0);
        if (IS_ERR(f)) {
                free_write_pipe(*filp);
                return PTR_ERR(f);
@@ -494,7 +495,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
        struct subprocess_info *sub_info;
        int ret;
-        sub_info = call_usermodehelper_setup(path, argv, envp);
+        sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
        if (sub_info == NULL)
                return -ENOMEM;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1485ca8d0e00..75bc2cd9ebc6 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -62,6 +62,7 @@
        addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
 #endif
+static int kprobes_initialized;
 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
@@ -69,8 +70,15 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 static bool kprobe_enabled;
 DEFINE_MUTEX(kprobe_mutex);             /* Protects kprobe_table */
-DEFINE_SPINLOCK(kretprobe_lock);        /* Protects kretprobe_inst_table */
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
+static struct {
+        spinlock_t lock ____cacheline_aligned;
+} kretprobe_table_locks[KPROBE_TABLE_SIZE];
+/* Return the address of the spinlock stored at kretprobe_table_locks[@hash]. */
+static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
+{
+        return &kretprobe_table_locks[hash].lock;
+}
 /*
 * Normally, functions that we'd want to prohibit kprobes in, are marked
@@ -368,26 +376,53 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
        return;
 }
-/* Called with kretprobe_lock held */
 void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
                                struct hlist_head *head)
 {
+        struct kretprobe *rp = ri->rp;
        /* remove rp inst off the rprobe_inst_table */
        hlist_del(&ri->hlist);
-        if (ri->rp) {
+        INIT_HLIST_NODE(&ri->hlist);
-                /* remove rp inst off the used list */
+        if (likely(rp)) {
-                hlist_del(&ri->uflist);
+                spin_lock(&rp->lock);
-                /* put rp inst back onto the free list */
+                hlist_add_head(&ri->hlist, &rp->free_instances);
-                INIT_HLIST_NODE(&ri->uflist);
+                spin_unlock(&rp->lock);
-                hlist_add_head(&ri->uflist, &ri->rp->free_instances);
        } else
                /* Unregistering */
                hlist_add_head(&ri->hlist, head);
 }
-struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
+void kretprobe_hash_lock(struct task_struct *tsk,
+                         struct hlist_head **head, unsigned long *flags)
 {
-        return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
+        unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
+        spinlock_t *hlist_lock;
+        *head = &kretprobe_inst_table[hash];
+        hlist_lock = kretprobe_table_lock_ptr(hash);
+        spin_lock_irqsave(hlist_lock, *flags);
+}
+/* Take the lock of bucket @hash, saving the interrupt state in *@flags. */
+void kretprobe_table_lock(unsigned long hash, unsigned long *flags)
+{
+        spin_lock_irqsave(kretprobe_table_lock_ptr(hash), *flags);
+}
+/*
+ * Release the bucket lock that @tsk hashes to and restore the interrupt
+ * state from *@flags.
+ */
+void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags)
+{
+        unsigned long bucket = hash_ptr(tsk, KPROBE_HASH_BITS);
+        spin_unlock_irqrestore(kretprobe_table_lock_ptr(bucket), *flags);
+}
+void kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
+{
+        spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
+        spin_unlock_irqrestore(hlist_lock, *flags);
 }
 /*
@@ -401,17 +436,21 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
        struct kretprobe_instance *ri;
        struct hlist_head *head, empty_rp;
        struct hlist_node *node, *tmp;
-        unsigned long flags = 0;
+        unsigned long hash, flags = 0;
-        INIT_HLIST_HEAD(&empty_rp);
+        if (unlikely(!kprobes_initialized))
-        spin_lock_irqsave(&kretprobe_lock, flags);
+                /* Early boot.  kretprobe_table_locks not yet initialized. */
-        head = kretprobe_inst_table_head(tk);
+                return;
+        /*
+         * empty_rp must be a valid empty list before recycle_rp_inst()
+         * adds to it.  Initializing it after the walk would link onto an
+         * uninitialized head and then drop the collected instances
+         * without freeing them.
+         */
+        INIT_HLIST_HEAD(&empty_rp);
+        hash = hash_ptr(tk, KPROBE_HASH_BITS);
+        head = &kretprobe_inst_table[hash];
+        kretprobe_table_lock(hash, &flags);
        hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
                if (ri->task == tk)
                        recycle_rp_inst(ri, &empty_rp);
        }
-        spin_unlock_irqrestore(&kretprobe_lock, flags);
+        kretprobe_table_unlock(hash, &flags);
        hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
                hlist_del(&ri->hlist);
                kfree(ri);
@@ -423,24 +462,29 @@ static inline void free_rp_inst(struct kretprobe *rp)
        struct kretprobe_instance *ri;
        struct hlist_node *pos, *next;
-        hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) {
+        hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) {
-                hlist_del(&ri->uflist);
+                hlist_del(&ri->hlist);
                kfree(ri);
        }
 }
 static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
 {
-        unsigned long flags;
+        unsigned long flags, hash;
        struct kretprobe_instance *ri;
        struct hlist_node *pos, *next;
+        struct hlist_head *head;
        /* No race here */
-        spin_lock_irqsave(&kretprobe_lock, flags);
+        for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
-        hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) {
+                kretprobe_table_lock(hash, &flags);
-                ri->rp = NULL;
+                head = &kretprobe_inst_table[hash];
-                hlist_del(&ri->uflist);
+                hlist_for_each_entry_safe(ri, pos, next, head, hlist) {
+                        if (ri->rp == rp)
+                                ri->rp = NULL;
+                }
+                kretprobe_table_unlock(hash, &flags);
        }
-        spin_unlock_irqrestore(&kretprobe_lock, flags);
        free_rp_inst(rp);
 }
@@ -831,32 +875,37 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
                                           struct pt_regs *regs)
 {
        struct kretprobe *rp = container_of(p, struct kretprobe, kp);
-        unsigned long flags = 0;
+        unsigned long hash, flags = 0;
+        struct kretprobe_instance *ri;
        /*TODO: consider to only swap the RA after the last pre_handler fired */
-        spin_lock_irqsave(&kretprobe_lock, flags);
+        hash = hash_ptr(current, KPROBE_HASH_BITS);
+        spin_lock_irqsave(&rp->lock, flags);
        if (!hlist_empty(&rp->free_instances)) {
-                struct kretprobe_instance *ri;
                ri = hlist_entry(rp->free_instances.first,
-                                 struct kretprobe_instance, uflist);
+                                struct kretprobe_instance, hlist);
+                hlist_del(&ri->hlist);
+                spin_unlock_irqrestore(&rp->lock, flags);
                ri->rp = rp;
                ri->task = current;
                if (rp->entry_handler && rp->entry_handler(ri, regs)) {
-                        spin_unlock_irqrestore(&kretprobe_lock, flags);
+                        /*
+                         * rp->lock was already released after taking ri
+                         * off the free list, so it must not be unlocked
+                         * again.  Retake it only to hand the unused
+                         * instance back; otherwise it would be lost.
+                         */
+                        spin_lock_irqsave(&rp->lock, flags);
+                        hlist_add_head(&ri->hlist, &rp->free_instances);
+                        spin_unlock_irqrestore(&rp->lock, flags);
                        return 0;
                }
                arch_prepare_kretprobe(ri, regs);
                /* XXX(hch): why is there no hlist_move_head? */
-                hlist_del(&ri->uflist);
+                INIT_HLIST_NODE(&ri->hlist);
-                hlist_add_head(&ri->uflist, &ri->rp->used_instances);
+                kretprobe_table_lock(hash, &flags);
-                hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task));
+                hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
-        } else
+                kretprobe_table_unlock(hash, &flags);
+        } else {
                rp->nmissed++;
-        spin_unlock_irqrestore(&kretprobe_lock, flags);
+                spin_unlock_irqrestore(&rp->lock, flags);
+        }
        return 0;
 }
@@ -892,7 +941,7 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
                rp->maxactive = NR_CPUS;
 #endif
        }
-        INIT_HLIST_HEAD(&rp->used_instances);
+        spin_lock_init(&rp->lock);
        INIT_HLIST_HEAD(&rp->free_instances);
        for (i = 0; i < rp->maxactive; i++) {
                inst = kmalloc(sizeof(struct kretprobe_instance) +
@@ -901,8 +950,8 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
                        free_rp_inst(rp);
                        return -ENOMEM;
                }
-                INIT_HLIST_NODE(&inst->uflist);
+                INIT_HLIST_NODE(&inst->hlist);
-                hlist_add_head(&inst->uflist, &rp->free_instances);
+                hlist_add_head(&inst->hlist, &rp->free_instances);
        }
        rp->nmissed = 0;
@@ -1009,6 +1058,7 @@ static int __init init_kprobes(void)
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                INIT_HLIST_HEAD(&kprobe_table[i]);
                INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
+                spin_lock_init(&(kretprobe_table_locks[i].lock));
        }
        /*
@@ -1050,6 +1100,7 @@ static int __init init_kprobes(void)
        err = arch_init_kprobes();
        if (!err)
                err = register_die_notifier(&kprobe_exceptions_nb);
+        kprobes_initialized = (err == 0);
        if (!err)
                init_test_probes();
@@ -1286,13 +1337,8 @@ EXPORT_SYMBOL_GPL(register_jprobe);
 EXPORT_SYMBOL_GPL(unregister_jprobe);
 EXPORT_SYMBOL_GPL(register_jprobes);
 EXPORT_SYMBOL_GPL(unregister_jprobes);
-#ifdef CONFIG_KPROBES
 EXPORT_SYMBOL_GPL(jprobe_return);
-#endif
-#ifdef CONFIG_KPROBES
 EXPORT_SYMBOL_GPL(register_kretprobe);
 EXPORT_SYMBOL_GPL(unregister_kretprobe);
 EXPORT_SYMBOL_GPL(register_kretprobes);
 EXPORT_SYMBOL_GPL(unregister_kretprobes);
-#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index bd1b9ea024e1..96cff2f8710b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -106,7 +106,7 @@ static void create_kthread(struct kthread_create_info *create)
                 */
                sched_setscheduler(create->result, SCHED_NORMAL, &param);
                set_user_nice(create->result, KTHREAD_NICE_LEVEL);
-                set_cpus_allowed(create->result, CPU_MASK_ALL);
+                set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR);
        }
        complete(&create->done);
 }
@@ -176,10 +176,11 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
                return;
        }
        /* Must have done schedule() in kthread() before we set_task_cpu */
-        wait_task_inactive(k);
+        wait_task_inactive(k, 0);
        set_task_cpu(k, cpu);
        k->cpus_allowed = cpumask_of_cpu(cpu);
        k->rt.nr_cpus_allowed = 1;
+        k->flags |= PF_THREAD_BOUND;
 }
 EXPORT_SYMBOL(kthread_bind);
@@ -232,9 +233,9 @@ int kthreadd(void *unused)
        set_task_comm(tsk, "kthreadd");
        ignore_signals(tsk);
        set_user_nice(tsk, KTHREAD_NICE_LEVEL);
-        set_cpus_allowed(tsk, CPU_MASK_ALL);
+        set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR);
-        current->flags |= PF_NOFREEZE;
+        current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 81a4e4a3f087..d38a64362973 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -39,6 +39,7 @@
 #include <linux/irqflags.h>
 #include <linux/utsname.h>
 #include <linux/hash.h>
+#include <linux/ftrace.h>
 #include <asm/sections.h>
@@ -81,6 +82,8 @@ static int graph_lock(void)
                __raw_spin_unlock(&lockdep_lock);
                return 0;
        }
+        /* prevent any recursions within lockdep from causing deadlocks */
+        current->lockdep_recursion++;
        return 1;
 }
@@ -89,6 +92,7 @@ static inline int graph_unlock(void)
        if (debug_locks && !__raw_spin_is_locked(&lockdep_lock))
                return DEBUG_LOCKS_WARN_ON(1);
+        current->lockdep_recursion--;
        __raw_spin_unlock(&lockdep_lock);
        return 0;
 }
@@ -982,7 +986,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
        return 1;
 }
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
 /*
 * Forwards and backwards subgraph searching, for the purposes of
 * proving that two subgraphs can be connected by a new dependency
@@ -1458,7 +1462,14 @@ out_bug:
 }
 unsigned long nr_lock_chains;
-static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
+struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
+int nr_chain_hlocks;
+static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
+/*
+ * Return the lock class at position @i of @chain, looked up through the
+ * class index stored in chain_hlocks[].
+ */
+struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
+{
+        u16 class_idx = chain_hlocks[chain->base + i];
+        return &lock_classes[class_idx];
+}
 /*
 * Look up a dependency chain. If the key is not present yet then
@@ -1466,10 +1477,15 @@ static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
 * validated. If the key is already hashed, return 0.
 * (On return with 1 graph_lock is held.)
 */
-static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class)
+static inline int lookup_chain_cache(struct task_struct *curr,
+                                     struct held_lock *hlock,
+                                     u64 chain_key)
 {
+        struct lock_class *class = hlock->class;
        struct list_head *hash_head = chainhashentry(chain_key);
        struct lock_chain *chain;
+        struct held_lock *hlock_curr, *hlock_next;
+        int i, j, n, cn;
        if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
                return 0;
@@ -1517,6 +1533,32 @@ cache_hit:
        }
        chain = lock_chains + nr_lock_chains++;
        chain->chain_key = chain_key;
+        chain->irq_context = hlock->irq_context;
+        /* Find the first held_lock of current chain */
+        hlock_next = hlock;
+        for (i = curr->lockdep_depth - 1; i >= 0; i--) {
+                hlock_curr = curr->held_locks + i;
+                if (hlock_curr->irq_context != hlock_next->irq_context)
+                        break;
+                hlock_next = hlock;
+        }
+        i++;
+        chain->depth = curr->lockdep_depth + 1 - i;
+        cn = nr_chain_hlocks;
+        while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) {
+                n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth);
+                if (n == cn)
+                        break;
+                cn = n;
+        }
+        if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
+                chain->base = cn;
+                for (j = 0; j < chain->depth - 1; j++, i++) {
+                        int lock_id = curr->held_locks[i].class - lock_classes;
+                        chain_hlocks[chain->base + j] = lock_id;
+                }
+                chain_hlocks[chain->base + j] = class - lock_classes;
+        }
        list_add_tail_rcu(&chain->entry, hash_head);
        debug_atomic_inc(&chain_lookup_misses);
        inc_chains();
@@ -1538,7 +1580,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
         * graph_lock for us)
         */
        if (!hlock->trylock && (hlock->check == 2) &&
-                        lookup_chain_cache(chain_key, hlock->class)) {
+            lookup_chain_cache(curr, hlock, chain_key)) {
                /*
                 * Check whether last held lock:
                 *
@@ -1680,7 +1722,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
 static int mark_lock(struct task_struct *curr, struct held_lock *this,
                     enum lock_usage_bit new_bit);
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
 /*
 * print irq inversion bug:
@@ -2013,11 +2055,13 @@ void early_boot_irqs_on(void)
 /*
 * Hardirqs will be enabled:
 */
-void trace_hardirqs_on(void)
+void trace_hardirqs_on_caller(unsigned long a0)
 {
        struct task_struct *curr = current;
        unsigned long ip;
+        time_hardirqs_on(CALLER_ADDR0, a0);
        if (unlikely(!debug_locks || current->lockdep_recursion))
                return;
@@ -2055,16 +2099,23 @@ void trace_hardirqs_on(void)
        curr->hardirq_enable_event = ++curr->irq_events;
        debug_atomic_inc(&hardirqs_on_events);
 }
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+void trace_hardirqs_on(void) /* argument-less entry point; forwards to trace_hardirqs_on_caller() */
+{
+        trace_hardirqs_on_caller(CALLER_ADDR0); /* CALLER_ADDR0 becomes the callee's a0 argument */
+}
 EXPORT_SYMBOL(trace_hardirqs_on);
 /*
 * Hardirqs were disabled:
 */
-void trace_hardirqs_off(void)
+void trace_hardirqs_off_caller(unsigned long a0)
 {
        struct task_struct *curr = current;
+        time_hardirqs_off(CALLER_ADDR0, a0);
        if (unlikely(!debug_locks || current->lockdep_recursion))
                return;
@@ -2082,7 +2133,12 @@ void trace_hardirqs_off(void)
        } else
                debug_atomic_inc(&redundant_hardirqs_off);
 }
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+void trace_hardirqs_off(void) /* argument-less entry point; forwards to trace_hardirqs_off_caller() */
+{
+        trace_hardirqs_off_caller(CALLER_ADDR0); /* CALLER_ADDR0 becomes the callee's a0 argument */
+}
 EXPORT_SYMBOL(trace_hardirqs_off);
 /*
@@ -2246,7 +2302,7 @@ static inline int separate_irq_context(struct task_struct *curr,
 * Mark a lock with a usage bit, and validate the state transition:
 */
 static int mark_lock(struct task_struct *curr, struct held_lock *this,
-                     enum lock_usage_bit new_bit)
+                             enum lock_usage_bit new_bit)
 {
        unsigned int new_mask = 1 << new_bit, ret = 1;
@@ -2650,7 +2706,8 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
 */
 static void check_flags(unsigned long flags)
 {
-#if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS)
+#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \
+    defined(CONFIG_TRACE_IRQFLAGS)
        if (!debug_locks)
                return;
@@ -2686,7 +2743,7 @@ static void check_flags(unsigned long flags)
 * and also avoid lockdep recursion:
 */
 void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
-                  int trylock, int read, int check, unsigned long ip)
+                          int trylock, int read, int check, unsigned long ip)
 {
        unsigned long flags;
@@ -2708,7 +2765,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 EXPORT_SYMBOL_GPL(lock_acquire);
-void lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+void lock_release(struct lockdep_map *lock, int nested,
+                          unsigned long ip)
 {
        unsigned long flags;
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 8ce09bc4613d..c3600a091a28 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -23,6 +23,8 @@
 #define MAX_LOCKDEP_CHAINS_BITS 14
 #define MAX_LOCKDEP_CHAINS      (1UL << MAX_LOCKDEP_CHAINS_BITS)
+#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
 /*
 * Stack-trace: tightly packed array of stack backtrace
 * addresses. Protected by the hash_lock.
@@ -30,15 +32,19 @@
 #define MAX_STACK_TRACE_ENTRIES 262144UL
 extern struct list_head all_lock_classes;
+extern struct lock_chain lock_chains[];
 extern void
 get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4);
 extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
+struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i);
 extern unsigned long nr_lock_classes;
 extern unsigned long nr_list_entries;
 extern unsigned long nr_lock_chains;
+extern int nr_chain_hlocks;
 extern unsigned long nr_stack_trace_entries;
 extern unsigned int nr_hardirq_chains;
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index dc5d29648d85..9b0e940e2545 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -139,7 +139,7 @@ static int l_show(struct seq_file *m, void *v)
        list_for_each_entry(entry, &class->locks_after, entry) {
                if (entry->distance == 1) {
-                        seq_printf(m, " -> [%p] ", entry->class);
+                        seq_printf(m, " -> [%p] ", entry->class->key);
                        print_name(m, entry->class);
                        seq_puts(m, "\n");
                }
@@ -178,6 +178,95 @@ static const struct file_operations proc_lockdep_operations = {
        .release        = seq_release,
 };
+#ifdef CONFIG_PROVE_LOCKING
+/*
+ * seq_file ->start callback for the lock chain listing.
+ *
+ * Position 0 is the SEQ_START_TOKEN header record; position p >= 1 maps to
+ * lock_chains[p - 1].  lc_next() uses this function so that both callbacks
+ * share one position-to-chain mapping; with two independent mappings a
+ * chain is skipped whenever iteration moves from one callback to the other.
+ *
+ * Returns the record for *pos, or NULL once *pos is past the last chain
+ * (including the case where nr_lock_chains is 0).
+ */
+static void *lc_start(struct seq_file *m, loff_t *pos)
+{
+        if (*pos == 0)
+                return SEQ_START_TOKEN;
+        if (*pos - 1 < nr_lock_chains)
+                return lock_chains + (*pos - 1);
+        return NULL;
+}
+/*
+ * seq_file ->next callback: advance *pos and return the record at the new
+ * position, or NULL at the end.  The current record @v is not needed
+ * because the position alone identifies the next chain.
+ */
+static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
+{
+        (*pos)++;
+        return lc_start(m, pos);
+}
+static void lc_stop(struct seq_file *m, void *v) /* seq_file ->stop callback */
+{
+} /* intentionally empty: this callback performs no work */
+static int lc_show(struct seq_file *m, void *v) /* seq_file ->show: print the header or one lock chain; always returns 0 */
+{
+        struct lock_chain *chain = v; /* only meaningful when v is not SEQ_START_TOKEN */
+        struct lock_class *class;
+        int i;
+        if (v == SEQ_START_TOKEN) { /* header record */
+                seq_printf(m, "all lock chains:\n");
+                return 0;
+        }
+        seq_printf(m, "irq_context: %d\n", chain->irq_context); /* per-chain heading */
+        for (i = 0; i < chain->depth; i++) { /* one output line per lock class in the chain */
+                class = lock_chain_get_class(chain, i); /* i-th class of this chain */
+                seq_printf(m, "[%p] ", class->key); /* key pointer, then the class name */
+                print_name(m, class);
+                seq_puts(m, "\n");
+        }
+        seq_puts(m, "\n"); /* blank line separates consecutive chains */
+        return 0;
+}
+static const struct seq_operations lockdep_chains_ops = { /* seq_file callbacks for the lock chain listing */
+        .start  = lc_start,
+        .next   = lc_next,
+        .stop   = lc_stop,
+        .show   = lc_show,
+};
+static int lockdep_chains_open(struct inode *inode, struct file *file) /* open handler: returns seq_open()'s result */
+{
+        int res = seq_open(file, &lockdep_chains_ops); /* attach the lock chain iterator */
+        if (!res) { /* only touch the seq_file when seq_open() returned 0 */
+                struct seq_file *m = file->private_data;
+                if (nr_lock_chains)
+                        m->private = lock_chains; /* base of the chain array */
+                else
+                        m->private = NULL; /* no chains recorded at open time */
+        }
+        return res;
+}
+static const struct file_operations proc_lockdep_chains_operations = { /* file ops for the "lockdep_chains" proc entry */
+        .open           = lockdep_chains_open, /* installs lockdep_chains_ops */
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = seq_release,
+};
+#endif /* CONFIG_PROVE_LOCKING */
 static void lockdep_stats_debug_show(struct seq_file *m)
 {
 #ifdef CONFIG_DEBUG_LOCKDEP
@@ -294,6 +383,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
 #ifdef CONFIG_PROVE_LOCKING
        seq_printf(m, " dependency chains:             %11lu [max: %lu]\n",
                        nr_lock_chains, MAX_LOCKDEP_CHAINS);
+        seq_printf(m, " dependency chain hlocks:       %11d [max: %lu]\n",
+                        nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS);
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS
@@ -661,6 +752,10 @@ static const struct file_operations proc_lock_stat_operations = {
 static int __init lockdep_proc_init(void)
 {
        proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations);
+#ifdef CONFIG_PROVE_LOCKING
+        proc_create("lockdep_chains", S_IRUSR, NULL,
+                    &proc_lockdep_chains_operations);
+#endif
        proc_create("lockdep_stats", S_IRUSR, NULL,
                    &proc_lockdep_stats_operations);
diff --git a/kernel/marker.c b/kernel/marker.c
index b5a9fe1d50d5..971da5317903 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -55,8 +55,8 @@ static DEFINE_MUTEX(markers_mutex);
 struct marker_entry {
        struct hlist_node hlist;
        char *format;
-        void (*call)(const struct marker *mdata,        /* Probe wrapper */
+                        /* Probe wrapper */
-                void *call_private, const char *fmt, ...);
+        void (*call)(const struct marker *mdata, void *call_private, ...);
        struct marker_probe_closure single;
        struct marker_probe_closure *multi;
        int refcount;   /* Number of times armed. 0 if disarmed. */
@@ -91,15 +91,13 @@ EXPORT_SYMBOL_GPL(__mark_empty_function);
 * marker_probe_cb Callback that prepares the variable argument list for probes.
 * @mdata: pointer of type struct marker
 * @call_private: caller site private data
- * @fmt: format string
 * @...:  Variable argument list.
 *
 * Since we do not use "typical" pointer based RCU in the 1 argument case, we
 * need to put a full smp_rmb() in this branch. This is why we do not use
 * rcu_dereference() for the pointer read.
 */
-void marker_probe_cb(const struct marker *mdata, void *call_private,
+void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
-        const char *fmt, ...)
 {
        va_list args;
        char ptype;
@@ -120,8 +118,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
                /* Must read the ptr before private data. They are not data
                 * dependant, so we put an explicit smp_rmb() here. */
                smp_rmb();
-                va_start(args, fmt);
+                va_start(args, call_private);
-                func(mdata->single.probe_private, call_private, fmt, &args);
+                func(mdata->single.probe_private, call_private, mdata->format,
+                        &args);
                va_end(args);
        } else {
                struct marker_probe_closure *multi;
@@ -136,9 +135,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
                smp_read_barrier_depends();
                multi = mdata->multi;
                for (i = 0; multi[i].func; i++) {
-                        va_start(args, fmt);
+                        va_start(args, call_private);
-                        multi[i].func(multi[i].probe_private, call_private, fmt,
+                        multi[i].func(multi[i].probe_private, call_private,
-                                &args);
+                                mdata->format, &args);
                        va_end(args);
                }
        }
@@ -150,13 +149,11 @@ EXPORT_SYMBOL_GPL(marker_probe_cb);
 * marker_probe_cb Callback that does not prepare the variable argument list.
 * @mdata: pointer of type struct marker
 * @call_private: caller site private data
- * @fmt: format string
 * @...:  Variable argument list.
 *
 * Should be connected to markers "MARK_NOARGS".
 */
-void marker_probe_cb_noarg(const struct marker *mdata,
+void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
-        void *call_private, const char *fmt, ...)
 {
        va_list args;   /* not initialized */
        char ptype;
@@ -172,7 +169,8 @@ void marker_probe_cb_noarg(const struct marker *mdata,
                /* Must read the ptr before private data. They are not data
                 * dependant, so we put an explicit smp_rmb() here. */
                smp_rmb();
-                func(mdata->single.probe_private, call_private, fmt, &args);
+                func(mdata->single.probe_private, call_private, mdata->format,
+                        &args);
        } else {
                struct marker_probe_closure *multi;
                int i;
@@ -186,8 +184,8 @@ void marker_probe_cb_noarg(const struct marker *mdata,
                smp_read_barrier_depends();
                multi = mdata->multi;
                for (i = 0; multi[i].func; i++)
-                        multi[i].func(multi[i].probe_private, call_private, fmt,
+                        multi[i].func(multi[i].probe_private, call_private,
-                                &args);
+                                mdata->format, &args);
        }
        preempt_enable();
 }
@@ -443,7 +441,7 @@ static int remove_marker(const char *name)
        hlist_del(&e->hlist);
        /* Make sure the call_rcu has been executed */
        if (e->rcu_pending)
-                rcu_barrier();
+                rcu_barrier_sched();
        kfree(e);
        return 0;
 }
@@ -478,7 +476,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
        hlist_del(&(*entry)->hlist);
        /* Make sure the call_rcu has been executed */
        if ((*entry)->rcu_pending)
-                rcu_barrier();
+                rcu_barrier_sched();
        kfree(*entry);
        *entry = e;
        trace_mark(core_marker_format, "name %s format %s",
@@ -657,7 +655,7 @@ int marker_probe_register(const char *name, const char *format,
         * make sure it's executed now.
         */
        if (entry->rcu_pending)
-                rcu_barrier();
+                rcu_barrier_sched();
        old = marker_entry_add_probe(entry, probe, probe_private);
        if (IS_ERR(old)) {
                ret = PTR_ERR(old);
@@ -672,10 +670,7 @@ int marker_probe_register(const char *name, const char *format,
        entry->rcu_pending = 1;
        /* write rcu_pending before calling the RCU callback */
        smp_wmb();
-#ifdef CONFIG_PREEMPT_RCU
+        call_rcu_sched(&entry->rcu, free_old_closure);
-        synchronize_sched();    /* Until we have the call_rcu_sched() */
-#endif
-        call_rcu(&entry->rcu, free_old_closure);
 end:
        mutex_unlock(&markers_mutex);
        return ret;
@@ -706,7 +701,7 @@ int marker_probe_unregister(const char *name,
        if (!entry)
                goto end;
        if (entry->rcu_pending)
-                rcu_barrier();
+                rcu_barrier_sched();
        old = marker_entry_remove_probe(entry, probe, probe_private);
        mutex_unlock(&markers_mutex);
        marker_update_probes();         /* may update entry */
@@ -718,10 +713,7 @@ int marker_probe_unregister(const char *name,
        entry->rcu_pending = 1;
        /* write rcu_pending before calling the RCU callback */
        smp_wmb();
-#ifdef CONFIG_PREEMPT_RCU
+        call_rcu_sched(&entry->rcu, free_old_closure);
-        synchronize_sched();    /* Until we have the call_rcu_sched() */
-#endif
-        call_rcu(&entry->rcu, free_old_closure);
        remove_marker(name);    /* Ignore busy error message */
        ret = 0;
 end:
@@ -788,7 +780,7 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
                goto end;
        }
        if (entry->rcu_pending)
-                rcu_barrier();
+                rcu_barrier_sched();
        old = marker_entry_remove_probe(entry, NULL, probe_private);
        mutex_unlock(&markers_mutex);
        marker_update_probes();         /* may update entry */
@@ -799,10 +791,7 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
        entry->rcu_pending = 1;
        /* write rcu_pending before calling the RCU callback */
        smp_wmb();
-#ifdef CONFIG_PREEMPT_RCU
+        call_rcu_sched(&entry->rcu, free_old_closure);
-        synchronize_sched();    /* Until we have the call_rcu_sched() */
-#endif
-        call_rcu(&entry->rcu, free_old_closure);
        remove_marker(entry->name);     /* Ignore busy error message */
 end:
        mutex_unlock(&markers_mutex);
diff --git a/kernel/module.c b/kernel/module.c
index 5f80478b746d..d8b5605132a0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -70,6 +70,9 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
 static BLOCKING_NOTIFIER_HEAD(module_notify_list);
+/* Bounds of module allocation, for speeding __module_text_address */
+static unsigned long module_addr_min = -1UL, module_addr_max = 0;
 int register_module_notifier(struct notifier_block * nb)
 {
        return blocking_notifier_chain_register(&module_notify_list, nb);
@@ -134,17 +137,19 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
 extern const struct kernel_symbol __stop___ksymtab_gpl[];
 extern const struct kernel_symbol __start___ksymtab_gpl_future[];
 extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
-extern const struct kernel_symbol __start___ksymtab_unused[];
-extern const struct kernel_symbol __stop___ksymtab_unused[];
-extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
-extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
 extern const struct kernel_symbol __start___ksymtab_gpl_future[];
 extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
 extern const unsigned long __start___kcrctab[];
 extern const unsigned long __start___kcrctab_gpl[];
 extern const unsigned long __start___kcrctab_gpl_future[];
+#ifdef CONFIG_UNUSED_SYMBOLS
+extern const struct kernel_symbol __start___ksymtab_unused[];
+extern const struct kernel_symbol __stop___ksymtab_unused[];
+extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
+extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
 extern const unsigned long __start___kcrctab_unused[];
 extern const unsigned long __start___kcrctab_unused_gpl[];
+#endif
 #ifndef CONFIG_MODVERSIONS
 #define symversion(base, idx) NULL
@@ -152,156 +157,186 @@ extern const unsigned long __start___kcrctab_unused_gpl[];
 #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
 #endif
-/* lookup symbol in given range of kernel_symbols */
-static const struct kernel_symbol *lookup_symbol(const char *name,
-        const struct kernel_symbol *start,
-        const struct kernel_symbol *stop)
-{
-        const struct kernel_symbol *ks = start;
-        for (; ks < stop; ks++)
-                if (strcmp(ks->name, name) == 0)
-                        return ks;
-        return NULL;
-}
-static bool always_ok(bool gplok, bool warn, const char *name)
-{
-        return true;
-}
-static bool printk_unused_warning(bool gplok, bool warn, const char *name)
-{
-        if (warn) {
-                printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
-                       "however this module is using it.\n", name);
-                printk(KERN_WARNING
-                       "This symbol will go away in the future.\n");
-                printk(KERN_WARNING
-                       "Please evalute if this is the right api to use and if "
-                       "it really is, submit a report the linux kernel "
-                       "mailinglist together with submitting your code for "
-                       "inclusion.\n");
-        }
-        return true;
-}
-static bool gpl_only_unused_warning(bool gplok, bool warn, const char *name)
-{
-        if (!gplok)
-                return false;
-        return printk_unused_warning(gplok, warn, name);
-}
-static bool gpl_only(bool gplok, bool warn, const char *name)
-{
-        return gplok;
-}
-static bool warn_if_not_gpl(bool gplok, bool warn, const char *name)
-{
-        if (!gplok && warn) {
-                printk(KERN_WARNING "Symbol %s is being used "
-                       "by a non-GPL module, which will not "
-                       "be allowed in the future\n", name);
-                printk(KERN_WARNING "Please see the file "
-                       "Documentation/feature-removal-schedule.txt "
-                       "in the kernel source tree for more details.\n");
-        }
-        return true;
-}
 struct symsearch {
         const struct kernel_symbol *start, *stop;
         const unsigned long *crcs;
-        bool (*check)(bool gplok, bool warn, const char *name);
+        enum { /* licence class of every symbol in this table */
+                NOT_GPL_ONLY, /* find_symbol_in_section() applies no licence check */
+                GPL_ONLY, /* rejected when the caller has gplok == false */
+                WILL_BE_GPL_ONLY, /* allowed, but a non-GPL caller gets a warning when warn is set */
+        } licence;
+        bool unused; /* true for the *_unused tables; use triggers a warning when warn is set */
 };
-/* Look through this array of symbol tables for a symbol match which
+static bool each_symbol_in_section(const struct symsearch *arr,
- * passes the check function. */
+                                   unsigned int arrsize, /* number of entries in arr */
-static const struct kernel_symbol *search_symarrays(const struct symsearch *arr,
+                                   struct module *owner, /* handed to fn unchanged */
-                                                    unsigned int num,
+                                   bool (*fn)(const struct symsearch *syms, /* called once per symbol */
-                                                    const char *name,
+                                              struct module *owner,
-                                                    bool gplok,
+                                              unsigned int symnum, void *data),
-                                                    bool warn,
+                                   void *data) /* opaque cookie handed to fn */
-                                                    const unsigned long **crc)
 {
-        unsigned int i;
+        unsigned int i, j; /* j: table index in arr, i: symbol index within table j */
-        const struct kernel_symbol *ks;
-        for (i = 0; i < num; i++) {
+        for (j = 0; j < arrsize; j++) { /* every table, in array order */
-                ks = lookup_symbol(name, arr[i].start, arr[i].stop);
+                for (i = 0; i < arr[j].stop - arr[j].start; i++) /* every symbol in [start, stop) */
-                if (!ks || !arr[i].check(gplok, warn, name))
+                        if (fn(&arr[j], owner, i, data)) /* fn gets the table plus the symbol's index */
-                        continue;
+                                return true; /* stop at the first fn() that returns true */
-                if (crc)
-                        *crc = symversion(arr[i].crcs, ks - arr[i].start);
-                return ks;
         }
-        return NULL;
+        return false; /* fn never returned true */
 }
-/* Find a symbol, return value, (optional) crc and (optional) module
+/* Call fn for every exported symbol: first the __start___ksymtab* tables
+ * (passed to fn with owner == NULL), then the symbol tables of each module
+ * on the modules list (passed with owner == that module).
+ * Returns true as soon as fn returns true, otherwise false. */
- * which owns it */
+static bool each_symbol(bool (*fn)(const struct symsearch *arr,
-static unsigned long find_symbol(const char *name,
+                                   struct module *owner,
-                                 struct module **owner,
+                                   unsigned int symnum, void *data),
-                                 const unsigned long **crc,
+                        void *data) /* opaque cookie handed to fn */
-                                 bool gplok,
-                                 bool warn)
 {
         struct module *mod;
-        const struct kernel_symbol *ks;
         const struct symsearch arr[] = {
                 { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
-                  always_ok },
+                  NOT_GPL_ONLY, false },
                 { __start___ksymtab_gpl, __stop___ksymtab_gpl,
-                  __start___kcrctab_gpl, gpl_only },
+                  __start___kcrctab_gpl,
+                  GPL_ONLY, false },
                 { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future,
-                  __start___kcrctab_gpl_future, warn_if_not_gpl },
+                  __start___kcrctab_gpl_future,
+                  WILL_BE_GPL_ONLY, false },
+#ifdef CONFIG_UNUSED_SYMBOLS
                 { __start___ksymtab_unused, __stop___ksymtab_unused,
-                  __start___kcrctab_unused, printk_unused_warning },
+                  __start___kcrctab_unused,
+                  NOT_GPL_ONLY, true },
                 { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl,
-                  __start___kcrctab_unused_gpl, gpl_only_unused_warning },
+                  __start___kcrctab_unused_gpl,
+                  GPL_ONLY, true },
+#endif
         };
-        /* Core kernel first. */
+        if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data)) /* these tables are walked first, with owner == NULL */
-        ks = search_symarrays(arr, ARRAY_SIZE(arr), name, gplok, warn, crc);
+                return true; /* fn accepted a symbol: skip the modules */
-        if (ks) {
-                if (owner)
-                        *owner = NULL;
-                return ks->value;
-        }
-        /* Now try modules. */
         list_for_each_entry(mod, &modules, list) {
                 struct symsearch arr[] = {
                         { mod->syms, mod->syms + mod->num_syms, mod->crcs,
-                          always_ok },
+                          NOT_GPL_ONLY, false },
                         { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
-                          mod->gpl_crcs, gpl_only },
+                          mod->gpl_crcs,
+                          GPL_ONLY, false },
                         { mod->gpl_future_syms,
                           mod->gpl_future_syms + mod->num_gpl_future_syms,
-                          mod->gpl_future_crcs, warn_if_not_gpl },
+                          mod->gpl_future_crcs,
+                          WILL_BE_GPL_ONLY, false },
+#ifdef CONFIG_UNUSED_SYMBOLS
                         { mod->unused_syms,
                           mod->unused_syms + mod->num_unused_syms,
-                          mod->unused_crcs, printk_unused_warning },
+                          mod->unused_crcs,
+                          NOT_GPL_ONLY, true },
                         { mod->unused_gpl_syms,
                           mod->unused_gpl_syms + mod->num_unused_gpl_syms,
-                          mod->unused_gpl_crcs, gpl_only_unused_warning },
+                          mod->unused_gpl_crcs,
+                          GPL_ONLY, true },
+#endif
                 };
-                ks = search_symarrays(arr, ARRAY_SIZE(arr),
+                if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) /* owner is the module being walked */
-                                      name, gplok, warn, crc);
+                        return true; /* fn accepted a symbol of this module */
-                if (ks) {
+        }
-                        if (owner)
+        return false; /* fn never returned true for any table */
-                                *owner = mod;
+}
-                        return ks->value;
+struct find_symbol_arg { /* cookie passed from find_symbol() to find_symbol_in_section() */
+        /* Input: set by find_symbol() before the walk */
+        const char *name; /* symbol name to match with strcmp() */
+        bool gplok; /* false: GPL_ONLY tables are skipped */
+        bool warn; /* true: emit the licence / UNUSED warnings */
+        /* Output: written only when a matching symbol is accepted */
+        struct module *owner; /* owner value the walk supplied for the matching table */
+        const unsigned long *crc; /* result of symversion() for the symbol */
+        unsigned long value; /* the symbol's value field */
+};
+static bool find_symbol_in_section(const struct symsearch *syms,
+                                   struct module *owner,
+                                   unsigned int symnum, void *data) /* each_symbol() callback; data is a struct find_symbol_arg */
+{
+        struct find_symbol_arg *fsa = data;
+        if (strcmp(syms->start[symnum].name, fsa->name) != 0)
+                return false; /* different name: keep walking */
+        if (!fsa->gplok) { /* licence checks apply only when gplok is false */
+                if (syms->licence == GPL_ONLY)
+                        return false; /* name matched but not usable: keep walking */
+                if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { /* warn only; the symbol is still accepted below */
+                        printk(KERN_WARNING "Symbol %s is being used "
+                               "by a non-GPL module, which will not "
+                               "be allowed in the future\n", fsa->name);
+                        printk(KERN_WARNING "Please see the file "
+                               "Documentation/feature-removal-schedule.txt "
+                               "in the kernel source tree for more details.\n");
                 }
         }
+#ifdef CONFIG_UNUSED_SYMBOLS
+        if (syms->unused && fsa->warn) { /* warn only; the symbol is still accepted below */
+                printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
+                       "however this module is using it.\n", fsa->name);
+                printk(KERN_WARNING
+                       "This symbol will go away in the future.\n");
+                printk(KERN_WARNING
+                       "Please evalute if this is the right api to use and if "
+                       "it really is, submit a report the linux kernel "
+                       "mailinglist together with submitting your code for "
+                       "inclusion.\n");
+        }
+#endif
+        fsa->owner = owner; /* accepted: fill in the output fields */
+        fsa->crc = symversion(syms->crcs, symnum);
+        fsa->value = syms->start[symnum].value;
+        return true; /* tells each_symbol() to stop */
+}
+/* Find a symbol, return value, (optional) crc and (optional) module
+ * which owns it.
+ * @owner and @crc may be NULL; they are written only when the symbol is
+ * found.  @gplok and @warn are handed to find_symbol_in_section() through
+ * struct find_symbol_arg.
+ * When no symbol is accepted, returns -ENOENT converted to unsigned long. */
+static unsigned long find_symbol(const char *name,
+                                 struct module **owner,
+                                 const unsigned long **crc,
+                                 bool gplok,
+                                 bool warn)
+{
+        struct find_symbol_arg fsa; /* only the input fields are set here */
+        fsa.name = name;
+        fsa.gplok = gplok;
+        fsa.warn = warn;
+        if (each_symbol(find_symbol_in_section, &fsa)) { /* true: the callback filled in the outputs */
+                if (owner)
+                        *owner = fsa.owner;
+                if (crc)
+                        *crc = fsa.crc;
+                return fsa.value;
+        }
         DEBUGP("Failed to find symbol %s\n", name);
         return -ENOENT;
 }
+/* lookup symbol in given range of kernel_symbols: linear strcmp() scan of
+ * [start, stop).  Returns the first entry whose name equals @name, or NULL
+ * when none matches (including an empty range). */
+static const struct kernel_symbol *lookup_symbol(const char *name,
+        const struct kernel_symbol *start,
+        const struct kernel_symbol *stop)
+{
+        const struct kernel_symbol *ks = start;
+        for (; ks < stop; ks++) /* stop is exclusive */
+                if (strcmp(ks->name, name) == 0)
+                        return ks;
+        return NULL;
+}
 /* Search for module by name: must hold module_mutex. */
 static struct module *find_module(const char *name)
 {
@@ -639,8 +674,8 @@ static int __try_stop_module(void *_sref)
 {
        struct stopref *sref = _sref;
-        /* If it's not unused, quit unless we are told to block. */
+        /* If it's not unused, quit unless we're forcing. */
-        if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) {
+        if (module_refcount(sref->mod) != 0) {
                if (!(*sref->forced = try_force_unload(sref->flags)))
                        return -EWOULDBLOCK;
        }
@@ -652,9 +687,16 @@ static int __try_stop_module(void *_sref)
 static int try_stop_module(struct module *mod, int flags, int *forced)
 {
-        struct stopref sref = { mod, flags, forced };
+        if (flags & O_NONBLOCK) { /* non-blocking: run __try_stop_module() via stop_machine_run() */
+                struct stopref sref = { mod, flags, forced };
-        return stop_machine_run(__try_stop_module, &sref, NR_CPUS);
+                return stop_machine_run(__try_stop_module, &sref, NR_CPUS); /* its result is returned as-is */
+        } else {
+                /* We don't need to stop the machine for this.
+                 * Without O_NONBLOCK the module is marked GOING
+                 * unconditionally: the refcount is not examined on this
+                 * path and *forced is left untouched. */
+                mod->state = MODULE_STATE_GOING;
+                synchronize_sched(); /* NOTE(review): presumably waits for readers that may still see the old state - confirm */
+                return 0; /* this path cannot fail */
+        }
 }
 unsigned int module_refcount(struct module *mod)
@@ -1445,8 +1487,10 @@ static int verify_export_symbols(struct module *mod)
                { mod->syms, mod->num_syms },
                { mod->gpl_syms, mod->num_gpl_syms },
                { mod->gpl_future_syms, mod->num_gpl_future_syms },
+#ifdef CONFIG_UNUSED_SYMBOLS
                { mod->unused_syms, mod->num_unused_syms },
                { mod->unused_gpl_syms, mod->num_unused_gpl_syms },
+#endif
        };
        for (i = 0; i < ARRAY_SIZE(arr); i++) {
@@ -1526,7 +1570,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
 }
 /* Update size with this section: return offset. */
-static long get_offset(unsigned long *size, Elf_Shdr *sechdr)
+static long get_offset(unsigned int *size, Elf_Shdr *sechdr)
 {
        long ret;
@@ -1738,6 +1782,20 @@ static inline void add_kallsyms(struct module *mod,
 }
 #endif /* CONFIG_KALLSYMS */
+static void *module_alloc_update_bounds(unsigned long size) /* module_alloc() wrapper that also records the address range used */
+{
+        void *ret = module_alloc(size);
+        if (ret) { /* a NULL result is returned with the bounds untouched */
+                /* Update module bounds: widen [module_addr_min,
+                 * module_addr_max) so it covers [ret, ret + size).
+                 * This function only ever lowers the minimum and raises
+                 * the maximum. */
+                if ((unsigned long)ret < module_addr_min)
+                        module_addr_min = (unsigned long)ret;
+                if ((unsigned long)ret + size > module_addr_max)
+                        module_addr_max = (unsigned long)ret + size; /* one past the last byte */
+        }
+        return ret; /* exactly what module_alloc() returned */
+}
 /* Allocate and load the module: note that size of section 0 is always
   zero, and we rely on this for optional sections. */
 static struct module *load_module(void __user *umod,
@@ -1764,10 +1822,12 @@ static struct module *load_module(void __user *umod,
        unsigned int gplfutureindex;
        unsigned int gplfuturecrcindex;
        unsigned int unwindex = 0;
+#ifdef CONFIG_UNUSED_SYMBOLS
        unsigned int unusedindex;
        unsigned int unusedcrcindex;
        unsigned int unusedgplindex;
        unsigned int unusedgplcrcindex;
+#endif
        unsigned int markersindex;
        unsigned int markersstringsindex;
        struct module *mod;
@@ -1850,13 +1910,15 @@ static struct module *load_module(void __user *umod,
        exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
        gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
        gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
-        unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
-        unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
        crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
        gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
        gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
+#ifdef CONFIG_UNUSED_SYMBOLS
+        unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
+        unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
        unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused");
        unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl");
+#endif
        setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
        exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
        obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
@@ -1935,7 +1997,7 @@ static struct module *load_module(void __user *umod,
        layout_sections(mod, hdr, sechdrs, secstrings);
        /* Do the allocs. */
-        ptr = module_alloc(mod->core_size);
+        ptr = module_alloc_update_bounds(mod->core_size);
        if (!ptr) {
                err = -ENOMEM;
                goto free_percpu;
@@ -1943,7 +2005,7 @@ static struct module *load_module(void __user *umod,
        memset(ptr, 0, mod->core_size);
        mod->module_core = ptr;
-        ptr = module_alloc(mod->init_size);
+        ptr = module_alloc_update_bounds(mod->init_size);
        if (!ptr && mod->init_size) {
                err = -ENOMEM;
                goto free_core;
@@ -2018,14 +2080,15 @@ static struct module *load_module(void __user *umod,
                mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
        mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size /
                                        sizeof(*mod->gpl_future_syms);
-        mod->num_unused_syms = sechdrs[unusedindex].sh_size /
-                                        sizeof(*mod->unused_syms);
-        mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size /
-                                        sizeof(*mod->unused_gpl_syms);
        mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr;
        if (gplfuturecrcindex)
                mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr;
+#ifdef CONFIG_UNUSED_SYMBOLS
+        mod->num_unused_syms = sechdrs[unusedindex].sh_size /
+                                        sizeof(*mod->unused_syms);
+        mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size /
+                                        sizeof(*mod->unused_gpl_syms);
        mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr;
        if (unusedcrcindex)
                mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr;
@@ -2033,13 +2096,17 @@ static struct module *load_module(void __user *umod,
        if (unusedgplcrcindex)
                mod->unused_gpl_crcs
                        = (void *)sechdrs[unusedgplcrcindex].sh_addr;
+#endif
 #ifdef CONFIG_MODVERSIONS
-        if ((mod->num_syms && !crcindex) ||
+        if ((mod->num_syms && !crcindex)
-            (mod->num_gpl_syms && !gplcrcindex) ||
+            || (mod->num_gpl_syms && !gplcrcindex)
-            (mod->num_gpl_future_syms && !gplfuturecrcindex) ||
+            || (mod->num_gpl_future_syms && !gplfuturecrcindex)
-            (mod->num_unused_syms && !unusedcrcindex) ||
+#ifdef CONFIG_UNUSED_SYMBOLS
-            (mod->num_unused_gpl_syms && !unusedgplcrcindex)) {
+            || (mod->num_unused_syms && !unusedcrcindex)
+            || (mod->num_unused_gpl_syms && !unusedgplcrcindex)
+#endif
+                ) {
                printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
                err = try_to_force_load(mod, "nocrc");
                if (err)
@@ -2512,7 +2579,7 @@ static int m_show(struct seq_file *m, void *p)
        struct module *mod = list_entry(p, struct module, list);
        char buf[8];
-        seq_printf(m, "%s %lu",
+        seq_printf(m, "%s %u",
                   mod->name, mod->init_size + mod->core_size);
        print_unload_info(m, mod);
@@ -2595,6 +2662,9 @@ struct module *__module_text_address(unsigned long addr)
 {
        struct module *mod;
+        if (addr < module_addr_min || addr > module_addr_max)
+                return NULL;
        list_for_each_entry(mod, &modules, list)
                if (within(addr, mod->module_init, mod->init_text_size)
                    || within(addr, mod->module_core, mod->core_text_size))
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 3aaa06c561de..1d94160eb532 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -79,8 +79,8 @@ void debug_mutex_unlock(struct mutex *lock)
        if (unlikely(!debug_locks))
                return;
-        DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
        DEBUG_LOCKS_WARN_ON(lock->magic != lock);
+        DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
        DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
        DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
 }
diff --git a/kernel/mutex.c b/kernel/mutex.c
index d046a345d365..bcdc9ac8ef60 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -165,10 +165,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                 * got a signal? (This code gets eliminated in the
                 * TASK_UNINTERRUPTIBLE case.)
                 */
-                if (unlikely((state == TASK_INTERRUPTIBLE &&
+                if (unlikely(signal_pending_state(state, task))) {
-                                        signal_pending(task)) ||
-                              (state == TASK_KILLABLE &&
-                                        fatal_signal_pending(task)))) {
                        mutex_remove_waiter(lock, &waiter,
                                            task_thread_info(task));
                        mutex_release(&lock->dep_map, 1, ip);
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 48d7ed6fc3a4..43c2111cd54d 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -7,6 +7,7 @@
 #include <linux/module.h>
 #include <linux/cgroup.h>
 #include <linux/fs.h>
+#include <linux/proc_fs.h>
 #include <linux/slab.h>
 #include <linux/nsproxy.h>
@@ -24,9 +25,12 @@ static inline struct ns_cgroup *cgroup_to_ns(
                            struct ns_cgroup, css);
 }
-int ns_cgroup_clone(struct task_struct *task)
+int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
 {
-        return cgroup_clone(task, &ns_subsys);
+        char name[PROC_NUMBUF];
+        snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
+        return cgroup_clone(task, &ns_subsys, name);
 }
 /*
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index adc785146a1c..21575fc46d05 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -157,12 +157,6 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
                goto out;
        }
-        err = ns_cgroup_clone(tsk);
-        if (err) {
-                put_nsproxy(new_ns);
-                goto out;
-        }
        tsk->nsproxy = new_ns;
 out:
@@ -209,7 +203,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
                goto out;
        }
-        err = ns_cgroup_clone(current);
+        err = ns_cgroup_clone(current, task_pid(current));
        if (err)
                put_nsproxy(*new_nsp);
diff --git a/kernel/panic.c b/kernel/panic.c
index 425567f45b9f..12c5a0a6c89b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -318,6 +318,28 @@ void warn_on_slowpath(const char *file, int line)
        add_taint(TAINT_WARN);
 }
 EXPORT_SYMBOL(warn_on_slowpath);
+/*
+ * Warning slow path that also prints a caller-supplied printf-style
+ * message.  Emits the "cut here" banner, the file/line and the return
+ * address formatted by sprint_symbol(), then the message, the module
+ * list, a stack dump and the oops end marker, and finally calls
+ * add_taint(TAINT_WARN).
+ */
+void warn_slowpath(const char *file, int line, const char *fmt, ...)
+{
+        unsigned long ret_addr = (unsigned long)__builtin_return_address(0);
+        char sym[KSYM_SYMBOL_LEN];
+        va_list ap;
+        sprint_symbol(sym, ret_addr);
+        printk(KERN_WARNING "------------[ cut here ]------------\n");
+        printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, line, sym);
+        va_start(ap, fmt);
+        vprintk(fmt, ap);
+        va_end(ap);
+        print_modules();
+        dump_stack();
+        print_oops_end_marker();
+        add_taint(TAINT_WARN);
+}
+EXPORT_SYMBOL(warn_slowpath);
 #endif
 #ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/kernel/pid.c b/kernel/pid.c
index 20d59fa2d493..064e76afa507 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -30,6 +30,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/init.h>
+#include <linux/rculist.h>
 #include <linux/bootmem.h>
 #include <linux/hash.h>
 #include <linux/pid_namespace.h>
@@ -308,12 +309,6 @@ struct pid *find_vpid(int nr)
 }
 EXPORT_SYMBOL_GPL(find_vpid);
-struct pid *find_pid(int nr)
-{
-        return find_pid_ns(nr, &init_pid_ns);
-}
-EXPORT_SYMBOL_GPL(find_pid);
 /*
 * attach_pid() must be called with the tasklist_lock write-held.
 */
@@ -434,6 +429,7 @@ struct pid *find_get_pid(pid_t nr)
        return pid;
 }
+EXPORT_SYMBOL_GPL(find_get_pid);
 pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
 {
@@ -481,7 +477,7 @@ EXPORT_SYMBOL(task_session_nr_ns);
 /*
 * Used by proc to find the first pid that is greater then or equal to nr.
 *
- * If there is a pid at nr this function is exactly the same as find_pid.
+ * If there is a pid at nr this function is exactly the same as find_pid_ns.
 */
 struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 {
@@ -496,7 +492,6 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
        return pid;
 }
-EXPORT_SYMBOL_GPL(find_get_pid);
 /*
 * The pid hash table is scaled according to the amount of memory in the
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 98702b4b8851..ea567b78d1aa 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -12,6 +12,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/syscalls.h>
 #include <linux/err.h>
+#include <linux/acct.h>
 #define BITS_PER_PAGE           (PAGE_SIZE*8)
@@ -71,7 +72,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
        struct pid_namespace *ns;
        int i;
-        ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL);
+        ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
        if (ns == NULL)
                goto out;
@@ -84,17 +85,13 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
                goto out_free_map;
        kref_init(&ns->kref);
-        ns->last_pid = 0;
-        ns->child_reaper = NULL;
        ns->level = level;
        set_bit(0, ns->pidmap[0].page);
        atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
-        for (i = 1; i < PIDMAP_ENTRIES; i++) {
+        for (i = 1; i < PIDMAP_ENTRIES; i++)
-                ns->pidmap[i].page = NULL;
                atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
-        }
        return ns;
@@ -185,6 +182,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
        /* Child reaper for the pid namespace is going away */
        pid_ns->child_reaper = NULL;
+        acct_exit_ns(pid_ns);
        return;
 }
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 0afe32be4c85..8cb757026386 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -29,6 +29,7 @@
 #include <linux/pm_qos_params.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/time.h>
@@ -358,15 +359,19 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
        int ret;
        long pm_qos_class;
+        lock_kernel();
        pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
        if (pm_qos_class >= 0) {
                filp->private_data = (void *)pm_qos_class;
                sprintf(name, "process_%d", current->pid);
                ret = pm_qos_add_requirement(pm_qos_class, name,
                                        PM_QOS_DEFAULT_VALUE);
-                if (ret >= 0)
+                if (ret >= 0) {
+                        unlock_kernel();
                        return 0;
+                }
        }
+        unlock_kernel();
        return -EPERM;
 }
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index f1525ad06cb3..c42a03aef36f 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1037,6 +1037,9 @@ static void check_thread_timers(struct task_struct *tsk,
                                sig->rlim[RLIMIT_RTTIME].rlim_cur +=
                                                                USEC_PER_SEC;
                        }
+                        printk(KERN_INFO
+                                "RT Watchdog Timeout: %s[%d]\n",
+                                tsk->comm, task_pid_nr(tsk));
                        __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
                }
        }
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index dbd8398ddb0b..9a21681aa80f 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -449,9 +449,6 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
                spin_unlock_irqrestore(&idr_lock, flags);
        }
        sigqueue_free(tmr->sigq);
-        if (unlikely(tmr->it_process) &&
-            tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
-                put_task_struct(tmr->it_process);
        kmem_cache_free(posix_timers_cache, tmr);
 }
@@ -856,11 +853,10 @@ retry_delete:
         * This keeps any tasks waiting on the spin lock from thinking
         * they got something (see the lock code above).
         */
-        if (timer->it_process) {
+        if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
-                if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+                put_task_struct(timer->it_process);
-                        put_task_struct(timer->it_process);
+        timer->it_process = NULL;
-                timer->it_process = NULL;
-        }
        unlock_timer(timer, flags);
        release_posix_timer(timer, IT_ID_SET);
        return 0;
@@ -885,11 +881,10 @@ retry_delete:
         * This keeps any tasks waiting on the spin lock from thinking
         * they got something (see the lock code above).
         */
-        if (timer->it_process) {
+        if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
-                if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+                put_task_struct(timer->it_process);
-                        put_task_struct(timer->it_process);
+        timer->it_process = NULL;
-                timer->it_process = NULL;
-        }
        unlock_timer(timer, flags);
        release_posix_timer(timer, IT_ID_SET);
 }
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index b45da40e8d25..dcd165f92a88 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -82,7 +82,7 @@ config PM_SLEEP_SMP
 config PM_SLEEP
        bool
-        depends on SUSPEND || HIBERNATION
+        depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
        default y
 config SUSPEND
@@ -94,6 +94,17 @@ config SUSPEND
          powered and thus its contents are preserved, such as the
          suspend-to-RAM state (e.g. the ACPI S3 state).
+config PM_TEST_SUSPEND
+        bool "Test suspend/resume and wakealarm during bootup"
+        depends on SUSPEND && PM_DEBUG && RTC_LIB=y
+        ---help---
+        This option will let you suspend your machine during bootup, and
+        make it wake up a few seconds later using an RTC wakeup alarm.
+        Enable this with a kernel parameter like "test_suspend=mem".
+        You probably want to have your system's RTC driver statically
+        linked, ensuring that it's available when this test runs.
 config SUSPEND_FREEZER
        bool "Enable freezer for suspend to RAM/standby" \
                if ARCH_WANTS_FREEZER_CONTROL || BROKEN
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 14a656cdc652..f011e0870b52 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -180,6 +180,17 @@ static void platform_restore_cleanup(int platform_mode)
 }
 /**
+ *      platform_recover - recover the platform from a failure to suspend
+ *      devices.
+ */
+static void platform_recover(int platform_mode)
+{
+        /* Nothing to do outside platform mode or without registered ops. */
+        if (!platform_mode || !hibernation_ops)
+                return;
+        /* ->recover is optional; call it only when it is provided. */
+        if (hibernation_ops->recover)
+                hibernation_ops->recover();
+}
+/**
 *      create_image - freeze devices that need to be frozen with interrupts
 *      off, create the hibernation image and thaw those devices.  Control
 *      reappears in this routine after a restore.
@@ -193,6 +204,7 @@ static int create_image(int platform_mode)
        if (error)
                return error;
+        device_pm_lock();
        local_irq_disable();
        /* At this point, device_suspend() has been called, but *not*
         * device_power_down(). We *must* call device_power_down() now.
@@ -224,9 +236,11 @@ static int create_image(int platform_mode)
        /* NOTE:  device_power_up() is just a resume() for devices
         * that suspended with irqs off ... no overall powerup.
         */
-        device_power_up();
+        device_power_up(in_suspend ?
+                (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
 Enable_irqs:
        local_irq_enable();
+        device_pm_unlock();
        return error;
 }
@@ -255,10 +269,10 @@ int hibernation_snapshot(int platform_mode)
        suspend_console();
        error = device_suspend(PMSG_FREEZE);
        if (error)
-                goto Resume_console;
+                goto Recover_platform;
        if (hibernation_test(TEST_DEVICES))
-                goto Resume_devices;
+                goto Recover_platform;
        error = platform_pre_snapshot(platform_mode);
        if (error || hibernation_test(TEST_PLATFORM))
@@ -280,12 +294,16 @@ int hibernation_snapshot(int platform_mode)
 Finish:
        platform_finish(platform_mode);
 Resume_devices:
-        device_resume();
+        device_resume(in_suspend ?
- Resume_console:
+                (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
        resume_console();
 Close:
        platform_end(platform_mode);
        return error;
+ Recover_platform:
+        platform_recover(platform_mode);
+        goto Resume_devices;
 }
 /**
@@ -300,8 +318,9 @@ static int resume_target_kernel(void)
 {
        int error;
+        device_pm_lock();
        local_irq_disable();
-        error = device_power_down(PMSG_PRETHAW);
+        error = device_power_down(PMSG_QUIESCE);
        if (error) {
                printk(KERN_ERR "PM: Some devices failed to power down, "
                        "aborting resume\n");
@@ -329,9 +348,10 @@ static int resume_target_kernel(void)
        swsusp_free();
        restore_processor_state();
        touch_softlockup_watchdog();
-        device_power_up();
+        device_power_up(PMSG_RECOVER);
 Enable_irqs:
        local_irq_enable();
+        device_pm_unlock();
        return error;
 }
@@ -350,7 +370,7 @@ int hibernation_restore(int platform_mode)
        pm_prepare_console();
        suspend_console();
-        error = device_suspend(PMSG_PRETHAW);
+        error = device_suspend(PMSG_QUIESCE);
        if (error)
                goto Finish;
@@ -362,7 +382,7 @@ int hibernation_restore(int platform_mode)
                enable_nonboot_cpus();
        }
        platform_restore_cleanup(platform_mode);
-        device_resume();
+        device_resume(PMSG_RECOVER);
 Finish:
        resume_console();
        pm_restore_console();
@@ -392,8 +412,11 @@ int hibernation_platform_enter(void)
        suspend_console();
        error = device_suspend(PMSG_HIBERNATE);
-        if (error)
+        if (error) {
-                goto Resume_console;
+                if (hibernation_ops->recover)
+                        hibernation_ops->recover();
+                goto Resume_devices;
+        }
        error = hibernation_ops->prepare();
        if (error)
@@ -403,6 +426,7 @@ int hibernation_platform_enter(void)
        if (error)
                goto Finish;
+        device_pm_lock();
        local_irq_disable();
        error = device_power_down(PMSG_HIBERNATE);
        if (!error) {
@@ -411,6 +435,7 @@ int hibernation_platform_enter(void)
                while (1);
        }
        local_irq_enable();
+        device_pm_unlock();
        /*
         * We don't need to reenable the nonboot CPUs or resume consoles, since
@@ -419,8 +444,7 @@ int hibernation_platform_enter(void)
 Finish:
        hibernation_ops->finish();
 Resume_devices:
-        device_resume();
+        device_resume(PMSG_RESTORE);
- Resume_console:
        resume_console();
 Close:
        hibernation_ops->end();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6a6d5eb3524e..0b7476f5d2a6 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -132,6 +132,61 @@ static inline int suspend_test(int level) { return 0; }
 #ifdef CONFIG_SUSPEND
+#ifdef CONFIG_PM_TEST_SUSPEND
+/*
+ * We test the system suspend code by setting an RTC wakealarm a short
+ * time in the future, then suspending.  Suspending the devices won't
+ * normally take long ... some systems only need a few milliseconds.
+ *
+ * The time it takes is system-specific though, so when we test this
+ * during system bootup we allow a LOT of time.
+ */
+#define TEST_SUSPEND_SECONDS    5
+static unsigned long suspend_test_start_time;
+static void suspend_test_start(void)
+{
+        /* FIXME Use better timebase than "jiffies", ideally a clocksource.
+         * What we want is a hardware counter that will work correctly even
+         * during the irqs-are-off stages of the suspend/resume cycle...
+         */
+        suspend_test_start_time = jiffies;
+}
+/*
+ * Report how many jiffies (converted to milliseconds) have elapsed since
+ * suspend_test_start() recorded its timestamp, tagged with @label, and
+ * WARN if that exceeds TEST_SUSPEND_SECONDS.
+ */
+static void suspend_test_finish(const char *label)
+{
+        long nj = jiffies - suspend_test_start_time;
+        unsigned msec;
+        msec = jiffies_to_msecs(abs(nj));
+        /* msec is unsigned, so format it with %u rather than %d */
+        pr_info("PM: %s took %u.%03u seconds\n", label,
+                        msec / 1000, msec % 1000);
+        /* Warning on suspend means the RTC alarm period needs to be
+         * larger -- the system was sooo slooowwww to suspend that the
+         * alarm (should have) fired before the system went to sleep!
+         *
+         * Warning on either suspend or resume also means the system
+         * has some performance issues.  The stack dump of a WARN_ON
+         * is more likely to get the right attention than a printk...
+         */
+        WARN_ON(msec > (TEST_SUSPEND_SECONDS * 1000));
+}
+#else
+static void suspend_test_start(void)
+{
+}
+static void suspend_test_finish(const char *label)
+{
+}
+#endif
 /* This is just an arbitrary number */
 #define FREE_PAGE_NUMBER (100)
@@ -228,6 +283,7 @@ static int suspend_enter(suspend_state_t state)
 {
        int error = 0;
+        device_pm_lock();
        arch_suspend_disable_irqs();
        BUG_ON(!irqs_disabled());
@@ -239,10 +295,11 @@ static int suspend_enter(suspend_state_t state)
        if (!suspend_test(TEST_CORE))
                error = suspend_ops->enter(state);
-        device_power_up();
+        device_power_up(PMSG_RESUME);
 Done:
        arch_suspend_enable_irqs();
        BUG_ON(irqs_disabled());
+        device_pm_unlock();
        return error;
 }
@@ -264,14 +321,15 @@ int suspend_devices_and_enter(suspend_state_t state)
                        goto Close;
        }
        suspend_console();
+        suspend_test_start();
        error = device_suspend(PMSG_SUSPEND);
        if (error) {
                printk(KERN_ERR "PM: Some devices failed to suspend\n");
-                goto Resume_console;
+                goto Recover_platform;
        }
+        suspend_test_finish("suspend devices");
        if (suspend_test(TEST_DEVICES))
-                goto Resume_devices;
+                goto Recover_platform;
        if (suspend_ops->prepare) {
                error = suspend_ops->prepare();
@@ -291,13 +349,19 @@ int suspend_devices_and_enter(suspend_state_t state)
        if (suspend_ops->finish)
                suspend_ops->finish();
 Resume_devices:
-        device_resume();
+        suspend_test_start();
- Resume_console:
+        device_resume(PMSG_RESUME);
+        suspend_test_finish("resume devices");
        resume_console();
 Close:
        if (suspend_ops->end)
                suspend_ops->end();
        return error;
+ Recover_platform:
+        if (suspend_ops->recover)
+                suspend_ops->recover();
+        goto Resume_devices;
 }
 /**
@@ -515,3 +579,144 @@ static int __init pm_init(void)
 }
 core_initcall(pm_init);
+#ifdef CONFIG_PM_TEST_SUSPEND
+#include <linux/rtc.h>
+/*
+ * To test system suspend, we need a hands-off mechanism to resume the
+ * system.  RTC wake alarms are a common self-contained mechanism.
+ */
+static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
+{
+        static char err_readtime[] __initdata =
+                KERN_ERR "PM: can't read %s time, err %d\n";
+        static char err_wakealarm[] __initdata =
+                KERN_ERR "PM: can't set %s wakealarm, err %d\n";
+        static char err_suspend[] __initdata =
+                KERN_ERR "PM: suspend test failed, error %d\n";
+        static char info_test[] __initdata =
+                KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
+        struct rtc_wkalrm       alarm;
+        unsigned long           rtc_secs;
+        int                     err;
+        /* this may fail if the RTC hasn't been initialized */
+        err = rtc_read_time(rtc, &alarm.time);
+        if (err < 0) {
+                printk(err_readtime, rtc->dev.bus_id, err);
+                return;
+        }
+        rtc_tm_to_time(&alarm.time, &rtc_secs);
+        /* arm a fresh alarm TEST_SUSPEND_SECONDS past the time just read */
+        memset(&alarm, 0, sizeof(alarm));
+        rtc_time_to_tm(rtc_secs + TEST_SUSPEND_SECONDS, &alarm.time);
+        alarm.enabled = true;
+        err = rtc_set_alarm(rtc, &alarm);
+        if (err < 0) {
+                printk(err_wakealarm, rtc->dev.bus_id, err);
+                return;
+        }
+        if (state == PM_SUSPEND_MEM) {
+                printk(info_test, pm_states[state]);
+                err = pm_suspend(state);
+                /* -ENODEV for "mem": fall through and try standby instead */
+                if (err == -ENODEV)
+                        state = PM_SUSPEND_STANDBY;
+        }
+        if (state == PM_SUSPEND_STANDBY) {
+                printk(info_test, pm_states[state]);
+                err = pm_suspend(state);
+        }
+        if (err < 0)
+                printk(err_suspend, err);
+        /* Some platforms can't detect that the alarm triggered the
+         * wakeup, or (accordingly) disable it afterwards.
+         * It's supposed to give oneshot behavior; cope.
+         */
+        alarm.enabled = false;
+        rtc_set_alarm(rtc, &alarm);
+}
+/*
+ * class_find_device() match callback: accept @dev when its RTC ops provide
+ * set_alarm and its parent device may wake the system.  On a match the
+ * device's bus_id is stored through @name_ptr (a char **) and 1 is
+ * returned; otherwise 0.
+ */
+static int __init has_wakealarm(struct device *dev, void *name_ptr)
+{
+        struct rtc_device *rtc = to_rtc_device(dev);
+        char **name_out = name_ptr;
+        if (!rtc->ops->set_alarm || !device_may_wakeup(rtc->dev.parent))
+                return 0;
+        *name_out = dev->bus_id;
+        return 1;
+}
+/*
+ * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
+ * at startup time.  They're normally disabled, for faster boot and because
+ * we can't know which states really work on this particular system.
+ */
+static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
+static char warn_bad_state[] __initdata =
+        KERN_WARNING "PM: can't test '%s' suspend state\n";
+/*
+ * Parse the "test_suspend=<state>" boot option: record the matching
+ * pm_states[] index in test_state, or warn when no state matches.
+ * Always returns 0.
+ */
+static int __init setup_test_suspend(char *value)
+{
+        unsigned idx;
+        /* "=mem" ==> "mem" */
+        value++;
+        for (idx = 0; idx < PM_SUSPEND_MAX; idx++) {
+                if (pm_states[idx] && strcmp(pm_states[idx], value) == 0) {
+                        test_state = (__force suspend_state_t) idx;
+                        return 0;
+                }
+        }
+        printk(warn_bad_state, value);
+        return 0;
+}
+__setup("test_suspend", setup_test_suspend);
+/*
+ * Late initcall that runs the boot-time suspend test selected by the
+ * "test_suspend=" option: checks that the requested state is valid, looks
+ * for an RTC accepted by has_wakealarm(), and hands it to test_wakealarm().
+ * Always returns 0.
+ */
+static int __init test_suspend(void)
+{
+        static char             warn_no_rtc[] __initdata =
+                KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
+        char                    *pony = NULL;
+        struct rtc_device       *rtc = NULL;
+        struct device           *dev;
+        /* PM is initialized by now; is that state testable? */
+        if (test_state == PM_SUSPEND_ON)
+                goto done;
+        if (!valid_state(test_state)) {
+                printk(warn_bad_state, pm_states[test_state]);
+                goto done;
+        }
+        /* RTCs have initialized by now too ... can we use one? */
+        dev = class_find_device(rtc_class, NULL, &pony, has_wakealarm);
+        if (pony)
+                rtc = rtc_class_open(pony);
+        /* class_find_device() returns its match with a reference held.
+         * Drop it only after rtc_class_open() has used the name, since
+         * that name points into the matched device.
+         */
+        if (dev)
+                put_device(dev);
+        if (!rtc) {
+                printk(warn_no_rtc);
+                goto done;
+        }
+        /* go for it */
+        test_wakealarm(rtc, test_state);
+        rtc_class_close(rtc);
+done:
+        return 0;
+}
+late_initcall(test_suspend);
+#endif /* CONFIG_PM_TEST_SUSPEND */
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 700f44ec8406..acc0c101dbd5 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -53,8 +53,6 @@ extern int hibernation_platform_enter(void);
 extern int pfn_is_nosave(unsigned long);
-extern struct mutex pm_mutex;
 #define power_attr(_name) \
 static struct kobj_attribute _name##_attr = {   \
        .attr   = {                             \
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 678ec736076b..72016f051477 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -10,6 +10,7 @@
 #include <linux/pm.h>
 #include <linux/workqueue.h>
 #include <linux/reboot.h>
+#include <linux/cpumask.h>
 /*
 * When the user hits Sys-Rq o to power down the machine this is the
@@ -25,7 +26,8 @@ static DECLARE_WORK(poweroff_work, do_poweroff);
 static void handle_poweroff(int key, struct tty_struct *tty)
 {
-        schedule_work(&poweroff_work);
+        /* run sysrq poweroff on boot cpu */
+        schedule_work_on(first_cpu(cpu_online_map), &poweroff_work);
 }
 static struct sysrq_key_op      sysrq_poweroff_op = {
diff --git a/kernel/power/process.c b/kernel/power/process.c
index f1d0b345c9ba..278946aecaf0 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -19,9 +19,6 @@
 */
 #define TIMEOUT (20 * HZ)
-#define FREEZER_KERNEL_THREADS 0
-#define FREEZER_USER_SPACE 1
 static inline int freezeable(struct task_struct * p)
 {
        if ((p == current) ||
@@ -84,63 +81,53 @@ static void fake_signal_wake_up(struct task_struct *p)
        spin_unlock_irqrestore(&p->sighand->siglock, flags);
 }
-static int has_mm(struct task_struct *p)
+static inline bool should_send_signal(struct task_struct *p)
 {
-        return (p->mm && !(p->flags & PF_BORROWED_MM));
+        return !(p->flags & PF_FREEZER_NOSIG);
 }
 /**
 *      freeze_task - send a freeze request to given task
 *      @p: task to send the request to
- *      @with_mm_only: if set, the request will only be sent if the task has its
+ *      @sig_only: if set, the request will only be sent if the task has the
- *              own mm
+ *              PF_FREEZER_NOSIG flag unset
- *      Return value: 0, if @with_mm_only is set and the task has no mm of its
+ *      Return value: 'false', if @sig_only is set and the task has
- *              own or the task is frozen, 1, otherwise
+ *              PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
 *
- *      The freeze request is sent by seting the tasks's TIF_FREEZE flag and
+ *      The freeze request is sent by setting the task's TIF_FREEZE flag and
 *      either sending a fake signal to it or waking it up, depending on whether
- *      or not it has its own mm (ie. it is a user land task).  If @with_mm_only
+ *      or not it has PF_FREEZER_NOSIG set.  If @sig_only is set and the task
- *      is set and the task has no mm of its own (ie. it is a kernel thread),
+ *      has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
- *      its TIF_FREEZE flag should not be set.
+ *      TIF_FREEZE flag will not be set.
- *
- *      The task_lock() is necessary to prevent races with exit_mm() or
- *      use_mm()/unuse_mm() from occuring.
 */
-static int freeze_task(struct task_struct *p, int with_mm_only)
+static bool freeze_task(struct task_struct *p, bool sig_only)
 {
-        int ret = 1;
+        /*
+         * We first check if the task is freezing and next if it has already
+         * been frozen to avoid the race with frozen_process() which first marks
+         * the task as frozen and next clears its TIF_FREEZE.
+         */
+        if (!freezing(p)) {
+                rmb();
+                if (frozen(p))
+                        return false;
-        task_lock(p);
+                if (!sig_only || should_send_signal(p))
-        if (freezing(p)) {
+                        set_freeze_flag(p);
-                if (has_mm(p)) {
+                else
-                        if (!signal_pending(p))
+                        return false;
-                                fake_signal_wake_up(p);
+        }
-                } else {
-                        if (with_mm_only)
+        if (should_send_signal(p)) {
-                                ret = 0;
+                if (!signal_pending(p))
-                        else
+                        fake_signal_wake_up(p);
-                                wake_up_state(p, TASK_INTERRUPTIBLE);
+        } else if (sig_only) {
-                }
+                return false;
        } else {
-                rmb();
+                wake_up_state(p, TASK_INTERRUPTIBLE);
-                if (frozen(p)) {
-                        ret = 0;
-                } else {
-                        if (has_mm(p)) {
-                                set_freeze_flag(p);
-                                fake_signal_wake_up(p);
-                        } else {
-                                if (with_mm_only) {
-                                        ret = 0;
-                                } else {
-                                        set_freeze_flag(p);
-                                        wake_up_state(p, TASK_INTERRUPTIBLE);
-                                }
-                        }
-                }
        }
-        task_unlock(p);
-        return ret;
+        return true;
 }
 static void cancel_freezing(struct task_struct *p)
@@ -156,13 +143,13 @@ static void cancel_freezing(struct task_struct *p)
        }
 }
-static int try_to_freeze_tasks(int freeze_user_space)
+static int try_to_freeze_tasks(bool sig_only)
 {
        struct task_struct *g, *p;
        unsigned long end_time;
        unsigned int todo;
        struct timeval start, end;
-        s64 elapsed_csecs64;
+        u64 elapsed_csecs64;
        unsigned int elapsed_csecs;
        do_gettimeofday(&start);
@@ -175,7 +162,7 @@ static int try_to_freeze_tasks(int freeze_user_space)
                        if (frozen(p) || !freezeable(p))
                                continue;
-                        if (!freeze_task(p, freeze_user_space))
+                        if (!freeze_task(p, sig_only))
                                continue;
                        /*
@@ -235,13 +222,13 @@ int freeze_processes(void)
        int error;
        printk("Freezing user space processes ... ");
-        error = try_to_freeze_tasks(FREEZER_USER_SPACE);
+        error = try_to_freeze_tasks(true);
        if (error)
                goto Exit;
        printk("done.\n");
        printk("Freezing remaining freezable tasks ... ");
-        error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS);
+        error = try_to_freeze_tasks(false);
        if (error)
                goto Exit;
        printk("done.");
@@ -251,7 +238,7 @@ int freeze_processes(void)
        return error;
 }
-static void thaw_tasks(int thaw_user_space)
+static void thaw_tasks(bool nosig_only)
 {
        struct task_struct *g, *p;
@@ -260,7 +247,7 @@ static void thaw_tasks(int thaw_user_space)
                if (!freezeable(p))
                        continue;
-                if (!p->mm == thaw_user_space)
+                if (nosig_only && should_send_signal(p))
                        continue;
                thaw_process(p);
@@ -271,8 +258,8 @@ static void thaw_tasks(int thaw_user_space)
 void thaw_processes(void)
 {
        printk("Restarting tasks ... ");
-        thaw_tasks(FREEZER_KERNEL_THREADS);
+        thaw_tasks(true);
-        thaw_tasks(FREEZER_USER_SPACE);
+        thaw_tasks(false);
        schedule();
        printk("done.\n");
 }
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 5f91a07c4eac..5d2ab836e998 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -205,8 +205,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
 *      objects.  The main list's elements are of type struct zone_bitmap
 *      and each of them corresonds to one zone.  For each zone bitmap
 *      object there is a list of objects of type struct bm_block that
- *      represent each blocks of bit chunks in which information is
+ *      represent the blocks of the bitmap in which information is stored.
- *      stored.
 *
 *      struct memory_bitmap contains a pointer to the main list of zone
 *      bitmap objects, a struct bm_position used for browsing the bitmap,
@@ -224,26 +223,27 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
 *      pfns that correspond to the start and end of the represented zone.
 *
 *      struct bm_block contains a pointer to the memory page in which
- *      information is stored (in the form of a block of bit chunks
+ *      information is stored (in the form of a block of the bitmap).
- *      of type unsigned long each).  It also contains the pfns that
+ *      It also contains the pfns that correspond to the start and end of
- *      correspond to the start and end of the represented memory area and
+ *      the represented memory area.
- *      the number of bit chunks in the block.
 */
 #define BM_END_OF_MAP   (~0UL)
-#define BM_CHUNKS_PER_BLOCK     (PAGE_SIZE / sizeof(long))
-#define BM_BITS_PER_CHUNK       (sizeof(long) << 3)
 #define BM_BITS_PER_BLOCK       (PAGE_SIZE << 3)
 struct bm_block {
        struct bm_block *next;          /* next element of the list */
        unsigned long start_pfn;        /* pfn represented by the first bit */
        unsigned long end_pfn;  /* pfn represented by the last bit plus 1 */
-        unsigned int size;      /* number of bit chunks */
+        unsigned long *data;    /* bitmap representing pages */
-        unsigned long *data;    /* chunks of bits representing pages */
 };
+static inline unsigned long bm_block_bits(struct bm_block *bb)
+{
+        return bb->end_pfn - bb->start_pfn;
+}
 struct zone_bitmap {
        struct zone_bitmap *next;       /* next element of the list */
        unsigned long start_pfn;        /* minimal pfn in this zone */
@@ -257,7 +257,6 @@ struct zone_bitmap {
 struct bm_position {
        struct zone_bitmap *zone_bm;
        struct bm_block *block;
-        int chunk;
        int bit;
 };
@@ -272,12 +271,6 @@ struct memory_bitmap {
 /* Functions that operate on memory bitmaps */
-static inline void memory_bm_reset_chunk(struct memory_bitmap *bm)
-{
-        bm->cur.chunk = 0;
-        bm->cur.bit = -1;
-}
 static void memory_bm_position_reset(struct memory_bitmap *bm)
 {
        struct zone_bitmap *zone_bm;
@@ -285,7 +278,7 @@ static void memory_bm_position_reset(struct memory_bitmap *bm)
        zone_bm = bm->zone_bm_list;
        bm->cur.zone_bm = zone_bm;
        bm->cur.block = zone_bm->bm_blocks;
-        memory_bm_reset_chunk(bm);
+        bm->cur.bit = 0;
 }
 static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
@@ -394,12 +387,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
                        bb->start_pfn = pfn;
                        if (nr >= BM_BITS_PER_BLOCK) {
                                pfn += BM_BITS_PER_BLOCK;
-                                bb->size = BM_CHUNKS_PER_BLOCK;
                                nr -= BM_BITS_PER_BLOCK;
                        } else {
                                /* This is executed only once in the loop */
                                pfn += nr;
-                                bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK);
                        }
                        bb->end_pfn = pfn;
                        bb = bb->next;
@@ -478,8 +469,8 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
        }
        zone_bm->cur_block = bb;
        pfn -= bb->start_pfn;
-        *bit_nr = pfn % BM_BITS_PER_CHUNK;
+        *bit_nr = pfn;
-        *addr = bb->data + pfn / BM_BITS_PER_CHUNK;
+        *addr = bb->data;
        return 0;
 }
@@ -528,36 +519,6 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
        return test_bit(bit, addr);
 }
-/* Two auxiliary functions for memory_bm_next_pfn */
-/* Find the first set bit in the given chunk, if there is one */
-static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p)
-{
-        bit++;
-        while (bit < BM_BITS_PER_CHUNK) {
-                if (test_bit(bit, chunk_p))
-                        return bit;
-                bit++;
-        }
-        return -1;
-}
-/* Find a chunk containing some bits set in given block of bits */
-static inline int next_chunk_in_block(int n, struct bm_block *bb)
-{
-        n++;
-        while (n < bb->size) {
-                if (bb->data[n])
-                        return n;
-                n++;
-        }
-        return -1;
-}
 /**
 *      memory_bm_next_pfn - find the pfn that corresponds to the next set bit
 *      in the bitmap @bm.  If the pfn cannot be found, BM_END_OF_MAP is
@@ -571,40 +532,33 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
 {
        struct zone_bitmap *zone_bm;
        struct bm_block *bb;
-        int chunk;
        int bit;
        do {
                bb = bm->cur.block;
                do {
-                        chunk = bm->cur.chunk;
                        bit = bm->cur.bit;
-                        do {
+                        bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
-                                bit = next_bit_in_chunk(bit, bb->data + chunk);
+                        if (bit < bm_block_bits(bb))
-                                if (bit >= 0)
+                                goto Return_pfn;
-                                        goto Return_pfn;
-                                chunk = next_chunk_in_block(chunk, bb);
-                                bit = -1;
-                        } while (chunk >= 0);
                        bb = bb->next;
                        bm->cur.block = bb;
-                        memory_bm_reset_chunk(bm);
+                        bm->cur.bit = 0;
                } while (bb);
                zone_bm = bm->cur.zone_bm->next;
                if (zone_bm) {
                        bm->cur.zone_bm = zone_bm;
                        bm->cur.block = zone_bm->bm_blocks;
-                        memory_bm_reset_chunk(bm);
+                        bm->cur.bit = 0;
                }
        } while (zone_bm);
        memory_bm_position_reset(bm);
        return BM_END_OF_MAP;
 Return_pfn:
-        bm->cur.chunk = chunk;
+        bm->cur.bit = bit + 1;
-        bm->cur.bit = bit;
+        return bb->start_pfn + bit;
-        return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
 }
 /**
diff --git a/kernel/power/user.c b/kernel/power/user.c
index f5512cb3aa86..a6332a313262 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -23,6 +23,7 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
+#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
@@ -69,16 +70,22 @@ static int snapshot_open(struct inode *inode, struct file *filp)
        struct snapshot_data *data;
        int error;
-        if (!atomic_add_unless(&snapshot_device_available, -1, 0))
+        mutex_lock(&pm_mutex);
-                return -EBUSY;
+        if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+                error = -EBUSY;
+                goto Unlock;
+        }
        if ((filp->f_flags & O_ACCMODE) == O_RDWR) {
                atomic_inc(&snapshot_device_available);
-                return -ENOSYS;
+                error = -ENOSYS;
+                goto Unlock;
        }
        if(create_basic_memory_bitmaps()) {
                atomic_inc(&snapshot_device_available);
-                return -ENOMEM;
+                error = -ENOMEM;
+                goto Unlock;
        }
        nonseekable_open(inode, filp);
        data = &snapshot_state;
@@ -98,33 +105,36 @@ static int snapshot_open(struct inode *inode, struct file *filp)
                if (error)
                        pm_notifier_call_chain(PM_POST_HIBERNATION);
        }
-        if (error) {
+        if (error)
                atomic_inc(&snapshot_device_available);
-                return error;
-        }
        data->frozen = 0;
        data->ready = 0;
        data->platform_support = 0;
-        return 0;
+ Unlock:
+        mutex_unlock(&pm_mutex);
+        return error;
 }
 static int snapshot_release(struct inode *inode, struct file *filp)
 {
        struct snapshot_data *data;
+        mutex_lock(&pm_mutex);
        swsusp_free();
        free_basic_memory_bitmaps();
        data = filp->private_data;
        free_all_swap_pages(data->swap);
-        if (data->frozen) {
+        if (data->frozen)
-                mutex_lock(&pm_mutex);
                thaw_processes();
-                mutex_unlock(&pm_mutex);
-        }
        pm_notifier_call_chain(data->mode == O_WRONLY ?
                        PM_POST_HIBERNATION : PM_POST_RESTORE);
        atomic_inc(&snapshot_device_available);
+        mutex_unlock(&pm_mutex);
        return 0;
 }
@@ -134,9 +144,13 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
        struct snapshot_data *data;
        ssize_t res;
+        mutex_lock(&pm_mutex);
        data = filp->private_data;
-        if (!data->ready)
+        if (!data->ready) {
-                return -ENODATA;
+                res = -ENODATA;
+                goto Unlock;
+        }
        res = snapshot_read_next(&data->handle, count);
        if (res > 0) {
                if (copy_to_user(buf, data_of(data->handle), res))
@@ -144,6 +158,10 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
                else
                        *offp = data->handle.offset;
        }
+ Unlock:
+        mutex_unlock(&pm_mutex);
        return res;
 }
@@ -153,6 +171,8 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
        struct snapshot_data *data;
        ssize_t res;
+        mutex_lock(&pm_mutex);
        data = filp->private_data;
        res = snapshot_write_next(&data->handle, count);
        if (res > 0) {
@@ -161,11 +181,14 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
                else
                        *offp = data->handle.offset;
        }
+        mutex_unlock(&pm_mutex);
        return res;
 }
-static int snapshot_ioctl(struct inode *inode, struct file *filp,
+static long snapshot_ioctl(struct file *filp, unsigned int cmd,
-                          unsigned int cmd, unsigned long arg)
+                                                        unsigned long arg)
 {
        int error = 0;
        struct snapshot_data *data;
@@ -179,6 +202,9 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
+        if (!mutex_trylock(&pm_mutex))
+                return -EBUSY;
        data = filp->private_data;
        switch (cmd) {
@@ -186,7 +212,6 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
        case SNAPSHOT_FREEZE:
                if (data->frozen)
                        break;
-                mutex_lock(&pm_mutex);
                printk("Syncing filesystems ... ");
                sys_sync();
                printk("done.\n");
@@ -194,7 +219,6 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
                error = freeze_processes();
                if (error)
                        thaw_processes();
-                mutex_unlock(&pm_mutex);
                if (!error)
                        data->frozen = 1;
                break;
@@ -202,9 +226,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
        case SNAPSHOT_UNFREEZE:
                if (!data->frozen || data->ready)
                        break;
-                mutex_lock(&pm_mutex);
                thaw_processes();
-                mutex_unlock(&pm_mutex);
                data->frozen = 0;
                break;
@@ -307,16 +329,11 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
                        error = -EPERM;
                        break;
                }
-                if (!mutex_trylock(&pm_mutex)) {
-                        error = -EBUSY;
-                        break;
-                }
                /*
                 * Tasks are frozen and the notifiers have been called with
                 * PM_HIBERNATION_PREPARE
                 */
                error = suspend_devices_and_enter(PM_SUSPEND_MEM);
-                mutex_unlock(&pm_mutex);
                break;
        case SNAPSHOT_PLATFORM_SUPPORT:
@@ -390,6 +407,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
        }
+        mutex_unlock(&pm_mutex);
        return error;
 }
@@ -399,7 +418,7 @@ static const struct file_operations snapshot_fops = {
        .read = snapshot_read,
        .write = snapshot_write,
        .llseek = no_llseek,
-        .ioctl = snapshot_ioctl,
+        .unlocked_ioctl = snapshot_ioctl,
 };
 static struct miscdevice snapshot_device = {
diff --git a/kernel/printk.c b/kernel/printk.c
index e2129e83fd75..a7f7559c5f6c 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -38,7 +38,7 @@
 /*
 * Architectures can override it:
 */
-void __attribute__((weak)) early_printk(const char *fmt, ...)
+void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
 {
 }
@@ -75,6 +75,8 @@ EXPORT_SYMBOL(oops_in_progress);
 static DECLARE_MUTEX(console_sem);
 static DECLARE_MUTEX(secondary_console_sem);
 struct console *console_drivers;
+EXPORT_SYMBOL_GPL(console_drivers);
 /*
 * This is used for debugging the mess that is the VT code by
 * keeping track if we have the console semaphore held. It's
@@ -121,6 +123,8 @@ struct console_cmdline
 static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
 static int selected_console = -1;
 static int preferred_console = -1;
+int console_set_on_cmdline;
+EXPORT_SYMBOL(console_set_on_cmdline);
 /* Flag: console code may call schedule() */
 static int console_may_schedule;
@@ -231,7 +235,7 @@ static inline void boot_delay_msec(void)
 /*
 * Return the number of unread characters in the log buffer.
 */
-int log_buf_get_len(void)
+static int log_buf_get_len(void)
 {
        return logged_chars;
 }
@@ -268,19 +272,6 @@ int log_buf_copy(char *dest, int idx, int len)
 }
 /*
- * Extract a single character from the log buffer.
- */
-int log_buf_read(int idx)
-{
-        char ret;
-        if (log_buf_copy(&ret, idx, 1) == 1)
-                return ret;
-        else
-                return -1;
-}
-/*
 * Commands to do_syslog:
 *
 *      0 -- Close the log.  Currently a NOP.
@@ -665,18 +656,17 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu)
        spin_unlock(&logbuf_lock);
        return retval;
 }
+static const char recursion_bug_msg [] =
-static const char printk_recursion_bug_msg [] =
+                KERN_CRIT "BUG: recent printk recursion!\n";
-                        KERN_CRIT "BUG: recent printk recursion!\n";
+static int recursion_bug;
-static int printk_recursion_bug;
+        static int new_text_line = 1;
+static char printk_buf[1024];
 asmlinkage int vprintk(const char *fmt, va_list args)
 {
-        static int log_level_unknown = 1;
-        static char printk_buf[1024];
-        unsigned long flags;
        int printed_len = 0;
+        int current_log_level = default_message_loglevel;
+        unsigned long flags;
        int this_cpu;
        char *p;
@@ -699,7 +689,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
                 * it can be printed at the next appropriate moment:
                 */
                if (!oops_in_progress) {
-                        printk_recursion_bug = 1;
+                        recursion_bug = 1;
                        goto out_restore_irqs;
                }
                zap_locks();
@@ -709,70 +699,62 @@ asmlinkage int vprintk(const char *fmt, va_list args)
        spin_lock(&logbuf_lock);
        printk_cpu = this_cpu;
-        if (printk_recursion_bug) {
+        if (recursion_bug) {
-                printk_recursion_bug = 0;
+                recursion_bug = 0;
-                strcpy(printk_buf, printk_recursion_bug_msg);
+                strcpy(printk_buf, recursion_bug_msg);
-                printed_len = sizeof(printk_recursion_bug_msg);
+                printed_len = strlen(recursion_bug_msg); /* omit the NUL */
         }
        /* Emit the output into the temporary buffer */
        printed_len += vscnprintf(printk_buf + printed_len,
                                  sizeof(printk_buf) - printed_len, fmt, args);
        /*
         * Copy the output into log_buf.  If the caller didn't provide
         * appropriate log level tags, we insert them here
         */
        for (p = printk_buf; *p; p++) {
-                if (log_level_unknown) {
+                if (new_text_line) {
-                        /* log_level_unknown signals the start of a new line */
+                        /* If a token, set current_log_level and skip over */
+                        if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' &&
+                            p[2] == '>') {
+                                current_log_level = p[1] - '0';
+                                p += 3;
+                                printed_len -= 3;
+                        }
+                        /* Always output the token */
+                        emit_log_char('<');
+                        emit_log_char(current_log_level + '0');
+                        emit_log_char('>');
+                        printed_len += 3;
+                        new_text_line = 0;
                        if (printk_time) {
-                                int loglev_char;
+                                /* Follow the token with the time */
                                char tbuf[50], *tp;
                                unsigned tlen;
                                unsigned long long t;
                                unsigned long nanosec_rem;
-                                /*
-                                 * force the log level token to be
-                                 * before the time output.
-                                 */
-                                if (p[0] == '<' && p[1] >='0' &&
-                                   p[1] <= '7' && p[2] == '>') {
-                                        loglev_char = p[1];
-                                        p += 3;
-                                        printed_len -= 3;
-                                } else {
-                                        loglev_char = default_message_loglevel
-                                                + '0';
-                                }
                                t = cpu_clock(printk_cpu);
                                nanosec_rem = do_div(t, 1000000000);
-                                tlen = sprintf(tbuf,
+                                tlen = sprintf(tbuf, "[%5lu.%06lu] ",
-                                                "<%c>[%5lu.%06lu] ",
+                                                (unsigned long) t,
-                                                loglev_char,
+                                                nanosec_rem / 1000);
-                                                (unsigned long)t,
-                                                nanosec_rem/1000);
                                for (tp = tbuf; tp < tbuf + tlen; tp++)
                                        emit_log_char(*tp);
                                printed_len += tlen;
-                        } else {
-                                if (p[0] != '<' || p[1] < '0' ||
-                                   p[1] > '7' || p[2] != '>') {
-                                        emit_log_char('<');
-                                        emit_log_char(default_message_loglevel
-                                                + '0');
-                                        emit_log_char('>');
-                                        printed_len += 3;
-                                }
                        }
-                        log_level_unknown = 0;
                        if (!*p)
                                break;
                }
                emit_log_char(*p);
                if (*p == '\n')
-                        log_level_unknown = 1;
+                        new_text_line = 1;
        }
        /*
@@ -890,6 +872,7 @@ static int __init console_setup(char *str)
        *s = 0;
        __add_preferred_console(buf, idx, options, brl_options);
+        console_set_on_cmdline = 1;
        return 1;
 }
 __setup("console=", console_setup);
@@ -950,7 +933,7 @@ void suspend_console(void)
 {
        if (!console_suspend_enabled)
                return;
-        printk("Suspending console(s)\n");
+        printk("Suspending console(s) (use no_console_suspend to debug)\n");
        acquire_console_sem();
        console_suspended = 1;
 }
@@ -1041,7 +1024,9 @@ void release_console_sem(void)
                _log_end = log_end;
                con_start = log_end;            /* Flush */
                spin_unlock(&logbuf_lock);
+                stop_critical_timings();        /* don't trace print latency */
                call_console_drivers(_con_start, _log_end);
+                start_critical_timings();
                local_irq_restore(flags);
        }
        console_locked = 0;
@@ -1172,8 +1157,11 @@ void register_console(struct console *console)
                        console->index = 0;
                if (console->setup == NULL ||
                    console->setup(console, NULL) == 0) {
-                        console->flags |= CON_ENABLED | CON_CONSDEV;
+                        console->flags |= CON_ENABLED;
-                        preferred_console = 0;
+                        if (console->device) {
+                                console->flags |= CON_CONSDEV;
+                                preferred_console = 0;
+                        }
                }
        }
@@ -1320,6 +1308,8 @@ void tty_write_message(struct tty_struct *tty, char *msg)
 }
 #if defined CONFIG_PRINTK
+DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
 /*
 * printk rate limiting, lifted from the networking subsystem.
 *
@@ -1327,22 +1317,9 @@ void tty_write_message(struct tty_struct *tty, char *msg)
 * every printk_ratelimit_jiffies to make a denial-of-service
 * attack impossible.
 */
-int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
-{
-        return __ratelimit(ratelimit_jiffies, ratelimit_burst);
-}
-EXPORT_SYMBOL(__printk_ratelimit);
-/* minimum time in jiffies between messages */
-int printk_ratelimit_jiffies = 5 * HZ;
-/* number of messages we send before ratelimiting */
-int printk_ratelimit_burst = 10;
 int printk_ratelimit(void)
 {
-        return __printk_ratelimit(printk_ratelimit_jiffies,
+        return __ratelimit(&printk_ratelimit_state);
-                                printk_ratelimit_burst);
 }
 EXPORT_SYMBOL(printk_ratelimit);
diff --git a/kernel/profile.c b/kernel/profile.c
index ae7ead82cbc9..cd26bed4cc26 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -112,8 +112,6 @@ void __init profile_init(void)
 /* Profile event notifications */
-#ifdef CONFIG_PROFILING
 static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
 static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
 static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
@@ -203,8 +201,6 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *))
 }
 EXPORT_SYMBOL_GPL(unregister_timer_hook);
-#endif /* CONFIG_PROFILING */
 #ifdef CONFIG_SMP
 /*
@@ -252,7 +248,7 @@ static void profile_flip_buffers(void)
        mutex_lock(&profile_flip_mutex);
        j = per_cpu(cpu_profile_flip, get_cpu());
        put_cpu();
-        on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
+        on_each_cpu(__profile_flip_buffers, NULL, 1);
        for_each_online_cpu(cpu) {
                struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
                for (i = 0; i < NR_PROFILE_HIT; ++i) {
@@ -275,7 +271,7 @@ static void profile_discard_flip_buffers(void)
        mutex_lock(&profile_flip_mutex);
        i = per_cpu(cpu_profile_flip, get_cpu());
        put_cpu();
-        on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
+        on_each_cpu(__profile_flip_buffers, NULL, 1);
        for_each_online_cpu(cpu) {
                struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
                memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
@@ -558,7 +554,7 @@ static int __init create_hash_tables(void)
 out_cleanup:
        prof_on = 0;
        smp_mb();
-        on_each_cpu(profile_nop, NULL, 0, 1);
+        on_each_cpu(profile_nop, NULL, 1);
        for_each_online_cpu(cpu) {
                struct page *page;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6c19e94fd0a5..082b3fcb32a0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -33,13 +33,9 @@
 */
 void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
 {
-        BUG_ON(!list_empty(&child->ptrace_list));
+        BUG_ON(!list_empty(&child->ptrace_entry));
-        if (child->parent == new_parent)
+        list_add(&child->ptrace_entry, &new_parent->ptraced);
-                return;
-        list_add(&child->ptrace_list, &child->parent->ptrace_children);
-        remove_parent(child);
        child->parent = new_parent;
-        add_parent(child);
 }
 
 /*
@@ -73,12 +69,8 @@ void __ptrace_unlink(struct task_struct *child)
        BUG_ON(!child->ptrace);
        child->ptrace = 0;
-        if (ptrace_reparented(child)) {
+        child->parent = child->real_parent;
-                list_del_init(&child->ptrace_list);
+        list_del_init(&child->ptrace_entry);
-                remove_parent(child);
-                child->parent = child->real_parent;
-                add_parent(child);
-        }
        if (task_is_traced(child))
                ptrace_untrace(child);
@@ -115,13 +107,13 @@ int ptrace_check_attach(struct task_struct *child, int kill)
        read_unlock(&tasklist_lock);
        if (!ret && !kill)
-                wait_task_inactive(child);
+                ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
        /* All systems go.. */
        return ret;
 }
-int __ptrace_may_attach(struct task_struct *task)
+int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 {
        /* May we inspect the given task?
         * This check is used both for attaching with ptrace
@@ -148,16 +140,16 @@ int __ptrace_may_attach(struct task_struct *task)
        if (!dumpable && !capable(CAP_SYS_PTRACE))
                return -EPERM;
-        return security_ptrace(current, task);
+        return security_ptrace(current, task, mode);
 }
-int ptrace_may_attach(struct task_struct *task)
+bool ptrace_may_access(struct task_struct *task, unsigned int mode)
 {
        int err;
        task_lock(task);
-        err = __ptrace_may_attach(task);
+        err = __ptrace_may_access(task, mode);
        task_unlock(task);
-        return !err;
+        return (!err ? true : false);
 }
 int ptrace_attach(struct task_struct *task)
@@ -195,7 +187,7 @@ repeat:
        /* the same process cannot be attached many times */
        if (task->ptrace & PT_PTRACED)
                goto bad;
-        retval = __ptrace_may_attach(task);
+        retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
        if (retval)
                goto bad;
@@ -492,14 +484,34 @@ int ptrace_traceme(void)
        /*
         * Are we already being traced?
         */
+repeat:
        task_lock(current);
        if (!(current->ptrace & PT_PTRACED)) {
-                ret = security_ptrace(current->parent, current);
+                /*
+                 * See ptrace_attach() comments about the locking here.
+                 */
+                unsigned long flags;
+                if (!write_trylock_irqsave(&tasklist_lock, flags)) {
+                        task_unlock(current);
+                        do {
+                                cpu_relax();
+                        } while (!write_can_lock(&tasklist_lock));
+                        goto repeat;
+                }
+                ret = security_ptrace(current->parent, current,
+                                      PTRACE_MODE_ATTACH);
                /*
                 * Set the ptrace bit in the process ptrace flags.
+                 * Then link us on our parent's ptraced list.
                 */
-                if (!ret)
+                if (!ret) {
                        current->ptrace |= PT_PTRACED;
+                        __ptrace_link(current, current->real_parent);
+                }
+                write_unlock_irqrestore(&tasklist_lock, flags);
        }
        task_unlock(current);
        return ret;
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index a38895a5b8e2..6f8696c502f4 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -106,7 +106,7 @@ static void force_quiescent_state(struct rcu_data *rdp,
                 */
                cpus_and(cpumask, rcp->cpumask, cpu_online_map);
                cpu_clear(rdp->cpu, cpumask);
-                for_each_cpu_mask(cpu, cpumask)
+                for_each_cpu_mask_nr(cpu, cpumask)
                        smp_send_reschedule(cpu);
        }
 }
@@ -387,6 +387,10 @@ static void __rcu_offline_cpu(struct rcu_data *this_rdp,
        rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
        rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
        rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
+        local_irq_disable();
+        this_rdp->qlen += rdp->qlen;
+        local_irq_enable();
 }
 static void rcu_offline_cpu(int cpu)
@@ -516,10 +520,38 @@ void rcu_check_callbacks(int cpu, int user)
        if (user ||
            (idle_cpu(cpu) && !in_softirq() &&
                                hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+                /*
+                 * Get here if this CPU took its interrupt from user
+                 * mode or from the idle loop, and if this is not a
+                 * nested interrupt.  In this case, the CPU is in
+                 * a quiescent state, so count it.
+                 *
+                 * Also do a memory barrier.  This is needed to handle
+                 * the case where writes from a preempt-disable section
+                 * of code get reordered into schedule() by this CPU's
+                 * write buffer.  The memory barrier makes sure that
+                 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are seen
+                 * by other CPUs to happen after any such write.
+                 */
+                smp_mb();  /* See above block comment. */
                rcu_qsctr_inc(cpu);
                rcu_bh_qsctr_inc(cpu);
-        } else if (!in_softirq())
+        } else if (!in_softirq()) {
+                /*
+                 * Get here if this CPU did not take its interrupt from
+                 * softirq, in other words, if it is not interrupting
+                 * a rcu_bh read-side critical section.  This is an _bh
+                 * quiescent state, so count it.  The memory barrier
+                 * is needed for the same reason as is the above one.
+                 */
+                smp_mb();  /* See above block comment. */
                rcu_bh_qsctr_inc(cpu);
+        }
        raise_rcu_softirq();
 }
@@ -543,7 +575,7 @@ static void __cpuinit rcu_online_cpu(int cpu)
        rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
        rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
-        open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
+        open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
 static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c09605f8d16c..f14f372cf6f5 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -39,16 +39,16 @@
 #include <linux/sched.h>
 #include <asm/atomic.h>
 #include <linux/bitops.h>
-#include <linux/completion.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/mutex.h>
 #include <linux/module.h>
-struct rcu_synchronize {
+enum rcu_barrier {
-        struct rcu_head head;
+        RCU_BARRIER_STD,
-        struct completion completion;
+        RCU_BARRIER_BH,
+        RCU_BARRIER_SCHED,
 };
 static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
@@ -60,7 +60,7 @@ static struct completion rcu_barrier_completion;
 * Awaken the corresponding synchronize_rcu() instance now that a
 * grace period has elapsed.
 */
-static void wakeme_after_rcu(struct rcu_head  *head)
+void wakeme_after_rcu(struct rcu_head  *head)
 {
        struct rcu_synchronize *rcu;
@@ -77,17 +77,7 @@ static void wakeme_after_rcu(struct rcu_head  *head)
 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
 * and may be nested.
 */
-void synchronize_rcu(void)
+synchronize_rcu_xxx(synchronize_rcu, call_rcu)
-{
-        struct rcu_synchronize rcu;
-        init_completion(&rcu.completion);
-        /* Will wake me after RCU finished */
-        call_rcu(&rcu.head, wakeme_after_rcu);
-        /* Wait for it */
-        wait_for_completion(&rcu.completion);
-}
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 static void rcu_barrier_callback(struct rcu_head *notused)
@@ -99,19 +89,30 @@ static void rcu_barrier_callback(struct rcu_head *notused)
 /*
 * Called with preemption disabled, and from cross-cpu IRQ context.
 */
-static void rcu_barrier_func(void *notused)
+static void rcu_barrier_func(void *type)
 {
        int cpu = smp_processor_id();
        struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
        atomic_inc(&rcu_barrier_cpu_count);
-        call_rcu(head, rcu_barrier_callback);
+        switch ((enum rcu_barrier)type) {
+        case RCU_BARRIER_STD:
+                call_rcu(head, rcu_barrier_callback);
+                break;
+        case RCU_BARRIER_BH:
+                call_rcu_bh(head, rcu_barrier_callback);
+                break;
+        case RCU_BARRIER_SCHED:
+                call_rcu_sched(head, rcu_barrier_callback);
+                break;
+        }
 }
-/**
+/*
- * rcu_barrier - Wait until all the in-flight RCUs are complete.
+ * Orchestrate the specified type of RCU barrier, waiting for all
+ * RCU callbacks of the specified type to complete.
 */
-void rcu_barrier(void)
+static void _rcu_barrier(enum rcu_barrier type)
 {
        BUG_ON(in_interrupt());
        /* Take cpucontrol mutex to protect against CPU hotplug */
@@ -127,13 +128,39 @@ void rcu_barrier(void)
         * until all the callbacks are queued.
         */
        rcu_read_lock();
-        on_each_cpu(rcu_barrier_func, NULL, 0, 1);
+        on_each_cpu(rcu_barrier_func, (void *)type, 1);
        rcu_read_unlock();
        wait_for_completion(&rcu_barrier_completion);
        mutex_unlock(&rcu_barrier_mutex);
 }
+/**
+ * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
+ */
+void rcu_barrier(void)
+{
+        _rcu_barrier(RCU_BARRIER_STD);
+}
 EXPORT_SYMBOL_GPL(rcu_barrier);
+/**
+ * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
+ */
+void rcu_barrier_bh(void)
+{
+        _rcu_barrier(RCU_BARRIER_BH);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_bh);
+/**
+ * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
+ */
+void rcu_barrier_sched(void)
+{
+        _rcu_barrier(RCU_BARRIER_SCHED);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_sched);
 void __init rcu_init(void)
 {
        __rcu_init();
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 41d275a81df5..27827931ca0d 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -46,11 +46,11 @@
 #include <asm/atomic.h>
 #include <linux/bitops.h>
 #include <linux/module.h>
+#include <linux/kthread.h>
 #include <linux/completion.h>
 #include <linux/moduleparam.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
-#include <linux/rcupdate.h>
 #include <linux/cpu.h>
 #include <linux/random.h>
 #include <linux/delay.h>
@@ -82,14 +82,18 @@ struct rcu_data {
        spinlock_t      lock;           /* Protect rcu_data fields. */
        long            completed;      /* Number of last completed batch. */
        int             waitlistcount;
-        struct tasklet_struct rcu_tasklet;
        struct rcu_head *nextlist;
        struct rcu_head **nexttail;
        struct rcu_head *waitlist[GP_STAGES];
        struct rcu_head **waittail[GP_STAGES];
-        struct rcu_head *donelist;
+        struct rcu_head *donelist;      /* from waitlist & waitschedlist */
        struct rcu_head **donetail;
        long rcu_flipctr[2];
+        struct rcu_head *nextschedlist;
+        struct rcu_head **nextschedtail;
+        struct rcu_head *waitschedlist;
+        struct rcu_head **waitschedtail;
+        int rcu_sched_sleeping;
 #ifdef CONFIG_RCU_TRACE
        struct rcupreempt_trace trace;
 #endif /* #ifdef CONFIG_RCU_TRACE */
@@ -131,11 +135,24 @@ enum rcu_try_flip_states {
        rcu_try_flip_waitmb_state,
 };
+/*
+ * States for rcu_ctrlblk.sched_sleep.
+ */
+enum rcu_sched_sleep_states {
+        rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP.  */
+        rcu_sched_sleep_prep,   /* Thinking of sleeping, rechecking. */
+        rcu_sched_sleeping,     /* Sleeping, awaken if GP needed. */
+};
 struct rcu_ctrlblk {
        spinlock_t      fliplock;       /* Protect state-machine transitions. */
        long            completed;      /* Number of last completed batch. */
        enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
                                                        the rcu state machine */
+        spinlock_t      schedlock;      /* Protect rcu_sched sleep state. */
+        enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
+        wait_queue_head_t sched_wq;     /* Place for rcu_sched to sleep. */
 };
 static DEFINE_PER_CPU(struct rcu_data, rcu_data);
@@ -143,8 +160,12 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
        .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
        .completed = 0,
        .rcu_try_flip_state = rcu_try_flip_idle_state,
+        .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
+        .sched_sleep = rcu_sched_not_sleeping,
+        .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
 };
+static struct task_struct *rcu_sched_grace_period_task;
 #ifdef CONFIG_RCU_TRACE
 static char *rcu_try_flip_state_names[] =
@@ -207,6 +228,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
 */
 #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
+#define RCU_SCHED_BATCH_TIME (HZ / 50)
 /*
 * Return the number of RCU batches processed thus far.  Useful
 * for debug and statistics.
@@ -411,32 +434,34 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
        }
 }
-#ifdef CONFIG_NO_HZ
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
+        .dynticks = 1,
+};
-DEFINE_PER_CPU(long, dynticks_progress_counter) = 1;
+#ifdef CONFIG_NO_HZ
-static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
 static DEFINE_PER_CPU(int, rcu_update_flag);
 /**
 * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
 *
 * If the CPU was idle with dynamic ticks active, this updates the
- * dynticks_progress_counter to let the RCU handling know that the
+ * rcu_dyntick_sched.dynticks to let the RCU handling know that the
 * CPU is active.
 */
 void rcu_irq_enter(void)
 {
        int cpu = smp_processor_id();
+        struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
        if (per_cpu(rcu_update_flag, cpu))
                per_cpu(rcu_update_flag, cpu)++;
        /*
         * Only update if we are coming from a stopped ticks mode
-         * (dynticks_progress_counter is even).
+         * (rcu_dyntick_sched.dynticks is even).
         */
        if (!in_interrupt() &&
-            (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) {
+            (rdssp->dynticks & 0x1) == 0) {
                /*
                 * The following might seem like we could have a race
                 * with NMI/SMIs. But this really isn't a problem.
@@ -459,12 +484,12 @@ void rcu_irq_enter(void)
                 * RCU read-side critical sections on this CPU would
                 * have already completed.
                 */
-                per_cpu(dynticks_progress_counter, cpu)++;
+                rdssp->dynticks++;
                /*
                 * The following memory barrier ensures that any
                 * rcu_read_lock() primitives in the irq handler
                 * are seen by other CPUs to follow the above
-                 * increment to dynticks_progress_counter. This is
+                 * increment to rcu_dyntick_sched.dynticks. This is
                 * required in order for other CPUs to correctly
                 * determine when it is safe to advance the RCU
                 * grace-period state machine.
@@ -472,7 +497,7 @@ void rcu_irq_enter(void)
                smp_mb(); /* see above block comment. */
                /*
                 * Since we can't determine the dynamic tick mode from
-                 * the dynticks_progress_counter after this routine,
+                 * the rcu_dyntick_sched.dynticks after this routine,
                 * we use a second flag to acknowledge that we came
                 * from an idle state with ticks stopped.
                 */
@@ -480,7 +505,7 @@ void rcu_irq_enter(void)
                /*
                 * If we take an NMI/SMI now, they will also increment
                 * the rcu_update_flag, and will not update the
-                 * dynticks_progress_counter on exit. That is for
+                 * rcu_dyntick_sched.dynticks on exit. That is for
                 * this IRQ to do.
                 */
        }
@@ -490,12 +515,13 @@ void rcu_irq_enter(void)
 * rcu_irq_exit - Called from exiting Hard irq context.
 *
 * If the CPU was idle with dynamic ticks active, update the
- * dynticks_progress_counter to put let the RCU handling be
+ * rcu_dyntick_sched.dynticks to let the RCU handling be
 * aware that the CPU is going back to idle with no ticks.
 */
 void rcu_irq_exit(void)
 {
        int cpu = smp_processor_id();
+        struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
        /*
         * rcu_update_flag is set if we interrupted the CPU
@@ -503,7 +529,7 @@ void rcu_irq_exit(void)
         * Once this occurs, we keep track of interrupt nesting
         * because a NMI/SMI could also come in, and we still
         * only want the IRQ that started the increment of the
-         * dynticks_progress_counter to be the one that modifies
+         * rcu_dyntick_sched.dynticks to be the one that modifies
         * it on exit.
         */
        if (per_cpu(rcu_update_flag, cpu)) {
@@ -515,28 +541,29 @@ void rcu_irq_exit(void)
                /*
                 * If an NMI/SMI happens now we are still
-                 * protected by the dynticks_progress_counter being odd.
+                 * protected by the rcu_dyntick_sched.dynticks being odd.
                 */
                /*
                 * The following memory barrier ensures that any
                 * rcu_read_unlock() primitives in the irq handler
                 * are seen by other CPUs to preceed the following
-                 * increment to dynticks_progress_counter. This
+                 * increment to rcu_dyntick_sched.dynticks. This
                 * is required in order for other CPUs to determine
                 * when it is safe to advance the RCU grace-period
                 * state machine.
                 */
                smp_mb(); /* see above block comment. */
-                per_cpu(dynticks_progress_counter, cpu)++;
+                rdssp->dynticks++;
-                WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1);
+                WARN_ON(rdssp->dynticks & 0x1);
        }
 }
 static void dyntick_save_progress_counter(int cpu)
 {
-        per_cpu(rcu_dyntick_snapshot, cpu) =
+        struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-                per_cpu(dynticks_progress_counter, cpu);
+        rdssp->dynticks_snap = rdssp->dynticks;
 }
 static inline int
@@ -544,9 +571,10 @@ rcu_try_flip_waitack_needed(int cpu)
 {
        long curr;
        long snap;
+        struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-        curr = per_cpu(dynticks_progress_counter, cpu);
+        curr = rdssp->dynticks;
-        snap = per_cpu(rcu_dyntick_snapshot, cpu);
+        snap = rdssp->dynticks_snap;
        smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
        /*
@@ -567,7 +595,7 @@ rcu_try_flip_waitack_needed(int cpu)
         * that this CPU already acknowledged the counter.
         */
-        if ((curr - snap) > 2 || (snap & 0x1) == 0)
+        if ((curr - snap) > 2 || (curr & 0x1) == 0)
                return 0;
        /* We need this CPU to explicitly acknowledge the counter flip. */
@@ -580,9 +608,10 @@ rcu_try_flip_waitmb_needed(int cpu)
 {
        long curr;
        long snap;
+        struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-        curr = per_cpu(dynticks_progress_counter, cpu);
+        curr = rdssp->dynticks;
-        snap = per_cpu(rcu_dyntick_snapshot, cpu);
+        snap = rdssp->dynticks_snap;
        smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
        /*
@@ -609,14 +638,86 @@ rcu_try_flip_waitmb_needed(int cpu)
        return 1;
 }
+static void dyntick_save_progress_counter_sched(int cpu)
+{
+        struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+        rdssp->sched_dynticks_snap = rdssp->dynticks;
+}
+static int rcu_qsctr_inc_needed_dyntick(int cpu)
+{
+        long curr;
+        long snap;
+        struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+        curr = rdssp->dynticks;
+        snap = rdssp->sched_dynticks_snap;
+        smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+        /*
+         * If the CPU remained in dynticks mode for the entire time
+         * and didn't take any interrupts, NMIs, SMIs, or whatever,
+         * then it cannot be in the middle of an rcu_read_lock(), so
+         * the next rcu_read_lock() it executes must use the new value
+         * of the counter.  Therefore, this CPU has been in a quiescent
+         * state the entire time, and we don't need to wait for it.
+         */
+        if ((curr == snap) && ((curr & 0x1) == 0))
+                return 0;
+        /*
+         * If the CPU passed through or entered a dynticks idle phase with
+         * no active irq handlers, then, as above, this CPU has already
+         * passed through a quiescent state.
+         */
+        if ((curr - snap) > 2 || (snap & 0x1) == 0)
+                return 0;
+        /* We need this CPU to go through a quiescent state. */
+        return 1;
+}
 #else /* !CONFIG_NO_HZ */
-# define dyntick_save_progress_counter(cpu)     do { } while (0)
+# define dyntick_save_progress_counter(cpu)             do { } while (0)
-# define rcu_try_flip_waitack_needed(cpu)       (1)
+# define rcu_try_flip_waitack_needed(cpu)               (1)
-# define rcu_try_flip_waitmb_needed(cpu)        (1)
+# define rcu_try_flip_waitmb_needed(cpu)                (1)
+# define dyntick_save_progress_counter_sched(cpu)       do { } while (0)
+# define rcu_qsctr_inc_needed_dyntick(cpu)              (1)
 #endif /* CONFIG_NO_HZ */
+static void save_qsctr_sched(int cpu)
+{
+        struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+        rdssp->sched_qs_snap = rdssp->sched_qs;
+}
+static inline int rcu_qsctr_inc_needed(int cpu)
+{
+        struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+        /*
+         * If there has been a quiescent state, no more need to wait
+         * on this CPU.
+         */
+        if (rdssp->sched_qs != rdssp->sched_qs_snap) {
+                smp_mb(); /* force ordering with cpu entering schedule(). */
+                return 0;
+        }
+        /* We need this CPU to go through a quiescent state. */
+        return 1;
+}
 /*
 * Get here when RCU is idle.  Decide whether we need to
 * move out of idle state, and return non-zero if so.
@@ -655,7 +756,7 @@ rcu_try_flip_idle(void)
        /* Now ask each CPU for acknowledgement of the flip. */
-        for_each_cpu_mask(cpu, rcu_cpu_online_map) {
+        for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
                per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
                dyntick_save_progress_counter(cpu);
        }
@@ -673,7 +774,7 @@ rcu_try_flip_waitack(void)
        int cpu;
        RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
-        for_each_cpu_mask(cpu, rcu_cpu_online_map)
+        for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
                if (rcu_try_flip_waitack_needed(cpu) &&
                    per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
                        RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
@@ -705,7 +806,7 @@ rcu_try_flip_waitzero(void)
        /* Check to see if the sum of the "last" counters is zero. */
        RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
-        for_each_cpu_mask(cpu, rcu_cpu_online_map)
+        for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
                sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
        if (sum != 0) {
                RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
@@ -720,7 +821,7 @@ rcu_try_flip_waitzero(void)
        smp_mb();  /*  ^^^^^^^^^^^^ */
        /* Call for a memory barrier from each CPU. */
-        for_each_cpu_mask(cpu, rcu_cpu_online_map) {
+        for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
                per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
                dyntick_save_progress_counter(cpu);
        }
@@ -740,7 +841,7 @@ rcu_try_flip_waitmb(void)
        int cpu;
        RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
-        for_each_cpu_mask(cpu, rcu_cpu_online_map)
+        for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
                if (rcu_try_flip_waitmb_needed(cpu) &&
                    per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
                        RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
@@ -819,6 +920,26 @@ void rcu_check_callbacks(int cpu, int user)
        unsigned long flags;
        struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+        /*
+         * If this CPU took its interrupt from user mode or from the
+         * idle loop, and this is not a nested interrupt, then
+         * this CPU has to have exited all prior preempt-disable
+         * sections of code.  So increment the counter to note this.
+         *
+         * The memory barrier is needed to handle the case where
+         * writes from a preempt-disable section of code get reordered
+         * into schedule() by this CPU's write buffer.  So the memory
+         * barrier makes sure that the rcu_qsctr_inc() is seen by other
+         * CPUs to happen after any such write.
+         */
+        if (user ||
+            (idle_cpu(cpu) && !in_softirq() &&
+             hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+                smp_mb();       /* Guard against aggressive schedule(). */
+                rcu_qsctr_inc(cpu);
+        }
        rcu_check_mb(cpu);
        if (rcu_ctrlblk.completed == rdp->completed)
                rcu_try_flip();
@@ -869,6 +990,8 @@ void rcu_offline_cpu(int cpu)
        struct rcu_head *list = NULL;
        unsigned long flags;
        struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+        struct rcu_head *schedlist = NULL;
+        struct rcu_head **schedtail = &schedlist;
        struct rcu_head **tail = &list;
        /*
@@ -882,6 +1005,11 @@ void rcu_offline_cpu(int cpu)
                rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
                                                list, tail);
        rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
+        rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
+                                schedlist, schedtail);
+        rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
+                                schedlist, schedtail);
+        rdp->rcu_sched_sleeping = 0;
        spin_unlock_irqrestore(&rdp->lock, flags);
        rdp->waitlistcount = 0;
@@ -916,12 +1044,15 @@ void rcu_offline_cpu(int cpu)
         * fix.
         */
-        local_irq_save(flags);
+        local_irq_save(flags);  /* disable preempt till we know what lock. */
        rdp = RCU_DATA_ME();
        spin_lock(&rdp->lock);
        *rdp->nexttail = list;
        if (list)
                rdp->nexttail = tail;
+        *rdp->nextschedtail = schedlist;
+        if (schedlist)
+                rdp->nextschedtail = schedtail;
        spin_unlock_irqrestore(&rdp->lock, flags);
 }
@@ -936,10 +1067,25 @@ void rcu_offline_cpu(int cpu)
 void __cpuinit rcu_online_cpu(int cpu)
 {
        unsigned long flags;
+        struct rcu_data *rdp;
        spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
        cpu_set(cpu, rcu_cpu_online_map);
        spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
+        /*
+         * The rcu_sched grace-period processing might have bypassed
+         * this CPU, given that it was not in the rcu_cpu_online_map
+         * when the grace-period scan started.  This means that the
+         * grace-period task might sleep.  So make sure that if this
+         * should happen, the first callback posted to this CPU will
+         * wake up the grace-period task if need be.
+         */
+        rdp = RCU_DATA_CPU(cpu);
+        spin_lock_irqsave(&rdp->lock, flags);
+        rdp->rcu_sched_sleeping = 1;
+        spin_unlock_irqrestore(&rdp->lock, flags);
 }
 static void rcu_process_callbacks(struct softirq_action *unused)
@@ -982,31 +1128,196 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
        *rdp->nexttail = head;
        rdp->nexttail = &head->next;
        RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
-        spin_unlock(&rdp->lock);
+        spin_unlock_irqrestore(&rdp->lock, flags);
-        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(call_rcu);
+/*
+ * Queue a callback on the current CPU's rcu_sched "next" list.
+ * @head: callback structure to enqueue; ->func and ->next are overwritten.
+ * @func: function stored in @head->func.
+ *
+ * If this CPU's rcu_sched_sleeping flag is set, it is cleared and the
+ * waiters on rcu_ctrlblk.sched_wq are woken, unless rcu_ctrlblk.sched_sleep
+ * had already left the rcu_sched_sleeping state.
+ */
+void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+        unsigned long flags;
+        struct rcu_data *rdp;
+        int wake_gp = 0;
+        head->func = func;
+        head->next = NULL;
+        local_irq_save(flags);
+        rdp = RCU_DATA_ME();
+        spin_lock(&rdp->lock);
+        *rdp->nextschedtail = head;
+        rdp->nextschedtail = &head->next;
+        if (rdp->rcu_sched_sleeping) {
+                /* Grace-period processing might be sleeping... */
+                rdp->rcu_sched_sleeping = 0;
+                wake_gp = 1;
+        }
+        /* Drops rdp->lock and restores the state saved by local_irq_save(). */
+        spin_unlock_irqrestore(&rdp->lock, flags);
+        if (wake_gp) {
+                /* Wake up grace-period processing, unless someone beat us. */
+                spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+                if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
+                        wake_gp = 0;
+                /*
+                 * Always move to the not-sleeping state, so a grace-period
+                 * task still in rcu_sched_sleep_prep sees the state change.
+                 */
+                rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
+                spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+                if (wake_gp)
+                        wake_up_interruptible(&rcu_ctrlblk.sched_wq);
+        }
+}
+EXPORT_SYMBOL_GPL(call_rcu_sched);
 /*
 * Wait until all currently running preempt_disable() code segments
 * (including hardware-irq-disable segments) complete.  Note that
 * in -rt this does -not- necessarily result in all currently executing
 * interrupt -handlers- having completed.
 */
-void __synchronize_sched(void)
+synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched)
+EXPORT_SYMBOL_GPL(__synchronize_sched);
+/*
+ * kthread function that manages call_rcu_sched grace periods.
+ */
+static int rcu_sched_grace_period(void *arg)
 {
-        cpumask_t oldmask;
+        int couldsleep;         /* might sleep after current pass. */
+        int couldsleepnext = 0; /* might sleep after next pass. */
        int cpu;
+        unsigned long flags;
+        struct rcu_data *rdp;
+        int ret;
-        if (sched_getaffinity(0, &oldmask) < 0)
+        /*
-                oldmask = cpu_possible_map;
+         * Each pass through the following loop handles one
-        for_each_online_cpu(cpu) {
+         * rcu_sched grace period cycle.
-                sched_setaffinity(0, &cpumask_of_cpu(cpu));
+         */
-                schedule();
+        do {
-        }
+                /* Save each CPU's current state. */
-        sched_setaffinity(0, &oldmask);
+                for_each_online_cpu(cpu) {
+                        dyntick_save_progress_counter_sched(cpu);
+                        save_qsctr_sched(cpu);
+                }
+                /*
+                 * Sleep for about an RCU grace-period's worth to
+                 * allow better batching and to consume less CPU.
+                 */
+                schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
+                /*
+                 * If there was nothing to do last time, prepare to
+                 * sleep at the end of the current grace period cycle.
+                 */
+                couldsleep = couldsleepnext;
+                couldsleepnext = 1;
+                if (couldsleep) {
+                        spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+                        rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
+                        spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+                }
+                /*
+                 * Wait on each CPU in turn to have either visited
+                 * a quiescent state or been in dynticks-idle mode.
+                 */
+                for_each_online_cpu(cpu) {
+                        while (rcu_qsctr_inc_needed(cpu) &&
+                               rcu_qsctr_inc_needed_dyntick(cpu)) {
+                                /* resched_cpu(cpu); @@@ */
+                                schedule_timeout_interruptible(1);
+                        }
+                }
+                /* Advance callbacks for each CPU.  */
+                for_each_online_cpu(cpu) {
+                        rdp = RCU_DATA_CPU(cpu);
+                        spin_lock_irqsave(&rdp->lock, flags);
+                        /*
+                         * We are running on this CPU irq-disabled, so no
+                         * CPU can go offline until we re-enable irqs.
+                         * The current CPU might have already gone
+                         * offline (between the for_each_online_cpu and
+                         * the spin_lock_irqsave), but in that case all its
+                         * callback lists will be empty, so no harm done.
+                         *
+                         * Advance the callbacks!  We share normal RCU's
+                         * donelist, since callbacks are invoked the
+                         * same way in either case.
+                         */
+                        if (rdp->waitschedlist != NULL) {
+                                *rdp->donetail = rdp->waitschedlist;
+                                rdp->donetail = rdp->waitschedtail;
+                                /*
+                                 * Next rcu_check_callbacks() will
+                                 * do the required raise_softirq().
+                                 */
+                        }
+                        if (rdp->nextschedlist != NULL) {
+                                rdp->waitschedlist = rdp->nextschedlist;
+                                rdp->waitschedtail = rdp->nextschedtail;
+                                couldsleep = 0;
+                                couldsleepnext = 0;
+                        } else {
+                                rdp->waitschedlist = NULL;
+                                rdp->waitschedtail = &rdp->waitschedlist;
+                        }
+                        rdp->nextschedlist = NULL;
+                        rdp->nextschedtail = &rdp->nextschedlist;
+                        /* Mark sleep intention. */
+                        rdp->rcu_sched_sleeping = couldsleep;
+                        spin_unlock_irqrestore(&rdp->lock, flags);
+                }
+                /* If we saw callbacks on the last scan, go deal with them. */
+                if (!couldsleep)
+                        continue;
+                /* Attempt to block... */
+                spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+                if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
+                        /*
+                         * Someone posted a callback after we scanned.
+                         * Go take care of it.
+                         */
+                        spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+                        couldsleepnext = 0;
+                        continue;
+                }
+                /* Block until the next person posts a callback. */
+                rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
+                spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+                ret = 0;
+                __wait_event_interruptible(rcu_ctrlblk.sched_wq,
+                        rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
+                        ret);
+                /*
+                 * Signals would prevent us from sleeping, and we cannot
+                 * do much with them in any case.  So flush them.
+                 */
+                if (ret)
+                        flush_signals(current);
+                couldsleepnext = 0;
+        } while (!kthread_should_stop());
+        return (0);
 }
-EXPORT_SYMBOL_GPL(__synchronize_sched);
 /*
 * Check to see if any future RCU-related work will need to be done
@@ -1023,7 +1334,9 @@ int rcu_needs_cpu(int cpu)
        return (rdp->donelist != NULL ||
                !!rdp->waitlistcount ||
-                rdp->nextlist != NULL);
+                rdp->nextlist != NULL ||
+                rdp->nextschedlist != NULL ||
+                rdp->waitschedlist != NULL);
 }
 int rcu_pending(int cpu)
@@ -1034,7 +1347,9 @@ int rcu_pending(int cpu)
        if (rdp->donelist != NULL ||
            !!rdp->waitlistcount ||
-            rdp->nextlist != NULL)
+            rdp->nextlist != NULL ||
+            rdp->nextschedlist != NULL ||
+            rdp->waitschedlist != NULL)
                return 1;
        /* The RCU core needs an acknowledgement from this CPU. */
@@ -1101,6 +1416,11 @@ void __init __rcu_init(void)
                rdp->donetail = &rdp->donelist;
                rdp->rcu_flipctr[0] = 0;
                rdp->rcu_flipctr[1] = 0;
+                rdp->nextschedlist = NULL;
+                rdp->nextschedtail = &rdp->nextschedlist;
+                rdp->waitschedlist = NULL;
+                rdp->waitschedtail = &rdp->waitschedlist;
+                rdp->rcu_sched_sleeping = 0;
        }
        register_cpu_notifier(&rcu_nb);
@@ -1119,15 +1439,19 @@ void __init __rcu_init(void)
        for_each_online_cpu(cpu)
                rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
-        open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
+        open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
 /*
- * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
+ * Late-boot-time RCU initialization that must wait until after scheduler
+ * has been initialized.
 */
-void synchronize_kernel(void)
+void __init rcu_init_sched(void)
 {
-        synchronize_rcu();
+        rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
+                                                  NULL,
+                                                  "rcu_sched_grace_period");
+        WARN_ON(IS_ERR(rcu_sched_grace_period_task));
 }
 #ifdef CONFIG_RCU_TRACE
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 49ac4947af24..5edf82c34bbc 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -38,7 +38,6 @@
 #include <linux/moduleparam.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
-#include <linux/rcupdate.h>
 #include <linux/cpu.h>
 #include <linux/mutex.h>
 #include <linux/rcupreempt_trace.h>
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 33acc424667e..90b5b123f7a1 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -57,7 +57,9 @@ static int stat_interval;	/* Interval between stats, in seconds. */
                                /*  Defaults to "only at end of test". */
 static int verbose;             /* Print more debug info. */
 static int test_no_idle_hz;     /* Test RCU's support for tickless idle CPUs. */
-static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
+static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
+static int stutter = 5;         /* Start/stop testing interval (in sec) */
+static int irqreader = 1;       /* RCU readers from irq (timers). */
 static char *torture_type = "rcu"; /* What RCU implementation to torture. */
 module_param(nreaders, int, 0444);
@@ -72,6 +74,10 @@ module_param(test_no_idle_hz, bool, 0444);
 MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
 module_param(shuffle_interval, int, 0444);
 MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
+module_param(stutter, int, 0444);
+MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
+module_param(irqreader, int, 0444);
+MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
 module_param(torture_type, charp, 0444);
 MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
@@ -91,6 +97,7 @@ static struct task_struct **fakewriter_tasks;
 static struct task_struct **reader_tasks;
 static struct task_struct *stats_task;
 static struct task_struct *shuffler_task;
+static struct task_struct *stutter_task;
 #define RCU_TORTURE_PIPE_LEN 10
@@ -117,8 +124,18 @@ static atomic_t n_rcu_torture_alloc_fail;
 static atomic_t n_rcu_torture_free;
 static atomic_t n_rcu_torture_mberror;
 static atomic_t n_rcu_torture_error;
+static long n_rcu_torture_timers = 0;
 static struct list_head rcu_torture_removed;
+static int stutter_pause_test = 0;
+#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
+#define RCUTORTURE_RUNNABLE_INIT 1
+#else
+#define RCUTORTURE_RUNNABLE_INIT 0
+#endif
+int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
 /*
 * Allocate an element from the rcu_tortures pool.
 */
@@ -179,6 +196,16 @@ rcu_random(struct rcu_random_state *rrsp)
        return swahw32(rrsp->rrs_state);
 }
+/*
+ * Hold the caller while the test is paused (stutter_pause_test) or not
+ * runnable (rcutorture_runnable clear).  While merely paused, recheck
+ * after a one-jiffy sleep; while not runnable, sleep for
+ * round_jiffies_relative(HZ) between checks.
+ */
+static void
+rcu_stutter_wait(void)
+{
+        while (stutter_pause_test || !rcutorture_runnable) {
+                if (!rcutorture_runnable) {
+                        schedule_timeout_interruptible(round_jiffies_relative(HZ));
+                        continue;
+                }
+                schedule_timeout_interruptible(1);
+        }
+}
 /*
 * Operations vector for selecting different types of tests.
 */
@@ -192,7 +219,9 @@ struct rcu_torture_ops {
        int (*completed)(void);
        void (*deferredfree)(struct rcu_torture *p);
        void (*sync)(void);
+        void (*cb_barrier)(void);
        int (*stats)(char *page);
+        int irqcapable;
        char *name;
 };
 static struct rcu_torture_ops *cur_ops = NULL;
@@ -265,7 +294,9 @@ static struct rcu_torture_ops rcu_ops = {
        .completed = rcu_torture_completed,
        .deferredfree = rcu_torture_deferred_free,
        .sync = synchronize_rcu,
+        .cb_barrier = rcu_barrier,
        .stats = NULL,
+        .irqcapable = 1,
        .name = "rcu"
 };
@@ -304,7 +335,9 @@ static struct rcu_torture_ops rcu_sync_ops = {
        .completed = rcu_torture_completed,
        .deferredfree = rcu_sync_torture_deferred_free,
        .sync = synchronize_rcu,
+        .cb_barrier = NULL,
        .stats = NULL,
+        .irqcapable = 1,
        .name = "rcu_sync"
 };
@@ -364,7 +397,9 @@ static struct rcu_torture_ops rcu_bh_ops = {
        .completed = rcu_bh_torture_completed,
        .deferredfree = rcu_bh_torture_deferred_free,
        .sync = rcu_bh_torture_synchronize,
+        .cb_barrier = rcu_barrier_bh,
        .stats = NULL,
+        .irqcapable = 1,
        .name = "rcu_bh"
 };
@@ -377,7 +412,9 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
        .completed = rcu_bh_torture_completed,
        .deferredfree = rcu_sync_torture_deferred_free,
        .sync = rcu_bh_torture_synchronize,
+        .cb_barrier = NULL,
        .stats = NULL,
+        .irqcapable = 1,
        .name = "rcu_bh_sync"
 };
@@ -458,6 +495,7 @@ static struct rcu_torture_ops srcu_ops = {
        .completed = srcu_torture_completed,
        .deferredfree = rcu_sync_torture_deferred_free,
        .sync = srcu_torture_synchronize,
+        .cb_barrier = NULL,
        .stats = srcu_torture_stats,
        .name = "srcu"
 };
@@ -482,6 +520,11 @@ static int sched_torture_completed(void)
        return 0;
 }
+/*
+ * Deferred-free hook for the "sched" flavor: queue @p's rtort_rcu head
+ * via call_rcu_sched() with rcu_torture_cb as the callback.
+ */
+static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
+{
+        call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
+}
 static void sched_torture_synchronize(void)
 {
        synchronize_sched();
@@ -494,12 +537,28 @@ static struct rcu_torture_ops sched_ops = {
        .readdelay = rcu_read_delay,  /* just reuse rcu's version. */
        .readunlock = sched_torture_read_unlock,
        .completed = sched_torture_completed,
-        .deferredfree = rcu_sync_torture_deferred_free,
+        .deferredfree = rcu_sched_torture_deferred_free,
        .sync = sched_torture_synchronize,
+        .cb_barrier = rcu_barrier_sched,
        .stats = NULL,
+        .irqcapable = 1,
        .name = "sched"
 };
+/*
+ * "sched_sync" flavor: same read-side hooks as sched_ops, but elements
+ * are released through rcu_sync_torture_deferred_free() and .cb_barrier
+ * is NULL.  .irqcapable is not set here and therefore stays zero, so
+ * rcu_torture_reader() does not arm its timer-based reader for it.
+ */
+static struct rcu_torture_ops sched_ops_sync = {
+        .init = rcu_sync_torture_init,
+        .cleanup = NULL,
+        .readlock = sched_torture_read_lock,
+        .readdelay = rcu_read_delay,  /* just reuse rcu's version. */
+        .readunlock = sched_torture_read_unlock,
+        .completed = sched_torture_completed,
+        .deferredfree = rcu_sync_torture_deferred_free,
+        .sync = sched_torture_synchronize,
+        .cb_barrier = NULL,
+        .stats = NULL,
+        .name = "sched_sync"
+};
 /*
 * RCU torture writer kthread.  Repeatedly substitutes a new structure
 * for that pointed to by rcu_torture_current, freeing the old structure
@@ -537,6 +596,7 @@ rcu_torture_writer(void *arg)
                }
                rcu_torture_current_version++;
                oldbatch = cur_ops->completed();
+                rcu_stutter_wait();
        } while (!kthread_should_stop() && !fullstop);
        VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
        while (!kthread_should_stop())
@@ -560,6 +620,7 @@ rcu_torture_fakewriter(void *arg)
                schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
                udelay(rcu_random(&rand) & 0x3ff);
                cur_ops->sync();
+                rcu_stutter_wait();
        } while (!kthread_should_stop() && !fullstop);
        VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
@@ -569,6 +630,52 @@ rcu_torture_fakewriter(void *arg)
 }
 /*
+ * RCU torture reader from timer handler.  Dereferences rcu_torture_current,
+ * incrementing the corresponding element of the pipeline array.  The
+ * counter in the element should never be greater than 1, otherwise, the
+ * RCU implementation is broken.
+ *
+ * @unused: timer argument, not used by this handler.
+ */
+static void rcu_torture_timer(unsigned long unused)
+{
+        int idx;
+        int completed;
+        /* Static, hence shared by every invocation; guarded by rand_lock. */
+        static DEFINE_RCU_RANDOM(rand);
+        static DEFINE_SPINLOCK(rand_lock);
+        struct rcu_torture *p;
+        int pipe_count;
+        idx = cur_ops->readlock();
+        completed = cur_ops->completed();
+        p = rcu_dereference(rcu_torture_current);
+        if (p == NULL) {
+                /* Leave because rcu_torture_writer is not yet underway */
+                cur_ops->readunlock(idx);
+                return;
+        }
+        if (p->rtort_mbtest == 0)
+                atomic_inc(&n_rcu_torture_mberror);
+        /* rand_lock covers the shared random state and the timer counter. */
+        spin_lock(&rand_lock);
+        cur_ops->readdelay(&rand);
+        n_rcu_torture_timers++;
+        spin_unlock(&rand_lock);
+        /* Per-CPU statistics below are updated with preemption disabled. */
+        preempt_disable();
+        pipe_count = p->rtort_pipe_count;
+        if (pipe_count > RCU_TORTURE_PIPE_LEN) {
+                /* Should not happen, but... */
+                pipe_count = RCU_TORTURE_PIPE_LEN;
+        }
+        ++__get_cpu_var(rcu_torture_count)[pipe_count];
+        /* Number of completed() steps that elapsed during this read. */
+        completed = cur_ops->completed() - completed;
+        if (completed > RCU_TORTURE_PIPE_LEN) {
+                /* Should not happen, but... */
+                completed = RCU_TORTURE_PIPE_LEN;
+        }
+        ++__get_cpu_var(rcu_torture_batch)[completed];
+        preempt_enable();
+        cur_ops->readunlock(idx);
+}
+/*
 * RCU torture reader kthread.  Repeatedly dereferences rcu_torture_current,
 * incrementing the corresponding element of the pipeline array.  The
 * counter in the element should never be greater than 1, otherwise, the
@@ -582,11 +689,18 @@ rcu_torture_reader(void *arg)
        DEFINE_RCU_RANDOM(rand);
        struct rcu_torture *p;
        int pipe_count;
+        struct timer_list t;
        VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
        set_user_nice(current, 19);
+        if (irqreader && cur_ops->irqcapable)
+                setup_timer_on_stack(&t, rcu_torture_timer, 0);
        do {
+                if (irqreader && cur_ops->irqcapable) {
+                        if (!timer_pending(&t))
+                                mod_timer(&t, 1);
+                }
                idx = cur_ops->readlock();
                completed = cur_ops->completed();
                p = rcu_dereference(rcu_torture_current);
@@ -615,8 +729,11 @@ rcu_torture_reader(void *arg)
                preempt_enable();
                cur_ops->readunlock(idx);
                schedule();
+                rcu_stutter_wait();
        } while (!kthread_should_stop() && !fullstop);
        VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
+        if (irqreader && cur_ops->irqcapable)
+                del_timer_sync(&t);
        while (!kthread_should_stop())
                schedule_timeout_uninterruptible(1);
        return 0;
@@ -647,20 +764,22 @@ rcu_torture_printk(char *page)
        cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
        cnt += sprintf(&page[cnt],
                       "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
-                       "rtmbe: %d",
+                       "rtmbe: %d nt: %ld",
                       rcu_torture_current,
                       rcu_torture_current_version,
                       list_empty(&rcu_torture_freelist),
                       atomic_read(&n_rcu_torture_alloc),
                       atomic_read(&n_rcu_torture_alloc_fail),
                       atomic_read(&n_rcu_torture_free),
-                       atomic_read(&n_rcu_torture_mberror));
+                       atomic_read(&n_rcu_torture_mberror),
+                       n_rcu_torture_timers);
        if (atomic_read(&n_rcu_torture_mberror) != 0)
                cnt += sprintf(&page[cnt], " !!!");
        cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
        if (i > 1) {
                cnt += sprintf(&page[cnt], "!!! ");
                atomic_inc(&n_rcu_torture_error);
+                WARN_ON_ONCE(1);
        }
        cnt += sprintf(&page[cnt], "Reader Pipe: ");
        for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
@@ -785,15 +904,34 @@ rcu_torture_shuffle(void *arg)
        return 0;
 }
+/*
+ * Kthread that makes the rcutorture test "stutter": it alternately lets
+ * the test threads run and pauses them through stutter_pause_test,
+ * sleeping for up to stutter seconds in each phase, until it is stopped.
+ */
+static int
+rcu_torture_stutter(void *arg)
+{
+        VERBOSE_PRINTK_STRING("rcu_torture_stutter task started");
+        for (;;) {
+                /* Run phase: test threads are free to proceed. */
+                schedule_timeout_interruptible(stutter * HZ);
+                /* Pause phase; do not sleep again if a stop is pending. */
+                stutter_pause_test = 1;
+                if (!kthread_should_stop())
+                        schedule_timeout_interruptible(stutter * HZ);
+                stutter_pause_test = 0;
+                if (kthread_should_stop())
+                        break;
+        }
+        VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
+        return 0;
+}
 static inline void
 rcu_torture_print_module_parms(char *tag)
 {
        printk(KERN_ALERT "%s" TORTURE_FLAG
                "--- %s: nreaders=%d nfakewriters=%d "
                "stat_interval=%d verbose=%d test_no_idle_hz=%d "
-                "shuffle_interval = %d\n",
+                "shuffle_interval=%d stutter=%d irqreader=%d\n",
                torture_type, tag, nrealreaders, nfakewriters,
-                stat_interval, verbose, test_no_idle_hz, shuffle_interval);
+                stat_interval, verbose, test_no_idle_hz, shuffle_interval,
+                stutter, irqreader);
 }
 static void
@@ -802,6 +940,11 @@ rcu_torture_cleanup(void)
        int i;
        fullstop = 1;
+        if (stutter_task) {
+                VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
+                kthread_stop(stutter_task);
+        }
+        stutter_task = NULL;
        if (shuffler_task) {
                VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
                kthread_stop(shuffler_task);
@@ -848,7 +991,9 @@ rcu_torture_cleanup(void)
        stats_task = NULL;
        /* Wait for all RCU callbacks to fire.  */
-        rcu_barrier();
+        if (cur_ops->cb_barrier != NULL)
+                cur_ops->cb_barrier();
        rcu_torture_stats_print();  /* -After- the stats thread is stopped! */
@@ -868,7 +1013,7 @@ rcu_torture_init(void)
        int firsterr = 0;
        static struct rcu_torture_ops *torture_ops[] =
                { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
-                  &srcu_ops, &sched_ops, };
+                  &srcu_ops, &sched_ops, &sched_ops_sync, };
        /* Process args and tell the world that the torturer is on the job. */
        for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
@@ -988,6 +1133,19 @@ rcu_torture_init(void)
                        goto unwind;
                }
        }
+        if (stutter < 0)
+                stutter = 0;
+        if (stutter) {
+                /* Create the stutter thread */
+                stutter_task = kthread_run(rcu_torture_stutter, NULL,
+                                          "rcu_torture_stutter");
+                if (IS_ERR(stutter_task)) {
+                        firsterr = PTR_ERR(stutter_task);
+                        VERBOSE_PRINTK_ERRSTRING("Failed to create stutter");
+                        stutter_task = NULL;
+                        goto unwind;
+                }
+        }
        return 0;
 unwind:
diff --git a/kernel/relay.c b/kernel/relay.c
index 7de644cdec43..04006ef970b8 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -407,6 +407,35 @@ void relay_reset(struct rchan *chan)
 }
 EXPORT_SYMBOL_GPL(relay_reset);
+/*
+ * Attach @dentry to @buf and set the size of the dentry's inode to
+ * buf->early_bytes.
+ */
+static inline void relay_set_buf_dentry(struct rchan_buf *buf,
+                                        struct dentry *dentry)
+{
+        buf->dentry = dentry;
+        buf->dentry->d_inode->i_size = buf->early_bytes;
+}
+/*
+ * Create the file for @buf on CPU @cpu through the client's
+ * create_buf_file() callback.  The file name is the channel's
+ * base_filename followed by the CPU number.
+ *
+ * Returns the dentry handed back by the callback, or NULL if the
+ * temporary name buffer could not be allocated.
+ */
+static struct dentry *relay_create_buf_file(struct rchan *chan,
+                                            struct rchan_buf *buf,
+                                            unsigned int cpu)
+{
+        struct dentry *dentry;
+        char *tmpname;
+        tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
+        if (!tmpname)
+                return NULL;
+        /* @cpu is unsigned, so format it with %u rather than %d. */
+        snprintf(tmpname, NAME_MAX, "%s%u", chan->base_filename, cpu);
+        /* Create file in fs */
+        dentry = chan->cb->create_buf_file(tmpname, chan->parent,
+                                           S_IRUSR, buf,
+                                           &chan->is_global);
+        /* The name is only needed for the duration of the callback. */
+        kfree(tmpname);
+        return dentry;
+}
 /*
 *      relay_open_buf - create a new relay channel buffer
 *
@@ -416,45 +445,34 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
 {
        struct rchan_buf *buf = NULL;
        struct dentry *dentry;
-        char *tmpname;
        if (chan->is_global)
                return chan->buf[0];
-        tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
-        if (!tmpname)
-                goto end;
-        snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
        buf = relay_create_buf(chan);
        if (!buf)
-                goto free_name;
+                return NULL;
+        if (chan->has_base_filename) {
+                dentry = relay_create_buf_file(chan, buf, cpu);
+                if (!dentry)
+                        goto free_buf;
+                relay_set_buf_dentry(buf, dentry);
+        }
        buf->cpu = cpu;
        __relay_reset(buf, 1);
-        /* Create file in fs */
-        dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR,
-                                           buf, &chan->is_global);
-        if (!dentry)
-                goto free_buf;
-        buf->dentry = dentry;
        if(chan->is_global) {
                chan->buf[0] = buf;
                buf->cpu = 0;
        }
-        goto free_name;
+        return buf;
 free_buf:
        relay_destroy_buf(buf);
-        buf = NULL;
+        return NULL;
-free_name:
-        kfree(tmpname);
-end:
-        return buf;
 }
 /**
@@ -537,8 +555,8 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
 /**
 *      relay_open - create a new relay channel
- *      @base_filename: base name of files to create
+ *      @base_filename: base name of files to create, %NULL for buffering only
- *      @parent: dentry of parent directory, %NULL for root directory
+ *      @parent: dentry of parent directory, %NULL for root directory or buffer
 *      @subbuf_size: size of sub-buffers
 *      @n_subbufs: number of sub-buffers
 *      @cb: client callback functions
@@ -560,8 +578,6 @@ struct rchan *relay_open(const char *base_filename,
 {
        unsigned int i;
        struct rchan *chan;
-        if (!base_filename)
-                return NULL;
        if (!(subbuf_size && n_subbufs))
                return NULL;
@@ -576,7 +592,10 @@ struct rchan *relay_open(const char *base_filename,
        chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
        chan->parent = parent;
        chan->private_data = private_data;
-        strlcpy(chan->base_filename, base_filename, NAME_MAX);
+        if (base_filename) {
+                chan->has_base_filename = 1;
+                strlcpy(chan->base_filename, base_filename, NAME_MAX);
+        }
        setup_callbacks(chan, cb);
        kref_init(&chan->kref);
@@ -604,6 +623,94 @@ free_bufs:
 }
 EXPORT_SYMBOL_GPL(relay_open);
+/*
+ * Argument bundle handed to __relay_set_buf_dentry(): the buffer to
+ * update and the dentry to attach to it.
+ */
+struct rchan_percpu_buf_dispatcher {
+        struct rchan_buf *buf;
+        struct dentry *dentry;
+};
+/*
+ * Called in atomic context.
+ * @info points to a struct rchan_percpu_buf_dispatcher naming the buffer
+ * and the dentry to attach to it; relay_late_setup_files() passes this
+ * function to smp_call_function_single().
+ */
+static void __relay_set_buf_dentry(void *info)
+{
+        struct rchan_percpu_buf_dispatcher *p = info;
+        relay_set_buf_dentry(p->buf, p->dentry);
+}
+/**
+ *      relay_late_setup_files - triggers file creation
+ *      @chan: channel to operate on
+ *      @base_filename: base name of files to create
+ *      @parent: dentry of parent directory, %NULL for root directory
+ *
+ *      Returns 0 if successful, non-zero otherwise: -EINVAL if @chan or
+ *      @base_filename is %NULL or a buffer file could not be created,
+ *      -EEXIST if the channel already has a base filename, or the error
+ *      returned by smp_call_function_single().
+ *
+ *      Use to setup files for a previously buffer-only channel.
+ *      Useful to do early tracing in kernel, before VFS is up, for example.
+ */
+int relay_late_setup_files(struct rchan *chan,
+                           const char *base_filename,
+                           struct dentry *parent)
+{
+        int err = 0;
+        unsigned int i, curr_cpu;
+        unsigned long flags;
+        struct dentry *dentry;
+        struct rchan_percpu_buf_dispatcher disp;
+        if (!chan || !base_filename)
+                return -EINVAL;
+        mutex_lock(&relay_channels_mutex);
+        /* Is chan already set up? */
+        if (unlikely(chan->has_base_filename)) {
+                /* Must not return with relay_channels_mutex still held. */
+                err = -EEXIST;
+                goto out_unlock;
+        }
+        /*
+         * Record the name only now, under the mutex, so that a channel
+         * which already has files never gets its base_filename clobbered.
+         */
+        strlcpy(chan->base_filename, base_filename, NAME_MAX);
+        chan->has_base_filename = 1;
+        chan->parent = parent;
+        curr_cpu = get_cpu();
+        /*
+         * The CPU hotplug notifier ran before us and created buffers with
+         * no files associated. So it's safe to call relay_create_buf_file()
+         * on all currently online CPUs.
+         */
+        for_each_online_cpu(i) {
+                if (unlikely(!chan->buf[i])) {
+                        printk(KERN_ERR "relay_late_setup_files: CPU %u "
+                                        "has no buffer, it must have!\n", i);
+                        BUG();
+                        err = -EINVAL;
+                        break;
+                }
+                dentry = relay_create_buf_file(chan, chan->buf[i], i);
+                if (unlikely(!dentry)) {
+                        err = -EINVAL;
+                        break;
+                }
+                if (curr_cpu == i) {
+                        /* Our own buffer: attach the dentry with irqs off. */
+                        local_irq_save(flags);
+                        relay_set_buf_dentry(chan->buf[i], dentry);
+                        local_irq_restore(flags);
+                } else {
+                        /* Another CPU's buffer: have that CPU attach it. */
+                        disp.buf = chan->buf[i];
+                        disp.dentry = dentry;
+                        smp_mb();
+                        /* relay_channels_mutex must be held, so wait. */
+                        err = smp_call_function_single(i,
+                                                       __relay_set_buf_dentry,
+                                                       &disp, 1);
+                }
+                if (unlikely(err))
+                        break;
+        }
+        put_cpu();
+out_unlock:
+        mutex_unlock(&relay_channels_mutex);
+        return err;
+}
 /**
 *      relay_switch_subbuf - switch to a new sub-buffer
 *      @buf: channel buffer
@@ -627,8 +734,13 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
                old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
                buf->padding[old_subbuf] = buf->prev_padding;
                buf->subbufs_produced++;
-                buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
+                if (buf->dentry)
-                        buf->padding[old_subbuf];
+                        buf->dentry->d_inode->i_size +=
+                                buf->chan->subbuf_size -
+                                buf->padding[old_subbuf];
+                else
+                        buf->early_bytes += buf->chan->subbuf_size -
+                                            buf->padding[old_subbuf];
                smp_mb();
                if (waitqueue_active(&buf->read_wait))
                        /*
@@ -1237,4 +1349,4 @@ static __init int relay_init(void)
        return 0;
 }
-module_init(relay_init);
+early_initcall(relay_init);
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index d3c61b4ebef2..f275c8eca772 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/res_counter.h>
 #include <linux/uaccess.h>
+#include <linux/mm.h>
 void res_counter_init(struct res_counter *counter)
 {
@@ -102,44 +103,37 @@ u64 res_counter_read_u64(struct res_counter *counter, int member)
        return *res_counter_member(counter, member);
 }
-ssize_t res_counter_write(struct res_counter *counter, int member,
+int res_counter_memparse_write_strategy(const char *buf,
-                const char __user *userbuf, size_t nbytes, loff_t *pos,
+                                        unsigned long long *res)
-                int (*write_strategy)(char *st_buf, unsigned long long *val))
 {
-        int ret;
+        char *end;
-        char *buf, *end;
+        /* FIXME - make memparse() take const char* args */
-        unsigned long flags;
+        *res = memparse((char *)buf, &end);
-        unsigned long long tmp, *val;
+        if (*end != '\0')
+                return -EINVAL;
-        buf = kmalloc(nbytes + 1, GFP_KERNEL);
-        ret = -ENOMEM;
-        if (buf == NULL)
-                goto out;
-        buf[nbytes] = '\0';
+        *res = PAGE_ALIGN(*res);
-        ret = -EFAULT;
+        return 0;
-        if (copy_from_user(buf, userbuf, nbytes))
+}
-                goto out_free;
-        ret = -EINVAL;
+int res_counter_write(struct res_counter *counter, int member,
+                      const char *buf, write_strategy_fn write_strategy)
+{
+        char *end;
+        unsigned long flags;
+        unsigned long long tmp, *val;
-        strstrip(buf);
        if (write_strategy) {
-                if (write_strategy(buf, &tmp)) {
+                if (write_strategy(buf, &tmp))
-                        goto out_free;
+                        return -EINVAL;
-                }
        } else {
                tmp = simple_strtoull(buf, &end, 10);
                if (*end != '\0')
-                        goto out_free;
+                        return -EINVAL;
        }
        spin_lock_irqsave(&counter->lock, flags);
        val = res_counter_member(counter, member);
        *val = tmp;
        spin_unlock_irqrestore(&counter->lock, flags);
-        ret = nbytes;
+        return 0;
-out_free:
-        kfree(buf);
-out:
-        return ret;
 }
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 092e4c620af9..a56f629b057a 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -297,8 +297,8 @@ static int test_func(void *data)
 *
 * opcode:data
 */
-static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
+static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr,
-                                  size_t count)
+                                  const char *buf, size_t count)
 {
        struct sched_param schedpar;
        struct test_thread_data *td;
@@ -360,7 +360,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
 * @dev:        thread to query
 * @buf:        char buffer to be filled with thread status info
 */
-static ssize_t sysfs_test_status(struct sys_device *dev, char *buf)
+static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr,
+                                 char *buf)
 {
        struct test_thread_data *td;
        struct task_struct *tsk;
diff --git a/kernel/sched.c b/kernel/sched.c
index 4e2f60335656..0236958addcb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -70,10 +70,13 @@
 #include <linux/bootmem.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
+#include <linux/ftrace.h>
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
+#include "sched_cpupri.h"
 /*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -289,15 +292,15 @@ struct task_group root_task_group;
 static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
 /* Default task group's cfs_rq on each cpu */
 static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED */
 #define root_task_group init_task_group
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 /* task_group_lock serializes add/remove of task groups and also changes to
 * a task group's cpu shares.
@@ -307,9 +310,9 @@ static DEFINE_SPINLOCK(task_group_lock);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_USER_SCHED
 # define INIT_TASK_GROUP_LOAD   (2*NICE_0_LOAD)
-#else
+#else /* !CONFIG_USER_SCHED */
 # define INIT_TASK_GROUP_LOAD   NICE_0_LOAD
-#endif
+#endif /* CONFIG_USER_SCHED */
 /*
 * A weight of 0 or 1 can cause arithmetics problems.
@@ -363,6 +366,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 #else
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+/* Stub for builds without CONFIG_GROUP_SCHED: there is no group, so NULL. */
+static inline struct task_group *task_group(struct task_struct *p)
+{
+        return NULL;
+}
 #endif  /* CONFIG_GROUP_SCHED */
@@ -373,6 +380,7 @@ struct cfs_rq {
        u64 exec_clock;
        u64 min_vruntime;
+        u64 pair_start;
        struct rb_root tasks_timeline;
        struct rb_node *rb_leftmost;
@@ -401,6 +409,31 @@ struct cfs_rq {
         */
        struct list_head leaf_cfs_rq_list;
        struct task_group *tg;  /* group that "owns" this runqueue */
+#ifdef CONFIG_SMP
+        /*
+         * the part of load.weight contributed by tasks
+         */
+        unsigned long task_weight;
+        /*
+         *   h_load = weight * f(tg)
+         *
+         * Where f(tg) is the recursive weight fraction assigned to
+         * this group.
+         */
+        unsigned long h_load;
+        /*
+         * this cpu's part of tg->shares
+         */
+        unsigned long shares;
+        /*
+         * load.weight at the time we set shares
+         */
+        unsigned long rq_weight;
+#endif
 #endif
 };
@@ -452,6 +485,9 @@ struct root_domain {
         */
        cpumask_t rto_mask;
        atomic_t rto_count;
+#ifdef CONFIG_SMP
+        struct cpupri cpupri;
+#endif
 };
 /*
@@ -526,14 +562,19 @@ struct rq {
        int push_cpu;
        /* cpu of this runqueue: */
        int cpu;
+        int online;
+        unsigned long avg_load_per_task;
        struct task_struct *migration_thread;
        struct list_head migration_queue;
 #endif
 #ifdef CONFIG_SCHED_HRTICK
-        unsigned long hrtick_flags;
+#ifdef CONFIG_SMP
-        ktime_t hrtick_expire;
+        int hrtick_csd_pending;
+        struct call_single_data hrtick_csd;
+#endif
        struct hrtimer hrtick_timer;
 #endif
@@ -607,6 +648,24 @@ static inline void update_rq_clock(struct rq *rq)
 # define const_debug static const
 #endif
+/**
+ * runqueue_is_locked - test the lock of the current cpu's runqueue
+ *
+ * Returns nonzero if the runqueue lock of the cpu we are running on is
+ * currently held. printk may be called with that lock held and uses this
+ * to decide whether it is OK to wake up klogd.
+ */
+int runqueue_is_locked(void)
+{
+        struct rq *this;
+        int locked;
+        /* get_cpu()/put_cpu() bracket the lookup and the lock test. */
+        this = cpu_rq(get_cpu());
+        locked = spin_is_locked(&this->lock);
+        put_cpu();
+        return locked;
+}
 /*
 * Debugging: various feature bits
 */
@@ -749,6 +808,12 @@ late_initcall(sched_init_debug);
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 /*
+ * ratelimit for updating the group shares.
+ * default: 0.5ms
+ */
+const_debug unsigned int sysctl_sched_shares_ratelimit = 500000;
+/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 */
@@ -775,82 +840,6 @@ static inline u64 global_rt_runtime(void)
        return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
-unsigned long long time_sync_thresh = 100000;
-static DEFINE_PER_CPU(unsigned long long, time_offset);
-static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
-/*
- * Global lock which we take every now and then to synchronize
- * the CPUs time. This method is not warp-safe, but it's good
- * enough to synchronize slowly diverging time sources and thus
- * it's good enough for tracing:
- */
-static DEFINE_SPINLOCK(time_sync_lock);
-static unsigned long long prev_global_time;
-static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
-{
-        /*
-         * We want this inlined, to not get tracer function calls
-         * in this critical section:
-         */
-        spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
-        __raw_spin_lock(&time_sync_lock.raw_lock);
-        if (time < prev_global_time) {
-                per_cpu(time_offset, cpu) += prev_global_time - time;
-                time = prev_global_time;
-        } else {
-                prev_global_time = time;
-        }
-        __raw_spin_unlock(&time_sync_lock.raw_lock);
-        spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
-        return time;
-}
-static unsigned long long __cpu_clock(int cpu)
-{
-        unsigned long long now;
-        /*
-         * Only call sched_clock() if the scheduler has already been
-         * initialized (some code might call cpu_clock() very early):
-         */
-        if (unlikely(!scheduler_running))
-                return 0;
-        now = sched_clock_cpu(cpu);
-        return now;
-}
-/*
- * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
- * clock constructed from sched_clock():
- */
-unsigned long long cpu_clock(int cpu)
-{
-        unsigned long long prev_cpu_time, time, delta_time;
-        unsigned long flags;
-        local_irq_save(flags);
-        prev_cpu_time = per_cpu(prev_cpu_time, cpu);
-        time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
-        delta_time = time-prev_cpu_time;
-        if (unlikely(delta_time > time_sync_thresh)) {
-                time = __sync_cpu_clock(time, cpu);
-                per_cpu(prev_cpu_time, cpu) = time;
-        }
-        local_irq_restore(flags);
-        return time;
-}
-EXPORT_SYMBOL_GPL(cpu_clock);
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)      do { } while (0)
 #endif
@@ -996,13 +985,6 @@ static struct rq *this_rq_lock(void)
        return rq;
 }
-static void __resched_task(struct task_struct *p, int tif_bit);
-static inline void resched_task(struct task_struct *p)
-{
-        __resched_task(p, TIF_NEED_RESCHED);
-}
 #ifdef CONFIG_SCHED_HRTICK
 /*
 * Use HR-timers to deliver accurate preemption points.
@@ -1014,25 +996,6 @@ static inline void resched_task(struct task_struct *p)
 * When we get rescheduled we reprogram the hrtick_timer outside of the
 * rq->lock.
 */
-static inline void resched_hrt(struct task_struct *p)
-{
-        __resched_task(p, TIF_HRTICK_RESCHED);
-}
-static inline void resched_rq(struct rq *rq)
-{
-        unsigned long flags;
-        spin_lock_irqsave(&rq->lock, flags);
-        resched_task(rq->curr);
-        spin_unlock_irqrestore(&rq->lock, flags);
-}
-enum {
-        HRTICK_SET,             /* re-programm hrtick_timer */
-        HRTICK_RESET,           /* not a new slice */
-        HRTICK_BLOCK,           /* stop hrtick operations */
-};
 /*
 * Use hrtick when:
@@ -1043,40 +1006,11 @@ static inline int hrtick_enabled(struct rq *rq)
 {
        if (!sched_feat(HRTICK))
                return 0;
-        if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags)))
+        if (!cpu_active(cpu_of(rq)))
                return 0;
        return hrtimer_is_hres_active(&rq->hrtick_timer);
 }
-/*
- * Called to set the hrtick timer state.
- *
- * called with rq->lock held and irqs disabled
- */
-static void hrtick_start(struct rq *rq, u64 delay, int reset)
-{
-        assert_spin_locked(&rq->lock);
-        /*
-         * preempt at: now + delay
-         */
-        rq->hrtick_expire =
-                ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
-        /*
-         * indicate we need to program the timer
-         */
-        __set_bit(HRTICK_SET, &rq->hrtick_flags);
-        if (reset)
-                __set_bit(HRTICK_RESET, &rq->hrtick_flags);
-        /*
-         * New slices are called from the schedule path and don't need a
-         * forced reschedule.
-         */
-        if (reset)
-                resched_hrt(rq->curr);
-}
 static void hrtick_clear(struct rq *rq)
 {
        if (hrtimer_active(&rq->hrtick_timer))
@@ -1084,32 +1018,6 @@ static void hrtick_clear(struct rq *rq)
 }
 /*
- * Update the timer from the possible pending state.
- */
-static void hrtick_set(struct rq *rq)
-{
-        ktime_t time;
-        int set, reset;
-        unsigned long flags;
-        WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
-        spin_lock_irqsave(&rq->lock, flags);
-        set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
-        reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
-        time = rq->hrtick_expire;
-        clear_thread_flag(TIF_HRTICK_RESCHED);
-        spin_unlock_irqrestore(&rq->lock, flags);
-        if (set) {
-                hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
-                if (reset && !hrtimer_active(&rq->hrtick_timer))
-                        resched_rq(rq);
-        } else
-                hrtick_clear(rq);
-}
-/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
@@ -1128,27 +1036,37 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 }
 #ifdef CONFIG_SMP
-static void hotplug_hrtick_disable(int cpu)
+/*
+ * called from hardirq (IPI) context
+ */
+static void __hrtick_start(void *arg)
 {
-        struct rq *rq = cpu_rq(cpu);
+        struct rq *rq = arg;
-        unsigned long flags;
-        spin_lock_irqsave(&rq->lock, flags);
+        spin_lock(&rq->lock);
-        rq->hrtick_flags = 0;
+        hrtimer_restart(&rq->hrtick_timer);
-        __set_bit(HRTICK_BLOCK, &rq->hrtick_flags);
+        rq->hrtick_csd_pending = 0;
-        spin_unlock_irqrestore(&rq->lock, flags);
+        spin_unlock(&rq->lock);
-        hrtick_clear(rq);
 }
-static void hotplug_hrtick_enable(int cpu)
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+static void hrtick_start(struct rq *rq, u64 delay)
 {
-        struct rq *rq = cpu_rq(cpu);
+        struct hrtimer *timer = &rq->hrtick_timer;
-        unsigned long flags;
+        ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
-        spin_lock_irqsave(&rq->lock, flags);
+        timer->expires = time;
-        __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags);
-        spin_unlock_irqrestore(&rq->lock, flags);
+        if (rq == this_rq()) {
+                hrtimer_restart(timer);
+        } else if (!rq->hrtick_csd_pending) {
+                __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
+                rq->hrtick_csd_pending = 1;
+        }
 }
 static int
@@ -1163,16 +1081,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
        case CPU_DOWN_PREPARE_FROZEN:
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
-                hotplug_hrtick_disable(cpu);
+                hrtick_clear(cpu_rq(cpu));
-                return NOTIFY_OK;
-        case CPU_UP_PREPARE:
-        case CPU_UP_PREPARE_FROZEN:
-        case CPU_DOWN_FAILED:
-        case CPU_DOWN_FAILED_FROZEN:
-        case CPU_ONLINE:
-        case CPU_ONLINE_FROZEN:
-                hotplug_hrtick_enable(cpu);
                return NOTIFY_OK;
        }
@@ -1183,46 +1092,45 @@ static void init_hrtick(void)
 {
        hotcpu_notifier(hotplug_hrtick, 0);
 }
-#endif /* CONFIG_SMP */
+#else
+/*
+ * Called to set the hrtick timer state (!CONFIG_SMP variant).
+ *
+ * Arms rq->hrtick_timer to fire @delay nanoseconds from now: the delay is
+ * converted with ns_to_ktime() and handed to hrtimer_start() as a relative
+ * expiry (HRTIMER_MODE_REL).
+ *
+ * called with rq->lock held and irqs disabled
+ */
+static void hrtick_start(struct rq *rq, u64 delay)
+{
+        hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
+}
-static void init_rq_hrtick(struct rq *rq)
+static void init_hrtick(void)
 {
-        rq->hrtick_flags = 0;
-        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-        rq->hrtick_timer.function = hrtick;
-        rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 }
+#endif /* CONFIG_SMP */
-void hrtick_resched(void)
+static void init_rq_hrtick(struct rq *rq)
 {
-        struct rq *rq;
+#ifdef CONFIG_SMP
-        unsigned long flags;
+        rq->hrtick_csd_pending = 0;
-        if (!test_thread_flag(TIF_HRTICK_RESCHED))
+        rq->hrtick_csd.flags = 0;
-                return;
+        rq->hrtick_csd.func = __hrtick_start;
+        rq->hrtick_csd.info = rq;
+#endif
-        local_irq_save(flags);
+        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-        rq = cpu_rq(smp_processor_id());
+        rq->hrtick_timer.function = hrtick;
-        hrtick_set(rq);
+        rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
-        local_irq_restore(flags);
 }
 #else
 static inline void hrtick_clear(struct rq *rq)
 {
 }
-static inline void hrtick_set(struct rq *rq)
-{
-}
 static inline void init_rq_hrtick(struct rq *rq)
 {
 }
-void hrtick_resched(void)
-{
-}
 static inline void init_hrtick(void)
 {
 }
@@ -1241,16 +1149,16 @@ static inline void init_hrtick(void)
 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
 #endif
-static void __resched_task(struct task_struct *p, int tif_bit)
+static void resched_task(struct task_struct *p)
 {
        int cpu;
        assert_spin_locked(&task_rq(p)->lock);
-        if (unlikely(test_tsk_thread_flag(p, tif_bit)))
+        if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
                return;
-        set_tsk_thread_flag(p, tif_bit);
+        set_tsk_thread_flag(p, TIF_NEED_RESCHED);
        cpu = task_cpu(p);
        if (cpu == smp_processor_id())
@@ -1313,15 +1221,15 @@ void wake_up_idle_cpu(int cpu)
        if (!tsk_is_polling(rq->idle))
                smp_send_reschedule(cpu);
 }
-#endif
+#endif /* CONFIG_NO_HZ */
-#else
+#else /* !CONFIG_SMP */
-static void __resched_task(struct task_struct *p, int tif_bit)
+static void resched_task(struct task_struct *p)
 {
        assert_spin_locked(&task_rq(p)->lock);
-        set_tsk_thread_flag(p, tif_bit);
+        set_tsk_need_resched(p);
 }
-#endif
+#endif /* CONFIG_SMP */
 #if BITS_PER_LONG == 32
 # define WMULT_CONST    (~0UL)
@@ -1336,6 +1244,9 @@ static void __resched_task(struct task_struct *p, int tif_bit)
 */
 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+/*
+ * delta *= weight / lw
+ */
 static unsigned long
 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
                struct load_weight *lw)
@@ -1363,12 +1274,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
        return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 }
-static inline unsigned long
-calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
-{
-        return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
-}
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
        lw->weight += inc;
@@ -1479,17 +1384,211 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 #ifdef CONFIG_SMP
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
-static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-#else /* CONFIG_SMP */
+/*
+ * Return the average load per task on the cpu's run queue.
+ *
+ * The result is also cached in rq->avg_load_per_task. When the queue is
+ * empty the cached value is reset to 0, so callers never get an average
+ * left over from an earlier, busier state of the runqueue.
+ */
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+        struct rq *rq = cpu_rq(cpu);
+        /* Use one local copy for both the zero test and the divisor. */
+        unsigned long nr_running = rq->nr_running;
+        if (nr_running)
+                rq->avg_load_per_task = rq->load.weight / nr_running;
+        else
+                rq->avg_load_per_task = 0;
+        return rq->avg_load_per_task;
+}
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+/*
+ * Callback type for walk_tg_tree(): receives the task group being visited
+ * plus the cpu and sched_domain arguments that were handed to the walker.
+ */
+typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ *
+ * The walk starts at root_task_group and runs inside an RCU read-side
+ * critical section. It is iterative: the 'down' and 'up' labels play the
+ * role of the call and the return of a recursive traversal. @cpu and @sd
+ * are passed through to both visitors unchanged.
+ */
+static void
+walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
+{
+        struct task_group *parent, *child;
+        rcu_read_lock();
+        parent = &root_task_group;
+down:
+        (*down)(parent, cpu, sd);
+        list_for_each_entry_rcu(child, &parent->children, siblings) {
+                /* Descend: the child becomes the node being visited. */
+                parent = child;
+                goto down;
+up:
+                /*
+                 * Re-entered from below once a child's subtree is done.
+                 * 'child' is that child, so carry on with its next sibling.
+                 */
+                continue;
+        }
+        /* Every child of this node has been handled. */
+        (*up)(parent, cpu, sd);
+        /* Step back to the parent; a NULL parent means the root is done. */
+        child = parent;
+        parent = parent->parent;
+        if (parent)
+                goto up;
+        rcu_read_unlock();
+}
+static void __set_se_shares(struct sched_entity *se, unsigned long shares);
+/*
+ * Calculate and set the cpu's group shares.
+ *
+ * @tg:           task group whose per-cpu entity is updated
+ * @cpu:          cpu whose portion of the group's shares is computed
+ * @sd_shares:    shares to distribute over the domain
+ * @sd_rq_weight: the group's runqueue weight summed over the domain
+ *
+ * Does nothing when the group has no sched entity on @cpu.
+ */
+static void
+__update_group_shares_cpu(struct task_group *tg, int cpu,
+                          unsigned long sd_shares, unsigned long sd_rq_weight)
 {
+        int boost = 0;
+        unsigned long shares;
+        unsigned long rq_weight;
+        if (!tg->se[cpu])
+                return;
+        rq_weight = tg->cfs_rq[cpu]->load.weight;
+        /*
+         * If there are currently no tasks on the cpu pretend there is one of
+         * average load so that when a new task gets to run here it will not
+         * get delayed by group starvation.
+         */
+        if (!rq_weight) {
+                boost = 1;
+                rq_weight = NICE_0_LOAD;
+        }
+        /* Never let this cpu's weight exceed the domain-wide total. */
+        if (unlikely(rq_weight > sd_rq_weight))
+                rq_weight = sd_rq_weight;
+        /*
+         *           \Sum shares * rq_weight
+         * shares =  -----------------------
+         *               \Sum rq_weight
+         *
+         * NOTE(review): the "+ 1" in the divisor presumably guards against
+         * dividing by zero - confirm.
+         */
+        shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+        /*
+         * record the actual number of shares, not the boosted amount.
+         */
+        tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+        tg->cfs_rq[cpu]->rq_weight = rq_weight;
+        /* Clamp what is applied to the entity to [MIN_SHARES, MAX_SHARES]. */
+        if (shares < MIN_SHARES)
+                shares = MIN_SHARES;
+        else if (shares > MAX_SHARES)
+                shares = MAX_SHARES;
+        __set_se_shares(tg->se[cpu], shares);
+}
+/*
+ * Re-compute the task group's per-cpu shares over the given domain.
+ * This needs to be done in a bottom-up fashion because the rq weight of a
+ * parent group depends on the shares of its child groups.
+ *
+ * Has the tg_visitor signature; the @cpu argument is not used here, every
+ * cpu in sd->span is updated.
+ */
+static void
+tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+{
+        unsigned long rq_weight = 0;
+        unsigned long shares = 0;
+        int i;
+        /* Sum the group's runqueue weight and recorded shares over the span. */
+        for_each_cpu_mask(i, sd->span) {
+                rq_weight += tg->cfs_rq[i]->load.weight;
+                shares += tg->cfs_rq[i]->shares;
+        }
+        /*
+         * Use the group's full shares when nothing has been recorded yet
+         * although there is weight, or when the sum overshoots tg->shares.
+         */
+        if ((!shares && rq_weight) || shares > tg->shares)
+                shares = tg->shares;
+        /*
+         * Likewise when there is no parent domain, or the parent domain
+         * does not load balance.
+         */
+        if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
+                shares = tg->shares;
+        /* No weight anywhere in the span: assume NICE_0_LOAD on each cpu. */
+        if (!rq_weight)
+                rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
+        /* Each cpu is updated under its own rq->lock with irqs disabled. */
+        for_each_cpu_mask(i, sd->span) {
+                struct rq *rq = cpu_rq(i);
+                unsigned long flags;
+                spin_lock_irqsave(&rq->lock, flags);
+                __update_group_shares_cpu(tg, i, shares, rq_weight);
+                spin_unlock_irqrestore(&rq->lock, flags);
+        }
 }
+/*
+ * Compute @cpu's hierarchical load (h_load) for one task group.
+ *
+ * The root group takes the load weight of the cpu's runqueue. Any other
+ * group takes its parent's h_load scaled by this group's shares over the
+ * parent's runqueue weight, which is why parents must be visited before
+ * their children (top-down). @sd is unused.
+ */
+static void
+tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+{
+        struct task_group *up = tg->parent;
+        unsigned long h_load;
+        if (up) {
+                struct cfs_rq *up_rq = up->cfs_rq[cpu];
+                h_load = up_rq->h_load * tg->cfs_rq[cpu]->shares;
+                h_load /= up_rq->load.weight + 1;
+        } else {
+                h_load = cpu_rq(cpu)->load.weight;
+        }
+        tg->cfs_rq[cpu]->h_load = h_load;
+}
+/*
+ * Visitor that does nothing; passed to walk_tg_tree() for the direction
+ * (down or up) in which a walk has no work to do.
+ */
+static void
+tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
+{
+}
+/*
+ * Recompute the group shares over @sd, bottom-up, unless that was already
+ * done less than sysctl_sched_shares_ratelimit ago (measured with
+ * cpu_clock() against sd->last_update).
+ */
+static void update_shares(struct sched_domain *sd)
+{
+        u64 now = cpu_clock(raw_smp_processor_id());
+        s64 elapsed = now - sd->last_update;
+        /* Rate limit: too soon since the previous update. */
+        if (elapsed < (s64)(u64)sysctl_sched_shares_ratelimit)
+                return;
+        sd->last_update = now;
+        walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+}
+/*
+ * update_shares() for callers that hold rq->lock.
+ *
+ * The lock is dropped around the update and re-taken before returning:
+ * tg_shares_up() takes the rq->lock of every cpu in the domain itself.
+ */
+static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
+{
+        spin_unlock(&rq->lock);
+        update_shares(sd);
+        spin_lock(&rq->lock);
+}
+/*
+ * Refresh the hierarchical load (h_load) of every task group for @cpu by
+ * walking the group tree top-down with tg_load_down().
+ */
+static void update_h_load(int cpu)
+{
+        walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+}
+#else
+/* Without CONFIG_FAIR_GROUP_SCHED there are no group shares to update. */
+static inline void update_shares(struct sched_domain *sd)
+{
+}
+/* Empty as well: rq->lock is left untouched. */
+static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
+{
+}
 #endif
-#endif /* CONFIG_SMP */
+#endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Record @shares in @cfs_rq. The store is compiled in only with
+ * CONFIG_SMP; without it this function does nothing.
+ */
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+#ifdef CONFIG_SMP
+        cfs_rq->shares = shares;
+#endif
+}
+#endif
 #include "sched_stats.h"
 #include "sched_idletask.c"
@@ -1500,27 +1599,17 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 #endif
 #define sched_class_highest (&rt_sched_class)
+#define for_each_class(class) \
+   for (class = sched_class_highest; class; class = class->next)
-static inline void inc_load(struct rq *rq, const struct task_struct *p)
+static void inc_nr_running(struct rq *rq)
-{
-        update_load_add(&rq->load, p->se.load.weight);
-}
-static inline void dec_load(struct rq *rq, const struct task_struct *p)
-{
-        update_load_sub(&rq->load, p->se.load.weight);
-}
-static void inc_nr_running(struct task_struct *p, struct rq *rq)
 {
        rq->nr_running++;
-        inc_load(rq, p);
 }
-static void dec_nr_running(struct task_struct *p, struct rq *rq)
+static void dec_nr_running(struct rq *rq)
 {
        rq->nr_running--;
-        dec_load(rq, p);
 }
 static void set_load_weight(struct task_struct *p)
@@ -1544,6 +1633,12 @@ static void set_load_weight(struct task_struct *p)
        p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
 }
+/*
+ * Fold @sample into the running average stored at @avg: the average moves
+ * one eighth of the (signed) distance towards the new sample.
+ */
+static void update_avg(u64 *avg, u64 sample)
+{
+        s64 delta = sample - *avg;
+        *avg = *avg + (delta >> 3);
+}
 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 {
        sched_info_queued(p);
@@ -1553,6 +1648,13 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 {
+        if (sleep && p->se.last_wakeup) {
+                update_avg(&p->se.avg_overlap,
+                           p->se.sum_exec_runtime - p->se.last_wakeup);
+                p->se.last_wakeup = 0;
+        }
+        sched_info_dequeued(p);
        p->sched_class->dequeue_task(rq, p, sleep);
        p->se.on_rq = 0;
 }
@@ -1612,7 +1714,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
                rq->nr_uninterruptible--;
        enqueue_task(rq, p, wakeup);
-        inc_nr_running(p, rq);
+        inc_nr_running(rq);
 }
 /*
@@ -1624,7 +1726,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
                rq->nr_uninterruptible++;
        dequeue_task(rq, p, sleep);
-        dec_nr_running(p, rq);
+        dec_nr_running(rq);
 }
 /**
@@ -1636,12 +1738,6 @@ inline int task_curr(const struct task_struct *p)
        return cpu_curr(task_cpu(p)) == p;
 }
-/* Used instead of source_load when we know the type == 0 */
-unsigned long weighted_cpuload(const int cpu)
-{
-        return cpu_rq(cpu)->load.weight;
-}
 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
        set_task_rq(p, cpu);
@@ -1670,6 +1766,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 #ifdef CONFIG_SMP
+/*
+ * Load weight of @cpu's runqueue.
+ * Used instead of source_load when we know the type == 0.
+ */
+static unsigned long weighted_cpuload(const int cpu)
+{
+        struct rq *rq = cpu_rq(cpu);
+        return rq->load.weight;
+}
 /*
 * Is this task likely cache-hot:
 */
@@ -1765,16 +1867,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 /*
 * wait_task_inactive - wait for a thread to unschedule.
 *
+ * If @match_state is nonzero, it's the @p->state value just checked and
+ * not expected to change.  If it changes, i.e. @p might have woken up,
+ * then return zero.  When we succeed in waiting for @p to be off its CPU,
+ * we return a positive number (its total switch count).  If a second call
+ * a short while later returns the same number, the caller can be sure that
+ * @p has remained unscheduled the whole time.
+ *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a *long* time. This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */
-void wait_task_inactive(struct task_struct *p)
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 {
        unsigned long flags;
        int running, on_rq;
+        unsigned long ncsw;
        struct rq *rq;
        for (;;) {
@@ -1797,8 +1907,11 @@ void wait_task_inactive(struct task_struct *p)
                 * return false if the runqueue has changed and p
                 * is actually now running somewhere else!
                 */
-                while (task_running(rq, p))
+                while (task_running(rq, p)) {
+                        if (match_state && unlikely(p->state != match_state))
+                                return 0;
                        cpu_relax();
+                }
                /*
                 * Ok, time to look more closely! We need the rq
@@ -1808,9 +1921,21 @@ void wait_task_inactive(struct task_struct *p)
                rq = task_rq_lock(p, &flags);
                running = task_running(rq, p);
                on_rq = p->se.on_rq;
+                ncsw = 0;
+                if (!match_state || p->state == match_state) {
+                        ncsw = p->nivcsw + p->nvcsw;
+                        if (unlikely(!ncsw))
+                                ncsw = 1;
+                }
                task_rq_unlock(rq, &flags);
                /*
+                 * If it changed from the expected state, bail out now.
+                 */
+                if (unlikely(!ncsw))
+                        break;
+                /*
                 * Was it really running after all now that we
                 * checked with the proper locks actually held?
                 *
@@ -1842,6 +1967,8 @@ void wait_task_inactive(struct task_struct *p)
                 */
                break;
        }
+        return ncsw;
 }
 /***
@@ -1880,7 +2007,7 @@ static unsigned long source_load(int cpu, int type)
        struct rq *rq = cpu_rq(cpu);
        unsigned long total = weighted_cpuload(cpu);
-        if (type == 0)
+        if (type == 0 || !sched_feat(LB_BIAS))
                return total;
        return min(rq->cpu_load[type-1], total);
@@ -1895,25 +2022,13 @@ static unsigned long target_load(int cpu, int type)
        struct rq *rq = cpu_rq(cpu);
        unsigned long total = weighted_cpuload(cpu);
-        if (type == 0)
+        if (type == 0 || !sched_feat(LB_BIAS))
                return total;
        return max(rq->cpu_load[type-1], total);
 }
 /*
- * Return the average load per task on the cpu's run queue
- */
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
-        struct rq *rq = cpu_rq(cpu);
-        unsigned long total = weighted_cpuload(cpu);
-        unsigned long n = rq->nr_running;
-        return n ? total / n : SCHED_LOAD_SCALE;
-}
-/*
 * find_idlest_group finds and returns the least busy CPU group within the
 * domain.
 */
@@ -1939,7 +2054,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
                /* Tally up the load of all CPUs in the group */
                avg_load = 0;
-                for_each_cpu_mask(i, group->cpumask) {
+                for_each_cpu_mask_nr(i, group->cpumask) {
                        /* Bias balancing toward cpus of our domain */
                        if (local_group)
                                load = source_load(i, load_idx);
@@ -1981,7 +2096,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
        /* Traverse only the allowed CPUs */
        cpus_and(*tmp, group->cpumask, p->cpus_allowed);
-        for_each_cpu_mask(i, *tmp) {
+        for_each_cpu_mask_nr(i, *tmp) {
                load = weighted_cpuload(i);
                if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2019,6 +2134,9 @@ static int sched_balance_self(int cpu, int flag)
                        sd = tmp;
        }
+        if (sd)
+                update_shares(sd);
        while (sd) {
                cpumask_t span, tmpmask;
                struct sched_group *group;
@@ -2085,6 +2203,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
        if (!sched_feat(SYNC_WAKEUPS))
                sync = 0;
+#ifdef CONFIG_SMP
+        if (sched_feat(LB_WAKEUP_UPDATE)) {
+                struct sched_domain *sd;
+                this_cpu = raw_smp_processor_id();
+                cpu = task_cpu(p);
+                for_each_domain(this_cpu, sd) {
+                        if (cpu_isset(cpu, sd->span)) {
+                                update_shares(sd);
+                                break;
+                        }
+                }
+        }
+#endif
        smp_wmb();
        rq = task_rq_lock(p, &flags);
        old_state = p->state;
@@ -2131,7 +2265,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
                        }
                }
        }
-#endif
+#endif /* CONFIG_SCHEDSTATS */
 out_activate:
 #endif /* CONFIG_SMP */
@@ -2149,6 +2283,9 @@ out_activate:
        success = 1;
 out_running:
+        trace_mark(kernel_sched_wakeup,
+                "pid %d state %ld ## rq %p task %p rq->curr %p",
+                p->pid, p->state, rq, p, rq->curr);
        check_preempt_curr(rq, p);
        p->state = TASK_RUNNING;
@@ -2157,6 +2294,8 @@ out_running:
                p->sched_class->task_wake_up(rq, p);
 #endif
 out:
+        current->se.last_wakeup = current->se.sum_exec_runtime;
        task_rq_unlock(rq, &flags);
        return success;
@@ -2277,8 +2416,11 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                 * management (if any):
                 */
                p->sched_class->task_new(rq, p);
-                inc_nr_running(p, rq);
+                inc_nr_running(rq);
        }
+        trace_mark(kernel_sched_wakeup_new,
+                "pid %d state %ld ## rq %p task %p rq->curr %p",
+                p->pid, p->state, rq, p, rq->curr);
        check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
        if (p->sched_class->task_wake_up)
@@ -2331,7 +2473,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
                notifier->ops->sched_out(notifier, next);
 }
-#else
+#else /* !CONFIG_PREEMPT_NOTIFIERS */
 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 {
@@ -2343,7 +2485,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
 {
 }
-#endif
+#endif /* CONFIG_PREEMPT_NOTIFIERS */
 /**
 * prepare_task_switch - prepare to switch tasks
@@ -2451,6 +2593,11 @@ context_switch(struct rq *rq, struct task_struct *prev,
        struct mm_struct *mm, *oldmm;
        prepare_task_switch(rq, prev, next);
+        trace_mark(kernel_sched_schedule,
+                "prev_pid %d next_pid %d prev_state %ld "
+                "## rq %p prev %p next %p",
+                prev->pid, next->pid, prev->state,
+                rq, prev, next);
        mm = next->mm;
        oldmm = prev->active_mm;
        /*
@@ -2680,7 +2827,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
        rq = task_rq_lock(p, &flags);
        if (!cpu_isset(dest_cpu, p->cpus_allowed)
-            || unlikely(cpu_is_offline(dest_cpu)))
+            || unlikely(!cpu_active(dest_cpu)))
                goto out;
        /* force the process onto the specified CPU */
@@ -2785,7 +2932,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
              enum cpu_idle_type idle, int *all_pinned,
              int *this_best_prio, struct rq_iterator *iterator)
 {
-        int loops = 0, pulled = 0, pinned = 0, skip_for_load;
+        int loops = 0, pulled = 0, pinned = 0;
        struct task_struct *p;
        long rem_load_move = max_load_move;
@@ -2801,14 +2948,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 next:
        if (!p || loops++ > sysctl_sched_nr_migrate)
                goto out;
-        /*
-         * To help distribute high priority tasks across CPUs we don't
+        if ((p->se.load.weight >> 1) > rem_load_move ||
-         * skip a task if it will be the highest priority task (i.e. smallest
-         * prio value) on its new queue regardless of its load weight
-         */
-        skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
-                                                         SCHED_LOAD_SCALE_FUZZ;
-        if ((skip_for_load && p->prio >= *this_best_prio) ||
            !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
                p = iterator->next(iterator->arg);
                goto next;
@@ -2863,6 +3004,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
                                max_load_move - total_load_moved,
                                sd, idle, all_pinned, &this_best_prio);
                class = class->next;
+                if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
+                        break;
        } while (class && max_load_move > total_load_moved);
        return total_load_moved > 0;
@@ -2939,6 +3084,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
        max_load = this_load = total_load = total_pwr = 0;
        busiest_load_per_task = busiest_nr_running = 0;
        this_load_per_task = this_nr_running = 0;
        if (idle == CPU_NOT_IDLE)
                load_idx = sd->busy_idx;
        else if (idle == CPU_NEWLY_IDLE)
@@ -2953,6 +3099,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                int __group_imb = 0;
                unsigned int balance_cpu = -1, first_idle_cpu = 0;
                unsigned long sum_nr_running, sum_weighted_load;
+                unsigned long sum_avg_load_per_task;
+                unsigned long avg_load_per_task;
                local_group = cpu_isset(this_cpu, group->cpumask);
@@ -2961,10 +3109,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                /* Tally up the load of all CPUs in the group */
                sum_weighted_load = sum_nr_running = avg_load = 0;
+                sum_avg_load_per_task = avg_load_per_task = 0;
                max_cpu_load = 0;
                min_cpu_load = ~0UL;
-                for_each_cpu_mask(i, group->cpumask) {
+                for_each_cpu_mask_nr(i, group->cpumask) {
                        struct rq *rq;
                        if (!cpu_isset(i, *cpus))
@@ -2994,6 +3144,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                        avg_load += load;
                        sum_nr_running += rq->nr_running;
                        sum_weighted_load += weighted_cpuload(i);
+                        sum_avg_load_per_task += cpu_avg_load_per_task(i);
                }
                /*
@@ -3015,7 +3167,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                avg_load = sg_div_cpu_power(group,
                                avg_load * SCHED_LOAD_SCALE);
-                if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
+                /*
+                 * Consider the group unbalanced when the imbalance is larger
+                 * than the average weight of two tasks.
+                 *
+                 * APZ: with cgroup the avg task weight can vary wildly and
+                 *      might not be a suitable number - should we keep a
+                 *      normalized nr_running number somewhere that negates
+                 *      the hierarchy?
+                 */
+                avg_load_per_task = sg_div_cpu_power(group,
+                                sum_avg_load_per_task * SCHED_LOAD_SCALE);
+                if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
                        __group_imb = 1;
                group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
@@ -3156,9 +3321,9 @@ small_imbalance:
                        if (busiest_load_per_task > this_load_per_task)
                                imbn = 1;
                } else
-                        this_load_per_task = SCHED_LOAD_SCALE;
+                        this_load_per_task = cpu_avg_load_per_task(this_cpu);
-                if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
+                if (max_load - this_load + 2*busiest_load_per_task >=
                                        busiest_load_per_task * imbn) {
                        *imbalance = busiest_load_per_task;
                        return busiest;
@@ -3228,7 +3393,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
        unsigned long max_load = 0;
        int i;
-        for_each_cpu_mask(i, group->cpumask) {
+        for_each_cpu_mask_nr(i, group->cpumask) {
                unsigned long wl;
                if (!cpu_isset(i, *cpus))
@@ -3284,6 +3449,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
        schedstat_inc(sd, lb_count[idle]);
 redo:
+        update_shares(sd);
        group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
                                   cpus, balance);
@@ -3386,8 +3552,9 @@ redo:
        if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-                return -1;
+                ld_moved = -1;
-        return ld_moved;
+        goto out;
 out_balanced:
        schedstat_inc(sd, lb_balanced[idle]);
@@ -3402,8 +3569,13 @@ out_one_pinned:
        if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-                return -1;
+                ld_moved = -1;
-        return 0;
+        else
+                ld_moved = 0;
+out:
+        if (ld_moved)
+                update_shares(sd);
+        return ld_moved;
 }
 /*
@@ -3438,6 +3610,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
        schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
 redo:
+        update_shares_locked(this_rq, sd);
        group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
                                   &sd_idle, cpus, NULL);
        if (!group) {
@@ -3481,6 +3654,7 @@ redo:
        } else
                sd->nr_balance_failed = 0;
+        update_shares_locked(this_rq, sd);
        return ld_moved;
 out_balanced:
@@ -3621,7 +3795,7 @@ int select_nohz_load_balancer(int stop_tick)
                /*
                 * If we are going offline and still the leader, give up!
                 */
-                if (cpu_is_offline(cpu) &&
+                if (!cpu_active(cpu) &&
                    atomic_read(&nohz.load_balancer) == cpu) {
                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
                                BUG();
@@ -3672,6 +3846,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
        /* Earliest time when we have to do rebalance again */
        unsigned long next_balance = jiffies + 60*HZ;
        int update_next_balance = 0;
+        int need_serialize;
        cpumask_t tmp;
        for_each_domain(cpu, sd) {
@@ -3689,8 +3864,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
                if (interval > HZ*NR_CPUS/10)
                        interval = HZ*NR_CPUS/10;
+                need_serialize = sd->flags & SD_SERIALIZE;
-                if (sd->flags & SD_SERIALIZE) {
+                if (need_serialize) {
                        if (!spin_trylock(&balancing))
                                goto out;
                }
@@ -3706,7 +3882,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
                        }
                        sd->last_balance = jiffies;
                }
-                if (sd->flags & SD_SERIALIZE)
+                if (need_serialize)
                        spin_unlock(&balancing);
 out:
                if (time_after(next_balance, sd->last_balance + interval)) {
@@ -3759,7 +3935,7 @@ static void run_rebalance_domains(struct softirq_action *h)
                int balance_cpu;
                cpu_clear(this_cpu, cpus);
-                for_each_cpu_mask(balance_cpu, cpus) {
+                for_each_cpu_mask_nr(balance_cpu, cpus) {
                        /*
                         * If this cpu gets work to do, stop the load balancing
                         * work being done for other cpus. Next load
@@ -3895,6 +4071,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
                cpustat->nice = cputime64_add(cpustat->nice, tmp);
        else
                cpustat->user = cputime64_add(cpustat->user, tmp);
+        /* Account for user time used */
+        acct_update_integrals(p);
 }
 /*
@@ -4021,26 +4199,44 @@ void scheduler_tick(void)
 #endif
 }
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+                                defined(CONFIG_PREEMPT_TRACER))
+/*
+ * Pick the address to report as the parent of a preempt on/off event.
+ * When @addr lies inside the lock functions, step outwards to
+ * CALLER_ADDR2 and, if that is still a lock function, to CALLER_ADDR3.
+ */
+static inline unsigned long get_parent_ip(unsigned long addr)
+{
+        if (!in_lock_functions(addr))
+                return addr;
+
+        addr = CALLER_ADDR2;
+        if (in_lock_functions(addr))
+                addr = CALLER_ADDR3;
+
+        return addr;
+}
 void __kprobes add_preempt_count(int val)
 {
+#ifdef CONFIG_DEBUG_PREEMPT
        /*
         * Underflow?
         */
        if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
                return;
+#endif /* CONFIG_DEBUG_PREEMPT */
        preempt_count() += val;
+#ifdef CONFIG_DEBUG_PREEMPT
        /*
         * Spinlock count overflowing soon?
         */
        DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
                                PREEMPT_MASK - 10);
+#endif /* CONFIG_DEBUG_PREEMPT */
+        if (preempt_count() == val) /* the count was zero before the add above */
+                trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 }
 EXPORT_SYMBOL(add_preempt_count);
 void __kprobes sub_preempt_count(int val)
 {
+#ifdef CONFIG_DEBUG_PREEMPT
        /*
         * Underflow?
         */
@@ -4052,7 +4248,10 @@ void __kprobes sub_preempt_count(int val)
        if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
                        !(preempt_count() & PREEMPT_MASK)))
                return;
+#endif
+        if (preempt_count() == val)
+                trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
        preempt_count() -= val;
 }
 EXPORT_SYMBOL(sub_preempt_count);
@@ -4070,6 +4269,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
                prev->comm, prev->pid, preempt_count());
        debug_show_held_locks(prev);
+        print_modules();
        if (irqs_disabled())
                print_irqtrace_events(prev);
@@ -4158,7 +4358,8 @@ need_resched_nonpreemptible:
        schedule_debug(prev);
-        hrtick_clear(rq);
+        if (sched_feat(HRTICK))
+                hrtick_clear(rq);
        /*
         * Do the rq-clock update outside the rq lock:
@@ -4204,8 +4405,6 @@ need_resched_nonpreemptible:
        } else
                spin_unlock_irq(&rq->lock);
-        hrtick_set(rq);
        if (unlikely(reacquire_kernel_lock(current) < 0))
                goto need_resched_nonpreemptible;
@@ -4586,10 +4785,8 @@ void set_user_nice(struct task_struct *p, long nice)
                goto out_unlock;
        }
        on_rq = p->se.on_rq;
-        if (on_rq) {
+        if (on_rq)
                dequeue_task(rq, p, 0);
-                dec_load(rq, p);
-        }
        p->static_prio = NICE_TO_PRIO(nice);
        set_load_weight(p);
@@ -4599,7 +4796,6 @@ void set_user_nice(struct task_struct *p, long nice)
        if (on_rq) {
                enqueue_task(rq, p, 0);
-                inc_load(rq, p);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@ -4744,16 +4940,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
        set_load_weight(p);
 }
-/**
+static int __sched_setscheduler(struct task_struct *p, int policy,
- * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+                                struct sched_param *param, bool user)
- * @p: the task in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- *
- * NOTE that the task may be already dead.
- */
-int sched_setscheduler(struct task_struct *p, int policy,
-                       struct sched_param *param)
 {
        int retval, oldprio, oldpolicy = -1, on_rq, running;
        unsigned long flags;
@@ -4785,7 +4973,7 @@ recheck:
        /*
         * Allow unprivileged RT tasks to decrease priority:
         */
-        if (!capable(CAP_SYS_NICE)) {
+        if (user && !capable(CAP_SYS_NICE)) {
                if (rt_policy(policy)) {
                        unsigned long rlim_rtprio;
@@ -4821,7 +5009,8 @@ recheck:
         * Do not allow realtime tasks into groups that have no runtime
         * assigned.
         */
-        if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+        if (user
+            && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
                return -EPERM;
 #endif
@@ -4870,8 +5059,39 @@ recheck:
        return 0;
 }
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Forwards to __sched_setscheduler() with @user set to true, so the checks
+ * there that are conditional on @user (the CAP_SYS_NICE test and the RT
+ * group-runtime test) stay in effect.
+ *
+ * NOTE that the task may be already dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+                       struct sched_param *param)
+{
+        return __sched_setscheduler(p, policy, param, true);
+}
 EXPORT_SYMBOL_GPL(sched_setscheduler);
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission.  For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ *
+ * Forwards to __sched_setscheduler() with @user set to false, which
+ * skips the checks there that are conditional on @user.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+                               struct sched_param *param)
+{
+        return __sched_setscheduler(p, policy, param, false);
+}
 static int
 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 {
@@ -5070,24 +5290,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
        return sched_setaffinity(pid, &new_mask);
 }
-/*
- * Represents all cpu's present in the system
- * In systems capable of hotplug, this map could dynamically grow
- * as new cpu's are detected in the system via any platform specific
- * method, such as ACPI for e.g.
- */
-cpumask_t cpu_present_map __read_mostly;
-EXPORT_SYMBOL(cpu_present_map);
-#ifndef CONFIG_SMP
-cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
-EXPORT_SYMBOL(cpu_online_map);
-cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
-EXPORT_SYMBOL(cpu_possible_map);
-#endif
 long sched_getaffinity(pid_t pid, cpumask_t *mask)
 {
        struct task_struct *p;
@@ -5384,7 +5586,7 @@ out_unlock:
        return retval;
 }
-static const char stat_nam[] = "RSDTtZX";
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
 void sched_show_task(struct task_struct *p)
 {
@@ -5571,6 +5773,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
                goto out;
        }
+        if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
+                     !cpus_equal(p->cpus_allowed, *new_mask))) {
+                ret = -EINVAL;
+                goto out;
+        }
        if (p->sched_class->set_cpus_allowed)
                p->sched_class->set_cpus_allowed(p, new_mask);
        else {
@@ -5613,7 +5821,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
        struct rq *rq_dest, *rq_src;
        int ret = 0, on_rq;
-        if (unlikely(cpu_is_offline(dest_cpu)))
+        if (unlikely(!cpu_active(dest_cpu)))
                return ret;
        rq_src = cpu_rq(src_cpu);
@@ -6060,6 +6268,36 @@ static void unregister_sched_domain_sysctl(void)
 }
 #endif
+/*
+ * Mark @rq online: set its CPU in the root domain's online mask, flag the
+ * runqueue as online, then call every scheduling class's rq_online hook
+ * where one is provided.  Does nothing if the runqueue is already online.
+ */
+static void set_rq_online(struct rq *rq)
+{
+        const struct sched_class *class;
+
+        if (rq->online)
+                return;
+
+        cpu_set(rq->cpu, rq->rd->online);
+        rq->online = 1;
+
+        for_each_class(class) {
+                if (class->rq_online)
+                        class->rq_online(rq);
+        }
+}
+/*
+ * Mark @rq offline: call every scheduling class's rq_offline hook where one
+ * is provided, then clear the CPU from the root domain's online mask and
+ * reset rq->online.  Does nothing if the runqueue is already offline.
+ */
+static void set_rq_offline(struct rq *rq)
+{
+        const struct sched_class *class;
+
+        if (!rq->online)
+                return;
+
+        for_each_class(class) {
+                if (class->rq_offline)
+                        class->rq_offline(rq);
+        }
+
+        cpu_clear(rq->cpu, rq->rd->online);
+        rq->online = 0;
+}
 /*
 * migration_call - callback that gets triggered when a CPU is added.
 * Here we can start up the necessary migration thread for the new CPU.
@@ -6097,7 +6335,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
                        BUG_ON(!cpu_isset(cpu, rq->rd->span));
-                        cpu_set(cpu, rq->rd->online);
+                        set_rq_online(rq);
                }
                spin_unlock_irqrestore(&rq->lock, flags);
                break;
@@ -6158,7 +6397,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
                        BUG_ON(!cpu_isset(cpu, rq->rd->span));
-                        cpu_clear(cpu, rq->rd->online);
+                        set_rq_offline(rq);
                }
                spin_unlock_irqrestore(&rq->lock, flags);
                break;
@@ -6175,7 +6414,7 @@ static struct notifier_block __cpuinitdata migration_notifier = {
        .priority = 10
 };
-void __init migration_init(void)
+static int __init migration_init(void)
 {
        void *cpu = (void *)(long)smp_processor_id();
        int err;
@@ -6185,13 +6424,38 @@ void __init migration_init(void)
        BUG_ON(err == NOTIFY_BAD);
        migration_call(&migration_notifier, CPU_ONLINE, cpu);
        register_cpu_notifier(&migration_notifier);
+        /*
+         * err holds a notifier return code (it is checked against
+         * NOTIFY_BAD above), not an initcall status.  A bad result has
+         * already triggered the BUG_ON(), so report success with 0.
+         */
+        return 0;
 }
+early_initcall(migration_init);
 #endif
 #ifdef CONFIG_SMP
 #ifdef CONFIG_SCHED_DEBUG
+/*
+ * Translate a sched-domain level into the name shown in the sched-domain
+ * debug output.  Any value without a case of its own is reported as "MAX".
+ */
+static inline const char *sd_level_to_string(enum sched_domain_level lvl)
+{
+        const char *name = "MAX";
+
+        switch (lvl) {
+        case SD_LV_NONE:
+                name = "NONE";
+                break;
+        case SD_LV_SIBLING:
+                name = "SIBLING";
+                break;
+        case SD_LV_MC:
+                name = "MC";
+                break;
+        case SD_LV_CPU:
+                name = "CPU";
+                break;
+        case SD_LV_NODE:
+                name = "NODE";
+                break;
+        case SD_LV_ALLNODES:
+                name = "ALLNODES";
+                break;
+        case SD_LV_MAX:
+                /* keeps the initial "MAX" */
+                break;
+        }
+
+        return name;
+}
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                                  cpumask_t *groupmask)
 {
@@ -6211,7 +6475,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                return -1;
        }
-        printk(KERN_CONT "span %s\n", str);
+        printk(KERN_CONT "span %s level %s\n",
+                str, sd_level_to_string(sd->level));
        if (!cpu_isset(cpu, sd->span)) {
                printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6295,9 +6560,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
        }
        kfree(groupmask);
 }
-#else
+#else /* !CONFIG_SCHED_DEBUG */
 # define sched_domain_debug(sd, cpu) do { } while (0)
-#endif
+#endif /* CONFIG_SCHED_DEBUG */
 static int sd_degenerate(struct sched_domain *sd)
 {
@@ -6357,20 +6622,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 {
        unsigned long flags;
-        const struct sched_class *class;
        spin_lock_irqsave(&rq->lock, flags);
        if (rq->rd) {
                struct root_domain *old_rd = rq->rd;
-                for (class = sched_class_highest; class; class = class->next) {
+                if (cpu_isset(rq->cpu, old_rd->online))
-                        if (class->leave_domain)
+                        set_rq_offline(rq);
-                                class->leave_domain(rq);
-                }
                cpu_clear(rq->cpu, old_rd->span);
-                cpu_clear(rq->cpu, old_rd->online);
                if (atomic_dec_and_test(&old_rd->refcount))
                        kfree(old_rd);
@@ -6381,12 +6642,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
        cpu_set(rq->cpu, rd->span);
        if (cpu_isset(rq->cpu, cpu_online_map))
-                cpu_set(rq->cpu, rd->online);
+                set_rq_online(rq);
-        for (class = sched_class_highest; class; class = class->next) {
-                if (class->join_domain)
-                        class->join_domain(rq);
-        }
        spin_unlock_irqrestore(&rq->lock, flags);
 }
@@ -6397,6 +6653,8 @@ static void init_rootdomain(struct root_domain *rd)
        cpus_clear(rd->span);
        cpus_clear(rd->online);
+        cpupri_init(&rd->cpupri);
 }
 static void init_defrootdomain(void)
@@ -6458,7 +6716,8 @@ static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
 {
-        int ints[NR_CPUS], i;
+        static int __initdata ints[NR_CPUS];
+        int i;
        str = get_options(str, ARRAY_SIZE(ints), ints);
        cpus_clear(cpu_isolated_map);
@@ -6492,7 +6751,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
        cpus_clear(*covered);
-        for_each_cpu_mask(i, *span) {
+        for_each_cpu_mask_nr(i, *span) {
                struct sched_group *sg;
                int group = group_fn(i, cpu_map, &sg, tmpmask);
                int j;
@@ -6503,7 +6762,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
                cpus_clear(sg->cpumask);
                sg->__cpu_power = 0;
-                for_each_cpu_mask(j, *span) {
+                for_each_cpu_mask_nr(j, *span) {
                        if (group_fn(j, cpu_map, NULL, tmpmask) != group)
                                continue;
@@ -6539,9 +6798,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
        min_val = INT_MAX;
-        for (i = 0; i < MAX_NUMNODES; i++) {
+        for (i = 0; i < nr_node_ids; i++) {
                /* Start at @node */
-                n = (node + i) % MAX_NUMNODES;
+                n = (node + i) % nr_node_ids;
                if (!nr_cpus_node(n))
                        continue;
@@ -6591,7 +6850,7 @@ static void sched_domain_node_span(int node, cpumask_t *span)
                cpus_or(*span, *span, *nodemask);
        }
 }
-#endif
+#endif /* CONFIG_NUMA */
 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
@@ -6610,7 +6869,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
                *sg = &per_cpu(sched_group_cpus, cpu);
        return cpu;
 }
-#endif
+#endif /* CONFIG_SCHED_SMT */
 /*
 * multi-core sched-domains:
@@ -6618,7 +6877,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
 #ifdef CONFIG_SCHED_MC
 static DEFINE_PER_CPU(struct sched_domain, core_domains);
 static DEFINE_PER_CPU(struct sched_group, sched_group_core);
-#endif
+#endif /* CONFIG_SCHED_MC */
 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
 static int
@@ -6703,7 +6962,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
        if (!sg)
                return;
        do {
-                for_each_cpu_mask(j, sg->cpumask) {
+                for_each_cpu_mask_nr(j, sg->cpumask) {
                        struct sched_domain *sd;
                        sd = &per_cpu(phys_domains, j);
@@ -6720,7 +6979,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
                sg = sg->next;
        } while (sg != group_head);
 }
-#endif
+#endif /* CONFIG_NUMA */
 #ifdef CONFIG_NUMA
 /* Free memory allocated for various sched_group structures */
@@ -6728,14 +6987,14 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 {
        int cpu, i;
-        for_each_cpu_mask(cpu, *cpu_map) {
+        for_each_cpu_mask_nr(cpu, *cpu_map) {
                struct sched_group **sched_group_nodes
                        = sched_group_nodes_bycpu[cpu];
                if (!sched_group_nodes)
                        continue;
-                for (i = 0; i < MAX_NUMNODES; i++) {
+                for (i = 0; i < nr_node_ids; i++) {
                        struct sched_group *oldsg, *sg = sched_group_nodes[i];
                        *nodemask = node_to_cpumask(i);
@@ -6757,11 +7016,11 @@ next_sg:
                sched_group_nodes_bycpu[cpu] = NULL;
        }
 }
-#else
+#else /* !CONFIG_NUMA */
 static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 {
 }
-#endif
+#endif /* CONFIG_NUMA */
 /*
 * Initialize sched groups cpu_power.
@@ -6928,7 +7187,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
        /*
         * Allocate the per-node list of sched groups
         */
-        sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
+        sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
                                    GFP_KERNEL);
        if (!sched_group_nodes) {
                printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -6967,7 +7226,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
        /*
         * Set up domains for cpus specified by the cpu_map.
         */
-        for_each_cpu_mask(i, *cpu_map) {
+        for_each_cpu_mask_nr(i, *cpu_map) {
                struct sched_domain *sd = NULL, *p;
                SCHED_CPUMASK_VAR(nodemask, allmasks);
@@ -7034,7 +7293,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #ifdef CONFIG_SCHED_SMT
        /* Set up CPU (sibling) groups */
-        for_each_cpu_mask(i, *cpu_map) {
+        for_each_cpu_mask_nr(i, *cpu_map) {
                SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
                SCHED_CPUMASK_VAR(send_covered, allmasks);
@@ -7051,7 +7310,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #ifdef CONFIG_SCHED_MC
        /* Set up multi-core groups */
-        for_each_cpu_mask(i, *cpu_map) {
+        for_each_cpu_mask_nr(i, *cpu_map) {
                SCHED_CPUMASK_VAR(this_core_map, allmasks);
                SCHED_CPUMASK_VAR(send_covered, allmasks);
@@ -7067,7 +7326,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
        /* Set up physical groups */
-        for (i = 0; i < MAX_NUMNODES; i++) {
+        for (i = 0; i < nr_node_ids; i++) {
                SCHED_CPUMASK_VAR(nodemask, allmasks);
                SCHED_CPUMASK_VAR(send_covered, allmasks);
@@ -7091,7 +7350,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                                        send_covered, tmpmask);
        }
-        for (i = 0; i < MAX_NUMNODES; i++) {
+        for (i = 0; i < nr_node_ids; i++) {
                /* Set up node groups */
                struct sched_group *sg, *prev;
                SCHED_CPUMASK_VAR(nodemask, allmasks);
@@ -7118,7 +7377,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                        goto error;
                }
                sched_group_nodes[i] = sg;
-                for_each_cpu_mask(j, *nodemask) {
+                for_each_cpu_mask_nr(j, *nodemask) {
                        struct sched_domain *sd;
                        sd = &per_cpu(node_domains, j);
@@ -7130,9 +7389,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                cpus_or(*covered, *covered, *nodemask);
                prev = sg;
-                for (j = 0; j < MAX_NUMNODES; j++) {
+                for (j = 0; j < nr_node_ids; j++) {
                        SCHED_CPUMASK_VAR(notcovered, allmasks);
-                        int n = (i + j) % MAX_NUMNODES;
+                        int n = (i + j) % nr_node_ids;
                        node_to_cpumask_ptr(pnodemask, n);
                        cpus_complement(*notcovered, *covered);
@@ -7164,28 +7423,28 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
        /* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
-        for_each_cpu_mask(i, *cpu_map) {
+        for_each_cpu_mask_nr(i, *cpu_map) {
                struct sched_domain *sd = &per_cpu(cpu_domains, i);
                init_sched_groups_power(i, sd);
        }
 #endif
 #ifdef CONFIG_SCHED_MC
-        for_each_cpu_mask(i, *cpu_map) {
+        for_each_cpu_mask_nr(i, *cpu_map) {
                struct sched_domain *sd = &per_cpu(core_domains, i);
                init_sched_groups_power(i, sd);
        }
 #endif
-        for_each_cpu_mask(i, *cpu_map) {
+        for_each_cpu_mask_nr(i, *cpu_map) {
                struct sched_domain *sd = &per_cpu(phys_domains, i);
                init_sched_groups_power(i, sd);
        }
 #ifdef CONFIG_NUMA
-        for (i = 0; i < MAX_NUMNODES; i++)
+        for (i = 0; i < nr_node_ids; i++)
                init_numa_sched_groups_power(sched_group_nodes[i]);
        if (sd_allnodes) {
@@ -7198,7 +7457,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
        /* Attach the domains */
-        for_each_cpu_mask(i, *cpu_map) {
+        for_each_cpu_mask_nr(i, *cpu_map) {
                struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i);
@@ -7243,18 +7502,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
 }
 /*
- * Free current domain masks.
- * Called after all cpus are attached to NULL domain.
- */
-static void free_sched_domains(void)
-{
-        ndoms_cur = 0;
-        if (doms_cur != &fallback_doms)
-                kfree(doms_cur);
-        doms_cur = &fallback_doms;
-}
-/*
 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
 * For now this just excludes isolated cpus, but could be used to
 * exclude other special cases in the future.
@@ -7293,7 +7540,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
        unregister_sched_domain_sysctl();
-        for_each_cpu_mask(i, *cpu_map)
+        for_each_cpu_mask_nr(i, *cpu_map)
                cpu_attach_domain(NULL, &def_root_domain, i);
        synchronize_sched();
        arch_destroy_sched_domains(cpu_map, &tmpmask);
@@ -7332,7 +7579,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
 * ownership of it and will kfree it when done with it. If the caller
 * failed the kmalloc call, then it can pass in doms_new == NULL,
 * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms'.
+ * 'fallback_doms'; it also forces the domains to be rebuilt.
 *
 * Call with hotplug lock held
 */
@@ -7346,12 +7593,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
        /* always unregister in case we don't destroy any domains */
        unregister_sched_domain_sysctl();
-        if (doms_new == NULL) {
+        if (doms_new == NULL)
-                ndoms_new = 1;
+                ndoms_new = 0;
-                doms_new = &fallback_doms;
-                cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-                dattr_new = NULL;
-        }
        /* Destroy deleted domains */
        for (i = 0; i < ndoms_cur; i++) {
@@ -7366,6 +7609,14 @@ match1:
                ;
        }
+        if (doms_new == NULL) {
+                ndoms_cur = 0;
+                ndoms_new = 1;
+                doms_new = &fallback_doms;
+                cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+                dattr_new = NULL;
+        }
        /* Build new domains */
        for (i = 0; i < ndoms_new; i++) {
                for (j = 0; j < ndoms_cur; j++) {
@@ -7396,17 +7647,10 @@ match2:
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 int arch_reinit_sched_domains(void)
 {
-        int err;
        get_online_cpus();
-        mutex_lock(&sched_domains_mutex);
+        rebuild_sched_domains();
-        detach_destroy_domains(&cpu_online_map);
-        free_sched_domains();
-        err = arch_init_sched_domains(&cpu_online_map);
-        mutex_unlock(&sched_domains_mutex);
        put_online_cpus();
+        return 0;
-        return err;
 }
 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
@@ -7427,11 +7671,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 }
 #ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
+static ssize_t sched_mc_power_savings_show(struct sys_device *dev,
+                                struct sysdev_attribute *attr, char *page)
 {
        return sprintf(page, "%u\n", sched_mc_power_savings);
 }
 static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
+                                            struct sysdev_attribute *attr,
                                            const char *buf, size_t count)
 {
        return sched_power_savings_store(buf, count, 0);
@@ -7441,11 +7687,13 @@ static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
 #endif
 #ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
+static ssize_t sched_smt_power_savings_show(struct sys_device *dev,
+                                struct sysdev_attribute *attr, char *page)
 {
        return sprintf(page, "%u\n", sched_smt_power_savings);
 }
 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
+                                             struct sysdev_attribute *attr,
                                             const char *buf, size_t count)
 {
        return sched_power_savings_store(buf, count, 1);
@@ -7470,54 +7718,51 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 #endif
        return err;
 }
-#endif
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+#ifndef CONFIG_CPUSETS
 /*
- * Force a reinitialization of the sched domains hierarchy. The domains
+ * Add online and remove offline CPUs from the scheduler domains.
- * and groups cannot be updated in place without racing with the balancing
+ * When cpusets are enabled they take over this function.
- * code, so we temporarily attach all running cpus to the NULL domain
- * which will prevent rebalancing while the sched domains are recalculated.
 */
 static int update_sched_domains(struct notifier_block *nfb,
                                unsigned long action, void *hcpu)
 {
        switch (action) {
-        case CPU_UP_PREPARE:
+        case CPU_ONLINE:
-        case CPU_UP_PREPARE_FROZEN:
+        case CPU_ONLINE_FROZEN:
+        case CPU_DEAD:
+        case CPU_DEAD_FROZEN:
+                partition_sched_domains(0, NULL, NULL);
+                return NOTIFY_OK;
+        default:
+                return NOTIFY_DONE;
+        }
+}
+#endif
+static int update_runtime(struct notifier_block *nfb,
+                                unsigned long action, void *hcpu)
+{
+        int cpu = (int)(long)hcpu;
+        switch (action) {
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
-                detach_destroy_domains(&cpu_online_map);
+                disable_runtime(cpu_rq(cpu));
-                free_sched_domains();
                return NOTIFY_OK;
-        case CPU_UP_CANCELED:
-        case CPU_UP_CANCELED_FROZEN:
        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
-        case CPU_DEAD:
+                enable_runtime(cpu_rq(cpu));
-        case CPU_DEAD_FROZEN:
+                return NOTIFY_OK;
-                /*
-                 * Fall through and re-initialise the domains.
-                 */
-                break;
        default:
                return NOTIFY_DONE;
        }
-#ifndef CONFIG_CPUSETS
-        /*
-         * Create default domain partitioning if cpusets are disabled.
-         * Otherwise we let cpusets rebuild the domains based on the
-         * current setup.
-         */
-        /* The hotplug lock is already held by cpu_up/cpu_down */
-        arch_init_sched_domains(&cpu_online_map);
-#endif
-        return NOTIFY_OK;
 }
 void __init sched_init_smp(void)
@@ -7537,8 +7782,15 @@ void __init sched_init_smp(void)
                cpu_set(smp_processor_id(), non_isolated_cpus);
        mutex_unlock(&sched_domains_mutex);
        put_online_cpus();
+#ifndef CONFIG_CPUSETS
        /* XXX: Theoretical race here - CPU may be hotplugged now */
        hotcpu_notifier(update_sched_domains, 0);
+#endif
+        /* RT runtime code needs to handle some hotplug events */
+        hotcpu_notifier(update_runtime, 0);
        init_hrtick();
        /* Move init over to a non-isolated CPU */
@@ -7695,8 +7947,8 @@ void __init sched_init(void)
                root_task_group.cfs_rq = (struct cfs_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
-#endif
+#endif /* CONFIG_USER_SCHED */
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
                init_task_group.rt_se = (struct sched_rt_entity **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
@@ -7710,8 +7962,8 @@ void __init sched_init(void)
                root_task_group.rt_rq = (struct rt_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
-#endif
+#endif /* CONFIG_USER_SCHED */
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
        }
 #ifdef CONFIG_SMP
@@ -7727,8 +7979,8 @@ void __init sched_init(void)
 #ifdef CONFIG_USER_SCHED
        init_rt_bandwidth(&root_task_group.rt_bandwidth,
                        global_rt_period(), RUNTIME_INF);
-#endif
+#endif /* CONFIG_USER_SCHED */
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
 #ifdef CONFIG_GROUP_SCHED
        list_add(&init_task_group.list, &task_groups);
@@ -7738,8 +7990,8 @@ void __init sched_init(void)
        INIT_LIST_HEAD(&root_task_group.children);
        init_task_group.parent = &root_task_group;
        list_add(&init_task_group.siblings, &root_task_group.children);
-#endif
+#endif /* CONFIG_USER_SCHED */
-#endif
+#endif /* CONFIG_GROUP_SCHED */
        for_each_possible_cpu(i) {
                struct rq *rq;
@@ -7819,6 +8071,7 @@ void __init sched_init(void)
                rq->next_balance = jiffies;
                rq->push_cpu = 0;
                rq->cpu = i;
+                rq->online = 0;
                rq->migration_thread = NULL;
                INIT_LIST_HEAD(&rq->migration_queue);
                rq_attach_root(rq, &def_root_domain);
@@ -7834,7 +8087,7 @@ void __init sched_init(void)
 #endif
 #ifdef CONFIG_SMP
-        open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
+        open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
 #endif
 #ifdef CONFIG_RT_MUTEXES
@@ -8058,7 +8311,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 {
        list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
 }
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED */
 static inline void free_fair_sched_group(struct task_group *tg)
 {
 }
@@ -8076,7 +8329,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu)
 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 {
 }
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
 static void free_rt_sched_group(struct task_group *tg)
@@ -8147,7 +8400,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
 {
        list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
 }
-#else
+#else /* !CONFIG_RT_GROUP_SCHED */
 static inline void free_rt_sched_group(struct task_group *tg)
 {
 }
@@ -8165,7 +8418,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu)
 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
 {
 }
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
 #ifdef CONFIG_GROUP_SCHED
 static void free_sched_group(struct task_group *tg)
@@ -8276,17 +8529,14 @@ void sched_move_task(struct task_struct *tsk)
        task_rq_unlock(rq, &flags);
 }
-#endif
+#endif /* CONFIG_GROUP_SCHED */
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
+static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 {
        struct cfs_rq *cfs_rq = se->cfs_rq;
-        struct rq *rq = cfs_rq->rq;
        int on_rq;
-        spin_lock_irq(&rq->lock);
        on_rq = se->on_rq;
        if (on_rq)
                dequeue_entity(cfs_rq, se, 0);
@@ -8296,8 +8546,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
        if (on_rq)
                enqueue_entity(cfs_rq, se, 0);
+}
-        spin_unlock_irq(&rq->lock);
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+        struct cfs_rq *cfs_rq = se->cfs_rq;
+        struct rq *rq = cfs_rq->rq;
+        unsigned long flags;
+        spin_lock_irqsave(&rq->lock, flags);
+        __set_se_shares(se, shares);
+        spin_unlock_irqrestore(&rq->lock, flags);
 }
 static DEFINE_MUTEX(shares_mutex);
@@ -8336,8 +8595,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
         * w/o tripping rebalance_share or load_balance_fair.
         */
        tg->shares = shares;
-        for_each_possible_cpu(i)
+        for_each_possible_cpu(i) {
+                /*
+                 * force a rebalance
+                 */
+                cfs_rq_set_shares(tg->cfs_rq[i], 0);
                set_se_shares(tg->se[i], shares);
+        }
        /*
         * Enable load balance activity on this group, by inserting it back on
@@ -8376,7 +8640,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
 #ifdef CONFIG_CGROUP_SCHED
 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
-        struct task_group *tgi, *parent = tg ? tg->parent : NULL;
+        struct task_group *tgi, *parent = tg->parent;
        unsigned long total = 0;
        if (!parent) {
@@ -8400,7 +8664,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
        }
        rcu_read_unlock();
-        return total + to_ratio(period, runtime) <
+        return total + to_ratio(period, runtime) <=
                to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
                                parent->rt_bandwidth.rt_runtime);
 }
@@ -8520,16 +8784,21 @@ long sched_group_rt_period(struct task_group *tg)
 static int sched_rt_global_constraints(void)
 {
+        struct task_group *tg = &root_task_group;
+        u64 rt_runtime, rt_period;
        int ret = 0;
+        rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+        rt_runtime = tg->rt_bandwidth.rt_runtime;
        mutex_lock(&rt_constraints_mutex);
-        if (!__rt_schedulable(NULL, 1, 0))
+        if (!__rt_schedulable(tg, rt_period, rt_runtime))
                ret = -EINVAL;
        mutex_unlock(&rt_constraints_mutex);
        return ret;
 }
-#else
+#else /* !CONFIG_RT_GROUP_SCHED */
 static int sched_rt_global_constraints(void)
 {
        unsigned long flags;
@@ -8547,7 +8816,7 @@ static int sched_rt_global_constraints(void)
        return 0;
 }
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
 int sched_rt_handler(struct ctl_table *table, int write,
                struct file *filp, void __user *buffer, size_t *lenp,
@@ -8655,7 +8924,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
        return (u64) tg->shares;
 }
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
@@ -8679,7 +8948,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
 {
        return sched_group_rt_period(cgroup_tg(cgrp));
 }
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
 static struct cftype cpu_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index ce05271219ab..22ed55d1167f 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -3,6 +3,9 @@
 *
 *  Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *
+ *  Updates and enhancements:
+ *    Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
+ *
 * Based on code by:
 *   Ingo Molnar <mingo@redhat.com>
 *   Guillaume Chazarain <guichaz@gmail.com>
@@ -32,6 +35,11 @@
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+#define MULTI_SHIFT 15
+/* Max is double, Min is 1/2 */
+#define MAX_MULTI (2LL << MULTI_SHIFT)
+#define MIN_MULTI (1LL << (MULTI_SHIFT-1))
 struct sched_clock_data {
        /*
         * Raw spinlock - this is a special case: this might be called
@@ -40,11 +48,15 @@ struct sched_clock_data {
         */
        raw_spinlock_t          lock;
-        unsigned long           prev_jiffies;
+        unsigned long           tick_jiffies;
        u64                     prev_raw;
        u64                     tick_raw;
        u64                     tick_gtod;
        u64                     clock;
+        s64                     multi;
+#ifdef CONFIG_NO_HZ
+        int                     check_max;
+#endif
 };
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
@@ -71,41 +83,91 @@ void sched_clock_init(void)
                struct sched_clock_data *scd = cpu_sdc(cpu);
                scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
-                scd->prev_jiffies = now_jiffies;
+                scd->tick_jiffies = now_jiffies;
                scd->prev_raw = 0;
                scd->tick_raw = 0;
                scd->tick_gtod = ktime_now;
                scd->clock = ktime_now;
+                scd->multi = 1 << MULTI_SHIFT;
+#ifdef CONFIG_NO_HZ
+                scd->check_max = 1;
+#endif
        }
        sched_clock_running = 1;
 }
+#ifdef CONFIG_NO_HZ
+/*
+ * Dynamic ticks make the jiffies delta inaccurate, so the upper bound
+ * that __update_sched_clock() derives from it cannot be relied upon.
+ * The max-clock check is therefore switched off for a CPU while its
+ * tick is stopped and switched back on when the tick restarts.
+ */
+void sched_clock_tick_stop(int cpu)
+{
+        struct sched_clock_data *data = cpu_sdc(cpu);
+        data->check_max = 0;
+}
+void sched_clock_tick_start(int cpu)
+{
+        struct sched_clock_data *data = cpu_sdc(cpu);
+        data->check_max = 1;
+}
+/* Report whether the max-clock check is currently enabled for @scd. */
+static int check_max(struct sched_clock_data *scd)
+{
+        return scd->check_max;
+}
+#else /* !CONFIG_NO_HZ */
+/* Without dynamic ticks the max-clock check is always enabled. */
+static int check_max(struct sched_clock_data *scd)
+{
+        return 1;
+}
+#endif /* CONFIG_NO_HZ */
 /*
 * update the percpu scd from the raw @now value
 *
 *  - filter out backward motion
 *  - use jiffies to generate a min,max window to clip the raw values
 */
-static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
+static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time)
 {
        unsigned long now_jiffies = jiffies;
-        long delta_jiffies = now_jiffies - scd->prev_jiffies;
+        long delta_jiffies = now_jiffies - scd->tick_jiffies;
        u64 clock = scd->clock;
        u64 min_clock, max_clock;
        s64 delta = now - scd->prev_raw;
        WARN_ON_ONCE(!irqs_disabled());
-        min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
+        /*
+         * At schedule tick the clock can be just under the gtod. We don't
+         * want to push it too prematurely.
+         */
+        min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC);
+        if (min_clock > TICK_NSEC)
+                min_clock -= TICK_NSEC / 2;
        if (unlikely(delta < 0)) {
                clock++;
                goto out;
        }
-        max_clock = min_clock + TICK_NSEC;
+        /*
+         * The clock must stay within a jiffy of the gtod.
+         * But since we may be at the start of a jiffy or the end of one
+         * we add another jiffy buffer.
+         */
+        max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC;
+        delta *= scd->multi;
+        delta >>= MULTI_SHIFT;
-        if (unlikely(clock + delta > max_clock)) {
+        if (unlikely(clock + delta > max_clock) && check_max(scd)) {
                if (clock < max_clock)
                        clock = max_clock;
                else
@@ -118,9 +180,12 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
        if (unlikely(clock < min_clock))
                clock = min_clock;
-        scd->prev_raw = now;
+        if (time)
-        scd->prev_jiffies = now_jiffies;
+                *time = clock;
-        scd->clock = clock;
+        else {
+                scd->prev_raw = now;
+                scd->clock = clock;
+        }
 }
 static void lock_double_clock(struct sched_clock_data *data1,
@@ -160,25 +225,30 @@ u64 sched_clock_cpu(int cpu)
                now -= my_scd->tick_raw;
                now += scd->tick_raw;
-                now -= my_scd->tick_gtod;
+                now += my_scd->tick_gtod;
-                now += scd->tick_gtod;
+                now -= scd->tick_gtod;
                __raw_spin_unlock(&my_scd->lock);
+                __update_sched_clock(scd, now, &clock);
+                __raw_spin_unlock(&scd->lock);
        } else {
                __raw_spin_lock(&scd->lock);
+                __update_sched_clock(scd, now, NULL);
+                clock = scd->clock;
+                __raw_spin_unlock(&scd->lock);
        }
-        __update_sched_clock(scd, now);
-        clock = scd->clock;
-        __raw_spin_unlock(&scd->lock);
        return clock;
 }
 void sched_clock_tick(void)
 {
        struct sched_clock_data *scd = this_scd();
+        unsigned long now_jiffies = jiffies;
+        s64 mult, delta_gtod, delta_raw;
        u64 now, now_gtod;
        if (unlikely(!sched_clock_running))
@@ -186,18 +256,33 @@ void sched_clock_tick(void)
        WARN_ON_ONCE(!irqs_disabled());
-        now = sched_clock();
        now_gtod = ktime_to_ns(ktime_get());
+        now = sched_clock();
        __raw_spin_lock(&scd->lock);
-        __update_sched_clock(scd, now);
+        __update_sched_clock(scd, now, NULL);
        /*
         * update tick_gtod after __update_sched_clock() because that will
         * already observe 1 new jiffy; adding a new tick_gtod to that would
         * increase the clock 2 jiffies.
         */
+        delta_gtod = now_gtod - scd->tick_gtod;
+        delta_raw = now - scd->tick_raw;
+        if ((long)delta_raw > 0) {
+                mult = delta_gtod << MULTI_SHIFT;
+                do_div(mult, delta_raw);
+                scd->multi = mult;
+                if (scd->multi > MAX_MULTI)
+                        scd->multi = MAX_MULTI;
+                else if (scd->multi < MIN_MULTI)
+                        scd->multi = MIN_MULTI;
+        } else
+                scd->multi = 1 << MULTI_SHIFT;
        scd->tick_raw = now;
        scd->tick_gtod = now_gtod;
+        scd->tick_jiffies = now_jiffies;
        __raw_spin_unlock(&scd->lock);
 }
@@ -227,6 +312,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
        __raw_spin_lock(&scd->lock);
        scd->prev_raw = now;
        scd->clock += delta_ns;
+        scd->multi = 1 << MULTI_SHIFT;
        __raw_spin_unlock(&scd->lock);
        touch_softlockup_watchdog();
@@ -244,3 +330,16 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 {
        return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
 }
+/*
+ * cpu_clock - read the scheduler clock of @cpu
+ *
+ * Calls sched_clock_cpu() with local interrupts disabled (its update
+ * path warns when interrupts are enabled) and restores the previous
+ * interrupt state before returning the value read.
+ */
+unsigned long long cpu_clock(int cpu)
+{
+        unsigned long irq_flags;
+        unsigned long long now;
+        local_irq_save(irq_flags);
+        now = sched_clock_cpu(cpu);
+        local_irq_restore(irq_flags);
+        return now;
+}
+EXPORT_SYMBOL_GPL(cpu_clock);
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
new file mode 100644
index 000000000000..52154fefab7e
--- /dev/null
+++ b/kernel/sched_cpupri.c
@@ -0,0 +1,174 @@
+/*
+ *  kernel/sched_cpupri.c
+ *
+ *  CPU priority management
+ *
+ *  Copyright (C) 2007-2008 Novell
+ *
+ *  Author: Gregory Haskins <ghaskins@novell.com>
+ *
+ *  This code tracks the priority of each CPU so that global migration
+ *  decisions are easy to calculate.  Each CPU can be in a state as follows:
+ *
+ *                 (INVALID), IDLE, NORMAL, RT1, ... RT99
+ *
+ *  going from the lowest priority to the highest.  CPUs in the INVALID state
+ *  are not eligible for routing.  The system maintains this state with
+ *  a 2 dimensional bitmap (the first for priority class, the second for cpus
+ *  in that class).  Therefore a typical application without affinity
+ *  restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
+ *  searches).  For tasks with affinity restrictions, the algorithm has a
+ *  worst case complexity of O(min(102, nr_domcpus)), though the scenario that
+ *  yields the worst case search is fairly contrived.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; version 2
+ *  of the License.
+ */
+#include "sched_cpupri.h"
+/*
+ * Map a 140 based task->prio onto our 102 based cpupri scale:
+ *
+ *   CPUPRI_INVALID            -> CPUPRI_INVALID
+ *   MAX_PRIO                  -> CPUPRI_IDLE
+ *   any other >= MAX_RT_PRIO  -> CPUPRI_NORMAL
+ *   RT prio (< MAX_RT_PRIO)   -> MAX_RT_PRIO - prio + 1
+ *
+ * A numerically smaller RT prio therefore yields a larger cpupri.
+ */
+static int convert_prio(int prio)
+{
+        if (prio == CPUPRI_INVALID)
+                return CPUPRI_INVALID;
+        if (prio == MAX_PRIO)
+                return CPUPRI_IDLE;
+        if (prio >= MAX_RT_PRIO)
+                return CPUPRI_NORMAL;
+        return MAX_RT_PRIO - prio + 1;
+}
+/*
+ * Iterate @idx over every bit set in the priority bitmap @array, starting
+ * from the lowest set bit.  Arguments are parenthesized so that arbitrary
+ * expressions can be passed safely.
+ */
+#define for_each_cpupri_active(array, idx)                        \
+  for ((idx) = find_first_bit((array), CPUPRI_NR_PRIORITIES);     \
+       (idx) < CPUPRI_NR_PRIORITIES;                              \
+       (idx) = find_next_bit((array), CPUPRI_NR_PRIORITIES, (idx) + 1))
+/**
+ * cpupri_find - find the best (lowest-pri) CPU in the system
+ * @cp: The cpupri context
+ * @p: The task
+ * @lowest_mask: A mask to fill in with selected CPUs
+ *
+ * Walks the active priority levels upwards and stops at the first level,
+ * strictly below the task's own (converted) priority, that holds at least
+ * one CPU the task is allowed to run on.  @lowest_mask is only written
+ * when such a level is found.
+ *
+ * Note: This function returns the recommended CPUs as calculated during the
+ * current invocation.  By the time the call returns, the CPUs may have in
+ * fact changed priorities any number of times.  While not ideal, it is not
+ * an issue of correctness since the normal rebalancer logic will correct
+ * any discrepancies created by racing against the uncertainty of the current
+ * priority configuration.
+ *
+ * Returns: (int)bool - CPUs were found
+ */
+int cpupri_find(struct cpupri *cp, struct task_struct *p,
+                cpumask_t *lowest_mask)
+{
+        int task_pri = convert_prio(p->prio);
+        int level;
+        for_each_cpupri_active(cp->pri_active, level) {
+                struct cpupri_vec *vec = &cp->pri_to_cpu[level];
+                cpumask_t candidates;
+                /* Levels at or above the task's own priority are no use */
+                if (level >= task_pri)
+                        break;
+                cpus_and(candidates, p->cpus_allowed, vec->mask);
+                if (!cpus_empty(candidates)) {
+                        *lowest_mask = candidates;
+                        return 1;
+                }
+        }
+        return 0;
+}
+/**
+ * cpupri_set - update the cpu priority setting
+ * @cp: The cpupri context
+ * @cpu: The target cpu
+ * @newpri: The priority (INVALID-RT99) to assign to this CPU; it is passed
+ *          through convert_prio() before being used as a cpupri level
+ *
+ * Nothing is done when the converted priority equals the level currently
+ * recorded for @cpu.  Otherwise the CPU is removed from the vector of its
+ * old level (unless that was CPUPRI_INVALID), added to the vector of the
+ * new level (unless that is CPUPRI_INVALID), and finally the new level is
+ * recorded in cp->cpu_to_pri[cpu].  Each vector is modified under its own
+ * lock, and the matching bit in cp->pri_active is kept in step with the
+ * vector's count.
+ *
+ * Note: Assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpupri_set(struct cpupri *cp, int cpu, int newpri)
+{
+        int                 *currpri = &cp->cpu_to_pri[cpu];
+        int                  oldpri  = *currpri;
+        unsigned long        flags;
+        newpri = convert_prio(newpri);
+        BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
+        if (newpri == oldpri)
+                return;
+        /*
+         * If the cpu was currently mapped to a different value, we
+         * first need to unmap the old value.  The level's bit in
+         * pri_active is cleared once its count drops to zero.
+         */
+        if (likely(oldpri != CPUPRI_INVALID)) {
+                struct cpupri_vec *vec  = &cp->pri_to_cpu[oldpri];
+                spin_lock_irqsave(&vec->lock, flags);
+                vec->count--;
+                if (!vec->count)        /* last CPU left this level */
+                        clear_bit(oldpri, cp->pri_active);
+                cpu_clear(cpu, vec->mask);
+                spin_unlock_irqrestore(&vec->lock, flags);
+        }
+        if (likely(newpri != CPUPRI_INVALID)) {
+                struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
+                spin_lock_irqsave(&vec->lock, flags);
+                cpu_set(cpu, vec->mask);
+                vec->count++;
+                if (vec->count == 1)    /* first CPU at this level */
+                        set_bit(newpri, cp->pri_active);
+                spin_unlock_irqrestore(&vec->lock, flags);
+        }
+        *currpri = newpri;
+}
+/**
+ * cpupri_init - initialize the cpupri structure
+ * @cp: The cpupri context
+ *
+ * Zeroes the whole context, initializes the lock, count and mask of every
+ * priority vector, and marks every possible CPU as CPUPRI_INVALID.
+ *
+ * Returns: (void)
+ */
+void cpupri_init(struct cpupri *cp)
+{
+        int pri, cpu;
+        memset(cp, 0, sizeof(*cp));
+        for (pri = 0; pri < CPUPRI_NR_PRIORITIES; pri++) {
+                struct cpupri_vec *vec = &cp->pri_to_cpu[pri];
+                spin_lock_init(&vec->lock);
+                vec->count = 0;
+                cpus_clear(vec->mask);
+        }
+        /* No CPU has a valid priority level until cpupri_set() is called */
+        for_each_possible_cpu(cpu)
+                cp->cpu_to_pri[cpu] = CPUPRI_INVALID;
+}
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
new file mode 100644
index 000000000000..f25811b0f931
--- /dev/null
+++ b/kernel/sched_cpupri.h
@@ -0,0 +1,36 @@
+#ifndef _LINUX_CPUPRI_H
+#define _LINUX_CPUPRI_H
+#include <linux/sched.h>
+#define CPUPRI_NR_PRIORITIES    (MAX_RT_PRIO + 2)
+#define CPUPRI_NR_PRI_WORDS     BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
+#define CPUPRI_INVALID -1
+#define CPUPRI_IDLE     0
+#define CPUPRI_NORMAL   1
+/* values 2-101 are RT priorities 0-99 */
+struct cpupri_vec {
+        spinlock_t lock;  /* held by cpupri_set() while count/mask change */
+        int        count; /* number of cpus currently set in mask */
+        cpumask_t  mask;  /* cpus recorded at this priority index */
+};
+struct cpupri {
+        struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; /* one vector per index */
+        long              pri_active[CPUPRI_NR_PRI_WORDS]; /* bit set while vector is non-empty */
+        int               cpu_to_pri[NR_CPUS]; /* per-cpu index, or CPUPRI_INVALID */
+};
+#ifdef CONFIG_SMP
+int  cpupri_find(struct cpupri *cp,
+                 struct task_struct *p, cpumask_t *lowest_mask);
+void cpupri_set(struct cpupri *cp, int cpu, int pri);
+void cpupri_init(struct cpupri *cp);
+#else /* !CONFIG_SMP: no-op stubs taking the same arguments as the prototypes */
+#define cpupri_set(cp, cpu, pri) do { } while (0)
+#define cpupri_init(cp) do { } while (0)
+#endif /* CONFIG_SMP */
+#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 8bb713040ac9..bbe6b31c3c56 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -119,9 +119,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
        struct sched_entity *last;
        unsigned long flags;
-#if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED)
+#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
-        SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
-#else
        char path[128] = "";
        struct cgroup *cgroup = NULL;
        struct task_group *tg = cfs_rq->tg;
@@ -133,6 +131,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
                cgroup_path(cgroup, path, sizeof(path));
        SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
+#else
+        SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
 #endif
        SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
@@ -162,11 +162,64 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
        SEQ_printf(m, "  .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
        SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
 #ifdef CONFIG_SCHEDSTATS
-        SEQ_printf(m, "  .%-30s: %d\n", "bkl_count",
+#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n) /* no ';': each use supplies it */
-                        rq->bkl_count);
+        P(yld_exp_empty);
+        P(yld_act_empty);
+        P(yld_both_empty);
+        P(yld_count);
+        P(sched_switch);
+        P(sched_count);
+        P(sched_goidle);
+        P(ttwu_count);
+        P(ttwu_local);
+        P(bkl_count);
+#undef P
 #endif
        SEQ_printf(m, "  .%-30s: %ld\n", "nr_spread_over",
                        cfs_rq->nr_spread_over);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_SMP
+        SEQ_printf(m, "  .%-30s: %lu\n", "shares", cfs_rq->shares);
+#endif
+#endif
+}
+void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
+{
+#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
+        char path[128] = "";
+        struct cgroup *cgroup = NULL;
+        struct task_group *tg = rt_rq->tg;
+        if (tg)
+                cgroup = tg->css.cgroup;
+        if (cgroup)
+                cgroup_path(cgroup, path, sizeof(path)); /* otherwise path stays empty */
+        SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
+#else /* !(CONFIG_CGROUP_SCHED && CONFIG_RT_GROUP_SCHED): header without a path */
+        SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
+#endif /* CONFIG_CGROUP_SCHED && CONFIG_RT_GROUP_SCHED */
+#define P(x) \
+        SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
+#define PN(x) \
+        SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
+        P(rt_nr_running); /* P: field printed as one integer */
+        P(rt_throttled);
+        PN(rt_time); /* PN: field passed through SPLIT_NS(), printed as <int>.<6 digits> */
+        PN(rt_runtime);
+#undef PN
+#undef P
 }
 static void print_cpu(struct seq_file *m, int cpu)
@@ -208,6 +261,7 @@ static void print_cpu(struct seq_file *m, int cpu)
 #undef PN
        print_cfs_stats(m, cpu);
+        print_rt_stats(m, cpu);
        print_rq(m, rq, cpu);
 }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 08ae848b71d4..cf2cd6ce4cb2 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
 /*
 * SCHED_OTHER wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
-unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
@@ -334,6 +334,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 #endif
 /*
+ * delta *= w / rw
+ *
+ * The scaling is applied once per entity visited by
+ * for_each_sched_entity(), each time passing that entity's own weight and
+ * the load of cfs_rq_of(se) to calc_delta_mine().
+ */
+static inline unsigned long
+calc_delta_weight(unsigned long delta, struct sched_entity *se)
+{
+        for_each_sched_entity(se) {
+                delta = calc_delta_mine(delta,
+                                se->load.weight, &cfs_rq_of(se)->load);
+        }
+        return delta;
+}
+/*
+ * delta *= rw / w
+ *
+ * Same walk as calc_delta_weight(), but with the two weights handed to
+ * calc_delta_mine() the other way round: the weight of cfs_rq_of(se) is
+ * the plain value and the entity's own load is passed by reference.
+ */
+static inline unsigned long
+calc_delta_fair(unsigned long delta, struct sched_entity *se)
+{
+        for_each_sched_entity(se) {
+                delta = calc_delta_mine(delta,
+                                cfs_rq_of(se)->load.weight, &se->load);
+        }
+        return delta;
+}
+/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
@@ -362,47 +390,80 @@ static u64 __sched_period(unsigned long nr_running)
 */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-        u64 slice = __sched_period(cfs_rq->nr_running);
+        return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
-        for_each_sched_entity(se) {
-                cfs_rq = cfs_rq_of(se);
-                slice *= se->load.weight;
-                do_div(slice, cfs_rq->load.weight);
-        }
-        return slice;
 }
 /*
 * We calculate the vruntime slice of a to be inserted task
 *
- * vs = s/w = p/rw
+ * vs = s*rw/w = p
 */
 static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        unsigned long nr_running = cfs_rq->nr_running;
-        unsigned long weight;
-        u64 vslice;
        if (!se->on_rq)
                nr_running++;
-        vslice = __sched_period(nr_running);
+        return __sched_period(nr_running);
+}
+/*
+ * The goal of calc_delta_asym() is to be asymmetric around NICE_0_LOAD, in
+ * that it favours >=0 over <0.
+ *
+ *   -20         |
+ *               |
+ *     0 --------+-------
+ *             .'
+ *    19     .'
+ *
+ * An entity lighter than NICE_0_LOAD is scaled as if it weighed
+ * NICE_0_LOAD, with the runqueue weight raised by the same difference;
+ * any other entity is scaled by its real weight.
+ */
+static unsigned long
+calc_delta_asym(unsigned long delta, struct sched_entity *se)
+{
+        struct load_weight lw = {
+                .weight = NICE_0_LOAD,
+                .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
+        };
         for_each_sched_entity(se) {
-                cfs_rq = cfs_rq_of(se);
+                struct load_weight *se_lw = &se->load;
+                unsigned long rw = cfs_rq_of(se)->load.weight;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+                struct cfs_rq *cfs_rq = se->my_q;
+                struct task_group *tg = NULL;
+                if (cfs_rq)
+                        tg = cfs_rq->tg;
+                if (tg && tg->shares < NICE_0_LOAD) {
+                        /*
+                         * scale shares to what it would have been had
+                         * tg->weight been NICE_0_LOAD:
+                         *
+                         *   weight = 1024 * shares / tg->weight
+                         */
+                        lw.weight *= se->load.weight;
+                        lw.weight /= tg->shares;
+                        lw.inv_weight = 0;
+                        se_lw = &lw;
+                        rw += lw.weight - se->load.weight;
+                } else
+#endif
-                weight = cfs_rq->load.weight;
+                if (se->load.weight < NICE_0_LOAD) {
-                if (!se->on_rq)
+                        se_lw = &lw;
-                        weight += se->load.weight;
+                        rw += NICE_0_LOAD - se->load.weight;
+                }
-                vslice *= NICE_0_LOAD;
+                delta = calc_delta_mine(delta, rw, se_lw);
-                do_div(vslice, weight);
         }
-        return vslice;
+        return delta;
 }
 /*
@@ -419,11 +480,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
        curr->sum_exec_runtime += delta_exec;
        schedstat_add(cfs_rq, exec_clock, delta_exec);
-        delta_exec_weighted = delta_exec;
+        delta_exec_weighted = calc_delta_fair(delta_exec, curr);
-        if (unlikely(curr->load.weight != NICE_0_LOAD)) {
-                delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
-                                                        &curr->load);
-        }
        curr->vruntime += delta_exec_weighted;
 }
@@ -510,10 +567,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 * Scheduling class queueing methods:
 */
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+        cfs_rq->task_weight += weight; /* account_entity_dequeue() passes -weight */
+}
+#else /* !(CONFIG_SMP && CONFIG_FAIR_GROUP_SCHED): nothing is tracked */
+static inline void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+}
+#endif /* CONFIG_SMP && CONFIG_FAIR_GROUP_SCHED */
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        update_load_add(&cfs_rq->load, se->load.weight);
+        if (!parent_entity(se))
+                inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+        if (entity_is_task(se))
+                add_cfs_task_weight(cfs_rq, se->load.weight);
        cfs_rq->nr_running++;
        se->on_rq = 1;
        list_add(&se->group_node, &cfs_rq->tasks);
@@ -523,6 +597,10 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        update_load_sub(&cfs_rq->load, se->load.weight);
+        if (!parent_entity(se))
+                dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+        if (entity_is_task(se))
+                add_cfs_task_weight(cfs_rq, -se->load.weight);
        cfs_rq->nr_running--;
        se->on_rq = 0;
        list_del_init(&se->group_node);
@@ -609,8 +687,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
        if (!initial) {
                /* sleeps upto a single latency don't count. */
-                if (sched_feat(NEW_FAIR_SLEEPERS))
+                if (sched_feat(NEW_FAIR_SLEEPERS)) {
-                        vruntime -= sysctl_sched_latency;
+                        unsigned long thresh = sysctl_sched_latency;
+                        /*
+                         * convert the sleeper threshold into virtual time
+                         */
+                        if (sched_feat(NORMALIZED_SLEEPER))
+                                thresh = calc_delta_fair(thresh, se);
+                        vruntime -= thresh;
+                }
                /* ensure we never gain time by being placed backwards. */
                vruntime = max_vruntime(se->vruntime, vruntime);
@@ -639,21 +726,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
                __enqueue_entity(cfs_rq, se);
 }
-static void update_avg(u64 *avg, u64 sample)
-{
-        s64 diff = sample - *avg;
-        *avg += diff >> 3;
-}
-static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-        if (!se->last_wakeup)
-                return;
-        update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
-        se->last_wakeup = 0;
-}
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 {
@@ -664,7 +736,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
        update_stats_dequeue(cfs_rq, se);
        if (sleep) {
-                update_avg_stats(cfs_rq, se);
 #ifdef CONFIG_SCHEDSTATS
                if (entity_is_task(se)) {
                        struct task_struct *tsk = task_of(se);
@@ -726,17 +797,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
        se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 static struct sched_entity *
 pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-        if (!cfs_rq->next)
+        struct rq *rq = rq_of(cfs_rq);
-                return se;
+        u64 pair_slice = rq->clock - cfs_rq->pair_start;
-        if (wakeup_preempt_entity(cfs_rq->next, se) != 0)
+        if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
+                cfs_rq->pair_start = rq->clock;
                return se;
+        }
        return cfs_rq->next;
 }
@@ -808,7 +878,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 #ifdef CONFIG_SCHED_HRTICK
 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
-        int requeue = rq->curr == p;
        struct sched_entity *se = &p->se;
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -829,13 +898,13 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
                 * Don't schedule slices shorter than 10000ns, that just
                 * doesn't make sense. Rely on vruntime for fairness.
                 */
-                if (!requeue)
+                if (rq->curr != p)
                        delta = max(10000LL, delta);
-                hrtick_start(rq, delta, requeue);
+                hrtick_start(rq, delta);
        }
 }
-#else
+#else /* !CONFIG_SCHED_HRTICK */
 static inline void
 hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
@@ -934,6 +1003,8 @@ static void yield_task_fair(struct rq *rq)
 * not idle and an idle cpu is available.  The span of cpus to
 * search starts with cpus closest then further out as needed,
 * so we always favor a closer, idle cpu.
+ * Domains may include CPUs that are not usable for migration,
+ * hence we need to mask them out (cpu_active_map)
 *
 * Returns the CPU we should wake onto.
 */
@@ -961,7 +1032,8 @@ static int wake_idle(int cpu, struct task_struct *p)
                    || ((sd->flags & SD_WAKE_IDLE_FAR)
                        && !task_hot(p, task_rq(p)->clock, sd))) {
                        cpus_and(tmp, sd->span, p->cpus_allowed);
-                        for_each_cpu_mask(i, tmp) {
+                        cpus_and(tmp, tmp, cpu_active_map);
+                        for_each_cpu_mask_nr(i, tmp) {
                                if (idle_cpu(i)) {
                                        if (i != task_cpu(p)) {
                                                schedstat_inc(p,
@@ -976,7 +1048,7 @@ static int wake_idle(int cpu, struct task_struct *p)
        }
        return cpu;
 }
-#else
+#else /* !ARCH_HAS_SCHED_WAKE_IDLE */
 static inline int wake_idle(int cpu, struct task_struct *p)
 {
        return cpu;
@@ -987,6 +1059,89 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 static const struct sched_class fair_sched_class;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * effective_load() calculates the load change as seen from the root_task_group
+ *
+ * Adding load to a group doesn't make a group heavier, but can cause movement
+ * of group shares between cpus. Assuming the shares were perfectly aligned one
+ * can calculate the shift in shares.
+ *
+ * The problem is that perfectly aligning the shares is rather expensive, hence
+ * we try to avoid doing that too often - see update_shares(), which ratelimits
+ * this change.
+ *
+ * We compensate this by not only taking the current delta into account, but
+ * also considering the delta between when the shares were last adjusted and
+ * now.
+ *
+ * We still saw a performance dip; tracing showed us that between
+ * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
+ * significantly. Therefore try to bias the error in direction of failing
+ * the affine wakeup.
+ *
+ * NOTE(review): judging by the callers in wake_affine(), @wl looks like the
+ * weight change on @cpu and @wg the weight change of the group as a whole
+ * (the previous cpu is queried with wl == 0) - confirm.
+ *
+ * A group without a parent returns @wl unchanged.
+ */
+static long effective_load(struct task_group *tg, int cpu,
+                long wl, long wg)
+{
+        struct sched_entity *se = tg->se[cpu];
+        long more_w;
+        if (!tg->parent)
+                return wl;
+        /*
+         * By not taking the decrease of shares on the other cpu into
+         * account our error leans towards reducing the affine wakeups.
+         */
+        if (!wl && sched_feat(ASYM_EFF_LOAD))
+                return wl;
+        /*
+         * Instead of using this increment, also add the difference
+         * between when the shares were last updated and now.
+         */
+        more_w = se->my_q->load.weight - se->my_q->rq_weight;
+        wl += more_w;
+        wg += more_w;
+        for_each_sched_entity(se) {
+#define D(n) (likely(n) ? (n) : 1) /* never divide by zero below */
+                long S, rw, s, a, b;
+                S = se->my_q->tg->shares;
+                s = se->my_q->shares;
+                rw = se->my_q->rq_weight;
+                a = S*(rw + wl);
+                b = S*rw + s*wg;
+                wl = s*(a-b)/D(b);
+                /*
+                 * Assume the group is already running and will
+                 * thus already be accounted for in the weight.
+                 *
+                 * That is, moving shares between CPUs, does not
+                 * alter the group weight.
+                 */
+                wg = 0;
+#undef D
+        }
+        return wl;
+}
+#else /* !CONFIG_FAIR_GROUP_SCHED: the load change is passed through as-is */
+/* NOTE(review): this stub is unsigned long where the real function is long. */
+static inline unsigned long effective_load(struct task_group *tg, int cpu,
+                unsigned long wl, unsigned long wg)
+{
+        return wl;
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 static int
 wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
            struct task_struct *p, int prev_cpu, int this_cpu, int sync,
@@ -994,8 +1149,10 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
            unsigned int imbalance)
 {
        struct task_struct *curr = this_rq->curr;
+        struct task_group *tg;
        unsigned long tl = this_load;
        unsigned long tl_per_task;
+        unsigned long weight;
        int balanced;
        if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
@@ -1006,19 +1163,28 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
         * effect of the currently running task from the load
         * of the current CPU:
         */
-        if (sync)
+        if (sync) {
-                tl -= current->se.load.weight;
+                tg = task_group(current);
+                weight = current->se.load.weight;
+                tl += effective_load(tg, this_cpu, -weight, -weight);
+                load += effective_load(tg, prev_cpu, 0, -weight);
+        }
-        balanced = 100*(tl + p->se.load.weight) <= imbalance*load;
+        tg = task_group(p);
+        weight = p->se.load.weight;
+        balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+                imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
        /*
         * If the currently running task will sleep within
         * a reasonable amount of time then attract this newly
         * woken task:
         */
-        if (sync && balanced && curr->sched_class == &fair_sched_class) {
+        if (sync && balanced) {
                if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
-                                p->se.avg_overlap < sysctl_sched_migration_cost)
+                    p->se.avg_overlap < sysctl_sched_migration_cost)
                        return 1;
        }
@@ -1111,11 +1277,13 @@ static unsigned long wakeup_gran(struct sched_entity *se)
        unsigned long gran = sysctl_sched_wakeup_granularity;
        /*
-         * More easily preempt - nice tasks, while not making
+         * More easily preempt - nice tasks, while not making it harder for
-         * it harder for + nice tasks.
+         * + nice tasks.
         */
-        if (unlikely(se->load.weight > NICE_0_LOAD))
+        if (sched_feat(ASYM_GRAN))
-                gran = calc_delta_fair(gran, &se->load);
+                gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
+        else
+                gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
        return gran;
 }
@@ -1177,7 +1345,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
                return;
        }
-        se->last_wakeup = se->sum_exec_runtime;
        if (unlikely(se == pse))
                return;
@@ -1275,23 +1442,18 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
        struct task_struct *p = NULL;
        struct sched_entity *se;
-        if (next == &cfs_rq->tasks)
+        while (next != &cfs_rq->tasks) {
-                return NULL;
-        /* Skip over entities that are not tasks */
-        do {
                se = list_entry(next, struct sched_entity, group_node);
                next = next->next;
-        } while (next != &cfs_rq->tasks && !entity_is_task(se));
-        if (next == &cfs_rq->tasks)
+                /* Skip over entities that are not tasks */
-                return NULL;
+                if (entity_is_task(se)) {
+                        p = task_of(se);
+                        break;
+                }
+        }
        cfs_rq->balance_iterator = next;
-        if (entity_is_task(se))
-                p = task_of(se);
        return p;
 }
@@ -1309,75 +1471,82 @@ static struct task_struct *load_balance_next_fair(void *arg)
        return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
 }
-#ifdef CONFIG_FAIR_GROUP_SCHED
+static unsigned long
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                unsigned long max_load_move, struct sched_domain *sd,
+                enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+                struct cfs_rq *cfs_rq)
 {
-        struct sched_entity *curr;
+        struct rq_iterator cfs_rq_iterator;
-        struct task_struct *p;
-        if (!cfs_rq->nr_running || !first_fair(cfs_rq))
-                return MAX_PRIO;
-        curr = cfs_rq->curr;
-        if (!curr)
-                curr = __pick_next_entity(cfs_rq);
-        p = task_of(curr);
+        cfs_rq_iterator.start = load_balance_start_fair;
+        cfs_rq_iterator.next = load_balance_next_fair;
+        cfs_rq_iterator.arg = cfs_rq;
-        return p->prio;
+        return balance_tasks(this_rq, this_cpu, busiest,
+                        max_load_move, sd, idle, all_pinned,
+                        this_best_prio, &cfs_rq_iterator);
 }
-#endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  unsigned long max_load_move,
                  struct sched_domain *sd, enum cpu_idle_type idle,
                  int *all_pinned, int *this_best_prio)
 {
-        struct cfs_rq *busy_cfs_rq;
        long rem_load_move = max_load_move;
-        struct rq_iterator cfs_rq_iterator;
+        int busiest_cpu = cpu_of(busiest);
+        struct task_group *tg;
-        cfs_rq_iterator.start = load_balance_start_fair;
-        cfs_rq_iterator.next = load_balance_next_fair;
-        for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+        rcu_read_lock();
-#ifdef CONFIG_FAIR_GROUP_SCHED
+        update_h_load(busiest_cpu);
-                struct cfs_rq *this_cfs_rq;
-                long imbalance;
-                unsigned long maxload;
-                this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+        list_for_each_entry(tg, &task_groups, list) {
+                struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
+                unsigned long busiest_h_load = busiest_cfs_rq->h_load;
+                unsigned long busiest_weight = busiest_cfs_rq->load.weight;
+                u64 rem_load, moved_load;
-                imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
+                /*
-                /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
+                 * empty group
-                if (imbalance <= 0)
+                 */
+                if (!busiest_cfs_rq->task_weight)
                        continue;
-                /* Don't pull more than imbalance/2 */
+                rem_load = (u64)rem_load_move * busiest_weight;
-                imbalance /= 2;
+                rem_load = div_u64(rem_load, busiest_h_load + 1);
-                maxload = min(rem_load_move, imbalance);
-                *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+                moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
-#else
+                                rem_load, sd, idle, all_pinned, this_best_prio,
-# define maxload rem_load_move
+                                tg->cfs_rq[busiest_cpu]);
-#endif
-                /*
+                if (!moved_load)
-                 * pass busy_cfs_rq argument into
+                        continue;
-                 * load_balance_[start|next]_fair iterators
-                 */
+                moved_load *= busiest_h_load;
-                cfs_rq_iterator.arg = busy_cfs_rq;
+                moved_load = div_u64(moved_load, busiest_weight + 1);
-                rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
-                                               maxload, sd, idle, all_pinned,
-                                               this_best_prio,
-                                               &cfs_rq_iterator);
-                if (rem_load_move <= 0)
+                rem_load_move -= moved_load;
+                if (rem_load_move < 0)
                        break;
        }
+        rcu_read_unlock();
        return max_load_move - rem_load_move;
 }
+#else /* !CONFIG_FAIR_GROUP_SCHED: only busiest->cfs is balanced */
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                  unsigned long max_load_move,
+                  struct sched_domain *sd, enum cpu_idle_type idle,
+                  int *all_pinned, int *this_best_prio)
+{
+        return __load_balance_fair(this_rq, this_cpu, busiest,
+                        max_load_move, sd, idle, all_pinned,
+                        this_best_prio, &busiest->cfs);
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
@@ -1402,7 +1571,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
        return 0;
 }
-#endif
+#endif /* CONFIG_SMP */
 /*
 * scheduler tick hitting a task of our scheduling class:
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 1c7283cb9581..862b06bd560a 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,4 +1,5 @@
 SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
+SCHED_FEAT(NORMALIZED_SLEEPER, 1)
 SCHED_FEAT(WAKEUP_PREEMPT, 1)
 SCHED_FEAT(START_DEBIT, 1)
 SCHED_FEAT(AFFINE_WAKEUPS, 1)
@@ -6,5 +7,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1)
 SCHED_FEAT(SYNC_WAKEUPS, 1)
 SCHED_FEAT(HRTICK, 1)
 SCHED_FEAT(DOUBLE_TICK, 0)
-SCHED_FEAT(NORMALIZED_SLEEPER, 1)
+SCHED_FEAT(ASYM_GRAN, 1)
-SCHED_FEAT(DEADLINE, 1)
+SCHED_FEAT(LB_BIAS, 0)
+SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
+SCHED_FEAT(ASYM_EFF_LOAD, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 0f3c19197fa4..908c04f9dad0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -12,6 +12,9 @@ static inline int rt_overloaded(struct rq *rq)
 static inline void rt_set_overload(struct rq *rq)
 {
+        if (!rq->online)
+                return;
        cpu_set(rq->cpu, rq->rd->rto_mask);
        /*
         * Make sure the mask is visible before we set
@@ -26,6 +29,9 @@ static inline void rt_set_overload(struct rq *rq)
 static inline void rt_clear_overload(struct rq *rq)
 {
+        if (!rq->online)
+                return;
        /* the order here really doesn't matter */
        atomic_dec(&rq->rd->rto_count);
        cpu_clear(rq->cpu, rq->rd->rto_mask);
@@ -155,7 +161,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
        return &rt_rq->tg->rt_bandwidth;
 }
-#else
+#else /* !CONFIG_RT_GROUP_SCHED */
 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 {
@@ -220,7 +226,160 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
        return &def_rt_bandwidth;
 }
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_SMP
+static int do_balance_runtime(struct rt_rq *rt_rq)
+{
+        struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+        struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+        int i, weight, more = 0; /* more: set once any runtime was moved to rt_rq */
+        u64 rt_period;
+        weight = cpus_weight(rd->span); /* divisor applied to each donor's spare time */
+        spin_lock(&rt_b->rt_runtime_lock);
+        rt_period = ktime_to_ns(rt_b->rt_period);
+        for_each_cpu_mask_nr(i, rd->span) {
+                struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
+                s64 diff;
+                if (iter == rt_rq)
+                        continue; /* never take runtime from ourselves */
+                spin_lock(&iter->rt_runtime_lock);
+                if (iter->rt_runtime == RUNTIME_INF)
+                        goto next; /* leave RUNTIME_INF runqueues untouched */
+                diff = iter->rt_runtime - iter->rt_time; /* runtime iter has not used */
+                if (diff > 0) {
+                        diff = div_u64((u64)diff, weight);
+                        if (rt_rq->rt_runtime + diff > rt_period)
+                                diff = rt_period - rt_rq->rt_runtime; /* cap at rt_period */
+                        iter->rt_runtime -= diff;
+                        rt_rq->rt_runtime += diff;
+                        more = 1;
+                        if (rt_rq->rt_runtime == rt_period) {
+                                spin_unlock(&iter->rt_runtime_lock); /* break skips next: */
+                                break;
+                        }
+                }
+next:
+                spin_unlock(&iter->rt_runtime_lock);
+        }
+        spin_unlock(&rt_b->rt_runtime_lock);
+        return more;
+}
+/*
+ * For every leaf rt_rq of @rq: settle the difference between its current
+ * runtime and rt_b->rt_runtime against the other runqueues in the root
+ * domain span, then set its runtime to RUNTIME_INF.
+ *
+ * Does nothing while the scheduler is not yet running.  disable_runtime()
+ * calls this with rq->lock held and interrupts disabled.
+ */
+static void __disable_runtime(struct rq *rq)
+{
+        struct root_domain *rd = rq->rd;
+        struct rt_rq *rt_rq;
+        if (unlikely(!scheduler_running))
+                return;
+        for_each_leaf_rt_rq(rt_rq, rq) {
+                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+                s64 want;
+                int i;
+                spin_lock(&rt_b->rt_runtime_lock);
+                spin_lock(&rt_rq->rt_runtime_lock);
+                if (rt_rq->rt_runtime == RUNTIME_INF ||
+                                rt_rq->rt_runtime == rt_b->rt_runtime)
+                        goto balanced; /* nothing to settle for this rt_rq */
+                spin_unlock(&rt_rq->rt_runtime_lock);
+                /* positive: rt_rq holds less than rt_b's runtime; negative: more */
+                want = rt_b->rt_runtime - rt_rq->rt_runtime;
+                for_each_cpu_mask_nr(i, rd->span) {
+                        struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
+                        s64 diff;
+                        if (iter == rt_rq)
+                                continue;
+                        spin_lock(&iter->rt_runtime_lock);
+                        if (want > 0) {
+                                /* take back as much as iter has, up to want */
+                                diff = min_t(s64, iter->rt_runtime, want);
+                                iter->rt_runtime -= diff;
+                                want -= diff;
+                        } else {
+                                /* want <= 0: hand the whole surplus to iter */
+                                iter->rt_runtime -= want;
+                                want -= want;
+                        }
+                        spin_unlock(&iter->rt_runtime_lock);
+                        if (!want)
+                                break;
+                }
+                spin_lock(&rt_rq->rt_runtime_lock);
+                BUG_ON(want); /* the difference must have been settled in full */
+balanced:
+                rt_rq->rt_runtime = RUNTIME_INF;
+                spin_unlock(&rt_rq->rt_runtime_lock);
+                spin_unlock(&rt_b->rt_runtime_lock);
+        }
+}
+static void disable_runtime(struct rq *rq)
+{
+        unsigned long flags;
+        spin_lock_irqsave(&rq->lock, flags);
+        __disable_runtime(rq); /* runs under rq->lock with interrupts off */
+        spin_unlock_irqrestore(&rq->lock, flags);
+}
+static void __enable_runtime(struct rq *rq)
+{
+        struct rt_rq *rt_rq;
+        if (unlikely(!scheduler_running))
+                return; /* nothing to do before the scheduler is running */
+        for_each_leaf_rt_rq(rt_rq, rq) {
+                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+                spin_lock(&rt_b->rt_runtime_lock); /* rt_b lock first, then rt_rq lock */
+                spin_lock(&rt_rq->rt_runtime_lock);
+                rt_rq->rt_runtime = rt_b->rt_runtime; /* back to rt_b's runtime */
+                rt_rq->rt_time = 0; /* and discard the time consumed so far */
+                spin_unlock(&rt_rq->rt_runtime_lock);
+                spin_unlock(&rt_b->rt_runtime_lock);
+        }
+}
+static void enable_runtime(struct rq *rq)
+{
+        unsigned long flags;
+        spin_lock_irqsave(&rq->lock, flags);
+        __enable_runtime(rq); /* runs under rq->lock with interrupts off */
+        spin_unlock_irqrestore(&rq->lock, flags);
+}
+static int balance_runtime(struct rt_rq *rt_rq)
+{
+        int more = 0;
+        if (rt_rq->rt_time > rt_rq->rt_runtime) { /* only when over budget */
+                spin_unlock(&rt_rq->rt_runtime_lock); /* entered with this lock held */
+                more = do_balance_runtime(rt_rq); /* takes rt_b->rt_runtime_lock itself */
+                spin_lock(&rt_rq->rt_runtime_lock); /* re-taken before returning */
+        }
+        return more;
+}
+#else /* !CONFIG_SMP */
+static inline int balance_runtime(struct rt_rq *rt_rq)
+{
+        return 0; /* !CONFIG_SMP: never reports that runtime was moved */
+}
+#endif /* CONFIG_SMP */
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 {
@@ -241,6 +400,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
                        u64 runtime;
                        spin_lock(&rt_rq->rt_runtime_lock);
+                        if (rt_rq->rt_throttled)
+                                balance_runtime(rt_rq);
                        runtime = rt_rq->rt_runtime;
                        rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
                        if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
@@ -261,47 +422,6 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
        return idle;
 }
-#ifdef CONFIG_SMP
-static int balance_runtime(struct rt_rq *rt_rq)
-{
-        struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
-        struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
-        int i, weight, more = 0;
-        u64 rt_period;
-        weight = cpus_weight(rd->span);
-        spin_lock(&rt_b->rt_runtime_lock);
-        rt_period = ktime_to_ns(rt_b->rt_period);
-        for_each_cpu_mask(i, rd->span) {
-                struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
-                s64 diff;
-                if (iter == rt_rq)
-                        continue;
-                spin_lock(&iter->rt_runtime_lock);
-                diff = iter->rt_runtime - iter->rt_time;
-                if (diff > 0) {
-                        do_div(diff, weight);
-                        if (rt_rq->rt_runtime + diff > rt_period)
-                                diff = rt_period - rt_rq->rt_runtime;
-                        iter->rt_runtime -= diff;
-                        rt_rq->rt_runtime += diff;
-                        more = 1;
-                        if (rt_rq->rt_runtime == rt_period) {
-                                spin_unlock(&iter->rt_runtime_lock);
-                                break;
-                        }
-                }
-                spin_unlock(&iter->rt_runtime_lock);
-        }
-        spin_unlock(&rt_b->rt_runtime_lock);
-        return more;
-}
-#endif
 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 {
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -327,18 +447,10 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
        if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
                return 0;
-#ifdef CONFIG_SMP
+        balance_runtime(rt_rq);
-        if (rt_rq->rt_time > runtime) {
+        runtime = sched_rt_runtime(rt_rq);
-                int more;
+        if (runtime == RUNTIME_INF)
+                return 0;
-                spin_unlock(&rt_rq->rt_runtime_lock);
-                more = balance_runtime(rt_rq);
-                spin_lock(&rt_rq->rt_runtime_lock);
-                if (more)
-                        runtime = sched_rt_runtime(rt_rq);
-        }
-#endif
        if (rt_rq->rt_time > runtime) {
                rt_rq->rt_throttled = 1;
@@ -392,12 +504,23 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
        rt_rq->rt_nr_running++;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-        if (rt_se_prio(rt_se) < rt_rq->highest_prio)
+        if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
+#ifdef CONFIG_SMP
+                struct rq *rq = rq_of_rt_rq(rt_rq);
+#endif
                rt_rq->highest_prio = rt_se_prio(rt_se);
+#ifdef CONFIG_SMP
+                if (rq->online)
+                        cpupri_set(&rq->rd->cpupri, rq->cpu,
+                                   rt_se_prio(rt_se));
+#endif
+        }
 #endif
 #ifdef CONFIG_SMP
        if (rt_se->nr_cpus_allowed > 1) {
                struct rq *rq = rq_of_rt_rq(rt_rq);
                rq->rt.rt_nr_migratory++;
        }
@@ -417,6 +540,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 static inline
 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
+#ifdef CONFIG_SMP
+        int highest_prio = rt_rq->highest_prio;
+#endif
        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
        WARN_ON(!rt_rq->rt_nr_running);
        rt_rq->rt_nr_running--;
@@ -440,6 +567,14 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
                rq->rt.rt_nr_migratory--;
        }
+        if (rt_rq->highest_prio != highest_prio) {
+                struct rq *rq = rq_of_rt_rq(rt_rq);
+                if (rq->online)
+                        cpupri_set(&rq->rd->cpupri, rq->cpu,
+                                   rt_rq->highest_prio);
+        }
        update_rt_migration(rq_of_rt_rq(rt_rq));
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -455,6 +590,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
        struct rt_prio_array *array = &rt_rq->active;
        struct rt_rq *group_rq = group_rt_rq(rt_se);
+        struct list_head *queue = array->queue + rt_se_prio(rt_se);
        /*
         * Don't enqueue the group if its throttled, or when empty.
@@ -465,7 +601,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
        if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
                return;
-        list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
+        list_add_tail(&rt_se->run_list, queue);
        __set_bit(rt_se_prio(rt_se), array->bitmap);
        inc_rt_tasks(rt_se, rt_rq);
@@ -532,6 +668,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
                rt_se->timeout = 0;
        enqueue_rt_entity(rt_se);
+        inc_cpu_load(rq, p->se.load.weight);
 }
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -540,36 +678,42 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
        update_curr_rt(rq);
        dequeue_rt_entity(rt_se);
+        dec_cpu_load(rq, p->se.load.weight);
 }
 /*
 * Put task to the end of the run list without the overhead of dequeue
 * followed by enqueue.
 */
-static
+static void
-void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
 {
-        struct rt_prio_array *array = &rt_rq->active;
+        if (on_rt_rq(rt_se)) {
-        struct list_head *queue = array->queue + rt_se_prio(rt_se);
+                struct rt_prio_array *array = &rt_rq->active;
+                struct list_head *queue = array->queue + rt_se_prio(rt_se);
-        if (on_rt_rq(rt_se))
+                if (head)
-                list_move_tail(&rt_se->run_list, queue);
+                        list_move(&rt_se->run_list, queue);
+                else
+                        list_move_tail(&rt_se->run_list, queue);
+        }
 }
-static void requeue_task_rt(struct rq *rq, struct task_struct *p)
+static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
 {
        struct sched_rt_entity *rt_se = &p->rt;
        struct rt_rq *rt_rq;
        for_each_sched_rt_entity(rt_se) {
                rt_rq = rt_rq_of_se(rt_se);
-                requeue_rt_entity(rt_rq, rt_se);
+                requeue_rt_entity(rt_rq, rt_se, head);
        }
 }
 static void yield_task_rt(struct rq *rq)
 {
-        requeue_task_rt(rq, rq->curr);
+        requeue_task_rt(rq, rq->curr, 0);
 }
 #ifdef CONFIG_SMP
@@ -609,6 +753,30 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
         */
        return task_cpu(p);
 }
+static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
+{
+        cpumask_t mask;
+        if (rq->curr->rt.nr_cpus_allowed == 1)
+                return;
+        if (p->rt.nr_cpus_allowed != 1
+            && cpupri_find(&rq->rd->cpupri, p, &mask))
+                return;
+        if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
+                return;
+        /*
+         * There appears to be other cpus that can accept
+         * current and none to run 'p', so let's reschedule
+         * to try and push current away:
+         */
+        requeue_task_rt(rq, p, 1);
+        resched_task(rq->curr);
+}
 #endif /* CONFIG_SMP */
 /*
@@ -616,8 +784,27 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
 */
 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
 {
-        if (p->prio < rq->curr->prio)
+        if (p->prio < rq->curr->prio) {
                resched_task(rq->curr);
+                return;
+        }
+#ifdef CONFIG_SMP
+        /*
+         * If:
+         *
+         * - the newly woken task is of equal priority to the current task
+         * - the newly woken task is non-migratable while current is migratable
+         * - current will be preempted on the next reschedule
+         *
+         * we should check to see if current can readily move to a different
+         * cpu.  If so, we will reschedule to allow the push logic to try
+         * to move current somewhere else, making room for our non-migratable
+         * task.
+         */
+        if (p->prio == rq->curr->prio && !need_resched())
+                check_preempt_equal_prio(rq, p);
+#endif
 }
 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
@@ -720,73 +907,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
-static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
-{
-        int       lowest_prio = -1;
-        int       lowest_cpu  = -1;
-        int       count       = 0;
-        int       cpu;
-        cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
-        /*
-         * Scan each rq for the lowest prio.
-         */
-        for_each_cpu_mask(cpu, *lowest_mask) {
-                struct rq *rq = cpu_rq(cpu);
-                /* We look for lowest RT prio or non-rt CPU */
-                if (rq->rt.highest_prio >= MAX_RT_PRIO) {
-                        /*
-                         * if we already found a low RT queue
-                         * and now we found this non-rt queue
-                         * clear the mask and set our bit.
-                         * Otherwise just return the queue as is
-                         * and the count==1 will cause the algorithm
-                         * to use the first bit found.
-                         */
-                        if (lowest_cpu != -1) {
-                                cpus_clear(*lowest_mask);
-                                cpu_set(rq->cpu, *lowest_mask);
-                        }
-                        return 1;
-                }
-                /* no locking for now */
-                if ((rq->rt.highest_prio > task->prio)
-                    && (rq->rt.highest_prio >= lowest_prio)) {
-                        if (rq->rt.highest_prio > lowest_prio) {
-                                /* new low - clear old data */
-                                lowest_prio = rq->rt.highest_prio;
-                                lowest_cpu = cpu;
-                                count = 0;
-                        }
-                        count++;
-                } else
-                        cpu_clear(cpu, *lowest_mask);
-        }
-        /*
-         * Clear out all the set bits that represent
-         * runqueues that were of higher prio than
-         * the lowest_prio.
-         */
-        if (lowest_cpu > 0) {
-                /*
-                 * Perhaps we could add another cpumask op to
-                 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
-                 * Then that could be optimized to use memset and such.
-                 */
-                for_each_cpu_mask(cpu, *lowest_mask) {
-                        if (cpu >= lowest_cpu)
-                                break;
-                        cpu_clear(cpu, *lowest_mask);
-                }
-        }
-        return count;
-}
 static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
 {
        int first;
@@ -808,17 +928,19 @@ static int find_lowest_rq(struct task_struct *task)
        cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
        int this_cpu = smp_processor_id();
        int cpu      = task_cpu(task);
-        int count    = find_lowest_cpus(task, lowest_mask);
-        if (!count)
+        if (task->rt.nr_cpus_allowed == 1)
+                return -1; /* No other targets possible */
+        if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
                return -1; /* No targets found */
        /*
-         * There is no sense in performing an optimal search if only one
+         * Only consider CPUs that are usable for migration.
-         * target is found.
+         * I guess we might want to change cpupri_find() to ignore those
+         * in the first place.
         */
-        if (count == 1)
+        cpus_and(*lowest_mask, *lowest_mask, cpu_active_map);
-                return first_cpu(*lowest_mask);
        /*
         * At this point we have built a mask of cpus representing the
@@ -1006,7 +1128,7 @@ static int pull_rt_task(struct rq *this_rq)
        next = pick_next_task_rt(this_rq);
-        for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
+        for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) {
                if (this_cpu == cpu)
                        continue;
@@ -1163,17 +1285,25 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 }
 /* Assumes rq->lock is held */
-static void join_domain_rt(struct rq *rq)
+static void rq_online_rt(struct rq *rq)
 {
        if (rq->rt.overloaded)
                rt_set_overload(rq);
+        __enable_runtime(rq);
+        cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
 }
 /* Assumes rq->lock is held */
-static void leave_domain_rt(struct rq *rq)
+static void rq_offline_rt(struct rq *rq)
 {
        if (rq->rt.overloaded)
                rt_clear_overload(rq);
+        __disable_runtime(rq);
+        cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
 }
 /*
@@ -1306,7 +1436,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
         * on the queue:
         */
        if (p->rt.run_list.prev != p->rt.run_list.next) {
-                requeue_task_rt(rq, p);
+                requeue_task_rt(rq, p, 0);
                set_tsk_need_resched(p);
        }
 }
@@ -1336,8 +1466,8 @@ static const struct sched_class rt_sched_class = {
        .load_balance           = load_balance_rt,
        .move_one_task          = move_one_task_rt,
        .set_cpus_allowed       = set_cpus_allowed_rt,
-        .join_domain            = join_domain_rt,
+        .rq_online              = rq_online_rt,
-        .leave_domain           = leave_domain_rt,
+        .rq_offline             = rq_offline_rt,
        .pre_schedule           = pre_schedule_rt,
        .post_schedule          = post_schedule_rt,
        .task_wake_up           = task_wake_up_rt,
@@ -1350,3 +1480,17 @@ static const struct sched_class rt_sched_class = {
        .prio_changed           = prio_changed_rt,
        .switched_to            = switched_to_rt,
 };
+#ifdef CONFIG_SCHED_DEBUG
+extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+static void print_rt_stats(struct seq_file *m, int cpu)
+{
+        struct rt_rq *rt_rq;
+        rcu_read_lock();
+        for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu))
+                print_rt_rq(m, cpu, rt_rq);
+        rcu_read_unlock();
+}
+#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 80179ef7450e..8385d43987e2 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -118,6 +118,13 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
        if (rq)
                rq->rq_sched_info.cpu_time += delta;
 }
+static inline void
+rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+{
+        if (rq)
+                rq->rq_sched_info.run_delay += delta;
+}
 # define schedstat_inc(rq, field)       do { (rq)->field++; } while (0)
 # define schedstat_add(rq, field, amt)  do { (rq)->field += (amt); } while (0)
 # define schedstat_set(var, val)        do { var = (val); } while (0)
@@ -126,6 +133,9 @@ static inline void
 rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
 {}
 static inline void
+rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+{}
+static inline void
 rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 {}
 # define schedstat_inc(rq, field)       do { } while (0)
@@ -134,6 +144,11 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 #endif
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+static inline void sched_info_reset_dequeued(struct task_struct *t)
+{
+        t->sched_info.last_queued = 0;
+}
 /*
 * Called when a process is dequeued from the active array and given
 * the cpu.  We should note that with the exception of interactive
@@ -143,15 +158,22 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 * active queue, thus delaying tasks in the expired queue from running;
 * see scheduler_tick()).
 *
- * This function is only called from sched_info_arrive(), rather than
+ * Though we are interested in knowing how long it was from the *first* time a
- * dequeue_task(). Even though a task may be queued and dequeued multiple
+ * task was queued to the time that it finally hit a cpu, we call this routine
- * times as it is shuffled about, we're really interested in knowing how
+ * from dequeue_task() to account for possible rq->clock skew across cpus. The
- * long it was from the *first* time it was queued to the time that it
+ * delta taken on each cpu would annul the skew.
- * finally hit a cpu.
 */
 static inline void sched_info_dequeued(struct task_struct *t)
 {
-        t->sched_info.last_queued = 0;
+        unsigned long long now = task_rq(t)->clock, delta = 0;
+        if (unlikely(sched_info_on()))
+                if (t->sched_info.last_queued)
+                        delta = now - t->sched_info.last_queued;
+        sched_info_reset_dequeued(t);
+        t->sched_info.run_delay += delta;
+        rq_sched_info_dequeued(task_rq(t), delta);
 }
 /*
@@ -165,7 +187,7 @@ static void sched_info_arrive(struct task_struct *t)
        if (t->sched_info.last_queued)
                delta = now - t->sched_info.last_queued;
-        sched_info_dequeued(t);
+        sched_info_reset_dequeued(t);
        t->sched_info.run_delay += delta;
        t->sched_info.last_arrival = now;
        t->sched_info.pcount++;
@@ -242,7 +264,9 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
                __sched_info_switch(prev, next);
 }
 #else
-#define sched_info_queued(t)            do { } while (0)
+#define sched_info_queued(t)                    do { } while (0)
-#define sched_info_switch(t, next)      do { } while (0)
+#define sched_info_reset_dequeued(t)    do { } while (0)
+#define sched_info_dequeued(t)                  do { } while (0)
+#define sched_info_switch(t, next)              do { } while (0)
 #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 5c2942e768cd..aaaeae8244e7 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -31,6 +31,7 @@
 #include <linux/sched.h>
 #include <linux/semaphore.h>
 #include <linux/spinlock.h>
+#include <linux/ftrace.h>
 static noinline void __down(struct semaphore *sem);
 static noinline int __down_interruptible(struct semaphore *sem);
diff --git a/kernel/signal.c b/kernel/signal.c
index 6c0958e52ea7..954f77d7e3bc 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,6 +22,7 @@
 #include <linux/ptrace.h>
 #include <linux/signal.h>
 #include <linux/signalfd.h>
+#include <linux/tracehook.h>
 #include <linux/capability.h>
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
@@ -39,24 +40,21 @@
 static struct kmem_cache *sigqueue_cachep;
-static int __sig_ignored(struct task_struct *t, int sig)
+static void __user *sig_handler(struct task_struct *t, int sig)
 {
-        void __user *handler;
+        return t->sighand->action[sig - 1].sa.sa_handler;
+}
+static int sig_handler_ignored(void __user *handler, int sig)
+{
        /* Is it explicitly or implicitly ignored? */
-        handler = t->sighand->action[sig - 1].sa.sa_handler;
        return handler == SIG_IGN ||
                (handler == SIG_DFL && sig_kernel_ignore(sig));
 }
 static int sig_ignored(struct task_struct *t, int sig)
 {
-        /*
+        void __user *handler;
-         * Tracers always want to know about signals..
-         */
-        if (t->ptrace & PT_PTRACED)
-                return 0;
        /*
         * Blocked signals are never ignored, since the
@@ -66,7 +64,14 @@ static int sig_ignored(struct task_struct *t, int sig)
        if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
                return 0;
-        return __sig_ignored(t, sig);
+        handler = sig_handler(t, sig);
+        if (!sig_handler_ignored(handler, sig))
+                return 0;
+        /*
+         * Tracers may want to know about even ignored signals.
+         */
+        return !tracehook_consider_ignored_signal(t, sig, handler);
 }
 /*
@@ -129,7 +134,9 @@ void recalc_sigpending_and_wake(struct task_struct *t)
 void recalc_sigpending(void)
 {
-        if (!recalc_sigpending_tsk(current) && !freezing(current))
+        if (unlikely(tracehook_force_sigpending()))
+                set_thread_flag(TIF_SIGPENDING);
+        else if (!recalc_sigpending_tsk(current) && !freezing(current))
                clear_thread_flag(TIF_SIGPENDING);
 }
@@ -295,12 +302,12 @@ flush_signal_handlers(struct task_struct *t, int force_default)
 int unhandled_signal(struct task_struct *tsk, int sig)
 {
+        void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler;
        if (is_global_init(tsk))
                return 1;
-        if (tsk->ptrace & PT_PTRACED)
+        if (handler != SIG_IGN && handler != SIG_DFL)
                return 0;
-        return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
+        return !tracehook_consider_fatal_signal(tsk, sig, handler);
-                (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
 }
@@ -338,13 +345,9 @@ unblock_all_signals(void)
        spin_unlock_irqrestore(&current->sighand->siglock, flags);
 }
-static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
+static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
 {
        struct sigqueue *q, *first = NULL;
-        int still_pending = 0;
-        if (unlikely(!sigismember(&list->signal, sig)))
-                return 0;
        /*
         * Collect the siginfo appropriate to this signal.  Check if
@@ -352,33 +355,30 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
        */
        list_for_each_entry(q, &list->list, list) {
                if (q->info.si_signo == sig) {
-                        if (first) {
+                        if (first)
-                                still_pending = 1;
+                                goto still_pending;
-                                break;
-                        }
                        first = q;
                }
        }
+        sigdelset(&list->signal, sig);
        if (first) {
+still_pending:
                list_del_init(&first->list);
                copy_siginfo(info, &first->info);
                __sigqueue_free(first);
-                if (!still_pending)
-                        sigdelset(&list->signal, sig);
        } else {
                /* Ok, it wasn't in the queue.  This must be
                   a fast-pathed signal or we must have been
                   out of queue space.  So zero out the info.
                 */
-                sigdelset(&list->signal, sig);
                info->si_signo = sig;
                info->si_errno = 0;
                info->si_code = 0;
                info->si_pid = 0;
                info->si_uid = 0;
        }
-        return 1;
 }
 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
@@ -396,8 +396,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
                        }
                }
-                if (!collect_signal(sig, pending, info))
+                collect_signal(sig, pending, info);
-                        sig = 0;
        }
        return sig;
@@ -462,8 +461,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
                 * is to alert stop-signal processing code when another
                 * processor has come along and cleared the flag.
                 */
-                if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
+                tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
-                        tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
        }
        if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
                /*
@@ -600,9 +598,6 @@ static int check_kill_permission(int sig, struct siginfo *info,
        return security_task_kill(t, info, sig, 0);
 }
-/* forward decl */
-static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
 /*
 * Handle magic process-wide effects of stop/continue signals. Unlike
 * the signal actions, these happen immediately at signal-generation
@@ -765,7 +760,8 @@ static void complete_signal(int sig, struct task_struct *p, int group)
        if (sig_fatal(p, sig) &&
            !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
            !sigismember(&t->real_blocked, sig) &&
-            (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) {
+            (sig == SIGKILL ||
+             !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) {
                /*
                 * This signal will be fatal to the whole group.
                 */
@@ -1125,7 +1121,7 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
 * is probably wrong.  Should make it like BSD or SYSV.
 */
-static int kill_something_info(int sig, struct siginfo *info, int pid)
+static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
 {
        int ret;
@@ -1237,17 +1233,6 @@ int kill_pid(struct pid *pid, int sig, int priv)
 }
 EXPORT_SYMBOL(kill_pid);
-int
-kill_proc(pid_t pid, int sig, int priv)
-{
-        int ret;
-        rcu_read_lock();
-        ret = kill_pid_info(sig, __si_special(priv), find_pid(pid));
-        rcu_read_unlock();
-        return ret;
-}
 /*
 * These functions support sending signals using preallocated sigqueue
 * structures.  This is needed "because realtime applications cannot
@@ -1343,9 +1328,11 @@ static inline void __wake_up_parent(struct task_struct *p,
 /*
 * Let a parent know about the death of a child.
 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
+ *
+ * Returns -1 if our parent ignored us and so we've switched to
+ * self-reaping, or else @sig.
 */
+int do_notify_parent(struct task_struct *tsk, int sig)
-void do_notify_parent(struct task_struct *tsk, int sig)
 {
        struct siginfo info;
        unsigned long flags;
@@ -1379,10 +1366,9 @@ void do_notify_parent(struct task_struct *tsk, int sig)
        info.si_uid = tsk->uid;
-        /* FIXME: find out whether or not this is supposed to be c*time. */
+        info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
-        info.si_utime = cputime_to_jiffies(cputime_add(tsk->utime,
                                                       tsk->signal->utime));
-        info.si_stime = cputime_to_jiffies(cputime_add(tsk->stime,
+        info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
                                                       tsk->signal->stime));
        info.si_status = tsk->exit_code & 0x7f;
@@ -1417,12 +1403,14 @@ void do_notify_parent(struct task_struct *tsk, int sig)
                 */
                tsk->exit_signal = -1;
                if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
-                        sig = 0;
+                        sig = -1;
        }
        if (valid_signal(sig) && sig > 0)
                __group_send_sig_info(sig, &info, tsk->parent);
        __wake_up_parent(tsk, tsk->parent);
        spin_unlock_irqrestore(&psig->siglock, flags);
+        return sig;
 }
 static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
@@ -1450,9 +1438,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
        info.si_uid = tsk->uid;
-        /* FIXME: find out whether or not this is supposed to be c*time. */
+        info.si_utime = cputime_to_clock_t(tsk->utime);
-        info.si_utime = cputime_to_jiffies(tsk->utime);
+        info.si_stime = cputime_to_clock_t(tsk->stime);
-        info.si_stime = cputime_to_jiffies(tsk->stime);
        info.si_code = why;
        switch (why) {
@@ -1491,10 +1478,10 @@ static inline int may_ptrace_stop(void)
         * is a deadlock situation, and pointless because our tracer
         * is dead so don't allow us to stop.
         * If SIGKILL was already sent before the caller unlocked
-         * ->siglock we must see ->core_waiters != 0. Otherwise it
+         * ->siglock we must see ->core_state != NULL. Otherwise it
         * is safe to enter schedule().
         */
-        if (unlikely(current->mm->core_waiters) &&
+        if (unlikely(current->mm->core_state) &&
            unlikely(current->mm == current->parent->mm))
                return 0;
@@ -1507,9 +1494,8 @@ static inline int may_ptrace_stop(void)
 */
 static int sigkill_pending(struct task_struct *tsk)
 {
-        return ((sigismember(&tsk->pending.signal, SIGKILL) ||
+        return  sigismember(&tsk->pending.signal, SIGKILL) ||
-                 sigismember(&tsk->signal->shared_pending.signal, SIGKILL)) &&
+                sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
-                !unlikely(sigismember(&tsk->blocked, SIGKILL)));
 }
 /*
@@ -1525,8 +1511,6 @@ static int sigkill_pending(struct task_struct *tsk)
 */
 static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
 {
-        int killed = 0;
        if (arch_ptrace_stop_needed(exit_code, info)) {
                /*
                 * The arch code has something special to do before a
@@ -1542,7 +1526,8 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
                spin_unlock_irq(&current->sighand->siglock);
                arch_ptrace_stop(exit_code, info);
                spin_lock_irq(&current->sighand->siglock);
-                killed = sigkill_pending(current);
+                if (sigkill_pending(current))
+                        return;
        }
        /*
@@ -1559,7 +1544,7 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
        __set_current_state(TASK_TRACED);
        spin_unlock_irq(&current->sighand->siglock);
        read_lock(&tasklist_lock);
-        if (!unlikely(killed) && may_ptrace_stop()) {
+        if (may_ptrace_stop()) {
                do_notify_parent_cldstop(current, CLD_TRAPPED);
                read_unlock(&tasklist_lock);
                schedule();
@@ -1623,7 +1608,7 @@ finish_stop(int stop_count)
         * a group stop in progress and we are the last to stop,
         * report to the parent.  When ptraced, every thread reports itself.
         */
-        if (stop_count == 0 || (current->ptrace & PT_PTRACED)) {
+        if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
                read_lock(&tasklist_lock);
                do_notify_parent_cldstop(current, CLD_STOPPED);
                read_unlock(&tasklist_lock);
@@ -1658,8 +1643,7 @@ static int do_signal_stop(int signr)
        } else {
                struct task_struct *t;
-                if (unlikely((sig->flags & (SIGNAL_STOP_DEQUEUED | SIGNAL_UNKILLABLE))
+                if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
-                                         != SIGNAL_STOP_DEQUEUED) ||
                    unlikely(signal_group_exit(sig)))
                        return 0;
                /*
@@ -1760,6 +1744,9 @@ relock:
                signal->flags &= ~SIGNAL_CLD_MASK;
                spin_unlock_irq(&sighand->siglock);
+                if (unlikely(!tracehook_notify_jctl(1, why)))
+                        goto relock;
                read_lock(&tasklist_lock);
                do_notify_parent_cldstop(current->group_leader, why);
                read_unlock(&tasklist_lock);
@@ -1773,17 +1760,33 @@ relock:
                    do_signal_stop(0))
                        goto relock;
-                signr = dequeue_signal(current, &current->blocked, info);
+                /*
-                if (!signr)
+                 * Tracing can induce an artificial signal and choose sigaction.
-                        break; /* will return 0 */
+                 * The return value in @signr determines the default action,
+                 * but @info->si_signo is the signal number we will report.
+                 */
+                signr = tracehook_get_signal(current, regs, info, return_ka);
+                if (unlikely(signr < 0))
+                        goto relock;
+                if (unlikely(signr != 0))
+                        ka = return_ka;
+                else {
+                        signr = dequeue_signal(current, &current->blocked,
+                                               info);
-                if (signr != SIGKILL) {
-                        signr = ptrace_signal(signr, info, regs, cookie);
                        if (!signr)
-                                continue;
+                                break; /* will return 0 */
+                        if (signr != SIGKILL) {
+                                signr = ptrace_signal(signr, info,
+                                                      regs, cookie);
+                                if (!signr)
+                                        continue;
+                        }
+                        ka = &sighand->action[signr-1];
                }
-                ka = &sighand->action[signr-1];
                if (ka->sa.sa_handler == SIG_IGN) /* Do nothing.  */
                        continue;
                if (ka->sa.sa_handler != SIG_DFL) {
@@ -1831,7 +1834,7 @@ relock:
                                spin_lock_irq(&sighand->siglock);
                        }
-                        if (likely(do_signal_stop(signr))) {
+                        if (likely(do_signal_stop(info->si_signo))) {
                                /* It released the siglock.  */
                                goto relock;
                        }
@@ -1852,7 +1855,7 @@ relock:
                if (sig_kernel_coredump(signr)) {
                        if (print_fatal_signals)
-                                print_fatal_signal(regs, signr);
+                                print_fatal_signal(regs, info->si_signo);
                        /*
                         * If it was able to dump core, this kills all
                         * other threads in the group and synchronizes with
@@ -1861,13 +1864,13 @@ relock:
                         * first and our do_group_exit call below will use
                         * that value and ignore the one we pass it.
                         */
-                        do_coredump((long)signr, signr, regs);
+                        do_coredump(info->si_signo, info->si_signo, regs);
                }
                /*
                 * Death signals, no core dump.
                 */
-                do_group_exit(signr);
+                do_group_exit(info->si_signo);
                /* NOTREACHED */
        }
        spin_unlock_irq(&sighand->siglock);
@@ -1909,7 +1912,7 @@ void exit_signals(struct task_struct *tsk)
 out:
        spin_unlock_irq(&tsk->sighand->siglock);
-        if (unlikely(group_stop)) {
+        if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) {
                read_lock(&tasklist_lock);
                do_notify_parent_cldstop(tsk, CLD_STOPPED);
                read_unlock(&tasklist_lock);
@@ -1920,8 +1923,6 @@ EXPORT_SYMBOL(recalc_sigpending);
 EXPORT_SYMBOL_GPL(dequeue_signal);
 EXPORT_SYMBOL(flush_signals);
 EXPORT_SYMBOL(force_sig);
-EXPORT_SYMBOL(kill_proc);
-EXPORT_SYMBOL(ptrace_notify);
 EXPORT_SYMBOL(send_sig);
 EXPORT_SYMBOL(send_sig_info);
 EXPORT_SYMBOL(sigprocmask);
@@ -2196,7 +2197,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
 }
 asmlinkage long
-sys_kill(int pid, int sig)
+sys_kill(pid_t pid, int sig)
 {
        struct siginfo info;
@@ -2209,7 +2210,7 @@ sys_kill(int pid, int sig)
        return kill_something_info(sig, &info, pid);
 }
-static int do_tkill(int tgid, int pid, int sig)
+static int do_tkill(pid_t tgid, pid_t pid, int sig)
 {
        int error;
        struct siginfo info;
@@ -2255,7 +2256,7 @@ static int do_tkill(int tgid, int pid, int sig)
 *  exists but it's not belonging to the target process anymore. This
 *  method solves the problem of threads exiting and PIDs getting reused.
 */
-asmlinkage long sys_tgkill(int tgid, int pid, int sig)
+asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig)
 {
        /* This is only valid for single tasks */
        if (pid <= 0 || tgid <= 0)
@@ -2268,7 +2269,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig)
 *  Send a signal to only one task, even if it's a CLONE_THREAD task.
 */
 asmlinkage long
-sys_tkill(int pid, int sig)
+sys_tkill(pid_t pid, int sig)
 {
        /* This is only valid for single tasks */
        if (pid <= 0)
@@ -2278,7 +2279,7 @@ sys_tkill(int pid, int sig)
 }
 asmlinkage long
-sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
+sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo)
 {
        siginfo_t info;
@@ -2325,7 +2326,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
                 *   (for example, SIGCHLD), shall cause the pending signal to
                 *   be discarded, whether or not it is blocked"
                 */
-                if (__sig_ignored(t, sig)) {
+                if (sig_handler_ignored(sig_handler(t, sig), sig)) {
                        sigemptyset(&mask);
                        sigaddset(&mask, sig);
                        rm_from_queue_full(&mask, &t->signal->shared_pending);
diff --git a/kernel/smp.c b/kernel/smp.c
new file mode 100644
index 000000000000..96fc7c0edc59
--- /dev/null
+++ b/kernel/smp.c
@@ -0,0 +1,385 @@
+/*
+ * Generic helpers for smp ipi calls
+ *
+ * (C) Jens Axboe <jens.axboe@oracle.com> 2008
+ *
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist.h>
+#include <linux/smp.h>
+static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
+static LIST_HEAD(call_function_queue);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock);
+enum {
+        CSD_FLAG_WAIT           = 0x01,
+        CSD_FLAG_ALLOC          = 0x02,
+};
+struct call_function_data {
+        struct call_single_data csd;
+        spinlock_t lock;
+        unsigned int refs;
+        cpumask_t cpumask;
+        struct rcu_head rcu_head;
+};
+struct call_single_queue {
+        struct list_head list;
+        spinlock_t lock;
+};
+static int __cpuinit init_call_single_data(void)
+{
+        int i;
+        for_each_possible_cpu(i) {
+                struct call_single_queue *q = &per_cpu(call_single_queue, i);
+                spin_lock_init(&q->lock);
+                INIT_LIST_HEAD(&q->list);
+        }
+        return 0;
+}
+early_initcall(init_call_single_data);
+static void csd_flag_wait(struct call_single_data *data)
+{
+        /* Wait for response */
+        do {
+                /*
+                 * We need to see the flags store in the IPI handler
+                 */
+                smp_mb();
+                if (!(data->flags & CSD_FLAG_WAIT))
+                        break;
+                cpu_relax();
+        } while (1);
+}
+/*
+ * Insert a previously allocated call_single_data element for execution
+ * on the given CPU. data must already have ->func, ->info, and ->flags set.
+ */
+static void generic_exec_single(int cpu, struct call_single_data *data)
+{
+        struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
+        int wait = data->flags & CSD_FLAG_WAIT, ipi;
+        unsigned long flags;
+        spin_lock_irqsave(&dst->lock, flags);
+        ipi = list_empty(&dst->list);
+        list_add_tail(&data->list, &dst->list);
+        spin_unlock_irqrestore(&dst->lock, flags);
+        if (ipi)
+                arch_send_call_function_single_ipi(cpu);
+        if (wait)
+                csd_flag_wait(data);
+}
+static void rcu_free_call_data(struct rcu_head *head)
+{
+        struct call_function_data *data;
+        data = container_of(head, struct call_function_data, rcu_head);
+        kfree(data);
+}
+/*
+ * Invoked by arch to handle an IPI for call function. Must be called with
+ * interrupts disabled.
+ */
+void generic_smp_call_function_interrupt(void)
+{
+        struct call_function_data *data;
+        int cpu = get_cpu();
+        /*
+         * It's ok to use list_for_each_entry_rcu() here even though we may
+         * delete 'data', since list_del_rcu() doesn't clear ->next
+         */
+        rcu_read_lock();
+        list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
+                int refs;
+                if (!cpu_isset(cpu, data->cpumask))
+                        continue;
+                data->csd.func(data->csd.info);
+                spin_lock(&data->lock);
+                cpu_clear(cpu, data->cpumask);
+                WARN_ON(data->refs == 0);
+                data->refs--;
+                refs = data->refs;
+                spin_unlock(&data->lock);
+                if (refs)
+                        continue;
+                spin_lock(&call_function_lock);
+                list_del_rcu(&data->csd.list);
+                spin_unlock(&call_function_lock);
+                if (data->csd.flags & CSD_FLAG_WAIT) {
+                        /*
+                         * serialize stores to data with the flag clear
+                         * and wakeup
+                         */
+                        smp_wmb();
+                        data->csd.flags &= ~CSD_FLAG_WAIT;
+                } else
+                        call_rcu(&data->rcu_head, rcu_free_call_data);
+        }
+        rcu_read_unlock();
+        put_cpu();
+}
+/*
+ * Invoked by arch to handle an IPI for call function single. Must be called
+ * from the arch with interrupts disabled.
+ */
+void generic_smp_call_function_single_interrupt(void)
+{
+        struct call_single_queue *q = &__get_cpu_var(call_single_queue);
+        LIST_HEAD(list);
+        /*
+         * Need to see other stores to list head for checking whether
+         * list is empty without holding q->lock
+         */
+        smp_mb();
+        while (!list_empty(&q->list)) {
+                unsigned int data_flags;
+                spin_lock(&q->lock);
+                list_replace_init(&q->list, &list);
+                spin_unlock(&q->lock);
+                while (!list_empty(&list)) {
+                        struct call_single_data *data;
+                        data = list_entry(list.next, struct call_single_data,
+                                                list);
+                        list_del(&data->list);
+                        /*
+                         * 'data' can be invalid after this call if
+                         * flags == 0 (when called through
+                         * generic_exec_single()), so save them away before
+                         * making the call.
+                         */
+                        data_flags = data->flags;
+                        data->func(data->info);
+                        if (data_flags & CSD_FLAG_WAIT) {
+                                smp_wmb();
+                                data->flags &= ~CSD_FLAG_WAIT;
+                        } else if (data_flags & CSD_FLAG_ALLOC)
+                                kfree(data);
+                }
+                /*
+                 * See comment on outer loop
+                 */
+                smp_mb();
+        }
+}
+/*
+ * smp_call_function_single - Run a function on a specific CPU
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code. Note that @wait
+ * will be implicitly turned on in case of allocation failures, since
+ * we fall back to on-stack allocation.
+ */
+int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+                             int wait)
+{
+        struct call_single_data d;
+        unsigned long flags;
+        /* prevent preemption and reschedule on another processor */
+        int me = get_cpu();
+        /* Can deadlock when called with interrupts disabled */
+        WARN_ON(irqs_disabled());
+        if (cpu == me) {
+                local_irq_save(flags);
+                func(info);
+                local_irq_restore(flags);
+        } else {
+                struct call_single_data *data = NULL;
+                if (!wait) {
+                        data = kmalloc(sizeof(*data), GFP_ATOMIC);
+                        if (data)
+                                data->flags = CSD_FLAG_ALLOC;
+                }
+                if (!data) {
+                        data = &d;
+                        data->flags = CSD_FLAG_WAIT;
+                }
+                data->func = func;
+                data->info = info;
+                generic_exec_single(cpu, data);
+        }
+        put_cpu();
+        return 0;
+}
+EXPORT_SYMBOL(smp_call_function_single);
+/**
+ * __smp_call_function_single(): Run a function on another CPU
+ * @cpu: The CPU to run on.
+ * @data: Pre-allocated and setup data structure
+ *
+ * Like smp_call_function_single(), but allow caller to pass in a pre-allocated
+ * data structure. Useful for embedding @data inside other structures, for
+ * instance.
+ *
+ */
+void __smp_call_function_single(int cpu, struct call_single_data *data)
+{
+        /* Can deadlock when called with interrupts disabled */
+        WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled());
+        generic_exec_single(cpu, data);
+}
+/**
+ * smp_call_function_mask(): Run a function on a set of other CPUs.
+ * @mask: The set of cpus to run on.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned. Note that @wait
+ * will be implicitly turned on in case of allocation failures, since
+ * we fall back to on-stack allocation.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler. Preemption
+ * must be disabled when calling this function.
+ */
+int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
+                           int wait)
+{
+        struct call_function_data d;
+        struct call_function_data *data = NULL;
+        cpumask_t allbutself;
+        unsigned long flags;
+        int cpu, num_cpus;
+        /* Can deadlock when called with interrupts disabled */
+        WARN_ON(irqs_disabled());
+        cpu = smp_processor_id();
+        allbutself = cpu_online_map;
+        cpu_clear(cpu, allbutself);
+        cpus_and(mask, mask, allbutself);
+        num_cpus = cpus_weight(mask);
+        /*
+         * If zero CPUs, return. If just a single CPU, turn this request
+         * into a targeted single call instead since it's faster.
+         */
+        if (!num_cpus)
+                return 0;
+        else if (num_cpus == 1) {
+                cpu = first_cpu(mask);
+                return smp_call_function_single(cpu, func, info, wait);
+        }
+        if (!wait) {
+                data = kmalloc(sizeof(*data), GFP_ATOMIC);
+                if (data)
+                        data->csd.flags = CSD_FLAG_ALLOC;
+        }
+        if (!data) {
+                data = &d;
+                data->csd.flags = CSD_FLAG_WAIT;
+                wait = 1;
+        }
+        spin_lock_init(&data->lock);
+        data->csd.func = func;
+        data->csd.info = info;
+        data->refs = num_cpus;
+        data->cpumask = mask;
+        spin_lock_irqsave(&call_function_lock, flags);
+        list_add_tail_rcu(&data->csd.list, &call_function_queue);
+        spin_unlock_irqrestore(&call_function_lock, flags);
+        /* Send a message to all CPUs in the map */
+        arch_send_call_function_ipi(mask);
+        /* optionally wait for the CPUs to complete */
+        if (wait)
+                csd_flag_wait(&data->csd);
+        return 0;
+}
+EXPORT_SYMBOL(smp_call_function_mask);
+/**
+ * smp_call_function(): Run a function on all other CPUs.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func. In case of allocation
+ * failure, @wait will be implicitly turned on.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ */
+int smp_call_function(void (*func)(void *), void *info, int wait)
+{
+        int ret;
+        preempt_disable();
+        ret = smp_call_function_mask(cpu_online_map, func, info, wait);
+        preempt_enable();
+        return ret;
+}
+EXPORT_SYMBOL(smp_call_function);
+void ipi_call_lock(void)
+{
+        spin_lock(&call_function_lock);
+}
+void ipi_call_unlock(void)
+{
+        spin_unlock(&call_function_lock);
+}
+void ipi_call_lock_irq(void)
+{
+        spin_lock_irq(&call_function_lock);
+}
+void ipi_call_unlock_irq(void)
+{
+        spin_unlock_irq(&call_function_lock);
+}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 36e061740047..c506f266a6b9 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -131,23 +131,17 @@ void _local_bh_enable(void)
 EXPORT_SYMBOL(_local_bh_enable);
-void local_bh_enable(void)
+static inline void _local_bh_enable_ip(unsigned long ip)
 {
+        WARN_ON_ONCE(in_irq() || irqs_disabled());
 #ifdef CONFIG_TRACE_IRQFLAGS
-        unsigned long flags;
+        local_irq_disable();
-        WARN_ON_ONCE(in_irq());
-#endif
-        WARN_ON_ONCE(irqs_disabled());
-#ifdef CONFIG_TRACE_IRQFLAGS
-        local_irq_save(flags);
 #endif
        /*
         * Are softirqs going to be turned on now:
         */
        if (softirq_count() == SOFTIRQ_OFFSET)
-                trace_softirqs_on((unsigned long)__builtin_return_address(0));
+                trace_softirqs_on(ip);
        /*
         * Keep preemption disabled until we are done with
         * softirq processing:
@@ -159,40 +153,20 @@ void local_bh_enable(void)
        dec_preempt_count();
 #ifdef CONFIG_TRACE_IRQFLAGS
-        local_irq_restore(flags);
+        local_irq_enable();
 #endif
        preempt_check_resched();
 }
+void local_bh_enable(void)
+{
+        _local_bh_enable_ip((unsigned long)__builtin_return_address(0));
+}
 EXPORT_SYMBOL(local_bh_enable);
 void local_bh_enable_ip(unsigned long ip)
 {
-#ifdef CONFIG_TRACE_IRQFLAGS
+        _local_bh_enable_ip(ip);
-        unsigned long flags;
-        WARN_ON_ONCE(in_irq());
-        local_irq_save(flags);
-#endif
-        /*
-         * Are softirqs going to be turned on now:
-         */
-        if (softirq_count() == SOFTIRQ_OFFSET)
-                trace_softirqs_on(ip);
-        /*
-         * Keep preemption disabled until we are done with
-         * softirq processing:
-         */
-        sub_preempt_count(SOFTIRQ_OFFSET - 1);
-        if (unlikely(!in_interrupt() && local_softirq_pending()))
-                do_softirq();
-        dec_preempt_count();
-#ifdef CONFIG_TRACE_IRQFLAGS
-        local_irq_restore(flags);
-#endif
-        preempt_check_resched();
 }
 EXPORT_SYMBOL(local_bh_enable_ip);
@@ -312,7 +286,7 @@ void irq_exit(void)
 #ifdef CONFIG_NO_HZ
        /* Make sure that timer wheel updates are propagated */
        if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
-                tick_nohz_stop_sched_tick();
+                tick_nohz_stop_sched_tick(0);
        rcu_irq_exit();
 #endif
        preempt_enable_no_resched();
@@ -347,9 +321,8 @@ void raise_softirq(unsigned int nr)
        local_irq_restore(flags);
 }
-void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
+void open_softirq(int nr, void (*action)(struct softirq_action *))
 {
-        softirq_vec[nr].data = data;
        softirq_vec[nr].action = action;
 }
@@ -360,10 +333,8 @@ struct tasklet_head
        struct tasklet_struct **tail;
 };
-/* Some compilers disobey section attribute on statics when not
+static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
-   initialized -- RR */
+static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
-static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL };
-static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL };
 void __tasklet_schedule(struct tasklet_struct *t)
 {
@@ -503,8 +474,8 @@ void __init softirq_init(void)
                        &per_cpu(tasklet_hi_vec, cpu).head;
        }
-        open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL);
+        open_softirq(TASKLET_SOFTIRQ, tasklet_action);
-        open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);
+        open_softirq(HI_SOFTIRQ, tasklet_hi_action);
 }
 static int ksoftirqd(void * __bind_cpu)
@@ -645,7 +616,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
                p = per_cpu(ksoftirqd, hotcpu);
                per_cpu(ksoftirqd, hotcpu) = NULL;
-                sched_setscheduler(p, SCHED_FIFO, &param);
+                sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
                kthread_stop(p);
                takeover_tasklets(hotcpu);
                break;
@@ -659,7 +630,7 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
        .notifier_call = cpu_callback
 };
-__init int spawn_ksoftirqd(void)
+static __init int spawn_ksoftirqd(void)
 {
        void *cpu = (void *)(long)smp_processor_id();
        int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
@@ -669,17 +640,18 @@ __init int spawn_ksoftirqd(void)
        register_cpu_notifier(&cpu_nfb);
        return 0;
 }
+early_initcall(spawn_ksoftirqd);
 #ifdef CONFIG_SMP
 /*
 * Call a function on all processors
 */
-int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait)
+int on_each_cpu(void (*func) (void *info), void *info, int wait)
 {
        int ret = 0;
        preempt_disable();
-        ret = smp_call_function(func, info, retry, wait);
+        ret = smp_call_function(func, info, wait);
        local_irq_disable();
        func(info);
        local_irq_enable();
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index a272d78185eb..b75b492fbfcf 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -13,6 +13,7 @@
 #include <linux/delay.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
+#include <linux/lockdep.h>
 #include <linux/notifier.h>
 #include <linux/module.h>
@@ -25,7 +26,22 @@ static DEFINE_PER_CPU(unsigned long, print_timestamp);
 static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
 static int __read_mostly did_panic;
-unsigned long __read_mostly softlockup_thresh = 60;
+int __read_mostly softlockup_thresh = 60;
+/*
+ * Should we panic (and reboot, if panic_timeout= is set) when a
+ * soft-lockup occurs:
+ */
+unsigned int __read_mostly softlockup_panic =
+                                CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
+static int __init softlockup_panic_setup(char *str)
+{
+        softlockup_panic = simple_strtoul(str, NULL, 0);
+        return 1;
+}
+__setup("softlockup_panic=", softlockup_panic_setup);
 static int
 softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -84,6 +100,14 @@ void softlockup_tick(void)
        struct pt_regs *regs = get_irq_regs();
        unsigned long now;
+        /* Is detection switched off? */
+        if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) {
+                /* Be sure we don't false trigger if switched back on */
+                if (touch_timestamp)
+                        per_cpu(touch_timestamp, this_cpu) = 0;
+                return;
+        }
        if (touch_timestamp == 0) {
                __touch_softlockup_watchdog();
                return;
@@ -92,11 +116,8 @@ void softlockup_tick(void)
        print_timestamp = per_cpu(print_timestamp, this_cpu);
        /* report at most once a second */
-        if ((print_timestamp >= touch_timestamp &&
+        if (print_timestamp == touch_timestamp || did_panic)
-                        print_timestamp < (touch_timestamp + 1)) ||
-                        did_panic || !per_cpu(watchdog_task, this_cpu)) {
                return;
-        }
        /* do not print during early bootup: */
        if (unlikely(system_state != SYSTEM_RUNNING)) {
@@ -106,8 +127,11 @@ void softlockup_tick(void)
        now = get_timestamp(this_cpu);
-        /* Wake up the high-prio watchdog task every second: */
+        /*
-        if (now > (touch_timestamp + 1))
+         * Wake up the high-prio watchdog task twice per
+         * threshold timespan.
+         */
+        if (now > touch_timestamp + softlockup_thresh/2)
                wake_up_process(per_cpu(watchdog_task, this_cpu));
        /* Warn about unreasonable delays: */
@@ -121,11 +145,15 @@ void softlockup_tick(void)
                        this_cpu, now - touch_timestamp,
                        current->comm, task_pid_nr(current));
        print_modules();
+        print_irqtrace_events(current);
        if (regs)
                show_regs(regs);
        else
                dump_stack();
        spin_unlock(&print_lock);
+        if (softlockup_panic)
+                panic("softlockup: hung tasks");
 }
 /*
@@ -178,6 +206,9 @@ static void check_hung_task(struct task_struct *t, unsigned long now)
        t->last_switch_timestamp = now;
        touch_nmi_watchdog();
+        if (softlockup_panic)
+                panic("softlockup: blocked tasks");
 }
 /*
@@ -307,14 +338,33 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
        .notifier_call = cpu_callback
 };
-__init void spawn_softlockup_task(void)
+static int __initdata nosoftlockup;
+static int __init nosoftlockup_setup(char *str)
+{
+        nosoftlockup = 1;
+        return 1;
+}
+__setup("nosoftlockup", nosoftlockup_setup);
+static int __init spawn_softlockup_task(void)
 {
        void *cpu = (void *)(long)smp_processor_id();
-        int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+        int err;
+        if (nosoftlockup)
+                return 0;
-        BUG_ON(err == NOTIFY_BAD);
+        err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+        if (err == NOTIFY_BAD) {
+                BUG();
+                return 1;
+        }
        cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
        register_cpu_notifier(&cpu_nfb);
        atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+        return 0;
 }
+early_initcall(spawn_softlockup_task);
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index ae28c8245123..a1fb54c93cdd 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -436,7 +436,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock)
 }
 EXPORT_SYMBOL(_spin_trylock_bh);
-int in_lock_functions(unsigned long addr)
+notrace int in_lock_functions(unsigned long addr)
 {
        /* Linker adds these: start and end of __lockfunc functions */
        extern char __lock_text_start[], __lock_text_end[];
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index b71816e47a30..94b527ef1d1e 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -6,19 +6,21 @@
 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 */
 #include <linux/sched.h>
+#include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/stacktrace.h>
 void print_stack_trace(struct stack_trace *trace, int spaces)
 {
-        int i, j;
+        int i;
-        for (i = 0; i < trace->nr_entries; i++) {
+        if (WARN_ON(!trace->entries))
-                unsigned long ip = trace->entries[i];
+                return;
-                for (j = 0; j < spaces + 1; j++)
+        for (i = 0; i < trace->nr_entries; i++) {
-                        printk(" ");
+                printk("%*c", 1 + spaces, ' ');
-                print_ip_sym(ip);
+                print_ip_sym(trace->entries[i]);
        }
 }
+EXPORT_SYMBOL_GPL(print_stack_trace);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b7350bbfb076..738b411ff2d3 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -33,8 +33,9 @@ static int stopmachine(void *cpu)
 {
        int irqs_disabled = 0;
        int prepared = 0;
+        cpumask_of_cpu_ptr(cpumask, (int)(long)cpu);
-        set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu));
+        set_cpus_allowed_ptr(current, cpumask);
        /* Ack: we are alive */
        smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
@@ -187,7 +188,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
                struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
                /* One high-prio thread per cpu.  We'll do this one. */
-                sched_setscheduler(p, SCHED_FIFO, &param);
+                sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
                kthread_bind(p, cpu);
                wake_up_process(p);
                wait_for_completion(&smdata.done);
diff --git a/kernel/sys.c b/kernel/sys.c
index 14e97282eb6c..c01858090a98 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -301,26 +301,6 @@ void kernel_restart(char *cmd)
 }
 EXPORT_SYMBOL_GPL(kernel_restart);
-/**
- *      kernel_kexec - reboot the system
- *
- *      Move into place and start executing a preloaded standalone
- *      executable.  If nothing was preloaded return an error.
- */
-static void kernel_kexec(void)
-{
-#ifdef CONFIG_KEXEC
-        struct kimage *image;
-        image = xchg(&kexec_image, NULL);
-        if (!image)
-                return;
-        kernel_restart_prepare(NULL);
-        printk(KERN_EMERG "Starting new kernel\n");
-        machine_shutdown();
-        machine_kexec(image);
-#endif
-}
 static void kernel_shutdown_prepare(enum system_states state)
 {
        blocking_notifier_call_chain(&reboot_notifier_list,
@@ -425,10 +405,15 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
                kernel_restart(buffer);
                break;
+#ifdef CONFIG_KEXEC
        case LINUX_REBOOT_CMD_KEXEC:
-                kernel_kexec();
+                {
-                unlock_kernel();
+                        int ret;
-                return -EINVAL;
+                        ret = kernel_kexec();
+                        unlock_kernel();
+                        return ret;
+                }
+#endif
 #ifdef CONFIG_HIBERNATION
        case LINUX_REBOOT_CMD_SW_SUSPEND:
@@ -1343,8 +1328,6 @@ EXPORT_SYMBOL(in_egroup_p);
 DECLARE_RWSEM(uts_sem);
-EXPORT_SYMBOL(uts_sem);
 asmlinkage long sys_newuname(struct new_utsname __user * name)
 {
        int errno = 0;
@@ -1795,7 +1778,7 @@ int orderly_poweroff(bool force)
                goto out;
        }
-        info = call_usermodehelper_setup(argv[0], argv, envp);
+        info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC);
        if (info == NULL) {
                argv_free(argv);
                goto out;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5b9b467de070..08d6e1bb99ac 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -31,6 +31,7 @@ cond_syscall(sys_socketpair);
 cond_syscall(sys_bind);
 cond_syscall(sys_listen);
 cond_syscall(sys_accept);
+cond_syscall(sys_paccept);
 cond_syscall(sys_connect);
 cond_syscall(sys_getsockname);
 cond_syscall(sys_getpeername);
@@ -56,9 +57,11 @@ cond_syscall(compat_sys_set_robust_list);
 cond_syscall(sys_get_robust_list);
 cond_syscall(compat_sys_get_robust_list);
 cond_syscall(sys_epoll_create);
+cond_syscall(sys_epoll_create1);
 cond_syscall(sys_epoll_ctl);
 cond_syscall(sys_epoll_wait);
 cond_syscall(sys_epoll_pwait);
+cond_syscall(compat_sys_epoll_pwait);
 cond_syscall(sys_semget);
 cond_syscall(sys_semop);
 cond_syscall(sys_semtimedop);
@@ -94,6 +97,7 @@ cond_syscall(sys_keyctl);
 cond_syscall(compat_sys_keyctl);
 cond_syscall(compat_sys_socketcall);
 cond_syscall(sys_inotify_init);
+cond_syscall(sys_inotify_init1);
 cond_syscall(sys_inotify_add_watch);
 cond_syscall(sys_inotify_rm_watch);
 cond_syscall(sys_migrate_pages);
@@ -154,10 +158,13 @@ cond_syscall(sys_ioprio_get);
 /* New file descriptors */
 cond_syscall(sys_signalfd);
+cond_syscall(sys_signalfd4);
 cond_syscall(compat_sys_signalfd);
+cond_syscall(compat_sys_signalfd4);
 cond_syscall(sys_timerfd_create);
 cond_syscall(sys_timerfd_settime);
 cond_syscall(sys_timerfd_gettime);
 cond_syscall(compat_sys_timerfd_settime);
 cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
+cond_syscall(sys_eventfd2);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 29116652dca8..fe4713347275 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -43,9 +43,11 @@
 #include <linux/limits.h>
 #include <linux/dcache.h>
 #include <linux/syscalls.h>
+#include <linux/vmstat.h>
 #include <linux/nfs_fs.h>
 #include <linux/acpi.h>
 #include <linux/reboot.h>
+#include <linux/ftrace.h>
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -79,17 +81,20 @@ extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
 extern int compat_log;
 extern int maps_protect;
-extern int sysctl_stat_interval;
 extern int latencytop_enabled;
 extern int sysctl_nr_open_min, sysctl_nr_open_max;
+#ifdef CONFIG_RCU_TORTURE_TEST
+extern int rcutorture_runnable;
+#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
 /* Constants used for minimum and  maximum */
-#if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM)
+#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP)
 static int one = 1;
 #endif
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 static int sixty = 60;
+static int neg_one = -1;
 #endif
 #ifdef CONFIG_MMU
@@ -106,7 +111,7 @@ static int min_percpu_pagelist_fract = 8;
 static int ngroups_max = NGROUPS_MAX;
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 extern char modprobe_path[];
 #endif
 #ifdef CONFIG_CHR_DEV_SG
@@ -132,8 +137,6 @@ extern int sysctl_userprocess_debug;
 extern int spin_retry;
 #endif
-extern int sysctl_hz_timer;
 #ifdef CONFIG_BSD_PROCESS_ACCT
 extern int acct_parm[];
 #endif
@@ -157,12 +160,13 @@ static struct ctl_table root_table[];
 static struct ctl_table_root sysctl_table_root;
 static struct ctl_table_header root_table_header = {
        .ctl_table = root_table,
-        .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list),
+        .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),
        .root = &sysctl_table_root,
+        .set = &sysctl_table_root.default_set,
 };
 static struct ctl_table_root sysctl_table_root = {
        .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
-        .header_list = LIST_HEAD_INIT(root_table_header.ctl_entry),
+        .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
 };
 static struct ctl_table kern_table[];
@@ -266,6 +270,14 @@ static struct ctl_table kern_table[] = {
        },
        {
                .ctl_name       = CTL_UNNUMBERED,
+                .procname       = "sched_shares_ratelimit",
+                .data           = &sysctl_sched_shares_ratelimit,
+                .maxlen         = sizeof(unsigned int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec,
+        },
+        {
+                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "sched_child_runs_first",
                .data           = &sysctl_sched_child_runs_first,
                .maxlen         = sizeof(unsigned int),
@@ -455,7 +467,17 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_FTRACE
+        {
+                .ctl_name       = CTL_UNNUMBERED,
+                .procname       = "ftrace_enabled",
+                .data           = &ftrace_enabled,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = &ftrace_enable_sysctl,
+        },
+#endif
+#ifdef CONFIG_MODULES
        {
                .ctl_name       = KERN_MODPROBE,
                .procname       = "modprobe",
@@ -563,16 +585,6 @@ static struct ctl_table kern_table[] = {
                .proc_handler   = &proc_dointvec,
        },
 #endif
-#ifdef CONFIG_NO_IDLE_HZ
-        {
-                .ctl_name       = KERN_HZ_TIMER,
-                .procname       = "hz_timer",
-                .data           = &sysctl_hz_timer,
-                .maxlen         = sizeof(int),
-                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
-        },
-#endif
        {
                .ctl_name       = KERN_S390_USER_DEBUG_LOGGING,
                .procname       = "userprocess_debug",
@@ -613,7 +625,7 @@ static struct ctl_table kern_table[] = {
        {
                .ctl_name       = KERN_PRINTK_RATELIMIT,
                .procname       = "printk_ratelimit",
-                .data           = &printk_ratelimit_jiffies,
+                .data           = &printk_ratelimit_state.interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
@@ -622,7 +634,7 @@ static struct ctl_table kern_table[] = {
        {
                .ctl_name       = KERN_PRINTK_RATELIMIT_BURST,
                .procname       = "printk_ratelimit_burst",
-                .data           = &printk_ratelimit_burst,
+                .data           = &printk_ratelimit_state.burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
@@ -729,13 +741,24 @@ static struct ctl_table kern_table[] = {
 #ifdef CONFIG_DETECT_SOFTLOCKUP
        {
                .ctl_name       = CTL_UNNUMBERED,
+                .procname       = "softlockup_panic",
+                .data           = &softlockup_panic,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec_minmax,
+                .strategy       = &sysctl_intvec,
+                .extra1         = &zero,
+                .extra2         = &one,
+        },
+        {
+                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "softlockup_thresh",
                .data           = &softlockup_thresh,
-                .maxlen         = sizeof(unsigned long),
+                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_doulongvec_minmax,
+                .proc_handler   = &proc_dointvec_minmax,
                .strategy       = &sysctl_intvec,
-                .extra1         = &one,
+                .extra1         = &neg_one,
                .extra2         = &sixty,
        },
        {
@@ -813,6 +836,16 @@ static struct ctl_table kern_table[] = {
                .child          = key_sysctls,
        },
 #endif
+#ifdef CONFIG_RCU_TORTURE_TEST
+        {
+                .ctl_name       = CTL_UNNUMBERED,
+                .procname       = "rcutorture_runnable",
+                .data           = &rcutorture_runnable,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec,
+        },
+#endif
 /*
 * NOTE: do not add new entries to this table unless you have read
 * Documentation/sysctl/ctl_unnumbered.txt
@@ -927,7 +960,7 @@ static struct ctl_table vm_table[] = {
 #ifdef CONFIG_HUGETLB_PAGE
         {
                .procname       = "nr_hugepages",
-                .data           = &max_huge_pages,
+                .data           = NULL,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
                .proc_handler   = &hugetlb_sysctl_handler,
@@ -953,10 +986,12 @@ static struct ctl_table vm_table[] = {
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nr_overcommit_hugepages",
-                .data           = &sysctl_overcommit_huge_pages,
+                .data           = NULL,
-                .maxlen         = sizeof(sysctl_overcommit_huge_pages),
+                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
                .proc_handler   = &hugetlb_overcommit_handler,
+                .extra1         = (void *)&hugetlb_zero,
+                .extra2         = (void *)&hugetlb_infinity,
        },
 #endif
        {
@@ -1352,6 +1387,9 @@ static void start_unregistering(struct ctl_table_header *p)
                spin_unlock(&sysctl_lock);
                wait_for_completion(&wait);
                spin_lock(&sysctl_lock);
+        } else {
+                /* anything non-NULL; we'll never dereference it */
+                p->unregistering = ERR_PTR(-EINVAL);
        }
        /*
         * do not remove from the list until nobody holds it; walking the
@@ -1360,6 +1398,32 @@ static void start_unregistering(struct ctl_table_header *p)
        list_del_init(&p->ctl_entry);
 }
+void sysctl_head_get(struct ctl_table_header *head)
+{
+        spin_lock(&sysctl_lock);
+        head->count++;
+        spin_unlock(&sysctl_lock);
+}
+void sysctl_head_put(struct ctl_table_header *head)
+{
+        spin_lock(&sysctl_lock);
+        if (!--head->count)
+                kfree(head);
+        spin_unlock(&sysctl_lock);
+}
+struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
+{
+        if (!head)
+                BUG();
+        spin_lock(&sysctl_lock);
+        if (!use_table(head))
+                head = ERR_PTR(-ENOENT);
+        spin_unlock(&sysctl_lock);
+        return head;
+}
 void sysctl_head_finish(struct ctl_table_header *head)
 {
        if (!head)
@@ -1369,14 +1433,20 @@ void sysctl_head_finish(struct ctl_table_header *head)
        spin_unlock(&sysctl_lock);
 }
+static struct ctl_table_set *
+lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
+{
+        struct ctl_table_set *set = &root->default_set;
+        if (root->lookup)
+                set = root->lookup(root, namespaces);
+        return set;
+}
 static struct list_head *
 lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
 {
-        struct list_head *header_list;
+        struct ctl_table_set *set = lookup_header_set(root, namespaces);
-        header_list = &root->header_list;
+        return &set->list;
-        if (root->lookup)
-                header_list = root->lookup(root, namespaces);
-        return header_list;
 }
 struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
@@ -1446,9 +1516,9 @@ static int do_sysctl_strategy(struct ctl_table_root *root,
        int op = 0, rc;
        if (oldval)
-                op |= 004;
+                op |= MAY_READ;
        if (newval)
-                op |= 002;
+                op |= MAY_WRITE;
        if (sysctl_perm(root, table, op))
                return -EPERM;
@@ -1490,7 +1560,7 @@ repeat:
                if (n == table->ctl_name) {
                        int error;
                        if (table->child) {
-                                if (sysctl_perm(root, table, 001))
+                                if (sysctl_perm(root, table, MAY_EXEC))
                                        return -EPERM;
                                name++;
                                nlen--;
@@ -1565,7 +1635,7 @@ static int test_perm(int mode, int op)
                mode >>= 6;
        else if (in_egroup_p(0))
                mode >>= 3;
-        if ((mode & op & 0007) == op)
+        if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
                return 0;
        return -EACCES;
 }
@@ -1575,7 +1645,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
        int error;
        int mode;
-        error = security_sysctl(table, op);
+        error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC));
        if (error)
                return error;
@@ -1610,6 +1680,54 @@ static __init int sysctl_init(void)
 core_initcall(sysctl_init);
+static struct ctl_table *is_branch_in(struct ctl_table *branch,
+                                      struct ctl_table *table)
+{
+        struct ctl_table *p;
+        const char *s = branch->procname;
+        /* branch should have named subdirectory as its first element */
+        if (!s || !branch->child)
+                return NULL;
+        /* ... and nothing else */
+        if (branch[1].procname || branch[1].ctl_name)
+                return NULL;
+        /* table should contain subdirectory with the same name */
+        for (p = table; p->procname || p->ctl_name; p++) {
+                if (!p->child)
+                        continue;
+                if (p->procname && strcmp(p->procname, s) == 0)
+                        return p;
+        }
+        return NULL;
+}
+/* see if attaching q to p would be an improvement */
+static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
+{
+        struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
+        struct ctl_table *next;
+        int is_better = 0;
+        int not_in_parent = !p->attached_by;
+        while ((next = is_branch_in(by, to)) != NULL) {
+                if (by == q->attached_by)
+                        is_better = 1;
+                if (to == p->attached_by)
+                        not_in_parent = 1;
+                by = by->child;
+                to = next->child;
+        }
+        if (is_better && not_in_parent) {
+                q->attached_by = by;
+                q->attached_to = to;
+                q->parent = p;
+        }
+}
 /**
 * __register_sysctl_paths - register a sysctl hierarchy
 * @root: List of sysctl headers to register on
@@ -1686,10 +1804,10 @@ struct ctl_table_header *__register_sysctl_paths(
        struct nsproxy *namespaces,
        const struct ctl_path *path, struct ctl_table *table)
 {
-        struct list_head *header_list;
        struct ctl_table_header *header;
        struct ctl_table *new, **prevp;
        unsigned int n, npath;
+        struct ctl_table_set *set;
        /* Count the path components */
        for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath)
@@ -1731,6 +1849,7 @@ struct ctl_table_header *__register_sysctl_paths(
        header->unregistering = NULL;
        header->root = root;
        sysctl_set_parent(NULL, header->ctl_table);
+        header->count = 1;
 #ifdef CONFIG_SYSCTL_SYSCALL_CHECK
        if (sysctl_check_table(namespaces, header->ctl_table)) {
                kfree(header);
@@ -1738,8 +1857,20 @@ struct ctl_table_header *__register_sysctl_paths(
        }
 #endif
        spin_lock(&sysctl_lock);
-        header_list = lookup_header_list(root, namespaces);
+        header->set = lookup_header_set(root, namespaces);
-        list_add_tail(&header->ctl_entry, header_list);
+        header->attached_by = header->ctl_table;
+        header->attached_to = root_table;
+        header->parent = &root_table_header;
+        for (set = header->set; set; set = set->parent) {
+                struct ctl_table_header *p;
+                list_for_each_entry(p, &set->list, ctl_entry) {
+                        if (p->unregistering)
+                                continue;
+                        try_attach(p, header);
+                }
+        }
+        header->parent->count++;
+        list_add_tail(&header->ctl_entry, &header->set->list);
        spin_unlock(&sysctl_lock);
        return header;
@@ -1794,8 +1925,37 @@ void unregister_sysctl_table(struct ctl_table_header * header)
        spin_lock(&sysctl_lock);
        start_unregistering(header);
+        if (!--header->parent->count) {
+                WARN_ON(1);
+                kfree(header->parent);
+        }
+        if (!--header->count)
+                kfree(header);
+        spin_unlock(&sysctl_lock);
+}
+int sysctl_is_seen(struct ctl_table_header *p)
+{
+        struct ctl_table_set *set = p->set;
+        int res;
+        spin_lock(&sysctl_lock);
+        if (p->unregistering)
+                res = 0;
+        else if (!set->is_seen)
+                res = 1;
+        else
+                res = set->is_seen(set);
        spin_unlock(&sysctl_lock);
-        kfree(header);
+        return res;
+}
+void setup_sysctl_set(struct ctl_table_set *p,
+        struct ctl_table_set *parent,
+        int (*is_seen)(struct ctl_table_set *))
+{
+        INIT_LIST_HEAD(&p->list);
+        p->parent = parent ? parent : &sysctl_table_root.default_set;
+        p->is_seen = is_seen;
 }
 #else /* !CONFIG_SYSCTL */
@@ -1814,6 +1974,16 @@ void unregister_sysctl_table(struct ctl_table_header * table)
 {
 }
+void setup_sysctl_set(struct ctl_table_set *p,
+        struct ctl_table_set *parent,
+        int (*is_seen)(struct ctl_table_set *))
+{
+}
+void sysctl_head_put(struct ctl_table_header *head)
+{
+}
 #endif /* CONFIG_SYSCTL */
 /*
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index c09350d564f2..c35da23ab8fb 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1532,6 +1532,8 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
                        sysctl_check_leaf(namespaces, table, &fail);
                }
                sysctl_check_bin_path(table, &fail);
+                if (table->mode > 0777)
+                        set_fail(&fail, table, "bogus .mode");
                if (fail) {
                        set_fail(&fail, table, NULL);
                        error = -EINVAL;
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4a23517169a6..bd6be76303cf 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -35,7 +35,7 @@
 */
 #define TASKSTATS_CPUMASK_MAXLEN        (100+6*NR_CPUS)
-static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
+static DEFINE_PER_CPU(__u32, taskstats_seqnum);
 static int family_registered;
 struct kmem_cache *taskstats_cache;
@@ -301,7 +301,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
                return -EINVAL;
        if (isadd == REGISTER) {
-                for_each_cpu_mask(cpu, mask) {
+                for_each_cpu_mask_nr(cpu, mask) {
                        s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
                                         cpu_to_node(cpu));
                        if (!s)
@@ -320,7 +320,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
        /* Deregister or cleanup */
 cleanup:
-        for_each_cpu_mask(cpu, mask) {
+        for_each_cpu_mask_nr(cpu, mask) {
                listeners = &per_cpu(listener_array, cpu);
                down_write(&listeners->sem);
                list_for_each_entry_safe(s, tmp, &listeners->list, list) {
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index dadde5361f32..093d4acf993b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -145,9 +145,9 @@ static void clocksource_watchdog(unsigned long data)
                 * Cycle through CPUs to check if the CPUs stay
                 * synchronized to each other.
                 */
-                int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map);
+                int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map);
-                if (next_cpu >= NR_CPUS)
+                if (next_cpu >= nr_cpu_ids)
                        next_cpu = first_cpu(cpu_online_map);
                watchdog_timer.expires += WATCHDOG_INTERVAL;
                add_timer_on(&watchdog_timer, next_cpu);
@@ -376,7 +376,8 @@ void clocksource_unregister(struct clocksource *cs)
 * Provides sysfs interface for listing current clocksource.
 */
 static ssize_t
-sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
+sysfs_show_current_clocksources(struct sys_device *dev,
+                                struct sysdev_attribute *attr, char *buf)
 {
        ssize_t count = 0;
@@ -397,6 +398,7 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
 * clocksource selction.
 */
 static ssize_t sysfs_override_clocksource(struct sys_device *dev,
+                                          struct sysdev_attribute *attr,
                                          const char *buf, size_t count)
 {
        struct clocksource *ovr = NULL;
@@ -449,7 +451,9 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
 * Provides sysfs interface for listing registered clocksources
 */
 static ssize_t
-sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
+sysfs_show_available_clocksources(struct sys_device *dev,
+                                  struct sysdev_attribute *attr,
+                                  char *buf)
 {
        struct clocksource *src;
        ssize_t count = 0;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 57a1f02e5ec0..31463d370b94 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -30,6 +30,7 @@
 struct tick_device tick_broadcast_device;
 static cpumask_t tick_broadcast_mask;
 static DEFINE_SPINLOCK(tick_broadcast_lock);
+static int tick_broadcast_force;
 #ifdef CONFIG_TICK_ONESHOT
 static void tick_broadcast_clear_oneshot(int cpu);
@@ -232,10 +233,11 @@ static void tick_do_broadcast_on_off(void *why)
                                                     CLOCK_EVT_MODE_SHUTDOWN);
                }
                if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
-                        dev->features |= CLOCK_EVT_FEAT_DUMMY;
+                        tick_broadcast_force = 1;
                break;
        case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
-                if (cpu_isset(cpu, tick_broadcast_mask)) {
+                if (!tick_broadcast_force &&
+                    cpu_isset(cpu, tick_broadcast_mask)) {
                        cpu_clear(cpu, tick_broadcast_mask);
                        if (td->mode == TICKDEV_MODE_PERIODIC)
                                tick_setup_periodic(dev, 0);
@@ -266,7 +268,7 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu)
                       "offline CPU #%d\n", *oncpu);
        else
                smp_call_function_single(*oncpu, tick_do_broadcast_on_off,
-                                         &reason, 1, 1);
+                                         &reason, 1);
 }
 /*
@@ -397,8 +399,7 @@ again:
        mask = CPU_MASK_NONE;
        now = ktime_get();
        /* Find all expired events */
-        for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
+        for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) {
-             cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
                td = &per_cpu(tick_cpu_device, cpu);
                if (td->evtdev->next_event.tv64 <= now.tv64)
                        cpu_set(cpu, mask);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 4f3886562b8c..bf43284d6855 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -135,7 +135,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
 */
 static void tick_setup_device(struct tick_device *td,
                              struct clock_event_device *newdev, int cpu,
-                              cpumask_t cpumask)
+                              const cpumask_t *cpumask)
 {
        ktime_t next_event;
        void (*handler)(struct clock_event_device *) = NULL;
@@ -169,8 +169,8 @@ static void tick_setup_device(struct tick_device *td,
         * When the device is not per cpu, pin the interrupt to the
         * current cpu:
         */
-        if (!cpus_equal(newdev->cpumask, cpumask))
+        if (!cpus_equal(newdev->cpumask, *cpumask))
-                irq_set_affinity(newdev->irq, cpumask);
+                irq_set_affinity(newdev->irq, *cpumask);
        /*
         * When global broadcasting is active, check if the current
@@ -196,20 +196,20 @@ static int tick_check_new_device(struct clock_event_device *newdev)
        struct tick_device *td;
        int cpu, ret = NOTIFY_OK;
        unsigned long flags;
-        cpumask_t cpumask;
+        cpumask_of_cpu_ptr_declare(cpumask);
        spin_lock_irqsave(&tick_device_lock, flags);
        cpu = smp_processor_id();
+        cpumask_of_cpu_ptr_next(cpumask, cpu);
        if (!cpu_isset(cpu, newdev->cpumask))
                goto out_bc;
        td = &per_cpu(tick_cpu_device, cpu);
        curdev = td->evtdev;
-        cpumask = cpumask_of_cpu(cpu);
        /* cpu local device ? */
-        if (!cpus_equal(newdev->cpumask, cpumask)) {
+        if (!cpus_equal(newdev->cpumask, *cpumask)) {
                /*
                 * If the cpu affinity of the device interrupt can not
@@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
                 * If we have a cpu local device already, do not replace it
                 * by a non cpu local device
                 */
-                if (curdev && cpus_equal(curdev->cpumask, cpumask))
+                if (curdev && cpus_equal(curdev->cpumask, *cpumask))
                        goto out_bc;
        }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b854a895591e..825b4c00fe44 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -48,6 +48,13 @@ static void tick_do_update_jiffies64(ktime_t now)
        unsigned long ticks = 0;
        ktime_t delta;
+        /*
+         * Do a quick check without holding xtime_lock:
+         */
+        delta = ktime_sub(now, last_jiffies_update);
+        if (delta.tv64 < tick_period.tv64)
+                return;
        /* Reevalute with xtime_lock held */
        write_seqlock(&xtime_lock);
@@ -133,8 +140,6 @@ void tick_nohz_update_jiffies(void)
        if (!ts->tick_stopped)
                return;
-        touch_softlockup_watchdog();
        cpu_clear(cpu, nohz_cpu_mask);
        now = ktime_get();
        ts->idle_waketime = now;
@@ -142,6 +147,8 @@ void tick_nohz_update_jiffies(void)
        local_irq_save(flags);
        tick_do_update_jiffies64(now);
        local_irq_restore(flags);
+        touch_softlockup_watchdog();
 }
 void tick_nohz_stop_idle(int cpu)
@@ -188,7 +195,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 * Called either from the idle loop or from irq_exit() when an idle period was
 * just interrupted by an interrupt which did not cause a reschedule.
 */
-void tick_nohz_stop_sched_tick(void)
+void tick_nohz_stop_sched_tick(int inidle)
 {
        unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
        struct tick_sched *ts;
@@ -217,6 +224,11 @@ void tick_nohz_stop_sched_tick(void)
        if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
                goto end;
+        if (!inidle && !ts->inidle)
+                goto end;
+        ts->inidle = 1;
        if (need_resched())
                goto end;
@@ -228,6 +240,7 @@ void tick_nohz_stop_sched_tick(void)
                               local_softirq_pending());
                        ratelimit++;
                }
+                goto end;
        }
        ts->idle_calls++;
@@ -276,6 +289,7 @@ void tick_nohz_stop_sched_tick(void)
                        ts->tick_stopped = 1;
                        ts->idle_jiffies = last_jiffies;
                        rcu_enter_nohz();
+                        sched_clock_tick_stop(cpu);
                }
                /*
@@ -364,17 +378,21 @@ void tick_nohz_restart_sched_tick(void)
        local_irq_disable();
        tick_nohz_stop_idle(cpu);
-        if (!ts->tick_stopped) {
+        if (!ts->inidle || !ts->tick_stopped) {
+                ts->inidle = 0;
                local_irq_enable();
                return;
        }
+        ts->inidle = 0;
        rcu_exit_nohz();
        /* Update jiffies first */
        select_nohz_load_balancer(0);
        now = ktime_get();
        tick_do_update_jiffies64(now);
+        sched_clock_tick_start(cpu);
        cpu_clear(cpu, nohz_cpu_mask);
        /*
diff --git a/kernel/timer.c b/kernel/timer.c
index ceacc6626572..03bc7f1f1593 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -812,7 +812,7 @@ static inline void __run_timers(struct tvec_base *base)
        spin_unlock_irq(&base->lock);
 }
-#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ)
+#ifdef CONFIG_NO_HZ
 /*
 * Find out when the next timer event is due to happen. This
 * is used on S/390 to stop all activity when a cpus is idle.
@@ -947,14 +947,6 @@ unsigned long get_next_timer_interrupt(unsigned long now)
        return cmp_next_hrtimer_event(now, expires);
 }
-#ifdef CONFIG_NO_IDLE_HZ
-unsigned long next_timer_interrupt(void)
-{
-        return get_next_timer_interrupt(jiffies);
-}
-#endif
 #endif
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
@@ -1502,7 +1494,7 @@ void __init init_timers(void)
        BUG_ON(err == NOTIFY_BAD);
        register_cpu_notifier(&timers_nb);
-        open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
+        open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
 }
 /**
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
new file mode 100644
index 000000000000..263e9e6bbd60
--- /dev/null
+++ b/kernel/trace/Kconfig
@@ -0,0 +1,135 @@
+#
+# Architectures that offer an FTRACE implementation should select HAVE_FTRACE:
+#
+config HAVE_FTRACE
+        bool
+config HAVE_DYNAMIC_FTRACE
+        bool
+config TRACER_MAX_TRACE
+        bool
+config TRACING
+        bool
+        select DEBUG_FS
+        select STACKTRACE
+config FTRACE
+        bool "Kernel Function Tracer"
+        depends on HAVE_FTRACE
+        select FRAME_POINTER
+        select TRACING
+        select CONTEXT_SWITCH_TRACER
+        help
+          Enable the kernel to trace every kernel function. This is done
+          by using a compiler feature to insert a small, 5-byte No-Operation
+          instruction to the beginning of every kernel function, which NOP
+          sequence is then dynamically patched into a tracer call when
+          tracing is enabled by the administrator. If it's runtime disabled
+          (the bootup default), then the overhead of the instructions is very
+          small and not measurable even in micro-benchmarks.
+config IRQSOFF_TRACER
+        bool "Interrupts-off Latency Tracer"
+        default n
+        depends on TRACE_IRQFLAGS_SUPPORT
+        depends on GENERIC_TIME
+        depends on HAVE_FTRACE
+        select TRACE_IRQFLAGS
+        select TRACING
+        select TRACER_MAX_TRACE
+        help
+          This option measures the time spent in irqs-off critical
+          sections, with microsecond accuracy.
+          The default measurement method is a maximum search, which is
+          disabled by default and can be runtime (re-)started
+          via:
+              echo 0 > /debugfs/tracing/tracing_max_latency
+          (Note that kernel size and overhead increases with this option
+          enabled. This option and the preempt-off timing option can be
+          used together or separately.)
+config PREEMPT_TRACER
+        bool "Preemption-off Latency Tracer"
+        default n
+        depends on GENERIC_TIME
+        depends on PREEMPT
+        depends on HAVE_FTRACE
+        select TRACING
+        select TRACER_MAX_TRACE
+        help
+          This option measures the time spent in preemption off critical
+          sections, with microsecond accuracy.
+          The default measurement method is a maximum search, which is
+          disabled by default and can be runtime (re-)started
+          via:
+              echo 0 > /debugfs/tracing/tracing_max_latency
+          (Note that kernel size and overhead increases with this option
+          enabled. This option and the irqs-off timing option can be
+          used together or separately.)
+config SYSPROF_TRACER
+        bool "Sysprof Tracer"
+        depends on X86
+        select TRACING
+        help
+          This tracer provides the trace needed by the 'Sysprof' userspace
+          tool.
+config SCHED_TRACER
+        bool "Scheduling Latency Tracer"
+        depends on HAVE_FTRACE
+        select TRACING
+        select CONTEXT_SWITCH_TRACER
+        select TRACER_MAX_TRACE
+        help
+          This tracer tracks the latency of the highest priority task
+          to be scheduled in, starting from the point it has woken up.
+config CONTEXT_SWITCH_TRACER
+        bool "Trace process context switches"
+        depends on HAVE_FTRACE
+        select TRACING
+        select MARKERS
+        help
+          This tracer gets called from the context switch and records
+          all switching of tasks.
+config DYNAMIC_FTRACE
+        bool "enable/disable ftrace tracepoints dynamically"
+        depends on FTRACE
+        depends on HAVE_DYNAMIC_FTRACE
+        default y
+        help
+         This option will modify all the calls to ftrace dynamically
+         (it patches them out of the binary image and replaces them
+         with a No-Op instruction) as they are called. A table is
+         created to dynamically enable them again.
+         This way a CONFIG_FTRACE kernel is slightly larger, but otherwise
+         has native performance as long as no tracing is active.
+         The changes to the code are done by a kernel thread that
+         wakes up once a second and checks to see if any ftrace calls
+         were made. If so, it runs stop_machine (stops all CPUS)
+         and modifies the code to jump over the call to ftrace.
+config FTRACE_SELFTEST
+        bool
+config FTRACE_STARTUP_TEST
+        bool "Perform a startup test on ftrace"
+        depends on TRACING
+        select FTRACE_SELFTEST
+        help
+          This option performs a series of startup tests on ftrace. On bootup
+          a series of tests are made to verify that the tracer is
+          functioning properly. It will do tests on all the configured
+          tracers of ftrace.
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
new file mode 100644
index 000000000000..71d17de17288
--- /dev/null
+++ b/kernel/trace/Makefile
@@ -0,0 +1,24 @@
+# Do not instrument the tracer itself:
+ifdef CONFIG_FTRACE
+# Remember the unmodified flags, then build this directory without -pg
+ORIG_CFLAGS := $(KBUILD_CFLAGS)
+KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
+# selftest needs instrumentation
+CFLAGS_trace_selftest_dynamic.o = -pg
+obj-y += trace_selftest_dynamic.o
+endif
+# libftrace.o is the composite object made of the libftrace-y list below
+obj-$(CONFIG_FTRACE) += libftrace.o
+obj-$(CONFIG_TRACING) += trace.o
+obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
+obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
+obj-$(CONFIG_FTRACE) += trace_functions.o
+# trace_irqsoff.o is listed for both the irqs-off and the preempt-off tracer
+obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
+obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
+obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
+obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
+libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
new file mode 100644
index 000000000000..4231a3dc224a
--- /dev/null
+++ b/kernel/trace/ftrace.c
@@ -0,0 +1,1727 @@
+/*
+ * Infrastructure for profiling code inserted by 'gcc -pg'.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2004-2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Originally ported from the -rt patch by:
+ *   Copyright (C) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Based on code in the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/stop_machine.h>
+#include <linux/clocksource.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/hardirq.h>
+#include <linux/kthread.h>
+#include <linux/uaccess.h>
+#include <linux/kprobes.h>
+#include <linux/ftrace.h>
+#include <linux/sysctl.h>
+#include <linux/ctype.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <asm/ftrace.h>
+#include "trace.h"
+/* ftrace_enabled is a method to turn ftrace on or off */
+int ftrace_enabled __read_mostly;
+static int last_ftrace_enabled;
+/*
+ * ftrace_disabled is set when an anomaly is discovered.
+ * ftrace_disabled is much stronger than ftrace_enabled.
+ */
+static int ftrace_disabled __read_mostly;
+/* held while ftrace_list is modified by register/unregister */
+static DEFINE_SPINLOCK(ftrace_lock);
+static DEFINE_MUTEX(ftrace_sysctl_lock);
+/* sentinel that terminates ftrace_list; its callback is ftrace_stub */
+static struct ftrace_ops ftrace_list_end __read_mostly =
+{
+        .func = ftrace_stub,
+};
+/* head of the singly linked list of registered ftrace_ops */
+static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
+/* callback currently selected for tracing; ftrace_stub when none is set */
+ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
+/*
+ * Call every callback registered on ftrace_list, in list order, passing
+ * @ip and @parent_ip through unchanged.
+ */
+static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
+{
+        struct ftrace_ops *cur;
+        cur = ftrace_list;
+        /* in case someone actually ports this to alpha! */
+        read_barrier_depends();
+        for (; cur != &ftrace_list_end; cur = cur->next) {
+                /* silly alpha */
+                read_barrier_depends();
+                cur->func(ip, parent_ip);
+        }
+}
+/**
+ * clear_ftrace_function - reset the ftrace function
+ *
+ * This points ftrace_trace_function back at ftrace_stub and in
+ * essence stops tracing.  There may be a lag before the change
+ * takes effect.
+ */
+void clear_ftrace_function(void)
+{
+        ftrace_trace_function = ftrace_stub;
+}
+/*
+ * Push @ops onto the front of ftrace_list and, when ftrace is enabled,
+ * reselect the function that ftrace_trace_function points at.
+ * Always returns 0.
+ */
+static int __register_ftrace_function(struct ftrace_ops *ops)
+{
+        /* Should never be called by interrupts */
+        spin_lock(&ftrace_lock);
+        ops->next = ftrace_list;
+        /*
+         * Another CPU may be walking ftrace_list right now, so the
+         * ops->next pointer has to be valid before that CPU can see
+         * ops published as the new list head.
+         */
+        smp_wmb();
+        ftrace_list = ops;
+        /*
+         * A lone entry is called directly; with more than one
+         * entry the list walker is installed instead.
+         */
+        if (ftrace_enabled)
+                ftrace_trace_function = (ops->next != &ftrace_list_end) ?
+                                        ftrace_list_func : ops->func;
+        spin_unlock(&ftrace_lock);
+        return 0;
+}
+/*
+ * Unlink @ops from ftrace_list and reselect ftrace_trace_function.
+ * Returns 0 on success, or -1 when @ops is not on the list.
+ */
+static int __unregister_ftrace_function(struct ftrace_ops *ops)
+{
+        struct ftrace_ops **link;
+        int ret = 0;
+        spin_lock(&ftrace_lock);
+        /*
+         * Removing the only registered function: fall back to
+         * ftrace_stub and an empty list.
+         */
+        if (ftrace_list == ops && ops->next == &ftrace_list_end) {
+                ftrace_trace_function = ftrace_stub;
+                ftrace_list = &ftrace_list_end;
+                goto out;
+        }
+        /* find the link that points at ops, stopping at the sentinel */
+        link = &ftrace_list;
+        while (*link != &ftrace_list_end && *link != ops)
+                link = &(*link)->next;
+        if (*link != ops) {
+                ret = -1;
+                goto out;
+        }
+        *link = (*link)->next;
+        if (!ftrace_enabled)
+                goto out;
+        /* If we only have one func left, then call that directly */
+        if (ftrace_list == &ftrace_list_end ||
+            ftrace_list->next == &ftrace_list_end)
+                ftrace_trace_function = ftrace_list->func;
+ out:
+        spin_unlock(&ftrace_lock);
+        return ret;
+}
+#ifdef CONFIG_DYNAMIC_FTRACE
+static struct task_struct *ftraced_task;
+/* Command bits interpreted by __ftrace_modify_code() */
+enum {
+        FTRACE_ENABLE_CALLS             = (1 << 0),
+        FTRACE_DISABLE_CALLS            = (1 << 1),
+        FTRACE_UPDATE_TRACE_FUNC        = (1 << 2),
+        FTRACE_ENABLE_MCOUNT            = (1 << 3),
+        FTRACE_DISABLE_MCOUNT           = (1 << 4),
+};
+static int ftrace_filtered;
+/* set to 1/0 by __ftrace_modify_code() when call sites are enabled/disabled */
+static int tracing_on;
+/* number of records currently carrying FTRACE_FL_FROZEN */
+static int frozen_record_count;
+/* recorded call sites, bucketed by hash_long() of their ip */
+static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];
+/* per-cpu nesting counter used by ftrace_record_ip() to detect recursion */
+static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
+/* taken by ftrace_record_ip() around record allocation and hash insertion */
+static DEFINE_SPINLOCK(ftrace_shutdown_lock);
+/* held while ftraced_suspend and ftraced_stop are read or changed */
+static DEFINE_MUTEX(ftraced_lock);
+static DEFINE_MUTEX(ftrace_regex_lock);
+/*
+ * One page of dyn_ftrace records. Pages are chained through ->next and
+ * ->index counts the records[] slots already handed out.
+ */
+struct ftrace_page {
+        struct ftrace_page      *next;
+        unsigned long           index;
+        struct dyn_ftrace       records[];
+};
+/* how many dyn_ftrace records fit in a page after the ftrace_page header */
+#define ENTRIES_PER_PAGE \
+  ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
+/* estimate from running different kernels */
+#define NR_TO_INIT              10000
+/* first page of the record list */
+static struct ftrace_page       *ftrace_pages_start;
+/* page that ftrace_alloc_dyn_node() is currently handing records out of */
+static struct ftrace_page       *ftrace_pages;
+/* set when ftrace_record_ip() adds a record; cleared by __ftrace_update_code() */
+static int ftraced_trigger;
+/* incremented by ftrace_startup(), decremented by ftrace_shutdown() */
+static int ftraced_suspend;
+/* set by ftrace_disable_daemon(), cleared by ftrace_enable_daemon() */
+static int ftraced_stop;
+/* while non-zero, ftrace_record_ip() records nothing */
+static int ftrace_record_suspend;
+/* free list of records, linked through their ->ip field */
+static struct dyn_ftrace *ftrace_free_records;
+#ifdef CONFIG_KPROBES
+/* Mark @rec frozen and account for it, unless it is frozen already. */
+static inline void freeze_record(struct dyn_ftrace *rec)
+{
+        if (rec->flags & FTRACE_FL_FROZEN)
+                return;
+        rec->flags |= FTRACE_FL_FROZEN;
+        frozen_record_count++;
+}
+/* Clear the frozen mark on @rec and drop it from the frozen count. */
+static inline void unfreeze_record(struct dyn_ftrace *rec)
+{
+        if (!(rec->flags & FTRACE_FL_FROZEN))
+                return;
+        rec->flags &= ~FTRACE_FL_FROZEN;
+        frozen_record_count--;
+}
+/* Non-zero when @rec carries FTRACE_FL_FROZEN. */
+static inline int record_frozen(struct dyn_ftrace *rec)
+{
+        return rec->flags & FTRACE_FL_FROZEN;
+}
+#else
+/* Without kprobes nothing is ever frozen: all three evaluate to 0. */
+# define freeze_record(rec)                     ({ 0; })
+# define unfreeze_record(rec)                   ({ 0; })
+# define record_frozen(rec)                     ({ 0; })
+#endif /* CONFIG_KPROBES */
+/*
+ * skip_trace - decide whether the call site at @ip should be skipped
+ *
+ * Returns 1 when @ip has a frozen record that is failed, not yet
+ * converted, or would not be traced under the current tracing and
+ * filter state. Returns 0 otherwise, including when nothing is frozen.
+ */
+int skip_trace(unsigned long ip)
+{
+        struct hlist_head *bucket;
+        struct hlist_node *pos;
+        struct dyn_ftrace *rec;
+        unsigned long fl;
+        if (frozen_record_count == 0)
+                return 0;
+        bucket = &ftrace_hash[hash_long(ip, FTRACE_HASHBITS)];
+        hlist_for_each_entry_rcu(rec, pos, bucket, node) {
+                if (rec->ip != ip)
+                        continue;
+                /* only a frozen record can ask for the trace to be skipped */
+                if (!record_frozen(rec))
+                        break;
+                if ((rec->flags & FTRACE_FL_FAILED) ||
+                    !(rec->flags & FTRACE_FL_CONVERTED))
+                        return 1;
+                if (!tracing_on || !ftrace_enabled)
+                        return 1;
+                if (ftrace_filtered) {
+                        fl = rec->flags & (FTRACE_FL_FILTER |
+                                           FTRACE_FL_NOTRACE);
+                        if (!fl || (fl & FTRACE_FL_NOTRACE))
+                                return 1;
+                }
+                break;
+        }
+        return 0;
+}
+/* Return 1 when bucket @key of ftrace_hash holds a record for @ip, else 0. */
+static inline int
+ftrace_ip_in_hash(unsigned long ip, unsigned long key)
+{
+        struct dyn_ftrace *rec;
+        struct hlist_node *pos;
+        hlist_for_each_entry_rcu(rec, pos, &ftrace_hash[key], node) {
+                if (rec->ip == ip)
+                        return 1;
+        }
+        return 0;
+}
+/* Insert @node at the head of bucket @key of ftrace_hash (RCU list add). */
+static inline void
+ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
+{
+        hlist_add_head_rcu(&node->node, &ftrace_hash[key]);
+}
+/*
+ * Unlink @node from its hash bucket.
+ * called from kstop_machine
+ */
+static inline void ftrace_del_hash(struct dyn_ftrace *node)
+{
+        hlist_del(&node->node);
+}
+/*
+ * Put @rec on the ftrace_free_records list and mark it FTRACE_FL_FREE.
+ * The list is linked through rec->ip, so the recorded address is lost.
+ */
+static void ftrace_free_rec(struct dyn_ftrace *rec)
+{
+        /* no locking, only called from kstop_machine */
+        rec->ip = (unsigned long)ftrace_free_records;
+        ftrace_free_records = rec;
+        rec->flags |= FTRACE_FL_FREE;
+}
+/*
+ * Hand out one dyn_ftrace record: a zeroed one from the free list when
+ * available, otherwise the next unused slot of the current page.
+ * Returns NULL when no slot is left, or when the free list is found
+ * corrupted (in which case ftrace is disabled as well). @ip is unused.
+ */
+static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
+{
+        struct dyn_ftrace *rec = ftrace_free_records;
+        /* First check for freed records */
+        if (rec) {
+                if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
+                        WARN_ON_ONCE(1);
+                        ftrace_free_records = NULL;
+                        ftrace_disabled = 1;
+                        ftrace_enabled = 0;
+                        return NULL;
+                }
+                /* the free list is linked through ->ip */
+                ftrace_free_records = (void *)rec->ip;
+                memset(rec, 0, sizeof(*rec));
+                return rec;
+        }
+        /* current page exhausted: move on to the next one, if any */
+        if (ftrace_pages->index == ENTRIES_PER_PAGE) {
+                if (!ftrace_pages->next)
+                        return NULL;
+                ftrace_pages = ftrace_pages->next;
+        }
+        rec = &ftrace_pages->records[ftrace_pages->index];
+        ftrace_pages->index++;
+        return rec;
+}
+/*
+ * ftrace_record_ip - remember a call site so it can be converted later
+ * @ip: address to record
+ *
+ * Adds @ip to ftrace_hash unless it is already there, and sets
+ * ftraced_trigger so the next update pass picks it up. Does nothing
+ * when ftrace is off or disabled, when recording is suspended, or when
+ * re-entered on the same CPU.
+ */
+static void
+ftrace_record_ip(unsigned long ip)
+{
+        struct dyn_ftrace *node;
+        unsigned long flags;
+        unsigned long key;
+        int resched;
+        int cpu;
+        if (!ftrace_enabled || ftrace_disabled)
+                return;
+        resched = need_resched();
+        preempt_disable_notrace();
+        /*
+         * We simply need to protect against recursion.
+         * Use the raw version of smp_processor_id and not
+         * __get_cpu_var which can call debug hooks that can
+         * cause a recursive crash here.
+         */
+        cpu = raw_smp_processor_id();
+        per_cpu(ftrace_shutdown_disable_cpu, cpu)++;
+        if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1)
+                goto out;
+        if (unlikely(ftrace_record_suspend))
+                goto out;
+        key = hash_long(ip, FTRACE_HASHBITS);
+        WARN_ON_ONCE(key >= FTRACE_HASHSIZE);
+        /* lockless fast path: most call sites are already recorded */
+        if (ftrace_ip_in_hash(ip, key))
+                goto out;
+        spin_lock_irqsave(&ftrace_shutdown_lock, flags);
+        /* This ip may have hit the hash before the lock */
+        if (ftrace_ip_in_hash(ip, key))
+                goto out_unlock;
+        node = ftrace_alloc_dyn_node(ip);
+        if (!node)
+                goto out_unlock;
+        node->ip = ip;
+        ftrace_add_hash(node, key);
+        ftraced_trigger = 1;
+ out_unlock:
+        spin_unlock_irqrestore(&ftrace_shutdown_lock, flags);
+ out:
+        per_cpu(ftrace_shutdown_disable_cpu, cpu)--;
+        /* prevent recursion with scheduler */
+        if (resched)
+                preempt_enable_no_resched_notrace();
+        else
+                preempt_enable_notrace();
+}
+/* address of ftrace_caller, the target patched into enabled call sites */
+#define FTRACE_ADDR ((long)(ftrace_caller))
+/*
+ * __ftrace_replace_code - switch one call site between NOP and call
+ * @rec:    record describing the call site
+ * @old:    code expected at the site; replaced here when the site is disabled
+ * @new:    code to write; replaced here when the site is enabled
+ * @enable: non-zero to enable tracing of the site, zero to disable it
+ *
+ * Decides from @rec->flags (and from ftrace_filtered when enabling)
+ * whether the site has to change, updates FTRACE_FL_ENABLED to match
+ * and patches the code. Returns 0 when nothing had to be done,
+ * otherwise the return value of ftrace_modify_code().
+ */
+static int
+__ftrace_replace_code(struct dyn_ftrace *rec,
+                      unsigned char *old, unsigned char *new, int enable)
+{
+        unsigned long ip, fl;
+        ip = rec->ip;
+        if (ftrace_filtered && enable) {
+                /*
+                 * If filtering is on:
+                 *
+                 * If this record is set to be filtered and
+                 * is enabled then do nothing.
+                 *
+                 * If this record is set to be filtered and
+                 * it is not enabled, enable it.
+                 *
+                 * If this record is not set to be filtered
+                 * and it is not enabled do nothing.
+                 *
+                 * If this record is set not to trace then
+                 * do nothing.
+                 *
+                 * If this record is set not to trace and
+                 * it is enabled then disable it.
+                 *
+                 * If this record is not set to be filtered and
+                 * it is enabled, disable it.
+                 */
+                fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE |
+                                   FTRACE_FL_ENABLED);
+                if ((fl ==  (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) ||
+                    (fl ==  (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE)) ||
+                    !fl || (fl == FTRACE_FL_NOTRACE))
+                        return 0;
+                /*
+                 * If it is enabled disable it,
+                 * otherwise enable it!
+                 */
+                if (fl & FTRACE_FL_ENABLED) {
+                        /* swap new and old */
+                        new = old;
+                        old = ftrace_call_replace(ip, FTRACE_ADDR);
+                        rec->flags &= ~FTRACE_FL_ENABLED;
+                } else {
+                        new = ftrace_call_replace(ip, FTRACE_ADDR);
+                        rec->flags |= FTRACE_FL_ENABLED;
+                }
+        } else {
+                if (enable) {
+                        /*
+                         * If this record is set not to trace and is
+                         * not enabled, do nothing.
+                         */
+                        fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED);
+                        if (fl == FTRACE_FL_NOTRACE)
+                                return 0;
+                        new = ftrace_call_replace(ip, FTRACE_ADDR);
+                } else
+                        old = ftrace_call_replace(ip, FTRACE_ADDR);
+                /* nothing to patch when the site is already in the wanted state */
+                if (enable) {
+                        if (rec->flags & FTRACE_FL_ENABLED)
+                                return 0;
+                        rec->flags |= FTRACE_FL_ENABLED;
+                } else {
+                        if (!(rec->flags & FTRACE_FL_ENABLED))
+                                return 0;
+                        rec->flags &= ~FTRACE_FL_ENABLED;
+                }
+        }
+        return ftrace_modify_code(ip, old, new);
+}
+/*
+ * Enable (@enable != 0) or disable (@enable == 0) every recorded call
+ * site. Failed records are left alone, and sites with a kprobe on them
+ * are frozen instead of patched. A converted record whose patch fails
+ * is marked FTRACE_FL_FAILED, and is freed when still booting or when
+ * the address is outside core kernel text.
+ */
+static void ftrace_replace_code(int enable)
+{
+        unsigned char *nop = ftrace_nop_replace();
+        unsigned char *old = enable ? nop : NULL;
+        unsigned char *new = enable ? NULL : nop;
+        struct ftrace_page *pg;
+        struct dyn_ftrace *rec;
+        int idx;
+        for (pg = ftrace_pages_start; pg; pg = pg->next) {
+                for (idx = 0; idx < pg->index; idx++) {
+                        rec = &pg->records[idx];
+                        /* don't modify code that has already faulted */
+                        if (rec->flags & FTRACE_FL_FAILED)
+                                continue;
+                        /* ignore updates to this record's mcount site */
+                        if (get_kprobe((void *)rec->ip)) {
+                                freeze_record(rec);
+                                continue;
+                        }
+                        unfreeze_record(rec);
+                        if (!__ftrace_replace_code(rec, old, new, enable))
+                                continue;
+                        /* a failure only counts for converted records */
+                        if (!(rec->flags & FTRACE_FL_CONVERTED))
+                                continue;
+                        rec->flags |= FTRACE_FL_FAILED;
+                        if ((system_state == SYSTEM_BOOTING) ||
+                            !core_kernel_text(rec->ip)) {
+                                ftrace_del_hash(rec);
+                                ftrace_free_rec(rec);
+                        }
+                }
+        }
+}
+/*
+ * Make sure a spare page is chained after the current record page.
+ * An allocation failure simply leaves ->next NULL.
+ */
+static void ftrace_shutdown_replenish(void)
+{
+        /* allocate another page, unless one is already waiting */
+        if (!ftrace_pages->next)
+                ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
+}
+/*
+ * Replace the mcount call at @rec->ip with a NOP.
+ * Returns 1 on success; on failure marks @rec FTRACE_FL_FAILED and
+ * returns 0.
+ */
+static int
+ftrace_code_disable(struct dyn_ftrace *rec)
+{
+        unsigned char *nop_insn, *call_insn;
+        nop_insn = ftrace_nop_replace();
+        call_insn = ftrace_call_replace(rec->ip, MCOUNT_ADDR);
+        if (ftrace_modify_code(rec->ip, call_insn, nop_insn)) {
+                rec->flags |= FTRACE_FL_FAILED;
+                return 0;
+        }
+        return 1;
+}
+static int __ftrace_update_code(void *ignore);
+/*
+ * Carry out the FTRACE_* command bits that @data (an int *) points at.
+ * Run through stop_machine_run() by ftrace_run_update_code().
+ * Always returns 0.
+ */
+static int __ftrace_modify_code(void *data)
+{
+        const int cmd = *(int *)data;
+        unsigned long target;
+        if (cmd & FTRACE_ENABLE_CALLS) {
+                /*
+                 * Update any recorded ips now that we have the
+                 * machine stopped
+                 */
+                __ftrace_update_code(NULL);
+                ftrace_replace_code(1);
+                tracing_on = 1;
+        } else if (cmd & FTRACE_DISABLE_CALLS) {
+                ftrace_replace_code(0);
+                tracing_on = 0;
+        }
+        if (cmd & FTRACE_UPDATE_TRACE_FUNC)
+                ftrace_update_ftrace_func(ftrace_trace_function);
+        /* mcount goes either to the recorder or back to the stub */
+        if (cmd & FTRACE_ENABLE_MCOUNT) {
+                target = (unsigned long)ftrace_record_ip;
+                ftrace_mcount_set(&target);
+        } else if (cmd & FTRACE_DISABLE_MCOUNT) {
+                target = (unsigned long)ftrace_stub;
+                ftrace_mcount_set(&target);
+        }
+        return 0;
+}
+/*
+ * Run __ftrace_modify_code() through stop_machine_run(), handing it a
+ * pointer to @command (a mask of FTRACE_* command bits).
+ */
+static void ftrace_run_update_code(int command)
+{
+        stop_machine_run(__ftrace_modify_code, &command, NR_CPUS);
+}
+/*
+ * Set ftraced_stop under ftraced_lock, which makes ftraced() skip its
+ * periodic update, then call ftrace_force_update().
+ */
+void ftrace_disable_daemon(void)
+{
+        /* Stop the daemon from calling kstop_machine */
+        mutex_lock(&ftraced_lock);
+        ftraced_stop = 1;
+        mutex_unlock(&ftraced_lock);
+        ftrace_force_update();
+}
+/*
+ * Clear ftraced_stop under ftraced_lock so ftraced() may run its
+ * periodic update again, then call ftrace_force_update().
+ */
+void ftrace_enable_daemon(void)
+{
+        mutex_lock(&ftraced_lock);
+        ftraced_stop = 0;
+        mutex_unlock(&ftraced_lock);
+        ftrace_force_update();
+}
+/* trace function most recently handed to the code updater */
+static ftrace_func_t saved_ftrace_func;
+/*
+ * Take one reference on tracing. The first reference asks for the call
+ * sites to be enabled; a changed ftrace_trace_function asks for the
+ * trace function to be updated. Nothing is patched while ftrace_enabled
+ * is clear.
+ */
+static void ftrace_startup(void)
+{
+        int command = 0;
+        if (unlikely(ftrace_disabled))
+                return;
+        mutex_lock(&ftraced_lock);
+        if (++ftraced_suspend == 1)
+                command |= FTRACE_ENABLE_CALLS;
+        if (saved_ftrace_func != ftrace_trace_function) {
+                saved_ftrace_func = ftrace_trace_function;
+                command |= FTRACE_UPDATE_TRACE_FUNC;
+        }
+        if (command && ftrace_enabled)
+                ftrace_run_update_code(command);
+        mutex_unlock(&ftraced_lock);
+}
+/*
+ * Drop one reference on tracing. Dropping the last reference asks for
+ * the call sites to be disabled; a changed ftrace_trace_function asks
+ * for the trace function to be updated. Nothing is patched while
+ * ftrace_enabled is clear.
+ */
+static void ftrace_shutdown(void)
+{
+        int command = 0;
+        if (unlikely(ftrace_disabled))
+                return;
+        mutex_lock(&ftraced_lock);
+        if (--ftraced_suspend == 0)
+                command |= FTRACE_DISABLE_CALLS;
+        if (saved_ftrace_func != ftrace_trace_function) {
+                saved_ftrace_func = ftrace_trace_function;
+                command |= FTRACE_UPDATE_TRACE_FUNC;
+        }
+        if (command && ftrace_enabled)
+                ftrace_run_update_code(command);
+        mutex_unlock(&ftraced_lock);
+}
+/*
+ * Point mcount at the ip recorder and, when tracing is currently
+ * wanted (ftraced_suspend != 0), enable the call sites as well.
+ * Resets saved_ftrace_func so that the next ftrace_startup() or
+ * ftrace_shutdown() requests a trace function update.
+ */
+static void ftrace_startup_sysctl(void)
+{
+        int command = FTRACE_ENABLE_MCOUNT;
+        if (unlikely(ftrace_disabled))
+                return;
+        mutex_lock(&ftraced_lock);
+        /* Force update next time */
+        saved_ftrace_func = NULL;
+        /* ftraced_suspend is true if we want ftrace running */
+        if (ftraced_suspend)
+                command |= FTRACE_ENABLE_CALLS;
+        ftrace_run_update_code(command);
+        mutex_unlock(&ftraced_lock);
+}
+/*
+ * Point mcount back at ftrace_stub and, when tracing is currently
+ * running (ftraced_suspend != 0), disable the call sites as well.
+ */
+static void ftrace_shutdown_sysctl(void)
+{
+        int command = FTRACE_DISABLE_MCOUNT;
+        if (unlikely(ftrace_disabled))
+                return;
+        mutex_lock(&ftraced_lock);
+        /* ftraced_suspend is true if ftrace is running */
+        if (ftraced_suspend)
+                command |= FTRACE_DISABLE_CALLS;
+        ftrace_run_update_code(command);
+        mutex_unlock(&ftraced_lock);
+}
+/* duration of the last __ftrace_update_code() pass, from ftrace_now() */
+static cycle_t          ftrace_update_time;
+/* records converted by the last pass */
+static unsigned long    ftrace_update_cnt;
+/* running total of converted records across passes */
+unsigned long           ftrace_update_tot_cnt;
+/*
+ * __ftrace_update_code - convert newly recorded call sites to NOPs
+ * @ignore: unused
+ *
+ * Walks every bucket of ftrace_hash and patches the mcount call of each
+ * record that is neither failed nor already converted. Records whose
+ * address has a kprobe on it are frozen and put back at the head of
+ * their bucket for a later attempt. Recording is suspended and
+ * ftrace_enabled is forced to 0 for the duration of the pass, then
+ * restored. Updates the three counters above, clears ftraced_trigger
+ * and always returns 0.
+ *
+ * The callers in this file run it with the machine stopped.
+ */
+static int __ftrace_update_code(void *ignore)
+{
+        int i, save_ftrace_enabled;
+        cycle_t start, stop;
+        struct dyn_ftrace *p;
+        struct hlist_node *t, *n;
+        struct hlist_head *head, temp_list;
+        /* Don't be recording funcs now */
+        ftrace_record_suspend++;
+        save_ftrace_enabled = ftrace_enabled;
+        ftrace_enabled = 0;
+        start = ftrace_now(raw_smp_processor_id());
+        ftrace_update_cnt = 0;
+        /* No locks needed, the machine is stopped! */
+        for (i = 0; i < FTRACE_HASHSIZE; i++) {
+                /* temp_list collects this bucket's kprobe'd records */
+                INIT_HLIST_HEAD(&temp_list);
+                head = &ftrace_hash[i];
+                /* all CPUS are stopped, we are safe to modify code */
+                hlist_for_each_entry_safe(p, t, n, head, node) {
+                        /* Skip over failed records which have not been
+                         * freed. */
+                        if (p->flags & FTRACE_FL_FAILED)
+                                continue;
+                        /* Unconverted records are always at the head of the
+                         * hash bucket. Once we encounter a converted record,
+                         * simply skip over to the next bucket. Saves ftraced
+                         * some processor cycles (ftrace does its bid for
+                         * global warming :-p ). */
+                        if (p->flags & (FTRACE_FL_CONVERTED))
+                                break;
+                        /* Ignore updates to this record's mcount site.
+                         * Reintroduce this record at the head of this
+                         * bucket to attempt to "convert" it again if
+                         * the kprobe on it is unregistered before the
+                         * next run. */
+                        if (get_kprobe((void *)p->ip)) {
+                                ftrace_del_hash(p);
+                                INIT_HLIST_NODE(&p->node);
+                                hlist_add_head(&p->node, &temp_list);
+                                freeze_record(p);
+                                continue;
+                        } else {
+                                unfreeze_record(p);
+                        }
+                        /* convert record (i.e, patch mcount-call with NOP) */
+                        if (ftrace_code_disable(p)) {
+                                p->flags |= FTRACE_FL_CONVERTED;
+                                ftrace_update_cnt++;
+                        } else {
+                                /* patch failed: free the record when still
+                                 * booting or outside core kernel text */
+                                if ((system_state == SYSTEM_BOOTING) ||
+                                    !core_kernel_text(p->ip)) {
+                                        ftrace_del_hash(p);
+                                        ftrace_free_rec(p);
+                                }
+                        }
+                }
+                /* move the kprobe'd records back to the head of the bucket */
+                hlist_for_each_entry_safe(p, t, n, &temp_list, node) {
+                        hlist_del(&p->node);
+                        INIT_HLIST_NODE(&p->node);
+                        hlist_add_head(&p->node, head);
+                }
+        }
+        stop = ftrace_now(raw_smp_processor_id());
+        ftrace_update_time = stop - start;
+        ftrace_update_tot_cnt += ftrace_update_cnt;
+        ftraced_trigger = 0;
+        ftrace_enabled = save_ftrace_enabled;
+        ftrace_record_suspend--;
+        return 0;
+}
+/*
+ * Run one conversion pass under stop_machine_run() if there is anything
+ * to do. Returns 1 when a pass was run, 0 when ftrace is disabled or
+ * off, or when no new call site has been recorded.
+ */
+static int ftrace_update_code(void)
+{
+        if (unlikely(ftrace_disabled))
+                return 0;
+        if (!ftrace_enabled || !ftraced_trigger)
+                return 0;
+        stop_machine_run(__ftrace_update_code, NULL, NR_CPUS);
+        return 1;
+}
+/*
+ * ftraced - kernel thread that converts recorded call sites
+ * @ignore: unused
+ *
+ * Wakes up about once a second and, unless tracing is active
+ * (ftraced_suspend) or the daemon has been stopped (ftraced_stop),
+ * runs ftrace_update_code(). If more than 100000 sites have been
+ * converted in total, it logs the overflow and disables ftrace.
+ * After every pass it tops up the spare record page.
+ * Returns 0 once kthread_should_stop() is true.
+ */
+static int ftraced(void *ignore)
+{
+        unsigned long usecs;
+        while (!kthread_should_stop()) {
+                set_current_state(TASK_INTERRUPTIBLE);
+                /* check once a second */
+                schedule_timeout(HZ);
+                if (unlikely(ftrace_disabled))
+                        continue;
+                mutex_lock(&ftrace_sysctl_lock);
+                mutex_lock(&ftraced_lock);
+                if (!ftraced_suspend && !ftraced_stop &&
+                    ftrace_update_code()) {
+                        usecs = nsecs_to_usecs(ftrace_update_time);
+                        if (ftrace_update_tot_cnt > 100000) {
+                                /*
+                                 * Log the accumulated total before
+                                 * resetting it, otherwise the message
+                                 * always reports "0 total".
+                                 */
+                                pr_info("hm, dftrace overflow: %lu change%s"
+                                        " (%lu total) in %lu usec%s\n",
+                                        ftrace_update_cnt,
+                                        ftrace_update_cnt != 1 ? "s" : "",
+                                        ftrace_update_tot_cnt,
+                                        usecs, usecs != 1 ? "s" : "");
+                                ftrace_update_tot_cnt = 0;
+                                ftrace_disabled = 1;
+                                WARN_ON_ONCE(1);
+                        }
+                }
+                mutex_unlock(&ftraced_lock);
+                mutex_unlock(&ftrace_sysctl_lock);
+                ftrace_shutdown_replenish();
+        }
+        __set_current_state(TASK_RUNNING);
+        return 0;
+}
+/*
+ * Allocate the initial chain of record pages: one mandatory page plus
+ * up to NR_TO_INIT / ENTRIES_PER_PAGE further ones.
+ * Returns -1 when the first page cannot be allocated, 0 otherwise.
+ */
+static int __init ftrace_dyn_table_alloc(void)
+{
+        struct ftrace_page *tail;
+        int extra;
+        /* the first page is mandatory */
+        ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
+        if (!ftrace_pages_start)
+                return -1;
+        /*
+         * Allocate a few more pages.
+         *
+         * TODO: have some parser search vmlinux before
+         *   final linking to find all calls to ftrace.
+         *   Then we can:
+         *    a) know how many pages to allocate.
+         *     and/or
+         *    b) set up the table then.
+         *
+         *  The dynamic code is still necessary for
+         *  modules.
+         */
+        ftrace_pages = ftrace_pages_start;
+        tail = ftrace_pages_start;
+        extra = NR_TO_INIT / ENTRIES_PER_PAGE;
+        while (extra-- > 0) {
+                tail->next = (void *)get_zeroed_page(GFP_KERNEL);
+                /* If we fail, we'll try later anyway */
+                if (!tail->next)
+                        break;
+                tail = tail->next;
+        }
+        return 0;
+}
+/* Bits kept in ftrace_iterator.flags. */
+enum {
+        /* listing is restricted to records carrying FTRACE_FL_FILTER */
+        FTRACE_ITER_FILTER      = (1 << 0),
+        /* a pattern is still being assembled across write() calls */
+        FTRACE_ITER_CONT        = (1 << 1),
+        /* listing is restricted to records carrying FTRACE_FL_NOTRACE */
+        FTRACE_ITER_NOTRACE     = (1 << 2),
+        /* listing shows the FTRACE_FL_FAILED records instead of hiding them */
+        FTRACE_ITER_FAILURES    = (1 << 3),
+};
+#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
+/*
+ * Per-open state shared by the seq_file readers and the pattern writers
+ * of the ftrace debugfs files.
+ */
+struct ftrace_iterator {
+        /* value of *pos after the most recent t_next() call */
+        loff_t                  pos;
+        /* page currently being walked */
+        struct ftrace_page      *pg;
+        /* index of the next record to look at inside pg */
+        unsigned                idx;
+        /* FTRACE_ITER_* bits */
+        unsigned                flags;
+        /* pattern being assembled by write(); one extra byte for the NUL */
+        unsigned char           buffer[FTRACE_BUFF_MAX+1];
+        /* number of pattern bytes currently held in buffer */
+        unsigned                buffer_idx;
+        /* number of patterns handed to ftrace_match() through this open */
+        unsigned                filtered;
+};
+/*
+ * seq_file ->next: advance the iterator to the next record that passes the
+ * flag checks below and return it, or NULL when the current page is used
+ * up and has no successor.  *pos is incremented once per call and the new
+ * value is cached in iter->pos.
+ */
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+        struct ftrace_iterator *iter = m->private;
+        struct dyn_ftrace *rec = NULL;
+        (*pos)++;
+ retry:
+        if (iter->idx >= iter->pg->index) {
+                /* current page exhausted: move on to the next one, if any */
+                if (iter->pg->next) {
+                        iter->pg = iter->pg->next;
+                        iter->idx = 0;
+                        goto retry;
+                }
+        } else {
+                rec = &iter->pg->records[iter->idx++];
+                /*
+                 * Skip the record when
+                 *  - a non-failures listing meets a FAILED record,
+                 *  - the failures listing meets a record that is not
+                 *    FAILED, or that is FREE,
+                 *  - the filter listing meets a record without FL_FILTER,
+                 *  - the notrace listing meets a record without FL_NOTRACE.
+                 */
+                if ((!(iter->flags & FTRACE_ITER_FAILURES) &&
+                     (rec->flags & FTRACE_FL_FAILED)) ||
+                    ((iter->flags & FTRACE_ITER_FAILURES) &&
+                     (!(rec->flags & FTRACE_FL_FAILED) ||
+                      (rec->flags & FTRACE_FL_FREE))) ||
+                    ((iter->flags & FTRACE_ITER_FILTER) &&
+                     !(rec->flags & FTRACE_FL_FILTER)) ||
+                    ((iter->flags & FTRACE_ITER_NOTRACE) &&
+                     !(rec->flags & FTRACE_FL_NOTRACE))) {
+                        rec = NULL;
+                        goto retry;
+                }
+        }
+        iter->pos = *pos;
+        return rec;
+}
+/*
+ * seq_file ->start: return the record to show for position *pos.
+ *
+ * When *pos differs from the cached iter->pos, t_next() is called with a
+ * private counter starting at -1 until that counter reaches *pos.
+ * Otherwise a single t_next() step is taken from the cached position.
+ *
+ * NOTE(review): the resync walk counts from -1 but does not rewind
+ * iter->pg / iter->idx to ftrace_pages_start, so it continues from wherever
+ * the iterator currently stands - confirm this is intended.
+ */
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+        struct ftrace_iterator *iter = m->private;
+        void *p = NULL;
+        loff_t l = -1;
+        if (*pos != iter->pos) {
+                for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l))
+                        ;
+        } else {
+                l = *pos;
+                p = t_next(m, p, &l);
+        }
+        return p;
+}
+/* seq_file ->stop: intentionally empty, t_start() acquires nothing to undo. */
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+/*
+ * seq_file ->show: print the symbol name for the record's ip on a line of
+ * its own.  A NULL record prints nothing.  Always returns 0.
+ */
+static int t_show(struct seq_file *m, void *v)
+{
+        char sym[KSYM_SYMBOL_LEN];
+        struct dyn_ftrace *rec = v;
+        if (rec) {
+                kallsyms_lookup(rec->ip, NULL, NULL, NULL, sym);
+                seq_printf(m, "%s\n", sym);
+        }
+        return 0;
+}
+/* seq_file callbacks shared by every ftrace listing file opened below. */
+static struct seq_operations show_ftrace_seq_ops = {
+        .start = t_start,
+        .next = t_next,
+        .stop = t_stop,
+        .show = t_show,
+};
+/*
+ * Open handler for the plain record listing: allocate an iterator that
+ * starts at the first record page and attach it to a new seq_file.
+ *
+ * Returns 0 on success, -ENODEV when ftrace is disabled, -ENOMEM when the
+ * iterator cannot be allocated, or the seq_open() error.
+ */
+static int
+ftrace_avail_open(struct inode *inode, struct file *file)
+{
+        struct ftrace_iterator *iter;
+        struct seq_file *m;
+        int ret;
+        if (unlikely(ftrace_disabled))
+                return -ENODEV;
+        iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+        if (!iter)
+                return -ENOMEM;
+        iter->pos = -1;
+        iter->pg = ftrace_pages_start;
+        ret = seq_open(file, &show_ftrace_seq_ops);
+        if (ret) {
+                /* no seq_file to own the iterator, so drop it here */
+                kfree(iter);
+                return ret;
+        }
+        m = file->private_data;
+        m->private = iter;
+        return ret;
+}
+/*
+ * Release handler matching ftrace_avail_open(): tear down the seq_file and
+ * free the iterator that was stored in its private pointer.
+ */
+int ftrace_avail_release(struct inode *inode, struct file *file)
+{
+        struct seq_file *seq = file->private_data;
+        /* fetch the iterator first, seq_release() is called before the free */
+        void *iter = seq->private;
+        seq_release(inode, file);
+        kfree(iter);
+        return 0;
+}
+/*
+ * Open handler for the "failures" file: same as ftrace_avail_open(), then
+ * switch the iterator to list only the failed records.
+ */
+static int
+ftrace_failures_open(struct inode *inode, struct file *file)
+{
+        struct ftrace_iterator *iter;
+        struct seq_file *m;
+        int ret = ftrace_avail_open(inode, file);
+        if (ret)
+                return ret;
+        m = (struct seq_file *)file->private_data;
+        iter = (struct ftrace_iterator *)m->private;
+        iter->flags = FTRACE_ITER_FAILURES;
+        return ret;
+}
+/*
+ * Clear one selection flag on every record that has not failed.
+ * @enable: non-zero clears FTRACE_FL_FILTER (and ftrace_filtered),
+ *          zero clears FTRACE_FL_NOTRACE.
+ */
+static void ftrace_filter_reset(int enable)
+{
+        struct ftrace_page *pg;
+        struct dyn_ftrace *rec;
+        unsigned long mask;
+        unsigned i;
+        if (enable)
+                mask = FTRACE_FL_FILTER;
+        else
+                mask = FTRACE_FL_NOTRACE;
+        /* keep kstop machine from running */
+        preempt_disable();
+        if (enable)
+                ftrace_filtered = 0;
+        for (pg = ftrace_pages_start; pg; pg = pg->next) {
+                for (i = 0; i < pg->index; i++) {
+                        rec = &pg->records[i];
+                        if (!(rec->flags & FTRACE_FL_FAILED))
+                                rec->flags &= ~mask;
+                }
+        }
+        preempt_enable();
+}
+/*
+ * Common open path for the filter (enable != 0) and notrace (enable == 0)
+ * files.
+ *
+ * A writer that did not pass O_APPEND resets the current selection.  A
+ * reader gets a seq_file whose private pointer is the iterator; a
+ * write-only opener gets the iterator directly in file->private_data.
+ *
+ * Returns 0 on success, -ENODEV when ftrace is disabled, -ENOMEM when the
+ * iterator cannot be allocated, or the seq_open() error.
+ */
+static int
+ftrace_regex_open(struct inode *inode, struct file *file, int enable)
+{
+        struct ftrace_iterator *iter;
+        struct seq_file *m;
+        int ret = 0;
+        if (unlikely(ftrace_disabled))
+                return -ENODEV;
+        iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+        if (!iter)
+                return -ENOMEM;
+        mutex_lock(&ftrace_regex_lock);
+        if ((file->f_mode & FMODE_WRITE) && !(file->f_flags & O_APPEND))
+                ftrace_filter_reset(enable);
+        if (!(file->f_mode & FMODE_READ)) {
+                /* write-only: no seq_file, the file owns the iterator */
+                file->private_data = iter;
+                goto unlock;
+        }
+        iter->pg = ftrace_pages_start;
+        iter->pos = -1;
+        if (enable)
+                iter->flags = FTRACE_ITER_FILTER;
+        else
+                iter->flags = FTRACE_ITER_NOTRACE;
+        ret = seq_open(file, &show_ftrace_seq_ops);
+        if (ret) {
+                kfree(iter);
+                goto unlock;
+        }
+        m = file->private_data;
+        m->private = iter;
+ unlock:
+        mutex_unlock(&ftrace_regex_lock);
+        return ret;
+}
+/* Open handler for set_ftrace_filter: ftrace_regex_open() with enable = 1. */
+static int
+ftrace_filter_open(struct inode *inode, struct file *file)
+{
+        return ftrace_regex_open(inode, file, 1);
+}
+/* Open handler for set_ftrace_notrace: ftrace_regex_open() with enable = 0. */
+static int
+ftrace_notrace_open(struct inode *inode, struct file *file)
+{
+        return ftrace_regex_open(inode, file, 0);
+}
+/*
+ * Read handler for the filter/notrace files.  Only files opened for
+ * reading have a seq_file behind them; anything else gets -EPERM.
+ */
+static ssize_t
+ftrace_regex_read(struct file *file, char __user *ubuf,
+                       size_t cnt, loff_t *ppos)
+{
+        if (!(file->f_mode & FMODE_READ))
+                return -EPERM;
+        return seq_read(file, ubuf, cnt, ppos);
+}
+/*
+ * Seek handler for the filter/notrace files.  Readers are passed on to
+ * seq_lseek(); for everyone else the position is pinned to 1 and 1 is
+ * returned.
+ */
+static loff_t
+ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
+{
+        if (file->f_mode & FMODE_READ)
+                return seq_lseek(file, offset, origin);
+        file->f_pos = 1;
+        return 1;
+}
+/* Pattern shapes recognised by ftrace_match(). */
+enum {
+        MATCH_FULL,             /* "name":   compared with strcmp() */
+        MATCH_FRONT_ONLY,       /* "name*":  leading part must match */
+        MATCH_MIDDLE_ONLY,      /* "*name*": located with strstr() */
+        MATCH_END_ONLY,         /* "*name":  trailing part must match */
+};
+/*
+ * ftrace_match - flag every record whose symbol name matches a pattern
+ * @buff: NUL-terminated pattern; modified in place (the '*' that ends a
+ *        prefix or substring pattern is overwritten with NUL)
+ * @len: number of pattern bytes in @buff, terminator not counted
+ * @enable: non-zero sets FTRACE_FL_FILTER (and ftrace_filtered),
+ *          zero sets FTRACE_FL_NOTRACE
+ *
+ * Accepted shapes are "name", "name*", "*name" and "*name*".  Records
+ * marked FTRACE_FL_FAILED are never touched.  A lone "*" has an empty
+ * suffix and therefore matches every symbol.
+ */
+static void
+ftrace_match(unsigned char *buff, int len, int enable)
+{
+        char str[KSYM_SYMBOL_LEN];
+        char *pattern = (char *)buff;
+        char *search = NULL;
+        struct ftrace_page *pg;
+        struct dyn_ftrace *rec;
+        int type = MATCH_FULL;
+        unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
+        unsigned i, match = 0, search_len = 0;
+        int n;
+        /* signed index: a non-positive len simply skips the scan */
+        for (n = 0; n < len; n++) {
+                if (pattern[n] != '*')
+                        continue;
+                if (!n) {
+                        search = pattern + 1;
+                        type = MATCH_END_ONLY;
+                        search_len = len - 1;
+                        continue;
+                }
+                if (type == MATCH_END_ONLY) {
+                        type = MATCH_MIDDLE_ONLY;
+                } else {
+                        match = n;
+                        type = MATCH_FRONT_ONLY;
+                }
+                /* cut the pattern at the star; the rest is ignored */
+                pattern[n] = 0;
+                break;
+        }
+        /* keep kstop machine from running */
+        preempt_disable();
+        if (enable)
+                ftrace_filtered = 1;
+        pg = ftrace_pages_start;
+        while (pg) {
+                for (i = 0; i < pg->index; i++) {
+                        int matched = 0;
+                        size_t slen;
+                        rec = &pg->records[i];
+                        if (rec->flags & FTRACE_FL_FAILED)
+                                continue;
+                        kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+                        switch (type) {
+                        case MATCH_FULL:
+                                if (strcmp(str, pattern) == 0)
+                                        matched = 1;
+                                break;
+                        case MATCH_FRONT_ONLY:
+                                /*
+                                 * strncmp() stops at the symbol's NUL;
+                                 * memcmp() could read past it when the
+                                 * prefix is longer than the symbol.
+                                 */
+                                if (strncmp(str, pattern, match) == 0)
+                                        matched = 1;
+                                break;
+                        case MATCH_MIDDLE_ONLY:
+                                if (strstr(str, search))
+                                        matched = 1;
+                                break;
+                        case MATCH_END_ONLY:
+                                /*
+                                 * Compare the tail directly.  Looking
+                                 * only at the first strstr() hit misses
+                                 * names such as "foo_foo" for "*foo".
+                                 */
+                                slen = strlen(str);
+                                if (slen >= search_len &&
+                                    memcmp(str + slen - search_len,
+                                           search, search_len) == 0)
+                                        matched = 1;
+                                break;
+                        }
+                        if (matched)
+                                rec->flags |= flag;
+                }
+                pg = pg->next;
+        }
+        preempt_enable();
+}
+/*
+ * ftrace_regex_write - collect one whitespace-delimited pattern from user
+ * space and hand it to ftrace_match()
+ * @file: filter or notrace file being written
+ * @ubuf: user buffer holding the text
+ * @cnt: number of bytes available in @ubuf
+ * @ppos: write position; advanced by the number of bytes consumed
+ * @enable: passed through to ftrace_match()
+ *
+ * At most one pattern is consumed per call.  The return value is the
+ * number of bytes consumed, which can be smaller than @cnt.  A pattern
+ * that is not yet terminated by whitespace stays buffered in the iterator
+ * (FTRACE_ITER_CONT) and is continued by the next write, or flushed by
+ * ftrace_regex_release().
+ *
+ * Returns the byte count consumed, 0 for an empty write, -EINVAL when a
+ * pattern exceeds FTRACE_BUFF_MAX, or the get_user() error.
+ */
+static ssize_t
+ftrace_regex_write(struct file *file, const char __user *ubuf,
+                   size_t cnt, loff_t *ppos, int enable)
+{
+        struct ftrace_iterator *iter;
+        char ch;
+        size_t read = 0;
+        ssize_t ret;
+        /* cnt is unsigned: zero is the only "nothing to do" value */
+        if (!cnt)
+                return 0;
+        mutex_lock(&ftrace_regex_lock);
+        if (file->f_mode & FMODE_READ) {
+                struct seq_file *m = file->private_data;
+                iter = m->private;
+        } else
+                iter = file->private_data;
+        if (!*ppos) {
+                iter->flags &= ~FTRACE_ITER_CONT;
+                iter->buffer_idx = 0;
+        }
+        ret = get_user(ch, ubuf++);
+        if (ret)
+                goto out;
+        read++;
+        cnt--;
+        /* only a fresh pattern skips leading blanks and restarts the buffer */
+        if (!(iter->flags & FTRACE_ITER_CONT)) {
+                /* skip white space */
+                while (cnt && isspace(ch)) {
+                        ret = get_user(ch, ubuf++);
+                        if (ret)
+                                goto out;
+                        read++;
+                        cnt--;
+                }
+                if (isspace(ch)) {
+                        /* nothing but white space in this write */
+                        *ppos += read;
+                        ret = read;
+                        goto out;
+                }
+                iter->buffer_idx = 0;
+        }
+        /*
+         * ch holds a character that was fetched but not stored yet, so
+         * store first and fetch afterwards; the final character of the
+         * write is then kept even without trailing white space.
+         */
+        while (!isspace(ch)) {
+                if (iter->buffer_idx >= FTRACE_BUFF_MAX) {
+                        ret = -EINVAL;
+                        goto out;
+                }
+                iter->buffer[iter->buffer_idx++] = ch;
+                if (!cnt)
+                        break;
+                ret = get_user(ch, ubuf++);
+                if (ret)
+                        goto out;
+                read++;
+                cnt--;
+        }
+        if (isspace(ch)) {
+                /* pattern complete: apply it and start over */
+                iter->filtered++;
+                iter->buffer[iter->buffer_idx] = 0;
+                ftrace_match(iter->buffer, iter->buffer_idx, enable);
+                iter->buffer_idx = 0;
+                iter->flags &= ~FTRACE_ITER_CONT;
+        } else
+                iter->flags |= FTRACE_ITER_CONT;
+        /* report progress through ppos, the position handed in by the caller */
+        *ppos += read;
+        ret = read;
+ out:
+        mutex_unlock(&ftrace_regex_lock);
+        return ret;
+}
+/* Write handler for set_ftrace_filter: ftrace_regex_write() with enable = 1. */
+static ssize_t
+ftrace_filter_write(struct file *file, const char __user *ubuf,
+                    size_t cnt, loff_t *ppos)
+{
+        return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
+}
+/* Write handler for set_ftrace_notrace: ftrace_regex_write() with enable = 0. */
+static ssize_t
+ftrace_notrace_write(struct file *file, const char __user *ubuf,
+                     size_t cnt, loff_t *ppos)
+{
+        return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
+}
+/*
+ * In-kernel counterpart of writing to the filter/notrace files: optionally
+ * reset the selection, then apply @buf as a pattern when it is not NULL.
+ * Does nothing once ftrace has been disabled.
+ */
+static void
+ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
+{
+        if (!unlikely(ftrace_disabled)) {
+                mutex_lock(&ftrace_regex_lock);
+                if (reset)
+                        ftrace_filter_reset(enable);
+                if (buf)
+                        ftrace_match(buf, len, enable);
+                mutex_unlock(&ftrace_regex_lock);
+        }
+}
+/**
+ * ftrace_set_filter - set a function to filter on in ftrace
+ * @buf: the string that holds the function filter text.
+ * @len: the length of the string.
+ * @reset: non-zero to reset all filters before applying this filter.
+ *
+ * Filters denote which functions should be enabled when tracing is enabled.
+ * If @buf is NULL and @reset is set, all functions will be enabled for
+ * tracing.
+ */
+void ftrace_set_filter(unsigned char *buf, int len, int reset)
+{
+        ftrace_set_regex(buf, len, reset, 1);
+}
+/**
+ * ftrace_set_notrace - set a function to not trace in ftrace
+ * @buf: the string that holds the function notrace text.
+ * @len: the length of the string.
+ * @reset: non-zero to reset all filters before applying this filter.
+ *
+ * Notrace filters denote which functions should not be enabled when tracing
+ * is enabled. If @buf is NULL and @reset is set, all functions will be
+ * enabled for tracing.
+ */
+void ftrace_set_notrace(unsigned char *buf, int len, int reset)
+{
+        ftrace_set_regex(buf, len, reset, 0);
+}
+/*
+ * Common release path for the filter (enable != 0) and notrace
+ * (enable == 0) files.
+ *
+ * Flushes a pattern that is still sitting in the iterator buffer, re-runs
+ * the call-site update when at least one pattern was applied through this
+ * open and both ftraced_suspend and ftrace_enabled are set, then frees the
+ * iterator.  Always returns 0.
+ */
+static int
+ftrace_regex_release(struct inode *inode, struct file *file, int enable)
+{
+        struct seq_file *m = (struct seq_file *)file->private_data;
+        struct ftrace_iterator *iter;
+        mutex_lock(&ftrace_regex_lock);
+        /* readers keep the iterator behind the seq_file, writers directly */
+        if (file->f_mode & FMODE_READ) {
+                iter = m->private;
+                seq_release(inode, file);
+        } else
+                iter = file->private_data;
+        /* a pattern without trailing white space is still pending */
+        if (iter->buffer_idx) {
+                iter->filtered++;
+                iter->buffer[iter->buffer_idx] = 0;
+                ftrace_match(iter->buffer, iter->buffer_idx, enable);
+        }
+        /* lock order: regex lock, then sysctl lock, then ftraced lock */
+        mutex_lock(&ftrace_sysctl_lock);
+        mutex_lock(&ftraced_lock);
+        if (iter->filtered && ftraced_suspend && ftrace_enabled)
+                ftrace_run_update_code(FTRACE_ENABLE_CALLS);
+        mutex_unlock(&ftraced_lock);
+        mutex_unlock(&ftrace_sysctl_lock);
+        kfree(iter);
+        mutex_unlock(&ftrace_regex_lock);
+        return 0;
+}
+/* Release handler for set_ftrace_filter: ftrace_regex_release(), enable = 1. */
+static int
+ftrace_filter_release(struct inode *inode, struct file *file)
+{
+        return ftrace_regex_release(inode, file, 1);
+}
+/* Release handler for set_ftrace_notrace: ftrace_regex_release(), enable = 0. */
+static int
+ftrace_notrace_release(struct inode *inode, struct file *file)
+{
+        return ftrace_regex_release(inode, file, 0);
+}
+/*
+ * Read handler for ftraced_enabled: reports "disabled\n" when ftraced_stop
+ * is set and "enabled\n" otherwise.
+ */
+static ssize_t
+ftraced_read(struct file *filp, char __user *ubuf,
+                     size_t cnt, loff_t *ppos)
+{
+        char *state;
+        int len;
+        /* don't worry about races */
+        if (ftraced_stop)
+                state = "disabled\n";
+        else
+                state = "enabled\n";
+        len = strlen(state);
+        return simple_read_from_buffer(ubuf, cnt, ppos, state, len);
+}
+/*
+ * Write handler for ftraced_enabled.
+ *
+ * Accepts text starting with "enable" or "disable", or a decimal number
+ * (non-zero enables, zero disables), and starts or stops the daemon
+ * accordingly.
+ *
+ * Returns @cnt on success, -EINVAL when the input does not fit the local
+ * buffer, -EFAULT when it cannot be copied, or the strict_strtoul() error.
+ */
+static ssize_t
+ftraced_write(struct file *filp, const char __user *ubuf,
+                      size_t cnt, loff_t *ppos)
+{
+        char buf[64];
+        unsigned long val;
+        int ret;
+        if (cnt >= sizeof(buf))
+                return -EINVAL;
+        if (copy_from_user(buf, ubuf, cnt))
+                return -EFAULT;
+        /* terminate first so no string helper reads uninitialised stack */
+        buf[cnt] = 0;
+        if (strncmp(buf, "enable", 6) == 0)
+                val = 1;
+        else if (strncmp(buf, "disable", 7) == 0)
+                val = 0;
+        else {
+                ret = strict_strtoul(buf, 10, &val);
+                if (ret < 0)
+                        return ret;
+                val = !!val;
+        }
+        if (val)
+                ftrace_enable_daemon();
+        else
+                ftrace_disable_daemon();
+        /* report progress through ppos, the position handed in by the caller */
+        *ppos += cnt;
+        return cnt;
+}
+/* "available_filter_functions": read-only listing of the records */
+static struct file_operations ftrace_avail_fops = {
+        .open = ftrace_avail_open,
+        .read = seq_read,
+        .llseek = seq_lseek,
+        .release = ftrace_avail_release,
+};
+/* "failures": read-only listing of the records flagged as failed */
+static struct file_operations ftrace_failures_fops = {
+        .open = ftrace_failures_open,
+        .read = seq_read,
+        .llseek = seq_lseek,
+        .release = ftrace_avail_release,
+};
+/* "set_ftrace_filter": list or edit the functions selected for tracing */
+static struct file_operations ftrace_filter_fops = {
+        .open = ftrace_filter_open,
+        .read = ftrace_regex_read,
+        .write = ftrace_filter_write,
+        .llseek = ftrace_regex_lseek,
+        .release = ftrace_filter_release,
+};
+/* "set_ftrace_notrace": list or edit the functions excluded from tracing */
+static struct file_operations ftrace_notrace_fops = {
+        .open = ftrace_notrace_open,
+        .read = ftrace_regex_read,
+        .write = ftrace_notrace_write,
+        .llseek = ftrace_regex_lseek,
+        .release = ftrace_notrace_release,
+};
+/* "ftraced_enabled": query or switch the ftraced daemon */
+static struct file_operations ftraced_fops = {
+        .open = tracing_open_generic,
+        .read = ftraced_read,
+        .write = ftraced_write,
+};
+/**
+ * ftrace_force_update - force an update to all recording ftrace functions
+ *
+ * Returns 0 when nothing was pending or the update ran, -EBUSY when an
+ * update was pending but ftrace_update_code() reported failure, and
+ * -ENODEV when ftrace has been disabled.
+ */
+int ftrace_force_update(void)
+{
+        int ret = 0;
+        if (unlikely(ftrace_disabled))
+                return -ENODEV;
+        mutex_lock(&ftrace_sysctl_lock);
+        mutex_lock(&ftraced_lock);
+        /*
+         * If ftraced_trigger is not set, then there is nothing
+         * to update.
+         */
+        if (ftraced_trigger) {
+                if (!ftrace_update_code())
+                        ret = -EBUSY;
+        }
+        mutex_unlock(&ftraced_lock);
+        mutex_unlock(&ftrace_sysctl_lock);
+        return ret;
+}
+/*
+ * Disable all patched call sites and stop the ftraced thread.
+ *
+ * The task pointer is detached and ftraced_suspend is set to -1 under
+ * ftraced_lock; kthread_stop() is only called after the lock has been
+ * dropped.
+ */
+static void ftrace_force_shutdown(void)
+{
+        struct task_struct *task;
+        int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC;
+        mutex_lock(&ftraced_lock);
+        task = ftraced_task;
+        ftraced_task = NULL;
+        ftraced_suspend = -1;
+        ftrace_run_update_code(command);
+        mutex_unlock(&ftraced_lock);
+        if (task)
+                kthread_stop(task);
+}
+/*
+ * Create the dynamic-ftrace control files in the tracing debugfs
+ * directory.  A file that cannot be created only produces a warning; the
+ * function always returns 0.
+ */
+static __init int ftrace_init_debugfs(void)
+{
+        struct dentry *d_tracer;
+        struct dentry *entry;
+        d_tracer = tracing_init_dentry();
+        entry = debugfs_create_file("available_filter_functions", 0444,
+                                    d_tracer, NULL, &ftrace_avail_fops);
+        if (!entry)
+                pr_warning("Could not create debugfs "
+                           "'available_filter_functions' entry\n");
+        entry = debugfs_create_file("failures", 0444,
+                                    d_tracer, NULL, &ftrace_failures_fops);
+        if (!entry)
+                pr_warning("Could not create debugfs 'failures' entry\n");
+        entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer,
+                                    NULL, &ftrace_filter_fops);
+        if (!entry)
+                pr_warning("Could not create debugfs "
+                           "'set_ftrace_filter' entry\n");
+        entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer,
+                                    NULL, &ftrace_notrace_fops);
+        if (!entry)
+                pr_warning("Could not create debugfs "
+                           "'set_ftrace_notrace' entry\n");
+        entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer,
+                                    NULL, &ftraced_fops);
+        if (!entry)
+                pr_warning("Could not create debugfs "
+                           "'ftraced_enabled' entry\n");
+        return 0;
+}
+fs_initcall(ftrace_init_debugfs);
+/*
+ * Boot-time setup of dynamic ftrace: run the architecture init under
+ * stop_machine, allocate the record pages and start the ftraced thread.
+ * Any failure sets ftrace_disabled and returns the error.
+ */
+static int __init ftrace_dynamic_init(void)
+{
+        struct task_struct *p;
+        unsigned long addr;
+        int ret;
+        /* addr is both the argument and the result slot of the arch init */
+        addr = (unsigned long)ftrace_record_ip;
+        stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS);
+        /* ftrace_dyn_arch_init places the return code in addr */
+        if (addr) {
+                ret = (int)addr;
+                goto failed;
+        }
+        ret = ftrace_dyn_table_alloc();
+        if (ret)
+                goto failed;
+        p = kthread_run(ftraced, NULL, "ftraced");
+        if (IS_ERR(p)) {
+                ret = -1;
+                goto failed;
+        }
+        last_ftrace_enabled = ftrace_enabled = 1;
+        ftraced_task = p;
+        return 0;
+ failed:
+        ftrace_disabled = 1;
+        return ret;
+}
+core_initcall(ftrace_dynamic_init);
+#else
+/* Without CONFIG_DYNAMIC_FTRACE the code-patching hooks are empty statements. */
+# define ftrace_startup()               do { } while (0)
+# define ftrace_shutdown()              do { } while (0)
+# define ftrace_startup_sysctl()        do { } while (0)
+# define ftrace_shutdown_sysctl()       do { } while (0)
+# define ftrace_force_shutdown()        do { } while (0)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+/**
+ * ftrace_kill_atomic - kill ftrace from critical sections
+ *
+ * This function should be used by panic code. It stops ftrace
+ * but in a not so nice way: no lock is taken and the ftraced thread
+ * is left alone. If you need to simply kill ftrace
+ * from a non-atomic section, use ftrace_kill.
+ */
+void ftrace_kill_atomic(void)
+{
+        ftrace_disabled = 1;
+        ftrace_enabled = 0;
+#ifdef CONFIG_DYNAMIC_FTRACE
+        ftraced_suspend = -1;
+#endif
+        clear_ftrace_function();
+}
+/**
+ * ftrace_kill - totally shutdown ftrace
+ *
+ * This is a safety measure for when something was detected that seems
+ * wrong: calling this function keeps ftrace from doing any more
+ * modifications and updates. It takes a mutex, so it must not be
+ * called from atomic context; use ftrace_kill_atomic there.
+ */
+void ftrace_kill(void)
+{
+        mutex_lock(&ftrace_sysctl_lock);
+        ftrace_disabled = 1;
+        ftrace_enabled = 0;
+        clear_ftrace_function();
+        mutex_unlock(&ftrace_sysctl_lock);
+        /* Try to totally disable ftrace */
+        ftrace_force_shutdown();
+}
+/**
+ * register_ftrace_function - register a function for profiling
+ * @ops: ops structure that holds the function for profiling.
+ *
+ * Register a function to be called by all functions in the
+ * kernel.
+ *
+ * Note: @ops->func and all the functions it calls must be labeled
+ *       with "notrace", otherwise it will go into a
+ *       recursive loop.
+ *
+ * Returns -1 when ftrace has been disabled, otherwise the result of
+ * __register_ftrace_function().
+ */
+int register_ftrace_function(struct ftrace_ops *ops)
+{
+        int ret;
+        if (unlikely(ftrace_disabled))
+                return -1;
+        mutex_lock(&ftrace_sysctl_lock);
+        ret = __register_ftrace_function(ops);
+        /* runs regardless of ret */
+        ftrace_startup();
+        mutex_unlock(&ftrace_sysctl_lock);
+        return ret;
+}
+/**
+ * unregister_ftrace_function - unregister a function for profiling.
+ * @ops: ops structure that holds the function to unregister
+ *
+ * Unregister a function that was added to be called by ftrace profiling.
+ *
+ * Returns the result of __unregister_ftrace_function().
+ */
+int unregister_ftrace_function(struct ftrace_ops *ops)
+{
+        int ret;
+        mutex_lock(&ftrace_sysctl_lock);
+        ret = __unregister_ftrace_function(ops);
+        /* runs regardless of ret */
+        ftrace_shutdown();
+        mutex_unlock(&ftrace_sysctl_lock);
+        return ret;
+}
+/*
+ * sysctl handler for the ftrace_enabled switch.
+ *
+ * proc_dointvec() performs the actual read or write.  When a write changed
+ * the value, tracing is started or stopped to match and
+ * last_ftrace_enabled is brought up to date.
+ *
+ * Returns -ENODEV when ftrace has been disabled, otherwise the
+ * proc_dointvec() result.
+ */
+int
+ftrace_enable_sysctl(struct ctl_table *table, int write,
+                     struct file *file, void __user *buffer, size_t *lenp,
+                     loff_t *ppos)
+{
+        int ret;
+        if (unlikely(ftrace_disabled))
+                return -ENODEV;
+        mutex_lock(&ftrace_sysctl_lock);
+        ret  = proc_dointvec(table, write, file, buffer, lenp, ppos);
+        /* nothing more to do on error, on a read, or when the value is unchanged */
+        if (ret || !write || (last_ftrace_enabled == ftrace_enabled))
+                goto out;
+        last_ftrace_enabled = ftrace_enabled;
+        if (ftrace_enabled) {
+                ftrace_startup_sysctl();
+                /* we are starting ftrace again */
+                if (ftrace_list != &ftrace_list_end) {
+                        /* a single registered ops is called directly */
+                        if (ftrace_list->next == &ftrace_list_end)
+                                ftrace_trace_function = ftrace_list->func;
+                        else
+                                ftrace_trace_function = ftrace_list_func;
+                }
+        } else {
+                /* stopping ftrace calls (just send to ftrace_stub) */
+                ftrace_trace_function = ftrace_stub;
+                ftrace_shutdown_sysctl();
+        }
+ out:
+        mutex_unlock(&ftrace_sysctl_lock);
+        return ret;
+}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
new file mode 100644
index 000000000000..8f3fb3db61c3
--- /dev/null
+++ b/kernel/trace/trace.c
@@ -0,0 +1,3157 @@
+/*
+ * ring buffer based function tracer
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Originally taken from the RT patch by:
+ *    Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/utsrelease.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/pagemap.h>
+#include <linux/hardirq.h>
+#include <linux/linkage.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/gfp.h>
+#include <linux/fs.h>
+#include <linux/kprobes.h>
+#include <linux/writeback.h>
+#include <linux/stacktrace.h>
+#include "trace.h"
+/*
+ * Starts out at ULONG_MAX; __update_max_tr() copies the current value into
+ * the snapshot as saved_latency.
+ * NOTE(review): the initialiser is cast to cycle_t although the variable
+ * is unsigned long - confirm the cast is wanted.
+ */
+unsigned long __read_mostly     tracing_max_latency = (cycle_t)ULONG_MAX;
+unsigned long __read_mostly     tracing_thresh;
+static unsigned long __read_mostly      tracing_nr_buffers;
+/* CPUs visited by for_each_tracing_cpu() */
+static cpumask_t __read_mostly          tracing_buffer_mask;
+#define for_each_tracing_cpu(cpu)       \
+        for_each_cpu_mask(cpu, tracing_buffer_mask)
+static int trace_alloc_page(void);
+static int trace_free_page(void);
+/* starts out set; CHECK_COND() sets it again when a buffer check fails */
+static int tracing_disabled = 1;
+static unsigned long tracing_pages_allocated;
+/* Convert nanoseconds to microseconds, rounded to the nearest microsecond. */
+long
+ns2usecs(cycle_t nsec)
+{
+        cycle_t usecs = nsec + 500;
+        /* do_div() leaves the quotient in its first argument */
+        do_div(usecs, 1000);
+        return usecs;
+}
+/* Timestamp source for the tracers: the cpu_clock() value of @cpu. */
+cycle_t ftrace_now(int cpu)
+{
+        return cpu_clock(cpu);
+}
+/*
+ * The global_trace is the descriptor that holds the tracing
+ * buffers for the live tracing. For each CPU, it contains
+ * a linked list of pages that will store trace entries. The
+ * page descriptor of the pages in the memory is used to hold
+ * the linked list by linking the lru item in the page descriptor
+ * to each of the pages in the buffer per CPU.
+ *
+ * For each active CPU there is a data field that holds the
+ * pages for the buffer for that CPU. Each CPU has the same number
+ * of pages allocated for its buffer.
+ */
+static struct trace_array       global_trace;
+static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
+/*
+ * The max_tr is used to snapshot the global_trace when a maximum
+ * latency is reached. Some tracers will use this to store a maximum
+ * trace while it continues examining live traces.
+ *
+ * The buffers for the max_tr are set up the same as the global_trace.
+ * When a snapshot is taken, the linked list of the max_tr is swapped
+ * with the linked list of the global_trace and the buffers are reset for
+ * the global_trace so the tracing can continue.
+ */
+static struct trace_array       max_tr;
+static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
+/* tracer_enabled is used to toggle activation of a tracer */
+static int                      tracer_enabled = 1;
+/* function tracing enabled */
+int                             ftrace_function_enabled;
+/*
+ * trace_nr_entries is the number of entries that is allocated
+ * for a buffer. Note, the number of entries is always rounded
+ * to ENTRIES_PER_PAGE.
+ */
+static unsigned long            trace_nr_entries = 65536UL;
+/* trace_types holds a linked list of available tracers. */
+static struct tracer            *trace_types __read_mostly;
+/* current_trace points to the tracer that is currently active */
+static struct tracer            *current_trace __read_mostly;
+/*
+ * max_tracer_type_len is used to simplify the allocating of
+ * buffers to read userspace tracer names. We keep track of
+ * the longest tracer name registered.
+ */
+static int                      max_tracer_type_len;
+/*
+ * trace_types_lock is used to protect the trace_types list.
+ * This lock is also used to keep user access serialized.
+ * Accesses from userspace will grab this lock while userspace
+ * activities happen inside the kernel.
+ */
+static DEFINE_MUTEX(trace_types_lock);
+/* trace_wait is a waitqueue for tasks blocked on trace_poll */
+static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
+/* trace_flags holds iter_ctrl options; only print-parent is on by default */
+unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
+/*
+ * init callback of the "none" tracer: turn function tracing and the
+ * tracer switch off, and reset the buffer of every online CPU when
+ * tr->ctrl is set.
+ */
+static notrace void no_trace_init(struct trace_array *tr)
+{
+        int cpu;
+        ftrace_function_enabled = 0;
+        if (tr->ctrl) {
+                for_each_online_cpu(cpu) {
+                        tracing_reset(tr->data[cpu]);
+                }
+        }
+        tracer_enabled = 0;
+}
+/* dummy tracer named "none"; its init callback switches tracing off */
+static struct tracer no_tracer __read_mostly = {
+        .name           = "none",
+        .init           = no_trace_init
+};
+/**
+ * trace_wake_up - wake up tasks waiting for trace input
+ *
+ * Simply wakes up any task that is blocked on the trace_wait
+ * queue. This is used with trace_poll for tasks polling the trace.
+ * No wakeup is issued when the "block" option is set or when
+ * runqueue_is_locked() reports a locked runqueue.
+ */
+void trace_wake_up(void)
+{
+        /*
+         * The runqueue_is_locked() can fail, but this is the best we
+         * have for now:
+         */
+        if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked())
+                wake_up(&trace_wait);
+}
+/* number of struct trace_entry slots that fit in one page */
+#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry))
+/*
+ * Parse the "trace_entries=" boot parameter into trace_nr_entries.
+ * Returns 1 when a usable value was stored, 0 for a missing, malformed
+ * or zero value (trace_nr_entries is then left untouched).
+ */
+static int __init set_nr_entries(char *str)
+{
+        unsigned long nr_entries;
+        int ret;
+        if (!str)
+                return 0;
+        /* base 0: the base is taken from the string's prefix */
+        ret = strict_strtoul(str, 0, &nr_entries);
+        /* nr_entries can not be zero */
+        if (ret < 0 || nr_entries == 0)
+                return 0;
+        trace_nr_entries = nr_entries;
+        return 1;
+}
+__setup("trace_entries=", set_nr_entries);
+/* Convert nanoseconds to whole microseconds; the remainder is dropped. */
+unsigned long nsecs_to_usecs(unsigned long nsecs)
+{
+        const unsigned long nsecs_per_usec = 1000;
+        unsigned long usecs = nsecs / nsecs_per_usec;
+        return usecs;
+}
+/*
+ * trace_flag_type is an enumeration that holds different
+ * states when a trace occurs. These are:
+ *  IRQS_OFF     - interrupts were disabled
+ *  NEED_RESCHED - reschedule is requested
+ *  HARDIRQ      - inside an interrupt handler
+ *  SOFTIRQ      - inside a softirq handler
+ */
+enum trace_flag_type {
+        TRACE_FLAG_IRQS_OFF             = 0x01,
+        TRACE_FLAG_NEED_RESCHED         = 0x02,
+        TRACE_FLAG_HARDIRQ              = 0x04,
+        TRACE_FLAG_SOFTIRQ              = 0x08,
+};
+/*
+ * TRACE_ITER_SYM_MASK masks the options in trace_flags that
+ * control the output of kernel symbols.
+ */
+#define TRACE_ITER_SYM_MASK \
+        (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
+/*
+ * Option names, NULL terminated.
+ * These must match the bit positions in trace_iterator_flags.
+ */
+static const char *trace_options[] = {
+        "print-parent",
+        "sym-offset",
+        "sym-addr",
+        "verbose",
+        "raw",
+        "hex",
+        "bin",
+        "block",
+        "stacktrace",
+        "sched-tree",
+        NULL
+};
+/*
+ * ftrace_max_lock is used to protect the swapping of buffers
+ * when taking a max snapshot. The buffers themselves are
+ * protected by per_cpu spinlocks. But the action of the swap
+ * needs its own lock.
+ *
+ * This is defined as a raw_spinlock_t in order to help
+ * with performance when lockdep debugging is enabled.
+ */
+static raw_spinlock_t ftrace_max_lock =
+        (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+/*
+ * Record the metadata of a new maximum trace in max_tr: the cpu, the
+ * start timestamp taken from @tr's per-cpu data, and a description of
+ * @tsk (comm, pid, uid, nice, policy, rt priority). This way the maximum
+ * trace is permanently saved for later retrieval via
+ * /debugfs/tracing/latency_trace.
+ */
+static void
+__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+        struct trace_array_cpu *live = tr->data[cpu];
+        struct trace_array_cpu *snap = max_tr.data[cpu];
+        max_tr.cpu = cpu;
+        max_tr.time_start = live->preempt_timestamp;
+        snap->saved_latency = tracing_max_latency;
+        /* describe the task responsible for the latency */
+        memcpy(snap->comm, tsk->comm, TASK_COMM_LEN);
+        snap->pid = tsk->pid;
+        snap->uid = tsk->uid;
+        snap->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
+        snap->policy = tsk->policy;
+        snap->rt_priority = tsk->rt_priority;
+        /* record the comm of the task we are running as */
+        tracing_record_cmdline(current);
+}
+/*
+ * CHECK_COND - bail out of the enclosing function if @cond is true.
+ *
+ * On failure this disables tracing, emits a warning and executes
+ * "return -1" in the function that used the macro, so it may only be
+ * used inside functions that return an int.
+ */
+#define CHECK_COND(cond)                        \
+        if (unlikely(cond)) {                   \
+                tracing_disabled = 1;           \
+                WARN_ON(1);                     \
+                return -1;                      \
+        }
+/**
+ * check_pages - integrity check of trace buffers
+ * @data: per-cpu trace descriptor whose page list is checked
+ *
+ * As a safety measure we check to make sure the data pages have not
+ * been corrupted. Every node on the trace_pages list, including the
+ * list head itself, must be correctly linked to both of its neighbours.
+ *
+ * Returns 0 if the list is consistent. On the first inconsistency
+ * tracing is disabled, a warning is printed and -1 is returned.
+ */
+int check_pages(struct trace_array_cpu *data)
+{
+        struct page *page, *tmp;
+        CHECK_COND(data->trace_pages.next->prev != &data->trace_pages);
+        CHECK_COND(data->trace_pages.prev->next != &data->trace_pages);
+        list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) {
+                CHECK_COND(page->lru.next->prev != &page->lru);
+                CHECK_COND(page->lru.prev->next != &page->lru);
+        }
+        return 0;
+}
+/**
+ * head_page - page address of the first page in per_cpu buffer.
+ * @data: per-cpu trace descriptor
+ *
+ * Returns the address of the first page on @data's trace_pages list,
+ * or NULL when the list is empty. As a sanity check this BUGs if the
+ * first list node turns out to be the list head itself.
+ */
+void *head_page(struct trace_array_cpu *data)
+{
+        struct list_head *first;
+        struct page *page;
+        if (list_empty(&data->trace_pages))
+                return NULL;
+        first = data->trace_pages.next;
+        page = list_entry(first, struct page, lru);
+        BUG_ON(&page->lru == &data->trace_pages);
+        return page_address(page);
+}
+/**
+ * trace_seq_printf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formatting of a trace
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ *
+ * Returns 0 if nothing was recorded, either because no room is left
+ * or because the formatted text would not fit entirely. Otherwise
+ * returns the room that was available before the write; note that this
+ * is not the number of characters written, so callers should only
+ * test the result for zero / non-zero.
+ */
+int
+trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{
+        /* room left, keeping the last byte of the page in reserve */
+        int len = (PAGE_SIZE - 1) - s->len;
+        va_list ap;
+        int ret;
+        if (!len)
+                return 0;
+        va_start(ap, fmt);
+        ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
+        va_end(ap);
+        /* If we can't write it all, don't bother writing anything */
+        if (ret >= len)
+                return 0;
+        s->len += ret;
+        return len;
+}
+/**
+ * trace_seq_puts - trace sequence printing of simple string
+ * @s: trace sequence descriptor
+ * @str: simple string to record
+ *
+ * Appends @str (without its terminating NUL) to the buffer in @s for
+ * later retrieval by a sequencer or other mechanism. The string is
+ * stored either completely or not at all.
+ *
+ * Returns the number of characters stored, or 0 if @str does not fit.
+ */
+static int
+trace_seq_puts(struct trace_seq *s, const char *str)
+{
+        int len = strlen(str);
+        if (len <= ((PAGE_SIZE - 1) - s->len)) {
+                memcpy(s->buffer + s->len, str, len);
+                s->len += len;
+                return len;
+        }
+        return 0;
+}
+/*
+ * Append the single character @c to the buffer in @s.
+ * Returns 1 if the character was stored, 0 if the buffer is full.
+ */
+static int
+trace_seq_putc(struct trace_seq *s, unsigned char c)
+{
+        if (s->len < (PAGE_SIZE - 1)) {
+                s->buffer[s->len++] = c;
+                return 1;
+        }
+        return 0;
+}
+/*
+ * Append @len raw bytes starting at @mem to the buffer in @s.
+ * The bytes are stored either completely or not at all.
+ * Returns @len on success, 0 if they do not fit.
+ */
+static int
+trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
+{
+        if (len <= ((PAGE_SIZE - 1) - s->len)) {
+                memcpy(s->buffer + s->len, mem, len);
+                s->len += len;
+                return len;
+        }
+        return 0;
+}
+/* Room for 8 bytes as two hex digits each, plus one trailing space. */
+#define HEX_CHARS 17
+static const char hex2asc[] = "0123456789abcdef";
+/*
+ * Append the @len bytes at @mem to @s as lower-case hex digits followed
+ * by a single space. Bytes are emitted most-significant first (memory
+ * order on big endian, reversed on little endian), and within each byte
+ * the high nibble comes first, so the text reads as the numeric value.
+ *
+ * @len must be small enough that 2 * @len + 1 characters fit in the
+ * HEX_CHARS sized scratch buffer; anything larger is a BUG.
+ *
+ * Returns the number of characters stored, or 0 if they do not fit in @s.
+ */
+static int
+trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
+{
+        unsigned char hex[HEX_CHARS];
+        unsigned char *data = mem;
+        unsigned char byte;
+        int i, j;
+        /* two digits per byte plus the trailing space must fit in hex[] */
+        BUG_ON(len > (HEX_CHARS - 1) / 2);
+#ifdef __BIG_ENDIAN
+        for (i = 0, j = 0; i < len; i++) {
+#else
+        for (i = len-1, j = 0; i >= 0; i--) {
+#endif
+                byte = data[i];
+                /* high nibble first, so 0x12 is printed as "12" */
+                hex[j++] = hex2asc[byte >> 4];
+                hex[j++] = hex2asc[byte & 0x0f];
+        }
+        hex[j++] = ' ';
+        return trace_seq_putmem(s, hex, j);
+}
+/* Empty the sequence buffer: nothing stored and nothing read yet. */
+static void
+trace_seq_reset(struct trace_seq *s)
+{
+        s->readpos = 0;
+        s->len = 0;
+}
+/*
+ * trace_seq_to_user - copy pending sequence data to user space
+ * @s: trace sequence descriptor
+ * @ubuf: user buffer to copy into
+ * @cnt: maximum number of bytes to copy
+ *
+ * Copies up to @cnt of the bytes in @s that have not been read yet and
+ * advances the read position by the amount actually copied, so that a
+ * short read leaves the remainder available for the next call.
+ *
+ * Returns the number of bytes copied, -EBUSY if there is nothing left
+ * to read, or -EFAULT if the copy to user space failed.
+ */
+ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
+{
+        int len;
+        int ret;
+        if (s->len <= s->readpos)
+                return -EBUSY;
+        len = s->len - s->readpos;
+        if (cnt > len)
+                cnt = len;
+        ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
+        if (ret)
+                return -EFAULT;
+        /* only consume what was handed out, not everything pending */
+        s->readpos += cnt;
+        return cnt;
+}
+/*
+ * Emit the contents of @s into the seq_file @m as a NUL-terminated
+ * string, then reset @s so it can be filled again.
+ */
+static void
+trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+        int end = s->len;
+        /* never place the terminator beyond the last byte of the page */
+        if (s->len >= PAGE_SIZE)
+                end = PAGE_SIZE - 1;
+        s->buffer[end] = 0;
+        seq_puts(m, s->buffer);
+        trace_seq_reset(s);
+}
+/*
+ * flip the trace buffers between two trace descriptors.
+ * This usually is the buffers between the global_trace and
+ * the max_tr to record a snapshot of a current trace.
+ *
+ * The bookkeeping fields from trace_head_idx to the end of the
+ * structure are copied one way only, from @tr2 into @tr1; the page
+ * lists of the two descriptors are then exchanged.
+ *
+ * The ftrace_max_lock must be held.
+ */
+static void
+flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
+{
+        struct list_head flip_pages;
+        INIT_LIST_HEAD(&flip_pages);
+        /* tr1 takes over tr2's state, starting at trace_head_idx */
+        memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx,
+                sizeof(struct trace_array_cpu) -
+                offsetof(struct trace_array_cpu, trace_head_idx));
+        /* the return values are ignored; a failure disables tracing */
+        check_pages(tr1);
+        check_pages(tr2);
+        /* three-way swap of the page lists through the temporary head */
+        list_splice_init(&tr1->trace_pages, &flip_pages);
+        list_splice_init(&tr2->trace_pages, &tr1->trace_pages);
+        list_splice_init(&flip_pages, &tr2->trace_pages);
+        BUG_ON(!list_empty(&flip_pages));
+        check_pages(tr1);
+        check_pages(tr2);
+}
+/**
+ * update_max_tr - snapshot all trace buffers from global_trace to max_tr
+ * @tr: tracer
+ * @tsk: the task with the latency
+ * @cpu: The cpu that initiated the trace.
+ *
+ * Flip the buffers between the @tr and the max_tr and record information
+ * about which task was the cause of this latency.
+ *
+ * Expected to be called with interrupts disabled (a one-time warning
+ * is printed otherwise). Takes ftrace_max_lock for the duration.
+ */
+void
+update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+        struct trace_array_cpu *data;
+        int i;
+        WARN_ON_ONCE(!irqs_disabled());
+        __raw_spin_lock(&ftrace_max_lock);
+        /* clear out all the previous traces */
+        for_each_tracing_cpu(i) {
+                data = tr->data[i];
+                /* max_tr gets the live buffer, which is then reset */
+                flip_trace(max_tr.data[i], data);
+                tracing_reset(data);
+        }
+        __update_max_tr(tr, tsk, cpu);
+        __raw_spin_unlock(&ftrace_max_lock);
+}
+/**
+ * update_max_tr_single - only copy one trace over, and reset the rest
+ * @tr: tracer
+ * @tsk: task with the latency
+ * @cpu: the cpu of the buffer to copy.
+ *
+ * Flip the trace of a single CPU buffer between the @tr and the max_tr.
+ * All max_tr per-cpu buffers are reset first, so afterwards only @cpu's
+ * buffer in max_tr holds data.
+ *
+ * Expected to be called with interrupts disabled (a one-time warning
+ * is printed otherwise). Takes ftrace_max_lock for the duration.
+ */
+void
+update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+        struct trace_array_cpu *data = tr->data[cpu];
+        int i;
+        WARN_ON_ONCE(!irqs_disabled());
+        __raw_spin_lock(&ftrace_max_lock);
+        /* discard whatever the previous snapshot held on every cpu */
+        for_each_tracing_cpu(i)
+                tracing_reset(max_tr.data[i]);
+        flip_trace(max_tr.data[cpu], data);
+        tracing_reset(data);
+        __update_max_tr(tr, tsk, cpu);
+        __raw_spin_unlock(&ftrace_max_lock);
+}
+/**
+ * register_tracer - register a tracer with the ftrace system.
+ * @type: the plugin for the tracer
+ *
+ * Register a new plugin tracer by adding it to the front of the
+ * trace_types list, under trace_types_lock. With
+ * CONFIG_FTRACE_STARTUP_TEST a tracer that provides a selftest is
+ * only registered if that selftest passes.
+ *
+ * Returns 0 on success, -1 if @type has no name or a tracer of the
+ * same name is already registered, or the non-zero result of a
+ * failed selftest.
+ */
+int register_tracer(struct tracer *type)
+{
+        struct tracer *t;
+        int len;
+        int ret = 0;
+        if (!type->name) {
+                pr_info("Tracer must have a name\n");
+                return -1;
+        }
+        mutex_lock(&trace_types_lock);
+        /* names must be unique among the registered tracers */
+        for (t = trace_types; t; t = t->next) {
+                if (strcmp(type->name, t->name) == 0) {
+                        /* already found */
+                        pr_info("Trace %s already registered\n",
+                                type->name);
+                        ret = -1;
+                        goto out;
+                }
+        }
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+        if (type->selftest) {
+                struct tracer *saved_tracer = current_trace;
+                struct trace_array_cpu *data;
+                struct trace_array *tr = &global_trace;
+                int saved_ctrl = tr->ctrl;
+                int i;
+                /*
+                 * Run a selftest on this tracer.
+                 * Here we reset the trace buffer, and set the current
+                 * tracer to be this tracer. The tracer can then run some
+                 * internal tracing to verify that everything is in order.
+                 * If we fail, we do not register this tracer.
+                 */
+                for_each_tracing_cpu(i) {
+                        data = tr->data[i];
+                        /* skip cpus whose buffer has no pages */
+                        if (!head_page(data))
+                                continue;
+                        tracing_reset(data);
+                }
+                current_trace = type;
+                tr->ctrl = 0;
+                /* the test is responsible for initializing and enabling */
+                pr_info("Testing tracer %s: ", type->name);
+                ret = type->selftest(type, tr);
+                /* the test is responsible for resetting too */
+                current_trace = saved_tracer;
+                tr->ctrl = saved_ctrl;
+                if (ret) {
+                        printk(KERN_CONT "FAILED!\n");
+                        goto out;
+                }
+                /* Only reset on passing, to avoid touching corrupted buffers */
+                for_each_tracing_cpu(i) {
+                        data = tr->data[i];
+                        if (!head_page(data))
+                                continue;
+                        tracing_reset(data);
+                }
+                printk(KERN_CONT "PASSED\n");
+        }
+#endif
+        /* push onto the head of the list of known tracers */
+        type->next = trace_types;
+        trace_types = type;
+        /* keep track of the longest registered tracer name */
+        len = strlen(type->name);
+        if (len > max_tracer_type_len)
+                max_tracer_type_len = len;
+ out:
+        mutex_unlock(&trace_types_lock);
+        return ret;
+}
+/*
+ * unregister_tracer - remove a tracer from the list of known tracers
+ * @type: the tracer to remove
+ *
+ * Unlinks @type from trace_types under trace_types_lock. If @type is
+ * not on the list a message is printed and nothing else happens. When
+ * the removed tracer had the longest name, max_tracer_type_len is
+ * recomputed from the tracers that remain.
+ */
+void unregister_tracer(struct tracer *type)
+{
+        struct tracer **t;
+        int len;
+        mutex_lock(&trace_types_lock);
+        /* t walks the links, so that the match can be unlinked in place */
+        for (t = &trace_types; *t; t = &(*t)->next) {
+                if (*t == type)
+                        goto found;
+        }
+        pr_info("Trace %s not registered\n", type->name);
+        goto out;
+ found:
+        *t = (*t)->next;
+        /* the maximum can only have changed if this name defined it */
+        if (strlen(type->name) != max_tracer_type_len)
+                goto out;
+        max_tracer_type_len = 0;
+        for (t = &trace_types; *t; t = &(*t)->next) {
+                len = strlen((*t)->name);
+                if (len > max_tracer_type_len)
+                        max_tracer_type_len = len;
+        }
+ out:
+        mutex_unlock(&trace_types_lock);
+}
+/*
+ * Reset a per-cpu trace buffer to the empty state: the counters are
+ * zeroed and both head and tail point at the start of the first page
+ * (or are NULL if the buffer has no pages).
+ */
+void tracing_reset(struct trace_array_cpu *data)
+{
+        void *first = head_page(data);
+        data->trace_idx = 0;
+        data->overrun = 0;
+        data->trace_tail = first;
+        data->trace_head = first;
+        data->trace_head_idx = 0;
+        data->trace_tail_idx = 0;
+}
+/*
+ * Cache of task command names, so that a pid found in a trace entry
+ * can be shown with a name. SAVED_CMDLINES slots are available:
+ *  map_pid_to_cmdline - slot index per pid; values >= SAVED_CMDLINES
+ *                       mean the pid has no slot
+ *  map_cmdline_to_pid - pid per slot
+ *  saved_cmdlines     - the saved comm for each slot
+ *  cmdline_idx        - the slot that was handed out last
+ * trace_cmdline_lock is taken (by trylock) while a name is saved.
+ */
+#define SAVED_CMDLINES 128
+static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
+static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
+static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
+static int cmdline_idx;
+static DEFINE_SPINLOCK(trace_cmdline_lock);
+/* temporary disable recording */
+atomic_t trace_record_cmdline_disabled __read_mostly;
+/*
+ * Put the command name cache into its empty state: every entry of both
+ * maps is filled with 0xff bytes (an out-of-range value) and the slot
+ * cursor restarts at zero.
+ */
+static void trace_init_cmdlines(void)
+{
+        cmdline_idx = 0;
+        memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid));
+        memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline));
+}
+void trace_stop_cmdline_recording(void);
+/*
+ * Remember the command name of @tsk so that it can be looked up by pid
+ * later. The idle task (pid 0) and pids above PID_MAX_DEFAULT are not
+ * recorded.
+ *
+ * A pid that already owns a slot has its name refreshed in place.
+ * Otherwise the next slot in round-robin order is taken over: the pid
+ * that owned it before loses its mapping, and both maps are updated
+ * to describe the new owner.
+ */
+static void trace_save_cmdline(struct task_struct *tsk)
+{
+        unsigned map;
+        unsigned idx;
+        if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
+                return;
+        /*
+         * It's not the end of the world if we don't get
+         * the lock, but we also don't want to spin
+         * nor do we want to disable interrupts,
+         * so if we miss here, then better luck next time.
+         */
+        if (!spin_trylock(&trace_cmdline_lock))
+                return;
+        idx = map_pid_to_cmdline[tsk->pid];
+        if (idx >= SAVED_CMDLINES) {
+                idx = (cmdline_idx + 1) % SAVED_CMDLINES;
+                /* evict the previous owner of this slot, if there was one */
+                map = map_cmdline_to_pid[idx];
+                if (map <= PID_MAX_DEFAULT)
+                        map_pid_to_cmdline[map] = (unsigned)-1;
+                /*
+                 * Record the new owner in the reverse map as well;
+                 * without this the eviction above can never find the
+                 * pid to invalidate and stale pids would resolve to
+                 * the wrong command name.
+                 */
+                map_cmdline_to_pid[idx] = tsk->pid;
+                map_pid_to_cmdline[tsk->pid] = idx;
+                cmdline_idx = idx;
+        }
+        memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
+        spin_unlock(&trace_cmdline_lock);
+}
+/*
+ * Look up the saved command name for @pid.
+ * Returns "<idle>" for pid 0, the saved name if one is cached, and
+ * "<...>" when the pid is out of range or has no saved name.
+ */
+static char *trace_find_cmdline(int pid)
+{
+        unsigned slot;
+        if (!pid)
+                return "<idle>";
+        if (pid > PID_MAX_DEFAULT)
+                return "<...>";
+        slot = map_pid_to_cmdline[pid];
+        if (slot >= SAVED_CMDLINES)
+                return "<...>";
+        return saved_cmdlines[slot];
+}
+/*
+ * Save the command name of @tsk, unless recording is currently
+ * disabled through trace_record_cmdline_disabled.
+ */
+void tracing_record_cmdline(struct task_struct *tsk)
+{
+        if (!atomic_read(&trace_record_cmdline_disabled))
+                trace_save_cmdline(tsk);
+}
+/*
+ * Return the list node that follows @next on @data's circular page
+ * list, stepping over the list head (which is not a real page).
+ */
+static inline struct list_head *
+trace_next_list(struct trace_array_cpu *data, struct list_head *next)
+{
+        struct list_head *head = &data->trace_pages;
+        /*
+         * Roundrobin - but skip the head (which is not a real page):
+         */
+        next = next->next;
+        if (unlikely(next == head))
+                next = next->next;
+        BUG_ON(next == head);
+        return next;
+}
+/*
+ * Given the address @addr of a page in @data's buffer, return the
+ * address of the page that follows it on the circular page list.
+ */
+static inline void *
+trace_next_page(struct trace_array_cpu *data, void *addr)
+{
+        struct page *cur = virt_to_page(addr);
+        struct list_head *node = trace_next_list(data, &cur->lru);
+        return page_address(list_entry(node, struct page, lru));
+}
+/*
+ * Hand out the entry at the current head of @data's buffer and advance
+ * the head by one entry, moving on to the next page when the current
+ * one is used up. If the advanced head lands on the tail, the buffer
+ * has overrun: the overrun counter is incremented and the tail is
+ * pushed forward by one entry, dropping the oldest record.
+ *
+ * @tr is not used here. The visible callers invoke this with
+ * data->lock held.
+ */
+static inline struct trace_entry *
+tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data)
+{
+        unsigned long idx, idx_next;
+        struct trace_entry *entry;
+        /* counts every entry handed out, including overwritten ones */
+        data->trace_idx++;
+        idx = data->trace_head_idx;
+        idx_next = idx + 1;
+        BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE);
+        entry = data->trace_head + idx * TRACE_ENTRY_SIZE;
+        /* the page is full after this entry: continue on the next one */
+        if (unlikely(idx_next >= ENTRIES_PER_PAGE)) {
+                data->trace_head = trace_next_page(data, data->trace_head);
+                idx_next = 0;
+        }
+        if (data->trace_head == data->trace_tail &&
+            idx_next == data->trace_tail_idx) {
+                /* overrun */
+                data->overrun++;
+                data->trace_tail_idx++;
+                if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
+                        data->trace_tail =
+                                trace_next_page(data, data->trace_tail);
+                        data->trace_tail_idx = 0;
+                }
+        }
+        data->trace_head_idx = idx_next;
+        return entry;
+}
+/*
+ * Fill in the fields common to every kind of trace entry: the low
+ * byte of the preempt count, the pid of the current task, a timestamp
+ * and the TRACE_FLAG_* bits describing the context. @flags is the
+ * saved irq state used to decide whether interrupts were off.
+ */
+static inline void
+tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
+{
+        struct task_struct *tsk = current;
+        unsigned long pc = preempt_count();
+        int state = 0;
+        entry->preempt_count    = pc & 0xff;
+        entry->pid              = (tsk) ? tsk->pid : 0;
+        entry->t                = ftrace_now(raw_smp_processor_id());
+        if (irqs_disabled_flags(flags))
+                state |= TRACE_FLAG_IRQS_OFF;
+        if (pc & HARDIRQ_MASK)
+                state |= TRACE_FLAG_HARDIRQ;
+        if (pc & SOFTIRQ_MASK)
+                state |= TRACE_FLAG_SOFTIRQ;
+        if (need_resched())
+                state |= TRACE_FLAG_NEED_RESCHED;
+        entry->flags = state;
+}
+/*
+ * Record a function entry (TRACE_FN) in @data's buffer.
+ * @ip is the traced function and @parent_ip its caller; @flags is the
+ * irq state to report in the entry. Interrupts are disabled and
+ * data->lock is held while the entry is written.
+ */
+void
+trace_function(struct trace_array *tr, struct trace_array_cpu *data,
+               unsigned long ip, unsigned long parent_ip, unsigned long flags)
+{
+        struct trace_entry *entry;
+        unsigned long irq_flags;
+        raw_local_irq_save(irq_flags);
+        __raw_spin_lock(&data->lock);
+        entry                   = tracing_get_trace_entry(tr, data);
+        /* report the caller's irq state, not the one we just created */
+        tracing_generic_entry_update(entry, flags);
+        entry->type             = TRACE_FN;
+        entry->fn.ip            = ip;
+        entry->fn.parent_ip     = parent_ip;
+        __raw_spin_unlock(&data->lock);
+        raw_local_irq_restore(irq_flags);
+}
+/*
+ * Record a function entry via trace_function(), unless tracing into
+ * @data is currently disabled.
+ */
+void
+ftrace(struct trace_array *tr, struct trace_array_cpu *data,
+       unsigned long ip, unsigned long parent_ip, unsigned long flags)
+{
+        if (unlikely(atomic_read(&data->disabled)))
+                return;
+        trace_function(tr, data, ip, parent_ip, flags);
+}
+#ifdef CONFIG_MMIOTRACE
+/*
+ * Record an MMIO read/write event (TRACE_MMIO_RW) in @data's buffer.
+ * *@rw is copied into the entry with interrupts disabled and
+ * data->lock held; trace_wake_up() is called once the lock is dropped.
+ */
+void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data,
+                                                struct mmiotrace_rw *rw)
+{
+        struct trace_entry *entry;
+        unsigned long irq_flags;
+        raw_local_irq_save(irq_flags);
+        __raw_spin_lock(&data->lock);
+        entry                   = tracing_get_trace_entry(tr, data);
+        tracing_generic_entry_update(entry, 0);
+        entry->type             = TRACE_MMIO_RW;
+        entry->mmiorw           = *rw;
+        __raw_spin_unlock(&data->lock);
+        raw_local_irq_restore(irq_flags);
+        trace_wake_up();
+}
+/*
+ * Record an MMIO mapping event (TRACE_MMIO_MAP) in @data's buffer.
+ * *@map is copied into the entry with interrupts disabled and
+ * data->lock held; trace_wake_up() is called once the lock is dropped.
+ */
+void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data,
+                                                struct mmiotrace_map *map)
+{
+        struct trace_entry *entry;
+        unsigned long irq_flags;
+        raw_local_irq_save(irq_flags);
+        __raw_spin_lock(&data->lock);
+        entry                   = tracing_get_trace_entry(tr, data);
+        tracing_generic_entry_update(entry, 0);
+        entry->type             = TRACE_MMIO_MAP;
+        entry->mmiomap          = *map;
+        __raw_spin_unlock(&data->lock);
+        raw_local_irq_restore(irq_flags);
+        trace_wake_up();
+}
+#endif
+/*
+ * Record a stack trace entry (TRACE_STACK) in @data's buffer.
+ * Does nothing unless the "stacktrace" option is set in trace_flags.
+ * @skip is handed to save_stack_trace() as the number of innermost
+ * frames to leave out.
+ *
+ * No lock is taken here; the callers visible in this file invoke it
+ * while already holding data->lock.
+ */
+void __trace_stack(struct trace_array *tr,
+                   struct trace_array_cpu *data,
+                   unsigned long flags,
+                   int skip)
+{
+        struct trace_entry *entry;
+        struct stack_trace trace;
+        if (!(trace_flags & TRACE_ITER_STACKTRACE))
+                return;
+        entry                   = tracing_get_trace_entry(tr, data);
+        tracing_generic_entry_update(entry, flags);
+        entry->type             = TRACE_STACK;
+        memset(&entry->stack, 0, sizeof(entry->stack));
+        /* have the stack walker store the callers directly in the entry */
+        trace.nr_entries        = 0;
+        trace.max_entries       = FTRACE_STACK_ENTRIES;
+        trace.skip              = skip;
+        trace.entries           = entry->stack.caller;
+        save_stack_trace(&trace);
+}
+/*
+ * Record a TRACE_SPECIAL entry carrying three caller-defined values,
+ * followed by a stack trace entry when that option is enabled.
+ * @__tr and @__data are the trace array and its per-cpu descriptor,
+ * passed as void pointers. Interrupts are disabled and data->lock is
+ * held while writing; trace_wake_up() is called afterwards.
+ */
+void
+__trace_special(void *__tr, void *__data,
+                unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+        struct trace_array_cpu *data = __data;
+        struct trace_array *tr = __tr;
+        struct trace_entry *entry;
+        unsigned long irq_flags;
+        raw_local_irq_save(irq_flags);
+        __raw_spin_lock(&data->lock);
+        entry                   = tracing_get_trace_entry(tr, data);
+        tracing_generic_entry_update(entry, 0);
+        entry->type             = TRACE_SPECIAL;
+        entry->special.arg1     = arg1;
+        entry->special.arg2     = arg2;
+        entry->special.arg3     = arg3;
+        __trace_stack(tr, data, irq_flags, 4);
+        __raw_spin_unlock(&data->lock);
+        raw_local_irq_restore(irq_flags);
+        trace_wake_up();
+}
+/*
+ * Record a context switch (TRACE_CTX) from @prev to @next: pid, prio
+ * and state of both tasks are stored, followed by a stack trace entry
+ * when that option is enabled. @flags is the irq state to report.
+ * Interrupts are disabled and data->lock is held while writing.
+ */
+void
+tracing_sched_switch_trace(struct trace_array *tr,
+                           struct trace_array_cpu *data,
+                           struct task_struct *prev,
+                           struct task_struct *next,
+                           unsigned long flags)
+{
+        struct trace_entry *entry;
+        unsigned long irq_flags;
+        raw_local_irq_save(irq_flags);
+        __raw_spin_lock(&data->lock);
+        entry                   = tracing_get_trace_entry(tr, data);
+        tracing_generic_entry_update(entry, flags);
+        entry->type             = TRACE_CTX;
+        entry->ctx.prev_pid     = prev->pid;
+        entry->ctx.prev_prio    = prev->prio;
+        entry->ctx.prev_state   = prev->state;
+        entry->ctx.next_pid     = next->pid;
+        entry->ctx.next_prio    = next->prio;
+        entry->ctx.next_state   = next->state;
+        __trace_stack(tr, data, flags, 5);
+        __raw_spin_unlock(&data->lock);
+        raw_local_irq_restore(irq_flags);
+}
+/*
+ * Record a wakeup (TRACE_WAKE) of @wakee by @curr. The entry reuses
+ * the context switch layout: @curr is stored in the prev_* fields and
+ * @wakee in the next_* fields. A stack trace entry follows when that
+ * option is enabled. Interrupts are disabled and data->lock is held
+ * while writing; trace_wake_up() is called afterwards.
+ */
+void
+tracing_sched_wakeup_trace(struct trace_array *tr,
+                           struct trace_array_cpu *data,
+                           struct task_struct *wakee,
+                           struct task_struct *curr,
+                           unsigned long flags)
+{
+        struct trace_entry *entry;
+        unsigned long irq_flags;
+        raw_local_irq_save(irq_flags);
+        __raw_spin_lock(&data->lock);
+        entry                   = tracing_get_trace_entry(tr, data);
+        tracing_generic_entry_update(entry, flags);
+        entry->type             = TRACE_WAKE;
+        entry->ctx.prev_pid     = curr->pid;
+        entry->ctx.prev_prio    = curr->prio;
+        entry->ctx.prev_state   = curr->state;
+        entry->ctx.next_pid     = wakee->pid;
+        entry->ctx.next_prio    = wakee->prio;
+        entry->ctx.next_state   = wakee->state;
+        __trace_stack(tr, data, flags, 6);
+        __raw_spin_unlock(&data->lock);
+        raw_local_irq_restore(irq_flags);
+        trace_wake_up();
+}
+/*
+ * Record three caller-defined values in the global trace buffer of
+ * the current cpu. Nothing is recorded while tracing is disabled, no
+ * real tracer is selected, or the trace array is switched off.
+ */
+void
+ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+        struct trace_array *tr = &global_trace;
+        struct trace_array_cpu *data;
+        unsigned long flags;
+        long disabled;
+        int cpu;
+        if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl)
+                return;
+        local_irq_save(flags);
+        cpu = raw_smp_processor_id();
+        data = tr->data[cpu];
+        /* only trace if nobody else holds this cpu's disabled count */
+        disabled = atomic_inc_return(&data->disabled);
+        if (likely(disabled == 1))
+                __trace_special(tr, data, arg1, arg2, arg3);
+        atomic_dec(&data->disabled);
+        local_irq_restore(flags);
+}
+#ifdef CONFIG_FTRACE
+/*
+ * Callback installed through trace_ops: records a function entry for
+ * @ip (called from @parent_ip) in the global trace buffer of the
+ * current cpu. Does nothing while ftrace_function_enabled is clear or
+ * when skip_trace() rejects @ip.
+ */
+static void
+function_trace_call(unsigned long ip, unsigned long parent_ip)
+{
+        struct trace_array *tr = &global_trace;
+        struct trace_array_cpu *data;
+        unsigned long flags;
+        long disabled;
+        int cpu;
+        if (unlikely(!ftrace_function_enabled))
+                return;
+        if (skip_trace(ip))
+                return;
+        local_irq_save(flags);
+        cpu = raw_smp_processor_id();
+        data = tr->data[cpu];
+        /* only trace if nobody else holds this cpu's disabled count */
+        disabled = atomic_inc_return(&data->disabled);
+        if (likely(disabled == 1))
+                trace_function(tr, data, ip, parent_ip, flags);
+        atomic_dec(&data->disabled);
+        local_irq_restore(flags);
+}
+/* ftrace hook that routes traced function calls to function_trace_call(). */
+static struct ftrace_ops trace_ops __read_mostly =
+{
+        .func = function_trace_call,
+};
+/*
+ * Register the function trace callback. The callback is kept inert
+ * (ftrace_function_enabled == 0) while it is being registered and is
+ * only switched on afterwards if tracer_enabled is set.
+ */
+void tracing_start_function_trace(void)
+{
+        ftrace_function_enabled = 0;
+        register_ftrace_function(&trace_ops);
+        if (tracer_enabled)
+                ftrace_function_enabled = 1;
+}
+/*
+ * Unregister the function trace callback. The callback is made inert
+ * first, so it records nothing while it is being removed.
+ */
+void tracing_stop_function_trace(void)
+{
+        ftrace_function_enabled = 0;
+        unregister_ftrace_function(&trace_ops);
+}
+#endif
+/*
+ * Flags describing a trace file.
+ * NOTE(review): TRACE_FILE_LAT_FMT presumably selects the latency
+ * output format; its users are not visible here - confirm.
+ */
+enum trace_file_type {
+        TRACE_FILE_LAT_FMT      = 1,
+};
+/*
+ * Return the entry the iterator would read next from @cpu's buffer
+ * without advancing it, or NULL when there is nothing more to read:
+ * the iterator has already gone through tr->entries or
+ * data->trace_idx entries, or the buffer is empty (head == tail).
+ *
+ * The iterator's page position for @cpu is set up on first use,
+ * starting at the tail of the buffer.
+ */
+static struct trace_entry *
+trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
+                struct trace_iterator *iter, int cpu)
+{
+        struct page *page;
+        struct trace_entry *array;
+        if (iter->next_idx[cpu] >= tr->entries ||
+            iter->next_idx[cpu] >= data->trace_idx ||
+            (data->trace_head == data->trace_tail &&
+             data->trace_head_idx == data->trace_tail_idx))
+                return NULL;
+        if (!iter->next_page[cpu]) {
+                /* Initialize the iterator for this cpu trace buffer */
+                WARN_ON(!data->trace_tail);
+                page = virt_to_page(data->trace_tail);
+                iter->next_page[cpu] = &page->lru;
+                iter->next_page_idx[cpu] = data->trace_tail_idx;
+        }
+        page = list_entry(iter->next_page[cpu], struct page, lru);
+        /* the list head is not a page and must never be iterated */
+        BUG_ON(&data->trace_pages == &page->lru);
+        array = page_address(page);
+        WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE);
+        return &array[iter->next_page_idx[cpu]];
+}
+/*
+ * Look at the next unread entry of every cpu buffer and return the one
+ * with the smallest timestamp, or NULL if no buffer has anything left.
+ * Cpus whose buffer has no pages are skipped. If @ent_cpu is not NULL
+ * it receives the cpu the entry came from, or -1 when there is none.
+ * The iterator is not advanced.
+ */
+static struct trace_entry *
+find_next_entry(struct trace_iterator *iter, int *ent_cpu)
+{
+        struct trace_array *tr = iter->tr;
+        struct trace_entry *oldest = NULL;
+        int oldest_cpu = -1;
+        int cpu;
+        for_each_tracing_cpu(cpu) {
+                struct trace_entry *cand;
+                if (!head_page(tr->data[cpu]))
+                        continue;
+                cand = trace_entry_idx(tr, tr->data[cpu], iter, cpu);
+                if (!cand)
+                        continue;
+                /*
+                 * Pick the entry with the smallest timestamp:
+                 */
+                if (!oldest || cand->t < oldest->t) {
+                        oldest = cand;
+                        oldest_cpu = cpu;
+                }
+        }
+        if (ent_cpu)
+                *ent_cpu = oldest_cpu;
+        return oldest;
+}
+/*
+ * Advance the iterator past the entry it currently points at: bump the
+ * global index and the per-cpu counters of iter->cpu, moving on to the
+ * next page of that cpu's buffer when the current page is exhausted.
+ */
+static void trace_iterator_increment(struct trace_iterator *iter)
+{
+        int cpu = iter->cpu;
+        iter->idx++;
+        iter->next_idx[cpu]++;
+        if (++iter->next_page_idx[cpu] < ENTRIES_PER_PAGE)
+                return;
+        /* ran off the end of this page: start over on the following one */
+        iter->next_page_idx[cpu] = 0;
+        iter->next_page[cpu] =
+                trace_next_list(iter->tr->data[cpu], iter->next_page[cpu]);
+}
+/*
+ * Drop the oldest entry from the buffer of iter->cpu by moving its
+ * tail forward by one entry, wrapping to the next page as needed.
+ */
+static void trace_consume(struct trace_iterator *iter)
+{
+        struct trace_array_cpu *buf = iter->tr->data[iter->cpu];
+        if (++buf->trace_tail_idx >= ENTRIES_PER_PAGE) {
+                buf->trace_tail = trace_next_page(buf, buf->trace_tail);
+                buf->trace_tail_idx = 0;
+        }
+        /* Check if we empty it, then reset the index */
+        if (buf->trace_head != buf->trace_tail)
+                return;
+        if (buf->trace_head_idx == buf->trace_tail_idx)
+                buf->trace_idx = 0;
+}
+/*
+ * Move the iterator on to the next entry in timestamp order. The entry
+ * and cpu it pointed at before are remembered in prev_ent/prev_cpu.
+ * Returns @iter if an entry was found, NULL at the end of the trace.
+ */
+static void *find_next_entry_inc(struct trace_iterator *iter)
+{
+        int cpu = -1;
+        struct trace_entry *ent = find_next_entry(iter, &cpu);
+        iter->prev_ent = iter->ent;
+        iter->prev_cpu = iter->cpu;
+        iter->ent = ent;
+        iter->cpu = cpu;
+        if (!ent)
+                return NULL;
+        trace_iterator_increment(iter);
+        return iter;
+}
+/*
+ * seq_file ->next operation: advance the iterator to position *@pos.
+ *
+ * *@pos is incremented on every call. The iterator can only move
+ * forward, so NULL is returned if it is already beyond the requested
+ * position. Returns the iterator while entries remain, NULL at the
+ * end of the trace.
+ */
+static void *s_next(struct seq_file *m, void *v, loff_t *pos)
+{
+        struct trace_iterator *iter = m->private;
+        int i = (int)*pos;
+        void *ent;
+        (*pos)++;
+        /* can't go backwards */
+        if (iter->idx > i)
+                return NULL;
+        /* a negative idx means that no entry has been fetched yet */
+        if (iter->idx < 0)
+                ent = find_next_entry_inc(iter);
+        else
+                ent = iter;
+        /* skip forward until the requested position is reached */
+        while (ent && iter->idx < i)
+                ent = find_next_entry_inc(iter);
+        iter->pos = *pos;
+        return ent;
+}
+/*
+ * seq_file ->start operation: position the iterator at *@pos.
+ *
+ * Takes trace_types_lock and suspends command name recording; both are
+ * undone by s_stop(), which seq_file calls after every ->start, even
+ * one that returned NULL. For that reason every return path of this
+ * function leaves the lock held and the counter raised.
+ *
+ * Returns the iterator, or NULL if there is nothing to show or the
+ * current tracer is no longer the one this iterator was opened for.
+ */
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+        struct trace_iterator *iter = m->private;
+        void *p = NULL;
+        loff_t l = 0;
+        int i;
+        mutex_lock(&trace_types_lock);
+        atomic_inc(&trace_record_cmdline_disabled);
+        /*
+         * Do not unlock here: s_stop() still runs for this call and
+         * drops both the mutex and the cmdline recording hold.
+         */
+        if (!current_trace || current_trace != iter->trace)
+                return NULL;
+        /* let the tracer grab locks here if needed */
+        if (current_trace->start)
+                current_trace->start(iter);
+        if (*pos != iter->pos) {
+                /* not a sequential read: rewind and walk up to *pos */
+                iter->ent = NULL;
+                iter->cpu = 0;
+                iter->idx = -1;
+                iter->prev_ent = NULL;
+                iter->prev_cpu = -1;
+                for_each_tracing_cpu(i) {
+                        iter->next_idx[i] = 0;
+                        iter->next_page[i] = NULL;
+                }
+                for (p = iter; p && l < *pos; p = s_next(m, p, &l))
+                        ;
+        } else {
+                /* continue where the previous read stopped */
+                l = *pos - 1;
+                p = s_next(m, p, &l);
+        }
+        return p;
+}
+/*
+ * seq_file ->stop operation: re-enable command name recording, give
+ * the tracer a chance to release what its ->start hook took, and drop
+ * trace_types_lock.
+ */
+static void s_stop(struct seq_file *m, void *p)
+{
+        struct trace_iterator *iter = m->private;
+        atomic_dec(&trace_record_cmdline_disabled);
+        /* let the tracer release locks here if needed */
+        if (current_trace && current_trace == iter->trace && iter->trace->stop)
+                iter->trace->stop(iter);
+        mutex_unlock(&trace_types_lock);
+}
+/* Text shown in place of a symbol for a kretprobe'd return address. */
+#define KRETPROBE_MSG "[unknown/kretprobe'd]"
+#ifdef CONFIG_KRETPROBES
+/* Returns non-zero if @addr is the address of the kretprobe trampoline. */
+static inline int kretprobed(unsigned long addr)
+{
+        return addr == (unsigned long)kretprobe_trampoline;
+}
+#else
+/* Without kretprobe support no address can be a trampoline. */
+static inline int kretprobed(unsigned long addr)
+{
+        return 0;
+}
+#endif /* CONFIG_KRETPROBES */
+/*
+ * Print the symbol name for @address into @s using the printf format
+ * @fmt, which receives the name as its string argument.
+ *
+ * Returns the result of trace_seq_printf() (0 if the text did not
+ * fit). Without CONFIG_KALLSYMS nothing is printed and 1 is returned.
+ */
+static int
+seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+        char str[KSYM_SYMBOL_LEN];
+        kallsyms_lookup(address, NULL, NULL, NULL, str);
+        return trace_seq_printf(s, fmt, str);
+#endif
+        /* only reached when CONFIG_KALLSYMS is not set */
+        return 1;
+}
+/*
+ * Print the symbol for @address, as formatted by sprint_symbol(), into
+ * @s using the printf format @fmt, which receives the text as its
+ * string argument.
+ *
+ * Returns the result of trace_seq_printf() (0 if the text did not
+ * fit). Without CONFIG_KALLSYMS nothing is printed and 1 is returned.
+ */
+static int
+seq_print_sym_offset(struct trace_seq *s, const char *fmt,
+                     unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+        char str[KSYM_SYMBOL_LEN];
+        sprint_symbol(str, address);
+        return trace_seq_printf(s, fmt, str);
+#endif
+        /* only reached when CONFIG_KALLSYMS is not set */
+        return 1;
+}
+/*
+ * printf format for an instruction pointer: zero-padded hex, 8 digits
+ * on 32-bit kernels and 16 digits on 64-bit kernels.
+ */
+#ifndef CONFIG_64BIT
+# define IP_FMT "%08lx"
+#else
+# define IP_FMT "%016lx"
+#endif
+/*
+ * Print the instruction pointer @ip into @s as a symbol. A zero @ip is
+ * printed as "0". With TRACE_ITER_SYM_OFFSET in @sym_flags the symbol
+ * includes its offset; with TRACE_ITER_SYM_ADDR the raw address is
+ * appended in angle brackets.
+ *
+ * Returns 0 if any part did not fit in @s, non-zero otherwise.
+ */
+static int
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
+{
+        int ret;
+        if (!ip)
+                return trace_seq_printf(s, "0");
+        ret = (sym_flags & TRACE_ITER_SYM_OFFSET) ?
+                seq_print_sym_offset(s, "%s", ip) :
+                seq_print_sym_short(s, "%s", ip);
+        /* only add the address if the symbol itself went in */
+        if (ret && (sym_flags & TRACE_ITER_SYM_ADDR))
+                ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+        return ret;
+}
+/*
+ * Print the column legend of the latency trace format to @m: a
+ * commented-out ("#") diagram naming the cpu, irqs-off, need-resched,
+ * hardirq/softirq and preempt-depth columns as well as the cmd, pid,
+ * time, delay and caller fields.
+ */
+static void print_lat_help_header(struct seq_file *m)
+{
+        seq_puts(m, "#                _------=> CPU#            \n");
+        seq_puts(m, "#               / _-----=> irqs-off        \n");
+        seq_puts(m, "#              | / _----=> need-resched    \n");
+        seq_puts(m, "#              || / _---=> hardirq/softirq \n");
+        seq_puts(m, "#              ||| / _--=> preempt-depth   \n");
+        seq_puts(m, "#              |||| /                      \n");
+        seq_puts(m, "#              |||||     delay             \n");
+        seq_puts(m, "#  cmd     pid ||||| time  |   caller      \n");
+        seq_puts(m, "#     \\   /    |||||   \\   |   /           \n");
+}
+/* Write the column legend of the default (function) trace format to @m. */
+static void print_func_help_header(struct seq_file *m)
+{
+        static const char * const legend[] = {
+                "#           TASK-PID   CPU#    TIMESTAMP  FUNCTION\n",
+                "#              | |      |          |         |\n",
+                NULL
+        };
+        const char * const *line;
+        for (line = legend; *line; line++)
+                seq_puts(m, *line);
+}
+/*
+ * Write the latency-format banner for @iter's trace array to @m: tracer
+ * name and kernel release, saved latency with entry counts, the task
+ * recorded in the per-CPU data of tr->cpu and, when critical_start is set,
+ * the symbols where the critical section started and ended.
+ */
+static void
+print_trace_header(struct seq_file *m, struct trace_iterator *iter)
+{
+        unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+        struct trace_array *tr = iter->tr;
+        struct trace_array_cpu *data = tr->data[tr->cpu];
+        struct tracer *type = current_trace;
+        unsigned long total   = 0;
+        unsigned long entries = 0;
+        int cpu;
+        /* Default name, used when no tracer is currently selected. */
+        const char *name = "preemption";
+        if (type)
+                name = type->name;
+        /*
+         * total sums trace_idx over all CPUs that have a head page;
+         * entries counts at most tr->entries per CPU.
+         */
+        for_each_tracing_cpu(cpu) {
+                if (head_page(tr->data[cpu])) {
+                        total += tr->data[cpu]->trace_idx;
+                        if (tr->data[cpu]->trace_idx > tr->entries)
+                                entries += tr->entries;
+                        else
+                                entries += tr->data[cpu]->trace_idx;
+                }
+        }
+        seq_printf(m, "%s latency trace v1.1.5 on %s\n",
+                   name, UTS_RELEASE);
+        seq_puts(m, "-----------------------------------"
+                 "---------------------------------\n");
+        /* The M: field names the preemption model selected at build time. */
+        seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |"
+                   " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
+                   nsecs_to_usecs(data->saved_latency),
+                   entries,
+                   total,
+                   tr->cpu,
+#if defined(CONFIG_PREEMPT_NONE)
+                   "server",
+#elif defined(CONFIG_PREEMPT_VOLUNTARY)
+                   "desktop",
+#elif defined(CONFIG_PREEMPT)
+                   "preempt",
+#else
+                   "unknown",
+#endif
+                   /* These are reserved for later use */
+                   0, 0, 0, 0);
+        /* Close the parenthesis opened above, adding the CPU count on SMP. */
+#ifdef CONFIG_SMP
+        seq_printf(m, " #P:%d)\n", num_online_cpus());
+#else
+        seq_puts(m, ")\n");
+#endif
+        seq_puts(m, "    -----------------\n");
+        seq_printf(m, "    | task: %.16s-%d "
+                   "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
+                   data->comm, data->pid, data->uid, data->nice,
+                   data->policy, data->rt_priority);
+        seq_puts(m, "    -----------------\n");
+        if (data->critical_start) {
+                /*
+                 * Symbols are formatted into iter->seq and then handed to
+                 * @m with trace_print_seq().
+                 */
+                seq_puts(m, " => started at: ");
+                seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags);
+                trace_print_seq(m, &iter->seq);
+                seq_puts(m, "\n => ended at:   ");
+                seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
+                trace_print_seq(m, &iter->seq);
+                seq_puts(m, "\n");
+        }
+        seq_puts(m, "\n");
+}
+/*
+ * Print the fixed leading columns of a latency-format line into @s:
+ * "comm-pid", the CPU number, the irqs-off and need-resched markers, one
+ * character for hardirq/softirq context ('H' both, 'h' hardirq, 's'
+ * softirq, '.' neither) and the preempt count in hex ('.' when zero).
+ */
+static void
+lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
+{
+        char *comm = trace_find_cmdline(entry->pid);
+        int in_hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+        int in_softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+        char irq_mark;
+        trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
+        trace_seq_printf(s, "%d", cpu);
+        trace_seq_printf(s, "%c%c",
+                        (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.',
+                        (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.');
+        if (in_hardirq && in_softirq)
+                irq_mark = 'H';
+        else if (in_hardirq)
+                irq_mark = 'h';
+        else if (in_softirq)
+                irq_mark = 's';
+        else
+                irq_mark = '.';
+        trace_seq_putc(s, irq_mark);
+        if (!entry->preempt_count)
+                trace_seq_puts(s, ".");
+        else
+                trace_seq_printf(s, "%x", entry->preempt_count);
+}
+/* Relative delays above this many microseconds are flagged with '!'. */
+unsigned long preempt_mark_thresh = 100;
+/*
+ * Print the timestamp column of a latency-format line into @s.
+ * @abs_usecs is printed as-is; @rel_usecs only selects the marker:
+ * '!' above preempt_mark_thresh, '+' above 1, ' ' otherwise.
+ */
+static void
+lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs,
+                    unsigned long rel_usecs)
+{
+        /* abs_usecs is unsigned, so use the unsigned conversion. */
+        trace_seq_printf(s, " %4lluus", abs_usecs);
+        if (rel_usecs > preempt_mark_thresh)
+                trace_seq_puts(s, "!: ");
+        else if (rel_usecs > 1)
+                trace_seq_puts(s, "+: ");
+        else
+                trace_seq_puts(s, " : ");
+}
+/* Lookup table turning a task state index into its one-letter code. */
+static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
+/*
+ * Format the current entry (iter->ent) in latency format into iter->seq.
+ * @trace_idx and @cpu are only printed; they do not select the entry.
+ * Always returns 1; results of the individual print calls are not checked.
+ */
+static int
+print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
+{
+        struct trace_seq *s = &iter->seq;
+        unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+        struct trace_entry *next_entry = find_next_entry(iter, NULL);
+        unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
+        struct trace_entry *entry = iter->ent;
+        unsigned long abs_usecs;
+        unsigned long rel_usecs;
+        char *comm;
+        int S, T;
+        int i;
+        unsigned state;
+        /* With no following entry the relative delay comes out as zero. */
+        if (!next_entry)
+                next_entry = entry;
+        /* rel: time until the next entry; abs: time since the trace start. */
+        rel_usecs = ns2usecs(next_entry->t - entry->t);
+        abs_usecs = ns2usecs(entry->t - iter->tr->time_start);
+        if (verbose) {
+                comm = trace_find_cmdline(entry->pid);
+                /*
+                 * NOTE(review): "[%08lx]" assumes ns2usecs() yields an
+                 * unsigned long; its prototype is not visible here - confirm.
+                 */
+                trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]"
+                                 " %ld.%03ldms (+%ld.%03ldms): ",
+                                 comm,
+                                 entry->pid, cpu, entry->flags,
+                                 entry->preempt_count, trace_idx,
+                                 ns2usecs(entry->t),
+                                 abs_usecs/1000,
+                                 abs_usecs % 1000, rel_usecs/1000,
+                                 rel_usecs % 1000);
+        } else {
+                lat_print_generic(s, entry, cpu);
+                lat_print_timestamp(s, abs_usecs, rel_usecs);
+        }
+        switch (entry->type) {
+        case TRACE_FN:
+                /* "function (parent)"; a kretprobe'd parent gets a fixed tag. */
+                seq_print_ip_sym(s, entry->fn.ip, sym_flags);
+                trace_seq_puts(s, " (");
+                if (kretprobed(entry->fn.parent_ip))
+                        trace_seq_puts(s, KRETPROBE_MSG);
+                else
+                        seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags);
+                trace_seq_puts(s, ")\n");
+                break;
+        case TRACE_CTX:
+        case TRACE_WAKE:
+                /*
+                 * next_state indexes state_to_char directly, while
+                 * prev_state is first reduced to (lowest set bit + 1).
+                 * Out-of-range values print as 'X'.
+                 */
+                T = entry->ctx.next_state < sizeof(state_to_char) ?
+                        state_to_char[entry->ctx.next_state] : 'X';
+                state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0;
+                S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
+                comm = trace_find_cmdline(entry->ctx.next_pid);
+                /* "==>" marks a context switch, "  +" a wakeup. */
+                trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n",
+                                 entry->ctx.prev_pid,
+                                 entry->ctx.prev_prio,
+                                 S, entry->type == TRACE_CTX ? "==>" : "  +",
+                                 entry->ctx.next_pid,
+                                 entry->ctx.next_prio,
+                                 T, comm);
+                break;
+        case TRACE_SPECIAL:
+                trace_seq_printf(s, "# %ld %ld %ld\n",
+                                 entry->special.arg1,
+                                 entry->special.arg2,
+                                 entry->special.arg3);
+                break;
+        case TRACE_STACK:
+                /* All FTRACE_STACK_ENTRIES slots are printed, " <= " separated. */
+                for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+                        if (i)
+                                trace_seq_puts(s, " <= ");
+                        seq_print_ip_sym(s, entry->stack.caller[i], sym_flags);
+                }
+                trace_seq_puts(s, "\n");
+                break;
+        default:
+                trace_seq_printf(s, "Unknown type %d\n", entry->type);
+        }
+        return 1;
+}
+/*
+ * Format the current entry (iter->ent) in the default trace format into
+ * iter->seq: "comm-pid [cpu] secs.usecs: " followed by a type-specific
+ * payload. Returns 0 as soon as any print call returns 0, otherwise 1.
+ * Entry types without a case below only get the common prefix.
+ */
+static int print_trace_fmt(struct trace_iterator *iter)
+{
+        struct trace_seq *s = &iter->seq;
+        unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+        struct trace_entry *entry;
+        unsigned long usec_rem;
+        unsigned long long t;
+        unsigned long secs;
+        char *comm;
+        int ret;
+        int S, T;
+        int i;
+        entry = iter->ent;
+        comm = trace_find_cmdline(iter->ent->pid);
+        /* Split the timestamp into whole seconds and leftover microseconds. */
+        t = ns2usecs(entry->t);
+        usec_rem = do_div(t, 1000000ULL);
+        secs = (unsigned long)t;
+        ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
+        if (!ret)
+                return 0;
+        ret = trace_seq_printf(s, "[%02d] ", iter->cpu);
+        if (!ret)
+                return 0;
+        ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
+        if (!ret)
+                return 0;
+        switch (entry->type) {
+        case TRACE_FN:
+                ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags);
+                if (!ret)
+                        return 0;
+                /* The parent is shown only on request and when recorded. */
+                if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
+                                                entry->fn.parent_ip) {
+                        ret = trace_seq_printf(s, " <-");
+                        if (!ret)
+                                return 0;
+                        if (kretprobed(entry->fn.parent_ip))
+                                ret = trace_seq_puts(s, KRETPROBE_MSG);
+                        else
+                                ret = seq_print_ip_sym(s, entry->fn.parent_ip,
+                                                       sym_flags);
+                        if (!ret)
+                                return 0;
+                }
+                ret = trace_seq_printf(s, "\n");
+                if (!ret)
+                        return 0;
+                break;
+        case TRACE_CTX:
+        case TRACE_WAKE:
+                /*
+                 * Both states index state_to_char directly here;
+                 * out-of-range values print as 'X'.
+                 */
+                S = entry->ctx.prev_state < sizeof(state_to_char) ?
+                        state_to_char[entry->ctx.prev_state] : 'X';
+                T = entry->ctx.next_state < sizeof(state_to_char) ?
+                        state_to_char[entry->ctx.next_state] : 'X';
+                /* "==>" marks a context switch, "  +" a wakeup. */
+                ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n",
+                                       entry->ctx.prev_pid,
+                                       entry->ctx.prev_prio,
+                                       S,
+                                       entry->type == TRACE_CTX ? "==>" : "  +",
+                                       entry->ctx.next_pid,
+                                       entry->ctx.next_prio,
+                                       T);
+                if (!ret)
+                        return 0;
+                break;
+        case TRACE_SPECIAL:
+                ret = trace_seq_printf(s, "# %ld %ld %ld\n",
+                                 entry->special.arg1,
+                                 entry->special.arg2,
+                                 entry->special.arg3);
+                if (!ret)
+                        return 0;
+                break;
+        case TRACE_STACK:
+                /* All FTRACE_STACK_ENTRIES slots are printed, " <= " separated. */
+                for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+                        if (i) {
+                                ret = trace_seq_puts(s, " <= ");
+                                if (!ret)
+                                        return 0;
+                        }
+                        ret = seq_print_ip_sym(s, entry->stack.caller[i],
+                                               sym_flags);
+                        if (!ret)
+                                return 0;
+                }
+                ret = trace_seq_puts(s, "\n");
+                if (!ret)
+                        return 0;
+                break;
+        }
+        return 1;
+}
+/*
+ * Format the current entry (iter->ent) in raw format into iter->seq:
+ * "pid cpu timestamp " followed by a type-specific, space separated
+ * payload. Returns 0 as soon as a print call returns 0, otherwise 1.
+ * Entry types without a case below only get the common prefix.
+ */
+static int print_raw_fmt(struct trace_iterator *iter)
+{
+        struct trace_seq *s = &iter->seq;
+        struct trace_entry *entry;
+        int ret;
+        int S, T;
+        entry = iter->ent;
+        ret = trace_seq_printf(s, "%d %d %llu ",
+                entry->pid, iter->cpu, entry->t);
+        if (!ret)
+                return 0;
+        switch (entry->type) {
+        case TRACE_FN:
+                /*
+                 * The addresses are handled as unsigned long everywhere
+                 * else in this file, so print them with %lx; a plain %x
+                 * mismatches the argument width on 64-bit.
+                 */
+                ret = trace_seq_printf(s, "%lx %lx\n",
+                                        (unsigned long)entry->fn.ip,
+                                        (unsigned long)entry->fn.parent_ip);
+                if (!ret)
+                        return 0;
+                break;
+        case TRACE_CTX:
+        case TRACE_WAKE:
+                /* Out-of-range states print as 'X'. */
+                S = entry->ctx.prev_state < sizeof(state_to_char) ?
+                        state_to_char[entry->ctx.prev_state] : 'X';
+                T = entry->ctx.next_state < sizeof(state_to_char) ?
+                        state_to_char[entry->ctx.next_state] : 'X';
+                /* A wakeup replaces the previous-state letter with '+'. */
+                if (entry->type == TRACE_WAKE)
+                        S = '+';
+                ret = trace_seq_printf(s, "%d %d %c %d %d %c\n",
+                                       entry->ctx.prev_pid,
+                                       entry->ctx.prev_prio,
+                                       S,
+                                       entry->ctx.next_pid,
+                                       entry->ctx.next_prio,
+                                       T);
+                if (!ret)
+                        return 0;
+                break;
+        case TRACE_SPECIAL:
+        case TRACE_STACK:
+                /* Stack entries are dumped through the special.arg* fields. */
+                ret = trace_seq_printf(s, "# %ld %ld %ld\n",
+                                 entry->special.arg1,
+                                 entry->special.arg2,
+                                 entry->special.arg3);
+                if (!ret)
+                        return 0;
+                break;
+        }
+        return 1;
+}
+/*
+ * Append the raw bytes of lvalue @x to trace_seq @s with
+ * trace_seq_putmem(). Makes the *calling function* return 0 when that
+ * call returns 0, so it may only be used in functions returning int.
+ */
+#define SEQ_PUT_FIELD_RET(s, x)                         \
+do {                                                    \
+        if (!trace_seq_putmem(s, &(x), sizeof(x)))      \
+                return 0;                               \
+} while (0)
+/*
+ * Same as SEQ_PUT_FIELD_RET(), but goes through trace_seq_putmem_hex().
+ */
+#define SEQ_PUT_HEX_FIELD_RET(s, x)                     \
+do {                                                    \
+        if (!trace_seq_putmem_hex(s, &(x), sizeof(x)))  \
+                return 0;                               \
+} while (0)
+/*
+ * Format the current entry (iter->ent) as hex fields into iter->seq:
+ * pid, cpu and timestamp, then a type-specific payload, then a newline.
+ * Returns 0 as soon as a field cannot be added (the SEQ_PUT_*_RET macros
+ * return from this function), otherwise 1.
+ */
+static int print_hex_fmt(struct trace_iterator *iter)
+{
+        struct trace_seq *s = &iter->seq;
+        unsigned char newline = '\n';
+        struct trace_entry *entry;
+        int S, T;
+        entry = iter->ent;
+        SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
+        SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
+        SEQ_PUT_HEX_FIELD_RET(s, entry->t);
+        switch (entry->type) {
+        case TRACE_FN:
+                SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip);
+                SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
+                break;
+        case TRACE_CTX:
+        case TRACE_WAKE:
+                /* Out-of-range states are reported as 'X'. */
+                S = entry->ctx.prev_state < sizeof(state_to_char) ?
+                        state_to_char[entry->ctx.prev_state] : 'X';
+                T = entry->ctx.next_state < sizeof(state_to_char) ?
+                        state_to_char[entry->ctx.next_state] : 'X';
+                /* A wakeup replaces the previous-state letter with '+'. */
+                if (entry->type == TRACE_WAKE)
+                        S = '+';
+                /*
+                 * Only ctx fields are emitted here: fn.parent_ip belongs
+                 * to TRACE_FN entries and carries no meaning for a
+                 * context switch or wakeup record.
+                 */
+                SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid);
+                SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio);
+                SEQ_PUT_HEX_FIELD_RET(s, S);
+                SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid);
+                SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio);
+                SEQ_PUT_HEX_FIELD_RET(s, T);
+                break;
+        case TRACE_SPECIAL:
+        case TRACE_STACK:
+                /* Stack entries are dumped through the special.arg* fields. */
+                SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1);
+                SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2);
+                SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3);
+                break;
+        }
+        /* The record terminator is written raw, not hex-encoded. */
+        SEQ_PUT_FIELD_RET(s, newline);
+        return 1;
+}
+/*
+ * Copy the current entry (iter->ent) field by field, as raw bytes, into
+ * iter->seq: pid, cpu and timestamp followed by a type-specific payload.
+ * Returns 0 as soon as a field cannot be added (SEQ_PUT_FIELD_RET returns
+ * from this function), otherwise 1.
+ */
+static int print_bin_fmt(struct trace_iterator *iter)
+{
+        struct trace_seq *s = &iter->seq;
+        struct trace_entry *entry;
+        entry = iter->ent;
+        SEQ_PUT_FIELD_RET(s, entry->pid);
+        /* The CPU is taken from the entry here, not from iter->cpu. */
+        SEQ_PUT_FIELD_RET(s, entry->cpu);
+        SEQ_PUT_FIELD_RET(s, entry->t);
+        switch (entry->type) {
+        case TRACE_FN:
+                SEQ_PUT_FIELD_RET(s, entry->fn.ip);
+                SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip);
+                break;
+        /*
+         * NOTE(review): TRACE_WAKE has no case here, so wakeup entries get
+         * only the common prefix, unlike in the raw and hex formats -
+         * confirm whether that is intended.
+         */
+        case TRACE_CTX:
+                SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid);
+                SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio);
+                SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state);
+                SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid);
+                SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio);
+                SEQ_PUT_FIELD_RET(s, entry->ctx.next_state);
+                break;
+        case TRACE_SPECIAL:
+        case TRACE_STACK:
+                /* Stack entries are dumped through the special.arg* fields. */
+                SEQ_PUT_FIELD_RET(s, entry->special.arg1);
+                SEQ_PUT_FIELD_RET(s, entry->special.arg2);
+                SEQ_PUT_FIELD_RET(s, entry->special.arg3);
+                break;
+        }
+        return 1;
+}
+/*
+ * Return 1 when no tracing CPU of iter->tr holds unread entries, 0
+ * otherwise. A CPU counts as non-empty when it has a head page, a
+ * non-zero trace_idx and its tail position differs from its head position.
+ */
+static int trace_empty(struct trace_iterator *iter)
+{
+        int cpu;
+        for_each_tracing_cpu(cpu) {
+                struct trace_array_cpu *data = iter->tr->data[cpu];
+                if (!head_page(data) || !data->trace_idx)
+                        continue;
+                if (data->trace_tail != data->trace_head)
+                        return 0;
+                if (data->trace_tail_idx != data->trace_head_idx)
+                        return 0;
+        }
+        return 1;
+}
+/*
+ * Format the current entry of @iter into iter->seq with the first
+ * formatter that applies: the tracer's own print_line callback, then the
+ * binary, hex and raw formats selected in trace_flags, then the latency
+ * format for latency files, and the default format otherwise.
+ * Returns whatever the chosen formatter returns.
+ */
+static int print_trace_line(struct trace_iterator *iter)
+{
+        struct tracer *custom = iter->trace;
+        if (custom && custom->print_line)
+                return custom->print_line(iter);
+        if (trace_flags & TRACE_ITER_BIN)
+                return print_bin_fmt(iter);
+        if (trace_flags & TRACE_ITER_HEX)
+                return print_hex_fmt(iter);
+        if (trace_flags & TRACE_ITER_RAW)
+                return print_raw_fmt(iter);
+        if (!(iter->iter_flags & TRACE_FILE_LAT_FMT))
+                return print_trace_fmt(iter);
+        return print_lat_fmt(iter, iter->idx, iter->cpu);
+}
+/*
+ * seq_file ->show callback of the trace files. @v is the trace_iterator.
+ * With a current entry, that entry is formatted and copied to @m.
+ * Without one, the file headers are written instead; a latency-format
+ * file whose buffers are empty gets only the "# tracer:" lines.
+ * Always returns 0.
+ */
+static int s_show(struct seq_file *m, void *v)
+{
+        struct trace_iterator *iter = v;
+        if (iter->ent != NULL) {
+                print_trace_line(iter);
+                trace_print_seq(m, &iter->seq);
+                return 0;
+        }
+        if (iter->tr) {
+                seq_printf(m, "# tracer: %s\n", iter->trace->name);
+                seq_puts(m, "#\n");
+        }
+        if (!(iter->iter_flags & TRACE_FILE_LAT_FMT)) {
+                if (!(trace_flags & TRACE_ITER_VERBOSE))
+                        print_func_help_header(m);
+                return 0;
+        }
+        /* print nothing if the buffers are empty */
+        if (trace_empty(iter))
+                return 0;
+        print_trace_header(m, iter);
+        if (!(trace_flags & TRACE_ITER_VERBOSE))
+                print_lat_help_header(m);
+        return 0;
+}
+/* seq_file callbacks installed by __tracing_open() for the trace files. */
+static struct seq_operations tracer_seq_ops = {
+        .start          = s_start,
+        .next           = s_next,
+        .stop           = s_stop,
+        .show           = s_show,
+};
+/*
+ * Common open path of the trace files.
+ * Allocates a trace_iterator, binds it to max_tr when the current tracer
+ * has print_max set and to inode->i_private otherwise, opens the seq_file
+ * and stores the iterator in its private pointer. While the file is open,
+ * tracing is switched off if the trace array's ctrl is set.
+ * Returns the iterator with *ret == 0, or NULL with *ret holding -ENODEV
+ * (tracing disabled), -ENOMEM, or the seq_open() error.
+ */
+static struct trace_iterator *
+__tracing_open(struct inode *inode, struct file *file, int *ret)
+{
+        struct trace_iterator *iter;
+        struct seq_file *m;
+        if (tracing_disabled) {
+                *ret = -ENODEV;
+                return NULL;
+        }
+        iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+        if (!iter) {
+                *ret = -ENOMEM;
+                return NULL;
+        }
+        mutex_lock(&trace_types_lock);
+        iter->tr = (current_trace && current_trace->print_max) ?
+                &max_tr : inode->i_private;
+        iter->trace = current_trace;
+        iter->pos = -1;
+        /* TODO stop tracer */
+        *ret = seq_open(file, &tracer_seq_ops);
+        if (*ret) {
+                kfree(iter);
+                iter = NULL;
+                goto unlock;
+        }
+        m = file->private_data;
+        m->private = iter;
+        /* stop the trace while dumping */
+        if (iter->tr->ctrl) {
+                tracer_enabled = 0;
+                ftrace_function_enabled = 0;
+        }
+        if (iter->trace && iter->trace->open)
+                iter->trace->open(iter);
+ unlock:
+        mutex_unlock(&trace_types_lock);
+        return iter;
+}
+/*
+ * Generic ->open: hand inode->i_private to the file as private_data.
+ * Returns 0, or -ENODEV when tracing is disabled.
+ */
+int tracing_open_generic(struct inode *inode, struct file *filp)
+{
+        if (!tracing_disabled) {
+                filp->private_data = inode->i_private;
+                return 0;
+        }
+        return -ENODEV;
+}
+/*
+ * ->release of the trace files: run the tracer's close callback, switch
+ * tracing back on when the trace array's ctrl is set, then release the
+ * seq_file and free the iterator. Always returns 0.
+ */
+int tracing_release(struct inode *inode, struct file *file)
+{
+        struct seq_file *seq = file->private_data;
+        struct trace_iterator *iter = seq->private;
+        struct tracer *tracer;
+        mutex_lock(&trace_types_lock);
+        tracer = iter->trace;
+        if (tracer && tracer->close)
+                tracer->close(iter);
+        /* reenable tracing if it was previously enabled */
+        if (iter->tr->ctrl) {
+                tracer_enabled = 1;
+                /*
+                 * It is safe to enable function tracing even if it
+                 * isn't used
+                 */
+                ftrace_function_enabled = 1;
+        }
+        mutex_unlock(&trace_types_lock);
+        seq_release(inode, file);
+        kfree(iter);
+        return 0;
+}
+/* ->open of the plain trace file; returns the status of __tracing_open(). */
+static int tracing_open(struct inode *inode, struct file *file)
+{
+        int status;
+        (void)__tracing_open(inode, file, &status);
+        return status;
+}
+/*
+ * ->open of the latency trace file: like tracing_open(), but on success
+ * the iterator is additionally flagged with TRACE_FILE_LAT_FMT.
+ */
+static int tracing_lt_open(struct inode *inode, struct file *file)
+{
+        int status;
+        struct trace_iterator *iter = __tracing_open(inode, file, &status);
+        if (status)
+                return status;
+        iter->iter_flags |= TRACE_FILE_LAT_FMT;
+        return 0;
+}
+/*
+ * seq_file ->next for the tracer list: bump *pos and move the cursor kept
+ * in m->private to the following tracer. Returns the new cursor, NULL at
+ * the end of the list.
+ */
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+        struct tracer *cur = m->private;
+        struct tracer *following = cur ? cur->next : NULL;
+        ++*pos;
+        m->private = following;
+        return following;
+}
+/*
+ * seq_file ->start for the tracer list: take trace_types_lock (dropped
+ * again in t_stop()) and advance the cursor in m->private by up to *pos
+ * steps. Returns the resulting tracer, or NULL when the list is exhausted.
+ */
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+        struct tracer *t = m->private;
+        loff_t skipped = 0;
+        mutex_lock(&trace_types_lock);
+        while (t && skipped < *pos)
+                t = t_next(m, t, &skipped);
+        return t;
+}
+/* seq_file ->stop for the tracer list: drop the lock taken in t_start(). */
+static void t_stop(struct seq_file *m, void *p)
+{
+        mutex_unlock(&trace_types_lock);
+}
+/*
+ * seq_file ->show for the tracer list: print the tracer's name followed
+ * by a space, or by a newline for the last tracer. Always returns 0.
+ */
+static int t_show(struct seq_file *m, void *v)
+{
+        struct tracer *t = v;
+        if (t) {
+                seq_printf(m, "%s", t->name);
+                seq_putc(m, t->next ? ' ' : '\n');
+        }
+        return 0;
+}
+/* seq_file callbacks installed by show_traces_open() for the tracer list. */
+static struct seq_operations show_traces_seq_ops = {
+        .start          = t_start,
+        .next           = t_next,
+        .stop           = t_stop,
+        .show           = t_show,
+};
+/*
+ * ->open of the tracer list file: open a seq_file whose cursor starts at
+ * the head of trace_types. Returns -ENODEV when tracing is disabled,
+ * otherwise the result of seq_open().
+ */
+static int show_traces_open(struct inode *inode, struct file *file)
+{
+        struct seq_file *m;
+        int err;
+        if (tracing_disabled)
+                return -ENODEV;
+        err = seq_open(file, &show_traces_seq_ops);
+        if (err)
+                return err;
+        m = file->private_data;
+        m->private = trace_types;
+        return 0;
+}
+/* File operations of the trace file in the default format. */
+static struct file_operations tracing_fops = {
+        .open           = tracing_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = tracing_release,
+};
+/* File operations of the trace file in latency format. */
+static struct file_operations tracing_lt_fops = {
+        .open           = tracing_lt_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = tracing_release,
+};
+/* File operations of the tracer list file (no llseek is provided). */
+static struct file_operations show_traces_fops = {
+        .open           = show_traces_open,
+        .read           = seq_read,
+        .release        = seq_release,
+};
+/*
+ * Only trace on a CPU if its bit is set in this mask
+ * (all CPUs by default):
+ */
+static cpumask_t tracing_cpumask = CPU_MASK_ALL;
+/*
+ * When the tracing cpumask file is written, tracing_cpumask_write()
+ * parses the user's mask into this variable before installing it
+ * as tracing_cpumask:
+ */
+static cpumask_t tracing_cpumask_new;
+/*
+ * The tracer itself will not take this lock, but still we want
+ * to provide a consistent cpumask to user-space; it serializes
+ * tracing_cpumask_read() and tracing_cpumask_write():
+ */
+static DEFINE_MUTEX(tracing_cpumask_update_lock);
+/*
+ * Temporary storage for the character representation of the
+ * CPU bitmask (plus two more bytes for the newline and the
+ * terminating NUL):
+ */
+static char mask_str[NR_CPUS + 2];
+/*
+ * ->read of the tracing cpumask file: format tracing_cpumask as text plus
+ * a newline and copy it to user space.
+ * Returns the number of bytes copied, -EINVAL when @count leaves no room
+ * for the mask and its newline, or the simple_read_from_buffer() error.
+ */
+static ssize_t
+tracing_cpumask_read(struct file *filp, char __user *ubuf,
+                     size_t count, loff_t *ppos)
+{
+        /* Keep one byte of mask_str in reserve for the newline. */
+        size_t room = sizeof(mask_str) - 1;
+        int len;
+        /* Never format more than either the caller or mask_str can hold. */
+        if (count < room)
+                room = count;
+        mutex_lock(&tracing_cpumask_update_lock);
+        len = cpumask_scnprintf(mask_str, room, tracing_cpumask);
+        if (count - len < 2) {
+                count = -EINVAL;
+                goto out_err;
+        }
+        len += sprintf(mask_str + len, "\n");
+        /* Expose only what was formatted, not the whole static buffer. */
+        count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
+out_err:
+        mutex_unlock(&tracing_cpumask_update_lock);
+        return count;
+}
+/*
+ * ->write of the tracing cpumask file: parse a CPU mask from user space
+ * and install it as tracing_cpumask, adjusting the per-CPU "disabled"
+ * counters of global_trace for every CPU whose bit changes.
+ * Returns @count on success or the cpumask_parse_user() error.
+ */
+static ssize_t
+tracing_cpumask_write(struct file *filp, const char __user *ubuf,
+                      size_t count, loff_t *ppos)
+{
+        int err, cpu;
+        mutex_lock(&tracing_cpumask_update_lock);
+        err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
+        if (err)
+                goto err_unlock;
+        /* The counters are updated with irqs off and ftrace_max_lock held. */
+        raw_local_irq_disable();
+        __raw_spin_lock(&ftrace_max_lock);
+        for_each_tracing_cpu(cpu) {
+                /*
+                 * Increase/decrease the disabled counter if we are
+                 * about to flip a bit in the cpumask:
+                 */
+                if (cpu_isset(cpu, tracing_cpumask) &&
+                                !cpu_isset(cpu, tracing_cpumask_new)) {
+                        atomic_inc(&global_trace.data[cpu]->disabled);
+                }
+                if (!cpu_isset(cpu, tracing_cpumask) &&
+                                cpu_isset(cpu, tracing_cpumask_new)) {
+                        atomic_dec(&global_trace.data[cpu]->disabled);
+                }
+        }
+        __raw_spin_unlock(&ftrace_max_lock);
+        raw_local_irq_enable();
+        /* Install the new mask; still under tracing_cpumask_update_lock. */
+        tracing_cpumask = tracing_cpumask_new;
+        mutex_unlock(&tracing_cpumask_update_lock);
+        return count;
+err_unlock:
+        mutex_unlock(&tracing_cpumask_update_lock);
+        return err;
+}
+/* File operations of the tracing cpumask file. */
+static struct file_operations tracing_cpumask_fops = {
+        .open           = tracing_open_generic,
+        .read           = tracing_cpumask_read,
+        .write          = tracing_cpumask_write,
+};
+/*
+ * ->read of the iter_ctrl file: list every entry of trace_options,
+ * prefixed with "no" when its bit in trace_flags is clear, separated by
+ * spaces and terminated by a newline.
+ * Returns the number of bytes copied, -ENOMEM, or the
+ * simple_read_from_buffer() error.
+ */
+static ssize_t
+tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
+                       size_t cnt, loff_t *ppos)
+{
+        char *buf;
+        int used = 0;
+        int max_len = 0;
+        int i;
+        /* calculate max size: each option may need "no" and a space */
+        for (i = 0; trace_options[i]; i++)
+                max_len += strlen(trace_options[i]) + 3;
+        /* +2 for \n and \0 */
+        buf = kmalloc(max_len + 2, GFP_KERNEL);
+        if (!buf)
+                return -ENOMEM;
+        for (i = 0; trace_options[i]; i++) {
+                if (!(trace_flags & (1 << i)))
+                        used += sprintf(buf + used, "no%s ", trace_options[i]);
+                else
+                        used += sprintf(buf + used, "%s ", trace_options[i]);
+        }
+        used += sprintf(buf + used, "\n");
+        WARN_ON(used >= max_len + 2);
+        used = simple_read_from_buffer(ubuf, cnt, ppos, buf, used);
+        kfree(buf);
+        return used;
+}
+/*
+ * ->write of the iter_ctrl file: set or clear one bit of trace_flags.
+ * The input is an option name from trace_options, optionally prefixed
+ * with "no" to clear the option instead of setting it.
+ * Returns @cnt on success, -EINVAL when the input does not fit the 64
+ * byte buffer or matches no option, -EFAULT when the copy from user
+ * space fails.
+ */
+static ssize_t
+tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
+                        size_t cnt, loff_t *ppos)
+{
+        char buf[64];
+        char *cmp = buf;
+        int neg = 0;
+        int i;
+        /* Leave room for the terminating NUL added below. */
+        if (cnt >= sizeof(buf))
+                return -EINVAL;
+        if (copy_from_user(&buf, ubuf, cnt))
+                return -EFAULT;
+        buf[cnt] = 0;
+        if (strncmp(buf, "no", 2) == 0) {
+                neg = 1;
+                cmp += 2;
+        }
+        for (i = 0; trace_options[i]; i++) {
+                int len = strlen(trace_options[i]);
+                /*
+                 * Only the first len characters are compared, so input
+                 * that merely starts with an option name (for instance
+                 * the name followed by a newline) matches as well.
+                 */
+                if (strncmp(cmp, trace_options[i], len) == 0) {
+                        if (neg)
+                                trace_flags &= ~(1 << i);
+                        else
+                                trace_flags |= (1 << i);
+                        break;
+                }
+        }
+        /*
+         * If no option could be set, return an error:
+         */
+        if (!trace_options[i])
+                return -EINVAL;
+        /* The position is advanced through filp->f_pos; @ppos is unused. */
+        filp->f_pos += cnt;
+        return cnt;
+}
+/* File operations of the iter_ctrl file. */
+static struct file_operations tracing_iter_fops = {
+        .open           = tracing_open_generic,
+        .read           = tracing_iter_ctrl_read,
+        .write          = tracing_iter_ctrl_write,
+};
+/*
+ * Text returned by tracing_readme_read(). Lines starting with "# " are
+ * shell commands, the other lines show their expected output.
+ */
+static const char readme_msg[] =
+        "tracing mini-HOWTO:\n\n"
+        "# mkdir /debug\n"
+        "# mount -t debugfs nodev /debug\n\n"
+        "# cat /debug/tracing/available_tracers\n"
+        "wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n"
+        "# cat /debug/tracing/current_tracer\n"
+        "none\n"
+        "# echo sched_switch > /debug/tracing/current_tracer\n"
+        "# cat /debug/tracing/current_tracer\n"
+        "sched_switch\n"
+        "# cat /debug/tracing/iter_ctrl\n"
+        "noprint-parent nosym-offset nosym-addr noverbose\n"
+        "# echo print-parent > /debug/tracing/iter_ctrl\n"
+        "# echo 1 > /debug/tracing/tracing_enabled\n"
+        "# cat /debug/tracing/trace > /tmp/trace.txt\n"
+        "# echo 0 > /debug/tracing/tracing_enabled\n"
+;
+/*
+ * ->read of the README file: serve readme_msg, without its terminating
+ * NUL, through simple_read_from_buffer().
+ */
+static ssize_t
+tracing_readme_read(struct file *filp, char __user *ubuf,
+                       size_t cnt, loff_t *ppos)
+{
+        size_t msg_len = strlen(readme_msg);
+        return simple_read_from_buffer(ubuf, cnt, ppos, readme_msg, msg_len);
+}
+/* File operations of the read-only README file. */
+static struct file_operations tracing_readme_fops = {
+        .open           = tracing_open_generic,
+        .read           = tracing_readme_read,
+};
+/*
+ * ->read of the tracing control file: report the ctrl value of the trace
+ * array stored in filp->private_data as a decimal number plus newline.
+ */
+static ssize_t
+tracing_ctrl_read(struct file *filp, char __user *ubuf,
+                  size_t cnt, loff_t *ppos)
+{
+        struct trace_array *tr = filp->private_data;
+        char text[64];
+        int len = sprintf(text, "%ld\n", tr->ctrl);
+        return simple_read_from_buffer(ubuf, cnt, ppos, text, len);
+}
+/*
+ * ->write of the tracing control file: parse a decimal number, reduce it
+ * to 0 or 1 and, when that differs from the trace array's current ctrl
+ * value, update tracer_enabled and tr->ctrl and notify the current tracer
+ * through its ctrl_update callback.
+ * Returns @cnt on success, -EINVAL when the input does not fit the 64
+ * byte buffer, -EFAULT when the copy from user space fails, or the
+ * strict_strtoul() error.
+ */
+static ssize_t
+tracing_ctrl_write(struct file *filp, const char __user *ubuf,
+                   size_t cnt, loff_t *ppos)
+{
+        struct trace_array *tr = filp->private_data;
+        char buf[64];
+        /* strict_strtoul() stores its result through an unsigned long *. */
+        unsigned long val;
+        int ret;
+        /* Leave room for the terminating NUL added below. */
+        if (cnt >= sizeof(buf))
+                return -EINVAL;
+        if (copy_from_user(&buf, ubuf, cnt))
+                return -EFAULT;
+        buf[cnt] = 0;
+        ret = strict_strtoul(buf, 10, &val);
+        if (ret < 0)
+                return ret;
+        /* Any non-zero value means "enabled". */
+        val = !!val;
+        mutex_lock(&trace_types_lock);
+        /* Only act when the requested state differs from the current one. */
+        if (tr->ctrl ^ val) {
+                if (val)
+                        tracer_enabled = 1;
+                else
+                        tracer_enabled = 0;
+                tr->ctrl = val;
+                if (current_trace && current_trace->ctrl_update)
+                        current_trace->ctrl_update(tr);
+        }
+        mutex_unlock(&trace_types_lock);
+        filp->f_pos += cnt;
+        return cnt;
+}
+/*
+ * Read handler reporting the active tracer: current_trace->name followed
+ * by a newline, or a bare newline when no tracer is set.
+ */
+static ssize_t
+tracing_set_trace_read(struct file *filp, char __user *ubuf,
+                       size_t cnt, loff_t *ppos)
+{
+        char name[max_tracer_type_len+2];
+        int len;
+        /* sample current_trace under the lock that writers take */
+        mutex_lock(&trace_types_lock);
+        if (!current_trace)
+                len = sprintf(name, "\n");
+        else
+                len = sprintf(name, "%s\n", current_trace->name);
+        mutex_unlock(&trace_types_lock);
+        return simple_read_from_buffer(ubuf, cnt, ppos, name, len);
+}
+/*
+ * Write handler selecting the active tracer by name.
+ *
+ * At most max_tracer_type_len bytes of input are considered and trailing
+ * whitespace is stripped.  If the name matches a registered tracer other
+ * than the current one, the current tracer's ->reset() is called, the
+ * new tracer is installed and its ->init() is called.  An unknown name,
+ * or the name of the tracer already in use, is accepted without effect.
+ *
+ * Returns the (possibly truncated) byte count, or -EFAULT.
+ */
+static ssize_t
+tracing_set_trace_write(struct file *filp, const char __user *ubuf,
+                        size_t cnt, loff_t *ppos)
+{
+        struct trace_array *tr = &global_trace;
+        char name[max_tracer_type_len+1];
+        struct tracer *found;
+        int last;
+        if (cnt > max_tracer_type_len)
+                cnt = max_tracer_type_len;
+        if (copy_from_user(&name, ubuf, cnt))
+                return -EFAULT;
+        name[cnt] = 0;
+        /* strip ending whitespace. */
+        last = cnt - 1;
+        while (last > 0 && isspace(name[last])) {
+                name[last] = 0;
+                last--;
+        }
+        mutex_lock(&trace_types_lock);
+        /* walk the registered tracers until the name matches */
+        found = trace_types;
+        while (found && strcmp(found->name, name) != 0)
+                found = found->next;
+        if (found && found != current_trace) {
+                if (current_trace && current_trace->reset)
+                        current_trace->reset(tr);
+                current_trace = found;
+                if (found->init)
+                        found->init(tr);
+        }
+        mutex_unlock(&trace_types_lock);
+        filp->f_pos += cnt;
+        return cnt;
+}
+/*
+ * Read handler for a latency value kept in nanoseconds behind
+ * filp->private_data.  The value is shown converted by nsecs_to_usecs(),
+ * except that an all-ones value is shown as -1.
+ */
+static ssize_t
+tracing_max_lat_read(struct file *filp, char __user *ubuf,
+                     size_t cnt, loff_t *ppos)
+{
+        unsigned long *lat = filp->private_data;
+        char out[64];
+        int len;
+        len = snprintf(out, sizeof(out), "%ld\n",
+                       *lat == (unsigned long)-1 ? -1 : nsecs_to_usecs(*lat));
+        /* snprintf() reports the untruncated length; clamp it */
+        if (len > sizeof(out))
+                len = sizeof(out);
+        return simple_read_from_buffer(ubuf, cnt, ppos, out, len);
+}
+/*
+ * Write handler for a latency value.  The user supplies an unsigned
+ * decimal number which is multiplied by 1000 before being stored in the
+ * unsigned long behind filp->private_data (the read side converts the
+ * stored value back with nsecs_to_usecs()).
+ *
+ * Returns cnt on success, -EINVAL if the input does not fit the local
+ * buffer, -EFAULT if it cannot be copied from userspace, or the error
+ * returned by strict_strtoul().
+ */
+static ssize_t
+tracing_max_lat_write(struct file *filp, const char __user *ubuf,
+                      size_t cnt, loff_t *ppos)
+{
+        /* same type as tracing_max_lat_read() uses for this datum */
+        unsigned long *ptr = filp->private_data;
+        char buf[64];
+        /* strict_strtoul() stores through an unsigned long pointer */
+        unsigned long val;
+        int ret;
+        /* leave room for the terminating NUL added below */
+        if (cnt >= sizeof(buf))
+                return -EINVAL;
+        if (copy_from_user(&buf, ubuf, cnt))
+                return -EFAULT;
+        buf[cnt] = 0;
+        ret = strict_strtoul(buf, 10, &val);
+        if (ret < 0)
+                return ret;
+        *ptr = val * 1000;
+        return cnt;
+}
+/* Count of pipe openers; tracing_open_pipe() admits only one at a time. */
+static atomic_t tracing_reader;
+/*
+ * Open handler of tracing_pipe_fops.
+ *
+ * Claims the single reader slot, allocates a zeroed trace_iterator bound
+ * to global_trace and the current tracer, stores it in
+ * filp->private_data and calls the tracer's ->pipe_open() hook if set.
+ *
+ * Returns 0 on success, -ENODEV when tracing is disabled, -EBUSY when a
+ * reader already holds the pipe, or -ENOMEM if the iterator cannot be
+ * allocated.
+ */
+static int tracing_open_pipe(struct inode *inode, struct file *filp)
+{
+        struct trace_iterator *iter;
+        if (tracing_disabled)
+                return -ENODEV;
+        /* We only allow one reader of the pipe */
+        if (atomic_inc_return(&tracing_reader) != 1) {
+                atomic_dec(&tracing_reader);
+                return -EBUSY;
+        }
+        /* create a buffer to store the information to pass to userspace */
+        iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+        if (!iter) {
+                /*
+                 * Release the reader slot again: ->release is not run for
+                 * a failed open, so keeping the count would make every
+                 * later open fail with -EBUSY.
+                 */
+                atomic_dec(&tracing_reader);
+                return -ENOMEM;
+        }
+        mutex_lock(&trace_types_lock);
+        iter->tr = &global_trace;
+        iter->trace = current_trace;
+        filp->private_data = iter;
+        if (iter->trace->pipe_open)
+                iter->trace->pipe_open(iter);
+        mutex_unlock(&trace_types_lock);
+        return 0;
+}
+/*
+ * Release handler of the pipe: frees the iterator kept in
+ * file->private_data and gives the reader slot back.
+ */
+static int tracing_release_pipe(struct inode *inode, struct file *file)
+{
+        kfree(file->private_data);
+        atomic_dec(&tracing_reader);
+        return 0;
+}
+/*
+ * Poll handler of the pipe.  With TRACE_ITER_BLOCK set the file always
+ * polls readable; otherwise it is readable only while the iterator has
+ * entries, checked both before and after registering on trace_wait.
+ */
+static unsigned int
+tracing_poll_pipe(struct file *filp, poll_table *poll_table)
+{
+        struct trace_iterator *iter = filp->private_data;
+        const unsigned int readable = POLLIN | POLLRDNORM;
+        /* Always select as readable when in blocking mode */
+        if (trace_flags & TRACE_ITER_BLOCK)
+                return readable;
+        if (!trace_empty(iter))
+                return readable;
+        poll_wait(filp, &trace_wait, poll_table);
+        return trace_empty(iter) ? 0 : readable;
+}
+/*
+ * Consumer reader.
+ *
+ * Read handler of the pipe.  Entries that are formatted for the caller
+ * are consumed (trace_consume()) as they are printed.
+ *
+ * Flow: first hand out whatever is left in iter->seq from an earlier
+ * call; then give the tracer's own ->read() hook a chance; then wait
+ * (polling every HZ/10 ticks) until entries exist; finally format as
+ * many whole lines as fit into iter->seq and copy them to userspace.
+ *
+ * Returns the value of trace_seq_to_user() (with -EBUSY mapped to 0),
+ * a non-zero result from the tracer's ->read(), -EAGAIN for an empty
+ * buffer with O_NONBLOCK, -EINTR when a signal is pending, or 0 when
+ * there is nothing (more) to read.
+ */
+static ssize_t
+tracing_read_pipe(struct file *filp, char __user *ubuf,
+                  size_t cnt, loff_t *ppos)
+{
+        struct trace_iterator *iter = filp->private_data;
+        struct trace_array_cpu *data;
+        /*
+         * NOTE(review): static storage shared by all callers; presumably
+         * safe because tracing_open_pipe() admits one reader - confirm.
+         */
+        static cpumask_t mask;
+        unsigned long flags;
+#ifdef CONFIG_FTRACE
+        int ftrace_save;
+#endif
+        int cpu;
+        ssize_t sret;
+        /* return any leftover data */
+        sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+        /* anything but -EBUSY is handed straight back to the caller */
+        if (sret != -EBUSY)
+                return sret;
+        sret = 0;
+        trace_seq_reset(&iter->seq);
+        mutex_lock(&trace_types_lock);
+        /* a tracer-specific read hook that returns non-zero ends the call */
+        if (iter->trace->read) {
+                sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
+                if (sret)
+                        goto out;
+        }
+        while (trace_empty(iter)) {
+                if ((filp->f_flags & O_NONBLOCK)) {
+                        sret = -EAGAIN;
+                        goto out;
+                }
+                /*
+                 * This is a make-shift waitqueue. The reason we don't use
+                 * an actual wait queue is because:
+                 *  1) we only ever have one waiter
+                 *  2) the tracing, traces all functions, we don't want
+                 *     the overhead of calling wake_up and friends
+                 *     (and tracing them too)
+                 *     Anyway, this is really very primitive wakeup.
+                 */
+                set_current_state(TASK_INTERRUPTIBLE);
+                iter->tr->waiter = current;
+                /* the mutex is dropped for the duration of the sleep */
+                mutex_unlock(&trace_types_lock);
+                /* sleep for 100 msecs, and try again. */
+                schedule_timeout(HZ/10);
+                mutex_lock(&trace_types_lock);
+                iter->tr->waiter = NULL;
+                if (signal_pending(current)) {
+                        sret = -EINTR;
+                        goto out;
+                }
+                /* the tracer was switched while we slept: return 0 */
+                if (iter->trace != current_trace)
+                        goto out;
+                /*
+                 * We block until we read something and tracing is disabled.
+                 * We still block if tracing is disabled, but we have never
+                 * read anything. This allows a user to cat this file, and
+                 * then enable tracing. But after we have read something,
+                 * we give an EOF when tracing is again disabled.
+                 *
+                 * iter->pos will be 0 if we haven't read anything.
+                 */
+                if (!tracer_enabled && iter->pos)
+                        break;
+                continue;
+        }
+        /* stop when tracing is finished */
+        if (trace_empty(iter))
+                goto out;
+        if (cnt >= PAGE_SIZE)
+                cnt = PAGE_SIZE - 1;
+        /* reset all but tr, trace, and overruns */
+        memset(&iter->seq, 0,
+               sizeof(struct trace_iterator) -
+               offsetof(struct trace_iterator, seq));
+        iter->pos = -1;
+        /*
+         * We need to stop all tracing on all CPUS to read
+         * the next buffer. This is a bit expensive, but is
+         * not done often. We fill all what we can read,
+         * and then release the locks again.
+         */
+        cpus_clear(mask);
+        local_irq_save(flags);
+#ifdef CONFIG_FTRACE
+        ftrace_save = ftrace_enabled;
+        ftrace_enabled = 0;
+#endif
+        smp_wmb();
+        /* disable recording on every CPU that has data and remember it */
+        for_each_tracing_cpu(cpu) {
+                data = iter->tr->data[cpu];
+                if (!head_page(data) || !data->trace_idx)
+                        continue;
+                atomic_inc(&data->disabled);
+                cpu_set(cpu, mask);
+        }
+        /* lock those CPUs' buffers and fold new overruns into the iterator */
+        for_each_cpu_mask(cpu, mask) {
+                data = iter->tr->data[cpu];
+                __raw_spin_lock(&data->lock);
+                if (data->overrun > iter->last_overrun[cpu])
+                        iter->overrun[cpu] +=
+                                data->overrun - iter->last_overrun[cpu];
+                iter->last_overrun[cpu] = data->overrun;
+        }
+        /* format entries until one does not fit or cnt bytes are queued */
+        while (find_next_entry_inc(iter) != NULL) {
+                int ret;
+                int len = iter->seq.len;
+                ret = print_trace_line(iter);
+                if (!ret) {
+                        /* don't print partial lines */
+                        iter->seq.len = len;
+                        break;
+                }
+                trace_consume(iter);
+                if (iter->seq.len >= cnt)
+                        break;
+        }
+        /* undo the locking and disabling done above, in reverse order */
+        for_each_cpu_mask(cpu, mask) {
+                data = iter->tr->data[cpu];
+                __raw_spin_unlock(&data->lock);
+        }
+        for_each_cpu_mask(cpu, mask) {
+                data = iter->tr->data[cpu];
+                atomic_dec(&data->disabled);
+        }
+#ifdef CONFIG_FTRACE
+        ftrace_enabled = ftrace_save;
+#endif
+        local_irq_restore(flags);
+        /* Now copy what we have to the user */
+        sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+        if (iter->seq.readpos >= iter->seq.len)
+                trace_seq_reset(&iter->seq);
+        if (sret == -EBUSY)
+                sret = 0;
+out:
+        mutex_unlock(&trace_types_lock);
+        return sret;
+}
+/*
+ * Read handler reporting tr->entries of the trace_array stored in
+ * filp->private_data, as an unsigned decimal number plus newline.
+ */
+static ssize_t
+tracing_entries_read(struct file *filp, char __user *ubuf,
+                     size_t cnt, loff_t *ppos)
+{
+        char out[64];
+        struct trace_array *tr = filp->private_data;
+        int len = sprintf(out, "%lu\n", tr->entries);
+        return simple_read_from_buffer(ubuf, cnt, ppos, out, len);
+}
+/*
+ * Write handler that resizes the trace buffers to hold the number of
+ * entries given by the user (rounded to whole pages).
+ *
+ * Resizing is refused with -EBUSY unless the current tracer is
+ * no_tracer.  Growing is limited to a quarter of the memory reported by
+ * determine_dirtyable_memory() plus what tracing already holds; pages
+ * are then added with trace_alloc_page() or removed with
+ * trace_free_page() one step at a time.
+ *
+ * Error codes are carried in cnt (a size_t) and become negative again
+ * through the ssize_t return type.
+ *
+ * Returns cnt on success, -EINVAL for bad or zero input, -EFAULT on a
+ * failed copy from userspace, -EBUSY while a tracer is active, or
+ * -ENOMEM when the request is too large, allocation fails, or the page
+ * check left tracing disabled.
+ */
+static ssize_t
+tracing_entries_write(struct file *filp, const char __user *ubuf,
+                      size_t cnt, loff_t *ppos)
+{
+        unsigned long val;
+        char buf[64];
+        int i, ret;
+        /* leave room for the terminating NUL added below */
+        if (cnt >= sizeof(buf))
+                return -EINVAL;
+        if (copy_from_user(&buf, ubuf, cnt))
+                return -EFAULT;
+        buf[cnt] = 0;
+        ret = strict_strtoul(buf, 10, &val);
+        if (ret < 0)
+                return ret;
+        /* must have at least 1 entry */
+        if (!val)
+                return -EINVAL;
+        mutex_lock(&trace_types_lock);
+        if (current_trace != &no_tracer) {
+                cnt = -EBUSY;
+                pr_info("ftrace: set current_tracer to none"
+                        " before modifying buffer size\n");
+                goto out;
+        }
+        if (val > global_trace.entries) {
+                long pages_requested;
+                unsigned long freeable_pages;
+                /* make sure we have enough memory before mapping */
+                pages_requested =
+                        (val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE;
+                /* account for each buffer (and max_tr) */
+                pages_requested *= tracing_nr_buffers * 2;
+                /* Check for overflow */
+                if (pages_requested < 0) {
+                        cnt = -ENOMEM;
+                        goto out;
+                }
+                freeable_pages = determine_dirtyable_memory();
+                /* we only allow to request 1/4 of useable memory */
+                if (pages_requested >
+                    ((freeable_pages + tracing_pages_allocated) / 4)) {
+                        cnt = -ENOMEM;
+                        goto out;
+                }
+                /* each successful trace_alloc_page() grows .entries */
+                while (global_trace.entries < val) {
+                        if (trace_alloc_page()) {
+                                cnt = -ENOMEM;
+                                goto out;
+                        }
+                        /* double check that we don't go over the known pages */
+                        if (tracing_pages_allocated > pages_requested)
+                                break;
+                }
+        } else {
+                /* include the number of entries in val (inc of page entries) */
+                while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1))
+                        trace_free_page();
+        }
+        /* check integrity */
+        for_each_tracing_cpu(i)
+                check_pages(global_trace.data[i]);
+        filp->f_pos += cnt;
+        /* If check pages failed, return ENOMEM */
+        if (tracing_disabled)
+                cnt = -ENOMEM;
+ out:
+        /* reached on every path after locking: keep max_tr in step */
+        max_tr.entries = global_trace.entries;
+        mutex_unlock(&trace_types_lock);
+        return cnt;
+}
+/* Latency value files: decimal read/write of one unsigned long. */
+static struct file_operations tracing_max_lat_fops = {
+        .open = tracing_open_generic,
+        .write = tracing_max_lat_write,
+        .read = tracing_max_lat_read,
+};
+/* On/off control of a trace_array. */
+static struct file_operations tracing_ctrl_fops = {
+        .open = tracing_open_generic,
+        .write = tracing_ctrl_write,
+        .read = tracing_ctrl_read,
+};
+/* Query or select the active tracer by name. */
+static struct file_operations set_tracer_fops = {
+        .open = tracing_open_generic,
+        .write = tracing_set_trace_write,
+        .read = tracing_set_trace_read,
+};
+/* Consuming, single-reader pipe interface. */
+static struct file_operations tracing_pipe_fops = {
+        .open = tracing_open_pipe,
+        .release = tracing_release_pipe,
+        .read = tracing_read_pipe,
+        .poll = tracing_poll_pipe,
+};
+/* Query or resize the number of buffer entries. */
+static struct file_operations tracing_entries_fops = {
+        .open = tracing_open_generic,
+        .write = tracing_entries_write,
+        .read = tracing_entries_read,
+};
+#ifdef CONFIG_DYNAMIC_FTRACE
+/*
+ * Generic read handler: prints the unsigned long that
+ * filp->private_data points at, using "%ld", plus a newline.
+ */
+static ssize_t
+tracing_read_long(struct file *filp, char __user *ubuf,
+                  size_t cnt, loff_t *ppos)
+{
+        char out[64];
+        unsigned long *valp = filp->private_data;
+        int len = sprintf(out, "%ld\n", *valp);
+        return simple_read_from_buffer(ubuf, cnt, ppos, out, len);
+}
+/* Read-only file operations exposing a single long value. */
+static struct file_operations tracing_read_long_fops = {
+        .read = tracing_read_long,
+        .open = tracing_open_generic,
+};
+#endif
+/* Cached dentry of the "tracing" debugfs directory, once created. */
+static struct dentry *d_tracer;
+/*
+ * Return the "tracing" debugfs directory, creating it on first use.
+ * Returns NULL if the directory could not be created; that failure is
+ * reported with a warning only the first time it happens.
+ */
+struct dentry *tracing_init_dentry(void)
+{
+        static int warned;
+        if (!d_tracer) {
+                d_tracer = debugfs_create_dir("tracing", NULL);
+                if (!d_tracer && !warned) {
+                        warned = 1;
+                        pr_warning("Could not create debugfs directory 'tracing'\n");
+                }
+        }
+        return d_tracer;
+}
+#ifdef CONFIG_FTRACE_SELFTEST
+/* Let selftest have access to static functions in this file */
+#include "trace_selftest.c"
+#endif
+/*
+ * Create one file below @parent and, if debugfs does not hand back a
+ * dentry, warn using the file's real name so the message cannot drift
+ * out of sync with the file it is about.
+ */
+static __init void
+tracer_debugfs_add_file(const char *name, mode_t mode, struct dentry *parent,
+                        void *data, const struct file_operations *fops)
+{
+        if (!debugfs_create_file(name, mode, parent, data, fops))
+                pr_warning("Could not create debugfs '%s' entry\n", name);
+}
+/*
+ * Populate the directory returned by tracing_init_dentry() with the
+ * tracer control and output files.  A failure to create any single
+ * file is only warned about; the remaining files are still created.
+ * The directory pointer is passed on unchecked, as before.
+ */
+static __init void tracer_init_debugfs(void)
+{
+        struct dentry *dir = tracing_init_dentry();
+        tracer_debugfs_add_file("tracing_enabled", 0644, dir,
+                                &global_trace, &tracing_ctrl_fops);
+        tracer_debugfs_add_file("iter_ctrl", 0644, dir,
+                                NULL, &tracing_iter_fops);
+        tracer_debugfs_add_file("tracing_cpumask", 0644, dir,
+                                NULL, &tracing_cpumask_fops);
+        tracer_debugfs_add_file("latency_trace", 0444, dir,
+                                &global_trace, &tracing_lt_fops);
+        tracer_debugfs_add_file("trace", 0444, dir,
+                                &global_trace, &tracing_fops);
+        tracer_debugfs_add_file("available_tracers", 0444, dir,
+                                &global_trace, &show_traces_fops);
+        /* writable: set_tracer_fops has a write handler to pick a tracer */
+        tracer_debugfs_add_file("current_tracer", 0644, dir,
+                                &global_trace, &set_tracer_fops);
+        tracer_debugfs_add_file("tracing_max_latency", 0644, dir,
+                                &tracing_max_latency, &tracing_max_lat_fops);
+        tracer_debugfs_add_file("tracing_thresh", 0644, dir,
+                                &tracing_thresh, &tracing_max_lat_fops);
+        tracer_debugfs_add_file("README", 0644, dir,
+                                NULL, &tracing_readme_fops);
+        tracer_debugfs_add_file("trace_pipe", 0644, dir,
+                                NULL, &tracing_pipe_fops);
+        tracer_debugfs_add_file("trace_entries", 0644, dir,
+                                &global_trace, &tracing_entries_fops);
+#ifdef CONFIG_DYNAMIC_FTRACE
+        tracer_debugfs_add_file("dyn_ftrace_total_info", 0444, dir,
+                                &ftrace_update_tot_cnt,
+                                &tracing_read_long_fops);
+#endif
+#ifdef CONFIG_SYSPROF_TRACER
+        init_tracer_sysprof_debugfs(dir);
+#endif
+}
+/*
+ * Grow every tracing buffer by one page.
+ *
+ * All pages (one per tracing CPU, plus one more per CPU for max_tr when
+ * CONFIG_TRACER_MAX_TRACE is set) are allocated onto a private list
+ * first, so a failure part-way through can release them again without
+ * having touched the live buffers.  Only then are they moved onto the
+ * per-CPU trace_pages lists.
+ *
+ * On success tracing_pages_allocated grows by the number of pages taken
+ * and global_trace.entries by ENTRIES_PER_PAGE.
+ *
+ * Returns 0 on success or -ENOMEM if any page could not be allocated.
+ */
+static int trace_alloc_page(void)
+{
+        struct trace_array_cpu *data;
+        struct page *page, *tmp;
+        LIST_HEAD(pages);
+        void *array;
+        unsigned pages_allocated = 0;
+        int i;
+        /* first allocate a page for each CPU */
+        for_each_tracing_cpu(i) {
+                array = (void *)__get_free_page(GFP_KERNEL);
+                if (array == NULL) {
+                        printk(KERN_ERR "tracer: failed to allocate page "
+                               "for trace buffer!\n");
+                        goto free_pages;
+                }
+                pages_allocated++;
+                page = virt_to_page(array);
+                list_add(&page->lru, &pages);
+/* Only allocate if we are actually using the max trace */
+#ifdef CONFIG_TRACER_MAX_TRACE
+                array = (void *)__get_free_page(GFP_KERNEL);
+                if (array == NULL) {
+                        printk(KERN_ERR "tracer: failed to allocate page "
+                               "for trace buffer!\n");
+                        goto free_pages;
+                }
+                pages_allocated++;
+                page = virt_to_page(array);
+                list_add(&page->lru, &pages);
+#endif
+        }
+        /* Now that we successfully allocate a page per CPU, add them */
+        for_each_tracing_cpu(i) {
+                data = global_trace.data[i];
+                page = list_entry(pages.next, struct page, lru);
+                list_del_init(&page->lru);
+                list_add_tail(&page->lru, &data->trace_pages);
+                /* the LRU flag is cleared on global_trace pages ... */
+                ClearPageLRU(page);
+#ifdef CONFIG_TRACER_MAX_TRACE
+                data = max_tr.data[i];
+                page = list_entry(pages.next, struct page, lru);
+                list_del_init(&page->lru);
+                list_add_tail(&page->lru, &data->trace_pages);
+                /* ... and set on max_tr pages */
+                SetPageLRU(page);
+#endif
+        }
+        tracing_pages_allocated += pages_allocated;
+        global_trace.entries += ENTRIES_PER_PAGE;
+        return 0;
+ free_pages:
+        /* nothing was attached to a buffer yet; drop the private list */
+        list_for_each_entry_safe(page, tmp, &pages, lru) {
+                list_del_init(&page->lru);
+                __free_page(page);
+        }
+        return -ENOMEM;
+}
+/*
+ * Shrink every tracing buffer by one page.
+ *
+ * For each tracing CPU the first page of the global_trace list (and of
+ * the max_tr list when CONFIG_TRACER_MAX_TRACE is set) is unlinked and
+ * freed, and the affected buffer is reset with tracing_reset().
+ * tracing_pages_allocated is reduced by one for each page actually
+ * freed, mirroring the accounting in trace_alloc_page().
+ *
+ * An unexpectedly empty list disables tracing and aborts the walk.
+ * global_trace.entries is reduced by ENTRIES_PER_PAGE in every case.
+ *
+ * Returns 0 on success or -1 if an empty page list was found.
+ */
+static int trace_free_page(void)
+{
+        struct trace_array_cpu *data;
+        struct page *page;
+        struct list_head *p;
+        int i;
+        int ret = 0;
+        /* free one page from each buffer */
+        for_each_tracing_cpu(i) {
+                data = global_trace.data[i];
+                p = data->trace_pages.next;
+                if (p == &data->trace_pages) {
+                        /* should never happen */
+                        WARN_ON(1);
+                        tracing_disabled = 1;
+                        ret = -1;
+                        break;
+                }
+                page = list_entry(p, struct page, lru);
+                ClearPageLRU(page);
+                list_del(&page->lru);
+                tracing_pages_allocated--;
+                __free_page(page);
+                tracing_reset(data);
+#ifdef CONFIG_TRACER_MAX_TRACE
+                data = max_tr.data[i];
+                p = data->trace_pages.next;
+                if (p == &data->trace_pages) {
+                        /* should never happen */
+                        WARN_ON(1);
+                        tracing_disabled = 1;
+                        ret = -1;
+                        break;
+                }
+                page = list_entry(p, struct page, lru);
+                ClearPageLRU(page);
+                list_del(&page->lru);
+                /* account for the max_tr page only where one is freed */
+                tracing_pages_allocated--;
+                __free_page(page);
+                tracing_reset(data);
+#endif
+        }
+        global_trace.entries -= ENTRIES_PER_PAGE;
+        return ret;
+}
+/*
+ * Boot-time setup of the tracing buffers, run as an fs_initcall.
+ *
+ * Binds the per-CPU buffer descriptors of global_trace and max_tr,
+ * gives each buffer its first page, grows the buffers with
+ * trace_alloc_page() until trace_nr_entries is reached (or allocation
+ * stops succeeding), creates the debugfs files, registers no_tracer as
+ * the current tracer and finally clears tracing_disabled.
+ *
+ * Returns 0 on success.  If one of the first pages cannot be allocated,
+ * every page handed out so far is freed and -ENOMEM is returned.
+ */
+__init static int tracer_alloc_buffers(void)
+{
+        struct trace_array_cpu *data;
+        struct page *page, *tmp;
+        void *array;
+        int pages = 0;
+        int ret = -ENOMEM;
+        int i;
+        /* TODO: make the number of buffers hot pluggable with CPUS */
+        tracing_nr_buffers = num_possible_cpus();
+        tracing_buffer_mask = cpu_possible_map;
+        /* Allocate the first page for all buffers */
+        for_each_tracing_cpu(i) {
+                data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
+                max_tr.data[i] = &per_cpu(max_data, i);
+                /*
+                 * Initialise the page lists before allocating anything so
+                 * the error path can walk them for this CPU as well.
+                 */
+                INIT_LIST_HEAD(&data->trace_pages);
+#ifdef CONFIG_TRACER_MAX_TRACE
+                INIT_LIST_HEAD(&max_tr.data[i]->trace_pages);
+#endif
+                array = (void *)__get_free_page(GFP_KERNEL);
+                if (array == NULL) {
+                        printk(KERN_ERR "tracer: failed to allocate page "
+                               "for trace buffer!\n");
+                        goto free_buffers;
+                }
+                /* set the array to the list */
+                page = virt_to_page(array);
+                list_add(&page->lru, &data->trace_pages);
+                /* use the LRU flag to differentiate the two buffers */
+                ClearPageLRU(page);
+                data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+                max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+/* Only allocate if we are actually using the max trace */
+#ifdef CONFIG_TRACER_MAX_TRACE
+                array = (void *)__get_free_page(GFP_KERNEL);
+                if (array == NULL) {
+                        printk(KERN_ERR "tracer: failed to allocate page "
+                               "for trace buffer!\n");
+                        goto free_buffers;
+                }
+                page = virt_to_page(array);
+                list_add(&page->lru, &max_tr.data[i]->trace_pages);
+                SetPageLRU(page);
+#endif
+        }
+        /*
+         * Since we allocate by orders of pages, we may be able to
+         * round up a bit.
+         */
+        global_trace.entries = ENTRIES_PER_PAGE;
+        pages++;
+        while (global_trace.entries < trace_nr_entries) {
+                if (trace_alloc_page())
+                        break;
+                pages++;
+        }
+        max_tr.entries = global_trace.entries;
+        pr_info("tracer: %d pages allocated for %ld entries of %ld bytes\n",
+                pages, trace_nr_entries, (long)TRACE_ENTRY_SIZE);
+        pr_info("   actual entries %ld\n", global_trace.entries);
+        tracer_init_debugfs();
+        trace_init_cmdlines();
+        register_tracer(&no_tracer);
+        current_trace = &no_tracer;
+        /* All seems OK, enable tracing */
+        global_trace.ctrl = tracer_enabled;
+        tracing_disabled = 0;
+        return 0;
+ free_buffers:
+        /*
+         * Start with the CPU that failed: its global_trace page may
+         * already be on the list when the max_tr allocation failed.
+         */
+        for (; i >= 0; i--) {
+                data = global_trace.data[i];
+                if (data) {
+                        list_for_each_entry_safe(page, tmp,
+                                                 &data->trace_pages, lru) {
+                                list_del_init(&page->lru);
+                                __free_page(page);
+                        }
+                }
+#ifdef CONFIG_TRACER_MAX_TRACE
+                data = max_tr.data[i];
+                if (data) {
+                        list_for_each_entry_safe(page, tmp,
+                                                 &data->trace_pages, lru) {
+                                list_del_init(&page->lru);
+                                __free_page(page);
+                        }
+                }
+#endif
+        }
+        return ret;
+}
+fs_initcall(tracer_alloc_buffers);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
new file mode 100644
index 000000000000..f69f86788c2b
--- /dev/null
+++ b/kernel/trace/trace.h
@@ -0,0 +1,339 @@
+#ifndef _LINUX_KERNEL_TRACE_H
+#define _LINUX_KERNEL_TRACE_H
+#include <linux/fs.h>
+#include <asm/atomic.h>
+#include <linux/sched.h>
+#include <linux/clocksource.h>
+#include <linux/mmiotrace.h>
+/*
+ * Kinds of record a trace entry can carry.  __TRACE_FIRST_TYPE and
+ * __TRACE_LAST_TYPE appear to be range sentinels rather than real
+ * record kinds.
+ * NOTE(review): presumably this is the value kept in trace_entry.type --
+ * confirm against trace.c.
+ */
+enum trace_type {
+        __TRACE_FIRST_TYPE = 0,
+        TRACE_FN,
+        TRACE_CTX,
+        TRACE_WAKE,
+        TRACE_STACK,
+        TRACE_SPECIAL,
+        TRACE_MMIO_RW,
+        TRACE_MMIO_MAP,
+        __TRACE_LAST_TYPE
+};
+/*
+ * Function trace entry - function address and parent function address:
+ */
+struct ftrace_entry {
+        unsigned long           ip;
+        unsigned long           parent_ip;
+};
+/*
+ * Context switch trace entry - which task (and prio) we switched from/to:
+ * the prev_* fields describe the task switched from, the next_* fields
+ * the task switched to.
+ */
+struct ctx_switch_entry {
+        unsigned int            prev_pid;
+        unsigned char           prev_prio;
+        unsigned char           prev_state;
+        unsigned int            next_pid;
+        unsigned char           next_prio;
+        unsigned char           next_state;
+};
+/*
+ * Special (free-form) trace entry: three words with no fixed meaning.
+ * NOTE(review): presumably filled from the arg1..arg3 parameters of
+ * trace_special() -- confirm in trace.c.
+ */
+struct special_entry {
+        unsigned long           arg1;
+        unsigned long           arg2;
+        unsigned long           arg3;
+};
+/*
+ * Stack-trace entry: room for FTRACE_STACK_ENTRIES caller addresses.
+ */
+#define FTRACE_STACK_ENTRIES    8
+struct stack_entry {
+        unsigned long           caller[FTRACE_STACK_ENTRIES];
+};
+/*
+ * The trace entry - the most basic unit of tracing. This is what
+ * is printed in the end as a single line in the trace output, such as:
+ *
+ *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
+ *
+ * A small common header (type, cpu, flags, preempt_count, pid and the
+ * timestamp t) is followed by an anonymous union with one member per
+ * payload layout.
+ * NOTE(review): which union member is valid presumably follows 'type'
+ * (see enum trace_type) -- confirm in trace.c.
+ */
+struct trace_entry {
+        char                    type;
+        char                    cpu;
+        char                    flags;
+        char                    preempt_count;
+        int                     pid;
+        cycle_t                 t;
+        union {
+                struct ftrace_entry             fn;
+                struct ctx_switch_entry         ctx;
+                struct special_entry            special;
+                struct stack_entry              stack;
+                struct mmiotrace_rw             mmiorw;
+                struct mmiotrace_map            mmiomap;
+        };
+};
+#define TRACE_ENTRY_SIZE        sizeof(struct trace_entry)
+/*
+ * The CPU trace array - it consists of thousands of trace entries
+ * plus some other descriptor data: (for example which task started
+ * the trace, etc.)
+ *
+ * Instances are reached per CPU through trace_array.data[].  The
+ * latency tracer in trace_irqsoff.c raises 'disabled' around its own
+ * work so that only the first holder records, and keeps its
+ * measurement state in critical_start, critical_end,
+ * critical_sequence and preempt_timestamp.
+ */
+struct trace_array_cpu {
+        struct list_head        trace_pages;
+        atomic_t                disabled;
+        raw_spinlock_t          lock;
+        struct lock_class_key   lock_key;
+        /* these fields get copied into max-trace: */
+        unsigned                trace_head_idx;
+        unsigned                trace_tail_idx;
+        void                    *trace_head; /* producer */
+        void                    *trace_tail; /* consumer */
+        unsigned long           trace_idx;
+        unsigned long           overrun;
+        unsigned long           saved_latency;
+        unsigned long           critical_start;
+        unsigned long           critical_end;
+        unsigned long           critical_sequence;
+        unsigned long           nice;
+        unsigned long           policy;
+        unsigned long           rt_priority;
+        cycle_t                 preempt_timestamp;
+        pid_t                   pid;
+        uid_t                   uid;
+        char                    comm[TASK_COMM_LEN];
+};
+struct trace_iterator;
+/*
+ * The trace array - an array of per-CPU trace arrays. This is the
+ * highest level data structure that individual tracers deal with.
+ * They have on/off state as well:
+ *
+ * 'ctrl' is that on/off state; the tracers test it before starting or
+ * stopping.  'cpu' and 'time_start' are written when a tracer resets
+ * the array (e.g. function_reset() / start_function_trace()).
+ * 'data' is indexed by CPU number.
+ */
+struct trace_array {
+        unsigned long           entries;
+        long                    ctrl;
+        int                     cpu;
+        cycle_t                 time_start;
+        struct task_struct      *waiter;
+        struct trace_array_cpu  *data[NR_CPUS];
+};
+/*
+ * A specific tracer, represented by methods that operate on a trace array:
+ *
+ * Tracers fill in only the hooks they need; for example the function
+ * tracer sets name/init/reset/ctrl_update, while the irqsoff family
+ * additionally sets open/close and print_max.  The selftest hook only
+ * exists when CONFIG_FTRACE_STARTUP_TEST is enabled.
+ * NOTE(review): 'next' presumably links the registered tracers into a
+ * list kept by register_tracer() -- confirm in trace.c.
+ */
+struct tracer {
+        const char              *name;
+        void                    (*init)(struct trace_array *tr);
+        void                    (*reset)(struct trace_array *tr);
+        void                    (*open)(struct trace_iterator *iter);
+        void                    (*pipe_open)(struct trace_iterator *iter);
+        void                    (*close)(struct trace_iterator *iter);
+        void                    (*start)(struct trace_iterator *iter);
+        void                    (*stop)(struct trace_iterator *iter);
+        ssize_t                 (*read)(struct trace_iterator *iter,
+                                        struct file *filp, char __user *ubuf,
+                                        size_t cnt, loff_t *ppos);
+        void                    (*ctrl_update)(struct trace_array *tr);
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+        int                     (*selftest)(struct tracer *trace,
+                                            struct trace_array *tr);
+#endif
+        int                     (*print_line)(struct trace_iterator *iter);
+        struct tracer           *next;
+        int                     print_max;
+};
+struct trace_seq {
+        unsigned char           buffer[PAGE_SIZE];      /* one page of output */
+        unsigned int            len;
+        unsigned int            readpos;
+};
+/*
+ * Trace iterator - used by printout routines who present trace
+ * results to users and which routines might sleep, etc:
+ *
+ * 'tr' and 'trace' name the array being read and the tracer that owns
+ * it; the tracer's open/close hooks receive this structure.  The
+ * NR_CPUS-sized arrays hold one slot per CPU.
+ */
+struct trace_iterator {
+        struct trace_array      *tr;
+        struct tracer           *trace;
+        void                    *private;
+        long                    last_overrun[NR_CPUS];
+        long                    overrun[NR_CPUS];
+        /* The below is zeroed out in pipe_read */
+        struct trace_seq        seq;
+        struct trace_entry      *ent;
+        int                     cpu;
+        struct trace_entry      *prev_ent;
+        int                     prev_cpu;
+        unsigned long           iter_flags;
+        loff_t                  pos;
+        unsigned long           next_idx[NR_CPUS];
+        struct list_head        *next_page[NR_CPUS];
+        unsigned                next_page_idx[NR_CPUS];
+        long                    idx;
+};
+/* Core helpers shared by the individual tracers. */
+void tracing_reset(struct trace_array_cpu *data);
+int tracing_open_generic(struct inode *inode, struct file *filp);
+struct dentry *tracing_init_dentry(void);
+void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
+/*
+ * Entry helpers: each takes the trace array plus the per-CPU data it
+ * operates on, followed by the payload for that entry kind.
+ */
+void ftrace(struct trace_array *tr,
+                            struct trace_array_cpu *data,
+                            unsigned long ip,
+                            unsigned long parent_ip,
+                            unsigned long flags);
+void tracing_sched_switch_trace(struct trace_array *tr,
+                                struct trace_array_cpu *data,
+                                struct task_struct *prev,
+                                struct task_struct *next,
+                                unsigned long flags);
+void tracing_record_cmdline(struct task_struct *tsk);
+void tracing_sched_wakeup_trace(struct trace_array *tr,
+                                struct trace_array_cpu *data,
+                                struct task_struct *wakee,
+                                struct task_struct *cur,
+                                unsigned long flags);
+void trace_special(struct trace_array *tr,
+                   struct trace_array_cpu *data,
+                   unsigned long arg1,
+                   unsigned long arg2,
+                   unsigned long arg3);
+void trace_function(struct trace_array *tr,
+                    struct trace_array_cpu *data,
+                    unsigned long ip,
+                    unsigned long parent_ip,
+                    unsigned long flags);
+void tracing_start_cmdline_record(void);
+void tracing_stop_cmdline_record(void);
+/* Tracer registration. */
+int register_tracer(struct tracer *type);
+void unregister_tracer(struct tracer *type);
+/* Latency bookkeeping consulted by the latency tracers (trace_irqsoff.c). */
+extern unsigned long nsecs_to_usecs(unsigned long nsecs);
+extern unsigned long tracing_max_latency;
+extern unsigned long tracing_thresh;
+void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
+void update_max_tr_single(struct trace_array *tr,
+                          struct task_struct *tsk, int cpu);
+/* Timestamp source used for trace_array.time_start and latency deltas. */
+extern cycle_t ftrace_now(int cpu);
+/* Function-trace start/stop; these compile to nothing without CONFIG_FTRACE. */
+#ifdef CONFIG_FTRACE
+void tracing_start_function_trace(void);
+void tracing_stop_function_trace(void);
+#else
+# define tracing_start_function_trace()         do { } while (0)
+# define tracing_stop_function_trace()          do { } while (0)
+#endif
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+/*
+ * Callback type for context-switch hooks and the node that carries one.
+ * 'private' is handed back as the callback's first argument
+ * (presumably -- confirm at the call site); 'next' chains nodes.
+ */
+typedef void
+(*tracer_switch_func_t)(void *private,
+                        void *__rq,
+                        struct task_struct *prev,
+                        struct task_struct *next);
+struct tracer_switch_ops {
+        tracer_switch_func_t            func;
+        void                            *private;
+        struct tracer_switch_ops        *next;
+};
+#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern unsigned long ftrace_update_tot_cnt;
+/* DYN_FTRACE_TEST_NAME gives the dynamic-ftrace test function one shared name. */
+#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
+extern int DYN_FTRACE_TEST_NAME(void);
+#endif
+#ifdef CONFIG_MMIOTRACE
+/* MMIO entry helpers; payload types match trace_entry.mmiorw / .mmiomap. */
+extern void __trace_mmiotrace_rw(struct trace_array *tr,
+                                struct trace_array_cpu *data,
+                                struct mmiotrace_rw *rw);
+extern void __trace_mmiotrace_map(struct trace_array *tr,
+                                struct trace_array_cpu *data,
+                                struct mmiotrace_map *map);
+#endif
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+/*
+ * Startup self-tests.  Every prototype has the signature of
+ * tracer->selftest and is only declared when its tracer is configured.
+ */
+#ifdef CONFIG_FTRACE
+extern int trace_selftest_startup_function(struct tracer *trace,
+                                           struct trace_array *tr);
+#endif
+#ifdef CONFIG_IRQSOFF_TRACER
+extern int trace_selftest_startup_irqsoff(struct tracer *trace,
+                                          struct trace_array *tr);
+#endif
+#ifdef CONFIG_PREEMPT_TRACER
+extern int trace_selftest_startup_preemptoff(struct tracer *trace,
+                                             struct trace_array *tr);
+#endif
+#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
+extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace,
+                                                 struct trace_array *tr);
+#endif
+#ifdef CONFIG_SCHED_TRACER
+extern int trace_selftest_startup_wakeup(struct tracer *trace,
+                                         struct trace_array *tr);
+#endif
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+extern int trace_selftest_startup_sched_switch(struct tracer *trace,
+                                               struct trace_array *tr);
+#endif
+#ifdef CONFIG_SYSPROF_TRACER
+extern int trace_selftest_startup_sysprof(struct tracer *trace,
+                                               struct trace_array *tr);
+#endif
+#endif /* CONFIG_FTRACE_STARTUP_TEST */
+extern void *head_page(struct trace_array_cpu *data);
+/* trace_seq output helpers (see struct trace_seq). */
+extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
+extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
+                                 size_t cnt);
+extern long ns2usecs(cycle_t nsec);
+/* Bit mask of enum trace_iterator_flags values. */
+extern unsigned long trace_flags;
+/*
+ * trace_iterator_flags is an enumeration that defines bit
+ * positions into trace_flags that controls the output.
+ *
+ * NOTE: These bits must match the trace_options array in
+ *       trace.c.
+ *
+ * Each value is a distinct single-bit mask, so flags can be OR-ed
+ * together and tested with '&'.
+ */
+enum trace_iterator_flags {
+        TRACE_ITER_PRINT_PARENT         = 0x01,
+        TRACE_ITER_SYM_OFFSET           = 0x02,
+        TRACE_ITER_SYM_ADDR             = 0x04,
+        TRACE_ITER_VERBOSE              = 0x08,
+        TRACE_ITER_RAW                  = 0x10,
+        TRACE_ITER_HEX                  = 0x20,
+        TRACE_ITER_BIN                  = 0x40,
+        TRACE_ITER_BLOCK                = 0x80,
+        TRACE_ITER_STACKTRACE           = 0x100,
+        TRACE_ITER_SCHED_TREE           = 0x200,
+};
+#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
new file mode 100644
index 000000000000..312144897970
--- /dev/null
+++ b/kernel/trace/trace_functions.c
@@ -0,0 +1,81 @@
+/*
+ * ring buffer based function tracer
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+#include "trace.h"
+/*
+ * Restart the trace: take a new start timestamp on tr->cpu and call
+ * tracing_reset() on the per-CPU data of every online CPU.
+ */
+static void function_reset(struct trace_array *tr)
+{
+        int i;
+        tr->time_start = ftrace_now(tr->cpu);
+        for_each_online_cpu(i) {
+                struct trace_array_cpu *cpu_data = tr->data[i];
+                tracing_reset(cpu_data);
+        }
+}
+/*
+ * Begin function tracing on @tr: note the current CPU and reset the
+ * buffers inside the get_cpu()/put_cpu() pair, then switch on cmdline
+ * recording followed by the function trace itself.
+ */
+static void start_function_trace(struct trace_array *tr)
+{
+        int this_cpu = get_cpu();
+        tr->cpu = this_cpu;
+        function_reset(tr);
+        put_cpu();
+        tracing_start_cmdline_record();
+        tracing_start_function_trace();
+}
+/*
+ * Undo start_function_trace(): the two facilities are switched off in
+ * the reverse of the order they were switched on.  @tr is unused.
+ */
+static void stop_function_trace(struct trace_array *tr)
+{
+        tracing_stop_function_trace();
+        tracing_stop_cmdline_record();
+}
+/* ->init hook: start tracing right away only if @tr is switched on. */
+static void function_trace_init(struct trace_array *tr)
+{
+        if (!tr->ctrl)
+                return;
+        start_function_trace(tr);
+}
+/* ->reset hook: stop tracing, but only if @tr was switched on. */
+static void function_trace_reset(struct trace_array *tr)
+{
+        if (!tr->ctrl)
+                return;
+        stop_function_trace(tr);
+}
+/* ->ctrl_update hook: follow the new value of tr->ctrl. */
+static void function_trace_ctrl_update(struct trace_array *tr)
+{
+        if (!tr->ctrl) {
+                stop_function_trace(tr);
+                return;
+        }
+        start_function_trace(tr);
+}
+/*
+ * The function tracer, registered under the name "ftrace".
+ * NOTE(review): trace.h declares tracer->selftest under
+ * CONFIG_FTRACE_STARTUP_TEST while this initializer is guarded by
+ * CONFIG_FTRACE_SELFTEST -- confirm that Kconfig ties the two together.
+ */
+static struct tracer function_trace __read_mostly =
+{
+        .name        = "ftrace",
+        .init        = function_trace_init,
+        .reset       = function_trace_reset,
+        .ctrl_update = function_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+        .selftest    = trace_selftest_startup_function,
+#endif
+};
+/* Initcall: register the function tracer and report the result. */
+static __init int init_function_trace(void)
+{
+        int err = register_tracer(&function_trace);
+        return err;
+}
+device_initcall(init_function_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
new file mode 100644
index 000000000000..ece6cfb649fa
--- /dev/null
+++ b/kernel/trace/trace_irqsoff.c
@@ -0,0 +1,490 @@
+/*
+ * trace irqs off critical timings
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * From code in the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/kallsyms.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+#include "trace.h"
+/* Trace array in use; set by __irqsoff_tracer_init(). */
+static struct trace_array               *irqsoff_trace __read_mostly;
+/* Non-zero between start_irqsoff_tracer() and stop_irqsoff_tracer(). */
+static int                              tracer_enabled __read_mostly;
+/* Per-CPU flag: set while a critical section is being timed on that CPU. */
+static DEFINE_PER_CPU(int, tracing_cpu);
+/* Held in check_critical_timing() while a new maximum is evaluated. */
+static DEFINE_SPINLOCK(max_trace_lock);
+/* What the active tracer times; the init hooks store one or both bits. */
+enum {
+        TRACER_IRQS_OFF         = (1 << 1),
+        TRACER_PREEMPT_OFF      = (1 << 2),
+};
+static int trace_type __read_mostly;
+#ifdef CONFIG_PREEMPT_TRACER
+/* 1 when preempt-off timing is selected and preempt_count() is non-zero. */
+static inline int
+preempt_trace(void)
+{
+        if (!(trace_type & TRACER_PREEMPT_OFF))
+                return 0;
+        return preempt_count() ? 1 : 0;
+}
+#else
+# define preempt_trace() (0)
+#endif
+#ifdef CONFIG_IRQSOFF_TRACER
+/* 1 when irqs-off timing is selected and irqs are currently disabled. */
+static inline int
+irq_trace(void)
+{
+        if (!(trace_type & TRACER_IRQS_OFF))
+                return 0;
+        return irqs_disabled() ? 1 : 0;
+}
+#else
+# define irq_trace() (0)
+#endif
+/*
+ * Sequence count - we record it when starting a measurement and
+ * skip the latency if the sequence has changed - some other section
+ * did a maximum and could disturb our measurement with serial console
+ * printouts, etc. Truly coinciding maximum latencies should be rare
+ * and what happens together happens separately as well, so this doesn't
+ * decrease the validity of the maximum found:
+ */
+static __cacheline_aligned_in_smp       unsigned long max_sequence;
+#ifdef CONFIG_FTRACE
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ *
+ * Function-trace callback (installed through trace_ops.func).  An
+ * entry is recorded only while this CPU has tracing_cpu set, the
+ * sampled flags show irqs disabled, and this call is the only holder
+ * of data->disabled.
+ */
+static void
+irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+        struct trace_array *tr = irqsoff_trace;
+        struct trace_array_cpu *data;
+        unsigned long flags;
+        long disabled;
+        int cpu;
+        /*
+         * Does not matter if we preempt. We test the flags
+         * afterward, to see if irqs are disabled or not.
+         * If we preempt and get a false positive, the flags
+         * test will fail.
+         */
+        cpu = raw_smp_processor_id();
+        if (likely(!per_cpu(tracing_cpu, cpu)))
+                return;
+        local_save_flags(flags);
+        /* slight chance to get a false positive on tracing_cpu */
+        if (!irqs_disabled_flags(flags))
+                return;
+        data = tr->data[cpu];
+        /* disabled == 1 means nobody else is inside; otherwise skip recording */
+        disabled = atomic_inc_return(&data->disabled);
+        if (likely(disabled == 1))
+                trace_function(tr, data, ip, parent_ip, flags);
+        atomic_dec(&data->disabled);
+}
+/* Passed to register_ftrace_function() by start_irqsoff_tracer(). */
+static struct ftrace_ops trace_ops __read_mostly =
+{
+        .func = irqsoff_tracer_call,
+};
+#endif /* CONFIG_FTRACE */
+/*
+ * Decide whether a measured latency is worth recording.
+ *
+ * With a threshold configured (tracing_thresh != 0) anything at or
+ * above the threshold qualifies; otherwise only a latency strictly
+ * greater than the current maximum does.  Returns 1 to record, else 0.
+ */
+static int report_latency(cycle_t delta)
+{
+        if (tracing_thresh)
+                return delta >= tracing_thresh;
+        return delta > tracing_max_latency;
+}
+/*
+ * Close the critical section that began at data->preempt_timestamp
+ * and, if its length qualifies under report_latency(), record it as
+ * the new maximum.
+ *
+ * @tr:        trace array being recorded into
+ * @data:      per-CPU data belonging to @cpu
+ * @parent_ip: address stored in data->critical_end for a new maximum
+ * @cpu:       CPU whose clock (ftrace_now()) is sampled
+ *
+ * The latency is tested once without max_trace_lock as a cheap filter
+ * and once more with the lock held.  A new maximum is only committed
+ * when max_sequence is unchanged since the section started.
+ *
+ * All comparisons use the raw ftrace_now() units, so no usecs
+ * conversion is performed here.
+ *
+ * On every path the per-CPU state is re-armed afterwards: sequence and
+ * timestamp are refreshed, tracing_reset() is called and a first entry
+ * is logged again.
+ */
+static void
+check_critical_timing(struct trace_array *tr,
+                      struct trace_array_cpu *data,
+                      unsigned long parent_ip,
+                      int cpu)
+{
+        cycle_t T0, T1, delta;
+        unsigned long flags;
+        T0 = data->preempt_timestamp;
+        T1 = ftrace_now(cpu);
+        delta = T1-T0;
+        /* flags are needed by trace_function() on the lock-free 'out' path */
+        local_save_flags(flags);
+        if (!report_latency(delta))
+                goto out;
+        spin_lock_irqsave(&max_trace_lock, flags);
+        /* check if we are still the max latency */
+        if (!report_latency(delta))
+                goto out_unlock;
+        trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
+        /* another section set a maximum meanwhile: our numbers are tainted */
+        if (data->critical_sequence != max_sequence)
+                goto out_unlock;
+        tracing_max_latency = delta;
+        data->critical_end = parent_ip;
+        update_max_tr_single(tr, current, cpu);
+        max_sequence++;
+out_unlock:
+        spin_unlock_irqrestore(&max_trace_lock, flags);
+out:
+        data->critical_sequence = max_sequence;
+        data->preempt_timestamp = ftrace_now(cpu);
+        tracing_reset(data);
+        trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
+}
+/*
+ * Begin timing a critical section on the current CPU.
+ *
+ * Does nothing when the tracer is off, when this CPU is already timing
+ * a section (tracing_cpu set), or when the per-CPU data is missing or
+ * currently disabled.  Otherwise the measurement state is initialised
+ * with data->disabled raised, and tracing_cpu is set only once that
+ * state is complete.
+ */
+static inline void
+start_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+        int cpu;
+        struct trace_array *tr = irqsoff_trace;
+        struct trace_array_cpu *data;
+        unsigned long flags;
+        if (likely(!tracer_enabled))
+                return;
+        cpu = raw_smp_processor_id();
+        if (per_cpu(tracing_cpu, cpu))
+                return;
+        data = tr->data[cpu];
+        if (unlikely(!data) || atomic_read(&data->disabled))
+                return;
+        atomic_inc(&data->disabled);
+        data->critical_sequence = max_sequence;
+        data->preempt_timestamp = ftrace_now(cpu);
+        /* GNU "?:" - use parent_ip unless it is 0, then fall back to ip */
+        data->critical_start = parent_ip ? : ip;
+        tracing_reset(data);
+        local_save_flags(flags);
+        trace_function(tr, data, ip, parent_ip, flags);
+        per_cpu(tracing_cpu, cpu) = 1;
+        atomic_dec(&data->disabled);
+}
+/*
+ * End the critical section being timed on the current CPU.
+ *
+ * tracing_cpu is cleared first and unconditionally; if it was not set
+ * there is nothing to finish.  The closing entry is then logged and
+ * check_critical_timing() evaluates the section, all with
+ * data->disabled raised.  critical_start is zeroed afterwards; a zero
+ * critical_start makes a later call return early.
+ */
+static inline void
+stop_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+        int cpu;
+        struct trace_array *tr = irqsoff_trace;
+        struct trace_array_cpu *data;
+        unsigned long flags;
+        cpu = raw_smp_processor_id();
+        /* Always clear the tracing cpu on stopping the trace */
+        if (unlikely(per_cpu(tracing_cpu, cpu)))
+                per_cpu(tracing_cpu, cpu) = 0;
+        else
+                return;
+        if (!tracer_enabled)
+                return;
+        data = tr->data[cpu];
+        if (unlikely(!data) || unlikely(!head_page(data)) ||
+            !data->critical_start || atomic_read(&data->disabled))
+                return;
+        atomic_inc(&data->disabled);
+        local_save_flags(flags);
+        trace_function(tr, data, ip, parent_ip, flags);
+        /* GNU "?:" - use parent_ip unless it is 0, then fall back to ip */
+        check_critical_timing(tr, data, parent_ip ? : ip, cpu);
+        data->critical_start = 0;
+        atomic_dec(&data->disabled);
+}
+/*
+ * start_critical_timings() / stop_critical_timings(): exported pair
+ * for resuming and suspending critical-section timing, used around
+ * stoppage (in idle).
+ */
+void start_critical_timings(void)
+{
+        if (!preempt_trace() && !irq_trace())
+                return;
+        start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL_GPL(start_critical_timings);
+/* Suspend critical-section timing if either kind of section is active. */
+void stop_critical_timings(void)
+{
+        if (!preempt_trace() && !irq_trace())
+                return;
+        stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL_GPL(stop_critical_timings);
+#ifdef CONFIG_IRQSOFF_TRACER
+#ifdef CONFIG_PROVE_LOCKING
+/* irqs-on hook: end the irqs-off section unless a preempt-off one is active. */
+void time_hardirqs_on(unsigned long a0, unsigned long a1)
+{
+        if (preempt_trace() || !irq_trace())
+                return;
+        stop_critical_timing(a0, a1);
+}
+/* irqs-off hook: begin an irqs-off section unless a preempt-off one is active. */
+void time_hardirqs_off(unsigned long a0, unsigned long a1)
+{
+        if (preempt_trace() || !irq_trace())
+                return;
+        start_critical_timing(a0, a1);
+}
+#else /* !CONFIG_PROVE_LOCKING */
+/*
+ * Stubs: empty definitions compiled only when CONFIG_PROVE_LOCKING is
+ * off.
+ * NOTE(review): presumably these stand in for the lockdep versions so
+ * that existing callers still link -- confirm against lockdep.
+ */
+void early_boot_irqs_off(void)
+{
+}
+void early_boot_irqs_on(void)
+{
+}
+void trace_softirqs_on(unsigned long ip)
+{
+}
+void trace_softirqs_off(unsigned long ip)
+{
+}
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+/*
+ * Only hardirq on/off transitions matter here.  Irqs coming back on
+ * ends the irqs-off section, unless a preempt-off section is active.
+ */
+void trace_hardirqs_on(void)
+{
+        if (preempt_trace() || !irq_trace())
+                return;
+        stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+/* Irqs going off begins a section, unless a preempt-off section is active. */
+void trace_hardirqs_off(void)
+{
+        if (preempt_trace() || !irq_trace())
+                return;
+        start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+/* As trace_hardirqs_on(), with the parent address supplied by the caller. */
+void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+        if (preempt_trace() || !irq_trace())
+                return;
+        stop_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+/* As trace_hardirqs_off(), with the parent address supplied by the caller. */
+void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+        if (preempt_trace() || !irq_trace())
+                return;
+        start_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+#endif /* CONFIG_PROVE_LOCKING */
+#endif /*  CONFIG_IRQSOFF_TRACER */
+#ifdef CONFIG_PREEMPT_TRACER
+/* Preemption re-enabled: end the section if preempt-off timing is active. */
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+        if (!preempt_trace())
+                return;
+        stop_critical_timing(a0, a1);
+}
+/* Preemption disabled: begin a section if preempt-off timing is active. */
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+        if (!preempt_trace())
+                return;
+        start_critical_timing(a0, a1);
+}
+#endif /* CONFIG_PREEMPT_TRACER */
+/*
+ * Switch the latency tracer on: the function callback is registered
+ * before tracer_enabled is raised.  @tr is unused.
+ */
+static void start_irqsoff_tracer(struct trace_array *tr)
+{
+        register_ftrace_function(&trace_ops);
+        tracer_enabled = 1;
+}
+/*
+ * Switch the latency tracer off, in the reverse order of
+ * start_irqsoff_tracer(): clear tracer_enabled, then unregister the
+ * function callback.  @tr is unused.
+ */
+static void stop_irqsoff_tracer(struct trace_array *tr)
+{
+        tracer_enabled = 0;
+        unregister_ftrace_function(&trace_ops);
+}
+/*
+ * Setup shared by the three latency tracers: publish @tr as the array
+ * to record into and, if it is switched on, start tracing.
+ */
+static void __irqsoff_tracer_init(struct trace_array *tr)
+{
+        irqsoff_trace = tr;
+        /* make sure that the tracer is visible */
+        smp_wmb();
+        if (!tr->ctrl)
+                return;
+        start_irqsoff_tracer(tr);
+}
+/* ->reset hook: stop the tracer, but only if @tr was switched on. */
+static void irqsoff_tracer_reset(struct trace_array *tr)
+{
+        if (!tr->ctrl)
+                return;
+        stop_irqsoff_tracer(tr);
+}
+/* ->ctrl_update hook: follow the new value of tr->ctrl. */
+static void irqsoff_tracer_ctrl_update(struct trace_array *tr)
+{
+        if (!tr->ctrl) {
+                stop_irqsoff_tracer(tr);
+                return;
+        }
+        start_irqsoff_tracer(tr);
+}
+/* ->open hook: pause a running tracer while its output is read. */
+static void irqsoff_tracer_open(struct trace_iterator *iter)
+{
+        struct trace_array *tr = iter->tr;
+        /* stop the trace while dumping */
+        if (!tr->ctrl)
+                return;
+        stop_irqsoff_tracer(tr);
+}
+/* ->close hook: resume the tracer paused by irqsoff_tracer_open(). */
+static void irqsoff_tracer_close(struct trace_iterator *iter)
+{
+        struct trace_array *tr = iter->tr;
+        if (!tr->ctrl)
+                return;
+        start_irqsoff_tracer(tr);
+}
+#ifdef CONFIG_IRQSOFF_TRACER
+/* Select irqs-off timing only, then run the common setup. */
+static void irqsoff_tracer_init(struct trace_array *tr)
+{
+        trace_type = TRACER_IRQS_OFF;
+        __irqsoff_tracer_init(tr);
+}
+/*
+ * The "irqsoff" tracer.
+ * NOTE(review): trace.h declares tracer->selftest under
+ * CONFIG_FTRACE_STARTUP_TEST while this initializer is guarded by
+ * CONFIG_FTRACE_SELFTEST -- confirm that Kconfig ties the two together.
+ */
+static struct tracer irqsoff_tracer __read_mostly =
+{
+        .name           = "irqsoff",
+        .init           = irqsoff_tracer_init,
+        .reset          = irqsoff_tracer_reset,
+        .open           = irqsoff_tracer_open,
+        .close          = irqsoff_tracer_close,
+        .ctrl_update    = irqsoff_tracer_ctrl_update,
+        .print_max      = 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+        .selftest    = trace_selftest_startup_irqsoff,
+#endif
+};
+/* Registration helper; expands to nothing when the tracer is configured out. */
+# define register_irqsoff(trace) register_tracer(&trace)
+#else
+# define register_irqsoff(trace) do { } while (0)
+#endif
+#ifdef CONFIG_PREEMPT_TRACER
+/* Select preempt-off timing only, then run the common setup. */
+static void preemptoff_tracer_init(struct trace_array *tr)
+{
+        trace_type = TRACER_PREEMPT_OFF;
+        __irqsoff_tracer_init(tr);
+}
+/*
+ * The "preemptoff" tracer; it shares every hook except ->init with
+ * the other latency tracers in this file.
+ * NOTE(review): trace.h declares tracer->selftest under
+ * CONFIG_FTRACE_STARTUP_TEST while this initializer is guarded by
+ * CONFIG_FTRACE_SELFTEST -- confirm that Kconfig ties the two together.
+ */
+static struct tracer preemptoff_tracer __read_mostly =
+{
+        .name           = "preemptoff",
+        .init           = preemptoff_tracer_init,
+        .reset          = irqsoff_tracer_reset,
+        .open           = irqsoff_tracer_open,
+        .close          = irqsoff_tracer_close,
+        .ctrl_update    = irqsoff_tracer_ctrl_update,
+        .print_max      = 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+        .selftest    = trace_selftest_startup_preemptoff,
+#endif
+};
+/* Registration helper; expands to nothing when the tracer is configured out. */
+# define register_preemptoff(trace) register_tracer(&trace)
+#else
+# define register_preemptoff(trace) do { } while (0)
+#endif
+#if defined(CONFIG_IRQSOFF_TRACER) && \
+        defined(CONFIG_PREEMPT_TRACER)
+/* Select both irqs-off and preempt-off timing, then run the common setup. */
+static void preemptirqsoff_tracer_init(struct trace_array *tr)
+{
+        trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
+        __irqsoff_tracer_init(tr);
+}
+/*
+ * The "preemptirqsoff" tracer; only built when both underlying
+ * tracers are configured.
+ * NOTE(review): trace.h declares tracer->selftest under
+ * CONFIG_FTRACE_STARTUP_TEST while this initializer is guarded by
+ * CONFIG_FTRACE_SELFTEST -- confirm that Kconfig ties the two together.
+ */
+static struct tracer preemptirqsoff_tracer __read_mostly =
+{
+        .name           = "preemptirqsoff",
+        .init           = preemptirqsoff_tracer_init,
+        .reset          = irqsoff_tracer_reset,
+        .open           = irqsoff_tracer_open,
+        .close          = irqsoff_tracer_close,
+        .ctrl_update    = irqsoff_tracer_ctrl_update,
+        .print_max      = 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+        .selftest    = trace_selftest_startup_preemptirqsoff,
+#endif
+};
+/* Registration helper; expands to nothing when the tracer is configured out. */
+# define register_preemptirqsoff(trace) register_tracer(&trace)
+#else
+# define register_preemptirqsoff(trace) do { } while (0)
+#endif
+/*
+ * Initcall: register whichever latency tracers are configured in.
+ * Each register_*() macro expands to nothing for a tracer that is
+ * configured out.  Return values of register_tracer() are not
+ * checked; this always returns 0.
+ */
+__init static int init_irqsoff_tracer(void)
+{
+        register_irqsoff(irqsoff_tracer);
+        register_preemptoff(preemptoff_tracer);
+        register_preemptirqsoff(preemptirqsoff_tracer);
+        return 0;
+}
+device_initcall(init_irqsoff_tracer);
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
new file mode 100644
index 000000000000..b13dc19dcbb4
--- /dev/null
+++ b/kernel/trace/trace_mmiotrace.c
@@ -0,0 +1,295 @@
+/*
+ * Memory mapped I/O tracing
+ *
+ * Copyright (C) 2008 Pekka Paalanen <pq@iki.fi>
+ */
+#define DEBUG 1
+#include <linux/kernel.h>
+#include <linux/mmiotrace.h>
+#include <linux/pci.h>
+#include "trace.h"
+/*
+ * State for emitting the PCI device header lines through the trace pipe.
+ * Stored in trace_iterator->private.
+ */
+struct header_iter {
+        /* next PCI device whose PCIDEV line will be printed by mmio_read() */
+        struct pci_dev *dev;
+};
+/* trace_array used by mmiotrace: set in mmio_trace_init(), NULL after reset */
+static struct trace_array *mmio_trace_array;
+/* true once the lost-events warning was logged; cleared by mmio_reset_data() */
+static bool overrun_detected;
+/*
+ * Start from a clean slate: clear the overrun flag, take a new start
+ * timestamp and reset the trace data of every online CPU.
+ */
+static void mmio_reset_data(struct trace_array *tr)
+{
+        int cpu;
+        overrun_detected = false;
+        tr->time_start = ftrace_now(tr->cpu);
+        for_each_online_cpu(cpu)
+                tracing_reset(tr->data[cpu]);
+}
+/*
+ * Tracer init callback: remember the trace_array and, if tracing is
+ * currently switched on (tr->ctrl), reset the buffers and enable mmiotrace.
+ */
+static void mmio_trace_init(struct trace_array *tr)
+{
+        pr_debug("in %s\n", __func__);
+        /* published before enable_mmiotrace() so the event hooks can use it */
+        mmio_trace_array = tr;
+        if (tr->ctrl) {
+                mmio_reset_data(tr);
+                enable_mmiotrace();
+        }
+}
+/*
+ * Tracer reset callback: disable mmiotrace if it was switched on, clear the
+ * trace data and forget the trace_array.
+ */
+static void mmio_trace_reset(struct trace_array *tr)
+{
+        pr_debug("in %s\n", __func__);
+        if (tr->ctrl)
+                disable_mmiotrace();
+        mmio_reset_data(tr);
+        /* cleared only after mmiotrace has been disabled */
+        mmio_trace_array = NULL;
+}
+/*
+ * Called when tr->ctrl changes. Switching tracing off just disables
+ * mmiotrace; switching it on first wipes the buffers, then enables it.
+ */
+static void mmio_trace_ctrl_update(struct trace_array *tr)
+{
+        pr_debug("in %s\n", __func__);
+        if (!tr->ctrl) {
+                disable_mmiotrace();
+                return;
+        }
+        mmio_reset_data(tr);
+        enable_mmiotrace();
+}
+/*
+ * Append one "PCIDEV" header line describing @dev to @s: bus number and
+ * devfn, vendor and device id, irq, then for resources 0..6 the start value
+ * followed by, again for resources 0..6, the size, and finally the name of
+ * the bound driver (empty if none).
+ *
+ * Returns the sum of the trace_seq_printf() return values.
+ */
+static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
+{
+        int ret = 0;
+        int i;
+        resource_size_t start, end;
+        const struct pci_driver *drv = pci_dev_driver(dev);
+        /* XXX: incomplete checks for trace_seq_printf() return value */
+        ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
+                                dev->bus->number, dev->devfn,
+                                dev->vendor, dev->device, dev->irq);
+        /*
+         * XXX: is pci_resource_to_user() appropriate, since we are
+         * supposed to interpret the __ioremap() phys_addr argument based on
+         * these printed values?
+         */
+        /* first pass: start of each resource, OR'ed with its region flag bits */
+        for (i = 0; i < 7; i++) {
+                pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+                ret += trace_seq_printf(s, " %llx",
+                        (unsigned long long)(start |
+                        (dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
+        }
+        /* second pass: size of each resource, 0 when start is not below end */
+        for (i = 0; i < 7; i++) {
+                pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+                ret += trace_seq_printf(s, " %llx",
+                        dev->resource[i].start < dev->resource[i].end ?
+                        (unsigned long long)(end - start) + 1 : 0);
+        }
+        if (drv)
+                ret += trace_seq_printf(s, " %s\n", drv->name);
+        else
+                ret += trace_seq_printf(s, " \n");
+        return ret;
+}
+/*
+ * Release a header iterator: drop the reference on the PCI device it holds
+ * and free the iterator itself. A NULL @hiter is accepted and ignored.
+ */
+static void destroy_header_iter(struct header_iter *hiter)
+{
+        if (!hiter)
+                return;
+        pci_dev_put(hiter->dev);
+        kfree(hiter);
+}
+/*
+ * pipe_open callback: queue the "VERSION" line and set up a header iterator
+ * positioned at the first PCI device, stored in iter->private. If the
+ * allocation fails, iter->private is left as it was.
+ */
+static void mmio_pipe_open(struct trace_iterator *iter)
+{
+        struct header_iter *hiter;
+        struct trace_seq *s = &iter->seq;
+        trace_seq_printf(s, "VERSION 20070824\n");
+        hiter = kzalloc(sizeof(*hiter), GFP_KERNEL);
+        if (!hiter)
+                return;
+        /* passing NULL as the "from" device starts the search at the beginning */
+        hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL);
+        iter->private = hiter;
+}
+/*
+ * close callback: free the header iterator, if any is still attached.
+ * XXX: This is not called when the pipe is closed!
+ */
+static void mmio_close(struct trace_iterator *iter)
+{
+        struct header_iter *hiter = iter->private;
+        destroy_header_iter(hiter);
+        iter->private = NULL;
+}
+/*
+ * Sum the per-CPU overrun counters of @iter over all online CPUs and zero
+ * each counter as it is read. Returns the total.
+ */
+static unsigned long count_overruns(struct trace_iterator *iter)
+{
+        unsigned long lost = 0;
+        int i;
+        for_each_online_cpu(i) {
+                lost += iter->overrun[i];
+                iter->overrun[i] = 0;
+        }
+        return lost;
+}
+/*
+ * read callback of the mmiotrace pipe.
+ *
+ * If events were lost since the last call, queue a "MARK ... Lost" line
+ * (and log a warning the first time). Otherwise, while a header iterator is
+ * attached, queue the PCIDEV line of the next PCI device; the iterator is
+ * destroyed once the device list is exhausted. Whatever was queued in
+ * iter->seq is then copied to user space.
+ *
+ * Returns 0 when there is no header iterator and no overrun to report,
+ * otherwise the result of trace_seq_to_user() with -EBUSY mapped to 0.
+ */
+static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp,
+                                char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+        ssize_t ret;
+        struct header_iter *hiter = iter->private;
+        struct trace_seq *s = &iter->seq;
+        unsigned long n;
+        n = count_overruns(iter);
+        if (n) {
+                /* XXX: This is later than where events were lost. */
+                trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n);
+                if (!overrun_detected)
+                        pr_warning("mmiotrace has lost events.\n");
+                overrun_detected = true;
+                goto print_out;
+        }
+        if (!hiter)
+                return 0;
+        /*
+         * pci_get_device() in mmio_pipe_open() may have found no device at
+         * all; mmio_print_pcidev() dereferences its argument, so never hand
+         * it a NULL device.
+         */
+        if (hiter->dev) {
+                mmio_print_pcidev(s, hiter->dev);
+                hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID,
+                                            hiter->dev);
+        }
+        /* end of the device list: the header is complete */
+        if (!hiter->dev) {
+                destroy_header_iter(hiter);
+                iter->private = NULL;
+        }
+print_out:
+        ret = trace_seq_to_user(s, ubuf, cnt);
+        return (ret == -EBUSY) ? 0 : ret;
+}
+/*
+ * Format the MMIO read/write record of the current entry into iter->seq.
+ * The timestamp is printed as seconds.microseconds.
+ *
+ * Returns 1 if trace_seq_printf() reported success (non-zero), 0 otherwise.
+ */
+static int mmio_print_rw(struct trace_iterator *iter)
+{
+        struct trace_entry *entry = iter->ent;
+        struct mmiotrace_rw *rw = &entry->mmiorw;
+        struct trace_seq *s     = &iter->seq;
+        unsigned long long t    = ns2usecs(entry->t);
+        /* do_div() leaves the quotient (seconds) in t, returns the remainder */
+        unsigned long usec_rem  = do_div(t, 1000000ULL);
+        /* unsigned long so that it matches the %lu conversions below */
+        unsigned long secs      = (unsigned long)t;
+        int ret = 1;
+        switch (rw->opcode) {
+        case MMIO_READ:
+                ret = trace_seq_printf(s,
+                        "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+                        rw->width, secs, usec_rem, rw->map_id,
+                        (unsigned long long)rw->phys,
+                        rw->value, rw->pc, 0);
+                break;
+        case MMIO_WRITE:
+                ret = trace_seq_printf(s,
+                        "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+                        rw->width, secs, usec_rem, rw->map_id,
+                        (unsigned long long)rw->phys,
+                        rw->value, rw->pc, 0);
+                break;
+        case MMIO_UNKNOWN_OP:
+                /* the three low bytes of value are unsigned long: use %02lx */
+                ret = trace_seq_printf(s,
+                        "UNKNOWN %lu.%06lu %d 0x%llx %02lx,%02lx,%02lx 0x%lx %d\n",
+                        secs, usec_rem, rw->map_id,
+                        (unsigned long long)rw->phys,
+                        (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
+                        (rw->value >> 0) & 0xff, rw->pc, 0);
+                break;
+        default:
+                ret = trace_seq_printf(s, "rw what?\n");
+                break;
+        }
+        if (ret)
+                return 1;
+        return 0;
+}
+/*
+ * Format the MMIO map/unmap record of the current entry into iter->seq.
+ * The timestamp is printed as seconds.microseconds.
+ *
+ * Returns 1 if trace_seq_printf() reported success (non-zero), 0 otherwise.
+ */
+static int mmio_print_map(struct trace_iterator *iter)
+{
+        struct trace_entry *entry = iter->ent;
+        struct mmiotrace_map *m = &entry->mmiomap;
+        struct trace_seq *s     = &iter->seq;
+        unsigned long long t    = ns2usecs(entry->t);
+        /* do_div() leaves the quotient (seconds) in t, returns the remainder */
+        unsigned long usec_rem  = do_div(t, 1000000ULL);
+        /* unsigned long so that it matches the %lu conversions below */
+        unsigned long secs      = (unsigned long)t;
+        int ret = 1;
+        /* read the opcode from the map record, not from the rw union member */
+        switch (m->opcode) {
+        case MMIO_PROBE:
+                ret = trace_seq_printf(s,
+                        "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
+                        secs, usec_rem, m->map_id,
+                        (unsigned long long)m->phys, m->virt, m->len,
+                        0UL, 0);
+                break;
+        case MMIO_UNPROBE:
+                ret = trace_seq_printf(s,
+                        "UNMAP %lu.%06lu %d 0x%lx %d\n",
+                        secs, usec_rem, m->map_id, 0UL, 0);
+                break;
+        default:
+                ret = trace_seq_printf(s, "map what?\n");
+                break;
+        }
+        if (ret)
+                return 1;
+        return 0;
+}
+/*
+ * print_line callback: dispatch on the entry type. A return of 0 aborts
+ * printing without consuming the current entry in pipe mode.
+ */
+static int mmio_print_line(struct trace_iterator *iter)
+{
+        struct trace_entry *entry = iter->ent;
+        if (entry->type == TRACE_MMIO_RW)
+                return mmio_print_rw(iter);
+        if (entry->type == TRACE_MMIO_MAP)
+                return mmio_print_map(iter);
+        /* entries of any other type are ignored */
+        return 1;
+}
+/* Tracer descriptor registered under the name "mmiotrace". */
+static struct tracer mmio_tracer __read_mostly =
+{
+        .name           = "mmiotrace",
+        .init           = mmio_trace_init,
+        .reset          = mmio_trace_reset,
+        .pipe_open      = mmio_pipe_open,
+        .close          = mmio_close,
+        .read           = mmio_read,
+        .ctrl_update    = mmio_trace_ctrl_update,
+        .print_line     = mmio_print_line,
+};
+/* Initcall: register the mmiotrace tracer and return the registration result. */
+__init static int init_mmio_trace(void)
+{
+        return register_tracer(&mmio_tracer);
+}
+device_initcall(init_mmio_trace);
+/*
+ * Record the MMIO read/write event @rw in the current CPU's data of the
+ * active mmiotrace trace_array.
+ *
+ * NOTE(review): unlike mmio_trace_mapping() this does not disable
+ * preemption around smp_processor_id(), and mmio_trace_array is
+ * dereferenced without a NULL check; presumably callers run with preemption
+ * off and only while the tracer is initialised - confirm against callers.
+ */
+void mmio_trace_rw(struct mmiotrace_rw *rw)
+{
+        struct trace_array *tr = mmio_trace_array;
+        struct trace_array_cpu *data = tr->data[smp_processor_id()];
+        __trace_mmiotrace_rw(tr, data, rw);
+}
+/*
+ * Record the MMIO map/unmap event @map in the current CPU's data of the
+ * active mmiotrace trace_array.
+ *
+ * NOTE(review): mmio_trace_array is dereferenced without a NULL check;
+ * presumably only called while the tracer is initialised - confirm.
+ */
+void mmio_trace_mapping(struct mmiotrace_map *map)
+{
+        struct trace_array *tr = mmio_trace_array;
+        struct trace_array_cpu *data;
+        /* preemption stays off from the CPU lookup until the event is recorded */
+        preempt_disable();
+        data = tr->data[smp_processor_id()];
+        __trace_mmiotrace_map(tr, data, map);
+        preempt_enable();
+}
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
new file mode 100644
index 000000000000..cb817a209aa0
--- /dev/null
+++ b/kernel/trace/trace_sched_switch.c
@@ -0,0 +1,286 @@
+/*
+ * trace context switch
+ *
+ * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/marker.h>
+#include <linux/ftrace.h>
+#include "trace.h"
+/* trace_array of the sched_switch tracer; its address is the probe private data */
+static struct trace_array       *ctx_trace;
+/* non-zero while the sched_switch tracer should record events */
+static int __read_mostly        tracer_enabled;
+/* users of the marker probes: registered on 0 -> 1, unregistered on 1 -> 0 */
+static atomic_t                 sched_ref;
+/*
+ * Handle one context switch from @prev to @next. The command lines of both
+ * tasks are recorded unconditionally; a trace entry is only written when
+ * the tracer is enabled. @private points to the trace_array pointer.
+ */
+static void
+sched_switch_func(void *private, void *__rq, struct task_struct *prev,
+                        struct task_struct *next)
+{
+        struct trace_array **ptr = private;
+        struct trace_array *tr = *ptr;
+        struct trace_array_cpu *data;
+        unsigned long flags;
+        long disabled;
+        int cpu;
+        tracing_record_cmdline(prev);
+        tracing_record_cmdline(next);
+        if (!tracer_enabled)
+                return;
+        local_irq_save(flags);
+        cpu = raw_smp_processor_id();
+        data = tr->data[cpu];
+        /* only write the entry if nobody else holds this CPU's data disabled */
+        disabled = atomic_inc_return(&data->disabled);
+        if (likely(disabled == 1))
+                tracing_sched_switch_trace(tr, data, prev, next, flags);
+        atomic_dec(&data->disabled);
+        local_irq_restore(flags);
+}
+/*
+ * Marker probe for "kernel_sched_schedule". Unpacks the marker arguments in
+ * the order given by the format registered in tracing_sched_register() and
+ * forwards rq/prev/next to sched_switch_func(). Does nothing while
+ * sched_ref is zero.
+ */
+static notrace void
+sched_switch_callback(void *probe_data, void *call_data,
+                      const char *format, va_list *args)
+{
+        struct task_struct *prev;
+        struct task_struct *next;
+        struct rq *__rq;
+        if (!atomic_read(&sched_ref))
+                return;
+        /* skip prev_pid %d next_pid %d prev_state %ld */
+        (void)va_arg(*args, int);
+        (void)va_arg(*args, int);
+        (void)va_arg(*args, long);
+        __rq = va_arg(*args, typeof(__rq));
+        prev = va_arg(*args, typeof(prev));
+        next = va_arg(*args, typeof(next));
+        /*
+         * If tracer_switch_func only points to the local
+         * switch func, it still needs the ptr passed to it.
+         */
+        sched_switch_func(probe_data, __rq, prev, next);
+}
+/*
+ * Handle one wakeup of @wakee while @curr is running: when the tracer is
+ * enabled, record curr's command line and write a wakeup trace entry.
+ * @private points to the trace_array pointer.
+ */
+static void
+wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct
+                        task_struct *curr)
+{
+        struct trace_array **ptr = private;
+        struct trace_array *tr = *ptr;
+        struct trace_array_cpu *data;
+        unsigned long flags;
+        long disabled;
+        int cpu;
+        if (!tracer_enabled)
+                return;
+        tracing_record_cmdline(curr);
+        local_irq_save(flags);
+        cpu = raw_smp_processor_id();
+        data = tr->data[cpu];
+        /* only write the entry if nobody else holds this CPU's data disabled */
+        disabled = atomic_inc_return(&data->disabled);
+        if (likely(disabled == 1))
+                tracing_sched_wakeup_trace(tr, data, wakee, curr, flags);
+        atomic_dec(&data->disabled);
+        local_irq_restore(flags);
+}
+/*
+ * Marker probe for "kernel_sched_wakeup" and "kernel_sched_wakeup_new".
+ * Unpacks the marker arguments in the order given by the registered format,
+ * records the command lines of both tasks and hands over to wakeup_func().
+ * Returns immediately while the tracer is disabled.
+ */
+static notrace void
+wake_up_callback(void *probe_data, void *call_data,
+                 const char *format, va_list *args)
+{
+        struct task_struct *curr;
+        struct task_struct *task;
+        struct rq *__rq;
+        if (likely(!tracer_enabled))
+                return;
+        /* Skip pid %d state %ld */
+        (void)va_arg(*args, int);
+        (void)va_arg(*args, long);
+        /* now get the meat: "rq %p task %p rq->curr %p" */
+        __rq = va_arg(*args, typeof(__rq));
+        task = va_arg(*args, typeof(task));
+        curr = va_arg(*args, typeof(curr));
+        tracing_record_cmdline(task);
+        tracing_record_cmdline(curr);
+        wakeup_func(probe_data, __rq, task, curr);
+}
+/* Take a new start timestamp and reset the trace data of every online CPU. */
+static void sched_switch_reset(struct trace_array *tr)
+{
+        int cpu;
+        tr->time_start = ftrace_now(tr->cpu);
+        for_each_online_cpu(cpu)
+                tracing_reset(tr->data[cpu]);
+}
+/*
+ * Attach the marker probes for kernel_sched_wakeup, kernel_sched_wakeup_new
+ * and kernel_sched_schedule, each with &ctx_trace as private data.
+ *
+ * Returns 0 on success. On failure the probes registered so far are removed
+ * again and the error from marker_probe_register() is returned.
+ */
+static int tracing_sched_register(void)
+{
+        int ret;
+        ret = marker_probe_register("kernel_sched_wakeup",
+                        "pid %d state %ld ## rq %p task %p rq->curr %p",
+                        wake_up_callback,
+                        &ctx_trace);
+        if (ret) {
+                pr_info("wakeup trace: Couldn't add marker"
+                        " probe to kernel_sched_wakeup\n");
+                return ret;
+        }
+        ret = marker_probe_register("kernel_sched_wakeup_new",
+                        "pid %d state %ld ## rq %p task %p rq->curr %p",
+                        wake_up_callback,
+                        &ctx_trace);
+        if (ret) {
+                pr_info("wakeup trace: Couldn't add marker"
+                        " probe to kernel_sched_wakeup_new\n");
+                goto fail_deprobe;
+        }
+        ret = marker_probe_register("kernel_sched_schedule",
+                "prev_pid %d next_pid %d prev_state %ld "
+                "## rq %p prev %p next %p",
+                sched_switch_callback,
+                &ctx_trace);
+        if (ret) {
+                pr_info("sched trace: Couldn't add marker"
+                        " probe to kernel_sched_schedule\n");
+                goto fail_deprobe_wake_new;
+        }
+        return ret;
+        /* unwind in reverse order of registration */
+fail_deprobe_wake_new:
+        marker_probe_unregister("kernel_sched_wakeup_new",
+                                wake_up_callback,
+                                &ctx_trace);
+fail_deprobe:
+        marker_probe_unregister("kernel_sched_wakeup",
+                                wake_up_callback,
+                                &ctx_trace);
+        return ret;
+}
+/*
+ * Detach the three marker probes attached by tracing_sched_register(), in
+ * reverse order of registration.
+ */
+static void tracing_sched_unregister(void)
+{
+        marker_probe_unregister("kernel_sched_schedule",
+                                sched_switch_callback,
+                                &ctx_trace);
+        marker_probe_unregister("kernel_sched_wakeup_new",
+                                wake_up_callback,
+                                &ctx_trace);
+        marker_probe_unregister("kernel_sched_wakeup",
+                                wake_up_callback,
+                                &ctx_trace);
+}
+/*
+ * Take a reference on the scheduler probes; the first reference attaches
+ * them. The result of the registration is not checked.
+ */
+static void tracing_start_sched_switch(void)
+{
+        if (atomic_inc_return(&sched_ref) == 1)
+                tracing_sched_register();
+}
+/*
+ * Drop a reference on the scheduler probes; dropping the last one detaches
+ * them.
+ */
+static void tracing_stop_sched_switch(void)
+{
+        if (atomic_dec_and_test(&sched_ref))
+                tracing_sched_unregister();
+}
+/*
+ * Public entry point: take a reference on the scheduler probes (attaching
+ * them if this is the first one).
+ */
+void tracing_start_cmdline_record(void)
+{
+        tracing_start_sched_switch();
+}
+/*
+ * Public entry point: drop a reference on the scheduler probes (detaching
+ * them if this was the last one).
+ */
+void tracing_stop_cmdline_record(void)
+{
+        tracing_stop_sched_switch();
+}
+/*
+ * Start tracing: reset the buffers, take a probe reference and only then
+ * flag the tracer as enabled.
+ */
+static void start_sched_trace(struct trace_array *tr)
+{
+        sched_switch_reset(tr);
+        tracing_start_cmdline_record();
+        tracer_enabled = 1;
+}
+/*
+ * Stop tracing: clear the enabled flag first, then drop the probe
+ * reference. @tr is unused.
+ */
+static void stop_sched_trace(struct trace_array *tr)
+{
+        tracer_enabled = 0;
+        tracing_stop_cmdline_record();
+}
+/*
+ * Tracer init callback: remember the trace_array and start tracing if
+ * tracing is currently switched on (tr->ctrl).
+ */
+static void sched_switch_trace_init(struct trace_array *tr)
+{
+        ctx_trace = tr;
+        if (tr->ctrl)
+                start_sched_trace(tr);
+}
+/* Tracer reset callback: stop tracing if it was switched on (tr->ctrl). */
+static void sched_switch_trace_reset(struct trace_array *tr)
+{
+        if (tr->ctrl)
+                stop_sched_trace(tr);
+}
+/*
+ * Called when tr->ctrl changes: stop tracing when it was switched off,
+ * otherwise (re)start it, which also resets the buffers.
+ */
+static void sched_switch_trace_ctrl_update(struct trace_array *tr)
+{
+        if (!tr->ctrl) {
+                stop_sched_trace(tr);
+                return;
+        }
+        /* starting a new trace wipes the old contents */
+        start_sched_trace(tr);
+}
+/* Tracer descriptor registered under the name "sched_switch". */
+static struct tracer sched_switch_trace __read_mostly =
+{
+        .name           = "sched_switch",
+        .init           = sched_switch_trace_init,
+        .reset          = sched_switch_trace_reset,
+        .ctrl_update    = sched_switch_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+        .selftest    = trace_selftest_startup_sched_switch,
+#endif
+};
+/*
+ * Initcall: if a probe reference is already held at this point, attach the
+ * scheduler probes now (bailing out on error), then register the
+ * sched_switch tracer and return the registration result.
+ */
+__init static int init_sched_switch_trace(void)
+{
+        if (atomic_read(&sched_ref)) {
+                int err = tracing_sched_register();
+                if (err) {
+                        pr_info("error registering scheduler trace\n");
+                        return err;
+                }
+        }
+        return register_tracer(&sched_switch_trace);
+}
+device_initcall(init_sched_switch_trace);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
new file mode 100644
index 000000000000..e303ccb62cdf
--- /dev/null
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -0,0 +1,453 @@
+/*
+ * trace task wakeup timings
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/marker.h>
+#include "trace.h"
+/* trace_array of the wakeup tracer; its address is the probe private data */
+static struct trace_array       *wakeup_trace;
+/* non-zero while the wakeup tracer is armed */
+static int __read_mostly        tracer_enabled;
+/* task currently being timed, holding a task reference; NULL if none */
+static struct task_struct       *wakeup_task;
+/* CPU of wakeup_task when tracing of it started; -1 if none */
+static int                      wakeup_cpu;
+/* prio of wakeup_task; -1 (i.e. the largest unsigned value) if none */
+static unsigned                 wakeup_prio = -1;
+/* protects wakeup_task, wakeup_cpu and wakeup_prio */
+static raw_spinlock_t wakeup_lock =
+        (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+static void __wakeup_reset(struct trace_array *tr);
+#ifdef CONFIG_FTRACE
+/*
+ * The wakeup tracer uses its own tracer function to keep the overhead down:
+ * a function entry is only recorded while a task is being timed and only
+ * on the CPU that task is on.
+ */
+static void
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+        struct trace_array *tr = wakeup_trace;
+        struct trace_array_cpu *data;
+        unsigned long flags;
+        long disabled;
+        int resched;
+        int cpu;
+        /* cheap unlocked check; repeated under wakeup_lock below */
+        if (likely(!wakeup_task))
+                return;
+        /* sampled before preemption is disabled, used when re-enabling it */
+        resched = need_resched();
+        preempt_disable_notrace();
+        cpu = raw_smp_processor_id();
+        data = tr->data[cpu];
+        disabled = atomic_inc_return(&data->disabled);
+        if (unlikely(disabled != 1))
+                goto out;
+        local_irq_save(flags);
+        __raw_spin_lock(&wakeup_lock);
+        if (unlikely(!wakeup_task))
+                goto unlock;
+        /*
+         * The task can't disappear because it needs to
+         * wake up first, and we have the wakeup_lock.
+         */
+        if (task_cpu(wakeup_task) != cpu)
+                goto unlock;
+        trace_function(tr, data, ip, parent_ip, flags);
+ unlock:
+        __raw_spin_unlock(&wakeup_lock);
+        local_irq_restore(flags);
+ out:
+        atomic_dec(&data->disabled);
+        /*
+         * To prevent recursion from the scheduler, if the
+         * resched flag was set before we entered, then
+         * don't reschedule.
+         */
+        if (resched)
+                preempt_enable_no_resched_notrace();
+        else
+                preempt_enable_notrace();
+}
+/* ftrace hook that routes function entries to wakeup_tracer_call() */
+static struct ftrace_ops trace_ops __read_mostly =
+{
+        .func = wakeup_tracer_call,
+};
+#endif /* CONFIG_FTRACE */
+/*
+ * Decide whether the latency @delta is worth recording. With a threshold
+ * configured, anything at or above it qualifies; without one, only a new
+ * maximum does. Returns 1 to record, 0 to drop.
+ */
+static int report_latency(cycle_t delta)
+{
+        if (tracing_thresh)
+                return delta >= tracing_thresh;
+        return delta > tracing_max_latency;
+}
+/*
+ * Context-switch hook of the wakeup tracer. When @next is the task being
+ * timed, the wakeup latency is complete: record a final function entry,
+ * compute the latency from the stored timestamp and, if it qualifies
+ * according to report_latency(), save it as the new maximum. The tracked
+ * task is dropped afterwards. @private points to the trace_array pointer.
+ */
+static void notrace
+wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
+        struct task_struct *next)
+{
+        struct trace_array **ptr = private;
+        struct trace_array *tr = *ptr;
+        struct trace_array_cpu *data;
+        cycle_t T0, T1, delta;
+        unsigned long flags;
+        long disabled;
+        int cpu;
+        if (unlikely(!tracer_enabled))
+                return;
+        /*
+         * When we start a new trace, we set wakeup_task to NULL
+         * and then set tracer_enabled = 1. We want to make sure
+         * that another CPU does not see the tracer_enabled = 1
+         * and the wakeup_task with an older task, that might
+         * actually be the same as next.
+         */
+        smp_rmb();
+        if (next != wakeup_task)
+                return;
+        /* The task we are waiting for is waking up */
+        data = tr->data[wakeup_cpu];
+        /* disable local data, not wakeup_cpu data */
+        cpu = raw_smp_processor_id();
+        disabled = atomic_inc_return(&tr->data[cpu]->disabled);
+        /* being the only user is the expected case */
+        if (unlikely(disabled != 1))
+                goto out;
+        local_irq_save(flags);
+        __raw_spin_lock(&wakeup_lock);
+        /* We could race with grabbing wakeup_lock */
+        if (unlikely(!tracer_enabled || next != wakeup_task))
+                goto out_unlock;
+        trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags);
+        /* latency = now - time at which tracing of the wakeup started */
+        T0 = data->preempt_timestamp;
+        T1 = ftrace_now(cpu);
+        delta = T1-T0;
+        if (!report_latency(delta))
+                goto out_unlock;
+        tracing_max_latency = delta;
+        update_max_tr(tr, wakeup_task, wakeup_cpu);
+out_unlock:
+        /* reached on the race path as well: the tracked task is always dropped */
+        __wakeup_reset(tr);
+        __raw_spin_unlock(&wakeup_lock);
+        local_irq_restore(flags);
+out:
+        atomic_dec(&tr->data[cpu]->disabled);
+}
+/*
+ * Marker probe for "kernel_sched_schedule". Unpacks the marker arguments in
+ * the order given by the format registered in start_wakeup_tracer(),
+ * records prev's command line and forwards to wakeup_sched_switch().
+ */
+static notrace void
+sched_switch_callback(void *probe_data, void *call_data,
+                      const char *format, va_list *args)
+{
+        struct task_struct *prev;
+        struct task_struct *next;
+        struct rq *__rq;
+        /* skip prev_pid %d next_pid %d prev_state %ld */
+        (void)va_arg(*args, int);
+        (void)va_arg(*args, int);
+        (void)va_arg(*args, long);
+        __rq = va_arg(*args, typeof(__rq));
+        prev = va_arg(*args, typeof(prev));
+        next = va_arg(*args, typeof(next));
+        tracing_record_cmdline(prev);
+        /*
+         * If tracer_switch_func only points to the local
+         * switch func, it still needs the ptr passed to it.
+         */
+        wakeup_sched_switch(probe_data, __rq, prev, next);
+}
+/*
+ * Reset the trace data of every possible CPU and forget the tracked task,
+ * dropping the task reference if one is held. The callers visible in this
+ * file invoke it with wakeup_lock held.
+ */
+static void __wakeup_reset(struct trace_array *tr)
+{
+        struct trace_array_cpu *data;
+        int cpu;
+        for_each_possible_cpu(cpu) {
+                data = tr->data[cpu];
+                tracing_reset(data);
+        }
+        wakeup_cpu = -1;
+        wakeup_prio = -1;
+        if (wakeup_task)
+                put_task_struct(wakeup_task);
+        wakeup_task = NULL;
+}
+/*
+ * Locked wrapper around __wakeup_reset(): takes wakeup_lock with interrupts
+ * disabled for the duration of the reset.
+ */
+static void wakeup_reset(struct trace_array *tr)
+{
+        unsigned long flags;
+        local_irq_save(flags);
+        __raw_spin_lock(&wakeup_lock);
+        __wakeup_reset(tr);
+        __raw_spin_unlock(&wakeup_lock);
+        local_irq_restore(flags);
+}
+/*
+ * Called when task @p is woken while @curr is running. If @p is an RT task
+ * whose prio value is below both the currently tracked wakeup_prio and
+ * curr's prio, restart the trace with @p as the tracked task: take a task
+ * reference, note its CPU and prio, stamp the start time and record a first
+ * function entry.
+ */
+static void
+wakeup_check_start(struct trace_array *tr, struct task_struct *p,
+                   struct task_struct *curr)
+{
+        int cpu = smp_processor_id();
+        unsigned long flags;
+        long disabled;
+        /* unlocked pre-check; wakeup_prio is rechecked under the lock */
+        if (likely(!rt_task(p)) ||
+                        p->prio >= wakeup_prio ||
+                        p->prio >= curr->prio)
+                return;
+        disabled = atomic_inc_return(&tr->data[cpu]->disabled);
+        if (unlikely(disabled != 1))
+                goto out;
+        /* interrupts should be off from try_to_wake_up */
+        __raw_spin_lock(&wakeup_lock);
+        /* check for races. */
+        if (!tracer_enabled || p->prio >= wakeup_prio)
+                goto out_locked;
+        /* reset the trace */
+        __wakeup_reset(tr);
+        wakeup_cpu = task_cpu(p);
+        wakeup_prio = p->prio;
+        wakeup_task = p;
+        /* reference is dropped again by __wakeup_reset() */
+        get_task_struct(wakeup_task);
+        local_save_flags(flags);
+        tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
+        trace_function(tr, tr->data[wakeup_cpu],
+                       CALLER_ADDR1, CALLER_ADDR2, flags);
+out_locked:
+        __raw_spin_unlock(&wakeup_lock);
+out:
+        atomic_dec(&tr->data[cpu]->disabled);
+}
+/*
+ * Marker probe for "kernel_sched_wakeup" and "kernel_sched_wakeup_new".
+ * Unpacks the marker arguments in the order given by the registered format,
+ * records the command lines of both tasks and lets wakeup_check_start()
+ * decide whether to start timing the woken task. Returns immediately while
+ * the tracer is disabled.
+ */
+static notrace void
+wake_up_callback(void *probe_data, void *call_data,
+                 const char *format, va_list *args)
+{
+        struct trace_array **ptr = probe_data;
+        struct trace_array *tr = *ptr;
+        struct task_struct *curr;
+        struct task_struct *task;
+        struct rq *__rq;
+        if (likely(!tracer_enabled))
+                return;
+        /* Skip pid %d state %ld */
+        (void)va_arg(*args, int);
+        (void)va_arg(*args, long);
+        /* now get the meat: "rq %p task %p rq->curr %p" */
+        __rq = va_arg(*args, typeof(__rq));
+        task = va_arg(*args, typeof(task));
+        curr = va_arg(*args, typeof(curr));
+        tracing_record_cmdline(task);
+        tracing_record_cmdline(curr);
+        wakeup_check_start(tr, task, curr);
+}
+/*
+ * Arm the wakeup tracer: attach the three scheduler marker probes, reset
+ * the trace state, hook the function tracer and finally set tracer_enabled.
+ * If a probe cannot be attached, the ones attached so far are removed and
+ * the function returns without enabling the tracer; no error is reported
+ * to the caller.
+ */
+static void start_wakeup_tracer(struct trace_array *tr)
+{
+        int ret;
+        ret = marker_probe_register("kernel_sched_wakeup",
+                        "pid %d state %ld ## rq %p task %p rq->curr %p",
+                        wake_up_callback,
+                        &wakeup_trace);
+        if (ret) {
+                pr_info("wakeup trace: Couldn't add marker"
+                        " probe to kernel_sched_wakeup\n");
+                return;
+        }
+        ret = marker_probe_register("kernel_sched_wakeup_new",
+                        "pid %d state %ld ## rq %p task %p rq->curr %p",
+                        wake_up_callback,
+                        &wakeup_trace);
+        if (ret) {
+                pr_info("wakeup trace: Couldn't add marker"
+                        " probe to kernel_sched_wakeup_new\n");
+                goto fail_deprobe;
+        }
+        ret = marker_probe_register("kernel_sched_schedule",
+                "prev_pid %d next_pid %d prev_state %ld "
+                "## rq %p prev %p next %p",
+                sched_switch_callback,
+                &wakeup_trace);
+        if (ret) {
+                pr_info("sched trace: Couldn't add marker"
+                        " probe to kernel_sched_schedule\n");
+                goto fail_deprobe_wake_new;
+        }
+        wakeup_reset(tr);
+        /*
+         * Don't let the tracer_enabled = 1 show up before
+         * the wakeup_task is reset. This may be overkill since
+         * wakeup_reset does a spin_unlock after setting the
+         * wakeup_task to NULL, but I want to be safe.
+         * This is a slow path anyway.
+         */
+        smp_wmb();
+        register_ftrace_function(&trace_ops);
+        tracer_enabled = 1;
+        return;
+        /* unwind in reverse order of registration */
+fail_deprobe_wake_new:
+        marker_probe_unregister("kernel_sched_wakeup_new",
+                                wake_up_callback,
+                                &wakeup_trace);
+fail_deprobe:
+        marker_probe_unregister("kernel_sched_wakeup",
+                                wake_up_callback,
+                                &wakeup_trace);
+}
+/*
+ * Disarm the wakeup tracer: clear tracer_enabled first, unhook the function
+ * tracer and detach the three marker probes in reverse order of
+ * registration. @tr is unused.
+ */
+static void stop_wakeup_tracer(struct trace_array *tr)
+{
+        tracer_enabled = 0;
+        unregister_ftrace_function(&trace_ops);
+        marker_probe_unregister("kernel_sched_schedule",
+                                sched_switch_callback,
+                                &wakeup_trace);
+        marker_probe_unregister("kernel_sched_wakeup_new",
+                                wake_up_callback,
+                                &wakeup_trace);
+        marker_probe_unregister("kernel_sched_wakeup",
+                                wake_up_callback,
+                                &wakeup_trace);
+}
+/*
+ * Tracer init callback: remember the trace_array and arm the tracer if
+ * tracing is currently switched on (tr->ctrl).
+ */
+static void wakeup_tracer_init(struct trace_array *tr)
+{
+        wakeup_trace = tr;
+        if (tr->ctrl)
+                start_wakeup_tracer(tr);
+}
+/*
+ * Tracer reset callback: if tracing was switched on, disarm the tracer and
+ * release any task that is still being tracked.
+ */
+static void wakeup_tracer_reset(struct trace_array *tr)
+{
+        if (tr->ctrl) {
+                stop_wakeup_tracer(tr);
+                /* make sure we put back any tasks we are tracing */
+                wakeup_reset(tr);
+        }
+}
+/* Called when tr->ctrl changes: disarm when switched off, arm otherwise. */
+static void wakeup_tracer_ctrl_update(struct trace_array *tr)
+{
+        if (!tr->ctrl) {
+                stop_wakeup_tracer(tr);
+                return;
+        }
+        start_wakeup_tracer(tr);
+}
+/* open callback: pause tracing while the trace is being read. */
+static void wakeup_tracer_open(struct trace_iterator *iter)
+{
+        /* stop the trace while dumping */
+        if (iter->tr->ctrl)
+                stop_wakeup_tracer(iter->tr);
+}
+/*
+ * close callback: resume tracing after a read. Re-arming goes through
+ * start_wakeup_tracer(), which resets the trace state.
+ */
+static void wakeup_tracer_close(struct trace_iterator *iter)
+{
+        /* forget about any processes we were recording */
+        if (iter->tr->ctrl)
+                start_wakeup_tracer(iter->tr);
+}
+/* Tracer descriptor registered under the name "wakeup". */
+static struct tracer wakeup_tracer __read_mostly =
+{
+        .name           = "wakeup",
+        .init           = wakeup_tracer_init,
+        .reset          = wakeup_tracer_reset,
+        .open           = wakeup_tracer_open,
+        .close          = wakeup_tracer_close,
+        .ctrl_update    = wakeup_tracer_ctrl_update,
+        .print_max      = 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+        .selftest    = trace_selftest_startup_wakeup,
+#endif
+};
+/* Initcall: register the wakeup tracer and return the registration result. */
+__init static int init_wakeup_tracer(void)
+{
+        return register_tracer(&wakeup_tracer);
+}
+device_initcall(init_wakeup_tracer);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
new file mode 100644
index 000000000000..0911b7e073bf
--- /dev/null
+++ b/kernel/trace/trace_selftest.c
@@ -0,0 +1,563 @@
+/* Meant to be #included from trace.c, not compiled on its own */
+#include <linux/kthread.h>
+#include <linux/delay.h>
+/*
+ * Return 1 if @entry has one of the entry types the selftests accept,
+ * 0 for any other type.
+ */
+static inline int trace_valid_entry(struct trace_entry *entry)
+{
+        int valid;
+        switch (entry->type) {
+        case TRACE_FN:
+        case TRACE_CTX:
+        case TRACE_WAKE:
+        case TRACE_STACK:
+        case TRACE_SPECIAL:
+                valid = 1;
+                break;
+        default:
+                valid = 0;
+                break;
+        }
+        return valid;
+}
+/*
+ * Sanity-check one CPU's trace buffer.
+ *
+ * Walks tr->entries slots across the pages linked on data->trace_pages,
+ * checking that each of the first data->trace_idx slots holds a known
+ * entry type and that the page list is exactly long enough to hold
+ * tr->entries slots.
+ *
+ * Returns 0 if the buffer looks sane. On any inconsistency it sets
+ * tracing_disabled and returns -1.
+ */
+static int
+trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data)
+{
+        struct trace_entry *entries;
+        struct page *page;
+        int idx = 0;
+        int i;
+        BUG_ON(list_empty(&data->trace_pages));
+        page = list_entry(data->trace_pages.next, struct page, lru);
+        entries = page_address(page);
+        check_pages(data);
+        /* the head page must be the first page on the list */
+        if (head_page(data) != entries)
+                goto failed;
+        /*
+         * The starting trace buffer always has valid elements,
+         * if any element exists.
+         */
+        entries = head_page(data);
+        for (i = 0; i < tr->entries; i++) {
+                /* only slots below trace_idx are required to be valid */
+                if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) {
+                        printk(KERN_CONT ".. invalid entry %d ",
+                                entries[idx].type);
+                        goto failed;
+                }
+                idx++;
+                /* end of this page: move on to the next one on the list */
+                if (idx >= ENTRIES_PER_PAGE) {
+                        page = virt_to_page(entries);
+                        if (page->lru.next == &data->trace_pages) {
+                                /* ran out of pages before the last slot */
+                                if (i != tr->entries - 1) {
+                                        printk(KERN_CONT ".. entries buffer mismatch");
+                                        goto failed;
+                                }
+                        } else {
+                                page = list_entry(page->lru.next, struct page, lru);
+                                entries = page_address(page);
+                        }
+                        idx = 0;
+                }
+        }
+        /* after the last slot there must be no further pages */
+        page = virt_to_page(entries);
+        if (page->lru.next != &data->trace_pages) {
+                printk(KERN_CONT ".. too many entries");
+                goto failed;
+        }
+        return 0;
+ failed:
+        /* disable tracing */
+        tracing_disabled = 1;
+        printk(KERN_CONT ".. corrupted trace buffer .. ");
+        return -1;
+}
+/*
+ * Test the trace buffer to see if all the elements
+ * are still sane.
+ *
+ * @tr:    trace array whose per-CPU buffers are checked
+ * @count: if non-NULL, receives the sum of trace_idx over the CPUs
+ *         visited (the walk stops at the first CPU that fails)
+ *
+ * Returns 0 if every checked buffer is sane, otherwise the error from
+ * trace_test_buffer_cpu(). Runs with interrupts off and ftrace_max_lock
+ * held.
+ */
+static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
+{
+        unsigned long flags, cnt = 0;
+        int cpu, ret = 0;
+        /* Don't allow flipping of max traces now */
+        raw_local_irq_save(flags);
+        __raw_spin_lock(&ftrace_max_lock);
+        for_each_possible_cpu(cpu) {
+                /* skip CPUs that have no head page */
+                if (!head_page(tr->data[cpu]))
+                        continue;
+                cnt += tr->data[cpu]->trace_idx;
+                ret = trace_test_buffer_cpu(tr, tr->data[cpu]);
+                if (ret)
+                        break;
+        }
+        __raw_spin_unlock(&ftrace_max_lock);
+        raw_local_irq_restore(flags);
+        if (count)
+                *count = cnt;
+        return ret;
+}
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+#define __STR(x) #x
+#define STR(x) __STR(x)
+/*
+ * Test dynamic code modification and ftrace filters.
+ *
+ * @trace: tracer under test; its init, ctrl_update and reset hooks are used
+ * @tr:    trace array to record into
+ * @func:  the function the filter is set on; it is called through this
+ *         pointer so that gcc cannot optimize the call away
+ *
+ * With the filter restricted to @func, the buffer must stay empty until
+ * @func is called and must then hold exactly one entry.
+ *
+ * Returns 0 on success, non-zero on failure. ftrace_enabled and
+ * tracer_enabled are put back to their values on entry on every path.
+ */
+int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
+                                           struct trace_array *tr,
+                                           int (*func)(void))
+{
+        unsigned long count;
+        int ret;
+        int save_ftrace_enabled = ftrace_enabled;
+        int save_tracer_enabled = tracer_enabled;
+        char *func_name;
+        /* The ftrace test PASSED */
+        printk(KERN_CONT "PASSED\n");
+        pr_info("Testing dynamic ftrace: ");
+        /* enable tracing, and record the filter function */
+        ftrace_enabled = 1;
+        tracer_enabled = 1;
+        /* passed in by parameter to fool gcc from optimizing */
+        func();
+        /* update the records */
+        ret = ftrace_force_update();
+        if (ret) {
+                printk(KERN_CONT ".. ftraced failed .. ");
+                /* no filter has been set yet, only undo the enables */
+                ftrace_enabled = save_ftrace_enabled;
+                tracer_enabled = save_tracer_enabled;
+                return ret;
+        }
+        /*
+         * Some archs *cough*PowerPC*cough* add characters to the
+         * start of the function names. We simply put a '*' to
+         * accommodate them.
+         */
+        func_name = "*" STR(DYN_FTRACE_TEST_NAME);
+        /* filter only on our function */
+        ftrace_set_filter(func_name, strlen(func_name), 1);
+        /* enable tracing */
+        tr->ctrl = 1;
+        trace->init(tr);
+        /* Sleep for a 1/10 of a second */
+        msleep(100);
+        /* we should have nothing in the buffer */
+        ret = trace_test_buffer(tr, &count);
+        if (ret)
+                goto out;
+        if (count) {
+                ret = -1;
+                printk(KERN_CONT ".. filter did not filter .. ");
+                goto out;
+        }
+        /* call our function again */
+        func();
+        /* sleep again */
+        msleep(100);
+        /* stop the tracing. */
+        tr->ctrl = 0;
+        trace->ctrl_update(tr);
+        ftrace_enabled = 0;
+        /* check the trace buffer */
+        ret = trace_test_buffer(tr, &count);
+        trace->reset(tr);
+        /* we should only have one item */
+        if (!ret && count != 1) {
+                /* count is unsigned long, so print it with %lu */
+                printk(KERN_CONT ".. filter failed count=%lu ..", count);
+                ret = -1;
+                goto out;
+        }
+ out:
+        ftrace_enabled = save_ftrace_enabled;
+        tracer_enabled = save_tracer_enabled;
+        /* Enable tracing on all functions again */
+        ftrace_set_filter(NULL, 0, 1);
+        return ret;
+}
+#else
+# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
+#endif /* CONFIG_DYNAMIC_FTRACE */
+/*
+ * Simple verification test of ftrace function tracer.
+ * Enable ftrace, sleep 1/10 second, and then read the trace
+ * buffer to see if all is in order.
+ *
+ * On success the dynamic ftrace test is run as well (a no-op returning 0
+ * when CONFIG_DYNAMIC_FTRACE is not set). Returns 0 on success, non-zero
+ * on failure. If a failure is detected after tracing was started,
+ * ftrace_kill() is called.
+ */
+int
+trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
+{
+        unsigned long count;
+        int ret;
+        int save_ftrace_enabled = ftrace_enabled;
+        int save_tracer_enabled = tracer_enabled;
+        /* make sure msleep has been recorded */
+        msleep(1);
+        /* force the recorded functions to be traced */
+        ret = ftrace_force_update();
+        if (ret) {
+                /* nothing has been changed yet, so a plain return is enough */
+                printk(KERN_CONT ".. ftraced failed .. ");
+                return ret;
+        }
+        /* start the tracing */
+        ftrace_enabled = 1;
+        tracer_enabled = 1;
+        tr->ctrl = 1;
+        trace->init(tr);
+        /* Sleep for a 1/10 of a second */
+        msleep(100);
+        /* stop the tracing. */
+        tr->ctrl = 0;
+        trace->ctrl_update(tr);
+        ftrace_enabled = 0;
+        /* check the trace buffer */
+        ret = trace_test_buffer(tr, &count);
+        trace->reset(tr);
+        if (!ret && !count) {
+                printk(KERN_CONT ".. no entries found ..");
+                ret = -1;
+                goto out;
+        }
+        /* NOTE(review): runs even when ret is already non-zero, overwriting it */
+        ret = trace_selftest_startup_dynamic_tracing(trace, tr,
+                                                     DYN_FTRACE_TEST_NAME);
+ out:
+        ftrace_enabled = save_ftrace_enabled;
+        tracer_enabled = save_tracer_enabled;
+        /* kill ftrace totally if we failed */
+        if (ret)
+                ftrace_kill();
+        return ret;
+}
+#endif /* CONFIG_FTRACE */
+#ifdef CONFIG_IRQSOFF_TRACER
+/*
+ * Selftest for the irqsoff tracer: keep interrupts disabled for 100us
+ * and require that both the live buffer and the max-latency buffer are
+ * sane and that the max-latency buffer is not empty.
+ *
+ * Returns 0 on success, non-zero on failure. tracing_max_latency is
+ * restored to its value on entry.
+ */
+int
+trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
+{
+        unsigned long saved_latency = tracing_max_latency;
+        unsigned long nr_entries;
+        int err;
+        /* turn the tracer on */
+        tr->ctrl = 1;
+        trace->init(tr);
+        /* start from a clean max latency */
+        tracing_max_latency = 0;
+        /* create an irqs-off section for the tracer to catch */
+        local_irq_disable();
+        udelay(100);
+        local_irq_enable();
+        /* turn the tracer off again */
+        tr->ctrl = 0;
+        trace->ctrl_update(tr);
+        /* live buffer first, then the max-latency snapshot */
+        err = trace_test_buffer(tr, NULL);
+        if (err == 0)
+                err = trace_test_buffer(&max_tr, &nr_entries);
+        trace->reset(tr);
+        /* nr_entries is only meaningful when both checks passed */
+        if (err == 0 && nr_entries == 0) {
+                printk(KERN_CONT ".. no entries found ..");
+                err = -1;
+        }
+        tracing_max_latency = saved_latency;
+        return err;
+}
+#endif /* CONFIG_IRQSOFF_TRACER */
+#ifdef CONFIG_PREEMPT_TRACER
+/*
+ * Selftest for the preemptoff tracer: keep preemption disabled for 100us
+ * and require that both the live buffer and the max-latency buffer are
+ * sane and that the max-latency buffer is not empty.
+ *
+ * Returns 0 on success, non-zero on failure. tracing_max_latency is
+ * restored to its value on entry.
+ */
+int
+trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
+{
+        unsigned long saved_latency = tracing_max_latency;
+        unsigned long nr_entries;
+        int err;
+        /* turn the tracer on */
+        tr->ctrl = 1;
+        trace->init(tr);
+        /* start from a clean max latency */
+        tracing_max_latency = 0;
+        /* create a preempt-off section for the tracer to catch */
+        preempt_disable();
+        udelay(100);
+        preempt_enable();
+        /* turn the tracer off again */
+        tr->ctrl = 0;
+        trace->ctrl_update(tr);
+        /* live buffer first, then the max-latency snapshot */
+        err = trace_test_buffer(tr, NULL);
+        if (err == 0)
+                err = trace_test_buffer(&max_tr, &nr_entries);
+        trace->reset(tr);
+        /* nr_entries is only meaningful when both checks passed */
+        if (err == 0 && nr_entries == 0) {
+                printk(KERN_CONT ".. no entries found ..");
+                err = -1;
+        }
+        tracing_max_latency = saved_latency;
+        return err;
+}
+#endif /* CONFIG_PREEMPT_TRACER */
+#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
+/*
+ * Selftest for the combined preempt/irqs-off tracer. Two passes are run;
+ * each creates a 100us section with both preemption and interrupts
+ * disabled, then requires the live and max-latency buffers to be sane
+ * and the max-latency buffer to be non-empty.
+ *
+ * Returns 0 on success, non-zero on failure. The tracer is reset and
+ * tracing_max_latency restored on every path.
+ */
+int
+trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)
+{
+        unsigned long save_max = tracing_max_latency;
+        unsigned long count;
+        int ret;
+        /* start the tracing */
+        tr->ctrl = 1;
+        trace->init(tr);
+        /* reset the max latency */
+        tracing_max_latency = 0;
+        /* disable preemption and interrupts for a bit */
+        preempt_disable();
+        local_irq_disable();
+        udelay(100);
+        preempt_enable();
+        /* reverse the order of preempt vs irqs */
+        local_irq_enable();
+        /* stop the tracing. */
+        tr->ctrl = 0;
+        trace->ctrl_update(tr);
+        /* check both trace buffers */
+        ret = trace_test_buffer(tr, NULL);
+        if (ret)
+                goto out;
+        ret = trace_test_buffer(&max_tr, &count);
+        if (ret)
+                goto out;
+        /* ret is known to be 0 here, so only count decides */
+        if (!ret && !count) {
+                printk(KERN_CONT ".. no entries found ..");
+                ret = -1;
+                goto out;
+        }
+        /*
+         * do the test by disabling interrupts first this time
+         *
+         * NOTE(review): the sequence below is identical to the first
+         * pass (preempt_disable() still comes before local_irq_disable()),
+         * so the "interrupts first" ordering is not actually exercised.
+         * Confirm the intended order before changing it.
+         */
+        tracing_max_latency = 0;
+        tr->ctrl = 1;
+        trace->ctrl_update(tr);
+        preempt_disable();
+        local_irq_disable();
+        udelay(100);
+        preempt_enable();
+        /* reverse the order of preempt vs irqs */
+        local_irq_enable();
+        /* stop the tracing. */
+        tr->ctrl = 0;
+        trace->ctrl_update(tr);
+        /* check both trace buffers */
+        ret = trace_test_buffer(tr, NULL);
+        if (ret)
+                goto out;
+        ret = trace_test_buffer(&max_tr, &count);
+        if (!ret && !count) {
+                printk(KERN_CONT ".. no entries found ..");
+                ret = -1;
+                goto out;
+        }
+ out:
+        trace->reset(tr);
+        tracing_max_latency = save_max;
+        return ret;
+}
+#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */
+#ifdef CONFIG_SCHED_TRACER
+/*
+ * Body of the helper kthread used by the wakeup selftest.
+ *
+ * @data: pointer to a struct completion that is completed once the
+ *        SCHED_FIFO scheduling request has been made
+ *
+ * The thread then sleeps until it is woken up, and afterwards idles in
+ * 100ms sleeps until kthread_should_stop() is true. Always returns 0.
+ */
+static int trace_wakeup_test_thread(void *data)
+{
+        /* Make this a RT thread, doesn't need to be too high */
+        struct sched_param param = { .sched_priority = 5 };
+        struct completion *x = data;
+        /* NOTE(review): the return value of sched_setscheduler() is ignored */
+        sched_setscheduler(current, SCHED_FIFO, &param);
+        /* Make it know we have a new prio */
+        complete(x);
+        /* now go to sleep and let the test wake us up */
+        set_current_state(TASK_INTERRUPTIBLE);
+        schedule();
+        /* we are awake, now wait to disappear */
+        while (!kthread_should_stop()) {
+                /*
+                 * This is an RT task, do short sleeps to let
+                 * others run.
+                 */
+                msleep(100);
+        }
+        return 0;
+}
+/*
+ * Selftest for the wakeup tracer: create a SCHED_FIFO helper thread,
+ * let it go to sleep, wake it up with tracing enabled and require that
+ * the live and max-latency buffers are sane and that the max-latency
+ * buffer is not empty.
+ *
+ * Returns 0 on success, -1 if the helper thread cannot be created or no
+ * entries were recorded, or the error from trace_test_buffer().
+ */
+int
+trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
+{
+        unsigned long save_max = tracing_max_latency;
+        struct task_struct *p;
+        struct completion isrt;
+        unsigned long count;
+        int ret;
+        init_completion(&isrt);
+        /* create a high prio thread */
+        p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test");
+        if (IS_ERR(p)) {
+                printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
+                return -1;
+        }
+        /* make sure the thread is running at an RT prio */
+        wait_for_completion(&isrt);
+        /* start the tracing */
+        tr->ctrl = 1;
+        trace->init(tr);
+        /* reset the max latency */
+        tracing_max_latency = 0;
+        /* sleep to let the RT thread sleep too */
+        msleep(100);
+        /*
+         * Yes this is slightly racy. It is possible that for some
+         * strange reason that the RT thread we created, did not
+         * call schedule for 100ms after doing the completion,
+         * and we do a wakeup on a task that already is awake.
+         * But that is extremely unlikely, and the worst thing that
+         * happens in such a case, is that we disable tracing.
+         * Honestly, if this race does happen something is horribly
+         * wrong with the system.
+         */
+        wake_up_process(p);
+        /* stop the tracing. */
+        tr->ctrl = 0;
+        trace->ctrl_update(tr);
+        /* check both trace buffers */
+        ret = trace_test_buffer(tr, NULL);
+        if (!ret)
+                ret = trace_test_buffer(&max_tr, &count);
+        trace->reset(tr);
+        tracing_max_latency = save_max;
+        /* kill the thread */
+        kthread_stop(p);
+        /* count is only read when both buffer checks succeeded */
+        if (!ret && !count) {
+                printk(KERN_CONT ".. no entries found ..");
+                ret = -1;
+        }
+        return ret;
+}
+#endif /* CONFIG_SCHED_TRACER */
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+/*
+ * Selftest for the context switch tracer: trace for 1/10 of a second
+ * and require a sane, non-empty trace buffer.
+ *
+ * Returns 0 on success, -1 if nothing was recorded, or the error from
+ * trace_test_buffer().
+ */
+int
+trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr)
+{
+        unsigned long nr_entries;
+        int err;
+        /* turn the tracer on */
+        tr->ctrl = 1;
+        trace->init(tr);
+        /* let 1/10 of a second of activity be recorded */
+        msleep(100);
+        /* turn the tracer off again */
+        tr->ctrl = 0;
+        trace->ctrl_update(tr);
+        /* validate what was recorded, then tear the tracer down */
+        err = trace_test_buffer(tr, &nr_entries);
+        trace->reset(tr);
+        if (err)
+                return err;
+        if (nr_entries)
+                return 0;
+        printk(KERN_CONT ".. no entries found ..");
+        return -1;
+}
+#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
+#ifdef CONFIG_SYSPROF_TRACER
+/*
+ * Selftest for the sysprof tracer: trace for 1/10 of a second and
+ * require a sane trace buffer. The number of recorded entries is
+ * collected but not checked.
+ *
+ * Returns the result of trace_test_buffer().
+ */
+int
+trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
+{
+        unsigned long nr_entries;
+        int err;
+        /* turn the tracer on */
+        tr->ctrl = 1;
+        trace->init(tr);
+        /* let 1/10 of a second of samples be recorded */
+        msleep(100);
+        /* turn the tracer off again */
+        tr->ctrl = 0;
+        trace->ctrl_update(tr);
+        /* validate what was recorded, then tear the tracer down */
+        err = trace_test_buffer(tr, &nr_entries);
+        trace->reset(tr);
+        return err;
+}
+#endif /* CONFIG_SYSPROF_TRACER */
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
new file mode 100644
index 000000000000..54dd77cce5bf
--- /dev/null
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -0,0 +1,7 @@
+#include "trace.h"
+/*
+ * Empty function whose only purpose is to be traced: the dynamic ftrace
+ * selftest sets its filter on this function and calls it. Always
+ * returns 0.
+ */
+int DYN_FTRACE_TEST_NAME(void)
+{
+        /* used to call mcount */
+        return 0;
+}
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
new file mode 100644
index 000000000000..ce2d723c10e1
--- /dev/null
+++ b/kernel/trace/trace_sysprof.c
@@ -0,0 +1,365 @@
+/*
+ * trace stack traces
+ *
+ * Copyright (C) 2004-2008, Soeren Sandmann
+ * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ */
+#include <linux/kallsyms.h>
+#include <linux/debugfs.h>
+#include <linux/hrtimer.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/irq.h>
+#include <linux/fs.h>
+#include <asm/stacktrace.h>
+#include "trace.h"
+static struct trace_array       *sysprof_trace;
+static int __read_mostly        tracer_enabled;
+/*
+ * 1 msec sample interval by default:
+ */
+static unsigned long sample_period = 1000000;
+static const unsigned int sample_max_depth = 512;
+static DEFINE_MUTEX(sample_timer_lock);
+/*
+ * Per CPU hrtimers that do the profiling:
+ */
+static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer);
+/*
+ * One user-space stack frame as read by copy_stack_frame(): the link to
+ * the next frame followed by the return address.
+ */
+struct stack_frame {
+        const void __user       *next_fp;
+        unsigned long           return_address;
+};
+/*
+ * Copy one stack frame from the user address @fp into @frame without
+ * taking page faults.
+ *
+ * Returns 1 if the whole frame was copied, 0 if the address range is not
+ * accessible or the copy did not complete.
+ */
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+        int copied;
+        if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+                return 0;
+        /* the copy must not fault pages in */
+        pagefault_disable();
+        copied = !__copy_from_user_inatomic(frame, fp, sizeof(*frame));
+        pagefault_enable();
+        return copied;
+}
+/*
+ * State handed to the backtrace_ops callbacks while a kernel stack is
+ * walked: where to record entries and how many were recorded so far.
+ */
+struct backtrace_info {
+        struct trace_array_cpu  *data;
+        struct trace_array      *tr;
+        int                     pos;
+};
+/* stacktrace_ops ->warning_symbol callback: intentionally does nothing. */
+static void
+backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+        /* Ignore warnings */
+}
+/* stacktrace_ops ->warning callback: intentionally does nothing. */
+static void backtrace_warning(void *data, char *msg)
+{
+        /* Ignore warnings */
+}
+/* stacktrace_ops ->stack callback: always returns -1. */
+static int backtrace_stack(void *data, char *name)
+{
+        /* Don't bother with IRQ stacks for now */
+        return -1;
+}
+/*
+ * stacktrace_ops ->address callback: record @addr as a special trace
+ * entry, but only for reliable addresses and only while fewer than
+ * sample_max_depth entries have been recorded for this sample.
+ */
+static void backtrace_address(void *data, unsigned long addr, int reliable)
+{
+        struct backtrace_info *info = data;
+        if (!reliable || info->pos >= sample_max_depth)
+                return;
+        __trace_special(info->tr, info->data, 1, addr, 0);
+        info->pos++;
+}
+/*
+ * Callbacks passed to dump_trace() by trace_kernel(). Warnings are
+ * ignored and only addresses are recorded.
+ */
+static const struct stacktrace_ops backtrace_ops = {
+        .warning                = backtrace_warning,
+        .warning_symbol         = backtrace_warning_symbol,
+        .stack                  = backtrace_stack,
+        .address                = backtrace_address,
+};
+/*
+ * Record the kernel side of one sample: first the interrupted
+ * instruction pointer, then whatever backtrace_address() accepts while
+ * dump_trace() walks the stack.
+ *
+ * Returns the number of entries recorded (the ip entry counts as one).
+ */
+static int
+trace_kernel(struct pt_regs *regs, struct trace_array *tr,
+             struct trace_array_cpu *data)
+{
+        struct backtrace_info info;
+        unsigned long bp;
+        char *stack;
+        info.tr = tr;
+        info.data = data;
+        /* pos starts at 1 to account for the ip entry recorded below */
+        info.pos = 1;
+        __trace_special(info.tr, info.data, 1, regs->ip, 0);
+        /* start the walk at the address just past the pt_regs structure */
+        stack = ((char *)regs + sizeof(struct pt_regs));
+#ifdef CONFIG_FRAME_POINTER
+        bp = regs->bp;
+#else
+        bp = 0;
+#endif
+        dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info);
+        return info.pos;
+}
+/*
+ * Take one profiling sample on @cpu from the interrupted register
+ * state @regs.
+ *
+ * A sample is a run of special trace entries: a (0, 0, pid) header,
+ * type-1 entries for kernel addresses (only when the kernel was
+ * interrupted), type-2 entries for user addresses (only for tasks with
+ * an mm), a (-1, -1, -1) marker if the depth limit was reached, and a
+ * (3, pid, depth) trailer.
+ *
+ * Nothing is recorded when @regs is NULL, for pid 0, or for a user-mode
+ * sample of a task that is not TASK_RUNNING.
+ */
+static void timer_notify(struct pt_regs *regs, int cpu)
+{
+        struct trace_array_cpu *data;
+        struct stack_frame frame;
+        struct trace_array *tr;
+        const void __user *fp;
+        int is_user;
+        int i;
+        if (!regs)
+                return;
+        tr = sysprof_trace;
+        data = tr->data[cpu];
+        is_user = user_mode(regs);
+        if (!current || current->pid == 0)
+                return;
+        if (is_user && current->state != TASK_RUNNING)
+                return;
+        __trace_special(tr, data, 0, 0, current->pid);
+        /* i counts the entries recorded so far for this sample */
+        if (!is_user)
+                i = trace_kernel(regs, tr, data);
+        else
+                i = 0;
+        /*
+         * Trace user stack if we are not a kernel thread
+         */
+        if (current->mm && i < sample_max_depth) {
+                /*
+                 * Use the pt_regs located just below thread.sp0 --
+                 * presumably the saved user-mode registers of the task
+                 * (TODO confirm for the target architecture).
+                 */
+                regs = (struct pt_regs *)current->thread.sp0 - 1;
+                fp = (void __user *)regs->bp;
+                __trace_special(tr, data, 2, regs->ip, 0);
+                /* follow the frame pointer chain through user memory */
+                while (i < sample_max_depth) {
+                        frame.next_fp = NULL;
+                        frame.return_address = 0;
+                        if (!copy_stack_frame(fp, &frame))
+                                break;
+                        /* stop at a frame pointer below the saved sp */
+                        if ((unsigned long)fp < regs->sp)
+                                break;
+                        __trace_special(tr, data, 2, frame.return_address,
+                                        (unsigned long)fp);
+                        fp = frame.next_fp;
+                        i++;
+                }
+        }
+        /*
+         * Special trace entry if we overflow the max depth:
+         */
+        if (i == sample_max_depth)
+                __trace_special(tr, data, -1, -1, -1);
+        __trace_special(tr, data, 3, current->pid, i);
+}
+/*
+ * hrtimer callback: take one sample on the current CPU, then push the
+ * expiry forward by sample_period nanoseconds and ask to be restarted.
+ */
+static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
+{
+        /* trace here */
+        timer_notify(get_irq_regs(), smp_processor_id());
+        hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
+        return HRTIMER_RESTART;
+}
+/*
+ * Initialize @cpu's sampling hrtimer and start it so that it first fires
+ * sample_period nanoseconds from now.
+ */
+static void start_stack_timer(int cpu)
+{
+        struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
+        hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+        hrtimer->function = stack_trace_timer_fn;
+        hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+        hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
+}
+/*
+ * Start the sampling timer of every online CPU. The calling task's CPU
+ * affinity is narrowed to each CPU in turn before that CPU's timer is
+ * started, and the original affinity mask is put back afterwards.
+ *
+ * NOTE(review): the return value of set_cpus_allowed_ptr() is ignored.
+ */
+static void start_stack_timers(void)
+{
+        cpumask_t saved_mask = current->cpus_allowed;
+        int cpu;
+        for_each_online_cpu(cpu) {
+                cpumask_of_cpu_ptr(new_mask, cpu);
+                set_cpus_allowed_ptr(current, new_mask);
+                start_stack_timer(cpu);
+        }
+        set_cpus_allowed_ptr(current, &saved_mask);
+}
+/* Cancel the sampling hrtimer that belongs to @cpu. */
+static void stop_stack_timer(int cpu)
+{
+        hrtimer_cancel(&per_cpu(stack_trace_hrtimer, cpu));
+}
+/* Cancel the sampling timer of every CPU that is currently online. */
+static void stop_stack_timers(void)
+{
+        int cpu;
+        for_each_online_cpu(cpu)
+                stop_stack_timer(cpu);
+}
+/*
+ * Begin a fresh trace in @tr: record a new start time and reset the
+ * buffer of every online CPU.
+ */
+static void stack_reset(struct trace_array *tr)
+{
+        int cpu;
+        tr->time_start = ftrace_now(tr->cpu);
+        for_each_online_cpu(cpu)
+                tracing_reset(tr->data[cpu]);
+}
+/*
+ * Reset the buffers of @tr, start the per-CPU sampling timers and mark
+ * the tracer enabled, all under sample_timer_lock.
+ */
+static void start_stack_trace(struct trace_array *tr)
+{
+        mutex_lock(&sample_timer_lock);
+        stack_reset(tr);
+        start_stack_timers();
+        tracer_enabled = 1;
+        mutex_unlock(&sample_timer_lock);
+}
+/*
+ * Stop the per-CPU sampling timers and mark the tracer disabled, under
+ * sample_timer_lock. @tr is not used.
+ */
+static void stop_stack_trace(struct trace_array *tr)
+{
+        mutex_lock(&sample_timer_lock);
+        stop_stack_timers();
+        tracer_enabled = 0;
+        mutex_unlock(&sample_timer_lock);
+}
+/*
+ * Tracer ->init hook: remember which trace array sysprof samples into,
+ * and start sampling right away if tracing is already enabled.
+ */
+static void stack_trace_init(struct trace_array *tr)
+{
+        sysprof_trace = tr;
+        if (!tr->ctrl)
+                return;
+        start_stack_trace(tr);
+}
+/* Tracer ->reset hook: stop sampling if tracing is currently enabled. */
+static void stack_trace_reset(struct trace_array *tr)
+{
+        if (!tr->ctrl)
+                return;
+        stop_stack_trace(tr);
+}
+/* Tracer ->ctrl_update hook: start or stop sampling according to tr->ctrl. */
+static void stack_trace_ctrl_update(struct trace_array *tr)
+{
+        if (!tr->ctrl) {
+                stop_stack_trace(tr);
+                return;
+        }
+        /* When starting a new trace, reset the buffers */
+        start_stack_trace(tr);
+}
+/*
+ * The "sysprof" tracer: the callbacks that init_stack_trace() hands to
+ * register_tracer(). The selftest hook is only wired up when
+ * CONFIG_FTRACE_SELFTEST is set.
+ */
+static struct tracer stack_trace __read_mostly =
+{
+        .name           = "sysprof",
+        .init           = stack_trace_init,
+        .reset          = stack_trace_reset,
+        .ctrl_update    = stack_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+        .selftest    = trace_selftest_startup_sysprof,
+#endif
+};
+/*
+ * Register the sysprof tracer at device_initcall time.
+ * Returns whatever register_tracer() returns.
+ */
+__init static int init_stack_trace(void)
+{
+        return register_tracer(&stack_trace);
+}
+device_initcall(init_stack_trace);
+#define MAX_LONG_DIGITS 22
+/*
+ * debugfs read handler for the sample period file: reports the current
+ * sample period, converted with nsecs_to_usecs(), as a decimal number
+ * followed by a newline.
+ */
+static ssize_t
+sysprof_sample_read(struct file *filp, char __user *ubuf,
+                    size_t cnt, loff_t *ppos)
+{
+        char buf[MAX_LONG_DIGITS];
+        int len = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period));
+        return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
+}
+/*
+ * debugfs write handler for the sample period file.
+ *
+ * Parses a decimal number of microseconds from at most
+ * MAX_LONG_DIGITS-1 bytes of user input, clamps it to the range
+ * [100, ULONG_MAX / 1000] and stores it, in nanoseconds, in
+ * sample_period.
+ *
+ * The per-CPU timers are only stopped and restarted when the tracer is
+ * currently running. When it is not, the timers have not been started
+ * (and may never have been initialized), so only the period is updated;
+ * it takes effect the next time sampling is started.
+ *
+ * Returns the number of bytes consumed, or -EFAULT if the user buffer
+ * cannot be read.
+ */
+static ssize_t
+sysprof_sample_write(struct file *filp, const char __user *ubuf,
+                     size_t cnt, loff_t *ppos)
+{
+        char buf[MAX_LONG_DIGITS];
+        unsigned long val;
+        if (cnt > MAX_LONG_DIGITS-1)
+                cnt = MAX_LONG_DIGITS-1;
+        if (copy_from_user(buf, ubuf, cnt))
+                return -EFAULT;
+        buf[cnt] = 0;
+        val = simple_strtoul(buf, NULL, 10);
+        /*
+         * Enforce a minimum sample period of 100 usecs:
+         */
+        if (val < 100)
+                val = 100;
+        /* keep the conversion to nanoseconds below from wrapping */
+        if (val > ULONG_MAX / 1000)
+                val = ULONG_MAX / 1000;
+        mutex_lock(&sample_timer_lock);
+        if (tracer_enabled) {
+                stop_stack_timers();
+                sample_period = val * 1000;
+                start_stack_timers();
+        } else {
+                sample_period = val * 1000;
+        }
+        mutex_unlock(&sample_timer_lock);
+        return cnt;
+}
+/* File operations of the "sysprof_sample_period" debugfs file. */
+static struct file_operations sysprof_sample_fops = {
+        .read           = sysprof_sample_read,
+        .write          = sysprof_sample_write,
+};
+/*
+ * Create the "sysprof_sample_period" control file (mode 0644) under the
+ * tracing debugfs directory @d_tracer. A failure is only reported with
+ * a warning.
+ */
+void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
+{
+        struct dentry *entry;
+        entry = debugfs_create_file("sysprof_sample_period", 0644,
+                        d_tracer, NULL, &sysprof_sample_fops);
+        if (entry)
+                return;
+        /* name the file that actually failed to be created */
+        pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
+}
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 4ab1b584961b..f9cd2561689c 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -28,14 +28,14 @@
 void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 {
        struct timespec uptime, ts;
-        s64 ac_etime;
+        u64 ac_etime;
        BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
        /* calculate task elapsed time in timespec */
        do_posix_clock_monotonic_gettime(&uptime);
        ts = timespec_sub(uptime, tsk->start_time);
-        /* rebase elapsed time to usec */
+        /* rebase elapsed time to usec (should never be negative) */
        ac_etime = timespec_to_ns(&ts);
        do_div(ac_etime, NSEC_PER_USEC);
        stats->ac_etime = ac_etime;
@@ -84,9 +84,9 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 {
        struct mm_struct *mm;
-        /* convert pages-jiffies to Mbyte-usec */
+        /* convert pages-usec to Mbyte-usec */
-        stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB;
+        stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
-        stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB;
+        stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
        mm = get_task_mm(p);
        if (mm) {
                /* adjust to KB unit */
@@ -94,14 +94,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
                stats->hiwater_vm    = mm->hiwater_vm * PAGE_SIZE / KB;
                mmput(mm);
        }
-        stats->read_char        = p->rchar;
+        stats->read_char        = p->ioac.chr.rchar;
-        stats->write_char       = p->wchar;
+        stats->write_char       = p->ioac.chr.wchar;
-        stats->read_syscalls    = p->syscr;
+        stats->read_syscalls    = p->ioac.chr.syscr;
-        stats->write_syscalls   = p->syscw;
+        stats->write_syscalls   = p->ioac.chr.syscw;
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-        stats->read_bytes       = p->ioac.read_bytes;
+        stats->read_bytes       = p->ioac.blk.read_bytes;
-        stats->write_bytes      = p->ioac.write_bytes;
+        stats->write_bytes      = p->ioac.blk.write_bytes;
-        stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes;
+        stats->cancelled_write_bytes = p->ioac.blk.cancelled_write_bytes;
 #else
        stats->read_bytes       = 0;
        stats->write_bytes      = 0;
@@ -118,12 +118,19 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 void acct_update_integrals(struct task_struct *tsk)
 {
        if (likely(tsk->mm)) {
-                long delta = cputime_to_jiffies(
+                cputime_t time, dtime;
-                        cputime_sub(tsk->stime, tsk->acct_stimexpd));
+                struct timeval value;
+                u64 delta;
+                time = tsk->stime + tsk->utime;
+                dtime = cputime_sub(time, tsk->acct_timexpd);
+                jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
+                delta = value.tv_sec;
+                delta = delta * USEC_PER_SEC + value.tv_usec;
                if (delta == 0)
                        return;
-                tsk->acct_stimexpd = tsk->stime;
+                tsk->acct_timexpd = time;
                tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
                tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
        }
@@ -135,7 +142,7 @@ void acct_update_integrals(struct task_struct *tsk)
 */
 void acct_clear_integrals(struct task_struct *tsk)
 {
-        tsk->acct_stimexpd = 0;
+        tsk->acct_timexpd = 0;
        tsk->acct_rss_mem1 = 0;
        tsk->acct_vm_mem1 = 0;
 }
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ce7799540c91..ec7e4f62aaff 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -125,7 +125,7 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
 }
 static void insert_work(struct cpu_workqueue_struct *cwq,
-                                struct work_struct *work, int tail)
+                        struct work_struct *work, struct list_head *head)
 {
        set_wq_data(work, cwq);
        /*
@@ -133,21 +133,17 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
         * result of list_add() below, see try_to_grab_pending().
         */
        smp_wmb();
-        if (tail)
+        list_add_tail(&work->entry, head);
-                list_add_tail(&work->entry, &cwq->worklist);
-        else
-                list_add(&work->entry, &cwq->worklist);
        wake_up(&cwq->more_work);
 }
-/* Preempt must be disabled. */
 static void __queue_work(struct cpu_workqueue_struct *cwq,
                         struct work_struct *work)
 {
        unsigned long flags;
        spin_lock_irqsave(&cwq->lock, flags);
-        insert_work(cwq, work, 1);
+        insert_work(cwq, work, &cwq->worklist);
        spin_unlock_irqrestore(&cwq->lock, flags);
 }
@@ -163,17 +159,39 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
 */
 int queue_work(struct workqueue_struct *wq, struct work_struct *work)
 {
+        int ret;
+        ret = queue_work_on(get_cpu(), wq, work);
+        put_cpu();
+        return ret;
+}
+EXPORT_SYMBOL_GPL(queue_work);
+/**
+ * queue_work_on - queue work on specific cpu
+ * @cpu: CPU number to execute work on
+ * @wq: workqueue to use
+ * @work: work to queue
+ *
+ * Returns 0 if @work was already on a queue, non-zero otherwise.
+ *
+ * We queue the work to a specific CPU, the caller must ensure it
+ * can't go away.
+ */
+int
+queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
+{
        int ret = 0;
        if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
                BUG_ON(!list_empty(&work->entry));
-                __queue_work(wq_per_cpu(wq, get_cpu()), work);
+                __queue_work(wq_per_cpu(wq, cpu), work);
-                put_cpu();
                ret = 1;
        }
        return ret;
 }
-EXPORT_SYMBOL_GPL(queue_work);
+EXPORT_SYMBOL_GPL(queue_work_on);
 static void delayed_work_timer_fn(unsigned long __data)
 {
@@ -337,14 +355,14 @@ static void wq_barrier_func(struct work_struct *work)
 }
 static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
-                                        struct wq_barrier *barr, int tail)
+                        struct wq_barrier *barr, struct list_head *head)
 {
        INIT_WORK(&barr->work, wq_barrier_func);
        __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
        init_completion(&barr->done);
-        insert_work(cwq, &barr->work, tail);
+        insert_work(cwq, &barr->work, head);
 }
 static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
@@ -364,7 +382,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
                active = 0;
                spin_lock_irq(&cwq->lock);
                if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
-                        insert_wq_barrier(cwq, &barr, 1);
+                        insert_wq_barrier(cwq, &barr, &cwq->worklist);
                        active = 1;
                }
                spin_unlock_irq(&cwq->lock);
@@ -397,11 +415,62 @@ void flush_workqueue(struct workqueue_struct *wq)
        might_sleep();
        lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
        lock_release(&wq->lockdep_map, 1, _THIS_IP_);
-        for_each_cpu_mask(cpu, *cpu_map)
+        for_each_cpu_mask_nr(cpu, *cpu_map)
                flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
 }
 EXPORT_SYMBOL_GPL(flush_workqueue);
+/**
+ * flush_work - block until a work_struct's callback has terminated
+ * @work: the work which is to be flushed
+ *
+ * Returns false if @work has already terminated.
+ *
+ * It is expected that, prior to calling flush_work(), the caller has
+ * arranged for the work to not be requeued, otherwise it doesn't make
+ * sense to use this function.
+ */
+int flush_work(struct work_struct *work)
+{
+        struct cpu_workqueue_struct *cwq;
+        struct list_head *prev;
+        struct wq_barrier barr;
+        might_sleep();
+        cwq = get_wq_data(work);
+        if (!cwq)
+                return 0;
+        lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+        lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
+        prev = NULL;
+        spin_lock_irq(&cwq->lock);
+        if (!list_empty(&work->entry)) {
+                /*
+                 * See the comment near try_to_grab_pending()->smp_rmb().
+                 * If it was re-queued under us we are not going to wait.
+                 */
+                smp_rmb();
+                if (unlikely(cwq != get_wq_data(work)))
+                        goto out;
+                prev = &work->entry;
+        } else {
+                if (cwq->current_work != work)
+                        goto out;
+                prev = &cwq->worklist;
+        }
+        insert_wq_barrier(cwq, &barr, prev->next);
+out:
+        spin_unlock_irq(&cwq->lock);
+        if (!prev)
+                return 0;
+        wait_for_completion(&barr.done);
+        return 1;
+}
+EXPORT_SYMBOL_GPL(flush_work);
 /*
 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
 * so this work can't be re-armed in any way.
@@ -449,7 +518,7 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
        spin_lock_irq(&cwq->lock);
        if (unlikely(cwq->current_work == work)) {
-                insert_wq_barrier(cwq, &barr, 0);
+                insert_wq_barrier(cwq, &barr, cwq->worklist.next);
                running = 1;
        }
        spin_unlock_irq(&cwq->lock);
@@ -477,7 +546,7 @@ static void wait_on_work(struct work_struct *work)
        wq = cwq->wq;
        cpu_map = wq_cpu_map(wq);
-        for_each_cpu_mask(cpu, *cpu_map)
+        for_each_cpu_mask_nr(cpu, *cpu_map)
                wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
 }
@@ -553,6 +622,19 @@ int schedule_work(struct work_struct *work)
 }
 EXPORT_SYMBOL(schedule_work);
+/**
+ * schedule_work_on - put work task on a specific cpu
+ * @cpu: cpu to put the work task on
+ * @work: job to be done
+ *
+ * This puts a job on a specific cpu.
+ */
+int schedule_work_on(int cpu, struct work_struct *work)
+{
+        return queue_work_on(cpu, keventd_wq, work);
+}
+EXPORT_SYMBOL(schedule_work_on);
 /**
 * schedule_delayed_work - put work task in global workqueue after delay
 * @dwork: job to be done
@@ -607,10 +689,10 @@ int schedule_on_each_cpu(work_func_t func)
                struct work_struct *work = per_cpu_ptr(works, cpu);
                INIT_WORK(work, func);
-                set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
+                schedule_work_on(cpu, work);
-                __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
        }
-        flush_workqueue(keventd_wq);
+        for_each_online_cpu(cpu)
+                flush_work(per_cpu_ptr(works, cpu));
        put_online_cpus();
        free_percpu(works);
        return 0;
@@ -747,7 +829,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
                err = create_workqueue_thread(cwq, singlethread_cpu);
                start_workqueue_thread(cwq, -1);
        } else {
-                get_online_cpus();
+                cpu_maps_update_begin();
                spin_lock(&workqueue_lock);
                list_add(&wq->list, &workqueues);
                spin_unlock(&workqueue_lock);
@@ -759,7 +841,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
                        err = create_workqueue_thread(cwq, cpu);
                        start_workqueue_thread(cwq, cpu);
                }
-                put_online_cpus();
+                cpu_maps_update_done();
        }
        if (err) {
@@ -773,8 +855,8 @@ EXPORT_SYMBOL_GPL(__create_workqueue_key);
 static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
 {
        /*
-         * Our caller is either destroy_workqueue() or CPU_DEAD,
+         * Our caller is either destroy_workqueue() or CPU_POST_DEAD,
-         * get_online_cpus() protects cwq->thread.
+         * cpu_add_remove_lock protects cwq->thread.
         */
        if (cwq->thread == NULL)
                return;
@@ -784,7 +866,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
        flush_cpu_workqueue(cwq);
        /*
-         * If the caller is CPU_DEAD and cwq->worklist was not empty,
+         * If the caller is CPU_POST_DEAD and cwq->worklist was not empty,
         * a concurrent flush_workqueue() can insert a barrier after us.
         * However, in that case run_workqueue() won't return and check
         * kthread_should_stop() until it flushes all work_struct's.
@@ -808,14 +890,14 @@ void destroy_workqueue(struct workqueue_struct *wq)
        const cpumask_t *cpu_map = wq_cpu_map(wq);
        int cpu;
-        get_online_cpus();
+        cpu_maps_update_begin();
        spin_lock(&workqueue_lock);
        list_del(&wq->list);
        spin_unlock(&workqueue_lock);
-        for_each_cpu_mask(cpu, *cpu_map)
+        for_each_cpu_mask_nr(cpu, *cpu_map)
                cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
-        put_online_cpus();
+        cpu_maps_update_done();
        free_percpu(wq->cpu_wq);
        kfree(wq);
@@ -829,6 +911,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
        unsigned int cpu = (unsigned long)hcpu;
        struct cpu_workqueue_struct *cwq;
        struct workqueue_struct *wq;
+        int ret = NOTIFY_OK;
        action &= ~CPU_TASKS_FROZEN;
@@ -836,7 +919,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
        case CPU_UP_PREPARE:
                cpu_set(cpu, cpu_populated_map);
        }
+undo:
        list_for_each_entry(wq, &workqueues, list) {
                cwq = per_cpu_ptr(wq->cpu_wq, cpu);
@@ -846,7 +929,9 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
                                break;
                        printk(KERN_ERR "workqueue [%s] for %i failed\n",
                                wq->name, cpu);
-                        return NOTIFY_BAD;
+                        action = CPU_UP_CANCELED;
+                        ret = NOTIFY_BAD;
+                        goto undo;
                case CPU_ONLINE:
                        start_workqueue_thread(cwq, cpu);
@@ -854,7 +939,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
                case CPU_UP_CANCELED:
                        start_workqueue_thread(cwq, -1);
-                case CPU_DEAD:
+                case CPU_POST_DEAD:
                        cleanup_workqueue_thread(cwq);
                        break;
                }
@@ -862,11 +947,11 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
        switch (action) {
        case CPU_UP_CANCELED:
-        case CPU_DEAD:
+        case CPU_POST_DEAD:
                cpu_clear(cpu, cpu_populated_map);
        }
-        return NOTIFY_OK;
+        return ret;
 }
 void __init init_workqueues(void)