32 files changed, 877 insertions, 440 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 9a3ec66a9d84..6a212b842d86 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,8 +11,6 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
            hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
            notifier.o ksysfs.o pm_qos_params.o sched_clock.o
-CFLAGS_REMOVE_sched.o = -mno-spe
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
 CFLAGS_REMOVE_lockdep.o = -pg
@@ -21,7 +19,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
 CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
-CFLAGS_REMOVE_sched.o = -mno-spe -pg
 endif
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -92,7 +89,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
-ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
+ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
 # needed for x86 only.  Why this used to be enabled for all architectures is beyond
 # me.  I suspect most platforms don't need this, but until we know that for sure
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 8ba0e0d934f2..8b509441f49a 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -24,6 +24,7 @@ struct audit_chunk {
        struct list_head trees;         /* with root here */
        int dead;
        int count;
+        atomic_long_t refs;
        struct rcu_head head;
        struct node {
                struct list_head list;
@@ -56,7 +57,8 @@ static LIST_HEAD(prune_list);
 * tree is refcounted; one reference for "some rules on rules_list refer to
 * it", one for each chunk with pointer to it.
 *
- * chunk is refcounted by embedded inotify_watch.
+ * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount
+ * of watch contributes 1 to .refs).
 *
 * node.index allows to get from node.list to containing chunk.
 * MSB of that sucker is stolen to mark taggings that we might have to
@@ -121,6 +123,7 @@ static struct audit_chunk *alloc_chunk(int count)
        INIT_LIST_HEAD(&chunk->hash);
        INIT_LIST_HEAD(&chunk->trees);
        chunk->count = count;
+        atomic_long_set(&chunk->refs, 1);
        for (i = 0; i < count; i++) {
                INIT_LIST_HEAD(&chunk->owners[i].list);
                chunk->owners[i].index = i;
@@ -129,9 +132,8 @@ static struct audit_chunk *alloc_chunk(int count)
        return chunk;
 }
-static void __free_chunk(struct rcu_head *rcu)
+static void free_chunk(struct audit_chunk *chunk)
 {
-        struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
        int i;
        for (i = 0; i < chunk->count; i++) {
@@ -141,14 +143,16 @@ static void __free_chunk(struct rcu_head *rcu)
        kfree(chunk);
 }
-static inline void free_chunk(struct audit_chunk *chunk)
+void audit_put_chunk(struct audit_chunk *chunk)
 {
-        call_rcu(&chunk->head, __free_chunk);
+        if (atomic_long_dec_and_test(&chunk->refs))
+                free_chunk(chunk);
 }
-void audit_put_chunk(struct audit_chunk *chunk)
+static void __put_chunk(struct rcu_head *rcu)
 {
-        put_inotify_watch(&chunk->watch);
+        struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
+        audit_put_chunk(chunk);
 }
 enum {HASH_SIZE = 128};
@@ -176,7 +180,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
        list_for_each_entry_rcu(p, list, hash) {
                if (p->watch.inode == inode) {
-                        get_inotify_watch(&p->watch);
+                        atomic_long_inc(&p->refs);
                        return p;
                }
        }
@@ -194,17 +198,49 @@ int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
 /* tagging and untagging inodes with trees */
-static void untag_chunk(struct audit_chunk *chunk, struct node *p)
+static struct audit_chunk *find_chunk(struct node *p)
+{
+        int index = p->index & ~(1U<<31);
+        p -= index;
+        return container_of(p, struct audit_chunk, owners[0]);
+}
+static void untag_chunk(struct node *p)
 {
+        struct audit_chunk *chunk = find_chunk(p);
        struct audit_chunk *new;
        struct audit_tree *owner;
        int size = chunk->count - 1;
        int i, j;
+        if (!pin_inotify_watch(&chunk->watch)) {
+                /*
+                 * Filesystem is shutting down; all watches are getting
+                 * evicted, just take it off the node list for this
+                 * tree and let the eviction logic take care of the
+                 * rest.
+                 */
+                owner = p->owner;
+                if (owner->root == chunk) {
+                        list_del_init(&owner->same_root);
+                        owner->root = NULL;
+                }
+                list_del_init(&p->list);
+                p->owner = NULL;
+                put_tree(owner);
+                return;
+        }
+        spin_unlock(&hash_lock);
+        /*
+         * pin_inotify_watch() succeeded, so the watch won't go away
+         * from under us.
+         */
        mutex_lock(&chunk->watch.inode->inotify_mutex);
        if (chunk->dead) {
                mutex_unlock(&chunk->watch.inode->inotify_mutex);
-                return;
+                goto out;
        }
        owner = p->owner;
@@ -221,7 +257,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p)
                inotify_evict_watch(&chunk->watch);
                mutex_unlock(&chunk->watch.inode->inotify_mutex);
                put_inotify_watch(&chunk->watch);
-                return;
+                goto out;
        }
        new = alloc_chunk(size);
@@ -263,7 +299,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p)
        inotify_evict_watch(&chunk->watch);
        mutex_unlock(&chunk->watch.inode->inotify_mutex);
        put_inotify_watch(&chunk->watch);
-        return;
+        goto out;
 Fallback:
        // do the best we can
@@ -277,6 +313,9 @@ Fallback:
        put_tree(owner);
        spin_unlock(&hash_lock);
        mutex_unlock(&chunk->watch.inode->inotify_mutex);
+out:
+        unpin_inotify_watch(&chunk->watch);
+        spin_lock(&hash_lock);
 }
 static int create_chunk(struct inode *inode, struct audit_tree *tree)
@@ -387,13 +426,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
        return 0;
 }
-static struct audit_chunk *find_chunk(struct node *p)
-{
-        int index = p->index & ~(1U<<31);
-        p -= index;
-        return container_of(p, struct audit_chunk, owners[0]);
-}
 static void kill_rules(struct audit_tree *tree)
 {
        struct audit_krule *rule, *next;
@@ -431,17 +463,10 @@ static void prune_one(struct audit_tree *victim)
        spin_lock(&hash_lock);
        while (!list_empty(&victim->chunks)) {
                struct node *p;
-                struct audit_chunk *chunk;
                p = list_entry(victim->chunks.next, struct node, list);
-                chunk = find_chunk(p);
-                get_inotify_watch(&chunk->watch);
-                spin_unlock(&hash_lock);
-                untag_chunk(chunk, p);
-                put_inotify_watch(&chunk->watch);
+                untag_chunk(p);
-                spin_lock(&hash_lock);
        }
        spin_unlock(&hash_lock);
        put_tree(victim);
@@ -469,7 +494,6 @@ static void trim_marked(struct audit_tree *tree)
        while (!list_empty(&tree->chunks)) {
                struct node *node;
-                struct audit_chunk *chunk;
                node = list_entry(tree->chunks.next, struct node, list);
@@ -477,14 +501,7 @@ static void trim_marked(struct audit_tree *tree)
                if (!(node->index & (1U<<31)))
                        break;
-                chunk = find_chunk(node);
+                untag_chunk(node);
-                get_inotify_watch(&chunk->watch);
-                spin_unlock(&hash_lock);
-                untag_chunk(chunk, node);
-                put_inotify_watch(&chunk->watch);
-                spin_lock(&hash_lock);
        }
        if (!tree->root && !tree->goner) {
                tree->goner = 1;
@@ -878,7 +895,7 @@ static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
 static void destroy_watch(struct inotify_watch *watch)
 {
        struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
-        free_chunk(chunk);
+        call_rcu(&chunk->head, __put_chunk);
 }
 static const struct inotify_operations rtree_inotify_ops = {
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index b7d354e2b0ef..9fd85a4640a0 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1094,8 +1094,8 @@ static void audit_inotify_unregister(struct list_head *in_list)
        list_for_each_entry_safe(p, n, in_list, ilist) {
                list_del(&p->ilist);
                inotify_rm_watch(audit_ih, &p->wdata);
-                /* the put matching the get in audit_do_del_rule() */
+                /* the unpin matching the pin in audit_del_rule() */
-                put_inotify_watch(&p->wdata);
+                unpin_inotify_watch(&p->wdata);
        }
 }
@@ -1389,9 +1389,13 @@ static inline int audit_del_rule(struct audit_entry *entry,
                                /* Put parent on the inotify un-registration
                                 * list.  Grab a reference before releasing
                                 * audit_filter_mutex, to be released in
-                                 * audit_inotify_unregister(). */
+                                 * audit_inotify_unregister().
-                                list_add(&parent->ilist, &inotify_list);
+                                 * If filesystem is going away, just leave
-                                get_inotify_watch(&parent->wdata);
+                                 * the sucker alone, eviction will take
+                                 * care of it.
+                                 */
+                                if (pin_inotify_watch(&parent->wdata))
+                                        list_add(&parent->ilist, &inotify_list);
                        }
                }
        }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 35eebd5510c2..fe00b3b983a8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2039,10 +2039,13 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
        struct cgroup *cgrp;
        struct cgroup_iter it;
        struct task_struct *tsk;
        /*
-         * Validate dentry by checking the superblock operations
+         * Validate dentry by checking the superblock operations,
+         * and make sure it's a directory.
         */
-        if (dentry->d_sb->s_op != &cgroup_ops)
+        if (dentry->d_sb->s_op != &cgroup_ops ||
+            !S_ISDIR(dentry->d_inode->i_mode))
                 goto err;
        ret = 0;
@@ -2472,10 +2475,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
                mutex_unlock(&cgroup_mutex);
                return -EBUSY;
        }
+        mutex_unlock(&cgroup_mutex);
-        parent = cgrp->parent;
-        root = cgrp->root;
-        sb = root->sb;
        /*
         * Call pre_destroy handlers of subsys. Notify subsystems
@@ -2483,7 +2483,14 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
         */
        cgroup_call_pre_destroy(cgrp);
-        if (cgroup_has_css_refs(cgrp)) {
+        mutex_lock(&cgroup_mutex);
+        parent = cgrp->parent;
+        root = cgrp->root;
+        sb = root->sb;
+        if (atomic_read(&cgrp->count)
+            || !list_empty(&cgrp->children)
+            || cgroup_has_css_refs(cgrp)) {
                mutex_unlock(&cgroup_mutex);
                return -EBUSY;
        }
@@ -2497,7 +2504,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
        list_del(&cgrp->sibling);
        spin_lock(&cgrp->dentry->d_lock);
        d = dget(cgrp->dentry);
-        cgrp->dentry = NULL;
        spin_unlock(&d->d_lock);
        cgroup_d_remove_dir(d);
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 7fa476f01d05..fb249e2bcada 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -184,9 +184,20 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
 {
        struct freezer *freezer;
-        task_lock(task);
+        /*
+         * No lock is needed, since the task isn't on tasklist yet,
+         * so it can't be moved to another cgroup, which means the
+         * freezer won't be removed and will be valid during this
+         * function call.
+         */
        freezer = task_freezer(task);
-        task_unlock(task);
+        /*
+         * The root cgroup is non-freezable, so we can skip the
+         * following check.
+         */
+        if (!freezer->css.cgroup->parent)
+                return;
        spin_lock_irq(&freezer->lock);
        BUG_ON(freezer->state == CGROUP_FROZEN);
@@ -331,7 +342,7 @@ static int freezer_write(struct cgroup *cgroup,
        else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
                goal_state = CGROUP_FROZEN;
        else
-                return -EIO;
+                return -EINVAL;
        if (!cgroup_lock_live_group(cgroup))
                return -ENODEV;
@@ -350,6 +361,8 @@ static struct cftype files[] = {
 static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
 {
+        if (!cgroup->parent)
+                return 0;
        return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
 }
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 86d49045daed..5a732c5ef08b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -499,3 +499,6 @@ const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
 #endif
 };
 EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
+const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
+EXPORT_SYMBOL(cpu_all_bits);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3e00526f52ec..da7ff6137f37 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -36,6 +36,7 @@
 #include <linux/list.h>
 #include <linux/mempolicy.h>
 #include <linux/mm.h>
+#include <linux/memory.h>
 #include <linux/module.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
@@ -587,7 +588,6 @@ static int generate_sched_domains(cpumask_t **domains,
        int ndoms;              /* number of sched domains in result */
        int nslot;              /* next empty doms[] cpumask_t slot */
-        ndoms = 0;
        doms = NULL;
        dattr = NULL;
        csa = NULL;
@@ -674,10 +674,8 @@ restart:
         * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
         */
        doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
-        if (!doms) {
+        if (!doms)
-                ndoms = 0;
                goto done;
-        }
        /*
         * The rest of the code, including the scheduler, can deal with
@@ -732,6 +730,13 @@ restart:
 done:
        kfree(csa);
+        /*
+         * Fall back to the default domain if kmalloc() failed.
+         * See comments in partition_sched_domains().
+         */
+        if (doms == NULL)
+                ndoms = 1;
        *domains    = doms;
        *attributes = dattr;
        return ndoms;
@@ -2011,12 +2016,23 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
 * See also the previous routine cpuset_track_online_cpus().
 */
-void cpuset_track_online_nodes(void)
+static int cpuset_track_online_nodes(struct notifier_block *self,
+                                unsigned long action, void *arg)
 {
        cgroup_lock();
-        top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+        switch (action) {
-        scan_for_empty_cpusets(&top_cpuset);
+        case MEM_ONLINE:
+                top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+                break;
+        case MEM_OFFLINE:
+                top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+                scan_for_empty_cpusets(&top_cpuset);
+                break;
+        default:
+                break;
+        }
        cgroup_unlock();
+        return NOTIFY_OK;
 }
 #endif
@@ -2032,6 +2048,7 @@ void __init cpuset_init_smp(void)
        top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
        hotcpu_notifier(cpuset_track_online_cpus, 0);
+        hotplug_memory_notifier(cpuset_track_online_nodes, 10);
 }
 /**
diff --git a/kernel/exit.c b/kernel/exit.c
index 80137a5d9467..2d8be7ebb0f7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -40,7 +40,6 @@
 #include <linux/cn_proc.h>
 #include <linux/mutex.h>
 #include <linux/futex.h>
-#include <linux/compat.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/audit.h> /* for audit_free() */
 #include <linux/resource.h>
@@ -141,6 +140,11 @@ static void __exit_signal(struct task_struct *tsk)
        if (sig) {
                flush_sigqueue(&sig->shared_pending);
                taskstats_tgid_free(sig);
+                /*
+                 * Make sure ->signal can't go away under rq->lock,
+                 * see account_group_exec_runtime().
+                 */
+                task_rq_unlock_wait(tsk);
                __cleanup_signal(sig);
        }
 }
@@ -1054,14 +1058,6 @@ NORET_TYPE void do_exit(long code)
                exit_itimers(tsk->signal);
        }
        acct_collect(code, group_dead);
-#ifdef CONFIG_FUTEX
-        if (unlikely(tsk->robust_list))
-                exit_robust_list(tsk);
-#ifdef CONFIG_COMPAT
-        if (unlikely(tsk->compat_robust_list))
-                compat_exit_robust_list(tsk);
-#endif
-#endif
        if (group_dead)
                tty_audit_exit();
        if (unlikely(tsk->audit_context))
diff --git a/kernel/fork.c b/kernel/fork.c
index f6083561dfe0..2a372a0e206f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
 #include <linux/jiffies.h>
 #include <linux/tracehook.h>
 #include <linux/futex.h>
+#include <linux/compat.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/rcupdate.h>
 #include <linux/ptrace.h>
@@ -519,6 +520,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 {
        struct completion *vfork_done = tsk->vfork_done;
+        /* Get rid of any futexes when releasing the mm */
+#ifdef CONFIG_FUTEX
+        if (unlikely(tsk->robust_list))
+                exit_robust_list(tsk);
+#ifdef CONFIG_COMPAT
+        if (unlikely(tsk->compat_robust_list))
+                compat_exit_robust_list(tsk);
+#endif
+#endif
        /* Get rid of any cached register state */
        deactivate_mm(tsk, mm);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 2b465dfde426..47e63349d1b2 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -664,14 +664,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
                /* Timer is expired, act upon the callback mode */
                switch(timer->cb_mode) {
-                case HRTIMER_CB_IRQSAFE_NO_RESTART:
-                        debug_hrtimer_deactivate(timer);
-                        /*
-                         * We can call the callback from here. No restart
-                         * happens, so no danger of recursion
-                         */
-                        BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
-                        return 1;
                case HRTIMER_CB_IRQSAFE_PERCPU:
                case HRTIMER_CB_IRQSAFE_UNLOCKED:
                        /*
@@ -683,7 +675,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
                         */
                        debug_hrtimer_deactivate(timer);
                        return 1;
-                case HRTIMER_CB_IRQSAFE:
                case HRTIMER_CB_SOFTIRQ:
                        /*
                         * Move everything else into the softirq pending list !
@@ -1209,6 +1200,7 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
                enum hrtimer_restart (*fn)(struct hrtimer *);
                struct hrtimer *timer;
                int restart;
+                int emulate_hardirq_ctx = 0;
                timer = list_entry(cpu_base->cb_pending.next,
                                   struct hrtimer, cb_entry);
@@ -1217,10 +1209,24 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
                timer_stats_account_hrtimer(timer);
                fn = timer->function;
+                /*
+                 * A timer might have been added to the cb_pending list
+                 * when it was migrated during a cpu-offline operation.
+                 * Emulate hardirq context for such timers.
+                 */
+                if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
+                    timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED)
+                        emulate_hardirq_ctx = 1;
                __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
                spin_unlock_irq(&cpu_base->lock);
-                restart = fn(timer);
+                if (unlikely(emulate_hardirq_ctx)) {
+                        local_irq_disable();
+                        restart = fn(timer);
+                        local_irq_enable();
+                } else
+                        restart = fn(timer);
                spin_lock_irq(&cpu_base->lock);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 5072cf1685a2..7b8b0f21a5b1 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -304,17 +304,24 @@ int sprint_symbol(char *buffer, unsigned long address)
        char *modname;
        const char *name;
        unsigned long offset, size;
-        char namebuf[KSYM_NAME_LEN];
+        int len;
-        name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
+        name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
        if (!name)
                return sprintf(buffer, "0x%lx", address);
+        if (name != buffer)
+                strcpy(buffer, name);
+        len = strlen(buffer);
+        buffer += len;
        if (modname)
-                return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset,
+                len += sprintf(buffer, "+%#lx/%#lx [%s]",
-                                size, modname);
+                                                offset, size, modname);
        else
-                return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size);
+                len += sprintf(buffer, "+%#lx/%#lx", offset, size);
+        return len;
 }
 /* Look up a kernel symbol and print it to the kernel messages. */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 8b57a2597f21..9f8a3f25259a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -72,7 +72,7 @@ static bool kprobe_enabled;
 DEFINE_MUTEX(kprobe_mutex);             /* Protects kprobe_table */
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
 static struct {
-        spinlock_t lock ____cacheline_aligned;
+        spinlock_t lock ____cacheline_aligned_in_smp;
 } kretprobe_table_locks[KPROBE_TABLE_SIZE];
 static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
@@ -613,30 +613,37 @@ static int __kprobes __register_kprobe(struct kprobe *p,
                return -EINVAL;
        p->addr = addr;
-        if (!kernel_text_address((unsigned long) p->addr) ||
+        preempt_disable();
-            in_kprobes_functions((unsigned long) p->addr))
+        if (!__kernel_text_address((unsigned long) p->addr) ||
+            in_kprobes_functions((unsigned long) p->addr)) {
+                preempt_enable();
                return -EINVAL;
+        }
        p->mod_refcounted = 0;
        /*
         * Check if are we probing a module.
         */
-        probed_mod = module_text_address((unsigned long) p->addr);
+        probed_mod = __module_text_address((unsigned long) p->addr);
        if (probed_mod) {
-                struct module *calling_mod = module_text_address(called_from);
+                struct module *calling_mod;
+                calling_mod = __module_text_address(called_from);
                /*
                 * We must allow modules to probe themself and in this case
                 * avoid incrementing the module refcount, so as to allow
                 * unloading of self probing modules.
                 */
                if (calling_mod && calling_mod != probed_mod) {
-                        if (unlikely(!try_module_get(probed_mod)))
+                        if (unlikely(!try_module_get(probed_mod))) {
+                                preempt_enable();
                                return -EINVAL;
+                        }
                        p->mod_refcounted = 1;
                } else
                        probed_mod = NULL;
        }
+        preempt_enable();
        p->nmissed = 0;
        INIT_LIST_HEAD(&p->list);
@@ -718,6 +725,10 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
        struct kprobe *old_p;
        if (p->mod_refcounted) {
+                /*
+                 * Since we've already incremented refcount,
+                 * we don't need to disable preemption.
+                 */
                mod = module_text_address((unsigned long)p->addr);
                if (mod)
                        module_put(mod);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 153dcb2639c3..895337b16a24 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1308,9 +1308,10 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
 */
 static inline int fastpath_timer_check(struct task_struct *tsk)
 {
-        struct signal_struct *sig = tsk->signal;
+        struct signal_struct *sig;
-        if (unlikely(!sig))
+        /* tsk == current, ensure it is safe to use ->signal/sighand */
+        if (unlikely(tsk->exit_state))
                return 0;
        if (!task_cputime_zero(&tsk->cputime_expires)) {
@@ -1323,6 +1324,8 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
                if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
                        return 1;
        }
+        sig = tsk->signal;
        if (!task_cputime_zero(&sig->cputime_expires)) {
                struct task_cputime group_sample;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 19122cf6d827..b8f7ce9473e8 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -174,7 +174,7 @@ static void suspend_test_finish(const char *label)
         * has some performance issues.  The stack dump of a WARN_ON
         * is more likely to get the right attention than a printk...
         */
-        WARN_ON(msec > (TEST_SUSPEND_SECONDS * 1000));
+        WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
 }
 #else
diff --git a/kernel/profile.c b/kernel/profile.c
index 9830a037d8db..5b7d1ac7124c 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -544,7 +544,7 @@ static const struct file_operations proc_profile_operations = {
 };
 #ifdef CONFIG_SMP
-static void __init profile_nop(void *unused)
+static inline void profile_nop(void *unused)
 {
 }
diff --git a/kernel/relay.c b/kernel/relay.c
index 8d13a7855c08..32b0befdcb6a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -400,7 +400,7 @@ void relay_reset(struct rchan *chan)
        }
        mutex_lock(&relay_channels_mutex);
-        for_each_online_cpu(i)
+        for_each_possible_cpu(i)
                if (chan->buf[i])
                        __relay_reset(chan->buf[i], 0);
        mutex_unlock(&relay_channels_mutex);
@@ -611,10 +611,9 @@ struct rchan *relay_open(const char *base_filename,
        return chan;
 free_bufs:
-        for_each_online_cpu(i) {
+        for_each_possible_cpu(i) {
-                if (!chan->buf[i])
+                if (chan->buf[i])
-                        break;
+                        relay_close_buf(chan->buf[i]);
-                relay_close_buf(chan->buf[i]);
        }
        kref_put(&chan->kref, relay_destroy_channel);
diff --git a/kernel/sched.c b/kernel/sched.c
index ad10d0aae1d7..338340a3fb89 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -397,9 +397,9 @@ struct cfs_rq {
         * 'curr' points to currently running entity on this cfs_rq.
         * It is set to NULL otherwise (i.e when none are currently running).
         */
-        struct sched_entity *curr, *next;
+        struct sched_entity *curr, *next, *last;
-        unsigned long nr_spread_over;
+        unsigned int nr_spread_over;
 #ifdef CONFIG_FAIR_GROUP_SCHED
        struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
@@ -703,45 +703,18 @@ static __read_mostly char *sched_feat_names[] = {
 #undef SCHED_FEAT
-static int sched_feat_open(struct inode *inode, struct file *filp)
+static int sched_feat_show(struct seq_file *m, void *v)
-{
-        filp->private_data = inode->i_private;
-        return 0;
-}
-static ssize_t
-sched_feat_read(struct file *filp, char __user *ubuf,
-                size_t cnt, loff_t *ppos)
 {
-        char *buf;
-        int r = 0;
-        int len = 0;
        int i;
        for (i = 0; sched_feat_names[i]; i++) {
-                len += strlen(sched_feat_names[i]);
+                if (!(sysctl_sched_features & (1UL << i)))
-                len += 4;
+                        seq_puts(m, "NO_");
+                seq_printf(m, "%s ", sched_feat_names[i]);
        }
+        seq_puts(m, "\n");
-        buf = kmalloc(len + 2, GFP_KERNEL);
+        return 0;
-        if (!buf)
-                return -ENOMEM;
-        for (i = 0; sched_feat_names[i]; i++) {
-                if (sysctl_sched_features & (1UL << i))
-                        r += sprintf(buf + r, "%s ", sched_feat_names[i]);
-                else
-                        r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
-        }
-        r += sprintf(buf + r, "\n");
-        WARN_ON(r >= len + 2);
-        r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-        kfree(buf);
-        return r;
 }
 static ssize_t
@@ -786,10 +759,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
        return cnt;
 }
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+        return single_open(filp, sched_feat_show, NULL);
+}
 static struct file_operations sched_feat_fops = {
-        .open   = sched_feat_open,
+        .open           = sched_feat_open,
-        .read   = sched_feat_read,
+        .write          = sched_feat_write,
-        .write  = sched_feat_write,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = single_release,
 };
 static __init int sched_init_debug(void)
@@ -969,6 +949,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
        }
 }
+void task_rq_unlock_wait(struct task_struct *p)
+{
+        struct rq *rq = task_rq(p);
+        smp_mb(); /* spin-unlock-wait is not a full memory barrier */
+        spin_unlock_wait(&rq->lock);
+}
 static void __task_rq_unlock(struct rq *rq)
        __releases(rq->lock)
 {
@@ -1448,6 +1436,8 @@ static unsigned long cpu_avg_load_per_task(int cpu)
        if (rq->nr_running)
                rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+        else
+                rq->avg_load_per_task = 0;
        return rq->avg_load_per_task;
 }
@@ -1463,27 +1453,13 @@ static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
                        unsigned long sd_shares, unsigned long sd_rq_weight)
 {
-        int boost = 0;
        unsigned long shares;
        unsigned long rq_weight;
        if (!tg->se[cpu])
                return;
-        rq_weight = tg->cfs_rq[cpu]->load.weight;
+        rq_weight = tg->cfs_rq[cpu]->rq_weight;
-        /*
-         * If there are currently no tasks on the cpu pretend there is one of
-         * average load so that when a new task gets to run here it will not
-         * get delayed by group starvation.
-         */
-        if (!rq_weight) {
-                boost = 1;
-                rq_weight = NICE_0_LOAD;
-        }
-        if (unlikely(rq_weight > sd_rq_weight))
-                rq_weight = sd_rq_weight;
        /*
         *           \Sum shares * rq_weight
@@ -1491,7 +1467,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
         *               \Sum rq_weight
         *
         */
-        shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+        shares = (sd_shares * rq_weight) / sd_rq_weight;
        shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
        if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1500,11 +1476,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
                unsigned long flags;
                spin_lock_irqsave(&rq->lock, flags);
-                /*
+                tg->cfs_rq[cpu]->shares = shares;
-                 * record the actual number of shares, not the boosted amount.
-                 */
-                tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-                tg->cfs_rq[cpu]->rq_weight = rq_weight;
                __set_se_shares(tg->se[cpu], shares);
                spin_unlock_irqrestore(&rq->lock, flags);
@@ -1518,13 +1490,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
 */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-        unsigned long rq_weight = 0;
+        unsigned long weight, rq_weight = 0;
        unsigned long shares = 0;
        struct sched_domain *sd = data;
        int i;
        for_each_cpu_mask(i, sd->span) {
-                rq_weight += tg->cfs_rq[i]->load.weight;
+                /*
+                 * If there are currently no tasks on the cpu, pretend there
+                 * is one of average load so that when a new task gets to
+                 * run here it will not get delayed by group starvation.
+                 */
+                weight = tg->cfs_rq[i]->load.weight;
+                if (!weight)
+                        weight = NICE_0_LOAD;
+                tg->cfs_rq[i]->rq_weight = weight;
+                rq_weight += weight;
                shares += tg->cfs_rq[i]->shares;
        }
@@ -1534,9 +1516,6 @@ static int tg_shares_up(struct task_group *tg, void *data)
        if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
                shares = tg->shares;
-        if (!rq_weight)
-                rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
        for_each_cpu_mask(i, sd->span)
                update_group_shares_cpu(tg, i, shares, rq_weight);
@@ -1805,7 +1784,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
        /*
         * Buddy candidates are cache hot:
         */
-        if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
+        if (sched_feat(CACHE_HOT_BUDDY) &&
+                        (&p->se == cfs_rq_of(&p->se)->next ||
+                         &p->se == cfs_rq_of(&p->se)->last))
                return 1;
        if (p->sched_class != &fair_sched_class)
@@ -5858,6 +5839,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags;
+        spin_lock_irqsave(&rq->lock, flags);
        __sched_fork(idle);
        idle->se.exec_start = sched_clock();
@@ -5865,7 +5848,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
        idle->cpus_allowed = cpumask_of_cpu(cpu);
        __set_task_cpu(idle, cpu);
-        spin_lock_irqsave(&rq->lock, flags);
        rq->curr = rq->idle = idle;
 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
        idle->oncpu = 1;
@@ -6112,7 +6094,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
 /*
 * Figure out where task on dead CPU should go, use force if necessary.
- * NOTE: interrupts should be disabled by the caller
 */
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
@@ -6622,28 +6603,6 @@ early_initcall(migration_init);
 #ifdef CONFIG_SCHED_DEBUG
-static inline const char *sd_level_to_string(enum sched_domain_level lvl)
-{
-        switch (lvl) {
-        case SD_LV_NONE:
-                        return "NONE";
-        case SD_LV_SIBLING:
-                        return "SIBLING";
-        case SD_LV_MC:
-                        return "MC";
-        case SD_LV_CPU:
-                        return "CPU";
-        case SD_LV_NODE:
-                        return "NODE";
-        case SD_LV_ALLNODES:
-                        return "ALLNODES";
-        case SD_LV_MAX:
-                        return "MAX";
-        }
-        return "MAX";
-}
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                                  cpumask_t *groupmask)
 {
@@ -6663,8 +6622,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                return -1;
        }
-        printk(KERN_CONT "span %s level %s\n",
+        printk(KERN_CONT "span %s level %s\n", str, sd->name);
-                str, sd_level_to_string(sd->level));
        if (!cpu_isset(cpu, sd->span)) {
                printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6875,15 +6833,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
        struct sched_domain *tmp;
        /* Remove the sched domains which do not contribute to scheduling. */
-        for (tmp = sd; tmp; tmp = tmp->parent) {
+        for (tmp = sd; tmp; ) {
                struct sched_domain *parent = tmp->parent;
                if (!parent)
                        break;
                if (sd_parent_degenerate(tmp, parent)) {
                        tmp->parent = parent->parent;
                        if (parent->parent)
                                parent->parent->child = tmp;
-                }
+                } else
+                        tmp = tmp->parent;
        }
        if (sd && sd_degenerate(sd)) {
@@ -7318,13 +7278,21 @@ struct allmasks {
 };
 #if     NR_CPUS > 128
-#define SCHED_CPUMASK_ALLOC             1
+#define SCHED_CPUMASK_DECLARE(v)        struct allmasks *v
-#define SCHED_CPUMASK_FREE(v)           kfree(v)
+static inline void sched_cpumask_alloc(struct allmasks **masks)
-#define SCHED_CPUMASK_DECLARE(v)        struct allmasks *v
+{
+        *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
+}
+static inline void sched_cpumask_free(struct allmasks *masks)
+{
+        kfree(masks);
+}
 #else
-#define SCHED_CPUMASK_ALLOC             0
+#define SCHED_CPUMASK_DECLARE(v)        struct allmasks _v, *v = &_v
-#define SCHED_CPUMASK_FREE(v)
+static inline void sched_cpumask_alloc(struct allmasks **masks)
-#define SCHED_CPUMASK_DECLARE(v)        struct allmasks _v, *v = &_v
+{ }
+static inline void sched_cpumask_free(struct allmasks *masks)
+{ }
 #endif
 #define SCHED_CPUMASK_VAR(v, a)         cpumask_t *v = (cpumask_t *) \
@@ -7400,9 +7368,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                return -ENOMEM;
        }
-#if SCHED_CPUMASK_ALLOC
        /* get space for all scratch cpumask variables */
-        allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
+        sched_cpumask_alloc(&allmasks);
        if (!allmasks) {
                printk(KERN_WARNING "Cannot alloc cpumask array\n");
                kfree(rd);
@@ -7411,7 +7378,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
                return -ENOMEM;
        }
-#endif
        tmpmask = (cpumask_t *)allmasks;
@@ -7665,13 +7632,14 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                cpu_attach_domain(sd, rd, i);
        }
-        SCHED_CPUMASK_FREE((void *)allmasks);
+        sched_cpumask_free(allmasks);
        return 0;
 #ifdef CONFIG_NUMA
 error:
        free_sched_groups(cpu_map, tmpmask);
-        SCHED_CPUMASK_FREE((void *)allmasks);
+        sched_cpumask_free(allmasks);
+        kfree(rd);
        return -ENOMEM;
 #endif
 }
@@ -7734,8 +7702,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
        cpumask_t tmpmask;
        int i;
-        unregister_sched_domain_sysctl();
        for_each_cpu_mask_nr(i, *cpu_map)
                cpu_attach_domain(NULL, &def_root_domain, i);
        synchronize_sched();
@@ -7773,13 +7739,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
 *
 * The passed in 'doms_new' should be kmalloc'd. This routine takes
 * ownership of it and will kfree it when done with it. If the caller
- * failed the kmalloc call, then it can pass in doms_new == NULL,
+ * failed the kmalloc call, then it can pass in doms_new == NULL &&
- * and partition_sched_domains() will fallback to the single partition
+ * ndoms_new == 1, and partition_sched_domains() will fallback to
- * 'fallback_doms', it also forces the domains to be rebuilt.
+ * the single partition 'fallback_doms', it also forces the domains
+ * to be rebuilt.
 *
- * If doms_new==NULL it will be replaced with cpu_online_map.
+ * If doms_new == NULL it will be replaced with cpu_online_map.
- * ndoms_new==0 is a special case for destroying existing domains.
+ * ndoms_new == 0 is a special case for destroying existing domains,
- * It will not create the default domain.
+ * and it will not create the default domain.
 *
 * Call with hotplug lock held
 */
@@ -7812,7 +7779,7 @@ match1:
                ndoms_cur = 0;
                doms_new = &fallback_doms;
                cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-                dattr_new = NULL;
+                WARN_ON_ONCE(dattr_new);
        }
        /* Build new domains */
@@ -8472,7 +8439,7 @@ static
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
        struct cfs_rq *cfs_rq;
-        struct sched_entity *se, *parent_se;
+        struct sched_entity *se;
        struct rq *rq;
        int i;
@@ -8488,18 +8455,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
-                cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
+                cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
-                                GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+                                      GFP_KERNEL, cpu_to_node(i));
                if (!cfs_rq)
                        goto err;
-                se = kmalloc_node(sizeof(struct sched_entity),
+                se = kzalloc_node(sizeof(struct sched_entity),
-                                GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+                                  GFP_KERNEL, cpu_to_node(i));
                if (!se)
                        goto err;
-                parent_se = parent ? parent->se[i] : NULL;
+                init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
-                init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
        }
        return 1;
@@ -8560,7 +8526,7 @@ static
 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
        struct rt_rq *rt_rq;
-        struct sched_rt_entity *rt_se, *parent_se;
+        struct sched_rt_entity *rt_se;
        struct rq *rq;
        int i;
@@ -8577,18 +8543,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
-                rt_rq = kmalloc_node(sizeof(struct rt_rq),
+                rt_rq = kzalloc_node(sizeof(struct rt_rq),
-                                GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+                                     GFP_KERNEL, cpu_to_node(i));
                if (!rt_rq)
                        goto err;
-                rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
+                rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
-                                GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+                                     GFP_KERNEL, cpu_to_node(i));
                if (!rt_se)
                        goto err;
-                parent_se = parent ? parent->rt_se[i] : NULL;
+                init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
-                init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
        }
        return 1;
@@ -9231,11 +9196,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 * (balbir@in.ibm.com).
 */
-/* track cpu usage of a group of tasks */
+/* track cpu usage of a group of tasks and its child groups */
 struct cpuacct {
        struct cgroup_subsys_state css;
        /* cpuusage holds pointer to a u64-type object on every cpu */
        u64 *cpuusage;
+        struct cpuacct *parent;
 };
 struct cgroup_subsys cpuacct_subsys;
@@ -9269,6 +9235,9 @@ static struct cgroup_subsys_state *cpuacct_create(
                return ERR_PTR(-ENOMEM);
        }
+        if (cgrp->parent)
+                ca->parent = cgroup_ca(cgrp->parent);
        return &ca->css;
 }
@@ -9348,14 +9317,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
        struct cpuacct *ca;
+        int cpu;
        if (!cpuacct_subsys.active)
                return;
+        cpu = task_cpu(tsk);
        ca = task_ca(tsk);
-        if (ca) {
-                u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
+        for (; ca; ca = ca->parent) {
+                u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
                *cpuusage += cputime;
        }
 }
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 5ae17762ec32..baf2f17af462 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -53,6 +53,40 @@ static unsigned long nsec_low(unsigned long long nsec)
 #define SPLIT_NS(x) nsec_high(x), nsec_low(x)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void print_cfs_group_stats(struct seq_file *m, int cpu,
+                struct task_group *tg)
+{
+        struct sched_entity *se = tg->se[cpu];
+        if (!se)
+                return;
+#define P(F) \
+        SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F)
+#define PN(F) \
+        SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+        PN(se->exec_start);
+        PN(se->vruntime);
+        PN(se->sum_exec_runtime);
+#ifdef CONFIG_SCHEDSTATS
+        PN(se->wait_start);
+        PN(se->sleep_start);
+        PN(se->block_start);
+        PN(se->sleep_max);
+        PN(se->block_max);
+        PN(se->exec_max);
+        PN(se->slice_max);
+        PN(se->wait_max);
+        PN(se->wait_sum);
+        P(se->wait_count);
+#endif
+        P(se->load.weight);
+#undef PN
+#undef P
+}
+#endif
 static void
 print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 {
@@ -121,14 +155,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
        char path[128] = "";
-        struct cgroup *cgroup = NULL;
        struct task_group *tg = cfs_rq->tg;
-        if (tg)
+        cgroup_path(tg->css.cgroup, path, sizeof(path));
-                cgroup = tg->css.cgroup;
-        if (cgroup)
-                cgroup_path(cgroup, path, sizeof(path));
        SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
 #else
@@ -144,7 +173,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
        last = __pick_last_entity(cfs_rq);
        if (last)
                max_vruntime = last->vruntime;
-        min_vruntime = rq->cfs.min_vruntime;
+        min_vruntime = cfs_rq->min_vruntime;
        rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime;
        spin_unlock_irqrestore(&rq->lock, flags);
        SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "MIN_vruntime",
@@ -161,31 +190,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
                        SPLIT_NS(spread0));
        SEQ_printf(m, "  .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
        SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
-#ifdef CONFIG_SCHEDSTATS
-#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
-        P(yld_exp_empty);
-        P(yld_act_empty);
-        P(yld_both_empty);
-        P(yld_count);
-        P(sched_switch);
-        P(sched_count);
-        P(sched_goidle);
-        P(ttwu_count);
-        P(ttwu_local);
-        P(bkl_count);
-#undef P
+        SEQ_printf(m, "  .%-30s: %d\n", "nr_spread_over",
-#endif
-        SEQ_printf(m, "  .%-30s: %ld\n", "nr_spread_over",
                        cfs_rq->nr_spread_over);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_SMP
        SEQ_printf(m, "  .%-30s: %lu\n", "shares", cfs_rq->shares);
 #endif
+        print_cfs_group_stats(m, cpu, cfs_rq->tg);
 #endif
 }
@@ -193,14 +205,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 {
 #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
        char path[128] = "";
-        struct cgroup *cgroup = NULL;
        struct task_group *tg = rt_rq->tg;
-        if (tg)
+        cgroup_path(tg->css.cgroup, path, sizeof(path));
-                cgroup = tg->css.cgroup;
-        if (cgroup)
-                cgroup_path(cgroup, path, sizeof(path));
        SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
 #else
@@ -260,6 +267,25 @@ static void print_cpu(struct seq_file *m, int cpu)
 #undef P
 #undef PN
+#ifdef CONFIG_SCHEDSTATS
+#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
+        P(yld_exp_empty);
+        P(yld_act_empty);
+        P(yld_both_empty);
+        P(yld_count);
+        P(sched_switch);
+        P(sched_count);
+        P(sched_goidle);
+        P(ttwu_count);
+        P(ttwu_local);
+        P(bkl_count);
+#undef P
+#endif
        print_cfs_stats(m, cpu);
        print_rt_stats(m, cpu);
@@ -271,7 +297,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
        u64 now = ktime_to_ns(ktime_get());
        int cpu;
-        SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n",
+        SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n",
                init_utsname()->release,
                (int)strcspn(init_utsname()->version, " "),
                init_utsname()->version);
@@ -422,10 +448,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 #undef __P
        {
+                unsigned int this_cpu = raw_smp_processor_id();
                u64 t0, t1;
-                t0 = sched_clock();
+                t0 = cpu_clock(this_cpu);
-                t1 = sched_clock();
+                t1 = cpu_clock(this_cpu);
                SEQ_printf(m, "%-35s:%21Ld\n",
                           "clock-delta", (long long)(t1-t0));
        }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ce514afd78ff..98345e45b059 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -341,23 +341,20 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
                cfs_rq->rb_leftmost = next_node;
        }
-        if (cfs_rq->next == se)
-                cfs_rq->next = NULL;
        rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 }
-static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
-{
-        return cfs_rq->rb_leftmost;
-}
 static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
 {
-        return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
+        struct rb_node *left = cfs_rq->rb_leftmost;
+        if (!left)
+                return NULL;
+        return rb_entry(left, struct sched_entity, run_node);
 }
-static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 {
        struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -719,6 +716,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
                __enqueue_entity(cfs_rq, se);
 }
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+        if (cfs_rq->last == se)
+                cfs_rq->last = NULL;
+        if (cfs_rq->next == se)
+                cfs_rq->next = NULL;
+}
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 {
@@ -741,6 +747,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 #endif
        }
+        clear_buddies(cfs_rq, se);
        if (se != cfs_rq->curr)
                __dequeue_entity(cfs_rq, se);
        account_entity_dequeue(cfs_rq, se);
@@ -794,24 +802,15 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static int
 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
-static struct sched_entity *
-pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-        if (!cfs_rq->next || wakeup_preempt_entity(cfs_rq->next, se) == 1)
-                return se;
-        return cfs_rq->next;
-}
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
-        struct sched_entity *se = NULL;
+        struct sched_entity *se = __pick_next_entity(cfs_rq);
-        if (first_fair(cfs_rq)) {
+        if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
-                se = __pick_next_entity(cfs_rq);
+                return cfs_rq->next;
-                se = pick_next(cfs_rq, se);
-                set_next_entity(cfs_rq, se);
+        if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
-        }
+                return cfs_rq->last;
        return se;
 }
@@ -983,6 +982,8 @@ static void yield_task_fair(struct rq *rq)
        if (unlikely(cfs_rq->nr_running == 1))
                return;
+        clear_buddies(cfs_rq, se);
        if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
                update_rq_clock(rq);
                /*
@@ -1325,26 +1326,53 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
        return 0;
 }
+static void set_last_buddy(struct sched_entity *se)
+{
+        for_each_sched_entity(se)
+                cfs_rq_of(se)->last = se;
+}
+static void set_next_buddy(struct sched_entity *se)
+{
+        for_each_sched_entity(se)
+                cfs_rq_of(se)->next = se;
+}
 /*
 * Preempt the current task with a newly woken task if needed:
 */
 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 {
        struct task_struct *curr = rq->curr;
-        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
        struct sched_entity *se = &curr->se, *pse = &p->se;
        if (unlikely(rt_prio(p->prio))) {
+                struct cfs_rq *cfs_rq = task_cfs_rq(curr);
                update_rq_clock(rq);
                update_curr(cfs_rq);
                resched_task(curr);
                return;
        }
+        if (unlikely(p->sched_class != &fair_sched_class))
+                return;
        if (unlikely(se == pse))
                return;
-        cfs_rq_of(pse)->next = pse;
+        /*
+         * Only set the backward buddy when the current task is still on the
+         * rq. It may already be off the rq when a wakeup gets interleaved
+         * with schedule on the ->pre_schedule() or idle_balance() point,
+         * either of which can drop the rq lock.
+         *
+         * Also, during early boot the idle thread is in the fair class; for
+         * obvious reasons it's a bad idea to schedule back to the idle thread.
+         */
+        if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
+                set_last_buddy(se);
+        set_next_buddy(pse);
        /*
         * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1396,6 +1424,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
        do {
                se = pick_next_entity(cfs_rq);
+                set_next_entity(cfs_rq, se);
                cfs_rq = group_cfs_rq(se);
        } while (cfs_rq);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index fda016218296..da5d93b5d2c6 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -12,3 +12,4 @@ SCHED_FEAT(LB_BIAS, 1)
 SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
 SCHED_FEAT(ASYM_EFF_LOAD, 1)
 SCHED_FEAT(WAKEUP_OVERLAP, 0)
+SCHED_FEAT(LAST_BUDDY, 1)
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index ee71bec1da66..7dbf72a2b02c 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -298,9 +298,11 @@ static inline void account_group_user_time(struct task_struct *tsk,
 {
        struct signal_struct *sig;
-        sig = tsk->signal;
+        /* tsk == current, ensure it is safe to use ->signal */
-        if (unlikely(!sig))
+        if (unlikely(tsk->exit_state))
                return;
+        sig = tsk->signal;
        if (sig->cputime.totals) {
                struct task_cputime *times;
@@ -325,9 +327,11 @@ static inline void account_group_system_time(struct task_struct *tsk,
 {
        struct signal_struct *sig;
-        sig = tsk->signal;
+        /* tsk == current, ensure it is safe to use ->signal */
-        if (unlikely(!sig))
+        if (unlikely(tsk->exit_state))
                return;
+        sig = tsk->signal;
        if (sig->cputime.totals) {
                struct task_cputime *times;
@@ -353,8 +357,11 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
        struct signal_struct *sig;
        sig = tsk->signal;
+        /* see __exit_signal()->task_rq_unlock_wait() */
+        barrier();
        if (unlikely(!sig))
                return;
        if (sig->cputime.totals) {
                struct task_cputime *times;
diff --git a/kernel/smp.c b/kernel/smp.c
index f362a8553777..75c8dde58c55 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -51,10 +51,6 @@ static void csd_flag_wait(struct call_single_data *data)
 {
        /* Wait for response */
        do {
-                /*
-                 * We need to see the flags store in the IPI handler
-                 */
-                smp_mb();
                if (!(data->flags & CSD_FLAG_WAIT))
                        break;
                cpu_relax();
@@ -76,6 +72,11 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
        list_add_tail(&data->list, &dst->list);
        spin_unlock_irqrestore(&dst->lock, flags);
+        /*
+         * Make the list addition visible before sending the ipi.
+         */
+        smp_mb();
        if (ipi)
                arch_send_call_function_single_ipi(cpu);
@@ -157,7 +158,7 @@ void generic_smp_call_function_single_interrupt(void)
         * Need to see other stores to list head for checking whether
         * list is empty without holding q->lock
         */
-        smp_mb();
+        smp_read_barrier_depends();
        while (!list_empty(&q->list)) {
                unsigned int data_flags;
@@ -191,7 +192,7 @@ void generic_smp_call_function_single_interrupt(void)
                /*
                 * See comment on outer loop
                 */
-                smp_mb();
+                smp_read_barrier_depends();
        }
 }
@@ -370,6 +371,11 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
        list_add_tail_rcu(&data->csd.list, &call_function_queue);
        spin_unlock_irqrestore(&call_function_lock, flags);
+        /*
+         * Make the list addition visible before sending the ipi.
+         */
+        smp_mb();
        /* Send a message to all CPUs in the map */
        arch_send_call_function_ipi(mask);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7110daeb9a90..e7c69a720d69 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -269,10 +269,11 @@ void irq_enter(void)
 {
        int cpu = smp_processor_id();
-        if (idle_cpu(cpu) && !in_interrupt())
+        if (idle_cpu(cpu) && !in_interrupt()) {
+                __irq_enter();
                tick_check_idle(cpu);
+        } else
-        __irq_enter();
+                __irq_enter();
 }
 #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 9bc4c00872c9..24e8ceacc388 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -112,7 +112,7 @@ static int chill(void *unused)
 int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
 {
        struct work_struct *sm_work;
-        int i;
+        int i, ret;
        /* Set up initial state. */
        mutex_lock(&lock);
@@ -137,8 +137,9 @@ int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
        /* This will release the thread on our CPU. */
        put_cpu();
        flush_workqueue(stop_machine_wq);
+        ret = active.fnret;
        mutex_unlock(&lock);
-        return active.fnret;
+        return ret;
 }
 int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index a77b27b11b04..e14a23281707 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -31,7 +31,7 @@ cond_syscall(sys_socketpair);
 cond_syscall(sys_bind);
 cond_syscall(sys_listen);
 cond_syscall(sys_accept);
-cond_syscall(sys_paccept);
+cond_syscall(sys_accept4);
 cond_syscall(sys_connect);
 cond_syscall(sys_getsockname);
 cond_syscall(sys_getpeername);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 5bbb1044f847..342fc9ccab46 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -568,6 +568,9 @@ static void tick_nohz_switch_to_nohz(void)
 */
 static void tick_nohz_kick_tick(int cpu)
 {
+#if 0
+        /* Switch back to 2.6.27 behaviour */
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
        ktime_t delta, now;
@@ -584,6 +587,7 @@ static void tick_nohz_kick_tick(int cpu)
                return;
        tick_nohz_restart(ts, now);
+#endif
 }
 #else
diff --git a/kernel/timer.c b/kernel/timer.c
index 56becf373c58..dbd50fabe4c7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -112,27 +112,8 @@ timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
                                      tbase_get_deferrable(timer->base));
 }
-/**
+static unsigned long round_jiffies_common(unsigned long j, int cpu,
- * __round_jiffies - function to round jiffies to a full second
+                bool force_up)
- * @j: the time in (absolute) jiffies that should be rounded
- * @cpu: the processor number on which the timeout will happen
- *
- * __round_jiffies() rounds an absolute time in the future (in jiffies)
- * up or down to (approximately) full seconds. This is useful for timers
- * for which the exact time they fire does not matter too much, as long as
- * they fire approximately every X seconds.
- *
- * By rounding these timers to whole seconds, all such timers will fire
- * at the same time, rather than at various times spread out. The goal
- * of this is to have the CPU wake up less, which saves power.
- *
- * The exact rounding is skewed for each processor to avoid all
- * processors firing at the exact same time, which could lead
- * to lock contention or spurious cache line bouncing.
- *
- * The return value is the rounded version of the @j parameter.
- */
-unsigned long __round_jiffies(unsigned long j, int cpu)
 {
        int rem;
        unsigned long original = j;
@@ -154,8 +135,9 @@ unsigned long __round_jiffies(unsigned long j, int cpu)
         * due to delays of the timer irq, long irq off times etc etc) then
         * we should round down to the whole second, not up. Use 1/4th second
         * as cutoff for this rounding as an extreme upper bound for this.
+         * But never round down if @force_up is set.
         */
-        if (rem < HZ/4) /* round down */
+        if (rem < HZ/4 && !force_up) /* round down */
                j = j - rem;
        else /* round up */
                j = j - rem + HZ;
@@ -167,6 +149,31 @@ unsigned long __round_jiffies(unsigned long j, int cpu)
                return original;
        return j;
 }
+/**
+ * __round_jiffies - round an absolute jiffies value to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * Rounds an absolute time in the future (in jiffies) up or down to
+ * (approximately) a whole second.  Intended for timers whose exact
+ * expiry does not matter much, as long as they fire roughly every
+ * X seconds.
+ *
+ * Rounding such timers to whole seconds makes them fire together
+ * instead of spread out, so the CPU wakes up less often, which saves
+ * power.
+ *
+ * The rounding is skewed per processor so that not all processors fire
+ * at exactly the same moment, which could lead to lock contention or
+ * spurious cache line bouncing.
+ *
+ * The return value is the rounded version of the @j parameter.
+ */
+unsigned long __round_jiffies(unsigned long j, int cpu)
+{
+        /* force_up == false: rounding down is permitted */
+        unsigned long rounded = round_jiffies_common(j, cpu, false);
+        return rounded;
+}
 EXPORT_SYMBOL_GPL(__round_jiffies);
 /**
@@ -191,13 +198,10 @@ EXPORT_SYMBOL_GPL(__round_jiffies);
 */
 unsigned long __round_jiffies_relative(unsigned long j, int cpu)
 {
-        /*
+        unsigned long j0 = jiffies;
-         * In theory the following code can skip a jiffy in case jiffies
-         * increments right between the addition and the later subtraction.
+        /* Use j0 because jiffies might change while we run */
-         * However since the entire point of this function is to use approximate
+        return round_jiffies_common(j + j0, cpu, false) - j0;
-         * timeouts, it's entirely ok to not handle that.
-         */
-        return  __round_jiffies(j + jiffies, cpu) - jiffies;
 }
 EXPORT_SYMBOL_GPL(__round_jiffies_relative);
@@ -218,7 +222,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative);
 */
 unsigned long round_jiffies(unsigned long j)
 {
-        return __round_jiffies(j, raw_smp_processor_id());
+        return round_jiffies_common(j, raw_smp_processor_id(), false);
 }
 EXPORT_SYMBOL_GPL(round_jiffies);
@@ -243,6 +247,71 @@ unsigned long round_jiffies_relative(unsigned long j)
 }
 EXPORT_SYMBOL_GPL(round_jiffies_relative);
+/**
+ * __round_jiffies_up - round an absolute jiffies value up to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * Behaves like __round_jiffies() but never rounds down.  Use it for
+ * timeouts whose exact firing time does not matter much, provided they
+ * do not fire too early.
+ */
+unsigned long __round_jiffies_up(unsigned long j, int cpu)
+{
+        /* force_up == true: rounding down is not permitted */
+        unsigned long rounded = round_jiffies_common(j, cpu, true);
+        return rounded;
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_up);
+/**
+ * __round_jiffies_up_relative - round a relative jiffies value up to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * Behaves like __round_jiffies_relative() but never rounds down.  Use
+ * it for timeouts whose exact firing time does not matter much,
+ * provided they do not fire too early.
+ */
+unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
+{
+        /* Sample jiffies once: it might change while we run */
+        unsigned long base = jiffies;
+        unsigned long expires = round_jiffies_common(base + j, cpu, true);
+        /* Convert the rounded absolute time back to a relative one */
+        return expires - base;
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
+/**
+ * round_jiffies_up - round an absolute jiffies value up to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ *
+ * Behaves like round_jiffies() but never rounds down.  Use it for
+ * timeouts whose exact firing time does not matter much, provided they
+ * do not fire too early.
+ */
+unsigned long round_jiffies_up(unsigned long j)
+{
+        /* Round for whichever processor we happen to be running on */
+        int cpu = raw_smp_processor_id();
+        return round_jiffies_common(j, cpu, true);
+}
+EXPORT_SYMBOL_GPL(round_jiffies_up);
+/**
+ * round_jiffies_up_relative - round a relative jiffies value up to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ *
+ * Behaves like round_jiffies_relative() but never rounds down.  Use it
+ * for timeouts whose exact firing time does not matter much, provided
+ * they do not fire too early.
+ */
+unsigned long round_jiffies_up_relative(unsigned long j)
+{
+        /* Round for whichever processor we happen to be running on */
+        int cpu = raw_smp_processor_id();
+        return __round_jiffies_up_relative(j, cpu);
+}
+EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
 static inline void set_running_timer(struct tvec_base *base,
                                        struct timer_list *timer)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index b58f43bec363..33dbefd471e8 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -25,7 +25,7 @@ config TRACING
        bool
        select DEBUG_FS
        select RING_BUFFER
-        select STACKTRACE
+        select STACKTRACE if STACKTRACE_SUPPORT
        select TRACEPOINTS
        select NOP_TRACER
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4a39d24568c8..78db083390f0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -185,7 +185,6 @@ enum {
 };
 static int ftrace_filtered;
-static int tracing_on;
 static LIST_HEAD(ftrace_new_addrs);
@@ -327,96 +326,89 @@ ftrace_record_ip(unsigned long ip)
 static int
 __ftrace_replace_code(struct dyn_ftrace *rec,
-                      unsigned char *old, unsigned char *new, int enable)
+                      unsigned char *nop, int enable)
 {
        unsigned long ip, fl;
+        unsigned char *call, *old, *new;
        ip = rec->ip;
-        if (ftrace_filtered && enable) {
+        /*
+         * If this record is not to be traced and
+         * it is not enabled then do nothing.
+         *
+         * If this record is not to be traced and
+         * it is enabled then disable it.
+         *
+         */
+        if (rec->flags & FTRACE_FL_NOTRACE) {
+                if (rec->flags & FTRACE_FL_ENABLED)
+                        rec->flags &= ~FTRACE_FL_ENABLED;
+                else
+                        return 0;
+        } else if (ftrace_filtered && enable) {
                /*
-                 * If filtering is on:
+                 * Filtering is on:
-                 *
-                 * If this record is set to be filtered and
-                 * is enabled then do nothing.
-                 *
-                 * If this record is set to be filtered and
-                 * it is not enabled, enable it.
-                 *
-                 * If this record is not set to be filtered
-                 * and it is not enabled do nothing.
-                 *
-                 * If this record is set not to trace then
-                 * do nothing.
-                 *
-                 * If this record is set not to trace and
-                 * it is enabled then disable it.
-                 *
-                 * If this record is not set to be filtered and
-                 * it is enabled, disable it.
                 */
-                fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE |
+                fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
-                                   FTRACE_FL_ENABLED);
-                if ((fl ==  (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) ||
+                /* Record is filtered and enabled, do nothing */
-                    (fl ==  (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE)) ||
+                if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
-                    !fl || (fl == FTRACE_FL_NOTRACE))
                        return 0;
-                /*
+                /* Record is not filtered and is not enabled do nothing */
-                 * If it is enabled disable it,
+                if (!fl)
-                 * otherwise enable it!
+                        return 0;
-                 */
-                if (fl & FTRACE_FL_ENABLED) {
+                /* Record is not filtered but enabled, disable it */
-                        /* swap new and old */
+                if (fl == FTRACE_FL_ENABLED)
-                        new = old;
-                        old = ftrace_call_replace(ip, FTRACE_ADDR);
                        rec->flags &= ~FTRACE_FL_ENABLED;
-                } else {
+                else
-                        new = ftrace_call_replace(ip, FTRACE_ADDR);
+                /* Otherwise record is filtered but not enabled, enable it */
                        rec->flags |= FTRACE_FL_ENABLED;
-                }
        } else {
+                /* Disable or not filtered */
                if (enable) {
-                        /*
+                        /* if record is enabled, do nothing */
-                         * If this record is set not to trace and is
-                         * not enabled, do nothing.
-                         */
-                        fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED);
-                        if (fl == FTRACE_FL_NOTRACE)
-                                return 0;
-                        new = ftrace_call_replace(ip, FTRACE_ADDR);
-                } else
-                        old = ftrace_call_replace(ip, FTRACE_ADDR);
-                if (enable) {
                        if (rec->flags & FTRACE_FL_ENABLED)
                                return 0;
                        rec->flags |= FTRACE_FL_ENABLED;
                } else {
+                        /* if record is not enabled do nothing */
                        if (!(rec->flags & FTRACE_FL_ENABLED))
                                return 0;
                        rec->flags &= ~FTRACE_FL_ENABLED;
                }
        }
+        call = ftrace_call_replace(ip, FTRACE_ADDR);
+        if (rec->flags & FTRACE_FL_ENABLED) {
+                old = nop;
+                new = call;
+        } else {
+                old = call;
+                new = nop;
+        }
        return ftrace_modify_code(ip, old, new);
 }
 static void ftrace_replace_code(int enable)
 {
        int i, failed;
-        unsigned char *new = NULL, *old = NULL;
+        unsigned char *nop = NULL;
        struct dyn_ftrace *rec;
        struct ftrace_page *pg;
-        if (enable)
+        nop = ftrace_nop_replace();
-                old = ftrace_nop_replace();
-        else
-                new = ftrace_nop_replace();
        for (pg = ftrace_pages_start; pg; pg = pg->next) {
                for (i = 0; i < pg->index; i++) {
@@ -434,7 +426,7 @@ static void ftrace_replace_code(int enable)
                                unfreeze_record(rec);
                        }
-                        failed = __ftrace_replace_code(rec, old, new, enable);
+                        failed = __ftrace_replace_code(rec, nop, enable);
                        if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
                                rec->flags |= FTRACE_FL_FAILED;
                                if ((system_state == SYSTEM_BOOTING) ||
@@ -506,13 +498,10 @@ static int __ftrace_modify_code(void *data)
 {
        int *command = data;
-        if (*command & FTRACE_ENABLE_CALLS) {
+        if (*command & FTRACE_ENABLE_CALLS)
                ftrace_replace_code(1);
-                tracing_on = 1;
+        else if (*command & FTRACE_DISABLE_CALLS)
-        } else if (*command & FTRACE_DISABLE_CALLS) {
                ftrace_replace_code(0);
-                tracing_on = 0;
-        }
        if (*command & FTRACE_UPDATE_TRACE_FUNC)
                ftrace_update_ftrace_func(ftrace_trace_function);
@@ -538,8 +527,7 @@ static void ftrace_startup(void)
        mutex_lock(&ftrace_start_lock);
        ftrace_start++;
-        if (ftrace_start == 1)
+        command |= FTRACE_ENABLE_CALLS;
-                command |= FTRACE_ENABLE_CALLS;
        if (saved_ftrace_func != ftrace_trace_function) {
                saved_ftrace_func = ftrace_trace_function;
@@ -677,7 +665,7 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
        cnt = num_to_init / ENTRIES_PER_PAGE;
        pr_info("ftrace: allocating %ld entries in %d pages\n",
-                num_to_init, cnt);
+                num_to_init, cnt + 1);
        for (i = 0; i < cnt; i++) {
                pg->next = (void *)get_zeroed_page(GFP_KERNEL);
@@ -738,6 +726,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
                    ((iter->flags & FTRACE_ITER_FAILURES) &&
                     !(rec->flags & FTRACE_FL_FAILED)) ||
+                    ((iter->flags & FTRACE_ITER_FILTER) &&
+                     !(rec->flags & FTRACE_FL_FILTER)) ||
                    ((iter->flags & FTRACE_ITER_NOTRACE) &&
                     !(rec->flags & FTRACE_FL_NOTRACE))) {
                        rec = NULL;
@@ -757,13 +748,11 @@ static void *t_start(struct seq_file *m, loff_t *pos)
        void *p = NULL;
        loff_t l = -1;
-        if (*pos != iter->pos) {
+        if (*pos > iter->pos)
-                for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l))
+                *pos = iter->pos;
-                        ;
-        } else {
+        l = *pos;
-                l = *pos;
+        p = t_next(m, p, &l);
-                p = t_next(m, p, &l);
-        }
        return p;
 }
@@ -774,15 +763,21 @@ static void t_stop(struct seq_file *m, void *p)
 static int t_show(struct seq_file *m, void *v)
 {
+        struct ftrace_iterator *iter = m->private;
        struct dyn_ftrace *rec = v;
        char str[KSYM_SYMBOL_LEN];
+        int ret = 0;
        if (!rec)
                return 0;
        kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
-        seq_printf(m, "%s\n", str);
+        ret = seq_printf(m, "%s\n", str);
+        if (ret < 0) {
+                iter->pos--;
+                iter->idx--;
+        }
        return 0;
 }
@@ -808,7 +803,7 @@ ftrace_avail_open(struct inode *inode, struct file *file)
                return -ENOMEM;
        iter->pg = ftrace_pages_start;
-        iter->pos = -1;
+        iter->pos = 0;
        ret = seq_open(file, &show_ftrace_seq_ops);
        if (!ret) {
@@ -895,7 +890,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
        if (file->f_mode & FMODE_READ) {
                iter->pg = ftrace_pages_start;
-                iter->pos = -1;
+                iter->pos = 0;
                iter->flags = enable ? FTRACE_ITER_FILTER :
                        FTRACE_ITER_NOTRACE;
@@ -1186,7 +1181,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
        mutex_lock(&ftrace_sysctl_lock);
        mutex_lock(&ftrace_start_lock);
-        if (iter->filtered && ftrace_start && ftrace_enabled)
+        if (ftrace_start && ftrace_enabled)
                ftrace_run_update_code(FTRACE_ENABLE_CALLS);
        mutex_unlock(&ftrace_start_lock);
        mutex_unlock(&ftrace_sysctl_lock);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index cedf4e268285..f780e9552f91 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -16,14 +16,49 @@
 #include <linux/list.h>
 #include <linux/fs.h>
+#include "trace.h"
+/* Global flag to disable all recording to ring buffers */
+static int ring_buffers_off __read_mostly;
+/**
+ * tracing_on - re-enable recording to all tracing buffers
+ *
+ * Clears the global ring_buffers_off flag, undoing a previous
+ * tracing_off() so that writes to the ring buffers are accepted
+ * again.
+ */
+void tracing_on(void)
+{
+        ring_buffers_off = 0;
+}
+/**
+ * tracing_off - stop recording to all tracing buffers
+ *
+ * Sets the global ring_buffers_off flag so that every attempt to
+ * record into a ring buffer fails.  This does not remove whatever
+ * overhead the tracers themselves may be causing; it only makes
+ * the writes to the ring buffers fail.
+ */
+void tracing_off(void)
+{
+        ring_buffers_off = 1;
+}
 /* Up this if you want to test the TIME_EXTENTS and normalization */
 #define DEBUG_SHIFT 0
 /* FIXME!!! */
 u64 ring_buffer_time_stamp(int cpu)
 {
+        u64 time;
+        preempt_disable_notrace();
        /* shift to debug/test normalization and TIME_EXTENTS */
-        return sched_clock() << DEBUG_SHIFT;
+        time = sched_clock() << DEBUG_SHIFT;
+        preempt_enable_notrace();
+        return time;
 }
 void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
@@ -503,6 +538,12 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
        LIST_HEAD(pages);
        int i, cpu;
+        /*
+         * Always succeed at resizing a non-existent buffer:
+         */
+        if (!buffer)
+                return size;
        size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
        size *= BUF_PAGE_SIZE;
        buffer_size = buffer->pages * BUF_PAGE_SIZE;
@@ -576,6 +617,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
                list_del_init(&page->list);
                free_buffer_page(page);
        }
+        mutex_unlock(&buffer->mutex);
        return -ENOMEM;
 }
@@ -1022,8 +1064,23 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
        struct ring_buffer_event *event;
        u64 ts, delta;
        int commit = 0;
+        int nr_loops = 0;
 again:
+        /*
+         * We allow for interrupts to reenter here and do a trace.
+         * If one does, it will cause this original code to loop
+         * back here. Even with heavy interrupts happening, this
+         * should only happen a few times in a row. If this happens
+         * 1000 times in a row, there must be either an interrupt
+         * storm or we have something buggy.
+         * Bail!
+         */
+        if (unlikely(++nr_loops > 1000)) {
+                RB_WARN_ON(cpu_buffer, 1);
+                return NULL;
+        }
        ts = ring_buffer_time_stamp(cpu_buffer->cpu);
        /*
@@ -1045,7 +1102,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
                /* Did the write stamp get updated already? */
                if (unlikely(ts < cpu_buffer->write_stamp))
-                        goto again;
+                        delta = 0;
                if (test_time_stamp(delta)) {
@@ -1118,6 +1175,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
        struct ring_buffer_event *event;
        int cpu, resched;
+        if (ring_buffers_off)
+                return NULL;
        if (atomic_read(&buffer->record_disabled))
                return NULL;
@@ -1234,6 +1294,9 @@ int ring_buffer_write(struct ring_buffer *buffer,
        int ret = -EBUSY;
        int cpu, resched;
+        if (ring_buffers_off)
+                return -EBUSY;
        if (atomic_read(&buffer->record_disabled))
                return -EBUSY;
@@ -1532,10 +1595,23 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
        struct buffer_page *reader = NULL;
        unsigned long flags;
+        int nr_loops = 0;
        spin_lock_irqsave(&cpu_buffer->lock, flags);
 again:
+        /*
+         * This should normally only loop twice. But because the
+         * start of the reader inserts an empty page, it causes
+         * a case where we will loop three times. There should be no
+         * reason to loop four times (that I know of).
+         */
+        if (unlikely(++nr_loops > 3)) {
+                RB_WARN_ON(cpu_buffer, 1);
+                reader = NULL;
+                goto out;
+        }
        reader = cpu_buffer->reader_page;
        /* If there's more to read, return this page */
@@ -1665,6 +1741,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
        struct ring_buffer_per_cpu *cpu_buffer;
        struct ring_buffer_event *event;
        struct buffer_page *reader;
+        int nr_loops = 0;
        if (!cpu_isset(cpu, buffer->cpumask))
                return NULL;
@@ -1672,6 +1749,19 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
        cpu_buffer = buffer->buffers[cpu];
 again:
+        /*
+         * We repeat when a timestamp is encountered. It is possible
+         * to get multiple timestamps from an interrupt entering just
+         * as one timestamp is about to be written. The max times
+         * that this can happen is the number of nested interrupts we
+         * can have.  Nesting 10 deep of interrupts is clearly
+         * an anomaly.
+         */
+        if (unlikely(++nr_loops > 10)) {
+                RB_WARN_ON(cpu_buffer, 1);
+                return NULL;
+        }
        reader = rb_get_reader_page(cpu_buffer);
        if (!reader)
                return NULL;
@@ -1722,6 +1812,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
        struct ring_buffer *buffer;
        struct ring_buffer_per_cpu *cpu_buffer;
        struct ring_buffer_event *event;
+        int nr_loops = 0;
        if (ring_buffer_iter_empty(iter))
                return NULL;
@@ -1730,6 +1821,19 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
        buffer = cpu_buffer->buffer;
 again:
+        /*
+         * We repeat when a timestamp is encountered. It is possible
+         * to get multiple timestamps from an interrupt entering just
+         * as one timestamp is about to be written. The max times
+         * that this can happen is the number of nested interrupts we
+         * can have. Nesting 10 deep of interrupts is clearly
+         * an anomaly.
+         */
+        if (unlikely(++nr_loops > 10)) {
+                RB_WARN_ON(cpu_buffer, 1);
+                return NULL;
+        }
        if (rb_per_cpu_empty(cpu_buffer))
                return NULL;
@@ -2014,3 +2118,69 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
        return 0;
 }
+/*
+ * Read handler for the debugfs "tracing_on" file: reports "1\n" when the
+ * int behind filp->private_data is zero and "0\n" otherwise.
+ */
+static ssize_t
+rb_simple_read(struct file *filp, char __user *ubuf,
+               size_t cnt, loff_t *ppos)
+{
+        const int *off_flag = filp->private_data;
+        char text[64];
+        int len;
+        /* The stored flag means "off"; the file shows the inverse ("on") */
+        len = sprintf(text, "%d\n", *off_flag ? 0 : 1);
+        return simple_read_from_buffer(ubuf, cnt, ppos, text, len);
+}
+/*
+ * Write handler for the debugfs "tracing_on" file.
+ *
+ * Copies at most sizeof(buf) - 1 bytes from user space, parses them as a
+ * base-10 unsigned long and stores the logical inverse in the int behind
+ * filp->private_data (a non-zero value written clears the "off" flag).
+ *
+ * Returns @cnt on success, -EINVAL if the input does not fit in the
+ * local buffer, -EFAULT if the user copy fails, or the negative error
+ * from strict_strtoul().
+ */
+static ssize_t
+rb_simple_write(struct file *filp, const char __user *ubuf,
+                size_t cnt, loff_t *ppos)
+{
+        int *p = filp->private_data;
+        char buf[64];
+        /* strict_strtoul() stores through an unsigned long pointer */
+        unsigned long val;
+        int ret;
+        /* Leave room for the terminating NUL added below */
+        if (cnt >= sizeof(buf))
+                return -EINVAL;
+        if (copy_from_user(buf, ubuf, cnt))
+                return -EFAULT;
+        buf[cnt] = 0;
+        ret = strict_strtoul(buf, 10, &val);
+        if (ret < 0)
+                return ret;
+        /* !ring_buffers_off == tracing_on */
+        *p = !val;
+        (*ppos)++;
+        return cnt;
+}
+/*
+ * File operations for the debugfs "tracing_on" file.  The table is never
+ * modified after initialisation, so it is declared const.
+ */
+static const struct file_operations rb_simple_fops = {
+        .open           = tracing_open_generic,
+        .read           = rb_simple_read,
+        .write          = rb_simple_write,
+};
+/*
+ * Create the debugfs "tracing_on" file, backed by ring_buffers_off, in
+ * the directory returned by tracing_init_dentry().  A failure to create
+ * the file is only logged; the initcall always returns 0.
+ */
+static __init int rb_init_debugfs(void)
+{
+        struct dentry *parent = tracing_init_dentry();
+        struct dentry *file;
+        file = debugfs_create_file("tracing_on", 0644, parent,
+                                   &ring_buffers_off, &rb_simple_fops);
+        if (file)
+                return 0;
+        pr_warning("Could not create debugfs 'tracing_on' entry\n");
+        return 0;
+}
+fs_initcall(rb_init_debugfs);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8a499e2adaec..d86e3252f300 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -705,6 +705,7 @@ static void ftrace_trace_stack(struct trace_array *tr,
                               unsigned long flags,
                               int skip, int pc)
 {
+#ifdef CONFIG_STACKTRACE
        struct ring_buffer_event *event;
        struct stack_entry *entry;
        struct stack_trace trace;
@@ -730,6 +731,7 @@ static void ftrace_trace_stack(struct trace_array *tr,
        save_stack_trace(&trace);
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+#endif
 }
 void __trace_stack(struct trace_array *tr,
@@ -1086,17 +1088,20 @@ static void s_stop(struct seq_file *m, void *p)
        mutex_unlock(&trace_types_lock);
 }
-#define KRETPROBE_MSG "[unknown/kretprobe'd]"
 #ifdef CONFIG_KRETPROBES
-static inline int kretprobed(unsigned long addr)
+static inline const char *kretprobed(const char *name)
 {
-        return addr == (unsigned long)kretprobe_trampoline;
+        static const char tramp_name[] = "kretprobe_trampoline";
+        int size = sizeof(tramp_name);
+        if (strncmp(tramp_name, name, size) == 0)
+                return "[unknown/kretprobe'd]";
+        return name;
 }
 #else
-static inline int kretprobed(unsigned long addr)
+static inline const char *kretprobed(const char *name)
 {
-        return 0;
+        return name;
 }
 #endif /* CONFIG_KRETPROBES */
@@ -1105,10 +1110,13 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
 {
 #ifdef CONFIG_KALLSYMS
        char str[KSYM_SYMBOL_LEN];
+        const char *name;
        kallsyms_lookup(address, NULL, NULL, NULL, str);
-        return trace_seq_printf(s, fmt, str);
+        name = kretprobed(str);
+        return trace_seq_printf(s, fmt, name);
 #endif
        return 1;
 }
@@ -1119,9 +1127,12 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
 {
 #ifdef CONFIG_KALLSYMS
        char str[KSYM_SYMBOL_LEN];
+        const char *name;
        sprint_symbol(str, address);
-        return trace_seq_printf(s, fmt, str);
+        name = kretprobed(str);
+        return trace_seq_printf(s, fmt, name);
 #endif
        return 1;
 }
@@ -1375,10 +1386,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
                seq_print_ip_sym(s, field->ip, sym_flags);
                trace_seq_puts(s, " (");
-                if (kretprobed(field->parent_ip))
+                seq_print_ip_sym(s, field->parent_ip, sym_flags);
-                        trace_seq_puts(s, KRETPROBE_MSG);
-                else
-                        seq_print_ip_sym(s, field->parent_ip, sym_flags);
                trace_seq_puts(s, ")\n");
                break;
        }
@@ -1494,12 +1502,9 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
                        ret = trace_seq_printf(s, " <-");
                        if (!ret)
                                return TRACE_TYPE_PARTIAL_LINE;
-                        if (kretprobed(field->parent_ip))
+                        ret = seq_print_ip_sym(s,
-                                ret = trace_seq_puts(s, KRETPROBE_MSG);
+                                               field->parent_ip,
-                        else
+                                               sym_flags);
-                                ret = seq_print_ip_sym(s,
-                                                       field->parent_ip,
-                                                       sym_flags);
                        if (!ret)
                                return TRACE_TYPE_PARTIAL_LINE;
                }
@@ -1750,7 +1755,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
                return TRACE_TYPE_HANDLED;
        SEQ_PUT_FIELD_RET(s, entry->pid);
-        SEQ_PUT_FIELD_RET(s, iter->cpu);
+        SEQ_PUT_FIELD_RET(s, entry->cpu);
        SEQ_PUT_FIELD_RET(s, iter->ts);
        switch (entry->type) {
@@ -1931,6 +1936,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
                        ring_buffer_read_finish(iter->buffer_iter[cpu]);
        }
        mutex_unlock(&trace_types_lock);
+        kfree(iter);
        return ERR_PTR(-ENOMEM);
 }
@@ -2671,7 +2677,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 {
        unsigned long val;
        char buf[64];
-        int ret;
+        int ret, cpu;
        struct trace_array *tr = filp->private_data;
        if (cnt >= sizeof(buf))
@@ -2699,6 +2705,14 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
                goto out;
        }
+        /* disable all cpu buffers */
+        for_each_tracing_cpu(cpu) {
+                if (global_trace.data[cpu])
+                        atomic_inc(&global_trace.data[cpu]->disabled);
+                if (max_tr.data[cpu])
+                        atomic_inc(&max_tr.data[cpu]->disabled);
+        }
        if (val != global_trace.entries) {
                ret = ring_buffer_resize(global_trace.buffer, val);
                if (ret < 0) {
@@ -2730,6 +2744,13 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
        if (tracing_disabled)
                cnt = -ENOMEM;
 out:
+        for_each_tracing_cpu(cpu) {
+                if (global_trace.data[cpu])
+                        atomic_dec(&global_trace.data[cpu]->disabled);
+                if (max_tr.data[cpu])
+                        atomic_dec(&max_tr.data[cpu]->disabled);
+        }
        max_tr.entries = global_trace.entries;
        mutex_unlock(&trace_types_lock);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f928f2a87b9b..d4dc69ddebd7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -970,6 +970,51 @@ undo:
        return ret;
 }
+#ifdef CONFIG_SMP
+/* Request block used by work_on_cpu() to run a function via a work item */
+struct work_for_cpu {
+        /* work item; do_work_for_cpu() recovers the request from it */
+        struct work_struct work;
+        /* function to call */
+        long (*fn)(void *);
+        /* argument handed to @fn */
+        void *arg;
+        /* return value of @fn, or -EINVAL set by work_on_cpu() */
+        long ret;
+};
+/* Work callback: run the requested function and record its return value */
+static void do_work_for_cpu(struct work_struct *w)
+{
+        struct work_for_cpu *req;
+        req = container_of(w, struct work_for_cpu, work);
+        req->ret = req->fn(req->arg);
+}
+/**
+ * work_on_cpu - run a function in user context on a particular cpu
+ * @cpu: the cpu to run on
+ * @fn: the function to run
+ * @arg: the function arg
+ *
+ * This will return -EINVAL if the cpu is not online, or the return value
+ * of @fn otherwise.
+ */
+long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
+{
+        struct work_for_cpu wfc;
+        INIT_WORK(&wfc.work, do_work_for_cpu);
+        wfc.fn = fn;
+        wfc.arg = arg;
+        /* Bracket the online check and the work with get/put_online_cpus() */
+        get_online_cpus();
+        if (likely(cpu_online(cpu))) {
+                schedule_work_on(cpu, &wfc.work);
+                /* wfc lives on our stack: flush before returning */
+                flush_work(&wfc.work);
+        } else {
+                wfc.ret = -EINVAL;
+        }
+        put_online_cpus();
+        return wfc.ret;
+}
+EXPORT_SYMBOL_GPL(work_on_cpu);
+#endif /* CONFIG_SMP */
 void __init init_workqueues(void)
 {
        cpu_populated_map = cpu_online_map;