Diffstat (limited to 'kernel')
-rw-r--r--  kernel/fork.c                         13
-rw-r--r--  kernel/futex.c                         7
-rw-r--r--  kernel/irq/numa_migrate.c              1
-rw-r--r--  kernel/kthread.c                      26
-rw-r--r--  kernel/module.c                        3
-rw-r--r--  kernel/mutex.c                         3
-rw-r--r--  kernel/panic.c                        12
-rw-r--r--  kernel/posix-cpu-timers.c              9
-rw-r--r--  kernel/power/disk.c                    8
-rw-r--r--  kernel/power/user.c                    9
-rw-r--r--  kernel/ptrace.c                       16
-rw-r--r--  kernel/sched.c                       160
-rw-r--r--  kernel/sched_cpupri.c                  5
-rw-r--r--  kernel/sched_rt.c                     15
-rw-r--r--  kernel/timer.c                         7
-rw-r--r--  kernel/trace/Kconfig                   4
-rw-r--r--  kernel/trace/blktrace.c               10
-rw-r--r--  kernel/trace/trace.c                  36
-rw-r--r--  kernel/trace/trace_events.c           12
-rw-r--r--  kernel/trace/trace_events_filter.c    14
-rw-r--r--  kernel/trace/trace_events_stage_2.h    4
-rw-r--r--  kernel/trace/trace_syscalls.c          2
-rw-r--r--  kernel/workqueue.c                    36
23 files changed, 297 insertions, 115 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index 989c7c202b3d..b9e2edd00726 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -800,6 +800,12 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
 	sig->cputime_expires.virt_exp = cputime_zero;
 	sig->cputime_expires.sched_exp = 0;
 
+	if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
+		sig->cputime_expires.prof_exp =
+			secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
+		sig->cputimer.running = 1;
+	}
+
 	/* The timer lists. */
 	INIT_LIST_HEAD(&sig->cpu_timers[0]);
 	INIT_LIST_HEAD(&sig->cpu_timers[1]);
@@ -815,11 +821,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 		atomic_inc(&current->signal->live);
 		return 0;
 	}
-	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
-
-	if (sig)
-		posix_cpu_timers_init_group(sig);
 
+	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
 	tsk->signal = sig;
 	if (!sig)
 		return -ENOMEM;
@@ -859,6 +862,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
 	task_unlock(current->group_leader);
 
+	posix_cpu_timers_init_group(sig);
+
 	acct_init_pacct(&sig->pacct);
 
 	tty_audit_fork(sig);
diff --git a/kernel/futex.c b/kernel/futex.c
index 6b50a024bca2..eef8cd26b5e5 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -883,7 +883,12 @@ retry_private:
 out_unlock:
 	double_unlock_hb(hb1, hb2);
 
-	/* drop_futex_key_refs() must be called outside the spinlocks. */
+	/*
+	 * drop_futex_key_refs() must be called outside the spinlocks. During
+	 * the requeue we moved futex_q's from the hash bucket at key1 to the
+	 * one at key2 and updated their key pointer. We no longer need to
+	 * hold the references to key1.
+	 */
 	while (--drop_count >= 0)
 		drop_futex_key_refs(&key1);
 
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 243d6121e50e..44bbdcbaf8d2 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -54,6 +54,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
 static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
 {
 	free_kstat_irqs(old_desc, desc);
+	free_desc_masks(old_desc, desc);
 	arch_free_chip_data(old_desc, desc);
 }
 
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 84bbadd4d021..4ebaf8519abf 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -76,6 +76,7 @@ static int kthread(void *_create)
 
 	/* OK, tell user we're spawned, wait for stop or wakeup */
 	__set_current_state(TASK_UNINTERRUPTIBLE);
+	create->result = current;
 	complete(&create->started);
 	schedule();
 
@@ -96,22 +97,10 @@ static void create_kthread(struct kthread_create_info *create)
 
 	/* We want our own signal handler (we take no signals by default). */
 	pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
-	if (pid < 0) {
+	if (pid < 0)
 		create->result = ERR_PTR(pid);
-	} else {
-		struct sched_param param = { .sched_priority = 0 };
+	else
 		wait_for_completion(&create->started);
-		read_lock(&tasklist_lock);
-		create->result = find_task_by_pid_ns(pid, &init_pid_ns);
-		read_unlock(&tasklist_lock);
-		/*
-		 * root may have changed our (kthreadd's) priority or CPU mask.
-		 * The kernel thread should not inherit these properties.
-		 */
-		sched_setscheduler(create->result, SCHED_NORMAL, &param);
-		set_user_nice(create->result, KTHREAD_NICE_LEVEL);
-		set_cpus_allowed_ptr(create->result, cpu_all_mask);
-	}
 	complete(&create->done);
 }
 
@@ -154,11 +143,20 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 	wait_for_completion(&create.done);
 
 	if (!IS_ERR(create.result)) {
+		struct sched_param param = { .sched_priority = 0 };
 		va_list args;
+
 		va_start(args, namefmt);
 		vsnprintf(create.result->comm, sizeof(create.result->comm),
 			  namefmt, args);
 		va_end(args);
+		/*
+		 * root may have changed our (kthreadd's) priority or CPU mask.
+		 * The kernel thread should not inherit these properties.
+		 */
+		sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
+		set_user_nice(create.result, KTHREAD_NICE_LEVEL);
+		set_cpus_allowed_ptr(create.result, cpu_all_mask);
 	}
 	return create.result;
 }
diff --git a/kernel/module.c b/kernel/module.c
index 05f014efa32c..e797812a4d95 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2388,6 +2388,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
 	blocking_notifier_call_chain(&module_notify_list,
 			MODULE_STATE_LIVE, mod);
 
+	/* We need to finish all async code before the module init sequence is done */
+	async_synchronize_full();
+
 	mutex_lock(&module_mutex);
 	/* Drop initial reference. */
 	module_put(mod);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5d79781394a3..507cf2b5e9f1 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -148,7 +148,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 
 	preempt_disable();
 	mutex_acquire(&lock->dep_map, subclass, 0, ip);
-#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES)
+#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \
+    !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES)
 	/*
 	 * Optimistic spinning.
 	 *
diff --git a/kernel/panic.c b/kernel/panic.c
index 3fd8c5bf8b39..934fb377f4b3 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -213,8 +213,16 @@ unsigned long get_taint(void)
 
 void add_taint(unsigned flag)
 {
-	/* can't trust the integrity of the kernel anymore: */
-	debug_locks = 0;
+	/*
+	 * Can't trust the integrity of the kernel anymore.
+	 * We don't call directly debug_locks_off() because the issue
+	 * is not necessarily serious enough to set oops_in_progress to 1
+	 * Also we want to keep up lockdep for staging development and
+	 * post-warning case.
+	 */
+	if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off())
+		printk(KERN_WARNING "Disabling lockdep due to kernel taint\n");
+
 	set_bit(flag, &tainted_mask);
 }
 EXPORT_SYMBOL(add_taint);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 8e5d9a68b022..c9dcf98b4463 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -18,7 +18,7 @@ void update_rlimit_cpu(unsigned long rlim_new)
 
 	cputime = secs_to_cputime(rlim_new);
 	if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
-	    cputime_lt(current->signal->it_prof_expires, cputime)) {
+	    cputime_gt(current->signal->it_prof_expires, cputime)) {
 		spin_lock_irq(&current->sighand->siglock);
 		set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
 		spin_unlock_irq(&current->sighand->siglock);
@@ -224,7 +224,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 		cpu->cpu = virt_ticks(p);
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
+		cpu->sched = task_sched_runtime(p);
 		break;
 	}
 	return 0;
@@ -305,18 +305,19 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
 {
 	struct task_cputime cputime;
 
-	thread_group_cputime(p, &cputime);
 	switch (CPUCLOCK_WHICH(which_clock)) {
 	default:
 		return -EINVAL;
 	case CPUCLOCK_PROF:
+		thread_group_cputime(p, &cputime);
 		cpu->cpu = cputime_add(cputime.utime, cputime.stime);
 		break;
 	case CPUCLOCK_VIRT:
+		thread_group_cputime(p, &cputime);
 		cpu->cpu = cputime.utime;
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+		cpu->sched = thread_group_sched_runtime(p);
 		break;
 	}
 	return 0;
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 5f21ab2bbcdf..0854770b63b9 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,6 +22,7 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
+#include <scsi/scsi_scan.h>
 #include <asm/suspend.h>
 
 #include "power.h"
@@ -645,6 +646,13 @@ static int software_resume(void)
 		return 0;
 
 	/*
+	 * We can't depend on SCSI devices being available after loading one of
+	 * their modules if scsi_complete_async_scans() is not called and the
+	 * resume device usually is a SCSI one.
+	 */
+	scsi_complete_async_scans();
+
+	/*
 	 * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
 	 * is configured into the kernel. Since the regular hibernate
 	 * trigger path is via sysfs which takes a buffer mutex before
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 6c85359364f2..ed97375daae9 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -24,6 +24,7 @@
 #include <linux/cpu.h>
 #include <linux/freezer.h>
 #include <linux/smp_lock.h>
+#include <scsi/scsi_scan.h>
 
 #include <asm/uaccess.h>
 
@@ -92,6 +93,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 	filp->private_data = data;
 	memset(&data->handle, 0, sizeof(struct snapshot_handle));
 	if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
+		/* Hibernating. The image device should be accessible. */
 		data->swap = swsusp_resume_device ?
 			swap_type_of(swsusp_resume_device, 0, NULL) : -1;
 		data->mode = O_RDONLY;
@@ -99,6 +101,13 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 		if (error)
 			pm_notifier_call_chain(PM_POST_HIBERNATION);
 	} else {
+		/*
+		 * Resuming. We may need to wait for the image device to
+		 * appear.
+		 */
+		wait_for_device_probe();
+		scsi_complete_async_scans();
+
 		data->swap = -1;
 		data->mode = O_WRONLY;
 		error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index aaad0ec34194..64191fa09b7e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -21,9 +21,7 @@
 #include <linux/audit.h>
 #include <linux/pid_namespace.h>
 #include <linux/syscalls.h>
-
-#include <asm/pgtable.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 
 /*
@@ -48,7 +46,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
 	list_add(&child->ptrace_entry, &new_parent->ptraced);
 	child->parent = new_parent;
 }
 
 /*
  * Turn a tracing stop into a normal stop now, since with no tracer there
  * would be no way to wake it up with SIGCONT or SIGKILL. If there was a
@@ -173,7 +171,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
 	task_lock(task);
 	err = __ptrace_may_access(task, mode);
 	task_unlock(task);
-	return (!err ? true : false);
+	return !err;
 }
 
 int ptrace_attach(struct task_struct *task)
@@ -358,7 +356,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst
 		copied += retval;
 		src += retval;
 		dst += retval;
 		len -= retval;
 	}
 	return copied;
 }
@@ -383,7 +381,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
 		copied += retval;
 		src += retval;
 		dst += retval;
 		len -= retval;
 	}
 	return copied;
 }
@@ -496,9 +494,9 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
 		if (unlikely(!arch_has_single_step()))
 			return -EIO;
 		user_enable_single_step(child);
-	}
-	else
+	} else {
 		user_disable_single_step(child);
+	}
 
 	child->exit_code = data;
 	wake_up_process(child);
diff --git a/kernel/sched.c b/kernel/sched.c
index 6cc1fd5d5072..5724508c3b66 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1418,10 +1418,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		   struct rq_iterator *iterator);
 #endif
 
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+	CPUACCT_STAT_USER,	/* ... user mode */
+	CPUACCT_STAT_SYSTEM,	/* ... kernel mode */
+
+	CPUACCT_STAT_NSTATS,
+};
+
 #ifdef CONFIG_CGROUP_CPUACCT
 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+static void cpuacct_update_stats(struct task_struct *tsk,
+		enum cpuacct_stat_index idx, cputime_t val);
 #else
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+static inline void cpuacct_update_stats(struct task_struct *tsk,
+		enum cpuacct_stat_index idx, cputime_t val) {}
 #endif
 
 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
@@ -4511,9 +4523,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 
 /*
- * Return any ns on the sched_clock that have not yet been banked in
+ * Return any ns on the sched_clock that have not yet been accounted in
  * @p in case that task is currently running.
+ *
+ * Called with task_rq_lock() held on @rq.
  */
+static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+	u64 ns = 0;
+
+	if (task_current(rq, p)) {
+		update_rq_clock(rq);
+		ns = rq->clock - p->se.exec_start;
+		if ((s64)ns < 0)
+			ns = 0;
+	}
+
+	return ns;
+}
+
 unsigned long long task_delta_exec(struct task_struct *p)
 {
 	unsigned long flags;
@@ -4521,16 +4549,49 @@ unsigned long long task_delta_exec(struct task_struct *p)
 	u64 ns = 0;
 
 	rq = task_rq_lock(p, &flags);
+	ns = do_task_delta_exec(p, rq);
+	task_rq_unlock(rq, &flags);
 
-	if (task_current(rq, p)) {
-		u64 delta_exec;
-
-		update_rq_clock(rq);
-		delta_exec = rq->clock - p->se.exec_start;
-		if ((s64)delta_exec > 0)
-			ns = delta_exec;
-	}
+	return ns;
+}
+
+/*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
+ * pending runtime that have not been accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+	unsigned long flags;
+	struct rq *rq;
+	u64 ns = 0;
+
+	rq = task_rq_lock(p, &flags);
+	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+	task_rq_unlock(rq, &flags);
+
+	return ns;
+}
+
+/*
+ * Return sum_exec_runtime for the thread group.
+ * In case the task is currently running, return the sum plus current's
+ * pending runtime that have not been accounted yet.
+ *
+ * Note that the thread group might have other running tasks as well,
+ * so the return value not includes other pending runtime that other
+ * running tasks might have.
+ */
+unsigned long long thread_group_sched_runtime(struct task_struct *p)
+{
+	struct task_cputime totals;
+	unsigned long flags;
+	struct rq *rq;
+	u64 ns;
 
+	rq = task_rq_lock(p, &flags);
+	thread_group_cputime(p, &totals);
+	ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
 	task_rq_unlock(rq, &flags);
 
 	return ns;
@@ -4559,6 +4620,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
 		cpustat->nice = cputime64_add(cpustat->nice, tmp);
 	else
 		cpustat->user = cputime64_add(cpustat->user, tmp);
+
+	cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
 	/* Account for user time used */
 	acct_update_integrals(p);
 }
@@ -4620,6 +4683,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	else
 		cpustat->system = cputime64_add(cpustat->system, tmp);
 
+	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
 	/* Account for system time used */
 	acct_update_integrals(p);
 }
@@ -7302,7 +7367,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		cpumask_or(groupmask, groupmask, sched_group_cpus(group));
 
 		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
-		printk(KERN_CONT " %s", str);
+		printk(KERN_CONT " %s (__cpu_power = %d)", str,
+						group->__cpu_power);
 
 		group = group->next;
 	} while (group != sd->groups);
@@ -9925,6 +9991,7 @@ struct cpuacct {
 	struct cgroup_subsys_state css;
 	/* cpuusage holds pointer to a u64-type object on every cpu */
 	u64 *cpuusage;
+	struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
 	struct cpuacct *parent;
 };
 
@@ -9949,20 +10016,32 @@ static struct cgroup_subsys_state *cpuacct_create(
 	struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
 	struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+	int i;
 
 	if (!ca)
-		return ERR_PTR(-ENOMEM);
+		goto out;
 
 	ca->cpuusage = alloc_percpu(u64);
-	if (!ca->cpuusage) {
-		kfree(ca);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (!ca->cpuusage)
+		goto out_free_ca;
+
+	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+		if (percpu_counter_init(&ca->cpustat[i], 0))
+			goto out_free_counters;
 
 	if (cgrp->parent)
 		ca->parent = cgroup_ca(cgrp->parent);
 
 	return &ca->css;
+
+out_free_counters:
+	while (--i >= 0)
+		percpu_counter_destroy(&ca->cpustat[i]);
+	free_percpu(ca->cpuusage);
+out_free_ca:
+	kfree(ca);
+out:
+	return ERR_PTR(-ENOMEM);
 }
 
 /* destroy an existing cpu accounting group */
@@ -9970,7 +10049,10 @@ static void
 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
 	struct cpuacct *ca = cgroup_ca(cgrp);
+	int i;
 
+	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+		percpu_counter_destroy(&ca->cpustat[i]);
 	free_percpu(ca->cpuusage);
 	kfree(ca);
 }
@@ -10057,6 +10139,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
 	return 0;
 }
 
+static const char *cpuacct_stat_desc[] = {
+	[CPUACCT_STAT_USER] = "user",
+	[CPUACCT_STAT_SYSTEM] = "system",
+};
+
+static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
+		struct cgroup_map_cb *cb)
+{
+	struct cpuacct *ca = cgroup_ca(cgrp);
+	int i;
+
+	for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
+		s64 val = percpu_counter_read(&ca->cpustat[i]);
+		val = cputime64_to_clock_t(val);
+		cb->fill(cb, cpuacct_stat_desc[i], val);
+	}
+	return 0;
+}
+
 static struct cftype files[] = {
 	{
 		.name = "usage",
@@ -10067,7 +10168,10 @@ static struct cftype files[] = {
 		.name = "usage_percpu",
 		.read_seq_string = cpuacct_percpu_seq_read,
 	},
-
+	{
+		.name = "stat",
+		.read_map = cpuacct_stats_show,
+	},
 };
 
 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -10089,12 +10193,38 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 		return;
 
 	cpu = task_cpu(tsk);
+
+	rcu_read_lock();
+
 	ca = task_ca(tsk);
 
 	for (; ca; ca = ca->parent) {
 		u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
 		*cpuusage += cputime;
 	}
+
+	rcu_read_unlock();
+}
+
+/*
+ * Charge the system/user time to the task's accounting group.
+ */
+static void cpuacct_update_stats(struct task_struct *tsk,
+		enum cpuacct_stat_index idx, cputime_t val)
+{
+	struct cpuacct *ca;
+
+	if (unlikely(!cpuacct_subsys.active))
+		return;
+
+	rcu_read_lock();
+	ca = task_ca(tsk);
+
+	do {
+		percpu_counter_add(&ca->cpustat[idx], val);
+		ca = ca->parent;
+	} while (ca);
+	rcu_read_unlock();
 }
 
 struct cgroup_subsys cpuacct_subsys = {
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 1e00bfacf9b8..cdd3c89574cd 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -55,7 +55,7 @@ static int convert_prio(int prio)
  * cpupri_find - find the best (lowest-pri) CPU in the system
  * @cp: The cpupri context
  * @p: The task
- * @lowest_mask: A mask to fill in with selected CPUs
+ * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
  *
  * Note: This function returns the recommended CPUs as calculated during the
  * current invokation. By the time the call returns, the CPUs may have in
@@ -81,7 +81,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
 		if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
 			continue;
 
-		cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+		if (lowest_mask)
+			cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
 		return 1;
 	}
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 299d012b4394..f2c66f8f9712 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -948,20 +948,15 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
 
 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 {
-	cpumask_var_t mask;
-
 	if (rq->curr->rt.nr_cpus_allowed == 1)
 		return;
 
-	if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
-		return;
-
 	if (p->rt.nr_cpus_allowed != 1
-	    && cpupri_find(&rq->rd->cpupri, p, mask))
-		goto free;
+	    && cpupri_find(&rq->rd->cpupri, p, NULL))
+		return;
 
-	if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
-		goto free;
+	if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
+		return;
 
 	/*
 	 * There appears to be other cpus that can accept
@@ -970,8 +965,6 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 	 */
 	requeue_task_rt(rq, p, 1);
 	resched_task(rq->curr);
-free:
-	free_cpumask_var(mask);
 }
 
 #endif /* CONFIG_SMP */
diff --git a/kernel/timer.c b/kernel/timer.c
index b4555568b4e4..cffffad01c31 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -531,10 +531,13 @@ static void __init_timer(struct timer_list *timer,
 }
 
 /**
- * init_timer - initialize a timer.
+ * init_timer_key - initialize a timer
  * @timer: the timer to be initialized
+ * @name: name of the timer
+ * @key: lockdep class key of the fake lock used for tracking timer
+ *       sync lock dependencies
  *
- * init_timer() must be done to a timer prior calling *any* of the
+ * init_timer_key() must be done to a timer prior calling *any* of the
  * other timer functions.
  */
 void init_timer_key(struct timer_list *timer,
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 2246141bda4d..417d1985e299 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -312,7 +312,7 @@ config KMEMTRACE
 	  and profile kernel code.
 
 	  This requires an userspace application to use. See
-	  Documentation/vm/kmemtrace.txt for more information.
+	  Documentation/trace/kmemtrace.txt for more information.
 
 	  Saying Y will make the kernel somewhat larger and slower. However,
 	  if you disable kmemtrace at run-time or boot-time, the performance
@@ -403,7 +403,7 @@ config MMIOTRACE
 	  implementation and works via page faults. Tracing is disabled by
 	  default and can be enabled at run-time.
 
-	  See Documentation/tracers/mmiotrace.txt.
+	  See Documentation/trace/mmiotrace.txt.
 	  If you are not helping to develop drivers, say N.
 
 config MMIOTRACE_TEST
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b32ff446c3fb..921ef5d1f0ba 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1377,12 +1377,12 @@ static int blk_trace_str2mask(const char *str)
 {
 	int i;
 	int mask = 0;
-	char *s, *token;
+	char *buf, *s, *token;
 
-	s = kstrdup(str, GFP_KERNEL);
-	if (s == NULL)
+	buf = kstrdup(str, GFP_KERNEL);
+	if (buf == NULL)
 		return -ENOMEM;
-	s = strstrip(s);
+	s = strstrip(buf);
 
 	while (1) {
 		token = strsep(&s, ",");
@@ -1403,7 +1403,7 @@ static int blk_trace_str2mask(const char *str)
 			break;
 		}
 	}
-	kfree(s);
+	kfree(buf);
 
 	return mask;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9d28476a9851..1ce5dc6372b8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3277,19 +3277,13 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
 
 	info->tr = &global_trace;
 	info->cpu = cpu;
-	info->spare = ring_buffer_alloc_read_page(info->tr->buffer);
+	info->spare = NULL;
 	/* Force reading ring buffer for first read */
 	info->read = (unsigned int)-1;
-	if (!info->spare)
-		goto out;
 
 	filp->private_data = info;
 
-	return 0;
-
- out:
-	kfree(info);
-	return -ENOMEM;
+	return nonseekable_open(inode, filp);
 }
 
 static ssize_t
@@ -3304,6 +3298,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 	if (!count)
 		return 0;
 
+	if (!info->spare)
+		info->spare = ring_buffer_alloc_read_page(info->tr->buffer);
+	if (!info->spare)
+		return -ENOMEM;
+
 	/* Do we have previous read data to read? */
 	if (info->read < PAGE_SIZE)
 		goto read;
@@ -3342,7 +3341,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
 {
 	struct ftrace_buffer_info *info = file->private_data;
 
-	ring_buffer_free_read_page(info->tr->buffer, info->spare);
+	if (info->spare)
+		ring_buffer_free_read_page(info->tr->buffer, info->spare);
 	kfree(info);
 
 	return 0;
@@ -3428,14 +3428,19 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 	int size, i;
 	size_t ret;
 
-	/*
-	 * We can't seek on a buffer input
-	 */
-	if (unlikely(*ppos))
-		return -ESPIPE;
+	if (*ppos & (PAGE_SIZE - 1)) {
+		WARN_ONCE(1, "Ftrace: previous read must page-align\n");
+		return -EINVAL;
+	}
 
+	if (len & (PAGE_SIZE - 1)) {
+		WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
+		if (len < PAGE_SIZE)
+			return -EINVAL;
+		len &= PAGE_MASK;
+	}
 
-	for (i = 0; i < PIPE_BUFFERS && len; i++, len -= size) {
+	for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) {
 		struct page *page;
 		int r;
 
@@ -3474,6 +3479,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		spd.partial[i].offset = 0;
 		spd.partial[i].private = (unsigned long)ref;
 		spd.nr_pages++;
+		*ppos += PAGE_SIZE;
 	}
 
 	spd.nr_pages = i;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 64ec4d278ffb..576f4fa2af0d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -503,6 +503,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 	if (copy_from_user(&buf, ubuf, cnt))
 		return -EFAULT;
+	buf[cnt] = '\0';
 
 	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
 	if (!pred)
@@ -520,9 +521,10 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		return cnt;
 	}
 
-	if (filter_add_pred(call, pred)) {
+	err = filter_add_pred(call, pred);
+	if (err < 0) {
 		filter_free_pred(pred);
-		return -EINVAL;
+		return err;
 	}
 
 	*ppos += cnt;
@@ -569,6 +571,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 	if (copy_from_user(&buf, ubuf, cnt))
 		return -EFAULT;
+	buf[cnt] = '\0';
 
 	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
 	if (!pred)
@@ -586,10 +589,11 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		return cnt;
 	}
 
-	if (filter_add_subsystem_pred(system, pred)) {
+	err = filter_add_subsystem_pred(system, pred);
+	if (err < 0) {
 		filter_free_subsystem_preds(system);
 		filter_free_pred(pred);
-		return -EINVAL;
+		return err;
 	}
 
 	*ppos += cnt;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 026be412f356..e03cbf1e38f3 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -215,7 +215,7 @@ static int __filter_add_pred(struct ftrace_event_call *call,
 		}
 	}
 
-	return -ENOMEM;
+	return -ENOSPC;
 }
 
 static int is_string_field(const char *type)
@@ -319,7 +319,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 	}
 
 	if (i == MAX_FILTER_PRED)
-		return -EINVAL;
+		return -ENOSPC;
 
 	events_for_each(call) {
 		int err;
@@ -410,16 +410,22 @@ int filter_parse(char **pbuf, struct filter_pred *pred)
 		}
 	}
 
+	if (!val_str) {
+		pred->field_name = NULL;
+		return -EINVAL;
+	}
+
 	pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
 	if (!pred->field_name)
 		return -ENOMEM;
 
-	pred->val = simple_strtoull(val_str, &tmp, 10);
+	pred->val = simple_strtoull(val_str, &tmp, 0);
 	if (tmp == val_str) {
 		pred->str_val = kstrdup(val_str, GFP_KERNEL);
 		if (!pred->str_val)
 			return -ENOMEM;
-	}
+	} else if (*tmp != '\0')
+		return -EINVAL;
 
 	return 0;
 }
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 30743f7d4110..d363c6672c6c 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -105,10 +105,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
 	return 0;
 
 #undef __entry
-#define __entry "REC"
+#define __entry REC
 
 #undef TP_printk
-#define TP_printk(fmt, args...) "%s, %s\n", #fmt, #args
+#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
 
 #undef TP_fast_assign
 #define TP_fast_assign(args...) args
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index a2a3af29c943..5e579645ac86 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,5 @@
+#include <trace/syscall.h>
 #include <linux/kernel.h>
-#include <linux/ftrace.h>
 #include <asm/syscall.h>
 
 #include "trace_output.h"
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b6b966ce1451..f71fb2a08950 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -966,20 +966,20 @@ undo:
 }
 
 #ifdef CONFIG_SMP
-static struct workqueue_struct *work_on_cpu_wq __read_mostly;
 
 struct work_for_cpu {
-	struct work_struct work;
+	struct completion completion;
 	long (*fn)(void *);
 	void *arg;
 	long ret;
 };
 
-static void do_work_for_cpu(struct work_struct *w)
+static int do_work_for_cpu(void *_wfc)
 {
-	struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work);
-
+	struct work_for_cpu *wfc = _wfc;
 	wfc->ret = wfc->fn(wfc->arg);
+	complete(&wfc->completion);
+	return 0;
 }
 
 /**
@@ -990,17 +990,23 @@ static void do_work_for_cpu(struct work_struct *w)
  *
  * This will return the value @fn returns.
  * It is up to the caller to ensure that the cpu doesn't go offline.
+ * The caller must not hold any locks which would prevent @fn from completing.
  */
 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 {
-	struct work_for_cpu wfc;
-
-	INIT_WORK(&wfc.work, do_work_for_cpu);
-	wfc.fn = fn;
-	wfc.arg = arg;
-	queue_work_on(cpu, work_on_cpu_wq, &wfc.work);
-	flush_work(&wfc.work);
-
+	struct task_struct *sub_thread;
+	struct work_for_cpu wfc = {
+		.completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
+		.fn = fn,
+		.arg = arg,
+	};
+
+	sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
+	if (IS_ERR(sub_thread))
+		return PTR_ERR(sub_thread);
+	kthread_bind(sub_thread, cpu);
+	wake_up_process(sub_thread);
+	wait_for_completion(&wfc.completion);
 	return wfc.ret;
 }
 EXPORT_SYMBOL_GPL(work_on_cpu);
@@ -1016,8 +1022,4 @@ void __init init_workqueues(void)
 	hotcpu_notifier(workqueue_cpu_callback, 0);
 	keventd_wq = create_workqueue("events");
 	BUG_ON(!keventd_wq);
-#ifdef CONFIG_SMP
-	work_on_cpu_wq = create_workqueue("work_on_cpu");
-	BUG_ON(!work_on_cpu_wq);
-#endif
 }