author		Ingo Molnar <mingo@elte.hu>	2009-05-08 04:50:00 -0400
committer	Ingo Molnar <mingo@elte.hu>	2009-05-08 04:50:00 -0400
commit		f066a155334642b8a206eec625b1925d88c48aeb (patch)
tree		cb12975e60b70d1dae3b7397bab955de78a4d01e /kernel
parent		e7c064889606aab3569669078c69b87b2c527e72 (diff)
parent		33df4db04a79660150e1948e3296eeb451ac121b (diff)
Merge branch 'x86/urgent' into x86/xen
Conflicts:
	arch/frv/include/asm/pgtable.h
	arch/x86/include/asm/required-features.h
	arch/x86/xen/mmu.c

Merge reason: x86/xen was still on a .29 base; move it to a fresher
branch and pick up the Xen fixes as well, plus resolve the conflicts.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                      |    1
-rw-r--r--  kernel/audit_tree.c                  |    3
-rw-r--r--  kernel/exit.c                        |    2
-rw-r--r--  kernel/fork.c                        |   21
-rw-r--r--  kernel/futex.c                       |    7
-rw-r--r--  kernel/hung_task.c                   |  217
-rw-r--r--  kernel/irq/devres.c                  |   16
-rw-r--r--  kernel/irq/handle.c                  |   52
-rw-r--r--  kernel/irq/manage.c                  |  194
-rw-r--r--  kernel/irq/numa_migrate.c            |    1
-rw-r--r--  kernel/kthread.c                     |   26
-rw-r--r--  kernel/lockdep.c                     |   22
-rw-r--r--  kernel/module.c                      |    3
-rw-r--r--  kernel/mutex.c                       |    3
-rw-r--r--  kernel/panic.c                       |   12
-rw-r--r--  kernel/posix-cpu-timers.c            |   17
-rw-r--r--  kernel/power/disk.c                  |   45
-rw-r--r--  kernel/power/main.c                  |   24
-rw-r--r--  kernel/power/swap.c                  |    2
-rw-r--r--  kernel/power/user.c                  |    9
-rw-r--r--  kernel/ptrace.c                      |   27
-rw-r--r--  kernel/rcupdate.c                    |   18
-rw-r--r--  kernel/rcutree.c                     |   19
-rw-r--r--  kernel/rcutree_trace.c               |   14
-rw-r--r--  kernel/resource.c                    |   46
-rw-r--r--  kernel/sched.c                       |  166
-rw-r--r--  kernel/sched_cpupri.c                |    5
-rw-r--r--  kernel/sched_rt.c                    |   15
-rw-r--r--  kernel/slow-work.c                   |    4
-rw-r--r--  kernel/softirq.c                     |    4
-rw-r--r--  kernel/softlockup.c                  |  100
-rw-r--r--  kernel/sys.c                         |   24
-rw-r--r--  kernel/sysctl.c                      |   40
-rw-r--r--  kernel/time/clocksource.c            |    8
-rw-r--r--  kernel/time/jiffies.c                |    2
-rw-r--r--  kernel/time/tick-common.c            |   12
-rw-r--r--  kernel/time/timekeeping.c            |   12
-rw-r--r--  kernel/timer.c                       |    7
-rw-r--r--  kernel/trace/Kconfig                 |    4
-rw-r--r--  kernel/trace/blktrace.c              |   17
-rw-r--r--  kernel/trace/trace.c                 |   58
-rw-r--r--  kernel/trace/trace.h                 |    2
-rw-r--r--  kernel/trace/trace_branch.c          |    8
-rw-r--r--  kernel/trace/trace_events.c          |   12
-rw-r--r--  kernel/trace/trace_events_filter.c   |   14
-rw-r--r--  kernel/trace/trace_events_stage_2.h  |    4
-rw-r--r--  kernel/trace/trace_export.c          |    2
-rw-r--r--  kernel/trace/trace_output.c          |    2
-rw-r--r--  kernel/trace/trace_power.c           |    7
-rw-r--r--  kernel/trace/trace_sched_switch.c    |    3
-rw-r--r--  kernel/trace/trace_sched_wakeup.c    |    8
-rw-r--r--  kernel/trace/trace_syscalls.c        |    2
-rw-r--r--  kernel/workqueue.c                   |   36
53 files changed, 967 insertions(+), 412 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index bab1dffe37e9..42423665660a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -74,6 +74,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += kgdb.o
 obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
+obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 917ab9525568..6e7351739a82 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -734,9 +734,6 @@ int audit_tag_tree(char *old, char *new)
 	dentry = dget(path.dentry);
 	path_put(&path);
 
-	if (dentry == tagged->mnt_root && dentry == mnt->mnt_root)
-		follow_up(&mnt, &dentry);
-
 	list_add_tail(&list, &tagged->mnt_list);
 
 	mutex_lock(&audit_filter_mutex);
diff --git a/kernel/exit.c b/kernel/exit.c
index 32cbf2607cb0..abf9cf3b95c6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -923,6 +923,8 @@ NORET_TYPE void do_exit(long code)
 		schedule();
 	}
 
+	exit_irq_thread();
+
 	exit_signals(tsk);  /* sets PF_EXITING */
 	/*
 	 * tsk->flags are checked in the futex code to protect against
diff --git a/kernel/fork.c b/kernel/fork.c
index 660c2b8765bc..b9e2edd00726 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -645,6 +645,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 
 	tsk->min_flt = tsk->maj_flt = 0;
 	tsk->nvcsw = tsk->nivcsw = 0;
+#ifdef CONFIG_DETECT_HUNG_TASK
+	tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
+#endif
 
 	tsk->mm = NULL;
 	tsk->active_mm = NULL;
@@ -797,6 +800,12 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
 	sig->cputime_expires.virt_exp = cputime_zero;
 	sig->cputime_expires.sched_exp = 0;
 
+	if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
+		sig->cputime_expires.prof_exp =
+			secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
+		sig->cputimer.running = 1;
+	}
+
 	/* The timer lists. */
 	INIT_LIST_HEAD(&sig->cpu_timers[0]);
 	INIT_LIST_HEAD(&sig->cpu_timers[1]);
@@ -812,11 +821,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 		atomic_inc(&current->signal->live);
 		return 0;
 	}
-	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
-
-	if (sig)
-		posix_cpu_timers_init_group(sig);
 
+	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
 	tsk->signal = sig;
 	if (!sig)
 		return -ENOMEM;
@@ -856,6 +862,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
 	task_unlock(current->group_leader);
 
+	posix_cpu_timers_init_group(sig);
+
 	acct_init_pacct(&sig->pacct);
 
 	tty_audit_fork(sig);
@@ -1032,11 +1040,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	p->default_timer_slack_ns = current->timer_slack_ns;
 
-#ifdef CONFIG_DETECT_SOFTLOCKUP
-	p->last_switch_count = 0;
-	p->last_switch_timestamp = 0;
-#endif
-
 	task_io_accounting_init(&p->ioac);
 	acct_clear_integrals(p);
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 6b50a024bca2..eef8cd26b5e5 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -883,7 +883,12 @@ retry_private:
 out_unlock:
 	double_unlock_hb(hb1, hb2);
 
-	/* drop_futex_key_refs() must be called outside the spinlocks. */
+	/*
+	 * drop_futex_key_refs() must be called outside the spinlocks. During
+	 * the requeue we moved futex_q's from the hash bucket at key1 to the
+	 * one at key2 and updated their key pointer. We no longer need to
+	 * hold the references to key1.
+	 */
 	while (--drop_count >= 0)
 		drop_futex_key_refs(&key1);
 
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
new file mode 100644
index 000000000000..022a4927b785
--- /dev/null
+++ b/kernel/hung_task.c
@@ -0,0 +1,217 @@
+/*
+ * Detect Hung Task
+ *
+ * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/lockdep.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+
+/*
+ * The number of tasks checked:
+ */
+unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+
+/*
+ * Limit the number of tasks checked in one batch.
+ *
+ * This value controls the preemptibility of khungtaskd, since preemption
+ * is disabled during the critical section. It also controls the size of
+ * the RCU grace period, so it needs an upper bound.
+ */
+#define HUNG_TASK_BATCHING 1024
+
+/*
+ * Zero means infinite timeout - no checking done:
+ */
+unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
+
+unsigned long __read_mostly sysctl_hung_task_warnings = 10;
+
+static int __read_mostly did_panic;
+
+static struct task_struct *watchdog_task;
+
+/*
+ * Should we panic (and reboot, if panic_timeout= is set) when a
+ * hung task is detected:
+ */
+unsigned int __read_mostly sysctl_hung_task_panic =
+				CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
+
+static int __init hung_task_panic_setup(char *str)
+{
+	sysctl_hung_task_panic = simple_strtoul(str, NULL, 0);
+
+	return 1;
+}
+__setup("hung_task_panic=", hung_task_panic_setup);
+
+static int
+hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	did_panic = 1;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block panic_block = {
+	.notifier_call = hung_task_panic,
+};
+
+static void check_hung_task(struct task_struct *t, unsigned long timeout)
+{
+	unsigned long switch_count = t->nvcsw + t->nivcsw;
+
+	/*
+	 * Ensure the task is not frozen.
+	 * Also skip a freshly created task that entered
+	 * TASK_UNINTERRUPTIBLE after being scheduled once but has never
+	 * been switched out: it mustn't be checked.
+	 */
+	if (unlikely(t->flags & PF_FROZEN || !switch_count))
+		return;
+
+	if (switch_count != t->last_switch_count) {
+		t->last_switch_count = switch_count;
+		return;
+	}
+	if (!sysctl_hung_task_warnings)
+		return;
+	sysctl_hung_task_warnings--;
+
+	/*
+	 * Ok, the task did not get scheduled for more than 2 minutes,
+	 * complain:
+	 */
+	printk(KERN_ERR "INFO: task %s:%d blocked for more than "
+			"%ld seconds.\n", t->comm, t->pid, timeout);
+	printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+			" disables this message.\n");
+	sched_show_task(t);
+	__debug_show_held_locks(t);
+
+	touch_nmi_watchdog();
+
+	if (sysctl_hung_task_panic)
+		panic("hung_task: blocked tasks");
+}
+
+/*
+ * To avoid extending the RCU grace period for an unbounded amount of time,
+ * periodically exit the critical section and enter a new one.
+ *
+ * For preemptible RCU it is sufficient to call rcu_read_unlock in order
+ * to exit the grace period. For classic RCU, a reschedule is required.
+ */
+static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
+{
+	get_task_struct(g);
+	get_task_struct(t);
+	rcu_read_unlock();
+	cond_resched();
+	rcu_read_lock();
+	put_task_struct(t);
+	put_task_struct(g);
+}
+
+/*
+ * Check whether a TASK_UNINTERRUPTIBLE task does not get woken up for
+ * a really long time (120 seconds by default). If that happens, print
+ * out a warning.
+ */
+static void check_hung_uninterruptible_tasks(unsigned long timeout)
+{
+	int max_count = sysctl_hung_task_check_count;
+	int batch_count = HUNG_TASK_BATCHING;
+	struct task_struct *g, *t;
+
+	/*
+	 * If the system crashed already then all bets are off,
+	 * do not report extra hung tasks:
+	 */
+	if (test_taint(TAINT_DIE) || did_panic)
+		return;
+
+	rcu_read_lock();
+	do_each_thread(g, t) {
+		if (!--max_count)
+			goto unlock;
+		if (!--batch_count) {
+			batch_count = HUNG_TASK_BATCHING;
+			rcu_lock_break(g, t);
+			/* Exit if t or g was unhashed during refresh. */
+			if (t->state == TASK_DEAD || g->state == TASK_DEAD)
+				goto unlock;
+		}
+		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
+		if (t->state == TASK_UNINTERRUPTIBLE)
+			check_hung_task(t, timeout);
+	} while_each_thread(g, t);
+ unlock:
+	rcu_read_unlock();
+}
+
+static unsigned long timeout_jiffies(unsigned long timeout)
+{
+	/* timeout of 0 will disable the watchdog */
+	return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT;
+}
+
+/*
+ * Process updating of timeout sysctl
+ */
+int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+				  struct file *filp, void __user *buffer,
+				  size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+
+	if (ret || !write)
+		goto out;
+
+	wake_up_process(watchdog_task);
+
+ out:
+	return ret;
+}
+
+/*
+ * kthread which checks for tasks stuck in D state
+ */
+static int watchdog(void *dummy)
+{
+	set_user_nice(current, 0);
+
+	for ( ; ; ) {
+		unsigned long timeout = sysctl_hung_task_timeout_secs;
+
+		while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
+			timeout = sysctl_hung_task_timeout_secs;
+
+		check_hung_uninterruptible_tasks(timeout);
+	}
+
+	return 0;
+}
+
+static int __init hung_task_init(void)
+{
+	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+	watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
+
+	return 0;
+}
+
+module_init(hung_task_init);
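As a usage note, the timeout above is runtime-tunable through the proc path
mentioned in the printk. A minimal userspace sketch (the 60-second value is
an arbitrary illustration, not part of this patch); writing the file wakes
khungtaskd via proc_dohung_task_timeout_secs():

#include <stdio.h>

int main(void)
{
	/* Path comes from the printk above; writing wakes khungtaskd. */
	FILE *f = fopen("/proc/sys/kernel/hung_task_timeout_secs", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "60\n");	/* illustrative: warn after 60s in D state */
	fclose(f);
	return 0;
}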
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 38a25b8d8bff..d06df9c41cba 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -26,10 +26,12 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
 }
 
 /**
- *	devm_request_irq - allocate an interrupt line for a managed device
+ *	devm_request_threaded_irq - allocate an interrupt line for a managed device
  *	@dev: device to request interrupt for
  *	@irq: Interrupt line to allocate
  *	@handler: Function to be called when the IRQ occurs
+ *	@thread_fn: function to be called in a threaded interrupt context. NULL
+ *		    for devices which handle everything in @handler
  *	@irqflags: Interrupt type flags
  *	@devname: An ascii name for the claiming device
  *	@dev_id: A cookie passed back to the handler function
@@ -42,9 +44,10 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
 *	If an IRQ allocated with this function needs to be freed
 *	separately, dev_free_irq() must be used.
 */
-int devm_request_irq(struct device *dev, unsigned int irq,
-		     irq_handler_t handler, unsigned long irqflags,
-		     const char *devname, void *dev_id)
+int devm_request_threaded_irq(struct device *dev, unsigned int irq,
+			      irq_handler_t handler, irq_handler_t thread_fn,
+			      unsigned long irqflags, const char *devname,
+			      void *dev_id)
 {
 	struct irq_devres *dr;
 	int rc;
@@ -54,7 +57,8 @@ int devm_request_irq(struct device *dev, unsigned int irq,
 	if (!dr)
 		return -ENOMEM;
 
-	rc = request_irq(irq, handler, irqflags, devname, dev_id);
+	rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname,
+				  dev_id);
 	if (rc) {
 		devres_free(dr);
 		return rc;
@@ -66,7 +70,7 @@ int devm_request_irq(struct device *dev, unsigned int irq,
 
 	return 0;
 }
-EXPORT_SYMBOL(devm_request_irq);
+EXPORT_SYMBOL(devm_request_threaded_irq);
 
 /**
  * devm_free_irq - free an interrupt
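For orientation, a hedged sketch of a driver adopting the managed variant;
the mydev_* names are invented for illustration and not part of this patch:

#include <linux/device.h>
#include <linux/interrupt.h>

/* Sketch only; the mydev_* names are hypothetical. */
static irqreturn_t mydev_quick_check(int irq, void *ctx)
{
	return IRQ_WAKE_THREAD;		/* defer the real work */
}

static irqreturn_t mydev_thread_fn(int irq, void *ctx)
{
	return IRQ_HANDLED;		/* sleepable work would run here */
}

static int mydev_setup(struct device *dev, unsigned int irq, void *ctx)
{
	/* The line is freed automatically via devres on unbind. */
	return devm_request_threaded_irq(dev, irq, mydev_quick_check,
					 mydev_thread_fn, 0, "mydev", ctx);
}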
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 343acecae629..26e08754744f 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -339,6 +339,15 @@ irqreturn_t no_action(int cpl, void *dev_id)
 	return IRQ_NONE;
 }
 
+static void warn_no_thread(unsigned int irq, struct irqaction *action)
+{
+	if (test_and_set_bit(IRQTF_WARNED, &action->thread_flags))
+		return;
+
+	printk(KERN_WARNING "IRQ %d device %s returned IRQ_WAKE_THREAD "
+	       "but no thread function available.", irq, action->name);
+}
+
 DEFINE_TRACE(irq_handler_entry);
 DEFINE_TRACE(irq_handler_exit);
 
@@ -354,8 +363,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
 	irqreturn_t ret, retval = IRQ_NONE;
 	unsigned int status = 0;
 
-	WARN_ONCE(!in_irq(), "BUG: IRQ handler called from non-hardirq context!");
-
 	if (!(action->flags & IRQF_DISABLED))
 		local_irq_enable_in_hardirq();
 
@@ -363,8 +370,47 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
 		trace_irq_handler_entry(irq, action);
 		ret = action->handler(irq, action->dev_id);
 		trace_irq_handler_exit(irq, action, ret);
-		if (ret == IRQ_HANDLED)
+
+		switch (ret) {
+		case IRQ_WAKE_THREAD:
+			/*
+			 * Set result to handled so the spurious check
+			 * does not trigger.
+			 */
+			ret = IRQ_HANDLED;
+
+			/*
+			 * Catch drivers which return WAKE_THREAD but
+			 * did not set up a thread function
+			 */
+			if (unlikely(!action->thread_fn)) {
+				warn_no_thread(irq, action);
+				break;
+			}
+
+			/*
+			 * Wake up the handler thread for this
+			 * action. In case the thread crashed and was
+			 * killed we just pretend that we handled the
+			 * interrupt. The hardirq handler above has
+			 * disabled the device interrupt, so no irq
+			 * storm is lurking.
+			 */
+			if (likely(!test_bit(IRQTF_DIED,
+					     &action->thread_flags))) {
+				set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
+				wake_up_process(action->thread);
+			}
+
+			/* Fall through to add to randomness */
+		case IRQ_HANDLED:
 			status |= action->flags;
+			break;
+
+		default:
+			break;
+		}
+
 		retval |= ret;
 		action = action->next;
 	} while (action);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1516ab77355c..2734eca59243 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -8,16 +8,15 @@
  */
 
 #include <linux/irq.h>
+#include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/interrupt.h>
 #include <linux/slab.h>
+#include <linux/sched.h>
 
 #include "internals.h"
 
-#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
-cpumask_var_t irq_default_affinity;
-
 /**
  *	synchronize_irq - wait for pending IRQ handlers (on other CPUs)
  *	@irq: interrupt number to wait for
@@ -53,9 +52,18 @@ void synchronize_irq(unsigned int irq)
 
 		/* Oops, that failed? */
 	} while (status & IRQ_INPROGRESS);
+
+	/*
+	 * We made sure that no hardirq handler is running. Now verify
+	 * that no threaded handlers are active.
+	 */
+	wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active));
 }
 EXPORT_SYMBOL(synchronize_irq);
 
+#ifdef CONFIG_SMP
+cpumask_var_t irq_default_affinity;
+
 /**
  *	irq_can_set_affinity - Check if the affinity of a given irq can be set
  *	@irq:		Interrupt to check
@@ -72,6 +80,18 @@ int irq_can_set_affinity(unsigned int irq)
 	return 1;
 }
 
+static void
+irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask)
+{
+	struct irqaction *action = desc->action;
+
+	while (action) {
+		if (action->thread)
+			set_cpus_allowed_ptr(action->thread, cpumask);
+		action = action->next;
+	}
+}
+
 /**
  *	irq_set_affinity - Set the irq affinity of a given irq
  *	@irq:		Interrupt to set affinity
@@ -89,10 +109,9 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 	spin_lock_irqsave(&desc->lock, flags);
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-	if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
-		cpumask_copy(desc->affinity, cpumask);
+	if (desc->status & IRQ_MOVE_PCNTXT)
 		desc->chip->set_affinity(irq, cpumask);
-	} else {
+	else {
 		desc->status |= IRQ_MOVE_PENDING;
 		cpumask_copy(desc->pending_mask, cpumask);
 	}
@@ -100,6 +119,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 	cpumask_copy(desc->affinity, cpumask);
 	desc->chip->set_affinity(irq, cpumask);
 #endif
+	irq_set_thread_affinity(desc, cpumask);
 	desc->status |= IRQ_AFFINITY_SET;
 	spin_unlock_irqrestore(&desc->lock, flags);
 	return 0;
@@ -150,6 +170,8 @@ int irq_select_affinity_usr(unsigned int irq)
 
 	spin_lock_irqsave(&desc->lock, flags);
 	ret = setup_affinity(irq, desc);
+	if (!ret)
+		irq_set_thread_affinity(desc, desc->affinity);
 	spin_unlock_irqrestore(&desc->lock, flags);
 
 	return ret;
@@ -401,6 +423,90 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 	return ret;
 }
 
+static int irq_wait_for_interrupt(struct irqaction *action)
+{
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (test_and_clear_bit(IRQTF_RUNTHREAD,
+				       &action->thread_flags)) {
+			__set_current_state(TASK_RUNNING);
+			return 0;
+		}
+		schedule();
+	}
+	return -1;
+}
+
+/*
+ * Interrupt handler thread
+ */
+static int irq_thread(void *data)
+{
+	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
+	struct irqaction *action = data;
+	struct irq_desc *desc = irq_to_desc(action->irq);
+	int wake;
+
+	sched_setscheduler(current, SCHED_FIFO, &param);
+	current->irqaction = action;
+
+	while (!irq_wait_for_interrupt(action)) {
+
+		atomic_inc(&desc->threads_active);
+
+		spin_lock_irq(&desc->lock);
+		if (unlikely(desc->status & IRQ_DISABLED)) {
+			/*
+			 * CHECKME: We might need a dedicated
+			 * IRQ_THREAD_PENDING flag here, which
+			 * retriggers the thread in check_irq_resend()
+			 * but AFAICT IRQ_PENDING should be fine as it
+			 * retriggers the interrupt itself --- tglx
+			 */
+			desc->status |= IRQ_PENDING;
+			spin_unlock_irq(&desc->lock);
+		} else {
+			spin_unlock_irq(&desc->lock);
+
+			action->thread_fn(action->irq, action->dev_id);
+		}
+
+		wake = atomic_dec_and_test(&desc->threads_active);
+
+		if (wake && waitqueue_active(&desc->wait_for_threads))
+			wake_up(&desc->wait_for_threads);
+	}
+
+	/*
+	 * Clear irqaction. Otherwise exit_irq_thread() would make
+	 * fuzz about an active irq thread going into nirvana.
+	 */
+	current->irqaction = NULL;
+	return 0;
+}
+
+/*
+ * Called from do_exit()
+ */
+void exit_irq_thread(void)
+{
+	struct task_struct *tsk = current;
+
+	if (!tsk->irqaction)
+		return;
+
+	printk(KERN_ERR
+	       "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
+	       tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
+
+	/*
+	 * Set the THREAD DIED flag to prevent further wakeups of the
+	 * soon to be gone threaded handler.
+	 */
+	set_bit(IRQTF_DIED, &tsk->irqaction->thread_flags);
+}
+
 /*
  * Internal function to register an irqaction - typically used to
  * allocate special interrupts that are part of the architecture.
@@ -437,6 +543,26 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	}
 
 	/*
+	 * Threaded handler ?
+	 */
+	if (new->thread_fn) {
+		struct task_struct *t;
+
+		t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
+				   new->name);
+		if (IS_ERR(t))
+			return PTR_ERR(t);
+		/*
+		 * We keep the reference to the task struct even if
+		 * the thread dies to avoid that the interrupt code
+		 * references an already freed task_struct.
+		 */
+		get_task_struct(t);
+		new->thread = t;
+		wake_up_process(t);
+	}
+
+	/*
 	 * The following block of code has to be executed atomically
 	 */
 	spin_lock_irqsave(&desc->lock, flags);
@@ -473,15 +599,15 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	if (!shared) {
 		irq_chip_set_defaults(desc->chip);
 
+		init_waitqueue_head(&desc->wait_for_threads);
+
 		/* Setup the type (level, edge polarity) if configured: */
 		if (new->flags & IRQF_TRIGGER_MASK) {
 			ret = __irq_set_trigger(desc, irq,
 					new->flags & IRQF_TRIGGER_MASK);
 
-			if (ret) {
-				spin_unlock_irqrestore(&desc->lock, flags);
-				return ret;
-			}
+			if (ret)
+				goto out_thread;
 		} else
 			compat_irq_chip_set_default_handler(desc);
 #if defined(CONFIG_IRQ_PER_CPU)
@@ -549,8 +675,19 @@ mismatch:
 		dump_stack();
 	}
 #endif
+	ret = -EBUSY;
+
+out_thread:
 	spin_unlock_irqrestore(&desc->lock, flags);
-	return -EBUSY;
+	if (new->thread) {
+		struct task_struct *t = new->thread;
+
+		new->thread = NULL;
+		if (likely(!test_bit(IRQTF_DIED, &new->thread_flags)))
+			kthread_stop(t);
+		put_task_struct(t);
+	}
+	return ret;
 }
 
 /**
@@ -576,6 +713,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	struct irqaction *action, **action_ptr;
+	struct task_struct *irqthread;
 	unsigned long flags;
 
 	WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -622,6 +760,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 		else
 			desc->chip->disable(irq);
 	}
+
+	irqthread = action->thread;
+	action->thread = NULL;
+
 	spin_unlock_irqrestore(&desc->lock, flags);
 
 	unregister_handler_proc(irq, action);
@@ -629,6 +771,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 	/* Make sure it's not being used on another CPU: */
 	synchronize_irq(irq);
 
+	if (irqthread) {
+		if (!test_bit(IRQTF_DIED, &action->thread_flags))
+			kthread_stop(irqthread);
+		put_task_struct(irqthread);
+	}
+
 #ifdef CONFIG_DEBUG_SHIRQ
 	/*
 	 * It's a shared IRQ -- the driver ought to be prepared for an IRQ
@@ -681,9 +829,12 @@ void free_irq(unsigned int irq, void *dev_id)
 EXPORT_SYMBOL(free_irq);
 
 /**
- *	request_irq - allocate an interrupt line
+ *	request_threaded_irq - allocate an interrupt line
  *	@irq: Interrupt line to allocate
- *	@handler: Function to be called when the IRQ occurs
+ *	@handler: Function to be called when the IRQ occurs.
+ *		  Primary handler for threaded interrupts
+ *	@thread_fn: Function called from the irq handler thread
+ *		    If NULL, no irq thread is created
 *	@irqflags: Interrupt type flags
 *	@devname: An ascii name for the claiming device
 *	@dev_id: A cookie passed back to the handler function
@@ -695,6 +846,15 @@ EXPORT_SYMBOL(free_irq);
 *	raises, you must take care both to initialise your hardware
 *	and to set up the interrupt handler in the right order.
 *
+ *	If you want to set up a threaded irq handler for your device
+ *	then you need to supply @handler and @thread_fn. @handler is
+ *	still called in hard interrupt context and has to check
+ *	whether the interrupt originates from the device. If yes it
+ *	needs to disable the interrupt on the device and return
+ *	IRQ_WAKE_THREAD, which will wake up the handler thread and run
+ *	@thread_fn. This split handler design is necessary to support
+ *	shared interrupts.
+ *
 *	Dev_id must be globally unique. Normally the address of the
 *	device data structure is used as the cookie. Since the handler
 *	receives this value it makes sense to use it.
@@ -710,8 +870,9 @@ EXPORT_SYMBOL(free_irq);
 *	IRQF_TRIGGER_*		Specify active edge(s) or level
 *
 */
-int request_irq(unsigned int irq, irq_handler_t handler,
-		unsigned long irqflags, const char *devname, void *dev_id)
+int request_threaded_irq(unsigned int irq, irq_handler_t handler,
+			 irq_handler_t thread_fn, unsigned long irqflags,
+			 const char *devname, void *dev_id)
 {
 	struct irqaction *action;
 	struct irq_desc *desc;
@@ -759,6 +920,7 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 		return -ENOMEM;
 
 	action->handler = handler;
+	action->thread_fn = thread_fn;
 	action->flags = irqflags;
 	action->name = devname;
 	action->dev_id = dev_id;
@@ -788,4 +950,4 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 #endif
 	return retval;
 }
-EXPORT_SYMBOL(request_irq);
+EXPORT_SYMBOL(request_threaded_irq);
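To make the split-handler design described in the kernel-doc above concrete,
here is a hedged sketch of the intended pattern; struct mydev and its
check/mask/unmask helpers are invented for illustration and are not part of
this patch:

#include <linux/interrupt.h>

/* Sketch of the split-handler pattern; everything mydev_* is hypothetical. */
struct mydev;
extern bool mydev_irq_is_mine(struct mydev *d);	/* hypothetical helper */
extern void mydev_mask_irq(struct mydev *d);	/* hypothetical helper */
extern void mydev_handle_work(struct mydev *d);	/* hypothetical, may sleep */
extern void mydev_unmask_irq(struct mydev *d);	/* hypothetical helper */

static irqreturn_t mydev_hardirq(int irq, void *dev_id)
{
	struct mydev *d = dev_id;

	if (!mydev_irq_is_mine(d))	/* shared line: not our device */
		return IRQ_NONE;

	mydev_mask_irq(d);		/* no irq storm while the thread runs */
	return IRQ_WAKE_THREAD;		/* have handle_IRQ_event() wake thread_fn */
}

static irqreturn_t mydev_thread_fn(int irq, void *dev_id)
{
	struct mydev *d = dev_id;

	mydev_handle_work(d);		/* sleepable work in the irq thread */
	mydev_unmask_irq(d);
	return IRQ_HANDLED;
}

static int mydev_setup_irq(unsigned int irq, struct mydev *d)
{
	return request_threaded_irq(irq, mydev_hardirq, mydev_thread_fn,
				    IRQF_SHARED, "mydev", d);
}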
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 243d6121e50e..44bbdcbaf8d2 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -54,6 +54,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
 static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
 {
 	free_kstat_irqs(old_desc, desc);
+	free_desc_masks(old_desc, desc);
 	arch_free_chip_data(old_desc, desc);
 }
 
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 84bbadd4d021..4ebaf8519abf 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -76,6 +76,7 @@ static int kthread(void *_create)
 
 	/* OK, tell user we're spawned, wait for stop or wakeup */
 	__set_current_state(TASK_UNINTERRUPTIBLE);
+	create->result = current;
 	complete(&create->started);
 	schedule();
 
@@ -96,22 +97,10 @@ static void create_kthread(struct kthread_create_info *create)
 
 	/* We want our own signal handler (we take no signals by default). */
 	pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
-	if (pid < 0) {
+	if (pid < 0)
 		create->result = ERR_PTR(pid);
-	} else {
-		struct sched_param param = { .sched_priority = 0 };
+	else
 		wait_for_completion(&create->started);
-		read_lock(&tasklist_lock);
-		create->result = find_task_by_pid_ns(pid, &init_pid_ns);
-		read_unlock(&tasklist_lock);
-		/*
-		 * root may have changed our (kthreadd's) priority or CPU mask.
-		 * The kernel thread should not inherit these properties.
-		 */
-		sched_setscheduler(create->result, SCHED_NORMAL, &param);
-		set_user_nice(create->result, KTHREAD_NICE_LEVEL);
-		set_cpus_allowed_ptr(create->result, cpu_all_mask);
-	}
 	complete(&create->done);
 }
 
@@ -154,11 +143,20 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 	wait_for_completion(&create.done);
 
 	if (!IS_ERR(create.result)) {
+		struct sched_param param = { .sched_priority = 0 };
 		va_list args;
+
 		va_start(args, namefmt);
 		vsnprintf(create.result->comm, sizeof(create.result->comm),
 			  namefmt, args);
 		va_end(args);
+		/*
+		 * root may have changed our (kthreadd's) priority or CPU mask.
+		 * The kernel thread should not inherit these properties.
+		 */
+		sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
+		set_user_nice(create.result, KTHREAD_NICE_LEVEL);
+		set_cpus_allowed_ptr(create.result, cpu_all_mask);
 	}
 	return create.result;
 }
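For context, a hedged sketch of a typical consumer of the API touched above
(my_worker and its one-second poll interval are illustrative assumptions).
Threads created this way now start with SCHED_NORMAL, the default nice level,
and a full CPU mask regardless of how root tuned kthreadd itself:

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

/* Sketch only; 'my_worker' is a hypothetical example thread. */
static int my_worker(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);	/* poll once a second */
	return 0;
}

static struct task_struct *worker;

static int start_worker(void)
{
	worker = kthread_run(my_worker, NULL, "my_worker");
	return IS_ERR(worker) ? PTR_ERR(worker) : 0;
}

static void stop_worker(void)
{
	kthread_stop(worker);	/* makes kthread_should_stop() return true */
}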
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index b0f011866969..accb40cdb12a 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2490,13 +2490,20 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 void lockdep_init_map(struct lockdep_map *lock, const char *name,
 		      struct lock_class_key *key, int subclass)
 {
-	if (unlikely(!debug_locks))
+	lock->class_cache = NULL;
+#ifdef CONFIG_LOCK_STAT
+	lock->cpu = raw_smp_processor_id();
+#endif
+
+	if (DEBUG_LOCKS_WARN_ON(!name)) {
+		lock->name = "NULL";
 		return;
+	}
+
+	lock->name = name;
 
 	if (DEBUG_LOCKS_WARN_ON(!key))
 		return;
-	if (DEBUG_LOCKS_WARN_ON(!name))
-		return;
 	/*
 	 * Sanity check, the lock-class key must be persistent:
 	 */
@@ -2505,12 +2512,11 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
 		DEBUG_LOCKS_WARN_ON(1);
 		return;
 	}
-	lock->name = name;
 	lock->key = key;
-	lock->class_cache = NULL;
-#ifdef CONFIG_LOCK_STAT
-	lock->cpu = raw_smp_processor_id();
-#endif
+
+	if (unlikely(!debug_locks))
+		return;
+
 	if (subclass)
 		register_lock_class(lock, subclass, 1);
 }
diff --git a/kernel/module.c b/kernel/module.c
index 05f014efa32c..e797812a4d95 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2388,6 +2388,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
 	blocking_notifier_call_chain(&module_notify_list,
 			MODULE_STATE_LIVE, mod);
 
+	/* We need to finish all async code before the module init sequence is done */
+	async_synchronize_full();
+
 	mutex_lock(&module_mutex);
 	/* Drop initial reference. */
 	module_put(mod);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5d79781394a3..507cf2b5e9f1 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -148,7 +148,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 
 	preempt_disable();
 	mutex_acquire(&lock->dep_map, subclass, 0, ip);
-#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES)
+#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \
+    !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES)
 	/*
 	 * Optimistic spinning.
 	 *
diff --git a/kernel/panic.c b/kernel/panic.c
index 3fd8c5bf8b39..3dcaa1661357 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -213,8 +213,16 @@ unsigned long get_taint(void)
 
 void add_taint(unsigned flag)
 {
-	/* can't trust the integrity of the kernel anymore: */
-	debug_locks = 0;
+	/*
+	 * Can't trust the integrity of the kernel anymore.
+	 * We don't call debug_locks_off() directly because the issue
+	 * is not necessarily serious enough to set oops_in_progress to 1.
+	 * Also we want to keep lockdep alive for the staging-development
+	 * and post-warning cases.
+	 */
+	if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off())
+		printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
+
 	set_bit(flag, &tainted_mask);
 }
 EXPORT_SYMBOL(add_taint);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 8e5d9a68b022..bece7c0b67b2 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -18,7 +18,7 @@ void update_rlimit_cpu(unsigned long rlim_new)
 
 	cputime = secs_to_cputime(rlim_new);
 	if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
-	    cputime_lt(current->signal->it_prof_expires, cputime)) {
+	    cputime_gt(current->signal->it_prof_expires, cputime)) {
 		spin_lock_irq(&current->sighand->siglock);
 		set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
 		spin_unlock_irq(&current->sighand->siglock);
@@ -224,7 +224,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 		cpu->cpu = virt_ticks(p);
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
+		cpu->sched = task_sched_runtime(p);
 		break;
 	}
 	return 0;
@@ -305,18 +305,19 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
 {
 	struct task_cputime cputime;
 
-	thread_group_cputime(p, &cputime);
 	switch (CPUCLOCK_WHICH(which_clock)) {
 	default:
 		return -EINVAL;
 	case CPUCLOCK_PROF:
+		thread_group_cputime(p, &cputime);
 		cpu->cpu = cputime_add(cputime.utime, cputime.stime);
 		break;
 	case CPUCLOCK_VIRT:
+		thread_group_cputime(p, &cputime);
 		cpu->cpu = cputime.utime;
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+		cpu->sched = thread_group_sched_runtime(p);
 		break;
 	}
 	return 0;
@@ -1419,19 +1420,19 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 	 * timer call will interfere.
 	 */
 	list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) {
-		int firing;
+		int cpu_firing;
+
 		spin_lock(&timer->it_lock);
 		list_del_init(&timer->it.cpu.entry);
-		firing = timer->it.cpu.firing;
+		cpu_firing = timer->it.cpu.firing;
 		timer->it.cpu.firing = 0;
 		/*
 		 * The firing flag is -1 if we collided with a reset
 		 * of the timer, which already reported this
 		 * almost-firing as an overrun. So don't generate an event.
 		 */
-		if (likely(firing >= 0)) {
+		if (likely(cpu_firing >= 0))
 			cpu_timer_fire(timer);
-		}
 		spin_unlock(&timer->it_lock);
 	}
 }
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 5f21ab2bbcdf..e71ca9cd81b2 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,6 +22,7 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
+#include <scsi/scsi_scan.h>
 #include <asm/suspend.h>
 
 #include "power.h"
@@ -655,32 +656,42 @@ static int software_resume(void)
 	 * here to avoid lockdep complaining.
 	 */
 	mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING);
+
+	if (swsusp_resume_device)
+		goto Check_image;
+
+	if (!strlen(resume_file)) {
+		error = -ENOENT;
+		goto Unlock;
+	}
+
+	pr_debug("PM: Checking image partition %s\n", resume_file);
+
+	/* Check if the device is there */
+	swsusp_resume_device = name_to_dev_t(resume_file);
 	if (!swsusp_resume_device) {
-		if (!strlen(resume_file)) {
-			mutex_unlock(&pm_mutex);
-			return -ENOENT;
-		}
 		/*
 		 * Some device discovery might still be in progress; we need
 		 * to wait for this to finish.
 		 */
 		wait_for_device_probe();
+		/*
+		 * We can't depend on SCSI devices being available after loading
+		 * one of their modules until scsi_complete_async_scans() is
+		 * called and the resume device usually is a SCSI one.
+		 */
+		scsi_complete_async_scans();
+
 		swsusp_resume_device = name_to_dev_t(resume_file);
-		pr_debug("PM: Resume from partition %s\n", resume_file);
-	} else {
-		pr_debug("PM: Resume from partition %d:%d\n",
-				MAJOR(swsusp_resume_device),
-				MINOR(swsusp_resume_device));
+		if (!swsusp_resume_device) {
+			error = -ENODEV;
+			goto Unlock;
+		}
 	}
 
-	if (noresume) {
-		/**
-		 * FIXME: If noresume is specified, we need to find the
-		 * partition and reset it back to normal swap space.
-		 */
-		mutex_unlock(&pm_mutex);
-		return 0;
-	}
+ Check_image:
+	pr_debug("PM: Resume from partition %d:%d\n",
+		 MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
 
 	pr_debug("PM: Checking hibernation image.\n");
 	error = swsusp_check();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f172f41858bb..f99ed6a75eac 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -291,20 +291,26 @@ static int suspend_enter(suspend_state_t state)
 
 	device_pm_lock();
 
+	if (suspend_ops->prepare) {
+		error = suspend_ops->prepare();
+		if (error)
+			goto Done;
+	}
+
 	error = device_power_down(PMSG_SUSPEND);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down\n");
-		goto Done;
+		goto Platform_finish;
 	}
 
-	if (suspend_ops->prepare) {
-		error = suspend_ops->prepare();
+	if (suspend_ops->prepare_late) {
+		error = suspend_ops->prepare_late();
 		if (error)
 			goto Power_up_devices;
 	}
 
 	if (suspend_test(TEST_PLATFORM))
-		goto Platform_finish;
+		goto Platform_wake;
 
 	error = disable_nonboot_cpus();
 	if (error || suspend_test(TEST_CPUS))
@@ -326,13 +332,17 @@ static int suspend_enter(suspend_state_t state)
  Enable_cpus:
 	enable_nonboot_cpus();
 
- Platform_finish:
-	if (suspend_ops->finish)
-		suspend_ops->finish();
+ Platform_wake:
+	if (suspend_ops->wake)
+		suspend_ops->wake();
 
  Power_up_devices:
 	device_power_up(PMSG_RESUME);
 
+ Platform_finish:
+	if (suspend_ops->finish)
+		suspend_ops->finish();
+
  Done:
 	device_pm_unlock();
 
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 505f319e489c..8ba052c86d48 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -64,8 +64,6 @@ static int submit(int rw, pgoff_t page_off, struct page *page,
 	struct bio *bio;
 
 	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
-	if (!bio)
-		return -ENOMEM;
 	bio->bi_sector = page_off * (PAGE_SIZE >> 9);
 	bio->bi_bdev = resume_bdev;
 	bio->bi_end_io = end_swap_bio_read;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 6c85359364f2..ed97375daae9 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -24,6 +24,7 @@
 #include <linux/cpu.h>
 #include <linux/freezer.h>
 #include <linux/smp_lock.h>
+#include <scsi/scsi_scan.h>
 
 #include <asm/uaccess.h>
 
@@ -92,6 +93,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 	filp->private_data = data;
 	memset(&data->handle, 0, sizeof(struct snapshot_handle));
 	if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
+		/* Hibernating. The image device should be accessible. */
 		data->swap = swsusp_resume_device ?
 			swap_type_of(swsusp_resume_device, 0, NULL) : -1;
 		data->mode = O_RDONLY;
@@ -99,6 +101,13 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 		if (error)
 			pm_notifier_call_chain(PM_POST_HIBERNATION);
 	} else {
+		/*
+		 * Resuming. We may need to wait for the image device to
+		 * appear.
+		 */
+		wait_for_device_probe();
+		scsi_complete_async_scans();
+
 		data->swap = -1;
 		data->mode = O_WRONLY;
 		error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index aaad0ec34194..0692ab5a0d67 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -21,9 +21,7 @@
 #include <linux/audit.h>
 #include <linux/pid_namespace.h>
 #include <linux/syscalls.h>
-
-#include <asm/pgtable.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 
 /*
@@ -48,7 +46,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
 	list_add(&child->ptrace_entry, &new_parent->ptraced);
 	child->parent = new_parent;
 }
- 
+
 /*
  * Turn a tracing stop into a normal stop now, since with no tracer there
  * would be no way to wake it up with SIGCONT or SIGKILL. If there was a
@@ -173,7 +171,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
 	task_lock(task);
 	err = __ptrace_may_access(task, mode);
 	task_unlock(task);
-	return (!err ? true : false);
+	return !err;
 }
 
 int ptrace_attach(struct task_struct *task)
@@ -190,7 +188,7 @@ int ptrace_attach(struct task_struct *task)
 	/* Protect exec's credential calculations against our interference;
 	 * SUID, SGID and LSM creds get determined differently under ptrace.
 	 */
-	retval = mutex_lock_interruptible(&current->cred_exec_mutex);
+	retval = mutex_lock_interruptible(&task->cred_exec_mutex);
 	if (retval < 0)
 		goto out;
 
@@ -234,7 +232,7 @@ repeat:
 bad:
 	write_unlock_irqrestore(&tasklist_lock, flags);
 	task_unlock(task);
-	mutex_unlock(&current->cred_exec_mutex);
+	mutex_unlock(&task->cred_exec_mutex);
 out:
 	return retval;
 }
@@ -358,7 +356,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst
 		copied += retval;
 		src += retval;
 		dst += retval;
-		len -= retval;			
+		len -= retval;
 	}
 	return copied;
 }
@@ -383,7 +381,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
 		copied += retval;
 		src += retval;
 		dst += retval;
-		len -= retval;			
+		len -= retval;
 	}
 	return copied;
 }
@@ -496,9 +494,9 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
 		if (unlikely(!arch_has_single_step()))
 			return -EIO;
 		user_enable_single_step(child);
-	}
-	else
+	} else {
 		user_disable_single_step(child);
+	}
 
 	child->exit_code = data;
 	wake_up_process(child);
@@ -606,10 +604,11 @@ repeat:
 	ret = security_ptrace_traceme(current->parent);
 
 	/*
-	 * Set the ptrace bit in the process ptrace flags.
-	 * Then link us on our parent's ptraced list.
+	 * Check PF_EXITING to ensure ->real_parent has not passed
+	 * exit_ptrace(). Otherwise we don't report the error but
+	 * pretend ->real_parent untraces us right after return.
 	 */
-	if (!ret) {
+	if (!ret && !(current->real_parent->flags & PF_EXITING)) {
 		current->ptrace |= PT_PTRACED;
 		__ptrace_link(current, current->real_parent);
 	}
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 2c7b8457d0d2..a967c9feb90a 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -58,6 +58,10 @@ static DEFINE_MUTEX(rcu_barrier_mutex);
 static struct completion rcu_barrier_completion;
 int rcu_scheduler_active __read_mostly;
 
+static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
+static struct rcu_head rcu_migrate_head[3];
+static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
+
 /*
  * Awaken the corresponding synchronize_rcu() instance now that a
  * grace period has elapsed.
@@ -122,7 +126,10 @@ static void rcu_barrier_func(void *type)
 	}
 }
 
-static inline void wait_migrated_callbacks(void);
+static inline void wait_migrated_callbacks(void)
+{
+	wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
+}
 
 /*
  * Orchestrate the specified type of RCU barrier, waiting for all
@@ -179,21 +186,12 @@ void rcu_barrier_sched(void)
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
 
-static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
-static struct rcu_head rcu_migrate_head[3];
-static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
-
 static void rcu_migrate_callback(struct rcu_head *notused)
 {
 	if (atomic_dec_and_test(&rcu_migrate_type_count))
 		wake_up(&rcu_migrate_wq);
 }
 
-static inline void wait_migrated_callbacks(void)
-{
-	wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
-}
-
 static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
 			unsigned long action, void *hcpu)
 {
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 7f3266922572..d2a372fb0b9b 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -530,8 +530,6 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
 	rdp->qs_pending = 1;
 	rdp->passed_quiesc = 0;
 	rdp->gpnum = rsp->gpnum;
-	rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
-				      RCU_JIFFIES_TILL_FORCE_QS;
 }
 
 /*
@@ -578,8 +576,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
578 rsp->gpnum++; 576 rsp->gpnum++;
579 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 577 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
580 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 578 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
581 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
582 RCU_JIFFIES_TILL_FORCE_QS;
583 record_gp_stall_check_time(rsp); 579 record_gp_stall_check_time(rsp);
584 dyntick_record_completed(rsp, rsp->completed - 1); 580 dyntick_record_completed(rsp, rsp->completed - 1);
585 note_new_gpnum(rsp, rdp); 581 note_new_gpnum(rsp, rdp);
@@ -1055,7 +1051,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1055{ 1051{
1056 unsigned long flags; 1052 unsigned long flags;
1057 long lastcomp; 1053 long lastcomp;
1058 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
1059 struct rcu_node *rnp = rcu_get_root(rsp); 1054 struct rcu_node *rnp = rcu_get_root(rsp);
1060 u8 signaled; 1055 u8 signaled;
1061 1056
@@ -1066,16 +1061,13 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1066 return; /* Someone else is already on the job. */ 1061 return; /* Someone else is already on the job. */
1067 } 1062 }
1068 if (relaxed && 1063 if (relaxed &&
1069 (long)(rsp->jiffies_force_qs - jiffies) >= 0 && 1064 (long)(rsp->jiffies_force_qs - jiffies) >= 0)
1070 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) >= 0)
1071 goto unlock_ret; /* no emergency and done recently. */ 1065 goto unlock_ret; /* no emergency and done recently. */
1072 rsp->n_force_qs++; 1066 rsp->n_force_qs++;
1073 spin_lock(&rnp->lock); 1067 spin_lock(&rnp->lock);
1074 lastcomp = rsp->completed; 1068 lastcomp = rsp->completed;
1075 signaled = rsp->signaled; 1069 signaled = rsp->signaled;
1076 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1070 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1077 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
1078 RCU_JIFFIES_TILL_FORCE_QS;
1079 if (lastcomp == rsp->gpnum) { 1071 if (lastcomp == rsp->gpnum) {
1080 rsp->n_force_qs_ngp++; 1072 rsp->n_force_qs_ngp++;
1081 spin_unlock(&rnp->lock); 1073 spin_unlock(&rnp->lock);
@@ -1144,8 +1136,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1144 * If an RCU GP has gone long enough, go check for dyntick 1136 * If an RCU GP has gone long enough, go check for dyntick
1145 * idle CPUs and, if needed, send resched IPIs. 1137 * idle CPUs and, if needed, send resched IPIs.
1146 */ 1138 */
1147 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 || 1139 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
1148 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
1149 force_quiescent_state(rsp, 1); 1140 force_quiescent_state(rsp, 1);
1150 1141
1151 /* 1142 /*
@@ -1230,8 +1221,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1230 if (unlikely(++rdp->qlen > qhimark)) { 1221 if (unlikely(++rdp->qlen > qhimark)) {
1231 rdp->blimit = LONG_MAX; 1222 rdp->blimit = LONG_MAX;
1232 force_quiescent_state(rsp, 0); 1223 force_quiescent_state(rsp, 0);
1233 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 || 1224 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
1234 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
1235 force_quiescent_state(rsp, 1); 1225 force_quiescent_state(rsp, 1);
1236 local_irq_restore(flags); 1226 local_irq_restore(flags);
1237} 1227}
@@ -1290,8 +1280,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1290 1280
1291 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1281 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1292 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && 1282 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
1293 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 || 1283 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0))
1294 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0))
1295 return 1; 1284 return 1;
1296 1285
1297 /* nothing to do */ 1286 /* nothing to do */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4ee954f6a8d5..4b1875ba9404 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -49,14 +49,12 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
49{ 49{
50 if (!rdp->beenonline) 50 if (!rdp->beenonline)
51 return; 51 return;
52 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d rpfq=%ld rp=%x", 52 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d",
53 rdp->cpu, 53 rdp->cpu,
54 cpu_is_offline(rdp->cpu) ? '!' : ' ', 54 cpu_is_offline(rdp->cpu) ? '!' : ' ',
55 rdp->completed, rdp->gpnum, 55 rdp->completed, rdp->gpnum,
56 rdp->passed_quiesc, rdp->passed_quiesc_completed, 56 rdp->passed_quiesc, rdp->passed_quiesc_completed,
57 rdp->qs_pending, 57 rdp->qs_pending);
58 rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
59 (int)(rdp->n_rcu_pending & 0xffff));
60#ifdef CONFIG_NO_HZ 58#ifdef CONFIG_NO_HZ
61 seq_printf(m, " dt=%d/%d dn=%d df=%lu", 59 seq_printf(m, " dt=%d/%d dn=%d df=%lu",
62 rdp->dynticks->dynticks, 60 rdp->dynticks->dynticks,
@@ -102,14 +100,12 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
102{ 100{
103 if (!rdp->beenonline) 101 if (!rdp->beenonline)
104 return; 102 return;
105 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d,%ld,%ld", 103 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d",
106 rdp->cpu, 104 rdp->cpu,
107 cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"", 105 cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"",
108 rdp->completed, rdp->gpnum, 106 rdp->completed, rdp->gpnum,
109 rdp->passed_quiesc, rdp->passed_quiesc_completed, 107 rdp->passed_quiesc, rdp->passed_quiesc_completed,
110 rdp->qs_pending, 108 rdp->qs_pending);
111 rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
112 rdp->n_rcu_pending);
113#ifdef CONFIG_NO_HZ 109#ifdef CONFIG_NO_HZ
114 seq_printf(m, ",%d,%d,%d,%lu", 110 seq_printf(m, ",%d,%d,%d,%lu",
115 rdp->dynticks->dynticks, 111 rdp->dynticks->dynticks,
@@ -123,7 +119,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
123 119
124static int show_rcudata_csv(struct seq_file *m, void *unused) 120static int show_rcudata_csv(struct seq_file *m, void *unused)
125{ 121{
126 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",\"rpfq\",\"rp\","); 122 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
127#ifdef CONFIG_NO_HZ 123#ifdef CONFIG_NO_HZ
128 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 124 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
129#endif /* #ifdef CONFIG_NO_HZ */ 125#endif /* #ifdef CONFIG_NO_HZ */
diff --git a/kernel/resource.c b/kernel/resource.c
index fd5d7d574bb9..ac5f3a36923f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -533,43 +533,21 @@ static void __init __reserve_region_with_split(struct resource *root,
533 res->end = end; 533 res->end = end;
534 res->flags = IORESOURCE_BUSY; 534 res->flags = IORESOURCE_BUSY;
535 535
536 for (;;) { 536 conflict = __request_resource(parent, res);
537 conflict = __request_resource(parent, res); 537 if (!conflict)
538 if (!conflict) 538 return;
539 break;
540 if (conflict != parent) {
541 parent = conflict;
542 if (!(conflict->flags & IORESOURCE_BUSY))
543 continue;
544 }
545
546 /* Uhhuh, that didn't work out.. */
547 kfree(res);
548 res = NULL;
549 break;
550 }
551
552 if (!res) {
553 /* failed, split and try again */
554
555 /* conflict covered whole area */
556 if (conflict->start <= start && conflict->end >= end)
557 return;
558 539
559 if (conflict->start > start) 540 /* failed, split and try again */
560 __reserve_region_with_split(root, start, conflict->start-1, name); 541 kfree(res);
561 if (!(conflict->flags & IORESOURCE_BUSY)) {
562 resource_size_t common_start, common_end;
563 542
564 common_start = max(conflict->start, start); 543 /* conflict covered whole area */
565 common_end = min(conflict->end, end); 544 if (conflict->start <= start && conflict->end >= end)
566 if (common_start < common_end) 545 return;
567 __reserve_region_with_split(root, common_start, common_end, name);
568 }
569 if (conflict->end < end)
570 __reserve_region_with_split(root, conflict->end+1, end, name);
571 }
572 546
547 if (conflict->start > start)
548 __reserve_region_with_split(root, start, conflict->start-1, name);
549 if (conflict->end < end)
550 __reserve_region_with_split(root, conflict->end+1, end, name);
573} 551}
574 552
575void __init reserve_region_with_split(struct resource *root, 553void __init reserve_region_with_split(struct resource *root,
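
The rewrite of __reserve_region_with_split() above drops the retry loop in favor of plain recursion: claim the range, and on a conflict recurse into whatever sticks out on either side of the conflicting region. A toy, compilable model of that control flow — try_claim() stands in for __request_resource() and handles a single hard-coded busy region, purely for illustration:

#include <stdio.h>

struct region { long start, end; };

static const struct region busy = { 40, 60 };   /* pre-existing reservation */

static const struct region *try_claim(long start, long end)
{
        if (end < busy.start || start > busy.end)
                return NULL;            /* no conflict: claim succeeds */
        return &busy;                   /* conflict: report the blocker */
}

static void reserve_with_split(long start, long end)
{
        const struct region *conflict = try_claim(start, end);

        if (!conflict) {
                printf("reserved [%ld, %ld]\n", start, end);
                return;
        }
        /* conflict covered whole area */
        if (conflict->start <= start && conflict->end >= end)
                return;
        if (conflict->start > start)
                reserve_with_split(start, conflict->start - 1);
        if (conflict->end < end)
                reserve_with_split(conflict->end + 1, end);
}

int main(void)
{
        reserve_with_split(0, 100);     /* prints [0, 39] and [61, 100] */
        return 0;
}
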
diff --git a/kernel/sched.c b/kernel/sched.c
index b38bd96098f6..9e0fd1ef1a47 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1418,10 +1418,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1418 struct rq_iterator *iterator); 1418 struct rq_iterator *iterator);
1419#endif 1419#endif
1420 1420
1421/* Time spent by the tasks of the cpu accounting group executing in ... */
1422enum cpuacct_stat_index {
1423 CPUACCT_STAT_USER, /* ... user mode */
1424 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1425
1426 CPUACCT_STAT_NSTATS,
1427};
1428
1421#ifdef CONFIG_CGROUP_CPUACCT 1429#ifdef CONFIG_CGROUP_CPUACCT
1422static void cpuacct_charge(struct task_struct *tsk, u64 cputime); 1430static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1431static void cpuacct_update_stats(struct task_struct *tsk,
1432 enum cpuacct_stat_index idx, cputime_t val);
1423#else 1433#else
1424static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1434static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1435static inline void cpuacct_update_stats(struct task_struct *tsk,
1436 enum cpuacct_stat_index idx, cputime_t val) {}
1425#endif 1437#endif
1426 1438
1427static inline void inc_cpu_load(struct rq *rq, unsigned long load) 1439static inline void inc_cpu_load(struct rq *rq, unsigned long load)
@@ -4511,9 +4523,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
4511EXPORT_PER_CPU_SYMBOL(kstat); 4523EXPORT_PER_CPU_SYMBOL(kstat);
4512 4524
4513/* 4525/*
4514 * Return any ns on the sched_clock that have not yet been banked in 4526 * Return any ns on the sched_clock that have not yet been accounted in
4515 * @p in case that task is currently running. 4527 * @p in case that task is currently running.
4528 *
4529 * Called with task_rq_lock() held on @rq.
4516 */ 4530 */
4531static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
4532{
4533 u64 ns = 0;
4534
4535 if (task_current(rq, p)) {
4536 update_rq_clock(rq);
4537 ns = rq->clock - p->se.exec_start;
4538 if ((s64)ns < 0)
4539 ns = 0;
4540 }
4541
4542 return ns;
4543}
4544
4517unsigned long long task_delta_exec(struct task_struct *p) 4545unsigned long long task_delta_exec(struct task_struct *p)
4518{ 4546{
4519 unsigned long flags; 4547 unsigned long flags;
@@ -4521,16 +4549,49 @@ unsigned long long task_delta_exec(struct task_struct *p)
4521 u64 ns = 0; 4549 u64 ns = 0;
4522 4550
4523 rq = task_rq_lock(p, &flags); 4551 rq = task_rq_lock(p, &flags);
4552 ns = do_task_delta_exec(p, rq);
4553 task_rq_unlock(rq, &flags);
4524 4554
4525 if (task_current(rq, p)) { 4555 return ns;
4526 u64 delta_exec; 4556}
4527 4557
4528 update_rq_clock(rq); 4558/*
4529 delta_exec = rq->clock - p->se.exec_start; 4559 * Return accounted runtime for the task.
4530 if ((s64)delta_exec > 0) 4560 * In case the task is currently running, return the runtime plus current's
4531 ns = delta_exec; 4561 * pending runtime that has not been accounted yet.
4532 } 4562 */
4563unsigned long long task_sched_runtime(struct task_struct *p)
4564{
4565 unsigned long flags;
4566 struct rq *rq;
4567 u64 ns = 0;
4568
4569 rq = task_rq_lock(p, &flags);
4570 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
4571 task_rq_unlock(rq, &flags);
4572
4573 return ns;
4574}
4575
4576/*
4577 * Return sum_exec_runtime for the thread group.
4578 * In case the task is currently running, return the sum plus current's
4579 * pending runtime that has not been accounted yet.
4580 *
4581 * Note that the thread group might have other running tasks as well,
4582 * so the return value does not include pending runtime that other
4583 * running tasks might have.
4584 */
4585unsigned long long thread_group_sched_runtime(struct task_struct *p)
4586{
4587 struct task_cputime totals;
4588 unsigned long flags;
4589 struct rq *rq;
4590 u64 ns;
4533 4591
4592 rq = task_rq_lock(p, &flags);
4593 thread_group_cputime(p, &totals);
4594 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
4534 task_rq_unlock(rq, &flags); 4595 task_rq_unlock(rq, &flags);
4535 4596
4536 return ns; 4597 return ns;
@@ -4559,6 +4620,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
4559 cpustat->nice = cputime64_add(cpustat->nice, tmp); 4620 cpustat->nice = cputime64_add(cpustat->nice, tmp);
4560 else 4621 else
4561 cpustat->user = cputime64_add(cpustat->user, tmp); 4622 cpustat->user = cputime64_add(cpustat->user, tmp);
4623
4624 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
4562 /* Account for user time used */ 4625 /* Account for user time used */
4563 acct_update_integrals(p); 4626 acct_update_integrals(p);
4564} 4627}
@@ -4620,6 +4683,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4620 else 4683 else
4621 cpustat->system = cputime64_add(cpustat->system, tmp); 4684 cpustat->system = cputime64_add(cpustat->system, tmp);
4622 4685
4686 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
4687
4623 /* Account for system time used */ 4688 /* Account for system time used */
4624 acct_update_integrals(p); 4689 acct_update_integrals(p);
4625} 4690}
@@ -4667,7 +4732,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
4667 4732
4668 if (user_tick) 4733 if (user_tick)
4669 account_user_time(p, one_jiffy, one_jiffy_scaled); 4734 account_user_time(p, one_jiffy, one_jiffy_scaled);
4670 else if (p != rq->idle) 4735 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
4671 account_system_time(p, HARDIRQ_OFFSET, one_jiffy, 4736 account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
4672 one_jiffy_scaled); 4737 one_jiffy_scaled);
4673 else 4738 else
@@ -4781,7 +4846,7 @@ void scheduler_tick(void)
4781#endif 4846#endif
4782} 4847}
4783 4848
4784unsigned long get_parent_ip(unsigned long addr) 4849notrace unsigned long get_parent_ip(unsigned long addr)
4785{ 4850{
4786 if (in_lock_functions(addr)) { 4851 if (in_lock_functions(addr)) {
4787 addr = CALLER_ADDR2; 4852 addr = CALLER_ADDR2;
@@ -7302,7 +7367,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7302 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 7367 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
7303 7368
7304 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 7369 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
7370
7305 printk(KERN_CONT " %s", str); 7371 printk(KERN_CONT " %s", str);
7372 if (group->__cpu_power != SCHED_LOAD_SCALE) {
7373 printk(KERN_CONT " (__cpu_power = %d)",
7374 group->__cpu_power);
7375 }
7306 7376
7307 group = group->next; 7377 group = group->next;
7308 } while (group != sd->groups); 7378 } while (group != sd->groups);
@@ -9925,6 +9995,7 @@ struct cpuacct {
9925 struct cgroup_subsys_state css; 9995 struct cgroup_subsys_state css;
9926 /* cpuusage holds pointer to a u64-type object on every cpu */ 9996 /* cpuusage holds pointer to a u64-type object on every cpu */
9927 u64 *cpuusage; 9997 u64 *cpuusage;
9998 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
9928 struct cpuacct *parent; 9999 struct cpuacct *parent;
9929}; 10000};
9930 10001
@@ -9949,20 +10020,32 @@ static struct cgroup_subsys_state *cpuacct_create(
9949 struct cgroup_subsys *ss, struct cgroup *cgrp) 10020 struct cgroup_subsys *ss, struct cgroup *cgrp)
9950{ 10021{
9951 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 10022 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
10023 int i;
9952 10024
9953 if (!ca) 10025 if (!ca)
9954 return ERR_PTR(-ENOMEM); 10026 goto out;
9955 10027
9956 ca->cpuusage = alloc_percpu(u64); 10028 ca->cpuusage = alloc_percpu(u64);
9957 if (!ca->cpuusage) { 10029 if (!ca->cpuusage)
9958 kfree(ca); 10030 goto out_free_ca;
9959 return ERR_PTR(-ENOMEM); 10031
9960 } 10032 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
10033 if (percpu_counter_init(&ca->cpustat[i], 0))
10034 goto out_free_counters;
9961 10035
9962 if (cgrp->parent) 10036 if (cgrp->parent)
9963 ca->parent = cgroup_ca(cgrp->parent); 10037 ca->parent = cgroup_ca(cgrp->parent);
9964 10038
9965 return &ca->css; 10039 return &ca->css;
10040
10041out_free_counters:
10042 while (--i >= 0)
10043 percpu_counter_destroy(&ca->cpustat[i]);
10044 free_percpu(ca->cpuusage);
10045out_free_ca:
10046 kfree(ca);
10047out:
10048 return ERR_PTR(-ENOMEM);
9966} 10049}
9967 10050
9968/* destroy an existing cpu accounting group */ 10051/* destroy an existing cpu accounting group */
@@ -9970,7 +10053,10 @@ static void
9970cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 10053cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9971{ 10054{
9972 struct cpuacct *ca = cgroup_ca(cgrp); 10055 struct cpuacct *ca = cgroup_ca(cgrp);
10056 int i;
9973 10057
10058 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
10059 percpu_counter_destroy(&ca->cpustat[i]);
9974 free_percpu(ca->cpuusage); 10060 free_percpu(ca->cpuusage);
9975 kfree(ca); 10061 kfree(ca);
9976} 10062}
@@ -10057,6 +10143,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
10057 return 0; 10143 return 0;
10058} 10144}
10059 10145
10146static const char *cpuacct_stat_desc[] = {
10147 [CPUACCT_STAT_USER] = "user",
10148 [CPUACCT_STAT_SYSTEM] = "system",
10149};
10150
10151static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
10152 struct cgroup_map_cb *cb)
10153{
10154 struct cpuacct *ca = cgroup_ca(cgrp);
10155 int i;
10156
10157 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
10158 s64 val = percpu_counter_read(&ca->cpustat[i]);
10159 val = cputime64_to_clock_t(val);
10160 cb->fill(cb, cpuacct_stat_desc[i], val);
10161 }
10162 return 0;
10163}
10164
10060static struct cftype files[] = { 10165static struct cftype files[] = {
10061 { 10166 {
10062 .name = "usage", 10167 .name = "usage",
@@ -10067,7 +10172,10 @@ static struct cftype files[] = {
10067 .name = "usage_percpu", 10172 .name = "usage_percpu",
10068 .read_seq_string = cpuacct_percpu_seq_read, 10173 .read_seq_string = cpuacct_percpu_seq_read,
10069 }, 10174 },
10070 10175 {
10176 .name = "stat",
10177 .read_map = cpuacct_stats_show,
10178 },
10071}; 10179};
10072 10180
10073static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 10181static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -10089,12 +10197,38 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10089 return; 10197 return;
10090 10198
10091 cpu = task_cpu(tsk); 10199 cpu = task_cpu(tsk);
10200
10201 rcu_read_lock();
10202
10092 ca = task_ca(tsk); 10203 ca = task_ca(tsk);
10093 10204
10094 for (; ca; ca = ca->parent) { 10205 for (; ca; ca = ca->parent) {
10095 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 10206 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
10096 *cpuusage += cputime; 10207 *cpuusage += cputime;
10097 } 10208 }
10209
10210 rcu_read_unlock();
10211}
10212
10213/*
10214 * Charge the system/user time to the task's accounting group.
10215 */
10216static void cpuacct_update_stats(struct task_struct *tsk,
10217 enum cpuacct_stat_index idx, cputime_t val)
10218{
10219 struct cpuacct *ca;
10220
10221 if (unlikely(!cpuacct_subsys.active))
10222 return;
10223
10224 rcu_read_lock();
10225 ca = task_ca(tsk);
10226
10227 do {
10228 percpu_counter_add(&ca->cpustat[idx], val);
10229 ca = ca->parent;
10230 } while (ca);
10231 rcu_read_unlock();
10098} 10232}
10099 10233
10100struct cgroup_subsys cpuacct_subsys = { 10234struct cgroup_subsys cpuacct_subsys = {
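
cpuacct_create() above grows the classic reverse-unwind error path: the percpu counters are initialized in a loop, and on the first failure only the ones already set up are destroyed, in reverse order, via while (--i >= 0). A minimal sketch of the idiom with ordinary heap allocations in place of percpu counters (counter_init() and counter_destroy() are made-up stand-ins):

#include <stdlib.h>

#define NSTATS 2

struct counter { long *buf; };

static int counter_init(struct counter *c)
{
        c->buf = calloc(16, sizeof(long));
        return c->buf ? 0 : -1;
}

static void counter_destroy(struct counter *c)
{
        free(c->buf);
}

static struct counter *create_stats(void)
{
        struct counter *stats = calloc(NSTATS, sizeof(*stats));
        int i;

        if (!stats)
                goto out;
        for (i = 0; i < NSTATS; i++)
                if (counter_init(&stats[i]))
                        goto out_free_counters;
        return stats;

out_free_counters:
        while (--i >= 0)                /* unwind only what succeeded */
                counter_destroy(&stats[i]);
        free(stats);
out:
        return NULL;
}
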
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 1e00bfacf9b8..cdd3c89574cd 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -55,7 +55,7 @@ static int convert_prio(int prio)
55 * cpupri_find - find the best (lowest-pri) CPU in the system 55 * cpupri_find - find the best (lowest-pri) CPU in the system
56 * @cp: The cpupri context 56 * @cp: The cpupri context
57 * @p: The task 57 * @p: The task
58 * @lowest_mask: A mask to fill in with selected CPUs 58 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
59 * 59 *
60 * Note: This function returns the recommended CPUs as calculated during the 60 * Note: This function returns the recommended CPUs as calculated during the
61 * current invocation. By the time the call returns, the CPUs may have in 61 * current invocation. By the time the call returns, the CPUs may have in
@@ -81,7 +81,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
82 continue; 82 continue;
83 83
84 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 84 if (lowest_mask)
85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
85 return 1; 86 return 1;
86 } 87 }
87 88
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 299d012b4394..f2c66f8f9712 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -948,20 +948,15 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
948 948
949static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 949static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
950{ 950{
951 cpumask_var_t mask;
952
953 if (rq->curr->rt.nr_cpus_allowed == 1) 951 if (rq->curr->rt.nr_cpus_allowed == 1)
954 return; 952 return;
955 953
956 if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
957 return;
958
959 if (p->rt.nr_cpus_allowed != 1 954 if (p->rt.nr_cpus_allowed != 1
960 && cpupri_find(&rq->rd->cpupri, p, mask)) 955 && cpupri_find(&rq->rd->cpupri, p, NULL))
961 goto free; 956 return;
962 957
963 if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask)) 958 if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
964 goto free; 959 return;
965 960
966 /* 961 /*
967 * There appear to be other cpus that can accept 962 * There appear to be other cpus that can accept
@@ -970,8 +965,6 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
970 */ 965 */
971 requeue_task_rt(rq, p, 1); 966 requeue_task_rt(rq, p, 1);
972 resched_task(rq->curr); 967 resched_task(rq->curr);
973free:
974 free_cpumask_var(mask);
975} 968}
976 969
977#endif /* CONFIG_SMP */ 970#endif /* CONFIG_SMP */
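
Taken together, the sched_cpupri.c and sched_rt.c hunks let check_preempt_equal_prio() ask "does a suitable CPU exist?" without allocating a cpumask at all: cpupri_find() now treats the output mask as optional. A small sketch of the optional-output-parameter idiom (find_candidates() is a made-up stand-in, not a kernel function):

#include <stdbool.h>
#include <stdio.h>

static bool find_candidates(unsigned long *mask_out)
{
        unsigned long candidates = 0xf0;        /* stand-in computation */

        if (!candidates)
                return false;
        if (mask_out)                           /* fill only on request */
                *mask_out = candidates;
        return true;
}

int main(void)
{
        unsigned long mask;

        if (find_candidates(NULL))              /* cheap existence check */
                puts("candidates exist");
        if (find_candidates(&mask))             /* full answer when needed */
                printf("mask = %#lx\n", mask);
        return 0;
}
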
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index cf2bc01186ef..b28d19135f43 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -609,14 +609,14 @@ void slow_work_unregister_user(void)
609 if (slow_work_user_count == 0) { 609 if (slow_work_user_count == 0) {
610 printk(KERN_NOTICE "Slow work thread pool: Shutting down\n"); 610 printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
611 slow_work_threads_should_exit = true; 611 slow_work_threads_should_exit = true;
612 del_timer_sync(&slow_work_cull_timer);
613 del_timer_sync(&slow_work_oom_timer);
612 wake_up_all(&slow_work_thread_wq); 614 wake_up_all(&slow_work_thread_wq);
613 wait_for_completion(&slow_work_last_thread_exited); 615 wait_for_completion(&slow_work_last_thread_exited);
614 printk(KERN_NOTICE "Slow work thread pool:" 616 printk(KERN_NOTICE "Slow work thread pool:"
615 " Shut down complete\n"); 617 " Shut down complete\n");
616 } 618 }
617 619
618 del_timer_sync(&slow_work_cull_timer);
619
620 mutex_unlock(&slow_work_user_lock); 620 mutex_unlock(&slow_work_user_lock);
621} 621}
622EXPORT_SYMBOL(slow_work_unregister_user); 622EXPORT_SYMBOL(slow_work_unregister_user);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 2fecefacdc5b..b525dd348511 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -472,9 +472,9 @@ void tasklet_kill(struct tasklet_struct *t)
472 printk("Attempt to kill tasklet from interrupt\n"); 472 printk("Attempt to kill tasklet from interrupt\n");
473 473
474 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { 474 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
475 do 475 do {
476 yield(); 476 yield();
477 while (test_bit(TASKLET_STATE_SCHED, &t->state)); 477 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
478 } 478 }
479 tasklet_unlock_wait(t); 479 tasklet_unlock_wait(t);
480 clear_bit(TASKLET_STATE_SCHED, &t->state); 480 clear_bit(TASKLET_STATE_SCHED, &t->state);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 85d5a2455103..88796c330838 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -166,97 +166,11 @@ void softlockup_tick(void)
166} 166}
167 167
168/* 168/*
169 * Have a reasonable limit on the number of tasks checked:
170 */
171unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
172
173/*
174 * Zero means infinite timeout - no checking done:
175 */
176unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
177
178unsigned long __read_mostly sysctl_hung_task_warnings = 10;
179
180/*
181 * Only do the hung-tasks check on one CPU:
182 */
183static int check_cpu __read_mostly = -1;
184
185static void check_hung_task(struct task_struct *t, unsigned long now)
186{
187 unsigned long switch_count = t->nvcsw + t->nivcsw;
188
189 if (t->flags & PF_FROZEN)
190 return;
191
192 if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
193 t->last_switch_count = switch_count;
194 t->last_switch_timestamp = now;
195 return;
196 }
197 if ((long)(now - t->last_switch_timestamp) <
198 sysctl_hung_task_timeout_secs)
199 return;
200 if (!sysctl_hung_task_warnings)
201 return;
202 sysctl_hung_task_warnings--;
203
204 /*
205 * Ok, the task did not get scheduled for more than 2 minutes,
206 * complain:
207 */
208 printk(KERN_ERR "INFO: task %s:%d blocked for more than "
209 "%ld seconds.\n", t->comm, t->pid,
210 sysctl_hung_task_timeout_secs);
211 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
212 " disables this message.\n");
213 sched_show_task(t);
214 __debug_show_held_locks(t);
215
216 t->last_switch_timestamp = now;
217 touch_nmi_watchdog();
218
219 if (softlockup_panic)
220 panic("softlockup: blocked tasks");
221}
222
223/*
224 * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
225 * a really long time (120 seconds). If that happens, print out
226 * a warning.
227 */
228static void check_hung_uninterruptible_tasks(int this_cpu)
229{
230 int max_count = sysctl_hung_task_check_count;
231 unsigned long now = get_timestamp(this_cpu);
232 struct task_struct *g, *t;
233
234 /*
235 * If the system crashed already then all bets are off,
236 * do not report extra hung tasks:
237 */
238 if (test_taint(TAINT_DIE) || did_panic)
239 return;
240
241 read_lock(&tasklist_lock);
242 do_each_thread(g, t) {
243 if (!--max_count)
244 goto unlock;
245 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
246 if (t->state == TASK_UNINTERRUPTIBLE)
247 check_hung_task(t, now);
248 } while_each_thread(g, t);
249 unlock:
250 read_unlock(&tasklist_lock);
251}
252
253/*
254 * The watchdog thread - runs every second and touches the timestamp. 169 * The watchdog thread - runs every second and touches the timestamp.
255 */ 170 */
256static int watchdog(void *__bind_cpu) 171static int watchdog(void *__bind_cpu)
257{ 172{
258 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 173 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
259 int this_cpu = (long)__bind_cpu;
260 174
261 sched_setscheduler(current, SCHED_FIFO, &param); 175 sched_setscheduler(current, SCHED_FIFO, &param);
262 176
@@ -276,11 +190,6 @@ static int watchdog(void *__bind_cpu)
276 if (kthread_should_stop()) 190 if (kthread_should_stop())
277 break; 191 break;
278 192
279 if (this_cpu == check_cpu) {
280 if (sysctl_hung_task_timeout_secs)
281 check_hung_uninterruptible_tasks(this_cpu);
282 }
283
284 set_current_state(TASK_INTERRUPTIBLE); 193 set_current_state(TASK_INTERRUPTIBLE);
285 } 194 }
286 __set_current_state(TASK_RUNNING); 195 __set_current_state(TASK_RUNNING);
@@ -312,18 +221,9 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
312 break; 221 break;
313 case CPU_ONLINE: 222 case CPU_ONLINE:
314 case CPU_ONLINE_FROZEN: 223 case CPU_ONLINE_FROZEN:
315 check_cpu = cpumask_any(cpu_online_mask);
316 wake_up_process(per_cpu(watchdog_task, hotcpu)); 224 wake_up_process(per_cpu(watchdog_task, hotcpu));
317 break; 225 break;
318#ifdef CONFIG_HOTPLUG_CPU 226#ifdef CONFIG_HOTPLUG_CPU
319 case CPU_DOWN_PREPARE:
320 case CPU_DOWN_PREPARE_FROZEN:
321 if (hotcpu == check_cpu) {
322 /* Pick any other online cpu. */
323 check_cpu = cpumask_any_but(cpu_online_mask, hotcpu);
324 }
325 break;
326
327 case CPU_UP_CANCELED: 227 case CPU_UP_CANCELED:
328 case CPU_UP_CANCELED_FROZEN: 228 case CPU_UP_CANCELED_FROZEN:
329 if (!per_cpu(watchdog_task, hotcpu)) 229 if (!per_cpu(watchdog_task, hotcpu))
diff --git a/kernel/sys.c b/kernel/sys.c
index 51dbb55604e8..e7998cf31498 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -360,6 +360,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
360 void __user *, arg) 360 void __user *, arg)
361{ 361{
362 char buffer[256]; 362 char buffer[256];
363 int ret = 0;
363 364
364 /* We only trust the superuser with rebooting the system. */ 365 /* We only trust the superuser with rebooting the system. */
365 if (!capable(CAP_SYS_BOOT)) 366 if (!capable(CAP_SYS_BOOT))
@@ -397,7 +398,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
397 kernel_halt(); 398 kernel_halt();
398 unlock_kernel(); 399 unlock_kernel();
399 do_exit(0); 400 do_exit(0);
400 break; 401 panic("cannot halt");
401 402
402 case LINUX_REBOOT_CMD_POWER_OFF: 403 case LINUX_REBOOT_CMD_POWER_OFF:
403 kernel_power_off(); 404 kernel_power_off();
@@ -417,29 +418,22 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
417 418
418#ifdef CONFIG_KEXEC 419#ifdef CONFIG_KEXEC
419 case LINUX_REBOOT_CMD_KEXEC: 420 case LINUX_REBOOT_CMD_KEXEC:
420 { 421 ret = kernel_kexec();
421 int ret; 422 break;
422 ret = kernel_kexec();
423 unlock_kernel();
424 return ret;
425 }
426#endif 423#endif
427 424
428#ifdef CONFIG_HIBERNATION 425#ifdef CONFIG_HIBERNATION
429 case LINUX_REBOOT_CMD_SW_SUSPEND: 426 case LINUX_REBOOT_CMD_SW_SUSPEND:
430 { 427 ret = hibernate();
431 int ret = hibernate(); 428 break;
432 unlock_kernel();
433 return ret;
434 }
435#endif 429#endif
436 430
437 default: 431 default:
438 unlock_kernel(); 432 ret = -EINVAL;
439 return -EINVAL; 433 break;
440 } 434 }
441 unlock_kernel(); 435 unlock_kernel();
442 return 0; 436 return ret;
443} 437}
444 438
445static void deferred_cad(struct work_struct *dummy) 439static void deferred_cad(struct work_struct *dummy)
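
The sys_reboot() rework above is a single-exit refactor: rather than pairing every early return with its own unlock_kernel(), each branch sets ret and falls through to one shared unlock. A sketch of the shape, with a pthread mutex standing in for the big kernel lock (do_command() and the command numbers are illustrative only):

#include <pthread.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

static int do_work_that_can_fail(void)
{
        return 0;                               /* stand-in for kernel_kexec() etc. */
}

static int do_command(int cmd)
{
        int ret = 0;

        pthread_mutex_lock(&big_lock);
        switch (cmd) {
        case 0:
                ret = do_work_that_can_fail();
                break;
        default:
                ret = -22;                      /* like -EINVAL */
                break;
        }
        pthread_mutex_unlock(&big_lock);        /* exactly one unlock path */
        return ret;
}
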
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 72eb1a41dcab..ea78fa101ad6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -103,6 +103,9 @@ static unsigned long one_ul = 1;
103static int one_hundred = 100; 103static int one_hundred = 100;
104static int one_thousand = 1000; 104static int one_thousand = 1000;
105 105
106/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
107static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
108
106/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 109/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
107static int maxolduid = 65535; 110static int maxolduid = 65535;
108static int minolduid; 111static int minolduid;
@@ -814,6 +817,19 @@ static struct ctl_table kern_table[] = {
814 .extra1 = &neg_one, 817 .extra1 = &neg_one,
815 .extra2 = &sixty, 818 .extra2 = &sixty,
816 }, 819 },
820#endif
821#ifdef CONFIG_DETECT_HUNG_TASK
822 {
823 .ctl_name = CTL_UNNUMBERED,
824 .procname = "hung_task_panic",
825 .data = &sysctl_hung_task_panic,
826 .maxlen = sizeof(int),
827 .mode = 0644,
828 .proc_handler = &proc_dointvec_minmax,
829 .strategy = &sysctl_intvec,
830 .extra1 = &zero,
831 .extra2 = &one,
832 },
817 { 833 {
818 .ctl_name = CTL_UNNUMBERED, 834 .ctl_name = CTL_UNNUMBERED,
819 .procname = "hung_task_check_count", 835 .procname = "hung_task_check_count",
@@ -829,7 +845,7 @@ static struct ctl_table kern_table[] = {
829 .data = &sysctl_hung_task_timeout_secs, 845 .data = &sysctl_hung_task_timeout_secs,
830 .maxlen = sizeof(unsigned long), 846 .maxlen = sizeof(unsigned long),
831 .mode = 0644, 847 .mode = 0644,
832 .proc_handler = &proc_doulongvec_minmax, 848 .proc_handler = &proc_dohung_task_timeout_secs,
833 .strategy = &sysctl_intvec, 849 .strategy = &sysctl_intvec,
834 }, 850 },
835 { 851 {
@@ -889,16 +905,6 @@ static struct ctl_table kern_table[] = {
889 .proc_handler = &proc_dointvec, 905 .proc_handler = &proc_dointvec,
890 }, 906 },
891#endif 907#endif
892#ifdef CONFIG_UNEVICTABLE_LRU
893 {
894 .ctl_name = CTL_UNNUMBERED,
895 .procname = "scan_unevictable_pages",
896 .data = &scan_unevictable_pages,
897 .maxlen = sizeof(scan_unevictable_pages),
898 .mode = 0644,
899 .proc_handler = &scan_unevictable_handler,
900 },
901#endif
902#ifdef CONFIG_SLOW_WORK 908#ifdef CONFIG_SLOW_WORK
903 { 909 {
904 .ctl_name = CTL_UNNUMBERED, 910 .ctl_name = CTL_UNNUMBERED,
@@ -1003,7 +1009,7 @@ static struct ctl_table vm_table[] = {
1003 .mode = 0644, 1009 .mode = 0644,
1004 .proc_handler = &dirty_bytes_handler, 1010 .proc_handler = &dirty_bytes_handler,
1005 .strategy = &sysctl_intvec, 1011 .strategy = &sysctl_intvec,
1006 .extra1 = &one_ul, 1012 .extra1 = &dirty_bytes_min,
1007 }, 1013 },
1008 { 1014 {
1009 .procname = "dirty_writeback_centisecs", 1015 .procname = "dirty_writeback_centisecs",
@@ -1289,6 +1295,16 @@ static struct ctl_table vm_table[] = {
1289 .extra2 = &one, 1295 .extra2 = &one,
1290 }, 1296 },
1291#endif 1297#endif
1298#ifdef CONFIG_UNEVICTABLE_LRU
1299 {
1300 .ctl_name = CTL_UNNUMBERED,
1301 .procname = "scan_unevictable_pages",
1302 .data = &scan_unevictable_pages,
1303 .maxlen = sizeof(scan_unevictable_pages),
1304 .mode = 0644,
1305 .proc_handler = &scan_unevictable_handler,
1306 },
1307#endif
1292/* 1308/*
1293 * NOTE: do not add new entries to this table unless you have read 1309 * NOTE: do not add new entries to this table unless you have read
1294 * Documentation/sysctl/ctl_unnumbered.txt 1310 * Documentation/sysctl/ctl_unnumbered.txt
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c46c931a7fe7..ecfd7b5187e0 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -181,12 +181,12 @@ static void clocksource_watchdog(unsigned long data)
181 181
182 resumed = test_and_clear_bit(0, &watchdog_resumed); 182 resumed = test_and_clear_bit(0, &watchdog_resumed);
183 183
184 wdnow = watchdog->read(); 184 wdnow = watchdog->read(watchdog);
185 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); 185 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
186 watchdog_last = wdnow; 186 watchdog_last = wdnow;
187 187
188 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { 188 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
189 csnow = cs->read(); 189 csnow = cs->read(cs);
190 190
191 if (unlikely(resumed)) { 191 if (unlikely(resumed)) {
192 cs->wd_last = csnow; 192 cs->wd_last = csnow;
@@ -247,7 +247,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
247 247
248 list_add(&cs->wd_list, &watchdog_list); 248 list_add(&cs->wd_list, &watchdog_list);
249 if (!started && watchdog) { 249 if (!started && watchdog) {
250 watchdog_last = watchdog->read(); 250 watchdog_last = watchdog->read(watchdog);
251 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; 251 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
252 add_timer_on(&watchdog_timer, 252 add_timer_on(&watchdog_timer,
253 cpumask_first(cpu_online_mask)); 253 cpumask_first(cpu_online_mask));
@@ -268,7 +268,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
268 cse->flags &= ~CLOCK_SOURCE_WATCHDOG; 268 cse->flags &= ~CLOCK_SOURCE_WATCHDOG;
269 /* Start if list is not empty */ 269 /* Start if list is not empty */
270 if (!list_empty(&watchdog_list)) { 270 if (!list_empty(&watchdog_list)) {
271 watchdog_last = watchdog->read(); 271 watchdog_last = watchdog->read(watchdog);
272 watchdog_timer.expires = 272 watchdog_timer.expires =
273 jiffies + WATCHDOG_INTERVAL; 273 jiffies + WATCHDOG_INTERVAL;
274 add_timer_on(&watchdog_timer, 274 add_timer_on(&watchdog_timer,
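
The clocksource hunks above follow an interface change: the read() hook now receives the clocksource itself, so one callback can serve several instances through per-instance state instead of globals. A self-contained sketch of the self-parameter callback style (struct clocksource_like and read_counter() are invented for the example):

#include <stdio.h>

struct clocksource_like {
        const char *name;
        unsigned long long counter;
        unsigned long long (*read)(struct clocksource_like *cs);
};

static unsigned long long read_counter(struct clocksource_like *cs)
{
        return cs->counter++;   /* per-instance state, no globals needed */
}

int main(void)
{
        struct clocksource_like a = { "a", 100, read_counter };
        struct clocksource_like b = { "b", 999, read_counter };

        printf("%s=%llu %s=%llu\n", a.name, a.read(&a), b.name, b.read(&b));
        return 0;
}
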
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 06f197560f3b..c3f6c30816e3 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -50,7 +50,7 @@
50 */ 50 */
51#define JIFFIES_SHIFT 8 51#define JIFFIES_SHIFT 8
52 52
53static cycle_t jiffies_read(void) 53static cycle_t jiffies_read(struct clocksource *cs)
54{ 54{
55 return (cycle_t) jiffies; 55 return (cycle_t) jiffies;
56} 56}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 21a5ca849514..83c4417b6a3c 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -93,7 +93,17 @@ void tick_handle_periodic(struct clock_event_device *dev)
93 for (;;) { 93 for (;;) {
94 if (!clockevents_program_event(dev, next, ktime_get())) 94 if (!clockevents_program_event(dev, next, ktime_get()))
95 return; 95 return;
96 tick_periodic(cpu); 96 /*
97 * Have to be careful here. If we're in oneshot mode,
98 * before we call tick_periodic() in a loop, we need
99 * to be sure we're using a real hardware clocksource.
100 * Otherwise we could get trapped in an infinite
101 * loop, as the tick_periodic() increments jiffies,
102 * when then will increment time, posibly causing
103 * the loop to trigger again and again.
104 */
105 if (timekeeping_valid_for_hres())
106 tick_periodic(cpu);
97 next = ktime_add(next, tick_period); 107 next = ktime_add(next, tick_period);
98 } 108 }
99} 109}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 900f1b6598d1..687dff49f6e7 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -182,7 +182,7 @@ EXPORT_SYMBOL(do_settimeofday);
182 */ 182 */
183static void change_clocksource(void) 183static void change_clocksource(void)
184{ 184{
185 struct clocksource *new; 185 struct clocksource *new, *old;
186 186
187 new = clocksource_get_next(); 187 new = clocksource_get_next();
188 188
@@ -191,11 +191,16 @@ static void change_clocksource(void)
191 191
192 clocksource_forward_now(); 192 clocksource_forward_now();
193 193
194 new->raw_time = clock->raw_time; 194 if (clocksource_enable(new))
195 return;
195 196
197 new->raw_time = clock->raw_time;
198 old = clock;
196 clock = new; 199 clock = new;
200 clocksource_disable(old);
201
197 clock->cycle_last = 0; 202 clock->cycle_last = 0;
198 clock->cycle_last = clocksource_read(new); 203 clock->cycle_last = clocksource_read(clock);
199 clock->error = 0; 204 clock->error = 0;
200 clock->xtime_nsec = 0; 205 clock->xtime_nsec = 0;
201 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 206 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
@@ -292,6 +297,7 @@ void __init timekeeping_init(void)
292 ntp_init(); 297 ntp_init();
293 298
294 clock = clocksource_get_next(); 299 clock = clocksource_get_next();
300 clocksource_enable(clock);
295 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 301 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
296 clock->cycle_last = clocksource_read(clock); 302 clock->cycle_last = clocksource_read(clock);
297 303
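
change_clocksource() above also adopts a safer swap order: enable the new source first, commit the switch only if that succeeds, and disable the old source last, so a failed enable leaves timekeeping on the current clock. Schematically (all names below are illustrative, not kernel API):

struct source {
        int enabled;
};

static int source_enable(struct source *s)
{
        s->enabled = 1;
        return 0;                       /* a real driver could fail here */
}

static void source_disable(struct source *s)
{
        s->enabled = 0;
}

static struct source *cur;

static void switch_source(struct source *next)
{
        struct source *old;

        if (source_enable(next))        /* on failure, keep the old source */
                return;
        old = cur;                      /* mirrors: old = clock; clock = new; */
        cur = next;
        if (old)
                source_disable(old);    /* disable only after the switch */
}
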
diff --git a/kernel/timer.c b/kernel/timer.c
index b4555568b4e4..cffffad01c31 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -531,10 +531,13 @@ static void __init_timer(struct timer_list *timer,
531} 531}
532 532
533/** 533/**
534 * init_timer - initialize a timer. 534 * init_timer_key - initialize a timer
535 * @timer: the timer to be initialized 535 * @timer: the timer to be initialized
536 * @name: name of the timer
537 * @key: lockdep class key of the fake lock used for tracking timer
538 * sync lock dependencies
536 * 539 *
537 * init_timer() must be done to a timer prior to calling *any* of the 540 * init_timer_key() must be done to a timer prior to calling *any* of the
538 * other timer functions. 541 * other timer functions.
539 */ 542 */
540void init_timer_key(struct timer_list *timer, 543void init_timer_key(struct timer_list *timer,
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 2246141bda4d..417d1985e299 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -312,7 +312,7 @@ config KMEMTRACE
312 and profile kernel code. 312 and profile kernel code.
313 313
314 This requires a userspace application to use. See 314 This requires a userspace application to use. See
315 Documentation/vm/kmemtrace.txt for more information. 315 Documentation/trace/kmemtrace.txt for more information.
316 316
317 Saying Y will make the kernel somewhat larger and slower. However, 317 Saying Y will make the kernel somewhat larger and slower. However,
318 if you disable kmemtrace at run-time or boot-time, the performance 318 if you disable kmemtrace at run-time or boot-time, the performance
@@ -403,7 +403,7 @@ config MMIOTRACE
403 implementation and works via page faults. Tracing is disabled by 403 implementation and works via page faults. Tracing is disabled by
404 default and can be enabled at run-time. 404 default and can be enabled at run-time.
405 405
406 See Documentation/tracers/mmiotrace.txt. 406 See Documentation/trace/mmiotrace.txt.
407 If you are not helping to develop drivers, say N. 407 If you are not helping to develop drivers, say N.
408 408
409config MMIOTRACE_TEST 409config MMIOTRACE_TEST
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 947c5b3f90c4..921ef5d1f0ba 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -327,10 +327,10 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
327 char *msg; 327 char *msg;
328 struct blk_trace *bt; 328 struct blk_trace *bt;
329 329
330 if (count > BLK_TN_MAX_MSG) 330 if (count >= BLK_TN_MAX_MSG)
331 return -EINVAL; 331 return -EINVAL;
332 332
333 msg = kmalloc(count, GFP_KERNEL); 333 msg = kmalloc(count + 1, GFP_KERNEL);
334 if (msg == NULL) 334 if (msg == NULL)
335 return -ENOMEM; 335 return -ENOMEM;
336 336
@@ -339,6 +339,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
339 return -EFAULT; 339 return -EFAULT;
340 } 340 }
341 341
342 msg[count] = '\0';
342 bt = filp->private_data; 343 bt = filp->private_data;
343 __trace_note_message(bt, "%s", msg); 344 __trace_note_message(bt, "%s", msg);
344 kfree(msg); 345 kfree(msg);
@@ -642,7 +643,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
642 if (blk_pc_request(rq)) { 643 if (blk_pc_request(rq)) {
643 what |= BLK_TC_ACT(BLK_TC_PC); 644 what |= BLK_TC_ACT(BLK_TC_PC);
644 __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, 645 __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
645 sizeof(rq->cmd), rq->cmd); 646 rq->cmd_len, rq->cmd);
646 } else { 647 } else {
647 what |= BLK_TC_ACT(BLK_TC_FS); 648 what |= BLK_TC_ACT(BLK_TC_FS);
648 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, 649 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
@@ -1376,12 +1377,12 @@ static int blk_trace_str2mask(const char *str)
1376{ 1377{
1377 int i; 1378 int i;
1378 int mask = 0; 1379 int mask = 0;
1379 char *s, *token; 1380 char *buf, *s, *token;
1380 1381
1381 s = kstrdup(str, GFP_KERNEL); 1382 buf = kstrdup(str, GFP_KERNEL);
1382 if (s == NULL) 1383 if (buf == NULL)
1383 return -ENOMEM; 1384 return -ENOMEM;
1384 s = strstrip(s); 1385 s = strstrip(buf);
1385 1386
1386 while (1) { 1387 while (1) {
1387 token = strsep(&s, ","); 1388 token = strsep(&s, ",");
@@ -1402,7 +1403,7 @@ static int blk_trace_str2mask(const char *str)
1402 break; 1403 break;
1403 } 1404 }
1404 } 1405 }
1405 kfree(s); 1406 kfree(buf);
1406 1407
1407 return mask; 1408 return mask;
1408} 1409}
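
Two small but classic fixes in blktrace.c above: blk_msg_write() now reserves room for and writes a terminating NUL, and blk_trace_str2mask() keeps the original kstrdup() pointer for kfree(), because strstrip() may return a pointer into the middle of the allocation. A userspace sketch of the second fix (strstrip_ws() approximates the kernel's strstrip()):

#include <ctype.h>
#include <stdlib.h>
#include <string.h>

static char *strstrip_ws(char *s)
{
        size_t len = strlen(s);

        while (len && isspace((unsigned char)s[len - 1]))
                s[--len] = '\0';
        while (*s && isspace((unsigned char)*s))
                s++;                    /* may advance past the allocation start */
        return s;
}

static int parse(const char *str)
{
        char *buf, *s;
        int ret;

        buf = strdup(str);              /* keep this pointer for free() */
        if (!buf)
                return -1;
        s = strstrip_ws(buf);
        ret = *s ? 0 : -1;              /* ... tokenize s here ... */
        free(buf);                      /* free(s) could be undefined behavior */
        return ret;
}
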
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a0174a40c563..a884c09006c4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -30,6 +30,7 @@
30#include <linux/percpu.h> 30#include <linux/percpu.h>
31#include <linux/splice.h> 31#include <linux/splice.h>
32#include <linux/kdebug.h> 32#include <linux/kdebug.h>
33#include <linux/string.h>
33#include <linux/ctype.h> 34#include <linux/ctype.h>
34#include <linux/init.h> 35#include <linux/init.h>
35#include <linux/poll.h> 36#include <linux/poll.h>
@@ -147,8 +148,7 @@ static int __init set_ftrace_dump_on_oops(char *str)
147} 148}
148__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 149__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
149 150
150long 151unsigned long long ns2usecs(cycle_t nsec)
151ns2usecs(cycle_t nsec)
152{ 152{
153 nsec += 500; 153 nsec += 500;
154 do_div(nsec, 1000); 154 do_div(nsec, 1000);
@@ -1632,7 +1632,11 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
1632 return; 1632 return;
1633 1633
1634 cpumask_set_cpu(iter->cpu, iter->started); 1634 cpumask_set_cpu(iter->cpu, iter->started);
1635 trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); 1635
1636 /* Don't print started cpu buffer for the first entry of the trace */
1637 if (iter->idx > 1)
1638 trace_seq_printf(s, "##### CPU %u buffer started ####\n",
1639 iter->cpu);
1636} 1640}
1637 1641
1638static enum print_line_t print_trace_fmt(struct trace_iterator *iter) 1642static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
@@ -1867,6 +1871,11 @@ __tracing_open(struct inode *inode, struct file *file)
1867 if (current_trace) 1871 if (current_trace)
1868 *iter->trace = *current_trace; 1872 *iter->trace = *current_trace;
1869 1873
1874 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL))
1875 goto fail;
1876
1877 cpumask_clear(iter->started);
1878
1870 if (current_trace && current_trace->print_max) 1879 if (current_trace && current_trace->print_max)
1871 iter->tr = &max_tr; 1880 iter->tr = &max_tr;
1872 else 1881 else
@@ -1917,6 +1926,7 @@ __tracing_open(struct inode *inode, struct file *file)
1917 if (iter->buffer_iter[cpu]) 1926 if (iter->buffer_iter[cpu])
1918 ring_buffer_read_finish(iter->buffer_iter[cpu]); 1927 ring_buffer_read_finish(iter->buffer_iter[cpu]);
1919 } 1928 }
1929 free_cpumask_var(iter->started);
1920 fail: 1930 fail:
1921 mutex_unlock(&trace_types_lock); 1931 mutex_unlock(&trace_types_lock);
1922 kfree(iter->trace); 1932 kfree(iter->trace);
@@ -1960,6 +1970,7 @@ static int tracing_release(struct inode *inode, struct file *file)
1960 1970
1961 seq_release(inode, file); 1971 seq_release(inode, file);
1962 mutex_destroy(&iter->mutex); 1972 mutex_destroy(&iter->mutex);
1973 free_cpumask_var(iter->started);
1963 kfree(iter->trace); 1974 kfree(iter->trace);
1964 kfree(iter); 1975 kfree(iter);
1965 return 0; 1976 return 0;
@@ -2358,9 +2369,9 @@ static const char readme_msg[] =
2358 "# mkdir /debug\n" 2369 "# mkdir /debug\n"
2359 "# mount -t debugfs nodev /debug\n\n" 2370 "# mount -t debugfs nodev /debug\n\n"
2360 "# cat /debug/tracing/available_tracers\n" 2371 "# cat /debug/tracing/available_tracers\n"
2361 "wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n" 2372 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
2362 "# cat /debug/tracing/current_tracer\n" 2373 "# cat /debug/tracing/current_tracer\n"
2363 "none\n" 2374 "nop\n"
2364 "# echo sched_switch > /debug/tracing/current_tracer\n" 2375 "# echo sched_switch > /debug/tracing/current_tracer\n"
2365 "# cat /debug/tracing/current_tracer\n" 2376 "# cat /debug/tracing/current_tracer\n"
2366 "sched_switch\n" 2377 "sched_switch\n"
@@ -3266,19 +3277,13 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
3266 3277
3267 info->tr = &global_trace; 3278 info->tr = &global_trace;
3268 info->cpu = cpu; 3279 info->cpu = cpu;
3269 info->spare = ring_buffer_alloc_read_page(info->tr->buffer); 3280 info->spare = NULL;
3270 /* Force reading ring buffer for first read */ 3281 /* Force reading ring buffer for first read */
3271 info->read = (unsigned int)-1; 3282 info->read = (unsigned int)-1;
3272 if (!info->spare)
3273 goto out;
3274 3283
3275 filp->private_data = info; 3284 filp->private_data = info;
3276 3285
3277 return 0; 3286 return nonseekable_open(inode, filp);
3278
3279 out:
3280 kfree(info);
3281 return -ENOMEM;
3282} 3287}
3283 3288
3284static ssize_t 3289static ssize_t
@@ -3293,6 +3298,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3293 if (!count) 3298 if (!count)
3294 return 0; 3299 return 0;
3295 3300
3301 if (!info->spare)
3302 info->spare = ring_buffer_alloc_read_page(info->tr->buffer);
3303 if (!info->spare)
3304 return -ENOMEM;
3305
3296 /* Do we have previous read data to read? */ 3306 /* Do we have previous read data to read? */
3297 if (info->read < PAGE_SIZE) 3307 if (info->read < PAGE_SIZE)
3298 goto read; 3308 goto read;
@@ -3331,7 +3341,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
3331{ 3341{
3332 struct ftrace_buffer_info *info = file->private_data; 3342 struct ftrace_buffer_info *info = file->private_data;
3333 3343
3334 ring_buffer_free_read_page(info->tr->buffer, info->spare); 3344 if (info->spare)
3345 ring_buffer_free_read_page(info->tr->buffer, info->spare);
3335 kfree(info); 3346 kfree(info);
3336 3347
3337 return 0; 3348 return 0;
@@ -3417,14 +3428,19 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3417 int size, i; 3428 int size, i;
3418 size_t ret; 3429 size_t ret;
3419 3430
3420 /* 3431 if (*ppos & (PAGE_SIZE - 1)) {
3421 * We can't seek on a buffer input 3432 WARN_ONCE(1, "Ftrace: previous read must page-align\n");
3422 */ 3433 return -EINVAL;
3423 if (unlikely(*ppos)) 3434 }
3424 return -ESPIPE;
3425 3435
3436 if (len & (PAGE_SIZE - 1)) {
3437 WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
3438 if (len < PAGE_SIZE)
3439 return -EINVAL;
3440 len &= PAGE_MASK;
3441 }
3426 3442
3427 for (i = 0; i < PIPE_BUFFERS && len; i++, len -= size) { 3443 for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) {
3428 struct page *page; 3444 struct page *page;
3429 int r; 3445 int r;
3430 3446
@@ -3432,6 +3448,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3432 if (!ref) 3448 if (!ref)
3433 break; 3449 break;
3434 3450
3451 ref->ref = 1;
3435 ref->buffer = info->tr->buffer; 3452 ref->buffer = info->tr->buffer;
3436 ref->page = ring_buffer_alloc_read_page(ref->buffer); 3453 ref->page = ring_buffer_alloc_read_page(ref->buffer);
3437 if (!ref->page) { 3454 if (!ref->page) {
@@ -3463,6 +3480,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3463 spd.partial[i].offset = 0; 3480 spd.partial[i].offset = 0;
3464 spd.partial[i].private = (unsigned long)ref; 3481 spd.partial[i].private = (unsigned long)ref;
3465 spd.nr_pages++; 3482 spd.nr_pages++;
3483 *ppos += PAGE_SIZE;
3466 } 3484 }
3467 3485
3468 spd.nr_pages = i; 3486 spd.nr_pages = i;
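
The tracing_buffers_* changes above defer allocating info->spare until the first read: opening the file stays cheap, the allocation failure is reported where it can actually be handled, and release tolerates a buffer that was never allocated. The lazy-allocation shape, reduced to plain C (struct reader and the 4096-byte size are illustrative):

#include <stdlib.h>
#include <string.h>

struct reader {
        char *spare;            /* NULL until the first read */
};

static long reader_read(struct reader *r, char *out, size_t len)
{
        if (!r->spare) {
                r->spare = calloc(1, 4096);
                if (!r->spare)
                        return -12;     /* like -ENOMEM, reported at read time */
        }
        if (len > 4096)
                len = 4096;
        memcpy(out, r->spare, len);
        return (long)len;
}

static void reader_release(struct reader *r)
{
        free(r->spare);         /* free(NULL) is safe if nothing was ever read */
}
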
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cbc168f1e43d..e685ac2b2ba1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -602,7 +602,7 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
602#endif /* CONFIG_FTRACE_STARTUP_TEST */ 602#endif /* CONFIG_FTRACE_STARTUP_TEST */
603 603
604extern void *head_page(struct trace_array_cpu *data); 604extern void *head_page(struct trace_array_cpu *data);
605extern long ns2usecs(cycle_t nsec); 605extern unsigned long long ns2usecs(cycle_t nsec);
606extern int 606extern int
607trace_vbprintk(unsigned long ip, const char *fmt, va_list args); 607trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
608extern int 608extern int
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index ad8c22efff41..8333715e4066 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -155,6 +155,13 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter,
155 return TRACE_TYPE_HANDLED; 155 return TRACE_TYPE_HANDLED;
156} 156}
157 157
158static void branch_print_header(struct seq_file *s)
159{
160 seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT"
161 " FUNC:FILE:LINE\n");
162 seq_puts(s, "# | | | | | "
163 " |\n");
164}
158 165
159static struct trace_event trace_branch_event = { 166static struct trace_event trace_branch_event = {
160 .type = TRACE_BRANCH, 167 .type = TRACE_BRANCH,
@@ -169,6 +176,7 @@ static struct tracer branch_trace __read_mostly =
169#ifdef CONFIG_FTRACE_SELFTEST 176#ifdef CONFIG_FTRACE_SELFTEST
170 .selftest = trace_selftest_startup_branch, 177 .selftest = trace_selftest_startup_branch,
171#endif /* CONFIG_FTRACE_SELFTEST */ 178#endif /* CONFIG_FTRACE_SELFTEST */
179 .print_header = branch_print_header,
172}; 180};
173 181
174__init static int init_branch_tracer(void) 182__init static int init_branch_tracer(void)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 64ec4d278ffb..576f4fa2af0d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -503,6 +503,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
503 503
504 if (copy_from_user(&buf, ubuf, cnt)) 504 if (copy_from_user(&buf, ubuf, cnt))
505 return -EFAULT; 505 return -EFAULT;
506 buf[cnt] = '\0';
506 507
507 pred = kzalloc(sizeof(*pred), GFP_KERNEL); 508 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
508 if (!pred) 509 if (!pred)
@@ -520,9 +521,10 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
520 return cnt; 521 return cnt;
521 } 522 }
522 523
523 if (filter_add_pred(call, pred)) { 524 err = filter_add_pred(call, pred);
525 if (err < 0) {
524 filter_free_pred(pred); 526 filter_free_pred(pred);
525 return -EINVAL; 527 return err;
526 } 528 }
527 529
528 *ppos += cnt; 530 *ppos += cnt;
@@ -569,6 +571,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
569 571
570 if (copy_from_user(&buf, ubuf, cnt)) 572 if (copy_from_user(&buf, ubuf, cnt))
571 return -EFAULT; 573 return -EFAULT;
574 buf[cnt] = '\0';
572 575
573 pred = kzalloc(sizeof(*pred), GFP_KERNEL); 576 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
574 if (!pred) 577 if (!pred)
@@ -586,10 +589,11 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
586 return cnt; 589 return cnt;
587 } 590 }
588 591
589 if (filter_add_subsystem_pred(system, pred)) { 592 err = filter_add_subsystem_pred(system, pred);
593 if (err < 0) {
590 filter_free_subsystem_preds(system); 594 filter_free_subsystem_preds(system);
591 filter_free_pred(pred); 595 filter_free_pred(pred);
592 return -EINVAL; 596 return err;
593 } 597 }
594 598
595 *ppos += cnt; 599 *ppos += cnt;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 026be412f356..e03cbf1e38f3 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -215,7 +215,7 @@ static int __filter_add_pred(struct ftrace_event_call *call,
215 } 215 }
216 } 216 }
217 217
218 return -ENOMEM; 218 return -ENOSPC;
219} 219}
220 220
221static int is_string_field(const char *type) 221static int is_string_field(const char *type)
@@ -319,7 +319,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
319 } 319 }
320 320
321 if (i == MAX_FILTER_PRED) 321 if (i == MAX_FILTER_PRED)
322 return -EINVAL; 322 return -ENOSPC;
323 323
324 events_for_each(call) { 324 events_for_each(call) {
325 int err; 325 int err;
@@ -410,16 +410,22 @@ int filter_parse(char **pbuf, struct filter_pred *pred)
410 } 410 }
411 } 411 }
412 412
413 if (!val_str) {
414 pred->field_name = NULL;
415 return -EINVAL;
416 }
417
413 pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); 418 pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
414 if (!pred->field_name) 419 if (!pred->field_name)
415 return -ENOMEM; 420 return -ENOMEM;
416 421
417 pred->val = simple_strtoull(val_str, &tmp, 10); 422 pred->val = simple_strtoull(val_str, &tmp, 0);
418 if (tmp == val_str) { 423 if (tmp == val_str) {
419 pred->str_val = kstrdup(val_str, GFP_KERNEL); 424 pred->str_val = kstrdup(val_str, GFP_KERNEL);
420 if (!pred->str_val) 425 if (!pred->str_val)
421 return -ENOMEM; 426 return -ENOMEM;
422 } 427 } else if (*tmp != '\0')
428 return -EINVAL;
423 429
424 return 0; 430 return 0;
425} 431}
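
filter_parse() above tightens number parsing: base 0 lets simple_strtoull() accept decimal, hex and octal, and a nonempty tail after the parsed digits now fails with -EINVAL instead of being silently ignored. The same validation with the C library's strtoull(), as a hedged sketch (parse_u64() is invented for the example; the kernel code treats a non-number as a string filter rather than an error):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_u64(const char *tok, unsigned long long *val)
{
        char *end;

        errno = 0;
        *val = strtoull(tok, &end, 0);  /* 0 => "42", "0x2a", "052" all work */
        if (end == tok)
                return -1;              /* no digits at all: not a number */
        if (*end != '\0')
                return -1;              /* trailing junk, e.g. "42abc" */
        if (errno == ERANGE)
                return -1;              /* out of range */
        return 0;
}

int main(void)
{
        unsigned long long v;

        printf("%d\n", parse_u64("0x2a", &v));  /* 0: v == 42 */
        printf("%d\n", parse_u64("42abc", &v)); /* -1: trailing junk */
        return 0;
}
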
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 30743f7d4110..d363c6672c6c 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -105,10 +105,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
 	return 0;
 
 #undef __entry
-#define __entry "REC"
+#define __entry REC
 
 #undef TP_printk
-#define TP_printk(fmt, args...) "%s, %s\n", #fmt, #args
+#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
 
 #undef TP_fast_assign
 #define TP_fast_assign(args...) args
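
Two preprocessor subtleties drive this hunk: dropping the quotes turns __entry into a real REC token that later macro stages can paste into field accesses, and __stringify(args) differs from plain #args in that the argument is macro-expanded before being stringified. A standalone illustration (the kernel's __stringify lives in linux/stringify.h; here it is redefined locally, and DIRECT/REC/rec_value are made-up names):

#include <stdio.h>

/* Same two-level trick as linux/stringify.h: the extra level forces
 * macro expansion of x before # stringifies it. */
#define __stringify_1(x...)	#x
#define __stringify(x...)	__stringify_1(x)

#define DIRECT(x...)		#x

#define REC		rec_value
#define rec_value	42

int main(void)
{
	printf("%s\n", DIRECT(REC));		/* "REC" - no expansion */
	printf("%s\n", __stringify(REC));	/* "42"  - fully expanded */
	return 0;
}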
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4d9952d3df50..07a22c33ebf3 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -40,7 +40,7 @@
 
 #undef TRACE_FIELD_ZERO_CHAR
 #define TRACE_FIELD_ZERO_CHAR(item)					\
-	ret = trace_seq_printf(s, "\tfield: char " #item ";\t"		\
+	ret = trace_seq_printf(s, "\tfield:char " #item ";\t"		\
 			       "offset:%u;\tsize:0;\n",			\
 			       (unsigned int)offsetof(typeof(field), item)); \
 	if (!ret)							\
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d72b9a63b247..64b54a59c55b 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -423,7 +423,7 @@ int trace_print_lat_context(struct trace_iterator *iter)
 
 	trace_find_cmdline(entry->pid, comm);
 
-	ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]"
+	ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]"
 			       " %ld.%03ldms (+%ld.%03ldms): ", comm,
 			       entry->pid, iter->cpu, entry->flags,
 			       entry->preempt_count, iter->idx,
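
The widened conversion matters on 32-bit builds: the bracketed value is a 64-bit quantity (presumably the timestamp; the matching argument sits just past the end of this hunk), and pulling a 64-bit argument through %lx reads only half of it and throws every following vararg out of step. The portable idiom, sketched in userspace:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	uint64_t ts = 0x123456789abcULL;

	/* Wrong on 32-bit targets: %lx consumes only an unsigned long. */
	/* printf("[%08lx]\n", ts); */

	/* Correct: match the 64-bit argument with a 64-bit conversion. */
	printf("[%08llx]\n", (unsigned long long)ts);
	printf("[%08" PRIx64 "]\n", ts);	/* or the inttypes.h macro */
	return 0;
}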
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index bae791ebcc51..118439709fb7 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -186,6 +186,12 @@ static enum print_line_t power_print_line(struct trace_iterator *iter)
 	return TRACE_TYPE_UNHANDLED;
 }
 
+static void power_print_header(struct seq_file *s)
+{
+	seq_puts(s, "#   TIMESTAMP      STATE  EVENT\n");
+	seq_puts(s, "#       |            |      |\n");
+}
+
 static struct tracer power_tracer __read_mostly =
 {
 	.name		= "power",
@@ -194,6 +200,7 @@ static struct tracer power_tracer __read_mostly =
 	.stop		= stop_power_trace,
 	.reset		= power_trace_reset,
 	.print_line	= power_print_line,
+	.print_header	= power_print_header,
 };
 
 static int init_power_trace(void)
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index de35f200abd3..9117cea6f1ae 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -62,6 +62,9 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
 	pc = preempt_count();
 	tracing_record_cmdline(current);
 
+	if (sched_stopped)
+		return;
+
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = ctx_trace->data[cpu];
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 3c5ad6b2ec84..5bc00e8f153e 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -154,7 +154,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 	if (unlikely(!tracer_enabled || next != wakeup_task))
 		goto out_unlock;
 
-	trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+	trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
 	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
 
 	/*
@@ -257,6 +257,12 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
 	data = wakeup_trace->data[wakeup_cpu];
 	data->preempt_timestamp = ftrace_now(cpu);
 	tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
+
+	/*
+	 * We must be careful in using CALLER_ADDR2. But since wake_up
+	 * is not called by an assembly function (where as schedule is)
+	 * it should be safe to use it here.
+	 */
 	trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
 
 out_locked:
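
Both hunks tune which return addresses the wakeup tracer logs. In this era of the kernel, CALLER_ADDRn is (under CONFIG_FRAME_POINTER) a wrapper around __builtin_return_address(n), so ADDR0 names the site that called the current function and each step walks one frame further up the stack; the added comment records why walking two frames out of a C function such as wake_up is safe while doing so from assembly-entered code like schedule() is not. A small demonstration of the builtin (reliable only with frame pointers, e.g. gcc -O0 or -fno-omit-frame-pointer; function names are made up):

#include <stdio.h>

static void __attribute__((noinline)) probe(void)
{
	printf("ADDR0 (caller of probe): %p\n", __builtin_return_address(0));
	printf("ADDR1 (caller's caller): %p\n", __builtin_return_address(1));
}

static void __attribute__((noinline)) wakeup_path(void)
{
	probe();
}

int main(void)
{
	wakeup_path();
	return 0;
}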
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index a2a3af29c943..5e579645ac86 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,5 @@
+#include <trace/syscall.h>
 #include <linux/kernel.h>
-#include <linux/ftrace.h>
 #include <asm/syscall.h>
 
 #include "trace_output.h"
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b6b966ce1451..f71fb2a08950 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -966,20 +966,20 @@ undo:
 }
 
 #ifdef CONFIG_SMP
-static struct workqueue_struct *work_on_cpu_wq __read_mostly;
 
 struct work_for_cpu {
-	struct work_struct work;
+	struct completion completion;
 	long (*fn)(void *);
 	void *arg;
 	long ret;
 };
 
-static void do_work_for_cpu(struct work_struct *w)
+static int do_work_for_cpu(void *_wfc)
 {
-	struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work);
-
+	struct work_for_cpu *wfc = _wfc;
 	wfc->ret = wfc->fn(wfc->arg);
+	complete(&wfc->completion);
+	return 0;
 }
 
 /**
@@ -990,17 +990,23 @@ static void do_work_for_cpu(struct work_struct *w)
  *
  * This will return the value @fn returns.
  * It is up to the caller to ensure that the cpu doesn't go offline.
+ * The caller must not hold any locks which would prevent @fn from completing.
  */
 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 {
-	struct work_for_cpu wfc;
-
-	INIT_WORK(&wfc.work, do_work_for_cpu);
-	wfc.fn = fn;
-	wfc.arg = arg;
-	queue_work_on(cpu, work_on_cpu_wq, &wfc.work);
-	flush_work(&wfc.work);
-
+	struct task_struct *sub_thread;
+	struct work_for_cpu wfc = {
+		.completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
+		.fn = fn,
+		.arg = arg,
+	};
+
+	sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
+	if (IS_ERR(sub_thread))
+		return PTR_ERR(sub_thread);
+	kthread_bind(sub_thread, cpu);
+	wake_up_process(sub_thread);
+	wait_for_completion(&wfc.completion);
 	return wfc.ret;
 }
 EXPORT_SYMBOL_GPL(work_on_cpu);
@@ -1016,8 +1022,4 @@ void __init init_workqueues(void)
 	hotcpu_notifier(workqueue_cpu_callback, 0);
 	keventd_wq = create_workqueue("events");
 	BUG_ON(!keventd_wq);
-#ifdef CONFIG_SMP
-	work_on_cpu_wq = create_workqueue("work_on_cpu");
-	BUG_ON(!work_on_cpu_wq);
-#endif
 }
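
The rewrite drops the dedicated work_on_cpu workqueue in favour of a throwaway kernel thread bound to the target CPU plus an on-stack completion. Because each call now gets its own thread, @fn can no longer deadlock against other work items sharing one queue, though as the new kerneldoc line says, the caller must not hold locks that @fn needs. A userspace analogue of the new shape, with pthread affinity standing in for kthread_create()/kthread_bind() and pthread_join() for the completion (a Linux-specific sketch; compile with -pthread):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

struct work_for_cpu {
	long (*fn)(void *);
	void *arg;
	long ret;
};

static void *do_work_for_cpu(void *_wfc)
{
	struct work_for_cpu *wfc = _wfc;

	wfc->ret = wfc->fn(wfc->arg);
	return NULL;
}

/* Run fn(arg) on the given CPU and wait for the result. */
static long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
{
	struct work_for_cpu wfc = { .fn = fn, .arg = arg };
	pthread_attr_t attr;
	pthread_t thread;
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	pthread_attr_init(&attr);
	/* bind before it runs, like kthread_bind() before wake_up_process() */
	pthread_attr_setaffinity_np(&attr, sizeof(set), &set);
	if (pthread_create(&thread, &attr, do_work_for_cpu, &wfc))
		return -1;
	pthread_join(thread, NULL);	/* plays the role of wait_for_completion() */
	pthread_attr_destroy(&attr);
	return wfc.ret;
}

static long whoami(void *arg)
{
	(void)arg;
	return sched_getcpu();
}

int main(void)
{
	printf("ran on CPU %ld\n", work_on_cpu(0, whoami, NULL));
	return 0;
}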