Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile             |    1
-rw-r--r--  kernel/cgroup.c             |    7
-rw-r--r--  kernel/cpu.c                |   10
-rw-r--r--  kernel/cred.c               |    2
-rw-r--r--  kernel/fork.c               |   15
-rw-r--r--  kernel/futex.c              |   30
-rw-r--r--  kernel/hw_breakpoint.c      |   58
-rw-r--r--  kernel/kexec.c              |    4
-rw-r--r--  kernel/kfifo.c              |    6
-rw-r--r--  kernel/kgdb.c               |    9
-rw-r--r--  kernel/kprobes.c            |    1
-rw-r--r--  kernel/lockdep.c            |    2
-rw-r--r--  kernel/padata.c             |  690
-rw-r--r--  kernel/panic.c              |    3
-rw-r--r--  kernel/perf_event.c         |   16
-rw-r--r--  kernel/power/Kconfig        |   19
-rw-r--r--  kernel/power/main.c         |   31
-rw-r--r--  kernel/power/snapshot.c     |    4
-rw-r--r--  kernel/power/swap.c         |    4
-rw-r--r--  kernel/power/swsusp.c       |   58
-rw-r--r--  kernel/power/user.c         |   23
-rw-r--r--  kernel/printk.c             |    1
-rw-r--r--  kernel/resource.c           |   44
-rw-r--r--  kernel/sched.c              |   44
-rw-r--r--  kernel/sched_fair.c         |    2
-rw-r--r--  kernel/smp.c                |    8
-rw-r--r--  kernel/softirq.c            |   15
-rw-r--r--  kernel/softlockup.c         |   15
-rw-r--r--  kernel/sys.c                |    2
-rw-r--r--  kernel/time/clockevents.c   |    3
-rw-r--r--  kernel/time/clocksource.c   |   18
-rw-r--r--  kernel/time/timekeeping.c   |    2
-rw-r--r--  kernel/timer.c              |    3
-rw-r--r--  kernel/trace/Kconfig        |    4
-rw-r--r--  kernel/trace/ring_buffer.c  |   24
-rw-r--r--  kernel/trace/trace.c        |    5
-rw-r--r--  kernel/trace/trace_kprobe.c |    2
-rw-r--r--  kernel/trace/trace_stack.c  |   24
38 files changed, 1046 insertions(+), 163 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 864ff75d65f2..6aebdeb2aa34 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
100obj-$(CONFIG_PERF_EVENTS) += perf_event.o 100obj-$(CONFIG_PERF_EVENTS) += perf_event.o
101obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 101obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
102obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 102obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
103obj-$(CONFIG_PADATA) += padata.o
103 104
104ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 105ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
105# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 106# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1fbcc748044a..aa3bee566446 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2936,14 +2936,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2936 2936
2937 for_each_subsys(root, ss) { 2937 for_each_subsys(root, ss) {
2938 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 2938 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
2939
2939 if (IS_ERR(css)) { 2940 if (IS_ERR(css)) {
2940 err = PTR_ERR(css); 2941 err = PTR_ERR(css);
2941 goto err_destroy; 2942 goto err_destroy;
2942 } 2943 }
2943 init_cgroup_css(css, ss, cgrp); 2944 init_cgroup_css(css, ss, cgrp);
2944 if (ss->use_id) 2945 if (ss->use_id) {
2945 if (alloc_css_id(ss, parent, cgrp)) 2946 err = alloc_css_id(ss, parent, cgrp);
2947 if (err)
2946 goto err_destroy; 2948 goto err_destroy;
2949 }
2947 /* At error, ->destroy() callback has to free assigned ID. */ 2950 /* At error, ->destroy() callback has to free assigned ID. */
2948 } 2951 }
2949 2952
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1c8ddd6ee940..677f25376a38 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -151,13 +151,13 @@ static inline void check_for_tasks(int cpu)
151 151
152 write_lock_irq(&tasklist_lock); 152 write_lock_irq(&tasklist_lock);
153 for_each_process(p) { 153 for_each_process(p) {
154 if (task_cpu(p) == cpu && 154 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
155 (!cputime_eq(p->utime, cputime_zero) || 155 (!cputime_eq(p->utime, cputime_zero) ||
156 !cputime_eq(p->stime, cputime_zero))) 156 !cputime_eq(p->stime, cputime_zero)))
157 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ 157 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
158 (state = %ld, flags = %x) \n", 158 "(state = %ld, flags = %x)\n",
159 p->comm, task_pid_nr(p), cpu, 159 p->comm, task_pid_nr(p), cpu,
160 p->state, p->flags); 160 p->state, p->flags);
161 } 161 }
162 write_unlock_irq(&tasklist_lock); 162 write_unlock_irq(&tasklist_lock);
163} 163}
diff --git a/kernel/cred.c b/kernel/cred.c
index dd76cfe5f5b0..1ed8ca18790c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -224,7 +224,7 @@ struct cred *cred_alloc_blank(void)
224#ifdef CONFIG_KEYS 224#ifdef CONFIG_KEYS
225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); 225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
226 if (!new->tgcred) { 226 if (!new->tgcred) {
227 kfree(new); 227 kmem_cache_free(cred_jar, new);
228 return NULL; 228 return NULL;
229 } 229 }
230 atomic_set(&new->tgcred->usage, 1); 230 atomic_set(&new->tgcred->usage, 1);
diff --git a/kernel/fork.c b/kernel/fork.c
index 5b2959b3ffc2..f88bd984df35 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1241,21 +1241,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1241 /* Need tasklist lock for parent etc handling! */ 1241 /* Need tasklist lock for parent etc handling! */
1242 write_lock_irq(&tasklist_lock); 1242 write_lock_irq(&tasklist_lock);
1243 1243
1244 /*
1245 * The task hasn't been attached yet, so its cpus_allowed mask will
1246 * not be changed, nor will its assigned CPU.
1247 *
1248 * The cpus_allowed mask of the parent may have changed after it was
1249 * copied first time - so re-copy it here, then check the child's CPU
1250 * to ensure it is on a valid CPU (and if not, just force it back to
1251 * parent's CPU). This avoids alot of nasty races.
1252 */
1253 p->cpus_allowed = current->cpus_allowed;
1254 p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
1255 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1256 !cpu_online(task_cpu(p))))
1257 set_task_cpu(p, smp_processor_id());
1258
1259 /* CLONE_PARENT re-uses the old parent */ 1244 /* CLONE_PARENT re-uses the old parent */
1260 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { 1245 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
1261 p->real_parent = current->real_parent; 1246 p->real_parent = current->real_parent;
diff --git a/kernel/futex.c b/kernel/futex.c
index d9b3a2228f9d..e7a35f1039e7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -530,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
530 return -EINVAL; 530 return -EINVAL;
531 531
532 WARN_ON(!atomic_read(&pi_state->refcount)); 532 WARN_ON(!atomic_read(&pi_state->refcount));
533 WARN_ON(pid && pi_state->owner && 533
534 pi_state->owner->pid != pid); 534 /*
535 * When pi_state->owner is NULL then the owner died
536 * and another waiter is on the fly. pi_state->owner
537 * is fixed up by the task which acquires
538 * pi_state->rt_mutex.
539 *
540 * We do not check for pid == 0 which can happen when
541 * the owner died and robust_list_exit() cleared the
542 * TID.
543 */
544 if (pid && pi_state->owner) {
545 /*
546 * Bail out if user space manipulated the
547 * futex value.
548 */
549 if (pid != task_pid_vnr(pi_state->owner))
550 return -EINVAL;
551 }
535 552
536 atomic_inc(&pi_state->refcount); 553 atomic_inc(&pi_state->refcount);
537 *ps = pi_state; 554 *ps = pi_state;
@@ -758,6 +775,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
758 if (!pi_state) 775 if (!pi_state)
759 return -EINVAL; 776 return -EINVAL;
760 777
778 /*
779 * If current does not own the pi_state then the futex is
780 * inconsistent and user space fiddled with the futex value.
781 */
782 if (pi_state->owner != current)
783 return -EINVAL;
784
761 raw_spin_lock(&pi_state->pi_mutex.wait_lock); 785 raw_spin_lock(&pi_state->pi_mutex.wait_lock);
762 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 786 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
763 787
@@ -1971,7 +1995,7 @@ retry_private:
1971 /* Unqueue and drop the lock */ 1995 /* Unqueue and drop the lock */
1972 unqueue_me_pi(&q); 1996 unqueue_me_pi(&q);
1973 1997
1974 goto out; 1998 goto out_put_key;
1975 1999
1976out_unlock_put_key: 2000out_unlock_put_key:
1977 queue_unlock(&q, hb); 2001 queue_unlock(&q, hb);
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 50dbd5999588..967e66143e11 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -243,38 +243,70 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
243 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) 243 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
244 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM 244 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
245 */ 245 */
246int reserve_bp_slot(struct perf_event *bp) 246static int __reserve_bp_slot(struct perf_event *bp)
247{ 247{
248 struct bp_busy_slots slots = {0}; 248 struct bp_busy_slots slots = {0};
249 int ret = 0;
250
251 mutex_lock(&nr_bp_mutex);
252 249
253 fetch_bp_busy_slots(&slots, bp); 250 fetch_bp_busy_slots(&slots, bp);
254 251
255 /* Flexible counters need to keep at least one slot */ 252 /* Flexible counters need to keep at least one slot */
256 if (slots.pinned + (!!slots.flexible) == HBP_NUM) { 253 if (slots.pinned + (!!slots.flexible) == HBP_NUM)
257 ret = -ENOSPC; 254 return -ENOSPC;
258 goto end;
259 }
260 255
261 toggle_bp_slot(bp, true); 256 toggle_bp_slot(bp, true);
262 257
263end: 258 return 0;
259}
260
261int reserve_bp_slot(struct perf_event *bp)
262{
263 int ret;
264
265 mutex_lock(&nr_bp_mutex);
266
267 ret = __reserve_bp_slot(bp);
268
264 mutex_unlock(&nr_bp_mutex); 269 mutex_unlock(&nr_bp_mutex);
265 270
266 return ret; 271 return ret;
267} 272}
268 273
274static void __release_bp_slot(struct perf_event *bp)
275{
276 toggle_bp_slot(bp, false);
277}
278
269void release_bp_slot(struct perf_event *bp) 279void release_bp_slot(struct perf_event *bp)
270{ 280{
271 mutex_lock(&nr_bp_mutex); 281 mutex_lock(&nr_bp_mutex);
272 282
273 toggle_bp_slot(bp, false); 283 __release_bp_slot(bp);
274 284
275 mutex_unlock(&nr_bp_mutex); 285 mutex_unlock(&nr_bp_mutex);
276} 286}
277 287
288/*
289 * Allow the kernel debugger to reserve breakpoint slots without
290 * taking a lock using the dbg_* variant of for the reserve and
291 * release breakpoint slots.
292 */
293int dbg_reserve_bp_slot(struct perf_event *bp)
294{
295 if (mutex_is_locked(&nr_bp_mutex))
296 return -1;
297
298 return __reserve_bp_slot(bp);
299}
300
301int dbg_release_bp_slot(struct perf_event *bp)
302{
303 if (mutex_is_locked(&nr_bp_mutex))
304 return -1;
305
306 __release_bp_slot(bp);
307
308 return 0;
309}
278 310
279int register_perf_hw_breakpoint(struct perf_event *bp) 311int register_perf_hw_breakpoint(struct perf_event *bp)
280{ 312{
@@ -296,6 +328,10 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
296 if (!bp->attr.disabled || !bp->overflow_handler) 328 if (!bp->attr.disabled || !bp->overflow_handler)
297 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); 329 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
298 330
331 /* if arch_validate_hwbkpt_settings() fails then release bp slot */
332 if (ret)
333 release_bp_slot(bp);
334
299 return ret; 335 return ret;
300} 336}
301 337
@@ -324,8 +360,8 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
324int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) 360int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
325{ 361{
326 u64 old_addr = bp->attr.bp_addr; 362 u64 old_addr = bp->attr.bp_addr;
363 u64 old_len = bp->attr.bp_len;
327 int old_type = bp->attr.bp_type; 364 int old_type = bp->attr.bp_type;
328 int old_len = bp->attr.bp_len;
329 int err = 0; 365 int err = 0;
330 366
331 perf_event_disable(bp); 367 perf_event_disable(bp);
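Note on the hw_breakpoint hunks above: reserve_bp_slot()/release_bp_slot() are split into locked wrappers around __reserve_bp_slot()/__release_bp_slot() so that the new dbg_reserve_bp_slot()/dbg_release_bp_slot() variants can be used by the kernel debugger without sleeping on nr_bp_mutex; they return -1 instead of blocking when the mutex is already held. A minimal sketch of a debugger-side caller, assuming an invented helper name (debugger_arm_bp) and omitting the architecture-specific register setup:

static int debugger_arm_bp(struct perf_event *bp)
{
	int ret = dbg_reserve_bp_slot(bp);

	if (ret)
		return ret;	/* -1: nr_bp_mutex contended, -ENOSPC: no free slot */

	/* ... program the hardware debug registers for bp here ... */
	return 0;
}

The matching teardown path would call dbg_release_bp_slot(bp), which likewise refuses to block and returns -1 while nr_bp_mutex is held elsewhere.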
diff --git a/kernel/kexec.c b/kernel/kexec.c
index a9a93d9ee7a7..ef077fb73155 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -32,6 +32,7 @@
32#include <linux/console.h> 32#include <linux/console.h>
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/kmsg_dump.h>
35 36
36#include <asm/page.h> 37#include <asm/page.h>
37#include <asm/uaccess.h> 38#include <asm/uaccess.h>
@@ -1074,6 +1075,9 @@ void crash_kexec(struct pt_regs *regs)
1074 if (mutex_trylock(&kexec_mutex)) { 1075 if (mutex_trylock(&kexec_mutex)) {
1075 if (kexec_crash_image) { 1076 if (kexec_crash_image) {
1076 struct pt_regs fixed_regs; 1077 struct pt_regs fixed_regs;
1078
1079 kmsg_dump(KMSG_DUMP_KEXEC);
1080
1077 crash_setup_regs(&fixed_regs, regs); 1081 crash_setup_regs(&fixed_regs, regs);
1078 crash_save_vmcoreinfo(); 1082 crash_save_vmcoreinfo();
1079 machine_crash_shutdown(&fixed_regs); 1083 machine_crash_shutdown(&fixed_regs);
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 32c5c15d750d..35edbe22e9a9 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -80,7 +80,7 @@ int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
80 80
81 buffer = kmalloc(size, gfp_mask); 81 buffer = kmalloc(size, gfp_mask);
82 if (!buffer) { 82 if (!buffer) {
83 _kfifo_init(fifo, 0, 0); 83 _kfifo_init(fifo, NULL, 0);
84 return -ENOMEM; 84 return -ENOMEM;
85 } 85 }
86 86
@@ -97,6 +97,7 @@ EXPORT_SYMBOL(kfifo_alloc);
97void kfifo_free(struct kfifo *fifo) 97void kfifo_free(struct kfifo *fifo)
98{ 98{
99 kfree(fifo->buffer); 99 kfree(fifo->buffer);
100 _kfifo_init(fifo, NULL, 0);
100} 101}
101EXPORT_SYMBOL(kfifo_free); 102EXPORT_SYMBOL(kfifo_free);
102 103
@@ -349,6 +350,7 @@ EXPORT_SYMBOL(__kfifo_from_user_n);
349 * @fifo: the fifo to be used. 350 * @fifo: the fifo to be used.
350 * @from: pointer to the data to be added. 351 * @from: pointer to the data to be added.
351 * @len: the length of the data to be added. 352 * @len: the length of the data to be added.
353 * @total: the actual returned data length.
352 * 354 *
353 * This function copies at most @len bytes from the @from into the 355 * This function copies at most @len bytes from the @from into the
354 * FIFO depending and returns -EFAULT/0. 356 * FIFO depending and returns -EFAULT/0.
@@ -399,7 +401,7 @@ EXPORT_SYMBOL(__kfifo_to_user_n);
399 * @fifo: the fifo to be used. 401 * @fifo: the fifo to be used.
400 * @to: where the data must be copied. 402 * @to: where the data must be copied.
401 * @len: the size of the destination buffer. 403 * @len: the size of the destination buffer.
402 @ @lenout: pointer to output variable with copied data 404 * @lenout: pointer to output variable with copied data
403 * 405 *
404 * This function copies at most @len bytes from the FIFO into the 406 * This function copies at most @len bytes from the FIFO into the
405 * @to buffer and 0 or -EFAULT. 407 * @to buffer and 0 or -EFAULT.
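Aside on the kfifo hunks: the kfifo_alloc() failure path now leaves the fifo initialized with a NULL buffer, and kfifo_free() re-initializes the fifo after freeing it, so a failed or freed fifo behaves as empty instead of pointing at stale memory. A minimal alloc/free pairing sketched from the two signatures shown above; my_init(), my_exit() and the 4096-byte size are invented for the example:

#include <linux/kfifo.h>

static struct kfifo my_fifo;

static int __init my_init(void)
{
	int ret = kfifo_alloc(&my_fifo, 4096, GFP_KERNEL);

	if (ret)
		return ret;	/* -ENOMEM; the fifo is left in a sane empty state */
	return 0;
}

static void __exit my_exit(void)
{
	kfifo_free(&my_fifo);	/* buffer freed, fifo re-initialized to NULL/0 */
}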
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 2eb517e23514..761fdd2b3034 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -583,6 +583,9 @@ static void kgdb_wait(struct pt_regs *regs)
583 smp_wmb(); 583 smp_wmb();
584 atomic_set(&cpu_in_kgdb[cpu], 1); 584 atomic_set(&cpu_in_kgdb[cpu], 1);
585 585
586 /* Disable any cpu specific hw breakpoints */
587 kgdb_disable_hw_debug(regs);
588
586 /* Wait till primary CPU is done with debugging */ 589 /* Wait till primary CPU is done with debugging */
587 while (atomic_read(&passive_cpu_wait[cpu])) 590 while (atomic_read(&passive_cpu_wait[cpu]))
588 cpu_relax(); 591 cpu_relax();
@@ -596,7 +599,7 @@ static void kgdb_wait(struct pt_regs *regs)
596 599
597 /* Signal the primary CPU that we are done: */ 600 /* Signal the primary CPU that we are done: */
598 atomic_set(&cpu_in_kgdb[cpu], 0); 601 atomic_set(&cpu_in_kgdb[cpu], 0);
599 touch_softlockup_watchdog(); 602 touch_softlockup_watchdog_sync();
600 clocksource_touch_watchdog(); 603 clocksource_touch_watchdog();
601 local_irq_restore(flags); 604 local_irq_restore(flags);
602} 605}
@@ -1450,7 +1453,7 @@ acquirelock:
1450 (kgdb_info[cpu].task && 1453 (kgdb_info[cpu].task &&
1451 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { 1454 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1452 atomic_set(&kgdb_active, -1); 1455 atomic_set(&kgdb_active, -1);
1453 touch_softlockup_watchdog(); 1456 touch_softlockup_watchdog_sync();
1454 clocksource_touch_watchdog(); 1457 clocksource_touch_watchdog();
1455 local_irq_restore(flags); 1458 local_irq_restore(flags);
1456 1459
@@ -1550,7 +1553,7 @@ kgdb_restore:
1550 } 1553 }
1551 /* Free kgdb_active */ 1554 /* Free kgdb_active */
1552 atomic_set(&kgdb_active, -1); 1555 atomic_set(&kgdb_active, -1);
1553 touch_softlockup_watchdog(); 1556 touch_softlockup_watchdog_sync();
1554 clocksource_touch_watchdog(); 1557 clocksource_touch_watchdog();
1555 local_irq_restore(flags); 1558 local_irq_restore(flags);
1556 1559
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b7df302a0204..c4b43430d393 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -93,6 +93,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
93 {"native_get_debugreg",}, 93 {"native_get_debugreg",},
94 {"irq_entries_start",}, 94 {"irq_entries_start",},
95 {"common_interrupt",}, 95 {"common_interrupt",},
96 {"mcount",}, /* mcount can be called from everywhere */
96 {NULL} /* Terminator */ 97 {NULL} /* Terminator */
97}; 98};
98 99
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 5feaddcdbe49..c62ec14609b9 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2147,7 +2147,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
2147 return ret; 2147 return ret;
2148 2148
2149 return print_irq_inversion_bug(curr, &root, target_entry, 2149 return print_irq_inversion_bug(curr, &root, target_entry,
2150 this, 1, irqclass); 2150 this, 0, irqclass);
2151} 2151}
2152 2152
2153void print_irqtrace_events(struct task_struct *curr) 2153void print_irqtrace_events(struct task_struct *curr)
diff --git a/kernel/padata.c b/kernel/padata.c
new file mode 100644
index 000000000000..6f9bcb8313d6
--- /dev/null
+++ b/kernel/padata.c
@@ -0,0 +1,690 @@
1/*
2 * padata.c - generic interface to process data streams in parallel
3 *
4 * Copyright (C) 2008, 2009 secunet Security Networks AG
5 * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 */
20
21#include <linux/module.h>
22#include <linux/cpumask.h>
23#include <linux/err.h>
24#include <linux/cpu.h>
25#include <linux/padata.h>
26#include <linux/mutex.h>
27#include <linux/sched.h>
28#include <linux/rcupdate.h>
29
30#define MAX_SEQ_NR INT_MAX - NR_CPUS
31#define MAX_OBJ_NUM 10000 * NR_CPUS
32
33static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
34{
35 int cpu, target_cpu;
36
37 target_cpu = cpumask_first(pd->cpumask);
38 for (cpu = 0; cpu < cpu_index; cpu++)
39 target_cpu = cpumask_next(target_cpu, pd->cpumask);
40
41 return target_cpu;
42}
43
44static int padata_cpu_hash(struct padata_priv *padata)
45{
46 int cpu_index;
47 struct parallel_data *pd;
48
49 pd = padata->pd;
50
51 /*
52 * Hash the sequence numbers to the cpus by taking
53 * seq_nr mod. number of cpus in use.
54 */
55 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask);
56
57 return padata_index_to_cpu(pd, cpu_index);
58}
59
60static void padata_parallel_worker(struct work_struct *work)
61{
62 struct padata_queue *queue;
63 struct parallel_data *pd;
64 struct padata_instance *pinst;
65 LIST_HEAD(local_list);
66
67 local_bh_disable();
68 queue = container_of(work, struct padata_queue, pwork);
69 pd = queue->pd;
70 pinst = pd->pinst;
71
72 spin_lock(&queue->parallel.lock);
73 list_replace_init(&queue->parallel.list, &local_list);
74 spin_unlock(&queue->parallel.lock);
75
76 while (!list_empty(&local_list)) {
77 struct padata_priv *padata;
78
79 padata = list_entry(local_list.next,
80 struct padata_priv, list);
81
82 list_del_init(&padata->list);
83
84 padata->parallel(padata);
85 }
86
87 local_bh_enable();
88}
89
90/*
91 * padata_do_parallel - padata parallelization function
92 *
93 * @pinst: padata instance
94 * @padata: object to be parallelized
95 * @cb_cpu: cpu the serialization callback function will run on,
96 * must be in the cpumask of padata.
97 *
98 * The parallelization callback function will run with BHs off.
99 * Note: Every object which is parallelized by padata_do_parallel
100 * must be seen by padata_do_serial.
101 */
102int padata_do_parallel(struct padata_instance *pinst,
103 struct padata_priv *padata, int cb_cpu)
104{
105 int target_cpu, err;
106 struct padata_queue *queue;
107 struct parallel_data *pd;
108
109 rcu_read_lock_bh();
110
111 pd = rcu_dereference(pinst->pd);
112
113 err = 0;
114 if (!(pinst->flags & PADATA_INIT))
115 goto out;
116
117 err = -EBUSY;
118 if ((pinst->flags & PADATA_RESET))
119 goto out;
120
121 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
122 goto out;
123
124 err = -EINVAL;
125 if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
126 goto out;
127
128 err = -EINPROGRESS;
129 atomic_inc(&pd->refcnt);
130 padata->pd = pd;
131 padata->cb_cpu = cb_cpu;
132
133 if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr))
134 atomic_set(&pd->seq_nr, -1);
135
136 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
137
138 target_cpu = padata_cpu_hash(padata);
139 queue = per_cpu_ptr(pd->queue, target_cpu);
140
141 spin_lock(&queue->parallel.lock);
142 list_add_tail(&padata->list, &queue->parallel.list);
143 spin_unlock(&queue->parallel.lock);
144
145 queue_work_on(target_cpu, pinst->wq, &queue->pwork);
146
147out:
148 rcu_read_unlock_bh();
149
150 return err;
151}
152EXPORT_SYMBOL(padata_do_parallel);
153
154static struct padata_priv *padata_get_next(struct parallel_data *pd)
155{
156 int cpu, num_cpus, empty, calc_seq_nr;
157 int seq_nr, next_nr, overrun, next_overrun;
158 struct padata_queue *queue, *next_queue;
159 struct padata_priv *padata;
160 struct padata_list *reorder;
161
162 empty = 0;
163 next_nr = -1;
164 next_overrun = 0;
165 next_queue = NULL;
166
167 num_cpus = cpumask_weight(pd->cpumask);
168
169 for_each_cpu(cpu, pd->cpumask) {
170 queue = per_cpu_ptr(pd->queue, cpu);
171 reorder = &queue->reorder;
172
173 /*
174 * Calculate the seq_nr of the object that should be
175 * next in this queue.
176 */
177 overrun = 0;
178 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
179 + queue->cpu_index;
180
181 if (unlikely(calc_seq_nr > pd->max_seq_nr)) {
182 calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1;
183 overrun = 1;
184 }
185
186 if (!list_empty(&reorder->list)) {
187 padata = list_entry(reorder->list.next,
188 struct padata_priv, list);
189
190 seq_nr = padata->seq_nr;
191 BUG_ON(calc_seq_nr != seq_nr);
192 } else {
193 seq_nr = calc_seq_nr;
194 empty++;
195 }
196
197 if (next_nr < 0 || seq_nr < next_nr
198 || (next_overrun && !overrun)) {
199 next_nr = seq_nr;
200 next_overrun = overrun;
201 next_queue = queue;
202 }
203 }
204
205 padata = NULL;
206
207 if (empty == num_cpus)
208 goto out;
209
210 reorder = &next_queue->reorder;
211
212 if (!list_empty(&reorder->list)) {
213 padata = list_entry(reorder->list.next,
214 struct padata_priv, list);
215
216 if (unlikely(next_overrun)) {
217 for_each_cpu(cpu, pd->cpumask) {
218 queue = per_cpu_ptr(pd->queue, cpu);
219 atomic_set(&queue->num_obj, 0);
220 }
221 }
222
223 spin_lock(&reorder->lock);
224 list_del_init(&padata->list);
225 atomic_dec(&pd->reorder_objects);
226 spin_unlock(&reorder->lock);
227
228 atomic_inc(&next_queue->num_obj);
229
230 goto out;
231 }
232
233 if (next_nr % num_cpus == next_queue->cpu_index) {
234 padata = ERR_PTR(-ENODATA);
235 goto out;
236 }
237
238 padata = ERR_PTR(-EINPROGRESS);
239out:
240 return padata;
241}
242
243static void padata_reorder(struct parallel_data *pd)
244{
245 struct padata_priv *padata;
246 struct padata_queue *queue;
247 struct padata_instance *pinst = pd->pinst;
248
249try_again:
250 if (!spin_trylock_bh(&pd->lock))
251 goto out;
252
253 while (1) {
254 padata = padata_get_next(pd);
255
256 if (!padata || PTR_ERR(padata) == -EINPROGRESS)
257 break;
258
259 if (PTR_ERR(padata) == -ENODATA) {
260 spin_unlock_bh(&pd->lock);
261 goto out;
262 }
263
264 queue = per_cpu_ptr(pd->queue, padata->cb_cpu);
265
266 spin_lock(&queue->serial.lock);
267 list_add_tail(&padata->list, &queue->serial.list);
268 spin_unlock(&queue->serial.lock);
269
270 queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork);
271 }
272
273 spin_unlock_bh(&pd->lock);
274
275 if (atomic_read(&pd->reorder_objects))
276 goto try_again;
277
278out:
279 return;
280}
281
282static void padata_serial_worker(struct work_struct *work)
283{
284 struct padata_queue *queue;
285 struct parallel_data *pd;
286 LIST_HEAD(local_list);
287
288 local_bh_disable();
289 queue = container_of(work, struct padata_queue, swork);
290 pd = queue->pd;
291
292 spin_lock(&queue->serial.lock);
293 list_replace_init(&queue->serial.list, &local_list);
294 spin_unlock(&queue->serial.lock);
295
296 while (!list_empty(&local_list)) {
297 struct padata_priv *padata;
298
299 padata = list_entry(local_list.next,
300 struct padata_priv, list);
301
302 list_del_init(&padata->list);
303
304 padata->serial(padata);
305 atomic_dec(&pd->refcnt);
306 }
307 local_bh_enable();
308}
309
310/*
311 * padata_do_serial - padata serialization function
312 *
313 * @padata: object to be serialized.
314 *
315 * padata_do_serial must be called for every parallelized object.
316 * The serialization callback function will run with BHs off.
317 */
318void padata_do_serial(struct padata_priv *padata)
319{
320 int cpu;
321 struct padata_queue *queue;
322 struct parallel_data *pd;
323
324 pd = padata->pd;
325
326 cpu = get_cpu();
327 queue = per_cpu_ptr(pd->queue, cpu);
328
329 spin_lock(&queue->reorder.lock);
330 atomic_inc(&pd->reorder_objects);
331 list_add_tail(&padata->list, &queue->reorder.list);
332 spin_unlock(&queue->reorder.lock);
333
334 put_cpu();
335
336 padata_reorder(pd);
337}
338EXPORT_SYMBOL(padata_do_serial);
339
340static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
341 const struct cpumask *cpumask)
342{
343 int cpu, cpu_index, num_cpus;
344 struct padata_queue *queue;
345 struct parallel_data *pd;
346
347 cpu_index = 0;
348
349 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
350 if (!pd)
351 goto err;
352
353 pd->queue = alloc_percpu(struct padata_queue);
354 if (!pd->queue)
355 goto err_free_pd;
356
357 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
358 goto err_free_queue;
359
360 for_each_possible_cpu(cpu) {
361 queue = per_cpu_ptr(pd->queue, cpu);
362
363 queue->pd = pd;
364
365 if (cpumask_test_cpu(cpu, cpumask)
366 && cpumask_test_cpu(cpu, cpu_active_mask)) {
367 queue->cpu_index = cpu_index;
368 cpu_index++;
369 } else
370 queue->cpu_index = -1;
371
372 INIT_LIST_HEAD(&queue->reorder.list);
373 INIT_LIST_HEAD(&queue->parallel.list);
374 INIT_LIST_HEAD(&queue->serial.list);
375 spin_lock_init(&queue->reorder.lock);
376 spin_lock_init(&queue->parallel.lock);
377 spin_lock_init(&queue->serial.lock);
378
379 INIT_WORK(&queue->pwork, padata_parallel_worker);
380 INIT_WORK(&queue->swork, padata_serial_worker);
381 atomic_set(&queue->num_obj, 0);
382 }
383
384 cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
385
386 num_cpus = cpumask_weight(pd->cpumask);
387 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;
388
389 atomic_set(&pd->seq_nr, -1);
390 atomic_set(&pd->reorder_objects, 0);
391 atomic_set(&pd->refcnt, 0);
392 pd->pinst = pinst;
393 spin_lock_init(&pd->lock);
394
395 return pd;
396
397err_free_queue:
398 free_percpu(pd->queue);
399err_free_pd:
400 kfree(pd);
401err:
402 return NULL;
403}
404
405static void padata_free_pd(struct parallel_data *pd)
406{
407 free_cpumask_var(pd->cpumask);
408 free_percpu(pd->queue);
409 kfree(pd);
410}
411
412static void padata_replace(struct padata_instance *pinst,
413 struct parallel_data *pd_new)
414{
415 struct parallel_data *pd_old = pinst->pd;
416
417 pinst->flags |= PADATA_RESET;
418
419 rcu_assign_pointer(pinst->pd, pd_new);
420
421 synchronize_rcu();
422
423 while (atomic_read(&pd_old->refcnt) != 0)
424 yield();
425
426 flush_workqueue(pinst->wq);
427
428 padata_free_pd(pd_old);
429
430 pinst->flags &= ~PADATA_RESET;
431}
432
433/*
434 * padata_set_cpumask - set the cpumask that padata should use
435 *
436 * @pinst: padata instance
437 * @cpumask: the cpumask to use
438 */
439int padata_set_cpumask(struct padata_instance *pinst,
440 cpumask_var_t cpumask)
441{
442 struct parallel_data *pd;
443 int err = 0;
444
445 might_sleep();
446
447 mutex_lock(&pinst->lock);
448
449 pd = padata_alloc_pd(pinst, cpumask);
450 if (!pd) {
451 err = -ENOMEM;
452 goto out;
453 }
454
455 cpumask_copy(pinst->cpumask, cpumask);
456
457 padata_replace(pinst, pd);
458
459out:
460 mutex_unlock(&pinst->lock);
461
462 return err;
463}
464EXPORT_SYMBOL(padata_set_cpumask);
465
466static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
467{
468 struct parallel_data *pd;
469
470 if (cpumask_test_cpu(cpu, cpu_active_mask)) {
471 pd = padata_alloc_pd(pinst, pinst->cpumask);
472 if (!pd)
473 return -ENOMEM;
474
475 padata_replace(pinst, pd);
476 }
477
478 return 0;
479}
480
481/*
482 * padata_add_cpu - add a cpu to the padata cpumask
483 *
484 * @pinst: padata instance
485 * @cpu: cpu to add
486 */
487int padata_add_cpu(struct padata_instance *pinst, int cpu)
488{
489 int err;
490
491 might_sleep();
492
493 mutex_lock(&pinst->lock);
494
495 cpumask_set_cpu(cpu, pinst->cpumask);
496 err = __padata_add_cpu(pinst, cpu);
497
498 mutex_unlock(&pinst->lock);
499
500 return err;
501}
502EXPORT_SYMBOL(padata_add_cpu);
503
504static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
505{
506 struct parallel_data *pd;
507
508 if (cpumask_test_cpu(cpu, cpu_online_mask)) {
509 pd = padata_alloc_pd(pinst, pinst->cpumask);
510 if (!pd)
511 return -ENOMEM;
512
513 padata_replace(pinst, pd);
514 }
515
516 return 0;
517}
518
519/*
520 * padata_remove_cpu - remove a cpu from the padata cpumask
521 *
522 * @pinst: padata instance
523 * @cpu: cpu to remove
524 */
525int padata_remove_cpu(struct padata_instance *pinst, int cpu)
526{
527 int err;
528
529 might_sleep();
530
531 mutex_lock(&pinst->lock);
532
533 cpumask_clear_cpu(cpu, pinst->cpumask);
534 err = __padata_remove_cpu(pinst, cpu);
535
536 mutex_unlock(&pinst->lock);
537
538 return err;
539}
540EXPORT_SYMBOL(padata_remove_cpu);
541
542/*
543 * padata_start - start the parallel processing
544 *
545 * @pinst: padata instance to start
546 */
547void padata_start(struct padata_instance *pinst)
548{
549 might_sleep();
550
551 mutex_lock(&pinst->lock);
552 pinst->flags |= PADATA_INIT;
553 mutex_unlock(&pinst->lock);
554}
555EXPORT_SYMBOL(padata_start);
556
557/*
558 * padata_stop - stop the parallel processing
559 *
560 * @pinst: padata instance to stop
561 */
562void padata_stop(struct padata_instance *pinst)
563{
564 might_sleep();
565
566 mutex_lock(&pinst->lock);
567 pinst->flags &= ~PADATA_INIT;
568 mutex_unlock(&pinst->lock);
569}
570EXPORT_SYMBOL(padata_stop);
571
572static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
573 unsigned long action, void *hcpu)
574{
575 int err;
576 struct padata_instance *pinst;
577 int cpu = (unsigned long)hcpu;
578
579 pinst = container_of(nfb, struct padata_instance, cpu_notifier);
580
581 switch (action) {
582 case CPU_ONLINE:
583 case CPU_ONLINE_FROZEN:
584 if (!cpumask_test_cpu(cpu, pinst->cpumask))
585 break;
586 mutex_lock(&pinst->lock);
587 err = __padata_add_cpu(pinst, cpu);
588 mutex_unlock(&pinst->lock);
589 if (err)
590 return NOTIFY_BAD;
591 break;
592
593 case CPU_DOWN_PREPARE:
594 case CPU_DOWN_PREPARE_FROZEN:
595 if (!cpumask_test_cpu(cpu, pinst->cpumask))
596 break;
597 mutex_lock(&pinst->lock);
598 err = __padata_remove_cpu(pinst, cpu);
599 mutex_unlock(&pinst->lock);
600 if (err)
601 return NOTIFY_BAD;
602 break;
603
604 case CPU_UP_CANCELED:
605 case CPU_UP_CANCELED_FROZEN:
606 if (!cpumask_test_cpu(cpu, pinst->cpumask))
607 break;
608 mutex_lock(&pinst->lock);
609 __padata_remove_cpu(pinst, cpu);
610 mutex_unlock(&pinst->lock);
611
612 case CPU_DOWN_FAILED:
613 case CPU_DOWN_FAILED_FROZEN:
614 if (!cpumask_test_cpu(cpu, pinst->cpumask))
615 break;
616 mutex_lock(&pinst->lock);
617 __padata_add_cpu(pinst, cpu);
618 mutex_unlock(&pinst->lock);
619 }
620
621 return NOTIFY_OK;
622}
623
624/*
625 * padata_alloc - allocate and initialize a padata instance
626 *
627 * @cpumask: cpumask that padata uses for parallelization
628 * @wq: workqueue to use for the allocated padata instance
629 */
630struct padata_instance *padata_alloc(const struct cpumask *cpumask,
631 struct workqueue_struct *wq)
632{
633 int err;
634 struct padata_instance *pinst;
635 struct parallel_data *pd;
636
637 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
638 if (!pinst)
639 goto err;
640
641 pd = padata_alloc_pd(pinst, cpumask);
642 if (!pd)
643 goto err_free_inst;
644
645 rcu_assign_pointer(pinst->pd, pd);
646
647 pinst->wq = wq;
648
649 cpumask_copy(pinst->cpumask, cpumask);
650
651 pinst->flags = 0;
652
653 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
654 pinst->cpu_notifier.priority = 0;
655 err = register_hotcpu_notifier(&pinst->cpu_notifier);
656 if (err)
657 goto err_free_pd;
658
659 mutex_init(&pinst->lock);
660
661 return pinst;
662
663err_free_pd:
664 padata_free_pd(pd);
665err_free_inst:
666 kfree(pinst);
667err:
668 return NULL;
669}
670EXPORT_SYMBOL(padata_alloc);
671
672/*
673 * padata_free - free a padata instance
674 *
675 * @ padata_inst: padata instance to free
676 */
677void padata_free(struct padata_instance *pinst)
678{
679 padata_stop(pinst);
680
681 synchronize_rcu();
682
683 while (atomic_read(&pinst->pd->refcnt) != 0)
684 yield();
685
686 unregister_hotcpu_notifier(&pinst->cpu_notifier);
687 padata_free_pd(pinst->pd);
688 kfree(pinst);
689}
690EXPORT_SYMBOL(padata_free);
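The new kernel/padata.c above exports a small API: padata_alloc()/padata_free() create and destroy an instance, padata_start()/padata_stop() gate it, padata_do_parallel() hands an object to a worker cpu, and padata_do_serial() feeds the object back for in-order completion on cb_cpu. The sketch below shows what a client might look like, derived only from the signatures and comments in this file; struct my_request, my_parallel(), my_serial() and my_submit() are invented names, and pinst is assumed to come from padata_alloc(cpumask, wq) followed by padata_start(pinst):

#include <linux/padata.h>

struct my_request {
	struct padata_priv padata;	/* embedded; container_of() recovers the request */
	/* ... payload ... */
};

static void my_parallel(struct padata_priv *padata)
{
	/* expensive per-object work, runs with BHs off on the chosen cpu */

	padata_do_serial(padata);	/* every parallelized object must be handed back */
}

static void my_serial(struct padata_priv *padata)
{
	/* runs on cb_cpu; objects arrive in their original submission order */
}

static int my_submit(struct padata_instance *pinst,
		     struct my_request *req, int cb_cpu)
{
	req->padata.parallel = my_parallel;
	req->padata.serial = my_serial;

	/* -EINPROGRESS means the object was queued for parallel processing */
	return padata_do_parallel(pinst, &req->padata, cb_cpu);
}

As the header comment of padata_do_parallel() states, every object it accepts must eventually be passed to padata_do_serial(); in this sketch that happens directly at the end of the parallel callback.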
diff --git a/kernel/panic.c b/kernel/panic.c
index 5827f7b97254..c787333282b8 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -75,7 +75,6 @@ NORET_TYPE void panic(const char * fmt, ...)
75 dump_stack(); 75 dump_stack();
76#endif 76#endif
77 77
78 kmsg_dump(KMSG_DUMP_PANIC);
79 /* 78 /*
80 * If we have crashed and we have a crash kernel loaded let it handle 79 * If we have crashed and we have a crash kernel loaded let it handle
81 * everything else. 80 * everything else.
@@ -83,6 +82,8 @@ NORET_TYPE void panic(const char * fmt, ...)
83 */ 82 */
84 crash_kexec(NULL); 83 crash_kexec(NULL);
85 84
85 kmsg_dump(KMSG_DUMP_PANIC);
86
86 /* 87 /*
87 * Note smp_send_stop is the usual smp shutdown function, which 88 * Note smp_send_stop is the usual smp shutdown function, which
88 * unfortunately means it may not be hardened to work in a panic 89 * unfortunately means it may not be hardened to work in a panic
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 603c0d8b5df1..2ae7409bf38f 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3259,8 +3259,6 @@ static void perf_event_task_output(struct perf_event *event,
3259 task_event->event_id.tid = perf_event_tid(event, task); 3259 task_event->event_id.tid = perf_event_tid(event, task);
3260 task_event->event_id.ptid = perf_event_tid(event, current); 3260 task_event->event_id.ptid = perf_event_tid(event, current);
3261 3261
3262 task_event->event_id.time = perf_clock();
3263
3264 perf_output_put(&handle, task_event->event_id); 3262 perf_output_put(&handle, task_event->event_id);
3265 3263
3266 perf_output_end(&handle); 3264 perf_output_end(&handle);
@@ -3268,6 +3266,9 @@ static void perf_event_task_output(struct perf_event *event,
3268 3266
3269static int perf_event_task_match(struct perf_event *event) 3267static int perf_event_task_match(struct perf_event *event)
3270{ 3268{
3269 if (event->state < PERF_EVENT_STATE_INACTIVE)
3270 return 0;
3271
3271 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3272 if (event->cpu != -1 && event->cpu != smp_processor_id())
3272 return 0; 3273 return 0;
3273 3274
@@ -3297,7 +3298,7 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3297 cpuctx = &get_cpu_var(perf_cpu_context); 3298 cpuctx = &get_cpu_var(perf_cpu_context);
3298 perf_event_task_ctx(&cpuctx->ctx, task_event); 3299 perf_event_task_ctx(&cpuctx->ctx, task_event);
3299 if (!ctx) 3300 if (!ctx)
3300 ctx = rcu_dereference(task_event->task->perf_event_ctxp); 3301 ctx = rcu_dereference(current->perf_event_ctxp);
3301 if (ctx) 3302 if (ctx)
3302 perf_event_task_ctx(ctx, task_event); 3303 perf_event_task_ctx(ctx, task_event);
3303 put_cpu_var(perf_cpu_context); 3304 put_cpu_var(perf_cpu_context);
@@ -3328,6 +3329,7 @@ static void perf_event_task(struct task_struct *task,
3328 /* .ppid */ 3329 /* .ppid */
3329 /* .tid */ 3330 /* .tid */
3330 /* .ptid */ 3331 /* .ptid */
3332 .time = perf_clock(),
3331 }, 3333 },
3332 }; 3334 };
3333 3335
@@ -3377,6 +3379,9 @@ static void perf_event_comm_output(struct perf_event *event,
3377 3379
3378static int perf_event_comm_match(struct perf_event *event) 3380static int perf_event_comm_match(struct perf_event *event)
3379{ 3381{
3382 if (event->state < PERF_EVENT_STATE_INACTIVE)
3383 return 0;
3384
3380 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3385 if (event->cpu != -1 && event->cpu != smp_processor_id())
3381 return 0; 3386 return 0;
3382 3387
@@ -3494,6 +3499,9 @@ static void perf_event_mmap_output(struct perf_event *event,
3494static int perf_event_mmap_match(struct perf_event *event, 3499static int perf_event_mmap_match(struct perf_event *event,
3495 struct perf_mmap_event *mmap_event) 3500 struct perf_mmap_event *mmap_event)
3496{ 3501{
3502 if (event->state < PERF_EVENT_STATE_INACTIVE)
3503 return 0;
3504
3497 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3505 if (event->cpu != -1 && event->cpu != smp_processor_id())
3498 return 0; 3506 return 0;
3499 3507
@@ -4571,7 +4579,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
4571 if (attr->type >= PERF_TYPE_MAX) 4579 if (attr->type >= PERF_TYPE_MAX)
4572 return -EINVAL; 4580 return -EINVAL;
4573 4581
4574 if (attr->__reserved_1 || attr->__reserved_2) 4582 if (attr->__reserved_1)
4575 return -EINVAL; 4583 return -EINVAL;
4576 4584
4577 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) 4585 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 91e09d3b2eb2..5c36ea9d55d2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,15 @@ config PM_DEBUG
27 code. This is helpful when debugging and reporting PM bugs, like 27 code. This is helpful when debugging and reporting PM bugs, like
28 suspend support. 28 suspend support.
29 29
30config PM_ADVANCED_DEBUG
31 bool "Extra PM attributes in sysfs for low-level debugging/testing"
32 depends on PM_DEBUG
33 default n
34 ---help---
35 Add extra sysfs attributes allowing one to access some Power Management
36 fields of device objects from user space. If you are not a kernel
37 developer interested in debugging/testing Power Management, say "no".
38
30config PM_VERBOSE 39config PM_VERBOSE
31 bool "Verbose Power Management debugging" 40 bool "Verbose Power Management debugging"
32 depends on PM_DEBUG 41 depends on PM_DEBUG
@@ -85,6 +94,11 @@ config PM_SLEEP
85 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE 94 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
86 default y 95 default y
87 96
97config PM_SLEEP_ADVANCED_DEBUG
98 bool
99 depends on PM_ADVANCED_DEBUG
100 default n
101
88config SUSPEND 102config SUSPEND
89 bool "Suspend to RAM and standby" 103 bool "Suspend to RAM and standby"
90 depends on PM && ARCH_SUSPEND_POSSIBLE 104 depends on PM && ARCH_SUSPEND_POSSIBLE
@@ -222,3 +236,8 @@ config PM_RUNTIME
222 and the bus type drivers of the buses the devices are on are 236 and the bus type drivers of the buses the devices are on are
223 responsible for the actual handling of the autosuspend requests and 237 responsible for the actual handling of the autosuspend requests and
224 wake-up events. 238 wake-up events.
239
240config PM_OPS
241 bool
242 depends on PM_SLEEP || PM_RUNTIME
243 default y
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0998c7139053..b58800b21fc0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -44,6 +44,32 @@ int pm_notifier_call_chain(unsigned long val)
44 == NOTIFY_BAD) ? -EINVAL : 0; 44 == NOTIFY_BAD) ? -EINVAL : 0;
45} 45}
46 46
47/* If set, devices may be suspended and resumed asynchronously. */
48int pm_async_enabled = 1;
49
50static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
51 char *buf)
52{
53 return sprintf(buf, "%d\n", pm_async_enabled);
54}
55
56static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
57 const char *buf, size_t n)
58{
59 unsigned long val;
60
61 if (strict_strtoul(buf, 10, &val))
62 return -EINVAL;
63
64 if (val > 1)
65 return -EINVAL;
66
67 pm_async_enabled = val;
68 return n;
69}
70
71power_attr(pm_async);
72
47#ifdef CONFIG_PM_DEBUG 73#ifdef CONFIG_PM_DEBUG
48int pm_test_level = TEST_NONE; 74int pm_test_level = TEST_NONE;
49 75
@@ -208,9 +234,12 @@ static struct attribute * g[] = {
208#ifdef CONFIG_PM_TRACE 234#ifdef CONFIG_PM_TRACE
209 &pm_trace_attr.attr, 235 &pm_trace_attr.attr,
210#endif 236#endif
211#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG) 237#ifdef CONFIG_PM_SLEEP
238 &pm_async_attr.attr,
239#ifdef CONFIG_PM_DEBUG
212 &pm_test_attr.attr, 240 &pm_test_attr.attr,
213#endif 241#endif
242#endif
214 NULL, 243 NULL,
215}; 244};
216 245
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 36cb168e4330..830cadecbdfc 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1181,7 +1181,7 @@ static void free_unnecessary_pages(void)
1181 1181
1182 memory_bm_position_reset(&copy_bm); 1182 memory_bm_position_reset(&copy_bm);
1183 1183
1184 while (to_free_normal > 0 && to_free_highmem > 0) { 1184 while (to_free_normal > 0 || to_free_highmem > 0) {
1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm); 1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
1186 struct page *page = pfn_to_page(pfn); 1186 struct page *page = pfn_to_page(pfn);
1187 1187
@@ -1500,7 +1500,7 @@ asmlinkage int swsusp_save(void)
1500{ 1500{
1501 unsigned int nr_pages, nr_highmem; 1501 unsigned int nr_pages, nr_highmem;
1502 1502
1503 printk(KERN_INFO "PM: Creating hibernation image: \n"); 1503 printk(KERN_INFO "PM: Creating hibernation image:\n");
1504 1504
1505 drain_local_pages(NULL); 1505 drain_local_pages(NULL);
1506 nr_pages = count_data_pages(); 1506 nr_pages = count_data_pages();
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 09b2b0ae9e9d..1d575733d4e1 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -657,10 +657,6 @@ int swsusp_read(unsigned int *flags_p)
657 struct swsusp_info *header; 657 struct swsusp_info *header;
658 658
659 *flags_p = swsusp_header->flags; 659 *flags_p = swsusp_header->flags;
660 if (IS_ERR(resume_bdev)) {
661 pr_debug("PM: Image device not initialised\n");
662 return PTR_ERR(resume_bdev);
663 }
664 660
665 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 661 memset(&snapshot, 0, sizeof(struct snapshot_handle));
666 error = snapshot_write_next(&snapshot, PAGE_SIZE); 662 error = snapshot_write_next(&snapshot, PAGE_SIZE);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
deleted file mode 100644
index 5b3601bd1893..000000000000
--- a/kernel/power/swsusp.c
+++ /dev/null
@@ -1,58 +0,0 @@
1/*
2 * linux/kernel/power/swsusp.c
3 *
4 * This file provides code to write suspend image to swap and read it back.
5 *
6 * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
8 *
9 * This file is released under the GPLv2.
10 *
11 * I'd like to thank the following people for their work:
12 *
13 * Pavel Machek <pavel@ucw.cz>:
14 * Modifications, defectiveness pointing, being with me at the very beginning,
15 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
16 *
17 * Steve Doddi <dirk@loth.demon.co.uk>:
18 * Support the possibility of hardware state restoring.
19 *
20 * Raph <grey.havens@earthling.net>:
21 * Support for preserving states of network devices and virtual console
22 * (including X and svgatextmode)
23 *
24 * Kurt Garloff <garloff@suse.de>:
25 * Straightened the critical function in order to prevent compilers from
26 * playing tricks with local variables.
27 *
28 * Andreas Mohr <a.mohr@mailto.de>
29 *
30 * Alex Badea <vampire@go.ro>:
31 * Fixed runaway init
32 *
33 * Rafael J. Wysocki <rjw@sisk.pl>
34 * Reworked the freeing of memory and the handling of swap
35 *
36 * More state savers are welcome. Especially for the scsi layer...
37 *
38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
39 */
40
41#include <linux/mm.h>
42#include <linux/suspend.h>
43#include <linux/spinlock.h>
44#include <linux/kernel.h>
45#include <linux/major.h>
46#include <linux/swap.h>
47#include <linux/pm.h>
48#include <linux/swapops.h>
49#include <linux/bootmem.h>
50#include <linux/syscalls.h>
51#include <linux/highmem.h>
52#include <linux/time.h>
53#include <linux/rbtree.h>
54#include <linux/io.h>
55
56#include "power.h"
57
58int in_suspend __nosavedata = 0;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index bf0014d6a5f0..4d2289626a84 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -195,6 +195,15 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
195 return res; 195 return res;
196} 196}
197 197
198static void snapshot_deprecated_ioctl(unsigned int cmd)
199{
200 if (printk_ratelimit())
201 printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
202 "be removed soon, update your suspend-to-disk "
203 "utilities\n",
204 __builtin_return_address(0), cmd);
205}
206
198static long snapshot_ioctl(struct file *filp, unsigned int cmd, 207static long snapshot_ioctl(struct file *filp, unsigned int cmd,
199 unsigned long arg) 208 unsigned long arg)
200{ 209{
@@ -246,8 +255,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
246 data->frozen = 0; 255 data->frozen = 0;
247 break; 256 break;
248 257
249 case SNAPSHOT_CREATE_IMAGE:
250 case SNAPSHOT_ATOMIC_SNAPSHOT: 258 case SNAPSHOT_ATOMIC_SNAPSHOT:
259 snapshot_deprecated_ioctl(cmd);
260 case SNAPSHOT_CREATE_IMAGE:
251 if (data->mode != O_RDONLY || !data->frozen || data->ready) { 261 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
252 error = -EPERM; 262 error = -EPERM;
253 break; 263 break;
@@ -275,8 +285,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
275 data->ready = 0; 285 data->ready = 0;
276 break; 286 break;
277 287
278 case SNAPSHOT_PREF_IMAGE_SIZE:
279 case SNAPSHOT_SET_IMAGE_SIZE: 288 case SNAPSHOT_SET_IMAGE_SIZE:
289 snapshot_deprecated_ioctl(cmd);
290 case SNAPSHOT_PREF_IMAGE_SIZE:
280 image_size = arg; 291 image_size = arg;
281 break; 292 break;
282 293
@@ -290,15 +301,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
290 error = put_user(size, (loff_t __user *)arg); 301 error = put_user(size, (loff_t __user *)arg);
291 break; 302 break;
292 303
293 case SNAPSHOT_AVAIL_SWAP_SIZE:
294 case SNAPSHOT_AVAIL_SWAP: 304 case SNAPSHOT_AVAIL_SWAP:
305 snapshot_deprecated_ioctl(cmd);
306 case SNAPSHOT_AVAIL_SWAP_SIZE:
295 size = count_swap_pages(data->swap, 1); 307 size = count_swap_pages(data->swap, 1);
296 size <<= PAGE_SHIFT; 308 size <<= PAGE_SHIFT;
297 error = put_user(size, (loff_t __user *)arg); 309 error = put_user(size, (loff_t __user *)arg);
298 break; 310 break;
299 311
300 case SNAPSHOT_ALLOC_SWAP_PAGE:
301 case SNAPSHOT_GET_SWAP_PAGE: 312 case SNAPSHOT_GET_SWAP_PAGE:
313 snapshot_deprecated_ioctl(cmd);
314 case SNAPSHOT_ALLOC_SWAP_PAGE:
302 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { 315 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
303 error = -ENODEV; 316 error = -ENODEV;
304 break; 317 break;
@@ -321,6 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
321 break; 334 break;
322 335
323 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ 336 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
337 snapshot_deprecated_ioctl(cmd);
324 if (!swsusp_swap_in_use()) { 338 if (!swsusp_swap_in_use()) {
325 /* 339 /*
326 * User space encodes device types as two-byte values, 340 * User space encodes device types as two-byte values,
@@ -362,6 +376,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
362 break; 376 break;
363 377
364 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ 378 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
379 snapshot_deprecated_ioctl(cmd);
365 error = -EINVAL; 380 error = -EINVAL;
366 381
367 switch (arg) { 382 switch (arg) {
diff --git a/kernel/printk.c b/kernel/printk.c
index 17463ca2e229..1751c456b71f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1467,6 +1467,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1467static const char const *kmsg_reasons[] = { 1467static const char const *kmsg_reasons[] = {
1468 [KMSG_DUMP_OOPS] = "oops", 1468 [KMSG_DUMP_OOPS] = "oops",
1469 [KMSG_DUMP_PANIC] = "panic", 1469 [KMSG_DUMP_PANIC] = "panic",
1470 [KMSG_DUMP_KEXEC] = "kexec",
1470}; 1471};
1471 1472
1472static const char *kmsg_to_str(enum kmsg_dump_reason reason) 1473static const char *kmsg_to_str(enum kmsg_dump_reason reason)
diff --git a/kernel/resource.c b/kernel/resource.c
index af96c1e4b54b..24e9e60c1459 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -188,6 +188,36 @@ static int __release_resource(struct resource *old)
188 return -EINVAL; 188 return -EINVAL;
189} 189}
190 190
191static void __release_child_resources(struct resource *r)
192{
193 struct resource *tmp, *p;
194 resource_size_t size;
195
196 p = r->child;
197 r->child = NULL;
198 while (p) {
199 tmp = p;
200 p = p->sibling;
201
202 tmp->parent = NULL;
203 tmp->sibling = NULL;
204 __release_child_resources(tmp);
205
206 printk(KERN_DEBUG "release child resource %pR\n", tmp);
207 /* need to restore size, and keep flags */
208 size = resource_size(tmp);
209 tmp->start = 0;
210 tmp->end = size - 1;
211 }
212}
213
214void release_child_resources(struct resource *r)
215{
216 write_lock(&resource_lock);
217 __release_child_resources(r);
218 write_unlock(&resource_lock);
219}
220
191/** 221/**
192 * request_resource - request and reserve an I/O or memory resource 222 * request_resource - request and reserve an I/O or memory resource
193 * @root: root resource descriptor 223 * @root: root resource descriptor
@@ -303,8 +333,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
303static int find_resource(struct resource *root, struct resource *new, 333static int find_resource(struct resource *root, struct resource *new,
304 resource_size_t size, resource_size_t min, 334 resource_size_t size, resource_size_t min,
305 resource_size_t max, resource_size_t align, 335 resource_size_t max, resource_size_t align,
306 void (*alignf)(void *, struct resource *, 336 resource_size_t (*alignf)(void *,
307 resource_size_t, resource_size_t), 337 const struct resource *,
338 resource_size_t,
339 resource_size_t),
308 void *alignf_data) 340 void *alignf_data)
309{ 341{
310 struct resource *this = root->child; 342 struct resource *this = root->child;
@@ -330,7 +362,7 @@ static int find_resource(struct resource *root, struct resource *new,
330 tmp.end = max; 362 tmp.end = max;
331 tmp.start = ALIGN(tmp.start, align); 363 tmp.start = ALIGN(tmp.start, align);
332 if (alignf) 364 if (alignf)
333 alignf(alignf_data, &tmp, size, align); 365 tmp.start = alignf(alignf_data, &tmp, size, align);
334 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { 366 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) {
335 new->start = tmp.start; 367 new->start = tmp.start;
336 new->end = tmp.start + size - 1; 368 new->end = tmp.start + size - 1;
@@ -358,8 +390,10 @@ static int find_resource(struct resource *root, struct resource *new,
358int allocate_resource(struct resource *root, struct resource *new, 390int allocate_resource(struct resource *root, struct resource *new,
359 resource_size_t size, resource_size_t min, 391 resource_size_t size, resource_size_t min,
360 resource_size_t max, resource_size_t align, 392 resource_size_t max, resource_size_t align,
361 void (*alignf)(void *, struct resource *, 393 resource_size_t (*alignf)(void *,
362 resource_size_t, resource_size_t), 394 const struct resource *,
395 resource_size_t,
396 resource_size_t),
363 void *alignf_data) 397 void *alignf_data)
364{ 398{
365 int err; 399 int err;
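The resource.c hunks change the alignf callback of find_resource()/allocate_resource() from a void function that adjusted the candidate resource in place to one that returns the start address to use (the caller now assigns tmp.start = alignf(...)). A callback conforming to the new prototype might look like the sketch below; the name and the trivial alignment policy are made up, only the prototype comes from the hunk above:

static resource_size_t my_align(void *data, const struct resource *res,
				resource_size_t size, resource_size_t align)
{
	/* return the start address the allocation should use */
	return ALIGN(res->start, align);
}

Such a callback is passed as the alignf argument of allocate_resource(), whose updated prototype appears in the last hunk.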
diff --git a/kernel/sched.c b/kernel/sched.c
index c535cc4f6428..3a8fb30a91b1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2320,14 +2320,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2320} 2320}
2321 2321
2322/* 2322/*
2323 * Called from: 2323 * Gets called from 3 sites (exec, fork, wakeup), since it is called without
2324 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2325 * by:
2324 * 2326 *
2325 * - fork, @p is stable because it isn't on the tasklist yet 2327 * exec: is unstable, retry loop
2326 * 2328 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2327 * - exec, @p is unstable, retry loop
2328 *
2329 * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so
2330 * we should be good.
2331 */ 2329 */
2332static inline 2330static inline
2333int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2331int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
@@ -2620,9 +2618,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
2620 if (p->sched_class->task_fork) 2618 if (p->sched_class->task_fork)
2621 p->sched_class->task_fork(p); 2619 p->sched_class->task_fork(p);
2622 2620
2623#ifdef CONFIG_SMP
2624 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2625#endif
2626 set_task_cpu(p, cpu); 2621 set_task_cpu(p, cpu);
2627 2622
2628#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2623#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2652,6 +2647,21 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2652{ 2647{
2653 unsigned long flags; 2648 unsigned long flags;
2654 struct rq *rq; 2649 struct rq *rq;
2650 int cpu = get_cpu();
2651
2652#ifdef CONFIG_SMP
2653 /*
2654 * Fork balancing, do it here and not earlier because:
2655 * - cpus_allowed can change in the fork path
2656 * - any previously selected cpu might disappear through hotplug
2657 *
2658 * We still have TASK_WAKING but PF_STARTING is gone now, meaning
2659 * ->cpus_allowed is stable, we have preemption disabled, meaning
2660 * cpu_online_mask is stable.
2661 */
2662 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2663 set_task_cpu(p, cpu);
2664#endif
2655 2665
2656 rq = task_rq_lock(p, &flags); 2666 rq = task_rq_lock(p, &flags);
2657 BUG_ON(p->state != TASK_WAKING); 2667 BUG_ON(p->state != TASK_WAKING);
@@ -2665,6 +2675,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2665 p->sched_class->task_woken(rq, p); 2675 p->sched_class->task_woken(rq, p);
2666#endif 2676#endif
2667 task_rq_unlock(rq, &flags); 2677 task_rq_unlock(rq, &flags);
2678 put_cpu();
2668} 2679}
2669 2680
2670#ifdef CONFIG_PREEMPT_NOTIFIERS 2681#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -5530,8 +5541,11 @@ need_resched_nonpreemptible:
5530 5541
5531 post_schedule(rq); 5542 post_schedule(rq);
5532 5543
5533 if (unlikely(reacquire_kernel_lock(current) < 0)) 5544 if (unlikely(reacquire_kernel_lock(current) < 0)) {
5545 prev = rq->curr;
5546 switch_count = &prev->nivcsw;
5534 goto need_resched_nonpreemptible; 5547 goto need_resched_nonpreemptible;
5548 }
5535 5549
5536 preempt_enable_no_resched(); 5550 preempt_enable_no_resched();
5537 if (need_resched()) 5551 if (need_resched())
@@ -7136,14 +7150,18 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7136 * the ->cpus_allowed mask from under waking tasks, which would be 7150 * the ->cpus_allowed mask from under waking tasks, which would be
7137 * possible when we change rq->lock in ttwu(), so synchronize against 7151 * possible when we change rq->lock in ttwu(), so synchronize against
7138 * TASK_WAKING to avoid that. 7152 * TASK_WAKING to avoid that.
7153 *
7154 * Make an exception for freshly cloned tasks, since cpuset namespaces
7155 * might move the task about, we have to validate the target in
7156 * wake_up_new_task() anyway since the cpu might have gone away.
7139 */ 7157 */
7140again: 7158again:
7141 while (p->state == TASK_WAKING) 7159 while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
7142 cpu_relax(); 7160 cpu_relax();
7143 7161
7144 rq = task_rq_lock(p, &flags); 7162 rq = task_rq_lock(p, &flags);
7145 7163
7146 if (p->state == TASK_WAKING) { 7164 if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
7147 task_rq_unlock(rq, &flags); 7165 task_rq_unlock(rq, &flags);
7148 goto again; 7166 goto again;
7149 } 7167 }
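The sched.c hunks above defer fork balancing to wake_up_new_task(), where ->cpus_allowed has settled and where get_cpu()/put_cpu() keep cpu_online_mask stable while a CPU is chosen. A minimal sketch of that "pick a CPU only while preemption is disabled" pattern follows; pick_cpu_for() is a hypothetical placement helper, not a kernel API, and the sketch only builds against a kernel tree.

#include <linux/smp.h>
#include <linux/sched.h>

static void place_new_task(struct task_struct *p)
{
        int cpu = get_cpu();            /* disables preemption */

#ifdef CONFIG_SMP
        /*
         * With preemption off, cpu_online_mask cannot change under
         * us, so the CPU picked here cannot be hot-unplugged before
         * the task is queued on it.
         */
        cpu = pick_cpu_for(p);          /* hypothetical helper */
        set_task_cpu(p, cpu);
#endif
        put_cpu();                      /* re-enables preemption */
}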
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 42ac3c9f66f6..8fe7ee81c552 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1508,7 +1508,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1508 * If there's an idle sibling in this domain, make that 1508 * If there's an idle sibling in this domain, make that
1509 * the wake_affine target instead of the current cpu. 1509 * the wake_affine target instead of the current cpu.
1510 */ 1510 */
1511 if (tmp->flags & SD_PREFER_SIBLING) 1511 if (tmp->flags & SD_SHARE_PKG_RESOURCES)
1512 target = select_idle_sibling(p, tmp, target); 1512 target = select_idle_sibling(p, tmp, target);
1513 1513
1514 if (target >= 0) { 1514 if (target >= 0) {
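The one-line sched_fair.c change restricts the idle-sibling shortcut to domains whose CPUs share package resources (i.e. a cache) instead of any SD_PREFER_SIBLING domain. A hedged fragment of that check; maybe_pick_idle_sibling() is invented for illustration, while select_idle_sibling() is the helper referenced in the hunk above.

/* Illustrative only: look for an idle sibling solely when the
 * domain shares a cache, otherwise keep @target unchanged. */
static int maybe_pick_idle_sibling(struct task_struct *p,
                                   struct sched_domain *sd, int target)
{
        if (sd->flags & SD_SHARE_PKG_RESOURCES)
                return select_idle_sibling(p, sd, target);
        return target;
}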
diff --git a/kernel/smp.c b/kernel/smp.c
index f10408422444..9867b6bfefce 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -12,8 +12,6 @@
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/cpu.h> 13#include <linux/cpu.h>
14 14
15static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
16
17static struct { 15static struct {
18 struct list_head queue; 16 struct list_head queue;
19 raw_spinlock_t lock; 17 raw_spinlock_t lock;
@@ -33,12 +31,14 @@ struct call_function_data {
33 cpumask_var_t cpumask; 31 cpumask_var_t cpumask;
34}; 32};
35 33
34static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
35
36struct call_single_queue { 36struct call_single_queue {
37 struct list_head list; 37 struct list_head list;
38 raw_spinlock_t lock; 38 raw_spinlock_t lock;
39}; 39};
40 40
41static DEFINE_PER_CPU(struct call_function_data, cfd_data); 41static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue);
42 42
43static int 43static int
44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -256,7 +256,7 @@ void generic_smp_call_function_single_interrupt(void)
256 } 256 }
257} 257}
258 258
259static DEFINE_PER_CPU(struct call_single_data, csd_data); 259static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
260 260
261/* 261/*
262 * smp_call_function_single - Run a function on a specific CPU 262 * smp_call_function_single - Run a function on a specific CPU
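The smp.c hunks move the call-function data to DEFINE_PER_CPU_SHARED_ALIGNED so that per-CPU items written by remote CPUs get their own cache line and do not false-share with neighbouring per-CPU data. A minimal sketch of the macro's use; struct remote_counter and bump_remote() are made up, and runtime initialisation of the lock (raw_spin_lock_init() at boot) is omitted.

#include <linux/percpu.h>
#include <linux/spinlock.h>

struct remote_counter {
        raw_spinlock_t lock;
        unsigned long hits;
};

/* Cacheline-aligned because other CPUs take the lock and write hits. */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct remote_counter, remote_counter);

static void bump_remote(int cpu)        /* illustrative only */
{
        struct remote_counter *rc = &per_cpu(remote_counter, cpu);

        raw_spin_lock(&rc->lock);
        rc->hits++;
        raw_spin_unlock(&rc->lock);
}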
diff --git a/kernel/softirq.c b/kernel/softirq.c
index a09502e2ef75..7c1a67ef0274 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill);
500 */ 500 */
501 501
502/* 502/*
503 * The trampoline is called when the hrtimer expires. If this is 503 * The trampoline is called when the hrtimer expires. It schedules a tasklet
504 * called from the hrtimer interrupt then we schedule the tasklet as 504 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
505 * the timer callback function expects to run in softirq context. If 505 * hrtimer callback, but from softirq context.
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */ 506 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) 507static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{ 508{
511 struct tasklet_hrtimer *ttimer = 509 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer); 510 container_of(timer, struct tasklet_hrtimer, timer);
513 511
514 if (hrtimer_is_hres_active(timer)) { 512 tasklet_hi_schedule(&ttimer->tasklet);
515 tasklet_hi_schedule(&ttimer->tasklet); 513 return HRTIMER_NORESTART;
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519} 514}
520 515
521/* 516/*
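After this change the trampoline always defers into a tasklet, so a tasklet_hrtimer callback runs in softirq context whether or not high-resolution timers are active. A hedged usage sketch; my_poll(), my_init() and the 10 ms period are invented, the tasklet_hrtimer API itself is the one the hunk above touches.

#include <linux/interrupt.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct tasklet_hrtimer my_timer;

/* Runs in softirq context via the tasklet trampoline. */
static enum hrtimer_restart my_poll(struct hrtimer *t)
{
        /* ... do the periodic work ... */
        hrtimer_forward_now(t, ktime_set(0, 10 * NSEC_PER_MSEC));
        return HRTIMER_RESTART;
}

static void my_init(void)
{
        tasklet_hrtimer_init(&my_timer, my_poll,
                             CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        tasklet_hrtimer_start(&my_timer, ktime_set(0, 10 * NSEC_PER_MSEC),
                              HRTIMER_MODE_REL);
}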
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d22579087e27..0d4c7898ab80 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -25,6 +25,7 @@ static DEFINE_SPINLOCK(print_lock);
25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ 25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ 26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28static DEFINE_PER_CPU(bool, softlock_touch_sync);
28 29
29static int __read_mostly did_panic; 30static int __read_mostly did_panic;
30int __read_mostly softlockup_thresh = 60; 31int __read_mostly softlockup_thresh = 60;
@@ -79,6 +80,12 @@ void touch_softlockup_watchdog(void)
79} 80}
80EXPORT_SYMBOL(touch_softlockup_watchdog); 81EXPORT_SYMBOL(touch_softlockup_watchdog);
81 82
83void touch_softlockup_watchdog_sync(void)
84{
85 __raw_get_cpu_var(softlock_touch_sync) = true;
86 __raw_get_cpu_var(softlockup_touch_ts) = 0;
87}
88
82void touch_all_softlockup_watchdogs(void) 89void touch_all_softlockup_watchdogs(void)
83{ 90{
84 int cpu; 91 int cpu;
@@ -118,6 +125,14 @@ void softlockup_tick(void)
118 } 125 }
119 126
120 if (touch_ts == 0) { 127 if (touch_ts == 0) {
128 if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
129 /*
130 * If the time stamp was touched atomically
131 * make sure the scheduler tick is up to date.
132 */
133 per_cpu(softlock_touch_sync, this_cpu) = false;
134 sched_clock_tick();
135 }
121 __touch_softlockup_watchdog(); 136 __touch_softlockup_watchdog();
122 return; 137 return;
123 } 138 }
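touch_softlockup_watchdog_sync() is intended for paths (such as resuming from kgdb) where the CPU was stopped long enough that both the watchdog timestamp and sched_clock are stale: the per-CPU sync flag makes the next softlockup_tick() call sched_clock_tick() before re-arming. A hedged call-site sketch, assuming CONFIG_DETECT_SOFTLOCKUP; debugger_resume_cpu() is an invented stand-in for the real resume path.

#include <linux/sched.h>

static void debugger_resume_cpu(void)   /* illustrative only */
{
        /*
         * The CPU may have been halted for a long time; re-sync
         * sched_clock on the next tick and re-arm the watchdog so
         * the stall is not reported as a soft lockup.
         */
        touch_softlockup_watchdog_sync();
}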
diff --git a/kernel/sys.c b/kernel/sys.c
index 26a6b73a6b85..18bde979f346 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -222,6 +222,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
222 if (which > PRIO_USER || which < PRIO_PROCESS) 222 if (which > PRIO_USER || which < PRIO_PROCESS)
223 return -EINVAL; 223 return -EINVAL;
224 224
225 rcu_read_lock();
225 read_lock(&tasklist_lock); 226 read_lock(&tasklist_lock);
226 switch (which) { 227 switch (which) {
227 case PRIO_PROCESS: 228 case PRIO_PROCESS:
@@ -267,6 +268,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
267 } 268 }
268out_unlock: 269out_unlock:
269 read_unlock(&tasklist_lock); 270 read_unlock(&tasklist_lock);
271 rcu_read_unlock();
270 272
271 return retval; 273 return retval;
272} 274}
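The getpriority() hunks wrap the lookups in rcu_read_lock(), since find_task_by_vpid() walks RCU-protected pid structures and tasklist_lock alone does not keep them valid. A minimal sketch of the lookup pattern, assuming the task is only inspected inside the read-side section; task_nice_of() is an invented helper.

#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static int task_nice_of(pid_t pid, int *nice)   /* illustrative only */
{
        struct task_struct *p;
        int err = -ESRCH;

        rcu_read_lock();
        p = pid ? find_task_by_vpid(pid) : current;
        if (p) {
                *nice = task_nice(p);
                err = 0;
        }
        rcu_read_unlock();
        return err;
}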
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 6f740d9f0948..d7395fdfb9f3 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -259,7 +259,8 @@ void clockevents_notify(unsigned long reason, void *arg)
259 cpu = *((int *)arg); 259 cpu = *((int *)arg);
260 list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { 260 list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
261 if (cpumask_test_cpu(cpu, dev->cpumask) && 261 if (cpumask_test_cpu(cpu, dev->cpumask) &&
262 cpumask_weight(dev->cpumask) == 1) { 262 cpumask_weight(dev->cpumask) == 1 &&
263 !tick_is_broadcast_device(dev)) {
263 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 264 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
264 list_del(&dev->list); 265 list_del(&dev->list);
265 } 266 }
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e85c23404d34..13700833c181 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -343,7 +343,19 @@ static void clocksource_resume_watchdog(void)
343{ 343{
344 unsigned long flags; 344 unsigned long flags;
345 345
346 spin_lock_irqsave(&watchdog_lock, flags); 346 /*
347 * We use trylock here to avoid a potential dead lock when
348 * kgdb calls this code after the kernel has been stopped with
349 * watchdog_lock held. When watchdog_lock is held we just
350 * return and accept, that the watchdog might trigger and mark
351 * the monitored clock source (usually TSC) unstable.
352 *
353 * This does not affect the other caller clocksource_resume()
354 * because at this point the kernel is UP, interrupts are
355 * disabled and nothing can hold watchdog_lock.
356 */
357 if (!spin_trylock_irqsave(&watchdog_lock, flags))
358 return;
347 clocksource_reset_watchdog(); 359 clocksource_reset_watchdog();
348 spin_unlock_irqrestore(&watchdog_lock, flags); 360 spin_unlock_irqrestore(&watchdog_lock, flags);
349} 361}
@@ -458,8 +470,8 @@ void clocksource_resume(void)
458 * clocksource_touch_watchdog - Update watchdog 470 * clocksource_touch_watchdog - Update watchdog
459 * 471 *
460 * Update the watchdog after exception contexts such as kgdb so as not 472 * Update the watchdog after exception contexts such as kgdb so as not
461 * to incorrectly trip the watchdog. 473 * to incorrectly trip the watchdog. This might fail when the kernel
462 * 474 * was stopped in code which holds watchdog_lock.
463 */ 475 */
464void clocksource_touch_watchdog(void) 476void clocksource_touch_watchdog(void)
465{ 477{
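The trylock above is a general pattern for code that can be entered after the kernel was stopped (e.g. by kgdb) with the lock already held: skip the non-essential work instead of deadlocking. A small hedged sketch; state_lock and resync_state() are invented names.

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(state_lock);     /* illustrative only */

static void resync_state(void)
{
        unsigned long flags;

        /*
         * May run from an exception/debugger-resume context while
         * state_lock is already held; in that case just skip the
         * resync rather than deadlock.
         */
        if (!spin_trylock_irqsave(&state_lock, flags))
                return;
        /* ... refresh the protected state ... */
        spin_unlock_irqrestore(&state_lock, flags);
}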
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7faaa32fbf4f..e2ab064c6d41 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -880,6 +880,7 @@ void getboottime(struct timespec *ts)
880 880
881 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 881 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
882} 882}
883EXPORT_SYMBOL_GPL(getboottime);
883 884
884/** 885/**
885 * monotonic_to_bootbased - Convert the monotonic time to boot based. 886 * monotonic_to_bootbased - Convert the monotonic time to boot based.
@@ -889,6 +890,7 @@ void monotonic_to_bootbased(struct timespec *ts)
889{ 890{
890 *ts = timespec_add_safe(*ts, total_sleep_time); 891 *ts = timespec_add_safe(*ts, total_sleep_time);
891} 892}
893EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
892 894
893unsigned long get_seconds(void) 895unsigned long get_seconds(void)
894{ 896{
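Exporting getboottime() and monotonic_to_bootbased() lets modules convert monotonic timestamps into boot-based ones (and obtain the wall-clock time of boot). A hedged example of module-side use; report_boot_based() and the printk format are just for illustration.

#include <linux/time.h>
#include <linux/module.h>

static void report_boot_based(struct timespec mono)     /* illustrative */
{
        struct timespec boot = mono;

        monotonic_to_bootbased(&boot);  /* add time spent in suspend */
        printk(KERN_INFO "boot-based: %ld.%09ld\n",
               boot.tv_sec, boot.tv_nsec);
}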
diff --git a/kernel/timer.c b/kernel/timer.c
index 15533b792397..c61a7949387f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1198,6 +1198,7 @@ void update_process_times(int user_tick)
1198 run_local_timers(); 1198 run_local_timers();
1199 rcu_check_callbacks(cpu, user_tick); 1199 rcu_check_callbacks(cpu, user_tick);
1200 printk_tick(); 1200 printk_tick();
1201 perf_event_do_pending();
1201 scheduler_tick(); 1202 scheduler_tick();
1202 run_posix_cpu_timers(p); 1203 run_posix_cpu_timers(p);
1203} 1204}
@@ -1209,8 +1210,6 @@ static void run_timer_softirq(struct softirq_action *h)
1209{ 1210{
1210 struct tvec_base *base = __get_cpu_var(tvec_bases); 1211 struct tvec_base *base = __get_cpu_var(tvec_bases);
1211 1212
1212 perf_event_do_pending();
1213
1214 hrtimer_run_pending(); 1213 hrtimer_run_pending();
1215 1214
1216 if (time_after_eq(jiffies, base->timer_jiffies)) 1215 if (time_after_eq(jiffies, base->timer_jiffies))
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 6c22d8a2f289..60e2ce0181ee 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -27,9 +27,7 @@ config HAVE_FUNCTION_GRAPH_TRACER
27config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
28 bool 28 bool
29 help 29 help
30 An arch may pass in a unique value (frame pointer) to both the 30 See Documentation/trace/ftrace-design.txt
31 entering and exiting of a function. On exit, the value is compared
32 and if it does not match, then it will panic the kernel.
33 31
34config HAVE_FUNCTION_TRACE_MCOUNT_TEST 32config HAVE_FUNCTION_TRACE_MCOUNT_TEST
35 bool 33 bool
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index edefe3b2801b..8c1b2d290718 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -464,6 +464,8 @@ struct ring_buffer_iter {
464 struct ring_buffer_per_cpu *cpu_buffer; 464 struct ring_buffer_per_cpu *cpu_buffer;
465 unsigned long head; 465 unsigned long head;
466 struct buffer_page *head_page; 466 struct buffer_page *head_page;
467 struct buffer_page *cache_reader_page;
468 unsigned long cache_read;
467 u64 read_stamp; 469 u64 read_stamp;
468}; 470};
469 471
@@ -2716,6 +2718,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2716 iter->read_stamp = cpu_buffer->read_stamp; 2718 iter->read_stamp = cpu_buffer->read_stamp;
2717 else 2719 else
2718 iter->read_stamp = iter->head_page->page->time_stamp; 2720 iter->read_stamp = iter->head_page->page->time_stamp;
2721 iter->cache_reader_page = cpu_buffer->reader_page;
2722 iter->cache_read = cpu_buffer->read;
2719} 2723}
2720 2724
2721/** 2725/**
@@ -3060,13 +3064,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3060 struct ring_buffer_event *event; 3064 struct ring_buffer_event *event;
3061 int nr_loops = 0; 3065 int nr_loops = 0;
3062 3066
3063 if (ring_buffer_iter_empty(iter))
3064 return NULL;
3065
3066 cpu_buffer = iter->cpu_buffer; 3067 cpu_buffer = iter->cpu_buffer;
3067 buffer = cpu_buffer->buffer; 3068 buffer = cpu_buffer->buffer;
3068 3069
3070 /*
3071 * Check if someone performed a consuming read to
3072 * the buffer. A consuming read invalidates the iterator
3073 * and we need to reset the iterator in this case.
3074 */
3075 if (unlikely(iter->cache_read != cpu_buffer->read ||
3076 iter->cache_reader_page != cpu_buffer->reader_page))
3077 rb_iter_reset(iter);
3078
3069 again: 3079 again:
3080 if (ring_buffer_iter_empty(iter))
3081 return NULL;
3082
3070 /* 3083 /*
3071 * We repeat when a timestamp is encountered. 3084 * We repeat when a timestamp is encountered.
3072 * We can get multiple timestamps by nested interrupts or also 3085 * We can get multiple timestamps by nested interrupts or also
@@ -3081,6 +3094,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3081 if (rb_per_cpu_empty(cpu_buffer)) 3094 if (rb_per_cpu_empty(cpu_buffer))
3082 return NULL; 3095 return NULL;
3083 3096
3097 if (iter->head >= local_read(&iter->head_page->page->commit)) {
3098 rb_inc_iter(iter);
3099 goto again;
3100 }
3101
3084 event = rb_iter_head_event(iter); 3102 event = rb_iter_head_event(iter);
3085 3103
3086 switch (event->type_len) { 3104 switch (event->type_len) {
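The iterator now snapshots reader_page and the read count when it is (re)initialised, and rb_iter_peek() compares the snapshot against the live values to detect a consuming read that invalidated it. The same "cache a generation marker, check it before every use" idea in a small self-contained userspace C sketch (all names invented):

#include <stdio.h>

struct buf  { int read_count; /* bumped by every consuming read */ };
struct iter { struct buf *b; int cached_read; int pos; };

static void iter_reset(struct iter *it)
{
        it->cached_read = it->b->read_count;
        it->pos = 0;
}

static int iter_next(struct iter *it)
{
        /* A consuming read happened since we last looked: start over. */
        if (it->cached_read != it->b->read_count)
                iter_reset(it);
        return it->pos++;
}

int main(void)
{
        struct buf b = { .read_count = 0 };
        struct iter it = { .b = &b };

        iter_reset(&it);
        printf("%d\n", iter_next(&it));  /* 0 */
        printf("%d\n", iter_next(&it));  /* 1 */
        b.read_count++;                  /* simulate a consuming read */
        printf("%d\n", iter_next(&it));  /* iterator was reset: 0 again */
        return 0;
}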
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0df1b0f2cb9e..eac6875cb990 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -951,6 +951,11 @@ void trace_find_cmdline(int pid, char comm[])
951 return; 951 return;
952 } 952 }
953 953
954 if (WARN_ON_ONCE(pid < 0)) {
955 strcpy(comm, "<XXX>");
956 return;
957 }
958
954 if (pid > PID_MAX_DEFAULT) { 959 if (pid > PID_MAX_DEFAULT) {
955 strcpy(comm, "<...>"); 960 strcpy(comm, "<...>");
956 return; 961 return;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 6ea90c0e2c96..50b1b8239806 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -689,7 +689,7 @@ static int create_trace_probe(int argc, char **argv)
689 return -EINVAL; 689 return -EINVAL;
690 } 690 }
691 /* an address specified */ 691 /* an address specified */
692 ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); 692 ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr);
693 if (ret) { 693 if (ret) {
694 pr_info("Failed to parse address.\n"); 694 pr_info("Failed to parse address.\n");
695 return ret; 695 return ret;
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 678a5120ee30..f4bc9b27de5f 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
157 unsigned long val, flags; 157 unsigned long val, flags;
158 char buf[64]; 158 char buf[64];
159 int ret; 159 int ret;
160 int cpu;
160 161
161 if (count >= sizeof(buf)) 162 if (count >= sizeof(buf))
162 return -EINVAL; 163 return -EINVAL;
@@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
171 return ret; 172 return ret;
172 173
173 local_irq_save(flags); 174 local_irq_save(flags);
175
176 /*
177 * In case we trace inside arch_spin_lock() or after (NMI),
178 * we will cause circular lock, so we also need to increase
179 * the percpu trace_active here.
180 */
181 cpu = smp_processor_id();
182 per_cpu(trace_active, cpu)++;
183
174 arch_spin_lock(&max_stack_lock); 184 arch_spin_lock(&max_stack_lock);
175 *ptr = val; 185 *ptr = val;
176 arch_spin_unlock(&max_stack_lock); 186 arch_spin_unlock(&max_stack_lock);
187
188 per_cpu(trace_active, cpu)--;
177 local_irq_restore(flags); 189 local_irq_restore(flags);
178 190
179 return count; 191 return count;
@@ -206,7 +218,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
206 218
207static void *t_start(struct seq_file *m, loff_t *pos) 219static void *t_start(struct seq_file *m, loff_t *pos)
208{ 220{
221 int cpu;
222
209 local_irq_disable(); 223 local_irq_disable();
224
225 cpu = smp_processor_id();
226 per_cpu(trace_active, cpu)++;
227
210 arch_spin_lock(&max_stack_lock); 228 arch_spin_lock(&max_stack_lock);
211 229
212 if (*pos == 0) 230 if (*pos == 0)
@@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
217 235
218static void t_stop(struct seq_file *m, void *p) 236static void t_stop(struct seq_file *m, void *p)
219{ 237{
238 int cpu;
239
220 arch_spin_unlock(&max_stack_lock); 240 arch_spin_unlock(&max_stack_lock);
241
242 cpu = smp_processor_id();
243 per_cpu(trace_active, cpu)--;
244
221 local_irq_enable(); 245 local_irq_enable();
222} 246}
223 247
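The trace_active counter above is the stack tracer's recursion guard: bumping it before taking max_stack_lock means that if the tracer fires inside arch_spin_lock() (or from an NMI in that window) it sees trace_active != 0 and backs off instead of retaking the same lock. A hedged sketch of the guarded-section shape, written as if it lived alongside the code in trace_stack.c; with_max_stack_lock() is invented, while trace_active and max_stack_lock are the per-CPU counter and lock shown in the hunks.

static void with_max_stack_lock(void (*fn)(void))       /* illustrative */
{
        unsigned long flags;
        int cpu;

        local_irq_save(flags);

        cpu = smp_processor_id();
        per_cpu(trace_active, cpu)++;   /* stack tracer sees this and bails */

        arch_spin_lock(&max_stack_lock);
        fn();                           /* work done under the lock */
        arch_spin_unlock(&max_stack_lock);

        per_cpu(trace_active, cpu)--;
        local_irq_restore(flags);
}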