48 files changed, 1296 insertions, 484 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 35536d9c0964..76768ee812b2 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -220,9 +220,16 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
 endif
+config ARCH_SUPPORTS_ATOMIC_RMW
+        bool
 config MUTEX_SPIN_ON_OWNER
        def_bool y
-        depends on SMP && !DEBUG_MUTEXES
+        depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
+config RWSEM_SPIN_ON_OWNER
+       def_bool y
+       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
 config ARCH_USE_QUEUE_RWLOCK
        bool
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7868fc3c0bc5..70776aec2562 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1648,10 +1648,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                         int flags, const char *unused_dev_name,
                         void *data)
 {
+        struct super_block *pinned_sb = NULL;
+        struct cgroup_subsys *ss;
        struct cgroup_root *root;
        struct cgroup_sb_opts opts;
        struct dentry *dentry;
        int ret;
+        int i;
        bool new_sb;
        /*
@@ -1677,6 +1680,27 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                goto out_unlock;
        }
+        /*
+         * Destruction of cgroup root is asynchronous, so subsystems may
+         * still be dying after the previous unmount.  Let's drain the
+         * dying subsystems.  We just need to ensure that the ones
+         * unmounted previously finish dying and don't care about new ones
+         * starting.  Testing ref liveness is good enough.
+         */
+        for_each_subsys(ss, i) {
+                if (!(opts.subsys_mask & (1 << i)) ||
+                    ss->root == &cgrp_dfl_root)
+                        continue;
+                if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+                        mutex_unlock(&cgroup_mutex);
+                        msleep(10);
+                        ret = restart_syscall();
+                        goto out_free;
+                }
+                cgroup_put(&ss->root->cgrp);
+        }
        for_each_root(root) {
                bool name_match = false;
@@ -1717,15 +1741,23 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                }
                /*
-                 * A root's lifetime is governed by its root cgroup.
+                 * We want to reuse @root whose lifetime is governed by its
-                 * tryget_live failure indicate that the root is being
+                 * ->cgrp.  Let's check whether @root is alive and keep it
-                 * destroyed.  Wait for destruction to complete so that the
+                 * that way.  As cgroup_kill_sb() can happen anytime, we
-                 * subsystems are free.  We can use wait_queue for the wait
+                 * want to block it by pinning the sb so that @root doesn't
-                 * but this path is super cold.  Let's just sleep for a bit
+                 * get killed before mount is complete.
-                 * and retry.
+                 *
+                 * With the sb pinned, tryget_live can reliably indicate
+                 * whether @root can be reused.  If it's being killed,
+                 * drain it.  We can use wait_queue for the wait but this
+                 * path is super cold.  Let's just sleep a bit and retry.
                 */
-                if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+                pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+                if (IS_ERR(pinned_sb) ||
+                    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
                        mutex_unlock(&cgroup_mutex);
+                        if (!IS_ERR_OR_NULL(pinned_sb))
+                                deactivate_super(pinned_sb);
                        msleep(10);
                        ret = restart_syscall();
                        goto out_free;
@@ -1770,6 +1802,16 @@ out_free:
                                CGROUP_SUPER_MAGIC, &new_sb);
        if (IS_ERR(dentry) || !new_sb)
                cgroup_put(&root->cgrp);
+        /*
+         * If @pinned_sb, we're reusing an existing root and holding an
+         * extra ref on its sb.  Mount is complete.  Put the extra ref.
+         */
+        if (pinned_sb) {
+                WARN_ON(new_sb);
+                deactivate_super(pinned_sb);
+        }
        return dentry;
 }
@@ -3328,7 +3370,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
        rcu_read_lock();
        css_for_each_child(child, css) {
-                if (css->flags & CSS_ONLINE) {
+                if (child->flags & CSS_ONLINE) {
                        ret = true;
                        break;
                }
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 019d45008448..5664985c46a0 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -19,6 +19,7 @@
 #include <linux/sched.h>
 #include <linux/hardirq.h>
 #include <linux/export.h>
+#include <linux/kprobes.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/context_tracking.h>
@@ -104,6 +105,7 @@ void context_tracking_user_enter(void)
        }
        local_irq_restore(flags);
 }
+NOKPROBE_SYMBOL(context_tracking_user_enter);
 #ifdef CONFIG_PREEMPT
 /**
@@ -181,6 +183,7 @@ void context_tracking_user_exit(void)
        }
        local_irq_restore(flags);
 }
+NOKPROBE_SYMBOL(context_tracking_user_exit);
 /**
 * __context_tracking_task_switch - context switch the syscall callbacks
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f6b33c696224..116a4164720a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1181,7 +1181,13 @@ done:
 int current_cpuset_is_being_rebound(void)
 {
-        return task_cs(current) == cpuset_being_rebound;
+        int ret;
+        rcu_read_lock();
+        ret = task_cs(current) == cpuset_being_rebound;
+        rcu_read_unlock();
+        return ret;
 }
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
@@ -1617,7 +1623,17 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
         * resources, wait for the previously scheduled operations before
         * proceeding, so that we don't end up keep removing tasks added
         * after execution capability is restored.
+         *
+         * cpuset_hotplug_work calls back into cgroup core via
+         * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+         * operation like this one can lead to a deadlock through kernfs
+         * active_ref protection.  Let's break the protection.  Losing the
+         * protection is okay as we check whether @cs is online after
+         * grabbing cpuset_mutex anyway.  This only happens on the legacy
+         * hierarchies.
         */
+        css_get(&cs->css);
+        kernfs_break_active_protection(of->kn);
        flush_work(&cpuset_hotplug_work);
        mutex_lock(&cpuset_mutex);
@@ -1645,6 +1661,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
        free_trial_cpuset(trialcs);
 out_unlock:
        mutex_unlock(&cpuset_mutex);
+        kernfs_unbreak_active_protection(of->kn);
+        css_put(&cs->css);
        return retval ?: nbytes;
 }
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5fa58e4cffac..6b17ac1b0c2a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -40,6 +40,7 @@
 #include <linux/mm_types.h>
 #include <linux/cgroup.h>
 #include <linux/module.h>
+#include <linux/mman.h>
 #include "internal.h"
@@ -2319,7 +2320,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
        next_parent = rcu_dereference(next_ctx->parent_ctx);
        /* If neither context have a parent context; they cannot be clones. */
-        if (!parent && !next_parent)
+        if (!parent || !next_parent)
                goto unlock;
        if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
@@ -5128,6 +5129,7 @@ struct perf_mmap_event {
        int                     maj, min;
        u64                     ino;
        u64                     ino_generation;
+        u32                     prot, flags;
        struct {
                struct perf_event_header        header;
@@ -5169,6 +5171,8 @@ static void perf_event_mmap_output(struct perf_event *event,
                mmap_event->event_id.header.size += sizeof(mmap_event->min);
                mmap_event->event_id.header.size += sizeof(mmap_event->ino);
                mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
+                mmap_event->event_id.header.size += sizeof(mmap_event->prot);
+                mmap_event->event_id.header.size += sizeof(mmap_event->flags);
        }
        perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
@@ -5187,6 +5191,8 @@ static void perf_event_mmap_output(struct perf_event *event,
                perf_output_put(&handle, mmap_event->min);
                perf_output_put(&handle, mmap_event->ino);
                perf_output_put(&handle, mmap_event->ino_generation);
+                perf_output_put(&handle, mmap_event->prot);
+                perf_output_put(&handle, mmap_event->flags);
        }
        __output_copy(&handle, mmap_event->file_name,
@@ -5205,6 +5211,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
        struct file *file = vma->vm_file;
        int maj = 0, min = 0;
        u64 ino = 0, gen = 0;
+        u32 prot = 0, flags = 0;
        unsigned int size;
        char tmp[16];
        char *buf = NULL;
@@ -5235,6 +5242,28 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
                gen = inode->i_generation;
                maj = MAJOR(dev);
                min = MINOR(dev);
+                if (vma->vm_flags & VM_READ)
+                        prot |= PROT_READ;
+                if (vma->vm_flags & VM_WRITE)
+                        prot |= PROT_WRITE;
+                if (vma->vm_flags & VM_EXEC)
+                        prot |= PROT_EXEC;
+                if (vma->vm_flags & VM_MAYSHARE)
+                        flags = MAP_SHARED;
+                else
+                        flags = MAP_PRIVATE;
+                if (vma->vm_flags & VM_DENYWRITE)
+                        flags |= MAP_DENYWRITE;
+                if (vma->vm_flags & VM_MAYEXEC)
+                        flags |= MAP_EXECUTABLE;
+                if (vma->vm_flags & VM_LOCKED)
+                        flags |= MAP_LOCKED;
+                if (vma->vm_flags & VM_HUGETLB)
+                        flags |= MAP_HUGETLB;
                goto got_name;
        } else {
                name = (char *)arch_vma_name(vma);
@@ -5275,6 +5304,8 @@ got_name:
        mmap_event->min = min;
        mmap_event->ino = ino;
        mmap_event->ino_generation = gen;
+        mmap_event->prot = prot;
+        mmap_event->flags = flags;
        if (!(vma->vm_flags & VM_EXEC))
                mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
@@ -5315,6 +5346,8 @@ void perf_event_mmap(struct vm_area_struct *vma)
                /* .min (attr_mmap2 only) */
                /* .ino (attr_mmap2 only) */
                /* .ino_generation (attr_mmap2 only) */
+                /* .prot (attr_mmap2 only) */
+                /* .flags (attr_mmap2 only) */
        };
        perf_event_mmap_event(&mmap_event);
@@ -6897,10 +6930,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
        if (ret)
                return -EFAULT;
-        /* disabled for now */
-        if (attr->mmap2)
-                return -EINVAL;
        if (attr->__reserved_1)
                return -EINVAL;
@@ -7429,7 +7458,19 @@ __perf_event_exit_task(struct perf_event *child_event,
                         struct perf_event_context *child_ctx,
                         struct task_struct *child)
 {
-        perf_remove_from_context(child_event, true);
+        /*
+         * Do not destroy the 'original' grouping; because of the context
+         * switch optimization the original events could've ended up in a
+         * random child task.
+         *
+         * If we were to destroy the original group, all group related
+         * operations would cease to function properly after this random
+         * child dies.
+         *
+         * Do destroy all inherited groups, we don't care about those
+         * and being thorough is better.
+         */
+        perf_remove_from_context(child_event, !!child_event->parent);
        /*
         * It can happen that the parent exits first, and has events
@@ -7445,7 +7486,7 @@ __perf_event_exit_task(struct perf_event *child_event,
 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 {
        struct perf_event *child_event, *next;
-        struct perf_event_context *child_ctx;
+        struct perf_event_context *child_ctx, *parent_ctx;
        unsigned long flags;
        if (likely(!child->perf_event_ctxp[ctxn])) {
@@ -7470,6 +7511,15 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
        raw_spin_lock(&child_ctx->lock);
        task_ctx_sched_out(child_ctx);
        child->perf_event_ctxp[ctxn] = NULL;
+        /*
+         * In order to avoid freeing: child_ctx->parent_ctx->task
+         * under perf_event_context::lock, grab another reference.
+         */
+        parent_ctx = child_ctx->parent_ctx;
+        if (parent_ctx)
+                get_ctx(parent_ctx);
        /*
         * If this context is a clone; unclone it so it can't get
         * swapped to another process while we're removing all
@@ -7480,6 +7530,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
        raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
        /*
+         * Now that we no longer hold perf_event_context::lock, drop
+         * our extra child_ctx->parent_ctx reference.
+         */
+        if (parent_ctx)
+                put_ctx(parent_ctx);
+        /*
         * Report the task dead after unscheduling the events so that we
         * won't get any samples after PERF_RECORD_EXIT. We can however still
         * get a few PERF_RECORD_READ events.
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c445e392e93f..6f3254e8c137 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -846,7 +846,7 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u
 {
        int err;
-        if (!consumer_del(uprobe, uc))  /* WARN? */
+        if (WARN_ON(!consumer_del(uprobe, uc)))
                return;
        err = register_for_each_vma(uprobe, NULL);
@@ -927,7 +927,7 @@ int uprobe_apply(struct inode *inode, loff_t offset,
        int ret = -ENOENT;
        uprobe = find_uprobe(inode, offset);
-        if (!uprobe)
+        if (WARN_ON(!uprobe))
                return ret;
        down_write(&uprobe->register_rwsem);
@@ -952,7 +952,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
        struct uprobe *uprobe;
        uprobe = find_uprobe(inode, offset);
-        if (!uprobe)
+        if (WARN_ON(!uprobe))
                return;
        down_write(&uprobe->register_rwsem);
diff --git a/kernel/fork.c b/kernel/fork.c
index d2799d1fc952..6a13c46cd87d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1487,7 +1487,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        total_forks++;
        spin_unlock(&current->sighand->siglock);
+        syscall_tracepoint_update(p);
        write_unlock_irq(&tasklist_lock);
        proc_fork_connector(p);
        cgroup_post_fork(p);
        if (clone_flags & CLONE_THREAD)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 7339e42a85ab..1487a123db5c 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -455,9 +455,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_hwirqs);
 */
 void irq_free_hwirqs(unsigned int from, int cnt)
 {
-        int i;
+        int i, j;
-        for (i = from; cnt > 0; i++, cnt--) {
+        for (i = from, j = cnt; j > 0; i++, j--) {
                irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
                arch_teardown_hwirq(i);
        }
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 6748688813d0..369f41a94124 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1617,6 +1617,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 #ifdef CONFIG_MEMORY_FAILURE
        VMCOREINFO_NUMBER(PG_hwpoison);
 #endif
+        VMCOREINFO_NUMBER(PG_head_mask);
        VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
        arch_crash_save_vmcoreinfo();
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3214289df5a7..734e9a7d280b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2037,19 +2037,23 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
 {
        unsigned long *iter;
        struct kprobe_blacklist_entry *ent;
-        unsigned long offset = 0, size = 0;
+        unsigned long entry, offset = 0, size = 0;
        for (iter = start; iter < end; iter++) {
-                if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) {
+                entry = arch_deref_entry_point((void *)*iter);
-                        pr_err("Failed to find blacklist %p\n", (void *)*iter);
+                if (!kernel_text_address(entry) ||
+                    !kallsyms_lookup_size_offset(entry, &size, &offset)) {
+                        pr_err("Failed to find blacklist at %p\n",
+                                (void *)entry);
                        continue;
                }
                ent = kmalloc(sizeof(*ent), GFP_KERNEL);
                if (!ent)
                        return -ENOMEM;
-                ent->start_addr = *iter;
+                ent->start_addr = entry;
-                ent->end_addr = *iter + size;
+                ent->end_addr = entry + size;
                INIT_LIST_HEAD(&ent->list);
                list_add_tail(&ent->list, &kprobe_blacklist);
        }
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 838dc9e00669..be9ee1559fca 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -14,21 +14,47 @@
 * called from interrupt context and we have preemption disabled while
 * spinning.
 */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
+/*
+ * We use the value 0 to represent "no CPU", thus the encoded value
+ * will be the CPU number incremented by 1.
+ */
+static inline int encode_cpu(int cpu_nr)
+{
+        return cpu_nr + 1;
+}
+static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
+{
+        int cpu_nr = encoded_cpu_val - 1;
+        return per_cpu_ptr(&osq_node, cpu_nr);
+}
 /*
 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
 * Can return NULL in case we were the last queued and we updated @lock instead.
 */
-static inline struct optimistic_spin_queue *
+static inline struct optimistic_spin_node *
-osq_wait_next(struct optimistic_spin_queue **lock,
+osq_wait_next(struct optimistic_spin_queue *lock,
-              struct optimistic_spin_queue *node,
+              struct optimistic_spin_node *node,
-              struct optimistic_spin_queue *prev)
+              struct optimistic_spin_node *prev)
 {
-        struct optimistic_spin_queue *next = NULL;
+        struct optimistic_spin_node *next = NULL;
+        int curr = encode_cpu(smp_processor_id());
+        int old;
+        /*
+         * If there is a prev node in queue, then the 'old' value will be
+         * the prev node's encoded CPU #, else it's set to OSQ_UNLOCKED_VAL:
+         * if we're currently last in queue, the queue becomes empty.
+         */
+        old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
        for (;;) {
-                if (*lock == node && cmpxchg(lock, node, prev) == node) {
+                if (atomic_read(&lock->tail) == curr &&
+                    atomic_cmpxchg(&lock->tail, curr, old) == curr) {
                        /*
                         * We were the last queued, we moved @lock back. @prev
                         * will now observe @lock and will complete its
@@ -59,18 +85,23 @@ osq_wait_next(struct optimistic_spin_queue **lock,
        return next;
 }
-bool osq_lock(struct optimistic_spin_queue **lock)
+bool osq_lock(struct optimistic_spin_queue *lock)
 {
-        struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
+        struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
-        struct optimistic_spin_queue *prev, *next;
+        struct optimistic_spin_node *prev, *next;
+        int curr = encode_cpu(smp_processor_id());
+        int old;
        node->locked = 0;
        node->next = NULL;
+        node->cpu = curr;
-        node->prev = prev = xchg(lock, node);
+        old = atomic_xchg(&lock->tail, curr);
-        if (likely(prev == NULL))
+        if (old == OSQ_UNLOCKED_VAL)
                return true;
+        prev = decode_cpu(old);
+        node->prev = prev;
        ACCESS_ONCE(prev->next) = node;
        /*
@@ -149,20 +180,21 @@ unqueue:
        return false;
 }
-void osq_unlock(struct optimistic_spin_queue **lock)
+void osq_unlock(struct optimistic_spin_queue *lock)
 {
-        struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
+        struct optimistic_spin_node *node, *next;
-        struct optimistic_spin_queue *next;
+        int curr = encode_cpu(smp_processor_id());
        /*
         * Fast path for the uncontended case.
         */
-        if (likely(cmpxchg(lock, node, NULL) == node))
+        if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
                return;
        /*
         * Second most likely case.
         */
+        node = this_cpu_ptr(&osq_node);
        next = xchg(&node->next, NULL);
        if (next) {
                ACCESS_ONCE(next->locked) = 1;
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index a2dbac4aca6b..74356dc0ce29 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -118,12 +118,13 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 * mutex_lock()/rwsem_down_{read,write}() etc.
 */
-struct optimistic_spin_queue {
+struct optimistic_spin_node {
-        struct optimistic_spin_queue *next, *prev;
+        struct optimistic_spin_node *next, *prev;
        int locked; /* 1 if lock acquired */
+        int cpu; /* encoded CPU # value */
 };
-extern bool osq_lock(struct optimistic_spin_queue **lock);
+extern bool osq_lock(struct optimistic_spin_queue *lock);
-extern void osq_unlock(struct optimistic_spin_queue **lock);
+extern void osq_unlock(struct optimistic_spin_queue *lock);
 #endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index bc73d33c6760..acca2c1a3c5e 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
        INIT_LIST_HEAD(&lock->wait_list);
        mutex_clear_owner(lock);
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-        lock->osq = NULL;
+        osq_lock_init(&lock->osq);
 #endif
        debug_mutex_init(lock, name, key);
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index 14193d596d78..ab29b6a22669 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -31,3 +31,8 @@ static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
 {
        return (waiter != NULL);
 }
+static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
+{
+        debug_rt_mutex_print_deadlock(w);
+}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index a620d4d08ca6..fc605941b9b8 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -83,6 +83,47 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
                owner = *p;
        } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
 }
+/*
+ * Safe fastpath aware unlock:
+ * 1) Clear the waiters bit
+ * 2) Drop lock->wait_lock
+ * 3) Try to unlock the lock with cmpxchg
+ */
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+        __releases(lock->wait_lock)
+{
+        struct task_struct *owner = rt_mutex_owner(lock);
+        clear_rt_mutex_waiters(lock);
+        raw_spin_unlock(&lock->wait_lock);
+        /*
+         * If a new waiter comes in between the unlock and the cmpxchg
+         * we have two situations:
+         *
+         * unlock(wait_lock);
+         *                                      lock(wait_lock);
+         * cmpxchg(p, owner, 0) == owner
+         *                                      mark_rt_mutex_waiters(lock);
+         *                                      acquire(lock);
+         * or:
+         *
+         * unlock(wait_lock);
+         *                                      lock(wait_lock);
+         *                                      mark_rt_mutex_waiters(lock);
+         *
+         * cmpxchg(p, owner, 0) != owner
+         *                                      enqueue_waiter();
+         *                                      unlock(wait_lock);
+         * lock(wait_lock);
+         * wake waiter();
+         * unlock(wait_lock);
+         *                                      lock(wait_lock);
+         *                                      acquire(lock);
+         */
+        return rt_mutex_cmpxchg(lock, owner, NULL);
+}
 #else
 # define rt_mutex_cmpxchg(l,c,n)        (0)
 static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
@@ -90,6 +131,17 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
        lock->owner = (struct task_struct *)
                        ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
 }
+/*
+ * Simple slow path only version: lock->owner is protected by lock->wait_lock.
+ */
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+        __releases(lock->wait_lock)
+{
+        lock->owner = NULL;
+        raw_spin_unlock(&lock->wait_lock);
+        return true;
+}
 #endif
 static inline int
@@ -260,27 +312,36 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
 */
 int max_lock_depth = 1024;
+static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
+{
+        return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
+}
 /*
 * Adjust the priority chain. Also used for deadlock detection.
 * Decreases task's usage by one - may thus free the task.
 *
- * @task: the task owning the mutex (owner) for which a chain walk is probably
+ * @task:       the task owning the mutex (owner) for which a chain walk is
- *        needed
+ *              probably needed
 * @deadlock_detect: do we have to carry out deadlock detection?
- * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
+ * @orig_lock:  the mutex (can be NULL if we are walking the chain to recheck
- *             things for a task that has just got its priority adjusted, and
+ *              things for a task that has just got its priority adjusted, and
- *             is waiting on a mutex)
+ *              is waiting on a mutex)
+ * @next_lock:  the mutex on which the owner of @orig_lock was blocked before
+ *              we dropped its pi_lock. Is never dereferenced, only used for
+ *              comparison to detect lock chain changes.
 * @orig_waiter: rt_mutex_waiter struct for the task that has just donated
- *               its priority to the mutex owner (can be NULL in the case
+ *              its priority to the mutex owner (can be NULL in the case
- *               depicted above or if the top waiter is gone away and we are
+ *              depicted above or if the top waiter is gone away and we are
- *               actually deboosting the owner)
+ *              actually deboosting the owner)
- * @top_task: the current top waiter
+ * @top_task:   the current top waiter
 *
 * Returns 0 or -EDEADLK.
 */
 static int rt_mutex_adjust_prio_chain(struct task_struct *task,
                                      int deadlock_detect,
                                      struct rt_mutex *orig_lock,
+                                      struct rt_mutex *next_lock,
                                      struct rt_mutex_waiter *orig_waiter,
                                      struct task_struct *top_task)
 {
@@ -314,7 +375,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
                }
                put_task_struct(task);
-                return deadlock_detect ? -EDEADLK : 0;
+                return -EDEADLK;
        }
 retry:
        /*
@@ -339,6 +400,18 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
                goto out_unlock_pi;
        /*
+         * We dropped all locks after taking a refcount on @task, so
+         * the task might have moved on in the lock chain or even left
+         * the chain completely and blocks now on an unrelated lock or
+         * on @orig_lock.
+         *
+         * We stored the lock on which @task was blocked in @next_lock,
+         * so we can detect the chain change.
+         */
+        if (next_lock != waiter->lock)
+                goto out_unlock_pi;
+        /*
         * Drop out, when the task has no waiters. Note,
         * top_waiter can be NULL, when we are in the deboosting
         * mode!
@@ -377,7 +450,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
        if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
                debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
                raw_spin_unlock(&lock->wait_lock);
-                ret = deadlock_detect ? -EDEADLK : 0;
+                ret = -EDEADLK;
                goto out_unlock_pi;
        }
@@ -422,11 +495,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
                __rt_mutex_adjust_prio(task);
        }
+        /*
+         * Check whether the task which owns the current lock is pi
+         * blocked itself. If yes we store a pointer to the lock for
+         * the lock chain change detection above. After we dropped
+         * task->pi_lock next_lock cannot be dereferenced anymore.
+         */
+        next_lock = task_blocked_on_lock(task);
        raw_spin_unlock_irqrestore(&task->pi_lock, flags);
        top_waiter = rt_mutex_top_waiter(lock);
        raw_spin_unlock(&lock->wait_lock);
+        /*
+         * We reached the end of the lock chain. Stop right here; there is
+         * no point in going back just to figure that out.
+         */
+        if (!next_lock)
+                goto out_put_task;
        if (!detect_deadlock && waiter != top_waiter)
                goto out_put_task;
@@ -536,8 +624,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 {
        struct task_struct *owner = rt_mutex_owner(lock);
        struct rt_mutex_waiter *top_waiter = waiter;
-        unsigned long flags;
+        struct rt_mutex *next_lock;
        int chain_walk = 0, res;
+        unsigned long flags;
        /*
         * Early deadlock detection. We really don't want the task to
@@ -548,7 +637,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
         * which is wrong, as the other waiter is not in a deadlock
         * situation.
         */
-        if (detect_deadlock && owner == task)
+        if (owner == task)
                return -EDEADLK;
        raw_spin_lock_irqsave(&task->pi_lock, flags);
@@ -569,20 +658,28 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
        if (!owner)
                return 0;
+        raw_spin_lock_irqsave(&owner->pi_lock, flags);
        if (waiter == rt_mutex_top_waiter(lock)) {
-                raw_spin_lock_irqsave(&owner->pi_lock, flags);
                rt_mutex_dequeue_pi(owner, top_waiter);
                rt_mutex_enqueue_pi(owner, waiter);
                __rt_mutex_adjust_prio(owner);
                if (owner->pi_blocked_on)
                        chain_walk = 1;
-                raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+        } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
-        }
-        else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
                chain_walk = 1;
+        }
-        if (!chain_walk)
+        /* Store the lock on which owner is blocked or NULL */
+        next_lock = task_blocked_on_lock(owner);
+        raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+        /*
+         * Even if full deadlock detection is on, if the owner is not
+         * blocked itself, we can avoid finding this out in the chain
+         * walk.
+         */
+        if (!chain_walk || !next_lock)
                return 0;
        /*
@@ -594,8 +691,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
        raw_spin_unlock(&lock->wait_lock);
-        res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
+        res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock,
-                                         task);
+                                         next_lock, waiter, task);
        raw_spin_lock(&lock->wait_lock);
@@ -605,7 +702,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 /*
 * Wake up the next waiter on the lock.
 *
- * Remove the top waiter from the current tasks waiter list and wake it up.
+ * Remove the top waiter from the current task's pi waiter list and
+ * wake it up.
 *
 * Called with lock->wait_lock held.
 */
@@ -626,10 +724,23 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
         */
        rt_mutex_dequeue_pi(current, waiter);
-        rt_mutex_set_owner(lock, NULL);
+        /*
+         * As we are waking up the top waiter, and the waiter stays
+         * queued on the lock until it gets the lock, this lock
+         * obviously has waiters. Just set the bit here and this has
+         * the added benefit of forcing all new tasks into the
+         * slow path making sure no task of lower priority than
+         * the top waiter can steal this lock.
+         */
+        lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
        raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+        /*
+         * It's safe to dereference waiter as it cannot go away as
+         * long as we hold lock->wait_lock. The waiter task needs to
+         * acquire it in order to dequeue the waiter.
+         */
        wake_up_process(waiter->task);
 }
@@ -644,8 +755,8 @@ static void remove_waiter(struct rt_mutex *lock,
 {
        int first = (waiter == rt_mutex_top_waiter(lock));
        struct task_struct *owner = rt_mutex_owner(lock);
+        struct rt_mutex *next_lock = NULL;
        unsigned long flags;
-        int chain_walk = 0;
        raw_spin_lock_irqsave(&current->pi_lock, flags);
        rt_mutex_dequeue(lock, waiter);
@@ -669,13 +780,13 @@ static void remove_waiter(struct rt_mutex *lock,
                }
                __rt_mutex_adjust_prio(owner);
-                if (owner->pi_blocked_on)
+                /* Store the lock on which owner is blocked or NULL */
-                        chain_walk = 1;
+                next_lock = task_blocked_on_lock(owner);
                raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
        }
-        if (!chain_walk)
+        if (!next_lock)
                return;
        /* gets dropped in rt_mutex_adjust_prio_chain()! */
@@ -683,7 +794,7 @@ static void remove_waiter(struct rt_mutex *lock,
        raw_spin_unlock(&lock->wait_lock);
-        rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
+        rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current);
        raw_spin_lock(&lock->wait_lock);
 }
@@ -696,6 +807,7 @@ static void remove_waiter(struct rt_mutex *lock,
 void rt_mutex_adjust_pi(struct task_struct *task)
 {
        struct rt_mutex_waiter *waiter;
+        struct rt_mutex *next_lock;
        unsigned long flags;
        raw_spin_lock_irqsave(&task->pi_lock, flags);
@@ -706,12 +818,13 @@ void rt_mutex_adjust_pi(struct task_struct *task)
                raw_spin_unlock_irqrestore(&task->pi_lock, flags);
                return;
        }
+        next_lock = waiter->lock;
        raw_spin_unlock_irqrestore(&task->pi_lock, flags);
        /* gets dropped in rt_mutex_adjust_prio_chain()! */
        get_task_struct(task);
-        rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
+        rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task);
 }
 /**
@@ -763,6 +876,26 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
        return ret;
 }
+/*
+ * rt_mutex_handle_deadlock - park a task that ran into a deadlock without
+ *                            having asked for deadlock detection
+ * @res:             result of the failed lock attempt
+ * @detect_deadlock: non-zero if the caller requested deadlock detection
+ * @w:               waiter handed to rt_mutex_print_deadlock()
+ *
+ * Returns immediately unless @res is -EDEADLK and @detect_deadlock is
+ * zero. In that case the deadlock is reported and this function never
+ * returns.
+ */
+static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
+                                     struct rt_mutex_waiter *w)
+{
+        /*
+         * If the result is not -EDEADLK or the caller requested
+         * deadlock detection, nothing to do here. The lock paths
+         * report a deadlock as -EDEADLK, so test for exactly that
+         * value: EDEADLOCK is not the same number on every
+         * architecture.
+         */
+        if (res != -EDEADLK || detect_deadlock)
+                return;
+        /*
+         * Yell loudly and stop the task right here.
+         */
+        rt_mutex_print_deadlock(w);
+        /* Never return: go back to sleep after every wakeup. */
+        while (1) {
+                set_current_state(TASK_INTERRUPTIBLE);
+                schedule();
+        }
+}
 /*
 * Slow path lock function:
 */
@@ -802,8 +935,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
        set_current_state(TASK_RUNNING);
-        if (unlikely(ret))
+        if (unlikely(ret)) {
                remove_waiter(lock, &waiter);
+                rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter);
+        }
        /*
         * try_to_take_rt_mutex() sets the waiter bit
@@ -859,12 +994,49 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
        rt_mutex_deadlock_account_unlock(current);
-        if (!rt_mutex_has_waiters(lock)) {
+        /*
-                lock->owner = NULL;
+         * We must be careful here if the fast path is enabled. If we
-                raw_spin_unlock(&lock->wait_lock);
+         * have no waiters queued we cannot set owner to NULL here
-                return;
+         * because of:
+         *
+         * foo->lock->owner = NULL;
+         *                      rtmutex_lock(foo->lock);   <- fast path
+         *                      free = atomic_dec_and_test(foo->refcnt);
+         *                      rtmutex_unlock(foo->lock); <- fast path
+         *                      if (free)
+         *                              kfree(foo);
+         * raw_spin_unlock(foo->lock->wait_lock);
+         *
+         * So for the fastpath enabled kernel:
+         *
+         * Nothing can set the waiters bit as long as we hold
+         * lock->wait_lock. So we do the following sequence:
+         *
+         *      owner = rt_mutex_owner(lock);
+         *      clear_rt_mutex_waiters(lock);
+         *      raw_spin_unlock(&lock->wait_lock);
+         *      if (cmpxchg(&lock->owner, owner, 0) == owner)
+         *              return;
+         *      goto retry;
+         *
+         * The fastpath disabled variant is simple as all access to
+         * lock->owner is serialized by lock->wait_lock:
+         *
+         *      lock->owner = NULL;
+         *      raw_spin_unlock(&lock->wait_lock);
+         */
+        while (!rt_mutex_has_waiters(lock)) {
+                /* Drops lock->wait_lock ! */
+                if (unlock_rt_mutex_safe(lock) == true)
+                        return;
+                /* Relock the rtmutex and try again */
+                raw_spin_lock(&lock->wait_lock);
        }
+        /*
+         * The wakeup next waiter path does not suffer from the above
+         * race. See the comments there.
+         */
        wakeup_next_waiter(lock);
        raw_spin_unlock(&lock->wait_lock);
@@ -1112,7 +1284,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
                return 1;
        }
-        ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
+        /* We enforce deadlock detection for futexes */
+        ret = task_blocks_on_rt_mutex(lock, waiter, task, 1);
        if (ret && !rt_mutex_owner(lock)) {
                /*
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index a1a1dd06421d..f6a1f3c133b1 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -24,3 +24,8 @@
 #define debug_rt_mutex_print_deadlock(w)                do { } while (0)
 #define debug_rt_mutex_detect_deadlock(w,d)             (d)
 #define debug_rt_mutex_reset_waiter(w)                  do { } while (0)
+/*
+ * Report a detected rtmutex deadlock with an unconditional WARN().
+ * The waiter @w is not used by this variant.
+ */
+static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
+{
+        WARN(1, "rtmutex deadlock detected\n");
+}
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 9be8a9144978..2c93571162cb 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -26,7 +26,7 @@ int rwsem_is_locked(struct rw_semaphore *sem)
        unsigned long flags;
        if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
-                ret = (sem->activity != 0);
+                ret = (sem->count != 0);
                raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
        }
        return ret;
@@ -46,7 +46,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
        debug_check_no_locks_freed((void *)sem, sizeof(*sem));
        lockdep_init_map(&sem->dep_map, name, key, 0);
 #endif
-        sem->activity = 0;
+        sem->count = 0;
        raw_spin_lock_init(&sem->wait_lock);
        INIT_LIST_HEAD(&sem->wait_list);
 }
@@ -95,7 +95,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
                waiter = list_entry(next, struct rwsem_waiter, list);
        } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
-        sem->activity += woken;
+        sem->count += woken;
 out:
        return sem;
@@ -126,9 +126,9 @@ void __sched __down_read(struct rw_semaphore *sem)
        raw_spin_lock_irqsave(&sem->wait_lock, flags);
-        if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
+        if (sem->count >= 0 && list_empty(&sem->wait_list)) {
                /* granted */
-                sem->activity++;
+                sem->count++;
                raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
                goto out;
        }
@@ -170,9 +170,9 @@ int __down_read_trylock(struct rw_semaphore *sem)
        raw_spin_lock_irqsave(&sem->wait_lock, flags);
-        if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
+        if (sem->count >= 0 && list_empty(&sem->wait_list)) {
                /* granted */
-                sem->activity++;
+                sem->count++;
                ret = 1;
        }
@@ -206,7 +206,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
                 * itself into sleep and waiting for system woke it or someone
                 * else in the head of the wait list up.
                 */
-                if (sem->activity == 0)
+                if (sem->count == 0)
                        break;
                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
                raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -214,7 +214,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
                raw_spin_lock_irqsave(&sem->wait_lock, flags);
        }
        /* got the lock */
-        sem->activity = -1;
+        sem->count = -1;
        list_del(&waiter.list);
        raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -235,9 +235,9 @@ int __down_write_trylock(struct rw_semaphore *sem)
        raw_spin_lock_irqsave(&sem->wait_lock, flags);
-        if (sem->activity == 0) {
+        if (sem->count == 0) {
                /* got the lock */
-                sem->activity = -1;
+                sem->count = -1;
                ret = 1;
        }
@@ -255,7 +255,7 @@ void __up_read(struct rw_semaphore *sem)
        raw_spin_lock_irqsave(&sem->wait_lock, flags);
-        if (--sem->activity == 0 && !list_empty(&sem->wait_list))
+        if (--sem->count == 0 && !list_empty(&sem->wait_list))
                sem = __rwsem_wake_one_writer(sem);
        raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -270,7 +270,7 @@ void __up_write(struct rw_semaphore *sem)
        raw_spin_lock_irqsave(&sem->wait_lock, flags);
-        sem->activity = 0;
+        sem->count = 0;
        if (!list_empty(&sem->wait_list))
                sem = __rwsem_do_wake(sem, 1);
@@ -287,7 +287,7 @@ void __downgrade_write(struct rw_semaphore *sem)
        raw_spin_lock_irqsave(&sem->wait_lock, flags);
-        sem->activity = 1;
+        sem->count = 1;
        if (!list_empty(&sem->wait_list))
                sem = __rwsem_do_wake(sem, 0);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index dacc32142fcc..a2391ac135c8 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -82,9 +82,9 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
        sem->count = RWSEM_UNLOCKED_VALUE;
        raw_spin_lock_init(&sem->wait_lock);
        INIT_LIST_HEAD(&sem->wait_list);
-#ifdef CONFIG_SMP
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
        sem->owner = NULL;
-        sem->osq = NULL;
+        osq_lock_init(&sem->osq);
 #endif
 }
@@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
        return false;
 }
-#ifdef CONFIG_SMP
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 /*
 * Try to acquire write lock before the writer has been put on wait queue.
 */
@@ -285,10 +285,10 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
 static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 {
        struct task_struct *owner;
-        bool on_cpu = true;
+        bool on_cpu = false;
        if (need_resched())
-                return 0;
+                return false;
        rcu_read_lock();
        owner = ACCESS_ONCE(sem->owner);
@@ -297,9 +297,9 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
        rcu_read_unlock();
        /*
-         * If sem->owner is not set, the rwsem owner may have
+         * If sem->owner is not set, yet we have just recently entered the
-         * just acquired it and not set the owner yet or the rwsem
+         * slowpath, then there is a possibility reader(s) may have the lock.
-         * has been released.
+         * To be safe, avoid spinning in these situations.
         */
        return on_cpu;
 }
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 42f806de49d4..e2d3bc7f03b4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,7 +12,7 @@
 #include <linux/atomic.h>
-#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM)
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 static inline void rwsem_set_owner(struct rw_semaphore *sem)
 {
        sem->owner = current;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9a83d780facd..e4e4121fa327 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -253,9 +253,6 @@ config APM_EMULATION
          anything, try disabling/enabling this option (or disabling/enabling
          APM in your BIOS).
-config ARCH_HAS_OPP
-        bool
 config PM_OPP
        bool
        ---help---
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 49e0a20fd010..fcc2611d3f14 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -35,6 +35,7 @@
 static int nocompress;
 static int noresume;
+static int nohibernate;
 static int resume_wait;
 static unsigned int resume_delay;
 static char resume_file[256] = CONFIG_PM_STD_PARTITION;
@@ -62,6 +63,11 @@ bool freezer_test_done;
 static const struct platform_hibernation_ops *hibernation_ops;
+/**
+ * hibernation_available - Check whether hibernation may be used.
+ *
+ * Returns false once the nohibernate flag has been set, true otherwise.
+ */
+bool hibernation_available(void)
+{
+        return !nohibernate;
+}
 /**
 * hibernation_set_ops - Set the global hibernate operations.
 * @ops: Hibernation operations to use in subsequent hibernation transitions.
@@ -642,6 +648,11 @@ int hibernate(void)
 {
        int error;
+        if (!hibernation_available()) {
+                pr_debug("PM: Hibernation not available.\n");
+                return -EPERM;
+        }
        lock_system_sleep();
        /* The snapshot device should not be opened while we're running */
        if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
@@ -734,7 +745,7 @@ static int software_resume(void)
        /*
         * If the user said "noresume".. bail out early.
         */
-        if (noresume)
+        if (noresume || !hibernation_available())
                return 0;
        /*
@@ -900,6 +911,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
        int i;
        char *start = buf;
+        if (!hibernation_available())
+                return sprintf(buf, "[disabled]\n");
        for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
                if (!hibernation_modes[i])
                        continue;
@@ -934,6 +948,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
        char *p;
        int mode = HIBERNATION_INVALID;
+        if (!hibernation_available())
+                return -EPERM;
        p = memchr(buf, '\n', n);
        len = p ? p - buf : n;
@@ -1101,6 +1118,10 @@ static int __init hibernate_setup(char *str)
                noresume = 1;
        else if (!strncmp(str, "nocompress", 10))
                nocompress = 1;
+        else if (!strncmp(str, "no", 2)) {
+                noresume = 1;
+                nohibernate = 1;
+        }
        return 1;
 }
@@ -1125,9 +1146,23 @@ static int __init resumedelay_setup(char *str)
        return 1;
 }
+/*
+ * nohibernate_setup - boot parameter handler that turns hibernation off
+ * @str: unused
+ *
+ * Sets both the nohibernate and the noresume flag. Always returns 1.
+ */
+static int __init nohibernate_setup(char *str)
+{
+        nohibernate = 1;
+        noresume = 1;
+        return 1;
+}
+/*
+ * kaslr_nohibernate_setup - boot parameter handler forwarding to
+ * nohibernate_setup()
+ * @str: passed through unchanged
+ *
+ * Returns whatever nohibernate_setup() returns.
+ */
+static int __init kaslr_nohibernate_setup(char *str)
+{
+        return nohibernate_setup(str);
+}
 __setup("noresume", noresume_setup);
 __setup("resume_offset=", resume_offset_setup);
 __setup("resume=", resume_setup);
 __setup("hibernate=", hibernate_setup);
 __setup("resumewait", resumewait_setup);
 __setup("resumedelay=", resumedelay_setup);
+__setup("nohibernate", nohibernate_setup);
+__setup("kaslr", kaslr_nohibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 573410d6647e..9a59d042ea84 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -296,25 +296,22 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
        suspend_state_t i;
        for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
-                if (pm_states[i].state)
+                if (pm_states[i])
-                        s += sprintf(s,"%s ", pm_states[i].label);
+                        s += sprintf(s,"%s ", pm_states[i]);
 #endif
-#ifdef CONFIG_HIBERNATION
+        if (hibernation_available())
-        s += sprintf(s, "%s\n", "disk");
+                s += sprintf(s, "disk ");
-#else
        if (s != buf)
                /* convert the last space to a newline */
                *(s-1) = '\n';
-#endif
        return (s - buf);
 }
 static suspend_state_t decode_state(const char *buf, size_t n)
 {
 #ifdef CONFIG_SUSPEND
-        suspend_state_t state = PM_SUSPEND_MIN;
+        suspend_state_t state;
-        struct pm_sleep_state *s;
 #endif
        char *p;
        int len;
@@ -327,10 +324,12 @@ static suspend_state_t decode_state(const char *buf, size_t n)
                return PM_SUSPEND_MAX;
 #ifdef CONFIG_SUSPEND
-        for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++)
+        for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) {
-                if (s->state && len == strlen(s->label)
+                const char *label = pm_states[state];
-                    && !strncmp(buf, s->label, len))
-                        return s->state;
+                if (label && len == strlen(label) && !strncmp(buf, label, len))
+                        return state;
+        }
 #endif
        return PM_SUSPEND_ON;
@@ -448,8 +447,8 @@ static ssize_t autosleep_show(struct kobject *kobj,
 #ifdef CONFIG_SUSPEND
        if (state < PM_SUSPEND_MAX)
-                return sprintf(buf, "%s\n", pm_states[state].state ?
+                return sprintf(buf, "%s\n", pm_states[state] ?
-                                        pm_states[state].label : "error");
+                                        pm_states[state] : "error");
 #endif
 #ifdef CONFIG_HIBERNATION
        return sprintf(buf, "disk\n");
@@ -617,7 +616,6 @@ static struct attribute_group attr_group = {
        .attrs = g,
 };
-#ifdef CONFIG_PM_RUNTIME
 struct workqueue_struct *pm_wq;
 EXPORT_SYMBOL_GPL(pm_wq);
@@ -627,9 +625,6 @@ static int __init pm_start_workqueue(void)
        return pm_wq ? 0 : -ENOMEM;
 }
-#else
-static inline int pm_start_workqueue(void) { return 0; }
-#endif
 static int __init pm_init(void)
 {
diff --git a/kernel/power/power.h b/kernel/power/power.h
index c60f13b5270a..5d49dcac2537 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -178,13 +178,8 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
                                unsigned int, char *);
 #ifdef CONFIG_SUSPEND
-struct pm_sleep_state {
-        const char *label;
-        suspend_state_t state;
-};
 /* kernel/power/suspend.c */
-extern struct pm_sleep_state pm_states[];
+extern const char *pm_states[];
 extern int suspend_devices_and_enter(suspend_state_t state);
 #else /* !CONFIG_SUSPEND */
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0ca8d83e2369..4ee194eb524b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -186,6 +186,7 @@ void thaw_processes(void)
        printk("Restarting tasks ... ");
+        __usermodehelper_set_disable_depth(UMH_FREEZING);
        thaw_workqueues();
        read_lock(&tasklist_lock);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 1ea328aafdc9..4fc5c32422b3 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -248,33 +248,61 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
 *      information is stored (in the form of a block of bitmap)
 *      It also contains the pfns that correspond to the start and end of
 *      the represented memory area.
+ *
+ *      The memory bitmap is organized as a radix tree to guarantee fast random
+ *      access to the bits. There is one radix tree for each zone (as returned
+ *      from create_mem_extents).
+ *
+ *      One radix tree is represented by one struct mem_zone_bm_rtree. There are
+ *      two linked lists for the nodes of the tree, one for the inner nodes and
+ *      one for the leaf nodes. The linked leaf nodes are used for fast linear
+ *      access of the memory bitmap.
+ *
+ *      The struct rtree_node represents one node of the radix tree.
 */
 #define BM_END_OF_MAP   (~0UL)
 #define BM_BITS_PER_BLOCK       (PAGE_SIZE * BITS_PER_BYTE)
+#define BM_BLOCK_SHIFT          (PAGE_SHIFT + 3)
+#define BM_BLOCK_MASK           ((1UL << BM_BLOCK_SHIFT) - 1)
-struct bm_block {
+/*
-        struct list_head hook;  /* hook into a list of bitmap blocks */
+ * struct rtree_node is a wrapper struct to link the nodes
-        unsigned long start_pfn;        /* pfn represented by the first bit */
+ * of the rtree together for easy linear iteration over
-        unsigned long end_pfn;  /* pfn represented by the last bit plus 1 */
+ * bits and easy freeing
-        unsigned long *data;    /* bitmap representing pages */
+ */
+struct rtree_node {
+        struct list_head list;  /* entry in a zone's nodes or leaves list */
+        unsigned long *data;    /* page allocated for this node */
 };
-static inline unsigned long bm_block_bits(struct bm_block *bb)
+/*
-{
+ * struct mem_zone_bm_rtree represents a bitmap used for one
-        return bb->end_pfn - bb->start_pfn;
+ * populated memory zone.
+ *
+ * It is allocated and set up by create_zone_bm_rtree(); levels and
+ * blocks are updated by add_rtree_block() as leaves are inserted.
-}
+ */
+struct mem_zone_bm_rtree {
+        struct list_head list;          /* Link Zones together         */
+        struct list_head nodes;         /* Radix Tree inner nodes      */
+        struct list_head leaves;        /* Radix Tree leaves           */
+        unsigned long start_pfn;        /* Zone start page frame       */
+        unsigned long end_pfn;          /* Zone end page frame + 1     */
+        struct rtree_node *rtree;       /* Radix Tree Root             */
+        int levels;                     /* Number of Radix Tree Levels */
+        unsigned int blocks;            /* Number of Bitmap Blocks     */
+};
 /* strcut bm_position is used for browsing memory bitmaps */
 struct bm_position {
-        struct bm_block *block;
+        struct mem_zone_bm_rtree *zone; /* zone the cursor currently is in */
-        int bit;
+        struct rtree_node *node;        /* leaf node of that zone */
+        /*
+         * Both fields below are set to 0 by memory_bm_position_reset().
+         * NOTE(review): presumably the pfn offset of the node within the
+         * zone and the bit position inside the node - confirm against
+         * the users of bm->cur.
+         */
+        unsigned long node_pfn;
+        int node_bit;
 };
 struct memory_bitmap {
-        struct list_head blocks;        /* list of bitmap blocks */
+        struct list_head zones;
        struct linked_page *p_list;     /* list of pages used to store zone
                                         * bitmap objects and bitmap block
                                         * objects
@@ -284,38 +312,178 @@ struct memory_bitmap {
 /* Functions that operate on memory bitmaps */
-static void memory_bm_position_reset(struct memory_bitmap *bm)
+#define BM_ENTRIES_PER_LEVEL    (PAGE_SIZE / sizeof(unsigned long))
+#if BITS_PER_LONG == 32
+#define BM_RTREE_LEVEL_SHIFT    (PAGE_SHIFT - 2)
+#else
+#define BM_RTREE_LEVEL_SHIFT    (PAGE_SHIFT - 3)
+#endif
+#define BM_RTREE_LEVEL_MASK     ((1UL << BM_RTREE_LEVEL_SHIFT) - 1)
+/*
+ *      alloc_rtree_node - Allocate a new node and add it to the radix tree.
+ *
+ *      This function is used to allocate inner nodes as well as the
+ *      leaf nodes of the radix tree. It also adds the node to the
+ *      corresponding linked list passed in by the *list parameter.
+ *
+ *      Returns the new node, or NULL if either the node structure or its
+ *      data page could not be allocated. In the failure case nothing is
+ *      added to *list.
+ */
+static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
+                                           struct chain_allocator *ca,
+                                           struct list_head *list)
 {
-        bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook);
+        struct rtree_node *node;
-        bm->cur.bit = 0;
-}
-static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
+        /* The node structure itself comes from the chain allocator */
+        node = chain_alloc(ca, sizeof(struct rtree_node));
+        if (!node)
+                return NULL;
-/**
+        node->data = get_image_page(gfp_mask, safe_needed);
- *      create_bm_block_list - create a list of block bitmap objects
+        if (!node->data)
+                return NULL;
- *      @pages - number of pages to track
+        /* Only fully set up nodes are linked into the list */
+        list_add_tail(&node->list, list);
- *      @list - list to put the allocated blocks into
- *      @ca - chain allocator to be used for allocating memory
+        return node;
+}
+/*
+ *      add_rtree_block - Add a new leaf node to the radix tree
+ *
+ *      The leaf nodes need to be allocated in order to keep the leaves
+ *      linked list in order. This is guaranteed by the zone->blocks
+ *      counter.
+ *
+ *      The new block gets the index zone->blocks. The tree is first
+ *      grown to the number of levels that index needs, then walked from
+ *      the root, allocating missing inner nodes on the way.
+ *
+ *      Returns 0 on success or -ENOMEM if a node could not be allocated.
 */
-static int create_bm_block_list(unsigned long pages,
+static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
-                                struct list_head *list,
+                           int safe_needed, struct chain_allocator *ca)
-                                struct chain_allocator *ca)
 {
-        unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
+        struct rtree_node *node, *block, **dst;
+        unsigned int levels_needed, block_nr;
+        int i;
-        while (nr_blocks-- > 0) {
+        block_nr = zone->blocks;
-                struct bm_block *bb;
+        levels_needed = 0;
-                bb = chain_alloc(ca, sizeof(struct bm_block));
+        /* How many levels do we need for this block nr? */
-                if (!bb)
+        while (block_nr) {
+                levels_needed += 1;
+                block_nr >>= BM_RTREE_LEVEL_SHIFT;
+        }
+        /* Make sure the rtree has enough levels */
+        for (i = zone->levels; i < levels_needed; i++) {
+                node = alloc_rtree_node(gfp_mask, safe_needed, ca,
+                                        &zone->nodes);
+                if (!node)
                         return -ENOMEM;
-                list_add(&bb->hook, list);
+                /* The previous root becomes entry 0 of the new root */
+                node->data[0] = (unsigned long)zone->rtree;
+                zone->rtree = node;
+                zone->levels += 1;
+        }
+        /* Allocate new block */
+        block = alloc_rtree_node(gfp_mask, safe_needed, ca, &zone->leaves);
+        if (!block)
+                return -ENOMEM;
+        /* Now walk the rtree to insert the block */
+        node = zone->rtree;
+        dst = &zone->rtree;
+        block_nr = zone->blocks;
+        for (i = zone->levels; i > 0; i--) {
+                int index;
+                /* Fill in a missing inner node before descending */
+                if (!node) {
+                        node = alloc_rtree_node(gfp_mask, safe_needed, ca,
+                                                &zone->nodes);
+                        if (!node)
+                                return -ENOMEM;
+                        *dst = node;
+                }
+                /* Slot at this level: the matching bits of block_nr */
+                index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
+                index &= BM_RTREE_LEVEL_MASK;
+                dst = (struct rtree_node **)&((*dst)->data[index]);
+                node = *dst;
         }
+        /* dst now points at the slot (or the root) that takes the leaf */
+        zone->blocks += 1;
+        *dst = block;
         return 0;
 }
+static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
+                               int clear_nosave_free);
+/*
+ *      create_zone_bm_rtree - create a radix tree for one zone
+ *
+ *      Allocates a struct mem_zone_bm_rtree from the chain allocator,
+ *      initializes it for the pfn range from start up to (not including)
+ *      end and adds one bitmap block per BM_BITS_PER_BLOCK pages, rounded
+ *      up, to its radix tree.
+ *
+ *      Returns the new zone or NULL if an allocation failed. If adding a
+ *      block fails, the node pages allocated so far are released again.
+ */
+static struct mem_zone_bm_rtree *
+create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
+                     struct chain_allocator *ca,
+                     unsigned long start, unsigned long end)
+{
+        struct mem_zone_bm_rtree *zone;
+        unsigned int remaining;
+        zone = chain_alloc(ca, sizeof(*zone));
+        if (!zone)
+                return NULL;
+        INIT_LIST_HEAD(&zone->nodes);
+        INIT_LIST_HEAD(&zone->leaves);
+        zone->start_pfn = start;
+        zone->end_pfn = end;
+        /* One leaf block for every BM_BITS_PER_BLOCK pages of the zone */
+        remaining = DIV_ROUND_UP(end - start, BM_BITS_PER_BLOCK);
+        for (; remaining > 0; remaining--) {
+                if (!add_rtree_block(zone, gfp_mask, safe_needed, ca))
+                        continue;
+                free_zone_bm_rtree(zone, PG_UNSAFE_CLEAR);
+                return NULL;
+        }
+        return zone;
+}
+/*
+ *      free_zone_bm_rtree - Free the memory of the radix tree
+ *
+ *      Releases the data page of every inner node and of every leaf of
+ *      the zone. Neither the mem_zone_bm_rtree structure nor the
+ *      rtree_node structures themselves are freed here.
+ */
+static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
+                               int clear_nosave_free)
+{
+        struct rtree_node *cur;
+        /* Inner nodes first ... */
+        list_for_each_entry(cur, &zone->nodes, list) {
+                free_image_page(cur->data, clear_nosave_free);
+        }
+        /* ... then the leaves */
+        list_for_each_entry(cur, &zone->leaves, list) {
+                free_image_page(cur->data, clear_nosave_free);
+        }
+}
+/*
+ *      memory_bm_position_reset - rewind the cursor of a memory bitmap
+ *
+ *      Points bm->cur at the first leaf of the first zone and clears the
+ *      pfn and bit counters of the cursor.
+ */
+static void memory_bm_position_reset(struct memory_bitmap *bm)
+{
+        struct mem_zone_bm_rtree *first_zone;
+        struct rtree_node *first_leaf;
+        first_zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
+                                list);
+        first_leaf = list_entry(first_zone->leaves.next,
+                                struct rtree_node, list);
+        bm->cur.zone = first_zone;
+        bm->cur.node = first_leaf;
+        bm->cur.node_pfn = 0;
+        bm->cur.node_bit = 0;
+}
+static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
 struct mem_extent {
        struct list_head hook;
        unsigned long start;
@@ -407,40 +575,22 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
        int error;
        chain_init(&ca, gfp_mask, safe_needed);
-        INIT_LIST_HEAD(&bm->blocks);
+        INIT_LIST_HEAD(&bm->zones);
        error = create_mem_extents(&mem_extents, gfp_mask);
        if (error)
                return error;
        list_for_each_entry(ext, &mem_extents, hook) {
-                struct bm_block *bb;
+                struct mem_zone_bm_rtree *zone;
-                unsigned long pfn = ext->start;
-                unsigned long pages = ext->end - ext->start;
-                bb = list_entry(bm->blocks.prev, struct bm_block, hook);
-                error = create_bm_block_list(pages, bm->blocks.prev, &ca);
+                zone = create_zone_bm_rtree(gfp_mask, safe_needed, &ca,
-                if (error)
+                                            ext->start, ext->end);
+                if (!zone) {
+                        error = -ENOMEM;
                        goto Error;
-                list_for_each_entry_continue(bb, &bm->blocks, hook) {
-                        bb->data = get_image_page(gfp_mask, safe_needed);
-                        if (!bb->data) {
-                                error = -ENOMEM;
-                                goto Error;
-                        }
-                        bb->start_pfn = pfn;
-                        if (pages >= BM_BITS_PER_BLOCK) {
-                                pfn += BM_BITS_PER_BLOCK;
-                                pages -= BM_BITS_PER_BLOCK;
-                        } else {
-                                /* This is executed only once in the loop */
-                                pfn += pages;
-                        }
-                        bb->end_pfn = pfn;
                }
+                list_add_tail(&zone->list, &bm->zones);
        }
        bm->p_list = ca.chain;
@@ -460,51 +610,83 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
  */
 static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
 {
-        struct bm_block *bb;
+        struct mem_zone_bm_rtree *zone;
-        list_for_each_entry(bb, &bm->blocks, hook)
+        list_for_each_entry(zone, &bm->zones, list)
-                if (bb->data)
+                free_zone_bm_rtree(zone, clear_nosave_free);
-                        free_image_page(bb->data, clear_nosave_free);
        free_list_of_pages(bm->p_list, clear_nosave_free);
-        INIT_LIST_HEAD(&bm->blocks);
+        INIT_LIST_HEAD(&bm->zones);
 }
 /**
- *      memory_bm_find_bit - find the bit in the bitmap @bm that corresponds
+ *      memory_bm_find_bit - Find the bit for pfn in the memory
- *      to given pfn.  The cur_zone_bm member of @bm and the cur_block member
+ *                           bitmap
- *      of @bm->cur_zone_bm are updated.
+ *
+ *      Find the bit in the bitmap @bm that corresponds to given pfn.
+ *      The cur.zone, cur.node and cur.node_pfn members of @bm are
+ *      updated.
+ *      It walks the radix tree to find the page which contains the bit for
+ *      pfn and returns the bit position in **addr and *bit_nr.
 */
 static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
-                                void **addr, unsigned int *bit_nr)
+                              void **addr, unsigned int *bit_nr)
 {
-        struct bm_block *bb;
+        struct mem_zone_bm_rtree *curr, *zone;
+        struct rtree_node *node;
+        int i, block_nr;
+        zone = bm->cur.zone;
+        if (pfn >= zone->start_pfn && pfn < zone->end_pfn)
+                goto zone_found;
+        zone = NULL;
+        /* Find the right zone */
+        list_for_each_entry(curr, &bm->zones, list) {
+                if (pfn >= curr->start_pfn && pfn < curr->end_pfn) {
+                        zone = curr;
+                        break;
+                }
+        }
+        if (!zone)
+                return -EFAULT;
+zone_found:
         /*
-         * Check if the pfn corresponds to the current bitmap block and find
+         * We have a zone. Now walk the radix tree to find the leaf
-         * the block where it fits if this is not the case.
+         * node for our pfn.
          */
-        bb = bm->cur.block;
-        if (pfn < bb->start_pfn)
-                list_for_each_entry_continue_reverse(bb, &bm->blocks, hook)
-                        if (pfn >= bb->start_pfn)
-                                break;
-        if (pfn >= bb->end_pfn)
+        /*
+         * The cached leaf may be reused only when the lookup stays in the
+         * cached zone: cur.node_pfn is relative to the zone start, so the
+         * same block offset in a different zone must not match.
+         */
+        node = bm->cur.node;
-                list_for_each_entry_continue(bb, &bm->blocks, hook)
+        if (zone == bm->cur.zone &&
+            ((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
-                        if (pfn >= bb->start_pfn && pfn < bb->end_pfn)
+                goto node_found;
-                                break;
-        if (&bb->hook == &bm->blocks)
+        node      = zone->rtree;
-                return -EFAULT;
+        block_nr  = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT;
+        for (i = zone->levels; i > 0; i--) {
+                int index;
+                index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
+                index &= BM_RTREE_LEVEL_MASK;
+                BUG_ON(node->data[index] == 0);
+                node = (struct rtree_node *)node->data[index];
+        }
+node_found:
+        /* Update last position */
+        bm->cur.zone = zone;
+        bm->cur.node = node;
+        bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
+        /* Set return values */
+        *addr = node->data;
+        *bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK;
-        /* The block has been found */
-        bm->cur.block = bb;
-        pfn -= bb->start_pfn;
-        bm->cur.bit = pfn + 1;
-        *bit_nr = pfn;
-        *addr = bb->data;
         return 0;
 }
@@ -528,6 +710,7 @@ static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
        error = memory_bm_find_bit(bm, pfn, &addr, &bit);
        if (!error)
                set_bit(bit, addr);
        return error;
 }
@@ -542,6 +725,14 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
        clear_bit(bit, addr);
 }
+static void memory_bm_clear_current(struct memory_bitmap *bm)
+{
+        int bit;        /* bit that memory_bm_next_pfn() handed out last */
+        bit = max(bm->cur.node_bit - 1, 0);     /* node_bit is "last + 1" */
+        clear_bit(bit, bm->cur.node->data);     /* in the leaf under the cursor */
+}
 static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
 {
        void *addr;
@@ -561,38 +752,70 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
        return !memory_bm_find_bit(bm, pfn, &addr, &bit);
 }
-/**
+/*
- *      memory_bm_next_pfn - find the pfn that corresponds to the next set bit
+ *      rtree_next_node - Jumps to the next leaf node
- *      in the bitmap @bm.  If the pfn cannot be found, BM_END_OF_MAP is
+ *
- *      returned.
+ *      Sets the position to the beginning of the next node in the
+ *      memory bitmap. This is either the next node in the current
+ *      zone's radix tree or the first node in the radix tree of the
+ *      next zone.
 *
- *      It is required to run memory_bm_position_reset() before the first call to
+ *      Returns true if there is a next node, false otherwise.
- *      this function.
 */
+static bool rtree_next_node(struct memory_bitmap *bm)
+{
+        /*
+         * Only step forward while a successor exists.  Advancing first and
+         * comparing against the list head afterwards would leave cur.node
+         * (and, below, cur.zone) computed from the list head itself once
+         * the end is reached, which is not a valid rtree_node or zone for
+         * later users of the cursor.
+         */
+        if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) {
+                bm->cur.node = list_entry(bm->cur.node->list.next,
+                                          struct rtree_node, list);
+                bm->cur.node_pfn += BM_BITS_PER_BLOCK;
+                bm->cur.node_bit  = 0;
+                touch_softlockup_watchdog();
+                return true;
+        }
+        /* No more nodes, goto next zone */
+        if (!list_is_last(&bm->cur.zone->list, &bm->zones)) {
+                bm->cur.zone = list_entry(bm->cur.zone->list.next,
+                                          struct mem_zone_bm_rtree, list);
+                bm->cur.node = list_entry(bm->cur.zone->leaves.next,
+                                          struct rtree_node, list);
+                bm->cur.node_pfn = 0;
+                bm->cur.node_bit = 0;
+                return true;
+        }
+        /* No more zones; the cursor stays on the last valid leaf */
+        return false;
+}
+/**
+ *      memory_bm_next_pfn - Find the next set bit in the bitmap @bm
+ *
+ *      Starting from the last returned position this function searches
+ *      for the next set bit in the memory bitmap and returns its
+ *      number. If no more bits are set, BM_END_OF_MAP is returned.
+ *
+ *      It is required to run memory_bm_position_reset() before the
+ *      first call to this function.
+ */
 static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
 {
-        struct bm_block *bb;
+        unsigned long bits, pfn, pages;
        int bit;
-        bb = bm->cur.block;
        do {
-                bit = bm->cur.bit;
+                pages     = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn;
-                bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
+                bits      = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK);
-                if (bit < bm_block_bits(bb))
+                bit       = find_next_bit(bm->cur.node->data, bits,
-                        goto Return_pfn;
+                                          bm->cur.node_bit);
+                if (bit < bits) {
-                bb = list_entry(bb->hook.next, struct bm_block, hook);
+                        pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit;
-                bm->cur.block = bb;
+                        bm->cur.node_bit = bit + 1;
-                bm->cur.bit = 0;
+                        return pfn;
-        } while (&bb->hook != &bm->blocks);
+                }
+        } while (rtree_next_node(bm));
-        memory_bm_position_reset(bm);
        return BM_END_OF_MAP;
- Return_pfn:
-        bm->cur.bit = bit + 1;
-        return bb->start_pfn + bit;
 }
 /**
@@ -816,12 +1039,17 @@ void free_basic_memory_bitmaps(void)
 unsigned int snapshot_additional_pages(struct zone *zone)
 {
-        unsigned int res;
+        unsigned int rtree, nodes;
+        rtree = nodes = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
+        rtree += DIV_ROUND_UP(rtree * sizeof(struct rtree_node),
+                              LINKED_PAGE_DATA_SIZE);
+        while (nodes > 1) {
+                nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL);
+                rtree += nodes;
+        }
-        res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
+        return 2 * rtree;
-        res += DIV_ROUND_UP(res * sizeof(struct bm_block),
-                            LINKED_PAGE_DATA_SIZE);
-        return 2 * res;
 }
 #ifdef CONFIG_HIGHMEM
@@ -1094,23 +1322,35 @@ static struct memory_bitmap copy_bm;
 void swsusp_free(void)
 {
-        struct zone *zone;
+        unsigned long fb_pfn, fr_pfn;
-        unsigned long pfn, max_zone_pfn;
-        for_each_populated_zone(zone) {
+        memory_bm_position_reset(forbidden_pages_map);
-                max_zone_pfn = zone_end_pfn(zone);
+        memory_bm_position_reset(free_pages_map);
-                for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
-                        if (pfn_valid(pfn)) {
+loop:
-                                struct page *page = pfn_to_page(pfn);
+        fr_pfn = memory_bm_next_pfn(free_pages_map);
+        fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
-                                if (swsusp_page_is_forbidden(page) &&
-                                    swsusp_page_is_free(page)) {
+        /*
-                                        swsusp_unset_page_forbidden(page);
+         * Find the next bit set in both bitmaps. This is guaranteed to
-                                        swsusp_unset_page_free(page);
+         * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP.
-                                        __free_page(page);
+         */
-                                }
+        do {
-                        }
+                if (fb_pfn < fr_pfn)
+                        fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
+                if (fr_pfn < fb_pfn)
+                        fr_pfn = memory_bm_next_pfn(free_pages_map);
+        } while (fb_pfn != fr_pfn);
+        if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) {
+                struct page *page = pfn_to_page(fr_pfn);
+                memory_bm_clear_current(forbidden_pages_map);
+                memory_bm_clear_current(free_pages_map);
+                __free_page(page);
+                goto loop;
        }
        nr_copy_pages = 0;
        nr_meta_pages = 0;
        restore_pblist = NULL;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4dd8822f732a..9a071bea80eb 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,20 +31,11 @@
 #include "power.h"
-struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = {
+static const char *pm_labels[] = { "mem", "standby", "freeze", };
-        [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE },
+const char *pm_states[PM_SUSPEND_MAX];
-        [PM_SUSPEND_STANDBY] = { .label = "standby", },
-        [PM_SUSPEND_MEM] = { .label = "mem", },
-};
 static const struct platform_suspend_ops *suspend_ops;
 static const struct platform_freeze_ops *freeze_ops;
-static bool need_suspend_ops(suspend_state_t state)
-{
-        return state > PM_SUSPEND_FREEZE;
-}
 static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
 static bool suspend_freeze_wake;
@@ -97,10 +88,7 @@ static bool relative_states;
 static int __init sleep_states_setup(char *str)
 {
        relative_states = !strncmp(str, "1", 1);
-        if (relative_states) {
+        pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];
-                pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE;
-                pm_states[PM_SUSPEND_FREEZE].state = 0;
-        }
        return 1;
 }
@@ -113,20 +101,20 @@ __setup("relative_sleep_states=", sleep_states_setup);
 void suspend_set_ops(const struct platform_suspend_ops *ops)
 {
        suspend_state_t i;
-        int j = PM_SUSPEND_MAX - 1;
+        int j = 0;
        lock_system_sleep();
        suspend_ops = ops;
        for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
-                if (valid_state(i))
+                if (valid_state(i)) {
-                        pm_states[j--].state = i;
+                        pm_states[i] = pm_labels[j++];
-                else if (!relative_states)
+                } else if (!relative_states) {
-                        pm_states[j--].state = 0;
+                        pm_states[i] = NULL;
+                        j++;
+                }
-        pm_states[j--].state = PM_SUSPEND_FREEZE;
+        pm_states[PM_SUSPEND_FREEZE] = pm_labels[j];
-        while (j >= PM_SUSPEND_MIN)
-                pm_states[j--].state = 0;
        unlock_system_sleep();
 }
@@ -145,6 +133,65 @@ int suspend_valid_only_mem(suspend_state_t state)
 }
 EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
+/*
+ * PM_SUSPEND_FREEZE is always reported as supported; every other state
+ * needs registered suspend_ops that provide an ->enter callback.
+ */
+static bool sleep_state_supported(suspend_state_t state)
+{
+        if (state == PM_SUSPEND_FREEZE)
+                return true;
+        return suspend_ops && suspend_ops->enter;
+}
+/*
+ * Run the platform ->prepare callback, if there is one, for any state
+ * other than PM_SUSPEND_FREEZE.  Returns the callback's result, else 0.
+ */
+static int platform_suspend_prepare(suspend_state_t state)
+{
+        /* suspend_ops is not looked at for PM_SUSPEND_FREEZE. */
+        if (state == PM_SUSPEND_FREEZE || !suspend_ops->prepare)
+                return 0;
+        return suspend_ops->prepare();
+}
+/*
+ * Run the platform ->prepare_late callback, if there is one, for any
+ * state other than PM_SUSPEND_FREEZE.  Returns its result, else 0.
+ */
+static int platform_suspend_prepare_late(suspend_state_t state)
+{
+        /* suspend_ops is not looked at for PM_SUSPEND_FREEZE. */
+        if (state == PM_SUSPEND_FREEZE || !suspend_ops->prepare_late)
+                return 0;
+        return suspend_ops->prepare_late();
+}
+/* Invoke the platform ->wake callback unless the state is PM_SUSPEND_FREEZE. */
+static void platform_suspend_wake(suspend_state_t state)
+{
+        if (state == PM_SUSPEND_FREEZE)
+                return;
+        if (suspend_ops->wake)
+                suspend_ops->wake();
+}
+/* Invoke the platform ->finish callback unless the state is PM_SUSPEND_FREEZE. */
+static void platform_suspend_finish(suspend_state_t state)
+{
+        if (state == PM_SUSPEND_FREEZE)
+                return;
+        if (suspend_ops->finish)
+                suspend_ops->finish();
+}
+/*
+ * Notify the platform that a transition to @state is starting.
+ *
+ * For PM_SUSPEND_FREEZE the freeze_ops ->begin callback is used when it
+ * exists; otherwise the suspend_ops ->begin callback is used when it
+ * exists.  Returns the callback's result, or 0 if nothing was called.
+ */
+static int platform_suspend_begin(suspend_state_t state)
+{
+        if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin)
+                return freeze_ops->begin();
+        /*
+         * sleep_state_supported() accepts PM_SUSPEND_FREEZE even when no
+         * suspend_ops are registered, so this branch can be reached with
+         * suspend_ops == NULL and must check the pointer itself.
+         */
+        else if (suspend_ops && suspend_ops->begin)
+                return suspend_ops->begin(state);
+        else
+                return 0;
+}
+/*
+ * Notify the platform that the transition out of @state is complete.
+ *
+ * Mirrors platform_suspend_begin(): freeze_ops ->end for
+ * PM_SUSPEND_FREEZE when available, otherwise suspend_ops ->end when
+ * available.
+ */
+static void platform_suspend_end(suspend_state_t state)
+{
+        if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
+                freeze_ops->end();
+        /*
+         * suspend_ops may be NULL here: PM_SUSPEND_FREEZE is reported as
+         * supported without any registered suspend_ops.
+         */
+        else if (suspend_ops && suspend_ops->end)
+                suspend_ops->end();
+}
+/* Invoke the platform ->recover callback unless the state is PM_SUSPEND_FREEZE. */
+static void platform_suspend_recover(suspend_state_t state)
+{
+        if (state == PM_SUSPEND_FREEZE)
+                return;
+        if (suspend_ops->recover)
+                suspend_ops->recover();
+}
+/*
+ * Ask the platform ->suspend_again callback, if there is one, whether to
+ * re-enter the sleep state.  Always false for PM_SUSPEND_FREEZE.
+ */
+static bool platform_suspend_again(suspend_state_t state)
+{
+        /* suspend_ops is not looked at for PM_SUSPEND_FREEZE. */
+        if (state == PM_SUSPEND_FREEZE || !suspend_ops->suspend_again)
+                return false;
+        return suspend_ops->suspend_again();
+}
 static int suspend_test(int level)
 {
 #ifdef CONFIG_PM_DEBUG
@@ -168,7 +215,7 @@ static int suspend_prepare(suspend_state_t state)
 {
        int error;
-        if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter))
+        if (!sleep_state_supported(state))
                return -EPERM;
        pm_prepare_console();
@@ -214,23 +261,18 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 {
        int error;
-        if (need_suspend_ops(state) && suspend_ops->prepare) {
+        error = platform_suspend_prepare(state);
-                error = suspend_ops->prepare();
+        if (error)
-                if (error)
+                goto Platform_finish;
-                        goto Platform_finish;
-        }
        error = dpm_suspend_end(PMSG_SUSPEND);
        if (error) {
                printk(KERN_ERR "PM: Some devices failed to power down\n");
                goto Platform_finish;
        }
+        error = platform_suspend_prepare_late(state);
-        if (need_suspend_ops(state) && suspend_ops->prepare_late) {
+        if (error)
-                error = suspend_ops->prepare_late();
+                goto Platform_wake;
-                if (error)
-                        goto Platform_wake;
-        }
        if (suspend_test(TEST_PLATFORM))
                goto Platform_wake;
@@ -278,15 +320,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
        ftrace_start();
 Platform_wake:
-        if (need_suspend_ops(state) && suspend_ops->wake)
+        platform_suspend_wake(state);
-                suspend_ops->wake();
        dpm_resume_start(PMSG_RESUME);
 Platform_finish:
-        if (need_suspend_ops(state) && suspend_ops->finish)
+        platform_suspend_finish(state);
-                suspend_ops->finish();
        return error;
 }
@@ -299,18 +337,13 @@ int suspend_devices_and_enter(suspend_state_t state)
        int error;
        bool wakeup = false;
-        if (need_suspend_ops(state) && !suspend_ops)
+        if (!sleep_state_supported(state))
                return -ENOSYS;
-        if (need_suspend_ops(state) && suspend_ops->begin) {
+        error = platform_suspend_begin(state);
-                error = suspend_ops->begin(state);
+        if (error)
-                if (error)
+                goto Close;
-                        goto Close;
-        } else if (state == PM_SUSPEND_FREEZE && freeze_ops->begin) {
-                error = freeze_ops->begin();
-                if (error)
-                        goto Close;
-        }
        suspend_console();
        suspend_test_start();
        error = dpm_suspend_start(PMSG_SUSPEND);
@@ -324,25 +357,20 @@ int suspend_devices_and_enter(suspend_state_t state)
        do {
                error = suspend_enter(state, &wakeup);
-        } while (!error && !wakeup && need_suspend_ops(state)
+        } while (!error && !wakeup && platform_suspend_again(state));
-                && suspend_ops->suspend_again && suspend_ops->suspend_again());
 Resume_devices:
        suspend_test_start();
        dpm_resume_end(PMSG_RESUME);
        suspend_test_finish("resume devices");
        resume_console();
- Close:
-        if (need_suspend_ops(state) && suspend_ops->end)
-                suspend_ops->end();
-        else if (state == PM_SUSPEND_FREEZE && freeze_ops->end)
-                freeze_ops->end();
+ Close:
+        platform_suspend_end(state);
        return error;
 Recover_platform:
-        if (need_suspend_ops(state) && suspend_ops->recover)
+        platform_suspend_recover(state);
-                suspend_ops->recover();
        goto Resume_devices;
 }
@@ -395,7 +423,7 @@ static int enter_state(suspend_state_t state)
        printk("done.\n");
        trace_suspend_resume(TPS("sync_filesystems"), 0, false);
-        pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label);
+        pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
        error = suspend_prepare(state);
        if (error)
                goto Unlock;
@@ -404,7 +432,7 @@ static int enter_state(suspend_state_t state)
                goto Finish;
        trace_suspend_resume(TPS("suspend_enter"), state, false);
-        pr_debug("PM: Entering %s sleep\n", pm_states[state].label);
+        pr_debug("PM: Entering %s sleep\n", pm_states[state]);
        pm_restrict_gfp_mask();
        error = suspend_devices_and_enter(state);
        pm_restore_gfp_mask();
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 269b097e78ea..2f524928b6aa 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
        }
        if (state == PM_SUSPEND_MEM) {
-                printk(info_test, pm_states[state].label);
+                printk(info_test, pm_states[state]);
                status = pm_suspend(state);
                if (status == -ENODEV)
                        state = PM_SUSPEND_STANDBY;
        }
        if (state == PM_SUSPEND_STANDBY) {
-                printk(info_test, pm_states[state].label);
+                printk(info_test, pm_states[state]);
                status = pm_suspend(state);
        }
        if (status < 0)
@@ -141,8 +141,8 @@ static int __init setup_test_suspend(char *value)
        /* "=mem" ==> "mem" */
        value++;
        for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
-                if (!strcmp(pm_states[i].label, value)) {
+                if (!strcmp(pm_states[i], value)) {
-                        test_state = pm_states[i].state;
+                        test_state = i;
                        return 0;
                }
@@ -162,8 +162,8 @@ static int __init test_suspend(void)
        /* PM is initialized by now; is that state testable? */
        if (test_state == PM_SUSPEND_ON)
                goto done;
-        if (!pm_states[test_state].state) {
+        if (!pm_states[test_state]) {
-                printk(warn_bad_state, pm_states[test_state].label);
+                printk(warn_bad_state, pm_states[test_state]);
                goto done;
        }
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 98d357584cd6..526e8911460a 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -49,6 +49,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
        struct snapshot_data *data;
        int error;
+        if (!hibernation_available())
+                return -EPERM;
        lock_system_sleep();
        if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ea2d5f6962ed..13e839dbca07 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1416,9 +1416,10 @@ static int have_callable_console(void)
 /*
 * Can we actually use the console at this time on this cpu?
 *
- * Console drivers may assume that per-cpu resources have been allocated. So
+ * Console drivers may assume that per-cpu resources have
- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
+ * been allocated. So unless they're explicitly marked as
- * call them until this CPU is officially up.
+ * being able to cope (CON_ANYTIME) don't call them until
+ * this CPU is officially up.
 */
 static inline int can_use_console(unsigned int cpu)
 {
@@ -1431,10 +1432,8 @@ static inline int can_use_console(unsigned int cpu)
 * console_lock held, and 'console_locked' set) if it
 * is successful, false otherwise.
 */
-static int console_trylock_for_printk(void)
+static int console_trylock_for_printk(unsigned int cpu)
 {
-        unsigned int cpu = smp_processor_id();
        if (!console_trylock())
                return 0;
        /*
@@ -1609,8 +1608,7 @@ asmlinkage int vprintk_emit(int facility, int level,
                 */
                if (!oops_in_progress && !lockdep_recursing(current)) {
                        recursion_bug = 1;
-                        local_irq_restore(flags);
+                        goto out_restore_irqs;
-                        return 0;
                }
                zap_locks();
        }
@@ -1718,27 +1716,21 @@ asmlinkage int vprintk_emit(int facility, int level,
        logbuf_cpu = UINT_MAX;
        raw_spin_unlock(&logbuf_lock);
-        lockdep_on();
-        local_irq_restore(flags);
        /* If called from the scheduler, we can not call up(). */
-        if (in_sched)
+        if (!in_sched) {
-                return printed_len;
+                /*
+                 * Try to acquire and then immediately release the console
-        /*
+                 * semaphore.  The release will print out buffers and wake up
-         * Disable preemption to avoid being preempted while holding
+                 * /dev/kmsg and syslog() users.
-         * console_sem which would prevent anyone from printing to console
+                 */
-         */
+                if (console_trylock_for_printk(this_cpu))
-        preempt_disable();
+                        console_unlock();
-        /*
+        }
-         * Try to acquire and then immediately release the console semaphore.
-         * The release will print out buffers and wake up /dev/kmsg and syslog()
-         * users.
-         */
-        if (console_trylock_for_printk())
-                console_unlock();
-        preempt_enable();
+        lockdep_on();
+out_restore_irqs:
+        local_irq_restore(flags);
        return printed_len;
 }
 EXPORT_SYMBOL(vprintk_emit);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f1ba77363fbb..625d0b0cd75a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -206,6 +206,70 @@ void rcu_bh_qs(int cpu)
        rdp->passed_quiesce = 1;
 }
+static DEFINE_PER_CPU(int, rcu_sched_qs_mask); /* tested against rsp->flavor_mask */
+static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { /* per-CPU dyntick state */
+        .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
+        .dynticks = ATOMIC_INIT(1),
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+        .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
+        .dynticks_idle = ATOMIC_INIT(1),
+#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+};
+/*
+ * Let the RCU core know that this CPU has gone through the scheduler,
+ * which is a quiescent state.  This is called when the need for a
+ * quiescent state is urgent, so we burn an atomic operation and full
+ * memory barriers to let the RCU core know about it, regardless of what
+ * this CPU might (or might not) do in the near future.
+ *
+ * We inform the RCU core by emulating a zero-duration dyntick-idle
+ * period, which we in turn do by incrementing the ->dynticks counter
+ * by two.
+ *
+ * The body runs with interrupts disabled.  This CPU's rcu_sched_qs_mask
+ * is consumed (read, then zeroed) up front.  Only the first flavor whose
+ * bit was set and whose ->cond_resched_completed still equals the
+ * ->completed value of rdp->mynode triggers the increment; the flavor
+ * walk stops right after it.
+ */
+static void rcu_momentary_dyntick_idle(void)
+{
+        unsigned long flags;
+        struct rcu_data *rdp;
+        struct rcu_dynticks *rdtp;
+        int resched_mask;
+        struct rcu_state *rsp;
+        local_irq_save(flags);
+        /*
+         * Yes, we can lose flag-setting operations.  This is OK, because
+         * the flag will be set again after some delay.
+         */
+        resched_mask = raw_cpu_read(rcu_sched_qs_mask);
+        raw_cpu_write(rcu_sched_qs_mask, 0);
+        /* Find the flavor that needs a quiescent state. */
+        for_each_rcu_flavor(rsp) {
+                rdp = raw_cpu_ptr(rsp->rda);
+                if (!(resched_mask & rsp->flavor_mask))
+                        continue;
+                smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
+                if (ACCESS_ONCE(rdp->mynode->completed) !=
+                    ACCESS_ONCE(rdp->cond_resched_completed))
+                        continue;
+                /*
+                 * Pretend to be momentarily idle for the quiescent state.
+                 * This allows the grace-period kthread to record the
+                 * quiescent state, with no need for this CPU to do anything
+                 * further.
+                 */
+                rdtp = this_cpu_ptr(&rcu_dynticks);
+                smp_mb__before_atomic(); /* Earlier stuff before QS. */
+                atomic_add(2, &rdtp->dynticks);  /* QS. */
+                smp_mb__after_atomic(); /* Later stuff after QS. */
+                break; /* rdtp does not depend on rsp: bump only once. */
+        }
+        local_irq_restore(flags);
+}
 /*
 * Note a context switch.  This is a quiescent state for RCU-sched,
 * and requires special handling for preemptible RCU.
@@ -216,19 +280,12 @@ void rcu_note_context_switch(int cpu)
        trace_rcu_utilization(TPS("Start context switch"));
        rcu_sched_qs(cpu);
        rcu_preempt_note_context_switch(cpu);
+        if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+                rcu_momentary_dyntick_idle();
        trace_rcu_utilization(TPS("End context switch"));
 }
 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
-static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
-        .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
-        .dynticks = ATOMIC_INIT(1),
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-        .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
-        .dynticks_idle = ATOMIC_INIT(1),
-#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-};
 static long blimit = 10;        /* Maximum callbacks per rcu_do_batch. */
 static long qhimark = 10000;    /* If this many pending, ignore blimit. */
 static long qlowmark = 100;     /* Once only this many pending, use blimit. */
@@ -243,6 +300,13 @@ static ulong jiffies_till_next_fqs = ULONG_MAX;
 module_param(jiffies_till_first_fqs, ulong, 0644);
 module_param(jiffies_till_next_fqs, ulong, 0644);
+/*
+ * How long the grace period must be before we start recruiting
+ * quiescent-state help from rcu_note_context_switch().
+ *
+ * Expressed in jiffies; the default of HZ / 20 is one twentieth of a
+ * second.  Registered as a module parameter with mode 0644.
+ */
+static ulong jiffies_till_sched_qs = HZ / 20;
+module_param(jiffies_till_sched_qs, ulong, 0644);
 static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
                                  struct rcu_data *rdp);
 static void force_qs_rnp(struct rcu_state *rsp,
@@ -853,6 +917,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
                                    bool *isidle, unsigned long *maxj)
 {
        unsigned int curr;
+        int *rcrmp;
        unsigned int snap;
        curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
@@ -893,27 +958,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
        }
        /*
-         * There is a possibility that a CPU in adaptive-ticks state
+         * A CPU running for an extended time within the kernel can
-         * might run in the kernel with the scheduling-clock tick disabled
+         * delay RCU grace periods.  When the CPU is in NO_HZ_FULL mode,
-         * for an extended time period.  Invoke rcu_kick_nohz_cpu() to
+         * even context-switching back and forth between a pair of
-         * force the CPU to restart the scheduling-clock tick in this
+         * in-kernel CPU-bound tasks cannot advance grace periods.
-         * CPU is in this state.
+         * So if the grace period is old enough, make the CPU pay attention.
-         */
+         * Note that the unsynchronized assignments to the per-CPU
-        rcu_kick_nohz_cpu(rdp->cpu);
+         * rcu_sched_qs_mask variable are safe.  Yes, setting of
+         * bits can be lost, but they will be set again on the next
-        /*
+         * force-quiescent-state pass.  So lost bit sets do not result
-         * Alternatively, the CPU might be running in the kernel
+         * in incorrect behavior, merely in a grace period lasting
-         * for an extended period of time without a quiescent state.
+         * a few jiffies longer than it might otherwise.  Because
-         * Attempt to force the CPU through the scheduler to gain the
+         * there are at most four threads involved, and because the
-         * needed quiescent state, but only if the grace period has gone
+         * updates are only once every few jiffies, the probability of
-         * on for an uncommonly long time.  If there are many stuck CPUs,
+         * lossage (and thus of slight grace-period extension) is
-         * we will beat on the first one until it gets unstuck, then move
+         * quite low.
-         * to the next.  Only do this for the primary flavor of RCU.
+         *
+         * Note that if the jiffies_till_sched_qs boot/sysfs parameter
+         * is set too high, we override with half of the RCU CPU stall
+         * warning delay.
         */
-        if (rdp->rsp == rcu_state_p &&
+        rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
+        if (ULONG_CMP_GE(jiffies,
+                         rdp->rsp->gp_start + jiffies_till_sched_qs) ||
            ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
-                rdp->rsp->jiffies_resched += 5;
+                if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
-                resched_cpu(rdp->cpu);
+                        ACCESS_ONCE(rdp->cond_resched_completed) =
+                                ACCESS_ONCE(rdp->mynode->completed);
+                        smp_mb(); /* ->cond_resched_completed before *rcrmp. */
+                        ACCESS_ONCE(*rcrmp) =
+                                ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
+                        resched_cpu(rdp->cpu);  /* Force CPU into scheduler. */
+                        rdp->rsp->jiffies_resched += 5; /* Enable beating. */
+                } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
+                        /* Time to beat on that CPU again! */
+                        resched_cpu(rdp->cpu);  /* Force CPU into scheduler. */
+                        rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
+                }
        }
        return 0;
@@ -3491,6 +3572,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
                               "rcu_node_fqs_1",
                               "rcu_node_fqs_2",
                               "rcu_node_fqs_3" };  /* Match MAX_RCU_LVLS */
+        static u8 fl_mask = 0x1;
        int cpustride = 1;
        int i;
        int j;
@@ -3509,6 +3591,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
        for (i = 1; i < rcu_num_lvls; i++)
                rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
        rcu_init_levelspread(rsp);
+        rsp->flavor_mask = fl_mask;
+        fl_mask <<= 1;
        /* Initialize the elements themselves, starting from the leaves. */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bf2c1e669691..0f69a79c5b7d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -307,6 +307,9 @@ struct rcu_data {
        /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
        unsigned long dynticks_fqs;     /* Kicked due to dynticks idle. */
        unsigned long offline_fqs;      /* Kicked due to being offline. */
+        unsigned long cond_resched_completed;
+                                        /* Grace period that needs help */
+                                        /*  from cond_resched(). */
        /* 5) __rcu_pending() statistics. */
        unsigned long n_rcu_pending;    /* rcu_pending() calls since boot. */
@@ -392,6 +395,7 @@ struct rcu_state {
        struct rcu_node *level[RCU_NUM_LVLS];   /* Hierarchy levels. */
        u32 levelcnt[MAX_RCU_LVLS + 1];         /* # nodes in each level. */
        u8 levelspread[RCU_NUM_LVLS];           /* kids/node in each level. */
+        u8 flavor_mask;                         /* Bit in rcu_sched_qs_mask. */
        struct rcu_data __percpu *rda;          /* pointer of percu rcu_data. */
        void (*call)(struct rcu_head *head,     /* call_rcu() flavor. */
                     void (*func)(struct rcu_head *head));
@@ -563,7 +567,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
 static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
 static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
 static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
-static void rcu_kick_nohz_cpu(int cpu);
+static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
 static bool init_nocb_callback_list(struct rcu_data *rdp);
 static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
 static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index cbc2c45265e2..02ac0fb186b8 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2404,7 +2404,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
 * if an adaptive-ticks CPU is failing to respond to the current grace
 * period and has not be idle from an RCU perspective, kick it.
 */
-static void rcu_kick_nohz_cpu(int cpu)
+static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
 {
 #ifdef CONFIG_NO_HZ_FULL
        if (tick_nohz_full_cpu(cpu))
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index a2aeb4df0f60..bc7883570530 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -200,12 +200,12 @@ void wait_rcu_gp(call_rcu_func_t crf)
 EXPORT_SYMBOL_GPL(wait_rcu_gp);
 #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
-static inline void debug_init_rcu_head(struct rcu_head *head)
+void init_rcu_head(struct rcu_head *head)
 {
        debug_object_init(head, &rcuhead_debug_descr);
 }
-static inline void debug_rcu_head_free(struct rcu_head *head)
+void destroy_rcu_head(struct rcu_head *head)
 {
        debug_object_free(head, &rcuhead_debug_descr);
 }
@@ -350,21 +350,3 @@ static int __init check_cpu_stall_init(void)
 early_initcall(check_cpu_stall_init);
 #endif /* #ifdef CONFIG_RCU_STALL_COMMON */
-/*
- * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
- */
-DEFINE_PER_CPU(int, rcu_cond_resched_count);
-/*
- * Report a set of RCU quiescent states, for use by cond_resched()
- * and friends.  Out of line due to being called infrequently.
- */
-void rcu_resched(void)
-{
-        preempt_disable();
-        __this_cpu_write(rcu_cond_resched_count, 0);
-        rcu_note_context_switch(smp_processor_id());
-        preempt_enable();
-}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3bdf01b494fe..bc1638b33449 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4147,7 +4147,6 @@ static void __cond_resched(void)
 int __sched _cond_resched(void)
 {
-        rcu_cond_resched();
        if (should_resched()) {
                __cond_resched();
                return 1;
@@ -4166,18 +4165,15 @@ EXPORT_SYMBOL(_cond_resched);
 */
 int __cond_resched_lock(spinlock_t *lock)
 {
-        bool need_rcu_resched = rcu_should_resched();
        int resched = should_resched();
        int ret = 0;
        lockdep_assert_held(lock);
-        if (spin_needbreak(lock) || resched || need_rcu_resched) {
+        if (spin_needbreak(lock) || resched) {
                spin_unlock(lock);
                if (resched)
                        __cond_resched();
-                else if (unlikely(need_rcu_resched))
-                        rcu_resched();
                else
                        cpu_relax();
                ret = 1;
@@ -4191,7 +4187,6 @@ int __sched __cond_resched_softirq(void)
 {
        BUG_ON(!in_softirq());
-        rcu_cond_resched();  /* BH disabled OK, just recording QSes. */
        if (should_resched()) {
                local_bh_enable();
                __cond_resched();
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 695f9773bb60..627b3c34b821 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -608,7 +608,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
                avg_atom = p->se.sum_exec_runtime;
                if (nr_switches)
-                        do_div(avg_atom, nr_switches);
+                        avg_atom = div64_ul(avg_atom, nr_switches);
                else
                        avg_atom = -1LL;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index cf009fb0bc25..658a58dc30f4 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -147,8 +147,6 @@ use_default:
            clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
                goto use_default;
-        trace_cpu_idle_rcuidle(next_state, dev->cpu);
        /*
         * Enter the idle state previously returned by the governor decision.
         * This function will block until an interrupt occurs and will take
@@ -156,8 +154,6 @@ use_default:
         */
        entered_state = cpuidle_enter(drv, dev, next_state);
-        trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
        if (broadcast)
                clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
diff --git a/kernel/smp.c b/kernel/smp.c
index 306f8180b0d5..80c33f8de14f 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -29,6 +29,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
+static void flush_smp_call_function_queue(bool warn_cpu_offline);
 static int
 hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
@@ -51,12 +53,27 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
 #ifdef CONFIG_HOTPLUG_CPU
        case CPU_UP_CANCELED:
        case CPU_UP_CANCELED_FROZEN:
+                /* Fall-through to the CPU_DEAD[_FROZEN] case. */
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                free_cpumask_var(cfd->cpumask);
                free_percpu(cfd->csd);
                break;
+        case CPU_DYING:
+        case CPU_DYING_FROZEN:
+                /*
+                 * The IPIs for the smp-call-function callbacks queued by other
+                 * CPUs might arrive late, either due to hardware latencies or
+                 * because this CPU disabled interrupts (inside stop-machine)
+                 * before the IPIs were sent. So flush out any pending callbacks
+                 * explicitly (without waiting for the IPIs to arrive), to
+                 * ensure that the outgoing CPU doesn't go offline with work
+                 * still pending.
+                 */
+                flush_smp_call_function_queue(false);
+                break;
 #endif
        };
@@ -177,23 +194,47 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
        return 0;
 }
-/*
+/**
- * Invoked by arch to handle an IPI for call function single. Must be
+ * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
- * called from the arch with interrupts disabled.
+ *
+ * Invoked by arch to handle an IPI for call function single.
+ * Must be called with interrupts disabled.
 */
 void generic_smp_call_function_single_interrupt(void)
 {
+        flush_smp_call_function_queue(true);
+}
+/**
+ * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
+ *
+ * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
+ *                    offline CPU. Skip this check if set to 'false'.
+ *
+ * Flush any pending smp-call-function callbacks queued on this CPU. This is
+ * invoked by the generic IPI handler, as well as by a CPU about to go offline,
+ * to ensure that all pending IPI callbacks are run before it goes completely
+ * offline.
+ *
+ * Loop through the call_single_queue and run all the queued callbacks.
+ * Must be called with interrupts disabled.
+ */
+static void flush_smp_call_function_queue(bool warn_cpu_offline)
+{
+        struct llist_head *head;
        struct llist_node *entry;
        struct call_single_data *csd, *csd_next;
        static bool warned;
-        entry = llist_del_all(&__get_cpu_var(call_single_queue));
+        WARN_ON(!irqs_disabled());
+        head = &__get_cpu_var(call_single_queue);
+        entry = llist_del_all(head);
        entry = llist_reverse_order(entry);
-        /*
+        /* There shouldn't be any pending callbacks on an offline CPU. */
-         * Shouldn't receive this interrupt on a cpu that is not yet online.
+        if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
-         */
+                     !warned && !llist_empty(head))) {
-        if (unlikely(!cpu_online(smp_processor_id()) && !warned)) {
                warned = true;
                WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ba9ed453c4ed..75b22e22a72c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
 static int minolduid;
-static int min_percpu_pagelist_fract = 8;
 static int ngroups_max = NGROUPS_MAX;
 static const int cap_last_cap = CAP_LAST_CAP;
@@ -152,10 +151,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
 #ifdef CONFIG_SPARC
 #endif
-#ifdef CONFIG_SPARC64
-extern int sysctl_tsb_ratio;
-#endif
 #ifdef __hppa__
 extern int pwrsw_enabled;
 #endif
@@ -865,6 +860,17 @@ static struct ctl_table kern_table[] = {
                .extra1         = &zero,
                .extra2         = &one,
        },
+#ifdef CONFIG_SMP
+        {
+                .procname       = "softlockup_all_cpu_backtrace",
+                .data           = &sysctl_softlockup_all_cpu_backtrace,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = proc_dointvec_minmax,
+                .extra1         = &zero,
+                .extra2         = &one,
+        },
+#endif /* CONFIG_SMP */
        {
                .procname       = "nmi_watchdog",
                .data           = &watchdog_user_enabled,
@@ -1321,7 +1327,7 @@ static struct ctl_table vm_table[] = {
                .maxlen         = sizeof(percpu_pagelist_fraction),
                .mode           = 0644,
                .proc_handler   = percpu_pagelist_fraction_sysctl_handler,
-                .extra1         = &min_percpu_pagelist_fract,
+                .extra1         = &zero,
        },
 #ifdef CONFIG_MMU
        {
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 88c9c65a430d..fe75444ae7ec 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -585,9 +585,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
                                struct itimerspec *new_setting,
                                struct itimerspec *old_setting)
 {
+        ktime_t exp;
        if (!rtcdev)
                return -ENOTSUPP;
+        if (flags & ~TIMER_ABSTIME)
+                return -EINVAL;
        if (old_setting)
                alarm_timer_get(timr, old_setting);
@@ -597,8 +602,16 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
        /* start the timer */
        timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
-        alarm_start(&timr->it.alarm.alarmtimer,
+        exp = timespec_to_ktime(new_setting->it_value);
-                        timespec_to_ktime(new_setting->it_value));
+        /* Convert (if necessary) to absolute time */
+        if (flags != TIMER_ABSTIME) {
+                ktime_t now;
+                now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
+                exp = ktime_add(now, exp);
+        }
+        alarm_start(&timr->it.alarm.alarmtimer, exp);
        return 0;
 }
@@ -730,6 +743,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
        if (!alarmtimer_get_rtcdev())
                return -ENOTSUPP;
+        if (flags & ~TIMER_ABSTIME)
+                return -EINVAL;
        if (!capable(CAP_WAKE_ALARM))
                return -EPERM;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5b372e3ed675..ac9d1dad630b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -265,12 +265,12 @@ static void update_ftrace_function(void)
                func = ftrace_ops_list_func;
        }
+        update_function_graph_func();
        /* If there's no change, then do nothing more here */
        if (ftrace_trace_function == func)
                return;
-        update_function_graph_func();
        /*
         * If we are using the list function, it doesn't care
         * about the function_trace_ops.
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7c56c3d06943..ff7027199a9a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -616,10 +616,6 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
        struct ring_buffer_per_cpu *cpu_buffer;
        struct rb_irq_work *work;
-        if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
-            (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
-                return POLLIN | POLLRDNORM;
        if (cpu == RING_BUFFER_ALL_CPUS)
                work = &buffer->irq_work;
        else {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 384ede311717..291397e66669 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -466,6 +466,12 @@ int __trace_puts(unsigned long ip, const char *str, int size)
        struct print_entry *entry;
        unsigned long irq_flags;
        int alloc;
+        int pc;
+        if (!(trace_flags & TRACE_ITER_PRINTK))
+                return 0;
+        pc = preempt_count();
        if (unlikely(tracing_selftest_running || tracing_disabled))
                return 0;
@@ -475,7 +481,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
        local_save_flags(irq_flags);
        buffer = global_trace.trace_buffer.buffer;
        event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, 
-                                          irq_flags, preempt_count());
+                                          irq_flags, pc);
        if (!event)
                return 0;
@@ -492,6 +498,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
                entry->buf[size] = '\0';
        __buffer_unlock_commit(buffer, event);
+        ftrace_trace_stack(buffer, irq_flags, 4, pc);
        return size;
 }
@@ -509,6 +516,12 @@ int __trace_bputs(unsigned long ip, const char *str)
        struct bputs_entry *entry;
        unsigned long irq_flags;
        int size = sizeof(struct bputs_entry);
+        int pc;
+        if (!(trace_flags & TRACE_ITER_PRINTK))
+                return 0;
+        pc = preempt_count();
        if (unlikely(tracing_selftest_running || tracing_disabled))
                return 0;
@@ -516,7 +529,7 @@ int __trace_bputs(unsigned long ip, const char *str)
        local_save_flags(irq_flags);
        buffer = global_trace.trace_buffer.buffer;
        event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
-                                          irq_flags, preempt_count());
+                                          irq_flags, pc);
        if (!event)
                return 0;
@@ -525,6 +538,7 @@ int __trace_bputs(unsigned long ip, const char *str)
        entry->str                      = str;
        __buffer_unlock_commit(buffer, event);
+        ftrace_trace_stack(buffer, irq_flags, 4, pc);
        return 1;
 }
@@ -809,7 +823,7 @@ static struct {
        { trace_clock_local,    "local",        1 },
        { trace_clock_global,   "global",       1 },
        { trace_clock_counter,  "counter",      0 },
-        { trace_clock_jiffies,  "uptime",       1 },
+        { trace_clock_jiffies,  "uptime",       0 },
        { trace_clock,          "perf",         1 },
        ARCH_TRACE_CLOCKS
 };
@@ -1396,7 +1410,6 @@ void tracing_start(void)
        arch_spin_unlock(&global_trace.max_lock);
-        ftrace_start();
 out:
        raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
 }
@@ -1443,7 +1456,6 @@ void tracing_stop(void)
        struct ring_buffer *buffer;
        unsigned long flags;
-        ftrace_stop();
        raw_spin_lock_irqsave(&global_trace.start_lock, flags);
        if (global_trace.stop_count++)
                goto out;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 26dc348332b7..57b67b1f24d1 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -59,13 +59,14 @@ u64 notrace trace_clock(void)
 /*
 * trace_jiffy_clock(): Simply use jiffies as a clock counter.
+ * Note that this use of jiffies_64 is not completely safe on
+ * 32-bit systems. But the window is tiny, and the effect if
+ * we are affected is that we will have an obviously bogus
+ * timestamp on a trace event - i.e. not life threatening.
 */
 u64 notrace trace_clock_jiffies(void)
 {
-        u64 jiffy = jiffies - INITIAL_JIFFIES;
+        return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES);
-        /* Return nsecs */
-        return (u64)jiffies_to_usecs(jiffy) * 1000ULL;
 }
 /*
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f99e0b3bca8c..2de53628689f 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -470,6 +470,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
        list_del(&file->list);
        remove_subsystem(file->system);
+        free_event_filter(file->filter);
        kmem_cache_free(file_cachep, file);
 }
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 04fdb5de823c..3c9b97e6b1f4 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -893,6 +893,9 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
        int ret;
        if (file) {
+                if (tu->tp.flags & TP_FLAG_PROFILE)
+                        return -EINTR;
                link = kmalloc(sizeof(*link), GFP_KERNEL);
                if (!link)
                        return -ENOMEM;
@@ -901,29 +904,40 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
                list_add_tail_rcu(&link->list, &tu->tp.files);
                tu->tp.flags |= TP_FLAG_TRACE;
-        } else
+        } else {
-                tu->tp.flags |= TP_FLAG_PROFILE;
+                if (tu->tp.flags & TP_FLAG_TRACE)
+                        return -EINTR;
-        ret = uprobe_buffer_enable();
+                tu->tp.flags |= TP_FLAG_PROFILE;
-        if (ret < 0)
+        }
-                return ret;
        WARN_ON(!uprobe_filter_is_empty(&tu->filter));
        if (enabled)
                return 0;
+        ret = uprobe_buffer_enable();
+        if (ret)
+                goto err_flags;
        tu->consumer.filter = filter;
        ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
-        if (ret) {
+        if (ret)
-                if (file) {
+                goto err_buffer;
-                        list_del(&link->list);
-                        kfree(link);
-                        tu->tp.flags &= ~TP_FLAG_TRACE;
-                } else
-                        tu->tp.flags &= ~TP_FLAG_PROFILE;
-        }
+        return 0;
+ err_buffer:
+        uprobe_buffer_disable();
+ err_flags:
+        if (file) {
+                list_del(&link->list);
+                kfree(link);
+                tu->tp.flags &= ~TP_FLAG_TRACE;
+        } else {
+                tu->tp.flags &= ~TP_FLAG_PROFILE;
+        }
        return ret;
 }
@@ -1201,12 +1215,6 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
        current->utask->vaddr = (unsigned long) &udd;
-#ifdef CONFIG_PERF_EVENTS
-        if ((tu->tp.flags & TP_FLAG_TRACE) == 0 &&
-            !uprobe_perf_filter(&tu->consumer, 0, current->mm))
-                return UPROBE_HANDLER_REMOVE;
-#endif
        if (WARN_ON_ONCE(!uprobe_cpu_buffer))
                return 0;
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 33cbd8c203f8..3490407dc7b7 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -492,33 +492,29 @@ static int sys_tracepoint_refcount;
 void syscall_regfunc(void)
 {
-        unsigned long flags;
+        struct task_struct *p, *t;
-        struct task_struct *g, *t;
        if (!sys_tracepoint_refcount) {
-                read_lock_irqsave(&tasklist_lock, flags);
+                read_lock(&tasklist_lock);
-                do_each_thread(g, t) {
+                for_each_process_thread(p, t) {
-                        /* Skip kernel threads. */
+                        set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
-                        if (t->mm)
+                }
-                                set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
+                read_unlock(&tasklist_lock);
-                } while_each_thread(g, t);
-                read_unlock_irqrestore(&tasklist_lock, flags);
        }
        sys_tracepoint_refcount++;
 }
 void syscall_unregfunc(void)
 {
-        unsigned long flags;
+        struct task_struct *p, *t;
-        struct task_struct *g, *t;
        sys_tracepoint_refcount--;
        if (!sys_tracepoint_refcount) {
-                read_lock_irqsave(&tasklist_lock, flags);
+                read_lock(&tasklist_lock);
-                do_each_thread(g, t) {
+                for_each_process_thread(p, t) {
                        clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
-                } while_each_thread(g, t);
+                }
-                read_unlock_irqrestore(&tasklist_lock, flags);
+                read_unlock(&tasklist_lock);
        }
 }
 #endif
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 516203e665fc..c3319bd1b040 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -31,6 +31,12 @@
 int watchdog_user_enabled = 1;
 int __read_mostly watchdog_thresh = 10;
+#ifdef CONFIG_SMP
+int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+#else
+#define sysctl_softlockup_all_cpu_backtrace 0
+#endif
 static int __read_mostly watchdog_running;
 static u64 __read_mostly sample_period;
@@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
 #endif
+static unsigned long soft_lockup_nmi_warn;
 /* boot commands */
 /*
@@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str)
 }
 __setup("nosoftlockup", nosoftlockup_setup);
 /*  */
+#ifdef CONFIG_SMP
+static int __init softlockup_all_cpu_backtrace_setup(char *str)
+{
+        sysctl_softlockup_all_cpu_backtrace =
+                !!simple_strtol(str, NULL, 0);
+        return 1;
+}
+__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
+#endif
 /*
 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
@@ -271,6 +287,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
        unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
        struct pt_regs *regs = get_irq_regs();
        int duration;
+        int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
        /* kick the hardlockup detector */
        watchdog_interrupt_count();
@@ -317,6 +334,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
                if (__this_cpu_read(soft_watchdog_warn) == true)
                        return HRTIMER_RESTART;
+                if (softlockup_all_cpu_backtrace) {
+                        /* Prevent multiple soft-lockup reports if one CPU is already
+                         * engaged in dumping CPU back traces.
+                         */
+                        if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
+                                /* Someone else will report us. Let's give up */
+                                __this_cpu_write(soft_watchdog_warn, true);
+                                return HRTIMER_RESTART;
+                        }
+                }
                printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
                        smp_processor_id(), duration,
                        current->comm, task_pid_nr(current));
@@ -327,6 +355,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
                else
                        dump_stack();
+                if (softlockup_all_cpu_backtrace) {
+                        /* Avoid generating two back traces for the current
+                         * CPU, given that one has already been made above.
+                         */
+                        trigger_allbutself_cpu_backtrace();
+                        clear_bit(0, &soft_lockup_nmi_warn);
+                        /* Barrier to sync with other cpus */
+                        smp_mb__after_atomic();
+                }
                if (softlockup_panic)
                        panic("softlockup: hung tasks");
                __this_cpu_write(soft_watchdog_warn, true);
@@ -527,10 +566,8 @@ static void update_timers_all_cpus(void)
        int cpu;
        get_online_cpus();
-        preempt_disable();
        for_each_online_cpu(cpu)
                update_timers(cpu);
-        preempt_enable();
        put_online_cpus();
 }
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6203d2900877..35974ac69600 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3284,6 +3284,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
                }
        }
+        dev_set_uevent_suppress(&wq_dev->dev, false);
        kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
        return 0;
 }
@@ -4879,7 +4880,7 @@ static void __init wq_numa_init(void)
        BUG_ON(!tbl);
        for_each_node(node)
-                BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
+                BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
                                node_online(node) ? node : NUMA_NO_NODE));
        for_each_possible_cpu(cpu) {