Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.hz          |    2
-rw-r--r--  kernel/capability.c        |   21
-rw-r--r--  kernel/cpu.c               |    7
-rw-r--r--  kernel/dma-coherent.c      |   10
-rw-r--r--  kernel/irq/proc.c          |   96
-rw-r--r--  kernel/kexec.c             |   66
-rw-r--r--  kernel/lockdep.c           |  303
-rw-r--r--  kernel/lockdep_internals.h |   19
-rw-r--r--  kernel/lockdep_proc.c      |   45
-rw-r--r--  kernel/module.c            |    2
-rw-r--r--  kernel/posix-timers.c      |   19
-rw-r--r--  kernel/ptrace.c            |    5
-rw-r--r--  kernel/sched.c             |   77
-rw-r--r--  kernel/sched_clock.c       |  178
-rw-r--r--  kernel/sched_fair.c        |   21
-rw-r--r--  kernel/sched_rt.c          |   10
-rw-r--r--  kernel/signal.c            |    6
-rw-r--r--  kernel/smp.c               |   58
-rw-r--r--  kernel/spinlock.c          |   12
-rw-r--r--  kernel/stop_machine.c      |    1
-rw-r--r--  kernel/sys.c               |    2
-rw-r--r--  kernel/time/tick-sched.c   |    2
-rw-r--r--  kernel/workqueue.c         |   24
23 files changed, 599 insertions, 387 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 382dd5a8b2d7..94fabd534b03 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ
 	default 1000 if HZ_1000
 
 config SCHED_HRTICK
-	def_bool HIGH_RES_TIMERS && USE_GENERIC_SMP_HELPERS
+	def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS)
diff --git a/kernel/capability.c b/kernel/capability.c
index 0101e847603e..33e51e78c2d8 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -486,17 +486,22 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	return ret;
 }
 
-int __capable(struct task_struct *t, int cap)
+/**
+ * capable - Determine if the current task has a superior capability in effect
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+int capable(int cap)
 {
-	if (security_capable(t, cap) == 0) {
-		t->flags |= PF_SUPERPRIV;
+	if (has_capability(current, cap)) {
+		current->flags |= PF_SUPERPRIV;
 		return 1;
 	}
 	return 0;
 }
-
-int capable(int cap)
-{
-	return __capable(current, cap);
-}
 EXPORT_SYMBOL(capable);
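
The capable() rework above folds __capable() into its one remaining generic user while keeping the calling convention intact, so existing call sites need no change. A minimal sketch of a typical caller (the function name here is hypothetical, not from this patch):

	/* Hypothetical caller: gate a privileged operation on CAP_SYS_ADMIN.
	 * capable() returns nonzero and marks the task PF_SUPERPRIV when the
	 * capability is in effect. */
	static int example_reset_device(void)
	{
		if (!capable(CAP_SYS_ADMIN))
			return -EPERM;
		/* ... privileged work ... */
		return 0;
	}
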
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e202a68d1cc1..f17e9854c246 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -349,6 +349,8 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
 
+	cpu_set(cpu, cpu_active_map);
+
 	/* Now call notifier in preparation. */
 	raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
 
@@ -367,7 +369,7 @@ int __cpuinit cpu_up(unsigned int cpu)
 	if (!cpu_isset(cpu, cpu_possible_map)) {
 		printk(KERN_ERR "can't online cpu %d because it is not "
 			"configured as may-hotadd at boot time\n", cpu);
-#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) || defined(CONFIG_S390)
+#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
 		printk(KERN_ERR "please check additional_cpus= boot "
 				"parameter\n");
 #endif
@@ -383,9 +385,6 @@ int __cpuinit cpu_up(unsigned int cpu)
 
 	err = _cpu_up(cpu, 0);
 
-	if (cpu_online(cpu))
-		cpu_set(cpu, cpu_active_map);
-
 out:
 	cpu_maps_update_done();
 	return err;
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index 91e96950cd52..c1d4d5b4c61c 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -92,7 +92,7 @@ void *dma_mark_declared_memory_occupied(struct device *dev,
 EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
 
 /**
- * Try to allocate memory from the per-device coherent area.
+ * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area
  *
  * @dev:	device from which we allocate memory
  * @size:	size of requested memory area
@@ -100,11 +100,11 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
  * @ret:	This pointer will be filled with the virtual address
  *		to allocated area.
  *
- * This function should be only called from per-arch %dma_alloc_coherent()
+ * This function should be only called from per-arch dma_alloc_coherent()
  * to support allocation from per-device coherent memory pools.
  *
  * Returns 0 if dma_alloc_coherent should continue with allocating from
- * generic memory areas, or !0 if dma_alloc_coherent should return %ret.
+ * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
  */
 int dma_alloc_from_coherent(struct device *dev, ssize_t size,
 				       dma_addr_t *dma_handle, void **ret)
@@ -126,7 +126,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
 }
 
 /**
- * Try to free the memory allocated from per-device coherent memory pool.
+ * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
  * @dev:	device from which the memory was allocated
  * @order:	the order of pages allocated
  * @vaddr:	virtual address of allocated pages
@@ -135,7 +135,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
  * coherent memory pool and if so, releases that memory.
  *
  * Returns 1 if we correctly released the memory, or 0 if
- * %dma_release_coherent() should proceed with releasing memory from
+ * dma_release_coherent() should proceed with releasing memory from
  * generic pools.
  */
 int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
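
The kernel-doc fixes above restate the intended call pattern: an architecture's dma_alloc_coherent() consults the per-device pool first and falls back to generic memory only when dma_alloc_from_coherent() returns 0. A simplified sketch of that pattern, assuming a hypothetical architecture implementation (not code from this patch):

	void *dma_alloc_coherent(struct device *dev, size_t size,
				 dma_addr_t *dma_handle, gfp_t gfp)
	{
		void *ret;

		/* A nonzero return means the per-device pool handled the
		 * request; ret may still be NULL if the pool was exhausted. */
		if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
			return ret;

		/* ... architecture-specific fallback to generic memory ... */
		return NULL;	/* placeholder for the generic-path allocation */
	}
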
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6c6d35d68ee9..a09dd29c2fd7 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -8,6 +8,7 @@
 
 #include <linux/irq.h>
 #include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <linux/interrupt.h>
 
 #include "internals.h"
@@ -16,23 +17,18 @@ static struct proc_dir_entry *root_irq_dir;
 
 #ifdef CONFIG_SMP
 
-static int irq_affinity_read_proc(char *page, char **start, off_t off,
-				  int count, int *eof, void *data)
+static int irq_affinity_proc_show(struct seq_file *m, void *v)
 {
-	struct irq_desc *desc = irq_desc + (long)data;
+	struct irq_desc *desc = irq_desc + (long)m->private;
 	cpumask_t *mask = &desc->affinity;
-	int len;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (desc->status & IRQ_MOVE_PENDING)
 		mask = &desc->pending_mask;
 #endif
-	len = cpumask_scnprintf(page, count, *mask);
-
-	if (count - len < 2)
-		return -EINVAL;
-	len += sprintf(page + len, "\n");
-	return len;
+	seq_cpumask(m, mask);
+	seq_putc(m, '\n');
+	return 0;
 }
 
 #ifndef is_affinity_mask_valid
@@ -40,11 +36,12 @@ static int irq_affinity_read_proc(char *page, char **start, off_t off,
 #endif
 
 int no_irq_affinity;
-static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
-				   unsigned long count, void *data)
+static ssize_t irq_affinity_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *pos)
 {
-	unsigned int irq = (int)(long)data, full_count = count, err;
+	unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
 	cpumask_t new_value;
+	int err;
 
 	if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
 	    irq_balancing_disabled(irq))
@@ -65,28 +62,38 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
 	if (!cpus_intersects(new_value, cpu_online_map))
 		/* Special case for empty set - allow the architecture
 		   code to set default SMP affinity. */
-		return irq_select_affinity(irq) ? -EINVAL : full_count;
+		return irq_select_affinity(irq) ? -EINVAL : count;
 
 	irq_set_affinity(irq, new_value);
 
-	return full_count;
+	return count;
 }
 
-static int default_affinity_read(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
+static int irq_affinity_proc_open(struct inode *inode, struct file *file)
 {
-	int len = cpumask_scnprintf(page, count, irq_default_affinity);
-	if (count - len < 2)
-		return -EINVAL;
-	len += sprintf(page + len, "\n");
-	return len;
+	return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
 }
 
-static int default_affinity_write(struct file *file, const char __user *buffer,
-				  unsigned long count, void *data)
+static const struct file_operations irq_affinity_proc_fops = {
+	.open		= irq_affinity_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= irq_affinity_proc_write,
+};
+
+static int default_affinity_show(struct seq_file *m, void *v)
+{
+	seq_cpumask(m, &irq_default_affinity);
+	seq_putc(m, '\n');
+	return 0;
+}
+
+static ssize_t default_affinity_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *ppos)
 {
-	unsigned int full_count = count, err;
 	cpumask_t new_value;
+	int err;
 
 	err = cpumask_parse_user(buffer, count, new_value);
 	if (err)
@@ -105,8 +112,21 @@ static int default_affinity_write(struct file *file, const char __user *buffer,
 
 	irq_default_affinity = new_value;
 
-	return full_count;
+	return count;
 }
+
+static int default_affinity_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, default_affinity_show, NULL);
+}
+
+static const struct file_operations default_affinity_proc_fops = {
+	.open		= default_affinity_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= default_affinity_write,
+};
 #endif
 
 static int irq_spurious_read(char *page, char **start, off_t off,
@@ -178,16 +198,9 @@ void register_irq_proc(unsigned int irq)
 	irq_desc[irq].dir = proc_mkdir(name, root_irq_dir);
 
 #ifdef CONFIG_SMP
-	{
-		/* create /proc/irq/<irq>/smp_affinity */
-		entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir);
-
-		if (entry) {
-			entry->data = (void *)(long)irq;
-			entry->read_proc = irq_affinity_read_proc;
-			entry->write_proc = irq_affinity_write_proc;
-		}
-	}
+	/* create /proc/irq/<irq>/smp_affinity */
+	proc_create_data("smp_affinity", 0600, irq_desc[irq].dir,
+			 &irq_affinity_proc_fops, (void *)(long)irq);
 #endif
 
 	entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir);
@@ -208,15 +221,8 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action)
 void register_default_affinity_proc(void)
 {
 #ifdef CONFIG_SMP
-	struct proc_dir_entry *entry;
-
-	/* create /proc/irq/default_smp_affinity */
-	entry = create_proc_entry("default_smp_affinity", 0600, root_irq_dir);
-	if (entry) {
-		entry->data = NULL;
-		entry->read_proc = default_affinity_read;
-		entry->write_proc = default_affinity_write;
-	}
+	proc_create("irq/default_smp_affinity", 0600, NULL,
+		    &default_affinity_proc_fops);
 #endif
 }
 
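
The read_proc/write_proc handlers above are replaced with the standard seq_file idiom: a show routine wired up through single_open(), plus a file_operations table passed to proc_create()/proc_create_data(). Reduced to a generic sketch (the "example" names are hypothetical, not part of this patch):

	static int example_proc_show(struct seq_file *m, void *v)
	{
		seq_printf(m, "state: %d\n", 0);	/* seq_file does the buffering */
		return 0;
	}

	static int example_proc_open(struct inode *inode, struct file *file)
	{
		/* single_open() wires a one-record show routine into seq_read */
		return single_open(file, example_proc_show, NULL);
	}

	static const struct file_operations example_proc_fops = {
		.open		= example_proc_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= single_release,
	};
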
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c8a4370e2a34..59f3f0df35d4 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -12,7 +12,7 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/kexec.h>
-#include <linux/spinlock.h>
+#include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/highmem.h>
 #include <linux/syscalls.h>
@@ -77,7 +77,7 @@ int kexec_should_crash(struct task_struct *p)
  *
  * The code for the transition from the current kernel to the
  * the new kernel is placed in the control_code_buffer, whose size
- * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single
+ * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
  * page of memory is necessary, but some architectures require more.
  * Because this memory must be identity mapped in the transition from
  * virtual to physical addresses it must live in the range
@@ -242,7 +242,7 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
 	 */
 	result = -ENOMEM;
 	image->control_code_page = kimage_alloc_control_pages(image,
-					   get_order(KEXEC_CONTROL_CODE_SIZE));
+					   get_order(KEXEC_CONTROL_PAGE_SIZE));
 	if (!image->control_code_page) {
 		printk(KERN_ERR "Could not allocate control_code_buffer\n");
 		goto out;
@@ -317,7 +317,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 	 */
 	result = -ENOMEM;
 	image->control_code_page = kimage_alloc_control_pages(image,
-					   get_order(KEXEC_CONTROL_CODE_SIZE));
+					   get_order(KEXEC_CONTROL_PAGE_SIZE));
 	if (!image->control_code_page) {
 		printk(KERN_ERR "Could not allocate control_code_buffer\n");
 		goto out;
@@ -924,19 +924,14 @@ static int kimage_load_segment(struct kimage *image,
  */
 struct kimage *kexec_image;
 struct kimage *kexec_crash_image;
-/*
- * A home grown binary mutex.
- * Nothing can wait so this mutex is safe to use
- * in interrupt context :)
- */
-static int kexec_lock;
+
+static DEFINE_MUTEX(kexec_mutex);
 
 asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
 				struct kexec_segment __user *segments,
 				unsigned long flags)
 {
 	struct kimage **dest_image, *image;
-	int locked;
 	int result;
 
 	/* We only trust the superuser with rebooting the system. */
@@ -972,8 +967,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
 	 *
 	 * KISS: always take the mutex.
 	 */
-	locked = xchg(&kexec_lock, 1);
-	if (locked)
+	if (!mutex_trylock(&kexec_mutex))
 		return -EBUSY;
 
 	dest_image = &kexec_image;
@@ -1015,8 +1009,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
 	image = xchg(dest_image, image);
 
 out:
-	locked = xchg(&kexec_lock, 0); /* Release the mutex */
-	BUG_ON(!locked);
+	mutex_unlock(&kexec_mutex);
 	kimage_free(image);
 
 	return result;
@@ -1063,10 +1056,7 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
 
 void crash_kexec(struct pt_regs *regs)
 {
-	int locked;
-
-
-	/* Take the kexec_lock here to prevent sys_kexec_load
+	/* Take the kexec_mutex here to prevent sys_kexec_load
 	 * running on one cpu from replacing the crash kernel
 	 * we are using after a panic on a different cpu.
 	 *
@@ -1074,8 +1064,7 @@ void crash_kexec(struct pt_regs *regs)
 	 * of memory the xchg(&kexec_crash_image) would be
 	 * sufficient. But since I reuse the memory...
 	 */
-	locked = xchg(&kexec_lock, 1);
-	if (!locked) {
+	if (mutex_trylock(&kexec_mutex)) {
 		if (kexec_crash_image) {
 			struct pt_regs fixed_regs;
 			crash_setup_regs(&fixed_regs, regs);
@@ -1083,8 +1072,7 @@ void crash_kexec(struct pt_regs *regs)
 			machine_crash_shutdown(&fixed_regs);
 			machine_kexec(kexec_crash_image);
 		}
-		locked = xchg(&kexec_lock, 0);
-		BUG_ON(!locked);
+		mutex_unlock(&kexec_mutex);
 	}
 }
 
@@ -1426,25 +1414,23 @@ static int __init crash_save_vmcoreinfo_init(void)
 
 module_init(crash_save_vmcoreinfo_init)
 
-/**
- * kernel_kexec - reboot the system
- *
- * Move into place and start executing a preloaded standalone
- * executable. If nothing was preloaded return an error.
+/*
+ * Move into place and start executing a preloaded standalone
+ * executable. If nothing was preloaded return an error.
  */
 int kernel_kexec(void)
 {
 	int error = 0;
 
-	if (xchg(&kexec_lock, 1))
+	if (!mutex_trylock(&kexec_mutex))
 		return -EBUSY;
 	if (!kexec_image) {
 		error = -EINVAL;
 		goto Unlock;
 	}
 
-	if (kexec_image->preserve_context) {
 #ifdef CONFIG_KEXEC_JUMP
+	if (kexec_image->preserve_context) {
 		mutex_lock(&pm_mutex);
 		pm_prepare_console();
 		error = freeze_processes();
@@ -1459,6 +1445,7 @@ int kernel_kexec(void)
 		error = disable_nonboot_cpus();
 		if (error)
 			goto Resume_devices;
+		device_pm_lock();
 		local_irq_disable();
 		/* At this point, device_suspend() has been called,
 		 * but *not* device_power_down(). We *must*
@@ -1470,26 +1457,22 @@ int kernel_kexec(void)
 		error = device_power_down(PMSG_FREEZE);
 		if (error)
 			goto Enable_irqs;
-		save_processor_state();
+	} else
 #endif
-	} else {
-		blocking_notifier_call_chain(&reboot_notifier_list,
-					     SYS_RESTART, NULL);
-		system_state = SYSTEM_RESTART;
-		device_shutdown();
-		sysdev_shutdown();
+	{
+		kernel_restart_prepare(NULL);
 		printk(KERN_EMERG "Starting new kernel\n");
 		machine_shutdown();
 	}
 
 	machine_kexec(kexec_image);
 
-	if (kexec_image->preserve_context) {
 #ifdef CONFIG_KEXEC_JUMP
-		restore_processor_state();
+	if (kexec_image->preserve_context) {
 		device_power_up(PMSG_RESTORE);
 	Enable_irqs:
 		local_irq_enable();
+		device_pm_unlock();
 		enable_nonboot_cpus();
 	Resume_devices:
 		device_resume(PMSG_RESTORE);
@@ -1499,11 +1482,10 @@ int kernel_kexec(void)
  Restore_console:
 		pm_restore_console();
 		mutex_unlock(&pm_mutex);
-#endif
 	}
+#endif
 
  Unlock:
-	xchg(&kexec_lock, 0);
-
+	mutex_unlock(&kexec_mutex);
 	return error;
 }
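
Throughout kexec the home-grown xchg()-based lock becomes a real mutex while keeping the old fail-fast semantics: every acquisition site uses mutex_trylock() and returns -EBUSY instead of sleeping. The idiom, reduced to a sketch with hypothetical names:

	static DEFINE_MUTEX(example_mutex);

	int example_enter(void)
	{
		/* Fail with -EBUSY rather than block, matching the old
		 * xchg(&lock, 1) behaviour. */
		if (!mutex_trylock(&example_mutex))
			return -EBUSY;
		/* ... exclusive section ... */
		mutex_unlock(&example_mutex);
		return 0;
	}
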
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index d38a64362973..3bfb1877a003 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -124,6 +124,15 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
 unsigned long nr_lock_classes;
 static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
 
+static inline struct lock_class *hlock_class(struct held_lock *hlock)
+{
+	if (!hlock->class_idx) {
+		DEBUG_LOCKS_WARN_ON(1);
+		return NULL;
+	}
+	return lock_classes + hlock->class_idx - 1;
+}
+
 #ifdef CONFIG_LOCK_STAT
 static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
 
@@ -222,7 +231,7 @@ static void lock_release_holdtime(struct held_lock *hlock)
 
 	holdtime = sched_clock() - hlock->holdtime_stamp;
 
-	stats = get_lock_stats(hlock->class);
+	stats = get_lock_stats(hlock_class(hlock));
 	if (hlock->read)
 		lock_time_inc(&stats->read_holdtime, holdtime);
 	else
@@ -372,6 +381,19 @@ unsigned int nr_process_chains;
 unsigned int max_lockdep_depth;
 unsigned int max_recursion_depth;
 
+static unsigned int lockdep_dependency_gen_id;
+
+static bool lockdep_dependency_visit(struct lock_class *source,
+				     unsigned int depth)
+{
+	if (!depth)
+		lockdep_dependency_gen_id++;
+	if (source->dep_gen_id == lockdep_dependency_gen_id)
+		return true;
+	source->dep_gen_id = lockdep_dependency_gen_id;
+	return false;
+}
+
 #ifdef CONFIG_DEBUG_LOCKDEP
 /*
  * We cannot printk in early bootup code. Not even early_printk()
@@ -505,7 +527,7 @@ static void print_lockdep_cache(struct lockdep_map *lock)
 
 static void print_lock(struct held_lock *hlock)
 {
-	print_lock_name(hlock->class);
+	print_lock_name(hlock_class(hlock));
 	printk(", at: ");
 	print_ip_sym(hlock->acquire_ip);
 }
@@ -558,6 +580,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
 {
 	struct lock_list *entry;
 
+	if (lockdep_dependency_visit(class, depth))
+		return;
+
 	if (DEBUG_LOCKS_WARN_ON(depth >= 20))
 		return;
 
@@ -932,7 +957,7 @@ static noinline int print_circular_bug_tail(void)
 	if (debug_locks_silent)
 		return 0;
 
-	this.class = check_source->class;
+	this.class = hlock_class(check_source);
 	if (!save_trace(&this.trace))
 		return 0;
 
@@ -959,6 +984,67 @@ static int noinline print_infinite_recursion_bug(void)
 	return 0;
 }
 
+unsigned long __lockdep_count_forward_deps(struct lock_class *class,
+					   unsigned int depth)
+{
+	struct lock_list *entry;
+	unsigned long ret = 1;
+
+	if (lockdep_dependency_visit(class, depth))
+		return 0;
+
+	/*
+	 * Recurse this class's dependency list:
+	 */
+	list_for_each_entry(entry, &class->locks_after, entry)
+		ret += __lockdep_count_forward_deps(entry->class, depth + 1);
+
+	return ret;
+}
+
+unsigned long lockdep_count_forward_deps(struct lock_class *class)
+{
+	unsigned long ret, flags;
+
+	local_irq_save(flags);
+	__raw_spin_lock(&lockdep_lock);
+	ret = __lockdep_count_forward_deps(class, 0);
+	__raw_spin_unlock(&lockdep_lock);
+	local_irq_restore(flags);
+
+	return ret;
+}
+
+unsigned long __lockdep_count_backward_deps(struct lock_class *class,
+					    unsigned int depth)
+{
+	struct lock_list *entry;
+	unsigned long ret = 1;
+
+	if (lockdep_dependency_visit(class, depth))
+		return 0;
+	/*
+	 * Recurse this class's dependency list:
+	 */
+	list_for_each_entry(entry, &class->locks_before, entry)
+		ret += __lockdep_count_backward_deps(entry->class, depth + 1);
+
+	return ret;
+}
+
+unsigned long lockdep_count_backward_deps(struct lock_class *class)
+{
+	unsigned long ret, flags;
+
+	local_irq_save(flags);
+	__raw_spin_lock(&lockdep_lock);
+	ret = __lockdep_count_backward_deps(class, 0);
+	__raw_spin_unlock(&lockdep_lock);
+	local_irq_restore(flags);
+
+	return ret;
+}
+
 /*
  * Prove that the dependency graph starting at <entry> can not
  * lead to <target>. Print an error and return 0 if it does.
@@ -968,6 +1054,9 @@ check_noncircular(struct lock_class *source, unsigned int depth)
 {
 	struct lock_list *entry;
 
+	if (lockdep_dependency_visit(source, depth))
+		return 1;
+
 	debug_atomic_inc(&nr_cyclic_check_recursions);
 	if (depth > max_recursion_depth)
 		max_recursion_depth = depth;
@@ -977,7 +1066,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
 	 * Check this lock's dependency list:
 	 */
 	list_for_each_entry(entry, &source->locks_after, entry) {
-		if (entry->class == check_target->class)
+		if (entry->class == hlock_class(check_target))
 			return print_circular_bug_header(entry, depth+1);
 		debug_atomic_inc(&nr_cyclic_checks);
 		if (!check_noncircular(entry->class, depth+1))
@@ -1011,6 +1100,9 @@ find_usage_forwards(struct lock_class *source, unsigned int depth)
 	struct lock_list *entry;
 	int ret;
 
+	if (lockdep_dependency_visit(source, depth))
+		return 1;
+
 	if (depth > max_recursion_depth)
 		max_recursion_depth = depth;
 	if (depth >= RECURSION_LIMIT)
@@ -1050,6 +1142,9 @@ find_usage_backwards(struct lock_class *source, unsigned int depth)
 	struct lock_list *entry;
 	int ret;
 
+	if (lockdep_dependency_visit(source, depth))
+		return 1;
+
 	if (!__raw_spin_is_locked(&lockdep_lock))
 		return DEBUG_LOCKS_WARN_ON(1);
 
@@ -1064,6 +1159,11 @@ find_usage_backwards(struct lock_class *source, unsigned int depth)
 		return 2;
 	}
 
+	if (!source && debug_locks_off_graph_unlock()) {
+		WARN_ON(1);
+		return 0;
+	}
+
 	/*
 	 * Check this lock's dependency list:
 	 */
@@ -1103,9 +1203,9 @@ print_bad_irq_dependency(struct task_struct *curr,
 	printk("\nand this task is already holding:\n");
 	print_lock(prev);
 	printk("which would create a new lock dependency:\n");
-	print_lock_name(prev->class);
+	print_lock_name(hlock_class(prev));
 	printk(" ->");
-	print_lock_name(next->class);
+	print_lock_name(hlock_class(next));
 	printk("\n");
 
 	printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
@@ -1146,12 +1246,12 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
 
 	find_usage_bit = bit_backwards;
 	/* fills in <backwards_match> */
-	ret = find_usage_backwards(prev->class, 0);
+	ret = find_usage_backwards(hlock_class(prev), 0);
 	if (!ret || ret == 1)
 		return ret;
 
 	find_usage_bit = bit_forwards;
-	ret = find_usage_forwards(next->class, 0);
+	ret = find_usage_forwards(hlock_class(next), 0);
 	if (!ret || ret == 1)
 		return ret;
 	/* ret == 2 */
@@ -1272,18 +1372,32 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
 	       struct lockdep_map *next_instance, int read)
 {
 	struct held_lock *prev;
+	struct held_lock *nest = NULL;
 	int i;
 
 	for (i = 0; i < curr->lockdep_depth; i++) {
 		prev = curr->held_locks + i;
-		if (prev->class != next->class)
+
+		if (prev->instance == next->nest_lock)
+			nest = prev;
+
+		if (hlock_class(prev) != hlock_class(next))
 			continue;
+
 		/*
 		 * Allow read-after-read recursion of the same
 		 * lock class (i.e. read_lock(lock)+read_lock(lock)):
 		 */
 		if ((read == 2) && prev->read)
 			return 2;
+
+		/*
+		 * We're holding the nest_lock, which serializes this lock's
+		 * nesting behaviour.
+		 */
+		if (nest)
+			return 2;
+
 		return print_deadlock_bug(curr, prev, next);
 	}
 	return 1;
@@ -1329,7 +1443,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	 */
 	check_source = next;
 	check_target = prev;
-	if (!(check_noncircular(next->class, 0)))
+	if (!(check_noncircular(hlock_class(next), 0)))
 		return print_circular_bug_tail();
 
 	if (!check_prev_add_irq(curr, prev, next))
@@ -1353,8 +1467,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	 * chains - the second one will be new, but L1 already has
 	 * L2 added to its dependency list, due to the first chain.)
 	 */
-	list_for_each_entry(entry, &prev->class->locks_after, entry) {
-		if (entry->class == next->class) {
+	list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) {
+		if (entry->class == hlock_class(next)) {
 			if (distance == 1)
 				entry->distance = 1;
 			return 2;
@@ -1365,26 +1479,28 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	 * Ok, all validations passed, add the new lock
 	 * to the previous lock's dependency list:
 	 */
-	ret = add_lock_to_list(prev->class, next->class,
-			       &prev->class->locks_after, next->acquire_ip, distance);
+	ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
+			       &hlock_class(prev)->locks_after,
+			       next->acquire_ip, distance);
 
 	if (!ret)
 		return 0;
 
-	ret = add_lock_to_list(next->class, prev->class,
-			       &next->class->locks_before, next->acquire_ip, distance);
+	ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
+			       &hlock_class(next)->locks_before,
+			       next->acquire_ip, distance);
 	if (!ret)
 		return 0;
 
 	/*
 	 * Debugging printouts:
 	 */
-	if (verbose(prev->class) || verbose(next->class)) {
+	if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
 		graph_unlock();
 		printk("\n new dependency: ");
-		print_lock_name(prev->class);
+		print_lock_name(hlock_class(prev));
 		printk(" => ");
-		print_lock_name(next->class);
+		print_lock_name(hlock_class(next));
 		printk("\n");
 		dump_stack();
 		return graph_lock();
@@ -1481,7 +1597,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
 				     struct held_lock *hlock,
 				     u64 chain_key)
 {
-	struct lock_class *class = hlock->class;
+	struct lock_class *class = hlock_class(hlock);
 	struct list_head *hash_head = chainhashentry(chain_key);
 	struct lock_chain *chain;
 	struct held_lock *hlock_curr, *hlock_next;
@@ -1554,7 +1670,7 @@ cache_hit:
 	if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
 		chain->base = cn;
 		for (j = 0; j < chain->depth - 1; j++, i++) {
-			int lock_id = curr->held_locks[i].class - lock_classes;
+			int lock_id = curr->held_locks[i].class_idx - 1;
 			chain_hlocks[chain->base + j] = lock_id;
 		}
 		chain_hlocks[chain->base + j] = class - lock_classes;
@@ -1643,14 +1759,13 @@ static void check_chain_key(struct task_struct *curr)
 		hlock = curr->held_locks + i;
 		if (chain_key != hlock->prev_chain_key) {
 			debug_locks_off();
-			printk("hm#1, depth: %u [%u], %016Lx != %016Lx\n",
+			WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n",
 				curr->lockdep_depth, i,
 				(unsigned long long)chain_key,
 				(unsigned long long)hlock->prev_chain_key);
-			WARN_ON(1);
 			return;
 		}
-		id = hlock->class - lock_classes;
+		id = hlock->class_idx - 1;
 		if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
 			return;
 
@@ -1662,11 +1777,10 @@ static void check_chain_key(struct task_struct *curr)
 	}
 	if (chain_key != curr->curr_chain_key) {
 		debug_locks_off();
-		printk("hm#2, depth: %u [%u], %016Lx != %016Lx\n",
+		WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n",
 			curr->lockdep_depth, i,
 			(unsigned long long)chain_key,
 			(unsigned long long)curr->curr_chain_key);
-		WARN_ON(1);
 	}
 #endif
 }
@@ -1695,7 +1809,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
 	print_lock(this);
 
 	printk("{%s} state was registered at:\n", usage_str[prev_bit]);
-	print_stack_trace(this->class->usage_traces + prev_bit, 1);
+	print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
 
 	print_irqtrace_events(curr);
 	printk("\nother info that might help us debug this:\n");
@@ -1714,7 +1828,7 @@ static inline int
 valid_state(struct task_struct *curr, struct held_lock *this,
 	    enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
 {
-	if (unlikely(this->class->usage_mask & (1 << bad_bit)))
+	if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit)))
 		return print_usage_bug(curr, this, bad_bit, new_bit);
 	return 1;
 }
@@ -1753,7 +1867,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
 	lockdep_print_held_locks(curr);
 
 	printk("\nthe first lock's dependencies:\n");
-	print_lock_dependencies(this->class, 0);
+	print_lock_dependencies(hlock_class(this), 0);
 
 	printk("\nthe second lock's dependencies:\n");
 	print_lock_dependencies(other, 0);
@@ -1776,7 +1890,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
 
 	find_usage_bit = bit;
 	/* fills in <forwards_match> */
-	ret = find_usage_forwards(this->class, 0);
+	ret = find_usage_forwards(hlock_class(this), 0);
 	if (!ret || ret == 1)
 		return ret;
 
@@ -1795,7 +1909,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
 
 	find_usage_bit = bit;
 	/* fills in <backwards_match> */
-	ret = find_usage_backwards(this->class, 0);
+	ret = find_usage_backwards(hlock_class(this), 0);
 	if (!ret || ret == 1)
 		return ret;
 
@@ -1861,7 +1975,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 				LOCK_ENABLED_HARDIRQS_READ, "hard-read"))
 			return 0;
 #endif
-		if (hardirq_verbose(this->class))
+		if (hardirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
 	case LOCK_USED_IN_SOFTIRQ:
@@ -1886,7 +2000,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 				LOCK_ENABLED_SOFTIRQS_READ, "soft-read"))
 			return 0;
 #endif
-		if (softirq_verbose(this->class))
+		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
 	case LOCK_USED_IN_HARDIRQ_READ:
@@ -1899,7 +2013,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (!check_usage_forwards(curr, this,
 					  LOCK_ENABLED_HARDIRQS, "hard"))
 			return 0;
-		if (hardirq_verbose(this->class))
+		if (hardirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
 	case LOCK_USED_IN_SOFTIRQ_READ:
@@ -1912,7 +2026,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (!check_usage_forwards(curr, this,
 					  LOCK_ENABLED_SOFTIRQS, "soft"))
 			return 0;
-		if (softirq_verbose(this->class))
+		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
 	case LOCK_ENABLED_HARDIRQS:
@@ -1938,7 +2052,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 				LOCK_USED_IN_HARDIRQ_READ, "hard-read"))
 			return 0;
 #endif
-		if (hardirq_verbose(this->class))
+		if (hardirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
 	case LOCK_ENABLED_SOFTIRQS:
@@ -1964,7 +2078,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 				LOCK_USED_IN_SOFTIRQ_READ, "soft-read"))
 			return 0;
 #endif
-		if (softirq_verbose(this->class))
+		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
 	case LOCK_ENABLED_HARDIRQS_READ:
@@ -1979,7 +2093,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 					   LOCK_USED_IN_HARDIRQ, "hard"))
 			return 0;
 #endif
-		if (hardirq_verbose(this->class))
+		if (hardirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
 	case LOCK_ENABLED_SOFTIRQS_READ:
@@ -1994,7 +2108,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 					   LOCK_USED_IN_SOFTIRQ, "soft"))
 			return 0;
 #endif
-		if (softirq_verbose(this->class))
+		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
 	default:
@@ -2310,7 +2424,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 	 * If already set then do not dirty the cacheline,
 	 * nor do any checks:
 	 */
-	if (likely(this->class->usage_mask & new_mask))
+	if (likely(hlock_class(this)->usage_mask & new_mask))
 		return 1;
 
 	if (!graph_lock())
@@ -2318,14 +2432,14 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 	/*
 	 * Make sure we didnt race:
 	 */
-	if (unlikely(this->class->usage_mask & new_mask)) {
+	if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
 		graph_unlock();
 		return 1;
 	}
 
-	this->class->usage_mask |= new_mask;
+	hlock_class(this)->usage_mask |= new_mask;
 
-	if (!save_trace(this->class->usage_traces + new_bit))
+	if (!save_trace(hlock_class(this)->usage_traces + new_bit))
 		return 0;
 
 	switch (new_bit) {
@@ -2405,7 +2519,7 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
  */
 static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 			  int trylock, int read, int check, int hardirqs_off,
-			  unsigned long ip)
+			  struct lockdep_map *nest_lock, unsigned long ip)
 {
 	struct task_struct *curr = current;
 	struct lock_class *class = NULL;
@@ -2459,14 +2573,16 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 		return 0;
 
 	hlock = curr->held_locks + depth;
-
-	hlock->class = class;
+	if (DEBUG_LOCKS_WARN_ON(!class))
+		return 0;
+	hlock->class_idx = class - lock_classes + 1;
 	hlock->acquire_ip = ip;
 	hlock->instance = lock;
+	hlock->nest_lock = nest_lock;
 	hlock->trylock = trylock;
 	hlock->read = read;
 	hlock->check = check;
-	hlock->hardirqs_off = hardirqs_off;
+	hlock->hardirqs_off = !!hardirqs_off;
 #ifdef CONFIG_LOCK_STAT
 	hlock->waittime_stamp = 0;
 	hlock->holdtime_stamp = sched_clock();
@@ -2574,6 +2690,55 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
 	return 1;
 }
 
+static int
+__lock_set_subclass(struct lockdep_map *lock,
+		    unsigned int subclass, unsigned long ip)
+{
+	struct task_struct *curr = current;
+	struct held_lock *hlock, *prev_hlock;
+	struct lock_class *class;
+	unsigned int depth;
+	int i;
+
+	depth = curr->lockdep_depth;
+	if (DEBUG_LOCKS_WARN_ON(!depth))
+		return 0;
+
+	prev_hlock = NULL;
+	for (i = depth-1; i >= 0; i--) {
+		hlock = curr->held_locks + i;
+		/*
+		 * We must not cross into another context:
+		 */
+		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+			break;
+		if (hlock->instance == lock)
+			goto found_it;
+		prev_hlock = hlock;
+	}
+	return print_unlock_inbalance_bug(curr, lock, ip);
+
+found_it:
+	class = register_lock_class(lock, subclass, 0);
+	hlock->class_idx = class - lock_classes + 1;
+
+	curr->lockdep_depth = i;
+	curr->curr_chain_key = hlock->prev_chain_key;
+
+	for (; i < depth; i++) {
+		hlock = curr->held_locks + i;
+		if (!__lock_acquire(hlock->instance,
+			hlock_class(hlock)->subclass, hlock->trylock,
+				hlock->read, hlock->check, hlock->hardirqs_off,
+				hlock->nest_lock, hlock->acquire_ip))
+			return 0;
+	}
+
+	if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
+		return 0;
+	return 1;
+}
+
 /*
  * Remove the lock to the list of currently held locks in a
  * potentially non-nested (out of order) manner. This is a
@@ -2624,9 +2789,9 @@ found_it:
 	for (i++; i < depth; i++) {
 		hlock = curr->held_locks + i;
 		if (!__lock_acquire(hlock->instance,
-			hlock->class->subclass, hlock->trylock,
+			hlock_class(hlock)->subclass, hlock->trylock,
 				hlock->read, hlock->check, hlock->hardirqs_off,
-				hlock->acquire_ip))
+				hlock->nest_lock, hlock->acquire_ip))
 			return 0;
 	}
 
@@ -2669,7 +2834,7 @@ static int lock_release_nested(struct task_struct *curr,
 
 #ifdef CONFIG_DEBUG_LOCKDEP
 	hlock->prev_chain_key = 0;
-	hlock->class = NULL;
+	hlock->class_idx = 0;
 	hlock->acquire_ip = 0;
 	hlock->irq_context = 0;
 #endif
@@ -2738,18 +2903,36 @@ static void check_flags(unsigned long flags)
 #endif
 }
 
+void
+lock_set_subclass(struct lockdep_map *lock,
+		  unsigned int subclass, unsigned long ip)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	current->lockdep_recursion = 1;
+	check_flags(flags);
+	if (__lock_set_subclass(lock, subclass, ip))
+		check_chain_key(current);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL_GPL(lock_set_subclass);
+
 /*
  * We are not always called with irqs disabled - do that here,
  * and also avoid lockdep recursion:
  */
 void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
-			  int trylock, int read, int check, unsigned long ip)
+			  int trylock, int read, int check,
+			  struct lockdep_map *nest_lock, unsigned long ip)
 {
 	unsigned long flags;
 
-	if (unlikely(!lock_stat && !prove_locking))
-		return;
-
 	if (unlikely(current->lockdep_recursion))
 		return;
 
@@ -2758,7 +2941,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 
 	current->lockdep_recursion = 1;
 	__lock_acquire(lock, subclass, trylock, read, check,
-		       irqs_disabled_flags(flags), ip);
+		       irqs_disabled_flags(flags), nest_lock, ip);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }
@@ -2770,9 +2953,6 @@ void lock_release(struct lockdep_map *lock, int nested,
 {
 	unsigned long flags;
 
-	if (unlikely(!lock_stat && !prove_locking))
-		return;
-
 	if (unlikely(current->lockdep_recursion))
 		return;
 
@@ -2845,9 +3025,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
 found_it:
 	hlock->waittime_stamp = sched_clock();
 
-	point = lock_contention_point(hlock->class, ip);
+	point = lock_contention_point(hlock_class(hlock), ip);
 
-	stats = get_lock_stats(hlock->class);
+	stats = get_lock_stats(hlock_class(hlock));
 	if (point < ARRAY_SIZE(stats->contention_point))
 		stats->contention_point[i]++;
 	if (lock->cpu != smp_processor_id())
@@ -2893,7 +3073,7 @@ found_it:
 		hlock->holdtime_stamp = now;
 	}
 
-	stats = get_lock_stats(hlock->class);
+	stats = get_lock_stats(hlock_class(hlock));
 	if (waittime) {
 		if (hlock->read)
 			lock_time_inc(&stats->read_waittime, waittime);
@@ -2988,6 +3168,7 @@ static void zap_class(struct lock_class *class)
 	list_del_rcu(&class->hash_entry);
 	list_del_rcu(&class->lock_entry);
 
+	class->key = NULL;
 }
 
 static inline int within(const void *addr, void *start, unsigned long size)
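
Two themes run through the lockdep changes: held locks now store a compact class index (class_idx) instead of a bare pointer, accessed through the new hlock_class() helper, and the recursive graph walks are bounded by lockdep_dependency_visit(), which stamps each class with a generation counter so a walk never revisits a node. The counter trick, reduced to a self-contained sketch (simplified types, not the kernel structures):

	/* Bumping the generation at the start of a walk invalidates every
	 * node's old mark in O(1); no per-walk clearing pass is needed. */
	struct node {
		unsigned int dep_gen_id;
		/* ... edges ... */
	};

	static unsigned int dependency_gen_id;

	static int already_visited(struct node *n, unsigned int depth)
	{
		if (!depth)			/* depth 0: a fresh walk begins */
			dependency_gen_id++;
		if (n->dep_gen_id == dependency_gen_id)
			return 1;		/* seen earlier in this walk */
		n->dep_gen_id = dependency_gen_id;
		return 0;
	}
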
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index c3600a091a28..56b196932c08 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -17,9 +17,6 @@
  */
 #define MAX_LOCKDEP_ENTRIES	8192UL
 
-#define MAX_LOCKDEP_KEYS_BITS	11
-#define MAX_LOCKDEP_KEYS	(1UL << MAX_LOCKDEP_KEYS_BITS)
-
 #define MAX_LOCKDEP_CHAINS_BITS	14
 #define MAX_LOCKDEP_CHAINS	(1UL << MAX_LOCKDEP_CHAINS_BITS)
 
@@ -53,6 +50,22 @@ extern unsigned int nr_process_chains;
 extern unsigned int max_lockdep_depth;
 extern unsigned int max_recursion_depth;
 
+#ifdef CONFIG_PROVE_LOCKING
+extern unsigned long lockdep_count_forward_deps(struct lock_class *);
+extern unsigned long lockdep_count_backward_deps(struct lock_class *);
+#else
+static inline unsigned long
+lockdep_count_forward_deps(struct lock_class *class)
+{
+	return 0;
+}
+static inline unsigned long
+lockdep_count_backward_deps(struct lock_class *class)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_DEBUG_LOCKDEP
 /*
  * Various lockdep statistics:
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 9b0e940e2545..4b194d34d77f 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -63,34 +63,6 @@ static void l_stop(struct seq_file *m, void *v)
 {
 }
 
-static unsigned long count_forward_deps(struct lock_class *class)
-{
-	struct lock_list *entry;
-	unsigned long ret = 1;
-
-	/*
-	 * Recurse this class's dependency list:
-	 */
-	list_for_each_entry(entry, &class->locks_after, entry)
-		ret += count_forward_deps(entry->class);
-
-	return ret;
-}
-
-static unsigned long count_backward_deps(struct lock_class *class)
-{
-	struct lock_list *entry;
-	unsigned long ret = 1;
-
-	/*
-	 * Recurse this class's dependency list:
-	 */
-	list_for_each_entry(entry, &class->locks_before, entry)
-		ret += count_backward_deps(entry->class);
-
-	return ret;
-}
-
 static void print_name(struct seq_file *m, struct lock_class *class)
 {
 	char str[128];
@@ -110,7 +82,6 @@ static void print_name(struct seq_file *m, struct lock_class *class)
110 82
111static int l_show(struct seq_file *m, void *v) 83static int l_show(struct seq_file *m, void *v)
112{ 84{
113 unsigned long nr_forward_deps, nr_backward_deps;
114 struct lock_class *class = v; 85 struct lock_class *class = v;
115 struct lock_list *entry; 86 struct lock_list *entry;
116 char c1, c2, c3, c4; 87 char c1, c2, c3, c4;
@@ -124,11 +95,10 @@ static int l_show(struct seq_file *m, void *v)
124#ifdef CONFIG_DEBUG_LOCKDEP 95#ifdef CONFIG_DEBUG_LOCKDEP
125 seq_printf(m, " OPS:%8ld", class->ops); 96 seq_printf(m, " OPS:%8ld", class->ops);
126#endif 97#endif
127 nr_forward_deps = count_forward_deps(class); 98#ifdef CONFIG_PROVE_LOCKING
128 seq_printf(m, " FD:%5ld", nr_forward_deps); 99 seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
129 100 seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
130 nr_backward_deps = count_backward_deps(class); 101#endif
131 seq_printf(m, " BD:%5ld", nr_backward_deps);
132 102
133 get_usage_chars(class, &c1, &c2, &c3, &c4); 103 get_usage_chars(class, &c1, &c2, &c3, &c4);
134 seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); 104 seq_printf(m, " %c%c%c%c", c1, c2, c3, c4);
@@ -229,6 +199,9 @@ static int lc_show(struct seq_file *m, void *v)
229 199
230 for (i = 0; i < chain->depth; i++) { 200 for (i = 0; i < chain->depth; i++) {
231 class = lock_chain_get_class(chain, i); 201 class = lock_chain_get_class(chain, i);
202 if (!class->key)
203 continue;
204
232 seq_printf(m, "[%p] ", class->key); 205 seq_printf(m, "[%p] ", class->key);
233 print_name(m, class); 206 print_name(m, class);
234 seq_puts(m, "\n"); 207 seq_puts(m, "\n");
@@ -350,7 +323,9 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
350 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) 323 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
351 nr_hardirq_read_unsafe++; 324 nr_hardirq_read_unsafe++;
352 325
353 sum_forward_deps += count_forward_deps(class); 326#ifdef CONFIG_PROVE_LOCKING
327 sum_forward_deps += lockdep_count_forward_deps(class);
328#endif
354 } 329 }
355#ifdef CONFIG_DEBUG_LOCKDEP 330#ifdef CONFIG_DEBUG_LOCKDEP
356 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); 331 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused);
diff --git a/kernel/module.c b/kernel/module.c
index 61d212120df4..08864d257eb0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2288,7 +2288,7 @@ sys_init_module(void __user *umod,
2288 2288
2289 /* Start the module */ 2289 /* Start the module */
2290 if (mod->init != NULL) 2290 if (mod->init != NULL)
2291 ret = mod->init(); 2291 ret = do_one_initcall(mod->init);
2292 if (ret < 0) { 2292 if (ret < 0) {
2293 /* Init routine failed: abort. Try to protect us from 2293 /* Init routine failed: abort. Try to protect us from
2294 buggy refcounters. */ 2294 buggy refcounters. */
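Routing mod->init() through do_one_initcall() means module init functions get the same bookkeeping as built-in initcalls. A rough sketch of what such a wrapper does, assuming it mirrors the initcall_debug timing in init/main.c (the exact body is an assumption, not part of this diff):

	int do_one_initcall(int (*fn)(void))
	{
		ktime_t t0, t1;
		int result;

		if (initcall_debug) {
			printk(KERN_DEBUG "calling %p\n", fn);
			t0 = ktime_get();
		}

		result = fn();

		if (initcall_debug) {
			t1 = ktime_get();
			printk(KERN_DEBUG "initcall %p returned %d after %lld nsecs\n",
			       fn, result,
			       (long long)ktime_to_ns(ktime_sub(t1, t0)));
		}

		return result;
	}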
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9a21681aa80f..e36d5798cbff 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -289,21 +289,29 @@ void do_schedule_next_timer(struct siginfo *info)
289 else 289 else
290 schedule_next_timer(timr); 290 schedule_next_timer(timr);
291 291
292 info->si_overrun = timr->it_overrun_last; 292 info->si_overrun += timr->it_overrun_last;
293 } 293 }
294 294
295 if (timr) 295 if (timr)
296 unlock_timer(timr, flags); 296 unlock_timer(timr, flags);
297} 297}
298 298
299int posix_timer_event(struct k_itimer *timr,int si_private) 299int posix_timer_event(struct k_itimer *timr, int si_private)
300{ 300{
301 memset(&timr->sigq->info, 0, sizeof(siginfo_t)); 301 /*
302 * FIXME: if ->sigq is queued we can race with
303 * dequeue_signal()->do_schedule_next_timer().
304 *
305 * If dequeue_signal() sees the "right" value of
306 * si_sys_private it calls do_schedule_next_timer().
307 * We re-queue ->sigq and drop ->it_lock().
308 * do_schedule_next_timer() locks the timer
309 * and re-schedules it while ->sigq is pending.
310 * Not really bad, but not what we want.
311 */
302 timr->sigq->info.si_sys_private = si_private; 312 timr->sigq->info.si_sys_private = si_private;
303 /* Send signal to the process that owns this timer.*/
304 313
305 timr->sigq->info.si_signo = timr->it_sigev_signo; 314 timr->sigq->info.si_signo = timr->it_sigev_signo;
306 timr->sigq->info.si_errno = 0;
307 timr->sigq->info.si_code = SI_TIMER; 315 timr->sigq->info.si_code = SI_TIMER;
308 timr->sigq->info.si_tid = timr->it_id; 316 timr->sigq->info.si_tid = timr->it_id;
309 timr->sigq->info.si_value = timr->it_sigev_value; 317 timr->sigq->info.si_value = timr->it_sigev_value;
@@ -435,6 +443,7 @@ static struct k_itimer * alloc_posix_timer(void)
435 kmem_cache_free(posix_timers_cache, tmr); 443 kmem_cache_free(posix_timers_cache, tmr);
436 tmr = NULL; 444 tmr = NULL;
437 } 445 }
446 memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
438 return tmr; 447 return tmr;
439} 448}
440 449
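Two details in this hunk are easy to miss: si_overrun now accumulates across re-queues (it is reset at delivery time, see the kernel/signal.c hunk below), and the siginfo memset moves out of posix_timer_event() so a queued ->sigq is never wiped while in flight. As shown, though, the relocated memset also runs when sigqueue_alloc() failed and tmr was just set to NULL; a NULL-safe arrangement of the same logic would be:

	if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
		kmem_cache_free(posix_timers_cache, tmr);
		return NULL;
	}
	memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
	return tmr;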
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 082b3fcb32a0..356699a96d56 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -140,7 +140,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
140 if (!dumpable && !capable(CAP_SYS_PTRACE)) 140 if (!dumpable && !capable(CAP_SYS_PTRACE))
141 return -EPERM; 141 return -EPERM;
142 142
143 return security_ptrace(current, task, mode); 143 return security_ptrace_may_access(task, mode);
144} 144}
145 145
146bool ptrace_may_access(struct task_struct *task, unsigned int mode) 146bool ptrace_may_access(struct task_struct *task, unsigned int mode)
@@ -499,8 +499,7 @@ repeat:
499 goto repeat; 499 goto repeat;
500 } 500 }
501 501
502 ret = security_ptrace(current->parent, current, 502 ret = security_ptrace_traceme(current->parent);
503 PTRACE_MODE_ATTACH);
504 503
505 /* 504 /*
506 * Set the ptrace bit in the process ptrace flags. 505 * Set the ptrace bit in the process ptrace flags.
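security_ptrace() is split into two hooks with the caller's identity baked in: security_ptrace_may_access(task, mode) covers current tracing task, and security_ptrace_traceme(parent) covers parent tracing current. A plausible sketch of capability-based defaults in the commoncap style; the names and bodies below are assumptions, not taken from this diff:

	int cap_ptrace_may_access(struct task_struct *child, unsigned int mode)
	{
		/* assumed rule: the tracer's caps must dominate the child's */
		if (cap_issubset(child->cap_permitted, current->cap_permitted))
			return 0;
		return capable(CAP_SYS_PTRACE) ? 0 : -EPERM;
	}

	int cap_ptrace_traceme(struct task_struct *parent)
	{
		if (cap_issubset(current->cap_permitted, parent->cap_permitted))
			return 0;
		return has_capability(parent, CAP_SYS_PTRACE) ? 0 : -EPERM;
	}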
diff --git a/kernel/sched.c b/kernel/sched.c
index 04160d277e7a..9a1ddb84e26d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -600,7 +600,6 @@ struct rq {
600 /* BKL stats */ 600 /* BKL stats */
601 unsigned int bkl_count; 601 unsigned int bkl_count;
602#endif 602#endif
603 struct lock_class_key rq_lock_key;
604}; 603};
605 604
606static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 605static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -809,9 +808,9 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
809 808
810/* 809/*
811 * ratelimit for updating the group shares. 810 * ratelimit for updating the group shares.
812 * default: 0.5ms 811 * default: 0.25ms
813 */ 812 */
814const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; 813unsigned int sysctl_sched_shares_ratelimit = 250000;
815 814
816/* 815/*
817 * period over which we measure -rt task cpu usage in us. 816 * period over which we measure -rt task cpu usage in us.
@@ -834,7 +833,7 @@ static inline u64 global_rt_period(void)
834 833
835static inline u64 global_rt_runtime(void) 834static inline u64 global_rt_runtime(void)
836{ 835{
837 if (sysctl_sched_rt_period < 0) 836 if (sysctl_sched_rt_runtime < 0)
838 return RUNTIME_INF; 837 return RUNTIME_INF;
839 838
840 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 839 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
@@ -2759,10 +2758,10 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2759 } else { 2758 } else {
2760 if (rq1 < rq2) { 2759 if (rq1 < rq2) {
2761 spin_lock(&rq1->lock); 2760 spin_lock(&rq1->lock);
2762 spin_lock(&rq2->lock); 2761 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
2763 } else { 2762 } else {
2764 spin_lock(&rq2->lock); 2763 spin_lock(&rq2->lock);
2765 spin_lock(&rq1->lock); 2764 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
2766 } 2765 }
2767 } 2766 }
2768 update_rq_clock(rq1); 2767 update_rq_clock(rq1);
@@ -2805,14 +2804,21 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2805 if (busiest < this_rq) { 2804 if (busiest < this_rq) {
2806 spin_unlock(&this_rq->lock); 2805 spin_unlock(&this_rq->lock);
2807 spin_lock(&busiest->lock); 2806 spin_lock(&busiest->lock);
2808 spin_lock(&this_rq->lock); 2807 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
2809 ret = 1; 2808 ret = 1;
2810 } else 2809 } else
2811 spin_lock(&busiest->lock); 2810 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
2812 } 2811 }
2813 return ret; 2812 return ret;
2814} 2813}
2815 2814
2815static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
2816 __releases(busiest->lock)
2817{
2818 spin_unlock(&busiest->lock);
2819 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
2820}
2821
2816/* 2822/*
2817 * If dest_cpu is allowed for this process, migrate the task to it. 2823 * If dest_cpu is allowed for this process, migrate the task to it.
2818 * This is accomplished by forcing the cpu_allowed mask to only 2824 * This is accomplished by forcing the cpu_allowed mask to only
@@ -3637,7 +3643,7 @@ redo:
3637 ld_moved = move_tasks(this_rq, this_cpu, busiest, 3643 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3638 imbalance, sd, CPU_NEWLY_IDLE, 3644 imbalance, sd, CPU_NEWLY_IDLE,
3639 &all_pinned); 3645 &all_pinned);
3640 spin_unlock(&busiest->lock); 3646 double_unlock_balance(this_rq, busiest);
3641 3647
3642 if (unlikely(all_pinned)) { 3648 if (unlikely(all_pinned)) {
3643 cpu_clear(cpu_of(busiest), *cpus); 3649 cpu_clear(cpu_of(busiest), *cpus);
@@ -3752,7 +3758,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3752 else 3758 else
3753 schedstat_inc(sd, alb_failed); 3759 schedstat_inc(sd, alb_failed);
3754 } 3760 }
3755 spin_unlock(&target_rq->lock); 3761 double_unlock_balance(busiest_rq, target_rq);
3756} 3762}
3757 3763
3758#ifdef CONFIG_NO_HZ 3764#ifdef CONFIG_NO_HZ
@@ -4663,6 +4669,52 @@ int __sched wait_for_completion_killable(struct completion *x)
4663} 4669}
4664EXPORT_SYMBOL(wait_for_completion_killable); 4670EXPORT_SYMBOL(wait_for_completion_killable);
4665 4671
4672/**
4673 * try_wait_for_completion - try to decrement a completion without blocking
4674 * @x: completion structure
4675 *
4676 * Returns: 0 if a decrement cannot be done without blocking
4677 * 1 if a decrement succeeded.
4678 *
4679 * If a completion is being used as a counting completion,
4680 * attempt to decrement the counter without blocking. This
4681 * enables us to avoid waiting if the resource the completion
4682 * is protecting is not available.
4683 */
4684bool try_wait_for_completion(struct completion *x)
4685{
4686 int ret = 1;
4687
4688 spin_lock_irq(&x->wait.lock);
4689 if (!x->done)
4690 ret = 0;
4691 else
4692 x->done--;
4693 spin_unlock_irq(&x->wait.lock);
4694 return ret;
4695}
4696EXPORT_SYMBOL(try_wait_for_completion);
4697
4698/**
4699 * completion_done - Test to see if a completion has any waiters
4700 * @x: completion structure
4701 *
4702 * Returns: 0 if there are waiters (wait_for_completion() in progress)
4703 * 1 if there are no waiters.
4704 *
4705 */
4706bool completion_done(struct completion *x)
4707{
4708 int ret = 1;
4709
4710 spin_lock_irq(&x->wait.lock);
4711 if (!x->done)
4712 ret = 0;
4713 spin_unlock_irq(&x->wait.lock);
4714 return ret;
4715}
4716EXPORT_SYMBOL(completion_done);
4717
4666static long __sched 4718static long __sched
4667sleep_on_common(wait_queue_head_t *q, int state, long timeout) 4719sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4668{ 4720{
@@ -5734,6 +5786,8 @@ static inline void sched_init_granularity(void)
5734 sysctl_sched_latency = limit; 5786 sysctl_sched_latency = limit;
5735 5787
5736 sysctl_sched_wakeup_granularity *= factor; 5788 sysctl_sched_wakeup_granularity *= factor;
5789
5790 sysctl_sched_shares_ratelimit *= factor;
5737} 5791}
5738 5792
5739#ifdef CONFIG_SMP 5793#ifdef CONFIG_SMP
@@ -8000,7 +8054,6 @@ void __init sched_init(void)
8000 8054
8001 rq = cpu_rq(i); 8055 rq = cpu_rq(i);
8002 spin_lock_init(&rq->lock); 8056 spin_lock_init(&rq->lock);
8003 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
8004 rq->nr_running = 0; 8057 rq->nr_running = 0;
8005 init_cfs_rq(&rq->cfs, rq); 8058 init_cfs_rq(&rq->cfs, rq);
8006 init_rt_rq(&rq->rt, rq); 8059 init_rt_rq(&rq->rt, rq);
@@ -8457,8 +8510,8 @@ struct task_group *sched_create_group(struct task_group *parent)
8457 WARN_ON(!parent); /* root should already exist */ 8510 WARN_ON(!parent); /* root should already exist */
8458 8511
8459 tg->parent = parent; 8512 tg->parent = parent;
8460 list_add_rcu(&tg->siblings, &parent->children);
8461 INIT_LIST_HEAD(&tg->children); 8513 INIT_LIST_HEAD(&tg->children);
8514 list_add_rcu(&tg->siblings, &parent->children);
8462 spin_unlock_irqrestore(&task_group_lock, flags); 8515 spin_unlock_irqrestore(&task_group_lock, flags);
8463 8516
8464 return tg; 8517 return tg;
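The new double_unlock_balance() pairs with double_lock_balance(): it drops the second runqueue's lock and uses lock_set_subclass() to re-annotate this_rq's lock back to subclass 0, so lockdep's view again matches the single-lock state. A usage sketch with a hypothetical caller:

	spin_lock(&this_rq->lock);
	if (double_lock_balance(this_rq, busiest)) {
		/* this_rq->lock was dropped and retaken; revalidate state */
	}
	/* ... move tasks between the runqueues ... */
	double_unlock_balance(this_rq, busiest);
	spin_unlock(&this_rq->lock);

The completion helpers added above make completions usable as non-blocking counters; a hypothetical resource-pool fragment:

	if (try_wait_for_completion(&pool->slot_available))
		use_slot(pool);		/* consumed one count, no sleep */
	else
		take_slow_path(pool);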
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 22ed55d1167f..204991a0bfa7 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -32,13 +32,19 @@
32#include <linux/ktime.h> 32#include <linux/ktime.h>
33#include <linux/module.h> 33#include <linux/module.h>
34 34
35/*
36 * Scheduler clock - returns current time in nanosec units.
37 * This is the default implementation.
38 * Architectures and sub-architectures can override this.
39 */
40unsigned long long __attribute__((weak)) sched_clock(void)
41{
42 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
43}
35 44
36#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 45static __read_mostly int sched_clock_running;
37 46
38#define MULTI_SHIFT 15 47#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
39/* Max is double, Min is 1/2 */
40#define MAX_MULTI (2LL << MULTI_SHIFT)
41#define MIN_MULTI (1LL << (MULTI_SHIFT-1))
42 48
43struct sched_clock_data { 49struct sched_clock_data {
44 /* 50 /*
@@ -49,14 +55,9 @@ struct sched_clock_data {
49 raw_spinlock_t lock; 55 raw_spinlock_t lock;
50 56
51 unsigned long tick_jiffies; 57 unsigned long tick_jiffies;
52 u64 prev_raw;
53 u64 tick_raw; 58 u64 tick_raw;
54 u64 tick_gtod; 59 u64 tick_gtod;
55 u64 clock; 60 u64 clock;
56 s64 multi;
57#ifdef CONFIG_NO_HZ
58 int check_max;
59#endif
60}; 61};
61 62
62static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); 63static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
@@ -71,8 +72,6 @@ static inline struct sched_clock_data *cpu_sdc(int cpu)
71 return &per_cpu(sched_clock_data, cpu); 72 return &per_cpu(sched_clock_data, cpu);
72} 73}
73 74
74static __read_mostly int sched_clock_running;
75
76void sched_clock_init(void) 75void sched_clock_init(void)
77{ 76{
78 u64 ktime_now = ktime_to_ns(ktime_get()); 77 u64 ktime_now = ktime_to_ns(ktime_get());
@@ -84,90 +83,39 @@ void sched_clock_init(void)
84 83
85 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 84 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
86 scd->tick_jiffies = now_jiffies; 85 scd->tick_jiffies = now_jiffies;
87 scd->prev_raw = 0;
88 scd->tick_raw = 0; 86 scd->tick_raw = 0;
89 scd->tick_gtod = ktime_now; 87 scd->tick_gtod = ktime_now;
90 scd->clock = ktime_now; 88 scd->clock = ktime_now;
91 scd->multi = 1 << MULTI_SHIFT;
92#ifdef CONFIG_NO_HZ
93 scd->check_max = 1;
94#endif
95 } 89 }
96 90
97 sched_clock_running = 1; 91 sched_clock_running = 1;
98} 92}
99 93
100#ifdef CONFIG_NO_HZ
101/*
102 * The dynamic ticks makes the delta jiffies inaccurate. This
103 * prevents us from checking the maximum time update.
104 * Disable the maximum check during stopped ticks.
105 */
106void sched_clock_tick_stop(int cpu)
107{
108 struct sched_clock_data *scd = cpu_sdc(cpu);
109
110 scd->check_max = 0;
111}
112
113void sched_clock_tick_start(int cpu)
114{
115 struct sched_clock_data *scd = cpu_sdc(cpu);
116
117 scd->check_max = 1;
118}
119
120static int check_max(struct sched_clock_data *scd)
121{
122 return scd->check_max;
123}
124#else
125static int check_max(struct sched_clock_data *scd)
126{
127 return 1;
128}
129#endif /* CONFIG_NO_HZ */
130
131/* 94/*
132 * update the percpu scd from the raw @now value 95 * update the percpu scd from the raw @now value
133 * 96 *
134 * - filter out backward motion 97 * - filter out backward motion
135 * - use jiffies to generate a min,max window to clip the raw values 98 * - use jiffies to generate a min,max window to clip the raw values
136 */ 99 */
137static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time) 100static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
138{ 101{
139 unsigned long now_jiffies = jiffies; 102 unsigned long now_jiffies = jiffies;
140 long delta_jiffies = now_jiffies - scd->tick_jiffies; 103 long delta_jiffies = now_jiffies - scd->tick_jiffies;
141 u64 clock = scd->clock; 104 u64 clock = scd->clock;
142 u64 min_clock, max_clock; 105 u64 min_clock, max_clock;
143 s64 delta = now - scd->prev_raw; 106 s64 delta = now - scd->tick_raw;
144 107
145 WARN_ON_ONCE(!irqs_disabled()); 108 WARN_ON_ONCE(!irqs_disabled());
146 109 min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
147 /*
148 * At schedule tick the clock can be just under the gtod. We don't
149 * want to push it too prematurely.
150 */
151 min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC);
152 if (min_clock > TICK_NSEC)
153 min_clock -= TICK_NSEC / 2;
154 110
155 if (unlikely(delta < 0)) { 111 if (unlikely(delta < 0)) {
156 clock++; 112 clock++;
157 goto out; 113 goto out;
158 } 114 }
159 115
160 /* 116 max_clock = min_clock + TICK_NSEC;
161 * The clock must stay within a jiffie of the gtod.
162 * But since we may be at the start of a jiffy or the end of one
163 * we add another jiffy buffer.
164 */
165 max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC;
166
167 delta *= scd->multi;
168 delta >>= MULTI_SHIFT;
169 117
170 if (unlikely(clock + delta > max_clock) && check_max(scd)) { 118 if (unlikely(clock + delta > max_clock)) {
171 if (clock < max_clock) 119 if (clock < max_clock)
172 clock = max_clock; 120 clock = max_clock;
173 else 121 else
@@ -180,12 +128,10 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim
180 if (unlikely(clock < min_clock)) 128 if (unlikely(clock < min_clock))
181 clock = min_clock; 129 clock = min_clock;
182 130
183 if (time) 131 scd->tick_jiffies = now_jiffies;
184 *time = clock; 132 scd->clock = clock;
185 else { 133
186 scd->prev_raw = now; 134 return clock;
187 scd->clock = clock;
188 }
189} 135}
190 136
191static void lock_double_clock(struct sched_clock_data *data1, 137static void lock_double_clock(struct sched_clock_data *data1,
@@ -203,7 +149,7 @@ static void lock_double_clock(struct sched_clock_data *data1,
203u64 sched_clock_cpu(int cpu) 149u64 sched_clock_cpu(int cpu)
204{ 150{
205 struct sched_clock_data *scd = cpu_sdc(cpu); 151 struct sched_clock_data *scd = cpu_sdc(cpu);
206 u64 now, clock; 152 u64 now, clock, this_clock, remote_clock;
207 153
208 if (unlikely(!sched_clock_running)) 154 if (unlikely(!sched_clock_running))
209 return 0ull; 155 return 0ull;
@@ -212,43 +158,44 @@ u64 sched_clock_cpu(int cpu)
212 now = sched_clock(); 158 now = sched_clock();
213 159
214 if (cpu != raw_smp_processor_id()) { 160 if (cpu != raw_smp_processor_id()) {
215 /*
216 * in order to update a remote cpu's clock based on our
217 * unstable raw time, rebase it against:
218 * tick_raw (offset between raw counters)
219 * tick_gtod (tick offset between cpus)
220 */
221 struct sched_clock_data *my_scd = this_scd(); 161 struct sched_clock_data *my_scd = this_scd();
222 162
223 lock_double_clock(scd, my_scd); 163 lock_double_clock(scd, my_scd);
224 164
225 now -= my_scd->tick_raw; 165 this_clock = __update_sched_clock(my_scd, now);
226 now += scd->tick_raw; 166 remote_clock = scd->clock;
227 167
228 now += my_scd->tick_gtod; 168 /*
229 now -= scd->tick_gtod; 169 * Use the opportunity that we have both locks
170 * taken to couple the two clocks: we take the
171 * larger time as the latest time for both
172 * runqueues. (this creates monotonic movement)
173 */
174 if (likely(remote_clock < this_clock)) {
175 clock = this_clock;
176 scd->clock = clock;
177 } else {
178 /*
179 * Should be rare, but possible:
180 */
181 clock = remote_clock;
182 my_scd->clock = remote_clock;
183 }
230 184
231 __raw_spin_unlock(&my_scd->lock); 185 __raw_spin_unlock(&my_scd->lock);
232
233 __update_sched_clock(scd, now, &clock);
234
235 __raw_spin_unlock(&scd->lock);
236
237 } else { 186 } else {
238 __raw_spin_lock(&scd->lock); 187 __raw_spin_lock(&scd->lock);
239 __update_sched_clock(scd, now, NULL); 188 clock = __update_sched_clock(scd, now);
240 clock = scd->clock;
241 __raw_spin_unlock(&scd->lock);
242 } 189 }
243 190
191 __raw_spin_unlock(&scd->lock);
192
244 return clock; 193 return clock;
245} 194}
246 195
247void sched_clock_tick(void) 196void sched_clock_tick(void)
248{ 197{
249 struct sched_clock_data *scd = this_scd(); 198 struct sched_clock_data *scd = this_scd();
250 unsigned long now_jiffies = jiffies;
251 s64 mult, delta_gtod, delta_raw;
252 u64 now, now_gtod; 199 u64 now, now_gtod;
253 200
254 if (unlikely(!sched_clock_running)) 201 if (unlikely(!sched_clock_running))
@@ -260,29 +207,14 @@ void sched_clock_tick(void)
260 now = sched_clock(); 207 now = sched_clock();
261 208
262 __raw_spin_lock(&scd->lock); 209 __raw_spin_lock(&scd->lock);
263 __update_sched_clock(scd, now, NULL); 210 __update_sched_clock(scd, now);
264 /* 211 /*
265 * update tick_gtod after __update_sched_clock() because that will 212 * update tick_gtod after __update_sched_clock() because that will
266 * already observe 1 new jiffy; adding a new tick_gtod to that would 213 * already observe 1 new jiffy; adding a new tick_gtod to that would
267 * increase the clock 2 jiffies. 214 * increase the clock 2 jiffies.
268 */ 215 */
269 delta_gtod = now_gtod - scd->tick_gtod;
270 delta_raw = now - scd->tick_raw;
271
272 if ((long)delta_raw > 0) {
273 mult = delta_gtod << MULTI_SHIFT;
274 do_div(mult, delta_raw);
275 scd->multi = mult;
276 if (scd->multi > MAX_MULTI)
277 scd->multi = MAX_MULTI;
278 else if (scd->multi < MIN_MULTI)
279 scd->multi = MIN_MULTI;
280 } else
281 scd->multi = 1 << MULTI_SHIFT;
282
283 scd->tick_raw = now; 216 scd->tick_raw = now;
284 scd->tick_gtod = now_gtod; 217 scd->tick_gtod = now_gtod;
285 scd->tick_jiffies = now_jiffies;
286 __raw_spin_unlock(&scd->lock); 218 __raw_spin_unlock(&scd->lock);
287} 219}
288 220
@@ -301,7 +233,6 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
301void sched_clock_idle_wakeup_event(u64 delta_ns) 233void sched_clock_idle_wakeup_event(u64 delta_ns)
302{ 234{
303 struct sched_clock_data *scd = this_scd(); 235 struct sched_clock_data *scd = this_scd();
304 u64 now = sched_clock();
305 236
306 /* 237 /*
307 * Override the previous timestamp and ignore all 238 * Override the previous timestamp and ignore all
@@ -310,27 +241,30 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
310 * rq clock: 241 * rq clock:
311 */ 242 */
312 __raw_spin_lock(&scd->lock); 243 __raw_spin_lock(&scd->lock);
313 scd->prev_raw = now;
314 scd->clock += delta_ns; 244 scd->clock += delta_ns;
315 scd->multi = 1 << MULTI_SHIFT;
316 __raw_spin_unlock(&scd->lock); 245 __raw_spin_unlock(&scd->lock);
317 246
318 touch_softlockup_watchdog(); 247 touch_softlockup_watchdog();
319} 248}
320EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 249EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
321 250
322#endif 251#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
323 252
324/* 253void sched_clock_init(void)
325 * Scheduler clock - returns current time in nanosec units.
326 * This is the default implementation.
327 * Architectures and sub-architectures can override this.
328 */
329unsigned long long __attribute__((weak)) sched_clock(void)
330{ 254{
331 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); 255 sched_clock_running = 1;
332} 256}
333 257
258u64 sched_clock_cpu(int cpu)
259{
260 if (unlikely(!sched_clock_running))
261 return 0;
262
263 return sched_clock();
264}
265
266#endif
267
334unsigned long long cpu_clock(int cpu) 268unsigned long long cpu_clock(int cpu)
335{ 269{
336 unsigned long long clock; 270 unsigned long long clock;
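The remote-cpu path now takes both per-cpu scd locks and couples the two clocks to their maximum, which is what makes cross-cpu readings monotonic. The invariant as a standalone sketch (both clocks leave the locked section equal):

	/* both runqueue clocks converge on the larger value */
	static u64 couple_clocks(u64 *this_clock, u64 *remote_clock)
	{
		u64 val = max(*this_clock, *remote_clock);

		*this_clock = val;
		*remote_clock = val;
		return val;
	}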
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index cf2cd6ce4cb2..fb8994c6d4bb 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -899,7 +899,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
899 * doesn't make sense. Rely on vruntime for fairness. 899 * doesn't make sense. Rely on vruntime for fairness.
900 */ 900 */
901 if (rq->curr != p) 901 if (rq->curr != p)
902 delta = max(10000LL, delta); 902 delta = max_t(s64, 10000LL, delta);
903 903
904 hrtick_start(rq, delta); 904 hrtick_start(rq, delta);
905 } 905 }
@@ -1442,18 +1442,23 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
1442 struct task_struct *p = NULL; 1442 struct task_struct *p = NULL;
1443 struct sched_entity *se; 1443 struct sched_entity *se;
1444 1444
1445 while (next != &cfs_rq->tasks) { 1445 if (next == &cfs_rq->tasks)
1446 return NULL;
1447
1448 /* Skip over entities that are not tasks */
1449 do {
1446 se = list_entry(next, struct sched_entity, group_node); 1450 se = list_entry(next, struct sched_entity, group_node);
1447 next = next->next; 1451 next = next->next;
1452 } while (next != &cfs_rq->tasks && !entity_is_task(se));
1448 1453
1449 /* Skip over entities that are not tasks */ 1454 if (next == &cfs_rq->tasks)
1450 if (entity_is_task(se)) { 1455 return NULL;
1451 p = task_of(se);
1452 break;
1453 }
1454 }
1455 1456
1456 cfs_rq->balance_iterator = next; 1457 cfs_rq->balance_iterator = next;
1458
1459 if (entity_is_task(se))
1460 p = task_of(se);
1461
1457 return p; 1462 return p;
1458} 1463}
1459 1464
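The rewritten iterator exists because, with group scheduling, the cfs_rq->tasks list also carries entities that represent task groups; the do/while skips those until a real task (or the list head) is reached. For reference, the usual definition of the predicate, quoted here as an assumption rather than from this diff:

	#ifdef CONFIG_FAIR_GROUP_SCHED
	/* an entity is a task iff it does not own a runqueue of its own */
	#define entity_is_task(se)	(!(se)->my_q)
	#else
	#define entity_is_task(se)	1
	#endif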
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 908c04f9dad0..998ba54b4543 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -298,7 +298,7 @@ static void __disable_runtime(struct rq *rq)
298 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 298 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
299 s64 diff; 299 s64 diff;
300 300
301 if (iter == rt_rq) 301 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
302 continue; 302 continue;
303 303
304 spin_lock(&iter->rt_runtime_lock); 304 spin_lock(&iter->rt_runtime_lock);
@@ -861,6 +861,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
861#define RT_MAX_TRIES 3 861#define RT_MAX_TRIES 3
862 862
863static int double_lock_balance(struct rq *this_rq, struct rq *busiest); 863static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
864static void double_unlock_balance(struct rq *this_rq, struct rq *busiest);
865
864static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); 866static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
865 867
866static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 868static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
@@ -1022,7 +1024,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1022 break; 1024 break;
1023 1025
1024 /* try again */ 1026 /* try again */
1025 spin_unlock(&lowest_rq->lock); 1027 double_unlock_balance(rq, lowest_rq);
1026 lowest_rq = NULL; 1028 lowest_rq = NULL;
1027 } 1029 }
1028 1030
@@ -1091,7 +1093,7 @@ static int push_rt_task(struct rq *rq)
1091 1093
1092 resched_task(lowest_rq->curr); 1094 resched_task(lowest_rq->curr);
1093 1095
1094 spin_unlock(&lowest_rq->lock); 1096 double_unlock_balance(rq, lowest_rq);
1095 1097
1096 ret = 1; 1098 ret = 1;
1097out: 1099out:
@@ -1197,7 +1199,7 @@ static int pull_rt_task(struct rq *this_rq)
1197 1199
1198 } 1200 }
1199 skip: 1201 skip:
1200 spin_unlock(&src_rq->lock); 1202 double_unlock_balance(this_rq, src_rq);
1201 } 1203 }
1202 1204
1203 return ret; 1205 return ret;
diff --git a/kernel/signal.c b/kernel/signal.c
index 954f77d7e3bc..e661b01d340f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1304,6 +1304,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1304 q->info.si_overrun++; 1304 q->info.si_overrun++;
1305 goto out; 1305 goto out;
1306 } 1306 }
1307 q->info.si_overrun = 0;
1307 1308
1308 signalfd_notify(t, sig); 1309 signalfd_notify(t, sig);
1309 pending = group ? &t->signal->shared_pending : &t->pending; 1310 pending = group ? &t->signal->shared_pending : &t->pending;
@@ -1337,6 +1338,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1337 struct siginfo info; 1338 struct siginfo info;
1338 unsigned long flags; 1339 unsigned long flags;
1339 struct sighand_struct *psig; 1340 struct sighand_struct *psig;
1341 int ret = sig;
1340 1342
1341 BUG_ON(sig == -1); 1343 BUG_ON(sig == -1);
1342 1344
@@ -1401,7 +1403,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1401 * is implementation-defined: we do (if you don't want 1403 * is implementation-defined: we do (if you don't want
1402 * it, just use SIG_IGN instead). 1404 * it, just use SIG_IGN instead).
1403 */ 1405 */
1404 tsk->exit_signal = -1; 1406 ret = tsk->exit_signal = -1;
1405 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) 1407 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
1406 sig = -1; 1408 sig = -1;
1407 } 1409 }
@@ -1410,7 +1412,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1410 __wake_up_parent(tsk, tsk->parent); 1412 __wake_up_parent(tsk, tsk->parent);
1411 spin_unlock_irqrestore(&psig->siglock, flags); 1413 spin_unlock_irqrestore(&psig->siglock, flags);
1412 1414
1413 return sig; 1415 return ret;
1414} 1416}
1415 1417
1416static void do_notify_parent_cldstop(struct task_struct *tsk, int why) 1418static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
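do_notify_parent() used to return sig even after deciding the task should self-reap; the new ret variable preserves the -1 so callers can distinguish the two outcomes. A caller-side fragment (hypothetical, assuming exit_notify()-style use):

	sig = do_notify_parent(tsk, tsk->exit_signal);
	if (sig == -1)
		tsk->exit_state = EXIT_DEAD;	/* no one will wait() for us */
	else
		tsk->exit_state = EXIT_ZOMBIE;	/* parent reaps via wait() */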
diff --git a/kernel/smp.c b/kernel/smp.c
index 96fc7c0edc59..782e2b93e465 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -135,7 +135,8 @@ void generic_smp_call_function_interrupt(void)
135 */ 135 */
136 smp_wmb(); 136 smp_wmb();
137 data->csd.flags &= ~CSD_FLAG_WAIT; 137 data->csd.flags &= ~CSD_FLAG_WAIT;
138 } else 138 }
139 if (data->csd.flags & CSD_FLAG_ALLOC)
139 call_rcu(&data->rcu_head, rcu_free_call_data); 140 call_rcu(&data->rcu_head, rcu_free_call_data);
140 } 141 }
141 rcu_read_unlock(); 142 rcu_read_unlock();
@@ -260,6 +261,42 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
260 generic_exec_single(cpu, data); 261 generic_exec_single(cpu, data);
261} 262}
262 263
264/* Dummy function */
265static void quiesce_dummy(void *unused)
266{
267}
268
269/*
270 * Ensure stack based data used in call function mask is safe to free.
271 *
272 * This is needed by smp_call_function_mask when using on-stack data, because
273 * a single call function queue is shared by all CPUs, and any CPU may pick up
274 * the data item on the queue at any time before it is deleted. So we need to
275 * ensure that all CPUs have transitioned through a quiescent state after
276 * this call.
277 *
278 * This is a very slow function, implemented by sending synchronous IPIs to
279 * all possible CPUs. For this reason, we have to alloc data rather than use
280 * stack based data even in the case of synchronous calls. The stack based
281 * data is then just used for deadlock/oom fallback which will be very rare.
282 *
283 * If a faster scheme can be made, we could go back to preferring stack based
284 * data -- the data allocation/free is non-zero cost.
285 */
286static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
287{
288 struct call_single_data data;
289 int cpu;
290
291 data.func = quiesce_dummy;
292 data.info = NULL;
293
294 for_each_cpu_mask(cpu, mask) {
295 data.flags = CSD_FLAG_WAIT;
296 generic_exec_single(cpu, &data);
297 }
298}
299
263/** 300/**
264 * smp_call_function_mask(): Run a function on a set of other CPUs. 301 * smp_call_function_mask(): Run a function on a set of other CPUs.
265 * @mask: The set of cpus to run on. 302 * @mask: The set of cpus to run on.
@@ -285,6 +322,7 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
285 cpumask_t allbutself; 322 cpumask_t allbutself;
286 unsigned long flags; 323 unsigned long flags;
287 int cpu, num_cpus; 324 int cpu, num_cpus;
325 int slowpath = 0;
288 326
289 /* Can deadlock when called with interrupts disabled */ 327 /* Can deadlock when called with interrupts disabled */
290 WARN_ON(irqs_disabled()); 328 WARN_ON(irqs_disabled());
@@ -306,15 +344,16 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
306 return smp_call_function_single(cpu, func, info, wait); 344 return smp_call_function_single(cpu, func, info, wait);
307 } 345 }
308 346
309 if (!wait) { 347 data = kmalloc(sizeof(*data), GFP_ATOMIC);
310 data = kmalloc(sizeof(*data), GFP_ATOMIC); 348 if (data) {
311 if (data) 349 data->csd.flags = CSD_FLAG_ALLOC;
312 data->csd.flags = CSD_FLAG_ALLOC; 350 if (wait)
313 } 351 data->csd.flags |= CSD_FLAG_WAIT;
314 if (!data) { 352 } else {
315 data = &d; 353 data = &d;
316 data->csd.flags = CSD_FLAG_WAIT; 354 data->csd.flags = CSD_FLAG_WAIT;
317 wait = 1; 355 wait = 1;
356 slowpath = 1;
318 } 357 }
319 358
320 spin_lock_init(&data->lock); 359 spin_lock_init(&data->lock);
@@ -331,8 +370,11 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
331 arch_send_call_function_ipi(mask); 370 arch_send_call_function_ipi(mask);
332 371
333 /* optionally wait for the CPUs to complete */ 372 /* optionally wait for the CPUs to complete */
334 if (wait) 373 if (wait) {
335 csd_flag_wait(&data->csd); 374 csd_flag_wait(&data->csd);
375 if (unlikely(slowpath))
376 smp_call_function_mask_quiesce_stack(mask);
377 }
336 378
337 return 0; 379 return 0;
338} 380}
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index a1fb54c93cdd..29ab20749dd3 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -290,8 +290,8 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
290 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 290 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
291 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); 291 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
292} 292}
293
294EXPORT_SYMBOL(_spin_lock_nested); 293EXPORT_SYMBOL(_spin_lock_nested);
294
295unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) 295unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
296{ 296{
297 unsigned long flags; 297 unsigned long flags;
@@ -311,9 +311,17 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas
311#endif 311#endif
312 return flags; 312 return flags;
313} 313}
314
315EXPORT_SYMBOL(_spin_lock_irqsave_nested); 314EXPORT_SYMBOL(_spin_lock_irqsave_nested);
316 315
316void __lockfunc _spin_lock_nest_lock(spinlock_t *lock,
317 struct lockdep_map *nest_lock)
318{
319 preempt_disable();
320 spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
321 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
322}
323EXPORT_SYMBOL(_spin_lock_nest_lock);
324
317#endif 325#endif
318 326
319void __lockfunc _spin_unlock(spinlock_t *lock) 327void __lockfunc _spin_unlock(spinlock_t *lock)
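_spin_lock_nest_lock() lets a whole family of inner locks be validated against a single outer lock instead of needing per-instance subclasses: while the nest lock is held, taking many same-class inner locks is considered legitimate. A usage fragment with hypothetical structures, assuming the customary spin_lock_nest_lock() wrapper macro passes the outer lock's dep_map through:

	spin_lock(&outer->lock);
	list_for_each_entry(child, &outer->children, node)
		spin_lock_nest_lock(&child->lock, &outer->lock);
	/* all children held, serialized by outer->lock */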
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index e446c7c7d6a9..af3c7cea258b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -65,7 +65,6 @@ static void ack_state(void)
65static int stop_cpu(struct stop_machine_data *smdata) 65static int stop_cpu(struct stop_machine_data *smdata)
66{ 66{
67 enum stopmachine_state curstate = STOPMACHINE_NONE; 67 enum stopmachine_state curstate = STOPMACHINE_NONE;
68 int uninitialized_var(ret);
69 68
70 /* Simple state machine */ 69 /* Simple state machine */
71 do { 70 do {
diff --git a/kernel/sys.c b/kernel/sys.c
index c01858090a98..3dacb00a7f76 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -274,7 +274,7 @@ void emergency_restart(void)
274} 274}
275EXPORT_SYMBOL_GPL(emergency_restart); 275EXPORT_SYMBOL_GPL(emergency_restart);
276 276
277static void kernel_restart_prepare(char *cmd) 277void kernel_restart_prepare(char *cmd)
278{ 278{
279 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 279 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
280 system_state = SYSTEM_RESTART; 280 system_state = SYSTEM_RESTART;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 825b4c00fe44..f5da526424a9 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -289,7 +289,6 @@ void tick_nohz_stop_sched_tick(int inidle)
289 ts->tick_stopped = 1; 289 ts->tick_stopped = 1;
290 ts->idle_jiffies = last_jiffies; 290 ts->idle_jiffies = last_jiffies;
291 rcu_enter_nohz(); 291 rcu_enter_nohz();
292 sched_clock_tick_stop(cpu);
293 } 292 }
294 293
295 /* 294 /*
@@ -392,7 +391,6 @@ void tick_nohz_restart_sched_tick(void)
392 select_nohz_load_balancer(0); 391 select_nohz_load_balancer(0);
393 now = ktime_get(); 392 now = ktime_get();
394 tick_do_update_jiffies64(now); 393 tick_do_update_jiffies64(now);
395 sched_clock_tick_start(cpu);
396 cpu_clear(cpu, nohz_cpu_mask); 394 cpu_clear(cpu, nohz_cpu_mask);
397 395
398 /* 396 /*
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4a26a1382df0..4048e92aa04f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -290,11 +290,11 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
290 290
291 BUG_ON(get_wq_data(work) != cwq); 291 BUG_ON(get_wq_data(work) != cwq);
292 work_clear_pending(work); 292 work_clear_pending(work);
293 lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); 293 lock_map_acquire(&cwq->wq->lockdep_map);
294 lock_acquire(&lockdep_map, 0, 0, 0, 2, _THIS_IP_); 294 lock_map_acquire(&lockdep_map);
295 f(work); 295 f(work);
296 lock_release(&lockdep_map, 1, _THIS_IP_); 296 lock_map_release(&lockdep_map);
297 lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); 297 lock_map_release(&cwq->wq->lockdep_map);
298 298
299 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 299 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
300 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " 300 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
@@ -413,8 +413,8 @@ void flush_workqueue(struct workqueue_struct *wq)
413 int cpu; 413 int cpu;
414 414
415 might_sleep(); 415 might_sleep();
416 lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); 416 lock_map_acquire(&wq->lockdep_map);
417 lock_release(&wq->lockdep_map, 1, _THIS_IP_); 417 lock_map_release(&wq->lockdep_map);
418 for_each_cpu_mask_nr(cpu, *cpu_map) 418 for_each_cpu_mask_nr(cpu, *cpu_map)
419 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); 419 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
420} 420}
@@ -441,8 +441,8 @@ int flush_work(struct work_struct *work)
441 if (!cwq) 441 if (!cwq)
442 return 0; 442 return 0;
443 443
444 lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); 444 lock_map_acquire(&cwq->wq->lockdep_map);
445 lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); 445 lock_map_release(&cwq->wq->lockdep_map);
446 446
447 prev = NULL; 447 prev = NULL;
448 spin_lock_irq(&cwq->lock); 448 spin_lock_irq(&cwq->lock);
@@ -536,8 +536,8 @@ static void wait_on_work(struct work_struct *work)
536 536
537 might_sleep(); 537 might_sleep();
538 538
539 lock_acquire(&work->lockdep_map, 0, 0, 0, 2, _THIS_IP_); 539 lock_map_acquire(&work->lockdep_map);
540 lock_release(&work->lockdep_map, 1, _THIS_IP_); 540 lock_map_release(&work->lockdep_map);
541 541
542 cwq = get_wq_data(work); 542 cwq = get_wq_data(work);
543 if (!cwq) 543 if (!cwq)
@@ -872,8 +872,8 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
872 if (cwq->thread == NULL) 872 if (cwq->thread == NULL)
873 return; 873 return;
874 874
875 lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); 875 lock_map_acquire(&cwq->wq->lockdep_map);
876 lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); 876 lock_map_release(&cwq->wq->lockdep_map);
877 877
878 flush_cpu_workqueue(cwq); 878 flush_cpu_workqueue(cwq);
879 /* 879 /*
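All of the workqueue changes in this file are the same substitution: the open-coded six-argument lock_acquire()/lock_release() pairs become lock_map_acquire()/lock_map_release(). Presumably these are thin macros over the annotation API, which now takes an extra nest_lock argument; assumed definitions:

	#define lock_map_acquire(l)	lock_acquire(l, 0, 0, 0, 2, NULL, _THIS_IP_)
	#define lock_map_release(l)	lock_release(l, 1, _THIS_IP_)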