| author | Ingo Molnar <mingo@elte.hu> | 2008-08-15 10:16:15 -0400 |
|---|---|---|
| committer | Ingo Molnar <mingo@elte.hu> | 2008-08-15 10:16:15 -0400 |
| commit | 1a10390708d675ebf1a2f5e169a5165626afbd88 (patch) | |
| tree | d9ee7d10abd65e580fb74152a501089f51174225 /kernel | |
| parent | 239bd83104ec6bcba90221d8b0973d2565142ef8 (diff) | |
| parent | b635acec48bcaa9183fcbf4e3955616b0d4119b5 (diff) | |
Merge branch 'linus' into x86/cpu
Diffstat (limited to 'kernel')
40 files changed, 1060 insertions, 629 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 382dd5a8b2d7..94fabd534b03 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
| @@ -55,4 +55,4 @@ config HZ | |||
| 55 | default 1000 if HZ_1000 | 55 | default 1000 if HZ_1000 |
| 56 | 56 | ||
| 57 | config SCHED_HRTICK | 57 | config SCHED_HRTICK |
| 58 | def_bool HIGH_RES_TIMERS && USE_GENERIC_SMP_HELPERS | 58 | def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS) |
diff --git a/kernel/Makefile b/kernel/Makefile
index 54f69837d35a..4e1d7df7c3e2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
| @@ -84,6 +84,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o | |||
| 84 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o | 84 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o |
| 85 | obj-$(CONFIG_MARKERS) += marker.o | 85 | obj-$(CONFIG_MARKERS) += marker.o |
| 86 | obj-$(CONFIG_LATENCYTOP) += latencytop.o | 86 | obj-$(CONFIG_LATENCYTOP) += latencytop.o |
| 87 | obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o | ||
| 87 | obj-$(CONFIG_FTRACE) += trace/ | 88 | obj-$(CONFIG_FTRACE) += trace/ |
| 88 | obj-$(CONFIG_TRACING) += trace/ | 89 | obj-$(CONFIG_TRACING) += trace/ |
| 89 | obj-$(CONFIG_SMP) += sched_cpupri.o | 90 | obj-$(CONFIG_SMP) += sched_cpupri.o |
diff --git a/kernel/audit.c b/kernel/audit.c
index e092f1c0ce30..4414e93d8750 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
| @@ -707,12 +707,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 707 | if (status_get->mask & AUDIT_STATUS_ENABLED) { | 707 | if (status_get->mask & AUDIT_STATUS_ENABLED) { |
| 708 | err = audit_set_enabled(status_get->enabled, | 708 | err = audit_set_enabled(status_get->enabled, |
| 709 | loginuid, sessionid, sid); | 709 | loginuid, sessionid, sid); |
| 710 | if (err < 0) return err; | 710 | if (err < 0) |
| 711 | return err; | ||
| 711 | } | 712 | } |
| 712 | if (status_get->mask & AUDIT_STATUS_FAILURE) { | 713 | if (status_get->mask & AUDIT_STATUS_FAILURE) { |
| 713 | err = audit_set_failure(status_get->failure, | 714 | err = audit_set_failure(status_get->failure, |
| 714 | loginuid, sessionid, sid); | 715 | loginuid, sessionid, sid); |
| 715 | if (err < 0) return err; | 716 | if (err < 0) |
| 717 | return err; | ||
| 716 | } | 718 | } |
| 717 | if (status_get->mask & AUDIT_STATUS_PID) { | 719 | if (status_get->mask & AUDIT_STATUS_PID) { |
| 718 | int new_pid = status_get->pid; | 720 | int new_pid = status_get->pid; |
| @@ -725,9 +727,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 725 | audit_pid = new_pid; | 727 | audit_pid = new_pid; |
| 726 | audit_nlk_pid = NETLINK_CB(skb).pid; | 728 | audit_nlk_pid = NETLINK_CB(skb).pid; |
| 727 | } | 729 | } |
| 728 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) | 730 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { |
| 729 | err = audit_set_rate_limit(status_get->rate_limit, | 731 | err = audit_set_rate_limit(status_get->rate_limit, |
| 730 | loginuid, sessionid, sid); | 732 | loginuid, sessionid, sid); |
| 733 | if (err < 0) | ||
| 734 | return err; | ||
| 735 | } | ||
| 731 | if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) | 736 | if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) |
| 732 | err = audit_set_backlog_limit(status_get->backlog_limit, | 737 | err = audit_set_backlog_limit(status_get->backlog_limit, |
| 733 | loginuid, sessionid, sid); | 738 | loginuid, sessionid, sid); |
| @@ -1366,7 +1371,7 @@ int audit_string_contains_control(const char *string, size_t len) | |||
| 1366 | { | 1371 | { |
| 1367 | const unsigned char *p; | 1372 | const unsigned char *p; |
| 1368 | for (p = string; p < (const unsigned char *)string + len && *p; p++) { | 1373 | for (p = string; p < (const unsigned char *)string + len && *p; p++) { |
| 1369 | if (*p == '"' || *p < 0x21 || *p > 0x7f) | 1374 | if (*p == '"' || *p < 0x21 || *p > 0x7e) |
| 1370 | return 1; | 1375 | return 1; |
| 1371 | } | 1376 | } |
| 1372 | return 0; | 1377 | return 0; |
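The audit.c hunk above tightens the accepted range to printable ASCII 0x21-0x7e, so DEL (0x7f) is now escaped like any other control character. A minimal standalone sketch mirroring audit_string_contains_control() outside the kernel:

```c
/* Standalone sketch: anything outside 0x21-0x7e (or a double quote) is
 * flagged for hex encoding, so DEL (0x7f) is now caught as well. */
#include <stdio.h>
#include <string.h>

static int string_contains_control(const char *string, size_t len)
{
	const unsigned char *p;

	for (p = (const unsigned char *)string;
	     p < (const unsigned char *)string + len && *p; p++) {
		if (*p == '"' || *p < 0x21 || *p > 0x7e)
			return 1;
	}
	return 0;
}

int main(void)
{
	const char del_name[] = { 'f', 'o', 'o', 0x7f, '\0' };

	printf("%d\n", string_contains_control("plain_name", 10));          /* 0 */
	printf("%d\n", string_contains_control(del_name, strlen(del_name))); /* 1 */
	return 0;
}
```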
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 98c50cc671bb..b7d354e2b0ef 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
| @@ -1022,8 +1022,11 @@ static void audit_update_watch(struct audit_parent *parent, | |||
| 1022 | struct audit_buffer *ab; | 1022 | struct audit_buffer *ab; |
| 1023 | ab = audit_log_start(NULL, GFP_KERNEL, | 1023 | ab = audit_log_start(NULL, GFP_KERNEL, |
| 1024 | AUDIT_CONFIG_CHANGE); | 1024 | AUDIT_CONFIG_CHANGE); |
| 1025 | audit_log_format(ab, "auid=%u ses=%u", | ||
| 1026 | audit_get_loginuid(current), | ||
| 1027 | audit_get_sessionid(current)); | ||
| 1025 | audit_log_format(ab, | 1028 | audit_log_format(ab, |
| 1026 | "op=updated rules specifying path="); | 1029 | " op=updated rules specifying path="); |
| 1027 | audit_log_untrustedstring(ab, owatch->path); | 1030 | audit_log_untrustedstring(ab, owatch->path); |
| 1028 | audit_log_format(ab, " with dev=%u ino=%lu\n", | 1031 | audit_log_format(ab, " with dev=%u ino=%lu\n", |
| 1029 | dev, ino); | 1032 | dev, ino); |
| @@ -1058,7 +1061,10 @@ static void audit_remove_parent_watches(struct audit_parent *parent) | |||
| 1058 | struct audit_buffer *ab; | 1061 | struct audit_buffer *ab; |
| 1059 | ab = audit_log_start(NULL, GFP_KERNEL, | 1062 | ab = audit_log_start(NULL, GFP_KERNEL, |
| 1060 | AUDIT_CONFIG_CHANGE); | 1063 | AUDIT_CONFIG_CHANGE); |
| 1061 | audit_log_format(ab, "op=remove rule path="); | 1064 | audit_log_format(ab, "auid=%u ses=%u", |
| 1065 | audit_get_loginuid(current), | ||
| 1066 | audit_get_sessionid(current)); | ||
| 1067 | audit_log_format(ab, " op=remove rule path="); | ||
| 1062 | audit_log_untrustedstring(ab, w->path); | 1068 | audit_log_untrustedstring(ab, w->path); |
| 1063 | if (r->filterkey) { | 1069 | if (r->filterkey) { |
| 1064 | audit_log_format(ab, " key="); | 1070 | audit_log_format(ab, " key="); |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4699950e65bd..972f8e61d36a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
| @@ -243,6 +243,9 @@ static inline int open_arg(int flags, int mask) | |||
| 243 | 243 | ||
| 244 | static int audit_match_perm(struct audit_context *ctx, int mask) | 244 | static int audit_match_perm(struct audit_context *ctx, int mask) |
| 245 | { | 245 | { |
| 246 | if (unlikely(!ctx)) | ||
| 247 | return 0; | ||
| 248 | |||
| 246 | unsigned n = ctx->major; | 249 | unsigned n = ctx->major; |
| 247 | switch (audit_classify_syscall(ctx->arch, n)) { | 250 | switch (audit_classify_syscall(ctx->arch, n)) { |
| 248 | case 0: /* native */ | 251 | case 0: /* native */ |
| @@ -284,6 +287,10 @@ static int audit_match_filetype(struct audit_context *ctx, int which) | |||
| 284 | { | 287 | { |
| 285 | unsigned index = which & ~S_IFMT; | 288 | unsigned index = which & ~S_IFMT; |
| 286 | mode_t mode = which & S_IFMT; | 289 | mode_t mode = which & S_IFMT; |
| 290 | |||
| 291 | if (unlikely(!ctx)) | ||
| 292 | return 0; | ||
| 293 | |||
| 287 | if (index >= ctx->name_count) | 294 | if (index >= ctx->name_count) |
| 288 | return 0; | 295 | return 0; |
| 289 | if (ctx->names[index].ino == -1) | 296 | if (ctx->names[index].ino == -1) |
| @@ -610,7 +617,7 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
| 610 | if (!result) | 617 | if (!result) |
| 611 | return 0; | 618 | return 0; |
| 612 | } | 619 | } |
| 613 | if (rule->filterkey) | 620 | if (rule->filterkey && ctx) |
| 614 | ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); | 621 | ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); |
| 615 | switch (rule->action) { | 622 | switch (rule->action) { |
| 616 | case AUDIT_NEVER: *state = AUDIT_DISABLED; break; | 623 | case AUDIT_NEVER: *state = AUDIT_DISABLED; break; |
| @@ -2375,7 +2382,7 @@ int __audit_signal_info(int sig, struct task_struct *t) | |||
| 2375 | struct audit_context *ctx = tsk->audit_context; | 2382 | struct audit_context *ctx = tsk->audit_context; |
| 2376 | 2383 | ||
| 2377 | if (audit_pid && t->tgid == audit_pid) { | 2384 | if (audit_pid && t->tgid == audit_pid) { |
| 2378 | if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { | 2385 | if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { |
| 2379 | audit_sig_pid = tsk->pid; | 2386 | audit_sig_pid = tsk->pid; |
| 2380 | if (tsk->loginuid != -1) | 2387 | if (tsk->loginuid != -1) |
| 2381 | audit_sig_uid = tsk->loginuid; | 2388 | audit_sig_uid = tsk->loginuid; |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 657f8f8d93a5..13932abde159 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
| @@ -355,6 +355,17 @@ static struct css_set *find_existing_css_set( | |||
| 355 | return NULL; | 355 | return NULL; |
| 356 | } | 356 | } |
| 357 | 357 | ||
| 358 | static void free_cg_links(struct list_head *tmp) | ||
| 359 | { | ||
| 360 | struct cg_cgroup_link *link; | ||
| 361 | struct cg_cgroup_link *saved_link; | ||
| 362 | |||
| 363 | list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { | ||
| 364 | list_del(&link->cgrp_link_list); | ||
| 365 | kfree(link); | ||
| 366 | } | ||
| 367 | } | ||
| 368 | |||
| 358 | /* | 369 | /* |
| 359 | * allocate_cg_links() allocates "count" cg_cgroup_link structures | 370 | * allocate_cg_links() allocates "count" cg_cgroup_link structures |
| 360 | * and chains them on tmp through their cgrp_link_list fields. Returns 0 on | 371 | * and chains them on tmp through their cgrp_link_list fields. Returns 0 on |
| @@ -363,17 +374,12 @@ static struct css_set *find_existing_css_set( | |||
| 363 | static int allocate_cg_links(int count, struct list_head *tmp) | 374 | static int allocate_cg_links(int count, struct list_head *tmp) |
| 364 | { | 375 | { |
| 365 | struct cg_cgroup_link *link; | 376 | struct cg_cgroup_link *link; |
| 366 | struct cg_cgroup_link *saved_link; | ||
| 367 | int i; | 377 | int i; |
| 368 | INIT_LIST_HEAD(tmp); | 378 | INIT_LIST_HEAD(tmp); |
| 369 | for (i = 0; i < count; i++) { | 379 | for (i = 0; i < count; i++) { |
| 370 | link = kmalloc(sizeof(*link), GFP_KERNEL); | 380 | link = kmalloc(sizeof(*link), GFP_KERNEL); |
| 371 | if (!link) { | 381 | if (!link) { |
| 372 | list_for_each_entry_safe(link, saved_link, tmp, | 382 | free_cg_links(tmp); |
| 373 | cgrp_link_list) { | ||
| 374 | list_del(&link->cgrp_link_list); | ||
| 375 | kfree(link); | ||
| 376 | } | ||
| 377 | return -ENOMEM; | 383 | return -ENOMEM; |
| 378 | } | 384 | } |
| 379 | list_add(&link->cgrp_link_list, tmp); | 385 | list_add(&link->cgrp_link_list, tmp); |
| @@ -381,17 +387,6 @@ static int allocate_cg_links(int count, struct list_head *tmp) | |||
| 381 | return 0; | 387 | return 0; |
| 382 | } | 388 | } |
| 383 | 389 | ||
| 384 | static void free_cg_links(struct list_head *tmp) | ||
| 385 | { | ||
| 386 | struct cg_cgroup_link *link; | ||
| 387 | struct cg_cgroup_link *saved_link; | ||
| 388 | |||
| 389 | list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { | ||
| 390 | list_del(&link->cgrp_link_list); | ||
| 391 | kfree(link); | ||
| 392 | } | ||
| 393 | } | ||
| 394 | |||
| 395 | /* | 390 | /* |
| 396 | * find_css_set() takes an existing cgroup group and a | 391 | * find_css_set() takes an existing cgroup group and a |
| 397 | * cgroup object, and returns a css_set object that's | 392 | * cgroup object, and returns a css_set object that's |
| @@ -956,7 +951,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type, | |||
| 956 | struct super_block *sb; | 951 | struct super_block *sb; |
| 957 | struct cgroupfs_root *root; | 952 | struct cgroupfs_root *root; |
| 958 | struct list_head tmp_cg_links; | 953 | struct list_head tmp_cg_links; |
| 959 | INIT_LIST_HEAD(&tmp_cg_links); | ||
| 960 | 954 | ||
| 961 | /* First find the desired set of subsystems */ | 955 | /* First find the desired set of subsystems */ |
| 962 | ret = parse_cgroupfs_options(data, &opts); | 956 | ret = parse_cgroupfs_options(data, &opts); |
| @@ -1424,14 +1418,17 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, | |||
| 1424 | if (buffer == NULL) | 1418 | if (buffer == NULL) |
| 1425 | return -ENOMEM; | 1419 | return -ENOMEM; |
| 1426 | } | 1420 | } |
| 1427 | if (nbytes && copy_from_user(buffer, userbuf, nbytes)) | 1421 | if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { |
| 1428 | return -EFAULT; | 1422 | retval = -EFAULT; |
| 1423 | goto out; | ||
| 1424 | } | ||
| 1429 | 1425 | ||
| 1430 | buffer[nbytes] = 0; /* nul-terminate */ | 1426 | buffer[nbytes] = 0; /* nul-terminate */ |
| 1431 | strstrip(buffer); | 1427 | strstrip(buffer); |
| 1432 | retval = cft->write_string(cgrp, cft, buffer); | 1428 | retval = cft->write_string(cgrp, cft, buffer); |
| 1433 | if (!retval) | 1429 | if (!retval) |
| 1434 | retval = nbytes; | 1430 | retval = nbytes; |
| 1431 | out: | ||
| 1435 | if (buffer != local_buffer) | 1432 | if (buffer != local_buffer) |
| 1436 | kfree(buffer); | 1433 | kfree(buffer); |
| 1437 | return retval; | 1434 | return retval; |
| @@ -2371,7 +2368,7 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
| 2371 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); | 2368 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); |
| 2372 | } | 2369 | } |
| 2373 | 2370 | ||
| 2374 | static inline int cgroup_has_css_refs(struct cgroup *cgrp) | 2371 | static int cgroup_has_css_refs(struct cgroup *cgrp) |
| 2375 | { | 2372 | { |
| 2376 | /* Check the reference count on each subsystem. Since we | 2373 | /* Check the reference count on each subsystem. Since we |
| 2377 | * already established that there are no tasks in the | 2374 | * already established that there are no tasks in the |
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 10ba5f1004a5..f17e9854c246 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
| @@ -216,7 +216,6 @@ static int __ref take_cpu_down(void *_param) | |||
| 216 | static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | 216 | static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) |
| 217 | { | 217 | { |
| 218 | int err, nr_calls = 0; | 218 | int err, nr_calls = 0; |
| 219 | struct task_struct *p; | ||
| 220 | cpumask_t old_allowed, tmp; | 219 | cpumask_t old_allowed, tmp; |
| 221 | void *hcpu = (void *)(long)cpu; | 220 | void *hcpu = (void *)(long)cpu; |
| 222 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; | 221 | unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; |
| @@ -249,21 +248,18 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
| 249 | cpus_setall(tmp); | 248 | cpus_setall(tmp); |
| 250 | cpu_clear(cpu, tmp); | 249 | cpu_clear(cpu, tmp); |
| 251 | set_cpus_allowed_ptr(current, &tmp); | 250 | set_cpus_allowed_ptr(current, &tmp); |
| 251 | tmp = cpumask_of_cpu(cpu); | ||
| 252 | 252 | ||
| 253 | p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); | 253 | err = __stop_machine(take_cpu_down, &tcd_param, &tmp); |
| 254 | 254 | if (err) { | |
| 255 | if (IS_ERR(p) || cpu_online(cpu)) { | ||
| 256 | /* CPU didn't die: tell everyone. Can't complain. */ | 255 | /* CPU didn't die: tell everyone. Can't complain. */ |
| 257 | if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, | 256 | if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, |
| 258 | hcpu) == NOTIFY_BAD) | 257 | hcpu) == NOTIFY_BAD) |
| 259 | BUG(); | 258 | BUG(); |
| 260 | 259 | ||
| 261 | if (IS_ERR(p)) { | 260 | goto out_allowed; |
| 262 | err = PTR_ERR(p); | ||
| 263 | goto out_allowed; | ||
| 264 | } | ||
| 265 | goto out_thread; | ||
| 266 | } | 261 | } |
| 262 | BUG_ON(cpu_online(cpu)); | ||
| 267 | 263 | ||
| 268 | /* Wait for it to sleep (leaving idle task). */ | 264 | /* Wait for it to sleep (leaving idle task). */ |
| 269 | while (!idle_cpu(cpu)) | 265 | while (!idle_cpu(cpu)) |
| @@ -279,8 +275,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
| 279 | 275 | ||
| 280 | check_for_tasks(cpu); | 276 | check_for_tasks(cpu); |
| 281 | 277 | ||
| 282 | out_thread: | ||
| 283 | err = kthread_stop(p); | ||
| 284 | out_allowed: | 278 | out_allowed: |
| 285 | set_cpus_allowed_ptr(current, &old_allowed); | 279 | set_cpus_allowed_ptr(current, &old_allowed); |
| 286 | out_release: | 280 | out_release: |
| @@ -355,6 +349,8 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
| 355 | goto out_notify; | 349 | goto out_notify; |
| 356 | BUG_ON(!cpu_online(cpu)); | 350 | BUG_ON(!cpu_online(cpu)); |
| 357 | 351 | ||
| 352 | cpu_set(cpu, cpu_active_map); | ||
| 353 | |||
| 358 | /* Now call notifier in preparation. */ | 354 | /* Now call notifier in preparation. */ |
| 359 | raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); | 355 | raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); |
| 360 | 356 | ||
| @@ -373,7 +369,7 @@ int __cpuinit cpu_up(unsigned int cpu) | |||
| 373 | if (!cpu_isset(cpu, cpu_possible_map)) { | 369 | if (!cpu_isset(cpu, cpu_possible_map)) { |
| 374 | printk(KERN_ERR "can't online cpu %d because it is not " | 370 | printk(KERN_ERR "can't online cpu %d because it is not " |
| 375 | "configured as may-hotadd at boot time\n", cpu); | 371 | "configured as may-hotadd at boot time\n", cpu); |
| 376 | #if defined(CONFIG_IA64) || defined(CONFIG_X86_64) || defined(CONFIG_S390) | 372 | #if defined(CONFIG_IA64) || defined(CONFIG_X86_64) |
| 377 | printk(KERN_ERR "please check additional_cpus= boot " | 373 | printk(KERN_ERR "please check additional_cpus= boot " |
| 378 | "parameter\n"); | 374 | "parameter\n"); |
| 379 | #endif | 375 | #endif |
| @@ -389,9 +385,6 @@ int __cpuinit cpu_up(unsigned int cpu) | |||
| 389 | 385 | ||
| 390 | err = _cpu_up(cpu, 0); | 386 | err = _cpu_up(cpu, 0); |
| 391 | 387 | ||
| 392 | if (cpu_online(cpu)) | ||
| 393 | cpu_set(cpu, cpu_active_map); | ||
| 394 | |||
| 395 | out: | 388 | out: |
| 396 | cpu_maps_update_done(); | 389 | cpu_maps_update_done(); |
| 397 | return err; | 390 | return err; |
| @@ -461,3 +454,28 @@ out: | |||
| 461 | #endif /* CONFIG_PM_SLEEP_SMP */ | 454 | #endif /* CONFIG_PM_SLEEP_SMP */ |
| 462 | 455 | ||
| 463 | #endif /* CONFIG_SMP */ | 456 | #endif /* CONFIG_SMP */ |
| 457 | |||
| 458 | /* | ||
| 459 | * cpu_bit_bitmap[] is a special, "compressed" data structure that | ||
| 460 | * represents all NR_CPUS bits binary values of 1<<nr. | ||
| 461 | * | ||
| 462 | * It is used by cpumask_of_cpu() to get a constant address to a CPU | ||
| 463 | * mask value that has a single bit set only. | ||
| 464 | */ | ||
| 465 | |||
| 466 | /* cpu_bit_bitmap[0] is empty - so we can back into it */ | ||
| 467 | #define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x) | ||
| 468 | #define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) | ||
| 469 | #define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) | ||
| 470 | #define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) | ||
| 471 | |||
| 472 | const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = { | ||
| 473 | |||
| 474 | MASK_DECLARE_8(0), MASK_DECLARE_8(8), | ||
| 475 | MASK_DECLARE_8(16), MASK_DECLARE_8(24), | ||
| 476 | #if BITS_PER_LONG > 32 | ||
| 477 | MASK_DECLARE_8(32), MASK_DECLARE_8(40), | ||
| 478 | MASK_DECLARE_8(48), MASK_DECLARE_8(56), | ||
| 479 | #endif | ||
| 480 | }; | ||
| 481 | EXPORT_SYMBOL_GPL(cpu_bit_bitmap); | ||
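The cpu_bit_bitmap[] table added above lets cpumask_of_cpu() hand out the address of a constant single-bit mask instead of building one at run time. A minimal standalone sketch of the same trick, populating only bits 0-31 for brevity:

```c
/* Standalone sketch of the cpu_bit_bitmap idea: row k+1 of the table is a
 * one-word bitmap with only bit k set, so &table[k + 1] is a constant,
 * read-only single-bit mask. Row 0 is deliberately left empty, as in the
 * kernel table. */
#include <stdio.h>

#define MASK_DECLARE_1(x) [x + 1][0] = 1UL << (x)
#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x + 1)
#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x + 2)
#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x + 4)

static const unsigned long bit_bitmap[33][1] = {
	MASK_DECLARE_8(0),  MASK_DECLARE_8(8),
	MASK_DECLARE_8(16), MASK_DECLARE_8(24),
};

static const unsigned long *single_bit_mask(int bit)
{
	return bit_bitmap[bit + 1];	/* callers index with bit + 1 */
}

int main(void)
{
	printf("%#lx\n", single_bit_mask(0)[0]);	/* 0x1 */
	printf("%#lx\n", single_bit_mask(5)[0]);	/* 0x20 */
	printf("%#lx\n", single_bit_mask(31)[0]);	/* 0x80000000 */
	return 0;
}
```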
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 91cf85b36dd5..d5ab79cf516d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
| @@ -54,7 +54,6 @@ | |||
| 54 | #include <asm/uaccess.h> | 54 | #include <asm/uaccess.h> |
| 55 | #include <asm/atomic.h> | 55 | #include <asm/atomic.h> |
| 56 | #include <linux/mutex.h> | 56 | #include <linux/mutex.h> |
| 57 | #include <linux/kfifo.h> | ||
| 58 | #include <linux/workqueue.h> | 57 | #include <linux/workqueue.h> |
| 59 | #include <linux/cgroup.h> | 58 | #include <linux/cgroup.h> |
| 60 | 59 | ||
| @@ -486,13 +485,38 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | |||
| 486 | static void | 485 | static void |
| 487 | update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) | 486 | update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) |
| 488 | { | 487 | { |
| 489 | if (!dattr) | ||
| 490 | return; | ||
| 491 | if (dattr->relax_domain_level < c->relax_domain_level) | 488 | if (dattr->relax_domain_level < c->relax_domain_level) |
| 492 | dattr->relax_domain_level = c->relax_domain_level; | 489 | dattr->relax_domain_level = c->relax_domain_level; |
| 493 | return; | 490 | return; |
| 494 | } | 491 | } |
| 495 | 492 | ||
| 493 | static void | ||
| 494 | update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | ||
| 495 | { | ||
| 496 | LIST_HEAD(q); | ||
| 497 | |||
| 498 | list_add(&c->stack_list, &q); | ||
| 499 | while (!list_empty(&q)) { | ||
| 500 | struct cpuset *cp; | ||
| 501 | struct cgroup *cont; | ||
| 502 | struct cpuset *child; | ||
| 503 | |||
| 504 | cp = list_first_entry(&q, struct cpuset, stack_list); | ||
| 505 | list_del(q.next); | ||
| 506 | |||
| 507 | if (cpus_empty(cp->cpus_allowed)) | ||
| 508 | continue; | ||
| 509 | |||
| 510 | if (is_sched_load_balance(cp)) | ||
| 511 | update_domain_attr(dattr, cp); | ||
| 512 | |||
| 513 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | ||
| 514 | child = cgroup_cs(cont); | ||
| 515 | list_add_tail(&child->stack_list, &q); | ||
| 516 | } | ||
| 517 | } | ||
| 518 | } | ||
| 519 | |||
| 496 | /* | 520 | /* |
| 497 | * rebuild_sched_domains() | 521 | * rebuild_sched_domains() |
| 498 | * | 522 | * |
| @@ -532,7 +556,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) | |||
| 532 | * So the reverse nesting would risk an ABBA deadlock. | 556 | * So the reverse nesting would risk an ABBA deadlock. |
| 533 | * | 557 | * |
| 534 | * The three key local variables below are: | 558 | * The three key local variables below are: |
| 535 | * q - a kfifo queue of cpuset pointers, used to implement a | 559 | * q - a linked-list queue of cpuset pointers, used to implement a |
| 536 | * top-down scan of all cpusets. This scan loads a pointer | 560 | * top-down scan of all cpusets. This scan loads a pointer |
| 537 | * to each cpuset marked is_sched_load_balance into the | 561 | * to each cpuset marked is_sched_load_balance into the |
| 538 | * array 'csa'. For our purposes, rebuilding the schedulers | 562 | * array 'csa'. For our purposes, rebuilding the schedulers |
| @@ -567,7 +591,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) | |||
| 567 | 591 | ||
| 568 | void rebuild_sched_domains(void) | 592 | void rebuild_sched_domains(void) |
| 569 | { | 593 | { |
| 570 | struct kfifo *q; /* queue of cpusets to be scanned */ | 594 | LIST_HEAD(q); /* queue of cpusets to be scanned*/ |
| 571 | struct cpuset *cp; /* scans q */ | 595 | struct cpuset *cp; /* scans q */ |
| 572 | struct cpuset **csa; /* array of all cpuset ptrs */ | 596 | struct cpuset **csa; /* array of all cpuset ptrs */ |
| 573 | int csn; /* how many cpuset ptrs in csa so far */ | 597 | int csn; /* how many cpuset ptrs in csa so far */ |
| @@ -577,7 +601,6 @@ void rebuild_sched_domains(void) | |||
| 577 | int ndoms; /* number of sched domains in result */ | 601 | int ndoms; /* number of sched domains in result */ |
| 578 | int nslot; /* next empty doms[] cpumask_t slot */ | 602 | int nslot; /* next empty doms[] cpumask_t slot */ |
| 579 | 603 | ||
| 580 | q = NULL; | ||
| 581 | csa = NULL; | 604 | csa = NULL; |
| 582 | doms = NULL; | 605 | doms = NULL; |
| 583 | dattr = NULL; | 606 | dattr = NULL; |
| @@ -591,35 +614,42 @@ void rebuild_sched_domains(void) | |||
| 591 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); | 614 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); |
| 592 | if (dattr) { | 615 | if (dattr) { |
| 593 | *dattr = SD_ATTR_INIT; | 616 | *dattr = SD_ATTR_INIT; |
| 594 | update_domain_attr(dattr, &top_cpuset); | 617 | update_domain_attr_tree(dattr, &top_cpuset); |
| 595 | } | 618 | } |
| 596 | *doms = top_cpuset.cpus_allowed; | 619 | *doms = top_cpuset.cpus_allowed; |
| 597 | goto rebuild; | 620 | goto rebuild; |
| 598 | } | 621 | } |
| 599 | 622 | ||
| 600 | q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL); | ||
| 601 | if (IS_ERR(q)) | ||
| 602 | goto done; | ||
| 603 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); | 623 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); |
| 604 | if (!csa) | 624 | if (!csa) |
| 605 | goto done; | 625 | goto done; |
| 606 | csn = 0; | 626 | csn = 0; |
| 607 | 627 | ||
| 608 | cp = &top_cpuset; | 628 | list_add(&top_cpuset.stack_list, &q); |
| 609 | __kfifo_put(q, (void *)&cp, sizeof(cp)); | 629 | while (!list_empty(&q)) { |
| 610 | while (__kfifo_get(q, (void *)&cp, sizeof(cp))) { | ||
| 611 | struct cgroup *cont; | 630 | struct cgroup *cont; |
| 612 | struct cpuset *child; /* scans child cpusets of cp */ | 631 | struct cpuset *child; /* scans child cpusets of cp */ |
| 613 | 632 | ||
| 633 | cp = list_first_entry(&q, struct cpuset, stack_list); | ||
| 634 | list_del(q.next); | ||
| 635 | |||
| 614 | if (cpus_empty(cp->cpus_allowed)) | 636 | if (cpus_empty(cp->cpus_allowed)) |
| 615 | continue; | 637 | continue; |
| 616 | 638 | ||
| 617 | if (is_sched_load_balance(cp)) | 639 | /* |
| 640 | * All child cpusets contain a subset of the parent's cpus, so | ||
| 641 | * just skip them, and then we call update_domain_attr_tree() | ||
| 642 | * to calc relax_domain_level of the corresponding sched | ||
| 643 | * domain. | ||
| 644 | */ | ||
| 645 | if (is_sched_load_balance(cp)) { | ||
| 618 | csa[csn++] = cp; | 646 | csa[csn++] = cp; |
| 647 | continue; | ||
| 648 | } | ||
| 619 | 649 | ||
| 620 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | 650 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { |
| 621 | child = cgroup_cs(cont); | 651 | child = cgroup_cs(cont); |
| 622 | __kfifo_put(q, (void *)&child, sizeof(cp)); | 652 | list_add_tail(&child->stack_list, &q); |
| 623 | } | 653 | } |
| 624 | } | 654 | } |
| 625 | 655 | ||
| @@ -686,7 +716,7 @@ restart: | |||
| 686 | cpus_or(*dp, *dp, b->cpus_allowed); | 716 | cpus_or(*dp, *dp, b->cpus_allowed); |
| 687 | b->pn = -1; | 717 | b->pn = -1; |
| 688 | if (dattr) | 718 | if (dattr) |
| 689 | update_domain_attr(dattr | 719 | update_domain_attr_tree(dattr |
| 690 | + nslot, b); | 720 | + nslot, b); |
| 691 | } | 721 | } |
| 692 | } | 722 | } |
| @@ -702,8 +732,6 @@ rebuild: | |||
| 702 | put_online_cpus(); | 732 | put_online_cpus(); |
| 703 | 733 | ||
| 704 | done: | 734 | done: |
| 705 | if (q && !IS_ERR(q)) | ||
| 706 | kfifo_free(q); | ||
| 707 | kfree(csa); | 735 | kfree(csa); |
| 708 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ | 736 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ |
| 709 | /* Don't kfree(dattr) -- partition_sched_domains() does that. */ | 737 | /* Don't kfree(dattr) -- partition_sched_domains() does that. */ |
| @@ -1833,24 +1861,21 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | |||
| 1833 | */ | 1861 | */ |
| 1834 | static void scan_for_empty_cpusets(const struct cpuset *root) | 1862 | static void scan_for_empty_cpusets(const struct cpuset *root) |
| 1835 | { | 1863 | { |
| 1864 | LIST_HEAD(queue); | ||
| 1836 | struct cpuset *cp; /* scans cpusets being updated */ | 1865 | struct cpuset *cp; /* scans cpusets being updated */ |
| 1837 | struct cpuset *child; /* scans child cpusets of cp */ | 1866 | struct cpuset *child; /* scans child cpusets of cp */ |
| 1838 | struct list_head queue; | ||
| 1839 | struct cgroup *cont; | 1867 | struct cgroup *cont; |
| 1840 | nodemask_t oldmems; | 1868 | nodemask_t oldmems; |
| 1841 | 1869 | ||
| 1842 | INIT_LIST_HEAD(&queue); | ||
| 1843 | |||
| 1844 | list_add_tail((struct list_head *)&root->stack_list, &queue); | 1870 | list_add_tail((struct list_head *)&root->stack_list, &queue); |
| 1845 | 1871 | ||
| 1846 | while (!list_empty(&queue)) { | 1872 | while (!list_empty(&queue)) { |
| 1847 | cp = container_of(queue.next, struct cpuset, stack_list); | 1873 | cp = list_first_entry(&queue, struct cpuset, stack_list); |
| 1848 | list_del(queue.next); | 1874 | list_del(queue.next); |
| 1849 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | 1875 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { |
| 1850 | child = cgroup_cs(cont); | 1876 | child = cgroup_cs(cont); |
| 1851 | list_add_tail(&child->stack_list, &queue); | 1877 | list_add_tail(&child->stack_list, &queue); |
| 1852 | } | 1878 | } |
| 1853 | cont = cp->css.cgroup; | ||
| 1854 | 1879 | ||
| 1855 | /* Continue past cpusets with all cpus, mems online */ | 1880 | /* Continue past cpusets with all cpus, mems online */ |
| 1856 | if (cpus_subset(cp->cpus_allowed, cpu_online_map) && | 1881 | if (cpus_subset(cp->cpus_allowed, cpu_online_map) && |
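The cpuset changes above drop the kfifo-based queue in favour of a list_head threaded through each cpuset's stack_list, so the breadth-first scan needs no separate allocation that could fail. A minimal userspace sketch of that pattern, with simplified stand-ins for the kernel's list primitives and a fixed child array instead of the cgroup sibling list:

```c
#include <stdio.h>
#include <stddef.h>

/* Minimal stand-ins for the kernel's intrusive list primitives. */
struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
	entry->next = entry->prev = NULL;
}

static int list_empty(const struct list_head *head)
{
	return head->next == head;
}

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* A cpuset-like node; the queue linkage is embedded in the node itself. */
struct node {
	const char *name;
	struct node *child[2];
	struct list_head stack_list;
};

int main(void)
{
	struct node c1 = { "child1" }, c2 = { "child2" };
	struct node root = { "root", { &c1, &c2 } };
	struct list_head q = LIST_HEAD_INIT(q);

	/* Breadth-first scan without any extra allocation: the queue is
	 * threaded through the nodes' own stack_list members, so it cannot
	 * fail the way a kfifo_alloc()-based queue could. */
	list_add_tail(&root.stack_list, &q);
	while (!list_empty(&q)) {
		struct node *cur = container_of(q.next, struct node, stack_list);
		int i;

		list_del(q.next);
		printf("visiting %s\n", cur->name);
		for (i = 0; i < 2 && cur->child[i]; i++)
			list_add_tail(&cur->child[i]->stack_list, &q);
	}
	return 0;
}
```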
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
new file mode 100644
index 000000000000..c1d4d5b4c61c
--- /dev/null
+++ b/kernel/dma-coherent.c
| @@ -0,0 +1,153 @@ | |||
| 1 | /* | ||
| 2 | * Coherent per-device memory handling. | ||
| 3 | * Borrowed from i386 | ||
| 4 | */ | ||
| 5 | #include <linux/kernel.h> | ||
| 6 | #include <linux/dma-mapping.h> | ||
| 7 | |||
| 8 | struct dma_coherent_mem { | ||
| 9 | void *virt_base; | ||
| 10 | u32 device_base; | ||
| 11 | int size; | ||
| 12 | int flags; | ||
| 13 | unsigned long *bitmap; | ||
| 14 | }; | ||
| 15 | |||
| 16 | int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, | ||
| 17 | dma_addr_t device_addr, size_t size, int flags) | ||
| 18 | { | ||
| 19 | void __iomem *mem_base = NULL; | ||
| 20 | int pages = size >> PAGE_SHIFT; | ||
| 21 | int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); | ||
| 22 | |||
| 23 | if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) | ||
| 24 | goto out; | ||
| 25 | if (!size) | ||
| 26 | goto out; | ||
| 27 | if (dev->dma_mem) | ||
| 28 | goto out; | ||
| 29 | |||
| 30 | /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ | ||
| 31 | |||
| 32 | mem_base = ioremap(bus_addr, size); | ||
| 33 | if (!mem_base) | ||
| 34 | goto out; | ||
| 35 | |||
| 36 | dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); | ||
| 37 | if (!dev->dma_mem) | ||
| 38 | goto out; | ||
| 39 | dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); | ||
| 40 | if (!dev->dma_mem->bitmap) | ||
| 41 | goto free1_out; | ||
| 42 | |||
| 43 | dev->dma_mem->virt_base = mem_base; | ||
| 44 | dev->dma_mem->device_base = device_addr; | ||
| 45 | dev->dma_mem->size = pages; | ||
| 46 | dev->dma_mem->flags = flags; | ||
| 47 | |||
| 48 | if (flags & DMA_MEMORY_MAP) | ||
| 49 | return DMA_MEMORY_MAP; | ||
| 50 | |||
| 51 | return DMA_MEMORY_IO; | ||
| 52 | |||
| 53 | free1_out: | ||
| 54 | kfree(dev->dma_mem); | ||
| 55 | out: | ||
| 56 | if (mem_base) | ||
| 57 | iounmap(mem_base); | ||
| 58 | return 0; | ||
| 59 | } | ||
| 60 | EXPORT_SYMBOL(dma_declare_coherent_memory); | ||
| 61 | |||
| 62 | void dma_release_declared_memory(struct device *dev) | ||
| 63 | { | ||
| 64 | struct dma_coherent_mem *mem = dev->dma_mem; | ||
| 65 | |||
| 66 | if (!mem) | ||
| 67 | return; | ||
| 68 | dev->dma_mem = NULL; | ||
| 69 | iounmap(mem->virt_base); | ||
| 70 | kfree(mem->bitmap); | ||
| 71 | kfree(mem); | ||
| 72 | } | ||
| 73 | EXPORT_SYMBOL(dma_release_declared_memory); | ||
| 74 | |||
| 75 | void *dma_mark_declared_memory_occupied(struct device *dev, | ||
| 76 | dma_addr_t device_addr, size_t size) | ||
| 77 | { | ||
| 78 | struct dma_coherent_mem *mem = dev->dma_mem; | ||
| 79 | int pos, err; | ||
| 80 | |||
| 81 | size += device_addr & ~PAGE_MASK; | ||
| 82 | |||
| 83 | if (!mem) | ||
| 84 | return ERR_PTR(-EINVAL); | ||
| 85 | |||
| 86 | pos = (device_addr - mem->device_base) >> PAGE_SHIFT; | ||
| 87 | err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); | ||
| 88 | if (err != 0) | ||
| 89 | return ERR_PTR(err); | ||
| 90 | return mem->virt_base + (pos << PAGE_SHIFT); | ||
| 91 | } | ||
| 92 | EXPORT_SYMBOL(dma_mark_declared_memory_occupied); | ||
| 93 | |||
| 94 | /** | ||
| 95 | * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area | ||
| 96 | * | ||
| 97 | * @dev: device from which we allocate memory | ||
| 98 | * @size: size of requested memory area | ||
| 99 | * @dma_handle: This will be filled with the correct dma handle | ||
| 100 | * @ret: This pointer will be filled with the virtual address | ||
| 101 | * to allocated area. | ||
| 102 | * | ||
| 103 | * This function should be only called from per-arch dma_alloc_coherent() | ||
| 104 | * to support allocation from per-device coherent memory pools. | ||
| 105 | * | ||
| 106 | * Returns 0 if dma_alloc_coherent should continue with allocating from | ||
| 107 | * generic memory areas, or !0 if dma_alloc_coherent should return @ret. | ||
| 108 | */ | ||
| 109 | int dma_alloc_from_coherent(struct device *dev, ssize_t size, | ||
| 110 | dma_addr_t *dma_handle, void **ret) | ||
| 111 | { | ||
| 112 | struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; | ||
| 113 | int order = get_order(size); | ||
| 114 | |||
| 115 | if (mem) { | ||
| 116 | int page = bitmap_find_free_region(mem->bitmap, mem->size, | ||
| 117 | order); | ||
| 118 | if (page >= 0) { | ||
| 119 | *dma_handle = mem->device_base + (page << PAGE_SHIFT); | ||
| 120 | *ret = mem->virt_base + (page << PAGE_SHIFT); | ||
| 121 | memset(*ret, 0, size); | ||
| 122 | } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) | ||
| 123 | *ret = NULL; | ||
| 124 | } | ||
| 125 | return (mem != NULL); | ||
| 126 | } | ||
| 127 | |||
| 128 | /** | ||
| 129 | * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool | ||
| 130 | * @dev: device from which the memory was allocated | ||
| 131 | * @order: the order of pages allocated | ||
| 132 | * @vaddr: virtual address of allocated pages | ||
| 133 | * | ||
| 134 | * This checks whether the memory was allocated from the per-device | ||
| 135 | * coherent memory pool and if so, releases that memory. | ||
| 136 | * | ||
| 137 | * Returns 1 if we correctly released the memory, or 0 if | ||
| 138 | * dma_release_coherent() should proceed with releasing memory from | ||
| 139 | * generic pools. | ||
| 140 | */ | ||
| 141 | int dma_release_from_coherent(struct device *dev, int order, void *vaddr) | ||
| 142 | { | ||
| 143 | struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; | ||
| 144 | |||
| 145 | if (mem && vaddr >= mem->virt_base && vaddr < | ||
| 146 | (mem->virt_base + (mem->size << PAGE_SHIFT))) { | ||
| 147 | int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; | ||
| 148 | |||
| 149 | bitmap_release_region(mem->bitmap, page, order); | ||
| 150 | return 1; | ||
| 151 | } | ||
| 152 | return 0; | ||
| 153 | } | ||
diff --git a/kernel/exit.c b/kernel/exit.c
index eb4d6470d1d0..38ec40630149 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
| @@ -911,10 +911,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead) | |||
| 911 | tsk->exit_signal = SIGCHLD; | 911 | tsk->exit_signal = SIGCHLD; |
| 912 | 912 | ||
| 913 | signal = tracehook_notify_death(tsk, &cookie, group_dead); | 913 | signal = tracehook_notify_death(tsk, &cookie, group_dead); |
| 914 | if (signal > 0) | 914 | if (signal >= 0) |
| 915 | signal = do_notify_parent(tsk, signal); | 915 | signal = do_notify_parent(tsk, signal); |
| 916 | 916 | ||
| 917 | tsk->exit_state = signal < 0 ? EXIT_DEAD : EXIT_ZOMBIE; | 917 | tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; |
| 918 | 918 | ||
| 919 | /* mt-exec, de_thread() is waiting for us */ | 919 | /* mt-exec, de_thread() is waiting for us */ |
| 920 | if (thread_group_leader(tsk) && | 920 | if (thread_group_leader(tsk) && |
| @@ -927,7 +927,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) | |||
| 927 | tracehook_report_death(tsk, signal, cookie, group_dead); | 927 | tracehook_report_death(tsk, signal, cookie, group_dead); |
| 928 | 928 | ||
| 929 | /* If the process is dead, release it - nobody will wait for it */ | 929 | /* If the process is dead, release it - nobody will wait for it */ |
| 930 | if (signal < 0) | 930 | if (signal == DEATH_REAP) |
| 931 | release_task(tsk); | 931 | release_task(tsk); |
| 932 | } | 932 | } |
| 933 | 933 | ||
diff --git a/kernel/fork.c b/kernel/fork.c
index 8214ba7c8bb1..7ce2ebe84796 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
| @@ -27,6 +27,7 @@ | |||
| 27 | #include <linux/key.h> | 27 | #include <linux/key.h> |
| 28 | #include <linux/binfmts.h> | 28 | #include <linux/binfmts.h> |
| 29 | #include <linux/mman.h> | 29 | #include <linux/mman.h> |
| 30 | #include <linux/mmu_notifier.h> | ||
| 30 | #include <linux/fs.h> | 31 | #include <linux/fs.h> |
| 31 | #include <linux/nsproxy.h> | 32 | #include <linux/nsproxy.h> |
| 32 | #include <linux/capability.h> | 33 | #include <linux/capability.h> |
| @@ -414,6 +415,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) | |||
| 414 | 415 | ||
| 415 | if (likely(!mm_alloc_pgd(mm))) { | 416 | if (likely(!mm_alloc_pgd(mm))) { |
| 416 | mm->def_flags = 0; | 417 | mm->def_flags = 0; |
| 418 | mmu_notifier_mm_init(mm); | ||
| 417 | return mm; | 419 | return mm; |
| 418 | } | 420 | } |
| 419 | 421 | ||
| @@ -446,6 +448,7 @@ void __mmdrop(struct mm_struct *mm) | |||
| 446 | BUG_ON(mm == &init_mm); | 448 | BUG_ON(mm == &init_mm); |
| 447 | mm_free_pgd(mm); | 449 | mm_free_pgd(mm); |
| 448 | destroy_context(mm); | 450 | destroy_context(mm); |
| 451 | mmu_notifier_mm_destroy(mm); | ||
| 449 | free_mm(mm); | 452 | free_mm(mm); |
| 450 | } | 453 | } |
| 451 | EXPORT_SYMBOL_GPL(__mmdrop); | 454 | EXPORT_SYMBOL_GPL(__mmdrop); |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 152abfd3589f..0314074fa232 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
| @@ -323,7 +323,8 @@ static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, | |||
| 323 | ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK); | 323 | ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK); |
| 324 | 324 | ||
| 325 | if (ret) | 325 | if (ret) |
| 326 | pr_err("setting flow type for irq %u failed (%pF)\n", | 326 | pr_err("setting trigger mode %d for irq %u failed (%pF)\n", |
| 327 | (int)(flags & IRQF_TRIGGER_MASK), | ||
| 327 | irq, chip->set_type); | 328 | irq, chip->set_type); |
| 328 | 329 | ||
| 329 | return ret; | 330 | return ret; |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6c6d35d68ee9..a09dd29c2fd7 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
| @@ -8,6 +8,7 @@ | |||
| 8 | 8 | ||
| 9 | #include <linux/irq.h> | 9 | #include <linux/irq.h> |
| 10 | #include <linux/proc_fs.h> | 10 | #include <linux/proc_fs.h> |
| 11 | #include <linux/seq_file.h> | ||
| 11 | #include <linux/interrupt.h> | 12 | #include <linux/interrupt.h> |
| 12 | 13 | ||
| 13 | #include "internals.h" | 14 | #include "internals.h" |
| @@ -16,23 +17,18 @@ static struct proc_dir_entry *root_irq_dir; | |||
| 16 | 17 | ||
| 17 | #ifdef CONFIG_SMP | 18 | #ifdef CONFIG_SMP |
| 18 | 19 | ||
| 19 | static int irq_affinity_read_proc(char *page, char **start, off_t off, | 20 | static int irq_affinity_proc_show(struct seq_file *m, void *v) |
| 20 | int count, int *eof, void *data) | ||
| 21 | { | 21 | { |
| 22 | struct irq_desc *desc = irq_desc + (long)data; | 22 | struct irq_desc *desc = irq_desc + (long)m->private; |
| 23 | cpumask_t *mask = &desc->affinity; | 23 | cpumask_t *mask = &desc->affinity; |
| 24 | int len; | ||
| 25 | 24 | ||
| 26 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 25 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
| 27 | if (desc->status & IRQ_MOVE_PENDING) | 26 | if (desc->status & IRQ_MOVE_PENDING) |
| 28 | mask = &desc->pending_mask; | 27 | mask = &desc->pending_mask; |
| 29 | #endif | 28 | #endif |
| 30 | len = cpumask_scnprintf(page, count, *mask); | 29 | seq_cpumask(m, mask); |
| 31 | 30 | seq_putc(m, '\n'); | |
| 32 | if (count - len < 2) | 31 | return 0; |
| 33 | return -EINVAL; | ||
| 34 | len += sprintf(page + len, "\n"); | ||
| 35 | return len; | ||
| 36 | } | 32 | } |
| 37 | 33 | ||
| 38 | #ifndef is_affinity_mask_valid | 34 | #ifndef is_affinity_mask_valid |
| @@ -40,11 +36,12 @@ static int irq_affinity_read_proc(char *page, char **start, off_t off, | |||
| 40 | #endif | 36 | #endif |
| 41 | 37 | ||
| 42 | int no_irq_affinity; | 38 | int no_irq_affinity; |
| 43 | static int irq_affinity_write_proc(struct file *file, const char __user *buffer, | 39 | static ssize_t irq_affinity_proc_write(struct file *file, |
| 44 | unsigned long count, void *data) | 40 | const char __user *buffer, size_t count, loff_t *pos) |
| 45 | { | 41 | { |
| 46 | unsigned int irq = (int)(long)data, full_count = count, err; | 42 | unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; |
| 47 | cpumask_t new_value; | 43 | cpumask_t new_value; |
| 44 | int err; | ||
| 48 | 45 | ||
| 49 | if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || | 46 | if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || |
| 50 | irq_balancing_disabled(irq)) | 47 | irq_balancing_disabled(irq)) |
| @@ -65,28 +62,38 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, | |||
| 65 | if (!cpus_intersects(new_value, cpu_online_map)) | 62 | if (!cpus_intersects(new_value, cpu_online_map)) |
| 66 | /* Special case for empty set - allow the architecture | 63 | /* Special case for empty set - allow the architecture |
| 67 | code to set default SMP affinity. */ | 64 | code to set default SMP affinity. */ |
| 68 | return irq_select_affinity(irq) ? -EINVAL : full_count; | 65 | return irq_select_affinity(irq) ? -EINVAL : count; |
| 69 | 66 | ||
| 70 | irq_set_affinity(irq, new_value); | 67 | irq_set_affinity(irq, new_value); |
| 71 | 68 | ||
| 72 | return full_count; | 69 | return count; |
| 73 | } | 70 | } |
| 74 | 71 | ||
| 75 | static int default_affinity_read(char *page, char **start, off_t off, | 72 | static int irq_affinity_proc_open(struct inode *inode, struct file *file) |
| 76 | int count, int *eof, void *data) | ||
| 77 | { | 73 | { |
| 78 | int len = cpumask_scnprintf(page, count, irq_default_affinity); | 74 | return single_open(file, irq_affinity_proc_show, PDE(inode)->data); |
| 79 | if (count - len < 2) | ||
| 80 | return -EINVAL; | ||
| 81 | len += sprintf(page + len, "\n"); | ||
| 82 | return len; | ||
| 83 | } | 75 | } |
| 84 | 76 | ||
| 85 | static int default_affinity_write(struct file *file, const char __user *buffer, | 77 | static const struct file_operations irq_affinity_proc_fops = { |
| 86 | unsigned long count, void *data) | 78 | .open = irq_affinity_proc_open, |
| 79 | .read = seq_read, | ||
| 80 | .llseek = seq_lseek, | ||
| 81 | .release = single_release, | ||
| 82 | .write = irq_affinity_proc_write, | ||
| 83 | }; | ||
| 84 | |||
| 85 | static int default_affinity_show(struct seq_file *m, void *v) | ||
| 86 | { | ||
| 87 | seq_cpumask(m, &irq_default_affinity); | ||
| 88 | seq_putc(m, '\n'); | ||
| 89 | return 0; | ||
| 90 | } | ||
| 91 | |||
| 92 | static ssize_t default_affinity_write(struct file *file, | ||
| 93 | const char __user *buffer, size_t count, loff_t *ppos) | ||
| 87 | { | 94 | { |
| 88 | unsigned int full_count = count, err; | ||
| 89 | cpumask_t new_value; | 95 | cpumask_t new_value; |
| 96 | int err; | ||
| 90 | 97 | ||
| 91 | err = cpumask_parse_user(buffer, count, new_value); | 98 | err = cpumask_parse_user(buffer, count, new_value); |
| 92 | if (err) | 99 | if (err) |
| @@ -105,8 +112,21 @@ static int default_affinity_write(struct file *file, const char __user *buffer, | |||
| 105 | 112 | ||
| 106 | irq_default_affinity = new_value; | 113 | irq_default_affinity = new_value; |
| 107 | 114 | ||
| 108 | return full_count; | 115 | return count; |
| 109 | } | 116 | } |
| 117 | |||
| 118 | static int default_affinity_open(struct inode *inode, struct file *file) | ||
| 119 | { | ||
| 120 | return single_open(file, default_affinity_show, NULL); | ||
| 121 | } | ||
| 122 | |||
| 123 | static const struct file_operations default_affinity_proc_fops = { | ||
| 124 | .open = default_affinity_open, | ||
| 125 | .read = seq_read, | ||
| 126 | .llseek = seq_lseek, | ||
| 127 | .release = single_release, | ||
| 128 | .write = default_affinity_write, | ||
| 129 | }; | ||
| 110 | #endif | 130 | #endif |
| 111 | 131 | ||
| 112 | static int irq_spurious_read(char *page, char **start, off_t off, | 132 | static int irq_spurious_read(char *page, char **start, off_t off, |
| @@ -178,16 +198,9 @@ void register_irq_proc(unsigned int irq) | |||
| 178 | irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); | 198 | irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); |
| 179 | 199 | ||
| 180 | #ifdef CONFIG_SMP | 200 | #ifdef CONFIG_SMP |
| 181 | { | 201 | /* create /proc/irq/<irq>/smp_affinity */ |
| 182 | /* create /proc/irq/<irq>/smp_affinity */ | 202 | proc_create_data("smp_affinity", 0600, irq_desc[irq].dir, |
| 183 | entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); | 203 | &irq_affinity_proc_fops, (void *)(long)irq); |
| 184 | |||
| 185 | if (entry) { | ||
| 186 | entry->data = (void *)(long)irq; | ||
| 187 | entry->read_proc = irq_affinity_read_proc; | ||
| 188 | entry->write_proc = irq_affinity_write_proc; | ||
| 189 | } | ||
| 190 | } | ||
| 191 | #endif | 204 | #endif |
| 192 | 205 | ||
| 193 | entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); | 206 | entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); |
| @@ -208,15 +221,8 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action) | |||
| 208 | void register_default_affinity_proc(void) | 221 | void register_default_affinity_proc(void) |
| 209 | { | 222 | { |
| 210 | #ifdef CONFIG_SMP | 223 | #ifdef CONFIG_SMP |
| 211 | struct proc_dir_entry *entry; | 224 | proc_create("irq/default_smp_affinity", 0600, NULL, |
| 212 | 225 | &default_affinity_proc_fops); | |
| 213 | /* create /proc/irq/default_smp_affinity */ | ||
| 214 | entry = create_proc_entry("default_smp_affinity", 0600, root_irq_dir); | ||
| 215 | if (entry) { | ||
| 216 | entry->data = NULL; | ||
| 217 | entry->read_proc = default_affinity_read; | ||
| 218 | entry->write_proc = default_affinity_write; | ||
| 219 | } | ||
| 220 | #endif | 226 | #endif |
| 221 | } | 227 | } |
| 222 | 228 | ||
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 3ec23c3ec97f..eaa21fc9ad1d 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
| @@ -56,12 +56,14 @@ | |||
| 56 | 56 | ||
| 57 | static int kgdb_break_asap; | 57 | static int kgdb_break_asap; |
| 58 | 58 | ||
| 59 | #define KGDB_MAX_THREAD_QUERY 17 | ||
| 59 | struct kgdb_state { | 60 | struct kgdb_state { |
| 60 | int ex_vector; | 61 | int ex_vector; |
| 61 | int signo; | 62 | int signo; |
| 62 | int err_code; | 63 | int err_code; |
| 63 | int cpu; | 64 | int cpu; |
| 64 | int pass_exception; | 65 | int pass_exception; |
| 66 | unsigned long thr_query; | ||
| 65 | unsigned long threadid; | 67 | unsigned long threadid; |
| 66 | long kgdb_usethreadid; | 68 | long kgdb_usethreadid; |
| 67 | struct pt_regs *linux_regs; | 69 | struct pt_regs *linux_regs; |
| @@ -166,13 +168,6 @@ early_param("nokgdbroundup", opt_nokgdbroundup); | |||
| 166 | * Weak aliases for breakpoint management, | 168 | * Weak aliases for breakpoint management, |
| 167 | * can be overriden by architectures when needed: | 169 | * can be overriden by architectures when needed: |
| 168 | */ | 170 | */ |
| 169 | int __weak kgdb_validate_break_address(unsigned long addr) | ||
| 170 | { | ||
| 171 | char tmp_variable[BREAK_INSTR_SIZE]; | ||
| 172 | |||
| 173 | return probe_kernel_read(tmp_variable, (char *)addr, BREAK_INSTR_SIZE); | ||
| 174 | } | ||
| 175 | |||
| 176 | int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) | 171 | int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) |
| 177 | { | 172 | { |
| 178 | int err; | 173 | int err; |
| @@ -191,6 +186,25 @@ int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) | |||
| 191 | (char *)bundle, BREAK_INSTR_SIZE); | 186 | (char *)bundle, BREAK_INSTR_SIZE); |
| 192 | } | 187 | } |
| 193 | 188 | ||
| 189 | int __weak kgdb_validate_break_address(unsigned long addr) | ||
| 190 | { | ||
| 191 | char tmp_variable[BREAK_INSTR_SIZE]; | ||
| 192 | int err; | ||
| 193 | /* Validate setting the breakpoint and then removing it. In the | ||
| 194 | * remove fails, the kernel needs to emit a bad message because we | ||
| 195 | * are deep trouble not being able to put things back the way we | ||
| 196 | * found them. | ||
| 197 | */ | ||
| 198 | err = kgdb_arch_set_breakpoint(addr, tmp_variable); | ||
| 199 | if (err) | ||
| 200 | return err; | ||
| 201 | err = kgdb_arch_remove_breakpoint(addr, tmp_variable); | ||
| 202 | if (err) | ||
| 203 | printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " | ||
| 204 | "memory destroyed at: %lx", addr); | ||
| 205 | return err; | ||
| 206 | } | ||
| 207 | |||
| 194 | unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) | 208 | unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) |
| 195 | { | 209 | { |
| 196 | return instruction_pointer(regs); | 210 | return instruction_pointer(regs); |
| @@ -433,9 +447,14 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val) | |||
| 433 | { | 447 | { |
| 434 | int hex_val; | 448 | int hex_val; |
| 435 | int num = 0; | 449 | int num = 0; |
| 450 | int negate = 0; | ||
| 436 | 451 | ||
| 437 | *long_val = 0; | 452 | *long_val = 0; |
| 438 | 453 | ||
| 454 | if (**ptr == '-') { | ||
| 455 | negate = 1; | ||
| 456 | (*ptr)++; | ||
| 457 | } | ||
| 439 | while (**ptr) { | 458 | while (**ptr) { |
| 440 | hex_val = hex(**ptr); | 459 | hex_val = hex(**ptr); |
| 441 | if (hex_val < 0) | 460 | if (hex_val < 0) |
| @@ -446,6 +465,9 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val) | |||
| 446 | (*ptr)++; | 465 | (*ptr)++; |
| 447 | } | 466 | } |
| 448 | 467 | ||
| 468 | if (negate) | ||
| 469 | *long_val = -*long_val; | ||
| 470 | |||
| 449 | return num; | 471 | return num; |
| 450 | } | 472 | } |
| 451 | 473 | ||
| @@ -515,10 +537,16 @@ static void int_to_threadref(unsigned char *id, int value) | |||
| 515 | static struct task_struct *getthread(struct pt_regs *regs, int tid) | 537 | static struct task_struct *getthread(struct pt_regs *regs, int tid) |
| 516 | { | 538 | { |
| 517 | /* | 539 | /* |
| 518 | * Non-positive TIDs are remapped idle tasks: | 540 | * Non-positive TIDs are remapped to the cpu shadow information |
| 519 | */ | 541 | */ |
| 520 | if (tid <= 0) | 542 | if (tid == 0 || tid == -1) |
| 521 | return idle_task(-tid); | 543 | tid = -atomic_read(&kgdb_active) - 2; |
| 544 | if (tid < 0) { | ||
| 545 | if (kgdb_info[-tid - 2].task) | ||
| 546 | return kgdb_info[-tid - 2].task; | ||
| 547 | else | ||
| 548 | return idle_task(-tid - 2); | ||
| 549 | } | ||
| 522 | 550 | ||
| 523 | /* | 551 | /* |
| 524 | * find_task_by_pid_ns() does not take the tasklist lock anymore | 552 | * find_task_by_pid_ns() does not take the tasklist lock anymore |
| @@ -725,14 +753,15 @@ setundefined: | |||
| 725 | } | 753 | } |
| 726 | 754 | ||
| 727 | /* | 755 | /* |
| 728 | * Remap normal tasks to their real PID, idle tasks to -1 ... -NR_CPUs: | 756 | * Remap normal tasks to their real PID, |
| 757 | * CPU shadow threads are mapped to -CPU - 2 | ||
| 729 | */ | 758 | */ |
| 730 | static inline int shadow_pid(int realpid) | 759 | static inline int shadow_pid(int realpid) |
| 731 | { | 760 | { |
| 732 | if (realpid) | 761 | if (realpid) |
| 733 | return realpid; | 762 | return realpid; |
| 734 | 763 | ||
| 735 | return -1-raw_smp_processor_id(); | 764 | return -raw_smp_processor_id() - 2; |
| 736 | } | 765 | } |
| 737 | 766 | ||
| 738 | static char gdbmsgbuf[BUFMAX + 1]; | 767 | static char gdbmsgbuf[BUFMAX + 1]; |
| @@ -826,7 +855,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks) | |||
| 826 | local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; | 855 | local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; |
| 827 | } else { | 856 | } else { |
| 828 | local_debuggerinfo = NULL; | 857 | local_debuggerinfo = NULL; |
| 829 | for (i = 0; i < NR_CPUS; i++) { | 858 | for_each_online_cpu(i) { |
| 830 | /* | 859 | /* |
| 831 | * Try to find the task on some other | 860 | * Try to find the task on some other |
| 832 | * or possibly this node if we do not | 861 | * or possibly this node if we do not |
| @@ -960,10 +989,13 @@ static int gdb_cmd_reboot(struct kgdb_state *ks) | |||
| 960 | /* Handle the 'q' query packets */ | 989 | /* Handle the 'q' query packets */ |
| 961 | static void gdb_cmd_query(struct kgdb_state *ks) | 990 | static void gdb_cmd_query(struct kgdb_state *ks) |
| 962 | { | 991 | { |
| 963 | struct task_struct *thread; | 992 | struct task_struct *g; |
| 993 | struct task_struct *p; | ||
| 964 | unsigned char thref[8]; | 994 | unsigned char thref[8]; |
| 965 | char *ptr; | 995 | char *ptr; |
| 966 | int i; | 996 | int i; |
| 997 | int cpu; | ||
| 998 | int finished = 0; | ||
| 967 | 999 | ||
| 968 | switch (remcom_in_buffer[1]) { | 1000 | switch (remcom_in_buffer[1]) { |
| 969 | case 's': | 1001 | case 's': |
| @@ -973,22 +1005,34 @@ static void gdb_cmd_query(struct kgdb_state *ks) | |||
| 973 | break; | 1005 | break; |
| 974 | } | 1006 | } |
| 975 | 1007 | ||
| 976 | if (remcom_in_buffer[1] == 'f') | 1008 | i = 0; |
| 977 | ks->threadid = 1; | ||
| 978 | |||
| 979 | remcom_out_buffer[0] = 'm'; | 1009 | remcom_out_buffer[0] = 'm'; |
| 980 | ptr = remcom_out_buffer + 1; | 1010 | ptr = remcom_out_buffer + 1; |
| 981 | 1011 | if (remcom_in_buffer[1] == 'f') { | |
| 982 | for (i = 0; i < 17; ks->threadid++) { | 1012 | /* Each cpu is a shadow thread */ |
| 983 | thread = getthread(ks->linux_regs, ks->threadid); | 1013 | for_each_online_cpu(cpu) { |
| 984 | if (thread) { | 1014 | ks->thr_query = 0; |
| 985 | int_to_threadref(thref, ks->threadid); | 1015 | int_to_threadref(thref, -cpu - 2); |
| 986 | pack_threadid(ptr, thref); | 1016 | pack_threadid(ptr, thref); |
| 987 | ptr += BUF_THREAD_ID_SIZE; | 1017 | ptr += BUF_THREAD_ID_SIZE; |
| 988 | *(ptr++) = ','; | 1018 | *(ptr++) = ','; |
| 989 | i++; | 1019 | i++; |
| 990 | } | 1020 | } |
| 991 | } | 1021 | } |
| 1022 | |||
| 1023 | do_each_thread(g, p) { | ||
| 1024 | if (i >= ks->thr_query && !finished) { | ||
| 1025 | int_to_threadref(thref, p->pid); | ||
| 1026 | pack_threadid(ptr, thref); | ||
| 1027 | ptr += BUF_THREAD_ID_SIZE; | ||
| 1028 | *(ptr++) = ','; | ||
| 1029 | ks->thr_query++; | ||
| 1030 | if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) | ||
| 1031 | finished = 1; | ||
| 1032 | } | ||
| 1033 | i++; | ||
| 1034 | } while_each_thread(g, p); | ||
| 1035 | |||
| 992 | *(--ptr) = '\0'; | 1036 | *(--ptr) = '\0'; |
| 993 | break; | 1037 | break; |
| 994 | 1038 | ||
| @@ -1011,15 +1055,15 @@ static void gdb_cmd_query(struct kgdb_state *ks) | |||
| 1011 | error_packet(remcom_out_buffer, -EINVAL); | 1055 | error_packet(remcom_out_buffer, -EINVAL); |
| 1012 | break; | 1056 | break; |
| 1013 | } | 1057 | } |
| 1014 | if (ks->threadid > 0) { | 1058 | if ((int)ks->threadid > 0) { |
| 1015 | kgdb_mem2hex(getthread(ks->linux_regs, | 1059 | kgdb_mem2hex(getthread(ks->linux_regs, |
| 1016 | ks->threadid)->comm, | 1060 | ks->threadid)->comm, |
| 1017 | remcom_out_buffer, 16); | 1061 | remcom_out_buffer, 16); |
| 1018 | } else { | 1062 | } else { |
| 1019 | static char tmpstr[23 + BUF_THREAD_ID_SIZE]; | 1063 | static char tmpstr[23 + BUF_THREAD_ID_SIZE]; |
| 1020 | 1064 | ||
| 1021 | sprintf(tmpstr, "Shadow task %d for pid 0", | 1065 | sprintf(tmpstr, "shadowCPU%d", |
| 1022 | (int)(-ks->threadid-1)); | 1066 | (int)(-ks->threadid - 2)); |
| 1023 | kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); | 1067 | kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); |
| 1024 | } | 1068 | } |
| 1025 | break; | 1069 | break; |
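The kgdb changes above reserve negative thread IDs for per-CPU shadow threads: a CPU is exported to gdb as -cpu - 2 (IDs 0 and -1 are taken by the protocol), and getthread() inverts that mapping. A standalone sketch of the round trip, with the CPU passed in explicitly instead of via raw_smp_processor_id():

```c
/* Standalone sketch of the new kgdb thread-ID convention: real tasks keep
 * their PID, each CPU's shadow thread is exported as -cpu - 2. */
#include <stdio.h>

static int shadow_pid(int realpid, int cpu)
{
	if (realpid)
		return realpid;
	return -cpu - 2;		/* shadow thread of this CPU */
}

static int decode_cpu(int tid)
{
	return -tid - 2;		/* inverse mapping used by getthread() */
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < 4; cpu++) {
		int tid = shadow_pid(0, cpu);
		printf("cpu %d -> tid %d -> cpu %d\n", cpu, tid, decode_cpu(tid));
	}
	printf("pid 1234 -> tid %d\n", shadow_pid(1234, 0));
	return 0;
}
```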
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index d38a64362973..1aa91fd6b06e 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
| @@ -124,6 +124,15 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; | |||
| 124 | unsigned long nr_lock_classes; | 124 | unsigned long nr_lock_classes; |
| 125 | static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; | 125 | static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; |
| 126 | 126 | ||
| 127 | static inline struct lock_class *hlock_class(struct held_lock *hlock) | ||
| 128 | { | ||
| 129 | if (!hlock->class_idx) { | ||
| 130 | DEBUG_LOCKS_WARN_ON(1); | ||
| 131 | return NULL; | ||
| 132 | } | ||
| 133 | return lock_classes + hlock->class_idx - 1; | ||
| 134 | } | ||
| 135 | |||
| 127 | #ifdef CONFIG_LOCK_STAT | 136 | #ifdef CONFIG_LOCK_STAT |
| 128 | static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); | 137 | static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); |
| 129 | 138 | ||
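
With this hunk a held_lock no longer caches a `struct lock_class` pointer; it stores `class_idx`, a 1-based index into the static `lock_classes[]` array, and `hlock_class()` decodes it while treating 0 as "no class recorded". A minimal stand-alone model of that encoding, with simplified types and made-up class names:

```c
/*
 * Model of the class_idx encoding used by hlock_class(): a held lock
 * stores "index into lock_classes[] + 1", so 0 can mean "unset".
 */
#include <stdio.h>

#define MAX_KEYS 8

struct lock_class { const char *name; };
struct held_lock  { unsigned int class_idx; };   /* 0 == unset */

static struct lock_class lock_classes[MAX_KEYS] = {
    { "rq->lock" }, { "dcache_lock" }, { "files->file_lock" },
};

static struct lock_class *hlock_class(struct held_lock *hlock)
{
    if (!hlock->class_idx) {
        fprintf(stderr, "warn: held lock with no class\n");
        return NULL;
    }
    return lock_classes + hlock->class_idx - 1;
}

int main(void)
{
    struct held_lock a = { .class_idx = 2 };     /* lock_classes[1] */
    struct held_lock b = { .class_idx = 0 };     /* never registered */

    printf("a -> %s\n", hlock_class(&a)->name);
    printf("b -> %p\n", (void *)hlock_class(&b));
    return 0;
}
```
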
| @@ -222,7 +231,7 @@ static void lock_release_holdtime(struct held_lock *hlock) | |||
| 222 | 231 | ||
| 223 | holdtime = sched_clock() - hlock->holdtime_stamp; | 232 | holdtime = sched_clock() - hlock->holdtime_stamp; |
| 224 | 233 | ||
| 225 | stats = get_lock_stats(hlock->class); | 234 | stats = get_lock_stats(hlock_class(hlock)); |
| 226 | if (hlock->read) | 235 | if (hlock->read) |
| 227 | lock_time_inc(&stats->read_holdtime, holdtime); | 236 | lock_time_inc(&stats->read_holdtime, holdtime); |
| 228 | else | 237 | else |
| @@ -372,6 +381,19 @@ unsigned int nr_process_chains; | |||
| 372 | unsigned int max_lockdep_depth; | 381 | unsigned int max_lockdep_depth; |
| 373 | unsigned int max_recursion_depth; | 382 | unsigned int max_recursion_depth; |
| 374 | 383 | ||
| 384 | static unsigned int lockdep_dependency_gen_id; | ||
| 385 | |||
| 386 | static bool lockdep_dependency_visit(struct lock_class *source, | ||
| 387 | unsigned int depth) | ||
| 388 | { | ||
| 389 | if (!depth) | ||
| 390 | lockdep_dependency_gen_id++; | ||
| 391 | if (source->dep_gen_id == lockdep_dependency_gen_id) | ||
| 392 | return true; | ||
| 393 | source->dep_gen_id = lockdep_dependency_gen_id; | ||
| 394 | return false; | ||
| 395 | } | ||
| 396 | |||
| 375 | #ifdef CONFIG_DEBUG_LOCKDEP | 397 | #ifdef CONFIG_DEBUG_LOCKDEP |
| 376 | /* | 398 | /* |
| 377 | * We cannot printk in early bootup code. Not even early_printk() | 399 | * We cannot printk in early bootup code. Not even early_printk() |
| @@ -505,7 +527,7 @@ static void print_lockdep_cache(struct lockdep_map *lock) | |||
| 505 | 527 | ||
| 506 | static void print_lock(struct held_lock *hlock) | 528 | static void print_lock(struct held_lock *hlock) |
| 507 | { | 529 | { |
| 508 | print_lock_name(hlock->class); | 530 | print_lock_name(hlock_class(hlock)); |
| 509 | printk(", at: "); | 531 | printk(", at: "); |
| 510 | print_ip_sym(hlock->acquire_ip); | 532 | print_ip_sym(hlock->acquire_ip); |
| 511 | } | 533 | } |
| @@ -558,6 +580,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth) | |||
| 558 | { | 580 | { |
| 559 | struct lock_list *entry; | 581 | struct lock_list *entry; |
| 560 | 582 | ||
| 583 | if (lockdep_dependency_visit(class, depth)) | ||
| 584 | return; | ||
| 585 | |||
| 561 | if (DEBUG_LOCKS_WARN_ON(depth >= 20)) | 586 | if (DEBUG_LOCKS_WARN_ON(depth >= 20)) |
| 562 | return; | 587 | return; |
| 563 | 588 | ||
| @@ -932,7 +957,7 @@ static noinline int print_circular_bug_tail(void) | |||
| 932 | if (debug_locks_silent) | 957 | if (debug_locks_silent) |
| 933 | return 0; | 958 | return 0; |
| 934 | 959 | ||
| 935 | this.class = check_source->class; | 960 | this.class = hlock_class(check_source); |
| 936 | if (!save_trace(&this.trace)) | 961 | if (!save_trace(&this.trace)) |
| 937 | return 0; | 962 | return 0; |
| 938 | 963 | ||
| @@ -959,6 +984,67 @@ static int noinline print_infinite_recursion_bug(void) | |||
| 959 | return 0; | 984 | return 0; |
| 960 | } | 985 | } |
| 961 | 986 | ||
| 987 | unsigned long __lockdep_count_forward_deps(struct lock_class *class, | ||
| 988 | unsigned int depth) | ||
| 989 | { | ||
| 990 | struct lock_list *entry; | ||
| 991 | unsigned long ret = 1; | ||
| 992 | |||
| 993 | if (lockdep_dependency_visit(class, depth)) | ||
| 994 | return 0; | ||
| 995 | |||
| 996 | /* | ||
| 997 | * Recurse this class's dependency list: | ||
| 998 | */ | ||
| 999 | list_for_each_entry(entry, &class->locks_after, entry) | ||
| 1000 | ret += __lockdep_count_forward_deps(entry->class, depth + 1); | ||
| 1001 | |||
| 1002 | return ret; | ||
| 1003 | } | ||
| 1004 | |||
| 1005 | unsigned long lockdep_count_forward_deps(struct lock_class *class) | ||
| 1006 | { | ||
| 1007 | unsigned long ret, flags; | ||
| 1008 | |||
| 1009 | local_irq_save(flags); | ||
| 1010 | __raw_spin_lock(&lockdep_lock); | ||
| 1011 | ret = __lockdep_count_forward_deps(class, 0); | ||
| 1012 | __raw_spin_unlock(&lockdep_lock); | ||
| 1013 | local_irq_restore(flags); | ||
| 1014 | |||
| 1015 | return ret; | ||
| 1016 | } | ||
| 1017 | |||
| 1018 | unsigned long __lockdep_count_backward_deps(struct lock_class *class, | ||
| 1019 | unsigned int depth) | ||
| 1020 | { | ||
| 1021 | struct lock_list *entry; | ||
| 1022 | unsigned long ret = 1; | ||
| 1023 | |||
| 1024 | if (lockdep_dependency_visit(class, depth)) | ||
| 1025 | return 0; | ||
| 1026 | /* | ||
| 1027 | * Recurse this class's dependency list: | ||
| 1028 | */ | ||
| 1029 | list_for_each_entry(entry, &class->locks_before, entry) | ||
| 1030 | ret += __lockdep_count_backward_deps(entry->class, depth + 1); | ||
| 1031 | |||
| 1032 | return ret; | ||
| 1033 | } | ||
| 1034 | |||
| 1035 | unsigned long lockdep_count_backward_deps(struct lock_class *class) | ||
| 1036 | { | ||
| 1037 | unsigned long ret, flags; | ||
| 1038 | |||
| 1039 | local_irq_save(flags); | ||
| 1040 | __raw_spin_lock(&lockdep_lock); | ||
| 1041 | ret = __lockdep_count_backward_deps(class, 0); | ||
| 1042 | __raw_spin_unlock(&lockdep_lock); | ||
| 1043 | local_irq_restore(flags); | ||
| 1044 | |||
| 1045 | return ret; | ||
| 1046 | } | ||
| 1047 | |||
| 962 | /* | 1048 | /* |
| 963 | * Prove that the dependency graph starting at <entry> can not | 1049 | * Prove that the dependency graph starting at <entry> can not |
| 964 | * lead to <target>. Print an error and return 0 if it does. | 1050 | * lead to <target>. Print an error and return 0 if it does. |
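
`lockdep_dependency_visit()` and the two new dependency counters above walk the graph the same way: each walk bumps `lockdep_dependency_gen_id` at depth 0 and a class is skipped once its `dep_gen_id` matches, so shared sub-graphs are visited only once per walk without having to clear per-class flags afterwards. The exported wrappers additionally take the internal graph lock with interrupts off, which the sketch below leaves out; the four-node graph and all names are invented.

```c
/*
 * Sketch of the generation-stamp trick: instead of clearing a "visited"
 * flag after every walk, each walk bumps a global generation number and
 * a node counts as visited once its stamp matches.
 */
#include <stdio.h>
#include <stdbool.h>

#define MAX_EDGES 4

struct node {
    const char *name;
    unsigned int dep_gen_id;            /* like lock_class::dep_gen_id */
    struct node *after[MAX_EDGES];      /* like locks_after */
    int nr_after;
};

static unsigned int dependency_gen_id;

static bool dependency_visit(struct node *n, unsigned int depth)
{
    if (!depth)
        dependency_gen_id++;            /* new walk: new generation */
    if (n->dep_gen_id == dependency_gen_id)
        return true;                    /* already seen in this walk */
    n->dep_gen_id = dependency_gen_id;
    return false;
}

/* Count n plus everything reachable through ->after, each node once. */
static unsigned long count_forward_deps(struct node *n, unsigned int depth)
{
    unsigned long ret = 1;
    int i;

    if (dependency_visit(n, depth))
        return 0;
    for (i = 0; i < n->nr_after; i++)
        ret += count_forward_deps(n->after[i], depth + 1);
    return ret;
}

int main(void)
{
    static struct node d = { .name = "D" };
    static struct node b = { .name = "B", .after = { &d }, .nr_after = 1 };
    static struct node c = { .name = "C", .after = { &d }, .nr_after = 1 };
    static struct node a = { .name = "A", .after = { &b, &c }, .nr_after = 2 };

    /* D is reachable twice but counted once: prints 4, and 4 again. */
    printf("%lu\n", count_forward_deps(&a, 0));
    printf("%lu\n", count_forward_deps(&a, 0));
    return 0;
}
```
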
| @@ -968,6 +1054,9 @@ check_noncircular(struct lock_class *source, unsigned int depth) | |||
| 968 | { | 1054 | { |
| 969 | struct lock_list *entry; | 1055 | struct lock_list *entry; |
| 970 | 1056 | ||
| 1057 | if (lockdep_dependency_visit(source, depth)) | ||
| 1058 | return 1; | ||
| 1059 | |||
| 971 | debug_atomic_inc(&nr_cyclic_check_recursions); | 1060 | debug_atomic_inc(&nr_cyclic_check_recursions); |
| 972 | if (depth > max_recursion_depth) | 1061 | if (depth > max_recursion_depth) |
| 973 | max_recursion_depth = depth; | 1062 | max_recursion_depth = depth; |
| @@ -977,7 +1066,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) | |||
| 977 | * Check this lock's dependency list: | 1066 | * Check this lock's dependency list: |
| 978 | */ | 1067 | */ |
| 979 | list_for_each_entry(entry, &source->locks_after, entry) { | 1068 | list_for_each_entry(entry, &source->locks_after, entry) { |
| 980 | if (entry->class == check_target->class) | 1069 | if (entry->class == hlock_class(check_target)) |
| 981 | return print_circular_bug_header(entry, depth+1); | 1070 | return print_circular_bug_header(entry, depth+1); |
| 982 | debug_atomic_inc(&nr_cyclic_checks); | 1071 | debug_atomic_inc(&nr_cyclic_checks); |
| 983 | if (!check_noncircular(entry->class, depth+1)) | 1072 | if (!check_noncircular(entry->class, depth+1)) |
| @@ -1011,6 +1100,9 @@ find_usage_forwards(struct lock_class *source, unsigned int depth) | |||
| 1011 | struct lock_list *entry; | 1100 | struct lock_list *entry; |
| 1012 | int ret; | 1101 | int ret; |
| 1013 | 1102 | ||
| 1103 | if (lockdep_dependency_visit(source, depth)) | ||
| 1104 | return 1; | ||
| 1105 | |||
| 1014 | if (depth > max_recursion_depth) | 1106 | if (depth > max_recursion_depth) |
| 1015 | max_recursion_depth = depth; | 1107 | max_recursion_depth = depth; |
| 1016 | if (depth >= RECURSION_LIMIT) | 1108 | if (depth >= RECURSION_LIMIT) |
| @@ -1050,6 +1142,9 @@ find_usage_backwards(struct lock_class *source, unsigned int depth) | |||
| 1050 | struct lock_list *entry; | 1142 | struct lock_list *entry; |
| 1051 | int ret; | 1143 | int ret; |
| 1052 | 1144 | ||
| 1145 | if (lockdep_dependency_visit(source, depth)) | ||
| 1146 | return 1; | ||
| 1147 | |||
| 1053 | if (!__raw_spin_is_locked(&lockdep_lock)) | 1148 | if (!__raw_spin_is_locked(&lockdep_lock)) |
| 1054 | return DEBUG_LOCKS_WARN_ON(1); | 1149 | return DEBUG_LOCKS_WARN_ON(1); |
| 1055 | 1150 | ||
| @@ -1064,6 +1159,11 @@ find_usage_backwards(struct lock_class *source, unsigned int depth) | |||
| 1064 | return 2; | 1159 | return 2; |
| 1065 | } | 1160 | } |
| 1066 | 1161 | ||
| 1162 | if (!source && debug_locks_off_graph_unlock()) { | ||
| 1163 | WARN_ON(1); | ||
| 1164 | return 0; | ||
| 1165 | } | ||
| 1166 | |||
| 1067 | /* | 1167 | /* |
| 1068 | * Check this lock's dependency list: | 1168 | * Check this lock's dependency list: |
| 1069 | */ | 1169 | */ |
| @@ -1103,9 +1203,9 @@ print_bad_irq_dependency(struct task_struct *curr, | |||
| 1103 | printk("\nand this task is already holding:\n"); | 1203 | printk("\nand this task is already holding:\n"); |
| 1104 | print_lock(prev); | 1204 | print_lock(prev); |
| 1105 | printk("which would create a new lock dependency:\n"); | 1205 | printk("which would create a new lock dependency:\n"); |
| 1106 | print_lock_name(prev->class); | 1206 | print_lock_name(hlock_class(prev)); |
| 1107 | printk(" ->"); | 1207 | printk(" ->"); |
| 1108 | print_lock_name(next->class); | 1208 | print_lock_name(hlock_class(next)); |
| 1109 | printk("\n"); | 1209 | printk("\n"); |
| 1110 | 1210 | ||
| 1111 | printk("\nbut this new dependency connects a %s-irq-safe lock:\n", | 1211 | printk("\nbut this new dependency connects a %s-irq-safe lock:\n", |
| @@ -1146,12 +1246,12 @@ check_usage(struct task_struct *curr, struct held_lock *prev, | |||
| 1146 | 1246 | ||
| 1147 | find_usage_bit = bit_backwards; | 1247 | find_usage_bit = bit_backwards; |
| 1148 | /* fills in <backwards_match> */ | 1248 | /* fills in <backwards_match> */ |
| 1149 | ret = find_usage_backwards(prev->class, 0); | 1249 | ret = find_usage_backwards(hlock_class(prev), 0); |
| 1150 | if (!ret || ret == 1) | 1250 | if (!ret || ret == 1) |
| 1151 | return ret; | 1251 | return ret; |
| 1152 | 1252 | ||
| 1153 | find_usage_bit = bit_forwards; | 1253 | find_usage_bit = bit_forwards; |
| 1154 | ret = find_usage_forwards(next->class, 0); | 1254 | ret = find_usage_forwards(hlock_class(next), 0); |
| 1155 | if (!ret || ret == 1) | 1255 | if (!ret || ret == 1) |
| 1156 | return ret; | 1256 | return ret; |
| 1157 | /* ret == 2 */ | 1257 | /* ret == 2 */ |
| @@ -1272,18 +1372,32 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, | |||
| 1272 | struct lockdep_map *next_instance, int read) | 1372 | struct lockdep_map *next_instance, int read) |
| 1273 | { | 1373 | { |
| 1274 | struct held_lock *prev; | 1374 | struct held_lock *prev; |
| 1375 | struct held_lock *nest = NULL; | ||
| 1275 | int i; | 1376 | int i; |
| 1276 | 1377 | ||
| 1277 | for (i = 0; i < curr->lockdep_depth; i++) { | 1378 | for (i = 0; i < curr->lockdep_depth; i++) { |
| 1278 | prev = curr->held_locks + i; | 1379 | prev = curr->held_locks + i; |
| 1279 | if (prev->class != next->class) | 1380 | |
| 1381 | if (prev->instance == next->nest_lock) | ||
| 1382 | nest = prev; | ||
| 1383 | |||
| 1384 | if (hlock_class(prev) != hlock_class(next)) | ||
| 1280 | continue; | 1385 | continue; |
| 1386 | |||
| 1281 | /* | 1387 | /* |
| 1282 | * Allow read-after-read recursion of the same | 1388 | * Allow read-after-read recursion of the same |
| 1283 | * lock class (i.e. read_lock(lock)+read_lock(lock)): | 1389 | * lock class (i.e. read_lock(lock)+read_lock(lock)): |
| 1284 | */ | 1390 | */ |
| 1285 | if ((read == 2) && prev->read) | 1391 | if ((read == 2) && prev->read) |
| 1286 | return 2; | 1392 | return 2; |
| 1393 | |||
| 1394 | /* | ||
| 1395 | * We're holding the nest_lock, which serializes this lock's | ||
| 1396 | * nesting behaviour. | ||
| 1397 | */ | ||
| 1398 | if (nest) | ||
| 1399 | return 2; | ||
| 1400 | |||
| 1287 | return print_deadlock_bug(curr, prev, next); | 1401 | return print_deadlock_bug(curr, prev, next); |
| 1288 | } | 1402 | } |
| 1289 | return 1; | 1403 | return 1; |
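
`check_deadlock()` above gains a nest_lock exception: taking a second lock of an already-held class is no longer reported when the instance named as the nest lock is also held, since that outer lock serializes the nesting. Below is a stripped-down userspace model of just that test; the structures, class numbers and lock names are invented and nothing here validates lock ordering the way real lockdep does.

```c
/* Toy version of the nest_lock allowance added to check_deadlock(). */
#include <stdio.h>

#define MAX_HELD 8

struct lockdep_map { const char *name; int class; };

struct held_lock {
    struct lockdep_map *instance;
    struct lockdep_map *nest_lock;
    int class;
};

static struct held_lock held[MAX_HELD];
static int depth;

/* Returns 1 = ok, 2 = ok because nested, 0 = potential deadlock. */
static int check_deadlock(struct lockdep_map *next, struct lockdep_map *nest)
{
    struct held_lock *prev, *nested = NULL;
    int i;

    for (i = 0; i < depth; i++) {
        prev = &held[i];
        if (prev->instance == nest)
            nested = prev;              /* the serializing lock is held */
        if (prev->class != next->class)
            continue;
        if (nested)
            return 2;
        printf("possible recursive locking: %s\n", next->name);
        return 0;
    }
    return 1;
}

static void acquire(struct lockdep_map *lock, struct lockdep_map *nest)
{
    if (!check_deadlock(lock, nest))
        return;
    held[depth].instance = lock;
    held[depth].nest_lock = nest;
    held[depth].class = lock->class;
    depth++;
}

int main(void)
{
    struct lockdep_map parent  = { "parent",  0 };
    struct lockdep_map child_a = { "child_a", 1 };
    struct lockdep_map child_b = { "child_b", 1 };   /* same class */

    acquire(&child_a, NULL);
    acquire(&child_b, NULL);             /* complains */
    depth = 0;

    acquire(&parent, NULL);
    acquire(&child_a, &parent);
    acquire(&child_b, &parent);          /* allowed: parent is held */
    return 0;
}
```
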
| @@ -1329,7 +1443,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, | |||
| 1329 | */ | 1443 | */ |
| 1330 | check_source = next; | 1444 | check_source = next; |
| 1331 | check_target = prev; | 1445 | check_target = prev; |
| 1332 | if (!(check_noncircular(next->class, 0))) | 1446 | if (!(check_noncircular(hlock_class(next), 0))) |
| 1333 | return print_circular_bug_tail(); | 1447 | return print_circular_bug_tail(); |
| 1334 | 1448 | ||
| 1335 | if (!check_prev_add_irq(curr, prev, next)) | 1449 | if (!check_prev_add_irq(curr, prev, next)) |
| @@ -1353,8 +1467,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, | |||
| 1353 | * chains - the second one will be new, but L1 already has | 1467 | * chains - the second one will be new, but L1 already has |
| 1354 | * L2 added to its dependency list, due to the first chain.) | 1468 | * L2 added to its dependency list, due to the first chain.) |
| 1355 | */ | 1469 | */ |
| 1356 | list_for_each_entry(entry, &prev->class->locks_after, entry) { | 1470 | list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) { |
| 1357 | if (entry->class == next->class) { | 1471 | if (entry->class == hlock_class(next)) { |
| 1358 | if (distance == 1) | 1472 | if (distance == 1) |
| 1359 | entry->distance = 1; | 1473 | entry->distance = 1; |
| 1360 | return 2; | 1474 | return 2; |
| @@ -1365,26 +1479,28 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, | |||
| 1365 | * Ok, all validations passed, add the new lock | 1479 | * Ok, all validations passed, add the new lock |
| 1366 | * to the previous lock's dependency list: | 1480 | * to the previous lock's dependency list: |
| 1367 | */ | 1481 | */ |
| 1368 | ret = add_lock_to_list(prev->class, next->class, | 1482 | ret = add_lock_to_list(hlock_class(prev), hlock_class(next), |
| 1369 | &prev->class->locks_after, next->acquire_ip, distance); | 1483 | &hlock_class(prev)->locks_after, |
| 1484 | next->acquire_ip, distance); | ||
| 1370 | 1485 | ||
| 1371 | if (!ret) | 1486 | if (!ret) |
| 1372 | return 0; | 1487 | return 0; |
| 1373 | 1488 | ||
| 1374 | ret = add_lock_to_list(next->class, prev->class, | 1489 | ret = add_lock_to_list(hlock_class(next), hlock_class(prev), |
| 1375 | &next->class->locks_before, next->acquire_ip, distance); | 1490 | &hlock_class(next)->locks_before, |
| 1491 | next->acquire_ip, distance); | ||
| 1376 | if (!ret) | 1492 | if (!ret) |
| 1377 | return 0; | 1493 | return 0; |
| 1378 | 1494 | ||
| 1379 | /* | 1495 | /* |
| 1380 | * Debugging printouts: | 1496 | * Debugging printouts: |
| 1381 | */ | 1497 | */ |
| 1382 | if (verbose(prev->class) || verbose(next->class)) { | 1498 | if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { |
| 1383 | graph_unlock(); | 1499 | graph_unlock(); |
| 1384 | printk("\n new dependency: "); | 1500 | printk("\n new dependency: "); |
| 1385 | print_lock_name(prev->class); | 1501 | print_lock_name(hlock_class(prev)); |
| 1386 | printk(" => "); | 1502 | printk(" => "); |
| 1387 | print_lock_name(next->class); | 1503 | print_lock_name(hlock_class(next)); |
| 1388 | printk("\n"); | 1504 | printk("\n"); |
| 1389 | dump_stack(); | 1505 | dump_stack(); |
| 1390 | return graph_lock(); | 1506 | return graph_lock(); |
| @@ -1481,7 +1597,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, | |||
| 1481 | struct held_lock *hlock, | 1597 | struct held_lock *hlock, |
| 1482 | u64 chain_key) | 1598 | u64 chain_key) |
| 1483 | { | 1599 | { |
| 1484 | struct lock_class *class = hlock->class; | 1600 | struct lock_class *class = hlock_class(hlock); |
| 1485 | struct list_head *hash_head = chainhashentry(chain_key); | 1601 | struct list_head *hash_head = chainhashentry(chain_key); |
| 1486 | struct lock_chain *chain; | 1602 | struct lock_chain *chain; |
| 1487 | struct held_lock *hlock_curr, *hlock_next; | 1603 | struct held_lock *hlock_curr, *hlock_next; |
| @@ -1554,7 +1670,7 @@ cache_hit: | |||
| 1554 | if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { | 1670 | if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { |
| 1555 | chain->base = cn; | 1671 | chain->base = cn; |
| 1556 | for (j = 0; j < chain->depth - 1; j++, i++) { | 1672 | for (j = 0; j < chain->depth - 1; j++, i++) { |
| 1557 | int lock_id = curr->held_locks[i].class - lock_classes; | 1673 | int lock_id = curr->held_locks[i].class_idx - 1; |
| 1558 | chain_hlocks[chain->base + j] = lock_id; | 1674 | chain_hlocks[chain->base + j] = lock_id; |
| 1559 | } | 1675 | } |
| 1560 | chain_hlocks[chain->base + j] = class - lock_classes; | 1676 | chain_hlocks[chain->base + j] = class - lock_classes; |
| @@ -1650,7 +1766,7 @@ static void check_chain_key(struct task_struct *curr) | |||
| 1650 | WARN_ON(1); | 1766 | WARN_ON(1); |
| 1651 | return; | 1767 | return; |
| 1652 | } | 1768 | } |
| 1653 | id = hlock->class - lock_classes; | 1769 | id = hlock->class_idx - 1; |
| 1654 | if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) | 1770 | if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) |
| 1655 | return; | 1771 | return; |
| 1656 | 1772 | ||
| @@ -1695,7 +1811,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, | |||
| 1695 | print_lock(this); | 1811 | print_lock(this); |
| 1696 | 1812 | ||
| 1697 | printk("{%s} state was registered at:\n", usage_str[prev_bit]); | 1813 | printk("{%s} state was registered at:\n", usage_str[prev_bit]); |
| 1698 | print_stack_trace(this->class->usage_traces + prev_bit, 1); | 1814 | print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); |
| 1699 | 1815 | ||
| 1700 | print_irqtrace_events(curr); | 1816 | print_irqtrace_events(curr); |
| 1701 | printk("\nother info that might help us debug this:\n"); | 1817 | printk("\nother info that might help us debug this:\n"); |
| @@ -1714,7 +1830,7 @@ static inline int | |||
| 1714 | valid_state(struct task_struct *curr, struct held_lock *this, | 1830 | valid_state(struct task_struct *curr, struct held_lock *this, |
| 1715 | enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) | 1831 | enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) |
| 1716 | { | 1832 | { |
| 1717 | if (unlikely(this->class->usage_mask & (1 << bad_bit))) | 1833 | if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) |
| 1718 | return print_usage_bug(curr, this, bad_bit, new_bit); | 1834 | return print_usage_bug(curr, this, bad_bit, new_bit); |
| 1719 | return 1; | 1835 | return 1; |
| 1720 | } | 1836 | } |
| @@ -1753,7 +1869,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, | |||
| 1753 | lockdep_print_held_locks(curr); | 1869 | lockdep_print_held_locks(curr); |
| 1754 | 1870 | ||
| 1755 | printk("\nthe first lock's dependencies:\n"); | 1871 | printk("\nthe first lock's dependencies:\n"); |
| 1756 | print_lock_dependencies(this->class, 0); | 1872 | print_lock_dependencies(hlock_class(this), 0); |
| 1757 | 1873 | ||
| 1758 | printk("\nthe second lock's dependencies:\n"); | 1874 | printk("\nthe second lock's dependencies:\n"); |
| 1759 | print_lock_dependencies(other, 0); | 1875 | print_lock_dependencies(other, 0); |
| @@ -1776,7 +1892,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, | |||
| 1776 | 1892 | ||
| 1777 | find_usage_bit = bit; | 1893 | find_usage_bit = bit; |
| 1778 | /* fills in <forwards_match> */ | 1894 | /* fills in <forwards_match> */ |
| 1779 | ret = find_usage_forwards(this->class, 0); | 1895 | ret = find_usage_forwards(hlock_class(this), 0); |
| 1780 | if (!ret || ret == 1) | 1896 | if (!ret || ret == 1) |
| 1781 | return ret; | 1897 | return ret; |
| 1782 | 1898 | ||
| @@ -1795,7 +1911,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, | |||
| 1795 | 1911 | ||
| 1796 | find_usage_bit = bit; | 1912 | find_usage_bit = bit; |
| 1797 | /* fills in <backwards_match> */ | 1913 | /* fills in <backwards_match> */ |
| 1798 | ret = find_usage_backwards(this->class, 0); | 1914 | ret = find_usage_backwards(hlock_class(this), 0); |
| 1799 | if (!ret || ret == 1) | 1915 | if (!ret || ret == 1) |
| 1800 | return ret; | 1916 | return ret; |
| 1801 | 1917 | ||
| @@ -1861,7 +1977,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, | |||
| 1861 | LOCK_ENABLED_HARDIRQS_READ, "hard-read")) | 1977 | LOCK_ENABLED_HARDIRQS_READ, "hard-read")) |
| 1862 | return 0; | 1978 | return 0; |
| 1863 | #endif | 1979 | #endif |
| 1864 | if (hardirq_verbose(this->class)) | 1980 | if (hardirq_verbose(hlock_class(this))) |
| 1865 | ret = 2; | 1981 | ret = 2; |
| 1866 | break; | 1982 | break; |
| 1867 | case LOCK_USED_IN_SOFTIRQ: | 1983 | case LOCK_USED_IN_SOFTIRQ: |
| @@ -1886,7 +2002,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, | |||
| 1886 | LOCK_ENABLED_SOFTIRQS_READ, "soft-read")) | 2002 | LOCK_ENABLED_SOFTIRQS_READ, "soft-read")) |
| 1887 | return 0; | 2003 | return 0; |
| 1888 | #endif | 2004 | #endif |
| 1889 | if (softirq_verbose(this->class)) | 2005 | if (softirq_verbose(hlock_class(this))) |
| 1890 | ret = 2; | 2006 | ret = 2; |
| 1891 | break; | 2007 | break; |
| 1892 | case LOCK_USED_IN_HARDIRQ_READ: | 2008 | case LOCK_USED_IN_HARDIRQ_READ: |
| @@ -1899,7 +2015,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, | |||
| 1899 | if (!check_usage_forwards(curr, this, | 2015 | if (!check_usage_forwards(curr, this, |
| 1900 | LOCK_ENABLED_HARDIRQS, "hard")) | 2016 | LOCK_ENABLED_HARDIRQS, "hard")) |
| 1901 | return 0; | 2017 | return 0; |
| 1902 | if (hardirq_verbose(this->class)) | 2018 | if (hardirq_verbose(hlock_class(this))) |
| 1903 | ret = 2; | 2019 | ret = 2; |
| 1904 | break; | 2020 | break; |
| 1905 | case LOCK_USED_IN_SOFTIRQ_READ: | 2021 | case LOCK_USED_IN_SOFTIRQ_READ: |
| @@ -1912,7 +2028,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, | |||
| 1912 | if (!check_usage_forwards(curr, this, | 2028 | if (!check_usage_forwards(curr, this, |
| 1913 | LOCK_ENABLED_SOFTIRQS, "soft")) | 2029 | LOCK_ENABLED_SOFTIRQS, "soft")) |
| 1914 | return 0; | 2030 | return 0; |
| 1915 | if (softirq_verbose(this->class)) | 2031 | if (softirq_verbose(hlock_class(this))) |
| 1916 | ret = 2; | 2032 | ret = 2; |
| 1917 | break; | 2033 | break; |
| 1918 | case LOCK_ENABLED_HARDIRQS: | 2034 | case LOCK_ENABLED_HARDIRQS: |
| @@ -1938,7 +2054,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, | |||
| 1938 | LOCK_USED_IN_HARDIRQ_READ, "hard-read")) | 2054 | LOCK_USED_IN_HARDIRQ_READ, "hard-read")) |
| 1939 | return 0; | 2055 | return 0; |
| 1940 | #endif | 2056 | #endif |
| 1941 | if (hardirq_verbose(this->class)) | 2057 | if (hardirq_verbose(hlock_class(this))) |
| 1942 | ret = 2; | 2058 | ret = 2; |
| 1943 | break; | 2059 | break; |
| 1944 | case LOCK_ENABLED_SOFTIRQS: | 2060 | case LOCK_ENABLED_SOFTIRQS: |
| @@ -1964,7 +2080,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, | |||
| 1964 | LOCK_USED_IN_SOFTIRQ_READ, "soft-read")) | 2080 | LOCK_USED_IN_SOFTIRQ_READ, "soft-read")) |
| 1965 | return 0; | 2081 | return 0; |
| 1966 | #endif | 2082 | #endif |
| 1967 | if (softirq_verbose(this->class)) | 2083 | if (softirq_verbose(hlock_class(this))) |
| 1968 | ret = 2; | 2084 | ret = 2; |
| 1969 | break; | 2085 | break; |
| 1970 | case LOCK_ENABLED_HARDIRQS_READ: | 2086 | case LOCK_ENABLED_HARDIRQS_READ: |
| @@ -1979,7 +2095,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, | |||
| 1979 | LOCK_USED_IN_HARDIRQ, "hard")) | 2095 | LOCK_USED_IN_HARDIRQ, "hard")) |
| 1980 | return 0; | 2096 | return 0; |
| 1981 | #endif | 2097 | #endif |
| 1982 | if (hardirq_verbose(this->class)) | 2098 | if (hardirq_verbose(hlock_class(this))) |
| 1983 | ret = 2; | 2099 | ret = 2; |
| 1984 | break; | 2100 | break; |
| 1985 | case LOCK_ENABLED_SOFTIRQS_READ: | 2101 | case LOCK_ENABLED_SOFTIRQS_READ: |
| @@ -1994,7 +2110,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, | |||
| 1994 | LOCK_USED_IN_SOFTIRQ, "soft")) | 2110 | LOCK_USED_IN_SOFTIRQ, "soft")) |
| 1995 | return 0; | 2111 | return 0; |
| 1996 | #endif | 2112 | #endif |
| 1997 | if (softirq_verbose(this->class)) | 2113 | if (softirq_verbose(hlock_class(this))) |
| 1998 | ret = 2; | 2114 | ret = 2; |
| 1999 | break; | 2115 | break; |
| 2000 | default: | 2116 | default: |
| @@ -2310,7 +2426,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
| 2310 | * If already set then do not dirty the cacheline, | 2426 | * If already set then do not dirty the cacheline, |
| 2311 | * nor do any checks: | 2427 | * nor do any checks: |
| 2312 | */ | 2428 | */ |
| 2313 | if (likely(this->class->usage_mask & new_mask)) | 2429 | if (likely(hlock_class(this)->usage_mask & new_mask)) |
| 2314 | return 1; | 2430 | return 1; |
| 2315 | 2431 | ||
| 2316 | if (!graph_lock()) | 2432 | if (!graph_lock()) |
| @@ -2318,14 +2434,14 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
| 2318 | /* | 2434 | /* |
| 2319 | * Make sure we didnt race: | 2435 | * Make sure we didnt race: |
| 2320 | */ | 2436 | */ |
| 2321 | if (unlikely(this->class->usage_mask & new_mask)) { | 2437 | if (unlikely(hlock_class(this)->usage_mask & new_mask)) { |
| 2322 | graph_unlock(); | 2438 | graph_unlock(); |
| 2323 | return 1; | 2439 | return 1; |
| 2324 | } | 2440 | } |
| 2325 | 2441 | ||
| 2326 | this->class->usage_mask |= new_mask; | 2442 | hlock_class(this)->usage_mask |= new_mask; |
| 2327 | 2443 | ||
| 2328 | if (!save_trace(this->class->usage_traces + new_bit)) | 2444 | if (!save_trace(hlock_class(this)->usage_traces + new_bit)) |
| 2329 | return 0; | 2445 | return 0; |
| 2330 | 2446 | ||
| 2331 | switch (new_bit) { | 2447 | switch (new_bit) { |
| @@ -2405,7 +2521,7 @@ EXPORT_SYMBOL_GPL(lockdep_init_map); | |||
| 2405 | */ | 2521 | */ |
| 2406 | static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | 2522 | static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, |
| 2407 | int trylock, int read, int check, int hardirqs_off, | 2523 | int trylock, int read, int check, int hardirqs_off, |
| 2408 | unsigned long ip) | 2524 | struct lockdep_map *nest_lock, unsigned long ip) |
| 2409 | { | 2525 | { |
| 2410 | struct task_struct *curr = current; | 2526 | struct task_struct *curr = current; |
| 2411 | struct lock_class *class = NULL; | 2527 | struct lock_class *class = NULL; |
| @@ -2459,10 +2575,12 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
| 2459 | return 0; | 2575 | return 0; |
| 2460 | 2576 | ||
| 2461 | hlock = curr->held_locks + depth; | 2577 | hlock = curr->held_locks + depth; |
| 2462 | 2578 | if (DEBUG_LOCKS_WARN_ON(!class)) | |
| 2463 | hlock->class = class; | 2579 | return 0; |
| 2580 | hlock->class_idx = class - lock_classes + 1; | ||
| 2464 | hlock->acquire_ip = ip; | 2581 | hlock->acquire_ip = ip; |
| 2465 | hlock->instance = lock; | 2582 | hlock->instance = lock; |
| 2583 | hlock->nest_lock = nest_lock; | ||
| 2466 | hlock->trylock = trylock; | 2584 | hlock->trylock = trylock; |
| 2467 | hlock->read = read; | 2585 | hlock->read = read; |
| 2468 | hlock->check = check; | 2586 | hlock->check = check; |
| @@ -2574,6 +2692,55 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, | |||
| 2574 | return 1; | 2692 | return 1; |
| 2575 | } | 2693 | } |
| 2576 | 2694 | ||
| 2695 | static int | ||
| 2696 | __lock_set_subclass(struct lockdep_map *lock, | ||
| 2697 | unsigned int subclass, unsigned long ip) | ||
| 2698 | { | ||
| 2699 | struct task_struct *curr = current; | ||
| 2700 | struct held_lock *hlock, *prev_hlock; | ||
| 2701 | struct lock_class *class; | ||
| 2702 | unsigned int depth; | ||
| 2703 | int i; | ||
| 2704 | |||
| 2705 | depth = curr->lockdep_depth; | ||
| 2706 | if (DEBUG_LOCKS_WARN_ON(!depth)) | ||
| 2707 | return 0; | ||
| 2708 | |||
| 2709 | prev_hlock = NULL; | ||
| 2710 | for (i = depth-1; i >= 0; i--) { | ||
| 2711 | hlock = curr->held_locks + i; | ||
| 2712 | /* | ||
| 2713 | * We must not cross into another context: | ||
| 2714 | */ | ||
| 2715 | if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) | ||
| 2716 | break; | ||
| 2717 | if (hlock->instance == lock) | ||
| 2718 | goto found_it; | ||
| 2719 | prev_hlock = hlock; | ||
| 2720 | } | ||
| 2721 | return print_unlock_inbalance_bug(curr, lock, ip); | ||
| 2722 | |||
| 2723 | found_it: | ||
| 2724 | class = register_lock_class(lock, subclass, 0); | ||
| 2725 | hlock->class_idx = class - lock_classes + 1; | ||
| 2726 | |||
| 2727 | curr->lockdep_depth = i; | ||
| 2728 | curr->curr_chain_key = hlock->prev_chain_key; | ||
| 2729 | |||
| 2730 | for (; i < depth; i++) { | ||
| 2731 | hlock = curr->held_locks + i; | ||
| 2732 | if (!__lock_acquire(hlock->instance, | ||
| 2733 | hlock_class(hlock)->subclass, hlock->trylock, | ||
| 2734 | hlock->read, hlock->check, hlock->hardirqs_off, | ||
| 2735 | hlock->nest_lock, hlock->acquire_ip)) | ||
| 2736 | return 0; | ||
| 2737 | } | ||
| 2738 | |||
| 2739 | if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) | ||
| 2740 | return 0; | ||
| 2741 | return 1; | ||
| 2742 | } | ||
| 2743 | |||
| 2577 | /* | 2744 | /* |
| 2578 | * Remove the lock to the list of currently held locks in a | 2745 | * Remove the lock to the list of currently held locks in a |
| 2579 | * potentially non-nested (out of order) manner. This is a | 2746 | * potentially non-nested (out of order) manner. This is a |
| @@ -2624,9 +2791,9 @@ found_it: | |||
| 2624 | for (i++; i < depth; i++) { | 2791 | for (i++; i < depth; i++) { |
| 2625 | hlock = curr->held_locks + i; | 2792 | hlock = curr->held_locks + i; |
| 2626 | if (!__lock_acquire(hlock->instance, | 2793 | if (!__lock_acquire(hlock->instance, |
| 2627 | hlock->class->subclass, hlock->trylock, | 2794 | hlock_class(hlock)->subclass, hlock->trylock, |
| 2628 | hlock->read, hlock->check, hlock->hardirqs_off, | 2795 | hlock->read, hlock->check, hlock->hardirqs_off, |
| 2629 | hlock->acquire_ip)) | 2796 | hlock->nest_lock, hlock->acquire_ip)) |
| 2630 | return 0; | 2797 | return 0; |
| 2631 | } | 2798 | } |
| 2632 | 2799 | ||
| @@ -2669,7 +2836,7 @@ static int lock_release_nested(struct task_struct *curr, | |||
| 2669 | 2836 | ||
| 2670 | #ifdef CONFIG_DEBUG_LOCKDEP | 2837 | #ifdef CONFIG_DEBUG_LOCKDEP |
| 2671 | hlock->prev_chain_key = 0; | 2838 | hlock->prev_chain_key = 0; |
| 2672 | hlock->class = NULL; | 2839 | hlock->class_idx = 0; |
| 2673 | hlock->acquire_ip = 0; | 2840 | hlock->acquire_ip = 0; |
| 2674 | hlock->irq_context = 0; | 2841 | hlock->irq_context = 0; |
| 2675 | #endif | 2842 | #endif |
| @@ -2738,18 +2905,36 @@ static void check_flags(unsigned long flags) | |||
| 2738 | #endif | 2905 | #endif |
| 2739 | } | 2906 | } |
| 2740 | 2907 | ||
| 2908 | void | ||
| 2909 | lock_set_subclass(struct lockdep_map *lock, | ||
| 2910 | unsigned int subclass, unsigned long ip) | ||
| 2911 | { | ||
| 2912 | unsigned long flags; | ||
| 2913 | |||
| 2914 | if (unlikely(current->lockdep_recursion)) | ||
| 2915 | return; | ||
| 2916 | |||
| 2917 | raw_local_irq_save(flags); | ||
| 2918 | current->lockdep_recursion = 1; | ||
| 2919 | check_flags(flags); | ||
| 2920 | if (__lock_set_subclass(lock, subclass, ip)) | ||
| 2921 | check_chain_key(current); | ||
| 2922 | current->lockdep_recursion = 0; | ||
| 2923 | raw_local_irq_restore(flags); | ||
| 2924 | } | ||
| 2925 | |||
| 2926 | EXPORT_SYMBOL_GPL(lock_set_subclass); | ||
| 2927 | |||
| 2741 | /* | 2928 | /* |
| 2742 | * We are not always called with irqs disabled - do that here, | 2929 | * We are not always called with irqs disabled - do that here, |
| 2743 | * and also avoid lockdep recursion: | 2930 | * and also avoid lockdep recursion: |
| 2744 | */ | 2931 | */ |
| 2745 | void lock_acquire(struct lockdep_map *lock, unsigned int subclass, | 2932 | void lock_acquire(struct lockdep_map *lock, unsigned int subclass, |
| 2746 | int trylock, int read, int check, unsigned long ip) | 2933 | int trylock, int read, int check, |
| 2934 | struct lockdep_map *nest_lock, unsigned long ip) | ||
| 2747 | { | 2935 | { |
| 2748 | unsigned long flags; | 2936 | unsigned long flags; |
| 2749 | 2937 | ||
| 2750 | if (unlikely(!lock_stat && !prove_locking)) | ||
| 2751 | return; | ||
| 2752 | |||
| 2753 | if (unlikely(current->lockdep_recursion)) | 2938 | if (unlikely(current->lockdep_recursion)) |
| 2754 | return; | 2939 | return; |
| 2755 | 2940 | ||
| @@ -2758,7 +2943,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
| 2758 | 2943 | ||
| 2759 | current->lockdep_recursion = 1; | 2944 | current->lockdep_recursion = 1; |
| 2760 | __lock_acquire(lock, subclass, trylock, read, check, | 2945 | __lock_acquire(lock, subclass, trylock, read, check, |
| 2761 | irqs_disabled_flags(flags), ip); | 2946 | irqs_disabled_flags(flags), nest_lock, ip); |
| 2762 | current->lockdep_recursion = 0; | 2947 | current->lockdep_recursion = 0; |
| 2763 | raw_local_irq_restore(flags); | 2948 | raw_local_irq_restore(flags); |
| 2764 | } | 2949 | } |
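
`lock_set_subclass()` above re-annotates a lock that is already held: `__lock_set_subclass()` locates it in the held-lock stack, rewinds `curr->lockdep_depth` to that point, re-registers the class with the new subclass and then replays the acquisitions that were stacked on top, the same replay pattern used in the non-nested release path further down, now passing the saved nest_lock through the widened `__lock_acquire()`. The snippet below models only the rewind-and-replay bookkeeping with plain strings; the lock names and the subclass value 1 (think SINGLE_DEPTH_NESTING) are illustrative, not the kernel API.

```c
/* Rewind-to-target-and-replay model of __lock_set_subclass(). */
#include <stdio.h>
#include <string.h>

#define MAX_DEPTH 8

struct entry { const char *name; unsigned int subclass; };

static struct entry stack[MAX_DEPTH];
static int depth;

static void acquire(const char *name, unsigned int subclass)
{
    stack[depth].name = name;
    stack[depth].subclass = subclass;
    depth++;
}

static int set_subclass(const char *name, unsigned int subclass)
{
    struct entry copy[MAX_DEPTH];
    int i, j, old_depth = depth;

    for (i = depth - 1; i >= 0; i--)
        if (strcmp(stack[i].name, name) == 0)
            goto found_it;
    return 0;                           /* not held: imbalance */

found_it:
    for (j = i; j < old_depth; j++)     /* remember what sits on top */
        copy[j] = stack[j];
    depth = i;                          /* rewind */
    acquire(name, subclass);            /* re-acquire with the new subclass */
    for (j = i + 1; j < old_depth; j++) /* replay the rest unchanged */
        acquire(copy[j].name, copy[j].subclass);
    return depth == old_depth;
}

int main(void)
{
    int i;

    acquire("this_rq->lock", 0);
    acquire("busiest->lock", 0);
    acquire("some_other->lock", 0);

    set_subclass("busiest->lock", 1);

    for (i = 0; i < depth; i++)
        printf("%d: %s (subclass %u)\n", i, stack[i].name, stack[i].subclass);
    return 0;
}
```
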
| @@ -2770,9 +2955,6 @@ void lock_release(struct lockdep_map *lock, int nested, | |||
| 2770 | { | 2955 | { |
| 2771 | unsigned long flags; | 2956 | unsigned long flags; |
| 2772 | 2957 | ||
| 2773 | if (unlikely(!lock_stat && !prove_locking)) | ||
| 2774 | return; | ||
| 2775 | |||
| 2776 | if (unlikely(current->lockdep_recursion)) | 2958 | if (unlikely(current->lockdep_recursion)) |
| 2777 | return; | 2959 | return; |
| 2778 | 2960 | ||
| @@ -2845,9 +3027,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) | |||
| 2845 | found_it: | 3027 | found_it: |
| 2846 | hlock->waittime_stamp = sched_clock(); | 3028 | hlock->waittime_stamp = sched_clock(); |
| 2847 | 3029 | ||
| 2848 | point = lock_contention_point(hlock->class, ip); | 3030 | point = lock_contention_point(hlock_class(hlock), ip); |
| 2849 | 3031 | ||
| 2850 | stats = get_lock_stats(hlock->class); | 3032 | stats = get_lock_stats(hlock_class(hlock)); |
| 2851 | if (point < ARRAY_SIZE(stats->contention_point)) | 3033 | if (point < ARRAY_SIZE(stats->contention_point)) |
| 2852 | stats->contention_point[i]++; | 3034 | stats->contention_point[i]++; |
| 2853 | if (lock->cpu != smp_processor_id()) | 3035 | if (lock->cpu != smp_processor_id()) |
| @@ -2893,7 +3075,7 @@ found_it: | |||
| 2893 | hlock->holdtime_stamp = now; | 3075 | hlock->holdtime_stamp = now; |
| 2894 | } | 3076 | } |
| 2895 | 3077 | ||
| 2896 | stats = get_lock_stats(hlock->class); | 3078 | stats = get_lock_stats(hlock_class(hlock)); |
| 2897 | if (waittime) { | 3079 | if (waittime) { |
| 2898 | if (hlock->read) | 3080 | if (hlock->read) |
| 2899 | lock_time_inc(&stats->read_waittime, waittime); | 3081 | lock_time_inc(&stats->read_waittime, waittime); |
| @@ -2988,6 +3170,7 @@ static void zap_class(struct lock_class *class) | |||
| 2988 | list_del_rcu(&class->hash_entry); | 3170 | list_del_rcu(&class->hash_entry); |
| 2989 | list_del_rcu(&class->lock_entry); | 3171 | list_del_rcu(&class->lock_entry); |
| 2990 | 3172 | ||
| 3173 | class->key = NULL; | ||
| 2991 | } | 3174 | } |
| 2992 | 3175 | ||
| 2993 | static inline int within(const void *addr, void *start, unsigned long size) | 3176 | static inline int within(const void *addr, void *start, unsigned long size) |
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index c3600a091a28..55db193d366d 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h | |||
| @@ -17,9 +17,6 @@ | |||
| 17 | */ | 17 | */ |
| 18 | #define MAX_LOCKDEP_ENTRIES 8192UL | 18 | #define MAX_LOCKDEP_ENTRIES 8192UL |
| 19 | 19 | ||
| 20 | #define MAX_LOCKDEP_KEYS_BITS 11 | ||
| 21 | #define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) | ||
| 22 | |||
| 23 | #define MAX_LOCKDEP_CHAINS_BITS 14 | 20 | #define MAX_LOCKDEP_CHAINS_BITS 14 |
| 24 | #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) | 21 | #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) |
| 25 | 22 | ||
| @@ -53,6 +50,9 @@ extern unsigned int nr_process_chains; | |||
| 53 | extern unsigned int max_lockdep_depth; | 50 | extern unsigned int max_lockdep_depth; |
| 54 | extern unsigned int max_recursion_depth; | 51 | extern unsigned int max_recursion_depth; |
| 55 | 52 | ||
| 53 | extern unsigned long lockdep_count_forward_deps(struct lock_class *); | ||
| 54 | extern unsigned long lockdep_count_backward_deps(struct lock_class *); | ||
| 55 | |||
| 56 | #ifdef CONFIG_DEBUG_LOCKDEP | 56 | #ifdef CONFIG_DEBUG_LOCKDEP |
| 57 | /* | 57 | /* |
| 58 | * Various lockdep statistics: | 58 | * Various lockdep statistics: |
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 9b0e940e2545..fa19aee604c2 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c | |||
| @@ -63,34 +63,6 @@ static void l_stop(struct seq_file *m, void *v) | |||
| 63 | { | 63 | { |
| 64 | } | 64 | } |
| 65 | 65 | ||
| 66 | static unsigned long count_forward_deps(struct lock_class *class) | ||
| 67 | { | ||
| 68 | struct lock_list *entry; | ||
| 69 | unsigned long ret = 1; | ||
| 70 | |||
| 71 | /* | ||
| 72 | * Recurse this class's dependency list: | ||
| 73 | */ | ||
| 74 | list_for_each_entry(entry, &class->locks_after, entry) | ||
| 75 | ret += count_forward_deps(entry->class); | ||
| 76 | |||
| 77 | return ret; | ||
| 78 | } | ||
| 79 | |||
| 80 | static unsigned long count_backward_deps(struct lock_class *class) | ||
| 81 | { | ||
| 82 | struct lock_list *entry; | ||
| 83 | unsigned long ret = 1; | ||
| 84 | |||
| 85 | /* | ||
| 86 | * Recurse this class's dependency list: | ||
| 87 | */ | ||
| 88 | list_for_each_entry(entry, &class->locks_before, entry) | ||
| 89 | ret += count_backward_deps(entry->class); | ||
| 90 | |||
| 91 | return ret; | ||
| 92 | } | ||
| 93 | |||
| 94 | static void print_name(struct seq_file *m, struct lock_class *class) | 66 | static void print_name(struct seq_file *m, struct lock_class *class) |
| 95 | { | 67 | { |
| 96 | char str[128]; | 68 | char str[128]; |
| @@ -124,10 +96,10 @@ static int l_show(struct seq_file *m, void *v) | |||
| 124 | #ifdef CONFIG_DEBUG_LOCKDEP | 96 | #ifdef CONFIG_DEBUG_LOCKDEP |
| 125 | seq_printf(m, " OPS:%8ld", class->ops); | 97 | seq_printf(m, " OPS:%8ld", class->ops); |
| 126 | #endif | 98 | #endif |
| 127 | nr_forward_deps = count_forward_deps(class); | 99 | nr_forward_deps = lockdep_count_forward_deps(class); |
| 128 | seq_printf(m, " FD:%5ld", nr_forward_deps); | 100 | seq_printf(m, " FD:%5ld", nr_forward_deps); |
| 129 | 101 | ||
| 130 | nr_backward_deps = count_backward_deps(class); | 102 | nr_backward_deps = lockdep_count_backward_deps(class); |
| 131 | seq_printf(m, " BD:%5ld", nr_backward_deps); | 103 | seq_printf(m, " BD:%5ld", nr_backward_deps); |
| 132 | 104 | ||
| 133 | get_usage_chars(class, &c1, &c2, &c3, &c4); | 105 | get_usage_chars(class, &c1, &c2, &c3, &c4); |
| @@ -229,6 +201,9 @@ static int lc_show(struct seq_file *m, void *v) | |||
| 229 | 201 | ||
| 230 | for (i = 0; i < chain->depth; i++) { | 202 | for (i = 0; i < chain->depth; i++) { |
| 231 | class = lock_chain_get_class(chain, i); | 203 | class = lock_chain_get_class(chain, i); |
| 204 | if (!class->key) | ||
| 205 | continue; | ||
| 206 | |||
| 232 | seq_printf(m, "[%p] ", class->key); | 207 | seq_printf(m, "[%p] ", class->key); |
| 233 | print_name(m, class); | 208 | print_name(m, class); |
| 234 | seq_puts(m, "\n"); | 209 | seq_puts(m, "\n"); |
| @@ -350,7 +325,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) | |||
| 350 | if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) | 325 | if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) |
| 351 | nr_hardirq_read_unsafe++; | 326 | nr_hardirq_read_unsafe++; |
| 352 | 327 | ||
| 353 | sum_forward_deps += count_forward_deps(class); | 328 | sum_forward_deps += lockdep_count_forward_deps(class); |
| 354 | } | 329 | } |
| 355 | #ifdef CONFIG_DEBUG_LOCKDEP | 330 | #ifdef CONFIG_DEBUG_LOCKDEP |
| 356 | DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); | 331 | DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); |
diff --git a/kernel/marker.c b/kernel/marker.c index 971da5317903..7d1faecd7a51 100644 --- a/kernel/marker.c +++ b/kernel/marker.c | |||
| @@ -126,6 +126,11 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) | |||
| 126 | struct marker_probe_closure *multi; | 126 | struct marker_probe_closure *multi; |
| 127 | int i; | 127 | int i; |
| 128 | /* | 128 | /* |
| 129 | * Read mdata->ptype before mdata->multi. | ||
| 130 | */ | ||
| 131 | smp_rmb(); | ||
| 132 | multi = mdata->multi; | ||
| 133 | /* | ||
| 129 | * multi points to an array, therefore accessing the array | 134 | * multi points to an array, therefore accessing the array |
| 130 | * depends on reading multi. However, even in this case, | 135 | * depends on reading multi. However, even in this case, |
| 131 | * we must insure that the pointer is read _before_ the array | 136 | * we must insure that the pointer is read _before_ the array |
| @@ -133,7 +138,6 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) | |||
| 133 | * in the fast path, so put the explicit barrier here. | 138 | * in the fast path, so put the explicit barrier here. |
| 134 | */ | 139 | */ |
| 135 | smp_read_barrier_depends(); | 140 | smp_read_barrier_depends(); |
| 136 | multi = mdata->multi; | ||
| 137 | for (i = 0; multi[i].func; i++) { | 141 | for (i = 0; multi[i].func; i++) { |
| 138 | va_start(args, call_private); | 142 | va_start(args, call_private); |
| 139 | multi[i].func(multi[i].probe_private, call_private, | 143 | multi[i].func(multi[i].probe_private, call_private, |
| @@ -175,6 +179,11 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) | |||
| 175 | struct marker_probe_closure *multi; | 179 | struct marker_probe_closure *multi; |
| 176 | int i; | 180 | int i; |
| 177 | /* | 181 | /* |
| 182 | * Read mdata->ptype before mdata->multi. | ||
| 183 | */ | ||
| 184 | smp_rmb(); | ||
| 185 | multi = mdata->multi; | ||
| 186 | /* | ||
| 178 | * multi points to an array, therefore accessing the array | 187 | * multi points to an array, therefore accessing the array |
| 179 | * depends on reading multi. However, even in this case, | 188 | * depends on reading multi. However, even in this case, |
| 180 | * we must insure that the pointer is read _before_ the array | 189 | * we must insure that the pointer is read _before_ the array |
| @@ -182,7 +191,6 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) | |||
| 182 | * in the fast path, so put the explicit barrier here. | 191 | * in the fast path, so put the explicit barrier here. |
| 183 | */ | 192 | */ |
| 184 | smp_read_barrier_depends(); | 193 | smp_read_barrier_depends(); |
| 185 | multi = mdata->multi; | ||
| 186 | for (i = 0; multi[i].func; i++) | 194 | for (i = 0; multi[i].func; i++) |
| 187 | multi[i].func(multi[i].probe_private, call_private, | 195 | multi[i].func(multi[i].probe_private, call_private, |
| 188 | mdata->format, &args); | 196 | mdata->format, &args); |
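
The marker change is purely about load ordering: as the added comment says, `mdata->ptype` has to be read before `mdata->multi`, so the load of `multi` is moved to after an `smp_rmb()` instead of relying on `smp_read_barrier_depends()` alone. A portable userspace analogue of that pattern uses an acquire load on the flag before touching the table, paired with the updater's release store; the probe table and all names below are invented, and C11 atomics stand in for the kernel barriers.

```c
/* Acquire/release model of "read ptype before multi". */
#include <stdio.h>
#include <stdatomic.h>
#include <pthread.h>

struct closure { void (*func)(const char *); };

static void say(const char *msg) { printf("probe: %s\n", msg); }

static struct closure table[2] = { { say }, { NULL } };

static _Atomic int ptype;                 /* 0 = single probe, 1 = multi */
static struct closure *_Atomic multi;

static void *updater(void *arg)
{
    atomic_store_explicit(&multi, table, memory_order_release);
    atomic_store_explicit(&ptype, 1, memory_order_release);
    return arg;
}

static void *prober(void *arg)
{
    /* Read ptype first (acquire), only then read multi. */
    if (atomic_load_explicit(&ptype, memory_order_acquire)) {
        struct closure *m =
            atomic_load_explicit(&multi, memory_order_acquire);
        for (int i = 0; m[i].func; i++)
            m[i].func("hello");
    }
    return arg;
}

int main(void)
{
    pthread_t a, b;

    pthread_create(&a, NULL, updater, NULL);
    pthread_create(&b, NULL, prober, NULL);
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    return 0;
}
```
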
diff --git a/kernel/module.c b/kernel/module.c index d8b5605132a0..08864d257eb0 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -325,18 +325,6 @@ static unsigned long find_symbol(const char *name, | |||
| 325 | return -ENOENT; | 325 | return -ENOENT; |
| 326 | } | 326 | } |
| 327 | 327 | ||
| 328 | /* lookup symbol in given range of kernel_symbols */ | ||
| 329 | static const struct kernel_symbol *lookup_symbol(const char *name, | ||
| 330 | const struct kernel_symbol *start, | ||
| 331 | const struct kernel_symbol *stop) | ||
| 332 | { | ||
| 333 | const struct kernel_symbol *ks = start; | ||
| 334 | for (; ks < stop; ks++) | ||
| 335 | if (strcmp(ks->name, name) == 0) | ||
| 336 | return ks; | ||
| 337 | return NULL; | ||
| 338 | } | ||
| 339 | |||
| 340 | /* Search for module by name: must hold module_mutex. */ | 328 | /* Search for module by name: must hold module_mutex. */ |
| 341 | static struct module *find_module(const char *name) | 329 | static struct module *find_module(const char *name) |
| 342 | { | 330 | { |
| @@ -690,7 +678,7 @@ static int try_stop_module(struct module *mod, int flags, int *forced) | |||
| 690 | if (flags & O_NONBLOCK) { | 678 | if (flags & O_NONBLOCK) { |
| 691 | struct stopref sref = { mod, flags, forced }; | 679 | struct stopref sref = { mod, flags, forced }; |
| 692 | 680 | ||
| 693 | return stop_machine_run(__try_stop_module, &sref, NR_CPUS); | 681 | return stop_machine(__try_stop_module, &sref, NULL); |
| 694 | } else { | 682 | } else { |
| 695 | /* We don't need to stop the machine for this. */ | 683 | /* We don't need to stop the machine for this. */ |
| 696 | mod->state = MODULE_STATE_GOING; | 684 | mod->state = MODULE_STATE_GOING; |
| @@ -1428,7 +1416,7 @@ static int __unlink_module(void *_mod) | |||
| 1428 | static void free_module(struct module *mod) | 1416 | static void free_module(struct module *mod) |
| 1429 | { | 1417 | { |
| 1430 | /* Delete from various lists */ | 1418 | /* Delete from various lists */ |
| 1431 | stop_machine_run(__unlink_module, mod, NR_CPUS); | 1419 | stop_machine(__unlink_module, mod, NULL); |
| 1432 | remove_notes_attrs(mod); | 1420 | remove_notes_attrs(mod); |
| 1433 | remove_sect_attrs(mod); | 1421 | remove_sect_attrs(mod); |
| 1434 | mod_kobject_remove(mod); | 1422 | mod_kobject_remove(mod); |
| @@ -1703,6 +1691,19 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, | |||
| 1703 | } | 1691 | } |
| 1704 | 1692 | ||
| 1705 | #ifdef CONFIG_KALLSYMS | 1693 | #ifdef CONFIG_KALLSYMS |
| 1694 | |||
| 1695 | /* lookup symbol in given range of kernel_symbols */ | ||
| 1696 | static const struct kernel_symbol *lookup_symbol(const char *name, | ||
| 1697 | const struct kernel_symbol *start, | ||
| 1698 | const struct kernel_symbol *stop) | ||
| 1699 | { | ||
| 1700 | const struct kernel_symbol *ks = start; | ||
| 1701 | for (; ks < stop; ks++) | ||
| 1702 | if (strcmp(ks->name, name) == 0) | ||
| 1703 | return ks; | ||
| 1704 | return NULL; | ||
| 1705 | } | ||
| 1706 | |||
| 1706 | static int is_exported(const char *name, const struct module *mod) | 1707 | static int is_exported(const char *name, const struct module *mod) |
| 1707 | { | 1708 | { |
| 1708 | if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) | 1709 | if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) |
| @@ -2196,7 +2197,7 @@ static struct module *load_module(void __user *umod, | |||
| 2196 | /* Now sew it into the lists so we can get lockdep and oops | 2197 | /* Now sew it into the lists so we can get lockdep and oops |
| 2197 | * info during argument parsing. Noone should access us, since | 2198 | * info during argument parsing. Noone should access us, since |
| 2198 | * strong_try_module_get() will fail. */ | 2199 | * strong_try_module_get() will fail. */ |
| 2199 | stop_machine_run(__link_module, mod, NR_CPUS); | 2200 | stop_machine(__link_module, mod, NULL); |
| 2200 | 2201 | ||
| 2201 | /* Size of section 0 is 0, so this works well if no params */ | 2202 | /* Size of section 0 is 0, so this works well if no params */ |
| 2202 | err = parse_args(mod->name, mod->args, | 2203 | err = parse_args(mod->name, mod->args, |
| @@ -2230,7 +2231,7 @@ static struct module *load_module(void __user *umod, | |||
| 2230 | return mod; | 2231 | return mod; |
| 2231 | 2232 | ||
| 2232 | unlink: | 2233 | unlink: |
| 2233 | stop_machine_run(__unlink_module, mod, NR_CPUS); | 2234 | stop_machine(__unlink_module, mod, NULL); |
| 2234 | module_arch_cleanup(mod); | 2235 | module_arch_cleanup(mod); |
| 2235 | cleanup: | 2236 | cleanup: |
| 2236 | kobject_del(&mod->mkobj.kobj); | 2237 | kobject_del(&mod->mkobj.kobj); |
| @@ -2287,7 +2288,7 @@ sys_init_module(void __user *umod, | |||
| 2287 | 2288 | ||
| 2288 | /* Start the module */ | 2289 | /* Start the module */ |
| 2289 | if (mod->init != NULL) | 2290 | if (mod->init != NULL) |
| 2290 | ret = mod->init(); | 2291 | ret = do_one_initcall(mod->init); |
| 2291 | if (ret < 0) { | 2292 | if (ret < 0) { |
| 2292 | /* Init routine failed: abort. Try to protect us from | 2293 | /* Init routine failed: abort. Try to protect us from |
| 2293 | buggy refcounters. */ | 2294 | buggy refcounters. */ |
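
Two API moves run through this file: the `stop_machine_run(fn, data, NR_CPUS)` call sites become `stop_machine(fn, data, NULL)`, and `sys_init_module()` now runs a module's init through `do_one_initcall()` instead of calling `mod->init()` directly, so modules get the same initcall handling as built-in code. The wrapper below only imitates the timing side of that idea and is not the real `do_one_initcall()` (which lives in init/main.c and takes just the function pointer); the extra name argument and the output format are made up for the demo.

```c
/* Crude userspace imitation of an initcall wrapper that times the call. */
#include <stdio.h>
#include <time.h>

typedef int (*initcall_t)(void);

static int my_module_init(void)
{
    printf("hello from a fake module init\n");
    return 0;
}

static int run_one_initcall(initcall_t fn, const char *name)
{
    struct timespec t0, t1;
    int ret;

    clock_gettime(CLOCK_MONOTONIC, &t0);
    ret = fn();
    clock_gettime(CLOCK_MONOTONIC, &t1);

    printf("initcall %s returned %d after %ld usecs\n", name, ret,
           (t1.tv_sec - t0.tv_sec) * 1000000L +
           (t1.tv_nsec - t0.tv_nsec) / 1000L);
    return ret;
}

int main(void)
{
    return run_one_initcall(my_module_init, "my_module_init") < 0;
}
```
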
diff --git a/kernel/mutex.c b/kernel/mutex.c index bcdc9ac8ef60..12c779dc65d4 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
| @@ -34,6 +34,7 @@ | |||
| 34 | /*** | 34 | /*** |
| 35 | * mutex_init - initialize the mutex | 35 | * mutex_init - initialize the mutex |
| 36 | * @lock: the mutex to be initialized | 36 | * @lock: the mutex to be initialized |
| 37 | * @key: the lock_class_key for the class; used by mutex lock debugging | ||
| 37 | * | 38 | * |
| 38 | * Initialize the mutex to unlocked state. | 39 | * Initialize the mutex to unlocked state. |
| 39 | * | 40 | * |
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 8cb757026386..da9c2dda6a4e 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c | |||
| @@ -24,7 +24,7 @@ | |||
| 24 | * requirement that the application has is cleaned up when closes the file | 24 | * requirement that the application has is cleaned up when closes the file |
| 25 | * pointer or exits the pm_qos_object will get an opportunity to clean up. | 25 | * pointer or exits the pm_qos_object will get an opportunity to clean up. |
| 26 | * | 26 | * |
| 27 | * mark gross mgross@linux.intel.com | 27 | * Mark Gross <mgross@linux.intel.com> |
| 28 | */ | 28 | */ |
| 29 | 29 | ||
| 30 | #include <linux/pm_qos_params.h> | 30 | #include <linux/pm_qos_params.h> |
| @@ -211,8 +211,8 @@ EXPORT_SYMBOL_GPL(pm_qos_requirement); | |||
| 211 | * @value: defines the qos request | 211 | * @value: defines the qos request |
| 212 | * | 212 | * |
| 213 | * This function inserts a new entry in the pm_qos_class list of requested qos | 213 | * This function inserts a new entry in the pm_qos_class list of requested qos |
| 214 | * performance charactoistics. It recomputes the agregate QoS expectations for | 214 | * performance characteristics. It recomputes the aggregate QoS expectations |
| 215 | * the pm_qos_class of parrameters. | 215 | * for the pm_qos_class of parameters. |
| 216 | */ | 216 | */ |
| 217 | int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) | 217 | int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) |
| 218 | { | 218 | { |
| @@ -250,10 +250,10 @@ EXPORT_SYMBOL_GPL(pm_qos_add_requirement); | |||
| 250 | * @name: identifies the request | 250 | * @name: identifies the request |
| 251 | * @value: defines the qos request | 251 | * @value: defines the qos request |
| 252 | * | 252 | * |
| 253 | * Updates an existing qos requierement for the pm_qos_class of parameters along | 253 | * Updates an existing qos requirement for the pm_qos_class of parameters along |
| 254 | * with updating the target pm_qos_class value. | 254 | * with updating the target pm_qos_class value. |
| 255 | * | 255 | * |
| 256 | * If the named request isn't in the lest then no change is made. | 256 | * If the named request isn't in the list then no change is made. |
| 257 | */ | 257 | */ |
| 258 | int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) | 258 | int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) |
| 259 | { | 259 | { |
| @@ -287,7 +287,7 @@ EXPORT_SYMBOL_GPL(pm_qos_update_requirement); | |||
| 287 | * @pm_qos_class: identifies which list of qos request to us | 287 | * @pm_qos_class: identifies which list of qos request to us |
| 288 | * @name: identifies the request | 288 | * @name: identifies the request |
| 289 | * | 289 | * |
| 290 | * Will remove named qos request from pm_qos_class list of parrameters and | 290 | * Will remove named qos request from pm_qos_class list of parameters and |
| 291 | * recompute the current target value for the pm_qos_class. | 291 | * recompute the current target value for the pm_qos_class. |
| 292 | */ | 292 | */ |
| 293 | void pm_qos_remove_requirement(int pm_qos_class, char *name) | 293 | void pm_qos_remove_requirement(int pm_qos_class, char *name) |
| @@ -319,7 +319,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); | |||
| 319 | * @notifier: notifier block managed by caller. | 319 | * @notifier: notifier block managed by caller. |
| 320 | * | 320 | * |
| 321 | * will register the notifier into a notification chain that gets called | 321 | * will register the notifier into a notification chain that gets called |
| 322 | * uppon changes to the pm_qos_class target value. | 322 | * upon changes to the pm_qos_class target value. |
| 323 | */ | 323 | */ |
| 324 | int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) | 324 | int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) |
| 325 | { | 325 | { |
| @@ -338,7 +338,7 @@ EXPORT_SYMBOL_GPL(pm_qos_add_notifier); | |||
| 338 | * @notifier: notifier block to be removed. | 338 | * @notifier: notifier block to be removed. |
| 339 | * | 339 | * |
| 340 | * will remove the notifier from the notification chain that gets called | 340 | * will remove the notifier from the notification chain that gets called |
| 341 | * uppon changes to the pm_qos_class target value. | 341 | * upon changes to the pm_qos_class target value. |
| 342 | */ | 342 | */ |
| 343 | int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) | 343 | int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) |
| 344 | { | 344 | { |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 9a21681aa80f..e36d5798cbff 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
| @@ -289,21 +289,29 @@ void do_schedule_next_timer(struct siginfo *info) | |||
| 289 | else | 289 | else |
| 290 | schedule_next_timer(timr); | 290 | schedule_next_timer(timr); |
| 291 | 291 | ||
| 292 | info->si_overrun = timr->it_overrun_last; | 292 | info->si_overrun += timr->it_overrun_last; |
| 293 | } | 293 | } |
| 294 | 294 | ||
| 295 | if (timr) | 295 | if (timr) |
| 296 | unlock_timer(timr, flags); | 296 | unlock_timer(timr, flags); |
| 297 | } | 297 | } |
| 298 | 298 | ||
| 299 | int posix_timer_event(struct k_itimer *timr,int si_private) | 299 | int posix_timer_event(struct k_itimer *timr, int si_private) |
| 300 | { | 300 | { |
| 301 | memset(&timr->sigq->info, 0, sizeof(siginfo_t)); | 301 | /* |
| 302 | * FIXME: if ->sigq is queued we can race with | ||
| 303 | * dequeue_signal()->do_schedule_next_timer(). | ||
| 304 | * | ||
| 305 | * If dequeue_signal() sees the "right" value of | ||
| 306 | * si_sys_private it calls do_schedule_next_timer(). | ||
| 307 | * We re-queue ->sigq and drop ->it_lock(). | ||
| 308 | * do_schedule_next_timer() locks the timer | ||
| 309 | * and re-schedules it while ->sigq is pending. | ||
| 310 | * Not really bad, but not that we want. | ||
| 311 | */ | ||
| 302 | timr->sigq->info.si_sys_private = si_private; | 312 | timr->sigq->info.si_sys_private = si_private; |
| 303 | /* Send signal to the process that owns this timer.*/ | ||
| 304 | 313 | ||
| 305 | timr->sigq->info.si_signo = timr->it_sigev_signo; | 314 | timr->sigq->info.si_signo = timr->it_sigev_signo; |
| 306 | timr->sigq->info.si_errno = 0; | ||
| 307 | timr->sigq->info.si_code = SI_TIMER; | 315 | timr->sigq->info.si_code = SI_TIMER; |
| 308 | timr->sigq->info.si_tid = timr->it_id; | 316 | timr->sigq->info.si_tid = timr->it_id; |
| 309 | timr->sigq->info.si_value = timr->it_sigev_value; | 317 | timr->sigq->info.si_value = timr->it_sigev_value; |
| @@ -435,6 +443,7 @@ static struct k_itimer * alloc_posix_timer(void) | |||
| 435 | kmem_cache_free(posix_timers_cache, tmr); | 443 | kmem_cache_free(posix_timers_cache, tmr); |
| 436 | tmr = NULL; | 444 | tmr = NULL; |
| 437 | } | 445 | } |
| 446 | memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); | ||
| 438 | return tmr; | 447 | return tmr; |
| 439 | } | 448 | } |
| 440 | 449 | ||
diff --git a/kernel/printk.c b/kernel/printk.c index a7f7559c5f6c..b51b1567bb55 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
| @@ -1309,14 +1309,14 @@ void tty_write_message(struct tty_struct *tty, char *msg) | |||
| 1309 | 1309 | ||
| 1310 | #if defined CONFIG_PRINTK | 1310 | #if defined CONFIG_PRINTK |
| 1311 | 1311 | ||
| 1312 | DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); | ||
| 1313 | /* | 1312 | /* |
| 1314 | * printk rate limiting, lifted from the networking subsystem. | 1313 | * printk rate limiting, lifted from the networking subsystem. |
| 1315 | * | 1314 | * |
| 1316 | * This enforces a rate limit: not more than one kernel message | 1315 | * This enforces a rate limit: not more than 10 kernel messages |
| 1317 | * every printk_ratelimit_jiffies to make a denial-of-service | 1316 | * every 5s to make a denial-of-service attack impossible. |
| 1318 | * attack impossible. | ||
| 1319 | */ | 1317 | */ |
| 1318 | DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); | ||
| 1319 | |||
| 1320 | int printk_ratelimit(void) | 1320 | int printk_ratelimit(void) |
| 1321 | { | 1321 | { |
| 1322 | return __ratelimit(&printk_ratelimit_state); | 1322 | return __ratelimit(&printk_ratelimit_state); |
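The printk.c hunk only reorders the DEFINE_RATELIMIT_STATE() line and its comment so the comment documents the state it precedes (at most 10 messages per 5*HZ). As a reminder of how callers consume this, a hedged sketch follows; the device name and messages are invented.

```c
/* Caller-side sketch of printk rate limiting (device and messages invented). */
#include <linux/kernel.h>
#include <linux/ratelimit.h>

static DEFINE_RATELIMIT_STATE(mydev_rs, 5 * HZ, 10);	/* same policy: 10 per 5s */

static void mydev_complain(void)
{
	/* global state shared by every printk_ratelimit() user */
	if (printk_ratelimit())
		printk(KERN_WARNING "mydev: dropping packet, queue full\n");

	/* or a private state, using the same __ratelimit() helper underneath */
	if (__ratelimit(&mydev_rs))
		printk(KERN_WARNING "mydev: repeated hardware error\n");
}
```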
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 6f8696c502f4..aad93cdc9f68 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c | |||
| @@ -91,8 +91,8 @@ static void force_quiescent_state(struct rcu_data *rdp, | |||
| 91 | * rdp->cpu is the current cpu. | 91 | * rdp->cpu is the current cpu. |
| 92 | * | 92 | * |
| 93 | * cpu_online_map is updated by the _cpu_down() | 93 | * cpu_online_map is updated by the _cpu_down() |
| 94 | * using stop_machine_run(). Since we're in irqs disabled | 94 | * using __stop_machine(). Since we're in irqs disabled |
| 95 | * section, stop_machine_run() is not executing, hence | 95 | * section, __stop_machine() is not executing, hence |
| 96 | * the cpu_online_map is stable. | 96 | * the cpu_online_map is stable. |
| 97 | * | 97 | * |
| 98 | * However, a cpu might have been offlined _just_ before | 98 | * However, a cpu might have been offlined _just_ before |
diff --git a/kernel/relay.c b/kernel/relay.c index 04006ef970b8..8d13a7855c08 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
| @@ -944,6 +944,10 @@ static void relay_file_read_consume(struct rchan_buf *buf, | |||
| 944 | size_t n_subbufs = buf->chan->n_subbufs; | 944 | size_t n_subbufs = buf->chan->n_subbufs; |
| 945 | size_t read_subbuf; | 945 | size_t read_subbuf; |
| 946 | 946 | ||
| 947 | if (buf->subbufs_produced == buf->subbufs_consumed && | ||
| 948 | buf->offset == buf->bytes_consumed) | ||
| 949 | return; | ||
| 950 | |||
| 947 | if (buf->bytes_consumed + bytes_consumed > subbuf_size) { | 951 | if (buf->bytes_consumed + bytes_consumed > subbuf_size) { |
| 948 | relay_subbufs_consumed(buf->chan, buf->cpu, 1); | 952 | relay_subbufs_consumed(buf->chan, buf->cpu, 1); |
| 949 | buf->bytes_consumed = 0; | 953 | buf->bytes_consumed = 0; |
| @@ -975,6 +979,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) | |||
| 975 | 979 | ||
| 976 | relay_file_read_consume(buf, read_pos, 0); | 980 | relay_file_read_consume(buf, read_pos, 0); |
| 977 | 981 | ||
| 982 | consumed = buf->subbufs_consumed; | ||
| 983 | |||
| 978 | if (unlikely(buf->offset > subbuf_size)) { | 984 | if (unlikely(buf->offset > subbuf_size)) { |
| 979 | if (produced == consumed) | 985 | if (produced == consumed) |
| 980 | return 0; | 986 | return 0; |
| @@ -993,8 +999,12 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) | |||
| 993 | if (consumed > produced) | 999 | if (consumed > produced) |
| 994 | produced += n_subbufs * subbuf_size; | 1000 | produced += n_subbufs * subbuf_size; |
| 995 | 1001 | ||
| 996 | if (consumed == produced) | 1002 | if (consumed == produced) { |
| 1003 | if (buf->offset == subbuf_size && | ||
| 1004 | buf->subbufs_produced > buf->subbufs_consumed) | ||
| 1005 | return 1; | ||
| 997 | return 0; | 1006 | return 0; |
| 1007 | } | ||
| 998 | 1008 | ||
| 999 | return 1; | 1009 | return 1; |
| 1000 | } | 1010 | } |
diff --git a/kernel/resource.c b/kernel/resource.c index 74af2d7cb5a1..f5b518eabefe 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -490,7 +490,7 @@ resource_size_t resource_alignment(struct resource *res) | |||
| 490 | { | 490 | { |
| 491 | switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { | 491 | switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { |
| 492 | case IORESOURCE_SIZEALIGN: | 492 | case IORESOURCE_SIZEALIGN: |
| 493 | return res->end - res->start + 1; | 493 | return resource_size(res); |
| 494 | case IORESOURCE_STARTALIGN: | 494 | case IORESOURCE_STARTALIGN: |
| 495 | return res->start; | 495 | return res->start; |
| 496 | default: | 496 | default: |
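The resource.c hunk replaces the open-coded `res->end - res->start + 1` with resource_size(), which computes the same inclusive length. A hedged sketch of the usual caller pattern, with an invented driver name:

```c
/* Typical use of resource_size() in a probe path (driver name invented). */
static int mydrv_probe_sketch(struct resource *res)
{
	resource_size_t len = resource_size(res);   /* == res->end - res->start + 1 */
	void __iomem *base;

	if (!request_mem_region(res->start, len, "mydrv"))
		return -EBUSY;

	base = ioremap(res->start, len);
	if (!base) {
		release_mem_region(res->start, len);
		return -ENOMEM;
	}
	/* ... use the mapping ... */
	iounmap(base);
	release_mem_region(res->start, len);
	return 0;
}
```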
diff --git a/kernel/sched.c b/kernel/sched.c index 0236958addcb..d601fb0406ca 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -600,7 +600,6 @@ struct rq { | |||
| 600 | /* BKL stats */ | 600 | /* BKL stats */ |
| 601 | unsigned int bkl_count; | 601 | unsigned int bkl_count; |
| 602 | #endif | 602 | #endif |
| 603 | struct lock_class_key rq_lock_key; | ||
| 604 | }; | 603 | }; |
| 605 | 604 | ||
| 606 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 605 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
| @@ -834,7 +833,7 @@ static inline u64 global_rt_period(void) | |||
| 834 | 833 | ||
| 835 | static inline u64 global_rt_runtime(void) | 834 | static inline u64 global_rt_runtime(void) |
| 836 | { | 835 | { |
| 837 | if (sysctl_sched_rt_period < 0) | 836 | if (sysctl_sched_rt_runtime < 0) |
| 838 | return RUNTIME_INF; | 837 | return RUNTIME_INF; |
| 839 | 838 | ||
| 840 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | 839 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; |
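The second sched.c hunk is a copy-and-paste fix: global_rt_runtime() was testing sysctl_sched_rt_period for the "unlimited" sentinel instead of sysctl_sched_rt_runtime, so writing -1 to sched_rt_runtime_us never mapped to RUNTIME_INF. Restating the intended pair for clarity (this mirrors the kernel code, with added comments):

```c
static inline u64 global_rt_period(void)
{
	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
}

static inline u64 global_rt_runtime(void)
{
	if (sysctl_sched_rt_runtime < 0)	/* -1 in sched_rt_runtime_us ...   */
		return RUNTIME_INF;		/* ... means no RT throttling       */

	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}
```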
| @@ -2759,10 +2758,10 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) | |||
| 2759 | } else { | 2758 | } else { |
| 2760 | if (rq1 < rq2) { | 2759 | if (rq1 < rq2) { |
| 2761 | spin_lock(&rq1->lock); | 2760 | spin_lock(&rq1->lock); |
| 2762 | spin_lock(&rq2->lock); | 2761 | spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); |
| 2763 | } else { | 2762 | } else { |
| 2764 | spin_lock(&rq2->lock); | 2763 | spin_lock(&rq2->lock); |
| 2765 | spin_lock(&rq1->lock); | 2764 | spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); |
| 2766 | } | 2765 | } |
| 2767 | } | 2766 | } |
| 2768 | update_rq_clock(rq1); | 2767 | update_rq_clock(rq1); |
| @@ -2805,14 +2804,21 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
| 2805 | if (busiest < this_rq) { | 2804 | if (busiest < this_rq) { |
| 2806 | spin_unlock(&this_rq->lock); | 2805 | spin_unlock(&this_rq->lock); |
| 2807 | spin_lock(&busiest->lock); | 2806 | spin_lock(&busiest->lock); |
| 2808 | spin_lock(&this_rq->lock); | 2807 | spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); |
| 2809 | ret = 1; | 2808 | ret = 1; |
| 2810 | } else | 2809 | } else |
| 2811 | spin_lock(&busiest->lock); | 2810 | spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); |
| 2812 | } | 2811 | } |
| 2813 | return ret; | 2812 | return ret; |
| 2814 | } | 2813 | } |
| 2815 | 2814 | ||
| 2815 | static void double_unlock_balance(struct rq *this_rq, struct rq *busiest) | ||
| 2816 | __releases(busiest->lock) | ||
| 2817 | { | ||
| 2818 | spin_unlock(&busiest->lock); | ||
| 2819 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); | ||
| 2820 | } | ||
| 2821 | |||
| 2816 | /* | 2822 | /* |
| 2817 | * If dest_cpu is allowed for this process, migrate the task to it. | 2823 | * If dest_cpu is allowed for this process, migrate the task to it. |
| 2818 | * This is accomplished by forcing the cpu_allowed mask to only | 2824 | * This is accomplished by forcing the cpu_allowed mask to only |
| @@ -3637,7 +3643,7 @@ redo: | |||
| 3637 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | 3643 | ld_moved = move_tasks(this_rq, this_cpu, busiest, |
| 3638 | imbalance, sd, CPU_NEWLY_IDLE, | 3644 | imbalance, sd, CPU_NEWLY_IDLE, |
| 3639 | &all_pinned); | 3645 | &all_pinned); |
| 3640 | spin_unlock(&busiest->lock); | 3646 | double_unlock_balance(this_rq, busiest); |
| 3641 | 3647 | ||
| 3642 | if (unlikely(all_pinned)) { | 3648 | if (unlikely(all_pinned)) { |
| 3643 | cpu_clear(cpu_of(busiest), *cpus); | 3649 | cpu_clear(cpu_of(busiest), *cpus); |
| @@ -3752,7 +3758,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
| 3752 | else | 3758 | else |
| 3753 | schedstat_inc(sd, alb_failed); | 3759 | schedstat_inc(sd, alb_failed); |
| 3754 | } | 3760 | } |
| 3755 | spin_unlock(&target_rq->lock); | 3761 | double_unlock_balance(busiest_rq, target_rq); |
| 3756 | } | 3762 | } |
| 3757 | 3763 | ||
| 3758 | #ifdef CONFIG_NO_HZ | 3764 | #ifdef CONFIG_NO_HZ |
| @@ -5004,19 +5010,21 @@ recheck: | |||
| 5004 | return -EPERM; | 5010 | return -EPERM; |
| 5005 | } | 5011 | } |
| 5006 | 5012 | ||
| 5013 | if (user) { | ||
| 5007 | #ifdef CONFIG_RT_GROUP_SCHED | 5014 | #ifdef CONFIG_RT_GROUP_SCHED |
| 5008 | /* | 5015 | /* |
| 5009 | * Do not allow realtime tasks into groups that have no runtime | 5016 | * Do not allow realtime tasks into groups that have no runtime |
| 5010 | * assigned. | 5017 | * assigned. |
| 5011 | */ | 5018 | */ |
| 5012 | if (user | 5019 | if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) |
| 5013 | && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) | 5020 | return -EPERM; |
| 5014 | return -EPERM; | ||
| 5015 | #endif | 5021 | #endif |
| 5016 | 5022 | ||
| 5017 | retval = security_task_setscheduler(p, policy, param); | 5023 | retval = security_task_setscheduler(p, policy, param); |
| 5018 | if (retval) | 5024 | if (retval) |
| 5019 | return retval; | 5025 | return retval; |
| 5026 | } | ||
| 5027 | |||
| 5020 | /* | 5028 | /* |
| 5021 | * make sure no PI-waiters arrive (or leave) while we are | 5029 | * make sure no PI-waiters arrive (or leave) while we are |
| 5022 | * changing the priority of the task: | 5030 | * changing the priority of the task: |
| @@ -7671,34 +7679,34 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | |||
| 7671 | } | 7679 | } |
| 7672 | 7680 | ||
| 7673 | #ifdef CONFIG_SCHED_MC | 7681 | #ifdef CONFIG_SCHED_MC |
| 7674 | static ssize_t sched_mc_power_savings_show(struct sys_device *dev, | 7682 | static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, |
| 7675 | struct sysdev_attribute *attr, char *page) | 7683 | char *page) |
| 7676 | { | 7684 | { |
| 7677 | return sprintf(page, "%u\n", sched_mc_power_savings); | 7685 | return sprintf(page, "%u\n", sched_mc_power_savings); |
| 7678 | } | 7686 | } |
| 7679 | static ssize_t sched_mc_power_savings_store(struct sys_device *dev, | 7687 | static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, |
| 7680 | struct sysdev_attribute *attr, | ||
| 7681 | const char *buf, size_t count) | 7688 | const char *buf, size_t count) |
| 7682 | { | 7689 | { |
| 7683 | return sched_power_savings_store(buf, count, 0); | 7690 | return sched_power_savings_store(buf, count, 0); |
| 7684 | } | 7691 | } |
| 7685 | static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, | 7692 | static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, |
| 7686 | sched_mc_power_savings_store); | 7693 | sched_mc_power_savings_show, |
| 7694 | sched_mc_power_savings_store); | ||
| 7687 | #endif | 7695 | #endif |
| 7688 | 7696 | ||
| 7689 | #ifdef CONFIG_SCHED_SMT | 7697 | #ifdef CONFIG_SCHED_SMT |
| 7690 | static ssize_t sched_smt_power_savings_show(struct sys_device *dev, | 7698 | static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, |
| 7691 | struct sysdev_attribute *attr, char *page) | 7699 | char *page) |
| 7692 | { | 7700 | { |
| 7693 | return sprintf(page, "%u\n", sched_smt_power_savings); | 7701 | return sprintf(page, "%u\n", sched_smt_power_savings); |
| 7694 | } | 7702 | } |
| 7695 | static ssize_t sched_smt_power_savings_store(struct sys_device *dev, | 7703 | static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, |
| 7696 | struct sysdev_attribute *attr, | ||
| 7697 | const char *buf, size_t count) | 7704 | const char *buf, size_t count) |
| 7698 | { | 7705 | { |
| 7699 | return sched_power_savings_store(buf, count, 1); | 7706 | return sched_power_savings_store(buf, count, 1); |
| 7700 | } | 7707 | } |
| 7701 | static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, | 7708 | static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, |
| 7709 | sched_smt_power_savings_show, | ||
| 7702 | sched_smt_power_savings_store); | 7710 | sched_smt_power_savings_store); |
| 7703 | #endif | 7711 | #endif |
| 7704 | 7712 | ||
| @@ -7998,7 +8006,6 @@ void __init sched_init(void) | |||
| 7998 | 8006 | ||
| 7999 | rq = cpu_rq(i); | 8007 | rq = cpu_rq(i); |
| 8000 | spin_lock_init(&rq->lock); | 8008 | spin_lock_init(&rq->lock); |
| 8001 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); | ||
| 8002 | rq->nr_running = 0; | 8009 | rq->nr_running = 0; |
| 8003 | init_cfs_rq(&rq->cfs, rq); | 8010 | init_cfs_rq(&rq->cfs, rq); |
| 8004 | init_rt_rq(&rq->rt, rq); | 8011 | init_rt_rq(&rq->rt, rq); |
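The locking changes in sched.c (mirrored in sched_rt.c below) drop the per-rq lock_class_key and instead annotate the second runqueue lock with spin_lock_nested()/lock_set_subclass(), so lockdep treats it as a nested acquisition of one class rather than a separate class per runqueue. The general shape of the idiom, reduced to a hedged sketch with invented function names (dep_map assumes CONFIG_DEBUG_LOCK_ALLOC):

```c
/* Sketch of the ordered double-lock idiom used by the scheduler (names invented). */
static void double_lock_sketch(spinlock_t *a, spinlock_t *b)
{
	if (a < b) {				/* always take the lower address first */
		spin_lock(a);
		spin_lock_nested(b, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(b);
		spin_lock_nested(a, SINGLE_DEPTH_NESTING);
	}
}

static void double_unlock_sketch(spinlock_t *held, spinlock_t *nested)
{
	spin_unlock(nested);
	/* tell lockdep the lock we keep holding is back at subclass 0 */
	lock_set_subclass(&held->dep_map, 0, _RET_IP_);
}
```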
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 22ed55d1167f..204991a0bfa7 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c | |||
| @@ -32,13 +32,19 @@ | |||
| 32 | #include <linux/ktime.h> | 32 | #include <linux/ktime.h> |
| 33 | #include <linux/module.h> | 33 | #include <linux/module.h> |
| 34 | 34 | ||
| 35 | /* | ||
| 36 | * Scheduler clock - returns current time in nanosec units. | ||
| 37 | * This is default implementation. | ||
| 38 | * Architectures and sub-architectures can override this. | ||
| 39 | */ | ||
| 40 | unsigned long long __attribute__((weak)) sched_clock(void) | ||
| 41 | { | ||
| 42 | return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); | ||
| 43 | } | ||
| 35 | 44 | ||
| 36 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 45 | static __read_mostly int sched_clock_running; |
| 37 | 46 | ||
| 38 | #define MULTI_SHIFT 15 | 47 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
| 39 | /* Max is double, Min is 1/2 */ | ||
| 40 | #define MAX_MULTI (2LL << MULTI_SHIFT) | ||
| 41 | #define MIN_MULTI (1LL << (MULTI_SHIFT-1)) | ||
| 42 | 48 | ||
| 43 | struct sched_clock_data { | 49 | struct sched_clock_data { |
| 44 | /* | 50 | /* |
| @@ -49,14 +55,9 @@ struct sched_clock_data { | |||
| 49 | raw_spinlock_t lock; | 55 | raw_spinlock_t lock; |
| 50 | 56 | ||
| 51 | unsigned long tick_jiffies; | 57 | unsigned long tick_jiffies; |
| 52 | u64 prev_raw; | ||
| 53 | u64 tick_raw; | 58 | u64 tick_raw; |
| 54 | u64 tick_gtod; | 59 | u64 tick_gtod; |
| 55 | u64 clock; | 60 | u64 clock; |
| 56 | s64 multi; | ||
| 57 | #ifdef CONFIG_NO_HZ | ||
| 58 | int check_max; | ||
| 59 | #endif | ||
| 60 | }; | 61 | }; |
| 61 | 62 | ||
| 62 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); | 63 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); |
| @@ -71,8 +72,6 @@ static inline struct sched_clock_data *cpu_sdc(int cpu) | |||
| 71 | return &per_cpu(sched_clock_data, cpu); | 72 | return &per_cpu(sched_clock_data, cpu); |
| 72 | } | 73 | } |
| 73 | 74 | ||
| 74 | static __read_mostly int sched_clock_running; | ||
| 75 | |||
| 76 | void sched_clock_init(void) | 75 | void sched_clock_init(void) |
| 77 | { | 76 | { |
| 78 | u64 ktime_now = ktime_to_ns(ktime_get()); | 77 | u64 ktime_now = ktime_to_ns(ktime_get()); |
| @@ -84,90 +83,39 @@ void sched_clock_init(void) | |||
| 84 | 83 | ||
| 85 | scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; | 84 | scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; |
| 86 | scd->tick_jiffies = now_jiffies; | 85 | scd->tick_jiffies = now_jiffies; |
| 87 | scd->prev_raw = 0; | ||
| 88 | scd->tick_raw = 0; | 86 | scd->tick_raw = 0; |
| 89 | scd->tick_gtod = ktime_now; | 87 | scd->tick_gtod = ktime_now; |
| 90 | scd->clock = ktime_now; | 88 | scd->clock = ktime_now; |
| 91 | scd->multi = 1 << MULTI_SHIFT; | ||
| 92 | #ifdef CONFIG_NO_HZ | ||
| 93 | scd->check_max = 1; | ||
| 94 | #endif | ||
| 95 | } | 89 | } |
| 96 | 90 | ||
| 97 | sched_clock_running = 1; | 91 | sched_clock_running = 1; |
| 98 | } | 92 | } |
| 99 | 93 | ||
| 100 | #ifdef CONFIG_NO_HZ | ||
| 101 | /* | ||
| 102 | * The dynamic ticks makes the delta jiffies inaccurate. This | ||
| 103 | * prevents us from checking the maximum time update. | ||
| 104 | * Disable the maximum check during stopped ticks. | ||
| 105 | */ | ||
| 106 | void sched_clock_tick_stop(int cpu) | ||
| 107 | { | ||
| 108 | struct sched_clock_data *scd = cpu_sdc(cpu); | ||
| 109 | |||
| 110 | scd->check_max = 0; | ||
| 111 | } | ||
| 112 | |||
| 113 | void sched_clock_tick_start(int cpu) | ||
| 114 | { | ||
| 115 | struct sched_clock_data *scd = cpu_sdc(cpu); | ||
| 116 | |||
| 117 | scd->check_max = 1; | ||
| 118 | } | ||
| 119 | |||
| 120 | static int check_max(struct sched_clock_data *scd) | ||
| 121 | { | ||
| 122 | return scd->check_max; | ||
| 123 | } | ||
| 124 | #else | ||
| 125 | static int check_max(struct sched_clock_data *scd) | ||
| 126 | { | ||
| 127 | return 1; | ||
| 128 | } | ||
| 129 | #endif /* CONFIG_NO_HZ */ | ||
| 130 | |||
| 131 | /* | 94 | /* |
| 132 | * update the percpu scd from the raw @now value | 95 | * update the percpu scd from the raw @now value |
| 133 | * | 96 | * |
| 134 | * - filter out backward motion | 97 | * - filter out backward motion |
| 135 | * - use jiffies to generate a min,max window to clip the raw values | 98 | * - use jiffies to generate a min,max window to clip the raw values |
| 136 | */ | 99 | */ |
| 137 | static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time) | 100 | static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) |
| 138 | { | 101 | { |
| 139 | unsigned long now_jiffies = jiffies; | 102 | unsigned long now_jiffies = jiffies; |
| 140 | long delta_jiffies = now_jiffies - scd->tick_jiffies; | 103 | long delta_jiffies = now_jiffies - scd->tick_jiffies; |
| 141 | u64 clock = scd->clock; | 104 | u64 clock = scd->clock; |
| 142 | u64 min_clock, max_clock; | 105 | u64 min_clock, max_clock; |
| 143 | s64 delta = now - scd->prev_raw; | 106 | s64 delta = now - scd->tick_raw; |
| 144 | 107 | ||
| 145 | WARN_ON_ONCE(!irqs_disabled()); | 108 | WARN_ON_ONCE(!irqs_disabled()); |
| 146 | 109 | min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; | |
| 147 | /* | ||
| 148 | * At schedule tick the clock can be just under the gtod. We don't | ||
| 149 | * want to push it too prematurely. | ||
| 150 | */ | ||
| 151 | min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC); | ||
| 152 | if (min_clock > TICK_NSEC) | ||
| 153 | min_clock -= TICK_NSEC / 2; | ||
| 154 | 110 | ||
| 155 | if (unlikely(delta < 0)) { | 111 | if (unlikely(delta < 0)) { |
| 156 | clock++; | 112 | clock++; |
| 157 | goto out; | 113 | goto out; |
| 158 | } | 114 | } |
| 159 | 115 | ||
| 160 | /* | 116 | max_clock = min_clock + TICK_NSEC; |
| 161 | * The clock must stay within a jiffie of the gtod. | ||
| 162 | * But since we may be at the start of a jiffy or the end of one | ||
| 163 | * we add another jiffy buffer. | ||
| 164 | */ | ||
| 165 | max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; | ||
| 166 | |||
| 167 | delta *= scd->multi; | ||
| 168 | delta >>= MULTI_SHIFT; | ||
| 169 | 117 | ||
| 170 | if (unlikely(clock + delta > max_clock) && check_max(scd)) { | 118 | if (unlikely(clock + delta > max_clock)) { |
| 171 | if (clock < max_clock) | 119 | if (clock < max_clock) |
| 172 | clock = max_clock; | 120 | clock = max_clock; |
| 173 | else | 121 | else |
| @@ -180,12 +128,10 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim | |||
| 180 | if (unlikely(clock < min_clock)) | 128 | if (unlikely(clock < min_clock)) |
| 181 | clock = min_clock; | 129 | clock = min_clock; |
| 182 | 130 | ||
| 183 | if (time) | 131 | scd->tick_jiffies = now_jiffies; |
| 184 | *time = clock; | 132 | scd->clock = clock; |
| 185 | else { | 133 | |
| 186 | scd->prev_raw = now; | 134 | return clock; |
| 187 | scd->clock = clock; | ||
| 188 | } | ||
| 189 | } | 135 | } |
| 190 | 136 | ||
| 191 | static void lock_double_clock(struct sched_clock_data *data1, | 137 | static void lock_double_clock(struct sched_clock_data *data1, |
| @@ -203,7 +149,7 @@ static void lock_double_clock(struct sched_clock_data *data1, | |||
| 203 | u64 sched_clock_cpu(int cpu) | 149 | u64 sched_clock_cpu(int cpu) |
| 204 | { | 150 | { |
| 205 | struct sched_clock_data *scd = cpu_sdc(cpu); | 151 | struct sched_clock_data *scd = cpu_sdc(cpu); |
| 206 | u64 now, clock; | 152 | u64 now, clock, this_clock, remote_clock; |
| 207 | 153 | ||
| 208 | if (unlikely(!sched_clock_running)) | 154 | if (unlikely(!sched_clock_running)) |
| 209 | return 0ull; | 155 | return 0ull; |
| @@ -212,43 +158,44 @@ u64 sched_clock_cpu(int cpu) | |||
| 212 | now = sched_clock(); | 158 | now = sched_clock(); |
| 213 | 159 | ||
| 214 | if (cpu != raw_smp_processor_id()) { | 160 | if (cpu != raw_smp_processor_id()) { |
| 215 | /* | ||
| 216 | * in order to update a remote cpu's clock based on our | ||
| 217 | * unstable raw time rebase it against: | ||
| 218 | * tick_raw (offset between raw counters) | ||
| 219 | * tick_gotd (tick offset between cpus) | ||
| 220 | */ | ||
| 221 | struct sched_clock_data *my_scd = this_scd(); | 161 | struct sched_clock_data *my_scd = this_scd(); |
| 222 | 162 | ||
| 223 | lock_double_clock(scd, my_scd); | 163 | lock_double_clock(scd, my_scd); |
| 224 | 164 | ||
| 225 | now -= my_scd->tick_raw; | 165 | this_clock = __update_sched_clock(my_scd, now); |
| 226 | now += scd->tick_raw; | 166 | remote_clock = scd->clock; |
| 227 | 167 | ||
| 228 | now += my_scd->tick_gtod; | 168 | /* |
| 229 | now -= scd->tick_gtod; | 169 | * Use the opportunity that we have both locks |
| 170 | * taken to couple the two clocks: we take the | ||
| 171 | * larger time as the latest time for both | ||
| 172 | * runqueues. (this creates monotonic movement) | ||
| 173 | */ | ||
| 174 | if (likely(remote_clock < this_clock)) { | ||
| 175 | clock = this_clock; | ||
| 176 | scd->clock = clock; | ||
| 177 | } else { | ||
| 178 | /* | ||
| 179 | * Should be rare, but possible: | ||
| 180 | */ | ||
| 181 | clock = remote_clock; | ||
| 182 | my_scd->clock = remote_clock; | ||
| 183 | } | ||
| 230 | 184 | ||
| 231 | __raw_spin_unlock(&my_scd->lock); | 185 | __raw_spin_unlock(&my_scd->lock); |
| 232 | |||
| 233 | __update_sched_clock(scd, now, &clock); | ||
| 234 | |||
| 235 | __raw_spin_unlock(&scd->lock); | ||
| 236 | |||
| 237 | } else { | 186 | } else { |
| 238 | __raw_spin_lock(&scd->lock); | 187 | __raw_spin_lock(&scd->lock); |
| 239 | __update_sched_clock(scd, now, NULL); | 188 | clock = __update_sched_clock(scd, now); |
| 240 | clock = scd->clock; | ||
| 241 | __raw_spin_unlock(&scd->lock); | ||
| 242 | } | 189 | } |
| 243 | 190 | ||
| 191 | __raw_spin_unlock(&scd->lock); | ||
| 192 | |||
| 244 | return clock; | 193 | return clock; |
| 245 | } | 194 | } |
| 246 | 195 | ||
| 247 | void sched_clock_tick(void) | 196 | void sched_clock_tick(void) |
| 248 | { | 197 | { |
| 249 | struct sched_clock_data *scd = this_scd(); | 198 | struct sched_clock_data *scd = this_scd(); |
| 250 | unsigned long now_jiffies = jiffies; | ||
| 251 | s64 mult, delta_gtod, delta_raw; | ||
| 252 | u64 now, now_gtod; | 199 | u64 now, now_gtod; |
| 253 | 200 | ||
| 254 | if (unlikely(!sched_clock_running)) | 201 | if (unlikely(!sched_clock_running)) |
| @@ -260,29 +207,14 @@ void sched_clock_tick(void) | |||
| 260 | now = sched_clock(); | 207 | now = sched_clock(); |
| 261 | 208 | ||
| 262 | __raw_spin_lock(&scd->lock); | 209 | __raw_spin_lock(&scd->lock); |
| 263 | __update_sched_clock(scd, now, NULL); | 210 | __update_sched_clock(scd, now); |
| 264 | /* | 211 | /* |
| 265 | * update tick_gtod after __update_sched_clock() because that will | 212 | * update tick_gtod after __update_sched_clock() because that will |
| 266 | * already observe 1 new jiffy; adding a new tick_gtod to that would | 213 | * already observe 1 new jiffy; adding a new tick_gtod to that would |
| 267 | * increase the clock 2 jiffies. | 214 | * increase the clock 2 jiffies. |
| 268 | */ | 215 | */ |
| 269 | delta_gtod = now_gtod - scd->tick_gtod; | ||
| 270 | delta_raw = now - scd->tick_raw; | ||
| 271 | |||
| 272 | if ((long)delta_raw > 0) { | ||
| 273 | mult = delta_gtod << MULTI_SHIFT; | ||
| 274 | do_div(mult, delta_raw); | ||
| 275 | scd->multi = mult; | ||
| 276 | if (scd->multi > MAX_MULTI) | ||
| 277 | scd->multi = MAX_MULTI; | ||
| 278 | else if (scd->multi < MIN_MULTI) | ||
| 279 | scd->multi = MIN_MULTI; | ||
| 280 | } else | ||
| 281 | scd->multi = 1 << MULTI_SHIFT; | ||
| 282 | |||
| 283 | scd->tick_raw = now; | 216 | scd->tick_raw = now; |
| 284 | scd->tick_gtod = now_gtod; | 217 | scd->tick_gtod = now_gtod; |
| 285 | scd->tick_jiffies = now_jiffies; | ||
| 286 | __raw_spin_unlock(&scd->lock); | 218 | __raw_spin_unlock(&scd->lock); |
| 287 | } | 219 | } |
| 288 | 220 | ||
| @@ -301,7 +233,6 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); | |||
| 301 | void sched_clock_idle_wakeup_event(u64 delta_ns) | 233 | void sched_clock_idle_wakeup_event(u64 delta_ns) |
| 302 | { | 234 | { |
| 303 | struct sched_clock_data *scd = this_scd(); | 235 | struct sched_clock_data *scd = this_scd(); |
| 304 | u64 now = sched_clock(); | ||
| 305 | 236 | ||
| 306 | /* | 237 | /* |
| 307 | * Override the previous timestamp and ignore all | 238 | * Override the previous timestamp and ignore all |
| @@ -310,27 +241,30 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) | |||
| 310 | * rq clock: | 241 | * rq clock: |
| 311 | */ | 242 | */ |
| 312 | __raw_spin_lock(&scd->lock); | 243 | __raw_spin_lock(&scd->lock); |
| 313 | scd->prev_raw = now; | ||
| 314 | scd->clock += delta_ns; | 244 | scd->clock += delta_ns; |
| 315 | scd->multi = 1 << MULTI_SHIFT; | ||
| 316 | __raw_spin_unlock(&scd->lock); | 245 | __raw_spin_unlock(&scd->lock); |
| 317 | 246 | ||
| 318 | touch_softlockup_watchdog(); | 247 | touch_softlockup_watchdog(); |
| 319 | } | 248 | } |
| 320 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | 249 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); |
| 321 | 250 | ||
| 322 | #endif | 251 | #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ |
| 323 | 252 | ||
| 324 | /* | 253 | void sched_clock_init(void) |
| 325 | * Scheduler clock - returns current time in nanosec units. | ||
| 326 | * This is default implementation. | ||
| 327 | * Architectures and sub-architectures can override this. | ||
| 328 | */ | ||
| 329 | unsigned long long __attribute__((weak)) sched_clock(void) | ||
| 330 | { | 254 | { |
| 331 | return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); | 255 | sched_clock_running = 1; |
| 332 | } | 256 | } |
| 333 | 257 | ||
| 258 | u64 sched_clock_cpu(int cpu) | ||
| 259 | { | ||
| 260 | if (unlikely(!sched_clock_running)) | ||
| 261 | return 0; | ||
| 262 | |||
| 263 | return sched_clock(); | ||
| 264 | } | ||
| 265 | |||
| 266 | #endif | ||
| 267 | |||
| 334 | unsigned long long cpu_clock(int cpu) | 268 | unsigned long long cpu_clock(int cpu) |
| 335 | { | 269 | { |
| 336 | unsigned long long clock; | 270 | unsigned long long clock; |
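The sched_clock.c rewrite removes the per-cpu multiplier and the NO_HZ check_max machinery; each local update is instead clamped into a window derived from GTOD plus elapsed jiffies, and a remote read couples the two per-cpu clocks by taking the larger value, keeping cross-cpu observations monotonic. The essence of that policy, extracted as a hedged standalone sketch (simplified to plain u64/s64 parameters, so it is a restatement rather than the exact kernel functions):

```c
/* Clamp a local clock update into [min_clock, max_clock], as the patch does. */
static u64 clamp_local_clock(u64 clock, s64 delta, u64 min_clock, u64 max_clock)
{
	if (delta < 0)				/* raw counter went backwards: just creep */
		return clock + 1;

	if (clock + delta > max_clock) {	/* never run ahead of gtod + one tick */
		if (clock < max_clock)
			clock = max_clock;
		else
			clock++;
	} else {
		clock += delta;
	}

	if (clock < min_clock)			/* never fall behind gtod */
		clock = min_clock;
	return clock;
}

/* Couple two per-cpu clocks: both end up at the larger value (monotonic). */
static u64 couple_clocks(u64 *this_clock, u64 *remote_clock)
{
	if (*remote_clock < *this_clock)
		*remote_clock = *this_clock;
	else
		*this_clock = *remote_clock;
	return *this_clock;
}
```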
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index cf2cd6ce4cb2..fb8994c6d4bb 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
| @@ -899,7 +899,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | |||
| 899 | * doesn't make sense. Rely on vruntime for fairness. | 899 | * doesn't make sense. Rely on vruntime for fairness. |
| 900 | */ | 900 | */ |
| 901 | if (rq->curr != p) | 901 | if (rq->curr != p) |
| 902 | delta = max(10000LL, delta); | 902 | delta = max_t(s64, 10000LL, delta); |
| 903 | 903 | ||
| 904 | hrtick_start(rq, delta); | 904 | hrtick_start(rq, delta); |
| 905 | } | 905 | } |
| @@ -1442,18 +1442,23 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) | |||
| 1442 | struct task_struct *p = NULL; | 1442 | struct task_struct *p = NULL; |
| 1443 | struct sched_entity *se; | 1443 | struct sched_entity *se; |
| 1444 | 1444 | ||
| 1445 | while (next != &cfs_rq->tasks) { | 1445 | if (next == &cfs_rq->tasks) |
| 1446 | return NULL; | ||
| 1447 | |||
| 1448 | /* Skip over entities that are not tasks */ | ||
| 1449 | do { | ||
| 1446 | se = list_entry(next, struct sched_entity, group_node); | 1450 | se = list_entry(next, struct sched_entity, group_node); |
| 1447 | next = next->next; | 1451 | next = next->next; |
| 1452 | } while (next != &cfs_rq->tasks && !entity_is_task(se)); | ||
| 1448 | 1453 | ||
| 1449 | /* Skip over entities that are not tasks */ | 1454 | if (next == &cfs_rq->tasks) |
| 1450 | if (entity_is_task(se)) { | 1455 | return NULL; |
| 1451 | p = task_of(se); | ||
| 1452 | break; | ||
| 1453 | } | ||
| 1454 | } | ||
| 1455 | 1456 | ||
| 1456 | cfs_rq->balance_iterator = next; | 1457 | cfs_rq->balance_iterator = next; |
| 1458 | |||
| 1459 | if (entity_is_task(se)) | ||
| 1460 | p = task_of(se); | ||
| 1461 | |||
| 1457 | return p; | 1462 | return p; |
| 1458 | } | 1463 | } |
| 1459 | 1464 | ||
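The one-line change in hrtick_start_fair() swaps max() for max_t(s64, ...). The kernel's max() insists that both operands have the same type; delta is s64, which on 64-bit architectures of this era is plain `long`, while 10000LL is `long long`, so the typecheck trips and max_t() is used to force a common type. An abridged, hedged restatement of the two macros:

```c
/* Abridged sketch of the kernel.h helpers (not a verbatim copy). */
#define max(x, y) ({				\
	typeof(x) _max1 = (x);			\
	typeof(y) _max2 = (y);			\
	(void) (&_max1 == &_max2);		/* complains if the types differ */ \
	_max1 > _max2 ? _max1 : _max2; })

#define max_t(type, x, y) ({			\
	type __max1 = (type)(x);		\
	type __max2 = (type)(y);		\
	__max1 > __max2 ? __max1 : __max2; })

/* delta is s64, 10000LL is long long; the pointer comparison in max() sees
 * two different types, so the scheduler uses:
 *	delta = max_t(s64, 10000LL, delta);
 */
```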
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 908c04f9dad0..6163e4cf885b 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
| @@ -861,6 +861,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
| 861 | #define RT_MAX_TRIES 3 | 861 | #define RT_MAX_TRIES 3 |
| 862 | 862 | ||
| 863 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest); | 863 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest); |
| 864 | static void double_unlock_balance(struct rq *this_rq, struct rq *busiest); | ||
| 865 | |||
| 864 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); | 866 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); |
| 865 | 867 | ||
| 866 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | 868 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) |
| @@ -1022,7 +1024,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
| 1022 | break; | 1024 | break; |
| 1023 | 1025 | ||
| 1024 | /* try again */ | 1026 | /* try again */ |
| 1025 | spin_unlock(&lowest_rq->lock); | 1027 | double_unlock_balance(rq, lowest_rq); |
| 1026 | lowest_rq = NULL; | 1028 | lowest_rq = NULL; |
| 1027 | } | 1029 | } |
| 1028 | 1030 | ||
| @@ -1091,7 +1093,7 @@ static int push_rt_task(struct rq *rq) | |||
| 1091 | 1093 | ||
| 1092 | resched_task(lowest_rq->curr); | 1094 | resched_task(lowest_rq->curr); |
| 1093 | 1095 | ||
| 1094 | spin_unlock(&lowest_rq->lock); | 1096 | double_unlock_balance(rq, lowest_rq); |
| 1095 | 1097 | ||
| 1096 | ret = 1; | 1098 | ret = 1; |
| 1097 | out: | 1099 | out: |
| @@ -1197,7 +1199,7 @@ static int pull_rt_task(struct rq *this_rq) | |||
| 1197 | 1199 | ||
| 1198 | } | 1200 | } |
| 1199 | skip: | 1201 | skip: |
| 1200 | spin_unlock(&src_rq->lock); | 1202 | double_unlock_balance(this_rq, src_rq); |
| 1201 | } | 1203 | } |
| 1202 | 1204 | ||
| 1203 | return ret; | 1205 | return ret; |
diff --git a/kernel/semaphore.c b/kernel/semaphore.c index aaaeae8244e7..94a62c0d4ade 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c | |||
| @@ -212,9 +212,7 @@ static inline int __sched __down_common(struct semaphore *sem, long state, | |||
| 212 | waiter.up = 0; | 212 | waiter.up = 0; |
| 213 | 213 | ||
| 214 | for (;;) { | 214 | for (;;) { |
| 215 | if (state == TASK_INTERRUPTIBLE && signal_pending(task)) | 215 | if (signal_pending_state(state, task)) |
| 216 | goto interrupted; | ||
| 217 | if (state == TASK_KILLABLE && fatal_signal_pending(task)) | ||
| 218 | goto interrupted; | 216 | goto interrupted; |
| 219 | if (timeout <= 0) | 217 | if (timeout <= 0) |
| 220 | goto timed_out; | 218 | goto timed_out; |
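The semaphore.c hunk folds the two state-specific checks into signal_pending_state(), the helper that encodes which pending signals may interrupt a sleep in a given task state. Its logic is roughly the following (restated from include/linux/sched.h of this era; treat it as a sketch):

```c
static inline int signal_pending_state(long state, struct task_struct *p)
{
	if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
		return 0;			/* uninterruptible sleep: nothing interrupts */
	if (!signal_pending(p))
		return 0;

	/* TASK_INTERRUPTIBLE: any signal; TASK_KILLABLE: only fatal signals */
	return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
}
```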
diff --git a/kernel/signal.c b/kernel/signal.c index 954f77d7e3bc..c539f60c6f41 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -1304,6 +1304,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) | |||
| 1304 | q->info.si_overrun++; | 1304 | q->info.si_overrun++; |
| 1305 | goto out; | 1305 | goto out; |
| 1306 | } | 1306 | } |
| 1307 | q->info.si_overrun = 0; | ||
| 1307 | 1308 | ||
| 1308 | signalfd_notify(t, sig); | 1309 | signalfd_notify(t, sig); |
| 1309 | pending = group ? &t->signal->shared_pending : &t->pending; | 1310 | pending = group ? &t->signal->shared_pending : &t->pending; |
diff --git a/kernel/smp.c b/kernel/smp.c index 96fc7c0edc59..782e2b93e465 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -135,7 +135,8 @@ void generic_smp_call_function_interrupt(void) | |||
| 135 | */ | 135 | */ |
| 136 | smp_wmb(); | 136 | smp_wmb(); |
| 137 | data->csd.flags &= ~CSD_FLAG_WAIT; | 137 | data->csd.flags &= ~CSD_FLAG_WAIT; |
| 138 | } else | 138 | } |
| 139 | if (data->csd.flags & CSD_FLAG_ALLOC) | ||
| 139 | call_rcu(&data->rcu_head, rcu_free_call_data); | 140 | call_rcu(&data->rcu_head, rcu_free_call_data); |
| 140 | } | 141 | } |
| 141 | rcu_read_unlock(); | 142 | rcu_read_unlock(); |
| @@ -260,6 +261,42 @@ void __smp_call_function_single(int cpu, struct call_single_data *data) | |||
| 260 | generic_exec_single(cpu, data); | 261 | generic_exec_single(cpu, data); |
| 261 | } | 262 | } |
| 262 | 263 | ||
| 264 | /* Dummy function */ | ||
| 265 | static void quiesce_dummy(void *unused) | ||
| 266 | { | ||
| 267 | } | ||
| 268 | |||
| 269 | /* | ||
| 270 | * Ensure stack based data used in call function mask is safe to free. | ||
| 271 | * | ||
| 272 | * This is needed by smp_call_function_mask when using on-stack data, because | ||
| 273 | * a single call function queue is shared by all CPUs, and any CPU may pick up | ||
| 274 | * the data item on the queue at any time before it is deleted. So we need to | ||
| 275 | * ensure that all CPUs have transitioned through a quiescent state after | ||
| 276 | * this call. | ||
| 277 | * | ||
| 278 | * This is a very slow function, implemented by sending synchronous IPIs to | ||
| 279 | * all possible CPUs. For this reason, we have to alloc data rather than use | ||
| 280 | * stack based data even in the case of synchronous calls. The stack based | ||
| 281 | * data is then just used for deadlock/oom fallback which will be very rare. | ||
| 282 | * | ||
| 283 | * If a faster scheme can be made, we could go back to preferring stack based | ||
| 284 | * data -- the data allocation/free is non-zero cost. | ||
| 285 | */ | ||
| 286 | static void smp_call_function_mask_quiesce_stack(cpumask_t mask) | ||
| 287 | { | ||
| 288 | struct call_single_data data; | ||
| 289 | int cpu; | ||
| 290 | |||
| 291 | data.func = quiesce_dummy; | ||
| 292 | data.info = NULL; | ||
| 293 | |||
| 294 | for_each_cpu_mask(cpu, mask) { | ||
| 295 | data.flags = CSD_FLAG_WAIT; | ||
| 296 | generic_exec_single(cpu, &data); | ||
| 297 | } | ||
| 298 | } | ||
| 299 | |||
| 263 | /** | 300 | /** |
| 264 | * smp_call_function_mask(): Run a function on a set of other CPUs. | 301 | * smp_call_function_mask(): Run a function on a set of other CPUs. |
| 265 | * @mask: The set of cpus to run on. | 302 | * @mask: The set of cpus to run on. |
| @@ -285,6 +322,7 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, | |||
| 285 | cpumask_t allbutself; | 322 | cpumask_t allbutself; |
| 286 | unsigned long flags; | 323 | unsigned long flags; |
| 287 | int cpu, num_cpus; | 324 | int cpu, num_cpus; |
| 325 | int slowpath = 0; | ||
| 288 | 326 | ||
| 289 | /* Can deadlock when called with interrupts disabled */ | 327 | /* Can deadlock when called with interrupts disabled */ |
| 290 | WARN_ON(irqs_disabled()); | 328 | WARN_ON(irqs_disabled()); |
| @@ -306,15 +344,16 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, | |||
| 306 | return smp_call_function_single(cpu, func, info, wait); | 344 | return smp_call_function_single(cpu, func, info, wait); |
| 307 | } | 345 | } |
| 308 | 346 | ||
| 309 | if (!wait) { | 347 | data = kmalloc(sizeof(*data), GFP_ATOMIC); |
| 310 | data = kmalloc(sizeof(*data), GFP_ATOMIC); | 348 | if (data) { |
| 311 | if (data) | 349 | data->csd.flags = CSD_FLAG_ALLOC; |
| 312 | data->csd.flags = CSD_FLAG_ALLOC; | 350 | if (wait) |
| 313 | } | 351 | data->csd.flags |= CSD_FLAG_WAIT; |
| 314 | if (!data) { | 352 | } else { |
| 315 | data = &d; | 353 | data = &d; |
| 316 | data->csd.flags = CSD_FLAG_WAIT; | 354 | data->csd.flags = CSD_FLAG_WAIT; |
| 317 | wait = 1; | 355 | wait = 1; |
| 356 | slowpath = 1; | ||
| 318 | } | 357 | } |
| 319 | 358 | ||
| 320 | spin_lock_init(&data->lock); | 359 | spin_lock_init(&data->lock); |
| @@ -331,8 +370,11 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, | |||
| 331 | arch_send_call_function_ipi(mask); | 370 | arch_send_call_function_ipi(mask); |
| 332 | 371 | ||
| 333 | /* optionally wait for the CPUs to complete */ | 372 | /* optionally wait for the CPUs to complete */ |
| 334 | if (wait) | 373 | if (wait) { |
| 335 | csd_flag_wait(&data->csd); | 374 | csd_flag_wait(&data->csd); |
| 375 | if (unlikely(slowpath)) | ||
| 376 | smp_call_function_mask_quiesce_stack(mask); | ||
| 377 | } | ||
| 336 | 378 | ||
| 337 | return 0; | 379 | return 0; |
| 338 | } | 380 | } |
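The smp.c changes make smp_call_function_mask() prefer kmalloc'ed call data even for synchronous (wait) calls; only when the allocation fails does it fall back to the on-stack structure, and in that slow path it sends an extra round of dummy IPIs so no other CPU can still be touching the stack data when the caller returns. The caller-visible contract is unchanged; a hedged usage sketch with invented function names:

```c
/* Illustrative caller of smp_call_function_mask() (helpers and names invented). */
static void flush_remote_state(void *info)
{
	unsigned int *generation = info;

	/* runs in IPI context on each selected CPU */
	this_cpu_flush_to(*generation);		/* hypothetical per-cpu helper */
}

static void flush_all_but_self(unsigned int gen)
{
	cpumask_t mask = cpu_online_map;
	int this_cpu = get_cpu();		/* pin ourselves while sending IPIs */

	cpu_clear(this_cpu, mask);
	/* wait=1: returns only after every CPU in the mask has run the function,
	 * so passing the address of a stack variable is safe here. */
	smp_call_function_mask(mask, flush_remote_state, &gen, 1);
	put_cpu();
}
```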
diff --git a/kernel/spinlock.c b/kernel/spinlock.c index a1fb54c93cdd..44baeea94ab9 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c | |||
| @@ -292,6 +292,7 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) | |||
| 292 | } | 292 | } |
| 293 | 293 | ||
| 294 | EXPORT_SYMBOL(_spin_lock_nested); | 294 | EXPORT_SYMBOL(_spin_lock_nested); |
| 295 | |||
| 295 | unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) | 296 | unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) |
| 296 | { | 297 | { |
| 297 | unsigned long flags; | 298 | unsigned long flags; |
| @@ -314,6 +315,16 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas | |||
| 314 | 315 | ||
| 315 | EXPORT_SYMBOL(_spin_lock_irqsave_nested); | 316 | EXPORT_SYMBOL(_spin_lock_irqsave_nested); |
| 316 | 317 | ||
| 318 | void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, | ||
| 319 | struct lockdep_map *nest_lock) | ||
| 320 | { | ||
| 321 | preempt_disable(); | ||
| 322 | spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); | ||
| 323 | LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); | ||
| 324 | } | ||
| 325 | |||
| 326 | EXPORT_SYMBOL(_spin_lock_nest_lock); | ||
| 327 | |||
| 317 | #endif | 328 | #endif |
| 318 | 329 | ||
| 319 | void __lockfunc _spin_unlock(spinlock_t *lock) | 330 | void __lockfunc _spin_unlock(spinlock_t *lock) |
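_spin_lock_nest_lock() is the out-of-line backend for the new spin_lock_nest_lock() annotation: it tells lockdep that several locks of the same class may legitimately be held at once because an outer lock serializes all such acquisitions. A hedged sketch of the intended calling pattern; the structures and the choice of a mutex as the outer lock are invented for illustration:

```c
/* Hypothetical parent/child locking scheme using the new annotation. */
#define NCHILD 8

struct child {
	spinlock_t	lock;			/* all children share one lock class */
};

struct parent {
	struct mutex	mutex;			/* serializes whole-tree operations */
	struct child	*children[NCHILD];
};

static void freeze_all_children(struct parent *p)
{
	int i;

	mutex_lock(&p->mutex);
	for (i = 0; i < NCHILD; i++)
		/* lockdep allows holding many child locks because p->mutex is held */
		spin_lock_nest_lock(&p->children[i]->lock, &p->mutex);
	/* ... operate on the frozen tree ... */
	for (i = NCHILD - 1; i >= 0; i--)
		spin_unlock(&p->children[i]->lock);
	mutex_unlock(&p->mutex);
}
```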
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 738b411ff2d3..af3c7cea258b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -1,4 +1,4 @@ | |||
| 1 | /* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. | 1 | /* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. |
| 2 | * GPL v2 and any later version. | 2 | * GPL v2 and any later version. |
| 3 | */ | 3 | */ |
| 4 | #include <linux/cpu.h> | 4 | #include <linux/cpu.h> |
| @@ -13,204 +13,177 @@ | |||
| 13 | #include <asm/atomic.h> | 13 | #include <asm/atomic.h> |
| 14 | #include <asm/uaccess.h> | 14 | #include <asm/uaccess.h> |
| 15 | 15 | ||
| 16 | /* Since we effect priority and affinity (both of which are visible | 16 | /* This controls the threads on each CPU. */ |
| 17 | * to, and settable by outside processes) we do indirection via a | ||
| 18 | * kthread. */ | ||
| 19 | |||
| 20 | /* Thread to stop each CPU in user context. */ | ||
| 21 | enum stopmachine_state { | 17 | enum stopmachine_state { |
| 22 | STOPMACHINE_WAIT, | 18 | /* Dummy starting state for thread. */ |
| 19 | STOPMACHINE_NONE, | ||
| 20 | /* Awaiting everyone to be scheduled. */ | ||
| 23 | STOPMACHINE_PREPARE, | 21 | STOPMACHINE_PREPARE, |
| 22 | /* Disable interrupts. */ | ||
| 24 | STOPMACHINE_DISABLE_IRQ, | 23 | STOPMACHINE_DISABLE_IRQ, |
| 24 | /* Run the function */ | ||
| 25 | STOPMACHINE_RUN, | ||
| 26 | /* Exit */ | ||
| 25 | STOPMACHINE_EXIT, | 27 | STOPMACHINE_EXIT, |
| 26 | }; | 28 | }; |
| 29 | static enum stopmachine_state state; | ||
| 27 | 30 | ||
| 28 | static enum stopmachine_state stopmachine_state; | 31 | struct stop_machine_data { |
| 29 | static unsigned int stopmachine_num_threads; | 32 | int (*fn)(void *); |
| 30 | static atomic_t stopmachine_thread_ack; | 33 | void *data; |
| 31 | 34 | int fnret; | |
| 32 | static int stopmachine(void *cpu) | 35 | }; |
| 33 | { | ||
| 34 | int irqs_disabled = 0; | ||
| 35 | int prepared = 0; | ||
| 36 | cpumask_of_cpu_ptr(cpumask, (int)(long)cpu); | ||
| 37 | |||
| 38 | set_cpus_allowed_ptr(current, cpumask); | ||
| 39 | |||
| 40 | /* Ack: we are alive */ | ||
| 41 | smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ | ||
| 42 | atomic_inc(&stopmachine_thread_ack); | ||
| 43 | |||
| 44 | /* Simple state machine */ | ||
| 45 | while (stopmachine_state != STOPMACHINE_EXIT) { | ||
| 46 | if (stopmachine_state == STOPMACHINE_DISABLE_IRQ | ||
| 47 | && !irqs_disabled) { | ||
| 48 | local_irq_disable(); | ||
| 49 | hard_irq_disable(); | ||
| 50 | irqs_disabled = 1; | ||
| 51 | /* Ack: irqs disabled. */ | ||
| 52 | smp_mb(); /* Must read state first. */ | ||
| 53 | atomic_inc(&stopmachine_thread_ack); | ||
| 54 | } else if (stopmachine_state == STOPMACHINE_PREPARE | ||
| 55 | && !prepared) { | ||
| 56 | /* Everyone is in place, hold CPU. */ | ||
| 57 | preempt_disable(); | ||
| 58 | prepared = 1; | ||
| 59 | smp_mb(); /* Must read state first. */ | ||
| 60 | atomic_inc(&stopmachine_thread_ack); | ||
| 61 | } | ||
| 62 | /* Yield in first stage: migration threads need to | ||
| 63 | * help our sisters onto their CPUs. */ | ||
| 64 | if (!prepared && !irqs_disabled) | ||
| 65 | yield(); | ||
| 66 | cpu_relax(); | ||
| 67 | } | ||
| 68 | |||
| 69 | /* Ack: we are exiting. */ | ||
| 70 | smp_mb(); /* Must read state first. */ | ||
| 71 | atomic_inc(&stopmachine_thread_ack); | ||
| 72 | |||
| 73 | if (irqs_disabled) | ||
| 74 | local_irq_enable(); | ||
| 75 | if (prepared) | ||
| 76 | preempt_enable(); | ||
| 77 | 36 | ||
| 78 | return 0; | 37 | /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ |
| 79 | } | 38 | static unsigned int num_threads; |
| 39 | static atomic_t thread_ack; | ||
| 40 | static struct completion finished; | ||
| 41 | static DEFINE_MUTEX(lock); | ||
| 80 | 42 | ||
| 81 | /* Change the thread state */ | 43 | static void set_state(enum stopmachine_state newstate) |
| 82 | static void stopmachine_set_state(enum stopmachine_state state) | ||
| 83 | { | 44 | { |
| 84 | atomic_set(&stopmachine_thread_ack, 0); | 45 | /* Reset ack counter. */ |
| 46 | atomic_set(&thread_ack, num_threads); | ||
| 85 | smp_wmb(); | 47 | smp_wmb(); |
| 86 | stopmachine_state = state; | 48 | state = newstate; |
| 87 | while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) | ||
| 88 | cpu_relax(); | ||
| 89 | } | 49 | } |
| 90 | 50 | ||
| 91 | static int stop_machine(void) | 51 | /* Last one to ack a state moves to the next state. */ |
| 52 | static void ack_state(void) | ||
| 92 | { | 53 | { |
| 93 | int i, ret = 0; | 54 | if (atomic_dec_and_test(&thread_ack)) { |
| 94 | 55 | /* If we're the last one to ack the EXIT, we're finished. */ | |
| 95 | atomic_set(&stopmachine_thread_ack, 0); | 56 | if (state == STOPMACHINE_EXIT) |
| 96 | stopmachine_num_threads = 0; | 57 | complete(&finished); |
| 97 | stopmachine_state = STOPMACHINE_WAIT; | 58 | else |
| 98 | 59 | set_state(state + 1); | |
| 99 | for_each_online_cpu(i) { | ||
| 100 | if (i == raw_smp_processor_id()) | ||
| 101 | continue; | ||
| 102 | ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); | ||
| 103 | if (ret < 0) | ||
| 104 | break; | ||
| 105 | stopmachine_num_threads++; | ||
| 106 | } | ||
| 107 | |||
| 108 | /* Wait for them all to come to life. */ | ||
| 109 | while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) { | ||
| 110 | yield(); | ||
| 111 | cpu_relax(); | ||
| 112 | } | 60 | } |
| 61 | } | ||
| 113 | 62 | ||
| 114 | /* If some failed, kill them all. */ | 63 | /* This is the actual thread which stops the CPU. It exits by itself rather |
| 115 | if (ret < 0) { | 64 | * than waiting for kthread_stop(), because it's easier for hotplug CPU. */ |
| 116 | stopmachine_set_state(STOPMACHINE_EXIT); | 65 | static int stop_cpu(struct stop_machine_data *smdata) |
| 117 | return ret; | 66 | { |
| 118 | } | 67 | enum stopmachine_state curstate = STOPMACHINE_NONE; |
| 119 | 68 | ||
| 120 | /* Now they are all started, make them hold the CPUs, ready. */ | 69 | /* Simple state machine */ |
| 121 | preempt_disable(); | 70 | do { |
| 122 | stopmachine_set_state(STOPMACHINE_PREPARE); | 71 | /* Chill out and ensure we re-read stopmachine_state. */ |
| 72 | cpu_relax(); | ||
| 73 | if (state != curstate) { | ||
| 74 | curstate = state; | ||
| 75 | switch (curstate) { | ||
| 76 | case STOPMACHINE_DISABLE_IRQ: | ||
| 77 | local_irq_disable(); | ||
| 78 | hard_irq_disable(); | ||
| 79 | break; | ||
| 80 | case STOPMACHINE_RUN: | ||
| 81 | /* |= allows error detection if functions on | ||
| 82 | * multiple CPUs. */ | ||
| 83 | smdata->fnret |= smdata->fn(smdata->data); | ||
| 84 | break; | ||
| 85 | default: | ||
| 86 | break; | ||
| 87 | } | ||
| 88 | ack_state(); | ||
| 89 | } | ||
| 90 | } while (curstate != STOPMACHINE_EXIT); | ||
| 123 | 91 | ||
| 124 | /* Make them disable irqs. */ | 92 | local_irq_enable(); |
| 125 | local_irq_disable(); | 93 | do_exit(0); |
| 126 | hard_irq_disable(); | 94 | } |
| 127 | stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); | ||
| 128 | 95 | ||
| 96 | /* Callback for CPUs which aren't supposed to do anything. */ | ||
| 97 | static int chill(void *unused) | ||
| 98 | { | ||
| 129 | return 0; | 99 | return 0; |
| 130 | } | 100 | } |
| 131 | 101 | ||
| 132 | static void restart_machine(void) | 102 | int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) |
| 133 | { | 103 | { |
| 134 | stopmachine_set_state(STOPMACHINE_EXIT); | 104 | int i, err; |
| 135 | local_irq_enable(); | 105 | struct stop_machine_data active, idle; |
| 136 | preempt_enable_no_resched(); | 106 | struct task_struct **threads; |
| 137 | } | 107 | |
| 108 | active.fn = fn; | ||
| 109 | active.data = data; | ||
| 110 | active.fnret = 0; | ||
| 111 | idle.fn = chill; | ||
| 112 | idle.data = NULL; | ||
| 113 | |||
| 114 | /* This could be too big for stack on large machines. */ | ||
| 115 | threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL); | ||
| 116 | if (!threads) | ||
| 117 | return -ENOMEM; | ||
| 118 | |||
| 119 | /* Set up initial state. */ | ||
| 120 | mutex_lock(&lock); | ||
| 121 | init_completion(&finished); | ||
| 122 | num_threads = num_online_cpus(); | ||
| 123 | set_state(STOPMACHINE_PREPARE); | ||
| 138 | 124 | ||
| 139 | struct stop_machine_data { | 125 | for_each_online_cpu(i) { |
| 140 | int (*fn)(void *); | 126 | struct stop_machine_data *smdata = &idle; |
| 141 | void *data; | 127 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; |
| 142 | struct completion done; | ||
| 143 | }; | ||
| 144 | 128 | ||
| 145 | static int do_stop(void *_smdata) | 129 | if (!cpus) { |
| 146 | { | 130 | if (i == first_cpu(cpu_online_map)) |
| 147 | struct stop_machine_data *smdata = _smdata; | 131 | smdata = &active; |
| 148 | int ret; | 132 | } else { |
| 133 | if (cpu_isset(i, *cpus)) | ||
| 134 | smdata = &active; | ||
| 135 | } | ||
| 149 | 136 | ||
| 150 | ret = stop_machine(); | 137 | threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u", |
| 151 | if (ret == 0) { | 138 | i); |
| 152 | ret = smdata->fn(smdata->data); | 139 | if (IS_ERR(threads[i])) { |
| 153 | restart_machine(); | 140 | err = PTR_ERR(threads[i]); |
| 154 | } | 141 | threads[i] = NULL; |
| 142 | goto kill_threads; | ||
| 143 | } | ||
| 155 | 144 | ||
| 156 | /* We're done: you can kthread_stop us now */ | 145 | /* Place it onto correct cpu. */ |
| 157 | complete(&smdata->done); | 146 | kthread_bind(threads[i], i); |
| 158 | 147 | ||
| 159 | /* Wait for kthread_stop */ | 148 | /* Make it highest prio. */ |
| 160 | set_current_state(TASK_INTERRUPTIBLE); | 149 | if (sched_setscheduler_nocheck(threads[i], SCHED_FIFO, ¶m)) |
| 161 | while (!kthread_should_stop()) { | 150 | BUG(); |
| 162 | schedule(); | ||
| 163 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 164 | } | 151 | } |
| 165 | __set_current_state(TASK_RUNNING); | ||
| 166 | return ret; | ||
| 167 | } | ||
| 168 | 152 | ||
| 169 | struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, | 153 | /* We've created all the threads. Wake them all: hold this CPU so one |
| 170 | unsigned int cpu) | 154 | * doesn't hit this CPU until we're ready. */ |
| 171 | { | 155 | get_cpu(); |
| 172 | static DEFINE_MUTEX(stopmachine_mutex); | 156 | for_each_online_cpu(i) |
| 173 | struct stop_machine_data smdata; | 157 | wake_up_process(threads[i]); |
| 174 | struct task_struct *p; | ||
| 175 | 158 | ||
| 176 | smdata.fn = fn; | 159 | /* This will release the thread on our CPU. */ |
| 177 | smdata.data = data; | 160 | put_cpu(); |
| 178 | init_completion(&smdata.done); | 161 | wait_for_completion(&finished); |
| 162 | mutex_unlock(&lock); | ||
| 179 | 163 | ||
| 180 | mutex_lock(&stopmachine_mutex); | 164 | kfree(threads); |
| 181 | 165 | ||
| 182 | /* If they don't care which CPU fn runs on, bind to any online one. */ | 166 | return active.fnret; |
| 183 | if (cpu == NR_CPUS) | ||
| 184 | cpu = raw_smp_processor_id(); | ||
| 185 | 167 | ||
| 186 | p = kthread_create(do_stop, &smdata, "kstopmachine"); | 168 | kill_threads: |
| 187 | if (!IS_ERR(p)) { | 169 | for_each_online_cpu(i) |
| 188 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 170 | if (threads[i]) |
| 171 | kthread_stop(threads[i]); | ||
| 172 | mutex_unlock(&lock); | ||
| 189 | 173 | ||
| 190 | /* One high-prio thread per cpu. We'll do this one. */ | 174 | kfree(threads); |
| 191 | sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); | 175 | return err; |
| 192 | kthread_bind(p, cpu); | ||
| 193 | wake_up_process(p); | ||
| 194 | wait_for_completion(&smdata.done); | ||
| 195 | } | ||
| 196 | mutex_unlock(&stopmachine_mutex); | ||
| 197 | return p; | ||
| 198 | } | 176 | } |
| 199 | 177 | ||
| 200 | int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) | 178 | int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) |
| 201 | { | 179 | { |
| 202 | struct task_struct *p; | ||
| 203 | int ret; | 180 | int ret; |
| 204 | 181 | ||
| 205 | /* No CPUs can come up or down during this. */ | 182 | /* No CPUs can come up or down during this. */ |
| 206 | get_online_cpus(); | 183 | get_online_cpus(); |
| 207 | p = __stop_machine_run(fn, data, cpu); | 184 | ret = __stop_machine(fn, data, cpus); |
| 208 | if (!IS_ERR(p)) | ||
| 209 | ret = kthread_stop(p); | ||
| 210 | else | ||
| 211 | ret = PTR_ERR(p); | ||
| 212 | put_online_cpus(); | 185 | put_online_cpus(); |
| 213 | 186 | ||
| 214 | return ret; | 187 | return ret; |
| 215 | } | 188 | } |
| 216 | EXPORT_SYMBOL_GPL(stop_machine_run); | 189 | EXPORT_SYMBOL_GPL(stop_machine); |
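The stop_machine.c rewrite replaces the single kstopmachine thread and the stop_machine_run(fn, data, cpu) interface with per-cpu "kstop/N" threads driven by a shared state machine; the entry points become stop_machine(fn, data, cpus) and __stop_machine(...), where cpus selects which CPUs actually run fn and NULL means "one CPU, the first online one" (as the ftrace conversions below show). A hedged caller sketch; the text-patching payload is invented:

```c
/* Illustrative stop_machine() caller (struct my_text_patch is hypothetical). */
static int apply_patch(void *arg)
{
	struct my_text_patch *p = arg;

	/* Every online CPU is spinning with IRQs off in stop_cpu(), so nothing
	 * can execute the instructions we are rewriting. */
	memcpy(p->addr, p->newcode, p->len);
	return 0;
}

static int patch_kernel_text(struct my_text_patch *p)
{
	/* NULL mask: fn runs on one CPU, all other online CPUs just "chill". */
	return stop_machine(apply_patch, p, NULL);
}
```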
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index bf43284d6855..80c4336f4188 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
| @@ -196,12 +196,10 @@ static int tick_check_new_device(struct clock_event_device *newdev) | |||
| 196 | struct tick_device *td; | 196 | struct tick_device *td; |
| 197 | int cpu, ret = NOTIFY_OK; | 197 | int cpu, ret = NOTIFY_OK; |
| 198 | unsigned long flags; | 198 | unsigned long flags; |
| 199 | cpumask_of_cpu_ptr_declare(cpumask); | ||
| 200 | 199 | ||
| 201 | spin_lock_irqsave(&tick_device_lock, flags); | 200 | spin_lock_irqsave(&tick_device_lock, flags); |
| 202 | 201 | ||
| 203 | cpu = smp_processor_id(); | 202 | cpu = smp_processor_id(); |
| 204 | cpumask_of_cpu_ptr_next(cpumask, cpu); | ||
| 205 | if (!cpu_isset(cpu, newdev->cpumask)) | 203 | if (!cpu_isset(cpu, newdev->cpumask)) |
| 206 | goto out_bc; | 204 | goto out_bc; |
| 207 | 205 | ||
| @@ -209,7 +207,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) | |||
| 209 | curdev = td->evtdev; | 207 | curdev = td->evtdev; |
| 210 | 208 | ||
| 211 | /* cpu local device ? */ | 209 | /* cpu local device ? */ |
| 212 | if (!cpus_equal(newdev->cpumask, *cpumask)) { | 210 | if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) { |
| 213 | 211 | ||
| 214 | /* | 212 | /* |
| 215 | * If the cpu affinity of the device interrupt can not | 213 | * If the cpu affinity of the device interrupt can not |
| @@ -222,7 +220,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) | |||
| 222 | * If we have a cpu local device already, do not replace it | 220 | * If we have a cpu local device already, do not replace it |
| 223 | * by a non cpu local device | 221 | * by a non cpu local device |
| 224 | */ | 222 | */ |
| 225 | if (curdev && cpus_equal(curdev->cpumask, *cpumask)) | 223 | if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu))) |
| 226 | goto out_bc; | 224 | goto out_bc; |
| 227 | } | 225 | } |
| 228 | 226 | ||
| @@ -254,7 +252,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) | |||
| 254 | curdev = NULL; | 252 | curdev = NULL; |
| 255 | } | 253 | } |
| 256 | clockevents_exchange_device(curdev, newdev); | 254 | clockevents_exchange_device(curdev, newdev); |
| 257 | tick_setup_device(td, newdev, cpu, cpumask); | 255 | tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu)); |
| 258 | if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) | 256 | if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) |
| 259 | tick_oneshot_notify(); | 257 | tick_oneshot_notify(); |
| 260 | 258 | ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 825b4c00fe44..f5da526424a9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -289,7 +289,6 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
| 289 | ts->tick_stopped = 1; | 289 | ts->tick_stopped = 1; |
| 290 | ts->idle_jiffies = last_jiffies; | 290 | ts->idle_jiffies = last_jiffies; |
| 291 | rcu_enter_nohz(); | 291 | rcu_enter_nohz(); |
| 292 | sched_clock_tick_stop(cpu); | ||
| 293 | } | 292 | } |
| 294 | 293 | ||
| 295 | /* | 294 | /* |
| @@ -392,7 +391,6 @@ void tick_nohz_restart_sched_tick(void) | |||
| 392 | select_nohz_load_balancer(0); | 391 | select_nohz_load_balancer(0); |
| 393 | now = ktime_get(); | 392 | now = ktime_get(); |
| 394 | tick_do_update_jiffies64(now); | 393 | tick_do_update_jiffies64(now); |
| 395 | sched_clock_tick_start(cpu); | ||
| 396 | cpu_clear(cpu, nohz_cpu_mask); | 394 | cpu_clear(cpu, nohz_cpu_mask); |
| 397 | 395 | ||
| 398 | /* | 396 | /* |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4231a3dc224a..f6e3af31b403 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -587,7 +587,7 @@ static int __ftrace_modify_code(void *data) | |||
| 587 | 587 | ||
| 588 | static void ftrace_run_update_code(int command) | 588 | static void ftrace_run_update_code(int command) |
| 589 | { | 589 | { |
| 590 | stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); | 590 | stop_machine(__ftrace_modify_code, &command, NULL); |
| 591 | } | 591 | } |
| 592 | 592 | ||
| 593 | void ftrace_disable_daemon(void) | 593 | void ftrace_disable_daemon(void) |
| @@ -787,7 +787,7 @@ static int ftrace_update_code(void) | |||
| 787 | !ftrace_enabled || !ftraced_trigger) | 787 | !ftrace_enabled || !ftraced_trigger) |
| 788 | return 0; | 788 | return 0; |
| 789 | 789 | ||
| 790 | stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); | 790 | stop_machine(__ftrace_update_code, NULL, NULL); |
| 791 | 791 | ||
| 792 | return 1; | 792 | return 1; |
| 793 | } | 793 | } |
| @@ -1564,7 +1564,7 @@ static int __init ftrace_dynamic_init(void) | |||
| 1564 | 1564 | ||
| 1565 | addr = (unsigned long)ftrace_record_ip; | 1565 | addr = (unsigned long)ftrace_record_ip; |
| 1566 | 1566 | ||
| 1567 | stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); | 1567 | stop_machine(ftrace_dyn_arch_init, &addr, NULL); |
| 1568 | 1568 | ||
| 1569 | /* ftrace_dyn_arch_init places the return code in addr */ | 1569 | /* ftrace_dyn_arch_init places the return code in addr */ |
| 1570 | if (addr) { | 1570 | if (addr) { |
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index ce2d723c10e1..bb948e52ce20 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c | |||
| @@ -213,9 +213,7 @@ static void start_stack_timers(void) | |||
| 213 | int cpu; | 213 | int cpu; |
| 214 | 214 | ||
| 215 | for_each_online_cpu(cpu) { | 215 | for_each_online_cpu(cpu) { |
| 216 | cpumask_of_cpu_ptr(new_mask, cpu); | 216 | set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); |
| 217 | |||
| 218 | set_cpus_allowed_ptr(current, new_mask); | ||
| 219 | start_stack_timer(cpu); | 217 | start_stack_timer(cpu); |
| 220 | } | 218 | } |
| 221 | set_cpus_allowed_ptr(current, &saved_mask); | 219 | set_cpus_allowed_ptr(current, &saved_mask); |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ec7e4f62aaff..4048e92aa04f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -290,11 +290,11 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) | |||
| 290 | 290 | ||
| 291 | BUG_ON(get_wq_data(work) != cwq); | 291 | BUG_ON(get_wq_data(work) != cwq); |
| 292 | work_clear_pending(work); | 292 | work_clear_pending(work); |
| 293 | lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); | 293 | lock_map_acquire(&cwq->wq->lockdep_map); |
| 294 | lock_acquire(&lockdep_map, 0, 0, 0, 2, _THIS_IP_); | 294 | lock_map_acquire(&lockdep_map); |
| 295 | f(work); | 295 | f(work); |
| 296 | lock_release(&lockdep_map, 1, _THIS_IP_); | 296 | lock_map_release(&lockdep_map); |
| 297 | lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); | 297 | lock_map_release(&cwq->wq->lockdep_map); |
| 298 | 298 | ||
| 299 | if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { | 299 | if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { |
| 300 | printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " | 300 | printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " |
| @@ -413,8 +413,8 @@ void flush_workqueue(struct workqueue_struct *wq) | |||
| 413 | int cpu; | 413 | int cpu; |
| 414 | 414 | ||
| 415 | might_sleep(); | 415 | might_sleep(); |
| 416 | lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); | 416 | lock_map_acquire(&wq->lockdep_map); |
| 417 | lock_release(&wq->lockdep_map, 1, _THIS_IP_); | 417 | lock_map_release(&wq->lockdep_map); |
| 418 | for_each_cpu_mask_nr(cpu, *cpu_map) | 418 | for_each_cpu_mask_nr(cpu, *cpu_map) |
| 419 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); | 419 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); |
| 420 | } | 420 | } |
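The workqueue.c hunks replace the open-coded lock_acquire()/lock_release() calls, with their easy-to-get-wrong argument lists, by the lock_map_acquire()/lock_map_release() wrappers. The flush_workqueue() hunk above shows the idiom these wrappers serve: a fake acquire immediately followed by a release records a dependency in lockdep without taking any real lock. A short sketch under CONFIG_LOCKDEP, using a hypothetical map name:

```c
#include <linux/lockdep.h>

/* hypothetical dependency map; it annotates ordering, it is not a lock */
static struct lockdep_map example_dep_map =
	STATIC_LOCKDEP_MAP_INIT("example_dep_map", &example_dep_map);

static void example_wait_for_users(void)
{
	/*
	 * Tell lockdep that this path "waits on" everything that runs
	 * under example_dep_map, so an inverted lock order is reported
	 * even if the deadlock never actually triggers at runtime.
	 */
	lock_map_acquire(&example_dep_map);
	lock_map_release(&example_dep_map);
}
```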
| @@ -441,8 +441,8 @@ int flush_work(struct work_struct *work) | |||
| 441 | if (!cwq) | 441 | if (!cwq) |
| 442 | return 0; | 442 | return 0; |
| 443 | 443 | ||
| 444 | lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); | 444 | lock_map_acquire(&cwq->wq->lockdep_map); |
| 445 | lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); | 445 | lock_map_release(&cwq->wq->lockdep_map); |
| 446 | 446 | ||
| 447 | prev = NULL; | 447 | prev = NULL; |
| 448 | spin_lock_irq(&cwq->lock); | 448 | spin_lock_irq(&cwq->lock); |
| @@ -536,8 +536,8 @@ static void wait_on_work(struct work_struct *work) | |||
| 536 | 536 | ||
| 537 | might_sleep(); | 537 | might_sleep(); |
| 538 | 538 | ||
| 539 | lock_acquire(&work->lockdep_map, 0, 0, 0, 2, _THIS_IP_); | 539 | lock_map_acquire(&work->lockdep_map); |
| 540 | lock_release(&work->lockdep_map, 1, _THIS_IP_); | 540 | lock_map_release(&work->lockdep_map); |
| 541 | 541 | ||
| 542 | cwq = get_wq_data(work); | 542 | cwq = get_wq_data(work); |
| 543 | if (!cwq) | 543 | if (!cwq) |
| @@ -830,10 +830,21 @@ struct workqueue_struct *__create_workqueue_key(const char *name, | |||
| 830 | start_workqueue_thread(cwq, -1); | 830 | start_workqueue_thread(cwq, -1); |
| 831 | } else { | 831 | } else { |
| 832 | cpu_maps_update_begin(); | 832 | cpu_maps_update_begin(); |
| 833 | /* | ||
| 834 | * We must place this wq on list even if the code below fails. | ||
| 835 | * cpu_down(cpu) can remove cpu from cpu_populated_map before | ||
| 836 | * destroy_workqueue() takes the lock, in that case we leak | ||
| 837 | * cwq[cpu]->thread. | ||
| 838 | */ | ||
| 833 | spin_lock(&workqueue_lock); | 839 | spin_lock(&workqueue_lock); |
| 834 | list_add(&wq->list, &workqueues); | 840 | list_add(&wq->list, &workqueues); |
| 835 | spin_unlock(&workqueue_lock); | 841 | spin_unlock(&workqueue_lock); |
| 836 | 842 | /* | |
| 843 | * We must initialize cwqs for each possible cpu even if we | ||
| 844 | * are going to call destroy_workqueue() finally. Otherwise | ||
| 845 | * cpu_up() can hit the uninitialized cwq once we drop the | ||
| 846 | * lock. | ||
| 847 | */ | ||
| 837 | for_each_possible_cpu(cpu) { | 848 | for_each_possible_cpu(cpu) { |
| 838 | cwq = init_cpu_workqueue(wq, cpu); | 849 | cwq = init_cpu_workqueue(wq, cpu); |
| 839 | if (err || !cpu_online(cpu)) | 850 | if (err || !cpu_online(cpu)) |
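The two comments added to __create_workqueue_key() carry an ordering argument: the workqueue must be published on the global list even if later setup fails (otherwise a racing cpu_down() can make destroy_workqueue() miss a thread), and the per-CPU cwqs must be initialized for every possible CPU before the hotplug exclusion is dropped (otherwise cpu_up() can touch an uninitialized cwq). A rough sketch of that "publish first, then initialize under hotplug exclusion" pattern, with hypothetical example_* names and abbreviated error handling (real code would go through its normal destroy path):

```c
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct example_obj {
	struct list_head list;
	void *cpu_state[NR_CPUS];	/* hypothetical per-CPU state */
};

static LIST_HEAD(example_objs);
static DEFINE_SPINLOCK(example_objs_lock);

static int example_init_cpu(struct example_obj *obj, int cpu)
{
	obj->cpu_state[cpu] = kzalloc(32, GFP_KERNEL);
	return obj->cpu_state[cpu] ? 0 : -ENOMEM;
}

static struct example_obj *example_create(void)
{
	struct example_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	int cpu, err = 0;

	if (!obj)
		return NULL;

	cpu_maps_update_begin();
	/* publish first: hotplug callbacks may already look this object up */
	spin_lock(&example_objs_lock);
	list_add(&obj->list, &example_objs);
	spin_unlock(&example_objs_lock);
	/* then set up every possible CPU before releasing hotplug exclusion */
	for_each_possible_cpu(cpu)
		if (!err)
			err = example_init_cpu(obj, cpu);
	cpu_maps_update_done();

	return err ? NULL : obj;	/* on error, real code calls its destroy path */
}
```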
| @@ -861,8 +872,8 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) | |||
| 861 | if (cwq->thread == NULL) | 872 | if (cwq->thread == NULL) |
| 862 | return; | 873 | return; |
| 863 | 874 | ||
| 864 | lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); | 875 | lock_map_acquire(&cwq->wq->lockdep_map); |
| 865 | lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); | 876 | lock_map_release(&cwq->wq->lockdep_map); |
| 866 | 877 | ||
| 867 | flush_cpu_workqueue(cwq); | 878 | flush_cpu_workqueue(cwq); |
| 868 | /* | 879 | /* |
