author    Jiri Kosina <jkosina@suse.cz>    2011-02-15 04:24:31 -0500
committer Jiri Kosina <jkosina@suse.cz>    2011-02-15 04:24:31 -0500
commit    0a9d59a2461477bd9ed143c01af9df3f8f00fa81 (patch)
tree      df997d1cfb0786427a0df1fbd6f0640fa4248cf4 /kernel
parent    a23ce6da9677d245aa0aadc99f4197030350ab54 (diff)
parent    795abaf1e4e188c4171e3cd3dbb11a9fcacaf505 (diff)
Merge branch 'master' into for-next
Diffstat (limited to 'kernel')

-rw-r--r--  kernel/capability.c              |    2
-rw-r--r--  kernel/cgroup.c                  |   19
-rw-r--r--  kernel/cred.c                    |   16
-rw-r--r--  kernel/fork.c                    |   41
-rw-r--r--  kernel/futex.c                   |   62
-rw-r--r--  kernel/irq/Kconfig               |    3
-rw-r--r--  kernel/irq/handle.c              |  111
-rw-r--r--  kernel/irq/irqdesc.c             |   40
-rw-r--r--  kernel/irq/migration.c           |   14
-rw-r--r--  kernel/lockdep.c                 |   18
-rw-r--r--  kernel/module.c                  |   16
-rw-r--r--  kernel/panic.c                   |    1
-rw-r--r--  kernel/params.c                  |   65
-rw-r--r--  kernel/perf_event.c              |   79
-rw-r--r--  kernel/power/Kconfig             |    5
-rw-r--r--  kernel/power/Makefile            |    1
-rw-r--r--  kernel/power/nvs.c               |  136
-rw-r--r--  kernel/printk.c                  |  154
-rw-r--r--  kernel/ptrace.c                  |    2
-rw-r--r--  kernel/rcutiny.c                 |    3
-rw-r--r--  kernel/sched.c                   |   26
-rw-r--r--  kernel/sched_autogroup.c         |   32
-rw-r--r--  kernel/sched_autogroup.h         |    4
-rw-r--r--  kernel/sched_debug.c             |   42
-rw-r--r--  kernel/sched_fair.c              |  126
-rw-r--r--  kernel/sched_rt.c                |    2
-rw-r--r--  kernel/smp.c                     |   62
-rw-r--r--  kernel/srcu.c                    |   15
-rw-r--r--  kernel/sys.c                     |    3
-rw-r--r--  kernel/sysctl.c                  |    3
-rw-r--r--  kernel/time/clocksource.c        |    8
-rw-r--r--  kernel/time/tick-sched.c         |    7
-rw-r--r--  kernel/time/timekeeping.c        |    4
-rw-r--r--  kernel/timer.c                   |    6
-rw-r--r--  kernel/trace/blktrace.c          |    7
-rw-r--r--  kernel/trace/trace_events.c      |   12
-rw-r--r--  kernel/trace/trace_export.c      |    6
-rw-r--r--  kernel/trace/trace_irqsoff.c     |    8
-rw-r--r--  kernel/trace/trace_syscalls.c    |   52
-rw-r--r--  kernel/tracepoint.c              |   31
-rw-r--r--  kernel/watchdog.c                |   43
-rw-r--r--  kernel/workqueue.c               |   20

42 files changed, 681 insertions, 626 deletions
diff --git a/kernel/capability.c b/kernel/capability.c
index 2f05303715a5..9e9385f132c8 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -306,7 +306,7 @@ int capable(int cap)
306 BUG(); 306 BUG();
307 } 307 }
308 308
309 if (security_capable(cap) == 0) { 309 if (security_capable(current_cred(), cap) == 0) {
310 current->flags |= PF_SUPERPRIV; 310 current->flags |= PF_SUPERPRIV;
311 return 1; 311 return 1;
312 } 312 }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 5c5f4cc2e99a..b24d7027b83c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
764 */ 764 */
765 765
766static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 766static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
767static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
767static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 768static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
768static int cgroup_populate_dir(struct cgroup *cgrp); 769static int cgroup_populate_dir(struct cgroup *cgrp);
769static const struct inode_operations cgroup_dir_inode_operations; 770static const struct inode_operations cgroup_dir_inode_operations;
@@ -860,6 +861,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
860 iput(inode); 861 iput(inode);
861} 862}
862 863
864static int cgroup_delete(const struct dentry *d)
865{
866 return 1;
867}
868
863static void remove_dir(struct dentry *d) 869static void remove_dir(struct dentry *d)
864{ 870{
865 struct dentry *parent = dget(d->d_parent); 871 struct dentry *parent = dget(d->d_parent);
@@ -910,7 +916,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
910 916
911 parent = dentry->d_parent; 917 parent = dentry->d_parent;
912 spin_lock(&parent->d_lock); 918 spin_lock(&parent->d_lock);
913 spin_lock(&dentry->d_lock); 919 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
914 list_del_init(&dentry->d_u.d_child); 920 list_del_init(&dentry->d_u.d_child);
915 spin_unlock(&dentry->d_lock); 921 spin_unlock(&dentry->d_lock);
916 spin_unlock(&parent->d_lock); 922 spin_unlock(&parent->d_lock);
@@ -1451,6 +1457,7 @@ static int cgroup_get_rootdir(struct super_block *sb)
1451{ 1457{
1452 static const struct dentry_operations cgroup_dops = { 1458 static const struct dentry_operations cgroup_dops = {
1453 .d_iput = cgroup_diput, 1459 .d_iput = cgroup_diput,
1460 .d_delete = cgroup_delete,
1454 }; 1461 };
1455 1462
1456 struct inode *inode = 1463 struct inode *inode =
@@ -2195,12 +2202,20 @@ static const struct file_operations cgroup_file_operations = {
2195}; 2202};
2196 2203
2197static const struct inode_operations cgroup_dir_inode_operations = { 2204static const struct inode_operations cgroup_dir_inode_operations = {
2198 .lookup = simple_lookup, 2205 .lookup = cgroup_lookup,
2199 .mkdir = cgroup_mkdir, 2206 .mkdir = cgroup_mkdir,
2200 .rmdir = cgroup_rmdir, 2207 .rmdir = cgroup_rmdir,
2201 .rename = cgroup_rename, 2208 .rename = cgroup_rename,
2202}; 2209};
2203 2210
2211static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2212{
2213 if (dentry->d_name.len > NAME_MAX)
2214 return ERR_PTR(-ENAMETOOLONG);
2215 d_add(dentry, NULL);
2216 return NULL;
2217}
2218
2204/* 2219/*
2205 * Check if a file is a control file 2220 * Check if a file is a control file
2206 */ 2221 */
diff --git a/kernel/cred.c b/kernel/cred.c
index 6a1aa004e376..3a9d6dd53a6c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -252,13 +252,13 @@ struct cred *cred_alloc_blank(void)
252#endif 252#endif
253 253
254 atomic_set(&new->usage, 1); 254 atomic_set(&new->usage, 1);
255#ifdef CONFIG_DEBUG_CREDENTIALS
256 new->magic = CRED_MAGIC;
257#endif
255 258
256 if (security_cred_alloc_blank(new, GFP_KERNEL) < 0) 259 if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
257 goto error; 260 goto error;
258 261
259#ifdef CONFIG_DEBUG_CREDENTIALS
260 new->magic = CRED_MAGIC;
261#endif
262 return new; 262 return new;
263 263
264error: 264error:
@@ -657,6 +657,8 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
657 validate_creds(old); 657 validate_creds(old);
658 658
659 *new = *old; 659 *new = *old;
660 atomic_set(&new->usage, 1);
661 set_cred_subscribers(new, 0);
660 get_uid(new->user); 662 get_uid(new->user);
661 get_group_info(new->group_info); 663 get_group_info(new->group_info);
662 664
@@ -674,8 +676,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
674 if (security_prepare_creds(new, old, GFP_KERNEL) < 0) 676 if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
675 goto error; 677 goto error;
676 678
677 atomic_set(&new->usage, 1);
678 set_cred_subscribers(new, 0);
679 put_cred(old); 679 put_cred(old);
680 validate_creds(new); 680 validate_creds(new);
681 return new; 681 return new;
@@ -748,7 +748,11 @@ bool creds_are_invalid(const struct cred *cred)
748 if (cred->magic != CRED_MAGIC) 748 if (cred->magic != CRED_MAGIC)
749 return true; 749 return true;
750#ifdef CONFIG_SECURITY_SELINUX 750#ifdef CONFIG_SECURITY_SELINUX
751 if (selinux_is_enabled()) { 751 /*
752 * cred->security == NULL if security_cred_alloc_blank() or
753 * security_prepare_creds() returned an error.
754 */
755 if (selinux_is_enabled() && cred->security) {
752 if ((unsigned long) cred->security < PAGE_SIZE) 756 if ((unsigned long) cred->security < PAGE_SIZE)
753 return true; 757 return true;
754 if ((*(u32 *)cred->security & 0xffffff00) == 758 if ((*(u32 *)cred->security & 0xffffff00) ==
diff --git a/kernel/fork.c b/kernel/fork.c
index d9b44f20b6b0..25e429152ddc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
66#include <linux/posix-timers.h> 66#include <linux/posix-timers.h>
67#include <linux/user-return-notifier.h> 67#include <linux/user-return-notifier.h>
68#include <linux/oom.h> 68#include <linux/oom.h>
69#include <linux/khugepaged.h>
69 70
70#include <asm/pgtable.h> 71#include <asm/pgtable.h>
71#include <asm/pgalloc.h> 72#include <asm/pgalloc.h>
@@ -330,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
330 retval = ksm_fork(mm, oldmm); 331 retval = ksm_fork(mm, oldmm);
331 if (retval) 332 if (retval)
332 goto out; 333 goto out;
334 retval = khugepaged_fork(mm, oldmm);
335 if (retval)
336 goto out;
333 337
334 prev = NULL; 338 prev = NULL;
335 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 339 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -529,6 +533,9 @@ void __mmdrop(struct mm_struct *mm)
529 mm_free_pgd(mm); 533 mm_free_pgd(mm);
530 destroy_context(mm); 534 destroy_context(mm);
531 mmu_notifier_mm_destroy(mm); 535 mmu_notifier_mm_destroy(mm);
536#ifdef CONFIG_TRANSPARENT_HUGEPAGE
537 VM_BUG_ON(mm->pmd_huge_pte);
538#endif
532 free_mm(mm); 539 free_mm(mm);
533} 540}
534EXPORT_SYMBOL_GPL(__mmdrop); 541EXPORT_SYMBOL_GPL(__mmdrop);
@@ -543,6 +550,7 @@ void mmput(struct mm_struct *mm)
543 if (atomic_dec_and_test(&mm->mm_users)) { 550 if (atomic_dec_and_test(&mm->mm_users)) {
544 exit_aio(mm); 551 exit_aio(mm);
545 ksm_exit(mm); 552 ksm_exit(mm);
553 khugepaged_exit(mm); /* must run before exit_mmap */
546 exit_mmap(mm); 554 exit_mmap(mm);
547 set_mm_exe_file(mm, NULL); 555 set_mm_exe_file(mm, NULL);
548 if (!list_empty(&mm->mmlist)) { 556 if (!list_empty(&mm->mmlist)) {
@@ -669,6 +677,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
669 mm->token_priority = 0; 677 mm->token_priority = 0;
670 mm->last_interval = 0; 678 mm->last_interval = 0;
671 679
680#ifdef CONFIG_TRANSPARENT_HUGEPAGE
681 mm->pmd_huge_pte = NULL;
682#endif
683
672 if (!mm_init(mm, tsk)) 684 if (!mm_init(mm, tsk))
673 goto fail_nomem; 685 goto fail_nomem;
674 686
@@ -910,6 +922,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
910 922
911 sig->oom_adj = current->signal->oom_adj; 923 sig->oom_adj = current->signal->oom_adj;
912 sig->oom_score_adj = current->signal->oom_score_adj; 924 sig->oom_score_adj = current->signal->oom_score_adj;
925 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
913 926
914 mutex_init(&sig->cred_guard_mutex); 927 mutex_init(&sig->cred_guard_mutex);
915 928
@@ -1410,23 +1423,6 @@ long do_fork(unsigned long clone_flags,
1410 } 1423 }
1411 1424
1412 /* 1425 /*
1413 * We hope to recycle these flags after 2.6.26
1414 */
1415 if (unlikely(clone_flags & CLONE_STOPPED)) {
1416 static int __read_mostly count = 100;
1417
1418 if (count > 0 && printk_ratelimit()) {
1419 char comm[TASK_COMM_LEN];
1420
1421 count--;
1422 printk(KERN_INFO "fork(): process `%s' used deprecated "
1423 "clone flags 0x%lx\n",
1424 get_task_comm(comm, current),
1425 clone_flags & CLONE_STOPPED);
1426 }
1427 }
1428
1429 /*
1430 * When called from kernel_thread, don't do user tracing stuff. 1426 * When called from kernel_thread, don't do user tracing stuff.
1431 */ 1427 */
1432 if (likely(user_mode(regs))) 1428 if (likely(user_mode(regs)))
@@ -1464,16 +1460,7 @@ long do_fork(unsigned long clone_flags,
1464 */ 1460 */
1465 p->flags &= ~PF_STARTING; 1461 p->flags &= ~PF_STARTING;
1466 1462
1467 if (unlikely(clone_flags & CLONE_STOPPED)) { 1463 wake_up_new_task(p, clone_flags);
1468 /*
1469 * We'll start up with an immediate SIGSTOP.
1470 */
1471 sigaddset(&p->pending.signal, SIGSTOP);
1472 set_tsk_thread_flag(p, TIF_SIGPENDING);
1473 __set_task_state(p, TASK_STOPPED);
1474 } else {
1475 wake_up_new_task(p, clone_flags);
1476 }
1477 1464
1478 tracehook_report_clone_complete(trace, regs, 1465 tracehook_report_clone_complete(trace, regs,
1479 clone_flags, nr, p); 1466 clone_flags, nr, p);
diff --git a/kernel/futex.c b/kernel/futex.c
index 3019b92e6917..b766d28accd6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -233,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
233{ 233{
234 unsigned long address = (unsigned long)uaddr; 234 unsigned long address = (unsigned long)uaddr;
235 struct mm_struct *mm = current->mm; 235 struct mm_struct *mm = current->mm;
236 struct page *page; 236 struct page *page, *page_head;
237 int err; 237 int err;
238 238
239 /* 239 /*
@@ -265,11 +265,46 @@ again:
265 if (err < 0) 265 if (err < 0)
266 return err; 266 return err;
267 267
268 page = compound_head(page); 268#ifdef CONFIG_TRANSPARENT_HUGEPAGE
269 lock_page(page); 269 page_head = page;
270 if (!page->mapping) { 270 if (unlikely(PageTail(page))) {
271 unlock_page(page);
272 put_page(page); 271 put_page(page);
272 /* serialize against __split_huge_page_splitting() */
273 local_irq_disable();
274 if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
275 page_head = compound_head(page);
276 /*
277 * page_head is valid pointer but we must pin
278 * it before taking the PG_lock and/or
279 * PG_compound_lock. The moment we re-enable
280 * irqs __split_huge_page_splitting() can
281 * return and the head page can be freed from
282 * under us. We can't take the PG_lock and/or
283 * PG_compound_lock on a page that could be
284 * freed from under us.
285 */
286 if (page != page_head) {
287 get_page(page_head);
288 put_page(page);
289 }
290 local_irq_enable();
291 } else {
292 local_irq_enable();
293 goto again;
294 }
295 }
296#else
297 page_head = compound_head(page);
298 if (page != page_head) {
299 get_page(page_head);
300 put_page(page);
301 }
302#endif
303
304 lock_page(page_head);
305 if (!page_head->mapping) {
306 unlock_page(page_head);
307 put_page(page_head);
273 goto again; 308 goto again;
274 } 309 }
275 310
@@ -280,20 +315,20 @@ again:
280 * it's a read-only handle, it's expected that futexes attach to 315 * it's a read-only handle, it's expected that futexes attach to
281 * the object not the particular process. 316 * the object not the particular process.
282 */ 317 */
283 if (PageAnon(page)) { 318 if (PageAnon(page_head)) {
284 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ 319 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
285 key->private.mm = mm; 320 key->private.mm = mm;
286 key->private.address = address; 321 key->private.address = address;
287 } else { 322 } else {
288 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 323 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
289 key->shared.inode = page->mapping->host; 324 key->shared.inode = page_head->mapping->host;
290 key->shared.pgoff = page->index; 325 key->shared.pgoff = page_head->index;
291 } 326 }
292 327
293 get_futex_key_refs(key); 328 get_futex_key_refs(key);
294 329
295 unlock_page(page); 330 unlock_page(page_head);
296 put_page(page); 331 put_page(page_head);
297 return 0; 332 return 0;
298} 333}
299 334
@@ -791,10 +826,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
791 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 826 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
792 827
793 /* 828 /*
794 * This happens when we have stolen the lock and the original 829 * It is possible that the next waiter (the one that brought
795 * pending owner did not enqueue itself back on the rt_mutex. 830 * this owner to the kernel) timed out and is no longer
796 * Thats not a tragedy. We know that way, that a lock waiter 831 * waiting on the lock.
797 * is on the fly. We make the futex_q waiter the pending owner.
798 */ 832 */
799 if (!new_owner) 833 if (!new_owner)
800 new_owner = this->task; 834 new_owner = this->task;
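
The get_futex_key() hunk above is easier to follow consolidated. With CONFIG_TRANSPARENT_HUGEPAGE, the freshly pinned user page may be a tail page of a transparent huge page whose head can be freed once interrupts are re-enabled, so the head page is re-looked-up and pinned with interrupts disabled (serializing against __split_huge_page_splitting()) before lock_page() is taken on it; all later mapping/index accesses then go through page_head. Illustrative excerpt reconstructed from the hunk, not additional patch content:

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	page_head = page;
	if (unlikely(PageTail(page))) {
		put_page(page);
		/* serialize against __split_huge_page_splitting() */
		local_irq_disable();
		if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
			page_head = compound_head(page);
			/* pin the head page before re-enabling irqs */
			if (page != page_head) {
				get_page(page_head);
				put_page(page);
			}
			local_irq_enable();
		} else {
			local_irq_enable();
			goto again;
		}
	}
#else
	page_head = compound_head(page);
	if (page != page_head) {
		get_page(page_head);
		put_page(page);
	}
#endif

	lock_page(page_head);
	if (!page_head->mapping) {
		unlock_page(page_head);
		put_page(page_head);
		goto again;
	}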
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 31d766bf5d2e..8e42fec7686d 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -9,9 +9,6 @@ menu "IRQ subsystem"
9config GENERIC_HARDIRQS 9config GENERIC_HARDIRQS
10 def_bool y 10 def_bool y
11 11
12config GENERIC_HARDIRQS_NO__DO_IRQ
13 def_bool y
14
15# Select this to disable the deprecated stuff 12# Select this to disable the deprecated stuff
16config GENERIC_HARDIRQS_NO_DEPRECATED 13config GENERIC_HARDIRQS_NO_DEPRECATED
17 def_bool n 14 def_bool n
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e2347eb63306..3540a7190122 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -118,114 +118,3 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
118 118
119 return retval; 119 return retval;
120} 120}
121
122#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
123
124#ifdef CONFIG_ENABLE_WARN_DEPRECATED
125# warning __do_IRQ is deprecated. Please convert to proper flow handlers
126#endif
127
128/**
129 * __do_IRQ - original all in one highlevel IRQ handler
130 * @irq: the interrupt number
131 *
132 * __do_IRQ handles all normal device IRQ's (the special
133 * SMP cross-CPU interrupts have their own specific
134 * handlers).
135 *
136 * This is the original x86 implementation which is used for every
137 * interrupt type.
138 */
139unsigned int __do_IRQ(unsigned int irq)
140{
141 struct irq_desc *desc = irq_to_desc(irq);
142 struct irqaction *action;
143 unsigned int status;
144
145 kstat_incr_irqs_this_cpu(irq, desc);
146
147 if (CHECK_IRQ_PER_CPU(desc->status)) {
148 irqreturn_t action_ret;
149
150 /*
151 * No locking required for CPU-local interrupts:
152 */
153 if (desc->irq_data.chip->ack)
154 desc->irq_data.chip->ack(irq);
155 if (likely(!(desc->status & IRQ_DISABLED))) {
156 action_ret = handle_IRQ_event(irq, desc->action);
157 if (!noirqdebug)
158 note_interrupt(irq, desc, action_ret);
159 }
160 desc->irq_data.chip->end(irq);
161 return 1;
162 }
163
164 raw_spin_lock(&desc->lock);
165 if (desc->irq_data.chip->ack)
166 desc->irq_data.chip->ack(irq);
167 /*
168 * REPLAY is when Linux resends an IRQ that was dropped earlier
169 * WAITING is used by probe to mark irqs that are being tested
170 */
171 status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
172 status |= IRQ_PENDING; /* we _want_ to handle it */
173
174 /*
175 * If the IRQ is disabled for whatever reason, we cannot
176 * use the action we have.
177 */
178 action = NULL;
179 if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
180 action = desc->action;
181 status &= ~IRQ_PENDING; /* we commit to handling */
182 status |= IRQ_INPROGRESS; /* we are handling it */
183 }
184 desc->status = status;
185
186 /*
187 * If there is no IRQ handler or it was disabled, exit early.
188 * Since we set PENDING, if another processor is handling
189 * a different instance of this same irq, the other processor
190 * will take care of it.
191 */
192 if (unlikely(!action))
193 goto out;
194
195 /*
196 * Edge triggered interrupts need to remember
197 * pending events.
198 * This applies to any hw interrupts that allow a second
199 * instance of the same irq to arrive while we are in do_IRQ
200 * or in the handler. But the code here only handles the _second_
201 * instance of the irq, not the third or fourth. So it is mostly
202 * useful for irq hardware that does not mask cleanly in an
203 * SMP environment.
204 */
205 for (;;) {
206 irqreturn_t action_ret;
207
208 raw_spin_unlock(&desc->lock);
209
210 action_ret = handle_IRQ_event(irq, action);
211 if (!noirqdebug)
212 note_interrupt(irq, desc, action_ret);
213
214 raw_spin_lock(&desc->lock);
215 if (likely(!(desc->status & IRQ_PENDING)))
216 break;
217 desc->status &= ~IRQ_PENDING;
218 }
219 desc->status &= ~IRQ_INPROGRESS;
220
221out:
222 /*
223 * The ->end() handler has to deal with interrupts which got
224 * disabled while the handler was running.
225 */
226 desc->irq_data.chip->end(irq);
227 raw_spin_unlock(&desc->lock);
228
229 return 1;
230}
231#endif
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9988d03797f5..282f20230e67 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; }
72 72
73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) 73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
74{ 74{
75 int cpu;
76
75 desc->irq_data.irq = irq; 77 desc->irq_data.irq = irq;
76 desc->irq_data.chip = &no_irq_chip; 78 desc->irq_data.chip = &no_irq_chip;
77 desc->irq_data.chip_data = NULL; 79 desc->irq_data.chip_data = NULL;
@@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
83 desc->irq_count = 0; 85 desc->irq_count = 0;
84 desc->irqs_unhandled = 0; 86 desc->irqs_unhandled = 0;
85 desc->name = NULL; 87 desc->name = NULL;
86 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); 88 for_each_possible_cpu(cpu)
89 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
87 desc_smp_init(desc, node); 90 desc_smp_init(desc, node);
88} 91}
89 92
@@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
133 if (!desc) 136 if (!desc)
134 return NULL; 137 return NULL;
135 /* allocate based on nr_cpu_ids */ 138 /* allocate based on nr_cpu_ids */
136 desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), 139 desc->kstat_irqs = alloc_percpu(unsigned int);
137 gfp, node);
138 if (!desc->kstat_irqs) 140 if (!desc->kstat_irqs)
139 goto err_desc; 141 goto err_desc;
140 142
@@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
149 return desc; 151 return desc;
150 152
151err_kstat: 153err_kstat:
152 kfree(desc->kstat_irqs); 154 free_percpu(desc->kstat_irqs);
153err_desc: 155err_desc:
154 kfree(desc); 156 kfree(desc);
155 return NULL; 157 return NULL;
@@ -166,7 +168,7 @@ static void free_desc(unsigned int irq)
166 mutex_unlock(&sparse_irq_lock); 168 mutex_unlock(&sparse_irq_lock);
167 169
168 free_masks(desc); 170 free_masks(desc);
169 kfree(desc->kstat_irqs); 171 free_percpu(desc->kstat_irqs);
170 kfree(desc); 172 kfree(desc);
171} 173}
172 174
@@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
234 } 236 }
235}; 237};
236 238
237static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
238int __init early_irq_init(void) 239int __init early_irq_init(void)
239{ 240{
240 int count, i, node = first_online_node; 241 int count, i, node = first_online_node;
@@ -250,7 +251,8 @@ int __init early_irq_init(void)
250 for (i = 0; i < count; i++) { 251 for (i = 0; i < count; i++) {
251 desc[i].irq_data.irq = i; 252 desc[i].irq_data.irq = i;
252 desc[i].irq_data.chip = &no_irq_chip; 253 desc[i].irq_data.chip = &no_irq_chip;
253 desc[i].kstat_irqs = kstat_irqs_all[i]; 254 /* TODO : do this allocation on-demand ... */
255 desc[i].kstat_irqs = alloc_percpu(unsigned int);
254 alloc_masks(desc + i, GFP_KERNEL, node); 256 alloc_masks(desc + i, GFP_KERNEL, node);
255 desc_smp_init(desc + i, node); 257 desc_smp_init(desc + i, node);
256 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 258 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -275,6 +277,22 @@ static void free_desc(unsigned int irq)
275 277
276static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) 278static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
277{ 279{
280#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
281 struct irq_desc *desc;
282 unsigned int i;
283
284 for (i = 0; i < cnt; i++) {
285 desc = irq_to_desc(start + i);
286 if (desc && !desc->kstat_irqs) {
287 unsigned int __percpu *stats = alloc_percpu(unsigned int);
288
289 if (!stats)
290 return -1;
291 if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
292 free_percpu(stats);
293 }
294 }
295#endif
278 return start; 296 return start;
279} 297}
280#endif /* !CONFIG_SPARSE_IRQ */ 298#endif /* !CONFIG_SPARSE_IRQ */
@@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq)
391unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 409unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
392{ 410{
393 struct irq_desc *desc = irq_to_desc(irq); 411 struct irq_desc *desc = irq_to_desc(irq);
394 return desc ? desc->kstat_irqs[cpu] : 0; 412
413 return desc && desc->kstat_irqs ?
414 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
395} 415}
396 416
397#ifdef CONFIG_GENERIC_HARDIRQS 417#ifdef CONFIG_GENERIC_HARDIRQS
@@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq)
401 int cpu; 421 int cpu;
402 int sum = 0; 422 int sum = 0;
403 423
404 if (!desc) 424 if (!desc || !desc->kstat_irqs)
405 return 0; 425 return 0;
406 for_each_possible_cpu(cpu) 426 for_each_possible_cpu(cpu)
407 sum += desc->kstat_irqs[cpu]; 427 sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
408 return sum; 428 return sum;
409} 429}
410#endif /* CONFIG_GENERIC_HARDIRQS */ 430#endif /* CONFIG_GENERIC_HARDIRQS */
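
The per-IRQ statistics conversion above replaces the static NR_IRQS x NR_CPUS array and the per-descriptor kzalloc'd array with a percpu allocation (alloc_percpu()/free_percpu()), so readers go through per_cpu_ptr(). Consolidated from the hunks above for readability; illustrative, not additional patch content:

unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
{
	struct irq_desc *desc = irq_to_desc(irq);

	return desc && desc->kstat_irqs ?
			*per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
}

#ifdef CONFIG_GENERIC_HARDIRQS
unsigned int kstat_irqs(unsigned int irq)
{
	struct irq_desc *desc = irq_to_desc(irq);
	int cpu;
	int sum = 0;

	if (!desc || !desc->kstat_irqs)
		return 0;
	for_each_possible_cpu(cpu)
		sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
	return sum;
}
#endif /* CONFIG_GENERIC_HARDIRQS */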
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 1d2541940480..441fd629ff04 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -56,6 +56,7 @@ void move_masked_irq(int irq)
56void move_native_irq(int irq) 56void move_native_irq(int irq)
57{ 57{
58 struct irq_desc *desc = irq_to_desc(irq); 58 struct irq_desc *desc = irq_to_desc(irq);
59 bool masked;
59 60
60 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 61 if (likely(!(desc->status & IRQ_MOVE_PENDING)))
61 return; 62 return;
@@ -63,8 +64,15 @@ void move_native_irq(int irq)
63 if (unlikely(desc->status & IRQ_DISABLED)) 64 if (unlikely(desc->status & IRQ_DISABLED))
64 return; 65 return;
65 66
66 desc->irq_data.chip->irq_mask(&desc->irq_data); 67 /*
68 * Be careful vs. already masked interrupts. If this is a
69 * threaded interrupt with ONESHOT set, we can end up with an
70 * interrupt storm.
71 */
72 masked = desc->status & IRQ_MASKED;
73 if (!masked)
74 desc->irq_data.chip->irq_mask(&desc->irq_data);
67 move_masked_irq(irq); 75 move_masked_irq(irq);
68 desc->irq_data.chip->irq_unmask(&desc->irq_data); 76 if (!masked)
77 desc->irq_data.chip->irq_unmask(&desc->irq_data);
69} 78}
70
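
For reference, the resulting move_native_irq() reads roughly as follows; this is an illustrative consolidation of the hunk above, not additional patch content. The IRQ is masked here only if it was not already masked, and unmasked again only in that case, so a threaded ONESHOT interrupt that is intentionally kept masked is not unmasked behind its handler's back, which could otherwise cause an interrupt storm.

void move_native_irq(int irq)
{
	struct irq_desc *desc = irq_to_desc(irq);
	bool masked;

	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
		return;

	if (unlikely(desc->status & IRQ_DISABLED))
		return;

	/*
	 * Be careful vs. already masked interrupts. If this is a
	 * threaded interrupt with ONESHOT set, we can end up with an
	 * interrupt storm.
	 */
	masked = desc->status & IRQ_MASKED;
	if (!masked)
		desc->irq_data.chip->irq_mask(&desc->irq_data);
	move_masked_irq(irq);
	if (!masked)
		desc->irq_data.chip->irq_unmask(&desc->irq_data);
}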
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 42ba65dff7d9..0d2058da80f5 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2292,22 +2292,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2292} 2292}
2293 2293
2294/* 2294/*
2295 * Debugging helper: via this flag we know that we are in
2296 * 'early bootup code', and will warn about any invalid irqs-on event:
2297 */
2298static int early_boot_irqs_enabled;
2299
2300void early_boot_irqs_off(void)
2301{
2302 early_boot_irqs_enabled = 0;
2303}
2304
2305void early_boot_irqs_on(void)
2306{
2307 early_boot_irqs_enabled = 1;
2308}
2309
2310/*
2311 * Hardirqs will be enabled: 2295 * Hardirqs will be enabled:
2312 */ 2296 */
2313void trace_hardirqs_on_caller(unsigned long ip) 2297void trace_hardirqs_on_caller(unsigned long ip)
@@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2319 if (unlikely(!debug_locks || current->lockdep_recursion)) 2303 if (unlikely(!debug_locks || current->lockdep_recursion))
2320 return; 2304 return;
2321 2305
2322 if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) 2306 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2323 return; 2307 return;
2324 2308
2325 if (unlikely(curr->hardirqs_enabled)) { 2309 if (unlikely(curr->hardirqs_enabled)) {
diff --git a/kernel/module.c b/kernel/module.c
index 34e00b708fad..efa290ea94bf 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2460,9 +2460,9 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2460#endif 2460#endif
2461 2461
2462#ifdef CONFIG_TRACEPOINTS 2462#ifdef CONFIG_TRACEPOINTS
2463 mod->tracepoints = section_objs(info, "__tracepoints", 2463 mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
2464 sizeof(*mod->tracepoints), 2464 sizeof(*mod->tracepoints_ptrs),
2465 &mod->num_tracepoints); 2465 &mod->num_tracepoints);
2466#endif 2466#endif
2467#ifdef HAVE_JUMP_LABEL 2467#ifdef HAVE_JUMP_LABEL
2468 mod->jump_entries = section_objs(info, "__jump_table", 2468 mod->jump_entries = section_objs(info, "__jump_table",
@@ -3393,7 +3393,7 @@ void module_layout(struct module *mod,
3393 struct modversion_info *ver, 3393 struct modversion_info *ver,
3394 struct kernel_param *kp, 3394 struct kernel_param *kp,
3395 struct kernel_symbol *ks, 3395 struct kernel_symbol *ks,
3396 struct tracepoint *tp) 3396 struct tracepoint * const *tp)
3397{ 3397{
3398} 3398}
3399EXPORT_SYMBOL(module_layout); 3399EXPORT_SYMBOL(module_layout);
@@ -3407,8 +3407,8 @@ void module_update_tracepoints(void)
3407 mutex_lock(&module_mutex); 3407 mutex_lock(&module_mutex);
3408 list_for_each_entry(mod, &modules, list) 3408 list_for_each_entry(mod, &modules, list)
3409 if (!mod->taints) 3409 if (!mod->taints)
3410 tracepoint_update_probe_range(mod->tracepoints, 3410 tracepoint_update_probe_range(mod->tracepoints_ptrs,
3411 mod->tracepoints + mod->num_tracepoints); 3411 mod->tracepoints_ptrs + mod->num_tracepoints);
3412 mutex_unlock(&module_mutex); 3412 mutex_unlock(&module_mutex);
3413} 3413}
3414 3414
@@ -3432,8 +3432,8 @@ int module_get_iter_tracepoints(struct tracepoint_iter *iter)
3432 else if (iter_mod > iter->module) 3432 else if (iter_mod > iter->module)
3433 iter->tracepoint = NULL; 3433 iter->tracepoint = NULL;
3434 found = tracepoint_get_iter_range(&iter->tracepoint, 3434 found = tracepoint_get_iter_range(&iter->tracepoint,
3435 iter_mod->tracepoints, 3435 iter_mod->tracepoints_ptrs,
3436 iter_mod->tracepoints 3436 iter_mod->tracepoints_ptrs
3437 + iter_mod->num_tracepoints); 3437 + iter_mod->num_tracepoints);
3438 if (found) { 3438 if (found) {
3439 iter->module = iter_mod; 3439 iter->module = iter_mod;
diff --git a/kernel/panic.c b/kernel/panic.c
index 4c13b1a88ebb..991bb87a1704 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,6 +34,7 @@ static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35 35
36int panic_timeout; 36int panic_timeout;
37EXPORT_SYMBOL_GPL(panic_timeout);
37 38
38ATOMIC_NOTIFIER_HEAD(panic_notifier_list); 39ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
39 40
diff --git a/kernel/params.c b/kernel/params.c
index 08107d181758..0da1411222b9 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -719,9 +719,7 @@ void destroy_params(const struct kernel_param *params, unsigned num)
719 params[i].ops->free(params[i].arg); 719 params[i].ops->free(params[i].arg);
720} 720}
721 721
722static void __init kernel_add_sysfs_param(const char *name, 722static struct module_kobject * __init locate_module_kobject(const char *name)
723 struct kernel_param *kparam,
724 unsigned int name_skip)
725{ 723{
726 struct module_kobject *mk; 724 struct module_kobject *mk;
727 struct kobject *kobj; 725 struct kobject *kobj;
@@ -729,10 +727,7 @@ static void __init kernel_add_sysfs_param(const char *name,
729 727
730 kobj = kset_find_obj(module_kset, name); 728 kobj = kset_find_obj(module_kset, name);
731 if (kobj) { 729 if (kobj) {
732 /* We already have one. Remove params so we can add more. */
733 mk = to_module_kobject(kobj); 730 mk = to_module_kobject(kobj);
734 /* We need to remove it before adding parameters. */
735 sysfs_remove_group(&mk->kobj, &mk->mp->grp);
736 } else { 731 } else {
737 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); 732 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
738 BUG_ON(!mk); 733 BUG_ON(!mk);
@@ -743,15 +738,36 @@ static void __init kernel_add_sysfs_param(const char *name,
743 "%s", name); 738 "%s", name);
744 if (err) { 739 if (err) {
745 kobject_put(&mk->kobj); 740 kobject_put(&mk->kobj);
746 printk(KERN_ERR "Module '%s' failed add to sysfs, " 741 printk(KERN_ERR
747 "error number %d\n", name, err); 742 "Module '%s' failed add to sysfs, error number %d\n",
748 printk(KERN_ERR "The system will be unstable now.\n"); 743 name, err);
749 return; 744 printk(KERN_ERR
745 "The system will be unstable now.\n");
746 return NULL;
750 } 747 }
751 /* So that exit path is even. */ 748
749 /* So that we hold reference in both cases. */
752 kobject_get(&mk->kobj); 750 kobject_get(&mk->kobj);
753 } 751 }
754 752
753 return mk;
754}
755
756static void __init kernel_add_sysfs_param(const char *name,
757 struct kernel_param *kparam,
758 unsigned int name_skip)
759{
760 struct module_kobject *mk;
761 int err;
762
763 mk = locate_module_kobject(name);
764 if (!mk)
765 return;
766
767 /* We need to remove old parameters before adding more. */
768 if (mk->mp)
769 sysfs_remove_group(&mk->kobj, &mk->mp->grp);
770
755 /* These should not fail at boot. */ 771 /* These should not fail at boot. */
756 err = add_sysfs_param(mk, kparam, kparam->name + name_skip); 772 err = add_sysfs_param(mk, kparam, kparam->name + name_skip);
757 BUG_ON(err); 773 BUG_ON(err);
@@ -796,6 +812,32 @@ static void __init param_sysfs_builtin(void)
796 } 812 }
797} 813}
798 814
815ssize_t __modver_version_show(struct module_attribute *mattr,
816 struct module *mod, char *buf)
817{
818 struct module_version_attribute *vattr =
819 container_of(mattr, struct module_version_attribute, mattr);
820
821 return sprintf(buf, "%s\n", vattr->version);
822}
823
824extern struct module_version_attribute __start___modver[], __stop___modver[];
825
826static void __init version_sysfs_builtin(void)
827{
828 const struct module_version_attribute *vattr;
829 struct module_kobject *mk;
830 int err;
831
832 for (vattr = __start___modver; vattr < __stop___modver; vattr++) {
833 mk = locate_module_kobject(vattr->module_name);
834 if (mk) {
835 err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
836 kobject_uevent(&mk->kobj, KOBJ_ADD);
837 kobject_put(&mk->kobj);
838 }
839 }
840}
799 841
800/* module-related sysfs stuff */ 842/* module-related sysfs stuff */
801 843
@@ -875,6 +917,7 @@ static int __init param_sysfs_init(void)
875 } 917 }
876 module_sysfs_initialized = 1; 918 module_sysfs_initialized = 1;
877 919
920 version_sysfs_builtin();
878 param_sysfs_builtin(); 921 param_sysfs_builtin();
879 922
880 return 0; 923 return 0;
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 05ebe841270b..999835b6112b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1901,11 +1901,12 @@ static void __perf_event_read(void *info)
1901 return; 1901 return;
1902 1902
1903 raw_spin_lock(&ctx->lock); 1903 raw_spin_lock(&ctx->lock);
1904 update_context_time(ctx); 1904 if (ctx->is_active)
1905 update_context_time(ctx);
1905 update_event_times(event); 1906 update_event_times(event);
1907 if (event->state == PERF_EVENT_STATE_ACTIVE)
1908 event->pmu->read(event);
1906 raw_spin_unlock(&ctx->lock); 1909 raw_spin_unlock(&ctx->lock);
1907
1908 event->pmu->read(event);
1909} 1910}
1910 1911
1911static inline u64 perf_event_count(struct perf_event *event) 1912static inline u64 perf_event_count(struct perf_event *event)
@@ -1999,8 +2000,7 @@ static int alloc_callchain_buffers(void)
1999 * accessed from NMI. Use a temporary manual per cpu allocation 2000 * accessed from NMI. Use a temporary manual per cpu allocation
2000 * until that gets sorted out. 2001 * until that gets sorted out.
2001 */ 2002 */
2002 size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) * 2003 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
2003 num_possible_cpus();
2004 2004
2005 entries = kzalloc(size, GFP_KERNEL); 2005 entries = kzalloc(size, GFP_KERNEL);
2006 if (!entries) 2006 if (!entries)
@@ -2201,13 +2201,6 @@ find_lively_task_by_vpid(pid_t vpid)
2201 if (!task) 2201 if (!task)
2202 return ERR_PTR(-ESRCH); 2202 return ERR_PTR(-ESRCH);
2203 2203
2204 /*
2205 * Can't attach events to a dying task.
2206 */
2207 err = -ESRCH;
2208 if (task->flags & PF_EXITING)
2209 goto errout;
2210
2211 /* Reuse ptrace permission checks for now. */ 2204 /* Reuse ptrace permission checks for now. */
2212 err = -EACCES; 2205 err = -EACCES;
2213 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 2206 if (!ptrace_may_access(task, PTRACE_MODE_READ))
@@ -2228,14 +2221,11 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2228 unsigned long flags; 2221 unsigned long flags;
2229 int ctxn, err; 2222 int ctxn, err;
2230 2223
2231 if (!task && cpu != -1) { 2224 if (!task) {
2232 /* Must be root to operate on a CPU event: */ 2225 /* Must be root to operate on a CPU event: */
2233 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 2226 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2234 return ERR_PTR(-EACCES); 2227 return ERR_PTR(-EACCES);
2235 2228
2236 if (cpu < 0 || cpu >= nr_cpumask_bits)
2237 return ERR_PTR(-EINVAL);
2238
2239 /* 2229 /*
2240 * We could be clever and allow to attach a event to an 2230 * We could be clever and allow to attach a event to an
2241 * offline CPU and activate it when the CPU comes up, but 2231 * offline CPU and activate it when the CPU comes up, but
@@ -2271,14 +2261,27 @@ retry:
2271 2261
2272 get_ctx(ctx); 2262 get_ctx(ctx);
2273 2263
2274 if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { 2264 err = 0;
2275 /* 2265 mutex_lock(&task->perf_event_mutex);
2276 * We raced with some other task; use 2266 /*
2277 * the context they set. 2267 * If it has already passed perf_event_exit_task().
2278 */ 2268 * we must see PF_EXITING, it takes this mutex too.
2269 */
2270 if (task->flags & PF_EXITING)
2271 err = -ESRCH;
2272 else if (task->perf_event_ctxp[ctxn])
2273 err = -EAGAIN;
2274 else
2275 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2276 mutex_unlock(&task->perf_event_mutex);
2277
2278 if (unlikely(err)) {
2279 put_task_struct(task); 2279 put_task_struct(task);
2280 kfree(ctx); 2280 kfree(ctx);
2281 goto retry; 2281
2282 if (err == -EAGAIN)
2283 goto retry;
2284 goto errout;
2282 } 2285 }
2283 } 2286 }
2284 2287
@@ -5377,6 +5380,8 @@ free_dev:
5377 goto out; 5380 goto out;
5378} 5381}
5379 5382
5383static struct lock_class_key cpuctx_mutex;
5384
5380int perf_pmu_register(struct pmu *pmu, char *name, int type) 5385int perf_pmu_register(struct pmu *pmu, char *name, int type)
5381{ 5386{
5382 int cpu, ret; 5387 int cpu, ret;
@@ -5425,6 +5430,7 @@ skip_type:
5425 5430
5426 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 5431 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5427 __perf_event_init_context(&cpuctx->ctx); 5432 __perf_event_init_context(&cpuctx->ctx);
5433 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
5428 cpuctx->ctx.type = cpu_context; 5434 cpuctx->ctx.type = cpu_context;
5429 cpuctx->ctx.pmu = pmu; 5435 cpuctx->ctx.pmu = pmu;
5430 cpuctx->jiffies_interval = 1; 5436 cpuctx->jiffies_interval = 1;
@@ -5541,6 +5547,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5541 struct hw_perf_event *hwc; 5547 struct hw_perf_event *hwc;
5542 long err; 5548 long err;
5543 5549
5550 if ((unsigned)cpu >= nr_cpu_ids) {
5551 if (!task || cpu != -1)
5552 return ERR_PTR(-EINVAL);
5553 }
5554
5544 event = kzalloc(sizeof(*event), GFP_KERNEL); 5555 event = kzalloc(sizeof(*event), GFP_KERNEL);
5545 if (!event) 5556 if (!event)
5546 return ERR_PTR(-ENOMEM); 5557 return ERR_PTR(-ENOMEM);
@@ -5589,7 +5600,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5589 5600
5590 if (!overflow_handler && parent_event) 5601 if (!overflow_handler && parent_event)
5591 overflow_handler = parent_event->overflow_handler; 5602 overflow_handler = parent_event->overflow_handler;
5592 5603
5593 event->overflow_handler = overflow_handler; 5604 event->overflow_handler = overflow_handler;
5594 5605
5595 if (attr->disabled) 5606 if (attr->disabled)
@@ -6125,7 +6136,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6125 * scheduled, so we are now safe from rescheduling changing 6136 * scheduled, so we are now safe from rescheduling changing
6126 * our context. 6137 * our context.
6127 */ 6138 */
6128 child_ctx = child->perf_event_ctxp[ctxn]; 6139 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
6129 task_ctx_sched_out(child_ctx, EVENT_ALL); 6140 task_ctx_sched_out(child_ctx, EVENT_ALL);
6130 6141
6131 /* 6142 /*
@@ -6438,11 +6449,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6438 unsigned long flags; 6449 unsigned long flags;
6439 int ret = 0; 6450 int ret = 0;
6440 6451
6441 child->perf_event_ctxp[ctxn] = NULL;
6442
6443 mutex_init(&child->perf_event_mutex);
6444 INIT_LIST_HEAD(&child->perf_event_list);
6445
6446 if (likely(!parent->perf_event_ctxp[ctxn])) 6452 if (likely(!parent->perf_event_ctxp[ctxn]))
6447 return 0; 6453 return 0;
6448 6454
@@ -6494,7 +6500,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6494 6500
6495 raw_spin_lock_irqsave(&parent_ctx->lock, flags); 6501 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6496 parent_ctx->rotate_disable = 0; 6502 parent_ctx->rotate_disable = 0;
6497 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6498 6503
6499 child_ctx = child->perf_event_ctxp[ctxn]; 6504 child_ctx = child->perf_event_ctxp[ctxn];
6500 6505
@@ -6502,12 +6507,11 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6502 /* 6507 /*
6503 * Mark the child context as a clone of the parent 6508 * Mark the child context as a clone of the parent
6504 * context, or of whatever the parent is a clone of. 6509 * context, or of whatever the parent is a clone of.
6505 * Note that if the parent is a clone, it could get 6510 *
6506 * uncloned at any point, but that doesn't matter 6511 * Note that if the parent is a clone, the holding of
6507 * because the list of events and the generation 6512 * parent_ctx->lock avoids it from being uncloned.
6508 * count can't have changed since we took the mutex.
6509 */ 6513 */
6510 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); 6514 cloned_ctx = parent_ctx->parent_ctx;
6511 if (cloned_ctx) { 6515 if (cloned_ctx) {
6512 child_ctx->parent_ctx = cloned_ctx; 6516 child_ctx->parent_ctx = cloned_ctx;
6513 child_ctx->parent_gen = parent_ctx->parent_gen; 6517 child_ctx->parent_gen = parent_ctx->parent_gen;
@@ -6518,6 +6522,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6518 get_ctx(child_ctx->parent_ctx); 6522 get_ctx(child_ctx->parent_ctx);
6519 } 6523 }
6520 6524
6525 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6521 mutex_unlock(&parent_ctx->mutex); 6526 mutex_unlock(&parent_ctx->mutex);
6522 6527
6523 perf_unpin_context(parent_ctx); 6528 perf_unpin_context(parent_ctx);
@@ -6532,6 +6537,10 @@ int perf_event_init_task(struct task_struct *child)
6532{ 6537{
6533 int ctxn, ret; 6538 int ctxn, ret;
6534 6539
6540 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
6541 mutex_init(&child->perf_event_mutex);
6542 INIT_LIST_HEAD(&child->perf_event_list);
6543
6535 for_each_task_context_nr(ctxn) { 6544 for_each_task_context_nr(ctxn) {
6536 ret = perf_event_init_context(child, ctxn); 6545 ret = perf_event_init_context(child, ctxn);
6537 if (ret) 6546 if (ret)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index a5aff3ebad38..265729966ece 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -100,13 +100,9 @@ config PM_SLEEP_ADVANCED_DEBUG
100 depends on PM_ADVANCED_DEBUG 100 depends on PM_ADVANCED_DEBUG
101 default n 101 default n
102 102
103config SUSPEND_NVS
104 bool
105
106config SUSPEND 103config SUSPEND
107 bool "Suspend to RAM and standby" 104 bool "Suspend to RAM and standby"
108 depends on PM && ARCH_SUSPEND_POSSIBLE 105 depends on PM && ARCH_SUSPEND_POSSIBLE
109 select SUSPEND_NVS if HAS_IOMEM
110 default y 106 default y
111 ---help--- 107 ---help---
112 Allow the system to enter sleep states in which main memory is 108 Allow the system to enter sleep states in which main memory is
@@ -140,7 +136,6 @@ config HIBERNATION
140 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 136 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
141 select LZO_COMPRESS 137 select LZO_COMPRESS
142 select LZO_DECOMPRESS 138 select LZO_DECOMPRESS
143 select SUSPEND_NVS if HAS_IOMEM
144 ---help--- 139 ---help---
145 Enable the suspend to disk (STD) functionality, which is usually 140 Enable the suspend to disk (STD) functionality, which is usually
146 called "hibernation" in user interfaces. STD checkpoints the 141 called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index b75597235d85..c350e18b53e3 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -7,6 +7,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o
7obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 7obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
8obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ 8obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
9 block_io.o 9 block_io.o
10obj-$(CONFIG_SUSPEND_NVS) += nvs.o
11 10
12obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c
deleted file mode 100644
index 1836db60bbb6..000000000000
--- a/kernel/power/nvs.c
+++ /dev/null
@@ -1,136 +0,0 @@
1/*
2 * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
3 *
4 * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
5 *
6 * This file is released under the GPLv2.
7 */
8
9#include <linux/io.h>
10#include <linux/kernel.h>
11#include <linux/list.h>
12#include <linux/mm.h>
13#include <linux/slab.h>
14#include <linux/suspend.h>
15
16/*
17 * Platforms, like ACPI, may want us to save some memory used by them during
18 * suspend and to restore the contents of this memory during the subsequent
19 * resume. The code below implements a mechanism allowing us to do that.
20 */
21
22struct nvs_page {
23 unsigned long phys_start;
24 unsigned int size;
25 void *kaddr;
26 void *data;
27 struct list_head node;
28};
29
30static LIST_HEAD(nvs_list);
31
32/**
33 * suspend_nvs_register - register platform NVS memory region to save
34 * @start - physical address of the region
35 * @size - size of the region
36 *
37 * The NVS region need not be page-aligned (both ends) and we arrange
38 * things so that the data from page-aligned addresses in this region will
39 * be copied into separate RAM pages.
40 */
41int suspend_nvs_register(unsigned long start, unsigned long size)
42{
43 struct nvs_page *entry, *next;
44
45 while (size > 0) {
46 unsigned int nr_bytes;
47
48 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
49 if (!entry)
50 goto Error;
51
52 list_add_tail(&entry->node, &nvs_list);
53 entry->phys_start = start;
54 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
55 entry->size = (size < nr_bytes) ? size : nr_bytes;
56
57 start += entry->size;
58 size -= entry->size;
59 }
60 return 0;
61
62 Error:
63 list_for_each_entry_safe(entry, next, &nvs_list, node) {
64 list_del(&entry->node);
65 kfree(entry);
66 }
67 return -ENOMEM;
68}
69
70/**
71 * suspend_nvs_free - free data pages allocated for saving NVS regions
72 */
73void suspend_nvs_free(void)
74{
75 struct nvs_page *entry;
76
77 list_for_each_entry(entry, &nvs_list, node)
78 if (entry->data) {
79 free_page((unsigned long)entry->data);
80 entry->data = NULL;
81 if (entry->kaddr) {
82 iounmap(entry->kaddr);
83 entry->kaddr = NULL;
84 }
85 }
86}
87
88/**
89 * suspend_nvs_alloc - allocate memory necessary for saving NVS regions
90 */
91int suspend_nvs_alloc(void)
92{
93 struct nvs_page *entry;
94
95 list_for_each_entry(entry, &nvs_list, node) {
96 entry->data = (void *)__get_free_page(GFP_KERNEL);
97 if (!entry->data) {
98 suspend_nvs_free();
99 return -ENOMEM;
100 }
101 }
102 return 0;
103}
104
105/**
106 * suspend_nvs_save - save NVS memory regions
107 */
108void suspend_nvs_save(void)
109{
110 struct nvs_page *entry;
111
112 printk(KERN_INFO "PM: Saving platform NVS memory\n");
113
114 list_for_each_entry(entry, &nvs_list, node)
115 if (entry->data) {
116 entry->kaddr = ioremap(entry->phys_start, entry->size);
117 memcpy(entry->data, entry->kaddr, entry->size);
118 }
119}
120
121/**
122 * suspend_nvs_restore - restore NVS memory regions
123 *
124 * This function is going to be called with interrupts disabled, so it
125 * cannot iounmap the virtual addresses used to access the NVS region.
126 */
127void suspend_nvs_restore(void)
128{
129 struct nvs_page *entry;
130
131 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
132
133 list_for_each_entry(entry, &nvs_list, node)
134 if (entry->data)
135 memcpy(entry->kaddr, entry->data, entry->size);
136}
diff --git a/kernel/printk.c b/kernel/printk.c
index 53d9a9ec88e6..36231525e22f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -97,7 +97,7 @@ static int console_locked, console_suspended;
97/* 97/*
98 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars 98 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
99 * It is also used in interesting ways to provide interlocking in 99 * It is also used in interesting ways to provide interlocking in
100 * release_console_sem(). 100 * console_unlock();.
101 */ 101 */
102static DEFINE_SPINLOCK(logbuf_lock); 102static DEFINE_SPINLOCK(logbuf_lock);
103 103
@@ -262,25 +262,47 @@ int dmesg_restrict = 1;
262int dmesg_restrict; 262int dmesg_restrict;
263#endif 263#endif
264 264
265static int syslog_action_restricted(int type)
266{
267 if (dmesg_restrict)
268 return 1;
269 /* Unless restricted, we allow "read all" and "get buffer size" for everybody */
270 return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER;
271}
272
273static int check_syslog_permissions(int type, bool from_file)
274{
275 /*
276 * If this is from /proc/kmsg and we've already opened it, then we've
277 * already done the capabilities checks at open time.
278 */
279 if (from_file && type != SYSLOG_ACTION_OPEN)
280 return 0;
281
282 if (syslog_action_restricted(type)) {
283 if (capable(CAP_SYSLOG))
284 return 0;
285 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
286 if (capable(CAP_SYS_ADMIN)) {
287 WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN "
288 "but no CAP_SYSLOG (deprecated).\n");
289 return 0;
290 }
291 return -EPERM;
292 }
293 return 0;
294}
295
265int do_syslog(int type, char __user *buf, int len, bool from_file) 296int do_syslog(int type, char __user *buf, int len, bool from_file)
266{ 297{
267 unsigned i, j, limit, count; 298 unsigned i, j, limit, count;
268 int do_clear = 0; 299 int do_clear = 0;
269 char c; 300 char c;
270 int error = 0; 301 int error;
271 302
272 /* 303 error = check_syslog_permissions(type, from_file);
273 * If this is from /proc/kmsg we only do the capabilities checks 304 if (error)
274 * at open time. 305 goto out;
275 */
276 if (type == SYSLOG_ACTION_OPEN || !from_file) {
277 if (dmesg_restrict && !capable(CAP_SYSLOG))
278 goto warn; /* switch to return -EPERM after 2.6.39 */
279 if ((type != SYSLOG_ACTION_READ_ALL &&
280 type != SYSLOG_ACTION_SIZE_BUFFER) &&
281 !capable(CAP_SYSLOG))
282 goto warn; /* switch to return -EPERM after 2.6.39 */
283 }
284 306
285 error = security_syslog(type); 307 error = security_syslog(type);
286 if (error) 308 if (error)
@@ -423,12 +445,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
423 } 445 }
424out: 446out:
425 return error; 447 return error;
426warn:
427 /* remove after 2.6.39 */
428 if (capable(CAP_SYS_ADMIN))
429 WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN "
430 "but no CAP_SYSLOG (deprecated and denied).\n");
431 return -EPERM;
432} 448}
433 449
434SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) 450SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
@@ -501,7 +517,7 @@ static void _call_console_drivers(unsigned start,
501/* 517/*
502 * Call the console drivers, asking them to write out 518 * Call the console drivers, asking them to write out
503 * log_buf[start] to log_buf[end - 1]. 519 * log_buf[start] to log_buf[end - 1].
504 * The console_sem must be held. 520 * The console_lock must be held.
505 */ 521 */
506static void call_console_drivers(unsigned start, unsigned end) 522static void call_console_drivers(unsigned start, unsigned end)
507{ 523{
@@ -604,11 +620,11 @@ static int have_callable_console(void)
604 * 620 *
605 * This is printk(). It can be called from any context. We want it to work. 621 * This is printk(). It can be called from any context. We want it to work.
606 * 622 *
607 * We try to grab the console_sem. If we succeed, it's easy - we log the output and 623 * We try to grab the console_lock. If we succeed, it's easy - we log the output and
608 * call the console drivers. If we fail to get the semaphore we place the output 624 * call the console drivers. If we fail to get the semaphore we place the output
609 * into the log buffer and return. The current holder of the console_sem will 625 * into the log buffer and return. The current holder of the console_sem will
610 * notice the new output in release_console_sem() and will send it to the 626 * notice the new output in console_unlock(); and will send it to the
611 * consoles before releasing the semaphore. 627 * consoles before releasing the lock.
612 * 628 *
613 * One effect of this deferred printing is that code which calls printk() and 629 * One effect of this deferred printing is that code which calls printk() and
614 * then changes console_loglevel may break. This is because console_loglevel 630 * then changes console_loglevel may break. This is because console_loglevel
@@ -659,19 +675,19 @@ static inline int can_use_console(unsigned int cpu)
659/* 675/*
660 * Try to get console ownership to actually show the kernel 676 * Try to get console ownership to actually show the kernel
661 * messages from a 'printk'. Return true (and with the 677 * messages from a 'printk'. Return true (and with the
662 * console_semaphore held, and 'console_locked' set) if it 678 * console_lock held, and 'console_locked' set) if it
663 * is successful, false otherwise. 679 * is successful, false otherwise.
664 * 680 *
665 * This gets called with the 'logbuf_lock' spinlock held and 681 * This gets called with the 'logbuf_lock' spinlock held and
666 * interrupts disabled. It should return with 'lockbuf_lock' 682 * interrupts disabled. It should return with 'lockbuf_lock'
667 * released but interrupts still disabled. 683 * released but interrupts still disabled.
668 */ 684 */
669static int acquire_console_semaphore_for_printk(unsigned int cpu) 685static int console_trylock_for_printk(unsigned int cpu)
670 __releases(&logbuf_lock) 686 __releases(&logbuf_lock)
671{ 687{
672 int retval = 0; 688 int retval = 0;
673 689
674 if (!try_acquire_console_sem()) { 690 if (console_trylock()) {
675 retval = 1; 691 retval = 1;
676 692
677 /* 693 /*
@@ -827,12 +843,12 @@ asmlinkage int vprintk(const char *fmt, va_list args)
827 * actual magic (print out buffers, wake up klogd, 843 * actual magic (print out buffers, wake up klogd,
828 * etc). 844 * etc).
829 * 845 *
830 * The acquire_console_semaphore_for_printk() function 846 * The console_trylock_for_printk() function
831 * will release 'logbuf_lock' regardless of whether it 847 * will release 'logbuf_lock' regardless of whether it
832 * actually gets the semaphore or not. 848 * actually gets the semaphore or not.
833 */ 849 */
834 if (acquire_console_semaphore_for_printk(this_cpu)) 850 if (console_trylock_for_printk(this_cpu))
835 release_console_sem(); 851 console_unlock();
836 852
837 lockdep_on(); 853 lockdep_on();
838out_restore_irqs: 854out_restore_irqs:
@@ -993,7 +1009,7 @@ void suspend_console(void)
993 if (!console_suspend_enabled) 1009 if (!console_suspend_enabled)
994 return; 1010 return;
995 printk("Suspending console(s) (use no_console_suspend to debug)\n"); 1011 printk("Suspending console(s) (use no_console_suspend to debug)\n");
996 acquire_console_sem(); 1012 console_lock();
997 console_suspended = 1; 1013 console_suspended = 1;
998 up(&console_sem); 1014 up(&console_sem);
999} 1015}
@@ -1004,7 +1020,7 @@ void resume_console(void)
1004 return; 1020 return;
1005 down(&console_sem); 1021 down(&console_sem);
1006 console_suspended = 0; 1022 console_suspended = 0;
1007 release_console_sem(); 1023 console_unlock();
1008} 1024}
1009 1025
1010/** 1026/**
@@ -1027,21 +1043,21 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
1027 case CPU_DYING: 1043 case CPU_DYING:
1028 case CPU_DOWN_FAILED: 1044 case CPU_DOWN_FAILED:
1029 case CPU_UP_CANCELED: 1045 case CPU_UP_CANCELED:
1030 acquire_console_sem(); 1046 console_lock();
1031 release_console_sem(); 1047 console_unlock();
1032 } 1048 }
1033 return NOTIFY_OK; 1049 return NOTIFY_OK;
1034} 1050}
1035 1051
1036/** 1052/**
1037 * acquire_console_sem - lock the console system for exclusive use. 1053 * console_lock - lock the console system for exclusive use.
1038 * 1054 *
1039 * Acquires a semaphore which guarantees that the caller has 1055 * Acquires a lock which guarantees that the caller has
1040 * exclusive access to the console system and the console_drivers list. 1056 * exclusive access to the console system and the console_drivers list.
1041 * 1057 *
1042 * Can sleep, returns nothing. 1058 * Can sleep, returns nothing.
1043 */ 1059 */
1044void acquire_console_sem(void) 1060void console_lock(void)
1045{ 1061{
1046 BUG_ON(in_interrupt()); 1062 BUG_ON(in_interrupt());
1047 down(&console_sem); 1063 down(&console_sem);
@@ -1050,21 +1066,29 @@ void acquire_console_sem(void)
1050 console_locked = 1; 1066 console_locked = 1;
1051 console_may_schedule = 1; 1067 console_may_schedule = 1;
1052} 1068}
1053EXPORT_SYMBOL(acquire_console_sem); 1069EXPORT_SYMBOL(console_lock);
1054 1070
1055int try_acquire_console_sem(void) 1071/**
1072 * console_trylock - try to lock the console system for exclusive use.
1073 *
 1074 * Tries to acquire a lock which guarantees that the caller has
1075 * exclusive access to the console system and the console_drivers list.
1076 *
1077 * returns 1 on success, and 0 on failure to acquire the lock.
1078 */
1079int console_trylock(void)
1056{ 1080{
1057 if (down_trylock(&console_sem)) 1081 if (down_trylock(&console_sem))
1058 return -1; 1082 return 0;
1059 if (console_suspended) { 1083 if (console_suspended) {
1060 up(&console_sem); 1084 up(&console_sem);
1061 return -1; 1085 return 0;
1062 } 1086 }
1063 console_locked = 1; 1087 console_locked = 1;
1064 console_may_schedule = 0; 1088 console_may_schedule = 0;
1065 return 0; 1089 return 1;
1066} 1090}
1067EXPORT_SYMBOL(try_acquire_console_sem); 1091EXPORT_SYMBOL(console_trylock);
1068 1092
1069int is_console_locked(void) 1093int is_console_locked(void)
1070{ 1094{
@@ -1095,20 +1119,20 @@ void wake_up_klogd(void)
1095} 1119}
1096 1120
1097/** 1121/**
1098 * release_console_sem - unlock the console system 1122 * console_unlock - unlock the console system
1099 * 1123 *
1100 * Releases the semaphore which the caller holds on the console system 1124 * Releases the console_lock which the caller holds on the console system
1101 * and the console driver list. 1125 * and the console driver list.
1102 * 1126 *
1103 * While the semaphore was held, console output may have been buffered 1127 * While the console_lock was held, console output may have been buffered
1104 * by printk(). If this is the case, release_console_sem() emits 1128 * by printk(). If this is the case, console_unlock() emits
1105 * the output prior to releasing the semaphore. 1129 * the output prior to releasing the lock.
1106 * 1130 *
1107 * If there is output waiting for klogd, we wake it up. 1131 * If there is output waiting for klogd, we wake it up.
1108 * 1132 *
1109 * release_console_sem() may be called from any context. 1133 * console_unlock() may be called from any context.
1110 */ 1134 */
1111void release_console_sem(void) 1135void console_unlock(void)
1112{ 1136{
1113 unsigned long flags; 1137 unsigned long flags;
1114 unsigned _con_start, _log_end; 1138 unsigned _con_start, _log_end;
@@ -1141,7 +1165,7 @@ void release_console_sem(void)
1141 if (wake_klogd) 1165 if (wake_klogd)
1142 wake_up_klogd(); 1166 wake_up_klogd();
1143} 1167}
1144EXPORT_SYMBOL(release_console_sem); 1168EXPORT_SYMBOL(console_unlock);
1145 1169
1146/** 1170/**
1147 * console_conditional_schedule - yield the CPU if required 1171 * console_conditional_schedule - yield the CPU if required
@@ -1150,7 +1174,7 @@ EXPORT_SYMBOL(release_console_sem);
1150 * if this CPU should yield the CPU to another task, do 1174 * if this CPU should yield the CPU to another task, do
1151 * so here. 1175 * so here.
1152 * 1176 *
1153 * Must be called within acquire_console_sem(). 1177 * Must be called within console_lock().
1154 */ 1178 */
1155void __sched console_conditional_schedule(void) 1179void __sched console_conditional_schedule(void)
1156{ 1180{
@@ -1171,14 +1195,14 @@ void console_unblank(void)
1171 if (down_trylock(&console_sem) != 0) 1195 if (down_trylock(&console_sem) != 0)
1172 return; 1196 return;
1173 } else 1197 } else
1174 acquire_console_sem(); 1198 console_lock();
1175 1199
1176 console_locked = 1; 1200 console_locked = 1;
1177 console_may_schedule = 0; 1201 console_may_schedule = 0;
1178 for_each_console(c) 1202 for_each_console(c)
1179 if ((c->flags & CON_ENABLED) && c->unblank) 1203 if ((c->flags & CON_ENABLED) && c->unblank)
1180 c->unblank(); 1204 c->unblank();
1181 release_console_sem(); 1205 console_unlock();
1182} 1206}
1183 1207
1184/* 1208/*
@@ -1189,7 +1213,7 @@ struct tty_driver *console_device(int *index)
1189 struct console *c; 1213 struct console *c;
1190 struct tty_driver *driver = NULL; 1214 struct tty_driver *driver = NULL;
1191 1215
1192 acquire_console_sem(); 1216 console_lock();
1193 for_each_console(c) { 1217 for_each_console(c) {
1194 if (!c->device) 1218 if (!c->device)
1195 continue; 1219 continue;
@@ -1197,7 +1221,7 @@ struct tty_driver *console_device(int *index)
1197 if (driver) 1221 if (driver)
1198 break; 1222 break;
1199 } 1223 }
1200 release_console_sem(); 1224 console_unlock();
1201 return driver; 1225 return driver;
1202} 1226}
1203 1227
@@ -1208,17 +1232,17 @@ struct tty_driver *console_device(int *index)
1208 */ 1232 */
1209void console_stop(struct console *console) 1233void console_stop(struct console *console)
1210{ 1234{
1211 acquire_console_sem(); 1235 console_lock();
1212 console->flags &= ~CON_ENABLED; 1236 console->flags &= ~CON_ENABLED;
1213 release_console_sem(); 1237 console_unlock();
1214} 1238}
1215EXPORT_SYMBOL(console_stop); 1239EXPORT_SYMBOL(console_stop);
1216 1240
1217void console_start(struct console *console) 1241void console_start(struct console *console)
1218{ 1242{
1219 acquire_console_sem(); 1243 console_lock();
1220 console->flags |= CON_ENABLED; 1244 console->flags |= CON_ENABLED;
1221 release_console_sem(); 1245 console_unlock();
1222} 1246}
1223EXPORT_SYMBOL(console_start); 1247EXPORT_SYMBOL(console_start);
1224 1248
@@ -1340,7 +1364,7 @@ void register_console(struct console *newcon)
1340 * Put this console in the list - keep the 1364 * Put this console in the list - keep the
1341 * preferred driver at the head of the list. 1365 * preferred driver at the head of the list.
1342 */ 1366 */
1343 acquire_console_sem(); 1367 console_lock();
1344 if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { 1368 if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
1345 newcon->next = console_drivers; 1369 newcon->next = console_drivers;
1346 console_drivers = newcon; 1370 console_drivers = newcon;
@@ -1352,14 +1376,14 @@ void register_console(struct console *newcon)
1352 } 1376 }
1353 if (newcon->flags & CON_PRINTBUFFER) { 1377 if (newcon->flags & CON_PRINTBUFFER) {
1354 /* 1378 /*
1355 * release_console_sem() will print out the buffered messages 1379 * console_unlock() will print out the buffered messages
1356 * for us. 1380 * for us.
1357 */ 1381 */
1358 spin_lock_irqsave(&logbuf_lock, flags); 1382 spin_lock_irqsave(&logbuf_lock, flags);
1359 con_start = log_start; 1383 con_start = log_start;
1360 spin_unlock_irqrestore(&logbuf_lock, flags); 1384 spin_unlock_irqrestore(&logbuf_lock, flags);
1361 } 1385 }
1362 release_console_sem(); 1386 console_unlock();
1363 console_sysfs_notify(); 1387 console_sysfs_notify();
1364 1388
1365 /* 1389 /*
@@ -1396,7 +1420,7 @@ int unregister_console(struct console *console)
1396 return braille_unregister_console(console); 1420 return braille_unregister_console(console);
1397#endif 1421#endif
1398 1422
1399 acquire_console_sem(); 1423 console_lock();
1400 if (console_drivers == console) { 1424 if (console_drivers == console) {
1401 console_drivers=console->next; 1425 console_drivers=console->next;
1402 res = 0; 1426 res = 0;
@@ -1418,7 +1442,7 @@ int unregister_console(struct console *console)
1418 if (console_drivers != NULL && console->flags & CON_CONSDEV) 1442 if (console_drivers != NULL && console->flags & CON_CONSDEV)
1419 console_drivers->flags |= CON_CONSDEV; 1443 console_drivers->flags |= CON_CONSDEV;
1420 1444
1421 release_console_sem(); 1445 console_unlock();
1422 console_sysfs_notify(); 1446 console_sysfs_notify();
1423 return res; 1447 return res;
1424} 1448}
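The printk.c hunks above rename the console locking API (acquire_console_sem()/release_console_sem() become console_lock()/console_unlock()) and flip console_trylock() to a 1-on-success/0-on-failure convention instead of 0/-1. The following user-space sketch is not kernel code: a POSIX semaphore stands in for console_sem, and the names mirror the kernel API purely to illustrate the new return convention and the vprintk() call pattern shown above.

/*
 * Minimal user-space sketch of the console_trylock()/console_unlock()
 * convention. Build with -pthread; illustrative only.
 */
#include <semaphore.h>
#include <stdio.h>

static sem_t console_sem;
static int console_suspended;           /* stand-in for the kernel flag */

static int console_trylock(void)
{
        if (sem_trywait(&console_sem))  /* lock contended */
                return 0;
        if (console_suspended) {        /* suspended: back off */
                sem_post(&console_sem);
                return 0;
        }
        return 1;                       /* lock held */
}

static void console_unlock(void)
{
        sem_post(&console_sem);
}

int main(void)
{
        sem_init(&console_sem, 0, 1);
        /* the pattern used by vprintk() in the hunk above */
        if (console_trylock())
                console_unlock();
        puts(console_trylock() ? "locked again" : "unexpectedly busy");
        console_unlock();
        sem_destroy(&console_sem);
        return 0;
}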
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 99bbaa3e5b0d..1708b1e2972d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -313,7 +313,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
313 child->exit_code = data; 313 child->exit_code = data;
314 dead = __ptrace_detach(current, child); 314 dead = __ptrace_detach(current, child);
315 if (!child->exit_state) 315 if (!child->exit_state)
316 wake_up_process(child); 316 wake_up_state(child, TASK_TRACED | TASK_STOPPED);
317 } 317 }
318 write_unlock_irq(&tasklist_lock); 318 write_unlock_irq(&tasklist_lock);
319 319
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 034493724749..0c343b9a46d5 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -189,7 +189,8 @@ static int rcu_kthread(void *arg)
189 unsigned long flags; 189 unsigned long flags;
190 190
191 for (;;) { 191 for (;;) {
192 wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0); 192 wait_event_interruptible(rcu_kthread_wq,
193 have_rcu_kthread_work != 0);
193 morework = rcu_boost(); 194 morework = rcu_boost();
194 local_irq_save(flags); 195 local_irq_save(flags);
195 work = have_rcu_kthread_work; 196 work = have_rcu_kthread_work;
diff --git a/kernel/sched.c b/kernel/sched.c
index ea3e5eff3878..18d38e4ec7ba 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -553,9 +553,6 @@ struct rq {
553 /* try_to_wake_up() stats */ 553 /* try_to_wake_up() stats */
554 unsigned int ttwu_count; 554 unsigned int ttwu_count;
555 unsigned int ttwu_local; 555 unsigned int ttwu_local;
556
557 /* BKL stats */
558 unsigned int bkl_count;
559#endif 556#endif
560}; 557};
561 558
@@ -609,6 +606,9 @@ static inline struct task_group *task_group(struct task_struct *p)
609 struct task_group *tg; 606 struct task_group *tg;
610 struct cgroup_subsys_state *css; 607 struct cgroup_subsys_state *css;
611 608
609 if (p->flags & PF_EXITING)
610 return &root_task_group;
611
612 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 612 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
613 lockdep_is_held(&task_rq(p)->lock)); 613 lockdep_is_held(&task_rq(p)->lock));
614 tg = container_of(css, struct task_group, css); 614 tg = container_of(css, struct task_group, css);
@@ -3887,7 +3887,7 @@ static inline void schedule_debug(struct task_struct *prev)
3887 schedstat_inc(this_rq(), sched_count); 3887 schedstat_inc(this_rq(), sched_count);
3888#ifdef CONFIG_SCHEDSTATS 3888#ifdef CONFIG_SCHEDSTATS
3889 if (unlikely(prev->lock_depth >= 0)) { 3889 if (unlikely(prev->lock_depth >= 0)) {
3890 schedstat_inc(this_rq(), bkl_count); 3890 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
3891 schedstat_inc(prev, sched_info.bkl_count); 3891 schedstat_inc(prev, sched_info.bkl_count);
3892 } 3892 }
3893#endif 3893#endif
@@ -4871,7 +4871,8 @@ recheck:
4871 * assigned. 4871 * assigned.
4872 */ 4872 */
4873 if (rt_bandwidth_enabled() && rt_policy(policy) && 4873 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4874 task_group(p)->rt_bandwidth.rt_runtime == 0) { 4874 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4875 !task_group_is_autogroup(task_group(p))) {
4875 __task_rq_unlock(rq); 4876 __task_rq_unlock(rq);
4876 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4877 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4877 return -EPERM; 4878 return -EPERM;
@@ -8882,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8882 } 8883 }
8883} 8884}
8884 8885
8886static void
8887cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
8888{
8889 /*
8890 * cgroup_exit() is called in the copy_process() failure path.
 8891 * Ignore this case since the task hasn't run yet; this avoids
 8892 * trying to poke a half-freed task state from generic code.
8893 */
8894 if (!(task->flags & PF_EXITING))
8895 return;
8896
8897 sched_move_task(task);
8898}
8899
8885#ifdef CONFIG_FAIR_GROUP_SCHED 8900#ifdef CONFIG_FAIR_GROUP_SCHED
8886static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 8901static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8887 u64 shareval) 8902 u64 shareval)
@@ -8954,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8954 .destroy = cpu_cgroup_destroy, 8969 .destroy = cpu_cgroup_destroy,
8955 .can_attach = cpu_cgroup_can_attach, 8970 .can_attach = cpu_cgroup_can_attach,
8956 .attach = cpu_cgroup_attach, 8971 .attach = cpu_cgroup_attach,
8972 .exit = cpu_cgroup_exit,
8957 .populate = cpu_cgroup_populate, 8973 .populate = cpu_cgroup_populate,
8958 .subsys_id = cpu_cgroup_subsys_id, 8974 .subsys_id = cpu_cgroup_subsys_id,
8959 .early_init = 1, 8975 .early_init = 1,
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 32a723b8f84c..9fb656283157 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -27,6 +27,11 @@ static inline void autogroup_destroy(struct kref *kref)
27{ 27{
28 struct autogroup *ag = container_of(kref, struct autogroup, kref); 28 struct autogroup *ag = container_of(kref, struct autogroup, kref);
29 29
30#ifdef CONFIG_RT_GROUP_SCHED
31 /* We've redirected RT tasks to the root task group... */
32 ag->tg->rt_se = NULL;
33 ag->tg->rt_rq = NULL;
34#endif
30 sched_destroy_group(ag->tg); 35 sched_destroy_group(ag->tg);
31} 36}
32 37
@@ -55,6 +60,10 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p)
55 return ag; 60 return ag;
56} 61}
57 62
63#ifdef CONFIG_RT_GROUP_SCHED
64static void free_rt_sched_group(struct task_group *tg);
65#endif
66
58static inline struct autogroup *autogroup_create(void) 67static inline struct autogroup *autogroup_create(void)
59{ 68{
60 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); 69 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
@@ -72,6 +81,19 @@ static inline struct autogroup *autogroup_create(void)
72 init_rwsem(&ag->lock); 81 init_rwsem(&ag->lock);
73 ag->id = atomic_inc_return(&autogroup_seq_nr); 82 ag->id = atomic_inc_return(&autogroup_seq_nr);
74 ag->tg = tg; 83 ag->tg = tg;
84#ifdef CONFIG_RT_GROUP_SCHED
85 /*
86 * Autogroup RT tasks are redirected to the root task group
87 * so we don't have to move tasks around upon policy change,
88 * or flail around trying to allocate bandwidth on the fly.
89 * A bandwidth exception in __sched_setscheduler() allows
90 * the policy change to proceed. Thereafter, task_group()
91 * returns &root_task_group, so zero bandwidth is required.
92 */
93 free_rt_sched_group(tg);
94 tg->rt_se = root_task_group.rt_se;
95 tg->rt_rq = root_task_group.rt_rq;
96#endif
75 tg->autogroup = ag; 97 tg->autogroup = ag;
76 98
77 return ag; 99 return ag;
@@ -106,6 +128,11 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
106 return true; 128 return true;
107} 129}
108 130
131static inline bool task_group_is_autogroup(struct task_group *tg)
132{
133 return tg != &root_task_group && tg->autogroup;
134}
135
109static inline struct task_group * 136static inline struct task_group *
110autogroup_task_group(struct task_struct *p, struct task_group *tg) 137autogroup_task_group(struct task_struct *p, struct task_group *tg)
111{ 138{
@@ -231,6 +258,11 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
231#ifdef CONFIG_SCHED_DEBUG 258#ifdef CONFIG_SCHED_DEBUG
232static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 259static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
233{ 260{
261 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
262
263 if (!enabled || !tg->autogroup)
264 return 0;
265
234 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 266 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
235} 267}
236#endif /* CONFIG_SCHED_DEBUG */ 268#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 5358e241cb20..7b859ffe5dad 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -15,6 +15,10 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg);
15 15
16static inline void autogroup_init(struct task_struct *init_task) { } 16static inline void autogroup_init(struct task_struct *init_task) { }
17static inline void autogroup_free(struct task_group *tg) { } 17static inline void autogroup_free(struct task_group *tg) { }
18static inline bool task_group_is_autogroup(struct task_group *tg)
19{
20 return 0;
21}
18 22
19static inline struct task_group * 23static inline struct task_group *
20autogroup_task_group(struct task_struct *p, struct task_group *tg) 24autogroup_task_group(struct task_struct *p, struct task_group *tg)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 1dfae3d014b5..eb6cb8edd075 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -16,6 +16,8 @@
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18 18
19static DEFINE_SPINLOCK(sched_debug_lock);
20
19/* 21/*
20 * This allows printing both to /proc/sched_debug and 22 * This allows printing both to /proc/sched_debug and
21 * to the console 23 * to the console
@@ -86,6 +88,26 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
86} 88}
87#endif 89#endif
88 90
91#ifdef CONFIG_CGROUP_SCHED
92static char group_path[PATH_MAX];
93
94static char *task_group_path(struct task_group *tg)
95{
96 if (autogroup_path(tg, group_path, PATH_MAX))
97 return group_path;
98
99 /*
100 * May be NULL if the underlying cgroup isn't fully-created yet
101 */
102 if (!tg->css.cgroup) {
103 group_path[0] = '\0';
104 return group_path;
105 }
106 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
107 return group_path;
108}
109#endif
110
89static void 111static void
90print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 112print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
91{ 113{
@@ -108,6 +130,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
108 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 130 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
109 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 131 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
110#endif 132#endif
133#ifdef CONFIG_CGROUP_SCHED
134 SEQ_printf(m, " %s", task_group_path(task_group(p)));
135#endif
111 136
112 SEQ_printf(m, "\n"); 137 SEQ_printf(m, "\n");
113} 138}
@@ -144,7 +169,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
144 struct sched_entity *last; 169 struct sched_entity *last;
145 unsigned long flags; 170 unsigned long flags;
146 171
172#ifdef CONFIG_FAIR_GROUP_SCHED
173 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
174#else
147 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 175 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
176#endif
148 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 177 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
149 SPLIT_NS(cfs_rq->exec_clock)); 178 SPLIT_NS(cfs_rq->exec_clock));
150 179
@@ -191,7 +220,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
191 220
192void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) 221void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
193{ 222{
223#ifdef CONFIG_RT_GROUP_SCHED
224 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
225#else
194 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); 226 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
227#endif
195 228
196#define P(x) \ 229#define P(x) \
197 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) 230 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -212,6 +245,7 @@ extern __read_mostly int sched_clock_running;
212static void print_cpu(struct seq_file *m, int cpu) 245static void print_cpu(struct seq_file *m, int cpu)
213{ 246{
214 struct rq *rq = cpu_rq(cpu); 247 struct rq *rq = cpu_rq(cpu);
248 unsigned long flags;
215 249
216#ifdef CONFIG_X86 250#ifdef CONFIG_X86
217 { 251 {
@@ -262,14 +296,20 @@ static void print_cpu(struct seq_file *m, int cpu)
262 P(ttwu_count); 296 P(ttwu_count);
263 P(ttwu_local); 297 P(ttwu_local);
264 298
265 P(bkl_count); 299 SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
300 rq->rq_sched_info.bkl_count);
266 301
267#undef P 302#undef P
303#undef P64
268#endif 304#endif
305 spin_lock_irqsave(&sched_debug_lock, flags);
269 print_cfs_stats(m, cpu); 306 print_cfs_stats(m, cpu);
270 print_rt_stats(m, cpu); 307 print_rt_stats(m, cpu);
271 308
309 rcu_read_lock();
272 print_rq(m, rq, cpu); 310 print_rq(m, rq, cpu);
311 rcu_read_unlock();
312 spin_unlock_irqrestore(&sched_debug_lock, flags);
273} 313}
274 314
275static const char *sched_tunable_scaling_names[] = { 315static const char *sched_tunable_scaling_names[] = {
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c62ebae65cf0..0c26e2df450e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -699,7 +699,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
699 cfs_rq->nr_running--; 699 cfs_rq->nr_running--;
700} 700}
701 701
702#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED 702#ifdef CONFIG_FAIR_GROUP_SCHED
703# ifdef CONFIG_SMP
703static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, 704static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
704 int global_update) 705 int global_update)
705{ 706{
@@ -721,10 +722,10 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
721 u64 now, delta; 722 u64 now, delta;
722 unsigned long load = cfs_rq->load.weight; 723 unsigned long load = cfs_rq->load.weight;
723 724
724 if (!cfs_rq) 725 if (cfs_rq->tg == &root_task_group)
725 return; 726 return;
726 727
727 now = rq_of(cfs_rq)->clock; 728 now = rq_of(cfs_rq)->clock_task;
728 delta = now - cfs_rq->load_stamp; 729 delta = now - cfs_rq->load_stamp;
729 730
730 /* truncate load history at 4 idle periods */ 731 /* truncate load history at 4 idle periods */
@@ -762,6 +763,51 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
762 list_del_leaf_cfs_rq(cfs_rq); 763 list_del_leaf_cfs_rq(cfs_rq);
763} 764}
764 765
766static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
767 long weight_delta)
768{
769 long load_weight, load, shares;
770
771 load = cfs_rq->load.weight + weight_delta;
772
773 load_weight = atomic_read(&tg->load_weight);
774 load_weight -= cfs_rq->load_contribution;
775 load_weight += load;
776
777 shares = (tg->shares * load);
778 if (load_weight)
779 shares /= load_weight;
780
781 if (shares < MIN_SHARES)
782 shares = MIN_SHARES;
783 if (shares > tg->shares)
784 shares = tg->shares;
785
786 return shares;
787}
788
789static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
790{
791 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
792 update_cfs_load(cfs_rq, 0);
793 update_cfs_shares(cfs_rq, 0);
794 }
795}
796# else /* CONFIG_SMP */
797static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
798{
799}
800
801static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
802 long weight_delta)
803{
804 return tg->shares;
805}
806
807static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
808{
809}
810# endif /* CONFIG_SMP */
765static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 811static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
766 unsigned long weight) 812 unsigned long weight)
767{ 813{
@@ -782,41 +828,20 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
782{ 828{
783 struct task_group *tg; 829 struct task_group *tg;
784 struct sched_entity *se; 830 struct sched_entity *se;
785 long load_weight, load, shares; 831 long shares;
786
787 if (!cfs_rq)
788 return;
789 832
790 tg = cfs_rq->tg; 833 tg = cfs_rq->tg;
791 se = tg->se[cpu_of(rq_of(cfs_rq))]; 834 se = tg->se[cpu_of(rq_of(cfs_rq))];
792 if (!se) 835 if (!se)
793 return; 836 return;
794 837#ifndef CONFIG_SMP
795 load = cfs_rq->load.weight + weight_delta; 838 if (likely(se->load.weight == tg->shares))
796 839 return;
797 load_weight = atomic_read(&tg->load_weight); 840#endif
798 load_weight -= cfs_rq->load_contribution; 841 shares = calc_cfs_shares(cfs_rq, tg, weight_delta);
799 load_weight += load;
800
801 shares = (tg->shares * load);
802 if (load_weight)
803 shares /= load_weight;
804
805 if (shares < MIN_SHARES)
806 shares = MIN_SHARES;
807 if (shares > tg->shares)
808 shares = tg->shares;
809 842
810 reweight_entity(cfs_rq_of(se), se, shares); 843 reweight_entity(cfs_rq_of(se), se, shares);
811} 844}
812
813static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
814{
815 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
816 update_cfs_load(cfs_rq, 0);
817 update_cfs_shares(cfs_rq, 0);
818 }
819}
820#else /* CONFIG_FAIR_GROUP_SCHED */ 845#else /* CONFIG_FAIR_GROUP_SCHED */
821static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) 846static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
822{ 847{
@@ -1062,6 +1087,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1062 struct sched_entity *se = __pick_next_entity(cfs_rq); 1087 struct sched_entity *se = __pick_next_entity(cfs_rq);
1063 s64 delta = curr->vruntime - se->vruntime; 1088 s64 delta = curr->vruntime - se->vruntime;
1064 1089
1090 if (delta < 0)
1091 return;
1092
1065 if (delta > ideal_runtime) 1093 if (delta > ideal_runtime)
1066 resched_task(rq_of(cfs_rq)->curr); 1094 resched_task(rq_of(cfs_rq)->curr);
1067 } 1095 }
@@ -1362,27 +1390,27 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1362 return wl; 1390 return wl;
1363 1391
1364 for_each_sched_entity(se) { 1392 for_each_sched_entity(se) {
1365 long S, rw, s, a, b; 1393 long lw, w;
1366 1394
1367 S = se->my_q->tg->shares; 1395 tg = se->my_q->tg;
1368 s = se->load.weight; 1396 w = se->my_q->load.weight;
1369 rw = se->my_q->load.weight;
1370 1397
1371 a = S*(rw + wl); 1398 /* use this cpu's instantaneous contribution */
1372 b = S*rw + s*wg; 1399 lw = atomic_read(&tg->load_weight);
1400 lw -= se->my_q->load_contribution;
1401 lw += w + wg;
1373 1402
1374 wl = s*(a-b); 1403 wl += w;
1375 1404
1376 if (likely(b)) 1405 if (lw > 0 && wl < lw)
1377 wl /= b; 1406 wl = (wl * tg->shares) / lw;
1407 else
1408 wl = tg->shares;
1378 1409
1379 /* 1410 /* zero point is MIN_SHARES */
1380 * Assume the group is already running and will 1411 if (wl < MIN_SHARES)
1381 * thus already be accounted for in the weight. 1412 wl = MIN_SHARES;
1382 * 1413 wl -= se->load.weight;
1383 * That is, moving shares between CPUs, does not
1384 * alter the group weight.
1385 */
1386 wg = 0; 1414 wg = 0;
1387 } 1415 }
1388 1416
@@ -1401,7 +1429,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1401 1429
1402static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 1430static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1403{ 1431{
1404 unsigned long this_load, load; 1432 s64 this_load, load;
1405 int idx, this_cpu, prev_cpu; 1433 int idx, this_cpu, prev_cpu;
1406 unsigned long tl_per_task; 1434 unsigned long tl_per_task;
1407 struct task_group *tg; 1435 struct task_group *tg;
@@ -1440,8 +1468,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1440 * Otherwise check if either cpus are near enough in load to allow this 1468 * Otherwise check if either cpus are near enough in load to allow this
1441 * task to be woken on this_cpu. 1469 * task to be woken on this_cpu.
1442 */ 1470 */
1443 if (this_load) { 1471 if (this_load > 0) {
1444 unsigned long this_eff_load, prev_eff_load; 1472 s64 this_eff_load, prev_eff_load;
1445 1473
1446 this_eff_load = 100; 1474 this_eff_load = 100;
1447 this_eff_load *= power_of(prev_cpu); 1475 this_eff_load *= power_of(prev_cpu);
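The sched_fair.c hunks above factor the share computation out of update_cfs_shares() into calc_cfs_shares(): a runqueue receives the group's shares scaled by its fraction of the group-wide load, clamped between MIN_SHARES and tg->shares. Below is a standalone arithmetic sketch of that formula, not kernel code; MIN_SHARES and the sample inputs are illustrative, not the kernel's tuning.

#include <stdio.h>

#define MIN_SHARES 2

static long calc_cfs_shares(long rq_load, long weight_delta,
                            long tg_load_weight, long rq_contribution,
                            long tg_shares)
{
        long load = rq_load + weight_delta;
        long load_weight = tg_load_weight - rq_contribution + load;
        long shares = tg_shares * load;

        if (load_weight)
                shares /= load_weight;

        if (shares < MIN_SHARES)
                shares = MIN_SHARES;
        if (shares > tg_shares)
                shares = tg_shares;

        return shares;
}

int main(void)
{
        /* a runqueue carrying half of the group's load gets half the shares */
        printf("shares = %ld\n", calc_cfs_shares(512, 0, 1024, 512, 1024));
        return 0;
}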
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index c914ec747ca6..ad6267714c84 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -625,7 +625,7 @@ static void update_curr_rt(struct rq *rq)
625 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 625 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
626 u64 delta_exec; 626 u64 delta_exec;
627 627
628 if (!task_has_rt_policy(curr)) 628 if (curr->sched_class != &rt_sched_class)
629 return; 629 return;
630 630
631 delta_exec = rq->clock_task - curr->se.exec_start; 631 delta_exec = rq->clock_task - curr->se.exec_start;
diff --git a/kernel/smp.c b/kernel/smp.c
index 4ec30e069987..9910744f0856 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -194,23 +194,52 @@ void generic_smp_call_function_interrupt(void)
194 */ 194 */
195 list_for_each_entry_rcu(data, &call_function.queue, csd.list) { 195 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
196 int refs; 196 int refs;
197 void (*func) (void *info);
197 198
198 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) 199 /*
200 * Since we walk the list without any locks, we might
201 * see an entry that was completed, removed from the
202 * list and is in the process of being reused.
203 *
204 * We must check that the cpu is in the cpumask before
205 * checking the refs, and both must be set before
206 * executing the callback on this cpu.
207 */
208
209 if (!cpumask_test_cpu(cpu, data->cpumask))
210 continue;
211
212 smp_rmb();
213
214 if (atomic_read(&data->refs) == 0)
199 continue; 215 continue;
200 216
217 func = data->csd.func; /* for later warn */
201 data->csd.func(data->csd.info); 218 data->csd.func(data->csd.info);
202 219
220 /*
 220 * If the cpu mask is no longer set, the callback enabled interrupts,
222 * we took another smp interrupt, and executed the function
223 * twice on this cpu. In theory that copy decremented refs.
224 */
225 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
226 WARN(1, "%pS enabled interrupts and double executed\n",
227 func);
228 continue;
229 }
230
203 refs = atomic_dec_return(&data->refs); 231 refs = atomic_dec_return(&data->refs);
204 WARN_ON(refs < 0); 232 WARN_ON(refs < 0);
205 if (!refs) {
206 raw_spin_lock(&call_function.lock);
207 list_del_rcu(&data->csd.list);
208 raw_spin_unlock(&call_function.lock);
209 }
210 233
211 if (refs) 234 if (refs)
212 continue; 235 continue;
213 236
237 WARN_ON(!cpumask_empty(data->cpumask));
238
239 raw_spin_lock(&call_function.lock);
240 list_del_rcu(&data->csd.list);
241 raw_spin_unlock(&call_function.lock);
242
214 csd_unlock(&data->csd); 243 csd_unlock(&data->csd);
215 } 244 }
216 245
@@ -430,7 +459,7 @@ void smp_call_function_many(const struct cpumask *mask,
430 * can't happen. 459 * can't happen.
431 */ 460 */
432 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() 461 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
433 && !oops_in_progress); 462 && !oops_in_progress && !early_boot_irqs_disabled);
434 463
435 /* So, what's a CPU they want? Ignoring this one. */ 464 /* So, what's a CPU they want? Ignoring this one. */
436 cpu = cpumask_first_and(mask, cpu_online_mask); 465 cpu = cpumask_first_and(mask, cpu_online_mask);
@@ -454,11 +483,21 @@ void smp_call_function_many(const struct cpumask *mask,
454 483
455 data = &__get_cpu_var(cfd_data); 484 data = &__get_cpu_var(cfd_data);
456 csd_lock(&data->csd); 485 csd_lock(&data->csd);
486 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
457 487
458 data->csd.func = func; 488 data->csd.func = func;
459 data->csd.info = info; 489 data->csd.info = info;
460 cpumask_and(data->cpumask, mask, cpu_online_mask); 490 cpumask_and(data->cpumask, mask, cpu_online_mask);
461 cpumask_clear_cpu(this_cpu, data->cpumask); 491 cpumask_clear_cpu(this_cpu, data->cpumask);
492
493 /*
 494 * To ensure the interrupt handler gets a complete view
495 * we order the cpumask and refs writes and order the read
496 * of them in the interrupt handler. In addition we may
497 * only clear our own cpu bit from the mask.
498 */
499 smp_wmb();
500
462 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 501 atomic_set(&data->refs, cpumask_weight(data->cpumask));
463 502
464 raw_spin_lock_irqsave(&call_function.lock, flags); 503 raw_spin_lock_irqsave(&call_function.lock, flags);
@@ -533,17 +572,20 @@ void ipi_call_unlock_irq(void)
533#endif /* USE_GENERIC_SMP_HELPERS */ 572#endif /* USE_GENERIC_SMP_HELPERS */
534 573
535/* 574/*
536 * Call a function on all processors 575 * Call a function on all processors. May be used during early boot while
576 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
577 * of local_irq_disable/enable().
537 */ 578 */
538int on_each_cpu(void (*func) (void *info), void *info, int wait) 579int on_each_cpu(void (*func) (void *info), void *info, int wait)
539{ 580{
581 unsigned long flags;
540 int ret = 0; 582 int ret = 0;
541 583
542 preempt_disable(); 584 preempt_disable();
543 ret = smp_call_function(func, info, wait); 585 ret = smp_call_function(func, info, wait);
544 local_irq_disable(); 586 local_irq_save(flags);
545 func(info); 587 func(info);
546 local_irq_enable(); 588 local_irq_restore(flags);
547 preempt_enable(); 589 preempt_enable();
548 return ret; 590 return ret;
549} 591}
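The smp.c hunks above pair an smp_wmb() in smp_call_function_many() (cpumask written before refs) with an smp_rmb() in generic_smp_call_function_interrupt() (cpumask checked before refs), so a handler that sees non-zero refs is guaranteed to see a valid cpumask. The sketch below is user-space C, not kernel code: C11 fences stand in for the kernel barriers, and the single-threaded main() merely drives the demo, whereas in the kernel the two sides run on different CPUs.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint cpumask;     /* one bit per cpu; the kernel uses cpumask_t */
static atomic_int refs;

static void sender_publish(unsigned int mask_bits, int ncpus)
{
        atomic_store_explicit(&cpumask, mask_bits, memory_order_relaxed);
        atomic_thread_fence(memory_order_release);      /* ~ smp_wmb() */
        atomic_store_explicit(&refs, ncpus, memory_order_relaxed);
}

static bool handler_should_run(int cpu)
{
        if (!(atomic_load_explicit(&cpumask, memory_order_relaxed) & (1u << cpu)))
                return false;
        atomic_thread_fence(memory_order_acquire);      /* ~ smp_rmb() */
        return atomic_load_explicit(&refs, memory_order_relaxed) != 0;
}

int main(void)
{
        sender_publish(0x6, 2);                         /* cpus 1 and 2 */
        printf("cpu1 runs callback: %d\n", handler_should_run(1));
        printf("cpu0 runs callback: %d\n", handler_should_run(0));
        return 0;
}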
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 98d8c1e80edb..73ce23feaea9 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -156,6 +156,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
156EXPORT_SYMBOL_GPL(__srcu_read_unlock); 156EXPORT_SYMBOL_GPL(__srcu_read_unlock);
157 157
158/* 158/*
159 * We use an adaptive strategy for synchronize_srcu() and especially for
160 * synchronize_srcu_expedited(). We spin for a fixed time period
161 * (defined below) to allow SRCU readers to exit their read-side critical
162 * sections. If there are still some readers after 10 microseconds,
163 * we repeatedly block for 1-millisecond time periods. This approach
164 * has done well in testing, so there is no need for a config parameter.
165 */
166#define SYNCHRONIZE_SRCU_READER_DELAY 10
167
168/*
159 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 169 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
160 */ 170 */
161static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 171static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
@@ -207,11 +217,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
207 * will have finished executing. We initially give readers 217 * will have finished executing. We initially give readers
208 * an arbitrarily chosen 10 microseconds to get out of their 218 * an arbitrarily chosen 10 microseconds to get out of their
209 * SRCU read-side critical sections, then loop waiting 1/HZ 219 * SRCU read-side critical sections, then loop waiting 1/HZ
210 * seconds per iteration. 220 * seconds per iteration. The 10-microsecond value has done
221 * very well in testing.
211 */ 222 */
212 223
213 if (srcu_readers_active_idx(sp, idx)) 224 if (srcu_readers_active_idx(sp, idx))
214 udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY); 225 udelay(SYNCHRONIZE_SRCU_READER_DELAY);
215 while (srcu_readers_active_idx(sp, idx)) 226 while (srcu_readers_active_idx(sp, idx))
216 schedule_timeout_interruptible(1); 227 schedule_timeout_interruptible(1);
217 228
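The srcu.c hunk above replaces the CONFIG_SRCU_SYNCHRONIZE_DELAY knob with a fixed SYNCHRONIZE_SRCU_READER_DELAY of 10 microseconds: wait briefly for readers, then fall back to roughly 1/HZ sleeps. A toy user-space sketch of that adaptive wait follows; it is not kernel code, and the fake reader count stands in for srcu_readers_active_idx().

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define SYNCHRONIZE_SRCU_READER_DELAY 10        /* microseconds */

static int fake_readers = 3;    /* pretend three readers drain over time */

static bool readers_active(void)
{
        return fake_readers-- > 0;
}

static void wait_for_readers(void)
{
        if (readers_active())
                usleep(SYNCHRONIZE_SRCU_READER_DELAY); /* short first wait */
        while (readers_active())
                usleep(1000);                          /* ~1/HZ per retry */
}

int main(void)
{
        wait_for_readers();
        puts("grace period complete");
        return 0;
}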
diff --git a/kernel/sys.c b/kernel/sys.c
index 31b71a276b40..18da702ec813 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1385,7 +1385,8 @@ static int check_prlimit_permission(struct task_struct *task)
1385 const struct cred *cred = current_cred(), *tcred; 1385 const struct cred *cred = current_cred(), *tcred;
1386 1386
1387 tcred = __task_cred(task); 1387 tcred = __task_cred(task);
1388 if ((cred->uid != tcred->euid || 1388 if (current != task &&
1389 (cred->uid != tcred->euid ||
1389 cred->uid != tcred->suid || 1390 cred->uid != tcred->suid ||
1390 cred->uid != tcred->uid || 1391 cred->uid != tcred->uid ||
1391 cred->gid != tcred->egid || 1392 cred->gid != tcred->egid ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index bc86bb32e126..0f1bd83db985 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -170,7 +170,8 @@ static int proc_taint(struct ctl_table *table, int write,
170#endif 170#endif
171 171
172#ifdef CONFIG_MAGIC_SYSRQ 172#ifdef CONFIG_MAGIC_SYSRQ
173static int __sysrq_enabled; /* Note: sysrq code ises it's own private copy */ 173/* Note: sysrq code uses its own private copy */
174static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
174 175
175static int sysrq_sysctl_handler(ctl_table *table, int write, 176static int sysrq_sysctl_handler(ctl_table *table, int write,
176 void __user *buffer, size_t *lenp, 177 void __user *buffer, size_t *lenp,
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c50a034de30f..6519cf62d9cd 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
113 * @shift: pointer to shift variable 113 * @shift: pointer to shift variable
114 * @from: frequency to convert from 114 * @from: frequency to convert from
115 * @to: frequency to convert to 115 * @to: frequency to convert to
116 * @minsec: guaranteed runtime conversion range in seconds 116 * @maxsec: guaranteed runtime conversion range in seconds
117 * 117 *
118 * The function evaluates the shift/mult pair for the scaled math 118 * The function evaluates the shift/mult pair for the scaled math
119 * operations of clocksources and clockevents. 119 * operations of clocksources and clockevents.
@@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock 122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
123 * event @to is the counter frequency and @from is NSEC_PER_SEC. 123 * event @to is the counter frequency and @from is NSEC_PER_SEC.
124 * 124 *
125 * The @minsec conversion range argument controls the time frame in 125 * The @maxsec conversion range argument controls the time frame in
126 * seconds which must be covered by the runtime conversion with the 126 * seconds which must be covered by the runtime conversion with the
127 * calculated mult and shift factors. This guarantees that no 64bit 127 * calculated mult and shift factors. This guarantees that no 64bit
128 * overflow happens when the input value of the conversion is 128 * overflow happens when the input value of the conversion is
@@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
131 * factors. 131 * factors.
132 */ 132 */
133void 133void
134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) 134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
135{ 135{
136 u64 tmp; 136 u64 tmp;
137 u32 sft, sftacc= 32; 137 u32 sft, sftacc= 32;
@@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
140 * Calculate the shift factor which is limiting the conversion 140 * Calculate the shift factor which is limiting the conversion
141 * range: 141 * range:
142 */ 142 */
143 tmp = ((u64)minsec * from) >> 32; 143 tmp = ((u64)maxsec * from) >> 32;
144 while (tmp) { 144 while (tmp) {
145 tmp >>=1; 145 tmp >>=1;
146 sftacc--; 146 sftacc--;
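The clocksource.c hunks above only rename the parameter (minsec becomes maxsec), but the surrounding kernel-doc describes how the mult/shift pair is chosen so that maxsec seconds of input cannot overflow 64 bits in ns = (cycles * mult) >> shift. Below is a user-space rendering for illustration; only the fragments visible in the hunks are guaranteed to match the kernel source, the rest is a best-effort reconstruction, and the 1 MHz example values are hypothetical.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static void clocks_calc_mult_shift(uint32_t *mult, uint32_t *shift,
                                   uint32_t from, uint32_t to, uint32_t maxsec)
{
        uint64_t tmp;
        uint32_t sft, sftacc = 32;

        /* limit the shift so maxsec seconds of input cannot overflow 64 bits */
        tmp = ((uint64_t)maxsec * from) >> 32;
        while (tmp) {
                tmp >>= 1;
                sftacc--;
        }

        /* pick the largest shift whose rounded mult still fits that limit */
        for (sft = 32; sft > 0; sft--) {
                tmp = (uint64_t)to << sft;
                tmp += from / 2;
                tmp /= from;
                if ((tmp >> sftacc) == 0)
                        break;
        }
        *mult = (uint32_t)tmp;
        *shift = sft;
}

int main(void)
{
        uint32_t mult, shift;

        /* hypothetical 1 MHz counter converted to nanoseconds, 600 s range */
        clocks_calc_mult_shift(&mult, &shift, 1000000, 1000000000, 600);
        printf("mult=%" PRIu32 " shift=%" PRIu32 "\n", mult, shift);

        /* one second worth of cycles should come back as ~1e9 ns */
        printf("ns=%" PRIu64 "\n", ((uint64_t)1000000 * mult) >> shift);
        return 0;
}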
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3e216e01bbd1..c55ea2433471 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -642,8 +642,7 @@ static void tick_nohz_switch_to_nohz(void)
642 } 642 }
643 local_irq_enable(); 643 local_irq_enable();
644 644
645 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", 645 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
646 smp_processor_id());
647} 646}
648 647
649/* 648/*
@@ -795,8 +794,10 @@ void tick_setup_sched_timer(void)
795 } 794 }
796 795
797#ifdef CONFIG_NO_HZ 796#ifdef CONFIG_NO_HZ
798 if (tick_nohz_enabled) 797 if (tick_nohz_enabled) {
799 ts->nohz_mode = NOHZ_MODE_HIGHRES; 798 ts->nohz_mode = NOHZ_MODE_HIGHRES;
799 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
800 }
800#endif 801#endif
801} 802}
802#endif /* HIGH_RES_TIMERS */ 803#endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5536aaf3ba36..d27c7562902c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -49,7 +49,7 @@ struct timekeeper {
49 u32 mult; 49 u32 mult;
50}; 50};
51 51
52struct timekeeper timekeeper; 52static struct timekeeper timekeeper;
53 53
54/** 54/**
55 * timekeeper_setup_internals - Set up internals to use clocksource clock. 55 * timekeeper_setup_internals - Set up internals to use clocksource clock.
@@ -164,7 +164,7 @@ static struct timespec total_sleep_time;
164/* 164/*
165 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. 165 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
166 */ 166 */
167struct timespec raw_time; 167static struct timespec raw_time;
168 168
169/* flag for if timekeeping is suspended */ 169/* flag for if timekeeping is suspended */
170int __read_mostly timekeeping_suspended; 170int __read_mostly timekeeping_suspended;
diff --git a/kernel/timer.c b/kernel/timer.c
index 43ca9936f2d0..d53ce66daea0 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -969,10 +969,14 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
969int del_timer_sync(struct timer_list *timer) 969int del_timer_sync(struct timer_list *timer)
970{ 970{
971#ifdef CONFIG_LOCKDEP 971#ifdef CONFIG_LOCKDEP
972 unsigned long flags;
973
974 raw_local_irq_save(flags);
972 local_bh_disable(); 975 local_bh_disable();
973 lock_map_acquire(&timer->lockdep_map); 976 lock_map_acquire(&timer->lockdep_map);
974 lock_map_release(&timer->lockdep_map); 977 lock_map_release(&timer->lockdep_map);
975 local_bh_enable(); 978 _local_bh_enable();
979 raw_local_irq_restore(flags);
976#endif 980#endif
977 /* 981 /*
978 * don't use it in hardirq context, because it 982 * don't use it in hardirq context, because it
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 153562d0b93c..d95721f33702 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -138,6 +138,13 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
138 !blk_tracer_enabled)) 138 !blk_tracer_enabled))
139 return; 139 return;
140 140
141 /*
142 * If the BLK_TC_NOTIFY action mask isn't set, don't send any note
143 * message to the trace.
144 */
145 if (!(bt->act_mask & BLK_TC_NOTIFY))
146 return;
147
141 local_irq_save(flags); 148 local_irq_save(flags);
142 buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); 149 buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
143 va_start(args, fmt); 150 va_start(args, fmt);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 35fde09b81de..5f499e0438a4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1284,7 +1284,7 @@ trace_create_file_ops(struct module *mod)
1284static void trace_module_add_events(struct module *mod) 1284static void trace_module_add_events(struct module *mod)
1285{ 1285{
1286 struct ftrace_module_file_ops *file_ops = NULL; 1286 struct ftrace_module_file_ops *file_ops = NULL;
1287 struct ftrace_event_call *call, *start, *end; 1287 struct ftrace_event_call **call, **start, **end;
1288 1288
1289 start = mod->trace_events; 1289 start = mod->trace_events;
1290 end = mod->trace_events + mod->num_trace_events; 1290 end = mod->trace_events + mod->num_trace_events;
@@ -1297,7 +1297,7 @@ static void trace_module_add_events(struct module *mod)
1297 return; 1297 return;
1298 1298
1299 for_each_event(call, start, end) { 1299 for_each_event(call, start, end) {
1300 __trace_add_event_call(call, mod, 1300 __trace_add_event_call(*call, mod,
1301 &file_ops->id, &file_ops->enable, 1301 &file_ops->id, &file_ops->enable,
1302 &file_ops->filter, &file_ops->format); 1302 &file_ops->filter, &file_ops->format);
1303 } 1303 }
@@ -1367,8 +1367,8 @@ static struct notifier_block trace_module_nb = {
1367 .priority = 0, 1367 .priority = 0,
1368}; 1368};
1369 1369
1370extern struct ftrace_event_call __start_ftrace_events[]; 1370extern struct ftrace_event_call *__start_ftrace_events[];
1371extern struct ftrace_event_call __stop_ftrace_events[]; 1371extern struct ftrace_event_call *__stop_ftrace_events[];
1372 1372
1373static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; 1373static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
1374 1374
@@ -1384,7 +1384,7 @@ __setup("trace_event=", setup_trace_event);
1384 1384
1385static __init int event_trace_init(void) 1385static __init int event_trace_init(void)
1386{ 1386{
1387 struct ftrace_event_call *call; 1387 struct ftrace_event_call **call;
1388 struct dentry *d_tracer; 1388 struct dentry *d_tracer;
1389 struct dentry *entry; 1389 struct dentry *entry;
1390 struct dentry *d_events; 1390 struct dentry *d_events;
@@ -1430,7 +1430,7 @@ static __init int event_trace_init(void)
1430 pr_warning("tracing: Failed to allocate common fields"); 1430 pr_warning("tracing: Failed to allocate common fields");
1431 1431
1432 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { 1432 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
1433 __trace_add_event_call(call, NULL, &ftrace_event_id_fops, 1433 __trace_add_event_call(*call, NULL, &ftrace_event_id_fops,
1434 &ftrace_enable_fops, 1434 &ftrace_enable_fops,
1435 &ftrace_event_filter_fops, 1435 &ftrace_event_filter_fops,
1436 &ftrace_event_format_fops); 1436 &ftrace_event_format_fops);
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4b74d71705c0..bbeec31e0ae3 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -161,13 +161,13 @@ struct ftrace_event_class event_class_ftrace_##call = { \
161 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ 161 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
162}; \ 162}; \
163 \ 163 \
164struct ftrace_event_call __used \ 164struct ftrace_event_call __used event_##call = { \
165__attribute__((__aligned__(4))) \
166__attribute__((section("_ftrace_events"))) event_##call = { \
167 .name = #call, \ 165 .name = #call, \
168 .event.type = etype, \ 166 .event.type = etype, \
169 .class = &event_class_ftrace_##call, \ 167 .class = &event_class_ftrace_##call, \
170 .print_fmt = print, \ 168 .print_fmt = print, \
171}; \ 169}; \
170struct ftrace_event_call __used \
171__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
172 172
173#include "trace_entries.h" 173#include "trace_entries.h"
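The trace_export.c hunk above (like the trace_events.c, trace_syscalls.c and tracepoint.c changes later in this diff) stops placing whole structures in the _ftrace_events section and instead places pointers to them there, presumably so that linker alignment of section contents cannot break array-style iteration. The following user-space sketch shows the same pointer-in-section pattern; it is not kernel code, the section and symbol names are illustrative, and it assumes GCC/ld on Linux, which emits __start_/__stop_ bounds symbols for sections whose names are valid C identifiers.

#include <stdio.h>

struct event {
        const char *name;
};

#define DEFINE_EVENT(call)                                              \
        static struct event event_##call = { .name = #call };          \
        static struct event *__event_##call                            \
        __attribute__((used, section("demo_events"))) = &event_##call

DEFINE_EVENT(sched_switch);
DEFINE_EVENT(irq_handler_entry);

extern struct event *__start_demo_events[];
extern struct event *__stop_demo_events[];

int main(void)
{
        struct event **call;

        /* walk the pointer array the linker assembled for us */
        for (call = __start_demo_events; call < __stop_demo_events; call++)
                printf("event: %s\n", (*call)->name);
        return 0;
}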
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 5cf8c602b880..92b6e1e12d98 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
453 * Stubs: 453 * Stubs:
454 */ 454 */
455 455
456void early_boot_irqs_off(void)
457{
458}
459
460void early_boot_irqs_on(void)
461{
462}
463
464void trace_softirqs_on(unsigned long ip) 456void trace_softirqs_on(unsigned long ip)
465{ 457{
466} 458}
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index bac752f0cfb5..5c9fe08d2093 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event,
23static int syscall_enter_define_fields(struct ftrace_event_call *call); 23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call); 24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25 25
26/* All syscall exit events have the same fields */
27static LIST_HEAD(syscall_exit_fields);
28
29static struct list_head * 26static struct list_head *
30syscall_get_enter_fields(struct ftrace_event_call *call) 27syscall_get_enter_fields(struct ftrace_event_call *call)
31{ 28{
@@ -34,50 +31,45 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
34 return &entry->enter_fields; 31 return &entry->enter_fields;
35} 32}
36 33
37static struct list_head *
38syscall_get_exit_fields(struct ftrace_event_call *call)
39{
40 return &syscall_exit_fields;
41}
42
43struct trace_event_functions enter_syscall_print_funcs = { 34struct trace_event_functions enter_syscall_print_funcs = {
44 .trace = print_syscall_enter, 35 .trace = print_syscall_enter,
45}; 36};
46 37
47struct trace_event_functions exit_syscall_print_funcs = { 38struct trace_event_functions exit_syscall_print_funcs = {
48 .trace = print_syscall_exit, 39 .trace = print_syscall_exit,
49}; 40};
50 41
51struct ftrace_event_class event_class_syscall_enter = { 42struct ftrace_event_class event_class_syscall_enter = {
52 .system = "syscalls", 43 .system = "syscalls",
53 .reg = syscall_enter_register, 44 .reg = syscall_enter_register,
54 .define_fields = syscall_enter_define_fields, 45 .define_fields = syscall_enter_define_fields,
55 .get_fields = syscall_get_enter_fields, 46 .get_fields = syscall_get_enter_fields,
56 .raw_init = init_syscall_trace, 47 .raw_init = init_syscall_trace,
57}; 48};
58 49
59struct ftrace_event_class event_class_syscall_exit = { 50struct ftrace_event_class event_class_syscall_exit = {
60 .system = "syscalls", 51 .system = "syscalls",
61 .reg = syscall_exit_register, 52 .reg = syscall_exit_register,
62 .define_fields = syscall_exit_define_fields, 53 .define_fields = syscall_exit_define_fields,
63 .get_fields = syscall_get_exit_fields, 54 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
64 .raw_init = init_syscall_trace, 55 .raw_init = init_syscall_trace,
65}; 56};
66 57
67extern unsigned long __start_syscalls_metadata[]; 58extern struct syscall_metadata *__start_syscalls_metadata[];
68extern unsigned long __stop_syscalls_metadata[]; 59extern struct syscall_metadata *__stop_syscalls_metadata[];
69 60
70static struct syscall_metadata **syscalls_metadata; 61static struct syscall_metadata **syscalls_metadata;
71 62
72static struct syscall_metadata *find_syscall_meta(unsigned long syscall) 63static __init struct syscall_metadata *
64find_syscall_meta(unsigned long syscall)
73{ 65{
74 struct syscall_metadata *start; 66 struct syscall_metadata **start;
75 struct syscall_metadata *stop; 67 struct syscall_metadata **stop;
76 char str[KSYM_SYMBOL_LEN]; 68 char str[KSYM_SYMBOL_LEN];
77 69
78 70
79 start = (struct syscall_metadata *)__start_syscalls_metadata; 71 start = __start_syscalls_metadata;
80 stop = (struct syscall_metadata *)__stop_syscalls_metadata; 72 stop = __stop_syscalls_metadata;
81 kallsyms_lookup(syscall, NULL, NULL, NULL, str); 73 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
82 74
83 for ( ; start < stop; start++) { 75 for ( ; start < stop; start++) {
@@ -87,8 +79,8 @@ static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
87 * with "SyS" instead of "sys", leading to an unwanted 79 * with "SyS" instead of "sys", leading to an unwanted
88 * mismatch. 80 * mismatch.
89 */ 81 */
90 if (start->name && !strcmp(start->name + 3, str + 3)) 82 if ((*start)->name && !strcmp((*start)->name + 3, str + 3))
91 return start; 83 return *start;
92 } 84 }
93 return NULL; 85 return NULL;
94} 86}
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index e95ee7f31d43..68187af4889e 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -27,8 +27,8 @@
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/jump_label.h> 28#include <linux/jump_label.h>
29 29
30extern struct tracepoint __start___tracepoints[]; 30extern struct tracepoint * const __start___tracepoints_ptrs[];
31extern struct tracepoint __stop___tracepoints[]; 31extern struct tracepoint * const __stop___tracepoints_ptrs[];
32 32
33/* Set to 1 to enable tracepoint debug output */ 33/* Set to 1 to enable tracepoint debug output */
34static const int tracepoint_debug; 34static const int tracepoint_debug;
@@ -298,10 +298,10 @@ static void disable_tracepoint(struct tracepoint *elem)
298 * 298 *
299 * Updates the probe callback corresponding to a range of tracepoints. 299 * Updates the probe callback corresponding to a range of tracepoints.
300 */ 300 */
301void 301void tracepoint_update_probe_range(struct tracepoint * const *begin,
302tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) 302 struct tracepoint * const *end)
303{ 303{
304 struct tracepoint *iter; 304 struct tracepoint * const *iter;
305 struct tracepoint_entry *mark_entry; 305 struct tracepoint_entry *mark_entry;
306 306
307 if (!begin) 307 if (!begin)
@@ -309,12 +309,12 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
309 309
310 mutex_lock(&tracepoints_mutex); 310 mutex_lock(&tracepoints_mutex);
311 for (iter = begin; iter < end; iter++) { 311 for (iter = begin; iter < end; iter++) {
312 mark_entry = get_tracepoint(iter->name); 312 mark_entry = get_tracepoint((*iter)->name);
313 if (mark_entry) { 313 if (mark_entry) {
314 set_tracepoint(&mark_entry, iter, 314 set_tracepoint(&mark_entry, *iter,
315 !!mark_entry->refcount); 315 !!mark_entry->refcount);
316 } else { 316 } else {
317 disable_tracepoint(iter); 317 disable_tracepoint(*iter);
318 } 318 }
319 } 319 }
320 mutex_unlock(&tracepoints_mutex); 320 mutex_unlock(&tracepoints_mutex);
@@ -326,8 +326,8 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
326static void tracepoint_update_probes(void) 326static void tracepoint_update_probes(void)
327{ 327{
328 /* Core kernel tracepoints */ 328 /* Core kernel tracepoints */
329 tracepoint_update_probe_range(__start___tracepoints, 329 tracepoint_update_probe_range(__start___tracepoints_ptrs,
330 __stop___tracepoints); 330 __stop___tracepoints_ptrs);
331 /* tracepoints in modules. */ 331 /* tracepoints in modules. */
332 module_update_tracepoints(); 332 module_update_tracepoints();
333} 333}
@@ -514,8 +514,8 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
514 * Will return the first tracepoint in the range if the input tracepoint is 514 * Will return the first tracepoint in the range if the input tracepoint is
515 * NULL. 515 * NULL.
516 */ 516 */
517int tracepoint_get_iter_range(struct tracepoint **tracepoint, 517int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
518 struct tracepoint *begin, struct tracepoint *end) 518 struct tracepoint * const *begin, struct tracepoint * const *end)
519{ 519{
520 if (!*tracepoint && begin != end) { 520 if (!*tracepoint && begin != end) {
521 *tracepoint = begin; 521 *tracepoint = begin;
@@ -534,7 +534,8 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter)
534 /* Core kernel tracepoints */ 534 /* Core kernel tracepoints */
535 if (!iter->module) { 535 if (!iter->module) {
536 found = tracepoint_get_iter_range(&iter->tracepoint, 536 found = tracepoint_get_iter_range(&iter->tracepoint,
537 __start___tracepoints, __stop___tracepoints); 537 __start___tracepoints_ptrs,
538 __stop___tracepoints_ptrs);
538 if (found) 539 if (found)
539 goto end; 540 goto end;
540 } 541 }
@@ -585,8 +586,8 @@ int tracepoint_module_notify(struct notifier_block *self,
585 switch (val) { 586 switch (val) {
586 case MODULE_STATE_COMING: 587 case MODULE_STATE_COMING:
587 case MODULE_STATE_GOING: 588 case MODULE_STATE_GOING:
588 tracepoint_update_probe_range(mod->tracepoints, 589 tracepoint_update_probe_range(mod->tracepoints_ptrs,
589 mod->tracepoints + mod->num_tracepoints); 590 mod->tracepoints_ptrs + mod->num_tracepoints);
590 break; 591 break;
591 } 592 }
592 return 0; 593 return 0;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d7ebdf4cea98..f37f974aa81b 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -27,7 +27,7 @@
27#include <asm/irq_regs.h> 27#include <asm/irq_regs.h>
28#include <linux/perf_event.h> 28#include <linux/perf_event.h>
29 29
30int watchdog_enabled; 30int watchdog_enabled = 1;
31int __read_mostly softlockup_thresh = 60; 31int __read_mostly softlockup_thresh = 60;
32 32
33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 33static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -43,9 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif 44#endif
45 45
46static int no_watchdog;
47
48
49/* boot commands */ 46/* boot commands */
50/* 47/*
51 * Should we panic when a soft-lockup or hard-lockup occurs: 48 * Should we panic when a soft-lockup or hard-lockup occurs:
@@ -58,7 +55,7 @@ static int __init hardlockup_panic_setup(char *str)
58 if (!strncmp(str, "panic", 5)) 55 if (!strncmp(str, "panic", 5))
59 hardlockup_panic = 1; 56 hardlockup_panic = 1;
60 else if (!strncmp(str, "0", 1)) 57 else if (!strncmp(str, "0", 1))
61 no_watchdog = 1; 58 watchdog_enabled = 0;
62 return 1; 59 return 1;
63} 60}
64__setup("nmi_watchdog=", hardlockup_panic_setup); 61__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -77,7 +74,7 @@ __setup("softlockup_panic=", softlockup_panic_setup);
77 74
78static int __init nowatchdog_setup(char *str) 75static int __init nowatchdog_setup(char *str)
79{ 76{
80 no_watchdog = 1; 77 watchdog_enabled = 0;
81 return 1; 78 return 1;
82} 79}
83__setup("nowatchdog", nowatchdog_setup); 80__setup("nowatchdog", nowatchdog_setup);
@@ -85,7 +82,7 @@ __setup("nowatchdog", nowatchdog_setup);
85/* deprecated */ 82/* deprecated */
86static int __init nosoftlockup_setup(char *str) 83static int __init nosoftlockup_setup(char *str)
87{ 84{
88 no_watchdog = 1; 85 watchdog_enabled = 0;
89 return 1; 86 return 1;
90} 87}
91__setup("nosoftlockup", nosoftlockup_setup); 88__setup("nosoftlockup", nosoftlockup_setup);
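With watchdog_enabled now defaulting to 1, the three boot options above no longer funnel through a separate no_watchdog flag; each one clears the single global switch directly. The following plain-C sketch, with a hypothetical parse_boot_option() standing in for the __setup() handlers, shows the resulting behaviour; it is a model under those assumptions, not the kernel's parameter machinery.

	#include <stdio.h>
	#include <string.h>

	static int watchdog_enabled = 1;	/* on by default, as in the patch */
	static int hardlockup_panic;

	static void parse_boot_option(const char *opt)
	{
		if (!strncmp(opt, "nmi_watchdog=", 13)) {
			const char *val = opt + 13;

			if (!strncmp(val, "panic", 5))
				hardlockup_panic = 1;
			else if (!strncmp(val, "0", 1))
				watchdog_enabled = 0;
		} else if (!strcmp(opt, "nowatchdog") ||
			   !strcmp(opt, "nosoftlockup")) {
			/* both options simply clear the one global flag */
			watchdog_enabled = 0;
		}
	}

	int main(void)
	{
		parse_boot_option("nmi_watchdog=0");
		printf("watchdog_enabled=%d hardlockup_panic=%d\n",
		       watchdog_enabled, hardlockup_panic);
		return 0;
	}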
@@ -432,9 +429,6 @@ static int watchdog_enable(int cpu)
432 wake_up_process(p); 429 wake_up_process(p);
433 } 430 }
434 431
435 /* if any cpu succeeds, watchdog is considered enabled for the system */
436 watchdog_enabled = 1;
437
438 return 0; 432 return 0;
439} 433}
440 434
@@ -462,12 +456,16 @@ static void watchdog_disable(int cpu)
462static void watchdog_enable_all_cpus(void) 456static void watchdog_enable_all_cpus(void)
463{ 457{
464 int cpu; 458 int cpu;
465 int result = 0; 459
460 watchdog_enabled = 0;
466 461
467 for_each_online_cpu(cpu) 462 for_each_online_cpu(cpu)
468 result += watchdog_enable(cpu); 463 if (!watchdog_enable(cpu))
464 /* if any cpu succeeds, watchdog is considered
465 enabled for the system */
466 watchdog_enabled = 1;
469 467
470 if (result) 468 if (!watchdog_enabled)
471 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); 469 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
472 470
473} 471}
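The hunk above changes the aggregation rule: instead of summing return codes, watchdog_enable_all_cpus() clears the global flag up front and sets it again as soon as any CPU comes up successfully. A minimal userspace model of that rule, assuming a hypothetical try_enable_cpu() in place of watchdog_enable():

	#include <stdio.h>

	#define NR_CPUS 4

	static int watchdog_enabled;

	static int try_enable_cpu(int cpu)
	{
		/* pretend CPU 0 fails and the rest succeed */
		return cpu == 0 ? -1 : 0;
	}

	static void enable_all_cpus(void)
	{
		int cpu;

		watchdog_enabled = 0;

		for (cpu = 0; cpu < NR_CPUS; cpu++)
			if (!try_enable_cpu(cpu))
				/* one success is enough to call the system enabled */
				watchdog_enabled = 1;

		if (!watchdog_enabled)
			printf("watchdog: failed to be enabled on some cpus\n");
	}

	int main(void)
	{
		enable_all_cpus();
		printf("watchdog_enabled=%d\n", watchdog_enabled);
		return 0;
	}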
@@ -476,9 +474,6 @@ static void watchdog_disable_all_cpus(void)
476{ 474{
477 int cpu; 475 int cpu;
478 476
479 if (no_watchdog)
480 return;
481
482 for_each_online_cpu(cpu) 477 for_each_online_cpu(cpu)
483 watchdog_disable(cpu); 478 watchdog_disable(cpu);
484 479
@@ -498,10 +493,12 @@ int proc_dowatchdog_enabled(struct ctl_table *table, int write,
498{ 493{
499 proc_dointvec(table, write, buffer, length, ppos); 494 proc_dointvec(table, write, buffer, length, ppos);
500 495
501 if (watchdog_enabled) 496 if (write) {
502 watchdog_enable_all_cpus(); 497 if (watchdog_enabled)
503 else 498 watchdog_enable_all_cpus();
504 watchdog_disable_all_cpus(); 499 else
500 watchdog_disable_all_cpus();
501 }
505 return 0; 502 return 0;
506} 503}
507 504
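The proc handler change above makes the enable/disable paths run only on a write; a read of the sysctl must not restart or stop the per-CPU threads as a side effect. A short sketch of that pattern follows; handle_watchdog_knob(), do_enable() and do_disable() are illustrative names, not the kernel's API.

	#include <stdio.h>

	static int watchdog_enabled = 1;

	static void do_enable(void)  { printf("enable all cpus\n"); }
	static void do_disable(void) { printf("disable all cpus\n"); }

	static int handle_watchdog_knob(int write, int new_value)
	{
		if (write) {
			watchdog_enabled = new_value;	/* proc_dointvec() analogue */
			if (watchdog_enabled)
				do_enable();
			else
				do_disable();
		}
		/* a read just reports the current value, with no side effects */
		return watchdog_enabled;
	}

	int main(void)
	{
		handle_watchdog_knob(0, 0);	/* read: nothing is toggled */
		handle_watchdog_knob(1, 0);	/* write 0: disables */
		return 0;
	}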
@@ -530,7 +527,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
530 break; 527 break;
531 case CPU_ONLINE: 528 case CPU_ONLINE:
532 case CPU_ONLINE_FROZEN: 529 case CPU_ONLINE_FROZEN:
533 err = watchdog_enable(hotcpu); 530 if (watchdog_enabled)
531 err = watchdog_enable(hotcpu);
534 break; 532 break;
535#ifdef CONFIG_HOTPLUG_CPU 533#ifdef CONFIG_HOTPLUG_CPU
536 case CPU_UP_CANCELED: 534 case CPU_UP_CANCELED:
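The hotplug hunk above gates watchdog_enable() on the global flag, so a CPU brought online while the watchdog is off (for example after booting with nowatchdog) stays quiet. A tiny model of that gate, with hypothetical helper names:

	#include <stdio.h>

	static int watchdog_enabled;	/* e.g. cleared by "nowatchdog" */

	static int watchdog_enable_cpu(int cpu)
	{
		printf("watchdog started on cpu %d\n", cpu);
		return 0;
	}

	static void cpu_online_event(int cpu)
	{
		if (watchdog_enabled)
			watchdog_enable_cpu(cpu);
		else
			printf("cpu %d online, watchdog stays off\n", cpu);
	}

	int main(void)
	{
		cpu_online_event(2);
		return 0;
	}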
@@ -555,9 +553,6 @@ void __init lockup_detector_init(void)
555 void *cpu = (void *)(long)smp_processor_id(); 553 void *cpu = (void *)(long)smp_processor_id();
556 int err; 554 int err;
557 555
558 if (no_watchdog)
559 return;
560
561 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 556 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
562 WARN_ON(notifier_to_errno(err)); 557 WARN_ON(notifier_to_errno(err));
563 558
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8ee6ec82f88a..11869faa6819 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -768,7 +768,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
768 768
769 worker->flags &= ~flags; 769 worker->flags &= ~flags;
770 770
771 /* if transitioning out of NOT_RUNNING, increment nr_running */ 771 /*
772 * If transitioning out of NOT_RUNNING, increment nr_running. Note
773 * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask
774 * of multiple flags, not a single flag.
775 */
772 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 776 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
773 if (!(worker->flags & WORKER_NOT_RUNNING)) 777 if (!(worker->flags & WORKER_NOT_RUNNING))
774 atomic_inc(get_gcwq_nr_running(gcwq->cpu)); 778 atomic_inc(get_gcwq_nr_running(gcwq->cpu));
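The expanded comment in the hunk above is worth illustrating: because WORKER_NOT_RUNNING is a mask covering several flags, clearing one of them while another is still set must not bump nr_running; only the transition out of the whole mask counts. The self-contained example below demonstrates that, with simplified flag names and a plain counter standing in for the per-gcwq atomic.

	#include <stdio.h>

	enum {
		WORKER_ROGUE		= 1 << 0,
		WORKER_REBIND		= 1 << 1,
		WORKER_NOT_RUNNING	= WORKER_ROGUE | WORKER_REBIND,
	};

	static int nr_running;

	static void worker_clr_flags(unsigned int *worker_flags, unsigned int flags)
	{
		unsigned int oflags = *worker_flags;

		*worker_flags &= ~flags;

		/* only the transition out of the whole mask counts */
		if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
			if (!(*worker_flags & WORKER_NOT_RUNNING))
				nr_running++;
	}

	int main(void)
	{
		unsigned int w = WORKER_ROGUE | WORKER_REBIND;

		worker_clr_flags(&w, WORKER_ROGUE);	/* still NOT_RUNNING: no inc */
		worker_clr_flags(&w, WORKER_REBIND);	/* last flag cleared: inc */
		printf("nr_running=%d\n", nr_running);	/* prints 1 */
		return 0;
	}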
@@ -1840,7 +1844,7 @@ __acquires(&gcwq->lock)
1840 spin_unlock_irq(&gcwq->lock); 1844 spin_unlock_irq(&gcwq->lock);
1841 1845
1842 work_clear_pending(work); 1846 work_clear_pending(work);
1843 lock_map_acquire(&cwq->wq->lockdep_map); 1847 lock_map_acquire_read(&cwq->wq->lockdep_map);
1844 lock_map_acquire(&lockdep_map); 1848 lock_map_acquire(&lockdep_map);
1845 trace_workqueue_execute_start(work); 1849 trace_workqueue_execute_start(work);
1846 f(work); 1850 f(work);
@@ -2384,8 +2388,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2384 insert_wq_barrier(cwq, barr, work, worker); 2388 insert_wq_barrier(cwq, barr, work, worker);
2385 spin_unlock_irq(&gcwq->lock); 2389 spin_unlock_irq(&gcwq->lock);
2386 2390
2387 lock_map_acquire(&cwq->wq->lockdep_map); 2391 /*
2392 * If @max_active is 1 or rescuer is in use, flushing another work
2393 * item on the same workqueue may lead to deadlock. Make sure the
2394 * flusher is not running on the same workqueue by verifying write
2395 * access.
2396 */
2397 if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
2398 lock_map_acquire(&cwq->wq->lockdep_map);
2399 else
2400 lock_map_acquire_read(&cwq->wq->lockdep_map);
2388 lock_map_release(&cwq->wq->lockdep_map); 2401 lock_map_release(&cwq->wq->lockdep_map);
2402
2389 return true; 2403 return true;
2390already_gone: 2404already_gone:
2391 spin_unlock_irq(&gcwq->lock); 2405 spin_unlock_irq(&gcwq->lock);
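Taken together, the two workqueue hunks above refine the lockdep annotations: work execution now takes a read acquire, while a flusher takes the full (write) acquire only when the queue is strictly ordered (max_active == 1) or depends on a rescuer, since in those cases flushing one work item from another on the same workqueue can deadlock. The plain-C sketch below shows just that branch; acquire_read()/acquire_write() are stand-ins for lock_map_acquire_read()/lock_map_acquire(), and the struct layout is illustrative, not the kernel's.

	#include <stdio.h>

	#define WQ_RESCUER	(1 << 0)

	struct wq {
		const char	*name;
		unsigned int	flags;
		int		saved_max_active;
	};

	static void acquire_read(struct wq *wq)  { printf("read acquire  %s\n", wq->name); }
	static void acquire_write(struct wq *wq) { printf("write acquire %s\n", wq->name); }

	static void flush_work_annotation(struct wq *wq)
	{
		if (wq->saved_max_active == 1 || (wq->flags & WQ_RESCUER))
			acquire_write(wq);	/* flush vs. execution may deadlock */
		else
			acquire_read(wq);	/* other items cannot block this flush */
	}

	int main(void)
	{
		struct wq ordered = { "ordered_wq", 0, 1 };
		struct wq normal  = { "events",     0, 8 };

		flush_work_annotation(&ordered);
		flush_work_annotation(&normal);
		return 0;
	}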