52 files changed, 969 insertions, 526 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b5ff083fa22..353d3fe8ba33 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,7 +43,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
+obj-$(CONFIG_SMP) += smp.o
 ifneq ($(CONFIG_SMP),y)
 obj-y += up.o
 endif
@@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
+obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
@@ -121,7 +122,7 @@ $(obj)/configs.o: $(obj)/config_data.h
 # config_data.h contains the same information as ikconfig.h but gzipped.
 # Info from config_data can be extracted from /proc/config*
 targets += config_data.gz
-$(obj)/config_data.gz: .config FORCE
+$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
        $(call if_changed,gzip)
 quiet_cmd_ikconfiggz = IKCFG   $@
diff --git a/kernel/audit.c b/kernel/audit.c
index 77770a034d59..e4956244ae50 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -400,7 +400,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
        if (err < 0) {
                BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
                printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
-                audit_log_lost("auditd dissapeared\n");
+                audit_log_lost("auditd disappeared\n");
                audit_pid = 0;
                /* we might get lucky and get this in the next auditd */
                audit_hold_skb(skb);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66a416b42c18..b24d7027b83c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
 */
 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int cgroup_populate_dir(struct cgroup *cgrp);
 static const struct inode_operations cgroup_dir_inode_operations;
@@ -860,6 +861,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
        iput(inode);
 }
+static int cgroup_delete(const struct dentry *d)
+{
+        return 1;
+}
 static void remove_dir(struct dentry *d)
 {
        struct dentry *parent = dget(d->d_parent);
@@ -874,25 +880,29 @@ static void cgroup_clear_directory(struct dentry *dentry)
        struct list_head *node;
        BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
-        spin_lock(&dcache_lock);
+        spin_lock(&dentry->d_lock);
        node = dentry->d_subdirs.next;
        while (node != &dentry->d_subdirs) {
                struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+                spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
                list_del_init(node);
                if (d->d_inode) {
                        /* This should never be called on a cgroup
                         * directory with child cgroups */
                        BUG_ON(d->d_inode->i_mode & S_IFDIR);
-                        d = dget_locked(d);
+                        dget_dlock(d);
-                        spin_unlock(&dcache_lock);
+                        spin_unlock(&d->d_lock);
+                        spin_unlock(&dentry->d_lock);
                        d_delete(d);
                        simple_unlink(dentry->d_inode, d);
                        dput(d);
-                        spin_lock(&dcache_lock);
+                        spin_lock(&dentry->d_lock);
-                }
+                } else
+                        spin_unlock(&d->d_lock);
                node = dentry->d_subdirs.next;
        }
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dentry->d_lock);
 }
 /*
@@ -900,11 +910,16 @@ static void cgroup_clear_directory(struct dentry *dentry)
 */
 static void cgroup_d_remove_dir(struct dentry *dentry)
 {
+        struct dentry *parent;
        cgroup_clear_directory(dentry);
-        spin_lock(&dcache_lock);
+        parent = dentry->d_parent;
+        spin_lock(&parent->d_lock);
+        spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
        list_del_init(&dentry->d_u.d_child);
-        spin_unlock(&dcache_lock);
+        spin_unlock(&dentry->d_lock);
+        spin_unlock(&parent->d_lock);
        remove_dir(dentry);
 }
@@ -1440,6 +1455,11 @@ static int cgroup_set_super(struct super_block *sb, void *data)
 static int cgroup_get_rootdir(struct super_block *sb)
 {
+        static const struct dentry_operations cgroup_dops = {
+                .d_iput = cgroup_diput,
+                .d_delete = cgroup_delete,
+        };
        struct inode *inode =
                cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
        struct dentry *dentry;
@@ -1457,6 +1477,8 @@ static int cgroup_get_rootdir(struct super_block *sb)
                return -ENOMEM;
        }
        sb->s_root = dentry;
+        /* for everything else we want ->d_op set */
+        sb->s_d_op = &cgroup_dops;
        return 0;
 }
@@ -2180,12 +2202,20 @@ static const struct file_operations cgroup_file_operations = {
 };
 static const struct inode_operations cgroup_dir_inode_operations = {
-        .lookup = simple_lookup,
+        .lookup = cgroup_lookup,
        .mkdir = cgroup_mkdir,
        .rmdir = cgroup_rmdir,
        .rename = cgroup_rename,
 };
+static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+        if (dentry->d_name.len > NAME_MAX)
+                return ERR_PTR(-ENAMETOOLONG);
+        d_add(dentry, NULL);
+        return NULL;
+}
 /*
 * Check if a file is a control file
 */
@@ -2199,10 +2229,6 @@ static inline struct cftype *__file_cft(struct file *file)
 static int cgroup_create_file(struct dentry *dentry, mode_t mode,
                                struct super_block *sb)
 {
-        static const struct dentry_operations cgroup_dops = {
-                .d_iput = cgroup_diput,
-        };
        struct inode *inode;
        if (!dentry)
@@ -2228,7 +2254,6 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
                inode->i_size = 0;
                inode->i_fop = &cgroup_file_operations;
        }
-        dentry->d_op = &cgroup_dops;
        d_instantiate(dentry, inode);
        dget(dentry);   /* Extra count - pin the dentry in core */
        return 0;
@@ -3638,9 +3663,7 @@ again:
        list_del(&cgrp->sibling);
        cgroup_unlock_hierarchy(cgrp->root);
-        spin_lock(&cgrp->dentry->d_lock);
        d = dget(cgrp->dentry);
-        spin_unlock(&d->d_lock);
        cgroup_d_remove_dir(d);
        dput(d);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index a6e729766821..bd3e8e29caa3 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2914,7 +2914,7 @@ static void __init kdb_cmd_init(void)
        }
 }
-/* Intialize kdb_printf, breakpoint tables and kdb state */
+/* Initialize kdb_printf, breakpoint tables and kdb state */
 void __init kdb_init(int lvl)
 {
        static int kdb_init_lvl = KDB_NOT_INITIALIZED;
diff --git a/kernel/exit.c b/kernel/exit.c
index 676149a4ac5f..f9a45ebcc7b1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,7 +69,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
                list_del_rcu(&p->tasks);
                list_del_init(&p->sibling);
-                __get_cpu_var(process_counts)--;
+                __this_cpu_dec(process_counts);
        }
        list_del_rcu(&p->thread_group);
 }
@@ -994,6 +994,15 @@ NORET_TYPE void do_exit(long code)
        exit_fs(tsk);
        check_stack_usage();
        exit_thread();
+        /*
+         * Flush inherited counters to the parent - before the parent
+         * gets woken up by child-exit notifications.
+         *
+         * Because of cgroup mode, this must be called before cgroup_exit().
+         */
+        perf_event_exit_task(tsk);
        cgroup_exit(tsk, 1);
        if (group_dead)
@@ -1007,11 +1016,6 @@ NORET_TYPE void do_exit(long code)
         * FIXME: do that only when needed, using sched_exit tracepoint
         */
        flush_ptrace_hw_breakpoint(tsk);
-        /*
-         * Flush inherited counters to the parent - before the parent
-         * gets woken up by child-exit notifications.
-         */
-        perf_event_exit_task(tsk);
        exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
diff --git a/kernel/fork.c b/kernel/fork.c
index 7d164e25b0f0..25e429152ddc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
 #include <linux/posix-timers.h>
 #include <linux/user-return-notifier.h>
 #include <linux/oom.h>
+#include <linux/khugepaged.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -169,15 +170,14 @@ EXPORT_SYMBOL(free_task);
 static inline void free_signal_struct(struct signal_struct *sig)
 {
        taskstats_tgid_free(sig);
+        sched_autogroup_exit(sig);
        kmem_cache_free(signal_cachep, sig);
 }
 static inline void put_signal_struct(struct signal_struct *sig)
 {
-        if (atomic_dec_and_test(&sig->sigcnt)) {
+        if (atomic_dec_and_test(&sig->sigcnt))
-                sched_autogroup_exit(sig);
                free_signal_struct(sig);
-        }
 }
 void __put_task_struct(struct task_struct *tsk)
@@ -331,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
        retval = ksm_fork(mm, oldmm);
        if (retval)
                goto out;
+        retval = khugepaged_fork(mm, oldmm);
+        if (retval)
+                goto out;
        prev = NULL;
        for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -530,6 +533,9 @@ void __mmdrop(struct mm_struct *mm)
        mm_free_pgd(mm);
        destroy_context(mm);
        mmu_notifier_mm_destroy(mm);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        VM_BUG_ON(mm->pmd_huge_pte);
+#endif
        free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -544,6 +550,7 @@ void mmput(struct mm_struct *mm)
        if (atomic_dec_and_test(&mm->mm_users)) {
                exit_aio(mm);
                ksm_exit(mm);
+                khugepaged_exit(mm); /* must run before exit_mmap */
                exit_mmap(mm);
                set_mm_exe_file(mm, NULL);
                if (!list_empty(&mm->mmlist)) {
@@ -670,6 +677,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
        mm->token_priority = 0;
        mm->last_interval = 0;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        mm->pmd_huge_pte = NULL;
+#endif
        if (!mm_init(mm, tsk))
                goto fail_nomem;
@@ -911,6 +922,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
        sig->oom_adj = current->signal->oom_adj;
        sig->oom_score_adj = current->signal->oom_score_adj;
+        sig->oom_score_adj_min = current->signal->oom_score_adj_min;
        mutex_init(&sig->cred_guard_mutex);
@@ -1286,7 +1298,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                        attach_pid(p, PIDTYPE_SID, task_session(current));
                        list_add_tail(&p->sibling, &p->real_parent->children);
                        list_add_tail_rcu(&p->tasks, &init_task.tasks);
-                        __get_cpu_var(process_counts)++;
+                        __this_cpu_inc(process_counts);
                }
                attach_pid(p, PIDTYPE_PID, pid);
                nr_threads++;
@@ -1318,7 +1330,7 @@ bad_fork_cleanup_mm:
        }
 bad_fork_cleanup_signal:
        if (!(clone_flags & CLONE_THREAD))
-                put_signal_struct(p->signal);
+                free_signal_struct(p->signal);
 bad_fork_cleanup_sighand:
        __cleanup_sighand(p->sighand);
 bad_fork_cleanup_fs:
@@ -1411,23 +1423,6 @@ long do_fork(unsigned long clone_flags,
        }
        /*
-         * We hope to recycle these flags after 2.6.26
-         */
-        if (unlikely(clone_flags & CLONE_STOPPED)) {
-                static int __read_mostly count = 100;
-                if (count > 0 && printk_ratelimit()) {
-                        char comm[TASK_COMM_LEN];
-                        count--;
-                        printk(KERN_INFO "fork(): process `%s' used deprecated "
-                                        "clone flags 0x%lx\n",
-                                get_task_comm(comm, current),
-                                clone_flags & CLONE_STOPPED);
-                }
-        }
-        /*
         * When called from kernel_thread, don't do user tracing stuff.
         */
        if (likely(user_mode(regs)))
@@ -1465,16 +1460,7 @@ long do_fork(unsigned long clone_flags,
                 */
                p->flags &= ~PF_STARTING;
-                if (unlikely(clone_flags & CLONE_STOPPED)) {
+                wake_up_new_task(p, clone_flags);
-                        /*
-                         * We'll start up with an immediate SIGSTOP.
-                         */
-                        sigaddset(&p->pending.signal, SIGSTOP);
-                        set_tsk_thread_flag(p, TIF_SIGPENDING);
-                        __set_task_state(p, TASK_STOPPED);
-                } else {
-                        wake_up_new_task(p, clone_flags);
-                }
                tracehook_report_clone_complete(trace, regs,
                                                clone_flags, nr, p);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index bd1d42b17cb2..66ecd2ead215 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -104,8 +104,13 @@ bool freeze_task(struct task_struct *p, bool sig_only)
        }
        if (should_send_signal(p)) {
-                if (!signal_pending(p))
+                fake_signal_wake_up(p);
-                        fake_signal_wake_up(p);
+                /*
+                 * fake_signal_wake_up() goes through p's scheduler
+                 * lock and guarantees that TASK_STOPPED/TRACED ->
+                 * TASK_RUNNING transition can't race with task state
+                 * testing in try_to_freeze_tasks().
+                 */
        } else if (sig_only) {
                return false;
        } else {
diff --git a/kernel/futex.c b/kernel/futex.c
index 3019b92e6917..b766d28accd6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -233,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
 {
        unsigned long address = (unsigned long)uaddr;
        struct mm_struct *mm = current->mm;
-        struct page *page;
+        struct page *page, *page_head;
        int err;
        /*
@@ -265,11 +265,46 @@ again:
        if (err < 0)
                return err;
-        page = compound_head(page);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-        lock_page(page);
+        page_head = page;
-        if (!page->mapping) {
+        if (unlikely(PageTail(page))) {
-                unlock_page(page);
                put_page(page);
+                /* serialize against __split_huge_page_splitting() */
+                local_irq_disable();
+                if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
+                        page_head = compound_head(page);
+                        /*
+                         * page_head is a valid pointer but we must pin
+                         * it before taking the PG_lock and/or
+                         * PG_compound_lock. The moment we re-enable
+                         * irqs __split_huge_page_splitting() can
+                         * return and the head page can be freed from
+                         * under us. We can't take the PG_lock and/or
+                         * PG_compound_lock on a page that could be
+                         * freed from under us.
+                         */
+                        if (page != page_head) {
+                                get_page(page_head);
+                                put_page(page);
+                        }
+                        local_irq_enable();
+                } else {
+                        local_irq_enable();
+                        goto again;
+                }
+        }
+#else
+        page_head = compound_head(page);
+        if (page != page_head) {
+                get_page(page_head);
+                put_page(page);
+        }
+#endif
+        lock_page(page_head);
+        if (!page_head->mapping) {
+                unlock_page(page_head);
+                put_page(page_head);
                goto again;
        }
@@ -280,20 +315,20 @@ again:
         * it's a read-only handle, it's expected that futexes attach to
         * the object not the particular process.
         */
-        if (PageAnon(page)) {
+        if (PageAnon(page_head)) {
                key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
                key->private.mm = mm;
                key->private.address = address;
        } else {
                key->both.offset |= FUT_OFF_INODE; /* inode-based key */
-                key->shared.inode = page->mapping->host;
+                key->shared.inode = page_head->mapping->host;
-                key->shared.pgoff = page->index;
+                key->shared.pgoff = page_head->index;
        }
        get_futex_key_refs(key);
-        unlock_page(page);
+        unlock_page(page_head);
-        put_page(page);
+        put_page(page_head);
        return 0;
 }
@@ -791,10 +826,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
        new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
        /*
-         * This happens when we have stolen the lock and the original
+         * It is possible that the next waiter (the one that brought
-         * pending owner did not enqueue itself back on the rt_mutex.
+         * this owner to the kernel) timed out and is no longer
-         * Thats not a tragedy. We know that way, that a lock waiter
+         * waiting on the lock.
-         * is on the fly. We make the futex_q waiter the pending owner.
         */
        if (!new_owner)
                new_owner = this->task;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f2429fc3438c..0c8d7c048615 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -497,7 +497,7 @@ static inline int hrtimer_is_hres_enabled(void)
 */
 static inline int hrtimer_hres_active(void)
 {
-        return __get_cpu_var(hrtimer_bases).hres_active;
+        return __this_cpu_read(hrtimer_bases.hres_active);
 }
 /*
@@ -1745,7 +1745,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
        }
        /*
-         * A NULL parameter means "inifinte"
+         * A NULL parameter means "infinite"
         */
        if (!expires) {
                schedule();
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9988d03797f5..282f20230e67 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; }
 static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
 {
+        int cpu;
        desc->irq_data.irq = irq;
        desc->irq_data.chip = &no_irq_chip;
        desc->irq_data.chip_data = NULL;
@@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
        desc->irq_count = 0;
        desc->irqs_unhandled = 0;
        desc->name = NULL;
-        memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
+        for_each_possible_cpu(cpu)
+                *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
        desc_smp_init(desc, node);
 }
@@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
        if (!desc)
                return NULL;
        /* allocate based on nr_cpu_ids */
-        desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs),
+        desc->kstat_irqs = alloc_percpu(unsigned int);
-                                         gfp, node);
        if (!desc->kstat_irqs)
                goto err_desc;
@@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
        return desc;
 err_kstat:
-        kfree(desc->kstat_irqs);
+        free_percpu(desc->kstat_irqs);
 err_desc:
        kfree(desc);
        return NULL;
@@ -166,7 +168,7 @@ static void free_desc(unsigned int irq)
        mutex_unlock(&sparse_irq_lock);
        free_masks(desc);
-        kfree(desc->kstat_irqs);
+        free_percpu(desc->kstat_irqs);
        kfree(desc);
 }
@@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
        }
 };
-static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
 int __init early_irq_init(void)
 {
        int count, i, node = first_online_node;
@@ -250,7 +251,8 @@ int __init early_irq_init(void)
        for (i = 0; i < count; i++) {
                desc[i].irq_data.irq = i;
                desc[i].irq_data.chip = &no_irq_chip;
-                desc[i].kstat_irqs = kstat_irqs_all[i];
+                /* TODO: do this allocation on demand ... */
+                desc[i].kstat_irqs = alloc_percpu(unsigned int);
                alloc_masks(desc + i, GFP_KERNEL, node);
                desc_smp_init(desc + i, node);
                lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -275,6 +277,22 @@ static void free_desc(unsigned int irq)
 static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
 {
+#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
+        struct irq_desc *desc;
+        unsigned int i;
+        for (i = 0; i < cnt; i++) {
+                desc = irq_to_desc(start + i);
+                if (desc && !desc->kstat_irqs) {
+                        unsigned int __percpu *stats = alloc_percpu(unsigned int);
+                        if (!stats)
+                                return -1;
+                        if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
+                                free_percpu(stats);
+                }
+        }
+#endif
        return start;
 }
 #endif /* !CONFIG_SPARSE_IRQ */
@@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq)
 unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
        struct irq_desc *desc = irq_to_desc(irq);
-        return desc ? desc->kstat_irqs[cpu] : 0;
+        return desc && desc->kstat_irqs ?
+                        *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
 }
 #ifdef CONFIG_GENERIC_HARDIRQS
@@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq)
        int cpu;
        int sum = 0;
-        if (!desc)
+        if (!desc || !desc->kstat_irqs)
                return 0;
        for_each_possible_cpu(cpu)
-                sum += desc->kstat_irqs[cpu];
+                sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
        return sum;
 }
 #endif /* CONFIG_GENERIC_HARDIRQS */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 91a5fa25054e..0caa59f747dd 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -577,7 +577,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
 */
 static int irq_thread(void *data)
 {
-        static struct sched_param param = {
+        static const struct sched_param param = {
                .sched_priority = MAX_USER_RT_PRIO/2,
        };
        struct irqaction *action = data;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 90f881904bb1..c58fa7da8aef 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -77,21 +77,21 @@ void __weak arch_irq_work_raise(void)
 */
 static void __irq_work_queue(struct irq_work *entry)
 {
-        struct irq_work **head, *next;
+        struct irq_work *next;
-        head = &get_cpu_var(irq_work_list);
+        preempt_disable();
        do {
-                next = *head;
+                next = __this_cpu_read(irq_work_list);
                /* Can assign non-atomic because we keep the flags set. */
                entry->next = next_flags(next, IRQ_WORK_FLAGS);
-        } while (cmpxchg(head, next, entry) != next);
+        } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
        /* The list was empty, raise self-interrupt to start processing. */
        if (!irq_work_next(entry))
                arch_irq_work_raise();
-        put_cpu_var(irq_work_list);
+        preempt_enable();
 }
 /*
@@ -120,16 +120,16 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
 */
 void irq_work_run(void)
 {
-        struct irq_work *list, **head;
+        struct irq_work *list;
-        head = &__get_cpu_var(irq_work_list);
+        if (this_cpu_read(irq_work_list) == NULL)
-        if (*head == NULL)
                return;
        BUG_ON(!in_irq());
        BUG_ON(!irqs_disabled());
-        list = xchg(head, NULL);
+        list = this_cpu_xchg(irq_work_list, NULL);
        while (list != NULL) {
                struct irq_work *entry = list;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index b55045bc7563..ec19b92c7ebd 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -163,7 +163,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
         * just verifies it is an address we can use.
         *
         * Since the kernel does everything in page size chunks ensure
-         * the destination addreses are page aligned.  Too many
+         * the destination addresses are page aligned.  Too many
         * special cases crop of when we don't do this.  The most
         * insidious is getting overlapping destination addresses
         * simply because addresses are changed to page size
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 7663e5df0e6f..77981813a1e7 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -317,12 +317,12 @@ void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
 /* We have preemption disabled.. so it is safe to use __ versions */
 static inline void set_kprobe_instance(struct kprobe *kp)
 {
-        __get_cpu_var(kprobe_instance) = kp;
+        __this_cpu_write(kprobe_instance, kp);
 }
 static inline void reset_kprobe_instance(void)
 {
-        __get_cpu_var(kprobe_instance) = NULL;
+        __this_cpu_write(kprobe_instance, NULL);
 }
 /*
@@ -965,7 +965,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
 static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
                                        int trapnr)
 {
-        struct kprobe *cur = __get_cpu_var(kprobe_instance);
+        struct kprobe *cur = __this_cpu_read(kprobe_instance);
        /*
         * if we faulted "during" the execution of a user specified
@@ -980,7 +980,7 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
 static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
 {
-        struct kprobe *cur = __get_cpu_var(kprobe_instance);
+        struct kprobe *cur = __this_cpu_read(kprobe_instance);
        int ret = 0;
        if (cur && cur->break_handler) {
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5355cfd44a3f..c55afba990a3 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
        wait_for_completion(&create.done);
        if (!IS_ERR(create.result)) {
-                static struct sched_param param = { .sched_priority = 0 };
+                static const struct sched_param param = { .sched_priority = 0 };
                va_list args;
                va_start(args, namefmt);
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 17110a4a4fc2..ee74b35e528d 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -241,24 +241,19 @@ static int lstats_show(struct seq_file *m, void *v)
        seq_puts(m, "Latency Top version : v0.1\n");
        for (i = 0; i < MAXLR; i++) {
-                if (latency_record[i].backtrace[0]) {
+                struct latency_record *lr = &latency_record[i];
+                if (lr->backtrace[0]) {
                        int q;
-                        seq_printf(m, "%i %lu %lu ",
+                        seq_printf(m, "%i %lu %lu",
-                                latency_record[i].count,
+                                   lr->count, lr->time, lr->max);
-                                latency_record[i].time,
-                                latency_record[i].max);
                        for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
-                                char sym[KSYM_SYMBOL_LEN];
+                                unsigned long bt = lr->backtrace[q];
-                                char *c;
+                                if (!bt)
-                                if (!latency_record[i].backtrace[q])
                                        break;
-                                if (latency_record[i].backtrace[q] == ULONG_MAX)
+                                if (bt == ULONG_MAX)
                                        break;
-                                sprint_symbol(sym, latency_record[i].backtrace[q]);
+                                seq_printf(m, " %ps", (void *)bt);
-                                c = strchr(sym, '+');
-                                if (c)
-                                        *c = 0;
-                                seq_printf(m, "%s ", sym);
                        }
                        seq_printf(m, "\n");
                }
diff --git a/kernel/panic.c b/kernel/panic.c
index 4c13b1a88ebb..991bb87a1704 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,6 +34,7 @@ static int pause_on_oops_flag;
 static DEFINE_SPINLOCK(pause_on_oops_lock);
 int panic_timeout;
+EXPORT_SYMBOL_GPL(panic_timeout);
 ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 11847bf1e8cc..05ebe841270b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -38,6 +38,12 @@
 #include <asm/irq_regs.h>
+enum event_type_t {
+        EVENT_FLEXIBLE = 0x1,
+        EVENT_PINNED = 0x2,
+        EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+};
 atomic_t perf_task_events __read_mostly;
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -65,6 +71,12 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
 static atomic64_t perf_event_id;
+static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
+                              enum event_type_t event_type);
+static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
+                             enum event_type_t event_type);
 void __weak perf_event_print_debug(void)        { }
 extern __weak const char *perf_pmu_name(void)
@@ -72,6 +84,11 @@ extern __weak const char *perf_pmu_name(void)
        return "pmu";
 }
+static inline u64 perf_clock(void)
+{
+        return local_clock();
+}
 void perf_pmu_disable(struct pmu *pmu)
 {
        int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -240,11 +257,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
        put_ctx(ctx);
 }
-static inline u64 perf_clock(void)
-{
-        return local_clock();
-}
 /*
 * Update the record of the current time in a context.
 */
@@ -256,6 +268,12 @@ static void update_context_time(struct perf_event_context *ctx)
        ctx->timestamp = now;
 }
+static u64 perf_event_time(struct perf_event *event)
+{
+        struct perf_event_context *ctx = event->ctx;
+        return ctx ? ctx->time : 0;
+}
 /*
 * Update the total_time_enabled and total_time_running fields for a event.
 */
@@ -269,7 +287,7 @@ static void update_event_times(struct perf_event *event)
                return;
        if (ctx->is_active)
-                run_end = ctx->time;
+                run_end = perf_event_time(event);
        else
                run_end = event->tstamp_stopped;
@@ -278,7 +296,7 @@ static void update_event_times(struct perf_event *event)
        if (event->state == PERF_EVENT_STATE_INACTIVE)
                run_end = event->tstamp_stopped;
        else
-                run_end = ctx->time;
+                run_end = perf_event_time(event);
        event->total_time_running = run_end - event->tstamp_running;
 }
@@ -534,6 +552,7 @@ event_sched_out(struct perf_event *event,
                  struct perf_cpu_context *cpuctx,
                  struct perf_event_context *ctx)
 {
+        u64 tstamp = perf_event_time(event);
        u64 delta;
        /*
         * An event which could not be activated because of
@@ -545,7 +564,7 @@ event_sched_out(struct perf_event *event,
            && !event_filter_match(event)) {
                delta = ctx->time - event->tstamp_stopped;
                event->tstamp_running += delta;
-                event->tstamp_stopped = ctx->time;
+                event->tstamp_stopped = tstamp;
        }
        if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -556,7 +575,7 @@ event_sched_out(struct perf_event *event,
                event->pending_disable = 0;
                event->state = PERF_EVENT_STATE_OFF;
        }
-        event->tstamp_stopped = ctx->time;
+        event->tstamp_stopped = tstamp;
        event->pmu->del(event, 0);
        event->oncpu = -1;
@@ -768,6 +787,8 @@ event_sched_in(struct perf_event *event,
                 struct perf_cpu_context *cpuctx,
                 struct perf_event_context *ctx)
 {
+        u64 tstamp = perf_event_time(event);
        if (event->state <= PERF_EVENT_STATE_OFF)
                return 0;
@@ -784,9 +805,9 @@ event_sched_in(struct perf_event *event,
                return -EAGAIN;
        }
-        event->tstamp_running += ctx->time - event->tstamp_stopped;
+        event->tstamp_running += tstamp - event->tstamp_stopped;
-        event->shadow_ctx_time = ctx->time - ctx->timestamp;
+        event->shadow_ctx_time = tstamp - ctx->timestamp;
        if (!is_software_event(event))
                cpuctx->active_oncpu++;
@@ -898,11 +919,13 @@ static int group_can_go_on(struct perf_event *event,
 static void add_event_to_ctx(struct perf_event *event,
                               struct perf_event_context *ctx)
 {
+        u64 tstamp = perf_event_time(event);
        list_add_event(event, ctx);
        perf_group_attach(event);
-        event->tstamp_enabled = ctx->time;
+        event->tstamp_enabled = tstamp;
-        event->tstamp_running = ctx->time;
+        event->tstamp_running = tstamp;
-        event->tstamp_stopped = ctx->time;
+        event->tstamp_stopped = tstamp;
 }
 /*
@@ -937,7 +960,7 @@ static void __perf_install_in_context(void *info)
        add_event_to_ctx(event, ctx);
-        if (event->cpu != -1 && event->cpu != smp_processor_id())
+        if (!event_filter_match(event))
                goto unlock;
        /*
@@ -1042,14 +1065,13 @@ static void __perf_event_mark_enabled(struct perf_event *event,
                                        struct perf_event_context *ctx)
 {
        struct perf_event *sub;
+        u64 tstamp = perf_event_time(event);
        event->state = PERF_EVENT_STATE_INACTIVE;
-        event->tstamp_enabled = ctx->time - event->total_time_enabled;
+        event->tstamp_enabled = tstamp - event->total_time_enabled;
        list_for_each_entry(sub, &event->sibling_list, group_entry) {
-                if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
+                if (sub->state >= PERF_EVENT_STATE_INACTIVE)
-                        sub->tstamp_enabled =
+                        sub->tstamp_enabled = tstamp - sub->total_time_enabled;
-                                ctx->time - sub->total_time_enabled;
-                }
        }
 }
@@ -1082,7 +1104,7 @@ static void __perf_event_enable(void *info)
                goto unlock;
        __perf_event_mark_enabled(event, ctx);
-        if (event->cpu != -1 && event->cpu != smp_processor_id())
+        if (!event_filter_match(event))
                goto unlock;
        /*
@@ -1193,12 +1215,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
        return 0;
 }
-enum event_type_t {
-        EVENT_FLEXIBLE = 0x1,
-        EVENT_PINNED = 0x2,
-        EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
-};
 static void ctx_sched_out(struct perf_event_context *ctx,
                          struct perf_cpu_context *cpuctx,
                          enum event_type_t event_type)
@@ -1435,7 +1451,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
        list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
                if (event->state <= PERF_EVENT_STATE_OFF)
                        continue;
-                if (event->cpu != -1 && event->cpu != smp_processor_id())
+                if (!event_filter_match(event))
                        continue;
                if (group_can_go_on(event, cpuctx, 1))
@@ -1467,7 +1483,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
                 * Listen to the 'cpu' scheduling filter constraint
                 * of events:
                 */
-                if (event->cpu != -1 && event->cpu != smp_processor_id())
+                if (!event_filter_match(event))
                        continue;
                if (group_can_go_on(event, cpuctx, can_add_hw)) {
@@ -1694,7 +1710,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
                if (event->state != PERF_EVENT_STATE_ACTIVE)
                        continue;
-                if (event->cpu != -1 && event->cpu != smp_processor_id())
+                if (!event_filter_match(event))
                        continue;
                hwc = &event->hw;
@@ -3893,7 +3909,7 @@ static int perf_event_task_match(struct perf_event *event)
        if (event->state < PERF_EVENT_STATE_INACTIVE)
                return 0;
-        if (event->cpu != -1 && event->cpu != smp_processor_id())
+        if (!event_filter_match(event))
                return 0;
        if (event->attr.comm || event->attr.mmap ||
@@ -4030,7 +4046,7 @@ static int perf_event_comm_match(struct perf_event *event)
        if (event->state < PERF_EVENT_STATE_INACTIVE)
                return 0;
-        if (event->cpu != -1 && event->cpu != smp_processor_id())
+        if (!event_filter_match(event))
                return 0;
        if (event->attr.comm)
@@ -4178,7 +4194,7 @@ static int perf_event_mmap_match(struct perf_event *event,
        if (event->state < PERF_EVENT_STATE_INACTIVE)
                return 0;
-        if (event->cpu != -1 && event->cpu != smp_processor_id())
+        if (!event_filter_match(event))
                return 0;
        if ((!executable && event->attr.mmap_data) ||
@@ -4648,7 +4664,7 @@ int perf_swevent_get_recursion_context(void)
 }
 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
-void inline perf_swevent_put_recursion_context(int rctx)
+inline void perf_swevent_put_recursion_context(int rctx)
 {
        struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index a5aff3ebad38..265729966ece 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -100,13 +100,9 @@ config PM_SLEEP_ADVANCED_DEBUG
        depends on PM_ADVANCED_DEBUG
        default n
-config SUSPEND_NVS
-       bool
 config SUSPEND
        bool "Suspend to RAM and standby"
        depends on PM && ARCH_SUSPEND_POSSIBLE
-        select SUSPEND_NVS if HAS_IOMEM
        default y
        ---help---
          Allow the system to enter sleep states in which main memory is
@@ -140,7 +136,6 @@ config HIBERNATION
        depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
        select LZO_COMPRESS
        select LZO_DECOMPRESS
-        select SUSPEND_NVS if HAS_IOMEM
        ---help---
          Enable the suspend to disk (STD) functionality, which is usually
          called "hibernation" in user interfaces.  STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index f9063c6b185d..c350e18b53e3 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,7 +1,4 @@
+ccflags-$(CONFIG_PM_DEBUG)      :=      -DDEBUG
-ifeq ($(CONFIG_PM_DEBUG),y)
-EXTRA_CFLAGS    +=      -DDEBUG
-endif
 obj-$(CONFIG_PM)                += main.o
 obj-$(CONFIG_PM_SLEEP)          += console.o
@@ -10,6 +7,5 @@ obj-$(CONFIG_SUSPEND)		+= suspend.o
 obj-$(CONFIG_PM_TEST_SUSPEND)   += suspend_test.o
 obj-$(CONFIG_HIBERNATION)       += hibernate.o snapshot.o swap.o user.o \
                                   block_io.o
-obj-$(CONFIG_SUSPEND_NVS)       += nvs.o
 obj-$(CONFIG_MAGIC_SYSRQ)       += poweroff.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 048d0b514831..1832bd264219 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -51,18 +51,18 @@ enum {
 static int hibernation_mode = HIBERNATION_SHUTDOWN;
-static struct platform_hibernation_ops *hibernation_ops;
+static const struct platform_hibernation_ops *hibernation_ops;
 /**
 * hibernation_set_ops - set the global hibernate operations
 * @ops: the hibernation operations to use in subsequent hibernation transitions
 */
-void hibernation_set_ops(struct platform_hibernation_ops *ops)
+void hibernation_set_ops(const struct platform_hibernation_ops *ops)
 {
        if (ops && !(ops->begin && ops->end &&  ops->pre_snapshot
            && ops->prepare && ops->finish && ops->enter && ops->pre_restore
-            && ops->restore_cleanup)) {
+            && ops->restore_cleanup && ops->leave)) {
                WARN_ON(1);
                return;
        }
@@ -278,7 +278,7 @@ static int create_image(int platform_mode)
                goto Enable_irqs;
        }
-        if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events())
+        if (hibernation_test(TEST_CORE) || pm_wakeup_pending())
                goto Power_up;
        in_suspend = 1;
@@ -516,7 +516,7 @@ int hibernation_platform_enter(void)
        local_irq_disable();
        sysdev_suspend(PMSG_HIBERNATE);
-        if (!pm_check_wakeup_events()) {
+        if (pm_wakeup_pending()) {
                error = -EAGAIN;
                goto Power_up;
        }
@@ -647,6 +647,7 @@ int hibernate(void)
                swsusp_free();
                if (!error)
                        power_down();
+                in_suspend = 0;
                pm_restore_gfp_mask();
        } else {
                pr_debug("PM: Image restored successfully.\n");
diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c
deleted file mode 100644
index 1836db60bbb6..000000000000
--- a/kernel/power/nvs.c
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
- *
- * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
- *
- * This file is released under the GPLv2.
- */
-#include <linux/io.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/suspend.h>
-/*
- * Platforms, like ACPI, may want us to save some memory used by them during
- * suspend and to restore the contents of this memory during the subsequent
- * resume.  The code below implements a mechanism allowing us to do that.
- */
-struct nvs_page {
-        unsigned long phys_start;
-        unsigned int size;
-        void *kaddr;
-        void *data;
-        struct list_head node;
-};
-static LIST_HEAD(nvs_list);
-/**
- *      suspend_nvs_register - register platform NVS memory region to save
- *      @start - physical address of the region
- *      @size - size of the region
- *
- *      The NVS region need not be page-aligned (both ends) and we arrange
- *      things so that the data from page-aligned addresses in this region will
- *      be copied into separate RAM pages.
- */
-int suspend_nvs_register(unsigned long start, unsigned long size)
-{
-        struct nvs_page *entry, *next;
-        while (size > 0) {
-                unsigned int nr_bytes;
-                entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
-                if (!entry)
-                        goto Error;
-                list_add_tail(&entry->node, &nvs_list);
-                entry->phys_start = start;
-                nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
-                entry->size = (size < nr_bytes) ? size : nr_bytes;
-                start += entry->size;
-                size -= entry->size;
-        }
-        return 0;
- Error:
-        list_for_each_entry_safe(entry, next, &nvs_list, node) {
-                list_del(&entry->node);
-                kfree(entry);
-        }
-        return -ENOMEM;
-}
-/**
- *      suspend_nvs_free - free data pages allocated for saving NVS regions
- */
-void suspend_nvs_free(void)
-{
-        struct nvs_page *entry;
-        list_for_each_entry(entry, &nvs_list, node)
-                if (entry->data) {
-                        free_page((unsigned long)entry->data);
-                        entry->data = NULL;
-                        if (entry->kaddr) {
-                                iounmap(entry->kaddr);
-                                entry->kaddr = NULL;
-                        }
-                }
-}
-/**
- *      suspend_nvs_alloc - allocate memory necessary for saving NVS regions
- */
-int suspend_nvs_alloc(void)
-{
-        struct nvs_page *entry;
-        list_for_each_entry(entry, &nvs_list, node) {
-                entry->data = (void *)__get_free_page(GFP_KERNEL);
-                if (!entry->data) {
-                        suspend_nvs_free();
-                        return -ENOMEM;
-                }
-        }
-        return 0;
-}
-/**
- *      suspend_nvs_save - save NVS memory regions
- */
-void suspend_nvs_save(void)
-{
-        struct nvs_page *entry;
-        printk(KERN_INFO "PM: Saving platform NVS memory\n");
-        list_for_each_entry(entry, &nvs_list, node)
-                if (entry->data) {
-                        entry->kaddr = ioremap(entry->phys_start, entry->size);
-                        memcpy(entry->data, entry->kaddr, entry->size);
-                }
-}
-/**
- *      suspend_nvs_restore - restore NVS memory regions
- *
- *      This function is going to be called with interrupts disabled, so it
- *      cannot iounmap the virtual addresses used to access the NVS region.
- */
-void suspend_nvs_restore(void)
-{
-        struct nvs_page *entry;
-        printk(KERN_INFO "PM: Restoring platform NVS memory\n");
-        list_for_each_entry(entry, &nvs_list, node)
-                if (entry->data)
-                        memcpy(entry->kaddr, entry->data, entry->size);
-}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index e50b4c1b2a0f..d6d2a10320e0 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -64,6 +64,12 @@ static int try_to_freeze_tasks(bool sig_only)
                         * perturb a task in TASK_STOPPED or TASK_TRACED.
                         * It is "frozen enough".  If the task does wake
                         * up, it will immediately call try_to_freeze.
+                         *
+                         * Because freeze_task() goes through p's
+                         * scheduler lock after setting TIF_FREEZE, it's
+                         * guaranteed that either we see TASK_RUNNING or
+                         * try_to_stop() after schedule() in ptrace/signal
+                         * stop sees TIF_FREEZE.
                         */
                        if (!task_is_stopped_or_traced(p) &&
                            !freezer_should_skip(p))
@@ -79,7 +85,7 @@ static int try_to_freeze_tasks(bool sig_only)
                if (!todo || time_after(jiffies, end_time))
                        break;
-                if (!pm_check_wakeup_events()) {
+                if (pm_wakeup_pending()) {
                        wakeup = true;
                        break;
                }
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 031d5e3a6197..de6f86bfa303 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,13 +31,13 @@ const char *const pm_states[PM_SUSPEND_MAX] = {
        [PM_SUSPEND_MEM]        = "mem",
 };
-static struct platform_suspend_ops *suspend_ops;
+static const struct platform_suspend_ops *suspend_ops;
 /**
 *      suspend_set_ops - Set the global suspend method table.
 *      @ops:   Pointer to ops structure.
 */
-void suspend_set_ops(struct platform_suspend_ops *ops)
+void suspend_set_ops(const struct platform_suspend_ops *ops)
 {
        mutex_lock(&pm_mutex);
        suspend_ops = ops;
@@ -164,7 +164,7 @@ static int suspend_enter(suspend_state_t state)
        error = sysdev_suspend(PMSG_SUSPEND);
        if (!error) {
-                if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) {
+                if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
                        error = suspend_ops->enter(state);
                        events_check_enabled = false;
                }
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8c7e4832b9be..7c97c3a0eee3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -224,7 +224,7 @@ static int swsusp_swap_check(void)
                return res;
        root_swap = res;
-        res = blkdev_get(hib_resume_bdev, FMODE_WRITE);
+        res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL);
        if (res)
                return res;
@@ -888,7 +888,7 @@ out_finish:
 /**
 *      swsusp_read - read the hibernation image.
 *      @flags_p: flags passed by the "frozen" kernel in the image header should
- *                be written into this memeory location
+ *                be written into this memory location
 */
 int swsusp_read(unsigned int *flags_p)
@@ -930,7 +930,8 @@ int swsusp_check(void)
 {
        int error;
-        hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
+        hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
+                                            FMODE_READ, NULL);
        if (!IS_ERR(hib_resume_bdev)) {
                set_blocksize(hib_resume_bdev, PAGE_SIZE);
                clear_page(swsusp_header);
diff --git a/kernel/printk.c b/kernel/printk.c
index ab3ffc5b3b64..53d9a9ec88e6 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -39,16 +39,11 @@
 #include <linux/syslog.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
+#include <linux/rculist.h>
 #include <asm/uaccess.h>
 /*
- * for_each_console() allows you to iterate on each console
- */
-#define for_each_console(con) \
-        for (con = console_drivers; con != NULL; con = con->next)
-/*
 * Architectures can override it:
 */
 void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
@@ -279,12 +274,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
         * at open time.
         */
        if (type == SYSLOG_ACTION_OPEN || !from_file) {
-                if (dmesg_restrict && !capable(CAP_SYS_ADMIN))
+                if (dmesg_restrict && !capable(CAP_SYSLOG))
-                        return -EPERM;
+                        goto warn; /* switch to return -EPERM after 2.6.39 */
                if ((type != SYSLOG_ACTION_READ_ALL &&
                     type != SYSLOG_ACTION_SIZE_BUFFER) &&
-                    !capable(CAP_SYS_ADMIN))
+                    !capable(CAP_SYSLOG))
-                        return -EPERM;
+                        goto warn; /* switch to return -EPERM after 2.6.39 */
        }
        error = security_syslog(type);
@@ -428,6 +423,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
        }
 out:
        return error;
+warn:
+        /* remove after 2.6.39 */
+        if (capable(CAP_SYS_ADMIN))
+                WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN "
+                  "but no CAP_SYSLOG (deprecated and denied).\n");
+        return -EPERM;
 }
 SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
@@ -1359,6 +1360,7 @@ void register_console(struct console *newcon)
                spin_unlock_irqrestore(&logbuf_lock, flags);
        }
        release_console_sem();
+        console_sysfs_notify();
        /*
         * By unregistering the bootconsoles after we enable the real console
@@ -1417,6 +1419,7 @@ int unregister_console(struct console *console)
                console_drivers->flags |= CON_CONSDEV;
        release_console_sem();
+        console_sysfs_notify();
        return res;
 }
 EXPORT_SYMBOL(unregister_console);
@@ -1500,7 +1503,7 @@ int kmsg_dump_register(struct kmsg_dumper *dumper)
        /* Don't allow registering multiple times */
        if (!dumper->registered) {
                dumper->registered = 1;
-                list_add_tail(&dumper->list, &dump_list);
+                list_add_tail_rcu(&dumper->list, &dump_list);
                err = 0;
        }
        spin_unlock_irqrestore(&dump_list_lock, flags);
@@ -1524,29 +1527,16 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
        spin_lock_irqsave(&dump_list_lock, flags);
        if (dumper->registered) {
                dumper->registered = 0;
-                list_del(&dumper->list);
+                list_del_rcu(&dumper->list);
                err = 0;
        }
        spin_unlock_irqrestore(&dump_list_lock, flags);
+        synchronize_rcu();
        return err;
 }
 EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
-static const char * const kmsg_reasons[] = {
-        [KMSG_DUMP_OOPS]        = "oops",
-        [KMSG_DUMP_PANIC]       = "panic",
-        [KMSG_DUMP_KEXEC]       = "kexec",
-};
-static const char *kmsg_to_str(enum kmsg_dump_reason reason)
-{
-        if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0)
-                return "unknown";
-        return kmsg_reasons[reason];
-}
 /**
 * kmsg_dump - dump kernel log to kernel message dumpers.
 * @reason: the reason (oops, panic etc) for dumping
@@ -1585,13 +1575,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
                l2 = chars;
        }
-        if (!spin_trylock_irqsave(&dump_list_lock, flags)) {
+        rcu_read_lock();
-                printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n",
+        list_for_each_entry_rcu(dumper, &dump_list, list)
-                                kmsg_to_str(reason));
-                return;
-        }
-        list_for_each_entry(dumper, &dump_list, list)
                dumper->dump(dumper, reason, s1, l1, s2, l2);
-        spin_unlock_irqrestore(&dump_list_lock, flags);
+        rcu_read_unlock();
 }
 #endif
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 034493724749..0c343b9a46d5 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -189,7 +189,8 @@ static int rcu_kthread(void *arg)
        unsigned long flags;
        for (;;) {
-                wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0);
+                wait_event_interruptible(rcu_kthread_wq,
+                                         have_rcu_kthread_work != 0);
                morework = rcu_boost();
                local_irq_save(flags);
                work = have_rcu_kthread_work;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d0ddfea6579d..dd4aea806f8e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -364,8 +364,8 @@ void rcu_irq_exit(void)
        WARN_ON_ONCE(rdtp->dynticks & 0x1);
        /* If the interrupt queued a callback, get out of dyntick mode. */
-        if (__get_cpu_var(rcu_sched_data).nxtlist ||
+        if (__this_cpu_read(rcu_sched_data.nxtlist) ||
-            __get_cpu_var(rcu_bh_data).nxtlist)
+            __this_cpu_read(rcu_bh_data.nxtlist))
                set_need_resched();
 }
diff --git a/kernel/sched.c b/kernel/sched.c
index 04949089e760..ea3e5eff3878 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -278,14 +278,12 @@ struct task_group {
 #endif
 };
-#define root_task_group init_task_group
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 #ifdef CONFIG_FAIR_GROUP_SCHED
-# define INIT_TASK_GROUP_LOAD   NICE_0_LOAD
+# define ROOT_TASK_GROUP_LOAD   NICE_0_LOAD
 /*
 * A weight of 0 or 1 can cause arithmetics problems.
@@ -298,13 +296,13 @@ static DEFINE_SPINLOCK(task_group_lock);
 #define MIN_SHARES      2
 #define MAX_SHARES      (1UL << 18)
-static int init_task_group_load = INIT_TASK_GROUP_LOAD;
+static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
 #endif
 /* Default task group.
 *      Every task in system belong to this group at bootup.
 */
-struct task_group init_task_group;
+struct task_group root_task_group;
 #endif  /* CONFIG_CGROUP_SCHED */
@@ -743,7 +741,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
        buf[cnt] = 0;
        cmp = strstrip(buf);
-        if (strncmp(buf, "NO_", 3) == 0) {
+        if (strncmp(cmp, "NO_", 3) == 0) {
                neg = 1;
                cmp += 3;
        }
@@ -2507,7 +2505,7 @@ out:
 * try_to_wake_up_local - try to wake up a local task with rq lock held
 * @p: the thread to be awakened
 *
- * Put @p on the run-queue if it's not alredy there.  The caller must
+ * Put @p on the run-queue if it's not already there.  The caller must
 * ensure that this_rq() is locked, @p is bound to this_rq() and not
 * the current task.  this_rq() stays locked over invocation.
 */
@@ -7848,7 +7846,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
        cfs_rq->tg = tg;
        tg->se[cpu] = se;
-        /* se could be NULL for init_task_group */
+        /* se could be NULL for root_task_group */
        if (!se)
                return;
@@ -7908,18 +7906,18 @@ void __init sched_init(void)
                ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 #ifdef CONFIG_FAIR_GROUP_SCHED
-                init_task_group.se = (struct sched_entity **)ptr;
+                root_task_group.se = (struct sched_entity **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
-                init_task_group.cfs_rq = (struct cfs_rq **)ptr;
+                root_task_group.cfs_rq = (struct cfs_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
-                init_task_group.rt_se = (struct sched_rt_entity **)ptr;
+                root_task_group.rt_se = (struct sched_rt_entity **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
-                init_task_group.rt_rq = (struct rt_rq **)ptr;
+                root_task_group.rt_rq = (struct rt_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
 #endif /* CONFIG_RT_GROUP_SCHED */
@@ -7939,13 +7937,13 @@ void __init sched_init(void)
                        global_rt_period(), global_rt_runtime());
 #ifdef CONFIG_RT_GROUP_SCHED
-        init_rt_bandwidth(&init_task_group.rt_bandwidth,
+        init_rt_bandwidth(&root_task_group.rt_bandwidth,
                        global_rt_period(), global_rt_runtime());
 #endif /* CONFIG_RT_GROUP_SCHED */
 #ifdef CONFIG_CGROUP_SCHED
-        list_add(&init_task_group.list, &task_groups);
+        list_add(&root_task_group.list, &task_groups);
-        INIT_LIST_HEAD(&init_task_group.children);
+        INIT_LIST_HEAD(&root_task_group.children);
        autogroup_init(&init_task);
 #endif /* CONFIG_CGROUP_SCHED */
@@ -7960,34 +7958,34 @@ void __init sched_init(void)
                init_cfs_rq(&rq->cfs, rq);
                init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
-                init_task_group.shares = init_task_group_load;
+                root_task_group.shares = root_task_group_load;
                INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
                /*
-                 * How much cpu bandwidth does init_task_group get?
+                 * How much cpu bandwidth does root_task_group get?
                 *
                 * In case of task-groups formed thr' the cgroup filesystem, it
                 * gets 100% of the cpu resources in the system. This overall
                 * system cpu resource is divided among the tasks of
-                 * init_task_group and its child task-groups in a fair manner,
+                 * root_task_group and its child task-groups in a fair manner,
                 * based on each entity's (task or task-group's) weight
                 * (se->load.weight).
                 *
-                 * In other words, if init_task_group has 10 tasks of weight
+                 * In other words, if root_task_group has 10 tasks of weight
                 * 1024) and two child groups A0 and A1 (of weight 1024 each),
                 * then A0's share of the cpu resource is:
                 *
                 *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
                 *
-                 * We achieve this by letting init_task_group's tasks sit
+                 * We achieve this by letting root_task_group's tasks sit
-                 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
+                 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
                 */
-                init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL);
+                init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
                rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
 #ifdef CONFIG_RT_GROUP_SCHED
                INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
-                init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL);
+                init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
 #endif
                for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8379,6 +8377,7 @@ static void free_sched_group(struct task_group *tg)
 {
        free_fair_sched_group(tg);
        free_rt_sched_group(tg);
+        autogroup_free(tg);
        kfree(tg);
 }
@@ -8812,7 +8811,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
        if (!cgrp->parent) {
                /* This is early initialization for the top cgroup */
-                return &init_task_group.css;
+                return &root_task_group.css;
        }
        parent = cgroup_tg(cgrp->parent);
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index c80fedcd476b..32a723b8f84c 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -9,10 +9,10 @@ unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
 static struct autogroup autogroup_default;
 static atomic_t autogroup_seq_nr;
-static void autogroup_init(struct task_struct *init_task)
+static void __init autogroup_init(struct task_struct *init_task)
 {
-        autogroup_default.tg = &init_task_group;
+        autogroup_default.tg = &root_task_group;
-        init_task_group.autogroup = &autogroup_default;
+        root_task_group.autogroup = &autogroup_default;
        kref_init(&autogroup_default.kref);
        init_rwsem(&autogroup_default.lock);
        init_task->signal->autogroup = &autogroup_default;
@@ -63,7 +63,7 @@ static inline struct autogroup *autogroup_create(void)
        if (!ag)
                goto out_fail;
-        tg = sched_create_group(&init_task_group);
+        tg = sched_create_group(&root_task_group);
        if (IS_ERR(tg))
                goto out_free;
diff --git a/kernel/smp.c b/kernel/smp.c
index 12ed8b013e2d..4ec30e069987 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,7 @@
 #include <linux/smp.h>
 #include <linux/cpu.h>
+#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
 static struct {
        struct list_head        queue;
        raw_spinlock_t          lock;
@@ -529,3 +530,21 @@ void ipi_call_unlock_irq(void)
 {
        raw_spin_unlock_irq(&call_function.lock);
 }
+#endif /* CONFIG_USE_GENERIC_SMP_HELPERS */
+/*
+ * Call a function on all processors
+ *
+ * @func: function to run; it receives @info as its only argument
+ * @info: opaque pointer handed to @func
+ * @wait: passed through unchanged to smp_call_function()
+ *
+ * @func is first handed to smp_call_function() (presumably covering the
+ * other CPUs - confirm against its definition) and is then invoked
+ * directly on the calling CPU with local interrupts disabled.
+ * Preemption is disabled across both steps.
+ *
+ * Returns the value returned by smp_call_function().
+ *
+ * Note: local_irq_enable() is unconditional here, so a caller that
+ * entered with interrupts disabled would return with them enabled.
+ */
+int on_each_cpu(void (*func) (void *info), void *info, int wait)
+{
+        int ret = 0;
+        preempt_disable();
+        ret = smp_call_function(func, info, wait);
+        local_irq_disable();
+        func(info);
+        local_irq_enable();
+        preempt_enable();
+        return ret;
+}
+EXPORT_SYMBOL(on_each_cpu);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d4d918a91881..68eb5efec388 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -70,7 +70,7 @@ char *softirq_to_name[NR_SOFTIRQS] = {
 static void wakeup_softirqd(void)
 {
        /* Interrupts are disabled: no need to stop preemption */
-        struct task_struct *tsk = __get_cpu_var(ksoftirqd);
+        struct task_struct *tsk = __this_cpu_read(ksoftirqd);
        if (tsk && tsk->state != TASK_RUNNING)
                wake_up_process(tsk);
@@ -388,8 +388,8 @@ void __tasklet_schedule(struct tasklet_struct *t)
        local_irq_save(flags);
        t->next = NULL;
-        *__get_cpu_var(tasklet_vec).tail = t;
+        *__this_cpu_read(tasklet_vec.tail) = t;
-        __get_cpu_var(tasklet_vec).tail = &(t->next);
+        __this_cpu_write(tasklet_vec.tail, &(t->next));
        raise_softirq_irqoff(TASKLET_SOFTIRQ);
        local_irq_restore(flags);
 }
@@ -402,8 +402,8 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
        local_irq_save(flags);
        t->next = NULL;
-        *__get_cpu_var(tasklet_hi_vec).tail = t;
+        *__this_cpu_read(tasklet_hi_vec.tail) = t;
-        __get_cpu_var(tasklet_hi_vec).tail = &(t->next);
+        __this_cpu_write(tasklet_hi_vec.tail,  &(t->next));
        raise_softirq_irqoff(HI_SOFTIRQ);
        local_irq_restore(flags);
 }
@@ -414,8 +414,8 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
 {
        BUG_ON(!irqs_disabled());
-        t->next = __get_cpu_var(tasklet_hi_vec).head;
+        t->next = __this_cpu_read(tasklet_hi_vec.head);
-        __get_cpu_var(tasklet_hi_vec).head = t;
+        __this_cpu_write(tasklet_hi_vec.head, t);
        __raise_softirq_irqoff(HI_SOFTIRQ);
 }
@@ -426,9 +426,9 @@ static void tasklet_action(struct softirq_action *a)
        struct tasklet_struct *list;
        local_irq_disable();
-        list = __get_cpu_var(tasklet_vec).head;
+        list = __this_cpu_read(tasklet_vec.head);
-        __get_cpu_var(tasklet_vec).head = NULL;
+        __this_cpu_write(tasklet_vec.head, NULL);
-        __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head;
+        __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head);
        local_irq_enable();
        while (list) {
@@ -449,8 +449,8 @@ static void tasklet_action(struct softirq_action *a)
                local_irq_disable();
                t->next = NULL;
-                *__get_cpu_var(tasklet_vec).tail = t;
+                *__this_cpu_read(tasklet_vec.tail) = t;
-                __get_cpu_var(tasklet_vec).tail = &(t->next);
+                __this_cpu_write(tasklet_vec.tail, &(t->next));
                __raise_softirq_irqoff(TASKLET_SOFTIRQ);
                local_irq_enable();
        }
@@ -461,9 +461,9 @@ static void tasklet_hi_action(struct softirq_action *a)
        struct tasklet_struct *list;
        local_irq_disable();
-        list = __get_cpu_var(tasklet_hi_vec).head;
+        list = __this_cpu_read(tasklet_hi_vec.head);
-        __get_cpu_var(tasklet_hi_vec).head = NULL;
+        __this_cpu_write(tasklet_hi_vec.head, NULL);
-        __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head;
+        __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head);
        local_irq_enable();
        while (list) {
@@ -484,8 +484,8 @@ static void tasklet_hi_action(struct softirq_action *a)
                local_irq_disable();
                t->next = NULL;
-                *__get_cpu_var(tasklet_hi_vec).tail = t;
+                *__this_cpu_read(tasklet_hi_vec.tail) = t;
-                __get_cpu_var(tasklet_hi_vec).tail = &(t->next);
+                __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
                __raise_softirq_irqoff(HI_SOFTIRQ);
                local_irq_enable();
        }
@@ -802,16 +802,16 @@ static void takeover_tasklets(unsigned int cpu)
        /* Find end, append list for that CPU. */
        if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
-                *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head;
+                *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head;
-                __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail;
+                __this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
                per_cpu(tasklet_vec, cpu).head = NULL;
                per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
        }
        raise_softirq_irqoff(TASKLET_SOFTIRQ);
        if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
-                *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head;
+                *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head;
-                __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail;
+                __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail);
                per_cpu(tasklet_hi_vec, cpu).head = NULL;
                per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
        }
@@ -853,7 +853,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
                             cpumask_any(cpu_online_mask));
        case CPU_DEAD:
        case CPU_DEAD_FROZEN: {
-                static struct sched_param param = {
+                static const struct sched_param param = {
                        .sched_priority = MAX_RT_PRIO-1
                };
@@ -885,25 +885,6 @@ static __init int spawn_ksoftirqd(void)
 }
 early_initcall(spawn_ksoftirqd);
-#ifdef CONFIG_SMP
-/*
- * Call a function on all processors
- */
-int on_each_cpu(void (*func) (void *info), void *info, int wait)
-{
-        int ret = 0;
-        preempt_disable();
-        ret = smp_call_function(func, info, wait);
-        local_irq_disable();
-        func(info);
-        local_irq_enable();
-        preempt_enable();
-        return ret;
-}
-EXPORT_SYMBOL(on_each_cpu);
-#endif
 /*
 * [ These __weak aliases are kept in a separate compilation unit, so that
 *   GCC does not inline them incorrectly. ]
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 98d8c1e80edb..73ce23feaea9 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -156,6 +156,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);
 /*
+ * We use an adaptive strategy for synchronize_srcu() and especially for
+ * synchronize_srcu_expedited().  We spin for a fixed time period
+ * (defined below) to allow SRCU readers to exit their read-side critical
+ * sections.  If there are still some readers after 10 microseconds,
+ * we repeatedly block for one jiffy (1/HZ seconds) at a time.  This approach
+ * has done well in testing, so there is no need for a config parameter.
+ */
+#define SYNCHRONIZE_SRCU_READER_DELAY 10
+/*
 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
 */
 static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
@@ -207,11 +217,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
         * will have finished executing.  We initially give readers
         * an arbitrarily chosen 10 microseconds to get out of their
         * SRCU read-side critical sections, then loop waiting 1/HZ
-         * seconds per iteration.
+         * seconds per iteration.  The 10-microsecond value has done
+         * very well in testing.
         */
        if (srcu_readers_active_idx(sp, idx))
-                udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY);
+                udelay(SYNCHRONIZE_SRCU_READER_DELAY);
        while (srcu_readers_active_idx(sp, idx))
                schedule_timeout_interruptible(1);
diff --git a/kernel/sys.c b/kernel/sys.c
index 2745dcdb6c6c..31b71a276b40 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -43,6 +43,8 @@
 #include <linux/kprobes.h>
 #include <linux/user_namespace.h>
+#include <linux/kmsg_dump.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/unistd.h>
@@ -285,6 +287,7 @@ out_unlock:
 */
 void emergency_restart(void)
 {
+        kmsg_dump(KMSG_DUMP_EMERG);
        machine_emergency_restart();
 }
 EXPORT_SYMBOL_GPL(emergency_restart);
@@ -312,6 +315,7 @@ void kernel_restart(char *cmd)
                printk(KERN_EMERG "Restarting system.\n");
        else
                printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
+        kmsg_dump(KMSG_DUMP_RESTART);
        machine_restart(cmd);
 }
 EXPORT_SYMBOL_GPL(kernel_restart);
@@ -333,6 +337,7 @@ void kernel_halt(void)
        kernel_shutdown_prepare(SYSTEM_HALT);
        sysdev_shutdown();
        printk(KERN_EMERG "System halted.\n");
+        kmsg_dump(KMSG_DUMP_HALT);
        machine_halt();
 }
@@ -351,6 +356,7 @@ void kernel_power_off(void)
        disable_nonboot_cpus();
        sysdev_shutdown();
        printk(KERN_EMERG "Power down.\n");
+        kmsg_dump(KMSG_DUMP_POWEROFF);
        machine_power_off();
 }
 EXPORT_SYMBOL_GPL(kernel_power_off);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ae5cbb1e3ced..bc86bb32e126 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/sysctl.h>
 #include <linux/signal.h>
+#include <linux/printk.h>
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/ctype.h>
@@ -245,10 +246,6 @@ static struct ctl_table root_table[] = {
                .mode           = 0555,
                .child          = dev_table,
        },
-/*
- * NOTE: do not add new entries to this table unless you have read
- * Documentation/sysctl/ctl_unnumbered.txt
- */
        { }
 };
@@ -710,6 +707,15 @@ static struct ctl_table kern_table[] = {
                .extra1         = &zero,
                .extra2         = &one,
        },
+        {
+                .procname       = "kptr_restrict",
+                .data           = &kptr_restrict,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = proc_dointvec_minmax,
+                .extra1         = &zero,
+                .extra2         = &two,
+        },
 #endif
        {
                .procname       = "ngroups_max",
@@ -962,10 +968,6 @@ static struct ctl_table kern_table[] = {
                .proc_handler   = proc_dointvec,
        },
 #endif
-/*
- * NOTE: do not add new entries to this table unless you have read
- * Documentation/sysctl/ctl_unnumbered.txt
- */
        { }
 };
@@ -1326,11 +1328,6 @@ static struct ctl_table vm_table[] = {
                .extra2         = &one,
        },
 #endif
-/*
- * NOTE: do not add new entries to this table unless you have read
- * Documentation/sysctl/ctl_unnumbered.txt
- */
        { }
 };
@@ -1486,10 +1483,6 @@ static struct ctl_table fs_table[] = {
                .proc_handler   = &pipe_proc_fn,
                .extra1         = &pipe_min_size,
        },
-/*
- * NOTE: do not add new entries to this table unless you have read
- * Documentation/sysctl/ctl_unnumbered.txt
- */
        { }
 };
@@ -2899,7 +2892,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
        }
 }
-#else /* CONFIG_PROC_FS */
+#else /* CONFIG_PROC_SYSCTL */
 int proc_dostring(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2951,7 +2944,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
 }
-#endif /* CONFIG_PROC_FS */
+#endif /* CONFIG_PROC_SYSCTL */
 /*
 * No sense putting this after each symbol definition, twice,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 4b2545a136ff..b875bedf7c9a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1192,7 +1192,7 @@ static ssize_t bin_dn_node_address(struct file *file,
                buf[result] = '\0';
-                /* Convert the decnet addresss to binary */
+                /* Convert the decnet address to binary */
                result = -EIO;
                nodep = strchr(buf, '.') + 1;
                if (!nodep)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 3308fd7f1b52..3971c6b9d58d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -89,8 +89,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
                return -ENOMEM;
        if (!info) {
-                int seq = get_cpu_var(taskstats_seqnum)++;
+                int seq = this_cpu_inc_return(taskstats_seqnum) - 1;
-                put_cpu_var(taskstats_seqnum);
                reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
        } else
@@ -349,7 +348,7 @@ static int parse(struct nlattr *na, struct cpumask *mask)
        return ret;
 }
-#ifdef CONFIG_IA64
+#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
 #define TASKSTATS_NEEDS_PADDING 1
 #endif
@@ -612,7 +611,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
                fill_tgid_exit(tsk);
        }
-        listeners = &__raw_get_cpu_var(listener_array);
+        listeners = __this_cpu_ptr(&listener_array);
        if (list_empty(&listeners->list))
                return;
diff --git a/kernel/time.c b/kernel/time.c
index ba9b338d1835..32174359576f 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -238,7 +238,7 @@ EXPORT_SYMBOL(current_fs_time);
 * Avoid unnecessary multiplications/divisions in the
 * two most common HZ cases:
 */
-unsigned int inline jiffies_to_msecs(const unsigned long j)
+inline unsigned int jiffies_to_msecs(const unsigned long j)
 {
 #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
        return (MSEC_PER_SEC / HZ) * j;
@@ -254,7 +254,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j)
 }
 EXPORT_SYMBOL(jiffies_to_msecs);
-unsigned int inline jiffies_to_usecs(const unsigned long j)
+inline unsigned int jiffies_to_usecs(const unsigned long j)
 {
 #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
        return (USEC_PER_SEC / HZ) * j;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 8588abcac07b..6519cf62d9cd 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -152,6 +152,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
         */
        for (sft = 32; sft > 0; sft--) {
                tmp = (u64) to << sft;
+                tmp += from / 2;
                do_div(tmp, from);
                if ((tmp >> sftacc) == 0)
                        break;
@@ -678,7 +679,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
 int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
 {
-        /* Intialize mult/shift and max_idle_ns */
+        /* Initialize mult/shift and max_idle_ns */
        __clocksource_updatefreq_scale(cs, scale, freq);
        /* Add clocksource to the clcoksource list */
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index d2321891538f..5c00242fa921 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -14,6 +14,7 @@
 #include <linux/timex.h>
 #include <linux/time.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 /*
 * NTP timekeeping variables:
@@ -74,6 +75,162 @@ static long			time_adjust;
 /* constant (boot-param configurable) NTP tick adjustment (upscaled)    */
 static s64                      ntp_tick_adj;
+#ifdef CONFIG_NTP_PPS
+/*
+ * The following variables are used when a pulse-per-second (PPS) signal
+ * is available. They establish the engineering parameters of the clock
+ * discipline loop when controlled by the PPS signal.
+ */
+#define PPS_VALID       10      /* PPS signal watchdog max (s) */
+#define PPS_POPCORN     4       /* popcorn spike threshold (shift) */
+#define PPS_INTMIN      2       /* min freq interval (s) (shift) */
+#define PPS_INTMAX      8       /* max freq interval (s) (shift) */
+#define PPS_INTCOUNT    4       /* number of consecutive good intervals to
+                                   increase pps_shift or consecutive bad
+                                   intervals to decrease it */
+#define PPS_MAXWANDER   100000  /* max PPS freq wander (ns/s) */
+static int pps_valid;           /* signal watchdog counter */
+static long pps_tf[3];          /* phase median filter */
+static long pps_jitter;         /* current jitter (ns) */
+static struct timespec pps_fbase; /* beginning of the last freq interval */
+static int pps_shift;           /* current interval duration (s) (shift) */
+static int pps_intcnt;          /* interval counter */
+static s64 pps_freq;            /* frequency offset (scaled ns/s) */
+static long pps_stabil;         /* current stability (scaled ns/s) */
+/*
+ * PPS signal quality monitors
+ */
+static long pps_calcnt;         /* calibration intervals */
+static long pps_jitcnt;         /* jitter limit exceeded */
+static long pps_stbcnt;         /* stability limit exceeded */
+static long pps_errcnt;         /* calibration errors */
+/* PPS kernel consumer compensates the whole phase error immediately.
+ * Otherwise, reduce the offset by a fixed factor times the time constant.
+ */
+static inline s64 ntp_offset_chunk(s64 offset)
+{
+        if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
+                return offset;
+        else
+                return shift_right(offset, SHIFT_PLL + time_constant);
+}
+static inline void pps_reset_freq_interval(void)
+{
+        /* the PPS calibration interval may end
+           surprisingly early */
+        pps_shift = PPS_INTMIN;
+        pps_intcnt = 0;
+}
+/**
+ * pps_clear - Clears the PPS state variables
+ *
+ * Must be called while holding a write on the xtime_lock
+ */
+static inline void pps_clear(void)
+{
+        pps_reset_freq_interval();
+        pps_tf[0] = 0;
+        pps_tf[1] = 0;
+        pps_tf[2] = 0;
+        pps_fbase.tv_sec = pps_fbase.tv_nsec = 0;
+        pps_freq = 0;
+}
+/* Decrease pps_valid to indicate that another second has passed since
+ * the last PPS signal. When it reaches 0, indicate that PPS signal is
+ * missing.
+ *
+ * Must be called while holding a write on the xtime_lock
+ */
+static inline void pps_dec_valid(void)
+{
+        if (pps_valid > 0)
+                pps_valid--;
+        else {
+                time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
+                                 STA_PPSWANDER | STA_PPSERROR);
+                pps_clear();
+        }
+}
+static inline void pps_set_freq(s64 freq)
+{
+        pps_freq = freq;
+}
+/* Report whether @status (a set of STA_* flags) describes an error state.
+ * Tests the @status argument rather than the global time_status, matching
+ * the !CONFIG_NTP_PPS variant of this helper.
+ */
+static inline int is_error_status(int status)
+{
+        return (status & (STA_UNSYNC|STA_CLOCKERR))
+                /* PPS signal lost when either PPS time or
+                 * PPS frequency synchronization requested
+                 */
+                || ((status & (STA_PPSFREQ|STA_PPSTIME))
+                        && !(status & STA_PPSSIGNAL))
+                /* PPS jitter exceeded when
+                 * PPS time synchronization requested */
+                || ((status & (STA_PPSTIME|STA_PPSJITTER))
+                        == (STA_PPSTIME|STA_PPSJITTER))
+                /* PPS wander exceeded or calibration error when
+                 * PPS frequency synchronization requested
+                 */
+                || ((status & STA_PPSFREQ)
+                        && (status & (STA_PPSWANDER|STA_PPSERROR)));
+}
+static inline void pps_fill_timex(struct timex *txc)
+{
+        txc->ppsfreq       = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) *
+                                         PPM_SCALE_INV, NTP_SCALE_SHIFT);
+        txc->jitter        = pps_jitter;
+        if (!(time_status & STA_NANO))
+                txc->jitter /= NSEC_PER_USEC;
+        txc->shift         = pps_shift;
+        txc->stabil        = pps_stabil;
+        txc->jitcnt        = pps_jitcnt;
+        txc->calcnt        = pps_calcnt;
+        txc->errcnt        = pps_errcnt;
+        txc->stbcnt        = pps_stbcnt;
+}
+#else /* !CONFIG_NTP_PPS */
+static inline s64 ntp_offset_chunk(s64 offset)
+{
+        return shift_right(offset, SHIFT_PLL + time_constant);
+}
+static inline void pps_reset_freq_interval(void) {}
+static inline void pps_clear(void) {}
+static inline void pps_dec_valid(void) {}
+static inline void pps_set_freq(s64 freq) {}
+static inline int is_error_status(int status)
+{
+        return status & (STA_UNSYNC|STA_CLOCKERR);
+}
+static inline void pps_fill_timex(struct timex *txc)
+{
+        /* PPS is not implemented, so these are zero */
+        txc->ppsfreq       = 0;
+        txc->jitter        = 0;
+        txc->shift         = 0;
+        txc->stabil        = 0;
+        txc->jitcnt        = 0;
+        txc->calcnt        = 0;
+        txc->errcnt        = 0;
+        txc->stbcnt        = 0;
+}
+#endif /* CONFIG_NTP_PPS */
 /*
 * NTP methods:
 */
@@ -185,6 +342,9 @@ void ntp_clear(void)
        tick_length     = tick_length_base;
        time_offset     = 0;
+        /* Clear PPS state variables */
+        pps_clear();
 }
 /*
@@ -250,16 +410,16 @@ void second_overflow(void)
                time_status |= STA_UNSYNC;
        }
-        /*
+        /* Compute the phase adjustment for the next second */
-         * Compute the phase adjustment for the next second. The offset is
-         * reduced by a fixed factor times the time constant.
-         */
        tick_length      = tick_length_base;
-        delta            = shift_right(time_offset, SHIFT_PLL + time_constant);
+        delta            = ntp_offset_chunk(time_offset);
        time_offset     -= delta;
        tick_length     += delta;
+        /* Check PPS signal */
+        pps_dec_valid();
        if (!time_adjust)
                return;
@@ -369,6 +529,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
        if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
                time_state = TIME_OK;
                time_status = STA_UNSYNC;
+                /* restart PPS frequency calibration */
+                pps_reset_freq_interval();
        }
        /*
@@ -418,6 +580,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
                time_freq = txc->freq * PPM_SCALE;
                time_freq = min(time_freq, MAXFREQ_SCALED);
                time_freq = max(time_freq, -MAXFREQ_SCALED);
+                /* update pps_freq */
+                pps_set_freq(time_freq);
        }
        if (txc->modes & ADJ_MAXERROR)
@@ -508,7 +672,8 @@ int do_adjtimex(struct timex *txc)
        }
        result = time_state;    /* mostly `TIME_OK' */
-        if (time_status & (STA_UNSYNC|STA_CLOCKERR))
+        /* check for errors */
+        if (is_error_status(time_status))
                result = TIME_ERROR;
        txc->freq          = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
@@ -522,15 +687,8 @@ int do_adjtimex(struct timex *txc)
        txc->tick          = tick_usec;
        txc->tai           = time_tai;
-        /* PPS is not implemented, so these are zero */
+        /* fill PPS status fields */
-        txc->ppsfreq       = 0;
+        pps_fill_timex(txc);
-        txc->jitter        = 0;
-        txc->shift         = 0;
-        txc->stabil        = 0;
-        txc->jitcnt        = 0;
-        txc->calcnt        = 0;
-        txc->errcnt        = 0;
-        txc->stbcnt        = 0;
        write_sequnlock_irq(&xtime_lock);
@@ -544,6 +702,243 @@ int do_adjtimex(struct timex *txc)
        return result;
 }
+#ifdef  CONFIG_NTP_PPS
+/* actually struct pps_normtime is good old struct timespec, but it is
+ * semantically different (and it is the reason why it was invented):
+ * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ]
+ * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */
+struct pps_normtime {
+        __kernel_time_t sec;    /* seconds */
+        long            nsec;   /* nanoseconds */
+};
+/* normalize the timestamp so that nsec is in the
+   ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */
+static inline struct pps_normtime pps_normalize_ts(struct timespec ts)
+{
+        struct pps_normtime norm = {
+                .sec = ts.tv_sec,
+                .nsec = ts.tv_nsec
+        };
+        if (norm.nsec > (NSEC_PER_SEC >> 1)) {
+                norm.nsec -= NSEC_PER_SEC;
+                norm.sec++;
+        }
+        return norm;
+}
+/* get current phase correction and jitter */
+static inline long pps_phase_filter_get(long *jitter)
+{
+        *jitter = pps_tf[0] - pps_tf[1];
+        if (*jitter < 0)
+                *jitter = -*jitter;
+        /* TODO: test various filters */
+        return pps_tf[0];
+}
+/* add the sample to the phase filter */
+static inline void pps_phase_filter_add(long err)
+{
+        pps_tf[2] = pps_tf[1];
+        pps_tf[1] = pps_tf[0];
+        pps_tf[0] = err;
+}
+/* decrease frequency calibration interval length.
+ * It is halved after four consecutive unstable intervals.
+ */
+static inline void pps_dec_freq_interval(void)
+{
+        if (--pps_intcnt <= -PPS_INTCOUNT) {
+                pps_intcnt = -PPS_INTCOUNT;
+                if (pps_shift > PPS_INTMIN) {
+                        pps_shift--;
+                        pps_intcnt = 0;
+                }
+        }
+}
+/* increase frequency calibration interval length.
+ * It is doubled after four consecutive stable intervals.
+ */
+static inline void pps_inc_freq_interval(void)
+{
+        if (++pps_intcnt >= PPS_INTCOUNT) {
+                pps_intcnt = PPS_INTCOUNT;
+                if (pps_shift < PPS_INTMAX) {
+                        pps_shift++;
+                        pps_intcnt = 0;
+                }
+        }
+}
+/* update clock frequency based on MONOTONIC_RAW clock PPS signal
+ * timestamps
+ *
+ * At the end of the calibration interval the difference between the
+ * first and last MONOTONIC_RAW clock timestamps divided by the length
+ * of the interval becomes the frequency update. If the interval was
+ * too long, the data are discarded.
+ * Returns the difference between old and new frequency values.
+ */
+static long hardpps_update_freq(struct pps_normtime freq_norm)
+{
+        long delta, delta_mod;
+        s64 ftemp;
+        /* check if the frequency interval was too long */
+        if (freq_norm.sec > (2 << pps_shift)) {
+                time_status |= STA_PPSERROR;
+                pps_errcnt++;
+                pps_dec_freq_interval();
+                pr_err("hardpps: PPSERROR: interval too long - %ld s\n",
+                                freq_norm.sec);
+                return 0;
+        }
+        /* here the raw frequency offset and wander (stability) is
+         * calculated. If the wander is less than the wander threshold
+         * the interval is increased; otherwise it is decreased.
+         */
+        ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT,
+                        freq_norm.sec);
+        delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
+        pps_freq = ftemp;
+        if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
+                pr_warning("hardpps: PPSWANDER: change=%ld\n", delta);
+                time_status |= STA_PPSWANDER;
+                pps_stbcnt++;
+                pps_dec_freq_interval();
+        } else {        /* good sample */
+                pps_inc_freq_interval();
+        }
+        /* the stability metric is calculated as the average of recent
+         * frequency changes, but is used only for performance
+         * monitoring
+         */
+        delta_mod = delta;
+        if (delta_mod < 0)
+                delta_mod = -delta_mod;
+        pps_stabil += (div_s64(((s64)delta_mod) <<
+                                (NTP_SCALE_SHIFT - SHIFT_USEC),
+                                NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN;
+        /* if enabled, the system clock frequency is updated */
+        if ((time_status & STA_PPSFREQ) != 0 &&
+            (time_status & STA_FREQHOLD) == 0) {
+                time_freq = pps_freq;
+                ntp_update_frequency();
+        }
+        return delta;
+}
+/* correct REALTIME clock phase error against PPS signal */
+static void hardpps_update_phase(long error)
+{
+        long correction = -error;
+        long jitter;
+        /* add the sample to the median filter */
+        pps_phase_filter_add(correction);
+        correction = pps_phase_filter_get(&jitter);
+        /* Nominal jitter is due to PPS signal noise. If it exceeds the
+         * threshold, the sample is discarded; otherwise, if so enabled,
+         * the time offset is updated.
+         */
+        if (jitter > (pps_jitter << PPS_POPCORN)) {
+                pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
+                       jitter, (pps_jitter << PPS_POPCORN));
+                time_status |= STA_PPSJITTER;
+                pps_jitcnt++;
+        } else if (time_status & STA_PPSTIME) {
+                /* correct the time using the phase offset */
+                time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT,
+                                NTP_INTERVAL_FREQ);
+                /* cancel running adjtime() */
+                time_adjust = 0;
+        }
+        /* update jitter */
+        pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN;
+}
+/*
+ * hardpps() - discipline CPU clock oscillator to external PPS signal
+ *
+ * This routine is called at each PPS signal arrival in order to
+ * discipline the CPU clock oscillator to the PPS signal. It takes two
+ * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former
+ * is used to correct clock phase error and the latter is used to
+ * correct the frequency.
+ *
+ * This code is based on David Mills's reference nanokernel
+ * implementation. It was mostly rewritten but keeps the same idea.
+ */
+void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
+{
+        struct pps_normtime pts_norm, freq_norm;
+        unsigned long flags;
+        pts_norm = pps_normalize_ts(*phase_ts);
+        write_seqlock_irqsave(&xtime_lock, flags);
+        /* clear the error bits, they will be set again if needed */
+        time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
+        /* indicate signal presence */
+        time_status |= STA_PPSSIGNAL;
+        pps_valid = PPS_VALID;
+        /* when called for the first time,
+         * just start the frequency interval */
+        if (unlikely(pps_fbase.tv_sec == 0)) {
+                pps_fbase = *raw_ts;
+                write_sequnlock_irqrestore(&xtime_lock, flags);
+                return;
+        }
+        /* ok, now we have a base for frequency calculation */
+        freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase));
+        /* check that the signal is in the range
+         * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
+        if ((freq_norm.sec == 0) ||
+                        (freq_norm.nsec > MAXFREQ * freq_norm.sec) ||
+                        (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) {
+                time_status |= STA_PPSJITTER;
+                /* restart the frequency calibration interval */
+                pps_fbase = *raw_ts;
+                write_sequnlock_irqrestore(&xtime_lock, flags);
+                pr_err("hardpps: PPSJITTER: bad pulse\n");
+                return;
+        }
+        /* signal is ok */
+        /* check if the current frequency interval is finished */
+        if (freq_norm.sec >= (1 << pps_shift)) {
+                pps_calcnt++;
+                /* restart the frequency calibration interval */
+                pps_fbase = *raw_ts;
+                hardpps_update_freq(freq_norm);
+        }
+        hardpps_update_phase(pts_norm.nsec);
+        write_sequnlock_irqrestore(&xtime_lock, flags);
+}
+EXPORT_SYMBOL(hardpps);
+#endif  /* CONFIG_NTP_PPS */
 static int __init ntp_tick_adj_setup(char *str)
 {
        ntp_tick_adj = simple_strtol(str, NULL, 0);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b6b898d2eeef..051bc80a0c43 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -49,7 +49,7 @@ struct tick_device *tick_get_device(int cpu)
 */
 int tick_is_oneshot_available(void)
 {
-        struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+        struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
        return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT);
 }
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index aada0e52680a..5cbc101f908b 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -95,7 +95,7 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
 */
 int tick_program_event(ktime_t expires, int force)
 {
-        struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+        struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
        return tick_dev_program_event(dev, expires, force);
 }
@@ -167,7 +167,7 @@ int tick_oneshot_mode_active(void)
        int ret;
        local_irq_save(flags);
-        ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT;
+        ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT;
        local_irq_restore(flags);
        return ret;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index eef7452bd8a9..d27c7562902c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -288,6 +288,49 @@ void ktime_get_ts(struct timespec *ts)
 }
 EXPORT_SYMBOL_GPL(ktime_get_ts);
+#ifdef CONFIG_NTP_PPS
+/**
+ * getnstime_raw_and_real - get day and raw monotonic time in timespec format
+ * @ts_raw:     pointer to the timespec to be set to raw monotonic time
+ * @ts_real:    pointer to the timespec to be set to the time of day
+ *
+ * This function reads both the time of day and raw monotonic time at the
+ * same time atomically and stores the resulting timestamps in timespec
+ * format.
+ *
+ * Both base values (raw_time, xtime) and both nanosecond offsets are
+ * sampled inside a single xtime_lock read section, and the whole sample is
+ * retried for as long as read_seqretry() returns nonzero, so the two
+ * results come from the same pass.  The same arch_gettimeoffset() value is
+ * added to both offsets.  The offsets are folded into the timespecs with
+ * timespec_add_ns() only after the loop has finished.
+ *
+ * Triggers WARN_ON_ONCE() if called while timekeeping_suspended is set.
+ */
+void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
+{
+        unsigned long seq;
+        s64 nsecs_raw, nsecs_real;
+        WARN_ON_ONCE(timekeeping_suspended);
+        do {
+                u32 arch_offset;
+                seq = read_seqbegin(&xtime_lock);
+                *ts_raw = raw_time;
+                *ts_real = xtime;
+                nsecs_raw = timekeeping_get_ns_raw();
+                nsecs_real = timekeeping_get_ns();
+                /* If arch requires, add in gettimeoffset() */
+                arch_offset = arch_gettimeoffset();
+                nsecs_raw += arch_offset;
+                nsecs_real += arch_offset;
+        } while (read_seqretry(&xtime_lock, seq));
+        timespec_add_ns(ts_raw, nsecs_raw);
+        timespec_add_ns(ts_real, nsecs_real);
+}
+EXPORT_SYMBOL(getnstime_raw_and_real);
+#endif /* CONFIG_NTP_PPS */
 /**
 * do_gettimeofday - Returns the time of day in a timeval
 * @tv:         pointer to the timeval to be set
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 53f338190b26..761c510a06c5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
-obj-$(CONFIG_EVENT_TRACING) += power-traces.o
+obj-$(CONFIG_TRACEPOINTS) += power-traces.o
 ifeq ($(CONFIG_TRACING),y)
 obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
 endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7b8ec0281548..153562d0b93c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -758,53 +758,58 @@ static void blk_add_trace_rq_complete(void *ignore,
 * @q:          queue the io is for
 * @bio:        the source bio
 * @what:       the action
+ * @error:      error, if any
 *
 * Description:
 *     Records an action against a bio. Will log the bio offset + size.
 *
 **/
 static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
-                                     u32 what)
+                              u32 what, int error)
 {
        struct blk_trace *bt = q->blk_trace;
        if (likely(!bt))
                return;
+        if (!error && !bio_flagged(bio, BIO_UPTODATE))
+                error = EIO;
        __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
-                        !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+                        error, 0, NULL);
 }
 static void blk_add_trace_bio_bounce(void *ignore,
                                     struct request_queue *q, struct bio *bio)
 {
-        blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
+        blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
 }
 static void blk_add_trace_bio_complete(void *ignore,
-                                       struct request_queue *q, struct bio *bio)
+                                       struct request_queue *q, struct bio *bio,
+                                       int error)
 {
-        blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
+        blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
 }
 static void blk_add_trace_bio_backmerge(void *ignore,
                                        struct request_queue *q,
                                        struct bio *bio)
 {
-        blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+        blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
 }
 static void blk_add_trace_bio_frontmerge(void *ignore,
                                         struct request_queue *q,
                                         struct bio *bio)
 {
-        blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+        blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
 }
 static void blk_add_trace_bio_queue(void *ignore,
                                    struct request_queue *q, struct bio *bio)
 {
-        blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+        blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
 }
 static void blk_add_trace_getrq(void *ignore,
@@ -812,7 +817,7 @@ static void blk_add_trace_getrq(void *ignore,
                                struct bio *bio, int rw)
 {
        if (bio)
-                blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
+                blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
        else {
                struct blk_trace *bt = q->blk_trace;
@@ -827,7 +832,7 @@ static void blk_add_trace_sleeprq(void *ignore,
                                  struct bio *bio, int rw)
 {
        if (bio)
-                blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
+                blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
        else {
                struct blk_trace *bt = q->blk_trace;
@@ -887,7 +892,7 @@ static void blk_add_trace_split(void *ignore,
 }
 /**
- * blk_add_trace_remap - Add a trace for a remap operation
+ * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
 * @ignore:     trace callback data parameter (not used)
 * @q:          queue the io is for
 * @bio:        the source bio
@@ -899,9 +904,9 @@ static void blk_add_trace_split(void *ignore,
 *     it spans a stripe (or similar). Add a trace for that action.
 *
 **/
-static void blk_add_trace_remap(void *ignore,
+static void blk_add_trace_bio_remap(void *ignore,
-                                struct request_queue *q, struct bio *bio,
+                                    struct request_queue *q, struct bio *bio,
-                                dev_t dev, sector_t from)
+                                    dev_t dev, sector_t from)
 {
        struct blk_trace *bt = q->blk_trace;
        struct blk_io_trace_remap r;
@@ -1016,7 +1021,7 @@ static void blk_register_tracepoints(void)
        WARN_ON(ret);
        ret = register_trace_block_split(blk_add_trace_split, NULL);
        WARN_ON(ret);
-        ret = register_trace_block_remap(blk_add_trace_remap, NULL);
+        ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
        WARN_ON(ret);
        ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
        WARN_ON(ret);
@@ -1025,7 +1030,7 @@ static void blk_register_tracepoints(void)
 static void blk_unregister_tracepoints(void)
 {
        unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
-        unregister_trace_block_remap(blk_add_trace_remap, NULL);
+        unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
        unregister_trace_block_split(blk_add_trace_split, NULL);
        unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
        unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f8cf959bad45..dc53ecb80589 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1313,12 +1313,10 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
        __this_cpu_inc(user_stack_count);
        event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
                                          sizeof(*entry), flags, pc);
        if (!event)
-                return;
+                goto out_drop_count;
        entry   = ring_buffer_event_data(event);
        entry->tgid             = current->tgid;
@@ -1333,8 +1331,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
        if (!filter_check_discard(call, entry, buffer, event))
                ring_buffer_unlock_commit(buffer, event);
+ out_drop_count:
        __this_cpu_dec(user_stack_count);
 out:
        preempt_enable();
 }
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e3dfecaf13e6..6cf223764be8 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -53,7 +53,7 @@
 */
 /*
- * Function trace entry - function address and parent function addres:
+ * Function trace entry - function address and parent function address:
 */
 FTRACE_ENTRY(function, ftrace_entry,
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 562c56e048fd..659732eba07c 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
 static int trace_wakeup_test_thread(void *data)
 {
        /* Make this a RT thread, doesn't need to be too high */
-        static struct sched_param param = { .sched_priority = 5 };
+        static const struct sched_param param = { .sched_priority = 5 };
        struct completion *x = data;
        sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index bac752f0cfb5..b706529b4fc7 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event,
 static int syscall_enter_define_fields(struct ftrace_event_call *call);
 static int syscall_exit_define_fields(struct ftrace_event_call *call);
-/* All syscall exit events have the same fields */
-static LIST_HEAD(syscall_exit_fields);
 static struct list_head *
 syscall_get_enter_fields(struct ftrace_event_call *call)
 {
@@ -34,34 +31,28 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
        return &entry->enter_fields;
 }
-static struct list_head *
-syscall_get_exit_fields(struct ftrace_event_call *call)
-{
-        return &syscall_exit_fields;
-}
 struct trace_event_functions enter_syscall_print_funcs = {
-        .trace                  = print_syscall_enter,
+        .trace          = print_syscall_enter,
 };
 struct trace_event_functions exit_syscall_print_funcs = {
-        .trace                  = print_syscall_exit,
+        .trace          = print_syscall_exit,
 };
 struct ftrace_event_class event_class_syscall_enter = {
-        .system                 = "syscalls",
+        .system         = "syscalls",
-        .reg                    = syscall_enter_register,
+        .reg            = syscall_enter_register,
-        .define_fields          = syscall_enter_define_fields,
+        .define_fields  = syscall_enter_define_fields,
-        .get_fields             = syscall_get_enter_fields,
+        .get_fields     = syscall_get_enter_fields,
-        .raw_init               = init_syscall_trace,
+        .raw_init       = init_syscall_trace,
 };
 struct ftrace_event_class event_class_syscall_exit = {
-        .system                 = "syscalls",
+        .system         = "syscalls",
-        .reg                    = syscall_exit_register,
+        .reg            = syscall_exit_register,
-        .define_fields          = syscall_exit_define_fields,
+        .define_fields  = syscall_exit_define_fields,
-        .get_fields             = syscall_get_exit_fields,
+        .fields         = LIST_HEAD_INIT(event_class_syscall_exit.fields),
-        .raw_init               = init_syscall_trace,
+        .raw_init       = init_syscall_trace,
 };
 extern unsigned long __start_syscalls_metadata[];
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 25915832291a..9da289c34f22 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -12,6 +12,8 @@
 #include <linux/highuid.h>
 #include <linux/cred.h>
+static struct kmem_cache *user_ns_cachep __read_mostly;
 /*
 * Create a new user namespace, deriving the creator from the user in the
 * passed credentials, and replacing that user with the new root user for the
@@ -26,7 +28,7 @@ int create_user_ns(struct cred *new)
        struct user_struct *root_user;
        int n;
-        ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
+        ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL);
        if (!ns)
                return -ENOMEM;
@@ -38,7 +40,7 @@ int create_user_ns(struct cred *new)
        /* Alloc new root user.  */
        root_user = alloc_uid(ns, 0);
        if (!root_user) {
-                kfree(ns);
+                kmem_cache_free(user_ns_cachep, ns);
                return -ENOMEM;
        }
@@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work)
        struct user_namespace *ns =
                container_of(work, struct user_namespace, destroyer);
        free_uid(ns->creator);
-        kfree(ns);
+        kmem_cache_free(user_ns_cachep, ns);
 }
 void free_user_ns(struct kref *kref)
@@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t
        /* No useful relationship so no mapping */
        return overflowgid;
 }
+/*
+ * Init-time setup: create the slab cache that user_ns_cachep points to,
+ * used for struct user_namespace objects.  SLAB_PANIC is passed and the
+ * result is not checked here -- presumably cache creation panics on
+ * failure rather than returning NULL (confirm against the slab API).
+ * Always returns 0.
+ */
+static __init int user_namespaces_init(void)
+{
+        user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
+        return 0;
+}
+module_init(user_namespaces_init);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6e7b575ac33c..d7ebdf4cea98 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -118,12 +118,12 @@ static void __touch_watchdog(void)
 {
        int this_cpu = smp_processor_id();
-        __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
+        __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
 }
 void touch_softlockup_watchdog(void)
 {
-        __raw_get_cpu_var(watchdog_touch_ts) = 0;
+        __this_cpu_write(watchdog_touch_ts, 0);
 }
 EXPORT_SYMBOL(touch_softlockup_watchdog);
@@ -167,12 +167,12 @@ void touch_softlockup_watchdog_sync(void)
 /* watchdog detector functions */
 static int is_hardlockup(void)
 {
-        unsigned long hrint = __get_cpu_var(hrtimer_interrupts);
+        unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
-        if (__get_cpu_var(hrtimer_interrupts_saved) == hrint)
+        if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
                return 1;
-        __get_cpu_var(hrtimer_interrupts_saved) = hrint;
+        __this_cpu_write(hrtimer_interrupts_saved, hrint);
        return 0;
 }
 #endif
@@ -205,8 +205,8 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
        /* Ensure the watchdog never gets throttled */
        event->hw.interrupts = 0;
-        if (__get_cpu_var(watchdog_nmi_touch) == true) {
+        if (__this_cpu_read(watchdog_nmi_touch) == true) {
-                __get_cpu_var(watchdog_nmi_touch) = false;
+                __this_cpu_write(watchdog_nmi_touch, false);
                return;
        }
@@ -220,7 +220,7 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
                int this_cpu = smp_processor_id();
                /* only print hardlockups once */
-                if (__get_cpu_var(hard_watchdog_warn) == true)
+                if (__this_cpu_read(hard_watchdog_warn) == true)
                        return;
                if (hardlockup_panic)
@@ -228,16 +228,16 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,
                else
                        WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
-                __get_cpu_var(hard_watchdog_warn) = true;
+                __this_cpu_write(hard_watchdog_warn, true);
                return;
        }
-        __get_cpu_var(hard_watchdog_warn) = false;
+        __this_cpu_write(hard_watchdog_warn, false);
        return;
 }
 static void watchdog_interrupt_count(void)
 {
-        __get_cpu_var(hrtimer_interrupts)++;
+        __this_cpu_inc(hrtimer_interrupts);
 }
 #else
 static inline void watchdog_interrupt_count(void) { return; }
@@ -246,7 +246,7 @@ static inline void watchdog_interrupt_count(void) { return; }
 /* watchdog kicker functions */
 static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 {
-        unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts);
+        unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
        struct pt_regs *regs = get_irq_regs();
        int duration;
@@ -254,18 +254,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
        watchdog_interrupt_count();
        /* kick the softlockup detector */
-        wake_up_process(__get_cpu_var(softlockup_watchdog));
+        wake_up_process(__this_cpu_read(softlockup_watchdog));
        /* .. and repeat */
        hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
        if (touch_ts == 0) {
-                if (unlikely(__get_cpu_var(softlockup_touch_sync))) {
+                if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
                        /*
                         * If the time stamp was touched atomically
                         * make sure the scheduler tick is up to date.
                         */
-                        __get_cpu_var(softlockup_touch_sync) = false;
+                        __this_cpu_write(softlockup_touch_sync, false);
                        sched_clock_tick();
                }
                __touch_watchdog();
@@ -281,7 +281,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
        duration = is_softlockup(touch_ts);
        if (unlikely(duration)) {
                /* only warn once */
-                if (__get_cpu_var(soft_watchdog_warn) == true)
+                if (__this_cpu_read(soft_watchdog_warn) == true)
                        return HRTIMER_RESTART;
                printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
@@ -296,9 +296,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
                if (softlockup_panic)
                        panic("softlockup: hung tasks");
-                __get_cpu_var(soft_watchdog_warn) = true;
+                __this_cpu_write(soft_watchdog_warn, true);
        } else
-                __get_cpu_var(soft_watchdog_warn) = false;
+                __this_cpu_write(soft_watchdog_warn, false);
        return HRTIMER_RESTART;
 }
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e785b0f2aea5..8ee6ec82f88a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -932,6 +932,38 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
                wake_up_worker(gcwq);
 }
+/*
+ * Test whether @work is being queued from another work executing on the
+ * same workqueue.  This is rather expensive and should only be used from
+ * cold paths.
+ *
+ * Only @wq is passed in; the caller's context is identified by looking for
+ * %current among the busy workers of every gcwq, taking each gcwq->lock
+ * with spin_lock_irqsave() while its busy workers are walked (hence the
+ * cost).
+ *
+ * Returns %true if %current is a busy worker whose current_cwq belongs to
+ * @wq, %false if it belongs to another workqueue or if %current is not
+ * found among the busy workers at all.
+ */
+static bool is_chained_work(struct workqueue_struct *wq)
+{
+        unsigned long flags;
+        unsigned int cpu;
+        for_each_gcwq_cpu(cpu) {
+                struct global_cwq *gcwq = get_gcwq(cpu);
+                struct worker *worker;
+                struct hlist_node *pos;
+                int i;
+                spin_lock_irqsave(&gcwq->lock, flags);
+                for_each_busy_worker(worker, i, pos, gcwq) {
+                        if (worker->task != current)
+                                continue;
+                        spin_unlock_irqrestore(&gcwq->lock, flags);
+                        /*
+                         * I'm @worker, no locking necessary.  See if @work
+                         * is headed to the same workqueue.
+                         */
+                        return worker->current_cwq->wq == wq;
+                }
+                spin_unlock_irqrestore(&gcwq->lock, flags);
+        }
+        return false;
+}
 static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
                         struct work_struct *work)
 {
@@ -943,7 +975,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
        debug_work_activate(work);
-        if (WARN_ON_ONCE(wq->flags & WQ_DYING))
+        /* if dying, only works from the same workqueue are allowed */
+        if (unlikely(wq->flags & WQ_DYING) &&
+            WARN_ON_ONCE(!is_chained_work(wq)))
                return;
        /* determine gcwq to use */
@@ -2936,11 +2970,35 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
 */
 void destroy_workqueue(struct workqueue_struct *wq)
 {
+        unsigned int flush_cnt = 0;
        unsigned int cpu;
+        /*
+         * Mark @wq dying and drain all pending works.  Once WQ_DYING is
+         * set, only chain queueing is allowed.  IOW, only currently
+         * pending or running work items on @wq can queue further work
+         * items on it.  @wq is flushed repeatedly until it becomes empty.
+         * The number of flushes is determined by the depth of chaining and
+         * should be relatively short.  Whine if it takes too long.
+         */
        wq->flags |= WQ_DYING;
+reflush:
        flush_workqueue(wq);
+        for_each_cwq_cpu(cpu, wq) {
+                struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+                if (!cwq->nr_active && list_empty(&cwq->delayed_works))
+                        continue;
+                if (++flush_cnt == 10 ||
+                    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
+                        printk(KERN_WARNING "workqueue %s: flush on "
+                               "destruction isn't complete after %u tries\n",
+                               wq->name, flush_cnt);
+                goto reflush;
+        }
        /*
         * wq list is used to freeze wq, remove from list after
         * flushing is complete in case freeze races us.