Merge branch 'linus' into x86/urgent

Merge in Linus's tree to avoid a conflict.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
author: Ingo Molnar <mingo@kernel.org> 2012-07-25 15:40:40 -0400
committer: Ingo Molnar <mingo@kernel.org> 2012-07-25 15:40:40 -0400
commit: d431adfbc9b7de651f3164c6b7ffcad75805d7e4 (patch)
tree: 29bce222c81a3a392e51c11e2188659aa6d1bded /kernel
parent: d6250a3f12edb3a86db9598ffeca3de8b4a219e9 (diff)
parent: e2b34e311be3a57c9abcb927e37a57e38913714c (diff)
34 files changed, 1838 insertions, 1417 deletions
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 67b847dfa2bb..1f91413edb87 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -14,6 +14,7 @@
 #include <linux/ctype.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
+#include <linux/kmsg_dump.h>
 #include <linux/reboot.h>
 #include <linux/sched.h>
 #include <linux/sysrq.h>
@@ -2040,8 +2041,15 @@ static int kdb_env(int argc, const char **argv)
 */
 static int kdb_dmesg(int argc, const char **argv)
 {
-        char *syslog_data[4], *start, *end, c = '\0', *p;
+        int diag;
-        int diag, logging, logsize, lines = 0, adjust = 0, n;
+        int logging;
+        int lines = 0;
+        int adjust = 0;
+        int n = 0;
+        int skip = 0;
+        struct kmsg_dumper dumper = { .active = 1 };
+        size_t len;
+        char buf[201];
        if (argc > 2)
                return KDB_ARGCOUNT;
@@ -2064,22 +2072,10 @@ static int kdb_dmesg(int argc, const char **argv)
                kdb_set(2, setargs);
        }
-        /* syslog_data[0,1] physical start, end+1.  syslog_data[2,3]
+        kmsg_dump_rewind_nolock(&dumper);
-         * logical start, end+1. */
+        while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
-        kdb_syslog_data(syslog_data);
+                n++;
-        if (syslog_data[2] == syslog_data[3])
-                return 0;
-        logsize = syslog_data[1] - syslog_data[0];
-        start = syslog_data[2];
-        end = syslog_data[3];
-#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0])
-        for (n = 0, p = start; p < end; ++p) {
-                c = *KDB_WRAP(p);
-                if (c == '\n')
-                        ++n;
-        }
-        if (c != '\n')
-                ++n;
        if (lines < 0) {
                if (adjust >= n)
                        kdb_printf("buffer only contains %d lines, nothing "
@@ -2087,21 +2083,11 @@ static int kdb_dmesg(int argc, const char **argv)
                else if (adjust - lines >= n)
                        kdb_printf("buffer only contains %d lines, last %d "
                                   "lines printed\n", n, n - adjust);
-                if (adjust) {
+                skip = adjust;
-                        for (; start < end && adjust; ++start) {
+                lines = abs(lines);
-                                if (*KDB_WRAP(start) == '\n')
-                                        --adjust;
-                        }
-                        if (start < end)
-                                ++start;
-                }
-                for (p = start; p < end && lines; ++p) {
-                        if (*KDB_WRAP(p) == '\n')
-                                ++lines;
-                }
-                end = p;
        } else if (lines > 0) {
-                int skip = n - (adjust + lines);
+                skip = n - lines - adjust;
+                lines = abs(lines);
                if (adjust >= n) {
                        kdb_printf("buffer only contains %d lines, "
                                   "nothing printed\n", n);
@@ -2112,35 +2098,24 @@ static int kdb_dmesg(int argc, const char **argv)
                        kdb_printf("buffer only contains %d lines, first "
                                   "%d lines printed\n", n, lines);
                }
-                for (; start < end && skip; ++start) {
+        } else {
-                        if (*KDB_WRAP(start) == '\n')
+                lines = n;
-                                --skip;
-                }
-                for (p = start; p < end && lines; ++p) {
-                        if (*KDB_WRAP(p) == '\n')
-                                --lines;
-                }
-                end = p;
        }
-        /* Do a line at a time (max 200 chars) to reduce protocol overhead */
-        c = '\n';
+        if (skip >= n || skip < 0)
-        while (start != end) {
+                return 0;
-                char buf[201];
-                p = buf;
+        kmsg_dump_rewind_nolock(&dumper);
-                if (KDB_FLAG(CMD_INTERRUPT))
+        while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
-                        return 0;
+                if (skip) {
-                while (start < end && (c = *KDB_WRAP(start)) &&
+                        skip--;
-                       (p - buf) < sizeof(buf)-1) {
+                        continue;
-                        ++start;
-                        *p++ = c;
-                        if (c == '\n')
-                                break;
                }
-                *p = '\0';
+                if (!lines--)
-                kdb_printf("%s", buf);
+                        break;
+                kdb_printf("%.*s\n", (int)len - 1, buf);
        }
-        if (c != '\n')
-                kdb_printf("\n");
        return 0;
 }
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 47c4e56e513b..392ec6a25844 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -205,7 +205,6 @@ extern char kdb_grep_string[];
 extern int kdb_grep_leading;
 extern int kdb_grep_trailing;
 extern char *kdb_cmds[];
-extern void kdb_syslog_data(char *syslog_data[]);
 extern unsigned long kdb_task_state_string(const char *);
 extern char kdb_task_state_char (const struct task_struct *);
 extern unsigned long kdb_task_state(const struct task_struct *p,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d7d71d6ec972..f1cf0edeb39a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1645,6 +1645,8 @@ perf_install_in_context(struct perf_event_context *ctx,
        lockdep_assert_held(&ctx->mutex);
        event->ctx = ctx;
+        if (event->cpu != -1)
+                event->cpu = cpu;
        if (!task) {
                /*
@@ -6252,6 +6254,8 @@ SYSCALL_DEFINE5(perf_event_open,
                }
        }
+        get_online_cpus();
        event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
                                 NULL, NULL);
        if (IS_ERR(event)) {
@@ -6304,7 +6308,7 @@ SYSCALL_DEFINE5(perf_event_open,
        /*
         * Get the target context (task or percpu):
         */
-        ctx = find_get_context(pmu, task, cpu);
+        ctx = find_get_context(pmu, task, event->cpu);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
                goto err_alloc;
@@ -6377,20 +6381,23 @@ SYSCALL_DEFINE5(perf_event_open,
        mutex_lock(&ctx->mutex);
        if (move_group) {
-                perf_install_in_context(ctx, group_leader, cpu);
+                synchronize_rcu();
+                perf_install_in_context(ctx, group_leader, event->cpu);
                get_ctx(ctx);
                list_for_each_entry(sibling, &group_leader->sibling_list,
                                    group_entry) {
-                        perf_install_in_context(ctx, sibling, cpu);
+                        perf_install_in_context(ctx, sibling, event->cpu);
                        get_ctx(ctx);
                }
        }
-        perf_install_in_context(ctx, event, cpu);
+        perf_install_in_context(ctx, event, event->cpu);
        ++ctx->generation;
        perf_unpin_context(ctx);
        mutex_unlock(&ctx->mutex);
+        put_online_cpus();
        event->owner = current;
        mutex_lock(&current->perf_event_mutex);
@@ -6419,6 +6426,7 @@ err_context:
 err_alloc:
        free_event(event);
 err_task:
+        put_online_cpus();
        if (task)
                put_task_struct(task);
 err_group_fd:
@@ -6479,6 +6487,39 @@ err:
 }
 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
+void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+{
+        struct perf_event_context *src_ctx;
+        struct perf_event_context *dst_ctx;
+        struct perf_event *event, *tmp;
+        LIST_HEAD(events);
+        src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
+        dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
+        mutex_lock(&src_ctx->mutex);
+        list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
+                                 event_entry) {
+                perf_remove_from_context(event);
+                put_ctx(src_ctx);
+                list_add(&event->event_entry, &events);
+        }
+        mutex_unlock(&src_ctx->mutex);
+        synchronize_rcu();
+        mutex_lock(&dst_ctx->mutex);
+        list_for_each_entry_safe(event, tmp, &events, event_entry) {
+                list_del(&event->event_entry);
+                if (event->state >= PERF_EVENT_STATE_OFF)
+                        event->state = PERF_EVENT_STATE_INACTIVE;
+                perf_install_in_context(dst_ctx, event, dst_cpu);
+                get_ctx(dst_ctx);
+        }
+        mutex_unlock(&dst_ctx->mutex);
+}
+EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
 static void sync_child_event(struct perf_event *child_event,
                               struct task_struct *child)
 {
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 985be4d80fe8..f93532748bca 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -38,13 +38,29 @@
 #define UINSNS_PER_PAGE                 (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
 #define MAX_UPROBE_XOL_SLOTS            UINSNS_PER_PAGE
-static struct srcu_struct uprobes_srcu;
 static struct rb_root uprobes_tree = RB_ROOT;
 static DEFINE_SPINLOCK(uprobes_treelock);       /* serialize rbtree access */
 #define UPROBES_HASH_SZ 13
+/*
+ * We need separate register/unregister and mmap/munmap lock hashes because
+ * of mmap_sem nesting.
+ *
+ * uprobe_register() needs to install probes on (potentially) all processes
+ * and thus needs to acquire multiple mmap_sems (consecutively, not
+ * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
+ * for the particular process doing the mmap.
+ *
+ * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
+ * because of lock order against i_mmap_mutex. This means there's a hole in
+ * the register vma iteration where a mmap() can happen.
+ *
+ * Thus uprobe_register() can race with uprobe_mmap() and we can try and
+ * install a probe where one is already installed.
+ */
 /* serialize (un)register */
 static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
@@ -61,17 +77,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
 */
 static atomic_t uprobe_events = ATOMIC_INIT(0);
-/*
- * Maintain a temporary per vma info that can be used to search if a vma
- * has already been handled. This structure is introduced since extending
- * vm_area_struct wasnt recommended.
- */
-struct vma_info {
-        struct list_head        probe_list;
-        struct mm_struct        *mm;
-        loff_t                  vaddr;
-};
 struct uprobe {
        struct rb_node          rb_node;        /* node in the rb tree */
        atomic_t                ref;
@@ -100,7 +105,8 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register)
        if (!is_register)
                return true;
-        if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC))
+        if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED))
+                                == (VM_READ|VM_EXEC))
                return true;
        return false;
@@ -129,33 +135,17 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset)
 static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage)
 {
        struct mm_struct *mm = vma->vm_mm;
-        pgd_t *pgd;
-        pud_t *pud;
-        pmd_t *pmd;
-        pte_t *ptep;
-        spinlock_t *ptl;
        unsigned long addr;
-        int err = -EFAULT;
+        spinlock_t *ptl;
+        pte_t *ptep;
        addr = page_address_in_vma(page, vma);
        if (addr == -EFAULT)
-                goto out;
+                return -EFAULT;
-        pgd = pgd_offset(mm, addr);
-        if (!pgd_present(*pgd))
-                goto out;
-        pud = pud_offset(pgd, addr);
-        if (!pud_present(*pud))
-                goto out;
-        pmd = pmd_offset(pud, addr);
-        if (!pmd_present(*pmd))
-                goto out;
-        ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+        ptep = page_check_address(page, mm, addr, &ptl, 0);
        if (!ptep)
-                goto out;
+                return -EAGAIN;
        get_page(kpage);
        page_add_new_anon_rmap(kpage, vma, addr);
@@ -174,10 +164,8 @@ static int __replace_page(struct vm_area_struct *vma, struct page *page, struct
                try_to_free_swap(page);
        put_page(page);
        pte_unmap_unlock(ptep, ptl);
-        err = 0;
-out:
+        return 0;
-        return err;
 }
 /**
@@ -222,9 +210,8 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
        void *vaddr_old, *vaddr_new;
        struct vm_area_struct *vma;
        struct uprobe *uprobe;
-        loff_t addr;
        int ret;
+retry:
        /* Read the page with vaddr into memory */
        ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma);
        if (ret <= 0)
@@ -246,10 +233,6 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
        if (mapping != vma->vm_file->f_mapping)
                goto put_out;
-        addr = vma_address(vma, uprobe->offset);
-        if (vaddr != (unsigned long)addr)
-                goto put_out;
        ret = -ENOMEM;
        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
        if (!new_page)
@@ -267,11 +250,7 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
        vaddr_new = kmap_atomic(new_page);
        memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
+        memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE);
-        /* poke the new insn in, ASSUMES we don't cross page boundary */
-        vaddr &= ~PAGE_MASK;
-        BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
-        memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
        kunmap_atomic(vaddr_new);
        kunmap_atomic(vaddr_old);
@@ -291,6 +270,8 @@ unlock_out:
 put_out:
        put_page(old_page);
+        if (unlikely(ret == -EAGAIN))
+                goto retry;
        return ret;
 }
@@ -312,7 +293,7 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_
        void *vaddr_new;
        int ret;
-        ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL);
+        ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
        if (ret <= 0)
                return ret;
@@ -333,10 +314,20 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
        uprobe_opcode_t opcode;
        int result;
+        if (current->mm == mm) {
+                pagefault_disable();
+                result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
+                                                                sizeof(opcode));
+                pagefault_enable();
+                if (likely(result == 0))
+                        goto out;
+        }
        result = read_opcode(mm, vaddr, &opcode);
        if (result)
                return result;
+out:
        if (is_swbp_insn(&opcode))
                return 1;
@@ -355,7 +346,9 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
 int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 {
        int result;
+        /*
+         * See the comment near uprobes_hash().
+         */
        result = is_swbp_at_addr(mm, vaddr);
        if (result == 1)
                return -EEXIST;
@@ -520,7 +513,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
        uprobe->inode = igrab(inode);
        uprobe->offset = offset;
        init_rwsem(&uprobe->consumer_rwsem);
-        INIT_LIST_HEAD(&uprobe->pending_list);
        /* add to uprobes_tree, sorted on inode:offset */
        cur_uprobe = insert_uprobe(uprobe);
@@ -588,20 +580,22 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
 }
 static int
-__copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn,
+__copy_insn(struct address_space *mapping, struct file *filp, char *insn,
-                        unsigned long nbytes, unsigned long offset)
+                        unsigned long nbytes, loff_t offset)
 {
-        struct file *filp = vma->vm_file;
        struct page *page;
        void *vaddr;
-        unsigned long off1;
+        unsigned long off;
-        unsigned long idx;
+        pgoff_t idx;
        if (!filp)
                return -EINVAL;
-        idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT);
+        if (!mapping->a_ops->readpage)
-        off1 = offset &= ~PAGE_MASK;
+                return -EIO;
+        idx = offset >> PAGE_CACHE_SHIFT;
+        off = offset & ~PAGE_MASK;
        /*
         * Ensure that the page that has the original instruction is
@@ -612,22 +606,20 @@ __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *ins
                return PTR_ERR(page);
        vaddr = kmap_atomic(page);
-        memcpy(insn, vaddr + off1, nbytes);
+        memcpy(insn, vaddr + off, nbytes);
        kunmap_atomic(vaddr);
        page_cache_release(page);
        return 0;
 }
-static int
+static int copy_insn(struct uprobe *uprobe, struct file *filp)
-copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
 {
        struct address_space *mapping;
        unsigned long nbytes;
        int bytes;
-        addr &= ~PAGE_MASK;
+        nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK);
-        nbytes = PAGE_SIZE - addr;
        mapping = uprobe->inode->i_mapping;
        /* Instruction at end of binary; copy only available bytes */
@@ -638,13 +630,13 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
        /* Instruction at the page-boundary; copy bytes in second page */
        if (nbytes < bytes) {
-                if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes,
+                int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
-                                bytes - nbytes, uprobe->offset + nbytes))
+                                bytes - nbytes, uprobe->offset + nbytes);
-                        return -ENOMEM;
+                if (err)
+                        return err;
                bytes = nbytes;
        }
-        return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset);
+        return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
 }
 /*
@@ -672,9 +664,8 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
 */
 static int
 install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
-                        struct vm_area_struct *vma, loff_t vaddr)
+                        struct vm_area_struct *vma, unsigned long vaddr)
 {
-        unsigned long addr;
        int ret;
        /*
@@ -687,20 +678,22 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
        if (!uprobe->consumers)
                return -EEXIST;
-        addr = (unsigned long)vaddr;
        if (!(uprobe->flags & UPROBE_COPY_INSN)) {
-                ret = copy_insn(uprobe, vma, addr);
+                ret = copy_insn(uprobe, vma->vm_file);
                if (ret)
                        return ret;
                if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
-                        return -EEXIST;
+                        return -ENOTSUPP;
-                ret = arch_uprobe_analyze_insn(&uprobe->arch, mm);
+                ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
                if (ret)
                        return ret;
+                /* write_opcode() assumes we don't cross page boundary */
+                BUG_ON((uprobe->offset & ~PAGE_MASK) +
+                                UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
                uprobe->flags |= UPROBE_COPY_INSN;
        }
@@ -713,7 +706,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
         * Hence increment before and decrement on failure.
         */
        atomic_inc(&mm->uprobes_state.count);
-        ret = set_swbp(&uprobe->arch, mm, addr);
+        ret = set_swbp(&uprobe->arch, mm, vaddr);
        if (ret)
                atomic_dec(&mm->uprobes_state.count);
@@ -721,27 +714,21 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
 }
 static void
-remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr)
+remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
 {
-        if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true))
+        if (!set_orig_insn(&uprobe->arch, mm, vaddr, true))
                atomic_dec(&mm->uprobes_state.count);
 }
 /*
- * There could be threads that have hit the breakpoint and are entering the
+ * There could be threads that have already hit the breakpoint. They
- * notifier code and trying to acquire the uprobes_treelock. The thread
+ * will recheck the current insn and restart if find_uprobe() fails.
- * calling delete_uprobe() that is removing the uprobe from the rb_tree can
+ * See find_active_uprobe().
- * race with these threads and might acquire the uprobes_treelock compared
- * to some of the breakpoint hit threads. In such a case, the breakpoint
- * hit threads will not find the uprobe. The current unregistering thread
- * waits till all other threads have hit a breakpoint, to acquire the
- * uprobes_treelock before the uprobe is removed from the rbtree.
 */
 static void delete_uprobe(struct uprobe *uprobe)
 {
        unsigned long flags;
-        synchronize_srcu(&uprobes_srcu);
        spin_lock_irqsave(&uprobes_treelock, flags);
        rb_erase(&uprobe->rb_node, &uprobes_tree);
        spin_unlock_irqrestore(&uprobes_treelock, flags);
@@ -750,139 +737,135 @@ static void delete_uprobe(struct uprobe *uprobe)
        atomic_dec(&uprobe_events);
 }
-static struct vma_info *
+struct map_info {
-__find_next_vma_info(struct address_space *mapping, struct list_head *head,
+        struct map_info *next;
-                        struct vma_info *vi, loff_t offset, bool is_register)
+        struct mm_struct *mm;
+        unsigned long vaddr;
+};
+static inline struct map_info *free_map_info(struct map_info *info)
+{
+        struct map_info *next = info->next;
+        kfree(info);
+        return next;
+}
+static struct map_info *
+build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
 {
+        unsigned long pgoff = offset >> PAGE_SHIFT;
        struct prio_tree_iter iter;
        struct vm_area_struct *vma;
-        struct vma_info *tmpvi;
+        struct map_info *curr = NULL;
-        unsigned long pgoff;
+        struct map_info *prev = NULL;
-        int existing_vma;
+        struct map_info *info;
-        loff_t vaddr;
+        int more = 0;
-        pgoff = offset >> PAGE_SHIFT;
+ again:
+        mutex_lock(&mapping->i_mmap_mutex);
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                if (!valid_vma(vma, is_register))
                        continue;
-                existing_vma = 0;
+                if (!prev && !more) {
-                vaddr = vma_address(vma, offset);
+                        /*
+                         * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
-                list_for_each_entry(tmpvi, head, probe_list) {
+                         * reclaim. This is optimistic, no harm done if it fails.
-                        if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) {
+                         */
-                                existing_vma = 1;
+                        prev = kmalloc(sizeof(struct map_info),
-                                break;
+                                        GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
-                        }
+                        if (prev)
+                                prev->next = NULL;
                }
+                if (!prev) {
-                /*
+                        more++;
-                 * Another vma needs a probe to be installed. However skip
+                        continue;
-                 * installing the probe if the vma is about to be unlinked.
-                 */
-                if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) {
-                        vi->mm = vma->vm_mm;
-                        vi->vaddr = vaddr;
-                        list_add(&vi->probe_list, head);
-                        return vi;
                }
-        }
-        return NULL;
+                if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
-}
+                        continue;
-/*
- * Iterate in the rmap prio tree  and find a vma where a probe has not
- * yet been inserted.
- */
-static struct vma_info *
-find_next_vma_info(struct address_space *mapping, struct list_head *head,
-                loff_t offset, bool is_register)
-{
-        struct vma_info *vi, *retvi;
-        vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL);
+                info = prev;
-        if (!vi)
+                prev = prev->next;
-                return ERR_PTR(-ENOMEM);
+                info->next = curr;
+                curr = info;
-        mutex_lock(&mapping->i_mmap_mutex);
+                info->mm = vma->vm_mm;
-        retvi = __find_next_vma_info(mapping, head, vi, offset, is_register);
+                info->vaddr = vma_address(vma, offset);
+        }
        mutex_unlock(&mapping->i_mmap_mutex);
-        if (!retvi)
+        if (!more)
-                kfree(vi);
+                goto out;
+        prev = curr;
+        while (curr) {
+                mmput(curr->mm);
+                curr = curr->next;
+        }
-        return retvi;
+        do {
+                info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
+                if (!info) {
+                        curr = ERR_PTR(-ENOMEM);
+                        goto out;
+                }
+                info->next = prev;
+                prev = info;
+        } while (--more);
+        goto again;
+ out:
+        while (prev)
+                prev = free_map_info(prev);
+        return curr;
 }
 static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
 {
-        struct list_head try_list;
+        struct map_info *info;
-        struct vm_area_struct *vma;
+        int err = 0;
-        struct address_space *mapping;
-        struct vma_info *vi, *tmpvi;
-        struct mm_struct *mm;
-        loff_t vaddr;
-        int ret;
-        mapping = uprobe->inode->i_mapping;
+        info = build_map_info(uprobe->inode->i_mapping,
-        INIT_LIST_HEAD(&try_list);
+                                        uprobe->offset, is_register);
+        if (IS_ERR(info))
+                return PTR_ERR(info);
-        ret = 0;
+        while (info) {
+                struct mm_struct *mm = info->mm;
+                struct vm_area_struct *vma;
-        for (;;) {
+                if (err)
-                vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register);
+                        goto free;
-                if (!vi)
-                        break;
-                if (IS_ERR(vi)) {
+                down_write(&mm->mmap_sem);
-                        ret = PTR_ERR(vi);
+                vma = find_vma(mm, (unsigned long)info->vaddr);
-                        break;
+                if (!vma || !valid_vma(vma, is_register))
-                }
+                        goto unlock;
-                mm = vi->mm;
-                down_read(&mm->mmap_sem);
-                vma = find_vma(mm, (unsigned long)vi->vaddr);
-                if (!vma || !valid_vma(vma, is_register)) {
-                        list_del(&vi->probe_list);
-                        kfree(vi);
-                        up_read(&mm->mmap_sem);
-                        mmput(mm);
-                        continue;
-                }
-                vaddr = vma_address(vma, uprobe->offset);
                if (vma->vm_file->f_mapping->host != uprobe->inode ||
-                                                vaddr != vi->vaddr) {
+                    vma_address(vma, uprobe->offset) != info->vaddr)
-                        list_del(&vi->probe_list);
+                        goto unlock;
-                        kfree(vi);
-                        up_read(&mm->mmap_sem);
-                        mmput(mm);
-                        continue;
-                }
-                if (is_register)
-                        ret = install_breakpoint(uprobe, mm, vma, vi->vaddr);
-                else
-                        remove_breakpoint(uprobe, mm, vi->vaddr);
-                up_read(&mm->mmap_sem);
-                mmput(mm);
                if (is_register) {
-                        if (ret && ret == -EEXIST)
+                        err = install_breakpoint(uprobe, mm, vma, info->vaddr);
-                                ret = 0;
+                        /*
-                        if (ret)
+                         * We can race against uprobe_mmap(), see the
-                                break;
+                         * comment near uprobes_hash().
+                         */
+                        if (err == -EEXIST)
+                                err = 0;
+                } else {
+                        remove_breakpoint(uprobe, mm, info->vaddr);
                }
+ unlock:
+                up_write(&mm->mmap_sem);
+ free:
+                mmput(mm);
+                info = free_map_info(info);
        }
-        list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) {
+        return err;
-                list_del(&vi->probe_list);
-                kfree(vi);
-        }
-        return ret;
 }
 static int __uprobe_register(struct uprobe *uprobe)
@@ -1048,7 +1031,7 @@ static void build_probe_list(struct inode *inode, struct list_head *head)
 int uprobe_mmap(struct vm_area_struct *vma)
 {
        struct list_head tmp_list;
-        struct uprobe *uprobe, *u;
+        struct uprobe *uprobe;
        struct inode *inode;
        int ret, count;
@@ -1066,12 +1049,9 @@ int uprobe_mmap(struct vm_area_struct *vma)
        ret = 0;
        count = 0;
-        list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
+        list_for_each_entry(uprobe, &tmp_list, pending_list) {
-                loff_t vaddr;
-                list_del(&uprobe->pending_list);
                if (!ret) {
-                        vaddr = vma_address(vma, uprobe->offset);
+                        loff_t vaddr = vma_address(vma, uprobe->offset);
                        if (vaddr < vma->vm_start || vaddr >= vma->vm_end) {
                                put_uprobe(uprobe);
@@ -1079,8 +1059,10 @@ int uprobe_mmap(struct vm_area_struct *vma)
                        }
                        ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
+                        /*
-                        /* Ignore double add: */
+                         * We can race against uprobe_register(), see the
+                         * comment near uprobes_hash().
+                         */
                        if (ret == -EEXIST) {
                                ret = 0;
@@ -1115,7 +1097,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
 void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
 {
        struct list_head tmp_list;
-        struct uprobe *uprobe, *u;
+        struct uprobe *uprobe;
        struct inode *inode;
        if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
@@ -1132,11 +1114,8 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
        mutex_lock(uprobes_mmap_hash(inode));
        build_probe_list(inode, &tmp_list);
-        list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
+        list_for_each_entry(uprobe, &tmp_list, pending_list) {
-                loff_t vaddr;
+                loff_t vaddr = vma_address(vma, uprobe->offset);
-                list_del(&uprobe->pending_list);
-                vaddr = vma_address(vma, uprobe->offset);
                if (vaddr >= start && vaddr < end) {
                        /*
@@ -1378,9 +1357,6 @@ void uprobe_free_utask(struct task_struct *t)
 {
        struct uprobe_task *utask = t->utask;
-        if (t->uprobe_srcu_id != -1)
-                srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id);
        if (!utask)
                return;
@@ -1398,7 +1374,6 @@ void uprobe_free_utask(struct task_struct *t)
 void uprobe_copy_process(struct task_struct *t)
 {
        t->utask = NULL;
-        t->uprobe_srcu_id = -1;
 }
 /*
@@ -1417,7 +1392,6 @@ static struct uprobe_task *add_utask(void)
        if (unlikely(!utask))
                return NULL;
-        utask->active_uprobe = NULL;
        current->utask = utask;
        return utask;
 }
@@ -1479,41 +1453,64 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
        return false;
 }
+static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
+{
+        struct mm_struct *mm = current->mm;
+        struct uprobe *uprobe = NULL;
+        struct vm_area_struct *vma;
+        down_read(&mm->mmap_sem);
+        vma = find_vma(mm, bp_vaddr);
+        if (vma && vma->vm_start <= bp_vaddr) {
+                if (valid_vma(vma, false)) {
+                        struct inode *inode;
+                        loff_t offset;
+                        inode = vma->vm_file->f_mapping->host;
+                        offset = bp_vaddr - vma->vm_start;
+                        offset += (vma->vm_pgoff << PAGE_SHIFT);
+                        uprobe = find_uprobe(inode, offset);
+                }
+                if (!uprobe)
+                        *is_swbp = is_swbp_at_addr(mm, bp_vaddr);
+        } else {
+                *is_swbp = -EFAULT;
+        }
+        up_read(&mm->mmap_sem);
+        return uprobe;
+}
 /*
 * Run handler and ask thread to singlestep.
 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
 */
 static void handle_swbp(struct pt_regs *regs)
 {
-        struct vm_area_struct *vma;
        struct uprobe_task *utask;
        struct uprobe *uprobe;
-        struct mm_struct *mm;
        unsigned long bp_vaddr;
+        int uninitialized_var(is_swbp);
-        uprobe = NULL;
        bp_vaddr = uprobe_get_swbp_addr(regs);
-        mm = current->mm;
+        uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
-        down_read(&mm->mmap_sem);
-        vma = find_vma(mm, bp_vaddr);
-        if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) {
-                struct inode *inode;
-                loff_t offset;
-                inode = vma->vm_file->f_mapping->host;
-                offset = bp_vaddr - vma->vm_start;
-                offset += (vma->vm_pgoff << PAGE_SHIFT);
-                uprobe = find_uprobe(inode, offset);
-        }
-        srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id);
-        current->uprobe_srcu_id = -1;
-        up_read(&mm->mmap_sem);
        if (!uprobe) {
-                /* No matching uprobe; signal SIGTRAP. */
+                if (is_swbp > 0) {
-                send_sig(SIGTRAP, current, 0);
+                        /* No matching uprobe; signal SIGTRAP. */
+                        send_sig(SIGTRAP, current, 0);
+                } else {
+                        /*
+                         * Either we raced with uprobe_unregister() or we can't
+                         * access this memory. The latter is only possible if
+                         * another thread plays with our ->mm. In both cases
+                         * we can simply restart. If this vma was unmapped we
+                         * can pretend this insn was not executed yet and get
+                         * the (correct) SIGSEGV after restart.
+                         */
+                        instruction_pointer_set(regs, bp_vaddr);
+                }
                return;
        }
@@ -1620,7 +1617,6 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs)
                utask->state = UTASK_BP_HIT;
        set_thread_flag(TIF_UPROBE);
-        current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu);
        return 1;
 }
@@ -1655,7 +1651,6 @@ static int __init init_uprobes(void)
                mutex_init(&uprobes_mutex[i]);
                mutex_init(&uprobes_mmap_mutex[i]);
        }
-        init_srcu_struct(&uprobes_srcu);
        return register_die_notifier(&uprobe_exception_nb);
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index ab5211b9e622..f00e319d8376 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -304,12 +304,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
        }
        err = arch_dup_task_struct(tsk, orig);
-        if (err)
-                goto out;
+        /*
+         * We defer looking at err, because we will need this setup
+         * for the clean up path to work correctly.
+         */
        tsk->stack = ti;
        setup_thread_stack(tsk, orig);
+        if (err)
+                goto out;
        clear_user_return_notifier(tsk);
        clear_tsk_need_resched(tsk);
        stackend = end_of_stack(tsk);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ae34bf51682b..6db7a5ed52b5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
        return 0;
 }
+static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
+{
+        ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
+        ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
+        return ktime_get_update_offsets(offs_real, offs_boot);
+}
 /*
 * Retrigger next event is called after clock was set
 *
@@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
 static void retrigger_next_event(void *arg)
 {
        struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
-        struct timespec realtime_offset, xtim, wtm, sleep;
        if (!hrtimer_hres_active())
                return;
-        /* Optimized out for !HIGH_RES */
-        get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
-        set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
-        /* Adjust CLOCK_REALTIME offset */
        raw_spin_lock(&base->lock);
-        base->clock_base[HRTIMER_BASE_REALTIME].offset =
+        hrtimer_update_base(base);
-                timespec_to_ktime(realtime_offset);
-        base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
-                timespec_to_ktime(sleep);
        hrtimer_force_reprogram(base, 0);
        raw_spin_unlock(&base->lock);
 }
@@ -710,13 +708,25 @@ static int hrtimer_switch_to_hres(void)
                base->clock_base[i].resolution = KTIME_HIGH_RES;
        tick_setup_sched_timer();
        /* "Retrigger" the interrupt to get things going */
        retrigger_next_event(NULL);
        local_irq_restore(flags);
        return 1;
 }
+/*
+ * Called from timekeeping code to reprogram the hrtimer interrupt
+ * device. If called from the timer interrupt context we defer it to
+ * softirq context.
+ */
+void clock_was_set_delayed(void)
+{
+        struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+        cpu_base->clock_was_set = 1;
+        __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+}
 #else
 static inline int hrtimer_hres_active(void) { return 0; }
@@ -1250,11 +1260,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
        cpu_base->nr_events++;
        dev->next_event.tv64 = KTIME_MAX;
-        entry_time = now = ktime_get();
+        raw_spin_lock(&cpu_base->lock);
+        entry_time = now = hrtimer_update_base(cpu_base);
 retry:
        expires_next.tv64 = KTIME_MAX;
-        raw_spin_lock(&cpu_base->lock);
        /*
         * We set expires_next to KTIME_MAX here with cpu_base->lock
         * held to prevent that a timer is enqueued in our queue via
@@ -1330,8 +1339,12 @@ retry:
         * We need to prevent that we loop forever in the hrtimer
         * interrupt routine. We give it 3 attempts to avoid
         * overreacting on some spurious event.
+         *
+         * Acquire base lock for updating the offsets and retrieving
+         * the current time.
         */
-        now = ktime_get();
+        raw_spin_lock(&cpu_base->lock);
+        now = hrtimer_update_base(cpu_base);
        cpu_base->nr_retries++;
        if (++retries < 3)
                goto retry;
@@ -1343,6 +1356,7 @@ retry:
         */
        cpu_base->nr_hangs++;
        cpu_base->hang_detected = 1;
+        raw_spin_unlock(&cpu_base->lock);
        delta = ktime_sub(now, entry_time);
        if (delta.tv64 > cpu_base->max_hang_time.tv64)
                cpu_base->max_hang_time = delta;
@@ -1395,6 +1409,13 @@ void hrtimer_peek_ahead_timers(void)
 static void run_hrtimer_softirq(struct softirq_action *h)
 {
+        struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
+        if (cpu_base->clock_was_set) {
+                cpu_base->clock_was_set = 0;
+                clock_was_set();
+        }
        hrtimer_peek_ahead_timers();
 }
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8b53db38a279..238025f5472e 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -27,7 +27,6 @@
 #include <linux/syscore_ops.h>
 #include <linux/ctype.h>
 #include <linux/genhd.h>
-#include <scsi/scsi_scan.h>
 #include "power.h"
@@ -748,13 +747,6 @@ static int software_resume(void)
                        async_synchronize_full();
                }
-                /*
-                 * We can't depend on SCSI devices being available after loading
-                 * one of their modules until scsi_complete_async_scans() is
-                 * called and the resume device usually is a SCSI one.
-                 */
-                scsi_complete_async_scans();
                swsusp_resume_device = name_to_dev_t(resume_file);
                if (!swsusp_resume_device) {
                        error = -ENODEV;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 91b0fd021a95..4ed81e74f86f 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -24,7 +24,6 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
-#include <scsi/scsi_scan.h>
 #include <asm/uaccess.h>
@@ -84,7 +83,6 @@ static int snapshot_open(struct inode *inode, struct file *filp)
                 * appear.
                 */
                wait_for_device_probe();
-                scsi_complete_async_scans();
                data->swap = -1;
                data->mode = O_WRONLY;
diff --git a/kernel/printk.c b/kernel/printk.c
index dba18211685e..ac4bc9e79465 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -194,8 +194,10 @@ static int console_may_schedule;
 */
 enum log_flags {
-        LOG_DEFAULT = 0,
+        LOG_NOCONS      = 1,    /* already flushed, do not print to console */
-        LOG_NOCONS = 1,         /* already flushed, do not print to console */
+        LOG_NEWLINE     = 2,    /* text ended with a newline */
+        LOG_PREFIX      = 4,    /* text started with a prefix */
+        LOG_CONT        = 8,    /* text is a fragment of a continuation line */
 };
 struct log {
@@ -217,6 +219,8 @@ static DEFINE_RAW_SPINLOCK(logbuf_lock);
 /* the next printk record to read by syslog(READ) or /proc/kmsg */
 static u64 syslog_seq;
 static u32 syslog_idx;
+static enum log_flags syslog_prev;
+static size_t syslog_partial;
 /* index and sequence number of the first record stored in the buffer */
 static u64 log_first_seq;
@@ -430,20 +434,20 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
        ret = mutex_lock_interruptible(&user->lock);
        if (ret)
                return ret;
-        raw_spin_lock(&logbuf_lock);
+        raw_spin_lock_irq(&logbuf_lock);
        while (user->seq == log_next_seq) {
                if (file->f_flags & O_NONBLOCK) {
                        ret = -EAGAIN;
-                        raw_spin_unlock(&logbuf_lock);
+                        raw_spin_unlock_irq(&logbuf_lock);
                        goto out;
                }
-                raw_spin_unlock(&logbuf_lock);
+                raw_spin_unlock_irq(&logbuf_lock);
                ret = wait_event_interruptible(log_wait,
                                               user->seq != log_next_seq);
                if (ret)
                        goto out;
-                raw_spin_lock(&logbuf_lock);
+                raw_spin_lock_irq(&logbuf_lock);
        }
        if (user->seq < log_first_seq) {
@@ -451,7 +455,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
                user->idx = log_first_idx;
                user->seq = log_first_seq;
                ret = -EPIPE;
-                raw_spin_unlock(&logbuf_lock);
+                raw_spin_unlock_irq(&logbuf_lock);
                goto out;
        }
@@ -465,7 +469,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
        for (i = 0; i < msg->text_len; i++) {
                unsigned char c = log_text(msg)[i];
-                if (c < ' ' || c >= 128)
+                if (c < ' ' || c >= 127 || c == '\\')
                        len += sprintf(user->buf + len, "\\x%02x", c);
                else
                        user->buf[len++] = c;
@@ -489,7 +493,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
                                continue;
                        }
-                        if (c < ' ' || c >= 128) {
+                        if (c < ' ' || c >= 127 || c == '\\') {
                                len += sprintf(user->buf + len, "\\x%02x", c);
                                continue;
                        }
@@ -501,7 +505,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
        user->idx = log_next(user->idx);
        user->seq++;
-        raw_spin_unlock(&logbuf_lock);
+        raw_spin_unlock_irq(&logbuf_lock);
        if (len > count) {
                ret = -EINVAL;
@@ -528,7 +532,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
        if (offset)
                return -ESPIPE;
-        raw_spin_lock(&logbuf_lock);
+        raw_spin_lock_irq(&logbuf_lock);
        switch (whence) {
        case SEEK_SET:
                /* the first record */
@@ -552,7 +556,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
        default:
                ret = -EINVAL;
        }
-        raw_spin_unlock(&logbuf_lock);
+        raw_spin_unlock_irq(&logbuf_lock);
        return ret;
 }
@@ -566,14 +570,14 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
        poll_wait(file, &log_wait, wait);
-        raw_spin_lock(&logbuf_lock);
+        raw_spin_lock_irq(&logbuf_lock);
        if (user->seq < log_next_seq) {
                /* return error when data has vanished underneath us */
                if (user->seq < log_first_seq)
                        ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
                ret = POLLIN|POLLRDNORM;
        }
-        raw_spin_unlock(&logbuf_lock);
+        raw_spin_unlock_irq(&logbuf_lock);
        return ret;
 }
@@ -597,10 +601,10 @@ static int devkmsg_open(struct inode *inode, struct file *file)
        mutex_init(&user->lock);
-        raw_spin_lock(&logbuf_lock);
+        raw_spin_lock_irq(&logbuf_lock);
        user->idx = log_first_idx;
        user->seq = log_first_seq;
-        raw_spin_unlock(&logbuf_lock);
+        raw_spin_unlock_irq(&logbuf_lock);
        file->private_data = user;
        return 0;
@@ -818,15 +822,18 @@ static size_t print_time(u64 ts, char *buf)
 static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
 {
        size_t len = 0;
+        unsigned int prefix = (msg->facility << 3) | msg->level;
        if (syslog) {
                if (buf) {
-                        len += sprintf(buf, "<%u>", msg->level);
+                        len += sprintf(buf, "<%u>", prefix);
                } else {
                        len += 3;
-                        if (msg->level > 9)
+                        if (prefix > 999)
-                                len++;
+                                len += 3;
-                        if (msg->level > 99)
+                        else if (prefix > 99)
+                                len += 2;
+                        else if (prefix > 9)
                                len++;
                }
        }
@@ -835,13 +842,26 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
        return len;
 }
-static size_t msg_print_text(const struct log *msg, bool syslog,
+static size_t msg_print_text(const struct log *msg, enum log_flags prev,
-                             char *buf, size_t size)
+                             bool syslog, char *buf, size_t size)
 {
        const char *text = log_text(msg);
        size_t text_size = msg->text_len;
+        bool prefix = true;
+        bool newline = true;
        size_t len = 0;
+        if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))
+                prefix = false;
+        if (msg->flags & LOG_CONT) {
+                if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE))
+                        prefix = false;
+                if (!(msg->flags & LOG_NEWLINE))
+                        newline = false;
+        }
        do {
                const char *next = memchr(text, '\n', text_size);
                size_t text_len;
@@ -859,16 +879,22 @@ static size_t msg_print_text(const struct log *msg, bool syslog,
                            text_len + 1>= size - len)
                                break;
-                        len += print_prefix(msg, syslog, buf + len);
+                        if (prefix)
+                                len += print_prefix(msg, syslog, buf + len);
                        memcpy(buf + len, text, text_len);
                        len += text_len;
-                        buf[len++] = '\n';
+                        if (next || newline)
+                                buf[len++] = '\n';
                } else {
                        /* SYSLOG_ACTION_* buffer size only calculation */
-                        len += print_prefix(msg, syslog, NULL);
+                        if (prefix)
-                        len += text_len + 1;
+                                len += print_prefix(msg, syslog, NULL);
+                        len += text_len;
+                        if (next || newline)
+                                len++;
                }
+                prefix = true;
                text = next;
        } while (text);
@@ -887,22 +913,35 @@ static int syslog_print(char __user *buf, int size)
        while (size > 0) {
                size_t n;
+                size_t skip;
                raw_spin_lock_irq(&logbuf_lock);
                if (syslog_seq < log_first_seq) {
                        /* messages are gone, move to first one */
                        syslog_seq = log_first_seq;
                        syslog_idx = log_first_idx;
+                        syslog_prev = 0;
+                        syslog_partial = 0;
                }
                if (syslog_seq == log_next_seq) {
                        raw_spin_unlock_irq(&logbuf_lock);
                        break;
                }
+                skip = syslog_partial;
                msg = log_from_idx(syslog_idx);
-                n = msg_print_text(msg, true, text, LOG_LINE_MAX);
+                n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX);
-                if (n <= size) {
+                if (n - syslog_partial <= size) {
+                        /* message fits into buffer, move forward */
                        syslog_idx = log_next(syslog_idx);
                        syslog_seq++;
+                        syslog_prev = msg->flags;
+                        n -= syslog_partial;
+                        syslog_partial = 0;
+                } else if (!len){
+                        /* partial read(), remember position */
+                        n = size;
+                        syslog_partial += n;
                } else
                        n = 0;
                raw_spin_unlock_irq(&logbuf_lock);
@@ -910,17 +949,15 @@ static int syslog_print(char __user *buf, int size)
                if (!n)
                        break;
-                len += n;
+                if (copy_to_user(buf, text + skip, n)) {
-                size -= n;
-                buf += n;
-                n = copy_to_user(buf - n, text, n);
-                if (n) {
-                        len -= n;
                        if (!len)
                                len = -EFAULT;
                        break;
                }
+                len += n;
+                size -= n;
+                buf += n;
        }
        kfree(text);
@@ -941,6 +978,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
                u64 next_seq;
                u64 seq;
                u32 idx;
+                enum log_flags prev;
                if (clear_seq < log_first_seq) {
                        /* messages are gone, move to first available one */
@@ -954,10 +992,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
                 */
                seq = clear_seq;
                idx = clear_idx;
+                prev = 0;
                while (seq < log_next_seq) {
                        struct log *msg = log_from_idx(idx);
-                        len += msg_print_text(msg, true, NULL, 0);
+                        len += msg_print_text(msg, prev, true, NULL, 0);
                        idx = log_next(idx);
                        seq++;
                }
@@ -965,10 +1004,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
                /* move first record forward until length fits into the buffer */
                seq = clear_seq;
                idx = clear_idx;
+                prev = 0;
                while (len > size && seq < log_next_seq) {
                        struct log *msg = log_from_idx(idx);
-                        len -= msg_print_text(msg, true, NULL, 0);
+                        len -= msg_print_text(msg, prev, true, NULL, 0);
                        idx = log_next(idx);
                        seq++;
                }
@@ -977,17 +1017,19 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
                next_seq = log_next_seq;
                len = 0;
+                prev = 0;
                while (len >= 0 && seq < next_seq) {
                        struct log *msg = log_from_idx(idx);
                        int textlen;
-                        textlen = msg_print_text(msg, true, text, LOG_LINE_MAX);
+                        textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX);
                        if (textlen < 0) {
                                len = textlen;
                                break;
                        }
                        idx = log_next(idx);
                        seq++;
+                        prev = msg->flags;
                        raw_spin_unlock_irq(&logbuf_lock);
                        if (copy_to_user(buf + len, text, textlen))
@@ -1000,6 +1042,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
                                /* messages are gone, move to next one */
                                seq = log_first_seq;
                                idx = log_first_idx;
+                                prev = 0;
                        }
                }
        }
@@ -1018,7 +1061,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 {
        bool clear = false;
        static int saved_console_loglevel = -1;
-        static DEFINE_MUTEX(syslog_mutex);
        int error;
        error = check_syslog_permissions(type, from_file);
@@ -1045,17 +1087,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
                        error = -EFAULT;
                        goto out;
                }
-                error = mutex_lock_interruptible(&syslog_mutex);
-                if (error)
-                        goto out;
                error = wait_event_interruptible(log_wait,
                                                 syslog_seq != log_next_seq);
-                if (error) {
+                if (error)
-                        mutex_unlock(&syslog_mutex);
                        goto out;
-                }
                error = syslog_print(buf, len);
-                mutex_unlock(&syslog_mutex);
                break;
        /* Read/clear last kernel messages */
        case SYSLOG_ACTION_READ_CLEAR:
@@ -1111,6 +1147,8 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
                        /* messages are gone, move to first one */
                        syslog_seq = log_first_seq;
                        syslog_idx = log_first_idx;
+                        syslog_prev = 0;
+                        syslog_partial = 0;
                }
                if (from_file) {
                        /*
@@ -1120,19 +1158,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
                         */
                        error = log_next_idx - syslog_idx;
                } else {
-                        u64 seq;
+                        u64 seq = syslog_seq;
-                        u32 idx;
+                        u32 idx = syslog_idx;
+                        enum log_flags prev = syslog_prev;
                        error = 0;
-                        seq = syslog_seq;
-                        idx = syslog_idx;
                        while (seq < log_next_seq) {
                                struct log *msg = log_from_idx(idx);
-                                error += msg_print_text(msg, true, NULL, 0);
+                                error += msg_print_text(msg, prev, true, NULL, 0);
                                idx = log_next(idx);
                                seq++;
+                                prev = msg->flags;
                        }
+                        error -= syslog_partial;
                }
                raw_spin_unlock_irq(&logbuf_lock);
                break;
@@ -1153,21 +1192,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
        return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
 }
-#ifdef  CONFIG_KGDB_KDB
-/* kdb dmesg command needs access to the syslog buffer.  do_syslog()
- * uses locks so it cannot be used during debugging.  Just tell kdb
- * where the start and end of the physical and logical logs are.  This
- * is equivalent to do_syslog(3).
- */
-void kdb_syslog_data(char *syslog_data[4])
-{
-        syslog_data[0] = log_buf;
-        syslog_data[1] = log_buf + log_buf_len;
-        syslog_data[2] = log_buf + log_first_idx;
-        syslog_data[3] = log_buf + log_next_idx;
-}
-#endif  /* CONFIG_KGDB_KDB */
 static bool __read_mostly ignore_loglevel;
 static int __init ignore_loglevel_setup(char *str)
@@ -1400,10 +1424,9 @@ asmlinkage int vprintk_emit(int facility, int level,
        static char textbuf[LOG_LINE_MAX];
        char *text = textbuf;
        size_t text_len;
+        enum log_flags lflags = 0;
        unsigned long flags;
        int this_cpu;
-        bool newline = false;
-        bool prefix = false;
        int printed_len = 0;
        boot_delay_msec();
@@ -1442,7 +1465,7 @@ asmlinkage int vprintk_emit(int facility, int level,
                recursion_bug = 0;
                printed_len += strlen(recursion_msg);
                /* emit KERN_CRIT message */
-                log_store(0, 2, LOG_DEFAULT, 0,
+                log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
                          NULL, 0, recursion_msg, printed_len);
        }
@@ -1455,7 +1478,7 @@ asmlinkage int vprintk_emit(int facility, int level,
        /* mark and strip a trailing newline */
        if (text_len && text[text_len-1] == '\n') {
                text_len--;
-                newline = true;
+                lflags |= LOG_NEWLINE;
        }
        /* strip syslog prefix and extract log level or control flags */
@@ -1465,7 +1488,7 @@ asmlinkage int vprintk_emit(int facility, int level,
                        if (level == -1)
                                level = text[1] - '0';
                case 'd':       /* KERN_DEFAULT */
-                        prefix = true;
+                        lflags |= LOG_PREFIX;
                case 'c':       /* KERN_CONT */
                        text += 3;
                        text_len -= 3;
@@ -1475,22 +1498,20 @@ asmlinkage int vprintk_emit(int facility, int level,
        if (level == -1)
                level = default_message_loglevel;
-        if (dict) {
+        if (dict)
-                prefix = true;
+                lflags |= LOG_PREFIX|LOG_NEWLINE;
-                newline = true;
-        }
-        if (!newline) {
+        if (!(lflags & LOG_NEWLINE)) {
                /*
                 * Flush the conflicting buffer. An earlier newline was missing,
                 * or another task also prints continuation lines.
                 */
-                if (cont.len && (prefix || cont.owner != current))
+                if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
                        cont_flush();
                /* buffer line if possible, otherwise store it right away */
                if (!cont_add(facility, level, text, text_len))
-                        log_store(facility, level, LOG_DEFAULT, 0,
+                        log_store(facility, level, lflags | LOG_CONT, 0,
                                  dict, dictlen, text, text_len);
        } else {
                bool stored = false;
@@ -1502,13 +1523,13 @@ asmlinkage int vprintk_emit(int facility, int level,
                 * flush it out and store this line separately.
                 */
                if (cont.len && cont.owner == current) {
-                        if (!prefix)
+                        if (!(lflags & LOG_PREFIX))
                                stored = cont_add(facility, level, text, text_len);
                        cont_flush();
                }
                if (!stored)
-                        log_store(facility, level, LOG_DEFAULT, 0,
+                        log_store(facility, level, lflags, 0,
                                  dict, dictlen, text, text_len);
        }
        printed_len += text_len;
@@ -1607,8 +1628,8 @@ static struct cont {
 static struct log *log_from_idx(u32 idx) { return NULL; }
 static u32 log_next(u32 idx) { return 0; }
 static void call_console_drivers(int level, const char *text, size_t len) {}
-static size_t msg_print_text(const struct log *msg, bool syslog,
+static size_t msg_print_text(const struct log *msg, enum log_flags prev,
-                             char *buf, size_t size) { return 0; }
+                             bool syslog, char *buf, size_t size) { return 0; }
 static size_t cont_print_text(char *text, size_t size) { return 0; }
 #endif /* CONFIG_PRINTK */
@@ -1884,6 +1905,7 @@ void wake_up_klogd(void)
 /* the next printk record to write to the console */
 static u64 console_seq;
 static u32 console_idx;
+static enum log_flags console_prev;
 /**
 * console_unlock - unlock the console system
@@ -1944,6 +1966,7 @@ again:
                        /* messages are gone, move to first one */
                        console_seq = log_first_seq;
                        console_idx = log_first_idx;
+                        console_prev = 0;
                }
 skip:
                if (console_seq == log_next_seq)
@@ -1957,14 +1980,21 @@ skip:
                         */
                        console_idx = log_next(console_idx);
                        console_seq++;
+                        /*
+                         * We will get here again when we register a new
+                         * CON_PRINTBUFFER console. Clear the flag so we
+                         * will properly dump everything later.
+                         */
+                        msg->flags &= ~LOG_NOCONS;
                        goto skip;
                }
                level = msg->level;
-                len = msg_print_text(msg, false, text, sizeof(text));
+                len = msg_print_text(msg, console_prev, false,
+                                     text, sizeof(text));
                console_idx = log_next(console_idx);
                console_seq++;
+                console_prev = msg->flags;
                raw_spin_unlock(&logbuf_lock);
                stop_critical_timings();        /* don't trace print latency */
@@ -2227,6 +2257,7 @@ void register_console(struct console *newcon)
                raw_spin_lock_irqsave(&logbuf_lock, flags);
                console_seq = syslog_seq;
                console_idx = syslog_idx;
+                console_prev = syslog_prev;
                raw_spin_unlock_irqrestore(&logbuf_lock, flags);
                /*
                 * We're about to replay the log buffer.  Only do this to the
@@ -2479,7 +2510,7 @@ void kmsg_dump(enum kmsg_dump_reason reason)
 }
 /**
- * kmsg_dump_get_line - retrieve one kmsg log line
+ * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
 * @dumper: registered kmsg dumper
 * @syslog: include the "<4>" prefixes
 * @line: buffer to copy the line to
@@ -2494,11 +2525,12 @@ void kmsg_dump(enum kmsg_dump_reason reason)
 *
 * A return value of FALSE indicates that there are no more records to
 * read.
+ *
+ * The function is similar to kmsg_dump_get_line(), but grabs no locks.
 */
-bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
+bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
-                        char *line, size_t size, size_t *len)
+                               char *line, size_t size, size_t *len)
 {
-        unsigned long flags;
        struct log *msg;
        size_t l = 0;
        bool ret = false;
@@ -2506,7 +2538,6 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
        if (!dumper->active)
                goto out;
-        raw_spin_lock_irqsave(&logbuf_lock, flags);
        if (dumper->cur_seq < log_first_seq) {
                /* messages are gone, move to first available one */
                dumper->cur_seq = log_first_seq;
@@ -2514,24 +2545,50 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
        }
        /* last entry */
-        if (dumper->cur_seq >= log_next_seq) {
+        if (dumper->cur_seq >= log_next_seq)
-                raw_spin_unlock_irqrestore(&logbuf_lock, flags);
                goto out;
-        }
        msg = log_from_idx(dumper->cur_idx);
-        l = msg_print_text(msg, syslog,
+        l = msg_print_text(msg, 0, syslog, line, size);
-                              line, size);
        dumper->cur_idx = log_next(dumper->cur_idx);
        dumper->cur_seq++;
        ret = true;
-        raw_spin_unlock_irqrestore(&logbuf_lock, flags);
 out:
        if (len)
                *len = l;
        return ret;
 }
+/**
+ * kmsg_dump_get_line - retrieve one kmsg log line
+ * @dumper: registered kmsg dumper
+ * @syslog: include the "<4>" prefixes
+ * @line: buffer to copy the line to
+ * @size: maximum size of the buffer
+ * @len: length of line placed into buffer; may be NULL, in which case
+ *       the length is not reported
+ *
+ * Start at the beginning of the kmsg buffer, with the oldest kmsg
+ * record, and copy one record into the provided buffer.
+ *
+ * Consecutive calls will return the next available record moving
+ * towards the end of the buffer with the youngest messages.
+ *
+ * A return value of FALSE indicates that there are no more records to
+ * read.
+ *
+ * This is the locking wrapper around kmsg_dump_get_line_nolock(): the
+ * lookup runs with logbuf_lock taken via raw_spin_lock_irqsave(), and
+ * the lock is released again before the result is returned.
+ */
+bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
+                        char *line, size_t size, size_t *len)
+{
+        unsigned long flags;    /* irq state saved by raw_spin_lock_irqsave() */
+        bool ret;
+        raw_spin_lock_irqsave(&logbuf_lock, flags);
+        ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
+        raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+        return ret;
+}
 EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
 /**
@@ -2561,6 +2618,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
        u32 idx;
        u64 next_seq;
        u32 next_idx;
+        enum log_flags prev;
        size_t l = 0;
        bool ret = false;
@@ -2583,23 +2641,27 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
        /* calculate length of entire buffer */
        seq = dumper->cur_seq;
        idx = dumper->cur_idx;
+        prev = 0;
        while (seq < dumper->next_seq) {
                struct log *msg = log_from_idx(idx);
-                l += msg_print_text(msg, true, NULL, 0);
+                l += msg_print_text(msg, prev, true, NULL, 0);
                idx = log_next(idx);
                seq++;
+                prev = msg->flags;
        }
        /* move first record forward until length fits into the buffer */
        seq = dumper->cur_seq;
        idx = dumper->cur_idx;
+        prev = 0;
        while (l > size && seq < dumper->next_seq) {
                struct log *msg = log_from_idx(idx);
-                l -= msg_print_text(msg, true, NULL, 0);
+                l -= msg_print_text(msg, prev, true, NULL, 0);
                idx = log_next(idx);
                seq++;
+                prev = msg->flags;
        }
        /* last message in next iteration */
@@ -2607,14 +2669,14 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
        next_idx = idx;
        l = 0;
+        prev = 0;
        while (seq < dumper->next_seq) {
                struct log *msg = log_from_idx(idx);
-                l += msg_print_text(msg, syslog,
+                l += msg_print_text(msg, prev, syslog, buf + l, size - l);
-                                    buf + l, size - l);
                idx = log_next(idx);
                seq++;
+                prev = msg->flags;
        }
        dumper->next_seq = next_seq;
@@ -2629,6 +2691,24 @@ out:
 EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
 /**
+ * kmsg_dump_rewind_nolock - reset the iterator (unlocked version)
+ * @dumper: registered kmsg dumper
+ *
+ * Reset the dumper's iterator so that kmsg_dump_get_line() and
+ * kmsg_dump_get_buffer() can be called again and used multiple
+ * times within the same dumper.dump() callback.
+ *
+ * The function is similar to kmsg_dump_rewind(), but grabs no locks.
+ * kmsg_dump_rewind() is this same reset performed with logbuf_lock held.
+ */
+void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
+{
+        dumper->cur_seq = clear_seq;            /* current position := clear_seq/clear_idx */
+        dumper->cur_idx = clear_idx;
+        dumper->next_seq = log_next_seq;        /* next_* := current log_next_seq/log_next_idx */
+        dumper->next_idx = log_next_idx;
+}
+/**
 * kmsg_dump_rewind - reset the iterator
 * @dumper: registered kmsg dumper
 *
@@ -2641,10 +2721,7 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper)
        unsigned long flags;
        raw_spin_lock_irqsave(&logbuf_lock, flags);
-        dumper->cur_seq = clear_seq;
+        kmsg_dump_rewind_nolock(dumper);
-        dumper->cur_idx = clear_idx;
-        dumper->next_seq = log_next_seq;
-        dumper->next_idx = log_next_idx;
        raw_spin_unlock_irqrestore(&logbuf_lock, flags);
 }
 EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 95cba41ce1e9..4e6a61b15e86 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -54,6 +54,50 @@
 #ifdef CONFIG_PREEMPT_RCU
 /*
+ * Preemptible RCU implementation for rcu_read_lock().
+ * Just increment ->rcu_read_lock_nesting, shared state will be updated
+ * if we block.
+ *
+ * Only the current task's nesting counter is modified here; the
+ * matching decrement is done by __rcu_read_unlock().
+ */
+void __rcu_read_lock(void)
+{
+        current->rcu_read_lock_nesting++;
+        barrier();  /* critical section after entry code. */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+/*
+ * Preemptible RCU implementation for rcu_read_unlock().
+ * Decrement ->rcu_read_lock_nesting.  If the result is zero (outermost
+ * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
+ * invoke rcu_read_unlock_special() to clean up after a context switch
+ * in an RCU read-side critical section and other special cases.
+ *
+ * On the outermost unlock the counter is not simply decremented: it is
+ * first set to INT_MIN, ->rcu_read_unlock_special is checked (and
+ * rcu_read_unlock_special() run if it is set), and only then is the
+ * counter set to zero.  NOTE(review): presumably the INT_MIN value keeps
+ * a nested rcu_read_lock()/rcu_read_unlock() pair that runs inside this
+ * window from taking the outermost path again -- confirm.
+ *
+ * With CONFIG_PROVE_LOCKING, warn once if the counter is left negative
+ * but greater than INT_MIN / 2.
+ */
+void __rcu_read_unlock(void)
+{
+        struct task_struct *t = current;
+        if (t->rcu_read_lock_nesting != 1) {
+                --t->rcu_read_lock_nesting;     /* not the outermost unlock */
+        } else {
+                barrier();  /* critical section before exit code. */
+                t->rcu_read_lock_nesting = INT_MIN;
+                barrier();  /* assign before ->rcu_read_unlock_special load */
+                if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
+                        rcu_read_unlock_special(t);
+                barrier();  /* ->rcu_read_unlock_special load before assign */
+                t->rcu_read_lock_nesting = 0;
+        }
+#ifdef CONFIG_PROVE_LOCKING
+        {
+                int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
+                WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
+        }
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+/*
 * Check for a task exiting while in a preemptible-RCU read-side
 * critical section, clean up if so.  No need to issue warnings,
 * as debug_check_no_locks_held() already does this if lockdep
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 37a5444204d2..547b1fe5b052 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -172,7 +172,7 @@ void rcu_irq_enter(void)
        local_irq_restore(flags);
 }
-#ifdef CONFIG_PROVE_RCU
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
 /*
 * Test whether RCU thinks that the current CPU is idle.
@@ -183,7 +183,7 @@ int rcu_is_cpu_idle(void)
 }
 EXPORT_SYMBOL(rcu_is_cpu_idle);
-#endif /* #ifdef CONFIG_PROVE_RCU */
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 /*
 * Test whether the current CPU was interrupted from idle.  Nested
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index fc31a2d65100..918fd1e8509c 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -132,7 +132,6 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
        RCU_TRACE(.rcb.name = "rcu_preempt")
 };
-static void rcu_read_unlock_special(struct task_struct *t);
 static int rcu_preempted_readers_exp(void);
 static void rcu_report_exp_done(void);
@@ -351,8 +350,9 @@ static int rcu_initiate_boost(void)
                        rcu_preempt_ctrlblk.boost_tasks =
                                rcu_preempt_ctrlblk.gp_tasks;
                invoke_rcu_callbacks();
-        } else
+        } else {
                RCU_TRACE(rcu_initiate_boost_trace());
+        }
        return 1;
 }
@@ -527,23 +527,11 @@ void rcu_preempt_note_context_switch(void)
 }
 /*
- * Tiny-preemptible RCU implementation for rcu_read_lock().
- * Just increment ->rcu_read_lock_nesting, shared state will be updated
- * if we block.
- */
-void __rcu_read_lock(void)
-{
-        current->rcu_read_lock_nesting++;
-        barrier();  /* needed if we ever invoke rcu_read_lock in rcutiny.c */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_lock);
-/*
 * Handle special cases during rcu_read_unlock(), such as needing to
 * notify RCU core processing or task having blocked during the RCU
 * read-side critical section.
 */
-static noinline void rcu_read_unlock_special(struct task_struct *t)
+void rcu_read_unlock_special(struct task_struct *t)
 {
        int empty;
        int empty_exp;
@@ -627,38 +615,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
 }
 /*
- * Tiny-preemptible RCU implementation for rcu_read_unlock().
- * Decrement ->rcu_read_lock_nesting.  If the result is zero (outermost
- * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
- * invoke rcu_read_unlock_special() to clean up after a context switch
- * in an RCU read-side critical section and other special cases.
- */
-void __rcu_read_unlock(void)
-{
-        struct task_struct *t = current;
-        barrier();  /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
-        if (t->rcu_read_lock_nesting != 1)
-                --t->rcu_read_lock_nesting;
-        else {
-                t->rcu_read_lock_nesting = INT_MIN;
-                barrier();  /* assign before ->rcu_read_unlock_special load */
-                if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
-                        rcu_read_unlock_special(t);
-                barrier();  /* ->rcu_read_unlock_special load before assign */
-                t->rcu_read_lock_nesting = 0;
-        }
-#ifdef CONFIG_PROVE_LOCKING
-        {
-                int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
-                WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
-        }
-#endif /* #ifdef CONFIG_PROVE_LOCKING */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_unlock);
-/*
 * Check for a quiescent state from the current CPU.  When a task blocks,
 * the task is recorded in the rcu_preempt_ctrlblk structure, which is
 * checked elsewhere.  This is called from the scheduling-clock interrupt.
@@ -823,9 +779,9 @@ void synchronize_rcu_expedited(void)
                rpcp->exp_tasks = NULL;
        /* Wait for tail of ->blkd_tasks list to drain. */
-        if (!rcu_preempted_readers_exp())
+        if (!rcu_preempted_readers_exp()) {
                local_irq_restore(flags);
-        else {
+        } else {
                rcu_initiate_boost();
                local_irq_restore(flags);
                wait_event(sync_rcu_preempt_exp_wq,
@@ -846,8 +802,6 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
 */
 int rcu_preempt_needs_cpu(void)
 {
-        if (!rcu_preempt_running_reader())
-                rcu_preempt_cpu_qs();
        return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
 }
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e66b34ab7555..25b15033c61f 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -49,8 +49,7 @@
 #include <asm/byteorder.h>
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
-              "Josh Triplett <josh@freedesktop.org>");
 static int nreaders = -1;       /* # reader threads, defaults to 2*ncpus */
 static int nfakewriters = 4;    /* # fake writer threads */
@@ -206,6 +205,7 @@ static unsigned long boost_starttime;	/* jiffies of next boost test start. */
 DEFINE_MUTEX(boost_mutex);              /* protect setting boost_starttime */
                                        /*  and boost task create/destroy. */
 static atomic_t barrier_cbs_count;      /* Barrier callbacks registered. */
+static bool barrier_phase;              /* Test phase. */
 static atomic_t barrier_cbs_invoked;    /* Barrier callbacks invoked. */
 static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
 static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
@@ -407,8 +407,9 @@ rcu_torture_cb(struct rcu_head *p)
        if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
                rp->rtort_mbtest = 0;
                rcu_torture_free(rp);
-        } else
+        } else {
                cur_ops->deferred_free(rp);
+        }
 }
 static int rcu_no_completed(void)
@@ -635,6 +636,17 @@ static void srcu_torture_synchronize(void)
        synchronize_srcu(&srcu_ctl);
 }
+static void srcu_torture_call(struct rcu_head *head,
+                              void (*func)(struct rcu_head *head))
+{
+        call_srcu(&srcu_ctl, head, func);  /* srcu_ops ->call: pass head/func to call_srcu() on srcu_ctl */
+}
+static void srcu_torture_barrier(void)
+{
+        srcu_barrier(&srcu_ctl);  /* srcu_ops ->cb_barrier: srcu_barrier() on srcu_ctl */
+}
 static int srcu_torture_stats(char *page)
 {
        int cnt = 0;
@@ -661,8 +673,8 @@ static struct rcu_torture_ops srcu_ops = {
        .completed      = srcu_torture_completed,
        .deferred_free  = srcu_torture_deferred_free,
        .sync           = srcu_torture_synchronize,
-        .call           = NULL,
+        .call           = srcu_torture_call,
-        .cb_barrier     = NULL,
+        .cb_barrier     = srcu_torture_barrier,
        .stats          = srcu_torture_stats,
        .name           = "srcu"
 };
@@ -1013,7 +1025,11 @@ rcu_torture_fakewriter(void *arg)
        do {
                schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
                udelay(rcu_random(&rand) & 0x3ff);
-                cur_ops->sync();
+                if (cur_ops->cb_barrier != NULL &&
+                    rcu_random(&rand) % (nfakewriters * 8) == 0)
+                        cur_ops->cb_barrier();
+                else
+                        cur_ops->sync();
                rcu_stutter_wait("rcu_torture_fakewriter");
        } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
@@ -1183,27 +1199,27 @@ rcu_torture_printk(char *page)
        }
        cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
        cnt += sprintf(&page[cnt],
-                       "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
+                       "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
-                       "rtmbe: %d rtbke: %ld rtbre: %ld "
-                       "rtbf: %ld rtb: %ld nt: %ld "
-                       "onoff: %ld/%ld:%ld/%ld "
-                       "barrier: %ld/%ld:%ld",
                       rcu_torture_current,
                       rcu_torture_current_version,
                       list_empty(&rcu_torture_freelist),
                       atomic_read(&n_rcu_torture_alloc),
                       atomic_read(&n_rcu_torture_alloc_fail),
-                       atomic_read(&n_rcu_torture_free),
+                       atomic_read(&n_rcu_torture_free));
+        cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ",
                       atomic_read(&n_rcu_torture_mberror),
                       n_rcu_torture_boost_ktrerror,
-                       n_rcu_torture_boost_rterror,
+                       n_rcu_torture_boost_rterror);
+        cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ",
                       n_rcu_torture_boost_failure,
                       n_rcu_torture_boosts,
-                       n_rcu_torture_timers,
+                       n_rcu_torture_timers);
+        cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ",
                       n_online_successes,
                       n_online_attempts,
                       n_offline_successes,
-                       n_offline_attempts,
+                       n_offline_attempts);
+        cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld",
                       n_barrier_successes,
                       n_barrier_attempts,
                       n_rcu_torture_barrier_error);
@@ -1445,8 +1461,7 @@ rcu_torture_shutdown(void *arg)
                delta = shutdown_time - jiffies_snap;
                if (verbose)
                        printk(KERN_ALERT "%s" TORTURE_FLAG
-                               "rcu_torture_shutdown task: %lu "
+                               "rcu_torture_shutdown task: %lu jiffies remaining\n",
-                               "jiffies remaining\n",
                               torture_type, delta);
                schedule_timeout_interruptible(delta);
                jiffies_snap = ACCESS_ONCE(jiffies);
@@ -1498,8 +1513,7 @@ rcu_torture_onoff(void *arg)
                        if (cpu_down(cpu) == 0) {
                                if (verbose)
                                        printk(KERN_ALERT "%s" TORTURE_FLAG
-                                               "rcu_torture_onoff task: "
+                                               "rcu_torture_onoff task: offlined %d\n",
-                                               "offlined %d\n",
                                               torture_type, cpu);
                                n_offline_successes++;
                        }
@@ -1512,8 +1526,7 @@ rcu_torture_onoff(void *arg)
                        if (cpu_up(cpu) == 0) {
                                if (verbose)
                                        printk(KERN_ALERT "%s" TORTURE_FLAG
-                                               "rcu_torture_onoff task: "
+                                               "rcu_torture_onoff task: onlined %d\n",
-                                               "onlined %d\n",
                                               torture_type, cpu);
                                n_online_successes++;
                        }
@@ -1631,6 +1644,7 @@ void rcu_torture_barrier_cbf(struct rcu_head *rcu)
 static int rcu_torture_barrier_cbs(void *arg)
 {
        long myid = (long)arg;
+        bool lastphase = 0;
        struct rcu_head rcu;
        init_rcu_head_on_stack(&rcu);
@@ -1638,9 +1652,11 @@ static int rcu_torture_barrier_cbs(void *arg)
        set_user_nice(current, 19);
        do {
                wait_event(barrier_cbs_wq[myid],
-                           atomic_read(&barrier_cbs_count) == n_barrier_cbs ||
+                           barrier_phase != lastphase ||
                           kthread_should_stop() ||
                           fullstop != FULLSTOP_DONTSTOP);
+                lastphase = barrier_phase;
+                smp_mb(); /* ensure barrier_phase load before ->call(). */
                if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
                        break;
                cur_ops->call(&rcu, rcu_torture_barrier_cbf);
@@ -1665,7 +1681,8 @@ static int rcu_torture_barrier(void *arg)
        do {
                atomic_set(&barrier_cbs_invoked, 0);
                atomic_set(&barrier_cbs_count, n_barrier_cbs);
-                /* wake_up() path contains the required barriers. */
+                smp_mb(); /* Ensure barrier_phase after prior assignments. */
+                barrier_phase = !barrier_phase;
                for (i = 0; i < n_barrier_cbs; i++)
                        wake_up(&barrier_cbs_wq[i]);
                wait_event(barrier_wq,
@@ -1684,7 +1701,7 @@ static int rcu_torture_barrier(void *arg)
                schedule_timeout_interruptible(HZ / 10);
        } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
        VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping");
-        rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
+        rcutorture_shutdown_absorb("rcu_torture_barrier");
        while (!kthread_should_stop())
                schedule_timeout_interruptible(1);
        return 0;
@@ -1908,8 +1925,8 @@ rcu_torture_init(void)
        static struct rcu_torture_ops *torture_ops[] =
                { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
                  &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
-                  &srcu_ops, &srcu_sync_ops, &srcu_raw_ops,
+                  &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops,
-                  &srcu_raw_sync_ops, &srcu_expedited_ops,
+                  &srcu_raw_ops, &srcu_raw_sync_ops,
                  &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
        mutex_lock(&fullstop_mutex);
@@ -1931,8 +1948,7 @@ rcu_torture_init(void)
                return -EINVAL;
        }
        if (cur_ops->fqs == NULL && fqs_duration != 0) {
-                printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero "
+                printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
-                                  "fqs_duration, fqs disabled.\n");
                fqs_duration = 0;
        }
        if (cur_ops->init)
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 38ecdda3f55f..f280e542e3e9 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -60,36 +60,44 @@
 /* Data structures. */
-static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
+static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
-#define RCU_STATE_INITIALIZER(structname) { \
+#define RCU_STATE_INITIALIZER(sname, cr) { \
-        .level = { &structname##_state.node[0] }, \
+        .level = { &sname##_state.node[0] }, \
-        .levelcnt = { \
+        .call = cr, \
-                NUM_RCU_LVL_0,  /* root of hierarchy. */ \
-                NUM_RCU_LVL_1, \
-                NUM_RCU_LVL_2, \
-                NUM_RCU_LVL_3, \
-                NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
-        }, \
        .fqs_state = RCU_GP_IDLE, \
        .gpnum = -300, \
        .completed = -300, \
-        .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
+        .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \
-        .orphan_nxttail = &structname##_state.orphan_nxtlist, \
+        .orphan_nxttail = &sname##_state.orphan_nxtlist, \
-        .orphan_donetail = &structname##_state.orphan_donelist, \
+        .orphan_donetail = &sname##_state.orphan_donelist, \
-        .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
+        .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
-        .n_force_qs = 0, \
+        .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \
-        .n_force_qs_ngp = 0, \
+        .name = #sname, \
-        .name = #structname, \
 }
-struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched);
+struct rcu_state rcu_sched_state =
+        RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched);
 DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
-struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh);
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 static struct rcu_state *rcu_state;
+LIST_HEAD(rcu_struct_flavors);
+/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
+static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
+module_param(rcu_fanout_leaf, int, 0);
+int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
+static int num_rcu_lvl[] = {  /* Number of rcu_nodes at specified level. */
+        NUM_RCU_LVL_0,
+        NUM_RCU_LVL_1,
+        NUM_RCU_LVL_2,
+        NUM_RCU_LVL_3,
+        NUM_RCU_LVL_4,
+};
+int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
 /*
 * The rcu_scheduler_active variable transitions from zero to one just
@@ -147,13 +155,6 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
 unsigned long rcutorture_testseq;
 unsigned long rcutorture_vernum;
-/* State information for rcu_barrier() and friends. */
-static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
-static atomic_t rcu_barrier_cpu_count;
-static DEFINE_MUTEX(rcu_barrier_mutex);
-static struct completion rcu_barrier_completion;
 /*
 * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
 * permit this function to be invoked without holding the root rcu_node
@@ -201,6 +202,7 @@ void rcu_note_context_switch(int cpu)
 {
        trace_rcu_utilization("Start context switch");
        rcu_sched_qs(cpu);
+        rcu_preempt_note_context_switch(cpu);
        trace_rcu_utilization("End context switch");
 }
 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -357,7 +359,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
                struct task_struct *idle = idle_task(smp_processor_id());
                trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
-                ftrace_dump(DUMP_ALL);
+                ftrace_dump(DUMP_ORIG);
                WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
                          current->pid, current->comm,
                          idle->pid, idle->comm); /* must be idle task! */
@@ -467,7 +469,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
                trace_rcu_dyntick("Error on exit: not idle task",
                                  oldval, rdtp->dynticks_nesting);
-                ftrace_dump(DUMP_ALL);
+                ftrace_dump(DUMP_ORIG);
                WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
                          current->pid, current->comm,
                          idle->pid, idle->comm); /* must be idle task! */
@@ -584,8 +586,6 @@ void rcu_nmi_exit(void)
        WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
 }
-#ifdef CONFIG_PROVE_RCU
 /**
 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
 *
@@ -603,7 +603,7 @@ int rcu_is_cpu_idle(void)
 }
 EXPORT_SYMBOL(rcu_is_cpu_idle);
-#ifdef CONFIG_HOTPLUG_CPU
+#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
 /*
 * Is the current CPU online?  Disable preemption to avoid false positives
@@ -644,9 +644,7 @@ bool rcu_lockdep_current_cpu_online(void)
 }
 EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
-#endif /* #ifdef CONFIG_PROVE_RCU */
 /**
 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
@@ -732,7 +730,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
        int cpu;
        long delta;
        unsigned long flags;
-        int ndetected;
+        int ndetected = 0;
        struct rcu_node *rnp = rcu_get_root(rsp);
        /* Only let one CPU complain about others per time interval. */
@@ -773,7 +771,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
         */
        rnp = rcu_get_root(rsp);
        raw_spin_lock_irqsave(&rnp->lock, flags);
-        ndetected = rcu_print_task_stall(rnp);
+        ndetected += rcu_print_task_stall(rnp);
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
        print_cpu_stall_info_end();
@@ -859,9 +857,10 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
 */
 void rcu_cpu_stall_reset(void)
 {
-        rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+        struct rcu_state *rsp;
-        rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
-        rcu_preempt_stall_reset();
+        for_each_rcu_flavor(rsp)
+                rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
 }
 static struct notifier_block rcu_panic_block = {
@@ -893,8 +892,9 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
                if (rnp->qsmask & rdp->grpmask) {
                        rdp->qs_pending = 1;
                        rdp->passed_quiesce = 0;
-                } else
+                } else {
                        rdp->qs_pending = 0;
+                }
                zero_cpu_stall_ticks(rdp);
        }
 }
@@ -936,6 +936,18 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
 }
 /*
+ * Initialize the specified rcu_data structure's callback list to empty.
+ *
+ * Empty means ->nxtlist is NULL and each of the RCU_NEXT_SIZE entries
+ * of ->nxttail[] points back at ->nxtlist.  Only these list pointers
+ * are reset here; the ->qlen counters are not touched.
+ */
+static void init_callback_list(struct rcu_data *rdp)
+{
+        int i;
+        rdp->nxtlist = NULL;
+        for (i = 0; i < RCU_NEXT_SIZE; i++)
+                rdp->nxttail[i] = &rdp->nxtlist;
+}
+/*
 * Advance this CPU's callbacks, but only if the current grace period
 * has ended.  This may be called only from the CPU to whom the rdp
 * belongs.  In addition, the corresponding leaf rcu_node structure's
@@ -1327,8 +1339,6 @@ static void
 rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
                          struct rcu_node *rnp, struct rcu_data *rdp)
 {
-        int i;
        /*
         * Orphan the callbacks.  First adjust the counts.  This is safe
         * because ->onofflock excludes _rcu_barrier()'s adoption of
@@ -1339,7 +1349,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
                rsp->qlen += rdp->qlen;
                rdp->n_cbs_orphaned += rdp->qlen;
                rdp->qlen_lazy = 0;
-                rdp->qlen = 0;
+                ACCESS_ONCE(rdp->qlen) = 0;
        }
        /*
@@ -1368,9 +1378,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
        }
        /* Finally, initialize the rcu_data structure's list to empty.  */
-        rdp->nxtlist = NULL;
+        init_callback_list(rdp);
-        for (i = 0; i < RCU_NEXT_SIZE; i++)
-                rdp->nxttail[i] = &rdp->nxtlist;
 }
 /*
@@ -1504,6 +1512,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        if (need_report & RCU_OFL_TASKS_EXP_GP)
                rcu_report_exp_rnp(rsp, rnp, true);
+        WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
+                  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
+                  cpu, rdp->qlen, rdp->nxtlist);
 }
 #else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -1591,7 +1602,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
        }
        smp_mb(); /* List handling before counting for rcu_barrier(). */
        rdp->qlen_lazy -= count_lazy;
-        rdp->qlen -= count;
+        ACCESS_ONCE(rdp->qlen) -= count;
        rdp->n_cbs_invoked += count;
        /* Reinstate batch limit if we have worked down the excess. */
@@ -1604,6 +1615,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
                rdp->n_force_qs_snap = rsp->n_force_qs;
        } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
                rdp->qlen_last_fqs_check = rdp->qlen;
+        WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0));
        local_irq_restore(flags);
@@ -1744,8 +1756,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
                break; /* grace period idle or initializing, ignore. */
        case RCU_SAVE_DYNTICK:
-                if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
-                        break; /* So gcc recognizes the dead code. */
                raw_spin_unlock(&rnp->lock);  /* irqs remain disabled */
@@ -1787,9 +1797,10 @@ unlock_fqs_ret:
 * whom the rdp belongs.
 */
 static void
-__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
+__rcu_process_callbacks(struct rcu_state *rsp)
 {
        unsigned long flags;
+        struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
        WARN_ON_ONCE(rdp->beenonline == 0);
@@ -1825,11 +1836,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
 */
 static void rcu_process_callbacks(struct softirq_action *unused)
 {
+        struct rcu_state *rsp; /* cursor over the registered RCU flavors */
         trace_rcu_utilization("Start RCU core");
-        __rcu_process_callbacks(&rcu_sched_state,
+        for_each_rcu_flavor(rsp)
-                                &__get_cpu_var(rcu_sched_data));
+                __rcu_process_callbacks(rsp); /* callee looks up this CPU's rcu_data itself */
-        __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
-        rcu_preempt_process_callbacks();
         trace_rcu_utilization("End RCU core");
 }
@@ -1856,6 +1867,56 @@ static void invoke_rcu_core(void)
        raise_softirq(RCU_SOFTIRQ);
 }
+/*
+ * Handle any core-RCU processing required by a call_rcu() invocation.
+ *
+ * @rsp:   rcu_state handed in by __call_rcu().
+ * @rdp:   rcu_data whose ->qlen and ->nxttail[] are examined below;
+ *         __call_rcu() passes this_cpu_ptr(rsp->rda).
+ * @head:  the newly enqueued callback.
+ * @flags: interrupt state saved by the caller; tested with
+ *         irqs_disabled_flags() to decide whether to go any further.
+ *
+ * This function does not restore @flags; __call_rcu() does so with
+ * local_irq_restore() after this function returns.
+ */
+static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
+                            struct rcu_head *head, unsigned long flags)
+{
+        /*
+         * If called from an extended quiescent state, invoke the RCU
+         * core in order to force a re-evaluation of RCU's idleness.
+         */
+        if (rcu_is_cpu_idle() && cpu_online(smp_processor_id()))
+                invoke_rcu_core();
+        /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
+        if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
+                return;
+        /*
+         * Force the grace period if too many callbacks or too long waiting.
+         * Enforce hysteresis, and don't invoke force_quiescent_state()
+         * if some other CPU has recently done so.  Also, don't bother
+         * invoking force_quiescent_state() if the newly enqueued callback
+         * is the only one waiting for a grace period to complete.
+         */
+        if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
+                /* Are we ignoring a completed grace period? */
+                rcu_process_gp_end(rsp, rdp);
+                check_for_new_grace_period(rsp, rdp);
+                /* Start a new grace period if one not already started. */
+                if (!rcu_gp_in_progress(rsp)) {
+                        unsigned long nestflag;
+                        struct rcu_node *rnp_root = rcu_get_root(rsp);
+                        raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
+                        rcu_start_gp(rsp, nestflag);  /* rlses rnp_root->lock */
+                } else {
+                        /* Give the grace period a kick. */
+                        rdp->blimit = LONG_MAX;
+                        if (rsp->n_force_qs == rdp->n_force_qs_snap &&
+                            *rdp->nxttail[RCU_DONE_TAIL] != head)
+                                force_quiescent_state(rsp, 0);
+                        rdp->n_force_qs_snap = rsp->n_force_qs;
+                        rdp->qlen_last_fqs_check = rdp->qlen;
+                }
+        } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
+                force_quiescent_state(rsp, 1);
+}
 static void
 __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
           struct rcu_state *rsp, bool lazy)
@@ -1880,7 +1941,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
        rdp = this_cpu_ptr(rsp->rda);
        /* Add the callback to our list. */
-        rdp->qlen++;
+        ACCESS_ONCE(rdp->qlen)++;
        if (lazy)
                rdp->qlen_lazy++;
        else
@@ -1895,43 +1956,8 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
        else
                trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
-        /* If interrupts were disabled, don't dive into RCU core. */
+        /* Go handle any RCU core processing required. */
-        if (irqs_disabled_flags(flags)) {
+        __call_rcu_core(rsp, rdp, head, flags);
-                local_irq_restore(flags);
-                return;
-        }
-        /*
-         * Force the grace period if too many callbacks or too long waiting.
-         * Enforce hysteresis, and don't invoke force_quiescent_state()
-         * if some other CPU has recently done so.  Also, don't bother
-         * invoking force_quiescent_state() if the newly enqueued callback
-         * is the only one waiting for a grace period to complete.
-         */
-        if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
-                /* Are we ignoring a completed grace period? */
-                rcu_process_gp_end(rsp, rdp);
-                check_for_new_grace_period(rsp, rdp);
-                /* Start a new grace period if one not already started. */
-                if (!rcu_gp_in_progress(rsp)) {
-                        unsigned long nestflag;
-                        struct rcu_node *rnp_root = rcu_get_root(rsp);
-                        raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
-                        rcu_start_gp(rsp, nestflag);  /* rlses rnp_root->lock */
-                } else {
-                        /* Give the grace period a kick. */
-                        rdp->blimit = LONG_MAX;
-                        if (rsp->n_force_qs == rdp->n_force_qs_snap &&
-                            *rdp->nxttail[RCU_DONE_TAIL] != head)
-                                force_quiescent_state(rsp, 0);
-                        rdp->n_force_qs_snap = rsp->n_force_qs;
-                        rdp->qlen_last_fqs_check = rdp->qlen;
-                }
-        } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
-                force_quiescent_state(rsp, 1);
        local_irq_restore(flags);
 }
@@ -1961,28 +1987,16 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
 * occasionally incorrectly indicate that there are multiple CPUs online
 * when there was in fact only one the whole time, as this just adds
 * some overhead: RCU still operates correctly.
- *
- * Of course, sampling num_online_cpus() with preemption enabled can
- * give erroneous results if there are concurrent CPU-hotplug operations.
- * For example, given a demonic sequence of preemptions in num_online_cpus()
- * and CPU-hotplug operations, there could be two or more CPUs online at
- * all times, but num_online_cpus() might well return one (or even zero).
- *
- * However, all such demonic sequences require at least one CPU-offline
- * operation.  Furthermore, rcu_blocking_is_gp() giving the wrong answer
- * is only a problem if there is an RCU read-side critical section executing
- * throughout.  But RCU-sched and RCU-bh read-side critical sections
- * disable either preemption or bh, which prevents a CPU from going offline.
- * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return
- * that there is only one CPU when in fact there was more than one throughout
- * is when there were no RCU readers in the system.  If there are no
- * RCU readers, the grace period by definition can be of zero length,
- * regardless of the number of online CPUs.
 */
 static inline int rcu_blocking_is_gp(void)
 {
+        int ret;
         might_sleep();  /* Check for RCU read-side critical section. */
-        return num_online_cpus() <= 1;
+        preempt_disable(); /* sample num_online_cpus() with preemption off */
+        ret = num_online_cpus() <= 1; /* nonzero if at most one CPU is online */
+        preempt_enable();
+        return ret;
 }
 /**
@@ -2117,9 +2131,9 @@ void synchronize_sched_expedited(void)
                put_online_cpus();
                /* No joy, try again later.  Or just synchronize_sched(). */
-                if (trycount++ < 10)
+                if (trycount++ < 10) {
                        udelay(trycount * num_online_cpus());
-                else {
+                } else {
                        synchronize_sched();
                        return;
                }
@@ -2240,9 +2254,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 */
 static int rcu_pending(int cpu)
 {
-        return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
+        struct rcu_state *rsp;
-               __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) ||
-               rcu_preempt_pending(cpu);
+        for_each_rcu_flavor(rsp)
+                if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu)))
+                        return 1; /* stop at the first flavor reporting pending work */
+        return 0; /* no flavor reported anything for this CPU */
 }
 /*
@@ -2252,20 +2269,41 @@ static int rcu_pending(int cpu)
 */
 static int rcu_cpu_has_callbacks(int cpu)
 {
+        struct rcu_state *rsp;
         /* RCU callbacks either ready or pending? */
-        return per_cpu(rcu_sched_data, cpu).nxtlist ||
+        for_each_rcu_flavor(rsp)
-               per_cpu(rcu_bh_data, cpu).nxtlist ||
+                if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) /* non-NULL list head for this flavor */
-               rcu_preempt_cpu_has_callbacks(cpu);
+                        return 1;
+        return 0; /* ->nxtlist was NULL for every flavor */
+}
+/*
+ * Helper function for _rcu_barrier() tracing.  If tracing is disabled,
+ * the compiler is expected to optimize this away.
+ *
+ * @s is a short event label (callers pass strings such as "Begin",
+ * "Check" and "Inc1"), @cpu is the CPU the event refers to, or -1 when
+ * the caller has no specific CPU to report, and @done is the
+ * ->n_barrier_done value (or a snapshot of it) to record.  The current
+ * value of ->barrier_cpu_count is read here and passed along as well.
+ */
+static void _rcu_barrier_trace(struct rcu_state *rsp, char *s,
+                               int cpu, unsigned long done)
+{
+        trace_rcu_barrier(rsp->name, s, cpu,
+                          atomic_read(&rsp->barrier_cpu_count), done);
 }
 /*
 * RCU callback function for _rcu_barrier().  If we are last, wake
 * up the task executing _rcu_barrier().
 */
-static void rcu_barrier_callback(struct rcu_head *notused)
+static void rcu_barrier_callback(struct rcu_head *rhp)
 {
-        if (atomic_dec_and_test(&rcu_barrier_cpu_count))
+        struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); /* rhp is the ->barrier_head embedded in an rcu_data */
-                complete(&rcu_barrier_completion);
+        struct rcu_state *rsp = rdp->rsp; /* flavor recovered from the enclosing rcu_data */
+        if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
+                _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
+                complete(&rsp->barrier_completion); /* count reached zero: wake the _rcu_barrier() waiter */
+        } else {
+                _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
+        }
 }
 /*
@@ -2273,35 +2311,63 @@ static void rcu_barrier_callback(struct rcu_head *notused)
 */
 static void rcu_barrier_func(void *type)
 {
-        int cpu = smp_processor_id();
+        struct rcu_state *rsp = type; /* _rcu_barrier() passes the rcu_state as the cross-call argument */
-        struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
+        struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
-        void (*call_rcu_func)(struct rcu_head *head,
-                              void (*func)(struct rcu_head *head));
-        atomic_inc(&rcu_barrier_cpu_count);
+        _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
-        call_rcu_func = type;
+        atomic_inc(&rsp->barrier_cpu_count); /* account for the callback queued just below */
-        call_rcu_func(head, rcu_barrier_callback);
+        rsp->call(&rdp->barrier_head, rcu_barrier_callback); /* queues this CPU's per-flavor barrier_head */
 }
 /*
 * Orchestrate the specified type of RCU barrier, waiting for all
 * RCU callbacks of the specified type to complete.
 */
-static void _rcu_barrier(struct rcu_state *rsp,
+static void _rcu_barrier(struct rcu_state *rsp)
-                         void (*call_rcu_func)(struct rcu_head *head,
-                                               void (*func)(struct rcu_head *head)))
 {
        int cpu;
        unsigned long flags;
        struct rcu_data *rdp;
-        struct rcu_head rh;
+        struct rcu_data rd;
+        unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
+        unsigned long snap_done;
-        init_rcu_head_on_stack(&rh);
+        init_rcu_head_on_stack(&rd.barrier_head);
+        _rcu_barrier_trace(rsp, "Begin", -1, snap);
        /* Take mutex to serialize concurrent rcu_barrier() requests. */
-        mutex_lock(&rcu_barrier_mutex);
+        mutex_lock(&rsp->barrier_mutex);
+        /*
+         * Ensure that all prior references, including to ->n_barrier_done,
+         * are ordered before the _rcu_barrier() machinery.
+         */
+        smp_mb();  /* See above block comment. */
+        /*
+         * Recheck ->n_barrier_done to see if others did our work for us.
+         * This means checking ->n_barrier_done for an even-to-odd-to-even
+         * transition.  The "if" expression below therefore rounds the old
+         * value up to the next even number and adds two before comparing.
+         */
+        snap_done = ACCESS_ONCE(rsp->n_barrier_done);
+        _rcu_barrier_trace(rsp, "Check", -1, snap_done);
+        if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) {
+                _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
+                smp_mb(); /* caller's subsequent code after above check. */
+                mutex_unlock(&rsp->barrier_mutex);
+                return;
+        }
-        smp_mb();  /* Prevent any prior operations from leaking in. */
+        /*
+         * Increment ->n_barrier_done to avoid duplicate work.  Use
+         * ACCESS_ONCE() to prevent the compiler from speculating
+         * the increment to precede the early-exit check.
+         */
+        ACCESS_ONCE(rsp->n_barrier_done)++;
+        WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
+        _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
+        smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
        /*
         * Initialize the count to one rather than to zero in order to
@@ -2320,8 +2386,8 @@ static void _rcu_barrier(struct rcu_state *rsp,
         * 6.   Both rcu_barrier_callback() callbacks are invoked, awakening
         *      us -- but before CPU 1's orphaned callbacks are invoked!!!
         */
-        init_completion(&rcu_barrier_completion);
+        init_completion(&rsp->barrier_completion);
-        atomic_set(&rcu_barrier_cpu_count, 1);
+        atomic_set(&rsp->barrier_cpu_count, 1);
        raw_spin_lock_irqsave(&rsp->onofflock, flags);
        rsp->rcu_barrier_in_progress = current;
        raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
@@ -2337,14 +2403,19 @@ static void _rcu_barrier(struct rcu_state *rsp,
                preempt_disable();
                rdp = per_cpu_ptr(rsp->rda, cpu);
                if (cpu_is_offline(cpu)) {
+                        _rcu_barrier_trace(rsp, "Offline", cpu,
+                                           rsp->n_barrier_done);
                        preempt_enable();
                        while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
                                schedule_timeout_interruptible(1);
                } else if (ACCESS_ONCE(rdp->qlen)) {
-                        smp_call_function_single(cpu, rcu_barrier_func,
+                        _rcu_barrier_trace(rsp, "OnlineQ", cpu,
-                                                 (void *)call_rcu_func, 1);
+                                           rsp->n_barrier_done);
+                        smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
                        preempt_enable();
                } else {
+                        _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
+                                           rsp->n_barrier_done);
                        preempt_enable();
                }
        }
@@ -2361,24 +2432,32 @@ static void _rcu_barrier(struct rcu_state *rsp,
        rcu_adopt_orphan_cbs(rsp);
        rsp->rcu_barrier_in_progress = NULL;
        raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
-        atomic_inc(&rcu_barrier_cpu_count);
+        atomic_inc(&rsp->barrier_cpu_count);
        smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
-        call_rcu_func(&rh, rcu_barrier_callback);
+        rd.rsp = rsp;
+        rsp->call(&rd.barrier_head, rcu_barrier_callback);
        /*
         * Now that we have an rcu_barrier_callback() callback on each
         * CPU, and thus each counted, remove the initial count.
         */
-        if (atomic_dec_and_test(&rcu_barrier_cpu_count))
+        if (atomic_dec_and_test(&rsp->barrier_cpu_count))
-                complete(&rcu_barrier_completion);
+                complete(&rsp->barrier_completion);
+        /* Increment ->n_barrier_done to prevent duplicate work. */
+        smp_mb(); /* Keep increment after above mechanism. */
+        ACCESS_ONCE(rsp->n_barrier_done)++;
+        WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
+        _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
+        smp_mb(); /* Keep increment before caller's subsequent code. */
        /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
-        wait_for_completion(&rcu_barrier_completion);
+        wait_for_completion(&rsp->barrier_completion);
        /* Other rcu_barrier() invocations can now safely proceed. */
-        mutex_unlock(&rcu_barrier_mutex);
+        mutex_unlock(&rsp->barrier_mutex);
-        destroy_rcu_head_on_stack(&rh);
+        destroy_rcu_head_on_stack(&rd.barrier_head);
 }
 /**
@@ -2386,7 +2465,7 @@ static void _rcu_barrier(struct rcu_state *rsp,
 */
 void rcu_barrier_bh(void)
 {
-        _rcu_barrier(&rcu_bh_state, call_rcu_bh);
+        _rcu_barrier(&rcu_bh_state); /* _rcu_barrier() invokes rsp->call itself */
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_bh);
@@ -2395,7 +2474,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh);
 */
 void rcu_barrier_sched(void)
 {
-        _rcu_barrier(&rcu_sched_state, call_rcu_sched);
+        _rcu_barrier(&rcu_sched_state); /* _rcu_barrier() invokes rsp->call itself */
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
@@ -2406,18 +2485,15 @@ static void __init
 rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 {
        unsigned long flags;
-        int i;
        struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
        struct rcu_node *rnp = rcu_get_root(rsp);
        /* Set up local state, ensuring consistent view of global state. */
        raw_spin_lock_irqsave(&rnp->lock, flags);
        rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
-        rdp->nxtlist = NULL;
+        init_callback_list(rdp);
-        for (i = 0; i < RCU_NEXT_SIZE; i++)
-                rdp->nxttail[i] = &rdp->nxtlist;
        rdp->qlen_lazy = 0;
-        rdp->qlen = 0;
+        ACCESS_ONCE(rdp->qlen) = 0;
        rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
        WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
        WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -2491,9 +2567,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
 static void __cpuinit rcu_prepare_cpu(int cpu)
 {
-        rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
+        struct rcu_state *rsp;
-        rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
-        rcu_preempt_init_percpu_data(cpu);
+        for_each_rcu_flavor(rsp)
+                rcu_init_percpu_data(cpu, rsp,
+                                     strcmp(rsp->name, "rcu_preempt") == 0); /* "preemptible" argument: nonzero only for the flavor with this name */
 }
 /*
@@ -2505,6 +2583,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
        long cpu = (long)hcpu;
        struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
        struct rcu_node *rnp = rdp->mynode;
+        struct rcu_state *rsp;
        trace_rcu_utilization("Start CPU hotplug");
        switch (action) {
@@ -2529,18 +2608,16 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
                 * touch any data without introducing corruption. We send the
                 * dying CPU's callbacks to an arbitrarily chosen online CPU.
                 */
-                rcu_cleanup_dying_cpu(&rcu_bh_state);
+                for_each_rcu_flavor(rsp)
-                rcu_cleanup_dying_cpu(&rcu_sched_state);
+                        rcu_cleanup_dying_cpu(rsp);
-                rcu_preempt_cleanup_dying_cpu();
                rcu_cleanup_after_idle(cpu);
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
        case CPU_UP_CANCELED:
        case CPU_UP_CANCELED_FROZEN:
-                rcu_cleanup_dead_cpu(cpu, &rcu_bh_state);
+                for_each_rcu_flavor(rsp)
-                rcu_cleanup_dead_cpu(cpu, &rcu_sched_state);
+                        rcu_cleanup_dead_cpu(cpu, rsp);
-                rcu_preempt_cleanup_dead_cpu(cpu);
                break;
        default:
                break;
@@ -2573,9 +2650,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
 {
        int i;
-        for (i = NUM_RCU_LVLS - 1; i > 0; i--)
+        for (i = rcu_num_lvls - 1; i > 0; i--)
                rsp->levelspread[i] = CONFIG_RCU_FANOUT;
-        rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF;
+        rsp->levelspread[0] = rcu_fanout_leaf;
 }
 #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
 static void __init rcu_init_levelspread(struct rcu_state *rsp)
@@ -2585,7 +2662,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
        int i;
        cprv = NR_CPUS;
-        for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+        for (i = rcu_num_lvls - 1; i >= 0; i--) {
                ccur = rsp->levelcnt[i];
                rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
                cprv = ccur;
@@ -2612,13 +2689,15 @@ static void __init rcu_init_one(struct rcu_state *rsp,
        /* Initialize the level-tracking arrays. */
-        for (i = 1; i < NUM_RCU_LVLS; i++)
+        for (i = 0; i < rcu_num_lvls; i++)
+                rsp->levelcnt[i] = num_rcu_lvl[i];
+        for (i = 1; i < rcu_num_lvls; i++)
                rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
        rcu_init_levelspread(rsp);
        /* Initialize the elements themselves, starting from the leaves. */
-        for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+        for (i = rcu_num_lvls - 1; i >= 0; i--) {
                cpustride *= rsp->levelspread[i];
                rnp = rsp->level[i];
                for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
@@ -2648,13 +2727,74 @@ static void __init rcu_init_one(struct rcu_state *rsp,
        }
        rsp->rda = rda;
-        rnp = rsp->level[NUM_RCU_LVLS - 1];
+        rnp = rsp->level[rcu_num_lvls - 1];
        for_each_possible_cpu(i) {
                while (i > rnp->grphi)
                        rnp++;
                per_cpu_ptr(rsp->rda, i)->mynode = rnp;
                rcu_boot_init_percpu_data(i, rsp);
        }
+        list_add(&rsp->flavors, &rcu_struct_flavors);
+}
+/*
+ * Compute the rcu_node tree geometry from kernel parameters.  This cannot
+ * replace the definitions in rcutree.h because those are needed to size
+ * the ->node array in the rcu_state structure.
+ *
+ * When the computation goes ahead it overwrites num_rcu_lvl[],
+ * rcu_num_lvls and rcu_num_nodes.  If rcu_fanout_leaf equals
+ * CONFIG_RCU_FANOUT_LEAF, or if it fails the sanity checks below, this
+ * function returns before writing any of them.
+ */
+static void __init rcu_init_geometry(void)
+{
+        int i;
+        int j;
+        int n = nr_cpu_ids; /* number of CPUs the tree must accommodate */
+        int rcu_capacity[MAX_RCU_LVLS + 1]; /* [i]: capacity of a tree with i levels */
+        /* If the compile-time values are accurate, just leave. */
+        if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF)
+                return;
+        /*
+         * Compute number of nodes that can be handled by an rcu_node tree
+         * with the given number of levels.  Setting rcu_capacity[0] makes
+         * some of the arithmetic easier.
+         */
+        rcu_capacity[0] = 1;
+        rcu_capacity[1] = rcu_fanout_leaf;
+        for (i = 2; i <= MAX_RCU_LVLS; i++)
+                rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
+        /*
+         * The boot-time rcu_fanout_leaf parameter is only permitted
+         * to increase the leaf-level fanout, not decrease it.  Of course,
+         * the leaf-level fanout cannot exceed the number of bits in
+         * the rcu_node masks.  Finally, the tree must be able to accommodate
+         * the configured number of CPUs.  Complain and fall back to the
+         * compile-time values if these limits are exceeded.
+         */
+        if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
+            rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
+            n > rcu_capacity[MAX_RCU_LVLS]) {
+                WARN_ON(1);
+                return;
+        }
+        /* Calculate the number of rcu_nodes at each level of the tree. */
+        for (i = 1; i <= MAX_RCU_LVLS; i++)
+                if (n <= rcu_capacity[i]) {
+                        for (j = 0; j <= i; j++)
+                                num_rcu_lvl[j] =
+                                        DIV_ROUND_UP(n, rcu_capacity[i - j]);
+                        rcu_num_lvls = i; /* first (smallest) depth whose capacity covers n */
+                        for (j = i + 1; j <= MAX_RCU_LVLS; j++)
+                                num_rcu_lvl[j] = 0;
+                        break;
+                }
+        /* Calculate the total number of rcu_node structures. */
+        rcu_num_nodes = 0;
+        for (i = 0; i <= MAX_RCU_LVLS; i++)
+                rcu_num_nodes += num_rcu_lvl[i];
+        rcu_num_nodes -= n; /* num_rcu_lvl[rcu_num_lvls] holds n itself (divided by rcu_capacity[0] == 1), so remove it */
 }
 void __init rcu_init(void)
@@ -2662,6 +2802,7 @@ void __init rcu_init(void)
        int cpu;
        rcu_bootup_announce();
+        rcu_init_geometry();
        rcu_init_one(&rcu_sched_state, &rcu_sched_data);
        rcu_init_one(&rcu_bh_state, &rcu_bh_data);
        __rcu_init_preempt();
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index ea056495783e..4d29169f2124 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -42,28 +42,28 @@
 #define RCU_FANOUT_4          (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
 #if NR_CPUS <= RCU_FANOUT_1
-#  define NUM_RCU_LVLS        1
+#  define RCU_NUM_LVLS        1
 #  define NUM_RCU_LVL_0       1
 #  define NUM_RCU_LVL_1       (NR_CPUS)
 #  define NUM_RCU_LVL_2       0
 #  define NUM_RCU_LVL_3       0
 #  define NUM_RCU_LVL_4       0
 #elif NR_CPUS <= RCU_FANOUT_2
-#  define NUM_RCU_LVLS        2
+#  define RCU_NUM_LVLS        2
 #  define NUM_RCU_LVL_0       1
 #  define NUM_RCU_LVL_1       DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
 #  define NUM_RCU_LVL_2       (NR_CPUS)
 #  define NUM_RCU_LVL_3       0
 #  define NUM_RCU_LVL_4       0
 #elif NR_CPUS <= RCU_FANOUT_3
-#  define NUM_RCU_LVLS        3
+#  define RCU_NUM_LVLS        3
 #  define NUM_RCU_LVL_0       1
 #  define NUM_RCU_LVL_1       DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
 #  define NUM_RCU_LVL_2       DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
 #  define NUM_RCU_LVL_3       (NR_CPUS)
 #  define NUM_RCU_LVL_4       0
 #elif NR_CPUS <= RCU_FANOUT_4
-#  define NUM_RCU_LVLS        4
+#  define RCU_NUM_LVLS        4
 #  define NUM_RCU_LVL_0       1
 #  define NUM_RCU_LVL_1       DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
 #  define NUM_RCU_LVL_2       DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
@@ -76,6 +76,9 @@
 #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
 #define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
+extern int rcu_num_lvls;
+extern int rcu_num_nodes;
 /*
 * Dynticks per-CPU state.
 */
@@ -97,6 +100,7 @@ struct rcu_dynticks {
                                    /* # times non-lazy CBs posted to CPU. */
        unsigned long nonlazy_posted_snap;
                                    /* idle-period nonlazy_posted snapshot. */
+        int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
 #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
 };
@@ -206,7 +210,7 @@ struct rcu_node {
 */
 #define rcu_for_each_node_breadth_first(rsp, rnp) \
        for ((rnp) = &(rsp)->node[0]; \
-             (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+             (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
 /*
 * Do a breadth-first scan of the non-leaf rcu_node structures for the
@@ -215,7 +219,7 @@ struct rcu_node {
 */
 #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
        for ((rnp) = &(rsp)->node[0]; \
-             (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
+             (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
 /*
 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
@@ -224,8 +228,8 @@ struct rcu_node {
 * It is still a leaf node, even if it is also the root node.
 */
 #define rcu_for_each_leaf_node(rsp, rnp) \
-        for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
+        for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
-             (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+             (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
 /* Index values for nxttail array in struct rcu_data. */
 #define RCU_DONE_TAIL           0       /* Also RCU_WAIT head. */
@@ -311,6 +315,9 @@ struct rcu_data {
        unsigned long n_rp_need_fqs;
        unsigned long n_rp_need_nothing;
+        /* 6) _rcu_barrier() callback. */
+        struct rcu_head barrier_head;
        int cpu;
        struct rcu_state *rsp;
 };
@@ -357,10 +364,12 @@ do {									\
 */
 struct rcu_state {
        struct rcu_node node[NUM_RCU_NODES];    /* Hierarchy. */
-        struct rcu_node *level[NUM_RCU_LVLS];   /* Hierarchy levels. */
+        struct rcu_node *level[RCU_NUM_LVLS];   /* Hierarchy levels. */
        u32 levelcnt[MAX_RCU_LVLS + 1];         /* # nodes in each level. */
-        u8 levelspread[NUM_RCU_LVLS];           /* kids/node in each level. */
+        u8 levelspread[RCU_NUM_LVLS];           /* kids/node in each level. */
        struct rcu_data __percpu *rda;          /* pointer of percu rcu_data. */
+        void (*call)(struct rcu_head *head,     /* call_rcu() flavor. */
+                     void (*func)(struct rcu_head *head));
        /* The following fields are guarded by the root rcu_node's lock. */
@@ -392,6 +401,11 @@ struct rcu_state {
        struct task_struct *rcu_barrier_in_progress;
                                                /* Task doing rcu_barrier(), */
                                                /*  or NULL if no barrier. */
+        struct mutex barrier_mutex;             /* Guards barrier fields. */
+        atomic_t barrier_cpu_count;             /* # CPUs waiting on. */
+        struct completion barrier_completion;   /* Wake at barrier end. */
+        unsigned long n_barrier_done;           /* ++ at start and end of */
+                                                /*  _rcu_barrier(). */
        raw_spinlock_t fqslock;                 /* Only one task forcing */
                                                /*  quiescent states. */
        unsigned long jiffies_force_qs;         /* Time at which to invoke */
@@ -409,8 +423,13 @@ struct rcu_state {
        unsigned long gp_max;                   /* Maximum GP duration in */
                                                /*  jiffies. */
        char *name;                             /* Name of structure. */
+        struct list_head flavors;               /* List of RCU flavors. */
 };
+extern struct list_head rcu_struct_flavors;
+#define for_each_rcu_flavor(rsp) \
+        list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
 /* Return values for rcu_preempt_offline_tasks(). */
 #define RCU_OFL_TASKS_NORM_GP   0x1             /* Tasks blocking normal */
@@ -444,6 +463,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
 /* Forward declarations for rcutree_plugin.h */
 static void rcu_bootup_announce(void);
 long rcu_batches_completed(void);
+static void rcu_preempt_note_context_switch(int cpu);
 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
 static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
@@ -452,25 +472,18 @@ static void rcu_stop_cpu_kthread(int cpu);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void rcu_print_detail_task_stall(struct rcu_state *rsp);
 static int rcu_print_task_stall(struct rcu_node *rnp);
-static void rcu_preempt_stall_reset(void);
 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
 static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
                                     struct rcu_node *rnp,
                                     struct rcu_data *rdp);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
-static void rcu_preempt_cleanup_dead_cpu(int cpu);
 static void rcu_preempt_check_callbacks(int cpu);
-static void rcu_preempt_process_callbacks(void);
 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
 #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
 static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
                               bool wake);
 #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
-static int rcu_preempt_pending(int cpu);
-static int rcu_preempt_cpu_has_callbacks(int cpu);
-static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
-static void rcu_preempt_cleanup_dying_cpu(void);
 static void __init __rcu_init_preempt(void);
 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 5271a020887e..7f3244c0df01 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -68,17 +68,21 @@ static void __init rcu_bootup_announce_oddness(void)
        printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n");
 #endif
 #if NUM_RCU_LVL_4 != 0
-        printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
+        printk(KERN_INFO "\tFour-level hierarchy is enabled.\n");
 #endif
+        if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
+                printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
+        if (nr_cpu_ids != NR_CPUS)
+                printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
 }
 #ifdef CONFIG_TREE_PREEMPT_RCU
-struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt);
+struct rcu_state rcu_preempt_state =
+        RCU_STATE_INITIALIZER(rcu_preempt, call_rcu);
 DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
 static struct rcu_state *rcu_state = &rcu_preempt_state;
-static void rcu_read_unlock_special(struct task_struct *t);
 static int rcu_preempted_readers_exp(struct rcu_node *rnp);
 /*
@@ -153,7 +157,7 @@ static void rcu_preempt_qs(int cpu)
 *
 * Caller must disable preemption.
 */
-void rcu_preempt_note_context_switch(void)
+static void rcu_preempt_note_context_switch(int cpu)
 {
        struct task_struct *t = current;
        unsigned long flags;
@@ -164,7 +168,7 @@ void rcu_preempt_note_context_switch(void)
            (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
                /* Possibly blocking in an RCU read-side critical section. */
-                rdp = __this_cpu_ptr(rcu_preempt_state.rda);
+                rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
                rnp = rdp->mynode;
                raw_spin_lock_irqsave(&rnp->lock, flags);
                t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -228,23 +232,11 @@ void rcu_preempt_note_context_switch(void)
         * means that we continue to block the current grace period.
         */
        local_irq_save(flags);
-        rcu_preempt_qs(smp_processor_id());
+        rcu_preempt_qs(cpu);
        local_irq_restore(flags);
 }
 /*
- * Tree-preemptible RCU implementation for rcu_read_lock().
- * Just increment ->rcu_read_lock_nesting, shared state will be updated
- * if we block.
- */
-void __rcu_read_lock(void)
-{
-        current->rcu_read_lock_nesting++;
-        barrier();  /* needed if we ever invoke rcu_read_lock in rcutree.c */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_lock);
-/*
 * Check for preempted RCU readers blocking the current grace period
 * for the specified rcu_node structure.  If the caller needs a reliable
 * answer, it must hold the rcu_node's ->lock.
@@ -310,7 +302,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
 * notify RCU core processing or task having blocked during the RCU
 * read-side critical section.
 */
-static noinline void rcu_read_unlock_special(struct task_struct *t)
+void rcu_read_unlock_special(struct task_struct *t)
 {
        int empty;
        int empty_exp;
@@ -398,8 +390,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
                                                         rnp->grphi,
                                                         !!rnp->gp_tasks);
                        rcu_report_unblock_qs_rnp(rnp, flags);
-                } else
+                } else {
                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
+                }
 #ifdef CONFIG_RCU_BOOST
                /* Unboost if we were boosted. */
@@ -418,38 +411,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
        }
 }
-/*
- * Tree-preemptible RCU implementation for rcu_read_unlock().
- * Decrement ->rcu_read_lock_nesting.  If the result is zero (outermost
- * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
- * invoke rcu_read_unlock_special() to clean up after a context switch
- * in an RCU read-side critical section and other special cases.
- */
-void __rcu_read_unlock(void)
-{
-        struct task_struct *t = current;
-        if (t->rcu_read_lock_nesting != 1)
-                --t->rcu_read_lock_nesting;
-        else {
-                barrier();  /* critical section before exit code. */
-                t->rcu_read_lock_nesting = INT_MIN;
-                barrier();  /* assign before ->rcu_read_unlock_special load */
-                if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
-                        rcu_read_unlock_special(t);
-                barrier();  /* ->rcu_read_unlock_special load before assign */
-                t->rcu_read_lock_nesting = 0;
-        }
-#ifdef CONFIG_PROVE_LOCKING
-        {
-                int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
-                WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
-        }
-#endif /* #ifdef CONFIG_PROVE_LOCKING */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_unlock);
 #ifdef CONFIG_RCU_CPU_STALL_VERBOSE
 /*
@@ -540,16 +501,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
 }
 /*
- * Suppress preemptible RCU's CPU stall warnings by pushing the
- * time of the next stall-warning message comfortably far into the
- * future.
- */
-static void rcu_preempt_stall_reset(void)
-{
-        rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
-}
-/*
 * Check that the list of blocked tasks for the newly completed grace
 * period is in fact empty.  It is a serious bug to complete a grace
 * period that still has RCU readers blocked!  This function must be
@@ -650,14 +601,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 /*
- * Do CPU-offline processing for preemptible RCU.
- */
-static void rcu_preempt_cleanup_dead_cpu(int cpu)
-{
-        rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state);
-}
-/*
 * Check for a quiescent state from the current CPU.  When a task blocks,
 * the task is recorded in the corresponding CPU's rcu_node structure,
 * which is checked elsewhere.
@@ -677,15 +620,6 @@ static void rcu_preempt_check_callbacks(int cpu)
                t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
 }
-/*
- * Process callbacks for preemptible RCU.
- */
-static void rcu_preempt_process_callbacks(void)
-{
-        __rcu_process_callbacks(&rcu_preempt_state,
-                                &__get_cpu_var(rcu_preempt_data));
-}
 #ifdef CONFIG_RCU_BOOST
 static void rcu_preempt_do_callbacks(void)
@@ -824,9 +758,9 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
        int must_wait = 0;
        raw_spin_lock_irqsave(&rnp->lock, flags);
-        if (list_empty(&rnp->blkd_tasks))
+        if (list_empty(&rnp->blkd_tasks)) {
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
-        else {
+        } else {
                rnp->exp_tasks = rnp->blkd_tasks.next;
                rcu_initiate_boost(rnp, flags);  /* releases rnp->lock */
                must_wait = 1;
@@ -870,9 +804,9 @@ void synchronize_rcu_expedited(void)
         * expedited grace period for us, just leave.
         */
        while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
-                if (trycount++ < 10)
+                if (trycount++ < 10) {
                        udelay(trycount * num_online_cpus());
-                else {
+                } else {
                        synchronize_rcu();
                        return;
                }
@@ -917,51 +851,16 @@ mb_ret:
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-/*
- * Check to see if there is any immediate preemptible-RCU-related work
- * to be done.
- */
-static int rcu_preempt_pending(int cpu)
-{
-        return __rcu_pending(&rcu_preempt_state,
-                             &per_cpu(rcu_preempt_data, cpu));
-}
-/*
- * Does preemptible RCU have callbacks on this CPU?
- */
-static int rcu_preempt_cpu_has_callbacks(int cpu)
-{
-        return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
-}
 /**
 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
 */
 void rcu_barrier(void)
 {
-        _rcu_barrier(&rcu_preempt_state, call_rcu);
+        _rcu_barrier(&rcu_preempt_state);
 }
 EXPORT_SYMBOL_GPL(rcu_barrier);
 /*
- * Initialize preemptible RCU's per-CPU data.
- */
-static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
-{
-        rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
-}
-/*
- * Move preemptible RCU's callbacks from dying CPU to other online CPU
- * and record a quiescent state.
- */
-static void rcu_preempt_cleanup_dying_cpu(void)
-{
-        rcu_cleanup_dying_cpu(&rcu_preempt_state);
-}
-/*
 * Initialize preemptible RCU's state structures.
 */
 static void __init __rcu_init_preempt(void)
@@ -1002,6 +901,14 @@ void rcu_force_quiescent_state(void)
 EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
 /*
+ * Because preemptible RCU does not exist, we never have to check for
+ * CPUs being in quiescent states.
+ */
+static void rcu_preempt_note_context_switch(int cpu)
+{
+}
+/*
 * Because preemptible RCU does not exist, there are never any preempted
 * RCU readers.
 */
@@ -1038,14 +945,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
 }
 /*
- * Because preemptible RCU does not exist, there is no need to suppress
- * its CPU stall warnings.
- */
-static void rcu_preempt_stall_reset(void)
-{
-}
-/*
 * Because there is no preemptible RCU, there can be no readers blocked,
 * so there is no need to check for blocked tasks.  So check only for
 * bogus qsmask values.
@@ -1073,14 +972,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 /*
- * Because preemptible RCU does not exist, it never needs CPU-offline
- * processing.
- */
-static void rcu_preempt_cleanup_dead_cpu(int cpu)
-{
-}
-/*
 * Because preemptible RCU does not exist, it never has any callbacks
 * to check.
 */
@@ -1089,14 +980,6 @@ static void rcu_preempt_check_callbacks(int cpu)
 }
 /*
- * Because preemptible RCU does not exist, it never has any callbacks
- * to process.
- */
-static void rcu_preempt_process_callbacks(void)
-{
-}
-/*
 * Queue an RCU callback for lazy invocation after a grace period.
 * This will likely be later named something like "call_rcu_lazy()",
 * but this change will require some way of tagging the lazy RCU
@@ -1137,22 +1020,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 /*
- * Because preemptible RCU does not exist, it never has any work to do.
- */
-static int rcu_preempt_pending(int cpu)
-{
-        return 0;
-}
-/*
- * Because preemptible RCU does not exist, it never has callbacks
- */
-static int rcu_preempt_cpu_has_callbacks(int cpu)
-{
-        return 0;
-}
-/*
 * Because preemptible RCU does not exist, rcu_barrier() is just
 * another name for rcu_barrier_sched().
 */
@@ -1163,21 +1030,6 @@ void rcu_barrier(void)
 EXPORT_SYMBOL_GPL(rcu_barrier);
 /*
- * Because preemptible RCU does not exist, there is no per-CPU
- * data to initialize.
- */
-static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
-{
-}
-/*
- * Because there is no preemptible RCU, there is no cleanup to do.
- */
-static void rcu_preempt_cleanup_dying_cpu(void)
-{
-}
-/*
 * Because preemptible RCU does not exist, it need not be initialized.
 */
 static void __init __rcu_init_preempt(void)
@@ -1960,9 +1812,11 @@ static void rcu_idle_count_callbacks_posted(void)
 */
 #define RCU_IDLE_FLUSHES 5              /* Number of dyntick-idle tries. */
 #define RCU_IDLE_OPT_FLUSHES 3          /* Optional dyntick-idle tries. */
-#define RCU_IDLE_GP_DELAY 6             /* Roughly one grace period. */
+#define RCU_IDLE_GP_DELAY 4             /* Roughly one grace period. */
 #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
+extern int tick_nohz_enabled;
 /*
 * Does the specified flavor of RCU have non-lazy callbacks pending on
 * the specified CPU?  Both RCU flavor and CPU are specified by the
@@ -2039,10 +1893,13 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
                return 1;
        }
        /* Set up for the possibility that RCU will post a timer. */
-        if (rcu_cpu_has_nonlazy_callbacks(cpu))
+        if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
-                *delta_jiffies = RCU_IDLE_GP_DELAY;
+                *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies,
-        else
+                                          RCU_IDLE_GP_DELAY) - jiffies;
-                *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY;
+        } else {
+                *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY;
+                *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
+        }
        return 0;
 }
@@ -2101,6 +1958,7 @@ static void rcu_cleanup_after_idle(int cpu)
        del_timer(&rdtp->idle_gp_timer);
        trace_rcu_prep_idle("Cleanup after idle");
+        rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled);
 }
 /*
@@ -2126,6 +1984,18 @@ static void rcu_prepare_for_idle(int cpu)
 {
        struct timer_list *tp;
        struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+        int tne;
+        /* Handle nohz enablement switches conservatively. */
+        tne = ACCESS_ONCE(tick_nohz_enabled);
+        if (tne != rdtp->tick_nohz_enabled_snap) {
+                if (rcu_cpu_has_callbacks(cpu))
+                        invoke_rcu_core(); /* force nohz to see update. */
+                rdtp->tick_nohz_enabled_snap = tne;
+                return;
+        }
+        if (!tne)
+                return;
        /*
         * If this is an idle re-entry, for example, due to use of
@@ -2179,10 +2049,11 @@ static void rcu_prepare_for_idle(int cpu)
                if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
                        trace_rcu_prep_idle("Dyntick with callbacks");
                        rdtp->idle_gp_timer_expires =
-                                           jiffies + RCU_IDLE_GP_DELAY;
+                                round_up(jiffies + RCU_IDLE_GP_DELAY,
+                                         RCU_IDLE_GP_DELAY);
                } else {
                        rdtp->idle_gp_timer_expires =
-                                           jiffies + RCU_IDLE_LAZY_GP_DELAY;
+                                round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
                        trace_rcu_prep_idle("Dyntick with lazy callbacks");
                }
                tp = &rdtp->idle_gp_timer;
@@ -2223,8 +2094,9 @@ static void rcu_prepare_for_idle(int cpu)
        if (rcu_cpu_has_callbacks(cpu)) {
                trace_rcu_prep_idle("More callbacks");
                invoke_rcu_core();
-        } else
+        } else {
                trace_rcu_prep_idle("Callbacks drained");
+        }
 }
 /*
@@ -2261,6 +2133,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
 static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
 {
+        *cp = '\0';
 }
 #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d4bc16ddd1d4..abffb486e94e 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,6 +46,31 @@
 #define RCU_TREE_NONCORE
 #include "rcutree.h"
+static int show_rcubarrier(struct seq_file *m, void *unused)
+{
+        struct rcu_state *rsp;
+        for_each_rcu_flavor(rsp)
+                seq_printf(m, "%s: %c bcc: %d nbd: %lu\n",
+                           rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.',
+                           atomic_read(&rsp->barrier_cpu_count),
+                           rsp->n_barrier_done);
+        return 0;
+}
+static int rcubarrier_open(struct inode *inode, struct file *file)
+{
+        return single_open(file, show_rcubarrier, NULL);
+}
+static const struct file_operations rcubarrier_fops = {
+        .owner = THIS_MODULE,
+        .open = rcubarrier_open,
+        .read = seq_read,
+        .llseek = seq_lseek,
+        .release = single_release,
+};
 #ifdef CONFIG_RCU_BOOST
 static char convert_kthread_status(unsigned int kthread_status)
@@ -95,24 +120,16 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
                   rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
 }
-#define PRINT_RCU_DATA(name, func, m) \
-        do { \
-                int _p_r_d_i; \
-                \
-                for_each_possible_cpu(_p_r_d_i) \
-                        func(m, &per_cpu(name, _p_r_d_i)); \
-        } while (0)
 static int show_rcudata(struct seq_file *m, void *unused)
 {
-#ifdef CONFIG_TREE_PREEMPT_RCU
+        int cpu;
-        seq_puts(m, "rcu_preempt:\n");
+        struct rcu_state *rsp;
-        PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m);
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+        for_each_rcu_flavor(rsp) {
-        seq_puts(m, "rcu_sched:\n");
+                seq_printf(m, "%s:\n", rsp->name);
-        PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m);
+                for_each_possible_cpu(cpu)
-        seq_puts(m, "rcu_bh:\n");
+                        print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu));
-        PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
+        }
        return 0;
 }
@@ -166,6 +183,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
 static int show_rcudata_csv(struct seq_file *m, void *unused)
 {
+        int cpu;
+        struct rcu_state *rsp;
        seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
        seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
        seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\"");
@@ -173,14 +193,11 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
        seq_puts(m, "\"kt\",\"ktl\"");
 #endif /* #ifdef CONFIG_RCU_BOOST */
        seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n");
-#ifdef CONFIG_TREE_PREEMPT_RCU
+        for_each_rcu_flavor(rsp) {
-        seq_puts(m, "\"rcu_preempt:\"\n");
+                seq_printf(m, "\"%s:\"\n", rsp->name);
-        PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
+                for_each_possible_cpu(cpu)
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+                        print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu));
-        seq_puts(m, "\"rcu_sched:\"\n");
+        }
-        PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m);
-        seq_puts(m, "\"rcu_bh:\"\n");
-        PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
        return 0;
 }
@@ -201,8 +218,7 @@ static const struct file_operations rcudata_csv_fops = {
 static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
 {
-        seq_printf(m,  "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu "
+        seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ",
-                   "j=%04x bt=%04x\n",
                   rnp->grplo, rnp->grphi,
                   "T."[list_empty(&rnp->blkd_tasks)],
                   "N."[!rnp->gp_tasks],
@@ -210,11 +226,11 @@ static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
                   "B."[!rnp->boost_tasks],
                   convert_kthread_status(rnp->boost_kthread_status),
                   rnp->n_tasks_boosted, rnp->n_exp_boosts,
-                   rnp->n_normal_boosts,
+                   rnp->n_normal_boosts);
+        seq_printf(m, "j=%04x bt=%04x\n",
                   (int)(jiffies & 0xffff),
                   (int)(rnp->boost_time & 0xffff));
-        seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
+        seq_printf(m, "    balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
-                   "     balk",
                   rnp->n_balk_blkd_tasks,
                   rnp->n_balk_exp_gp_tasks,
                   rnp->n_balk_boost_tasks,
@@ -270,15 +286,15 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
        struct rcu_node *rnp;
        gpnum = rsp->gpnum;
-        seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
+        seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ",
-                      "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
+                   rsp->name, rsp->completed, gpnum, rsp->fqs_state,
-                   rsp->completed, gpnum, rsp->fqs_state,
                   (long)(rsp->jiffies_force_qs - jiffies),
-                   (int)(jiffies & 0xffff),
+                   (int)(jiffies & 0xffff));
+        seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
                   rsp->n_force_qs, rsp->n_force_qs_ngp,
                   rsp->n_force_qs - rsp->n_force_qs_ngp,
                   rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen);
-        for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
+        for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
                if (rnp->level != level) {
                        seq_puts(m, "\n");
                        level = rnp->level;
@@ -295,14 +311,10 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
 static int show_rcuhier(struct seq_file *m, void *unused)
 {
-#ifdef CONFIG_TREE_PREEMPT_RCU
+        struct rcu_state *rsp;
-        seq_puts(m, "rcu_preempt:\n");
-        print_one_rcu_state(m, &rcu_preempt_state);
+        for_each_rcu_flavor(rsp)
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+                print_one_rcu_state(m, rsp);
-        seq_puts(m, "rcu_sched:\n");
-        print_one_rcu_state(m, &rcu_sched_state);
-        seq_puts(m, "rcu_bh:\n");
-        print_one_rcu_state(m, &rcu_bh_state);
        return 0;
 }
@@ -343,11 +355,10 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
 static int show_rcugp(struct seq_file *m, void *unused)
 {
-#ifdef CONFIG_TREE_PREEMPT_RCU
+        struct rcu_state *rsp;
-        show_one_rcugp(m, &rcu_preempt_state);
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+        for_each_rcu_flavor(rsp)
-        show_one_rcugp(m, &rcu_sched_state);
+                show_one_rcugp(m, rsp);
-        show_one_rcugp(m, &rcu_bh_state);
        return 0;
 }
@@ -366,44 +377,36 @@ static const struct file_operations rcugp_fops = {
 static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
 {
-        seq_printf(m, "%3d%cnp=%ld "
+        seq_printf(m, "%3d%cnp=%ld ",
-                   "qsp=%ld rpq=%ld cbr=%ld cng=%ld "
-                   "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
                   rdp->cpu,
                   cpu_is_offline(rdp->cpu) ? '!' : ' ',
-                   rdp->n_rcu_pending,
+                   rdp->n_rcu_pending);
+        seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
                   rdp->n_rp_qs_pending,
                   rdp->n_rp_report_qs,
                   rdp->n_rp_cb_ready,
-                   rdp->n_rp_cpu_needs_gp,
+                   rdp->n_rp_cpu_needs_gp);
+        seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
                   rdp->n_rp_gp_completed,
                   rdp->n_rp_gp_started,
                   rdp->n_rp_need_fqs,
                   rdp->n_rp_need_nothing);
 }
-static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
+static int show_rcu_pending(struct seq_file *m, void *unused)
 {
        int cpu;
        struct rcu_data *rdp;
+        struct rcu_state *rsp;
-        for_each_possible_cpu(cpu) {
-                rdp = per_cpu_ptr(rsp->rda, cpu);
+        for_each_rcu_flavor(rsp) {
-                if (rdp->beenonline)
+                seq_printf(m, "%s:\n", rsp->name);
-                        print_one_rcu_pending(m, rdp);
+                for_each_possible_cpu(cpu) {
+                        rdp = per_cpu_ptr(rsp->rda, cpu);
+                        if (rdp->beenonline)
+                                print_one_rcu_pending(m, rdp);
+                }
        }
-}
-static int show_rcu_pending(struct seq_file *m, void *unused)
-{
-#ifdef CONFIG_TREE_PREEMPT_RCU
-        seq_puts(m, "rcu_preempt:\n");
-        print_rcu_pendings(m, &rcu_preempt_state);
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-        seq_puts(m, "rcu_sched:\n");
-        print_rcu_pendings(m, &rcu_sched_state);
-        seq_puts(m, "rcu_bh:\n");
-        print_rcu_pendings(m, &rcu_bh_state);
        return 0;
 }
@@ -453,6 +456,11 @@ static int __init rcutree_trace_init(void)
        if (!rcudir)
                goto free_out;
+        retval = debugfs_create_file("rcubarrier", 0444, rcudir,
+                                                NULL, &rcubarrier_fops);
+        if (!retval)
+                goto free_out;
        retval = debugfs_create_file("rcudata", 0444, rcudir,
                                                NULL, &rcudata_fops);
        if (!retval)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d5594a4268d4..468bdd44c1ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2081,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
 #endif
        /* Here we just switch the register state and the stack. */
-        rcu_switch_from(prev);
        switch_to(prev, next, prev);
        barrier();
@@ -2161,11 +2160,73 @@ unsigned long this_cpu_load(void)
 }
+/*
+ * Global load-average calculations
+ *
+ * We take a distributed and async approach to calculating the global load-avg
+ * in order to minimize overhead.
+ *
+ * The global load average is an exponentially decaying average of nr_running +
+ * nr_uninterruptible.
+ *
+ * Once every LOAD_FREQ:
+ *
+ *   nr_active = 0;
+ *   for_each_possible_cpu(cpu)
+ *      nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
+ *
+ *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
+ *
+ * Due to a number of reasons the above turns into the mess below:
+ *
+ *  - for_each_possible_cpu() is prohibitively expensive on machines with
+ *    serious number of cpus, therefore we need to take a distributed approach
+ *    to calculating nr_active.
+ *
+ *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
+ *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
+ *
+ *    So assuming nr_active := 0 when we start out -- true per definition, we
+ *    can simply take per-cpu deltas and fold those into a global accumulate
+ *    to obtain the same result. See calc_load_fold_active().
+ *
+ *    Furthermore, in order to avoid synchronizing all per-cpu delta folding
+ *    across the machine, we assume 10 ticks is sufficient time for every
+ *    cpu to have completed this task.
+ *
+ *    This places an upper-bound on the IRQ-off latency of the machine. Then
+ *    again, being late doesn't lose the delta, just wrecks the sample.
+ *
+ *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
+ *    this would add another cross-cpu cacheline miss and atomic operation
+ *    to the wakeup path. Instead we increment on whatever cpu the task ran
+ *    when it went into uninterruptible state and decrement on whatever cpu
+ *    did the wakeup. This means that only the sum of nr_uninterruptible over
+ *    all cpus yields the correct result.
+ *
+ *  This covers the NO_HZ=n code, for extra head-aches, see the comment below.
+ */
 /* Variables and functions for calc_load */
 static atomic_long_t calc_load_tasks;
 static unsigned long calc_load_update;
 unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun);
+EXPORT_SYMBOL(avenrun); /* should be removed */
+/**
+ * get_avenrun - get the load average array
+ * @loads:      pointer to dest load array
+ * @offset:     offset to add
+ * @shift:      shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+        loads[0] = (avenrun[0] + offset) << shift;
+        loads[1] = (avenrun[1] + offset) << shift;
+        loads[2] = (avenrun[2] + offset) << shift;
+}
 static long calc_load_fold_active(struct rq *this_rq)
 {
@@ -2182,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq)
        return delta;
 }
+/*
+ * a1 = a0 * e + a * (1 - e)
+ */
 static unsigned long
 calc_load(unsigned long load, unsigned long exp, unsigned long active)
 {
@@ -2193,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
 #ifdef CONFIG_NO_HZ
 /*
- * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
+ * Handle NO_HZ for the global load-average.
+ *
+ * Since the above described distributed algorithm to compute the global
+ * load-average relies on per-cpu sampling from the tick, it is affected by
+ * NO_HZ.
+ *
+ * The basic idea is to fold the nr_active delta into a global idle-delta upon
+ * entering NO_HZ state such that we can include this as an 'extra' cpu delta
+ * when we read the global state.
+ *
+ * Obviously reality has to ruin such a delightfully simple scheme:
+ *
+ *  - When we go NO_HZ idle during the window, we can negate our sample
+ *    contribution, causing under-accounting.
+ *
+ *    We avoid this by keeping two idle-delta counters and flipping them
+ *    when the window starts, thus separating old and new NO_HZ load.
+ *
+ *    The only trick is the slight shift in index flip for read vs write.
+ *
+ *        0s            5s            10s           15s
+ *          +10           +10           +10           +10
+ *        |-|-----------|-|-----------|-|-----------|-|
+ *    r:0 0 1           1 0           0 1           1 0
+ *    w:0 1 1           0 0           1 1           0 0
+ *
+ *    This ensures we'll fold the old idle contribution in this window while
+ *    accumulating the new one.
+ *
+ *  - When we wake up from NO_HZ idle during the window, we push up our
+ *    contribution, since we effectively move our sample point to a known
+ *    busy state.
+ *
+ *    This is solved by pushing the window forward, and thus skipping the
+ *    sample, for this cpu (effectively using the idle-delta for this cpu which
+ *    was in effect at the time the window opened). This also solves the issue
+ *    of having to deal with a cpu having been in NOHZ idle for multiple
+ *    LOAD_FREQ intervals.
 *
 * When making the ILB scale, we should try to pull this in as well.
 */
-static atomic_long_t calc_load_tasks_idle;
+static atomic_long_t calc_load_idle[2];
+static int calc_load_idx;
-void calc_load_account_idle(struct rq *this_rq)
+static inline int calc_load_write_idx(void)
 {
+        int idx = calc_load_idx;
+        /*
+         * See calc_global_nohz(), if we observe the new index, we also
+         * need to observe the new update time.
+         */
+        smp_rmb();
+        /*
+         * If the folding window started, make sure we start writing in the
+         * next idle-delta.
+         */
+        if (!time_before(jiffies, calc_load_update))
+                idx++;
+        return idx & 1;
+}
+static inline int calc_load_read_idx(void)
+{
+        return calc_load_idx & 1;
+}
+void calc_load_enter_idle(void)
+{
+        struct rq *this_rq = this_rq();
        long delta;
+        /*
+         * We're going into NOHZ mode, if there's any pending delta, fold it
+         * into the pending idle delta.
+         */
        delta = calc_load_fold_active(this_rq);
-        if (delta)
+        if (delta) {
-                atomic_long_add(delta, &calc_load_tasks_idle);
+                int idx = calc_load_write_idx();
+                atomic_long_add(delta, &calc_load_idle[idx]);
+        }
 }
-static long calc_load_fold_idle(void)
+void calc_load_exit_idle(void)
 {
-        long delta = 0;
+        struct rq *this_rq = this_rq();
+        /*
+         * If we're still before the sample window, we're done.
+         */
+        if (time_before(jiffies, this_rq->calc_load_update))
+                return;
        /*
-         * Its got a race, we don't care...
+         * We woke inside or after the sample window, this means we're already
+         * accounted through the nohz accounting, so skip the entire deal and
+         * sync up for the next window.
         */
-        if (atomic_long_read(&calc_load_tasks_idle))
+        this_rq->calc_load_update = calc_load_update;
-                delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+        if (time_before(jiffies, this_rq->calc_load_update + 10))
+                this_rq->calc_load_update += LOAD_FREQ;
+}
+static long calc_load_fold_idle(void)
+{
+        int idx = calc_load_read_idx();
+        long delta = 0;
+        if (atomic_long_read(&calc_load_idle[idx]))
+                delta = atomic_long_xchg(&calc_load_idle[idx], 0);
        return delta;
 }
@@ -2302,66 +2454,39 @@ static void calc_global_nohz(void)
 {
        long delta, active, n;
-        /*
+        if (!time_before(jiffies, calc_load_update + 10)) {
-         * If we crossed a calc_load_update boundary, make sure to fold
+                /*
-         * any pending idle changes, the respective CPUs might have
+                 * Catch-up, fold however many we are behind still
-         * missed the tick driven calc_load_account_active() update
+                 */
-         * due to NO_HZ.
+                delta = jiffies - calc_load_update - 10;
-         */
+                n = 1 + (delta / LOAD_FREQ);
-        delta = calc_load_fold_idle();
-        if (delta)
-                atomic_long_add(delta, &calc_load_tasks);
-        /*
-         * It could be the one fold was all it took, we done!
-         */
-        if (time_before(jiffies, calc_load_update + 10))
-                return;
-        /*
-         * Catch-up, fold however many we are behind still
-         */
-        delta = jiffies - calc_load_update - 10;
-        n = 1 + (delta / LOAD_FREQ);
-        active = atomic_long_read(&calc_load_tasks);
+                active = atomic_long_read(&calc_load_tasks);
-        active = active > 0 ? active * FIXED_1 : 0;
+                active = active > 0 ? active * FIXED_1 : 0;
-        avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+                avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
-        avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+                avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
-        avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+                avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
-        calc_load_update += n * LOAD_FREQ;
+                calc_load_update += n * LOAD_FREQ;
-}
+        }
-#else
-void calc_load_account_idle(struct rq *this_rq)
-{
-}
-static inline long calc_load_fold_idle(void)
+        /*
-{
+         * Flip the idle index...
-        return 0;
+         *
+         * Make sure we first write the new time then flip the index, so that
+         * calc_load_write_idx() will see the new time when it reads the new
+         * index, this avoids a double flip messing things up.
+         */
+        smp_wmb();
+        calc_load_idx++;
 }
+#else /* !CONFIG_NO_HZ */
-static void calc_global_nohz(void)
+static inline long calc_load_fold_idle(void) { return 0; }
-{
+static inline void calc_global_nohz(void) { }
-}
-#endif
-/**
+#endif /* CONFIG_NO_HZ */
- * get_avenrun - get the load average array
- * @loads:      pointer to dest load array
- * @offset:     offset to add
- * @shift:      shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
-        loads[0] = (avenrun[0] + offset) << shift;
-        loads[1] = (avenrun[1] + offset) << shift;
-        loads[2] = (avenrun[2] + offset) << shift;
-}
 /*
 * calc_load - update the avenrun load estimates 10 ticks after the
@@ -2369,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
 */
 void calc_global_load(unsigned long ticks)
 {
-        long active;
+        long active, delta;
        if (time_before(jiffies, calc_load_update + 10))
                return;
+        /*
+         * Fold the 'old' idle-delta to include all NO_HZ cpus.
+         */
+        delta = calc_load_fold_idle();
+        if (delta)
+                atomic_long_add(delta, &calc_load_tasks);
        active = atomic_long_read(&calc_load_tasks);
        active = active > 0 ? active * FIXED_1 : 0;
@@ -2384,12 +2516,7 @@ void calc_global_load(unsigned long ticks)
        calc_load_update += LOAD_FREQ;
        /*
-         * Account one period with whatever state we found before
+         * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
-         * folding in the nohz state and ageing the entire idle period.
-         *
-         * This avoids loosing a sample when we go idle between 
-         * calc_load_account_active() (10 ticks ago) and now and thus
-         * under-accounting.
         */
        calc_global_nohz();
 }
@@ -2406,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq)
                return;
        delta  = calc_load_fold_active(this_rq);
-        delta += calc_load_fold_idle();
        if (delta)
                atomic_long_add(delta, &calc_load_tasks);
@@ -2414,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq)
 }
 /*
+ * End of global load-average stuff
+ */
+/*
 * The exact cpuload at various idx values, calculated at every tick would be
 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
 *
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b44d604b35d1..b6baf370cae9 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
        schedstat_inc(rq, sched_goidle);
-        calc_load_account_idle(rq);
        return rq->idle;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6d52cea7f33d..55844f24435a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -942,8 +942,6 @@ static inline u64 sched_avg_period(void)
        return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
 }
-void calc_load_account_idle(struct rq *this_rq);
 #ifdef CONFIG_SCHED_HRTICK
 /*
diff --git a/kernel/smp.c b/kernel/smp.c
index d0ae5b24875e..29dd40a9f2f4 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -581,26 +581,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait)
        return 0;
 }
 EXPORT_SYMBOL(smp_call_function);
-void ipi_call_lock(void)
-{
-        raw_spin_lock(&call_function.lock);
-}
-void ipi_call_unlock(void)
-{
-        raw_spin_unlock(&call_function.lock);
-}
-void ipi_call_lock_irq(void)
-{
-        raw_spin_lock_irq(&call_function.lock);
-}
-void ipi_call_unlock_irq(void)
-{
-        raw_spin_unlock_irq(&call_function.lock);
-}
 #endif /* USE_GENERIC_SMP_HELPERS */
 /* Setup configured maximum number of CPUs to activate */
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
index 80c0acfb8472..6ef9433e1c70 100644
--- a/kernel/smpboot.h
+++ b/kernel/smpboot.h
@@ -3,8 +3,6 @@
 struct task_struct;
-int smpboot_prepare(unsigned int cpu);
 #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
 struct task_struct *idle_thread_get(unsigned int cpu);
 void idle_thread_set_boot_cpu(void);
diff --git a/kernel/sys.c b/kernel/sys.c
index e0c8ffc50d7f..2d39a84cd857 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1788,7 +1788,6 @@ SYSCALL_DEFINE1(umask, int, mask)
 #ifdef CONFIG_CHECKPOINT_RESTORE
 static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 {
-        struct vm_area_struct *vma;
        struct file *exe_file;
        struct dentry *dentry;
        int err;
@@ -1816,13 +1815,17 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
        down_write(&mm->mmap_sem);
        /*
-         * Forbid mm->exe_file change if there are mapped other files.
+         * Forbid mm->exe_file change if old file still mapped.
         */
        err = -EBUSY;
-        for (vma = mm->mmap; vma; vma = vma->vm_next) {
+        if (mm->exe_file) {
-                if (vma->vm_file && !path_equal(&vma->vm_file->f_path,
+                struct vm_area_struct *vma;
-                                                &exe_file->f_path))
-                        goto exit_unlock;
+                for (vma = mm->mmap; vma; vma = vma->vm_next)
+                        if (vma->vm_file &&
+                            path_equal(&vma->vm_file->f_path,
+                                       &mm->exe_file->f_path))
+                                goto exit_unlock;
        }
        /*
@@ -1835,6 +1838,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
        if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
                goto exit_unlock;
+        err = 0;
        set_mm_exe_file(mm, exe_file);
 exit_unlock:
        up_write(&mm->mmap_sem);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 70b33abcc7bb..b7fbadc5c973 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -409,7 +409,9 @@ int second_overflow(unsigned long secs)
                        time_state = TIME_DEL;
                break;
        case TIME_INS:
-                if (secs % 86400 == 0) {
+                if (!(time_status & STA_INS))
+                        time_state = TIME_OK;
+                else if (secs % 86400 == 0) {
                        leap = -1;
                        time_state = TIME_OOP;
                        time_tai++;
@@ -418,7 +420,9 @@ int second_overflow(unsigned long secs)
                }
                break;
        case TIME_DEL:
-                if ((secs + 1) % 86400 == 0) {
+                if (!(time_status & STA_DEL))
+                        time_state = TIME_OK;
+                else if ((secs + 1) % 86400 == 0) {
                        leap = 1;
                        time_tai--;
                        time_state = TIME_WAIT;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 869997833928..024540f97f74 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -105,7 +105,7 @@ static ktime_t tick_init_jiffy_update(void)
 /*
 * NO HZ enabled ?
 */
-static int tick_nohz_enabled __read_mostly  = 1;
+int tick_nohz_enabled __read_mostly  = 1;
 /*
 * Enable / Disable tickless mode
@@ -271,50 +271,15 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 }
 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
-static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
+static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
+                                         ktime_t now, int cpu)
 {
        unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
+        ktime_t last_update, expires, ret = { .tv64 = 0 };
        unsigned long rcu_delta_jiffies;
-        ktime_t last_update, expires, now;
        struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
        u64 time_delta;
-        int cpu;
-        cpu = smp_processor_id();
-        ts = &per_cpu(tick_cpu_sched, cpu);
-        now = tick_nohz_start_idle(cpu, ts);
-        /*
-         * If this cpu is offline and it is the one which updates
-         * jiffies, then give up the assignment and let it be taken by
-         * the cpu which runs the tick timer next. If we don't drop
-         * this here the jiffies might be stale and do_timer() never
-         * invoked.
-         */
-        if (unlikely(!cpu_online(cpu))) {
-                if (cpu == tick_do_timer_cpu)
-                        tick_do_timer_cpu = TICK_DO_TIMER_NONE;
-        }
-        if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
-                return;
-        if (need_resched())
-                return;
-        if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
-                static int ratelimit;
-                if (ratelimit < 10) {
-                        printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
-                               (unsigned int) local_softirq_pending());
-                        ratelimit++;
-                }
-                return;
-        }
-        ts->idle_calls++;
        /* Read jiffies and the time when jiffies were updated last */
        do {
                seq = read_seqbegin(&xtime_lock);
@@ -397,6 +362,8 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
                if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
                        goto out;
+                ret = expires;
                /*
                 * nohz_stop_sched_tick can be called several times before
                 * the nohz_restart_sched_tick is called. This happens when
@@ -406,17 +373,12 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
                 */
                if (!ts->tick_stopped) {
                        select_nohz_load_balancer(1);
+                        calc_load_enter_idle();
-                        ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
+                        ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
                        ts->tick_stopped = 1;
-                        ts->idle_jiffies = last_jiffies;
                }
-                ts->idle_sleeps++;
-                /* Mark expires */
-                ts->idle_expires = expires;
                /*
                 * If the expiration time == KTIME_MAX, then
                 * in this case we simply stop the tick timer.
@@ -447,6 +409,65 @@ out:
        ts->next_jiffies = next_jiffies;
        ts->last_jiffies = last_jiffies;
        ts->sleep_length = ktime_sub(dev->next_event, now);
+        return ret;
+}
+static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
+{
+        /*
+         * If this cpu is offline and it is the one which updates
+         * jiffies, then give up the assignment and let it be taken by
+         * the cpu which runs the tick timer next. If we don't drop
+         * this here the jiffies might be stale and do_timer() never
+         * invoked.
+         */
+        if (unlikely(!cpu_online(cpu))) {
+                if (cpu == tick_do_timer_cpu)
+                        tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+        }
+        if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
+                return false;
+        if (need_resched())
+                return false;
+        if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
+                static int ratelimit;
+                if (ratelimit < 10) {
+                        printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
+                               (unsigned int) local_softirq_pending());
+                        ratelimit++;
+                }
+                return false;
+        }
+        return true;
+}
+static void __tick_nohz_idle_enter(struct tick_sched *ts)
+{
+        ktime_t now, expires;
+        int cpu = smp_processor_id();
+        now = tick_nohz_start_idle(cpu, ts);
+        if (can_stop_idle_tick(cpu, ts)) {
+                int was_stopped = ts->tick_stopped;
+                ts->idle_calls++;
+                expires = tick_nohz_stop_sched_tick(ts, now, cpu);
+                if (expires.tv64 > 0LL) {
+                        ts->idle_sleeps++;
+                        ts->idle_expires = expires;
+                }
+                if (!was_stopped && ts->tick_stopped)
+                        ts->idle_jiffies = ts->last_jiffies;
+        }
 }
 /**
@@ -484,7 +505,7 @@ void tick_nohz_idle_enter(void)
         * update of the idle time accounting in tick_nohz_start_idle().
         */
        ts->inidle = 1;
-        tick_nohz_stop_sched_tick(ts);
+        __tick_nohz_idle_enter(ts);
        local_irq_enable();
 }
@@ -504,7 +525,7 @@ void tick_nohz_irq_exit(void)
        if (!ts->inidle)
                return;
-        tick_nohz_stop_sched_tick(ts);
+        __tick_nohz_idle_enter(ts);
 }
 /**
@@ -522,7 +543,7 @@ ktime_t tick_nohz_get_sleep_length(void)
 static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 {
        hrtimer_cancel(&ts->sched_timer);
-        hrtimer_set_expires(&ts->sched_timer, ts->idle_tick);
+        hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
        while (1) {
                /* Forward the time to expire in the future */
@@ -545,6 +566,41 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
        }
 }
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
+{
+        /* Update jiffies first */
+        select_nohz_load_balancer(0);
+        tick_do_update_jiffies64(now);
+        update_cpu_load_nohz();
+        touch_softlockup_watchdog();
+        /*
+         * Cancel the scheduled timer and restore the tick
+         */
+        ts->tick_stopped  = 0;
+        ts->idle_exittime = now;
+        tick_nohz_restart(ts, now);
+}
+static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
+{
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+        unsigned long ticks;
+        /*
+         * We stopped the tick in idle. Update process times would miss the
+         * time we slept as update_process_times does only a 1 tick
+         * accounting. Enforce that this is accounted to idle !
+         */
+        ticks = jiffies - ts->idle_jiffies;
+        /*
+         * We might be one off. Do not randomly account a huge number of ticks!
+         */
+        if (ticks && ticks < LONG_MAX)
+                account_idle_ticks(ticks);
+#endif
+}
 /**
 * tick_nohz_idle_exit - restart the idle tick from the idle task
 *
@@ -556,9 +612,6 @@ void tick_nohz_idle_exit(void)
 {
        int cpu = smp_processor_id();
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
-        unsigned long ticks;
-#endif
        ktime_t now;
        local_irq_disable();
@@ -573,39 +626,11 @@ void tick_nohz_idle_exit(void)
        if (ts->idle_active)
                tick_nohz_stop_idle(cpu, now);
-        if (!ts->tick_stopped) {
+        if (ts->tick_stopped) {
-                local_irq_enable();
+                tick_nohz_restart_sched_tick(ts, now);
-                return;
+                tick_nohz_account_idle_ticks(ts);
        }
-        /* Update jiffies first */
-        select_nohz_load_balancer(0);
-        tick_do_update_jiffies64(now);
-        update_cpu_load_nohz();
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
-        /*
-         * We stopped the tick in idle. Update process times would miss the
-         * time we slept as update_process_times does only a 1 tick
-         * accounting. Enforce that this is accounted to idle !
-         */
-        ticks = jiffies - ts->idle_jiffies;
-        /*
-         * We might be one off. Do not randomly account a huge number of ticks!
-         */
-        if (ticks && ticks < LONG_MAX)
-                account_idle_ticks(ticks);
-#endif
-        touch_softlockup_watchdog();
-        /*
-         * Cancel the scheduled timer and restore the tick
-         */
-        ts->tick_stopped  = 0;
-        ts->idle_exittime = now;
-        tick_nohz_restart(ts, now);
        local_irq_enable();
 }
@@ -809,7 +834,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
                 */
                if (ts->tick_stopped) {
                        touch_softlockup_watchdog();
-                        ts->idle_jiffies++;
+                        if (idle_cpu(cpu))
+                                ts->idle_jiffies++;
                }
                update_process_times(user_mode(regs));
                profile_tick(CPU_PROFILING);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6f46a00a1e8a..f045cc50832d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -24,32 +24,32 @@
 /* Structure holding internal timekeeping values. */
 struct timekeeper {
        /* Current clocksource used for timekeeping. */
-        struct clocksource *clock;
+        struct clocksource      *clock;
        /* NTP adjusted clock multiplier */
-        u32     mult;
+        u32                     mult;
        /* The shift value of the current clocksource. */
-        int     shift;
+        u32                     shift;
        /* Number of clock cycles in one NTP interval. */
-        cycle_t cycle_interval;
+        cycle_t                 cycle_interval;
        /* Number of clock shifted nano seconds in one NTP interval. */
-        u64     xtime_interval;
+        u64                     xtime_interval;
        /* shifted nano seconds left over when rounding cycle_interval */
-        s64     xtime_remainder;
+        s64                     xtime_remainder;
        /* Raw nano seconds accumulated per NTP interval. */
-        u32     raw_interval;
+        u32                     raw_interval;
+        /* Current CLOCK_REALTIME time in seconds */
+        u64                     xtime_sec;
+        /* Clock shifted nano seconds */
+        u64                     xtime_nsec;
-        /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */
-        u64     xtime_nsec;
        /* Difference between accumulated time and NTP time in ntp
         * shifted nano seconds. */
-        s64     ntp_error;
+        s64                     ntp_error;
        /* Shift conversion between clock shifted nano seconds and
         * ntp shifted nano seconds. */
-        int     ntp_error_shift;
+        u32                     ntp_error_shift;
-        /* The current time */
-        struct timespec xtime;
        /*
         * wall_to_monotonic is what we need to add to xtime (or xtime corrected
         * for sub jiffie times) to get to monotonic time.  Monotonic is pegged
@@ -64,14 +64,17 @@ struct timekeeper {
         * - wall_to_monotonic is no longer the boot time, getboottime must be
         * used instead.
         */
-        struct timespec wall_to_monotonic;
+        struct timespec         wall_to_monotonic;
        /* time spent in suspend */
-        struct timespec total_sleep_time;
+        struct timespec         total_sleep_time;
        /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
-        struct timespec raw_time;
+        struct timespec         raw_time;
+        /* Offset clock monotonic -> clock realtime */
+        ktime_t                 offs_real;
+        /* Offset clock monotonic -> clock boottime */
+        ktime_t                 offs_boot;
        /* Seqlock for all timekeeper values */
-        seqlock_t lock;
+        seqlock_t               lock;
 };
 static struct timekeeper timekeeper;
@@ -82,11 +85,37 @@ static struct timekeeper timekeeper;
 */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
 /* flag for if timekeeping is suspended */
 int __read_mostly timekeeping_suspended;
+static inline void tk_normalize_xtime(struct timekeeper *tk)
+{
+        while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) {
+                tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift;
+                tk->xtime_sec++;
+        }
+}
+static struct timespec tk_xtime(struct timekeeper *tk)
+{
+        struct timespec ts;
+        ts.tv_sec = tk->xtime_sec;
+        ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift);
+        return ts;
+}
+static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)
+{
+        tk->xtime_sec = ts->tv_sec;
+        tk->xtime_nsec = ts->tv_nsec << tk->shift;
+}
+static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts)
+{
+        tk->xtime_sec += ts->tv_sec;
+        tk->xtime_nsec += ts->tv_nsec << tk->shift;
+}
 /**
 * timekeeper_setup_internals - Set up internals to use clocksource clock.
@@ -98,12 +127,14 @@ int __read_mostly timekeeping_suspended;
 *
 * Unless you're the timekeeping code, you should not be using this!
 */
-static void timekeeper_setup_internals(struct clocksource *clock)
+static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
 {
        cycle_t interval;
        u64 tmp, ntpinterval;
+        struct clocksource *old_clock;
-        timekeeper.clock = clock;
+        old_clock = tk->clock;
+        tk->clock = clock;
        clock->cycle_last = clock->read(clock);
        /* Do the ns -> cycle conversion first, using original mult */
@@ -116,71 +147,96 @@ static void timekeeper_setup_internals(struct clocksource *clock)
                tmp = 1;
        interval = (cycle_t) tmp;
-        timekeeper.cycle_interval = interval;
+        tk->cycle_interval = interval;
        /* Go back from cycles -> shifted ns */
-        timekeeper.xtime_interval = (u64) interval * clock->mult;
+        tk->xtime_interval = (u64) interval * clock->mult;
-        timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval;
+        tk->xtime_remainder = ntpinterval - tk->xtime_interval;
-        timekeeper.raw_interval =
+        tk->raw_interval =
                ((u64) interval * clock->mult) >> clock->shift;
-        timekeeper.xtime_nsec = 0;
+         /* if changing clocks, convert xtime_nsec shift units */
-        timekeeper.shift = clock->shift;
+        if (old_clock) {
+                int shift_change = clock->shift - old_clock->shift;
+                if (shift_change < 0)
+                        tk->xtime_nsec >>= -shift_change;
+                else
+                        tk->xtime_nsec <<= shift_change;
+        }
+        tk->shift = clock->shift;
-        timekeeper.ntp_error = 0;
+        tk->ntp_error = 0;
-        timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
+        tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
        /*
         * The timekeeper keeps its own mult values for the currently
         * active clocksource. These value will be adjusted via NTP
         * to counteract clock drifting.
         */
-        timekeeper.mult = clock->mult;
+        tk->mult = clock->mult;
 }
 /* Timekeeper helper functions. */
-static inline s64 timekeeping_get_ns(void)
+static inline s64 timekeeping_get_ns(struct timekeeper *tk)
 {
        cycle_t cycle_now, cycle_delta;
        struct clocksource *clock;
+        s64 nsec;
        /* read clocksource: */
-        clock = timekeeper.clock;
+        clock = tk->clock;
        cycle_now = clock->read(clock);
        /* calculate the delta since the last update_wall_time: */
        cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
-        /* return delta convert to nanoseconds using ntp adjusted mult. */
+        nsec = cycle_delta * tk->mult + tk->xtime_nsec;
-        return clocksource_cyc2ns(cycle_delta, timekeeper.mult,
+        nsec >>= tk->shift;
-                                  timekeeper.shift);
+        /* If arch requires, add in gettimeoffset() */
+        return nsec + arch_gettimeoffset();
 }
-static inline s64 timekeeping_get_ns_raw(void)
+static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
 {
        cycle_t cycle_now, cycle_delta;
        struct clocksource *clock;
+        s64 nsec;
        /* read clocksource: */
-        clock = timekeeper.clock;
+        clock = tk->clock;
        cycle_now = clock->read(clock);
        /* calculate the delta since the last update_wall_time: */
        cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
-        /* return delta convert to nanoseconds. */
+        /* convert delta to nanoseconds. */
-        return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
+        nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
+        /* If arch requires, add in gettimeoffset() */
+        return nsec + arch_gettimeoffset();
+}
+static void update_rt_offset(struct timekeeper *tk)
+{
+        struct timespec tmp, *wtm = &tk->wall_to_monotonic;
+        set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec);
+        tk->offs_real = timespec_to_ktime(tmp);
 }
 /* must hold write on timekeeper.lock */
-static void timekeeping_update(bool clearntp)
+static void timekeeping_update(struct timekeeper *tk, bool clearntp)
 {
+        struct timespec xt;
        if (clearntp) {
-                timekeeper.ntp_error = 0;
+                tk->ntp_error = 0;
                ntp_clear();
        }
-        update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic,
+        update_rt_offset(tk);
-                         timekeeper.clock, timekeeper.mult);
+        xt = tk_xtime(tk);
+        update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult);
 }
@@ -191,27 +247,26 @@ static void timekeeping_update(bool clearntp)
 * update_wall_time(). This is useful before significant clock changes,
 * as it avoids having to deal with this time offset explicitly.
 */
-static void timekeeping_forward_now(void)
+static void timekeeping_forward_now(struct timekeeper *tk)
 {
        cycle_t cycle_now, cycle_delta;
        struct clocksource *clock;
        s64 nsec;
-        clock = timekeeper.clock;
+        clock = tk->clock;
        cycle_now = clock->read(clock);
        cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
        clock->cycle_last = cycle_now;
-        nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
+        tk->xtime_nsec += cycle_delta * tk->mult; /* shifted nanoseconds: not yet >> tk->shift */
-                                  timekeeper.shift);
        /* If arch requires, add in gettimeoffset() */
-        nsec += arch_gettimeoffset();
+        /*
+         * Widen the offset to 64 bits before shifting: xtime_nsec is kept in
+         * shifted nanoseconds, and shifting the narrower return value of
+         * arch_gettimeoffset() in its own type could drop the high bits.
+         */
+        tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift;
-        timespec_add_ns(&timekeeper.xtime, nsec);
+        tk_normalize_xtime(tk);
        nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
-        timespec_add_ns(&timekeeper.raw_time, nsec);
+        timespec_add_ns(&tk->raw_time, nsec); /* raw time uses the clocksource's own mult/shift */
 }
 /**
@@ -223,18 +278,15 @@ static void timekeeping_forward_now(void)
 void getnstimeofday(struct timespec *ts)
 {
        unsigned long seq;
-        s64 nsecs;
+        s64 nsecs = 0;
        WARN_ON(timekeeping_suspended);
        do {
                seq = read_seqbegin(&timekeeper.lock);
-                *ts = timekeeper.xtime;
+                ts->tv_sec = timekeeper.xtime_sec;
-                nsecs = timekeeping_get_ns();
+                ts->tv_nsec = timekeeping_get_ns(&timekeeper);
-                /* If arch requires, add in gettimeoffset() */
-                nsecs += arch_gettimeoffset();
        } while (read_seqretry(&timekeeper.lock, seq));
@@ -251,13 +303,10 @@ ktime_t ktime_get(void)
        do {
                seq = read_seqbegin(&timekeeper.lock);
-                secs = timekeeper.xtime.tv_sec +
+                secs = timekeeper.xtime_sec +
                                timekeeper.wall_to_monotonic.tv_sec;
-                nsecs = timekeeper.xtime.tv_nsec +
+                nsecs = timekeeping_get_ns(&timekeeper) +
                                timekeeper.wall_to_monotonic.tv_nsec;
-                nsecs += timekeeping_get_ns();
-                /* If arch requires, add in gettimeoffset() */
-                nsecs += arch_gettimeoffset();
        } while (read_seqretry(&timekeeper.lock, seq));
        /*
@@ -280,22 +329,19 @@ void ktime_get_ts(struct timespec *ts)
 {
        struct timespec tomono;
        unsigned int seq;
-        s64 nsecs;
        WARN_ON(timekeeping_suspended);
        do {
                seq = read_seqbegin(&timekeeper.lock);
-                *ts = timekeeper.xtime;
+                ts->tv_sec = timekeeper.xtime_sec;
+                ts->tv_nsec = timekeeping_get_ns(&timekeeper);
                tomono = timekeeper.wall_to_monotonic;
-                nsecs = timekeeping_get_ns();
-                /* If arch requires, add in gettimeoffset() */
-                nsecs += arch_gettimeoffset();
        } while (read_seqretry(&timekeeper.lock, seq));
        set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
-                                ts->tv_nsec + tomono.tv_nsec + nsecs);
+                                ts->tv_nsec + tomono.tv_nsec);
 }
 EXPORT_SYMBOL_GPL(ktime_get_ts);
@@ -318,20 +364,14 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
        WARN_ON_ONCE(timekeeping_suspended);
        do {
-                u32 arch_offset;
                seq = read_seqbegin(&timekeeper.lock);
                *ts_raw = timekeeper.raw_time;
-                *ts_real = timekeeper.xtime;
+                ts_real->tv_sec = timekeeper.xtime_sec;
+                ts_real->tv_nsec = 0;
-                nsecs_raw = timekeeping_get_ns_raw();
-                nsecs_real = timekeeping_get_ns();
-                /* If arch requires, add in gettimeoffset() */
+                nsecs_raw = timekeeping_get_ns_raw(&timekeeper);
-                arch_offset = arch_gettimeoffset();
+                nsecs_real = timekeeping_get_ns(&timekeeper);
-                nsecs_raw += arch_offset;
-                nsecs_real += arch_offset;
        } while (read_seqretry(&timekeeper.lock, seq));
@@ -366,7 +406,7 @@ EXPORT_SYMBOL(do_gettimeofday);
 */
 int do_settimeofday(const struct timespec *tv)
 {
-        struct timespec ts_delta;
+        struct timespec ts_delta, xt;
        unsigned long flags;
        if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
@@ -374,15 +414,18 @@ int do_settimeofday(const struct timespec *tv)
        write_seqlock_irqsave(&timekeeper.lock, flags);
-        timekeeping_forward_now();
+        timekeeping_forward_now(&timekeeper);
+        xt = tk_xtime(&timekeeper);
+        ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;
+        ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec;
-        ts_delta.tv_sec = tv->tv_sec - timekeeper.xtime.tv_sec;
-        ts_delta.tv_nsec = tv->tv_nsec - timekeeper.xtime.tv_nsec;
        timekeeper.wall_to_monotonic =
                        timespec_sub(timekeeper.wall_to_monotonic, ts_delta);
-        timekeeper.xtime = *tv;
+        tk_set_xtime(&timekeeper, tv);
-        timekeeping_update(true);
+        timekeeping_update(&timekeeper, true);
        write_sequnlock_irqrestore(&timekeeper.lock, flags);
@@ -409,13 +452,14 @@ int timekeeping_inject_offset(struct timespec *ts)
        write_seqlock_irqsave(&timekeeper.lock, flags);
-        timekeeping_forward_now();
+        timekeeping_forward_now(&timekeeper);
-        timekeeper.xtime = timespec_add(timekeeper.xtime, *ts);
+        tk_xtime_add(&timekeeper, ts);
        timekeeper.wall_to_monotonic =
                                timespec_sub(timekeeper.wall_to_monotonic, *ts);
-        timekeeping_update(true);
+        timekeeping_update(&timekeeper, true);
        write_sequnlock_irqrestore(&timekeeper.lock, flags);
@@ -440,14 +484,14 @@ static int change_clocksource(void *data)
        write_seqlock_irqsave(&timekeeper.lock, flags);
-        timekeeping_forward_now();
+        timekeeping_forward_now(&timekeeper);
        if (!new->enable || new->enable(new) == 0) {
                old = timekeeper.clock;
-                timekeeper_setup_internals(new);
+                tk_setup_internals(&timekeeper, new);
                if (old->disable)
                        old->disable(old);
        }
-        timekeeping_update(true);
+        timekeeping_update(&timekeeper, true);
        write_sequnlock_irqrestore(&timekeeper.lock, flags);
@@ -497,7 +541,7 @@ void getrawmonotonic(struct timespec *ts)
        do {
                seq = read_seqbegin(&timekeeper.lock);
-                nsecs = timekeeping_get_ns_raw();
+                nsecs = timekeeping_get_ns_raw(&timekeeper);
                *ts = timekeeper.raw_time;
        } while (read_seqretry(&timekeeper.lock, seq));
@@ -532,6 +576,7 @@ u64 timekeeping_max_deferment(void)
 {
        unsigned long seq;
        u64 ret;
        do {
                seq = read_seqbegin(&timekeeper.lock);
@@ -592,18 +637,17 @@ void __init timekeeping_init(void)
        clock = clocksource_default_clock();
        if (clock->enable)
                clock->enable(clock);
-        timekeeper_setup_internals(clock);
+        tk_setup_internals(&timekeeper, clock);
-        timekeeper.xtime.tv_sec = now.tv_sec;
+        tk_set_xtime(&timekeeper, &now);
-        timekeeper.xtime.tv_nsec = now.tv_nsec;
        timekeeper.raw_time.tv_sec = 0;
        timekeeper.raw_time.tv_nsec = 0;
-        if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
+        if (boot.tv_sec == 0 && boot.tv_nsec == 0)
-                boot.tv_sec = timekeeper.xtime.tv_sec;
+                boot = tk_xtime(&timekeeper);
-                boot.tv_nsec = timekeeper.xtime.tv_nsec;
-        }
        set_normalized_timespec(&timekeeper.wall_to_monotonic,
                                -boot.tv_sec, -boot.tv_nsec);
+        update_rt_offset(&timekeeper);
        timekeeper.total_sleep_time.tv_sec = 0;
        timekeeper.total_sleep_time.tv_nsec = 0;
        write_sequnlock_irqrestore(&timekeeper.lock, flags);
@@ -612,6 +656,12 @@ void __init timekeeping_init(void)
 /* time in seconds when suspend began */
 static struct timespec timekeeping_suspend_time;
+static void update_sleep_time(struct timespec t)
+{
+        timekeeper.total_sleep_time = t; /* NOTE(review): writes the global timekeeper, not a tk argument */
+        timekeeper.offs_boot = timespec_to_ktime(t); /* keep the ktime_t copy in step with total_sleep_time */
+}
 /**
 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
 * @delta: pointer to a timespec delta value
@@ -619,7 +669,8 @@ static struct timespec timekeeping_suspend_time;
 * Takes a timespec offset measuring a suspend interval and properly
 * adds the sleep offset to the timekeeping variables.
 */
-static void __timekeeping_inject_sleeptime(struct timespec *delta)
+static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
+                                                        struct timespec *delta)
 {
        if (!timespec_valid(delta)) {
                printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
@@ -627,11 +678,9 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta)
                return;
        }
-        timekeeper.xtime = timespec_add(timekeeper.xtime, *delta);
+        tk_xtime_add(tk, delta);
-        timekeeper.wall_to_monotonic =
+        tk->wall_to_monotonic = timespec_sub(tk->wall_to_monotonic, *delta);
-                        timespec_sub(timekeeper.wall_to_monotonic, *delta);
+        update_sleep_time(timespec_add(tk->total_sleep_time, *delta));
-        timekeeper.total_sleep_time = timespec_add(
-                                        timekeeper.total_sleep_time, *delta);
 }
@@ -657,11 +706,11 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
        write_seqlock_irqsave(&timekeeper.lock, flags);
-        timekeeping_forward_now();
+        timekeeping_forward_now(&timekeeper);
-        __timekeeping_inject_sleeptime(delta);
+        __timekeeping_inject_sleeptime(&timekeeper, delta);
-        timekeeping_update(true);
+        timekeeping_update(&timekeeper, true);
        write_sequnlock_irqrestore(&timekeeper.lock, flags);
@@ -690,12 +739,13 @@ static void timekeeping_resume(void)
        if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
                ts = timespec_sub(ts, timekeeping_suspend_time);
-                __timekeeping_inject_sleeptime(&ts);
+                __timekeeping_inject_sleeptime(&timekeeper, &ts);
        }
        /* re-base the last cycle value */
        timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
        timekeeper.ntp_error = 0;
        timekeeping_suspended = 0;
+        timekeeping_update(&timekeeper, false);
        write_sequnlock_irqrestore(&timekeeper.lock, flags);
        touch_softlockup_watchdog();
@@ -715,7 +765,7 @@ static int timekeeping_suspend(void)
        read_persistent_clock(&timekeeping_suspend_time);
        write_seqlock_irqsave(&timekeeper.lock, flags);
-        timekeeping_forward_now();
+        timekeeping_forward_now(&timekeeper);
        timekeeping_suspended = 1;
        /*
@@ -724,7 +774,7 @@ static int timekeeping_suspend(void)
         * try to compensate so the difference in system time
         * and persistent_clock time stays close to constant.
         */
-        delta = timespec_sub(timekeeper.xtime, timekeeping_suspend_time);
+        delta = timespec_sub(tk_xtime(&timekeeper), timekeeping_suspend_time);
        delta_delta = timespec_sub(delta, old_delta);
        if (abs(delta_delta.tv_sec)  >= 2) {
                /*
@@ -763,7 +813,8 @@ device_initcall(timekeeping_init_ops);
 * If the error is already larger, we look ahead even further
 * to compensate for late or lost adjustments.
 */
-static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
+static __always_inline int timekeeping_bigadjust(struct timekeeper *tk,
+                                                 s64 error, s64 *interval,
                                                 s64 *offset)
 {
        s64 tick_error, i;
@@ -779,7 +830,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
         * here.  This is tuned so that an error of about 1 msec is adjusted
         * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
         */
-        error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
+        error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
        error2 = abs(error2);
        for (look_ahead = 0; error2 > 0; look_ahead++)
                error2 >>= 2;
@@ -788,8 +839,8 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
         * Now calculate the error in (1 << look_ahead) ticks, but first
         * remove the single look ahead already included in the error.
         */
-        tick_error = ntp_tick_length() >> (timekeeper.ntp_error_shift + 1);
+        tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1);
-        tick_error -= timekeeper.xtime_interval >> 1;
+        tick_error -= tk->xtime_interval >> 1;
        error = ((error - tick_error) >> look_ahead) + tick_error;
        /* Finally calculate the adjustment shift value.  */
@@ -814,9 +865,9 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
 * this is optimized for the most common adjustments of -1,0,1,
 * for other values we can do a bit more work.
 */
-static void timekeeping_adjust(s64 offset)
+static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
 {
-        s64 error, interval = timekeeper.cycle_interval;
+        s64 error, interval = tk->cycle_interval;
        int adj;
        /*
@@ -832,7 +883,7 @@ static void timekeeping_adjust(s64 offset)
         *
         * Note: It does not "save" on aggravation when reading the code.
         */
-        error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
+        error = tk->ntp_error >> (tk->ntp_error_shift - 1);
        if (error > interval) {
                /*
                 * We now divide error by 4(via shift), which checks if
@@ -854,7 +905,8 @@ static void timekeeping_adjust(s64 offset)
                if (likely(error <= interval))
                        adj = 1;
                else
-                        adj = timekeeping_bigadjust(error, &interval, &offset);
+                        adj = timekeeping_bigadjust(tk, error, &interval,
+                                                        &offset);
        } else if (error < -interval) {
                /* See comment above, this is just switched for the negative */
                error >>= 2;
@@ -863,18 +915,17 @@ static void timekeeping_adjust(s64 offset)
                        interval = -interval;
                        offset = -offset;
                } else
-                        adj = timekeeping_bigadjust(error, &interval, &offset);
+                        adj = timekeeping_bigadjust(tk, error, &interval,
-        } else /* No adjustment needed */
+                                                        &offset);
+        } else
                return;
-        if (unlikely(timekeeper.clock->maxadj &&
+        if (unlikely(tk->clock->maxadj &&
-                        (timekeeper.mult + adj >
+                (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
-                        timekeeper.clock->mult + timekeeper.clock->maxadj))) {
                printk_once(KERN_WARNING
                        "Adjusting %s more than 11%% (%ld vs %ld)\n",
-                        timekeeper.clock->name, (long)timekeeper.mult + adj,
+                        tk->clock->name, (long)tk->mult + adj,
-                        (long)timekeeper.clock->mult +
+                        (long)tk->clock->mult + tk->clock->maxadj);
-                                timekeeper.clock->maxadj);
        }
        /*
         * So the following can be confusing.
@@ -925,11 +976,60 @@ static void timekeeping_adjust(s64 offset)
         *
         * XXX - TODO: Doc ntp_error calculation.
         */
-        timekeeper.mult += adj;
+        tk->mult += adj;
-        timekeeper.xtime_interval += interval;
+        tk->xtime_interval += interval;
-        timekeeper.xtime_nsec -= offset;
+        tk->xtime_nsec -= offset;
-        timekeeper.ntp_error -= (interval - offset) <<
+        tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
-                                timekeeper.ntp_error_shift;
+        /*
+         * It may be possible that when we entered this function, xtime_nsec
+         * was very small.  Further, if we're slightly speeding the clocksource
+         * in the code above, it's possible the required corrective factor to
+         * xtime_nsec could cause it to underflow.
+         *
+         * Now, since we already accumulated the second, we cannot simply roll
+         * the accumulated second back, since the NTP subsystem has been
+         * notified via second_overflow. So instead we push xtime_nsec forward
+         * by the amount we underflowed, and add that amount into the error.
+         *
+         * We'll correct this error next time through this function, when
+         * xtime_nsec is not as small.
+         */
+        if (unlikely((s64)tk->xtime_nsec < 0)) {
+                s64 neg = -(s64)tk->xtime_nsec;
+                tk->xtime_nsec = 0;
+                tk->ntp_error += neg << tk->ntp_error_shift;
+        }
+}
+/**
+ * accumulate_nsecs_to_secs - Accumulates nsecs into secs
+ * @tk: timekeeper whose xtime_nsec/xtime_sec fields are updated
+ *
+ * Helper function that moves every whole second held in the xtime_nsec
+ * field (which is kept in nanoseconds << tk->shift) into the xtime_sec
+ * field. For each second moved it calls second_overflow() in the NTP
+ * code; a non-zero result is applied to xtime_sec, subtracted from
+ * wall_to_monotonic.tv_sec, and reported via clock_was_set_delayed().
+ */
+static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
+{
+        u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; /* one second in shifted-nanosecond units */
+        while (tk->xtime_nsec >= nsecps) {
+                int leap;
+                tk->xtime_nsec -= nsecps;
+                tk->xtime_sec++;
+                /* Figure out if it's a leap sec and apply if needed */
+                leap = second_overflow(tk->xtime_sec);
+                tk->xtime_sec += leap;
+                tk->wall_to_monotonic.tv_sec -= leap; /* opposite sign, so xtime_sec + wall_to_monotonic is unchanged */
+                if (leap)
+                        clock_was_set_delayed();
+        }
 }
@@ -942,44 +1042,36 @@ static void timekeeping_adjust(s64 offset)
 *
 * Returns the unconsumed cycles.
 */
-static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
+static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
+                                                u32 shift)
 {
-        u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
        u64 raw_nsecs;
-        /* If the offset is smaller than a shifted interval, do nothing */
+        /* If the offset is smaller than a shifted interval, do nothing */
-        if (offset < timekeeper.cycle_interval<<shift)
+        if (offset < tk->cycle_interval<<shift)
                return offset;
        /* Accumulate one shifted interval */
-        offset -= timekeeper.cycle_interval << shift;
+        offset -= tk->cycle_interval << shift;
-        timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
+        tk->clock->cycle_last += tk->cycle_interval << shift;
-        timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
+        tk->xtime_nsec += tk->xtime_interval << shift;
-        while (timekeeper.xtime_nsec >= nsecps) {
+        accumulate_nsecs_to_secs(tk); /* replaces the open-coded second/leap-second loop */
-                int leap;
-                timekeeper.xtime_nsec -= nsecps;
-                timekeeper.xtime.tv_sec++;
-                leap = second_overflow(timekeeper.xtime.tv_sec);
-                timekeeper.xtime.tv_sec += leap;
-                timekeeper.wall_to_monotonic.tv_sec -= leap;
-        }
        /* Accumulate raw time */
-        raw_nsecs = timekeeper.raw_interval << shift;
+        raw_nsecs = tk->raw_interval << shift;
-        raw_nsecs += timekeeper.raw_time.tv_nsec;
+        raw_nsecs += tk->raw_time.tv_nsec;
        if (raw_nsecs >= NSEC_PER_SEC) {
                u64 raw_secs = raw_nsecs;
                raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
-                timekeeper.raw_time.tv_sec += raw_secs;
+                tk->raw_time.tv_sec += raw_secs;
        }
-        timekeeper.raw_time.tv_nsec = raw_nsecs;
+        tk->raw_time.tv_nsec = raw_nsecs;
        /* Accumulate error between NTP and clock interval */
-        timekeeper.ntp_error += ntp_tick_length() << shift;
+        tk->ntp_error += ntp_tick_length() << shift;
-        timekeeper.ntp_error -=
+        tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
-            (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
+                                                (tk->ntp_error_shift + shift);
-                                (timekeeper.ntp_error_shift + shift);
        return offset;
 }
@@ -995,6 +1087,7 @@ static void update_wall_time(void)
        cycle_t offset;
        int shift = 0, maxshift;
        unsigned long flags;
+        s64 remainder;
        write_seqlock_irqsave(&timekeeper.lock, flags);
@@ -1009,8 +1102,6 @@ static void update_wall_time(void)
 #else
        offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
 #endif
-        timekeeper.xtime_nsec = (s64)timekeeper.xtime.tv_nsec <<
-                                                timekeeper.shift;
        /*
         * With NO_HZ we may have to accumulate many cycle_intervals
@@ -1026,62 +1117,36 @@ static void update_wall_time(void)
        maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
        shift = min(shift, maxshift);
        while (offset >= timekeeper.cycle_interval) {
-                offset = logarithmic_accumulation(offset, shift);
+                offset = logarithmic_accumulation(&timekeeper, offset, shift);
                if(offset < timekeeper.cycle_interval<<shift)
                        shift--;
        }
        /* correct the clock when NTP error is too big */
-        timekeeping_adjust(offset);
+        timekeeping_adjust(&timekeeper, offset);
-        /*
-         * Since in the loop above, we accumulate any amount of time
-         * in xtime_nsec over a second into xtime.tv_sec, its possible for
-         * xtime_nsec to be fairly small after the loop. Further, if we're
-         * slightly speeding the clocksource up in timekeeping_adjust(),
-         * its possible the required corrective factor to xtime_nsec could
-         * cause it to underflow.
-         *
-         * Now, we cannot simply roll the accumulated second back, since
-         * the NTP subsystem has been notified via second_overflow. So
-         * instead we push xtime_nsec forward by the amount we underflowed,
-         * and add that amount into the error.
-         *
-         * We'll correct this error next time through this function, when
-         * xtime_nsec is not as small.
-         */
-        if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
-                s64 neg = -(s64)timekeeper.xtime_nsec;
-                timekeeper.xtime_nsec = 0;
-                timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
-        }
        /*
-         * Store full nanoseconds into xtime after rounding it up and
+        * Store only full nanoseconds into xtime_nsec after rounding
-         * add the remainder to the error difference.
+        * it up and add the remainder to the error difference.
-         */
+        * XXX - This is necessary to avoid small 1ns inconsistencies caused
-        timekeeper.xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >>
+        * by truncating the remainder in vsyscalls. However, it causes
-                                                timekeeper.shift) + 1;
+        * additional work to be done in timekeeping_adjust(). Once
-        timekeeper.xtime_nsec -= (s64)timekeeper.xtime.tv_nsec <<
+        * the vsyscall implementations are converted to use xtime_nsec
-                                                timekeeper.shift;
+        * (shifted nanoseconds), this can be killed.
-        timekeeper.ntp_error += timekeeper.xtime_nsec <<
+        */
-                                timekeeper.ntp_error_shift;
+        remainder = timekeeper.xtime_nsec & ((1 << timekeeper.shift) - 1);
+        timekeeper.xtime_nsec -= remainder;
+        timekeeper.xtime_nsec += 1 << timekeeper.shift;
+        timekeeper.ntp_error += remainder << timekeeper.ntp_error_shift;
        /*
         * Finally, make sure that after the rounding
-         * xtime.tv_nsec isn't larger than NSEC_PER_SEC
+         * xtime_nsec isn't larger than NSEC_PER_SEC
         */
-        if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) {
+        accumulate_nsecs_to_secs(&timekeeper);
-                int leap;
-                timekeeper.xtime.tv_nsec -= NSEC_PER_SEC;
-                timekeeper.xtime.tv_sec++;
-                leap = second_overflow(timekeeper.xtime.tv_sec);
-                timekeeper.xtime.tv_sec += leap;
-                timekeeper.wall_to_monotonic.tv_sec -= leap;
-        }
-        timekeeping_update(false);
+        timekeeping_update(&timekeeper, false);
 out:
        write_sequnlock_irqrestore(&timekeeper.lock, flags);
@@ -1126,21 +1191,20 @@ void get_monotonic_boottime(struct timespec *ts)
 {
        struct timespec tomono, sleep;
        unsigned int seq;
-        s64 nsecs;
        WARN_ON(timekeeping_suspended);
        do {
                seq = read_seqbegin(&timekeeper.lock);
-                *ts = timekeeper.xtime;
+                ts->tv_sec = timekeeper.xtime_sec;
+                ts->tv_nsec = timekeeping_get_ns(&timekeeper);
                tomono = timekeeper.wall_to_monotonic;
                sleep = timekeeper.total_sleep_time;
-                nsecs = timekeeping_get_ns();
        } while (read_seqretry(&timekeeper.lock, seq));
        set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
-                        ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
+                        ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec);
 }
 EXPORT_SYMBOL_GPL(get_monotonic_boottime);
@@ -1173,13 +1237,13 @@ EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
 unsigned long get_seconds(void)
 {
-        return timekeeper.xtime.tv_sec;
+        return timekeeper.xtime_sec; /* plain read; timekeeper.lock is not taken here */
 }
 EXPORT_SYMBOL(get_seconds);
 struct timespec __current_kernel_time(void)
 {
-        return timekeeper.xtime;
+        return tk_xtime(&timekeeper); /* no seqlock retry loop here, unlike current_kernel_time() */
 }
 struct timespec current_kernel_time(void)
@@ -1190,7 +1254,7 @@ struct timespec current_kernel_time(void)
        do {
                seq = read_seqbegin(&timekeeper.lock);
-                now = timekeeper.xtime;
+                now = tk_xtime(&timekeeper);
        } while (read_seqretry(&timekeeper.lock, seq));
        return now;
@@ -1205,7 +1269,7 @@ struct timespec get_monotonic_coarse(void)
        do {
                seq = read_seqbegin(&timekeeper.lock);
-                now = timekeeper.xtime;
+                now = tk_xtime(&timekeeper);
                mono = timekeeper.wall_to_monotonic;
        } while (read_seqretry(&timekeeper.lock, seq));
@@ -1240,12 +1304,43 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
        do {
                seq = read_seqbegin(&timekeeper.lock);
-                *xtim = timekeeper.xtime;
+                *xtim = tk_xtime(&timekeeper);
                *wtom = timekeeper.wall_to_monotonic;
                *sleep = timekeeper.total_sleep_time;
        } while (read_seqretry(&timekeeper.lock, seq));
 }
+#ifdef CONFIG_HIGH_RES_TIMERS
+/**
+ * ktime_get_update_offsets - hrtimer helper
+ * @offs_real:  pointer to storage for monotonic -> realtime offset
+ * @offs_boot:  pointer to storage for monotonic -> boottime offset
+ *
+ * Returns current monotonic time and updates the offsets.
+ * The wall time and both offsets are read inside the same
+ * timekeeper.lock read section, which is retried if a writer intervened.
+ *
+ * Called from hrtimer_interrupt() or retrigger_next_event()
+ */
+ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
+{
+        ktime_t now;
+        unsigned int seq;
+        u64 secs, nsecs;
+        do {
+                seq = read_seqbegin(&timekeeper.lock);
+                secs = timekeeper.xtime_sec;
+                nsecs = timekeeping_get_ns(&timekeeper);
+                *offs_real = timekeeper.offs_real;
+                *offs_boot = timekeeper.offs_boot;
+        } while (read_seqretry(&timekeeper.lock, seq));
+        now = ktime_add_ns(ktime_set(secs, 0), nsecs); /* wall (realtime) clock as ktime_t */
+        now = ktime_sub(now, *offs_real); /* realtime minus the monotonic -> realtime offset */
+        return now;
+}
+#endif
 /**
 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
 */
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 3258455549f4..af5a7e9f164b 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -167,7 +167,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
        {
                struct tick_sched *ts = tick_get_tick_sched(cpu);
                P(nohz_mode);
-                P_ns(idle_tick);
+                P_ns(last_tick);
                P(tick_stopped);
                P(idle_jiffies);
                P(idle_calls);
@@ -259,7 +259,7 @@ static int timer_list_show(struct seq_file *m, void *v)
        u64 now = ktime_to_ns(ktime_get());
        int cpu;
-        SEQ_printf(m, "Timer List Version: v0.6\n");
+        SEQ_printf(m, "Timer List Version: v0.7\n");
        SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
        SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
diff --git a/kernel/timer.c b/kernel/timer.c
index 6ec7e7e0db43..a61c09374eba 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -77,6 +77,7 @@ struct tvec_base {
        struct timer_list *running_timer;
        unsigned long timer_jiffies;
        unsigned long next_timer;
+        unsigned long active_timers;
        struct tvec_root tv1;
        struct tvec tv2;
        struct tvec tv3;
@@ -330,7 +331,8 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
 }
 EXPORT_SYMBOL_GPL(set_timer_slack);
-static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
+static void
+__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 {
        unsigned long expires = timer->expires;
        unsigned long idx = expires - base->timer_jiffies;
@@ -372,6 +374,19 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
        list_add_tail(&timer->entry, vec);
 }
+static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
+{
+        __internal_add_timer(base, timer);
+        /*
+         * Update base->active_timers and base->next_timer.
+         * Only non-deferrable timers are counted, and only they can
+         * move base->next_timer to an earlier expiry.
+         */
+        if (!tbase_get_deferrable(timer->base)) {
+                if (time_before(timer->expires, base->next_timer))
+                        base->next_timer = timer->expires;
+                base->active_timers++;
+        }
+}
 #ifdef CONFIG_TIMER_STATS
 void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
 {
@@ -654,8 +669,7 @@ void init_timer_deferrable_key(struct timer_list *timer,
 }
 EXPORT_SYMBOL(init_timer_deferrable_key);
-static inline void detach_timer(struct timer_list *timer,
+static inline void detach_timer(struct timer_list *timer, bool clear_pending)
-                                int clear_pending)
 {
        struct list_head *entry = &timer->entry;
@@ -667,6 +681,29 @@ static inline void detach_timer(struct timer_list *timer,
        entry->prev = LIST_POISON2;
 }
+static inline void
+detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
+{
+        detach_timer(timer, true);
+        if (!tbase_get_deferrable(timer->base))
+                timer->base->active_timers--;
+}
+static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
+                             bool clear_pending)
+{
+        if (!timer_pending(timer))
+                return 0;
+        detach_timer(timer, clear_pending);
+        if (!tbase_get_deferrable(timer->base)) {
+                timer->base->active_timers--;
+                if (timer->expires == base->next_timer)
+                        base->next_timer = base->timer_jiffies;
+        }
+        return 1;
+}
 /*
 * We are using hashed locking: holding per_cpu(tvec_bases).lock
 * means that all timers which are tied to this base via timer->base are
@@ -712,16 +749,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
        base = lock_timer_base(timer, &flags);
-        if (timer_pending(timer)) {
+        ret = detach_if_pending(timer, base, false);
-                detach_timer(timer, 0);
+        if (!ret && pending_only)
-                if (timer->expires == base->next_timer &&
+                goto out_unlock;
-                    !tbase_get_deferrable(timer->base))
-                        base->next_timer = base->timer_jiffies;
-                ret = 1;
-        } else {
-                if (pending_only)
-                        goto out_unlock;
-        }
        debug_activate(timer, expires);
@@ -752,9 +782,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
        }
        timer->expires = expires;
-        if (time_before(timer->expires, base->next_timer) &&
-            !tbase_get_deferrable(timer->base))
-                base->next_timer = timer->expires;
        internal_add_timer(base, timer);
 out_unlock:
@@ -920,9 +947,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
        spin_lock_irqsave(&base->lock, flags);
        timer_set_base(timer, base);
        debug_activate(timer, timer->expires);
-        if (time_before(timer->expires, base->next_timer) &&
-            !tbase_get_deferrable(timer->base))
-                base->next_timer = timer->expires;
        internal_add_timer(base, timer);
        /*
         * Check whether the other CPU is idle and needs to be
@@ -959,13 +983,7 @@ int del_timer(struct timer_list *timer)
        timer_stats_timer_clear_start_info(timer);
        if (timer_pending(timer)) {
                base = lock_timer_base(timer, &flags);
-                if (timer_pending(timer)) {
+                ret = detach_if_pending(timer, base, true);
-                        detach_timer(timer, 1);
-                        if (timer->expires == base->next_timer &&
-                            !tbase_get_deferrable(timer->base))
-                                base->next_timer = base->timer_jiffies;
-                        ret = 1;
-                }
                spin_unlock_irqrestore(&base->lock, flags);
        }
@@ -990,19 +1008,10 @@ int try_to_del_timer_sync(struct timer_list *timer)
        base = lock_timer_base(timer, &flags);
-        if (base->running_timer == timer)
+        if (base->running_timer != timer) {
-                goto out;
+                timer_stats_timer_clear_start_info(timer);
+                ret = detach_if_pending(timer, base, true);
-        timer_stats_timer_clear_start_info(timer);
-        ret = 0;
-        if (timer_pending(timer)) {
-                detach_timer(timer, 1);
-                if (timer->expires == base->next_timer &&
-                    !tbase_get_deferrable(timer->base))
-                        base->next_timer = base->timer_jiffies;
-                ret = 1;
        }
-out:
        spin_unlock_irqrestore(&base->lock, flags);
        return ret;
@@ -1089,7 +1098,8 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
         */
        list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
                BUG_ON(tbase_get_base(timer->base) != base);
-                internal_add_timer(base, timer);
+                /* No accounting, while moving them */
+                __internal_add_timer(base, timer);
        }
        return index;
@@ -1178,7 +1188,7 @@ static inline void __run_timers(struct tvec_base *base)
                        timer_stats_account_timer(timer);
                        base->running_timer = timer;
-                        detach_timer(timer, 1);
+                        detach_expired_timer(timer, base);
                        spin_unlock_irq(&base->lock);
                        call_timer_fn(timer, fn, data);
@@ -1316,18 +1326,21 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
 unsigned long get_next_timer_interrupt(unsigned long now)
 {
        struct tvec_base *base = __this_cpu_read(tvec_bases);
-        unsigned long expires;
+        unsigned long expires = now + NEXT_TIMER_MAX_DELTA;
        /*
         * Pretend that there is no timer pending if the cpu is offline.
         * Possible pending timers will be migrated later to an active cpu.
         */
        if (cpu_is_offline(smp_processor_id()))
-                return now + NEXT_TIMER_MAX_DELTA;
+                return expires;
        spin_lock(&base->lock);
-        if (time_before_eq(base->next_timer, base->timer_jiffies))
+        if (base->active_timers) {
-                base->next_timer = __next_timer_interrupt(base);
+                if (time_before_eq(base->next_timer, base->timer_jiffies))
-        expires = base->next_timer;
+                        base->next_timer = __next_timer_interrupt(base);
+                expires = base->next_timer;
+        }
        spin_unlock(&base->lock);
        if (time_before_eq(expires, now))
@@ -1704,6 +1717,7 @@ static int __cpuinit init_timers_cpu(int cpu)
        base->timer_jiffies = jiffies;
        base->next_timer = base->timer_jiffies;
+        base->active_timers = 0;
        return 0;
 }
@@ -1714,11 +1728,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
        while (!list_empty(head)) {
                timer = list_first_entry(head, struct timer_list, entry);
-                detach_timer(timer, 0);
+                /* We ignore the accounting on the dying cpu */
+                detach_timer(timer, false);
                timer_set_base(timer, new_base);
-                if (time_before(timer->expires, new_base->next_timer) &&
-                    !tbase_get_deferrable(timer->base))
-                        new_base->next_timer = timer->expires;
                internal_add_timer(new_base, timer);
        }
 }
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a008663d86c8..b4f20fba09fc 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -312,7 +312,7 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
 static int __register_ftrace_function(struct ftrace_ops *ops)
 {
-        if (ftrace_disabled)
+        if (unlikely(ftrace_disabled))
                return -ENODEV;
        if (FTRACE_WARN_ON(ops == &global_ops))
@@ -4299,16 +4299,12 @@ int register_ftrace_function(struct ftrace_ops *ops)
        mutex_lock(&ftrace_lock);
-        if (unlikely(ftrace_disabled))
-                goto out_unlock;
        ret = __register_ftrace_function(ops);
        if (!ret)
                ret = ftrace_startup(ops, 0);
- out_unlock:
        mutex_unlock(&ftrace_lock);
        return ret;
 }
 EXPORT_SYMBOL_GPL(register_ftrace_function);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1d0f6a8a0e5e..49491fa7daa2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1075,6 +1075,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
        rb_init_page(bpage->page);
        INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
+        INIT_LIST_HEAD(&cpu_buffer->new_pages);
        ret = rb_allocate_pages(cpu_buffer, nr_pages);
        if (ret < 0)
@@ -1346,10 +1347,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
                         * If something was added to this page, it was full
                         * since it is not the tail page. So we deduct the
                         * bytes consumed in ring buffer from here.
-                         * No need to update overruns, since this page is
+                         * Increment overrun to account for the lost events.
-                         * deleted from ring buffer and its entries are
-                         * already accounted for.
                         */
+                        local_add(page_entries, &cpu_buffer->overrun);
                        local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
                }
@@ -3239,6 +3239,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
        if (cpu_buffer->commit_page == cpu_buffer->reader_page)
                goto out;
+        /* Don't bother swapping if the ring buffer is empty */
+        if (rb_num_of_entries(cpu_buffer) == 0)
+                goto out;
        /*
         * Reset the reader page to size zero.
         */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a7fa0702be1c..a120f98c4112 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -830,6 +830,8 @@ int register_tracer(struct tracer *type)
                current_trace = saved_tracer;
                if (ret) {
                        printk(KERN_CONT "FAILED!\n");
+                        /* Add the warning after printing 'FAILED' */
+                        WARN_ON(1);
                        goto out;
                }
                /* Only reset on passing, to avoid touching corrupted buffers */
@@ -1708,9 +1710,11 @@ EXPORT_SYMBOL_GPL(trace_vprintk);
 static void trace_iterator_increment(struct trace_iterator *iter)
 {
+        struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu);
        iter->idx++;
-        if (iter->buffer_iter[iter->cpu])
+        if (buf_iter)
-                ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
+                ring_buffer_read(buf_iter, NULL);
 }
 static struct trace_entry *
@@ -1718,7 +1722,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
                unsigned long *lost_events)
 {
        struct ring_buffer_event *event;
-        struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
+        struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu);
        if (buf_iter)
                event = ring_buffer_iter_peek(buf_iter, ts);
@@ -1856,10 +1860,10 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
        tr->data[cpu]->skipped_entries = 0;
-        if (!iter->buffer_iter[cpu])
+        buf_iter = trace_buffer_iter(iter, cpu);
+        if (!buf_iter)
                return;
-        buf_iter = iter->buffer_iter[cpu];
        ring_buffer_iter_reset(buf_iter);
        /*
@@ -2205,13 +2209,15 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 int trace_empty(struct trace_iterator *iter)
 {
+        struct ring_buffer_iter *buf_iter;
        int cpu;
        /* If we are looking at one CPU buffer, only check that one */
        if (iter->cpu_file != TRACE_PIPE_ALL_CPU) {
                cpu = iter->cpu_file;
-                if (iter->buffer_iter[cpu]) {
+                buf_iter = trace_buffer_iter(iter, cpu);
-                        if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
+                if (buf_iter) {
+                        if (!ring_buffer_iter_empty(buf_iter))
                                return 0;
                } else {
                        if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
@@ -2221,8 +2227,9 @@ int trace_empty(struct trace_iterator *iter)
        }
        for_each_tracing_cpu(cpu) {
-                if (iter->buffer_iter[cpu]) {
+                buf_iter = trace_buffer_iter(iter, cpu);
-                        if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
+                if (buf_iter) {
+                        if (!ring_buffer_iter_empty(buf_iter))
                                return 0;
                } else {
                        if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
@@ -2381,6 +2388,11 @@ __tracing_open(struct inode *inode, struct file *file)
        if (!iter)
                return ERR_PTR(-ENOMEM);
+        iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(),
+                                    GFP_KERNEL);
+        if (!iter->buffer_iter)
+                goto release;
        /*
         * We make a copy of the current tracer to avoid concurrent
         * changes on it while we are reading.
@@ -2441,6 +2453,8 @@ __tracing_open(struct inode *inode, struct file *file)
 fail:
        mutex_unlock(&trace_types_lock);
        kfree(iter->trace);
+        kfree(iter->buffer_iter);
+release:
        seq_release_private(inode, file);
        return ERR_PTR(-ENOMEM);
 }
@@ -2481,6 +2495,7 @@ static int tracing_release(struct inode *inode, struct file *file)
        mutex_destroy(&iter->mutex);
        free_cpumask_var(iter->started);
        kfree(iter->trace);
+        kfree(iter->buffer_iter);
        seq_release_private(inode, file);
        return 0;
 }
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5aec220d2de0..55e1f7f0db12 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -317,6 +317,14 @@ struct tracer {
 #define TRACE_PIPE_ALL_CPU      -1
+static inline struct ring_buffer_iter *
+trace_buffer_iter(struct trace_iterator *iter, int cpu)
+{
+        if (iter->buffer_iter && iter->buffer_iter[cpu])
+                return iter->buffer_iter[cpu];
+        return NULL;
+}
 int tracer_init(struct tracer *t, struct trace_array *tr);
 int tracing_is_enabled(void);
 void trace_wake_up(void);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index a7d2a4c653d8..ce27c8ba8d31 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -538,7 +538,7 @@ get_return_for_leaf(struct trace_iterator *iter,
                next = &data->ret;
        } else {
-                ring_iter = iter->buffer_iter[iter->cpu];
+                ring_iter = trace_buffer_iter(iter, iter->cpu);
                /* First peek to compare current entry and the next one */
                if (ring_iter)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index df611a0e76c5..123b189c732c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1325,4 +1325,4 @@ __init static int init_events(void)
        return 0;
 }
-device_initcall(init_events);
+early_initcall(init_events);
author	Ingo Molnar <mingo@kernel.org>	2012-07-25 15:40:40 -0400
committer	Ingo Molnar <mingo@kernel.org>	2012-07-25 15:40:40 -0400
commit	d431adfbc9b7de651f3164c6b7ffcad75805d7e4 (patch)
tree	29bce222c81a3a392e51c11e2188659aa6d1bded /kernel
parent	d6250a3f12edb3a86db9598ffeca3de8b4a219e9 (diff)
parent	e2b34e311be3a57c9abcb927e37a57e38913714c (diff)