72 files changed, 1046 insertions, 534 deletions
diff --git a/CREDITS b/CREDITS
index 9416a9a8b95e..0640e1650483 100644
--- a/CREDITS
+++ b/CREDITS
@@ -2808,8 +2808,7 @@ S: Ottawa, Ontario
 S: Canada K2P 0X8
 N: Mikael Pettersson
-E: mikpe@it.uu.se
+E: mikpelinux@gmail.com
-W: http://user.it.uu.se/~mikpe/linux/
 D: Miscellaneous fixes
 N: Reed H. Petty
diff --git a/MAINTAINERS b/MAINTAINERS
index e61c2e83fc2b..c53fe9559642 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1812,7 +1812,8 @@ S:	Supported
 F:      drivers/net/ethernet/broadcom/bnx2x/
 BROADCOM BCM281XX/BCM11XXX ARM ARCHITECTURE
-M:      Christian Daudt <csd@broadcom.com>
+M:      Christian Daudt <bcm@fixthebug.org>
+L:      bcm-kernel-feedback-list@broadcom.com
 T:      git git://git.github.com/broadcom/bcm11351
 S:      Maintained
 F:      arch/arm/mach-bcm/
@@ -6595,7 +6596,7 @@ S:	Obsolete
 F:      drivers/net/wireless/prism54/
 PROMISE SATA TX2/TX4 CONTROLLER LIBATA DRIVER
-M:      Mikael Pettersson <mikpe@it.uu.se>
+M:      Mikael Pettersson <mikpelinux@gmail.com>
 L:      linux-ide@vger.kernel.org
 S:      Maintained
 F:      drivers/ata/sata_promise.*
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 6a15c968d214..15ca2255f438 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -74,7 +74,7 @@ src-wlib-$(CONFIG_8xx) += mpc8xx.c planetcore.c
 src-wlib-$(CONFIG_PPC_82xx) += pq2.c fsl-soc.c planetcore.c
 src-wlib-$(CONFIG_EMBEDDED6xx) += mv64x60.c mv64x60_i2c.c ugecon.c
-src-plat-y := of.c
+src-plat-y := of.c epapr.c
 src-plat-$(CONFIG_40x) += fixed-head.S ep405.c cuboot-hotfoot.c \
                                treeboot-walnut.c cuboot-acadia.c \
                                cuboot-kilauea.c simpleboot.c \
@@ -97,7 +97,7 @@ src-plat-$(CONFIG_EMBEDDED6xx) += cuboot-pq2.c cuboot-mpc7448hpc2.c \
                                        prpmc2800.c
 src-plat-$(CONFIG_AMIGAONE) += cuboot-amigaone.c
 src-plat-$(CONFIG_PPC_PS3) += ps3-head.S ps3-hvcall.S ps3.c
-src-plat-$(CONFIG_EPAPR_BOOT) += epapr.c
+src-plat-$(CONFIG_EPAPR_BOOT) += epapr.c epapr-wrapper.c
 src-wlib := $(sort $(src-wlib-y))
 src-plat := $(sort $(src-plat-y))
diff --git a/arch/powerpc/boot/epapr-wrapper.c b/arch/powerpc/boot/epapr-wrapper.c
new file mode 100644
index 000000000000..c10191006673
--- /dev/null
+++ b/arch/powerpc/boot/epapr-wrapper.c
@@ -0,0 +1,9 @@
+extern void epapr_platform_init(unsigned long r3, unsigned long r4,
+                                unsigned long r5, unsigned long r6,
+                                unsigned long r7);
+void platform_init(unsigned long r3, unsigned long r4, unsigned long r5,
+                   unsigned long r6, unsigned long r7)
+{
+        epapr_platform_init(r3, r4, r5, r6, r7);
+}
diff --git a/arch/powerpc/boot/epapr.c b/arch/powerpc/boot/epapr.c
index 06c1961bd124..02e91aa2194a 100644
--- a/arch/powerpc/boot/epapr.c
+++ b/arch/powerpc/boot/epapr.c
@@ -48,8 +48,8 @@ static void platform_fixups(void)
                       fdt_addr, fdt_totalsize((void *)fdt_addr), ima_size);
 }
-void platform_init(unsigned long r3, unsigned long r4, unsigned long r5,
+void epapr_platform_init(unsigned long r3, unsigned long r4, unsigned long r5,
-                   unsigned long r6, unsigned long r7)
+                         unsigned long r6, unsigned long r7)
 {
        epapr_magic = r6;
        ima_size = r7;
diff --git a/arch/powerpc/boot/of.c b/arch/powerpc/boot/of.c
index 61d9899aa0d0..62e2f43ec1df 100644
--- a/arch/powerpc/boot/of.c
+++ b/arch/powerpc/boot/of.c
@@ -26,6 +26,9 @@
 static unsigned long claim_base;
+void epapr_platform_init(unsigned long r3, unsigned long r4, unsigned long r5,
+                         unsigned long r6, unsigned long r7);
 static void *of_try_claim(unsigned long size)
 {
        unsigned long addr = 0;
@@ -61,7 +64,7 @@ static void of_image_hdr(const void *hdr)
        }
 }
-void platform_init(unsigned long a1, unsigned long a2, void *promptr)
+static void of_platform_init(unsigned long a1, unsigned long a2, void *promptr)
 {
        platform_ops.image_hdr = of_image_hdr;
        platform_ops.malloc = of_try_claim;
@@ -81,3 +84,14 @@ void platform_init(unsigned long a1, unsigned long a2, void *promptr)
                loader_info.initrd_size = a2;
        }
 }
+void platform_init(unsigned long r3, unsigned long r4, unsigned long r5,
+                   unsigned long r6, unsigned long r7)
+{
+        /* Detect OF vs. ePAPR boot */
+        if (r5)
+                of_platform_init(r3, r4, (void *)r5);
+        else
+                epapr_platform_init(r3, r4, r5, r6, r7);
+}
diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper
index 6761c746048d..cd7af841ba05 100755
--- a/arch/powerpc/boot/wrapper
+++ b/arch/powerpc/boot/wrapper
@@ -148,18 +148,18 @@ make_space=y
 case "$platform" in
 pseries)
-    platformo=$object/of.o
+    platformo="$object/of.o $object/epapr.o"
    link_address='0x4000000'
    ;;
 maple)
-    platformo=$object/of.o
+    platformo="$object/of.o $object/epapr.o"
    link_address='0x400000'
    ;;
 pmac|chrp)
-    platformo=$object/of.o
+    platformo="$object/of.o $object/epapr.o"
    ;;
 coff)
-    platformo="$object/crt0.o $object/of.o"
+    platformo="$object/crt0.o $object/of.o $object/epapr.o"
    lds=$object/zImage.coff.lds
    link_address='0x500000'
    pie=
@@ -253,6 +253,7 @@ treeboot-iss4xx-mpic)
    platformo="$object/treeboot-iss4xx.o"
    ;;
 epapr)
+    platformo="$object/epapr.o $object/epapr-wrapper.o"
    link_address='0x20000000'
    pie=-pie
    ;;
diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h
index 0e40843a1c6e..41f13cec8a8f 100644
--- a/arch/powerpc/include/asm/irq.h
+++ b/arch/powerpc/include/asm/irq.h
@@ -69,9 +69,9 @@ extern struct thread_info *softirq_ctx[NR_CPUS];
 extern void irq_ctx_init(void);
 extern void call_do_softirq(struct thread_info *tp);
-extern int call_handle_irq(int irq, void *p1,
+extern void call_do_irq(struct pt_regs *regs, struct thread_info *tp);
-                           struct thread_info *tp, void *func);
 extern void do_IRQ(struct pt_regs *regs);
+extern void __do_irq(struct pt_regs *regs);
 int irq_choose_cpu(const struct cpumask *mask);
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index e378cccfca55..ce4de5aed7b5 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -149,8 +149,6 @@ typedef struct {
 struct thread_struct {
        unsigned long   ksp;            /* Kernel stack pointer */
-        unsigned long   ksp_limit;      /* if ksp <= ksp_limit stack overflow */
 #ifdef CONFIG_PPC64
        unsigned long   ksp_vsid;
 #endif
@@ -162,6 +160,7 @@ struct thread_struct {
 #endif
 #ifdef CONFIG_PPC32
        void            *pgdir;         /* root of page-table tree */
+        unsigned long   ksp_limit;      /* if ksp <= ksp_limit stack overflow */
 #endif
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
        /*
@@ -321,7 +320,6 @@ struct thread_struct {
 #else
 #define INIT_THREAD  { \
        .ksp = INIT_SP, \
-        .ksp_limit = INIT_SP_LIMIT, \
        .regs = (struct pt_regs *)INIT_SP - 1, /* XXX bogus, I think */ \
        .fs = KERNEL_DS, \
        .fpr = {{0}}, \
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index d8958be5f31a..502c7a4e73f7 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -80,10 +80,11 @@ int main(void)
        DEFINE(TASKTHREADPPR, offsetof(struct task_struct, thread.ppr));
 #else
        DEFINE(THREAD_INFO, offsetof(struct task_struct, stack));
+        DEFINE(THREAD_INFO_GAP, _ALIGN_UP(sizeof(struct thread_info), 16));
+        DEFINE(KSP_LIMIT, offsetof(struct thread_struct, ksp_limit));
 #endif /* CONFIG_PPC64 */
        DEFINE(KSP, offsetof(struct thread_struct, ksp));
-        DEFINE(KSP_LIMIT, offsetof(struct thread_struct, ksp_limit));
        DEFINE(PT_REGS, offsetof(struct thread_struct, regs));
 #ifdef CONFIG_BOOKE
        DEFINE(THREAD_NORMSAVES, offsetof(struct thread_struct, normsave[0]));
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index c69440cef7af..57d286a78f86 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -441,50 +441,6 @@ void migrate_irqs(void)
 }
 #endif
-static inline void handle_one_irq(unsigned int irq)
-{
-        struct thread_info *curtp, *irqtp;
-        unsigned long saved_sp_limit;
-        struct irq_desc *desc;
-        desc = irq_to_desc(irq);
-        if (!desc)
-                return;
-        /* Switch to the irq stack to handle this */
-        curtp = current_thread_info();
-        irqtp = hardirq_ctx[smp_processor_id()];
-        if (curtp == irqtp) {
-                /* We're already on the irq stack, just handle it */
-                desc->handle_irq(irq, desc);
-                return;
-        }
-        saved_sp_limit = current->thread.ksp_limit;
-        irqtp->task = curtp->task;
-        irqtp->flags = 0;
-        /* Copy the softirq bits in preempt_count so that the
-         * softirq checks work in the hardirq context. */
-        irqtp->preempt_count = (irqtp->preempt_count & ~SOFTIRQ_MASK) |
-                               (curtp->preempt_count & SOFTIRQ_MASK);
-        current->thread.ksp_limit = (unsigned long)irqtp +
-                _ALIGN_UP(sizeof(struct thread_info), 16);
-        call_handle_irq(irq, desc, irqtp, desc->handle_irq);
-        current->thread.ksp_limit = saved_sp_limit;
-        irqtp->task = NULL;
-        /* Set any flag that may have been set on the
-         * alternate stack
-         */
-        if (irqtp->flags)
-                set_bits(irqtp->flags, &curtp->flags);
-}
 static inline void check_stack_overflow(void)
 {
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
@@ -501,9 +457,9 @@ static inline void check_stack_overflow(void)
 #endif
 }
-void do_IRQ(struct pt_regs *regs)
+void __do_irq(struct pt_regs *regs)
 {
-        struct pt_regs *old_regs = set_irq_regs(regs);
+        struct irq_desc *desc;
        unsigned int irq;
        irq_enter();
@@ -519,18 +475,56 @@ void do_IRQ(struct pt_regs *regs)
         */
        irq = ppc_md.get_irq();
-        /* We can hard enable interrupts now */
+        /* We can hard enable interrupts now to allow perf interrupts */
        may_hard_irq_enable();
        /* And finally process it */
-        if (irq != NO_IRQ)
+        if (unlikely(irq == NO_IRQ))
-                handle_one_irq(irq);
-        else
                __get_cpu_var(irq_stat).spurious_irqs++;
+        else {
+                desc = irq_to_desc(irq);
+                if (likely(desc))
+                        desc->handle_irq(irq, desc);
+        }
        trace_irq_exit(regs);
        irq_exit();
+}
+void do_IRQ(struct pt_regs *regs)
+{
+        struct pt_regs *old_regs = set_irq_regs(regs);
+        struct thread_info *curtp, *irqtp;
+        /* Switch to the irq stack to handle this */
+        curtp = current_thread_info();
+        irqtp = hardirq_ctx[raw_smp_processor_id()];
+        /* Already there ? */
+        if (unlikely(curtp == irqtp)) {
+                __do_irq(regs);
+                set_irq_regs(old_regs);
+                return;
+        }
+        /* Prepare the thread_info in the irq stack */
+        irqtp->task = curtp->task;
+        irqtp->flags = 0;
+        /* Copy the preempt_count so that the [soft]irq checks work. */
+        irqtp->preempt_count = curtp->preempt_count;
+        /* Switch stack and call */
+        call_do_irq(regs, irqtp);
+        /* Detach the task from the irq stack's thread_info */
+        irqtp->task = NULL;
+        /* Copy back updates to the thread_info */
+        if (irqtp->flags)
+                set_bits(irqtp->flags, &curtp->flags);
        set_irq_regs(old_regs);
 }
@@ -592,28 +586,22 @@ void irq_ctx_init(void)
                memset((void *)softirq_ctx[i], 0, THREAD_SIZE);
                tp = softirq_ctx[i];
                tp->cpu = i;
-                tp->preempt_count = 0;
                memset((void *)hardirq_ctx[i], 0, THREAD_SIZE);
                tp = hardirq_ctx[i];
                tp->cpu = i;
-                tp->preempt_count = HARDIRQ_OFFSET;
        }
 }
 static inline void do_softirq_onstack(void)
 {
        struct thread_info *curtp, *irqtp;
-        unsigned long saved_sp_limit = current->thread.ksp_limit;
        curtp = current_thread_info();
        irqtp = softirq_ctx[smp_processor_id()];
        irqtp->task = curtp->task;
        irqtp->flags = 0;
-        current->thread.ksp_limit = (unsigned long)irqtp +
-                                    _ALIGN_UP(sizeof(struct thread_info), 16);
        call_do_softirq(irqtp);
-        current->thread.ksp_limit = saved_sp_limit;
        irqtp->task = NULL;
        /* Set any flag that may have been set on the
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index 777d999f563b..2b0ad9845363 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -36,26 +36,41 @@
        .text
+/*
+ * We store the saved ksp_limit in the unused part
+ * of the STACK_FRAME_OVERHEAD
+ */
 _GLOBAL(call_do_softirq)
        mflr    r0
        stw     r0,4(r1)
+        lwz     r10,THREAD+KSP_LIMIT(r2)
+        addi    r11,r3,THREAD_INFO_GAP
        stwu    r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3)
        mr      r1,r3
+        stw     r10,8(r1)
+        stw     r11,THREAD+KSP_LIMIT(r2)
        bl      __do_softirq
+        lwz     r10,8(r1)
        lwz     r1,0(r1)
        lwz     r0,4(r1)
+        stw     r10,THREAD+KSP_LIMIT(r2)
        mtlr    r0
        blr
-_GLOBAL(call_handle_irq)
+_GLOBAL(call_do_irq)
         mflr    r0
         stw     r0,4(r1)
-        mtctr   r6
+        lwz     r10,THREAD+KSP_LIMIT(r2)
-        stwu    r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r5)
+        addi    r11,r4,THREAD_INFO_GAP  /* limit is relative to irqtp (r4); r3 holds regs */
-        mr      r1,r5
+        stwu    r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4)
-        bctrl
+        mr      r1,r4
+        stw     r10,8(r1)
+        stw     r11,THREAD+KSP_LIMIT(r2)
+        bl      __do_irq
+        lwz     r10,8(r1)
         lwz     r1,0(r1)
         lwz     r0,4(r1)
+        stw     r10,THREAD+KSP_LIMIT(r2)
         mtlr    r0
         blr
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index 971d7e78aff2..e59caf874d05 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -40,14 +40,12 @@ _GLOBAL(call_do_softirq)
        mtlr    r0
        blr
-_GLOBAL(call_handle_irq)
+_GLOBAL(call_do_irq)
-        ld      r8,0(r6)
        mflr    r0
        std     r0,16(r1)
-        mtctr   r8
+        stdu    r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4)
-        stdu    r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r5)
+        mr      r1,r4
-        mr      r1,r5
+        bl      .__do_irq
-        bctrl
        ld      r1,0(r1)
        ld      r0,16(r1)
        mtlr    r0
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 6f428da53e20..96d2fdf3aa9e 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1000,9 +1000,10 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
        kregs = (struct pt_regs *) sp;
        sp -= STACK_FRAME_OVERHEAD;
        p->thread.ksp = sp;
+#ifdef CONFIG_PPC32
        p->thread.ksp_limit = (unsigned long)task_stack_page(p) +
                                _ALIGN_UP(sizeof(struct thread_info), 16);
+#endif
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
        p->thread.ptrace_bps[0] = NULL;
 #endif
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 12e656ffe60e..5fe2842e8bab 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -196,6 +196,8 @@ static int __initdata mem_reserve_cnt;
 static cell_t __initdata regbuf[1024];
+static bool rtas_has_query_cpu_stopped;
 /*
 * Error results ... some OF calls will return "-1" on error, some
@@ -1574,6 +1576,11 @@ static void __init prom_instantiate_rtas(void)
        prom_setprop(rtas_node, "/rtas", "linux,rtas-entry",
                     &val, sizeof(val));
+        /* Check if it supports "query-cpu-stopped-state" */
+        if (prom_getprop(rtas_node, "query-cpu-stopped-state",
+                         &val, sizeof(val)) != PROM_ERROR)
+                rtas_has_query_cpu_stopped = true;
 #if defined(CONFIG_PPC_POWERNV) && defined(__BIG_ENDIAN__)
        /* PowerVN takeover hack */
        prom_rtas_data = base;
@@ -1815,6 +1822,18 @@ static void __init prom_hold_cpus(void)
                = (void *) LOW_ADDR(__secondary_hold_acknowledge);
        unsigned long secondary_hold = LOW_ADDR(__secondary_hold);
+        /*
+         * On pseries, if RTAS supports "query-cpu-stopped-state",
+         * we skip this stage, the CPUs will be started by the
+         * kernel using RTAS.
+         */
+        if ((of_platform == PLATFORM_PSERIES ||
+             of_platform == PLATFORM_PSERIES_LPAR) &&
+            rtas_has_query_cpu_stopped) {
+                prom_printf("prom_hold_cpus: skipped\n");
+                return;
+        }
        prom_debug("prom_hold_cpus: start...\n");
        prom_debug("    1) spinloop       = 0x%x\n", (unsigned long)spinloop);
        prom_debug("    1) *spinloop      = 0x%x\n", *spinloop);
@@ -3011,6 +3030,8 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4,
         * On non-powermacs, put all CPUs in spin-loops.
         *
         * PowerMacs use a different mechanism to spin CPUs
+         *
+         * (This must be done after instantiating RTAS)
         */
        if (of_platform != PLATFORM_POWERMAC &&
            of_platform != PLATFORM_OPAL)
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index a7ee978fb860..b1faa1593c90 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -1505,6 +1505,7 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
                 */
                if ((ra == 1) && !(regs->msr & MSR_PR) \
                        && (val3 >= (regs->gpr[1] - STACK_INT_FRAME_SIZE))) {
+#ifdef CONFIG_PPC32
                        /*
                         * Check if we will touch kernel sack overflow
                         */
@@ -1513,7 +1514,7 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
                                err = -EINVAL;
                                break;
                        }
+#endif /* CONFIG_PPC32 */
                        /*
                         * Check if we already set since that means we'll
                         * lose the previous value.
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index 1c1771a40250..24f58cb0a543 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -233,18 +233,24 @@ static void __init smp_init_pseries(void)
        alloc_bootmem_cpumask_var(&of_spin_mask);
-        /* Mark threads which are still spinning in hold loops. */
+        /*
-        if (cpu_has_feature(CPU_FTR_SMT)) {
+         * Mark threads which are still spinning in hold loops
-                for_each_present_cpu(i) { 
+         *
-                        if (cpu_thread_in_core(i) == 0)
+         * We know prom_init will not have started them if RTAS supports
-                                cpumask_set_cpu(i, of_spin_mask);
+         * query-cpu-stopped-state.
-                }
+         */
-        } else {
+        if (rtas_token("query-cpu-stopped-state") == RTAS_UNKNOWN_SERVICE) {
-                cpumask_copy(of_spin_mask, cpu_present_mask);
+                if (cpu_has_feature(CPU_FTR_SMT)) {
+                        for_each_present_cpu(i) {
+                                if (cpu_thread_in_core(i) == 0)
+                                        cpumask_set_cpu(i, of_spin_mask);
+                        }
+                } else
+                        cpumask_copy(of_spin_mask, cpu_present_mask);
+                cpumask_clear_cpu(boot_cpuid, of_spin_mask);
        }
-        cpumask_clear_cpu(boot_cpuid, of_spin_mask);
        /* Non-lpar has additional take/give timebase */
        if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) {
                smp_ops->give_timebase = rtas_give_timebase;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 8355c84b9729..a9c606bb4945 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1883,9 +1883,9 @@ static struct pmu pmu = {
 void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
 {
-        userpg->cap_usr_time = 0;
+        userpg->cap_user_time = 0;
-        userpg->cap_usr_time_zero = 0;
+        userpg->cap_user_time_zero = 0;
-        userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc;
+        userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc;
        userpg->pmc_width = x86_pmu.cntval_bits;
        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
@@ -1894,13 +1894,13 @@ void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
        if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
                return;
-        userpg->cap_usr_time = 1;
+        userpg->cap_user_time = 1;
        userpg->time_mult = this_cpu_read(cyc2ns);
        userpg->time_shift = CYC2NS_SCALE_FACTOR;
        userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
        if (sched_clock_stable && !check_tsc_disabled()) {
-                userpg->cap_usr_time_zero = 1;
+                userpg->cap_user_time_zero = 1;
                userpg->time_zero = this_cpu_read(cyc2ns_offset);
        }
 }
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 9db76c31b3c3..f31a1655d1ff 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2325,6 +2325,7 @@ __init int intel_pmu_init(void)
                break;
        case 55: /* Atom 22nm "Silvermont" */
+        case 77: /* Avoton "Silvermont" */
                memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
                        sizeof(hw_cache_event_ids));
                memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 8ed44589b0e4..4118f9f68315 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -2706,14 +2706,14 @@ static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box)
        box->hrtimer.function = uncore_pmu_hrtimer;
 }
-struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cpu)
+static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int node)
 {
        struct intel_uncore_box *box;
        int i, size;
        size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg);
-        box = kzalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
+        box = kzalloc_node(size, GFP_KERNEL, node);
        if (!box)
                return NULL;
@@ -3031,7 +3031,7 @@ static int uncore_validate_group(struct intel_uncore_pmu *pmu,
        struct intel_uncore_box *fake_box;
        int ret = -EINVAL, n;
-        fake_box = uncore_alloc_box(pmu->type, smp_processor_id());
+        fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE);
        if (!fake_box)
                return -ENOMEM;
@@ -3294,7 +3294,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
        }
        type = pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
-        box = uncore_alloc_box(type, 0);
+        box = uncore_alloc_box(type, NUMA_NO_NODE);
        if (!box)
                return -ENOMEM;
@@ -3499,7 +3499,7 @@ static int uncore_cpu_prepare(int cpu, int phys_id)
                        if (pmu->func_id < 0)
                                pmu->func_id = j;
-                        box = uncore_alloc_box(type, cpu);
+                        box = uncore_alloc_box(type, cpu_to_node(cpu));
                        if (!box)
                                return -ENOMEM;
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 563ed91e6faa..e643e744e4d8 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -352,12 +352,28 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
        },
        {       /* Handle problems with rebooting on the Precision M6600. */
                .callback = set_pci_reboot,
-                .ident = "Dell OptiPlex 990",
+                .ident = "Dell Precision M6600",
                .matches = {
                        DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
                        DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"),
                },
        },
+        {       /* Handle problems with rebooting on the Dell PowerEdge C6100. */
+                .callback = set_pci_reboot,
+                .ident = "Dell PowerEdge C6100",
+                .matches = {
+                        DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+                        DMI_MATCH(DMI_PRODUCT_NAME, "C6100"),
+                },
+        },
+        {       /* Some C6100 machines were shipped with vendor being 'Dell'. */
+                .callback = set_pci_reboot,
+                .ident = "Dell PowerEdge C6100",
+                .matches = {
+                        DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
+                        DMI_MATCH(DMI_PRODUCT_NAME, "C6100"),
+                },
+        },
        { }
 };
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 90f6ed127096..c7e22ab29a5a 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -912,10 +912,13 @@ void __init efi_enter_virtual_mode(void)
        for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
                md = p;
-                if (!(md->attribute & EFI_MEMORY_RUNTIME) &&
+                if (!(md->attribute & EFI_MEMORY_RUNTIME)) {
-                    md->type != EFI_BOOT_SERVICES_CODE &&
+#ifdef CONFIG_X86_64
-                    md->type != EFI_BOOT_SERVICES_DATA)
+                        if (md->type != EFI_BOOT_SERVICES_CODE &&
-                        continue;
+                            md->type != EFI_BOOT_SERVICES_DATA)
+#endif
+                                continue;
+                }
                size = md->num_pages << EFI_PAGE_SHIFT;
                end = md->phys_addr + size;
diff --git a/drivers/ata/sata_promise.c b/drivers/ata/sata_promise.c
index 958ba2a420c3..97f4acb54ad6 100644
--- a/drivers/ata/sata_promise.c
+++ b/drivers/ata/sata_promise.c
@@ -2,7 +2,7 @@
 *  sata_promise.c - Promise SATA
 *
 *  Maintained by:  Tejun Heo <tj@kernel.org>
- *                  Mikael Pettersson <mikpe@it.uu.se>
+ *                  Mikael Pettersson
 *                  Please ALWAYS copy linux-ide@vger.kernel.org
 *                  on emails.
 *
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index d2d95ff5353b..edfa2515bc86 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1189,6 +1189,7 @@ static int cciss_ioctl32_passthru(struct block_device *bdev, fmode_t mode,
        int err;
        u32 cp;
+        memset(&arg64, 0, sizeof(arg64));
        err = 0;
        err |=
            copy_from_user(&arg64.LUN_info, &arg32->LUN_info,
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index 639d26b90b91..2b9440384536 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -1193,6 +1193,7 @@ out_passthru:
                ida_pci_info_struct pciinfo;
                if (!arg) return -EINVAL;
+                memset(&pciinfo, 0, sizeof(pciinfo));
                pciinfo.bus = host->pci_dev->bus->number;
                pciinfo.dev_fn = host->pci_dev->devfn;
                pciinfo.board_id = host->board_id;
diff --git a/drivers/gpu/drm/i2c/tda998x_drv.c b/drivers/gpu/drm/i2c/tda998x_drv.c
index b1f8fc69023f..60e84043aa34 100644
--- a/drivers/gpu/drm/i2c/tda998x_drv.c
+++ b/drivers/gpu/drm/i2c/tda998x_drv.c
@@ -707,8 +707,7 @@ tda998x_encoder_dpms(struct drm_encoder *encoder, int mode)
                reg_write(encoder, REG_VIP_CNTRL_2, priv->vip_cntrl_2);
                break;
        case DRM_MODE_DPMS_OFF:
-                /* disable audio and video ports */
+                /* disable video ports */
-                reg_write(encoder, REG_ENA_AP, 0x00);
                reg_write(encoder, REG_ENA_VP_0, 0x00);
                reg_write(encoder, REG_ENA_VP_1, 0x00);
                reg_write(encoder, REG_ENA_VP_2, 0x00);
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index b39f6f0b45f2..0f12382aa35d 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -498,7 +498,7 @@ struct cached_dev {
         */
        atomic_t                has_dirty;
-        struct ratelimit        writeback_rate;
+        struct bch_ratelimit    writeback_rate;
        struct delayed_work     writeback_rate_update;
        /*
@@ -507,10 +507,9 @@ struct cached_dev {
         */
        sector_t                last_read;
-        /* Number of writeback bios in flight */
+        /* Limit number of writeback bios in flight */
-        atomic_t                in_flight;
+        struct semaphore        in_flight;
        struct closure_with_timer writeback;
-        struct closure_waitlist writeback_wait;
        struct keybuf           writeback_keys;
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 8010eed06a51..22d1ae72c282 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -926,28 +926,45 @@ struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search)
 /* Mergesort */
+static void sort_key_next(struct btree_iter *iter,
+                          struct btree_iter_set *i)
+{
+        i->k = bkey_next(i->k);
+        if (i->k == i->end)
+                *i = iter->data[--iter->used];
+}
 static void btree_sort_fixup(struct btree_iter *iter)
 {
        while (iter->used > 1) {
                struct btree_iter_set *top = iter->data, *i = top + 1;
-                struct bkey *k;
                if (iter->used > 2 &&
                    btree_iter_cmp(i[0], i[1]))
                        i++;
-                for (k = i->k;
+                if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
-                     k != i->end && bkey_cmp(top->k, &START_KEY(k)) > 0;
-                     k = bkey_next(k))
-                        if (top->k > i->k)
-                                __bch_cut_front(top->k, k);
-                        else if (KEY_SIZE(k))
-                                bch_cut_back(&START_KEY(k), top->k);
-                if (top->k < i->k || k == i->k)
                        break;
-                heap_sift(iter, i - top, btree_iter_cmp);
+                if (!KEY_SIZE(i->k)) {
+                        sort_key_next(iter, i);
+                        heap_sift(iter, i - top, btree_iter_cmp);
+                        continue;
+                }
+                if (top->k > i->k) {
+                        if (bkey_cmp(top->k, i->k) >= 0)
+                                sort_key_next(iter, i);
+                        else
+                                bch_cut_front(top->k, i->k);
+                        heap_sift(iter, i - top, btree_iter_cmp);
+                } else {
+                        /* can't happen because of comparison func */
+                        BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
+                        bch_cut_back(&START_KEY(i->k), top->k);
+                }
        }
 }
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index f9764e61978b..f42fc7ed9cd6 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -255,7 +255,7 @@ void bch_btree_node_read(struct btree *b)
        return;
 err:
-        bch_cache_set_error(b->c, "io error reading bucket %lu",
+        bch_cache_set_error(b->c, "io error reading bucket %zu",
                            PTR_BUCKET_NR(b->c, &b->key, 0));
 }
@@ -612,7 +612,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
                return SHRINK_STOP;
        /* Return -1 if we can't do anything right now */
-        if (sc->gfp_mask & __GFP_WAIT)
+        if (sc->gfp_mask & __GFP_IO)
                mutex_lock(&c->bucket_lock);
        else if (!mutex_trylock(&c->bucket_lock))
                return -1;
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index ba95ab84b2be..8435f81e5d85 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -153,7 +153,8 @@ int bch_journal_read(struct cache_set *c, struct list_head *list,
                bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
                pr_debug("%u journal buckets", ca->sb.njournal_buckets);
-                /* Read journal buckets ordered by golden ratio hash to quickly
+                /*
+                 * Read journal buckets ordered by golden ratio hash to quickly
                 * find a sequence of buckets with valid journal entries
                 */
                for (i = 0; i < ca->sb.njournal_buckets; i++) {
@@ -166,18 +167,20 @@ int bch_journal_read(struct cache_set *c, struct list_head *list,
                                goto bsearch;
                }
-                /* If that fails, check all the buckets we haven't checked
+                /*
+                 * If that fails, check all the buckets we haven't checked
                 * already
                 */
                pr_debug("falling back to linear search");
-                for (l = 0; l < ca->sb.njournal_buckets; l++) {
+                for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets);
-                        if (test_bit(l, bitmap))
+                     l < ca->sb.njournal_buckets;
-                                continue;
+                     l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, l + 1))
                        if (read_bucket(l))
                                goto bsearch;
-                }
+                if (list_empty(list))
+                        continue;
 bsearch:
                /* Binary search */
                m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
@@ -197,10 +200,12 @@ bsearch:
                                r = m;
                }
-                /* Read buckets in reverse order until we stop finding more
+                /*
+                 * Read buckets in reverse order until we stop finding more
                 * journal entries
                 */
-                pr_debug("finishing up");
+                pr_debug("finishing up: m %u njournal_buckets %u",
+                         m, ca->sb.njournal_buckets);
                l = m;
                while (1) {
@@ -228,9 +233,10 @@ bsearch:
                        }
        }
-        c->journal.seq = list_entry(list->prev,
+        if (!list_empty(list))
-                                    struct journal_replay,
+                c->journal.seq = list_entry(list->prev,
-                                    list)->j.seq;
+                                            struct journal_replay,
+                                            list)->j.seq;
        return 0;
 #undef read_bucket
@@ -428,7 +434,7 @@ static void do_journal_discard(struct cache *ca)
                return;
        }
-        switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) {
+        switch (atomic_read(&ja->discard_in_flight)) {
        case DISCARD_IN_FLIGHT:
                return;
@@ -689,6 +695,7 @@ void bch_journal_meta(struct cache_set *c, struct closure *cl)
                if (cl)
                        BUG_ON(!closure_wait(&w->wait, cl));
+                closure_flush(&c->journal.io);
                __journal_try_write(c, true);
        }
 }
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 786a1a4f74d8..71eb233b9ace 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -997,14 +997,17 @@ static void request_write(struct cached_dev *dc, struct search *s)
        } else {
                bch_writeback_add(dc);
-                if (s->op.flush_journal) {
+                if (bio->bi_rw & REQ_FLUSH) {
                        /* Also need to send a flush to the backing device */
-                        s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
+                        struct bio *flush = bio_alloc_bioset(0, GFP_NOIO,
-                                                           dc->disk.bio_split);
+                                                             dc->disk.bio_split);
-                        bio->bi_size = 0;
+                        flush->bi_rw    = WRITE_FLUSH;
-                        bio->bi_vcnt = 0;
+                        flush->bi_bdev  = bio->bi_bdev;
-                        closure_bio_submit(bio, cl, s->d);
+                        flush->bi_end_io = request_endio;
+                        flush->bi_private = cl;
+                        closure_bio_submit(flush, cl, s->d);
                } else {
                        s->op.cache_bio = bio;
                }
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 4fe6ab2fbe2e..924dcfdae111 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -223,8 +223,13 @@ STORE(__cached_dev)
        }
        if (attr == &sysfs_label) {
-                /* note: endlines are preserved */
+                if (size > SB_LABEL_SIZE)
-                memcpy(dc->sb.label, buf, SB_LABEL_SIZE);
+                        return -EINVAL;
+                memcpy(dc->sb.label, buf, size);
+                if (size < SB_LABEL_SIZE)
+                        dc->sb.label[size] = '\0';
+                if (size && dc->sb.label[size - 1] == '\n')
+                        dc->sb.label[size - 1] = '\0';
                bch_write_bdev_super(dc, NULL);
                if (dc->disk.c) {
                        memcpy(dc->disk.c->uuids[dc->disk.id].label,
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 98eb81159a22..420dad545c7d 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -190,7 +190,16 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
        stats->last = now ?: 1;
 }
-unsigned bch_next_delay(struct ratelimit *d, uint64_t done)
+/**
+ * bch_next_delay() - increment @d by the amount of work done, and return how
+ * long to delay until the next time to do some work.
+ *
+ * @d: the struct bch_ratelimit to update
+ * @done: the amount of work done, in arbitrary units
+ *
+ * Return: the amount of time to delay by, in jiffies
+ */
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
 {
        uint64_t now = local_clock();
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 1ae2a73ad85f..ea345c6896f4 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -450,17 +450,23 @@ read_attribute(name ## _last_ ## frequency_units)
        (ewma) >> factor;                                               \
 })
-struct ratelimit {
+struct bch_ratelimit {
+        /* Next time we want to do some work, in nanoseconds */
        uint64_t                next;
+        /*
+         * Rate at which we want to do work, in units per nanosecond
+         * The units here correspond to the units passed to bch_next_delay()
+         */
        unsigned                rate;
 };
-static inline void ratelimit_reset(struct ratelimit *d)
+static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
 {
        d->next = local_clock();
 }
-unsigned bch_next_delay(struct ratelimit *d, uint64_t done);
+uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done);
 #define __DIV_SAFE(n, d, zero)                                          \
 ({                                                                      \
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 22cbff551628..ba3ee48320f2 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -94,11 +94,15 @@ static void update_writeback_rate(struct work_struct *work)
 static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
 {
+        uint64_t ret;
        if (atomic_read(&dc->disk.detaching) ||
            !dc->writeback_percent)
                return 0;
-        return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+        ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+        return min_t(uint64_t, ret, HZ);
 }
 /* Background writeback */
@@ -208,7 +212,7 @@ normal_refill:
        up_write(&dc->writeback_lock);
-        ratelimit_reset(&dc->writeback_rate);
+        bch_ratelimit_reset(&dc->writeback_rate);
        /* Punt to workqueue only so we don't recurse and blow the stack */
        continue_at(cl, read_dirty, dirty_wq);
@@ -318,9 +322,7 @@ static void write_dirty_finish(struct closure *cl)
        }
        bch_keybuf_del(&dc->writeback_keys, w);
-        atomic_dec_bug(&dc->in_flight);
+        up(&dc->in_flight);
-        closure_wake_up(&dc->writeback_wait);
        closure_return_with_destructor(cl, dirty_io_destructor);
 }
@@ -349,7 +351,7 @@ static void write_dirty(struct closure *cl)
        closure_bio_submit(&io->bio, cl, &io->dc->disk);
-        continue_at(cl, write_dirty_finish, dirty_wq);
+        continue_at(cl, write_dirty_finish, system_wq);
 }
 static void read_dirty_endio(struct bio *bio, int error)
@@ -369,7 +371,7 @@ static void read_dirty_submit(struct closure *cl)
        closure_bio_submit(&io->bio, cl, &io->dc->disk);
-        continue_at(cl, write_dirty, dirty_wq);
+        continue_at(cl, write_dirty, system_wq);
 }
 static void read_dirty(struct closure *cl)
@@ -394,12 +396,8 @@ static void read_dirty(struct closure *cl)
                if (delay > 0 &&
                    (KEY_START(&w->key) != dc->last_read ||
-                     jiffies_to_msecs(delay) > 50)) {
+                     jiffies_to_msecs(delay) > 50))
-                        w->private = NULL;
+                        delay = schedule_timeout_uninterruptible(delay);
-                        closure_delay(&dc->writeback, delay);
-                        continue_at(cl, read_dirty, dirty_wq);
-                }
                dc->last_read   = KEY_OFFSET(&w->key);
@@ -424,15 +422,10 @@ static void read_dirty(struct closure *cl)
                trace_bcache_writeback(&w->key);
-                closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
+                down(&dc->in_flight);
+                closure_call(&io->cl, read_dirty_submit, NULL, cl);
                delay = writeback_delay(dc, KEY_SIZE(&w->key));
-                atomic_inc(&dc->in_flight);
-                if (!closure_wait_event(&dc->writeback_wait, cl,
-                                        atomic_read(&dc->in_flight) < 64))
-                        continue_at(cl, read_dirty, dirty_wq);
        }
        if (0) {
@@ -442,7 +435,11 @@ err:
                bch_keybuf_del(&dc->writeback_keys, w);
        }
-        refill_dirty(cl);
+        /*
+         * Wait for outstanding writeback IOs to finish (and keybuf slots to be
+         * freed) before refilling again
+         */
+        continue_at(cl, refill_dirty, dirty_wq);
 }
 /* Init */
@@ -484,6 +481,7 @@ void bch_sectors_dirty_init(struct cached_dev *dc)
 void bch_cached_dev_writeback_init(struct cached_dev *dc)
 {
+        sema_init(&dc->in_flight, 64);
        closure_init_unlocked(&dc->writeback);
        init_rwsem(&dc->writeback_lock);
@@ -513,7 +511,7 @@ void bch_writeback_exit(void)
 int __init bch_writeback_init(void)
 {
-        dirty_wq = create_singlethread_workqueue("bcache_writeback");
+        dirty_wq = create_workqueue("bcache_writeback");
        if (!dirty_wq)
                return -ENOMEM;
diff --git a/fs/bio.c b/fs/bio.c
index b3b20ed9510e..ea5035da4d9a 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -917,8 +917,8 @@ void bio_copy_data(struct bio *dst, struct bio *src)
                src_p = kmap_atomic(src_bv->bv_page);
                dst_p = kmap_atomic(dst_bv->bv_page);
-                memcpy(dst_p + dst_bv->bv_offset,
+                memcpy(dst_p + dst_offset,
-                       src_p + src_bv->bv_offset,
+                       src_p + src_offset,
                       bytes);
                kunmap_atomic(dst_p);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 121da2dc3be8..d4e81e4a9b04 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1924,7 +1924,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 {
        int tmp, hangup_needed = 0;
        struct ocfs2_super *osb = NULL;
-        char nodestr[8];
+        char nodestr[12];
        trace_ocfs2_dismount_volume(sb);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 60e95872da29..ecc82b37c4cc 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -53,23 +53,6 @@ struct mem_cgroup_reclaim_cookie {
        unsigned int generation;
 };
-enum mem_cgroup_filter_t {
-        VISIT,          /* visit current node */
-        SKIP,           /* skip the current node and continue traversal */
-        SKIP_TREE,      /* skip the whole subtree and continue traversal */
-};
-/*
- * mem_cgroup_filter_t predicate might instruct mem_cgroup_iter_cond how to
- * iterate through the hierarchy tree. Each tree element is checked by the
- * predicate before it is returned by the iterator. If a filter returns
- * SKIP or SKIP_TREE then the iterator code continues traversal (with the
- * next node down the hierarchy or the next node that doesn't belong under the
- * memcg's subtree).
- */
-typedef enum mem_cgroup_filter_t
-(*mem_cgroup_iter_filter)(struct mem_cgroup *memcg, struct mem_cgroup *root);
 #ifdef CONFIG_MEMCG
 /*
 * All "charge" functions with gfp_mask should use GFP_KERNEL or
@@ -137,18 +120,9 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
 extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
        struct page *oldpage, struct page *newpage, bool migration_ok);
-struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
-                                   struct mem_cgroup *prev,
+                                   struct mem_cgroup *,
-                                   struct mem_cgroup_reclaim_cookie *reclaim,
+                                   struct mem_cgroup_reclaim_cookie *);
-                                   mem_cgroup_iter_filter cond);
-static inline struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
-                                   struct mem_cgroup *prev,
-                                   struct mem_cgroup_reclaim_cookie *reclaim)
-{
-        return mem_cgroup_iter_cond(root, prev, reclaim, NULL);
-}
 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
 /*
@@ -260,9 +234,9 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
        mem_cgroup_update_page_stat(page, idx, -1);
 }
-enum mem_cgroup_filter_t
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+                                                gfp_t gfp_mask,
-                struct mem_cgroup *root);
+                                                unsigned long *total_scanned);
 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
 static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
@@ -376,15 +350,6 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
                struct page *oldpage, struct page *newpage, bool migration_ok)
 {
 }
-static inline struct mem_cgroup *
-mem_cgroup_iter_cond(struct mem_cgroup *root,
-                struct mem_cgroup *prev,
-                struct mem_cgroup_reclaim_cookie *reclaim,
-                mem_cgroup_iter_filter cond)
-{
-        /* first call must return non-NULL, second return NULL */
-        return (struct mem_cgroup *)(unsigned long)!prev;
-}
 static inline struct mem_cgroup *
 mem_cgroup_iter(struct mem_cgroup *root,
@@ -471,11 +436,11 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 }
 static inline
-enum mem_cgroup_filter_t
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+                                            gfp_t gfp_mask,
-                struct mem_cgroup *root)
+                                            unsigned long *total_scanned)
 {
-        return VISIT;
+        return 0;
 }
 static inline void mem_cgroup_split_huge_fixup(struct page *head)
diff --git a/include/linux/smp.h b/include/linux/smp.h
index cfb7ca094b38..731f5237d5f4 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -155,6 +155,12 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func,
 static inline void kick_all_cpus_sync(void) {  }
+static inline void __smp_call_function_single(int cpuid,
+                struct call_single_data *data, int wait)
+{
+        on_each_cpu(data->func, data->info, wait);
+}
 #endif /* !SMP */
 /*
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 40a1fb807396..009a655a5d35 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -380,10 +380,13 @@ struct perf_event_mmap_page {
        union {
                __u64   capabilities;
                struct {
-                        __u64   cap_usr_time            : 1,
+                        __u64   cap_bit0                : 1, /* Always 0, deprecated, see commit 860f085b74e9 */
-                                cap_usr_rdpmc           : 1,
+                                cap_bit0_is_deprecated  : 1, /* Always 1, signals that bit 0 is zero */
-                                cap_usr_time_zero       : 1,
-                                cap_____res             : 61;
+                                cap_user_rdpmc          : 1, /* The RDPMC instruction can be used to read counts */
+                                cap_user_time           : 1, /* The time_* fields are used */
+                                cap_user_time_zero      : 1, /* The time_zero field is used */
+                                cap_____res             : 59;
                };
        };
@@ -442,12 +445,13 @@ struct perf_event_mmap_page {
         *               ((rem * time_mult) >> time_shift);
         */
        __u64   time_zero;
+        __u32   size;                   /* Header size up to __reserved[] fields. */
                /*
                 * Hole for extension of the self monitor capabilities
                 */
-        __u64   __reserved[119];        /* align to 1k */
+        __u8    __reserved[118*8+4];    /* align to 1k. */
        /*
         * Control data for the mmap() data buffer.
@@ -528,6 +532,7 @@ enum perf_event_type {
         *      u64                             len;
         *      u64                             pgoff;
         *      char                            filename[];
+         *      struct sample_id                sample_id;
         * };
         */
        PERF_RECORD_MMAP                        = 1,
diff --git a/ipc/msg.c b/ipc/msg.c
index b0d541d42677..9e4310c546ae 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -165,6 +165,15 @@ static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s)
        ipc_rmid(&msg_ids(ns), &s->q_perm);
 }
+static void msg_rcu_free(struct rcu_head *head)
+{
+        struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu);
+        struct msg_queue *msq = ipc_rcu_to_struct(p);
+        security_msg_queue_free(msq);
+        ipc_rcu_free(head);
+}
 /**
 * newque - Create a new msg queue
 * @ns: namespace
@@ -189,15 +198,14 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)
        msq->q_perm.security = NULL;
        retval = security_msg_queue_alloc(msq);
        if (retval) {
-                ipc_rcu_putref(msq);
+                ipc_rcu_putref(msq, ipc_rcu_free);
                return retval;
        }
        /* ipc_addid() locks msq upon success. */
        id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni);
        if (id < 0) {
-                security_msg_queue_free(msq);
+                ipc_rcu_putref(msq, msg_rcu_free);
-                ipc_rcu_putref(msq);
                return id;
        }
@@ -276,8 +284,7 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
                free_msg(msg);
        }
        atomic_sub(msq->q_cbytes, &ns->msg_bytes);
-        security_msg_queue_free(msq);
+        ipc_rcu_putref(msq, msg_rcu_free);
-        ipc_rcu_putref(msq);
 }
 /*
@@ -717,7 +724,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
                rcu_read_lock();
                ipc_lock_object(&msq->q_perm);
-                ipc_rcu_putref(msq);
+                ipc_rcu_putref(msq, ipc_rcu_free);
                if (msq->q_perm.deleted) {
                        err = -EIDRM;
                        goto out_unlock0;
diff --git a/ipc/sem.c b/ipc/sem.c
index 69b6a21f3844..19c8b980d1fe 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -243,6 +243,15 @@ static void merge_queues(struct sem_array *sma)
        }
 }
+static void sem_rcu_free(struct rcu_head *head)
+{
+        struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu);
+        struct sem_array *sma = ipc_rcu_to_struct(p);
+        security_sem_free(sma);
+        ipc_rcu_free(head);
+}
 /*
 * If the request contains only one semaphore operation, and there are
 * no complex transactions pending, lock only the semaphore involved.
@@ -374,12 +383,7 @@ static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns
 static inline void sem_lock_and_putref(struct sem_array *sma)
 {
        sem_lock(sma, NULL, -1);
-        ipc_rcu_putref(sma);
+        ipc_rcu_putref(sma, ipc_rcu_free);
-}
-static inline void sem_putref(struct sem_array *sma)
-{
-        ipc_rcu_putref(sma);
 }
 static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
@@ -458,14 +462,13 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
        sma->sem_perm.security = NULL;
        retval = security_sem_alloc(sma);
        if (retval) {
-                ipc_rcu_putref(sma);
+                ipc_rcu_putref(sma, ipc_rcu_free);
                return retval;
        }
        id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);
        if (id < 0) {
-                security_sem_free(sma);
+                ipc_rcu_putref(sma, sem_rcu_free);
-                ipc_rcu_putref(sma);
                return id;
        }
        ns->used_sems += nsems;
@@ -1047,8 +1050,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
        wake_up_sem_queue_do(&tasks);
        ns->used_sems -= sma->sem_nsems;
-        security_sem_free(sma);
+        ipc_rcu_putref(sma, sem_rcu_free);
-        ipc_rcu_putref(sma);
 }
 static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version)
@@ -1292,7 +1294,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
                        rcu_read_unlock();
                        sem_io = ipc_alloc(sizeof(ushort)*nsems);
                        if(sem_io == NULL) {
-                                sem_putref(sma);
+                                ipc_rcu_putref(sma, ipc_rcu_free);
                                return -ENOMEM;
                        }
@@ -1328,20 +1330,20 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
                if(nsems > SEMMSL_FAST) {
                        sem_io = ipc_alloc(sizeof(ushort)*nsems);
                        if(sem_io == NULL) {
-                                sem_putref(sma);
+                                ipc_rcu_putref(sma, ipc_rcu_free);
                                return -ENOMEM;
                        }
                }
                if (copy_from_user (sem_io, p, nsems*sizeof(ushort))) {
-                        sem_putref(sma);
+                        ipc_rcu_putref(sma, ipc_rcu_free);
                        err = -EFAULT;
                        goto out_free;
                }
                for (i = 0; i < nsems; i++) {
                        if (sem_io[i] > SEMVMX) {
-                                sem_putref(sma);
+                                ipc_rcu_putref(sma, ipc_rcu_free);
                                err = -ERANGE;
                                goto out_free;
                        }
@@ -1629,7 +1631,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
        /* step 2: allocate new undo structure */
        new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL);
        if (!new) {
-                sem_putref(sma);
+                ipc_rcu_putref(sma, ipc_rcu_free);
                return ERR_PTR(-ENOMEM);
        }
diff --git a/ipc/shm.c b/ipc/shm.c
index 2821cdf93adb..d69739610fd4 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -167,6 +167,15 @@ static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp)
        ipc_lock_object(&ipcp->shm_perm);
 }
+static void shm_rcu_free(struct rcu_head *head)
+{
+        struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu);
+        struct shmid_kernel *shp = ipc_rcu_to_struct(p);
+        security_shm_free(shp);
+        ipc_rcu_free(head);
+}
 static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
 {
        ipc_rmid(&shm_ids(ns), &s->shm_perm);
@@ -208,8 +217,7 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
                user_shm_unlock(file_inode(shp->shm_file)->i_size,
                                                shp->mlock_user);
        fput (shp->shm_file);
-        security_shm_free(shp);
+        ipc_rcu_putref(shp, shm_rcu_free);
-        ipc_rcu_putref(shp);
 }
 /*
@@ -497,7 +505,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
        shp->shm_perm.security = NULL;
        error = security_shm_alloc(shp);
        if (error) {
-                ipc_rcu_putref(shp);
+                ipc_rcu_putref(shp, ipc_rcu_free);
                return error;
        }
@@ -566,8 +574,7 @@ no_id:
                user_shm_unlock(size, shp->mlock_user);
        fput(file);
 no_file:
-        security_shm_free(shp);
+        ipc_rcu_putref(shp, shm_rcu_free);
-        ipc_rcu_putref(shp);
        return error;
 }
diff --git a/ipc/util.c b/ipc/util.c
index e829da9ed01f..fdb8ae740775 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -474,11 +474,6 @@ void ipc_free(void* ptr, int size)
                kfree(ptr);
 }
-struct ipc_rcu {
-        struct rcu_head rcu;
-        atomic_t refcount;
-} ____cacheline_aligned_in_smp;
 /**
 *      ipc_rcu_alloc   -       allocate ipc and rcu space 
 *      @size: size desired
@@ -505,27 +500,24 @@ int ipc_rcu_getref(void *ptr)
        return atomic_inc_not_zero(&p->refcount);
 }
-/**
+void ipc_rcu_putref(void *ptr, void (*func)(struct rcu_head *head))
- * ipc_schedule_free - free ipc + rcu space
- * @head: RCU callback structure for queued work
- */
-static void ipc_schedule_free(struct rcu_head *head)
-{
-        vfree(container_of(head, struct ipc_rcu, rcu));
-}
-void ipc_rcu_putref(void *ptr)
 {
        struct ipc_rcu *p = ((struct ipc_rcu *)ptr) - 1;
        if (!atomic_dec_and_test(&p->refcount))
                return;
-        if (is_vmalloc_addr(ptr)) {
+        call_rcu(&p->rcu, func);
-                call_rcu(&p->rcu, ipc_schedule_free);
+}
-        } else {
-                kfree_rcu(p, rcu);
+void ipc_rcu_free(struct rcu_head *head)
-        }
+{
+        struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu);
+        if (is_vmalloc_addr(p))
+                vfree(p);
+        else
+                kfree(p);
 }
 /**
diff --git a/ipc/util.h b/ipc/util.h
index c5f3338ba1fa..f2f5036f2eed 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -47,6 +47,13 @@ static inline void msg_exit_ns(struct ipc_namespace *ns) { }
 static inline void shm_exit_ns(struct ipc_namespace *ns) { }
 #endif
+struct ipc_rcu {
+        struct rcu_head rcu;
+        atomic_t refcount;
+} ____cacheline_aligned_in_smp;
+#define ipc_rcu_to_struct(p)  ((void *)(p+1))
 /*
 * Structure that holds the parameters needed by the ipc operations
 * (see after)
@@ -120,7 +127,8 @@ void ipc_free(void* ptr, int size);
 */
 void* ipc_rcu_alloc(int size);
 int ipc_rcu_getref(void *ptr);
-void ipc_rcu_putref(void *ptr);
+void ipc_rcu_putref(void *ptr, void (*func)(struct rcu_head *head));
+void ipc_rcu_free(struct rcu_head *head);
 struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int);
 struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id);
diff --git a/kernel/audit.c b/kernel/audit.c
index 91e53d04b6a9..7b0e23a740ce 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1117,9 +1117,10 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
                        sleep_time = timeout_start + audit_backlog_wait_time -
                                        jiffies;
-                        if ((long)sleep_time > 0)
+                        if ((long)sleep_time > 0) {
                                wait_for_auditd(sleep_time);
-                        continue;
+                                continue;
+                        }
                }
                if (audit_rate_check() && printk_ratelimit())
                        printk(KERN_WARNING
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dd236b66ca3a..cb4238e85b38 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3660,6 +3660,26 @@ static void calc_timer_values(struct perf_event *event,
        *running = ctx_time - event->tstamp_running;
 }
+static void perf_event_init_userpage(struct perf_event *event)
+{
+        struct perf_event_mmap_page *userpg;
+        struct ring_buffer *rb;
+        rcu_read_lock();
+        rb = rcu_dereference(event->rb);
+        if (!rb)
+                goto unlock;
+        userpg = rb->user_page;
+        /* Allow new userspace to detect that bit 0 is deprecated */
+        userpg->cap_bit0_is_deprecated = 1;
+        userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
+unlock:
+        rcu_read_unlock();
+}
 void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
 {
 }
@@ -4044,6 +4064,7 @@ again:
        ring_buffer_attach(event, rb);
        rcu_assign_pointer(event->rb, rb);
+        perf_event_init_userpage(event);
        perf_event_update_userpage(event);
 unlock:
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 269ed9384cc4..f813b3474646 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -32,7 +32,14 @@ EXPORT_SYMBOL(cad_pid);
 #endif
 enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
-int reboot_default;
+/*
+ * This variable is used privately to keep track of whether or not
+ * reboot_type is still set to its default value (i.e., reboot= hasn't
+ * been set on the command line).  This is needed so that we can
+ * suppress DMI scanning for reboot quirks.  Without it, it's
+ * impossible to override a faulty reboot quirk without recompiling.
+ */
+int reboot_default = 1;
 int reboot_cpu;
 enum reboot_type reboot_type = BOOT_ACPI;
 int reboot_force;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 11cd13667359..7c70201fbc61 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4242,7 +4242,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
        }
        if (!se) {
-                cfs_rq->h_load = rq->avg.load_avg_contrib;
+                cfs_rq->h_load = cfs_rq->runnable_load_avg;
                cfs_rq->last_h_load_update = now;
        }
@@ -4823,8 +4823,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
                (busiest->load_per_task * SCHED_POWER_SCALE) /
                busiest->group_power;
-        if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >=
+        if (busiest->avg_load + scaled_busy_load_per_task >=
-            (scaled_busy_load_per_task * imbn)) {
+            local->avg_load + (scaled_busy_load_per_task * imbn)) {
                env->imbalance = busiest->load_per_task;
                return;
        }
@@ -4896,7 +4896,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
         * max load less than avg load(as we skip the groups at or below
         * its cpu_power, while calculating max_load..)
         */
-        if (busiest->avg_load < sds->avg_load) {
+        if (busiest->avg_load <= sds->avg_load ||
+            local->avg_load >= sds->avg_load) {
                env->imbalance = 0;
                return fix_small_imbalance(env, sds);
        }
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 51c4f34d258e..4431610f049a 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -486,7 +486,52 @@ static struct smp_hotplug_thread watchdog_threads = {
        .unpark                 = watchdog_enable,
 };
-static int watchdog_enable_all_cpus(void)
+static void restart_watchdog_hrtimer(void *info)
+{
+        struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
+        int ret;
+        /*
+         * No need to cancel and restart hrtimer if it is currently executing
+         * because it will reprogram itself with the new period now.
+         * We should never see it unqueued here because we are running per-cpu
+         * with interrupts disabled.
+         */
+        ret = hrtimer_try_to_cancel(hrtimer);
+        if (ret == 1)
+                hrtimer_start(hrtimer, ns_to_ktime(sample_period),
+                                HRTIMER_MODE_REL_PINNED);
+}
+static void update_timers(int cpu)
+{
+        struct call_single_data data = {.func = restart_watchdog_hrtimer};
+        /*
+         * Make sure that perf event counter will adapt to a new
+         * sampling period. Updating the sampling period directly would
+         * be much nicer but we do not have an API for that now so
+         * let's use a big hammer.
+         * Hrtimer will adopt the new period on the next tick but this
+         * might be late already so we have to restart the timer as well.
+         */
+        watchdog_nmi_disable(cpu);
+        __smp_call_function_single(cpu, &data, 1);
+        watchdog_nmi_enable(cpu);
+}
+static void update_timers_all_cpus(void)
+{
+        int cpu;
+        get_online_cpus();
+        preempt_disable();
+        for_each_online_cpu(cpu)
+                update_timers(cpu);
+        preempt_enable();
+        put_online_cpus();
+}
+static int watchdog_enable_all_cpus(bool sample_period_changed)
 {
        int err = 0;
@@ -496,6 +541,8 @@ static int watchdog_enable_all_cpus(void)
                        pr_err("Failed to create watchdog threads, disabled\n");
                else
                        watchdog_running = 1;
+        } else if (sample_period_changed) {
+                update_timers_all_cpus();
        }
        return err;
@@ -520,13 +567,15 @@ int proc_dowatchdog(struct ctl_table *table, int write,
                    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
        int err, old_thresh, old_enabled;
+        static DEFINE_MUTEX(watchdog_proc_mutex);
+        mutex_lock(&watchdog_proc_mutex);
        old_thresh = ACCESS_ONCE(watchdog_thresh);
        old_enabled = ACCESS_ONCE(watchdog_user_enabled);
        err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (err || !write)
-                return err;
+                goto out;
        set_sample_period();
        /*
@@ -535,7 +584,7 @@ int proc_dowatchdog(struct ctl_table *table, int write,
         * watchdog_*_all_cpus() function takes care of this.
         */
        if (watchdog_user_enabled && watchdog_thresh)
-                err = watchdog_enable_all_cpus();
+                err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh);
        else
                watchdog_disable_all_cpus();
@@ -544,7 +593,8 @@ int proc_dowatchdog(struct ctl_table *table, int write,
                watchdog_thresh = old_thresh;
                watchdog_user_enabled = old_enabled;
        }
+out:
+        mutex_unlock(&watchdog_proc_mutex);
        return err;
 }
 #endif /* CONFIG_SYSCTL */
@@ -554,5 +604,5 @@ void __init lockup_detector_init(void)
        set_sample_period();
        if (watchdog_user_enabled)
-                watchdog_enable_all_cpus();
+                watchdog_enable_all_cpus(false);
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d5ff3ce13029..1c52ddbc839b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,6 +39,7 @@
 #include <linux/limits.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -160,6 +161,10 @@ struct mem_cgroup_per_zone {
        struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
+        struct rb_node          tree_node;      /* RB tree node */
+        unsigned long long      usage_in_excess;/* Set to the value by which */
+                                                /* the soft limit is exceeded*/
+        bool                    on_tree;
        struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
                                                /* use container_of        */
 };
@@ -168,6 +173,26 @@ struct mem_cgroup_per_node {
        struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
+/*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+struct mem_cgroup_tree_per_zone {
+        struct rb_root rb_root;
+        spinlock_t lock;
+};
+struct mem_cgroup_tree_per_node {
+        struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+struct mem_cgroup_tree {
+        struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 struct mem_cgroup_threshold {
        struct eventfd_ctx *eventfd;
        u64 threshold;
@@ -303,22 +328,6 @@ struct mem_cgroup {
        atomic_t        numainfo_events;
        atomic_t        numainfo_updating;
 #endif
-        /*
-         * Protects soft_contributed transitions.
-         * See mem_cgroup_update_soft_limit
-         */
-        spinlock_t soft_lock;
-        /*
-         * If true then this group has increased parents' children_in_excess
-         * when it got over the soft limit.
-         * When a group falls bellow the soft limit, parents' children_in_excess
-         * is decreased and soft_contributed changed to false.
-         */
-        bool soft_contributed;
-        /* Number of children that are in soft limit excess */
-        atomic_t children_in_excess;
        struct mem_cgroup_per_node *nodeinfo[0];
        /* WARNING: nodeinfo must be the last member here */
@@ -422,6 +431,7 @@ static bool move_file(void)
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
 #define MEM_CGROUP_MAX_RECLAIM_LOOPS            100
+#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
 enum charge_type {
        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -648,6 +658,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
        return mem_cgroup_zoneinfo(memcg, nid, zid);
 }
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+        return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+        int nid = page_to_nid(page);
+        int zid = page_zonenum(page);
+        return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+static void
+__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
+                                struct mem_cgroup_per_zone *mz,
+                                struct mem_cgroup_tree_per_zone *mctz,
+                                unsigned long long new_usage_in_excess)
+{
+        struct rb_node **p = &mctz->rb_root.rb_node;
+        struct rb_node *parent = NULL;
+        struct mem_cgroup_per_zone *mz_node;
+        if (mz->on_tree)
+                return;
+        mz->usage_in_excess = new_usage_in_excess;
+        if (!mz->usage_in_excess)
+                return;
+        while (*p) {
+                parent = *p;
+                mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+                                        tree_node);
+                if (mz->usage_in_excess < mz_node->usage_in_excess)
+                        p = &(*p)->rb_left;
+                /*
+                 * We can't avoid mem cgroups that are over their soft
+                 * limit by the same amount
+                 */
+                else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+                        p = &(*p)->rb_right;
+        }
+        rb_link_node(&mz->tree_node, parent, p);
+        rb_insert_color(&mz->tree_node, &mctz->rb_root);
+        mz->on_tree = true;
+}
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+                                struct mem_cgroup_per_zone *mz,
+                                struct mem_cgroup_tree_per_zone *mctz)
+{
+        if (!mz->on_tree)
+                return;
+        rb_erase(&mz->tree_node, &mctz->rb_root);
+        mz->on_tree = false;
+}
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+                                struct mem_cgroup_per_zone *mz,
+                                struct mem_cgroup_tree_per_zone *mctz)
+{
+        spin_lock(&mctz->lock);
+        __mem_cgroup_remove_exceeded(memcg, mz, mctz);
+        spin_unlock(&mctz->lock);
+}
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+{
+        unsigned long long excess;
+        struct mem_cgroup_per_zone *mz;
+        struct mem_cgroup_tree_per_zone *mctz;
+        int nid = page_to_nid(page);
+        int zid = page_zonenum(page);
+        mctz = soft_limit_tree_from_page(page);
+        /*
+         * Necessary to update all ancestors when hierarchy is used,
+         * because their event counter is not touched.
+         */
+        for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+                mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+                excess = res_counter_soft_limit_excess(&memcg->res);
+                /*
+                 * We have to update the tree if mz is on RB-tree or
+                 * mem is over its softlimit.
+                 */
+                if (excess || mz->on_tree) {
+                        spin_lock(&mctz->lock);
+                        /* if on-tree, remove it */
+                        if (mz->on_tree)
+                                __mem_cgroup_remove_exceeded(memcg, mz, mctz);
+                        /*
+                         * Insert again. mz->usage_in_excess will be updated.
+                         * If excess is 0, no tree ops.
+                         */
+                        __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
+                        spin_unlock(&mctz->lock);
+                }
+        }
+}
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
+{
+        int node, zone;
+        struct mem_cgroup_per_zone *mz;
+        struct mem_cgroup_tree_per_zone *mctz;
+        for_each_node(node) {
+                for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+                        mz = mem_cgroup_zoneinfo(memcg, node, zone);
+                        mctz = soft_limit_tree_node_zone(node, zone);
+                        mem_cgroup_remove_exceeded(memcg, mz, mctz);
+                }
+        }
+}
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+        struct rb_node *rightmost = NULL;
+        struct mem_cgroup_per_zone *mz;
+retry:
+        mz = NULL;
+        rightmost = rb_last(&mctz->rb_root);
+        if (!rightmost)
+                goto done;              /* Nothing to reclaim from */
+        mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+        /*
+         * Remove the node now but someone else can add it back,
+         * we will add it back at the end of reclaim to its correct
+         * position in the tree.
+         */
+        __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+        if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+                !css_tryget(&mz->memcg->css))
+                goto retry;
+done:
+        return mz;
+}
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+        struct mem_cgroup_per_zone *mz;
+        spin_lock(&mctz->lock);
+        mz = __mem_cgroup_largest_soft_limit_node(mctz);
+        spin_unlock(&mctz->lock);
+        return mz;
+}
 /*
 * Implementation Note: reading percpu statistics for memcg.
 *
@@ -822,48 +990,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 }
 /*
- * Called from rate-limited memcg_check_events when enough
- * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
- * that all the parents up the hierarchy will be notified that this group
- * is in excess or that it is not in excess anymore. mmecg->soft_contributed
- * makes the transition a single action whenever the state flips from one to
- * the other.
- */
-static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
-{
-        unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
-        struct mem_cgroup *parent = memcg;
-        int delta = 0;
-        spin_lock(&memcg->soft_lock);
-        if (excess) {
-                if (!memcg->soft_contributed) {
-                        delta = 1;
-                        memcg->soft_contributed = true;
-                }
-        } else {
-                if (memcg->soft_contributed) {
-                        delta = -1;
-                        memcg->soft_contributed = false;
-                }
-        }
-        /*
-         * Necessary to update all ancestors when hierarchy is used
-         * because their event counter is not touched.
-         * We track children even outside the hierarchy for the root
-         * cgroup because tree walk starting at root should visit
-         * all cgroups and we want to prevent from pointless tree
-         * walk if no children is below the limit.
-         */
-        while (delta && (parent = parent_mem_cgroup(parent)))
-                atomic_add(delta, &parent->children_in_excess);
-        if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
-                atomic_add(delta, &root_mem_cgroup->children_in_excess);
-        spin_unlock(&memcg->soft_lock);
-}
-/*
 * Check events in order.
 *
 */
@@ -886,7 +1012,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
                mem_cgroup_threshold(memcg);
                if (unlikely(do_softlimit))
-                        mem_cgroup_update_soft_limit(memcg);
+                        mem_cgroup_update_tree(memcg, page);
 #if MAX_NUMNODES > 1
                if (unlikely(do_numainfo))
                        atomic_inc(&memcg->numainfo_events);
@@ -929,15 +1055,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
        return memcg;
 }
-static enum mem_cgroup_filter_t
-mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
-                mem_cgroup_iter_filter cond)
-{
-        if (!cond)
-                return VISIT;
-        return cond(memcg, root);
-}
 /*
 * Returns a next (in a pre-order walk) alive memcg (with elevated css
 * ref. count) or NULL if the whole root's subtree has been visited.
@@ -945,7 +1062,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
 * helper function to be used by mem_cgroup_iter
 */
 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-                struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
+                struct mem_cgroup *last_visited)
 {
        struct cgroup_subsys_state *prev_css, *next_css;
@@ -963,31 +1080,11 @@ skip_node:
        if (next_css) {
                struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
-                switch (mem_cgroup_filter(mem, root, cond)) {
+                if (css_tryget(&mem->css))
-                case SKIP:
+                        return mem;
+                else {
                        prev_css = next_css;
                        goto skip_node;
-                case SKIP_TREE:
-                        if (mem == root)
-                                return NULL;
-                        /*
-                         * css_rightmost_descendant is not an optimal way to
-                         * skip through a subtree (especially for imbalanced
-                         * trees leaning to right) but that's what we have right
-                         * now. More effective solution would be traversing
-                         * right-up for first non-NULL without calling
-                         * css_next_descendant_pre afterwards.
-                         */
-                        prev_css = css_rightmost_descendant(next_css);
-                        goto skip_node;
-                case VISIT:
-                        if (css_tryget(&mem->css))
-                                return mem;
-                        else {
-                                prev_css = next_css;
-                                goto skip_node;
-                        }
-                        break;
                }
        }
@@ -1051,7 +1148,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
- * @cond: filter for visited nodes, NULL for no filter
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
@@ -1064,18 +1160,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same zone and priority.
 */
-struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                                   struct mem_cgroup *prev,
-                                   struct mem_cgroup_reclaim_cookie *reclaim,
+                                   struct mem_cgroup_reclaim_cookie *reclaim)
-                                   mem_cgroup_iter_filter cond)
 {
        struct mem_cgroup *memcg = NULL;
        struct mem_cgroup *last_visited = NULL;
-        if (mem_cgroup_disabled()) {
+        if (mem_cgroup_disabled())
-                /* first call must return non-NULL, second return NULL */
+                return NULL;
-                return (struct mem_cgroup *)(unsigned long)!prev;
-        }
        if (!root)
                root = root_mem_cgroup;
@@ -1086,9 +1179,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
        if (!root->use_hierarchy && root != root_mem_cgroup) {
                if (prev)
                        goto out_css_put;
-                if (mem_cgroup_filter(root, root, cond) == VISIT)
+                return root;
-                        return root;
-                return NULL;
        }
        rcu_read_lock();
@@ -1111,7 +1202,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
                        last_visited = mem_cgroup_iter_load(iter, root, &seq);
                }
-                memcg = __mem_cgroup_iter_next(root, last_visited, cond);
+                memcg = __mem_cgroup_iter_next(root, last_visited);
                if (reclaim) {
                        mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1122,11 +1213,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
                                reclaim->generation = iter->generation;
                }
-                /*
+                if (prev && !memcg)
-                 * We have finished the whole tree walk or no group has been
-                 * visited because filter told us to skip the root node.
-                 */
-                if (!memcg && (prev || (cond && !last_visited)))
                        goto out_unlock;
        }
 out_unlock:
@@ -1767,7 +1854,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
        return total;
 }
-#if MAX_NUMNODES > 1
 /**
 * test_mem_cgroup_node_reclaimable
 * @memcg: the target memcg
@@ -1790,6 +1876,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
        return false;
 }
+#if MAX_NUMNODES > 1
 /*
 * Always updating the nodemask is not very good - even if we have an empty
@@ -1857,50 +1944,104 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
        return node;
 }
+/*
+ * Check whether any node contains reclaimable pages.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not contain
+ * enough new information, so we need to double-check the other nodes.
+ */
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+{
+        int nid;
+        /*
+         * quick check...making use of scan_node.
+         * We can skip unused nodes.
+         */
+        if (!nodes_empty(memcg->scan_nodes)) {
+                for (nid = first_node(memcg->scan_nodes);
+                     nid < MAX_NUMNODES;
+                     nid = next_node(nid, memcg->scan_nodes)) {
+                        if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+                                return true;
+                }
+        }
+        /*
+         * Check rest of nodes.
+         */
+        for_each_node_state(nid, N_MEMORY) {
+                if (node_isset(nid, memcg->scan_nodes))
+                        continue;
+                if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+                        return true;
+        }
+        return false;
+}
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
        return 0;
 }
-#endif
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
-/*
- * A group is eligible for the soft limit reclaim under the given root
- * hierarchy if
- *      a) it is over its soft limit
- *      b) any parent up the hierarchy is over its soft limit
- *
- * If the given group doesn't have any children over the limit then it
- * doesn't make any sense to iterate its subtree.
- */
-enum mem_cgroup_filter_t
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
-                struct mem_cgroup *root)
 {
-        struct mem_cgroup *parent;
+        return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
+}
-        if (!memcg)
+#endif
-                memcg = root_mem_cgroup;
-        parent = memcg;
-        if (res_counter_soft_limit_excess(&memcg->res))
-                return VISIT;
-        /*
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
-         * If any parent up to the root in the hierarchy is over its soft limit
+                                   struct zone *zone,
-         * then we have to obey and reclaim from this group as well.
+                                   gfp_t gfp_mask,
-         */
+                                   unsigned long *total_scanned)
-        while ((parent = parent_mem_cgroup(parent))) {
+{
-                if (res_counter_soft_limit_excess(&parent->res))
+        struct mem_cgroup *victim = NULL;
-                        return VISIT;
+        int total = 0;
-                if (parent == root)
+        int loop = 0;
+        unsigned long excess;
+        unsigned long nr_scanned;
+        struct mem_cgroup_reclaim_cookie reclaim = {
+                .zone = zone,
+                .priority = 0,
+        };
+        excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
+        while (1) {
+                victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+                if (!victim) {
+                        loop++;
+                        if (loop >= 2) {
+                                /*
+                                 * If we have not been able to reclaim
+                                 * anything, it might be because there are
+                                 * no reclaimable pages under this hierarchy
+                                 */
+                                if (!total)
+                                        break;
+                                /*
+                                 * We want to do more targeted reclaim.
+                                 * excess >> 2 is not so large that we
+                                 * reclaim too much, nor so small that we keep
+                                 * coming back to reclaim from this cgroup
+                                 */
+                                if (total >= (excess >> 2) ||
+                                        (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
+                                        break;
+                        }
+                        continue;
+                }
+                if (!mem_cgroup_reclaimable(victim, false))
+                        continue;
+                total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+                                                     zone, &nr_scanned);
+                *total_scanned += nr_scanned;
+                if (!res_counter_soft_limit_excess(&root_memcg->res))
                        break;
        }
+        mem_cgroup_iter_break(root_memcg, victim);
-        if (!atomic_read(&memcg->children_in_excess))
+        return total;
-                return SKIP_TREE;
-        return SKIP;
 }
 static DEFINE_SPINLOCK(memcg_oom_lock);
@@ -2812,7 +2953,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
        unlock_page_cgroup(pc);
        /*
-         * "charge_statistics" updated event counter.
+         * "charge_statistics" updated event counter. Then, check it.
+         * Insert ancestor (and ancestor's ancestors) into the softlimit
+         * RB-tree if they exceed their softlimit.
         */
        memcg_check_events(memcg, page);
 }
@@ -4647,6 +4790,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
        return ret;
 }
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+                                            gfp_t gfp_mask,
+                                            unsigned long *total_scanned)
+{
+        unsigned long nr_reclaimed = 0;
+        struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+        unsigned long reclaimed;
+        int loop = 0;
+        struct mem_cgroup_tree_per_zone *mctz;
+        unsigned long long excess;
+        unsigned long nr_scanned;
+        if (order > 0)
+                return 0;
+        mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
+        /*
+         * This loop can run a while, especially if mem_cgroups continuously
+         * keep exceeding their soft limit and putting the system under
+         * pressure
+         */
+        do {
+                if (next_mz)
+                        mz = next_mz;
+                else
+                        mz = mem_cgroup_largest_soft_limit_node(mctz);
+                if (!mz)
+                        break;
+                nr_scanned = 0;
+                reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
+                                                    gfp_mask, &nr_scanned);
+                nr_reclaimed += reclaimed;
+                *total_scanned += nr_scanned;
+                spin_lock(&mctz->lock);
+                /*
+                 * If we failed to reclaim anything from this memory cgroup
+                 * it is time to move on to the next cgroup
+                 */
+                next_mz = NULL;
+                if (!reclaimed) {
+                        do {
+                                /*
+                                 * Loop until we find yet another one.
+                                 *
+                                 * By the time we get the soft_limit lock
+                                 * again, someone might have added the
+                                 * group back on the RB tree. Iterate to
+                                 * make sure we get a different mem.
+                                 * mem_cgroup_largest_soft_limit_node returns
+                                 * NULL if no other cgroup is present on
+                                 * the tree
+                                 */
+                                next_mz =
+                                __mem_cgroup_largest_soft_limit_node(mctz);
+                                if (next_mz == mz)
+                                        css_put(&next_mz->memcg->css);
+                                else /* next_mz == NULL or other memcg */
+                                        break;
+                        } while (1);
+                }
+                __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+                excess = res_counter_soft_limit_excess(&mz->memcg->res);
+                /*
+                 * One school of thought says that we should not add
+                 * back the node to the tree if reclaim returns 0.
+                 * But our reclaim could return 0, simply because due
+                 * to priority we are exposing a smaller subset of
+                 * memory to reclaim from. Consider this as a longer
+                 * term TODO.
+                 */
+                /* If excess == 0, no tree ops */
+                __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
+                spin_unlock(&mctz->lock);
+                css_put(&mz->memcg->css);
+                loop++;
+                /*
+                 * Could not reclaim anything and there are no more
+                 * mem cgroups to try or we seem to be looping without
+                 * reclaiming anything.
+                 */
+                if (!nr_reclaimed &&
+                        (next_mz == NULL ||
+                        loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+                        break;
+        } while (!nr_reclaimed);
+        if (next_mz)
+                css_put(&next_mz->memcg->css);
+        return nr_reclaimed;
+}
 /**
 * mem_cgroup_force_empty_list - clears LRU of a group
 * @memcg: group to clear
@@ -5911,6 +6146,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
        for (zone = 0; zone < MAX_NR_ZONES; zone++) {
                mz = &pn->zoneinfo[zone];
                lruvec_init(&mz->lruvec);
+                mz->usage_in_excess = 0;
+                mz->on_tree = false;
                mz->memcg = memcg;
        }
        memcg->nodeinfo[node] = pn;
@@ -5966,6 +6203,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
        int node;
        size_t size = memcg_size();
+        mem_cgroup_remove_from_trees(memcg);
        free_css_id(&mem_cgroup_subsys, &memcg->css);
        for_each_node(node)
@@ -6002,6 +6240,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
+/*
+ * Allocate and initialise the per-node, per-zone soft limit trees.
+ *
+ * For every node one mem_cgroup_tree_per_node is allocated and published
+ * in soft_limit_tree.rb_tree_per_node[]; each of its per-zone entries gets
+ * an empty rb-tree root and an initialised spinlock.
+ */
+static void __init mem_cgroup_soft_limit_tree_init(void)
+{
+        struct mem_cgroup_tree_per_node *rtpn;
+        struct mem_cgroup_tree_per_zone *rtpz;
+        int tmp, node, zone;
+        for_each_node(node) {
+                /*
+                 * Nodes without N_NORMAL_MEMORY get -1 as the allocation
+                 * node id instead of their own id.
+                 * NOTE(review): -1 presumably means "no node preference"
+                 * (NUMA_NO_NODE) - confirm.
+                 */
+                tmp = node;
+                if (!node_state(node, N_NORMAL_MEMORY))
+                        tmp = -1;
+                rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+                /* Allocation failure here is treated as fatal. */
+                BUG_ON(!rtpn);
+                soft_limit_tree.rb_tree_per_node[node] = rtpn;
+                for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+                        rtpz = &rtpn->rb_tree_per_zone[zone];
+                        rtpz->rb_root = RB_ROOT;
+                        spin_lock_init(&rtpz->lock);
+                }
+        }
+}
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -6031,7 +6292,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        mutex_init(&memcg->thresholds_lock);
        spin_lock_init(&memcg->move_lock);
        vmpressure_init(&memcg->vmpressure);
-        spin_lock_init(&memcg->soft_lock);
        return &memcg->css;
@@ -6109,13 +6369,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
        mem_cgroup_invalidate_reclaim_iterators(memcg);
        mem_cgroup_reparent_charges(memcg);
-        if (memcg->soft_contributed) {
-                while ((memcg = parent_mem_cgroup(memcg)))
-                        atomic_dec(&memcg->children_in_excess);
-                if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
-                        atomic_dec(&root_mem_cgroup->children_in_excess);
-        }
        mem_cgroup_destroy_all_caches(memcg);
        vmpressure_cleanup(&memcg->vmpressure);
 }
@@ -6790,6 +7043,7 @@ static int __init mem_cgroup_init(void)
 {
        hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
        enable_swap_cgroup();
+        mem_cgroup_soft_limit_tree_init();
        memcg_stock_init();
        return 0;
 }
diff --git a/mm/mlock.c b/mm/mlock.c
index d63802663242..67ba6da7d0e3 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -736,6 +736,7 @@ static int do_mlockall(int flags)
                /* Ignore errors */
                mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
+                cond_resched();
        }
 out:
        return 0;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8ed1b775bdc9..beb35778c69f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -139,23 +139,11 @@ static bool global_reclaim(struct scan_control *sc)
 {
        return !sc->target_mem_cgroup;
 }
-static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
-{
-        struct mem_cgroup *root = sc->target_mem_cgroup;
-        return !mem_cgroup_disabled() &&
-                mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE;
-}
 #else
 static bool global_reclaim(struct scan_control *sc)
 {
        return true;
 }
-static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
-{
-        return false;
-}
 #endif
 unsigned long zone_reclaimable_pages(struct zone *zone)
@@ -2176,11 +2164,9 @@ static inline bool should_continue_reclaim(struct zone *zone,
        }
 }
-static int
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
-__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 {
        unsigned long nr_reclaimed, nr_scanned;
-        int groups_scanned = 0;
        do {
                struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2188,17 +2174,15 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
                        .zone = zone,
                        .priority = sc->priority,
                };
-                struct mem_cgroup *memcg = NULL;
+                struct mem_cgroup *memcg;
-                mem_cgroup_iter_filter filter = (soft_reclaim) ?
-                        mem_cgroup_soft_reclaim_eligible : NULL;
                nr_reclaimed = sc->nr_reclaimed;
                nr_scanned = sc->nr_scanned;
-                while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) {
+                memcg = mem_cgroup_iter(root, NULL, &reclaim);
+                do {
                        struct lruvec *lruvec;
-                        groups_scanned++;
                        lruvec = mem_cgroup_zone_lruvec(zone, memcg);
                        shrink_lruvec(lruvec, sc);
@@ -2218,7 +2202,8 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
                                mem_cgroup_iter_break(root, memcg);
                                break;
                        }
-                }
+                        memcg = mem_cgroup_iter(root, memcg, &reclaim);
+                } while (memcg);
                vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
                           sc->nr_scanned - nr_scanned,
@@ -2226,37 +2211,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
        } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
                                         sc->nr_scanned - nr_scanned, sc));
-        return groups_scanned;
-}
-static void shrink_zone(struct zone *zone, struct scan_control *sc)
-{
-        bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc);
-        unsigned long nr_scanned = sc->nr_scanned;
-        int scanned_groups;
-        scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim);
-        /*
-         * memcg iterator might race with other reclaimer or start from
-         * a incomplete tree walk so the tree walk in __shrink_zone
-         * might have missed groups that are above the soft limit. Try
-         * another loop to catch up with others. Do it just once to
-         * prevent from reclaim latencies when other reclaimers always
-         * preempt this one.
-         */
-        if (do_soft_reclaim && !scanned_groups)
-                __shrink_zone(zone, sc, do_soft_reclaim);
-        /*
-         * No group is over the soft limit or those that are do not have
-         * pages in the zone we are reclaiming so we have to reclaim everybody
-         */
-        if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) {
-                __shrink_zone(zone, sc, false);
-                return;
-        }
 }
 /* Returns true if compaction should go ahead for a high-order request */
@@ -2320,6 +2274,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 {
        struct zoneref *z;
        struct zone *zone;
+        unsigned long nr_soft_reclaimed;
+        unsigned long nr_soft_scanned;
        bool aborted_reclaim = false;
        /*
@@ -2359,6 +2315,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                                        continue;
                                }
                        }
+                        /*
+                         * This steals pages from memory cgroups over softlimit
+                         * and returns the number of reclaimed pages and
+                         * scanned pages. This works for global memory pressure
+                         * and balancing, not for a memcg's limit.
+                         */
+                        nr_soft_scanned = 0;
+                        nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+                                                sc->order, sc->gfp_mask,
+                                                &nr_soft_scanned);
+                        sc->nr_reclaimed += nr_soft_reclaimed;
+                        sc->nr_scanned += nr_soft_scanned;
                        /* need some check for avoid more shrink_zone() */
                }
@@ -2952,6 +2920,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 {
        int i;
        int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
+        unsigned long nr_soft_reclaimed;
+        unsigned long nr_soft_scanned;
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
                .priority = DEF_PRIORITY,
@@ -3066,6 +3036,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                        sc.nr_scanned = 0;
+                        nr_soft_scanned = 0;
+                        /*
+                         * Call soft limit reclaim before calling shrink_zone.
+                         */
+                        nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+                                                        order, sc.gfp_mask,
+                                                        &nr_soft_scanned);
+                        sc.nr_reclaimed += nr_soft_reclaimed;
                        /*
                         * There should be no need to raise the scanning
                         * priority if enough pages are already being scanned
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 47016c304c84..66cad506b8a2 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -3975,8 +3975,8 @@ sub string_find_replace {
 # check for new externs in .h files.
                if ($realfile =~ /\.h$/ &&
                    $line =~ /^\+\s*(extern\s+)$Type\s*$Ident\s*\(/s) {
-                        if (WARN("AVOID_EXTERNS",
+                        if (CHK("AVOID_EXTERNS",
-                                 "extern prototypes should be avoided in .h files\n" . $herecurr) &&
+                                "extern prototypes should be avoided in .h files\n" . $herecurr) &&
                            $fix) {
                                $fixed[$linenr - 1] =~ s/(.*)\bextern\b\s*(.*)/$1$2/;
                        }
diff --git a/tools/lib/lk/debugfs.c b/tools/lib/lk/debugfs.c
index 099e7cd022e4..7c4347962353 100644
--- a/tools/lib/lk/debugfs.c
+++ b/tools/lib/lk/debugfs.c
@@ -5,7 +5,6 @@
 #include <stdbool.h>
 #include <sys/vfs.h>
 #include <sys/mount.h>
-#include <linux/magic.h>
 #include <linux/kernel.h>
 #include "debugfs.h"
diff --git a/tools/perf/arch/x86/util/tsc.c b/tools/perf/arch/x86/util/tsc.c
index 9570c2b0f83c..b2519e49424f 100644
--- a/tools/perf/arch/x86/util/tsc.c
+++ b/tools/perf/arch/x86/util/tsc.c
@@ -32,7 +32,7 @@ u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc)
 int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc,
                             struct perf_tsc_conversion *tc)
 {
-        bool cap_usr_time_zero;
+        bool cap_user_time_zero;
        u32 seq;
        int i = 0;
@@ -42,7 +42,7 @@ int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc,
                tc->time_mult = pc->time_mult;
                tc->time_shift = pc->time_shift;
                tc->time_zero = pc->time_zero;
-                cap_usr_time_zero = pc->cap_usr_time_zero;
+                cap_user_time_zero = pc->cap_user_time_zero;
                rmb();
                if (pc->lock == seq && !(seq & 1))
                        break;
@@ -52,7 +52,7 @@ int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc,
                }
        }
-        if (!cap_usr_time_zero)
+        if (!cap_user_time_zero)
                return -EOPNOTSUPP;
        return 0;
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 423875c999b2..afe377b2884f 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -321,8 +321,6 @@ found:
        return perf_event__repipe(tool, event_sw, &sample_sw, machine);
 }
-extern volatile int session_done;
 static void sig_handler(int sig __maybe_unused)
 {
        session_done = 1;
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 8e50d8d77419..72eae7498c09 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -401,8 +401,6 @@ static int perf_report__setup_sample_type(struct perf_report *rep)
        return 0;
 }
-extern volatile int session_done;
 static void sig_handler(int sig __maybe_unused)
 {
        session_done = 1;
@@ -568,6 +566,9 @@ static int __cmd_report(struct perf_report *rep)
                }
        }
+        if (session_done())
+                return 0;
        if (nr_samples == 0) {
                ui__error("The %s file has no samples!\n", session->filename);
                return 0;
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 7f31a3ded1b6..9c333ff3dfeb 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -553,8 +553,6 @@ static struct perf_tool perf_script = {
        .ordering_requires_timestamps = true,
 };
-extern volatile int session_done;
 static void sig_handler(int sig __maybe_unused)
 {
        session_done = 1;
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index f5aa6375e3e9..fd4853404727 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -16,6 +16,23 @@
 #include <sys/mman.h>
 #include <linux/futex.h>
+/* For older distros: */
+#ifndef MAP_STACK
+# define MAP_STACK              0x20000
+#endif
+#ifndef MADV_HWPOISON
+# define MADV_HWPOISON          100
+#endif
+#ifndef MADV_MERGEABLE
+# define MADV_MERGEABLE         12
+#endif
+#ifndef MADV_UNMERGEABLE
+# define MADV_UNMERGEABLE       13
+#endif
 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
                                         unsigned long arg,
                                         u8 arg_idx __maybe_unused,
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile
index 214e17e97e5c..346ee929d250 100644
--- a/tools/perf/config/Makefile
+++ b/tools/perf/config/Makefile
@@ -180,6 +180,9 @@ FLAGS_LIBELF=$(CFLAGS) $(LDFLAGS) $(EXTLIBS)
 ifeq ($(call try-cc,$(SOURCE_ELF_MMAP),$(FLAGS_LIBELF),-DLIBELF_MMAP),y)
  CFLAGS += -DLIBELF_MMAP
 endif
+ifeq ($(call try-cc,$(SOURCE_ELF_GETPHDRNUM),$(FLAGS_LIBELF),-DHAVE_ELF_GETPHDRNUM),y)
+  CFLAGS += -DHAVE_ELF_GETPHDRNUM
+endif
 # include ARCH specific config
 -include $(src-perf)/arch/$(ARCH)/Makefile
diff --git a/tools/perf/config/feature-tests.mak b/tools/perf/config/feature-tests.mak
index 708fb8e9822a..d5a8dd44945f 100644
--- a/tools/perf/config/feature-tests.mak
+++ b/tools/perf/config/feature-tests.mak
@@ -61,6 +61,15 @@ int main(void)
 }
 endef
+define SOURCE_ELF_GETPHDRNUM
+#include <libelf.h>
+int main(void)
+{
+        size_t dst;
+        return elf_getphdrnum(0, &dst);
+}
+endef
 ifndef NO_SLANG
 define SOURCE_SLANG
 #include <slang.h>
@@ -210,6 +219,7 @@ define SOURCE_LIBAUDIT
 int main(void)
 {
+        printf(\"error message: %s\n\", audit_errno_to_name(0));
        return audit_open();
 }
 endef
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index bfc5a27597d6..7eae5488ecea 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -809,7 +809,7 @@ static int symbol__parse_objdump_line(struct symbol *sym, struct map *map,
                    end = map__rip_2objdump(map, sym->end);
                offset = line_ip - start;
-                if (offset < 0 || (u64)line_ip > end)
+                if ((u64)line_ip < start || (u64)line_ip > end)
                        offset = -1;
                else
                        parsed_line = tmp2 + 1;
diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c
index 3e5f5430a28a..e23bde19d590 100644
--- a/tools/perf/util/dwarf-aux.c
+++ b/tools/perf/util/dwarf-aux.c
@@ -263,6 +263,21 @@ bool die_is_signed_type(Dwarf_Die *tp_die)
 }
 /**
+ * die_is_func_def - Ensure that this DIE is a subprogram and definition
+ * @dw_die: a DIE
+ *
+ * Ensure that this DIE is a subprogram and NOT a declaration. This
+ * returns true if @dw_die is a function definition.
+ **/
+bool die_is_func_def(Dwarf_Die *dw_die)
+{
+        /* Scratch storage required by dwarf_attr(); its contents are never read. */
+        Dwarf_Attribute attr;
+        /*
+         * A definition is a DW_TAG_subprogram DIE for which looking up
+         * DW_AT_declaration yields NULL.
+         */
+        return (dwarf_tag(dw_die) == DW_TAG_subprogram &&
+                dwarf_attr(dw_die, DW_AT_declaration, &attr) == NULL);
+}
+/**
 * die_get_data_member_location - Get the data-member offset
 * @mb_die: a DIE of a member of a data structure
 * @offs: The offset of the member in the data structure
@@ -392,6 +407,10 @@ static int __die_search_func_cb(Dwarf_Die *fn_die, void *data)
 {
        struct __addr_die_search_param *ad = data;
+        /*
+         * Since a declaration entry doesn't have a given pc, this always
+         * returns a function definition entry.
+         */
        if (dwarf_tag(fn_die) == DW_TAG_subprogram &&
            dwarf_haspc(fn_die, ad->addr)) {
                memcpy(ad->die_mem, fn_die, sizeof(Dwarf_Die));
diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h
index 6ce1717784b7..8658d41697d2 100644
--- a/tools/perf/util/dwarf-aux.h
+++ b/tools/perf/util/dwarf-aux.h
@@ -38,6 +38,9 @@ extern int cu_find_lineinfo(Dwarf_Die *cudie, unsigned long addr,
 extern int cu_walk_functions_at(Dwarf_Die *cu_die, Dwarf_Addr addr,
                        int (*callback)(Dwarf_Die *, void *), void *data);
+/* Ensure that this DIE is a subprogram and definition (not declaration) */
+extern bool die_is_func_def(Dwarf_Die *dw_die);
 /* Compare diename and tname */
 extern bool die_compare_name(Dwarf_Die *dw_die, const char *tname);
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 26441d0e571b..ce69901176d8 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -199,9 +199,11 @@ static int write_buildid(char *name, size_t name_len, u8 *build_id,
        return write_padded(fd, name, name_len + 1, len);
 }
-static int __dsos__write_buildid_table(struct list_head *head, pid_t pid,
+static int __dsos__write_buildid_table(struct list_head *head,
-                                u16 misc, int fd)
+                                       struct machine *machine,
+                                       pid_t pid, u16 misc, int fd)
 {
+        char nm[PATH_MAX];
        struct dso *pos;
        dsos__for_each_with_build_id(pos, head) {
@@ -215,6 +217,10 @@ static int __dsos__write_buildid_table(struct list_head *head, pid_t pid,
                if (is_vdso_map(pos->short_name)) {
                        name = (char *) VDSO__MAP_NAME;
                        name_len = sizeof(VDSO__MAP_NAME) + 1;
+                } else if (dso__is_kcore(pos)) {
+                        machine__mmap_name(machine, nm, sizeof(nm));
+                        name = nm;
+                        name_len = strlen(nm) + 1;
                } else {
                        name = pos->long_name;
                        name_len = pos->long_name_len + 1;
@@ -240,10 +246,10 @@ static int machine__write_buildid_table(struct machine *machine, int fd)
                umisc = PERF_RECORD_MISC_GUEST_USER;
        }
-        err = __dsos__write_buildid_table(&machine->kernel_dsos, machine->pid,
+        err = __dsos__write_buildid_table(&machine->kernel_dsos, machine,
-                                          kmisc, fd);
+                                          machine->pid, kmisc, fd);
        if (err == 0)
-                err = __dsos__write_buildid_table(&machine->user_dsos,
+                err = __dsos__write_buildid_table(&machine->user_dsos, machine,
                                                  machine->pid, umisc, fd);
        return err;
 }
@@ -375,23 +381,31 @@ out_free:
        return err;
 }
-static int dso__cache_build_id(struct dso *dso, const char *debugdir)
+static int dso__cache_build_id(struct dso *dso, struct machine *machine,
+                               const char *debugdir)
 {
        bool is_kallsyms = dso->kernel && dso->long_name[0] != '/';
        bool is_vdso = is_vdso_map(dso->short_name);
+        char *name = dso->long_name;
+        char nm[PATH_MAX];
-        return build_id_cache__add_b(dso->build_id, sizeof(dso->build_id),
+        if (dso__is_kcore(dso)) {
-                                     dso->long_name, debugdir,
+                is_kallsyms = true;
-                                     is_kallsyms, is_vdso);
+                machine__mmap_name(machine, nm, sizeof(nm));
+                name = nm;
+        }
+        return build_id_cache__add_b(dso->build_id, sizeof(dso->build_id), name,
+                                     debugdir, is_kallsyms, is_vdso);
 }
-static int __dsos__cache_build_ids(struct list_head *head, const char *debugdir)
+static int __dsos__cache_build_ids(struct list_head *head,
+                                   struct machine *machine, const char *debugdir)
 {
        struct dso *pos;
        int err = 0;
        dsos__for_each_with_build_id(pos, head)
-                if (dso__cache_build_id(pos, debugdir))
+                if (dso__cache_build_id(pos, machine, debugdir))
                        err = -1;
        return err;
@@ -399,8 +413,9 @@ static int __dsos__cache_build_ids(struct list_head *head, const char *debugdir)
 static int machine__cache_build_ids(struct machine *machine, const char *debugdir)
 {
-        int ret = __dsos__cache_build_ids(&machine->kernel_dsos, debugdir);
+        int ret = __dsos__cache_build_ids(&machine->kernel_dsos, machine,
-        ret |= __dsos__cache_build_ids(&machine->user_dsos, debugdir);
+                                          debugdir);
+        ret |= __dsos__cache_build_ids(&machine->user_dsos, machine, debugdir);
        return ret;
 }
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 46a0d35a05e1..9ff6cf3e9a99 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -611,6 +611,8 @@ void hists__collapse_resort(struct hists *hists)
        next = rb_first(root);
        while (next) {
+                if (session_done())
+                        break;
                n = rb_entry(next, struct hist_entry, rb_node_in);
                next = rb_next(&n->rb_node_in);
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index be0329394d56..20c7299a9d4e 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -734,7 +734,7 @@ static int call_probe_finder(Dwarf_Die *sc_die, struct probe_finder *pf)
        }
        /* If not a real subprogram, find a real one */
-        if (dwarf_tag(sc_die) != DW_TAG_subprogram) {
+        if (!die_is_func_def(sc_die)) {
                if (!die_find_realfunc(&pf->cu_die, pf->addr, &pf->sp_die)) {
                        pr_warning("Failed to find probe point in any "
                                   "functions.\n");
@@ -980,12 +980,10 @@ static int probe_point_search_cb(Dwarf_Die *sp_die, void *data)
        struct dwarf_callback_param *param = data;
        struct probe_finder *pf = param->data;
        struct perf_probe_point *pp = &pf->pev->point;
-        Dwarf_Attribute attr;
        /* Check tag and diename */
-        if (dwarf_tag(sp_die) != DW_TAG_subprogram ||
+        if (!die_is_func_def(sp_die) ||
-            !die_compare_name(sp_die, pp->function) ||
+            !die_compare_name(sp_die, pp->function))
-            dwarf_attr(sp_die, DW_AT_declaration, &attr))
                return DWARF_CB_OK;
        /* Check declared file */
@@ -1474,7 +1472,7 @@ static int line_range_inline_cb(Dwarf_Die *in_die, void *data)
        return 0;
 }
-/* Search function from function name */
+/* Search function definition from function name */
 static int line_range_search_cb(Dwarf_Die *sp_die, void *data)
 {
        struct dwarf_callback_param *param = data;
@@ -1485,7 +1483,7 @@ static int line_range_search_cb(Dwarf_Die *sp_die, void *data)
        if (lr->file && strtailcmp(lr->file, dwarf_decl_file(sp_die)))
                return DWARF_CB_OK;
-        if (dwarf_tag(sp_die) == DW_TAG_subprogram &&
+        if (die_is_func_def(sp_die) &&
            die_compare_name(sp_die, lr->function)) {
                lf->fname = dwarf_decl_file(sp_die);
                dwarf_decl_line(sp_die, &lr->offset);
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 51f5edf2a6d0..70ffa41518f3 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -531,6 +531,9 @@ static int flush_sample_queue(struct perf_session *s,
                return 0;
        list_for_each_entry_safe(iter, tmp, head, list) {
+                if (session_done())
+                        return 0;
                if (iter->timestamp > limit)
                        break;
@@ -1160,7 +1163,6 @@ static void perf_session__warn_about_errors(const struct perf_session *session,
        }
 }
-#define session_done()  (*(volatile int *)(&session_done))
 volatile int session_done;
 static int __perf_session__process_pipe_events(struct perf_session *self,
@@ -1372,10 +1374,13 @@ more:
                                    "Processing events...");
        }
+        err = 0;
+        if (session_done())
+                goto out_err;
        if (file_pos < file_size)
                goto more;
-        err = 0;
        /* do the final flush for ordered samples */
        session->ordered_samples.next_flush = ULLONG_MAX;
        err = flush_sample_queue(session, tool);
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 3aa75fb2225f..04bf7373a7e5 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -124,4 +124,8 @@ int __perf_session__set_tracepoints_handlers(struct perf_session *session,
 #define perf_session__set_tracepoints_handlers(session, array) \
        __perf_session__set_tracepoints_handlers(session, array, ARRAY_SIZE(array))
+extern volatile int session_done;
+#define session_done()  (*(volatile int *)(&session_done))
 #endif /* __PERF_SESSION_H */
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index a7b9ab557380..a9c829be5216 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -8,6 +8,22 @@
 #include "symbol.h"
 #include "debug.h"
+/*
+ * Local fallback for elf_getphdrnum(), compiled only when the build did
+ * not define HAVE_ELF_GETPHDRNUM.
+ *
+ * @elf: ELF descriptor to query
+ * @dst: receives the e_phnum field of the ELF header
+ *
+ * Returns 0 on success, or -1 if the ELF header cannot be read (in which
+ * case *dst is left untouched).
+ *
+ * NOTE(review): e_phnum is copied verbatim; confirm whether files using
+ * an extended program header count need handling here.
+ */
+#ifndef HAVE_ELF_GETPHDRNUM
+static int elf_getphdrnum(Elf *elf, size_t *dst)
+{
+        GElf_Ehdr gehdr;
+        GElf_Ehdr *ehdr;
+        ehdr = gelf_getehdr(elf, &gehdr);
+        if (!ehdr)
+                return -1;
+        *dst = ehdr->e_phnum;
+        return 0;
+}
+#endif
 #ifndef NT_GNU_BUILD_ID
 #define NT_GNU_BUILD_ID 3
 #endif
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index fe7a27d67d2b..e9e1c03f927d 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -186,7 +186,7 @@ void parse_proc_kallsyms(struct pevent *pevent,
        char *next = NULL;
        char *addr_str;
        char *mod;
-        char *fmt;
+        char *fmt = NULL;
        line = strtok_r(file, "\n", &next);
        while (line) {