47 files changed, 2534 insertions, 413 deletions
diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index 5d7a16eab312..af71d38c8e41 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -189,9 +189,21 @@ callback_init(void * kernel_end)
        if (alpha_using_srm) {
                static struct vm_struct console_remap_vm;
-                unsigned long vaddr = VMALLOC_START;
+                unsigned long nr_pages = 0;
+                unsigned long vaddr;
                unsigned long i, j;
+                /* calculate needed size */
+                for (i = 0; i < crb->map_entries; ++i)
+                        nr_pages += crb->map[i].count;
+                /* register the vm area */
+                console_remap_vm.flags = VM_ALLOC;
+                console_remap_vm.size = nr_pages << PAGE_SHIFT;
+                vm_area_register_early(&console_remap_vm, PAGE_SIZE);
+                vaddr = (unsigned long)console_remap_vm.addr;
                /* Set up the third level PTEs and update the virtual
                   addresses of the CRB entries.  */
                for (i = 0; i < crb->map_entries; ++i) {
@@ -213,12 +225,6 @@ callback_init(void * kernel_end)
                                vaddr += PAGE_SIZE;
                        }
                }
-                /* Let vmalloc know that we've allocated some space.  */
-                console_remap_vm.flags = VM_ALLOC;
-                console_remap_vm.addr = (void *) VMALLOC_START;
-                console_remap_vm.size = vaddr - VMALLOC_START;
-                vmlist = &console_remap_vm;
        }
        callback_init_done = 1;
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index b189680d18b0..05fe3053dcae 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -181,7 +181,7 @@ source "kernel/Kconfig.preempt"
 config QUICKLIST
        def_bool y
-config HAVE_ARCH_BOOTMEM_NODE
+config HAVE_ARCH_BOOTMEM
        def_bool n
 config ARCH_HAVE_MEMORY_PRESENT
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 469f3450bf81..31758378bcd2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -138,6 +138,9 @@ config ARCH_HAS_CACHE_LINE_SIZE
 config HAVE_SETUP_PER_CPU_AREA
        def_bool y
+config HAVE_DYNAMIC_PER_CPU_AREA
+        def_bool y
 config HAVE_CPUMASK_OF_CPU_MAP
        def_bool X86_64_SMP
@@ -780,6 +783,11 @@ config X86_MCE_AMD
           Additional support for AMD specific MCE features such as
           the DRAM Error Threshold.
+config X86_MCE_THRESHOLD
+        depends on X86_MCE_AMD || X86_MCE_INTEL
+        bool
+        default y
 config X86_MCE_NONFATAL
        tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
        depends on X86_32 && X86_MCE
@@ -1125,7 +1133,7 @@ config NODES_SHIFT
          Specify the maximum number of NUMA Nodes available on the target
          system.  Increases memory reserved to accomodate various tables.
-config HAVE_ARCH_BOOTMEM_NODE
+config HAVE_ARCH_BOOTMEM
        def_bool y
        depends on X86_32 && NUMA
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 63134e31e8b9..bc9514fb3b13 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -53,6 +53,7 @@
 #define         APIC_ESR_SENDILL        0x00020
 #define         APIC_ESR_RECVILL        0x00040
 #define         APIC_ESR_ILLREGA        0x00080
+#define         APIC_LVTCMCI    0x2f0
 #define APIC_ICR        0x300
 #define         APIC_DEST_SELF          0x40000
 #define         APIC_DEST_ALLINC        0x80000
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 2f8466540fb5..5b301b7ff5f4 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -5,24 +5,43 @@
 #include <linux/mm.h>
 /* Caches aren't brain-dead on the intel. */
-#define flush_cache_all()                       do { } while (0)
+static inline void flush_cache_all(void) { }
-#define flush_cache_mm(mm)                      do { } while (0)
+static inline void flush_cache_mm(struct mm_struct *mm) { }
-#define flush_cache_dup_mm(mm)                  do { } while (0)
+static inline void flush_cache_dup_mm(struct mm_struct *mm) { }
-#define flush_cache_range(vma, start, end)      do { } while (0)
+static inline void flush_cache_range(struct vm_area_struct *vma,
-#define flush_cache_page(vma, vmaddr, pfn)      do { } while (0)
+                                     unsigned long start, unsigned long end) { }
-#define flush_dcache_page(page)                 do { } while (0)
+static inline void flush_cache_page(struct vm_area_struct *vma,
-#define flush_dcache_mmap_lock(mapping)         do { } while (0)
+                                    unsigned long vmaddr, unsigned long pfn) { }
-#define flush_dcache_mmap_unlock(mapping)       do { } while (0)
+static inline void flush_dcache_page(struct page *page) { }
-#define flush_icache_range(start, end)          do { } while (0)
+static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
-#define flush_icache_page(vma, pg)              do { } while (0)
+static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
-#define flush_icache_user_range(vma, pg, adr, len)      do { } while (0)
+static inline void flush_icache_range(unsigned long start,
-#define flush_cache_vmap(start, end)            do { } while (0)
+                                      unsigned long end) { }
-#define flush_cache_vunmap(start, end)          do { } while (0)
+static inline void flush_icache_page(struct vm_area_struct *vma,
+                                     struct page *page) { }
+static inline void flush_icache_user_range(struct vm_area_struct *vma,
+                                           struct page *page,
+                                           unsigned long addr,
+                                           unsigned long len) { }
+static inline void flush_cache_vmap(unsigned long start, unsigned long end) { }
+static inline void flush_cache_vunmap(unsigned long start,
+                                      unsigned long end) { }
-#define copy_to_user_page(vma, page, vaddr, dst, src, len)      \
+static inline void copy_to_user_page(struct vm_area_struct *vma,
-        memcpy((dst), (src), (len))
+                                     struct page *page, unsigned long vaddr,
-#define copy_from_user_page(vma, page, vaddr, dst, src, len)    \
+                                     void *dst, const void *src,
-        memcpy((dst), (src), (len))
+                                     unsigned long len)
+{
+        memcpy(dst, src, len);
+}
+static inline void copy_from_user_page(struct vm_area_struct *vma,
+                                       struct page *page, unsigned long vaddr,
+                                       void *dst, const void *src,
+                                       unsigned long len)
+{
+        memcpy(dst, src, len);
+}
 #define PG_non_WB                               PG_arch_1
 PAGEFLAG(NonWB, non_WB)
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 48f0004db8c9..71c9e5183982 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -172,7 +172,13 @@ static inline void __save_init_fpu(struct task_struct *tsk)
 #else  /* CONFIG_X86_32 */
-extern void finit(void);
+#ifdef CONFIG_MATH_EMULATION
+extern void finit_task(struct task_struct *tsk);
+#else
+static inline void finit_task(struct task_struct *tsk)
+{
+}
+#endif
 static inline void tolerant_fwait(void)
 {
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 683d0b4c00fc..e5383e3d2f8c 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -172,8 +172,6 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
 extern void iounmap(volatile void __iomem *addr);
-extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
 #ifdef CONFIG_X86_32
 # include "io_32.h"
@@ -198,7 +196,6 @@ extern void early_ioremap_reset(void);
 extern void __iomem *early_ioremap(unsigned long offset, unsigned long size);
 extern void __iomem *early_memremap(unsigned long offset, unsigned long size);
 extern void early_iounmap(void __iomem *addr, unsigned long size);
-extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
 #define IO_SPACE_LIMIT 0xffff
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 32c6e17b960b..563933e06a35 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -11,6 +11,8 @@
 */
 #define MCG_CTL_P        (1UL<<8)   /* MCG_CAP register available */
+#define MCG_EXT_P        (1ULL<<9)   /* Extended registers available */
+#define MCG_CMCI_P       (1ULL<<10)  /* CMCI supported */
 #define MCG_STATUS_RIPV  (1UL<<0)   /* restart ip valid */
 #define MCG_STATUS_EIPV  (1UL<<1)   /* ip points to correct instruction */
@@ -90,14 +92,29 @@ extern int mce_disabled;
 #include <asm/atomic.h>
+void mce_setup(struct mce *m);
 void mce_log(struct mce *m);
 DECLARE_PER_CPU(struct sys_device, device_mce);
 extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
+/*
+ * Supporting more than 128 banks would first require escaping the
+ * predefined extended banks that Linux defines.
+ */
+#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
 #ifdef CONFIG_X86_MCE_INTEL
 void mce_intel_feature_init(struct cpuinfo_x86 *c);
+void cmci_clear(void);
+void cmci_reenable(void);
+void cmci_rediscover(int dying);
+void cmci_recheck(void);
 #else
 static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
+static inline void cmci_clear(void) {}
+static inline void cmci_reenable(void) {}
+static inline void cmci_rediscover(int dying) {}
+static inline void cmci_recheck(void) {}
 #endif
 #ifdef CONFIG_X86_MCE_AMD
@@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
 static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
 #endif
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
+extern int mce_available(struct cpuinfo_x86 *c);
+void mce_log_therm_throt_event(__u64 status);
 extern atomic_t mce_entry;
 extern void do_machine_check(struct pt_regs *, long);
+typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
+DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
+enum mcp_flags {
+        MCP_TIMESTAMP = (1 << 0),       /* log time stamp */
+        MCP_UC = (1 << 1),              /* log uncorrected errors */
+};
+extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
 extern int mce_notify_user(void);
 #endif /* !CONFIG_X86_32 */
@@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c);
 #else
 #define mcheck_init(c) do { } while (0)
 #endif
-extern void stop_mce(void);
-extern void restart_mce(void);
+extern void (*mce_threshold_vector)(void);
 #endif /* __KERNEL__ */
 #endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 105fb90a0635..ede6998bd92c 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -91,46 +91,9 @@ static inline int pfn_valid(int pfn)
 #endif /* CONFIG_DISCONTIGMEM */
 #ifdef CONFIG_NEED_MULTIPLE_NODES
+/* always use node 0 for bootmem on this numa platform */
-/*
+#define bootmem_arch_preferred_node(__bdata, size, align, goal, limit)  \
- * Following are macros that are specific to this numa platform.
+        (NODE_DATA(0)->bdata)
- */
-#define reserve_bootmem(addr, size, flags) \
-        reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
-#define alloc_bootmem(x) \
-        __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_nopanic(x) \
-        __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
-                                __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low(x) \
-        __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
-#define alloc_bootmem_pages(x) \
-        __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_pages_nopanic(x) \
-        __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
-                                __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low_pages(x) \
-        __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
-#define alloc_bootmem_node(pgdat, x)                                    \
-({                                                                      \
-        struct pglist_data  __maybe_unused                      \
-                                *__alloc_bootmem_node__pgdat = (pgdat); \
-        __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES,        \
-                                                __pa(MAX_DMA_ADDRESS)); \
-})
-#define alloc_bootmem_pages_node(pgdat, x)                              \
-({                                                                      \
-        struct pglist_data  __maybe_unused                      \
-                                *__alloc_bootmem_node__pgdat = (pgdat); \
-        __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE,              \
-                                                __pa(MAX_DMA_ADDRESS)); \
-})
-#define alloc_bootmem_low_pages_node(pgdat, x)                          \
-({                                                                      \
-        struct pglist_data  __maybe_unused                      \
-                                *__alloc_bootmem_node__pgdat = (pgdat); \
-        __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0);          \
-})
 #endif /* CONFIG_NEED_MULTIPLE_NODES */
 #endif /* _ASM_X86_MMZONE_32_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 358acc59ae04..2dbd2314139e 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -77,6 +77,11 @@
 #define MSR_IA32_MC0_ADDR               0x00000402
 #define MSR_IA32_MC0_MISC               0x00000403
+/* These are consecutive and not in the normal 4-register MCE bank blocks */
+#define MSR_IA32_MC0_CTL2               0x00000280
+#define CMCI_EN                 (1ULL << 30)
+#define CMCI_THRESHOLD_MASK             0xffffULL
 #define MSR_P6_PERFCTR0                 0x000000c1
 #define MSR_P6_PERFCTR1                 0x000000c2
 #define MSR_P6_EVNTSEL0                 0x00000186
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index aee103b26d01..8f1d2fbec1d4 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -43,6 +43,14 @@
 #else /* ...!ASSEMBLY */
 #include <linux/stringify.h>
+#include <asm/sections.h>
+#define __addr_to_pcpu_ptr(addr)                                        \
+        (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr  \
+                 + (unsigned long)__per_cpu_start)
+#define __pcpu_ptr_to_addr(ptr)                                         \
+        (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr   \
+                 - (unsigned long)__per_cpu_start)
 #ifdef CONFIG_SMP
 #define __percpu_arg(x)         "%%"__stringify(__percpu_seg)":%P" #x
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1c097a3a6669..d0812e155f1d 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -288,6 +288,8 @@ static inline int is_new_memtype_allowed(unsigned long flags,
        return 1;
 }
+pmd_t *populate_extra_pmd(unsigned long vaddr);
+pte_t *populate_extra_pte(unsigned long vaddr);
 #endif  /* __ASSEMBLY__ */
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 4bd990ee43df..1a918dde46b5 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -164,6 +164,7 @@ static inline pte_t __pte_ma(pteval_t x)
 xmaddr_t arbitrary_virt_to_machine(void *address);
+unsigned long arbitrary_virt_to_mfn(void *vaddr);
 void make_lowmem_page_readonly(void *vaddr);
 void make_lowmem_page_readwrite(void *vaddr);
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 6907b8e85d52..4c80f1557433 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -414,9 +414,17 @@ void __init alternative_instructions(void)
           that might execute the to be patched code.
           Other CPUs are not running. */
        stop_nmi();
-#ifdef CONFIG_X86_MCE
-        stop_mce();
+        /*
-#endif
+         * Don't stop machine check exceptions while patching.
+         * MCEs only happen when something got corrupted and in this
+         * case we must do something about the corruption.
+         * Ignoring it is worse than an unlikely patching race.
+         * Also machine checks tend to be broadcast and if one CPU
+         * goes into machine check the others follow quickly, so we don't
+         * expect a machine check to cause undue problems during code
+         * patching.
+         */
        apply_alternatives(__alt_instructions, __alt_instructions_end);
@@ -456,9 +464,6 @@ void __init alternative_instructions(void)
                                (unsigned long)__smp_locks_end);
        restart_nmi();
-#ifdef CONFIG_X86_MCE
-        restart_mce();
-#endif
 }
 /**
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f9cecdfd05c5..30909a258d0f 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -46,6 +46,7 @@
 #include <asm/idle.h>
 #include <asm/mtrr.h>
 #include <asm/smp.h>
+#include <asm/mce.h>
 unsigned int num_processors;
@@ -842,6 +843,14 @@ void clear_local_APIC(void)
                apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
        }
 #endif
+#ifdef CONFIG_X86_MCE_INTEL
+        if (maxlvt >= 6) {
+                v = apic_read(APIC_LVTCMCI);
+                if (!(v & APIC_LVT_MASKED))
+                        apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
+        }
+#endif
        /*
         * Clean APIC state for other OSs:
         */
@@ -1241,6 +1250,12 @@ void __cpuinit setup_local_APIC(void)
        apic_write(APIC_LVT1, value);
        preempt_enable();
+#ifdef CONFIG_X86_MCE_INTEL
+        /* Recheck CMCI information after local APIC is up on CPU #0 */
+        if (smp_processor_id() == 0)
+                cmci_recheck();
+#endif
 }
 void __cpuinit end_local_APIC_setup(void)
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 4b1c319d30c3..22590cf688ae 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
        if (!data)
                return -ENOMEM;
-        data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
+        data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
        per_cpu(drv_data, cpu) = data;
        if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index d7d2323bbb69..b2f89829bbe8 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32)		+= k7.o p4.o p5.o p6.o winchip.o
 obj-$(CONFIG_X86_MCE_INTEL)     += mce_intel_64.o
 obj-$(CONFIG_X86_MCE_AMD)       += mce_amd_64.o
 obj-$(CONFIG_X86_MCE_NONFATAL)  += non-fatal.o
+obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index dfaebce3633e..3552119b091d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
        }
 }
-static unsigned long old_cr4 __initdata;
-void __init stop_mce(void)
-{
-        old_cr4 = read_cr4();
-        clear_in_cr4(X86_CR4_MCE);
-}
-void __init restart_mce(void)
-{
-        if (old_cr4 & X86_CR4_MCE)
-                set_in_cr4(X86_CR4_MCE);
-}
 static int __init mcheck_disable(char *str)
 {
        mce_disabled = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index fe79985ce0f2..bfbd5323a635 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -3,6 +3,8 @@
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
 */
 #include <linux/init.h>
@@ -24,6 +26,9 @@
 #include <linux/ctype.h>
 #include <linux/kmod.h>
 #include <linux/kdebug.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/ratelimit.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 #include <asm/mce.h>
@@ -32,7 +37,6 @@
 #include <asm/idle.h>
 #define MISC_MCELOG_MINOR 227
-#define NR_SYSFS_BANKS 6
 atomic_t mce_entry;
@@ -47,7 +51,7 @@ static int mce_dont_init;
 */
 static int tolerant = 1;
 static int banks;
-static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
+static u64 *bank;
 static unsigned long notify_user;
 static int rip_msr;
 static int mce_bootlog = -1;
@@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL };
 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+/* MCA banks polled by the periodic polling timer for corrected events */
+DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
+        [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
+};
+/*
+ * Do initial initialization of a struct mce: zero the whole record, then
+ * fill in the current CPU number (smp_processor_id()) and a TSC reading
+ * (rdtscll()).
+ */
+void mce_setup(struct mce *m)
+{
+        memset(m, 0, sizeof(struct mce));
+        m->cpu = smp_processor_id();
+        rdtscll(m->tsc);
+}
 /*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
@@ -119,11 +136,11 @@ static void print_mce(struct mce *m)
                        print_symbol("{%s}", m->ip);
                printk("\n");
        }
-        printk(KERN_EMERG "TSC %Lx ", m->tsc);
+        printk(KERN_EMERG "TSC %llx ", m->tsc);
        if (m->addr)
-                printk("ADDR %Lx ", m->addr);
+                printk("ADDR %llx ", m->addr);
        if (m->misc)
-                printk("MISC %Lx ", m->misc);
+                printk("MISC %llx ", m->misc);
        printk("\n");
        printk(KERN_EMERG "This is not a software problem!\n");
        printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
        panic(msg);
 }
-static int mce_available(struct cpuinfo_x86 *c)
+int mce_available(struct cpuinfo_x86 *c)
 {
+        if (mce_dont_init)
+                return 0;
        return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
@@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 }
 /*
- * The actual machine check handler
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * @flags: MCP_TIMESTAMP keeps the TSC value taken by mce_setup() in every
+ *         logged record (without it the time stamp is zeroed);
+ *         MCP_UC also logs uncorrected errors, which are skipped otherwise.
+ * @b:     bitmap of the banks to examine; banks whose bit is clear, or whose
+ *         bank[] entry is zero, are skipped.
+ *
+ * This is executed in standard interrupt context.
+ */
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+{
+        struct mce m;
+        int i;
+        /* Zeroes the record and fills in m.cpu and m.tsc. */
+        mce_setup(&m);
+        rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+        for (i = 0; i < banks; i++) {
+                if (!bank[i] || !test_bit(i, *b))
+                        continue;
+                /*
+                 * Reset the per-bank fields only. m.tsc must NOT be
+                 * cleared here, otherwise the MCP_TIMESTAMP test below
+                 * could never produce a time stamp.
+                 */
+                m.misc = 0;
+                m.addr = 0;
+                m.bank = i;
+                barrier();
+                rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
+                if (!(m.status & MCI_STATUS_VAL))
+                        continue;
+                /*
+                 * Uncorrected events are handled by the exception handler
+                 * when it is enabled. But when the exception is disabled log
+                 * everything.
+                 *
+                 * TBD do the same check for MCI_STATUS_EN here?
+                 */
+                if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
+                        continue;
+                if (m.status & MCI_STATUS_MISCV)
+                        rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
+                if (m.status & MCI_STATUS_ADDRV)
+                        rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
+                /*
+                 * m.tsc still holds the value read by mce_setup();
+                 * drop it unless the caller asked for a time stamp.
+                 */
+                if (!(flags & MCP_TIMESTAMP))
+                        m.tsc = 0;
+                /*
+                 * Don't get the IP here because it's unlikely to
+                 * have anything to do with the actual error location.
+                 */
+                mce_log(&m);
+                add_taint(TAINT_MACHINE_CHECK);
+                /*
+                 * Clear state for this bank.
+                 */
+                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+        }
+        /*
+         * Don't clear MCG_STATUS here because it's only defined for
+         * exceptions.
+         */
+}
+/*
+ * The actual machine check handler. This only handles real
+ * exceptions when something got corrupted coming in through int 18.
+ *
+ * This is executed in NMI context not subject to normal locking rules. This
+ * implies that most kernel services cannot be safely used. Don't even
+ * think about putting a printk in there!
 */
 void do_machine_check(struct pt_regs * regs, long error_code)
 {
@@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)
         * error.
         */
        int kill_it = 0;
+        DECLARE_BITMAP(toclear, MAX_NR_BANKS);
        atomic_inc(&mce_entry);
-        if ((regs
+        if (notify_die(DIE_NMI, "machine check", regs, error_code,
-             && notify_die(DIE_NMI, "machine check", regs, error_code,
                           18, SIGKILL) == NOTIFY_STOP)
-            || !banks)
+                goto out2;
+        if (!banks)
                goto out2;
-        memset(&m, 0, sizeof(struct mce));
+        mce_setup(&m);
-        m.cpu = smp_processor_id();
        rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
        /* if the restart IP is not valid, we're done for */
        if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code)
        barrier();
        for (i = 0; i < banks; i++) {
-                if (i < NR_SYSFS_BANKS && !bank[i])
+                __clear_bit(i, toclear);
+                if (!bank[i])
                        continue;
                m.misc = 0;
                m.addr = 0;
                m.bank = i;
-                m.tsc = 0;
                rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
                if ((m.status & MCI_STATUS_VAL) == 0)
                        continue;
+                /*
+                 * Errors without MCI_STATUS_UC are handled by machine_check_poll.
+                 * Leave them alone.
+                 */
+                if ((m.status & MCI_STATUS_UC) == 0)
+                        continue;
+                /*
+                 * Set taint even when machine check was not enabled.
+                 */
+                add_taint(TAINT_MACHINE_CHECK);
+                __set_bit(i, toclear);
                if (m.status & MCI_STATUS_EN) {
                        /* if PCC was set, there's no way out */
                        no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
                                        no_way_out = 1;
                                kill_it = 1;
                        }
+                } else {
+                        /*
+                         * Machine check event was not enabled. Clear, but
+                         * ignore.
+                         */
+                        continue;
                }
                if (m.status & MCI_STATUS_MISCV)
@@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
                        rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
                mce_get_rip(&m, regs);
-                if (error_code >= 0)
+                mce_log(&m);
-                        rdtscll(m.tsc);
-                if (error_code != -2)
-                        mce_log(&m);
                /* Did this bank cause the exception? */
                /* Assume that the bank with uncorrectable errors did it,
@@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
                        panicm = m;
                        panicm_found = 1;
                }
-                add_taint(TAINT_MACHINE_CHECK);
        }
-        /* Never do anything final in the polling timer */
-        if (!regs)
-                goto out;
        /* If we didn't find an uncorrectable error, pick
           the last one (shouldn't happen, just being safe). */
        if (!panicm_found)
@@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
        /* notify userspace ASAP */
        set_thread_flag(TIF_MCE_NOTIFY);
- out:
        /* the last thing we do is clear state */
-        for (i = 0; i < banks; i++)
+        for (i = 0; i < banks; i++) {
-                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+                if (test_bit(i, toclear))
+                        wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+        }
        wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
        atomic_dec(&mce_entry);
@@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
+void mce_log_therm_throt_event(__u64 status)
 {
        struct mce m;
-        memset(&m, 0, sizeof(m));
+        mce_setup(&m);
-        m.cpu = cpu;
        m.bank = MCE_THERMAL_BANK;
        m.status = status;
-        rdtscll(m.tsc);
        mce_log(&m);
 }
 #endif /* CONFIG_X86_MCE_INTEL */
@@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
 static int check_interval = 5 * 60; /* 5 minutes */
 static int next_interval; /* in jiffies */
-static void mcheck_timer(struct work_struct *work);
+static void mcheck_timer(unsigned long);
-static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
-static void mcheck_check_cpu(void *info)
+static void mcheck_timer(unsigned long data)
 {
-        if (mce_available(&current_cpu_data))
+        struct timer_list *t = &per_cpu(mce_timer, data);
-                do_machine_check(NULL, 0);
-}
-static void mcheck_timer(struct work_struct *work)
+        WARN_ON(smp_processor_id() != data);
-{
-        on_each_cpu(mcheck_check_cpu, NULL, 1);
+        if (mce_available(&current_cpu_data))
+                machine_check_poll(MCP_TIMESTAMP,
+                                &__get_cpu_var(mce_poll_banks));
        /*
         * Alert userspace if needed.  If we logged an MCE, reduce the
@@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work)
                                (int)round_jiffies_relative(check_interval*HZ));
        }
-        schedule_delayed_work(&mcheck_work, next_interval);
+        t->expires = jiffies + next_interval;
+        add_timer(t);
+}
+/*
+ * Work item handler: launch the program named by `trigger` (with
+ * trigger_argv) through call_usermodehelper() using UMH_NO_WAIT.
+ * The work argument itself is not used.
+ */
+static void mce_do_trigger(struct work_struct *work)
+{
+        call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
 }
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
 /*
- * This is only called from process context.  This is where we do
+ * Notify the user(s) about new machine check events.
- * anything we need to alert userspace about new MCEs.  This is called
+ * Can be called from interrupt context, but not from machine check/NMI
- * directly from the poller and also from entry.S and idle, thanks to
+ * context.
- * TIF_MCE_NOTIFY.
 */
 int mce_notify_user(void)
 {
+        /* Not more than two messages every minute */
+        static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
        clear_thread_flag(TIF_MCE_NOTIFY);
        if (test_and_clear_bit(0, &notify_user)) {
-                static unsigned long last_print;
-                unsigned long now = jiffies;
                wake_up_interruptible(&mce_wait);
-                if (trigger[0])
-                        call_usermodehelper(trigger, trigger_argv, NULL,
-                                                UMH_NO_WAIT);
-                if (time_after_eq(now, last_print + (check_interval*HZ))) {
+                /*
-                        last_print = now;
+                 * There is no risk of missing notifications because
+                 * work_pending is always cleared before the function is
+                 * executed.
+                 */
+                if (trigger[0] && !work_pending(&mce_trigger_work))
+                        schedule_work(&mce_trigger_work);
+                if (__ratelimit(&ratelimit))
                        printk(KERN_INFO "Machine check events logged\n");
-                }
                return 1;
        }
@@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = {
 static __init int periodic_mcheck_init(void)
 {
-        next_interval = check_interval * HZ;
+       idle_notifier_register(&mce_idle_notifier);
-        if (next_interval)
+       return 0;
-                schedule_delayed_work(&mcheck_work,
-                                      round_jiffies_relative(next_interval));
-        idle_notifier_register(&mce_idle_notifier);
-        return 0;
 }
 __initcall(periodic_mcheck_init);
 /*
 * Initialize Machine Checks for a CPU.
 */
-static void mce_init(void *dummy)
+static int mce_cap_init(void)
 {
        u64 cap;
-        int i;
+        unsigned b;
        rdmsrl(MSR_IA32_MCG_CAP, cap);
-        banks = cap & 0xff;
+        b = cap & 0xff;
-        if (banks > MCE_EXTENDED_BANK) {
+        if (b > MAX_NR_BANKS) {
-                banks = MCE_EXTENDED_BANK;
+                printk(KERN_WARNING
-                printk(KERN_INFO "MCE: warning: using only %d banks\n",
+                       "MCE: Using only %u machine check banks out of %u\n",
-                       MCE_EXTENDED_BANK);
+                        MAX_NR_BANKS, b);
+                b = MAX_NR_BANKS;
        }
+        /* Don't support asymmetric configurations today */
+        WARN_ON(banks != 0 && b != banks);
+        banks = b;
+        if (!bank) {
+                bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
+                if (!bank)
+                        return -ENOMEM;
+                memset(bank, 0xff, banks * sizeof(u64));
+        }
        /* Use accurate RIP reporting if available. */
        if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
                rip_msr = MSR_IA32_MCG_EIP;
-        /* Log the machine checks left over from the previous reset.
+        return 0;
-           This also clears all registers */
+}
-        do_machine_check(NULL, mce_bootlog ? -1 : -2);
+static void mce_init(void *dummy)
+{
+        u64 cap;
+        int i;
+        mce_banks_t all_banks;
+        /*
+         * Log the machine checks left over from the previous reset.
+         */
+        bitmap_fill(all_banks, MAX_NR_BANKS);
+        machine_check_poll(MCP_UC, &all_banks);
        set_in_cr4(X86_CR4_MCE);
+        rdmsrl(MSR_IA32_MCG_CAP, cap);
        if (cap & MCG_CTL_P)
                wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
        for (i = 0; i < banks; i++) {
-                if (i < NR_SYSFS_BANKS)
+                wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
-                        wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
-                else
-                        wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }
 }
 /* Add per CPU specific workarounds here */
-static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
+static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 {
        /* This should be disabled by the BIOS, but isn't always */
        if (c->x86_vendor == X86_VENDOR_AMD) {
-                if(c->x86 == 15)
+                if (c->x86 == 15 && banks > 4)
                        /* disable GART TBL walk error reporting, which trips off
                           incorrectly with the IOMMU & 3ware & Cerberus. */
-                        clear_bit(10, &bank[4]);
+                        clear_bit(10, (unsigned long *)&bank[4]);
                if(c->x86 <= 17 && mce_bootlog < 0)
                        /* Lots of broken BIOS around that don't clear them
                           by default and leave crap in there. Don't log. */
@@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
        }
 }
+/*
+ * Set up and start the machine check polling timer of the current CPU.
+ *
+ * next_interval is lazily derived from check_interval.  When the
+ * resulting interval is zero, polling is off and the timer is neither
+ * set up nor added.
+ */
+static void mce_init_timer(void)
+{
+        struct timer_list *t = &__get_cpu_var(mce_timer);
+        /* data race harmless because everyone sets to the same value */
+        if (!next_interval)
+                next_interval = check_interval * HZ;
+        if (!next_interval)
+                return;
+        setup_timer(t, mcheck_timer, smp_processor_id());
+        /*
+         * ->expires is an absolute jiffies value, so it has to be rounded
+         * with round_jiffies(); round_jiffies_relative() expects a delta
+         * and must not be fed "jiffies + interval".
+         */
+        t->expires = round_jiffies(jiffies + next_interval);
+        add_timer(t);
+}
 /*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 {
-        mce_cpu_quirks(c);
+        if (!mce_available(c))
+                return;
-        if (mce_dont_init ||
+        if (mce_cap_init() < 0) {
-            !mce_available(c))
+                mce_dont_init = 1;
                return;
+        }
+        mce_cpu_quirks(c);
        mce_init(NULL);
        mce_cpu_features(c);
+        mce_init_timer();
 }
 /*
@@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 {
        unsigned long *cpu_tsc;
        static DEFINE_MUTEX(mce_read_mutex);
-        unsigned next;
+        unsigned prev, next;
        char __user *buf = ubuf;
        int i, err;
@@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
        }
        err = 0;
-        for (i = 0; i < next; i++) {
+        prev = 0;
-                unsigned long start = jiffies;
+        do {
+                for (i = prev; i < next; i++) {
-                while (!mcelog.entry[i].finished) {
+                        unsigned long start = jiffies;
-                        if (time_after_eq(jiffies, start + 2)) {
-                                memset(mcelog.entry + i,0, sizeof(struct mce));
+                        while (!mcelog.entry[i].finished) {
-                                goto timeout;
+                                if (time_after_eq(jiffies, start + 2)) {
+                                        memset(mcelog.entry + i, 0,
+                                               sizeof(struct mce));
+                                        goto timeout;
+                                }
+                                cpu_relax();
                        }
-                        cpu_relax();
+                        smp_rmb();
+                        err |= copy_to_user(buf, mcelog.entry + i,
+                                            sizeof(struct mce));
+                        buf += sizeof(struct mce);
+timeout:
+                        ;
                }
-                smp_rmb();
-                err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
-                buf += sizeof(struct mce);
- timeout:
-                ;
-        }
-        memset(mcelog.entry, 0, next * sizeof(struct mce));
+                memset(mcelog.entry + prev, 0,
-        mcelog.next = 0;
+                       (next - prev) * sizeof(struct mce));
+                prev = next;
+                next = cmpxchg(&mcelog.next, prev, 0);
+        } while (next != prev);
        synchronize_sched();
@@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = {
        &mce_chrdev_ops,
 };
-static unsigned long old_cr4 __initdata;
-void __init stop_mce(void)
-{
-        old_cr4 = read_cr4();
-        clear_in_cr4(X86_CR4_MCE);
-}
-void __init restart_mce(void)
-{
-        if (old_cr4 & X86_CR4_MCE)
-                set_in_cr4(X86_CR4_MCE);
-}
 /*
 * Old style boot options parsing. Only for compatibility.
 */
@@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str)
        return 1;
 }
-/* mce=off disables machine check. Note you can re-enable it later
+/* mce=off disables machine check.
-   using sysfs.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
@@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable);
 * Sysfs support
 */
+/*
+ * Switch machine check reporting off by writing 0 to the control MSR of
+ * every bank.  Used on suspend and shutdown, since we can't really handle
+ * machine checks after that point.  Always returns 0.
+ */
+static int mce_disable(void)
+{
+        int nr = 0;
+        while (nr < banks) {
+                wrmsrl(MSR_IA32_MC0_CTL + nr*4, 0);
+                nr++;
+        }
+        return 0;
+}
+/*
+ * Suspend hook of mce_sysclass: disables all machine check banks via
+ * mce_disable().  Neither dev nor state is used.
+ */
+static int mce_suspend(struct sys_device *dev, pm_message_t state)
+{
+        return mce_disable();
+}
+/*
+ * Shutdown hook of mce_sysclass: disables all machine check banks via
+ * mce_disable().  dev is not used.
+ */
+static int mce_shutdown(struct sys_device *dev)
+{
+        return mce_disable();
+}
 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
@@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev)
        return 0;
 }
+/*
+ * Per-CPU half of mce_restart(), run on each CPU: stop the local polling
+ * timer, reprogram the machine check banks when this CPU supports MCE,
+ * then let mce_init_timer() start the timer again.  data is unused.
+ */
+static void mce_cpu_restart(void *data)
+{
+        struct timer_list *timer = &__get_cpu_var(mce_timer);
+        del_timer_sync(timer);
+        if (mce_available(&current_cpu_data)) {
+                mce_init(NULL);
+        }
+        mce_init_timer();
+}
 /* Reinit MCEs after user configuration changes */
 static void mce_restart(void)
 {
-        if (next_interval)
-                cancel_delayed_work(&mcheck_work);
-        /* Timer race is harmless here */
-        on_each_cpu(mce_init, NULL, 1);
        next_interval = check_interval * HZ;
-        if (next_interval)
+        on_each_cpu(mce_cpu_restart, NULL, 1);
-                schedule_delayed_work(&mcheck_work,
-                                      round_jiffies_relative(next_interval));
 }
 static struct sysdev_class mce_sysclass = {
+        .suspend = mce_suspend,
+        .shutdown = mce_shutdown,
        .resume = mce_resume,
        .name = "machinecheck",
 };
@@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
        }                                                               \
        static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
-/*
+static struct sysdev_attribute *bank_attrs;
- * TBD should generate these dynamically based on number of available banks.
- * Have only 6 contol banks in /sysfs until then.
+static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
- */
+                         char *buf)
-ACCESSOR(bank0ctl,bank[0],mce_restart())
+{
-ACCESSOR(bank1ctl,bank[1],mce_restart())
+        u64 b = bank[attr - bank_attrs];
-ACCESSOR(bank2ctl,bank[2],mce_restart())
+        return sprintf(buf, "%llx\n", b);
-ACCESSOR(bank3ctl,bank[3],mce_restart())
+}
-ACCESSOR(bank4ctl,bank[4],mce_restart())
-ACCESSOR(bank5ctl,bank[5],mce_restart())
+/*
+ * sysfs store handler for a per-bank control attribute.
+ *
+ * Parses a number from buf (base auto-detected), stores it in the bank[]
+ * slot selected by the attribute's position inside bank_attrs and
+ * re-initializes machine checks through mce_restart().
+ *
+ * Returns the number of characters consumed by the parser, or -EINVAL
+ * when buf does not start with a number.
+ */
+static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+                        const char *buf, size_t siz)
+{
+        char *stop;
+        u64 val = simple_strtoull(buf, &stop, 0);
+        if (stop == buf)
+                return -EINVAL;
+        bank[attr - bank_attrs] = val;
+        mce_restart();
+        return stop - buf;
+}
 static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
                                char *buf)
@@ -814,8 +986,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
 ACCESSOR(check_interval,check_interval,mce_restart())
 static struct sysdev_attribute *mce_attributes[] = {
-        &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
-        &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
        &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
        NULL
 };
@@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
                if (err)
                        goto error;
        }
+        for (i = 0; i < banks; i++) {
+                err = sysdev_create_file(&per_cpu(device_mce, cpu),
+                                        &bank_attrs[i]);
+                if (err)
+                        goto error2;
+        }
        cpu_set(cpu, mce_device_initialized);
        return 0;
+error2:
+        while (--i >= 0) {
+                sysdev_remove_file(&per_cpu(device_mce, cpu),
+                                        &bank_attrs[i]);
+        }
 error:
-        while (i--) {
+        while (--i >= 0) {
                sysdev_remove_file(&per_cpu(device_mce,cpu),
                                   mce_attributes[i]);
        }
@@ -868,15 +1049,46 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
        for (i = 0; mce_attributes[i]; i++)
                sysdev_remove_file(&per_cpu(device_mce,cpu),
                        mce_attributes[i]);
+        for (i = 0; i < banks; i++)
+                sysdev_remove_file(&per_cpu(device_mce, cpu),
+                        &bank_attrs[i]);
        sysdev_unregister(&per_cpu(device_mce,cpu));
        cpu_clear(cpu, mce_device_initialized);
 }
+/*
+ * Make sure there are no machine checks on offlined CPUs.
+ *
+ * Runs on the CPU going down; h points to the hotplug action code.
+ * CMCI ownership is given up unless the action carries CPU_TASKS_FROZEN,
+ * then the control MSR of every bank is cleared.
+ */
+static void mce_disable_cpu(void *h)
+{
+        const unsigned long hp_action = *(unsigned long *)h;
+        int nr;
+        if (!mce_available(&current_cpu_data))
+                return;
+        if ((hp_action & CPU_TASKS_FROZEN) == 0)
+                cmci_clear();
+        for (nr = 0; nr < banks; nr++)
+                wrmsrl(MSR_IA32_MC0_CTL + nr*4, 0);
+}
+/*
+ * Undo mce_disable_cpu() on the current CPU; h points to the hotplug
+ * action code.  CMCI is rediscovered unless the action carries
+ * CPU_TASKS_FROZEN, then every bank's control MSR is reloaded from bank[].
+ */
+static void mce_reenable_cpu(void *h)
+{
+        const unsigned long hp_action = *(unsigned long *)h;
+        int nr;
+        if (!mce_available(&current_cpu_data))
+                return;
+        if ((hp_action & CPU_TASKS_FROZEN) == 0)
+                cmci_reenable();
+        for (nr = 0; nr < banks; nr++)
+                wrmsrl(MSR_IA32_MC0_CTL + nr*4, bank[nr]);
+}
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
 static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
                                      unsigned long action, void *hcpu)
 {
        unsigned int cpu = (unsigned long)hcpu;
+        struct timer_list *t = &per_cpu(mce_timer, cpu);
        switch (action) {
        case CPU_ONLINE:
@@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
                        threshold_cpu_callback(action, cpu);
                mce_remove_device(cpu);
                break;
+        case CPU_DOWN_PREPARE:
+        case CPU_DOWN_PREPARE_FROZEN:
+                del_timer_sync(t);
+                smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+                break;
+        case CPU_DOWN_FAILED:
+        case CPU_DOWN_FAILED_FROZEN:
+                /*
+                 * The CPU stays online after all: restart its polling timer
+                 * and re-enable its banks.  Only re-arm when polling is
+                 * enabled; with next_interval == 0 mce_init_timer() returns
+                 * before setup_timer().  ->expires is absolute, hence
+                 * round_jiffies() rather than round_jiffies_relative().
+                 */
+                if (next_interval) {
+                        t->expires = round_jiffies(jiffies + next_interval);
+                        add_timer_on(t, cpu);
+                }
+                smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+                break;
+        case CPU_POST_DEAD:
+                /* intentionally ignoring frozen here */
+                cmci_rediscover(cpu);
+                break;
        }
        return NOTIFY_OK;
 }
@@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
        .notifier_call = mce_cpu_callback,
 };
+/*
+ * Allocate and fill the per-bank sysdev attributes (bank_attrs).
+ *
+ * One attribute named "bank<N>" with mode 0644 is created per machine
+ * check bank, wired to show_bank()/set_bank().
+ *
+ * Returns 0 on success or -ENOMEM; on failure every name allocated so far
+ * and the array itself are freed and bank_attrs is reset to NULL.
+ */
+static __init int mce_init_banks(void)
+{
+        int i;
+        /* kcalloc() zeroes the array and checks the size multiplication */
+        bank_attrs = kcalloc(banks, sizeof(*bank_attrs), GFP_KERNEL);
+        if (!bank_attrs)
+                return -ENOMEM;
+        for (i = 0; i < banks; i++) {
+                struct sysdev_attribute *a = &bank_attrs[i];
+                a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
+                if (!a->attr.name)
+                        goto nomem;
+                a->attr.mode = 0644;
+                a->show = show_bank;
+                a->store = set_bank;
+        }
+        return 0;
+nomem:
+        /* entry i has no name; release the names of entries 0..i-1 */
+        while (--i >= 0)
+                kfree(bank_attrs[i].attr.name);
+        kfree(bank_attrs);
+        bank_attrs = NULL;
+        return -ENOMEM;
+}
 static __init int mce_init_device(void)
 {
        int err;
@@ -906,6 +1161,11 @@ static __init int mce_init_device(void)
        if (!mce_available(&boot_cpu_data))
                return -EIO;
+        err = mce_init_banks();
+        if (err)
+                return err;
        err = sysdev_class_register(&mce_sysclass);
        if (err)
                return err;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 9817506dd469..c5a32f92d07e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {
 static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
+static void amd_threshold_interrupt(void);
 /*
 * CPU Initialization
 */
@@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
                        tr.reset = 0;
                        tr.old_limit = 0;
                        threshold_restart_bank(&tr);
+                        mce_threshold_vector = amd_threshold_interrupt;
                }
        }
 }
@@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 * the interrupt goes off when error_count reaches threshold_limit.
 * the handler will simply log mcelog w/ software defined bank number.
 */
-asmlinkage void mce_threshold_interrupt(void)
+static void amd_threshold_interrupt(void)
 {
        unsigned int bank, block;
        struct mce m;
        u32 low = 0, high = 0, address = 0;
-        ack_APIC_irq();
+        mce_setup(&m);
-        exit_idle();
-        irq_enter();
-        memset(&m, 0, sizeof(m));
-        rdtscll(m.tsc);
-        m.cpu = smp_processor_id();
        /* assume first bank caused it */
        for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void)
                        /* Log the machine check that caused the threshold
                           event. */
-                        do_machine_check(NULL, 0);
+                        machine_check_poll(MCP_TIMESTAMP,
+                                        &__get_cpu_var(mce_poll_banks));
                        if (high & MASK_OVERFLOW_HI) {
                                rdmsrl(address, m.misc);
@@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void)
                                       + bank * NR_BLOCKS
                                       + block;
                                mce_log(&m);
-                                goto out;
+                                return;
                        }
                }
        }
-out:
-        inc_irq_stat(irq_threshold_count);
-        irq_exit();
 }
 /*
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index aa5e287c98e0..aaa7d9730938 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -1,6 +1,8 @@
 /*
 * Intel specific MCE features.
 * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Author: Andi Kleen
 */
 #include <linux/init.h>
@@ -13,6 +15,7 @@
 #include <asm/hw_irq.h>
 #include <asm/idle.h>
 #include <asm/therm_throt.h>
+#include <asm/apic.h>
 asmlinkage void smp_thermal_interrupt(void)
 {
@@ -25,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void)
        rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
        if (therm_throt_process(msr_val & 1))
-                mce_log_therm_throt_event(smp_processor_id(), msr_val);
+                mce_log_therm_throt_event(msr_val);
        inc_irq_stat(irq_thermal_count);
        irq_exit();
@@ -85,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
        return;
 }
+/*
+ * Support for Intel Corrected Machine Check Interrupts. This allows
+ * the CPU to raise an interrupt when a corrected machine check happened.
+ * Normally we pick those up using a regular polling timer.
+ * Also supports reliable discovery of shared banks.
+ */
+static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
+/*
+ * cmci_discover_lock protects against parallel discovery attempts
+ * which could race against each other.
+ */
+static DEFINE_SPINLOCK(cmci_discover_lock);
+#define CMCI_THRESHOLD 1
+/*
+ * Tell whether CMCI can be used on this CPU.
+ *
+ * Returns 0 without touching *banks when the boot CPU is not Intel, there
+ * is no APIC, or the local APIC reports a max LVT below 6.  Otherwise
+ * *banks receives the bank count from MCG_CAP (capped at MAX_NR_BANKS)
+ * and the result is 1 if MCG_CMCI_P is set, 0 if not.
+ */
+static int cmci_supported(int *banks)
+{
+        u64 mcg_cap;
+        /*
+         * Vendor check is not strictly needed, but the initial
+         * initialization is vendor keyed and this
+         * makes sure none of the backdoors are entered otherwise.
+         */
+        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+            !cpu_has_apic || lapic_get_maxlvt() < 6)
+                return 0;
+        rdmsrl(MSR_IA32_MCG_CAP, mcg_cap);
+        *banks = min_t(unsigned, MAX_NR_BANKS, mcg_cap & 0xff);
+        return (mcg_cap & MCG_CMCI_P) ? 1 : 0;
+}
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ *
+ * Installed as mce_threshold_vector by intel_init_cmci().  Only the
+ * banks in this CPU's mce_banks_owned set are polled; afterwards
+ * mce_notify_user() is called.
+ */
+static void intel_threshold_interrupt(void)
+{
+        machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+        mce_notify_user();
+}
+/*
+ * Append " <type>:<num>" to the per-CPU bank summary line.
+ *
+ * @type: short tag for the bank state (callers pass string literals)
+ * @hdr:  header flag; when it is 0 the "CPU <n> MCA banks" prefix is
+ *        printed first.  It is set to 1 on every call so the caller can
+ *        tell that a line was started and needs a terminating newline.
+ * @num:  bank number
+ */
+static void print_update(const char *type, int *hdr, int num)
+{
+        if (*hdr == 0)
+                printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
+        *hdr = 1;
+        printk(KERN_CONT " %s:%d", type, num);
+}
+/*
+ * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
+ * on this CPU. Use the algorithm recommended in the SDM to discover shared
+ * banks.
+ *
+ * @banks: number of banks to probe
+ * @boot:  non-zero to report every bank's state, not only changes
+ *
+ * The whole probe runs under cmci_discover_lock.  Banks that end up with
+ * CMCI enabled, by this CPU or already by another one, are removed from
+ * this CPU's mce_poll_banks set.
+ */
+static void cmci_discover(int banks, int boot)
+{
+        unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
+        int hdr = 0;
+        int i;
+        spin_lock(&cmci_discover_lock);
+        for (i = 0; i < banks; i++) {
+                u64 val;
+                /* Banks this CPU already owns need no further probing */
+                if (test_bit(i, owned))
+                        continue;
+                rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+                /* Already owned by someone else? */
+                if (val & CMCI_EN) {
+                        /*
+                         * NOTE(review): owned bits were skipped above, so
+                         * test_and_clear_bit() looks like it always returns
+                         * 0 here and only the boot case prints -- confirm.
+                         */
+                        if (test_and_clear_bit(i, owned) || boot)
+                                print_update("SHD", &hdr, i);
+                        __clear_bit(i, __get_cpu_var(mce_poll_banks));
+                        continue;
+                }
+                /* Try to claim the bank: set enable + threshold, read back */
+                val |= CMCI_EN | CMCI_THRESHOLD;
+                wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+                rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+                /* Did the enable bit stick? -- the bank supports CMCI */
+                if (val & CMCI_EN) {
+                        if (!test_and_set_bit(i, owned) || boot)
+                                print_update("CMCI", &hdr, i);
+                        __clear_bit(i, __get_cpu_var(mce_poll_banks));
+                } else {
+                        /* No CMCI on this bank: it must still be polled */
+                        WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
+                }
+        }
+        spin_unlock(&cmci_discover_lock);
+        /* Terminate the summary line started by print_update(), if any */
+        if (hdr)
+                printk(KERN_CONT "\n");
+}
+/*
+ * Poll every CMCI bank owned by this CPU once, with local interrupts
+ * disabled, in case an event was missed during initialization.
+ * Does nothing when MCE is unavailable or CMCI is unsupported.
+ */
+void cmci_recheck(void)
+{
+        unsigned long irqflags;
+        int nbanks;
+        if (!mce_available(&current_cpu_data))
+                return;
+        if (!cmci_supported(&nbanks))
+                return;
+        local_irq_save(irqflags);
+        machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+        local_irq_restore(irqflags);
+}
+/*
+ * Give up every CMCI bank this CPU owns, e.g. when it goes down, so that
+ * other CPUs can claim those banks on rediscovery.  For each owned bank
+ * the enable bit and threshold field in its CTL2 MSR are cleared and the
+ * bank is dropped from mce_banks_owned.  Runs under cmci_discover_lock.
+ */
+void cmci_clear(void)
+{
+        int nbanks;
+        int nr;
+        if (!cmci_supported(&nbanks))
+                return;
+        spin_lock(&cmci_discover_lock);
+        for (nr = 0; nr < nbanks; nr++) {
+                u64 ctl2;
+                if (test_bit(nr, __get_cpu_var(mce_banks_owned))) {
+                        /* Disable CMCI */
+                        rdmsrl(MSR_IA32_MC0_CTL2 + nr, ctl2);
+                        ctl2 &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
+                        wrmsrl(MSR_IA32_MC0_CTL2 + nr, ctl2);
+                        __clear_bit(nr, __get_cpu_var(mce_banks_owned));
+                }
+        }
+        spin_unlock(&cmci_discover_lock);
+}
+/*
+ * After a CPU went down, cycle through all the other online CPUs and
+ * rerun CMCI discovery on each of them.
+ * Must run in process context.
+ *
+ * @dying: number of the CPU that went away; it is skipped.
+ *
+ * The current task is migrated to each CPU in turn and its original
+ * affinity mask is restored at the end.  Silently does nothing when CMCI
+ * is unsupported or the temporary cpumask cannot be allocated.
+ */
+void cmci_rediscover(int dying)
+{
+        int banks;
+        int cpu;
+        cpumask_var_t old;
+        if (!cmci_supported(&banks))
+                return;
+        if (!alloc_cpumask_var(&old, GFP_KERNEL))
+                return;
+        /* Remember the caller's affinity so it can be restored below */
+        cpumask_copy(old, &current->cpus_allowed);
+        for_each_online_cpu (cpu) {
+                if (cpu == dying)
+                        continue;
+                /* Skip CPUs we cannot move to */
+                if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)))
+                        continue;
+                /* Recheck banks in case CPUs don't all have the same */
+                if (cmci_supported(&banks))
+                        cmci_discover(banks, 0);
+        }
+        set_cpus_allowed_ptr(current, old);
+        free_cpumask_var(old);
+}
+/*
+ * Rerun CMCI discovery on this CPU (non-boot mode), used when taking a
+ * CPU down failed.  No-op when CMCI is unsupported.
+ */
+void cmci_reenable(void)
+{
+        int nbanks;
+        if (!cmci_supported(&nbanks))
+                return;
+        cmci_discover(nbanks, 0);
+}
+/*
+ * Set up CMCI on the current CPU: install the Intel threshold handler,
+ * discover which banks this CPU owns (boot mode, so all are reported),
+ * program the CMCI LVT entry and poll the owned banks once.
+ * Does nothing when CMCI is unsupported.
+ */
+static __cpuinit void intel_init_cmci(void)
+{
+        int banks;
+        if (!cmci_supported(&banks))
+                return;
+        mce_threshold_vector = intel_threshold_interrupt;
+        cmci_discover(banks, 1);
+        /*
+         * For CPU #0 this runs with still disabled APIC, but that's
+         * ok because only the vector is set up. We still do another
+         * check for the banks later for CPU #0 just to make sure
+         * to not miss any events.
+         */
+        apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
+        cmci_recheck();
+}
 void mce_intel_feature_init(struct cpuinfo_x86 *c)
 {
        intel_init_thermal(c);
+        intel_init_cmci();
 }
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
new file mode 100644
index 000000000000..23ee9e730f78
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -0,0 +1,29 @@
+/*
+ * Common corrected MCE threshold handler code:
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <asm/irq_vectors.h>
+#include <asm/apic.h>
+#include <asm/idle.h>
+#include <asm/mce.h>
+/*
+ * Fallback threshold handler: initial value of mce_threshold_vector.
+ * It only logs that a threshold interrupt arrived unexpectedly.
+ */
+static void default_threshold_interrupt(void)
+{
+        printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
+                         THRESHOLD_APIC_VECTOR);
+}
+void (*mce_threshold_vector)(void) = default_threshold_interrupt;
+/*
+ * Common entry point for the threshold interrupt.  Does the generic
+ * interrupt bookkeeping (idle exit, irq_enter/irq_exit, statistics) and
+ * dispatches to whatever handler is installed in mce_threshold_vector.
+ */
+asmlinkage void mce_threshold_interrupt(void)
+{
+        exit_idle();
+        irq_enter();
+        inc_irq_stat(irq_threshold_count);
+        mce_threshold_vector();
+        irq_exit();
+        /* Ack only at the end to avoid potential reentry */
+        ack_APIC_irq();
+}
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index b0f61f0dcd0a..f2f8540a7f3d 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -136,7 +136,7 @@ int init_fpu(struct task_struct *tsk)
 #ifdef CONFIG_X86_32
        if (!HAVE_HWFP) {
                memset(tsk->thread.xstate, 0, xstate_size);
-                finit();
+                finit_task(tsk);
                set_stopped_child_used_math(tsk);
                return 0;
        }
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 9dc6b2b24275..3b09634a5153 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -16,6 +16,7 @@
 #include <linux/cpu.h>
 #include <linux/delay.h>
 #include <linux/uaccess.h>
+#include <linux/percpu.h>
 #include <asm/apic.h>
@@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { }
 union irq_ctx {
        struct thread_info      tinfo;
        u32                     stack[THREAD_SIZE/sizeof(u32)];
-};
+} __attribute__((aligned(PAGE_SIZE)));
-static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
+static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
-static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
+static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
-static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
+static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
-static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
+static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
 static void call_on_stack(void *func, void *stack)
 {
@@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
        u32 *isp, arg1, arg2;
        curctx = (union irq_ctx *) current_thread_info();
-        irqctx = hardirq_ctx[smp_processor_id()];
+        irqctx = __get_cpu_var(hardirq_ctx);
        /*
         * this is where we switch to the IRQ stack. However, if we are
@@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu)
 {
        union irq_ctx *irqctx;
-        if (hardirq_ctx[cpu])
+        if (per_cpu(hardirq_ctx, cpu))
                return;
-        irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
+        irqctx = &per_cpu(hardirq_stack, cpu);
        irqctx->tinfo.task              = NULL;
        irqctx->tinfo.exec_domain       = NULL;
        irqctx->tinfo.cpu               = cpu;
        irqctx->tinfo.preempt_count     = HARDIRQ_OFFSET;
        irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
-        hardirq_ctx[cpu] = irqctx;
+        per_cpu(hardirq_ctx, cpu) = irqctx;
-        irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE];
+        irqctx = &per_cpu(softirq_stack, cpu);
        irqctx->tinfo.task              = NULL;
        irqctx->tinfo.exec_domain       = NULL;
        irqctx->tinfo.cpu               = cpu;
        irqctx->tinfo.preempt_count     = 0;
        irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
-        softirq_ctx[cpu] = irqctx;
+        per_cpu(softirq_ctx, cpu) = irqctx;
        printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
-               cpu, hardirq_ctx[cpu], softirq_ctx[cpu]);
+               cpu, per_cpu(hardirq_ctx, cpu),  per_cpu(softirq_ctx, cpu));
 }
 void irq_ctx_exit(int cpu)
 {
-        hardirq_ctx[cpu] = NULL;
+        per_cpu(hardirq_ctx, cpu) = NULL;
 }
 asmlinkage void do_softirq(void)
@@ -169,7 +170,7 @@ asmlinkage void do_softirq(void)
        if (local_softirq_pending()) {
                curctx = current_thread_info();
-                irqctx = softirq_ctx[smp_processor_id()];
+                irqctx = __get_cpu_var(softirq_ctx);
                irqctx->tinfo.task = curctx->task;
                irqctx->tinfo.previous_esp = current_stack_pointer;
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 1cc18d439bbb..2aef36d8aca2 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -216,6 +216,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
                        DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
                },
        },
+        {       /* Handle problems with rebooting on Dell XPS710 */
+                .callback = set_bios_reboot,
+                .ident = "Dell XPS710",
+                .matches = {
+                        DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+                        DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
+                },
+        },
        { }
 };
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index d992e6cff730..c29f301d3885 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -7,6 +7,7 @@
 #include <linux/crash_dump.h>
 #include <linux/smp.h>
 #include <linux/topology.h>
+#include <linux/pfn.h>
 #include <asm/sections.h>
 #include <asm/processor.h>
 #include <asm/setup.h>
@@ -41,6 +42,321 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
 };
 EXPORT_SYMBOL(__per_cpu_offset);
+/**
+ * pcpu_need_numa - determine whether percpu allocation needs to consider NUMA
+ *
+ * If NUMA is not configured or there is only one NUMA node available,
+ * there is no reason to consider NUMA.  This function determines
+ * whether percpu allocation should consider NUMA or not.
+ *
+ * RETURNS:
+ * true if NUMA should be considered; otherwise, false.
+ */
+static bool __init pcpu_need_numa(void)
+{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+        pg_data_t *last = NULL;
+        unsigned int cpu;
+        /*
+         * Walk the possible cpus while remembering the pgdat of the
+         * previously visited cpu in @last.  NUMA matters as soon as a
+         * cpu sits on an online node that has a pgdat which differs
+         * from a non-NULL @last.
+         */
+        for_each_possible_cpu(cpu) {
+                int node = early_cpu_to_node(cpu);
+                if (node_online(node) && NODE_DATA(node) &&
+                    last && last != NODE_DATA(node))
+                        return true;
+                last = NODE_DATA(node);
+        }
+#endif
+        /* !CONFIG_NEED_MULTIPLE_NODES or all cpus share one pgdat */
+        return false;
+}
+/**
+ * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
+ * @cpu: cpu to allocate for
+ * @size: size allocation in bytes
+ * @align: alignment
+ *
+ * Allocate @size bytes aligned at @align for cpu @cpu.  This wrapper
+ * does the right thing for NUMA regardless of the current
+ * configuration.
+ *
+ * RETURNS:
+ * Pointer to the allocated area on success, NULL on failure.
+ */
+static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
+                                        unsigned long align)
+{
+        /* same allocation goal the alloc_bootmem*() wrappers pass */
+        const unsigned long goal = __pa(MAX_DMA_ADDRESS);
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+        int node = early_cpu_to_node(cpu);
+        void *ptr;
+        if (!node_online(node) || !NODE_DATA(node)) {
+                /*
+                 * The cpu's node is offline or has no pgdat; fall
+                 * back to a node-agnostic allocation.
+                 */
+                ptr = __alloc_bootmem_nopanic(size, align, goal);
+                pr_info("cpu %d has no node %d or node-local memory\n",
+                        cpu, node);
+                pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
+                         cpu, size, __pa(ptr));
+        } else {
+                /* allocate on the cpu's own node */
+                ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
+                                                   size, align, goal);
+                pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
+                         "%016lx\n", cpu, size, node, __pa(ptr));
+        }
+        /* the debug messages above are printed even when @ptr is NULL */
+        return ptr;
+#else
+        return __alloc_bootmem_nopanic(size, align, goal);
+#endif
+}
+/*
+ * Remap allocator
+ *
+ * This allocator uses PMD page as unit.  A PMD page is allocated for
+ * each cpu and each is remapped into vmalloc area using PMD mapping.
+ * As PMD page is quite large, only part of it is used for the first
+ * chunk.  Unused part is returned to the bootmem allocator.
+ *
+ * So, the PMD pages are mapped twice - once to the physical mapping
+ * and to the vmalloc area for the first percpu chunk.  The double
+ * mapping does add one more PMD TLB entry pressure but still is much
+ * better than only using 4k mappings while still being NUMA friendly.
+ */
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+static size_t pcpur_size __initdata;
+static void **pcpur_ptrs __initdata;
+/*
+ * Page lookup callback handed to pcpu_setup_first_chunk() by the
+ * remap allocator.  Returns the page backing page number @pageno of
+ * @cpu's PMD area, or NULL once @pageno lies at or beyond pcpur_size.
+ */
+static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
+{
+        const size_t byte_off = (size_t)pageno << PAGE_SHIFT;
+        if (byte_off < pcpur_size)
+                return virt_to_page(pcpur_ptrs[cpu] + byte_off);
+        return NULL;
+}
+/**
+ * setup_pcpu_remap - set up the first percpu chunk with remapped PMD pages
+ * @static_size: size of the static percpu area in bytes
+ *
+ * Allocate one PMD-sized, PMD-aligned bootmem area per possible cpu,
+ * copy the static percpu data into it, give the part beyond
+ * pcpur_size back to bootmem and map the areas back to back into an
+ * early-registered vm area using large page mappings.  This is only
+ * attempted when PSE is available and NUMA needs to be considered.
+ *
+ * RETURNS:
+ * The value returned by pcpu_setup_first_chunk() on success, -EINVAL
+ * if this allocator is not applicable, -ENOMEM if a PMD page could
+ * not be allocated.
+ */
+static ssize_t __init setup_pcpu_remap(size_t static_size)
+{
+        static struct vm_struct vm;
+        size_t ptrs_size;
+        unsigned int cpu;
+        ssize_t ret;
+        /*
+         * If large page isn't supported, there's no benefit in doing
+         * this.  Also, on non-NUMA, embedding is better.
+         * pcpu_need_numa() already tells whether the possible cpus
+         * span more than one node, so no separate node walk is needed.
+         */
+        if (!cpu_has_pse || !pcpu_need_numa())
+                return -EINVAL;
+        /*
+         * Currently supports only single page.  Supporting multiple
+         * pages won't be too difficult if it ever becomes necessary.
+         */
+        pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
+        if (pcpur_size > PMD_SIZE) {
+                pr_warning("PERCPU: static data is larger than large page, "
+                           "can't use large page\n");
+                return -EINVAL;
+        }
+        /*
+         * allocate pointer array and alloc large pages
+         *
+         * NOTE(review): the array and the vm area below are sized by
+         * num_possible_cpus() but indexed by cpu id; this assumes
+         * possible cpu ids are contiguous from 0 - TODO confirm.
+         */
+        ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
+        pcpur_ptrs = alloc_bootmem(ptrs_size);
+        for_each_possible_cpu(cpu) {
+                pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
+                if (!pcpur_ptrs[cpu])
+                        goto enomem;
+                /*
+                 * Only use pcpur_size bytes and give back the rest.
+                 *
+                 * Ingo: The 2MB up-rounding bootmem is needed to make
+                 * sure the partial 2MB page is still fully RAM - it's
+                 * not well-specified to have a PAT-incompatible area
+                 * (unmapped RAM, device memory, etc.) in that hole.
+                 */
+                free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
+                             PMD_SIZE - pcpur_size);
+                memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
+        }
+        /* allocate address and map */
+        vm.flags = VM_ALLOC;
+        vm.size = num_possible_cpus() * PMD_SIZE;
+        vm_area_register_early(&vm, PMD_SIZE);
+        for_each_possible_cpu(cpu) {
+                pmd_t *pmd;
+                pmd = populate_extra_pmd((unsigned long)vm.addr
+                                         + cpu * PMD_SIZE);
+                set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
+                                     PAGE_KERNEL_LARGE));
+        }
+        /* we're ready, commit */
+        pr_info("PERCPU: Remapped at %p with large pages, static data "
+                "%zu bytes\n", vm.addr, static_size);
+        ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE,
+                                     pcpur_size - static_size, vm.addr, NULL);
+        goto out_free_ar;
+enomem:
+        /*
+         * The tail beyond pcpur_size of every successfully allocated
+         * area was already handed back to bootmem in the loop above,
+         * so only the first pcpur_size bytes are still ours to free.
+         * Freeing PMD_SIZE here would free the tail a second time.
+         */
+        for_each_possible_cpu(cpu)
+                if (pcpur_ptrs[cpu])
+                        free_bootmem(__pa(pcpur_ptrs[cpu]), pcpur_size);
+        ret = -ENOMEM;
+out_free_ar:
+        /* the pointer array is only needed during setup */
+        free_bootmem(__pa(pcpur_ptrs), ptrs_size);
+        return ret;
+}
+#else
+/*
+ * Without CONFIG_NEED_MULTIPLE_NODES the remap allocator is compiled
+ * out; always report -EINVAL so the caller tries the next allocator.
+ */
+static ssize_t __init setup_pcpu_remap(size_t static_size)
+{
+        return -EINVAL;
+}
+#endif
+/*
+ * Embedding allocator
+ *
+ * The first chunk is sized to just contain the static area plus
+ * PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using
+ * bootmem allocator and used as-is without being mapped into vmalloc
+ * area.  This enables the first chunk to piggy back on the linear
+ * physical PMD mapping and doesn't add any additional pressure to
+ * TLB.
+ */
+static void *pcpue_ptr __initdata;
+static size_t pcpue_unit_size __initdata;
+/*
+ * Page lookup callback handed to pcpu_setup_first_chunk() by the
+ * embedding allocator.  @cpu's unit starts @cpu * pcpue_unit_size
+ * bytes into the area at pcpue_ptr; @pageno selects the page within
+ * that unit.
+ */
+static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
+{
+        const size_t unit_off = cpu * pcpue_unit_size;
+        const size_t page_off = (size_t)pageno << PAGE_SHIFT;
+        return virt_to_page(pcpue_ptr + unit_off + page_off);
+}
+/**
+ * setup_pcpu_embed - set up the first percpu chunk embedded in bootmem
+ * @static_size: size of the static percpu area in bytes
+ *
+ * Allocate one contiguous bootmem area holding a unit for every
+ * possible cpu, copy the static percpu data into each unit and pass
+ * the area itself as the base address to pcpu_setup_first_chunk().
+ *
+ * RETURNS:
+ * The value returned by pcpu_setup_first_chunk() on success, -EINVAL
+ * if PSE is unavailable or NUMA needs to be considered, -ENOMEM if
+ * the bootmem allocation fails.
+ */
+static ssize_t __init setup_pcpu_embed(size_t static_size)
+{
+        unsigned int cpu;
+        /*
+         * If large page isn't supported, there's no benefit in doing
+         * this.  Also, embedding allocation doesn't play well with
+         * NUMA.
+         */
+        if (!cpu_has_pse || pcpu_need_numa())
+                return -EINVAL;
+        /* allocate and copy */
+        /* unit = static area + dynamic reserve, page aligned ... */
+        pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
+        /* ... but never smaller than PCPU_MIN_UNIT_SIZE */
+        pcpue_unit_size = max_t(size_t, pcpue_unit_size, PCPU_MIN_UNIT_SIZE);
+        /* a single area for all cpus, requested on behalf of cpu 0 */
+        pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
+                                       PAGE_SIZE);
+        if (!pcpue_ptr)
+                return -ENOMEM;
+        for_each_possible_cpu(cpu)
+                memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load,
+                       static_size);
+        /* we're ready, commit */
+        pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
+                pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size);
+        /* everything in a unit past the static area is passed as free space */
+        return pcpu_setup_first_chunk(pcpue_get_page, static_size,
+                                      pcpue_unit_size,
+                                      pcpue_unit_size - static_size, pcpue_ptr,
+                                      NULL);
+}
+/*
+ * 4k page allocator
+ *
+ * This is the basic allocator.  Static percpu area is allocated
+ * page-by-page and most of initialization is done by the generic
+ * setup function.
+ */
+static struct page **pcpu4k_pages __initdata;
+static int pcpu4k_nr_static_pages __initdata;
+/*
+ * Page lookup callback handed to pcpu_setup_first_chunk() by the 4k
+ * allocator.  pcpu4k_pages holds pcpu4k_nr_static_pages entries per
+ * cpu, laid out cpu after cpu; page numbers at or beyond that count
+ * have no backing page and yield NULL.
+ */
+static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
+{
+        if (pageno >= pcpu4k_nr_static_pages)
+                return NULL;
+        return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
+}
+/*
+ * Adapter passed to pcpu_setup_first_chunk() as its populate_pte
+ * callback: forwards @addr to populate_extra_pte() and discards the
+ * returned pte pointer.
+ */
+static void __init pcpu4k_populate_pte(unsigned long addr)
+{
+        populate_extra_pte(addr);
+}
+/**
+ * setup_pcpu_4k - set up the first percpu chunk page by page
+ * @static_size: size of the static percpu area in bytes
+ *
+ * Allocate PFN_UP(@static_size) individual pages for every possible
+ * cpu, fill them from __per_cpu_load, record them in pcpu4k_pages and
+ * let pcpu_setup_first_chunk() do the rest.  The page pointer array
+ * is only needed during setup and is freed before returning.
+ *
+ * RETURNS:
+ * The value returned by pcpu_setup_first_chunk() on success, -ENOMEM
+ * if a page could not be allocated.
+ */
+static ssize_t __init setup_pcpu_4k(size_t static_size)
+{
+        size_t pages_size;
+        unsigned int cpu;
+        int i, j;
+        ssize_t ret;
+        pcpu4k_nr_static_pages = PFN_UP(static_size);
+        /* unaligned allocations can't be freed, round up to page size */
+        pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
+                               * sizeof(pcpu4k_pages[0]));
+        pcpu4k_pages = alloc_bootmem(pages_size);
+        /* allocate and copy */
+        /* j counts the pages recorded so far; enomem unwinds exactly those */
+        j = 0;
+        for_each_possible_cpu(cpu)
+                for (i = 0; i < pcpu4k_nr_static_pages; i++) {
+                        void *ptr;
+                        ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
+                        if (!ptr)
+                                goto enomem;
+                        /*
+                         * NOTE(review): a whole page is copied even
+                         * for the last page, so the read may extend
+                         * past @static_size bytes of __per_cpu_load
+                         * when @static_size is not page aligned -
+                         * confirm this is intended.
+                         */
+                        memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
+                        pcpu4k_pages[j++] = virt_to_page(ptr);
+                }
+        /* we're ready, commit */
+        pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
+                pcpu4k_nr_static_pages, static_size);
+        ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL,
+                                     pcpu4k_populate_pte);
+        goto out_free_ar;
+enomem:
+        /* give back every page recorded before the failure */
+        while (--j >= 0)
+                free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
+        ret = -ENOMEM;
+out_free_ar:
+        /* reached on both success and failure */
+        free_bootmem(__pa(pcpu4k_pages), pages_size);
+        return ret;
+}
 static inline void setup_percpu_segment(int cpu)
 {
 #ifdef CONFIG_X86_32
@@ -61,38 +377,35 @@ static inline void setup_percpu_segment(int cpu)
 */
 void __init setup_per_cpu_areas(void)
 {
-        ssize_t size;
+        size_t static_size = __per_cpu_end - __per_cpu_start;
-        char *ptr;
+        unsigned int cpu;
-        int cpu;
+        unsigned long delta;
+        size_t pcpu_unit_size;
-        /* Copy section for each CPU (we discard the original) */
+        ssize_t ret;
-        size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
        pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
                NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
-        pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size);
+        /*
+         * Allocate percpu area.  If PSE is supported, try to make use
+         * of large page mappings.  Please read comments on top of
+         * each allocator for details.
+         */
+        ret = setup_pcpu_remap(static_size);
+        if (ret < 0)
+                ret = setup_pcpu_embed(static_size);
+        if (ret < 0)
+                ret = setup_pcpu_4k(static_size);
+        if (ret < 0)
+                panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
+                      static_size, ret);
-        for_each_possible_cpu(cpu) {
+        pcpu_unit_size = ret;
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-                ptr = alloc_bootmem_pages(size);
-#else
-                int node = early_cpu_to_node(cpu);
-                if (!node_online(node) || !NODE_DATA(node)) {
-                        ptr = alloc_bootmem_pages(size);
-                        pr_info("cpu %d has no node %d or node-local memory\n",
-                                cpu, node);
-                        pr_debug("per cpu data for cpu%d at %016lx\n",
-                                 cpu, __pa(ptr));
-                } else {
-                        ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
-                        pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
-                                cpu, node, __pa(ptr));
-                }
-#endif
-                memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
+        /* alrighty, percpu areas up and running */
-                per_cpu_offset(cpu) = ptr - __per_cpu_start;
+        delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
+        for_each_possible_cpu(cpu) {
+                per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
                per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
                per_cpu(cpu_number, cpu) = cpu;
                setup_percpu_segment(cpu);
@@ -125,8 +438,6 @@ void __init setup_per_cpu_areas(void)
                 */
                if (cpu == boot_cpu_id)
                        switch_to_new_gdt(cpu);
-                DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
        }
        /* indicate the early static arrays will soon be gone */
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index 491e737ce547..aa0987088774 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -30,20 +30,29 @@ static void fclex(void)
 }
 /* Needs to be externally visible */
-void finit(void)
+void finit_task(struct task_struct *tsk)
 {
-        control_word = 0x037f;
+        struct i387_soft_struct *soft = &tsk->thread.xstate->soft;
-        partial_status = 0;
+        struct address *oaddr, *iaddr;
-        top = 0;                /* We don't keep top in the status word internally. */
+        soft->cwd = 0x037f;
-        fpu_tag_word = 0xffff;
+        soft->swd = 0;
+        soft->ftop = 0; /* We don't keep top in the status word internally. */
+        soft->twd = 0xffff;
        /* The behaviour is different from that detailed in
           Section 15.1.6 of the Intel manual */
-        operand_address.offset = 0;
+        oaddr = (struct address *)&soft->foo;
-        operand_address.selector = 0;
+        oaddr->offset = 0;
-        instruction_address.offset = 0;
+        oaddr->selector = 0;
-        instruction_address.selector = 0;
+        iaddr = (struct address *)&soft->fip;
-        instruction_address.opcode = 0;
+        iaddr->offset = 0;
-        no_ip_update = 1;
+        iaddr->selector = 0;
+        iaddr->opcode = 0;
+        soft->no_update = 1;
+}
+void finit(void)
+{
+        finit_task(current);
 }
 /*
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index d57dfffb0213..2966c6b8d304 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -131,6 +131,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
        return pte_offset_kernel(pmd, 0);
 }
+/*
+ * Return a pointer to the pmd entry covering @vaddr in swapper_pg_dir.
+ * NOTE(review): one_md_table_init() is defined outside this hunk;
+ * presumably it sets up the pmd table for the pgd entry when it is
+ * missing - verify.
+ */
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+        int pgd_idx = pgd_index(vaddr);
+        int pmd_idx = pmd_index(vaddr);
+        return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
+}
+/*
+ * Return a pointer to the pte entry covering @vaddr: look up the pmd
+ * entry through populate_extra_pmd(), obtain its page table from
+ * one_page_table_init() and index it with pte_index(@vaddr).
+ */
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+        pte_t *table = one_page_table_init(populate_extra_pmd(vaddr));
+        int idx = pte_index(vaddr);
+        return table + idx;
+}
 static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
                                           unsigned long vaddr, pte_t *lastpte)
 {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 7dd7ce49d69b..8a853bc3b287 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -161,34 +161,51 @@ static __ref void *spp_getpage(void)
        return ptr;
 }
-void
+static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
-set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
 {
-        pud_t *pud;
+        if (pgd_none(*pgd)) {
-        pmd_t *pmd;
+                pud_t *pud = (pud_t *)spp_getpage();
-        pte_t *pte;
+                pgd_populate(&init_mm, pgd, pud);
+                if (pud != pud_offset(pgd, 0))
+                        printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
+                               pud, pud_offset(pgd, 0));
+        }
+        return pud_offset(pgd, vaddr);
+}
-        pud = pud_page + pud_index(vaddr);
+static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
+{
        if (pud_none(*pud)) {
-                pmd = (pmd_t *) spp_getpage();
+                pmd_t *pmd = (pmd_t *) spp_getpage();
                pud_populate(&init_mm, pud, pmd);
-                if (pmd != pmd_offset(pud, 0)) {
+                if (pmd != pmd_offset(pud, 0))
                        printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
-                                pmd, pmd_offset(pud, 0));
+                               pmd, pmd_offset(pud, 0));
-                        return;
-                }
        }
-        pmd = pmd_offset(pud, vaddr);
+        return pmd_offset(pud, vaddr);
+}
+static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
+{
        if (pmd_none(*pmd)) {
-                pte = (pte_t *) spp_getpage();
+                pte_t *pte = (pte_t *) spp_getpage();
                pmd_populate_kernel(&init_mm, pmd, pte);
-                if (pte != pte_offset_kernel(pmd, 0)) {
+                if (pte != pte_offset_kernel(pmd, 0))
                        printk(KERN_ERR "PAGETABLE BUG #02!\n");
-                        return;
-                }
        }
+        return pte_offset_kernel(pmd, vaddr);
+}
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+{
+        pud_t *pud;
+        pmd_t *pmd;
+        pte_t *pte;
+        pud = pud_page + pud_index(vaddr);
+        pmd = fill_pmd(pud, vaddr);
+        pte = fill_pte(pmd, vaddr);
-        pte = pte_offset_kernel(pmd, vaddr);
        set_pte(pte, new_pte);
        /*
@@ -198,8 +215,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
        __flush_tlb_one(vaddr);
 }
-void
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
-set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 {
        pgd_t *pgd;
        pud_t *pud_page;
@@ -216,6 +232,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval)
        set_pte_vaddr_pud(pud_page, vaddr, pteval);
 }
+/*
+ * Return a pointer to the pmd entry covering @vaddr in the kernel
+ * page tables.  The pud and pmd levels are reached through fill_pud()
+ * and fill_pmd(), which populate a missing level with a page from
+ * spp_getpage() before descending.
+ */
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+        pgd_t *pgd;
+        pud_t *pud;
+        pgd = pgd_offset_k(vaddr);
+        pud = fill_pud(pgd, vaddr);
+        return fill_pmd(pud, vaddr);
+}
+/*
+ * Return a pointer to the pte entry covering @vaddr.  The pmd entry
+ * comes from populate_extra_pmd(); fill_pte() populates the pte page
+ * if the pmd entry is still empty.
+ */
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+        return fill_pte(populate_extra_pmd(vaddr), vaddr);
+}
 /*
 * Create large page table mappings for a range of physical addresses.
 */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c52f4034c7fd..82cd39a6cbd3 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -103,7 +103,7 @@ static void xen_vcpu_setup(int cpu)
        vcpup = &per_cpu(xen_vcpu_info, cpu);
-        info.mfn = virt_to_mfn(vcpup);
+        info.mfn = arbitrary_virt_to_mfn(vcpup);
        info.offset = offset_in_page(vcpup);
        printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
@@ -301,8 +301,10 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
        frames = mcs.args;
        for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
-                frames[f] = virt_to_mfn(va);
+                frames[f] = arbitrary_virt_to_mfn((void *)va);
                make_lowmem_page_readonly((void *)va);
+                make_lowmem_page_readonly(mfn_to_virt(frames[f]));
        }
        MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
@@ -314,7 +316,7 @@ static void load_TLS_descriptor(struct thread_struct *t,
                                unsigned int cpu, unsigned int i)
 {
        struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-        xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+        xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
        struct multicall_space mc = __xen_mc_entry(0);
        MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
@@ -488,7 +490,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
                break;
        default: {
-                xmaddr_t maddr = virt_to_machine(&dt[entry]);
+                xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
                xen_mc_flush();
                if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 319bd40a57c2..cb6afa4ec95c 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -276,6 +276,13 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
        p2m_top[topidx][idx] = mfn;
 }
+/*
+ * Frame-number counterpart of arbitrary_virt_to_machine(): translate
+ * @vaddr to a machine address and shift it down to a frame number
+ * with PFN_DOWN().
+ */
+unsigned long arbitrary_virt_to_mfn(void *vaddr)
+{
+        return PFN_DOWN(arbitrary_virt_to_machine(vaddr).maddr);
+}
 xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 {
        unsigned long address = (unsigned long)vaddr;
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 035582ae815d..8d470562ffc9 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -219,6 +219,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 {
        struct vcpu_guest_context *ctxt;
        struct desc_struct *gdt;
+        unsigned long gdt_mfn;
        if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
                return 0;
@@ -248,9 +249,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
        ctxt->ldt_ents = 0;
        BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+        gdt_mfn = arbitrary_virt_to_mfn(gdt);
        make_lowmem_page_readonly(gdt);
+        make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
-        ctxt->gdt_frames[0] = virt_to_mfn(gdt);
+        ctxt->gdt_frames[0] = gdt_mfn;
        ctxt->gdt_ents      = GDT_ENTRIES;
        ctxt->user_regs.cs = __KERNEL_CS;
diff --git a/block/blktrace.c b/block/blktrace.c
index 7cf9d1ff45a0..028120a0965a 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -363,7 +363,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
        if (!bt->sequence)
                goto err;
-        bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG);
+        bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
        if (!bt->msg_data)
                goto err;
diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
index 9cc769b587ff..68fd3d292799 100644
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -516,12 +516,12 @@ int acpi_processor_preregister_performance(
                        continue;
                }
-                if (!performance || !percpu_ptr(performance, i)) {
+                if (!performance || !per_cpu_ptr(performance, i)) {
                        retval = -EINVAL;
                        continue;
                }
-                pr->performance = percpu_ptr(performance, i);
+                pr->performance = per_cpu_ptr(performance, i);
                cpumask_set_cpu(i, pr->performance->shared_cpu_map);
                if (acpi_processor_get_psd(pr)) {
                        retval = -EINVAL;
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 95837bfb5256..455d83219fae 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -65,23 +65,20 @@ extern void free_bootmem(unsigned long addr, unsigned long size);
 #define BOOTMEM_DEFAULT         0
 #define BOOTMEM_EXCLUSIVE       (1<<0)
+extern int reserve_bootmem(unsigned long addr,
+                           unsigned long size,
+                           int flags);
 extern int reserve_bootmem_node(pg_data_t *pgdat,
-                                 unsigned long physaddr,
+                                unsigned long physaddr,
-                                 unsigned long size,
+                                unsigned long size,
-                                 int flags);
+                                int flags);
-#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
-extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags);
-#endif
-extern void *__alloc_bootmem_nopanic(unsigned long size,
+extern void *__alloc_bootmem(unsigned long size,
                             unsigned long align,
                             unsigned long goal);
-extern void *__alloc_bootmem(unsigned long size,
+extern void *__alloc_bootmem_nopanic(unsigned long size,
                                     unsigned long align,
                                     unsigned long goal);
-extern void *__alloc_bootmem_low(unsigned long size,
-                                 unsigned long align,
-                                 unsigned long goal);
 extern void *__alloc_bootmem_node(pg_data_t *pgdat,
                                  unsigned long size,
                                  unsigned long align,
@@ -90,30 +87,35 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
                                  unsigned long size,
                                  unsigned long align,
                                  unsigned long goal);
+extern void *__alloc_bootmem_low(unsigned long size,
+                                 unsigned long align,
+                                 unsigned long goal);
 extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
                                      unsigned long size,
                                      unsigned long align,
                                      unsigned long goal);
-#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 #define alloc_bootmem(x) \
        __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_nopanic(x) \
        __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low(x) \
-        __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
 #define alloc_bootmem_pages(x) \
        __alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_pages_nopanic(x) \
        __alloc_bootmem_nopanic(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low_pages(x) \
-        __alloc_bootmem_low(x, PAGE_SIZE, 0)
 #define alloc_bootmem_node(pgdat, x) \
        __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_pages_node(pgdat, x) \
        __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_pages_node_nopanic(pgdat, x) \
+        __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_low(x) \
+        __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
+#define alloc_bootmem_low_pages(x) \
+        __alloc_bootmem_low(x, PAGE_SIZE, 0)
 #define alloc_bootmem_low_pages_node(pgdat, x) \
        __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
-#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 extern int reserve_bootmem_generic(unsigned long addr, unsigned long size,
                                   int flags);
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 3577ffd90d45..545b068bcb70 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -76,52 +76,98 @@
 #ifdef CONFIG_SMP
+#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+/* minimum unit size, also is the maximum supported allocation size */
+#define PCPU_MIN_UNIT_SIZE              (16UL << PAGE_SHIFT)
+/*
+ * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
+ * back on the first chunk if arch is manually allocating and mapping
+ * it for faster access (as a part of large page mapping for example).
+ * Note that dynamic percpu allocator covers both static and dynamic
+ * areas, so these values are bigger than PERCPU_MODULE_RESERVE.
+ *
+ * On typical configuration with modules, the following values leave
+ * about 8k of free space on the first chunk after boot on both x86_32
+ * and 64 when module support is enabled.  When module support is
+ * disabled, it's much tighter.
+ */
+#ifndef PERCPU_DYNAMIC_RESERVE
+#  if BITS_PER_LONG > 32
+#    ifdef CONFIG_MODULES
+#      define PERCPU_DYNAMIC_RESERVE    (6 << PAGE_SHIFT)
+#    else
+#      define PERCPU_DYNAMIC_RESERVE    (4 << PAGE_SHIFT)
+#    endif
+#  else
+#    ifdef CONFIG_MODULES
+#      define PERCPU_DYNAMIC_RESERVE    (4 << PAGE_SHIFT)
+#    else
+#      define PERCPU_DYNAMIC_RESERVE    (2 << PAGE_SHIFT)
+#    endif
+#  endif
+#endif  /* PERCPU_DYNAMIC_RESERVE */
+extern void *pcpu_base_addr;
+typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
+typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
+extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
+                                        size_t static_size, size_t unit_size,
+                                        size_t free_size, void *base_addr,
+                                        pcpu_populate_pte_fn_t populate_pte_fn);
+/*
+ * Use this to get to a cpu's version of the per-cpu object
+ * dynamically allocated. Non-atomic access to the current CPU's
+ * version should probably be combined with get_cpu()/put_cpu().
+ */
+#define per_cpu_ptr(ptr, cpu)   SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
+#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
 struct percpu_data {
        void *ptrs[1];
 };
 #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
-/* 
- * Use this to get to a cpu's version of the per-cpu object dynamically
+#define per_cpu_ptr(ptr, cpu)                                           \
- * allocated. Non-atomic access to the current CPU's version should
+({                                                                      \
- * probably be combined with get_cpu()/put_cpu().
+        struct percpu_data *__p = __percpu_disguise(ptr);               \
- */ 
+        (__typeof__(ptr))__p->ptrs[(cpu)];                              \
-#define percpu_ptr(ptr, cpu)                              \
-({                                                        \
-        struct percpu_data *__p = __percpu_disguise(ptr); \
-        (__typeof__(ptr))__p->ptrs[(cpu)];                \
 })
-extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask);
+#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
-extern void percpu_free(void *__pdata);
+extern void *__alloc_percpu(size_t size, size_t align);
+extern void free_percpu(void *__pdata);
 #else /* CONFIG_SMP */
-#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
+#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
-static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
+static inline void *__alloc_percpu(size_t size, size_t align)
 {
-        return kzalloc(size, gfp);
+        /*
+         * Can't easily make larger alignment work with kmalloc.  WARN
+         * on it.  Larger alignment should only be used for module
+         * percpu sections on SMP for which this path isn't used.
+         */
+        WARN_ON_ONCE(align > SMP_CACHE_BYTES);
+        return kzalloc(size, GFP_KERNEL);
 }
-static inline void percpu_free(void *__pdata)
+static inline void free_percpu(void *p)
 {
-        kfree(__pdata);
+        kfree(p);
 }
 #endif /* CONFIG_SMP */
-#define percpu_alloc_mask(size, gfp, mask) \
+#define alloc_percpu(type)      (type *)__alloc_percpu(sizeof(type), \
-        __percpu_alloc_mask((size), (gfp), &(mask))
+                                                       __alignof__(type))
-#define percpu_alloc(size, gfp) percpu_alloc_mask((size), (gfp), cpu_online_map)
-/* (legacy) interface for use without CPU hotplug handling */
-#define __alloc_percpu(size)    percpu_alloc_mask((size), GFP_KERNEL, \
-                                                  cpu_possible_map)
-#define alloc_percpu(type)      (type *)__alloc_percpu(sizeof(type))
-#define free_percpu(ptr)        percpu_free((ptr))
-#define per_cpu_ptr(ptr, cpu)   percpu_ptr((ptr), (cpu))
 #endif /* __LINUX_PERCPU_H */
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 9c0890c7a06a..a43ebec3a7b9 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -95,6 +95,9 @@ extern struct vm_struct *remove_vm_area(const void *addr);
 extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
                        struct page ***pages);
+extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
+                                    pgprot_t prot, struct page **pages);
+extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
 extern void unmap_kernel_range(unsigned long addr, unsigned long size);
 /* Allocate/destroy a 'vmalloc' VM area. */
@@ -110,5 +113,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count);
 */
 extern rwlock_t vmlist_lock;
 extern struct vm_struct *vmlist;
+extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
 #endif /* _LINUX_VMALLOC_H */
diff --git a/kernel/module.c b/kernel/module.c
index ba22484a987e..1f0657ae555b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -51,6 +51,7 @@
 #include <linux/tracepoint.h>
 #include <linux/ftrace.h>
 #include <linux/async.h>
+#include <linux/percpu.h>
 #if 0
 #define DEBUGP printk
@@ -366,6 +367,34 @@ static struct module *find_module(const char *name)
 }
 #ifdef CONFIG_SMP
+#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+static void *percpu_modalloc(unsigned long size, unsigned long align,
+                             const char *name)  /* @name is only used in the warning */
+{
+        void *ptr;
+        if (align > PAGE_SIZE) {        /* warn and clamp alignment to PAGE_SIZE */
+                printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
+                       name, align, PAGE_SIZE);
+                align = PAGE_SIZE;
+        }
+        ptr = __alloc_percpu(size, align);
+        if (!ptr)       /* failure is logged here and NULL is returned to the caller */
+                printk(KERN_WARNING
+                       "Could not allocate %lu bytes percpu data\n", size);
+        return ptr;
+}
+static void percpu_modfree(void *freeme)       /* thin wrapper around free_percpu() */
+{
+        free_percpu(freeme);
+}
+#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
 /* Number of blocks used and allocated. */
 static unsigned int pcpu_num_used, pcpu_num_allocated;
 /* Size of each block.  -ve means used. */
@@ -480,21 +509,6 @@ static void percpu_modfree(void *freeme)
        }
 }
-static unsigned int find_pcpusec(Elf_Ehdr *hdr,
-                                 Elf_Shdr *sechdrs,
-                                 const char *secstrings)
-{
-        return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
-}
-static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
-{
-        int cpu;
-        for_each_possible_cpu(cpu)
-                memcpy(pcpudest + per_cpu_offset(cpu), from, size);
-}
 static int percpu_modinit(void)
 {
        pcpu_num_used = 2;
@@ -513,7 +527,26 @@ static int percpu_modinit(void)
        return 0;
 }
 __initcall(percpu_modinit);
+#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+static unsigned int find_pcpusec(Elf_Ehdr *hdr,
+                                 Elf_Shdr *sechdrs,
+                                 const char *secstrings)
+{
+        return find_sec(hdr, sechdrs, secstrings, ".data.percpu");      /* look up the section by name */
+}
+static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
+{
+        int cpu;
+        for_each_possible_cpu(cpu)      /* every possible cpu gets its own copy */
+                memcpy(pcpudest + per_cpu_offset(cpu), from, size);     /* @pcpudest shifted by that cpu's offset */
+}
 #else /* ... !CONFIG_SMP */
 static inline void *percpu_modalloc(unsigned long size, unsigned long align,
                                    const char *name)
 {
@@ -535,6 +568,7 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
        /* pcpusec should be 0, and size of that section should be 0. */
        BUG_ON(size != 0);
 }
 #endif /* CONFIG_SMP */
 #define MODINFO_ATTR(field)     \
diff --git a/kernel/sched.c b/kernel/sched.c
index 4070cd34effd..0a76d0b6f215 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9485,7 +9485,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
 {
-        u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+        u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
        u64 data;
 #ifndef CONFIG_64BIT
@@ -9504,7 +9504,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
 {
-        u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+        u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
 #ifndef CONFIG_64BIT
        /*
@@ -9600,7 +9600,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
        ca = task_ca(tsk);
        for (; ca; ca = ca->parent) {
-                u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+                u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                *cpuusage += cputime;
        }
 }
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 0cd415ee62a2..74541ca49536 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
         * doesn't hit this CPU until we're ready. */
        get_cpu();
        for_each_online_cpu(i) {
-                sm_work = percpu_ptr(stop_machine_work, i);
+                sm_work = per_cpu_ptr(stop_machine_work, i);
                INIT_WORK(sm_work, stop_cpu);
                queue_work_on(i, stop_machine_wq, sm_work);
        }
diff --git a/mm/Makefile b/mm/Makefile
index 72255be57f89..818569b68f46 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -30,6 +30,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
+ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+obj-$(CONFIG_SMP) += percpu.o
+else
 obj-$(CONFIG_SMP) += allocpercpu.o
+endif
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 4297bc41bfd2..3653c570232b 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -99,45 +99,51 @@ static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
        __percpu_populate_mask((__pdata), (size), (gfp), &(mask))
 /**
- * percpu_alloc_mask - initial setup of per-cpu data
+ * __alloc_percpu - initial setup of per-cpu data
  * @size: size of per-cpu object
- * @gfp: may sleep or not etc.
+ * @align: requested alignment (only checked, see the WARN below)
- * @mask: populate per-data for cpu's selected through mask bits
  *
- * Populating per-cpu data for all online cpu's would be a typical use case,
+ * Allocate dynamic percpu area.  Percpu objects are populated with
- * which is simplified by the percpu_alloc() wrapper.
+ * zeroed buffers.  Allocation uses GFP_KERNEL and populates the area
+ * for cpu_possible_map.
+ *
+ * RETURNS:
+ * The disguised percpu pointer on success, NULL on failure.
- * Per-cpu objects are populated with zeroed buffers.
  */
-void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
+void *__alloc_percpu(size_t size, size_t align)
 {
        /*
         * We allocate whole cache lines to avoid false sharing
         */
        size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
-        void *pdata = kzalloc(sz, gfp);
+        void *pdata = kzalloc(sz, GFP_KERNEL);
        void *__pdata = __percpu_disguise(pdata);
+        /*
+         * Can't easily make larger alignment work with kmalloc.  WARN
+         * on it.  Larger alignment should only be used for module
+         * percpu sections on SMP for which this path isn't used.
+         *
+         * The limit is SMP_CACHE_BYTES, same as the !CONFIG_SMP
+         * __alloc_percpu(), so that cacheline aligned types passed
+         * through alloc_percpu() don't trigger a spurious warning.
+         */
+        WARN_ON_ONCE(align > SMP_CACHE_BYTES);
        if (unlikely(!pdata))
                return NULL;
-        if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask)))
+        if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL,
+                                           &cpu_possible_map)))
                return __pdata;
        kfree(pdata);
        return NULL;
 }
-EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
+EXPORT_SYMBOL_GPL(__alloc_percpu);
 /**
- * percpu_free - final cleanup of per-cpu data
+ * free_percpu - final cleanup of per-cpu data
  * @__pdata: object to clean up
  *
  * We simply clean up any per-cpu object left. No need for the client to
  * track and specify through a bit mask which per-cpu objects are to free.
  */
-void percpu_free(void *__pdata)
+void free_percpu(void *__pdata)
 {
        if (unlikely(!__pdata))
                return;         /* NULL is accepted and ignored */
        __percpu_depopulate_mask(__pdata, &cpu_possible_map);
        kfree(__percpu_disguise(__pdata));
 }
-EXPORT_SYMBOL_GPL(percpu_free);
+EXPORT_SYMBOL_GPL(free_percpu);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 51a0ccf61e0e..daf92713f7de 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -382,7 +382,6 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
        return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
 }
-#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 /**
 * reserve_bootmem - mark a page range as usable
 * @addr: starting address of the range
@@ -403,7 +402,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
        return mark_bootmem(start, end, 1, flags);
 }
-#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
                        unsigned long step)
@@ -429,8 +427,8 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
 }
 static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
-                                unsigned long size, unsigned long align,
+                                        unsigned long size, unsigned long align,
-                                unsigned long goal, unsigned long limit)
+                                        unsigned long goal, unsigned long limit)
 {
        unsigned long fallback = 0;
        unsigned long min, max, start, sidx, midx, step;
@@ -530,17 +528,34 @@ find_block:
        return NULL;
 }
+static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
+                                        unsigned long size, unsigned long align,
+                                        unsigned long goal, unsigned long limit)
+{
+#ifdef CONFIG_HAVE_ARCH_BOOTMEM
+        bootmem_data_t *p_bdata;        /* node chosen by the arch hook, may be NULL */
+        p_bdata = bootmem_arch_preferred_node(bdata, size, align, goal, limit);
+        if (p_bdata)
+                return alloc_bootmem_core(p_bdata, size, align, goal, limit);
+#endif
+        return NULL;    /* no arch preference (or hook not configured): caller falls back */
+}
 static void * __init ___alloc_bootmem_nopanic(unsigned long size,
                                        unsigned long align,
                                        unsigned long goal,
                                        unsigned long limit)
 {
        bootmem_data_t *bdata;
+        void *region;
 restart:
-        list_for_each_entry(bdata, &bdata_list, list) {
+        region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit);
-                void *region;
+        if (region)
+                return region;
+        list_for_each_entry(bdata, &bdata_list, list) {
                if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
                        continue;
                if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
@@ -618,6 +633,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
 {
        void *ptr;
+        ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit);
+        if (ptr)
+                return ptr;
        ptr = alloc_bootmem_core(bdata, size, align, goal, limit);
        if (ptr)
                return ptr;
@@ -674,6 +693,10 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 {
        void *ptr;
+        ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
+        if (ptr)
+                return ptr;
        ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
        if (ptr)
                return ptr;
diff --git a/mm/percpu.c b/mm/percpu.c
new file mode 100644
index 000000000000..3d0f5456827c
--- /dev/null
+++ b/mm/percpu.c
@@ -0,0 +1,979 @@
+/*
+ * linux/mm/percpu.c - percpu memory allocator
+ *
+ * Copyright (C) 2009           SUSE Linux Products GmbH
+ * Copyright (C) 2009           Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * This is percpu allocator which can handle both static and dynamic
+ * areas.  Percpu areas are allocated in chunks in vmalloc area.  Each
+ * chunk consists of num_possible_cpus() units and the first chunk
+ * is used for static percpu variables in the kernel image (special
+ * boot time alloc/init handling necessary as these areas need to be
+ * brought up before allocation services are running).  Unit grows as
+ * necessary and all units grow or shrink in unison.  When a chunk is
+ * filled up, another chunk is allocated.  ie. in vmalloc area
+ *
+ *  c0                           c1                         c2
+ *  -------------------          -------------------        ------------
+ * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
+ *  -------------------  ......  -------------------  ....  ------------
+ *
+ * Allocation is done in offset-size areas of single unit space.  Ie,
+ * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
+ * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
+ * percpu base registers UNIT_SIZE apart.
+ *
+ * There are usually many small percpu allocations many of them as
+ * small as 4 bytes.  The allocator organizes chunks into lists
+ * according to free size and tries to allocate from the fullest one.
+ * Each chunk keeps the maximum contiguous area size hint which is
+ * guaranteed to be equal to or larger than the maximum contiguous
+ * area in the chunk.  This helps the allocator not to iterate the
+ * chunk maps unnecessarily.
+ *
+ * Allocation state in each chunk is kept using an array of integers
+ * on chunk->map.  A positive value in the map represents a free
+ * region and negative allocated.  Allocation inside a chunk is done
+ * by scanning this map sequentially and serving the first matching
+ * entry.  This is mostly copied from the percpu_modalloc() allocator.
+ * Chunks are also linked into a rb tree to ease address to chunk
+ * mapping during free.
+ *
+ * To use this allocator, arch code should do the following.
+ *
+ * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+ *
+ * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
+ *   regular address to percpu pointer and back
+ *
+ * - use pcpu_setup_first_chunk() during percpu area initialization to
+ *   setup the first chunk containing the kernel static percpu area
+ */
+#include <linux/bitmap.h>
+#include <linux/bootmem.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/pfn.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#define PCPU_SLOT_BASE_SHIFT            5       /* 1-31 shares the same slot */
+#define PCPU_DFL_MAP_ALLOC              16      /* start a map with 16 ents */
+struct pcpu_chunk {     /* per-chunk allocator state */
+        struct list_head        list;           /* linked to pcpu_slot lists */
+        struct rb_node          rb_node;        /* key is chunk->vm->addr */
+        int                     free_size;      /* free bytes in the chunk */
+        int                     contig_hint;    /* max contiguous size hint */
+        struct vm_struct        *vm;            /* mapped vmalloc region */
+        int                     map_used;       /* # of map entries used */
+        int                     map_alloc;      /* # of map entries allocated */
+        int                     *map;           /* allocation map: >0 free, <0 allocated, in bytes */
+        bool                    immutable;      /* no [de]population allowed */
+        struct page             *page[];        /* #cpus * UNIT_PAGES, indexed by pcpu_page_idx() */
+};
+static int pcpu_unit_pages __read_mostly;       /* page stride between cpus in chunk->page[] */
+static int pcpu_unit_size __read_mostly;        /* a free size equal to this maps to the last slot */
+static int pcpu_chunk_size __read_mostly;
+static int pcpu_nr_slots __read_mostly;         /* slot count; the last index is pcpu_nr_slots - 1 */
+static size_t pcpu_chunk_struct_size __read_mostly;
+/* the address of the first chunk which starts with the kernel static area */
+void *pcpu_base_addr __read_mostly;
+EXPORT_SYMBOL_GPL(pcpu_base_addr);
+/* the size of kernel static area */
+static int pcpu_static_size __read_mostly;
+/*
+ * One mutex to rule them all.
+ *
+ * The following mutex is grabbed in the outermost public alloc/free
+ * interface functions and released only when the operation is
+ * complete.  As such, every function in this file other than the
+ * outermost functions is called under pcpu_mutex.
+ *
+ * It can easily be switched to use spinlock such that only the area
+ * allocation and page population commit are protected with it doing
+ * actual [de]allocation without holding any lock.  However, given
+ * what this allocator does, I think it's better to let them run
+ * sequentially.
+ */
+static DEFINE_MUTEX(pcpu_mutex);
+static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
+static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
+static int __pcpu_size_to_slot(int size)        /* slot index derived from the highest set bit of @size */
+{
+        int highbit = fls(size);        /* size is in bytes */
+        return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);      /* result is never below 1 */
+}
+static int pcpu_size_to_slot(int size)
+{
+        if (size == pcpu_unit_size)     /* a whole unit always maps to the last slot */
+                return pcpu_nr_slots - 1;
+        return __pcpu_size_to_slot(size);
+}
+static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
+{
+        if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
+                return 0;       /* less than an int's worth of usable space: slot 0 */
+        return pcpu_size_to_slot(chunk->free_size);     /* otherwise slot by total free bytes */
+}
+static int pcpu_page_idx(unsigned int cpu, int page_idx)        /* flat index of @cpu's page @page_idx */
+{
+        return cpu * pcpu_unit_pages + page_idx;        /* each cpu owns pcpu_unit_pages consecutive entries */
+}
+static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
+                                      unsigned int cpu, int page_idx)
+{
+        return &chunk->page[pcpu_page_idx(cpu, page_idx)];      /* address of the slot, not the page itself */
+}
+static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
+                                     unsigned int cpu, int page_idx)
+{
+        return (unsigned long)chunk->vm->addr +         /* start of the chunk's vm area */
+                (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);   /* plus the flat page index in bytes */
+}
+static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
+                                     int page_idx)
+{
+        return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;   /* only cpu 0's entry is inspected */
+}
+/**
+ * pcpu_realloc - versatile realloc
+ * @p: the current pointer (can be NULL for new allocations)
+ * @size: the current size in bytes (can be 0 for new allocations)
+ * @new_size: the wanted new size in bytes (can be 0 for free)
+ *
+ * More robust realloc which can be used to allocate, resize or free a
+ * memory area of arbitrary size.  If the needed size goes over
+ * PAGE_SIZE, kernel VM is used.
+ *
+ * A new buffer is always allocated.  The first min(@size, @new_size)
+ * bytes are copied from @p, bytes beyond @size are zeroed, and @p is
+ * then released.  If the allocation fails, @p is left untouched.
+ *
+ * RETURNS:
+ * The new pointer on success, NULL on failure.
+ */
+static void *pcpu_realloc(void *p, size_t size, size_t new_size)
+{
+        void *new;
+        if (new_size <= PAGE_SIZE)
+                new = kmalloc(new_size, GFP_KERNEL);
+        else
+                new = vmalloc(new_size);
+        if (new_size && !new)   /* a NULL result only counts as failure for a non-zero request */
+                return NULL;
+        memcpy(new, p, min(size, new_size));
+        if (new_size > size)
+                memset(new + size, 0, new_size - size); /* zero the newly added tail */
+        if (size <= PAGE_SIZE)  /* same size rule as above picks the matching free routine */
+                kfree(p);
+        else
+                vfree(p);
+        return new;
+}
+/**
+ * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
+ * @chunk: chunk of interest
+ * @oslot: the previous slot it was on
+ *
+ * This function is called after an allocation or free changed @chunk.
+ * New slot according to the changed state is determined and @chunk is
+ * moved to the slot.
+ *
+ * Nothing is done if the slot is unchanged.  A chunk moving to a
+ * higher slot is added with list_move(), one moving to a lower slot
+ * with list_move_tail().
+ */
+static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
+{
+        int nslot = pcpu_chunk_slot(chunk);     /* slot for the chunk's current state */
+        if (oslot != nslot) {
+                if (oslot < nslot)
+                        list_move(&chunk->list, &pcpu_slot[nslot]);
+                else
+                        list_move_tail(&chunk->list, &pcpu_slot[nslot]);
+        }
+}
+static struct rb_node **pcpu_chunk_rb_search(void *addr,
+                                             struct rb_node **parentp)
+{
+        struct rb_node **p = &pcpu_addr_root.rb_node;   /* link currently being examined */
+        struct rb_node *parent = NULL;                  /* last node visited, NULL for an empty tree */
+        struct pcpu_chunk *chunk;
+        while (*p) {
+                parent = *p;
+                chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
+                if (addr < chunk->vm->addr)
+                        p = &(*p)->rb_left;
+                else if (addr > chunk->vm->addr)
+                        p = &(*p)->rb_right;
+                else
+                        break;  /* exact match; parent is the matching node itself */
+        }
+        if (parentp)    /* optional out-parameter */
+                *parentp = parent;
+        return p;       /* *p is the matching node, or NULL at the link where @addr would be inserted */
+}
+/**
+ * pcpu_chunk_addr_search - search for chunk containing specified address
+ * @addr: address to search for
+ *
+ * Look for chunk which might contain @addr.  More specifically, it
+ * searches for the chunk with the highest start address which isn't
+ * beyond @addr.
+ *
+ * BUGs if the tree is empty or if no chunk starts at or below @addr.
+ *
+ * RETURNS:
+ * The address of the found chunk.
+ */
+static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
+{
+        struct rb_node *n, *parent;
+        struct pcpu_chunk *chunk;
+        n = *pcpu_chunk_rb_search(addr, &parent);
+        if (!n) {
+                /* no exactly matching chunk, the parent is the closest */
+                n = parent;
+                BUG_ON(!n);     /* NULL parent means the tree was empty */
+        }
+        chunk = rb_entry(n, struct pcpu_chunk, rb_node);
+        if (addr < chunk->vm->addr) {
+                /* the parent was the next one, look for the previous one */
+                n = rb_prev(n);
+                BUG_ON(!n);     /* nothing starts at or below @addr */
+                chunk = rb_entry(n, struct pcpu_chunk, rb_node);
+        }
+        return chunk;
+}
+/**
+ * pcpu_chunk_addr_insert - insert chunk into address rb tree
+ * @new: chunk to insert
+ *
+ * Insert @new into address rb tree.  The key is @new->vm->addr.
+ * BUGs if a chunk with the same start address is already in the tree.
+ */
+static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
+{
+        struct rb_node **p, *parent;
+        p = pcpu_chunk_rb_search(new->vm->addr, &parent);
+        BUG_ON(*p);     /* non-NULL *p means an exact match was found */
+        rb_link_node(&new->rb_node, parent, p);
+        rb_insert_color(&new->rb_node, &pcpu_addr_root);
+}
+/**
+ * pcpu_split_block - split a map block
+ * @chunk: chunk of interest
+ * @i: index of map block to split
+ * @head: head size in bytes (can be 0)
+ * @tail: tail size in bytes (can be 0)
+ *
+ * Split the @i'th map block into two or three blocks.  If @head is
+ * non-zero, @head bytes block is inserted before block @i moving it
+ * to @i+1 and reducing its size by @head bytes.
+ *
+ * If @tail is non-zero, the target block, which can be @i or @i+1
+ * depending on @head, is reduced by @tail bytes and @tail byte block
+ * is inserted after the target block.
+ *
+ * If the map has too few entries, its allocation is doubled until the
+ * extra entries fit and the map is reallocated with pcpu_realloc().
+ *
+ * RETURNS:
+ * 0 on success, -ENOMEM if the map could not be grown.
+ */
+static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
+{
+        int nr_extra = !!head + !!tail; /* number of new entries: 0, 1 or 2 */
+        int target = chunk->map_used + nr_extra;        /* entries needed after the split */
+        /* reallocation required? */
+        if (chunk->map_alloc < target) {
+                int new_alloc = chunk->map_alloc;
+                int *new;
+                while (new_alloc < target)
+                        new_alloc *= 2;
+                new = pcpu_realloc(chunk->map,
+                                   chunk->map_alloc * sizeof(new[0]),
+                                   new_alloc * sizeof(new[0]));
+                if (!new)
+                        return -ENOMEM; /* chunk->map is left as it was */
+                chunk->map_alloc = new_alloc;
+                chunk->map = new;
+        }
+        /* insert a new subblock */
+        memmove(&chunk->map[i + nr_extra], &chunk->map[i],
+                sizeof(chunk->map[0]) * (chunk->map_used - i));
+        chunk->map_used += nr_extra;
+        if (head) {
+                chunk->map[i + 1] = chunk->map[i] - head;       /* remainder moves to @i+1 */
+                chunk->map[i++] = head;                         /* head block takes slot @i */
+        }
+        if (tail) {
+                chunk->map[i++] -= tail;        /* shrink the target block */
+                chunk->map[i] = tail;           /* tail block follows it */
+        }
+        return 0;
+}
+/**
+ * pcpu_alloc_area - allocate area from a pcpu_chunk
+ * @chunk: chunk of interest
+ * @size: wanted size in bytes
+ * @align: wanted align
+ *
+ * Try to allocate @size bytes area aligned at @align from @chunk.
+ * Note that this function only allocates the offset.  It doesn't
+ * populate or map the area.
+ *
+ * The map is scanned from the start and the first free block that can
+ * hold the alignment padding plus @size is used.  chunk->contig_hint
+ * is updated from the free blocks seen during the scan, and @chunk is
+ * relocated to the slot matching its new state before an offset or
+ * -ENOSPC is returned.
+ *
+ * RETURNS:
+ * Allocated offset in @chunk on success, -ENOSPC if no block fits,
+ * -ENOMEM if the allocation map could not be allocated or grown.
+ */
+static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
+{
+        int oslot = pcpu_chunk_slot(chunk);     /* slot before any change, for relocation */
+        int max_contig = 0;                     /* largest free block seen but not consumed */
+        int i, off;
+        /*
+         * The static chunk initially doesn't have map attached
+         * because kmalloc wasn't available during init.  Give it one.
+         */
+        if (unlikely(!chunk->map)) {
+                chunk->map = pcpu_realloc(NULL, 0,
+                                PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+                if (!chunk->map)
+                        return -ENOMEM;
+                chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
+                chunk->map[chunk->map_used++] = -pcpu_static_size;      /* static area is recorded as allocated */
+                if (chunk->free_size)
+                        chunk->map[chunk->map_used++] = chunk->free_size;       /* the rest is one free block */
+        }
+        for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
+                bool is_last = i + 1 == chunk->map_used;        /* evaluated before any split */
+                int head, tail;
+                /* extra for alignment requirement */
+                head = ALIGN(off, align) - off;
+                BUG_ON(i == 0 && head != 0);
+                if (chunk->map[i] < 0)  /* allocated block */
+                        continue;
+                if (chunk->map[i] < head + size) {      /* free but too small */
+                        max_contig = max(chunk->map[i], max_contig);
+                        continue;
+                }
+                /*
+                 * If head is small or the previous block is free,
+                 * merge'em.  Note that 'small' is defined as smaller
+                 * than sizeof(int), which is very small but isn't too
+                 * uncommon for percpu allocations.
+                 */
+                if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
+                        if (chunk->map[i - 1] > 0)
+                                chunk->map[i - 1] += head;      /* previous block is free: it grows */
+                        else {
+                                chunk->map[i - 1] -= head;      /* previous block is allocated (negative): it grows */
+                                chunk->free_size -= head;       /* the padding is no longer free */
+                        }
+                        chunk->map[i] -= head;
+                        off += head;
+                        head = 0;
+                }
+                /* if tail is small, just keep it around */
+                tail = chunk->map[i] - head - size;
+                if (tail < sizeof(int))
+                        tail = 0;
+                /* split if warranted */
+                if (head || tail) {
+                        if (pcpu_split_block(chunk, i, head, tail))
+                                return -ENOMEM;
+                        if (head) {
+                                i++;            /* step over the new head block */
+                                off += head;
+                                max_contig = max(chunk->map[i - 1], max_contig);
+                        }
+                        if (tail)
+                                max_contig = max(chunk->map[i + 1], max_contig);
+                }
+                /* update hint and mark allocated */
+                if (is_last)
+                        chunk->contig_hint = max_contig; /* fully scanned */
+                else
+                        chunk->contig_hint = max(chunk->contig_hint,
+                                                 max_contig);
+                chunk->free_size -= chunk->map[i];
+                chunk->map[i] = -chunk->map[i]; /* negative marks the block allocated */
+                pcpu_chunk_relocate(chunk, oslot);
+                return off;
+        }
+        chunk->contig_hint = max_contig;        /* fully scanned */
+        pcpu_chunk_relocate(chunk, oslot);
+        /*
+         * Tell the upper layer that this chunk has no area left.
+         * Note that this is not an error condition but a notification
+         * to upper layer that it needs to look at other chunks.
+         * -ENOSPC is chosen as it isn't used in memory subsystem and
+         * matches the meaning in a way.
+         */
+        return -ENOSPC;
+}
+/**
+ * pcpu_free_area - free area to a pcpu_chunk
+ * @chunk: chunk of interest
+ * @freeme: offset of area to free
+ *
+ * Free area starting from @freeme to @chunk.  Note that this function
+ * only modifies the allocation map.  It doesn't depopulate or unmap
+ * the area.
+ *
+ * The freed block is merged with free neighbours on either side,
+ * chunk->contig_hint is raised if the resulting block is larger, and
+ * @chunk is relocated to the slot matching its new state.  BUGs if
+ * @freeme is not the start offset of a map block or if that block's
+ * map entry is positive (i.e. it is already free).
+ */
+static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
+{
+        int oslot = pcpu_chunk_slot(chunk);     /* slot before the change, for relocation */
+        int i, off;
+        for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
+                if (off == freeme)      /* found the block starting at @freeme */
+                        break;
+        BUG_ON(off != freeme);
+        BUG_ON(chunk->map[i] > 0);
+        chunk->map[i] = -chunk->map[i]; /* flip sign: block becomes free */
+        chunk->free_size += chunk->map[i];
+        /* merge with previous? */
+        if (i > 0 && chunk->map[i - 1] >= 0) {
+                chunk->map[i - 1] += chunk->map[i];
+                chunk->map_used--;
+                memmove(&chunk->map[i], &chunk->map[i + 1],
+                        (chunk->map_used - i) * sizeof(chunk->map[0]));
+                i--;    /* the merged block now lives at the previous index */
+        }
+        /* merge with next? */
+        if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
+                chunk->map[i] += chunk->map[i + 1];
+                chunk->map_used--;
+                memmove(&chunk->map[i + 1], &chunk->map[i + 2],
+                        (chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
+        }
+        chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
+        pcpu_chunk_relocate(chunk, oslot);
+}
+/**
+ * pcpu_unmap - unmap pages out of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @page_start: page index of the first page to unmap
+ * @page_end: page index of the last page to unmap + 1
+ * @flush: whether to flush cache and tlb or not
+ *
+ * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
+ * If @flush is true, vcache is flushed before unmapping and tlb
+ * after.  Both flushes cover a single range running from unit 0's
+ * @page_start to unit (num_possible_cpus() - 1)'s @page_end.  A
+ * warning is issued if @chunk is marked immutable; the unmap is still
+ * carried out.
+ */
+static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
+                       bool flush)
+{
+        unsigned int last = num_possible_cpus() - 1;    /* unit index used for the end of the flush range */
+        unsigned int cpu;
+        /* unmap must not be done on immutable chunk */
+        WARN_ON(chunk->immutable);
+        /*
+         * Each flushing trial can be very expensive, issue flush on
+         * the whole region at once rather than doing it for each cpu.
+         * This could be an overkill but is more scalable.
+         */
+        if (flush)
+                flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
+                                   pcpu_chunk_addr(chunk, last, page_end));
+        for_each_possible_cpu(cpu)
+                unmap_kernel_range_noflush(
+                                pcpu_chunk_addr(chunk, cpu, page_start),
+                                (page_end - page_start) << PAGE_SHIFT);
+        /* ditto as flush_cache_vunmap() */
+        if (flush)
+                flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
+                                       pcpu_chunk_addr(chunk, last, page_end));
+}
+/**
+ * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
+ * @chunk: chunk to depopulate
+ * @off: offset to the area to depopulate
+ * @size: size of the area to depopulate in bytes
+ * @flush: whether to flush cache and tlb or not
+ *
+ * For each cpu, free the populated pages which overlap
+ * [@off, @off + @size) of @chunk and clear their page slots.  Slots
+ * which are already empty are skipped.  If any page was freed, the
+ * page range from the first to the last freed page index is unmapped
+ * with pcpu_unmap(), to which @flush is passed through.
+ */
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
+                                  bool flush)
+{
+        int page_start = PFN_DOWN(off);         /* first page touched by the area */
+        int page_end = PFN_UP(off + size);      /* one past the last page touched */
+        int unmap_start = -1;                   /* -1 until a page has been freed */
+        int uninitialized_var(unmap_end);       /* only read when unmap_start >= 0 */
+        unsigned int cpu;
+        int i;
+        for (i = page_start; i < page_end; i++) {
+                for_each_possible_cpu(cpu) {
+                        struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+                        if (!*pagep)
+                                continue;
+                        __free_page(*pagep);
+                        /*
+                         * If it's partial depopulation, it might get
+                         * populated or depopulated again.  Mark the
+                         * page gone.
+                         */
+                        *pagep = NULL;
+                        unmap_start = unmap_start < 0 ? i : unmap_start;        /* remember the first freed index */
+                        unmap_end = i + 1;
+                }
+        }
+        if (unmap_start >= 0)
+                pcpu_unmap(chunk, unmap_start, unmap_end, flush);
+}
+/**
+ * pcpu_map - map pages into a pcpu_chunk
+ * @chunk: chunk of interest
+ * @page_start: page index of the first page to map
+ * @page_end: page index of the last page to map + 1
+ *
+ * For each cpu, map pages [@page_start,@page_end) into @chunk using
+ * the page pointers already stored in the chunk.  vcache is flushed
+ * afterwards, once for the whole region.
+ *
+ * RETURNS:
+ * 0 on success.  If mapping fails for some cpu, the negative value
+ * from map_kernel_range_noflush() is returned immediately; mappings
+ * already set up for earlier cpus are left in place and no cache
+ * flush is issued.
+ */
+static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
+{
+        unsigned int last = num_possible_cpus() - 1; /* unit that ends the flush range */
+        unsigned int cpu;
+        int err;
+        /* map must not be done on an immutable chunk; this only warns */
+        WARN_ON(chunk->immutable);
+        for_each_possible_cpu(cpu) {
+                err = map_kernel_range_noflush(
+                                pcpu_chunk_addr(chunk, cpu, page_start),
+                                (page_end - page_start) << PAGE_SHIFT,
+                                PAGE_KERNEL,
+                                pcpu_chunk_pagep(chunk, cpu, page_start));
+                if (err < 0)
+                        return err;
+        }
+        /* flush at once, please read comments in pcpu_unmap() */
+        flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
+                         pcpu_chunk_addr(chunk, last, page_end));
+        return 0;
+}
+/**
+ * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @off: offset to the area to populate
+ * @size: size of the area to populate in bytes
+ *
+ * For each cpu, allocate and map every not-yet-occupied page that
+ * overlaps [@off, @off + @size) in @chunk.  Pages which are already
+ * occupied are left alone.  Consecutive newly allocated pages are
+ * mapped together as one run.  The area is cleared on return.
+ *
+ * On failure only the run that was being populated when the error hit
+ * is freed and unmapped.  Pages that were occupied on entry are never
+ * touched, as they may back other live allocations sharing a page
+ * with this area.  Runs completed earlier in the same call stay
+ * populated and can be reused by a later allocation.
+ *
+ * RETURNS:
+ * 0 on success, -ENOMEM on failure.
+ */
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+        const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
+        int page_start = PFN_DOWN(off);
+        int page_end = PFN_UP(off + size);
+        int map_start = -1; /* -1 while no run of new pages is open */
+        int uninitialized_var(map_end); /* only read when map_start >= 0 */
+        unsigned int cpu;
+        int i;
+        for (i = page_start; i < page_end; i++) {
+                if (pcpu_chunk_page_occupied(chunk, i)) {
+                        /* an occupied page closes the current run; map it */
+                        if (map_start >= 0) {
+                                if (pcpu_map(chunk, map_start, map_end))
+                                        goto err;
+                                map_start = -1;
+                        }
+                        continue;
+                }
+                /* extend the run before allocating so err sees page i too */
+                map_start = map_start < 0 ? i : map_start;
+                map_end = i + 1;
+                for_each_possible_cpu(cpu) {
+                        struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+                        *pagep = alloc_pages_node(cpu_to_node(cpu),
+                                                  alloc_mask, 0);
+                        if (!*pagep)
+                                goto err;
+                }
+        }
+        if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
+                goto err;
+        for_each_possible_cpu(cpu)
+                memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
+                       size);
+        return 0;
+err:
+        /*
+         * Likely under heavy memory pressure, give memory back.  Every
+         * path here has an open run [map_start, map_end) made solely of
+         * pages allocated by this call, so roll back exactly that run.
+         * Depopulating the whole [@off, @off + @size) range instead
+         * would also free pages that were occupied before this call.
+         */
+        if (map_start >= 0)
+                pcpu_depopulate_chunk(chunk, map_start << PAGE_SHIFT,
+                                      (map_end - map_start) << PAGE_SHIFT,
+                                      true);
+        return -ENOMEM;
+}
+/*
+ * Release a chunk: its vm area (if one was obtained), its allocation
+ * map and finally the chunk structure itself.  A NULL @chunk is a no-op.
+ */
+static void free_pcpu_chunk(struct pcpu_chunk *chunk)
+{
+        if (chunk) {
+                struct vm_struct *area = chunk->vm;
+                size_t map_bytes = chunk->map_alloc * sizeof(chunk->map[0]);
+                if (area)
+                        free_vm_area(area);
+                /* resizing the map to zero bytes is how it gets freed */
+                pcpu_realloc(chunk->map, map_bytes, 0);
+                kfree(chunk);
+        }
+}
+/*
+ * Allocate and initialise a new, completely free chunk: the zeroed
+ * chunk structure, its allocation map holding a single entry that
+ * spans the whole unit, and a vm area of pcpu_chunk_size bytes.  No
+ * pages are populated here.
+ *
+ * Returns the new chunk, or NULL if any of the allocations fails; on
+ * failure everything obtained so far is released.
+ */
+static struct pcpu_chunk *alloc_pcpu_chunk(void)
+{
+        struct pcpu_chunk *chunk;
+        chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
+        if (!chunk)
+                return NULL;
+        chunk->map = pcpu_realloc(NULL, 0,
+                                  PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+        if (!chunk->map) {
+                /*
+                 * pcpu_realloc() is assumed to return NULL when it cannot
+                 * allocate.  map[0] is written right below, so bail out
+                 * instead of dereferencing NULL.  Only the zeroed chunk
+                 * exists at this point, hence a plain kfree().
+                 */
+                kfree(chunk);
+                return NULL;
+        }
+        chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
+        /* one map entry: the whole unit is a single free area */
+        chunk->map[chunk->map_used++] = pcpu_unit_size;
+        chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
+        if (!chunk->vm) {
+                free_pcpu_chunk(chunk);
+                return NULL;
+        }
+        INIT_LIST_HEAD(&chunk->list);
+        chunk->free_size = pcpu_unit_size;
+        chunk->contig_hint = pcpu_unit_size;
+        return chunk;
+}
+/**
+ * __alloc_percpu - allocate percpu area
+ * @size: size of area to allocate in bytes (non-zero, at most
+ *        PCPU_MIN_UNIT_SIZE)
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Allocate percpu area of @size bytes aligned at @align.  Existing
+ * chunks are tried first; if none has room a new chunk is created.
+ * The backing pages are populated, mapped and cleared before
+ * returning.  The whole operation runs under pcpu_mutex.  Might
+ * sleep.  Might trigger writeouts.
+ *
+ * An out-of-range @size or @align triggers a warning and fails the
+ * allocation.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+void *__alloc_percpu(size_t size, size_t align)
+{
+        void *ptr = NULL; /* stays NULL on every failure path */
+        struct pcpu_chunk *chunk;
+        int slot, off;
+        if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
+                WARN(true, "illegal size (%zu) or align (%zu) for "
+                     "percpu allocation\n", size, align);
+                return NULL;
+        }
+        mutex_lock(&pcpu_mutex);
+        /*
+         * allocate area: walk the slots starting from the one matching
+         * @size and take the first chunk that can hold the area
+         */
+        for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
+                list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+                        if (size > chunk->contig_hint)
+                                continue;
+                        off = pcpu_alloc_area(chunk, size, align);
+                        if (off >= 0)
+                                goto area_found;
+                        if (off != -ENOSPC) /* anything but "no room" is fatal */
+                                goto out_unlock;
+                }
+        }
+        /*
+         * hmmm... no space left, create a new chunk.  Once linked in,
+         * the chunk stays registered even if the allocation below fails.
+         */
+        chunk = alloc_pcpu_chunk();
+        if (!chunk)
+                goto out_unlock;
+        pcpu_chunk_relocate(chunk, -1);
+        pcpu_chunk_addr_insert(chunk);
+        off = pcpu_alloc_area(chunk, size, align);
+        if (off < 0)
+                goto out_unlock;
+area_found:
+        /* populate, map and clear the area; undo the reservation on failure */
+        if (pcpu_populate_chunk(chunk, off, size)) {
+                pcpu_free_area(chunk, off);
+                goto out_unlock;
+        }
+        ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
+out_unlock:
+        mutex_unlock(&pcpu_mutex);
+        return ptr;
+}
+EXPORT_SYMBOL_GPL(__alloc_percpu);
+/*
+ * Destroy @chunk: release all of its pages without cache/tlb flushing,
+ * unlink it from the address tree and from its slot list, then free
+ * it.  Warns if the chunk is immutable.
+ */
+static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
+{
+        const bool flush = false;
+        WARN_ON(chunk->immutable);
+        pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, flush);
+        /* drop both lookup references before the memory goes away */
+        rb_erase(&chunk->rb_node, &pcpu_addr_root);
+        list_del(&chunk->list);
+        free_pcpu_chunk(chunk);
+}
+/**
+ * free_percpu - free percpu area
+ * @ptr: pointer to area to free
+ *
+ * Return the percpu area @ptr to its chunk.  A NULL @ptr is ignored.
+ * When the chunk ends up completely free, one other chunk sharing its
+ * slot is destroyed.  Runs under pcpu_mutex.  Might sleep.
+ */
+void free_percpu(void *ptr)
+{
+        void *addr = __pcpu_ptr_to_addr(ptr);
+        struct pcpu_chunk *chunk, *victim;
+        int off;
+        if (!ptr)
+                return;
+        mutex_lock(&pcpu_mutex);
+        chunk = pcpu_chunk_addr_search(addr);
+        off = addr - chunk->vm->addr;
+        pcpu_free_area(chunk, off);
+        if (chunk->free_size != pcpu_unit_size)
+                goto out_unlock;
+        /* the chunk became fully free, kill one if there are other free ones */
+        list_for_each_entry(victim,
+                            &pcpu_slot[pcpu_chunk_slot(chunk)], list) {
+                if (victim == chunk)
+                        continue;
+                pcpu_kill_chunk(victim);
+                break;
+        }
+out_unlock:
+        mutex_unlock(&pcpu_mutex);
+}
+EXPORT_SYMBOL_GPL(free_percpu);
+/**
+ * pcpu_setup_first_chunk - initialize the first percpu chunk
+ * @get_page_fn: callback to fetch page pointer
+ * @static_size: the size of static percpu area in bytes
+ * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto
+ * @free_size: free size in bytes, 0 for auto
+ * @base_addr: mapped address, NULL for auto
+ * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
+ *
+ * Initialize the first percpu chunk which contains the kernel static
+ * percpu area.  This function is to be called from arch percpu area
+ * setup path.  The first two parameters are mandatory.  The rest are
+ * optional.
+ *
+ * @get_page_fn() should return pointer to percpu page given cpu
+ * number and page number.  It should at least return enough pages to
+ * cover the static area.  The returned pages for static area should
+ * have been initialized with valid data.  If @unit_size is specified,
+ * it can also return pages after the static area.  NULL return
+ * indicates end of pages for the cpu.  Note that @get_page_fn() must
+ * return the same number of pages for all cpus.
+ *
+ * @unit_size, if non-zero, determines unit size and must be aligned
+ * to PAGE_SIZE and equal to or larger than @static_size + @free_size.
+ * If zero, the unit size is the larger of PCPU_MIN_UNIT_SIZE and
+ * @static_size rounded up to whole pages.
+ *
+ * @free_size determines the number of free bytes after the static
+ * area in the first chunk.  If zero, whatever is left is available.
+ * Specifying a non-zero value makes percpu leave the area after
+ * @static_size + @free_size alone.  A non-zero @free_size requires a
+ * non-zero @unit_size.
+ *
+ * Non-null @base_addr means that the caller already allocated virtual
+ * region for the first chunk and mapped it.  percpu must not mess
+ * with the chunk.  Note that @base_addr with 0 @unit_size or non-NULL
+ * @populate_pte_fn doesn't make any sense.
+ *
+ * @populate_pte_fn is used to populate the pagetable.  NULL means the
+ * caller already populated the pagetable; in that case this function
+ * does not map the pages either.
+ *
+ * Violating any of the above constraints is a BUG.
+ *
+ * RETURNS:
+ * The determined pcpu_unit_size which can be used to initialize
+ * percpu access.
+ */
+size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
+                                     size_t static_size, size_t unit_size,
+                                     size_t free_size, void *base_addr,
+                                     pcpu_populate_pte_fn_t populate_pte_fn)
+{
+        static struct vm_struct static_vm;
+        struct pcpu_chunk *static_chunk;
+        unsigned int cpu;
+        int nr_pages;
+        int err, i;
+        /* sanity checks */
+        BUG_ON(!static_size);
+        BUG_ON(!unit_size && free_size);
+        BUG_ON(unit_size && unit_size < static_size + free_size);
+        BUG_ON(unit_size & ~PAGE_MASK);
+        BUG_ON(base_addr && !unit_size);
+        BUG_ON(base_addr && populate_pte_fn);
+        if (unit_size)
+                pcpu_unit_pages = unit_size >> PAGE_SHIFT;
+        else
+                pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
+                                        PFN_UP(static_size));
+        pcpu_static_size = static_size;
+        pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
+        pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
+        pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
+                + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
+        /*
+         * Allocate chunk slots.  The additional last slot is for
+         * empty chunks.
+         */
+        pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
+        pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
+        for (i = 0; i < pcpu_nr_slots; i++)
+                INIT_LIST_HEAD(&pcpu_slot[i]);
+        /* init static_chunk */
+        static_chunk = alloc_bootmem(pcpu_chunk_struct_size);
+        INIT_LIST_HEAD(&static_chunk->list);
+        static_chunk->vm = &static_vm;
+        if (free_size)
+                static_chunk->free_size = free_size;
+        else
+                static_chunk->free_size = pcpu_unit_size - pcpu_static_size;
+        static_chunk->contig_hint = static_chunk->free_size;
+        /* allocate vm address, unless the caller supplied one */
+        static_vm.flags = VM_ALLOC;
+        static_vm.size = pcpu_chunk_size;
+        if (!base_addr)
+                vm_area_register_early(&static_vm, PAGE_SIZE);
+        else {
+                /*
+                 * Pages already mapped.  No need to remap into
+                 * vmalloc area.  In this case the static chunk can't
+                 * be mapped or unmapped by percpu and is marked
+                 * immutable.
+                 */
+                static_vm.addr = base_addr;
+                static_chunk->immutable = true;
+        }
+        /*
+         * assign pages: record what @get_page_fn hands out for each cpu
+         * and insist that every cpu gets the same number of pages
+         */
+        nr_pages = -1; /* -1 until the first cpu fixes the expected count */
+        for_each_possible_cpu(cpu) {
+                for (i = 0; i < pcpu_unit_pages; i++) {
+                        struct page *page = get_page_fn(cpu, i);
+                        if (!page)
+                                break;
+                        *pcpu_chunk_pagep(static_chunk, cpu, i) = page;
+                }
+                BUG_ON(i < PFN_UP(pcpu_static_size)); /* static area must be fully backed */
+                if (nr_pages < 0)
+                        nr_pages = i;
+                else
+                        BUG_ON(nr_pages != i);
+        }
+        /*
+         * map them: populate the pagetable for each page and map the
+         * pages; skipped entirely when @populate_pte_fn is NULL
+         */
+        if (populate_pte_fn) {
+                for_each_possible_cpu(cpu)
+                        for (i = 0; i < nr_pages; i++)
+                                populate_pte_fn(pcpu_chunk_addr(static_chunk,
+                                                                cpu, i));
+                err = pcpu_map(static_chunk, 0, nr_pages);
+                if (err)
+                        panic("failed to setup static percpu area, err=%d\n",
+                              err);
+        }
+        /* link static_chunk in */
+        pcpu_chunk_relocate(static_chunk, -1);
+        pcpu_chunk_addr_insert(static_chunk);
+        /* we're done */
+        pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0);
+        return pcpu_unit_size;
+}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 11a929872ebd..af58324c361a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -24,6 +24,7 @@
 #include <linux/radix-tree.h>
 #include <linux/rcupdate.h>
 #include <linux/bootmem.h>
+#include <linux/pfn.h>
 #include <asm/atomic.h>
 #include <asm/uaccess.h>
@@ -152,8 +153,8 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
 *
 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
 */
-static int vmap_page_range(unsigned long start, unsigned long end,
+static int vmap_page_range_noflush(unsigned long start, unsigned long end,
-                                pgprot_t prot, struct page **pages)
+                                   pgprot_t prot, struct page **pages)
 {
        pgd_t *pgd;
        unsigned long next;
@@ -169,13 +170,22 @@ static int vmap_page_range(unsigned long start, unsigned long end,
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);
-        flush_cache_vmap(start, end);
        if (unlikely(err))
                return err;
        return nr;
 }
+/*
+ * Flushing variant of vmap_page_range_noflush(): set up the mapping for
+ * [@start, @end), then flush the cache for that range regardless of the
+ * outcome.  The no-flush helper's result is returned unchanged.
+ */
+static int vmap_page_range(unsigned long start, unsigned long end,
+                           pgprot_t prot, struct page **pages)
+{
+        const int result = vmap_page_range_noflush(start, end, prot, pages);
+        flush_cache_vmap(start, end);
+        return result;
+}
 static inline int is_vmalloc_or_module_addr(const void *x)
 {
        /*
@@ -990,6 +1000,32 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
 }
 EXPORT_SYMBOL(vm_map_ram);
+/**
+ * vm_area_register_early - register vmap area early during boot
+ * @vm: vm_struct to register
+ * @align: requested alignment
+ *
+ * This function is used to register kernel vm area before
+ * vmalloc_init() is called.  @vm->size and @vm->flags should contain
+ * proper values on entry and other fields should be zero.  On return,
+ * vm->addr contains the allocated address.
+ *
+ * Addresses are handed out sequentially starting at VMALLOC_START:
+ * each call places @vm at the next @align-aligned address and moves
+ * the running offset past @vm, rounded up to a page boundary.  @vm is
+ * then pushed onto the head of vmlist.
+ *
+ * NOTE(review): nothing here checks that the area still fits in the
+ * vmalloc address range - confirm callers cannot overrun it.
+ *
+ * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
+ */
+void __init vm_area_register_early(struct vm_struct *vm, size_t align)
+{
+        static size_t vm_init_off __initdata; /* bytes handed out so far */
+        unsigned long addr;
+        addr = ALIGN(VMALLOC_START + vm_init_off, align);
+        vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
+        vm->addr = (void *)addr;
+        vm->next = vmlist;
+        vmlist = vm;
+}
 void __init vmalloc_init(void)
 {
        struct vmap_area *va;
@@ -1017,6 +1053,58 @@ void __init vmalloc_init(void)
        vmap_initialized = true;
 }
+/**
+ * map_kernel_range_noflush - map kernel VM area with the specified pages
+ * @addr: start of the VM area to map
+ * @size: size of the VM area to map
+ * @prot: page protection flags to use
+ * @pages: pages to map
+ *
+ * Map PFN_UP(@size) pages at @addr.  The VM area @addr and @size
+ * specify should have been allocated using get_vm_area() and its
+ * friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing.  The caller is
+ * responsible for calling flush_cache_vmap() on the mapped area
+ * after this function returns, the way vmap_page_range() does.
+ *
+ * RETURNS:
+ * The number of pages mapped on success, -errno on failure.
+ */
+int map_kernel_range_noflush(unsigned long addr, unsigned long size,
+                             pgprot_t prot, struct page **pages)
+{
+        return vmap_page_range_noflush(addr, addr + size, prot, pages);
+}
+/**
+ * unmap_kernel_range_noflush - unmap kernel VM area
+ * @addr: start of the VM area to unmap
+ * @size: size of the VM area to unmap
+ *
+ * Unmap PFN_UP(@size) pages at @addr.  The VM area @addr and @size
+ * specify should have been allocated using get_vm_area() and its
+ * friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache or TLB flushing.  The caller is
+ * responsible for calling flush_cache_vunmap() on to-be-unmapped areas
+ * before calling this function and flush_tlb_kernel_range() after.
+ */
+void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
+{
+        vunmap_page_range(addr, addr + size);
+}
+/**
+ * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
+ * @addr: start of the VM area to unmap
+ * @size: size of the VM area to unmap
+ *
+ * Similar to unmap_kernel_range_noflush() but flushes vcache before
+ * the unmapping and tlb after.
+ */
 void unmap_kernel_range(unsigned long addr, unsigned long size)
 {
        unsigned long end = addr + size;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 743f5542d65a..3a3dad801354 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1375,10 +1375,10 @@ EXPORT_SYMBOL_GPL(snmp_fold_field);
 int snmp_mib_init(void *ptr[2], size_t mibsize)
 {
        BUG_ON(ptr == NULL);
-        ptr[0] = __alloc_percpu(mibsize);
+        ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
        if (!ptr[0])
                goto err0;
-        ptr[1] = __alloc_percpu(mibsize);
+        ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
        if (!ptr[1])
                goto err1;
        return 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 97f71153584f..bf895401218f 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3376,7 +3376,7 @@ int __init ip_rt_init(void)
        int rc = 0;
 #ifdef CONFIG_NET_CLS_ROUTE
-        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
+        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
 #endif