author		Ingo Molnar <mingo@elte.hu>	2009-03-06 10:44:14 -0500
committer	Ingo Molnar <mingo@elte.hu>	2009-03-06 10:45:01 -0500
commit		f0ef03985130287c6c84ebe69416cf790e6cc00e (patch)
tree		3ecb04cc4d82e5fc3ae5f1747e6da172ae8cbcb7 /arch/x86/mm
parent		16097439703bcd38e9fe5608c12add6dacb825ea (diff)
parent		31bbed527e7039203920c51c9fb48c27aed0820c (diff)
Merge branch 'x86/core' into tracing/textedit
Conflicts:
	arch/x86/Kconfig
	block/blktrace.c
	kernel/irq/handle.c

Semantic conflict:
	kernel/trace/blktrace.c

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/Makefile          4
-rw-r--r--  arch/x86/mm/extable.c         6
-rw-r--r--  arch/x86/mm/fault.c        1333
-rw-r--r--  arch/x86/mm/highmem_32.c     25
-rw-r--r--  arch/x86/mm/init.c          393
-rw-r--r--  arch/x86/mm/init_32.c       460
-rw-r--r--  arch/x86/mm/init_64.c       381
-rw-r--r--  arch/x86/mm/ioremap.c        16
-rw-r--r--  arch/x86/mm/memtest.c       156
-rw-r--r--  arch/x86/mm/mmap.c            2
-rw-r--r--  arch/x86/mm/numa_32.c        33
-rw-r--r--  arch/x86/mm/numa_64.c       217
-rw-r--r--  arch/x86/mm/pageattr.c        7
-rw-r--r--  arch/x86/mm/pat.c            77
-rw-r--r--  arch/x86/mm/pgtable.c        18
-rw-r--r--  arch/x86/mm/pgtable_32.c     18
-rw-r--r--  arch/x86/mm/srat_64.c         3
-rw-r--r--  arch/x86/mm/tlb.c           295
18 files changed, 2095 insertions, 1349 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index d8cc96a2738f..08537747cb58 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,6 +1,8 @@
-obj-y	:= init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
+obj-y	:= init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
 	   pat.o pgtable.o gup.o
 
+obj-$(CONFIG_SMP)		+= tlb.o
+
 obj-$(CONFIG_X86_32)		+= pgtable_32.o iomap_32.o
 
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 7e8db53528a7..61b41ca3b5a2 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -23,6 +23,12 @@ int fixup_exception(struct pt_regs *regs)
 
 	fixup = search_exception_tables(regs->ip);
 	if (fixup) {
+		/* If fixup is less than 16, it means uaccess error */
+		if (fixup->fixup < 16) {
+			current_thread_info()->uaccess_err = -EFAULT;
+			regs->ip += fixup->fixup;
+			return 1;
+		}
 		regs->ip = fixup->fixup;
 		return 1;
 	}
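
The extable.c hunk above teaches fixup_exception() a second convention: fixup targets below 16 are treated as small skip offsets that flag a user-access fault via uaccess_err, instead of as absolute jump targets. The program below is a user-space toy of that dispatch; struct extable_entry, struct cpu_state and toy_fixup_exception() are invented names, and only the branch structure mirrors the patch.

#include <stdio.h>

#define EFAULT 14

/* Toy model: one exception-table entry per faulting "instruction". */
struct extable_entry {
	unsigned long insn;   /* address of the faulting instruction */
	unsigned long fixup;  /* < 16: skip length + uaccess error, else: jump target */
};

struct cpu_state {
	unsigned long ip;
	int uaccess_err;      /* stands in for current_thread_info()->uaccess_err */
};

/* Mirrors the branch added to fixup_exception(): small fixups mean
 * "record -EFAULT and step over the access", large ones mean
 * "jump to the recovery code". */
static int toy_fixup_exception(struct cpu_state *cpu, const struct extable_entry *e)
{
	if (e->fixup < 16) {
		cpu->uaccess_err = -EFAULT;
		cpu->ip += e->fixup;   /* skip past the faulting access */
		return 1;
	}
	cpu->ip = e->fixup;            /* jump to the fixup landing pad */
	return 1;
}

int main(void)
{
	struct extable_entry skip = { 0x1000, 3 };        /* uaccess-style entry */
	struct extable_entry jump = { 0x2000, 0x3000 };   /* classic entry */
	struct cpu_state cpu = { 0x1000, 0 };

	toy_fixup_exception(&cpu, &skip);
	printf("after skip entry: ip=%#lx uaccess_err=%d\n", cpu.ip, cpu.uaccess_err);

	cpu.ip = 0x2000;
	toy_fixup_exception(&cpu, &jump);
	printf("after jump entry: ip=%#lx\n", cpu.ip);
	return 0;
}
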
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c76ef1d701c9..a03b7279efa0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1,73 +1,79 @@
1/* 1/*
2 * Copyright (C) 1995 Linus Torvalds 2 * Copyright (C) 1995 Linus Torvalds
3 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. 3 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
4 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
4 */ 5 */
5
6#include <linux/signal.h>
7#include <linux/sched.h>
8#include <linux/kernel.h>
9#include <linux/errno.h>
10#include <linux/string.h>
11#include <linux/types.h>
12#include <linux/ptrace.h>
13#include <linux/mmiotrace.h>
14#include <linux/mman.h>
15#include <linux/mm.h>
16#include <linux/smp.h>
17#include <linux/interrupt.h> 6#include <linux/interrupt.h>
18#include <linux/init.h> 7#include <linux/mmiotrace.h>
19#include <linux/tty.h> 8#include <linux/bootmem.h>
20#include <linux/vt_kern.h> /* For unblank_screen() */
21#include <linux/compiler.h> 9#include <linux/compiler.h>
22#include <linux/highmem.h> 10#include <linux/highmem.h>
23#include <linux/bootmem.h> /* for max_low_pfn */
24#include <linux/vmalloc.h>
25#include <linux/module.h>
26#include <linux/kprobes.h> 11#include <linux/kprobes.h>
27#include <linux/uaccess.h> 12#include <linux/uaccess.h>
13#include <linux/vmalloc.h>
14#include <linux/vt_kern.h>
15#include <linux/signal.h>
16#include <linux/kernel.h>
17#include <linux/ptrace.h>
18#include <linux/string.h>
19#include <linux/module.h>
28#include <linux/kdebug.h> 20#include <linux/kdebug.h>
21#include <linux/errno.h>
22#include <linux/magic.h>
23#include <linux/sched.h>
24#include <linux/types.h>
25#include <linux/init.h>
26#include <linux/mman.h>
27#include <linux/tty.h>
28#include <linux/smp.h>
29#include <linux/mm.h>
30
31#include <asm-generic/sections.h>
29 32
30#include <asm/system.h>
31#include <asm/desc.h>
32#include <asm/segment.h>
33#include <asm/pgalloc.h>
34#include <asm/smp.h>
35#include <asm/tlbflush.h> 33#include <asm/tlbflush.h>
34#include <asm/pgalloc.h>
35#include <asm/segment.h>
36#include <asm/system.h>
36#include <asm/proto.h> 37#include <asm/proto.h>
37#include <asm-generic/sections.h>
38#include <asm/traps.h> 38#include <asm/traps.h>
39#include <asm/desc.h>
39 40
40/* 41/*
41 * Page fault error code bits 42 * Page fault error code bits:
42 * bit 0 == 0 means no page found, 1 means protection fault 43 *
43 * bit 1 == 0 means read, 1 means write 44 * bit 0 == 0: no page found 1: protection fault
44 * bit 2 == 0 means kernel, 1 means user-mode 45 * bit 1 == 0: read access 1: write access
45 * bit 3 == 1 means use of reserved bit detected 46 * bit 2 == 0: kernel-mode access 1: user-mode access
46 * bit 4 == 1 means fault was an instruction fetch 47 * bit 3 == 1: use of reserved bit detected
48 * bit 4 == 1: fault was an instruction fetch
47 */ 49 */
48#define PF_PROT (1<<0) 50enum x86_pf_error_code {
49#define PF_WRITE (1<<1) 51
50#define PF_USER (1<<2) 52 PF_PROT = 1 << 0,
51#define PF_RSVD (1<<3) 53 PF_WRITE = 1 << 1,
52#define PF_INSTR (1<<4) 54 PF_USER = 1 << 2,
55 PF_RSVD = 1 << 3,
56 PF_INSTR = 1 << 4,
57};
53 58
59/*
60 * Returns 0 if mmiotrace is disabled, or if the fault is not
61 * handled by mmiotrace:
62 */
54static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) 63static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
55{ 64{
56#ifdef CONFIG_MMIOTRACE
57 if (unlikely(is_kmmio_active())) 65 if (unlikely(is_kmmio_active()))
58 if (kmmio_handler(regs, addr) == 1) 66 if (kmmio_handler(regs, addr) == 1)
59 return -1; 67 return -1;
60#endif
61 return 0; 68 return 0;
62} 69}
63 70
64static inline int notify_page_fault(struct pt_regs *regs) 71static inline int notify_page_fault(struct pt_regs *regs)
65{ 72{
66#ifdef CONFIG_KPROBES
67 int ret = 0; 73 int ret = 0;
68 74
69 /* kprobe_running() needs smp_processor_id() */ 75 /* kprobe_running() needs smp_processor_id() */
70 if (!user_mode_vm(regs)) { 76 if (kprobes_built_in() && !user_mode_vm(regs)) {
71 preempt_disable(); 77 preempt_disable();
72 if (kprobe_running() && kprobe_fault_handler(regs, 14)) 78 if (kprobe_running() && kprobe_fault_handler(regs, 14))
73 ret = 1; 79 ret = 1;
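
The first hunk replaces the PF_* macros with enum x86_pf_error_code and documents the hardware error-code layout in the comment above it. For reference, the small standalone C program below decodes those bits the same way the handler tests them; it is only an illustration built on the bit definitions shown in the hunk, not kernel code, and the sample values in main() are made up.

#include <stdio.h>

/* Same bit layout as enum x86_pf_error_code in the patch. */
enum {
	PF_PROT  = 1 << 0,  /* 0: page not present, 1: protection fault */
	PF_WRITE = 1 << 1,  /* 0: read access,      1: write access     */
	PF_USER  = 1 << 2,  /* 0: kernel mode,      1: user mode        */
	PF_RSVD  = 1 << 3,  /* 1: reserved bit set in a paging entry    */
	PF_INSTR = 1 << 4,  /* 1: instruction fetch                     */
};

static void decode_pf_error_code(unsigned long error_code)
{
	printf("%#lx: %s, %s, %s%s%s\n",
	       error_code,
	       (error_code & PF_PROT)  ? "protection fault" : "page not present",
	       (error_code & PF_WRITE) ? "write"            : "read",
	       (error_code & PF_USER)  ? "user mode"        : "kernel mode",
	       (error_code & PF_RSVD)  ? ", reserved bit"   : "",
	       (error_code & PF_INSTR) ? ", instruction fetch" : "");
}

int main(void)
{
	decode_pf_error_code(0x6);   /* user-mode write to a not-present page */
	decode_pf_error_code(0x11);  /* kernel instruction fetch hitting a protection fault */
	return 0;
}
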
@@ -75,29 +81,76 @@ static inline int notify_page_fault(struct pt_regs *regs)
75 } 81 }
76 82
77 return ret; 83 return ret;
78#else
79 return 0;
80#endif
81} 84}
82 85
83/* 86/*
84 * X86_32 87 * Prefetch quirks:
85 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. 88 *
86 * Check that here and ignore it. 89 * 32-bit mode:
90 *
91 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
92 * Check that here and ignore it.
93 *
94 * 64-bit mode:
87 * 95 *
88 * X86_64 96 * Sometimes the CPU reports invalid exceptions on prefetch.
89 * Sometimes the CPU reports invalid exceptions on prefetch. 97 * Check that here and ignore it.
90 * Check that here and ignore it.
91 * 98 *
92 * Opcode checker based on code by Richard Brunner 99 * Opcode checker based on code by Richard Brunner.
93 */ 100 */
94static int is_prefetch(struct pt_regs *regs, unsigned long addr, 101static inline int
95 unsigned long error_code) 102check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
103 unsigned char opcode, int *prefetch)
96{ 104{
105 unsigned char instr_hi = opcode & 0xf0;
106 unsigned char instr_lo = opcode & 0x0f;
107
108 switch (instr_hi) {
109 case 0x20:
110 case 0x30:
111 /*
112 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
113 * In X86_64 long mode, the CPU will signal invalid
114 * opcode if some of these prefixes are present so
115 * X86_64 will never get here anyway
116 */
117 return ((instr_lo & 7) == 0x6);
118#ifdef CONFIG_X86_64
119 case 0x40:
120 /*
121 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
122 * Need to figure out under what instruction mode the
123 * instruction was issued. Could check the LDT for lm,
124 * but for now it's good enough to assume that long
125 * mode only uses well known segments or kernel.
126 */
127 return (!user_mode(regs)) || (regs->cs == __USER_CS);
128#endif
129 case 0x60:
130 /* 0x64 thru 0x67 are valid prefixes in all modes. */
131 return (instr_lo & 0xC) == 0x4;
132 case 0xF0:
133 /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
134 return !instr_lo || (instr_lo>>1) == 1;
135 case 0x00:
136 /* Prefetch instruction is 0x0F0D or 0x0F18 */
137 if (probe_kernel_address(instr, opcode))
138 return 0;
139
140 *prefetch = (instr_lo == 0xF) &&
141 (opcode == 0x0D || opcode == 0x18);
142 return 0;
143 default:
144 return 0;
145 }
146}
147
148static int
149is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
150{
151 unsigned char *max_instr;
97 unsigned char *instr; 152 unsigned char *instr;
98 int scan_more = 1;
99 int prefetch = 0; 153 int prefetch = 0;
100 unsigned char *max_instr;
101 154
102 /* 155 /*
103 * If it was a exec (instruction fetch) fault on NX page, then 156 * If it was a exec (instruction fetch) fault on NX page, then
@@ -106,106 +159,170 @@ static int is_prefetch(struct pt_regs *regs, unsigned long addr,
106 if (error_code & PF_INSTR) 159 if (error_code & PF_INSTR)
107 return 0; 160 return 0;
108 161
109 instr = (unsigned char *)convert_ip_to_linear(current, regs); 162 instr = (void *)convert_ip_to_linear(current, regs);
110 max_instr = instr + 15; 163 max_instr = instr + 15;
111 164
112 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) 165 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
113 return 0; 166 return 0;
114 167
115 while (scan_more && instr < max_instr) { 168 while (instr < max_instr) {
116 unsigned char opcode; 169 unsigned char opcode;
117 unsigned char instr_hi;
118 unsigned char instr_lo;
119 170
120 if (probe_kernel_address(instr, opcode)) 171 if (probe_kernel_address(instr, opcode))
121 break; 172 break;
122 173
123 instr_hi = opcode & 0xf0;
124 instr_lo = opcode & 0x0f;
125 instr++; 174 instr++;
126 175
127 switch (instr_hi) { 176 if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
128 case 0x20:
129 case 0x30:
130 /*
131 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
132 * In X86_64 long mode, the CPU will signal invalid
133 * opcode if some of these prefixes are present so
134 * X86_64 will never get here anyway
135 */
136 scan_more = ((instr_lo & 7) == 0x6);
137 break; 177 break;
138#ifdef CONFIG_X86_64
139 case 0x40:
140 /*
141 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
142 * Need to figure out under what instruction mode the
143 * instruction was issued. Could check the LDT for lm,
144 * but for now it's good enough to assume that long
145 * mode only uses well known segments or kernel.
146 */
147 scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
148 break;
149#endif
150 case 0x60:
151 /* 0x64 thru 0x67 are valid prefixes in all modes. */
152 scan_more = (instr_lo & 0xC) == 0x4;
153 break;
154 case 0xF0:
155 /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
156 scan_more = !instr_lo || (instr_lo>>1) == 1;
157 break;
158 case 0x00:
159 /* Prefetch instruction is 0x0F0D or 0x0F18 */
160 scan_more = 0;
161
162 if (probe_kernel_address(instr, opcode))
163 break;
164 prefetch = (instr_lo == 0xF) &&
165 (opcode == 0x0D || opcode == 0x18);
166 break;
167 default:
168 scan_more = 0;
169 break;
170 }
171 } 178 }
172 return prefetch; 179 return prefetch;
173} 180}
174 181
175static void force_sig_info_fault(int si_signo, int si_code, 182static void
176 unsigned long address, struct task_struct *tsk) 183force_sig_info_fault(int si_signo, int si_code, unsigned long address,
184 struct task_struct *tsk)
177{ 185{
178 siginfo_t info; 186 siginfo_t info;
179 187
180 info.si_signo = si_signo; 188 info.si_signo = si_signo;
181 info.si_errno = 0; 189 info.si_errno = 0;
182 info.si_code = si_code; 190 info.si_code = si_code;
183 info.si_addr = (void __user *)address; 191 info.si_addr = (void __user *)address;
192
184 force_sig_info(si_signo, &info, tsk); 193 force_sig_info(si_signo, &info, tsk);
185} 194}
186 195
187#ifdef CONFIG_X86_64 196DEFINE_SPINLOCK(pgd_lock);
188static int bad_address(void *p) 197LIST_HEAD(pgd_list);
198
199#ifdef CONFIG_X86_32
200static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
189{ 201{
190 unsigned long dummy; 202 unsigned index = pgd_index(address);
191 return probe_kernel_address((unsigned long *)p, dummy); 203 pgd_t *pgd_k;
204 pud_t *pud, *pud_k;
205 pmd_t *pmd, *pmd_k;
206
207 pgd += index;
208 pgd_k = init_mm.pgd + index;
209
210 if (!pgd_present(*pgd_k))
211 return NULL;
212
213 /*
214 * set_pgd(pgd, *pgd_k); here would be useless on PAE
215 * and redundant with the set_pmd() on non-PAE. As would
216 * set_pud.
217 */
218 pud = pud_offset(pgd, address);
219 pud_k = pud_offset(pgd_k, address);
220 if (!pud_present(*pud_k))
221 return NULL;
222
223 pmd = pmd_offset(pud, address);
224 pmd_k = pmd_offset(pud_k, address);
225 if (!pmd_present(*pmd_k))
226 return NULL;
227
228 if (!pmd_present(*pmd)) {
229 set_pmd(pmd, *pmd_k);
230 arch_flush_lazy_mmu_mode();
231 } else {
232 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
233 }
234
235 return pmd_k;
236}
237
238void vmalloc_sync_all(void)
239{
240 unsigned long address;
241
242 if (SHARED_KERNEL_PMD)
243 return;
244
245 for (address = VMALLOC_START & PMD_MASK;
246 address >= TASK_SIZE && address < FIXADDR_TOP;
247 address += PMD_SIZE) {
248
249 unsigned long flags;
250 struct page *page;
251
252 spin_lock_irqsave(&pgd_lock, flags);
253 list_for_each_entry(page, &pgd_list, lru) {
254 if (!vmalloc_sync_one(page_address(page), address))
255 break;
256 }
257 spin_unlock_irqrestore(&pgd_lock, flags);
258 }
259}
260
261/*
262 * 32-bit:
263 *
264 * Handle a fault on the vmalloc or module mapping area
265 */
266static noinline int vmalloc_fault(unsigned long address)
267{
268 unsigned long pgd_paddr;
269 pmd_t *pmd_k;
270 pte_t *pte_k;
271
272 /* Make sure we are in vmalloc area: */
273 if (!(address >= VMALLOC_START && address < VMALLOC_END))
274 return -1;
275
276 /*
277 * Synchronize this task's top level page-table
278 * with the 'reference' page table.
279 *
280 * Do _not_ use "current" here. We might be inside
281 * an interrupt in the middle of a task switch..
282 */
283 pgd_paddr = read_cr3();
284 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
285 if (!pmd_k)
286 return -1;
287
288 pte_k = pte_offset_kernel(pmd_k, address);
289 if (!pte_present(*pte_k))
290 return -1;
291
292 return 0;
293}
294
295/*
296 * Did it hit the DOS screen memory VA from vm86 mode?
297 */
298static inline void
299check_v8086_mode(struct pt_regs *regs, unsigned long address,
300 struct task_struct *tsk)
301{
302 unsigned long bit;
303
304 if (!v8086_mode(regs))
305 return;
306
307 bit = (address - 0xA0000) >> PAGE_SHIFT;
308 if (bit < 32)
309 tsk->thread.screen_bitmap |= 1 << bit;
192} 310}
193#endif
194 311
195static void dump_pagetable(unsigned long address) 312static void dump_pagetable(unsigned long address)
196{ 313{
197#ifdef CONFIG_X86_32
198 __typeof__(pte_val(__pte(0))) page; 314 __typeof__(pte_val(__pte(0))) page;
199 315
200 page = read_cr3(); 316 page = read_cr3();
201 page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; 317 page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
318
202#ifdef CONFIG_X86_PAE 319#ifdef CONFIG_X86_PAE
203 printk("*pdpt = %016Lx ", page); 320 printk("*pdpt = %016Lx ", page);
204 if ((page >> PAGE_SHIFT) < max_low_pfn 321 if ((page >> PAGE_SHIFT) < max_low_pfn
205 && page & _PAGE_PRESENT) { 322 && page & _PAGE_PRESENT) {
206 page &= PAGE_MASK; 323 page &= PAGE_MASK;
207 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) 324 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
208 & (PTRS_PER_PMD - 1)]; 325 & (PTRS_PER_PMD - 1)];
209 printk(KERN_CONT "*pde = %016Lx ", page); 326 printk(KERN_CONT "*pde = %016Lx ", page);
210 page &= ~_PAGE_NX; 327 page &= ~_PAGE_NX;
211 } 328 }
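
vmalloc_sync_one() and the 32-bit vmalloc_fault() above lazily copy a missing kernel page-directory entry from the init_mm reference tables into the faulting process's tables. The toy program below models only that lazy-copy idea, with flat arrays standing in for top-level page tables; every name in it is invented and it is an analogy for the scheme, not paging code.

#include <stdio.h>

#define TOP_ENTRIES 8

/* "Reference" kernel table (init_mm's in the patch) and one process copy. */
static unsigned long reference_pgd[TOP_ENTRIES] = { 0, 0, 0, 0, 0x111, 0x222, 0, 0 };
static unsigned long process_pgd[TOP_ENTRIES];  /* starts out empty */

/* Mirrors the shape of vmalloc_fault(): if the reference table has no entry,
 * the access is a genuine bug (-1); otherwise copy the missing entry over
 * and report success (0). */
static int toy_vmalloc_fault(int index)
{
	if (!reference_pgd[index])
		return -1;               /* not mapped in the kernel either */

	if (!process_pgd[index])
		process_pgd[index] = reference_pgd[index];  /* lazy sync */

	return 0;
}

int main(void)
{
	printf("fault on idx 4 -> %d (entry now %#lx)\n",
	       toy_vmalloc_fault(4), process_pgd[4]);
	printf("fault on idx 1 -> %d (still unmapped)\n",
	       toy_vmalloc_fault(1));
	return 0;
}
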
@@ -217,19 +334,145 @@ static void dump_pagetable(unsigned long address)
217 * We must not directly access the pte in the highpte 334 * We must not directly access the pte in the highpte
218 * case if the page table is located in highmem. 335 * case if the page table is located in highmem.
219 * And let's rather not kmap-atomic the pte, just in case 336 * And let's rather not kmap-atomic the pte, just in case
220 * it's allocated already. 337 * it's allocated already:
221 */ 338 */
222 if ((page >> PAGE_SHIFT) < max_low_pfn 339 if ((page >> PAGE_SHIFT) < max_low_pfn
223 && (page & _PAGE_PRESENT) 340 && (page & _PAGE_PRESENT)
224 && !(page & _PAGE_PSE)) { 341 && !(page & _PAGE_PSE)) {
342
225 page &= PAGE_MASK; 343 page &= PAGE_MASK;
226 page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) 344 page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
227 & (PTRS_PER_PTE - 1)]; 345 & (PTRS_PER_PTE - 1)];
228 printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); 346 printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
229 } 347 }
230 348
231 printk("\n"); 349 printk("\n");
232#else /* CONFIG_X86_64 */ 350}
351
352#else /* CONFIG_X86_64: */
353
354void vmalloc_sync_all(void)
355{
356 unsigned long address;
357
358 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
359 address += PGDIR_SIZE) {
360
361 const pgd_t *pgd_ref = pgd_offset_k(address);
362 unsigned long flags;
363 struct page *page;
364
365 if (pgd_none(*pgd_ref))
366 continue;
367
368 spin_lock_irqsave(&pgd_lock, flags);
369 list_for_each_entry(page, &pgd_list, lru) {
370 pgd_t *pgd;
371 pgd = (pgd_t *)page_address(page) + pgd_index(address);
372 if (pgd_none(*pgd))
373 set_pgd(pgd, *pgd_ref);
374 else
375 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
376 }
377 spin_unlock_irqrestore(&pgd_lock, flags);
378 }
379}
380
381/*
382 * 64-bit:
383 *
384 * Handle a fault on the vmalloc area
385 *
386 * This assumes no large pages in there.
387 */
388static noinline int vmalloc_fault(unsigned long address)
389{
390 pgd_t *pgd, *pgd_ref;
391 pud_t *pud, *pud_ref;
392 pmd_t *pmd, *pmd_ref;
393 pte_t *pte, *pte_ref;
394
395 /* Make sure we are in vmalloc area: */
396 if (!(address >= VMALLOC_START && address < VMALLOC_END))
397 return -1;
398
399 /*
400 * Copy kernel mappings over when needed. This can also
401 * happen within a race in page table update. In the later
402 * case just flush:
403 */
404 pgd = pgd_offset(current->active_mm, address);
405 pgd_ref = pgd_offset_k(address);
406 if (pgd_none(*pgd_ref))
407 return -1;
408
409 if (pgd_none(*pgd))
410 set_pgd(pgd, *pgd_ref);
411 else
412 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
413
414 /*
415 * Below here mismatches are bugs because these lower tables
416 * are shared:
417 */
418
419 pud = pud_offset(pgd, address);
420 pud_ref = pud_offset(pgd_ref, address);
421 if (pud_none(*pud_ref))
422 return -1;
423
424 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
425 BUG();
426
427 pmd = pmd_offset(pud, address);
428 pmd_ref = pmd_offset(pud_ref, address);
429 if (pmd_none(*pmd_ref))
430 return -1;
431
432 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
433 BUG();
434
435 pte_ref = pte_offset_kernel(pmd_ref, address);
436 if (!pte_present(*pte_ref))
437 return -1;
438
439 pte = pte_offset_kernel(pmd, address);
440
441 /*
442 * Don't use pte_page here, because the mappings can point
443 * outside mem_map, and the NUMA hash lookup cannot handle
444 * that:
445 */
446 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
447 BUG();
448
449 return 0;
450}
451
452static const char errata93_warning[] =
453KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
454KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
455KERN_ERR "******* Please consider a BIOS update.\n"
456KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
457
458/*
459 * No vm86 mode in 64-bit mode:
460 */
461static inline void
462check_v8086_mode(struct pt_regs *regs, unsigned long address,
463 struct task_struct *tsk)
464{
465}
466
467static int bad_address(void *p)
468{
469 unsigned long dummy;
470
471 return probe_kernel_address((unsigned long *)p, dummy);
472}
473
474static void dump_pagetable(unsigned long address)
475{
233 pgd_t *pgd; 476 pgd_t *pgd;
234 pud_t *pud; 477 pud_t *pud;
235 pmd_t *pmd; 478 pmd_t *pmd;
@@ -238,102 +481,77 @@ static void dump_pagetable(unsigned long address)
238 pgd = (pgd_t *)read_cr3(); 481 pgd = (pgd_t *)read_cr3();
239 482
240 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); 483 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
484
241 pgd += pgd_index(address); 485 pgd += pgd_index(address);
242 if (bad_address(pgd)) goto bad; 486 if (bad_address(pgd))
487 goto bad;
488
243 printk("PGD %lx ", pgd_val(*pgd)); 489 printk("PGD %lx ", pgd_val(*pgd));
244 if (!pgd_present(*pgd)) goto ret; 490
491 if (!pgd_present(*pgd))
492 goto out;
245 493
246 pud = pud_offset(pgd, address); 494 pud = pud_offset(pgd, address);
247 if (bad_address(pud)) goto bad; 495 if (bad_address(pud))
496 goto bad;
497
248 printk("PUD %lx ", pud_val(*pud)); 498 printk("PUD %lx ", pud_val(*pud));
249 if (!pud_present(*pud) || pud_large(*pud)) 499 if (!pud_present(*pud) || pud_large(*pud))
250 goto ret; 500 goto out;
251 501
252 pmd = pmd_offset(pud, address); 502 pmd = pmd_offset(pud, address);
253 if (bad_address(pmd)) goto bad; 503 if (bad_address(pmd))
504 goto bad;
505
254 printk("PMD %lx ", pmd_val(*pmd)); 506 printk("PMD %lx ", pmd_val(*pmd));
255 if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; 507 if (!pmd_present(*pmd) || pmd_large(*pmd))
508 goto out;
256 509
257 pte = pte_offset_kernel(pmd, address); 510 pte = pte_offset_kernel(pmd, address);
258 if (bad_address(pte)) goto bad; 511 if (bad_address(pte))
512 goto bad;
513
259 printk("PTE %lx", pte_val(*pte)); 514 printk("PTE %lx", pte_val(*pte));
260ret: 515out:
261 printk("\n"); 516 printk("\n");
262 return; 517 return;
263bad: 518bad:
264 printk("BAD\n"); 519 printk("BAD\n");
265#endif
266}
267
268#ifdef CONFIG_X86_32
269static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
270{
271 unsigned index = pgd_index(address);
272 pgd_t *pgd_k;
273 pud_t *pud, *pud_k;
274 pmd_t *pmd, *pmd_k;
275
276 pgd += index;
277 pgd_k = init_mm.pgd + index;
278
279 if (!pgd_present(*pgd_k))
280 return NULL;
281
282 /*
283 * set_pgd(pgd, *pgd_k); here would be useless on PAE
284 * and redundant with the set_pmd() on non-PAE. As would
285 * set_pud.
286 */
287
288 pud = pud_offset(pgd, address);
289 pud_k = pud_offset(pgd_k, address);
290 if (!pud_present(*pud_k))
291 return NULL;
292
293 pmd = pmd_offset(pud, address);
294 pmd_k = pmd_offset(pud_k, address);
295 if (!pmd_present(*pmd_k))
296 return NULL;
297 if (!pmd_present(*pmd)) {
298 set_pmd(pmd, *pmd_k);
299 arch_flush_lazy_mmu_mode();
300 } else
301 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
302 return pmd_k;
303} 520}
304#endif
305 521
306#ifdef CONFIG_X86_64 522#endif /* CONFIG_X86_64 */
307static const char errata93_warning[] =
308KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
309KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
310KERN_ERR "******* Please consider a BIOS update.\n"
311KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
312#endif
313 523
314/* Workaround for K8 erratum #93 & buggy BIOS. 524/*
315 BIOS SMM functions are required to use a specific workaround 525 * Workaround for K8 erratum #93 & buggy BIOS.
316 to avoid corruption of the 64bit RIP register on C stepping K8. 526 *
317 A lot of BIOS that didn't get tested properly miss this. 527 * BIOS SMM functions are required to use a specific workaround
318 The OS sees this as a page fault with the upper 32bits of RIP cleared. 528 * to avoid corruption of the 64bit RIP register on C stepping K8.
319 Try to work around it here. 529 *
320 Note we only handle faults in kernel here. 530 * A lot of BIOS that didn't get tested properly miss this.
321 Does nothing for X86_32 531 *
532 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
533 * Try to work around it here.
534 *
535 * Note we only handle faults in kernel here.
536 * Does nothing on 32-bit.
322 */ 537 */
323static int is_errata93(struct pt_regs *regs, unsigned long address) 538static int is_errata93(struct pt_regs *regs, unsigned long address)
324{ 539{
325#ifdef CONFIG_X86_64 540#ifdef CONFIG_X86_64
326 static int warned; 541 static int once;
542
327 if (address != regs->ip) 543 if (address != regs->ip)
328 return 0; 544 return 0;
545
329 if ((address >> 32) != 0) 546 if ((address >> 32) != 0)
330 return 0; 547 return 0;
548
331 address |= 0xffffffffUL << 32; 549 address |= 0xffffffffUL << 32;
332 if ((address >= (u64)_stext && address <= (u64)_etext) || 550 if ((address >= (u64)_stext && address <= (u64)_etext) ||
333 (address >= MODULES_VADDR && address <= MODULES_END)) { 551 (address >= MODULES_VADDR && address <= MODULES_END)) {
334 if (!warned) { 552 if (!once) {
335 printk(errata93_warning); 553 printk(errata93_warning);
336 warned = 1; 554 once = 1;
337 } 555 }
338 regs->ip = address; 556 regs->ip = address;
339 return 1; 557 return 1;
@@ -343,16 +561,17 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
343} 561}
344 562
345/* 563/*
346 * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal 564 * Work around K8 erratum #100 K8 in compat mode occasionally jumps
347 * addresses >4GB. We catch this in the page fault handler because these 565 * to illegal addresses >4GB.
348 * addresses are not reachable. Just detect this case and return. Any code 566 *
567 * We catch this in the page fault handler because these addresses
568 * are not reachable. Just detect this case and return. Any code
349 * segment in LDT is compatibility mode. 569 * segment in LDT is compatibility mode.
350 */ 570 */
351static int is_errata100(struct pt_regs *regs, unsigned long address) 571static int is_errata100(struct pt_regs *regs, unsigned long address)
352{ 572{
353#ifdef CONFIG_X86_64 573#ifdef CONFIG_X86_64
354 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && 574 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
355 (address >> 32))
356 return 1; 575 return 1;
357#endif 576#endif
358 return 0; 577 return 0;
@@ -362,8 +581,9 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
362{ 581{
363#ifdef CONFIG_X86_F00F_BUG 582#ifdef CONFIG_X86_F00F_BUG
364 unsigned long nr; 583 unsigned long nr;
584
365 /* 585 /*
366 * Pentium F0 0F C7 C8 bug workaround. 586 * Pentium F0 0F C7 C8 bug workaround:
367 */ 587 */
368 if (boot_cpu_data.f00f_bug) { 588 if (boot_cpu_data.f00f_bug) {
369 nr = (address - idt_descr.address) >> 3; 589 nr = (address - idt_descr.address) >> 3;
@@ -377,62 +597,277 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
377 return 0; 597 return 0;
378} 598}
379 599
380static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, 600static const char nx_warning[] = KERN_CRIT
381 unsigned long address) 601"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
602
603static void
604show_fault_oops(struct pt_regs *regs, unsigned long error_code,
605 unsigned long address)
382{ 606{
383#ifdef CONFIG_X86_32
384 if (!oops_may_print()) 607 if (!oops_may_print())
385 return; 608 return;
386#endif
387 609
388#ifdef CONFIG_X86_PAE
389 if (error_code & PF_INSTR) { 610 if (error_code & PF_INSTR) {
390 unsigned int level; 611 unsigned int level;
612
391 pte_t *pte = lookup_address(address, &level); 613 pte_t *pte = lookup_address(address, &level);
392 614
393 if (pte && pte_present(*pte) && !pte_exec(*pte)) 615 if (pte && pte_present(*pte) && !pte_exec(*pte))
394 printk(KERN_CRIT "kernel tried to execute " 616 printk(nx_warning, current_uid());
395 "NX-protected page - exploit attempt? "
396 "(uid: %d)\n", current_uid());
397 } 617 }
398#endif
399 618
400 printk(KERN_ALERT "BUG: unable to handle kernel "); 619 printk(KERN_ALERT "BUG: unable to handle kernel ");
401 if (address < PAGE_SIZE) 620 if (address < PAGE_SIZE)
402 printk(KERN_CONT "NULL pointer dereference"); 621 printk(KERN_CONT "NULL pointer dereference");
403 else 622 else
404 printk(KERN_CONT "paging request"); 623 printk(KERN_CONT "paging request");
624
405 printk(KERN_CONT " at %p\n", (void *) address); 625 printk(KERN_CONT " at %p\n", (void *) address);
406 printk(KERN_ALERT "IP:"); 626 printk(KERN_ALERT "IP:");
407 printk_address(regs->ip, 1); 627 printk_address(regs->ip, 1);
628
408 dump_pagetable(address); 629 dump_pagetable(address);
409} 630}
410 631
411#ifdef CONFIG_X86_64 632static noinline void
412static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, 633pgtable_bad(struct pt_regs *regs, unsigned long error_code,
413 unsigned long error_code) 634 unsigned long address)
414{ 635{
415 unsigned long flags = oops_begin();
416 int sig = SIGKILL;
417 struct task_struct *tsk; 636 struct task_struct *tsk;
637 unsigned long flags;
638 int sig;
639
640 flags = oops_begin();
641 tsk = current;
642 sig = SIGKILL;
418 643
419 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", 644 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
420 current->comm, address); 645 tsk->comm, address);
421 dump_pagetable(address); 646 dump_pagetable(address);
422 tsk = current; 647
423 tsk->thread.cr2 = address; 648 tsk->thread.cr2 = address;
424 tsk->thread.trap_no = 14; 649 tsk->thread.trap_no = 14;
425 tsk->thread.error_code = error_code; 650 tsk->thread.error_code = error_code;
651
426 if (__die("Bad pagetable", regs, error_code)) 652 if (__die("Bad pagetable", regs, error_code))
427 sig = 0; 653 sig = 0;
654
428 oops_end(flags, regs, sig); 655 oops_end(flags, regs, sig);
429} 656}
430#endif 657
658static noinline void
659no_context(struct pt_regs *regs, unsigned long error_code,
660 unsigned long address)
661{
662 struct task_struct *tsk = current;
663 unsigned long *stackend;
664 unsigned long flags;
665 int sig;
666
667 /* Are we prepared to handle this kernel fault? */
668 if (fixup_exception(regs))
669 return;
670
671 /*
672 * 32-bit:
673 *
674 * Valid to do another page fault here, because if this fault
675 * had been triggered by is_prefetch fixup_exception would have
676 * handled it.
677 *
678 * 64-bit:
679 *
680 * Hall of shame of CPU/BIOS bugs.
681 */
682 if (is_prefetch(regs, error_code, address))
683 return;
684
685 if (is_errata93(regs, address))
686 return;
687
688 /*
689 * Oops. The kernel tried to access some bad page. We'll have to
690 * terminate things with extreme prejudice:
691 */
692 flags = oops_begin();
693
694 show_fault_oops(regs, error_code, address);
695
696 stackend = end_of_stack(tsk);
697 if (*stackend != STACK_END_MAGIC)
698 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
699
700 tsk->thread.cr2 = address;
701 tsk->thread.trap_no = 14;
702 tsk->thread.error_code = error_code;
703
704 sig = SIGKILL;
705 if (__die("Oops", regs, error_code))
706 sig = 0;
707
708 /* Executive summary in case the body of the oops scrolled away */
709 printk(KERN_EMERG "CR2: %016lx\n", address);
710
711 oops_end(flags, regs, sig);
712}
713
714/*
715 * Print out info about fatal segfaults, if the show_unhandled_signals
716 * sysctl is set:
717 */
718static inline void
719show_signal_msg(struct pt_regs *regs, unsigned long error_code,
720 unsigned long address, struct task_struct *tsk)
721{
722 if (!unhandled_signal(tsk, SIGSEGV))
723 return;
724
725 if (!printk_ratelimit())
726 return;
727
728 printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
729 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
730 tsk->comm, task_pid_nr(tsk), address,
731 (void *)regs->ip, (void *)regs->sp, error_code);
732
733 print_vma_addr(KERN_CONT " in ", regs->ip);
734
735 printk(KERN_CONT "\n");
736}
737
738static void
739__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
740 unsigned long address, int si_code)
741{
742 struct task_struct *tsk = current;
743
744 /* User mode accesses just cause a SIGSEGV */
745 if (error_code & PF_USER) {
746 /*
747 * It's possible to have interrupts off here:
748 */
749 local_irq_enable();
750
751 /*
752 * Valid to do another page fault here because this one came
753 * from user space:
754 */
755 if (is_prefetch(regs, error_code, address))
756 return;
757
758 if (is_errata100(regs, address))
759 return;
760
761 if (unlikely(show_unhandled_signals))
762 show_signal_msg(regs, error_code, address, tsk);
763
764 /* Kernel addresses are always protection faults: */
765 tsk->thread.cr2 = address;
766 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
767 tsk->thread.trap_no = 14;
768
769 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
770
771 return;
772 }
773
774 if (is_f00f_bug(regs, address))
775 return;
776
777 no_context(regs, error_code, address);
778}
779
780static noinline void
781bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
782 unsigned long address)
783{
784 __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
785}
786
787static void
788__bad_area(struct pt_regs *regs, unsigned long error_code,
789 unsigned long address, int si_code)
790{
791 struct mm_struct *mm = current->mm;
792
793 /*
794 * Something tried to access memory that isn't in our memory map..
795 * Fix it, but check if it's kernel or user first..
796 */
797 up_read(&mm->mmap_sem);
798
799 __bad_area_nosemaphore(regs, error_code, address, si_code);
800}
801
802static noinline void
803bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
804{
805 __bad_area(regs, error_code, address, SEGV_MAPERR);
806}
807
808static noinline void
809bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
810 unsigned long address)
811{
812 __bad_area(regs, error_code, address, SEGV_ACCERR);
813}
814
815/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
816static void
817out_of_memory(struct pt_regs *regs, unsigned long error_code,
818 unsigned long address)
819{
820 /*
821 * We ran out of memory, call the OOM killer, and return the userspace
822 * (which will retry the fault, or kill us if we got oom-killed):
823 */
824 up_read(&current->mm->mmap_sem);
825
826 pagefault_out_of_memory();
827}
828
829static void
830do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
831{
832 struct task_struct *tsk = current;
833 struct mm_struct *mm = tsk->mm;
834
835 up_read(&mm->mmap_sem);
836
837 /* Kernel mode? Handle exceptions or die: */
838 if (!(error_code & PF_USER))
839 no_context(regs, error_code, address);
840
841 /* User-space => ok to do another page fault: */
842 if (is_prefetch(regs, error_code, address))
843 return;
844
845 tsk->thread.cr2 = address;
846 tsk->thread.error_code = error_code;
847 tsk->thread.trap_no = 14;
848
849 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
850}
851
852static noinline void
853mm_fault_error(struct pt_regs *regs, unsigned long error_code,
854 unsigned long address, unsigned int fault)
855{
856 if (fault & VM_FAULT_OOM) {
857 out_of_memory(regs, error_code, address);
858 } else {
859 if (fault & VM_FAULT_SIGBUS)
860 do_sigbus(regs, error_code, address);
861 else
862 BUG();
863 }
864}
431 865
432static int spurious_fault_check(unsigned long error_code, pte_t *pte) 866static int spurious_fault_check(unsigned long error_code, pte_t *pte)
433{ 867{
434 if ((error_code & PF_WRITE) && !pte_write(*pte)) 868 if ((error_code & PF_WRITE) && !pte_write(*pte))
435 return 0; 869 return 0;
870
436 if ((error_code & PF_INSTR) && !pte_exec(*pte)) 871 if ((error_code & PF_INSTR) && !pte_exec(*pte))
437 return 0; 872 return 0;
438 873
@@ -440,21 +875,25 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
440} 875}
441 876
442/* 877/*
443 * Handle a spurious fault caused by a stale TLB entry. This allows 878 * Handle a spurious fault caused by a stale TLB entry.
444 * us to lazily refresh the TLB when increasing the permissions of a 879 *
445 * kernel page (RO -> RW or NX -> X). Doing it eagerly is very 880 * This allows us to lazily refresh the TLB when increasing the
446 * expensive since that implies doing a full cross-processor TLB 881 * permissions of a kernel page (RO -> RW or NX -> X). Doing it
447 * flush, even if no stale TLB entries exist on other processors. 882 * eagerly is very expensive since that implies doing a full
883 * cross-processor TLB flush, even if no stale TLB entries exist
884 * on other processors.
885 *
448 * There are no security implications to leaving a stale TLB when 886 * There are no security implications to leaving a stale TLB when
449 * increasing the permissions on a page. 887 * increasing the permissions on a page.
450 */ 888 */
451static int spurious_fault(unsigned long address, 889static noinline int
452 unsigned long error_code) 890spurious_fault(unsigned long error_code, unsigned long address)
453{ 891{
454 pgd_t *pgd; 892 pgd_t *pgd;
455 pud_t *pud; 893 pud_t *pud;
456 pmd_t *pmd; 894 pmd_t *pmd;
457 pte_t *pte; 895 pte_t *pte;
896 int ret;
458 897
459 /* Reserved-bit violation or user access to kernel space? */ 898 /* Reserved-bit violation or user access to kernel space? */
460 if (error_code & (PF_USER | PF_RSVD)) 899 if (error_code & (PF_USER | PF_RSVD))
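
spurious_fault() declares a fault spurious only when the page-table entry already grants everything the access needed, meaning the fault came from a stale TLB entry left behind by a lazy permission upgrade (RO to RW, or NX to X). A minimal standalone model of that check follows, with invented flag constants standing in for real PTE bits.

#include <stdio.h>

#define ERR_WRITE  (1 << 1)   /* PF_WRITE */
#define ERR_INSTR  (1 << 4)   /* PF_INSTR */

#define PTE_WRITE  (1 << 0)   /* page is writable   */
#define PTE_EXEC   (1 << 1)   /* page is executable */

/* Fault is "spurious" (stale TLB) only if the page table already allows
 * the access that faulted, as in spurious_fault_check(). */
static int spurious_fault_check(unsigned long error_code, unsigned long pte_flags)
{
	if ((error_code & ERR_WRITE) && !(pte_flags & PTE_WRITE))
		return 0;
	if ((error_code & ERR_INSTR) && !(pte_flags & PTE_EXEC))
		return 0;
	return 1;
}

int main(void)
{
	/* Write fault, page meanwhile upgraded to writable: spurious. */
	printf("%d\n", spurious_fault_check(ERR_WRITE, PTE_WRITE));
	/* Exec fault on a page that is still non-executable: real fault. */
	printf("%d\n", spurious_fault_check(ERR_INSTR, PTE_WRITE));
	return 0;
}
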
@@ -482,127 +921,71 @@ static int spurious_fault(unsigned long address,
482 if (!pte_present(*pte)) 921 if (!pte_present(*pte))
483 return 0; 922 return 0;
484 923
485 return spurious_fault_check(error_code, pte); 924 ret = spurious_fault_check(error_code, pte);
486} 925 if (!ret)
487 926 return 0;
488/*
489 * X86_32
490 * Handle a fault on the vmalloc or module mapping area
491 *
492 * X86_64
493 * Handle a fault on the vmalloc area
494 *
495 * This assumes no large pages in there.
496 */
497static int vmalloc_fault(unsigned long address)
498{
499#ifdef CONFIG_X86_32
500 unsigned long pgd_paddr;
501 pmd_t *pmd_k;
502 pte_t *pte_k;
503
504 /* Make sure we are in vmalloc area */
505 if (!(address >= VMALLOC_START && address < VMALLOC_END))
506 return -1;
507 927
508 /* 928 /*
509 * Synchronize this task's top level page-table 929 * Make sure we have permissions in PMD.
510 * with the 'reference' page table. 930 * If not, then there's a bug in the page tables:
511 *
512 * Do _not_ use "current" here. We might be inside
513 * an interrupt in the middle of a task switch..
514 */ 931 */
515 pgd_paddr = read_cr3(); 932 ret = spurious_fault_check(error_code, (pte_t *) pmd);
516 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); 933 WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
517 if (!pmd_k)
518 return -1;
519 pte_k = pte_offset_kernel(pmd_k, address);
520 if (!pte_present(*pte_k))
521 return -1;
522 return 0;
523#else
524 pgd_t *pgd, *pgd_ref;
525 pud_t *pud, *pud_ref;
526 pmd_t *pmd, *pmd_ref;
527 pte_t *pte, *pte_ref;
528 934
529 /* Make sure we are in vmalloc area */ 935 return ret;
530 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 936}
531 return -1;
532 937
533 /* Copy kernel mappings over when needed. This can also 938int show_unhandled_signals = 1;
534 happen within a race in page table update. In the later
535 case just flush. */
536 939
537 pgd = pgd_offset(current->active_mm, address); 940static inline int
538 pgd_ref = pgd_offset_k(address); 941access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
539 if (pgd_none(*pgd_ref)) 942{
540 return -1; 943 if (write) {
541 if (pgd_none(*pgd)) 944 /* write, present and write, not present: */
542 set_pgd(pgd, *pgd_ref); 945 if (unlikely(!(vma->vm_flags & VM_WRITE)))
543 else 946 return 1;
544 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); 947 return 0;
948 }
545 949
546 /* Below here mismatches are bugs because these lower tables 950 /* read, present: */
547 are shared */ 951 if (unlikely(error_code & PF_PROT))
952 return 1;
953
954 /* read, not present: */
955 if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
956 return 1;
548 957
549 pud = pud_offset(pgd, address);
550 pud_ref = pud_offset(pgd_ref, address);
551 if (pud_none(*pud_ref))
552 return -1;
553 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
554 BUG();
555 pmd = pmd_offset(pud, address);
556 pmd_ref = pmd_offset(pud_ref, address);
557 if (pmd_none(*pmd_ref))
558 return -1;
559 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
560 BUG();
561 pte_ref = pte_offset_kernel(pmd_ref, address);
562 if (!pte_present(*pte_ref))
563 return -1;
564 pte = pte_offset_kernel(pmd, address);
565 /* Don't use pte_page here, because the mappings can point
566 outside mem_map, and the NUMA hash lookup cannot handle
567 that. */
568 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
569 BUG();
570 return 0; 958 return 0;
571#endif
572} 959}
573 960
574int show_unhandled_signals = 1; 961static int fault_in_kernel_space(unsigned long address)
962{
963 return address >= TASK_SIZE_MAX;
964}
575 965
576/* 966/*
577 * This routine handles page faults. It determines the address, 967 * This routine handles page faults. It determines the address,
578 * and the problem, and then passes it off to one of the appropriate 968 * and the problem, and then passes it off to one of the appropriate
579 * routines. 969 * routines.
580 */ 970 */
581#ifdef CONFIG_X86_64 971dotraplinkage void __kprobes
582asmlinkage 972do_page_fault(struct pt_regs *regs, unsigned long error_code)
583#endif
584void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
585{ 973{
586 struct task_struct *tsk;
587 struct mm_struct *mm;
588 struct vm_area_struct *vma; 974 struct vm_area_struct *vma;
975 struct task_struct *tsk;
589 unsigned long address; 976 unsigned long address;
590 int write, si_code; 977 struct mm_struct *mm;
978 int write;
591 int fault; 979 int fault;
592#ifdef CONFIG_X86_64
593 unsigned long flags;
594 int sig;
595#endif
596 980
597 tsk = current; 981 tsk = current;
598 mm = tsk->mm; 982 mm = tsk->mm;
983
599 prefetchw(&mm->mmap_sem); 984 prefetchw(&mm->mmap_sem);
600 985
601 /* get the address */ 986 /* Get the faulting address: */
602 address = read_cr2(); 987 address = read_cr2();
603 988
604 si_code = SEGV_MAPERR;
605
606 if (unlikely(kmmio_fault(regs, address))) 989 if (unlikely(kmmio_fault(regs, address)))
607 return; 990 return;
608 991
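
access_error(), added in the hunk above, replaces the old switch on (PF_PROT|PF_WRITE) with explicit checks against the VMA permissions. The snippet below replays that decision table in isolation so the read/write and present/not-present cases are easy to see; the VM_* values are stand-ins, not the kernel's definitions, and deriving "write" from the error code is a simplification of the patch's separate parameter.

#include <stdio.h>

#define PF_PROT   (1 << 0)
#define PF_WRITE  (1 << 1)

#define VM_READ   (1 << 0)
#define VM_WRITE  (1 << 1)
#define VM_EXEC   (1 << 2)

/* Same logic as access_error() in the patch: returns 1 if the faulting
 * access is not allowed by the mapping's permissions. */
static int access_error(unsigned long error_code, unsigned long vm_flags)
{
	if (error_code & PF_WRITE) {
		/* write, present and write, not present */
		return !(vm_flags & VM_WRITE);
	}

	/* read, present: the page is there, so this is a permission problem */
	if (error_code & PF_PROT)
		return 1;

	/* read, not present: fine as long as the VMA is accessible at all */
	return !(vm_flags & (VM_READ | VM_EXEC | VM_WRITE));
}

int main(void)
{
	printf("write to read-only vma:   %d\n", access_error(PF_WRITE, VM_READ));
	printf("write to writable vma:    %d\n", access_error(PF_WRITE, VM_READ | VM_WRITE));
	printf("read, present:            %d\n", access_error(PF_PROT, VM_READ));
	printf("read, not present, r vma: %d\n", access_error(0, VM_READ));
	return 0;
}
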
@@ -619,319 +1002,147 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
619 * (error_code & 4) == 0, and that the fault was not a 1002 * (error_code & 4) == 0, and that the fault was not a
620 * protection error (error_code & 9) == 0. 1003 * protection error (error_code & 9) == 0.
621 */ 1004 */
622#ifdef CONFIG_X86_32 1005 if (unlikely(fault_in_kernel_space(address))) {
623 if (unlikely(address >= TASK_SIZE)) {
624#else
625 if (unlikely(address >= TASK_SIZE64)) {
626#endif
627 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && 1006 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
628 vmalloc_fault(address) >= 0) 1007 vmalloc_fault(address) >= 0)
629 return; 1008 return;
630 1009
631 /* Can handle a stale RO->RW TLB */ 1010 /* Can handle a stale RO->RW TLB: */
632 if (spurious_fault(address, error_code)) 1011 if (spurious_fault(error_code, address))
633 return; 1012 return;
634 1013
635 /* kprobes don't want to hook the spurious faults. */ 1014 /* kprobes don't want to hook the spurious faults: */
636 if (notify_page_fault(regs)) 1015 if (notify_page_fault(regs))
637 return; 1016 return;
638 /* 1017 /*
639 * Don't take the mm semaphore here. If we fixup a prefetch 1018 * Don't take the mm semaphore here. If we fixup a prefetch
640 * fault we could otherwise deadlock. 1019 * fault we could otherwise deadlock:
641 */ 1020 */
642 goto bad_area_nosemaphore; 1021 bad_area_nosemaphore(regs, error_code, address);
643 }
644 1022
645 /* kprobes don't want to hook the spurious faults. */
646 if (notify_page_fault(regs))
647 return; 1023 return;
1024 }
648 1025
1026 /* kprobes don't want to hook the spurious faults: */
1027 if (unlikely(notify_page_fault(regs)))
1028 return;
649 /* 1029 /*
650 * It's safe to allow irq's after cr2 has been saved and the 1030 * It's safe to allow irq's after cr2 has been saved and the
651 * vmalloc fault has been handled. 1031 * vmalloc fault has been handled.
652 * 1032 *
653 * User-mode registers count as a user access even for any 1033 * User-mode registers count as a user access even for any
654 * potential system fault or CPU buglet. 1034 * potential system fault or CPU buglet:
655 */ 1035 */
656 if (user_mode_vm(regs)) { 1036 if (user_mode_vm(regs)) {
657 local_irq_enable(); 1037 local_irq_enable();
658 error_code |= PF_USER; 1038 error_code |= PF_USER;
659 } else if (regs->flags & X86_EFLAGS_IF) 1039 } else {
660 local_irq_enable(); 1040 if (regs->flags & X86_EFLAGS_IF)
1041 local_irq_enable();
1042 }
661 1043
662#ifdef CONFIG_X86_64
663 if (unlikely(error_code & PF_RSVD)) 1044 if (unlikely(error_code & PF_RSVD))
664 pgtable_bad(address, regs, error_code); 1045 pgtable_bad(regs, error_code, address);
665#endif
666 1046
667 /* 1047 /*
668 * If we're in an interrupt, have no user context or are running in an 1048 * If we're in an interrupt, have no user context or are running
669 * atomic region then we must not take the fault. 1049 * in an atomic region then we must not take the fault:
670 */ 1050 */
671 if (unlikely(in_atomic() || !mm)) 1051 if (unlikely(in_atomic() || !mm)) {
672 goto bad_area_nosemaphore; 1052 bad_area_nosemaphore(regs, error_code, address);
1053 return;
1054 }
673 1055
674 /* 1056 /*
675 * When running in the kernel we expect faults to occur only to 1057 * When running in the kernel we expect faults to occur only to
676 * addresses in user space. All other faults represent errors in the 1058 * addresses in user space. All other faults represent errors in
677 * kernel and should generate an OOPS. Unfortunately, in the case of an 1059 * the kernel and should generate an OOPS. Unfortunately, in the
678 * erroneous fault occurring in a code path which already holds mmap_sem 1060 * case of an erroneous fault occurring in a code path which already
679 * we will deadlock attempting to validate the fault against the 1061 * holds mmap_sem we will deadlock attempting to validate the fault
680 * address space. Luckily the kernel only validly references user 1062 * against the address space. Luckily the kernel only validly
681 * space from well defined areas of code, which are listed in the 1063 * references user space from well defined areas of code, which are
682 * exceptions table. 1064 * listed in the exceptions table.
683 * 1065 *
684 * As the vast majority of faults will be valid we will only perform 1066 * As the vast majority of faults will be valid we will only perform
685 * the source reference check when there is a possibility of a deadlock. 1067 * the source reference check when there is a possibility of a
686 * Attempt to lock the address space, if we cannot we then validate the 1068 * deadlock. Attempt to lock the address space, if we cannot we then
687 * source. If this is invalid we can skip the address space check, 1069 * validate the source. If this is invalid we can skip the address
688 * thus avoiding the deadlock. 1070 * space check, thus avoiding the deadlock:
689 */ 1071 */
690 if (!down_read_trylock(&mm->mmap_sem)) { 1072 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
691 if ((error_code & PF_USER) == 0 && 1073 if ((error_code & PF_USER) == 0 &&
692 !search_exception_tables(regs->ip)) 1074 !search_exception_tables(regs->ip)) {
693 goto bad_area_nosemaphore; 1075 bad_area_nosemaphore(regs, error_code, address);
1076 return;
1077 }
694 down_read(&mm->mmap_sem); 1078 down_read(&mm->mmap_sem);
1079 } else {
1080 /*
1081 * The above down_read_trylock() might have succeeded in
1082 * which case we'll have missed the might_sleep() from
1083 * down_read():
1084 */
1085 might_sleep();
695 } 1086 }
696 1087
697 vma = find_vma(mm, address); 1088 vma = find_vma(mm, address);
698 if (!vma) 1089 if (unlikely(!vma)) {
699 goto bad_area; 1090 bad_area(regs, error_code, address);
700 if (vma->vm_start <= address) 1091 return;
1092 }
1093 if (likely(vma->vm_start <= address))
701 goto good_area; 1094 goto good_area;
702 if (!(vma->vm_flags & VM_GROWSDOWN)) 1095 if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
703 goto bad_area; 1096 bad_area(regs, error_code, address);
1097 return;
1098 }
704 if (error_code & PF_USER) { 1099 if (error_code & PF_USER) {
705 /* 1100 /*
706 * Accessing the stack below %sp is always a bug. 1101 * Accessing the stack below %sp is always a bug.
707 * The large cushion allows instructions like enter 1102 * The large cushion allows instructions like enter
708 * and pusha to work. ("enter $65535,$31" pushes 1103 * and pusha to work. ("enter $65535, $31" pushes
709 * 32 pointers and then decrements %sp by 65535.) 1104 * 32 pointers and then decrements %sp by 65535.)
710 */ 1105 */
711 if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) 1106 if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
712 goto bad_area; 1107 bad_area(regs, error_code, address);
1108 return;
1109 }
713 } 1110 }
714 if (expand_stack(vma, address)) 1111 if (unlikely(expand_stack(vma, address))) {
715 goto bad_area; 1112 bad_area(regs, error_code, address);
716/* 1113 return;
717 * Ok, we have a good vm_area for this memory access, so 1114 }
718 * we can handle it.. 1115
719 */ 1116 /*
1117 * Ok, we have a good vm_area for this memory access, so
1118 * we can handle it..
1119 */
720good_area: 1120good_area:
721 si_code = SEGV_ACCERR; 1121 write = error_code & PF_WRITE;
722 write = 0; 1122
723 switch (error_code & (PF_PROT|PF_WRITE)) { 1123 if (unlikely(access_error(error_code, write, vma))) {
724 default: /* 3: write, present */ 1124 bad_area_access_error(regs, error_code, address);
725 /* fall through */ 1125 return;
726 case PF_WRITE: /* write, not present */
727 if (!(vma->vm_flags & VM_WRITE))
728 goto bad_area;
729 write++;
730 break;
731 case PF_PROT: /* read, present */
732 goto bad_area;
733 case 0: /* read, not present */
734 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
735 goto bad_area;
736 } 1126 }
737 1127
738 /* 1128 /*
739 * If for any reason at all we couldn't handle the fault, 1129 * If for any reason at all we couldn't handle the fault,
740 * make sure we exit gracefully rather than endlessly redo 1130 * make sure we exit gracefully rather than endlessly redo
741 * the fault. 1131 * the fault:
742 */ 1132 */
743 fault = handle_mm_fault(mm, vma, address, write); 1133 fault = handle_mm_fault(mm, vma, address, write);
1134
744 if (unlikely(fault & VM_FAULT_ERROR)) { 1135 if (unlikely(fault & VM_FAULT_ERROR)) {
745 if (fault & VM_FAULT_OOM) 1136 mm_fault_error(regs, error_code, address, fault);
746 goto out_of_memory; 1137 return;
747 else if (fault & VM_FAULT_SIGBUS)
748 goto do_sigbus;
749 BUG();
750 } 1138 }
1139
751 if (fault & VM_FAULT_MAJOR) 1140 if (fault & VM_FAULT_MAJOR)
752 tsk->maj_flt++; 1141 tsk->maj_flt++;
753 else 1142 else
754 tsk->min_flt++; 1143 tsk->min_flt++;
755 1144
756#ifdef CONFIG_X86_32 1145 check_v8086_mode(regs, address, tsk);
757 /*
758 * Did it hit the DOS screen memory VA from vm86 mode?
759 */
760 if (v8086_mode(regs)) {
761 unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
762 if (bit < 32)
763 tsk->thread.screen_bitmap |= 1 << bit;
764 }
765#endif
766 up_read(&mm->mmap_sem);
767 return;
768 1146
769/*
770 * Something tried to access memory that isn't in our memory map..
771 * Fix it, but check if it's kernel or user first..
772 */
773bad_area:
774 up_read(&mm->mmap_sem); 1147 up_read(&mm->mmap_sem);
775
776bad_area_nosemaphore:
777 /* User mode accesses just cause a SIGSEGV */
778 if (error_code & PF_USER) {
779 /*
780 * It's possible to have interrupts off here.
781 */
782 local_irq_enable();
783
784 /*
785 * Valid to do another page fault here because this one came
786 * from user space.
787 */
788 if (is_prefetch(regs, address, error_code))
789 return;
790
791 if (is_errata100(regs, address))
792 return;
793
794 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
795 printk_ratelimit()) {
796 printk(
797 "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
798 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
799 tsk->comm, task_pid_nr(tsk), address,
800 (void *) regs->ip, (void *) regs->sp, error_code);
801 print_vma_addr(" in ", regs->ip);
802 printk("\n");
803 }
804
805 tsk->thread.cr2 = address;
806 /* Kernel addresses are always protection faults */
807 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
808 tsk->thread.trap_no = 14;
809 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
810 return;
811 }
812
813 if (is_f00f_bug(regs, address))
814 return;
815
816no_context:
817 /* Are we prepared to handle this kernel fault? */
818 if (fixup_exception(regs))
819 return;
820
821 /*
822 * X86_32
823 * Valid to do another page fault here, because if this fault
824 * had been triggered by is_prefetch fixup_exception would have
825 * handled it.
826 *
827 * X86_64
828 * Hall of shame of CPU/BIOS bugs.
829 */
830 if (is_prefetch(regs, address, error_code))
831 return;
832
833 if (is_errata93(regs, address))
834 return;
835
836/*
837 * Oops. The kernel tried to access some bad page. We'll have to
838 * terminate things with extreme prejudice.
839 */
840#ifdef CONFIG_X86_32
841 bust_spinlocks(1);
842#else
843 flags = oops_begin();
844#endif
845
846 show_fault_oops(regs, error_code, address);
847
848 tsk->thread.cr2 = address;
849 tsk->thread.trap_no = 14;
850 tsk->thread.error_code = error_code;
851
852#ifdef CONFIG_X86_32
853 die("Oops", regs, error_code);
854 bust_spinlocks(0);
855 do_exit(SIGKILL);
856#else
857 sig = SIGKILL;
858 if (__die("Oops", regs, error_code))
859 sig = 0;
860 /* Executive summary in case the body of the oops scrolled away */
861 printk(KERN_EMERG "CR2: %016lx\n", address);
862 oops_end(flags, regs, sig);
863#endif
864
865out_of_memory:
866 /*
867 * We ran out of memory, call the OOM killer, and return the userspace
868 * (which will retry the fault, or kill us if we got oom-killed).
869 */
870 up_read(&mm->mmap_sem);
871 pagefault_out_of_memory();
872 return;
873
874do_sigbus:
875 up_read(&mm->mmap_sem);
876
877 /* Kernel mode? Handle exceptions or die */
878 if (!(error_code & PF_USER))
879 goto no_context;
880#ifdef CONFIG_X86_32
881 /* User space => ok to do another page fault */
882 if (is_prefetch(regs, address, error_code))
883 return;
884#endif
885 tsk->thread.cr2 = address;
886 tsk->thread.error_code = error_code;
887 tsk->thread.trap_no = 14;
888 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
889}
890
891DEFINE_SPINLOCK(pgd_lock);
892LIST_HEAD(pgd_list);
893
894void vmalloc_sync_all(void)
895{
896 unsigned long address;
897
898#ifdef CONFIG_X86_32
899 if (SHARED_KERNEL_PMD)
900 return;
901
902 for (address = VMALLOC_START & PMD_MASK;
903 address >= TASK_SIZE && address < FIXADDR_TOP;
904 address += PMD_SIZE) {
905 unsigned long flags;
906 struct page *page;
907
908 spin_lock_irqsave(&pgd_lock, flags);
909 list_for_each_entry(page, &pgd_list, lru) {
910 if (!vmalloc_sync_one(page_address(page),
911 address))
912 break;
913 }
914 spin_unlock_irqrestore(&pgd_lock, flags);
915 }
916#else /* CONFIG_X86_64 */
917 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
918 address += PGDIR_SIZE) {
919 const pgd_t *pgd_ref = pgd_offset_k(address);
920 unsigned long flags;
921 struct page *page;
922
923 if (pgd_none(*pgd_ref))
924 continue;
925 spin_lock_irqsave(&pgd_lock, flags);
926 list_for_each_entry(page, &pgd_list, lru) {
927 pgd_t *pgd;
928 pgd = (pgd_t *)page_address(page) + pgd_index(address);
929 if (pgd_none(*pgd))
930 set_pgd(pgd, *pgd_ref);
931 else
932 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
933 }
934 spin_unlock_irqrestore(&pgd_lock, flags);
935 }
936#endif
937} 1148}
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index bcc079c282dd..d11745334a67 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -1,5 +1,6 @@
 #include <linux/highmem.h>
 #include <linux/module.h>
+#include <linux/swap.h> /* for totalram_pages */
 
 void *kmap(struct page *page)
 {
@@ -156,3 +157,27 @@ EXPORT_SYMBOL(kmap);
156EXPORT_SYMBOL(kunmap); 157EXPORT_SYMBOL(kunmap);
157EXPORT_SYMBOL(kmap_atomic); 158EXPORT_SYMBOL(kmap_atomic);
158EXPORT_SYMBOL(kunmap_atomic); 159EXPORT_SYMBOL(kunmap_atomic);
160
161void __init set_highmem_pages_init(void)
162{
163 struct zone *zone;
164 int nid;
165
166 for_each_zone(zone) {
167 unsigned long zone_start_pfn, zone_end_pfn;
168
169 if (!is_highmem(zone))
170 continue;
171
172 zone_start_pfn = zone->zone_start_pfn;
173 zone_end_pfn = zone_start_pfn + zone->spanned_pages;
174
175 nid = zone_to_nid(zone);
176 printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
177 zone->name, nid, zone_start_pfn, zone_end_pfn);
178
179 add_highpages_with_active_regions(nid, zone_start_pfn,
180 zone_end_pfn);
181 }
182 totalram_pages += totalhigh_pages;
183}
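
The new set_highmem_pages_init() above iterates over all zones, skips anything that is not highmem, and derives the end pfn as zone_start_pfn + spanned_pages before handing the range to add_highpages_with_active_regions(). A small stand-alone sketch of that walk, using a made-up zone layout (the struct and numbers below are illustrative, not the kernel's struct zone):

#include <stdio.h>
#include <stdbool.h>

struct zone {
	const char *name;
	unsigned long start_pfn;
	unsigned long spanned_pages;
	bool highmem;
};

int main(void)
{
	/* Hypothetical layout: ~896 MB of lowmem, the rest highmem (4 KB pages). */
	struct zone zones[] = {
		{ "DMA",     0x00000, 0x01000, false },
		{ "Normal",  0x01000, 0x37000, false },
		{ "HighMem", 0x38000, 0x48000, true  },
	};

	for (unsigned i = 0; i < sizeof(zones) / sizeof(zones[0]); i++) {
		const struct zone *z = &zones[i];
		unsigned long end_pfn = z->start_pfn + z->spanned_pages;

		if (!z->highmem)
			continue;              /* only highmem zones are added here */

		printf("Initializing %s (%08lx:%08lx)\n",
		       z->name, z->start_pfn, end_pfn);
	}
	return 0;
}
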
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
new file mode 100644
index 000000000000..6d63e3d1253d
--- /dev/null
+++ b/arch/x86/mm/init.c
@@ -0,0 +1,393 @@
1#include <linux/ioport.h>
2#include <linux/swap.h>
3
4#include <asm/cacheflush.h>
5#include <asm/e820.h>
6#include <asm/init.h>
7#include <asm/page.h>
8#include <asm/page_types.h>
9#include <asm/sections.h>
10#include <asm/system.h>
11#include <asm/tlbflush.h>
12
13unsigned long __initdata e820_table_start;
14unsigned long __meminitdata e820_table_end;
15unsigned long __meminitdata e820_table_top;
16
17int after_bootmem;
18
19int direct_gbpages
20#ifdef CONFIG_DIRECT_GBPAGES
21 = 1
22#endif
23;
24
25static void __init find_early_table_space(unsigned long end, int use_pse,
26 int use_gbpages)
27{
28 unsigned long puds, pmds, ptes, tables, start;
29
30 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
31 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
32
33 if (use_gbpages) {
34 unsigned long extra;
35
36 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
37 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
38 } else
39 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
40
41 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
42
43 if (use_pse) {
44 unsigned long extra;
45
46 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
47#ifdef CONFIG_X86_32
48 extra += PMD_SIZE;
49#endif
50 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
51 } else
52 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
53
54 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
55
56#ifdef CONFIG_X86_32
57 /* for fixmap */
58 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
59#endif
60
61 /*
62 * RED-PEN putting page tables only on node 0 could
63 * cause a hotspot and fill up ZONE_DMA. The page tables
64 * need roughly 0.5KB per GB.
65 */
66#ifdef CONFIG_X86_32
67 start = 0x7000;
68 e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
69 tables, PAGE_SIZE);
70#else /* CONFIG_X86_64 */
71 start = 0x8000;
72 e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
73#endif
74 if (e820_table_start == -1UL)
75 panic("Cannot find space for the kernel page tables");
76
77 e820_table_start >>= PAGE_SHIFT;
78 e820_table_end = e820_table_start;
79 e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
80
81 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
82 end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
83}
84
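
find_early_table_space() above sizes the bootstrap direct-mapping page tables from a worst-case count of PUD, PMD and PTE entries for the range being mapped; large pages shrink the PTE (and, with gbpages, the PMD) requirement to the unaligned tail only. The same arithmetic as a stand-alone program, assuming x86_64 shifts and 8-byte table entries on a 64-bit host (ENTRY_SIZE and the 4 GB example are illustrative):

#include <stdio.h>

#define PAGE_SHIFT 12UL
#define PMD_SHIFT  21UL
#define PUD_SHIFT  30UL
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PMD_SIZE   (1UL << PMD_SHIFT)
#define PUD_SIZE   (1UL << PUD_SHIFT)
#define ENTRY_SIZE 8UL                 /* sizeof(pud_t/pmd_t/pte_t) on x86_64 */

static unsigned long roundup(unsigned long x, unsigned long to)
{
	return (x + to - 1) / to * to;
}

/* Worst-case bytes of page tables needed to map [0, end), mirroring
 * find_early_table_space() for given use_pse/use_gbpages settings. */
static unsigned long table_space(unsigned long end, int use_pse, int use_gbpages)
{
	unsigned long puds, pmds, ptes, tables;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	tables = roundup(puds * ENTRY_SIZE, PAGE_SIZE);

	if (use_gbpages)        /* only the tail above the last 1G boundary needs PMDs */
		pmds = ((end - ((end >> PUD_SHIFT) << PUD_SHIFT)) + PMD_SIZE - 1) >> PMD_SHIFT;
	else
		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables += roundup(pmds * ENTRY_SIZE, PAGE_SIZE);

	if (use_pse)            /* only the tail above the last 2M boundary needs PTEs */
		ptes = ((end - ((end >> PMD_SHIFT) << PMD_SHIFT)) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	else
		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	tables += roundup(ptes * ENTRY_SIZE, PAGE_SIZE);

	return tables;
}

int main(void)
{
	unsigned long end = 4UL << 30;      /* map the first 4 GB */

	printf("small pages only: %lu KB\n", table_space(end, 0, 0) >> 10);
	printf("with 2M pages:    %lu KB\n", table_space(end, 1, 0) >> 10);
	return 0;
}
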
85struct map_range {
86 unsigned long start;
87 unsigned long end;
88 unsigned page_size_mask;
89};
90
91#ifdef CONFIG_X86_32
92#define NR_RANGE_MR 3
93#else /* CONFIG_X86_64 */
94#define NR_RANGE_MR 5
95#endif
96
97static int save_mr(struct map_range *mr, int nr_range,
98 unsigned long start_pfn, unsigned long end_pfn,
99 unsigned long page_size_mask)
100{
101 if (start_pfn < end_pfn) {
102 if (nr_range >= NR_RANGE_MR)
103 panic("run out of range for init_memory_mapping\n");
104 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
105 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
106 mr[nr_range].page_size_mask = page_size_mask;
107 nr_range++;
108 }
109
110 return nr_range;
111}
112
113#ifdef CONFIG_X86_64
114static void __init init_gbpages(void)
115{
116 if (direct_gbpages && cpu_has_gbpages)
117 printk(KERN_INFO "Using GB pages for direct mapping\n");
118 else
119 direct_gbpages = 0;
120}
121#else
122static inline void init_gbpages(void)
123{
124}
125#endif
126
127/*
128 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
129 * This runs before bootmem is initialized and gets pages directly from
130 * the physical memory. To access them they are temporarily mapped.
131 */
132unsigned long __init_refok init_memory_mapping(unsigned long start,
133 unsigned long end)
134{
135 unsigned long page_size_mask = 0;
136 unsigned long start_pfn, end_pfn;
137 unsigned long pos;
138 unsigned long ret;
139
140 struct map_range mr[NR_RANGE_MR];
141 int nr_range, i;
142 int use_pse, use_gbpages;
143
144 printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
145
146 if (!after_bootmem)
147 init_gbpages();
148
149#ifdef CONFIG_DEBUG_PAGEALLOC
150 /*
151 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
152 * This will simplify cpa(), which otherwise needs to support splitting
153 * large pages into small in interrupt context, etc.
154 */
155 use_pse = use_gbpages = 0;
156#else
157 use_pse = cpu_has_pse;
158 use_gbpages = direct_gbpages;
159#endif
160
161#ifdef CONFIG_X86_32
162#ifdef CONFIG_X86_PAE
163 set_nx();
164 if (nx_enabled)
165 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
166#endif
167
168 /* Enable PSE if available */
169 if (cpu_has_pse)
170 set_in_cr4(X86_CR4_PSE);
171
172 /* Enable PGE if available */
173 if (cpu_has_pge) {
174 set_in_cr4(X86_CR4_PGE);
175 __supported_pte_mask |= _PAGE_GLOBAL;
176 }
177#endif
178
179 if (use_gbpages)
180 page_size_mask |= 1 << PG_LEVEL_1G;
181 if (use_pse)
182 page_size_mask |= 1 << PG_LEVEL_2M;
183
184 memset(mr, 0, sizeof(mr));
185 nr_range = 0;
186
187 /* head if not big page alignment ? */
188 start_pfn = start >> PAGE_SHIFT;
189 pos = start_pfn << PAGE_SHIFT;
190#ifdef CONFIG_X86_32
191 /*
192 * Don't use a large page for the first 2/4MB of memory
193 * because there are often fixed size MTRRs in there
194 * and overlapping MTRRs into large pages can cause
195 * slowdowns.
196 */
197 if (pos == 0)
198 end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
199 else
200 end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
201 << (PMD_SHIFT - PAGE_SHIFT);
202#else /* CONFIG_X86_64 */
203 end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
204 << (PMD_SHIFT - PAGE_SHIFT);
205#endif
206 if (end_pfn > (end >> PAGE_SHIFT))
207 end_pfn = end >> PAGE_SHIFT;
208 if (start_pfn < end_pfn) {
209 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
210 pos = end_pfn << PAGE_SHIFT;
211 }
212
213 /* big page (2M) range */
214 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
215 << (PMD_SHIFT - PAGE_SHIFT);
216#ifdef CONFIG_X86_32
217 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
218#else /* CONFIG_X86_64 */
219 end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
220 << (PUD_SHIFT - PAGE_SHIFT);
221 if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
222 end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
223#endif
224
225 if (start_pfn < end_pfn) {
226 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
227 page_size_mask & (1<<PG_LEVEL_2M));
228 pos = end_pfn << PAGE_SHIFT;
229 }
230
231#ifdef CONFIG_X86_64
232 /* big page (1G) range */
233 start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
234 << (PUD_SHIFT - PAGE_SHIFT);
235 end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
236 if (start_pfn < end_pfn) {
237 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
238 page_size_mask &
239 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
240 pos = end_pfn << PAGE_SHIFT;
241 }
242
243 /* tail is not big page (1G) alignment */
244 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
245 << (PMD_SHIFT - PAGE_SHIFT);
246 end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
247 if (start_pfn < end_pfn) {
248 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
249 page_size_mask & (1<<PG_LEVEL_2M));
250 pos = end_pfn << PAGE_SHIFT;
251 }
252#endif
253
254 /* tail is not big page (2M) alignment */
255 start_pfn = pos>>PAGE_SHIFT;
256 end_pfn = end>>PAGE_SHIFT;
257 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
258
259 /* try to merge same page size and continuous */
260 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
261 unsigned long old_start;
262 if (mr[i].end != mr[i+1].start ||
263 mr[i].page_size_mask != mr[i+1].page_size_mask)
264 continue;
265 /* move it */
266 old_start = mr[i].start;
267 memmove(&mr[i], &mr[i+1],
268 (nr_range - 1 - i) * sizeof(struct map_range));
269 mr[i--].start = old_start;
270 nr_range--;
271 }
272
273 for (i = 0; i < nr_range; i++)
274 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
275 mr[i].start, mr[i].end,
276 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
277 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
278
279 /*
280 * Find space for the kernel direct mapping tables.
281 *
282 * Later we should allocate these tables in the local node of the
283 * memory mapped. Unfortunately this is done currently before the
284 * nodes are discovered.
285 */
286 if (!after_bootmem)
287 find_early_table_space(end, use_pse, use_gbpages);
288
289#ifdef CONFIG_X86_32
290 for (i = 0; i < nr_range; i++)
291 kernel_physical_mapping_init(mr[i].start, mr[i].end,
292 mr[i].page_size_mask);
293 ret = end;
294#else /* CONFIG_X86_64 */
295 for (i = 0; i < nr_range; i++)
296 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
297 mr[i].page_size_mask);
298#endif
299
300#ifdef CONFIG_X86_32
301 early_ioremap_page_table_range_init();
302
303 load_cr3(swapper_pg_dir);
304#endif
305
306#ifdef CONFIG_X86_64
307 if (!after_bootmem)
308 mmu_cr4_features = read_cr4();
309#endif
310 __flush_tlb_all();
311
312 if (!after_bootmem && e820_table_end > e820_table_start)
313 reserve_early(e820_table_start << PAGE_SHIFT,
314 e820_table_end << PAGE_SHIFT, "PGTABLE");
315
316 if (!after_bootmem)
317 early_memtest(start, end);
318
319 return ret >> PAGE_SHIFT;
320}
321
322
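
The unified init_memory_mapping() above splits [start, end) into a 4k head up to the first 2M boundary, a large-page middle and a 4k tail, records each piece with save_mr(), and finally merges neighbouring ranges that ended up with the same page-size mask. A condensed user-space version of the split step, assuming 64-bit shifts and ignoring the gbpages and first-2/4MB MTRR special cases (the example addresses are arbitrary):

#include <stdio.h>

#define PAGE_SHIFT  12UL
#define PMD_SHIFT   21UL
#define PG_LEVEL_2M 1UL

struct map_range { unsigned long start, end, mask; };

static int save_mr(struct map_range *mr, int n,
		   unsigned long s_pfn, unsigned long e_pfn, unsigned long mask)
{
	if (s_pfn < e_pfn) {
		mr[n].start = s_pfn << PAGE_SHIFT;
		mr[n].end   = e_pfn << PAGE_SHIFT;
		mr[n].mask  = mask;
		n++;
	}
	return n;
}

int main(void)
{
	unsigned long start = 0x0009f000, end = 0x7ff00000;   /* arbitrary example */
	unsigned long pos = start, pfn, end_pfn;
	struct map_range mr[3];
	int nr = 0, i;

	/* 4k head up to the first 2M boundary */
	pfn = pos >> PAGE_SHIFT;
	end_pfn = ((pos + (1UL << PMD_SHIFT) - 1) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	if (end_pfn > (end >> PAGE_SHIFT))
		end_pfn = end >> PAGE_SHIFT;
	nr = save_mr(mr, nr, pfn, end_pfn, 0);
	if (nr)
		pos = mr[nr - 1].end;

	/* 2M-aligned middle */
	pfn = pos >> PAGE_SHIFT;
	end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
	nr = save_mr(mr, nr, pfn, end_pfn, 1UL << PG_LEVEL_2M);
	if (nr)
		pos = mr[nr - 1].end;

	/* 4k tail */
	nr = save_mr(mr, nr, pos >> PAGE_SHIFT, end >> PAGE_SHIFT, 0);

	for (i = 0; i < nr; i++)
		printf(" %010lx - %010lx page %s\n", mr[i].start, mr[i].end,
		       (mr[i].mask & (1UL << PG_LEVEL_2M)) ? "2M" : "4k");
	return 0;
}

The merge pass in the real function then collapses adjacent mr[] entries whose masks match, so the later kernel_physical_mapping_init() calls see as few ranges as possible.
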
323/*
324 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
325 * is valid. The argument is a physical page number.
326 *
327 *
328 * On x86, access has to be given to the first megabyte of ram because that area
329 * contains bios code and data regions used by X and dosemu and similar apps.
330 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
331 * mmio resources as well as potential bios/acpi data regions.
332 */
333int devmem_is_allowed(unsigned long pagenr)
334{
335 if (pagenr <= 256)
336 return 1;
337 if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
338 return 0;
339 if (!page_is_ram(pagenr))
340 return 1;
341 return 0;
342}
343
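
devmem_is_allowed() above applies three checks in order: the first megabyte is always readable, exclusively claimed I/O memory is always refused, and anything that is not kernel RAM (MMIO, ACPI data, holes) is allowed. A toy version with stubbed-out predicates, purely to show the check order; toy_iomem_is_exclusive() and toy_page_is_ram() are hypothetical stand-ins for the real helpers, which consult the resource tree and the e820 map:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins; the real iomem_is_exclusive() takes a physical address. */
static bool toy_iomem_is_exclusive(unsigned long pfn) { return pfn == 0x400; }
static bool toy_page_is_ram(unsigned long pfn)        { return pfn < 0x800; }

/* Same check order as devmem_is_allowed(): low 1MB, exclusivity, then RAM. */
static int toy_devmem_is_allowed(unsigned long pfn)
{
	if (pfn <= 256)                        /* first megabyte: BIOS/legacy data */
		return 1;
	if (toy_iomem_is_exclusive(pfn))       /* claimed exclusively by a driver  */
		return 0;
	if (!toy_page_is_ram(pfn))             /* MMIO / ACPI / reserved regions   */
		return 1;
	return 0;                              /* ordinary kernel RAM: refused     */
}

int main(void)
{
	unsigned long pfns[] = { 0x10, 0x300, 0x400, 0x900 };

	for (int i = 0; i < 4; i++)
		printf("pfn %#lx -> %d\n", pfns[i], toy_devmem_is_allowed(pfns[i]));
	return 0;
}
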
344void free_init_pages(char *what, unsigned long begin, unsigned long end)
345{
346 unsigned long addr = begin;
347
348 if (addr >= end)
349 return;
350
351 /*
352 * If debugging page accesses then do not free this memory but
353 * mark them not present - any buggy init-section access will
354 * create a kernel page fault:
355 */
356#ifdef CONFIG_DEBUG_PAGEALLOC
357 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
358 begin, PAGE_ALIGN(end));
359 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
360#else
361 /*
362 * We just marked the kernel text read only above, now that
363 * we are going to free part of that, we need to make that
364 * writeable first.
365 */
366 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
367
368 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
369
370 for (; addr < end; addr += PAGE_SIZE) {
371 ClearPageReserved(virt_to_page(addr));
372 init_page_count(virt_to_page(addr));
373 memset((void *)(addr & ~(PAGE_SIZE-1)),
374 POISON_FREE_INITMEM, PAGE_SIZE);
375 free_page(addr);
376 totalram_pages++;
377 }
378#endif
379}
380
381void free_initmem(void)
382{
383 free_init_pages("unused kernel memory",
384 (unsigned long)(&__init_begin),
385 (unsigned long)(&__init_end));
386}
387
388#ifdef CONFIG_BLK_DEV_INITRD
389void free_initrd_mem(unsigned long start, unsigned long end)
390{
391 free_init_pages("initrd memory", start, end);
392}
393#endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 3eb2ed188a4c..d7f5060ab21c 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,9 +49,7 @@
49#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/setup.h> 50#include <asm/setup.h>
51#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
52#include <asm/smp.h> 52#include <asm/init.h>
53
54unsigned int __VMALLOC_RESERVE = 128 << 20;
55 53
56unsigned long max_low_pfn_mapped; 54unsigned long max_low_pfn_mapped;
57unsigned long max_pfn_mapped; 55unsigned long max_pfn_mapped;
@@ -61,19 +59,14 @@ unsigned long highstart_pfn, highend_pfn;
61 59
62static noinline int do_test_wp_bit(void); 60static noinline int do_test_wp_bit(void);
63 61
64 62bool __read_mostly __vmalloc_start_set = false;
65static unsigned long __initdata table_start;
66static unsigned long __meminitdata table_end;
67static unsigned long __meminitdata table_top;
68
69static int __initdata after_init_bootmem;
70 63
71static __init void *alloc_low_page(void) 64static __init void *alloc_low_page(void)
72{ 65{
73 unsigned long pfn = table_end++; 66 unsigned long pfn = e820_table_end++;
74 void *adr; 67 void *adr;
75 68
76 if (pfn >= table_top) 69 if (pfn >= e820_table_top)
77 panic("alloc_low_page: ran out of memory"); 70 panic("alloc_low_page: ran out of memory");
78 71
79 adr = __va(pfn * PAGE_SIZE); 72 adr = __va(pfn * PAGE_SIZE);
@@ -93,7 +86,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
93 86
94#ifdef CONFIG_X86_PAE 87#ifdef CONFIG_X86_PAE
95 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 88 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
96 if (after_init_bootmem) 89 if (after_bootmem)
97 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); 90 pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
98 else 91 else
99 pmd_table = (pmd_t *)alloc_low_page(); 92 pmd_table = (pmd_t *)alloc_low_page();
@@ -120,7 +113,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
120 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { 113 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
121 pte_t *page_table = NULL; 114 pte_t *page_table = NULL;
122 115
123 if (after_init_bootmem) { 116 if (after_bootmem) {
124#ifdef CONFIG_DEBUG_PAGEALLOC 117#ifdef CONFIG_DEBUG_PAGEALLOC
125 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); 118 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
126#endif 119#endif
@@ -138,6 +131,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
138 return pte_offset_kernel(pmd, 0); 131 return pte_offset_kernel(pmd, 0);
139} 132}
140 133
134pmd_t * __init populate_extra_pmd(unsigned long vaddr)
135{
136 int pgd_idx = pgd_index(vaddr);
137 int pmd_idx = pmd_index(vaddr);
138
139 return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
140}
141
142pte_t * __init populate_extra_pte(unsigned long vaddr)
143{
144 int pte_idx = pte_index(vaddr);
145 pmd_t *pmd;
146
147 pmd = populate_extra_pmd(vaddr);
148 return one_page_table_init(pmd) + pte_idx;
149}
150
141static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, 151static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
142 unsigned long vaddr, pte_t *lastpte) 152 unsigned long vaddr, pte_t *lastpte)
143{ 153{
@@ -154,12 +164,12 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
154 if (pmd_idx_kmap_begin != pmd_idx_kmap_end 164 if (pmd_idx_kmap_begin != pmd_idx_kmap_end
155 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin 165 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
156 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end 166 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
157 && ((__pa(pte) >> PAGE_SHIFT) < table_start 167 && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
158 || (__pa(pte) >> PAGE_SHIFT) >= table_end)) { 168 || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
159 pte_t *newpte; 169 pte_t *newpte;
160 int i; 170 int i;
161 171
162 BUG_ON(after_init_bootmem); 172 BUG_ON(after_bootmem);
163 newpte = alloc_low_page(); 173 newpte = alloc_low_page();
164 for (i = 0; i < PTRS_PER_PTE; i++) 174 for (i = 0; i < PTRS_PER_PTE; i++)
165 set_pte(newpte + i, pte[i]); 175 set_pte(newpte + i, pte[i]);
@@ -228,11 +238,14 @@ static inline int is_kernel_text(unsigned long addr)
228 * of max_low_pfn pages, by creating page tables starting from address 238 * of max_low_pfn pages, by creating page tables starting from address
229 * PAGE_OFFSET: 239 * PAGE_OFFSET:
230 */ 240 */
231static void __init kernel_physical_mapping_init(pgd_t *pgd_base, 241unsigned long __init
232 unsigned long start_pfn, 242kernel_physical_mapping_init(unsigned long start,
233 unsigned long end_pfn, 243 unsigned long end,
234 int use_pse) 244 unsigned long page_size_mask)
235{ 245{
246 int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
247 unsigned long start_pfn, end_pfn;
248 pgd_t *pgd_base = swapper_pg_dir;
236 int pgd_idx, pmd_idx, pte_ofs; 249 int pgd_idx, pmd_idx, pte_ofs;
237 unsigned long pfn; 250 unsigned long pfn;
238 pgd_t *pgd; 251 pgd_t *pgd;
@@ -241,6 +254,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
241 unsigned pages_2m, pages_4k; 254 unsigned pages_2m, pages_4k;
242 int mapping_iter; 255 int mapping_iter;
243 256
257 start_pfn = start >> PAGE_SHIFT;
258 end_pfn = end >> PAGE_SHIFT;
259
244 /* 260 /*
245 * First iteration will setup identity mapping using large/small pages 261 * First iteration will setup identity mapping using large/small pages
246 * based on use_pse, with other attributes same as set by 262 * based on use_pse, with other attributes same as set by
@@ -355,26 +371,6 @@ repeat:
355 mapping_iter = 2; 371 mapping_iter = 2;
356 goto repeat; 372 goto repeat;
357 } 373 }
358}
359
360/*
361 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
362 * is valid. The argument is a physical page number.
363 *
364 *
365 * On x86, access has to be given to the first megabyte of ram because that area
366 * contains bios code and data regions used by X and dosemu and similar apps.
367 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
368 * mmio resources as well as potential bios/acpi data regions.
369 */
370int devmem_is_allowed(unsigned long pagenr)
371{
372 if (pagenr <= 256)
373 return 1;
374 if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
375 return 0;
376 if (!page_is_ram(pagenr))
377 return 1;
378 return 0; 374 return 0;
379} 375}
380 376
@@ -470,22 +466,10 @@ void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
470 work_with_active_regions(nid, add_highpages_work_fn, &data); 466 work_with_active_regions(nid, add_highpages_work_fn, &data);
471} 467}
472 468
473#ifndef CONFIG_NUMA
474static void __init set_highmem_pages_init(void)
475{
476 add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
477
478 totalram_pages += totalhigh_pages;
479}
480#endif /* !CONFIG_NUMA */
481
482#else 469#else
483static inline void permanent_kmaps_init(pgd_t *pgd_base) 470static inline void permanent_kmaps_init(pgd_t *pgd_base)
484{ 471{
485} 472}
486static inline void set_highmem_pages_init(void)
487{
488}
489#endif /* CONFIG_HIGHMEM */ 473#endif /* CONFIG_HIGHMEM */
490 474
491void __init native_pagetable_setup_start(pgd_t *base) 475void __init native_pagetable_setup_start(pgd_t *base)
@@ -543,8 +527,9 @@ void __init native_pagetable_setup_done(pgd_t *base)
543 * be partially populated, and so it avoids stomping on any existing 527 * be partially populated, and so it avoids stomping on any existing
544 * mappings. 528 * mappings.
545 */ 529 */
546static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base) 530void __init early_ioremap_page_table_range_init(void)
547{ 531{
532 pgd_t *pgd_base = swapper_pg_dir;
548 unsigned long vaddr, end; 533 unsigned long vaddr, end;
549 534
550 /* 535 /*
@@ -639,7 +624,7 @@ static int __init noexec_setup(char *str)
639} 624}
640early_param("noexec", noexec_setup); 625early_param("noexec", noexec_setup);
641 626
642static void __init set_nx(void) 627void __init set_nx(void)
643{ 628{
644 unsigned int v[4], l, h; 629 unsigned int v[4], l, h;
645 630
@@ -675,75 +660,97 @@ static int __init parse_highmem(char *arg)
675} 660}
676early_param("highmem", parse_highmem); 661early_param("highmem", parse_highmem);
677 662
663#define MSG_HIGHMEM_TOO_BIG \
664 "highmem size (%luMB) is bigger than pages available (%luMB)!\n"
665
666#define MSG_LOWMEM_TOO_SMALL \
667 "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n"
678/* 668/*
679 * Determine low and high memory ranges: 669 * All of RAM fits into lowmem - but if user wants highmem
670 * artificially via the highmem=x boot parameter then create
671 * it:
680 */ 672 */
681void __init find_low_pfn_range(void) 673void __init lowmem_pfn_init(void)
682{ 674{
683 /* it could update max_pfn */
684
685 /* max_low_pfn is 0, we already have early_res support */ 675 /* max_low_pfn is 0, we already have early_res support */
686
687 max_low_pfn = max_pfn; 676 max_low_pfn = max_pfn;
688 if (max_low_pfn > MAXMEM_PFN) { 677
689 if (highmem_pages == -1) 678 if (highmem_pages == -1)
690 highmem_pages = max_pfn - MAXMEM_PFN; 679 highmem_pages = 0;
691 if (highmem_pages + MAXMEM_PFN < max_pfn) 680#ifdef CONFIG_HIGHMEM
692 max_pfn = MAXMEM_PFN + highmem_pages; 681 if (highmem_pages >= max_pfn) {
693 if (highmem_pages + MAXMEM_PFN > max_pfn) { 682 printk(KERN_ERR MSG_HIGHMEM_TOO_BIG,
694 printk(KERN_WARNING "only %luMB highmem pages " 683 pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
695 "available, ignoring highmem size of %uMB.\n", 684 highmem_pages = 0;
696 pages_to_mb(max_pfn - MAXMEM_PFN), 685 }
686 if (highmem_pages) {
687 if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {
688 printk(KERN_ERR MSG_LOWMEM_TOO_SMALL,
697 pages_to_mb(highmem_pages)); 689 pages_to_mb(highmem_pages));
698 highmem_pages = 0; 690 highmem_pages = 0;
699 } 691 }
700 max_low_pfn = MAXMEM_PFN; 692 max_low_pfn -= highmem_pages;
693 }
694#else
695 if (highmem_pages)
696 printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
697#endif
698}
699
700#define MSG_HIGHMEM_TOO_SMALL \
701 "only %luMB highmem pages available, ignoring highmem size of %luMB!\n"
702
703#define MSG_HIGHMEM_TRIMMED \
704 "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n"
705/*
706 * We have more RAM than fits into lowmem - we try to put it into
707 * highmem, also taking the highmem=x boot parameter into account:
708 */
709void __init highmem_pfn_init(void)
710{
711 max_low_pfn = MAXMEM_PFN;
712
713 if (highmem_pages == -1)
714 highmem_pages = max_pfn - MAXMEM_PFN;
715
716 if (highmem_pages + MAXMEM_PFN < max_pfn)
717 max_pfn = MAXMEM_PFN + highmem_pages;
718
719 if (highmem_pages + MAXMEM_PFN > max_pfn) {
720 printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
721 pages_to_mb(max_pfn - MAXMEM_PFN),
722 pages_to_mb(highmem_pages));
723 highmem_pages = 0;
724 }
701#ifndef CONFIG_HIGHMEM 725#ifndef CONFIG_HIGHMEM
702 /* Maximum memory usable is what is directly addressable */ 726 /* Maximum memory usable is what is directly addressable */
703 printk(KERN_WARNING "Warning only %ldMB will be used.\n", 727 printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
704 MAXMEM>>20); 728 if (max_pfn > MAX_NONPAE_PFN)
705 if (max_pfn > MAX_NONPAE_PFN) 729 printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
706 printk(KERN_WARNING 730 else
707 "Use a HIGHMEM64G enabled kernel.\n"); 731 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
708 else 732 max_pfn = MAXMEM_PFN;
709 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
710 max_pfn = MAXMEM_PFN;
711#else /* !CONFIG_HIGHMEM */ 733#else /* !CONFIG_HIGHMEM */
712#ifndef CONFIG_HIGHMEM64G 734#ifndef CONFIG_HIGHMEM64G
713 if (max_pfn > MAX_NONPAE_PFN) { 735 if (max_pfn > MAX_NONPAE_PFN) {
714 max_pfn = MAX_NONPAE_PFN; 736 max_pfn = MAX_NONPAE_PFN;
715 printk(KERN_WARNING "Warning only 4GB will be used." 737 printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
716 "Use a HIGHMEM64G enabled kernel.\n"); 738 }
717 }
718#endif /* !CONFIG_HIGHMEM64G */ 739#endif /* !CONFIG_HIGHMEM64G */
719#endif /* !CONFIG_HIGHMEM */ 740#endif /* !CONFIG_HIGHMEM */
720 } else { 741}
721 if (highmem_pages == -1) 742
722 highmem_pages = 0; 743/*
723#ifdef CONFIG_HIGHMEM 744 * Determine low and high memory ranges:
724 if (highmem_pages >= max_pfn) { 745 */
725 printk(KERN_ERR "highmem size specified (%uMB) is " 746void __init find_low_pfn_range(void)
726 "bigger than pages available (%luMB)!.\n", 747{
727 pages_to_mb(highmem_pages), 748 /* it could update max_pfn */
728 pages_to_mb(max_pfn)); 749
729 highmem_pages = 0; 750 if (max_pfn <= MAXMEM_PFN)
730 } 751 lowmem_pfn_init();
731 if (highmem_pages) { 752 else
732 if (max_low_pfn - highmem_pages < 753 highmem_pfn_init();
733 64*1024*1024/PAGE_SIZE){
734 printk(KERN_ERR "highmem size %uMB results in "
735 "smaller than 64MB lowmem, ignoring it.\n"
736 , pages_to_mb(highmem_pages));
737 highmem_pages = 0;
738 }
739 max_low_pfn -= highmem_pages;
740 }
741#else
742 if (highmem_pages)
743 printk(KERN_ERR "ignoring highmem size on non-highmem"
744 " kernel!\n");
745#endif
746 }
747} 754}
748 755
749#ifndef CONFIG_NEED_MULTIPLE_NODES 756#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -769,6 +776,8 @@ void __init initmem_init(unsigned long start_pfn,
769#ifdef CONFIG_FLATMEM 776#ifdef CONFIG_FLATMEM
770 max_mapnr = num_physpages; 777 max_mapnr = num_physpages;
771#endif 778#endif
779 __vmalloc_start_set = true;
780
772 printk(KERN_NOTICE "%ldMB LOWMEM available.\n", 781 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
773 pages_to_mb(max_low_pfn)); 782 pages_to_mb(max_low_pfn));
774 783
@@ -790,176 +799,61 @@ static void __init zone_sizes_init(void)
790 free_area_init_nodes(max_zone_pfns); 799 free_area_init_nodes(max_zone_pfns);
791} 800}
792 801
802static unsigned long __init setup_node_bootmem(int nodeid,
803 unsigned long start_pfn,
804 unsigned long end_pfn,
805 unsigned long bootmap)
806{
807 unsigned long bootmap_size;
808
809 if (start_pfn > max_low_pfn)
810 return bootmap;
811 if (end_pfn > max_low_pfn)
812 end_pfn = max_low_pfn;
813
814 /* don't touch min_low_pfn */
815 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
816 bootmap >> PAGE_SHIFT,
817 start_pfn, end_pfn);
818 printk(KERN_INFO " node %d low ram: %08lx - %08lx\n",
819 nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
820 printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
821 nodeid, bootmap, bootmap + bootmap_size);
822 free_bootmem_with_active_regions(nodeid, end_pfn);
823 early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
824
825 return bootmap + bootmap_size;
826}
827
793void __init setup_bootmem_allocator(void) 828void __init setup_bootmem_allocator(void)
794{ 829{
795 int i; 830 int nodeid;
796 unsigned long bootmap_size, bootmap; 831 unsigned long bootmap_size, bootmap;
797 /* 832 /*
798 * Initialize the boot-time allocator (with low memory only): 833 * Initialize the boot-time allocator (with low memory only):
799 */ 834 */
800 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; 835 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
801 bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT, 836 bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
802 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
803 PAGE_SIZE); 837 PAGE_SIZE);
804 if (bootmap == -1L) 838 if (bootmap == -1L)
805 panic("Cannot find bootmem map of size %ld\n", bootmap_size); 839 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
806 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); 840 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
807 841
808 /* don't touch min_low_pfn */
809 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
810 min_low_pfn, max_low_pfn);
811 printk(KERN_INFO " mapped low ram: 0 - %08lx\n", 842 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
812 max_pfn_mapped<<PAGE_SHIFT); 843 max_pfn_mapped<<PAGE_SHIFT);
813 printk(KERN_INFO " low ram: %08lx - %08lx\n", 844 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
814 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
815 printk(KERN_INFO " bootmap %08lx - %08lx\n",
816 bootmap, bootmap + bootmap_size);
817 for_each_online_node(i)
818 free_bootmem_with_active_regions(i, max_low_pfn);
819 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
820
821 after_init_bootmem = 1;
822}
823
824static void __init find_early_table_space(unsigned long end, int use_pse)
825{
826 unsigned long puds, pmds, ptes, tables, start;
827
828 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
829 tables = PAGE_ALIGN(puds * sizeof(pud_t));
830
831 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
832 tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
833
834 if (use_pse) {
835 unsigned long extra;
836
837 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
838 extra += PMD_SIZE;
839 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
840 } else
841 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
842 845
843 tables += PAGE_ALIGN(ptes * sizeof(pte_t)); 846#ifdef CONFIG_NEED_MULTIPLE_NODES
844 847 for_each_online_node(nodeid)
845 /* for fixmap */ 848 bootmap = setup_node_bootmem(nodeid, node_start_pfn[nodeid],
846 tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t)); 849 node_end_pfn[nodeid], bootmap);
847
848 /*
849 * RED-PEN putting page tables only on node 0 could
850 * cause a hotspot and fill up ZONE_DMA. The page tables
851 * need roughly 0.5KB per GB.
852 */
853 start = 0x7000;
854 table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
855 tables, PAGE_SIZE);
856 if (table_start == -1UL)
857 panic("Cannot find space for the kernel page tables");
858
859 table_start >>= PAGE_SHIFT;
860 table_end = table_start;
861 table_top = table_start + (tables>>PAGE_SHIFT);
862
863 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
864 end, table_start << PAGE_SHIFT,
865 (table_start << PAGE_SHIFT) + tables);
866}
867
868unsigned long __init_refok init_memory_mapping(unsigned long start,
869 unsigned long end)
870{
871 pgd_t *pgd_base = swapper_pg_dir;
872 unsigned long start_pfn, end_pfn;
873 unsigned long big_page_start;
874#ifdef CONFIG_DEBUG_PAGEALLOC
875 /*
876 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
877 * This will simplify cpa(), which otherwise needs to support splitting
878 * large pages into small in interrupt context, etc.
879 */
880 int use_pse = 0;
881#else 850#else
882 int use_pse = cpu_has_pse; 851 bootmap = setup_node_bootmem(0, 0, max_low_pfn, bootmap);
883#endif
884
885 /*
886 * Find space for the kernel direct mapping tables.
887 */
888 if (!after_init_bootmem)
889 find_early_table_space(end, use_pse);
890
891#ifdef CONFIG_X86_PAE
892 set_nx();
893 if (nx_enabled)
894 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
895#endif 852#endif
896 853
897 /* Enable PSE if available */ 854 after_bootmem = 1;
898 if (cpu_has_pse)
899 set_in_cr4(X86_CR4_PSE);
900
901 /* Enable PGE if available */
902 if (cpu_has_pge) {
903 set_in_cr4(X86_CR4_PGE);
904 __supported_pte_mask |= _PAGE_GLOBAL;
905 }
906
907 /*
908 * Don't use a large page for the first 2/4MB of memory
909 * because there are often fixed size MTRRs in there
910 * and overlapping MTRRs into large pages can cause
911 * slowdowns.
912 */
913 big_page_start = PMD_SIZE;
914
915 if (start < big_page_start) {
916 start_pfn = start >> PAGE_SHIFT;
917 end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
918 } else {
919 /* head is not big page alignment ? */
920 start_pfn = start >> PAGE_SHIFT;
921 end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
922 << (PMD_SHIFT - PAGE_SHIFT);
923 }
924 if (start_pfn < end_pfn)
925 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
926
927 /* big page range */
928 start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
929 << (PMD_SHIFT - PAGE_SHIFT);
930 if (start_pfn < (big_page_start >> PAGE_SHIFT))
931 start_pfn = big_page_start >> PAGE_SHIFT;
932 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
933 if (start_pfn < end_pfn)
934 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
935 use_pse);
936
937 /* tail is not big page alignment ? */
938 start_pfn = end_pfn;
939 if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
940 end_pfn = end >> PAGE_SHIFT;
941 if (start_pfn < end_pfn)
942 kernel_physical_mapping_init(pgd_base, start_pfn,
943 end_pfn, 0);
944 }
945
946 early_ioremap_page_table_range_init(pgd_base);
947
948 load_cr3(swapper_pg_dir);
949
950 __flush_tlb_all();
951
952 if (!after_init_bootmem)
953 reserve_early(table_start << PAGE_SHIFT,
954 table_end << PAGE_SHIFT, "PGTABLE");
955
956 if (!after_init_bootmem)
957 early_memtest(start, end);
958
959 return end >> PAGE_SHIFT;
960} 855}
961 856
962
963/* 857/*
964 * paging_init() sets up the page tables - note that the first 8MB are 858 * paging_init() sets up the page tables - note that the first 8MB are
965 * already mapped by head.S. 859 * already mapped by head.S.
@@ -1222,52 +1116,6 @@ void mark_rodata_ro(void)
1222} 1116}
1223#endif 1117#endif
1224 1118
1225void free_init_pages(char *what, unsigned long begin, unsigned long end)
1226{
1227#ifdef CONFIG_DEBUG_PAGEALLOC
1228 /*
1229 * If debugging page accesses then do not free this memory but
1230 * mark them not present - any buggy init-section access will
1231 * create a kernel page fault:
1232 */
1233 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
1234 begin, PAGE_ALIGN(end));
1235 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
1236#else
1237 unsigned long addr;
1238
1239 /*
1240 * We just marked the kernel text read only above, now that
1241 * we are going to free part of that, we need to make that
1242 * writeable first.
1243 */
1244 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
1245
1246 for (addr = begin; addr < end; addr += PAGE_SIZE) {
1247 ClearPageReserved(virt_to_page(addr));
1248 init_page_count(virt_to_page(addr));
1249 memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
1250 free_page(addr);
1251 totalram_pages++;
1252 }
1253 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
1254#endif
1255}
1256
1257void free_initmem(void)
1258{
1259 free_init_pages("unused kernel memory",
1260 (unsigned long)(&__init_begin),
1261 (unsigned long)(&__init_end));
1262}
1263
1264#ifdef CONFIG_BLK_DEV_INITRD
1265void free_initrd_mem(unsigned long start, unsigned long end)
1266{
1267 free_init_pages("initrd memory", start, end);
1268}
1269#endif
1270
1271int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, 1119int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1272 int flags) 1120 int flags)
1273{ 1121{
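
The rework above splits find_low_pfn_range() into lowmem_pfn_init() (all RAM fits below MAXMEM_PFN) and highmem_pfn_init() (the remainder spills into highmem), with the highmem= parameter honoured in both paths. A condensed model of that decision, assuming roughly 896 MB of lowmem and ignoring the HIGHMEM64G capping and the warning messages (the constants and example sizes are illustrative):

#include <stdio.h>

#define PAGE_SHIFT 12UL
#define MAXMEM_PFN (896UL << (20 - PAGE_SHIFT))   /* ~896 MB of lowmem */

/* Given the highest RAM pfn and an optional highmem= override (-1 when
 * unset), derive how many pages end up in lowmem vs. highmem. */
static void split_pfns(unsigned long max_pfn, long highmem_pages,
		       unsigned long *low, unsigned long *high)
{
	if (max_pfn <= MAXMEM_PFN) {
		/* lowmem_pfn_init(): everything fits, honour highmem= if sane */
		*low = max_pfn;
		if (highmem_pages < 0)
			highmem_pages = 0;
		if ((unsigned long)highmem_pages < max_pfn &&
		    *low - highmem_pages >= (64UL << (20 - PAGE_SHIFT)))
			*low -= highmem_pages;
		*high = max_pfn - *low;
	} else {
		/* highmem_pfn_init(): more RAM than lowmem can hold */
		*low = MAXMEM_PFN;
		if (highmem_pages < 0)
			highmem_pages = max_pfn - MAXMEM_PFN;
		*high = highmem_pages;
	}
}

int main(void)
{
	unsigned long low, high;

	split_pfns(2UL << (30 - PAGE_SHIFT), -1, &low, &high);     /* 2 GB box   */
	printf("2GB:   low %lu MB, high %lu MB\n", low >> 8, high >> 8);

	split_pfns(512UL << (20 - PAGE_SHIFT), -1, &low, &high);   /* 512 MB box */
	printf("512MB: low %lu MB, high %lu MB\n", low >> 8, high >> 8);
	return 0;
}
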
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ea5ad1e3672d..66d6be85df82 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -48,6 +48,7 @@
48#include <asm/kdebug.h> 48#include <asm/kdebug.h>
49#include <asm/numa.h> 49#include <asm/numa.h>
50#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
51#include <asm/init.h>
51 52
52/* 53/*
53 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 54 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -61,12 +62,6 @@ static unsigned long dma_reserve __initdata;
61 62
62DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 63DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
63 64
64int direct_gbpages
65#ifdef CONFIG_DIRECT_GBPAGES
66 = 1
67#endif
68;
69
70static int __init parse_direct_gbpages_off(char *arg) 65static int __init parse_direct_gbpages_off(char *arg)
71{ 66{
72 direct_gbpages = 0; 67 direct_gbpages = 0;
@@ -87,8 +82,6 @@ early_param("gbpages", parse_direct_gbpages_on);
87 * around without checking the pgd every time. 82 * around without checking the pgd every time.
88 */ 83 */
89 84
90int after_bootmem;
91
92pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; 85pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
93EXPORT_SYMBOL_GPL(__supported_pte_mask); 86EXPORT_SYMBOL_GPL(__supported_pte_mask);
94 87
@@ -168,34 +161,51 @@ static __ref void *spp_getpage(void)
168 return ptr; 161 return ptr;
169} 162}
170 163
171void 164static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
172set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
173{ 165{
174 pud_t *pud; 166 if (pgd_none(*pgd)) {
175 pmd_t *pmd; 167 pud_t *pud = (pud_t *)spp_getpage();
176 pte_t *pte; 168 pgd_populate(&init_mm, pgd, pud);
169 if (pud != pud_offset(pgd, 0))
170 printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
171 pud, pud_offset(pgd, 0));
172 }
173 return pud_offset(pgd, vaddr);
174}
177 175
178 pud = pud_page + pud_index(vaddr); 176static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
177{
179 if (pud_none(*pud)) { 178 if (pud_none(*pud)) {
180 pmd = (pmd_t *) spp_getpage(); 179 pmd_t *pmd = (pmd_t *) spp_getpage();
181 pud_populate(&init_mm, pud, pmd); 180 pud_populate(&init_mm, pud, pmd);
182 if (pmd != pmd_offset(pud, 0)) { 181 if (pmd != pmd_offset(pud, 0))
183 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", 182 printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
184 pmd, pmd_offset(pud, 0)); 183 pmd, pmd_offset(pud, 0));
185 return;
186 }
187 } 184 }
188 pmd = pmd_offset(pud, vaddr); 185 return pmd_offset(pud, vaddr);
186}
187
188static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
189{
189 if (pmd_none(*pmd)) { 190 if (pmd_none(*pmd)) {
190 pte = (pte_t *) spp_getpage(); 191 pte_t *pte = (pte_t *) spp_getpage();
191 pmd_populate_kernel(&init_mm, pmd, pte); 192 pmd_populate_kernel(&init_mm, pmd, pte);
192 if (pte != pte_offset_kernel(pmd, 0)) { 193 if (pte != pte_offset_kernel(pmd, 0))
193 printk(KERN_ERR "PAGETABLE BUG #02!\n"); 194 printk(KERN_ERR "PAGETABLE BUG #02!\n");
194 return;
195 }
196 } 195 }
196 return pte_offset_kernel(pmd, vaddr);
197}
198
199void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
200{
201 pud_t *pud;
202 pmd_t *pmd;
203 pte_t *pte;
204
205 pud = pud_page + pud_index(vaddr);
206 pmd = fill_pmd(pud, vaddr);
207 pte = fill_pte(pmd, vaddr);
197 208
198 pte = pte_offset_kernel(pmd, vaddr);
199 set_pte(pte, new_pte); 209 set_pte(pte, new_pte);
200 210
201 /* 211 /*
@@ -205,8 +215,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
205 __flush_tlb_one(vaddr); 215 __flush_tlb_one(vaddr);
206} 216}
207 217
208void 218void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
209set_pte_vaddr(unsigned long vaddr, pte_t pteval)
210{ 219{
211 pgd_t *pgd; 220 pgd_t *pgd;
212 pud_t *pud_page; 221 pud_t *pud_page;
@@ -223,6 +232,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval)
223 set_pte_vaddr_pud(pud_page, vaddr, pteval); 232 set_pte_vaddr_pud(pud_page, vaddr, pteval);
224} 233}
225 234
235pmd_t * __init populate_extra_pmd(unsigned long vaddr)
236{
237 pgd_t *pgd;
238 pud_t *pud;
239
240 pgd = pgd_offset_k(vaddr);
241 pud = fill_pud(pgd, vaddr);
242 return fill_pmd(pud, vaddr);
243}
244
245pte_t * __init populate_extra_pte(unsigned long vaddr)
246{
247 pmd_t *pmd;
248
249 pmd = populate_extra_pmd(vaddr);
250 return fill_pte(pmd, vaddr);
251}
252
226/* 253/*
227 * Create large page table mappings for a range of physical addresses. 254 * Create large page table mappings for a range of physical addresses.
228 */ 255 */
@@ -291,13 +318,9 @@ void __init cleanup_highmap(void)
291 } 318 }
292} 319}
293 320
294static unsigned long __initdata table_start;
295static unsigned long __meminitdata table_end;
296static unsigned long __meminitdata table_top;
297
298static __ref void *alloc_low_page(unsigned long *phys) 321static __ref void *alloc_low_page(unsigned long *phys)
299{ 322{
300 unsigned long pfn = table_end++; 323 unsigned long pfn = e820_table_end++;
301 void *adr; 324 void *adr;
302 325
303 if (after_bootmem) { 326 if (after_bootmem) {
@@ -307,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
307 return adr; 330 return adr;
308 } 331 }
309 332
310 if (pfn >= table_top) 333 if (pfn >= e820_table_top)
311 panic("alloc_low_page: ran out of memory"); 334 panic("alloc_low_page: ran out of memory");
312 335
313 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); 336 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -547,58 +570,10 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
547 return phys_pud_init(pud, addr, end, page_size_mask); 570 return phys_pud_init(pud, addr, end, page_size_mask);
548} 571}
549 572
550static void __init find_early_table_space(unsigned long end, int use_pse, 573unsigned long __init
551 int use_gbpages) 574kernel_physical_mapping_init(unsigned long start,
552{ 575 unsigned long end,
553 unsigned long puds, pmds, ptes, tables, start; 576 unsigned long page_size_mask)
554
555 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
556 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
557 if (use_gbpages) {
558 unsigned long extra;
559 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
560 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
561 } else
562 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
563 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
564
565 if (use_pse) {
566 unsigned long extra;
567 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
568 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
569 } else
570 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
571 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
572
573 /*
574 * RED-PEN putting page tables only on node 0 could
575 * cause a hotspot and fill up ZONE_DMA. The page tables
576 * need roughly 0.5KB per GB.
577 */
578 start = 0x8000;
579 table_start = find_e820_area(start, end, tables, PAGE_SIZE);
580 if (table_start == -1UL)
581 panic("Cannot find space for the kernel page tables");
582
583 table_start >>= PAGE_SHIFT;
584 table_end = table_start;
585 table_top = table_start + (tables >> PAGE_SHIFT);
586
587 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
588 end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
589}
590
591static void __init init_gbpages(void)
592{
593 if (direct_gbpages && cpu_has_gbpages)
594 printk(KERN_INFO "Using GB pages for direct mapping\n");
595 else
596 direct_gbpages = 0;
597}
598
599static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
600 unsigned long end,
601 unsigned long page_size_mask)
602{ 577{
603 578
604 unsigned long next, last_map_addr = end; 579 unsigned long next, last_map_addr = end;
@@ -635,176 +610,6 @@ static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
635 return last_map_addr; 610 return last_map_addr;
636} 611}
637 612
638struct map_range {
639 unsigned long start;
640 unsigned long end;
641 unsigned page_size_mask;
642};
643
644#define NR_RANGE_MR 5
645
646static int save_mr(struct map_range *mr, int nr_range,
647 unsigned long start_pfn, unsigned long end_pfn,
648 unsigned long page_size_mask)
649{
650
651 if (start_pfn < end_pfn) {
652 if (nr_range >= NR_RANGE_MR)
653 panic("run out of range for init_memory_mapping\n");
654 mr[nr_range].start = start_pfn<<PAGE_SHIFT;
655 mr[nr_range].end = end_pfn<<PAGE_SHIFT;
656 mr[nr_range].page_size_mask = page_size_mask;
657 nr_range++;
658 }
659
660 return nr_range;
661}
662
663/*
664 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
665 * This runs before bootmem is initialized and gets pages directly from
666 * the physical memory. To access them they are temporarily mapped.
667 */
668unsigned long __init_refok init_memory_mapping(unsigned long start,
669 unsigned long end)
670{
671 unsigned long last_map_addr = 0;
672 unsigned long page_size_mask = 0;
673 unsigned long start_pfn, end_pfn;
674 unsigned long pos;
675
676 struct map_range mr[NR_RANGE_MR];
677 int nr_range, i;
678 int use_pse, use_gbpages;
679
680 printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
681
682 /*
683 * Find space for the kernel direct mapping tables.
684 *
685 * Later we should allocate these tables in the local node of the
686 * memory mapped. Unfortunately this is done currently before the
687 * nodes are discovered.
688 */
689 if (!after_bootmem)
690 init_gbpages();
691
692#ifdef CONFIG_DEBUG_PAGEALLOC
693 /*
694 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
695 * This will simplify cpa(), which otherwise needs to support splitting
696 * large pages into small in interrupt context, etc.
697 */
698 use_pse = use_gbpages = 0;
699#else
700 use_pse = cpu_has_pse;
701 use_gbpages = direct_gbpages;
702#endif
703
704 if (use_gbpages)
705 page_size_mask |= 1 << PG_LEVEL_1G;
706 if (use_pse)
707 page_size_mask |= 1 << PG_LEVEL_2M;
708
709 memset(mr, 0, sizeof(mr));
710 nr_range = 0;
711
712 /* head if not big page alignment ?*/
713 start_pfn = start >> PAGE_SHIFT;
714 pos = start_pfn << PAGE_SHIFT;
715 end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
716 << (PMD_SHIFT - PAGE_SHIFT);
717 if (end_pfn > (end >> PAGE_SHIFT))
718 end_pfn = end >> PAGE_SHIFT;
719 if (start_pfn < end_pfn) {
720 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
721 pos = end_pfn << PAGE_SHIFT;
722 }
723
724 /* big page (2M) range*/
725 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
726 << (PMD_SHIFT - PAGE_SHIFT);
727 end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
728 << (PUD_SHIFT - PAGE_SHIFT);
729 if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
730 end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
731 if (start_pfn < end_pfn) {
732 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
733 page_size_mask & (1<<PG_LEVEL_2M));
734 pos = end_pfn << PAGE_SHIFT;
735 }
736
737 /* big page (1G) range */
738 start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
739 << (PUD_SHIFT - PAGE_SHIFT);
740 end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
741 if (start_pfn < end_pfn) {
742 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
743 page_size_mask &
744 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
745 pos = end_pfn << PAGE_SHIFT;
746 }
747
748 /* tail is not big page (1G) alignment */
749 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
750 << (PMD_SHIFT - PAGE_SHIFT);
751 end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
752 if (start_pfn < end_pfn) {
753 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
754 page_size_mask & (1<<PG_LEVEL_2M));
755 pos = end_pfn << PAGE_SHIFT;
756 }
757
758 /* tail is not big page (2M) alignment */
759 start_pfn = pos>>PAGE_SHIFT;
760 end_pfn = end>>PAGE_SHIFT;
761 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
762
763 /* try to merge same page size and continuous */
764 for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
765 unsigned long old_start;
766 if (mr[i].end != mr[i+1].start ||
767 mr[i].page_size_mask != mr[i+1].page_size_mask)
768 continue;
769 /* move it */
770 old_start = mr[i].start;
771 memmove(&mr[i], &mr[i+1],
772 (nr_range - 1 - i) * sizeof (struct map_range));
773 mr[i--].start = old_start;
774 nr_range--;
775 }
776
777 for (i = 0; i < nr_range; i++)
778 printk(KERN_DEBUG " %010lx - %010lx page %s\n",
779 mr[i].start, mr[i].end,
780 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
781 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
782
783 if (!after_bootmem)
784 find_early_table_space(end, use_pse, use_gbpages);
785
786 for (i = 0; i < nr_range; i++)
787 last_map_addr = kernel_physical_mapping_init(
788 mr[i].start, mr[i].end,
789 mr[i].page_size_mask);
790
791 if (!after_bootmem)
792 mmu_cr4_features = read_cr4();
793 __flush_tlb_all();
794
795 if (!after_bootmem && table_end > table_start)
796 reserve_early(table_start << PAGE_SHIFT,
797 table_end << PAGE_SHIFT, "PGTABLE");
798
799 printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
800 last_map_addr, end);
801
802 if (!after_bootmem)
803 early_memtest(start, end);
804
805 return last_map_addr >> PAGE_SHIFT;
806}
807
808#ifndef CONFIG_NUMA 613#ifndef CONFIG_NUMA
809void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) 614void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
810{ 615{
@@ -876,28 +681,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
876 681
877#endif /* CONFIG_MEMORY_HOTPLUG */ 682#endif /* CONFIG_MEMORY_HOTPLUG */
878 683
879/*
880 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
881 * is valid. The argument is a physical page number.
882 *
883 *
884 * On x86, access has to be given to the first megabyte of ram because that area
885 * contains bios code and data regions used by X and dosemu and similar apps.
886 * Access has to be given to non-kernel-ram areas as well, these contain the PCI
887 * mmio resources as well as potential bios/acpi data regions.
888 */
889int devmem_is_allowed(unsigned long pagenr)
890{
891 if (pagenr <= 256)
892 return 1;
893 if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
894 return 0;
895 if (!page_is_ram(pagenr))
896 return 1;
897 return 0;
898}
899
900
901static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, 684static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
902 kcore_modules, kcore_vsyscall; 685 kcore_modules, kcore_vsyscall;
903 686
@@ -947,43 +730,6 @@ void __init mem_init(void)
947 initsize >> 10); 730 initsize >> 10);
948} 731}
949 732
950void free_init_pages(char *what, unsigned long begin, unsigned long end)
951{
952 unsigned long addr = begin;
953
954 if (addr >= end)
955 return;
956
957 /*
958 * If debugging page accesses then do not free this memory but
959 * mark them not present - any buggy init-section access will
960 * create a kernel page fault:
961 */
962#ifdef CONFIG_DEBUG_PAGEALLOC
963 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
964 begin, PAGE_ALIGN(end));
965 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
966#else
967 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
968
969 for (; addr < end; addr += PAGE_SIZE) {
970 ClearPageReserved(virt_to_page(addr));
971 init_page_count(virt_to_page(addr));
972 memset((void *)(addr & ~(PAGE_SIZE-1)),
973 POISON_FREE_INITMEM, PAGE_SIZE);
974 free_page(addr);
975 totalram_pages++;
976 }
977#endif
978}
979
980void free_initmem(void)
981{
982 free_init_pages("unused kernel memory",
983 (unsigned long)(&__init_begin),
984 (unsigned long)(&__init_end));
985}
986
987#ifdef CONFIG_DEBUG_RODATA 733#ifdef CONFIG_DEBUG_RODATA
988const int rodata_test_data = 0xC3; 734const int rodata_test_data = 0xC3;
989EXPORT_SYMBOL_GPL(rodata_test_data); 735EXPORT_SYMBOL_GPL(rodata_test_data);
@@ -1049,13 +795,6 @@ void mark_rodata_ro(void)
1049 795
1050#endif 796#endif
1051 797
1052#ifdef CONFIG_BLK_DEV_INITRD
1053void free_initrd_mem(unsigned long start, unsigned long end)
1054{
1055 free_init_pages("initrd memory", start, end);
1056}
1057#endif
1058
1059int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, 798int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1060 int flags) 799 int flags)
1061{ 800{
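
The new fill_pud()/fill_pmd()/fill_pte() helpers above each allocate a missing lower-level table, link it into the parent entry, and return the slot covering the requested virtual address, which is what lets populate_extra_pmd()/populate_extra_pte() be written as short chains. A toy two-level table showing the same allocate-on-demand pattern (TOP_BITS/LEAF_BITS and fill_entry() are invented for the sketch and stand in for the pgd->pud->pmd->pte walk):

#include <stdio.h>
#include <stdlib.h>

#define TOP_BITS   4
#define LEAF_BITS  4
#define TOP_SIZE   (1u << TOP_BITS)
#define LEAF_SIZE  (1u << LEAF_BITS)

struct leaf { long val[LEAF_SIZE]; };

static struct leaf *top[TOP_SIZE];          /* toy "pgd": pointers to leaf tables */

/* fill_*() pattern: if the intermediate entry is empty, allocate and link a
 * lower-level table, then return a pointer to the slot covering 'idx'. */
static long *fill_entry(unsigned idx)
{
	unsigned ti = (idx >> LEAF_BITS) & (TOP_SIZE - 1);
	unsigned li = idx & (LEAF_SIZE - 1);

	if (!top[ti]) {
		top[ti] = calloc(1, sizeof(struct leaf));
		if (!top[ti])
			abort();
	}
	return &top[ti]->val[li];
}

int main(void)
{
	*fill_entry(0x13) = 42;          /* allocates leaf 1 on demand */
	*fill_entry(0x17) = 43;          /* reuses the same leaf       */
	printf("0x13 -> %ld, allocated leaves: %d\n", *fill_entry(0x13),
	       (top[0] != NULL) + (top[1] != NULL));
	return 0;
}
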
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index f45d5e29a72e..62773abdf088 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -38,8 +38,7 @@ unsigned long __phys_addr(unsigned long x)
38 } else { 38 } else {
39 VIRTUAL_BUG_ON(x < PAGE_OFFSET); 39 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
40 x -= PAGE_OFFSET; 40 x -= PAGE_OFFSET;
41 VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM : 41 VIRTUAL_BUG_ON(!phys_addr_valid(x));
42 !phys_addr_valid(x));
43 } 42 }
44 return x; 43 return x;
45} 44}
@@ -56,10 +55,8 @@ bool __virt_addr_valid(unsigned long x)
56 if (x < PAGE_OFFSET) 55 if (x < PAGE_OFFSET)
57 return false; 56 return false;
58 x -= PAGE_OFFSET; 57 x -= PAGE_OFFSET;
59 if (system_state == SYSTEM_BOOTING ? 58 if (!phys_addr_valid(x))
60 x > MAXMEM : !phys_addr_valid(x)) {
61 return false; 59 return false;
62 }
63 } 60 }
64 61
65 return pfn_valid(x >> PAGE_SHIFT); 62 return pfn_valid(x >> PAGE_SHIFT);
@@ -76,10 +73,9 @@ static inline int phys_addr_valid(unsigned long addr)
76#ifdef CONFIG_DEBUG_VIRTUAL 73#ifdef CONFIG_DEBUG_VIRTUAL
77unsigned long __phys_addr(unsigned long x) 74unsigned long __phys_addr(unsigned long x)
78{ 75{
79 /* VMALLOC_* aren't constants; not available at the boot time */ 76 /* VMALLOC_* aren't constants */
80 VIRTUAL_BUG_ON(x < PAGE_OFFSET); 77 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
81 VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING && 78 VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
82 is_vmalloc_addr((void *) x));
83 return x - PAGE_OFFSET; 79 return x - PAGE_OFFSET;
84} 80}
85EXPORT_SYMBOL(__phys_addr); 81EXPORT_SYMBOL(__phys_addr);
@@ -89,7 +85,7 @@ bool __virt_addr_valid(unsigned long x)
89{ 85{
90 if (x < PAGE_OFFSET) 86 if (x < PAGE_OFFSET)
91 return false; 87 return false;
92 if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x)) 88 if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
93 return false; 89 return false;
94 return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); 90 return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
95} 91}
@@ -348,7 +344,7 @@ EXPORT_SYMBOL(ioremap_nocache);
348 * 344 *
349 * Must be freed with iounmap. 345 * Must be freed with iounmap.
350 */ 346 */
351void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) 347void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
352{ 348{
353 if (pat_enabled) 349 if (pat_enabled)
354 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, 350 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 9cab18b0b857..0bcd7883d036 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -9,44 +9,44 @@
9 9
10#include <asm/e820.h> 10#include <asm/e820.h>
11 11
12static void __init memtest(unsigned long start_phys, unsigned long size, 12static u64 patterns[] __initdata = {
13 unsigned pattern) 13 0,
14 0xffffffffffffffffULL,
15 0x5555555555555555ULL,
16 0xaaaaaaaaaaaaaaaaULL,
17 0x1111111111111111ULL,
18 0x2222222222222222ULL,
19 0x4444444444444444ULL,
20 0x8888888888888888ULL,
21 0x3333333333333333ULL,
22 0x6666666666666666ULL,
23 0x9999999999999999ULL,
24 0xccccccccccccccccULL,
25 0x7777777777777777ULL,
26 0xbbbbbbbbbbbbbbbbULL,
27 0xddddddddddddddddULL,
28 0xeeeeeeeeeeeeeeeeULL,
29 0x7a6c7258554e494cULL, /* yeah ;-) */
30};
31
32static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
14{ 33{
15 unsigned long i; 34 printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
16 unsigned long *start; 35 (unsigned long long) pattern,
17 unsigned long start_bad; 36 (unsigned long long) start_bad,
18 unsigned long last_bad; 37 (unsigned long long) end_bad);
19 unsigned long val; 38 reserve_early(start_bad, end_bad, "BAD RAM");
20 unsigned long start_phys_aligned; 39}
21 unsigned long count;
22 unsigned long incr;
23
24 switch (pattern) {
25 case 0:
26 val = 0UL;
27 break;
28 case 1:
29 val = -1UL;
30 break;
31 case 2:
32#ifdef CONFIG_X86_64
33 val = 0x5555555555555555UL;
34#else
35 val = 0x55555555UL;
36#endif
37 break;
38 case 3:
39#ifdef CONFIG_X86_64
40 val = 0xaaaaaaaaaaaaaaaaUL;
41#else
42 val = 0xaaaaaaaaUL;
43#endif
44 break;
45 default:
46 return;
47 }
48 40
49 incr = sizeof(unsigned long); 41static void __init memtest(u64 pattern, u64 start_phys, u64 size)
42{
43 u64 i, count;
44 u64 *start;
45 u64 start_bad, last_bad;
46 u64 start_phys_aligned;
47 size_t incr;
48
49 incr = sizeof(pattern);
50 start_phys_aligned = ALIGN(start_phys, incr); 50 start_phys_aligned = ALIGN(start_phys, incr);
51 count = (size - (start_phys_aligned - start_phys))/incr; 51 count = (size - (start_phys_aligned - start_phys))/incr;
52 start = __va(start_phys_aligned); 52 start = __va(start_phys_aligned);
@@ -54,25 +54,42 @@ static void __init memtest(unsigned long start_phys, unsigned long size,
54 last_bad = 0; 54 last_bad = 0;
55 55
56 for (i = 0; i < count; i++) 56 for (i = 0; i < count; i++)
57 start[i] = val; 57 start[i] = pattern;
58 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { 58 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
59 if (*start != val) { 59 if (*start == pattern)
60 if (start_phys_aligned == last_bad + incr) { 60 continue;
61 last_bad += incr; 61 if (start_phys_aligned == last_bad + incr) {
62 } else { 62 last_bad += incr;
63 if (start_bad) { 63 continue;
64 printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
65 val, start_bad, last_bad + incr);
66 reserve_early(start_bad, last_bad + incr, "BAD RAM");
67 }
68 start_bad = last_bad = start_phys_aligned;
69 }
70 } 64 }
65 if (start_bad)
66 reserve_bad_mem(pattern, start_bad, last_bad + incr);
67 start_bad = last_bad = start_phys_aligned;
71 } 68 }
72 if (start_bad) { 69 if (start_bad)
73 printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved", 70 reserve_bad_mem(pattern, start_bad, last_bad + incr);
74 val, start_bad, last_bad + incr); 71}
75 reserve_early(start_bad, last_bad + incr, "BAD RAM"); 72
73static void __init do_one_pass(u64 pattern, u64 start, u64 end)
74{
75 u64 size = 0;
76
77 while (start < end) {
78 start = find_e820_area_size(start, &size, 1);
79
80 /* done ? */
81 if (start >= end)
82 break;
83 if (start + size > end)
84 size = end - start;
85
86 printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
87 (unsigned long long) start,
88 (unsigned long long) start + size,
89 (unsigned long long) cpu_to_be64(pattern));
90 memtest(pattern, start, size);
91
92 start += size;
76 } 93 }
77} 94}
78 95
@@ -90,33 +107,22 @@ early_param("memtest", parse_memtest);
90 107
91void __init early_memtest(unsigned long start, unsigned long end) 108void __init early_memtest(unsigned long start, unsigned long end)
92{ 109{
93 u64 t_start, t_size; 110 unsigned int i;
94 unsigned pattern; 111 unsigned int idx = 0;
95 112
96 if (!memtest_pattern) 113 if (!memtest_pattern)
97 return; 114 return;
98 115
99 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); 116 printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
100 for (pattern = 0; pattern < memtest_pattern; pattern++) { 117 for (i = 0; i < memtest_pattern; i++) {
101 t_start = start; 118 idx = i % ARRAY_SIZE(patterns);
102 t_size = 0; 119 do_one_pass(patterns[idx], start, end);
103 while (t_start < end) { 120 }
104 t_start = find_e820_area_size(t_start, &t_size, 1);
105
106 /* done ? */
107 if (t_start >= end)
108 break;
109 if (t_start + t_size > end)
110 t_size = end - t_start;
111
112 printk(KERN_CONT "\n %010llx - %010llx pattern %d",
113 (unsigned long long)t_start,
114 (unsigned long long)t_start + t_size, pattern);
115
116 memtest(t_start, t_size, pattern);
117 121
118 t_start += t_size; 122 if (idx > 0) {
119 } 123 printk(KERN_INFO "early_memtest: wipe out "
124 "test pattern from memory\n");
125 /* additional test with pattern 0 will do this */
126 do_one_pass(0, start, end);
120 } 127 }
121 printk(KERN_CONT "\n");
122} 128}
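
The rewritten memtest() above reduces to: align the start to the pattern width, fill the range with one of the 64-bit patterns, then read it back and coalesce consecutive failing words into a single reported bad range. Below is a minimal user-space sketch of that walk, run against an ordinary malloc()'d buffer instead of e820 ranges; report_bad() and memtest_sketch() are made-up names standing in for reserve_bad_mem() and the kernel's memtest().

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for reserve_bad_mem(): just print the bad range. */
static void report_bad(uint64_t pattern, size_t start_bad, size_t end_bad)
{
        printf("  %016llx bad mem offset %zu - %zu\n",
               (unsigned long long)pattern, start_bad, end_bad);
}

/* Write 'pattern' over 'count' 64-bit words, verify, coalesce bad runs. */
static void memtest_sketch(uint64_t pattern, uint64_t *start, size_t count)
{
        size_t i, off;
        size_t start_bad = 0, last_bad = 0;
        int have_bad = 0;

        for (i = 0; i < count; i++)
                start[i] = pattern;

        for (i = 0, off = 0; i < count; i++, off += sizeof(*start)) {
                if (start[i] == pattern)
                        continue;
                if (have_bad && off == last_bad + sizeof(*start)) {
                        last_bad = off;         /* extend the current bad run */
                        continue;
                }
                if (have_bad)
                        report_bad(pattern, start_bad, last_bad + sizeof(*start));
                start_bad = last_bad = off;
                have_bad = 1;
        }
        if (have_bad)
                report_bad(pattern, start_bad, last_bad + sizeof(*start));
}

int main(void)
{
        size_t count = 1024;
        uint64_t *buf = malloc(count * sizeof(*buf));

        if (!buf)
                return 1;
        memtest_sketch(0x5555555555555555ULL, buf, count);
        free(buf);
        return 0;
}

The kernel version uses start_bad == 0 as its "no bad range yet" marker; the explicit have_bad flag above only makes that bookkeeping visible.
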
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 56fe7124fbec..165829600566 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -4,7 +4,7 @@
4 * Based on code by Ingo Molnar and Andi Kleen, copyrighted 4 * Based on code by Ingo Molnar and Andi Kleen, copyrighted
5 * as follows: 5 * as follows:
6 * 6 *
7 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. 7 * Copyright 2003-2009 Red Hat Inc.
8 * All Rights Reserved. 8 * All Rights Reserved.
9 * Copyright 2005 Andi Kleen, SUSE Labs. 9 * Copyright 2005 Andi Kleen, SUSE Labs.
10 * Copyright 2007 Jiri Kosina, SUSE Labs. 10 * Copyright 2007 Jiri Kosina, SUSE Labs.
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index d1f7439d173c..3daefa04ace5 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -194,7 +194,7 @@ void *alloc_remap(int nid, unsigned long size)
194 size = ALIGN(size, L1_CACHE_BYTES); 194 size = ALIGN(size, L1_CACHE_BYTES);
195 195
196 if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) 196 if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
197 return 0; 197 return NULL;
198 198
199 node_remap_alloc_vaddr[nid] += size; 199 node_remap_alloc_vaddr[nid] += size;
200 memset(allocation, 0, size); 200 memset(allocation, 0, size);
@@ -416,39 +416,14 @@ void __init initmem_init(unsigned long start_pfn,
416 for_each_online_node(nid) 416 for_each_online_node(nid)
417 propagate_e820_map_node(nid); 417 propagate_e820_map_node(nid);
418 418
419 for_each_online_node(nid) 419 for_each_online_node(nid) {
420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); 420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
421 NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
422 }
421 423
422 NODE_DATA(0)->bdata = &bootmem_node_data[0];
423 setup_bootmem_allocator(); 424 setup_bootmem_allocator();
424} 425}
425 426
426void __init set_highmem_pages_init(void)
427{
428#ifdef CONFIG_HIGHMEM
429 struct zone *zone;
430 int nid;
431
432 for_each_zone(zone) {
433 unsigned long zone_start_pfn, zone_end_pfn;
434
435 if (!is_highmem(zone))
436 continue;
437
438 zone_start_pfn = zone->zone_start_pfn;
439 zone_end_pfn = zone_start_pfn + zone->spanned_pages;
440
441 nid = zone_to_nid(zone);
442 printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
443 zone->name, nid, zone_start_pfn, zone_end_pfn);
444
445 add_highpages_with_active_regions(nid, zone_start_pfn,
446 zone_end_pfn);
447 }
448 totalram_pages += totalhigh_pages;
449#endif
450}
451
452#ifdef CONFIG_MEMORY_HOTPLUG 427#ifdef CONFIG_MEMORY_HOTPLUG
453static int paddr_to_nid(u64 addr) 428static int paddr_to_nid(u64 addr)
454{ 429{
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index f3516da035d1..64c9cf043cdd 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,6 +20,12 @@
20#include <asm/acpi.h> 20#include <asm/acpi.h>
21#include <asm/k8.h> 21#include <asm/k8.h>
22 22
23#ifdef CONFIG_DEBUG_PER_CPU_MAPS
24# define DBG(x...) printk(KERN_DEBUG x)
25#else
26# define DBG(x...)
27#endif
28
23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 29struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
24EXPORT_SYMBOL(node_data); 30EXPORT_SYMBOL(node_data);
25 31
@@ -33,6 +39,21 @@ int numa_off __initdata;
33static unsigned long __initdata nodemap_addr; 39static unsigned long __initdata nodemap_addr;
34static unsigned long __initdata nodemap_size; 40static unsigned long __initdata nodemap_size;
35 41
42DEFINE_PER_CPU(int, node_number) = 0;
43EXPORT_PER_CPU_SYMBOL(node_number);
44
45/*
46 * Map cpu index to node index
47 */
48DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
49EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
50
51/*
52 * Which logical CPUs are on which nodes
53 */
54cpumask_t *node_to_cpumask_map;
55EXPORT_SYMBOL(node_to_cpumask_map);
56
36/* 57/*
37 * Given a shift value, try to populate memnodemap[] 58 * Given a shift value, try to populate memnodemap[]
38 * Returns : 59 * Returns :
@@ -640,3 +661,199 @@ void __init init_cpu_to_node(void)
640#endif 661#endif
641 662
642 663
664/*
665 * Allocate node_to_cpumask_map based on number of available nodes
666 * Requires node_possible_map to be valid.
667 *
668 * Note: node_to_cpumask() is not valid until after this is done.
669 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
670 */
671void __init setup_node_to_cpumask_map(void)
672{
673 unsigned int node, num = 0;
674 cpumask_t *map;
675
676 /* setup nr_node_ids if not done yet */
677 if (nr_node_ids == MAX_NUMNODES) {
678 for_each_node_mask(node, node_possible_map)
679 num = node;
680 nr_node_ids = num + 1;
681 }
682
683 /* allocate the map */
684 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
685 DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids);
686
687 pr_debug("Node to cpumask map at %p for %d nodes\n",
688 map, nr_node_ids);
689
690 /* node_to_cpumask() will now work */
691 node_to_cpumask_map = map;
692}
693
694void __cpuinit numa_set_node(int cpu, int node)
695{
696 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
697
698 /* early setting, no percpu area yet */
699 if (cpu_to_node_map) {
700 cpu_to_node_map[cpu] = node;
701 return;
702 }
703
704#ifdef CONFIG_DEBUG_PER_CPU_MAPS
705 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
706 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
707 dump_stack();
708 return;
709 }
710#endif
711 per_cpu(x86_cpu_to_node_map, cpu) = node;
712
713 if (node != NUMA_NO_NODE)
714 per_cpu(node_number, cpu) = node;
715}
716
717void __cpuinit numa_clear_node(int cpu)
718{
719 numa_set_node(cpu, NUMA_NO_NODE);
720}
721
722#ifndef CONFIG_DEBUG_PER_CPU_MAPS
723
724void __cpuinit numa_add_cpu(int cpu)
725{
726 cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
727}
728
729void __cpuinit numa_remove_cpu(int cpu)
730{
731 cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
732}
733
734#else /* CONFIG_DEBUG_PER_CPU_MAPS */
735
736/*
737 * --------- debug versions of the numa functions ---------
738 */
739static void __cpuinit numa_set_cpumask(int cpu, int enable)
740{
741 int node = early_cpu_to_node(cpu);
742 cpumask_t *mask;
743 char buf[64];
744
745 if (node_to_cpumask_map == NULL) {
746 printk(KERN_ERR "node_to_cpumask_map NULL\n");
747 dump_stack();
748 return;
749 }
750
751 mask = &node_to_cpumask_map[node];
752 if (enable)
753 cpu_set(cpu, *mask);
754 else
755 cpu_clear(cpu, *mask);
756
757 cpulist_scnprintf(buf, sizeof(buf), mask);
758 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
759 enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
760}
761
762void __cpuinit numa_add_cpu(int cpu)
763{
764 numa_set_cpumask(cpu, 1);
765}
766
767void __cpuinit numa_remove_cpu(int cpu)
768{
769 numa_set_cpumask(cpu, 0);
770}
771
772int cpu_to_node(int cpu)
773{
774 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
775 printk(KERN_WARNING
776 "cpu_to_node(%d): usage too early!\n", cpu);
777 dump_stack();
778 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
779 }
780 return per_cpu(x86_cpu_to_node_map, cpu);
781}
782EXPORT_SYMBOL(cpu_to_node);
783
784/*
785 * Same function as cpu_to_node() but used if called before the
786 * per_cpu areas are set up.
787 */
788int early_cpu_to_node(int cpu)
789{
790 if (early_per_cpu_ptr(x86_cpu_to_node_map))
791 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
792
793 if (!cpu_possible(cpu)) {
794 printk(KERN_WARNING
795 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
796 dump_stack();
797 return NUMA_NO_NODE;
798 }
799 return per_cpu(x86_cpu_to_node_map, cpu);
800}
801
802
803/* empty cpumask */
804static const cpumask_t cpu_mask_none;
805
806/*
807 * Returns a pointer to the bitmask of CPUs on Node 'node'.
808 */
809const cpumask_t *cpumask_of_node(int node)
810{
811 if (node_to_cpumask_map == NULL) {
812 printk(KERN_WARNING
813 "cpumask_of_node(%d): no node_to_cpumask_map!\n",
814 node);
815 dump_stack();
816 return (const cpumask_t *)&cpu_online_map;
817 }
818 if (node >= nr_node_ids) {
819 printk(KERN_WARNING
820 "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
821 node, nr_node_ids);
822 dump_stack();
823 return &cpu_mask_none;
824 }
825 return &node_to_cpumask_map[node];
826}
827EXPORT_SYMBOL(cpumask_of_node);
828
829/*
830 * Returns a bitmask of CPUs on Node 'node'.
831 *
832 * Side note: this function creates the returned cpumask on the stack,
833 * so with a high NR_CPUS count it uses excessive stack space. The
834 * node_to_cpumask_ptr function should be used whenever possible.
835 */
836cpumask_t node_to_cpumask(int node)
837{
838 if (node_to_cpumask_map == NULL) {
839 printk(KERN_WARNING
840 "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
841 dump_stack();
842 return cpu_online_map;
843 }
844 if (node >= nr_node_ids) {
845 printk(KERN_WARNING
846 "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
847 node, nr_node_ids);
848 dump_stack();
849 return cpu_mask_none;
850 }
851 return node_to_cpumask_map[node];
852}
853EXPORT_SYMBOL(node_to_cpumask);
854
855/*
856 * --------- end of debug versions of the numa functions ---------
857 */
858
859#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
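
setup_node_to_cpumask_map() above sizes nr_node_ids as "highest possible node + 1" and then allocates one cpumask per node; the debug variants refuse to touch the map when the node index is out of range. A self-contained sketch of that sizing and range-checking, with a plain 64-bit word standing in for both node_possible_map and cpumask_t (every *_sketch name is invented for illustration):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_NODES_SKETCH 64

/* Stand-in for node_possible_map: one bit per possible node. */
static uint64_t node_possible_sketch;

/* Stand-in for node_to_cpumask_map: one CPU bitmask per node. */
static uint64_t *node_to_cpumask_sketch;
static unsigned int nr_node_ids_sketch = MAX_NODES_SKETCH;

static void setup_node_to_cpumask_sketch(void)
{
        unsigned int node, highest = 0;

        /* nr_node_ids = highest possible node + 1, as in the kernel code. */
        if (nr_node_ids_sketch == MAX_NODES_SKETCH) {
                for (node = 0; node < MAX_NODES_SKETCH; node++)
                        if (node_possible_sketch & (1ULL << node))
                                highest = node;
                nr_node_ids_sketch = highest + 1;
        }

        node_to_cpumask_sketch =
                calloc(nr_node_ids_sketch, sizeof(*node_to_cpumask_sketch));
        printf("map allocated for %u nodes\n", nr_node_ids_sketch);
}

static void numa_add_cpu_sketch(unsigned int cpu, unsigned int node)
{
        /* Debug-style range check before touching the map. */
        if (!node_to_cpumask_sketch || node >= nr_node_ids_sketch) {
                fprintf(stderr, "bad node %u for cpu %u\n", node, cpu);
                return;
        }
        node_to_cpumask_sketch[node] |= 1ULL << cpu;
}

int main(void)
{
        node_possible_sketch = (1ULL << 0) | (1ULL << 2);  /* nodes 0 and 2 */
        setup_node_to_cpumask_sketch();                    /* nr_node_ids = 3 */
        numa_add_cpu_sketch(0, 0);
        numa_add_cpu_sketch(1, 2);
        numa_add_cpu_sketch(2, 5);                         /* rejected */
        free(node_to_cpumask_sketch);
        return 0;
}
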
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 7be47d1a97e4..8253bc97587e 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -482,6 +482,13 @@ static int split_large_page(pte_t *kpte, unsigned long address)
482 pbase = (pte_t *)page_address(base); 482 pbase = (pte_t *)page_address(base);
483 paravirt_alloc_pte(&init_mm, page_to_pfn(base)); 483 paravirt_alloc_pte(&init_mm, page_to_pfn(base));
484 ref_prot = pte_pgprot(pte_clrhuge(*kpte)); 484 ref_prot = pte_pgprot(pte_clrhuge(*kpte));
485 /*
486 * If we ever want to utilize the PAT bit, we need to
487 * update this function to make sure it's converted from
488 * bit 12 to bit 7 when we cross from the 2MB level to
489 * the 4K level:
490 */
491 WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE);
485 492
486#ifdef CONFIG_X86_64 493#ifdef CONFIG_X86_64
487 if (level == PG_LEVEL_1G) { 494 if (level == PG_LEVEL_1G) {
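
The WARN_ON_ONCE() added to split_large_page() is about the PAT bit position: in a 2MB/1GB entry the PAT bit sits at bit 12 (bit 7 is the PS bit there), while in a 4K PTE it sits at bit 7, so protections copied from a large page would need translating if that bit were ever set. A small sketch of the bit positions and the conversion the new comment alludes to (the mask names are local to the sketch, not the kernel's _PAGE_* macros):

#include <stdint.h>
#include <stdio.h>

/* x86 PAT bit: bit 7 in a 4K PTE, bit 12 in a 2MB/1GB entry. */
#define PAT_4K          (1ULL << 7)
#define PAT_LARGE       (1ULL << 12)

/* Translate large-page protections into 4K PTE protections. */
static uint64_t large_prot_to_4k(uint64_t prot)
{
        if (prot & PAT_LARGE) {
                prot &= ~PAT_LARGE;
                prot |= PAT_4K;
        }
        return prot;
}

int main(void)
{
        uint64_t prot = PAT_LARGE | 0x3;        /* PAT set, present+rw */

        printf("large: %#llx -> 4k: %#llx\n",
               (unsigned long long)prot,
               (unsigned long long)large_prot_to_4k(prot));
        return 0;
}
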
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index e0ab173b6974..2ed37158012d 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -31,7 +31,7 @@
31#ifdef CONFIG_X86_PAT 31#ifdef CONFIG_X86_PAT
32int __read_mostly pat_enabled = 1; 32int __read_mostly pat_enabled = 1;
33 33
34void __cpuinit pat_disable(char *reason) 34void __cpuinit pat_disable(const char *reason)
35{ 35{
36 pat_enabled = 0; 36 pat_enabled = 0;
37 printk(KERN_INFO "%s\n", reason); 37 printk(KERN_INFO "%s\n", reason);
@@ -43,6 +43,11 @@ static int __init nopat(char *str)
43 return 0; 43 return 0;
44} 44}
45early_param("nopat", nopat); 45early_param("nopat", nopat);
46#else
47static inline void pat_disable(const char *reason)
48{
49 (void)reason;
50}
46#endif 51#endif
47 52
48 53
@@ -79,16 +84,20 @@ void pat_init(void)
79 if (!pat_enabled) 84 if (!pat_enabled)
80 return; 85 return;
81 86
82 /* Paranoia check. */ 87 if (!cpu_has_pat) {
83 if (!cpu_has_pat && boot_pat_state) { 88 if (!boot_pat_state) {
84 /* 89 pat_disable("PAT not supported by CPU.");
85 * If this happens we are on a secondary CPU, but 90 return;
86 * switched to PAT on the boot CPU. We have no way to 91 } else {
87 * undo PAT. 92 /*
88 */ 93 * If this happens we are on a secondary CPU, but
89 printk(KERN_ERR "PAT enabled, " 94 * switched to PAT on the boot CPU. We have no way to
90 "but not supported by secondary CPU\n"); 95 * undo PAT.
91 BUG(); 96 */
97 printk(KERN_ERR "PAT enabled, "
98 "but not supported by secondary CPU\n");
99 BUG();
100 }
92 } 101 }
93 102
94 /* Set PWT to Write-Combining. All other bits stay the same */ 103 /* Set PWT to Write-Combining. All other bits stay the same */
@@ -626,6 +635,33 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
626} 635}
627 636
628/* 637/*
638 * Change the memory type for the physical address range in kernel identity
639 * mapping space if that range is a part of identity map.
640 */
641int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
642{
643 unsigned long id_sz;
644
645 if (!pat_enabled || base >= __pa(high_memory))
646 return 0;
647
648 id_sz = (__pa(high_memory) < base + size) ?
649 __pa(high_memory) - base :
650 size;
651
652 if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) {
653 printk(KERN_INFO
654 "%s:%d ioremap_change_attr failed %s "
655 "for %Lx-%Lx\n",
656 current->comm, current->pid,
657 cattr_name(flags),
658 base, (unsigned long long)(base + size));
659 return -EINVAL;
660 }
661 return 0;
662}
663
664/*
629 * Internal interface to reserve a range of physical memory with prot. 665 * Internal interface to reserve a range of physical memory with prot.
630 * Reserved non RAM regions only and after successful reserve_memtype, 666 * Reserved non RAM regions only and after successful reserve_memtype,
631 * this func also keeps identity mapping (if any) in sync with this new prot. 667 * this func also keeps identity mapping (if any) in sync with this new prot.
@@ -634,7 +670,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
634 int strict_prot) 670 int strict_prot)
635{ 671{
636 int is_ram = 0; 672 int is_ram = 0;
637 int id_sz, ret; 673 int ret;
638 unsigned long flags; 674 unsigned long flags;
639 unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); 675 unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK);
640 676
@@ -671,23 +707,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
671 flags); 707 flags);
672 } 708 }
673 709
674 /* Need to keep identity mapping in sync */ 710 if (kernel_map_sync_memtype(paddr, size, flags) < 0) {
675 if (paddr >= __pa(high_memory))
676 return 0;
677
678 id_sz = (__pa(high_memory) < paddr + size) ?
679 __pa(high_memory) - paddr :
680 size;
681
682 if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) {
683 free_memtype(paddr, paddr + size); 711 free_memtype(paddr, paddr + size);
684 printk(KERN_ERR
685 "%s:%d reserve_pfn_range ioremap_change_attr failed %s "
686 "for %Lx-%Lx\n",
687 current->comm, current->pid,
688 cattr_name(flags),
689 (unsigned long long)paddr,
690 (unsigned long long)(paddr + size));
691 return -EINVAL; 712 return -EINVAL;
692 } 713 }
693 return 0; 714 return 0;
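
kernel_map_sync_memtype() only changes attributes for the part of [base, base + size) that lies inside the kernel identity map, i.e. below __pa(high_memory); anything above that boundary is simply ignored. A standalone sketch of that clamping step, with high_mem passed in as a plain parameter rather than read from the kernel symbol:

#include <stdint.h>
#include <stdio.h>

/*
 * Return how many bytes of [base, base + size) lie below 'high_mem',
 * i.e. the slice an identity-map attribute change would have to cover.
 */
static uint64_t id_map_slice(uint64_t base, uint64_t size, uint64_t high_mem)
{
        if (base >= high_mem)
                return 0;                       /* entirely above the identity map */
        if (high_mem < base + size)
                return high_mem - base;         /* clamp at the top */
        return size;                            /* fully covered */
}

int main(void)
{
        uint64_t high_mem = 0x38000000ULL;      /* example: 896MB of lowmem */

        printf("%llx\n", (unsigned long long)id_map_slice(0x37F00000, 0x200000, high_mem));
        printf("%llx\n", (unsigned long long)id_map_slice(0x40000000, 0x100000, high_mem));
        return 0;
}
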
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 86f2ffc43c3d..5b7c7c8464fe 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -313,6 +313,24 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
313 return young; 313 return young;
314} 314}
315 315
316/**
317 * reserve_top_address - reserves a hole in the top of kernel address space
318 * @reserve - size of hole to reserve
319 *
320 * Can be used to relocate the fixmap area and poke a hole in the top
321 * of kernel address space to make room for a hypervisor.
322 */
323void __init reserve_top_address(unsigned long reserve)
324{
325#ifdef CONFIG_X86_32
326 BUG_ON(fixmaps_set > 0);
327 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
328 (int)-reserve);
329 __FIXADDR_TOP = -reserve - PAGE_SIZE;
330 __VMALLOC_RESERVE += reserve;
331#endif
332}
333
316int fixmaps_set; 334int fixmaps_set;
317 335
318void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) 336void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
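
reserve_top_address(), now shared via pgtable.c, carves 'reserve' bytes out of the top of the 32-bit address space: __FIXADDR_TOP drops to -reserve - PAGE_SIZE (in 32-bit unsigned arithmetic, -reserve is 4GB - reserve) and the vmalloc reservation grows by the same amount. A sketch of just that arithmetic, using an example 256MB hypervisor hole:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE_SKETCH 4096u

int main(void)
{
        uint32_t reserve = 256u << 20;          /* example: 256MB hole */
        uint32_t fixaddr_top;

        /* -reserve in 32-bit unsigned arithmetic is 4GB - reserve. */
        fixaddr_top = (uint32_t)-reserve - PAGE_SIZE_SKETCH;

        printf("reserving above 0x%08x, __FIXADDR_TOP = 0x%08x\n",
               (uint32_t)-reserve, fixaddr_top);
        return 0;
}
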
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 0951db9ee519..f2e477c91c1b 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -20,6 +20,8 @@
20#include <asm/tlb.h> 20#include <asm/tlb.h>
21#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
22 22
23unsigned int __VMALLOC_RESERVE = 128 << 20;
24
23/* 25/*
24 * Associate a virtual page frame with a given physical page frame 26 * Associate a virtual page frame with a given physical page frame
25 * and protection flags for that frame. 27 * and protection flags for that frame.
@@ -97,22 +99,6 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
97unsigned long __FIXADDR_TOP = 0xfffff000; 99unsigned long __FIXADDR_TOP = 0xfffff000;
98EXPORT_SYMBOL(__FIXADDR_TOP); 100EXPORT_SYMBOL(__FIXADDR_TOP);
99 101
100/**
101 * reserve_top_address - reserves a hole in the top of kernel address space
102 * @reserve - size of hole to reserve
103 *
104 * Can be used to relocate the fixmap area and poke a hole in the top
105 * of kernel address space to make room for a hypervisor.
106 */
107void __init reserve_top_address(unsigned long reserve)
108{
109 BUG_ON(fixmaps_set > 0);
110 printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
111 (int)-reserve);
112 __FIXADDR_TOP = -reserve - PAGE_SIZE;
113 __VMALLOC_RESERVE += reserve;
114}
115
116/* 102/*
117 * vmalloc=size forces the vmalloc area to be exactly 'size' 103 * vmalloc=size forces the vmalloc area to be exactly 'size'
118 * bytes. This can be used to increase (or decrease) the 104 * bytes. This can be used to increase (or decrease) the
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 09737c8af074..574c8bc95ef0 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -20,7 +20,8 @@
20#include <asm/proto.h> 20#include <asm/proto.h>
21#include <asm/numa.h> 21#include <asm/numa.h>
22#include <asm/e820.h> 22#include <asm/e820.h>
23#include <asm/genapic.h> 23#include <asm/apic.h>
24#include <asm/uv/uv.h>
24 25
25int acpi_numa __initdata; 26int acpi_numa __initdata;
26 27
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
new file mode 100644
index 000000000000..a654d59e4483
--- /dev/null
+++ b/arch/x86/mm/tlb.c
@@ -0,0 +1,295 @@
1#include <linux/init.h>
2
3#include <linux/mm.h>
4#include <linux/spinlock.h>
5#include <linux/smp.h>
6#include <linux/interrupt.h>
7#include <linux/module.h>
8
9#include <asm/tlbflush.h>
10#include <asm/mmu_context.h>
11#include <asm/apic.h>
12#include <asm/uv/uv.h>
13
14DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
15 = { &init_mm, 0, };
16
17/*
18 * Smarter SMP flushing macros.
19 * c/o Linus Torvalds.
20 *
21 * These mean you can really definitely utterly forget about
22 * writing to user space from interrupts. (It's not allowed anyway.)
23 *
24 * Optimizations Manfred Spraul <manfred@colorfullife.com>
25 *
26 * More scalable flush, from Andi Kleen
27 *
28 * To avoid global state use 8 different call vectors.
29 * Each CPU uses a specific vector to trigger flushes on other
30 * CPUs. Depending on the received vector the target CPUs look into
31 * the right array slot for the flush data.
32 *
33 * With more than 8 CPUs they are hashed to the 8 available
34 * vectors. The limited global vector space forces us to this right now.
35 * In future when interrupts are split into per CPU domains this could be
36 * fixed, at the cost of triggering multiple IPIs in some cases.
37 */
38
39union smp_flush_state {
40 struct {
41 struct mm_struct *flush_mm;
42 unsigned long flush_va;
43 spinlock_t tlbstate_lock;
44 DECLARE_BITMAP(flush_cpumask, NR_CPUS);
45 };
46 char pad[CONFIG_X86_INTERNODE_CACHE_BYTES];
47} ____cacheline_internodealigned_in_smp;
48
49/* State is put into the per CPU data section, but padded
50 to a full cache line because other CPUs can access it and we don't
51 want false sharing in the per cpu data segment. */
52static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
53
54/*
55 * We cannot call mmdrop() because we are in interrupt context;
56 * instead we update mm->cpu_vm_mask.
57 */
58void leave_mm(int cpu)
59{
60 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
61 BUG();
62 cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
63 load_cr3(swapper_pg_dir);
64}
65EXPORT_SYMBOL_GPL(leave_mm);
66
67/*
68 *
69 * The flush IPI assumes that a thread switch happens in this order:
70 * [cpu0: the cpu that switches]
71 * 1) switch_mm() either 1a) or 1b)
72 * 1a) thread switch to a different mm
73 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
74 * Stop ipi delivery for the old mm. This is not synchronized with
75 * the other cpus, but smp_invalidate_interrupt ignores flush ipis
76 * for the wrong mm, and in the worst case we perform a superfluous
77 * tlb flush.
78 * 1a2) set cpu mmu_state to TLBSTATE_OK
79 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
80 * was in lazy tlb mode.
81 * 1a3) update cpu active_mm
82 * Now cpu0 accepts tlb flushes for the new mm.
83 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
84 * Now the other cpus will send tlb flush ipis.
85 * 1a5) change cr3.
86 * 1b) thread switch without mm change
87 * cpu active_mm is correct, cpu0 already handles
88 * flush ipis.
89 * 1b1) set cpu mmu_state to TLBSTATE_OK
90 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
91 * Atomically set the bit [other cpus will start sending flush ipis],
92 * and test the bit.
93 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
94 * 2) switch %%esp, ie current
95 *
96 * The interrupt must handle 2 special cases:
97 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
98 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
99 * runs in kernel space, the cpu could load tlb entries for user space
100 * pages.
101 *
102 * The good news is that cpu mmu_state is local to each cpu, no
103 * write/read ordering problems.
104 */
105
106/*
107 * TLB flush IPI:
108 *
109 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
110 * 2) Leave the mm if we are in the lazy tlb mode.
111 *
112 * Interrupts are disabled.
113 */
114
115/*
116 * FIXME: use of asmlinkage is not consistent. On x86_64 it's a noop,
117 * kept only for documentation purposes. On x86_32, asmlinkage is
118 * regparm(0) but interrupt
119 * entry calls in with the first parameter in %eax. Maybe define
120 * intrlinkage?
121 */
122#ifdef CONFIG_X86_64
123asmlinkage
124#endif
125void smp_invalidate_interrupt(struct pt_regs *regs)
126{
127 unsigned int cpu;
128 unsigned int sender;
129 union smp_flush_state *f;
130
131 cpu = smp_processor_id();
132 /*
133 * orig_rax contains the negated interrupt vector.
134 * Use that to determine where the sender put the data.
135 */
136 sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
137 f = &flush_state[sender];
138
139 if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
140 goto out;
141 /*
142 * This was a BUG() but until someone can quote me the
143 * line from the intel manual that guarantees an IPI to
144 * multiple CPUs is retried _only_ on the erroring CPUs,
145 * it's staying as a return.
146 *
147 * BUG();
148 */
149
150 if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
151 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
152 if (f->flush_va == TLB_FLUSH_ALL)
153 local_flush_tlb();
154 else
155 __flush_tlb_one(f->flush_va);
156 } else
157 leave_mm(cpu);
158 }
159out:
160 ack_APIC_irq();
161 smp_mb__before_clear_bit();
162 cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
163 smp_mb__after_clear_bit();
164 inc_irq_stat(irq_tlb_count);
165}
166
167static void flush_tlb_others_ipi(const struct cpumask *cpumask,
168 struct mm_struct *mm, unsigned long va)
169{
170 unsigned int sender;
171 union smp_flush_state *f;
172
173 /* Caller has disabled preemption */
174 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
175 f = &flush_state[sender];
176
177 /*
178 * Could avoid this lock when
179 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
180 * probably not worth checking this for a cache-hot lock.
181 */
182 spin_lock(&f->tlbstate_lock);
183
184 f->flush_mm = mm;
185 f->flush_va = va;
186 cpumask_andnot(to_cpumask(f->flush_cpumask),
187 cpumask, cpumask_of(smp_processor_id()));
188
189 /*
190 * Make the above memory operations globally visible before
191 * sending the IPI.
192 */
193 smp_mb();
194 /*
195 * We have to send the IPI only to
196 * CPUs affected.
197 */
198 apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
199 INVALIDATE_TLB_VECTOR_START + sender);
200
201 while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
202 cpu_relax();
203
204 f->flush_mm = NULL;
205 f->flush_va = 0;
206 spin_unlock(&f->tlbstate_lock);
207}
208
209void native_flush_tlb_others(const struct cpumask *cpumask,
210 struct mm_struct *mm, unsigned long va)
211{
212 if (is_uv_system()) {
213 unsigned int cpu;
214
215 cpu = get_cpu();
216 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
217 if (cpumask)
218 flush_tlb_others_ipi(cpumask, mm, va);
219 put_cpu();
220 return;
221 }
222 flush_tlb_others_ipi(cpumask, mm, va);
223}
224
225static int __cpuinit init_smp_flush(void)
226{
227 int i;
228
229 for (i = 0; i < ARRAY_SIZE(flush_state); i++)
230 spin_lock_init(&flush_state[i].tlbstate_lock);
231
232 return 0;
233}
234core_initcall(init_smp_flush);
235
236void flush_tlb_current_task(void)
237{
238 struct mm_struct *mm = current->mm;
239
240 preempt_disable();
241
242 local_flush_tlb();
243 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
244 flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
245 preempt_enable();
246}
247
248void flush_tlb_mm(struct mm_struct *mm)
249{
250 preempt_disable();
251
252 if (current->active_mm == mm) {
253 if (current->mm)
254 local_flush_tlb();
255 else
256 leave_mm(smp_processor_id());
257 }
258 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
259 flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
260
261 preempt_enable();
262}
263
264void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
265{
266 struct mm_struct *mm = vma->vm_mm;
267
268 preempt_disable();
269
270 if (current->active_mm == mm) {
271 if (current->mm)
272 __flush_tlb_one(va);
273 else
274 leave_mm(smp_processor_id());
275 }
276
277 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
278 flush_tlb_others(&mm->cpu_vm_mask, mm, va);
279
280 preempt_enable();
281}
282
283static void do_flush_tlb_all(void *info)
284{
285 unsigned long cpu = smp_processor_id();
286
287 __flush_tlb_all();
288 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
289 leave_mm(cpu);
290}
291
292void flush_tlb_all(void)
293{
294 on_each_cpu(do_flush_tlb_all, NULL, 1);
295}
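
The new tlb.c hashes the sending CPU onto one of NUM_INVALIDATE_TLB_VECTORS (8) per-vector flush_state slots and builds the target mask as "requested CPUs minus the sender" before raising the IPI. A toy, single-threaded model of that slot selection and mask construction, with a 64-bit word standing in for a cpumask (all names are local to the sketch):

#include <stdint.h>
#include <stdio.h>

#define NUM_FLUSH_VECTORS_SKETCH 8

struct flush_slot_sketch {
        uint64_t target_cpus;   /* who still has to acknowledge the flush */
        uint64_t flush_va;      /* address being flushed, or "all" */
};

static struct flush_slot_sketch slots[NUM_FLUSH_VECTORS_SKETCH];

/* Pick the slot/vector a given sender CPU would use (cpu % 8). */
static unsigned int pick_slot(unsigned int sender_cpu)
{
        return sender_cpu % NUM_FLUSH_VECTORS_SKETCH;
}

/* Fill a slot: requested CPUs minus the sender itself (cpumask_andnot). */
static void queue_flush(unsigned int sender_cpu, uint64_t requested, uint64_t va)
{
        struct flush_slot_sketch *s = &slots[pick_slot(sender_cpu)];

        s->target_cpus = requested & ~(1ULL << sender_cpu);
        s->flush_va = va;
}

int main(void)
{
        /* CPU 10 asks CPUs {2, 10, 11} to flush one page. */
        queue_flush(10, (1ULL << 2) | (1ULL << 10) | (1ULL << 11), 0x7f0000001000ULL);

        printf("slot %u targets %#llx\n", pick_slot(10),
               (unsigned long long)slots[pick_slot(10)].target_cpus);
        return 0;
}

The real code additionally serializes each slot with tlbstate_lock and spins until every target CPU clears its bit from flush_cpumask in the IPI handler; the model above only shows how the slot and target mask are derived.
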