path: root/arch/x86/mm
author    Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
committer Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
commit    c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree      ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /arch/x86/mm
parent    ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent    6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

Merge branch 'mpi-master' into wip-k-fmlp (wip-k-fmlp)

Conflicts:
	litmus/sched_cedf.c
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/Makefile                                                  |   7
-rw-r--r--  arch/x86/mm/amdtopology.c (renamed from arch/x86/mm/k8topology_64.c)  | 122
-rw-r--r--  arch/x86/mm/fault.c                                                   | 148
-rw-r--r--  arch/x86/mm/gup.c                                                     |  28
-rw-r--r--  arch/x86/mm/highmem_32.c                                              |  76
-rw-r--r--  arch/x86/mm/hugetlbpage.c                                             |   6
-rw-r--r--  arch/x86/mm/init.c                                                    |  85
-rw-r--r--  arch/x86/mm/init_32.c                                                 | 202
-rw-r--r--  arch/x86/mm/init_64.c                                                 | 239
-rw-r--r--  arch/x86/mm/iomap_32.c                                                |  43
-rw-r--r--  arch/x86/mm/ioremap.c                                                 |  19
-rw-r--r--  arch/x86/mm/kmemcheck/error.c                                         |   2
-rw-r--r--  arch/x86/mm/kmemcheck/kmemcheck.c                                     |   2
-rw-r--r--  arch/x86/mm/kmemcheck/opcode.c                                        |   2
-rw-r--r--  arch/x86/mm/memblock.c                                                | 348
-rw-r--r--  arch/x86/mm/memtest.c                                                 |   7
-rw-r--r--  arch/x86/mm/numa.c                                                    | 773
-rw-r--r--  arch/x86/mm/numa_32.c                                                 | 400
-rw-r--r--  arch/x86/mm/numa_64.c                                                 | 882
-rw-r--r--  arch/x86/mm/numa_emulation.c                                          | 492
-rw-r--r--  arch/x86/mm/numa_internal.h                                           |  39
-rw-r--r--  arch/x86/mm/pageattr.c                                                |  45
-rw-r--r--  arch/x86/mm/pf_in.c                                                   |  14
-rw-r--r--  arch/x86/mm/pgtable.c                                                 | 104
-rw-r--r--  arch/x86/mm/setup_nx.c                                                |   2
-rw-r--r--  arch/x86/mm/srat.c                                                    | 184
-rw-r--r--  arch/x86/mm/srat_32.c                                                 | 285
-rw-r--r--  arch/x86/mm/srat_64.c                                                 | 564
-rw-r--r--  arch/x86/mm/tlb.c                                                     |  63
29 files changed, 2562 insertions(+), 2621 deletions(-)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index a4c768397baa..3d11327c9ab4 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -23,7 +23,10 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
 obj-$(CONFIG_MMIOTRACE_TEST)	+= testmmiotrace.o
 
 obj-$(CONFIG_NUMA)		+= numa.o numa_$(BITS).o
-obj-$(CONFIG_K8_NUMA)		+= k8topology_64.o
-obj-$(CONFIG_ACPI_NUMA)		+= srat_$(BITS).o
+obj-$(CONFIG_AMD_NUMA)		+= amdtopology.o
+obj-$(CONFIG_ACPI_NUMA)		+= srat.o
+obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
+
+obj-$(CONFIG_HAVE_MEMBLOCK)	+= memblock.o
 
 obj-$(CONFIG_MEMTEST)		+= memtest.o
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/amdtopology.c
index 970ed579d4e4..5247d01329ca 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/amdtopology.c
@@ -1,8 +1,8 @@
 /*
- * AMD K8 NUMA support.
+ * AMD NUMA support.
  * Discover the memory map and associated nodes.
  *
- * This version reads it directly from the K8 northbridge.
+ * This version reads it directly from the AMD northbridge.
  *
  * Copyright 2002,2003 Andi Kleen, SuSE Labs.
  */
@@ -11,6 +11,9 @@
 #include <linux/string.h>
 #include <linux/module.h>
 #include <linux/nodemask.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
+
 #include <asm/io.h>
 #include <linux/pci_ids.h>
 #include <linux/acpi.h>
@@ -22,10 +25,9 @@
 #include <asm/numa.h>
 #include <asm/mpspec.h>
 #include <asm/apic.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
 
-static struct bootnode __initdata nodes[8];
-static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
+static unsigned char __initdata nodeids[8];
 
 static __init int find_northbridge(void)
 {
@@ -48,14 +50,14 @@ static __init int find_northbridge(void)
 		return num;
 	}
 
-	return -1;
+	return -ENOENT;
 }
 
 static __init void early_get_boot_cpu_id(void)
 {
 	/*
-	 * need to get boot_cpu_id so can use that to create apicid_to_node
-	 * in k8_scan_nodes()
+	 * need to get the APIC ID of the BSP so can use that to
+	 * create apicid_to_node in amd_scan_nodes()
 	 */
 #ifdef CONFIG_X86_MPPARSE
 	/*
@@ -64,33 +66,20 @@ static __init void early_get_boot_cpu_id(void)
 	if (smp_found_config)
 		early_get_smp_config();
 #endif
-	early_init_lapic_mapping();
-}
-
-int __init k8_get_nodes(struct bootnode *physnodes)
-{
-	int i;
-	int ret = 0;
-
-	for_each_node_mask(i, nodes_parsed) {
-		physnodes[ret].start = nodes[i].start;
-		physnodes[ret].end = nodes[i].end;
-		ret++;
-	}
-	return ret;
 }
 
-int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
+int __init amd_numa_init(void)
 {
-	unsigned long start = PFN_PHYS(start_pfn);
-	unsigned long end = PFN_PHYS(end_pfn);
+	u64 start = PFN_PHYS(0);
+	u64 end = PFN_PHYS(max_pfn);
 	unsigned numnodes;
-	unsigned long prevbase;
-	int i, nb, found = 0;
+	u64 prevbase;
+	int i, j, nb;
 	u32 nodeid, reg;
+	unsigned int bits, cores, apicid_base;
 
 	if (!early_pci_allowed())
-		return -1;
+		return -EINVAL;
 
 	nb = find_northbridge();
 	if (nb < 0)
@@ -101,40 +90,40 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 	reg = read_pci_config(0, nb, 0, 0x60);
 	numnodes = ((reg >> 4) & 0xF) + 1;
 	if (numnodes <= 1)
-		return -1;
+		return -ENOENT;
 
 	pr_info("Number of physical nodes %d\n", numnodes);
 
 	prevbase = 0;
 	for (i = 0; i < 8; i++) {
-		unsigned long base, limit;
+		u64 base, limit;
 
 		base = read_pci_config(0, nb, 1, 0x40 + i*8);
 		limit = read_pci_config(0, nb, 1, 0x44 + i*8);
 
-		nodeid = limit & 7;
+		nodeids[i] = nodeid = limit & 7;
 		if ((base & 3) == 0) {
 			if (i < numnodes)
 				pr_info("Skipping disabled node %d\n", i);
 			continue;
 		}
 		if (nodeid >= numnodes) {
-			pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
+			pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid,
 				base, limit);
 			continue;
 		}
 
 		if (!limit) {
-			pr_info("Skipping node entry %d (base %lx)\n",
+			pr_info("Skipping node entry %d (base %Lx)\n",
 				i, base);
 			continue;
 		}
 		if ((base >> 8) & 3 || (limit >> 8) & 3) {
-			pr_err("Node %d using interleaving mode %lx/%lx\n",
+			pr_err("Node %d using interleaving mode %Lx/%Lx\n",
 				nodeid, (base >> 8) & 3, (limit >> 8) & 3);
-			return -1;
+			return -EINVAL;
 		}
-		if (node_isset(nodeid, nodes_parsed)) {
+		if (node_isset(nodeid, numa_nodes_parsed)) {
 			pr_info("Node %d already present, skipping\n",
 				nodeid);
 			continue;
@@ -162,74 +151,47 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 			continue;
 		}
 		if (limit < base) {
-			pr_err("Node %d bogus settings %lx-%lx.\n",
+			pr_err("Node %d bogus settings %Lx-%Lx.\n",
 				nodeid, base, limit);
 			continue;
 		}
 
 		/* Could sort here, but pun for now. Should not happen anyroads. */
 		if (prevbase > base) {
-			pr_err("Node map not sorted %lx,%lx\n",
+			pr_err("Node map not sorted %Lx,%Lx\n",
 				prevbase, base);
-			return -1;
+			return -EINVAL;
 		}
 
-		pr_info("Node %d MemBase %016lx Limit %016lx\n",
+		pr_info("Node %d MemBase %016Lx Limit %016Lx\n",
 			nodeid, base, limit);
 
-		found++;
-
-		nodes[nodeid].start = base;
-		nodes[nodeid].end = limit;
-
 		prevbase = base;
-
-		node_set(nodeid, nodes_parsed);
+		numa_add_memblk(nodeid, base, limit);
+		node_set(nodeid, numa_nodes_parsed);
 	}
 
-	if (!found)
-		return -1;
-	return 0;
-}
+	if (!nodes_weight(numa_nodes_parsed))
+		return -ENOENT;
 
-int __init k8_scan_nodes(void)
-{
-	unsigned int bits;
-	unsigned int cores;
-	unsigned int apicid_base;
-	int i;
-
-	BUG_ON(nodes_empty(nodes_parsed));
-	node_possible_map = nodes_parsed;
-	memnode_shift = compute_hash_shift(nodes, 8, NULL);
-	if (memnode_shift < 0) {
-		pr_err("No NUMA node hash function found. Contact maintainer\n");
-		return -1;
-	}
-	pr_info("Using node hash shift of %d\n", memnode_shift);
-
-	/* use the coreid bits from early_identify_cpu */
+	/*
+	 * We seem to have valid NUMA configuration. Map apicids to nodes
+	 * using the coreid bits from early_identify_cpu.
	 */
 	bits = boot_cpu_data.x86_coreid_bits;
-	cores = (1<<bits);
+	cores = 1 << bits;
 	apicid_base = 0;
-	/* need to get boot_cpu_id early for system with apicid lifting */
+
+	/* get the APIC ID of the BSP early for systems with apicid lifting */
 	early_get_boot_cpu_id();
 	if (boot_cpu_physical_apicid > 0) {
 		pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
 		apicid_base = boot_cpu_physical_apicid;
 	}
 
-	for_each_node_mask(i, node_possible_map) {
-		int j;
-
-		e820_register_active_regions(i,
-					nodes[i].start >> PAGE_SHIFT,
-					nodes[i].end >> PAGE_SHIFT);
+	for_each_node_mask(i, numa_nodes_parsed)
 		for (j = apicid_base; j < cores + apicid_base; j++)
-			apicid_to_node[(i << bits) + j] = i;
-		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-	}
+			set_apicid_to_node((i << bits) + j, i);
 
-	numa_init_array();
 	return 0;
 }
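
The rewritten probe keeps the register layout unchanged: the node count comes from a field starting at bit 4 of function-0 register 0x60, each node has a base/limit register pair at 0x40/0x44 + i*8 in function 1, the node ID sits in the low three bits of the limit register, and the enable bits checked above are the low two bits of the base register; what changes is only where the results go (numa_add_memblk() and numa_nodes_parsed instead of the file-local nodes[] array). A minimal user-space sketch of that decode loop, with a hypothetical read_nb_reg() standing in for read_pci_config() and made-up sample register values:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for read_pci_config(0, nb, fn, reg); fakes two
 * enabled nodes so the loop below has something to print. */
static uint32_t read_nb_reg(int fn, int reg)
{
	if (fn == 0 && reg == 0x60)
		return 1 << 4;				/* node count field = 1 -> 2 nodes */
	if (fn == 1 && reg == 0x40) return 0x00000003;	/* node 0 base, enable bits set */
	if (fn == 1 && reg == 0x44) return 0x00400000;	/* node 0 limit, node id 0 */
	if (fn == 1 && reg == 0x48) return 0x00400003;	/* node 1 base, enable bits set */
	if (fn == 1 && reg == 0x4c) return 0x00800001;	/* node 1 limit, node id 1 */
	return 0;
}

int main(void)
{
	unsigned numnodes = ((read_nb_reg(0, 0x60) >> 4) & 0xF) + 1;

	for (int i = 0; i < 8; i++) {
		uint32_t base  = read_nb_reg(1, 0x40 + i * 8);
		uint32_t limit = read_nb_reg(1, 0x44 + i * 8);
		unsigned nodeid = limit & 7;

		if ((base & 3) == 0)		/* enable bits clear: node disabled */
			continue;
		if (nodeid >= numnodes)		/* excess entry */
			continue;
		/* the kernel widens base/limit into physical addresses before
		 * handing them to numa_add_memblk(nodeid, base, limit) */
		printf("node %u: base reg %#x, limit reg %#x\n", nodeid, base, limit);
	}
	return 0;
}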
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 4c4508e8a204..2dbf6bf4c7e5 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -11,6 +11,8 @@
11#include <linux/kprobes.h> /* __kprobes, ... */ 11#include <linux/kprobes.h> /* __kprobes, ... */
12#include <linux/mmiotrace.h> /* kmmio_handler, ... */ 12#include <linux/mmiotrace.h> /* kmmio_handler, ... */
13#include <linux/perf_event.h> /* perf_sw_event */ 13#include <linux/perf_event.h> /* perf_sw_event */
14#include <linux/hugetlb.h> /* hstate_index_to_shift */
15#include <linux/prefetch.h> /* prefetchw */
14 16
15#include <asm/traps.h> /* dotraplinkage, ... */ 17#include <asm/traps.h> /* dotraplinkage, ... */
16#include <asm/pgalloc.h> /* pgd_*(), ... */ 18#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -160,15 +162,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
160 162
161static void 163static void
162force_sig_info_fault(int si_signo, int si_code, unsigned long address, 164force_sig_info_fault(int si_signo, int si_code, unsigned long address,
163 struct task_struct *tsk) 165 struct task_struct *tsk, int fault)
164{ 166{
167 unsigned lsb = 0;
165 siginfo_t info; 168 siginfo_t info;
166 169
167 info.si_signo = si_signo; 170 info.si_signo = si_signo;
168 info.si_errno = 0; 171 info.si_errno = 0;
169 info.si_code = si_code; 172 info.si_code = si_code;
170 info.si_addr = (void __user *)address; 173 info.si_addr = (void __user *)address;
171 info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; 174 if (fault & VM_FAULT_HWPOISON_LARGE)
175 lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
176 if (fault & VM_FAULT_HWPOISON)
177 lsb = PAGE_SHIFT;
178 info.si_addr_lsb = lsb;
172 179
173 force_sig_info(si_signo, &info, tsk); 180 force_sig_info(si_signo, &info, tsk);
174} 181}
@@ -223,16 +230,24 @@ void vmalloc_sync_all(void)
223 for (address = VMALLOC_START & PMD_MASK; 230 for (address = VMALLOC_START & PMD_MASK;
224 address >= TASK_SIZE && address < FIXADDR_TOP; 231 address >= TASK_SIZE && address < FIXADDR_TOP;
225 address += PMD_SIZE) { 232 address += PMD_SIZE) {
226
227 unsigned long flags;
228 struct page *page; 233 struct page *page;
229 234
230 spin_lock_irqsave(&pgd_lock, flags); 235 spin_lock(&pgd_lock);
231 list_for_each_entry(page, &pgd_list, lru) { 236 list_for_each_entry(page, &pgd_list, lru) {
232 if (!vmalloc_sync_one(page_address(page), address)) 237 spinlock_t *pgt_lock;
238 pmd_t *ret;
239
240 /* the pgt_lock only for Xen */
241 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
242
243 spin_lock(pgt_lock);
244 ret = vmalloc_sync_one(page_address(page), address);
245 spin_unlock(pgt_lock);
246
247 if (!ret)
233 break; 248 break;
234 } 249 }
235 spin_unlock_irqrestore(&pgd_lock, flags); 250 spin_unlock(&pgd_lock);
236 } 251 }
237} 252}
238 253
@@ -251,6 +266,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
251 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 266 if (!(address >= VMALLOC_START && address < VMALLOC_END))
252 return -1; 267 return -1;
253 268
269 WARN_ON_ONCE(in_nmi());
270
254 /* 271 /*
255 * Synchronize this task's top level page-table 272 * Synchronize this task's top level page-table
256 * with the 'reference' page table. 273 * with the 'reference' page table.
@@ -326,29 +343,7 @@ out:
326 343
327void vmalloc_sync_all(void) 344void vmalloc_sync_all(void)
328{ 345{
329 unsigned long address; 346 sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
330
331 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
332 address += PGDIR_SIZE) {
333
334 const pgd_t *pgd_ref = pgd_offset_k(address);
335 unsigned long flags;
336 struct page *page;
337
338 if (pgd_none(*pgd_ref))
339 continue;
340
341 spin_lock_irqsave(&pgd_lock, flags);
342 list_for_each_entry(page, &pgd_list, lru) {
343 pgd_t *pgd;
344 pgd = (pgd_t *)page_address(page) + pgd_index(address);
345 if (pgd_none(*pgd))
346 set_pgd(pgd, *pgd_ref);
347 else
348 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
349 }
350 spin_unlock_irqrestore(&pgd_lock, flags);
351 }
352} 347}
353 348
354/* 349/*
@@ -369,6 +364,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
369 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 364 if (!(address >= VMALLOC_START && address < VMALLOC_END))
370 return -1; 365 return -1;
371 366
367 WARN_ON_ONCE(in_nmi());
368
372 /* 369 /*
373 * Copy kernel mappings over when needed. This can also 370 * Copy kernel mappings over when needed. This can also
374 * happen within a race in page table update. In the later 371 * happen within a race in page table update. In the later
@@ -731,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
731 tsk->thread.error_code = error_code | (address >= TASK_SIZE); 728 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
732 tsk->thread.trap_no = 14; 729 tsk->thread.trap_no = 14;
733 730
734 force_sig_info_fault(SIGSEGV, si_code, address, tsk); 731 force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
735 732
736 return; 733 return;
737 } 734 }
@@ -816,28 +813,51 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
816 tsk->thread.trap_no = 14; 813 tsk->thread.trap_no = 14;
817 814
818#ifdef CONFIG_MEMORY_FAILURE 815#ifdef CONFIG_MEMORY_FAILURE
819 if (fault & VM_FAULT_HWPOISON) { 816 if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
820 printk(KERN_ERR 817 printk(KERN_ERR
821 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", 818 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
822 tsk->comm, tsk->pid, address); 819 tsk->comm, tsk->pid, address);
823 code = BUS_MCEERR_AR; 820 code = BUS_MCEERR_AR;
824 } 821 }
825#endif 822#endif
826 force_sig_info_fault(SIGBUS, code, address, tsk); 823 force_sig_info_fault(SIGBUS, code, address, tsk, fault);
827} 824}
828 825
829static noinline void 826static noinline int
830mm_fault_error(struct pt_regs *regs, unsigned long error_code, 827mm_fault_error(struct pt_regs *regs, unsigned long error_code,
831 unsigned long address, unsigned int fault) 828 unsigned long address, unsigned int fault)
832{ 829{
830 /*
831 * Pagefault was interrupted by SIGKILL. We have no reason to
832 * continue pagefault.
833 */
834 if (fatal_signal_pending(current)) {
835 if (!(fault & VM_FAULT_RETRY))
836 up_read(&current->mm->mmap_sem);
837 if (!(error_code & PF_USER))
838 no_context(regs, error_code, address);
839 return 1;
840 }
841 if (!(fault & VM_FAULT_ERROR))
842 return 0;
843
833 if (fault & VM_FAULT_OOM) { 844 if (fault & VM_FAULT_OOM) {
845 /* Kernel mode? Handle exceptions or die: */
846 if (!(error_code & PF_USER)) {
847 up_read(&current->mm->mmap_sem);
848 no_context(regs, error_code, address);
849 return 1;
850 }
851
834 out_of_memory(regs, error_code, address); 852 out_of_memory(regs, error_code, address);
835 } else { 853 } else {
836 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) 854 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
855 VM_FAULT_HWPOISON_LARGE))
837 do_sigbus(regs, error_code, address, fault); 856 do_sigbus(regs, error_code, address, fault);
838 else 857 else
839 BUG(); 858 BUG();
840 } 859 }
860 return 1;
841} 861}
842 862
843static int spurious_fault_check(unsigned long error_code, pte_t *pte) 863static int spurious_fault_check(unsigned long error_code, pte_t *pte)
@@ -894,8 +914,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
894 if (pmd_large(*pmd)) 914 if (pmd_large(*pmd))
895 return spurious_fault_check(error_code, (pte_t *) pmd); 915 return spurious_fault_check(error_code, (pte_t *) pmd);
896 916
917 /*
918 * Note: don't use pte_present() here, since it returns true
919 * if the _PAGE_PROTNONE bit is set. However, this aliases the
920 * _PAGE_GLOBAL bit, which for kernel pages give false positives
921 * when CONFIG_DEBUG_PAGEALLOC is used.
922 */
897 pte = pte_offset_kernel(pmd, address); 923 pte = pte_offset_kernel(pmd, address);
898 if (!pte_present(*pte)) 924 if (!(pte_flags(*pte) & _PAGE_PRESENT))
899 return 0; 925 return 0;
900 926
901 ret = spurious_fault_check(error_code, pte); 927 ret = spurious_fault_check(error_code, pte);
@@ -915,9 +941,9 @@ spurious_fault(unsigned long error_code, unsigned long address)
915int show_unhandled_signals = 1; 941int show_unhandled_signals = 1;
916 942
917static inline int 943static inline int
918access_error(unsigned long error_code, int write, struct vm_area_struct *vma) 944access_error(unsigned long error_code, struct vm_area_struct *vma)
919{ 945{
920 if (write) { 946 if (error_code & PF_WRITE) {
921 /* write, present and write, not present: */ 947 /* write, present and write, not present: */
922 if (unlikely(!(vma->vm_flags & VM_WRITE))) 948 if (unlikely(!(vma->vm_flags & VM_WRITE)))
923 return 1; 949 return 1;
@@ -952,8 +978,10 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
952 struct task_struct *tsk; 978 struct task_struct *tsk;
953 unsigned long address; 979 unsigned long address;
954 struct mm_struct *mm; 980 struct mm_struct *mm;
955 int write;
956 int fault; 981 int fault;
982 int write = error_code & PF_WRITE;
983 unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
984 (write ? FAULT_FLAG_WRITE : 0);
957 985
958 tsk = current; 986 tsk = current;
959 mm = tsk->mm; 987 mm = tsk->mm;
@@ -1064,6 +1092,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1064 bad_area_nosemaphore(regs, error_code, address); 1092 bad_area_nosemaphore(regs, error_code, address);
1065 return; 1093 return;
1066 } 1094 }
1095retry:
1067 down_read(&mm->mmap_sem); 1096 down_read(&mm->mmap_sem);
1068 } else { 1097 } else {
1069 /* 1098 /*
@@ -1107,9 +1136,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1107 * we can handle it.. 1136 * we can handle it..
1108 */ 1137 */
1109good_area: 1138good_area:
1110 write = error_code & PF_WRITE; 1139 if (unlikely(access_error(error_code, vma))) {
1111
1112 if (unlikely(access_error(error_code, write, vma))) {
1113 bad_area_access_error(regs, error_code, address); 1140 bad_area_access_error(regs, error_code, address);
1114 return; 1141 return;
1115 } 1142 }
@@ -1119,21 +1146,34 @@ good_area:
1119 * make sure we exit gracefully rather than endlessly redo 1146 * make sure we exit gracefully rather than endlessly redo
1120 * the fault: 1147 * the fault:
1121 */ 1148 */
1122 fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); 1149 fault = handle_mm_fault(mm, vma, address, flags);
1123 1150
1124 if (unlikely(fault & VM_FAULT_ERROR)) { 1151 if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
1125 mm_fault_error(regs, error_code, address, fault); 1152 if (mm_fault_error(regs, error_code, address, fault))
1126 return; 1153 return;
1127 } 1154 }
1128 1155
1129 if (fault & VM_FAULT_MAJOR) { 1156 /*
1130 tsk->maj_flt++; 1157 * Major/minor page fault accounting is only done on the
1131 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, 1158 * initial attempt. If we go through a retry, it is extremely
1132 regs, address); 1159 * likely that the page will be found in page cache at that point.
1133 } else { 1160 */
1134 tsk->min_flt++; 1161 if (flags & FAULT_FLAG_ALLOW_RETRY) {
1135 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, 1162 if (fault & VM_FAULT_MAJOR) {
1136 regs, address); 1163 tsk->maj_flt++;
1164 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
1165 regs, address);
1166 } else {
1167 tsk->min_flt++;
1168 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
1169 regs, address);
1170 }
1171 if (fault & VM_FAULT_RETRY) {
1172 /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
1173 * of starvation. */
1174 flags &= ~FAULT_FLAG_ALLOW_RETRY;
1175 goto retry;
1176 }
1137 } 1177 }
1138 1178
1139 check_v8086_mode(regs, address, tsk); 1179 check_v8086_mode(regs, address, tsk);
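
The do_page_fault() hunks above add the VM_FAULT_RETRY handling: handle_mm_fault() is now called with FAULT_FLAG_ALLOW_RETRY, and if it asks for a retry (having already dropped mmap_sem) the handler clears the flag and jumps back to retake the lock, so at most one retry happens and major/minor accounting is done only on the initial attempt. A minimal sketch of that control flow, using stand-in names (fault_once(), RES_*, FLAG_ALLOW_RETRY) rather than the kernel API:

#include <stdio.h>

enum fault_res { RES_OK, RES_RETRY, RES_ERROR };
#define FLAG_ALLOW_RETRY 0x1

/* Stand-in for handle_mm_fault(): pretends the first attempt had to wait. */
static enum fault_res fault_once(unsigned long addr, unsigned flags)
{
	static int calls;
	(void)addr;
	return (calls++ == 0 && (flags & FLAG_ALLOW_RETRY)) ? RES_RETRY : RES_OK;
}

static int handle_fault(unsigned long addr)
{
	unsigned flags = FLAG_ALLOW_RETRY;
	enum fault_res res;

retry:
	res = fault_once(addr, flags);		/* may drop the mmap lock */
	if (res == RES_ERROR)
		return -1;

	if (flags & FLAG_ALLOW_RETRY) {
		/* first attempt only: this is where maj_flt/min_flt and the
		 * perf software events get accounted */
		if (res == RES_RETRY) {
			flags &= ~FLAG_ALLOW_RETRY;	/* no second retry: avoids starvation */
			goto retry;
		}
	}
	return 0;
}

int main(void)
{
	printf("fault handled: %d\n", handle_fault(0x1000));
	return 0;
}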
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 738e6593799d..dbe34b931374 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/vmstat.h> 9#include <linux/vmstat.h>
10#include <linux/highmem.h> 10#include <linux/highmem.h>
11#include <linux/swap.h>
11 12
12#include <asm/pgtable.h> 13#include <asm/pgtable.h>
13 14
@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
89 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 90 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
90 page = pte_page(pte); 91 page = pte_page(pte);
91 get_page(page); 92 get_page(page);
93 SetPageReferenced(page);
92 pages[*nr] = page; 94 pages[*nr] = page;
93 (*nr)++; 95 (*nr)++;
94 96
@@ -103,6 +105,17 @@ static inline void get_head_page_multiple(struct page *page, int nr)
103 VM_BUG_ON(page != compound_head(page)); 105 VM_BUG_ON(page != compound_head(page));
104 VM_BUG_ON(page_count(page) == 0); 106 VM_BUG_ON(page_count(page) == 0);
105 atomic_add(nr, &page->_count); 107 atomic_add(nr, &page->_count);
108 SetPageReferenced(page);
109}
110
111static inline void get_huge_page_tail(struct page *page)
112{
113 /*
114 * __split_huge_page_refcount() cannot run
115 * from under us.
116 */
117 VM_BUG_ON(atomic_read(&page->_count) < 0);
118 atomic_inc(&page->_count);
106} 119}
107 120
108static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, 121static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
@@ -128,6 +141,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
128 do { 141 do {
129 VM_BUG_ON(compound_head(page) != head); 142 VM_BUG_ON(compound_head(page) != head);
130 pages[*nr] = page; 143 pages[*nr] = page;
144 if (PageTail(page))
145 get_huge_page_tail(page);
131 (*nr)++; 146 (*nr)++;
132 page++; 147 page++;
133 refs++; 148 refs++;
@@ -148,7 +163,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
148 pmd_t pmd = *pmdp; 163 pmd_t pmd = *pmdp;
149 164
150 next = pmd_addr_end(addr, end); 165 next = pmd_addr_end(addr, end);
151 if (pmd_none(pmd)) 166 /*
167 * The pmd_trans_splitting() check below explains why
168 * pmdp_splitting_flush has to flush the tlb, to stop
169 * this gup-fast code from running while we set the
170 * splitting bit in the pmd. Returning zero will take
171 * the slow path that will call wait_split_huge_page()
172 * if the pmd is still in splitting state. gup-fast
173 * can't because it has irq disabled and
174 * wait_split_huge_page() would never return as the
175 * tlb flush IPI wouldn't run.
176 */
177 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
152 return 0; 178 return 0;
153 if (unlikely(pmd_large(pmd))) { 179 if (unlikely(pmd_large(pmd))) {
154 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) 180 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
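
The new pmd_trans_splitting() test is explained by the in-line comment: the fast path runs with interrupts disabled, so it cannot wait out a concurrent huge-page split (the splitter's TLB-flush IPI would never be serviced); returning 0 simply makes the caller fall back to the sleeping slow path. That "bail out and let the slow path finish" shape, reduced to a user-space sketch with stand-in names (fast_walk(), slow_walk()):

#include <stdio.h>

#define NPAGES 8

/* Stand-in for the irq-off fast walk: gives up at the first page it cannot
 * handle without sleeping and reports how many it did pin. */
static int fast_walk(int nr, int *pages)
{
	int i;
	for (i = 0; i < nr; i++) {
		if (i == 3)		/* e.g. a pmd currently being split */
			break;
		pages[i] = 1;
	}
	return i;
}

/* Stand-in for the sleeping slow path (get_user_pages). */
static int slow_walk(int nr, int *pages)
{
	for (int i = 0; i < nr; i++)
		pages[i] = 1;
	return nr;
}

int main(void)
{
	int pages[NPAGES] = { 0 };
	int got = fast_walk(NPAGES, pages);

	if (got < NPAGES)	/* fast path bailed: finish with the slow path */
		got += slow_walk(NPAGES - got, pages + got);
	printf("pinned %d pages\n", got);
	return 0;
}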
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 5e8fa12ef861..b49962662101 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -9,6 +9,7 @@ void *kmap(struct page *page)
9 return page_address(page); 9 return page_address(page);
10 return kmap_high(page); 10 return kmap_high(page);
11} 11}
12EXPORT_SYMBOL(kmap);
12 13
13void kunmap(struct page *page) 14void kunmap(struct page *page)
14{ 15{
@@ -18,6 +19,7 @@ void kunmap(struct page *page)
18 return; 19 return;
19 kunmap_high(page); 20 kunmap_high(page);
20} 21}
22EXPORT_SYMBOL(kunmap);
21 23
22/* 24/*
23 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because 25 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
@@ -27,10 +29,10 @@ void kunmap(struct page *page)
27 * However when holding an atomic kmap it is not legal to sleep, so atomic 29 * However when holding an atomic kmap it is not legal to sleep, so atomic
28 * kmaps are appropriate for short, tight code paths only. 30 * kmaps are appropriate for short, tight code paths only.
29 */ 31 */
30void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) 32void *kmap_atomic_prot(struct page *page, pgprot_t prot)
31{ 33{
32 enum fixed_addresses idx;
33 unsigned long vaddr; 34 unsigned long vaddr;
35 int idx, type;
34 36
35 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 37 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
36 pagefault_disable(); 38 pagefault_disable();
@@ -38,8 +40,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
38 if (!PageHighMem(page)) 40 if (!PageHighMem(page))
39 return page_address(page); 41 return page_address(page);
40 42
41 debug_kmap_atomic(type); 43 type = kmap_atomic_idx_push();
42
43 idx = type + KM_TYPE_NR*smp_processor_id(); 44 idx = type + KM_TYPE_NR*smp_processor_id();
44 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); 45 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
45 BUG_ON(!pte_none(*(kmap_pte-idx))); 46 BUG_ON(!pte_none(*(kmap_pte-idx)));
@@ -47,44 +48,57 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
47 48
48 return (void *)vaddr; 49 return (void *)vaddr;
49} 50}
51EXPORT_SYMBOL(kmap_atomic_prot);
52
53void *__kmap_atomic(struct page *page)
54{
55 return kmap_atomic_prot(page, kmap_prot);
56}
57EXPORT_SYMBOL(__kmap_atomic);
50 58
51void *kmap_atomic(struct page *page, enum km_type type) 59/*
60 * This is the same as kmap_atomic() but can map memory that doesn't
61 * have a struct page associated with it.
62 */
63void *kmap_atomic_pfn(unsigned long pfn)
52{ 64{
53 return kmap_atomic_prot(page, type, kmap_prot); 65 return kmap_atomic_prot_pfn(pfn, kmap_prot);
54} 66}
67EXPORT_SYMBOL_GPL(kmap_atomic_pfn);
55 68
56void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) 69void __kunmap_atomic(void *kvaddr)
57{ 70{
58 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; 71 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
59 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); 72
60 73 if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
61 /* 74 vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
62 * Force other mappings to Oops if they'll try to access this pte 75 int idx, type;
63 * without first remap it. Keeping stale mappings around is a bad idea 76
64 * also, in case the page changes cacheability attributes or becomes 77 type = kmap_atomic_idx();
65 * a protected page in a hypervisor. 78 idx = type + KM_TYPE_NR * smp_processor_id();
66 */ 79
67 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) 80#ifdef CONFIG_DEBUG_HIGHMEM
81 WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
82#endif
83 /*
84 * Force other mappings to Oops if they'll try to access this
85 * pte without first remap it. Keeping stale mappings around
86 * is a bad idea also, in case the page changes cacheability
87 * attributes or becomes a protected page in a hypervisor.
88 */
68 kpte_clear_flush(kmap_pte-idx, vaddr); 89 kpte_clear_flush(kmap_pte-idx, vaddr);
69 else { 90 kmap_atomic_idx_pop();
91 }
70#ifdef CONFIG_DEBUG_HIGHMEM 92#ifdef CONFIG_DEBUG_HIGHMEM
93 else {
71 BUG_ON(vaddr < PAGE_OFFSET); 94 BUG_ON(vaddr < PAGE_OFFSET);
72 BUG_ON(vaddr >= (unsigned long)high_memory); 95 BUG_ON(vaddr >= (unsigned long)high_memory);
73#endif
74 } 96 }
97#endif
75 98
76 pagefault_enable(); 99 pagefault_enable();
77} 100}
78 101EXPORT_SYMBOL(__kunmap_atomic);
79/*
80 * This is the same as kmap_atomic() but can map memory that doesn't
81 * have a struct page associated with it.
82 */
83void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
84{
85 return kmap_atomic_prot_pfn(pfn, type, kmap_prot);
86}
87EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
88 102
89struct page *kmap_atomic_to_page(void *ptr) 103struct page *kmap_atomic_to_page(void *ptr)
90{ 104{
@@ -98,12 +112,6 @@ struct page *kmap_atomic_to_page(void *ptr)
98 pte = kmap_pte - (idx - FIX_KMAP_BEGIN); 112 pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
99 return pte_page(*pte); 113 return pte_page(*pte);
100} 114}
101
102EXPORT_SYMBOL(kmap);
103EXPORT_SYMBOL(kunmap);
104EXPORT_SYMBOL(kmap_atomic);
105EXPORT_SYMBOL(kunmap_atomic_notypecheck);
106EXPORT_SYMBOL(kmap_atomic_prot);
107EXPORT_SYMBOL(kmap_atomic_to_page); 115EXPORT_SYMBOL(kmap_atomic_to_page);
108 116
109void __init set_highmem_pages_init(void) 117void __init set_highmem_pages_init(void)
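
The highmem_32.c rework drops the caller-supplied enum km_type: kmap_atomic_prot() now takes a slot from a small per-CPU stack via kmap_atomic_idx_push(), and __kunmap_atomic() pops it again, so nested atomic kmaps work as long as they are undone in LIFO order. A sketch of that slot-stack idea, single-threaded and with illustrative names rather than the kernel helpers:

#include <assert.h>
#include <stdio.h>

#define KM_SLOTS 16			/* depth of the fixmap window per CPU */

static int km_depth;			/* per-CPU in the kernel; global here */

static int kmap_idx_push(void)		/* like kmap_atomic_idx_push() */
{
	assert(km_depth < KM_SLOTS);
	return km_depth++;		/* index of the fixmap pte slot to use */
}

static int kmap_idx_top(void)		/* like kmap_atomic_idx() */
{
	assert(km_depth > 0);
	return km_depth - 1;
}

static void kmap_idx_pop(void)		/* like kmap_atomic_idx_pop() */
{
	assert(km_depth > 0);
	km_depth--;
}

int main(void)
{
	int a = kmap_idx_push();	/* outer atomic kmap */
	int b = kmap_idx_push();	/* nested atomic kmap */

	printf("outer slot %d, nested slot %d\n", a, b);

	/* unmap must be LIFO: the nested mapping goes first */
	assert(kmap_idx_top() == b);
	kmap_idx_pop();
	assert(kmap_idx_top() == a);
	kmap_idx_pop();
	return 0;
}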
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 069ce7c37c01..f581a18c0d4d 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -72,7 +72,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	if (!vma_shareable(vma, addr))
 		return;
 
-	spin_lock(&mapping->i_mmap_lock);
+	mutex_lock(&mapping->i_mmap_mutex);
 	vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
 		if (svma == vma)
 			continue;
@@ -97,7 +97,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 		put_page(virt_to_page(spte));
 	spin_unlock(&mm->page_table_lock);
 out:
-	spin_unlock(&mapping->i_mmap_lock);
+	mutex_unlock(&mapping->i_mmap_mutex);
 }
 
 /*
@@ -326,7 +326,7 @@ try_again:
 	if (mm->free_area_cache < len)
 		goto fail;
 
-	/* either no address requested or cant fit in requested address hole */
+	/* either no address requested or can't fit in requested address hole */
 	addr = (mm->free_area_cache - len) & huge_page_mask(h);
 	do {
 		/*
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index b278535b14aa..30326443ab81 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -2,6 +2,7 @@
2#include <linux/initrd.h> 2#include <linux/initrd.h>
3#include <linux/ioport.h> 3#include <linux/ioport.h>
4#include <linux/swap.h> 4#include <linux/swap.h>
5#include <linux/memblock.h>
5 6
6#include <asm/cacheflush.h> 7#include <asm/cacheflush.h>
7#include <asm/e820.h> 8#include <asm/e820.h>
@@ -15,11 +16,9 @@
15#include <asm/tlb.h> 16#include <asm/tlb.h>
16#include <asm/proto.h> 17#include <asm/proto.h>
17 18
18DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 19unsigned long __initdata pgt_buf_start;
19 20unsigned long __meminitdata pgt_buf_end;
20unsigned long __initdata e820_table_start; 21unsigned long __meminitdata pgt_buf_top;
21unsigned long __meminitdata e820_table_end;
22unsigned long __meminitdata e820_table_top;
23 22
24int after_bootmem; 23int after_bootmem;
25 24
@@ -32,7 +31,8 @@ int direct_gbpages
32static void __init find_early_table_space(unsigned long end, int use_pse, 31static void __init find_early_table_space(unsigned long end, int use_pse,
33 int use_gbpages) 32 int use_gbpages)
34{ 33{
35 unsigned long puds, pmds, ptes, tables, start; 34 unsigned long puds, pmds, ptes, tables, start = 0, good_end = end;
35 phys_addr_t base;
36 36
37 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 37 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
38 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); 38 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
@@ -63,29 +63,25 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
63#ifdef CONFIG_X86_32 63#ifdef CONFIG_X86_32
64 /* for fixmap */ 64 /* for fixmap */
65 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); 65 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
66#endif
67 66
68 /* 67 good_end = max_pfn_mapped << PAGE_SHIFT;
69 * RED-PEN putting page tables only on node 0 could
70 * cause a hotspot and fill up ZONE_DMA. The page tables
71 * need roughly 0.5KB per GB.
72 */
73#ifdef CONFIG_X86_32
74 start = 0x7000;
75#else
76 start = 0x8000;
77#endif 68#endif
78 e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, 69
79 tables, PAGE_SIZE); 70 base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
80 if (e820_table_start == -1UL) 71 if (base == MEMBLOCK_ERROR)
81 panic("Cannot find space for the kernel page tables"); 72 panic("Cannot find space for the kernel page tables");
82 73
83 e820_table_start >>= PAGE_SHIFT; 74 pgt_buf_start = base >> PAGE_SHIFT;
84 e820_table_end = e820_table_start; 75 pgt_buf_end = pgt_buf_start;
85 e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); 76 pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
86 77
87 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", 78 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
88 end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); 79 end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
80}
81
82void __init native_pagetable_reserve(u64 start, u64 end)
83{
84 memblock_x86_reserve_range(start, end, "PGTABLE");
89} 85}
90 86
91struct map_range { 87struct map_range {
@@ -277,30 +273,26 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
277 load_cr3(swapper_pg_dir); 273 load_cr3(swapper_pg_dir);
278#endif 274#endif
279 275
280#ifdef CONFIG_X86_64
281 if (!after_bootmem && !start) {
282 pud_t *pud;
283 pmd_t *pmd;
284
285 mmu_cr4_features = read_cr4();
286
287 /*
288 * _brk_end cannot change anymore, but it and _end may be
289 * located on different 2M pages. cleanup_highmap(), however,
290 * can only consider _end when it runs, so destroy any
291 * mappings beyond _brk_end here.
292 */
293 pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
294 pmd = pmd_offset(pud, _brk_end - 1);
295 while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
296 pmd_clear(pmd);
297 }
298#endif
299 __flush_tlb_all(); 276 __flush_tlb_all();
300 277
301 if (!after_bootmem && e820_table_end > e820_table_start) 278 /*
302 reserve_early(e820_table_start << PAGE_SHIFT, 279 * Reserve the kernel pagetable pages we used (pgt_buf_start -
303 e820_table_end << PAGE_SHIFT, "PGTABLE"); 280 * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
281 * so that they can be reused for other purposes.
282 *
283 * On native it just means calling memblock_x86_reserve_range, on Xen it
284 * also means marking RW the pagetable pages that we allocated before
285 * but that haven't been used.
286 *
287 * In fact on xen we mark RO the whole range pgt_buf_start -
288 * pgt_buf_top, because we have to make sure that when
289 * init_memory_mapping reaches the pagetable pages area, it maps
290 * RO all the pagetable pages, including the ones that are beyond
291 * pgt_buf_end at that time.
292 */
293 if (!after_bootmem && pgt_buf_end > pgt_buf_start)
294 x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
295 PFN_PHYS(pgt_buf_end));
304 296
305 if (!after_bootmem) 297 if (!after_bootmem)
306 early_memtest(start, end); 298 early_memtest(start, end);
@@ -362,8 +354,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
362 /* 354 /*
363 * We just marked the kernel text read only above, now that 355 * We just marked the kernel text read only above, now that
364 * we are going to free part of that, we need to make that 356 * we are going to free part of that, we need to make that
365 * writeable first. 357 * writeable and non-executable first.
366 */ 358 */
359 set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
367 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); 360 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
368 361
369 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); 362 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
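
With this change the scratch area for the early page tables is no longer carved out of e820 directly: find_early_table_space() asks memblock_find_in_range() for a window and records it in pgt_buf_start/pgt_buf_end/pgt_buf_top; alloc_low_page() (see the init_32.c and init_64.c hunks below) hands out pages by bumping pgt_buf_end, and only the part actually used is reserved afterwards through x86_init.mapping.pagetable_reserve. A reduced sketch of that bump-allocator pattern, with illustrative names and a static array standing in for the memblock range:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096UL
#define BUF_PAGES 32UL

static unsigned char backing[BUF_PAGES * PAGE_SIZE];	/* stands in for the found range */
static unsigned long buf_start, buf_end, buf_top;	/* page indexes, like pgt_buf_* */

static void find_table_space(unsigned long pages_needed)
{
	/* the kernel gets this window from memblock_find_in_range() */
	buf_start = 0;
	buf_end   = buf_start;
	buf_top   = buf_start + pages_needed;
}

static void *alloc_low_page(void)
{
	unsigned long pfn = buf_end++;

	if (pfn >= buf_top)
		return NULL;			/* the kernel panics here */
	memset(backing + pfn * PAGE_SIZE, 0, PAGE_SIZE);	/* clear_page() */
	return backing + pfn * PAGE_SIZE;
}

int main(void)
{
	int used = 0;

	find_table_space(4);
	while (alloc_low_page() != NULL)
		used++;
	/* afterwards the kernel reserves [buf_start, buf_end) as page tables
	 * and releases [buf_end, buf_top) for reuse: pagetable_reserve() */
	printf("allocated %d page-table pages\n", used);
	return 0;
}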
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index bca79091b9d6..29f7c6d98179 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -25,6 +25,7 @@
25#include <linux/pfn.h> 25#include <linux/pfn.h>
26#include <linux/poison.h> 26#include <linux/poison.h>
27#include <linux/bootmem.h> 27#include <linux/bootmem.h>
28#include <linux/memblock.h>
28#include <linux/proc_fs.h> 29#include <linux/proc_fs.h>
29#include <linux/memory_hotplug.h> 30#include <linux/memory_hotplug.h>
30#include <linux/initrd.h> 31#include <linux/initrd.h>
@@ -44,6 +45,7 @@
44#include <asm/bugs.h> 45#include <asm/bugs.h>
45#include <asm/tlb.h> 46#include <asm/tlb.h>
46#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/olpc_ofw.h>
47#include <asm/pgalloc.h> 49#include <asm/pgalloc.h>
48#include <asm/sections.h> 50#include <asm/sections.h>
49#include <asm/paravirt.h> 51#include <asm/paravirt.h>
@@ -60,14 +62,14 @@ bool __read_mostly __vmalloc_start_set = false;
60 62
61static __init void *alloc_low_page(void) 63static __init void *alloc_low_page(void)
62{ 64{
63 unsigned long pfn = e820_table_end++; 65 unsigned long pfn = pgt_buf_end++;
64 void *adr; 66 void *adr;
65 67
66 if (pfn >= e820_table_top) 68 if (pfn >= pgt_buf_top)
67 panic("alloc_low_page: ran out of memory"); 69 panic("alloc_low_page: ran out of memory");
68 70
69 adr = __va(pfn * PAGE_SIZE); 71 adr = __va(pfn * PAGE_SIZE);
70 memset(adr, 0, PAGE_SIZE); 72 clear_page(adr);
71 return adr; 73 return adr;
72} 74}
73 75
@@ -161,8 +163,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
161 if (pmd_idx_kmap_begin != pmd_idx_kmap_end 163 if (pmd_idx_kmap_begin != pmd_idx_kmap_end
162 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin 164 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
163 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end 165 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
164 && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start 166 && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
165 || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { 167 || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
166 pte_t *newpte; 168 pte_t *newpte;
167 int i; 169 int i;
168 170
@@ -225,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
225 227
226static inline int is_kernel_text(unsigned long addr) 228static inline int is_kernel_text(unsigned long addr)
227{ 229{
228 if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) 230 if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
229 return 1; 231 return 1;
230 return 0; 232 return 0;
231} 233}
@@ -422,49 +424,28 @@ static void __init add_one_highpage_init(struct page *page)
422 totalhigh_pages++; 424 totalhigh_pages++;
423} 425}
424 426
425struct add_highpages_data { 427void __init add_highpages_with_active_regions(int nid,
426 unsigned long start_pfn; 428 unsigned long start_pfn, unsigned long end_pfn)
427 unsigned long end_pfn;
428};
429
430static int __init add_highpages_work_fn(unsigned long start_pfn,
431 unsigned long end_pfn, void *datax)
432{ 429{
433 int node_pfn; 430 struct range *range;
434 struct page *page; 431 int nr_range;
435 unsigned long final_start_pfn, final_end_pfn; 432 int i;
436 struct add_highpages_data *data;
437 433
438 data = (struct add_highpages_data *)datax; 434 nr_range = __get_free_all_memory_range(&range, nid, start_pfn, end_pfn);
439 435
440 final_start_pfn = max(start_pfn, data->start_pfn); 436 for (i = 0; i < nr_range; i++) {
441 final_end_pfn = min(end_pfn, data->end_pfn); 437 struct page *page;
442 if (final_start_pfn >= final_end_pfn) 438 int node_pfn;
443 return 0;
444 439
445 for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; 440 for (node_pfn = range[i].start; node_pfn < range[i].end;
446 node_pfn++) { 441 node_pfn++) {
447 if (!pfn_valid(node_pfn)) 442 if (!pfn_valid(node_pfn))
448 continue; 443 continue;
449 page = pfn_to_page(node_pfn); 444 page = pfn_to_page(node_pfn);
450 add_one_highpage_init(page); 445 add_one_highpage_init(page);
446 }
451 } 447 }
452
453 return 0;
454
455} 448}
456
457void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
458 unsigned long end_pfn)
459{
460 struct add_highpages_data data;
461
462 data.start_pfn = start_pfn;
463 data.end_pfn = end_pfn;
464
465 work_with_active_regions(nid, add_highpages_work_fn, &data);
466}
467
468#else 449#else
469static inline void permanent_kmaps_init(pgd_t *pgd_base) 450static inline void permanent_kmaps_init(pgd_t *pgd_base)
470{ 451{
@@ -548,48 +529,6 @@ static void __init pagetable_init(void)
548 permanent_kmaps_init(pgd_base); 529 permanent_kmaps_init(pgd_base);
549} 530}
550 531
551#ifdef CONFIG_ACPI_SLEEP
552/*
553 * ACPI suspend needs this for resume, because things like the intel-agp
554 * driver might have split up a kernel 4MB mapping.
555 */
556char swsusp_pg_dir[PAGE_SIZE]
557 __attribute__ ((aligned(PAGE_SIZE)));
558
559static inline void save_pg_dir(void)
560{
561 memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
562}
563#else /* !CONFIG_ACPI_SLEEP */
564static inline void save_pg_dir(void)
565{
566}
567#endif /* !CONFIG_ACPI_SLEEP */
568
569void zap_low_mappings(bool early)
570{
571 int i;
572
573 /*
574 * Zap initial low-memory mappings.
575 *
576 * Note that "pgd_clear()" doesn't do it for
577 * us, because pgd_clear() is a no-op on i386.
578 */
579 for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
580#ifdef CONFIG_X86_PAE
581 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
582#else
583 set_pgd(swapper_pg_dir+i, __pgd(0));
584#endif
585 }
586
587 if (early)
588 __flush_tlb();
589 else
590 flush_tlb_all();
591}
592
593pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); 532pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
594EXPORT_SYMBOL_GPL(__supported_pte_mask); 533EXPORT_SYMBOL_GPL(__supported_pte_mask);
595 534
@@ -705,21 +644,20 @@ void __init find_low_pfn_range(void)
705} 644}
706 645
707#ifndef CONFIG_NEED_MULTIPLE_NODES 646#ifndef CONFIG_NEED_MULTIPLE_NODES
708void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 647void __init initmem_init(void)
709 int acpi, int k8)
710{ 648{
711#ifdef CONFIG_HIGHMEM 649#ifdef CONFIG_HIGHMEM
712 highstart_pfn = highend_pfn = max_pfn; 650 highstart_pfn = highend_pfn = max_pfn;
713 if (max_pfn > max_low_pfn) 651 if (max_pfn > max_low_pfn)
714 highstart_pfn = max_low_pfn; 652 highstart_pfn = max_low_pfn;
715 e820_register_active_regions(0, 0, highend_pfn); 653 memblock_x86_register_active_regions(0, 0, highend_pfn);
716 sparse_memory_present_with_active_regions(0); 654 sparse_memory_present_with_active_regions(0);
717 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", 655 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
718 pages_to_mb(highend_pfn - highstart_pfn)); 656 pages_to_mb(highend_pfn - highstart_pfn));
719 num_physpages = highend_pfn; 657 num_physpages = highend_pfn;
720 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; 658 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
721#else 659#else
722 e820_register_active_regions(0, 0, max_low_pfn); 660 memblock_x86_register_active_regions(0, 0, max_low_pfn);
723 sparse_memory_present_with_active_regions(0); 661 sparse_memory_present_with_active_regions(0);
724 num_physpages = max_low_pfn; 662 num_physpages = max_low_pfn;
725 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; 663 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
@@ -740,8 +678,10 @@ static void __init zone_sizes_init(void)
740{ 678{
741 unsigned long max_zone_pfns[MAX_NR_ZONES]; 679 unsigned long max_zone_pfns[MAX_NR_ZONES];
742 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 680 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
681#ifdef CONFIG_ZONE_DMA
743 max_zone_pfns[ZONE_DMA] = 682 max_zone_pfns[ZONE_DMA] =
744 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; 683 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
684#endif
745 max_zone_pfns[ZONE_NORMAL] = max_low_pfn; 685 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
746#ifdef CONFIG_HIGHMEM 686#ifdef CONFIG_HIGHMEM
747 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; 687 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
@@ -750,68 +690,12 @@ static void __init zone_sizes_init(void)
750 free_area_init_nodes(max_zone_pfns); 690 free_area_init_nodes(max_zone_pfns);
751} 691}
752 692
753#ifndef CONFIG_NO_BOOTMEM
754static unsigned long __init setup_node_bootmem(int nodeid,
755 unsigned long start_pfn,
756 unsigned long end_pfn,
757 unsigned long bootmap)
758{
759 unsigned long bootmap_size;
760
761 /* don't touch min_low_pfn */
762 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
763 bootmap >> PAGE_SHIFT,
764 start_pfn, end_pfn);
765 printk(KERN_INFO " node %d low ram: %08lx - %08lx\n",
766 nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
767 printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
768 nodeid, bootmap, bootmap + bootmap_size);
769 free_bootmem_with_active_regions(nodeid, end_pfn);
770
771 return bootmap + bootmap_size;
772}
773#endif
774
775void __init setup_bootmem_allocator(void) 693void __init setup_bootmem_allocator(void)
776{ 694{
777#ifndef CONFIG_NO_BOOTMEM
778 int nodeid;
779 unsigned long bootmap_size, bootmap;
780 /*
781 * Initialize the boot-time allocator (with low memory only):
782 */
783 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
784 bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
785 PAGE_SIZE);
786 if (bootmap == -1L)
787 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
788 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
789#endif
790
791 printk(KERN_INFO " mapped low ram: 0 - %08lx\n", 695 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
792 max_pfn_mapped<<PAGE_SHIFT); 696 max_pfn_mapped<<PAGE_SHIFT);
793 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); 697 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
794 698
795#ifndef CONFIG_NO_BOOTMEM
796 for_each_online_node(nodeid) {
797 unsigned long start_pfn, end_pfn;
798
799#ifdef CONFIG_NEED_MULTIPLE_NODES
800 start_pfn = node_start_pfn[nodeid];
801 end_pfn = node_end_pfn[nodeid];
802 if (start_pfn > max_low_pfn)
803 continue;
804 if (end_pfn > max_low_pfn)
805 end_pfn = max_low_pfn;
806#else
807 start_pfn = 0;
808 end_pfn = max_low_pfn;
809#endif
810 bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
811 bootmap);
812 }
813#endif
814
815 after_bootmem = 1; 699 after_bootmem = 1;
816} 700}
817 701
@@ -833,6 +717,8 @@ void __init paging_init(void)
833 /* 717 /*
834 * NOTE: at this point the bootmem allocator is fully available. 718 * NOTE: at this point the bootmem allocator is fully available.
835 */ 719 */
720 olpc_dt_build_devicetree();
721 sparse_memory_present_with_active_regions(MAX_NUMNODES);
836 sparse_init(); 722 sparse_init();
837 zone_sizes_init(); 723 zone_sizes_init();
838} 724}
@@ -958,9 +844,6 @@ void __init mem_init(void)
958 844
959 if (boot_cpu_data.wp_works_ok < 0) 845 if (boot_cpu_data.wp_works_ok < 0)
960 test_wp_bit(); 846 test_wp_bit();
961
962 save_pg_dir();
963 zap_low_mappings(true);
964} 847}
965 848
966#ifdef CONFIG_MEMORY_HOTPLUG 849#ifdef CONFIG_MEMORY_HOTPLUG
@@ -1033,6 +916,23 @@ void set_kernel_text_ro(void)
1033 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 916 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
1034} 917}
1035 918
919static void mark_nxdata_nx(void)
920{
921 /*
922 * When this called, init has already been executed and released,
923 * so everything past _etext should be NX.
924 */
925 unsigned long start = PFN_ALIGN(_etext);
926 /*
927 * This comes from is_kernel_text upper limit. Also HPAGE where used:
928 */
929 unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
930
931 if (__supported_pte_mask & _PAGE_NX)
932 printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
933 set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
934}
935
1036void mark_rodata_ro(void) 936void mark_rodata_ro(void)
1037{ 937{
1038 unsigned long start = PFN_ALIGN(_text); 938 unsigned long start = PFN_ALIGN(_text);
@@ -1067,11 +967,7 @@ void mark_rodata_ro(void)
1067 printk(KERN_INFO "Testing CPA: write protecting again\n"); 967 printk(KERN_INFO "Testing CPA: write protecting again\n");
1068 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 968 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
1069#endif 969#endif
970 mark_nxdata_nx();
1070} 971}
1071#endif 972#endif
1072 973
1073int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1074 int flags)
1075{
1076 return reserve_bootmem(phys, len, flags);
1077}
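
Besides moving the bootmem setup over to memblock, the init_32.c changes add mark_nxdata_nx(), which clears the executable bit on everything past _etext; the end of the region is __init_end pushed up to a 2 MB boundary, matching the is_kernel_text() limit and the huge-page mappings mentioned in the comment. The arithmetic, spelled out with hypothetical addresses just to make the numbers concrete:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define HPAGE_SIZE	(2UL * 1024 * 1024)
#define HPAGE_MASK	(~(HPAGE_SIZE - 1))
#define PFN_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long etext    = 0xc05f1234UL;	/* hypothetical end of kernel text */
	unsigned long init_end = 0xc07a0000UL;	/* hypothetical end of init sections */

	unsigned long start = PFN_ALIGN(etext);
	unsigned long size  = ((init_end + HPAGE_SIZE) & HPAGE_MASK) - start;

	printf("NX-protecting the kernel data: %luk from %#lx\n", size >> 10, start);
	return 0;
}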
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9a6674689a20..bbaaa005bf0e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -21,12 +21,14 @@
21#include <linux/initrd.h> 21#include <linux/initrd.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/memblock.h>
24#include <linux/proc_fs.h> 25#include <linux/proc_fs.h>
25#include <linux/pci.h> 26#include <linux/pci.h>
26#include <linux/pfn.h> 27#include <linux/pfn.h>
27#include <linux/poison.h> 28#include <linux/poison.h>
28#include <linux/dma-mapping.h> 29#include <linux/dma-mapping.h>
29#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/memory.h>
30#include <linux/memory_hotplug.h> 32#include <linux/memory_hotplug.h>
31#include <linux/nmi.h> 33#include <linux/nmi.h>
32#include <linux/gfp.h> 34#include <linux/gfp.h>
@@ -50,9 +52,8 @@
50#include <asm/numa.h> 52#include <asm/numa.h>
51#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
52#include <asm/init.h> 54#include <asm/init.h>
53#include <linux/bootmem.h> 55#include <asm/uv/uv.h>
54 56#include <asm/setup.h>
55static unsigned long dma_reserve __initdata;
56 57
57static int __init parse_direct_gbpages_off(char *arg) 58static int __init parse_direct_gbpages_off(char *arg)
58{ 59{
@@ -98,6 +99,43 @@ static int __init nonx32_setup(char *str)
98__setup("noexec32=", nonx32_setup); 99__setup("noexec32=", nonx32_setup);
99 100
100/* 101/*
102 * When memory was added/removed make sure all the processes MM have
103 * suitable PGD entries in the local PGD level page.
104 */
105void sync_global_pgds(unsigned long start, unsigned long end)
106{
107 unsigned long address;
108
109 for (address = start; address <= end; address += PGDIR_SIZE) {
110 const pgd_t *pgd_ref = pgd_offset_k(address);
111 struct page *page;
112
113 if (pgd_none(*pgd_ref))
114 continue;
115
116 spin_lock(&pgd_lock);
117 list_for_each_entry(page, &pgd_list, lru) {
118 pgd_t *pgd;
119 spinlock_t *pgt_lock;
120
121 pgd = (pgd_t *)page_address(page) + pgd_index(address);
122 /* the pgt_lock only for Xen */
123 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
124 spin_lock(pgt_lock);
125
126 if (pgd_none(*pgd))
127 set_pgd(pgd, *pgd_ref);
128 else
129 BUG_ON(pgd_page_vaddr(*pgd)
130 != pgd_page_vaddr(*pgd_ref));
131
132 spin_unlock(pgt_lock);
133 }
134 spin_unlock(&pgd_lock);
135 }
136}
137
138/*
101 * NOTE: This function is marked __ref because it calls __init function 139 * NOTE: This function is marked __ref because it calls __init function
102 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. 140 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
103 */ 141 */
@@ -258,18 +296,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
258 * to the compile time generated pmds. This results in invalid pmds up 296 * to the compile time generated pmds. This results in invalid pmds up
259 * to the point where we hit the physaddr 0 mapping. 297 * to the point where we hit the physaddr 0 mapping.
260 * 298 *
261 * We limit the mappings to the region from _text to _end. _end is 299 * We limit the mappings to the region from _text to _brk_end. _brk_end
262 * rounded up to the 2MB boundary. This catches the invalid pmds as 300 * is rounded up to the 2MB boundary. This catches the invalid pmds as
263 * well, as they are located before _text: 301 * well, as they are located before _text:
264 */ 302 */
265void __init cleanup_highmap(void) 303void __init cleanup_highmap(void)
266{ 304{
267 unsigned long vaddr = __START_KERNEL_map; 305 unsigned long vaddr = __START_KERNEL_map;
268 unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1; 306 unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
307 unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
269 pmd_t *pmd = level2_kernel_pgt; 308 pmd_t *pmd = level2_kernel_pgt;
270 pmd_t *last_pmd = pmd + PTRS_PER_PMD;
271 309
272 for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { 310 for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
273 if (pmd_none(*pmd)) 311 if (pmd_none(*pmd))
274 continue; 312 continue;
275 if (vaddr < (unsigned long) _text || vaddr > end) 313 if (vaddr < (unsigned long) _text || vaddr > end)
@@ -279,7 +317,7 @@ void __init cleanup_highmap(void)
279 317
280static __ref void *alloc_low_page(unsigned long *phys) 318static __ref void *alloc_low_page(unsigned long *phys)
281{ 319{
282 unsigned long pfn = e820_table_end++; 320 unsigned long pfn = pgt_buf_end++;
283 void *adr; 321 void *adr;
284 322
285 if (after_bootmem) { 323 if (after_bootmem) {
@@ -289,21 +327,37 @@ static __ref void *alloc_low_page(unsigned long *phys)
289 return adr; 327 return adr;
290 } 328 }
291 329
292 if (pfn >= e820_table_top) 330 if (pfn >= pgt_buf_top)
293 panic("alloc_low_page: ran out of memory"); 331 panic("alloc_low_page: ran out of memory");
294 332
295 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); 333 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
296 memset(adr, 0, PAGE_SIZE); 334 clear_page(adr);
297 *phys = pfn * PAGE_SIZE; 335 *phys = pfn * PAGE_SIZE;
298 return adr; 336 return adr;
299} 337}
300 338
339static __ref void *map_low_page(void *virt)
340{
341 void *adr;
342 unsigned long phys, left;
343
344 if (after_bootmem)
345 return virt;
346
347 phys = __pa(virt);
348 left = phys & (PAGE_SIZE - 1);
349 adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
350 adr = (void *)(((unsigned long)adr) | left);
351
352 return adr;
353}
354
301static __ref void unmap_low_page(void *adr) 355static __ref void unmap_low_page(void *adr)
302{ 356{
303 if (after_bootmem) 357 if (after_bootmem)
304 return; 358 return;
305 359
306 early_iounmap(adr, PAGE_SIZE); 360 early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
307} 361}
308 362
309static unsigned long __meminit 363static unsigned long __meminit
@@ -351,15 +405,6 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
351} 405}
352 406
353static unsigned long __meminit 407static unsigned long __meminit
354phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
355 pgprot_t prot)
356{
357 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
358
359 return phys_pte_init(pte, address, end, prot);
360}
361
362static unsigned long __meminit
363phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, 408phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
364 unsigned long page_size_mask, pgprot_t prot) 409 unsigned long page_size_mask, pgprot_t prot)
365{ 410{
@@ -385,8 +430,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
385 if (pmd_val(*pmd)) { 430 if (pmd_val(*pmd)) {
386 if (!pmd_large(*pmd)) { 431 if (!pmd_large(*pmd)) {
387 spin_lock(&init_mm.page_table_lock); 432 spin_lock(&init_mm.page_table_lock);
388 last_map_addr = phys_pte_update(pmd, address, 433 pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
434 last_map_addr = phys_pte_init(pte, address,
389 end, prot); 435 end, prot);
436 unmap_low_page(pte);
390 spin_unlock(&init_mm.page_table_lock); 437 spin_unlock(&init_mm.page_table_lock);
391 continue; 438 continue;
392 } 439 }
@@ -433,18 +480,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
433} 480}
434 481
435static unsigned long __meminit 482static unsigned long __meminit
436phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
437 unsigned long page_size_mask, pgprot_t prot)
438{
439 pmd_t *pmd = pmd_offset(pud, 0);
440 unsigned long last_map_addr;
441
442 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
443 __flush_tlb_all();
444 return last_map_addr;
445}
446
447static unsigned long __meminit
448phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, 483phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
449 unsigned long page_size_mask) 484 unsigned long page_size_mask)
450{ 485{
@@ -469,8 +504,11 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
469 504
470 if (pud_val(*pud)) { 505 if (pud_val(*pud)) {
471 if (!pud_large(*pud)) { 506 if (!pud_large(*pud)) {
472 last_map_addr = phys_pmd_update(pud, addr, end, 507 pmd = map_low_page(pmd_offset(pud, 0));
508 last_map_addr = phys_pmd_init(pmd, addr, end,
473 page_size_mask, prot); 509 page_size_mask, prot);
510 unmap_low_page(pmd);
511 __flush_tlb_all();
474 continue; 512 continue;
475 } 513 }
476 /* 514 /*
@@ -518,27 +556,18 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
518 return last_map_addr; 556 return last_map_addr;
519} 557}
520 558
521static unsigned long __meminit
522phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
523 unsigned long page_size_mask)
524{
525 pud_t *pud;
526
527 pud = (pud_t *)pgd_page_vaddr(*pgd);
528
529 return phys_pud_init(pud, addr, end, page_size_mask);
530}
531
532unsigned long __meminit 559unsigned long __meminit
533kernel_physical_mapping_init(unsigned long start, 560kernel_physical_mapping_init(unsigned long start,
534 unsigned long end, 561 unsigned long end,
535 unsigned long page_size_mask) 562 unsigned long page_size_mask)
536{ 563{
537 564 bool pgd_changed = false;
538 unsigned long next, last_map_addr = end; 565 unsigned long next, last_map_addr = end;
566 unsigned long addr;
539 567
540 start = (unsigned long)__va(start); 568 start = (unsigned long)__va(start);
541 end = (unsigned long)__va(end); 569 end = (unsigned long)__va(end);
570 addr = start;
542 571
543 for (; start < end; start = next) { 572 for (; start < end; start = next) {
544 pgd_t *pgd = pgd_offset_k(start); 573 pgd_t *pgd = pgd_offset_k(start);
@@ -550,8 +579,10 @@ kernel_physical_mapping_init(unsigned long start,
550 next = end; 579 next = end;
551 580
552 if (pgd_val(*pgd)) { 581 if (pgd_val(*pgd)) {
553 last_map_addr = phys_pud_update(pgd, __pa(start), 582 pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
583 last_map_addr = phys_pud_init(pud, __pa(start),
554 __pa(end), page_size_mask); 584 __pa(end), page_size_mask);
585 unmap_low_page(pud);
555 continue; 586 continue;
556 } 587 }
557 588
@@ -563,33 +594,21 @@ kernel_physical_mapping_init(unsigned long start,
563 spin_lock(&init_mm.page_table_lock); 594 spin_lock(&init_mm.page_table_lock);
564 pgd_populate(&init_mm, pgd, __va(pud_phys)); 595 pgd_populate(&init_mm, pgd, __va(pud_phys));
565 spin_unlock(&init_mm.page_table_lock); 596 spin_unlock(&init_mm.page_table_lock);
597 pgd_changed = true;
566 } 598 }
599
600 if (pgd_changed)
601 sync_global_pgds(addr, end);
602
567 __flush_tlb_all(); 603 __flush_tlb_all();
568 604
569 return last_map_addr; 605 return last_map_addr;
570} 606}
571 607
572#ifndef CONFIG_NUMA 608#ifndef CONFIG_NUMA
573void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 609void __init initmem_init(void)
574 int acpi, int k8) 610{
575{ 611 memblock_x86_register_active_regions(0, 0, max_pfn);
576#ifndef CONFIG_NO_BOOTMEM
577 unsigned long bootmap_size, bootmap;
578
579 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
580 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
581 PAGE_SIZE);
582 if (bootmap == -1L)
583 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
584 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
585 /* don't touch min_low_pfn */
586 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
587 0, end_pfn);
588 e820_register_active_regions(0, start_pfn, end_pfn);
589 free_bootmem_with_active_regions(0, end_pfn);
590#else
591 e820_register_active_regions(0, start_pfn, end_pfn);
592#endif
593} 612}
594#endif 613#endif
595 614
@@ -598,7 +617,9 @@ void __init paging_init(void)
598 unsigned long max_zone_pfns[MAX_NR_ZONES]; 617 unsigned long max_zone_pfns[MAX_NR_ZONES];
599 618
600 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 619 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
620#ifdef CONFIG_ZONE_DMA
601 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 621 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
622#endif
602 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 623 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
603 max_zone_pfns[ZONE_NORMAL] = max_pfn; 624 max_zone_pfns[ZONE_NORMAL] = max_pfn;
604 625
@@ -661,14 +682,6 @@ int arch_add_memory(int nid, u64 start, u64 size)
661} 682}
662EXPORT_SYMBOL_GPL(arch_add_memory); 683EXPORT_SYMBOL_GPL(arch_add_memory);
663 684
664#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
665int memory_add_physaddr_to_nid(u64 start)
666{
667 return 0;
668}
669EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
670#endif
671
672#endif /* CONFIG_MEMORY_HOTPLUG */ 685#endif /* CONFIG_MEMORY_HOTPLUG */
673 686
674static struct kcore_list kcore_vsyscall; 687static struct kcore_list kcore_vsyscall;
@@ -799,52 +812,6 @@ void mark_rodata_ro(void)
799 812
800#endif 813#endif
801 814
802int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
803 int flags)
804{
805#ifdef CONFIG_NUMA
806 int nid, next_nid;
807 int ret;
808#endif
809 unsigned long pfn = phys >> PAGE_SHIFT;
810
811 if (pfn >= max_pfn) {
812 /*
813 * This can happen with kdump kernels when accessing
814 * firmware tables:
815 */
816 if (pfn < max_pfn_mapped)
817 return -EFAULT;
818
819 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
820 phys, len);
821 return -EFAULT;
822 }
823
824 /* Should check here against the e820 map to avoid double free */
825#ifdef CONFIG_NUMA
826 nid = phys_to_nid(phys);
827 next_nid = phys_to_nid(phys + len - 1);
828 if (nid == next_nid)
829 ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
830 else
831 ret = reserve_bootmem(phys, len, flags);
832
833 if (ret != 0)
834 return ret;
835
836#else
837 reserve_bootmem(phys, len, flags);
838#endif
839
840 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
841 dma_reserve += len / PAGE_SIZE;
842 set_dma_reserve(dma_reserve);
843 }
844
845 return 0;
846}
847
848int kern_addr_valid(unsigned long addr) 815int kern_addr_valid(unsigned long addr)
849{ 816{
850 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; 817 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
@@ -890,18 +857,18 @@ static struct vm_area_struct gate_vma = {
890 .vm_flags = VM_READ | VM_EXEC 857 .vm_flags = VM_READ | VM_EXEC
891}; 858};
892 859
893struct vm_area_struct *get_gate_vma(struct task_struct *tsk) 860struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
894{ 861{
895#ifdef CONFIG_IA32_EMULATION 862#ifdef CONFIG_IA32_EMULATION
896 if (test_tsk_thread_flag(tsk, TIF_IA32)) 863 if (!mm || mm->context.ia32_compat)
897 return NULL; 864 return NULL;
898#endif 865#endif
899 return &gate_vma; 866 return &gate_vma;
900} 867}
901 868
902int in_gate_area(struct task_struct *task, unsigned long addr) 869int in_gate_area(struct mm_struct *mm, unsigned long addr)
903{ 870{
904 struct vm_area_struct *vma = get_gate_vma(task); 871 struct vm_area_struct *vma = get_gate_vma(mm);
905 872
906 if (!vma) 873 if (!vma)
907 return 0; 874 return 0;
@@ -910,11 +877,11 @@ int in_gate_area(struct task_struct *task, unsigned long addr)
910} 877}
911 878
912/* 879/*
913 * Use this when you have no reliable task/vma, typically from interrupt 880 * Use this when you have no reliable mm, typically from interrupt
914 * context. It is less reliable than using the task's vma and may give 881 * context. It is less reliable than using a task's mm and may give
915 * false positives: 882 * false positives.
916 */ 883 */
917int in_gate_area_no_task(unsigned long addr) 884int in_gate_area_no_mm(unsigned long addr)
918{ 885{
919 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); 886 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
920} 887}
@@ -928,6 +895,17 @@ const char *arch_vma_name(struct vm_area_struct *vma)
928 return NULL; 895 return NULL;
929} 896}
930 897
898#ifdef CONFIG_X86_UV
899unsigned long memory_block_size_bytes(void)
900{
901 if (is_uv_system()) {
902 printk(KERN_INFO "UV: memory block size 2GB\n");
903 return 2UL * 1024 * 1024 * 1024;
904 }
905 return MIN_MEMORY_BLOCK_SIZE;
906}
907#endif
908
931#ifdef CONFIG_SPARSEMEM_VMEMMAP 909#ifdef CONFIG_SPARSEMEM_VMEMMAP
932/* 910/*
933 * Initialise the sparsemem vmemmap using huge-pages at the PMD level. 911 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
@@ -1003,6 +981,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
1003 } 981 }
1004 982
1005 } 983 }
984 sync_global_pgds((unsigned long)start_page, end);
1006 return 0; 985 return 0;
1007} 986}
1008 987
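The map_low_page()/unmap_low_page() pair added above maps a single page with early_memremap() but carries the sub-page offset of the requested pointer through, so callers like phys_pmd_init() can pass an unaligned table address and get back a directly usable virtual address. A minimal userspace sketch of that offset-carry idea, not part of the patch (toy names, aligned_alloc() standing in for early_memremap()):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TOY_PAGE_SIZE	4096UL
#define TOY_PAGE_MASK	(~((uintptr_t)TOY_PAGE_SIZE - 1))

/* Map the page containing 'phys' and re-apply the in-page offset. */
static void *toy_map_low_page(uintptr_t phys)
{
	uintptr_t left = phys & (TOY_PAGE_SIZE - 1);
	/* stand-in for early_memremap(phys & PAGE_MASK, PAGE_SIZE) */
	uintptr_t page = (uintptr_t)aligned_alloc(TOY_PAGE_SIZE, TOY_PAGE_SIZE);

	return page ? (void *)(page | left) : NULL;
}

/* Mask the carried offset back off before freeing, as unmap_low_page() does. */
static void toy_unmap_low_page(void *adr)
{
	free((void *)((uintptr_t)adr & TOY_PAGE_MASK));
}

int main(void)
{
	void *p = toy_map_low_page(0x12345678);

	printf("carried offset: %#lx\n",
	       (unsigned long)((uintptr_t)p & (TOY_PAGE_SIZE - 1)));
	toy_unmap_low_page(p);
	return 0;
}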
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 72fc70cf6184..7b179b499fa3 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -48,21 +48,20 @@ int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
48} 48}
49EXPORT_SYMBOL_GPL(iomap_create_wc); 49EXPORT_SYMBOL_GPL(iomap_create_wc);
50 50
51void 51void iomap_free(resource_size_t base, unsigned long size)
52iomap_free(resource_size_t base, unsigned long size)
53{ 52{
54 io_free_memtype(base, base + size); 53 io_free_memtype(base, base + size);
55} 54}
56EXPORT_SYMBOL_GPL(iomap_free); 55EXPORT_SYMBOL_GPL(iomap_free);
57 56
58void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) 57void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
59{ 58{
60 enum fixed_addresses idx;
61 unsigned long vaddr; 59 unsigned long vaddr;
60 int idx, type;
62 61
63 pagefault_disable(); 62 pagefault_disable();
64 63
65 debug_kmap_atomic(type); 64 type = kmap_atomic_idx_push();
66 idx = type + KM_TYPE_NR * smp_processor_id(); 65 idx = type + KM_TYPE_NR * smp_processor_id();
67 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); 66 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
68 set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); 67 set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
@@ -72,10 +71,10 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
72} 71}
73 72
74/* 73/*
75 * Map 'pfn' using fixed map 'type' and protections 'prot' 74 * Map 'pfn' using protections 'prot'
76 */ 75 */
77void __iomem * 76void __iomem *
78iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) 77iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
79{ 78{
80 /* 79 /*
81 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. 80 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
@@ -86,24 +85,34 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
86 if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) 85 if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
87 prot = PAGE_KERNEL_UC_MINUS; 86 prot = PAGE_KERNEL_UC_MINUS;
88 87
89 return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, type, prot); 88 return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot);
90} 89}
91EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); 90EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
92 91
93void 92void
94iounmap_atomic(void __iomem *kvaddr, enum km_type type) 93iounmap_atomic(void __iomem *kvaddr)
95{ 94{
96 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; 95 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
97 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
98 96
99 /* 97 if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
100 * Force other mappings to Oops if they'll try to access this pte 98 vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
101 * without first remap it. Keeping stale mappings around is a bad idea 99 int idx, type;
102 * also, in case the page changes cacheability attributes or becomes 100
103 * a protected page in a hypervisor. 101 type = kmap_atomic_idx();
104 */ 102 idx = type + KM_TYPE_NR * smp_processor_id();
105 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) 103
104#ifdef CONFIG_DEBUG_HIGHMEM
105 WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
106#endif
107 /*
108 * Force other mappings to Oops if they try to access this
109 * pte without first remapping it. Keeping stale mappings around
110 * is also a bad idea, in case the page changes cacheability
111 * attributes or becomes a protected page in a hypervisor.
112 */
106 kpte_clear_flush(kmap_pte-idx, vaddr); 113 kpte_clear_flush(kmap_pte-idx, vaddr);
114 kmap_atomic_idx_pop();
115 }
107 116
108 pagefault_enable(); 117 pagefault_enable();
109} 118}
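With the km_type argument dropped, iomap_atomic_prot_pfn() and iounmap_atomic() manage the fixmap slot internally through the per-CPU index stack (kmap_atomic_idx_push()/kmap_atomic_idx_pop()), so callers only have to pair map and unmap. A hedged usage sketch against the two exported signatures in the hunk above; the helper, pfn and length are invented:

#include <linux/io.h>
#include <asm/iomap.h>

/* Hypothetical helper: copy up to one page into a write-combined frame. */
static void copy_to_wc_pfn(unsigned long pfn, const void *src, size_t len)
{
	void __iomem *va;

	va = iomap_atomic_prot_pfn(pfn, PAGE_KERNEL_WC);	/* no km_type argument any more */
	memcpy_toio(va, src, len);				/* atomic context: no sleeping here */
	iounmap_atomic(va);					/* slot index popped internally */
}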
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 3ba6e0608c55..be1ef574ce9a 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -91,13 +91,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
91 return (__force void __iomem *)phys_to_virt(phys_addr); 91 return (__force void __iomem *)phys_to_virt(phys_addr);
92 92
93 /* 93 /*
94 * Check if the request spans more than any BAR in the iomem resource
95 * tree.
96 */
97 WARN_ONCE(iomem_map_sanity_check(phys_addr, size),
98 KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
99
100 /*
101 * Don't allow anybody to remap normal RAM that we're using.. 94 * Don't allow anybody to remap normal RAM that we're using..
102 */ 95 */
103 last_pfn = last_addr >> PAGE_SHIFT; 96 last_pfn = last_addr >> PAGE_SHIFT;
@@ -170,6 +163,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
170 ret_addr = (void __iomem *) (vaddr + offset); 163 ret_addr = (void __iomem *) (vaddr + offset);
171 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); 164 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
172 165
166 /*
167 * Check if the request spans more than any BAR in the iomem resource
168 * tree.
169 */
170 WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size),
171 KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
172
173 return ret_addr; 173 return ret_addr;
174err_free_area: 174err_free_area:
175 free_vm_area(area); 175 free_vm_area(area);
@@ -362,6 +362,11 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr)
362 return &bm_pte[pte_index(addr)]; 362 return &bm_pte[pte_index(addr)];
363} 363}
364 364
365bool __init is_early_ioremap_ptep(pte_t *ptep)
366{
367 return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)];
368}
369
365static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; 370static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
366 371
367void __init early_ioremap_init(void) 372void __init early_ioremap_init(void)
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index af3b6c8a436f..704a37cedddb 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,
185 e->trace.entries = e->trace_entries; 185 e->trace.entries = e->trace_entries;
186 e->trace.max_entries = ARRAY_SIZE(e->trace_entries); 186 e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
187 e->trace.skip = 0; 187 e->trace.skip = 0;
188 save_stack_trace_bp(&e->trace, regs->bp); 188 save_stack_trace_regs(&e->trace, regs);
189 189
190 /* Round address down to nearest 16 bytes */ 190 /* Round address down to nearest 16 bytes */
191 shadow_copy = kmemcheck_shadow_lookup(address 191 shadow_copy = kmemcheck_shadow_lookup(address
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index b3b531a4f8e5..d87dd6d042d6 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -631,6 +631,8 @@ bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
631 if (!pte) 631 if (!pte)
632 return false; 632 return false;
633 633
634 WARN_ON_ONCE(in_nmi());
635
634 if (error_code & 2) 636 if (error_code & 2)
635 kmemcheck_access(regs, address, KMEMCHECK_WRITE); 637 kmemcheck_access(regs, address, KMEMCHECK_WRITE);
636 else 638 else
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
index 63c19e27aa6f..324aa3f07237 100644
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -9,7 +9,7 @@ static bool opcode_is_prefix(uint8_t b)
9 b == 0xf0 || b == 0xf2 || b == 0xf3 9 b == 0xf0 || b == 0xf2 || b == 0xf3
10 /* Group 2 */ 10 /* Group 2 */
11 || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 11 || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
12 || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e 12 || b == 0x64 || b == 0x65
13 /* Group 3 */ 13 /* Group 3 */
14 || b == 0x66 14 || b == 0x66
15 /* Group 4 */ 15 /* Group 4 */
diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c
new file mode 100644
index 000000000000..992da5ec5a64
--- /dev/null
+++ b/arch/x86/mm/memblock.c
@@ -0,0 +1,348 @@
1#include <linux/kernel.h>
2#include <linux/types.h>
3#include <linux/init.h>
4#include <linux/bitops.h>
5#include <linux/memblock.h>
6#include <linux/bootmem.h>
7#include <linux/mm.h>
8#include <linux/range.h>
9
10/* Check for already reserved areas */
11bool __init memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align)
12{
13 struct memblock_region *r;
14 u64 addr = *addrp, last;
15 u64 size = *sizep;
16 bool changed = false;
17
18again:
19 last = addr + size;
20 for_each_memblock(reserved, r) {
21 if (last > r->base && addr < r->base) {
22 size = r->base - addr;
23 changed = true;
24 goto again;
25 }
26 if (last > (r->base + r->size) && addr < (r->base + r->size)) {
27 addr = round_up(r->base + r->size, align);
28 size = last - addr;
29 changed = true;
30 goto again;
31 }
32 if (last <= (r->base + r->size) && addr >= r->base) {
33 *sizep = 0;
34 return false;
35 }
36 }
37 if (changed) {
38 *addrp = addr;
39 *sizep = size;
40 }
41 return changed;
42}
43
44/*
45 * Find next free range after start, and size is returned in *sizep
46 */
47u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align)
48{
49 struct memblock_region *r;
50
51 for_each_memblock(memory, r) {
52 u64 ei_start = r->base;
53 u64 ei_last = ei_start + r->size;
54 u64 addr;
55
56 addr = round_up(ei_start, align);
57 if (addr < start)
58 addr = round_up(start, align);
59 if (addr >= ei_last)
60 continue;
61 *sizep = ei_last - addr;
62 while (memblock_x86_check_reserved_size(&addr, sizep, align))
63 ;
64
65 if (*sizep)
66 return addr;
67 }
68
69 return MEMBLOCK_ERROR;
70}
71
72static __init struct range *find_range_array(int count)
73{
74 u64 end, size, mem;
75 struct range *range;
76
77 size = sizeof(struct range) * count;
78 end = memblock.current_limit;
79
80 mem = memblock_find_in_range(0, end, size, sizeof(struct range));
81 if (mem == MEMBLOCK_ERROR)
82 panic("can not find more space for range array");
83
84 /*
85 * This range is temporary, so don't reserve it; it will not be
86 * overlapped because we will not allocate a new buffer before
87 * we discard this one
88 */
89 range = __va(mem);
90 memset(range, 0, size);
91
92 return range;
93}
94
95static void __init memblock_x86_subtract_reserved(struct range *range, int az)
96{
97 u64 final_start, final_end;
98 struct memblock_region *r;
99
100 /* Take out the region array itself first */
101 memblock_free_reserved_regions();
102
103 memblock_dbg("Subtract (%ld early reservations)\n", memblock.reserved.cnt);
104
105 for_each_memblock(reserved, r) {
106 memblock_dbg(" [%010llx-%010llx]\n", (u64)r->base, (u64)r->base + r->size - 1);
107 final_start = PFN_DOWN(r->base);
108 final_end = PFN_UP(r->base + r->size);
109 if (final_start >= final_end)
110 continue;
111 subtract_range(range, az, final_start, final_end);
112 }
113
114 /* Put region array back ? */
115 memblock_reserve_reserved_regions();
116}
117
118struct count_data {
119 int nr;
120};
121
122static int __init count_work_fn(unsigned long start_pfn,
123 unsigned long end_pfn, void *datax)
124{
125 struct count_data *data = datax;
126
127 data->nr++;
128
129 return 0;
130}
131
132static int __init count_early_node_map(int nodeid)
133{
134 struct count_data data;
135
136 data.nr = 0;
137 work_with_active_regions(nodeid, count_work_fn, &data);
138
139 return data.nr;
140}
141
142int __init __get_free_all_memory_range(struct range **rangep, int nodeid,
143 unsigned long start_pfn, unsigned long end_pfn)
144{
145 int count;
146 struct range *range;
147 int nr_range;
148
149 count = (memblock.reserved.cnt + count_early_node_map(nodeid)) * 2;
150
151 range = find_range_array(count);
152 nr_range = 0;
153
154 /*
155 * Use early_node_map[] and memblock.reserved.region to get range array
156 * at first
157 */
158 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
159 subtract_range(range, count, 0, start_pfn);
160 subtract_range(range, count, end_pfn, -1ULL);
161
162 memblock_x86_subtract_reserved(range, count);
163 nr_range = clean_sort_range(range, count);
164
165 *rangep = range;
166 return nr_range;
167}
168
169int __init get_free_all_memory_range(struct range **rangep, int nodeid)
170{
171 unsigned long end_pfn = -1UL;
172
173#ifdef CONFIG_X86_32
174 end_pfn = max_low_pfn;
175#endif
176 return __get_free_all_memory_range(rangep, nodeid, 0, end_pfn);
177}
178
179static u64 __init __memblock_x86_memory_in_range(u64 addr, u64 limit, bool get_free)
180{
181 int i, count;
182 struct range *range;
183 int nr_range;
184 u64 final_start, final_end;
185 u64 free_size;
186 struct memblock_region *r;
187
188 count = (memblock.reserved.cnt + memblock.memory.cnt) * 2;
189
190 range = find_range_array(count);
191 nr_range = 0;
192
193 addr = PFN_UP(addr);
194 limit = PFN_DOWN(limit);
195
196 for_each_memblock(memory, r) {
197 final_start = PFN_UP(r->base);
198 final_end = PFN_DOWN(r->base + r->size);
199 if (final_start >= final_end)
200 continue;
201 if (final_start >= limit || final_end <= addr)
202 continue;
203
204 nr_range = add_range(range, count, nr_range, final_start, final_end);
205 }
206 subtract_range(range, count, 0, addr);
207 subtract_range(range, count, limit, -1ULL);
208
209 /* Subtract memblock.reserved.region in range ? */
210 if (!get_free)
211 goto sort_and_count_them;
212 for_each_memblock(reserved, r) {
213 final_start = PFN_DOWN(r->base);
214 final_end = PFN_UP(r->base + r->size);
215 if (final_start >= final_end)
216 continue;
217 if (final_start >= limit || final_end <= addr)
218 continue;
219
220 subtract_range(range, count, final_start, final_end);
221 }
222
223sort_and_count_them:
224 nr_range = clean_sort_range(range, count);
225
226 free_size = 0;
227 for (i = 0; i < nr_range; i++)
228 free_size += range[i].end - range[i].start;
229
230 return free_size << PAGE_SHIFT;
231}
232
233u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit)
234{
235 return __memblock_x86_memory_in_range(addr, limit, true);
236}
237
238u64 __init memblock_x86_memory_in_range(u64 addr, u64 limit)
239{
240 return __memblock_x86_memory_in_range(addr, limit, false);
241}
242
243void __init memblock_x86_reserve_range(u64 start, u64 end, char *name)
244{
245 if (start == end)
246 return;
247
248 if (WARN_ONCE(start > end, "memblock_x86_reserve_range: wrong range [%#llx, %#llx)\n", start, end))
249 return;
250
251 memblock_dbg(" memblock_x86_reserve_range: [%#010llx-%#010llx] %16s\n", start, end - 1, name);
252
253 memblock_reserve(start, end - start);
254}
255
256void __init memblock_x86_free_range(u64 start, u64 end)
257{
258 if (start == end)
259 return;
260
261 if (WARN_ONCE(start > end, "memblock_x86_free_range: wrong range [%#llx, %#llx)\n", start, end))
262 return;
263
264 memblock_dbg(" memblock_x86_free_range: [%#010llx-%#010llx]\n", start, end - 1);
265
266 memblock_free(start, end - start);
267}
268
269/*
270 * Need to call this function after memblock_x86_register_active_regions,
271 * so early_node_map[] is filled already.
272 */
273u64 __init memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align)
274{
275 u64 addr;
276 addr = find_memory_core_early(nid, size, align, start, end);
277 if (addr != MEMBLOCK_ERROR)
278 return addr;
279
280 /* Fallback, should already have start end within node range */
281 return memblock_find_in_range(start, end, size, align);
282}
283
284/*
285 * Finds an active region in the address range from start_pfn to last_pfn and
286 * returns its range in ei_startpfn and ei_endpfn for the memblock entry.
287 */
288static int __init memblock_x86_find_active_region(const struct memblock_region *ei,
289 unsigned long start_pfn,
290 unsigned long last_pfn,
291 unsigned long *ei_startpfn,
292 unsigned long *ei_endpfn)
293{
294 u64 align = PAGE_SIZE;
295
296 *ei_startpfn = round_up(ei->base, align) >> PAGE_SHIFT;
297 *ei_endpfn = round_down(ei->base + ei->size, align) >> PAGE_SHIFT;
298
299 /* Skip map entries smaller than a page */
300 if (*ei_startpfn >= *ei_endpfn)
301 return 0;
302
303 /* Skip if map is outside the node */
304 if (*ei_endpfn <= start_pfn || *ei_startpfn >= last_pfn)
305 return 0;
306
307 /* Check for overlaps */
308 if (*ei_startpfn < start_pfn)
309 *ei_startpfn = start_pfn;
310 if (*ei_endpfn > last_pfn)
311 *ei_endpfn = last_pfn;
312
313 return 1;
314}
315
316/* Walk the memblock.memory map and register active regions within a node */
317void __init memblock_x86_register_active_regions(int nid, unsigned long start_pfn,
318 unsigned long last_pfn)
319{
320 unsigned long ei_startpfn;
321 unsigned long ei_endpfn;
322 struct memblock_region *r;
323
324 for_each_memblock(memory, r)
325 if (memblock_x86_find_active_region(r, start_pfn, last_pfn,
326 &ei_startpfn, &ei_endpfn))
327 add_active_range(nid, ei_startpfn, ei_endpfn);
328}
329
330/*
331 * Find the hole size (in bytes) in the memory range.
332 * @start: starting address of the memory range to scan
333 * @end: ending address of the memory range to scan
334 */
335u64 __init memblock_x86_hole_size(u64 start, u64 end)
336{
337 unsigned long start_pfn = start >> PAGE_SHIFT;
338 unsigned long last_pfn = end >> PAGE_SHIFT;
339 unsigned long ei_startpfn, ei_endpfn, ram = 0;
340 struct memblock_region *r;
341
342 for_each_memblock(memory, r)
343 if (memblock_x86_find_active_region(r, start_pfn, last_pfn,
344 &ei_startpfn, &ei_endpfn))
345 ram += ei_endpfn - ei_startpfn;
346
347 return end - start - ((u64)ram << PAGE_SHIFT);
348}
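memblock_x86_find_in_range_size() and memblock_x86_check_reserved_size() above are meant to be iterated: the former returns the next chunk of RAM at or above 'start', the latter trims it against every reserved region, and MEMBLOCK_ERROR signals that nothing is left (memtest.c below uses exactly this loop). A minimal sketch of such a walk; the function name is hypothetical:

/* Visit every unreserved RAM range, lowest address first. */
static void __init walk_free_ranges(void)
{
	u64 start = 0, size;

	for (;;) {
		start = memblock_x86_find_in_range_size(start, &size, PAGE_SIZE);
		if (start == MEMBLOCK_ERROR)
			break;
		/* [start, start + size) carries no memblock reservation */
		start += size;
	}
}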
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 18d244f70205..92faf3a1c53e 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -6,8 +6,7 @@
6#include <linux/smp.h> 6#include <linux/smp.h>
7#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/pfn.h> 8#include <linux/pfn.h>
9 9#include <linux/memblock.h>
10#include <asm/e820.h>
11 10
12static u64 patterns[] __initdata = { 11static u64 patterns[] __initdata = {
13 0, 12 0,
@@ -35,7 +34,7 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
35 (unsigned long long) pattern, 34 (unsigned long long) pattern,
36 (unsigned long long) start_bad, 35 (unsigned long long) start_bad,
37 (unsigned long long) end_bad); 36 (unsigned long long) end_bad);
38 reserve_early(start_bad, end_bad, "BAD RAM"); 37 memblock_x86_reserve_range(start_bad, end_bad, "BAD RAM");
39} 38}
40 39
41static void __init memtest(u64 pattern, u64 start_phys, u64 size) 40static void __init memtest(u64 pattern, u64 start_phys, u64 size)
@@ -74,7 +73,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end)
74 u64 size = 0; 73 u64 size = 0;
75 74
76 while (start < end) { 75 while (start < end) {
77 start = find_e820_area_size(start, &size, 1); 76 start = memblock_x86_find_in_range_size(start, &size, 1);
78 77
79 /* done ? */ 78 /* done ? */
80 if (start >= end) 79 if (start >= end)
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 787c52ca49c3..f5510d889a22 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -1,15 +1,112 @@
1/* Common code for 32 and 64-bit NUMA */ 1/* Common code for 32 and 64-bit NUMA */
2#include <linux/topology.h> 2#include <linux/kernel.h>
3#include <linux/module.h> 3#include <linux/mm.h>
4#include <linux/string.h>
5#include <linux/init.h>
4#include <linux/bootmem.h> 6#include <linux/bootmem.h>
7#include <linux/memblock.h>
8#include <linux/mmzone.h>
9#include <linux/ctype.h>
10#include <linux/module.h>
11#include <linux/nodemask.h>
12#include <linux/sched.h>
13#include <linux/topology.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/acpi.h>
19#include <asm/amd_nb.h>
20
21#include "numa_internal.h"
22
23int __initdata numa_off;
24nodemask_t numa_nodes_parsed __initdata;
25
26struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
27EXPORT_SYMBOL(node_data);
28
29static struct numa_meminfo numa_meminfo
30#ifndef CONFIG_MEMORY_HOTPLUG
31__initdata
32#endif
33;
34
35static int numa_distance_cnt;
36static u8 *numa_distance;
37
38static __init int numa_setup(char *opt)
39{
40 if (!opt)
41 return -EINVAL;
42 if (!strncmp(opt, "off", 3))
43 numa_off = 1;
44#ifdef CONFIG_NUMA_EMU
45 if (!strncmp(opt, "fake=", 5))
46 numa_emu_cmdline(opt + 5);
47#endif
48#ifdef CONFIG_ACPI_NUMA
49 if (!strncmp(opt, "noacpi", 6))
50 acpi_numa = -1;
51#endif
52 return 0;
53}
54early_param("numa", numa_setup);
5 55
6/* 56/*
7 * Which logical CPUs are on which nodes 57 * apicid, cpu, node mappings
8 */ 58 */
59s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
60 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
61};
62
63int __cpuinit numa_cpu_node(int cpu)
64{
65 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
66
67 if (apicid != BAD_APICID)
68 return __apicid_to_node[apicid];
69 return NUMA_NO_NODE;
70}
71
9cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 72cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
10EXPORT_SYMBOL(node_to_cpumask_map); 73EXPORT_SYMBOL(node_to_cpumask_map);
11 74
12/* 75/*
76 * Map cpu index to node index
77 */
78DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
79EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
80
81void __cpuinit numa_set_node(int cpu, int node)
82{
83 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
84
85 /* early setting, no percpu area yet */
86 if (cpu_to_node_map) {
87 cpu_to_node_map[cpu] = node;
88 return;
89 }
90
91#ifdef CONFIG_DEBUG_PER_CPU_MAPS
92 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
93 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
94 dump_stack();
95 return;
96 }
97#endif
98 per_cpu(x86_cpu_to_node_map, cpu) = node;
99
100 if (node != NUMA_NO_NODE)
101 set_cpu_numa_node(cpu, node);
102}
103
104void __cpuinit numa_clear_node(int cpu)
105{
106 numa_set_node(cpu, NUMA_NO_NODE);
107}
108
109/*
13 * Allocate node_to_cpumask_map based on number of available nodes 110 * Allocate node_to_cpumask_map based on number of available nodes
14 * Requires node_possible_map to be valid. 111 * Requires node_possible_map to be valid.
15 * 112 *
@@ -35,7 +132,659 @@ void __init setup_node_to_cpumask_map(void)
35 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); 132 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
36} 133}
37 134
38#ifdef CONFIG_DEBUG_PER_CPU_MAPS 135static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
136 struct numa_meminfo *mi)
137{
138 /* ignore zero length blks */
139 if (start == end)
140 return 0;
141
142 /* whine about and ignore invalid blks */
143 if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
144 pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
145 nid, start, end);
146 return 0;
147 }
148
149 if (mi->nr_blks >= NR_NODE_MEMBLKS) {
150 pr_err("NUMA: too many memblk ranges\n");
151 return -EINVAL;
152 }
153
154 mi->blk[mi->nr_blks].start = start;
155 mi->blk[mi->nr_blks].end = end;
156 mi->blk[mi->nr_blks].nid = nid;
157 mi->nr_blks++;
158 return 0;
159}
160
161/**
162 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
163 * @idx: Index of memblk to remove
164 * @mi: numa_meminfo to remove memblk from
165 *
166 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
167 * decrementing @mi->nr_blks.
168 */
169void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
170{
171 mi->nr_blks--;
172 memmove(&mi->blk[idx], &mi->blk[idx + 1],
173 (mi->nr_blks - idx) * sizeof(mi->blk[0]));
174}
175
176/**
177 * numa_add_memblk - Add one numa_memblk to numa_meminfo
178 * @nid: NUMA node ID of the new memblk
179 * @start: Start address of the new memblk
180 * @end: End address of the new memblk
181 *
182 * Add a new memblk to the default numa_meminfo.
183 *
184 * RETURNS:
185 * 0 on success, -errno on failure.
186 */
187int __init numa_add_memblk(int nid, u64 start, u64 end)
188{
189 return numa_add_memblk_to(nid, start, end, &numa_meminfo);
190}
191
192/* Initialize NODE_DATA for a node on the local memory */
193static void __init setup_node_data(int nid, u64 start, u64 end)
194{
195 const u64 nd_low = PFN_PHYS(MAX_DMA_PFN);
196 const u64 nd_high = PFN_PHYS(max_pfn_mapped);
197 const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
198 bool remapped = false;
199 u64 nd_pa;
200 void *nd;
201 int tnid;
202
203 /*
204 * Don't confuse VM with a node that doesn't have the
205 * minimum amount of memory:
206 */
207 if (end && (end - start) < NODE_MIN_SIZE)
208 return;
209
210 /* initialize remap allocator before aligning to ZONE_ALIGN */
211 init_alloc_remap(nid, start, end);
212
213 start = roundup(start, ZONE_ALIGN);
214
215 printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n",
216 nid, start, end);
217
218 /*
219 * Allocate node data. Try remap allocator first, node-local
220 * memory and then any node. Never allocate in DMA zone.
221 */
222 nd = alloc_remap(nid, nd_size);
223 if (nd) {
224 nd_pa = __pa(nd);
225 remapped = true;
226 } else {
227 nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high,
228 nd_size, SMP_CACHE_BYTES);
229 if (nd_pa == MEMBLOCK_ERROR)
230 nd_pa = memblock_find_in_range(nd_low, nd_high,
231 nd_size, SMP_CACHE_BYTES);
232 if (nd_pa == MEMBLOCK_ERROR) {
233 pr_err("Cannot find %zu bytes in node %d\n",
234 nd_size, nid);
235 return;
236 }
237 memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA");
238 nd = __va(nd_pa);
239 }
240
241 /* report and initialize */
242 printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n",
243 nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
244 tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
245 if (!remapped && tnid != nid)
246 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
247
248 node_data[nid] = nd;
249 memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
250 NODE_DATA(nid)->node_id = nid;
251 NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT;
252 NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT;
253
254 node_set_online(nid);
255}
256
257/**
258 * numa_cleanup_meminfo - Cleanup a numa_meminfo
259 * @mi: numa_meminfo to clean up
260 *
261 * Sanitize @mi by merging and removing unnecessary memblks. Also check for
262 * conflicts and clear unused memblks.
263 *
264 * RETURNS:
265 * 0 on success, -errno on failure.
266 */
267int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
268{
269 const u64 low = 0;
270 const u64 high = PFN_PHYS(max_pfn);
271 int i, j, k;
272
273 /* first, trim all entries */
274 for (i = 0; i < mi->nr_blks; i++) {
275 struct numa_memblk *bi = &mi->blk[i];
276
277 /* make sure all blocks are inside the limits */
278 bi->start = max(bi->start, low);
279 bi->end = min(bi->end, high);
280
281 /* and there's no empty block */
282 if (bi->start >= bi->end)
283 numa_remove_memblk_from(i--, mi);
284 }
285
286 /* merge neighboring / overlapping entries */
287 for (i = 0; i < mi->nr_blks; i++) {
288 struct numa_memblk *bi = &mi->blk[i];
289
290 for (j = i + 1; j < mi->nr_blks; j++) {
291 struct numa_memblk *bj = &mi->blk[j];
292 u64 start, end;
293
294 /*
295 * See whether there are overlapping blocks. Whine
296 * about but allow overlaps of the same nid. They
297 * will be merged below.
298 */
299 if (bi->end > bj->start && bi->start < bj->end) {
300 if (bi->nid != bj->nid) {
301 pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
302 bi->nid, bi->start, bi->end,
303 bj->nid, bj->start, bj->end);
304 return -EINVAL;
305 }
306 pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
307 bi->nid, bi->start, bi->end,
308 bj->start, bj->end);
309 }
310
311 /*
312 * Join together blocks on the same node, holes
313 * between which don't overlap with memory on other
314 * nodes.
315 */
316 if (bi->nid != bj->nid)
317 continue;
318 start = min(bi->start, bj->start);
319 end = max(bi->end, bj->end);
320 for (k = 0; k < mi->nr_blks; k++) {
321 struct numa_memblk *bk = &mi->blk[k];
322
323 if (bi->nid == bk->nid)
324 continue;
325 if (start < bk->end && end > bk->start)
326 break;
327 }
328 if (k < mi->nr_blks)
329 continue;
330 printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n",
331 bi->nid, bi->start, bi->end, bj->start, bj->end,
332 start, end);
333 bi->start = start;
334 bi->end = end;
335 numa_remove_memblk_from(j--, mi);
336 }
337 }
338
339 /* clear unused ones */
340 for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
341 mi->blk[i].start = mi->blk[i].end = 0;
342 mi->blk[i].nid = NUMA_NO_NODE;
343 }
344
345 return 0;
346}
347
348/*
349 * Set nodes, which have memory in @mi, in *@nodemask.
350 */
351static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
352 const struct numa_meminfo *mi)
353{
354 int i;
355
356 for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
357 if (mi->blk[i].start != mi->blk[i].end &&
358 mi->blk[i].nid != NUMA_NO_NODE)
359 node_set(mi->blk[i].nid, *nodemask);
360}
361
362/**
363 * numa_reset_distance - Reset NUMA distance table
364 *
365 * The current table is freed. The next numa_set_distance() call will
366 * create a new one.
367 */
368void __init numa_reset_distance(void)
369{
370 size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
371
372 /* numa_distance could be 1LU marking allocation failure, test cnt */
373 if (numa_distance_cnt)
374 memblock_x86_free_range(__pa(numa_distance),
375 __pa(numa_distance) + size);
376 numa_distance_cnt = 0;
377 numa_distance = NULL; /* enable table creation */
378}
379
380static int __init numa_alloc_distance(void)
381{
382 nodemask_t nodes_parsed;
383 size_t size;
384 int i, j, cnt = 0;
385 u64 phys;
386
387 /* size the new table and allocate it */
388 nodes_parsed = numa_nodes_parsed;
389 numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
390
391 for_each_node_mask(i, nodes_parsed)
392 cnt = i;
393 cnt++;
394 size = cnt * cnt * sizeof(numa_distance[0]);
395
396 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
397 size, PAGE_SIZE);
398 if (phys == MEMBLOCK_ERROR) {
399 pr_warning("NUMA: Warning: can't allocate distance table!\n");
400 /* don't retry until explicitly reset */
401 numa_distance = (void *)1LU;
402 return -ENOMEM;
403 }
404 memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
405
406 numa_distance = __va(phys);
407 numa_distance_cnt = cnt;
408
409 /* fill with the default distances */
410 for (i = 0; i < cnt; i++)
411 for (j = 0; j < cnt; j++)
412 numa_distance[i * cnt + j] = i == j ?
413 LOCAL_DISTANCE : REMOTE_DISTANCE;
414 printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
415
416 return 0;
417}
418
419/**
420 * numa_set_distance - Set NUMA distance from one NUMA to another
421 * @from: the 'from' node to set distance
422 * @to: the 'to' node to set distance
423 * @distance: NUMA distance
424 *
425 * Set the distance from node @from to @to to @distance. If distance table
426 * doesn't exist, one which is large enough to accommodate all the currently
427 * known nodes will be created.
428 *
429 * If such table cannot be allocated, a warning is printed and further
430 * calls are ignored until the distance table is reset with
431 * numa_reset_distance().
432 *
433 * If @from or @to is higher than the highest known node at the time of
434 * table creation or @distance doesn't make sense, the call is ignored.
435 * This is to allow simplification of specific NUMA config implementations.
436 */
437void __init numa_set_distance(int from, int to, int distance)
438{
439 if (!numa_distance && numa_alloc_distance() < 0)
440 return;
441
442 if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
443 printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
444 from, to, distance);
445 return;
446 }
447
448 if ((u8)distance != distance ||
449 (from == to && distance != LOCAL_DISTANCE)) {
450 pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
451 from, to, distance);
452 return;
453 }
454
455 numa_distance[from * numa_distance_cnt + to] = distance;
456}
457
458int __node_distance(int from, int to)
459{
460 if (from >= numa_distance_cnt || to >= numa_distance_cnt)
461 return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
462 return numa_distance[from * numa_distance_cnt + to];
463}
464EXPORT_SYMBOL(__node_distance);
465
466/*
467 * Sanity check to catch more bad NUMA configurations (they are amazingly
468 * common). Make sure the nodes cover all memory.
469 */
470static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
471{
472 u64 numaram, e820ram;
473 int i;
474
475 numaram = 0;
476 for (i = 0; i < mi->nr_blks; i++) {
477 u64 s = mi->blk[i].start >> PAGE_SHIFT;
478 u64 e = mi->blk[i].end >> PAGE_SHIFT;
479 numaram += e - s;
480 numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
481 if ((s64)numaram < 0)
482 numaram = 0;
483 }
484
485 e820ram = max_pfn - (memblock_x86_hole_size(0,
486 PFN_PHYS(max_pfn)) >> PAGE_SHIFT);
487 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
488 if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
489 printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
490 (numaram << PAGE_SHIFT) >> 20,
491 (e820ram << PAGE_SHIFT) >> 20);
492 return false;
493 }
494 return true;
495}
496
497static int __init numa_register_memblks(struct numa_meminfo *mi)
498{
499 int i, nid;
500
501 /* Account for nodes with cpus and no memory */
502 node_possible_map = numa_nodes_parsed;
503 numa_nodemask_from_meminfo(&node_possible_map, mi);
504 if (WARN_ON(nodes_empty(node_possible_map)))
505 return -EINVAL;
506
507 for (i = 0; i < mi->nr_blks; i++)
508 memblock_x86_register_active_regions(mi->blk[i].nid,
509 mi->blk[i].start >> PAGE_SHIFT,
510 mi->blk[i].end >> PAGE_SHIFT);
511
512 /* for out of order entries */
513 sort_node_map();
514 if (!numa_meminfo_cover_memory(mi))
515 return -EINVAL;
516
517 /* Finally register nodes. */
518 for_each_node_mask(nid, node_possible_map) {
519 u64 start = PFN_PHYS(max_pfn);
520 u64 end = 0;
521
522 for (i = 0; i < mi->nr_blks; i++) {
523 if (nid != mi->blk[i].nid)
524 continue;
525 start = min(mi->blk[i].start, start);
526 end = max(mi->blk[i].end, end);
527 }
528
529 if (start < end)
530 setup_node_data(nid, start, end);
531 }
532
533 return 0;
534}
535
536/*
537 * There are unfortunately some poorly designed mainboards around that
538 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
539 * mapping. To avoid this fill in the mapping for all possible CPUs,
540 * as the number of CPUs is not known yet. We round robin the existing
541 * nodes.
542 */
543static void __init numa_init_array(void)
544{
545 int rr, i;
546
547 rr = first_node(node_online_map);
548 for (i = 0; i < nr_cpu_ids; i++) {
549 if (early_cpu_to_node(i) != NUMA_NO_NODE)
550 continue;
551 numa_set_node(i, rr);
552 rr = next_node(rr, node_online_map);
553 if (rr == MAX_NUMNODES)
554 rr = first_node(node_online_map);
555 }
556}
557
558static int __init numa_init(int (*init_func)(void))
559{
560 int i;
561 int ret;
562
563 for (i = 0; i < MAX_LOCAL_APIC; i++)
564 set_apicid_to_node(i, NUMA_NO_NODE);
565
566 nodes_clear(numa_nodes_parsed);
567 nodes_clear(node_possible_map);
568 nodes_clear(node_online_map);
569 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
570 remove_all_active_ranges();
571 numa_reset_distance();
572
573 ret = init_func();
574 if (ret < 0)
575 return ret;
576 ret = numa_cleanup_meminfo(&numa_meminfo);
577 if (ret < 0)
578 return ret;
579
580 numa_emulation(&numa_meminfo, numa_distance_cnt);
581
582 ret = numa_register_memblks(&numa_meminfo);
583 if (ret < 0)
584 return ret;
585
586 for (i = 0; i < nr_cpu_ids; i++) {
587 int nid = early_cpu_to_node(i);
588
589 if (nid == NUMA_NO_NODE)
590 continue;
591 if (!node_online(nid))
592 numa_clear_node(i);
593 }
594 numa_init_array();
595 return 0;
596}
597
598/**
599 * dummy_numa_init - Fallback dummy NUMA init
600 *
601 * Used if there's no underlying NUMA architecture, NUMA initialization
602 * fails, or NUMA is disabled on the command line.
603 *
604 * Must online at least one node and add memory blocks that cover all
605 * allowed memory. This function must not fail.
606 */
607static int __init dummy_numa_init(void)
608{
609 printk(KERN_INFO "%s\n",
610 numa_off ? "NUMA turned off" : "No NUMA configuration found");
611 printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n",
612 0LLU, PFN_PHYS(max_pfn));
613
614 node_set(0, numa_nodes_parsed);
615 numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
616
617 return 0;
618}
619
620/**
621 * x86_numa_init - Initialize NUMA
622 *
623 * Try each configured NUMA initialization method until one succeeds. The
624 * last fallback is a dummy single-node config encompassing all memory and
625 * never fails.
626 */
627void __init x86_numa_init(void)
628{
629 if (!numa_off) {
630#ifdef CONFIG_X86_NUMAQ
631 if (!numa_init(numaq_numa_init))
632 return;
633#endif
634#ifdef CONFIG_ACPI_NUMA
635 if (!numa_init(x86_acpi_numa_init))
636 return;
637#endif
638#ifdef CONFIG_AMD_NUMA
639 if (!numa_init(amd_numa_init))
640 return;
641#endif
642 }
643
644 numa_init(dummy_numa_init);
645}
646
647static __init int find_near_online_node(int node)
648{
649 int n, val;
650 int min_val = INT_MAX;
651 int best_node = -1;
652
653 for_each_online_node(n) {
654 val = node_distance(node, n);
655
656 if (val < min_val) {
657 min_val = val;
658 best_node = n;
659 }
660 }
661
662 return best_node;
663}
664
665/*
666 * Setup early cpu_to_node.
667 *
668 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
669 * and apicid_to_node[] tables have valid entries for a CPU.
670 * This means we skip cpu_to_node[] initialisation for NUMA
671 * emulation and faking node case (when running a kernel compiled
672 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
673 * is already initialized in a round robin manner at numa_init_array,
674 * prior to this call, and this initialization is good enough
675 * for the fake NUMA cases.
676 *
677 * Called before the per_cpu areas are setup.
678 */
679void __init init_cpu_to_node(void)
680{
681 int cpu;
682 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
683
684 BUG_ON(cpu_to_apicid == NULL);
685
686 for_each_possible_cpu(cpu) {
687 int node = numa_cpu_node(cpu);
688
689 if (node == NUMA_NO_NODE)
690 continue;
691 if (!node_online(node))
692 node = find_near_online_node(node);
693 numa_set_node(cpu, node);
694 }
695}
696
697#ifndef CONFIG_DEBUG_PER_CPU_MAPS
698
699# ifndef CONFIG_NUMA_EMU
700void __cpuinit numa_add_cpu(int cpu)
701{
702 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
703}
704
705void __cpuinit numa_remove_cpu(int cpu)
706{
707 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
708}
709# endif /* !CONFIG_NUMA_EMU */
710
711#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
712
713int __cpu_to_node(int cpu)
714{
715 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
716 printk(KERN_WARNING
717 "cpu_to_node(%d): usage too early!\n", cpu);
718 dump_stack();
719 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
720 }
721 return per_cpu(x86_cpu_to_node_map, cpu);
722}
723EXPORT_SYMBOL(__cpu_to_node);
724
725/*
726 * Same function as cpu_to_node() but used if called before the
727 * per_cpu areas are setup.
728 */
729int early_cpu_to_node(int cpu)
730{
731 if (early_per_cpu_ptr(x86_cpu_to_node_map))
732 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
733
734 if (!cpu_possible(cpu)) {
735 printk(KERN_WARNING
736 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
737 dump_stack();
738 return NUMA_NO_NODE;
739 }
740 return per_cpu(x86_cpu_to_node_map, cpu);
741}
742
743void debug_cpumask_set_cpu(int cpu, int node, bool enable)
744{
745 struct cpumask *mask;
746 char buf[64];
747
748 if (node == NUMA_NO_NODE) {
749 /* early_cpu_to_node() already emits a warning and trace */
750 return;
751 }
752 mask = node_to_cpumask_map[node];
753 if (!mask) {
754 pr_err("node_to_cpumask_map[%i] NULL\n", node);
755 dump_stack();
756 return;
757 }
758
759 if (enable)
760 cpumask_set_cpu(cpu, mask);
761 else
762 cpumask_clear_cpu(cpu, mask);
763
764 cpulist_scnprintf(buf, sizeof(buf), mask);
765 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
766 enable ? "numa_add_cpu" : "numa_remove_cpu",
767 cpu, node, buf);
768 return;
769}
770
771# ifndef CONFIG_NUMA_EMU
772static void __cpuinit numa_set_cpumask(int cpu, bool enable)
773{
774 debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
775}
776
777void __cpuinit numa_add_cpu(int cpu)
778{
779 numa_set_cpumask(cpu, true);
780}
781
782void __cpuinit numa_remove_cpu(int cpu)
783{
784 numa_set_cpumask(cpu, false);
785}
786# endif /* !CONFIG_NUMA_EMU */
787
39/* 788/*
40 * Returns a pointer to the bitmask of CPUs on Node 'node'. 789 * Returns a pointer to the bitmask of CPUs on Node 'node'.
41 */ 790 */
@@ -58,4 +807,20 @@ const struct cpumask *cpumask_of_node(int node)
58 return node_to_cpumask_map[node]; 807 return node_to_cpumask_map[node];
59} 808}
60EXPORT_SYMBOL(cpumask_of_node); 809EXPORT_SYMBOL(cpumask_of_node);
810
811#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
812
813#ifdef CONFIG_MEMORY_HOTPLUG
814int memory_add_physaddr_to_nid(u64 start)
815{
816 struct numa_meminfo *mi = &numa_meminfo;
817 int nid = mi->blk[0].nid;
818 int i;
819
820 for (i = 0; i < mi->nr_blks; i++)
821 if (mi->blk[i].start <= start && mi->blk[i].end > start)
822 nid = mi->blk[i].nid;
823 return nid;
824}
825EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
61#endif 826#endif
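numa_init() above only needs an init_func that records the parsed nodes, their memory blocks and, optionally, the inter-node distances; cleanup, registration and the cpu-to-node fixups then happen in the common code. A hedged sketch of such a method, using only interfaces defined in this file (the two-node layout and the distance value are invented):

/* Hypothetical init_func for numa_init(); not a real backend. */
static int __init example_numa_init(void)
{
	node_set(0, numa_nodes_parsed);
	node_set(1, numa_nodes_parsed);

	numa_add_memblk(0, 0,          2ULL << 30);	/* node 0: 0 - 2G  */
	numa_add_memblk(1, 2ULL << 30, 4ULL << 30);	/* node 1: 2G - 4G */

	numa_set_distance(0, 1, 21);	/* __node_distance(0, 1) returns 21 */
	numa_set_distance(1, 0, 21);

	return 0;
}

It would be tried from x86_numa_init() via numa_init(example_numa_init), falling through to dummy_numa_init() if it returned an error.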
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 809baaaf48b1..849a975d3fa0 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -22,38 +22,11 @@
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */ 23 */
24 24
25#include <linux/mm.h>
26#include <linux/bootmem.h> 25#include <linux/bootmem.h>
27#include <linux/mmzone.h> 26#include <linux/memblock.h>
28#include <linux/highmem.h>
29#include <linux/initrd.h>
30#include <linux/nodemask.h>
31#include <linux/module.h> 27#include <linux/module.h>
32#include <linux/kexec.h>
33#include <linux/pfn.h>
34#include <linux/swap.h>
35#include <linux/acpi.h>
36
37#include <asm/e820.h>
38#include <asm/setup.h>
39#include <asm/mmzone.h>
40#include <asm/bios_ebda.h>
41#include <asm/proto.h>
42
43struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
44EXPORT_SYMBOL(node_data);
45
46/*
47 * numa interface - we expect the numa architecture specific code to have
48 * populated the following initialisation.
49 *
50 * 1) node_online_map - the map of all nodes configured (online) in the system
51 * 2) node_start_pfn - the starting page frame number for a node
52 * 3) node_end_pfn - the ending page fram number for a node
53 */
54unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
55unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
56 28
29#include "numa_internal.h"
57 30
58#ifdef CONFIG_DISCONTIGMEM 31#ifdef CONFIG_DISCONTIGMEM
59/* 32/*
@@ -98,102 +71,46 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
98} 71}
99#endif 72#endif
100 73
101extern unsigned long find_max_low_pfn(void);
102extern unsigned long highend_pfn, highstart_pfn; 74extern unsigned long highend_pfn, highstart_pfn;
103 75
104#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) 76#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
105 77
106unsigned long node_remap_size[MAX_NUMNODES];
107static void *node_remap_start_vaddr[MAX_NUMNODES]; 78static void *node_remap_start_vaddr[MAX_NUMNODES];
108void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 79void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
109 80
110static unsigned long kva_start_pfn;
111static unsigned long kva_pages;
112/*
113 * FLAT - support for basic PC memory model with discontig enabled, essentially
114 * a single node with all available processors in it with a flat
115 * memory map.
116 */
117int __init get_memcfg_numa_flat(void)
118{
119 printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
120
121 node_start_pfn[0] = 0;
122 node_end_pfn[0] = max_pfn;
123 e820_register_active_regions(0, 0, max_pfn);
124 memory_present(0, 0, max_pfn);
125 node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
126
127 /* Indicate there is one node available. */
128 nodes_clear(node_online_map);
129 node_set_online(0);
130 return 1;
131}
132
133/*
134 * Find the highest page frame number we have available for the node
135 */
136static void __init propagate_e820_map_node(int nid)
137{
138 if (node_end_pfn[nid] > max_pfn)
139 node_end_pfn[nid] = max_pfn;
140 /*
141 * if a user has given mem=XXXX, then we need to make sure
142 * that the node _starts_ before that, too, not just ends
143 */
144 if (node_start_pfn[nid] > max_pfn)
145 node_start_pfn[nid] = max_pfn;
146 BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
147}
148
149/*
150 * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
151 * method. For node zero take this from the bottom of memory, for
152 * subsequent nodes place them at node_remap_start_vaddr which contains
153 * node local data in physically node local memory. See setup_memory()
154 * for details.
155 */
156static void __init allocate_pgdat(int nid)
157{
158 char buf[16];
159
160 if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
161 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
162 else {
163 unsigned long pgdat_phys;
164 pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
165 max_pfn_mapped<<PAGE_SHIFT,
166 sizeof(pg_data_t),
167 PAGE_SIZE);
168 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
169 memset(buf, 0, sizeof(buf));
170 sprintf(buf, "NODE_DATA %d", nid);
171 reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
172 }
173 printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
174 nid, (unsigned long)NODE_DATA(nid));
175}
176
177/* 81/*
178 * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel 82 * Remap memory allocator
179 * virtual address space (KVA) is reserved and portions of nodes are mapped
180 * using it. This is to allow node-local memory to be allocated for
181 * structures that would normally require ZONE_NORMAL. The memory is
182 * allocated with alloc_remap() and callers should be prepared to allocate
183 * from the bootmem allocator instead.
184 */ 83 */
185static unsigned long node_remap_start_pfn[MAX_NUMNODES]; 84static unsigned long node_remap_start_pfn[MAX_NUMNODES];
186static void *node_remap_end_vaddr[MAX_NUMNODES]; 85static void *node_remap_end_vaddr[MAX_NUMNODES];
187static void *node_remap_alloc_vaddr[MAX_NUMNODES]; 86static void *node_remap_alloc_vaddr[MAX_NUMNODES];
188static unsigned long node_remap_offset[MAX_NUMNODES];
189 87
88/**
89 * alloc_remap - Allocate remapped memory
90 * @nid: NUMA node to allocate memory from
91 * @size: The size of allocation
92 *
93 * Allocate @size bytes from the remap area of NUMA node @nid. The
94 * size of the remap area is predetermined by init_alloc_remap() and
95 * only the callers considered there should call this function. For
96 * more info, please read the comment on top of init_alloc_remap().
97 *
98 * The caller must be ready to handle allocation failure from this
99 * function and fall back to regular memory allocator in such cases.
100 *
101 * CONTEXT:
102 * Single CPU early boot context.
103 *
104 * RETURNS:
105 * Pointer to the allocated memory on success, %NULL on failure.
106 */
190void *alloc_remap(int nid, unsigned long size) 107void *alloc_remap(int nid, unsigned long size)
191{ 108{
192 void *allocation = node_remap_alloc_vaddr[nid]; 109 void *allocation = node_remap_alloc_vaddr[nid];
193 110
194 size = ALIGN(size, L1_CACHE_BYTES); 111 size = ALIGN(size, L1_CACHE_BYTES);
195 112
196 if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) 113 if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
197 return NULL; 114 return NULL;
198 115
199 node_remap_alloc_vaddr[nid] += size; 116 node_remap_alloc_vaddr[nid] += size;
@@ -202,26 +119,6 @@ void *alloc_remap(int nid, unsigned long size)
202 return allocation; 119 return allocation;
203} 120}
204 121
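alloc_remap() above is a bump allocator over a fixed per-node window, and its comment requires callers to tolerate a NULL return and fall back to the regular early allocator. Below is a minimal standalone model of both halves of that contract; the window size, the alignment constant and the malloc() fallback are assumptions chosen for illustration (the kernel aligns to L1_CACHE_BYTES and falls back to the bootmem allocator), not kernel code.

#include <stdio.h>
#include <stdlib.h>

#define WINDOW_SIZE	(64 * 1024)	/* stand-in for a node's remap area */
#define CACHE_ALIGN	64		/* stand-in for L1_CACHE_BYTES      */

static char window[WINDOW_SIZE];
static char *cursor = window;

/* mirrors alloc_remap(): align the size, bounds-check, bump the cursor */
static void *remap_alloc(size_t size)
{
	void *p = cursor;

	size = (size + CACHE_ALIGN - 1) & ~(size_t)(CACHE_ALIGN - 1);
	if (cursor + size > window + WINDOW_SIZE)
		return NULL;
	cursor += size;
	return p;
}

int main(void)
{
	void *a = remap_alloc(40 * 1024);	/* fits in the window            */
	void *b = remap_alloc(40 * 1024);	/* exceeds it: returns NULL      */
	int fell_back = 0;

	if (!b) {				/* the fallback the comment asks for */
		b = malloc(40 * 1024);
		fell_back = 1;
	}
	printf("a=%p from remap, b=%p from %s\n", a, b,
	       fell_back ? "fallback" : "remap");
	if (fell_back)
		free(b);
	return 0;
}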
205static void __init remap_numa_kva(void)
206{
207 void *vaddr;
208 unsigned long pfn;
209 int node;
210
211 for_each_online_node(node) {
212 printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
213 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
214 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
215 printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
216 (unsigned long)vaddr,
217 node_remap_start_pfn[node] + pfn);
218 set_pmd_pfn((ulong) vaddr,
219 node_remap_start_pfn[node] + pfn,
220 PAGE_KERNEL_LARGE);
221 }
222 }
223}
224
225#ifdef CONFIG_HIBERNATION 122#ifdef CONFIG_HIBERNATION
226/** 123/**
227 * resume_map_numa_kva - add KVA mapping to the temporary page tables created 124 * resume_map_numa_kva - add KVA mapping to the temporary page tables created
@@ -233,15 +130,16 @@ void resume_map_numa_kva(pgd_t *pgd_base)
233 int node; 130 int node;
234 131
235 for_each_online_node(node) { 132 for_each_online_node(node) {
236 unsigned long start_va, start_pfn, size, pfn; 133 unsigned long start_va, start_pfn, nr_pages, pfn;
237 134
238 start_va = (unsigned long)node_remap_start_vaddr[node]; 135 start_va = (unsigned long)node_remap_start_vaddr[node];
239 start_pfn = node_remap_start_pfn[node]; 136 start_pfn = node_remap_start_pfn[node];
240 size = node_remap_size[node]; 137 nr_pages = (node_remap_end_vaddr[node] -
138 node_remap_start_vaddr[node]) >> PAGE_SHIFT;
241 139
242 printk(KERN_DEBUG "%s: node %d\n", __func__, node); 140 printk(KERN_DEBUG "%s: node %d\n", __func__, node);
243 141
244 for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) { 142 for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
245 unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); 143 unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
246 pgd_t *pgd = pgd_base + pgd_index(vaddr); 144 pgd_t *pgd = pgd_base + pgd_index(vaddr);
247 pud_t *pud = pud_offset(pgd, vaddr); 145 pud_t *pud = pud_offset(pgd, vaddr);
@@ -257,134 +155,89 @@ void resume_map_numa_kva(pgd_t *pgd_base)
257} 155}
258#endif 156#endif
259 157
260static __init unsigned long calculate_numa_remap_pages(void) 158/**
261{ 159 * init_alloc_remap - Initialize remap allocator for a NUMA node
262 int nid; 160 * @nid: NUMA node to initialize remap allocator for
263 unsigned long size, reserve_pages = 0; 161 *
264 162 * NUMA nodes may end up without any lowmem. As allocating pgdat and
265 for_each_online_node(nid) { 163 * memmap on a different node with lowmem is inefficient, a special
266 u64 node_kva_target; 164 * remap allocator is implemented which can be used by alloc_remap().
267 u64 node_kva_final; 165 *
268 166 * For each node, the amount of memory which will be necessary for
269 /* 167 * pgdat and memmap is calculated and two memory areas of the size are
270 * The acpi/srat node info can show hot-add memory zones 168 * allocated - one in the node and the other in lowmem; then, the area
271 * where memory could be added but not currently present. 169 * in the node is remapped to the lowmem area.
272 */ 170 *
273 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", 171 * As pgdat and memmap must be allocated in lowmem anyway, this
274 nid, node_start_pfn[nid], node_end_pfn[nid]); 172 * doesn't waste lowmem address space; however, the actual lowmem
275 if (node_start_pfn[nid] > max_pfn) 173 * which gets remapped over is wasted. The amount shouldn't be
276 continue; 174 * problematic on machines where this feature will be used.
277 if (!node_end_pfn[nid]) 175 *
278 continue; 176 * Initialization failure isn't fatal. alloc_remap() is used
279 if (node_end_pfn[nid] > max_pfn) 177 * opportunistically and the callers will fall back to other memory
280 node_end_pfn[nid] = max_pfn; 178 * allocation mechanisms on failure.
281 179 */
282 /* ensure the remap includes space for the pgdat. */ 180void __init init_alloc_remap(int nid, u64 start, u64 end)
283 size = node_remap_size[nid] + sizeof(pg_data_t);
284
285 /* convert size to large (pmd size) pages, rounding up */
286 size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
287 /* now the roundup is correct, convert to PAGE_SIZE pages */
288 size = size * PTRS_PER_PTE;
289
290 node_kva_target = round_down(node_end_pfn[nid] - size,
291 PTRS_PER_PTE);
292 node_kva_target <<= PAGE_SHIFT;
293 do {
294 node_kva_final = find_e820_area(node_kva_target,
295 ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
296 ((u64)size)<<PAGE_SHIFT,
297 LARGE_PAGE_BYTES);
298 node_kva_target -= LARGE_PAGE_BYTES;
299 } while (node_kva_final == -1ULL &&
300 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
301
302 if (node_kva_final == -1ULL)
303 panic("Can not get kva ram\n");
304
305 node_remap_size[nid] = size;
306 node_remap_offset[nid] = reserve_pages;
307 reserve_pages += size;
308 printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
309 " node %d at %llx\n",
310 size, nid, node_kva_final>>PAGE_SHIFT);
311
312 /*
313 * prevent kva address below max_low_pfn want it on system
314 * with less memory later.
315 * layout will be: KVA address , KVA RAM
316 *
317 * we are supposed to only record the one less than max_low_pfn
318 * but we could have some hole in high memory, and it will only
319 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
320 * to use it as free.
321 * So reserve_early here, hope we don't run out of that array
322 */
323 reserve_early(node_kva_final,
324 node_kva_final+(((u64)size)<<PAGE_SHIFT),
325 "KVA RAM");
326
327 node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
328 remove_active_range(nid, node_remap_start_pfn[nid],
329 node_remap_start_pfn[nid] + size);
330 }
331 printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
332 reserve_pages);
333 return reserve_pages;
334}
335
336static void init_remap_allocator(int nid)
337{
338 node_remap_start_vaddr[nid] = pfn_to_kaddr(
339 kva_start_pfn + node_remap_offset[nid]);
340 node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
341 (node_remap_size[nid] * PAGE_SIZE);
342 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
343 ALIGN(sizeof(pg_data_t), PAGE_SIZE);
344
345 printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
346 (ulong) node_remap_start_vaddr[nid],
347 (ulong) node_remap_end_vaddr[nid]);
348}
349
350void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
351 int acpi, int k8)
352{ 181{
353 int nid; 182 unsigned long start_pfn = start >> PAGE_SHIFT;
354 long kva_target_pfn; 183 unsigned long end_pfn = end >> PAGE_SHIFT;
184 unsigned long size, pfn;
185 u64 node_pa, remap_pa;
186 void *remap_va;
355 187
356 /* 188 /*
357 * When mapping a NUMA machine we allocate the node_mem_map arrays 189 * The acpi/srat node info can show hot-add memory zones where
358 * from node local memory. They are then mapped directly into KVA 190 * memory could be added but not currently present.
359 * between zone normal and vmalloc space. Calculate the size of
360 * this space and use it to adjust the boundary between ZONE_NORMAL
361 * and ZONE_HIGHMEM.
362 */ 191 */
192 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
193 nid, start_pfn, end_pfn);
194
195 /* calculate the necessary space aligned to large page size */
196 size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
197 size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
198 size = ALIGN(size, LARGE_PAGE_BYTES);
199
200 /* allocate node memory and the lowmem remap area */
201 node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
202 if (node_pa == MEMBLOCK_ERROR) {
203 pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
204 size, nid);
205 return;
206 }
207 memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM");
208
209 remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
210 max_low_pfn << PAGE_SHIFT,
211 size, LARGE_PAGE_BYTES);
212 if (remap_pa == MEMBLOCK_ERROR) {
213 pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
214 size, nid);
215 memblock_x86_free_range(node_pa, node_pa + size);
216 return;
217 }
218 memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG");
219 remap_va = phys_to_virt(remap_pa);
220
221 /* perform actual remap */
222 for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
223 set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
224 (node_pa >> PAGE_SHIFT) + pfn,
225 PAGE_KERNEL_LARGE);
226
227 /* initialize remap allocator parameters */
228 node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
229 node_remap_start_vaddr[nid] = remap_va;
230 node_remap_end_vaddr[nid] = remap_va + size;
231 node_remap_alloc_vaddr[nid] = remap_va;
232
233 printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
234 nid, node_pa, node_pa + size, remap_va, remap_va + size);
235}
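To make the sizing in init_alloc_remap() above concrete, here is a small standalone model of the calculation. All numbers are assumptions for illustration: a node spanning 512 MiB, sizeof(struct page) taken as 32 bytes, sizeof(pg_data_t) rounded to one page, and non-PAE x86_32 where LARGE_PAGE_BYTES = PTRS_PER_PTE * PAGE_SIZE = 4 MiB.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096ULL
#define PTRS_PER_PTE	1024ULL				/* non-PAE x86_32 assumption */
#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)

static uint64_t align_up(uint64_t x, uint64_t a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	uint64_t node_bytes   = 512ULL << 20;		/* hypothetical node span        */
	uint64_t struct_page  = 32;			/* assumed sizeof(struct page)   */
	uint64_t pg_data_size = 4096;			/* assumed pg_data_t, one page   */

	/* memmap for the node's pfns ... */
	uint64_t size = (node_bytes / PAGE_SIZE) * struct_page;
	/* ... plus the page-aligned pgdat ... */
	size += align_up(pg_data_size, PAGE_SIZE);
	/* ... rounded up to whole large pages so set_pmd_pfn() can map it */
	size = align_up(size, LARGE_PAGE_BYTES);

	printf("remap area per node: %llu bytes (%llu large pages)\n",
	       (unsigned long long)size,
	       (unsigned long long)(size / LARGE_PAGE_BYTES));
	return 0;
}

With these assumptions the memmap needs exactly 4 MiB, the pgdat pushes it just past that, and rounding to whole large pages reserves 8 MiB (two pmd mappings) per node.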
363 236
364 get_memcfg_numa(); 237void __init initmem_init(void)
365 238{
366 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); 239 x86_numa_init();
367
368 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
369 do {
370 kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
371 max_low_pfn<<PAGE_SHIFT,
372 kva_pages<<PAGE_SHIFT,
373 PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
374 kva_target_pfn -= PTRS_PER_PTE;
375 } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
376
377 if (kva_start_pfn == -1UL)
378 panic("Can not get kva space\n");
379
380 printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
381 kva_start_pfn, max_low_pfn);
382 printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
383 240
384 /* avoid clash with initrd */
385 reserve_early(kva_start_pfn<<PAGE_SHIFT,
386 (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
387 "KVA PG");
388#ifdef CONFIG_HIGHMEM 241#ifdef CONFIG_HIGHMEM
389 highstart_pfn = highend_pfn = max_pfn; 242 highstart_pfn = highend_pfn = max_pfn;
390 if (max_pfn > max_low_pfn) 243 if (max_pfn > max_low_pfn)
@@ -404,54 +257,9 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
404 257
405 printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", 258 printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
406 (ulong) pfn_to_kaddr(max_low_pfn)); 259 (ulong) pfn_to_kaddr(max_low_pfn));
407 for_each_online_node(nid) {
408 init_remap_allocator(nid);
409
410 allocate_pgdat(nid);
411 }
412 remap_numa_kva();
413 260
414 printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", 261 printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
415 (ulong) pfn_to_kaddr(highstart_pfn)); 262 (ulong) pfn_to_kaddr(highstart_pfn));
416 for_each_online_node(nid)
417 propagate_e820_map_node(nid);
418
419 for_each_online_node(nid) {
420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
421 NODE_DATA(nid)->node_id = nid;
422#ifndef CONFIG_NO_BOOTMEM
423 NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
424#endif
425 }
426 263
427 setup_bootmem_allocator(); 264 setup_bootmem_allocator();
428} 265}
429
430#ifdef CONFIG_MEMORY_HOTPLUG
431static int paddr_to_nid(u64 addr)
432{
433 int nid;
434 unsigned long pfn = PFN_DOWN(addr);
435
436 for_each_node(nid)
437 if (node_start_pfn[nid] <= pfn &&
438 pfn < node_end_pfn[nid])
439 return nid;
440
441 return -1;
442}
443
444/*
445 * This function is used to ask node id BEFORE memmap and mem_section's
446 * initialization (pfn_to_nid() can't be used yet).
447 * If _PXM is not defined on ACPI's DSDT, node id must be found by this.
448 */
449int memory_add_physaddr_to_nid(u64 addr)
450{
451 int nid = paddr_to_nid(addr);
452 return (nid >= 0) ? nid : 0;
453}
454
455EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
456#endif
457
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a7bcc23ef96c..dd27f401f0a0 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -2,697 +2,13 @@
2 * Generic VM initialization for x86-64 NUMA setups. 2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */ 4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h> 5#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14#include <linux/sched.h>
15 6
16#include <asm/e820.h> 7#include "numa_internal.h"
17#include <asm/proto.h>
18#include <asm/dma.h>
19#include <asm/numa.h>
20#include <asm/acpi.h>
21#include <asm/k8.h>
22 8
23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 9void __init initmem_init(void)
24EXPORT_SYMBOL(node_data);
25
26struct memnode memnode;
27
28s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
29 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
30};
31
32int numa_off __initdata;
33static unsigned long __initdata nodemap_addr;
34static unsigned long __initdata nodemap_size;
35
36/*
37 * Map cpu index to node index
38 */
39DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
40EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
41
42/*
43 * Given a shift value, try to populate memnodemap[]
44 * Returns :
45 * 1 if OK
46 * 0 if memnodmap[] too small (of shift too small)
47 * -1 if node overlap or lost ram (shift too big)
48 */
49static int __init populate_memnodemap(const struct bootnode *nodes,
50 int numnodes, int shift, int *nodeids)
51{
52 unsigned long addr, end;
53 int i, res = -1;
54
55 memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
56 for (i = 0; i < numnodes; i++) {
57 addr = nodes[i].start;
58 end = nodes[i].end;
59 if (addr >= end)
60 continue;
61 if ((end >> shift) >= memnodemapsize)
62 return 0;
63 do {
64 if (memnodemap[addr >> shift] != NUMA_NO_NODE)
65 return -1;
66
67 if (!nodeids)
68 memnodemap[addr >> shift] = i;
69 else
70 memnodemap[addr >> shift] = nodeids[i];
71
72 addr += (1UL << shift);
73 } while (addr < end);
74 res = 1;
75 }
76 return res;
77}
78
79static int __init allocate_cachealigned_memnodemap(void)
80{
81 unsigned long addr;
82
83 memnodemap = memnode.embedded_map;
84 if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
85 return 0;
86
87 addr = 0x8000;
88 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
89 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
90 nodemap_size, L1_CACHE_BYTES);
91 if (nodemap_addr == -1UL) {
92 printk(KERN_ERR
93 "NUMA: Unable to allocate Memory to Node hash map\n");
94 nodemap_addr = nodemap_size = 0;
95 return -1;
96 }
97 memnodemap = phys_to_virt(nodemap_addr);
98 reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
99
100 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
101 nodemap_addr, nodemap_addr + nodemap_size);
102 return 0;
103}
104
105/*
106 * The LSB of all start and end addresses in the node map is the value of the
107 * maximum possible shift.
108 */
109static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
110 int numnodes)
111{
112 int i, nodes_used = 0;
113 unsigned long start, end;
114 unsigned long bitfield = 0, memtop = 0;
115
116 for (i = 0; i < numnodes; i++) {
117 start = nodes[i].start;
118 end = nodes[i].end;
119 if (start >= end)
120 continue;
121 bitfield |= start;
122 nodes_used++;
123 if (end > memtop)
124 memtop = end;
125 }
126 if (nodes_used <= 1)
127 i = 63;
128 else
129 i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
130 memnodemapsize = (memtop >> i)+1;
131 return i;
132}
133
134int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
135 int *nodeids)
136{
137 int shift;
138
139 shift = extract_lsb_from_nodes(nodes, numnodes);
140 if (allocate_cachealigned_memnodemap())
141 return -1;
142 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
143 shift);
144
145 if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
146 printk(KERN_INFO "Your memory is not aligned you need to "
147 "rebuild your kernel with a bigger NODEMAPSIZE "
148 "shift=%d\n", shift);
149 return -1;
150 }
151 return shift;
152}
153
154int __meminit __early_pfn_to_nid(unsigned long pfn)
155{
156 return phys_to_nid(pfn << PAGE_SHIFT);
157}
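The memnodemap machinery being removed above is easier to see with numbers. Below is a standalone model assuming a hypothetical two-node layout at [0, 4G) and [4G, 8G): OR-ing the start addresses leaves only bit 32 set, so extract_lsb_from_nodes() would pick shift = 32 and phys_to_nid() reduces to one shift plus a table lookup. This only models the mechanism for illustration; the real code also handles the single-node case (shift 63) and allocates the map from e820.

#include <stdio.h>
#include <stdint.h>

struct bootnode { uint64_t start, end; };

int main(void)
{
	/* hypothetical two-node layout */
	struct bootnode nodes[] = {
		{ 0x000000000ULL, 0x100000000ULL },	/* node 0: [0, 4G)  */
		{ 0x100000000ULL, 0x200000000ULL },	/* node 1: [4G, 8G) */
	};
	uint64_t bitfield = 0, addr;
	int memnodemap[16];
	int shift, i;

	for (i = 0; i < 2; i++)
		bitfield |= nodes[i].start;
	/* lowest set bit of all start addresses bounds the usable shift */
	shift = __builtin_ctzll(bitfield);		/* 32 for this layout */

	for (i = 0; i < 2; i++)
		for (addr = nodes[i].start; addr < nodes[i].end; addr += 1ULL << shift)
			memnodemap[addr >> shift] = i;

	/* phys_to_nid() is then a single shift and table lookup */
	printf("shift=%d, address 0x180000000 -> node %d\n",
	       shift, memnodemap[0x180000000ULL >> shift]);
	return 0;
}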
158
159static void * __init early_node_mem(int nodeid, unsigned long start,
160 unsigned long end, unsigned long size,
161 unsigned long align)
162{
163 unsigned long mem;
164
165 /*
166 * put it on high as possible
167 * something will go with NODE_DATA
168 */
169 if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
170 start = MAX_DMA_PFN<<PAGE_SHIFT;
171 if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
172 end > (MAX_DMA32_PFN<<PAGE_SHIFT))
173 start = MAX_DMA32_PFN<<PAGE_SHIFT;
174 mem = find_e820_area(start, end, size, align);
175 if (mem != -1L)
176 return __va(mem);
177
178 /* extend the search scope */
179 end = max_pfn_mapped << PAGE_SHIFT;
180 if (end > (MAX_DMA32_PFN<<PAGE_SHIFT))
181 start = MAX_DMA32_PFN<<PAGE_SHIFT;
182 else
183 start = MAX_DMA_PFN<<PAGE_SHIFT;
184 mem = find_e820_area(start, end, size, align);
185 if (mem != -1L)
186 return __va(mem);
187
188 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
189 size, nodeid);
190
191 return NULL;
192}
193
194/* Initialize bootmem allocator for a node */
195void __init
196setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
197{
198 unsigned long start_pfn, last_pfn, nodedata_phys;
199 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
200 int nid;
201#ifndef CONFIG_NO_BOOTMEM
202 unsigned long bootmap_start, bootmap_pages, bootmap_size;
203 void *bootmap;
204#endif
205
206 if (!end)
207 return;
208
209 /*
210 * Don't confuse VM with a node that doesn't have the
211 * minimum amount of memory:
212 */
213 if (end && (end - start) < NODE_MIN_SIZE)
214 return;
215
216 start = roundup(start, ZONE_ALIGN);
217
218 printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
219 start, end);
220
221 start_pfn = start >> PAGE_SHIFT;
222 last_pfn = end >> PAGE_SHIFT;
223
224 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
225 SMP_CACHE_BYTES);
226 if (node_data[nodeid] == NULL)
227 return;
228 nodedata_phys = __pa(node_data[nodeid]);
229 reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
230 printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
231 nodedata_phys + pgdat_size - 1);
232 nid = phys_to_nid(nodedata_phys);
233 if (nid != nodeid)
234 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
235
236 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
237 NODE_DATA(nodeid)->node_id = nodeid;
238 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
239 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
240
241#ifndef CONFIG_NO_BOOTMEM
242 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
243
244 /*
245 * Find a place for the bootmem map
246 * nodedata_phys could be on other nodes by alloc_bootmem,
247 * so need to sure bootmap_start not to be small, otherwise
248 * early_node_mem will get that with find_e820_area instead
249 * of alloc_bootmem, that could clash with reserved range
250 */
251 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
252 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
253 /*
254 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
255 * to use that to align to PAGE_SIZE
256 */
257 bootmap = early_node_mem(nodeid, bootmap_start, end,
258 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
259 if (bootmap == NULL) {
260 free_early(nodedata_phys, nodedata_phys + pgdat_size);
261 node_data[nodeid] = NULL;
262 return;
263 }
264 bootmap_start = __pa(bootmap);
265 reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT),
266 "BOOTMAP");
267
268 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
269 bootmap_start >> PAGE_SHIFT,
270 start_pfn, last_pfn);
271
272 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
273 bootmap_start, bootmap_start + bootmap_size - 1,
274 bootmap_pages);
275 nid = phys_to_nid(bootmap_start);
276 if (nid != nodeid)
277 printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
278
279 free_bootmem_with_active_regions(nodeid, end);
280#endif
281
282 node_set_online(nodeid);
283}
284
285/*
286 * There are unfortunately some poorly designed mainboards around that
287 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
288 * mapping. To avoid this fill in the mapping for all possible CPUs,
289 * as the number of CPUs is not known yet. We round robin the existing
290 * nodes.
291 */
292void __init numa_init_array(void)
293{
294 int rr, i;
295
296 rr = first_node(node_online_map);
297 for (i = 0; i < nr_cpu_ids; i++) {
298 if (early_cpu_to_node(i) != NUMA_NO_NODE)
299 continue;
300 numa_set_node(i, rr);
301 rr = next_node(rr, node_online_map);
302 if (rr == MAX_NUMNODES)
303 rr = first_node(node_online_map);
304 }
305}
306
307#ifdef CONFIG_NUMA_EMU
308/* Numa emulation */
309static struct bootnode nodes[MAX_NUMNODES] __initdata;
310static struct bootnode physnodes[MAX_NUMNODES] __initdata;
311static char *cmdline __initdata;
312
313static int __init setup_physnodes(unsigned long start, unsigned long end,
314 int acpi, int k8)
315{
316 int nr_nodes = 0;
317 int ret = 0;
318 int i;
319
320#ifdef CONFIG_ACPI_NUMA
321 if (acpi)
322 nr_nodes = acpi_get_nodes(physnodes);
323#endif
324#ifdef CONFIG_K8_NUMA
325 if (k8)
326 nr_nodes = k8_get_nodes(physnodes);
327#endif
328 /*
329 * Basic sanity checking on the physical node map: there may be errors
330 * if the SRAT or K8 incorrectly reported the topology or the mem=
331 * kernel parameter is used.
332 */
333 for (i = 0; i < nr_nodes; i++) {
334 if (physnodes[i].start == physnodes[i].end)
335 continue;
336 if (physnodes[i].start > end) {
337 physnodes[i].end = physnodes[i].start;
338 continue;
339 }
340 if (physnodes[i].end < start) {
341 physnodes[i].start = physnodes[i].end;
342 continue;
343 }
344 if (physnodes[i].start < start)
345 physnodes[i].start = start;
346 if (physnodes[i].end > end)
347 physnodes[i].end = end;
348 }
349
350 /*
351 * Remove all nodes that have no memory or were truncated because of the
352 * limited address range.
353 */
354 for (i = 0; i < nr_nodes; i++) {
355 if (physnodes[i].start == physnodes[i].end)
356 continue;
357 physnodes[ret].start = physnodes[i].start;
358 physnodes[ret].end = physnodes[i].end;
359 ret++;
360 }
361
362 /*
363 * If no physical topology was detected, a single node is faked to cover
364 * the entire address space.
365 */
366 if (!ret) {
367 physnodes[ret].start = start;
368 physnodes[ret].end = end;
369 ret = 1;
370 }
371 return ret;
372}
373
374/*
375 * Sets up nid to range from addr to addr + size. If the end
376 * boundary is greater than max_addr, then max_addr is used instead.
377 * The return value is 0 if there is additional memory left for
378 * allocation past addr and -1 otherwise. addr is adjusted to be at
379 * the end of the node.
380 */
381static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
382{
383 int ret = 0;
384 nodes[nid].start = *addr;
385 *addr += size;
386 if (*addr >= max_addr) {
387 *addr = max_addr;
388 ret = -1;
389 }
390 nodes[nid].end = *addr;
391 node_set(nid, node_possible_map);
392 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
393 nodes[nid].start, nodes[nid].end,
394 (nodes[nid].end - nodes[nid].start) >> 20);
395 return ret;
396}
397
398/*
399 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
400 * to max_addr. The return value is the number of nodes allocated.
401 */
402static int __init split_nodes_interleave(u64 addr, u64 max_addr,
403 int nr_phys_nodes, int nr_nodes)
404{ 10{
405 nodemask_t physnode_mask = NODE_MASK_NONE; 11 x86_numa_init();
406 u64 size;
407 int big;
408 int ret = 0;
409 int i;
410
411 if (nr_nodes <= 0)
412 return -1;
413 if (nr_nodes > MAX_NUMNODES) {
414 pr_info("numa=fake=%d too large, reducing to %d\n",
415 nr_nodes, MAX_NUMNODES);
416 nr_nodes = MAX_NUMNODES;
417 }
418
419 size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
420 /*
421 * Calculate the number of big nodes that can be allocated as a result
422 * of consolidating the remainder.
423 */
424 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
425 FAKE_NODE_MIN_SIZE;
426
427 size &= FAKE_NODE_MIN_HASH_MASK;
428 if (!size) {
429 pr_err("Not enough memory for each node. "
430 "NUMA emulation disabled.\n");
431 return -1;
432 }
433
434 for (i = 0; i < nr_phys_nodes; i++)
435 if (physnodes[i].start != physnodes[i].end)
436 node_set(i, physnode_mask);
437
438 /*
439 * Continue to fill physical nodes with fake nodes until there is no
440 * memory left on any of them.
441 */
442 while (nodes_weight(physnode_mask)) {
443 for_each_node_mask(i, physnode_mask) {
444 u64 end = physnodes[i].start + size;
445 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
446
447 if (ret < big)
448 end += FAKE_NODE_MIN_SIZE;
449
450 /*
451 * Continue to add memory to this fake node if its
452 * non-reserved memory is less than the per-node size.
453 */
454 while (end - physnodes[i].start -
455 e820_hole_size(physnodes[i].start, end) < size) {
456 end += FAKE_NODE_MIN_SIZE;
457 if (end > physnodes[i].end) {
458 end = physnodes[i].end;
459 break;
460 }
461 }
462
463 /*
464 * If there won't be at least FAKE_NODE_MIN_SIZE of
465 * non-reserved memory in ZONE_DMA32 for the next node,
466 * this one must extend to the boundary.
467 */
468 if (end < dma32_end && dma32_end - end -
469 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
470 end = dma32_end;
471
472 /*
473 * If there won't be enough non-reserved memory for the
474 * next node, this one must extend to the end of the
475 * physical node.
476 */
477 if (physnodes[i].end - end -
478 e820_hole_size(end, physnodes[i].end) < size)
479 end = physnodes[i].end;
480
481 /*
482 * Avoid allocating more nodes than requested, which can
483 * happen as a result of rounding down each node's size
484 * to FAKE_NODE_MIN_SIZE.
485 */
486 if (nodes_weight(physnode_mask) + ret >= nr_nodes)
487 end = physnodes[i].end;
488
489 if (setup_node_range(ret++, &physnodes[i].start,
490 end - physnodes[i].start,
491 physnodes[i].end) < 0)
492 node_clear(i, physnode_mask);
493 }
494 }
495 return ret;
496}
497
498/*
499 * Returns the end address of a node so that there is at least `size' amount of
500 * non-reserved memory or `max_addr' is reached.
501 */
502static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
503{
504 u64 end = start + size;
505
506 while (end - start - e820_hole_size(start, end) < size) {
507 end += FAKE_NODE_MIN_SIZE;
508 if (end > max_addr) {
509 end = max_addr;
510 break;
511 }
512 }
513 return end;
514}
515
516/*
517 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
518 * `addr' to `max_addr'. The return value is the number of nodes allocated.
519 */
520static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
521{
522 nodemask_t physnode_mask = NODE_MASK_NONE;
523 u64 min_size;
524 int ret = 0;
525 int i;
526
527 if (!size)
528 return -1;
529 /*
530 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
531 * increased accordingly if the requested size is too small. This
532 * creates a uniform distribution of node sizes across the entire
533 * machine (but not necessarily over physical nodes).
534 */
535 min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) /
536 MAX_NUMNODES;
537 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
538 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
539 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
540 FAKE_NODE_MIN_HASH_MASK;
541 if (size < min_size) {
542 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
543 size >> 20, min_size >> 20);
544 size = min_size;
545 }
546 size &= FAKE_NODE_MIN_HASH_MASK;
547
548 for (i = 0; i < MAX_NUMNODES; i++)
549 if (physnodes[i].start != physnodes[i].end)
550 node_set(i, physnode_mask);
551 /*
552 * Fill physical nodes with fake nodes of size until there is no memory
553 * left on any of them.
554 */
555 while (nodes_weight(physnode_mask)) {
556 for_each_node_mask(i, physnode_mask) {
557 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
558 u64 end;
559
560 end = find_end_of_node(physnodes[i].start,
561 physnodes[i].end, size);
562 /*
563 * If there won't be at least FAKE_NODE_MIN_SIZE of
564 * non-reserved memory in ZONE_DMA32 for the next node,
565 * this one must extend to the boundary.
566 */
567 if (end < dma32_end && dma32_end - end -
568 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
569 end = dma32_end;
570
571 /*
572 * If there won't be enough non-reserved memory for the
573 * next node, this one must extend to the end of the
574 * physical node.
575 */
576 if (physnodes[i].end - end -
577 e820_hole_size(end, physnodes[i].end) < size)
578 end = physnodes[i].end;
579
580 /*
581 * Setup the fake node that will be allocated as bootmem
582 * later. If setup_node_range() returns non-zero, there
583 * is no more memory available on this physical node.
584 */
585 if (setup_node_range(ret++, &physnodes[i].start,
586 end - physnodes[i].start,
587 physnodes[i].end) < 0)
588 node_clear(i, physnode_mask);
589 }
590 }
591 return ret;
592}
593
594/*
595 * Sets up the system RAM area from start_pfn to last_pfn according to the
596 * numa=fake command-line option.
597 */
598static int __init numa_emulation(unsigned long start_pfn,
599 unsigned long last_pfn, int acpi, int k8)
600{
601 u64 addr = start_pfn << PAGE_SHIFT;
602 u64 max_addr = last_pfn << PAGE_SHIFT;
603 int num_phys_nodes;
604 int num_nodes;
605 int i;
606
607 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
608 /*
609 * If the numa=fake command-line contains a 'M' or 'G', it represents
610 * the fixed node size. Otherwise, if it is just a single number N,
611 * split the system RAM into N fake nodes.
612 */
613 if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
614 u64 size;
615
616 size = memparse(cmdline, &cmdline);
617 num_nodes = split_nodes_size_interleave(addr, max_addr, size);
618 } else {
619 unsigned long n;
620
621 n = simple_strtoul(cmdline, NULL, 0);
622 num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
623 }
624
625 if (num_nodes < 0)
626 return num_nodes;
627 memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
628 if (memnode_shift < 0) {
629 memnode_shift = 0;
630 printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
631 "disabled.\n");
632 return -1;
633 }
634
635 /*
636 * We need to vacate all active ranges that may have been registered for
637 * the e820 memory map.
638 */
639 remove_all_active_ranges();
640 for_each_node_mask(i, node_possible_map) {
641 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
642 nodes[i].end >> PAGE_SHIFT);
643 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
644 }
645 acpi_fake_nodes(nodes, num_nodes);
646 numa_init_array();
647 return 0;
648}
649#endif /* CONFIG_NUMA_EMU */
650
651void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
652 int acpi, int k8)
653{
654 int i;
655
656 nodes_clear(node_possible_map);
657 nodes_clear(node_online_map);
658
659#ifdef CONFIG_NUMA_EMU
660 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
661 return;
662 nodes_clear(node_possible_map);
663 nodes_clear(node_online_map);
664#endif
665
666#ifdef CONFIG_ACPI_NUMA
667 if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
668 last_pfn << PAGE_SHIFT))
669 return;
670 nodes_clear(node_possible_map);
671 nodes_clear(node_online_map);
672#endif
673
674#ifdef CONFIG_K8_NUMA
675 if (!numa_off && k8 && !k8_scan_nodes())
676 return;
677 nodes_clear(node_possible_map);
678 nodes_clear(node_online_map);
679#endif
680 printk(KERN_INFO "%s\n",
681 numa_off ? "NUMA turned off" : "No NUMA configuration found");
682
683 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
684 start_pfn << PAGE_SHIFT,
685 last_pfn << PAGE_SHIFT);
686 /* setup dummy node covering all memory */
687 memnode_shift = 63;
688 memnodemap = memnode.embedded_map;
689 memnodemap[0] = 0;
690 node_set_online(0);
691 node_set(0, node_possible_map);
692 for (i = 0; i < nr_cpu_ids; i++)
693 numa_set_node(i, 0);
694 e820_register_active_regions(0, start_pfn, last_pfn);
695 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
696} 12}
697 13
698unsigned long __init numa_free_all_bootmem(void) 14unsigned long __init numa_free_all_bootmem(void)
@@ -703,199 +19,7 @@ unsigned long __init numa_free_all_bootmem(void)
703 for_each_online_node(i) 19 for_each_online_node(i)
704 pages += free_all_bootmem_node(NODE_DATA(i)); 20 pages += free_all_bootmem_node(NODE_DATA(i));
705 21
706#ifdef CONFIG_NO_BOOTMEM
707 pages += free_all_memory_core_early(MAX_NUMNODES); 22 pages += free_all_memory_core_early(MAX_NUMNODES);
708#endif
709 23
710 return pages; 24 return pages;
711} 25}
712
713static __init int numa_setup(char *opt)
714{
715 if (!opt)
716 return -EINVAL;
717 if (!strncmp(opt, "off", 3))
718 numa_off = 1;
719#ifdef CONFIG_NUMA_EMU
720 if (!strncmp(opt, "fake=", 5))
721 cmdline = opt + 5;
722#endif
723#ifdef CONFIG_ACPI_NUMA
724 if (!strncmp(opt, "noacpi", 6))
725 acpi_numa = -1;
726#endif
727 return 0;
728}
729early_param("numa", numa_setup);
730
731#ifdef CONFIG_NUMA
732
733static __init int find_near_online_node(int node)
734{
735 int n, val;
736 int min_val = INT_MAX;
737 int best_node = -1;
738
739 for_each_online_node(n) {
740 val = node_distance(node, n);
741
742 if (val < min_val) {
743 min_val = val;
744 best_node = n;
745 }
746 }
747
748 return best_node;
749}
750
751/*
752 * Setup early cpu_to_node.
753 *
754 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
755 * and apicid_to_node[] tables have valid entries for a CPU.
756 * This means we skip cpu_to_node[] initialisation for NUMA
757 * emulation and faking node case (when running a kernel compiled
758 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
759 * is already initialized in a round robin manner at numa_init_array,
760 * prior to this call, and this initialization is good enough
761 * for the fake NUMA cases.
762 *
763 * Called before the per_cpu areas are setup.
764 */
765void __init init_cpu_to_node(void)
766{
767 int cpu;
768 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
769
770 BUG_ON(cpu_to_apicid == NULL);
771
772 for_each_possible_cpu(cpu) {
773 int node;
774 u16 apicid = cpu_to_apicid[cpu];
775
776 if (apicid == BAD_APICID)
777 continue;
778 node = apicid_to_node[apicid];
779 if (node == NUMA_NO_NODE)
780 continue;
781 if (!node_online(node))
782 node = find_near_online_node(node);
783 numa_set_node(cpu, node);
784 }
785}
786#endif
787
788
789void __cpuinit numa_set_node(int cpu, int node)
790{
791 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
792
793 /* early setting, no percpu area yet */
794 if (cpu_to_node_map) {
795 cpu_to_node_map[cpu] = node;
796 return;
797 }
798
799#ifdef CONFIG_DEBUG_PER_CPU_MAPS
800 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
801 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
802 dump_stack();
803 return;
804 }
805#endif
806 per_cpu(x86_cpu_to_node_map, cpu) = node;
807
808 if (node != NUMA_NO_NODE)
809 set_cpu_numa_node(cpu, node);
810}
811
812void __cpuinit numa_clear_node(int cpu)
813{
814 numa_set_node(cpu, NUMA_NO_NODE);
815}
816
817#ifndef CONFIG_DEBUG_PER_CPU_MAPS
818
819void __cpuinit numa_add_cpu(int cpu)
820{
821 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
822}
823
824void __cpuinit numa_remove_cpu(int cpu)
825{
826 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
827}
828
829#else /* CONFIG_DEBUG_PER_CPU_MAPS */
830
831/*
832 * --------- debug versions of the numa functions ---------
833 */
834static void __cpuinit numa_set_cpumask(int cpu, int enable)
835{
836 int node = early_cpu_to_node(cpu);
837 struct cpumask *mask;
838 char buf[64];
839
840 mask = node_to_cpumask_map[node];
841 if (mask == NULL) {
842 printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
843 dump_stack();
844 return;
845 }
846
847 if (enable)
848 cpumask_set_cpu(cpu, mask);
849 else
850 cpumask_clear_cpu(cpu, mask);
851
852 cpulist_scnprintf(buf, sizeof(buf), mask);
853 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
854 enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
855}
856
857void __cpuinit numa_add_cpu(int cpu)
858{
859 numa_set_cpumask(cpu, 1);
860}
861
862void __cpuinit numa_remove_cpu(int cpu)
863{
864 numa_set_cpumask(cpu, 0);
865}
866
867int __cpu_to_node(int cpu)
868{
869 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
870 printk(KERN_WARNING
871 "cpu_to_node(%d): usage too early!\n", cpu);
872 dump_stack();
873 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
874 }
875 return per_cpu(x86_cpu_to_node_map, cpu);
876}
877EXPORT_SYMBOL(__cpu_to_node);
878
879/*
880 * Same function as cpu_to_node() but used if called before the
881 * per_cpu areas are setup.
882 */
883int early_cpu_to_node(int cpu)
884{
885 if (early_per_cpu_ptr(x86_cpu_to_node_map))
886 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
887
888 if (!cpu_possible(cpu)) {
889 printk(KERN_WARNING
890 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
891 dump_stack();
892 return NUMA_NO_NODE;
893 }
894 return per_cpu(x86_cpu_to_node_map, cpu);
895}
896
897/*
898 * --------- end of debug versions of the numa functions ---------
899 */
900
901#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
new file mode 100644
index 000000000000..d0ed086b6247
--- /dev/null
+++ b/arch/x86/mm/numa_emulation.c
@@ -0,0 +1,492 @@
1/*
2 * NUMA emulation
3 */
4#include <linux/kernel.h>
5#include <linux/errno.h>
6#include <linux/topology.h>
7#include <linux/memblock.h>
8#include <linux/bootmem.h>
9#include <asm/dma.h>
10
11#include "numa_internal.h"
12
13static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
14static char *emu_cmdline __initdata;
15
16void __init numa_emu_cmdline(char *str)
17{
18 emu_cmdline = str;
19}
20
21static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
22{
23 int i;
24
25 for (i = 0; i < mi->nr_blks; i++)
26 if (mi->blk[i].nid == nid)
27 return i;
28 return -ENOENT;
29}
30
31/*
32 * Sets up nid to range from @start to @end. The return value is -errno if
33 * something went wrong, 0 otherwise.
34 */
35static int __init emu_setup_memblk(struct numa_meminfo *ei,
36 struct numa_meminfo *pi,
37 int nid, int phys_blk, u64 size)
38{
39 struct numa_memblk *eb = &ei->blk[ei->nr_blks];
40 struct numa_memblk *pb = &pi->blk[phys_blk];
41
42 if (ei->nr_blks >= NR_NODE_MEMBLKS) {
43 pr_err("NUMA: Too many emulated memblks, failing emulation\n");
44 return -EINVAL;
45 }
46
47 ei->nr_blks++;
48 eb->start = pb->start;
49 eb->end = pb->start + size;
50 eb->nid = nid;
51
52 if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
53 emu_nid_to_phys[nid] = pb->nid;
54
55 pb->start += size;
56 if (pb->start >= pb->end) {
57 WARN_ON_ONCE(pb->start > pb->end);
58 numa_remove_memblk_from(phys_blk, pi);
59 }
60
61 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
62 eb->start, eb->end, (eb->end - eb->start) >> 20);
63 return 0;
64}
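A worked example of the carving emu_setup_memblk() above performs, with made-up numbers: one physical block [0, 4G) on node 0 and a requested emulated size of 1 GiB. The emulated block becomes [0, 1G) and the physical block shrinks to [1G, 4G); only when its start catches up with its end is it removed from the physical meminfo. A trivial standalone model:

#include <stdio.h>
#include <stdint.h>

struct blk { uint64_t start, end; int nid; };

int main(void)
{
	struct blk phys = { 0x0, 0x100000000ULL, 0 };	/* [0, 4G) on phys node 0 */
	uint64_t size = 1ULL << 30;			/* carve 1 GiB            */
	struct blk emu;

	emu.start = phys.start;
	emu.end   = phys.start + size;			/* emulated blk [0, 1G)   */
	emu.nid   = 0;					/* first emulated nid     */
	phys.start += size;				/* phys shrinks to [1G, 4G) */

	printf("emu  [%#llx-%#llx) nid %d\n",
	       (unsigned long long)emu.start, (unsigned long long)emu.end, emu.nid);
	printf("phys [%#llx-%#llx) remaining\n",
	       (unsigned long long)phys.start, (unsigned long long)phys.end);
	return 0;
}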
65
66/*
67 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
68 * to max_addr. The return value is the number of nodes allocated.
69 */
70static int __init split_nodes_interleave(struct numa_meminfo *ei,
71 struct numa_meminfo *pi,
72 u64 addr, u64 max_addr, int nr_nodes)
73{
74 nodemask_t physnode_mask = NODE_MASK_NONE;
75 u64 size;
76 int big;
77 int nid = 0;
78 int i, ret;
79
80 if (nr_nodes <= 0)
81 return -1;
82 if (nr_nodes > MAX_NUMNODES) {
83 pr_info("numa=fake=%d too large, reducing to %d\n",
84 nr_nodes, MAX_NUMNODES);
85 nr_nodes = MAX_NUMNODES;
86 }
87
88 /*
89 * Calculate target node size. x86_32 freaks on __udivdi3() so do
90 * the division in ulong number of pages and convert back.
91 */
92 size = max_addr - addr - memblock_x86_hole_size(addr, max_addr);
93 size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
94
95 /*
96 * Calculate the number of big nodes that can be allocated as a result
97 * of consolidating the remainder.
98 */
99 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
100 FAKE_NODE_MIN_SIZE;
101
102 size &= FAKE_NODE_MIN_HASH_MASK;
103 if (!size) {
104 pr_err("Not enough memory for each node. "
105 "NUMA emulation disabled.\n");
106 return -1;
107 }
108
109 for (i = 0; i < pi->nr_blks; i++)
110 node_set(pi->blk[i].nid, physnode_mask);
111
112 /*
113 * Continue to fill physical nodes with fake nodes until there is no
114 * memory left on any of them.
115 */
116 while (nodes_weight(physnode_mask)) {
117 for_each_node_mask(i, physnode_mask) {
118 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
119 u64 start, limit, end;
120 int phys_blk;
121
122 phys_blk = emu_find_memblk_by_nid(i, pi);
123 if (phys_blk < 0) {
124 node_clear(i, physnode_mask);
125 continue;
126 }
127 start = pi->blk[phys_blk].start;
128 limit = pi->blk[phys_blk].end;
129 end = start + size;
130
131 if (nid < big)
132 end += FAKE_NODE_MIN_SIZE;
133
134 /*
135 * Continue to add memory to this fake node if its
136 * non-reserved memory is less than the per-node size.
137 */
138 while (end - start -
139 memblock_x86_hole_size(start, end) < size) {
140 end += FAKE_NODE_MIN_SIZE;
141 if (end > limit) {
142 end = limit;
143 break;
144 }
145 }
146
147 /*
148 * If there won't be at least FAKE_NODE_MIN_SIZE of
149 * non-reserved memory in ZONE_DMA32 for the next node,
150 * this one must extend to the boundary.
151 */
152 if (end < dma32_end && dma32_end - end -
153 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
154 end = dma32_end;
155
156 /*
157 * If there won't be enough non-reserved memory for the
158 * next node, this one must extend to the end of the
159 * physical node.
160 */
161 if (limit - end -
162 memblock_x86_hole_size(end, limit) < size)
163 end = limit;
164
165 ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
166 phys_blk,
167 min(end, limit) - start);
168 if (ret < 0)
169 return ret;
170 }
171 }
172 return 0;
173}
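The __udivdi3 note in split_nodes_interleave() above refers to 64-bit division on x86_32: dividing a u64 by an integer makes gcc emit a call to the libgcc helper __udivdi3(), which the 32-bit kernel intentionally does not provide, so the code divides a page count held in an unsigned long and converts back with PFN_PHYS(). A standalone model of that pattern, with PAGE_SHIFT and the memory span as assumptions:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PFN_PHYS(pfn) ((uint64_t)(pfn) << PAGE_SHIFT)

int main(void)
{
	uint64_t span = 6ULL << 30;		/* 6 GiB of usable memory */
	int nr_nodes = 4;			/* e.g. numa=fake=4       */

	/* divide in pages (fits an unsigned long), then convert back */
	unsigned long pages = (unsigned long)(span >> PAGE_SHIFT);
	uint64_t size = PFN_PHYS(pages / nr_nodes);

	printf("per-node size: %llu MiB\n", (unsigned long long)(size >> 20));
	return 0;
}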
174
175/*
176 * Returns the end address of a node so that there is at least `size' amount of
177 * non-reserved memory or `max_addr' is reached.
178 */
179static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
180{
181 u64 end = start + size;
182
183 while (end - start - memblock_x86_hole_size(start, end) < size) {
184 end += FAKE_NODE_MIN_SIZE;
185 if (end > max_addr) {
186 end = max_addr;
187 break;
188 }
189 }
190 return end;
191}
192
193/*
194 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
195 * `addr' to `max_addr'. The return value is the number of nodes allocated.
196 */
197static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
198 struct numa_meminfo *pi,
199 u64 addr, u64 max_addr, u64 size)
200{
201 nodemask_t physnode_mask = NODE_MASK_NONE;
202 u64 min_size;
203 int nid = 0;
204 int i, ret;
205
206 if (!size)
207 return -1;
208 /*
209 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
210 * increased accordingly if the requested size is too small. This
211 * creates a uniform distribution of node sizes across the entire
212 * machine (but not necessarily over physical nodes).
213 */
214 min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
215 MAX_NUMNODES;
216 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
217 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
218 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
219 FAKE_NODE_MIN_HASH_MASK;
220 if (size < min_size) {
221 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
222 size >> 20, min_size >> 20);
223 size = min_size;
224 }
225 size &= FAKE_NODE_MIN_HASH_MASK;
226
227 for (i = 0; i < pi->nr_blks; i++)
228 node_set(pi->blk[i].nid, physnode_mask);
229
230 /*
231 * Fill physical nodes with fake nodes of size until there is no memory
232 * left on any of them.
233 */
234 while (nodes_weight(physnode_mask)) {
235 for_each_node_mask(i, physnode_mask) {
236 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
237 u64 start, limit, end;
238 int phys_blk;
239
240 phys_blk = emu_find_memblk_by_nid(i, pi);
241 if (phys_blk < 0) {
242 node_clear(i, physnode_mask);
243 continue;
244 }
245 start = pi->blk[phys_blk].start;
246 limit = pi->blk[phys_blk].end;
247
248 end = find_end_of_node(start, limit, size);
249 /*
250 * If there won't be at least FAKE_NODE_MIN_SIZE of
251 * non-reserved memory in ZONE_DMA32 for the next node,
252 * this one must extend to the boundary.
253 */
254 if (end < dma32_end && dma32_end - end -
255 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
256 end = dma32_end;
257
258 /*
259 * If there won't be enough non-reserved memory for the
260 * next node, this one must extend to the end of the
261 * physical node.
262 */
263 if (limit - end -
264 memblock_x86_hole_size(end, limit) < size)
265 end = limit;
266
267 ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
268 phys_blk,
269 min(end, limit) - start);
270 if (ret < 0)
271 return ret;
272 }
273 }
274 return 0;
275}
276
277/**
278 * numa_emulation - Emulate NUMA nodes
279 * @numa_meminfo: NUMA configuration to massage
280 * @numa_dist_cnt: The size of the physical NUMA distance table
281 *
282 * Emulate NUMA nodes according to the numa=fake kernel parameter.
283 * @numa_meminfo contains the physical memory configuration and is modified
284 * to reflect the emulated configuration on success. @numa_dist_cnt is
285 * used to determine the size of the physical distance table.
286 *
287 * On success, the following modifications are made.
288 *
289 * - @numa_meminfo is updated to reflect the emulated nodes.
290 *
291 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
292 * emulated nodes.
293 *
294 * - NUMA distance table is rebuilt to represent distances between emulated
295 * nodes. The distances are determined considering how emulated nodes
296 * are mapped to physical nodes and match the actual distances.
297 *
298 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
299 * nodes. This is used by numa_add_cpu() and numa_remove_cpu().
300 *
301 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
302 * identity mapping and no other modification is made.
303 */
304void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
305{
306 static struct numa_meminfo ei __initdata;
307 static struct numa_meminfo pi __initdata;
308 const u64 max_addr = PFN_PHYS(max_pfn);
309 u8 *phys_dist = NULL;
310 size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
311 int max_emu_nid, dfl_phys_nid;
312 int i, j, ret;
313
314 if (!emu_cmdline)
315 goto no_emu;
316
317 memset(&ei, 0, sizeof(ei));
318 pi = *numa_meminfo;
319
320 for (i = 0; i < MAX_NUMNODES; i++)
321 emu_nid_to_phys[i] = NUMA_NO_NODE;
322
323 /*
324 * If the numa=fake command-line contains a 'M' or 'G', it represents
325 * the fixed node size. Otherwise, if it is just a single number N,
326 * split the system RAM into N fake nodes.
327 */
328 if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
329 u64 size;
330
331 size = memparse(emu_cmdline, &emu_cmdline);
332 ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
333 } else {
334 unsigned long n;
335
336 n = simple_strtoul(emu_cmdline, NULL, 0);
337 ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
338 }
339
340 if (ret < 0)
341 goto no_emu;
342
343 if (numa_cleanup_meminfo(&ei) < 0) {
344 pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
345 goto no_emu;
346 }
347
348 /* copy the physical distance table */
349 if (numa_dist_cnt) {
350 u64 phys;
351
352 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
353 phys_size, PAGE_SIZE);
354 if (phys == MEMBLOCK_ERROR) {
355 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
356 goto no_emu;
357 }
358 memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST");
359 phys_dist = __va(phys);
360
361 for (i = 0; i < numa_dist_cnt; i++)
362 for (j = 0; j < numa_dist_cnt; j++)
363 phys_dist[i * numa_dist_cnt + j] =
364 node_distance(i, j);
365 }
366
367 /*
368 * Determine the max emulated nid and the default phys nid to use
369 * for unmapped nodes.
370 */
371 max_emu_nid = 0;
372 dfl_phys_nid = NUMA_NO_NODE;
373 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
374 if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
375 max_emu_nid = i;
376 if (dfl_phys_nid == NUMA_NO_NODE)
377 dfl_phys_nid = emu_nid_to_phys[i];
378 }
379 }
380 if (dfl_phys_nid == NUMA_NO_NODE) {
381 pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
382 goto no_emu;
383 }
384
385 /* commit */
386 *numa_meminfo = ei;
387
388 /*
389 * Transform __apicid_to_node table to use emulated nids by
390 * reverse-mapping phys_nid. The maps should always exist but fall
391 * back to zero just in case.
392 */
393 for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
394 if (__apicid_to_node[i] == NUMA_NO_NODE)
395 continue;
396 for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
397 if (__apicid_to_node[i] == emu_nid_to_phys[j])
398 break;
399 __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
400 }
401
402 /* make sure all emulated nodes are mapped to a physical node */
403 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
404 if (emu_nid_to_phys[i] == NUMA_NO_NODE)
405 emu_nid_to_phys[i] = dfl_phys_nid;
406
407 /* transform distance table */
408 numa_reset_distance();
409 for (i = 0; i < max_emu_nid + 1; i++) {
410 for (j = 0; j < max_emu_nid + 1; j++) {
411 int physi = emu_nid_to_phys[i];
412 int physj = emu_nid_to_phys[j];
413 int dist;
414
415 if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
416 dist = physi == physj ?
417 LOCAL_DISTANCE : REMOTE_DISTANCE;
418 else
419 dist = phys_dist[physi * numa_dist_cnt + physj];
420
421 numa_set_distance(i, j, dist);
422 }
423 }
424
425 /* free the copied physical distance table */
426 if (phys_dist)
427 memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size);
428 return;
429
430no_emu:
431 /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
432 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
433 emu_nid_to_phys[i] = i;
434}
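The distance-table transform at the end of numa_emulation() above boils down to indexing the physical distance table through emu_nid_to_phys[]. A standalone model with invented numbers: two physical nodes with distances {10, 20} and four emulated nodes mapped as {0, 0, 1, 1}; nodes outside the physical table would instead get LOCAL_DISTANCE or REMOTE_DISTANCE.

#include <stdio.h>

int main(void)
{
	int phys_dist[2][2] = { { 10, 20 }, { 20, 10 } };	/* hypothetical SLIT   */
	int emu_nid_to_phys[4] = { 0, 0, 1, 1 };		/* 2 emulated per phys */
	int i, j;

	for (i = 0; i < 4; i++) {
		for (j = 0; j < 4; j++)
			printf("%3d", phys_dist[emu_nid_to_phys[i]][emu_nid_to_phys[j]]);
		printf("\n");
	}
	return 0;
}

The printed 4x4 matrix keeps distance 10 between emulated nodes that share a physical node and 20 otherwise, which is what the docstring means by the emulated distances matching the actual ones.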
435
436#ifndef CONFIG_DEBUG_PER_CPU_MAPS
437void __cpuinit numa_add_cpu(int cpu)
438{
439 int physnid, nid;
440
441 nid = early_cpu_to_node(cpu);
442 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
443
444 physnid = emu_nid_to_phys[nid];
445
446 /*
447 * Map the cpu to each emulated node that is allocated on the physical
448 * node of the cpu's apic id.
449 */
450 for_each_online_node(nid)
451 if (emu_nid_to_phys[nid] == physnid)
452 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
453}
454
455void __cpuinit numa_remove_cpu(int cpu)
456{
457 int i;
458
459 for_each_online_node(i)
460 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
461}
462#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
463static void __cpuinit numa_set_cpumask(int cpu, bool enable)
464{
465 int nid, physnid;
466
467 nid = early_cpu_to_node(cpu);
468 if (nid == NUMA_NO_NODE) {
469 /* early_cpu_to_node() already emits a warning and trace */
470 return;
471 }
472
473 physnid = emu_nid_to_phys[nid];
474
475 for_each_online_node(nid) {
476 if (emu_nid_to_phys[nid] != physnid)
477 continue;
478
479 debug_cpumask_set_cpu(cpu, nid, enable);
480 }
481}
482
483void __cpuinit numa_add_cpu(int cpu)
484{
485 numa_set_cpumask(cpu, true);
486}
487
488void __cpuinit numa_remove_cpu(int cpu)
489{
490 numa_set_cpumask(cpu, false);
491}
492#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
new file mode 100644
index 000000000000..7178c3afe05e
--- /dev/null
+++ b/arch/x86/mm/numa_internal.h
@@ -0,0 +1,39 @@
1#ifndef __X86_MM_NUMA_INTERNAL_H
2#define __X86_MM_NUMA_INTERNAL_H
3
4#include <linux/types.h>
5#include <asm/numa.h>
6
7struct numa_memblk {
8 u64 start;
9 u64 end;
10 int nid;
11};
12
13struct numa_meminfo {
14 int nr_blks;
15 struct numa_memblk blk[NR_NODE_MEMBLKS];
16};
17
18void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
19int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
20void __init numa_reset_distance(void);
21
22void __init x86_numa_init(void);
23
24#ifdef CONFIG_X86_64
25static inline void init_alloc_remap(int nid, u64 start, u64 end) { }
26#else
27void __init init_alloc_remap(int nid, u64 start, u64 end);
28#endif
29
30#ifdef CONFIG_NUMA_EMU
31void __init numa_emulation(struct numa_meminfo *numa_meminfo,
32 int numa_dist_cnt);
33#else
34static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
35 int numa_dist_cnt)
36{ }
37#endif
38
39#endif /* __X86_MM_NUMA_INTERNAL_H */
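For orientation, a hedged standalone model of how the numa_meminfo table declared above is typically walked, in the spirit of emu_find_memblk_by_nid() in numa_emulation.c: iterate blk[0..nr_blks) and filter by nid. NR_NODE_MEMBLKS and the example ranges below are assumptions for the model, not the kernel's values.

#include <stdio.h>
#include <stdint.h>

#define NR_NODE_MEMBLKS 8	/* assumption for the model */

struct numa_memblk { uint64_t start, end; int nid; };
struct numa_meminfo { int nr_blks; struct numa_memblk blk[NR_NODE_MEMBLKS]; };

static uint64_t node_bytes(const struct numa_meminfo *mi, int nid)
{
	uint64_t total = 0;
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].nid == nid)
			total += mi->blk[i].end - mi->blk[i].start;
	return total;
}

int main(void)
{
	struct numa_meminfo mi = {
		.nr_blks = 2,
		.blk = {
			{ 0x00000000ULL, 0x80000000ULL, 0 },	/* 2 GiB on node 0 */
			{ 0x80000000ULL, 0xc0000000ULL, 1 },	/* 1 GiB on node 1 */
		},
	};

	printf("node 0: %llu MiB\n", (unsigned long long)(node_bytes(&mi, 0) >> 20));
	printf("node 1: %llu MiB\n", (unsigned long long)(node_bytes(&mi, 1) >> 20));
	return 0;
}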
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 532e7933d606..f9e526742fa1 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -13,6 +13,7 @@
13#include <linux/pfn.h> 13#include <linux/pfn.h>
14#include <linux/percpu.h> 14#include <linux/percpu.h>
15#include <linux/gfp.h> 15#include <linux/gfp.h>
16#include <linux/pci.h>
16 17
17#include <asm/e820.h> 18#include <asm/e820.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
@@ -56,12 +57,10 @@ static unsigned long direct_pages_count[PG_LEVEL_NUM];
56 57
57void update_page_count(int level, unsigned long pages) 58void update_page_count(int level, unsigned long pages)
58{ 59{
59 unsigned long flags;
60
61 /* Protect against CPA */ 60 /* Protect against CPA */
62 spin_lock_irqsave(&pgd_lock, flags); 61 spin_lock(&pgd_lock);
63 direct_pages_count[level] += pages; 62 direct_pages_count[level] += pages;
64 spin_unlock_irqrestore(&pgd_lock, flags); 63 spin_unlock(&pgd_lock);
65} 64}
66 65
67static void split_page_count(int level) 66static void split_page_count(int level)
@@ -260,8 +259,10 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
260 * The BIOS area between 640k and 1Mb needs to be executable for 259 * The BIOS area between 640k and 1Mb needs to be executable for
261 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. 260 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
262 */ 261 */
263 if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) 262#ifdef CONFIG_PCI_BIOS
263 if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
264 pgprot_val(forbidden) |= _PAGE_NX; 264 pgprot_val(forbidden) |= _PAGE_NX;
265#endif
265 266
266 /* 267 /*
267 * The kernel text needs to be executable for obvious reasons 268 * The kernel text needs to be executable for obvious reasons
@@ -309,7 +310,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
309 * these shared mappings are made of small page mappings. 310 * these shared mappings are made of small page mappings.
310 * Thus this don't enforce !RW mapping for small page kernel 311 * Thus this don't enforce !RW mapping for small page kernel
311 * text mapping logic will help Linux Xen parvirt guest boot 312 * text mapping logic will help Linux Xen parvirt guest boot
312 * aswell. 313 * as well.
313 */ 314 */
314 if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) 315 if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
315 pgprot_val(forbidden) |= _PAGE_RW; 316 pgprot_val(forbidden) |= _PAGE_RW;
@@ -391,16 +392,16 @@ static int
391try_preserve_large_page(pte_t *kpte, unsigned long address, 392try_preserve_large_page(pte_t *kpte, unsigned long address,
392 struct cpa_data *cpa) 393 struct cpa_data *cpa)
393{ 394{
394 unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; 395 unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
395 pte_t new_pte, old_pte, *tmp; 396 pte_t new_pte, old_pte, *tmp;
396 pgprot_t old_prot, new_prot; 397 pgprot_t old_prot, new_prot, req_prot;
397 int i, do_split = 1; 398 int i, do_split = 1;
398 unsigned int level; 399 unsigned int level;
399 400
400 if (cpa->force_split) 401 if (cpa->force_split)
401 return 1; 402 return 1;
402 403
403 spin_lock_irqsave(&pgd_lock, flags); 404 spin_lock(&pgd_lock);
404 /* 405 /*
405 * Check for races, another CPU might have split this page 406 * Check for races, another CPU might have split this page
406 * up already: 407 * up already:
@@ -438,10 +439,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
438 * We are safe now. Check whether the new pgprot is the same: 439 * We are safe now. Check whether the new pgprot is the same:
439 */ 440 */
440 old_pte = *kpte; 441 old_pte = *kpte;
441 old_prot = new_prot = pte_pgprot(old_pte); 442 old_prot = new_prot = req_prot = pte_pgprot(old_pte);
442 443
443 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); 444 pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
444 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); 445 pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
445 446
446 /* 447 /*
447 * old_pte points to the large page base address. So we need 448 * old_pte points to the large page base address. So we need
@@ -450,17 +451,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
450 pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); 451 pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
451 cpa->pfn = pfn; 452 cpa->pfn = pfn;
452 453
453 new_prot = static_protections(new_prot, address, pfn); 454 new_prot = static_protections(req_prot, address, pfn);
454 455
455 /* 456 /*
456 * We need to check the full range, whether 457 * We need to check the full range, whether
457 * static_protection() requires a different pgprot for one of 458 * static_protection() requires a different pgprot for one of
458 * the pages in the range we try to preserve: 459 * the pages in the range we try to preserve:
459 */ 460 */
460 addr = address + PAGE_SIZE; 461 addr = address & pmask;
461 pfn++; 462 pfn = pte_pfn(old_pte);
462 for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) { 463 for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
463 pgprot_t chk_prot = static_protections(new_prot, addr, pfn); 464 pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
464 465
465 if (pgprot_val(chk_prot) != pgprot_val(new_prot)) 466 if (pgprot_val(chk_prot) != pgprot_val(new_prot))
466 goto out_unlock; 467 goto out_unlock;
@@ -483,7 +484,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
483 * that we limited the number of possible pages already to 484 * that we limited the number of possible pages already to
484 * the number of pages in the large page. 485 * the number of pages in the large page.
485 */ 486 */
486 if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { 487 if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
487 /* 488 /*
488 * The address is aligned and the number of pages 489 * The address is aligned and the number of pages
489 * covers the full page. 490 * covers the full page.
@@ -495,14 +496,14 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
495 } 496 }
496 497
497out_unlock: 498out_unlock:
498 spin_unlock_irqrestore(&pgd_lock, flags); 499 spin_unlock(&pgd_lock);
499 500
500 return do_split; 501 return do_split;
501} 502}
502 503
503static int split_large_page(pte_t *kpte, unsigned long address) 504static int split_large_page(pte_t *kpte, unsigned long address)
504{ 505{
505 unsigned long flags, pfn, pfninc = 1; 506 unsigned long pfn, pfninc = 1;
506 unsigned int i, level; 507 unsigned int i, level;
507 pte_t *pbase, *tmp; 508 pte_t *pbase, *tmp;
508 pgprot_t ref_prot; 509 pgprot_t ref_prot;
@@ -516,7 +517,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
516 if (!base) 517 if (!base)
517 return -ENOMEM; 518 return -ENOMEM;
518 519
519 spin_lock_irqsave(&pgd_lock, flags); 520 spin_lock(&pgd_lock);
520 /* 521 /*
521 * Check for races, another CPU might have split this page 522 * Check for races, another CPU might have split this page
522 * up for us already: 523 * up for us already:
@@ -588,7 +589,7 @@ out_unlock:
588 */ 589 */
589 if (base) 590 if (base)
590 __free_page(base); 591 __free_page(base);
591 spin_unlock_irqrestore(&pgd_lock, flags); 592 spin_unlock(&pgd_lock);
592 593
593 return 0; 594 return 0;
594} 595}
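The pageattr.c hunks change two things: the CPA paths now take pgd_lock as a plain (non-IRQ-saving) spinlock, and the static_protections() re-check in try_preserve_large_page() now walks every 4 KiB page of the large mapping (addr = address & pmask, psize >> PAGE_SHIFT iterations) rather than only the requested pages, with the keep-the-large-page shortcut requiring the request to start at the large-page boundary and cover exactly psize >> PAGE_SHIFT pages. The stand-alone sketch below only demonstrates that alignment arithmetic; PSIZE and PMASK are illustrative stand-ins for a 2 MiB page.

/* User-space sketch of the large-page alignment check; illustrative only. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PSIZE		(2UL << 20)		/* 2 MiB large page */
#define PMASK		(~(PSIZE - 1))		/* mask down to the large-page base */

static int covers_whole_large_page(unsigned long address, unsigned long numpages)
{
	/* mirrors: address == (address & pmask) && numpages == (psize >> PAGE_SHIFT) */
	return address == (address & PMASK) &&
	       numpages == (PSIZE >> PAGE_SHIFT);
}

int main(void)
{
	/* 0x200000 is 2 MiB aligned; 512 x 4 KiB pages span the whole mapping. */
	printf("%d\n", covers_whole_large_page(0x200000, 512));	/* 1 */
	/* a misaligned start or partial coverage falls back to the split path */
	printf("%d\n", covers_whole_large_page(0x201000, 512));	/* 0 */
	printf("%d\n", covers_whole_large_page(0x200000, 16));		/* 0 */
	return 0;
}
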
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index 38e6d174c497..9f0614daea85 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -414,22 +414,17 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
414 unsigned char *p; 414 unsigned char *p;
415 struct prefix_bits prf; 415 struct prefix_bits prf;
416 int i; 416 int i;
417 unsigned long rv;
418 417
419 p = (unsigned char *)ins_addr; 418 p = (unsigned char *)ins_addr;
420 p += skip_prefix(p, &prf); 419 p += skip_prefix(p, &prf);
421 p += get_opcode(p, &opcode); 420 p += get_opcode(p, &opcode);
422 for (i = 0; i < ARRAY_SIZE(reg_rop); i++) 421 for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
423 if (reg_rop[i] == opcode) { 422 if (reg_rop[i] == opcode)
424 rv = REG_READ;
425 goto do_work; 423 goto do_work;
426 }
427 424
428 for (i = 0; i < ARRAY_SIZE(reg_wop); i++) 425 for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
429 if (reg_wop[i] == opcode) { 426 if (reg_wop[i] == opcode)
430 rv = REG_WRITE;
431 goto do_work; 427 goto do_work;
432 }
433 428
434 printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " 429 printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
435 "0x%02x\n", opcode); 430 "0x%02x\n", opcode);
@@ -474,16 +469,13 @@ unsigned long get_ins_imm_val(unsigned long ins_addr)
474 unsigned char *p; 469 unsigned char *p;
475 struct prefix_bits prf; 470 struct prefix_bits prf;
476 int i; 471 int i;
477 unsigned long rv;
478 472
479 p = (unsigned char *)ins_addr; 473 p = (unsigned char *)ins_addr;
480 p += skip_prefix(p, &prf); 474 p += skip_prefix(p, &prf);
481 p += get_opcode(p, &opcode); 475 p += get_opcode(p, &opcode);
482 for (i = 0; i < ARRAY_SIZE(imm_wop); i++) 476 for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
483 if (imm_wop[i] == opcode) { 477 if (imm_wop[i] == opcode)
484 rv = IMM_WRITE;
485 goto do_work; 478 goto do_work;
486 }
487 479
488 printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " 480 printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
489 "0x%02x\n", opcode); 481 "0x%02x\n", opcode);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 5c4ee422590e..8573b83a63d0 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd)
87#define UNSHARED_PTRS_PER_PGD \ 87#define UNSHARED_PTRS_PER_PGD \
88 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) 88 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
89 89
90static void pgd_ctor(pgd_t *pgd) 90
91static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
92{
93 BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
94 virt_to_page(pgd)->index = (pgoff_t)mm;
95}
96
97struct mm_struct *pgd_page_get_mm(struct page *page)
98{
99 return (struct mm_struct *)page->index;
100}
101
102static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
91{ 103{
92 /* If the pgd points to a shared pagetable level (either the 104 /* If the pgd points to a shared pagetable level (either the
93 ptes in non-PAE, or shared PMD in PAE), then just copy the 105 ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -98,27 +110,23 @@ static void pgd_ctor(pgd_t *pgd)
98 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, 110 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
99 swapper_pg_dir + KERNEL_PGD_BOUNDARY, 111 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
100 KERNEL_PGD_PTRS); 112 KERNEL_PGD_PTRS);
101 paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
102 __pa(swapper_pg_dir) >> PAGE_SHIFT,
103 KERNEL_PGD_BOUNDARY,
104 KERNEL_PGD_PTRS);
105 } 113 }
106 114
107 /* list required to sync kernel mapping updates */ 115 /* list required to sync kernel mapping updates */
108 if (!SHARED_KERNEL_PMD) 116 if (!SHARED_KERNEL_PMD) {
117 pgd_set_mm(pgd, mm);
109 pgd_list_add(pgd); 118 pgd_list_add(pgd);
119 }
110} 120}
111 121
112static void pgd_dtor(pgd_t *pgd) 122static void pgd_dtor(pgd_t *pgd)
113{ 123{
114 unsigned long flags; /* can be called from interrupt context */
115
116 if (SHARED_KERNEL_PMD) 124 if (SHARED_KERNEL_PMD)
117 return; 125 return;
118 126
119 spin_lock_irqsave(&pgd_lock, flags); 127 spin_lock(&pgd_lock);
120 pgd_list_del(pgd); 128 pgd_list_del(pgd);
121 spin_unlock_irqrestore(&pgd_lock, flags); 129 spin_unlock(&pgd_lock);
122} 130}
123 131
124/* 132/*
@@ -160,8 +168,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
160 * section 8.1: in PAE mode we explicitly have to flush the 168 * section 8.1: in PAE mode we explicitly have to flush the
161 * TLB via cr3 if the top-level pgd is changed... 169 * TLB via cr3 if the top-level pgd is changed...
162 */ 170 */
163 if (mm == current->active_mm) 171 flush_tlb_mm(mm);
164 write_cr3(read_cr3());
165} 172}
166#else /* !CONFIG_X86_PAE */ 173#else /* !CONFIG_X86_PAE */
167 174
@@ -250,7 +257,6 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
250{ 257{
251 pgd_t *pgd; 258 pgd_t *pgd;
252 pmd_t *pmds[PREALLOCATED_PMDS]; 259 pmd_t *pmds[PREALLOCATED_PMDS];
253 unsigned long flags;
254 260
255 pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); 261 pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
256 262
@@ -270,12 +276,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
270 * respect to anything walking the pgd_list, so that they 276 * respect to anything walking the pgd_list, so that they
271 * never see a partially populated pgd. 277 * never see a partially populated pgd.
272 */ 278 */
273 spin_lock_irqsave(&pgd_lock, flags); 279 spin_lock(&pgd_lock);
274 280
275 pgd_ctor(pgd); 281 pgd_ctor(mm, pgd);
276 pgd_prepopulate_pmd(mm, pgd, pmds); 282 pgd_prepopulate_pmd(mm, pgd, pmds);
277 283
278 spin_unlock_irqrestore(&pgd_lock, flags); 284 spin_unlock(&pgd_lock);
279 285
280 return pgd; 286 return pgd;
281 287
@@ -310,6 +316,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
310 return changed; 316 return changed;
311} 317}
312 318
319#ifdef CONFIG_TRANSPARENT_HUGEPAGE
320int pmdp_set_access_flags(struct vm_area_struct *vma,
321 unsigned long address, pmd_t *pmdp,
322 pmd_t entry, int dirty)
323{
324 int changed = !pmd_same(*pmdp, entry);
325
326 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
327
328 if (changed && dirty) {
329 *pmdp = entry;
330 pmd_update_defer(vma->vm_mm, address, pmdp);
331 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
332 }
333
334 return changed;
335}
336#endif
337
313int ptep_test_and_clear_young(struct vm_area_struct *vma, 338int ptep_test_and_clear_young(struct vm_area_struct *vma,
314 unsigned long addr, pte_t *ptep) 339 unsigned long addr, pte_t *ptep)
315{ 340{
@@ -325,6 +350,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
325 return ret; 350 return ret;
326} 351}
327 352
353#ifdef CONFIG_TRANSPARENT_HUGEPAGE
354int pmdp_test_and_clear_young(struct vm_area_struct *vma,
355 unsigned long addr, pmd_t *pmdp)
356{
357 int ret = 0;
358
359 if (pmd_young(*pmdp))
360 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
361 (unsigned long *)pmdp);
362
363 if (ret)
364 pmd_update(vma->vm_mm, addr, pmdp);
365
366 return ret;
367}
368#endif
369
328int ptep_clear_flush_young(struct vm_area_struct *vma, 370int ptep_clear_flush_young(struct vm_area_struct *vma,
329 unsigned long address, pte_t *ptep) 371 unsigned long address, pte_t *ptep)
330{ 372{
@@ -337,6 +379,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
337 return young; 379 return young;
338} 380}
339 381
382#ifdef CONFIG_TRANSPARENT_HUGEPAGE
383int pmdp_clear_flush_young(struct vm_area_struct *vma,
384 unsigned long address, pmd_t *pmdp)
385{
386 int young;
387
388 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
389
390 young = pmdp_test_and_clear_young(vma, address, pmdp);
391 if (young)
392 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
393
394 return young;
395}
396
397void pmdp_splitting_flush(struct vm_area_struct *vma,
398 unsigned long address, pmd_t *pmdp)
399{
400 int set;
401 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
402 set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
403 (unsigned long *)pmdp);
404 if (set) {
405 pmd_update(vma->vm_mm, address, pmdp);
406 /* need tlb flush only to serialize against gup-fast */
407 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
408 }
409}
410#endif
411
340/** 412/**
341 * reserve_top_address - reserves a hole in the top of kernel address space 413 * reserve_top_address - reserves a hole in the top of kernel address space
342 * @reserve - size of hole to reserve 414 * @reserve - size of hole to reserve
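The pgtable.c changes do three related things: pgd_ctor() now takes the owning mm and records it on the pgd's struct page via pgd_set_mm(), reusing page->index as a back pointer so that code walking pgd_list can presumably recover the mm through pgd_page_get_mm(); the pgd_lock users drop the IRQ-saving lock variants; and transparent hugepages gain pmd-level counterparts of the pte accessed/dirty helpers. The stand-alone sketch below illustrates only the back-pointer idea, with invented names (struct page_desc, owner) standing in for struct page and its index member.

/* Illustrative sketch of stashing an owner back-pointer in a page descriptor. */
#include <stdio.h>
#include <stdint.h>

struct mm_ctx {				/* stands in for struct mm_struct */
	const char *name;
};

struct page_desc {			/* stands in for struct page */
	uintptr_t owner;		/* stands in for the reused 'index' field */
};

static void pgd_set_owner(struct page_desc *pg, struct mm_ctx *mm)
{
	pg->owner = (uintptr_t)mm;	/* the patch's BUILD_BUG_ON checks the field is wide enough */
}

static struct mm_ctx *pgd_get_owner(struct page_desc *pg)
{
	return (struct mm_ctx *)pg->owner;
}

int main(void)
{
	struct mm_ctx mm = { "init_mm" };
	struct page_desc pgd_page = { 0 };

	pgd_set_owner(&pgd_page, &mm);
	printf("pgd belongs to %s\n", pgd_get_owner(&pgd_page)->name);
	return 0;
}
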
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index a3250aa34086..410531d3c292 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -41,7 +41,7 @@ void __init x86_report_nx(void)
41{ 41{
42 if (!cpu_has_nx) { 42 if (!cpu_has_nx) {
43 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " 43 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
44 "missing in CPU or disabled in BIOS!\n"); 44 "missing in CPU!\n");
45 } else { 45 } else {
46#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 46#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
47 if (disable_nx) { 47 if (disable_nx) {
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
new file mode 100644
index 000000000000..81dbfdeb080d
--- /dev/null
+++ b/arch/x86/mm/srat.c
@@ -0,0 +1,184 @@
1/*
2 * ACPI 3.0 based NUMA setup
3 * Copyright 2004 Andi Kleen, SuSE Labs.
4 *
5 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
6 *
7 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
8 * Assumes all memory regions belonging to a single proximity domain
9 * are in one chunk. Holes between them will be included in the node.
10 */
11
12#include <linux/kernel.h>
13#include <linux/acpi.h>
14#include <linux/mmzone.h>
15#include <linux/bitmap.h>
16#include <linux/module.h>
17#include <linux/topology.h>
18#include <linux/bootmem.h>
19#include <linux/memblock.h>
20#include <linux/mm.h>
21#include <asm/proto.h>
22#include <asm/numa.h>
23#include <asm/e820.h>
24#include <asm/apic.h>
25#include <asm/uv/uv.h>
26
27int acpi_numa __initdata;
28
29static __init int setup_node(int pxm)
30{
31 return acpi_map_pxm_to_node(pxm);
32}
33
34static __init void bad_srat(void)
35{
36 printk(KERN_ERR "SRAT: SRAT not used.\n");
37 acpi_numa = -1;
38}
39
40static __init inline int srat_disabled(void)
41{
42 return acpi_numa < 0;
43}
44
45/* Callback for SLIT parsing */
46void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
47{
48 int i, j;
49
50 for (i = 0; i < slit->locality_count; i++)
51 for (j = 0; j < slit->locality_count; j++)
52 numa_set_distance(pxm_to_node(i), pxm_to_node(j),
53 slit->entry[slit->locality_count * i + j]);
54}
55
56/* Callback for Proximity Domain -> x2APIC mapping */
57void __init
58acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
59{
60 int pxm, node;
61 int apic_id;
62
63 if (srat_disabled())
64 return;
65 if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
66 bad_srat();
67 return;
68 }
69 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
70 return;
71 pxm = pa->proximity_domain;
72 node = setup_node(pxm);
73 if (node < 0) {
74 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
75 bad_srat();
76 return;
77 }
78
79 apic_id = pa->apic_id;
80 if (apic_id >= MAX_LOCAL_APIC) {
81 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
82 return;
83 }
84 set_apicid_to_node(apic_id, node);
85 node_set(node, numa_nodes_parsed);
86 acpi_numa = 1;
87 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
88 pxm, apic_id, node);
89}
90
91/* Callback for Proximity Domain -> LAPIC mapping */
92void __init
93acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
94{
95 int pxm, node;
96 int apic_id;
97
98 if (srat_disabled())
99 return;
100 if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
101 bad_srat();
102 return;
103 }
104 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
105 return;
106 pxm = pa->proximity_domain_lo;
107 node = setup_node(pxm);
108 if (node < 0) {
109 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
110 bad_srat();
111 return;
112 }
113
114 if (get_uv_system_type() >= UV_X2APIC)
115 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
116 else
117 apic_id = pa->apic_id;
118
119 if (apic_id >= MAX_LOCAL_APIC) {
120 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
121 return;
122 }
123
124 set_apicid_to_node(apic_id, node);
125 node_set(node, numa_nodes_parsed);
126 acpi_numa = 1;
127 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
128 pxm, apic_id, node);
129}
130
131#ifdef CONFIG_MEMORY_HOTPLUG
132static inline int save_add_info(void) {return 1;}
133#else
134static inline int save_add_info(void) {return 0;}
135#endif
136
137/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
138void __init
139acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
140{
141 u64 start, end;
142 int node, pxm;
143
144 if (srat_disabled())
145 return;
146 if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
147 bad_srat();
148 return;
149 }
150 if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
151 return;
152
153 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
154 return;
155 start = ma->base_address;
156 end = start + ma->length;
157 pxm = ma->proximity_domain;
158 node = setup_node(pxm);
159 if (node < 0) {
160 printk(KERN_ERR "SRAT: Too many proximity domains.\n");
161 bad_srat();
162 return;
163 }
164
165 if (numa_add_memblk(node, start, end) < 0) {
166 bad_srat();
167 return;
168 }
169
170 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
171 start, end);
172}
173
174void __init acpi_numa_arch_fixup(void) {}
175
176int __init x86_acpi_numa_init(void)
177{
178 int ret;
179
180 ret = acpi_numa_init();
181 if (ret < 0)
182 return ret;
183 return srat_disabled() ? -EINVAL : 0;
184}
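The new srat.c replaces the separate 32-bit and 64-bit parsers with one implementation that maps each proximity domain to a node (acpi_map_pxm_to_node), records CPU affinities with set_apicid_to_node(), hands memory ranges to numa_add_memblk(), and feeds the SLIT matrix into numa_set_distance(). The SLIT is a flattened locality_count x locality_count byte matrix, so the distance from domain i to domain j lives at entry[locality_count * i + j]; the short stand-alone sketch below works that indexing for a hypothetical two-node table.

/* Illustrative sketch of the flattened SLIT distance lookup. */
#include <stdio.h>
#include <stdint.h>

/* Hypothetical 2x2 SLIT: 10 = local distance, 20 = remote distance. */
static const uint8_t slit_entry[] = {
	10, 20,		/* row for locality 0 */
	20, 10,		/* row for locality 1 */
};
static const unsigned int locality_count = 2;

static unsigned int slit_distance(unsigned int i, unsigned int j)
{
	/* same indexing as the acpi_numa_slit_init() loop above */
	return slit_entry[locality_count * i + j];
}

int main(void)
{
	printf("0 -> 0: %u\n", slit_distance(0, 0));	/* 10 (local) */
	printf("0 -> 1: %u\n", slit_distance(0, 1));	/* 20 (remote) */
	return 0;
}
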
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
deleted file mode 100644
index 9324f13492d5..000000000000
--- a/arch/x86/mm/srat_32.c
+++ /dev/null
@@ -1,285 +0,0 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/mmzone.h>
29#include <linux/acpi.h>
30#include <linux/nodemask.h>
31#include <asm/srat.h>
32#include <asm/topology.h>
33#include <asm/smp.h>
34#include <asm/e820.h>
35
36/*
37 * proximity macros and definitions
38 */
39#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
40#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
41#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
42#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
43/* bitmap length; _PXM is at most 255 */
44#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
45static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
46
47#define MAX_CHUNKS_PER_NODE 3
48#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
49struct node_memory_chunk_s {
50 unsigned long start_pfn;
51 unsigned long end_pfn;
52 u8 pxm; // proximity domain of node
53 u8 nid; // which cnode contains this chunk?
54 u8 bank; // which mem bank on this node
55};
56static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
57
58static int __initdata num_memory_chunks; /* total number of memory chunks */
59static u8 __initdata apicid_to_pxm[MAX_APICID];
60
61int numa_off __initdata;
62int acpi_numa __initdata;
63
64static __init void bad_srat(void)
65{
66 printk(KERN_ERR "SRAT: SRAT not used.\n");
67 acpi_numa = -1;
68 num_memory_chunks = 0;
69}
70
71static __init inline int srat_disabled(void)
72{
73 return numa_off || acpi_numa < 0;
74}
75
76/* Identify CPU proximity domains */
77void __init
78acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
79{
80 if (srat_disabled())
81 return;
82 if (cpu_affinity->header.length !=
83 sizeof(struct acpi_srat_cpu_affinity)) {
84 bad_srat();
85 return;
86 }
87
88 if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
89 return; /* empty entry */
90
91 /* mark this node as "seen" in node bitmap */
92 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
93
94 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
95
96 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
97 cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
98}
99
100/*
101 * Identify memory proximity domains and hot-remove capabilities.
102 * Fill node memory chunk list structure.
103 */
104void __init
105acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
106{
107 unsigned long long paddr, size;
108 unsigned long start_pfn, end_pfn;
109 u8 pxm;
110 struct node_memory_chunk_s *p, *q, *pend;
111
112 if (srat_disabled())
113 return;
114 if (memory_affinity->header.length !=
115 sizeof(struct acpi_srat_mem_affinity)) {
116 bad_srat();
117 return;
118 }
119
120 if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
121 return; /* empty entry */
122
123 pxm = memory_affinity->proximity_domain & 0xff;
124
125 /* mark this node as "seen" in node bitmap */
126 BMAP_SET(pxm_bitmap, pxm);
127
128 /* calculate info for memory chunk structure */
129 paddr = memory_affinity->base_address;
130 size = memory_affinity->length;
131
132 start_pfn = paddr >> PAGE_SHIFT;
133 end_pfn = (paddr + size) >> PAGE_SHIFT;
134
135
136 if (num_memory_chunks >= MAXCHUNKS) {
137 printk(KERN_WARNING "Too many mem chunks in SRAT."
138 " Ignoring %lld MBytes at %llx\n",
139 size/(1024*1024), paddr);
140 return;
141 }
142
143 /* Insertion sort based on base address */
144 pend = &node_memory_chunk[num_memory_chunks];
145 for (p = &node_memory_chunk[0]; p < pend; p++) {
146 if (start_pfn < p->start_pfn)
147 break;
148 }
149 if (p < pend) {
150 for (q = pend; q >= p; q--)
151 *(q + 1) = *q;
152 }
153 p->start_pfn = start_pfn;
154 p->end_pfn = end_pfn;
155 p->pxm = pxm;
156
157 num_memory_chunks++;
158
159 printk(KERN_DEBUG "Memory range %08lx to %08lx"
160 " in proximity domain %02x %s\n",
161 start_pfn, end_pfn,
162 pxm,
163 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
164 "enabled and removable" : "enabled" ) );
165}
166
167/* Callback for SLIT parsing */
168void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
169{
170}
171
172void acpi_numa_arch_fixup(void)
173{
174}
175/*
176 * The SRAT table always lists ascending addresses, so can always
177 * assume that the first "start" address that you see is the real
178 * start of the node, and that the current "end" address is after
179 * the previous one.
180 */
181static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
182{
183 /*
184 * Only add present memory as told by the e820.
185 * There is no guarantee from the SRAT that the memory it
186 * enumerates is present at boot time because it represents
187 * *possible* memory hotplug areas the same as normal RAM.
188 */
189 if (memory_chunk->start_pfn >= max_pfn) {
190 printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
191 memory_chunk->start_pfn, memory_chunk->end_pfn);
192 return -1;
193 }
194 if (memory_chunk->nid != nid)
195 return -1;
196
197 if (!node_has_online_mem(nid))
198 node_start_pfn[nid] = memory_chunk->start_pfn;
199
200 if (node_start_pfn[nid] > memory_chunk->start_pfn)
201 node_start_pfn[nid] = memory_chunk->start_pfn;
202
203 if (node_end_pfn[nid] < memory_chunk->end_pfn)
204 node_end_pfn[nid] = memory_chunk->end_pfn;
205
206 return 0;
207}
208
209int __init get_memcfg_from_srat(void)
210{
211 int i, j, nid;
212
213
214 if (srat_disabled())
215 goto out_fail;
216
217 if (num_memory_chunks == 0) {
218 printk(KERN_DEBUG
219 "could not find any ACPI SRAT memory areas.\n");
220 goto out_fail;
221 }
222
223 /* Calculate total number of nodes in system from PXM bitmap and create
224 * a set of sequential node IDs starting at zero. (ACPI doesn't seem
225 * to specify the range of _PXM values.)
226 */
227 /*
228 * MCD - we no longer HAVE to number nodes sequentially. PXM domain
229 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
230 * 32, so we will continue numbering them in this manner until MAX_NUMNODES
231 * approaches MAX_PXM_DOMAINS for i386.
232 */
233 nodes_clear(node_online_map);
234 for (i = 0; i < MAX_PXM_DOMAINS; i++) {
235 if (BMAP_TEST(pxm_bitmap, i)) {
236 int nid = acpi_map_pxm_to_node(i);
237 node_set_online(nid);
238 }
239 }
240 BUG_ON(num_online_nodes() == 0);
241
242 /* set cnode id in memory chunk structure */
243 for (i = 0; i < num_memory_chunks; i++)
244 node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
245
246 printk(KERN_DEBUG "pxm bitmap: ");
247 for (i = 0; i < sizeof(pxm_bitmap); i++) {
248 printk(KERN_CONT "%02x ", pxm_bitmap[i]);
249 }
250 printk(KERN_CONT "\n");
251 printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
252 num_online_nodes());
253 printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
254 num_memory_chunks);
255
256 for (i = 0; i < MAX_APICID; i++)
257 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
258
259 for (j = 0; j < num_memory_chunks; j++){
260 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
261 printk(KERN_DEBUG
262 "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
263 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
264 if (node_read_chunk(chunk->nid, chunk))
265 continue;
266
267 e820_register_active_regions(chunk->nid, chunk->start_pfn,
268 min(chunk->end_pfn, max_pfn));
269 }
270 /* for out of order entries in SRAT */
271 sort_node_map();
272
273 for_each_online_node(nid) {
274 unsigned long start = node_start_pfn[nid];
275 unsigned long end = min(node_end_pfn[nid], max_pfn);
276
277 memory_present(nid, start, end);
278 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
279 }
280 return 1;
281out_fail:
282 printk(KERN_DEBUG "failed to get NUMA memory information from SRAT"
283 " table\n");
284 return 0;
285}
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
deleted file mode 100644
index 9c0d0d399c30..000000000000
--- a/arch/x86/mm/srat_64.c
+++ /dev/null
@@ -1,564 +0,0 @@
1/*
2 * ACPI 3.0 based NUMA setup
3 * Copyright 2004 Andi Kleen, SuSE Labs.
4 *
5 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
6 *
7 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
8 * Assumes all memory regions belonging to a single proximity domain
9 * are in one chunk. Holes between them will be included in the node.
10 */
11
12#include <linux/kernel.h>
13#include <linux/acpi.h>
14#include <linux/mmzone.h>
15#include <linux/bitmap.h>
16#include <linux/module.h>
17#include <linux/topology.h>
18#include <linux/bootmem.h>
19#include <linux/mm.h>
20#include <asm/proto.h>
21#include <asm/numa.h>
22#include <asm/e820.h>
23#include <asm/apic.h>
24#include <asm/uv/uv.h>
25
26int acpi_numa __initdata;
27
28static struct acpi_table_slit *acpi_slit;
29
30static nodemask_t nodes_parsed __initdata;
31static nodemask_t cpu_nodes_parsed __initdata;
32static struct bootnode nodes[MAX_NUMNODES] __initdata;
33static struct bootnode nodes_add[MAX_NUMNODES];
34
35static int num_node_memblks __initdata;
36static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
37static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
38
39static __init int setup_node(int pxm)
40{
41 return acpi_map_pxm_to_node(pxm);
42}
43
44static __init int conflicting_memblks(unsigned long start, unsigned long end)
45{
46 int i;
47 for (i = 0; i < num_node_memblks; i++) {
48 struct bootnode *nd = &node_memblk_range[i];
49 if (nd->start == nd->end)
50 continue;
51 if (nd->end > start && nd->start < end)
52 return memblk_nodeid[i];
53 if (nd->end == end && nd->start == start)
54 return memblk_nodeid[i];
55 }
56 return -1;
57}
58
59static __init void cutoff_node(int i, unsigned long start, unsigned long end)
60{
61 struct bootnode *nd = &nodes[i];
62
63 if (nd->start < start) {
64 nd->start = start;
65 if (nd->end < nd->start)
66 nd->start = nd->end;
67 }
68 if (nd->end > end) {
69 nd->end = end;
70 if (nd->start > nd->end)
71 nd->start = nd->end;
72 }
73}
74
75static __init void bad_srat(void)
76{
77 int i;
78 printk(KERN_ERR "SRAT: SRAT not used.\n");
79 acpi_numa = -1;
80 for (i = 0; i < MAX_LOCAL_APIC; i++)
81 apicid_to_node[i] = NUMA_NO_NODE;
82 for (i = 0; i < MAX_NUMNODES; i++) {
83 nodes[i].start = nodes[i].end = 0;
84 nodes_add[i].start = nodes_add[i].end = 0;
85 }
86 remove_all_active_ranges();
87}
88
89static __init inline int srat_disabled(void)
90{
91 return numa_off || acpi_numa < 0;
92}
93
94/* Callback for SLIT parsing */
95void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
96{
97 unsigned length;
98 unsigned long phys;
99
100 length = slit->header.length;
101 phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length,
102 PAGE_SIZE);
103
104 if (phys == -1L)
105 panic(" Can not save slit!\n");
106
107 acpi_slit = __va(phys);
108 memcpy(acpi_slit, slit, length);
109 reserve_early(phys, phys + length, "ACPI SLIT");
110}
111
112/* Callback for Proximity Domain -> x2APIC mapping */
113void __init
114acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
115{
116 int pxm, node;
117 int apic_id;
118
119 if (srat_disabled())
120 return;
121 if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
122 bad_srat();
123 return;
124 }
125 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
126 return;
127 pxm = pa->proximity_domain;
128 node = setup_node(pxm);
129 if (node < 0) {
130 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
131 bad_srat();
132 return;
133 }
134
135 apic_id = pa->apic_id;
136 apicid_to_node[apic_id] = node;
137 node_set(node, cpu_nodes_parsed);
138 acpi_numa = 1;
139 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
140 pxm, apic_id, node);
141}
142
143/* Callback for Proximity Domain -> LAPIC mapping */
144void __init
145acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
146{
147 int pxm, node;
148 int apic_id;
149
150 if (srat_disabled())
151 return;
152 if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
153 bad_srat();
154 return;
155 }
156 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
157 return;
158 pxm = pa->proximity_domain_lo;
159 node = setup_node(pxm);
160 if (node < 0) {
161 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
162 bad_srat();
163 return;
164 }
165
166 if (get_uv_system_type() >= UV_X2APIC)
167 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
168 else
169 apic_id = pa->apic_id;
170 apicid_to_node[apic_id] = node;
171 node_set(node, cpu_nodes_parsed);
172 acpi_numa = 1;
173 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
174 pxm, apic_id, node);
175}
176
177#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
178static inline int save_add_info(void) {return 1;}
179#else
180static inline int save_add_info(void) {return 0;}
181#endif
182/*
183 * Update nodes_add[]
184 * This code supports one contiguous hot add area per node
185 */
186static void __init
187update_nodes_add(int node, unsigned long start, unsigned long end)
188{
189 unsigned long s_pfn = start >> PAGE_SHIFT;
190 unsigned long e_pfn = end >> PAGE_SHIFT;
191 int changed = 0;
192 struct bootnode *nd = &nodes_add[node];
193
194 /* I had some trouble with strange memory hotadd regions breaking
195 the boot. Be very strict here and reject anything unexpected.
196 If you want working memory hotadd write correct SRATs.
197
198 The node size check is a basic sanity check to guard against
199 mistakes */
200 if ((signed long)(end - start) < NODE_MIN_SIZE) {
201 printk(KERN_ERR "SRAT: Hotplug area too small\n");
202 return;
203 }
204
205 /* This check might be a bit too strict, but I'm keeping it for now. */
206 if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
207 printk(KERN_ERR
208 "SRAT: Hotplug area %lu -> %lu has existing memory\n",
209 s_pfn, e_pfn);
210 return;
211 }
212
213 /* Looks good */
214
215 if (nd->start == nd->end) {
216 nd->start = start;
217 nd->end = end;
218 changed = 1;
219 } else {
220 if (nd->start == end) {
221 nd->start = start;
222 changed = 1;
223 }
224 if (nd->end == start) {
225 nd->end = end;
226 changed = 1;
227 }
228 if (!changed)
229 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
230 }
231
232 if (changed) {
233 node_set(node, cpu_nodes_parsed);
234 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
235 nd->start, nd->end);
236 }
237}
238
239/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
240void __init
241acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
242{
243 struct bootnode *nd, oldnode;
244 unsigned long start, end;
245 int node, pxm;
246 int i;
247
248 if (srat_disabled())
249 return;
250 if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
251 bad_srat();
252 return;
253 }
254 if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
255 return;
256
257 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
258 return;
259 start = ma->base_address;
260 end = start + ma->length;
261 pxm = ma->proximity_domain;
262 node = setup_node(pxm);
263 if (node < 0) {
264 printk(KERN_ERR "SRAT: Too many proximity domains.\n");
265 bad_srat();
266 return;
267 }
268 i = conflicting_memblks(start, end);
269 if (i == node) {
270 printk(KERN_WARNING
271 "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
272 pxm, start, end, nodes[i].start, nodes[i].end);
273 } else if (i >= 0) {
274 printk(KERN_ERR
275 "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
276 pxm, start, end, node_to_pxm(i),
277 nodes[i].start, nodes[i].end);
278 bad_srat();
279 return;
280 }
281 nd = &nodes[node];
282 oldnode = *nd;
283 if (!node_test_and_set(node, nodes_parsed)) {
284 nd->start = start;
285 nd->end = end;
286 } else {
287 if (start < nd->start)
288 nd->start = start;
289 if (nd->end < end)
290 nd->end = end;
291 }
292
293 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
294 start, end);
295
296 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
297 update_nodes_add(node, start, end);
298 /* restore nodes[node] */
299 *nd = oldnode;
300 if ((nd->start | nd->end) == 0)
301 node_clear(node, nodes_parsed);
302 }
303
304 node_memblk_range[num_node_memblks].start = start;
305 node_memblk_range[num_node_memblks].end = end;
306 memblk_nodeid[num_node_memblks] = node;
307 num_node_memblks++;
308}
309
310/* Sanity check to catch more bad SRATs (they are amazingly common).
311 Make sure the PXMs cover all memory. */
312static int __init nodes_cover_memory(const struct bootnode *nodes)
313{
314 int i;
315 unsigned long pxmram, e820ram;
316
317 pxmram = 0;
318 for_each_node_mask(i, nodes_parsed) {
319 unsigned long s = nodes[i].start >> PAGE_SHIFT;
320 unsigned long e = nodes[i].end >> PAGE_SHIFT;
321 pxmram += e - s;
322 pxmram -= __absent_pages_in_range(i, s, e);
323 if ((long)pxmram < 0)
324 pxmram = 0;
325 }
326
327 e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
328 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
329 if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
330 printk(KERN_ERR
331 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
332 (pxmram << PAGE_SHIFT) >> 20,
333 (e820ram << PAGE_SHIFT) >> 20);
334 return 0;
335 }
336 return 1;
337}
338
339void __init acpi_numa_arch_fixup(void) {}
340
341int __init acpi_get_nodes(struct bootnode *physnodes)
342{
343 int i;
344 int ret = 0;
345
346 for_each_node_mask(i, nodes_parsed) {
347 physnodes[ret].start = nodes[i].start;
348 physnodes[ret].end = nodes[i].end;
349 ret++;
350 }
351 return ret;
352}
353
354/* Use the information discovered above to actually set up the nodes. */
355int __init acpi_scan_nodes(unsigned long start, unsigned long end)
356{
357 int i;
358
359 if (acpi_numa <= 0)
360 return -1;
361
362 /* First clean up the node list */
363 for (i = 0; i < MAX_NUMNODES; i++)
364 cutoff_node(i, start, end);
365
366 /*
367 * Join together blocks on the same node, holes between
368 * which don't overlap with memory on other nodes.
369 */
370 for (i = 0; i < num_node_memblks; ++i) {
371 int j, k;
372
373 for (j = i + 1; j < num_node_memblks; ++j) {
374 unsigned long start, end;
375
376 if (memblk_nodeid[i] != memblk_nodeid[j])
377 continue;
378 start = min(node_memblk_range[i].end,
379 node_memblk_range[j].end);
380 end = max(node_memblk_range[i].start,
381 node_memblk_range[j].start);
382 for (k = 0; k < num_node_memblks; ++k) {
383 if (memblk_nodeid[i] == memblk_nodeid[k])
384 continue;
385 if (start < node_memblk_range[k].end &&
386 end > node_memblk_range[k].start)
387 break;
388 }
389 if (k < num_node_memblks)
390 continue;
391 start = min(node_memblk_range[i].start,
392 node_memblk_range[j].start);
393 end = max(node_memblk_range[i].end,
394 node_memblk_range[j].end);
395 printk(KERN_INFO "SRAT: Node %d "
396 "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
397 memblk_nodeid[i],
398 node_memblk_range[i].start,
399 node_memblk_range[i].end,
400 node_memblk_range[j].start,
401 node_memblk_range[j].end,
402 start, end);
403 node_memblk_range[i].start = start;
404 node_memblk_range[i].end = end;
405 k = --num_node_memblks - j;
406 memmove(memblk_nodeid + j, memblk_nodeid + j+1,
407 k * sizeof(*memblk_nodeid));
408 memmove(node_memblk_range + j, node_memblk_range + j+1,
409 k * sizeof(*node_memblk_range));
410 --j;
411 }
412 }
413
414 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
415 memblk_nodeid);
416 if (memnode_shift < 0) {
417 printk(KERN_ERR
418 "SRAT: No NUMA node hash function found. Contact maintainer\n");
419 bad_srat();
420 return -1;
421 }
422
423 for (i = 0; i < num_node_memblks; i++)
424 e820_register_active_regions(memblk_nodeid[i],
425 node_memblk_range[i].start >> PAGE_SHIFT,
426 node_memblk_range[i].end >> PAGE_SHIFT);
427
428 /* for out of order entries in SRAT */
429 sort_node_map();
430 if (!nodes_cover_memory(nodes)) {
431 bad_srat();
432 return -1;
433 }
434
435 /* Account for nodes with cpus and no memory */
436 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
437
438 /* Finally register nodes */
439 for_each_node_mask(i, node_possible_map)
440 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
441 /* Try again in case setup_node_bootmem missed one due
442 to missing bootmem */
443 for_each_node_mask(i, node_possible_map)
444 if (!node_online(i))
445 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
446
447 for (i = 0; i < nr_cpu_ids; i++) {
448 int node = early_cpu_to_node(i);
449
450 if (node == NUMA_NO_NODE)
451 continue;
452 if (!node_online(node))
453 numa_clear_node(i);
454 }
455 numa_init_array();
456 return 0;
457}
458
459#ifdef CONFIG_NUMA_EMU
460static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
461 [0 ... MAX_NUMNODES-1] = PXM_INVAL
462};
463static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
464 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
465};
466static int __init find_node_by_addr(unsigned long addr)
467{
468 int ret = NUMA_NO_NODE;
469 int i;
470
471 for_each_node_mask(i, nodes_parsed) {
472 /*
473 * Find the real node that this emulated node appears on. For
474 * the sake of simplicity, we only use a real node's starting
475 * address to determine which emulated node it appears on.
476 */
477 if (addr >= nodes[i].start && addr < nodes[i].end) {
478 ret = i;
479 break;
480 }
481 }
482 return ret;
483}
484
485/*
486 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
487 * mappings that respect the real ACPI topology but reflect our emulated
488 * environment. For each emulated node, we find which real node it appears on
489 * and create PXM to NID mappings for those fake nodes which mirror that
490 * locality. SLIT will now represent the correct distances between emulated
491 * nodes as a result of the real topology.
492 */
493void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
494{
495 int i, j;
496
497 printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
498 "topology.\n");
499 for (i = 0; i < num_nodes; i++) {
500 int nid, pxm;
501
502 nid = find_node_by_addr(fake_nodes[i].start);
503 if (nid == NUMA_NO_NODE)
504 continue;
505 pxm = node_to_pxm(nid);
506 if (pxm == PXM_INVAL)
507 continue;
508 fake_node_to_pxm_map[i] = pxm;
509 /*
510 * For each apicid_to_node mapping that exists for this real
511 * node, it must now point to the fake node ID.
512 */
513 for (j = 0; j < MAX_LOCAL_APIC; j++)
514 if (apicid_to_node[j] == nid &&
515 fake_apicid_to_node[j] == NUMA_NO_NODE)
516 fake_apicid_to_node[j] = i;
517 }
518 for (i = 0; i < num_nodes; i++)
519 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
520 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
521
522 nodes_clear(nodes_parsed);
523 for (i = 0; i < num_nodes; i++)
524 if (fake_nodes[i].start != fake_nodes[i].end)
525 node_set(i, nodes_parsed);
526}
527
528static int null_slit_node_compare(int a, int b)
529{
530 return node_to_pxm(a) == node_to_pxm(b);
531}
532#else
533static int null_slit_node_compare(int a, int b)
534{
535 return a == b;
536}
537#endif /* CONFIG_NUMA_EMU */
538
539int __node_distance(int a, int b)
540{
541 int index;
542
543 if (!acpi_slit)
544 return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
545 REMOTE_DISTANCE;
546 index = acpi_slit->locality_count * node_to_pxm(a);
547 return acpi_slit->entry[index + node_to_pxm(b)];
548}
549
550EXPORT_SYMBOL(__node_distance);
551
552#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
553int memory_add_physaddr_to_nid(u64 start)
554{
555 int i, ret = 0;
556
557 for_each_node(i)
558 if (nodes_add[i].start <= start && nodes_add[i].end > start)
559 ret = i;
560
561 return ret;
562}
563EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
564#endif
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c03f14ab6667..d6c0418c3e47 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -5,6 +5,7 @@
5#include <linux/smp.h> 5#include <linux/smp.h>
6#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/cpu.h>
8 9
9#include <asm/tlbflush.h> 10#include <asm/tlbflush.h>
10#include <asm/mmu_context.h> 11#include <asm/mmu_context.h>
@@ -52,6 +53,8 @@ union smp_flush_state {
52 want false sharing in the per cpu data segment. */ 53 want false sharing in the per cpu data segment. */
53static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; 54static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
54 55
56static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
57
55/* 58/*
56 * We cannot call mmdrop() because we are in interrupt context, 59 * We cannot call mmdrop() because we are in interrupt context,
57 * instead update mm->cpu_vm_mask. 60 * instead update mm->cpu_vm_mask.
@@ -173,15 +176,11 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
173 union smp_flush_state *f; 176 union smp_flush_state *f;
174 177
175 /* Caller has disabled preemption */ 178 /* Caller has disabled preemption */
176 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; 179 sender = this_cpu_read(tlb_vector_offset);
177 f = &flush_state[sender]; 180 f = &flush_state[sender];
178 181
179 /* 182 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
180 * Could avoid this lock when 183 raw_spin_lock(&f->tlbstate_lock);
181 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
182 * probably not worth checking this for a cache-hot lock.
183 */
184 raw_spin_lock(&f->tlbstate_lock);
185 184
186 f->flush_mm = mm; 185 f->flush_mm = mm;
187 f->flush_va = va; 186 f->flush_va = va;
@@ -199,7 +198,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
199 198
200 f->flush_mm = NULL; 199 f->flush_mm = NULL;
201 f->flush_va = 0; 200 f->flush_va = 0;
202 raw_spin_unlock(&f->tlbstate_lock); 201 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
202 raw_spin_unlock(&f->tlbstate_lock);
203} 203}
204 204
205void native_flush_tlb_others(const struct cpumask *cpumask, 205void native_flush_tlb_others(const struct cpumask *cpumask,
@@ -208,16 +208,57 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
208 if (is_uv_system()) { 208 if (is_uv_system()) {
209 unsigned int cpu; 209 unsigned int cpu;
210 210
211 cpu = get_cpu(); 211 cpu = smp_processor_id();
212 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); 212 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
213 if (cpumask) 213 if (cpumask)
214 flush_tlb_others_ipi(cpumask, mm, va); 214 flush_tlb_others_ipi(cpumask, mm, va);
215 put_cpu();
216 return; 215 return;
217 } 216 }
218 flush_tlb_others_ipi(cpumask, mm, va); 217 flush_tlb_others_ipi(cpumask, mm, va);
219} 218}
220 219
220static void __cpuinit calculate_tlb_offset(void)
221{
222 int cpu, node, nr_node_vecs, idx = 0;
223 /*
224 * we are changing tlb_vector_offset for each CPU in runtime, but this
225 * will not cause inconsistency, as the write is atomic under X86. we
226 * might see more lock contentions in a short time, but after all CPU's
227 * tlb_vector_offset are changed, everything should go normal
228 *
229 * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
230 * waste some vectors.
231 **/
232 if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
233 nr_node_vecs = 1;
234 else
235 nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
236
237 for_each_online_node(node) {
238 int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) *
239 nr_node_vecs;
240 int cpu_offset = 0;
241 for_each_cpu(cpu, cpumask_of_node(node)) {
242 per_cpu(tlb_vector_offset, cpu) = node_offset +
243 cpu_offset;
244 cpu_offset++;
245 cpu_offset = cpu_offset % nr_node_vecs;
246 }
247 idx++;
248 }
249}
250
251static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n,
252 unsigned long action, void *hcpu)
253{
254 switch (action & 0xf) {
255 case CPU_ONLINE:
256 case CPU_DEAD:
257 calculate_tlb_offset();
258 }
259 return NOTIFY_OK;
260}
261
221static int __cpuinit init_smp_flush(void) 262static int __cpuinit init_smp_flush(void)
222{ 263{
223 int i; 264 int i;
@@ -225,6 +266,8 @@ static int __cpuinit init_smp_flush(void)
225 for (i = 0; i < ARRAY_SIZE(flush_state); i++) 266 for (i = 0; i < ARRAY_SIZE(flush_state); i++)
226 raw_spin_lock_init(&flush_state[i].tlbstate_lock); 267 raw_spin_lock_init(&flush_state[i].tlbstate_lock);
227 268
269 calculate_tlb_offset();
270 hotcpu_notifier(tlb_cpuhp_notify, 0);
228 return 0; 271 return 0;
229} 272}
230core_initcall(init_smp_flush); 273core_initcall(init_smp_flush);
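The tlb.c hunks spread the invalidate IPI vectors across NUMA nodes instead of deriving the sender slot from smp_processor_id() modulo the vector count: each online node gets NUM_INVALIDATE_TLB_VECTORS / nr_online_nodes vectors (or one vector each when there are more nodes than vectors), CPUs within a node round-robin over their node's share, a hotplug notifier recomputes the layout on CPU_ONLINE/CPU_DEAD, and the per-sender tlbstate_lock is only taken when nr_cpu_ids exceeds the vector count. The stand-alone sketch below reproduces the offset arithmetic for a hypothetical machine with 8 vectors, 2 nodes and 4 CPUs per node.

/* User-space sketch of the per-node TLB vector partitioning; illustrative only. */
#include <stdio.h>

#define NUM_INVALIDATE_TLB_VECTORS	8

int main(void)
{
	int nr_online_nodes = 2, cpus_per_node = 4;	/* hypothetical topology */
	int nr_node_vecs, node, cpu, idx = 0;

	if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
		nr_node_vecs = 1;
	else
		nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS / nr_online_nodes;

	for (node = 0; node < nr_online_nodes; node++) {
		int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) * nr_node_vecs;
		int cpu_offset = 0;

		for (cpu = 0; cpu < cpus_per_node; cpu++) {
			/* node 0 CPUs land on vectors 0-3, node 1 CPUs on 4-7 */
			printf("node %d cpu %d -> vector offset %d\n",
			       node, cpu, node_offset + cpu_offset);
			cpu_offset = (cpu_offset + 1) % nr_node_vecs;
		}
		idx++;
	}
	return 0;
}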