Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (93 commits) x86, tlb, UV: Do small micro-optimization for native_flush_tlb_others() x86-64, NUMA: Don't call numa_set_distance() for all possible node combinations during emulation x86-64, NUMA: Don't assume phys node 0 is always online in numa_emulation() x86-64, NUMA: Clean up initmem_init() x86-64, NUMA: Fix numa_emulation code with node0 without RAM x86-64, NUMA: Revert NUMA affine page table allocation x86: Work around old gas bug x86-64, NUMA: Better explain numa_distance handling x86-64, NUMA: Fix distance table handling mm: Move early_node_map[] reverse scan helpers under HAVE_MEMBLOCK x86-64, NUMA: Fix size of numa_distance array x86: Rename e820_table_* to pgt_buf_* bootmem: Move __alloc_memory_core_early() to nobootmem.c bootmem: Move contig_page_data definition to bootmem.c/nobootmem.c bootmem: Separate out CONFIG_NO_BOOTMEM code into nobootmem.c x86-64, NUMA: Separate out numa_alloc_distance() from numa_set_distance() x86-64, NUMA: Add proper function comments to global functions x86-64, NUMA: Move NUMA emulation into numa_emulation.c x86-64, NUMA: Prepare numa_emulation() for moving NUMA emulation into a separate file x86-64, NUMA: Do not scan two times for setup_node_bootmem() ... Fix up conflicts in arch/x86/kernel/smpboot.c
author: Linus Torvalds <torvalds@linux-foundation.org> 2011-03-15 22:49:10 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2011-03-15 22:49:10 -0400
commit: 181f977d134a9f8e3f8839f42af655b045fc059e (patch)
tree: 5d9bb67c62ef1476c18ed350106a84c02f0dd8e4 /arch/x86/mm
parent: d5d42399bd7b66bd6b55363b311810504110c967 (diff)
parent: 25542c646afbf14c43fa7d2b443055cadb73b07a (diff)
13 files changed, 1207 insertions, 1193 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 09df2f9a3d69..3e608edf9958 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_MMIOTRACE_TEST)	+= testmmiotrace.o
 obj-$(CONFIG_NUMA)              += numa.o numa_$(BITS).o
 obj-$(CONFIG_AMD_NUMA)          += amdtopology_64.o
 obj-$(CONFIG_ACPI_NUMA)         += srat_$(BITS).o
+obj-$(CONFIG_NUMA_EMU)          += numa_emulation.o
 obj-$(CONFIG_HAVE_MEMBLOCK)             += memblock.o
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index f21962c435ed..0919c26820d4 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -26,9 +26,7 @@
 #include <asm/apic.h>
 #include <asm/amd_nb.h>
-static struct bootnode __initdata nodes[8];
 static unsigned char __initdata nodeids[8];
-static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
 static __init int find_northbridge(void)
 {
@@ -51,7 +49,7 @@ static __init int find_northbridge(void)
                return num;
        }
-        return -1;
+        return -ENOENT;
 }
 static __init void early_get_boot_cpu_id(void)
@@ -69,17 +67,18 @@ static __init void early_get_boot_cpu_id(void)
 #endif
 }
-int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
+int __init amd_numa_init(void)
 {
-        unsigned long start = PFN_PHYS(start_pfn);
+        unsigned long start = PFN_PHYS(0);
-        unsigned long end = PFN_PHYS(end_pfn);
+        unsigned long end = PFN_PHYS(max_pfn);
        unsigned numnodes;
        unsigned long prevbase;
-        int i, nb, found = 0;
+        int i, j, nb;
        u32 nodeid, reg;
+        unsigned int bits, cores, apicid_base;
        if (!early_pci_allowed())
-                return -1;
+                return -EINVAL;
        nb = find_northbridge();
        if (nb < 0)
@@ -90,7 +89,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
        reg = read_pci_config(0, nb, 0, 0x60);
        numnodes = ((reg >> 4) & 0xF) + 1;
        if (numnodes <= 1)
-                return -1;
+                return -ENOENT;
        pr_info("Number of physical nodes %d\n", numnodes);
@@ -121,9 +120,9 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
                if ((base >> 8) & 3 || (limit >> 8) & 3) {
                        pr_err("Node %d using interleaving mode %lx/%lx\n",
                               nodeid, (base >> 8) & 3, (limit >> 8) & 3);
-                        return -1;
+                        return -EINVAL;
                }
-                if (node_isset(nodeid, nodes_parsed)) {
+                if (node_isset(nodeid, numa_nodes_parsed)) {
                        pr_info("Node %d already present, skipping\n",
                                nodeid);
                        continue;
@@ -160,117 +159,28 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
                if (prevbase > base) {
                        pr_err("Node map not sorted %lx,%lx\n",
                               prevbase, base);
-                        return -1;
+                        return -EINVAL;
                }
                pr_info("Node %d MemBase %016lx Limit %016lx\n",
                        nodeid, base, limit);
-                found++;
-                nodes[nodeid].start = base;
-                nodes[nodeid].end = limit;
                prevbase = base;
+                numa_add_memblk(nodeid, base, limit);
-                node_set(nodeid, nodes_parsed);
+                node_set(nodeid, numa_nodes_parsed);
-        }
-        if (!found)
-                return -1;
-        return 0;
-}
-#ifdef CONFIG_NUMA_EMU
-static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
-        [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
-};
-void __init amd_get_nodes(struct bootnode *physnodes)
-{
-        int i;
-        for_each_node_mask(i, nodes_parsed) {
-                physnodes[i].start = nodes[i].start;
-                physnodes[i].end = nodes[i].end;
        }
-}
-static int __init find_node_by_addr(unsigned long addr)
-{
-        int ret = NUMA_NO_NODE;
-        int i;
-        for (i = 0; i < 8; i++)
-                if (addr >= nodes[i].start && addr < nodes[i].end) {
-                        ret = i;
-                        break;
-                }
-        return ret;
-}
-/*
+        if (!nodes_weight(numa_nodes_parsed))
- * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be
+                return -ENOENT;
- * setup to represent the physical topology but reflect the emulated
- * environment.  For each emulated node, the real node which it appears on is
- * found and a fake pxm to nid mapping is created which mirrors the actual
- * locality.  node_distance() then represents the correct distances between
- * emulated nodes by using the fake acpi mappings to pxms.
- */
-void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
-{
-        unsigned int bits;
-        unsigned int cores;
-        unsigned int apicid_base = 0;
-        int i;
+        /*
+         * We seem to have valid NUMA configuration.  Map apicids to nodes
+         * using the coreid bits from early_identify_cpu.
+         */
        bits = boot_cpu_data.x86_coreid_bits;
        cores = 1 << bits;
-        early_get_boot_cpu_id();
-        if (boot_cpu_physical_apicid > 0)
-                apicid_base = boot_cpu_physical_apicid;
-        for (i = 0; i < nr_nodes; i++) {
-                int index;
-                int nid;
-                int j;
-                nid = find_node_by_addr(nodes[i].start);
-                if (nid == NUMA_NO_NODE)
-                        continue;
-                index = nodeids[nid] << bits;
-                if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE)
-                        for (j = apicid_base; j < cores + apicid_base; j++)
-                                fake_apicid_to_node[index + j] = i;
-#ifdef CONFIG_ACPI_NUMA
-                __acpi_map_pxm_to_node(nid, i);
-#endif
-        }
-        memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
-}
-#endif /* CONFIG_NUMA_EMU */
-int __init amd_scan_nodes(void)
-{
-        unsigned int bits;
-        unsigned int cores;
-        unsigned int apicid_base;
-        int i;
-        BUG_ON(nodes_empty(nodes_parsed));
-        node_possible_map = nodes_parsed;
-        memnode_shift = compute_hash_shift(nodes, 8, NULL);
-        if (memnode_shift < 0) {
-                pr_err("No NUMA node hash function found. Contact maintainer\n");
-                return -1;
-        }
-        pr_info("Using node hash shift of %d\n", memnode_shift);
-        /* use the coreid bits from early_identify_cpu */
-        bits = boot_cpu_data.x86_coreid_bits;
-        cores = (1<<bits);
        apicid_base = 0;
        /* get the APIC ID of the BSP early for systems with apicid lifting */
        early_get_boot_cpu_id();
        if (boot_cpu_physical_apicid > 0) {
@@ -278,17 +188,9 @@ int __init amd_scan_nodes(void)
                apicid_base = boot_cpu_physical_apicid;
        }
-        for_each_node_mask(i, node_possible_map) {
+        for_each_node_mask(i, numa_nodes_parsed)
-                int j;
-                memblock_x86_register_active_regions(i,
-                                nodes[i].start >> PAGE_SHIFT,
-                                nodes[i].end >> PAGE_SHIFT);
                for (j = apicid_base; j < cores + apicid_base; j++)
-                        apicid_to_node[(i << bits) + j] = i;
+                        set_apicid_to_node((i << bits) + j, i);
-                setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-        }
-        numa_init_array();
        return 0;
 }
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 947f42abe820..286d289b039b 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -18,9 +18,9 @@
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-unsigned long __initdata e820_table_start;
+unsigned long __initdata pgt_buf_start;
-unsigned long __meminitdata e820_table_end;
+unsigned long __meminitdata pgt_buf_end;
-unsigned long __meminitdata e820_table_top;
+unsigned long __meminitdata pgt_buf_top;
 int after_bootmem;
@@ -33,7 +33,7 @@ int direct_gbpages
 static void __init find_early_table_space(unsigned long end, int use_pse,
                                          int use_gbpages)
 {
-        unsigned long puds, pmds, ptes, tables, start;
+        unsigned long puds, pmds, ptes, tables, start = 0, good_end = end;
        phys_addr_t base;
        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
@@ -65,29 +65,20 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
 #ifdef CONFIG_X86_32
        /* for fixmap */
        tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-#endif
-        /*
+        good_end = max_pfn_mapped << PAGE_SHIFT;
-         * RED-PEN putting page tables only on node 0 could
-         * cause a hotspot and fill up ZONE_DMA. The page tables
-         * need roughly 0.5KB per GB.
-         */
-#ifdef CONFIG_X86_32
-        start = 0x7000;
-#else
-        start = 0x8000;
 #endif
-        base = memblock_find_in_range(start, max_pfn_mapped<<PAGE_SHIFT,
-                                        tables, PAGE_SIZE);
+        base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
        if (base == MEMBLOCK_ERROR)
                panic("Cannot find space for the kernel page tables");
-        e820_table_start = base >> PAGE_SHIFT;
+        pgt_buf_start = base >> PAGE_SHIFT;
-        e820_table_end = e820_table_start;
+        pgt_buf_end = pgt_buf_start;
-        e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
+        pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
        printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
-                end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
+                end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
 }
 struct map_range {
@@ -279,30 +270,11 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
        load_cr3(swapper_pg_dir);
 #endif
-#ifdef CONFIG_X86_64
-        if (!after_bootmem && !start) {
-                pud_t *pud;
-                pmd_t *pmd;
-                mmu_cr4_features = read_cr4();
-                /*
-                 * _brk_end cannot change anymore, but it and _end may be
-                 * located on different 2M pages. cleanup_highmap(), however,
-                 * can only consider _end when it runs, so destroy any
-                 * mappings beyond _brk_end here.
-                 */
-                pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
-                pmd = pmd_offset(pud, _brk_end - 1);
-                while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
-                        pmd_clear(pmd);
-        }
-#endif
        __flush_tlb_all();
-        if (!after_bootmem && e820_table_end > e820_table_start)
+        if (!after_bootmem && pgt_buf_end > pgt_buf_start)
-                memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT,
+                memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT,
-                                 e820_table_end << PAGE_SHIFT, "PGTABLE");
+                                 pgt_buf_end << PAGE_SHIFT, "PGTABLE");
        if (!after_bootmem)
                early_memtest(start, end);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c821074b7f0b..73ad7ebd6e9c 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -62,10 +62,10 @@ bool __read_mostly __vmalloc_start_set = false;
 static __init void *alloc_low_page(void)
 {
-        unsigned long pfn = e820_table_end++;
+        unsigned long pfn = pgt_buf_end++;
        void *adr;
-        if (pfn >= e820_table_top)
+        if (pfn >= pgt_buf_top)
                panic("alloc_low_page: ran out of memory");
        adr = __va(pfn * PAGE_SIZE);
@@ -163,8 +163,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
        if (pmd_idx_kmap_begin != pmd_idx_kmap_end
            && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
            && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
-            && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
+            && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
-                || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
+                || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
                pte_t *newpte;
                int i;
@@ -644,8 +644,7 @@ void __init find_low_pfn_range(void)
 }
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+void __init initmem_init(void)
-                                int acpi, int k8)
 {
 #ifdef CONFIG_HIGHMEM
        highstart_pfn = highend_pfn = max_pfn;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index c14a5422e152..a08a62cb136e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -314,7 +314,7 @@ void __init cleanup_highmap(void)
 static __ref void *alloc_low_page(unsigned long *phys)
 {
-        unsigned long pfn = e820_table_end++;
+        unsigned long pfn = pgt_buf_end++;
        void *adr;
        if (after_bootmem) {
@@ -324,7 +324,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
                return adr;
        }
-        if (pfn >= e820_table_top)
+        if (pfn >= pgt_buf_top)
                panic("alloc_low_page: ran out of memory");
        adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -333,12 +333,28 @@ static __ref void *alloc_low_page(unsigned long *phys)
        return adr;
 }
+static __ref void *map_low_page(void *virt)
+{
+        void *adr;
+        unsigned long phys, left;
+        if (after_bootmem)
+                return virt;
+        phys = __pa(virt);
+        left = phys & (PAGE_SIZE - 1);
+        adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
+        adr = (void *)(((unsigned long)adr) | left);
+        return adr;
+}
 static __ref void unmap_low_page(void *adr)
 {
        if (after_bootmem)
                return;
-        early_iounmap(adr, PAGE_SIZE);
+        early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
 }
 static unsigned long __meminit
@@ -386,15 +402,6 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
 }
 static unsigned long __meminit
-phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
-                pgprot_t prot)
-{
-        pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
-        return phys_pte_init(pte, address, end, prot);
-}
-static unsigned long __meminit
 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
              unsigned long page_size_mask, pgprot_t prot)
 {
@@ -420,8 +427,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
                if (pmd_val(*pmd)) {
                        if (!pmd_large(*pmd)) {
                                spin_lock(&init_mm.page_table_lock);
-                                last_map_addr = phys_pte_update(pmd, address,
+                                pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
+                                last_map_addr = phys_pte_init(pte, address,
                                                                end, prot);
+                                unmap_low_page(pte);
                                spin_unlock(&init_mm.page_table_lock);
                                continue;
                        }
@@ -468,18 +477,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 }
 static unsigned long __meminit
-phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
-                unsigned long page_size_mask, pgprot_t prot)
-{
-        pmd_t *pmd = pmd_offset(pud, 0);
-        unsigned long last_map_addr;
-        last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
-        __flush_tlb_all();
-        return last_map_addr;
-}
-static unsigned long __meminit
 phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
                         unsigned long page_size_mask)
 {
@@ -504,8 +501,11 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
                if (pud_val(*pud)) {
                        if (!pud_large(*pud)) {
-                                last_map_addr = phys_pmd_update(pud, addr, end,
+                                pmd = map_low_page(pmd_offset(pud, 0));
+                                last_map_addr = phys_pmd_init(pmd, addr, end,
                                                         page_size_mask, prot);
+                                unmap_low_page(pmd);
+                                __flush_tlb_all();
                                continue;
                        }
                        /*
@@ -553,17 +553,6 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
        return last_map_addr;
 }
-static unsigned long __meminit
-phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
-                 unsigned long page_size_mask)
-{
-        pud_t *pud;
-        pud = (pud_t *)pgd_page_vaddr(*pgd);
-        return phys_pud_init(pud, addr, end, page_size_mask);
-}
 unsigned long __meminit
 kernel_physical_mapping_init(unsigned long start,
                             unsigned long end,
@@ -587,8 +576,10 @@ kernel_physical_mapping_init(unsigned long start,
                        next = end;
                if (pgd_val(*pgd)) {
-                        last_map_addr = phys_pud_update(pgd, __pa(start),
+                        pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
+                        last_map_addr = phys_pud_init(pud, __pa(start),
                                                 __pa(end), page_size_mask);
+                        unmap_low_page(pud);
                        continue;
                }
@@ -612,10 +603,9 @@ kernel_physical_mapping_init(unsigned long start,
 }
 #ifndef CONFIG_NUMA
-void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+void __init initmem_init(void)
-                                int acpi, int k8)
 {
-        memblock_x86_register_active_regions(0, start_pfn, end_pfn);
+        memblock_x86_register_active_regions(0, 0, max_pfn);
 }
 #endif
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index ebf6d7887a38..9559d360fde7 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -26,12 +26,50 @@ static __init int numa_setup(char *opt)
 early_param("numa", numa_setup);
 /*
- * Which logical CPUs are on which nodes
+ * apicid, cpu, node mappings
 */
+s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
+        [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
 cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
 EXPORT_SYMBOL(node_to_cpumask_map);
 /*
+ * Map cpu index to node index
+ */
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+void __cpuinit numa_set_node(int cpu, int node)
+{
+        int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+        /* early setting, no percpu area yet */
+        if (cpu_to_node_map) {
+                cpu_to_node_map[cpu] = node;
+                return;
+        }
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+        if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
+                printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
+                dump_stack();
+                return;
+        }
+#endif
+        per_cpu(x86_cpu_to_node_map, cpu) = node;
+        if (node != NUMA_NO_NODE)
+                set_cpu_numa_node(cpu, node);
+}
+void __cpuinit numa_clear_node(int cpu)
+{
+        numa_set_node(cpu, NUMA_NO_NODE);
+}
+/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
@@ -57,7 +95,174 @@ void __init setup_node_to_cpumask_map(void)
        pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
 }
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+/*
+ * There are unfortunately some poorly designed mainboards around that
+ * only connect memory to a single CPU. This breaks the 1:1 cpu->node
+ * mapping. To avoid this fill in the mapping for all possible CPUs,
+ * as the number of CPUs is not known yet. We round robin the existing
+ * nodes.
+ */
+void __init numa_init_array(void)
+{
+        int rr, i;
+        rr = first_node(node_online_map);
+        for (i = 0; i < nr_cpu_ids; i++) {
+                if (early_cpu_to_node(i) != NUMA_NO_NODE)
+                        continue;
+                numa_set_node(i, rr);
+                rr = next_node(rr, node_online_map);
+                if (rr == MAX_NUMNODES)
+                        rr = first_node(node_online_map);
+        }
+}
+static __init int find_near_online_node(int node)
+{
+        int n, val;
+        int min_val = INT_MAX;
+        int best_node = -1;
+        for_each_online_node(n) {
+                val = node_distance(node, n);
+                if (val < min_val) {
+                        min_val = val;
+                        best_node = n;
+                }
+        }
+        return best_node;
+}
+/*
+ * Setup early cpu_to_node.
+ *
+ * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
+ * and apicid_to_node[] tables have valid entries for a CPU.
+ * This means we skip cpu_to_node[] initialisation for NUMA
+ * emulation and faking node case (when running a kernel compiled
+ * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
+ * is already initialized in a round robin manner at numa_init_array,
+ * prior to this call, and this initialization is good enough
+ * for the fake NUMA cases.
+ *
+ * Called before the per_cpu areas are setup.
+ */
+void __init init_cpu_to_node(void)
+{
+        int cpu;
+        u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
+        BUG_ON(cpu_to_apicid == NULL);
+        for_each_possible_cpu(cpu) {
+                int node = numa_cpu_node(cpu);
+                if (node == NUMA_NO_NODE)
+                        continue;
+                if (!node_online(node))
+                        node = find_near_online_node(node);
+                numa_set_node(cpu, node);
+        }
+}
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+# ifndef CONFIG_NUMA_EMU
+void __cpuinit numa_add_cpu(int cpu)
+{
+        cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+void __cpuinit numa_remove_cpu(int cpu)
+{
+        cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+# endif /* !CONFIG_NUMA_EMU */
+#else   /* !CONFIG_DEBUG_PER_CPU_MAPS */
+int __cpu_to_node(int cpu)
+{
+        if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
+                printk(KERN_WARNING
+                        "cpu_to_node(%d): usage too early!\n", cpu);
+                dump_stack();
+                return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+        }
+        return per_cpu(x86_cpu_to_node_map, cpu);
+}
+EXPORT_SYMBOL(__cpu_to_node);
+/*
+ * Same function as cpu_to_node() but used if called before the
+ * per_cpu areas are setup.
+ */
+int early_cpu_to_node(int cpu)
+{
+        if (early_per_cpu_ptr(x86_cpu_to_node_map))
+                return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+        if (!cpu_possible(cpu)) {
+                printk(KERN_WARNING
+                        "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
+                dump_stack();
+                return NUMA_NO_NODE;
+        }
+        return per_cpu(x86_cpu_to_node_map, cpu);
+}
+struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
+{
+        int node = early_cpu_to_node(cpu);
+        struct cpumask *mask;
+        char buf[64];
+        if (node == NUMA_NO_NODE) {
+                /* early_cpu_to_node() already emits a warning and trace */
+                return NULL;
+        }
+        mask = node_to_cpumask_map[node];
+        if (!mask) {
+                pr_err("node_to_cpumask_map[%i] NULL\n", node);
+                dump_stack();
+                return NULL;
+        }
+        cpulist_scnprintf(buf, sizeof(buf), mask);
+        printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+                enable ? "numa_add_cpu" : "numa_remove_cpu",
+                cpu, node, buf);
+        return mask;
+}
+# ifndef CONFIG_NUMA_EMU
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+        struct cpumask *mask;
+        mask = debug_cpumask_set_cpu(cpu, enable);
+        if (!mask)
+                return;
+        if (enable)
+                cpumask_set_cpu(cpu, mask);
+        else
+                cpumask_clear_cpu(cpu, mask);
+}
+void __cpuinit numa_add_cpu(int cpu)
+{
+        numa_set_cpumask(cpu, 1);
+}
+void __cpuinit numa_remove_cpu(int cpu)
+{
+        numa_set_cpumask(cpu, 0);
+}
+# endif /* !CONFIG_NUMA_EMU */
 /*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 */
@@ -80,4 +285,5 @@ const struct cpumask *cpumask_of_node(int node)
        return node_to_cpumask_map[node];
 }
 EXPORT_SYMBOL(cpumask_of_node);
-#endif
+#endif  /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 84a3e4c9f277..bde3906420df 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -110,6 +110,12 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 static unsigned long kva_start_pfn;
 static unsigned long kva_pages;
+int __cpuinit numa_cpu_node(int cpu)
+{
+        return apic->x86_32_numa_cpu_node(cpu);
+}
 /*
 * FLAT - support for basic PC memory model with discontig enabled, essentially
 *        a single node with all available processors in it with a flat
@@ -346,8 +352,7 @@ static void init_remap_allocator(int nid)
                (ulong) node_remap_end_vaddr[nid]);
 }
-void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+void __init initmem_init(void)
-                                int acpi, int k8)
 {
        int nid;
        long kva_target_pfn;
@@ -361,6 +366,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
         */
        get_memcfg_numa();
+        numa_init_array();
        kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 1337c51b07d7..9ec0f209a6a4 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -13,31 +13,30 @@
 #include <linux/module.h>
 #include <linux/nodemask.h>
 #include <linux/sched.h>
+#include <linux/acpi.h>
 #include <asm/e820.h>
 #include <asm/proto.h>
 #include <asm/dma.h>
-#include <asm/numa.h>
 #include <asm/acpi.h>
 #include <asm/amd_nb.h>
+#include "numa_internal.h"
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
-struct memnode memnode;
+nodemask_t numa_nodes_parsed __initdata;
-s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
+struct memnode memnode;
-        [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
-};
 static unsigned long __initdata nodemap_addr;
 static unsigned long __initdata nodemap_size;
-/*
+static struct numa_meminfo numa_meminfo __initdata;
- * Map cpu index to node index
- */
+static int numa_distance_cnt;
-DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
+static u8 *numa_distance;
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
 /*
 * Given a shift value, try to populate memnodemap[]
@@ -46,16 +45,15 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
 * 0 if memnodmap[] too small (of shift too small)
 * -1 if node overlap or lost ram (shift too big)
 */
-static int __init populate_memnodemap(const struct bootnode *nodes,
+static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
-                                      int numnodes, int shift, int *nodeids)
 {
        unsigned long addr, end;
        int i, res = -1;
        memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
-        for (i = 0; i < numnodes; i++) {
+        for (i = 0; i < mi->nr_blks; i++) {
-                addr = nodes[i].start;
+                addr = mi->blk[i].start;
-                end = nodes[i].end;
+                end = mi->blk[i].end;
                if (addr >= end)
                        continue;
                if ((end >> shift) >= memnodemapsize)
@@ -63,12 +61,7 @@ static int __init populate_memnodemap(const struct bootnode *nodes,
                do {
                        if (memnodemap[addr >> shift] != NUMA_NO_NODE)
                                return -1;
+                        memnodemap[addr >> shift] = mi->blk[i].nid;
-                        if (!nodeids)
-                                memnodemap[addr >> shift] = i;
-                        else
-                                memnodemap[addr >> shift] = nodeids[i];
                        addr += (1UL << shift);
                } while (addr < end);
                res = 1;
@@ -86,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void)
        addr = 0x8000;
        nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
-        nodemap_addr = memblock_find_in_range(addr, max_pfn<<PAGE_SHIFT,
+        nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
                                      nodemap_size, L1_CACHE_BYTES);
        if (nodemap_addr == MEMBLOCK_ERROR) {
                printk(KERN_ERR
@@ -106,16 +99,15 @@ static int __init allocate_cachealigned_memnodemap(void)
 * The LSB of all start and end addresses in the node map is the value of the
 * maximum possible shift.
 */
-static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
+static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
-                                         int numnodes)
 {
        int i, nodes_used = 0;
        unsigned long start, end;
        unsigned long bitfield = 0, memtop = 0;
-        for (i = 0; i < numnodes; i++) {
+        for (i = 0; i < mi->nr_blks; i++) {
-                start = nodes[i].start;
+                start = mi->blk[i].start;
-                end = nodes[i].end;
+                end = mi->blk[i].end;
                if (start >= end)
                        continue;
                bitfield |= start;
@@ -131,18 +123,17 @@ static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
        return i;
 }
-int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
+static int __init compute_hash_shift(const struct numa_meminfo *mi)
-                              int *nodeids)
 {
        int shift;
-        shift = extract_lsb_from_nodes(nodes, numnodes);
+        shift = extract_lsb_from_nodes(mi);
        if (allocate_cachealigned_memnodemap())
                return -1;
        printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
                shift);
-        if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
+        if (populate_memnodemap(mi, shift) != 1) {
                printk(KERN_INFO "Your memory is not aligned you need to "
                       "rebuild your kernel with a bigger NODEMAPSIZE "
                       "shift=%d\n", shift);
@@ -188,6 +179,63 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
        return NULL;
 }
+static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
+                                     struct numa_meminfo *mi)
+{
+        /* ignore zero length blks */
+        if (start == end)
+                return 0;
+        /* whine about and ignore invalid blks */
+        if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
+                pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
+                           nid, start, end);
+                return 0;
+        }
+        if (mi->nr_blks >= NR_NODE_MEMBLKS) {
+                pr_err("NUMA: too many memblk ranges\n");
+                return -EINVAL;
+        }
+        mi->blk[mi->nr_blks].start = start;
+        mi->blk[mi->nr_blks].end = end;
+        mi->blk[mi->nr_blks].nid = nid;
+        mi->nr_blks++;
+        return 0;
+}
+/**
+ * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
+ * @idx: Index of memblk to remove
+ * @mi: numa_meminfo to remove memblk from
+ *
+ * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
+ * decrementing @mi->nr_blks.
+ */
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
+{
+        mi->nr_blks--;
+        memmove(&mi->blk[idx], &mi->blk[idx + 1],
+                (mi->nr_blks - idx) * sizeof(mi->blk[0]));
+}
+/**
+ * numa_add_memblk - Add one numa_memblk to numa_meminfo
+ * @nid: NUMA node ID of the new memblk
+ * @start: Start address of the new memblk
+ * @end: End address of the new memblk
+ *
+ * Add a new memblk to the default numa_meminfo.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_add_memblk(int nid, u64 start, u64 end)
+{
+        return numa_add_memblk_to(nid, start, end, &numa_meminfo);
+}
 /* Initialize bootmem allocator for a node */
 void __init
 setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
@@ -234,692 +282,386 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
        node_set_online(nodeid);
 }
-/*
+/**
- * There are unfortunately some poorly designed mainboards around that
+ * numa_cleanup_meminfo - Cleanup a numa_meminfo
- * only connect memory to a single CPU. This breaks the 1:1 cpu->node
+ * @mi: numa_meminfo to clean up
- * mapping. To avoid this fill in the mapping for all possible CPUs,
+ *
- * as the number of CPUs is not known yet. We round robin the existing
+ * Sanitize @mi by merging and removing unnecessary memblks.  Also check for
- * nodes.
+ * conflicts and clear unused memblks.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
 */
-void __init numa_init_array(void)
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 {
-        int rr, i;
+        const u64 low = 0;
+        const u64 high = (u64)max_pfn << PAGE_SHIFT;
+        int i, j, k;
-        rr = first_node(node_online_map);
+        for (i = 0; i < mi->nr_blks; i++) {
-        for (i = 0; i < nr_cpu_ids; i++) {
+                struct numa_memblk *bi = &mi->blk[i];
-                if (early_cpu_to_node(i) != NUMA_NO_NODE)
-                        continue;
-                numa_set_node(i, rr);
-                rr = next_node(rr, node_online_map);
-                if (rr == MAX_NUMNODES)
-                        rr = first_node(node_online_map);
-        }
-}
-#ifdef CONFIG_NUMA_EMU
-/* Numa emulation */
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
-static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
-static char *cmdline __initdata;
-void __init numa_emu_cmdline(char *str)
+                /* make sure all blocks are inside the limits */
-{
+                bi->start = max(bi->start, low);
-        cmdline = str;
+                bi->end = min(bi->end, high);
-}
-static int __init setup_physnodes(unsigned long start, unsigned long end,
+                /* and there's no empty block */
-                                        int acpi, int amd)
+                if (bi->start == bi->end) {
-{
+                        numa_remove_memblk_from(i--, mi);
-        int ret = 0;
-        int i;
-        memset(physnodes, 0, sizeof(physnodes));
-#ifdef CONFIG_ACPI_NUMA
-        if (acpi)
-                acpi_get_nodes(physnodes, start, end);
-#endif
-#ifdef CONFIG_AMD_NUMA
-        if (amd)
-                amd_get_nodes(physnodes);
-#endif
-        /*
-         * Basic sanity checking on the physical node map: there may be errors
-         * if the SRAT or AMD code incorrectly reported the topology or the mem=
-         * kernel parameter is used.
-         */
-        for (i = 0; i < MAX_NUMNODES; i++) {
-                if (physnodes[i].start == physnodes[i].end)
-                        continue;
-                if (physnodes[i].start > end) {
-                        physnodes[i].end = physnodes[i].start;
-                        continue;
-                }
-                if (physnodes[i].end < start) {
-                        physnodes[i].start = physnodes[i].end;
                        continue;
                }
-                if (physnodes[i].start < start)
-                        physnodes[i].start = start;
-                if (physnodes[i].end > end)
-                        physnodes[i].end = end;
-                ret++;
-        }
-        /*
+                for (j = i + 1; j < mi->nr_blks; j++) {
-         * If no physical topology was detected, a single node is faked to cover
+                        struct numa_memblk *bj = &mi->blk[j];
-         * the entire address space.
+                        unsigned long start, end;
-         */
-        if (!ret) {
-                physnodes[ret].start = start;
-                physnodes[ret].end = end;
-                ret = 1;
-        }
-        return ret;
-}
-static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
-{
-        int i;
-        BUG_ON(acpi && amd);
-#ifdef CONFIG_ACPI_NUMA
-        if (acpi)
-                acpi_fake_nodes(nodes, nr_nodes);
-#endif
-#ifdef CONFIG_AMD_NUMA
-        if (amd)
-                amd_fake_nodes(nodes, nr_nodes);
-#endif
-        if (!acpi && !amd)
-                for (i = 0; i < nr_cpu_ids; i++)
-                        numa_set_node(i, 0);
-}
-/*
- * Setups up nid to range from addr to addr + size.  If the end
- * boundary is greater than max_addr, then max_addr is used instead.
- * The return value is 0 if there is additional memory left for
- * allocation past addr and -1 otherwise.  addr is adjusted to be at
- * the end of the node.
- */
-static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
-{
-        int ret = 0;
-        nodes[nid].start = *addr;
-        *addr += size;
-        if (*addr >= max_addr) {
-                *addr = max_addr;
-                ret = -1;
-        }
-        nodes[nid].end = *addr;
-        node_set(nid, node_possible_map);
-        printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
-               nodes[nid].start, nodes[nid].end,
-               (nodes[nid].end - nodes[nid].start) >> 20);
-        return ret;
-}
-/*
- * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
- * to max_addr.  The return value is the number of nodes allocated.
- */
-static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
-{
-        nodemask_t physnode_mask = NODE_MASK_NONE;
-        u64 size;
-        int big;
-        int ret = 0;
-        int i;
-        if (nr_nodes <= 0)
-                return -1;
-        if (nr_nodes > MAX_NUMNODES) {
-                pr_info("numa=fake=%d too large, reducing to %d\n",
-                        nr_nodes, MAX_NUMNODES);
-                nr_nodes = MAX_NUMNODES;
-        }
-        size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
-        /*
-         * Calculate the number of big nodes that can be allocated as a result
-         * of consolidating the remainder.
-         */
-        big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
-                FAKE_NODE_MIN_SIZE;
-        size &= FAKE_NODE_MIN_HASH_MASK;
-        if (!size) {
-                pr_err("Not enough memory for each node.  "
-                        "NUMA emulation disabled.\n");
-                return -1;
-        }
-        for (i = 0; i < MAX_NUMNODES; i++)
-                if (physnodes[i].start != physnodes[i].end)
-                        node_set(i, physnode_mask);
-        /*
-         * Continue to fill physical nodes with fake nodes until there is no
-         * memory left on any of them.
-         */
-        while (nodes_weight(physnode_mask)) {
-                for_each_node_mask(i, physnode_mask) {
-                        u64 end = physnodes[i].start + size;
-                        u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
-                        if (ret < big)
-                                end += FAKE_NODE_MIN_SIZE;
                        /*
-                         * Continue to add memory to this fake node if its
+                         * See whether there are overlapping blocks.  Whine
-                         * non-reserved memory is less than the per-node size.
+                         * about but allow overlaps of the same nid.  They
+                         * will be merged below.
                         */
-                        while (end - physnodes[i].start -
+                        if (bi->end > bj->start && bi->start < bj->end) {
-                                memblock_x86_hole_size(physnodes[i].start, end) < size) {
+                                if (bi->nid != bj->nid) {
-                                end += FAKE_NODE_MIN_SIZE;
+                                        pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
-                                if (end > physnodes[i].end) {
+                                               bi->nid, bi->start, bi->end,
-                                        end = physnodes[i].end;
+                                               bj->nid, bj->start, bj->end);
-                                        break;
+                                        return -EINVAL;
                                }
+                                pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
+                                           bi->nid, bi->start, bi->end,
+                                           bj->start, bj->end);
                        }
                        /*
-                         * If there won't be at least FAKE_NODE_MIN_SIZE of
+                         * Join together blocks on the same node, holes
-                         * non-reserved memory in ZONE_DMA32 for the next node,
+                         * between which don't overlap with memory on other
-                         * this one must extend to the boundary.
+                         * nodes.
-                         */
-                        if (end < dma32_end && dma32_end - end -
-                            memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
-                                end = dma32_end;
-                        /*
-                         * If there won't be enough non-reserved memory for the
-                         * next node, this one must extend to the end of the
-                         * physical node.
                         */
-                        if (physnodes[i].end - end -
+                        if (bi->nid != bj->nid)
-                            memblock_x86_hole_size(end, physnodes[i].end) < size)
+                                continue;
-                                end = physnodes[i].end;
+                        start = max(min(bi->start, bj->start), low);
+                        end = min(max(bi->end, bj->end), high);
-                        /*
+                        for (k = 0; k < mi->nr_blks; k++) {
-                         * Avoid allocating more nodes than requested, which can
+                                struct numa_memblk *bk = &mi->blk[k];
-                         * happen as a result of rounding down each node's size
-                         * to FAKE_NODE_MIN_SIZE.
+                                if (bi->nid == bk->nid)
-                         */
+                                        continue;
-                        if (nodes_weight(physnode_mask) + ret >= nr_nodes)
+                                if (start < bk->end && end > bk->start)
-                                end = physnodes[i].end;
+                                        break;
+                        }
-                        if (setup_node_range(ret++, &physnodes[i].start,
+                        if (k < mi->nr_blks)
-                                                end - physnodes[i].start,
+                                continue;
-                                                physnodes[i].end) < 0)
+                        printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
-                                node_clear(i, physnode_mask);
+                               bi->nid, bi->start, bi->end, bj->start, bj->end,
+                               start, end);
+                        bi->start = start;
+                        bi->end = end;
+                        numa_remove_memblk_from(j--, mi);
                }
        }
-        return ret;
-}
-/*
- * Returns the end address of a node so that there is at least `size' amount of
- * non-reserved memory or `max_addr' is reached.
- */
-static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
-{
-        u64 end = start + size;
-        while (end - start - memblock_x86_hole_size(start, end) < size) {
+        for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
-                end += FAKE_NODE_MIN_SIZE;
+                mi->blk[i].start = mi->blk[i].end = 0;
-                if (end > max_addr) {
+                mi->blk[i].nid = NUMA_NO_NODE;
-                        end = max_addr;
-                        break;
-                }
        }
-        return end;
+        return 0;
 }
 /*
- * Sets up fake nodes of `size' interleaved over physical nodes ranging from
+ * Set nodes, which have memory in @mi, in *@nodemask.
- * `addr' to `max_addr'.  The return value is the number of nodes allocated.
 */
-static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
+static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
+                                              const struct numa_meminfo *mi)
 {
-        nodemask_t physnode_mask = NODE_MASK_NONE;
-        u64 min_size;
-        int ret = 0;
        int i;
-        if (!size)
+        for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
-                return -1;
+                if (mi->blk[i].start != mi->blk[i].end &&
-        /*
+                    mi->blk[i].nid != NUMA_NO_NODE)
-         * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
+                        node_set(mi->blk[i].nid, *nodemask);
-         * increased accordingly if the requested size is too small.  This
+}
-         * creates a uniform distribution of node sizes across the entire
-         * machine (but not necessarily over physical nodes).
-         */
-        min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
-                                                MAX_NUMNODES;
-        min_size = max(min_size, FAKE_NODE_MIN_SIZE);
-        if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
-                min_size = (min_size + FAKE_NODE_MIN_SIZE) &
-                                                FAKE_NODE_MIN_HASH_MASK;
-        if (size < min_size) {
-                pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
-                        size >> 20, min_size >> 20);
-                size = min_size;
-        }
-        size &= FAKE_NODE_MIN_HASH_MASK;
-        for (i = 0; i < MAX_NUMNODES; i++)
-                if (physnodes[i].start != physnodes[i].end)
-                        node_set(i, physnode_mask);
-        /*
-         * Fill physical nodes with fake nodes of size until there is no memory
-         * left on any of them.
-         */
-        while (nodes_weight(physnode_mask)) {
-                for_each_node_mask(i, physnode_mask) {
-                        u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
-                        u64 end;
-                        end = find_end_of_node(physnodes[i].start,
-                                                physnodes[i].end, size);
-                        /*
-                         * If there won't be at least FAKE_NODE_MIN_SIZE of
-                         * non-reserved memory in ZONE_DMA32 for the next node,
-                         * this one must extend to the boundary.
-                         */
-                        if (end < dma32_end && dma32_end - end -
-                            memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
-                                end = dma32_end;
-                        /*
+/**
-                         * If there won't be enough non-reserved memory for the
+ * numa_reset_distance - Reset NUMA distance table
-                         * next node, this one must extend to the end of the
+ *
-                         * physical node.
+ * The current table is freed.  The next numa_set_distance() call will
-                         */
+ * create a new one.
-                        if (physnodes[i].end - end -
+ */
-                            memblock_x86_hole_size(end, physnodes[i].end) < size)
+void __init numa_reset_distance(void)
-                                end = physnodes[i].end;
+{
+        size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
-                        /*
+        /* numa_distance could be 1LU marking allocation failure, test cnt */
-                         * Setup the fake node that will be allocated as bootmem
+        if (numa_distance_cnt)
-                         * later.  If setup_node_range() returns non-zero, there
+                memblock_x86_free_range(__pa(numa_distance),
-                         * is no more memory available on this physical node.
+                                        __pa(numa_distance) + size);
-                         */
+        numa_distance_cnt = 0;
-                        if (setup_node_range(ret++, &physnodes[i].start,
+        numa_distance = NULL;   /* enable table creation */
-                                                end - physnodes[i].start,
-                                                physnodes[i].end) < 0)
-                                node_clear(i, physnode_mask);
-                }
-        }
-        return ret;
 }
-/*
+static int __init numa_alloc_distance(void)
- * Sets up the system RAM area from start_pfn to last_pfn according to the
- * numa=fake command-line option.
- */
-static int __init numa_emulation(unsigned long start_pfn,
-                        unsigned long last_pfn, int acpi, int amd)
 {
-        u64 addr = start_pfn << PAGE_SHIFT;
+        nodemask_t nodes_parsed;
-        u64 max_addr = last_pfn << PAGE_SHIFT;
+        size_t size;
-        int num_nodes;
+        int i, j, cnt = 0;
-        int i;
+        u64 phys;
-        /*
+        /* size the new table and allocate it */
-         * If the numa=fake command-line contains a 'M' or 'G', it represents
+        nodes_parsed = numa_nodes_parsed;
-         * the fixed node size.  Otherwise, if it is just a single number N,
+        numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
-         * split the system RAM into N fake nodes.
-         */
-        if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
-                u64 size;
-                size = memparse(cmdline, &cmdline);
+        for_each_node_mask(i, nodes_parsed)
-                num_nodes = split_nodes_size_interleave(addr, max_addr, size);
+                cnt = i;
-        } else {
+        cnt++;
-                unsigned long n;
+        size = cnt * cnt * sizeof(numa_distance[0]);
-                n = simple_strtoul(cmdline, NULL, 0);
+        phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT,
-                num_nodes = split_nodes_interleave(addr, max_addr, n);
+                                      size, PAGE_SIZE);
+        if (phys == MEMBLOCK_ERROR) {
+                pr_warning("NUMA: Warning: can't allocate distance table!\n");
+                /* don't retry until explicitly reset */
+                numa_distance = (void *)1LU;
+                return -ENOMEM;
        }
+        memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
-        if (num_nodes < 0)
+        numa_distance = __va(phys);
-                return num_nodes;
+        numa_distance_cnt = cnt;
-        memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
-        if (memnode_shift < 0) {
+        /* fill with the default distances */
-                memnode_shift = 0;
+        for (i = 0; i < cnt; i++)
-                printk(KERN_ERR "No NUMA hash function found.  NUMA emulation "
+                for (j = 0; j < cnt; j++)
-                       "disabled.\n");
+                        numa_distance[i * cnt + j] = i == j ?
-                return -1;
+                                LOCAL_DISTANCE : REMOTE_DISTANCE;
-        }
+        printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
-        /*
-         * We need to vacate all active ranges that may have been registered for
-         * the e820 memory map.
-         */
-        remove_all_active_ranges();
-        for_each_node_mask(i, node_possible_map) {
-                memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
-                                                nodes[i].end >> PAGE_SHIFT);
-                setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-        }
-        setup_physnodes(addr, max_addr, acpi, amd);
-        fake_physnodes(acpi, amd, num_nodes);
-        numa_init_array();
        return 0;
 }
-#endif /* CONFIG_NUMA_EMU */
-void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
+/**
-                                int acpi, int amd)
+ * numa_set_distance - Set NUMA distance from one NUMA node to another
+ * @from: the 'from' node to set distance
+ * @to: the 'to'  node to set distance
+ * @distance: NUMA distance
+ *
+ * Set the distance from node @from to @to to @distance.  If distance table
+ * doesn't exist, one which is large enough to accommodate all the currently
+ * known nodes will be created.
+ *
+ * If such table cannot be allocated, a warning is printed and further
+ * calls are ignored until the distance table is reset with
+ * numa_reset_distance().
+ *
+ * If @from or @to is higher than the highest known node at the time of
+ * table creation or @distance doesn't make sense, the call is ignored.
+ * This is to allow simplification of specific NUMA config implementations.
+ */
+void __init numa_set_distance(int from, int to, int distance)
 {
-        int i;
+        if (!numa_distance && numa_alloc_distance() < 0)
-        nodes_clear(node_possible_map);
-        nodes_clear(node_online_map);
-#ifdef CONFIG_NUMA_EMU
-        setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
-                        acpi, amd);
-        if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
                return;
-        setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
-                        acpi, amd);
-        nodes_clear(node_possible_map);
-        nodes_clear(node_online_map);
-#endif
-#ifdef CONFIG_ACPI_NUMA
+        if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
-        if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
+                printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
-                                                  last_pfn << PAGE_SHIFT))
+                            from, to, distance);
                return;
-        nodes_clear(node_possible_map);
+        }
-        nodes_clear(node_online_map);
-#endif
-#ifdef CONFIG_AMD_NUMA
+        if ((u8)distance != distance ||
-        if (!numa_off && amd && !amd_scan_nodes())
+            (from == to && distance != LOCAL_DISTANCE)) {
+                pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
+                             from, to, distance);
                return;
-        nodes_clear(node_possible_map);
+        }
-        nodes_clear(node_online_map);
-#endif
-        printk(KERN_INFO "%s\n",
-               numa_off ? "NUMA turned off" : "No NUMA configuration found");
-        printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
+        numa_distance[from * numa_distance_cnt + to] = distance;
-               start_pfn << PAGE_SHIFT,
-               last_pfn << PAGE_SHIFT);
-        /* setup dummy node covering all memory */
-        memnode_shift = 63;
-        memnodemap = memnode.embedded_map;
-        memnodemap[0] = 0;
-        node_set_online(0);
-        node_set(0, node_possible_map);
-        for (i = 0; i < nr_cpu_ids; i++)
-                numa_set_node(i, 0);
-        memblock_x86_register_active_regions(0, start_pfn, last_pfn);
-        setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
 }
-unsigned long __init numa_free_all_bootmem(void)
+int __node_distance(int from, int to)
 {
-        unsigned long pages = 0;
+        if (from >= numa_distance_cnt || to >= numa_distance_cnt)
-        int i;
+                return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
+        return numa_distance[from * numa_distance_cnt + to];
+}
+EXPORT_SYMBOL(__node_distance);
-        for_each_online_node(i)
+/*
-                pages += free_all_bootmem_node(NODE_DATA(i));
+ * Sanity check to catch more bad NUMA configurations (they are amazingly
+ * common).  Make sure the nodes cover all memory.
+ */
+static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
+{
+        unsigned long numaram, e820ram;
+        int i;
-        pages += free_all_memory_core_early(MAX_NUMNODES);
+        numaram = 0;
+        for (i = 0; i < mi->nr_blks; i++) {
+                unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
+                unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
+                numaram += e - s;
+                numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
+                if ((long)numaram < 0)
+                        numaram = 0;
+        }
-        return pages;
+        e820ram = max_pfn - (memblock_x86_hole_size(0,
+                                        max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
+        /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
+        if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
+                printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
+                       (numaram << PAGE_SHIFT) >> 20,
+                       (e820ram << PAGE_SHIFT) >> 20);
+                return false;
+        }
+        return true;
 }
-#ifdef CONFIG_NUMA
+static int __init numa_register_memblks(struct numa_meminfo *mi)
-static __init int find_near_online_node(int node)
 {
-        int n, val;
+        int i, nid;
-        int min_val = INT_MAX;
-        int best_node = -1;
-        for_each_online_node(n) {
+        /* Account for nodes with cpus and no memory */
-                val = node_distance(node, n);
+        node_possible_map = numa_nodes_parsed;
+        numa_nodemask_from_meminfo(&node_possible_map, mi);
+        if (WARN_ON(nodes_empty(node_possible_map)))
+                return -EINVAL;
-                if (val < min_val) {
+        memnode_shift = compute_hash_shift(mi);
-                        min_val = val;
+        if (memnode_shift < 0) {
-                        best_node = n;
+                printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
+                return -EINVAL;
+        }
+        for (i = 0; i < mi->nr_blks; i++)
+                memblock_x86_register_active_regions(mi->blk[i].nid,
+                                        mi->blk[i].start >> PAGE_SHIFT,
+                                        mi->blk[i].end >> PAGE_SHIFT);
+        /* for out of order entries */
+        sort_node_map();
+        if (!numa_meminfo_cover_memory(mi))
+                return -EINVAL;
+        /* Finally register nodes. */
+        for_each_node_mask(nid, node_possible_map) {
+                u64 start = (u64)max_pfn << PAGE_SHIFT;
+                u64 end = 0;
+                for (i = 0; i < mi->nr_blks; i++) {
+                        if (nid != mi->blk[i].nid)
+                                continue;
+                        start = min(mi->blk[i].start, start);
+                        end = max(mi->blk[i].end, end);
                }
+                if (start < end)
+                        setup_node_bootmem(nid, start, end);
        }
-        return best_node;
+        return 0;
 }
-/*
+/**
- * Setup early cpu_to_node.
+ * dummy_numa_init - Fallback dummy NUMA init
 *
- * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
+ * Used if there's no underlying NUMA architecture, NUMA initialization
- * and apicid_to_node[] tables have valid entries for a CPU.
+ * fails, or NUMA is disabled on the command line.
- * This means we skip cpu_to_node[] initialisation for NUMA
- * emulation and faking node case (when running a kernel compiled
- * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
- * is already initialized in a round robin manner at numa_init_array,
- * prior to this call, and this initialization is good enough
- * for the fake NUMA cases.
 *
- * Called before the per_cpu areas are setup.
+ * Must online at least one node and add memory blocks that cover all
+ * allowed memory.  This function must not fail.
 */
-void __init init_cpu_to_node(void)
+static int __init dummy_numa_init(void)
 {
-        int cpu;
+        printk(KERN_INFO "%s\n",
-        u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
+               numa_off ? "NUMA turned off" : "No NUMA configuration found");
+        printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
-        BUG_ON(cpu_to_apicid == NULL);
+               0LU, max_pfn << PAGE_SHIFT);
-        for_each_possible_cpu(cpu) {
+        node_set(0, numa_nodes_parsed);
-                int node;
+        numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
-                u16 apicid = cpu_to_apicid[cpu];
-                if (apicid == BAD_APICID)
+        return 0;
-                        continue;
-                node = apicid_to_node[apicid];
-                if (node == NUMA_NO_NODE)
-                        continue;
-                if (!node_online(node))
-                        node = find_near_online_node(node);
-                numa_set_node(cpu, node);
-        }
 }
-#endif
+static int __init numa_init(int (*init_func)(void))
-void __cpuinit numa_set_node(int cpu, int node)
 {
-        int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+        int i;
+        int ret;
-        /* early setting, no percpu area yet */
-        if (cpu_to_node_map) {
-                cpu_to_node_map[cpu] = node;
-                return;
-        }
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-        if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
-                printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
-                dump_stack();
-                return;
-        }
-#endif
-        per_cpu(x86_cpu_to_node_map, cpu) = node;
-        if (node != NUMA_NO_NODE)
+        for (i = 0; i < MAX_LOCAL_APIC; i++)
-                set_cpu_numa_node(cpu, node);
+                set_apicid_to_node(i, NUMA_NO_NODE);
-}
-void __cpuinit numa_clear_node(int cpu)
+        nodes_clear(numa_nodes_parsed);
-{
+        nodes_clear(node_possible_map);
-        numa_set_node(cpu, NUMA_NO_NODE);
+        nodes_clear(node_online_map);
-}
+        memset(&numa_meminfo, 0, sizeof(numa_meminfo));
+        remove_all_active_ranges();
-#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+        numa_reset_distance();
-#ifndef CONFIG_NUMA_EMU
+        ret = init_func();
-void __cpuinit numa_add_cpu(int cpu)
+        if (ret < 0)
-{
+                return ret;
-        cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+        ret = numa_cleanup_meminfo(&numa_meminfo);
-}
+        if (ret < 0)
+                return ret;
-void __cpuinit numa_remove_cpu(int cpu)
+        numa_emulation(&numa_meminfo, numa_distance_cnt);
-{
-        cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
-}
-#else
-void __cpuinit numa_add_cpu(int cpu)
-{
-        unsigned long addr;
-        u16 apicid;
-        int physnid;
-        int nid = NUMA_NO_NODE;
-        nid = early_cpu_to_node(cpu);
+        ret = numa_register_memblks(&numa_meminfo);
-        BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
+        if (ret < 0)
+                return ret;
-        /*
+        for (i = 0; i < nr_cpu_ids; i++) {
-         * Use the starting address of the emulated node to find which physical
+                int nid = early_cpu_to_node(i);
-         * node it is allocated on.
-         */
-        addr = node_start_pfn(nid) << PAGE_SHIFT;
-        for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
-                if (addr >= physnodes[physnid].start &&
-                    addr < physnodes[physnid].end)
-                        break;
-        /*
+                if (nid == NUMA_NO_NODE)
-         * Map the cpu to each emulated node that is allocated on the physical
+                        continue;
-         * node of the cpu's apic id.
+                if (!node_online(nid))
-         */
+                        numa_clear_node(i);
-        for_each_online_node(nid) {
-                addr = node_start_pfn(nid) << PAGE_SHIFT;
-                if (addr >= physnodes[physnid].start &&
-                    addr < physnodes[physnid].end)
-                        cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
        }
+        numa_init_array();
+        return 0;
 }
-void __cpuinit numa_remove_cpu(int cpu)
+void __init initmem_init(void)
 {
-        int i;
+        int ret;
-        for_each_online_node(i)
+        if (!numa_off) {
-                cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
+#ifdef CONFIG_ACPI_NUMA
-}
+                ret = numa_init(x86_acpi_numa_init);
-#endif /* !CONFIG_NUMA_EMU */
+                if (!ret)
+                        return;
-#else /* CONFIG_DEBUG_PER_CPU_MAPS */
+#endif
-static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
+#ifdef CONFIG_AMD_NUMA
-{
+                ret = numa_init(amd_numa_init);
-        int node = early_cpu_to_node(cpu);
+                if (!ret)
-        struct cpumask *mask;
+                        return;
-        char buf[64];
+#endif
-        mask = node_to_cpumask_map[node];
-        if (!mask) {
-                pr_err("node_to_cpumask_map[%i] NULL\n", node);
-                dump_stack();
-                return NULL;
        }
-        cpulist_scnprintf(buf, sizeof(buf), mask);
+        numa_init(dummy_numa_init);
-        printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
-                enable ? "numa_add_cpu" : "numa_remove_cpu",
-                cpu, node, buf);
-        return mask;
 }
-/*
+unsigned long __init numa_free_all_bootmem(void)
- * --------- debug versions of the numa functions ---------
- */
-#ifndef CONFIG_NUMA_EMU
-static void __cpuinit numa_set_cpumask(int cpu, int enable)
-{
-        struct cpumask *mask;
-        mask = debug_cpumask_set_cpu(cpu, enable);
-        if (!mask)
-                return;
-        if (enable)
-                cpumask_set_cpu(cpu, mask);
-        else
-                cpumask_clear_cpu(cpu, mask);
-}
-#else
-static void __cpuinit numa_set_cpumask(int cpu, int enable)
 {
-        int node = early_cpu_to_node(cpu);
+        unsigned long pages = 0;
-        struct cpumask *mask;
        int i;
-        for_each_online_node(i) {
+        for_each_online_node(i)
-                unsigned long addr;
+                pages += free_all_bootmem_node(NODE_DATA(i));
-                addr = node_start_pfn(i) << PAGE_SHIFT;
-                if (addr < physnodes[node].start ||
-                                        addr >= physnodes[node].end)
-                        continue;
-                mask = debug_cpumask_set_cpu(cpu, enable);
-                if (!mask)
-                        return;
-                if (enable)
-                        cpumask_set_cpu(cpu, mask);
-                else
-                        cpumask_clear_cpu(cpu, mask);
-        }
-}
-#endif /* CONFIG_NUMA_EMU */
-void __cpuinit numa_add_cpu(int cpu)
+        pages += free_all_memory_core_early(MAX_NUMNODES);
-{
-        numa_set_cpumask(cpu, 1);
-}
-void __cpuinit numa_remove_cpu(int cpu)
+        return pages;
-{
-        numa_set_cpumask(cpu, 0);
 }
-int __cpu_to_node(int cpu)
+int __cpuinit numa_cpu_node(int cpu)
 {
-        if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
+        int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
-                printk(KERN_WARNING
-                        "cpu_to_node(%d): usage too early!\n", cpu);
-                dump_stack();
-                return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
-        }
-        return per_cpu(x86_cpu_to_node_map, cpu);
-}
-EXPORT_SYMBOL(__cpu_to_node);
-/*
+        if (apicid != BAD_APICID)
- * Same function as cpu_to_node() but used if called before the
+                return __apicid_to_node[apicid];
- * per_cpu areas are setup.
+        return NUMA_NO_NODE;
- */
-int early_cpu_to_node(int cpu)
-{
-        if (early_per_cpu_ptr(x86_cpu_to_node_map))
-                return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
-        if (!cpu_possible(cpu)) {
-                printk(KERN_WARNING
-                        "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
-                dump_stack();
-                return NUMA_NO_NODE;
-        }
-        return per_cpu(x86_cpu_to_node_map, cpu);
 }
-/*
- * --------- end of debug versions of the numa functions ---------
- */
-#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
new file mode 100644
index 000000000000..ad091e4cff17
--- /dev/null
+++ b/arch/x86/mm/numa_emulation.c
@@ -0,0 +1,494 @@
+/*
+ * NUMA emulation
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/topology.h>
+#include <linux/memblock.h>
+#include <asm/dma.h>
+#include "numa_internal.h"
+static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
+static char *emu_cmdline __initdata;
+/*
+ * Remember the emulation command-line argument.  The string is not parsed
+ * here; numa_emulation() reads it back from emu_cmdline later and treats a
+ * NULL pointer as "emulation not requested".
+ */
+void __init numa_emu_cmdline(char *str)
+{
+        emu_cmdline = str;
+}
+/*
+ * Look up the first block in @mi that belongs to node @nid.  Returns the
+ * block's index into mi->blk[], or -ENOENT when no block carries that nid.
+ */
+static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
+{
+        int idx = 0;
+        while (idx < mi->nr_blks) {
+                if (mi->blk[idx].nid == nid)
+                        return idx;
+                idx++;
+        }
+        return -ENOENT;
+}
+/*
+ * Append one emulated block for node @nid to @ei.  The block takes @size
+ * bytes from the front of physical block @phys_blk in @pi, which is shrunk
+ * accordingly and removed from @pi once it has been used up.  The first
+ * physical node an emulated nid is carved from is recorded in
+ * emu_nid_to_phys[].  Returns 0 on success, or -EINVAL if @ei has no room
+ * for another block.
+ */
+static int __init emu_setup_memblk(struct numa_meminfo *ei,
+                                   struct numa_meminfo *pi,
+                                   int nid, int phys_blk, u64 size)
+{
+        struct numa_memblk *emu, *phys;
+        if (ei->nr_blks >= NR_NODE_MEMBLKS) {
+                pr_err("NUMA: Too many emulated memblks, failing emulation\n");
+                return -EINVAL;
+        }
+        /* claim the next free slot in the emulated table */
+        emu = &ei->blk[ei->nr_blks++];
+        phys = &pi->blk[phys_blk];
+        emu->start = phys->start;
+        emu->end = phys->start + size;
+        emu->nid = nid;
+        if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
+                emu_nid_to_phys[nid] = phys->nid;
+        /* consume the range from the physical block */
+        phys->start += size;
+        if (phys->start >= phys->end) {
+                WARN_ON_ONCE(phys->start > phys->end);
+                numa_remove_memblk_from(phys_blk, pi);
+        }
+        printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
+               emu->start, emu->end, (emu->end - emu->start) >> 20);
+        return 0;
+}
+/*
+ * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
+ * to max_addr.  Emulated blocks are appended to @ei and the memory they use
+ * is carved off the matching block in @pi via emu_setup_memblk().  A request
+ * for more than MAX_NUMNODES nodes is reduced to MAX_NUMNODES.  Returns 0 on
+ * success and a negative value on failure (nr_nodes <= 0, not enough memory
+ * for each node, or an error from emu_setup_memblk()).
+ */
+static int __init split_nodes_interleave(struct numa_meminfo *ei,
+                                         struct numa_meminfo *pi,
+                                         u64 addr, u64 max_addr, int nr_nodes)
+{
+        nodemask_t physnode_mask = NODE_MASK_NONE;
+        u64 size;
+        int big;
+        int nid = 0;
+        int i, ret;
+        if (nr_nodes <= 0)
+                return -1;
+        if (nr_nodes > MAX_NUMNODES) {
+                pr_info("numa=fake=%d too large, reducing to %d\n",
+                        nr_nodes, MAX_NUMNODES);
+                nr_nodes = MAX_NUMNODES;
+        }
+        size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
+        /*
+         * Calculate the number of big nodes that can be allocated as a result
+         * of consolidating the remainder.
+         */
+        big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
+                FAKE_NODE_MIN_SIZE;
+        size &= FAKE_NODE_MIN_HASH_MASK;
+        if (!size) {
+                pr_err("Not enough memory for each node.  "
+                        "NUMA emulation disabled.\n");
+                return -1;
+        }
+        for (i = 0; i < pi->nr_blks; i++)
+                node_set(pi->blk[i].nid, physnode_mask);
+        /*
+         * Continue to fill physical nodes with fake nodes until there is no
+         * memory left on any of them.
+         */
+        while (nodes_weight(physnode_mask)) {
+                for_each_node_mask(i, physnode_mask) {
+                        u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+                        u64 start, limit, end;
+                        int phys_blk;
+                        phys_blk = emu_find_memblk_by_nid(i, pi);
+                        if (phys_blk < 0) {
+                                node_clear(i, physnode_mask);
+                                continue;
+                        }
+                        start = pi->blk[phys_blk].start;
+                        limit = pi->blk[phys_blk].end;
+                        end = start + size;
+                        if (nid < big)
+                                end += FAKE_NODE_MIN_SIZE;
+                        /*
+                         * Continue to add memory to this fake node if its
+                         * non-reserved memory is less than the per-node size.
+                         */
+                        while (end - start -
+                               memblock_x86_hole_size(start, end) < size) {
+                                end += FAKE_NODE_MIN_SIZE;
+                                if (end > limit) {
+                                        end = limit;
+                                        break;
+                                }
+                        }
+                        /*
+                         * If there won't be at least FAKE_NODE_MIN_SIZE of
+                         * non-reserved memory in ZONE_DMA32 for the next node,
+                         * this one must extend to the boundary.
+                         */
+                        if (end < dma32_end && dma32_end - end -
+                            memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+                                end = dma32_end;
+                        /*
+                         * If there won't be enough non-reserved memory for the
+                         * next node, this one must extend to the end of the
+                         * physical node.
+                         */
+                        if (limit - end -
+                            memblock_x86_hole_size(end, limit) < size)
+                                end = limit;
+                        ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
+                                               phys_blk,
+                                               min(end, limit) - start);
+                        if (ret < 0)
+                                return ret;
+                }
+        }
+        return 0;
+}
+/*
+ * Grow a candidate node starting at `start' in FAKE_NODE_MIN_SIZE steps until
+ * it holds at least `size' bytes outside of memory holes.  Returns the end
+ * address found, clamped to `max_addr' if stepping would go past it.
+ */
+static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
+{
+        u64 end = start + size;
+        for (;;) {
+                /* enough usable memory between start and end: done */
+                if (end - start - memblock_x86_hole_size(start, end) >= size)
+                        return end;
+                end += FAKE_NODE_MIN_SIZE;
+                if (end > max_addr)
+                        return max_addr;
+        }
+}
+/*
+ * Sets up fake nodes of `size' interleaved over physical nodes ranging from
+ * `addr' to `max_addr'.  Emulated blocks are appended to @ei and the memory
+ * they use is carved off the matching block in @pi via emu_setup_memblk().
+ * Returns 0 on success and a negative value on failure (`size' is zero, or
+ * an error from emu_setup_memblk()).
+ */
+static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
+                                              struct numa_meminfo *pi,
+                                              u64 addr, u64 max_addr, u64 size)
+{
+        nodemask_t physnode_mask = NODE_MASK_NONE;
+        u64 min_size;
+        int nid = 0;
+        int i, ret;
+        if (!size)
+                return -1;
+        /*
+         * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
+         * increased accordingly if the requested size is too small.  This
+         * creates a uniform distribution of node sizes across the entire
+         * machine (but not necessarily over physical nodes).
+         */
+        min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
+                                                MAX_NUMNODES;
+        min_size = max(min_size, FAKE_NODE_MIN_SIZE);
+        if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
+                min_size = (min_size + FAKE_NODE_MIN_SIZE) &
+                                                FAKE_NODE_MIN_HASH_MASK;
+        if (size < min_size) {
+                pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
+                        size >> 20, min_size >> 20);
+                size = min_size;
+        }
+        size &= FAKE_NODE_MIN_HASH_MASK;
+        for (i = 0; i < pi->nr_blks; i++)
+                node_set(pi->blk[i].nid, physnode_mask);
+        /*
+         * Fill physical nodes with fake nodes of size until there is no memory
+         * left on any of them.
+         */
+        while (nodes_weight(physnode_mask)) {
+                for_each_node_mask(i, physnode_mask) {
+                        /* same pfn-to-address conversion as split_nodes_interleave() */
+                        u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+                        u64 start, limit, end;
+                        int phys_blk;
+                        phys_blk = emu_find_memblk_by_nid(i, pi);
+                        if (phys_blk < 0) {
+                                node_clear(i, physnode_mask);
+                                continue;
+                        }
+                        start = pi->blk[phys_blk].start;
+                        limit = pi->blk[phys_blk].end;
+                        end = find_end_of_node(start, limit, size);
+                        /*
+                         * If there won't be at least FAKE_NODE_MIN_SIZE of
+                         * non-reserved memory in ZONE_DMA32 for the next node,
+                         * this one must extend to the boundary.
+                         */
+                        if (end < dma32_end && dma32_end - end -
+                            memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+                                end = dma32_end;
+                        /*
+                         * If there won't be enough non-reserved memory for the
+                         * next node, this one must extend to the end of the
+                         * physical node.
+                         */
+                        if (limit - end -
+                            memblock_x86_hole_size(end, limit) < size)
+                                end = limit;
+                        ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
+                                               phys_blk,
+                                               min(end, limit) - start);
+                        if (ret < 0)
+                                return ret;
+                }
+        }
+        return 0;
+}
+/**
+ * numa_emulation - Emulate NUMA nodes
+ * @numa_meminfo: NUMA configuration to massage
+ * @numa_dist_cnt: The size of the physical NUMA distance table
+ *
+ * Emulate NUMA nodes according to the numa=fake kernel parameter.
+ * @numa_meminfo contains the physical memory configuration and is modified
+ * to reflect the emulated configuration on success.  @numa_dist_cnt is
+ * used to determine the size of the physical distance table.
+ *
+ * On success, the following modifications are made.
+ *
+ * - @numa_meminfo is updated to reflect the emulated nodes.
+ *
+ * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
+ *   emulated nodes.
+ *
+ * - NUMA distance table is rebuilt to represent distances between emulated
+ *   nodes.  The distances are determined considering how emulated nodes
+ *   are mapped to physical nodes and match the actual distances.
+ *
+ * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
+ *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu().
+ *
+ * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
+ * identity mapping and no other modification is made.  The temporary copy
+ * of the physical distance table is released on both the success and the
+ * failure path.
+ */
+void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
+{
+        static struct numa_meminfo ei __initdata;
+        static struct numa_meminfo pi __initdata;
+        /* widen before shifting, like the other max_pfn conversions */
+        const u64 max_addr = (u64)max_pfn << PAGE_SHIFT;
+        u8 *phys_dist = NULL;
+        size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
+        int max_emu_nid, dfl_phys_nid;
+        int i, j, ret;
+        if (!emu_cmdline)
+                goto no_emu;
+        memset(&ei, 0, sizeof(ei));
+        pi = *numa_meminfo;
+        for (i = 0; i < MAX_NUMNODES; i++)
+                emu_nid_to_phys[i] = NUMA_NO_NODE;
+        /*
+         * If the numa=fake command-line contains a 'M' or 'G', it represents
+         * the fixed node size.  Otherwise, if it is just a single number N,
+         * split the system RAM into N fake nodes.
+         */
+        if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
+                u64 size;
+                size = memparse(emu_cmdline, &emu_cmdline);
+                ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
+        } else {
+                unsigned long n;
+                n = simple_strtoul(emu_cmdline, NULL, 0);
+                ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
+        }
+        if (ret < 0)
+                goto no_emu;
+        if (numa_cleanup_meminfo(&ei) < 0) {
+                pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
+                goto no_emu;
+        }
+        /* copy the physical distance table */
+        if (numa_dist_cnt) {
+                u64 phys;
+                phys = memblock_find_in_range(0,
+                                              (u64)max_pfn_mapped << PAGE_SHIFT,
+                                              phys_size, PAGE_SIZE);
+                if (phys == MEMBLOCK_ERROR) {
+                        pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
+                        goto no_emu;
+                }
+                memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST");
+                phys_dist = __va(phys);
+                for (i = 0; i < numa_dist_cnt; i++)
+                        for (j = 0; j < numa_dist_cnt; j++)
+                                phys_dist[i * numa_dist_cnt + j] =
+                                        node_distance(i, j);
+        }
+        /*
+         * Determine the max emulated nid and the default phys nid to use
+         * for unmapped nodes.
+         */
+        max_emu_nid = 0;
+        dfl_phys_nid = NUMA_NO_NODE;
+        for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
+                if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
+                        max_emu_nid = i;
+                        if (dfl_phys_nid == NUMA_NO_NODE)
+                                dfl_phys_nid = emu_nid_to_phys[i];
+                }
+        }
+        if (dfl_phys_nid == NUMA_NO_NODE) {
+                pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
+                goto no_emu;
+        }
+        /* commit */
+        *numa_meminfo = ei;
+        /*
+         * Transform __apicid_to_node table to use emulated nids by
+         * reverse-mapping phys_nid.  The maps should always exist but fall
+         * back to zero just in case.
+         */
+        for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
+                if (__apicid_to_node[i] == NUMA_NO_NODE)
+                        continue;
+                for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
+                        if (__apicid_to_node[i] == emu_nid_to_phys[j])
+                                break;
+                __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
+        }
+        /* make sure all emulated nodes are mapped to a physical node */
+        for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+                if (emu_nid_to_phys[i] == NUMA_NO_NODE)
+                        emu_nid_to_phys[i] = dfl_phys_nid;
+        /* transform distance table */
+        numa_reset_distance();
+        for (i = 0; i < max_emu_nid + 1; i++) {
+                for (j = 0; j < max_emu_nid + 1; j++) {
+                        int physi = emu_nid_to_phys[i];
+                        int physj = emu_nid_to_phys[j];
+                        int dist;
+                        if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
+                                dist = physi == physj ?
+                                        LOCAL_DISTANCE : REMOTE_DISTANCE;
+                        else
+                                dist = phys_dist[physi * numa_dist_cnt + physj];
+                        numa_set_distance(i, j, dist);
+                }
+        }
+        /* free the copied physical distance table */
+        if (phys_dist)
+                memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size);
+        return;
+no_emu:
+        /*
+         * A failure detected after the distance table was copied must not
+         * leave the temporary range reserved.  phys_dist is still NULL for
+         * every earlier bail-out.
+         */
+        if (phys_dist)
+                memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size);
+        /* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
+        for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+                emu_nid_to_phys[i] = i;
+}
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+/*
+ * Add @cpu to the cpumask of every online node that emu_nid_to_phys[] maps
+ * to the same physical node as the cpu's own node.  BUGs if the cpu has no
+ * node or its node is not online.
+ */
+void __cpuinit numa_add_cpu(int cpu)
+{
+        int cpu_nid = early_cpu_to_node(cpu);
+        int phys, n;
+        BUG_ON(cpu_nid == NUMA_NO_NODE || !node_online(cpu_nid));
+        phys = emu_nid_to_phys[cpu_nid];
+        /* visit all online nodes and pick those sharing the physical node */
+        for_each_online_node(n) {
+                if (emu_nid_to_phys[n] != phys)
+                        continue;
+                cpumask_set_cpu(cpu, node_to_cpumask_map[n]);
+        }
+}
+/* Drop @cpu from the cpumask of every online node. */
+void __cpuinit numa_remove_cpu(int cpu)
+{
+        int nid;
+        for_each_online_node(nid) {
+                cpumask_clear_cpu(cpu, node_to_cpumask_map[nid]);
+        }
+}
+#else   /* !CONFIG_DEBUG_PER_CPU_MAPS */
+/*
+ * Debug variant shared by numa_add_cpu() and numa_remove_cpu(): set
+ * (@enable non-zero) or clear @cpu in the mask handed back by
+ * debug_cpumask_set_cpu(), once for each online node that emu_nid_to_phys[]
+ * places on the same physical node as the cpu's own node.
+ */
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+        struct cpumask *mask;
+        int nid, physnid, i;
+        nid = early_cpu_to_node(cpu);
+        if (nid == NUMA_NO_NODE) {
+                /* early_cpu_to_node() already emits a warning and trace */
+                return;
+        }
+        physnid = emu_nid_to_phys[nid];
+        for_each_online_node(i) {
+                /*
+                 * Test the node being visited.  emu_nid_to_phys[nid] is
+                 * physnid by construction, so comparing it would never
+                 * filter anything out.
+                 */
+                if (emu_nid_to_phys[i] != physnid)
+                        continue;
+                /*
+                 * NOTE(review): debug_cpumask_set_cpu() is not told about @i.
+                 * If it derives the mask from the cpu's own node, the masks
+                 * of sibling emulated nodes are not touched here -- confirm
+                 * and pass the node through if so.
+                 */
+                mask = debug_cpumask_set_cpu(cpu, enable);
+                if (!mask)
+                        return;
+                if (enable)
+                        cpumask_set_cpu(cpu, mask);
+                else
+                        cpumask_clear_cpu(cpu, mask);
+        }
+}
+/* Debug build: add @cpu to the node cpumasks via numa_set_cpumask(). */
+void __cpuinit numa_add_cpu(int cpu)
+{
+        numa_set_cpumask(cpu, 1);
+}
+/* Debug build: remove @cpu from the node cpumasks via numa_set_cpumask(). */
+void __cpuinit numa_remove_cpu(int cpu)
+{
+        numa_set_cpumask(cpu, 0);
+}
+#endif  /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
new file mode 100644
index 000000000000..ef2d97377d7c
--- /dev/null
+++ b/arch/x86/mm/numa_internal.h
@@ -0,0 +1,31 @@
+#ifndef __X86_MM_NUMA_INTERNAL_H
+#define __X86_MM_NUMA_INTERNAL_H
+#include <linux/types.h>
+#include <asm/numa.h>
+struct numa_memblk {
+        u64                     start;  /* first address of the block */
+        u64                     end;    /* end address; block size is end - start */
+        int                     nid;    /* id of the node that owns the block */
+};
+struct numa_meminfo {
+        int                     nr_blks;                /* number of blk[] entries in use */
+        struct numa_memblk      blk[NR_NODE_MEMBLKS];   /* block table, valid in [0, nr_blks) */
+};
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
+void __init numa_reset_distance(void);
+#ifdef CONFIG_NUMA_EMU
+void __init numa_emulation(struct numa_meminfo *numa_meminfo,
+                           int numa_dist_cnt);
+#else
+static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
+                                  int numa_dist_cnt)
+{ }     /* empty stand-in used when CONFIG_NUMA_EMU is not set */
+#endif
+#endif  /* __X86_MM_NUMA_INTERNAL_H */
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index ae96e7b8051d..48651c6f657d 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -57,7 +57,7 @@ struct node_memory_chunk_s {
 static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
 static int __initdata num_memory_chunks; /* total number of memory chunks */
-static u8 __initdata apicid_to_pxm[MAX_APICID];
+static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC];
 int acpi_numa __initdata;
@@ -254,8 +254,8 @@ int __init get_memcfg_from_srat(void)
        printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
                         num_memory_chunks);
-        for (i = 0; i < MAX_APICID; i++)
+        for (i = 0; i < MAX_LOCAL_APIC; i++)
-                apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
+                set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i]));
        for (j = 0; j < num_memory_chunks; j++){
                struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 603d285d1daa..8e9d3394f6d4 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -26,88 +26,34 @@
 int acpi_numa __initdata;
-static struct acpi_table_slit *acpi_slit;
-static nodemask_t nodes_parsed __initdata;
-static nodemask_t cpu_nodes_parsed __initdata;
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
 static struct bootnode nodes_add[MAX_NUMNODES];
-static int num_node_memblks __initdata;
-static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
-static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
 static __init int setup_node(int pxm)
 {
        return acpi_map_pxm_to_node(pxm);
 }
-static __init int conflicting_memblks(unsigned long start, unsigned long end)
-{
-        int i;
-        for (i = 0; i < num_node_memblks; i++) {
-                struct bootnode *nd = &node_memblk_range[i];
-                if (nd->start == nd->end)
-                        continue;
-                if (nd->end > start && nd->start < end)
-                        return memblk_nodeid[i];
-                if (nd->end == end && nd->start == start)
-                        return memblk_nodeid[i];
-        }
-        return -1;
-}
-static __init void cutoff_node(int i, unsigned long start, unsigned long end)
-{
-        struct bootnode *nd = &nodes[i];
-        if (nd->start < start) {
-                nd->start = start;
-                if (nd->end < nd->start)
-                        nd->start = nd->end;
-        }
-        if (nd->end > end) {
-                nd->end = end;
-                if (nd->start > nd->end)
-                        nd->start = nd->end;
-        }
-}
 static __init void bad_srat(void)
 {
-        int i;
        printk(KERN_ERR "SRAT: SRAT not used.\n");
        acpi_numa = -1;
-        for (i = 0; i < MAX_LOCAL_APIC; i++)
+        memset(nodes_add, 0, sizeof(nodes_add));
-                apicid_to_node[i] = NUMA_NO_NODE;
-        for (i = 0; i < MAX_NUMNODES; i++) {
-                nodes[i].start = nodes[i].end = 0;
-                nodes_add[i].start = nodes_add[i].end = 0;
-        }
-        remove_all_active_ranges();
 }
 static __init inline int srat_disabled(void)
 {
-        return numa_off || acpi_numa < 0;
+        return acpi_numa < 0;
 }
 /* Callback for SLIT parsing */
 void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
 {
-        unsigned length;
+        int i, j;
-        unsigned long phys;
-        length = slit->header.length;
-        phys = memblock_find_in_range(0, max_pfn_mapped<<PAGE_SHIFT, length,
-                 PAGE_SIZE);
-        if (phys == MEMBLOCK_ERROR)
-                panic(" Can not save slit!\n");
-        acpi_slit = __va(phys);
+        for (i = 0; i < slit->locality_count; i++)
-        memcpy(acpi_slit, slit, length);
+                for (j = 0; j < slit->locality_count; j++)
-        memblock_x86_reserve_range(phys, phys + length, "ACPI SLIT");
+                        numa_set_distance(pxm_to_node(i), pxm_to_node(j),
+                                slit->entry[slit->locality_count * i + j]);
 }
 /* Callback for Proximity Domain -> x2APIC mapping */
@@ -138,8 +84,8 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
                printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
                return;
        }
-        apicid_to_node[apic_id] = node;
+        set_apicid_to_node(apic_id, node);
-        node_set(node, cpu_nodes_parsed);
+        node_set(node, numa_nodes_parsed);
        acpi_numa = 1;
        printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
               pxm, apic_id, node);
@@ -178,8 +124,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
                return;
        }
-        apicid_to_node[apic_id] = node;
+        set_apicid_to_node(apic_id, node);
-        node_set(node, cpu_nodes_parsed);
+        node_set(node, numa_nodes_parsed);
        acpi_numa = 1;
        printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
               pxm, apic_id, node);
@@ -241,7 +187,7 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
        }
        if (changed) {
-                node_set(node, cpu_nodes_parsed);
+                node_set(node, numa_nodes_parsed);
                printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
                                 nd->start, nd->end);
        }
@@ -251,10 +197,8 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
 void __init
 acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 {
-        struct bootnode *nd, oldnode;
        unsigned long start, end;
        int node, pxm;
-        int i;
        if (srat_disabled())
                return;
@@ -276,300 +220,31 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
                bad_srat();
                return;
        }
-        i = conflicting_memblks(start, end);
-        if (i == node) {
+        if (numa_add_memblk(node, start, end) < 0) {
-                printk(KERN_WARNING
-                "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
-                        pxm, start, end, nodes[i].start, nodes[i].end);
-        } else if (i >= 0) {
-                printk(KERN_ERR
-                       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
-                       pxm, start, end, node_to_pxm(i),
-                        nodes[i].start, nodes[i].end);
                bad_srat();
                return;
        }
-        nd = &nodes[node];
-        oldnode = *nd;
-        if (!node_test_and_set(node, nodes_parsed)) {
-                nd->start = start;
-                nd->end = end;
-        } else {
-                if (start < nd->start)
-                        nd->start = start;
-                if (nd->end < end)
-                        nd->end = end;
-        }
        printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
               start, end);
-        if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
+        if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
                update_nodes_add(node, start, end);
-                /* restore nodes[node] */
-                *nd = oldnode;
-                if ((nd->start | nd->end) == 0)
-                        node_clear(node, nodes_parsed);
-        }
-        node_memblk_range[num_node_memblks].start = start;
-        node_memblk_range[num_node_memblks].end = end;
-        memblk_nodeid[num_node_memblks] = node;
-        num_node_memblks++;
-}
-/* Sanity check to catch more bad SRATs (they are amazingly common).
-   Make sure the PXMs cover all memory. */
-static int __init nodes_cover_memory(const struct bootnode *nodes)
-{
-        int i;
-        unsigned long pxmram, e820ram;
-        pxmram = 0;
-        for_each_node_mask(i, nodes_parsed) {
-                unsigned long s = nodes[i].start >> PAGE_SHIFT;
-                unsigned long e = nodes[i].end >> PAGE_SHIFT;
-                pxmram += e - s;
-                pxmram -= __absent_pages_in_range(i, s, e);
-                if ((long)pxmram < 0)
-                        pxmram = 0;
-        }
-        e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
-        /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
-        if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
-                printk(KERN_ERR
-        "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
-                        (pxmram << PAGE_SHIFT) >> 20,
-                        (e820ram << PAGE_SHIFT) >> 20);
-                return 0;
-        }
-        return 1;
 }
 void __init acpi_numa_arch_fixup(void) {}
-#ifdef CONFIG_NUMA_EMU
+int __init x86_acpi_numa_init(void)
-void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
-                                unsigned long end)
-{
-        int i;
-        for_each_node_mask(i, nodes_parsed) {
-                cutoff_node(i, start, end);
-                physnodes[i].start = nodes[i].start;
-                physnodes[i].end = nodes[i].end;
-        }
-}
-#endif /* CONFIG_NUMA_EMU */
-/* Use the information discovered above to actually set up the nodes. */
-int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 {
-        int i;
+        int ret;
-        if (acpi_numa <= 0)
-                return -1;
-        /* First clean up the node list */
-        for (i = 0; i < MAX_NUMNODES; i++)
-                cutoff_node(i, start, end);
-        /*
-         * Join together blocks on the same node, holes between
-         * which don't overlap with memory on other nodes.
-         */
-        for (i = 0; i < num_node_memblks; ++i) {
-                int j, k;
-                for (j = i + 1; j < num_node_memblks; ++j) {
-                        unsigned long start, end;
-                        if (memblk_nodeid[i] != memblk_nodeid[j])
-                                continue;
-                        start = min(node_memblk_range[i].end,
-                                    node_memblk_range[j].end);
-                        end = max(node_memblk_range[i].start,
-                                  node_memblk_range[j].start);
-                        for (k = 0; k < num_node_memblks; ++k) {
-                                if (memblk_nodeid[i] == memblk_nodeid[k])
-                                        continue;
-                                if (start < node_memblk_range[k].end &&
-                                    end > node_memblk_range[k].start)
-                                        break;
-                        }
-                        if (k < num_node_memblks)
-                                continue;
-                        start = min(node_memblk_range[i].start,
-                                    node_memblk_range[j].start);
-                        end = max(node_memblk_range[i].end,
-                                  node_memblk_range[j].end);
-                        printk(KERN_INFO "SRAT: Node %d "
-                               "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
-                               memblk_nodeid[i],
-                               node_memblk_range[i].start,
-                               node_memblk_range[i].end,
-                               node_memblk_range[j].start,
-                               node_memblk_range[j].end,
-                               start, end);
-                        node_memblk_range[i].start = start;
-                        node_memblk_range[i].end = end;
-                        k = --num_node_memblks - j;
-                        memmove(memblk_nodeid + j, memblk_nodeid + j+1,
-                                k * sizeof(*memblk_nodeid));
-                        memmove(node_memblk_range + j, node_memblk_range + j+1,
-                                k * sizeof(*node_memblk_range));
-                        --j;
-                }
-        }
-        memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
-                                           memblk_nodeid);
-        if (memnode_shift < 0) {
-                printk(KERN_ERR
-                     "SRAT: No NUMA node hash function found. Contact maintainer\n");
-                bad_srat();
-                return -1;
-        }
-        for (i = 0; i < num_node_memblks; i++)
-                memblock_x86_register_active_regions(memblk_nodeid[i],
-                                node_memblk_range[i].start >> PAGE_SHIFT,
-                                node_memblk_range[i].end >> PAGE_SHIFT);
-        /* for out of order entries in SRAT */
-        sort_node_map();
-        if (!nodes_cover_memory(nodes)) {
-                bad_srat();
-                return -1;
-        }
-        /* Account for nodes with cpus and no memory */
+        ret = acpi_numa_init();
-        nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
+        if (ret < 0)
+                return ret;
-        /* Finally register nodes */
+        return srat_disabled() ? -EINVAL : 0;
-        for_each_node_mask(i, node_possible_map)
-                setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-        /* Try again in case setup_node_bootmem missed one due
-           to missing bootmem */
-        for_each_node_mask(i, node_possible_map)
-                if (!node_online(i))
-                        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-        for (i = 0; i < nr_cpu_ids; i++) {
-                int node = early_cpu_to_node(i);
-                if (node == NUMA_NO_NODE)
-                        continue;
-                if (!node_online(node))
-                        numa_clear_node(i);
-        }
-        numa_init_array();
-        return 0;
-}
-#ifdef CONFIG_NUMA_EMU
-static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
-        [0 ... MAX_NUMNODES-1] = PXM_INVAL
-};
-static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
-        [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
-};
-static int __init find_node_by_addr(unsigned long addr)
-{
-        int ret = NUMA_NO_NODE;
-        int i;
-        for_each_node_mask(i, nodes_parsed) {
-                /*
-                 * Find the real node that this emulated node appears on.  For
-                 * the sake of simplicity, we only use a real node's starting
-                 * address to determine which emulated node it appears on.
-                 */
-                if (addr >= nodes[i].start && addr < nodes[i].end) {
-                        ret = i;
-                        break;
-                }
-        }
-        return ret;
 }
-/*
- * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
- * mappings that respect the real ACPI topology but reflect our emulated
- * environment.  For each emulated node, we find which real node it appears on
- * and create PXM to NID mappings for those fake nodes which mirror that
- * locality.  SLIT will now represent the correct distances between emulated
- * nodes as a result of the real topology.
- */
-void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
-{
-        int i, j;
-        for (i = 0; i < num_nodes; i++) {
-                int nid, pxm;
-                nid = find_node_by_addr(fake_nodes[i].start);
-                if (nid == NUMA_NO_NODE)
-                        continue;
-                pxm = node_to_pxm(nid);
-                if (pxm == PXM_INVAL)
-                        continue;
-                fake_node_to_pxm_map[i] = pxm;
-                /*
-                 * For each apicid_to_node mapping that exists for this real
-                 * node, it must now point to the fake node ID.
-                 */
-                for (j = 0; j < MAX_LOCAL_APIC; j++)
-                        if (apicid_to_node[j] == nid &&
-                            fake_apicid_to_node[j] == NUMA_NO_NODE)
-                                fake_apicid_to_node[j] = i;
-        }
-        /*
-         * If there are apicid-to-node mappings for physical nodes that do not
-         * have a corresponding emulated node, it should default to a guaranteed
-         * value.
-         */
-        for (i = 0; i < MAX_LOCAL_APIC; i++)
-                if (apicid_to_node[i] != NUMA_NO_NODE &&
-                    fake_apicid_to_node[i] == NUMA_NO_NODE)
-                        fake_apicid_to_node[i] = 0;
-        for (i = 0; i < num_nodes; i++)
-                __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
-        memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
-        nodes_clear(nodes_parsed);
-        for (i = 0; i < num_nodes; i++)
-                if (fake_nodes[i].start != fake_nodes[i].end)
-                        node_set(i, nodes_parsed);
-}
-static int null_slit_node_compare(int a, int b)
-{
-        return node_to_pxm(a) == node_to_pxm(b);
-}
-#else
-static int null_slit_node_compare(int a, int b)
-{
-        return a == b;
-}
-#endif /* CONFIG_NUMA_EMU */
-int __node_distance(int a, int b)
-{
-        int index;
-        if (!acpi_slit)
-                return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
-                                                      REMOTE_DISTANCE;
-        index = acpi_slit->locality_count * node_to_pxm(a);
-        return acpi_slit->entry[index + node_to_pxm(b)];
-}
-EXPORT_SYMBOL(__node_distance);
 #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
 int memory_add_physaddr_to_nid(u64 start)
 {
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6acc724d5d8f..d6c0418c3e47 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -179,12 +179,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
        sender = this_cpu_read(tlb_vector_offset);
        f = &flush_state[sender];
-        /*
+        if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
-         * Could avoid this lock when
+                raw_spin_lock(&f->tlbstate_lock);
-         * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
-         * probably not worth checking this for a cache-hot lock.
-         */
-        raw_spin_lock(&f->tlbstate_lock);
        f->flush_mm = mm;
        f->flush_va = va;
@@ -202,7 +198,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
        f->flush_mm = NULL;
        f->flush_va = 0;
-        raw_spin_unlock(&f->tlbstate_lock);
+        if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
+                raw_spin_unlock(&f->tlbstate_lock);
 }
 void native_flush_tlb_others(const struct cpumask *cpumask,
@@ -211,11 +208,10 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
        if (is_uv_system()) {
                unsigned int cpu;
-                cpu = get_cpu();
+                cpu = smp_processor_id();
                cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
                if (cpumask)
                        flush_tlb_others_ipi(cpumask, mm, va);
-                put_cpu();
                return;
        }
        flush_tlb_others_ipi(cpumask, mm, va);
author	Linus Torvalds <torvalds@linux-foundation.org>	2011-03-15 22:49:10 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-03-15 22:49:10 -0400
commit	181f977d134a9f8e3f8839f42af655b045fc059e (patch)
tree	5d9bb67c62ef1476c18ed350106a84c02f0dd8e4 /arch/x86/mm
parent	d5d42399bd7b66bd6b55363b311810504110c967 (diff)
parent	25542c646afbf14c43fa7d2b443055cadb73b07a (diff)