aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2011-03-15 22:49:10 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-03-15 22:49:10 -0400
commit181f977d134a9f8e3f8839f42af655b045fc059e (patch)
tree5d9bb67c62ef1476c18ed350106a84c02f0dd8e4 /arch/x86/mm
parentd5d42399bd7b66bd6b55363b311810504110c967 (diff)
parent25542c646afbf14c43fa7d2b443055cadb73b07a (diff)
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (93 commits) x86, tlb, UV: Do small micro-optimization for native_flush_tlb_others() x86-64, NUMA: Don't call numa_set_distanc() for all possible node combinations during emulation x86-64, NUMA: Don't assume phys node 0 is always online in numa_emulation() x86-64, NUMA: Clean up initmem_init() x86-64, NUMA: Fix numa_emulation code with node0 without RAM x86-64, NUMA: Revert NUMA affine page table allocation x86: Work around old gas bug x86-64, NUMA: Better explain numa_distance handling x86-64, NUMA: Fix distance table handling mm: Move early_node_map[] reverse scan helpers under HAVE_MEMBLOCK x86-64, NUMA: Fix size of numa_distance array x86: Rename e820_table_* to pgt_buf_* bootmem: Move __alloc_memory_core_early() to nobootmem.c bootmem: Move contig_page_data definition to bootmem.c/nobootmem.c bootmem: Separate out CONFIG_NO_BOOTMEM code into nobootmem.c x86-64, NUMA: Seperate out numa_alloc_distance() from numa_set_distance() x86-64, NUMA: Add proper function comments to global functions x86-64, NUMA: Move NUMA emulation into numa_emulation.c x86-64, NUMA: Prepare numa_emulation() for moving NUMA emulation into a separate file x86-64, NUMA: Do not scan two times for setup_node_bootmem() ... Fix up conflicts in arch/x86/kernel/smpboot.c
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/Makefile1
-rw-r--r--arch/x86/mm/amdtopology_64.c142
-rw-r--r--arch/x86/mm/init.c56
-rw-r--r--arch/x86/mm/init_32.c11
-rw-r--r--arch/x86/mm/init_64.c72
-rw-r--r--arch/x86/mm/numa.c212
-rw-r--r--arch/x86/mm/numa_32.c10
-rw-r--r--arch/x86/mm/numa_64.c984
-rw-r--r--arch/x86/mm/numa_emulation.c494
-rw-r--r--arch/x86/mm/numa_internal.h31
-rw-r--r--arch/x86/mm/srat_32.c6
-rw-r--r--arch/x86/mm/srat_64.c367
-rw-r--r--arch/x86/mm/tlb.c14
13 files changed, 1207 insertions, 1193 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 09df2f9a3d69..3e608edf9958 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o 25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
26obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o 26obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o
27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
28obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
28 29
29obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 30obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
30 31
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index f21962c435ed..0919c26820d4 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -26,9 +26,7 @@
26#include <asm/apic.h> 26#include <asm/apic.h>
27#include <asm/amd_nb.h> 27#include <asm/amd_nb.h>
28 28
29static struct bootnode __initdata nodes[8];
30static unsigned char __initdata nodeids[8]; 29static unsigned char __initdata nodeids[8];
31static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
32 30
33static __init int find_northbridge(void) 31static __init int find_northbridge(void)
34{ 32{
@@ -51,7 +49,7 @@ static __init int find_northbridge(void)
51 return num; 49 return num;
52 } 50 }
53 51
54 return -1; 52 return -ENOENT;
55} 53}
56 54
57static __init void early_get_boot_cpu_id(void) 55static __init void early_get_boot_cpu_id(void)
@@ -69,17 +67,18 @@ static __init void early_get_boot_cpu_id(void)
69#endif 67#endif
70} 68}
71 69
72int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) 70int __init amd_numa_init(void)
73{ 71{
74 unsigned long start = PFN_PHYS(start_pfn); 72 unsigned long start = PFN_PHYS(0);
75 unsigned long end = PFN_PHYS(end_pfn); 73 unsigned long end = PFN_PHYS(max_pfn);
76 unsigned numnodes; 74 unsigned numnodes;
77 unsigned long prevbase; 75 unsigned long prevbase;
78 int i, nb, found = 0; 76 int i, j, nb;
79 u32 nodeid, reg; 77 u32 nodeid, reg;
78 unsigned int bits, cores, apicid_base;
80 79
81 if (!early_pci_allowed()) 80 if (!early_pci_allowed())
82 return -1; 81 return -EINVAL;
83 82
84 nb = find_northbridge(); 83 nb = find_northbridge();
85 if (nb < 0) 84 if (nb < 0)
@@ -90,7 +89,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
90 reg = read_pci_config(0, nb, 0, 0x60); 89 reg = read_pci_config(0, nb, 0, 0x60);
91 numnodes = ((reg >> 4) & 0xF) + 1; 90 numnodes = ((reg >> 4) & 0xF) + 1;
92 if (numnodes <= 1) 91 if (numnodes <= 1)
93 return -1; 92 return -ENOENT;
94 93
95 pr_info("Number of physical nodes %d\n", numnodes); 94 pr_info("Number of physical nodes %d\n", numnodes);
96 95
@@ -121,9 +120,9 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
121 if ((base >> 8) & 3 || (limit >> 8) & 3) { 120 if ((base >> 8) & 3 || (limit >> 8) & 3) {
122 pr_err("Node %d using interleaving mode %lx/%lx\n", 121 pr_err("Node %d using interleaving mode %lx/%lx\n",
123 nodeid, (base >> 8) & 3, (limit >> 8) & 3); 122 nodeid, (base >> 8) & 3, (limit >> 8) & 3);
124 return -1; 123 return -EINVAL;
125 } 124 }
126 if (node_isset(nodeid, nodes_parsed)) { 125 if (node_isset(nodeid, numa_nodes_parsed)) {
127 pr_info("Node %d already present, skipping\n", 126 pr_info("Node %d already present, skipping\n",
128 nodeid); 127 nodeid);
129 continue; 128 continue;
@@ -160,117 +159,28 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
160 if (prevbase > base) { 159 if (prevbase > base) {
161 pr_err("Node map not sorted %lx,%lx\n", 160 pr_err("Node map not sorted %lx,%lx\n",
162 prevbase, base); 161 prevbase, base);
163 return -1; 162 return -EINVAL;
164 } 163 }
165 164
166 pr_info("Node %d MemBase %016lx Limit %016lx\n", 165 pr_info("Node %d MemBase %016lx Limit %016lx\n",
167 nodeid, base, limit); 166 nodeid, base, limit);
168 167
169 found++;
170
171 nodes[nodeid].start = base;
172 nodes[nodeid].end = limit;
173
174 prevbase = base; 168 prevbase = base;
175 169 numa_add_memblk(nodeid, base, limit);
176 node_set(nodeid, nodes_parsed); 170 node_set(nodeid, numa_nodes_parsed);
177 }
178
179 if (!found)
180 return -1;
181 return 0;
182}
183
184#ifdef CONFIG_NUMA_EMU
185static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
186 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
187};
188
189void __init amd_get_nodes(struct bootnode *physnodes)
190{
191 int i;
192
193 for_each_node_mask(i, nodes_parsed) {
194 physnodes[i].start = nodes[i].start;
195 physnodes[i].end = nodes[i].end;
196 } 171 }
197}
198
199static int __init find_node_by_addr(unsigned long addr)
200{
201 int ret = NUMA_NO_NODE;
202 int i;
203
204 for (i = 0; i < 8; i++)
205 if (addr >= nodes[i].start && addr < nodes[i].end) {
206 ret = i;
207 break;
208 }
209 return ret;
210}
211 172
212/* 173 if (!nodes_weight(numa_nodes_parsed))
213 * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be 174 return -ENOENT;
214 * setup to represent the physical topology but reflect the emulated
215 * environment. For each emulated node, the real node which it appears on is
216 * found and a fake pxm to nid mapping is created which mirrors the actual
217 * locality. node_distance() then represents the correct distances between
218 * emulated nodes by using the fake acpi mappings to pxms.
219 */
220void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
221{
222 unsigned int bits;
223 unsigned int cores;
224 unsigned int apicid_base = 0;
225 int i;
226 175
176 /*
177 * We seem to have valid NUMA configuration. Map apicids to nodes
178 * using the coreid bits from early_identify_cpu.
179 */
227 bits = boot_cpu_data.x86_coreid_bits; 180 bits = boot_cpu_data.x86_coreid_bits;
228 cores = 1 << bits; 181 cores = 1 << bits;
229 early_get_boot_cpu_id();
230 if (boot_cpu_physical_apicid > 0)
231 apicid_base = boot_cpu_physical_apicid;
232
233 for (i = 0; i < nr_nodes; i++) {
234 int index;
235 int nid;
236 int j;
237
238 nid = find_node_by_addr(nodes[i].start);
239 if (nid == NUMA_NO_NODE)
240 continue;
241
242 index = nodeids[nid] << bits;
243 if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE)
244 for (j = apicid_base; j < cores + apicid_base; j++)
245 fake_apicid_to_node[index + j] = i;
246#ifdef CONFIG_ACPI_NUMA
247 __acpi_map_pxm_to_node(nid, i);
248#endif
249 }
250 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
251}
252#endif /* CONFIG_NUMA_EMU */
253
254int __init amd_scan_nodes(void)
255{
256 unsigned int bits;
257 unsigned int cores;
258 unsigned int apicid_base;
259 int i;
260
261 BUG_ON(nodes_empty(nodes_parsed));
262 node_possible_map = nodes_parsed;
263 memnode_shift = compute_hash_shift(nodes, 8, NULL);
264 if (memnode_shift < 0) {
265 pr_err("No NUMA node hash function found. Contact maintainer\n");
266 return -1;
267 }
268 pr_info("Using node hash shift of %d\n", memnode_shift);
269
270 /* use the coreid bits from early_identify_cpu */
271 bits = boot_cpu_data.x86_coreid_bits;
272 cores = (1<<bits);
273 apicid_base = 0; 182 apicid_base = 0;
183
274 /* get the APIC ID of the BSP early for systems with apicid lifting */ 184 /* get the APIC ID of the BSP early for systems with apicid lifting */
275 early_get_boot_cpu_id(); 185 early_get_boot_cpu_id();
276 if (boot_cpu_physical_apicid > 0) { 186 if (boot_cpu_physical_apicid > 0) {
@@ -278,17 +188,9 @@ int __init amd_scan_nodes(void)
278 apicid_base = boot_cpu_physical_apicid; 188 apicid_base = boot_cpu_physical_apicid;
279 } 189 }
280 190
281 for_each_node_mask(i, node_possible_map) { 191 for_each_node_mask(i, numa_nodes_parsed)
282 int j;
283
284 memblock_x86_register_active_regions(i,
285 nodes[i].start >> PAGE_SHIFT,
286 nodes[i].end >> PAGE_SHIFT);
287 for (j = apicid_base; j < cores + apicid_base; j++) 192 for (j = apicid_base; j < cores + apicid_base; j++)
288 apicid_to_node[(i << bits) + j] = i; 193 set_apicid_to_node((i << bits) + j, i);
289 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
290 }
291 194
292 numa_init_array();
293 return 0; 195 return 0;
294} 196}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 947f42abe820..286d289b039b 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -18,9 +18,9 @@
18 18
19DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 19DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
20 20
21unsigned long __initdata e820_table_start; 21unsigned long __initdata pgt_buf_start;
22unsigned long __meminitdata e820_table_end; 22unsigned long __meminitdata pgt_buf_end;
23unsigned long __meminitdata e820_table_top; 23unsigned long __meminitdata pgt_buf_top;
24 24
25int after_bootmem; 25int after_bootmem;
26 26
@@ -33,7 +33,7 @@ int direct_gbpages
33static void __init find_early_table_space(unsigned long end, int use_pse, 33static void __init find_early_table_space(unsigned long end, int use_pse,
34 int use_gbpages) 34 int use_gbpages)
35{ 35{
36 unsigned long puds, pmds, ptes, tables, start; 36 unsigned long puds, pmds, ptes, tables, start = 0, good_end = end;
37 phys_addr_t base; 37 phys_addr_t base;
38 38
39 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 39 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
@@ -65,29 +65,20 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
65#ifdef CONFIG_X86_32 65#ifdef CONFIG_X86_32
66 /* for fixmap */ 66 /* for fixmap */
67 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); 67 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
68#endif
69 68
70 /* 69 good_end = max_pfn_mapped << PAGE_SHIFT;
71 * RED-PEN putting page tables only on node 0 could
72 * cause a hotspot and fill up ZONE_DMA. The page tables
73 * need roughly 0.5KB per GB.
74 */
75#ifdef CONFIG_X86_32
76 start = 0x7000;
77#else
78 start = 0x8000;
79#endif 70#endif
80 base = memblock_find_in_range(start, max_pfn_mapped<<PAGE_SHIFT, 71
81 tables, PAGE_SIZE); 72 base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
82 if (base == MEMBLOCK_ERROR) 73 if (base == MEMBLOCK_ERROR)
83 panic("Cannot find space for the kernel page tables"); 74 panic("Cannot find space for the kernel page tables");
84 75
85 e820_table_start = base >> PAGE_SHIFT; 76 pgt_buf_start = base >> PAGE_SHIFT;
86 e820_table_end = e820_table_start; 77 pgt_buf_end = pgt_buf_start;
87 e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); 78 pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
88 79
89 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", 80 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
90 end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); 81 end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
91} 82}
92 83
93struct map_range { 84struct map_range {
@@ -279,30 +270,11 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
279 load_cr3(swapper_pg_dir); 270 load_cr3(swapper_pg_dir);
280#endif 271#endif
281 272
282#ifdef CONFIG_X86_64
283 if (!after_bootmem && !start) {
284 pud_t *pud;
285 pmd_t *pmd;
286
287 mmu_cr4_features = read_cr4();
288
289 /*
290 * _brk_end cannot change anymore, but it and _end may be
291 * located on different 2M pages. cleanup_highmap(), however,
292 * can only consider _end when it runs, so destroy any
293 * mappings beyond _brk_end here.
294 */
295 pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
296 pmd = pmd_offset(pud, _brk_end - 1);
297 while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
298 pmd_clear(pmd);
299 }
300#endif
301 __flush_tlb_all(); 273 __flush_tlb_all();
302 274
303 if (!after_bootmem && e820_table_end > e820_table_start) 275 if (!after_bootmem && pgt_buf_end > pgt_buf_start)
304 memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT, 276 memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT,
305 e820_table_end << PAGE_SHIFT, "PGTABLE"); 277 pgt_buf_end << PAGE_SHIFT, "PGTABLE");
306 278
307 if (!after_bootmem) 279 if (!after_bootmem)
308 early_memtest(start, end); 280 early_memtest(start, end);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c821074b7f0b..73ad7ebd6e9c 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -62,10 +62,10 @@ bool __read_mostly __vmalloc_start_set = false;
62 62
63static __init void *alloc_low_page(void) 63static __init void *alloc_low_page(void)
64{ 64{
65 unsigned long pfn = e820_table_end++; 65 unsigned long pfn = pgt_buf_end++;
66 void *adr; 66 void *adr;
67 67
68 if (pfn >= e820_table_top) 68 if (pfn >= pgt_buf_top)
69 panic("alloc_low_page: ran out of memory"); 69 panic("alloc_low_page: ran out of memory");
70 70
71 adr = __va(pfn * PAGE_SIZE); 71 adr = __va(pfn * PAGE_SIZE);
@@ -163,8 +163,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
163 if (pmd_idx_kmap_begin != pmd_idx_kmap_end 163 if (pmd_idx_kmap_begin != pmd_idx_kmap_end
164 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin 164 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
165 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end 165 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
166 && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start 166 && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
167 || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { 167 || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
168 pte_t *newpte; 168 pte_t *newpte;
169 int i; 169 int i;
170 170
@@ -644,8 +644,7 @@ void __init find_low_pfn_range(void)
644} 644}
645 645
646#ifndef CONFIG_NEED_MULTIPLE_NODES 646#ifndef CONFIG_NEED_MULTIPLE_NODES
647void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 647void __init initmem_init(void)
648 int acpi, int k8)
649{ 648{
650#ifdef CONFIG_HIGHMEM 649#ifdef CONFIG_HIGHMEM
651 highstart_pfn = highend_pfn = max_pfn; 650 highstart_pfn = highend_pfn = max_pfn;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index c14a5422e152..a08a62cb136e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -314,7 +314,7 @@ void __init cleanup_highmap(void)
314 314
315static __ref void *alloc_low_page(unsigned long *phys) 315static __ref void *alloc_low_page(unsigned long *phys)
316{ 316{
317 unsigned long pfn = e820_table_end++; 317 unsigned long pfn = pgt_buf_end++;
318 void *adr; 318 void *adr;
319 319
320 if (after_bootmem) { 320 if (after_bootmem) {
@@ -324,7 +324,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
324 return adr; 324 return adr;
325 } 325 }
326 326
327 if (pfn >= e820_table_top) 327 if (pfn >= pgt_buf_top)
328 panic("alloc_low_page: ran out of memory"); 328 panic("alloc_low_page: ran out of memory");
329 329
330 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); 330 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -333,12 +333,28 @@ static __ref void *alloc_low_page(unsigned long *phys)
333 return adr; 333 return adr;
334} 334}
335 335
336static __ref void *map_low_page(void *virt)
337{
338 void *adr;
339 unsigned long phys, left;
340
341 if (after_bootmem)
342 return virt;
343
344 phys = __pa(virt);
345 left = phys & (PAGE_SIZE - 1);
346 adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
347 adr = (void *)(((unsigned long)adr) | left);
348
349 return adr;
350}
351
336static __ref void unmap_low_page(void *adr) 352static __ref void unmap_low_page(void *adr)
337{ 353{
338 if (after_bootmem) 354 if (after_bootmem)
339 return; 355 return;
340 356
341 early_iounmap(adr, PAGE_SIZE); 357 early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
342} 358}
343 359
344static unsigned long __meminit 360static unsigned long __meminit
@@ -386,15 +402,6 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
386} 402}
387 403
388static unsigned long __meminit 404static unsigned long __meminit
389phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
390 pgprot_t prot)
391{
392 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
393
394 return phys_pte_init(pte, address, end, prot);
395}
396
397static unsigned long __meminit
398phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, 405phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
399 unsigned long page_size_mask, pgprot_t prot) 406 unsigned long page_size_mask, pgprot_t prot)
400{ 407{
@@ -420,8 +427,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
420 if (pmd_val(*pmd)) { 427 if (pmd_val(*pmd)) {
421 if (!pmd_large(*pmd)) { 428 if (!pmd_large(*pmd)) {
422 spin_lock(&init_mm.page_table_lock); 429 spin_lock(&init_mm.page_table_lock);
423 last_map_addr = phys_pte_update(pmd, address, 430 pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
431 last_map_addr = phys_pte_init(pte, address,
424 end, prot); 432 end, prot);
433 unmap_low_page(pte);
425 spin_unlock(&init_mm.page_table_lock); 434 spin_unlock(&init_mm.page_table_lock);
426 continue; 435 continue;
427 } 436 }
@@ -468,18 +477,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
468} 477}
469 478
470static unsigned long __meminit 479static unsigned long __meminit
471phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
472 unsigned long page_size_mask, pgprot_t prot)
473{
474 pmd_t *pmd = pmd_offset(pud, 0);
475 unsigned long last_map_addr;
476
477 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
478 __flush_tlb_all();
479 return last_map_addr;
480}
481
482static unsigned long __meminit
483phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, 480phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
484 unsigned long page_size_mask) 481 unsigned long page_size_mask)
485{ 482{
@@ -504,8 +501,11 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
504 501
505 if (pud_val(*pud)) { 502 if (pud_val(*pud)) {
506 if (!pud_large(*pud)) { 503 if (!pud_large(*pud)) {
507 last_map_addr = phys_pmd_update(pud, addr, end, 504 pmd = map_low_page(pmd_offset(pud, 0));
505 last_map_addr = phys_pmd_init(pmd, addr, end,
508 page_size_mask, prot); 506 page_size_mask, prot);
507 unmap_low_page(pmd);
508 __flush_tlb_all();
509 continue; 509 continue;
510 } 510 }
511 /* 511 /*
@@ -553,17 +553,6 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
553 return last_map_addr; 553 return last_map_addr;
554} 554}
555 555
556static unsigned long __meminit
557phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
558 unsigned long page_size_mask)
559{
560 pud_t *pud;
561
562 pud = (pud_t *)pgd_page_vaddr(*pgd);
563
564 return phys_pud_init(pud, addr, end, page_size_mask);
565}
566
567unsigned long __meminit 556unsigned long __meminit
568kernel_physical_mapping_init(unsigned long start, 557kernel_physical_mapping_init(unsigned long start,
569 unsigned long end, 558 unsigned long end,
@@ -587,8 +576,10 @@ kernel_physical_mapping_init(unsigned long start,
587 next = end; 576 next = end;
588 577
589 if (pgd_val(*pgd)) { 578 if (pgd_val(*pgd)) {
590 last_map_addr = phys_pud_update(pgd, __pa(start), 579 pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
580 last_map_addr = phys_pud_init(pud, __pa(start),
591 __pa(end), page_size_mask); 581 __pa(end), page_size_mask);
582 unmap_low_page(pud);
592 continue; 583 continue;
593 } 584 }
594 585
@@ -612,10 +603,9 @@ kernel_physical_mapping_init(unsigned long start,
612} 603}
613 604
614#ifndef CONFIG_NUMA 605#ifndef CONFIG_NUMA
615void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 606void __init initmem_init(void)
616 int acpi, int k8)
617{ 607{
618 memblock_x86_register_active_regions(0, start_pfn, end_pfn); 608 memblock_x86_register_active_regions(0, 0, max_pfn);
619} 609}
620#endif 610#endif
621 611
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index ebf6d7887a38..9559d360fde7 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -26,12 +26,50 @@ static __init int numa_setup(char *opt)
26early_param("numa", numa_setup); 26early_param("numa", numa_setup);
27 27
28/* 28/*
29 * Which logical CPUs are on which nodes 29 * apicid, cpu, node mappings
30 */ 30 */
31s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
32 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
33};
34
31cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 35cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
32EXPORT_SYMBOL(node_to_cpumask_map); 36EXPORT_SYMBOL(node_to_cpumask_map);
33 37
34/* 38/*
39 * Map cpu index to node index
40 */
41DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
42EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
43
44void __cpuinit numa_set_node(int cpu, int node)
45{
46 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
47
48 /* early setting, no percpu area yet */
49 if (cpu_to_node_map) {
50 cpu_to_node_map[cpu] = node;
51 return;
52 }
53
54#ifdef CONFIG_DEBUG_PER_CPU_MAPS
55 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
56 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
57 dump_stack();
58 return;
59 }
60#endif
61 per_cpu(x86_cpu_to_node_map, cpu) = node;
62
63 if (node != NUMA_NO_NODE)
64 set_cpu_numa_node(cpu, node);
65}
66
67void __cpuinit numa_clear_node(int cpu)
68{
69 numa_set_node(cpu, NUMA_NO_NODE);
70}
71
72/*
35 * Allocate node_to_cpumask_map based on number of available nodes 73 * Allocate node_to_cpumask_map based on number of available nodes
36 * Requires node_possible_map to be valid. 74 * Requires node_possible_map to be valid.
37 * 75 *
@@ -57,7 +95,174 @@ void __init setup_node_to_cpumask_map(void)
57 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); 95 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
58} 96}
59 97
60#ifdef CONFIG_DEBUG_PER_CPU_MAPS 98/*
99 * There are unfortunately some poorly designed mainboards around that
100 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
101 * mapping. To avoid this fill in the mapping for all possible CPUs,
102 * as the number of CPUs is not known yet. We round robin the existing
103 * nodes.
104 */
105void __init numa_init_array(void)
106{
107 int rr, i;
108
109 rr = first_node(node_online_map);
110 for (i = 0; i < nr_cpu_ids; i++) {
111 if (early_cpu_to_node(i) != NUMA_NO_NODE)
112 continue;
113 numa_set_node(i, rr);
114 rr = next_node(rr, node_online_map);
115 if (rr == MAX_NUMNODES)
116 rr = first_node(node_online_map);
117 }
118}
119
120static __init int find_near_online_node(int node)
121{
122 int n, val;
123 int min_val = INT_MAX;
124 int best_node = -1;
125
126 for_each_online_node(n) {
127 val = node_distance(node, n);
128
129 if (val < min_val) {
130 min_val = val;
131 best_node = n;
132 }
133 }
134
135 return best_node;
136}
137
138/*
139 * Setup early cpu_to_node.
140 *
141 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
142 * and apicid_to_node[] tables have valid entries for a CPU.
143 * This means we skip cpu_to_node[] initialisation for NUMA
144 * emulation and faking node case (when running a kernel compiled
145 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
146 * is already initialized in a round robin manner at numa_init_array,
147 * prior to this call, and this initialization is good enough
148 * for the fake NUMA cases.
149 *
150 * Called before the per_cpu areas are setup.
151 */
152void __init init_cpu_to_node(void)
153{
154 int cpu;
155 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
156
157 BUG_ON(cpu_to_apicid == NULL);
158
159 for_each_possible_cpu(cpu) {
160 int node = numa_cpu_node(cpu);
161
162 if (node == NUMA_NO_NODE)
163 continue;
164 if (!node_online(node))
165 node = find_near_online_node(node);
166 numa_set_node(cpu, node);
167 }
168}
169
170#ifndef CONFIG_DEBUG_PER_CPU_MAPS
171
172# ifndef CONFIG_NUMA_EMU
173void __cpuinit numa_add_cpu(int cpu)
174{
175 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
176}
177
178void __cpuinit numa_remove_cpu(int cpu)
179{
180 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
181}
182# endif /* !CONFIG_NUMA_EMU */
183
184#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
185
186int __cpu_to_node(int cpu)
187{
188 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
189 printk(KERN_WARNING
190 "cpu_to_node(%d): usage too early!\n", cpu);
191 dump_stack();
192 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
193 }
194 return per_cpu(x86_cpu_to_node_map, cpu);
195}
196EXPORT_SYMBOL(__cpu_to_node);
197
198/*
199 * Same function as cpu_to_node() but used if called before the
200 * per_cpu areas are setup.
201 */
202int early_cpu_to_node(int cpu)
203{
204 if (early_per_cpu_ptr(x86_cpu_to_node_map))
205 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
206
207 if (!cpu_possible(cpu)) {
208 printk(KERN_WARNING
209 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
210 dump_stack();
211 return NUMA_NO_NODE;
212 }
213 return per_cpu(x86_cpu_to_node_map, cpu);
214}
215
216struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
217{
218 int node = early_cpu_to_node(cpu);
219 struct cpumask *mask;
220 char buf[64];
221
222 if (node == NUMA_NO_NODE) {
223 /* early_cpu_to_node() already emits a warning and trace */
224 return NULL;
225 }
226 mask = node_to_cpumask_map[node];
227 if (!mask) {
228 pr_err("node_to_cpumask_map[%i] NULL\n", node);
229 dump_stack();
230 return NULL;
231 }
232
233 cpulist_scnprintf(buf, sizeof(buf), mask);
234 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
235 enable ? "numa_add_cpu" : "numa_remove_cpu",
236 cpu, node, buf);
237 return mask;
238}
239
240# ifndef CONFIG_NUMA_EMU
241static void __cpuinit numa_set_cpumask(int cpu, int enable)
242{
243 struct cpumask *mask;
244
245 mask = debug_cpumask_set_cpu(cpu, enable);
246 if (!mask)
247 return;
248
249 if (enable)
250 cpumask_set_cpu(cpu, mask);
251 else
252 cpumask_clear_cpu(cpu, mask);
253}
254
255void __cpuinit numa_add_cpu(int cpu)
256{
257 numa_set_cpumask(cpu, 1);
258}
259
260void __cpuinit numa_remove_cpu(int cpu)
261{
262 numa_set_cpumask(cpu, 0);
263}
264# endif /* !CONFIG_NUMA_EMU */
265
61/* 266/*
62 * Returns a pointer to the bitmask of CPUs on Node 'node'. 267 * Returns a pointer to the bitmask of CPUs on Node 'node'.
63 */ 268 */
@@ -80,4 +285,5 @@ const struct cpumask *cpumask_of_node(int node)
80 return node_to_cpumask_map[node]; 285 return node_to_cpumask_map[node];
81} 286}
82EXPORT_SYMBOL(cpumask_of_node); 287EXPORT_SYMBOL(cpumask_of_node);
83#endif 288
289#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 84a3e4c9f277..bde3906420df 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -110,6 +110,12 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
110 110
111static unsigned long kva_start_pfn; 111static unsigned long kva_start_pfn;
112static unsigned long kva_pages; 112static unsigned long kva_pages;
113
114int __cpuinit numa_cpu_node(int cpu)
115{
116 return apic->x86_32_numa_cpu_node(cpu);
117}
118
113/* 119/*
114 * FLAT - support for basic PC memory model with discontig enabled, essentially 120 * FLAT - support for basic PC memory model with discontig enabled, essentially
115 * a single node with all available processors in it with a flat 121 * a single node with all available processors in it with a flat
@@ -346,8 +352,7 @@ static void init_remap_allocator(int nid)
346 (ulong) node_remap_end_vaddr[nid]); 352 (ulong) node_remap_end_vaddr[nid]);
347} 353}
348 354
349void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 355void __init initmem_init(void)
350 int acpi, int k8)
351{ 356{
352 int nid; 357 int nid;
353 long kva_target_pfn; 358 long kva_target_pfn;
@@ -361,6 +366,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
361 */ 366 */
362 367
363 get_memcfg_numa(); 368 get_memcfg_numa();
369 numa_init_array();
364 370
365 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); 371 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
366 372
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 1337c51b07d7..9ec0f209a6a4 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -13,31 +13,30 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/nodemask.h> 14#include <linux/nodemask.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/acpi.h>
16 17
17#include <asm/e820.h> 18#include <asm/e820.h>
18#include <asm/proto.h> 19#include <asm/proto.h>
19#include <asm/dma.h> 20#include <asm/dma.h>
20#include <asm/numa.h>
21#include <asm/acpi.h> 21#include <asm/acpi.h>
22#include <asm/amd_nb.h> 22#include <asm/amd_nb.h>
23 23
24#include "numa_internal.h"
25
24struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 26struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
25EXPORT_SYMBOL(node_data); 27EXPORT_SYMBOL(node_data);
26 28
27struct memnode memnode; 29nodemask_t numa_nodes_parsed __initdata;
28 30
29s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 31struct memnode memnode;
30 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
31};
32 32
33static unsigned long __initdata nodemap_addr; 33static unsigned long __initdata nodemap_addr;
34static unsigned long __initdata nodemap_size; 34static unsigned long __initdata nodemap_size;
35 35
36/* 36static struct numa_meminfo numa_meminfo __initdata;
37 * Map cpu index to node index 37
38 */ 38static int numa_distance_cnt;
39DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); 39static u8 *numa_distance;
40EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
41 40
42/* 41/*
43 * Given a shift value, try to populate memnodemap[] 42 * Given a shift value, try to populate memnodemap[]
@@ -46,16 +45,15 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
46 * 0 if memnodmap[] too small (of shift too small) 45 * 0 if memnodmap[] too small (of shift too small)
47 * -1 if node overlap or lost ram (shift too big) 46 * -1 if node overlap or lost ram (shift too big)
48 */ 47 */
49static int __init populate_memnodemap(const struct bootnode *nodes, 48static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
50 int numnodes, int shift, int *nodeids)
51{ 49{
52 unsigned long addr, end; 50 unsigned long addr, end;
53 int i, res = -1; 51 int i, res = -1;
54 52
55 memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); 53 memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
56 for (i = 0; i < numnodes; i++) { 54 for (i = 0; i < mi->nr_blks; i++) {
57 addr = nodes[i].start; 55 addr = mi->blk[i].start;
58 end = nodes[i].end; 56 end = mi->blk[i].end;
59 if (addr >= end) 57 if (addr >= end)
60 continue; 58 continue;
61 if ((end >> shift) >= memnodemapsize) 59 if ((end >> shift) >= memnodemapsize)
@@ -63,12 +61,7 @@ static int __init populate_memnodemap(const struct bootnode *nodes,
63 do { 61 do {
64 if (memnodemap[addr >> shift] != NUMA_NO_NODE) 62 if (memnodemap[addr >> shift] != NUMA_NO_NODE)
65 return -1; 63 return -1;
66 64 memnodemap[addr >> shift] = mi->blk[i].nid;
67 if (!nodeids)
68 memnodemap[addr >> shift] = i;
69 else
70 memnodemap[addr >> shift] = nodeids[i];
71
72 addr += (1UL << shift); 65 addr += (1UL << shift);
73 } while (addr < end); 66 } while (addr < end);
74 res = 1; 67 res = 1;
@@ -86,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void)
86 79
87 addr = 0x8000; 80 addr = 0x8000;
88 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); 81 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
89 nodemap_addr = memblock_find_in_range(addr, max_pfn<<PAGE_SHIFT, 82 nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
90 nodemap_size, L1_CACHE_BYTES); 83 nodemap_size, L1_CACHE_BYTES);
91 if (nodemap_addr == MEMBLOCK_ERROR) { 84 if (nodemap_addr == MEMBLOCK_ERROR) {
92 printk(KERN_ERR 85 printk(KERN_ERR
@@ -106,16 +99,15 @@ static int __init allocate_cachealigned_memnodemap(void)
106 * The LSB of all start and end addresses in the node map is the value of the 99 * The LSB of all start and end addresses in the node map is the value of the
107 * maximum possible shift. 100 * maximum possible shift.
108 */ 101 */
109static int __init extract_lsb_from_nodes(const struct bootnode *nodes, 102static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
110 int numnodes)
111{ 103{
112 int i, nodes_used = 0; 104 int i, nodes_used = 0;
113 unsigned long start, end; 105 unsigned long start, end;
114 unsigned long bitfield = 0, memtop = 0; 106 unsigned long bitfield = 0, memtop = 0;
115 107
116 for (i = 0; i < numnodes; i++) { 108 for (i = 0; i < mi->nr_blks; i++) {
117 start = nodes[i].start; 109 start = mi->blk[i].start;
118 end = nodes[i].end; 110 end = mi->blk[i].end;
119 if (start >= end) 111 if (start >= end)
120 continue; 112 continue;
121 bitfield |= start; 113 bitfield |= start;
@@ -131,18 +123,17 @@ static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
131 return i; 123 return i;
132} 124}
133 125
134int __init compute_hash_shift(struct bootnode *nodes, int numnodes, 126static int __init compute_hash_shift(const struct numa_meminfo *mi)
135 int *nodeids)
136{ 127{
137 int shift; 128 int shift;
138 129
139 shift = extract_lsb_from_nodes(nodes, numnodes); 130 shift = extract_lsb_from_nodes(mi);
140 if (allocate_cachealigned_memnodemap()) 131 if (allocate_cachealigned_memnodemap())
141 return -1; 132 return -1;
142 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", 133 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
143 shift); 134 shift);
144 135
145 if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) { 136 if (populate_memnodemap(mi, shift) != 1) {
146 printk(KERN_INFO "Your memory is not aligned you need to " 137 printk(KERN_INFO "Your memory is not aligned you need to "
147 "rebuild your kernel with a bigger NODEMAPSIZE " 138 "rebuild your kernel with a bigger NODEMAPSIZE "
148 "shift=%d\n", shift); 139 "shift=%d\n", shift);
@@ -188,6 +179,63 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
188 return NULL; 179 return NULL;
189} 180}
190 181
182static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
183 struct numa_meminfo *mi)
184{
185 /* ignore zero length blks */
186 if (start == end)
187 return 0;
188
189 /* whine about and ignore invalid blks */
190 if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
191 pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
192 nid, start, end);
193 return 0;
194 }
195
196 if (mi->nr_blks >= NR_NODE_MEMBLKS) {
197 pr_err("NUMA: too many memblk ranges\n");
198 return -EINVAL;
199 }
200
201 mi->blk[mi->nr_blks].start = start;
202 mi->blk[mi->nr_blks].end = end;
203 mi->blk[mi->nr_blks].nid = nid;
204 mi->nr_blks++;
205 return 0;
206}
207
208/**
209 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
210 * @idx: Index of memblk to remove
211 * @mi: numa_meminfo to remove memblk from
212 *
213 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
214 * decrementing @mi->nr_blks.
215 */
216void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
217{
218 mi->nr_blks--;
219 memmove(&mi->blk[idx], &mi->blk[idx + 1],
220 (mi->nr_blks - idx) * sizeof(mi->blk[0]));
221}
222
223/**
224 * numa_add_memblk - Add one numa_memblk to numa_meminfo
225 * @nid: NUMA node ID of the new memblk
226 * @start: Start address of the new memblk
227 * @end: End address of the new memblk
228 *
229 * Add a new memblk to the default numa_meminfo.
230 *
231 * RETURNS:
232 * 0 on success, -errno on failure.
233 */
234int __init numa_add_memblk(int nid, u64 start, u64 end)
235{
236 return numa_add_memblk_to(nid, start, end, &numa_meminfo);
237}
238
191/* Initialize bootmem allocator for a node */ 239/* Initialize bootmem allocator for a node */
192void __init 240void __init
193setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) 241setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
@@ -234,692 +282,386 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
234 node_set_online(nodeid); 282 node_set_online(nodeid);
235} 283}
236 284
237/* 285/**
238 * There are unfortunately some poorly designed mainboards around that 286 * numa_cleanup_meminfo - Cleanup a numa_meminfo
239 * only connect memory to a single CPU. This breaks the 1:1 cpu->node 287 * @mi: numa_meminfo to clean up
240 * mapping. To avoid this fill in the mapping for all possible CPUs, 288 *
241 * as the number of CPUs is not known yet. We round robin the existing 289 * Sanitize @mi by merging and removing unncessary memblks. Also check for
242 * nodes. 290 * conflicts and clear unused memblks.
291 *
292 * RETURNS:
293 * 0 on success, -errno on failure.
243 */ 294 */
244void __init numa_init_array(void) 295int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
245{ 296{
246 int rr, i; 297 const u64 low = 0;
298 const u64 high = (u64)max_pfn << PAGE_SHIFT;
299 int i, j, k;
247 300
248 rr = first_node(node_online_map); 301 for (i = 0; i < mi->nr_blks; i++) {
249 for (i = 0; i < nr_cpu_ids; i++) { 302 struct numa_memblk *bi = &mi->blk[i];
250 if (early_cpu_to_node(i) != NUMA_NO_NODE)
251 continue;
252 numa_set_node(i, rr);
253 rr = next_node(rr, node_online_map);
254 if (rr == MAX_NUMNODES)
255 rr = first_node(node_online_map);
256 }
257}
258
259#ifdef CONFIG_NUMA_EMU
260/* Numa emulation */
261static struct bootnode nodes[MAX_NUMNODES] __initdata;
262static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
263static char *cmdline __initdata;
264 303
265void __init numa_emu_cmdline(char *str) 304 /* make sure all blocks are inside the limits */
266{ 305 bi->start = max(bi->start, low);
267 cmdline = str; 306 bi->end = min(bi->end, high);
268}
269 307
270static int __init setup_physnodes(unsigned long start, unsigned long end, 308 /* and there's no empty block */
271 int acpi, int amd) 309 if (bi->start == bi->end) {
272{ 310 numa_remove_memblk_from(i--, mi);
273 int ret = 0;
274 int i;
275
276 memset(physnodes, 0, sizeof(physnodes));
277#ifdef CONFIG_ACPI_NUMA
278 if (acpi)
279 acpi_get_nodes(physnodes, start, end);
280#endif
281#ifdef CONFIG_AMD_NUMA
282 if (amd)
283 amd_get_nodes(physnodes);
284#endif
285 /*
286 * Basic sanity checking on the physical node map: there may be errors
287 * if the SRAT or AMD code incorrectly reported the topology or the mem=
288 * kernel parameter is used.
289 */
290 for (i = 0; i < MAX_NUMNODES; i++) {
291 if (physnodes[i].start == physnodes[i].end)
292 continue;
293 if (physnodes[i].start > end) {
294 physnodes[i].end = physnodes[i].start;
295 continue;
296 }
297 if (physnodes[i].end < start) {
298 physnodes[i].start = physnodes[i].end;
299 continue; 311 continue;
300 } 312 }
301 if (physnodes[i].start < start)
302 physnodes[i].start = start;
303 if (physnodes[i].end > end)
304 physnodes[i].end = end;
305 ret++;
306 }
307 313
308 /* 314 for (j = i + 1; j < mi->nr_blks; j++) {
309 * If no physical topology was detected, a single node is faked to cover 315 struct numa_memblk *bj = &mi->blk[j];
310 * the entire address space. 316 unsigned long start, end;
311 */
312 if (!ret) {
313 physnodes[ret].start = start;
314 physnodes[ret].end = end;
315 ret = 1;
316 }
317 return ret;
318}
319
320static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
321{
322 int i;
323
324 BUG_ON(acpi && amd);
325#ifdef CONFIG_ACPI_NUMA
326 if (acpi)
327 acpi_fake_nodes(nodes, nr_nodes);
328#endif
329#ifdef CONFIG_AMD_NUMA
330 if (amd)
331 amd_fake_nodes(nodes, nr_nodes);
332#endif
333 if (!acpi && !amd)
334 for (i = 0; i < nr_cpu_ids; i++)
335 numa_set_node(i, 0);
336}
337
338/*
339 * Setups up nid to range from addr to addr + size. If the end
340 * boundary is greater than max_addr, then max_addr is used instead.
341 * The return value is 0 if there is additional memory left for
342 * allocation past addr and -1 otherwise. addr is adjusted to be at
343 * the end of the node.
344 */
345static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
346{
347 int ret = 0;
348 nodes[nid].start = *addr;
349 *addr += size;
350 if (*addr >= max_addr) {
351 *addr = max_addr;
352 ret = -1;
353 }
354 nodes[nid].end = *addr;
355 node_set(nid, node_possible_map);
356 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
357 nodes[nid].start, nodes[nid].end,
358 (nodes[nid].end - nodes[nid].start) >> 20);
359 return ret;
360}
361
362/*
363 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
364 * to max_addr. The return value is the number of nodes allocated.
365 */
366static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
367{
368 nodemask_t physnode_mask = NODE_MASK_NONE;
369 u64 size;
370 int big;
371 int ret = 0;
372 int i;
373
374 if (nr_nodes <= 0)
375 return -1;
376 if (nr_nodes > MAX_NUMNODES) {
377 pr_info("numa=fake=%d too large, reducing to %d\n",
378 nr_nodes, MAX_NUMNODES);
379 nr_nodes = MAX_NUMNODES;
380 }
381
382 size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
383 /*
384 * Calculate the number of big nodes that can be allocated as a result
385 * of consolidating the remainder.
386 */
387 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
388 FAKE_NODE_MIN_SIZE;
389
390 size &= FAKE_NODE_MIN_HASH_MASK;
391 if (!size) {
392 pr_err("Not enough memory for each node. "
393 "NUMA emulation disabled.\n");
394 return -1;
395 }
396
397 for (i = 0; i < MAX_NUMNODES; i++)
398 if (physnodes[i].start != physnodes[i].end)
399 node_set(i, physnode_mask);
400
401 /*
402 * Continue to fill physical nodes with fake nodes until there is no
403 * memory left on any of them.
404 */
405 while (nodes_weight(physnode_mask)) {
406 for_each_node_mask(i, physnode_mask) {
407 u64 end = physnodes[i].start + size;
408 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
409
410 if (ret < big)
411 end += FAKE_NODE_MIN_SIZE;
412 317
413 /* 318 /*
414 * Continue to add memory to this fake node if its 319 * See whether there are overlapping blocks. Whine
415 * non-reserved memory is less than the per-node size. 320 * about but allow overlaps of the same nid. They
321 * will be merged below.
416 */ 322 */
417 while (end - physnodes[i].start - 323 if (bi->end > bj->start && bi->start < bj->end) {
418 memblock_x86_hole_size(physnodes[i].start, end) < size) { 324 if (bi->nid != bj->nid) {
419 end += FAKE_NODE_MIN_SIZE; 325 pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
420 if (end > physnodes[i].end) { 326 bi->nid, bi->start, bi->end,
421 end = physnodes[i].end; 327 bj->nid, bj->start, bj->end);
422 break; 328 return -EINVAL;
423 } 329 }
330 pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
331 bi->nid, bi->start, bi->end,
332 bj->start, bj->end);
424 } 333 }
425 334
426 /* 335 /*
427 * If there won't be at least FAKE_NODE_MIN_SIZE of 336 * Join together blocks on the same node, holes
428 * non-reserved memory in ZONE_DMA32 for the next node, 337 * between which don't overlap with memory on other
429 * this one must extend to the boundary. 338 * nodes.
430 */
431 if (end < dma32_end && dma32_end - end -
432 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
433 end = dma32_end;
434
435 /*
436 * If there won't be enough non-reserved memory for the
437 * next node, this one must extend to the end of the
438 * physical node.
439 */ 339 */
440 if (physnodes[i].end - end - 340 if (bi->nid != bj->nid)
441 memblock_x86_hole_size(end, physnodes[i].end) < size) 341 continue;
442 end = physnodes[i].end; 342 start = max(min(bi->start, bj->start), low);
443 343 end = min(max(bi->end, bj->end), high);
444 /* 344 for (k = 0; k < mi->nr_blks; k++) {
445 * Avoid allocating more nodes than requested, which can 345 struct numa_memblk *bk = &mi->blk[k];
446 * happen as a result of rounding down each node's size 346
447 * to FAKE_NODE_MIN_SIZE. 347 if (bi->nid == bk->nid)
448 */ 348 continue;
449 if (nodes_weight(physnode_mask) + ret >= nr_nodes) 349 if (start < bk->end && end > bk->start)
450 end = physnodes[i].end; 350 break;
451 351 }
452 if (setup_node_range(ret++, &physnodes[i].start, 352 if (k < mi->nr_blks)
453 end - physnodes[i].start, 353 continue;
454 physnodes[i].end) < 0) 354 printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
455 node_clear(i, physnode_mask); 355 bi->nid, bi->start, bi->end, bj->start, bj->end,
356 start, end);
357 bi->start = start;
358 bi->end = end;
359 numa_remove_memblk_from(j--, mi);
456 } 360 }
457 } 361 }
458 return ret;
459}
460
461/*
462 * Returns the end address of a node so that there is at least `size' amount of
463 * non-reserved memory or `max_addr' is reached.
464 */
465static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
466{
467 u64 end = start + size;
468 362
469 while (end - start - memblock_x86_hole_size(start, end) < size) { 363 for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
470 end += FAKE_NODE_MIN_SIZE; 364 mi->blk[i].start = mi->blk[i].end = 0;
471 if (end > max_addr) { 365 mi->blk[i].nid = NUMA_NO_NODE;
472 end = max_addr;
473 break;
474 }
475 } 366 }
476 return end; 367
368 return 0;
477} 369}
478 370
479/* 371/*
480 * Sets up fake nodes of `size' interleaved over physical nodes ranging from 372 * Set nodes, which have memory in @mi, in *@nodemask.
481 * `addr' to `max_addr'. The return value is the number of nodes allocated.
482 */ 373 */
483static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) 374static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
375 const struct numa_meminfo *mi)
484{ 376{
485 nodemask_t physnode_mask = NODE_MASK_NONE;
486 u64 min_size;
487 int ret = 0;
488 int i; 377 int i;
489 378
490 if (!size) 379 for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
491 return -1; 380 if (mi->blk[i].start != mi->blk[i].end &&
492 /* 381 mi->blk[i].nid != NUMA_NO_NODE)
493 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is 382 node_set(mi->blk[i].nid, *nodemask);
494 * increased accordingly if the requested size is too small. This 383}
495 * creates a uniform distribution of node sizes across the entire
496 * machine (but not necessarily over physical nodes).
497 */
498 min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
499 MAX_NUMNODES;
500 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
501 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
502 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
503 FAKE_NODE_MIN_HASH_MASK;
504 if (size < min_size) {
505 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
506 size >> 20, min_size >> 20);
507 size = min_size;
508 }
509 size &= FAKE_NODE_MIN_HASH_MASK;
510
511 for (i = 0; i < MAX_NUMNODES; i++)
512 if (physnodes[i].start != physnodes[i].end)
513 node_set(i, physnode_mask);
514 /*
515 * Fill physical nodes with fake nodes of size until there is no memory
516 * left on any of them.
517 */
518 while (nodes_weight(physnode_mask)) {
519 for_each_node_mask(i, physnode_mask) {
520 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
521 u64 end;
522
523 end = find_end_of_node(physnodes[i].start,
524 physnodes[i].end, size);
525 /*
526 * If there won't be at least FAKE_NODE_MIN_SIZE of
527 * non-reserved memory in ZONE_DMA32 for the next node,
528 * this one must extend to the boundary.
529 */
530 if (end < dma32_end && dma32_end - end -
531 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
532 end = dma32_end;
533 384
534 /* 385/**
535 * If there won't be enough non-reserved memory for the 386 * numa_reset_distance - Reset NUMA distance table
536 * next node, this one must extend to the end of the 387 *
537 * physical node. 388 * The current table is freed. The next numa_set_distance() call will
538 */ 389 * create a new one.
539 if (physnodes[i].end - end - 390 */
540 memblock_x86_hole_size(end, physnodes[i].end) < size) 391void __init numa_reset_distance(void)
541 end = physnodes[i].end; 392{
393 size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
542 394
543 /* 395 /* numa_distance could be 1LU marking allocation failure, test cnt */
544 * Setup the fake node that will be allocated as bootmem 396 if (numa_distance_cnt)
545 * later. If setup_node_range() returns non-zero, there 397 memblock_x86_free_range(__pa(numa_distance),
546 * is no more memory available on this physical node. 398 __pa(numa_distance) + size);
547 */ 399 numa_distance_cnt = 0;
548 if (setup_node_range(ret++, &physnodes[i].start, 400 numa_distance = NULL; /* enable table creation */
549 end - physnodes[i].start,
550 physnodes[i].end) < 0)
551 node_clear(i, physnode_mask);
552 }
553 }
554 return ret;
555} 401}
556 402
557/* 403static int __init numa_alloc_distance(void)
558 * Sets up the system RAM area from start_pfn to last_pfn according to the
559 * numa=fake command-line option.
560 */
561static int __init numa_emulation(unsigned long start_pfn,
562 unsigned long last_pfn, int acpi, int amd)
563{ 404{
564 u64 addr = start_pfn << PAGE_SHIFT; 405 nodemask_t nodes_parsed;
565 u64 max_addr = last_pfn << PAGE_SHIFT; 406 size_t size;
566 int num_nodes; 407 int i, j, cnt = 0;
567 int i; 408 u64 phys;
568 409
569 /* 410 /* size the new table and allocate it */
570 * If the numa=fake command-line contains a 'M' or 'G', it represents 411 nodes_parsed = numa_nodes_parsed;
571 * the fixed node size. Otherwise, if it is just a single number N, 412 numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
572 * split the system RAM into N fake nodes.
573 */
574 if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
575 u64 size;
576 413
577 size = memparse(cmdline, &cmdline); 414 for_each_node_mask(i, nodes_parsed)
578 num_nodes = split_nodes_size_interleave(addr, max_addr, size); 415 cnt = i;
579 } else { 416 cnt++;
580 unsigned long n; 417 size = cnt * cnt * sizeof(numa_distance[0]);
581 418
582 n = simple_strtoul(cmdline, NULL, 0); 419 phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT,
583 num_nodes = split_nodes_interleave(addr, max_addr, n); 420 size, PAGE_SIZE);
421 if (phys == MEMBLOCK_ERROR) {
422 pr_warning("NUMA: Warning: can't allocate distance table!\n");
423 /* don't retry until explicitly reset */
424 numa_distance = (void *)1LU;
425 return -ENOMEM;
584 } 426 }
427 memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
585 428
586 if (num_nodes < 0) 429 numa_distance = __va(phys);
587 return num_nodes; 430 numa_distance_cnt = cnt;
588 memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); 431
589 if (memnode_shift < 0) { 432 /* fill with the default distances */
590 memnode_shift = 0; 433 for (i = 0; i < cnt; i++)
591 printk(KERN_ERR "No NUMA hash function found. NUMA emulation " 434 for (j = 0; j < cnt; j++)
592 "disabled.\n"); 435 numa_distance[i * cnt + j] = i == j ?
593 return -1; 436 LOCAL_DISTANCE : REMOTE_DISTANCE;
594 } 437 printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
595 438
596 /*
597 * We need to vacate all active ranges that may have been registered for
598 * the e820 memory map.
599 */
600 remove_all_active_ranges();
601 for_each_node_mask(i, node_possible_map) {
602 memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
603 nodes[i].end >> PAGE_SHIFT);
604 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
605 }
606 setup_physnodes(addr, max_addr, acpi, amd);
607 fake_physnodes(acpi, amd, num_nodes);
608 numa_init_array();
609 return 0; 439 return 0;
610} 440}
611#endif /* CONFIG_NUMA_EMU */
612 441
613void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, 442/**
614 int acpi, int amd) 443 * numa_set_distance - Set NUMA distance from one NUMA to another
444 * @from: the 'from' node to set distance
445 * @to: the 'to' node to set distance
446 * @distance: NUMA distance
447 *
448 * Set the distance from node @from to @to to @distance. If distance table
449 * doesn't exist, one which is large enough to accomodate all the currently
450 * known nodes will be created.
451 *
452 * If such table cannot be allocated, a warning is printed and further
453 * calls are ignored until the distance table is reset with
454 * numa_reset_distance().
455 *
456 * If @from or @to is higher than the highest known node at the time of
457 * table creation or @distance doesn't make sense, the call is ignored.
458 * This is to allow simplification of specific NUMA config implementations.
459 */
460void __init numa_set_distance(int from, int to, int distance)
615{ 461{
616 int i; 462 if (!numa_distance && numa_alloc_distance() < 0)
617
618 nodes_clear(node_possible_map);
619 nodes_clear(node_online_map);
620
621#ifdef CONFIG_NUMA_EMU
622 setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
623 acpi, amd);
624 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
625 return; 463 return;
626 setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
627 acpi, amd);
628 nodes_clear(node_possible_map);
629 nodes_clear(node_online_map);
630#endif
631 464
632#ifdef CONFIG_ACPI_NUMA 465 if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
633 if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 466 printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
634 last_pfn << PAGE_SHIFT)) 467 from, to, distance);
635 return; 468 return;
636 nodes_clear(node_possible_map); 469 }
637 nodes_clear(node_online_map);
638#endif
639 470
640#ifdef CONFIG_AMD_NUMA 471 if ((u8)distance != distance ||
641 if (!numa_off && amd && !amd_scan_nodes()) 472 (from == to && distance != LOCAL_DISTANCE)) {
473 pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
474 from, to, distance);
642 return; 475 return;
643 nodes_clear(node_possible_map); 476 }
644 nodes_clear(node_online_map);
645#endif
646 printk(KERN_INFO "%s\n",
647 numa_off ? "NUMA turned off" : "No NUMA configuration found");
648 477
649 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 478 numa_distance[from * numa_distance_cnt + to] = distance;
650 start_pfn << PAGE_SHIFT,
651 last_pfn << PAGE_SHIFT);
652 /* setup dummy node covering all memory */
653 memnode_shift = 63;
654 memnodemap = memnode.embedded_map;
655 memnodemap[0] = 0;
656 node_set_online(0);
657 node_set(0, node_possible_map);
658 for (i = 0; i < nr_cpu_ids; i++)
659 numa_set_node(i, 0);
660 memblock_x86_register_active_regions(0, start_pfn, last_pfn);
661 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
662} 479}
663 480
664unsigned long __init numa_free_all_bootmem(void) 481int __node_distance(int from, int to)
665{ 482{
666 unsigned long pages = 0; 483 if (from >= numa_distance_cnt || to >= numa_distance_cnt)
667 int i; 484 return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
485 return numa_distance[from * numa_distance_cnt + to];
486}
487EXPORT_SYMBOL(__node_distance);
668 488
669 for_each_online_node(i) 489/*
670 pages += free_all_bootmem_node(NODE_DATA(i)); 490 * Sanity check to catch more bad NUMA configurations (they are amazingly
491 * common). Make sure the nodes cover all memory.
492 */
493static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
494{
495 unsigned long numaram, e820ram;
496 int i;
671 497
672 pages += free_all_memory_core_early(MAX_NUMNODES); 498 numaram = 0;
499 for (i = 0; i < mi->nr_blks; i++) {
500 unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
501 unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
502 numaram += e - s;
503 numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
504 if ((long)numaram < 0)
505 numaram = 0;
506 }
673 507
674 return pages; 508 e820ram = max_pfn - (memblock_x86_hole_size(0,
509 max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
510 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
511 if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
512 printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
513 (numaram << PAGE_SHIFT) >> 20,
514 (e820ram << PAGE_SHIFT) >> 20);
515 return false;
516 }
517 return true;
675} 518}
676 519
677#ifdef CONFIG_NUMA 520static int __init numa_register_memblks(struct numa_meminfo *mi)
678
679static __init int find_near_online_node(int node)
680{ 521{
681 int n, val; 522 int i, nid;
682 int min_val = INT_MAX;
683 int best_node = -1;
684 523
685 for_each_online_node(n) { 524 /* Account for nodes with cpus and no memory */
686 val = node_distance(node, n); 525 node_possible_map = numa_nodes_parsed;
526 numa_nodemask_from_meminfo(&node_possible_map, mi);
527 if (WARN_ON(nodes_empty(node_possible_map)))
528 return -EINVAL;
687 529
688 if (val < min_val) { 530 memnode_shift = compute_hash_shift(mi);
689 min_val = val; 531 if (memnode_shift < 0) {
690 best_node = n; 532 printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
533 return -EINVAL;
534 }
535
536 for (i = 0; i < mi->nr_blks; i++)
537 memblock_x86_register_active_regions(mi->blk[i].nid,
538 mi->blk[i].start >> PAGE_SHIFT,
539 mi->blk[i].end >> PAGE_SHIFT);
540
541 /* for out of order entries */
542 sort_node_map();
543 if (!numa_meminfo_cover_memory(mi))
544 return -EINVAL;
545
546 /* Finally register nodes. */
547 for_each_node_mask(nid, node_possible_map) {
548 u64 start = (u64)max_pfn << PAGE_SHIFT;
549 u64 end = 0;
550
551 for (i = 0; i < mi->nr_blks; i++) {
552 if (nid != mi->blk[i].nid)
553 continue;
554 start = min(mi->blk[i].start, start);
555 end = max(mi->blk[i].end, end);
691 } 556 }
557
558 if (start < end)
559 setup_node_bootmem(nid, start, end);
692 } 560 }
693 561
694 return best_node; 562 return 0;
695} 563}
696 564
697/* 565/**
698 * Setup early cpu_to_node. 566 * dummy_numma_init - Fallback dummy NUMA init
699 * 567 *
700 * Populate cpu_to_node[] only if x86_cpu_to_apicid[], 568 * Used if there's no underlying NUMA architecture, NUMA initialization
701 * and apicid_to_node[] tables have valid entries for a CPU. 569 * fails, or NUMA is disabled on the command line.
702 * This means we skip cpu_to_node[] initialisation for NUMA
703 * emulation and faking node case (when running a kernel compiled
704 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
705 * is already initialized in a round robin manner at numa_init_array,
706 * prior to this call, and this initialization is good enough
707 * for the fake NUMA cases.
708 * 570 *
709 * Called before the per_cpu areas are setup. 571 * Must online at least one node and add memory blocks that cover all
572 * allowed memory. This function must not fail.
710 */ 573 */
711void __init init_cpu_to_node(void) 574static int __init dummy_numa_init(void)
712{ 575{
713 int cpu; 576 printk(KERN_INFO "%s\n",
714 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 577 numa_off ? "NUMA turned off" : "No NUMA configuration found");
715 578 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
716 BUG_ON(cpu_to_apicid == NULL); 579 0LU, max_pfn << PAGE_SHIFT);
717 580
718 for_each_possible_cpu(cpu) { 581 node_set(0, numa_nodes_parsed);
719 int node; 582 numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
720 u16 apicid = cpu_to_apicid[cpu];
721 583
722 if (apicid == BAD_APICID) 584 return 0;
723 continue;
724 node = apicid_to_node[apicid];
725 if (node == NUMA_NO_NODE)
726 continue;
727 if (!node_online(node))
728 node = find_near_online_node(node);
729 numa_set_node(cpu, node);
730 }
731} 585}
732#endif
733 586
734 587static int __init numa_init(int (*init_func)(void))
735void __cpuinit numa_set_node(int cpu, int node)
736{ 588{
737 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); 589 int i;
738 590 int ret;
739 /* early setting, no percpu area yet */
740 if (cpu_to_node_map) {
741 cpu_to_node_map[cpu] = node;
742 return;
743 }
744
745#ifdef CONFIG_DEBUG_PER_CPU_MAPS
746 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
747 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
748 dump_stack();
749 return;
750 }
751#endif
752 per_cpu(x86_cpu_to_node_map, cpu) = node;
753 591
754 if (node != NUMA_NO_NODE) 592 for (i = 0; i < MAX_LOCAL_APIC; i++)
755 set_cpu_numa_node(cpu, node); 593 set_apicid_to_node(i, NUMA_NO_NODE);
756}
757 594
758void __cpuinit numa_clear_node(int cpu) 595 nodes_clear(numa_nodes_parsed);
759{ 596 nodes_clear(node_possible_map);
760 numa_set_node(cpu, NUMA_NO_NODE); 597 nodes_clear(node_online_map);
761} 598 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
762 599 remove_all_active_ranges();
763#ifndef CONFIG_DEBUG_PER_CPU_MAPS 600 numa_reset_distance();
764 601
765#ifndef CONFIG_NUMA_EMU 602 ret = init_func();
766void __cpuinit numa_add_cpu(int cpu) 603 if (ret < 0)
767{ 604 return ret;
768 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); 605 ret = numa_cleanup_meminfo(&numa_meminfo);
769} 606 if (ret < 0)
607 return ret;
770 608
771void __cpuinit numa_remove_cpu(int cpu) 609 numa_emulation(&numa_meminfo, numa_distance_cnt);
772{
773 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
774}
775#else
776void __cpuinit numa_add_cpu(int cpu)
777{
778 unsigned long addr;
779 u16 apicid;
780 int physnid;
781 int nid = NUMA_NO_NODE;
782 610
783 nid = early_cpu_to_node(cpu); 611 ret = numa_register_memblks(&numa_meminfo);
784 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); 612 if (ret < 0)
613 return ret;
785 614
786 /* 615 for (i = 0; i < nr_cpu_ids; i++) {
787 * Use the starting address of the emulated node to find which physical 616 int nid = early_cpu_to_node(i);
788 * node it is allocated on.
789 */
790 addr = node_start_pfn(nid) << PAGE_SHIFT;
791 for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
792 if (addr >= physnodes[physnid].start &&
793 addr < physnodes[physnid].end)
794 break;
795 617
796 /* 618 if (nid == NUMA_NO_NODE)
797 * Map the cpu to each emulated node that is allocated on the physical 619 continue;
798 * node of the cpu's apic id. 620 if (!node_online(nid))
799 */ 621 numa_clear_node(i);
800 for_each_online_node(nid) {
801 addr = node_start_pfn(nid) << PAGE_SHIFT;
802 if (addr >= physnodes[physnid].start &&
803 addr < physnodes[physnid].end)
804 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
805 } 622 }
623 numa_init_array();
624 return 0;
806} 625}
807 626
808void __cpuinit numa_remove_cpu(int cpu) 627void __init initmem_init(void)
809{ 628{
810 int i; 629 int ret;
811 630
812 for_each_online_node(i) 631 if (!numa_off) {
813 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); 632#ifdef CONFIG_ACPI_NUMA
814} 633 ret = numa_init(x86_acpi_numa_init);
815#endif /* !CONFIG_NUMA_EMU */ 634 if (!ret)
816 635 return;
817#else /* CONFIG_DEBUG_PER_CPU_MAPS */ 636#endif
818static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) 637#ifdef CONFIG_AMD_NUMA
819{ 638 ret = numa_init(amd_numa_init);
820 int node = early_cpu_to_node(cpu); 639 if (!ret)
821 struct cpumask *mask; 640 return;
822 char buf[64]; 641#endif
823
824 mask = node_to_cpumask_map[node];
825 if (!mask) {
826 pr_err("node_to_cpumask_map[%i] NULL\n", node);
827 dump_stack();
828 return NULL;
829 } 642 }
830 643
831 cpulist_scnprintf(buf, sizeof(buf), mask); 644 numa_init(dummy_numa_init);
832 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
833 enable ? "numa_add_cpu" : "numa_remove_cpu",
834 cpu, node, buf);
835 return mask;
836} 645}
837 646
838/* 647unsigned long __init numa_free_all_bootmem(void)
839 * --------- debug versions of the numa functions ---------
840 */
841#ifndef CONFIG_NUMA_EMU
842static void __cpuinit numa_set_cpumask(int cpu, int enable)
843{
844 struct cpumask *mask;
845
846 mask = debug_cpumask_set_cpu(cpu, enable);
847 if (!mask)
848 return;
849
850 if (enable)
851 cpumask_set_cpu(cpu, mask);
852 else
853 cpumask_clear_cpu(cpu, mask);
854}
855#else
856static void __cpuinit numa_set_cpumask(int cpu, int enable)
857{ 648{
858 int node = early_cpu_to_node(cpu); 649 unsigned long pages = 0;
859 struct cpumask *mask;
860 int i; 650 int i;
861 651
862 for_each_online_node(i) { 652 for_each_online_node(i)
863 unsigned long addr; 653 pages += free_all_bootmem_node(NODE_DATA(i));
864
865 addr = node_start_pfn(i) << PAGE_SHIFT;
866 if (addr < physnodes[node].start ||
867 addr >= physnodes[node].end)
868 continue;
869 mask = debug_cpumask_set_cpu(cpu, enable);
870 if (!mask)
871 return;
872
873 if (enable)
874 cpumask_set_cpu(cpu, mask);
875 else
876 cpumask_clear_cpu(cpu, mask);
877 }
878}
879#endif /* CONFIG_NUMA_EMU */
880 654
881void __cpuinit numa_add_cpu(int cpu) 655 pages += free_all_memory_core_early(MAX_NUMNODES);
882{
883 numa_set_cpumask(cpu, 1);
884}
885 656
886void __cpuinit numa_remove_cpu(int cpu) 657 return pages;
887{
888 numa_set_cpumask(cpu, 0);
889} 658}
890 659
891int __cpu_to_node(int cpu) 660int __cpuinit numa_cpu_node(int cpu)
892{ 661{
893 if (early_per_cpu_ptr(x86_cpu_to_node_map)) { 662 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
894 printk(KERN_WARNING
895 "cpu_to_node(%d): usage too early!\n", cpu);
896 dump_stack();
897 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
898 }
899 return per_cpu(x86_cpu_to_node_map, cpu);
900}
901EXPORT_SYMBOL(__cpu_to_node);
902 663
903/* 664 if (apicid != BAD_APICID)
904 * Same function as cpu_to_node() but used if called before the 665 return __apicid_to_node[apicid];
905 * per_cpu areas are setup. 666 return NUMA_NO_NODE;
906 */
907int early_cpu_to_node(int cpu)
908{
909 if (early_per_cpu_ptr(x86_cpu_to_node_map))
910 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
911
912 if (!cpu_possible(cpu)) {
913 printk(KERN_WARNING
914 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
915 dump_stack();
916 return NUMA_NO_NODE;
917 }
918 return per_cpu(x86_cpu_to_node_map, cpu);
919} 667}
920
921/*
922 * --------- end of debug versions of the numa functions ---------
923 */
924
925#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
new file mode 100644
index 000000000000..ad091e4cff17
--- /dev/null
+++ b/arch/x86/mm/numa_emulation.c
@@ -0,0 +1,494 @@
1/*
2 * NUMA emulation
3 */
4#include <linux/kernel.h>
5#include <linux/errno.h>
6#include <linux/topology.h>
7#include <linux/memblock.h>
8#include <asm/dma.h>
9
10#include "numa_internal.h"
11
12static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
13static char *emu_cmdline __initdata;
14
15void __init numa_emu_cmdline(char *str)
16{
17 emu_cmdline = str;
18}
19
20static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
21{
22 int i;
23
24 for (i = 0; i < mi->nr_blks; i++)
25 if (mi->blk[i].nid == nid)
26 return i;
27 return -ENOENT;
28}
29
30/*
31 * Sets up nid to range from @start to @end. The return value is -errno if
32 * something went wrong, 0 otherwise.
33 */
34static int __init emu_setup_memblk(struct numa_meminfo *ei,
35 struct numa_meminfo *pi,
36 int nid, int phys_blk, u64 size)
37{
38 struct numa_memblk *eb = &ei->blk[ei->nr_blks];
39 struct numa_memblk *pb = &pi->blk[phys_blk];
40
41 if (ei->nr_blks >= NR_NODE_MEMBLKS) {
42 pr_err("NUMA: Too many emulated memblks, failing emulation\n");
43 return -EINVAL;
44 }
45
46 ei->nr_blks++;
47 eb->start = pb->start;
48 eb->end = pb->start + size;
49 eb->nid = nid;
50
51 if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
52 emu_nid_to_phys[nid] = pb->nid;
53
54 pb->start += size;
55 if (pb->start >= pb->end) {
56 WARN_ON_ONCE(pb->start > pb->end);
57 numa_remove_memblk_from(phys_blk, pi);
58 }
59
60 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
61 eb->start, eb->end, (eb->end - eb->start) >> 20);
62 return 0;
63}
64
65/*
66 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
67 * to max_addr. The return value is the number of nodes allocated.
68 */
69static int __init split_nodes_interleave(struct numa_meminfo *ei,
70 struct numa_meminfo *pi,
71 u64 addr, u64 max_addr, int nr_nodes)
72{
73 nodemask_t physnode_mask = NODE_MASK_NONE;
74 u64 size;
75 int big;
76 int nid = 0;
77 int i, ret;
78
79 if (nr_nodes <= 0)
80 return -1;
81 if (nr_nodes > MAX_NUMNODES) {
82 pr_info("numa=fake=%d too large, reducing to %d\n",
83 nr_nodes, MAX_NUMNODES);
84 nr_nodes = MAX_NUMNODES;
85 }
86
87 size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
88 /*
89 * Calculate the number of big nodes that can be allocated as a result
90 * of consolidating the remainder.
91 */
92 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
93 FAKE_NODE_MIN_SIZE;
94
95 size &= FAKE_NODE_MIN_HASH_MASK;
96 if (!size) {
97 pr_err("Not enough memory for each node. "
98 "NUMA emulation disabled.\n");
99 return -1;
100 }
101
102 for (i = 0; i < pi->nr_blks; i++)
103 node_set(pi->blk[i].nid, physnode_mask);
104
105 /*
106 * Continue to fill physical nodes with fake nodes until there is no
107 * memory left on any of them.
108 */
109 while (nodes_weight(physnode_mask)) {
110 for_each_node_mask(i, physnode_mask) {
111 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
112 u64 start, limit, end;
113 int phys_blk;
114
115 phys_blk = emu_find_memblk_by_nid(i, pi);
116 if (phys_blk < 0) {
117 node_clear(i, physnode_mask);
118 continue;
119 }
120 start = pi->blk[phys_blk].start;
121 limit = pi->blk[phys_blk].end;
122 end = start + size;
123
124 if (nid < big)
125 end += FAKE_NODE_MIN_SIZE;
126
127 /*
128 * Continue to add memory to this fake node if its
129 * non-reserved memory is less than the per-node size.
130 */
131 while (end - start -
132 memblock_x86_hole_size(start, end) < size) {
133 end += FAKE_NODE_MIN_SIZE;
134 if (end > limit) {
135 end = limit;
136 break;
137 }
138 }
139
140 /*
141 * If there won't be at least FAKE_NODE_MIN_SIZE of
142 * non-reserved memory in ZONE_DMA32 for the next node,
143 * this one must extend to the boundary.
144 */
145 if (end < dma32_end && dma32_end - end -
146 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
147 end = dma32_end;
148
149 /*
150 * If there won't be enough non-reserved memory for the
151 * next node, this one must extend to the end of the
152 * physical node.
153 */
154 if (limit - end -
155 memblock_x86_hole_size(end, limit) < size)
156 end = limit;
157
158 ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
159 phys_blk,
160 min(end, limit) - start);
161 if (ret < 0)
162 return ret;
163 }
164 }
165 return 0;
166}
167
168/*
169 * Returns the end address of a node so that there is at least `size' amount of
170 * non-reserved memory or `max_addr' is reached.
171 */
172static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
173{
174 u64 end = start + size;
175
176 while (end - start - memblock_x86_hole_size(start, end) < size) {
177 end += FAKE_NODE_MIN_SIZE;
178 if (end > max_addr) {
179 end = max_addr;
180 break;
181 }
182 }
183 return end;
184}
185
186/*
187 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
188 * `addr' to `max_addr'. The return value is the number of nodes allocated.
189 */
190static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
191 struct numa_meminfo *pi,
192 u64 addr, u64 max_addr, u64 size)
193{
194 nodemask_t physnode_mask = NODE_MASK_NONE;
195 u64 min_size;
196 int nid = 0;
197 int i, ret;
198
199 if (!size)
200 return -1;
201 /*
202 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
203 * increased accordingly if the requested size is too small. This
204 * creates a uniform distribution of node sizes across the entire
205 * machine (but not necessarily over physical nodes).
206 */
207 min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
208 MAX_NUMNODES;
209 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
210 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
211 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
212 FAKE_NODE_MIN_HASH_MASK;
213 if (size < min_size) {
214 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
215 size >> 20, min_size >> 20);
216 size = min_size;
217 }
218 size &= FAKE_NODE_MIN_HASH_MASK;
219
220 for (i = 0; i < pi->nr_blks; i++)
221 node_set(pi->blk[i].nid, physnode_mask);
222
223 /*
224 * Fill physical nodes with fake nodes of size until there is no memory
225 * left on any of them.
226 */
227 while (nodes_weight(physnode_mask)) {
228 for_each_node_mask(i, physnode_mask) {
229 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
230 u64 start, limit, end;
231 int phys_blk;
232
233 phys_blk = emu_find_memblk_by_nid(i, pi);
234 if (phys_blk < 0) {
235 node_clear(i, physnode_mask);
236 continue;
237 }
238 start = pi->blk[phys_blk].start;
239 limit = pi->blk[phys_blk].end;
240
241 end = find_end_of_node(start, limit, size);
242 /*
243 * If there won't be at least FAKE_NODE_MIN_SIZE of
244 * non-reserved memory in ZONE_DMA32 for the next node,
245 * this one must extend to the boundary.
246 */
247 if (end < dma32_end && dma32_end - end -
248 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
249 end = dma32_end;
250
251 /*
252 * If there won't be enough non-reserved memory for the
253 * next node, this one must extend to the end of the
254 * physical node.
255 */
256 if (limit - end -
257 memblock_x86_hole_size(end, limit) < size)
258 end = limit;
259
260 ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
261 phys_blk,
262 min(end, limit) - start);
263 if (ret < 0)
264 return ret;
265 }
266 }
267 return 0;
268}
269
270/**
271 * numa_emulation - Emulate NUMA nodes
272 * @numa_meminfo: NUMA configuration to massage
273 * @numa_dist_cnt: The size of the physical NUMA distance table
274 *
275 * Emulate NUMA nodes according to the numa=fake kernel parameter.
276 * @numa_meminfo contains the physical memory configuration and is modified
277 * to reflect the emulated configuration on success. @numa_dist_cnt is
278 * used to determine the size of the physical distance table.
279 *
280 * On success, the following modifications are made.
281 *
282 * - @numa_meminfo is updated to reflect the emulated nodes.
283 *
284 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
285 * emulated nodes.
286 *
287 * - NUMA distance table is rebuilt to represent distances between emulated
288 * nodes. The distances are determined considering how emulated nodes
289 * are mapped to physical nodes and match the actual distances.
290 *
291 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
292 * nodes. This is used by numa_add_cpu() and numa_remove_cpu().
293 *
294 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
295 * identity mapping and no other modification is made.
296 */
297void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
298{
299 static struct numa_meminfo ei __initdata;
300 static struct numa_meminfo pi __initdata;
301 const u64 max_addr = max_pfn << PAGE_SHIFT;
302 u8 *phys_dist = NULL;
303 size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
304 int max_emu_nid, dfl_phys_nid;
305 int i, j, ret;
306
307 if (!emu_cmdline)
308 goto no_emu;
309
310 memset(&ei, 0, sizeof(ei));
311 pi = *numa_meminfo;
312
313 for (i = 0; i < MAX_NUMNODES; i++)
314 emu_nid_to_phys[i] = NUMA_NO_NODE;
315
316 /*
317 * If the numa=fake command-line contains a 'M' or 'G', it represents
318 * the fixed node size. Otherwise, if it is just a single number N,
319 * split the system RAM into N fake nodes.
320 */
321 if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
322 u64 size;
323
324 size = memparse(emu_cmdline, &emu_cmdline);
325 ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
326 } else {
327 unsigned long n;
328
329 n = simple_strtoul(emu_cmdline, NULL, 0);
330 ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
331 }
332
333 if (ret < 0)
334 goto no_emu;
335
336 if (numa_cleanup_meminfo(&ei) < 0) {
337 pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
338 goto no_emu;
339 }
340
341 /* copy the physical distance table */
342 if (numa_dist_cnt) {
343 u64 phys;
344
345 phys = memblock_find_in_range(0,
346 (u64)max_pfn_mapped << PAGE_SHIFT,
347 phys_size, PAGE_SIZE);
348 if (phys == MEMBLOCK_ERROR) {
349 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
350 goto no_emu;
351 }
352 memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST");
353 phys_dist = __va(phys);
354
355 for (i = 0; i < numa_dist_cnt; i++)
356 for (j = 0; j < numa_dist_cnt; j++)
357 phys_dist[i * numa_dist_cnt + j] =
358 node_distance(i, j);
359 }
360
361 /*
362 * Determine the max emulated nid and the default phys nid to use
363 * for unmapped nodes.
364 */
365 max_emu_nid = 0;
366 dfl_phys_nid = NUMA_NO_NODE;
367 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
368 if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
369 max_emu_nid = i;
370 if (dfl_phys_nid == NUMA_NO_NODE)
371 dfl_phys_nid = emu_nid_to_phys[i];
372 }
373 }
374 if (dfl_phys_nid == NUMA_NO_NODE) {
375 pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
376 goto no_emu;
377 }
378
379 /* commit */
380 *numa_meminfo = ei;
381
382 /*
383 * Transform __apicid_to_node table to use emulated nids by
384 * reverse-mapping phys_nid. The maps should always exist but fall
385 * back to zero just in case.
386 */
387 for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
388 if (__apicid_to_node[i] == NUMA_NO_NODE)
389 continue;
390 for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
391 if (__apicid_to_node[i] == emu_nid_to_phys[j])
392 break;
393 __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
394 }
395
396 /* make sure all emulated nodes are mapped to a physical node */
397 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
398 if (emu_nid_to_phys[i] == NUMA_NO_NODE)
399 emu_nid_to_phys[i] = dfl_phys_nid;
400
401 /* transform distance table */
402 numa_reset_distance();
403 for (i = 0; i < max_emu_nid + 1; i++) {
404 for (j = 0; j < max_emu_nid + 1; j++) {
405 int physi = emu_nid_to_phys[i];
406 int physj = emu_nid_to_phys[j];
407 int dist;
408
409 if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
410 dist = physi == physj ?
411 LOCAL_DISTANCE : REMOTE_DISTANCE;
412 else
413 dist = phys_dist[physi * numa_dist_cnt + physj];
414
415 numa_set_distance(i, j, dist);
416 }
417 }
418
419 /* free the copied physical distance table */
420 if (phys_dist)
421 memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size);
422 return;
423
424no_emu:
425 /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
426 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
427 emu_nid_to_phys[i] = i;
428}
429
430#ifndef CONFIG_DEBUG_PER_CPU_MAPS
431void __cpuinit numa_add_cpu(int cpu)
432{
433 int physnid, nid;
434
435 nid = early_cpu_to_node(cpu);
436 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
437
438 physnid = emu_nid_to_phys[nid];
439
440 /*
441 * Map the cpu to each emulated node that is allocated on the physical
442 * node of the cpu's apic id.
443 */
444 for_each_online_node(nid)
445 if (emu_nid_to_phys[nid] == physnid)
446 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
447}
448
449void __cpuinit numa_remove_cpu(int cpu)
450{
451 int i;
452
453 for_each_online_node(i)
454 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
455}
456#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
457static void __cpuinit numa_set_cpumask(int cpu, int enable)
458{
459 struct cpumask *mask;
460 int nid, physnid, i;
461
462 nid = early_cpu_to_node(cpu);
463 if (nid == NUMA_NO_NODE) {
464 /* early_cpu_to_node() already emits a warning and trace */
465 return;
466 }
467
468 physnid = emu_nid_to_phys[nid];
469
470 for_each_online_node(i) {
471 if (emu_nid_to_phys[nid] != physnid)
472 continue;
473
474 mask = debug_cpumask_set_cpu(cpu, enable);
475 if (!mask)
476 return;
477
478 if (enable)
479 cpumask_set_cpu(cpu, mask);
480 else
481 cpumask_clear_cpu(cpu, mask);
482 }
483}
484
485void __cpuinit numa_add_cpu(int cpu)
486{
487 numa_set_cpumask(cpu, 1);
488}
489
490void __cpuinit numa_remove_cpu(int cpu)
491{
492 numa_set_cpumask(cpu, 0);
493}
494#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
new file mode 100644
index 000000000000..ef2d97377d7c
--- /dev/null
+++ b/arch/x86/mm/numa_internal.h
@@ -0,0 +1,31 @@
1#ifndef __X86_MM_NUMA_INTERNAL_H
2#define __X86_MM_NUMA_INTERNAL_H
3
4#include <linux/types.h>
5#include <asm/numa.h>
6
7struct numa_memblk {
8 u64 start;
9 u64 end;
10 int nid;
11};
12
13struct numa_meminfo {
14 int nr_blks;
15 struct numa_memblk blk[NR_NODE_MEMBLKS];
16};
17
18void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
19int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
20void __init numa_reset_distance(void);
21
22#ifdef CONFIG_NUMA_EMU
23void __init numa_emulation(struct numa_meminfo *numa_meminfo,
24 int numa_dist_cnt);
25#else
26static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
27 int numa_dist_cnt)
28{ }
29#endif
30
31#endif /* __X86_MM_NUMA_INTERNAL_H */
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index ae96e7b8051d..48651c6f657d 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -57,7 +57,7 @@ struct node_memory_chunk_s {
57static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; 57static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
58 58
59static int __initdata num_memory_chunks; /* total number of memory chunks */ 59static int __initdata num_memory_chunks; /* total number of memory chunks */
60static u8 __initdata apicid_to_pxm[MAX_APICID]; 60static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC];
61 61
62int acpi_numa __initdata; 62int acpi_numa __initdata;
63 63
@@ -254,8 +254,8 @@ int __init get_memcfg_from_srat(void)
254 printk(KERN_DEBUG "Number of memory chunks in system = %d\n", 254 printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
255 num_memory_chunks); 255 num_memory_chunks);
256 256
257 for (i = 0; i < MAX_APICID; i++) 257 for (i = 0; i < MAX_LOCAL_APIC; i++)
258 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); 258 set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i]));
259 259
260 for (j = 0; j < num_memory_chunks; j++){ 260 for (j = 0; j < num_memory_chunks; j++){
261 struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; 261 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 603d285d1daa..8e9d3394f6d4 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -26,88 +26,34 @@
26 26
27int acpi_numa __initdata; 27int acpi_numa __initdata;
28 28
29static struct acpi_table_slit *acpi_slit;
30
31static nodemask_t nodes_parsed __initdata;
32static nodemask_t cpu_nodes_parsed __initdata;
33static struct bootnode nodes[MAX_NUMNODES] __initdata;
34static struct bootnode nodes_add[MAX_NUMNODES]; 29static struct bootnode nodes_add[MAX_NUMNODES];
35 30
36static int num_node_memblks __initdata;
37static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
38static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
39
40static __init int setup_node(int pxm) 31static __init int setup_node(int pxm)
41{ 32{
42 return acpi_map_pxm_to_node(pxm); 33 return acpi_map_pxm_to_node(pxm);
43} 34}
44 35
45static __init int conflicting_memblks(unsigned long start, unsigned long end)
46{
47 int i;
48 for (i = 0; i < num_node_memblks; i++) {
49 struct bootnode *nd = &node_memblk_range[i];
50 if (nd->start == nd->end)
51 continue;
52 if (nd->end > start && nd->start < end)
53 return memblk_nodeid[i];
54 if (nd->end == end && nd->start == start)
55 return memblk_nodeid[i];
56 }
57 return -1;
58}
59
60static __init void cutoff_node(int i, unsigned long start, unsigned long end)
61{
62 struct bootnode *nd = &nodes[i];
63
64 if (nd->start < start) {
65 nd->start = start;
66 if (nd->end < nd->start)
67 nd->start = nd->end;
68 }
69 if (nd->end > end) {
70 nd->end = end;
71 if (nd->start > nd->end)
72 nd->start = nd->end;
73 }
74}
75
76static __init void bad_srat(void) 36static __init void bad_srat(void)
77{ 37{
78 int i;
79 printk(KERN_ERR "SRAT: SRAT not used.\n"); 38 printk(KERN_ERR "SRAT: SRAT not used.\n");
80 acpi_numa = -1; 39 acpi_numa = -1;
81 for (i = 0; i < MAX_LOCAL_APIC; i++) 40 memset(nodes_add, 0, sizeof(nodes_add));
82 apicid_to_node[i] = NUMA_NO_NODE;
83 for (i = 0; i < MAX_NUMNODES; i++) {
84 nodes[i].start = nodes[i].end = 0;
85 nodes_add[i].start = nodes_add[i].end = 0;
86 }
87 remove_all_active_ranges();
88} 41}
89 42
90static __init inline int srat_disabled(void) 43static __init inline int srat_disabled(void)
91{ 44{
92 return numa_off || acpi_numa < 0; 45 return acpi_numa < 0;
93} 46}
94 47
95/* Callback for SLIT parsing */ 48/* Callback for SLIT parsing */
96void __init acpi_numa_slit_init(struct acpi_table_slit *slit) 49void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
97{ 50{
98 unsigned length; 51 int i, j;
99 unsigned long phys;
100
101 length = slit->header.length;
102 phys = memblock_find_in_range(0, max_pfn_mapped<<PAGE_SHIFT, length,
103 PAGE_SIZE);
104
105 if (phys == MEMBLOCK_ERROR)
106 panic(" Can not save slit!\n");
107 52
108 acpi_slit = __va(phys); 53 for (i = 0; i < slit->locality_count; i++)
109 memcpy(acpi_slit, slit, length); 54 for (j = 0; j < slit->locality_count; j++)
110 memblock_x86_reserve_range(phys, phys + length, "ACPI SLIT"); 55 numa_set_distance(pxm_to_node(i), pxm_to_node(j),
56 slit->entry[slit->locality_count * i + j]);
111} 57}
112 58
113/* Callback for Proximity Domain -> x2APIC mapping */ 59/* Callback for Proximity Domain -> x2APIC mapping */
@@ -138,8 +84,8 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
138 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); 84 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
139 return; 85 return;
140 } 86 }
141 apicid_to_node[apic_id] = node; 87 set_apicid_to_node(apic_id, node);
142 node_set(node, cpu_nodes_parsed); 88 node_set(node, numa_nodes_parsed);
143 acpi_numa = 1; 89 acpi_numa = 1;
144 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", 90 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
145 pxm, apic_id, node); 91 pxm, apic_id, node);
@@ -178,8 +124,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
178 return; 124 return;
179 } 125 }
180 126
181 apicid_to_node[apic_id] = node; 127 set_apicid_to_node(apic_id, node);
182 node_set(node, cpu_nodes_parsed); 128 node_set(node, numa_nodes_parsed);
183 acpi_numa = 1; 129 acpi_numa = 1;
184 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", 130 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
185 pxm, apic_id, node); 131 pxm, apic_id, node);
@@ -241,7 +187,7 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
241 } 187 }
242 188
243 if (changed) { 189 if (changed) {
244 node_set(node, cpu_nodes_parsed); 190 node_set(node, numa_nodes_parsed);
245 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", 191 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
246 nd->start, nd->end); 192 nd->start, nd->end);
247 } 193 }
@@ -251,10 +197,8 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
251void __init 197void __init
252acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) 198acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
253{ 199{
254 struct bootnode *nd, oldnode;
255 unsigned long start, end; 200 unsigned long start, end;
256 int node, pxm; 201 int node, pxm;
257 int i;
258 202
259 if (srat_disabled()) 203 if (srat_disabled())
260 return; 204 return;
@@ -276,300 +220,31 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
276 bad_srat(); 220 bad_srat();
277 return; 221 return;
278 } 222 }
279 i = conflicting_memblks(start, end); 223
280 if (i == node) { 224 if (numa_add_memblk(node, start, end) < 0) {
281 printk(KERN_WARNING
282 "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
283 pxm, start, end, nodes[i].start, nodes[i].end);
284 } else if (i >= 0) {
285 printk(KERN_ERR
286 "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
287 pxm, start, end, node_to_pxm(i),
288 nodes[i].start, nodes[i].end);
289 bad_srat(); 225 bad_srat();
290 return; 226 return;
291 } 227 }
292 nd = &nodes[node];
293 oldnode = *nd;
294 if (!node_test_and_set(node, nodes_parsed)) {
295 nd->start = start;
296 nd->end = end;
297 } else {
298 if (start < nd->start)
299 nd->start = start;
300 if (nd->end < end)
301 nd->end = end;
302 }
303 228
304 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, 229 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
305 start, end); 230 start, end);
306 231
307 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { 232 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
308 update_nodes_add(node, start, end); 233 update_nodes_add(node, start, end);
309 /* restore nodes[node] */
310 *nd = oldnode;
311 if ((nd->start | nd->end) == 0)
312 node_clear(node, nodes_parsed);
313 }
314
315 node_memblk_range[num_node_memblks].start = start;
316 node_memblk_range[num_node_memblks].end = end;
317 memblk_nodeid[num_node_memblks] = node;
318 num_node_memblks++;
319}
320
321/* Sanity check to catch more bad SRATs (they are amazingly common).
322 Make sure the PXMs cover all memory. */
323static int __init nodes_cover_memory(const struct bootnode *nodes)
324{
325 int i;
326 unsigned long pxmram, e820ram;
327
328 pxmram = 0;
329 for_each_node_mask(i, nodes_parsed) {
330 unsigned long s = nodes[i].start >> PAGE_SHIFT;
331 unsigned long e = nodes[i].end >> PAGE_SHIFT;
332 pxmram += e - s;
333 pxmram -= __absent_pages_in_range(i, s, e);
334 if ((long)pxmram < 0)
335 pxmram = 0;
336 }
337
338 e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
339 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
340 if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
341 printk(KERN_ERR
342 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
343 (pxmram << PAGE_SHIFT) >> 20,
344 (e820ram << PAGE_SHIFT) >> 20);
345 return 0;
346 }
347 return 1;
348} 234}
349 235
350void __init acpi_numa_arch_fixup(void) {} 236void __init acpi_numa_arch_fixup(void) {}
351 237
352#ifdef CONFIG_NUMA_EMU 238int __init x86_acpi_numa_init(void)
353void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
354 unsigned long end)
355{
356 int i;
357
358 for_each_node_mask(i, nodes_parsed) {
359 cutoff_node(i, start, end);
360 physnodes[i].start = nodes[i].start;
361 physnodes[i].end = nodes[i].end;
362 }
363}
364#endif /* CONFIG_NUMA_EMU */
365
366/* Use the information discovered above to actually set up the nodes. */
367int __init acpi_scan_nodes(unsigned long start, unsigned long end)
368{ 239{
369 int i; 240 int ret;
370
371 if (acpi_numa <= 0)
372 return -1;
373
374 /* First clean up the node list */
375 for (i = 0; i < MAX_NUMNODES; i++)
376 cutoff_node(i, start, end);
377
378 /*
379 * Join together blocks on the same node, holes between
380 * which don't overlap with memory on other nodes.
381 */
382 for (i = 0; i < num_node_memblks; ++i) {
383 int j, k;
384
385 for (j = i + 1; j < num_node_memblks; ++j) {
386 unsigned long start, end;
387
388 if (memblk_nodeid[i] != memblk_nodeid[j])
389 continue;
390 start = min(node_memblk_range[i].end,
391 node_memblk_range[j].end);
392 end = max(node_memblk_range[i].start,
393 node_memblk_range[j].start);
394 for (k = 0; k < num_node_memblks; ++k) {
395 if (memblk_nodeid[i] == memblk_nodeid[k])
396 continue;
397 if (start < node_memblk_range[k].end &&
398 end > node_memblk_range[k].start)
399 break;
400 }
401 if (k < num_node_memblks)
402 continue;
403 start = min(node_memblk_range[i].start,
404 node_memblk_range[j].start);
405 end = max(node_memblk_range[i].end,
406 node_memblk_range[j].end);
407 printk(KERN_INFO "SRAT: Node %d "
408 "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
409 memblk_nodeid[i],
410 node_memblk_range[i].start,
411 node_memblk_range[i].end,
412 node_memblk_range[j].start,
413 node_memblk_range[j].end,
414 start, end);
415 node_memblk_range[i].start = start;
416 node_memblk_range[i].end = end;
417 k = --num_node_memblks - j;
418 memmove(memblk_nodeid + j, memblk_nodeid + j+1,
419 k * sizeof(*memblk_nodeid));
420 memmove(node_memblk_range + j, node_memblk_range + j+1,
421 k * sizeof(*node_memblk_range));
422 --j;
423 }
424 }
425
426 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
427 memblk_nodeid);
428 if (memnode_shift < 0) {
429 printk(KERN_ERR
430 "SRAT: No NUMA node hash function found. Contact maintainer\n");
431 bad_srat();
432 return -1;
433 }
434
435 for (i = 0; i < num_node_memblks; i++)
436 memblock_x86_register_active_regions(memblk_nodeid[i],
437 node_memblk_range[i].start >> PAGE_SHIFT,
438 node_memblk_range[i].end >> PAGE_SHIFT);
439
440 /* for out of order entries in SRAT */
441 sort_node_map();
442 if (!nodes_cover_memory(nodes)) {
443 bad_srat();
444 return -1;
445 }
446 241
447 /* Account for nodes with cpus and no memory */ 242 ret = acpi_numa_init();
448 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); 243 if (ret < 0)
449 244 return ret;
450 /* Finally register nodes */ 245 return srat_disabled() ? -EINVAL : 0;
451 for_each_node_mask(i, node_possible_map)
452 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
453 /* Try again in case setup_node_bootmem missed one due
454 to missing bootmem */
455 for_each_node_mask(i, node_possible_map)
456 if (!node_online(i))
457 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
458
459 for (i = 0; i < nr_cpu_ids; i++) {
460 int node = early_cpu_to_node(i);
461
462 if (node == NUMA_NO_NODE)
463 continue;
464 if (!node_online(node))
465 numa_clear_node(i);
466 }
467 numa_init_array();
468 return 0;
469}
470
471#ifdef CONFIG_NUMA_EMU
472static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
473 [0 ... MAX_NUMNODES-1] = PXM_INVAL
474};
475static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
476 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
477};
478static int __init find_node_by_addr(unsigned long addr)
479{
480 int ret = NUMA_NO_NODE;
481 int i;
482
483 for_each_node_mask(i, nodes_parsed) {
484 /*
485 * Find the real node that this emulated node appears on. For
486 * the sake of simplicity, we only use a real node's starting
487 * address to determine which emulated node it appears on.
488 */
489 if (addr >= nodes[i].start && addr < nodes[i].end) {
490 ret = i;
491 break;
492 }
493 }
494 return ret;
495} 246}
496 247
497/*
498 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
499 * mappings that respect the real ACPI topology but reflect our emulated
500 * environment. For each emulated node, we find which real node it appears on
501 * and create PXM to NID mappings for those fake nodes which mirror that
502 * locality. SLIT will now represent the correct distances between emulated
503 * nodes as a result of the real topology.
504 */
505void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
506{
507 int i, j;
508
509 for (i = 0; i < num_nodes; i++) {
510 int nid, pxm;
511
512 nid = find_node_by_addr(fake_nodes[i].start);
513 if (nid == NUMA_NO_NODE)
514 continue;
515 pxm = node_to_pxm(nid);
516 if (pxm == PXM_INVAL)
517 continue;
518 fake_node_to_pxm_map[i] = pxm;
519 /*
520 * For each apicid_to_node mapping that exists for this real
521 * node, it must now point to the fake node ID.
522 */
523 for (j = 0; j < MAX_LOCAL_APIC; j++)
524 if (apicid_to_node[j] == nid &&
525 fake_apicid_to_node[j] == NUMA_NO_NODE)
526 fake_apicid_to_node[j] = i;
527 }
528
529 /*
530 * If there are apicid-to-node mappings for physical nodes that do not
531 * have a corresponding emulated node, it should default to a guaranteed
532 * value.
533 */
534 for (i = 0; i < MAX_LOCAL_APIC; i++)
535 if (apicid_to_node[i] != NUMA_NO_NODE &&
536 fake_apicid_to_node[i] == NUMA_NO_NODE)
537 fake_apicid_to_node[i] = 0;
538
539 for (i = 0; i < num_nodes; i++)
540 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
541 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
542
543 nodes_clear(nodes_parsed);
544 for (i = 0; i < num_nodes; i++)
545 if (fake_nodes[i].start != fake_nodes[i].end)
546 node_set(i, nodes_parsed);
547}
548
549static int null_slit_node_compare(int a, int b)
550{
551 return node_to_pxm(a) == node_to_pxm(b);
552}
553#else
554static int null_slit_node_compare(int a, int b)
555{
556 return a == b;
557}
558#endif /* CONFIG_NUMA_EMU */
559
560int __node_distance(int a, int b)
561{
562 int index;
563
564 if (!acpi_slit)
565 return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
566 REMOTE_DISTANCE;
567 index = acpi_slit->locality_count * node_to_pxm(a);
568 return acpi_slit->entry[index + node_to_pxm(b)];
569}
570
571EXPORT_SYMBOL(__node_distance);
572
573#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) 248#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
574int memory_add_physaddr_to_nid(u64 start) 249int memory_add_physaddr_to_nid(u64 start)
575{ 250{
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6acc724d5d8f..d6c0418c3e47 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -179,12 +179,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
179 sender = this_cpu_read(tlb_vector_offset); 179 sender = this_cpu_read(tlb_vector_offset);
180 f = &flush_state[sender]; 180 f = &flush_state[sender];
181 181
182 /* 182 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
183 * Could avoid this lock when 183 raw_spin_lock(&f->tlbstate_lock);
184 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
185 * probably not worth checking this for a cache-hot lock.
186 */
187 raw_spin_lock(&f->tlbstate_lock);
188 184
189 f->flush_mm = mm; 185 f->flush_mm = mm;
190 f->flush_va = va; 186 f->flush_va = va;
@@ -202,7 +198,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
202 198
203 f->flush_mm = NULL; 199 f->flush_mm = NULL;
204 f->flush_va = 0; 200 f->flush_va = 0;
205 raw_spin_unlock(&f->tlbstate_lock); 201 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
202 raw_spin_unlock(&f->tlbstate_lock);
206} 203}
207 204
208void native_flush_tlb_others(const struct cpumask *cpumask, 205void native_flush_tlb_others(const struct cpumask *cpumask,
@@ -211,11 +208,10 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
211 if (is_uv_system()) { 208 if (is_uv_system()) {
212 unsigned int cpu; 209 unsigned int cpu;
213 210
214 cpu = get_cpu(); 211 cpu = smp_processor_id();
215 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); 212 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
216 if (cpumask) 213 if (cpumask)
217 flush_tlb_others_ipi(cpumask, mm, va); 214 flush_tlb_others_ipi(cpumask, mm, va);
218 put_cpu();
219 return; 215 return;
220 } 216 }
221 flush_tlb_others_ipi(cpumask, mm, va); 217 flush_tlb_others_ipi(cpumask, mm, va);