aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/mm/numa_64.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm/numa_64.c')
-rw-r--r--arch/x86/mm/numa_64.c307
1 files changed, 145 insertions, 162 deletions
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 3d6926ba899..5a02bf4c91e 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Generic VM initialization for x86-64 NUMA setups. 2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */ 4 */
5#include <linux/kernel.h> 5#include <linux/kernel.h>
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/string.h> 7#include <linux/string.h>
@@ -11,35 +11,45 @@
11#include <linux/ctype.h> 11#include <linux/ctype.h>
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/nodemask.h> 13#include <linux/nodemask.h>
14#include <linux/sched.h>
14 15
15#include <asm/e820.h> 16#include <asm/e820.h>
16#include <asm/proto.h> 17#include <asm/proto.h>
17#include <asm/dma.h> 18#include <asm/dma.h>
18#include <asm/numa.h> 19#include <asm/numa.h>
19#include <asm/acpi.h> 20#include <asm/acpi.h>
21#include <asm/k8.h>
20 22
21#ifndef Dprintk 23#ifndef Dprintk
22#define Dprintk(x...) 24#define Dprintk(x...)
23#endif 25#endif
24 26
25struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 27struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
28EXPORT_SYMBOL(node_data);
29
26bootmem_data_t plat_node_bdata[MAX_NUMNODES]; 30bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27 31
28struct memnode memnode; 32struct memnode memnode;
29 33
30unsigned char cpu_to_node[NR_CPUS] __read_mostly = { 34int x86_cpu_to_node_map_init[NR_CPUS] = {
31 [0 ... NR_CPUS-1] = NUMA_NO_NODE 35 [0 ... NR_CPUS-1] = NUMA_NO_NODE
32}; 36};
33unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 37void *x86_cpu_to_node_map_early_ptr;
34 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 38DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
39EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
40EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
41
42s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
43 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
35}; 44};
36cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; 45
46cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
47EXPORT_SYMBOL(node_to_cpumask_map);
37 48
38int numa_off __initdata; 49int numa_off __initdata;
39unsigned long __initdata nodemap_addr; 50unsigned long __initdata nodemap_addr;
40unsigned long __initdata nodemap_size; 51unsigned long __initdata nodemap_size;
41 52
42
43/* 53/*
44 * Given a shift value, try to populate memnodemap[] 54 * Given a shift value, try to populate memnodemap[]
45 * Returns : 55 * Returns :
@@ -47,14 +57,13 @@ unsigned long __initdata nodemap_size;
47 * 0 if memnodmap[] too small (of shift too small) 57 * 0 if memnodmap[] too small (of shift too small)
48 * -1 if node overlap or lost ram (shift too big) 58 * -1 if node overlap or lost ram (shift too big)
49 */ 59 */
50static int __init 60static int __init populate_memnodemap(const struct bootnode *nodes,
51populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) 61 int numnodes, int shift)
52{ 62{
53 int i;
54 int res = -1;
55 unsigned long addr, end; 63 unsigned long addr, end;
64 int i, res = -1;
56 65
57 memset(memnodemap, 0xff, memnodemapsize); 66 memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
58 for (i = 0; i < numnodes; i++) { 67 for (i = 0; i < numnodes; i++) {
59 addr = nodes[i].start; 68 addr = nodes[i].start;
60 end = nodes[i].end; 69 end = nodes[i].end;
@@ -63,37 +72,36 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
63 if ((end >> shift) >= memnodemapsize) 72 if ((end >> shift) >= memnodemapsize)
64 return 0; 73 return 0;
65 do { 74 do {
66 if (memnodemap[addr >> shift] != 0xff) 75 if (memnodemap[addr >> shift] != NUMA_NO_NODE)
67 return -1; 76 return -1;
68 memnodemap[addr >> shift] = i; 77 memnodemap[addr >> shift] = i;
69 addr += (1UL << shift); 78 addr += (1UL << shift);
70 } while (addr < end); 79 } while (addr < end);
71 res = 1; 80 res = 1;
72 } 81 }
73 return res; 82 return res;
74} 83}
75 84
76static int __init allocate_cachealigned_memnodemap(void) 85static int __init allocate_cachealigned_memnodemap(void)
77{ 86{
78 unsigned long pad, pad_addr; 87 unsigned long addr;
79 88
80 memnodemap = memnode.embedded_map; 89 memnodemap = memnode.embedded_map;
81 if (memnodemapsize <= 48) 90 if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
82 return 0; 91 return 0;
83 92
84 pad = L1_CACHE_BYTES - 1; 93 addr = 0x8000;
85 pad_addr = 0x8000; 94 nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
86 nodemap_size = pad + memnodemapsize; 95 nodemap_addr = find_e820_area(addr, end_pfn<<PAGE_SHIFT,
87 nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT, 96 nodemap_size, L1_CACHE_BYTES);
88 nodemap_size);
89 if (nodemap_addr == -1UL) { 97 if (nodemap_addr == -1UL) {
90 printk(KERN_ERR 98 printk(KERN_ERR
91 "NUMA: Unable to allocate Memory to Node hash map\n"); 99 "NUMA: Unable to allocate Memory to Node hash map\n");
92 nodemap_addr = nodemap_size = 0; 100 nodemap_addr = nodemap_size = 0;
93 return -1; 101 return -1;
94 } 102 }
95 pad_addr = (nodemap_addr + pad) & ~pad; 103 memnodemap = phys_to_virt(nodemap_addr);
96 memnodemap = phys_to_virt(pad_addr); 104 reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
97 105
98 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", 106 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
99 nodemap_addr, nodemap_addr + nodemap_size); 107 nodemap_addr, nodemap_addr + nodemap_size);
@@ -104,8 +112,8 @@ static int __init allocate_cachealigned_memnodemap(void)
104 * The LSB of all start and end addresses in the node map is the value of the 112 * The LSB of all start and end addresses in the node map is the value of the
105 * maximum possible shift. 113 * maximum possible shift.
106 */ 114 */
107static int __init 115static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
108extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) 116 int numnodes)
109{ 117{
110 int i, nodes_used = 0; 118 int i, nodes_used = 0;
111 unsigned long start, end; 119 unsigned long start, end;
@@ -140,59 +148,62 @@ int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
140 shift); 148 shift);
141 149
142 if (populate_memnodemap(nodes, numnodes, shift) != 1) { 150 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
143 printk(KERN_INFO 151 printk(KERN_INFO "Your memory is not aligned you need to "
144 "Your memory is not aligned you need to rebuild your kernel " 152 "rebuild your kernel with a bigger NODEMAPSIZE "
145 "with a bigger NODEMAPSIZE shift=%d\n", 153 "shift=%d\n", shift);
146 shift);
147 return -1; 154 return -1;
148 } 155 }
149 return shift; 156 return shift;
150} 157}
151 158
152#ifdef CONFIG_SPARSEMEM
153int early_pfn_to_nid(unsigned long pfn) 159int early_pfn_to_nid(unsigned long pfn)
154{ 160{
155 return phys_to_nid(pfn << PAGE_SHIFT); 161 return phys_to_nid(pfn << PAGE_SHIFT);
156} 162}
157#endif
158 163
159static void * __init 164static void * __init early_node_mem(int nodeid, unsigned long start,
160early_node_mem(int nodeid, unsigned long start, unsigned long end, 165 unsigned long end, unsigned long size,
161 unsigned long size) 166 unsigned long align)
162{ 167{
163 unsigned long mem = find_e820_area(start, end, size); 168 unsigned long mem = find_e820_area(start, end, size, align);
164 void *ptr; 169 void *ptr;
170
165 if (mem != -1L) 171 if (mem != -1L)
166 return __va(mem); 172 return __va(mem);
167 ptr = __alloc_bootmem_nopanic(size, 173
168 SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); 174 ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
169 if (ptr == NULL) { 175 if (ptr == NULL) {
170 printk(KERN_ERR "Cannot find %lu bytes in node %d\n", 176 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
171 size, nodeid); 177 size, nodeid);
172 return NULL; 178 return NULL;
173 } 179 }
174 return ptr; 180 return ptr;
175} 181}
176 182
177/* Initialize bootmem allocator for a node */ 183/* Initialize bootmem allocator for a node */
178void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) 184void __init setup_node_bootmem(int nodeid, unsigned long start,
179{ 185 unsigned long end)
180 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; 186{
181 unsigned long nodedata_phys; 187 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size;
188 unsigned long bootmap_start, nodedata_phys;
182 void *bootmap; 189 void *bootmap;
183 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 190 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
184 191
185 start = round_up(start, ZONE_ALIGN); 192 start = round_up(start, ZONE_ALIGN);
186 193
187 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); 194 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
195 start, end);
188 196
189 start_pfn = start >> PAGE_SHIFT; 197 start_pfn = start >> PAGE_SHIFT;
190 end_pfn = end >> PAGE_SHIFT; 198 end_pfn = end >> PAGE_SHIFT;
191 199
192 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size); 200 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
201 SMP_CACHE_BYTES);
193 if (node_data[nodeid] == NULL) 202 if (node_data[nodeid] == NULL)
194 return; 203 return;
195 nodedata_phys = __pa(node_data[nodeid]); 204 nodedata_phys = __pa(node_data[nodeid]);
205 printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
206 nodedata_phys + pgdat_size - 1);
196 207
197 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 208 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
198 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 209 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
@@ -200,75 +211,62 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
200 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; 211 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
201 212
202 /* Find a place for the bootmem map */ 213 /* Find a place for the bootmem map */
203 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 214 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
204 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 215 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
216 /*
217 * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like
218 * to use that to align to PAGE_SIZE
219 */
205 bootmap = early_node_mem(nodeid, bootmap_start, end, 220 bootmap = early_node_mem(nodeid, bootmap_start, end,
206 bootmap_pages<<PAGE_SHIFT); 221 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
207 if (bootmap == NULL) { 222 if (bootmap == NULL) {
208 if (nodedata_phys < start || nodedata_phys >= end) 223 if (nodedata_phys < start || nodedata_phys >= end)
209 free_bootmem((unsigned long)node_data[nodeid],pgdat_size); 224 free_bootmem((unsigned long)node_data[nodeid],
225 pgdat_size);
210 node_data[nodeid] = NULL; 226 node_data[nodeid] = NULL;
211 return; 227 return;
212 } 228 }
213 bootmap_start = __pa(bootmap); 229 bootmap_start = __pa(bootmap);
214 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); 230
215
216 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 231 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
217 bootmap_start >> PAGE_SHIFT, 232 bootmap_start >> PAGE_SHIFT,
218 start_pfn, end_pfn); 233 start_pfn, end_pfn);
234
235 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
236 bootmap_start, bootmap_start + bootmap_size - 1,
237 bootmap_pages);
219 238
220 free_bootmem_with_active_regions(nodeid, end); 239 free_bootmem_with_active_regions(nodeid, end);
221 240
222 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 241 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
223 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); 242 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
243 bootmap_pages<<PAGE_SHIFT);
224#ifdef CONFIG_ACPI_NUMA 244#ifdef CONFIG_ACPI_NUMA
225 srat_reserve_add_area(nodeid); 245 srat_reserve_add_area(nodeid);
226#endif 246#endif
227 node_set_online(nodeid); 247 node_set_online(nodeid);
228} 248}
229
230/* Initialize final allocator for a zone */
231void __init setup_node_zones(int nodeid)
232{
233 unsigned long start_pfn, end_pfn, memmapsize, limit;
234
235 start_pfn = node_start_pfn(nodeid);
236 end_pfn = node_end_pfn(nodeid);
237
238 Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
239 nodeid, start_pfn, end_pfn);
240
241 /* Try to allocate mem_map at end to not fill up precious <4GB
242 memory. */
243 memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
244 limit = end_pfn << PAGE_SHIFT;
245#ifdef CONFIG_FLAT_NODE_MEM_MAP
246 NODE_DATA(nodeid)->node_mem_map =
247 __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
248 memmapsize, SMP_CACHE_BYTES,
249 round_down(limit - memmapsize, PAGE_SIZE),
250 limit);
251#endif
252}
253 249
250/*
251 * There are unfortunately some poorly designed mainboards around that
252 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
253 * mapping. To avoid this fill in the mapping for all possible CPUs,
254 * as the number of CPUs is not known yet. We round robin the existing
255 * nodes.
256 */
254void __init numa_init_array(void) 257void __init numa_init_array(void)
255{ 258{
256 int rr, i; 259 int rr, i;
257 /* There are unfortunately some poorly designed mainboards around 260
258 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
259 mapping. To avoid this fill in the mapping for all possible
260 CPUs, as the number of CPUs is not known yet.
261 We round robin the existing nodes. */
262 rr = first_node(node_online_map); 261 rr = first_node(node_online_map);
263 for (i = 0; i < NR_CPUS; i++) { 262 for (i = 0; i < NR_CPUS; i++) {
264 if (cpu_to_node(i) != NUMA_NO_NODE) 263 if (early_cpu_to_node(i) != NUMA_NO_NODE)
265 continue; 264 continue;
266 numa_set_node(i, rr); 265 numa_set_node(i, rr);
267 rr = next_node(rr, node_online_map); 266 rr = next_node(rr, node_online_map);
268 if (rr == MAX_NUMNODES) 267 if (rr == MAX_NUMNODES)
269 rr = first_node(node_online_map); 268 rr = first_node(node_online_map);
270 } 269 }
271
272} 270}
273 271
274#ifdef CONFIG_NUMA_EMU 272#ifdef CONFIG_NUMA_EMU
@@ -276,15 +274,17 @@ void __init numa_init_array(void)
276char *cmdline __initdata; 274char *cmdline __initdata;
277 275
278/* 276/*
279 * Setups up nid to range from addr to addr + size. If the end boundary is 277 * Setups up nid to range from addr to addr + size. If the end
280 * greater than max_addr, then max_addr is used instead. The return value is 0 278 * boundary is greater than max_addr, then max_addr is used instead.
281 * if there is additional memory left for allocation past addr and -1 otherwise. 279 * The return value is 0 if there is additional memory left for
282 * addr is adjusted to be at the end of the node. 280 * allocation past addr and -1 otherwise. addr is adjusted to be at
281 * the end of the node.
283 */ 282 */
284static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, 283static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
285 u64 size, u64 max_addr) 284 u64 size, u64 max_addr)
286{ 285{
287 int ret = 0; 286 int ret = 0;
287
288 nodes[nid].start = *addr; 288 nodes[nid].start = *addr;
289 *addr += size; 289 *addr += size;
290 if (*addr >= max_addr) { 290 if (*addr >= max_addr) {
@@ -335,6 +335,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
335 335
336 for (i = node_start; i < num_nodes + node_start; i++) { 336 for (i = node_start; i < num_nodes + node_start; i++) {
337 u64 end = *addr + size; 337 u64 end = *addr + size;
338
338 if (i < big) 339 if (i < big)
339 end += FAKE_NODE_MIN_SIZE; 340 end += FAKE_NODE_MIN_SIZE;
340 /* 341 /*
@@ -380,14 +381,9 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
380static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 381static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
381{ 382{
382 struct bootnode nodes[MAX_NUMNODES]; 383 struct bootnode nodes[MAX_NUMNODES];
383 u64 addr = start_pfn << PAGE_SHIFT; 384 u64 size, addr = start_pfn << PAGE_SHIFT;
384 u64 max_addr = end_pfn << PAGE_SHIFT; 385 u64 max_addr = end_pfn << PAGE_SHIFT;
385 int num_nodes = 0; 386 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
386 int coeff_flag;
387 int coeff = -1;
388 int num = 0;
389 u64 size;
390 int i;
391 387
392 memset(&nodes, 0, sizeof(nodes)); 388 memset(&nodes, 0, sizeof(nodes));
393 /* 389 /*
@@ -395,8 +391,9 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
395 * system RAM into N fake nodes. 391 * system RAM into N fake nodes.
396 */ 392 */
397 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { 393 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
398 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, 394 long n = simple_strtol(cmdline, NULL, 0);
399 simple_strtol(cmdline, NULL, 0)); 395
396 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
400 if (num_nodes < 0) 397 if (num_nodes < 0)
401 return num_nodes; 398 return num_nodes;
402 goto out; 399 goto out;
@@ -483,46 +480,47 @@ out:
483 for_each_node_mask(i, node_possible_map) { 480 for_each_node_mask(i, node_possible_map) {
484 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, 481 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
485 nodes[i].end >> PAGE_SHIFT); 482 nodes[i].end >> PAGE_SHIFT);
486 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 483 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
487 } 484 }
488 acpi_fake_nodes(nodes, num_nodes); 485 acpi_fake_nodes(nodes, num_nodes);
489 numa_init_array(); 486 numa_init_array();
490 return 0; 487 return 0;
491} 488}
492#endif /* CONFIG_NUMA_EMU */ 489#endif /* CONFIG_NUMA_EMU */
493 490
494void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) 491void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
495{ 492{
496 int i; 493 int i;
497 494
498 nodes_clear(node_possible_map); 495 nodes_clear(node_possible_map);
499 496
500#ifdef CONFIG_NUMA_EMU 497#ifdef CONFIG_NUMA_EMU
501 if (cmdline && !numa_emulation(start_pfn, end_pfn)) 498 if (cmdline && !numa_emulation(start_pfn, end_pfn))
502 return; 499 return;
503 nodes_clear(node_possible_map); 500 nodes_clear(node_possible_map);
504#endif 501#endif
505 502
506#ifdef CONFIG_ACPI_NUMA 503#ifdef CONFIG_ACPI_NUMA
507 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 504 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
508 end_pfn << PAGE_SHIFT)) 505 end_pfn << PAGE_SHIFT))
509 return; 506 return;
510 nodes_clear(node_possible_map); 507 nodes_clear(node_possible_map);
511#endif 508#endif
512 509
513#ifdef CONFIG_K8_NUMA 510#ifdef CONFIG_K8_NUMA
514 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) 511 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
512 end_pfn<<PAGE_SHIFT))
515 return; 513 return;
516 nodes_clear(node_possible_map); 514 nodes_clear(node_possible_map);
517#endif 515#endif
518 printk(KERN_INFO "%s\n", 516 printk(KERN_INFO "%s\n",
519 numa_off ? "NUMA turned off" : "No NUMA configuration found"); 517 numa_off ? "NUMA turned off" : "No NUMA configuration found");
520 518
521 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 519 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
522 start_pfn << PAGE_SHIFT, 520 start_pfn << PAGE_SHIFT,
523 end_pfn << PAGE_SHIFT); 521 end_pfn << PAGE_SHIFT);
524 /* setup dummy node covering all memory */ 522 /* setup dummy node covering all memory */
525 memnode_shift = 63; 523 memnode_shift = 63;
526 memnodemap = memnode.embedded_map; 524 memnodemap = memnode.embedded_map;
527 memnodemap[0] = 0; 525 memnodemap[0] = 0;
528 nodes_clear(node_online_map); 526 nodes_clear(node_online_map);
@@ -530,36 +528,48 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
530 node_set(0, node_possible_map); 528 node_set(0, node_possible_map);
531 for (i = 0; i < NR_CPUS; i++) 529 for (i = 0; i < NR_CPUS; i++)
532 numa_set_node(i, 0); 530 numa_set_node(i, 0);
533 node_to_cpumask[0] = cpumask_of_cpu(0); 531 /* cpumask_of_cpu() may not be available during early startup */
532 memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0]));
533 cpu_set(0, node_to_cpumask_map[0]);
534 e820_register_active_regions(0, start_pfn, end_pfn); 534 e820_register_active_regions(0, start_pfn, end_pfn);
535 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); 535 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
536} 536}
537 537
538__cpuinit void numa_add_cpu(int cpu) 538__cpuinit void numa_add_cpu(int cpu)
539{ 539{
540 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); 540 set_bit(cpu,
541} 541 (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]);
542}
542 543
543void __cpuinit numa_set_node(int cpu, int node) 544void __cpuinit numa_set_node(int cpu, int node)
544{ 545{
546 int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
547
545 cpu_pda(cpu)->nodenumber = node; 548 cpu_pda(cpu)->nodenumber = node;
546 cpu_to_node(cpu) = node; 549
550 if(cpu_to_node_map)
551 cpu_to_node_map[cpu] = node;
552 else if(per_cpu_offset(cpu))
553 per_cpu(x86_cpu_to_node_map, cpu) = node;
554 else
555 Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
547} 556}
548 557
549unsigned long __init numa_free_all_bootmem(void) 558unsigned long __init numa_free_all_bootmem(void)
550{ 559{
551 int i;
552 unsigned long pages = 0; 560 unsigned long pages = 0;
553 for_each_online_node(i) { 561 int i;
562
563 for_each_online_node(i)
554 pages += free_all_bootmem_node(NODE_DATA(i)); 564 pages += free_all_bootmem_node(NODE_DATA(i));
555 } 565
556 return pages; 566 return pages;
557} 567}
558 568
559void __init paging_init(void) 569void __init paging_init(void)
560{ 570{
561 int i;
562 unsigned long max_zone_pfns[MAX_NR_ZONES]; 571 unsigned long max_zone_pfns[MAX_NR_ZONES];
572
563 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 573 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
564 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 574 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
565 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 575 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
@@ -568,32 +578,27 @@ void __init paging_init(void)
568 sparse_memory_present_with_active_regions(MAX_NUMNODES); 578 sparse_memory_present_with_active_regions(MAX_NUMNODES);
569 sparse_init(); 579 sparse_init();
570 580
571 for_each_online_node(i) {
572 setup_node_zones(i);
573 }
574
575 free_area_init_nodes(max_zone_pfns); 581 free_area_init_nodes(max_zone_pfns);
576} 582}
577 583
578static __init int numa_setup(char *opt) 584static __init int numa_setup(char *opt)
579{ 585{
580 if (!opt) 586 if (!opt)
581 return -EINVAL; 587 return -EINVAL;
582 if (!strncmp(opt,"off",3)) 588 if (!strncmp(opt, "off", 3))
583 numa_off = 1; 589 numa_off = 1;
584#ifdef CONFIG_NUMA_EMU 590#ifdef CONFIG_NUMA_EMU
585 if (!strncmp(opt, "fake=", 5)) 591 if (!strncmp(opt, "fake=", 5))
586 cmdline = opt + 5; 592 cmdline = opt + 5;
587#endif 593#endif
588#ifdef CONFIG_ACPI_NUMA 594#ifdef CONFIG_ACPI_NUMA
589 if (!strncmp(opt,"noacpi",6)) 595 if (!strncmp(opt, "noacpi", 6))
590 acpi_numa = -1; 596 acpi_numa = -1;
591 if (!strncmp(opt,"hotadd=", 7)) 597 if (!strncmp(opt, "hotadd=", 7))
592 hotadd_percent = simple_strtoul(opt+7, NULL, 10); 598 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
593#endif 599#endif
594 return 0; 600 return 0;
595} 601}
596
597early_param("numa", numa_setup); 602early_param("numa", numa_setup);
598 603
599/* 604/*
@@ -611,38 +616,16 @@ early_param("numa", numa_setup);
611void __init init_cpu_to_node(void) 616void __init init_cpu_to_node(void)
612{ 617{
613 int i; 618 int i;
614 for (i = 0; i < NR_CPUS; i++) { 619
615 u8 apicid = x86_cpu_to_apicid_init[i]; 620 for (i = 0; i < NR_CPUS; i++) {
621 u16 apicid = x86_cpu_to_apicid_init[i];
622
616 if (apicid == BAD_APICID) 623 if (apicid == BAD_APICID)
617 continue; 624 continue;
618 if (apicid_to_node[apicid] == NUMA_NO_NODE) 625 if (apicid_to_node[apicid] == NUMA_NO_NODE)
619 continue; 626 continue;
620 numa_set_node(i,apicid_to_node[apicid]); 627 numa_set_node(i, apicid_to_node[apicid]);
621 } 628 }
622} 629}
623 630
624EXPORT_SYMBOL(cpu_to_node);
625EXPORT_SYMBOL(node_to_cpumask);
626EXPORT_SYMBOL(memnode);
627EXPORT_SYMBOL(node_data);
628 631
629#ifdef CONFIG_DISCONTIGMEM
630/*
631 * Functions to convert PFNs from/to per node page addresses.
632 * These are out of line because they are quite big.
633 * They could be all tuned by pre caching more state.
634 * Should do that.
635 */
636
637int pfn_valid(unsigned long pfn)
638{
639 unsigned nid;
640 if (pfn >= num_physpages)
641 return 0;
642 nid = pfn_to_nid(pfn);
643 if (nid == 0xff)
644 return 0;
645 return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
646}
647EXPORT_SYMBOL(pfn_valid);
648#endif