diff options
Diffstat (limited to 'arch/x86/mm/numa_64.c')
-rw-r--r-- | arch/x86/mm/numa_64.c | 274 |
1 files changed, 124 insertions, 150 deletions
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 3d6926ba8995..dc3b1f7e1451 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Generic VM initialization for x86-64 NUMA setups. | 2 | * Generic VM initialization for x86-64 NUMA setups. |
3 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. | 3 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. |
4 | */ | 4 | */ |
5 | #include <linux/kernel.h> | 5 | #include <linux/kernel.h> |
6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
7 | #include <linux/string.h> | 7 | #include <linux/string.h> |
@@ -11,35 +11,45 @@ | |||
11 | #include <linux/ctype.h> | 11 | #include <linux/ctype.h> |
12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
13 | #include <linux/nodemask.h> | 13 | #include <linux/nodemask.h> |
14 | #include <linux/sched.h> | ||
14 | 15 | ||
15 | #include <asm/e820.h> | 16 | #include <asm/e820.h> |
16 | #include <asm/proto.h> | 17 | #include <asm/proto.h> |
17 | #include <asm/dma.h> | 18 | #include <asm/dma.h> |
18 | #include <asm/numa.h> | 19 | #include <asm/numa.h> |
19 | #include <asm/acpi.h> | 20 | #include <asm/acpi.h> |
21 | #include <asm/k8.h> | ||
20 | 22 | ||
21 | #ifndef Dprintk | 23 | #ifndef Dprintk |
22 | #define Dprintk(x...) | 24 | #define Dprintk(x...) |
23 | #endif | 25 | #endif |
24 | 26 | ||
25 | struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; | 27 | struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; |
28 | EXPORT_SYMBOL(node_data); | ||
29 | |||
26 | bootmem_data_t plat_node_bdata[MAX_NUMNODES]; | 30 | bootmem_data_t plat_node_bdata[MAX_NUMNODES]; |
27 | 31 | ||
28 | struct memnode memnode; | 32 | struct memnode memnode; |
29 | 33 | ||
30 | unsigned char cpu_to_node[NR_CPUS] __read_mostly = { | 34 | int x86_cpu_to_node_map_init[NR_CPUS] = { |
31 | [0 ... NR_CPUS-1] = NUMA_NO_NODE | 35 | [0 ... NR_CPUS-1] = NUMA_NO_NODE |
32 | }; | 36 | }; |
33 | unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { | 37 | void *x86_cpu_to_node_map_early_ptr; |
34 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | 38 | DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; |
39 | EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map); | ||
40 | EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr); | ||
41 | |||
42 | s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { | ||
43 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | ||
35 | }; | 44 | }; |
36 | cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; | 45 | |
46 | cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly; | ||
47 | EXPORT_SYMBOL(node_to_cpumask_map); | ||
37 | 48 | ||
38 | int numa_off __initdata; | 49 | int numa_off __initdata; |
39 | unsigned long __initdata nodemap_addr; | 50 | unsigned long __initdata nodemap_addr; |
40 | unsigned long __initdata nodemap_size; | 51 | unsigned long __initdata nodemap_size; |
41 | 52 | ||
42 | |||
43 | /* | 53 | /* |
44 | * Given a shift value, try to populate memnodemap[] | 54 | * Given a shift value, try to populate memnodemap[] |
45 | * Returns : | 55 | * Returns : |
@@ -47,14 +57,13 @@ unsigned long __initdata nodemap_size; | |||
47 | * 0 if memnodmap[] too small (of shift too small) | 57 | * 0 if memnodmap[] too small (of shift too small) |
48 | * -1 if node overlap or lost ram (shift too big) | 58 | * -1 if node overlap or lost ram (shift too big) |
49 | */ | 59 | */ |
50 | static int __init | 60 | static int __init populate_memnodemap(const struct bootnode *nodes, |
51 | populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) | 61 | int numnodes, int shift) |
52 | { | 62 | { |
53 | int i; | ||
54 | int res = -1; | ||
55 | unsigned long addr, end; | 63 | unsigned long addr, end; |
64 | int i, res = -1; | ||
56 | 65 | ||
57 | memset(memnodemap, 0xff, memnodemapsize); | 66 | memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); |
58 | for (i = 0; i < numnodes; i++) { | 67 | for (i = 0; i < numnodes; i++) { |
59 | addr = nodes[i].start; | 68 | addr = nodes[i].start; |
60 | end = nodes[i].end; | 69 | end = nodes[i].end; |
@@ -63,13 +72,13 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) | |||
63 | if ((end >> shift) >= memnodemapsize) | 72 | if ((end >> shift) >= memnodemapsize) |
64 | return 0; | 73 | return 0; |
65 | do { | 74 | do { |
66 | if (memnodemap[addr >> shift] != 0xff) | 75 | if (memnodemap[addr >> shift] != NUMA_NO_NODE) |
67 | return -1; | 76 | return -1; |
68 | memnodemap[addr >> shift] = i; | 77 | memnodemap[addr >> shift] = i; |
69 | addr += (1UL << shift); | 78 | addr += (1UL << shift); |
70 | } while (addr < end); | 79 | } while (addr < end); |
71 | res = 1; | 80 | res = 1; |
72 | } | 81 | } |
73 | return res; | 82 | return res; |
74 | } | 83 | } |
75 | 84 | ||
@@ -78,12 +87,12 @@ static int __init allocate_cachealigned_memnodemap(void) | |||
78 | unsigned long pad, pad_addr; | 87 | unsigned long pad, pad_addr; |
79 | 88 | ||
80 | memnodemap = memnode.embedded_map; | 89 | memnodemap = memnode.embedded_map; |
81 | if (memnodemapsize <= 48) | 90 | if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map)) |
82 | return 0; | 91 | return 0; |
83 | 92 | ||
84 | pad = L1_CACHE_BYTES - 1; | 93 | pad = L1_CACHE_BYTES - 1; |
85 | pad_addr = 0x8000; | 94 | pad_addr = 0x8000; |
86 | nodemap_size = pad + memnodemapsize; | 95 | nodemap_size = pad + sizeof(s16) * memnodemapsize; |
87 | nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT, | 96 | nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT, |
88 | nodemap_size); | 97 | nodemap_size); |
89 | if (nodemap_addr == -1UL) { | 98 | if (nodemap_addr == -1UL) { |
@@ -94,6 +103,7 @@ static int __init allocate_cachealigned_memnodemap(void) | |||
94 | } | 103 | } |
95 | pad_addr = (nodemap_addr + pad) & ~pad; | 104 | pad_addr = (nodemap_addr + pad) & ~pad; |
96 | memnodemap = phys_to_virt(pad_addr); | 105 | memnodemap = phys_to_virt(pad_addr); |
106 | reserve_early(nodemap_addr, nodemap_addr + nodemap_size); | ||
97 | 107 | ||
98 | printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", | 108 | printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", |
99 | nodemap_addr, nodemap_addr + nodemap_size); | 109 | nodemap_addr, nodemap_addr + nodemap_size); |
@@ -104,8 +114,8 @@ static int __init allocate_cachealigned_memnodemap(void) | |||
104 | * The LSB of all start and end addresses in the node map is the value of the | 114 | * The LSB of all start and end addresses in the node map is the value of the |
105 | * maximum possible shift. | 115 | * maximum possible shift. |
106 | */ | 116 | */ |
107 | static int __init | 117 | static int __init extract_lsb_from_nodes(const struct bootnode *nodes, |
108 | extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) | 118 | int numnodes) |
109 | { | 119 | { |
110 | int i, nodes_used = 0; | 120 | int i, nodes_used = 0; |
111 | unsigned long start, end; | 121 | unsigned long start, end; |
@@ -140,51 +150,50 @@ int __init compute_hash_shift(struct bootnode *nodes, int numnodes) | |||
140 | shift); | 150 | shift); |
141 | 151 | ||
142 | if (populate_memnodemap(nodes, numnodes, shift) != 1) { | 152 | if (populate_memnodemap(nodes, numnodes, shift) != 1) { |
143 | printk(KERN_INFO | 153 | printk(KERN_INFO "Your memory is not aligned you need to " |
144 | "Your memory is not aligned you need to rebuild your kernel " | 154 | "rebuild your kernel with a bigger NODEMAPSIZE " |
145 | "with a bigger NODEMAPSIZE shift=%d\n", | 155 | "shift=%d\n", shift); |
146 | shift); | ||
147 | return -1; | 156 | return -1; |
148 | } | 157 | } |
149 | return shift; | 158 | return shift; |
150 | } | 159 | } |
151 | 160 | ||
152 | #ifdef CONFIG_SPARSEMEM | ||
153 | int early_pfn_to_nid(unsigned long pfn) | 161 | int early_pfn_to_nid(unsigned long pfn) |
154 | { | 162 | { |
155 | return phys_to_nid(pfn << PAGE_SHIFT); | 163 | return phys_to_nid(pfn << PAGE_SHIFT); |
156 | } | 164 | } |
157 | #endif | ||
158 | 165 | ||
159 | static void * __init | 166 | static void * __init early_node_mem(int nodeid, unsigned long start, |
160 | early_node_mem(int nodeid, unsigned long start, unsigned long end, | 167 | unsigned long end, unsigned long size) |
161 | unsigned long size) | ||
162 | { | 168 | { |
163 | unsigned long mem = find_e820_area(start, end, size); | 169 | unsigned long mem = find_e820_area(start, end, size); |
164 | void *ptr; | 170 | void *ptr; |
171 | |||
165 | if (mem != -1L) | 172 | if (mem != -1L) |
166 | return __va(mem); | 173 | return __va(mem); |
167 | ptr = __alloc_bootmem_nopanic(size, | 174 | ptr = __alloc_bootmem_nopanic(size, |
168 | SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); | 175 | SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); |
169 | if (ptr == NULL) { | 176 | if (ptr == NULL) { |
170 | printk(KERN_ERR "Cannot find %lu bytes in node %d\n", | 177 | printk(KERN_ERR "Cannot find %lu bytes in node %d\n", |
171 | size, nodeid); | 178 | size, nodeid); |
172 | return NULL; | 179 | return NULL; |
173 | } | 180 | } |
174 | return ptr; | 181 | return ptr; |
175 | } | 182 | } |
176 | 183 | ||
177 | /* Initialize bootmem allocator for a node */ | 184 | /* Initialize bootmem allocator for a node */ |
178 | void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) | 185 | void __init setup_node_bootmem(int nodeid, unsigned long start, |
179 | { | 186 | unsigned long end) |
180 | unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; | 187 | { |
181 | unsigned long nodedata_phys; | 188 | unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size; |
189 | unsigned long bootmap_start, nodedata_phys; | ||
182 | void *bootmap; | 190 | void *bootmap; |
183 | const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); | 191 | const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); |
184 | 192 | ||
185 | start = round_up(start, ZONE_ALIGN); | 193 | start = round_up(start, ZONE_ALIGN); |
186 | 194 | ||
187 | printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); | 195 | printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, |
196 | start, end); | ||
188 | 197 | ||
189 | start_pfn = start >> PAGE_SHIFT; | 198 | start_pfn = start >> PAGE_SHIFT; |
190 | end_pfn = end >> PAGE_SHIFT; | 199 | end_pfn = end >> PAGE_SHIFT; |
@@ -200,75 +209,55 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en | |||
200 | NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; | 209 | NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; |
201 | 210 | ||
202 | /* Find a place for the bootmem map */ | 211 | /* Find a place for the bootmem map */ |
203 | bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); | 212 | bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); |
204 | bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); | 213 | bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); |
205 | bootmap = early_node_mem(nodeid, bootmap_start, end, | 214 | bootmap = early_node_mem(nodeid, bootmap_start, end, |
206 | bootmap_pages<<PAGE_SHIFT); | 215 | bootmap_pages<<PAGE_SHIFT); |
207 | if (bootmap == NULL) { | 216 | if (bootmap == NULL) { |
208 | if (nodedata_phys < start || nodedata_phys >= end) | 217 | if (nodedata_phys < start || nodedata_phys >= end) |
209 | free_bootmem((unsigned long)node_data[nodeid],pgdat_size); | 218 | free_bootmem((unsigned long)node_data[nodeid], |
219 | pgdat_size); | ||
210 | node_data[nodeid] = NULL; | 220 | node_data[nodeid] = NULL; |
211 | return; | 221 | return; |
212 | } | 222 | } |
213 | bootmap_start = __pa(bootmap); | 223 | bootmap_start = __pa(bootmap); |
214 | Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); | 224 | Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); |
215 | 225 | ||
216 | bootmap_size = init_bootmem_node(NODE_DATA(nodeid), | 226 | bootmap_size = init_bootmem_node(NODE_DATA(nodeid), |
217 | bootmap_start >> PAGE_SHIFT, | 227 | bootmap_start >> PAGE_SHIFT, |
218 | start_pfn, end_pfn); | 228 | start_pfn, end_pfn); |
219 | 229 | ||
220 | free_bootmem_with_active_regions(nodeid, end); | 230 | free_bootmem_with_active_regions(nodeid, end); |
221 | 231 | ||
222 | reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); | 232 | reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); |
223 | reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); | 233 | reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, |
234 | bootmap_pages<<PAGE_SHIFT); | ||
224 | #ifdef CONFIG_ACPI_NUMA | 235 | #ifdef CONFIG_ACPI_NUMA |
225 | srat_reserve_add_area(nodeid); | 236 | srat_reserve_add_area(nodeid); |
226 | #endif | 237 | #endif |
227 | node_set_online(nodeid); | 238 | node_set_online(nodeid); |
228 | } | 239 | } |
229 | |||
230 | /* Initialize final allocator for a zone */ | ||
231 | void __init setup_node_zones(int nodeid) | ||
232 | { | ||
233 | unsigned long start_pfn, end_pfn, memmapsize, limit; | ||
234 | |||
235 | start_pfn = node_start_pfn(nodeid); | ||
236 | end_pfn = node_end_pfn(nodeid); | ||
237 | |||
238 | Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n", | ||
239 | nodeid, start_pfn, end_pfn); | ||
240 | |||
241 | /* Try to allocate mem_map at end to not fill up precious <4GB | ||
242 | memory. */ | ||
243 | memmapsize = sizeof(struct page) * (end_pfn-start_pfn); | ||
244 | limit = end_pfn << PAGE_SHIFT; | ||
245 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | ||
246 | NODE_DATA(nodeid)->node_mem_map = | ||
247 | __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, | ||
248 | memmapsize, SMP_CACHE_BYTES, | ||
249 | round_down(limit - memmapsize, PAGE_SIZE), | ||
250 | limit); | ||
251 | #endif | ||
252 | } | ||
253 | 240 | ||
241 | /* | ||
242 | * There are unfortunately some poorly designed mainboards around that | ||
243 | * only connect memory to a single CPU. This breaks the 1:1 cpu->node | ||
244 | * mapping. To avoid this fill in the mapping for all possible CPUs, | ||
245 | * as the number of CPUs is not known yet. We round robin the existing | ||
246 | * nodes. | ||
247 | */ | ||
254 | void __init numa_init_array(void) | 248 | void __init numa_init_array(void) |
255 | { | 249 | { |
256 | int rr, i; | 250 | int rr, i; |
257 | /* There are unfortunately some poorly designed mainboards around | 251 | |
258 | that only connect memory to a single CPU. This breaks the 1:1 cpu->node | ||
259 | mapping. To avoid this fill in the mapping for all possible | ||
260 | CPUs, as the number of CPUs is not known yet. | ||
261 | We round robin the existing nodes. */ | ||
262 | rr = first_node(node_online_map); | 252 | rr = first_node(node_online_map); |
263 | for (i = 0; i < NR_CPUS; i++) { | 253 | for (i = 0; i < NR_CPUS; i++) { |
264 | if (cpu_to_node(i) != NUMA_NO_NODE) | 254 | if (early_cpu_to_node(i) != NUMA_NO_NODE) |
265 | continue; | 255 | continue; |
266 | numa_set_node(i, rr); | 256 | numa_set_node(i, rr); |
267 | rr = next_node(rr, node_online_map); | 257 | rr = next_node(rr, node_online_map); |
268 | if (rr == MAX_NUMNODES) | 258 | if (rr == MAX_NUMNODES) |
269 | rr = first_node(node_online_map); | 259 | rr = first_node(node_online_map); |
270 | } | 260 | } |
271 | |||
272 | } | 261 | } |
273 | 262 | ||
274 | #ifdef CONFIG_NUMA_EMU | 263 | #ifdef CONFIG_NUMA_EMU |
@@ -276,15 +265,17 @@ void __init numa_init_array(void) | |||
276 | char *cmdline __initdata; | 265 | char *cmdline __initdata; |
277 | 266 | ||
278 | /* | 267 | /* |
279 | * Setups up nid to range from addr to addr + size. If the end boundary is | 268 | * Setups up nid to range from addr to addr + size. If the end |
280 | * greater than max_addr, then max_addr is used instead. The return value is 0 | 269 | * boundary is greater than max_addr, then max_addr is used instead. |
281 | * if there is additional memory left for allocation past addr and -1 otherwise. | 270 | * The return value is 0 if there is additional memory left for |
282 | * addr is adjusted to be at the end of the node. | 271 | * allocation past addr and -1 otherwise. addr is adjusted to be at |
272 | * the end of the node. | ||
283 | */ | 273 | */ |
284 | static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, | 274 | static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, |
285 | u64 size, u64 max_addr) | 275 | u64 size, u64 max_addr) |
286 | { | 276 | { |
287 | int ret = 0; | 277 | int ret = 0; |
278 | |||
288 | nodes[nid].start = *addr; | 279 | nodes[nid].start = *addr; |
289 | *addr += size; | 280 | *addr += size; |
290 | if (*addr >= max_addr) { | 281 | if (*addr >= max_addr) { |
@@ -335,6 +326,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, | |||
335 | 326 | ||
336 | for (i = node_start; i < num_nodes + node_start; i++) { | 327 | for (i = node_start; i < num_nodes + node_start; i++) { |
337 | u64 end = *addr + size; | 328 | u64 end = *addr + size; |
329 | |||
338 | if (i < big) | 330 | if (i < big) |
339 | end += FAKE_NODE_MIN_SIZE; | 331 | end += FAKE_NODE_MIN_SIZE; |
340 | /* | 332 | /* |
@@ -380,14 +372,9 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, | |||
380 | static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) | 372 | static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) |
381 | { | 373 | { |
382 | struct bootnode nodes[MAX_NUMNODES]; | 374 | struct bootnode nodes[MAX_NUMNODES]; |
383 | u64 addr = start_pfn << PAGE_SHIFT; | 375 | u64 size, addr = start_pfn << PAGE_SHIFT; |
384 | u64 max_addr = end_pfn << PAGE_SHIFT; | 376 | u64 max_addr = end_pfn << PAGE_SHIFT; |
385 | int num_nodes = 0; | 377 | int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; |
386 | int coeff_flag; | ||
387 | int coeff = -1; | ||
388 | int num = 0; | ||
389 | u64 size; | ||
390 | int i; | ||
391 | 378 | ||
392 | memset(&nodes, 0, sizeof(nodes)); | 379 | memset(&nodes, 0, sizeof(nodes)); |
393 | /* | 380 | /* |
@@ -395,8 +382,9 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) | |||
395 | * system RAM into N fake nodes. | 382 | * system RAM into N fake nodes. |
396 | */ | 383 | */ |
397 | if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { | 384 | if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { |
398 | num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, | 385 | long n = simple_strtol(cmdline, NULL, 0); |
399 | simple_strtol(cmdline, NULL, 0)); | 386 | |
387 | num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n); | ||
400 | if (num_nodes < 0) | 388 | if (num_nodes < 0) |
401 | return num_nodes; | 389 | return num_nodes; |
402 | goto out; | 390 | goto out; |
@@ -483,46 +471,47 @@ out: | |||
483 | for_each_node_mask(i, node_possible_map) { | 471 | for_each_node_mask(i, node_possible_map) { |
484 | e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, | 472 | e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, |
485 | nodes[i].end >> PAGE_SHIFT); | 473 | nodes[i].end >> PAGE_SHIFT); |
486 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | 474 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); |
487 | } | 475 | } |
488 | acpi_fake_nodes(nodes, num_nodes); | 476 | acpi_fake_nodes(nodes, num_nodes); |
489 | numa_init_array(); | 477 | numa_init_array(); |
490 | return 0; | 478 | return 0; |
491 | } | 479 | } |
492 | #endif /* CONFIG_NUMA_EMU */ | 480 | #endif /* CONFIG_NUMA_EMU */ |
493 | 481 | ||
494 | void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | 482 | void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) |
495 | { | 483 | { |
496 | int i; | 484 | int i; |
497 | 485 | ||
498 | nodes_clear(node_possible_map); | 486 | nodes_clear(node_possible_map); |
499 | 487 | ||
500 | #ifdef CONFIG_NUMA_EMU | 488 | #ifdef CONFIG_NUMA_EMU |
501 | if (cmdline && !numa_emulation(start_pfn, end_pfn)) | 489 | if (cmdline && !numa_emulation(start_pfn, end_pfn)) |
502 | return; | 490 | return; |
503 | nodes_clear(node_possible_map); | 491 | nodes_clear(node_possible_map); |
504 | #endif | 492 | #endif |
505 | 493 | ||
506 | #ifdef CONFIG_ACPI_NUMA | 494 | #ifdef CONFIG_ACPI_NUMA |
507 | if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, | 495 | if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, |
508 | end_pfn << PAGE_SHIFT)) | 496 | end_pfn << PAGE_SHIFT)) |
509 | return; | 497 | return; |
510 | nodes_clear(node_possible_map); | 498 | nodes_clear(node_possible_map); |
511 | #endif | 499 | #endif |
512 | 500 | ||
513 | #ifdef CONFIG_K8_NUMA | 501 | #ifdef CONFIG_K8_NUMA |
514 | if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) | 502 | if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, |
503 | end_pfn<<PAGE_SHIFT)) | ||
515 | return; | 504 | return; |
516 | nodes_clear(node_possible_map); | 505 | nodes_clear(node_possible_map); |
517 | #endif | 506 | #endif |
518 | printk(KERN_INFO "%s\n", | 507 | printk(KERN_INFO "%s\n", |
519 | numa_off ? "NUMA turned off" : "No NUMA configuration found"); | 508 | numa_off ? "NUMA turned off" : "No NUMA configuration found"); |
520 | 509 | ||
521 | printk(KERN_INFO "Faking a node at %016lx-%016lx\n", | 510 | printk(KERN_INFO "Faking a node at %016lx-%016lx\n", |
522 | start_pfn << PAGE_SHIFT, | 511 | start_pfn << PAGE_SHIFT, |
523 | end_pfn << PAGE_SHIFT); | 512 | end_pfn << PAGE_SHIFT); |
524 | /* setup dummy node covering all memory */ | 513 | /* setup dummy node covering all memory */ |
525 | memnode_shift = 63; | 514 | memnode_shift = 63; |
526 | memnodemap = memnode.embedded_map; | 515 | memnodemap = memnode.embedded_map; |
527 | memnodemap[0] = 0; | 516 | memnodemap[0] = 0; |
528 | nodes_clear(node_online_map); | 517 | nodes_clear(node_online_map); |
@@ -530,36 +519,48 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | |||
530 | node_set(0, node_possible_map); | 519 | node_set(0, node_possible_map); |
531 | for (i = 0; i < NR_CPUS; i++) | 520 | for (i = 0; i < NR_CPUS; i++) |
532 | numa_set_node(i, 0); | 521 | numa_set_node(i, 0); |
533 | node_to_cpumask[0] = cpumask_of_cpu(0); | 522 | /* cpumask_of_cpu() may not be available during early startup */ |
523 | memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0])); | ||
524 | cpu_set(0, node_to_cpumask_map[0]); | ||
534 | e820_register_active_regions(0, start_pfn, end_pfn); | 525 | e820_register_active_regions(0, start_pfn, end_pfn); |
535 | setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); | 526 | setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); |
536 | } | 527 | } |
537 | 528 | ||
538 | __cpuinit void numa_add_cpu(int cpu) | 529 | __cpuinit void numa_add_cpu(int cpu) |
539 | { | 530 | { |
540 | set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); | 531 | set_bit(cpu, |
541 | } | 532 | (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]); |
533 | } | ||
542 | 534 | ||
543 | void __cpuinit numa_set_node(int cpu, int node) | 535 | void __cpuinit numa_set_node(int cpu, int node) |
544 | { | 536 | { |
537 | int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr; | ||
538 | |||
545 | cpu_pda(cpu)->nodenumber = node; | 539 | cpu_pda(cpu)->nodenumber = node; |
546 | cpu_to_node(cpu) = node; | 540 | |
541 | if(cpu_to_node_map) | ||
542 | cpu_to_node_map[cpu] = node; | ||
543 | else if(per_cpu_offset(cpu)) | ||
544 | per_cpu(x86_cpu_to_node_map, cpu) = node; | ||
545 | else | ||
546 | Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); | ||
547 | } | 547 | } |
548 | 548 | ||
549 | unsigned long __init numa_free_all_bootmem(void) | 549 | unsigned long __init numa_free_all_bootmem(void) |
550 | { | 550 | { |
551 | int i; | ||
552 | unsigned long pages = 0; | 551 | unsigned long pages = 0; |
553 | for_each_online_node(i) { | 552 | int i; |
553 | |||
554 | for_each_online_node(i) | ||
554 | pages += free_all_bootmem_node(NODE_DATA(i)); | 555 | pages += free_all_bootmem_node(NODE_DATA(i)); |
555 | } | 556 | |
556 | return pages; | 557 | return pages; |
557 | } | 558 | } |
558 | 559 | ||
559 | void __init paging_init(void) | 560 | void __init paging_init(void) |
560 | { | 561 | { |
561 | int i; | ||
562 | unsigned long max_zone_pfns[MAX_NR_ZONES]; | 562 | unsigned long max_zone_pfns[MAX_NR_ZONES]; |
563 | |||
563 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); | 564 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); |
564 | max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; | 565 | max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; |
565 | max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; | 566 | max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; |
@@ -568,32 +569,27 @@ void __init paging_init(void) | |||
568 | sparse_memory_present_with_active_regions(MAX_NUMNODES); | 569 | sparse_memory_present_with_active_regions(MAX_NUMNODES); |
569 | sparse_init(); | 570 | sparse_init(); |
570 | 571 | ||
571 | for_each_online_node(i) { | ||
572 | setup_node_zones(i); | ||
573 | } | ||
574 | |||
575 | free_area_init_nodes(max_zone_pfns); | 572 | free_area_init_nodes(max_zone_pfns); |
576 | } | 573 | } |
577 | 574 | ||
578 | static __init int numa_setup(char *opt) | 575 | static __init int numa_setup(char *opt) |
579 | { | 576 | { |
580 | if (!opt) | 577 | if (!opt) |
581 | return -EINVAL; | 578 | return -EINVAL; |
582 | if (!strncmp(opt,"off",3)) | 579 | if (!strncmp(opt, "off", 3)) |
583 | numa_off = 1; | 580 | numa_off = 1; |
584 | #ifdef CONFIG_NUMA_EMU | 581 | #ifdef CONFIG_NUMA_EMU |
585 | if (!strncmp(opt, "fake=", 5)) | 582 | if (!strncmp(opt, "fake=", 5)) |
586 | cmdline = opt + 5; | 583 | cmdline = opt + 5; |
587 | #endif | 584 | #endif |
588 | #ifdef CONFIG_ACPI_NUMA | 585 | #ifdef CONFIG_ACPI_NUMA |
589 | if (!strncmp(opt,"noacpi",6)) | 586 | if (!strncmp(opt, "noacpi", 6)) |
590 | acpi_numa = -1; | 587 | acpi_numa = -1; |
591 | if (!strncmp(opt,"hotadd=", 7)) | 588 | if (!strncmp(opt, "hotadd=", 7)) |
592 | hotadd_percent = simple_strtoul(opt+7, NULL, 10); | 589 | hotadd_percent = simple_strtoul(opt+7, NULL, 10); |
593 | #endif | 590 | #endif |
594 | return 0; | 591 | return 0; |
595 | } | 592 | } |
596 | |||
597 | early_param("numa", numa_setup); | 593 | early_param("numa", numa_setup); |
598 | 594 | ||
599 | /* | 595 | /* |
@@ -611,38 +607,16 @@ early_param("numa", numa_setup); | |||
611 | void __init init_cpu_to_node(void) | 607 | void __init init_cpu_to_node(void) |
612 | { | 608 | { |
613 | int i; | 609 | int i; |
614 | for (i = 0; i < NR_CPUS; i++) { | 610 | |
615 | u8 apicid = x86_cpu_to_apicid_init[i]; | 611 | for (i = 0; i < NR_CPUS; i++) { |
612 | u16 apicid = x86_cpu_to_apicid_init[i]; | ||
613 | |||
616 | if (apicid == BAD_APICID) | 614 | if (apicid == BAD_APICID) |
617 | continue; | 615 | continue; |
618 | if (apicid_to_node[apicid] == NUMA_NO_NODE) | 616 | if (apicid_to_node[apicid] == NUMA_NO_NODE) |
619 | continue; | 617 | continue; |
620 | numa_set_node(i,apicid_to_node[apicid]); | 618 | numa_set_node(i, apicid_to_node[apicid]); |
621 | } | 619 | } |
622 | } | 620 | } |
623 | 621 | ||
624 | EXPORT_SYMBOL(cpu_to_node); | ||
625 | EXPORT_SYMBOL(node_to_cpumask); | ||
626 | EXPORT_SYMBOL(memnode); | ||
627 | EXPORT_SYMBOL(node_data); | ||
628 | |||
629 | #ifdef CONFIG_DISCONTIGMEM | ||
630 | /* | ||
631 | * Functions to convert PFNs from/to per node page addresses. | ||
632 | * These are out of line because they are quite big. | ||
633 | * They could be all tuned by pre caching more state. | ||
634 | * Should do that. | ||
635 | */ | ||
636 | 622 | ||
637 | int pfn_valid(unsigned long pfn) | ||
638 | { | ||
639 | unsigned nid; | ||
640 | if (pfn >= num_physpages) | ||
641 | return 0; | ||
642 | nid = pfn_to_nid(pfn); | ||
643 | if (nid == 0xff) | ||
644 | return 0; | ||
645 | return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid); | ||
646 | } | ||
647 | EXPORT_SYMBOL(pfn_valid); | ||
648 | #endif | ||