diff options
Diffstat (limited to 'arch/x86/mm/numa_64.c')
-rw-r--r-- | arch/x86/mm/numa_64.c | 307 |
1 files changed, 145 insertions, 162 deletions
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 3d6926ba899..5a02bf4c91e 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Generic VM initialization for x86-64 NUMA setups. | 2 | * Generic VM initialization for x86-64 NUMA setups. |
3 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. | 3 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. |
4 | */ | 4 | */ |
5 | #include <linux/kernel.h> | 5 | #include <linux/kernel.h> |
6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
7 | #include <linux/string.h> | 7 | #include <linux/string.h> |
@@ -11,35 +11,45 @@ | |||
11 | #include <linux/ctype.h> | 11 | #include <linux/ctype.h> |
12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
13 | #include <linux/nodemask.h> | 13 | #include <linux/nodemask.h> |
14 | #include <linux/sched.h> | ||
14 | 15 | ||
15 | #include <asm/e820.h> | 16 | #include <asm/e820.h> |
16 | #include <asm/proto.h> | 17 | #include <asm/proto.h> |
17 | #include <asm/dma.h> | 18 | #include <asm/dma.h> |
18 | #include <asm/numa.h> | 19 | #include <asm/numa.h> |
19 | #include <asm/acpi.h> | 20 | #include <asm/acpi.h> |
21 | #include <asm/k8.h> | ||
20 | 22 | ||
21 | #ifndef Dprintk | 23 | #ifndef Dprintk |
22 | #define Dprintk(x...) | 24 | #define Dprintk(x...) |
23 | #endif | 25 | #endif |
24 | 26 | ||
25 | struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; | 27 | struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; |
28 | EXPORT_SYMBOL(node_data); | ||
29 | |||
26 | bootmem_data_t plat_node_bdata[MAX_NUMNODES]; | 30 | bootmem_data_t plat_node_bdata[MAX_NUMNODES]; |
27 | 31 | ||
28 | struct memnode memnode; | 32 | struct memnode memnode; |
29 | 33 | ||
30 | unsigned char cpu_to_node[NR_CPUS] __read_mostly = { | 34 | int x86_cpu_to_node_map_init[NR_CPUS] = { |
31 | [0 ... NR_CPUS-1] = NUMA_NO_NODE | 35 | [0 ... NR_CPUS-1] = NUMA_NO_NODE |
32 | }; | 36 | }; |
33 | unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { | 37 | void *x86_cpu_to_node_map_early_ptr; |
34 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | 38 | DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; |
39 | EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map); | ||
40 | EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr); | ||
41 | |||
42 | s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { | ||
43 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | ||
35 | }; | 44 | }; |
36 | cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; | 45 | |
46 | cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly; | ||
47 | EXPORT_SYMBOL(node_to_cpumask_map); | ||
37 | 48 | ||
38 | int numa_off __initdata; | 49 | int numa_off __initdata; |
39 | unsigned long __initdata nodemap_addr; | 50 | unsigned long __initdata nodemap_addr; |
40 | unsigned long __initdata nodemap_size; | 51 | unsigned long __initdata nodemap_size; |
41 | 52 | ||
42 | |||
43 | /* | 53 | /* |
44 | * Given a shift value, try to populate memnodemap[] | 54 | * Given a shift value, try to populate memnodemap[] |
45 | * Returns : | 55 | * Returns : |
@@ -47,14 +57,13 @@ unsigned long __initdata nodemap_size; | |||
47 | * 0 if memnodemap[] too small (or shift too small) | 57 | * 0 if memnodemap[] too small (or shift too small) |
48 | * -1 if node overlap or lost ram (shift too big) | 58 | * -1 if node overlap or lost ram (shift too big) |
49 | */ | 59 | */ |
50 | static int __init | 60 | static int __init populate_memnodemap(const struct bootnode *nodes, |
51 | populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) | 61 | int numnodes, int shift) |
52 | { | 62 | { |
53 | int i; | ||
54 | int res = -1; | ||
55 | unsigned long addr, end; | 63 | unsigned long addr, end; |
64 | int i, res = -1; | ||
56 | 65 | ||
57 | memset(memnodemap, 0xff, memnodemapsize); | 66 | memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); |
58 | for (i = 0; i < numnodes; i++) { | 67 | for (i = 0; i < numnodes; i++) { |
59 | addr = nodes[i].start; | 68 | addr = nodes[i].start; |
60 | end = nodes[i].end; | 69 | end = nodes[i].end; |
@@ -63,37 +72,36 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) | |||
63 | if ((end >> shift) >= memnodemapsize) | 72 | if ((end >> shift) >= memnodemapsize) |
64 | return 0; | 73 | return 0; |
65 | do { | 74 | do { |
66 | if (memnodemap[addr >> shift] != 0xff) | 75 | if (memnodemap[addr >> shift] != NUMA_NO_NODE) |
67 | return -1; | 76 | return -1; |
68 | memnodemap[addr >> shift] = i; | 77 | memnodemap[addr >> shift] = i; |
69 | addr += (1UL << shift); | 78 | addr += (1UL << shift); |
70 | } while (addr < end); | 79 | } while (addr < end); |
71 | res = 1; | 80 | res = 1; |
72 | } | 81 | } |
73 | return res; | 82 | return res; |
74 | } | 83 | } |
75 | 84 | ||
76 | static int __init allocate_cachealigned_memnodemap(void) | 85 | static int __init allocate_cachealigned_memnodemap(void) |
77 | { | 86 | { |
78 | unsigned long pad, pad_addr; | 87 | unsigned long addr; |
79 | 88 | ||
80 | memnodemap = memnode.embedded_map; | 89 | memnodemap = memnode.embedded_map; |
81 | if (memnodemapsize <= 48) | 90 | if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map)) |
82 | return 0; | 91 | return 0; |
83 | 92 | ||
84 | pad = L1_CACHE_BYTES - 1; | 93 | addr = 0x8000; |
85 | pad_addr = 0x8000; | 94 | nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); |
86 | nodemap_size = pad + memnodemapsize; | 95 | nodemap_addr = find_e820_area(addr, end_pfn<<PAGE_SHIFT, |
87 | nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT, | 96 | nodemap_size, L1_CACHE_BYTES); |
88 | nodemap_size); | ||
89 | if (nodemap_addr == -1UL) { | 97 | if (nodemap_addr == -1UL) { |
90 | printk(KERN_ERR | 98 | printk(KERN_ERR |
91 | "NUMA: Unable to allocate Memory to Node hash map\n"); | 99 | "NUMA: Unable to allocate Memory to Node hash map\n"); |
92 | nodemap_addr = nodemap_size = 0; | 100 | nodemap_addr = nodemap_size = 0; |
93 | return -1; | 101 | return -1; |
94 | } | 102 | } |
95 | pad_addr = (nodemap_addr + pad) & ~pad; | 103 | memnodemap = phys_to_virt(nodemap_addr); |
96 | memnodemap = phys_to_virt(pad_addr); | 104 | reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP"); |
97 | 105 | ||
98 | printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", | 106 | printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", |
99 | nodemap_addr, nodemap_addr + nodemap_size); | 107 | nodemap_addr, nodemap_addr + nodemap_size); |
@@ -104,8 +112,8 @@ static int __init allocate_cachealigned_memnodemap(void) | |||
104 | * The LSB of all start and end addresses in the node map is the value of the | 112 | * The LSB of all start and end addresses in the node map is the value of the |
105 | * maximum possible shift. | 113 | * maximum possible shift. |
106 | */ | 114 | */ |
107 | static int __init | 115 | static int __init extract_lsb_from_nodes(const struct bootnode *nodes, |
108 | extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) | 116 | int numnodes) |
109 | { | 117 | { |
110 | int i, nodes_used = 0; | 118 | int i, nodes_used = 0; |
111 | unsigned long start, end; | 119 | unsigned long start, end; |
@@ -140,59 +148,62 @@ int __init compute_hash_shift(struct bootnode *nodes, int numnodes) | |||
140 | shift); | 148 | shift); |
141 | 149 | ||
142 | if (populate_memnodemap(nodes, numnodes, shift) != 1) { | 150 | if (populate_memnodemap(nodes, numnodes, shift) != 1) { |
143 | printk(KERN_INFO | 151 | printk(KERN_INFO "Your memory is not aligned you need to " |
144 | "Your memory is not aligned you need to rebuild your kernel " | 152 | "rebuild your kernel with a bigger NODEMAPSIZE " |
145 | "with a bigger NODEMAPSIZE shift=%d\n", | 153 | "shift=%d\n", shift); |
146 | shift); | ||
147 | return -1; | 154 | return -1; |
148 | } | 155 | } |
149 | return shift; | 156 | return shift; |
150 | } | 157 | } |
151 | 158 | ||
152 | #ifdef CONFIG_SPARSEMEM | ||
153 | int early_pfn_to_nid(unsigned long pfn) | 159 | int early_pfn_to_nid(unsigned long pfn) |
154 | { | 160 | { |
155 | return phys_to_nid(pfn << PAGE_SHIFT); | 161 | return phys_to_nid(pfn << PAGE_SHIFT); |
156 | } | 162 | } |
157 | #endif | ||
158 | 163 | ||
159 | static void * __init | 164 | static void * __init early_node_mem(int nodeid, unsigned long start, |
160 | early_node_mem(int nodeid, unsigned long start, unsigned long end, | 165 | unsigned long end, unsigned long size, |
161 | unsigned long size) | 166 | unsigned long align) |
162 | { | 167 | { |
163 | unsigned long mem = find_e820_area(start, end, size); | 168 | unsigned long mem = find_e820_area(start, end, size, align); |
164 | void *ptr; | 169 | void *ptr; |
170 | |||
165 | if (mem != -1L) | 171 | if (mem != -1L) |
166 | return __va(mem); | 172 | return __va(mem); |
167 | ptr = __alloc_bootmem_nopanic(size, | 173 | |
168 | SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); | 174 | ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); |
169 | if (ptr == NULL) { | 175 | if (ptr == NULL) { |
170 | printk(KERN_ERR "Cannot find %lu bytes in node %d\n", | 176 | printk(KERN_ERR "Cannot find %lu bytes in node %d\n", |
171 | size, nodeid); | 177 | size, nodeid); |
172 | return NULL; | 178 | return NULL; |
173 | } | 179 | } |
174 | return ptr; | 180 | return ptr; |
175 | } | 181 | } |
176 | 182 | ||
177 | /* Initialize bootmem allocator for a node */ | 183 | /* Initialize bootmem allocator for a node */ |
178 | void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) | 184 | void __init setup_node_bootmem(int nodeid, unsigned long start, |
179 | { | 185 | unsigned long end) |
180 | unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; | 186 | { |
181 | unsigned long nodedata_phys; | 187 | unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size; |
188 | unsigned long bootmap_start, nodedata_phys; | ||
182 | void *bootmap; | 189 | void *bootmap; |
183 | const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); | 190 | const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); |
184 | 191 | ||
185 | start = round_up(start, ZONE_ALIGN); | 192 | start = round_up(start, ZONE_ALIGN); |
186 | 193 | ||
187 | printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); | 194 | printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, |
195 | start, end); | ||
188 | 196 | ||
189 | start_pfn = start >> PAGE_SHIFT; | 197 | start_pfn = start >> PAGE_SHIFT; |
190 | end_pfn = end >> PAGE_SHIFT; | 198 | end_pfn = end >> PAGE_SHIFT; |
191 | 199 | ||
192 | node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size); | 200 | node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, |
201 | SMP_CACHE_BYTES); | ||
193 | if (node_data[nodeid] == NULL) | 202 | if (node_data[nodeid] == NULL) |
194 | return; | 203 | return; |
195 | nodedata_phys = __pa(node_data[nodeid]); | 204 | nodedata_phys = __pa(node_data[nodeid]); |
205 | printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, | ||
206 | nodedata_phys + pgdat_size - 1); | ||
196 | 207 | ||
197 | memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); | 208 | memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); |
198 | NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; | 209 | NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; |
@@ -200,75 +211,62 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en | |||
200 | NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; | 211 | NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; |
201 | 212 | ||
202 | /* Find a place for the bootmem map */ | 213 | /* Find a place for the bootmem map */ |
203 | bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); | 214 | bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); |
204 | bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); | 215 | bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); |
216 | /* | ||
217 | * SMP_CACHE_BYTES could be enough, but init_bootmem_node likes | ||
218 | * to use that to align to PAGE_SIZE | ||
219 | */ | ||
205 | bootmap = early_node_mem(nodeid, bootmap_start, end, | 220 | bootmap = early_node_mem(nodeid, bootmap_start, end, |
206 | bootmap_pages<<PAGE_SHIFT); | 221 | bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); |
207 | if (bootmap == NULL) { | 222 | if (bootmap == NULL) { |
208 | if (nodedata_phys < start || nodedata_phys >= end) | 223 | if (nodedata_phys < start || nodedata_phys >= end) |
209 | free_bootmem((unsigned long)node_data[nodeid],pgdat_size); | 224 | free_bootmem((unsigned long)node_data[nodeid], |
225 | pgdat_size); | ||
210 | node_data[nodeid] = NULL; | 226 | node_data[nodeid] = NULL; |
211 | return; | 227 | return; |
212 | } | 228 | } |
213 | bootmap_start = __pa(bootmap); | 229 | bootmap_start = __pa(bootmap); |
214 | Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); | 230 | |
215 | |||
216 | bootmap_size = init_bootmem_node(NODE_DATA(nodeid), | 231 | bootmap_size = init_bootmem_node(NODE_DATA(nodeid), |
217 | bootmap_start >> PAGE_SHIFT, | 232 | bootmap_start >> PAGE_SHIFT, |
218 | start_pfn, end_pfn); | 233 | start_pfn, end_pfn); |
234 | |||
235 | printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", | ||
236 | bootmap_start, bootmap_start + bootmap_size - 1, | ||
237 | bootmap_pages); | ||
219 | 238 | ||
220 | free_bootmem_with_active_regions(nodeid, end); | 239 | free_bootmem_with_active_regions(nodeid, end); |
221 | 240 | ||
222 | reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); | 241 | reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); |
223 | reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); | 242 | reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, |
243 | bootmap_pages<<PAGE_SHIFT); | ||
224 | #ifdef CONFIG_ACPI_NUMA | 244 | #ifdef CONFIG_ACPI_NUMA |
225 | srat_reserve_add_area(nodeid); | 245 | srat_reserve_add_area(nodeid); |
226 | #endif | 246 | #endif |
227 | node_set_online(nodeid); | 247 | node_set_online(nodeid); |
228 | } | 248 | } |
229 | |||
230 | /* Initialize final allocator for a zone */ | ||
231 | void __init setup_node_zones(int nodeid) | ||
232 | { | ||
233 | unsigned long start_pfn, end_pfn, memmapsize, limit; | ||
234 | |||
235 | start_pfn = node_start_pfn(nodeid); | ||
236 | end_pfn = node_end_pfn(nodeid); | ||
237 | |||
238 | Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n", | ||
239 | nodeid, start_pfn, end_pfn); | ||
240 | |||
241 | /* Try to allocate mem_map at end to not fill up precious <4GB | ||
242 | memory. */ | ||
243 | memmapsize = sizeof(struct page) * (end_pfn-start_pfn); | ||
244 | limit = end_pfn << PAGE_SHIFT; | ||
245 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | ||
246 | NODE_DATA(nodeid)->node_mem_map = | ||
247 | __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, | ||
248 | memmapsize, SMP_CACHE_BYTES, | ||
249 | round_down(limit - memmapsize, PAGE_SIZE), | ||
250 | limit); | ||
251 | #endif | ||
252 | } | ||
253 | 249 | ||
250 | /* | ||
251 | * There are unfortunately some poorly designed mainboards around that | ||
252 | * only connect memory to a single CPU. This breaks the 1:1 cpu->node | ||
253 | * mapping. To avoid this fill in the mapping for all possible CPUs, | ||
254 | * as the number of CPUs is not known yet. We round robin the existing | ||
255 | * nodes. | ||
256 | */ | ||
254 | void __init numa_init_array(void) | 257 | void __init numa_init_array(void) |
255 | { | 258 | { |
256 | int rr, i; | 259 | int rr, i; |
257 | /* There are unfortunately some poorly designed mainboards around | 260 | |
258 | that only connect memory to a single CPU. This breaks the 1:1 cpu->node | ||
259 | mapping. To avoid this fill in the mapping for all possible | ||
260 | CPUs, as the number of CPUs is not known yet. | ||
261 | We round robin the existing nodes. */ | ||
262 | rr = first_node(node_online_map); | 261 | rr = first_node(node_online_map); |
263 | for (i = 0; i < NR_CPUS; i++) { | 262 | for (i = 0; i < NR_CPUS; i++) { |
264 | if (cpu_to_node(i) != NUMA_NO_NODE) | 263 | if (early_cpu_to_node(i) != NUMA_NO_NODE) |
265 | continue; | 264 | continue; |
266 | numa_set_node(i, rr); | 265 | numa_set_node(i, rr); |
267 | rr = next_node(rr, node_online_map); | 266 | rr = next_node(rr, node_online_map); |
268 | if (rr == MAX_NUMNODES) | 267 | if (rr == MAX_NUMNODES) |
269 | rr = first_node(node_online_map); | 268 | rr = first_node(node_online_map); |
270 | } | 269 | } |
271 | |||
272 | } | 270 | } |
273 | 271 | ||
274 | #ifdef CONFIG_NUMA_EMU | 272 | #ifdef CONFIG_NUMA_EMU |
@@ -276,15 +274,17 @@ void __init numa_init_array(void) | |||
276 | char *cmdline __initdata; | 274 | char *cmdline __initdata; |
277 | 275 | ||
278 | /* | 276 | /* |
279 | * Sets up nid to range from addr to addr + size. If the end boundary is | 277 | * Sets up nid to range from addr to addr + size. If the end |
280 | * greater than max_addr, then max_addr is used instead. The return value is 0 | 278 | * boundary is greater than max_addr, then max_addr is used instead. |
281 | * if there is additional memory left for allocation past addr and -1 otherwise. | 279 | * The return value is 0 if there is additional memory left for |
282 | * addr is adjusted to be at the end of the node. | 280 | * allocation past addr and -1 otherwise. addr is adjusted to be at |
281 | * the end of the node. | ||
283 | */ | 282 | */ |
284 | static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, | 283 | static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, |
285 | u64 size, u64 max_addr) | 284 | u64 size, u64 max_addr) |
286 | { | 285 | { |
287 | int ret = 0; | 286 | int ret = 0; |
287 | |||
288 | nodes[nid].start = *addr; | 288 | nodes[nid].start = *addr; |
289 | *addr += size; | 289 | *addr += size; |
290 | if (*addr >= max_addr) { | 290 | if (*addr >= max_addr) { |
@@ -335,6 +335,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, | |||
335 | 335 | ||
336 | for (i = node_start; i < num_nodes + node_start; i++) { | 336 | for (i = node_start; i < num_nodes + node_start; i++) { |
337 | u64 end = *addr + size; | 337 | u64 end = *addr + size; |
338 | |||
338 | if (i < big) | 339 | if (i < big) |
339 | end += FAKE_NODE_MIN_SIZE; | 340 | end += FAKE_NODE_MIN_SIZE; |
340 | /* | 341 | /* |
@@ -380,14 +381,9 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, | |||
380 | static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) | 381 | static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) |
381 | { | 382 | { |
382 | struct bootnode nodes[MAX_NUMNODES]; | 383 | struct bootnode nodes[MAX_NUMNODES]; |
383 | u64 addr = start_pfn << PAGE_SHIFT; | 384 | u64 size, addr = start_pfn << PAGE_SHIFT; |
384 | u64 max_addr = end_pfn << PAGE_SHIFT; | 385 | u64 max_addr = end_pfn << PAGE_SHIFT; |
385 | int num_nodes = 0; | 386 | int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; |
386 | int coeff_flag; | ||
387 | int coeff = -1; | ||
388 | int num = 0; | ||
389 | u64 size; | ||
390 | int i; | ||
391 | 387 | ||
392 | memset(&nodes, 0, sizeof(nodes)); | 388 | memset(&nodes, 0, sizeof(nodes)); |
393 | /* | 389 | /* |
@@ -395,8 +391,9 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) | |||
395 | * system RAM into N fake nodes. | 391 | * system RAM into N fake nodes. |
396 | */ | 392 | */ |
397 | if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { | 393 | if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { |
398 | num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, | 394 | long n = simple_strtol(cmdline, NULL, 0); |
399 | simple_strtol(cmdline, NULL, 0)); | 395 | |
396 | num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n); | ||
400 | if (num_nodes < 0) | 397 | if (num_nodes < 0) |
401 | return num_nodes; | 398 | return num_nodes; |
402 | goto out; | 399 | goto out; |
@@ -483,46 +480,47 @@ out: | |||
483 | for_each_node_mask(i, node_possible_map) { | 480 | for_each_node_mask(i, node_possible_map) { |
484 | e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, | 481 | e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, |
485 | nodes[i].end >> PAGE_SHIFT); | 482 | nodes[i].end >> PAGE_SHIFT); |
486 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | 483 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); |
487 | } | 484 | } |
488 | acpi_fake_nodes(nodes, num_nodes); | 485 | acpi_fake_nodes(nodes, num_nodes); |
489 | numa_init_array(); | 486 | numa_init_array(); |
490 | return 0; | 487 | return 0; |
491 | } | 488 | } |
492 | #endif /* CONFIG_NUMA_EMU */ | 489 | #endif /* CONFIG_NUMA_EMU */ |
493 | 490 | ||
494 | void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | 491 | void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) |
495 | { | 492 | { |
496 | int i; | 493 | int i; |
497 | 494 | ||
498 | nodes_clear(node_possible_map); | 495 | nodes_clear(node_possible_map); |
499 | 496 | ||
500 | #ifdef CONFIG_NUMA_EMU | 497 | #ifdef CONFIG_NUMA_EMU |
501 | if (cmdline && !numa_emulation(start_pfn, end_pfn)) | 498 | if (cmdline && !numa_emulation(start_pfn, end_pfn)) |
502 | return; | 499 | return; |
503 | nodes_clear(node_possible_map); | 500 | nodes_clear(node_possible_map); |
504 | #endif | 501 | #endif |
505 | 502 | ||
506 | #ifdef CONFIG_ACPI_NUMA | 503 | #ifdef CONFIG_ACPI_NUMA |
507 | if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, | 504 | if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, |
508 | end_pfn << PAGE_SHIFT)) | 505 | end_pfn << PAGE_SHIFT)) |
509 | return; | 506 | return; |
510 | nodes_clear(node_possible_map); | 507 | nodes_clear(node_possible_map); |
511 | #endif | 508 | #endif |
512 | 509 | ||
513 | #ifdef CONFIG_K8_NUMA | 510 | #ifdef CONFIG_K8_NUMA |
514 | if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) | 511 | if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, |
512 | end_pfn<<PAGE_SHIFT)) | ||
515 | return; | 513 | return; |
516 | nodes_clear(node_possible_map); | 514 | nodes_clear(node_possible_map); |
517 | #endif | 515 | #endif |
518 | printk(KERN_INFO "%s\n", | 516 | printk(KERN_INFO "%s\n", |
519 | numa_off ? "NUMA turned off" : "No NUMA configuration found"); | 517 | numa_off ? "NUMA turned off" : "No NUMA configuration found"); |
520 | 518 | ||
521 | printk(KERN_INFO "Faking a node at %016lx-%016lx\n", | 519 | printk(KERN_INFO "Faking a node at %016lx-%016lx\n", |
522 | start_pfn << PAGE_SHIFT, | 520 | start_pfn << PAGE_SHIFT, |
523 | end_pfn << PAGE_SHIFT); | 521 | end_pfn << PAGE_SHIFT); |
524 | /* setup dummy node covering all memory */ | 522 | /* setup dummy node covering all memory */ |
525 | memnode_shift = 63; | 523 | memnode_shift = 63; |
526 | memnodemap = memnode.embedded_map; | 524 | memnodemap = memnode.embedded_map; |
527 | memnodemap[0] = 0; | 525 | memnodemap[0] = 0; |
528 | nodes_clear(node_online_map); | 526 | nodes_clear(node_online_map); |
@@ -530,36 +528,48 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | |||
530 | node_set(0, node_possible_map); | 528 | node_set(0, node_possible_map); |
531 | for (i = 0; i < NR_CPUS; i++) | 529 | for (i = 0; i < NR_CPUS; i++) |
532 | numa_set_node(i, 0); | 530 | numa_set_node(i, 0); |
533 | node_to_cpumask[0] = cpumask_of_cpu(0); | 531 | /* cpumask_of_cpu() may not be available during early startup */ |
532 | memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0])); | ||
533 | cpu_set(0, node_to_cpumask_map[0]); | ||
534 | e820_register_active_regions(0, start_pfn, end_pfn); | 534 | e820_register_active_regions(0, start_pfn, end_pfn); |
535 | setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); | 535 | setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); |
536 | } | 536 | } |
537 | 537 | ||
538 | __cpuinit void numa_add_cpu(int cpu) | 538 | __cpuinit void numa_add_cpu(int cpu) |
539 | { | 539 | { |
540 | set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); | 540 | set_bit(cpu, |
541 | } | 541 | (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]); |
542 | } | ||
542 | 543 | ||
543 | void __cpuinit numa_set_node(int cpu, int node) | 544 | void __cpuinit numa_set_node(int cpu, int node) |
544 | { | 545 | { |
546 | int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr; | ||
547 | |||
545 | cpu_pda(cpu)->nodenumber = node; | 548 | cpu_pda(cpu)->nodenumber = node; |
546 | cpu_to_node(cpu) = node; | 549 | |
550 | if(cpu_to_node_map) | ||
551 | cpu_to_node_map[cpu] = node; | ||
552 | else if(per_cpu_offset(cpu)) | ||
553 | per_cpu(x86_cpu_to_node_map, cpu) = node; | ||
554 | else | ||
555 | Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); | ||
547 | } | 556 | } |
548 | 557 | ||
549 | unsigned long __init numa_free_all_bootmem(void) | 558 | unsigned long __init numa_free_all_bootmem(void) |
550 | { | 559 | { |
551 | int i; | ||
552 | unsigned long pages = 0; | 560 | unsigned long pages = 0; |
553 | for_each_online_node(i) { | 561 | int i; |
562 | |||
563 | for_each_online_node(i) | ||
554 | pages += free_all_bootmem_node(NODE_DATA(i)); | 564 | pages += free_all_bootmem_node(NODE_DATA(i)); |
555 | } | 565 | |
556 | return pages; | 566 | return pages; |
557 | } | 567 | } |
558 | 568 | ||
559 | void __init paging_init(void) | 569 | void __init paging_init(void) |
560 | { | 570 | { |
561 | int i; | ||
562 | unsigned long max_zone_pfns[MAX_NR_ZONES]; | 571 | unsigned long max_zone_pfns[MAX_NR_ZONES]; |
572 | |||
563 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); | 573 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); |
564 | max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; | 574 | max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; |
565 | max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; | 575 | max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; |
@@ -568,32 +578,27 @@ void __init paging_init(void) | |||
568 | sparse_memory_present_with_active_regions(MAX_NUMNODES); | 578 | sparse_memory_present_with_active_regions(MAX_NUMNODES); |
569 | sparse_init(); | 579 | sparse_init(); |
570 | 580 | ||
571 | for_each_online_node(i) { | ||
572 | setup_node_zones(i); | ||
573 | } | ||
574 | |||
575 | free_area_init_nodes(max_zone_pfns); | 581 | free_area_init_nodes(max_zone_pfns); |
576 | } | 582 | } |
577 | 583 | ||
578 | static __init int numa_setup(char *opt) | 584 | static __init int numa_setup(char *opt) |
579 | { | 585 | { |
580 | if (!opt) | 586 | if (!opt) |
581 | return -EINVAL; | 587 | return -EINVAL; |
582 | if (!strncmp(opt,"off",3)) | 588 | if (!strncmp(opt, "off", 3)) |
583 | numa_off = 1; | 589 | numa_off = 1; |
584 | #ifdef CONFIG_NUMA_EMU | 590 | #ifdef CONFIG_NUMA_EMU |
585 | if (!strncmp(opt, "fake=", 5)) | 591 | if (!strncmp(opt, "fake=", 5)) |
586 | cmdline = opt + 5; | 592 | cmdline = opt + 5; |
587 | #endif | 593 | #endif |
588 | #ifdef CONFIG_ACPI_NUMA | 594 | #ifdef CONFIG_ACPI_NUMA |
589 | if (!strncmp(opt,"noacpi",6)) | 595 | if (!strncmp(opt, "noacpi", 6)) |
590 | acpi_numa = -1; | 596 | acpi_numa = -1; |
591 | if (!strncmp(opt,"hotadd=", 7)) | 597 | if (!strncmp(opt, "hotadd=", 7)) |
592 | hotadd_percent = simple_strtoul(opt+7, NULL, 10); | 598 | hotadd_percent = simple_strtoul(opt+7, NULL, 10); |
593 | #endif | 599 | #endif |
594 | return 0; | 600 | return 0; |
595 | } | 601 | } |
596 | |||
597 | early_param("numa", numa_setup); | 602 | early_param("numa", numa_setup); |
598 | 603 | ||
599 | /* | 604 | /* |
@@ -611,38 +616,16 @@ early_param("numa", numa_setup); | |||
611 | void __init init_cpu_to_node(void) | 616 | void __init init_cpu_to_node(void) |
612 | { | 617 | { |
613 | int i; | 618 | int i; |
614 | for (i = 0; i < NR_CPUS; i++) { | 619 | |
615 | u8 apicid = x86_cpu_to_apicid_init[i]; | 620 | for (i = 0; i < NR_CPUS; i++) { |
621 | u16 apicid = x86_cpu_to_apicid_init[i]; | ||
622 | |||
616 | if (apicid == BAD_APICID) | 623 | if (apicid == BAD_APICID) |
617 | continue; | 624 | continue; |
618 | if (apicid_to_node[apicid] == NUMA_NO_NODE) | 625 | if (apicid_to_node[apicid] == NUMA_NO_NODE) |
619 | continue; | 626 | continue; |
620 | numa_set_node(i,apicid_to_node[apicid]); | 627 | numa_set_node(i, apicid_to_node[apicid]); |
621 | } | 628 | } |
622 | } | 629 | } |
623 | 630 | ||
624 | EXPORT_SYMBOL(cpu_to_node); | ||
625 | EXPORT_SYMBOL(node_to_cpumask); | ||
626 | EXPORT_SYMBOL(memnode); | ||
627 | EXPORT_SYMBOL(node_data); | ||
628 | 631 | ||
629 | #ifdef CONFIG_DISCONTIGMEM | ||
630 | /* | ||
631 | * Functions to convert PFNs from/to per node page addresses. | ||
632 | * These are out of line because they are quite big. | ||
633 | * They could be all tuned by pre caching more state. | ||
634 | * Should do that. | ||
635 | */ | ||
636 | |||
637 | int pfn_valid(unsigned long pfn) | ||
638 | { | ||
639 | unsigned nid; | ||
640 | if (pfn >= num_physpages) | ||
641 | return 0; | ||
642 | nid = pfn_to_nid(pfn); | ||
643 | if (nid == 0xff) | ||
644 | return 0; | ||
645 | return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid); | ||
646 | } | ||
647 | EXPORT_SYMBOL(pfn_valid); | ||
648 | #endif | ||