aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64')
-rw-r--r--arch/x86_64/Kconfig3
-rw-r--r--arch/x86_64/kernel/e820.c125
-rw-r--r--arch/x86_64/kernel/setup.c7
-rw-r--r--arch/x86_64/mm/init.c65
-rw-r--r--arch/x86_64/mm/k8topology.c3
-rw-r--r--arch/x86_64/mm/numa.c21
-rw-r--r--arch/x86_64/mm/srat.c11
7 files changed, 82 insertions, 153 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index efe249e7d6b3..326aff7a87ea 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -85,6 +85,9 @@ config ARCH_MAY_HAVE_PC_FDC
85 bool 85 bool
86 default y 86 default y
87 87
88config ARCH_POPULATES_NODE_MAP
89 def_bool y
90
88config DMI 91config DMI
89 bool 92 bool
90 default y 93 default y
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index c0af3828df45..b3f0908668ec 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -162,59 +162,14 @@ unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsi
162 return -1UL; 162 return -1UL;
163} 163}
164 164
165/*
166 * Free bootmem based on the e820 table for a node.
167 */
168void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
169{
170 int i;
171 for (i = 0; i < e820.nr_map; i++) {
172 struct e820entry *ei = &e820.map[i];
173 unsigned long last, addr;
174
175 if (ei->type != E820_RAM ||
176 ei->addr+ei->size <= start ||
177 ei->addr >= end)
178 continue;
179
180 addr = round_up(ei->addr, PAGE_SIZE);
181 if (addr < start)
182 addr = start;
183
184 last = round_down(ei->addr + ei->size, PAGE_SIZE);
185 if (last >= end)
186 last = end;
187
188 if (last > addr && last-addr >= PAGE_SIZE)
189 free_bootmem_node(pgdat, addr, last-addr);
190 }
191}
192
193/* 165/*
194 * Find the highest page frame number we have available 166 * Find the highest page frame number we have available
195 */ 167 */
196unsigned long __init e820_end_of_ram(void) 168unsigned long __init e820_end_of_ram(void)
197{ 169{
198 int i;
199 unsigned long end_pfn = 0; 170 unsigned long end_pfn = 0;
171 end_pfn = find_max_pfn_with_active_regions();
200 172
201 for (i = 0; i < e820.nr_map; i++) {
202 struct e820entry *ei = &e820.map[i];
203 unsigned long start, end;
204
205 start = round_up(ei->addr, PAGE_SIZE);
206 end = round_down(ei->addr + ei->size, PAGE_SIZE);
207 if (start >= end)
208 continue;
209 if (ei->type == E820_RAM) {
210 if (end > end_pfn<<PAGE_SHIFT)
211 end_pfn = end>>PAGE_SHIFT;
212 } else {
213 if (end > end_pfn_map<<PAGE_SHIFT)
214 end_pfn_map = end>>PAGE_SHIFT;
215 }
216 }
217
218 if (end_pfn > end_pfn_map) 173 if (end_pfn > end_pfn_map)
219 end_pfn_map = end_pfn; 174 end_pfn_map = end_pfn;
220 if (end_pfn_map > MAXMEM>>PAGE_SHIFT) 175 if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
@@ -224,43 +179,10 @@ unsigned long __init e820_end_of_ram(void)
224 if (end_pfn > end_pfn_map) 179 if (end_pfn > end_pfn_map)
225 end_pfn = end_pfn_map; 180 end_pfn = end_pfn_map;
226 181
182 printk("end_pfn_map = %lu\n", end_pfn_map);
227 return end_pfn; 183 return end_pfn;
228} 184}
229 185
230/*
231 * Compute how much memory is missing in a range.
232 * Unlike the other functions in this file the arguments are in page numbers.
233 */
234unsigned long __init
235e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
236{
237 unsigned long ram = 0;
238 unsigned long start = start_pfn << PAGE_SHIFT;
239 unsigned long end = end_pfn << PAGE_SHIFT;
240 int i;
241 for (i = 0; i < e820.nr_map; i++) {
242 struct e820entry *ei = &e820.map[i];
243 unsigned long last, addr;
244
245 if (ei->type != E820_RAM ||
246 ei->addr+ei->size <= start ||
247 ei->addr >= end)
248 continue;
249
250 addr = round_up(ei->addr, PAGE_SIZE);
251 if (addr < start)
252 addr = start;
253
254 last = round_down(ei->addr + ei->size, PAGE_SIZE);
255 if (last >= end)
256 last = end;
257
258 if (last > addr)
259 ram += last - addr;
260 }
261 return ((end - start) - ram) >> PAGE_SHIFT;
262}
263
264/* 186/*
265 * Mark e820 reserved areas as busy for the resource manager. 187 * Mark e820 reserved areas as busy for the resource manager.
266 */ 188 */
@@ -342,6 +264,49 @@ void __init e820_mark_nosave_regions(void)
342 } 264 }
343} 265}
344 266
267/* Walk the e820 map and register active regions within a node */
268void __init
269e820_register_active_regions(int nid, unsigned long start_pfn,
270 unsigned long end_pfn)
271{
272 int i;
273 unsigned long ei_startpfn, ei_endpfn;
274 for (i = 0; i < e820.nr_map; i++) {
275 struct e820entry *ei = &e820.map[i];
276 ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
277 ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE)
278 >> PAGE_SHIFT;
279
280 /* Skip map entries smaller than a page */
281 if (ei_startpfn > ei_endpfn)
282 continue;
283
284 /* Check if end_pfn_map should be updated */
285 if (ei->type != E820_RAM && ei_endpfn > end_pfn_map)
286 end_pfn_map = ei_endpfn;
287
288 /* Skip if map is outside the node */
289 if (ei->type != E820_RAM ||
290 ei_endpfn <= start_pfn ||
291 ei_startpfn >= end_pfn)
292 continue;
293
294 /* Check for overlaps */
295 if (ei_startpfn < start_pfn)
296 ei_startpfn = start_pfn;
297 if (ei_endpfn > end_pfn)
298 ei_endpfn = end_pfn;
299
300 /* Obey end_user_pfn to save on memmap */
301 if (ei_startpfn >= end_user_pfn)
302 continue;
303 if (ei_endpfn > end_user_pfn)
304 ei_endpfn = end_user_pfn;
305
306 add_active_range(nid, ei_startpfn, ei_endpfn);
307 }
308}
309
345/* 310/*
346 * Add a memory region to the kernel e820 map. 311 * Add a memory region to the kernel e820 map.
347 */ 312 */
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index f98e48cae6da..0b00bb2ea576 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -292,7 +292,8 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
292 if (bootmap == -1L) 292 if (bootmap == -1L)
293 panic("Cannot find bootmem map of size %ld\n",bootmap_size); 293 panic("Cannot find bootmem map of size %ld\n",bootmap_size);
294 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); 294 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
295 e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT); 295 e820_register_active_regions(0, start_pfn, end_pfn);
296 free_bootmem_with_active_regions(0, end_pfn);
296 reserve_bootmem(bootmap, bootmap_size); 297 reserve_bootmem(bootmap, bootmap_size);
297} 298}
298#endif 299#endif
@@ -384,6 +385,7 @@ void __init setup_arch(char **cmdline_p)
384 385
385 finish_e820_parsing(); 386 finish_e820_parsing();
386 387
388 e820_register_active_regions(0, 0, -1UL);
387 /* 389 /*
388 * partially used pages are not usable - thus 390 * partially used pages are not usable - thus
389 * we are rounding upwards: 391 * we are rounding upwards:
@@ -414,6 +416,9 @@ void __init setup_arch(char **cmdline_p)
414 max_pfn = end_pfn; 416 max_pfn = end_pfn;
415 high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1; 417 high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
416 418
419 /* Remove active ranges so rediscovery with NUMA-awareness happens */
420 remove_all_active_ranges();
421
417#ifdef CONFIG_ACPI_NUMA 422#ifdef CONFIG_ACPI_NUMA
418 /* 423 /*
419 * Parse SRAT to discover nodes. 424 * Parse SRAT to discover nodes.
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 1e4669fa5734..47928399e38a 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -403,69 +403,15 @@ void __cpuinit zap_low_mappings(int cpu)
403 __flush_tlb_all(); 403 __flush_tlb_all();
404} 404}
405 405
406/* Compute zone sizes for the DMA and DMA32 zones in a node. */
407__init void
408size_zones(unsigned long *z, unsigned long *h,
409 unsigned long start_pfn, unsigned long end_pfn)
410{
411 int i;
412 unsigned long w;
413
414 for (i = 0; i < MAX_NR_ZONES; i++)
415 z[i] = 0;
416
417 if (start_pfn < MAX_DMA_PFN)
418 z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
419 if (start_pfn < MAX_DMA32_PFN) {
420 unsigned long dma32_pfn = MAX_DMA32_PFN;
421 if (dma32_pfn > end_pfn)
422 dma32_pfn = end_pfn;
423 z[ZONE_DMA32] = dma32_pfn - start_pfn;
424 }
425 z[ZONE_NORMAL] = end_pfn - start_pfn;
426
427 /* Remove lower zones from higher ones. */
428 w = 0;
429 for (i = 0; i < MAX_NR_ZONES; i++) {
430 if (z[i])
431 z[i] -= w;
432 w += z[i];
433 }
434
435 /* Compute holes */
436 w = start_pfn;
437 for (i = 0; i < MAX_NR_ZONES; i++) {
438 unsigned long s = w;
439 w += z[i];
440 h[i] = e820_hole_size(s, w);
441 }
442
443 /* Add the space pace needed for mem_map to the holes too. */
444 for (i = 0; i < MAX_NR_ZONES; i++)
445 h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
446
447 /* The 16MB DMA zone has the kernel and other misc mappings.
448 Account them too */
449 if (h[ZONE_DMA]) {
450 h[ZONE_DMA] += dma_reserve;
451 if (h[ZONE_DMA] >= z[ZONE_DMA]) {
452 printk(KERN_WARNING
453 "Kernel too large and filling up ZONE_DMA?\n");
454 h[ZONE_DMA] = z[ZONE_DMA];
455 }
456 }
457}
458
459#ifndef CONFIG_NUMA 406#ifndef CONFIG_NUMA
460void __init paging_init(void) 407void __init paging_init(void)
461{ 408{
462 unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES]; 409 unsigned long max_zone_pfns[MAX_NR_ZONES] = {MAX_DMA_PFN,
463 410 MAX_DMA32_PFN,
411 end_pfn};
464 memory_present(0, 0, end_pfn); 412 memory_present(0, 0, end_pfn);
465 sparse_init(); 413 sparse_init();
466 size_zones(zones, holes, 0, end_pfn); 414 free_area_init_nodes(max_zone_pfns);
467 free_area_init_node(0, NODE_DATA(0), zones,
468 __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
469} 415}
470#endif 416#endif
471 417
@@ -608,7 +554,8 @@ void __init mem_init(void)
608#else 554#else
609 totalram_pages = free_all_bootmem(); 555 totalram_pages = free_all_bootmem();
610#endif 556#endif
611 reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn); 557 reservedpages = end_pfn - totalram_pages -
558 absent_pages_in_range(0, end_pfn);
612 559
613 after_bootmem = 1; 560 after_bootmem = 1;
614 561
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
index 5cf594f9230d..b5b8dba28b4e 100644
--- a/arch/x86_64/mm/k8topology.c
+++ b/arch/x86_64/mm/k8topology.c
@@ -149,6 +149,9 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
149 149
150 nodes[nodeid].start = base; 150 nodes[nodeid].start = base;
151 nodes[nodeid].end = limit; 151 nodes[nodeid].end = limit;
152 e820_register_active_regions(nodeid,
153 nodes[nodeid].start >> PAGE_SHIFT,
154 nodes[nodeid].end >> PAGE_SHIFT);
152 155
153 prevbase = base; 156 prevbase = base;
154 157
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 322bf45fc36a..829a008bd39b 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -161,7 +161,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
161 bootmap_start >> PAGE_SHIFT, 161 bootmap_start >> PAGE_SHIFT,
162 start_pfn, end_pfn); 162 start_pfn, end_pfn);
163 163
164 e820_bootmem_free(NODE_DATA(nodeid), start, end); 164 free_bootmem_with_active_regions(nodeid, end);
165 165
166 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 166 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
167 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); 167 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
@@ -175,13 +175,11 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
175void __init setup_node_zones(int nodeid) 175void __init setup_node_zones(int nodeid)
176{ 176{
177 unsigned long start_pfn, end_pfn, memmapsize, limit; 177 unsigned long start_pfn, end_pfn, memmapsize, limit;
178 unsigned long zones[MAX_NR_ZONES];
179 unsigned long holes[MAX_NR_ZONES];
180 178
181 start_pfn = node_start_pfn(nodeid); 179 start_pfn = node_start_pfn(nodeid);
182 end_pfn = node_end_pfn(nodeid); 180 end_pfn = node_end_pfn(nodeid);
183 181
184 Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n", 182 Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
185 nodeid, start_pfn, end_pfn); 183 nodeid, start_pfn, end_pfn);
186 184
187 /* Try to allocate mem_map at end to not fill up precious <4GB 185 /* Try to allocate mem_map at end to not fill up precious <4GB
@@ -195,10 +193,6 @@ void __init setup_node_zones(int nodeid)
195 round_down(limit - memmapsize, PAGE_SIZE), 193 round_down(limit - memmapsize, PAGE_SIZE),
196 limit); 194 limit);
197#endif 195#endif
198
199 size_zones(zones, holes, start_pfn, end_pfn);
200 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
201 start_pfn, holes);
202} 196}
203 197
204void __init numa_init_array(void) 198void __init numa_init_array(void)
@@ -259,8 +253,11 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
259 printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); 253 printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
260 return -1; 254 return -1;
261 } 255 }
262 for_each_online_node(i) 256 for_each_online_node(i) {
257 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
258 nodes[i].end >> PAGE_SHIFT);
263 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 259 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
260 }
264 numa_init_array(); 261 numa_init_array();
265 return 0; 262 return 0;
266} 263}
@@ -299,6 +296,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
299 for (i = 0; i < NR_CPUS; i++) 296 for (i = 0; i < NR_CPUS; i++)
300 numa_set_node(i, 0); 297 numa_set_node(i, 0);
301 node_to_cpumask[0] = cpumask_of_cpu(0); 298 node_to_cpumask[0] = cpumask_of_cpu(0);
299 e820_register_active_regions(0, start_pfn, end_pfn);
302 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); 300 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
303} 301}
304 302
@@ -340,12 +338,17 @@ static void __init arch_sparse_init(void)
340void __init paging_init(void) 338void __init paging_init(void)
341{ 339{
342 int i; 340 int i;
341 unsigned long max_zone_pfns[MAX_NR_ZONES] = { MAX_DMA_PFN,
342 MAX_DMA32_PFN,
343 end_pfn};
343 344
344 arch_sparse_init(); 345 arch_sparse_init();
345 346
346 for_each_online_node(i) { 347 for_each_online_node(i) {
347 setup_node_zones(i); 348 setup_node_zones(i);
348 } 349 }
350
351 free_area_init_nodes(max_zone_pfns);
349} 352}
350 353
351static __init int numa_setup(char *opt) 354static __init int numa_setup(char *opt)
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index ca10701e7a90..7b50bb1caabe 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -93,6 +93,7 @@ static __init void bad_srat(void)
93 apicid_to_node[i] = NUMA_NO_NODE; 93 apicid_to_node[i] = NUMA_NO_NODE;
94 for (i = 0; i < MAX_NUMNODES; i++) 94 for (i = 0; i < MAX_NUMNODES; i++)
95 nodes_add[i].start = nodes[i].end = 0; 95 nodes_add[i].start = nodes[i].end = 0;
96 remove_all_active_ranges();
96} 97}
97 98
98static __init inline int srat_disabled(void) 99static __init inline int srat_disabled(void)
@@ -175,7 +176,7 @@ static int hotadd_enough_memory(struct bootnode *nd)
175 176
176 if (mem < 0) 177 if (mem < 0)
177 return 0; 178 return 0;
178 allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE; 179 allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
179 allowed = (allowed / 100) * hotadd_percent; 180 allowed = (allowed / 100) * hotadd_percent;
180 if (allocated + mem > allowed) { 181 if (allocated + mem > allowed) {
181 unsigned long range; 182 unsigned long range;
@@ -225,7 +226,7 @@ static int reserve_hotadd(int node, unsigned long start, unsigned long end)
225 } 226 }
226 227
227 /* This check might be a bit too strict, but I'm keeping it for now. */ 228 /* This check might be a bit too strict, but I'm keeping it for now. */
228 if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) { 229 if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
229 printk(KERN_ERR "SRAT: Hotplug area has existing memory\n"); 230 printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
230 return -1; 231 return -1;
231 } 232 }
@@ -319,6 +320,8 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
319 320
320 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, 321 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
321 nd->start, nd->end); 322 nd->start, nd->end);
323 e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
324 nd->end >> PAGE_SHIFT);
322 325
323#ifdef RESERVE_HOTADD 326#ifdef RESERVE_HOTADD
324 if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) { 327 if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
@@ -343,13 +346,13 @@ static int nodes_cover_memory(void)
343 unsigned long s = nodes[i].start >> PAGE_SHIFT; 346 unsigned long s = nodes[i].start >> PAGE_SHIFT;
344 unsigned long e = nodes[i].end >> PAGE_SHIFT; 347 unsigned long e = nodes[i].end >> PAGE_SHIFT;
345 pxmram += e - s; 348 pxmram += e - s;
346 pxmram -= e820_hole_size(s, e); 349 pxmram -= absent_pages_in_range(s, e);
347 pxmram -= nodes_add[i].end - nodes_add[i].start; 350 pxmram -= nodes_add[i].end - nodes_add[i].start;
348 if ((long)pxmram < 0) 351 if ((long)pxmram < 0)
349 pxmram = 0; 352 pxmram = 0;
350 } 353 }
351 354
352 e820ram = end_pfn - e820_hole_size(0, end_pfn); 355 e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
353 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ 356 /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
354 if ((long)(e820ram - pxmram) >= 1*1024*1024) { 357 if ((long)(e820ram - pxmram) >= 1*1024*1024) {
355 printk(KERN_ERR 358 printk(KERN_ERR