author		Rohit Seth <rohitseth@google.com>	2007-02-13 07:26:22 -0500
committer	Andi Kleen <andi@basil.nowhere.org>	2007-02-13 07:26:22 -0500
commit		53fee04f318222a3179ca5933d8bda82c1eef17a (patch)
tree		989d1c95b07d22473e596561216637711b4cd819
parent		3b3d5e1db66cd66148b2cebd2c38aff2a8df03d6 (diff)
[PATCH] x86-64: Fix fake numa for x86_64 machines with big IO hole
This patch resolves the issue of running with numa=fake=X on the kernel command line on x86_64 machines that have a big IO hole. While calculating the size of each node, we now look at the total hole size in that range. Previously there were nodes that contained only IO holes, causing kernel boot problems.

We now use NODE_MIN_SIZE (64MB) as the minimum amount of memory that any node must have. We reduce the number of allocated nodes if the number of nodes specified on the kernel command line would result in any node getting less memory than NODE_MIN_SIZE. This change allows the extra memory to be handed out in NODE_MIN_SIZE granules and distributed uniformly among as many nodes (called big nodes) as possible.

[akpm@osdl.org: build fix]
Signed-off-by: David Rientjes <reintjes@google.com>
Signed-off-by: Paul Menage <menage@google.com>
Signed-off-by: Rohit Seth <rohitseth@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
-rw-r--r--	arch/x86_64/kernel/e820.c	31
-rw-r--r--	arch/x86_64/mm/numa.c		110
-rw-r--r--	include/asm-x86_64/e820.h	1
-rw-r--r--	include/asm-x86_64/mmzone.h	5
4 files changed, 133 insertions, 14 deletions
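[Editor's note] As a rough illustration of the sizing logic described in the message above, the following user-space sketch reproduces the arithmetic: divide the non-hole memory by the requested node count, round down to the FAKE_NODE_MIN_SIZE granule, reduce the node count if that yields zero, and hand the remainder back out one granule per "big" node. The memory layout, hole size and numa=fake value are invented for the example; only FAKE_NODE_MIN_SIZE and FAKE_NODE_MIN_HASH_MASK mirror the macros the patch adds to mmzone.h.

/*
 * Illustrative user-space sketch only (not kernel code).  The 4GB address
 * space, 768MB hole and numa=fake=8 below are made-up values; hole_size
 * stands in for what e820_hole_size() would return.
 */
#include <stdio.h>

#define FAKE_NODE_MIN_SIZE	(64UL * 1024 * 1024)
#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))

int main(void)
{
	unsigned long start = 0;
	unsigned long max_addr = 4UL << 30;	/* assumes 64-bit longs */
	unsigned long hole_size = 768UL << 20;	/* pretend IO hole */
	int numa_fake = 8;			/* numa=fake=8 */

	/* Size each node by usable RAM, not by raw address range. */
	unsigned long sz = (max_addr - start - hole_size) / numa_fake;
	unsigned long old_sz = sz;

	/* Round down to the 64MB granule; shrink the node count if that hits zero. */
	sz &= FAKE_NODE_MIN_HASH_MASK;
	if (sz == 0) {
		numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE;
		sz = FAKE_NODE_MIN_SIZE;
	}

	/* The rounding remainder is handed back out, one granule per "big" node. */
	int big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;

	printf("node size %luMB, %d big nodes out of %d\n",
	       sz >> 20, big, numa_fake);
	return 0;
}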
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 9d67955bbc31..4651fd22b213 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -191,6 +191,37 @@ unsigned long __init e820_end_of_ram(void)
 }
 
 /*
+ * Find the hole size in the range.
+ */
+unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
+{
+	unsigned long ram = 0;
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+		unsigned long last, addr;
+
+		if (ei->type != E820_RAM ||
+		    ei->addr+ei->size <= start ||
+		    ei->addr >= end)
+			continue;
+
+		addr = round_up(ei->addr, PAGE_SIZE);
+		if (addr < start)
+			addr = start;
+
+		last = round_down(ei->addr + ei->size, PAGE_SIZE);
+		if (last >= end)
+			last = end;
+
+		if (last > addr)
+			ram += last - addr;
+	}
+	return ((end - start) - ram);
+}
+
+/*
  * Mark e820 reserved areas as busy for the resource manager.
  */
 void __init e820_reserve_resources(void)
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 1ec16ea97519..d3f747dd61d3 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -272,31 +272,113 @@ void __init numa_init_array(void)
 }
 
 #ifdef CONFIG_NUMA_EMU
+/* Numa emulation */
 int numa_fake __initdata = 0;
 
-/* Numa emulation */
+/*
+ * This function is used to find out if the start and end correspond to
+ * different zones.
+ */
+int zone_cross_over(unsigned long start, unsigned long end)
+{
+	if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) &&
+	    (end >= (MAX_DMA32_PFN << PAGE_SHIFT)))
+		return 1;
+	return 0;
+}
+
 static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
 {
-	int i;
+	int i, big;
 	struct bootnode nodes[MAX_NUMNODES];
-	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
+	unsigned long sz, old_sz;
+	unsigned long hole_size;
+	unsigned long start, end;
+	unsigned long max_addr = (end_pfn << PAGE_SHIFT);
+
+	start = (start_pfn << PAGE_SHIFT);
+	hole_size = e820_hole_size(start, max_addr);
+	sz = (max_addr - start - hole_size) / numa_fake;
 
 	/* Kludge needed for the hash function */
-	if (hweight64(sz) > 1) {
-		unsigned long x = 1;
-		while ((x << 1) < sz)
-			x <<= 1;
-		if (x < sz/2)
-			printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
-		sz = x;
-	}
 
+	old_sz = sz;
+	/*
+	 * Round down to the nearest FAKE_NODE_MIN_SIZE.
+	 */
+	sz &= FAKE_NODE_MIN_HASH_MASK;
+
+	/*
+	 * We ensure that each node is at least 64MB big.  Smaller than this
+	 * size can cause VM hiccups.
+	 */
+	if (sz == 0) {
+		printk(KERN_INFO "Not enough memory for %d nodes.  Reducing "
+				"the number of nodes\n", numa_fake);
+		numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE;
+		printk(KERN_INFO "Number of fake nodes will be = %d\n",
+				numa_fake);
+		sz = FAKE_NODE_MIN_SIZE;
+	}
+	/*
+	 * Find out how many nodes can get an extra NODE_MIN_SIZE granule.
+	 * This logic ensures the extra memory gets distributed among as many
+	 * nodes as possible (as compared to one single node getting all that
+	 * extra memory).
+	 */
+	big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;
+	printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: "
+			"%d\n",
+			(sz >> 20), (hole_size >> 20), big);
 	memset(&nodes,0,sizeof(nodes));
+	end = start;
 	for (i = 0; i < numa_fake; i++) {
-		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
+		/*
+		 * In case we are not able to allocate enough memory for all
+		 * the nodes, we reduce the number of fake nodes.
+		 */
+		if (end >= max_addr) {
+			numa_fake = i - 1;
+			break;
+		}
+		start = nodes[i].start = end;
+		/*
+		 * Final node can have all the remaining memory.
+		 */
 		if (i == numa_fake-1)
-			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
-		nodes[i].end = nodes[i].start + sz;
+			sz = max_addr - start;
+		end = nodes[i].start + sz;
+		/*
+		 * The first "big" nodes get an extra granule each.
+		 */
+		if (i < big)
+			end += FAKE_NODE_MIN_SIZE;
+		/*
+		 * Iterate over the range to ensure that this node gets at
+		 * least sz amount of RAM (excluding holes).
+		 */
+		while ((end - start - e820_hole_size(start, end)) < sz) {
+			end += FAKE_NODE_MIN_SIZE;
+			if (end >= max_addr)
+				break;
+		}
+		/*
+		 * Look at the next node to make sure there is some real memory
+		 * to map.  Bad things happen when the only memory present
+		 * in a zone on a fake node is IO hole.
+		 */
+		while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) {
+			if (zone_cross_over(start, end + sz)) {
+				end = (MAX_DMA32_PFN << PAGE_SHIFT);
+				break;
+			}
+			if (end >= max_addr)
+				break;
+			end += FAKE_NODE_MIN_SIZE;
+		}
+		if (end > max_addr)
+			end = max_addr;
+		nodes[i].end = end;
 		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
 		       i,
 		       nodes[i].start, nodes[i].end,
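[Editor's note] For reference, a small user-space sketch of the per-node loop added in numa_emulation() above: a node's end is pushed out in FAKE_NODE_MIN_SIZE steps until the node spans at least sz bytes of real RAM, with holes excluded. The toy RAM map (a 1GB hole between 1GB and 2GB), the node start and the target size are assumptions for illustration; hole_size() stands in for the kernel's e820_hole_size().

#include <stdio.h>

#define FAKE_NODE_MIN_SIZE (64UL * 1024 * 1024)

struct range { unsigned long start, end; };

/* Toy RAM map: 0-1GB and 2GB-4GB, i.e. a 1GB IO hole at 1GB-2GB. */
static const struct range ram[] = {
	{ 0, 1UL << 30 },
	{ 2UL << 30, 4UL << 30 },
};

/* Counterpart of e820_hole_size(): bytes in [start, end) not backed by RAM. */
static unsigned long hole_size(unsigned long start, unsigned long end)
{
	unsigned long present = 0;
	for (unsigned int i = 0; i < sizeof(ram) / sizeof(ram[0]); i++) {
		unsigned long lo = ram[i].start > start ? ram[i].start : start;
		unsigned long hi = ram[i].end < end ? ram[i].end : end;
		if (hi > lo)
			present += hi - lo;
	}
	return (end - start) - present;
}

int main(void)
{
	unsigned long sz = 768UL << 20;		/* desired RAM per node */
	unsigned long max_addr = 4UL << 30;
	unsigned long start = 512UL << 20;	/* this node starts at 512MB */
	unsigned long end = start + sz;

	/* Grow the node until it holds sz bytes of RAM despite the hole. */
	while (end - start - hole_size(start, end) < sz) {
		end += FAKE_NODE_MIN_SIZE;
		if (end >= max_addr)
			break;
	}
	printf("node: %luMB-%luMB (%luMB RAM)\n", start >> 20, end >> 20,
	       (end - start - hole_size(start, end)) >> 20);
	return 0;
}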
diff --git a/include/asm-x86_64/e820.h b/include/asm-x86_64/e820.h
index 855fb4a454b6..6216fa3f2802 100644
--- a/include/asm-x86_64/e820.h
+++ b/include/asm-x86_64/e820.h
@@ -46,6 +46,7 @@ extern void e820_mark_nosave_regions(void);
 extern void e820_print_map(char *who);
 extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
 extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
+extern unsigned long e820_hole_size(unsigned long start, unsigned long end);
 
 extern void e820_setup_gap(void);
 extern void e820_register_active_regions(int nid,
diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
index 39ef106986eb..fb558fb1d211 100644
--- a/include/asm-x86_64/mmzone.h
+++ b/include/asm-x86_64/mmzone.h
@@ -47,5 +47,10 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
 extern int pfn_valid(unsigned long pfn);
 #endif
 
+#ifdef CONFIG_NUMA_EMU
+#define FAKE_NODE_MIN_SIZE	(64*1024*1024)
+#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1ul))
+#endif
+
 #endif
 #endif