aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: David Rientjes <rientjes@google.com> 2007-05-02 13:27:09 -0400
committer: Andi Kleen <andi@basil.nowhere.org> 2007-05-02 13:27:09 -0400
commit: 8b8ca80e192b10eecc01fc44a2902510af86f73b (patch)
tree: b5be871c62f4fd3f079a9ce7fcce0c09e87a9a5f
parent: 8280c0c58e9762a9fe29d550a9db81410de77691 (diff)
[PATCH] x86-64: configurable fake numa node sizes
Extends the numa=fake x86_64 command-line option to allow for configurable node sizes. These nodes can be used in conjunction with cpusets for coarse memory resource management.

The old command-line option is still supported:

  numa=fake=32 gives 32 fake NUMA nodes, ignoring the NUMA setup of the actual machine.

But now you may configure your system for the node sizes of your choice:

  numa=fake=2*512,1024,2*256 gives two 512M nodes, one 1024M node, two 256M nodes, and the rest of system memory to a sixth node.

The existing hash function is maintained to support the various node sizes that are possible with this implementation.

Each node of the same size receives roughly the same amount of available pages, regardless of any reserved memory within its address range. The total number of available pages on the system is calculated and divided by the number of equal nodes to allocate. These nodes are then dynamically allocated and their borders extended until such time as their number of available pages reaches the required size.

Configurable node sizes are recommended when used in conjunction with cpusets for memory control because it eliminates the overhead associated with scanning the zonelists of many smaller full nodes on page_alloc().

Cc: Andi Kleen <ak@suse.de>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-rw-r--r-- Documentation/x86_64/boot-options.txt 8
-rw-r--r-- arch/x86_64/mm/numa.c 261
-rw-r--r-- include/asm-x86_64/mmzone.h 2
3 files changed, 161 insertions, 110 deletions
diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt
index 85f51e5a749f..7500aad95f3c 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -149,7 +149,13 @@ NUMA
149 149
150 numa=noacpi Don't parse the SRAT table for NUMA setup 150 numa=noacpi Don't parse the SRAT table for NUMA setup
151 151
152 numa=fake=X Fake X nodes and ignore NUMA setup of the actual machine. 152 numa=fake=CMDLINE
153 If a number, fakes CMDLINE nodes and ignores NUMA setup of the
154 actual machine. Otherwise, system memory is configured
155 depending on the sizes and coefficients listed. For example:
156 numa=fake=2*512,1024,4*256
157 gives two 512M nodes, a 1024M node, and four 256M nodes. The
158 remaining system RAM is allocated to an additional node.
153 159
154 numa=hotadd=percent 160 numa=hotadd=percent
155 Only allow hotadd memory to preallocate page structures upto 161 Only allow hotadd memory to preallocate page structures upto
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 41b8fb069924..c55936bc6be6 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -273,125 +273,172 @@ void __init numa_init_array(void)
273 273
274#ifdef CONFIG_NUMA_EMU 274#ifdef CONFIG_NUMA_EMU
275/* Numa emulation */ 275/* Numa emulation */
276int numa_fake __initdata = 0; 276#define E820_ADDR_HOLE_SIZE(start, end) \
277 (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) << \
278 PAGE_SHIFT)
279char *cmdline __initdata;
277 280
278/* 281/*
279 * This function is used to find out if the start and end correspond to 282 * Sets up nid to range from addr to addr + size. If the end boundary is
280 * different zones. 283 * greater than max_addr, then max_addr is used instead. The return value is 0
284 * if there is additional memory left for allocation past addr and -1 otherwise.
285 * addr is adjusted to be at the end of the node.
281 */ 286 */
282int zone_cross_over(unsigned long start, unsigned long end) 287static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
288 u64 size, u64 max_addr)
283{ 289{
284 if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) && 290 int ret = 0;
285 (end >= (MAX_DMA32_PFN << PAGE_SHIFT))) 291 nodes[nid].start = *addr;
286 return 1; 292 *addr += size;
287 return 0; 293 if (*addr >= max_addr) {
294 *addr = max_addr;
295 ret = -1;
296 }
297 nodes[nid].end = *addr;
298 node_set_online(nid);
299 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
300 nodes[nid].start, nodes[nid].end,
301 (nodes[nid].end - nodes[nid].start) >> 20);
302 return ret;
288} 303}
289 304
290static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 305/*
306 * Splits num_nodes nodes up equally starting at node_start. The return value
307 * is the number of nodes split up and addr is adjusted to be at the end of the
308 * last node allocated.
309 */
310static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
311 u64 max_addr, int node_start,
312 int num_nodes)
291{ 313{
292 int i, big; 314 unsigned int big;
293 struct bootnode nodes[MAX_NUMNODES]; 315 u64 size;
294 unsigned long sz, old_sz; 316 int i;
295 unsigned long hole_size;
296 unsigned long start, end;
297 unsigned long max_addr = (end_pfn << PAGE_SHIFT);
298
299 start = (start_pfn << PAGE_SHIFT);
300 hole_size = e820_hole_size(start, max_addr);
301 sz = (max_addr - start - hole_size) / numa_fake;
302
303 /* Kludge needed for the hash function */
304
305 old_sz = sz;
306 /*
307 * Round down to the nearest FAKE_NODE_MIN_SIZE.
308 */
309 sz &= FAKE_NODE_MIN_HASH_MASK;
310 317
318 if (num_nodes <= 0)
319 return -1;
320 if (num_nodes > MAX_NUMNODES)
321 num_nodes = MAX_NUMNODES;
322 size = (max_addr - *addr - E820_ADDR_HOLE_SIZE(*addr, max_addr)) /
323 num_nodes;
311 /* 324 /*
312 * We ensure that each node is at least 64MB big. Smaller than this 325 * Calculate the number of big nodes that can be allocated as a result
313 * size can cause VM hiccups. 326 * of consolidating the leftovers.
314 */ 327 */
315 if (sz == 0) { 328 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
316 printk(KERN_INFO "Not enough memory for %d nodes. Reducing " 329 FAKE_NODE_MIN_SIZE;
317 "the number of nodes\n", numa_fake); 330
318 numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE; 331 /* Round down to nearest FAKE_NODE_MIN_SIZE. */
319 printk(KERN_INFO "Number of fake nodes will be = %d\n", 332 size &= FAKE_NODE_MIN_HASH_MASK;
320 numa_fake); 333 if (!size) {
321 sz = FAKE_NODE_MIN_SIZE; 334 printk(KERN_ERR "Not enough memory for each node. "
335 "NUMA emulation disabled.\n");
336 return -1;
322 } 337 }
323 /* 338
324 * Find out how many nodes can get an extra NODE_MIN_SIZE granule. 339 for (i = node_start; i < num_nodes + node_start; i++) {
325 * This logic ensures the extra memory gets distributed among as many 340 u64 end = *addr + size;
326 * nodes as possible (as compared to one single node getting all that
327 * extra memory.
328 */
329 big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;
330 printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: "
331 "%d\n",
332 (sz >> 20), (hole_size >> 20), big);
333 memset(&nodes,0,sizeof(nodes));
334 end = start;
335 for (i = 0; i < numa_fake; i++) {
336 /*
337 * In case we are not able to allocate enough memory for all
338 * the nodes, we reduce the number of fake nodes.
339 */
340 if (end >= max_addr) {
341 numa_fake = i - 1;
342 break;
343 }
344 start = nodes[i].start = end;
345 /*
346 * Final node can have all the remaining memory.
347 */
348 if (i == numa_fake-1)
349 sz = max_addr - start;
350 end = nodes[i].start + sz;
351 /*
352 * Fir "big" number of nodes get extra granule.
353 */
354 if (i < big) 341 if (i < big)
355 end += FAKE_NODE_MIN_SIZE; 342 end += FAKE_NODE_MIN_SIZE;
356 /* 343 /*
357 * Iterate over the range to ensure that this node gets at 344 * The final node can have the remaining system RAM. Other
358 * least sz amount of RAM (excluding holes) 345 * nodes receive roughly the same amount of available pages.
359 */ 346 */
360 while ((end - start - e820_hole_size(start, end)) < sz) { 347 if (i == num_nodes + node_start - 1)
361 end += FAKE_NODE_MIN_SIZE; 348 end = max_addr;
362 if (end >= max_addr) 349 else
363 break; 350 while (end - *addr - E820_ADDR_HOLE_SIZE(*addr, end) <
351 size) {
352 end += FAKE_NODE_MIN_SIZE;
353 if (end > max_addr) {
354 end = max_addr;
355 break;
356 }
357 }
358 if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
359 break;
360 }
361 return i - node_start + 1;
362}
363
364/*
365 * Sets up the system RAM area from start_pfn to end_pfn according to the
366 * numa=fake command-line option.
367 */
368static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
369{
370 struct bootnode nodes[MAX_NUMNODES];
371 u64 addr = start_pfn << PAGE_SHIFT;
372 u64 max_addr = end_pfn << PAGE_SHIFT;
373 unsigned int coeff;
374 unsigned int num = 0;
375 int num_nodes = 0;
376 u64 size;
377 int i;
378
379 memset(&nodes, 0, sizeof(nodes));
380 /*
381 * If the numa=fake command-line is just a single number N, split the
382 * system RAM into N fake nodes.
383 */
384 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
385 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
386 simple_strtol(cmdline, NULL, 0));
387 if (num_nodes < 0)
388 return num_nodes;
389 goto out;
390 }
391
392 /* Parse the command line. */
393 for (coeff = 1; ; cmdline++) {
394 if (*cmdline && isdigit(*cmdline)) {
395 num = num * 10 + *cmdline - '0';
396 continue;
364 } 397 }
365 /* 398 if (*cmdline == '*')
366 * Look at the next node to make sure there is some real memory 399 coeff = num;
367 * to map. Bad things happen when the only memory present 400 if (!*cmdline || *cmdline == ',') {
368 * in a zone on a fake node is IO hole. 401 /*
369 */ 402 * Round down to the nearest FAKE_NODE_MIN_SIZE.
370 while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) { 403 * Command-line coefficients are in megabytes.
371 if (zone_cross_over(start, end + sz)) { 404 */
372 end = (MAX_DMA32_PFN << PAGE_SHIFT); 405 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
373 break; 406 if (size) {
407 for (i = 0; i < coeff; i++, num_nodes++)
408 if (setup_node_range(num_nodes, nodes,
409 &addr, size, max_addr) < 0)
410 goto done;
411 coeff = 1;
374 } 412 }
375 if (end >= max_addr)
376 break;
377 end += FAKE_NODE_MIN_SIZE;
378 } 413 }
379 if (end > max_addr) 414 if (!*cmdline)
380 end = max_addr; 415 break;
381 nodes[i].end = end; 416 num = 0;
382 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", 417 }
383 i, 418done:
384 nodes[i].start, nodes[i].end, 419 if (!num_nodes)
385 (nodes[i].end - nodes[i].start) >> 20); 420 return -1;
386 node_set_online(i); 421 /* Fill remainder of system RAM with a final node, if appropriate. */
387 } 422 if (addr < max_addr) {
388 memnode_shift = compute_hash_shift(nodes, numa_fake); 423 setup_node_range(num_nodes, nodes, &addr, max_addr - addr,
389 if (memnode_shift < 0) { 424 max_addr);
390 memnode_shift = 0; 425 num_nodes++;
391 printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); 426 }
392 return -1; 427out:
393 } 428 memnode_shift = compute_hash_shift(nodes, num_nodes);
394 for_each_online_node(i) { 429 if (memnode_shift < 0) {
430 memnode_shift = 0;
431 printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
432 "disabled.\n");
433 return -1;
434 }
435
436 /*
437 * We need to vacate all active ranges that may have been registered by
438 * SRAT.
439 */
440 remove_all_active_ranges();
441 for_each_online_node(i) {
395 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, 442 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
396 nodes[i].end >> PAGE_SHIFT); 443 nodes[i].end >> PAGE_SHIFT);
397 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 444 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
@@ -399,14 +446,15 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
399 numa_init_array(); 446 numa_init_array();
400 return 0; 447 return 0;
401} 448}
402#endif 449#undef E820_ADDR_HOLE_SIZE
450#endif /* CONFIG_NUMA_EMU */
403 451
404void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) 452void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
405{ 453{
406 int i; 454 int i;
407 455
408#ifdef CONFIG_NUMA_EMU 456#ifdef CONFIG_NUMA_EMU
409 if (numa_fake && !numa_emulation(start_pfn, end_pfn)) 457 if (cmdline && !numa_emulation(start_pfn, end_pfn))
410 return; 458 return;
411#endif 459#endif
412 460
@@ -486,11 +534,8 @@ static __init int numa_setup(char *opt)
486 if (!strncmp(opt,"off",3)) 534 if (!strncmp(opt,"off",3))
487 numa_off = 1; 535 numa_off = 1;
488#ifdef CONFIG_NUMA_EMU 536#ifdef CONFIG_NUMA_EMU
489 if(!strncmp(opt, "fake=", 5)) { 537 if (!strncmp(opt, "fake=", 5))
490 numa_fake = simple_strtoul(opt+5,NULL,0); ; 538 cmdline = opt + 5;
491 if (numa_fake >= MAX_NUMNODES)
492 numa_fake = MAX_NUMNODES;
493 }
494#endif 539#endif
495#ifdef CONFIG_ACPI_NUMA 540#ifdef CONFIG_ACPI_NUMA
496 if (!strncmp(opt,"noacpi",6)) 541 if (!strncmp(opt,"noacpi",6))
diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
index fb558fb1d211..19a89377b123 100644
--- a/include/asm-x86_64/mmzone.h
+++ b/include/asm-x86_64/mmzone.h
@@ -49,7 +49,7 @@ extern int pfn_valid(unsigned long pfn);
49 49
50#ifdef CONFIG_NUMA_EMU 50#ifdef CONFIG_NUMA_EMU
51#define FAKE_NODE_MIN_SIZE (64*1024*1024) 51#define FAKE_NODE_MIN_SIZE (64*1024*1024)
52#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1ul)) 52#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1uL))
53#endif 53#endif
54 54
55#endif 55#endif