diff options
author | David Rientjes <rientjes@google.com> | 2007-05-02 13:27:09 -0400 |
---|---|---|
committer | Andi Kleen <andi@basil.nowhere.org> | 2007-05-02 13:27:09 -0400 |
commit | 8b8ca80e192b10eecc01fc44a2902510af86f73b (patch) | |
tree | b5be871c62f4fd3f079a9ce7fcce0c09e87a9a5f | |
parent | 8280c0c58e9762a9fe29d550a9db81410de77691 (diff) |
[PATCH] x86-64: configurable fake numa node sizes
Extends the numa=fake x86_64 command-line option to allow for configurable
node sizes. These nodes can be used in conjunction with cpusets for coarse
memory resource management.
The old command-line option is still supported:
numa=fake=32 gives 32 fake NUMA nodes, ignoring the NUMA setup of the
actual machine.
But now you may configure your system for the node sizes of your choice:
numa=fake=2*512,1024,2*256
gives two 512M nodes, one 1024M node, two 256M nodes, and
the rest of system memory to a sixth node.
The existing hash function is maintained to support the various node sizes
that are possible with this implementation.
Each node of the same size receives roughly the same number of available
pages, regardless of any reserved memory within its address range. The total
number of available pages on the system is calculated and divided by the number
of equal nodes to allocate. These nodes are then dynamically allocated and their
borders extended until such time as their number of available pages reaches
the required size.
Configurable node sizes are recommended when used in conjunction with cpusets
for memory control because they eliminate the overhead associated with scanning
the zonelists of many smaller full nodes on page_alloc().
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-rw-r--r-- | Documentation/x86_64/boot-options.txt | 8 | ||||
-rw-r--r-- | arch/x86_64/mm/numa.c | 261 | ||||
-rw-r--r-- | include/asm-x86_64/mmzone.h | 2 |
3 files changed, 161 insertions, 110 deletions
diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 85f51e5a749f..7500aad95f3c 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt | |||
@@ -149,7 +149,13 @@ NUMA | |||
149 | 149 | ||
150 | numa=noacpi Don't parse the SRAT table for NUMA setup | 150 | numa=noacpi Don't parse the SRAT table for NUMA setup |
151 | 151 | ||
152 | numa=fake=X Fake X nodes and ignore NUMA setup of the actual machine. | 152 | numa=fake=CMDLINE |
153 | If a number, fakes CMDLINE nodes and ignores NUMA setup of the | ||
154 | actual machine. Otherwise, system memory is configured | ||
155 | depending on the sizes and coefficients listed. For example: | ||
156 | numa=fake=2*512,1024,4*256 | ||
157 | gives two 512M nodes, a 1024M node, and four 256M nodes. The | ||
158 | remaining system RAM is allocated to an additional node. | ||
153 | 159 | ||
154 | numa=hotadd=percent | 160 | numa=hotadd=percent |
155 | Only allow hotadd memory to preallocate page structures upto | 161 | Only allow hotadd memory to preallocate page structures upto |
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 41b8fb069924..c55936bc6be6 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c | |||
@@ -273,125 +273,172 @@ void __init numa_init_array(void) | |||
273 | 273 | ||
274 | #ifdef CONFIG_NUMA_EMU | 274 | #ifdef CONFIG_NUMA_EMU |
275 | /* Numa emulation */ | 275 | /* Numa emulation */ |
276 | int numa_fake __initdata = 0; | 276 | #define E820_ADDR_HOLE_SIZE(start, end) \ |
277 | (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) << \ | ||
278 | PAGE_SHIFT) | ||
279 | char *cmdline __initdata; | ||
277 | 280 | ||
278 | /* | 281 | /* |
279 | * This function is used to find out if the start and end correspond to | 282 | * Setups up nid to range from addr to addr + size. If the end boundary is |
280 | * different zones. | 283 | * greater than max_addr, then max_addr is used instead. The return value is 0 |
284 | * if there is additional memory left for allocation past addr and -1 otherwise. | ||
285 | * addr is adjusted to be at the end of the node. | ||
281 | */ | 286 | */ |
282 | int zone_cross_over(unsigned long start, unsigned long end) | 287 | static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, |
288 | u64 size, u64 max_addr) | ||
283 | { | 289 | { |
284 | if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) && | 290 | int ret = 0; |
285 | (end >= (MAX_DMA32_PFN << PAGE_SHIFT))) | 291 | nodes[nid].start = *addr; |
286 | return 1; | 292 | *addr += size; |
287 | return 0; | 293 | if (*addr >= max_addr) { |
294 | *addr = max_addr; | ||
295 | ret = -1; | ||
296 | } | ||
297 | nodes[nid].end = *addr; | ||
298 | node_set_online(nid); | ||
299 | printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, | ||
300 | nodes[nid].start, nodes[nid].end, | ||
301 | (nodes[nid].end - nodes[nid].start) >> 20); | ||
302 | return ret; | ||
288 | } | 303 | } |
289 | 304 | ||
290 | static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) | 305 | /* |
306 | * Splits num_nodes nodes up equally starting at node_start. The return value | ||
307 | * is the number of nodes split up and addr is adjusted to be at the end of the | ||
308 | * last node allocated. | ||
309 | */ | ||
310 | static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, | ||
311 | u64 max_addr, int node_start, | ||
312 | int num_nodes) | ||
291 | { | 313 | { |
292 | int i, big; | 314 | unsigned int big; |
293 | struct bootnode nodes[MAX_NUMNODES]; | 315 | u64 size; |
294 | unsigned long sz, old_sz; | 316 | int i; |
295 | unsigned long hole_size; | ||
296 | unsigned long start, end; | ||
297 | unsigned long max_addr = (end_pfn << PAGE_SHIFT); | ||
298 | |||
299 | start = (start_pfn << PAGE_SHIFT); | ||
300 | hole_size = e820_hole_size(start, max_addr); | ||
301 | sz = (max_addr - start - hole_size) / numa_fake; | ||
302 | |||
303 | /* Kludge needed for the hash function */ | ||
304 | |||
305 | old_sz = sz; | ||
306 | /* | ||
307 | * Round down to the nearest FAKE_NODE_MIN_SIZE. | ||
308 | */ | ||
309 | sz &= FAKE_NODE_MIN_HASH_MASK; | ||
310 | 317 | ||
318 | if (num_nodes <= 0) | ||
319 | return -1; | ||
320 | if (num_nodes > MAX_NUMNODES) | ||
321 | num_nodes = MAX_NUMNODES; | ||
322 | size = (max_addr - *addr - E820_ADDR_HOLE_SIZE(*addr, max_addr)) / | ||
323 | num_nodes; | ||
311 | /* | 324 | /* |
312 | * We ensure that each node is at least 64MB big. Smaller than this | 325 | * Calculate the number of big nodes that can be allocated as a result |
313 | * size can cause VM hiccups. | 326 | * of consolidating the leftovers. |
314 | */ | 327 | */ |
315 | if (sz == 0) { | 328 | big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) / |
316 | printk(KERN_INFO "Not enough memory for %d nodes. Reducing " | 329 | FAKE_NODE_MIN_SIZE; |
317 | "the number of nodes\n", numa_fake); | 330 | |
318 | numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE; | 331 | /* Round down to nearest FAKE_NODE_MIN_SIZE. */ |
319 | printk(KERN_INFO "Number of fake nodes will be = %d\n", | 332 | size &= FAKE_NODE_MIN_HASH_MASK; |
320 | numa_fake); | 333 | if (!size) { |
321 | sz = FAKE_NODE_MIN_SIZE; | 334 | printk(KERN_ERR "Not enough memory for each node. " |
335 | "NUMA emulation disabled.\n"); | ||
336 | return -1; | ||
322 | } | 337 | } |
323 | /* | 338 | |
324 | * Find out how many nodes can get an extra NODE_MIN_SIZE granule. | 339 | for (i = node_start; i < num_nodes + node_start; i++) { |
325 | * This logic ensures the extra memory gets distributed among as many | 340 | u64 end = *addr + size; |
326 | * nodes as possible (as compared to one single node getting all that | ||
327 | * extra memory. | ||
328 | */ | ||
329 | big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE; | ||
330 | printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: " | ||
331 | "%d\n", | ||
332 | (sz >> 20), (hole_size >> 20), big); | ||
333 | memset(&nodes,0,sizeof(nodes)); | ||
334 | end = start; | ||
335 | for (i = 0; i < numa_fake; i++) { | ||
336 | /* | ||
337 | * In case we are not able to allocate enough memory for all | ||
338 | * the nodes, we reduce the number of fake nodes. | ||
339 | */ | ||
340 | if (end >= max_addr) { | ||
341 | numa_fake = i - 1; | ||
342 | break; | ||
343 | } | ||
344 | start = nodes[i].start = end; | ||
345 | /* | ||
346 | * Final node can have all the remaining memory. | ||
347 | */ | ||
348 | if (i == numa_fake-1) | ||
349 | sz = max_addr - start; | ||
350 | end = nodes[i].start + sz; | ||
351 | /* | ||
352 | * Fir "big" number of nodes get extra granule. | ||
353 | */ | ||
354 | if (i < big) | 341 | if (i < big) |
355 | end += FAKE_NODE_MIN_SIZE; | 342 | end += FAKE_NODE_MIN_SIZE; |
356 | /* | 343 | /* |
357 | * Iterate over the range to ensure that this node gets at | 344 | * The final node can have the remaining system RAM. Other |
358 | * least sz amount of RAM (excluding holes) | 345 | * nodes receive roughly the same amount of available pages. |
359 | */ | 346 | */ |
360 | while ((end - start - e820_hole_size(start, end)) < sz) { | 347 | if (i == num_nodes + node_start - 1) |
361 | end += FAKE_NODE_MIN_SIZE; | 348 | end = max_addr; |
362 | if (end >= max_addr) | 349 | else |
363 | break; | 350 | while (end - *addr - E820_ADDR_HOLE_SIZE(*addr, end) < |
351 | size) { | ||
352 | end += FAKE_NODE_MIN_SIZE; | ||
353 | if (end > max_addr) { | ||
354 | end = max_addr; | ||
355 | break; | ||
356 | } | ||
357 | } | ||
358 | if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) | ||
359 | break; | ||
360 | } | ||
361 | return i - node_start + 1; | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | * Sets up the system RAM area from start_pfn to end_pfn according to the | ||
366 | * numa=fake command-line option. | ||
367 | */ | ||
368 | static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) | ||
369 | { | ||
370 | struct bootnode nodes[MAX_NUMNODES]; | ||
371 | u64 addr = start_pfn << PAGE_SHIFT; | ||
372 | u64 max_addr = end_pfn << PAGE_SHIFT; | ||
373 | unsigned int coeff; | ||
374 | unsigned int num = 0; | ||
375 | int num_nodes = 0; | ||
376 | u64 size; | ||
377 | int i; | ||
378 | |||
379 | memset(&nodes, 0, sizeof(nodes)); | ||
380 | /* | ||
381 | * If the numa=fake command-line is just a single number N, split the | ||
382 | * system RAM into N fake nodes. | ||
383 | */ | ||
384 | if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { | ||
385 | num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, | ||
386 | simple_strtol(cmdline, NULL, 0)); | ||
387 | if (num_nodes < 0) | ||
388 | return num_nodes; | ||
389 | goto out; | ||
390 | } | ||
391 | |||
392 | /* Parse the command line. */ | ||
393 | for (coeff = 1; ; cmdline++) { | ||
394 | if (*cmdline && isdigit(*cmdline)) { | ||
395 | num = num * 10 + *cmdline - '0'; | ||
396 | continue; | ||
364 | } | 397 | } |
365 | /* | 398 | if (*cmdline == '*') |
366 | * Look at the next node to make sure there is some real memory | 399 | coeff = num; |
367 | * to map. Bad things happen when the only memory present | 400 | if (!*cmdline || *cmdline == ',') { |
368 | * in a zone on a fake node is IO hole. | 401 | /* |
369 | */ | 402 | * Round down to the nearest FAKE_NODE_MIN_SIZE. |
370 | while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) { | 403 | * Command-line coefficients are in megabytes. |
371 | if (zone_cross_over(start, end + sz)) { | 404 | */ |
372 | end = (MAX_DMA32_PFN << PAGE_SHIFT); | 405 | size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; |
373 | break; | 406 | if (size) { |
407 | for (i = 0; i < coeff; i++, num_nodes++) | ||
408 | if (setup_node_range(num_nodes, nodes, | ||
409 | &addr, size, max_addr) < 0) | ||
410 | goto done; | ||
411 | coeff = 1; | ||
374 | } | 412 | } |
375 | if (end >= max_addr) | ||
376 | break; | ||
377 | end += FAKE_NODE_MIN_SIZE; | ||
378 | } | 413 | } |
379 | if (end > max_addr) | 414 | if (!*cmdline) |
380 | end = max_addr; | 415 | break; |
381 | nodes[i].end = end; | 416 | num = 0; |
382 | printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", | 417 | } |
383 | i, | 418 | done: |
384 | nodes[i].start, nodes[i].end, | 419 | if (!num_nodes) |
385 | (nodes[i].end - nodes[i].start) >> 20); | 420 | return -1; |
386 | node_set_online(i); | 421 | /* Fill remainder of system RAM with a final node, if appropriate. */ |
387 | } | 422 | if (addr < max_addr) { |
388 | memnode_shift = compute_hash_shift(nodes, numa_fake); | 423 | setup_node_range(num_nodes, nodes, &addr, max_addr - addr, |
389 | if (memnode_shift < 0) { | 424 | max_addr); |
390 | memnode_shift = 0; | 425 | num_nodes++; |
391 | printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); | 426 | } |
392 | return -1; | 427 | out: |
393 | } | 428 | memnode_shift = compute_hash_shift(nodes, num_nodes); |
394 | for_each_online_node(i) { | 429 | if (memnode_shift < 0) { |
430 | memnode_shift = 0; | ||
431 | printk(KERN_ERR "No NUMA hash function found. NUMA emulation " | ||
432 | "disabled.\n"); | ||
433 | return -1; | ||
434 | } | ||
435 | |||
436 | /* | ||
437 | * We need to vacate all active ranges that may have been registered by | ||
438 | * SRAT. | ||
439 | */ | ||
440 | remove_all_active_ranges(); | ||
441 | for_each_online_node(i) { | ||
395 | e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, | 442 | e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, |
396 | nodes[i].end >> PAGE_SHIFT); | 443 | nodes[i].end >> PAGE_SHIFT); |
397 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | 444 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); |
@@ -399,14 +446,15 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) | |||
399 | numa_init_array(); | 446 | numa_init_array(); |
400 | return 0; | 447 | return 0; |
401 | } | 448 | } |
402 | #endif | 449 | #undef E820_ADDR_HOLE_SIZE |
450 | #endif /* CONFIG_NUMA_EMU */ | ||
403 | 451 | ||
404 | void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | 452 | void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) |
405 | { | 453 | { |
406 | int i; | 454 | int i; |
407 | 455 | ||
408 | #ifdef CONFIG_NUMA_EMU | 456 | #ifdef CONFIG_NUMA_EMU |
409 | if (numa_fake && !numa_emulation(start_pfn, end_pfn)) | 457 | if (cmdline && !numa_emulation(start_pfn, end_pfn)) |
410 | return; | 458 | return; |
411 | #endif | 459 | #endif |
412 | 460 | ||
@@ -486,11 +534,8 @@ static __init int numa_setup(char *opt) | |||
486 | if (!strncmp(opt,"off",3)) | 534 | if (!strncmp(opt,"off",3)) |
487 | numa_off = 1; | 535 | numa_off = 1; |
488 | #ifdef CONFIG_NUMA_EMU | 536 | #ifdef CONFIG_NUMA_EMU |
489 | if(!strncmp(opt, "fake=", 5)) { | 537 | if (!strncmp(opt, "fake=", 5)) |
490 | numa_fake = simple_strtoul(opt+5,NULL,0); ; | 538 | cmdline = opt + 5; |
491 | if (numa_fake >= MAX_NUMNODES) | ||
492 | numa_fake = MAX_NUMNODES; | ||
493 | } | ||
494 | #endif | 539 | #endif |
495 | #ifdef CONFIG_ACPI_NUMA | 540 | #ifdef CONFIG_ACPI_NUMA |
496 | if (!strncmp(opt,"noacpi",6)) | 541 | if (!strncmp(opt,"noacpi",6)) |
diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h index fb558fb1d211..19a89377b123 100644 --- a/include/asm-x86_64/mmzone.h +++ b/include/asm-x86_64/mmzone.h | |||
@@ -49,7 +49,7 @@ extern int pfn_valid(unsigned long pfn); | |||
49 | 49 | ||
50 | #ifdef CONFIG_NUMA_EMU | 50 | #ifdef CONFIG_NUMA_EMU |
51 | #define FAKE_NODE_MIN_SIZE (64*1024*1024) | 51 | #define FAKE_NODE_MIN_SIZE (64*1024*1024) |
52 | #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1ul)) | 52 | #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1uL)) |
53 | #endif | 53 | #endif |
54 | 54 | ||
55 | #endif | 55 | #endif |