diff options
-rw-r--r-- | Documentation/x86_64/boot-options.txt | 8 | ||||
-rw-r--r-- | arch/x86_64/mm/numa.c | 261 | ||||
-rw-r--r-- | include/asm-x86_64/mmzone.h | 2 |
3 files changed, 161 insertions, 110 deletions
diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 85f51e5a749f..7500aad95f3c 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt | |||
@@ -149,7 +149,13 @@ NUMA | |||
149 | 149 | ||
150 | numa=noacpi Don't parse the SRAT table for NUMA setup | 150 | numa=noacpi Don't parse the SRAT table for NUMA setup |
151 | 151 | ||
152 | numa=fake=X Fake X nodes and ignore NUMA setup of the actual machine. | 152 | numa=fake=CMDLINE |
153 | If a number, fakes CMDLINE nodes and ignores NUMA setup of the | ||
154 | actual machine. Otherwise, system memory is configured | ||
155 | depending on the sizes and coefficients listed. For example: | ||
156 | numa=fake=2*512,1024,4*256 | ||
157 | gives two 512M nodes, a 1024M node, and four 256M nodes. The | ||
158 | remaining system RAM is allocated to an additional node. | ||
153 | 159 | ||
154 | numa=hotadd=percent | 160 | numa=hotadd=percent |
155 | Only allow hotadd memory to preallocate page structures up to | 161 | Only allow hotadd memory to preallocate page structures up to |
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 41b8fb069924..c55936bc6be6 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c | |||
@@ -273,125 +273,172 @@ void __init numa_init_array(void) | |||
273 | 273 | ||
274 | #ifdef CONFIG_NUMA_EMU | 274 | #ifdef CONFIG_NUMA_EMU |
275 | /* Numa emulation */ | 275 | /* Numa emulation */ |
276 | int numa_fake __initdata = 0; | 276 | #define E820_ADDR_HOLE_SIZE(start, end) \ |
277 | (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) << \ | ||
278 | PAGE_SHIFT) | ||
279 | char *cmdline __initdata; | ||
277 | 280 | ||
278 | /* | 281 | /* |
279 | * This function is used to find out if the start and end correspond to | 282 | * Sets up nid to range from addr to addr + size. If the end boundary is |
280 | * different zones. | 283 | * greater than max_addr, then max_addr is used instead. The return value is 0 |
284 | * if there is additional memory left for allocation past addr and -1 otherwise. | ||
285 | * addr is adjusted to be at the end of the node. | ||
281 | */ | 286 | */ |
282 | int zone_cross_over(unsigned long start, unsigned long end) | 287 | static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, |
288 | u64 size, u64 max_addr) | ||
283 | { | 289 | { |
284 | if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) && | 290 | int ret = 0; |
285 | (end >= (MAX_DMA32_PFN << PAGE_SHIFT))) | 291 | nodes[nid].start = *addr; |
286 | return 1; | 292 | *addr += size; |
287 | return 0; | 293 | if (*addr >= max_addr) { |
294 | *addr = max_addr; | ||
295 | ret = -1; | ||
296 | } | ||
297 | nodes[nid].end = *addr; | ||
298 | node_set_online(nid); | ||
299 | printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, | ||
300 | nodes[nid].start, nodes[nid].end, | ||
301 | (nodes[nid].end - nodes[nid].start) >> 20); | ||
302 | return ret; | ||
288 | } | 303 | } |
289 | 304 | ||
290 | static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) | 305 | /* |
306 | * Splits num_nodes nodes up equally starting at node_start. The return value | ||
307 | * is the number of nodes split up and addr is adjusted to be at the end of the | ||
308 | * last node allocated. | ||
309 | */ | ||
310 | static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, | ||
311 | u64 max_addr, int node_start, | ||
312 | int num_nodes) | ||
291 | { | 313 | { |
292 | int i, big; | 314 | unsigned int big; |
293 | struct bootnode nodes[MAX_NUMNODES]; | 315 | u64 size; |
294 | unsigned long sz, old_sz; | 316 | int i; |
295 | unsigned long hole_size; | ||
296 | unsigned long start, end; | ||
297 | unsigned long max_addr = (end_pfn << PAGE_SHIFT); | ||
298 | |||
299 | start = (start_pfn << PAGE_SHIFT); | ||
300 | hole_size = e820_hole_size(start, max_addr); | ||
301 | sz = (max_addr - start - hole_size) / numa_fake; | ||
302 | |||
303 | /* Kludge needed for the hash function */ | ||
304 | |||
305 | old_sz = sz; | ||
306 | /* | ||
307 | * Round down to the nearest FAKE_NODE_MIN_SIZE. | ||
308 | */ | ||
309 | sz &= FAKE_NODE_MIN_HASH_MASK; | ||
310 | 317 | ||
318 | if (num_nodes <= 0) | ||
319 | return -1; | ||
320 | if (num_nodes > MAX_NUMNODES) | ||
321 | num_nodes = MAX_NUMNODES; | ||
322 | size = (max_addr - *addr - E820_ADDR_HOLE_SIZE(*addr, max_addr)) / | ||
323 | num_nodes; | ||
311 | /* | 324 | /* |
312 | * We ensure that each node is at least 64MB big. Smaller than this | 325 | * Calculate the number of big nodes that can be allocated as a result |
313 | * size can cause VM hiccups. | 326 | * of consolidating the leftovers. |
314 | */ | 327 | */ |
315 | if (sz == 0) { | 328 | big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) / |
316 | printk(KERN_INFO "Not enough memory for %d nodes. Reducing " | 329 | FAKE_NODE_MIN_SIZE; |
317 | "the number of nodes\n", numa_fake); | 330 | |
318 | numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE; | 331 | /* Round down to nearest FAKE_NODE_MIN_SIZE. */ |
319 | printk(KERN_INFO "Number of fake nodes will be = %d\n", | 332 | size &= FAKE_NODE_MIN_HASH_MASK; |
320 | numa_fake); | 333 | if (!size) { |
321 | sz = FAKE_NODE_MIN_SIZE; | 334 | printk(KERN_ERR "Not enough memory for each node. " |
335 | "NUMA emulation disabled.\n"); | ||
336 | return -1; | ||
322 | } | 337 | } |
323 | /* | 338 | |
324 | * Find out how many nodes can get an extra NODE_MIN_SIZE granule. | 339 | for (i = node_start; i < num_nodes + node_start; i++) { |
325 | * This logic ensures the extra memory gets distributed among as many | 340 | u64 end = *addr + size; |
326 | * nodes as possible (as compared to one single node getting all that | ||
327 | * extra memory). | ||
328 | */ | ||
329 | big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE; | ||
330 | printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: " | ||
331 | "%d\n", | ||
332 | (sz >> 20), (hole_size >> 20), big); | ||
333 | memset(&nodes,0,sizeof(nodes)); | ||
334 | end = start; | ||
335 | for (i = 0; i < numa_fake; i++) { | ||
336 | /* | ||
337 | * In case we are not able to allocate enough memory for all | ||
338 | * the nodes, we reduce the number of fake nodes. | ||
339 | */ | ||
340 | if (end >= max_addr) { | ||
341 | numa_fake = i - 1; | ||
342 | break; | ||
343 | } | ||
344 | start = nodes[i].start = end; | ||
345 | /* | ||
346 | * Final node can have all the remaining memory. | ||
347 | */ | ||
348 | if (i == numa_fake-1) | ||
349 | sz = max_addr - start; | ||
350 | end = nodes[i].start + sz; | ||
351 | /* | ||
352 | * Fir "big" number of nodes get extra granule. | ||
353 | */ | ||
354 | if (i < big) | 341 | if (i < big) |
355 | end += FAKE_NODE_MIN_SIZE; | 342 | end += FAKE_NODE_MIN_SIZE; |
356 | /* | 343 | /* |
357 | * Iterate over the range to ensure that this node gets at | 344 | * The final node can have the remaining system RAM. Other |
358 | * least sz amount of RAM (excluding holes) | 345 | * nodes receive roughly the same amount of available pages. |
359 | */ | 346 | */ |
360 | while ((end - start - e820_hole_size(start, end)) < sz) { | 347 | if (i == num_nodes + node_start - 1) |
361 | end += FAKE_NODE_MIN_SIZE; | 348 | end = max_addr; |
362 | if (end >= max_addr) | 349 | else |
363 | break; | 350 | while (end - *addr - E820_ADDR_HOLE_SIZE(*addr, end) < |
351 | size) { | ||
352 | end += FAKE_NODE_MIN_SIZE; | ||
353 | if (end > max_addr) { | ||
354 | end = max_addr; | ||
355 | break; | ||
356 | } | ||
357 | } | ||
358 | if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) | ||
359 | break; | ||
360 | } | ||
361 | return i - node_start + 1; | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | * Sets up the system RAM area from start_pfn to end_pfn according to the | ||
366 | * numa=fake command-line option. | ||
367 | */ | ||
368 | static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) | ||
369 | { | ||
370 | struct bootnode nodes[MAX_NUMNODES]; | ||
371 | u64 addr = start_pfn << PAGE_SHIFT; | ||
372 | u64 max_addr = end_pfn << PAGE_SHIFT; | ||
373 | unsigned int coeff; | ||
374 | unsigned int num = 0; | ||
375 | int num_nodes = 0; | ||
376 | u64 size; | ||
377 | int i; | ||
378 | |||
379 | memset(&nodes, 0, sizeof(nodes)); | ||
380 | /* | ||
381 | * If the numa=fake command-line is just a single number N, split the | ||
382 | * system RAM into N fake nodes. | ||
383 | */ | ||
384 | if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { | ||
385 | num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, | ||
386 | simple_strtol(cmdline, NULL, 0)); | ||
387 | if (num_nodes < 0) | ||
388 | return num_nodes; | ||
389 | goto out; | ||
390 | } | ||
391 | |||
392 | /* Parse the command line. */ | ||
393 | for (coeff = 1; ; cmdline++) { | ||
394 | if (*cmdline && isdigit(*cmdline)) { | ||
395 | num = num * 10 + *cmdline - '0'; | ||
396 | continue; | ||
364 | } | 397 | } |
365 | /* | 398 | if (*cmdline == '*') |
366 | * Look at the next node to make sure there is some real memory | 399 | coeff = num; |
367 | * to map. Bad things happen when the only memory present | 400 | if (!*cmdline || *cmdline == ',') { |
368 | * in a zone on a fake node is IO hole. | 401 | /* |
369 | */ | 402 | * Round down to the nearest FAKE_NODE_MIN_SIZE. |
370 | while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) { | 403 | * Command-line coefficients are in megabytes. |
371 | if (zone_cross_over(start, end + sz)) { | 404 | */ |
372 | end = (MAX_DMA32_PFN << PAGE_SHIFT); | 405 | size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; |
373 | break; | 406 | if (size) { |
407 | for (i = 0; i < coeff; i++, num_nodes++) | ||
408 | if (setup_node_range(num_nodes, nodes, | ||
409 | &addr, size, max_addr) < 0) | ||
410 | goto done; | ||
411 | coeff = 1; | ||
374 | } | 412 | } |
375 | if (end >= max_addr) | ||
376 | break; | ||
377 | end += FAKE_NODE_MIN_SIZE; | ||
378 | } | 413 | } |
379 | if (end > max_addr) | 414 | if (!*cmdline) |
380 | end = max_addr; | 415 | break; |
381 | nodes[i].end = end; | 416 | num = 0; |
382 | printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", | 417 | } |
383 | i, | 418 | done: |
384 | nodes[i].start, nodes[i].end, | 419 | if (!num_nodes) |
385 | (nodes[i].end - nodes[i].start) >> 20); | 420 | return -1; |
386 | node_set_online(i); | 421 | /* Fill remainder of system RAM with a final node, if appropriate. */ |
387 | } | 422 | if (addr < max_addr) { |
388 | memnode_shift = compute_hash_shift(nodes, numa_fake); | 423 | setup_node_range(num_nodes, nodes, &addr, max_addr - addr, |
389 | if (memnode_shift < 0) { | 424 | max_addr); |
390 | memnode_shift = 0; | 425 | num_nodes++; |
391 | printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); | 426 | } |
392 | return -1; | 427 | out: |
393 | } | 428 | memnode_shift = compute_hash_shift(nodes, num_nodes); |
394 | for_each_online_node(i) { | 429 | if (memnode_shift < 0) { |
430 | memnode_shift = 0; | ||
431 | printk(KERN_ERR "No NUMA hash function found. NUMA emulation " | ||
432 | "disabled.\n"); | ||
433 | return -1; | ||
434 | } | ||
435 | |||
436 | /* | ||
437 | * We need to vacate all active ranges that may have been registered by | ||
438 | * SRAT. | ||
439 | */ | ||
440 | remove_all_active_ranges(); | ||
441 | for_each_online_node(i) { | ||
395 | e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, | 442 | e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, |
396 | nodes[i].end >> PAGE_SHIFT); | 443 | nodes[i].end >> PAGE_SHIFT); |
397 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | 444 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); |
@@ -399,14 +446,15 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) | |||
399 | numa_init_array(); | 446 | numa_init_array(); |
400 | return 0; | 447 | return 0; |
401 | } | 448 | } |
402 | #endif | 449 | #undef E820_ADDR_HOLE_SIZE |
450 | #endif /* CONFIG_NUMA_EMU */ | ||
403 | 451 | ||
404 | void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | 452 | void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) |
405 | { | 453 | { |
406 | int i; | 454 | int i; |
407 | 455 | ||
408 | #ifdef CONFIG_NUMA_EMU | 456 | #ifdef CONFIG_NUMA_EMU |
409 | if (numa_fake && !numa_emulation(start_pfn, end_pfn)) | 457 | if (cmdline && !numa_emulation(start_pfn, end_pfn)) |
410 | return; | 458 | return; |
411 | #endif | 459 | #endif |
412 | 460 | ||
@@ -486,11 +534,8 @@ static __init int numa_setup(char *opt) | |||
486 | if (!strncmp(opt,"off",3)) | 534 | if (!strncmp(opt,"off",3)) |
487 | numa_off = 1; | 535 | numa_off = 1; |
488 | #ifdef CONFIG_NUMA_EMU | 536 | #ifdef CONFIG_NUMA_EMU |
489 | if(!strncmp(opt, "fake=", 5)) { | 537 | if (!strncmp(opt, "fake=", 5)) |
490 | numa_fake = simple_strtoul(opt+5,NULL,0); ; | 538 | cmdline = opt + 5; |
491 | if (numa_fake >= MAX_NUMNODES) | ||
492 | numa_fake = MAX_NUMNODES; | ||
493 | } | ||
494 | #endif | 539 | #endif |
495 | #ifdef CONFIG_ACPI_NUMA | 540 | #ifdef CONFIG_ACPI_NUMA |
496 | if (!strncmp(opt,"noacpi",6)) | 541 | if (!strncmp(opt,"noacpi",6)) |
diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h index fb558fb1d211..19a89377b123 100644 --- a/include/asm-x86_64/mmzone.h +++ b/include/asm-x86_64/mmzone.h | |||
@@ -49,7 +49,7 @@ extern int pfn_valid(unsigned long pfn); | |||
49 | 49 | ||
50 | #ifdef CONFIG_NUMA_EMU | 50 | #ifdef CONFIG_NUMA_EMU |
51 | #define FAKE_NODE_MIN_SIZE (64*1024*1024) | 51 | #define FAKE_NODE_MIN_SIZE (64*1024*1024) |
52 | #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1ul)) | 52 | #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1uL)) |
53 | #endif | 53 | #endif |
54 | 54 | ||
55 | #endif | 55 | #endif |