author     Benjamin Herrenschmidt <benh@kernel.crashing.org>   2009-12-09 01:14:38 -0500
committer  Benjamin Herrenschmidt <benh@kernel.crashing.org>   2009-12-09 01:14:38 -0500
commit     bcd6acd51f3d4d1ada201e9bc5c40a31d6d80c71 (patch)
tree       2f6dffd2d3e4dd67355a224de7e7a960335a92fd /arch/x86/mm/numa_64.c
parent     11c34c7deaeeebcee342cbc35e1bb2a6711b2431 (diff)
parent     3ff6a468b45b5dfeb0e903e56f4eb27d34b2437c (diff)
Merge commit 'origin/master' into next

Conflicts:
	include/linux/kvm.h
Diffstat (limited to 'arch/x86/mm/numa_64.c')
-rw-r--r--   arch/x86/mm/numa_64.c   252
1 file changed, 217 insertions(+), 35 deletions(-)
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 459913beac7..83bbc70d11b 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -239,8 +239,14 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 	bootmap = early_node_mem(nodeid, bootmap_start, end,
 				 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
 	if (bootmap == NULL) {
-		if (nodedata_phys < start || nodedata_phys >= end)
-			free_bootmem(nodedata_phys, pgdat_size);
+		if (nodedata_phys < start || nodedata_phys >= end) {
+			/*
+			 * only need to free it if it is from other node
+			 * bootmem
+			 */
+			if (nid != nodeid)
+				free_bootmem(nodedata_phys, pgdat_size);
+		}
 		node_data[nodeid] = NULL;
 		return;
 	}
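Note: the nid tested by the new guard is computed earlier in setup_node_bootmem(), outside this hunk, as the node the node_data allocation actually landed on (via phys_to_nid() in this version of the file -- an assumption, since that line is not shown here). A self-contained C sketch of the guard, with the lookup stubbed out:

    #include <stdio.h>
    #include <stdint.h>

    /* Stub for the kernel helper that maps a physical address to its node;
     * toy layout: node 0 owns everything below 64 MB. */
    static int phys_to_nid(uint64_t phys)
    {
            return phys < (64ULL << 20) ? 0 : 1;
    }

    static void maybe_free_pgdat(uint64_t nodedata_phys, uint64_t pgdat_size,
                                 uint64_t start, uint64_t end, int nodeid)
    {
            int nid = phys_to_nid(nodedata_phys);

            if (nodedata_phys < start || nodedata_phys >= end) {
                    /* only free it if it came from another node's bootmem */
                    if (nid != nodeid)
                            printf("free_bootmem(%#llx, %llu)\n",
                                   (unsigned long long)nodedata_phys,
                                   (unsigned long long)pgdat_size);
            }
    }

    int main(void)
    {
            /* node 1's pgdat ended up in node 0's memory: gets freed */
            maybe_free_pgdat(0x1000000, 16384, 64ULL << 20, 128ULL << 20, 1);
            return 0;
    }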
@@ -306,8 +312,71 @@ void __init numa_init_array(void)
 
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
+static struct bootnode nodes[MAX_NUMNODES] __initdata;
+static struct bootnode physnodes[MAX_NUMNODES] __initdata;
 static char *cmdline __initdata;
 
+static int __init setup_physnodes(unsigned long start, unsigned long end,
+					int acpi, int k8)
+{
+	int nr_nodes = 0;
+	int ret = 0;
+	int i;
+
+#ifdef CONFIG_ACPI_NUMA
+	if (acpi)
+		nr_nodes = acpi_get_nodes(physnodes);
+#endif
+#ifdef CONFIG_K8_NUMA
+	if (k8)
+		nr_nodes = k8_get_nodes(physnodes);
+#endif
+	/*
+	 * Basic sanity checking on the physical node map: there may be errors
+	 * if the SRAT or K8 incorrectly reported the topology or the mem=
+	 * kernel parameter is used.
+	 */
+	for (i = 0; i < nr_nodes; i++) {
+		if (physnodes[i].start == physnodes[i].end)
+			continue;
+		if (physnodes[i].start > end) {
+			physnodes[i].end = physnodes[i].start;
+			continue;
+		}
+		if (physnodes[i].end < start) {
+			physnodes[i].start = physnodes[i].end;
+			continue;
+		}
+		if (physnodes[i].start < start)
+			physnodes[i].start = start;
+		if (physnodes[i].end > end)
+			physnodes[i].end = end;
+	}
+
+	/*
+	 * Remove all nodes that have no memory or were truncated because of the
+	 * limited address range.
+	 */
+	for (i = 0; i < nr_nodes; i++) {
+		if (physnodes[i].start == physnodes[i].end)
+			continue;
+		physnodes[ret].start = physnodes[i].start;
+		physnodes[ret].end = physnodes[i].end;
+		ret++;
+	}
+
+	/*
+	 * If no physical topology was detected, a single node is faked to cover
+	 * the entire address space.
+	 */
+	if (!ret) {
+		physnodes[ret].start = start;
+		physnodes[ret].end = end;
+		ret = 1;
+	}
+	return ret;
+}
+
 /*
  * Setups up nid to range from addr to addr + size.  If the end
  * boundary is greater than max_addr, then max_addr is used instead.
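Note: the sanity pass in setup_physnodes() clamps every reported physical node to [start, end) and the second pass compacts away entries left empty (start == end). For example, with start = 0 and end = 4G, a node reported as [3G, 6G) is clamped to [3G, 4G), while one at [5G, 7G) collapses and is dropped. A self-contained harness mirroring the same logic (hypothetical, not kernel code):

    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t u64;
    struct bootnode { u64 start, end; };

    /* Mirrors the clamp-and-compact passes of setup_physnodes() above. */
    static int clamp_physnodes(struct bootnode *pn, int nr, u64 start, u64 end)
    {
            int i, ret = 0;

            for (i = 0; i < nr; i++) {
                    if (pn[i].start == pn[i].end)
                            continue;
                    if (pn[i].start > end) {
                            pn[i].end = pn[i].start;   /* entirely above range */
                            continue;
                    }
                    if (pn[i].end < start) {
                            pn[i].start = pn[i].end;   /* entirely below range */
                            continue;
                    }
                    if (pn[i].start < start)
                            pn[i].start = start;
                    if (pn[i].end > end)
                            pn[i].end = end;
            }
            for (i = 0; i < nr; i++) {                 /* drop emptied entries */
                    if (pn[i].start == pn[i].end)
                            continue;
                    pn[ret].start = pn[i].start;
                    pn[ret].end = pn[i].end;
                    ret++;
            }
            return ret;
    }

    int main(void)
    {
            u64 G = 1ULL << 30;
            struct bootnode pn[] = { {0, 2*G}, {3*G, 6*G}, {5*G, 7*G} };
            int i, n = clamp_physnodes(pn, 3, 0, 4*G);

            for (i = 0; i < n; i++)   /* prints [0, 2048 MB) and [3072, 4096 MB) */
                    printf("node %d: [%llu MB, %llu MB)\n", i,
                           (unsigned long long)(pn[i].start >> 20),
                           (unsigned long long)(pn[i].end >> 20));
            return 0;
    }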
@@ -315,11 +384,9 @@ static char *cmdline __initdata;
  * allocation past addr and -1 otherwise.  addr is adjusted to be at
  * the end of the node.
  */
-static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
-				   u64 size, u64 max_addr)
+static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
 {
 	int ret = 0;
-
 	nodes[nid].start = *addr;
 	*addr += size;
 	if (*addr >= max_addr) {
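Note: after this change setup_node_range() reaches the file-scope nodes[] array directly instead of taking it as a parameter. A minimal standalone restatement of its contract (a stub: the real function also accounts for e820 holes and prints the faked range):

    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t u64;
    struct bootnode { u64 start, end; };
    static struct bootnode nodes[8];

    /* Carve [*addr, *addr + size) out as node nid, clamping at max_addr;
     * *addr advances to the end of the node.  Returns 0 while memory
     * remains past *addr, -1 once max_addr is hit. */
    static int setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
    {
            int ret = 0;

            nodes[nid].start = *addr;
            *addr += size;
            if (*addr >= max_addr) {
                    *addr = max_addr;
                    ret = -1;
            }
            nodes[nid].end = *addr;
            return ret;
    }

    int main(void)
    {
            u64 addr = 0, max_addr = 2ULL << 30;

            /* two 1 GB fake nodes exactly fill [0, 2G): prints "0 -1" */
            printf("%d ", setup_node_range(0, &addr, 1ULL << 30, max_addr));
            printf("%d\n", setup_node_range(1, &addr, 1ULL << 30, max_addr));
            return 0;
    }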
@@ -335,12 +402,111 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
 }
 
 /*
+ * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
+ * to max_addr.  The return value is the number of nodes allocated.
+ */
+static int __init split_nodes_interleave(u64 addr, u64 max_addr,
+					 int nr_phys_nodes, int nr_nodes)
+{
+	nodemask_t physnode_mask = NODE_MASK_NONE;
+	u64 size;
+	int big;
+	int ret = 0;
+	int i;
+
+	if (nr_nodes <= 0)
+		return -1;
+	if (nr_nodes > MAX_NUMNODES) {
+		pr_info("numa=fake=%d too large, reducing to %d\n",
+			nr_nodes, MAX_NUMNODES);
+		nr_nodes = MAX_NUMNODES;
+	}
+
+	size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
+	/*
+	 * Calculate the number of big nodes that can be allocated as a result
+	 * of consolidating the remainder.
+	 */
+	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
+		FAKE_NODE_MIN_SIZE;
+
+	size &= FAKE_NODE_MIN_HASH_MASK;
+	if (!size) {
+		pr_err("Not enough memory for each node.  "
+			"NUMA emulation disabled.\n");
+		return -1;
+	}
+
+	for (i = 0; i < nr_phys_nodes; i++)
+		if (physnodes[i].start != physnodes[i].end)
+			node_set(i, physnode_mask);
+
+	/*
+	 * Continue to fill physical nodes with fake nodes until there is no
+	 * memory left on any of them.
+	 */
+	while (nodes_weight(physnode_mask)) {
+		for_each_node_mask(i, physnode_mask) {
+			u64 end = physnodes[i].start + size;
+			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+
+			if (ret < big)
+				end += FAKE_NODE_MIN_SIZE;
+
+			/*
+			 * Continue to add memory to this fake node if its
+			 * non-reserved memory is less than the per-node size.
+			 */
+			while (end - physnodes[i].start -
+				e820_hole_size(physnodes[i].start, end) < size) {
+				end += FAKE_NODE_MIN_SIZE;
+				if (end > physnodes[i].end) {
+					end = physnodes[i].end;
+					break;
+				}
+			}
+
+			/*
+			 * If there won't be at least FAKE_NODE_MIN_SIZE of
+			 * non-reserved memory in ZONE_DMA32 for the next node,
+			 * this one must extend to the boundary.
+			 */
+			if (end < dma32_end && dma32_end - end -
+			    e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+				end = dma32_end;
+
+			/*
+			 * If there won't be enough non-reserved memory for the
+			 * next node, this one must extend to the end of the
+			 * physical node.
+			 */
+			if (physnodes[i].end - end -
+			    e820_hole_size(end, physnodes[i].end) < size)
+				end = physnodes[i].end;
+
+			/*
+			 * Avoid allocating more nodes than requested, which can
+			 * happen as a result of rounding down each node's size
+			 * to FAKE_NODE_MIN_SIZE.
+			 */
+			if (nodes_weight(physnode_mask) + ret >= nr_nodes)
+				end = physnodes[i].end;
+
+			if (setup_node_range(ret++, &physnodes[i].start,
+						end - physnodes[i].start,
+						physnodes[i].end) < 0)
+				node_clear(i, physnode_mask);
+		}
+	}
+	return ret;
+}
+
+/*
  * Splits num_nodes nodes up equally starting at node_start.  The return value
  * is the number of nodes split up and addr is adjusted to be at the end of the
  * last node allocated.
  */
-static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
-				      u64 max_addr, int node_start,
+static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
 				      int num_nodes)
 {
 	unsigned int big;
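Note: a worked example of the "big node" remainder consolidation in split_nodes_interleave(), assuming FAKE_NODE_MIN_SIZE is the 64 MB granule from this kernel's asm/numa_64.h (and ignoring e820 holes):

    #include <stdio.h>
    #include <stdint.h>

    #define FAKE_NODE_MIN_SIZE      (64ULL << 20)  /* assumed 64 MB granule */
    #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1ULL))

    int main(void)
    {
            uint64_t usable = 4032ULL << 20;        /* 4 GB minus a 64 MB hole */
            uint64_t nr_nodes = 2;
            uint64_t size = usable / nr_nodes;      /* 2016 MB per node */
            /* each node's sub-64MB remainder (32 MB here), pooled: */
            uint64_t big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
                            FAKE_NODE_MIN_SIZE;

            size &= FAKE_NODE_MIN_HASH_MASK;        /* round down to 1984 MB */
            /* prints "size=1984 MB big=1": one node grows by 64 MB, so the
             * split is 2048 MB + 1984 MB = 4032 MB and nothing is lost */
            printf("size=%llu MB big=%llu\n",
                   (unsigned long long)(size >> 20),
                   (unsigned long long)big);
            return 0;
    }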
@@ -388,7 +554,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
 				break;
 			}
 		}
-		if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
+		if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
 			break;
 	}
 	return i - node_start + 1;
@@ -399,12 +565,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
  * always assigned to a final node and can be asymmetric.  Returns the number of
  * nodes split.
  */
-static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
-				      u64 max_addr, int node_start, u64 size)
+static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
+				      u64 size)
 {
 	int i = node_start;
 	size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
-	while (!setup_node_range(i++, nodes, addr, size, max_addr))
+	while (!setup_node_range(i++, addr, size, max_addr))
 		;
 	return i - node_start;
 }
@@ -413,15 +579,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
  * Sets up the system RAM area from start_pfn to last_pfn according to the
  * numa=fake command-line option.
  */
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
-
-static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
+static int __init numa_emulation(unsigned long start_pfn,
+				 unsigned long last_pfn, int acpi, int k8)
 {
 	u64 size, addr = start_pfn << PAGE_SHIFT;
 	u64 max_addr = last_pfn << PAGE_SHIFT;
 	int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
+	int num_phys_nodes;
 
-	memset(&nodes, 0, sizeof(nodes));
+	num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
 	/*
 	 * If the numa=fake command-line is just a single number N, split the
 	 * system RAM into N fake nodes.
@@ -429,7 +595,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
 	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
 		long n = simple_strtol(cmdline, NULL, 0);
 
-		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
+		num_nodes = split_nodes_interleave(addr, max_addr,
+							num_phys_nodes, n);
 		if (num_nodes < 0)
 			return num_nodes;
 		goto out;
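Note: the branch above covers only the plain numeric form of the option; the '*' and ',' cases below parse the size-list grammar. The accepted shapes, per Documentation/x86/x86_64/boot-options.txt of this era (paraphrased here, so treat the exact wording as an assumption):

    numa=fake=8                      eight equal nodes, interleaved over the
                                     physical nodes as of this patch
    numa=fake=2*512,1024             two 512M nodes, then one 1024M node
    numa=fake=2*512,1024,4*256,*128  ...plus four 256M nodes, the remainder
                                     split into 128M chunks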
@@ -456,8 +623,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
 			size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
 			if (size)
 				for (i = 0; i < coeff; i++, num_nodes++)
-					if (setup_node_range(num_nodes, nodes,
-						&addr, size, max_addr) < 0)
+					if (setup_node_range(num_nodes, &addr,
+							size, max_addr) < 0)
 						goto done;
 			if (!*cmdline)
 				break;
@@ -473,7 +640,7 @@ done:
 	if (addr < max_addr) {
 		if (coeff_flag && coeff < 0) {
 			/* Split remaining nodes into num-sized chunks */
-			num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
+			num_nodes += split_nodes_by_size(&addr, max_addr,
 							 num_nodes, num);
 			goto out;
 		}
@@ -482,7 +649,7 @@ done:
 			/* Split remaining nodes into coeff chunks */
 			if (coeff <= 0)
 				break;
-			num_nodes += split_nodes_equally(nodes, &addr, max_addr,
+			num_nodes += split_nodes_equally(&addr, max_addr,
 							 num_nodes, coeff);
 			break;
 		case ',':
@@ -490,8 +657,8 @@ done:
 			break;
 		default:
 			/* Give one final node */
-			setup_node_range(num_nodes, nodes, &addr,
-					 max_addr - addr, max_addr);
+			setup_node_range(num_nodes, &addr, max_addr - addr,
+					 max_addr);
 			num_nodes++;
 		}
 	}
@@ -505,14 +672,10 @@ out:
 	}
 
 	/*
-	 * We need to vacate all active ranges that may have been registered by
-	 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
-	 * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
+	 * We need to vacate all active ranges that may have been registered for
+	 * the e820 memory map.
 	 */
 	remove_all_active_ranges();
-#ifdef CONFIG_ACPI_NUMA
-	acpi_numa = -1;
-#endif
 	for_each_node_mask(i, node_possible_map) {
 		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
 						nodes[i].end >> PAGE_SHIFT);
@@ -524,7 +687,8 @@ out:
 }
 #endif /* CONFIG_NUMA_EMU */
 
-void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
+				int acpi, int k8)
 {
 	int i;
 
@@ -532,23 +696,22 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
 	nodes_clear(node_online_map);
 
 #ifdef CONFIG_NUMA_EMU
-	if (cmdline && !numa_emulation(start_pfn, last_pfn))
+	if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
 #endif
 
 #ifdef CONFIG_ACPI_NUMA
-	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
-					  last_pfn << PAGE_SHIFT))
+	if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
+						  last_pfn << PAGE_SHIFT))
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
 #endif
 
 #ifdef CONFIG_K8_NUMA
-	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
-					last_pfn<<PAGE_SHIFT))
+	if (!numa_off && k8 && !k8_scan_nodes())
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
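Note: with the new acpi and k8 parameters, initmem_init() degrades through up to four stages, each clearing the node maps before the next attempt; a failed SRAT parse is now reported by the caller through 'acpi' rather than masked by poking acpi_numa from inside the emulation code. A toy restatement of the cascade (the stubbed stages stand in for numa_emulation(), acpi_scan_nodes() and k8_scan_nodes(), returning 0 on success):

    #include <stdio.h>

    static int stage_fails(const char *name)
    {
            printf("%s failed, clearing node maps, falling back\n", name);
            return -1;
    }

    static void initmem_init_sketch(int have_cmdline, int numa_off,
                                    int acpi, int k8)
    {
            if (have_cmdline && !stage_fails("numa=fake emulation"))
                    return;
            if (!numa_off && acpi && !stage_fails("ACPI SRAT scan"))
                    return;
            if (!numa_off && k8 && !stage_fails("K8 northbridge scan"))
                    return;
            printf("using single dummy node\n");
    }

    int main(void)
    {
            initmem_init_sketch(0, 0, 1, 1); /* SRAT ok, K8 present, no fake */
            return 0;
    }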
@@ -601,6 +764,25 @@ static __init int numa_setup(char *opt)
 early_param("numa", numa_setup);
 
 #ifdef CONFIG_NUMA
+
+static __init int find_near_online_node(int node)
+{
+	int n, val;
+	int min_val = INT_MAX;
+	int best_node = -1;
+
+	for_each_online_node(n) {
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	return best_node;
+}
+
 /*
  * Setup early cpu_to_node.
  *
@@ -632,7 +814,7 @@ void __init init_cpu_to_node(void)
 		if (node == NUMA_NO_NODE)
 			continue;
 		if (!node_online(node))
-			continue;
+			node = find_near_online_node(node);
 		numa_set_node(cpu, node);
 	}
 }
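Note: init_cpu_to_node() used to skip CPUs whose firmware-reported node is offline (common under NUMA emulation, where fewer fake nodes may exist than the firmware described); such CPUs are now mapped to the nearest online node by node_distance(). A standalone illustration of the selection loop, with a hypothetical 4-node SLIT-style distance table:

    #include <stdio.h>
    #include <limits.h>

    /* Hypothetical distance table; node 3 is offline in this example. */
    static const int distance[4][4] = {
            { 10, 20, 20, 30 },
            { 20, 10, 30, 20 },
            { 20, 30, 10, 20 },
            { 30, 20, 20, 10 },
    };
    static const int online[4] = { 1, 1, 1, 0 };

    /* Same selection rule as find_near_online_node() above. */
    static int find_near_online_node(int node)
    {
            int n, best_node = -1, min_val = INT_MAX;

            for (n = 0; n < 4; n++) {
                    if (!online[n])
                            continue;
                    if (distance[node][n] < min_val) {
                            min_val = distance[node][n];
                            best_node = n;
                    }
            }
            return best_node;
    }

    int main(void)
    {
            /* a CPU on offline node 3 lands on node 1 (distance 20) */
            printf("node 3 -> node %d\n", find_near_online_node(3));
            return 0;
    }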