 arch/x86/mm/numa_64.c | 173 ++++++++++++++++++++++++------------------------
 1 file changed, 86 insertions(+), 87 deletions(-)
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index e9919c4d1573..9736204337b8 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -541,7 +541,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
 static struct bootnode physnodes[MAX_NUMNODES] __initdata;
 
 static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
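Both before and after this change, the emulation code describes memory in two shapes: per-node [start, end) bootnode ranges and a flat list of nid-tagged blocks. A rough standalone sketch of the two shapes, trimmed to the fields this diff touches (array sizes are illustrative, not the real config-dependent values):

#include <stdint.h>

typedef uint64_t u64;

#define MAX_NUMNODES	64			/* illustrative; config-dependent */
#define NR_NODE_MEMBLKS	(MAX_NUMNODES * 2)	/* illustrative sizing */

/* One contiguous physical range belonging to one node. */
struct bootnode {
	u64 start;
	u64 end;
};

/* One memory block: a range tagged with the node that owns it. */
struct numa_memblk {
	u64 start;
	u64 end;
	int nid;
};

/* The flat block list the emulation code now builds directly. */
struct numa_meminfo {
	int nr_blks;
	struct numa_memblk blk[NR_NODE_MEMBLKS];
};

The thrust of the patch follows from the line deleted above: the node-indexed nodes[] array stops being the emulation code's primary state, and a numa_meminfo block list, the structure numa_register_memblks() already consumes, is built instead.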
@@ -626,9 +625,24 @@ static int __init setup_physnodes(unsigned long start, unsigned long end)
 	return ret;
 }
 
-static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
+static void __init fake_physnodes(int acpi, int amd,
+				  const struct numa_meminfo *ei)
 {
-	int i;
+	static struct bootnode nodes[MAX_NUMNODES] __initdata;
+	int i, nr_nodes = 0;
+
+	for (i = 0; i < ei->nr_blks; i++) {
+		int nid = ei->blk[i].nid;
+
+		if (nodes[nid].start == nodes[nid].end) {
+			nodes[nid].start = ei->blk[i].start;
+			nodes[nid].end = ei->blk[i].end;
+			nr_nodes++;
+		} else {
+			nodes[nid].start = min(ei->blk[i].start, nodes[nid].start);
+			nodes[nid].end = max(ei->blk[i].end, nodes[nid].end);
+		}
+	}
 
 	BUG_ON(acpi && amd);
 #ifdef CONFIG_ACPI_NUMA
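fake_physnodes() now derives both the per-node bounds and the node count from the meminfo rather than taking nr_nodes as a parameter. A standalone model of the aggregation loop just added, reusing the sketch types above; as with the kernel's __initdata static, nodes[] is assumed zero-initialized so that start == end marks an empty slot:

/* Collapse a block list into one bounding range per node, counting how
 * many distinct nodes have memory.  Mirrors the loop added to
 * fake_physnodes(). */
static int count_and_bound_nodes(const struct numa_meminfo *ei,
				 struct bootnode *nodes)
{
	int i, nr_nodes = 0;

	for (i = 0; i < ei->nr_blks; i++) {
		int nid = ei->blk[i].nid;

		if (nodes[nid].start == nodes[nid].end) {
			/* first block seen for this nid */
			nodes[nid].start = ei->blk[i].start;
			nodes[nid].end = ei->blk[i].end;
			nr_nodes++;
		} else {
			/* widen the existing bounds */
			if (ei->blk[i].start < nodes[nid].start)
				nodes[nid].start = ei->blk[i].start;
			if (ei->blk[i].end > nodes[nid].end)
				nodes[nid].end = ei->blk[i].end;
		}
	}
	return nr_nodes;
}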
@@ -645,45 +659,44 @@ static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
 }
 
 /*
- * Setups up nid to range from addr to addr + size.  If the end
- * boundary is greater than max_addr, then max_addr is used instead.
- * The return value is 0 if there is additional memory left for
- * allocation past addr and -1 otherwise.  addr is adjusted to be at
- * the end of the node.
+ * Sets up nid to range from @start to @end.  The return value is -errno if
+ * something went wrong, 0 otherwise.
  */
-static int __init setup_node_range(int nid, int physnid,
-				   u64 *addr, u64 size, u64 max_addr)
+static int __init emu_setup_memblk(struct numa_meminfo *ei,
+				   int nid, int physnid, u64 start, u64 end)
 {
-	int ret = 0;
-	nodes[nid].start = *addr;
-	*addr += size;
-	if (*addr >= max_addr) {
-		*addr = max_addr;
-		ret = -1;
+	struct numa_memblk *eb = &ei->blk[ei->nr_blks];
+
+	if (ei->nr_blks >= NR_NODE_MEMBLKS) {
+		pr_err("NUMA: Too many emulated memblks, failing emulation\n");
+		return -EINVAL;
 	}
-	nodes[nid].end = *addr;
-	node_set(nid, node_possible_map);
+
+	ei->nr_blks++;
+	eb->start = start;
+	eb->end = end;
+	eb->nid = nid;
 
 	if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
 		emu_nid_to_phys[nid] = physnid;
 
 	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
-	       nodes[nid].start, nodes[nid].end,
-	       (nodes[nid].end - nodes[nid].start) >> 20);
-	return ret;
+	       eb->start, eb->end, (eb->end - eb->start) >> 20);
+	return 0;
 }
 
 /*
  * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
  * to max_addr.  The return value is the number of nodes allocated.
  */
-static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
+static int __init split_nodes_interleave(struct numa_meminfo *ei,
+					 u64 addr, u64 max_addr, int nr_nodes)
 {
 	nodemask_t physnode_mask = NODE_MASK_NONE;
 	u64 size;
 	int big;
-	int ret = 0;
-	int i;
+	int nid = 0;
+	int i, ret;
 
 	if (nr_nodes <= 0)
 		return -1;
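Compared with setup_node_range(), the new emu_setup_memblk() has simpler semantics: no clamping to max_addr and no -1 meaning "this physical node is exhausted"; it appends one block, fails hard when the table is full, and records the fake-to-physical mapping from the first block of each fake nid. A userspace model with hypothetical names, building on the sketch types above:

#include <errno.h>
#include <stdio.h>

#define NUMA_NO_NODE (-1)

/* Model of emu_nid_to_phys[]; callers must fill it with NUMA_NO_NODE
 * first, as numa_emulation() does. */
static int emu_nid_to_phys_model[MAX_NUMNODES];

/*
 * Append one emulated block [start, end) for nid, remembering which
 * physical node it was carved from.  Unlike the old setup_node_range(),
 * a full table is a hard error, not an "out of memory" signal.
 */
static int add_emulated_block(struct numa_meminfo *ei,
			      int nid, int physnid, u64 start, u64 end)
{
	struct numa_memblk *eb = &ei->blk[ei->nr_blks];

	if (ei->nr_blks >= NR_NODE_MEMBLKS)
		return -EINVAL;

	ei->nr_blks++;
	eb->start = start;
	eb->end = end;
	eb->nid = nid;

	/* the first block of a fake nid decides its physical node */
	if (emu_nid_to_phys_model[nid] == NUMA_NO_NODE)
		emu_nid_to_phys_model[nid] = physnid;

	printf("Faking node %d at %016llx-%016llx (%lluMB)\n", nid,
	       (unsigned long long)start, (unsigned long long)end,
	       (unsigned long long)((end - start) >> 20));
	return 0;
}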
@@ -721,7 +734,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
 		u64 end = physnodes[i].start + size;
 		u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
 
-		if (ret < big)
+		if (nid < big)
 			end += FAKE_NODE_MIN_SIZE;
 
 		/*
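The `big` padding this rename touches compensates for rounding: each node's size is rounded down to a multiple of FAKE_NODE_MIN_SIZE, and the first `big` fake nodes each absorb one extra FAKE_NODE_MIN_SIZE so the remainder is not lost. Worked in illustrative units of FAKE_NODE_MIN_SIZE: splitting 10 units over 4 nodes rounds the per-node size down from 2.5 to 2, leaving 2 spare units, so big = 2; nodes 0 and 1 get 3 units, nodes 2 and 3 get 2, and the total is again 10.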
@@ -760,16 +773,21 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
 			 * happen as a result of rounding down each node's size
 			 * to FAKE_NODE_MIN_SIZE.
 			 */
-			if (nodes_weight(physnode_mask) + ret >= nr_nodes)
+			if (nodes_weight(physnode_mask) + nid >= nr_nodes)
 				end = physnodes[i].end;
 
-			if (setup_node_range(ret++, i, &physnodes[i].start,
-					     end - physnodes[i].start,
-					     physnodes[i].end) < 0)
+			ret = emu_setup_memblk(ei, nid++, i,
+					       physnodes[i].start,
+					       min(end, physnodes[i].end));
+			if (ret < 0)
+				return ret;
+
+			physnodes[i].start = min(end, physnodes[i].end);
+			if (physnodes[i].start == physnodes[i].end)
 				node_clear(i, physnode_mask);
 		}
 	}
-	return ret;
+	return 0;
 }
 
 /*
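With allocation moved out of the callee, the caller now owns consumption of the physical nodes: carve a chunk, clamp it with min(end, physnodes[i].end), advance physnodes[i].start past it, and clear the node from the mask once empty. A simplified standalone model of that pattern (even split only; the real loop also accounts for memory holes and the FAKE_NODE_MIN_SIZE padding above), built on add_emulated_block() from the earlier sketch:

/*
 * Carve one chunk per pass from every physical node that still has
 * memory, round-robin, until nr_nodes fake nodes exist.
 */
static int split_interleave_model(struct numa_meminfo *ei,
				  struct bootnode *phys, int nr_phys,
				  int nr_nodes, u64 size)
{
	int nid = 0;

	while (nid < nr_nodes) {
		int progress = 0;
		int i;

		for (i = 0; i < nr_phys && nid < nr_nodes; i++) {
			u64 end = phys[i].start + size;

			if (phys[i].start >= phys[i].end)
				continue;	/* this node is used up */
			if (end > phys[i].end)
				end = phys[i].end;	/* clamp to the node */

			if (add_emulated_block(ei, nid++, i,
					       phys[i].start, end) < 0)
				return -1;

			phys[i].start = end;	/* consume the carved chunk */
			progress = 1;
		}
		if (!progress)
			return -1;	/* ran out of memory early */
	}
	return 0;
}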
@@ -794,12 +812,13 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
  * Sets up fake nodes of `size' interleaved over physical nodes ranging from
  * `addr' to `max_addr'.  The return value is the number of nodes allocated.
  */
-static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
+static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
+					      u64 addr, u64 max_addr, u64 size)
 {
 	nodemask_t physnode_mask = NODE_MASK_NONE;
 	u64 min_size;
-	int ret = 0;
-	int i;
+	int nid = 0;
+	int i, ret;
 
 	if (!size)
 		return -1;
@@ -854,30 +873,31 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
 			    memblock_x86_hole_size(end, physnodes[i].end) < size)
 				end = physnodes[i].end;
 
-			/*
-			 * Setup the fake node that will be allocated as bootmem
-			 * later.  If setup_node_range() returns non-zero, there
-			 * is no more memory available on this physical node.
-			 */
-			if (setup_node_range(ret++, i, &physnodes[i].start,
-					     end - physnodes[i].start,
-					     physnodes[i].end) < 0)
+			ret = emu_setup_memblk(ei, nid++, i,
+					       physnodes[i].start,
+					       min(end, physnodes[i].end));
+			if (ret < 0)
+				return ret;
+
+			physnodes[i].start = min(end, physnodes[i].end);
+			if (physnodes[i].start == physnodes[i].end)
 				node_clear(i, physnode_mask);
 		}
 	}
-	return ret;
+	return 0;
 }
 
 /*
  * Sets up the system RAM area from start_pfn to last_pfn according to the
  * numa=fake command-line option.
  */
-static int __init numa_emulation(int acpi, int amd)
+static bool __init numa_emulation(int acpi, int amd)
 {
 	static struct numa_meminfo ei __initdata;
 	const u64 max_addr = max_pfn << PAGE_SHIFT;
-	int num_nodes;
-	int i;
+	int i, ret;
+
+	memset(&ei, 0, sizeof(ei));
 
 	for (i = 0; i < MAX_NUMNODES; i++)
 		emu_nid_to_phys[i] = NUMA_NO_NODE;
@@ -891,52 +911,33 @@ static int __init numa_emulation(int acpi, int amd)
 		u64 size;
 
 		size = memparse(emu_cmdline, &emu_cmdline);
-		num_nodes = split_nodes_size_interleave(0, max_addr, size);
+		ret = split_nodes_size_interleave(&ei, 0, max_addr, size);
 	} else {
 		unsigned long n;
 
 		n = simple_strtoul(emu_cmdline, NULL, 0);
-		num_nodes = split_nodes_interleave(0, max_addr, n);
+		ret = split_nodes_interleave(&ei, 0, max_addr, n);
+	}
+
+	if (ret < 0)
+		return false;
+
+	if (numa_cleanup_meminfo(&ei) < 0) {
+		pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
+		return false;
 	}
 
-	if (num_nodes < 0)
-		return num_nodes;
+	/* commit */
+	numa_meminfo = ei;
 
 	/* make sure all emulated nodes are mapped to a physical node */
 	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
 		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
 			emu_nid_to_phys[i] = 0;
 
-	ei.nr_blks = num_nodes;
-	for (i = 0; i < ei.nr_blks; i++) {
-		ei.blk[i].start = nodes[i].start;
-		ei.blk[i].end = nodes[i].end;
-		ei.blk[i].nid = i;
-	}
-
-	memnode_shift = compute_hash_shift(&ei);
-	if (memnode_shift < 0) {
-		memnode_shift = 0;
-		printk(KERN_ERR "No NUMA hash function found.  NUMA emulation "
-		       "disabled.\n");
-		return -1;
-	}
-
-	/*
-	 * We need to vacate all active ranges that may have been registered for
-	 * the e820 memory map.
-	 */
-	remove_all_active_ranges();
-	for_each_node_mask(i, node_possible_map)
-		memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
-						nodes[i].end >> PAGE_SHIFT);
-	init_memory_mapping_high();
-	for_each_node_mask(i, node_possible_map)
-		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-	fake_physnodes(acpi, amd, num_nodes);
-	numa_init_array();
+	fake_physnodes(acpi, amd, &ei);
 	numa_emu_dist = true;
-	return 0;
+	return true;
 }
 #endif /* CONFIG_NUMA_EMU */
 
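numa_emulation() now has a build / validate / commit shape: construct a throwaway meminfo, return false on any failure, and only assign the global numa_meminfo once the candidate is known good. That is what lets initmem_init() below fall back cleanly. A compressed userspace model of the same shape; the 'M'/'G' dispatch between size and count mode is an assumption about context not visible in this hunk, and parse_size() is a crude stand-in for the kernel's memparse():

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

/* Crude memparse() stand-in: "512M", "2G", "0x100000", ... */
static u64 parse_size(const char *s)
{
	char *end;
	u64 v = strtoull(s, &end, 0);

	switch (*end) {
	case 'G': case 'g': v <<= 30; break;
	case 'M': case 'm': v <<= 20; break;
	case 'K': case 'k': v <<= 10; break;
	}
	return v;
}

/* Build a candidate meminfo, bail out on failure, commit on success. */
static bool emulate(const char *cmdline, struct bootnode *phys, int nr_phys,
		    u64 max_addr, struct numa_meminfo *committed)
{
	struct numa_meminfo ei;		/* scratch copy, dropped on failure */
	int ret;

	memset(&ei, 0, sizeof(ei));

	if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
		u64 size = parse_size(cmdline);	/* numa=fake=<size> */

		ret = size ? split_interleave_model(&ei, phys, nr_phys,
						    (int)(max_addr / size), size)
			   : -1;
	} else {
		int n = atoi(cmdline);		/* numa=fake=<N> */

		ret = n > 0 ? split_interleave_model(&ei, phys, nr_phys,
						     n, max_addr / n)
			    : -1;
	}
	if (ret < 0)
		return false;	/* leave the real config untouched */

	*committed = ei;	/* commit, as in "numa_meminfo = ei" above */
	return true;
}

With these pieces, emulate("4", ...) would ask for four evenly sized fake nodes and emulate("512M", ...) for as many 512MB nodes as fit, mirroring numa=fake=4 versus numa=fake=512M.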
@@ -988,15 +989,13 @@ void __init initmem_init(void)
 			continue;
 #ifdef CONFIG_NUMA_EMU
 		setup_physnodes(0, max_pfn << PAGE_SHIFT);
-		if (emu_cmdline && !numa_emulation(i == 0, i == 1))
-			return;
-
-		/* not emulating, build identity mapping for numa_add_cpu() */
-		for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
-			emu_nid_to_phys[j] = j;
-
-		nodes_clear(node_possible_map);
-		nodes_clear(node_online_map);
+		/*
+		 * If requested, try emulation.  If emulation is not used,
+		 * build identity emu_nid_to_phys[] for numa_add_cpu()
+		 */
+		if (!emu_cmdline || !numa_emulation(i == 0, i == 1))
+			for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
+				emu_nid_to_phys[j] = j;
 #endif
 		if (numa_register_memblks(&numa_meminfo) < 0)
 			continue;
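This last hunk collapses the two earlier exit paths into one condition: whether emulation was never requested (!emu_cmdline) or was requested but returned false, emu_nid_to_phys[] falls back to the identity map that numa_add_cpu() needs, and either way control continues into the shared numa_register_memblks() path. A one-function model of that decision, reusing emu_nid_to_phys_model from the earlier sketch:

#include <stdbool.h>

/*
 * Identity map unless emulation was both requested and succeeded.
 * 'requested' stands in for emu_cmdline being set; 'emulated' for
 * numa_emulation() returning true.
 */
static void setup_emu_mapping(bool requested, bool emulated)
{
	if (!requested || !emulated) {
		int j;

		for (j = 0; j < MAX_NUMNODES; j++)
			emu_nid_to_phys_model[j] = j;
	}
}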