aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/mm/numa_64.c
diff options
context:
space:
mode:
author Tejun Heo <tj@kernel.org> 2011-02-22 05:10:08 -0500
committer Tejun Heo <tj@kernel.org> 2011-02-22 05:10:08 -0500
commit b8ef9172b2aad7eeb1fcd37a9e632c7b24da1f64 (patch)
tree c63a57e3c392f2d45ba975d29cdd768904ef169f /arch/x86/mm/numa_64.c
parent fbe99959d1db85222829a64d869dcab704ac7ec8 (diff)
x86-64, NUMA: Move NUMA emulation into numa_emulation.c
Create numa_emulation.c and move all NUMA emulation code there. The definitions of struct numa_memblk and numa_meminfo are moved to numa_64.h. Also, numa_remove_memblk_from(), numa_cleanup_meminfo(), numa_reset_distance() along with numa_emulation() are made global.

- v2: Internal declarations moved to numa_internal.h as suggested by Yinghai.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Diffstat (limited to 'arch/x86/mm/numa_64.c')
-rw-r--r-- arch/x86/mm/numa_64.c 480
1 files changed, 4 insertions, 476 deletions
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 980d51458c4b..45a361b16a59 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -18,20 +18,10 @@
18#include <asm/e820.h> 18#include <asm/e820.h>
19#include <asm/proto.h> 19#include <asm/proto.h>
20#include <asm/dma.h> 20#include <asm/dma.h>
21#include <asm/numa.h>
22#include <asm/acpi.h> 21#include <asm/acpi.h>
23#include <asm/amd_nb.h> 22#include <asm/amd_nb.h>
24 23
25struct numa_memblk { 24#include "numa_internal.h"
26 u64 start;
27 u64 end;
28 int nid;
29};
30
31struct numa_meminfo {
32 int nr_blks;
33 struct numa_memblk blk[NR_NODE_MEMBLKS];
34};
35 25
36struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 26struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
37EXPORT_SYMBOL(node_data); 27EXPORT_SYMBOL(node_data);
@@ -215,7 +205,7 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
215 return 0; 205 return 0;
216} 206}
217 207
218static void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) 208void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
219{ 209{
220 mi->nr_blks--; 210 mi->nr_blks--;
221 memmove(&mi->blk[idx], &mi->blk[idx + 1], 211 memmove(&mi->blk[idx], &mi->blk[idx + 1],
@@ -273,7 +263,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
273 node_set_online(nodeid); 263 node_set_online(nodeid);
274} 264}
275 265
276static int __init numa_cleanup_meminfo(struct numa_meminfo *mi) 266int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
277{ 267{
278 const u64 low = 0; 268 const u64 low = 0;
279 const u64 high = (u64)max_pfn << PAGE_SHIFT; 269 const u64 high = (u64)max_pfn << PAGE_SHIFT;
@@ -367,7 +357,7 @@ static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
367 * Reset distance table. The current table is freed. The next 357 * Reset distance table. The current table is freed. The next
368 * numa_set_distance() call will create a new one. 358 * numa_set_distance() call will create a new one.
369 */ 359 */
370static void __init numa_reset_distance(void) 360void __init numa_reset_distance(void)
371{ 361{
372 size_t size; 362 size_t size;
373 363
@@ -525,388 +515,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
525 return 0; 515 return 0;
526} 516}
527 517
528#ifdef CONFIG_NUMA_EMU
529/* Numa emulation */
530static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
531static char *emu_cmdline __initdata;
532
533void __init numa_emu_cmdline(char *str)
534{
535 emu_cmdline = str;
536}
537
538static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
539{
540 int i;
541
542 for (i = 0; i < mi->nr_blks; i++)
543 if (mi->blk[i].nid == nid)
544 return i;
545 return -ENOENT;
546}
547
548/*
549 * Sets up emulated node @nid with @size bytes carved off the start of physical
550 * memblk @phys_blk. The return value is -errno if something went wrong, 0 otherwise.
551 */
552static int __init emu_setup_memblk(struct numa_meminfo *ei,
553 struct numa_meminfo *pi,
554 int nid, int phys_blk, u64 size)
555{
556 struct numa_memblk *eb = &ei->blk[ei->nr_blks];
557 struct numa_memblk *pb = &pi->blk[phys_blk];
558
559 if (ei->nr_blks >= NR_NODE_MEMBLKS) {
560 pr_err("NUMA: Too many emulated memblks, failing emulation\n");
561 return -EINVAL;
562 }
563
564 ei->nr_blks++;
565 eb->start = pb->start;
566 eb->end = pb->start + size;
567 eb->nid = nid;
568
569 if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
570 emu_nid_to_phys[nid] = pb->nid;
571
572 pb->start += size;
573 if (pb->start >= pb->end) {
574 WARN_ON_ONCE(pb->start > pb->end);
575 numa_remove_memblk_from(phys_blk, pi);
576 }
577
578 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
579 eb->start, eb->end, (eb->end - eb->start) >> 20);
580 return 0;
581}
582
583/*
584 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
585 * to max_addr. The return value is 0 on success, negative on failure.
586 */
587static int __init split_nodes_interleave(struct numa_meminfo *ei,
588 struct numa_meminfo *pi,
589 u64 addr, u64 max_addr, int nr_nodes)
590{
591 nodemask_t physnode_mask = NODE_MASK_NONE;
592 u64 size;
593 int big;
594 int nid = 0;
595 int i, ret;
596
597 if (nr_nodes <= 0)
598 return -1;
599 if (nr_nodes > MAX_NUMNODES) {
600 pr_info("numa=fake=%d too large, reducing to %d\n",
601 nr_nodes, MAX_NUMNODES);
602 nr_nodes = MAX_NUMNODES;
603 }
604
605 size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
606 /*
607 * Calculate the number of big nodes that can be allocated as a result
608 * of consolidating the remainder.
609 */
610 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
611 FAKE_NODE_MIN_SIZE;
612
613 size &= FAKE_NODE_MIN_HASH_MASK;
614 if (!size) {
615 pr_err("Not enough memory for each node. "
616 "NUMA emulation disabled.\n");
617 return -1;
618 }
619
620 for (i = 0; i < pi->nr_blks; i++)
621 node_set(pi->blk[i].nid, physnode_mask);
622
623 /*
624 * Continue to fill physical nodes with fake nodes until there is no
625 * memory left on any of them.
626 */
627 while (nodes_weight(physnode_mask)) {
628 for_each_node_mask(i, physnode_mask) {
629 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
630 u64 start, limit, end;
631 int phys_blk;
632
633 phys_blk = emu_find_memblk_by_nid(i, pi);
634 if (phys_blk < 0) {
635 node_clear(i, physnode_mask);
636 continue;
637 }
638 start = pi->blk[phys_blk].start;
639 limit = pi->blk[phys_blk].end;
640 end = start + size;
641
642 if (nid < big)
643 end += FAKE_NODE_MIN_SIZE;
644
645 /*
646 * Continue to add memory to this fake node if its
647 * non-reserved memory is less than the per-node size.
648 */
649 while (end - start -
650 memblock_x86_hole_size(start, end) < size) {
651 end += FAKE_NODE_MIN_SIZE;
652 if (end > limit) {
653 end = limit;
654 break;
655 }
656 }
657
658 /*
659 * If there won't be at least FAKE_NODE_MIN_SIZE of
660 * non-reserved memory in ZONE_DMA32 for the next node,
661 * this one must extend to the boundary.
662 */
663 if (end < dma32_end && dma32_end - end -
664 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
665 end = dma32_end;
666
667 /*
668 * If there won't be enough non-reserved memory for the
669 * next node, this one must extend to the end of the
670 * physical node.
671 */
672 if (limit - end -
673 memblock_x86_hole_size(end, limit) < size)
674 end = limit;
675
676 ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
677 phys_blk,
678 min(end, limit) - start);
679 if (ret < 0)
680 return ret;
681 }
682 }
683 return 0;
684}
685
686/*
687 * Returns the end address of a node so that there is at least `size' amount of
688 * non-reserved memory or `max_addr' is reached.
689 */
690static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
691{
692 u64 end = start + size;
693
694 while (end - start - memblock_x86_hole_size(start, end) < size) {
695 end += FAKE_NODE_MIN_SIZE;
696 if (end > max_addr) {
697 end = max_addr;
698 break;
699 }
700 }
701 return end;
702}
703
704/*
705 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
706 * `addr' to `max_addr'. The return value is 0 on success, negative on failure.
707 */
708static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
709 struct numa_meminfo *pi,
710 u64 addr, u64 max_addr, u64 size)
711{
712 nodemask_t physnode_mask = NODE_MASK_NONE;
713 u64 min_size;
714 int nid = 0;
715 int i, ret;
716
717 if (!size)
718 return -1;
719 /*
720 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
721 * increased accordingly if the requested size is too small. This
722 * creates a uniform distribution of node sizes across the entire
723 * machine (but not necessarily over physical nodes).
724 */
725 min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
726 MAX_NUMNODES;
727 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
728 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
729 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
730 FAKE_NODE_MIN_HASH_MASK;
731 if (size < min_size) {
732 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
733 size >> 20, min_size >> 20);
734 size = min_size;
735 }
736 size &= FAKE_NODE_MIN_HASH_MASK;
737
738 for (i = 0; i < pi->nr_blks; i++)
739 node_set(pi->blk[i].nid, physnode_mask);
740
741 /*
742 * Fill physical nodes with fake nodes of size until there is no memory
743 * left on any of them.
744 */
745 while (nodes_weight(physnode_mask)) {
746 for_each_node_mask(i, physnode_mask) {
747 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
748 u64 start, limit, end;
749 int phys_blk;
750
751 phys_blk = emu_find_memblk_by_nid(i, pi);
752 if (phys_blk < 0) {
753 node_clear(i, physnode_mask);
754 continue;
755 }
756 start = pi->blk[phys_blk].start;
757 limit = pi->blk[phys_blk].end;
758
759 end = find_end_of_node(start, limit, size);
760 /*
761 * If there won't be at least FAKE_NODE_MIN_SIZE of
762 * non-reserved memory in ZONE_DMA32 for the next node,
763 * this one must extend to the boundary.
764 */
765 if (end < dma32_end && dma32_end - end -
766 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
767 end = dma32_end;
768
769 /*
770 * If there won't be enough non-reserved memory for the
771 * next node, this one must extend to the end of the
772 * physical node.
773 */
774 if (limit - end -
775 memblock_x86_hole_size(end, limit) < size)
776 end = limit;
777
778 ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
779 phys_blk,
780 min(end, limit) - start);
781 if (ret < 0)
782 return ret;
783 }
784 }
785 return 0;
786}
787
788/*
789 * Splits the memory blocks in @numa_meminfo into emulated nodes according to
790 * the numa=fake command-line option.
791 */
792static void __init numa_emulation(struct numa_meminfo *numa_meminfo,
793 int numa_dist_cnt)
794{
795 static struct numa_meminfo ei __initdata;
796 static struct numa_meminfo pi __initdata;
797 const u64 max_addr = max_pfn << PAGE_SHIFT;
798 u8 *phys_dist = NULL;
799 int i, j, ret;
800
801 if (!emu_cmdline)
802 goto no_emu;
803
804 memset(&ei, 0, sizeof(ei));
805 pi = *numa_meminfo;
806
807 for (i = 0; i < MAX_NUMNODES; i++)
808 emu_nid_to_phys[i] = NUMA_NO_NODE;
809
810 /*
811 * If the numa=fake command-line contains a 'M' or 'G', it represents
812 * the fixed node size. Otherwise, if it is just a single number N,
813 * split the system RAM into N fake nodes.
814 */
815 if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
816 u64 size;
817
818 size = memparse(emu_cmdline, &emu_cmdline);
819 ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
820 } else {
821 unsigned long n;
822
823 n = simple_strtoul(emu_cmdline, NULL, 0);
824 ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
825 }
826
827 if (ret < 0)
828 goto no_emu;
829
830 if (numa_cleanup_meminfo(&ei) < 0) {
831 pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
832 goto no_emu;
833 }
834
835 /*
836 * Copy the original distance table. It's temporary so no need to
837 * reserve it.
838 */
839 if (numa_dist_cnt) {
840 size_t size = numa_dist_cnt * sizeof(phys_dist[0]);
841 u64 phys;
842
843 phys = memblock_find_in_range(0,
844 (u64)max_pfn_mapped << PAGE_SHIFT,
845 size, PAGE_SIZE);
846 if (phys == MEMBLOCK_ERROR) {
847 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
848 goto no_emu;
849 }
850 phys_dist = __va(phys);
851
852 for (i = 0; i < numa_dist_cnt; i++)
853 for (j = 0; j < numa_dist_cnt; j++)
854 phys_dist[i * numa_dist_cnt + j] =
855 node_distance(i, j);
856 }
857
858 /* commit */
859 *numa_meminfo = ei;
860
861 /*
862 * Transform __apicid_to_node table to use emulated nids by
863 * reverse-mapping phys_nid. The maps should always exist but fall
864 * back to zero just in case.
865 */
866 for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
867 if (__apicid_to_node[i] == NUMA_NO_NODE)
868 continue;
869 for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
870 if (__apicid_to_node[i] == emu_nid_to_phys[j])
871 break;
872 __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
873 }
874
875 /* make sure all emulated nodes are mapped to a physical node */
876 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
877 if (emu_nid_to_phys[i] == NUMA_NO_NODE)
878 emu_nid_to_phys[i] = 0;
879
880 /* transform distance table */
881 numa_reset_distance();
882 for (i = 0; i < MAX_NUMNODES; i++) {
883 for (j = 0; j < MAX_NUMNODES; j++) {
884 int physi = emu_nid_to_phys[i];
885 int physj = emu_nid_to_phys[j];
886 int dist;
887
888 if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
889 dist = physi == physj ?
890 LOCAL_DISTANCE : REMOTE_DISTANCE;
891 else
892 dist = phys_dist[physi * numa_dist_cnt + physj];
893
894 numa_set_distance(i, j, dist);
895 }
896 }
897 return;
898
899no_emu:
900 /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
901 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
902 emu_nid_to_phys[i] = i;
903}
904#else /* CONFIG_NUMA_EMU */
905static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
906 int numa_dist_cnt)
907{ }
908#endif /* CONFIG_NUMA_EMU */
909
910static int __init dummy_numa_init(void) 518static int __init dummy_numa_init(void)
911{ 519{
912 printk(KERN_INFO "%s\n", 520 printk(KERN_INFO "%s\n",
@@ -994,83 +602,3 @@ int __cpuinit numa_cpu_node(int cpu)
994 return __apicid_to_node[apicid]; 602 return __apicid_to_node[apicid];
995 return NUMA_NO_NODE; 603 return NUMA_NO_NODE;
996} 604}
997
998/*
999 * UGLINESS AHEAD: Currently, CONFIG_NUMA_EMU is 64bit only and makes use
1000 * of 64bit specific data structures. The distinction is artificial and
1001 * should be removed. numa_{add|remove}_cpu() are implemented in numa.c
1002 * for both 32 and 64bit when CONFIG_NUMA_EMU is disabled but here when
1003 * enabled.
1004 *
1005 * NUMA emulation is planned to be made generic and the following and other
1006 * related code should be moved to numa.c.
1007 */
1008#ifdef CONFIG_NUMA_EMU
1009# ifndef CONFIG_DEBUG_PER_CPU_MAPS
1010void __cpuinit numa_add_cpu(int cpu)
1011{
1012 int physnid, nid;
1013
1014 nid = numa_cpu_node(cpu);
1015 if (nid == NUMA_NO_NODE)
1016 nid = early_cpu_to_node(cpu);
1017 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
1018
1019 physnid = emu_nid_to_phys[nid];
1020
1021 /*
1022 * Map the cpu to each emulated node that is allocated on the physical
1023 * node of the cpu's apic id.
1024 */
1025 for_each_online_node(nid)
1026 if (emu_nid_to_phys[nid] == physnid)
1027 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
1028}
1029
1030void __cpuinit numa_remove_cpu(int cpu)
1031{
1032 int i;
1033
1034 for_each_online_node(i)
1035 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
1036}
1037# else /* !CONFIG_DEBUG_PER_CPU_MAPS */
1038static void __cpuinit numa_set_cpumask(int cpu, int enable)
1039{
1040 struct cpumask *mask;
1041 int nid, physnid, i;
1042
1043 nid = early_cpu_to_node(cpu);
1044 if (nid == NUMA_NO_NODE) {
1045 /* early_cpu_to_node() already emits a warning and trace */
1046 return;
1047 }
1048
1049 physnid = emu_nid_to_phys[nid];
1050
1051 for_each_online_node(i) {
1052 if (emu_nid_to_phys[nid] != physnid)
1053 continue;
1054
1055 mask = debug_cpumask_set_cpu(cpu, enable);
1056 if (!mask)
1057 return;
1058
1059 if (enable)
1060 cpumask_set_cpu(cpu, mask);
1061 else
1062 cpumask_clear_cpu(cpu, mask);
1063 }
1064}
1065
1066void __cpuinit numa_add_cpu(int cpu)
1067{
1068 numa_set_cpumask(cpu, 1);
1069}
1070
1071void __cpuinit numa_remove_cpu(int cpu)
1072{
1073 numa_set_cpumask(cpu, 0);
1074}
1075# endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
1076#endif /* CONFIG_NUMA_EMU */