about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Tejun Heo <tj@kernel.org> 2011-02-22 05:10:08 -0500
committer Tejun Heo <tj@kernel.org> 2011-02-22 05:10:08 -0500
commit b8ef9172b2aad7eeb1fcd37a9e632c7b24da1f64 (patch)
tree c63a57e3c392f2d45ba975d29cdd768904ef169f
parent fbe99959d1db85222829a64d869dcab704ac7ec8 (diff)
x86-64, NUMA: Move NUMA emulation into numa_emulation.c
Create numa_emulation.c and move all NUMA emulation code there. The
definitions of struct numa_memblk and numa_meminfo are moved to
numa_64.h. Also, numa_remove_memblk_from(), numa_cleanup_meminfo(),
numa_reset_distance() along with numa_emulation() are made global.

- v2: Internal declarations moved to numa_internal.h as suggested by
  Yinghai.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
-rw-r--r-- arch/x86/mm/Makefile 1
-rw-r--r-- arch/x86/mm/numa_64.c 480
-rw-r--r-- arch/x86/mm/numa_emulation.c 452
-rw-r--r-- arch/x86/mm/numa_internal.h 31
4 files changed, 488 insertions, 476 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 09df2f9a3d69..3e608edf9958 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o 25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
26obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o 26obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o
27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
28obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
28 29
29obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 30obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
30 31
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 980d51458c4b..45a361b16a59 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -18,20 +18,10 @@
18#include <asm/e820.h> 18#include <asm/e820.h>
19#include <asm/proto.h> 19#include <asm/proto.h>
20#include <asm/dma.h> 20#include <asm/dma.h>
21#include <asm/numa.h>
22#include <asm/acpi.h> 21#include <asm/acpi.h>
23#include <asm/amd_nb.h> 22#include <asm/amd_nb.h>
24 23
25struct numa_memblk { 24#include "numa_internal.h"
26 u64 start;
27 u64 end;
28 int nid;
29};
30
31struct numa_meminfo {
32 int nr_blks;
33 struct numa_memblk blk[NR_NODE_MEMBLKS];
34};
35 25
36struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 26struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
37EXPORT_SYMBOL(node_data); 27EXPORT_SYMBOL(node_data);
@@ -215,7 +205,7 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
215 return 0; 205 return 0;
216} 206}
217 207
218static void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) 208void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
219{ 209{
220 mi->nr_blks--; 210 mi->nr_blks--;
221 memmove(&mi->blk[idx], &mi->blk[idx + 1], 211 memmove(&mi->blk[idx], &mi->blk[idx + 1],
@@ -273,7 +263,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
273 node_set_online(nodeid); 263 node_set_online(nodeid);
274} 264}
275 265
276static int __init numa_cleanup_meminfo(struct numa_meminfo *mi) 266int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
277{ 267{
278 const u64 low = 0; 268 const u64 low = 0;
279 const u64 high = (u64)max_pfn << PAGE_SHIFT; 269 const u64 high = (u64)max_pfn << PAGE_SHIFT;
@@ -367,7 +357,7 @@ static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
367 * Reset distance table. The current table is freed. The next 357 * Reset distance table. The current table is freed. The next
368 * numa_set_distance() call will create a new one. 358 * numa_set_distance() call will create a new one.
369 */ 359 */
370static void __init numa_reset_distance(void) 360void __init numa_reset_distance(void)
371{ 361{
372 size_t size; 362 size_t size;
373 363
@@ -525,388 +515,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
525 return 0; 515 return 0;
526} 516}
527 517
528#ifdef CONFIG_NUMA_EMU
529/* Numa emulation */
530static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
531static char *emu_cmdline __initdata;
532
533void __init numa_emu_cmdline(char *str)
534{
535 emu_cmdline = str;
536}
537
538static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
539{
540 int i;
541
542 for (i = 0; i < mi->nr_blks; i++)
543 if (mi->blk[i].nid == nid)
544 return i;
545 return -ENOENT;
546}
547
548/*
549 * Sets up nid to range from @start to @end. The return value is -errno if
550 * something went wrong, 0 otherwise.
551 */
552static int __init emu_setup_memblk(struct numa_meminfo *ei,
553 struct numa_meminfo *pi,
554 int nid, int phys_blk, u64 size)
555{
556 struct numa_memblk *eb = &ei->blk[ei->nr_blks];
557 struct numa_memblk *pb = &pi->blk[phys_blk];
558
559 if (ei->nr_blks >= NR_NODE_MEMBLKS) {
560 pr_err("NUMA: Too many emulated memblks, failing emulation\n");
561 return -EINVAL;
562 }
563
564 ei->nr_blks++;
565 eb->start = pb->start;
566 eb->end = pb->start + size;
567 eb->nid = nid;
568
569 if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
570 emu_nid_to_phys[nid] = pb->nid;
571
572 pb->start += size;
573 if (pb->start >= pb->end) {
574 WARN_ON_ONCE(pb->start > pb->end);
575 numa_remove_memblk_from(phys_blk, pi);
576 }
577
578 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
579 eb->start, eb->end, (eb->end - eb->start) >> 20);
580 return 0;
581}
582
583/*
584 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
585 * to max_addr. The return value is the number of nodes allocated.
586 */
587static int __init split_nodes_interleave(struct numa_meminfo *ei,
588 struct numa_meminfo *pi,
589 u64 addr, u64 max_addr, int nr_nodes)
590{
591 nodemask_t physnode_mask = NODE_MASK_NONE;
592 u64 size;
593 int big;
594 int nid = 0;
595 int i, ret;
596
597 if (nr_nodes <= 0)
598 return -1;
599 if (nr_nodes > MAX_NUMNODES) {
600 pr_info("numa=fake=%d too large, reducing to %d\n",
601 nr_nodes, MAX_NUMNODES);
602 nr_nodes = MAX_NUMNODES;
603 }
604
605 size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
606 /*
607 * Calculate the number of big nodes that can be allocated as a result
608 * of consolidating the remainder.
609 */
610 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
611 FAKE_NODE_MIN_SIZE;
612
613 size &= FAKE_NODE_MIN_HASH_MASK;
614 if (!size) {
615 pr_err("Not enough memory for each node. "
616 "NUMA emulation disabled.\n");
617 return -1;
618 }
619
620 for (i = 0; i < pi->nr_blks; i++)
621 node_set(pi->blk[i].nid, physnode_mask);
622
623 /*
624 * Continue to fill physical nodes with fake nodes until there is no
625 * memory left on any of them.
626 */
627 while (nodes_weight(physnode_mask)) {
628 for_each_node_mask(i, physnode_mask) {
629 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
630 u64 start, limit, end;
631 int phys_blk;
632
633 phys_blk = emu_find_memblk_by_nid(i, pi);
634 if (phys_blk < 0) {
635 node_clear(i, physnode_mask);
636 continue;
637 }
638 start = pi->blk[phys_blk].start;
639 limit = pi->blk[phys_blk].end;
640 end = start + size;
641
642 if (nid < big)
643 end += FAKE_NODE_MIN_SIZE;
644
645 /*
646 * Continue to add memory to this fake node if its
647 * non-reserved memory is less than the per-node size.
648 */
649 while (end - start -
650 memblock_x86_hole_size(start, end) < size) {
651 end += FAKE_NODE_MIN_SIZE;
652 if (end > limit) {
653 end = limit;
654 break;
655 }
656 }
657
658 /*
659 * If there won't be at least FAKE_NODE_MIN_SIZE of
660 * non-reserved memory in ZONE_DMA32 for the next node,
661 * this one must extend to the boundary.
662 */
663 if (end < dma32_end && dma32_end - end -
664 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
665 end = dma32_end;
666
667 /*
668 * If there won't be enough non-reserved memory for the
669 * next node, this one must extend to the end of the
670 * physical node.
671 */
672 if (limit - end -
673 memblock_x86_hole_size(end, limit) < size)
674 end = limit;
675
676 ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
677 phys_blk,
678 min(end, limit) - start);
679 if (ret < 0)
680 return ret;
681 }
682 }
683 return 0;
684}
685
686/*
687 * Returns the end address of a node so that there is at least `size' amount of
688 * non-reserved memory or `max_addr' is reached.
689 */
690static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
691{
692 u64 end = start + size;
693
694 while (end - start - memblock_x86_hole_size(start, end) < size) {
695 end += FAKE_NODE_MIN_SIZE;
696 if (end > max_addr) {
697 end = max_addr;
698 break;
699 }
700 }
701 return end;
702}
703
704/*
705 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
706 * `addr' to `max_addr'. The return value is the number of nodes allocated.
707 */
708static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
709 struct numa_meminfo *pi,
710 u64 addr, u64 max_addr, u64 size)
711{
712 nodemask_t physnode_mask = NODE_MASK_NONE;
713 u64 min_size;
714 int nid = 0;
715 int i, ret;
716
717 if (!size)
718 return -1;
719 /*
720 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
721 * increased accordingly if the requested size is too small. This
722 * creates a uniform distribution of node sizes across the entire
723 * machine (but not necessarily over physical nodes).
724 */
725 min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
726 MAX_NUMNODES;
727 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
728 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
729 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
730 FAKE_NODE_MIN_HASH_MASK;
731 if (size < min_size) {
732 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
733 size >> 20, min_size >> 20);
734 size = min_size;
735 }
736 size &= FAKE_NODE_MIN_HASH_MASK;
737
738 for (i = 0; i < pi->nr_blks; i++)
739 node_set(pi->blk[i].nid, physnode_mask);
740
741 /*
742 * Fill physical nodes with fake nodes of size until there is no memory
743 * left on any of them.
744 */
745 while (nodes_weight(physnode_mask)) {
746 for_each_node_mask(i, physnode_mask) {
747 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
748 u64 start, limit, end;
749 int phys_blk;
750
751 phys_blk = emu_find_memblk_by_nid(i, pi);
752 if (phys_blk < 0) {
753 node_clear(i, physnode_mask);
754 continue;
755 }
756 start = pi->blk[phys_blk].start;
757 limit = pi->blk[phys_blk].end;
758
759 end = find_end_of_node(start, limit, size);
760 /*
761 * If there won't be at least FAKE_NODE_MIN_SIZE of
762 * non-reserved memory in ZONE_DMA32 for the next node,
763 * this one must extend to the boundary.
764 */
765 if (end < dma32_end && dma32_end - end -
766 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
767 end = dma32_end;
768
769 /*
770 * If there won't be enough non-reserved memory for the
771 * next node, this one must extend to the end of the
772 * physical node.
773 */
774 if (limit - end -
775 memblock_x86_hole_size(end, limit) < size)
776 end = limit;
777
778 ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
779 phys_blk,
780 min(end, limit) - start);
781 if (ret < 0)
782 return ret;
783 }
784 }
785 return 0;
786}
787
788/*
789 * Sets up the system RAM area from start_pfn to last_pfn according to the
790 * numa=fake command-line option.
791 */
792static void __init numa_emulation(struct numa_meminfo *numa_meminfo,
793 int numa_dist_cnt)
794{
795 static struct numa_meminfo ei __initdata;
796 static struct numa_meminfo pi __initdata;
797 const u64 max_addr = max_pfn << PAGE_SHIFT;
798 u8 *phys_dist = NULL;
799 int i, j, ret;
800
801 if (!emu_cmdline)
802 goto no_emu;
803
804 memset(&ei, 0, sizeof(ei));
805 pi = *numa_meminfo;
806
807 for (i = 0; i < MAX_NUMNODES; i++)
808 emu_nid_to_phys[i] = NUMA_NO_NODE;
809
810 /*
811 * If the numa=fake command-line contains a 'M' or 'G', it represents
812 * the fixed node size. Otherwise, if it is just a single number N,
813 * split the system RAM into N fake nodes.
814 */
815 if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
816 u64 size;
817
818 size = memparse(emu_cmdline, &emu_cmdline);
819 ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
820 } else {
821 unsigned long n;
822
823 n = simple_strtoul(emu_cmdline, NULL, 0);
824 ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
825 }
826
827 if (ret < 0)
828 goto no_emu;
829
830 if (numa_cleanup_meminfo(&ei) < 0) {
831 pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
832 goto no_emu;
833 }
834
835 /*
836 * Copy the original distance table. It's temporary so no need to
837 * reserve it.
838 */
839 if (numa_dist_cnt) {
840 size_t size = numa_dist_cnt * sizeof(phys_dist[0]);
841 u64 phys;
842
843 phys = memblock_find_in_range(0,
844 (u64)max_pfn_mapped << PAGE_SHIFT,
845 size, PAGE_SIZE);
846 if (phys == MEMBLOCK_ERROR) {
847 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
848 goto no_emu;
849 }
850 phys_dist = __va(phys);
851
852 for (i = 0; i < numa_dist_cnt; i++)
853 for (j = 0; j < numa_dist_cnt; j++)
854 phys_dist[i * numa_dist_cnt + j] =
855 node_distance(i, j);
856 }
857
858 /* commit */
859 *numa_meminfo = ei;
860
861 /*
862 * Transform __apicid_to_node table to use emulated nids by
863 * reverse-mapping phys_nid. The maps should always exist but fall
864 * back to zero just in case.
865 */
866 for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
867 if (__apicid_to_node[i] == NUMA_NO_NODE)
868 continue;
869 for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
870 if (__apicid_to_node[i] == emu_nid_to_phys[j])
871 break;
872 __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
873 }
874
875 /* make sure all emulated nodes are mapped to a physical node */
876 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
877 if (emu_nid_to_phys[i] == NUMA_NO_NODE)
878 emu_nid_to_phys[i] = 0;
879
880 /* transform distance table */
881 numa_reset_distance();
882 for (i = 0; i < MAX_NUMNODES; i++) {
883 for (j = 0; j < MAX_NUMNODES; j++) {
884 int physi = emu_nid_to_phys[i];
885 int physj = emu_nid_to_phys[j];
886 int dist;
887
888 if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
889 dist = physi == physj ?
890 LOCAL_DISTANCE : REMOTE_DISTANCE;
891 else
892 dist = phys_dist[physi * numa_dist_cnt + physj];
893
894 numa_set_distance(i, j, dist);
895 }
896 }
897 return;
898
899no_emu:
900 /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
901 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
902 emu_nid_to_phys[i] = i;
903}
904#else /* CONFIG_NUMA_EMU */
905static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
906 int numa_dist_cnt)
907{ }
908#endif /* CONFIG_NUMA_EMU */
909
910static int __init dummy_numa_init(void) 518static int __init dummy_numa_init(void)
911{ 519{
912 printk(KERN_INFO "%s\n", 520 printk(KERN_INFO "%s\n",
@@ -994,83 +602,3 @@ int __cpuinit numa_cpu_node(int cpu)
994 return __apicid_to_node[apicid]; 602 return __apicid_to_node[apicid];
995 return NUMA_NO_NODE; 603 return NUMA_NO_NODE;
996} 604}
997
998/*
999 * UGLINESS AHEAD: Currently, CONFIG_NUMA_EMU is 64bit only and makes use
1000 * of 64bit specific data structures. The distinction is artificial and
1001 * should be removed. numa_{add|remove}_cpu() are implemented in numa.c
1002 * for both 32 and 64bit when CONFIG_NUMA_EMU is disabled but here when
1003 * enabled.
1004 *
1005 * NUMA emulation is planned to be made generic and the following and other
1006 * related code should be moved to numa.c.
1007 */
1008#ifdef CONFIG_NUMA_EMU
1009# ifndef CONFIG_DEBUG_PER_CPU_MAPS
1010void __cpuinit numa_add_cpu(int cpu)
1011{
1012 int physnid, nid;
1013
1014 nid = numa_cpu_node(cpu);
1015 if (nid == NUMA_NO_NODE)
1016 nid = early_cpu_to_node(cpu);
1017 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
1018
1019 physnid = emu_nid_to_phys[nid];
1020
1021 /*
1022 * Map the cpu to each emulated node that is allocated on the physical
1023 * node of the cpu's apic id.
1024 */
1025 for_each_online_node(nid)
1026 if (emu_nid_to_phys[nid] == physnid)
1027 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
1028}
1029
1030void __cpuinit numa_remove_cpu(int cpu)
1031{
1032 int i;
1033
1034 for_each_online_node(i)
1035 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
1036}
1037# else /* !CONFIG_DEBUG_PER_CPU_MAPS */
1038static void __cpuinit numa_set_cpumask(int cpu, int enable)
1039{
1040 struct cpumask *mask;
1041 int nid, physnid, i;
1042
1043 nid = early_cpu_to_node(cpu);
1044 if (nid == NUMA_NO_NODE) {
1045 /* early_cpu_to_node() already emits a warning and trace */
1046 return;
1047 }
1048
1049 physnid = emu_nid_to_phys[nid];
1050
1051 for_each_online_node(i) {
1052 if (emu_nid_to_phys[nid] != physnid)
1053 continue;
1054
1055 mask = debug_cpumask_set_cpu(cpu, enable);
1056 if (!mask)
1057 return;
1058
1059 if (enable)
1060 cpumask_set_cpu(cpu, mask);
1061 else
1062 cpumask_clear_cpu(cpu, mask);
1063 }
1064}
1065
1066void __cpuinit numa_add_cpu(int cpu)
1067{
1068 numa_set_cpumask(cpu, 1);
1069}
1070
1071void __cpuinit numa_remove_cpu(int cpu)
1072{
1073 numa_set_cpumask(cpu, 0);
1074}
1075# endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
1076#endif /* CONFIG_NUMA_EMU */
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
new file mode 100644
index 000000000000..23fa2d00253a
--- /dev/null
+++ b/arch/x86/mm/numa_emulation.c
@@ -0,0 +1,452 @@
1/*
2 * NUMA emulation
3 */
4#include <linux/kernel.h>
5#include <linux/errno.h>
6#include <linux/topology.h>
7#include <linux/memblock.h>
8#include <asm/dma.h>
9
10#include "numa_internal.h"
11
12static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
13static char *emu_cmdline __initdata;
14
15void __init numa_emu_cmdline(char *str)
16{
17 emu_cmdline = str;
18}
19
20static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
21{
22 int i;
23
24 for (i = 0; i < mi->nr_blks; i++)
25 if (mi->blk[i].nid == nid)
26 return i;
27 return -ENOENT;
28}
29
30/*
31 * Sets up nid to range from @start to @end. The return value is -errno if
32 * something went wrong, 0 otherwise.
33 */
34static int __init emu_setup_memblk(struct numa_meminfo *ei,
35 struct numa_meminfo *pi,
36 int nid, int phys_blk, u64 size)
37{
38 struct numa_memblk *eb = &ei->blk[ei->nr_blks];
39 struct numa_memblk *pb = &pi->blk[phys_blk];
40
41 if (ei->nr_blks >= NR_NODE_MEMBLKS) {
42 pr_err("NUMA: Too many emulated memblks, failing emulation\n");
43 return -EINVAL;
44 }
45
46 ei->nr_blks++;
47 eb->start = pb->start;
48 eb->end = pb->start + size;
49 eb->nid = nid;
50
51 if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
52 emu_nid_to_phys[nid] = pb->nid;
53
54 pb->start += size;
55 if (pb->start >= pb->end) {
56 WARN_ON_ONCE(pb->start > pb->end);
57 numa_remove_memblk_from(phys_blk, pi);
58 }
59
60 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
61 eb->start, eb->end, (eb->end - eb->start) >> 20);
62 return 0;
63}
64
65/*
66 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
67 * to max_addr. The return value is the number of nodes allocated.
68 */
69static int __init split_nodes_interleave(struct numa_meminfo *ei,
70 struct numa_meminfo *pi,
71 u64 addr, u64 max_addr, int nr_nodes)
72{
73 nodemask_t physnode_mask = NODE_MASK_NONE;
74 u64 size;
75 int big;
76 int nid = 0;
77 int i, ret;
78
79 if (nr_nodes <= 0)
80 return -1;
81 if (nr_nodes > MAX_NUMNODES) {
82 pr_info("numa=fake=%d too large, reducing to %d\n",
83 nr_nodes, MAX_NUMNODES);
84 nr_nodes = MAX_NUMNODES;
85 }
86
87 size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
88 /*
89 * Calculate the number of big nodes that can be allocated as a result
90 * of consolidating the remainder.
91 */
92 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
93 FAKE_NODE_MIN_SIZE;
94
95 size &= FAKE_NODE_MIN_HASH_MASK;
96 if (!size) {
97 pr_err("Not enough memory for each node. "
98 "NUMA emulation disabled.\n");
99 return -1;
100 }
101
102 for (i = 0; i < pi->nr_blks; i++)
103 node_set(pi->blk[i].nid, physnode_mask);
104
105 /*
106 * Continue to fill physical nodes with fake nodes until there is no
107 * memory left on any of them.
108 */
109 while (nodes_weight(physnode_mask)) {
110 for_each_node_mask(i, physnode_mask) {
111 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
112 u64 start, limit, end;
113 int phys_blk;
114
115 phys_blk = emu_find_memblk_by_nid(i, pi);
116 if (phys_blk < 0) {
117 node_clear(i, physnode_mask);
118 continue;
119 }
120 start = pi->blk[phys_blk].start;
121 limit = pi->blk[phys_blk].end;
122 end = start + size;
123
124 if (nid < big)
125 end += FAKE_NODE_MIN_SIZE;
126
127 /*
128 * Continue to add memory to this fake node if its
129 * non-reserved memory is less than the per-node size.
130 */
131 while (end - start -
132 memblock_x86_hole_size(start, end) < size) {
133 end += FAKE_NODE_MIN_SIZE;
134 if (end > limit) {
135 end = limit;
136 break;
137 }
138 }
139
140 /*
141 * If there won't be at least FAKE_NODE_MIN_SIZE of
142 * non-reserved memory in ZONE_DMA32 for the next node,
143 * this one must extend to the boundary.
144 */
145 if (end < dma32_end && dma32_end - end -
146 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
147 end = dma32_end;
148
149 /*
150 * If there won't be enough non-reserved memory for the
151 * next node, this one must extend to the end of the
152 * physical node.
153 */
154 if (limit - end -
155 memblock_x86_hole_size(end, limit) < size)
156 end = limit;
157
158 ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
159 phys_blk,
160 min(end, limit) - start);
161 if (ret < 0)
162 return ret;
163 }
164 }
165 return 0;
166}
167
168/*
169 * Returns the end address of a node so that there is at least `size' amount of
170 * non-reserved memory or `max_addr' is reached.
171 */
172static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
173{
174 u64 end = start + size;
175
176 while (end - start - memblock_x86_hole_size(start, end) < size) {
177 end += FAKE_NODE_MIN_SIZE;
178 if (end > max_addr) {
179 end = max_addr;
180 break;
181 }
182 }
183 return end;
184}
185
186/*
187 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
188 * `addr' to `max_addr'. The return value is the number of nodes allocated.
189 */
190static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
191 struct numa_meminfo *pi,
192 u64 addr, u64 max_addr, u64 size)
193{
194 nodemask_t physnode_mask = NODE_MASK_NONE;
195 u64 min_size;
196 int nid = 0;
197 int i, ret;
198
199 if (!size)
200 return -1;
201 /*
202 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
203 * increased accordingly if the requested size is too small. This
204 * creates a uniform distribution of node sizes across the entire
205 * machine (but not necessarily over physical nodes).
206 */
207 min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
208 MAX_NUMNODES;
209 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
210 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
211 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
212 FAKE_NODE_MIN_HASH_MASK;
213 if (size < min_size) {
214 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
215 size >> 20, min_size >> 20);
216 size = min_size;
217 }
218 size &= FAKE_NODE_MIN_HASH_MASK;
219
220 for (i = 0; i < pi->nr_blks; i++)
221 node_set(pi->blk[i].nid, physnode_mask);
222
223 /*
224 * Fill physical nodes with fake nodes of size until there is no memory
225 * left on any of them.
226 */
227 while (nodes_weight(physnode_mask)) {
228 for_each_node_mask(i, physnode_mask) {
229 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
230 u64 start, limit, end;
231 int phys_blk;
232
233 phys_blk = emu_find_memblk_by_nid(i, pi);
234 if (phys_blk < 0) {
235 node_clear(i, physnode_mask);
236 continue;
237 }
238 start = pi->blk[phys_blk].start;
239 limit = pi->blk[phys_blk].end;
240
241 end = find_end_of_node(start, limit, size);
242 /*
243 * If there won't be at least FAKE_NODE_MIN_SIZE of
244 * non-reserved memory in ZONE_DMA32 for the next node,
245 * this one must extend to the boundary.
246 */
247 if (end < dma32_end && dma32_end - end -
248 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
249 end = dma32_end;
250
251 /*
252 * If there won't be enough non-reserved memory for the
253 * next node, this one must extend to the end of the
254 * physical node.
255 */
256 if (limit - end -
257 memblock_x86_hole_size(end, limit) < size)
258 end = limit;
259
260 ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
261 phys_blk,
262 min(end, limit) - start);
263 if (ret < 0)
264 return ret;
265 }
266 }
267 return 0;
268}
269
270/*
271 * Sets up the system RAM area from start_pfn to last_pfn according to the
272 * numa=fake command-line option.
273 */
274void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
275{
276 static struct numa_meminfo ei __initdata;
277 static struct numa_meminfo pi __initdata;
278 const u64 max_addr = max_pfn << PAGE_SHIFT;
279 u8 *phys_dist = NULL;
280 int i, j, ret;
281
282 if (!emu_cmdline)
283 goto no_emu;
284
285 memset(&ei, 0, sizeof(ei));
286 pi = *numa_meminfo;
287
288 for (i = 0; i < MAX_NUMNODES; i++)
289 emu_nid_to_phys[i] = NUMA_NO_NODE;
290
291 /*
292 * If the numa=fake command-line contains a 'M' or 'G', it represents
293 * the fixed node size. Otherwise, if it is just a single number N,
294 * split the system RAM into N fake nodes.
295 */
296 if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
297 u64 size;
298
299 size = memparse(emu_cmdline, &emu_cmdline);
300 ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
301 } else {
302 unsigned long n;
303
304 n = simple_strtoul(emu_cmdline, NULL, 0);
305 ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
306 }
307
308 if (ret < 0)
309 goto no_emu;
310
311 if (numa_cleanup_meminfo(&ei) < 0) {
312 pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
313 goto no_emu;
314 }
315
316 /*
317 * Copy the original distance table. It's temporary so no need to
318 * reserve it.
319 */
320 if (numa_dist_cnt) {
321 size_t size = numa_dist_cnt * sizeof(phys_dist[0]);
322 u64 phys;
323
324 phys = memblock_find_in_range(0,
325 (u64)max_pfn_mapped << PAGE_SHIFT,
326 size, PAGE_SIZE);
327 if (phys == MEMBLOCK_ERROR) {
328 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
329 goto no_emu;
330 }
331 phys_dist = __va(phys);
332
333 for (i = 0; i < numa_dist_cnt; i++)
334 for (j = 0; j < numa_dist_cnt; j++)
335 phys_dist[i * numa_dist_cnt + j] =
336 node_distance(i, j);
337 }
338
339 /* commit */
340 *numa_meminfo = ei;
341
342 /*
343 * Transform __apicid_to_node table to use emulated nids by
344 * reverse-mapping phys_nid. The maps should always exist but fall
345 * back to zero just in case.
346 */
347 for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
348 if (__apicid_to_node[i] == NUMA_NO_NODE)
349 continue;
350 for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
351 if (__apicid_to_node[i] == emu_nid_to_phys[j])
352 break;
353 __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
354 }
355
356 /* make sure all emulated nodes are mapped to a physical node */
357 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
358 if (emu_nid_to_phys[i] == NUMA_NO_NODE)
359 emu_nid_to_phys[i] = 0;
360
361 /* transform distance table */
362 numa_reset_distance();
363 for (i = 0; i < MAX_NUMNODES; i++) {
364 for (j = 0; j < MAX_NUMNODES; j++) {
365 int physi = emu_nid_to_phys[i];
366 int physj = emu_nid_to_phys[j];
367 int dist;
368
369 if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
370 dist = physi == physj ?
371 LOCAL_DISTANCE : REMOTE_DISTANCE;
372 else
373 dist = phys_dist[physi * numa_dist_cnt + physj];
374
375 numa_set_distance(i, j, dist);
376 }
377 }
378 return;
379
380no_emu:
381 /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
382 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
383 emu_nid_to_phys[i] = i;
384}
385
386#ifndef CONFIG_DEBUG_PER_CPU_MAPS
387void __cpuinit numa_add_cpu(int cpu)
388{
389 int physnid, nid;
390
391 nid = numa_cpu_node(cpu);
392 if (nid == NUMA_NO_NODE)
393 nid = early_cpu_to_node(cpu);
394 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
395
396 physnid = emu_nid_to_phys[nid];
397
398 /*
399 * Map the cpu to each emulated node that is allocated on the physical
400 * node of the cpu's apic id.
401 */
402 for_each_online_node(nid)
403 if (emu_nid_to_phys[nid] == physnid)
404 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
405}
406
407void __cpuinit numa_remove_cpu(int cpu)
408{
409 int i;
410
411 for_each_online_node(i)
412 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
413}
414#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
415static void __cpuinit numa_set_cpumask(int cpu, int enable)
416{
417 struct cpumask *mask;
418 int nid, physnid, i;
419
420 nid = early_cpu_to_node(cpu);
421 if (nid == NUMA_NO_NODE) {
422 /* early_cpu_to_node() already emits a warning and trace */
423 return;
424 }
425
426 physnid = emu_nid_to_phys[nid];
427
428 for_each_online_node(i) {
429 if (emu_nid_to_phys[nid] != physnid)
430 continue;
431
432 mask = debug_cpumask_set_cpu(cpu, enable);
433 if (!mask)
434 return;
435
436 if (enable)
437 cpumask_set_cpu(cpu, mask);
438 else
439 cpumask_clear_cpu(cpu, mask);
440 }
441}
442
443void __cpuinit numa_add_cpu(int cpu)
444{
445 numa_set_cpumask(cpu, 1);
446}
447
448void __cpuinit numa_remove_cpu(int cpu)
449{
450 numa_set_cpumask(cpu, 0);
451}
452#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
new file mode 100644
index 000000000000..ef2d97377d7c
--- /dev/null
+++ b/arch/x86/mm/numa_internal.h
@@ -0,0 +1,31 @@
1#ifndef __X86_MM_NUMA_INTERNAL_H
2#define __X86_MM_NUMA_INTERNAL_H
3
4#include <linux/types.h>
5#include <asm/numa.h>
6
7struct numa_memblk {
8 u64 start;
9 u64 end;
10 int nid;
11};
12
13struct numa_meminfo {
14 int nr_blks;
15 struct numa_memblk blk[NR_NODE_MEMBLKS];
16};
17
18void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
19int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
20void __init numa_reset_distance(void);
21
22#ifdef CONFIG_NUMA_EMU
23void __init numa_emulation(struct numa_meminfo *numa_meminfo,
24 int numa_dist_cnt);
25#else
26static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
27 int numa_dist_cnt)
28{ }
29#endif
30
31#endif /* __X86_MM_NUMA_INTERNAL_H */