aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJon Tollefson <kniht@linux.vnet.ibm.com>2008-10-09 06:18:40 -0400
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>2008-10-10 00:55:19 -0400
commit8f64e1f2d1e09267ac926e15090fd505c1c0cbcb (patch)
tree6b9a5e74343ab1a33c0a28ca84031157a30a96cd
parent91a00302959545a9ae423e99732b1e46eb19e877 (diff)
powerpc: Reserve in bootmem lmb reserved regions that cross NUMA nodes
If there are multiple memory blocks reserved via lmb_reserve() that are at contiguous addresses but on different NUMA nodes, we lose track of which address ranges to reserve in bootmem on which node. I discovered this when I recently got to try 16GB huge pages on a system with more than 2 nodes.

When scanning the device tree in early boot we call lmb_reserve() with the addresses of the 16G pages that we find, so that the memory doesn't get used for something else. For example, the addresses for the pages could be 4000000000, 4400000000, 4800000000, 4C00000000, etc. - 8 pages, one on each of eight nodes. In the lmb, after all the pages have been reserved, it will look something like the following:

lmb_dump_all:
    memory.cnt = 0x2
    memory.size = 0x3e80000000
    memory.region[0x0].base = 0x0
                      .size = 0x1e80000000
    memory.region[0x1].base = 0x4000000000
                      .size = 0x2000000000
    reserved.cnt = 0x5
    reserved.size = 0x3e80000000
    reserved.region[0x0].base = 0x0
                        .size = 0x7b5000
    reserved.region[0x1].base = 0x2a00000
                        .size = 0x78c000
    reserved.region[0x2].base = 0x328c000
                        .size = 0x43000
    reserved.region[0x3].base = 0xf4e8000
                        .size = 0xb18000
    reserved.region[0x4].base = 0x4000000000
                        .size = 0x2000000000

The reserved.region[0x4] contains the 16G pages. In arch/powerpc/mm/numa.c: do_init_bootmem() we loop through each of the node numbers looking for the reserved regions that belong to the particular node. It is not able to identify region 0x4 as being a part of each of the 8 nodes. It is assuming that a reserved region is only on a single node.

This patch takes the reserved region loop out of the loop that goes over each node. It looks up the active region containing the start of the reserved region. If the reserved region extends past that active region, then it adjusts the size and gets the next active region containing it.

Signed-off-by: Jon Tollefson <kniht@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
-rw-r--r--arch/powerpc/mm/numa.c108
1 files changed, 80 insertions, 28 deletions
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index be05457631d4..6cf5c71c431f 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -89,6 +89,46 @@ static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
89 return 0; 89 return 0;
90} 90}
91 91
92/*
93 * get_active_region_work_fn - A helper function for get_node_active_region
94 * Returns datax set to the start_pfn and end_pfn if they contain
95 * the initial value of datax->start_pfn between them
96 * @start_pfn: start page(inclusive) of region to check
97 * @end_pfn: end page(exclusive) of region to check
98 * @datax: comes in with ->start_pfn set to value to search for and
99 * goes out with active range if it contains it
100 * Returns 1 if search value is in range else 0
101 */
102static int __init get_active_region_work_fn(unsigned long start_pfn,
103 unsigned long end_pfn, void *datax)
104{
105 struct node_active_region *data;
106 data = (struct node_active_region *)datax;
107
108 if (start_pfn <= data->start_pfn && end_pfn > data->start_pfn) {
109 data->start_pfn = start_pfn;
110 data->end_pfn = end_pfn;
111 return 1;
112 }
113 return 0;
114
115}
116
117/*
118 * get_node_active_region - Return active region containing start_pfn
119 * @start_pfn: The page to return the region for.
120 * @node_ar: Returned set to the active region containing start_pfn
121 */
122static void __init get_node_active_region(unsigned long start_pfn,
123 struct node_active_region *node_ar)
124{
125 int nid = early_pfn_to_nid(start_pfn);
126
127 node_ar->nid = nid;
128 node_ar->start_pfn = start_pfn;
129 work_with_active_regions(nid, get_active_region_work_fn, node_ar);
130}
131
92static void __cpuinit map_cpu_to_node(int cpu, int node) 132static void __cpuinit map_cpu_to_node(int cpu, int node)
93{ 133{
94 numa_cpu_lookup_table[cpu] = node; 134 numa_cpu_lookup_table[cpu] = node;
@@ -882,38 +922,50 @@ void __init do_init_bootmem(void)
882 start_pfn, end_pfn); 922 start_pfn, end_pfn);
883 923
884 free_bootmem_with_active_regions(nid, end_pfn); 924 free_bootmem_with_active_regions(nid, end_pfn);
925 }
885 926
886 /* Mark reserved regions on this node */ 927 /* Mark reserved regions */
887 for (i = 0; i < lmb.reserved.cnt; i++) { 928 for (i = 0; i < lmb.reserved.cnt; i++) {
888 unsigned long physbase = lmb.reserved.region[i].base; 929 unsigned long physbase = lmb.reserved.region[i].base;
889 unsigned long size = lmb.reserved.region[i].size; 930 unsigned long size = lmb.reserved.region[i].size;
890 unsigned long start_paddr = start_pfn << PAGE_SHIFT; 931 unsigned long start_pfn = physbase >> PAGE_SHIFT;
891 unsigned long end_paddr = end_pfn << PAGE_SHIFT; 932 unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT);
892 933 struct node_active_region node_ar;
893 if (early_pfn_to_nid(physbase >> PAGE_SHIFT) != nid && 934
894 early_pfn_to_nid((physbase+size-1) >> PAGE_SHIFT) != nid) 935 get_node_active_region(start_pfn, &node_ar);
895 continue; 936 while (start_pfn < end_pfn) {
896 937 /*
897 if (physbase < end_paddr && 938 * if reserved region extends past active region
898 (physbase+size) > start_paddr) { 939 * then trim size to active region
899 /* overlaps */ 940 */
900 if (physbase < start_paddr) { 941 if (end_pfn > node_ar.end_pfn)
901 size -= start_paddr - physbase; 942 size = (node_ar.end_pfn << PAGE_SHIFT)
902 physbase = start_paddr; 943 - (start_pfn << PAGE_SHIFT);
903 } 944 dbg("reserve_bootmem %lx %lx nid=%d\n", physbase, size,
904 945 node_ar.nid);
905 if (size > end_paddr - physbase) 946 reserve_bootmem_node(NODE_DATA(node_ar.nid), physbase,
906 size = end_paddr - physbase; 947 size, BOOTMEM_DEFAULT);
907 948 /*
908 dbg("reserve_bootmem %lx %lx\n", physbase, 949 * if reserved region is contained in the active region
909 size); 950 * then done.
910 reserve_bootmem_node(NODE_DATA(nid), physbase, 951 */
911 size, BOOTMEM_DEFAULT); 952 if (end_pfn <= node_ar.end_pfn)
912 } 953 break;
954
955 /*
956 * reserved region extends past the active region
957 * get next active region that contains this
958 * reserved region
959 */
960 start_pfn = node_ar.end_pfn;
961 physbase = start_pfn << PAGE_SHIFT;
962 get_node_active_region(start_pfn, &node_ar);
913 } 963 }
914 964
915 sparse_memory_present_with_active_regions(nid);
916 } 965 }
966
967 for_each_online_node(nid)
968 sparse_memory_present_with_active_regions(nid);
917} 969}
918 970
919void __init paging_init(void) 971void __init paging_init(void)