aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/mm
diff options
context:
space:
mode:
author Yinghai Lu <yinghai@kernel.org> 2013-03-01 17:51:27 -0500
committer Linus Torvalds <torvalds@linux-foundation.org> 2013-03-02 12:34:39 -0500
commit 20e6926dcbafa1b361f1c29d967688be14b6ca4b (patch)
tree c5ea7011124c5c1a476c43484a6072702c178edc /arch/x86/mm
parent 14cc0b55b70e297a4b5411733d58c6cdc2d7f1be (diff)
x86, ACPI, mm: Revert movablemem_map support
Tim found:

  WARNING: at arch/x86/kernel/smpboot.c:324 topology_sane.isra.2+0x6f/0x80()
  Hardware name: S2600CP
  sched: CPU #1's llc-sibling CPU #0 is not on the same node! [node: 1 != 0]. Ignoring dependency.
  smpboot: Booting Node 1, Processors #1
  Modules linked in:
  Pid: 0, comm: swapper/1 Not tainted 3.9.0-0-generic #1
  Call Trace:
    set_cpu_sibling_map+0x279/0x449
    start_secondary+0x11d/0x1e5

Don Morris reproduced on a HP z620 workstation, and bisected it to
commit e8d195525809 ("acpi, memory-hotplug: parse SRAT before memblock
is ready")

It turns out movable_map has some problems, and it breaks several things:

1. numa_init is called several times, NOT just for srat. So those
       nodes_clear(numa_nodes_parsed)
       memset(&numa_meminfo, 0, sizeof(numa_meminfo))
   can not be just removed. Need to consider sequence is: numaq, srat,
   amd, dummy. and make fall back path working.

2. simply split acpi_numa_init to early_parse_srat.
   a. that early_parse_srat is NOT called for ia64, so you break ia64.
   b. for (i = 0; i < MAX_LOCAL_APIC; i++)
          set_apicid_to_node(i, NUMA_NO_NODE)
      still left in numa_init. So it will just clear result from
      early_parse_srat. it should be moved before that....
   c. it breaks ACPI_TABLE_OVERIDE...as the acpi table scan is moved
      early before override from INITRD is settled.

3. that patch TITLE is totally misleading, there is NO x86 in the title,
   but it changes critical x86 code. As a result the x86 guys did not
   pay attention and did not find the problem early. Those patches
   really should be routed via tip/x86/mm.

4. after that commit, following range can not use movable ram:
   a. real_mode code.... well..funny, legacy Node0 [0,1M) could be
      hot-removed?
   b. initrd... it will be freed after booting, so it could be on
      movable...
   c. crashkernel for kdump...: looks like we can not put kdump kernel
      above 4G anymore.
   d. init_mem_mapping: can not put page table high anymore.
   e. initmem_init: vmemmap can not be high local node anymore. That is
      not good.

If node is hotpluggable, the mem related range like page table and
vmemmap could be on that node without problem and should be on that
node.

We have workaround patch that could fix some problems, but some can not
be fixed.

So just remove that offending commit and related ones including:

  f7210e6c4ac7 ("mm/memblock.c: use CONFIG_HAVE_MEMBLOCK_NODE_MAP to
     protect movablecore_map in memblock_overlaps_region().")

  01a178a94e8e ("acpi, memory-hotplug: support getting hotplug info from
     SRAT")

  27168d38fa20 ("acpi, memory-hotplug: extend movablemem_map ranges to
     the end of node")

  e8d195525809 ("acpi, memory-hotplug: parse SRAT before memblock is
     ready")

  fb06bc8e5f42 ("page_alloc: bootmem limit with movablecore_map")

  42f47e27e761 ("page_alloc: make movablemem_map have higher priority")

  6981ec31146c ("page_alloc: introduce zone_movable_limit[] to keep
     movable limit for nodes")

  34b71f1e04fc ("page_alloc: add movable_memmap kernel parameter")

  4d59a75125d5 ("x86: get pg_data_t's memory from other node")

Later we should have patches that will make sure kernel put page table
and vmemmap on local node ram instead of push them down to node0. Also
need to find way to put other kernel used ram to local node ram.

Reported-by: Tim Gardner <tim.gardner@canonical.com>
Reported-by: Don Morris <don.morris@hp.com>
Bisected-by: Don Morris <don.morris@hp.com>
Tested-by: Don Morris <don.morris@hp.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Thomas Renninger <trenn@suse.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- arch/x86/mm/numa.c 11
-rw-r--r-- arch/x86/mm/srat.c 125
2 files changed, 8 insertions, 128 deletions
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index ff3633c794c6..72fe01e9e414 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -212,9 +212,10 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
212 * Allocate node data. Try node-local memory and then any node. 212 * Allocate node data. Try node-local memory and then any node.
213 * Never allocate in DMA zone. 213 * Never allocate in DMA zone.
214 */ 214 */
215 nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); 215 nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
216 if (!nd_pa) { 216 if (!nd_pa) {
217 pr_err("Cannot find %zu bytes in any node\n", nd_size); 217 pr_err("Cannot find %zu bytes in node %d\n",
218 nd_size, nid);
218 return; 219 return;
219 } 220 }
220 nd = __va(nd_pa); 221 nd = __va(nd_pa);
@@ -559,12 +560,10 @@ static int __init numa_init(int (*init_func)(void))
559 for (i = 0; i < MAX_LOCAL_APIC; i++) 560 for (i = 0; i < MAX_LOCAL_APIC; i++)
560 set_apicid_to_node(i, NUMA_NO_NODE); 561 set_apicid_to_node(i, NUMA_NO_NODE);
561 562
562 /* 563 nodes_clear(numa_nodes_parsed);
563 * Do not clear numa_nodes_parsed or zero numa_meminfo here, because
564 * SRAT was parsed earlier in early_parse_srat().
565 */
566 nodes_clear(node_possible_map); 564 nodes_clear(node_possible_map);
567 nodes_clear(node_online_map); 565 nodes_clear(node_online_map);
566 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
568 WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); 567 WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES));
569 numa_reset_distance(); 568 numa_reset_distance();
570 569
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 79836d01f789..cdd0da9dd530 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -141,126 +141,11 @@ static inline int save_add_info(void) {return 1;}
141static inline int save_add_info(void) {return 0;} 141static inline int save_add_info(void) {return 0;}
142#endif 142#endif
143 143
144#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
145static void __init
146handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable)
147{
148 int overlap, i;
149 unsigned long start_pfn, end_pfn;
150
151 start_pfn = PFN_DOWN(start);
152 end_pfn = PFN_UP(end);
153
154 /*
155 * For movablemem_map=acpi:
156 *
157 * SRAT: |_____| |_____| |_________| |_________| ......
158 * node id: 0 1 1 2
159 * hotpluggable: n y y n
160 * movablemem_map: |_____| |_________|
161 *
162 * Using movablemem_map, we can prevent memblock from allocating memory
163 * on ZONE_MOVABLE at boot time.
164 *
165 * Before parsing SRAT, memblock has already reserve some memory ranges
166 * for other purposes, such as for kernel image. We cannot prevent
167 * kernel from using these memory, so we need to exclude these memory
168 * even if it is hotpluggable.
169 * Furthermore, to ensure the kernel has enough memory to boot, we make
170 * all the memory on the node which the kernel resides in
171 * un-hotpluggable.
172 */
173 if (hotpluggable && movablemem_map.acpi) {
174 /* Exclude ranges reserved by memblock. */
175 struct memblock_type *rgn = &memblock.reserved;
176
177 for (i = 0; i < rgn->cnt; i++) {
178 if (end <= rgn->regions[i].base ||
179 start >= rgn->regions[i].base +
180 rgn->regions[i].size)
181 continue;
182
183 /*
184 * If the memory range overlaps the memory reserved by
185 * memblock, then the kernel resides in this node.
186 */
187 node_set(node, movablemem_map.numa_nodes_kernel);
188
189 goto out;
190 }
191
192 /*
193 * If the kernel resides in this node, then the whole node
194 * should not be hotpluggable.
195 */
196 if (node_isset(node, movablemem_map.numa_nodes_kernel))
197 goto out;
198
199 insert_movablemem_map(start_pfn, end_pfn);
200
201 /*
202 * numa_nodes_hotplug nodemask represents which nodes are put
203 * into movablemem_map.map[].
204 */
205 node_set(node, movablemem_map.numa_nodes_hotplug);
206 goto out;
207 }
208
209 /*
210 * For movablemem_map=nn[KMG]@ss[KMG]:
211 *
212 * SRAT: |_____| |_____| |_________| |_________| ......
213 * node id: 0 1 1 2
214 * user specified: |__| |___|
215 * movablemem_map: |___| |_________| |______| ......
216 *
217 * Using movablemem_map, we can prevent memblock from allocating memory
218 * on ZONE_MOVABLE at boot time.
219 *
220 * NOTE: In this case, SRAT info will be ingored.
221 */
222 overlap = movablemem_map_overlap(start_pfn, end_pfn);
223 if (overlap >= 0) {
224 /*
225 * If part of this range is in movablemem_map, we need to
226 * add the range after it to extend the range to the end
227 * of the node, because from the min address specified to
228 * the end of the node will be ZONE_MOVABLE.
229 */
230 start_pfn = max(start_pfn,
231 movablemem_map.map[overlap].start_pfn);
232 insert_movablemem_map(start_pfn, end_pfn);
233
234 /*
235 * Set the nodemask, so that if the address range on one node
236 * is not continuse, we can add the subsequent ranges on the
237 * same node into movablemem_map.
238 */
239 node_set(node, movablemem_map.numa_nodes_hotplug);
240 } else {
241 if (node_isset(node, movablemem_map.numa_nodes_hotplug))
242 /*
243 * Insert the range if we already have movable ranges
244 * on the same node.
245 */
246 insert_movablemem_map(start_pfn, end_pfn);
247 }
248out:
249 return;
250}
251#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
252static inline void
253handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable)
254{
255}
256#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
257
258/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ 144/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
259int __init 145int __init
260acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) 146acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
261{ 147{
262 u64 start, end; 148 u64 start, end;
263 u32 hotpluggable;
264 int node, pxm; 149 int node, pxm;
265 150
266 if (srat_disabled()) 151 if (srat_disabled())
@@ -269,8 +154,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
269 goto out_err_bad_srat; 154 goto out_err_bad_srat;
270 if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) 155 if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
271 goto out_err; 156 goto out_err;
272 hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE; 157 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
273 if (hotpluggable && !save_add_info())
274 goto out_err; 158 goto out_err;
275 159
276 start = ma->base_address; 160 start = ma->base_address;
@@ -290,12 +174,9 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
290 174
291 node_set(node, numa_nodes_parsed); 175 node_set(node, numa_nodes_parsed);
292 176
293 printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n", 177 printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
294 node, pxm, 178 node, pxm,
295 (unsigned long long) start, (unsigned long long) end - 1, 179 (unsigned long long) start, (unsigned long long) end - 1);
296 hotpluggable ? "Hot Pluggable": "");
297
298 handle_movablemem(node, start, end, hotpluggable);
299 180
300 return 0; 181 return 0;
301out_err_bad_srat: 182out_err_bad_srat: