aboutsummaryrefslogtreecommitdiffstats
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
authorYinghai Lu <yinghai@kernel.org>2013-03-01 17:51:27 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2013-03-02 12:34:39 -0500
commit20e6926dcbafa1b361f1c29d967688be14b6ca4b (patch)
treec5ea7011124c5c1a476c43484a6072702c178edc /mm/page_alloc.c
parent14cc0b55b70e297a4b5411733d58c6cdc2d7f1be (diff)
x86, ACPI, mm: Revert movablemem_map support
Tim found:

  WARNING: at arch/x86/kernel/smpboot.c:324 topology_sane.isra.2+0x6f/0x80()
  Hardware name: S2600CP
  sched: CPU #1's llc-sibling CPU #0 is not on the same node! [node: 1 != 0]. Ignoring dependency.
  smpboot: Booting Node 1, Processors #1
  Modules linked in:
  Pid: 0, comm: swapper/1 Not tainted 3.9.0-0-generic #1
  Call Trace:
    set_cpu_sibling_map+0x279/0x449
    start_secondary+0x11d/0x1e5

Don Morris reproduced it on an HP z620 workstation, and bisected it to commit e8d195525809 ("acpi, memory-hotplug: parse SRAT before memblock is ready").

It turns out movable_map has some problems, and it breaks several things:

1. numa_init is called several times, NOT just for srat, so those
     nodes_clear(numa_nodes_parsed)
     memset(&numa_meminfo, 0, sizeof(numa_meminfo))
   cannot simply be removed. We need to consider that the sequence is: numaq, srat, amd, dummy, and keep the fallback path working.

2. It simply splits acpi_numa_init into early_parse_srat.
   a. early_parse_srat is NOT called for ia64, so it breaks ia64.
   b. for (i = 0; i < MAX_LOCAL_APIC; i++)
          set_apicid_to_node(i, NUMA_NO_NODE)
      is still left in numa_init, so it will just clear the result from early_parse_srat. It should be moved before that....
   c. It breaks ACPI_TABLE_OVERRIDE... as the ACPI table scan is moved early, before the override from INITRD is settled.

3. That patch TITLE is totally misleading: there is NO x86 in the title, but it changes critical x86 code. As a result the x86 maintainers did not pay attention and did not find the problem early. Those patches really should be routed via tip/x86/mm.

4. After that commit, the following ranges cannot use movable RAM:
   a. real_mode code.... well.. funny, legacy Node0 [0,1M) could be hot-removed?
   b. initrd... it will be freed after booting, so it could be on movable...
   c. crashkernel for kdump...: looks like we cannot put the kdump kernel above 4G anymore.
   d. init_mem_mapping: cannot put the page table high anymore.
   e. initmem_init: vmemmap cannot be high on the local node anymore. That is not good.
If a node is hotpluggable, memory-related ranges like the page table and vmemmap could be on that node without problem, and should be on that node.

We have a workaround patch that could fix some of the problems, but some cannot be fixed.

So just remove that offending commit and the related ones, including:

  f7210e6c4ac7 ("mm/memblock.c: use CONFIG_HAVE_MEMBLOCK_NODE_MAP to protect movablecore_map in memblock_overlaps_region().")
  01a178a94e8e ("acpi, memory-hotplug: support getting hotplug info from SRAT")
  27168d38fa20 ("acpi, memory-hotplug: extend movablemem_map ranges to the end of node")
  e8d195525809 ("acpi, memory-hotplug: parse SRAT before memblock is ready")
  fb06bc8e5f42 ("page_alloc: bootmem limit with movablecore_map")
  42f47e27e761 ("page_alloc: make movablemem_map have higher priority")
  6981ec31146c ("page_alloc: introduce zone_movable_limit[] to keep movable limit for nodes")
  34b71f1e04fc ("page_alloc: add movable_memmap kernel parameter")
  4d59a75125d5 ("x86: get pg_data_t's memory from other node")

Later we should have patches that make sure the kernel puts the page table and vmemmap on local node RAM instead of pushing them down to node0. We also need to find a way to put other kernel-used RAM on local node RAM.

Reported-by: Tim Gardner <tim.gardner@canonical.com>
Reported-by: Don Morris <don.morris@hp.com>
Bisected-by: Don Morris <don.morris@hp.com>
Tested-by: Don Morris <don.morris@hp.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Thomas Renninger <trenn@suse.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c285
1 files changed, 5 insertions, 280 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0dade3f18f7d..8fcced7823fa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -202,18 +202,11 @@ static unsigned long __meminitdata nr_all_pages;
202static unsigned long __meminitdata dma_reserve; 202static unsigned long __meminitdata dma_reserve;
203 203
204#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 204#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
205/* Movable memory ranges, will also be used by memblock subsystem. */
206struct movablemem_map movablemem_map = {
207 .acpi = false,
208 .nr_map = 0,
209};
210
211static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 205static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
212static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 206static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
213static unsigned long __initdata required_kernelcore; 207static unsigned long __initdata required_kernelcore;
214static unsigned long __initdata required_movablecore; 208static unsigned long __initdata required_movablecore;
215static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 209static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
216static unsigned long __meminitdata zone_movable_limit[MAX_NUMNODES];
217 210
218/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 211/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
219int movable_zone; 212int movable_zone;
@@ -4412,77 +4405,6 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
4412 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4405 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4413} 4406}
4414 4407
4415/**
4416 * sanitize_zone_movable_limit - Sanitize the zone_movable_limit array.
4417 *
4418 * zone_movable_limit is initialized as 0. This function will try to get
4419 * the first ZONE_MOVABLE pfn of each node from movablemem_map, and
4420 * assigne them to zone_movable_limit.
4421 * zone_movable_limit[nid] == 0 means no limit for the node.
4422 *
4423 * Note: Each range is represented as [start_pfn, end_pfn)
4424 */
4425static void __meminit sanitize_zone_movable_limit(void)
4426{
4427 int map_pos = 0, i, nid;
4428 unsigned long start_pfn, end_pfn;
4429
4430 if (!movablemem_map.nr_map)
4431 return;
4432
4433 /* Iterate all ranges from minimum to maximum */
4434 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
4435 /*
4436 * If we have found lowest pfn of ZONE_MOVABLE of the node
4437 * specified by user, just go on to check next range.
4438 */
4439 if (zone_movable_limit[nid])
4440 continue;
4441
4442#ifdef CONFIG_ZONE_DMA
4443 /* Skip DMA memory. */
4444 if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA])
4445 start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA];
4446#endif
4447
4448#ifdef CONFIG_ZONE_DMA32
4449 /* Skip DMA32 memory. */
4450 if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA32])
4451 start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA32];
4452#endif
4453
4454#ifdef CONFIG_HIGHMEM
4455 /* Skip lowmem if ZONE_MOVABLE is highmem. */
4456 if (zone_movable_is_highmem() &&
4457 start_pfn < arch_zone_lowest_possible_pfn[ZONE_HIGHMEM])
4458 start_pfn = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];
4459#endif
4460
4461 if (start_pfn >= end_pfn)
4462 continue;
4463
4464 while (map_pos < movablemem_map.nr_map) {
4465 if (end_pfn <= movablemem_map.map[map_pos].start_pfn)
4466 break;
4467
4468 if (start_pfn >= movablemem_map.map[map_pos].end_pfn) {
4469 map_pos++;
4470 continue;
4471 }
4472
4473 /*
4474 * The start_pfn of ZONE_MOVABLE is either the minimum
4475 * pfn specified by movablemem_map, or 0, which means
4476 * the node has no ZONE_MOVABLE.
4477 */
4478 zone_movable_limit[nid] = max(start_pfn,
4479 movablemem_map.map[map_pos].start_pfn);
4480
4481 break;
4482 }
4483 }
4484}
4485
4486#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4408#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4487static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4409static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4488 unsigned long zone_type, 4410 unsigned long zone_type,
@@ -4500,6 +4422,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4500 4422
4501 return zholes_size[zone_type]; 4423 return zholes_size[zone_type];
4502} 4424}
4425
4503#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4426#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4504 4427
4505static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4428static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
@@ -4941,19 +4864,12 @@ static void __init find_zone_movable_pfns_for_nodes(void)
4941 required_kernelcore = max(required_kernelcore, corepages); 4864 required_kernelcore = max(required_kernelcore, corepages);
4942 } 4865 }
4943 4866
4944 /* 4867 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
4945 * If neither kernelcore/movablecore nor movablemem_map is specified, 4868 if (!required_kernelcore)
4946 * there is no ZONE_MOVABLE. But if movablemem_map is specified, the
4947 * start pfn of ZONE_MOVABLE has been stored in zone_movable_limit[].
4948 */
4949 if (!required_kernelcore) {
4950 if (movablemem_map.nr_map)
4951 memcpy(zone_movable_pfn, zone_movable_limit,
4952 sizeof(zone_movable_pfn));
4953 goto out; 4869 goto out;
4954 }
4955 4870
4956 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 4871 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
4872 find_usable_zone_for_movable();
4957 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 4873 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
4958 4874
4959restart: 4875restart:
@@ -4981,24 +4897,10 @@ restart:
4981 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4897 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4982 unsigned long size_pages; 4898 unsigned long size_pages;
4983 4899
4984 /*
4985 * Find more memory for kernelcore in
4986 * [zone_movable_pfn[nid], zone_movable_limit[nid]).
4987 */
4988 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 4900 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
4989 if (start_pfn >= end_pfn) 4901 if (start_pfn >= end_pfn)
4990 continue; 4902 continue;
4991 4903
4992 if (zone_movable_limit[nid]) {
4993 end_pfn = min(end_pfn, zone_movable_limit[nid]);
4994 /* No range left for kernelcore in this node */
4995 if (start_pfn >= end_pfn) {
4996 zone_movable_pfn[nid] =
4997 zone_movable_limit[nid];
4998 break;
4999 }
5000 }
5001
5002 /* Account for what is only usable for kernelcore */ 4904 /* Account for what is only usable for kernelcore */
5003 if (start_pfn < usable_startpfn) { 4905 if (start_pfn < usable_startpfn) {
5004 unsigned long kernel_pages; 4906 unsigned long kernel_pages;
@@ -5058,12 +4960,12 @@ restart:
5058 if (usable_nodes && required_kernelcore > usable_nodes) 4960 if (usable_nodes && required_kernelcore > usable_nodes)
5059 goto restart; 4961 goto restart;
5060 4962
5061out:
5062 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 4963 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
5063 for (nid = 0; nid < MAX_NUMNODES; nid++) 4964 for (nid = 0; nid < MAX_NUMNODES; nid++)
5064 zone_movable_pfn[nid] = 4965 zone_movable_pfn[nid] =
5065 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 4966 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
5066 4967
4968out:
5067 /* restore the node_state */ 4969 /* restore the node_state */
5068 node_states[N_MEMORY] = saved_node_state; 4970 node_states[N_MEMORY] = saved_node_state;
5069} 4971}
@@ -5126,8 +5028,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5126 5028
5127 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 5029 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
5128 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 5030 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
5129 find_usable_zone_for_movable();
5130 sanitize_zone_movable_limit();
5131 find_zone_movable_pfns_for_nodes(); 5031 find_zone_movable_pfns_for_nodes();
5132 5032
5133 /* Print out the zone ranges */ 5033 /* Print out the zone ranges */
@@ -5211,181 +5111,6 @@ static int __init cmdline_parse_movablecore(char *p)
5211early_param("kernelcore", cmdline_parse_kernelcore); 5111early_param("kernelcore", cmdline_parse_kernelcore);
5212early_param("movablecore", cmdline_parse_movablecore); 5112early_param("movablecore", cmdline_parse_movablecore);
5213 5113
5214/**
5215 * movablemem_map_overlap() - Check if a range overlaps movablemem_map.map[].
5216 * @start_pfn: start pfn of the range to be checked
5217 * @end_pfn: end pfn of the range to be checked (exclusive)
5218 *
5219 * This function checks if a given memory range [start_pfn, end_pfn) overlaps
5220 * the movablemem_map.map[] array.
5221 *
5222 * Return: index of the first overlapped element in movablemem_map.map[]
5223 * or -1 if they don't overlap each other.
5224 */
5225int __init movablemem_map_overlap(unsigned long start_pfn,
5226 unsigned long end_pfn)
5227{
5228 int overlap;
5229
5230 if (!movablemem_map.nr_map)
5231 return -1;
5232
5233 for (overlap = 0; overlap < movablemem_map.nr_map; overlap++)
5234 if (start_pfn < movablemem_map.map[overlap].end_pfn)
5235 break;
5236
5237 if (overlap == movablemem_map.nr_map ||
5238 end_pfn <= movablemem_map.map[overlap].start_pfn)
5239 return -1;
5240
5241 return overlap;
5242}
5243
5244/**
5245 * insert_movablemem_map - Insert a memory range in to movablemem_map.map.
5246 * @start_pfn: start pfn of the range
5247 * @end_pfn: end pfn of the range
5248 *
5249 * This function will also merge the overlapped ranges, and sort the array
5250 * by start_pfn in monotonic increasing order.
5251 */
5252void __init insert_movablemem_map(unsigned long start_pfn,
5253 unsigned long end_pfn)
5254{
5255 int pos, overlap;
5256
5257 /*
5258 * pos will be at the 1st overlapped range, or the position
5259 * where the element should be inserted.
5260 */
5261 for (pos = 0; pos < movablemem_map.nr_map; pos++)
5262 if (start_pfn <= movablemem_map.map[pos].end_pfn)
5263 break;
5264
5265 /* If there is no overlapped range, just insert the element. */
5266 if (pos == movablemem_map.nr_map ||
5267 end_pfn < movablemem_map.map[pos].start_pfn) {
5268 /*
5269 * If pos is not the end of array, we need to move all
5270 * the rest elements backward.
5271 */
5272 if (pos < movablemem_map.nr_map)
5273 memmove(&movablemem_map.map[pos+1],
5274 &movablemem_map.map[pos],
5275 sizeof(struct movablemem_entry) *
5276 (movablemem_map.nr_map - pos));
5277 movablemem_map.map[pos].start_pfn = start_pfn;
5278 movablemem_map.map[pos].end_pfn = end_pfn;
5279 movablemem_map.nr_map++;
5280 return;
5281 }
5282
5283 /* overlap will be at the last overlapped range */
5284 for (overlap = pos + 1; overlap < movablemem_map.nr_map; overlap++)
5285 if (end_pfn < movablemem_map.map[overlap].start_pfn)
5286 break;
5287
5288 /*
5289 * If there are more ranges overlapped, we need to merge them,
5290 * and move the rest elements forward.
5291 */
5292 overlap--;
5293 movablemem_map.map[pos].start_pfn = min(start_pfn,
5294 movablemem_map.map[pos].start_pfn);
5295 movablemem_map.map[pos].end_pfn = max(end_pfn,
5296 movablemem_map.map[overlap].end_pfn);
5297
5298 if (pos != overlap && overlap + 1 != movablemem_map.nr_map)
5299 memmove(&movablemem_map.map[pos+1],
5300 &movablemem_map.map[overlap+1],
5301 sizeof(struct movablemem_entry) *
5302 (movablemem_map.nr_map - overlap - 1));
5303
5304 movablemem_map.nr_map -= overlap - pos;
5305}
5306
5307/**
5308 * movablemem_map_add_region - Add a memory range into movablemem_map.
5309 * @start: physical start address of range
5310 * @end: physical end address of range
5311 *
5312 * This function transform the physical address into pfn, and then add the
5313 * range into movablemem_map by calling insert_movablemem_map().
5314 */
5315static void __init movablemem_map_add_region(u64 start, u64 size)
5316{
5317 unsigned long start_pfn, end_pfn;
5318
5319 /* In case size == 0 or start + size overflows */
5320 if (start + size <= start)
5321 return;
5322
5323 if (movablemem_map.nr_map >= ARRAY_SIZE(movablemem_map.map)) {
5324 pr_err("movablemem_map: too many entries;"
5325 " ignoring [mem %#010llx-%#010llx]\n",
5326 (unsigned long long) start,
5327 (unsigned long long) (start + size - 1));
5328 return;
5329 }
5330
5331 start_pfn = PFN_DOWN(start);
5332 end_pfn = PFN_UP(start + size);
5333 insert_movablemem_map(start_pfn, end_pfn);
5334}
5335
5336/*
5337 * cmdline_parse_movablemem_map - Parse boot option movablemem_map.
5338 * @p: The boot option of the following format:
5339 * movablemem_map=nn[KMG]@ss[KMG]
5340 *
5341 * This option sets the memory range [ss, ss+nn) to be used as movable memory.
5342 *
5343 * Return: 0 on success or -EINVAL on failure.
5344 */
5345static int __init cmdline_parse_movablemem_map(char *p)
5346{
5347 char *oldp;
5348 u64 start_at, mem_size;
5349
5350 if (!p)
5351 goto err;
5352
5353 if (!strcmp(p, "acpi"))
5354 movablemem_map.acpi = true;
5355
5356 /*
5357 * If user decide to use info from BIOS, all the other user specified
5358 * ranges will be ingored.
5359 */
5360 if (movablemem_map.acpi) {
5361 if (movablemem_map.nr_map) {
5362 memset(movablemem_map.map, 0,
5363 sizeof(struct movablemem_entry)
5364 * movablemem_map.nr_map);
5365 movablemem_map.nr_map = 0;
5366 }
5367 return 0;
5368 }
5369
5370 oldp = p;
5371 mem_size = memparse(p, &p);
5372 if (p == oldp)
5373 goto err;
5374
5375 if (*p == '@') {
5376 oldp = ++p;
5377 start_at = memparse(p, &p);
5378 if (p == oldp || *p != '\0')
5379 goto err;
5380
5381 movablemem_map_add_region(start_at, mem_size);
5382 return 0;
5383 }
5384err:
5385 return -EINVAL;
5386}
5387early_param("movablemem_map", cmdline_parse_movablemem_map);
5388
5389#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5114#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5390 5115
5391/** 5116/**