aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorTang Chen <tangchen@cn.fujitsu.com>2013-11-12 18:08:05 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2013-11-12 22:09:08 -0500
commitb959ed6c73845aebf51afb8f76bb74b9388344d2 (patch)
tree18f8eea116ab3272778a896560a13a3150e572d3 /arch
parent0167d7d8b0beb4cf12076b47e4dc73897ae5acb0 (diff)
x86/mem-hotplug: support initialize page tables in bottom-up
The Linux kernel cannot migrate pages used by the kernel. As a result, kernel pages cannot be hot-removed, so we cannot allocate hotpluggable memory for the kernel. In a memory hotplug system, any NUMA node the kernel resides in should be unhotpluggable. On a modern server, each node could have at least 16GB of memory, so memory around the kernel image is highly likely to be unhotpluggable. ACPI SRAT (System Resource Affinity Table) contains the memory hotplug info. But before SRAT is parsed, memblock has already started to allocate memory for the kernel, so we need to prevent memblock from handing out hotpluggable memory during that window. Setting up the direct memory mapping page tables is one such case: init_mem_mapping() is called before SRAT is parsed. To prevent page tables from being allocated within hotpluggable memory, we use the bottom-up direction to allocate page tables from the end of the kernel image toward higher memory. Note: As for allocating page tables in lower memory, TJ said: : This is an optional behavior which is triggered by a very specific kernel : boot param, which I suspect is gonna need to stick around to support : memory hotplug in the current setup unless we add another layer of address : translation to support memory hotplug. As for the concern that page tables may occupy too much lower memory when using 4K mappings (CONFIG_DEBUG_PAGEALLOC and CONFIG_KMEMCHECK both disable using >4k pages), TJ said: : But as I said in the same paragraph, parsing SRAT earlier doesn't solve : the problem in itself either. Ignoring the option if 4k mapping is : required and memory consumption would be prohibitive should work, no? : Something like that would be necessary if we're gonna worry about cases : like this no matter how we implement it, but, frankly, I'm not sure this : is something worth worrying about. 
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com> Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com> Acked-by: Tejun Heo <tj@kernel.org> Acked-by: Toshi Kani <toshi.kani@hp.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Ingo Molnar <mingo@elte.hu> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com> Cc: Thomas Renninger <trenn@suse.de> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: Wen Congyang <wency@cn.fujitsu.com> Cc: Lai Jiangshan <laijs@cn.fujitsu.com> Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: Taku Izumi <izumi.taku@jp.fujitsu.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Michal Nazarewicz <mina86@mina86.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Rik van Riel <riel@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/mm/init.c66
1 files changed, 64 insertions, 2 deletions
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 742d6d4ad9eb..91b522072a4d 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -474,6 +474,51 @@ static void __init memory_map_top_down(unsigned long map_start,
474 init_range_memory_mapping(real_end, map_end); 474 init_range_memory_mapping(real_end, map_end);
475} 475}
476 476
477/**
478 * memory_map_bottom_up - Map [map_start, map_end) bottom up
479 * @map_start: start address of the target memory range (inclusive)
480 * @map_end: end address of the target memory range (exclusive)
481 *
482 * This function sets up the direct mapping for the memory range
483 * [map_start, map_end) from low addresses to high ones. Since
484 * bottom-up allocation is limited to memory above the kernel, the
485 * page tables are allocated just above the kernel while the range
486 * is mapped step by step, in chunks of at most step_size.
487 */
488static void __init memory_map_bottom_up(unsigned long map_start,
489 unsigned long map_end)
490{
491 unsigned long next, new_mapped_ram_size, start;
492 unsigned long mapped_ram_size = 0;
493 /* step_size needs to be small so that pgt_buf from BRK can cover it */
494 unsigned long step_size = PMD_SIZE;
495
496 start = map_start;
497 min_pfn_mapped = start >> PAGE_SHIFT;
498
499 /*
500 * We start from the bottom (@map_start) and go to the top (@map_end).
501 * The memblock_find_in_range() gets us a block of RAM in
502 * [min_pfn_mapped, max_pfn_mapped) used as new pages for page table;
503 * in bottom-up mode it lies just above the kernel, not at the end of RAM.
504 */
505 while (start < map_end) {
506 if (map_end - start > step_size) {
507 next = round_up(start + 1, step_size);
508 if (next > map_end)
509 next = map_end;
510 } else
511 next = map_end;
512
513 new_mapped_ram_size = init_range_memory_mapping(start, next);
514 start = next;
515
516 if (new_mapped_ram_size > mapped_ram_size) /* this step mapped more than all earlier steps combined */
517 step_size = get_new_step_size(step_size);
518 mapped_ram_size += new_mapped_ram_size;
519 }
520}
521
477void __init init_mem_mapping(void) 522void __init init_mem_mapping(void)
478{ 523{
479 unsigned long end; 524 unsigned long end;
@@ -489,8 +534,25 @@ void __init init_mem_mapping(void)
489 /* the ISA range is always mapped regardless of memory holes */ 534 /* the ISA range is always mapped regardless of memory holes */
490 init_memory_mapping(0, ISA_END_ADDRESS); 535 init_memory_mapping(0, ISA_END_ADDRESS);
491 536
492 /* setup direct mapping for range [ISA_END_ADDRESS, end) in top-down*/ 537 /*
493 memory_map_top_down(ISA_END_ADDRESS, end); 538 * If the allocation is in bottom-up direction, we setup direct mapping
539 * in bottom-up, otherwise we setup direct mapping in top-down.
540 */
541 if (memblock_bottom_up()) {
542 unsigned long kernel_end = __pa_symbol(_end);
543
544 /*
545 * we need two separate calls here. This is because we want to
546 * allocate page tables above the kernel. So we first map
547 * [kernel_end, end) to make memory above the kernel be mapped
548 * as soon as possible. And then use page tables allocated above
549 * the kernel to map [ISA_END_ADDRESS, kernel_end).
550 */
551 memory_map_bottom_up(kernel_end, end);
552 memory_map_bottom_up(ISA_END_ADDRESS, kernel_end);
553 } else {
554 memory_map_top_down(ISA_END_ADDRESS, end);
555 }
494 556
495#ifdef CONFIG_X86_64 557#ifdef CONFIG_X86_64
496 if (max_pfn > max_low_pfn) { 558 if (max_pfn > max_low_pfn) {