mm/memblock.c: introduce bottom-up allocation mode

The Linux kernel cannot migrate pages used by the kernel. As a result, kernel pages cannot be hot-removed, so we cannot allocate hotpluggable memory for the kernel.

ACPI SRAT (System Resource Affinity Table) contains the memory hotplug info. But before SRAT is parsed, memblock has already started to allocate memory for the kernel. So we need to prevent memblock from doing this.

In a memory hotplug system, any NUMA node the kernel resides in should be unhotpluggable. And for a modern server, each node could have at least 16GB of memory, so memory around the kernel image is very likely to be unhotpluggable.

So the basic idea is: allocate memory starting from the end of the kernel image and going toward higher memory. Since not much memory is allocated before SRAT is parsed, it is very likely to be in the same node as the kernel image.

The current memblock can only allocate memory top-down. So this patch introduces a new bottom-up allocation mode. Later, when we use this allocation direction to allocate memory, we will limit the start address to be above the kernel.

Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Acked-by: Toshi Kani <toshi.kani@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Thomas Renninger <trenn@suse.de>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Tang Chen <tangchen@cn.fujitsu.com> 2013-11-12 18:07:59 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2013-11-12 22:09:08 -0500
commit: 79442ed189acb8b949662676e750eda173c06f9b (patch)
tree: e850c404e1cdd8571ba39e9b8a9436bed8fcc6c3 /mm/memblock.c
parent: 1402899e43fda490f08d2c47a7558931f8b9c60c (diff)
1 files changed, 80 insertions, 3 deletions
diff --git a/mm/memblock.c b/mm/memblock.c
index accff1087137..53e477bb5558 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -20,6 +20,8 @@
 #include <linux/seq_file.h>
 #include <linux/memblock.h>
+#include <asm-generic/sections.h>
 static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
 static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
@@ -32,6 +34,7 @@ struct memblock memblock __initdata_memblock = {
        .reserved.cnt           = 1,    /* empty dummy entry */
        .reserved.max           = INIT_MEMBLOCK_REGIONS,
+        .bottom_up              = false,
        .current_limit          = MEMBLOCK_ALLOC_ANYWHERE,
 };
@@ -82,6 +85,38 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
        return (i < type->cnt) ? i : -1;
 }
+/**
+ * __memblock_find_range_bottom_up - bottom-up free area search helper
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ *
+ * Helper for memblock_find_in_range_node(). Every free range reported by
+ * for_each_free_mem_range() is clamped to [@start, @end]; the first one that
+ * can hold @size bytes at an @align-aligned base is chosen.
+ *
+ * RETURNS:
+ * Found address on success, 0 on failure.
+ */
+static phys_addr_t __init_memblock
+__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
+                                phys_addr_t size, phys_addr_t align, int nid)
+{
+        phys_addr_t lo, hi, addr;
+        u64 idx;
+        for_each_free_mem_range(idx, nid, &lo, &hi, NULL) {
+                /* restrict the free range to the requested window */
+                lo = clamp(lo, start, end);
+                hi = clamp(hi, start, end);
+                addr = round_up(lo, align);
+                /* the aligned base fell outside the clamped range */
+                if (addr >= hi)
+                        continue;
+                if (hi - addr >= size)
+                        return addr;
+        }
+        /* no free range can hold @size bytes at the requested alignment */
+        return 0;
+}
 /**
 * __memblock_find_range_top_down - find free area utility, in top-down
 * @start: start of candidate range
@@ -93,7 +128,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
 * Utility called from memblock_find_in_range_node(), find free area top-down.
 *
 * RETURNS:
- * Found address on success, %0 on failure.
+ * Found address on success, 0 on failure.
 */
 static phys_addr_t __init_memblock
 __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
@@ -127,13 +162,24 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
 *
 * Find @size free area aligned to @align in the specified range and node.
 *
+ * When allocation direction is bottom-up, the @start should be greater
+ * than the end of the kernel image. Otherwise, it will be trimmed. The
+ * reason is that we want the bottom-up allocation just near the kernel
+ * image so it is highly likely that the allocated memory and the kernel
+ * will reside in the same node.
+ *
+ * If bottom-up allocation failed, will try to allocate memory top-down.
+ *
 * RETURNS:
- * Found address on success, %0 on failure.
+ * Found address on success, 0 on failure.
 */
 phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
                                         phys_addr_t end, phys_addr_t size,
                                         phys_addr_t align, int nid)
 {
+        phys_addr_t ret;        /* physical address: must not be narrowed to int */
+        phys_addr_t kernel_end;
         /* pump up @end */
         if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
                 end = memblock.current_limit;
@@ -141,6 +187,37 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
         /* avoid allocating the first page */
         start = max_t(phys_addr_t, start, PAGE_SIZE);
         end = max(start, end);
+        kernel_end = __pa_symbol(_end);
+        /*
+         * try bottom-up allocation only when bottom-up mode
+         * is set and @end is above the kernel image.
+         */
+        if (memblock_bottom_up() && end > kernel_end) {
+                phys_addr_t bottom_up_start;
+                /* make sure we will allocate above the kernel */
+                bottom_up_start = max(start, kernel_end);
+                /* ok, try bottom-up allocation first */
+                ret = __memblock_find_range_bottom_up(bottom_up_start, end,
+                                                      size, align, nid);
+                if (ret)
+                        return ret;
+                /*
+                 * we always limit bottom-up allocation above the kernel,
+                 * but top-down allocation doesn't have the limit, so
+                 * retrying top-down allocation may succeed when bottom-up
+                 * allocation failed.
+                 *
+                 * bottom-up allocation is expected to fail very rarely,
+                 * so we use WARN_ONCE() here to see the stack trace if
+                 * a failure happens.
+                 */
+                WARN_ONCE(1, "memblock: bottom-up allocation failed, "
+                             "memory hotunplug may be affected\n");
+        }
         return __memblock_find_range_top_down(start, end, size, align, nid);
 }
@@ -155,7 +232,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
 * Find @size free area aligned to @align in the specified range.
 *
 * RETURNS:
- * Found address on success, %0 on failure.
+ * Found address on success, 0 on failure.
 */
 phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
                                        phys_addr_t end, phys_addr_t size,
author	Tang Chen <tangchen@cn.fujitsu.com>	2013-11-12 18:07:59 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-11-12 22:09:08 -0500
commit	79442ed189acb8b949662676e750eda173c06f9b (patch)
tree	e850c404e1cdd8571ba39e9b8a9436bed8fcc6c3 /mm/memblock.c
parent	1402899e43fda490f08d2c47a7558931f8b9c60c (diff)

diff --git a/mm/memblock.c b/mm/memblock.c index accff1087137..53e477bb5558 100644 --- a/mm/memblock.c +++ b/mm/memblock.c
@@ -20,6 +20,8 @@
20	#include <linux/seq_file.h>	20	#include <linux/seq_file.h>
21	#include <linux/memblock.h>	21	#include <linux/memblock.h>
22		22
		23	#include <asm-generic/sections.h>
		24
23	static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;	25	static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
24	static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;	26	static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
25		27
@@ -32,6 +34,7 @@ struct memblock memblock __initdata_memblock = {
32	.reserved.cnt = 1, /* empty dummy entry */	34	.reserved.cnt = 1, /* empty dummy entry */
33	.reserved.max = INIT_MEMBLOCK_REGIONS,	35	.reserved.max = INIT_MEMBLOCK_REGIONS,
34		36
		37	.bottom_up = false,
35	.current_limit = MEMBLOCK_ALLOC_ANYWHERE,	38	.current_limit = MEMBLOCK_ALLOC_ANYWHERE,
36	};	39	};
37		40
@@ -82,6 +85,38 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
82	return (i < type->cnt) ? i : -1;	85	return (i < type->cnt) ? i : -1;
83	}	86	}
84		87
		88	/*
		89	* __memblock_find_range_bottom_up - find free area utility in bottom-up
		90	* @start: start of candidate range
		91	* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE\|ACCESSIBLE}
		92	* @size: size of free area to find
		93	* @align: alignment of free area to find
		94	* @nid: nid of the free area to find, %MAX_NUMNODES for any node
		95	*
		96	* Utility called from memblock_find_in_range_node(), find free area bottom-up.
		97	*
		98	* RETURNS:
		99	* Found address on success, 0 on failure.
		100	*/
		101	static phys_addr_t __init_memblock
		102	__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
		103	phys_addr_t size, phys_addr_t align, int nid)
		104	{
		105	phys_addr_t this_start, this_end, cand;
		106	u64 i;
		107
		108	for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
		109	this_start = clamp(this_start, start, end);
		110	this_end = clamp(this_end, start, end);
		111
		112	cand = round_up(this_start, align);
		113	if (cand < this_end && this_end - cand >= size)
		114	return cand;
		115	}
		116
		117	return 0;
		118	}
		119
85	/**	120	/**
86	* __memblock_find_range_top_down - find free area utility, in top-down	121	* __memblock_find_range_top_down - find free area utility, in top-down
87	* @start: start of candidate range	122	* @start: start of candidate range
@@ -93,7 +128,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
93	* Utility called from memblock_find_in_range_node(), find free area top-down.	128	* Utility called from memblock_find_in_range_node(), find free area top-down.
94	*	129	*
95	* RETURNS:	130	* RETURNS:
96	* Found address on success, %0 on failure.	131	* Found address on success, 0 on failure.
97	*/	132	*/
98	static phys_addr_t __init_memblock	133	static phys_addr_t __init_memblock
99	__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,	134	__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
@@ -127,13 +162,24 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
127	*	162	*
128	* Find @size free area aligned to @align in the specified range and node.	163	* Find @size free area aligned to @align in the specified range and node.
129	*	164	*
		165	* When allocation direction is bottom-up, the @start should be greater
		166	* than the end of the kernel image. Otherwise, it will be trimmed. The
		167	* reason is that we want the bottom-up allocation just near the kernel
		168	* image so it is highly likely that the allocated memory and the kernel
		169	* will reside in the same node.
		170	*
		171	* If bottom-up allocation failed, will try to allocate memory top-down.
		172	*
130	* RETURNS:	173	* RETURNS:
131	* Found address on success, %0 on failure.	174	* Found address on success, 0 on failure.
132	*/	175	*/
133	phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,	176	phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
134	phys_addr_t end, phys_addr_t size,	177	phys_addr_t end, phys_addr_t size,
135	phys_addr_t align, int nid)	178	phys_addr_t align, int nid)
136	{	179	{
		180	int ret;
		181	phys_addr_t kernel_end;
		182
137	/* pump up @end */	183	/* pump up @end */
138	if (end == MEMBLOCK_ALLOC_ACCESSIBLE)	184	if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
139	end = memblock.current_limit;	185	end = memblock.current_limit;
@@ -141,6 +187,37 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
141	/* avoid allocating the first page */	187	/* avoid allocating the first page */
142	start = max_t(phys_addr_t, start, PAGE_SIZE);	188	start = max_t(phys_addr_t, start, PAGE_SIZE);
143	end = max(start, end);	189	end = max(start, end);
		190	kernel_end = __pa_symbol(_end);
		191
		192	/*
		193	* try bottom-up allocation only when bottom-up mode
		194	* is set and @end is above the kernel image.
		195	*/
		196	if (memblock_bottom_up() && end > kernel_end) {
		197	phys_addr_t bottom_up_start;
		198
		199	/* make sure we will allocate above the kernel */
		200	bottom_up_start = max(start, kernel_end);
		201
		202	/* ok, try bottom-up allocation first */
		203	ret = __memblock_find_range_bottom_up(bottom_up_start, end,
		204	size, align, nid);
		205	if (ret)
		206	return ret;
		207
		208	/*
		209	* we always limit bottom-up allocation above the kernel,
		210	* but top-down allocation doesn't have the limit, so
		211	* retrying top-down allocation may succeed when bottom-up
		212	* allocation failed.
		213	*
		214	* bottom-up allocation is expected to be fail very rarely,
		215	* so we use WARN_ONCE() here to see the stack trace if
		216	* fail happens.
		217	*/
		218	WARN_ONCE(1, "memblock: bottom-up allocation failed, "
		219	"memory hotunplug may be affected\n");
		220	}
144		221
145	return __memblock_find_range_top_down(start, end, size, align, nid);	222	return __memblock_find_range_top_down(start, end, size, align, nid);
146	}	223	}
@@ -155,7 +232,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
155	* Find @size free area aligned to @align in the specified range.	232	* Find @size free area aligned to @align in the specified range.
156	*	233	*
157	* RETURNS:	234	* RETURNS:
158	* Found address on success, %0 on failure.	235	* Found address on success, 0 on failure.
159	*/	236	*/
160	phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,	237	phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
161	phys_addr_t end, phys_addr_t size,	238	phys_addr_t end, phys_addr_t size,