aboutsummaryrefslogtreecommitdiffstats
path: root/mm/memblock.c
diff options
context:
space:
mode:
author Tang Chen <tangchen@cn.fujitsu.com> 2013-11-12 18:07:59 -0500
committer Linus Torvalds <torvalds@linux-foundation.org> 2013-11-12 22:09:08 -0500
commit 79442ed189acb8b949662676e750eda173c06f9b (patch)
tree e850c404e1cdd8571ba39e9b8a9436bed8fcc6c3 /mm/memblock.c
parent 1402899e43fda490f08d2c47a7558931f8b9c60c (diff)
mm/memblock.c: introduce bottom-up allocation mode
The Linux kernel cannot migrate pages used by the kernel. As a result, kernel pages cannot be hot-removed. So we cannot allocate hotpluggable memory for the kernel.

ACPI SRAT (System Resource Affinity Table) contains the memory hotplug info. But before SRAT is parsed, memblock has already started to allocate memory for the kernel. So we need to prevent memblock from doing this.

In a memory hotplug system, any numa node the kernel resides in should be unhotpluggable. And for a modern server, each node could have at least 16GB memory. So memory around the kernel image is highly likely unhotpluggable.

So the basic idea is: Allocate memory from the end of the kernel image and to the higher memory. Since memory allocation before SRAT is parsed won't be too much, it could highly likely be in the same node with kernel image.

The current memblock can only allocate memory top-down. So this patch introduces a new bottom-up allocation mode to allocate memory bottom-up. And later when we use this allocation direction to allocate memory, we will limit the start address above the kernel.

Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Acked-by: Toshi Kani <toshi.kani@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Thomas Renninger <trenn@suse.de>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memblock.c')
-rw-r--r-- mm/memblock.c 83
1 files changed, 80 insertions, 3 deletions
diff --git a/mm/memblock.c b/mm/memblock.c
index accff1087137..53e477bb5558 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -20,6 +20,8 @@
20#include <linux/seq_file.h> 20#include <linux/seq_file.h>
21#include <linux/memblock.h> 21#include <linux/memblock.h>
22 22
23#include <asm-generic/sections.h>
24
23static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; 25static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
24static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; 26static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
25 27
@@ -32,6 +34,7 @@ struct memblock memblock __initdata_memblock = {
32 .reserved.cnt = 1, /* empty dummy entry */ 34 .reserved.cnt = 1, /* empty dummy entry */
33 .reserved.max = INIT_MEMBLOCK_REGIONS, 35 .reserved.max = INIT_MEMBLOCK_REGIONS,
34 36
37 .bottom_up = false,
35 .current_limit = MEMBLOCK_ALLOC_ANYWHERE, 38 .current_limit = MEMBLOCK_ALLOC_ANYWHERE,
36}; 39};
37 40
@@ -82,6 +85,38 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
82 return (i < type->cnt) ? i : -1; 85 return (i < type->cnt) ? i : -1;
83} 86}
84 87
88/*
89 * __memblock_find_range_bottom_up - find free area utility in bottom-up
90 * @start: start of candidate range
91 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
92 * @size: size of free area to find
93 * @align: alignment of free area to find
94 * @nid: nid of the free area to find, %MAX_NUMNODES for any node
95 *
96 * Utility called from memblock_find_in_range_node(), find free area bottom-up.
97 *
98 * RETURNS:
99 * Found address on success, 0 on failure.
100 */
101static phys_addr_t __init_memblock
102__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
103 phys_addr_t size, phys_addr_t align, int nid)
104{
105 phys_addr_t this_start, this_end, cand;
106 u64 i;
107
108 for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
109 this_start = clamp(this_start, start, end);
110 this_end = clamp(this_end, start, end);
111
112 cand = round_up(this_start, align);
113 if (cand < this_end && this_end - cand >= size)
114 return cand;
115 }
116
117 return 0;
118}
119
85/** 120/**
86 * __memblock_find_range_top_down - find free area utility, in top-down 121 * __memblock_find_range_top_down - find free area utility, in top-down
87 * @start: start of candidate range 122 * @start: start of candidate range
@@ -93,7 +128,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
93 * Utility called from memblock_find_in_range_node(), find free area top-down. 128 * Utility called from memblock_find_in_range_node(), find free area top-down.
94 * 129 *
95 * RETURNS: 130 * RETURNS:
96 * Found address on success, %0 on failure. 131 * Found address on success, 0 on failure.
97 */ 132 */
98static phys_addr_t __init_memblock 133static phys_addr_t __init_memblock
99__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, 134__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
@@ -127,13 +162,24 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
127 * 162 *
128 * Find @size free area aligned to @align in the specified range and node. 163 * Find @size free area aligned to @align in the specified range and node.
129 * 164 *
165 * When allocation direction is bottom-up, the @start should be greater
166 * than the end of the kernel image. Otherwise, it will be trimmed. The
167 * reason is that we want the bottom-up allocation just near the kernel
168 * image so it is highly likely that the allocated memory and the kernel
169 * will reside in the same node.
170 *
171 * If bottom-up allocation failed, will try to allocate memory top-down.
172 *
130 * RETURNS: 173 * RETURNS:
131 * Found address on success, %0 on failure. 174 * Found address on success, 0 on failure.
132 */ 175 */
133phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, 176phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
134 phys_addr_t end, phys_addr_t size, 177 phys_addr_t end, phys_addr_t size,
135 phys_addr_t align, int nid) 178 phys_addr_t align, int nid)
136{ 179{
180 int ret;
181 phys_addr_t kernel_end;
182
137 /* pump up @end */ 183 /* pump up @end */
138 if (end == MEMBLOCK_ALLOC_ACCESSIBLE) 184 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
139 end = memblock.current_limit; 185 end = memblock.current_limit;
@@ -141,6 +187,37 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
141 /* avoid allocating the first page */ 187 /* avoid allocating the first page */
142 start = max_t(phys_addr_t, start, PAGE_SIZE); 188 start = max_t(phys_addr_t, start, PAGE_SIZE);
143 end = max(start, end); 189 end = max(start, end);
190 kernel_end = __pa_symbol(_end);
191
192 /*
193 * try bottom-up allocation only when bottom-up mode
194 * is set and @end is above the kernel image.
195 */
196 if (memblock_bottom_up() && end > kernel_end) {
197 phys_addr_t bottom_up_start;
198
199 /* make sure we will allocate above the kernel */
200 bottom_up_start = max(start, kernel_end);
201
202 /* ok, try bottom-up allocation first */
203 ret = __memblock_find_range_bottom_up(bottom_up_start, end,
204 size, align, nid);
205 if (ret)
206 return ret;
207
208 /*
209 * we always limit bottom-up allocation above the kernel,
210 * but top-down allocation doesn't have the limit, so
211 * retrying top-down allocation may succeed when bottom-up
212 * allocation failed.
213 *
214 * bottom-up allocation is expected to be fail very rarely,
215 * so we use WARN_ONCE() here to see the stack trace if
216 * fail happens.
217 */
218 WARN_ONCE(1, "memblock: bottom-up allocation failed, "
219 "memory hotunplug may be affected\n");
220 }
144 221
145 return __memblock_find_range_top_down(start, end, size, align, nid); 222 return __memblock_find_range_top_down(start, end, size, align, nid);
146} 223}
@@ -155,7 +232,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
155 * Find @size free area aligned to @align in the specified range. 232 * Find @size free area aligned to @align in the specified range.
156 * 233 *
157 * RETURNS: 234 * RETURNS:
158 * Found address on success, %0 on failure. 235 * Found address on success, 0 on failure.
159 */ 236 */
160phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, 237phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
161 phys_addr_t end, phys_addr_t size, 238 phys_addr_t end, phys_addr_t size,