diff options
author | Tang Chen <tangchen@cn.fujitsu.com> | 2013-02-22 19:33:49 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-02-23 20:50:14 -0500 |
commit | 01a178a94e8eaec351b29ee49fbb3d1c124cb7fb (patch) | |
tree | de978b65bf5d4a05c78cb5ce1180dc3fb04bd12d | |
parent | 27168d38fa209073219abedbe6a9de7ba9acbfad (diff) |
acpi, memory-hotplug: support getting hotplug info from SRAT
We now provide an option for users who don't want to specify physical
memory addresses on the kernel command line.
/*
* For movablemem_map=acpi:
*
* SRAT:            |_____| |_____| |_________| |_________| ......
* node id:            0       1         1           2
* hotpluggable:       n       y         y           n
* movablemem_map:          |_____| |_________|
*
* Using movablemem_map, we can prevent memblock from allocating memory
* on ZONE_MOVABLE at boot time.
*/
So users just specify movablemem_map=acpi, and the kernel will use the
hotpluggable info in SRAT to determine which memory ranges should be set
as ZONE_MOVABLE.
If all the memory ranges in SRAT are hotpluggable, then no memory can be
used by the kernel. But before parsing SRAT, memblock has already reserved
some memory ranges for other purposes, such as for the kernel image, and so
on. We cannot prevent the kernel from using this memory. So we need to
exclude these ranges even if this memory is hotpluggable.
Furthermore, there could be several memory ranges in the single node
which the kernel resides in. We may skip one range that has memory
reserved by memblock, but if the rest of the memory is too small, then the
kernel will fail to boot. So, make the whole node which the kernel
resides in un-hotpluggable. Then the kernel has enough memory to use.
NOTE: Using this option will degrade NUMA performance because every
hotpluggable node will be set as ZONE_MOVABLE, and the kernel cannot use
memory on it. If users don't want to lose NUMA performance, just don't use
it.
[akpm@linux-foundation.org: fix warning]
[akpm@linux-foundation.org: use strcmp()]
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Jianguo Wu <wujianguo@huawei.com>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Wu Jianguo <wujianguo@huawei.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Len Brown <lenb@kernel.org>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | Documentation/kernel-parameters.txt | 29 | ||||
-rw-r--r-- | arch/x86/mm/srat.c | 71 | ||||
-rw-r--r-- | include/linux/mm.h | 2 | ||||
-rw-r--r-- | mm/page_alloc.c | 22 |
4 files changed, 113 insertions, 11 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 722a74161246..766087781ecd 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -1640,15 +1640,30 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
1640 | that the amount of memory usable for all allocations | 1640 | that the amount of memory usable for all allocations |
1641 | is not too small. | 1641 | is not too small. |
1642 | 1642 | ||
1643 | movablemem_map=acpi | ||
1644 | [KNL,X86,IA-64,PPC] This parameter is similar to | ||
1645 | memmap except it specifies the memory map of | ||
1646 | ZONE_MOVABLE. | ||
1647 | This option informs the kernel to use Hot Pluggable bit | ||
1648 | in flags from SRAT from ACPI BIOS to determine which | ||
1649 | memory devices could be hotplugged. The corresponding | ||
1650 | memory ranges will be set as ZONE_MOVABLE. | ||
1651 | NOTE: Whatever node the kernel resides in will always | ||
1652 | be un-hotpluggable. | ||
1653 | |||
1643 | movablemem_map=nn[KMG]@ss[KMG] | 1654 | movablemem_map=nn[KMG]@ss[KMG] |
1644 | [KNL,X86,IA-64,PPC] This parameter is similar to | 1655 | [KNL,X86,IA-64,PPC] This parameter is similar to |
1645 | memmap except it specifies the memory map of | 1656 | memmap except it specifies the memory map of |
1646 | ZONE_MOVABLE. | 1657 | ZONE_MOVABLE. |
1647 | If more areas are all within one node, then from | 1658 | If user specifies memory ranges, the info in SRAT will |
1648 | lowest ss to the end of the node will be ZONE_MOVABLE. | 1659 | be ignored. And it works like the following: |
1649 | If an area covers two or more nodes, the area from | 1660 | - If more ranges are all within one node, then from |
1650 | ss to the end of the 1st node will be ZONE_MOVABLE, | 1661 | lowest ss to the end of the node will be ZONE_MOVABLE. |
1651 | and all the rest nodes will only have ZONE_MOVABLE. | 1662 | - If a range is within a node, then from ss to the end |
1663 | of the node will be ZONE_MOVABLE. | ||
1664 | - If a range covers two or more nodes, then from ss to | ||
1665 | the end of the 1st node will be ZONE_MOVABLE, and all | ||
1666 | the rest nodes will only have ZONE_MOVABLE. | ||
1652 | If memmap is specified at the same time, the | 1667 | If memmap is specified at the same time, the |
1653 | movablemem_map will be limited within the memmap | 1668 | movablemem_map will be limited within the memmap |
1654 | areas. If kernelcore or movablecore is also specified, | 1669 | areas. If kernelcore or movablecore is also specified, |
@@ -1656,6 +1671,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
1656 | satisfied. So the administrator should be careful that | 1671 | satisfied. So the administrator should be careful that |
1657 | the amount of movablemem_map areas are not too large. | 1672 | the amount of movablemem_map areas are not too large. |
1658 | Otherwise kernel won't have enough memory to start. | 1673 | Otherwise kernel won't have enough memory to start. |
1674 | NOTE: We don't stop users specifying the node the | ||
1675 | kernel resides in as hotpluggable so that this | ||
1676 | option can be used as a workaround of firmware | ||
1677 | bugs. | ||
1659 | 1678 | ||
1660 | MTD_Partition= [MTD] | 1679 | MTD_Partition= [MTD] |
1661 | Format: <name>,<region-number>,<size>,<offset> | 1680 | Format: <name>,<region-number>,<size>,<offset> |
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 3e90039e52e0..79836d01f789 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c | |||
@@ -142,16 +142,72 @@ static inline int save_add_info(void) {return 0;} | |||
142 | #endif | 142 | #endif |
143 | 143 | ||
144 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 144 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
145 | static void __init handle_movablemem(int node, u64 start, u64 end) | 145 | static void __init |
146 | handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable) | ||
146 | { | 147 | { |
147 | int overlap; | 148 | int overlap, i; |
148 | unsigned long start_pfn, end_pfn; | 149 | unsigned long start_pfn, end_pfn; |
149 | 150 | ||
150 | start_pfn = PFN_DOWN(start); | 151 | start_pfn = PFN_DOWN(start); |
151 | end_pfn = PFN_UP(end); | 152 | end_pfn = PFN_UP(end); |
152 | 153 | ||
153 | /* | 154 | /* |
154 | * For movablecore_map=nn[KMG]@ss[KMG]: | 155 | * For movablemem_map=acpi: |
156 | * | ||
157 | * SRAT: |_____| |_____| |_________| |_________| ...... | ||
158 | * node id: 0 1 1 2 | ||
159 | * hotpluggable: n y y n | ||
160 | * movablemem_map: |_____| |_________| | ||
161 | * | ||
162 | * Using movablemem_map, we can prevent memblock from allocating memory | ||
163 | * on ZONE_MOVABLE at boot time. | ||
164 | * | ||
165 | * Before parsing SRAT, memblock has already reserve some memory ranges | ||
166 | * for other purposes, such as for kernel image. We cannot prevent | ||
167 | * kernel from using these memory, so we need to exclude these memory | ||
168 | * even if it is hotpluggable. | ||
169 | * Furthermore, to ensure the kernel has enough memory to boot, we make | ||
170 | * all the memory on the node which the kernel resides in | ||
171 | * un-hotpluggable. | ||
172 | */ | ||
173 | if (hotpluggable && movablemem_map.acpi) { | ||
174 | /* Exclude ranges reserved by memblock. */ | ||
175 | struct memblock_type *rgn = &memblock.reserved; | ||
176 | |||
177 | for (i = 0; i < rgn->cnt; i++) { | ||
178 | if (end <= rgn->regions[i].base || | ||
179 | start >= rgn->regions[i].base + | ||
180 | rgn->regions[i].size) | ||
181 | continue; | ||
182 | |||
183 | /* | ||
184 | * If the memory range overlaps the memory reserved by | ||
185 | * memblock, then the kernel resides in this node. | ||
186 | */ | ||
187 | node_set(node, movablemem_map.numa_nodes_kernel); | ||
188 | |||
189 | goto out; | ||
190 | } | ||
191 | |||
192 | /* | ||
193 | * If the kernel resides in this node, then the whole node | ||
194 | * should not be hotpluggable. | ||
195 | */ | ||
196 | if (node_isset(node, movablemem_map.numa_nodes_kernel)) | ||
197 | goto out; | ||
198 | |||
199 | insert_movablemem_map(start_pfn, end_pfn); | ||
200 | |||
201 | /* | ||
202 | * numa_nodes_hotplug nodemask represents which nodes are put | ||
203 | * into movablemem_map.map[]. | ||
204 | */ | ||
205 | node_set(node, movablemem_map.numa_nodes_hotplug); | ||
206 | goto out; | ||
207 | } | ||
208 | |||
209 | /* | ||
210 | * For movablemem_map=nn[KMG]@ss[KMG]: | ||
155 | * | 211 | * |
156 | * SRAT: |_____| |_____| |_________| |_________| ...... | 212 | * SRAT: |_____| |_____| |_________| |_________| ...... |
157 | * node id: 0 1 1 2 | 213 | * node id: 0 1 1 2 |
@@ -160,6 +216,8 @@ static void __init handle_movablemem(int node, u64 start, u64 end) | |||
160 | * | 216 | * |
161 | * Using movablemem_map, we can prevent memblock from allocating memory | 217 | * Using movablemem_map, we can prevent memblock from allocating memory |
162 | * on ZONE_MOVABLE at boot time. | 218 | * on ZONE_MOVABLE at boot time. |
219 | * | ||
220 | * NOTE: In this case, SRAT info will be ignored. | ||
163 | */ | 221 | */ |
164 | overlap = movablemem_map_overlap(start_pfn, end_pfn); | 222 | overlap = movablemem_map_overlap(start_pfn, end_pfn); |
165 | if (overlap >= 0) { | 223 | if (overlap >= 0) { |
@@ -187,9 +245,12 @@ static void __init handle_movablemem(int node, u64 start, u64 end) | |||
187 | */ | 245 | */ |
188 | insert_movablemem_map(start_pfn, end_pfn); | 246 | insert_movablemem_map(start_pfn, end_pfn); |
189 | } | 247 | } |
248 | out: | ||
249 | return; | ||
190 | } | 250 | } |
191 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 251 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
192 | static inline void handle_movablemem(int node, u64 start, u64 end) | 252 | static inline void |
253 | handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable) | ||
193 | { | 254 | { |
194 | } | 255 | } |
195 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 256 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
@@ -234,7 +295,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) | |||
234 | (unsigned long long) start, (unsigned long long) end - 1, | 295 | (unsigned long long) start, (unsigned long long) end - 1, |
235 | hotpluggable ? "Hot Pluggable": ""); | 296 | hotpluggable ? "Hot Pluggable": ""); |
236 | 297 | ||
237 | handle_movablemem(node, start, end); | 298 | handle_movablemem(node, start, end, hotpluggable); |
238 | 299 | ||
239 | return 0; | 300 | return 0; |
240 | out_err_bad_srat: | 301 | out_err_bad_srat: |
diff --git a/include/linux/mm.h b/include/linux/mm.h index 4d7377a1d084..72a42c0fa633 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -1366,9 +1366,11 @@ struct movablemem_entry { | |||
1366 | }; | 1366 | }; |
1367 | 1367 | ||
1368 | struct movablemem_map { | 1368 | struct movablemem_map { |
1369 | bool acpi; /* true if using SRAT info */ | ||
1369 | int nr_map; | 1370 | int nr_map; |
1370 | struct movablemem_entry map[MOVABLEMEM_MAP_MAX]; | 1371 | struct movablemem_entry map[MOVABLEMEM_MAP_MAX]; |
1371 | nodemask_t numa_nodes_hotplug; /* on which nodes we specify memory */ | 1372 | nodemask_t numa_nodes_hotplug; /* on which nodes we specify memory */ |
1373 | nodemask_t numa_nodes_kernel; /* on which nodes kernel resides in */ | ||
1372 | }; | 1374 | }; |
1373 | 1375 | ||
1374 | extern void __init insert_movablemem_map(unsigned long start_pfn, | 1376 | extern void __init insert_movablemem_map(unsigned long start_pfn, |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7ea9a003ad57..a7381be21320 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -203,7 +203,10 @@ static unsigned long __meminitdata dma_reserve; | |||
203 | 203 | ||
204 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 204 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
205 | /* Movable memory ranges, will also be used by memblock subsystem. */ | 205 | /* Movable memory ranges, will also be used by memblock subsystem. */ |
206 | struct movablemem_map movablemem_map; | 206 | struct movablemem_map movablemem_map = { |
207 | .acpi = false, | ||
208 | .nr_map = 0, | ||
209 | }; | ||
207 | 210 | ||
208 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; | 211 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; |
209 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | 212 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; |
@@ -5314,6 +5317,23 @@ static int __init cmdline_parse_movablemem_map(char *p) | |||
5314 | if (!p) | 5317 | if (!p) |
5315 | goto err; | 5318 | goto err; |
5316 | 5319 | ||
5320 | if (!strcmp(p, "acpi")) | ||
5321 | movablemem_map.acpi = true; | ||
5322 | |||
5323 | /* | ||
5324 | * If user decide to use info from BIOS, all the other user specified | ||
5325 | * ranges will be ignored. | ||
5326 | */ | ||
5327 | if (movablemem_map.acpi) { | ||
5328 | if (movablemem_map.nr_map) { | ||
5329 | memset(movablemem_map.map, 0, | ||
5330 | sizeof(struct movablemem_entry) | ||
5331 | * movablemem_map.nr_map); | ||
5332 | movablemem_map.nr_map = 0; | ||
5333 | } | ||
5334 | return 0; | ||
5335 | } | ||
5336 | |||
5317 | oldp = p; | 5337 | oldp = p; |
5318 | mem_size = memparse(p, &p); | 5338 | mem_size = memparse(p, &p); |
5319 | if (p == oldp) | 5339 | if (p == oldp) |