aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Tang Chen <tangchen@cn.fujitsu.com> 2013-11-12 18:08:10 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2013-11-12 22:09:09 -0500
commit: c5320926e370b4cfb8f10c2169e26f960079cf67 (patch)
tree: ef61f5718e366f0e8a82695daa358516f092698d
parent: fa591c4ae76ecbd4d26d7e8f65429d6d454554a6 (diff)
mem-hotplug: introduce movable_node boot option
The hot-Pluggable field in SRAT specifies which memory is hotpluggable. As we mentioned before, if hotpluggable memory is used by the kernel, it cannot be hot-removed. So memory hotplug users may want to set all hotpluggable memory in ZONE_MOVABLE so that the kernel won't use it.

Memory hotplug users may also set a node as movable node, which has ZONE_MOVABLE only, so that the whole node can be hot-removed.

But the kernel cannot use memory in ZONE_MOVABLE. By doing this, the kernel cannot use memory in movable nodes. This will degrade NUMA performance, and other users may be unhappy.

So we need a way to allow users to enable and disable this functionality. In this patch, we introduce the movable_node boot option to allow users to choose not to consume hotpluggable memory at early boot time, so that later we can set it as ZONE_MOVABLE.

To achieve this, the movable_node boot option will control the memblock allocation direction. That said, after memblock is ready, before SRAT is parsed, we should allocate memory near the kernel image as we explained in the previous patches.

So if movable_node boot option is set, the kernel does the following:

1. After memblock is ready, make memblock allocate memory bottom up.
2. After SRAT is parsed, make memblock behave as default, allocate memory top down.

Users can specify "movable_node" in kernel commandline to enable this functionality. For those who don't use memory hotplug or who don't want to lose their NUMA performance, just don't specify anything. The kernel will work as before.

Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Suggested-by: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Suggested-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Toshi Kani <toshi.kani@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Thomas Renninger <trenn@suse.de>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- Documentation/kernel-parameters.txt 3
-rw-r--r-- arch/x86/mm/numa.c 11
-rw-r--r-- mm/Kconfig 17
-rw-r--r-- mm/memory_hotplug.c 31
4 files changed, 57 insertions, 5 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index fd3ecedc084d..882a40d405c8 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1775,6 +1775,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1775 that the amount of memory usable for all allocations 1775 that the amount of memory usable for all allocations
1776 is not too small. 1776 is not too small.
1777 1777
1778 movable_node [KNL,X86] Boot-time switch to enable the effects
1779 of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details.
1780
1778 MTD_Partition= [MTD] 1781 MTD_Partition= [MTD]
1779 Format: <name>,<region-number>,<size>,<offset> 1782 Format: <name>,<region-number>,<size>,<offset>
1780 1783
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 8bf93bae1f13..24aec58d6afd 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -567,6 +567,17 @@ static int __init numa_init(int (*init_func)(void))
567 ret = init_func(); 567 ret = init_func();
568 if (ret < 0) 568 if (ret < 0)
569 return ret; 569 return ret;
570
571 /*
572 * We reset memblock back to the top-down direction
573 * here because if we configured ACPI_NUMA, we have
574 * parsed SRAT in init_func(). It is ok to have the
575 * reset here even if we didn't configure ACPI_NUMA
576 * or acpi numa init fails and falls back to dummy
577 * numa init.
578 */
579 memblock_set_bottom_up(false);
580
570 ret = numa_cleanup_meminfo(&numa_meminfo); 581 ret = numa_cleanup_meminfo(&numa_meminfo);
571 if (ret < 0) 582 if (ret < 0)
572 return ret; 583 return ret;
diff --git a/mm/Kconfig b/mm/Kconfig
index 394838f489eb..3f4ffda152bb 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -153,11 +153,18 @@ config MOVABLE_NODE
153 help 153 help
154 Allow a node to have only movable memory. Pages used by the kernel, 154 Allow a node to have only movable memory. Pages used by the kernel,
155 such as direct mapping pages cannot be migrated. So the corresponding 155 such as direct mapping pages cannot be migrated. So the corresponding
156 memory device cannot be hotplugged. This option allows users to 156 memory device cannot be hotplugged. This option allows the following
157 online all the memory of a node as movable memory so that the whole 157 two things:
158 node can be hotplugged. Users who don't use the memory hotplug 158 - When the system is booting, node full of hotpluggable memory can
159 feature are fine with this option on since they don't online memory 159 be arranged to have only movable memory so that the whole node can
160 as movable. 160 be hot-removed. (need movable_node boot option specified).
161 - After the system is up, the option allows users to online all the
162 memory of a node as movable memory so that the whole node can be
163 hot-removed.
164
165 Users who don't use the memory hotplug feature are fine with this
166 option on since they don't specify movable_node boot option or they
167 don't online memory as movable.
161 168
162 Say Y here if you want to hotplug a whole node. 169 Say Y here if you want to hotplug a whole node.
163 Say N here if you want kernel to use memory on all nodes evenly. 170 Say N here if you want kernel to use memory on all nodes evenly.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1b6fe8ca71e6..489f235502db 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -31,6 +31,7 @@
31#include <linux/firmware-map.h> 31#include <linux/firmware-map.h>
32#include <linux/stop_machine.h> 32#include <linux/stop_machine.h>
33#include <linux/hugetlb.h> 33#include <linux/hugetlb.h>
34#include <linux/memblock.h>
34 35
35#include <asm/tlbflush.h> 36#include <asm/tlbflush.h>
36 37
@@ -1422,6 +1423,36 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
1422} 1423}
1423#endif /* CONFIG_MOVABLE_NODE */ 1424#endif /* CONFIG_MOVABLE_NODE */
1424 1425
1426static int __init cmdline_parse_movable_node(char *p)
1427{
1428#ifdef CONFIG_MOVABLE_NODE
1429 /*
1430 * Memory used by the kernel cannot be hot-removed because Linux
1431 * cannot migrate the kernel pages. When memory hotplug is
1432 * enabled, we should prevent memblock from allocating memory
1433 * for the kernel.
1434 *
1435 * ACPI SRAT records all hotpluggable memory ranges. But before
1436 * SRAT is parsed, we don't know about it.
1437 *
1438 * The kernel image is loaded into memory at very early time. We
1439 * cannot prevent this anyway. So on NUMA system, we set any
1440 * node the kernel resides in as un-hotpluggable.
1441 *
1442 * Since on modern servers, one node could have double-digit
1443 * gigabytes memory, we can assume the memory around the kernel
1444 * image is also un-hotpluggable. So before SRAT is parsed, just
1445 * allocate memory near the kernel image to try the best to keep
1446 * the kernel away from hotpluggable memory.
1447 */
1448 memblock_set_bottom_up(true);
1449#else
1450 pr_warn("movable_node option not supported\n");
1451#endif
1452 return 0;
1453}
1454early_param("movable_node", cmdline_parse_movable_node);
1455
1425/* check which state of node_states will be changed when offline memory */ 1456/* check which state of node_states will be changed when offline memory */
1426static void node_states_check_changes_offline(unsigned long nr_pages, 1457static void node_states_check_changes_offline(unsigned long nr_pages,
1427 struct zone *zone, struct memory_notify *arg) 1458 struct zone *zone, struct memory_notify *arg)