aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Tang Chen <tangchen@cn.fujitsu.com> 2013-02-22 19:33:44 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2013-02-23 20:50:14 -0500
commit: e8d1955258091e4c92d5a975ebd7fd8a98f5d30f (patch)
tree: 747facdf560c5ca88af4952614c7d5376ad3e4c3
parent: fb06bc8e5f42f38c011de0e59481f464a82380f6 (diff)
acpi, memory-hotplug: parse SRAT before memblock is ready
On Linux, pages used by the kernel cannot be migrated. As a result, if a memory range is used by the kernel, it cannot be hot-removed. So if we want to hot-remove memory, we must prevent the kernel from using it.

The current way to prevent this is to specify a memory range with the movablemem_map boot option and set it as ZONE_MOVABLE.

However, while the system is booting, memblock allocates memory and reserves it for the kernel. memblock is already in use before we parse SRAT and learn the node memory ranges, so it may allocate memory in ranges that are to be set as ZONE_MOVABLE. That memory can then be used by the kernel and never be freed.

So, let's parse SRAT before memblock is called for the first time. That is early enough.

The first call of memblock_find_in_range_node() is in:

    setup_arch()
     |-->setup_real_mode()

so this patch adds a function, early_parse_srat(), to parse SRAT, and calls it before setup_real_mode() is called.

NOTE:

1) early_parse_srat() is called before numa_init() and has already initialized numa_meminfo. So DO NOT clear numa_nodes_parsed in numa_init() and DO NOT zero numa_meminfo in numa_init(); otherwise we will lose the memory NUMA info.

2) I don't know why the original acpi_numa_init() uses the count of memory affinities parsed from SRAT as a return value. So I added a static variable, srat_mem_cnt, to remember this count and use it as the return value of the new acpi_numa_init().

[mhocko@suse.cz: parse SRAT before memblock is ready fix]
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Reviewed-by: Wen Congyang <wency@cn.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Jianguo Wu <wujianguo@huawei.com>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Wu Jianguo <wujianguo@huawei.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Len Brown <lenb@kernel.org>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  arch/x86/kernel/setup.c  13
-rw-r--r--  arch/x86/mm/numa.c  6
-rw-r--r--  drivers/acpi/numa.c  23
-rw-r--r--  include/linux/acpi.h  8
4 files changed, 34 insertions, 16 deletions
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 915f5efefcf5..9c857f05cef0 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1056,6 +1056,15 @@ void __init setup_arch(char **cmdline_p)
1056 setup_bios_corruption_check(); 1056 setup_bios_corruption_check();
1057#endif 1057#endif
1058 1058
1059 /*
1060 * In the memory hotplug case, the kernel needs info from SRAT to
1061 * determine which memory is hotpluggable before allocating memory
1062 * using memblock.
1063 */
1064 acpi_boot_table_init();
1065 early_acpi_boot_init();
1066 early_parse_srat();
1067
1059#ifdef CONFIG_X86_32 1068#ifdef CONFIG_X86_32
1060 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", 1069 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
1061 (max_pfn_mapped<<PAGE_SHIFT) - 1); 1070 (max_pfn_mapped<<PAGE_SHIFT) - 1);
@@ -1101,10 +1110,6 @@ void __init setup_arch(char **cmdline_p)
1101 /* 1110 /*
1102 * Parse the ACPI tables for possible boot-time SMP configuration. 1111 * Parse the ACPI tables for possible boot-time SMP configuration.
1103 */ 1112 */
1104 acpi_boot_table_init();
1105
1106 early_acpi_boot_init();
1107
1108 initmem_init(); 1113 initmem_init();
1109 memblock_find_dma_reserve(); 1114 memblock_find_dma_reserve();
1110 1115
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index e3963f52aaea..dfd30259eb89 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -560,10 +560,12 @@ static int __init numa_init(int (*init_func)(void))
560 for (i = 0; i < MAX_LOCAL_APIC; i++) 560 for (i = 0; i < MAX_LOCAL_APIC; i++)
561 set_apicid_to_node(i, NUMA_NO_NODE); 561 set_apicid_to_node(i, NUMA_NO_NODE);
562 562
563 nodes_clear(numa_nodes_parsed); 563 /*
564 * Do not clear numa_nodes_parsed or zero numa_meminfo here, because
565 * SRAT was parsed earlier in early_parse_srat().
566 */
564 nodes_clear(node_possible_map); 567 nodes_clear(node_possible_map);
565 nodes_clear(node_online_map); 568 nodes_clear(node_online_map);
566 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
567 WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); 569 WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES));
568 numa_reset_distance(); 570 numa_reset_distance();
569 571
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 33e609f63585..59844ee149be 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -282,10 +282,10 @@ acpi_table_parse_srat(enum acpi_srat_type id,
282 handler, max_entries); 282 handler, max_entries);
283} 283}
284 284
285int __init acpi_numa_init(void) 285static int srat_mem_cnt;
286{
287 int cnt = 0;
288 286
287void __init early_parse_srat(void)
288{
289 /* 289 /*
290 * Should not limit number with cpu num that is from NR_CPUS or nr_cpus= 290 * Should not limit number with cpu num that is from NR_CPUS or nr_cpus=
291 * SRAT cpu entries could have different order with that in MADT. 291 * SRAT cpu entries could have different order with that in MADT.
@@ -295,21 +295,24 @@ int __init acpi_numa_init(void)
295 /* SRAT: Static Resource Affinity Table */ 295 /* SRAT: Static Resource Affinity Table */
296 if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { 296 if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
297 acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY, 297 acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
298 acpi_parse_x2apic_affinity, 0); 298 acpi_parse_x2apic_affinity, 0);
299 acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, 299 acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
300 acpi_parse_processor_affinity, 0); 300 acpi_parse_processor_affinity, 0);
301 cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, 301 srat_mem_cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
302 acpi_parse_memory_affinity, 302 acpi_parse_memory_affinity,
303 NR_NODE_MEMBLKS); 303 NR_NODE_MEMBLKS);
304 } 304 }
305}
305 306
307int __init acpi_numa_init(void)
308{
306 /* SLIT: System Locality Information Table */ 309 /* SLIT: System Locality Information Table */
307 acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); 310 acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit);
308 311
309 acpi_numa_arch_fixup(); 312 acpi_numa_arch_fixup();
310 313
311 if (cnt < 0) 314 if (srat_mem_cnt < 0)
312 return cnt; 315 return srat_mem_cnt;
313 else if (!parsed_numa_memblks) 316 else if (!parsed_numa_memblks)
314 return -ENOENT; 317 return -ENOENT;
315 return 0; 318 return 0;
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index bcbdd7484e58..f46cfd73a553 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -485,6 +485,14 @@ static inline bool acpi_driver_match_device(struct device *dev,
485 485
486#endif /* !CONFIG_ACPI */ 486#endif /* !CONFIG_ACPI */
487 487
488#ifdef CONFIG_ACPI_NUMA
489void __init early_parse_srat(void);
490#else
491static inline void early_parse_srat(void)
492{
493}
494#endif
495
488#ifdef CONFIG_ACPI 496#ifdef CONFIG_ACPI
489void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state, 497void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state,
490 u32 pm1a_ctrl, u32 pm1b_ctrl)); 498 u32 pm1a_ctrl, u32 pm1b_ctrl));