author     Mel Gorman <mgorman@suse.de>                    2015-06-30 17:57:27 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2015-06-30 22:44:56 -0400
commit     0e1cc95b4cc7293bb7b39175035e7f7e45c90977 (patch)
tree       61b1173bce0ef2738aec4259956c1ecc52280dfa
parent     74033a798f5a5db368126ee6f690111cf019bf7a (diff)
mm: meminit: finish initialisation of struct pages before basic setup
Waiman Long reported that 24TB machines hit OOM during basic setup when struct page initialisation was deferred. One approach is to initialise memory on demand, but it interferes with page allocator paths. This patch creates dedicated threads to initialise memory before basic setup. It then blocks on a rw_semaphore until completion, as a wait_queue and counter would be overkill. This may be slower to boot, but it is simpler overall and also gets rid of the section mangling which existed so that kswapd could do the initialisation.

[akpm@linux-foundation.org: include rwsem.h, use DECLARE_RWSEM, fix comment, remove unneeded cast]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Waiman Long <waiman.long@hp.com>
Cc: Nathan Zimmer <nzimmer@sgi.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Scott Norton <scott.norton@hp.com>
Tested-by: Daniel J Blueman <daniel@numascale.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
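The synchronisation at the heart of the patch is easiest to see with the locking pulled out of the mm/page_alloc.c hunks below (a condensed excerpt, with the function bodies elided): page_alloc_init_late() takes pgdat_init_rwsem for read once per memory node before spawning that node's init thread, each thread drops its read hold when it finishes, and the closing down_write() can therefore only be granted once every node has completed.

static __initdata DECLARE_RWSEM(pgdat_init_rwsem);

static int __init deferred_init_memmap(void *data)
{
        pg_data_t *pgdat = data;

        /* ... initialise the node's remaining struct pages ... */

        /* Drop this node's read hold to signal completion. */
        up_read(&pgdat_init_rwsem);
        return 0;
}

void __init page_alloc_init_late(void)
{
        int nid;

        /*
         * Take each read hold *before* spawning the thread, so the
         * writer below cannot overtake a thread that has not started.
         */
        for_each_node_state(nid, N_MEMORY) {
                down_read(&pgdat_init_rwsem);
                kthread_run(deferred_init_memmap, NODE_DATA(nid),
                            "pgdatinit%d", nid);
        }

        /*
         * down_write() is granted only once all read holds are gone,
         * i.e. once every per-node init thread has finished.
         */
        down_write(&pgdat_init_rwsem);
        up_write(&pgdat_init_rwsem);
}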
-rw-r--r--  include/linux/gfp.h  |  8
-rw-r--r--  init/main.c          |  2
-rw-r--r--  mm/internal.h        | 24
-rw-r--r--  mm/page_alloc.c      | 46
-rw-r--r--  mm/vmscan.c          |  6
5 files changed, 49 insertions(+), 37 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 6ba7cf23748f..ad35f300b9a4 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -384,6 +384,14 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
 void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
 
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+void page_alloc_init_late(void);
+#else
+static inline void page_alloc_init_late(void)
+{
+}
+#endif
+
 /*
  * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
  * GFP flags are used before interrupts are enabled. Once interrupts are
diff --git a/init/main.c b/init/main.c
index c599aea23bb1..c5d5626289ce 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1004,6 +1004,8 @@ static noinline void __init kernel_init_freeable(void)
 	smp_init();
 	sched_init_smp();
 
+	page_alloc_init_late();
+
 	do_basic_setup();
 
 	/* Open the /dev/console on the rootfs, this should never fail */
diff --git a/mm/internal.h b/mm/internal.h
index a48cbefde8ca..36b23f1e2ca6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -379,30 +379,6 @@ static inline void mminit_verify_zonelist(void)
 }
 #endif /* CONFIG_DEBUG_MEMORY_INIT */
 
-/*
- * Deferred struct page initialisation requires init functions that are freed
- * before kswapd is available. Reuse the memory hotplug section annotation
- * to mark the required code.
- *
- * __defermem_init is code that always exists but is annotated __meminit to
- * avoid section warnings.
- * __defer_init code gets marked __meminit when deferring struct page
- * initialistion but is otherwise in the init section.
- */
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-#define __defermem_init __meminit
-#define __defer_init __meminit
-
-void deferred_init_memmap(int nid);
-#else
-#define __defermem_init
-#define __defer_init __init
-
-static inline void deferred_init_memmap(int nid)
-{
-}
-#endif
-
 /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
 #if defined(CONFIG_SPARSEMEM)
 extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5a38e39b30d1..506eac8b38af 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -18,6 +18,7 @@
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/interrupt.h>
+#include <linux/rwsem.h>
 #include <linux/pagemap.h>
 #include <linux/jiffies.h>
 #include <linux/bootmem.h>
@@ -61,6 +62,7 @@
 #include <linux/hugetlb.h>
 #include <linux/sched/rt.h>
 #include <linux/page_owner.h>
+#include <linux/kthread.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -242,7 +244,7 @@ static inline void reset_deferred_meminit(pg_data_t *pgdat)
 }
 
 /* Returns true if the struct page for the pfn is uninitialised */
-static inline bool __defermem_init early_page_uninitialised(unsigned long pfn)
+static inline bool __meminit early_page_uninitialised(unsigned long pfn)
 {
 	int nid = early_pfn_to_nid(pfn);
 
@@ -958,7 +960,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	local_irq_restore(flags);
 }
 
-static void __defer_init __free_pages_boot_core(struct page *page,
+static void __init __free_pages_boot_core(struct page *page,
 					unsigned long pfn, unsigned int order)
 {
 	unsigned int nr_pages = 1 << order;
@@ -1031,7 +1033,7 @@ static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
 #endif
 
 
-void __defer_init __free_pages_bootmem(struct page *page, unsigned long pfn,
+void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
 							unsigned int order)
 {
 	if (early_page_uninitialised(pfn))
@@ -1040,7 +1042,7 @@ void __defer_init __free_pages_bootmem(struct page *page, unsigned long pfn,
 }
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void __defermem_init deferred_free_range(struct page *page,
+static void __init deferred_free_range(struct page *page,
 					unsigned long pfn, int nr_pages)
 {
 	int i;
@@ -1060,20 +1062,30 @@ static void __defermem_init deferred_free_range(struct page *page,
 	__free_pages_boot_core(page, pfn, 0);
 }
 
+static __initdata DECLARE_RWSEM(pgdat_init_rwsem);
+
 /* Initialise remaining memory on a node */
-void __defermem_init deferred_init_memmap(int nid)
+static int __init deferred_init_memmap(void *data)
 {
+	pg_data_t *pgdat = data;
+	int nid = pgdat->node_id;
 	struct mminit_pfnnid_cache nid_init_state = { };
 	unsigned long start = jiffies;
 	unsigned long nr_pages = 0;
 	unsigned long walk_start, walk_end;
 	int i, zid;
 	struct zone *zone;
-	pg_data_t *pgdat = NODE_DATA(nid);
 	unsigned long first_init_pfn = pgdat->first_deferred_pfn;
+	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
 
-	if (first_init_pfn == ULONG_MAX)
-		return;
+	if (first_init_pfn == ULONG_MAX) {
+		up_read(&pgdat_init_rwsem);
+		return 0;
+	}
+
+	/* Bind memory initialisation thread to a local node if possible */
+	if (!cpumask_empty(cpumask))
+		set_cpus_allowed_ptr(current, cpumask);
 
 	/* Sanity check boundaries */
 	BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
@@ -1165,8 +1177,24 @@ free_range:
 	/* Sanity check that the next zone really is unpopulated */
 	WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
 
-	pr_info("kswapd %d initialised %lu pages in %ums\n", nid, nr_pages,
+	pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
 					jiffies_to_msecs(jiffies - start));
+	up_read(&pgdat_init_rwsem);
+	return 0;
+}
+
+void __init page_alloc_init_late(void)
+{
+	int nid;
+
+	for_each_node_state(nid, N_MEMORY) {
+		down_read(&pgdat_init_rwsem);
+		kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
+	}
+
+	/* Block until all are initialised */
+	down_write(&pgdat_init_rwsem);
+	up_write(&pgdat_init_rwsem);
 }
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
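With kswapd out of the picture, each per-node init thread now logs its own completion. On a hypothetical two-node machine the boot log would contain lines of the following form (page counts and timings invented for illustration; only the format string comes from the patch):

        node 0 initialised, 32450372 pages in 4837ms
        node 1 initialised, 32473262 pages in 4902ms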
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f4a487110764..e61445dce04e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3386,7 +3386,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
  * If there are applications that are active memory-allocators
  * (most normal use), this basically shouldn't matter.
  */
-static int __defermem_init kswapd(void *p)
+static int kswapd(void *p)
 {
 	unsigned long order, new_order;
 	unsigned balanced_order;
@@ -3421,8 +3421,6 @@ static int __defermem_init kswapd(void *p)
 	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
 	set_freezable();
 
-	deferred_init_memmap(pgdat->node_id);
-
 	order = new_order = 0;
 	balanced_order = 0;
 	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
@@ -3578,7 +3576,7 @@ static int cpu_callback(struct notifier_block *nfb, unsigned long action,
  * This kswapd start function will be called by init and node-hot-add.
  * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
  */
-int __defermem_init kswapd_run(int nid)
+int kswapd_run(int nid)
 {
 	pg_data_t *pgdat = NODE_DATA(nid);
 	int ret = 0;