path: root/mm/page_alloc.c
author	Michal Hocko <mhocko@suse.com>	2017-06-02 17:46:49 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-06-02 18:07:38 -0400
commit	864b9a393dcb5aed09b8fd31b9bbda0fdda99374 (patch)
tree	8ce58c6ba1c874053e4814bd168c990d6cadda84 /mm/page_alloc.c
parent	9a291a7c9428155e8e623e4a3989f8be47134df5 (diff)
mm: consider memblock reservations for deferred memory initialization sizing
We have seen an early OOM killer invocation on ppc64 systems with
crashkernel=4096M:

	kthreadd invoked oom-killer: gfp_mask=0x16040c0(GFP_KERNEL|__GFP_COMP|__GFP_NOTRACK), nodemask=7, order=0, oom_score_adj=0
	kthreadd cpuset=/ mems_allowed=7
	CPU: 0 PID: 2 Comm: kthreadd Not tainted 4.4.68-1.gd7fe927-default #1
	Call Trace:
	  dump_stack+0xb0/0xf0 (unreliable)
	  dump_header+0xb0/0x258
	  out_of_memory+0x5f0/0x640
	  __alloc_pages_nodemask+0xa8c/0xc80
	  kmem_getpages+0x84/0x1a0
	  fallback_alloc+0x2a4/0x320
	  kmem_cache_alloc_node+0xc0/0x2e0
	  copy_process.isra.25+0x260/0x1b30
	  _do_fork+0x94/0x470
	  kernel_thread+0x48/0x60
	  kthreadd+0x264/0x330
	  ret_from_kernel_thread+0x5c/0xa4
	Mem-Info:
	  active_anon:0 inactive_anon:0 isolated_anon:0
	  active_file:0 inactive_file:0 isolated_file:0
	  unevictable:0 dirty:0 writeback:0 unstable:0
	  slab_reclaimable:5 slab_unreclaimable:73
	  mapped:0 shmem:0 pagetables:0 bounce:0
	  free:0 free_pcp:0 free_cma:0
	Node 7 DMA free:0kB min:0kB low:0kB high:0kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:52428800kB managed:110016kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:320kB slab_unreclaimable:4672kB kernel_stack:1152kB pagetables:0kB unstable:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes
	lowmem_reserve[]: 0 0 0 0
	Node 7 DMA: 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB 0*8192kB 0*16384kB = 0kB
	0 total pagecache pages
	0 pages in swap cache
	Swap cache stats: add 0, delete 0, find 0/0
	Free swap = 0kB
	Total swap = 0kB
	819200 pages RAM
	0 pages HighMem/MovableOnly
	817481 pages reserved
	0 pages cma reserved
	0 pages hwpoisoned

The reason is that the managed memory is too low (only 110MB) while the
rest of the 50GB is still waiting for the deferred initialization to be
done.  update_defer_init estimates the initial amount of memory to
initialize to be at least 2GB, but it doesn't consider any memory
allocated in that range.  In this particular case we've had

	Reserving 4096MB of memory at 128MB for crashkernel (System RAM: 51200MB)

so the low 2GB is mostly depleted.

Fix this by considering memblock allocations in the initial static
initialization estimation.  Move the max_initialise computation to
reset_deferred_meminit and implement a simple
memblock_reserved_memory_within helper which iterates all reserved
blocks and sums the size of all that start below the given address.
The cumulative size is then added on top of the initial estimation.
This is still not ideal because reset_deferred_meminit doesn't consider
holes, so a reservation might end up above the initial estimation and
be ignored, but let's keep the logic simple until we really need to
handle more complicated cases.

Fixes: 3a80a7fa7989 ("mm: meminit: initialise a subset of struct pages if CONFIG_DEFERRED_STRUCT_PAGE_INIT is set")
Link: http://lkml.kernel.org/r/20170531104010.GI27783@dhcp22.suse.cz
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Tested-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: <stable@vger.kernel.org>	[4.2+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
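The memblock_reserved_memory_within helper itself lands outside this
file, so its body is not part of the diff shown below (the diffstat is
limited to mm/page_alloc.c).  Going only by the description above, a
minimal userspace sketch of the summing logic might look like the
following; the region table, the type names, and the function name
reserved_memory_within are simplified stand-ins for memblock's
internals, not the kernel code itself:

	/* Simplified stand-in for a memblock reserved region. */
	struct region {
		unsigned long base;	/* start address of the reservation */
		unsigned long size;	/* length in bytes */
	};

	/*
	 * Walk every reserved block and accumulate the full size of each
	 * one that falls into the [start_addr, end_addr] window, i.e. each
	 * reservation that eats into the low range we are about to
	 * initialize statically.
	 */
	unsigned long reserved_memory_within(const struct region *rgns,
					     unsigned long cnt,
					     unsigned long start_addr,
					     unsigned long end_addr)
	{
		unsigned long total = 0;
		unsigned long i;

		for (i = 0; i < cnt; i++) {
			/* skip blocks that end before the window */
			if (rgns[i].base + rgns[i].size < start_addr)
				continue;
			/* blocks starting past the cutoff don't deplete it */
			if (rgns[i].base > end_addr)
				continue;
			total += rgns[i].size;
		}
		return total;
	}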
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	33
1 file changed, 22 insertions(+), 11 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b7a6f583a373..2302f250d6b1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -292,6 +292,26 @@ int page_group_by_mobility_disabled __read_mostly;
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 static inline void reset_deferred_meminit(pg_data_t *pgdat)
 {
+	unsigned long max_initialise;
+	unsigned long reserved_lowmem;
+
+	/*
+	 * Initialise at least 2G of a node but also take into account that
+	 * two large system hashes that can take up 1GB for 0.25TB/node.
+	 */
+	max_initialise = max(2UL << (30 - PAGE_SHIFT),
+		(pgdat->node_spanned_pages >> 8));
+
+	/*
+	 * Compensate the all the memblock reservations (e.g. crash kernel)
+	 * from the initial estimation to make sure we will initialize enough
+	 * memory to boot.
+	 */
+	reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn,
+			pgdat->node_start_pfn + max_initialise);
+	max_initialise += reserved_lowmem;
+
+	pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages);
 	pgdat->first_deferred_pfn = ULONG_MAX;
 }
 
@@ -314,20 +334,11 @@ static inline bool update_defer_init(pg_data_t *pgdat,
 				unsigned long pfn, unsigned long zone_end,
 				unsigned long *nr_initialised)
 {
-	unsigned long max_initialise;
-
 	/* Always populate low zones for address-contrained allocations */
 	if (zone_end < pgdat_end_pfn(pgdat))
 		return true;
-	/*
-	 * Initialise at least 2G of a node but also take into account that
-	 * two large system hashes that can take up 1GB for 0.25TB/node.
-	 */
-	max_initialise = max(2UL << (30 - PAGE_SHIFT),
-		(pgdat->node_spanned_pages >> 8));
-
 	(*nr_initialised)++;
-	if ((*nr_initialised > max_initialise) &&
+	if ((*nr_initialised > pgdat->static_init_size) &&
 	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
 		pgdat->first_deferred_pfn = pfn;
 		return false;
@@ -6138,7 +6149,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 	/* pg_data_t should be reset to zero when it's allocated */
 	WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
 
-	reset_deferred_meminit(pgdat);
 	pgdat->node_id = nid;
 	pgdat->node_start_pfn = node_start_pfn;
 	pgdat->per_cpu_nodestats = NULL;
@@ -6160,6 +6170,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 		(unsigned long)pgdat->node_mem_map);
 #endif
 
+	reset_deferred_meminit(pgdat);
 	free_area_init_core(pgdat);
 }
 
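Two things are worth noting about the hunks above.  The
2GB-or-spanned/256 estimate is now computed once per node and cached in
pgdat->static_init_size instead of being recomputed on every
update_defer_init call, and the reset_deferred_meminit call in
free_area_init_node moves below the point where node_start_pfn and the
node span are set up, since the new code reads both.  To sanity-check
the sizing for the ppc64 report in the changelog, here is a small
standalone program (an illustration, not kernel code) that replays the
arithmetic, assuming 64K pages (the log's smallest order is 64kB), the
819200 pages of RAM it reports, and the 4096MB crashkernel reservation
at 128MB, which starts below the 2GB cutoff and so is counted in full:

	#include <stdio.h>

	#define PAGE_SHIFT 16	/* 64K pages, per the log above */

	int main(void)
	{
		unsigned long spanned = 819200;	/* "819200 pages RAM" */
		unsigned long max_initialise, reserved_lowmem;

		/* max(2G worth of pages, spanned/256), as in reset_deferred_meminit */
		max_initialise = 2UL << (30 - PAGE_SHIFT);
		if ((spanned >> 8) > max_initialise)
			max_initialise = spanned >> 8;

		/*
		 * "Reserving 4096MB of memory at 128MB for crashkernel":
		 * the block starts below the cutoff, so its whole size is
		 * added back on top of the estimate.
		 */
		reserved_lowmem = 4096UL << (20 - PAGE_SHIFT);
		max_initialise += reserved_lowmem;

		if (max_initialise > spanned)
			max_initialise = spanned;

		/* prints: static_init_size = 98304 pages (6144 MB) */
		printf("static_init_size = %lu pages (%lu MB)\n",
		       max_initialise, (max_initialise << PAGE_SHIFT) >> 20);
		return 0;
	}

With the fix, the node statically initializes 6GB up front instead of
2GB, so the 4GB crashkernel reservation no longer starves early boot
allocations.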