author		Pavel Tatashin <pasha.tatashin@oracle.com>	2018-04-05 19:22:31 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-04-06 00:36:25 -0400
commit		c9e97a1997fbf3a1d18d4065c2ca381f0704d7e5 (patch)
tree		a860648d747c3054868baef0d88b6e34acecb609 /mm/page_alloc.c
parent		3a2d7fa8a3d5ae740bd0c21d933acc6220857ed0 (diff)
mm: initialize pages on demand during boot
Deferred page initialization allows the boot cpu to initialize a small
subset of the system's pages early in boot, with other cpus doing the rest
later on.

It is, however, problematic to know how many pages the kernel needs during
boot.  Different modules and kernel parameters may change the requirement,
so the boot cpu either initializes too many pages or runs out of memory.

To fix that, initialize early pages on demand.  This ensures the kernel
does the minimum amount of work to initialize pages during boot and leaves
the rest to be divided in the multithreaded initialization path
(deferred_init_memmap).

The on-demand code is permanently disabled using static branching once
deferred pages are initialized.  After the static branch is changed to
false, the overhead is up to two branch-always instructions if the zone
watermark check fails or if rmqueue fails.

Sergey Senozhatsky noticed that while deferred pages currently make sense
only on NUMA machines (we start one thread per latency node), CONFIG_NUMA
is not a requirement for CONFIG_DEFERRED_STRUCT_PAGE_INIT, so that must
also be addressed in the patch.

[akpm@linux-foundation.org: fix typo in comment, make deferred_pages static]
[pasha.tatashin@oracle.com: fix min() type mismatch warning]
  Link: http://lkml.kernel.org/r/20180212164543.26592-1-pasha.tatashin@oracle.com
[pasha.tatashin@oracle.com: use zone_to_nid() in deferred_grow_zone()]
  Link: http://lkml.kernel.org/r/20180214163343.21234-2-pasha.tatashin@oracle.com
[pasha.tatashin@oracle.com: might_sleep warning]
  Link: http://lkml.kernel.org/r/20180306192022.28289-1-pasha.tatashin@oracle.com
[akpm@linux-foundation.org: s/spin_lock/spin_lock_irq/ in page_alloc_init_late()]
[pasha.tatashin@oracle.com: v5]
  Link: http://lkml.kernel.org/r/20180309220807.24961-3-pasha.tatashin@oracle.com
[akpm@linux-foundation.org: tweak comments]
[pasha.tatashin@oracle.com: v6]
  Link: http://lkml.kernel.org/r/20180313182355.17669-3-pasha.tatashin@oracle.com
[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/20180209192216.20509-2-pasha.tatashin@oracle.com
Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Reviewed-by: Steven Sistare <steven.sistare@oracle.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Tested-by: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: Gioh Kim <gi-oh.kim@profitbricks.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Miles Chen <miles.chen@mediatek.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
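For context on the "static branching" mechanism the log describes: the patch
relies on the kernel's jump-label (static key) API, where a boolean key is
compiled into patchable branches and flipped once at runtime.  Below is a
minimal sketch of that pattern.  It uses the same key name and jump-label
calls as the diff further down, but the two helper functions are hypothetical
and exist only to illustrate the shape of the code, not as part of the patch.

	#include <linux/jump_label.h>

	/* Defaults to true: the on-demand init path is live during early boot. */
	static DEFINE_STATIC_KEY_TRUE(deferred_pages);

	/*
	 * Hot-path check (hypothetical helper).  After the key is disabled this
	 * compiles down to a patched branch, so the residual cost is at most a
	 * branch-always instruction.
	 */
	static bool on_demand_path_enabled(void)
	{
		return static_branch_unlikely(&deferred_pages);
	}

	/*
	 * Run once, after all deferred pages are initialized (hypothetical
	 * helper); the on-demand path then stays off for good.
	 */
	static void __init on_demand_path_shutdown(void)
	{
		static_branch_disable(&deferred_pages);
	}

In the actual patch the check sits in get_page_from_freelist() and the
disable call in page_alloc_init_late(), as shown in the diff below.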
Diffstat (limited to 'mm/page_alloc.c')
 mm/page_alloc.c | 183
 1 file changed, 144 insertions(+), 39 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cf5555df78bd..3183eb2f579c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -292,40 +292,6 @@ EXPORT_SYMBOL(nr_online_nodes);
 int page_group_by_mobility_disabled __read_mostly;
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-
-/*
- * Determine how many pages need to be initialized during early boot
- * (non-deferred initialization).
- * The value of first_deferred_pfn will be set later, once non-deferred pages
- * are initialized, but for now set it ULONG_MAX.
- */
-static inline void reset_deferred_meminit(pg_data_t *pgdat)
-{
-	phys_addr_t start_addr, end_addr;
-	unsigned long max_pgcnt;
-	unsigned long reserved;
-
-	/*
-	 * Initialise at least 2G of a node but also take into account that
-	 * two large system hashes that can take up 1GB for 0.25TB/node.
-	 */
-	max_pgcnt = max(2UL << (30 - PAGE_SHIFT),
-			(pgdat->node_spanned_pages >> 8));
-
-	/*
-	 * Compensate the all the memblock reservations (e.g. crash kernel)
-	 * from the initial estimation to make sure we will initialize enough
-	 * memory to boot.
-	 */
-	start_addr = PFN_PHYS(pgdat->node_start_pfn);
-	end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
-	reserved = memblock_reserved_memory_within(start_addr, end_addr);
-	max_pgcnt += PHYS_PFN(reserved);
-
-	pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);
-	pgdat->first_deferred_pfn = ULONG_MAX;
-}
-
 /* Returns true if the struct page for the pfn is uninitialised */
 static inline bool __meminit early_page_uninitialised(unsigned long pfn)
 {
@@ -361,10 +327,6 @@ static inline bool update_defer_init(pg_data_t *pgdat,
 	return true;
 }
 #else
-static inline void reset_deferred_meminit(pg_data_t *pgdat)
-{
-}
-
 static inline bool early_page_uninitialised(unsigned long pfn)
 {
 	return false;
@@ -1611,6 +1573,117 @@ static int __init deferred_init_memmap(void *data)
 	pgdat_init_report_one_done();
 	return 0;
 }
+
+/*
+ * During boot we initialize deferred pages on-demand, as needed, but once
+ * page_alloc_init_late() has finished, the deferred pages are all initialized,
+ * and we can permanently disable that path.
+ */
+static DEFINE_STATIC_KEY_TRUE(deferred_pages);
+
+/*
+ * If this zone has deferred pages, try to grow it by initializing enough
+ * deferred pages to satisfy the allocation specified by order, rounded up to
+ * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
+ * of SECTION_SIZE bytes by initializing struct pages in increments of
+ * PAGES_PER_SECTION * sizeof(struct page) bytes.
+ *
+ * Return true when zone was grown, otherwise return false. We return true even
+ * when we grow less than requested, to let the caller decide if there are
+ * enough pages to satisfy the allocation.
+ *
+ * Note: We use noinline because this function is needed only during boot, and
+ * it is called from a __ref function _deferred_grow_zone. This way we are
+ * making sure that it is not inlined into permanent text section.
+ */
+static noinline bool __init
+deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+	int zid = zone_idx(zone);
+	int nid = zone_to_nid(zone);
+	pg_data_t *pgdat = NODE_DATA(nid);
+	unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
+	unsigned long nr_pages = 0;
+	unsigned long first_init_pfn, spfn, epfn, t, flags;
+	unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
+	phys_addr_t spa, epa;
+	u64 i;
+
+	/* Only the last zone may have deferred pages */
+	if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
+		return false;
+
+	pgdat_resize_lock(pgdat, &flags);
+
+	/*
+	 * If deferred pages have been initialized while we were waiting for
+	 * the lock, return true, as the zone was grown. The caller will retry
+	 * this zone. We won't return to this function since the caller also
+	 * has this static branch.
+	 */
+	if (!static_branch_unlikely(&deferred_pages)) {
+		pgdat_resize_unlock(pgdat, &flags);
+		return true;
+	}
+
+	/*
+	 * If someone grew this zone while we were waiting for spinlock, return
+	 * true, as there might be enough pages already.
+	 */
+	if (first_deferred_pfn != pgdat->first_deferred_pfn) {
+		pgdat_resize_unlock(pgdat, &flags);
+		return true;
+	}
+
+	first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
+
+	if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
+		pgdat_resize_unlock(pgdat, &flags);
+		return false;
+	}
+
+	for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+		spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+		epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+
+		while (spfn < epfn && nr_pages < nr_pages_needed) {
+			t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
+			first_deferred_pfn = min(t, epfn);
+			nr_pages += deferred_init_pages(nid, zid, spfn,
+							first_deferred_pfn);
+			spfn = first_deferred_pfn;
+		}
+
+		if (nr_pages >= nr_pages_needed)
+			break;
+	}
+
+	for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+		spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+		epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
+		deferred_free_pages(nid, zid, spfn, epfn);
+
+		if (first_deferred_pfn == epfn)
+			break;
+	}
+	pgdat->first_deferred_pfn = first_deferred_pfn;
+	pgdat_resize_unlock(pgdat, &flags);
+
+	return nr_pages > 0;
+}
+
+/*
+ * deferred_grow_zone() is __init, but it is called from
+ * get_page_from_freelist() during early boot until deferred_pages permanently
+ * disables this call. This is why we have refdata wrapper to avoid warning,
+ * and to ensure that the function body gets unloaded.
+ */
+static bool __ref
+_deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+	return deferred_grow_zone(zone, order);
+}
+
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
 void __init page_alloc_init_late(void)
@@ -1629,6 +1702,12 @@ void __init page_alloc_init_late(void)
 	/* Block until all are initialised */
 	wait_for_completion(&pgdat_init_all_done_comp);
 
+	/*
+	 * We initialized the rest of the deferred pages. Permanently disable
+	 * on-demand struct page initialization.
+	 */
+	static_branch_disable(&deferred_pages);
+
 	/* Reinit limits that are based on free pages after the kernel is up */
 	files_maxfiles_init();
 #endif
@@ -3208,6 +3287,16 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 			       ac_classzone_idx(ac), alloc_flags)) {
 			int ret;
 
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+			/*
+			 * Watermark failed for this zone, but see if we can
+			 * grow this zone if it contains deferred pages.
+			 */
+			if (static_branch_unlikely(&deferred_pages)) {
+				if (_deferred_grow_zone(zone, order))
+					goto try_this_zone;
+			}
+#endif
 			/* Checked here to keep the fast path fast */
 			BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
 			if (alloc_flags & ALLOC_NO_WATERMARKS)
@@ -3249,6 +3338,14 @@ try_this_zone:
 				reserve_highatomic_pageblock(page, zone, order);
 
 			return page;
+		} else {
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+			/* Try again if zone has deferred pages */
+			if (static_branch_unlikely(&deferred_pages)) {
+				if (_deferred_grow_zone(zone, order))
+					goto try_this_zone;
+			}
+#endif
 		}
 	}
 
@@ -6244,7 +6341,15 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 
 	alloc_node_mem_map(pgdat);
 
-	reset_deferred_meminit(pgdat);
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+	/*
+	 * We start only with one section of pages, more pages are added as
+	 * needed until the rest of deferred pages are initialized.
+	 */
+	pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
+						pgdat->node_spanned_pages);
+	pgdat->first_deferred_pfn = ULONG_MAX;
+#endif
 	free_area_init_core(pgdat);
 }
 
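A note on the granularity used by deferred_grow_zone() above: the requested
order is rounded up to a whole number of sections, so even a small boot-time
allocation grows the zone by at least one section's worth of struct pages.
The standalone sketch below only illustrates that rounding arithmetic; it is
not kernel code, and the PAGES_PER_SECTION value is an assumption matching
x86_64 with 4 KiB pages and 128 MiB sections.

	#include <stdio.h>

	/* Assumed: 128 MiB sections of 4 KiB pages (32768 pages per section). */
	#define PAGES_PER_SECTION	(1UL << 15)
	/* Same rounding the kernel's ALIGN() does for a power-of-two alignment. */
	#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))

	int main(void)
	{
		unsigned int order;

		for (order = 0; order <= 10; order += 5) {
			unsigned long nr_pages_needed =
				ALIGN_UP(1UL << order, PAGES_PER_SECTION);

			printf("order %2u -> grow by %lu pages (%lu section(s))\n",
			       order, nr_pages_needed,
			       nr_pages_needed / PAGES_PER_SECTION);
		}
		return 0;
	}

For any order up to 10 this yields a single section, which is why the patch
can start with just one statically initialized section per node and still
satisfy early boot allocations on demand.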