path: root/mm/page_alloc.c
author     Johannes Weiner <hannes@cmpxchg.org>               2014-04-07 18:37:48 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>     2014-04-07 19:35:57 -0400
commit     3a025760fc158b3726eac89ee95d7f29599e9dfa (patch)
tree       f4b0a8b3da5f361c2206c83c9ad9d5247aa4690e /mm/page_alloc.c
parent     d715ae08f2ff87508a081c4df78061bf4f7211d6 (diff)
mm: page_alloc: spill to remote nodes before waking kswapd
On NUMA systems, a node may start thrashing cache or even swap anonymous
pages while there are still free pages on remote nodes.

This is a result of commits 81c0a2bb515f ("mm: page_alloc: fair zone
allocator policy") and fff4068cba48 ("mm: page_alloc: revert NUMA aspect
of fair allocation policy").

Before those changes, the allocator would first try all allowed zones,
including those on remote nodes, before waking any kswapds. But now, the
allocator fastpath doubles as the fairness pass, which in turn can only
consider the local node to prevent remote spilling based on exhausted
fairness batches alone. Remote nodes are only considered in the slowpath,
after the kswapds are woken up. But if remote nodes still have free
memory, kswapd should not be woken to rebalance the local node or it may
thrash cache or swap prematurely.

Fix this by adding one more unfair pass over the zonelist that is allowed
to spill to remote nodes after the local fairness pass fails but before
entering the slowpath and waking the kswapds.

This also gets rid of the GFP_THISNODE exemption from the fairness
protocol, because the unfair pass is no longer tied to kswapd, which
GFP_THISNODE is not allowed to wake up.

However, because remote spills can be more frequent now - we prefer them
over local kswapd reclaim - the allocation batches on remote nodes could
underflow more heavily. When resetting the batches, use
atomic_long_read() directly instead of zone_page_state() to calculate the
delta, as the latter filters out negative counter values.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: <stable@kernel.org>	[3.12+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
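
For orientation before the diff, the following is a toy, userspace-only model of the ordering the patch establishes. Nothing in it is kernel code; the zone names, batch values and helpers are invented purely to show the control flow: a fair pass restricted to local zones with remaining batch, then a batch refill and an unfair pass that may spill to remote zones, and only if both fail does the allocator fall into the slowpath that wakes kswapd.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_zone {
        const char *name;
        bool local;             /* on the preferred (allocating) node? */
        long alloc_batch;       /* stand-in for the NR_ALLOC_BATCH budget */
        long free_pages;
};

static struct toy_zone zones[] = {
        { "node0/Normal", true,  0,    0    },  /* local: batch and pages used up */
        { "node1/Normal", false, 1024, 4096 },  /* remote: plenty of free memory */
};

#define NR_TOY_ZONES (sizeof(zones) / sizeof(zones[0]))

static struct toy_zone *try_alloc(bool fair)
{
        for (size_t i = 0; i < NR_TOY_ZONES; i++) {
                struct toy_zone *z = &zones[i];

                /* The fair pass skips remote zones and exhausted batches. */
                if (fair && (!z->local || z->alloc_batch <= 0))
                        continue;
                if (z->free_pages > 0) {
                        z->free_pages--;
                        if (fair)
                                z->alloc_batch--;
                        return z;
                }
        }
        return NULL;
}

int main(void)
{
        /* Pass 1: fair and local-only (ALLOC_FAIR in the patch). */
        struct toy_zone *z = try_alloc(true);

        if (!z) {
                /* Pass 2: refill the local batches and retry without
                 * fairness, spilling to remote zones if need be, before
                 * any kswapd is woken. */
                for (size_t i = 0; i < NR_TOY_ZONES; i++)
                        if (zones[i].local)
                                zones[i].alloc_batch = 1024;
                z = try_alloc(false);
        }

        if (z)
                printf("allocated from %s without waking kswapd\n", z->name);
        else
                printf("both passes failed: enter the slowpath, wake kswapd\n");
        return 0;
}

With the numbers above, the first pass fails and the second one hands out a page from node1, which is exactly the behaviour the patch prefers over waking kswapd on node0.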
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--   mm/page_alloc.c   89
1 file changed, 45 insertions(+), 44 deletions(-)
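
A note on the open-coded GFP_THISNODE test that appears in the __alloc_pages_slowpath hunk below: the comparison has to match the complete composite mask, and the added IS_ENABLED(CONFIG_NUMA) guard matters because GFP_THISNODE is defined as 0 on non-NUMA builds, where the equality would otherwise match every allocation. In kernels of this era the definition in include/linux/gfp.h is along these lines (quoted from memory, verify against your tree):

#ifdef CONFIG_NUMA
#define GFP_THISNODE    (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
#else
#define GFP_THISNODE    ((__force gfp_t)0)
#endif

Callers such as the slab allocator pass the full combination for opportunistic node-local allocations, which is what the slowpath wants to fail fast; a caller setting only __GFP_THISNODE does not match the test.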
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 73c25912c7c4..15d140755e71 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1239,15 +1239,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
         }
         local_irq_restore(flags);
 }
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
-        return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
-}
-#else
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
-        return false;
-}
 #endif
 
 /*
@@ -1584,12 +1575,7 @@ again:
                                           get_pageblock_migratetype(page));
         }
 
-        /*
-         * NOTE: GFP_THISNODE allocations do not partake in the kswapd
-         * aging protocol, so they can't be fair.
-         */
-        if (!gfp_thisnode_allocation(gfp_flags))
-                __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+        __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
 
         __count_zone_vm_events(PGALLOC, zone, 1 << order);
         zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1955,23 +1941,12 @@ zonelist_scan:
                  * zone size to ensure fair page aging. The zone a
                  * page was allocated in should have no effect on the
                  * time the page has in memory before being reclaimed.
-                 *
-                 * Try to stay in local zones in the fastpath. If
-                 * that fails, the slowpath is entered, which will do
-                 * another pass starting with the local zones, but
-                 * ultimately fall back to remote zones that do not
-                 * partake in the fairness round-robin cycle of this
-                 * zonelist.
-                 *
-                 * NOTE: GFP_THISNODE allocations do not partake in
-                 * the kswapd aging protocol, so they can't be fair.
                  */
-                if ((alloc_flags & ALLOC_WMARK_LOW) &&
-                    !gfp_thisnode_allocation(gfp_mask)) {
-                        if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
-                                continue;
+                if (alloc_flags & ALLOC_FAIR) {
                         if (!zone_local(preferred_zone, zone))
                                 continue;
+                        if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+                                continue;
                 }
                 /*
                  * When allocating a page cache page for writing, we
@@ -2409,32 +2384,40 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
         return page;
 }
 
-static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
-                             struct zonelist *zonelist,
-                             enum zone_type high_zoneidx,
-                             struct zone *preferred_zone)
+static void reset_alloc_batches(struct zonelist *zonelist,
+                                enum zone_type high_zoneidx,
+                                struct zone *preferred_zone)
 {
         struct zoneref *z;
         struct zone *zone;
 
         for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-                if (!(gfp_mask & __GFP_NO_KSWAPD))
-                        wakeup_kswapd(zone, order, zone_idx(preferred_zone));
                 /*
                  * Only reset the batches of zones that were actually
-                 * considered in the fast path, we don't want to
-                 * thrash fairness information for zones that are not
+                 * considered in the fairness pass, we don't want to
+                 * trash fairness information for zones that are not
                  * actually part of this zonelist's round-robin cycle.
                  */
                 if (!zone_local(preferred_zone, zone))
                         continue;
                 mod_zone_page_state(zone, NR_ALLOC_BATCH,
-                                    high_wmark_pages(zone) -
-                                    low_wmark_pages(zone) -
-                                    zone_page_state(zone, NR_ALLOC_BATCH));
+                        high_wmark_pages(zone) - low_wmark_pages(zone) -
+                        atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
         }
 }
 
+static void wake_all_kswapds(unsigned int order,
+                             struct zonelist *zonelist,
+                             enum zone_type high_zoneidx,
+                             struct zone *preferred_zone)
+{
+        struct zoneref *z;
+        struct zone *zone;
+
+        for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+                wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+}
+
 static inline int
 gfp_to_alloc_flags(gfp_t gfp_mask)
 {
@@ -2523,12 +2506,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
          * allowed per node queues are empty and that nodes are
          * over allocated.
          */
-        if (gfp_thisnode_allocation(gfp_mask))
+        if (IS_ENABLED(CONFIG_NUMA) &&
+            (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
                 goto nopage;
 
 restart:
-        prepare_slowpath(gfp_mask, order, zonelist,
-                         high_zoneidx, preferred_zone);
+        if (!(gfp_mask & __GFP_NO_KSWAPD))
+                wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
 
         /*
          * OK, we're below the kswapd watermark and have kicked background
@@ -2712,7 +2696,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
         struct page *page = NULL;
         int migratetype = allocflags_to_migratetype(gfp_mask);
         unsigned int cpuset_mems_cookie;
-        int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
+        int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
         struct mem_cgroup *memcg = NULL;
 
         gfp_mask &= gfp_allowed_mask;
@@ -2753,12 +2737,29 @@ retry_cpuset:
         if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
                 alloc_flags |= ALLOC_CMA;
 #endif
+retry:
         /* First allocation attempt */
         page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
                         zonelist, high_zoneidx, alloc_flags,
                         preferred_zone, migratetype);
         if (unlikely(!page)) {
                 /*
+                 * The first pass makes sure allocations are spread
+                 * fairly within the local node. However, the local
+                 * node might have free pages left after the fairness
+                 * batches are exhausted, and remote zones haven't
+                 * even been considered yet. Try once more without
+                 * fairness, and include remote zones now, before
+                 * entering the slowpath and waking kswapd: prefer
+                 * spilling to a remote zone over swapping locally.
+                 */
+                if (alloc_flags & ALLOC_FAIR) {
+                        reset_alloc_batches(zonelist, high_zoneidx,
+                                            preferred_zone);
+                        alloc_flags &= ~ALLOC_FAIR;
+                        goto retry;
+                }
+                /*
                  * Runtime PM, block IO and its error handling path
                  * can deadlock because I/O on the device might not
                  * complete.
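
On the switch from zone_page_state() to atomic_long_read() in reset_alloc_batches() above: as the changelog notes, zone_page_state() filters negative counter values (it clamps them to zero on SMP builds), so a batch that has underflowed from frequent remote spills would be reset short of its target. A self-contained sketch of the arithmetic, with placeholder helpers and made-up numbers rather than the kernel's:

#include <stdio.h>

/* Simplified stand-ins, not the kernel helpers. */
static long clamped_read(long raw)      /* behaves like zone_page_state() */
{
        return raw < 0 ? 0 : raw;
}

static long raw_read(long raw)          /* behaves like atomic_long_read() */
{
        return raw;
}

int main(void)
{
        long batch  = -200;     /* NR_ALLOC_BATCH after heavy remote spills */
        long target = 1000;     /* high_wmark - low_wmark for the zone */

        /* Old delta: computed from the clamped value. */
        long old_delta = target - clamped_read(batch);
        /* New delta: computed from the raw, possibly negative, value. */
        long new_delta = target - raw_read(batch);

        printf("old reset: %ld + %ld = %ld (still short of %ld)\n",
               batch, old_delta, batch + old_delta, target);
        printf("new reset: %ld + %ld = %ld (back at the target)\n",
               batch, new_delta, batch + new_delta);
        return 0;
}

With a batch that has underflowed to -200, the old calculation leaves the counter at 800 after the reset, while the raw read restores the full 1000-page budget.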