-rw-r--r--  mm/internal.h   |  1
-rw-r--r--  mm/page_alloc.c | 89
2 files changed, 46 insertions(+), 44 deletions(-)
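
As the hunks below read, the patch replaces the implicit fairness test (ALLOC_WMARK_LOW plus a GFP_THISNODE exemption) with an explicit ALLOC_FAIR flag, splits prepare_slowpath() into reset_alloc_batches() and wake_all_kswapds(), and turns allocation into two passes: a fair pass restricted to local zones with a positive NR_ALLOC_BATCH, then a retry over the whole zonelist with the batches reset and fairness dropped, before the slowpath wakes kswapd. The following is a minimal userspace sketch of that flow, not kernel code: struct toy_zone, the fixed batch refill, and the simplified get_page_from_freelist()/reset_alloc_batches() signatures are illustrative stand-ins.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define ALLOC_FAIR	0x100

struct toy_zone {
	const char *name;
	bool local;		/* zone on the preferred node? */
	long alloc_batch;	/* stand-in for NR_ALLOC_BATCH */
	long free_pages;
};

/* Fair pass: local zones with batch left; unfair pass: any zone. */
static struct toy_zone *get_page_from_freelist(struct toy_zone *zones,
					       size_t nr, int alloc_flags)
{
	for (size_t i = 0; i < nr; i++) {
		struct toy_zone *z = &zones[i];

		if (alloc_flags & ALLOC_FAIR) {
			if (!z->local)
				continue;	/* fair pass is local-only */
			if (z->alloc_batch <= 0)
				continue;	/* fairness batch exhausted */
		}
		if (z->free_pages > 0) {
			z->free_pages--;
			z->alloc_batch--;
			return z;
		}
	}
	return NULL;
}

static void reset_alloc_batches(struct toy_zone *zones, size_t nr)
{
	for (size_t i = 0; i < nr; i++)
		if (zones[i].local)
			zones[i].alloc_batch = 4;	/* arbitrary refill */
}

int main(void)
{
	struct toy_zone zones[] = {
		{ "local",  true,  0, 8 },	/* batch spent, pages free */
		{ "remote", false, 4, 8 },
	};
	int alloc_flags = ALLOC_FAIR;
	struct toy_zone *z;

	z = get_page_from_freelist(zones, 2, alloc_flags);
	if (!z) {
		/* Fair pass failed: reset batches, retry without fairness. */
		reset_alloc_batches(zones, 2);
		alloc_flags &= ~ALLOC_FAIR;
		z = get_page_from_freelist(zones, 2, alloc_flags);
	}
	printf("allocated from %s zone\n", z ? z->name : "no");
	return 0;
}

In the sketch the local zone's batch is spent but it still has free pages, so the second pass satisfies the allocation locally instead of entering the slowpath; a genuinely full local zone would spill to the remote zone the same way, before any kswapd wakeup.
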
diff --git a/mm/internal.h b/mm/internal.h
index 29e1e761f9eb..3e910000fda4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -370,5 +370,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
 #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
 #define ALLOC_CMA		0x80 /* allow allocations from CMA areas */
+#define ALLOC_FAIR		0x100 /* fair zone allocation */
 
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 73c25912c7c4..15d140755e71 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1239,15 +1239,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 	}
 	local_irq_restore(flags);
 }
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
-	return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
-}
-#else
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
-	return false;
-}
 #endif
 
 /*
@@ -1584,12 +1575,7 @@ again:
 					  get_pageblock_migratetype(page));
 	}
 
-	/*
-	 * NOTE: GFP_THISNODE allocations do not partake in the kswapd
-	 * aging protocol, so they can't be fair.
-	 */
-	if (!gfp_thisnode_allocation(gfp_flags))
-		__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1955,23 +1941,12 @@ zonelist_scan:
 		 * zone size to ensure fair page aging. The zone a
 		 * page was allocated in should have no effect on the
 		 * time the page has in memory before being reclaimed.
-		 *
-		 * Try to stay in local zones in the fastpath. If
-		 * that fails, the slowpath is entered, which will do
-		 * another pass starting with the local zones, but
-		 * ultimately fall back to remote zones that do not
-		 * partake in the fairness round-robin cycle of this
-		 * zonelist.
-		 *
-		 * NOTE: GFP_THISNODE allocations do not partake in
-		 * the kswapd aging protocol, so they can't be fair.
 		 */
-		if ((alloc_flags & ALLOC_WMARK_LOW) &&
-		    !gfp_thisnode_allocation(gfp_mask)) {
-			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
-				continue;
+		if (alloc_flags & ALLOC_FAIR) {
 			if (!zone_local(preferred_zone, zone))
 				continue;
+			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+				continue;
 		}
 		/*
 		 * When allocating a page cache page for writing, we
@@ -2409,32 +2384,40 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 	return page;
 }
 
-static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
-			     struct zonelist *zonelist,
-			     enum zone_type high_zoneidx,
-			     struct zone *preferred_zone)
+static void reset_alloc_batches(struct zonelist *zonelist,
+				enum zone_type high_zoneidx,
+				struct zone *preferred_zone)
 {
 	struct zoneref *z;
 	struct zone *zone;
 
 	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-		if (!(gfp_mask & __GFP_NO_KSWAPD))
-			wakeup_kswapd(zone, order, zone_idx(preferred_zone));
 		/*
 		 * Only reset the batches of zones that were actually
-		 * considered in the fast path, we don't want to
-		 * thrash fairness information for zones that are not
+		 * considered in the fairness pass, we don't want to
+		 * trash fairness information for zones that are not
 		 * actually part of this zonelist's round-robin cycle.
 		 */
 		if (!zone_local(preferred_zone, zone))
 			continue;
 		mod_zone_page_state(zone, NR_ALLOC_BATCH,
-				    high_wmark_pages(zone) -
-				    low_wmark_pages(zone) -
-				    zone_page_state(zone, NR_ALLOC_BATCH));
+				    high_wmark_pages(zone) - low_wmark_pages(zone) -
+				    atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
 	}
 }
 
+static void wake_all_kswapds(unsigned int order,
+			     struct zonelist *zonelist,
+			     enum zone_type high_zoneidx,
+			     struct zone *preferred_zone)
+{
+	struct zoneref *z;
+	struct zone *zone;
+
+	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+		wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+}
+
 static inline int
 gfp_to_alloc_flags(gfp_t gfp_mask)
 {
@@ -2523,12 +2506,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 * allowed per node queues are empty and that nodes are
 	 * over allocated.
 	 */
-	if (gfp_thisnode_allocation(gfp_mask))
+	if (IS_ENABLED(CONFIG_NUMA) &&
+	    (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
 		goto nopage;
 
 restart:
-	prepare_slowpath(gfp_mask, order, zonelist,
-			 high_zoneidx, preferred_zone);
+	if (!(gfp_mask & __GFP_NO_KSWAPD))
+		wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -2712,7 +2696,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	struct page *page = NULL;
 	int migratetype = allocflags_to_migratetype(gfp_mask);
 	unsigned int cpuset_mems_cookie;
-	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
+	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
 	struct mem_cgroup *memcg = NULL;
 
 	gfp_mask &= gfp_allowed_mask;
@@ -2753,12 +2737,29 @@ retry_cpuset:
 	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
 #endif
+retry:
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
 			zonelist, high_zoneidx, alloc_flags,
 			preferred_zone, migratetype);
 	if (unlikely(!page)) {
 		/*
+		 * The first pass makes sure allocations are spread
+		 * fairly within the local node. However, the local
+		 * node might have free pages left after the fairness
+		 * batches are exhausted, and remote zones haven't
+		 * even been considered yet. Try once more without
+		 * fairness, and include remote zones now, before
+		 * entering the slowpath and waking kswapd: prefer
+		 * spilling to a remote zone over swapping locally.
+		 */
+		if (alloc_flags & ALLOC_FAIR) {
+			reset_alloc_batches(zonelist, high_zoneidx,
+					    preferred_zone);
+			alloc_flags &= ~ALLOC_FAIR;
+			goto retry;
+		}
+		/*
 		 * Runtime PM, block IO and its error handling path
 		 * can deadlock because I/O on the device might not
 		 * complete.