 mm/compaction.c |  10
 mm/internal.h   |   1
 mm/vmscan.c     | 147
 3 files changed, 54 insertions(+), 104 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c
index 5b2bfbaa821a..ccf97b02b85f 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1191,11 +1191,11 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
 
 	/*
 	 * Mark that the PG_migrate_skip information should be cleared
-	 * by kswapd when it goes to sleep. kswapd does not set the
+	 * by kswapd when it goes to sleep. kcompactd does not set the
 	 * flag itself as the decision to be clear should be directly
 	 * based on an allocation request.
 	 */
-	if (!current_is_kswapd())
+	if (cc->direct_compaction)
 		zone->compact_blockskip_flush = true;
 
 	return COMPACT_COMPLETE;
@@ -1338,10 +1338,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 	/*
 	 * Clear pageblock skip if there were failures recently and compaction
-	 * is about to be retried after being deferred. kswapd does not do
-	 * this reset as it'll reset the cached information when going to sleep.
+	 * is about to be retried after being deferred.
 	 */
-	if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+	if (compaction_restarting(zone, cc->order))
 		__reset_isolation_suitable(zone);
 
 	/*
@@ -1477,6 +1476,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
 		.mode = mode,
 		.alloc_flags = alloc_flags,
 		.classzone_idx = classzone_idx,
+		.direct_compaction = true,
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
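
[Illustration, not part of the patch] A minimal userspace sketch of what the new cc->direct_compaction test distinguishes: direct compactors request the deferred PG_migrate_skip flush, while kcompactd-style callers do not. The struct and function names below are hypothetical stand-ins for the kernel code above, not kernel code.

/*
 * Standalone model of the check this patch changes in __compact_finished():
 * the blockskip flush is now requested only by direct compaction,
 * identified by cc->direct_compaction instead of !current_is_kswapd().
 */
#include <stdbool.h>
#include <stdio.h>

struct zone_model {
	bool compact_blockskip_flush;
};

struct compact_control_model {
	bool direct_compaction;		/* set only by the direct-compaction path */
};

static void compact_finished_model(struct zone_model *zone,
				   struct compact_control_model *cc)
{
	/* Mirrors the new "if (cc->direct_compaction)" test. */
	if (cc->direct_compaction)
		zone->compact_blockskip_flush = true;
}

int main(void)
{
	struct zone_model zone = { .compact_blockskip_flush = false };
	struct compact_control_model direct = { .direct_compaction = true };
	struct compact_control_model background = { .direct_compaction = false };

	compact_finished_model(&zone, &background);
	printf("after kcompactd-style run: flush=%d\n", zone.compact_blockskip_flush);

	compact_finished_model(&zone, &direct);
	printf("after direct compaction:   flush=%d\n", zone.compact_blockskip_flush);
	return 0;
}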
diff --git a/mm/internal.h b/mm/internal.h
index b95952c2faec..4042a8a05672 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -172,6 +172,7 @@ struct compact_control {
 	unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
 	enum migrate_mode mode;		/* Async or sync migration mode */
 	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
+	bool direct_compaction;		/* False from kcompactd or /proc/... */
 	int order;			/* order a direct compactor needs */
 	const gfp_t gfp_mask;		/* gfp mask of a direct compactor */
 	const int alloc_flags;		/* alloc flags of a direct compactor */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5dcc71140108..f87cfaa955a8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2968,18 +2968,23 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
 	} while (memcg);
 }
 
-static bool zone_balanced(struct zone *zone, int order,
+static bool zone_balanced(struct zone *zone, int order, bool highorder,
 			  unsigned long balance_gap, int classzone_idx)
 {
-	if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
-				balance_gap, classzone_idx))
-		return false;
+	unsigned long mark = high_wmark_pages(zone) + balance_gap;
 
-	if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
-				order, 0, classzone_idx) == COMPACT_SKIPPED)
-		return false;
+	/*
+	 * When checking from pgdat_balanced(), kswapd should stop and sleep
+	 * when it reaches the high order-0 watermark and let kcompactd take
+	 * over. Other callers such as wakeup_kswapd() want to determine the
+	 * true high-order watermark.
+	 */
+	if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) {
+		mark += (1UL << order);
+		order = 0;
+	}
 
-	return true;
+	return zone_watermark_ok_safe(zone, order, mark, classzone_idx);
 }
 
 /*
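
[Illustration, not part of the patch] The reworked zone_balanced() folds a high-order request into an order-0 check when highorder is false: the target becomes high watermark + balance_gap + 2^order pages at order 0. A hypothetical standalone model of just that watermark arithmetic (the kernel additionally requires CONFIG_COMPACTION for this folding):

/* Hypothetical model of the reworked zone_balanced() watermark math. */
#include <stdbool.h>
#include <stdio.h>

/*
 * Compute the (order, mark) pair actually handed to the watermark check.
 * With highorder == false (the kswapd/pgdat_balanced() path), the order-N
 * request becomes an order-0 check with 1 << N extra pages, so kswapd can
 * stop early and let kcompactd assemble the contiguous block.
 */
static void effective_check(int order, bool highorder,
			    unsigned long high_wmark, unsigned long balance_gap,
			    int *eff_order, unsigned long *eff_mark)
{
	unsigned long mark = high_wmark + balance_gap;

	if (!highorder) {
		mark += 1UL << order;
		order = 0;
	}
	*eff_order = order;
	*eff_mark = mark;
}

int main(void)
{
	int eff_order;
	unsigned long eff_mark;

	/* order-3 request, high watermark 128 pages, no balance gap */
	effective_check(3, false, 128, 0, &eff_order, &eff_mark);
	printf("kswapd view:        order=%d, mark=%lu pages\n", eff_order, eff_mark);

	effective_check(3, true, 128, 0, &eff_order, &eff_mark);
	printf("wakeup_kswapd view: order=%d, mark=%lu pages\n", eff_order, eff_mark);
	return 0;
}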
@@ -3029,7 +3034,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 			continue;
 		}
 
-		if (zone_balanced(zone, order, 0, i))
+		if (zone_balanced(zone, order, false, 0, i))
 			balanced_pages += zone->managed_pages;
 		else if (!order)
 			return false;
@@ -3083,10 +3088,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
  */
 static bool kswapd_shrink_zone(struct zone *zone,
 			       int classzone_idx,
-			       struct scan_control *sc,
-			       unsigned long *nr_attempted)
+			       struct scan_control *sc)
 {
-	int testorder = sc->order;
 	unsigned long balance_gap;
 	bool lowmem_pressure;
 
@@ -3094,17 +3097,6 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
 
 	/*
-	 * Kswapd reclaims only single pages with compaction enabled. Trying
-	 * too hard to reclaim until contiguous free pages have become
-	 * available can hurt performance by evicting too much useful data
-	 * from memory. Do not reclaim more than needed for compaction.
-	 */
-	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
-			compaction_suitable(zone, sc->order, 0, classzone_idx)
-						!= COMPACT_SKIPPED)
-		testorder = 0;
-
-	/*
 	 * We put equal pressure on every zone, unless one zone has way too
 	 * many pages free already. The "too many pages" is defined as the
 	 * high wmark plus a "gap" where the gap is either the low
@@ -3118,15 +3110,12 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	 * reclaim is necessary
 	 */
 	lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
-	if (!lowmem_pressure && zone_balanced(zone, testorder,
+	if (!lowmem_pressure && zone_balanced(zone, sc->order, false,
 						balance_gap, classzone_idx))
 		return true;
 
 	shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
 
-	/* Account for the number of pages attempted to reclaim */
-	*nr_attempted += sc->nr_to_reclaim;
-
 	clear_bit(ZONE_WRITEBACK, &zone->flags);
 
 	/*
@@ -3136,7 +3125,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	 * waits.
 	 */
 	if (zone_reclaimable(zone) &&
-	    zone_balanced(zone, testorder, 0, classzone_idx)) {
+	    zone_balanced(zone, sc->order, false, 0, classzone_idx)) {
 		clear_bit(ZONE_CONGESTED, &zone->flags);
 		clear_bit(ZONE_DIRTY, &zone->flags);
 	}
@@ -3148,7 +3137,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at high_wmark_pages(zone).
  *
- * Returns the final order kswapd was reclaiming at
+ * Returns the highest zone idx kswapd was reclaiming at
  *
  * There is special handling here for zones which are full of pinned pages.
  * This can happen if the pages are all mlocked, or if they are all used by
@@ -3165,8 +3154,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
  * interoperates with the page allocator fallback scheme to ensure that aging
  * of pages is balanced across the zones.
  */
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
-							int *classzone_idx)
+static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 {
 	int i;
 	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
@@ -3183,9 +3171,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 	count_vm_event(PAGEOUTRUN);
 
 	do {
-		unsigned long nr_attempted = 0;
 		bool raise_priority = true;
-		bool pgdat_needs_compaction = (order > 0);
 
 		sc.nr_reclaimed = 0;
 
@@ -3220,7 +3206,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 				break;
 			}
 
-			if (!zone_balanced(zone, order, 0, 0)) {
+			if (!zone_balanced(zone, order, false, 0, 0)) {
 				end_zone = i;
 				break;
 			} else {
@@ -3236,24 +3222,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 		if (i < 0)
 			goto out;
 
-		for (i = 0; i <= end_zone; i++) {
-			struct zone *zone = pgdat->node_zones + i;
-
-			if (!populated_zone(zone))
-				continue;
-
-			/*
-			 * If any zone is currently balanced then kswapd will
-			 * not call compaction as it is expected that the
-			 * necessary pages are already available.
-			 */
-			if (pgdat_needs_compaction &&
-					zone_watermark_ok(zone, order,
-					low_wmark_pages(zone),
-					*classzone_idx, 0))
-				pgdat_needs_compaction = false;
-		}
-
 		/*
 		 * If we're getting trouble reclaiming, start doing writepage
 		 * even in laptop mode.
@@ -3297,8 +3265,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 			 * that that high watermark would be met at 100%
 			 * efficiency.
 			 */
-			if (kswapd_shrink_zone(zone, end_zone,
-					       &sc, &nr_attempted))
+			if (kswapd_shrink_zone(zone, end_zone, &sc))
 				raise_priority = false;
 		}
 
@@ -3311,49 +3278,29 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 				pfmemalloc_watermark_ok(pgdat))
 			wake_up_all(&pgdat->pfmemalloc_wait);
 
-		/*
-		 * Fragmentation may mean that the system cannot be rebalanced
-		 * for high-order allocations in all zones. If twice the
-		 * allocation size has been reclaimed and the zones are still
-		 * not balanced then recheck the watermarks at order-0 to
-		 * prevent kswapd reclaiming excessively. Assume that a
-		 * process requested a high-order can direct reclaim/compact.
-		 */
-		if (order && sc.nr_reclaimed >= 2UL << order)
-			order = sc.order = 0;
-
 		/* Check if kswapd should be suspending */
 		if (try_to_freeze() || kthread_should_stop())
 			break;
 
 		/*
-		 * Compact if necessary and kswapd is reclaiming at least the
-		 * high watermark number of pages as requsted
-		 */
-		if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
-			compact_pgdat(pgdat, order);
-
-		/*
 		 * Raise priority if scanning rate is too low or there was no
 		 * progress in reclaiming pages
 		 */
 		if (raise_priority || !sc.nr_reclaimed)
 			sc.priority--;
 	} while (sc.priority >= 1 &&
-		 !pgdat_balanced(pgdat, order, *classzone_idx));
+		 !pgdat_balanced(pgdat, order, classzone_idx));
 
 out:
 	/*
-	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
-	 * makes a decision on the order we were last reclaiming at. However,
-	 * if another caller entered the allocator slow path while kswapd
-	 * was awake, order will remain at the higher level
+	 * Return the highest zone idx we were reclaiming at so
+	 * prepare_kswapd_sleep() makes the same decisions as here.
 	 */
-	*classzone_idx = end_zone;
-	return order;
+	return end_zone;
 }
 
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
+				int classzone_idx, int balanced_classzone_idx)
 {
 	long remaining = 0;
 	DEFINE_WAIT(wait);
@@ -3364,7 +3311,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 
 	/* Try to sleep for a short interval */
-	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining,
+						balanced_classzone_idx)) {
 		remaining = schedule_timeout(HZ/10);
 		finish_wait(&pgdat->kswapd_wait, &wait);
 		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -3374,7 +3322,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	 * After a short sleep, check if it was a premature sleep. If not, then
 	 * go fully to sleep until explicitly woken up.
 	 */
-	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining,
+						balanced_classzone_idx)) {
 		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
 		/*
@@ -3395,6 +3344,12 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 		 */
 		reset_isolation_suitable(pgdat);
 
+		/*
+		 * We have freed the memory, now we should compact it to make
+		 * allocation of the requested order possible.
+		 */
+		wakeup_kcompactd(pgdat, order, classzone_idx);
+
 		if (!kthread_should_stop())
 			schedule();
 
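
[Illustration, not part of the patch] A hypothetical userspace model of the new hand-off: kswapd is willing to sleep once the zones it balanced meet the folded order-0 watermarks, and just before sleeping it wakes kcompactd with the original order and classzone_idx so the high-order work happens there. All names below are stand-ins, not kernel code.

/* Standalone model of the kswapd -> kcompactd hand-off added above. */
#include <stdbool.h>
#include <stdio.h>

static bool node_balanced_for_sleep(int balanced_classzone_idx)
{
	/* Stand-in for the prepare_kswapd_sleep()/pgdat_balanced() check. */
	return balanced_classzone_idx >= 0;
}

static void wakeup_kcompactd_model(int order, int classzone_idx)
{
	printf("kcompactd woken: compact up to zone %d for order-%d\n",
	       classzone_idx, order);
}

static void kswapd_try_to_sleep_model(int order, int classzone_idx,
				      int balanced_classzone_idx)
{
	if (!node_balanced_for_sleep(balanced_classzone_idx))
		return;		/* keep reclaiming instead of sleeping */

	/*
	 * Memory has been freed; ask kcompactd to assemble the requested
	 * order before kswapd sleeps (mirrors the added wakeup_kcompactd()
	 * call in the hunk above).
	 */
	wakeup_kcompactd_model(order, classzone_idx);
	printf("kswapd sleeps\n");
}

int main(void)
{
	/* order-4 wakeup for the node, zones 0..2 balanced */
	kswapd_try_to_sleep_model(4, 2, 2);
	return 0;
}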
@@ -3424,7 +3379,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 static int kswapd(void *p)
 {
 	unsigned long order, new_order;
-	unsigned balanced_order;
 	int classzone_idx, new_classzone_idx;
 	int balanced_classzone_idx;
 	pg_data_t *pgdat = (pg_data_t*)p;
@@ -3457,23 +3411,19 @@ static int kswapd(void *p)
 	set_freezable();
 
 	order = new_order = 0;
-	balanced_order = 0;
 	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
 	balanced_classzone_idx = classzone_idx;
 	for ( ; ; ) {
 		bool ret;
 
 		/*
-		 * If the last balance_pgdat was unsuccessful it's unlikely a
-		 * new request of a similar or harder type will succeed soon
-		 * so consider going to sleep on the basis we reclaimed at
+		 * While we were reclaiming, there might have been another
+		 * wakeup, so check the values.
 		 */
-		if (balanced_order == new_order) {
-			new_order = pgdat->kswapd_max_order;
-			new_classzone_idx = pgdat->classzone_idx;
-			pgdat->kswapd_max_order = 0;
-			pgdat->classzone_idx = pgdat->nr_zones - 1;
-		}
+		new_order = pgdat->kswapd_max_order;
+		new_classzone_idx = pgdat->classzone_idx;
+		pgdat->kswapd_max_order = 0;
+		pgdat->classzone_idx = pgdat->nr_zones - 1;
 
 		if (order < new_order || classzone_idx > new_classzone_idx) {
 			/*
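
[Illustration, not part of the patch] The simplified kswapd() loop now latches and resets the pending wakeup request on every iteration, instead of only when the previous balance_pgdat() satisfied the requested order. A hypothetical standalone model of that read-and-reset step:

/* Standalone model of the unconditional wakeup re-check in kswapd(). */
#include <stdio.h>

struct pgdat_model {
	int kswapd_max_order;	/* highest order requested by wakers */
	int classzone_idx;	/* lowest classzone_idx requested */
	int nr_zones;
};

static void fetch_pending_request(struct pgdat_model *pgdat,
				  int *new_order, int *new_classzone_idx)
{
	/* While we were reclaiming, there might have been another wakeup. */
	*new_order = pgdat->kswapd_max_order;
	*new_classzone_idx = pgdat->classzone_idx;
	pgdat->kswapd_max_order = 0;
	pgdat->classzone_idx = pgdat->nr_zones - 1;
}

int main(void)
{
	struct pgdat_model pgdat = {
		.kswapd_max_order = 3, .classzone_idx = 1, .nr_zones = 3,
	};
	int order, idx;

	fetch_pending_request(&pgdat, &order, &idx);
	printf("pending request: order=%d classzone_idx=%d\n", order, idx);
	fetch_pending_request(&pgdat, &order, &idx);
	printf("after reset:     order=%d classzone_idx=%d\n", order, idx);
	return 0;
}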
@@ -3483,7 +3433,7 @@ static int kswapd(void *p)
 			order = new_order;
 			classzone_idx = new_classzone_idx;
 		} else {
-			kswapd_try_to_sleep(pgdat, balanced_order,
+			kswapd_try_to_sleep(pgdat, order, classzone_idx,
 					    balanced_classzone_idx);
 			order = pgdat->kswapd_max_order;
 			classzone_idx = pgdat->classzone_idx;
@@ -3503,9 +3453,8 @@ static int kswapd(void *p)
 		 */
 		if (!ret) {
 			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
-			balanced_classzone_idx = classzone_idx;
-			balanced_order = balance_pgdat(pgdat, order,
-						&balanced_classzone_idx);
+			balanced_classzone_idx = balance_pgdat(pgdat, order,
+								classzone_idx);
 		}
 	}
 
@@ -3535,7 +3484,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 	}
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
-	if (zone_balanced(zone, order, 0, 0))
+	if (zone_balanced(zone, order, true, 0, 0))
 		return;
 
 	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);