Diffstat (limited to 'mm/vmscan.c'):
 mm/vmscan.c | 585 ++++++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 384 insertions(+), 201 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fa6a85378ee4..99b3ac7771ad 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -546,7 +546,6 @@ int remove_mapping(struct address_space *mapping, struct page *page)
 void putback_lru_page(struct page *page)
 {
 	int lru;
-	int active = !!TestClearPageActive(page);
 	int was_unevictable = PageUnevictable(page);
 
 	VM_BUG_ON(PageLRU(page));
@@ -561,8 +560,8 @@ redo:
 		 * unevictable page on [in]active list.
 		 * We know how to handle that.
 		 */
-		lru = active + page_lru_base_type(page);
-		lru_cache_add_lru(page, lru);
+		lru = page_lru_base_type(page);
+		lru_cache_add(page);
 	} else {
 		/*
 		 * Put unevictable pages directly on zone's unevictable
@@ -669,6 +668,35 @@ static enum page_references page_check_references(struct page *page,
 	return PAGEREF_RECLAIM;
 }
 
+/* Check if a page is dirty or under writeback */
+static void page_check_dirty_writeback(struct page *page,
+				       bool *dirty, bool *writeback)
+{
+	struct address_space *mapping;
+
+	/*
+	 * Anonymous pages are not handled by flushers and must be written
+	 * from reclaim context. Do not stall reclaim based on them
+	 */
+	if (!page_is_file_cache(page)) {
+		*dirty = false;
+		*writeback = false;
+		return;
+	}
+
+	/* By default assume that the page flags are accurate */
+	*dirty = PageDirty(page);
+	*writeback = PageWriteback(page);
+
+	/* Verify dirty/writeback state if the filesystem supports it */
+	if (!page_has_private(page))
+		return;
+
+	mapping = page_mapping(page);
+	if (mapping && mapping->a_ops->is_dirty_writeback)
+		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
+}
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
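
The new page_check_dirty_writeback() helper consults an optional is_dirty_writeback address_space operation so that filesystems which track state in page-private data (typically buffer heads) can report dirty/writeback more precisely than the page flags alone. As an illustration only, and not part of this patch, a sketch of what such a callback might look like for a buffer-head backed mapping is shown below; the function name is hypothetical.

/*
 * Illustrative sketch only (not from this patch): report a page as dirty or
 * under writeback if any of its buffer heads are. The name
 * example_is_dirty_writeback() is hypothetical.
 */
static void example_is_dirty_writeback(struct page *page,
				       bool *dirty, bool *writeback)
{
	struct buffer_head *head, *bh;

	*dirty = false;
	*writeback = false;

	/* The caller already checked page_has_private(), so buffers exist */
	bh = head = page_buffers(page);
	do {
		if (buffer_locked(bh))
			*writeback = true;	/* buffer currently under I/O */
		if (buffer_dirty(bh))
			*dirty = true;		/* buffer still to be written */
		bh = bh->b_this_page;
	} while (bh != head);
}

Such a callback would be wired up through the .is_dirty_writeback field of the filesystem's address_space_operations.
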
@@ -677,16 +705,21 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				      struct scan_control *sc,
 				      enum ttu_flags ttu_flags,
 				      unsigned long *ret_nr_dirty,
+				      unsigned long *ret_nr_unqueued_dirty,
+				      unsigned long *ret_nr_congested,
 				      unsigned long *ret_nr_writeback,
+				      unsigned long *ret_nr_immediate,
 				      bool force_reclaim)
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
 	int pgactivate = 0;
+	unsigned long nr_unqueued_dirty = 0;
 	unsigned long nr_dirty = 0;
 	unsigned long nr_congested = 0;
 	unsigned long nr_reclaimed = 0;
 	unsigned long nr_writeback = 0;
+	unsigned long nr_immediate = 0;
 
 	cond_resched();
 
@@ -696,6 +729,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		struct page *page;
 		int may_enter_fs;
 		enum page_references references = PAGEREF_RECLAIM_CLEAN;
+		bool dirty, writeback;
 
 		cond_resched();
 
@@ -723,25 +757,77 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
+		/*
+		 * The number of dirty pages determines if a zone is marked
+		 * reclaim_congested which affects wait_iff_congested. kswapd
+		 * will stall and start writing pages if the tail of the LRU
+		 * is all dirty unqueued pages.
+		 */
+		page_check_dirty_writeback(page, &dirty, &writeback);
+		if (dirty || writeback)
+			nr_dirty++;
+
+		if (dirty && !writeback)
+			nr_unqueued_dirty++;
+
+		/*
+		 * Treat this page as congested if the underlying BDI is or if
+		 * pages are cycling through the LRU so quickly that the
+		 * pages marked for immediate reclaim are making it to the
+		 * end of the LRU a second time.
+		 */
+		mapping = page_mapping(page);
+		if ((mapping && bdi_write_congested(mapping->backing_dev_info)) ||
+		    (writeback && PageReclaim(page)))
+			nr_congested++;
+
+		/*
+		 * If a page at the tail of the LRU is under writeback, there
+		 * are three cases to consider.
+		 *
+		 * 1) If reclaim is encountering an excessive number of pages
+		 *    under writeback and this page is both under writeback and
+		 *    PageReclaim then it indicates that pages are being queued
+		 *    for IO but are being recycled through the LRU before the
+		 *    IO can complete. Waiting on the page itself risks an
+		 *    indefinite stall if it is impossible to writeback the
+		 *    page due to IO error or disconnected storage so instead
+		 *    note that the LRU is being scanned too quickly and the
+		 *    caller can stall after page list has been processed.
+		 *
+		 * 2) Global reclaim encounters a page, memcg encounters a
+		 *    page that is not marked for immediate reclaim or
+		 *    the caller does not have __GFP_IO. In this case mark
+		 *    the page for immediate reclaim and continue scanning.
+		 *
+		 *    __GFP_IO is checked because a loop driver thread might
+		 *    enter reclaim, and deadlock if it waits on a page for
+		 *    which it is needed to do the write (loop masks off
+		 *    __GFP_IO|__GFP_FS for this reason); but more thought
+		 *    would probably show more reasons.
+		 *
+		 *    Don't require __GFP_FS, since we're not going into the
+		 *    FS, just waiting on its writeback completion. Worryingly,
+		 *    ext4 gfs2 and xfs allocate pages with
+		 *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
+		 *    may_enter_fs here is liable to OOM on them.
+		 *
+		 * 3) memcg encounters a page that is not already marked
+		 *    PageReclaim. memcg does not have any dirty pages
+		 *    throttling so we could easily OOM just because too many
+		 *    pages are in writeback and there is nothing else to
+		 *    reclaim. Wait for the writeback to complete.
+		 */
 		if (PageWriteback(page)) {
-			/*
-			 * memcg doesn't have any dirty pages throttling so we
-			 * could easily OOM just because too many pages are in
-			 * writeback and there is nothing else to reclaim.
-			 *
-			 * Check __GFP_IO, certainly because a loop driver
-			 * thread might enter reclaim, and deadlock if it waits
-			 * on a page for which it is needed to do the write
-			 * (loop masks off __GFP_IO|__GFP_FS for this reason);
-			 * but more thought would probably show more reasons.
-			 *
-			 * Don't require __GFP_FS, since we're not going into
-			 * the FS, just waiting on its writeback completion.
-			 * Worryingly, ext4 gfs2 and xfs allocate pages with
-			 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
-			 * testing may_enter_fs here is liable to OOM on them.
-			 */
-			if (global_reclaim(sc) ||
+			/* Case 1 above */
+			if (current_is_kswapd() &&
+			    PageReclaim(page) &&
+			    zone_is_reclaim_writeback(zone)) {
+				nr_immediate++;
+				goto keep_locked;
+
+			/* Case 2 above */
+			} else if (global_reclaim(sc) ||
 			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
 				/*
 				 * This is slightly racy - end_page_writeback()
@@ -756,9 +842,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				 */
 				SetPageReclaim(page);
 				nr_writeback++;
+
 				goto keep_locked;
+
+			/* Case 3 above */
+			} else {
+				wait_on_page_writeback(page);
 			}
-			wait_on_page_writeback(page);
 		}
 
 		if (!force_reclaim)
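
Flattened out, the three PageWriteback() cases added above amount to the decision below. This sketch is purely a restatement for readability; the enum and helper are hypothetical, not kernel code.

/* Hypothetical restatement of the PageWriteback() handling above. */
enum wb_action { WB_STALL_KSWAPD, WB_MARK_RECLAIM, WB_WAIT };

static enum wb_action writeback_action(struct page *page, struct zone *zone,
				       struct scan_control *sc)
{
	/* Case 1: kswapd sees marked pages recycling through the LRU under I/O */
	if (current_is_kswapd() && PageReclaim(page) &&
	    zone_is_reclaim_writeback(zone))
		return WB_STALL_KSWAPD;		/* count in nr_immediate, keep the page */

	/* Case 2: tag the page so it is reclaimed as soon as writeback ends */
	if (global_reclaim(sc) || !PageReclaim(page) ||
	    !(sc->gfp_mask & __GFP_IO))
		return WB_MARK_RECLAIM;		/* SetPageReclaim(), count in nr_writeback */

	/* Case 3: memcg reclaim with no dirty throttling, so wait here */
	return WB_WAIT;				/* wait_on_page_writeback() */
}
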
@@ -784,9 +874,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			if (!add_to_swap(page, page_list))
 				goto activate_locked;
 			may_enter_fs = 1;
-		}
 
-		mapping = page_mapping(page);
+			/* Adding to swap updated mapping */
+			mapping = page_mapping(page);
+		}
 
 		/*
 		 * The page is mapped into the page tables of one or more
@@ -806,16 +897,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		}
 
 		if (PageDirty(page)) {
-			nr_dirty++;
-
 			/*
 			 * Only kswapd can writeback filesystem pages to
-			 * avoid risk of stack overflow but do not writeback
-			 * unless under significant pressure.
+			 * avoid risk of stack overflow but only writeback
+			 * if many dirty pages have been encountered.
 			 */
 			if (page_is_file_cache(page) &&
 					(!current_is_kswapd() ||
-					 sc->priority >= DEF_PRIORITY - 2)) {
+					 !zone_is_reclaim_dirty(zone))) {
 				/*
 				 * Immediately reclaim when written back.
 				 * Similar in principal to deactivate_page()
@@ -838,7 +927,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			/* Page is dirty, try to write it out here */
 			switch (pageout(page, mapping, sc)) {
 			case PAGE_KEEP:
-				nr_congested++;
 				goto keep_locked;
 			case PAGE_ACTIVATE:
 				goto activate_locked;
@@ -946,22 +1034,16 @@ keep:
 		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
 
-	/*
-	 * Tag a zone as congested if all the dirty pages encountered were
-	 * backed by a congested BDI. In this case, reclaimers should just
-	 * back off and wait for congestion to clear because further reclaim
-	 * will encounter the same problem
-	 */
-	if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
-		zone_set_flag(zone, ZONE_CONGESTED);
-
 	free_hot_cold_page_list(&free_pages, 1);
 
 	list_splice(&ret_pages, page_list);
 	count_vm_events(PGACTIVATE, pgactivate);
 	mem_cgroup_uncharge_end();
 	*ret_nr_dirty += nr_dirty;
+	*ret_nr_congested += nr_congested;
+	*ret_nr_unqueued_dirty += nr_unqueued_dirty;
 	*ret_nr_writeback += nr_writeback;
+	*ret_nr_immediate += nr_immediate;
 	return nr_reclaimed;
 }
 
@@ -973,7 +1055,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 		.priority = DEF_PRIORITY,
 		.may_unmap = 1,
 	};
-	unsigned long ret, dummy1, dummy2;
+	unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
 	struct page *page, *next;
 	LIST_HEAD(clean_pages);
 
@@ -985,8 +1067,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 	}
 
 	ret = shrink_page_list(&clean_pages, zone, &sc,
 				TTU_UNMAP|TTU_IGNORE_ACCESS,
-				&dummy1, &dummy2, true);
+				&dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
 	list_splice(&clean_pages, page_list);
 	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
 	return ret;
@@ -1281,7 +1363,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	unsigned long nr_reclaimed = 0;
 	unsigned long nr_taken;
 	unsigned long nr_dirty = 0;
+	unsigned long nr_congested = 0;
+	unsigned long nr_unqueued_dirty = 0;
 	unsigned long nr_writeback = 0;
+	unsigned long nr_immediate = 0;
 	isolate_mode_t isolate_mode = 0;
 	int file = is_file_lru(lru);
 	struct zone *zone = lruvec_zone(lruvec);
@@ -1323,7 +1408,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 		return 0;
 
 	nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
-					&nr_dirty, &nr_writeback, false);
+				&nr_dirty, &nr_unqueued_dirty, &nr_congested,
+				&nr_writeback, &nr_immediate,
+				false);
 
 	spin_lock_irq(&zone->lru_lock);
 
@@ -1357,7 +1444,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	 * same way balance_dirty_pages() manages.
 	 *
 	 * This scales the number of dirty pages that must be under writeback
-	 * before throttling depending on priority. It is a simple backoff
+	 * before a zone gets flagged ZONE_WRITEBACK. It is a simple backoff
 	 * function that has the most effect in the range DEF_PRIORITY to
 	 * DEF_PRIORITY-2 which is the priority reclaim is considered to be
 	 * in trouble and reclaim is considered to be in trouble.
@@ -1368,9 +1455,53 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	 * ...
 	 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
 	 *                isolated page is PageWriteback
+	 *
+	 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
+	 * of pages under pages flagged for immediate reclaim and stall if any
+	 * are encountered in the nr_immediate check below.
 	 */
 	if (nr_writeback && nr_writeback >=
 			(nr_taken >> (DEF_PRIORITY - sc->priority)))
+		zone_set_flag(zone, ZONE_WRITEBACK);
+
+	/*
+	 * memcg will stall in page writeback so only consider forcibly
+	 * stalling for global reclaim
+	 */
+	if (global_reclaim(sc)) {
+		/*
+		 * Tag a zone as congested if all the dirty pages scanned were
+		 * backed by a congested BDI and wait_iff_congested will stall.
+		 */
+		if (nr_dirty && nr_dirty == nr_congested)
+			zone_set_flag(zone, ZONE_CONGESTED);
+
+		/*
+		 * If dirty pages are scanned that are not queued for IO, it
+		 * implies that flushers are not keeping up. In this case, flag
+		 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
+		 * pages from reclaim context. It will forcibly stall in the
+		 * next check.
+		 */
+		if (nr_unqueued_dirty == nr_taken)
+			zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
+
+		/*
+		 * In addition, if kswapd scans pages marked marked for
+		 * immediate reclaim and under writeback (nr_immediate), it
+		 * implies that pages are cycling through the LRU faster than
+		 * they are written so also forcibly stall.
+		 */
+		if (nr_unqueued_dirty == nr_taken || nr_immediate)
+			congestion_wait(BLK_RW_ASYNC, HZ/10);
+	}
+
+	/*
+	 * Stall direct reclaim for IO completions if underlying BDIs or zone
+	 * is congested. Allow kswapd to continue until it starts encountering
+	 * unqueued dirty pages or cycling through the LRU too quickly.
+	 */
+	if (!sc->hibernation_mode && !current_is_kswapd())
 		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
 
 	trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
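
The ZONE_WRITEBACK trigger above halves with every drop in priority. As a worked example (DEF_PRIORITY is 12; the batch of 128 isolated pages is hypothetical), the number of writeback pages needed to flag the zone is:

	priority 12: 128 >> 0 = 128   (every isolated page under writeback)
	priority 11: 128 >> 1 =  64
	priority 10: 128 >> 2 =  32
	priority  9: 128 >> 3 =  16
	priority  6: 128 >> 6 =   2

So the flag is set more and more readily as reclaim gets into trouble, after which kswapd starts counting nr_immediate pages and stalling on them.
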
@@ -1822,17 +1953,25 @@ out:
 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
 	unsigned long nr[NR_LRU_LISTS];
+	unsigned long targets[NR_LRU_LISTS];
 	unsigned long nr_to_scan;
 	enum lru_list lru;
 	unsigned long nr_reclaimed = 0;
 	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
 	struct blk_plug plug;
+	bool scan_adjusted = false;
 
 	get_scan_count(lruvec, sc, nr);
 
+	/* Record the original scan target for proportional adjustments later */
+	memcpy(targets, nr, sizeof(nr));
+
 	blk_start_plug(&plug);
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
 					nr[LRU_INACTIVE_FILE]) {
+		unsigned long nr_anon, nr_file, percentage;
+		unsigned long nr_scanned;
+
 		for_each_evictable_lru(lru) {
 			if (nr[lru]) {
 				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
@@ -1842,17 +1981,60 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 							    lruvec, sc);
 			}
 		}
+
+		if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
+			continue;
+
 		/*
-		 * On large memory systems, scan >> priority can become
-		 * really large. This is fine for the starting priority;
-		 * we want to put equal scanning pressure on each zone.
-		 * However, if the VM has a harder time of freeing pages,
-		 * with multiple processes reclaiming pages, the total
-		 * freeing target can get unreasonably large.
+		 * For global direct reclaim, reclaim only the number of pages
+		 * requested. Less care is taken to scan proportionally as it
+		 * is more important to minimise direct reclaim stall latency
+		 * than it is to properly age the LRU lists.
 		 */
-		if (nr_reclaimed >= nr_to_reclaim &&
-		    sc->priority < DEF_PRIORITY)
+		if (global_reclaim(sc) && !current_is_kswapd())
 			break;
+
+		/*
+		 * For kswapd and memcg, reclaim at least the number of pages
+		 * requested. Ensure that the anon and file LRUs shrink
+		 * proportionally what was requested by get_scan_count(). We
+		 * stop reclaiming one LRU and reduce the amount scanning
+		 * proportional to the original scan target.
+		 */
+		nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
+		nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
+
+		if (nr_file > nr_anon) {
+			unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
+						targets[LRU_ACTIVE_ANON] + 1;
+			lru = LRU_BASE;
+			percentage = nr_anon * 100 / scan_target;
+		} else {
+			unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
+						targets[LRU_ACTIVE_FILE] + 1;
+			lru = LRU_FILE;
+			percentage = nr_file * 100 / scan_target;
+		}
+
+		/* Stop scanning the smaller of the LRU */
+		nr[lru] = 0;
+		nr[lru + LRU_ACTIVE] = 0;
+
+		/*
+		 * Recalculate the other LRU scan count based on its original
+		 * scan target and the percentage scanning already complete
+		 */
+		lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
+		nr_scanned = targets[lru] - nr[lru];
+		nr[lru] = targets[lru] * (100 - percentage) / 100;
+		nr[lru] -= min(nr[lru], nr_scanned);
+
+		lru += LRU_ACTIVE;
+		nr_scanned = targets[lru] - nr[lru];
+		nr[lru] = targets[lru] * (100 - percentage) / 100;
+		nr[lru] -= min(nr[lru], nr_scanned);
+
+		scan_adjusted = true;
 	}
 	blk_finish_plug(&plug);
 	sc->nr_reclaimed += nr_reclaimed;
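
The proportional adjustment above is easiest to see with numbers (hypothetical ones, not from the patch). Suppose get_scan_count() set targets of 100 anon and 400 file pages on the inactive lists only, and nr_to_reclaim is met while nr still holds 60 anon and 300 file:

	nr_file (300) > nr_anon (60), so anon is the smaller side:
	    scan_target = 100 + 1 = 101
	    percentage  = 60 * 100 / 101 = 59   (about 59% of the anon target is still unscanned)
	anon scanning stops: nr[anon lists] = 0
	file is rescaled to leave the same 59% unscanned:
	    total file pages to scan = 400 * (100 - 59) / 100 = 164
	    already scanned          = 400 - 300 = 100
	    nr[inactive file]        = 164 - 100 = 64

So both LRUs end up scanned in roughly the ratio get_scan_count() asked for, even though reclaim stopped early once enough pages were freed.
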
@@ -2222,17 +2404,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 						WB_REASON_TRY_TO_FREE_PAGES);
 			sc->may_writepage = 1;
 		}
-
-		/* Take a nap, wait for some writeback to complete */
-		if (!sc->hibernation_mode && sc->nr_scanned &&
-		    sc->priority < DEF_PRIORITY - 2) {
-			struct zone *preferred_zone;
-
-			first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
-						&cpuset_current_mems_allowed,
-						&preferred_zone);
-			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
-		}
 	} while (--sc->priority >= 0);
 
 out:
@@ -2601,6 +2772,91 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 }
 
 /*
+ * kswapd shrinks the zone by the number of pages required to reach
+ * the high watermark.
+ *
+ * Returns true if kswapd scanned at least the requested number of pages to
+ * reclaim or if the lack of progress was due to pages under writeback.
+ * This is used to determine if the scanning priority needs to be raised.
+ */
+static bool kswapd_shrink_zone(struct zone *zone,
+			       int classzone_idx,
+			       struct scan_control *sc,
+			       unsigned long lru_pages,
+			       unsigned long *nr_attempted)
+{
+	unsigned long nr_slab;
+	int testorder = sc->order;
+	unsigned long balance_gap;
+	struct reclaim_state *reclaim_state = current->reclaim_state;
+	struct shrink_control shrink = {
+		.gfp_mask = sc->gfp_mask,
+	};
+	bool lowmem_pressure;
+
+	/* Reclaim above the high watermark. */
+	sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
+
+	/*
+	 * Kswapd reclaims only single pages with compaction enabled. Trying
+	 * too hard to reclaim until contiguous free pages have become
+	 * available can hurt performance by evicting too much useful data
+	 * from memory. Do not reclaim more than needed for compaction.
+	 */
+	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
+			compaction_suitable(zone, sc->order) !=
+				COMPACT_SKIPPED)
+		testorder = 0;
+
+	/*
+	 * We put equal pressure on every zone, unless one zone has way too
+	 * many pages free already. The "too many pages" is defined as the
+	 * high wmark plus a "gap" where the gap is either the low
+	 * watermark or 1% of the zone, whichever is smaller.
+	 */
+	balance_gap = min(low_wmark_pages(zone),
+		(zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+		KSWAPD_ZONE_BALANCE_GAP_RATIO);
+
+	/*
+	 * If there is no low memory pressure or the zone is balanced then no
+	 * reclaim is necessary
+	 */
+	lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
+	if (!lowmem_pressure && zone_balanced(zone, testorder,
+						balance_gap, classzone_idx))
+		return true;
+
+	shrink_zone(zone, sc);
+
+	reclaim_state->reclaimed_slab = 0;
+	nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages);
+	sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+
+	/* Account for the number of pages attempted to reclaim */
+	*nr_attempted += sc->nr_to_reclaim;
+
+	if (nr_slab == 0 && !zone_reclaimable(zone))
+		zone->all_unreclaimable = 1;
+
+	zone_clear_flag(zone, ZONE_WRITEBACK);
+
+	/*
+	 * If a zone reaches its high watermark, consider it to be no longer
+	 * congested. It's possible there are dirty pages backed by congested
+	 * BDIs but as pressure is relieved, speculatively avoid congestion
+	 * waits.
+	 */
+	if (!zone->all_unreclaimable &&
+	    zone_balanced(zone, testorder, 0, classzone_idx)) {
+		zone_clear_flag(zone, ZONE_CONGESTED);
+		zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
+	}
+
+	return sc->nr_scanned >= sc->nr_to_reclaim;
+}
+
+/*
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at high_wmark_pages(zone).
  *
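
In kswapd_shrink_zone() above, the balance gap is the smaller of the low watermark and roughly 1% of the zone (KSWAPD_ZONE_BALANCE_GAP_RATIO is 100; the +RATIO-1 term rounds up). A hypothetical example for a zone of 1,048,576 managed 4 KB pages (4 GB) with a low watermark of 16,384 pages:

	gap from the ratio = (1048576 + 99) / 100 = 10486 pages  (~41 MB)
	low watermark      = 16384 pages                         (~64 MB)
	balance_gap        = min(16384, 10486) = 10486 pages

The zone only counts as balanced once it clears the high watermark by this gap; the later zone_balanced() check that clears ZONE_CONGESTED and ZONE_TAIL_LRU_DIRTY uses a gap of 0.
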
@@ -2624,35 +2880,28 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 							int *classzone_idx)
 {
-	bool pgdat_is_balanced = false;
 	int i;
 	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
-	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long nr_soft_reclaimed;
 	unsigned long nr_soft_scanned;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
+		.priority = DEF_PRIORITY,
 		.may_unmap = 1,
 		.may_swap = 1,
-		/*
-		 * kswapd doesn't want to be bailed out while reclaim. because
-		 * we want to put equal scanning pressure on each zone.
-		 */
-		.nr_to_reclaim = ULONG_MAX,
+		.may_writepage = !laptop_mode,
 		.order = order,
 		.target_mem_cgroup = NULL,
 	};
-	struct shrink_control shrink = {
-		.gfp_mask = sc.gfp_mask,
-	};
-loop_again:
-	sc.priority = DEF_PRIORITY;
-	sc.nr_reclaimed = 0;
-	sc.may_writepage = !laptop_mode;
 	count_vm_event(PAGEOUTRUN);
 
 	do {
 		unsigned long lru_pages = 0;
+		unsigned long nr_attempted = 0;
+		bool raise_priority = true;
+		bool pgdat_needs_compaction = (order > 0);
+
+		sc.nr_reclaimed = 0;
 
 		/*
 		 * Scan in the highmem->dma direction for the highest
@@ -2689,23 +2938,46 @@ loop_again:
 				end_zone = i;
 				break;
 			} else {
-				/* If balanced, clear the congested flag */
+				/*
+				 * If balanced, clear the dirty and congested
+				 * flags
+				 */
 				zone_clear_flag(zone, ZONE_CONGESTED);
+				zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
 			}
 		}
 
-		if (i < 0) {
-			pgdat_is_balanced = true;
+		if (i < 0)
 			goto out;
-		}
 
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 
+			if (!populated_zone(zone))
+				continue;
+
 			lru_pages += zone_reclaimable_pages(zone);
+
+			/*
+			 * If any zone is currently balanced then kswapd will
+			 * not call compaction as it is expected that the
+			 * necessary pages are already available.
+			 */
+			if (pgdat_needs_compaction &&
+					zone_watermark_ok(zone, order,
+						low_wmark_pages(zone),
+						*classzone_idx, 0))
+				pgdat_needs_compaction = false;
 		}
 
 		/*
+		 * If we're getting trouble reclaiming, start doing writepage
+		 * even in laptop mode.
+		 */
+		if (sc.priority < DEF_PRIORITY - 2)
+			sc.may_writepage = 1;
+
+		/*
 		 * Now scan the zone in the dma->highmem direction, stopping
 		 * at the last zone which needs scanning.
 		 *
@@ -2716,8 +2988,6 @@ loop_again:
 		 */
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
-			int nr_slab, testorder;
-			unsigned long balance_gap;
 
 			if (!populated_zone(zone))
 				continue;
@@ -2738,65 +3008,14 @@ loop_again:
 			sc.nr_reclaimed += nr_soft_reclaimed;
 
 			/*
-			 * We put equal pressure on every zone, unless
-			 * one zone has way too many pages free
-			 * already. The "too many pages" is defined
-			 * as the high wmark plus a "gap" where the
-			 * gap is either the low watermark or 1%
-			 * of the zone, whichever is smaller.
+			 * There should be no need to raise the scanning
+			 * priority if enough pages are already being scanned
+			 * that that high watermark would be met at 100%
+			 * efficiency.
 			 */
-			balance_gap = min(low_wmark_pages(zone),
-				(zone->managed_pages +
-					KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
-				KSWAPD_ZONE_BALANCE_GAP_RATIO);
-			/*
-			 * Kswapd reclaims only single pages with compaction
-			 * enabled. Trying too hard to reclaim until contiguous
-			 * free pages have become available can hurt performance
-			 * by evicting too much useful data from memory.
-			 * Do not reclaim more than needed for compaction.
-			 */
-			testorder = order;
-			if (IS_ENABLED(CONFIG_COMPACTION) && order &&
-					compaction_suitable(zone, order) !=
-						COMPACT_SKIPPED)
-				testorder = 0;
-
-			if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
-			    !zone_balanced(zone, testorder,
-					   balance_gap, end_zone)) {
-				shrink_zone(zone, &sc);
-
-				reclaim_state->reclaimed_slab = 0;
-				nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
-				sc.nr_reclaimed += reclaim_state->reclaimed_slab;
-
-				if (nr_slab == 0 && !zone_reclaimable(zone))
-					zone->all_unreclaimable = 1;
-			}
-
-			/*
-			 * If we're getting trouble reclaiming, start doing
-			 * writepage even in laptop mode.
-			 */
-			if (sc.priority < DEF_PRIORITY - 2)
-				sc.may_writepage = 1;
-
-			if (zone->all_unreclaimable) {
-				if (end_zone && end_zone == i)
-					end_zone--;
-				continue;
-			}
-
-			if (zone_balanced(zone, testorder, 0, end_zone))
-				/*
-				 * If a zone reaches its high watermark,
-				 * consider it to be no longer congested. It's
-				 * possible there are dirty pages backed by
-				 * congested BDIs but as pressure is relieved,
-				 * speculatively avoid congestion waits
-				 */
-				zone_clear_flag(zone, ZONE_CONGESTED);
+			if (kswapd_shrink_zone(zone, end_zone, &sc,
+					lru_pages, &nr_attempted))
+				raise_priority = false;
 		}
 
 		/*
@@ -2808,74 +3027,38 @@ loop_again:
 						pfmemalloc_watermark_ok(pgdat))
 			wake_up(&pgdat->pfmemalloc_wait);
 
-		if (pgdat_balanced(pgdat, order, *classzone_idx)) {
-			pgdat_is_balanced = true;
-			break;		/* kswapd: all done */
-		}
-
 		/*
-		 * We do this so kswapd doesn't build up large priorities for
-		 * example when it is freeing in parallel with allocators. It
-		 * matches the direct reclaim path behaviour in terms of impact
-		 * on zone->*_priority.
+		 * Fragmentation may mean that the system cannot be rebalanced
+		 * for high-order allocations in all zones. If twice the
+		 * allocation size has been reclaimed and the zones are still
+		 * not balanced then recheck the watermarks at order-0 to
+		 * prevent kswapd reclaiming excessively. Assume that a
+		 * process requested a high-order can direct reclaim/compact.
 		 */
-		if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
-			break;
-	} while (--sc.priority >= 0);
-
-out:
-	if (!pgdat_is_balanced) {
-		cond_resched();
+		if (order && sc.nr_reclaimed >= 2UL << order)
+			order = sc.order = 0;
 
-		try_to_freeze();
+		/* Check if kswapd should be suspending */
+		if (try_to_freeze() || kthread_should_stop())
+			break;
 
 		/*
-		 * Fragmentation may mean that the system cannot be
-		 * rebalanced for high-order allocations in all zones.
-		 * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX,
-		 * it means the zones have been fully scanned and are still
-		 * not balanced. For high-order allocations, there is
-		 * little point trying all over again as kswapd may
-		 * infinite loop.
-		 *
-		 * Instead, recheck all watermarks at order-0 as they
-		 * are the most important. If watermarks are ok, kswapd will go
-		 * back to sleep. High-order users can still perform direct
-		 * reclaim if they wish.
+		 * Compact if necessary and kswapd is reclaiming at least the
+		 * high watermark number of pages as requsted
 		 */
-		if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
-			order = sc.order = 0;
-
-		goto loop_again;
-	}
-
-	/*
-	 * If kswapd was reclaiming at a higher order, it has the option of
-	 * sleeping without all zones being balanced. Before it does, it must
-	 * ensure that the watermarks for order-0 on *all* zones are met and
-	 * that the congestion flags are cleared. The congestion flag must
-	 * be cleared as kswapd is the only mechanism that clears the flag
-	 * and it is potentially going to sleep here.
-	 */
-	if (order) {
-		int zones_need_compaction = 1;
-
-		for (i = 0; i <= end_zone; i++) {
-			struct zone *zone = pgdat->node_zones + i;
-
-			if (!populated_zone(zone))
-				continue;
-
-			/* Check if the memory needs to be defragmented. */
-			if (zone_watermark_ok(zone, order,
-					low_wmark_pages(zone), *classzone_idx, 0))
-				zones_need_compaction = 0;
-		}
-
-		if (zones_need_compaction)
+		if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
 			compact_pgdat(pgdat, order);
-	}
 
+		/*
+		 * Raise priority if scanning rate is too low or there was no
+		 * progress in reclaiming pages
+		 */
+		if (raise_priority || !sc.nr_reclaimed)
+			sc.priority--;
+	} while (sc.priority >= 1 &&
+		 !pgdat_balanced(pgdat, order, *classzone_idx));
+
+out:
 	/*
 	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
 	 * makes a decision on the order we were last reclaiming at. However,