Diffstat (limited to 'mm/vmscan.c')
 -rw-r--r--  mm/vmscan.c  380
 1 file changed, 207 insertions, 173 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index febbc044e792..a1893c050795 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -95,8 +95,6 @@ struct scan_control {
 	/* Can pages be swapped as part of reclaim? */
 	int may_swap;

-	int swappiness;
-
 	int order;

 	/*
@@ -173,7 +171,8 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
 				struct scan_control *sc, enum lru_list lru)
 {
 	if (!scanning_global_lru(sc))
-		return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru);
+		return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup,
+				zone_to_nid(zone), zone_idx(zone), BIT(lru));

 	return zone_page_state(zone, NR_LRU_BASE + lru);
 }
@@ -496,15 +495,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 		return PAGE_ACTIVATE;
 	}

-	/*
-	 * Wait on writeback if requested to. This happens when
-	 * direct reclaiming a large contiguous area and the
-	 * first attempt to free a range of pages fails.
-	 */
-	if (PageWriteback(page) &&
-	    (sc->reclaim_mode & RECLAIM_MODE_SYNC))
-		wait_on_page_writeback(page);
-
 	if (!PageWriteback(page)) {
 		/* synchronous write or broken a_ops? */
 		ClearPageReclaim(page);
@@ -643,13 +633,14 @@ redo:
 		lru = LRU_UNEVICTABLE;
 		add_page_to_unevictable_list(page);
 		/*
-		 * When racing with an mlock clearing (page is
-		 * unlocked), make sure that if the other thread does
-		 * not observe our setting of PG_lru and fails
-		 * isolation, we see PG_mlocked cleared below and move
+		 * When racing with an mlock or AS_UNEVICTABLE clearing
+		 * (page is unlocked) make sure that if the other thread
+		 * does not observe our setting of PG_lru and fails
+		 * isolation/check_move_unevictable_page,
+		 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
 		 * the page back to the evictable list.
 		 *
-		 * The other side is TestClearPageMlocked().
+		 * The other side is TestClearPageMlocked() or shmem_lock().
 		 */
 		smp_mb();
 	}
@@ -760,7 +751,10 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
 				      struct zone *zone,
-				      struct scan_control *sc)
+				      struct scan_control *sc,
+				      int priority,
+				      unsigned long *ret_nr_dirty,
+				      unsigned long *ret_nr_writeback)
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
@@ -768,6 +762,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 	unsigned long nr_dirty = 0;
 	unsigned long nr_congested = 0;
 	unsigned long nr_reclaimed = 0;
+	unsigned long nr_writeback = 0;

 	cond_resched();

@@ -804,13 +799,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

 		if (PageWriteback(page)) {
+			nr_writeback++;
 			/*
-			 * Synchronous reclaim is performed in two passes,
-			 * first an asynchronous pass over the list to
-			 * start parallel writeback, and a second synchronous
-			 * pass to wait for the IO to complete. Wait here
-			 * for any page for which writeback has already
-			 * started.
+			 * Synchronous reclaim cannot queue pages for
+			 * writeback due to the possibility of stack overflow
+			 * but if it encounters a page under writeback, wait
+			 * for the IO to complete.
 			 */
 			if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
 			    may_enter_fs)
@@ -866,6 +860,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (PageDirty(page)) {
 			nr_dirty++;

+			/*
+			 * Only kswapd can writeback filesystem pages to
+			 * avoid risk of stack overflow but do not writeback
+			 * unless under significant pressure.
+			 */
+			if (page_is_file_cache(page) &&
+					(!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
+				/*
+				 * Immediately reclaim when written back.
+				 * Similar in principal to deactivate_page()
+				 * except we already have the page isolated
+				 * and know it's dirty
+				 */
+				inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
+				SetPageReclaim(page);
+
+				goto keep_locked;
+			}
+
 			if (references == PAGEREF_RECLAIM_CLEAN)
 				goto keep_locked;
 			if (!may_enter_fs)
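
To restate the rule the hunk above adds (a minimal sketch, not part of the patch; the helper name and boolean parameters are made up for illustration): dirty file pages are only written back from reclaim by kswapd, and only after the scan priority has dropped far enough to signal real pressure; everyone else tags the page with SetPageReclaim() and leaves it to the flusher threads.

#include <stdbool.h>

/* Sketch of the decision in shrink_page_list() after this change. */
static bool may_writeback_file_page(bool is_kswapd, int priority, int def_priority)
{
	/* negation of: !current_is_kswapd() || priority >= DEF_PRIORITY - 2 */
	return is_kswapd && priority < def_priority - 2;
}

With DEF_PRIORITY at 12, this means a direct reclaimer never issues writeback on file pages here, and kswapd only starts doing so once the priority has fallen below 10.
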
@@ -1000,6 +1013,8 @@ keep_lumpy:

 	list_splice(&ret_pages, page_list);
 	count_vm_events(PGACTIVATE, pgactivate);
+	*ret_nr_dirty += nr_dirty;
+	*ret_nr_writeback += nr_writeback;
 	return nr_reclaimed;
 }

@@ -1013,23 +1028,27 @@ keep_lumpy:
  *
  * returns 0 on success, -ve errno on failure.
  */
-int __isolate_lru_page(struct page *page, int mode, int file)
+int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
 {
+	bool all_lru_mode;
 	int ret = -EINVAL;

 	/* Only take pages on the LRU. */
 	if (!PageLRU(page))
 		return ret;

+	all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
+		(ISOLATE_ACTIVE|ISOLATE_INACTIVE);
+
 	/*
 	 * When checking the active state, we need to be sure we are
 	 * dealing with comparible boolean values. Take the logical not
 	 * of each.
 	 */
-	if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+	if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
 		return ret;

-	if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
+	if (!all_lru_mode && !!page_is_file_cache(page) != file)
 		return ret;

 	/*
@@ -1042,6 +1061,12 @@ int __isolate_lru_page(struct page *page, int mode, int file)

 	ret = -EBUSY;

+	if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page)))
+		return ret;
+
+	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
+		return ret;
+
 	if (likely(get_page_unless_zero(page))) {
 		/*
 		 * Be careful not to clear PageLRU until after we're
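
The two hunks above turn the isolation mode into a bitmask (isolate_mode_t) and make __isolate_lru_page() honour two new flags. A minimal sketch of the idea, with hypothetical flag names and values standing in for the real ISOLATE_* definitions in the mm headers:

/* Hypothetical stand-ins for the real ISOLATE_* flags; values are illustrative. */
typedef unsigned int isolate_mode_sketch_t;
#define SK_ISOLATE_INACTIVE	0x1	/* pull pages off the inactive list */
#define SK_ISOLATE_ACTIVE	0x2	/* pull pages off the active list */
#define SK_ISOLATE_CLEAN	0x4	/* skip dirty or writeback pages */
#define SK_ISOLATE_UNMAPPED	0x8	/* skip pages mapped into page tables */

static int mode_allows_page(isolate_mode_sketch_t mode, int page_active,
			    int page_dirty_or_writeback, int page_mapped)
{
	int all_lru = (mode & (SK_ISOLATE_ACTIVE | SK_ISOLATE_INACTIVE)) ==
		      (SK_ISOLATE_ACTIVE | SK_ISOLATE_INACTIVE);

	/* unless both lists are wanted, the active state must match the request */
	if (!all_lru && !page_active != !(mode & SK_ISOLATE_ACTIVE))
		return 0;
	if ((mode & SK_ISOLATE_CLEAN) && page_dirty_or_writeback)
		return 0;
	if ((mode & SK_ISOLATE_UNMAPPED) && page_mapped)
		return 0;
	return 1;
}
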
@@ -1077,7 +1102,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)
  */
 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		struct list_head *src, struct list_head *dst,
-		unsigned long *scanned, int order, int mode, int file)
+		unsigned long *scanned, int order, isolate_mode_t mode,
+		int file)
 {
 	unsigned long nr_taken = 0;
 	unsigned long nr_lumpy_taken = 0;
@@ -1202,8 +1228,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 static unsigned long isolate_pages_global(unsigned long nr,
 					struct list_head *dst,
 					unsigned long *scanned, int order,
-					int mode, struct zone *z,
-					int active, int file)
+					isolate_mode_t mode,
+					struct zone *z, int active, int file)
 {
 	int lru = LRU_BASE;
 	if (active)
@@ -1395,7 +1421,7 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
 }

 /*
- * Returns true if the caller should wait to clean dirty/writeback pages.
+ * Returns true if a direct reclaim should wait on pages under writeback.
  *
  * If we are direct reclaiming for contiguous pages and we do not reclaim
  * everything in the list, try again and wait for writeback IO to complete.
@@ -1417,7 +1443,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
 	if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
 		return false;

-	/* If we have relaimed everything on the isolated list, no stall */
+	/* If we have reclaimed everything on the isolated list, no stall */
 	if (nr_freed == nr_taken)
 		return false;

@@ -1449,6 +1475,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 	unsigned long nr_taken;
 	unsigned long nr_anon;
 	unsigned long nr_file;
+	unsigned long nr_dirty = 0;
+	unsigned long nr_writeback = 0;
+	isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;

 	while (unlikely(too_many_isolated(zone, file, sc))) {
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1459,15 +1488,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 	}

 	set_reclaim_mode(priority, sc, false);
+	if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
+		reclaim_mode |= ISOLATE_ACTIVE;
+
 	lru_add_drain();
+
+	if (!sc->may_unmap)
+		reclaim_mode |= ISOLATE_UNMAPPED;
+	if (!sc->may_writepage)
+		reclaim_mode |= ISOLATE_CLEAN;
+
 	spin_lock_irq(&zone->lru_lock);

 	if (scanning_global_lru(sc)) {
-		nr_taken = isolate_pages_global(nr_to_scan,
-			&page_list, &nr_scanned, sc->order,
-			sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
-					ISOLATE_BOTH : ISOLATE_INACTIVE,
-			zone, 0, file);
+		nr_taken = isolate_pages_global(nr_to_scan, &page_list,
+			&nr_scanned, sc->order, reclaim_mode, zone, 0, file);
 		zone->pages_scanned += nr_scanned;
 		if (current_is_kswapd())
 			__count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1476,12 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 			__count_zone_vm_events(PGSCAN_DIRECT, zone,
 					       nr_scanned);
 	} else {
-		nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
-			&page_list, &nr_scanned, sc->order,
-			sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
-					ISOLATE_BOTH : ISOLATE_INACTIVE,
-			zone, sc->mem_cgroup,
-			0, file);
+		nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
+			&nr_scanned, sc->order, reclaim_mode, zone,
+			sc->mem_cgroup, 0, file);
 		/*
 		 * mem_cgroup_isolate_pages() keeps track of
 		 * scanned pages on its own.
@@ -1497,12 +1529,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,

 	spin_unlock_irq(&zone->lru_lock);

-	nr_reclaimed = shrink_page_list(&page_list, zone, sc);
+	nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority,
+						&nr_dirty, &nr_writeback);

 	/* Check if we should syncronously wait for writeback */
 	if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
 		set_reclaim_mode(priority, sc, true);
-		nr_reclaimed += shrink_page_list(&page_list, zone, sc);
+		nr_reclaimed += shrink_page_list(&page_list, zone, sc,
+					priority, &nr_dirty, &nr_writeback);
 	}

 	local_irq_disable();
@@ -1512,6 +1546,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,

 	putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);

+	/*
+	 * If reclaim is isolating dirty pages under writeback, it implies
+	 * that the long-lived page allocation rate is exceeding the page
+	 * laundering rate. Either the global limits are not being effective
+	 * at throttling processes due to the page distribution throughout
+	 * zones or there is heavy usage of a slow backing device. The
+	 * only option is to throttle from reclaim context which is not ideal
+	 * as there is no guarantee the dirtying process is throttled in the
+	 * same way balance_dirty_pages() manages.
+	 *
+	 * This scales the number of dirty pages that must be under writeback
+	 * before throttling depending on priority. It is a simple backoff
+	 * function that has the most effect in the range DEF_PRIORITY to
+	 * DEF_PRIORITY-2 which is the priority reclaim is considered to be
+	 * in trouble and reclaim is considered to be in trouble.
+	 *
+	 * DEF_PRIORITY   100% isolated pages must be PageWriteback to throttle
+	 * DEF_PRIORITY-1  50% must be PageWriteback
+	 * DEF_PRIORITY-2  25% must be PageWriteback, kswapd in trouble
+	 * ...
+	 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
+	 *                isolated page is PageWriteback
+	 */
+	if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
+		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+
 	trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
 		zone_idx(zone),
 		nr_scanned, nr_reclaimed,
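
To make the backoff table in the new comment concrete: with DEF_PRIORITY equal to 12 and SWAP_CLUSTER_MAX (32) pages isolated, the threshold works out as in the following standalone sketch (not the kernel code; the helper and macro names are made up):

#define DEF_PRIORITY_SKETCH	12

/*
 * Sketch of the throttling test added at the end of shrink_inactive_list().
 * For nr_taken == 32: priority 12 needs all 32 isolated pages under
 * writeback, 11 needs 16, 10 needs 8, and by priority 6 the shift yields 0,
 * so a single PageWriteback page is enough to throttle.
 */
static int should_throttle(unsigned long nr_writeback, unsigned long nr_taken,
			   int priority)
{
	return nr_writeback &&
	       nr_writeback >= (nr_taken >> (DEF_PRIORITY_SKETCH - priority));
}
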
@@ -1583,19 +1643,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	struct page *page;
 	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 	unsigned long nr_rotated = 0;
+	isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;

 	lru_add_drain();
+
+	if (!sc->may_unmap)
+		reclaim_mode |= ISOLATE_UNMAPPED;
+	if (!sc->may_writepage)
+		reclaim_mode |= ISOLATE_CLEAN;
+
 	spin_lock_irq(&zone->lru_lock);
 	if (scanning_global_lru(sc)) {
 		nr_taken = isolate_pages_global(nr_pages, &l_hold,
 						&pgscanned, sc->order,
-						ISOLATE_ACTIVE, zone,
+						reclaim_mode, zone,
 						1, file);
 		zone->pages_scanned += pgscanned;
 	} else {
 		nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
 						&pgscanned, sc->order,
-						ISOLATE_ACTIVE, zone,
+						reclaim_mode, zone,
 						sc->mem_cgroup, 1, file);
 		/*
 		 * mem_cgroup_isolate_pages() keeps track of
@@ -1700,7 +1767,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
 	if (scanning_global_lru(sc))
 		low = inactive_anon_is_low_global(zone);
 	else
-		low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
+		low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
 	return low;
 }
 #else
@@ -1743,7 +1810,7 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
 	if (scanning_global_lru(sc))
 		low = inactive_file_is_low_global(zone);
 	else
-		low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
+		low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);
 	return low;
 }

@@ -1770,6 +1837,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 	return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
 }

+static int vmscan_swappiness(struct scan_control *sc)
+{
+	if (scanning_global_lru(sc))
+		return vm_swappiness;
+	return mem_cgroup_swappiness(sc->mem_cgroup);
+}
+
 /*
  * Determine how aggressively the anon and file LRU lists should be
  * scanned. The relative value of each set of LRU lists is determined
@@ -1788,22 +1862,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 	u64 fraction[2], denominator;
 	enum lru_list l;
 	int noswap = 0;
-	int force_scan = 0;
-
-
-	anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
-		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
-	file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
-		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+	bool force_scan = false;

-	if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
-		/* kswapd does zone balancing and need to scan this zone */
-		if (scanning_global_lru(sc) && current_is_kswapd())
-			force_scan = 1;
-		/* memcg may have small limit and need to avoid priority drop */
-		if (!scanning_global_lru(sc))
-			force_scan = 1;
-	}
+	/*
+	 * If the zone or memcg is small, nr[l] can be 0. This
+	 * results in no scanning on this priority and a potential
+	 * priority drop. Global direct reclaim can go to the next
+	 * zone and tends to have no problems. Global kswapd is for
+	 * zone balancing and it needs to scan a minimum amount. When
+	 * reclaiming for a memcg, a priority drop can cause high
+	 * latencies, so it's better to scan a minimum amount there as
+	 * well.
+	 */
+	if (scanning_global_lru(sc) && current_is_kswapd())
+		force_scan = true;
+	if (!scanning_global_lru(sc))
+		force_scan = true;

 	/* If we have no swap space, do not bother scanning anon pages. */
 	if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1814,6 +1888,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 		goto out;
 	}

+	anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
+		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
+	file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
+		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+
 	if (scanning_global_lru(sc)) {
 		free = zone_page_state(zone, NR_FREE_PAGES);
 		/* If we have very few page cache pages,
@@ -1830,8 +1909,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 	 * With swappiness at 100, anonymous and file have the same priority.
 	 * This scanning priority is essentially the inverse of IO cost.
 	 */
-	anon_prio = sc->swappiness;
-	file_prio = 200 - sc->swappiness;
+	anon_prio = vmscan_swappiness(sc);
+	file_prio = 200 - vmscan_swappiness(sc);

 	/*
 	 * OK, so we have swap space and a fair amount of page cache
@@ -1878,23 +1957,9 @@ out:
 		scan = zone_nr_lru_pages(zone, sc, l);
 		if (priority || noswap) {
 			scan >>= priority;
-			scan = div64_u64(scan * fraction[file], denominator);
-		}
-
-		/*
-		 * If zone is small or memcg is small, nr[l] can be 0.
-		 * This results no-scan on this priority and priority drop down.
-		 * For global direct reclaim, it can visit next zone and tend
-		 * not to have problems. For global kswapd, it's for zone
-		 * balancing and it need to scan a small amounts. When using
-		 * memcg, priority drop can cause big latency. So, it's better
-		 * to scan small amount. See may_noscan above.
-		 */
-		if (!scan && force_scan) {
-			if (file)
-				scan = SWAP_CLUSTER_MAX;
-			else if (!noswap)
+			if (!scan && force_scan)
 				scan = SWAP_CLUSTER_MAX;
+			scan = div64_u64(scan * fraction[file], denominator);
 		}
 		nr[l] = scan;
 	}
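
The rewritten loop above applies the force_scan floor before the proportional anon/file split. A condensed sketch of the per-list arithmetic (names are made up; fraction and denominator stand for the recent_scanned/recent_rotated ratio computed earlier in get_scan_count(), and denominator is assumed non-zero as in the real code):

/* Sketch only: how one nr[l] entry is derived after this change. */
static unsigned long scan_target(unsigned long lru_size, int priority,
				 int noswap, int force_scan,
				 unsigned long long fraction,
				 unsigned long long denominator)
{
	unsigned long scan = lru_size;

	if (priority || noswap) {
		scan >>= priority;
		if (!scan && force_scan)
			scan = 32;	/* SWAP_CLUSTER_MAX */
		scan = scan * fraction / denominator;
	}
	return scan;
}
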
@@ -1974,12 +2039,14 @@ static void shrink_zone(int priority, struct zone *zone,
 	enum lru_list l;
 	unsigned long nr_reclaimed, nr_scanned;
 	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+	struct blk_plug plug;

 restart:
 	nr_reclaimed = 0;
 	nr_scanned = sc->nr_scanned;
 	get_scan_count(zone, sc, nr, priority);

+	blk_start_plug(&plug);
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
 					nr[LRU_INACTIVE_FILE]) {
 		for_each_evictable_lru(l) {
@@ -2003,6 +2070,7 @@ restart:
 		if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
 			break;
 	}
+	blk_finish_plug(&plug);
 	sc->nr_reclaimed += nr_reclaimed;

 	/*
@@ -2035,14 +2103,19 @@ restart:
  *
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
+ *
+ * This function returns true if a zone is being reclaimed for a costly
+ * high-order allocation and compaction is either ready to begin or deferred.
+ * This indicates to the caller that it should retry the allocation or fail.
  */
-static void shrink_zones(int priority, struct zonelist *zonelist,
+static bool shrink_zones(int priority, struct zonelist *zonelist,
 					struct scan_control *sc)
 {
 	struct zoneref *z;
 	struct zone *zone;
 	unsigned long nr_soft_reclaimed;
 	unsigned long nr_soft_scanned;
+	bool should_abort_reclaim = false;

 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 					gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2057,6 +2130,23 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
 			continue;
 		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 			continue;	/* Let kswapd poll it */
+		if (COMPACTION_BUILD) {
+			/*
+			 * If we already have plenty of memory free for
+			 * compaction in this zone, don't free any more.
+			 * Even though compaction is invoked for any
+			 * non-zero order, only frequent costly order
+			 * reclamation is disruptive enough to become a
+			 * noticable problem, like transparent huge page
+			 * allocations.
+			 */
+			if (sc->order > PAGE_ALLOC_COSTLY_ORDER &&
+				(compaction_suitable(zone, sc->order) ||
+				 compaction_deferred(zone))) {
+				should_abort_reclaim = true;
+				continue;
+			}
+		}
 		/*
 		 * This steals pages from memory cgroups over softlimit
 		 * and returns the number of reclaimed pages and
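
The new COMPACTION_BUILD block lets reclaim bail out of a zone once compaction could already service a costly high-order allocation, or has been deferred there. The decision reduces to the following test (a sketch with made-up names; compaction_ready stands in for compaction_suitable() || compaction_deferred() on that zone):

#define PAGE_ALLOC_COSTLY_ORDER_SKETCH	3	/* same value the kernel uses */

/* Sketch of the per-zone early-exit test added to shrink_zones(). */
static int skip_zone_reclaim(int compaction_built, int order, int compaction_ready)
{
	return compaction_built &&
	       order > PAGE_ALLOC_COSTLY_ORDER_SKETCH &&
	       compaction_ready;
}
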
@@ -2074,6 +2164,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,

 		shrink_zone(priority, zone, sc);
 	}
+
+	return should_abort_reclaim;
 }

 static bool zone_reclaimable(struct zone *zone)
@@ -2138,7 +2230,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		sc->nr_scanned = 0;
 		if (!priority)
 			disable_swap_token(sc->mem_cgroup);
-		shrink_zones(priority, zonelist, sc);
+		if (shrink_zones(priority, zonelist, sc))
+			break;
+
 		/*
 		 * Don't shrink slabs when reclaiming memory from
 		 * over limit cgroups
@@ -2172,7 +2266,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		 */
 		writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
 		if (total_scanned > writeback_threshold) {
-			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
+			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
+						WB_REASON_TRY_TO_FREE_PAGES);
 			sc->may_writepage = 1;
 		}

@@ -2220,7 +2315,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
 		.may_unmap = 1,
 		.may_swap = 1,
-		.swappiness = vm_swappiness,
 		.order = order,
 		.mem_cgroup = NULL,
 		.nodemask = nodemask,
@@ -2244,7 +2338,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,

 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
-						unsigned int swappiness,
 						struct zone *zone,
 						unsigned long *nr_scanned)
 {
@@ -2254,7 +2347,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = !noswap,
-		.swappiness = swappiness,
 		.order = 0,
 		.mem_cgroup = mem,
 	};
@@ -2283,8 +2375,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,

 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 					   gfp_t gfp_mask,
-					   bool noswap,
-					   unsigned int swappiness)
+					   bool noswap)
 {
 	struct zonelist *zonelist;
 	unsigned long nr_reclaimed;
@@ -2294,7 +2385,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 		.may_unmap = 1,
 		.may_swap = !noswap,
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
-		.swappiness = swappiness,
 		.order = 0,
 		.mem_cgroup = mem_cont,
 		.nodemask = NULL, /* we don't care the placement */
@@ -2445,7 +2535,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 		 * we want to put equal scanning pressure on each zone.
 		 */
 		.nr_to_reclaim = ULONG_MAX,
-		.swappiness = vm_swappiness,
 		.order = order,
 		.mem_cgroup = NULL,
 	};
@@ -2494,6 +2583,9 @@ loop_again:
 					high_wmark_pages(zone), 0, 0)) {
 				end_zone = i;
 				break;
+			} else {
+				/* If balanced, clear the congested flag */
+				zone_clear_flag(zone, ZONE_CONGESTED);
 			}
 		}
 		if (i < 0)
@@ -2684,6 +2776,8 @@ out:

 			/* If balanced, clear the congested flag */
 			zone_clear_flag(zone, ZONE_CONGESTED);
+			if (i <= *classzone_idx)
+				balanced += zone->present_pages;
 		}
 	}

@@ -2757,7 +2851,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 static int kswapd(void *p)
 {
 	unsigned long order, new_order;
+	unsigned balanced_order;
 	int classzone_idx, new_classzone_idx;
+	int balanced_classzone_idx;
 	pg_data_t *pgdat = (pg_data_t*)p;
 	struct task_struct *tsk = current;

@@ -2788,7 +2884,9 @@ static int kswapd(void *p)
 	set_freezable();

 	order = new_order = 0;
+	balanced_order = 0;
 	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
+	balanced_classzone_idx = classzone_idx;
 	for ( ; ; ) {
 		int ret;

@@ -2797,7 +2895,8 @@ static int kswapd(void *p)
 		 * new request of a similar or harder type will succeed soon
 		 * so consider going to sleep on the basis we reclaimed at
 		 */
-		if (classzone_idx >= new_classzone_idx && order == new_order) {
+		if (balanced_classzone_idx >= new_classzone_idx &&
+					balanced_order == new_order) {
 			new_order = pgdat->kswapd_max_order;
 			new_classzone_idx = pgdat->classzone_idx;
 			pgdat->kswapd_max_order = 0;
@@ -2812,9 +2911,12 @@ static int kswapd(void *p)
 			order = new_order;
 			classzone_idx = new_classzone_idx;
 		} else {
-			kswapd_try_to_sleep(pgdat, order, classzone_idx);
+			kswapd_try_to_sleep(pgdat, balanced_order,
+						balanced_classzone_idx);
 			order = pgdat->kswapd_max_order;
 			classzone_idx = pgdat->classzone_idx;
+			new_order = order;
+			new_classzone_idx = classzone_idx;
 			pgdat->kswapd_max_order = 0;
 			pgdat->classzone_idx = pgdat->nr_zones - 1;
 		}
@@ -2829,7 +2931,9 @@ static int kswapd(void *p)
 		 */
 		if (!ret) {
 			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
-			order = balance_pgdat(pgdat, order, &classzone_idx);
+			balanced_classzone_idx = classzone_idx;
+			balanced_order = balance_pgdat(pgdat, order,
+						&balanced_classzone_idx);
 		}
 	}
 	return 0;
@@ -2915,7 +3019,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 		.may_writepage = 1,
 		.nr_to_reclaim = nr_to_reclaim,
 		.hibernation_mode = 1,
-		.swappiness = vm_swappiness,
 		.order = 0,
 	};
 	struct shrink_control shrink = {
@@ -3102,7 +3205,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		.nr_to_reclaim = max_t(unsigned long, nr_pages,
 				       SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
-		.swappiness = vm_swappiness,
 		.order = order,
 	};
 	struct shrink_control shrink = {
@@ -3343,66 +3445,12 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)

 }

-/**
- * scan_zone_unevictable_pages - check unevictable list for evictable pages
- * @zone - zone of which to scan the unevictable list
- *
- * Scan @zone's unevictable LRU lists to check for pages that have become
- * evictable. Move those that have to @zone's inactive list where they
- * become candidates for reclaim, unless shrink_inactive_zone() decides
- * to reactivate them. Pages that are still unevictable are rotated
- * back onto @zone's unevictable list.
- */
-#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
-static void scan_zone_unevictable_pages(struct zone *zone)
-{
-	struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
-	unsigned long scan;
-	unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
-
-	while (nr_to_scan > 0) {
-		unsigned long batch_size = min(nr_to_scan,
-						SCAN_UNEVICTABLE_BATCH_SIZE);
-
-		spin_lock_irq(&zone->lru_lock);
-		for (scan = 0; scan < batch_size; scan++) {
-			struct page *page = lru_to_page(l_unevictable);
-
-			if (!trylock_page(page))
-				continue;
-
-			prefetchw_prev_lru_page(page, l_unevictable, flags);
-
-			if (likely(PageLRU(page) && PageUnevictable(page)))
-				check_move_unevictable_page(page, zone);
-
-			unlock_page(page);
-		}
-		spin_unlock_irq(&zone->lru_lock);
-
-		nr_to_scan -= batch_size;
-	}
-}
-
-
-/**
- * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
- *
- * A really big hammer: scan all zones' unevictable LRU lists to check for
- * pages that have become evictable. Move those back to the zones'
- * inactive list where they become candidates for reclaim.
- * This occurs when, e.g., we have unswappable pages on the unevictable lists,
- * and we add swap to the system. As such, it runs in the context of a task
- * that has possibly/probably made some previously unevictable pages
- * evictable.
- */
-static void scan_all_zones_unevictable_pages(void)
+static void warn_scan_unevictable_pages(void)
 {
-	struct zone *zone;
-
-	for_each_zone(zone) {
-		scan_zone_unevictable_pages(zone);
-	}
+	printk_once(KERN_WARNING
+		"The scan_unevictable_pages sysctl/node-interface has been "
+		"disabled for lack of a legitimate use case. If you have "
+		"one, please send an email to linux-mm@kvack.org.\n");
 }

 /*
@@ -3415,11 +3463,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
 			     void __user *buffer,
 			     size_t *length, loff_t *ppos)
 {
+	warn_scan_unevictable_pages();
 	proc_doulongvec_minmax(table, write, buffer, length, ppos);
-
-	if (write && *(unsigned long *)table->data)
-		scan_all_zones_unevictable_pages();
-
 	scan_unevictable_pages = 0;
 	return 0;
 }
@@ -3434,6 +3479,7 @@ static ssize_t read_scan_unevictable_node(struct sys_device *dev,
 					  struct sysdev_attribute *attr,
 					  char *buf)
 {
+	warn_scan_unevictable_pages();
 	return sprintf(buf, "0\n");	/* always zero; should fit... */
 }

@@ -3441,19 +3487,7 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev,
 					   struct sysdev_attribute *attr,
 					   const char *buf, size_t count)
 {
-	struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
-	struct zone *zone;
-	unsigned long res;
-	unsigned long req = strict_strtoul(buf, 10, &res);
-
-	if (!req)
-		return 1;	/* zero is no-op */
-
-	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
-		if (!populated_zone(zone))
-			continue;
-		scan_zone_unevictable_pages(zone);
-	}
+	warn_scan_unevictable_pages();
 	return 1;
 }
