path: root/mm/vmscan.c
author    Glenn Elliott <gelliott@cs.unc.edu>    2012-03-04 19:47:13 -0500
committer Glenn Elliott <gelliott@cs.unc.edu>    2012-03-04 19:47:13 -0500
commit    c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree      ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /mm/vmscan.c
parent    ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent    6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c  853
1 files changed, 625 insertions, 228 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c5dfabf25f11..d036e59d302b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -32,6 +32,7 @@
32#include <linux/topology.h> 32#include <linux/topology.h>
33#include <linux/cpu.h> 33#include <linux/cpu.h>
34#include <linux/cpuset.h> 34#include <linux/cpuset.h>
35#include <linux/compaction.h>
35#include <linux/notifier.h> 36#include <linux/notifier.h>
36#include <linux/rwsem.h> 37#include <linux/rwsem.h>
37#include <linux/delay.h> 38#include <linux/delay.h>
@@ -40,6 +41,8 @@
40#include <linux/memcontrol.h> 41#include <linux/memcontrol.h>
41#include <linux/delayacct.h> 42#include <linux/delayacct.h>
42#include <linux/sysctl.h> 43#include <linux/sysctl.h>
44#include <linux/oom.h>
45#include <linux/prefetch.h>
43 46
44#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
45#include <asm/div64.h> 48#include <asm/div64.h>
@@ -51,6 +54,24 @@
51#define CREATE_TRACE_POINTS 54#define CREATE_TRACE_POINTS
52#include <trace/events/vmscan.h> 55#include <trace/events/vmscan.h>
53 56
57/*
58 * reclaim_mode determines how the inactive list is shrunk
59 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
60 * RECLAIM_MODE_ASYNC: Do not block
61 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
62 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
63 * page from the LRU and reclaim all pages within a
64 * naturally aligned range
65 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
66 * order-0 pages and then compact the zone
67 */
68typedef unsigned __bitwise__ reclaim_mode_t;
69#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
70#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
71#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
72#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
73#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
74
54struct scan_control { 75struct scan_control {
55 /* Incremented by the number of inactive pages that were scanned */ 76 /* Incremented by the number of inactive pages that were scanned */
56 unsigned long nr_scanned; 77 unsigned long nr_scanned;
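For illustration only (not part of the patch): a minimal userspace C sketch of how the RECLAIM_MODE_* bits introduced in the hunk above combine, mirroring the set_reclaim_mode() helper added later in this patch. The sparse annotations (__bitwise__/__force) are dropped so it builds as plain C, and PAGE_ALLOC_COSTLY_ORDER and DEF_PRIORITY are filled in with their usual kernel values as assumptions.

#include <stdio.h>

/* sketch only: plain unsigned int instead of the kernel's __bitwise__ type */
typedef unsigned int reclaim_mode_t;
#define RECLAIM_MODE_SINGLE        0x01u
#define RECLAIM_MODE_ASYNC         0x02u
#define RECLAIM_MODE_SYNC          0x04u
#define RECLAIM_MODE_LUMPYRECLAIM  0x08u
#define RECLAIM_MODE_COMPACTION    0x10u

#define PAGE_ALLOC_COSTLY_ORDER 3   /* assumption: usual kernel value */
#define DEF_PRIORITY 12             /* assumption: usual kernel value */

static reclaim_mode_t pick_mode(int order, int priority, int have_compaction, int sync)
{
	reclaim_mode_t mode = have_compaction ? RECLAIM_MODE_COMPACTION
					      : RECLAIM_MODE_LUMPYRECLAIM;
	reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;

	/* costly orders, or persistent high-order pressure, keep lumpy/compaction */
	if (order > PAGE_ALLOC_COSTLY_ORDER ||
	    (order && priority < DEF_PRIORITY - 2))
		return mode | syncmode;

	/* otherwise fall back to plain order-0, non-blocking reclaim */
	return RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
}

int main(void)
{
	reclaim_mode_t m = pick_mode(4, DEF_PRIORITY, 1, 1);

	printf("sync: %s, compaction: %s\n",
	       (m & RECLAIM_MODE_SYNC) ? "yes" : "no",
	       (m & RECLAIM_MODE_COMPACTION) ? "yes" : "no");
	return 0;
}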
@@ -79,10 +100,10 @@ struct scan_control {
79 int order; 100 int order;
80 101
81 /* 102 /*
82 * Intend to reclaim enough contenious memory rather than to reclaim 103 * Intend to reclaim enough continuous memory rather than reclaim
83 * enough amount memory. I.e, it's the mode for high order allocation. 104 * enough amount of memory. i.e, mode for high order allocation.
84 */ 105 */
85 bool lumpy_reclaim_mode; 106 reclaim_mode_t reclaim_mode;
86 107
87 /* Which cgroup do we reclaim from */ 108 /* Which cgroup do we reclaim from */
88 struct mem_cgroup *mem_cgroup; 109 struct mem_cgroup *mem_cgroup;
@@ -152,7 +173,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
152 struct scan_control *sc, enum lru_list lru) 173 struct scan_control *sc, enum lru_list lru)
153{ 174{
154 if (!scanning_global_lru(sc)) 175 if (!scanning_global_lru(sc))
155 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); 176 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru);
156 177
157 return zone_page_state(zone, NR_LRU_BASE + lru); 178 return zone_page_state(zone, NR_LRU_BASE + lru);
158} 179}
@@ -181,6 +202,14 @@ void unregister_shrinker(struct shrinker *shrinker)
181} 202}
182EXPORT_SYMBOL(unregister_shrinker); 203EXPORT_SYMBOL(unregister_shrinker);
183 204
205static inline int do_shrinker_shrink(struct shrinker *shrinker,
206 struct shrink_control *sc,
207 unsigned long nr_to_scan)
208{
209 sc->nr_to_scan = nr_to_scan;
210 return (*shrinker->shrink)(shrinker, sc);
211}
212
184#define SHRINK_BATCH 128 213#define SHRINK_BATCH 128
185/* 214/*
186 * Call the shrink functions to age shrinkable caches 215 * Call the shrink functions to age shrinkable caches
@@ -201,25 +230,29 @@ EXPORT_SYMBOL(unregister_shrinker);
201 * 230 *
202 * Returns the number of slab objects which we shrunk. 231 * Returns the number of slab objects which we shrunk.
203 */ 232 */
204unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, 233unsigned long shrink_slab(struct shrink_control *shrink,
205 unsigned long lru_pages) 234 unsigned long nr_pages_scanned,
235 unsigned long lru_pages)
206{ 236{
207 struct shrinker *shrinker; 237 struct shrinker *shrinker;
208 unsigned long ret = 0; 238 unsigned long ret = 0;
209 239
210 if (scanned == 0) 240 if (nr_pages_scanned == 0)
211 scanned = SWAP_CLUSTER_MAX; 241 nr_pages_scanned = SWAP_CLUSTER_MAX;
212 242
213 if (!down_read_trylock(&shrinker_rwsem)) 243 if (!down_read_trylock(&shrinker_rwsem)) {
214 return 1; /* Assume we'll be able to shrink next time */ 244 /* Assume we'll be able to shrink next time */
245 ret = 1;
246 goto out;
247 }
215 248
216 list_for_each_entry(shrinker, &shrinker_list, list) { 249 list_for_each_entry(shrinker, &shrinker_list, list) {
217 unsigned long long delta; 250 unsigned long long delta;
218 unsigned long total_scan; 251 unsigned long total_scan;
219 unsigned long max_pass; 252 unsigned long max_pass;
220 253
221 max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask); 254 max_pass = do_shrinker_shrink(shrinker, shrink, 0);
222 delta = (4 * scanned) / shrinker->seeks; 255 delta = (4 * nr_pages_scanned) / shrinker->seeks;
223 delta *= max_pass; 256 delta *= max_pass;
224 do_div(delta, lru_pages + 1); 257 do_div(delta, lru_pages + 1);
225 shrinker->nr += delta; 258 shrinker->nr += delta;
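For illustration only (not part of the patch): the delta computation just above sets how hard shrink_slab() pushes each registered shrinker in proportion to LRU scanning pressure. A standalone C sketch with made-up numbers; do_div() is replaced by a plain 64-bit division and DEFAULT_SEEKS is assumed to be its usual value of 2.

#include <stdio.h>

int main(void)
{
	unsigned long long nr_pages_scanned = 1024;    /* LRU pages scanned so far */
	unsigned long long lru_pages = 262144;         /* reclaimable LRU pages */
	unsigned long long max_pass = 50000;           /* objects in this cache */
	unsigned int seeks = 2;                        /* DEFAULT_SEEKS (assumed) */

	unsigned long long delta = (4 * nr_pages_scanned) / seeks;
	delta *= max_pass;
	delta /= lru_pages + 1;                        /* stands in for do_div() */

	/* scanning ~0.4% of the LRU asks this cache to scan ~0.8% of its objects */
	printf("ask the shrinker to scan ~%llu of %llu objects\n", delta, max_pass);
	return 0;
}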
@@ -246,9 +279,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
246 int shrink_ret; 279 int shrink_ret;
247 int nr_before; 280 int nr_before;
248 281
249 nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask); 282 nr_before = do_shrinker_shrink(shrinker, shrink, 0);
250 shrink_ret = (*shrinker->shrink)(shrinker, this_scan, 283 shrink_ret = do_shrinker_shrink(shrinker, shrink,
251 gfp_mask); 284 this_scan);
252 if (shrink_ret == -1) 285 if (shrink_ret == -1)
253 break; 286 break;
254 if (shrink_ret < nr_before) 287 if (shrink_ret < nr_before)
@@ -262,9 +295,44 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
262 shrinker->nr += total_scan; 295 shrinker->nr += total_scan;
263 } 296 }
264 up_read(&shrinker_rwsem); 297 up_read(&shrinker_rwsem);
298out:
299 cond_resched();
265 return ret; 300 return ret;
266} 301}
267 302
303static void set_reclaim_mode(int priority, struct scan_control *sc,
304 bool sync)
305{
306 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
307
308 /*
309 * Initially assume we are entering either lumpy reclaim or
310 * reclaim/compaction. Depending on the order, we will either set the
311 * sync mode or just reclaim order-0 pages later.
312 */
313 if (COMPACTION_BUILD)
314 sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
315 else
316 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
317
318 /*
319 * Avoid using lumpy reclaim or reclaim/compaction if possible by
320 * restricting when its set to either costly allocations or when
321 * under memory pressure
322 */
323 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
324 sc->reclaim_mode |= syncmode;
325 else if (sc->order && priority < DEF_PRIORITY - 2)
326 sc->reclaim_mode |= syncmode;
327 else
328 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
329}
330
331static void reset_reclaim_mode(struct scan_control *sc)
332{
333 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
334}
335
268static inline int is_page_cache_freeable(struct page *page) 336static inline int is_page_cache_freeable(struct page *page)
269{ 337{
270 /* 338 /*
@@ -275,7 +343,8 @@ static inline int is_page_cache_freeable(struct page *page)
275 return page_count(page) - page_has_private(page) == 2; 343 return page_count(page) - page_has_private(page) == 2;
276} 344}
277 345
278static int may_write_to_queue(struct backing_dev_info *bdi) 346static int may_write_to_queue(struct backing_dev_info *bdi,
347 struct scan_control *sc)
279{ 348{
280 if (current->flags & PF_SWAPWRITE) 349 if (current->flags & PF_SWAPWRITE)
281 return 1; 350 return 1;
@@ -283,6 +352,10 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
283 return 1; 352 return 1;
284 if (bdi == current->backing_dev_info) 353 if (bdi == current->backing_dev_info)
285 return 1; 354 return 1;
355
356 /* lumpy reclaim for hugepage often need a lot of write */
357 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
358 return 1;
286 return 0; 359 return 0;
287} 360}
288 361
@@ -301,18 +374,12 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
301static void handle_write_error(struct address_space *mapping, 374static void handle_write_error(struct address_space *mapping,
302 struct page *page, int error) 375 struct page *page, int error)
303{ 376{
304 lock_page_nosync(page); 377 lock_page(page);
305 if (page_mapping(page) == mapping) 378 if (page_mapping(page) == mapping)
306 mapping_set_error(mapping, error); 379 mapping_set_error(mapping, error);
307 unlock_page(page); 380 unlock_page(page);
308} 381}
309 382
310/* Request for sync pageout. */
311enum pageout_io {
312 PAGEOUT_IO_ASYNC,
313 PAGEOUT_IO_SYNC,
314};
315
316/* possible outcome of pageout() */ 383/* possible outcome of pageout() */
317typedef enum { 384typedef enum {
318 /* failed to write page out, page is locked */ 385 /* failed to write page out, page is locked */
@@ -330,7 +397,7 @@ typedef enum {
330 * Calls ->writepage(). 397 * Calls ->writepage().
331 */ 398 */
332static pageout_t pageout(struct page *page, struct address_space *mapping, 399static pageout_t pageout(struct page *page, struct address_space *mapping,
333 enum pageout_io sync_writeback) 400 struct scan_control *sc)
334{ 401{
335 /* 402 /*
336 * If the page is dirty, only perform writeback if that write 403 * If the page is dirty, only perform writeback if that write
@@ -366,7 +433,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
366 } 433 }
367 if (mapping->a_ops->writepage == NULL) 434 if (mapping->a_ops->writepage == NULL)
368 return PAGE_ACTIVATE; 435 return PAGE_ACTIVATE;
369 if (!may_write_to_queue(mapping->backing_dev_info)) 436 if (!may_write_to_queue(mapping->backing_dev_info, sc))
370 return PAGE_KEEP; 437 return PAGE_KEEP;
371 438
372 if (clear_page_dirty_for_io(page)) { 439 if (clear_page_dirty_for_io(page)) {
@@ -376,7 +443,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
376 .nr_to_write = SWAP_CLUSTER_MAX, 443 .nr_to_write = SWAP_CLUSTER_MAX,
377 .range_start = 0, 444 .range_start = 0,
378 .range_end = LLONG_MAX, 445 .range_end = LLONG_MAX,
379 .nonblocking = 1,
380 .for_reclaim = 1, 446 .for_reclaim = 1,
381 }; 447 };
382 448
@@ -394,7 +460,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
394 * direct reclaiming a large contiguous area and the 460 * direct reclaiming a large contiguous area and the
395 * first attempt to free a range of pages fails. 461 * first attempt to free a range of pages fails.
396 */ 462 */
397 if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) 463 if (PageWriteback(page) &&
464 (sc->reclaim_mode & RECLAIM_MODE_SYNC))
398 wait_on_page_writeback(page); 465 wait_on_page_writeback(page);
399 466
400 if (!PageWriteback(page)) { 467 if (!PageWriteback(page)) {
@@ -402,7 +469,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
402 ClearPageReclaim(page); 469 ClearPageReclaim(page);
403 } 470 }
404 trace_mm_vmscan_writepage(page, 471 trace_mm_vmscan_writepage(page,
405 trace_reclaim_flags(page, sync_writeback)); 472 trace_reclaim_flags(page, sc->reclaim_mode));
406 inc_zone_page_state(page, NR_VMSCAN_WRITE); 473 inc_zone_page_state(page, NR_VMSCAN_WRITE);
407 return PAGE_SUCCESS; 474 return PAGE_SUCCESS;
408 } 475 }
@@ -459,9 +526,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
459 spin_unlock_irq(&mapping->tree_lock); 526 spin_unlock_irq(&mapping->tree_lock);
460 swapcache_free(swap, page); 527 swapcache_free(swap, page);
461 } else { 528 } else {
462 __remove_from_page_cache(page); 529 void (*freepage)(struct page *);
530
531 freepage = mapping->a_ops->freepage;
532
533 __delete_from_page_cache(page);
463 spin_unlock_irq(&mapping->tree_lock); 534 spin_unlock_irq(&mapping->tree_lock);
464 mem_cgroup_uncharge_cache_page(page); 535 mem_cgroup_uncharge_cache_page(page);
536
537 if (freepage != NULL)
538 freepage(page);
465 } 539 }
466 540
467 return 1; 541 return 1;
@@ -580,7 +654,7 @@ static enum page_references page_check_references(struct page *page,
580 referenced_page = TestClearPageReferenced(page); 654 referenced_page = TestClearPageReferenced(page);
581 655
582 /* Lumpy reclaim - ignore references */ 656 /* Lumpy reclaim - ignore references */
583 if (sc->lumpy_reclaim_mode) 657 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
584 return PAGEREF_RECLAIM; 658 return PAGEREF_RECLAIM;
585 659
586 /* 660 /*
@@ -616,7 +690,7 @@ static enum page_references page_check_references(struct page *page,
616 } 690 }
617 691
618 /* Reclaim if clean, defer dirty pages to writeback */ 692 /* Reclaim if clean, defer dirty pages to writeback */
619 if (referenced_page) 693 if (referenced_page && !PageSwapBacked(page))
620 return PAGEREF_RECLAIM_CLEAN; 694 return PAGEREF_RECLAIM_CLEAN;
621 695
622 return PAGEREF_RECLAIM; 696 return PAGEREF_RECLAIM;
@@ -644,12 +718,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
644 * shrink_page_list() returns the number of reclaimed pages 718 * shrink_page_list() returns the number of reclaimed pages
645 */ 719 */
646static unsigned long shrink_page_list(struct list_head *page_list, 720static unsigned long shrink_page_list(struct list_head *page_list,
647 struct scan_control *sc, 721 struct zone *zone,
648 enum pageout_io sync_writeback) 722 struct scan_control *sc)
649{ 723{
650 LIST_HEAD(ret_pages); 724 LIST_HEAD(ret_pages);
651 LIST_HEAD(free_pages); 725 LIST_HEAD(free_pages);
652 int pgactivate = 0; 726 int pgactivate = 0;
727 unsigned long nr_dirty = 0;
728 unsigned long nr_congested = 0;
653 unsigned long nr_reclaimed = 0; 729 unsigned long nr_reclaimed = 0;
654 730
655 cond_resched(); 731 cond_resched();
@@ -669,6 +745,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
669 goto keep; 745 goto keep;
670 746
671 VM_BUG_ON(PageActive(page)); 747 VM_BUG_ON(PageActive(page));
748 VM_BUG_ON(page_zone(page) != zone);
672 749
673 sc->nr_scanned++; 750 sc->nr_scanned++;
674 751
@@ -694,10 +771,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
694 * for any page for which writeback has already 771 * for any page for which writeback has already
695 * started. 772 * started.
696 */ 773 */
697 if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) 774 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
775 may_enter_fs)
698 wait_on_page_writeback(page); 776 wait_on_page_writeback(page);
699 else 777 else {
700 goto keep_locked; 778 unlock_page(page);
779 goto keep_lumpy;
780 }
701 } 781 }
702 782
703 references = page_check_references(page, sc); 783 references = page_check_references(page, sc);
@@ -743,6 +823,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
743 } 823 }
744 824
745 if (PageDirty(page)) { 825 if (PageDirty(page)) {
826 nr_dirty++;
827
746 if (references == PAGEREF_RECLAIM_CLEAN) 828 if (references == PAGEREF_RECLAIM_CLEAN)
747 goto keep_locked; 829 goto keep_locked;
748 if (!may_enter_fs) 830 if (!may_enter_fs)
@@ -751,14 +833,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
751 goto keep_locked; 833 goto keep_locked;
752 834
753 /* Page is dirty, try to write it out here */ 835 /* Page is dirty, try to write it out here */
754 switch (pageout(page, mapping, sync_writeback)) { 836 switch (pageout(page, mapping, sc)) {
755 case PAGE_KEEP: 837 case PAGE_KEEP:
838 nr_congested++;
756 goto keep_locked; 839 goto keep_locked;
757 case PAGE_ACTIVATE: 840 case PAGE_ACTIVATE:
758 goto activate_locked; 841 goto activate_locked;
759 case PAGE_SUCCESS: 842 case PAGE_SUCCESS:
760 if (PageWriteback(page) || PageDirty(page)) 843 if (PageWriteback(page))
844 goto keep_lumpy;
845 if (PageDirty(page))
761 goto keep; 846 goto keep;
847
762 /* 848 /*
763 * A synchronous write - probably a ramdisk. Go 849 * A synchronous write - probably a ramdisk. Go
764 * ahead and try to reclaim the page. 850 * ahead and try to reclaim the page.
@@ -841,6 +927,7 @@ cull_mlocked:
841 try_to_free_swap(page); 927 try_to_free_swap(page);
842 unlock_page(page); 928 unlock_page(page);
843 putback_lru_page(page); 929 putback_lru_page(page);
930 reset_reclaim_mode(sc);
844 continue; 931 continue;
845 932
846activate_locked: 933activate_locked:
@@ -853,10 +940,21 @@ activate_locked:
853keep_locked: 940keep_locked:
854 unlock_page(page); 941 unlock_page(page);
855keep: 942keep:
943 reset_reclaim_mode(sc);
944keep_lumpy:
856 list_add(&page->lru, &ret_pages); 945 list_add(&page->lru, &ret_pages);
857 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 946 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
858 } 947 }
859 948
949 /*
950 * Tag a zone as congested if all the dirty pages encountered were
951 * backed by a congested BDI. In this case, reclaimers should just
952 * back off and wait for congestion to clear because further reclaim
953 * will encounter the same problem
954 */
955 if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
956 zone_set_flag(zone, ZONE_CONGESTED);
957
860 free_page_list(&free_pages); 958 free_page_list(&free_pages);
861 959
862 list_splice(&ret_pages, page_list); 960 list_splice(&ret_pages, page_list);
@@ -962,7 +1060,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
962 case 0: 1060 case 0:
963 list_move(&page->lru, dst); 1061 list_move(&page->lru, dst);
964 mem_cgroup_del_lru(page); 1062 mem_cgroup_del_lru(page);
965 nr_taken++; 1063 nr_taken += hpage_nr_pages(page);
966 break; 1064 break;
967 1065
968 case -EBUSY: 1066 case -EBUSY:
@@ -983,7 +1081,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
983 * surrounding the tag page. Only take those pages of 1081 * surrounding the tag page. Only take those pages of
984 * the same active state as that tag page. We may safely 1082 * the same active state as that tag page. We may safely
985 * round the target page pfn down to the requested order 1083 * round the target page pfn down to the requested order
986 * as the mem_map is guarenteed valid out to MAX_ORDER, 1084 * as the mem_map is guaranteed valid out to MAX_ORDER,
987 * where that page is in a different zone we will detect 1085 * where that page is in a different zone we will detect
988 * it from its zone id and abort this block scan. 1086 * it from its zone id and abort this block scan.
989 */ 1087 */
@@ -1006,7 +1104,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1006 1104
1007 /* Check that we have not crossed a zone boundary. */ 1105 /* Check that we have not crossed a zone boundary. */
1008 if (unlikely(page_zone_id(cursor_page) != zone_id)) 1106 if (unlikely(page_zone_id(cursor_page) != zone_id))
1009 continue; 1107 break;
1010 1108
1011 /* 1109 /*
1012 * If we don't have enough swap space, reclaiming of 1110 * If we don't have enough swap space, reclaiming of
@@ -1014,23 +1112,40 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1014 * pointless. 1112 * pointless.
1015 */ 1113 */
1016 if (nr_swap_pages <= 0 && PageAnon(cursor_page) && 1114 if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
1017 !PageSwapCache(cursor_page)) 1115 !PageSwapCache(cursor_page))
1018 continue; 1116 break;
1019 1117
1020 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 1118 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1021 list_move(&cursor_page->lru, dst); 1119 list_move(&cursor_page->lru, dst);
1022 mem_cgroup_del_lru(cursor_page); 1120 mem_cgroup_del_lru(cursor_page);
1023 nr_taken++; 1121 nr_taken += hpage_nr_pages(page);
1024 nr_lumpy_taken++; 1122 nr_lumpy_taken++;
1025 if (PageDirty(cursor_page)) 1123 if (PageDirty(cursor_page))
1026 nr_lumpy_dirty++; 1124 nr_lumpy_dirty++;
1027 scan++; 1125 scan++;
1028 } else { 1126 } else {
1029 if (mode == ISOLATE_BOTH && 1127 /*
1030 page_count(cursor_page)) 1128 * Check if the page is freed already.
1031 nr_lumpy_failed++; 1129 *
1130 * We can't use page_count() as that
1131 * requires compound_head and we don't
1132 * have a pin on the page here. If a
1133 * page is tail, we may or may not
1134 * have isolated the head, so assume
1135 * it's not free, it'd be tricky to
1136 * track the head status without a
1137 * page pin.
1138 */
1139 if (!PageTail(cursor_page) &&
1140 !atomic_read(&cursor_page->_count))
1141 continue;
1142 break;
1032 } 1143 }
1033 } 1144 }
1145
1146 /* If we break out of the loop above, lumpy reclaim failed */
1147 if (pfn < end_pfn)
1148 nr_lumpy_failed++;
1034 } 1149 }
1035 1150
1036 *scanned = scan; 1151 *scanned = scan;
@@ -1070,14 +1185,15 @@ static unsigned long clear_active_flags(struct list_head *page_list,
1070 struct page *page; 1185 struct page *page;
1071 1186
1072 list_for_each_entry(page, page_list, lru) { 1187 list_for_each_entry(page, page_list, lru) {
1188 int numpages = hpage_nr_pages(page);
1073 lru = page_lru_base_type(page); 1189 lru = page_lru_base_type(page);
1074 if (PageActive(page)) { 1190 if (PageActive(page)) {
1075 lru += LRU_ACTIVE; 1191 lru += LRU_ACTIVE;
1076 ClearPageActive(page); 1192 ClearPageActive(page);
1077 nr_active++; 1193 nr_active += numpages;
1078 } 1194 }
1079 if (count) 1195 if (count)
1080 count[lru]++; 1196 count[lru] += numpages;
1081 } 1197 }
1082 1198
1083 return nr_active; 1199 return nr_active;
@@ -1112,13 +1228,16 @@ int isolate_lru_page(struct page *page)
1112{ 1228{
1113 int ret = -EBUSY; 1229 int ret = -EBUSY;
1114 1230
1231 VM_BUG_ON(!page_count(page));
1232
1115 if (PageLRU(page)) { 1233 if (PageLRU(page)) {
1116 struct zone *zone = page_zone(page); 1234 struct zone *zone = page_zone(page);
1117 1235
1118 spin_lock_irq(&zone->lru_lock); 1236 spin_lock_irq(&zone->lru_lock);
1119 if (PageLRU(page) && get_page_unless_zero(page)) { 1237 if (PageLRU(page)) {
1120 int lru = page_lru(page); 1238 int lru = page_lru(page);
1121 ret = 0; 1239 ret = 0;
1240 get_page(page);
1122 ClearPageLRU(page); 1241 ClearPageLRU(page);
1123 1242
1124 del_page_from_lru_list(zone, page, lru); 1243 del_page_from_lru_list(zone, page, lru);
@@ -1187,7 +1306,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
1187 add_page_to_lru_list(zone, page, lru); 1306 add_page_to_lru_list(zone, page, lru);
1188 if (is_active_lru(lru)) { 1307 if (is_active_lru(lru)) {
1189 int file = is_file_lru(lru); 1308 int file = is_file_lru(lru);
1190 reclaim_stat->recent_rotated[file]++; 1309 int numpages = hpage_nr_pages(page);
1310 reclaim_stat->recent_rotated[file] += numpages;
1191 } 1311 }
1192 if (!pagevec_add(&pvec, page)) { 1312 if (!pagevec_add(&pvec, page)) {
1193 spin_unlock_irq(&zone->lru_lock); 1313 spin_unlock_irq(&zone->lru_lock);
@@ -1253,7 +1373,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
1253 return false; 1373 return false;
1254 1374
1255 /* Only stall on lumpy reclaim */ 1375 /* Only stall on lumpy reclaim */
1256 if (!sc->lumpy_reclaim_mode) 1376 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1257 return false; 1377 return false;
1258 1378
1259 /* If we have relaimed everything on the isolated list, no stall */ 1379 /* If we have relaimed everything on the isolated list, no stall */
@@ -1286,7 +1406,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1286 unsigned long nr_scanned; 1406 unsigned long nr_scanned;
1287 unsigned long nr_reclaimed = 0; 1407 unsigned long nr_reclaimed = 0;
1288 unsigned long nr_taken; 1408 unsigned long nr_taken;
1289 unsigned long nr_active;
1290 unsigned long nr_anon; 1409 unsigned long nr_anon;
1291 unsigned long nr_file; 1410 unsigned long nr_file;
1292 1411
@@ -1298,15 +1417,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1298 return SWAP_CLUSTER_MAX; 1417 return SWAP_CLUSTER_MAX;
1299 } 1418 }
1300 1419
1301 1420 set_reclaim_mode(priority, sc, false);
1302 lru_add_drain(); 1421 lru_add_drain();
1303 spin_lock_irq(&zone->lru_lock); 1422 spin_lock_irq(&zone->lru_lock);
1304 1423
1305 if (scanning_global_lru(sc)) { 1424 if (scanning_global_lru(sc)) {
1306 nr_taken = isolate_pages_global(nr_to_scan, 1425 nr_taken = isolate_pages_global(nr_to_scan,
1307 &page_list, &nr_scanned, sc->order, 1426 &page_list, &nr_scanned, sc->order,
1308 sc->lumpy_reclaim_mode ? 1427 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1309 ISOLATE_BOTH : ISOLATE_INACTIVE, 1428 ISOLATE_BOTH : ISOLATE_INACTIVE,
1310 zone, 0, file); 1429 zone, 0, file);
1311 zone->pages_scanned += nr_scanned; 1430 zone->pages_scanned += nr_scanned;
1312 if (current_is_kswapd()) 1431 if (current_is_kswapd())
@@ -1318,8 +1437,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1318 } else { 1437 } else {
1319 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1438 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
1320 &page_list, &nr_scanned, sc->order, 1439 &page_list, &nr_scanned, sc->order,
1321 sc->lumpy_reclaim_mode ? 1440 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1322 ISOLATE_BOTH : ISOLATE_INACTIVE, 1441 ISOLATE_BOTH : ISOLATE_INACTIVE,
1323 zone, sc->mem_cgroup, 1442 zone, sc->mem_cgroup,
1324 0, file); 1443 0, file);
1325 /* 1444 /*
@@ -1337,20 +1456,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1337 1456
1338 spin_unlock_irq(&zone->lru_lock); 1457 spin_unlock_irq(&zone->lru_lock);
1339 1458
1340 nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); 1459 nr_reclaimed = shrink_page_list(&page_list, zone, sc);
1341 1460
1342 /* Check if we should syncronously wait for writeback */ 1461 /* Check if we should syncronously wait for writeback */
1343 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1462 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1344 congestion_wait(BLK_RW_ASYNC, HZ/10); 1463 set_reclaim_mode(priority, sc, true);
1345 1464 nr_reclaimed += shrink_page_list(&page_list, zone, sc);
1346 /*
1347 * The attempt at page out may have made some
1348 * of the pages active, mark them inactive again.
1349 */
1350 nr_active = clear_active_flags(&page_list, NULL);
1351 count_vm_events(PGDEACTIVATE, nr_active);
1352
1353 nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
1354 } 1465 }
1355 1466
1356 local_irq_disable(); 1467 local_irq_disable();
@@ -1359,6 +1470,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1359 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); 1470 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
1360 1471
1361 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); 1472 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1473
1474 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1475 zone_idx(zone),
1476 nr_scanned, nr_reclaimed,
1477 priority,
1478 trace_shrink_flags(file, sc->reclaim_mode));
1362 return nr_reclaimed; 1479 return nr_reclaimed;
1363} 1480}
1364 1481
@@ -1398,7 +1515,7 @@ static void move_active_pages_to_lru(struct zone *zone,
1398 1515
1399 list_move(&page->lru, &zone->lru[lru].list); 1516 list_move(&page->lru, &zone->lru[lru].list);
1400 mem_cgroup_add_lru_list(page, lru); 1517 mem_cgroup_add_lru_list(page, lru);
1401 pgmoved++; 1518 pgmoved += hpage_nr_pages(page);
1402 1519
1403 if (!pagevec_add(&pvec, page) || list_empty(list)) { 1520 if (!pagevec_add(&pvec, page) || list_empty(list)) {
1404 spin_unlock_irq(&zone->lru_lock); 1521 spin_unlock_irq(&zone->lru_lock);
@@ -1466,7 +1583,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1466 } 1583 }
1467 1584
1468 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { 1585 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1469 nr_rotated++; 1586 nr_rotated += hpage_nr_pages(page);
1470 /* 1587 /*
1471 * Identify referenced, file-backed active pages and 1588 * Identify referenced, file-backed active pages and
1472 * give them one more trip around the active list. So 1589 * give them one more trip around the active list. So
@@ -1506,6 +1623,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1506 spin_unlock_irq(&zone->lru_lock); 1623 spin_unlock_irq(&zone->lru_lock);
1507} 1624}
1508 1625
1626#ifdef CONFIG_SWAP
1509static int inactive_anon_is_low_global(struct zone *zone) 1627static int inactive_anon_is_low_global(struct zone *zone)
1510{ 1628{
1511 unsigned long active, inactive; 1629 unsigned long active, inactive;
@@ -1531,12 +1649,26 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1531{ 1649{
1532 int low; 1650 int low;
1533 1651
1652 /*
1653 * If we don't have swap space, anonymous page deactivation
1654 * is pointless.
1655 */
1656 if (!total_swap_pages)
1657 return 0;
1658
1534 if (scanning_global_lru(sc)) 1659 if (scanning_global_lru(sc))
1535 low = inactive_anon_is_low_global(zone); 1660 low = inactive_anon_is_low_global(zone);
1536 else 1661 else
1537 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); 1662 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
1538 return low; 1663 return low;
1539} 1664}
1665#else
1666static inline int inactive_anon_is_low(struct zone *zone,
1667 struct scan_control *sc)
1668{
1669 return 0;
1670}
1671#endif
1540 1672
1541static int inactive_file_is_low_global(struct zone *zone) 1673static int inactive_file_is_low_global(struct zone *zone)
1542{ 1674{
@@ -1598,26 +1730,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1598} 1730}
1599 1731
1600/* 1732/*
1601 * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
1602 * until we collected @swap_cluster_max pages to scan.
1603 */
1604static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1605 unsigned long *nr_saved_scan)
1606{
1607 unsigned long nr;
1608
1609 *nr_saved_scan += nr_to_scan;
1610 nr = *nr_saved_scan;
1611
1612 if (nr >= SWAP_CLUSTER_MAX)
1613 *nr_saved_scan = 0;
1614 else
1615 nr = 0;
1616
1617 return nr;
1618}
1619
1620/*
1621 * Determine how aggressively the anon and file LRU lists should be 1733 * Determine how aggressively the anon and file LRU lists should be
1622 * scanned. The relative value of each set of LRU lists is determined 1734 * scanned. The relative value of each set of LRU lists is determined
1623 * by looking at the fraction of the pages scanned we did rotate back 1735 * by looking at the fraction of the pages scanned we did rotate back
@@ -1635,6 +1747,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1635 u64 fraction[2], denominator; 1747 u64 fraction[2], denominator;
1636 enum lru_list l; 1748 enum lru_list l;
1637 int noswap = 0; 1749 int noswap = 0;
1750 int force_scan = 0;
1751
1752
1753 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1754 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1755 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1756 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1757
1758 if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
1759 /* kswapd does zone balancing and need to scan this zone */
1760 if (scanning_global_lru(sc) && current_is_kswapd())
1761 force_scan = 1;
1762 /* memcg may have small limit and need to avoid priority drop */
1763 if (!scanning_global_lru(sc))
1764 force_scan = 1;
1765 }
1638 1766
1639 /* If we have no swap space, do not bother scanning anon pages. */ 1767 /* If we have no swap space, do not bother scanning anon pages. */
1640 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1768 if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1645,11 +1773,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1645 goto out; 1773 goto out;
1646 } 1774 }
1647 1775
1648 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1649 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1650 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1651 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1652
1653 if (scanning_global_lru(sc)) { 1776 if (scanning_global_lru(sc)) {
1654 free = zone_page_state(zone, NR_FREE_PAGES); 1777 free = zone_page_state(zone, NR_FREE_PAGES);
1655 /* If we have very few page cache pages, 1778 /* If we have very few page cache pages,
@@ -1716,24 +1839,87 @@ out:
1716 scan >>= priority; 1839 scan >>= priority;
1717 scan = div64_u64(scan * fraction[file], denominator); 1840 scan = div64_u64(scan * fraction[file], denominator);
1718 } 1841 }
1719 nr[l] = nr_scan_try_batch(scan, 1842
1720 &reclaim_stat->nr_saved_scan[l]); 1843 /*
1844 * If zone is small or memcg is small, nr[l] can be 0.
1845 * This results no-scan on this priority and priority drop down.
1846 * For global direct reclaim, it can visit next zone and tend
1847 * not to have problems. For global kswapd, it's for zone
1848 * balancing and it need to scan a small amounts. When using
1849 * memcg, priority drop can cause big latency. So, it's better
1850 * to scan small amount. See may_noscan above.
1851 */
1852 if (!scan && force_scan) {
1853 if (file)
1854 scan = SWAP_CLUSTER_MAX;
1855 else if (!noswap)
1856 scan = SWAP_CLUSTER_MAX;
1857 }
1858 nr[l] = scan;
1721 } 1859 }
1722} 1860}
1723 1861
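For illustration only (not part of the patch): a standalone sketch of the per-LRU scan target computed by get_scan_count() above, including the new SWAP_CLUSTER_MAX floor applied when force_scan is set (the anon/noswap special case and div64_u64() are simplified away). SWAP_CLUSTER_MAX is assumed to be its usual value of 32; the list size and fraction/denominator pair are made up.

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL   /* assumption: usual kernel value */

static unsigned long scan_target(unsigned long lru_size, int priority,
				 unsigned long fraction, unsigned long denominator,
				 int force_scan)
{
	unsigned long scan = lru_size >> priority;   /* scale down by priority */

	scan = scan * fraction / denominator;        /* split pressure anon vs file */

	/* new in this patch: tiny zones/memcgs still get a minimal scan */
	if (!scan && force_scan)
		scan = SWAP_CLUSTER_MAX;
	return scan;
}

int main(void)
{
	/*
	 * A 4096-page file LRU at DEF_PRIORITY (12) rounds down to zero
	 * (4096 >> 12 = 1, then 1 * 3 / 4 = 0); force_scan bumps it to 32.
	 */
	printf("scan %lu pages\n", scan_target(4096, 12, 3, 4, 1));
	return 0;
}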
1724static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc) 1862/*
1863 * Reclaim/compaction depends on a number of pages being freed. To avoid
1864 * disruption to the system, a small number of order-0 pages continue to be
1865 * rotated and reclaimed in the normal fashion. However, by the time we get
1866 * back to the allocator and call try_to_compact_zone(), we ensure that
1867 * there are enough free pages for it to be likely successful
1868 */
1869static inline bool should_continue_reclaim(struct zone *zone,
1870 unsigned long nr_reclaimed,
1871 unsigned long nr_scanned,
1872 struct scan_control *sc)
1725{ 1873{
1874 unsigned long pages_for_compaction;
1875 unsigned long inactive_lru_pages;
1876
1877 /* If not in reclaim/compaction mode, stop */
1878 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
1879 return false;
1880
1881 /* Consider stopping depending on scan and reclaim activity */
1882 if (sc->gfp_mask & __GFP_REPEAT) {
1883 /*
1884 * For __GFP_REPEAT allocations, stop reclaiming if the
1885 * full LRU list has been scanned and we are still failing
1886 * to reclaim pages. This full LRU scan is potentially
1887 * expensive but a __GFP_REPEAT caller really wants to succeed
1888 */
1889 if (!nr_reclaimed && !nr_scanned)
1890 return false;
1891 } else {
1892 /*
1893 * For non-__GFP_REPEAT allocations which can presumably
1894 * fail without consequence, stop if we failed to reclaim
1895 * any pages from the last SWAP_CLUSTER_MAX number of
1896 * pages that were scanned. This will return to the
1897 * caller faster at the risk reclaim/compaction and
1898 * the resulting allocation attempt fails
1899 */
1900 if (!nr_reclaimed)
1901 return false;
1902 }
1903
1726 /* 1904 /*
1727 * If we need a large contiguous chunk of memory, or have 1905 * If we have not reclaimed enough pages for compaction and the
1728 * trouble getting a small set of contiguous pages, we 1906 * inactive lists are large enough, continue reclaiming
1729 * will reclaim both active and inactive pages.
1730 */ 1907 */
1731 if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 1908 pages_for_compaction = (2UL << sc->order);
1732 sc->lumpy_reclaim_mode = 1; 1909 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
1733 else if (sc->order && priority < DEF_PRIORITY - 2) 1910 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1734 sc->lumpy_reclaim_mode = 1; 1911 if (sc->nr_reclaimed < pages_for_compaction &&
1735 else 1912 inactive_lru_pages > pages_for_compaction)
1736 sc->lumpy_reclaim_mode = 0; 1913 return true;
1914
1915 /* If compaction would go ahead or the allocation would succeed, stop */
1916 switch (compaction_suitable(zone, sc->order)) {
1917 case COMPACT_PARTIAL:
1918 case COMPACT_CONTINUE:
1919 return false;
1920 default:
1921 return true;
1922 }
1737} 1923}
1738 1924
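For illustration only (not part of the patch): the core threshold test from should_continue_reclaim() above. compaction_suitable() is reduced to a boolean parameter here because it depends on zone watermark state, and the __GFP_REPEAT special cases are omitted; the numbers in main() are illustrative.

#include <stdio.h>
#include <stdbool.h>

static bool keep_reclaiming(int order, unsigned long nr_reclaimed,
			    unsigned long inactive_lru_pages, bool compaction_ready)
{
	/* aim for roughly twice the requested block so compaction has slack */
	unsigned long pages_for_compaction = 2UL << order;

	if (nr_reclaimed < pages_for_compaction &&
	    inactive_lru_pages > pages_for_compaction)
		return true;

	/* otherwise continue only while compaction is not yet ready to run */
	return !compaction_ready;
}

int main(void)
{
	/* order-9 request (2MB huge page): keep going until ~1024 pages are freed */
	printf("continue reclaim: %s\n",
	       keep_reclaiming(9, 300, 50000, false) ? "yes" : "no");
	return 0;
}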
1739/* 1925/*
@@ -1745,13 +1931,14 @@ static void shrink_zone(int priority, struct zone *zone,
1745 unsigned long nr[NR_LRU_LISTS]; 1931 unsigned long nr[NR_LRU_LISTS];
1746 unsigned long nr_to_scan; 1932 unsigned long nr_to_scan;
1747 enum lru_list l; 1933 enum lru_list l;
1748 unsigned long nr_reclaimed = sc->nr_reclaimed; 1934 unsigned long nr_reclaimed, nr_scanned;
1749 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1935 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1750 1936
1937restart:
1938 nr_reclaimed = 0;
1939 nr_scanned = sc->nr_scanned;
1751 get_scan_count(zone, sc, nr, priority); 1940 get_scan_count(zone, sc, nr, priority);
1752 1941
1753 set_lumpy_reclaim_mode(priority, sc);
1754
1755 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1942 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1756 nr[LRU_INACTIVE_FILE]) { 1943 nr[LRU_INACTIVE_FILE]) {
1757 for_each_evictable_lru(l) { 1944 for_each_evictable_lru(l) {
@@ -1775,16 +1962,20 @@ static void shrink_zone(int priority, struct zone *zone,
1775 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 1962 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
1776 break; 1963 break;
1777 } 1964 }
1778 1965 sc->nr_reclaimed += nr_reclaimed;
1779 sc->nr_reclaimed = nr_reclaimed;
1780 1966
1781 /* 1967 /*
1782 * Even if we did not try to evict anon pages at all, we want to 1968 * Even if we did not try to evict anon pages at all, we want to
1783 * rebalance the anon lru active/inactive ratio. 1969 * rebalance the anon lru active/inactive ratio.
1784 */ 1970 */
1785 if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) 1971 if (inactive_anon_is_low(zone, sc))
1786 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1972 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1787 1973
1974 /* reclaim/compaction might need reclaim to continue */
1975 if (should_continue_reclaim(zone, nr_reclaimed,
1976 sc->nr_scanned - nr_scanned, sc))
1977 goto restart;
1978
1788 throttle_vm_writeout(sc->gfp_mask); 1979 throttle_vm_writeout(sc->gfp_mask);
1789} 1980}
1790 1981
@@ -1809,6 +2000,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1809{ 2000{
1810 struct zoneref *z; 2001 struct zoneref *z;
1811 struct zone *zone; 2002 struct zone *zone;
2003 unsigned long nr_soft_reclaimed;
2004 unsigned long nr_soft_scanned;
1812 2005
1813 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2006 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1814 gfp_zone(sc->gfp_mask), sc->nodemask) { 2007 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1823,6 +2016,19 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1823 continue; 2016 continue;
1824 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2017 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1825 continue; /* Let kswapd poll it */ 2018 continue; /* Let kswapd poll it */
2019 /*
2020 * This steals pages from memory cgroups over softlimit
2021 * and returns the number of reclaimed pages and
2022 * scanned pages. This works for global memory pressure
2023 * and balancing, not for a memcg's limit.
2024 */
2025 nr_soft_scanned = 0;
2026 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2027 sc->order, sc->gfp_mask,
2028 &nr_soft_scanned);
2029 sc->nr_reclaimed += nr_soft_reclaimed;
2030 sc->nr_scanned += nr_soft_scanned;
2031 /* need some check for avoid more shrink_zone() */
1826 } 2032 }
1827 2033
1828 shrink_zone(priority, zone, sc); 2034 shrink_zone(priority, zone, sc);
@@ -1834,17 +2040,12 @@ static bool zone_reclaimable(struct zone *zone)
1834 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; 2040 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
1835} 2041}
1836 2042
1837/* 2043/* All zones in zonelist are unreclaimable? */
1838 * As hibernation is going on, kswapd is freezed so that it can't mark
1839 * the zone into all_unreclaimable. It can't handle OOM during hibernation.
1840 * So let's check zone's unreclaimable in direct reclaim as well as kswapd.
1841 */
1842static bool all_unreclaimable(struct zonelist *zonelist, 2044static bool all_unreclaimable(struct zonelist *zonelist,
1843 struct scan_control *sc) 2045 struct scan_control *sc)
1844{ 2046{
1845 struct zoneref *z; 2047 struct zoneref *z;
1846 struct zone *zone; 2048 struct zone *zone;
1847 bool all_unreclaimable = true;
1848 2049
1849 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2050 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1850 gfp_zone(sc->gfp_mask), sc->nodemask) { 2051 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1852,13 +2053,11 @@ static bool all_unreclaimable(struct zonelist *zonelist,
1852 continue; 2053 continue;
1853 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2054 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1854 continue; 2055 continue;
1855 if (zone_reclaimable(zone)) { 2056 if (!zone->all_unreclaimable)
1856 all_unreclaimable = false; 2057 return false;
1857 break;
1858 }
1859 } 2058 }
1860 2059
1861 return all_unreclaimable; 2060 return true;
1862} 2061}
1863 2062
1864/* 2063/*
@@ -1878,7 +2077,8 @@ static bool all_unreclaimable(struct zonelist *zonelist,
1878 * else, the number of pages reclaimed 2077 * else, the number of pages reclaimed
1879 */ 2078 */
1880static unsigned long do_try_to_free_pages(struct zonelist *zonelist, 2079static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1881 struct scan_control *sc) 2080 struct scan_control *sc,
2081 struct shrink_control *shrink)
1882{ 2082{
1883 int priority; 2083 int priority;
1884 unsigned long total_scanned = 0; 2084 unsigned long total_scanned = 0;
@@ -1896,7 +2096,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1896 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2096 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1897 sc->nr_scanned = 0; 2097 sc->nr_scanned = 0;
1898 if (!priority) 2098 if (!priority)
1899 disable_swap_token(); 2099 disable_swap_token(sc->mem_cgroup);
1900 shrink_zones(priority, zonelist, sc); 2100 shrink_zones(priority, zonelist, sc);
1901 /* 2101 /*
1902 * Don't shrink slabs when reclaiming memory from 2102 * Don't shrink slabs when reclaiming memory from
@@ -1912,7 +2112,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1912 lru_pages += zone_reclaimable_pages(zone); 2112 lru_pages += zone_reclaimable_pages(zone);
1913 } 2113 }
1914 2114
1915 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); 2115 shrink_slab(shrink, sc->nr_scanned, lru_pages);
1916 if (reclaim_state) { 2116 if (reclaim_state) {
1917 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2117 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
1918 reclaim_state->reclaimed_slab = 0; 2118 reclaim_state->reclaimed_slab = 0;
@@ -1937,27 +2137,31 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1937 2137
1938 /* Take a nap, wait for some writeback to complete */ 2138 /* Take a nap, wait for some writeback to complete */
1939 if (!sc->hibernation_mode && sc->nr_scanned && 2139 if (!sc->hibernation_mode && sc->nr_scanned &&
1940 priority < DEF_PRIORITY - 2) 2140 priority < DEF_PRIORITY - 2) {
1941 congestion_wait(BLK_RW_ASYNC, HZ/10); 2141 struct zone *preferred_zone;
2142
2143 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
2144 &cpuset_current_mems_allowed,
2145 &preferred_zone);
2146 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2147 }
1942 } 2148 }
1943 2149
1944out: 2150out:
1945 /*
1946 * Now that we've scanned all the zones at this priority level, note
1947 * that level within the zone so that the next thread which performs
1948 * scanning of this zone will immediately start out at this priority
1949 * level. This affects only the decision whether or not to bring
1950 * mapped pages onto the inactive list.
1951 */
1952 if (priority < 0)
1953 priority = 0;
1954
1955 delayacct_freepages_end(); 2151 delayacct_freepages_end();
1956 put_mems_allowed(); 2152 put_mems_allowed();
1957 2153
1958 if (sc->nr_reclaimed) 2154 if (sc->nr_reclaimed)
1959 return sc->nr_reclaimed; 2155 return sc->nr_reclaimed;
1960 2156
2157 /*
2158 * As hibernation is going on, kswapd is freezed so that it can't mark
2159 * the zone into all_unreclaimable. Thus bypassing all_unreclaimable
2160 * check.
2161 */
2162 if (oom_killer_disabled)
2163 return 0;
2164
1961 /* top priority shrink_zones still had more to do? don't OOM, then */ 2165 /* top priority shrink_zones still had more to do? don't OOM, then */
1962 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) 2166 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
1963 return 1; 2167 return 1;
@@ -1980,12 +2184,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1980 .mem_cgroup = NULL, 2184 .mem_cgroup = NULL,
1981 .nodemask = nodemask, 2185 .nodemask = nodemask,
1982 }; 2186 };
2187 struct shrink_control shrink = {
2188 .gfp_mask = sc.gfp_mask,
2189 };
1983 2190
1984 trace_mm_vmscan_direct_reclaim_begin(order, 2191 trace_mm_vmscan_direct_reclaim_begin(order,
1985 sc.may_writepage, 2192 sc.may_writepage,
1986 gfp_mask); 2193 gfp_mask);
1987 2194
1988 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 2195 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
1989 2196
1990 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); 2197 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
1991 2198
@@ -1997,9 +2204,11 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1997unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 2204unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1998 gfp_t gfp_mask, bool noswap, 2205 gfp_t gfp_mask, bool noswap,
1999 unsigned int swappiness, 2206 unsigned int swappiness,
2000 struct zone *zone) 2207 struct zone *zone,
2208 unsigned long *nr_scanned)
2001{ 2209{
2002 struct scan_control sc = { 2210 struct scan_control sc = {
2211 .nr_scanned = 0,
2003 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2212 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2004 .may_writepage = !laptop_mode, 2213 .may_writepage = !laptop_mode,
2005 .may_unmap = 1, 2214 .may_unmap = 1,
@@ -2008,6 +2217,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2008 .order = 0, 2217 .order = 0,
2009 .mem_cgroup = mem, 2218 .mem_cgroup = mem,
2010 }; 2219 };
2220
2011 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2221 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2012 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2222 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2013 2223
@@ -2026,6 +2236,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2026 2236
2027 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2237 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2028 2238
2239 *nr_scanned = sc.nr_scanned;
2029 return sc.nr_reclaimed; 2240 return sc.nr_reclaimed;
2030} 2241}
2031 2242
@@ -2036,6 +2247,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2036{ 2247{
2037 struct zonelist *zonelist; 2248 struct zonelist *zonelist;
2038 unsigned long nr_reclaimed; 2249 unsigned long nr_reclaimed;
2250 int nid;
2039 struct scan_control sc = { 2251 struct scan_control sc = {
2040 .may_writepage = !laptop_mode, 2252 .may_writepage = !laptop_mode,
2041 .may_unmap = 1, 2253 .may_unmap = 1,
@@ -2045,17 +2257,27 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2045 .order = 0, 2257 .order = 0,
2046 .mem_cgroup = mem_cont, 2258 .mem_cgroup = mem_cont,
2047 .nodemask = NULL, /* we don't care the placement */ 2259 .nodemask = NULL, /* we don't care the placement */
2260 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2261 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
2262 };
2263 struct shrink_control shrink = {
2264 .gfp_mask = sc.gfp_mask,
2048 }; 2265 };
2049 2266
2050 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2267 /*
2051 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2268 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
2052 zonelist = NODE_DATA(numa_node_id())->node_zonelists; 2269 * take care of from where we get pages. So the node where we start the
2270 * scan does not need to be the current node.
2271 */
2272 nid = mem_cgroup_select_victim_node(mem_cont);
2273
2274 zonelist = NODE_DATA(nid)->node_zonelists;
2053 2275
2054 trace_mm_vmscan_memcg_reclaim_begin(0, 2276 trace_mm_vmscan_memcg_reclaim_begin(0,
2055 sc.may_writepage, 2277 sc.may_writepage,
2056 sc.gfp_mask); 2278 sc.gfp_mask);
2057 2279
2058 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 2280 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2059 2281
2060 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); 2282 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2061 2283
@@ -2063,38 +2285,88 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2063} 2285}
2064#endif 2286#endif
2065 2287
2288/*
2289 * pgdat_balanced is used when checking if a node is balanced for high-order
2290 * allocations. Only zones that meet watermarks and are in a zone allowed
2291 * by the callers classzone_idx are added to balanced_pages. The total of
2292 * balanced pages must be at least 25% of the zones allowed by classzone_idx
2293 * for the node to be considered balanced. Forcing all zones to be balanced
2294 * for high orders can cause excessive reclaim when there are imbalanced zones.
2295 * The choice of 25% is due to
2296 * o a 16M DMA zone that is balanced will not balance a zone on any
2297 * reasonable sized machine
2298 * o On all other machines, the top zone must be at least a reasonable
2299 * percentage of the middle zones. For example, on 32-bit x86, highmem
2300 * would need to be at least 256M for it to be balance a whole node.
2301 * Similarly, on x86-64 the Normal zone would need to be at least 1G
2302 * to balance a node on its own. These seemed like reasonable ratios.
2303 */
2304static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2305 int classzone_idx)
2306{
2307 unsigned long present_pages = 0;
2308 int i;
2309
2310 for (i = 0; i <= classzone_idx; i++)
2311 present_pages += pgdat->node_zones[i].present_pages;
2312
2313 /* A special case here: if zone has no page, we think it's balanced */
2314 return balanced_pages >= (present_pages >> 2);
2315}
2316
2066/* is kswapd sleeping prematurely? */ 2317/* is kswapd sleeping prematurely? */
2067static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) 2318static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2319 int classzone_idx)
2068{ 2320{
2069 int i; 2321 int i;
2322 unsigned long balanced = 0;
2323 bool all_zones_ok = true;
2070 2324
2071 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2325 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2072 if (remaining) 2326 if (remaining)
2073 return 1; 2327 return true;
2074 2328
2075 /* If after HZ/10, a zone is below the high mark, it's premature */ 2329 /* Check the watermark levels */
2076 for (i = 0; i < pgdat->nr_zones; i++) { 2330 for (i = 0; i <= classzone_idx; i++) {
2077 struct zone *zone = pgdat->node_zones + i; 2331 struct zone *zone = pgdat->node_zones + i;
2078 2332
2079 if (!populated_zone(zone)) 2333 if (!populated_zone(zone))
2080 continue; 2334 continue;
2081 2335
2082 if (zone->all_unreclaimable) 2336 /*
2337 * balance_pgdat() skips over all_unreclaimable after
2338 * DEF_PRIORITY. Effectively, it considers them balanced so
2339 * they must be considered balanced here as well if kswapd
2340 * is to sleep
2341 */
2342 if (zone->all_unreclaimable) {
2343 balanced += zone->present_pages;
2083 continue; 2344 continue;
2345 }
2084 2346
2085 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), 2347 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
2086 0, 0)) 2348 i, 0))
2087 return 1; 2349 all_zones_ok = false;
2350 else
2351 balanced += zone->present_pages;
2088 } 2352 }
2089 2353
2090 return 0; 2354 /*
2355 * For high-order requests, the balanced zones must contain at least
2356 * 25% of the nodes pages for kswapd to sleep. For order-0, all zones
2357 * must be balanced
2358 */
2359 if (order)
2360 return !pgdat_balanced(pgdat, balanced, classzone_idx);
2361 else
2362 return !all_zones_ok;
2091} 2363}
2092 2364
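For illustration only (not part of the patch): a standalone sketch of the 25% rule implemented by pgdat_balanced() and applied by sleeping_prematurely() above for high-order wakeups. zone_watermark_ok_safe() is reduced to a per-zone boolean and the zone sizes are made-up examples.

#include <stdio.h>
#include <stdbool.h>

struct zone_sketch {
	unsigned long present_pages;
	bool watermark_ok;   /* stands in for zone_watermark_ok_safe() */
};

static bool pgdat_balanced_sketch(const struct zone_sketch *zones, int nzones,
				  int classzone_idx)
{
	unsigned long present = 0, balanced = 0;
	int i;

	for (i = 0; i <= classzone_idx && i < nzones; i++) {
		present += zones[i].present_pages;
		if (zones[i].watermark_ok)
			balanced += zones[i].present_pages;
	}

	/* balanced zones must hold at least a quarter of the node's pages */
	return balanced >= (present >> 2);
}

int main(void)
{
	/* DMA and Normal meet their watermarks, DMA32 does not */
	struct zone_sketch zones[] = {
		{ 4096,   true  },   /* ~16MB DMA */
		{ 774144, false },   /* ~3GB DMA32 */
		{ 196608, true  },   /* ~768MB Normal */
	};

	/* only ~21% of the node's pages sit in balanced zones, so kswapd keeps going */
	printf("kswapd may sleep: %s\n",
	       pgdat_balanced_sketch(zones, 3, 2) ? "yes" : "no");
	return 0;
}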
2093/* 2365/*
2094 * For kswapd, balance_pgdat() will work across all this node's zones until 2366 * For kswapd, balance_pgdat() will work across all this node's zones until
2095 * they are all at high_wmark_pages(zone). 2367 * they are all at high_wmark_pages(zone).
2096 * 2368 *
2097 * Returns the number of pages which were actually freed. 2369 * Returns the final order kswapd was reclaiming at
2098 * 2370 *
2099 * There is special handling here for zones which are full of pinned pages. 2371 * There is special handling here for zones which are full of pinned pages.
2100 * This can happen if the pages are all mlocked, or if they are all used by 2372 * This can happen if the pages are all mlocked, or if they are all used by
@@ -2111,13 +2383,18 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
2111 * interoperates with the page allocator fallback scheme to ensure that aging 2383 * interoperates with the page allocator fallback scheme to ensure that aging
2112 * of pages is balanced across the zones. 2384 * of pages is balanced across the zones.
2113 */ 2385 */
2114static unsigned long balance_pgdat(pg_data_t *pgdat, int order) 2386static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2387 int *classzone_idx)
2115{ 2388{
2116 int all_zones_ok; 2389 int all_zones_ok;
2390 unsigned long balanced;
2117 int priority; 2391 int priority;
2118 int i; 2392 int i;
2393 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2119 unsigned long total_scanned; 2394 unsigned long total_scanned;
2120 struct reclaim_state *reclaim_state = current->reclaim_state; 2395 struct reclaim_state *reclaim_state = current->reclaim_state;
2396 unsigned long nr_soft_reclaimed;
2397 unsigned long nr_soft_scanned;
2121 struct scan_control sc = { 2398 struct scan_control sc = {
2122 .gfp_mask = GFP_KERNEL, 2399 .gfp_mask = GFP_KERNEL,
2123 .may_unmap = 1, 2400 .may_unmap = 1,
@@ -2131,6 +2408,9 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
2131 .order = order, 2408 .order = order,
2132 .mem_cgroup = NULL, 2409 .mem_cgroup = NULL,
2133 }; 2410 };
2411 struct shrink_control shrink = {
2412 .gfp_mask = sc.gfp_mask,
2413 };
2134loop_again: 2414loop_again:
2135 total_scanned = 0; 2415 total_scanned = 0;
2136 sc.nr_reclaimed = 0; 2416 sc.nr_reclaimed = 0;
@@ -2138,15 +2418,15 @@ loop_again:
2138 count_vm_event(PAGEOUTRUN); 2418 count_vm_event(PAGEOUTRUN);
2139 2419
2140 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2420 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2141 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2142 unsigned long lru_pages = 0; 2421 unsigned long lru_pages = 0;
2143 int has_under_min_watermark_zone = 0; 2422 int has_under_min_watermark_zone = 0;
2144 2423
2145 /* The swap token gets in the way of swapout... */ 2424 /* The swap token gets in the way of swapout... */
2146 if (!priority) 2425 if (!priority)
2147 disable_swap_token(); 2426 disable_swap_token(NULL);
2148 2427
2149 all_zones_ok = 1; 2428 all_zones_ok = 1;
2429 balanced = 0;
2150 2430
2151 /* 2431 /*
2152 * Scan in the highmem->dma direction for the highest 2432 * Scan in the highmem->dma direction for the highest
@@ -2169,7 +2449,7 @@ loop_again:
2169 shrink_active_list(SWAP_CLUSTER_MAX, zone, 2449 shrink_active_list(SWAP_CLUSTER_MAX, zone,
2170 &sc, priority, 0); 2450 &sc, priority, 0);
2171 2451
2172 if (!zone_watermark_ok(zone, order, 2452 if (!zone_watermark_ok_safe(zone, order,
2173 high_wmark_pages(zone), 0, 0)) { 2453 high_wmark_pages(zone), 0, 0)) {
2174 end_zone = i; 2454 end_zone = i;
2175 break; 2455 break;
@@ -2196,6 +2476,7 @@ loop_again:
2196 for (i = 0; i <= end_zone; i++) { 2476 for (i = 0; i <= end_zone; i++) {
2197 struct zone *zone = pgdat->node_zones + i; 2477 struct zone *zone = pgdat->node_zones + i;
2198 int nr_slab; 2478 int nr_slab;
2479 unsigned long balance_gap;
2199 2480
2200 if (!populated_zone(zone)) 2481 if (!populated_zone(zone))
2201 continue; 2482 continue;
@@ -2205,28 +2486,42 @@ loop_again:
2205 2486
2206 sc.nr_scanned = 0; 2487 sc.nr_scanned = 0;
2207 2488
2489 nr_soft_scanned = 0;
2208 /* 2490 /*
2209 * Call soft limit reclaim before calling shrink_zone. 2491 * Call soft limit reclaim before calling shrink_zone.
2210 * For now we ignore the return value
2211 */ 2492 */
2212 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); 2493 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2494 order, sc.gfp_mask,
2495 &nr_soft_scanned);
2496 sc.nr_reclaimed += nr_soft_reclaimed;
2497 total_scanned += nr_soft_scanned;
2213 2498
2214 /* 2499 /*
2215 * We put equal pressure on every zone, unless one 2500 * We put equal pressure on every zone, unless
2216 * zone has way too many pages free already. 2501 * one zone has way too many pages free
2502 * already. The "too many pages" is defined
2503 * as the high wmark plus a "gap" where the
2504 * gap is either the low watermark or 1%
2505 * of the zone, whichever is smaller.
2217 */ 2506 */
2218 if (!zone_watermark_ok(zone, order, 2507 balance_gap = min(low_wmark_pages(zone),
2219 8*high_wmark_pages(zone), end_zone, 0)) 2508 (zone->present_pages +
2509 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2510 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2511 if (!zone_watermark_ok_safe(zone, order,
2512 high_wmark_pages(zone) + balance_gap,
2513 end_zone, 0)) {
2220 shrink_zone(priority, zone, &sc); 2514 shrink_zone(priority, zone, &sc);
2221 reclaim_state->reclaimed_slab = 0; 2515
2222 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 2516 reclaim_state->reclaimed_slab = 0;
2223 lru_pages); 2517 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2224 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2518 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2225 total_scanned += sc.nr_scanned; 2519 total_scanned += sc.nr_scanned;
2226 if (zone->all_unreclaimable) 2520
2227 continue; 2521 if (nr_slab == 0 && !zone_reclaimable(zone))
2228 if (nr_slab == 0 && !zone_reclaimable(zone)) 2522 zone->all_unreclaimable = 1;
2229 zone->all_unreclaimable = 1; 2523 }
2524
2230 /* 2525 /*
2231 * If we've done a decent amount of scanning and 2526 * If we've done a decent amount of scanning and
2232 * the reclaim ratio is low, start doing writepage 2527 * the reclaim ratio is low, start doing writepage
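The balance_gap arithmetic in this hunk can be made concrete with a small worked example (numbers are hypothetical; KSWAPD_ZONE_BALANCE_GAP_RATIO is the 1% ratio referred to in the comment):

	/*
	 * A 4GiB zone of 4KiB pages has present_pages = 1048576.  With a
	 * ratio of 100, the gap term is the rounded-up 1048576/100 = 10486
	 * pages (~41MiB), so
	 *
	 *	balance_gap = min(low_wmark_pages(zone), 10486);
	 *
	 * and kswapd keeps shrinking the zone until free pages exceed
	 * high_wmark_pages(zone) + balance_gap, checked with the _safe
	 * variant.
	 */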
@@ -2236,7 +2531,13 @@ loop_again:
2236 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) 2531 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2237 sc.may_writepage = 1; 2532 sc.may_writepage = 1;
2238 2533
2239 if (!zone_watermark_ok(zone, order, 2534 if (zone->all_unreclaimable) {
2535 if (end_zone && end_zone == i)
2536 end_zone--;
2537 continue;
2538 }
2539
2540 if (!zone_watermark_ok_safe(zone, order,
2240 high_wmark_pages(zone), end_zone, 0)) { 2541 high_wmark_pages(zone), end_zone, 0)) {
2241 all_zones_ok = 0; 2542 all_zones_ok = 0;
2242 /* 2543 /*
@@ -2244,13 +2545,24 @@ loop_again:
2244 * means that we have a GFP_ATOMIC allocation 2545 * means that we have a GFP_ATOMIC allocation
2245 * failure risk. Hurry up! 2546 * failure risk. Hurry up!
2246 */ 2547 */
2247 if (!zone_watermark_ok(zone, order, 2548 if (!zone_watermark_ok_safe(zone, order,
2248 min_wmark_pages(zone), end_zone, 0)) 2549 min_wmark_pages(zone), end_zone, 0))
2249 has_under_min_watermark_zone = 1; 2550 has_under_min_watermark_zone = 1;
2551 } else {
2552 /*
2553 * If a zone reaches its high watermark,
2554 * consider it to be no longer congested. It's
2555 * possible there are dirty pages backed by
2556 * congested BDIs but as pressure is relieved,
2557 * speculatively avoid congestion waits
2558 */
2559 zone_clear_flag(zone, ZONE_CONGESTED);
2560 if (i <= *classzone_idx)
2561 balanced += zone->present_pages;
2250 } 2562 }
2251 2563
2252 } 2564 }
2253 if (all_zones_ok) 2565 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2254 break; /* kswapd: all done */ 2566 break; /* kswapd: all done */
2255 /* 2567 /*
2256 * OK, kswapd is getting into trouble. Take a nap, then take 2568 * OK, kswapd is getting into trouble. Take a nap, then take
@@ -2273,7 +2585,13 @@ loop_again:
2273 break; 2585 break;
2274 } 2586 }
2275out: 2587out:
2276 if (!all_zones_ok) { 2588
2589 /*
2590 * order-0: All zones must meet high watermark for a balanced node
2591 * high-order: Balanced zones must make up at least 25% of the node
2592 * for the node to be balanced
2593 */
2594 if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
2277 cond_resched(); 2595 cond_resched();
2278 2596
2279 try_to_freeze(); 2597 try_to_freeze();
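pgdat_balanced(), used in the break test above, is added earlier in vmscan.c (outside this excerpt) and implements the 25% rule spelled out in the comment after the out: label. A simplified sketch of what it computes:

	/* Sketch only: with "balanced" counting the pages of zones that are
	 * at or above their high watermark (up to classzone_idx), the node
	 * is treated as balanced for high-order reclaim once those zones
	 * hold more than a quarter of the node's pages. */
	static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced,
				   int classzone_idx)
	{
		unsigned long present_pages = 0;
		int i;

		for (i = 0; i <= classzone_idx; i++)
			present_pages += pgdat->node_zones[i].present_pages;

		return balanced > present_pages / 4;
	}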
@@ -2298,7 +2616,88 @@ out:
2298 goto loop_again; 2616 goto loop_again;
2299 } 2617 }
2300 2618
2301 return sc.nr_reclaimed; 2619 /*
2620 * If kswapd was reclaiming at a higher order, it has the option of
2621 * sleeping without all zones being balanced. Before it does, it must
2622 * ensure that the watermarks for order-0 on *all* zones are met and
2623 * that the congestion flags are cleared. The congestion flag must
2624 * be cleared as kswapd is the only mechanism that clears the flag
2625 * and it is potentially going to sleep here.
2626 */
2627 if (order) {
2628 for (i = 0; i <= end_zone; i++) {
2629 struct zone *zone = pgdat->node_zones + i;
2630
2631 if (!populated_zone(zone))
2632 continue;
2633
2634 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2635 continue;
2636
2637 /* Confirm the zone is balanced for order-0 */
2638 if (!zone_watermark_ok(zone, 0,
2639 high_wmark_pages(zone), 0, 0)) {
2640 order = sc.order = 0;
2641 goto loop_again;
2642 }
2643
2644 /* If balanced, clear the congested flag */
2645 zone_clear_flag(zone, ZONE_CONGESTED);
2646 }
2647 }
2648
2649 /*
2650 * Return the order we were reclaiming at so sleeping_prematurely()
2651 * makes a decision on the order we were last reclaiming at. However,
2652 * if another caller entered the allocator slow path while kswapd
2653 * was awake, order will remain at the higher level
2654 */
2655 *classzone_idx = end_zone;
2656 return order;
2657}
2658
2659static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2660{
2661 long remaining = 0;
2662 DEFINE_WAIT(wait);
2663
2664 if (freezing(current) || kthread_should_stop())
2665 return;
2666
2667 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2668
2669 /* Try to sleep for a short interval */
2670 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2671 remaining = schedule_timeout(HZ/10);
2672 finish_wait(&pgdat->kswapd_wait, &wait);
2673 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2674 }
2675
2676 /*
2677 * After a short sleep, check if it was a premature sleep. If not, then
2678 * go fully to sleep until explicitly woken up.
2679 */
2680 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2681 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2682
2683 /*
2684 * vmstat counters are not perfectly accurate and the estimated
2685 * value for counters such as NR_FREE_PAGES can deviate from the
2686 * true value by nr_online_cpus * threshold. To avoid the zone
2687 * watermarks being breached while under pressure, we reduce the
2688 * per-cpu vmstat threshold while kswapd is awake and restore
2689 * them before going back to sleep.
2690 */
2691 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2692 schedule();
2693 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2694 } else {
2695 if (remaining)
2696 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2697 else
2698 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2699 }
2700 finish_wait(&pgdat->kswapd_wait, &wait);
2302} 2701}
2303 2702
2304/* 2703/*
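To put a number on the drift described in the comment above kswapd's schedule(): per-cpu vmstat deltas are only folded into the global counters once they cross a per-cpu threshold, so a reader can be off by up to num_online_cpus() * threshold pages. An illustrative calculation (the 125-page cap on the normal threshold is an assumption, not taken from this excerpt):

	/* Hypothetical machine: 16 online CPUs, per-cpu threshold at 125. */
	int cpus = 16, threshold = 125;
	long max_drift = (long)cpus * threshold;  /* 2000 pages, ~7.8MiB of error */

This is why kswapd drops to the tighter pressure threshold while it is awake and restores the normal one just before going to sleep.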
@@ -2316,10 +2715,11 @@ out:
2316 */ 2715 */
2317static int kswapd(void *p) 2716static int kswapd(void *p)
2318{ 2717{
2319 unsigned long order; 2718 unsigned long order, new_order;
2719 int classzone_idx, new_classzone_idx;
2320 pg_data_t *pgdat = (pg_data_t*)p; 2720 pg_data_t *pgdat = (pg_data_t*)p;
2321 struct task_struct *tsk = current; 2721 struct task_struct *tsk = current;
2322 DEFINE_WAIT(wait); 2722
2323 struct reclaim_state reclaim_state = { 2723 struct reclaim_state reclaim_state = {
2324 .reclaimed_slab = 0, 2724 .reclaimed_slab = 0,
2325 }; 2725 };
@@ -2346,50 +2746,37 @@ static int kswapd(void *p)
2346 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; 2746 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2347 set_freezable(); 2747 set_freezable();
2348 2748
2349 order = 0; 2749 order = new_order = 0;
2750 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2350 for ( ; ; ) { 2751 for ( ; ; ) {
2351 unsigned long new_order;
2352 int ret; 2752 int ret;
2353 2753
2354 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2754 /*
2355 new_order = pgdat->kswapd_max_order; 2755 * If the last balance_pgdat was unsuccessful it's unlikely a
2356 pgdat->kswapd_max_order = 0; 2756 * new request of a similar or harder type will succeed soon
2357 if (order < new_order) { 2757 * so consider going to sleep on the basis we reclaimed at
2758 */
2759 if (classzone_idx >= new_classzone_idx && order == new_order) {
2760 new_order = pgdat->kswapd_max_order;
2761 new_classzone_idx = pgdat->classzone_idx;
2762 pgdat->kswapd_max_order = 0;
2763 pgdat->classzone_idx = pgdat->nr_zones - 1;
2764 }
2765
2766 if (order < new_order || classzone_idx > new_classzone_idx) {
2358 /* 2767 /*
2359 * Don't sleep if someone wants a larger 'order' 2768 * Don't sleep if someone wants a larger 'order'
2360 * allocation 2769 * allocation or has tighter zone constraints
2361 */ 2770 */
2362 order = new_order; 2771 order = new_order;
2772 classzone_idx = new_classzone_idx;
2363 } else { 2773 } else {
2364 if (!freezing(current) && !kthread_should_stop()) { 2774 kswapd_try_to_sleep(pgdat, order, classzone_idx);
2365 long remaining = 0;
2366
2367 /* Try to sleep for a short interval */
2368 if (!sleeping_prematurely(pgdat, order, remaining)) {
2369 remaining = schedule_timeout(HZ/10);
2370 finish_wait(&pgdat->kswapd_wait, &wait);
2371 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2372 }
2373
2374 /*
2375 * After a short sleep, check if it was a
2376 * premature sleep. If not, then go fully
2377 * to sleep until explicitly woken up
2378 */
2379 if (!sleeping_prematurely(pgdat, order, remaining)) {
2380 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2381 schedule();
2382 } else {
2383 if (remaining)
2384 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2385 else
2386 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2387 }
2388 }
2389
2390 order = pgdat->kswapd_max_order; 2775 order = pgdat->kswapd_max_order;
2776 classzone_idx = pgdat->classzone_idx;
2777 pgdat->kswapd_max_order = 0;
2778 pgdat->classzone_idx = pgdat->nr_zones - 1;
2391 } 2779 }
2392 finish_wait(&pgdat->kswapd_wait, &wait);
2393 2780
2394 ret = try_to_freeze(); 2781 ret = try_to_freeze();
2395 if (kthread_should_stop()) 2782 if (kthread_should_stop())
@@ -2401,7 +2788,7 @@ static int kswapd(void *p)
2401 */ 2788 */
2402 if (!ret) { 2789 if (!ret) {
2403 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 2790 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2404 balance_pgdat(pgdat, order); 2791 order = balance_pgdat(pgdat, order, &classzone_idx);
2405 } 2792 }
2406 } 2793 }
2407 return 0; 2794 return 0;
@@ -2410,23 +2797,26 @@ static int kswapd(void *p)
2410/* 2797/*
2411 * A zone is low on free memory, so wake its kswapd task to service it. 2798 * A zone is low on free memory, so wake its kswapd task to service it.
2412 */ 2799 */
2413void wakeup_kswapd(struct zone *zone, int order) 2800void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
2414{ 2801{
2415 pg_data_t *pgdat; 2802 pg_data_t *pgdat;
2416 2803
2417 if (!populated_zone(zone)) 2804 if (!populated_zone(zone))
2418 return; 2805 return;
2419 2806
2420 pgdat = zone->zone_pgdat;
2421 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
2422 return;
2423 if (pgdat->kswapd_max_order < order)
2424 pgdat->kswapd_max_order = order;
2425 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2426 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2807 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2427 return; 2808 return;
2809 pgdat = zone->zone_pgdat;
2810 if (pgdat->kswapd_max_order < order) {
2811 pgdat->kswapd_max_order = order;
2812 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
2813 }
2428 if (!waitqueue_active(&pgdat->kswapd_wait)) 2814 if (!waitqueue_active(&pgdat->kswapd_wait))
2429 return; 2815 return;
2816 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
2817 return;
2818
2819 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2430 wake_up_interruptible(&pgdat->kswapd_wait); 2820 wake_up_interruptible(&pgdat->kswapd_wait);
2431} 2821}
2432 2822
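The new classzone_idx argument to wakeup_kswapd() comes from the allocator slow path, which wakes kswapd for every zone the failing allocation could use. Roughly, as a sketch of the mm/page_alloc.c caller in this series (the name and exact signature are an assumption, not verified against this tree):

	static void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
				    enum zone_type high_zoneidx,
				    enum zone_type classzone_idx)
	{
		struct zoneref *z;
		struct zone *zone;

		/* Ask each node's kswapd to balance zones up to classzone_idx. */
		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
			wakeup_kswapd(zone, order, classzone_idx);
	}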
@@ -2487,7 +2877,10 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2487 .swappiness = vm_swappiness, 2877 .swappiness = vm_swappiness,
2488 .order = 0, 2878 .order = 0,
2489 }; 2879 };
2490 struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 2880 struct shrink_control shrink = {
2881 .gfp_mask = sc.gfp_mask,
2882 };
2883 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
2491 struct task_struct *p = current; 2884 struct task_struct *p = current;
2492 unsigned long nr_reclaimed; 2885 unsigned long nr_reclaimed;
2493 2886
@@ -2496,7 +2889,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2496 reclaim_state.reclaimed_slab = 0; 2889 reclaim_state.reclaimed_slab = 0;
2497 p->reclaim_state = &reclaim_state; 2890 p->reclaim_state = &reclaim_state;
2498 2891
2499 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 2892 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2500 2893
2501 p->reclaim_state = NULL; 2894 p->reclaim_state = NULL;
2502 lockdep_clear_current_reclaim_state(); 2895 lockdep_clear_current_reclaim_state();
@@ -2671,6 +3064,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2671 .swappiness = vm_swappiness, 3064 .swappiness = vm_swappiness,
2672 .order = order, 3065 .order = order,
2673 }; 3066 };
3067 struct shrink_control shrink = {
3068 .gfp_mask = sc.gfp_mask,
3069 };
2674 unsigned long nr_slab_pages0, nr_slab_pages1; 3070 unsigned long nr_slab_pages0, nr_slab_pages1;
2675 3071
2676 cond_resched(); 3072 cond_resched();
@@ -2712,7 +3108,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2712 unsigned long lru_pages = zone_reclaimable_pages(zone); 3108 unsigned long lru_pages = zone_reclaimable_pages(zone);
2713 3109
2714 /* No reclaimable slab or very low memory pressure */ 3110 /* No reclaimable slab or very low memory pressure */
2715 if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages)) 3111 if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
2716 break; 3112 break;
2717 3113
2718 /* Freed enough memory */ 3114 /* Freed enough memory */
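A recurring change in the hunks above is that shrink_slab() no longer takes a gfp_mask directly: each reclaim path builds a struct shrink_control next to its scan_control and passes that instead. A minimal sketch of the calling pattern, assuming the signature used in these hunks:

	struct shrink_control shrink = {
		.gfp_mask = GFP_KERNEL,
	};

	/* nr_scanned LRU pages were just scanned out of lru_pages in total;
	 * shrink_slab() uses that ratio to put proportional pressure on the
	 * registered slab shrinkers. */
	nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);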
@@ -2987,6 +3383,7 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
2987 return 0; 3383 return 0;
2988} 3384}
2989 3385
3386#ifdef CONFIG_NUMA
2990/* 3387/*
2991 * per node 'scan_unevictable_pages' attribute. On demand re-scan of 3388 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
2992 * a specified node's per zone unevictable lists for evictable pages. 3389 * a specified node's per zone unevictable lists for evictable pages.
@@ -3033,4 +3430,4 @@ void scan_unevictable_unregister_node(struct node *node)
3033{ 3430{
3034 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); 3431 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
3035} 3432}
3036 3433#endif